Exemple #1
0
    def test_full_tokenizer(self):
        tokenizer = CTRLTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
        text = "adapt react readapt apt"
        bpe_tokens = 'adapt re@@ a@@ c@@ t re@@ adapt apt'.split()
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + [tokenizer.unk_token]

        input_bpe_tokens = [0, 1, 2, 4, 5, 1, 0, 3, 6]
        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
    def test_tokenization_ctrl(self):
        # Given
        self.base_tokenizer = CTRLTokenizer.from_pretrained(
            'ctrl', do_lower_case=True, cache_dir=self.test_dir)
        self.rust_tokenizer = PyCtrlTokenizer(
            get_from_cache(
                self.base_tokenizer.pretrained_vocab_files_map['vocab_file']
                ['ctrl']),
            get_from_cache(
                self.base_tokenizer.pretrained_vocab_files_map['merges_file']
                ['ctrl']))
        output_baseline = []
        for example in self.examples:
            output_baseline.append(
                self.base_tokenizer.encode_plus(
                    example.text_a,
                    add_special_tokens=True,
                    return_overflowing_tokens=True,
                    return_special_tokens_mask=True,
                    max_length=128))

        # When
        output_rust = self.rust_tokenizer.encode_list(
            [example.text_a for example in self.examples],
            max_len=128,
            truncation_strategy='longest_first',
            stride=0)

        # Then
        for rust, baseline in zip(output_rust, output_baseline):
            assert (rust.token_ids == baseline['input_ids'])
            assert (rust.segment_ids == baseline['token_type_ids'])
            assert (
                rust.special_tokens_mask == baseline['special_tokens_mask'])
Exemple #3
0
    def setup_class(self):
        self.use_gpu = torch.cuda.is_available()
        self.test_dir = Path(tempfile.mkdtemp())

        self.base_tokenizer = CTRLTokenizer.from_pretrained('ctrl', do_lower_case=True,
                                                            cache_dir=self.test_dir)
        self.rust_tokenizer = PyCtrlTokenizer(
            get_from_cache(self.base_tokenizer.pretrained_vocab_files_map['vocab_file']['ctrl']),
            get_from_cache(self.base_tokenizer.pretrained_vocab_files_map['merges_file']['ctrl'])
        )
        self.model = CTRLModel.from_pretrained('ctrl',
                                               output_attentions=False).eval()
        if self.use_gpu:
            self.model.cuda()
        self.sentence_list = ['For instance, on the planet Earth, man had always assumed that he was more intelligent '
                              'than dolphins because he had achieved so much—the wheel, New York, wars and so on—whilst'
                              ' all the dolphins had ever done was muck about in the water having a good time. But '
                              'conversely, the dolphins had always believed that they were far more intelligent than '
                              'man—for precisely the same reasons.'] * 1

        # Pre-allocate GPU memory
        tokens_list = [self.base_tokenizer.tokenize(sentence) for sentence in self.sentence_list]
        features = [self.base_tokenizer.convert_tokens_to_ids(tokens) for tokens in tokens_list]
        features = [self.base_tokenizer.prepare_for_model(input, None, add_special_tokens=True, max_length=128) for
                    input
                    in features]
        all_input_ids = torch.tensor([f['input_ids'] for f in features], dtype=torch.long)

        if self.use_gpu:
            all_input_ids = all_input_ids.cuda()

        with torch.no_grad():
            _ = self.model(all_input_ids)[0].cpu().numpy()
    def test_tokenization_ctrl(self):
        # Given
        self.base_tokenizer = CTRLTokenizer.from_pretrained('ctrl',
                                                            do_lower_case=True,
                                                            cache_dir=self.test_dir)
        self.rust_tokenizer = PyCtrlTokenizer(
            get_from_cache(self.base_tokenizer.pretrained_vocab_files_map['vocab_file']['ctrl']),
            get_from_cache(self.base_tokenizer.pretrained_vocab_files_map['merges_file']['ctrl']),
            do_lower_case=True
        )
        output_baseline = []
        for example in self.examples:
            output_baseline.append(self.base_tokenizer.encode_plus(example.text_a,
                                                                   add_special_tokens=True,
                                                                   return_overflowing_tokens=True,
                                                                   return_special_tokens_mask=True,
                                                                   max_length=128))

        # When
        output_rust = self.rust_tokenizer.encode_list([example.text_a for example in self.examples],
                                                      max_len=128,
                                                      truncation_strategy='longest_first',
                                                      stride=0)

        # Then
        for idx, (rust, baseline) in enumerate(zip(output_rust, output_baseline)):
            assert rust.token_ids == baseline[
                'input_ids'], f'Difference in tokenization for {self.rust_tokenizer.__class__}: \n ' \
                              f'Sentence a: {self.examples[idx].text_a} \n' \
                              f'Sentence b: {self.examples[idx].text_b} \n' \
                              f'Token mismatch: {self.get_token_diff(rust.token_ids, baseline["input_ids"])} \n' \
                              f'Rust: {rust.token_ids} \n' \
                              f' Python {baseline["input_ids"]}'
            assert (rust.segment_ids == baseline['token_type_ids'])
            assert (rust.special_tokens_mask == baseline['special_tokens_mask'])
Exemple #5
0
 def get_tokenizer(self, **kwargs):
     kwargs.update(self.special_tokens_map)
     return CTRLTokenizer.from_pretrained(self.tmpdirname, **kwargs)
Exemple #6
0
 def setup_base_tokenizer(self):
     self.base_tokenizer = CTRLTokenizer.from_pretrained('ctrl', do_lower_case=True,
                                                         cache_dir=self.test_dir)