Code example #1
    def test_roberta(self):
        for tokenizer_name in RobertaTokenizer.pretrained_vocab_files_map["vocab_file"].keys():
            tokenizer_p = RobertaTokenizer.from_pretrained(tokenizer_name)
            tokenizer_r = RobertaTokenizerFast.from_pretrained(tokenizer_name)

            # Check we have the same number of added_tokens for both pair and non-pair inputs.
            self.assertEqual(tokenizer_r.num_added_tokens(False), tokenizer_p.num_added_tokens(False))
            self.assertEqual(tokenizer_r.num_added_tokens(True), tokenizer_p.num_added_tokens(True))

            # Check we have the correct max_length for both pair and non-pair inputs.
            self.assertEqual(tokenizer_r.max_len_single_sentence, tokenizer_p.max_len_single_sentence)
            self.assertEqual(tokenizer_r.max_len_sentences_pair, tokenizer_p.max_len_sentences_pair)

            # Assert the sets of special tokens match.
            self.assertSequenceEqual(
                tokenizer_p.special_tokens_map.items(),
                tokenizer_r.special_tokens_map.items(),
                "Roberta tokenizers don't have the same set of special_tokens",
            )

            # Ensure tokenization agreement between the Python and Rust implementations.
            self.assert_tokenization_python_rust_almost_equals(tokenizer_p, tokenizer_r, 0.01)

            # Ensure add_tokens and add_special_tokens return the correct vocab size
            self.assert_add_tokens(tokenizer_r)

            # Check for offsets mapping
            self.assert_offsets_mapping(tokenizer_r)

            # Check for dynamic encoding sequence handling in batch_encode_plus
            self.assert_batch_encode_dynamic_overflowing(tokenizer_r)

            # Check alignment for build_inputs_with_special_tokens
            self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p)
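The assert_* helpers used above are defined on the test base class and are not shown in this snippet. As a rough standalone illustration of the 0.01 threshold passed to assert_tokenization_python_rust_almost_equals, a comparison along the following lines could be used; the helper body and the sample text are assumptions, not the library's actual implementation:

from transformers import RobertaTokenizer, RobertaTokenizerFast

def tokenizations_almost_equal(tokenizer_p, tokenizer_r, text, threshold=0.01):
    # Hypothetical check: encode the same text with both implementations
    # and require that at most `threshold` of the token ids differ.
    ids_p = tokenizer_p.encode(text)
    ids_r = tokenizer_r.encode(text)
    if len(ids_p) != len(ids_r):
        return False
    mismatches = sum(p != r for p, r in zip(ids_p, ids_r))
    return mismatches / max(len(ids_p), 1) <= threshold

tokenizer_p = RobertaTokenizer.from_pretrained("roberta-base")
tokenizer_r = RobertaTokenizerFast.from_pretrained("roberta-base")
assert tokenizations_almost_equal(tokenizer_p, tokenizer_r, "Hello world!")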
Code example #2
    def create_tokenizer(self):
        # Reuse previously trained tokenizer files if they already exist.
        if self.isComputed():
            logger.info("Tokenizer for this dataset has already been created")
            self.tokenizer = RobertaTokenizerFast.from_pretrained(
                str(self.data_dir), max_len=512)
            return

        logger.info(f"Training tokenizer on data in {self.data_dir}")

        # Train a new tokenizer, upload its artifacts, and load it so the
        # postcondition matches the cached branch above.
        self.train()
        azure_storage.upload(self.data_dir / "vocab.json")
        azure_storage.upload(self.data_dir / "merges.txt")
        self.tokenizer = RobertaTokenizerFast.from_pretrained(
            str(self.data_dir), max_len=512)
Code example #3
    def get_rust_tokenizer(self, **kwargs):
        kwargs.update(self.special_tokens_map)
        return RobertaTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
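This helper is a test fixture: self.tmpdirname is expected to hold a tokenizer's files saved during setup, and self.special_tokens_map lets individual tests override special tokens. A minimal sketch of the surrounding class (the setUp body and test are assumptions, not the original test suite):

import tempfile
import unittest

from transformers import RobertaTokenizerFast


class RobertaTokenizationTest(unittest.TestCase):
    def setUp(self):
        # Hypothetical setup: save a pretrained tokenizer's files to a
        # temporary directory so from_pretrained can reload them locally.
        self.tmpdirname = tempfile.mkdtemp()
        self.special_tokens_map = {"unk_token": "<unk>"}
        RobertaTokenizerFast.from_pretrained("roberta-base").save_pretrained(self.tmpdirname)

    def get_rust_tokenizer(self, **kwargs):
        kwargs.update(self.special_tokens_map)
        return RobertaTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)

    def test_reload(self):
        tokenizer = self.get_rust_tokenizer(add_prefix_space=True)
        self.assertGreater(len(tokenizer("lower newer")["input_ids"]), 0)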