def test_roberta(self):
    """Cross-check the Python and Rust Roberta tokenizers for every pretrained vocab.

    For each checkpoint name, both implementations are loaded and compared on:
    added-token counts, max-length limits, the special-token map, tokenization
    overlap, token addition, offset mappings, dynamic overflow batching, and
    special-token input building.
    """
    for checkpoint in RobertaTokenizer.pretrained_vocab_files_map["vocab_file"].keys():
        py_tokenizer = RobertaTokenizer.from_pretrained(checkpoint)
        rust_tokenizer = RobertaTokenizerFast.from_pretrained(checkpoint)

        # Both implementations must agree on how many special tokens get
        # added for single-sequence and for pair inputs.
        self.assertEqual(rust_tokenizer.num_added_tokens(False), py_tokenizer.num_added_tokens(False))
        self.assertEqual(rust_tokenizer.num_added_tokens(True), py_tokenizer.num_added_tokens(True))

        # The effective max lengths (single and pair) must match as well.
        self.assertEqual(rust_tokenizer.max_len_single_sentence, py_tokenizer.max_len_single_sentence)
        self.assertEqual(rust_tokenizer.max_len_sentences_pair, py_tokenizer.max_len_sentences_pair)

        # Identical special-token maps (bos/eos/pad/etc.).
        self.assertSequenceEqual(
            py_tokenizer.special_tokens_map.items(),
            rust_tokenizer.special_tokens_map.items(),
            "Roberta tokenizers doesn't have the same set of special_tokens",
        )

        # Tokenizations should agree up to a small tolerated divergence.
        self.assert_tokenization_python_rust_almost_equals(py_tokenizer, rust_tokenizer, 0.01)

        # add_tokens / add_special_tokens must report the vocab size correctly.
        self.assert_add_tokens(rust_tokenizer)

        # Offsets mapping sanity check (Rust-only feature).
        self.assert_offsets_mapping(rust_tokenizer)

        # Overflowing-token handling inside batch_encode_plus.
        self.assert_batch_encode_dynamic_overflowing(rust_tokenizer)

        # build_inputs_with_special_tokens must align between the two.
        self.assert_build_inputs_with_special_tokens(rust_tokenizer, py_tokenizer)
def create_tokenizer(self):
    """Load an existing tokenizer for this dataset, or train and publish one.

    If the tokenizer artifacts already exist (per ``isComputed``), load them
    from ``self.data_dir`` and return early. Otherwise train a new tokenizer
    and upload the resulting vocab/merges files to Azure storage.
    """
    # Fast path: a previously trained tokenizer is already on disk.
    if self.isComputed():
        logger.info("Tokenizer for this dataset has already been created")
        self.tokenizer = RobertaTokenizerFast.from_pretrained(
            f"{self.data_dir}", max_len=512)
        return

    logger.info(f"Training tokenizer on data in {self.data_dir}")
    self.train()
    # Publish the trained artifacts so other workers can reuse them.
    for artifact in ("vocab.json", "merges.txt"):
        azure_storage.upload(self.data_dir / artifact)
def get_rust_tokenizer(self, **kwargs):
    """Build a RobertaTokenizerFast from the temp dir, folding in the special-token map.

    Entries from ``self.special_tokens_map`` take precedence over any
    caller-supplied keyword arguments with the same name.
    """
    merged_kwargs = {**kwargs, **self.special_tokens_map}
    return RobertaTokenizerFast.from_pretrained(self.tmpdirname, **merged_kwargs)