def test_special_tokens_initialization(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): added_tokens = [AddedToken("<special>", lstrip=True)] tokenizer_r = self.rust_tokenizer_class.from_pretrained( pretrained_name, additional_special_tokens=added_tokens, **kwargs ) r_output = tokenizer_r.encode("Hey this is a <special> token") special_token_id = tokenizer_r.encode("<special>", add_special_tokens=False)[0] self.assertTrue(special_token_id in r_output) if self.test_slow_tokenizer: tokenizer_p = self.tokenizer_class.from_pretrained( pretrained_name, additional_special_tokens=added_tokens, **kwargs ) p_output = tokenizer_p.encode("Hey this is a <special> token") cr_output = tokenizer_r.encode("Hey this is a <special> token") self.assertEqual(p_output, r_output) self.assertEqual(cr_output, r_output) self.assertTrue(special_token_id in p_output) self.assertTrue(special_token_id in cr_output)
def __init__( self, vocab_file, merges_file, errors="replace", bos_token="<BOS>", eos_token="<EOS>", sep_token="<&&&>", cls_token="<s>", unk_token="<UNK>", pad_token="<PAD>", mask_token="<MASK>", visual_token="<VISUAL>", add_prefix_space=False, **kwargs ): visual_token = AddedToken(visual_token, lstrip=False, rstrip=False) if isinstance(visual_token, str) else visual_token super().__init__( vocab_file=vocab_file, merges_file=merges_file, errors=errors, bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, mask_token=mask_token, add_prefix_space=add_prefix_space, **kwargs, )
def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self): tokenizer_list = [] if self.test_slow_tokenizer: tokenizer_list.append((self.tokenizer_class, self.get_tokenizer())) if self.test_rust_tokenizer: tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer())) for tokenizer_class, tokenizer_utils in tokenizer_list: with tempfile.TemporaryDirectory() as tmp_dir: tokenizer_utils.save_pretrained(tmp_dir) with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file: special_tokens_map = json.load(json_file) with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file: tokenizer_config = json.load(json_file) # a special token for Canine can be defined as follows: NEW_TOKEN = 0xE006 new_token_1 = chr(NEW_TOKEN) special_tokens_map["additional_special_tokens"] = [new_token_1] tokenizer_config["additional_special_tokens"] = [new_token_1] with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile: json.dump(special_tokens_map, outfile) with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile: json.dump(tokenizer_config, outfile) # the following checks allow us to verify that our test works as expected, i.e. that the tokenizer takes # into account the new value of additional_special_tokens given in the "tokenizer_config.json" and # "special_tokens_map.json" files tokenizer_without_change_in_init = tokenizer_class.from_pretrained(tmp_dir, extra_ids=0) self.assertIn(new_token_1, tokenizer_without_change_in_init.additional_special_tokens) # self.assertIn("an_additional_special_token",tokenizer_without_change_in_init.get_vocab()) # ByT5Tokenization no vocab self.assertEqual( [new_token_1], tokenizer_without_change_in_init.convert_ids_to_tokens( tokenizer_without_change_in_init.convert_tokens_to_ids([new_token_1]) ), ) NEW_TOKEN = 0xE007 new_token_2 = chr(NEW_TOKEN) # Now we test that we can change the value of additional_special_tokens in the from_pretrained new_added_tokens = [AddedToken(new_token_2, lstrip=True)] tokenizer = tokenizer_class.from_pretrained( tmp_dir, additional_special_tokens=new_added_tokens, extra_ids=0 ) self.assertIn(new_token_2, tokenizer.additional_special_tokens) # self.assertIn(new_token_2,tokenizer.get_vocab()) # ByT5Tokenization no vocab self.assertEqual( [new_token_2], tokenizer.convert_ids_to_tokens(tokenizer.convert_tokens_to_ids([new_token_2])) )
def test_added_token_serializable(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): # a special token for Canine can be defined as follows: NEW_TOKEN = 0xE006 new_token = chr(NEW_TOKEN) new_token = AddedToken(new_token, lstrip=True) tokenizer.add_special_tokens({"additional_special_tokens": [new_token]}) with tempfile.TemporaryDirectory() as tmp_dir_name: tokenizer.save_pretrained(tmp_dir_name) tokenizer.from_pretrained(tmp_dir_name)
def __init__(self, vocab_file, do_lower_case=False, remove_space=True, keep_accents=False, bos_token="[CLS]", eos_token="[SEP]", unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]", mask_token="[MASK]", additional_special_tokens=None, sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs) -> None: # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance( mask_token, str) else mask_token self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs super().__init__( vocab_file, do_lower_case=do_lower_case, remove_space=remove_space, keep_accents=keep_accents, bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, sep_token=sep_token, pad_token=pad_token, cls_token=cls_token, mask_token=mask_token, additional_special_tokens=additional_special_tokens, sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) self._pad_token_type_id = 0