Example #1
    def test_special_tokens_initialization(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):

                added_tokens = [AddedToken("<special>", lstrip=True)]

                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
                    pretrained_name, additional_special_tokens=added_tokens, **kwargs
                )
                r_output = tokenizer_r.encode("Hey this is a <special> token")

                special_token_id = tokenizer_r.encode("<special>", add_special_tokens=False)[0]

                self.assertTrue(special_token_id in r_output)

                if self.test_slow_tokenizer:
                    tokenizer_p = self.tokenizer_class.from_pretrained(
                        pretrained_name, additional_special_tokens=added_tokens, **kwargs
                    )

                    p_output = tokenizer_p.encode("Hey this is a <special> token")

                    cr_output = tokenizer_r.encode("Hey this is a <special> token")

                    self.assertEqual(p_output, r_output)
                    self.assertEqual(cr_output, r_output)
                    self.assertTrue(special_token_id in p_output)
                    self.assertTrue(special_token_id in cr_output)
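
A minimal standalone sketch of the same pattern, for trying it outside the test harness. The checkpoint name "gpt2" and the choice of GPT2TokenizerFast are assumptions, not part of the example above; any fast tokenizer should behave the same way, and network access (or a local copy of the checkpoint) is assumed.

from tokenizers import AddedToken
from transformers import GPT2TokenizerFast

# Register "<special>" as an additional special token at load time
tokenizer = GPT2TokenizerFast.from_pretrained(
    "gpt2", additional_special_tokens=[AddedToken("<special>", lstrip=True)]
)
ids = tokenizer.encode("Hey this is a <special> token")
special_id = tokenizer.encode("<special>", add_special_tokens=False)[0]
assert special_id in ids  # the added token is encoded as a single id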
Example #2
    def __init__(
        self,
        vocab_file,
        merges_file,
        errors="replace",
        bos_token="<BOS>",
        eos_token="<EOS>",
        sep_token="<&&&>",
        cls_token="<s>",
        unk_token="<UNK>",
        pad_token="<PAD>",
        mask_token="<MASK>",
        visual_token="<VISUAL>",
        add_prefix_space=False,
        **kwargs
    ):
        # Promote a plain-string visual token to an AddedToken so its stripping behaviour is explicit
        visual_token = (
            AddedToken(visual_token, lstrip=False, rstrip=False) if isinstance(visual_token, str) else visual_token
        )

        super().__init__(
            vocab_file=vocab_file,
            merges_file=merges_file,
            errors=errors,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            add_prefix_space=add_prefix_space,
            **kwargs,
        )
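
The wrapping of visual_token above follows a common idiom: plain strings are promoted to AddedToken so that the stripping behaviour around the token is stated explicitly instead of being left to defaults. A minimal sketch of the idiom in isolation (the token string here is arbitrary):

from tokenizers import AddedToken

visual_token = "<VISUAL>"
if isinstance(visual_token, str):
    # lstrip/rstrip control whether whitespace around the token is absorbed when it is matched
    visual_token = AddedToken(visual_token, lstrip=False, rstrip=False)

print(visual_token.content, visual_token.lstrip, visual_token.rstrip)  # <VISUAL> False False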
    def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self):
        tokenizer_list = []
        if self.test_slow_tokenizer:
            tokenizer_list.append((self.tokenizer_class, self.get_tokenizer()))

        if self.test_rust_tokenizer:
            tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer()))

        for tokenizer_class, tokenizer_utils in tokenizer_list:
            with tempfile.TemporaryDirectory() as tmp_dir:
                tokenizer_utils.save_pretrained(tmp_dir)

                with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file:
                    special_tokens_map = json.load(json_file)

                with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file:
                    tokenizer_config = json.load(json_file)

                # a special token for Canine can be defined as follows:
                NEW_TOKEN = 0xE006
                new_token_1 = chr(NEW_TOKEN)

                special_tokens_map["additional_special_tokens"] = [new_token_1]
                tokenizer_config["additional_special_tokens"] = [new_token_1]

                with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile:
                    json.dump(special_tokens_map, outfile)
                with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile:
                    json.dump(tokenizer_config, outfile)

                # the following checks allow us to verify that our test works as expected, i.e. that the tokenizer takes
                # into account the new value of additional_special_tokens given in the "tokenizer_config.json" and
                # "special_tokens_map.json" files
                tokenizer_without_change_in_init = tokenizer_class.from_pretrained(tmp_dir, extra_ids=0)
                self.assertIn(new_token_1, tokenizer_without_change_in_init.additional_special_tokens)
                # self.assertIn("an_additional_special_token",tokenizer_without_change_in_init.get_vocab()) # ByT5Tokenization no vocab
                self.assertEqual(
                    [new_token_1],
                    tokenizer_without_change_in_init.convert_ids_to_tokens(
                        tokenizer_without_change_in_init.convert_tokens_to_ids([new_token_1])
                    ),
                )

                NEW_TOKEN = 0xE007
                new_token_2 = chr(NEW_TOKEN)
                # Now we test that we can change the value of additional_special_tokens in the from_pretrained call
                new_added_tokens = [AddedToken(new_token_2, lstrip=True)]
                tokenizer = tokenizer_class.from_pretrained(
                    tmp_dir, additional_special_tokens=new_added_tokens, extra_ids=0
                )

                self.assertIn(new_token_2, tokenizer.additional_special_tokens)
                # self.assertIn(new_token_2, tokenizer.get_vocab())  # ByT5Tokenization has no vocab
                self.assertEqual(
                    [new_token_2], tokenizer.convert_ids_to_tokens(tokenizer.convert_tokens_to_ids([new_token_2]))
                )
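
The first half of the test above relies on from_pretrained picking up additional_special_tokens from the serialized configuration files. A minimal sketch of that round trip with a stock tokenizer; GPT2Tokenizer and the "gpt2" checkpoint are assumptions, and the test additionally edits special_tokens_map.json, which is omitted here since recent transformers versions may not write that file.

import json
import os
import tempfile

from transformers import GPT2Tokenizer

with tempfile.TemporaryDirectory() as tmp_dir:
    GPT2Tokenizer.from_pretrained("gpt2").save_pretrained(tmp_dir)

    # Inject a new additional special token into the saved tokenizer_config.json
    config_path = os.path.join(tmp_dir, "tokenizer_config.json")
    with open(config_path, encoding="utf-8") as f:
        tokenizer_config = json.load(f)
    tokenizer_config["additional_special_tokens"] = ["<new_special>"]
    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(tokenizer_config, f)

    reloaded = GPT2Tokenizer.from_pretrained(tmp_dir)
    assert "<new_special>" in reloaded.additional_special_tokens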
    def test_added_token_serializable(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):

                # a special token for Canine can be defined as follows:
                NEW_TOKEN = 0xE006
                new_token = chr(NEW_TOKEN)

                new_token = AddedToken(new_token, lstrip=True)
                tokenizer.add_special_tokens({"additional_special_tokens": [new_token]})

                with tempfile.TemporaryDirectory() as tmp_dir_name:
                    tokenizer.save_pretrained(tmp_dir_name)
                    tokenizer.from_pretrained(tmp_dir_name)
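
A minimal standalone version of the serialization round trip exercised above. CanineTokenizer and the "google/canine-s" checkpoint are assumptions chosen to mirror the test; any tokenizer with add_special_tokens works the same way.

import tempfile

from tokenizers import AddedToken
from transformers import CanineTokenizer

tokenizer = CanineTokenizer.from_pretrained("google/canine-s")
tokenizer.add_special_tokens({"additional_special_tokens": [AddedToken(chr(0xE006), lstrip=True)]})

with tempfile.TemporaryDirectory() as tmp_dir:
    tokenizer.save_pretrained(tmp_dir)
    reloaded = CanineTokenizer.from_pretrained(tmp_dir)
    assert chr(0xE006) in reloaded.additional_special_tokens  # the AddedToken survives the round trip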
    def __init__(
        self,
        vocab_file,
        do_lower_case=False,
        remove_space=True,
        keep_accents=False,
        bos_token="[CLS]",
        eos_token="[SEP]",
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        additional_special_tokens=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> None:
        # The mask token behaves like a normal word, i.e. it includes the space before it
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        super().__init__(
            vocab_file,
            do_lower_case=do_lower_case,
            remove_space=remove_space,
            keep_accents=keep_accents,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            additional_special_tokens=additional_special_tokens,
            sp_model_kwargs=self.sp_model_kwargs,
            **kwargs,
        )
        self._pad_token_type_id = 0
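
The mask_token above is wrapped with lstrip=True so the space in front of "[MASK]" is treated as part of the token. A minimal sketch of the effect, using RobertaTokenizer only because its <mask> token is configured the same way; this is not the class defined above, and "roberta-base" plus network access are assumptions.

from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
with_space = tokenizer("Hello <mask>")["input_ids"]
without_space = tokenizer("Hello<mask>")["input_ids"]
print(with_space == without_space)  # expected: True, the leading space is absorbed by the mask token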