def test_full_tokenizer(self): tokenizer = PhobertTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) text = "Tôi là VinAI Research" bpe_tokens = "T@@ ô@@ i l@@ à V@@ i@@ n@@ A@@ I R@@ e@@ s@@ e@@ a@@ r@@ c@@ h".split() tokens = tokenizer.tokenize(text) print(tokens) self.assertListEqual(tokens, bpe_tokens) input_tokens = tokens + [tokenizer.unk_token] input_bpe_tokens = [4, 3, 5, 3, 3, 3, 3, 3, 3, 6, 7, 9, 3, 9, 3, 3, 3, 3, 3] self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
def get_tokenizer(self, **kwargs): kwargs.update(self.special_tokens_map) return PhobertTokenizer.from_pretrained(self.tmpdirname, **kwargs)