def test_full_tokenizer(self):
    """Tokenize with a tiny hand-built BPE vocab and check tokens and ids.

    Adapted from Sennrich et al. 2015 and
    https://github.com/rsennrich/subword-nmt
    """
    # Minimal character/merge vocabulary; index order defines the token ids
    # asserted at the bottom of the test.
    vocab = [
        "l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
        "w</w>", "r</w>", "t</w>", "lo", "low", "er</w>",
        "low</w>", "lowest</w>", "newer</w>", "wider</w>", "<unk>",
    ]
    vocab_tokens = dict(zip(vocab, range(len(vocab))))
    merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]

    with TemporaryDirectory() as tmpdirname:
        vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
        merges_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['merges_file'])
        # Write with an explicit encoding: the platform default ("w" alone)
        # is locale-dependent and would break on non-UTF-8 systems.
        with open(vocab_file, "w", encoding="utf-8") as fp:
            fp.write(json.dumps(vocab_tokens))
        with open(merges_file, "w", encoding="utf-8") as fp:
            fp.write("\n".join(merges))

        input_text = u"lower newer"
        output_text = u"lower newer"
        # Shared round-trip checks (save/load, encode/decode) for tokenizers.
        create_and_check_tokenizer_commons(self, input_text, output_text, XLMTokenizer, tmpdirname)

        tokenizer = XLMTokenizer(vocab_file, merges_file)

        text = "lower"
        bpe_tokens = ["low", "er</w>"]
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, bpe_tokens)

        # "<unk>" maps to the last vocab index (20).
        input_tokens = tokens + ["<unk>"]
        input_bpe_tokens = [14, 15, 20]
        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
def test_full_tokenizer(self):
    """BPE-tokenize a word with the fixture vocab and verify tokens and ids.

    Adapted from Sennrich et al. 2015 and
    https://github.com/rsennrich/subword-nmt
    """
    tokenizer = XLMTokenizer(self.vocab_file, self.merges_file)

    # "lower" splits into one merged prefix plus an end-of-word suffix.
    tokens = tokenizer.tokenize("lower")
    self.assertListEqual(tokens, ["low", "er</w>"])

    # Appending the unknown token exercises the id lookup for every piece.
    self.assertListEqual(
        tokenizer.convert_tokens_to_ids(tokens + ["<unk>"]),
        [14, 15, 20],
    )
def xlmTokenizer(*args, **kwargs):
    """
    Instantiate a XLM BPE tokenizer for XLM from a pre-trained vocab file.

    Args:
        pretrained_model_name_or_path: Path to pretrained model archive
            or one of pre-trained vocab configs below.
            * xlm-mlm-en-2048
    Keyword args:
        special_tokens: Special tokens in vocabulary that are not pretrained.
            Default: None
        max_len: An artificial maximum length to truncate tokenized sequences to;
            Effective maximum length is always the minimum of this value (if
            specified) and the underlying model's sequence length.
            Default: None

    Example:
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')

        >>> text = "Who was Jim Henson ?"
        >>> indexed_tokens = tokenizer.encode(text)
    """
    # Fixed the docstring example above: it previously called
    # tokenizer.encode(tokenized_text) on a name that was never defined.
    tokenizer = XLMTokenizer.from_pretrained(*args, **kwargs)
    return tokenizer
def test_sequence_builders(self):
    """Check that special-token builders wrap sequences with </s> (id 1)."""
    tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")

    seq_0 = tokenizer.encode("sequence builders")
    seq_1 = tokenizer.encode("multi-sequence build")

    encoded_sentence = tokenizer.add_special_tokens_single_sentence(seq_0)
    encoded_pair = tokenizer.add_special_tokens_sentences_pair(seq_0, seq_1)

    # Single sentence: <s>-like boundary token (id 1) on both ends.
    assert encoded_sentence == [1] + seq_0 + [1]
    # Sentence pair: boundary token before, between, and after.
    assert encoded_pair == [1] + seq_0 + [1] + seq_1 + [1]
def get_tokenizer(self, **kwargs):
    """Return an XLMTokenizer loaded from the test's temporary directory.

    Accepts **kwargs so callers can pass tokenizer options through to
    ``from_pretrained`` — consistent with the other ``get_tokenizer``
    helpers in this test suite. With no kwargs the behavior is unchanged.
    """
    return XLMTokenizer.from_pretrained(self.tmpdirname, **kwargs)
def get_tokenizer(self, **kwargs):
    """Load an XLMTokenizer from the fixture directory, forwarding options."""
    tokenizer = XLMTokenizer.from_pretrained(self.tmpdirname, **kwargs)
    return tokenizer