def xlmTokenizer(*args, **kwargs):
    """
    Instantiate a XLM BPE tokenizer for XLM from a pre-trained vocab file.

    Args:
        pretrained_model_name_or_path: Path to pretrained model archive
            or one of pre-trained vocab configs below.
                * xlm-mlm-en-2048

    Keyword args:
        special_tokens: Special tokens in vocabulary that are not pretrained
            Default: None
        max_len: An artificial maximum length to truncate tokenized sequences to;
            Effective maximum length is always the minimum of this
            value (if specified) and the underlying model's
            sequence length.
            Default: None

    Example:
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')

        >>> text = "Who was Jim Henson ?"
        >>> indexed_tokens = tokenizer.encode(text)
    """
    # Thin torch.hub wrapper: all positional/keyword arguments are forwarded
    # verbatim to XLMTokenizer.from_pretrained.
    tokenizer = XLMTokenizer.from_pretrained(*args, **kwargs)
    return tokenizer
def test_sequence_builders(self):
    """Verify XLM's special-token templates: both the single-sentence and the
    sentence-pair builders wrap each token-id sequence with the [1] marker."""
    tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")

    ids_first = tokenizer.encode("sequence builders")
    ids_second = tokenizer.encode("multi-sequence build")

    built_single = tokenizer.add_special_tokens_single_sentence(ids_first)
    built_pair = tokenizer.add_special_tokens_sentences_pair(ids_first, ids_second)

    # Expected layout: [1] <seq> [1] for one sentence, and
    # [1] <seq_a> [1] <seq_b> [1] for a pair.
    assert built_single == [1] + ids_first + [1]
    assert built_pair == [1] + ids_first + [1] + ids_second + [1]
def get_tokenizer(self, **kwargs):
    """Build an XLMTokenizer from the vocab files saved in this test's temp dir.

    Accepts optional keyword arguments and forwards them to
    ``from_pretrained`` — added for consistency with the sibling
    ``get_tokenizer(self, **kwargs)`` fixture elsewhere in this file;
    callers passing no kwargs see identical behavior.
    """
    return XLMTokenizer.from_pretrained(self.tmpdirname, **kwargs)
def get_tokenizer(self, **kwargs):
    """Load an XLMTokenizer from this test case's temporary vocab directory,
    forwarding any extra keyword options to ``from_pretrained``."""
    tokenizer = XLMTokenizer.from_pretrained(self.tmpdirname, **kwargs)
    return tokenizer