Code example #1
0
def xlmTokenizer(*args, **kwargs):
    """
    Instantiate a XLM BPE tokenizer for XLM from a pre-trained vocab file.

    Args:
    pretrained_model_name_or_path: Path to pretrained model archive
                                   or one of pre-trained vocab configs below.
                                       * xlm-mlm-en-2048
    Keyword args:
    special_tokens: Special tokens in vocabulary that are not pretrained
                    Default: None
    max_len: An artificial maximum length to truncate tokenized sequences to;
             Effective maximum length is always the minimum of this
             value (if specified) and the underlying model's
             sequence length.
             Default: None

    Example:
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')

        >>> text = "Who was Jim Henson ?"
        >>> indexed_tokens = tokenizer.encode(text)
    """
    # Thin hub entry point: all positional/keyword args are forwarded
    # unchanged to XLMTokenizer.from_pretrained.
    tokenizer = XLMTokenizer.from_pretrained(*args, **kwargs)
    return tokenizer
Code example #2
0
    def test_sequence_builders(self):
        """Check that the XLM special-token builders wrap ids with [1] markers."""
        tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")

        # Encode two independent pieces of text without special tokens.
        first_ids = tokenizer.encode("sequence builders")
        second_ids = tokenizer.encode("multi-sequence build")

        single_input = tokenizer.add_special_tokens_single_sentence(first_ids)
        pair_input = tokenizer.add_special_tokens_sentences_pair(first_ids, second_ids)

        # XLM uses token id 1 as both the bos/cls and sep marker.
        assert single_input == [1] + first_ids + [1]
        assert pair_input == [1] + first_ids + [1] + second_ids + [1]
Code example #3
0
 def get_tokenizer(self, **kwargs):
     """Return an XLMTokenizer loaded from the test's temporary directory.

     Accepts and forwards arbitrary keyword arguments to ``from_pretrained``
     for consistency with the other ``get_tokenizer`` variant in this file;
     callers passing no kwargs are unaffected (backward compatible).
     """
     return XLMTokenizer.from_pretrained(self.tmpdirname, **kwargs)
Code example #4
0
 def get_tokenizer(self, **kwargs):
     """Build an XLMTokenizer from the fixture's temporary vocab directory.

     Any keyword arguments are passed straight through to ``from_pretrained``.
     """
     pretrained_dir = self.tmpdirname
     return XLMTokenizer.from_pretrained(pretrained_dir, **kwargs)