    def test_full_tokenizer(self):
        """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
                 "w</w>", "r</w>", "t</w>",
                 "lo", "low", "er</w>",
                 "low</w>", "lowest</w>", "newer</w>", "wider</w>", "<unk>"]
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]

        with TemporaryDirectory() as tmpdirname:
            vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
            merges_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['merges_file'])
            with open(vocab_file, "w") as fp:
                fp.write(json.dumps(vocab_tokens))
            with open(merges_file, "w") as fp:
                fp.write("\n".join(merges))

            input_text = u"lower newer"
            output_text = u"lower newer"

            create_and_check_tokenizer_commons(self, input_text, output_text, XLMTokenizer, tmpdirname)

            tokenizer = XLMTokenizer(vocab_file, merges_file)

            text = "lower"
            bpe_tokens = ["low", "er</w>"]
            tokens = tokenizer.tokenize(text)
            self.assertListEqual(tokens, bpe_tokens)

            input_tokens = tokens + ["<unk>"]
            input_bpe_tokens = [14, 15, 20]
            self.assertListEqual(
                tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
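For completeness, a minimal sketch of the module-level imports this test method relies on (assuming the pytorch-transformers package layout; create_and_check_tokenizer_commons is a shared helper defined in that test suite and not reproduced here):

import os
import json
from tempfile import TemporaryDirectory

from pytorch_transformers.tokenization_xlm import XLMTokenizer, VOCAB_FILES_NAMES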
Example 2
    def test_full_tokenizer(self):
        """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
        tokenizer = XLMTokenizer(self.vocab_file, self.merges_file)

        text = "lower"
        bpe_tokens = ["low", "er</w>"]
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + ["<unk>"]
        input_bpe_tokens = [14, 15, 20]
        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
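This variant expects self.vocab_file and self.merges_file to exist already. A hypothetical setUp that would provide them, reusing the toy vocab and merges from Example 1 (the real test class inherits shared fixtures, so treat this as a sketch; it also assumes os, json, and tempfile are imported):

    def setUp(self):
        # Hypothetical fixture: write the toy vocab/merges from Example 1
        # into a temp dir so the test can load them by path.
        self.tmpdirname = tempfile.mkdtemp()
        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
                 "w</w>", "r</w>", "t</w>",
                 "lo", "low", "er</w>",
                 "low</w>", "lowest</w>", "newer</w>", "wider</w>", "<unk>"]
        merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
        with open(self.vocab_file, "w") as fp:
            fp.write(json.dumps(dict(zip(vocab, range(len(vocab))))))
        with open(self.merges_file, "w") as fp:
            fp.write("\n".join(merges))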
Example 3
def xlmTokenizer(*args, **kwargs):
    """
    Instantiate an XLM BPE tokenizer from a pre-trained vocab file.

    Args:
    pretrained_model_name_or_path: Path to pretrained model archive
                                   or one of pre-trained vocab configs below.
                                       * xlm-mlm-en-2048
    Keyword args:
    special_tokens: Special tokens in vocabulary that are not pretrained
                    Default: None
    max_len: An artificial maximum length to truncate tokenized sequences to;
             Effective maximum length is always the minimum of this
             value (if specified) and the underlying model's
             sequence length.
             Default: None

    Example:
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')

        >>> text = "Who was Jim Henson ?"
        >>> indexed_tokens = tokenizer.encode(text)
    """
    tokenizer = XLMTokenizer.from_pretrained(*args, **kwargs)
    return tokenizer
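Since this entry point simply forwards its arguments to XLMTokenizer.from_pretrained, loading through torch.hub is interchangeable with importing the library directly; a minimal equivalent:

from pytorch_transformers import XLMTokenizer

tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
indexed_tokens = tokenizer.encode("Who was Jim Henson ?")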
Example 4
    def test_sequence_builders(self):
        tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")

        text = tokenizer.encode("sequence builders")
        text_2 = tokenizer.encode("multi-sequence build")

        encoded_sentence = tokenizer.add_special_tokens_single_sentence(text)
        encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2)

        assert encoded_sentence == [1] + text + [1]
        assert encoded_pair == [1] + text + [1] + text_2 + [1]
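The literal 1 hard-coded in the expected sequences is the id of XLM's boundary token. A sketch that derives it from the tokenizer instead (assuming, as in the pretrained xlm-mlm-en-2048 vocab, that the same </s> token serves as both sentence start and separator):

        sep_id = tokenizer.convert_tokens_to_ids("</s>")  # 1 in the pretrained vocab
        assert encoded_sentence == [sep_id] + text + [sep_id]
        assert encoded_pair == [sep_id] + text + [sep_id] + text_2 + [sep_id]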
Example 5
    def get_tokenizer(self):
        return XLMTokenizer.from_pretrained(self.tmpdirname)
Example 6
    def get_tokenizer(self, **kwargs):
        return XLMTokenizer.from_pretrained(self.tmpdirname, **kwargs)