Example #1
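These snippets come from the transformers test suite for XLNetTokenizer and rely on module-level fixtures that the excerpts omit. A minimal sketch of the assumed surrounding context (the import paths and the SAMPLE_VOCAB fixture location vary across library versions, so treat these as assumptions):

    import os
    import unittest

    from transformers import XLNetTokenizer
    # SPIECE_UNDERLINE is the SentencePiece word-boundary marker "▁" (U+2581).
    from transformers.models.xlnet.tokenization_xlnet import SPIECE_UNDERLINE

    # A tiny SentencePiece model shipped with the tests; the path is an assumption.
    SAMPLE_VOCAB = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model"
    )

    class XLNetTokenizationTest(unittest.TestCase):
        pass  # the test_full_tokenizer methods shown below belong here
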
    def test_full_tokenizer(self):
        tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)

        # Plain ASCII input: the "▁" prefix marks the start of each
        # whitespace-delimited word in SentencePiece output.
        tokens = tokenizer.tokenize(u'This is a test')
        self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est'])

        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
                             [285, 46, 10, 170, 382])

        tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
        self.assertListEqual(tokens, [
            SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was',
            SPIECE_UNDERLINE + u'b', u'or', u'n', SPIECE_UNDERLINE + u'in',
            SPIECE_UNDERLINE + u'', u'9', u'2', u'0', u'0', u'0', u',',
            SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
            SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's',
            u'é', u'.'
        ])
        # "9" and "é" are not in the sample vocab, so both map to id 0.
        ids = tokenizer.convert_tokens_to_ids(tokens)
        self.assertListEqual(ids, [
            8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72,
            80, 6, 0, 4
        ])

        # The round trip is lossy: id 0 decodes to the unknown token, so the
        # out-of-vocabulary "9" and "é" come back as "<unk>".
        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(back_tokens, [
            SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was',
            SPIECE_UNDERLINE + u'b', u'or', u'n', SPIECE_UNDERLINE + u'in',
            SPIECE_UNDERLINE + u'', u'<unk>', u'2', u'0', u'0', u'0', u',',
            SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
            SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's',
            u'<unk>', u'.'
        ])
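To recover readable text from pieces, the tokenizer's convert_tokens_to_string joins the pieces and turns the "▁" markers back into spaces. A quick sketch against the fixture tokenizer above:

    text = tokenizer.convert_tokens_to_string([u'▁This', u'▁is', u'▁a', u'▁t', u'est'])
    # Sub-pieces are rejoined, so "▁t" + "est" becomes "test".
    assert text == u'This is a test'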
Example #2
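This variant additionally serializes the tokenizer with save_pretrained and runs the test suite's shared tokenizer checks. The helpers come from the shared test module of the older pytorch_transformers layout; the import paths below are assumptions from that era:

    # Assumed helper imports for this older-style variant:
    from pytorch_transformers.tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
    from .tokenization_tests_commons import (create_and_check_tokenizer_commons,
                                             TemporaryDirectory)
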
    def test_full_tokenizer(self):
        tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)

        with TemporaryDirectory() as tmpdirname:
            # Save the vocab files, then run the suite's shared checks, which
            # rebuild the tokenizer from tmpdirname and exercise encode/decode.
            tokenizer.save_pretrained(tmpdirname)

            input_text = u"This is a test"
            output_text = u"This is a test"

            create_and_check_tokenizer_commons(self, input_text, output_text,
                                               XLNetTokenizer, tmpdirname)

            tokens = tokenizer.tokenize(u'This is a test')
            self.assertListEqual(tokens,
                                 [u'▁This', u'▁is', u'▁a', u'▁t', u'est'])

            self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
                                 [285, 46, 10, 170, 382])

            tokens = tokenizer.tokenize(
                u"I was born in 92000, and this is falsé.")
            self.assertListEqual(tokens, [
                SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was',
                SPIECE_UNDERLINE + u'b', u'or', u'n', SPIECE_UNDERLINE + u'in',
                SPIECE_UNDERLINE + u'', u'9', u'2', u'0', u'0', u'0', u',',
                SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
                SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's',
                u'é', u'.'
            ])
            ids = tokenizer.convert_tokens_to_ids(tokens)
            self.assertListEqual(ids, [
                8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46,
                72, 80, 6, 0, 4
            ])

            back_tokens = tokenizer.convert_ids_to_tokens(ids)
            self.assertListEqual(back_tokens, [
                SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was',
                SPIECE_UNDERLINE + u'b', u'or', u'n', SPIECE_UNDERLINE + u'in',
                SPIECE_UNDERLINE + u'', u'<unk>', u'2', u'0', u'0', u'0', u',',
                SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
                SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's',
                u'<unk>', u'.'
            ])
Example #3
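The same test in current Python 3 style: the u'' string prefixes are gone and the expected lists are laid out one element per line, as a formatter such as black would produce. The assertions are identical to Example #1.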
    def test_full_tokenizer(self):
        tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)

        tokens = tokenizer.tokenize("This is a test")
        self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])

        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
                             [285, 46, 10, 170, 382])

        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
        self.assertListEqual(
            tokens,
            [
                SPIECE_UNDERLINE + "I",
                SPIECE_UNDERLINE + "was",
                SPIECE_UNDERLINE + "b",
                "or",
                "n",
                SPIECE_UNDERLINE + "in",
                SPIECE_UNDERLINE + "",
                "9",
                "2",
                "0",
                "0",
                "0",
                ",",
                SPIECE_UNDERLINE + "and",
                SPIECE_UNDERLINE + "this",
                SPIECE_UNDERLINE + "is",
                SPIECE_UNDERLINE + "f",
                "al",
                "s",
                "é",
                ".",
            ],
        )
        ids = tokenizer.convert_tokens_to_ids(tokens)
        self.assertListEqual(ids, [
            8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72,
            80, 6, 0, 4
        ])

        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(
            back_tokens,
            [
                SPIECE_UNDERLINE + "I",
                SPIECE_UNDERLINE + "was",
                SPIECE_UNDERLINE + "b",
                "or",
                "n",
                SPIECE_UNDERLINE + "in",
                SPIECE_UNDERLINE + "",
                "<unk>",
                "2",
                "0",
                "0",
                "0",
                ",",
                SPIECE_UNDERLINE + "and",
                SPIECE_UNDERLINE + "this",
                SPIECE_UNDERLINE + "is",
                SPIECE_UNDERLINE + "f",
                "al",
                "s",
                "<unk>",
                ".",
            ],
        )
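Outside the test fixtures, the same tokenize → ids → tokens round trip can be tried against a released checkpoint. A sketch, assuming the public "xlnet-base-cased" checkpoint (downloading it requires network access):

    from transformers import XLNetTokenizer

    tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")

    tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
    ids = tokenizer.convert_tokens_to_ids(tokens)
    back_tokens = tokenizer.convert_ids_to_tokens(ids)

    # Every piece emitted by tokenize() is an entry of the full SentencePiece
    # vocab, so this round trip is lossless, unlike the tiny SAMPLE_VOCAB
    # fixture above where "9" and "é" fall back to <unk>.
    assert back_tokens == tokens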