Example #1
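These test methods read like excerpts from the DeBERTa-v2 tokenizer tests in Hugging Face transformers. They all depend on module-level scaffolding that the listing omits; a minimal sketch follows, where the testing_utils import, the fixture path, and the class name are assumptions based on the usual transformers test layout, not part of the original listing:

import unittest

from transformers import DebertaV2Tokenizer, DebertaV2TokenizerFast
from transformers.testing_utils import get_tests_dir

# Shared fixture: a small SentencePiece model used by every test below
# (assumed location, following the transformers test-fixture convention).
SAMPLE_VOCAB = get_tests_dir("fixtures/spiece.model")


class DebertaV2TokenizationTest(unittest.TestCase):
    pass  # the methods in the examples below belong in this class body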
    def test_do_lower_case(self):
        # fmt: off
        sequence = " \tHeLLo!how  \n Are yoU?  "
        tokens_target = ["▁hello", "!", "how", "▁are", "▁you", "?"]
        # fmt: on

        # Slow (SentencePiece-backed Python) tokenizer: lowercasing should
        # normalize the mixed case, and the stray whitespace should vanish.
        tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, do_lower_case=True)
        tokens = tokenizer.convert_ids_to_tokens(
            tokenizer.encode(sequence, add_special_tokens=False))

        self.assertListEqual(tokens, tokens_target)

        # The fast (Rust-backed) tokenizer must agree with the slow one.
        rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB,
                                                do_lower_case=True)
        rust_tokens = rust_tokenizer.convert_ids_to_tokens(
            rust_tokenizer.encode(sequence, add_special_tokens=False))

        self.assertListEqual(rust_tokens, tokens_target)
Example #2
    def test_do_lower_case_false_split_by_punct(self):
        # fmt: off
        sequence = "I was born in 92000, and this is falsé."
        tokens_target = [
            "▁",
            "<unk>",
            "▁was",
            "▁born",
            "▁in",
            "▁9",
            "2000",
            "▁",
            ",",
            "▁and",
            "▁this",
            "▁is",
            "▁fal",
            "s",
            "<unk>",
            "▁",
            ".",
        ]
        # fmt: on

        # With lowercasing disabled, "I" and "é" are missing from the sample
        # vocab and map to "<unk>"; split_by_punct isolates "," and "." as
        # standalone pieces.
        tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB,
                                       do_lower_case=False,
                                       split_by_punct=True)
        tokens = tokenizer.convert_ids_to_tokens(
            tokenizer.encode(sequence, add_special_tokens=False))

        self.assertListEqual(tokens, tokens_target)

        rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB,
                                                do_lower_case=False,
                                                split_by_punct=True)
        rust_tokens = rust_tokenizer.convert_ids_to_tokens(
            rust_tokenizer.encode(sequence, add_special_tokens=False))

        self.assertListEqual(rust_tokens, tokens_target)
Example #3
    def test_do_lower_case_false_split_by_punct_false(self):
        # fmt: off
        sequence = " \tHeLLo!how  \n Are yoU?  "
        tokens_target = [
            "▁", "<unk>", "e", "<unk>", "o", "!", "how", "▁", "<unk>", "re",
            "▁yo", "<unk>", "?"
        ]
        # fmt: on

        # Without lowercasing, every uppercase letter falls outside the
        # lowercase-only sample vocab and surfaces as "<unk>".
        tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB,
                                       do_lower_case=False,
                                       split_by_punct=False)
        tokens = tokenizer.convert_ids_to_tokens(
            tokenizer.encode(sequence, add_special_tokens=False))

        self.assertListEqual(tokens, tokens_target)

        rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB,
                                                do_lower_case=False,
                                                split_by_punct=False)
        rust_tokens = rust_tokenizer.convert_ids_to_tokens(
            rust_tokenizer.encode(sequence, add_special_tokens=False))

        self.assertListEqual(rust_tokens, tokens_target)
Example #4
    def test_full_tokenizer(self):
        sequence = "This is a test"
        ids_target = [13, 1, 4398, 25, 21, 1289]
        tokens_target = ["▁", "T", "his", "▁is", "▁a", "▁test"]
        # "T" is out of vocabulary, so it encodes to the <unk> id (1) and
        # cannot be recovered when converting ids back to tokens.
        back_tokens_target = ["▁", "<unk>", "his", "▁is", "▁a", "▁test"]

        tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, keep_accents=True)
        rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB,
                                                keep_accents=True)

        ids = tokenizer.encode(sequence, add_special_tokens=False)
        self.assertListEqual(ids, ids_target)
        tokens = tokenizer.tokenize(sequence)
        self.assertListEqual(tokens, tokens_target)
        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(back_tokens, back_tokens_target)

        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
        self.assertListEqual(rust_ids, ids_target)
        rust_tokens = rust_tokenizer.tokenize(sequence)
        self.assertListEqual(rust_tokens, tokens_target)
        rust_back_tokens = rust_tokenizer.convert_ids_to_tokens(rust_ids)
        self.assertListEqual(rust_back_tokens, back_tokens_target)

        # fmt: off
        sequence = "I was born in 92000, and this is falsé."
        ids_target = [
            13, 1, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9
        ]
        tokens_target = [
            "▁",
            "I",
            "▁was",
            "▁born",
            "▁in",
            "▁9",
            "2000",
            ",",
            "▁and",
            "▁this",
            "▁is",
            "▁fal",
            "s",
            "é",
            ".",
        ]
        back_tokens_target = [
            "▁",
            "<unk>",
            "▁was",
            "▁born",
            "▁in",
            "▁9",
            "2000",
            ",",
            "▁and",
            "▁this",
            "▁is",
            "▁fal",
            "s",
            "<unk>",
            ".",
        ]
        # fmt: on

        ids = tokenizer.encode(sequence, add_special_tokens=False)
        self.assertListEqual(ids, ids_target)
        tokens = tokenizer.tokenize(sequence)
        self.assertListEqual(tokens, tokens_target)
        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(back_tokens, back_tokens_target)

        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
        self.assertListEqual(rust_ids, ids_target)
        rust_tokens = rust_tokenizer.tokenize(sequence)
        self.assertListEqual(rust_tokens, tokens_target)
        rust_back_tokens = rust_tokenizer.convert_ids_to_tokens(rust_ids)
        self.assertListEqual(rust_back_tokens, back_tokens_target)
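For context beyond the test fixtures, a minimal usage sketch with a published checkpoint; the checkpoint name is an assumption (any DeBERTa-v2/v3 checkpoint that ships a SentencePiece vocab should behave the same way):

from transformers import DebertaV2TokenizerFast

# Hypothetical choice of checkpoint, used here only for illustration.
tokenizer = DebertaV2TokenizerFast.from_pretrained("microsoft/deberta-v3-base")

ids = tokenizer.encode("I was born in 92000, and this is falsé.",
                       add_special_tokens=False)
# With a full vocabulary, "I" and "é" tokenize normally instead of
# falling back to "<unk>" as they do with the tiny sample vocab above.
print(tokenizer.convert_ids_to_tokens(ids))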