def test_do_lower_case(self):
    # fmt: off
    sequence = " \tHeLLo!how \n Are yoU? "
    tokens_target = ["▁hello", "!", "how", "▁are", "▁you", "?"]
    # fmt: on

    tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, do_lower_case=True)
    tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False))
    self.assertListEqual(tokens, tokens_target)

    rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, do_lower_case=True)
    rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False))
    self.assertListEqual(rust_tokens, tokens_target)
def test_do_lower_case_false_split_by_punct(self):
    # fmt: off
    sequence = "I was born in 92000, and this is falsé."
    tokens_target = [
        "▁", "<unk>", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",",
        "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "▁", ".",
    ]
    # fmt: on

    tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, do_lower_case=False, split_by_punct=True)
    tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False))
    self.assertListEqual(tokens, tokens_target)

    rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, do_lower_case=False, split_by_punct=True)
    rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False))
    self.assertListEqual(rust_tokens, tokens_target)
def test_do_lower_case_false_split_by_punct_false(self):
    # fmt: off
    sequence = " \tHeLLo!how \n Are yoU? "
    tokens_target = [
        "▁", "<unk>", "e", "<unk>", "o", "!", "how", "▁",
        "<unk>", "re", "▁yo", "<unk>", "?",
    ]
    # fmt: on

    tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, do_lower_case=False, split_by_punct=False)
    tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False))
    self.assertListEqual(tokens, tokens_target)

    rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, do_lower_case=False, split_by_punct=False)
    rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False))
    self.assertListEqual(rust_tokens, tokens_target)
def test_full_tokenizer(self):
    sequence = "This is a test"
    ids_target = [13, 1, 4398, 25, 21, 1289]
    tokens_target = ["▁", "T", "his", "▁is", "▁a", "▁test"]
    back_tokens_target = ["▁", "<unk>", "his", "▁is", "▁a", "▁test"]

    tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, keep_accents=True)
    rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, keep_accents=True)

    ids = tokenizer.encode(sequence, add_special_tokens=False)
    self.assertListEqual(ids, ids_target)
    tokens = tokenizer.tokenize(sequence)
    self.assertListEqual(tokens, tokens_target)
    back_tokens = tokenizer.convert_ids_to_tokens(ids)
    self.assertListEqual(back_tokens, back_tokens_target)

    rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
    self.assertListEqual(rust_ids, ids_target)
    rust_tokens = rust_tokenizer.tokenize(sequence)
    self.assertListEqual(rust_tokens, tokens_target)
    rust_back_tokens = rust_tokenizer.convert_ids_to_tokens(rust_ids)
    self.assertListEqual(rust_back_tokens, back_tokens_target)

    # fmt: off
    sequence = "I was born in 92000, and this is falsé."
    ids_target = [13, 1, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9]
    tokens_target = [
        "▁", "I", "▁was", "▁born", "▁in", "▁9", "2000", ",",
        "▁and", "▁this", "▁is", "▁fal", "s", "é", ".",
    ]
    back_tokens_target = [
        "▁", "<unk>", "▁was", "▁born", "▁in", "▁9", "2000", ",",
        "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", ".",
    ]
    # fmt: on

    ids = tokenizer.encode(sequence, add_special_tokens=False)
    self.assertListEqual(ids, ids_target)
    tokens = tokenizer.tokenize(sequence)
    self.assertListEqual(tokens, tokens_target)
    back_tokens = tokenizer.convert_ids_to_tokens(ids)
    self.assertListEqual(back_tokens, back_tokens_target)

    rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
    self.assertListEqual(rust_ids, ids_target)
    rust_tokens = rust_tokenizer.tokenize(sequence)
    self.assertListEqual(rust_tokens, tokens_target)
    rust_back_tokens = rust_tokenizer.convert_ids_to_tokens(rust_ids)
    self.assertListEqual(rust_back_tokens, back_tokens_target)
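# A minimal sketch, not part of the original suite: the tests above repeat the same
# encode -> convert_ids_to_tokens -> assert pattern once for the slow (Python) and once
# for the fast (Rust-backed) tokenizer. That parity check could be factored into a
# helper like the one below. The name `check_slow_fast_parity` and its signature are
# assumptions for illustration; only APIs already used above (encode,
# convert_ids_to_tokens, assertListEqual) appear in the body.
def check_slow_fast_parity(self, sequence, tokens_target, **tokenizer_kwargs):
    # Build both tokenizer variants from the same sample vocab and kwargs,
    # then assert that each produces the expected token sequence.
    for tokenizer_class in (DebertaV2Tokenizer, DebertaV2TokenizerFast):
        tokenizer = tokenizer_class(SAMPLE_VOCAB, **tokenizer_kwargs)
        tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False))
        self.assertListEqual(tokens, tokens_target)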