def test_tokens_to_ids(self):
    """Check ``tokens_to_ids`` after registering the BERT special tokens.

    The sample text contains one ``[CLS]``, one ``[MASK]`` and two ``[SEP]``
    markers.  The test verifies that one id is produced per token and that
    the id of each special token occurs exactly as often as the marker
    occurs in the text.
    """
    # NOTE(review): a second method with this same name appears later in the
    # visible source; if both live in the same class the later definition
    # shadows this one and this test never runs -- confirm.
    tokenizer = SentencePieceTokenizer("./tests/data/m_common.model")
    special_tokens = nemo_nlp.data.tokenizers.MODEL_SPECIAL_TOKENS['bert']
    tokenizer.add_special_tokens(special_tokens)

    text = "[CLS] a b c [MASK] e f [SEP] g h i [SEP]"
    tokens = tokenizer.text_to_tokens(text)
    ids = tokenizer.tokens_to_ids(tokens)

    # assertEqual (rather than assertTrue(a == b)) reports both values when
    # the check fails.
    self.assertEqual(len(ids), len(tokens))
    self.assertEqual(ids.count(tokenizer.token_to_id("[CLS]")), 1)
    self.assertEqual(ids.count(tokenizer.token_to_id("[MASK]")), 1)
    self.assertEqual(ids.count(tokenizer.token_to_id("[SEP]")), 2)
def test_ids_to_tokens(self):
    """Check that ``ids_to_tokens`` inverts ``tokens_to_ids``.

    With the BERT special tokens registered, the sample text is converted
    to tokens, then to ids, then back to tokens; the round-tripped tokens
    must match the original tokens in length and position by position.
    """
    # NOTE(review): a second method with this same name appears later in the
    # visible source; if both live in the same class the later definition
    # shadows this one and this test never runs -- confirm.
    tokenizer = SentencePieceTokenizer("./tests/data/m_common.model")
    special_tokens = nemo_nlp.data.tokenizers.MODEL_SPECIAL_TOKENS['bert']
    tokenizer.add_special_tokens(special_tokens)

    text = "[CLS] a b c [MASK] e f [SEP] g h i [SEP]"
    tokens = tokenizer.text_to_tokens(text)
    ids = tokenizer.tokens_to_ids(tokens)
    result = tokenizer.ids_to_tokens(ids)

    # The length check comes first so that zip() below cannot silently
    # truncate a mismatch.
    self.assertEqual(len(result), len(tokens))
    for restored, expected in zip(result, tokens):
        self.assertEqual(restored, expected)
def test_tokens_to_ids(self):
    """Check ``tokens_to_ids`` with an explicit list of special tokens.

    ``[CLS]``, ``[MASK]`` and ``[SEP]`` are registered on the tokenizer and
    their ids are read back from ``tokenizer.special_tokens``.  The sample
    text contains one ``[CLS]``, one ``[MASK]`` and two ``[SEP]`` markers;
    the test verifies that one id is produced per token and that each
    special-token id occurs exactly that many times.
    """
    tokenizer = SentencePieceTokenizer("./tests/data/m_common.model")
    special_tokens = ["[CLS]", "[MASK]", "[SEP]"]
    tokenizer.add_special_tokens(special_tokens)

    text = "[CLS] a b c [MASK] e f [SEP] g h i [SEP]"
    tokens = tokenizer.text_to_tokens(text)
    ids = tokenizer.tokens_to_ids(tokens)

    # assertEqual (rather than assertTrue(a == b)) reports both values when
    # the check fails.
    self.assertEqual(len(ids), len(tokens))
    self.assertEqual(ids.count(tokenizer.special_tokens["[CLS]"]), 1)
    self.assertEqual(ids.count(tokenizer.special_tokens["[MASK]"]), 1)
    self.assertEqual(ids.count(tokenizer.special_tokens["[SEP]"]), 2)
def test_ids_to_tokens(self):
    """Check that ``ids_to_tokens`` inverts ``tokens_to_ids``.

    With ``[CLS]``, ``[MASK]`` and ``[SEP]`` registered as special tokens,
    the sample text is converted to tokens, then to ids, then back to
    tokens; the round-tripped tokens must match the original tokens in
    length and position by position.
    """
    tokenizer = SentencePieceTokenizer("./tests/data/m_common.model")
    special_tokens = ["[CLS]", "[MASK]", "[SEP]"]
    tokenizer.add_special_tokens(special_tokens)

    text = "[CLS] a b c [MASK] e f [SEP] g h i [SEP]"
    tokens = tokenizer.text_to_tokens(text)
    ids = tokenizer.tokens_to_ids(tokens)
    result = tokenizer.ids_to_tokens(ids)

    # The length check comes first so that zip() below cannot silently
    # truncate a mismatch.
    self.assertEqual(len(result), len(tokens))
    for restored, expected in zip(result, tokens):
        self.assertEqual(restored, expected)