def test_yield_value(self):
    r"""Return iterator which yield `str`."""
    msg = 'Must return iterator which yield `str`.'
    expected_tokens = ('[bos]', '[eos]', '[pad]', '[unk]')

    # The class method itself must hand back an iterator, not a sequence.
    self.assertIsInstance(
        WhitespaceDictTokenizer.special_tokens(),
        Iterator,
        msg=msg,
    )

    # Materialize once, then check each yielded value by position so a
    # too-short iterator fails loudly (IndexError) rather than passing.
    produced = list(WhitespaceDictTokenizer.special_tokens())
    for position, expected in enumerate(expected_tokens):
        actual = produced[position]
        self.assertIsInstance(actual, str, msg=msg)
        self.assertEqual(actual, expected, msg=msg)
def test_reset_vocab_size(self):
    r"""Reset vocabulary size after `reset_vocab`."""
    msg = 'Must reset vocabulary size after `reset_vocab`.'
    batches = (
        ('HeLlO WoRlD!', 'I aM a LeGeNd.'),
        ('y = f(x)',),
        ('',),
    )

    # After a reset only the special tokens should remain in the vocab.
    baseline = len(list(WhitespaceDictTokenizer.special_tokens()))

    for batch in batches:
        for tok in self.tokenizers:
            tok.build_vocab(batch)
            tok.reset_vocab()
            self.assertEqual(tok.vocab_size, baseline, msg=msg)
def test_increase_vocab_size(self):
    r"""Increase vocabulary size after `build_vocab`.""" 
    msg = 'Must increase vocabulary size after `build_vocab`.'
    # Each case: (batch of sequences, expected cased growth, expected
    # uncased growth). Growth is cumulative because the vocab is never
    # reset between cases here.
    cases = (
        (('Hello World !', 'I am a LEGEND .', 'Hello legend !'), 9, 8),
        (('y = f(x)',), 12, 11),
        (('',), 12, 11),
    )
    baseline = len(list(WhitespaceDictTokenizer.special_tokens()))

    for batch, cased_size, uncased_size in cases:
        self.cased_tokenizer.build_vocab(batch)
        self.assertEqual(
            self.cased_tokenizer.vocab_size,
            cased_size + baseline,
            msg=msg,
        )

        self.uncased_tokenizer.build_vocab(batch)
        self.assertEqual(
            self.uncased_tokenizer.vocab_size,
            uncased_size + baseline,
            msg=msg,
        )
def test_cased_sensitive(self):
    r"""Vocabulary must be case sensitive."""
    msg = 'Vocabulary must be case sensitive.'
    # Each case: (batch with upper/lower duplicates, cased vocab growth,
    # uncased vocab growth). The cased tokenizer keeps both variants.
    cases = (
        (('A B C D', 'a b c d'), 8, 4),
        (('e f g h i', 'E F G H I'), 10, 5),
    )
    baseline = len(list(WhitespaceDictTokenizer.special_tokens()))

    for batch, cased_size, uncased_size in cases:
        # Reset before each case so sizes are absolute, not cumulative.
        self.cased_tokenizer.reset_vocab()
        self.cased_tokenizer.build_vocab(batch_sequences=batch)
        self.assertEqual(
            self.cased_tokenizer.vocab_size,
            cased_size + baseline,
            msg=msg,
        )

        self.uncased_tokenizer.reset_vocab()
        self.uncased_tokenizer.build_vocab(batch_sequences=batch)
        self.assertEqual(
            self.uncased_tokenizer.vocab_size,
            uncased_size + baseline,
            msg=msg,
        )