def test_reset_vocab_size(self):
    r"""Reset vocabulary size after `reset_vocab`."""
    msg = 'Must reset vocabulary size after `reset_vocab`.'
    # Batches of input sequences; content is irrelevant after reset,
    # so empty and symbol-heavy cases are included deliberately.
    examples = (
        ('HeLlO WoRlD!', 'I aM a LeGeNd.'),
        ('y = f(x)',),
        ('',),
    )
    # After a reset, only the special tokens should remain in the vocab.
    num_special = len(list(CharDictTokenizer.special_tokens()))

    for sequences in examples:
        for tokenizer in self.tokenizers:
            tokenizer.build_vocab(sequences)
            tokenizer.reset_vocab()
            self.assertEqual(tokenizer.vocab_size, num_special, msg=msg)
def test_increase_vocab_size(self):
    r"""Increase vocabulary size after `build_vocab`."""
    msg = 'Must increase vocabulary size after `build_vocab`.'
    # Each entry: (batch of sequences, expected distinct chars for the
    # cased tokenizer, expected distinct chars for the uncased one).
    # Later batches build on top of earlier ones, so counts accumulate.
    examples = (
        (('HeLlO WoRlD!', 'I aM a LeGeNd.'), 18, 15),
        (('y = f(x)',), 24, 21),
        (('',), 24, 21),
    )
    # Special tokens are always present on top of the learned characters.
    num_special = len(list(CharDictTokenizer.special_tokens()))

    for sequences, cased_size, uncased_size in examples:
        self.cased_tokenizer.build_vocab(sequences)
        self.assertEqual(
            self.cased_tokenizer.vocab_size,
            cased_size + num_special,
            msg=msg,
        )

        self.uncased_tokenizer.build_vocab(sequences)
        self.assertEqual(
            self.uncased_tokenizer.vocab_size,
            uncased_size + num_special,
            msg=msg,
        )