def setUp(self):
    """Create the shared fixture: a fixed vocab list and an encoder built from it."""
    super(SubwordTextEncoderTest, self).setUp()
    # Id 0 is reserved for padding, so these entries map, in order, to
    # ids 1, 2, 3, 4 and 5.
    vocab = ['foo_', 'bar_', 'foo', 'bar', '<EOS>']
    self.vocab_list = vocab
    self.encoder = subword_text_encoder.SubwordTextEncoder(vocab_list=vocab)
def test_build_with_unicode(self):
    """Build an encoder from the lorem_ipsum_zh_generator corpus and verify it.

    Checks that building created subwords, that every corpus line survives an
    encode/decode round trip, and that lines longer than two characters encode
    to fewer ids than with an encoder built from an empty vocab list.
    """
    encoder = subword_text_encoder.SubwordTextEncoder.build_from_corpus(
        lorem_ipsum_zh_generator(), 300)

    # Some subwords must have been created during the build.
    self.assertGreater(encoder.vocab_size, text_encoder.NUM_BYTES + 1)

    # Baseline for the length comparison: an encoder with an empty vocab list.
    base_encoder = subword_text_encoder.SubwordTextEncoder(vocab_list=[])

    for line in lorem_ipsum_zh_generator():
        encoded = encoder.encode(line)

        # Encoding must be invertible.
        self.assertEqual(line, encoder.decode(encoded))

        # The compression check only applies to lines longer than 2.
        if len(line) <= 2:
            continue
        baseline_ids = base_encoder.encode(line)
        self.assertLess(len(encoded), len(baseline_ids))