def setUp(self):
     """Prepare the fixture shared by the tests: a vocab list and an encoder.

     Stores the subword list on ``self.vocab_list`` and an encoder constructed
     from that same list on ``self.encoder``.
     """
     super(SubwordTextEncoderTest, self).setUp()
     # Vocab ids will be (offset for pad=0):
     #   'foo_' -> 1, 'bar_' -> 2, 'foo' -> 3, 'bar' -> 4, '<EOS>' -> 5
     subwords = ['foo_', 'bar_', 'foo', 'bar', '<EOS>']
     self.vocab_list = subwords
     encoder_cls = subword_text_encoder.SubwordTextEncoder
     self.encoder = encoder_cls(vocab_list=subwords)
    def test_build(self):
        """Check an encoder built from the lorem ipsum corpus.

        Builds an encoder with ``build_from_corpus`` (passing 300 as the
        second argument -- presumably the target vocabulary size; confirm
        against the encoder's signature) and verifies that:

        * its vocabulary is larger than ``text_encoder.NUM_BYTES + 1``,
        * encoding then decoding every corpus line returns the line unchanged,
        * for lines longer than two characters, the encoding is shorter than
          the one produced by an encoder constructed with an empty vocab list.
        """
        encoder_cls = subword_text_encoder.SubwordTextEncoder
        learned = encoder_cls.build_from_corpus(lorem_ipsum_generator(), 300)
        # Created some subwords
        self.assertGreater(learned.vocab_size, text_encoder.NUM_BYTES + 1)

        # Baseline with no subwords in its vocab list.
        baseline = encoder_cls(vocab_list=[])
        for sentence in lorem_ipsum_generator():
            ids = learned.encode(sentence)
            # Invertible
            self.assertEqual(sentence, learned.decode(ids))
            # The length comparison is only made for lines longer than 2.
            if len(sentence) <= 2:
                continue
            # Shorter than base
            self.assertLess(len(ids), len(baseline.encode(sentence)))