Ejemplo n.º 1
0
 def setUp(self):
   """Create the shared vocabulary list and the encoder built from it."""
   super(SubwordTextEncoderTest, self).setUp()
   # Ids are shifted by one because 0 is taken by padding:
   #   'foo_' -> 1, 'bar_' -> 2, 'foo' -> 3, 'bar' -> 4, '<EOS>' -> 5
   vocab = ['foo_', 'bar_', 'foo', 'bar', '<EOS>']
   self.vocab_list = vocab
   self.encoder = subword_text_encoder.SubwordTextEncoder(vocab_list=vocab)
Ejemplo n.º 2
0
  def test_build_with_unicode(self):
    """Check an encoder built from the `lorem_ipsum_zh_generator` corpus.

    The built encoder must have more than `NUM_BYTES + 1` vocabulary entries,
    must round-trip every corpus line through encode/decode, and must encode
    each line longer than two characters into fewer ids than an encoder
    constructed with an empty `vocab_list`.
    """
    corpus = lorem_ipsum_zh_generator
    encoder = subword_text_encoder.SubwordTextEncoder.build_from_corpus(
        corpus(), 300)
    # The build step must have produced at least some subwords.
    self.assertGreater(encoder.vocab_size, text_encoder.NUM_BYTES + 1)

    # Baseline for the length comparison: an encoder with no vocabulary.
    empty_vocab_encoder = subword_text_encoder.SubwordTextEncoder(
        vocab_list=[])
    for line in corpus():
      ids = encoder.encode(line)
      # Encoding must be invertible.
      self.assertEqual(line, encoder.decode(ids))
      # The length comparison is only made for lines longer than 2 characters.
      if len(line) <= 2:
        continue
      baseline_ids = empty_vocab_encoder.encode(line)
      self.assertLess(len(ids), len(baseline_ids))