def testBpeTokenization(self):
  """Checks BPE encoding to ids and decoding back to byte strings.

  Encodes three sentences with the words vocab, then decodes the ids with
  the codes vocab and verifies both the decoded strings and the raw ids.
  """
  words_vocab = test_helper.test_src_dir_path(
      'core/ops/testdata/bpe_words.vocab')
  codes_vocab = test_helper.test_src_dir_path(
      'core/ops/testdata/bpe_codes.vocab')
  input_sentences = [
      'GIVE ME A PENNY', 'THEY LIVED ALONE', 'THEY GIVE ME A PENNY ALONE'
  ]
  # Per the expected data: sentences that fit within maxlen end with an
  # appended '</s> '; the third sentence is truncated at maxlen=15 ids and
  # therefore has no end-of-sentence marker.
  want_sentences = [
      b'GIVE ME A PENNY </s> ',
      b'THEY LIVED ALONE </s> ',
      b'THEY GIVE ME A PENNY ',
  ]
  want_ids = [
      [27, 9, 30, 14, 28, 14, 52, 11, 4, 6, 6, 10, 2, 2, 2],
      [16, 4, 10, 12, 9, 30, 24, 7, 12, 49, 14, 2, 2, 2, 2],
      [16, 4, 10, 27, 9, 30, 14, 28, 14, 52, 11, 4, 6, 6, 10],
  ]
  with self.session(use_gpu=False):
    sentence_tensor = tf.constant(input_sentences)
    _, ids, paddings = ops.bpe_words_to_ids(
        sentence_tensor, tokenization_filepath=words_vocab, maxlen=15)
    # Recover each sequence's non-padded length from the padding indicators.
    lengths = tf.cast(
        tf.round(tf.reduce_sum(1 - paddings, axis=-1)), tf.int32)
    decoded = ops.bpe_ids_to_words(
        ids, seq_lengths=lengths, vocab_filepath=codes_vocab)
    self.assertEqual(want_sentences, decoded.eval().tolist())
    self.assertEqual(want_ids, ids.eval().tolist())
def _StringsToIdsImpl(self, strs, max_length, append_eos, languages):
  """Tokenizes `strs` into BPE ids using the configured words-to-ids vocab.

  Args:
    strs: Tensor of input strings to tokenize.
    max_length: Maximum token-sequence length, forwarded as `maxlen`.
    append_eos: Whether the op appends an end-of-sentence token.
    languages: Unused here; presumably part of the base-class interface —
      TODO(review): confirm against the abstract tokenizer signature.

  Returns:
    Whatever `ops.bpe_words_to_ids` returns (elsewhere in this file it is
    unpacked as a 3-tuple including token ids and paddings).
  """
  vocab_path = self.params.words_to_ids_filepath
  return ops.bpe_words_to_ids(
      strs,
      maxlen=max_length,
      append_eos=append_eos,
      tokenization_filepath=vocab_path)