def testStrToVocabTokenAppendEOSFalse(self):
  """With append_eos=False, sequences are not terminated with the EOS id."""
  vocab_path = test_helper.test_src_dir_path(
      'core/ops/testdata/test_vocab.txt')
  # Mix of plain tokens, special symbols, and non-ASCII tokens.
  inputs = [
      'a b c d e',
      '<epsilon> <S> </S> <UNK>',
      'øut über ♣ 愤青 ←',
  ]
  with self.session(use_gpu=False) as sess:
    ids, labels, pads = sess.run(
        py_x_ops.str_to_vocab_tokens(
            inputs, append_eos=False, maxlen=10, vocab_filepath=vocab_path))
  expected_ids = [[1, 5, 6, 7, 8, 9, 2, 2, 2, 2],
                  [1, 0, 1, 2, 3, 2, 2, 2, 2, 2],
                  [1, 10, 11, 12, 13, 3, 2, 2, 2, 2]]
  expected_labels = [[5, 6, 7, 8, 9, 2, 2, 2, 2, 2],
                     [0, 1, 2, 3, 2, 2, 2, 2, 2, 2],
                     [10, 11, 12, 13, 3, 2, 2, 2, 2, 2]]
  expected_pads = [[0., 0., 0., 0., 0., 1., 1., 1., 1., 1.],
                   [0., 0., 0., 0., 1., 1., 1., 1., 1., 1.],
                   [0., 0., 0., 0., 0., 1., 1., 1., 1., 1.]]
  self.assertEqual(ids.tolist(), expected_ids)
  self.assertEqual(labels.tolist(), expected_labels)
  self.assertEqual(pads.tolist(), expected_pads)
def testStrToVocabTokenTruncates(self):
  """An input far longer than maxlen is truncated to exactly maxlen ids."""
  vocab_path = test_helper.test_src_dir_path(
      'core/ops/testdata/test_vocab.txt')
  # 1000 repetitions guarantees the token stream exceeds maxlen=5.
  long_input = 'a b c d e ' * 1000
  with self.session(use_gpu=False) as sess:
    ids, labels, pads = sess.run(
        py_x_ops.str_to_vocab_tokens(
            [long_input], append_eos=True, maxlen=5,
            vocab_filepath=vocab_path))
  self.assertEqual(ids.tolist(), [[1, 5, 6, 7, 8]])
  self.assertEqual(labels.tolist(), [[5, 6, 7, 8, 9]])
  self.assertEqual(pads.tolist(), [[0., 0., 0., 0., 0.]])
def testStrToVocabTokenSplitToCharacters(self):
  """An empty delimiter splits the input string into individual characters."""
  vocab_path = test_helper.test_src_dir_path(
      'core/ops/testdata/test_vocab.txt')
  with self.session(use_gpu=False) as sess:
    ids, labels, pads = sess.run(
        py_x_ops.str_to_vocab_tokens(
            ['abcde'],
            append_eos=True,
            maxlen=8,
            vocab_filepath=vocab_path,
            delimiter=''))
  self.assertEqual(ids.tolist(), [[1, 5, 6, 7, 8, 9, 2, 2]])
  self.assertEqual(labels.tolist(), [[5, 6, 7, 8, 9, 2, 2, 2]])
  self.assertEqual(pads.tolist(), [[0., 0., 0., 0., 0., 0., 1., 1.]])
def _StringsToIdsImpl(self, strs, max_length, append_eos):
  """Tokenizes strings into vocab ids via the str_to_vocab_tokens op.

  Args:
    strs: A vector of strings to tokenize.
    max_length: Maximum number of ids per sequence (op's `maxlen`).
    append_eos: Whether to append the EOS id to each sequence.

  Returns:
    The result of `py_x_ops.str_to_vocab_tokens` — presumably
    (token_ids, target_ids, paddings); verify against the op's definition.

  Raises:
    NotImplementedError: If only an ngram vocab is configured.
    ValueError: If no vocab filepath is configured at all.
  """
  self._CheckParams()
  p = self.params
  if p.token_vocab_filepath:
    return py_x_ops.str_to_vocab_tokens(
        strs,
        maxlen=max_length,
        append_eos=append_eos,
        vocab_filepath=p.token_vocab_filepath,
        delimiter=p.tokens_delimiter)
  if p.ngram_vocab_filepath:
    raise NotImplementedError('ngram vocab StringsToIds is not supported.')
  # Previously this fell through and implicitly returned None, which would
  # surface later as a confusing unpack/attribute error. Fail loudly here.
  raise ValueError(
      'Neither token_vocab_filepath nor ngram_vocab_filepath is set.')