def test_tokenize_detokenize(self):
    """Round-trips one sentence through tokenize and detokenize.

    The same sentence is tokenized with the 'char', 'sentencepiece' and
    'subword' vocab types. For each, the first tokenized example must
    equal the expected ids, and detokenizing it must give the sentence back.
    """
    sentence = 'I have a cat.'

    def dataset():
        yield sentence

    def as_is(ids):
        return ids

    # Each row: (vocab keyword arguments shared by tokenize and detokenize,
    #            expected token ids,
    #            conversion applied to the ids before detokenizing).
    cases = (
        # Character-level.
        (
            dict(vocab_type='char'),
            [ord(ch) for ch in sentence],
            as_is,
        ),
        # Sentencepiece: the ids are handed to detokenize as a plain list.
        (
            dict(vocab_type='sentencepiece',
                 vocab_dir=_TESTDATA,
                 vocab_file='sentencepiece.model'),
            [27, 43, 3, 9, 1712, 5],
            list,
        ),
        # Subword.
        (
            dict(vocab_type='subword',
                 vocab_dir=_TESTDATA,
                 vocab_file='en_8k.subword'),
            [139, 96, 12, 2217, 2, 21],
            as_is,
        ),
    )

    for vocab_kwargs, expected_ids, prepare in cases:
        tokenized = list(tf_inputs.tokenize(dataset(), **vocab_kwargs))
        self.assertAllEqual(tokenized[0], np.array(expected_ids))
        restored = tf_inputs.detokenize(prepare(tokenized[0]), **vocab_kwargs)
        self.assertEqual(restored, sentence)
def test_tokenize_dict(self):
    """Checks char tokenization of dict examples, with and without `keys`.

    Without `keys`, both the 'a' and 'b' entries are expected as arrays of
    character codes. With keys=['a'], only 'a' is expected to be tokenized
    and 'b' is expected to remain the original string.
    """
    def dataset():
        # A fresh dict per call, so the two tokenize runs never share one.
        yield {'a': 'Cat.', 'b': 'Dog.'}

    def char_codes(text):
        return np.array([ord(ch) for ch in text])

    # No `keys` argument.
    every_field = list(tf_inputs.tokenize(dataset(), vocab_type='char'))
    example = every_field[0]
    self.assertAllEqual(example['a'], char_codes('Cat.'))
    self.assertAllEqual(example['b'], char_codes('Dog.'))

    # Restricted to key 'a'.
    only_a = list(
        tf_inputs.tokenize(dataset(), keys=['a'], vocab_type='char'))
    example = only_a[0]
    self.assertAllEqual(example['a'], char_codes('Cat.'))
    self.assertEqual(example['b'], 'Dog.')
def test_tokenize_keys_reservedids(self):
    """Checks char tokenization of tuple examples with `n_reserved_ids`.

    Every expected id is the character code plus `n_reserved_ids`. With
    keys=[0], only element 0 is expected to be tokenized and element 1 is
    expected to remain the original string.
    """
    def dataset():
        yield ('Cat.', 'Dog.')

    def offset_codes(text, offset):
        return np.array([ord(ch) + offset for ch in text])

    # Both tuple elements tokenized, ids offset by 5.
    both_fields = list(
        tf_inputs.tokenize(dataset(), vocab_type='char', n_reserved_ids=5))
    example = both_fields[0]
    self.assertAllEqual(example[0], offset_codes('Cat.', 5))
    self.assertAllEqual(example[1], offset_codes('Dog.', 5))

    # Only element 0 tokenized, ids offset by 2.
    first_only = list(
        tf_inputs.tokenize(
            dataset(), keys=[0], vocab_type='char', n_reserved_ids=2))
    example = first_only[0]
    self.assertAllEqual(example[0], offset_codes('Cat.', 2))
    self.assertEqual(example[1], 'Dog.')