Example 1
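This test round-trips a one-sentence dataset through tf_inputs.tokenize and tf_inputs.detokenize for three vocabulary types: character-level (each token id is the character's code point), SentencePiece, and subword, the latter two loading their vocabulary files from the test-data directory.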
  def test_tokenize_detokenize(self):
    def dataset():
      yield 'I have a cat.'

    # Character-level.
    tok_char = list(tf_inputs.tokenize(dataset(), vocab_type='char'))
    self.assertAllEqual(tok_char[0],
                        np.array([ord(c) for c in 'I have a cat.']))
    detok = tf_inputs.detokenize(tok_char[0], vocab_type='char')
    self.assertEqual(detok, 'I have a cat.')

    # Sentencepiece.
    tok_spc = list(tf_inputs.tokenize(
        dataset(), vocab_type='sentencepiece',
        vocab_dir=_TESTDATA, vocab_file='sentencepiece.model'))
    self.assertAllEqual(tok_spc[0], np.array([27, 43, 3, 9, 1712, 5]))
    detok = tf_inputs.detokenize(
        list(tok_spc[0]), vocab_type='sentencepiece',
        vocab_dir=_TESTDATA, vocab_file='sentencepiece.model')
    self.assertEqual(detok, 'I have a cat.')

    # Subword.
    tok_sbw = list(tf_inputs.tokenize(
        dataset(), vocab_type='subword',
        vocab_dir=_TESTDATA, vocab_file='en_8k.subword'))
    self.assertAllEqual(tok_sbw[0], np.array([139, 96, 12, 2217, 2, 21]))
    detok = tf_inputs.detokenize(
        tok_sbw[0], vocab_type='subword',
        vocab_dir=_TESTDATA, vocab_file='en_8k.subword')
    self.assertEqual(detok, 'I have a cat.')
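A minimal standalone sketch of the character-level round trip exercised above, assuming tokenize and detokenize are importable from trax.data.tf_inputs as in this test file:

from trax.data import tf_inputs

def dataset():
  yield 'I have a cat.'

# Character-level tokenization: each character becomes its code point.
tokens = list(tf_inputs.tokenize(dataset(), vocab_type='char'))[0]
text = tf_inputs.detokenize(tokens, vocab_type='char')
assert text == 'I have a cat.'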
Example 2
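This test feeds dictionary examples through tokenize: by default every string field is tokenized, while passing keys=['a'] restricts tokenization to that field and leaves 'b' as the raw string.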
  def test_tokenize_dict(self):
    def dataset():
      yield {'a': 'Cat.', 'b': 'Dog.'}

    tok_char1 = list(tf_inputs.tokenize(dataset(), vocab_type='char'))
    self.assertAllEqual(tok_char1[0]['a'], np.array([ord(c) for c in 'Cat.']))
    self.assertAllEqual(tok_char1[0]['b'], np.array([ord(c) for c in 'Dog.']))

    tok_char2 = list(tf_inputs.tokenize(dataset(), keys=['a'],
                                        vocab_type='char'))
    self.assertAllEqual(tok_char2[0]['a'], np.array([ord(c) for c in 'Cat.']))
    self.assertEqual(tok_char2[0]['b'], 'Dog.')
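A minimal sketch of the same keys filter outside the test harness, again assuming trax.data.tf_inputs is importable:

from trax.data import tf_inputs

def dataset():
  yield {'a': 'Cat.', 'b': 'Dog.'}

# Only the 'a' field is tokenized; 'b' passes through unchanged.
example = list(tf_inputs.tokenize(dataset(), keys=['a'], vocab_type='char'))[0]
assert list(example['a']) == [ord(c) for c in 'Cat.']
assert example['b'] == 'Dog.'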
Example 3
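This test covers tuple examples together with n_reserved_ids: every character id is shifted up by the number of reserved ids, and keys=[0] again limits tokenization to the first element of the tuple.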
  def test_tokenize_keys_reservedids(self):
    def dataset():
      yield ('Cat.', 'Dog.')

    tok_char1 = list(tf_inputs.tokenize(
        dataset(), vocab_type='char', n_reserved_ids=5))
    self.assertAllEqual(tok_char1[0][0], np.array([ord(c) + 5 for c in 'Cat.']))
    self.assertAllEqual(tok_char1[0][1], np.array([ord(c) + 5 for c in 'Dog.']))

    tok_char2 = list(tf_inputs.tokenize(
        dataset(), keys=[0], vocab_type='char', n_reserved_ids=2))
    self.assertAllEqual(tok_char2[0][0], np.array([ord(c) + 2 for c in 'Cat.']))
    self.assertEqual(tok_char2[0][1], 'Dog.')
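A minimal sketch of the id offset, assuming trax.data.tf_inputs is importable; the lowest n_reserved_ids values stay unused, which in typical pipelines leaves room for special tokens such as padding, though the exact assignment depends on the surrounding setup:

from trax.data import tf_inputs

def dataset():
  yield ('Cat.', 'Dog.')

# With 5 reserved ids, each character's code point is shifted up by 5.
first = list(tf_inputs.tokenize(dataset(), vocab_type='char', n_reserved_ids=5))[0]
assert list(first[0]) == [ord(c) + 5 for c in 'Cat.']
assert list(first[1]) == [ord(c) + 5 for c in 'Dog.']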