Example #1
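This test round-trips a batch of sentences through the BPE ops: bpe_words_to_ids encodes each sentence into token ids (capped at maxlen=15), and bpe_ids_to_words decodes those ids back into text.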
  def testBpeTokenization(self):
    word_vocab = test_helper.test_src_dir_path(
        'core/ops/testdata/bpe_words.vocab')
    code_vocab = test_helper.test_src_dir_path(
        'core/ops/testdata/bpe_codes.vocab')
    sentences = [
        'GIVE ME A PENNY', 'THEY LIVED ALONE', 'THEY GIVE ME A PENNY ALONE'
    ]
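    # Expected round-trip strings. The third sentence already needs all 15
    # token slots, so it is truncated at maxlen and loses its </s>.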
    expected_sentences = [
        'GIVE ME A PENNY </s> ',
        'THEY LIVED ALONE </s> ',
        'THEY GIVE ME A PENNY ',
    ]
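    # Expected token ids; every row is exactly maxlen=15 entries long.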
    expected_token_ids = [
        [27, 9, 30, 14, 28, 14, 52, 11, 4, 6, 6, 10, 2, 2, 2],
        [16, 4, 10, 12, 9, 30, 24, 7, 12, 49, 14, 2, 2, 2, 2],
        [16, 4, 10, 27, 9, 30, 14, 28, 14, 52, 11, 4, 6, 6, 10],
    ]
    with self.session(use_gpu=False):
      label_tensor = tf.constant(sentences)
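      # Encode the sentences. The op also returns a paddings mask that is
      # nonzero at positions past the end of each sequence.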
      _, token_ids, paddings = py_x_ops.bpe_words_to_ids(
          label_tensor, tokenization_filepath=word_vocab, maxlen=15)
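      # A sequence's length is the number of non-padded positions.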
      seq_lens = tf.cast(tf.reduce_sum(1 - paddings, axis=-1), tf.int32)

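      # Decode the ids back into words and verify the round trip.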
      target_string = py_x_ops.bpe_ids_to_words(
          token_ids, seq_lengths=seq_lens, vocab_filepath=code_vocab)
      self.assertEqual(expected_sentences, target_string.eval().tolist())
      self.assertEqual(expected_token_ids, token_ids.eval().tolist())
Example #2
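Here the same encoding op backs a tokenizer class's _StringsToIdsImpl method; instead of a hard-coded path, the words-to-ids vocab file is read from self.params.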
    def _StringsToIdsImpl(self, strs, max_length, append_eos):
        p = self.params

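        # Delegate to the custom op; p.words_to_ids_filepath supplies the
        # BPE vocab, and append_eos controls whether </s> is added.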
        return py_x_ops.bpe_words_to_ids(
            strs,
            maxlen=max_length,
            append_eos=append_eos,
            tokenization_filepath=p.words_to_ids_filepath)
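For reference, the two ops can also be combined outside a test harness. The sketch below is a minimal, hypothetical usage example: the lingvo.core.ops import path and the vocab file locations are assumptions that may need adjusting for your checkout; the op signatures follow the examples above.

import tensorflow as tf
from lingvo.core.ops import py_x_ops  # assumed import path for the custom ops

# Hypothetical vocab paths; point these at real BPE vocab files.
WORD_VOCAB = 'testdata/bpe_words.vocab'
CODE_VOCAB = 'testdata/bpe_codes.vocab'

tf.compat.v1.disable_eager_execution()  # the ops above are used in graph mode
with tf.compat.v1.Session() as sess:
  sentences = tf.constant(['GIVE ME A PENNY'])
  # Encode; the op returns a 3-tuple whose second and third elements are the
  # token ids and the paddings mask (1.0 at padded positions).
  _, token_ids, paddings = py_x_ops.bpe_words_to_ids(
      sentences, tokenization_filepath=WORD_VOCAB, maxlen=15)
  # A sequence's length is the number of non-padded positions.
  seq_lens = tf.cast(tf.reduce_sum(1 - paddings, axis=-1), tf.int32)
  # Decode the ids back into words.
  words = py_x_ops.bpe_ids_to_words(
      token_ids, seq_lengths=seq_lens, vocab_filepath=CODE_VOCAB)
  print(sess.run(words))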