Example #1
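A unit test that round-trips a small batch of sentences through BPE: ops.bpe_words_to_ids encodes each sentence into fixed-length token ids plus paddings, and ops.bpe_ids_to_words decodes those ids back into the expected byte strings.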
    def testBpeTokenization(self):
        word_vocab = test_helper.test_src_dir_path(
            'core/ops/testdata/bpe_words.vocab')
        code_vocab = test_helper.test_src_dir_path(
            'core/ops/testdata/bpe_codes.vocab')
        sentences = [
            'GIVE ME A PENNY', 'THEY LIVED ALONE', 'THEY GIVE ME A PENNY ALONE'
        ]
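        # maxlen=15 truncates the third sentence ('ALONE' is dropped), so its
        # decoded form below lacks the trailing '</s>' end-of-sentence marker.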
        expected_sentences = [
            b'GIVE ME A PENNY </s> ',
            b'THEY LIVED ALONE </s> ',
            b'THEY GIVE ME A PENNY ',
        ]
        expected_token_ids = [
            [27, 9, 30, 14, 28, 14, 52, 11, 4, 6, 6, 10, 2, 2, 2],
            [16, 4, 10, 12, 9, 30, 24, 7, 12, 49, 14, 2, 2, 2, 2],
            [16, 4, 10, 27, 9, 30, 14, 28, 14, 52, 11, 4, 6, 6, 10],
        ]
        with self.session(use_gpu=False):
            label_tensor = tf.constant(sentences)
            _, token_ids, paddings = ops.bpe_words_to_ids(
                label_tensor, tokenization_filepath=word_vocab, maxlen=15)
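            # Paddings are 1.0 past each sequence's end, so summing
            # (1 - paddings) along the time axis recovers the true token count.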
            seq_lens = tf.cast(tf.round(tf.reduce_sum(1 - paddings, axis=-1)),
                               tf.int32)

            # Round-trip: decode the token ids back into byte strings.
            target_string = ops.bpe_ids_to_words(token_ids,
                                                 seq_lengths=seq_lens,
                                                 vocab_filepath=code_vocab)
            self.assertEqual(expected_sentences, target_string.eval().tolist())
            self.assertEqual(expected_token_ids, token_ids.eval().tolist())
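
The seq_lens computation above is plain arithmetic on the paddings tensor; a minimal NumPy sketch of the same reduction, on toy values rather than the test's real data:

import numpy as np

# 0.0 marks a real token, 1.0 marks padding past the end of the sequence.
paddings = np.array([[0., 0., 0., 1., 1.],
                     [0., 0., 0., 0., 0.]])
seq_lens = np.round(np.sum(1.0 - paddings, axis=-1)).astype(np.int32)
print(seq_lens)  # -> [3 5]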
Example #2
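A tokenizer helper that delegates to the same op, taking the vocabulary path from the layer's params; the languages argument is accepted for interface compatibility but unused in the body.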
    def _StringsToIdsImpl(self, strs, max_length, append_eos, languages):
        p = self.params

        return ops.bpe_words_to_ids(
            strs,
            maxlen=max_length,
            append_eos=append_eos,
            tokenization_filepath=p.words_to_ids_filepath)
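
For completeness, a sketch of wiring this method into a minimal class, assuming the lingvo custom ops can be imported (the snippets above only show the alias 'ops') and that the vocab file from Example #1 is on disk; SimpleBpeTokenizer, its _Params holder, and the import path are illustrative stand-ins, not the library's actual tokenizer:

import tensorflow as tf
from lingvo.core import ops  # assumed import path; only the alias 'ops' is shown above


class _Params:
    # Hypothetical stand-in for the real params object ('p' in Example #2).
    words_to_ids_filepath = 'core/ops/testdata/bpe_words.vocab'


class SimpleBpeTokenizer:
    # Illustrative wrapper, not the library's actual tokenizer class.
    params = _Params()

    def _StringsToIdsImpl(self, strs, max_length, append_eos, languages):
        p = self.params
        return ops.bpe_words_to_ids(
            strs,
            maxlen=max_length,
            append_eos=append_eos,
            tokenization_filepath=p.words_to_ids_filepath)


tok = SimpleBpeTokenizer()
# Per Example #1 the op yields three tensors, unpacked there as
# (_, token_ids, paddings).
outputs = tok._StringsToIdsImpl(
    tf.constant(['GIVE ME A PENNY']),
    max_length=15,
    append_eos=True,
    languages=None)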