def testSampleEncodeAndDecode(self):
        """Round-trip the expected sentences through sampled encoding.

        For every (nbest_size, alpha) setting the sentences are encoded both
        to string pieces and to ids, decoded again, and compared with the
        original sentences.
        """
        model_path = self._getSentencePieceModelFile()
        sp = spm.SentencePieceProcessor()
        sp.Load(model_path)
        sentences, _pieces, _ids, _seq_len = self._getExpected(sp)

        sampling_settings = [(-1, 0.1), (64, 0.1), (0, 0.0)]
        with tf.Session():
            for nbest, smoothing in sampling_settings:
                # Arguments shared by both encode calls of this setting.
                sample_kwargs = dict(
                    nbest_size=tf.constant(nbest),
                    alpha=tf.constant(smoothing),
                    model_file=model_path)
                text = tf.constant(sentences)

                pieces, pieces_len = tfspm.encode(
                    text, out_type=tf.string, **sample_kwargs)
                ids, ids_len = tfspm.encode(text, **sample_kwargs)

                from_pieces = tfspm.decode(
                    pieces, pieces_len, model_file=model_path)
                from_ids = tfspm.decode(ids, ids_len, model_file=model_path)

                # Whatever segmentation was sampled, decoding must give back
                # the input sentences.
                for decoded in (from_pieces, from_ids):
                    self.assertEqual(decoded.eval().tolist(), sentences)
  def testInvalidInput(self):
    """Check which argument shapes encode/decode accept and reject.

    The first half runs encode with scalar and per-sentence `alpha` /
    `nbest_size` and decode with a per-row `sequence_length`; none of these
    is expected to raise. The second half expects a ValueError for
    `alpha` / `nbest_size` / `sequence_length` values whose shape does not
    match the two-element batch.
    """
    sentences = ['Hello world.', 'This is a test.']
    ids = [[0, 1], [2, 3]]
    model_file = self._getSentencePieceModelFile()
    with tf.Session() as sess:
      a = tf.constant(sentences)
      b = tf.constant(ids)

      # --- Accepted shapes -------------------------------------------------
      alpha = tf.constant([1.0, 2.0])
      sess.run(tfspm.encode(
          a, model_file=model_file, alpha=alpha, name='foo'))

      nbest_size = tf.constant([1, 2], dtype=tf.int32)
      sess.run(tfspm.encode(
          a, model_file=model_file, nbest_size=nbest_size, name='foo'))

      alpha = tf.constant(1.0)
      sess.run(tfspm.encode(
          a, model_file=model_file, alpha=alpha, name='foo'))

      nbest_size = tf.constant(10, dtype=tf.int32)
      sess.run(tfspm.encode(
          a, model_file=model_file, nbest_size=nbest_size, name='foo'))

      sess.run(tfspm.decode(
          b, sequence_length=tf.constant([2, 2]), model_file=model_file))

      # --- Rejected shapes -------------------------------------------------
      # Three values for a batch of two sentences.
      with self.assertRaises(ValueError):
        alpha = tf.constant([1.0, 2.0, 3.0])
        sess.run(tfspm.encode(
            a, model_file=model_file, alpha=alpha))
      with self.assertRaises(ValueError):
        nbest_size = tf.constant([1, 2, 3], dtype=tf.int32)
        sess.run(tfspm.encode(
            a, model_file=model_file, nbest_size=nbest_size))

      # Two-dimensional values.
      with self.assertRaises(ValueError):
        alpha = tf.constant([[1.0], [2.0]])
        sess.run(tfspm.encode(
            a, model_file=model_file, alpha=alpha))
      with self.assertRaises(ValueError):
        nbest_size = tf.constant([[1], [2]], dtype=tf.int32)
        sess.run(tfspm.encode(
            a, model_file=model_file, nbest_size=nbest_size))

      # Decode: use the same id matrix as the valid decode call above so
      # that `sequence_length` is the only argument that differs from it.
      with self.assertRaises(ValueError):
        sess.run(tfspm.decode(
            b, sequence_length=2, model_file=model_file))
      with self.assertRaises(ValueError):
        sess.run(tfspm.decode(
            b, sequence_length=tf.constant([2, 2, 2]),
            model_file=model_file))
    def testEncodeAndDecode(self):
        """Compare encode/decode output with the processor's expectations.

        Runs through every combination of the `reverse`, `add_bos` and
        `add_eos` flags. For each one, the pieces, ids and sequence lengths
        produced by `tfspm.encode` are compared with the values returned by
        `_getExpected`, and `tfspm.decode` of those expected values is
        compared with the original sentences.
        """
        model_path = self._getSentencePieceModelFile()
        sp = spm.SentencePieceProcessor()
        sp.Load(model_path)

        flag_values = (True, False)
        with tf.Session():
            for reverse, add_bos, add_eos in it.product(flag_values, repeat=3):
                (sentences, expected_pieces, expected_ids,
                 expected_seq_len) = self._getExpected(
                     sp, reverse, add_bos, add_eos)

                # Sentences -> pieces / ids.
                encode_kwargs = dict(
                    model_file=model_path,
                    reverse=reverse,
                    add_bos=add_bos,
                    add_eos=add_eos)
                text = tf.constant(sentences)
                pieces, pieces_len = tfspm.encode(
                    text, out_type=tf.string, **encode_kwargs)
                ids, ids_len = tfspm.encode(text, **encode_kwargs)

                self.assertEqual(pieces.eval().tolist(), expected_pieces)
                self.assertEqual(ids.eval().tolist(), expected_ids)
                self.assertEqual(pieces_len.eval().tolist(), expected_seq_len)
                self.assertEqual(ids_len.eval().tolist(), expected_seq_len)

                # Expected pieces / ids -> sentences.
                piece_input = tf.constant(expected_pieces)
                id_input = tf.constant(expected_ids)
                lengths = tf.constant(expected_seq_len, dtype=tf.int32)
                from_pieces = tfspm.decode(
                    piece_input, lengths, model_file=model_path,
                    reverse=reverse)
                from_ids = tfspm.decode(
                    id_input, lengths, model_file=model_path,
                    reverse=reverse)

                for decoded in (from_pieces, from_ids):
                    self.assertEqual(decoded.eval().tolist(), sentences)
  def testEncodeAndDecode(self):
    """Compare encode/decode output with the expected values.

    Runs through every combination of the `reverse`, `add_bos` and `add_eos`
    flags. For each one, the results of `tfspm.encode` are compared with the
    pieces, ids and sequence lengths returned by `_getExpected`, and
    `tfspm.decode` of those expected values is compared with the sentences.
    """
    model_path = self._getSentencePieceModelFile()

    flag_values = (True, False)
    with tf.Session():
      for reverse, add_bos, add_eos in it.product(flag_values, repeat=3):
        (sentences, expected_pieces,
         expected_ids, expected_seq_len) = self._getExpected(
             reverse=reverse, add_bos=add_bos, add_eos=add_eos)

        # Sentences -> pieces / ids.
        encode_kwargs = dict(
            model_file=model_path,
            reverse=reverse, add_bos=add_bos, add_eos=add_eos)
        text = tf.constant(sentences)
        pieces, pieces_len = tfspm.encode(
            text, out_type=tf.string, **encode_kwargs)
        ids, ids_len = tfspm.encode(text, **encode_kwargs)

        self.assertEqual(pieces.eval().tolist(), expected_pieces)
        self.assertEqual(ids.eval().tolist(), expected_ids)
        self.assertEqual(pieces_len.eval().tolist(), expected_seq_len)
        self.assertEqual(ids_len.eval().tolist(), expected_seq_len)

        # Expected pieces / ids -> sentences.
        piece_input = tf.constant(expected_pieces)
        id_input = tf.constant(expected_ids)
        lengths = tf.constant(expected_seq_len, dtype=tf.int32)
        from_pieces = tfspm.decode(
            piece_input, lengths, model_file=model_path, reverse=reverse)
        from_ids = tfspm.decode(
            id_input, lengths, model_file=model_path, reverse=reverse)

        for decoded in (from_pieces, from_ids):
          self.assertEqual(decoded.eval().tolist(), sentences)
  def testSampleEncodeAndDecode(self):
    """Round-trip the expected sentences through sampled encoding.

    For every (nbest_size, alpha) setting the sentences are encoded both to
    string pieces and to ids, decoded again, and compared with the original
    sentences.
    """
    model_path = self._getSentencePieceModelFile()
    sentences, _pieces, _ids, _seq_len = self._getExpected()

    sampling_settings = [(-1, 0.1), (64, 0.1), (0, 0.0)]
    with tf.Session():
      for nbest, smoothing in sampling_settings:
        # Arguments shared by both encode calls of this setting.
        sample_kwargs = dict(
            nbest_size=tf.constant(nbest),
            alpha=tf.constant(smoothing),
            model_file=model_path)
        text = tf.constant(sentences)

        pieces, pieces_len = tfspm.encode(
            text, out_type=tf.string, **sample_kwargs)
        ids, ids_len = tfspm.encode(text, **sample_kwargs)

        from_pieces = tfspm.decode(pieces, pieces_len, model_file=model_path)
        from_ids = tfspm.decode(ids, ids_len, model_file=model_path)

        # Whatever segmentation was sampled, decoding must give back the
        # input sentences.
        for decoded in (from_pieces, from_ids):
          self.assertEqual(decoded.eval().tolist(), sentences)
    def testInvalidInput(self):
        """Check which argument shapes encode/decode accept and reject.

        The first half runs encode with scalar and per-sentence `alpha` /
        `nbest_size` and decode with a per-row `sequence_length`; none of
        these is expected to raise. The second half expects a ValueError
        for `alpha` / `nbest_size` / `sequence_length` values whose shape
        does not match the two-element batch.
        """
        sentences = ['Hello world.', 'This is a test.']
        ids = [[0, 1], [2, 3]]
        model_file = self._getSentencePieceModelFile()
        with tf.Session() as sess:
            a = tf.constant(sentences)
            b = tf.constant(ids)

            # --- Accepted shapes -------------------------------------------
            alpha = tf.constant([1.0, 2.0])
            sess.run(
                tfspm.encode(a, model_file=model_file, alpha=alpha,
                             name='foo'))

            nbest_size = tf.constant([1, 2], dtype=tf.int32)
            sess.run(
                tfspm.encode(a,
                             model_file=model_file,
                             nbest_size=nbest_size,
                             name='foo'))

            alpha = tf.constant(1.0)
            sess.run(
                tfspm.encode(a, model_file=model_file, alpha=alpha,
                             name='foo'))

            nbest_size = tf.constant(10, dtype=tf.int32)
            sess.run(
                tfspm.encode(a,
                             model_file=model_file,
                             nbest_size=nbest_size,
                             name='foo'))

            sess.run(
                tfspm.decode(b,
                             sequence_length=tf.constant([2, 2]),
                             model_file=model_file))

            # --- Rejected shapes -------------------------------------------
            # Three values for a batch of two sentences.
            with self.assertRaises(ValueError):
                alpha = tf.constant([1.0, 2.0, 3.0])
                sess.run(tfspm.encode(a, model_file=model_file, alpha=alpha))
            with self.assertRaises(ValueError):
                nbest_size = tf.constant([1, 2, 3], dtype=tf.int32)
                sess.run(
                    tfspm.encode(a,
                                 model_file=model_file,
                                 nbest_size=nbest_size))

            # Two-dimensional values.
            with self.assertRaises(ValueError):
                alpha = tf.constant([[1.0], [2.0]])
                sess.run(tfspm.encode(a, model_file=model_file, alpha=alpha))
            with self.assertRaises(ValueError):
                nbest_size = tf.constant([[1], [2]], dtype=tf.int32)
                sess.run(
                    tfspm.encode(a,
                                 model_file=model_file,
                                 nbest_size=nbest_size))

            # Decode: use the same id matrix as the valid decode call above
            # so that `sequence_length` is the only argument that differs
            # from it.
            with self.assertRaises(ValueError):
                sess.run(
                    tfspm.decode(b, sequence_length=2, model_file=model_file))
            with self.assertRaises(ValueError):
                sess.run(
                    tfspm.decode(b,
                                 sequence_length=tf.constant([2, 2, 2]),
                                 model_file=model_file))
Esempio n. 7
0
def predict_batch(sess,
                  src,
                  model,
                  src_model_file,
                  tar_model_file,
                  src_offset,
                  tar_offset,
                  srcf,
                  tarf,
                  vocab_size,
                  single_vocab_size=8192,
                  batch_size=60):
    """Run ``predict`` over ``src`` batch by batch and decode the results.

    Each slice of at most ``batch_size`` source strings is encoded with
    ``tfs.encode`` (BOS and EOS added), shifted by ``src_offset``, passed to
    ``predict`` (beam size 5, alpha 1.0), shifted back by ``tar_offset`` and
    decoded with ``tfs.decode``. The trailing slice may be shorter than
    ``batch_size``; it is processed as well.

    Args:
        sess: tf.Session used to evaluate the prediction and decode ops.
        src: list of strings.
        model: tf.keras.Model, forwarded to ``predict``.
        src_model_file: model file given to ``tfs.encode``.
        tar_model_file: model file given to ``tfs.decode``.
        src_offset: when > 0, added to every non-zero encoded source id.
        tar_offset: when > 0, subtracted from every non-zero predicted id.
            Also sets ``bos_id`` (``tar_offset + 1``) and ``eos_id``
            (``tar_offset + 2``) for ``predict``.
        srcf: wrapped in ``tf.constant`` and forwarded to ``predict`` as
            ``inpf``.
        tarf: wrapped in ``tf.constant`` and forwarded to ``predict`` as
            ``tarf``.
        vocab_size: forwarded to ``predict``.
        single_vocab_size: ids outside the open range
            ``(0, single_vocab_size)`` are replaced by 0 before decoding.
        batch_size: maximum number of source strings per ``predict`` call.

    Returns:
        A list with one ``tfs.decode`` result per entry along the first
        axis of the predicted ids, in input order.
    """
    ans = []

    # Step through the whole list so that the last, possibly shorter, batch
    # is predicted too instead of being silently dropped.
    for batch_index, start in enumerate(range(0, len(src), batch_size)):
        print(batch_index)

        inp = src[start:start + batch_size]

        a = tfs.encode(inp,
                       model_file=src_model_file,
                       add_bos=True,
                       add_eos=True)[0]

        if src_offset > 0:
            # Shift only real tokens; id 0 is left untouched.
            a_mask = tf.cast(tf.not_equal(a, 0), tf.int32) * src_offset
            a = a + a_mask

        # The second return value of predict is not needed here.
        ids, _ = predict(
            model=model,
            inputs=a,
            inpf=tf.constant(srcf),
            tarf=tf.constant(tarf),
            bos_id=tar_offset + 1,
            eos_id=tar_offset + 2,
            beam_size=5,
            vocab_size=vocab_size,
            alpha=1.0,
        )

        # Sequence length = number of non-zero ids along the last axis.
        mask = tf.cast(tf.not_equal(ids, 0), tf.int32)
        seq_len = tf.reduce_sum(mask, axis=-1)

        if tar_offset > 0:
            ids = ids + mask * -tar_offset

        ids_, seq_len_ = sess.run([ids, seq_len])

        for cids, cseqlen in zip(ids_, seq_len_):
            # Zero out every id that is not strictly between 0 and
            # single_vocab_size before handing the ids to the decoder.
            fids = tf.cast(
                tf.logical_and(tf.greater(cids, 0),
                               tf.less(cids, single_vocab_size)),
                tf.int32) * cids
            decoded = sess.run(
                tfs.decode(fids, cseqlen, model_file=tar_model_file))
            ans.append(decoded)

    return ans