Example #1
  def print_prediction(self, prediction, params=None, stream=None):
    n_best = params and params.get("n_best")
    n_best = n_best or 1

    if n_best > len(prediction["tokens"]):
      raise ValueError("n_best cannot be greater than beam_width")

    for i in range(n_best):
      target_length = prediction["length"][i] - 1  # Ignore </s>.
      tokens = prediction["tokens"][i][:target_length]
      sentence = self.target_inputter.tokenizer.detokenize(tokens)
      score = None
      attention = None
      alignment_type = None
      if params is not None and params.get("with_scores"):
        score = prediction["log_probs"][i]
      if params is not None and params.get("with_alignments"):
        attention = prediction["alignment"][i][:target_length]
        alignment_type = params["with_alignments"]
      sentence = format_translation_output(
          sentence,
          score=score,
          attention=attention,
          alignment_type=alignment_type)
      print_bytes(tf.compat.as_bytes(sentence), stream=stream)
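The params dict read by print_prediction above is only inspected for a few keys; an illustrative value a caller might pass (the values are made up, and n_best must not exceed the beam width used at decoding time, as the check above enforces; "hard" mirrors the alignment type handled in Example #12 below):

params = {"n_best": 2, "with_scores": True, "with_alignments": "hard"}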
Example #2
def translate(model_dir,
              example_inputter,
              source_file,
              batch_size=32,
              beam_size=4):
    """Runs translation.

  Args:
    model_dir: The directory to load the checkpoint from.
    example_inputter: The inputter instance that produces the training examples.
    source_file: The source file.
    batch_size: The batch size to use.
    beam_size: The beam size to use. Set to 1 for greedy search.
  """
    mode = tf.estimator.ModeKeys.PREDICT

    # Create the inference dataset.
    dataset = example_inputter.make_inference_dataset(source_file, batch_size)
    iterator = dataset.make_initializable_iterator()
    source = iterator.get_next()

    # Encode the source. Note: source_inputter, target_inputter, encoder and
    # decoder are not defined in this function; they are assumed to exist at
    # module level alongside example_inputter.
    with tf.variable_scope("encoder"):
        source_embedding = source_inputter.make_inputs(source)
        memory, _, _ = encoder.encode(source_embedding,
                                      source["length"],
                                      mode=mode)

    # Generate the target.
    with tf.variable_scope("decoder"):
        target_inputter.build()
        batch_size = tf.shape(memory)[0]
        start_tokens = tf.fill([batch_size], constants.START_OF_SENTENCE_ID)
        end_token = constants.END_OF_SENTENCE_ID
        target_ids, _, target_length, _ = decoder.dynamic_decode_and_search(
            target_inputter.embedding,
            start_tokens,
            end_token,
            vocab_size=target_inputter.vocabulary_size,
            beam_width=beam_size,
            memory=memory,
            memory_sequence_length=source["length"])
        target_vocab_rev = target_inputter.vocabulary_lookup_reverse()
        target_tokens = target_vocab_rev.lookup(tf.cast(target_ids, tf.int64))

    # Iterates on the dataset.
    saver = tf.train.Saver()
    checkpoint_path = tf.train.latest_checkpoint(model_dir)
    with tf.Session() as sess:
        saver.restore(sess, checkpoint_path)
        sess.run(tf.tables_initializer())
        sess.run(iterator.initializer)
        while True:
            try:
                batch_tokens, batch_length = sess.run(
                    [target_tokens, target_length])
                for tokens, length in zip(batch_tokens, batch_length):
                    misc.print_bytes(b" ".join(tokens[0][:length[0] - 1]))
            except tf.errors.OutOfRangeError:
                break
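The tokens[0][:length[0] - 1] slicing in the loop above recurs throughout these examples: the first index keeps only the best beam hypothesis, and the slice drops the trailing </s> token. A small self-contained illustration with made-up values:

import numpy as np

# beam_width x max_time tokens for one batch entry, plus per-hypothesis lengths.
tokens = np.array([[b"Hello", b"world", b"!", b"</s>", b""],
                   [b"Hi", b"world", b"</s>", b"", b""]])
length = np.array([4, 3])  # lengths include the </s> token

best = tokens[0][:length[0] - 1]  # best hypothesis, without </s>
print(b" ".join(best))            # b'Hello world !'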
Example #3
def translate(source_file, batch_size=32, beam_size=4):
    """Runs translation.

  Args:
    source_file: The source file.
    batch_size: The batch size to use.
    beam_size: The beam size to use. Set to 1 for greedy search.
  """

    # Create the inference dataset. Note: `model` is assumed to be an
    # OpenNMT-tf model instance defined at module level.
    dataset = model.examples_inputter.make_inference_dataset(
        source_file, batch_size)
    iterator = iter(dataset)

    @tf.function
    def predict_next():
        # For efficiency, we advance the iterator within the tf.function,
        # see https://github.com/tensorflow/tensorflow/issues/29075.
        source = next(iterator)

        # Run the encoder.
        source_length = source["length"]
        batch_size = tf.shape(source_length)[0]
        source_inputs = model.features_inputter(source)
        encoder_outputs, _, _ = model.encoder(source_inputs, source_length)

        # Prepare the decoding strategy.
        if beam_size > 1:
            encoder_outputs = tfa.seq2seq.tile_batch(encoder_outputs,
                                                     beam_size)
            source_length = tfa.seq2seq.tile_batch(source_length, beam_size)
            decoding_strategy = onmt.utils.BeamSearch(beam_size)
        else:
            decoding_strategy = onmt.utils.GreedySearch()

        # Run dynamic decoding.
        decoder_state = model.decoder.initial_state(
            memory=encoder_outputs, memory_sequence_length=source_length)
        decoded = model.decoder.dynamic_decode(
            model.labels_inputter.embedding,
            tf.fill([batch_size], START_OF_SENTENCE_ID),
            end_id=END_OF_SENTENCE_ID,
            initial_state=decoder_state,
            decoding_strategy=decoding_strategy,
            maximum_iterations=200)
        target_lengths = decoded.lengths
        target_tokens = model.labels_inputter.ids_to_tokens.lookup(
            tf.cast(decoded.ids, tf.int64))
        return target_tokens, target_lengths

    # Iterates on the dataset.
    while True:
        try:
            batch_tokens, batch_length = predict_next()
            for tokens, length in zip(batch_tokens.numpy(),
                                      batch_length.numpy()):
                sentence = b" ".join(tokens[0][:length[0]])
                print_bytes(sentence)
        except tf.errors.OutOfRangeError:
            break
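The TF2 variant above relies on a module-level `model` object. A hedged sketch of a driver, assuming an OpenNMT-tf 2.x installation; the model class, vocabulary files and checkpoint directory below are placeholders and would depend on the actual project:

import tensorflow as tf
import opennmt as onmt

# Build and initialize a catalog model (placeholder vocabulary files).
model = onmt.models.TransformerBase()
model.initialize({
    "source_vocabulary": "src.vocab",
    "target_vocabulary": "tgt.vocab",
})

# Restore trained weights from a hypothetical checkpoint directory.
checkpoint = tf.train.Checkpoint(model=model)
checkpoint.restore(tf.train.latest_checkpoint("checkpoint_dir")).expect_partial()

translate("data/test.en", batch_size=16, beam_size=5)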
Example #4
 def print_prediction(self, prediction, params=None, stream=None):
   if params is None:
     params = {}
   with_scores = params.get("with_scores")
   alignment_type = params.get("with_alignments")
   if alignment_type and "alignment" not in prediction:
     raise ValueError("with_alignments is set but the model did not return alignment information")
   num_hypotheses = len(prediction["log_probs"])
   for i in range(num_hypotheses):
     if "tokens" in prediction:
       target_length = prediction["length"][i]
       tokens = prediction["tokens"][i][:target_length]
       sentence = self.labels_inputter.tokenizer.detokenize(tokens)
     else:
       sentence = prediction["text"][i]
     score = None
     attention = None
     if with_scores:
       score = prediction["log_probs"][i]
     if alignment_type:
       attention = prediction["alignment"][i][:target_length]
     sentence = format_translation_output(
         sentence,
         score=score,
         attention=attention,
         alignment_type=alignment_type)
     print_bytes(tf.compat.as_bytes(sentence), stream=stream)
Example #5
  def print_prediction(self, prediction, params=None, stream=None):
    n_best = params and params.get("n_best")
    n_best = n_best or 1

    if n_best > len(prediction["tokens"]):
      raise ValueError("n_best cannot be greater than beam_width")

    for i in range(n_best):
      tokens = prediction["tokens"][i][:prediction["length"][i] - 1] # Ignore </s>.
      sentence = self.target_inputter.tokenizer.detokenize(tokens)
      print_bytes(tf.compat.as_bytes(sentence), stream=stream)
Example #6
  def print_prediction(self, prediction, params=None, stream=None):
    n_best = params and params.get("n_best")
    n_best = n_best or 1

    if n_best > len(prediction["tokens"]):
      raise ValueError("n_best cannot be greater than beam_width")

    for i in range(n_best):
      tokens = prediction["tokens"][i][:prediction["length"][i] - 1] # Ignore </s>.
      sentence = self.target_inputter.tokenizer.detokenize(tokens)
      print_bytes(tf.compat.as_bytes(sentence), stream=stream)
Example #7
  def print_prediction(self, prediction, params=None, stream=None):
    n_best = params and params.get("n_best")
    n_best = n_best or 1

    if n_best > len(prediction["tokens"]):
      raise ValueError("n_best cannot be greater than beam_width")

    for i in range(n_best):
      tokens = prediction["tokens"][i][:prediction["length"][i] - 1] # Ignore </s>.
      sentence = b" ".join(tokens)
      print_bytes(sentence, stream=stream)
Example #8
  def detokenize_stream(self, input_stream=sys.stdin, output_stream=sys.stdout, delimiter=" "):
    """Detokenizes a stream of sentences.

    Args:
      input_stream: The input stream.
      output_stream: The output stream.
      delimiter: The token delimiter used for text serialization.
    """
    for line in input_stream:
      tokens = line.strip().split(delimiter)
      string = self.detokenize(tokens)
      print_bytes(tf.compat.as_bytes(string), stream=output_stream)
Example #9
  def tokenize_stream(self, input_stream=sys.stdin, output_stream=sys.stdout, delimiter=" "):
    """Tokenizes a stream of sentences.

    Args:
      input_stream: The input stream.
      output_stream: The output stream.
      delimiter: The token delimiter to use for text serialization.
    """
    for line in input_stream:
      line = line.strip()
      tokens = self.tokenize(line)
      merged_tokens = delimiter.join(tokens)
      print_bytes(tf.compat.as_bytes(merged_tokens), stream=output_stream)
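Both stream helpers above come from the tokenizer interface, so any concrete OpenNMT-tf tokenizer can be driven the same way. A minimal sketch, assuming opennmt is installed and using SpaceTokenizer purely as a simple example (the input file name is a placeholder):

import sys
import opennmt

tokenizer = opennmt.tokenizers.SpaceTokenizer()

# Read sentences from raw.txt and write tokenized lines to stdout.
with open("raw.txt") as fin:
    tokenizer.tokenize_stream(input_stream=fin, output_stream=sys.stdout)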
Example #10
 def print_score(self, score, params=None, stream=None):
     if params is None:
         params = {}
     length = score["length"]
     tokens = score["tokens"][:length]
     sentence = self.decoder_inputter.tokenizer.detokenize(tokens)
     token_level_scores = None
     attention = None
     if params.get("with_token_level"):
         token_level_scores = score["cross_entropy"][:length]
     if "attention" in score:
         attention = score["attention"][:length]
     alignment_type = params.get("with_alignments")
     sentence = misc.format_translation_output(
         sentence,
         score=score["score"],
         token_level_scores=token_level_scores,
         attention=attention,
         alignment_type=alignment_type)
     misc.print_bytes(tf.compat.as_bytes(sentence), stream=stream)
Example #11
 def print_prediction(self, prediction, params=None, stream=None):
     if params is None:
         params = {}
     num_hypotheses = len(prediction["tokens"])
     for i in range(num_hypotheses):
         target_length = prediction["length"][i]
         tokens = prediction["tokens"][i][:target_length]
         sentence = self.labels_inputter.tokenizer.detokenize(tokens)
         score = None
         attention = None
         alignment_type = None
         if params.get("with_scores"):
             score = prediction["log_probs"][i]
         if params.get("with_alignments"):
             attention = prediction["alignment"][i][:target_length]
             alignment_type = params["with_alignments"]
         sentence = format_translation_output(sentence,
                                              score=score,
                                              attention=attention,
                                              alignment_type=alignment_type)
         print_bytes(tf.compat.as_bytes(sentence), stream=stream)
Example #12
    def print_prediction(self, prediction, params=None, stream=None):
        n_best = params and params.get("n_best")
        n_best = n_best or 1

        if n_best > len(prediction["tokens"]):
            raise ValueError("n_best cannot be greater than beam_width")

        for i in range(n_best):
            target_length = prediction["length"][i] - 1  # Ignore </s>.
            tokens = prediction["tokens"][i][:target_length]
            sentence = self.target_inputter.tokenizer.detokenize(tokens)
            if params is not None and params.get("with_scores"):
                sentence = "%f ||| %s" % (prediction["log_probs"][i] /
                                          prediction["length"][i], sentence)
            if params is not None and params.get("with_alignments") == "hard":
                source_indices = np.argmax(
                    prediction["alignment"][i][:target_length], axis=-1)
                target_indices = range(target_length)
                pairs = ("%d-%d" % (src, tgt)
                         for src, tgt in zip(source_indices, target_indices))
                sentence = "%s ||| %s" % (sentence, " ".join(pairs))
            print_bytes(tf.compat.as_bytes(sentence), stream=stream)
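The hard-alignment branch above takes an argmax over the attention matrix: for each target position, the most attended source position is kept and emitted as src-tgt index pairs. A tiny worked example with a made-up 3x3 attention matrix:

import numpy as np

# attention[t][s] = attention weight of target token t on source token s.
attention = np.array([[0.7, 0.2, 0.1],
                      [0.1, 0.8, 0.1],
                      [0.2, 0.1, 0.7]])
source_indices = np.argmax(attention, axis=-1)  # array([0, 1, 2])
pairs = ("%d-%d" % (src, tgt) for tgt, src in enumerate(source_indices))
print(" ".join(pairs))  # "0-0 1-1 2-2"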
Example #13
 def print_prediction(self, prediction, params=None, stream=None):
     print_bytes(prediction["classes"], stream=stream)
Example #14
def inf(src, tgt, src_vocab, tgt_vocab, direction, output_file):

    # Step 1
    def load_data(input_file, input_vocab):
        """Returns an iterator over the input file.

    Args:
      input_file: The input text file.
      input_vocab: The input vocabulary.

    Returns:
      A dataset batch iterator.
    """
        dataset = tf.data.TextLineDataset(input_file)
        dataset = dataset.map(lambda x: tf.string_split([x]).values)
        dataset = dataset.map(input_vocab.lookup)
        dataset = dataset.map(lambda x: {"ids": x, "length": tf.shape(x)[0]})
        dataset = dataset.padded_batch(64, {"ids": [None], "length": []})
        return dataset.make_initializable_iterator()

    if direction == 1:
        src_file, tgt_file = src, tgt
        src_vocab_file, tgt_vocab_file = src_vocab, tgt_vocab
    else:
        src_file, tgt_file = tgt, src
        src_vocab_file, tgt_vocab_file = tgt_vocab, src_vocab

    from opennmt.utils.misc import count_lines

    tgt_vocab_size = count_lines(tgt_vocab_file) + 1
    src_vocab_size = count_lines(src_vocab_file) + 1
    src_vocab = tf.contrib.lookup.index_table_from_file(
        src_vocab_file, vocab_size=src_vocab_size - 1, num_oov_buckets=1)

    with tf.device("cpu:0"):
        src_iterator = load_data(src_file, src_vocab)

    src = src_iterator.get_next()

    # Step 2

    hidden_size = 512
    encoder = onmt.encoders.BidirectionalRNNEncoder(2, hidden_size)
    decoder = onmt.decoders.AttentionalRNNDecoder(
        2, hidden_size, bridge=onmt.layers.CopyBridge())

    with tf.variable_scope("src" if direction == 1 else "tgt"):
        src_emb = tf.get_variable("embedding", shape=[src_vocab_size, 300])
        src_gen = tf.layers.Dense(src_vocab_size)
        src_gen.build([None, hidden_size])

    with tf.variable_scope("tgt" if direction == 1 else "src"):
        tgt_emb = tf.get_variable("embedding", shape=[tgt_vocab_size, 300])
        tgt_gen = tf.layers.Dense(tgt_vocab_size)
        tgt_gen.build([None, hidden_size])

    # Step 3

    from opennmt import constants

    def encode():
        """Encodes src.

    Returns:
      A tuple (encoder output, encoder state, sequence length).
    """
        with tf.variable_scope("encoder"):
            return encoder.encode(tf.nn.embedding_lookup(src_emb, src["ids"]),
                                  sequence_length=src["length"],
                                  mode=tf.estimator.ModeKeys.PREDICT)

    def decode(encoder_output):
        """Dynamically decodes from the encoder output.

    Args:
      encoder_output: The output of encode().

    Returns:
      A tuple with: the decoded word ids and the length of each decoded sequence.
    """
        batch_size = tf.shape(src["length"])[0]
        start_tokens = tf.fill([batch_size], constants.START_OF_SENTENCE_ID)
        end_token = constants.END_OF_SENTENCE_ID

        with tf.variable_scope("decoder"):
            sampled_ids, _, sampled_length, _ = decoder.dynamic_decode_and_search(
                tgt_emb,
                start_tokens,
                end_token,
                vocab_size=tgt_vocab_size,
                initial_state=encoder_output[1],
                beam_width=5,
                maximum_iterations=200,
                output_layer=tgt_gen,
                mode=tf.estimator.ModeKeys.PREDICT,
                memory=encoder_output[0],
                memory_sequence_length=encoder_output[2])
            return sampled_ids, sampled_length

    encoder_output = encode()
    sampled_ids, sampled_length = decode(encoder_output)

    tgt_vocab_rev = tf.contrib.lookup.index_to_string_table_from_file(
        tgt_vocab_file,
        vocab_size=tgt_vocab_size - 1,
        default_value=constants.UNKNOWN_TOKEN)

    tokens = tgt_vocab_rev.lookup(tf.cast(sampled_ids, tf.int64))
    length = sampled_length

    # Step 4

    from opennmt.utils.misc import print_bytes

    saver = tf.train.Saver()
    checkpoint_path = tf.train.latest_checkpoint("model")

    def session_init_op(_scaffold, sess):
        saver.restore(sess, checkpoint_path)
        tf.logging.info("Restored model from %s", checkpoint_path)

    scaffold = tf.train.Scaffold(init_fn=session_init_op)
    session_creator = tf.train.ChiefSessionCreator(scaffold=scaffold)

    f = open(output_file, 'a')

    with tf.train.MonitoredSession(session_creator=session_creator) as sess:

        sess.run(src_iterator.initializer)
        while not sess.should_stop():
            _tokens, _length = sess.run([tokens, length])
            for b in range(_tokens.shape[0]):
                pred_toks = _tokens[b][0][:_length[b][0] - 1]
                pred_sent = b" ".join(pred_toks)
                print_bytes(pred_sent, f)

    f.close()
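A hypothetical invocation of inf() above (all paths are placeholders): direction=1 translates src -> tgt, any other value swaps the files and vocabularies, and the predictions are appended to output_file since it is opened in append mode.

inf("data/test.src", "data/test.tgt",
    "data/src_vocab.txt", "data/tgt_vocab.txt",
    direction=1, output_file="predictions.txt")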
Example #15
 def print_prediction(self, prediction, params=None, stream=None):
   print_bytes(prediction["classes"], stream=stream)
Example #16
 def print_prediction(self, prediction, params=None, stream=None):
     target_length = prediction["length"]
     tokens = prediction["tokens"][:target_length]
     sentence = self.examples_inputter.tokenizer.detokenize(tokens)
     sentence = misc.format_translation_output(sentence)
     misc.print_bytes(tf.compat.as_bytes(sentence), stream=stream)
Example #17
  def score(self, features_file, predictions_file, checkpoint_path=None):
    """Scores existing predictions.

    Args:
      features_file: The input file.
      predictions_file: The predictions file to score.
      checkpoint_path: Path of a specific checkpoint to use. If ``None``,
        the latest is used.

    Raises:
      ValueError: if no checkpoint is found or if the model is not a sequence to
        sequence model.
    """
    if not hasattr(self._model, "target_inputter"):
      raise ValueError("scoring only works for sequence to sequence models")

    if checkpoint_path is None:
      checkpoint_path = tf.train.latest_checkpoint(self._estimator.model_dir)
    elif os.path.isdir(checkpoint_path):
      checkpoint_path = tf.train.latest_checkpoint(checkpoint_path)
    if checkpoint_path is None:
      raise ValueError("could not find a trained model in %s" % self._estimator.model_dir)

    if "score" not in self._config:
      self._config["score"] = {}
    batch_size = self._config["score"].get("batch_size", 64)
    input_fn = self._model.input_fn(
        tf.estimator.ModeKeys.EVAL,
        batch_size,
        self._config["data"],
        features_file,
        labels_file=predictions_file,
        num_threads=self._config["score"].get("num_threads"),
        prefetch_buffer_size=self._config["score"].get("prefetch_buffer_size"))

    with tf.Graph().as_default() as g:
      tf.train.create_global_step(g)
      features, labels = input_fn()
      with tf.variable_scope(self._model.name):
        logits, _ = self._model(
            features,
            labels,
            self._estimator.params,
            tf.estimator.ModeKeys.EVAL)

      cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
          logits=logits, labels=labels["ids_out"])
      weights = tf.sequence_mask(labels["length"], dtype=cross_entropy.dtype)
      masked_cross_entropy = cross_entropy * weights
      scores = (tf.reduce_sum(masked_cross_entropy, axis=1) /
                tf.cast(labels["length"], cross_entropy.dtype))
      results = {
          "score": scores,
          "tokens": labels["tokens"],
          "length": labels["length"] - 1  # For -1, see sequence_to_sequence.shift_target_sequence.
      }

      with tf.train.MonitoredSession(
          session_creator=tf.train.ChiefSessionCreator(
              checkpoint_filename_with_path=checkpoint_path,
              config=self._estimator.config.session_config)) as sess:
        while not sess.should_stop():
          for batch in extract_batches(sess.run(results)):
            tokens = batch["tokens"][:batch["length"]]
            sentence = self._model.target_inputter.tokenizer.detokenize(tokens)
            fmt = "%f ||| %s" % (batch["score"], sentence)
            print_bytes(tf.compat.as_bytes(fmt))
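The core of the scoring loop above is a masked cross-entropy averaged over each target length. A self-contained sketch of that computation with dummy logits and labels (runnable eagerly in TF2; the example above evaluates the same expressions inside a TF1 MonitoredSession):

import tensorflow as tf

batch, time, vocab = 2, 5, 100  # dummy dimensions
logits = tf.random.normal([batch, time, vocab])
labels = tf.random.uniform([batch, time], maxval=vocab, dtype=tf.int64)
lengths = tf.constant([5, 3])

cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=labels, logits=logits)
weights = tf.sequence_mask(lengths, maxlen=time, dtype=cross_entropy.dtype)
scores = tf.reduce_sum(cross_entropy * weights, axis=1) / tf.cast(
    lengths, cross_entropy.dtype)
print(scores)  # one average token-level cross-entropy per sequence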
Example #18
 def print_prediction(self, prediction, params=None, stream=None):
     tags = prediction["tags"][:prediction["length"]]
     sent = b" ".join(tags)
     print_bytes(sent, stream=stream)
Example #19
tgt_vocab_rev = tf.contrib.lookup.index_to_string_table_from_file(
  tgt_vocab_file,
  vocab_size=tgt_vocab_size - 1,
  default_value=constants.UNKNOWN_TOKEN)

tokens = tgt_vocab_rev.lookup(tf.cast(sampled_ids, tf.int64))
length = sampled_length


# Step 4


from opennmt.utils.misc import print_bytes

saver = tf.train.Saver()
checkpoint_path = tf.train.latest_checkpoint(args.model_dir)

def session_init_op(_scaffold, sess):
  saver.restore(sess, checkpoint_path)
  tf.logging.info("Restored model from %s", checkpoint_path)

scaffold = tf.train.Scaffold(init_fn=session_init_op)
session_creator = tf.train.ChiefSessionCreator(scaffold=scaffold)

with tf.train.MonitoredSession(session_creator=session_creator) as sess:
  sess.run(src_iterator.initializer)
  while not sess.should_stop():
    _tokens, _length = sess.run([tokens, length])
    for b in range(_tokens.shape[0]):
      pred_toks = _tokens[b][0][:_length[b][0] - 1]
      pred_sent = b" ".join(pred_toks)
      print_bytes(pred_sent)
Example #20
 def print_prediction(self, prediction, params=None, stream=None):
   tags = prediction["tags"][:prediction["length"]]
   sent = b" ".join(tags)
   print_bytes(sent, stream=stream)
Example #21
    with tf.Session(config=config) as sess:
        saver = tf.train.Saver(opennmt_variables)
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, "translation/ckpt/model.ckpt-500000")
        sess.run(tf.tables_initializer())
        sess.run(iterator.initializer)
        resotre_model_by_pkl(sess, decoder_variables)

        iteration = 0
        while iteration < 3:
            try:
                opennmt_batch_tokens, opennmt_batch_length, \
                tf_batch_tokens, tf_batch_length, \
                op_batch_tokens, op_batch_length, source_result = sess.run([opennmt_target_tokens, opennmt_target_length,
                                                                tf_target_tokens, tf_target_length,
                                                                op_target_tokens, op_target_length, source])
                print("[INFO] opennmt: ", end='')
                for tokens, length in zip(opennmt_batch_tokens,
                                          opennmt_batch_length):
                    misc.print_bytes(b" ".join(tokens[0][:length[0] - 1]))
                print("[INFO] tf     : ", end='')
                for tokens, length in zip(tf_batch_tokens, tf_batch_length):
                    misc.print_bytes(b" ".join(tokens[0][:length[0] - 1]))
                print("[INFO] op     : ", end='')
                for tokens, length in zip(op_batch_tokens, op_batch_length):
                    misc.print_bytes(b" ".join(tokens[0][:length[0] - 1]))

                iteration += 1
            except tf.errors.OutOfRangeError:
                break
Example #22
tgt_vocab_rev = tf.contrib.lookup.index_to_string_table_from_file(
    tgt_vocab_file,
    vocab_size=tgt_vocab_size - 1,
    default_value=constants.UNKNOWN_TOKEN)

tokens = tgt_vocab_rev.lookup(tf.cast(sampled_ids, tf.int64))
length = sampled_length

# Step 4

from opennmt.utils.misc import print_bytes

saver = tf.train.Saver()
checkpoint_path = tf.train.latest_checkpoint(args.model_dir)


def session_init_op(_scaffold, sess):
    saver.restore(sess, checkpoint_path)
    tf.logging.info("Restored model from %s", checkpoint_path)


scaffold = tf.train.Scaffold(init_fn=session_init_op)
session_creator = tf.train.ChiefSessionCreator(scaffold=scaffold)

with tf.train.MonitoredSession(session_creator=session_creator) as sess:
    sess.run(src_iterator.initializer)
    while not sess.should_stop():
        _tokens, _length = sess.run([tokens, length])
        for b in range(_tokens.shape[0]):
            pred_toks = _tokens[b][0][:_length[b][0] - 1]
            pred_sent = b" ".join(pred_toks)
            print_bytes(pred_sent)
Example #23
  def score(self, features_file, predictions_file, checkpoint_path=None):
    """Scores existing predictions.

    Args:
      features_file: The input file.
      predictions_file: The predictions file to score.
      checkpoint_path: Path of a specific checkpoint to use. If ``None``,
        the latest is used.

    Raises:
      ValueError: if no checkpoint is found or if the model is not a sequence to
        sequence model.
    """
    if not hasattr(self._model, "target_inputter"):
      raise ValueError("scoring only works for sequence to sequence models")

    if checkpoint_path is None:
      checkpoint_path = tf.train.latest_checkpoint(self._config["model_dir"])
    elif tf.gfile.IsDirectory(checkpoint_path):
      checkpoint_path = tf.train.latest_checkpoint(checkpoint_path)
    if checkpoint_path is None:
      raise ValueError("could not find a trained model in %s" % self._config["model_dir"])

    input_fn = self._model.input_fn(
        tf.estimator.ModeKeys.EVAL,
        self._config["score"]["batch_size"],
        self._config["data"],
        features_file,
        labels_file=predictions_file,
        num_threads=self._config["score"].get("num_threads"),
        prefetch_buffer_size=self._config["score"].get("prefetch_buffer_size"))

    with tf.Graph().as_default() as g:
      tf.train.create_global_step(g)
      features, labels = input_fn()
      labels["alignment"] = None  # Add alignment key to force the model to return attention.
      with tf.variable_scope(self._model.name):
        outputs, _ = self._model(
            features,
            labels,
            self._config["params"],
            tf.estimator.ModeKeys.EVAL)

      cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
          logits=outputs["logits"], labels=labels["ids_out"])
      weights = tf.sequence_mask(labels["length"], dtype=cross_entropy.dtype)
      masked_cross_entropy = cross_entropy * weights
      scores = tf.reduce_sum(masked_cross_entropy, axis=1)
      results = {
          "attention": outputs["attention"],
          "cross_entropy": cross_entropy,
          "score": scores,
          "tokens": labels["tokens"],
          "length": labels["length"] - 1  # For -1, see sequence_to_sequence.shift_target_sequence.
      }

      with tf.train.MonitoredSession(
          session_creator=tf.train.ChiefSessionCreator(
              checkpoint_filename_with_path=checkpoint_path,
              config=self._session_config)) as sess:
        while not sess.should_stop():
          for batch in misc.extract_batches(sess.run(results)):
            tokens = batch["tokens"][:batch["length"]]
            sentence = self._model.target_inputter.tokenizer.detokenize(tokens)
            token_level_scores = None
            if self._config["score"].get("with_token_level"):
              token_level_scores = batch["cross_entropy"][:batch["length"]]
            alignment_type = self._config["score"].get("with_alignments")
            sentence = format_translation_output(
                sentence,
                score=batch["score"],
                token_level_scores=token_level_scores,
                attention=batch["attention"][:batch["length"]],
                alignment_type=alignment_type)
            misc.print_bytes(tf.compat.as_bytes(sentence))
Example #24
    def score(self,
              features_file,
              predictions_file,
              checkpoint_path=None,
              output_file=None):
        """Scores existing predictions.

    Args:
      features_file: The input file.
      predictions_file: The predictions file to score.
      checkpoint_path: Path of a specific checkpoint to use. If ``None``,
        the latest is used.
      output_file: The file where the scores are saved. Otherwise, they will be
        printed on the standard output.

    Raises:
      ValueError: if no checkpoint is found or if the model is not a sequence to
        sequence model.
    """
        if not isinstance(self._model,
                          (models.LanguageModel, models.SequenceToSequence)):
            raise ValueError(
                "scoring only works for sequence to sequence or language models"
            )

        if checkpoint_path is None:
            checkpoint_path = tf.train.latest_checkpoint(
                self._config["model_dir"])
        elif tf.gfile.IsDirectory(checkpoint_path):
            checkpoint_path = tf.train.latest_checkpoint(checkpoint_path)
        if checkpoint_path is None:
            raise ValueError("could not find a trained model in %s" %
                             self._config["model_dir"])

        model = copy.deepcopy(self._model)
        with tf.Graph().as_default():
            dataset = model.examples_inputter.make_evaluation_dataset(
                features_file,
                predictions_file,
                self._config["score"]["batch_size"],
                num_threads=self._config["score"].get("num_threads"),
                prefetch_buffer_size=self._config["score"].get(
                    "prefetch_buffer_size"))
            iterator = dataset.make_initializable_iterator()
            features, labels = iterator.get_next()
            labels["alignment"] = None  # Add alignment key to force the model to return attention.
            outputs, _ = model(features, labels, self._config["params"],
                               tf.estimator.ModeKeys.EVAL)

            cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=outputs["logits"], labels=labels["ids_out"])
            weights = tf.sequence_mask(labels["length"],
                                       dtype=cross_entropy.dtype)
            masked_cross_entropy = cross_entropy * weights
            scores = (tf.reduce_sum(masked_cross_entropy, axis=1) /
                      tf.cast(labels["length"], cross_entropy.dtype))
            results = {
                "cross_entropy": cross_entropy,
                "score": scores,
                "tokens": labels["tokens"],
                "length": labels["length"] - 1  # -1 for the special token.
            }
            if "attention" in outputs:
                results["attention"] = outputs["attention"]

            if output_file:
                stream = io.open(output_file, encoding="utf-8", mode="w")
            else:
                stream = sys.stdout

            output_tokenizer = (self._model.labels_inputter.tokenizer
                                if not self._model.unsupervised else
                                self._model.features_inputter.tokenizer)
            with tf.train.MonitoredSession(
                    session_creator=tf.train.ChiefSessionCreator(
                        checkpoint_filename_with_path=checkpoint_path,
                        config=self._session_config)) as sess:
                sess.run(iterator.initializer)
                while not sess.should_stop():
                    for batch in misc.extract_batches(sess.run(results)):
                        tokens = batch["tokens"][:batch["length"]]
                        sentence = output_tokenizer.detokenize(tokens)
                        token_level_scores = None
                        attention = None
                        if self._config["score"].get("with_token_level"):
                            token_level_scores = batch[
                                "cross_entropy"][:batch["length"]]
                        if "attention" in batch:
                            attention = batch["attention"][:batch["length"]]
                        alignment_type = self._config["score"].get(
                            "with_alignments")
                        sentence = format_translation_output(
                            sentence,
                            score=batch["score"],
                            token_level_scores=token_level_scores,
                            attention=attention,
                            alignment_type=alignment_type)
                        misc.print_bytes(tf.compat.as_bytes(sentence),
                                         stream=stream)

            if output_file:
                stream.close()
Example #25
def inference_Dual_SVAE(config_file,
                        src_eval_file=None,
                        tgt_eval_file=None,
                        checkpoint_path=None):

    with open(config_file, "r") as stream:
        config = yaml.safe_load(stream)

    if src_eval_file is None:
        src_eval_file = config["eval_feature_file"]

    if tgt_eval_file is None:
        tgt_eval_file = config["eval_label_file"]

    from opennmt.utils.misc import print_bytes

    graph = tf.Graph()

    with tf.Session(graph=graph,
                    config=tf.ConfigProto(log_device_placement=False,
                                          allow_soft_placement=True,
                                          gpu_options=tf.GPUOptions(
                                              allow_growth=True))) as sess_:

        eval_model = Dual_SVAE(config_file, "Inference", src_eval_file,
                               tgt_eval_file)
        #emb_src_batch = eval_model.emb_src_batch_()
        saver = tf.train.Saver()
        tf.tables_initializer().run()
        tf.global_variables_initializer().run()

        if checkpoint_path is None:
            checkpoint_dir = config["model_dir"]
            checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)

        print(("Evaluating model %s" % checkpoint_path))
        saver.restore(sess_, checkpoint_path)

        predictions_src_from_src, predictions_src_from_tgt, predictions_tgt_from_src, predictions_tgt_from_tgt \
                    = eval_model.prediction_()

        tokens_src_from_src = predictions_src_from_src["tokens"]
        length_src_from_src = predictions_src_from_src["length"]

        tokens_src_from_tgt = predictions_src_from_tgt["tokens"]
        length_src_from_tgt = predictions_src_from_tgt["length"]

        tokens_tgt_from_src = predictions_tgt_from_src["tokens"]
        length_tgt_from_src = predictions_tgt_from_src["length"]

        tokens_tgt_from_tgt = predictions_tgt_from_tgt["tokens"]
        length_tgt_from_tgt = predictions_tgt_from_tgt["length"]

        sess_.run(eval_model.iterator_initializers())

        # pred_dict = sess_.run([predictions])
        pred_dict = None

        print("write to :%s" %
              os.path.join(config["model_dir"], "eval",
                           "*/*" + os.path.basename(checkpoint_path)))

        source_from_source_path = os.path.join(
            config["model_dir"], "eval", "source_from_source",
            os.path.basename(src_eval_file) + ".s-s." +
            os.path.basename(checkpoint_path))

        source_from_target_path = os.path.join(
            config["model_dir"], "eval", "source_from_target",
            os.path.basename(tgt_eval_file) + ".s-t." +
            os.path.basename(checkpoint_path))

        target_from_source_path = os.path.join(
            config["model_dir"], "eval", "target_from_source",
            os.path.basename(src_eval_file) + ".t-s." +
            os.path.basename(checkpoint_path))

        target_from_target_path = os.path.join(
            config["model_dir"], "eval", "target_from_target",
            os.path.basename(tgt_eval_file) + ".t-t." +
            os.path.basename(checkpoint_path))

        with open(source_from_source_path,"w") as output_0_, \
                open(source_from_target_path,"w") as output_1_, \
                    open(target_from_source_path,"w") as output_2_, \
                        open(target_from_target_path,"w") as output_3_ :
            while True:
                try:
                    _tokens_src_from_src, _length_src_from_src, \
                    _tokens_src_from_tgt, _length_src_from_tgt, \
                    _tokens_tgt_from_src, _length_tgt_from_src, \
                    _tokens_tgt_from_tgt, _length_tgt_from_tgt = sess_.run([tokens_src_from_src, length_src_from_src,
                                                                            tokens_src_from_tgt, length_src_from_tgt,
                                                                            tokens_tgt_from_src, length_tgt_from_src,
                                                                            tokens_tgt_from_tgt, length_tgt_from_tgt])

                    for b in range(_tokens_src_from_src.shape[0]):
                        pred_toks = _tokens_src_from_src[b][
                            0][:_length_src_from_src[b][0] - 1]
                        pred_sent = b" ".join(pred_toks)
                        print_bytes(pred_sent, output_0_)

                    for b in range(_tokens_src_from_tgt.shape[0]):
                        pred_toks = _tokens_src_from_tgt[b][
                            0][:_length_src_from_tgt[b][0] - 1]
                        pred_sent = b" ".join(pred_toks)
                        print_bytes(pred_sent, output_1_)

                    for b in range(_tokens_tgt_from_src.shape[0]):
                        pred_toks = _tokens_tgt_from_src[b][
                            0][:_length_tgt_from_src[b][0] - 1]
                        pred_sent = b" ".join(pred_toks)
                        print_bytes(pred_sent, output_2_)

                    for b in range(_tokens_tgt_from_tgt.shape[0]):
                        pred_toks = _tokens_tgt_from_tgt[b][
                            0][:_length_tgt_from_tgt[b][0] - 1]
                        pred_sent = b" ".join(pred_toks)
                        print_bytes(pred_sent, output_3_)

                except tf.errors.OutOfRangeError:
                    break

        print("Finish inference !")

    return source_from_source_path, source_from_target_path, target_from_source_path, target_from_target_path, pred_dict