def score(self, features_file, predictions_file, checkpoint_path=None, output_file=None):
  """Scores existing predictions.

  Args:
    features_file: The input file.
    predictions_file: The predictions file to score.
    checkpoint_path: Path of a specific checkpoint to use. If ``None``,
      the latest is used.
    output_file: The file where the scores are saved. Otherwise, they will be
      printed on the standard output.

  Raises:
    ValueError: if no checkpoint is found or if the model is not a sequence to
      sequence or language model.
  """
  if not isinstance(self._model, (models.LanguageModel, models.SequenceToSequence)):
    raise ValueError("scoring only works for sequence to sequence or language models")
  if checkpoint_path is None:
    checkpoint_path = tf.train.latest_checkpoint(self._config["model_dir"])
  elif tf.gfile.IsDirectory(checkpoint_path):
    checkpoint_path = tf.train.latest_checkpoint(checkpoint_path)
  if checkpoint_path is None:
    raise ValueError("could not find a trained model in %s" % self._config["model_dir"])
  model = copy.deepcopy(self._model)
  with tf.Graph().as_default():
    dataset = model.examples_inputter.make_evaluation_dataset(
        features_file,
        predictions_file,
        self._config["score"]["batch_size"],
        num_threads=self._config["score"].get("num_threads"),
        prefetch_buffer_size=self._config["score"].get("prefetch_buffer_size"))
    iterator = dataset.make_initializable_iterator()
    features, labels = iterator.get_next()
    labels["alignment"] = None  # Add alignment key to force the model to return attention.
    outputs, _ = model(
        features, labels, self._config["params"], tf.estimator.ModeKeys.EVAL)
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=outputs["logits"], labels=labels["ids_out"])
    weights = tf.sequence_mask(labels["length"], dtype=cross_entropy.dtype)
    masked_cross_entropy = cross_entropy * weights
    scores = tf.reduce_sum(masked_cross_entropy, axis=1)
    results = {
        "cross_entropy": cross_entropy,
        "score": scores,
        "tokens": labels["tokens"],
        "length": labels["length"] - 1  # -1 for the special token.
    }
    if "attention" in outputs:
      results["attention"] = outputs["attention"]

  if output_file:
    stream = io.open(output_file, encoding="utf-8", mode="w")
  else:
    stream = sys.stdout
  output_tokenizer = (
      self._model.labels_inputter.tokenizer if not self._model.unsupervised
      else self._model.features_inputter.tokenizer)
  with tf.train.MonitoredSession(
      session_creator=tf.train.ChiefSessionCreator(
          checkpoint_filename_with_path=checkpoint_path,
          config=self._session_config)) as sess:
    sess.run(iterator.initializer)
    while not sess.should_stop():
      for batch in misc.extract_batches(sess.run(results)):
        tokens = batch["tokens"][:batch["length"]]
        sentence = output_tokenizer.detokenize(tokens)
        token_level_scores = None
        attention = None
        if self._config["score"].get("with_token_level"):
          token_level_scores = batch["cross_entropy"][:batch["length"]]
        if "attention" in batch:
          attention = batch["attention"][:batch["length"]]
        alignment_type = self._config["score"].get("with_alignments")
        sentence = format_translation_output(
            sentence,
            score=batch["score"],
            token_level_scores=token_level_scores,
            attention=attention,
            alignment_type=alignment_type)
        misc.print_bytes(tf.compat.as_bytes(sentence), stream=stream)
  if output_file:
    stream.close()
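# A minimal usage sketch, assuming ``self`` above belongs to a runner object
# built from a trained configuration; the runner variable and file names below
# are hypothetical:
#
#   runner.score("newstest-src.txt", "newstest-hyp.txt",
#                output_file="newstest-hyp.scores")
#
# Each output line carries the sequence-level score (the masked cross-entropy
# sum computed above) alongside the detokenized sentence; leaving
# ``output_file=None`` prints the same lines to the standard output instead.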
def score(self, features_file, predictions_file, checkpoint_path=None):
  """Scores existing predictions.

  Args:
    features_file: The input file.
    predictions_file: The predictions file to score.
    checkpoint_path: Path of a specific checkpoint to use. If ``None``,
      the latest is used.

  Raises:
    ValueError: if no checkpoint is found or if the model is not a sequence to
      sequence model.
  """
  if not hasattr(self._model, "target_inputter"):
    raise ValueError("scoring only works for sequence to sequence models")
  if checkpoint_path is None:
    checkpoint_path = tf.train.latest_checkpoint(self._estimator.model_dir)
  elif tf.gfile.IsDirectory(checkpoint_path):
    checkpoint_path = tf.train.latest_checkpoint(checkpoint_path)
  if checkpoint_path is None:
    raise ValueError("could not find a trained model in %s" % self._estimator.model_dir)

  if "score" not in self._config:
    self._config["score"] = {}
  input_fn = self._model.input_fn(
      tf.estimator.ModeKeys.EVAL,
      self._config["score"]["batch_size"],
      self._config["data"],
      features_file,
      labels_file=predictions_file,
      num_threads=self._config["score"].get("num_threads"),
      prefetch_buffer_size=self._config["score"].get("prefetch_buffer_size"))

  with tf.Graph().as_default() as g:
    tf.train.create_global_step(g)
    features, labels = input_fn()
    labels["alignment"] = None  # Add alignment key to force the model to return attention.
    with tf.variable_scope(self._model.name):
      outputs, _ = self._model(
          features, labels, self._estimator.params, tf.estimator.ModeKeys.EVAL)
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=outputs["logits"], labels=labels["ids_out"])
    weights = tf.sequence_mask(labels["length"], dtype=cross_entropy.dtype)
    masked_cross_entropy = cross_entropy * weights
    scores = tf.reduce_sum(masked_cross_entropy, axis=1)
    results = {
        "attention": outputs["attention"],
        "cross_entropy": cross_entropy,
        "score": scores,
        "tokens": labels["tokens"],
        "length": labels["length"] - 1  # For -1, see sequence_to_sequence.shift_target_sequence.
    }

    with tf.train.MonitoredSession(
        session_creator=tf.train.ChiefSessionCreator(
            checkpoint_filename_with_path=checkpoint_path,
            config=self._estimator.config.session_config)) as sess:
      while not sess.should_stop():
        for batch in misc.extract_batches(sess.run(results)):
          tokens = batch["tokens"][:batch["length"]]
          sentence = self._model.target_inputter.tokenizer.detokenize(tokens)
          token_level_scores = None
          if self._config["score"].get("with_token_level"):
            token_level_scores = batch["cross_entropy"][:batch["length"]]
          alignment_type = self._config["score"].get("with_alignments")
          sentence = format_translation_output(
              sentence,
              score=batch["score"],
              token_level_scores=token_level_scores,
              attention=batch["attention"][:batch["length"]],
              alignment_type=alignment_type)
          misc.print_bytes(tf.compat.as_bytes(sentence))
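# Worked example of the score computation used in both versions above: the
# sequence score is the per-token cross-entropy summed over the true length,
# with padded positions zeroed out by the mask. A self-contained NumPy sketch
# (the values are made up for illustration):
import numpy as np

cross_entropy = np.array([[0.5, 1.2, 0.3, 9.9]])  # last position is padding
length = np.array([3])
# Equivalent of tf.sequence_mask: 1.0 for positions before the true length.
weights = (np.arange(cross_entropy.shape[1]) < length[:, None]).astype(cross_entropy.dtype)
masked_cross_entropy = cross_entropy * weights
score = masked_cross_entropy.sum(axis=1)  # -> [2.0]; the 9.9 pad value is ignored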
def print_prediction(self, prediction, params=None, stream=None):
  """Prints a model prediction.

  Args:
    prediction: The prediction dict, with at least ``"tokens"`` and ``"length"``.
    params: Decoding parameters (unused here).
    stream: The output stream, or ``None`` for the standard output.
  """
  target_length = prediction["length"]
  tokens = prediction["tokens"][:target_length]
  sentence = self.examples_inputter.tokenizer.detokenize(tokens)
  sentence = misc.format_translation_output(sentence)
  misc.print_as_bytes(sentence, stream=stream)
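# Hypothetical call to ``print_prediction``: a decoded entry may carry padding
# beyond its true length, so only the first ``length`` tokens are detokenized.
# The prediction dict and model variable below are illustrative only:
#
#   prediction = {"length": 3, "tokens": [b"Hello", b"world", b"!", b"<blank>"]}
#   model.print_prediction(prediction)  # prints: Hello world !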