Example 1: _make_eval_prediction_hooks_fn
def _make_eval_prediction_hooks_fn(self):
    external_scorers = self._config["eval"].get("external_evaluators")
    # Nothing to do if predictions should not be saved and no external scorer is set.
    if (not self._config["eval"].get("save_eval_predictions", False)
            and external_scorers is None):
        return None
    if self._model.unsupervised:
        raise RuntimeError(
            "This model does not support saving evaluation predictions")
    save_path = os.path.join(self._config["model_dir"], "eval")
    if not tf.gfile.Exists(save_path):
        tf.gfile.MakeDirs(save_path)
    if external_scorers is not None:
        # Scores the saved predictions against the reference labels after each evaluation.
        external_evaluator = evaluator.ExternalEvaluator(
            labels_file=self._config["data"]["eval_labels_file"],
            output_dir=os.path.join(self._config["model_dir"],
                                    "external_eval"),
            scorers=evaluator.make_scorers(external_scorers))
    else:
        external_evaluator = None
    # Return a factory that builds the evaluation hooks for a given set of predictions.
    return lambda predictions: [
        hooks.SaveEvaluationPredictionHook(
            self._model,
            os.path.join(save_path, "predictions.txt"),
            post_evaluation_fn=external_evaluator,
            predictions=predictions)
    ]
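The method above only reads a few configuration entries. A minimal configuration sketch, assuming the nested dict layout implied by the lookups in the code (key names taken from the snippet, values purely illustrative):

# Illustrative configuration for the function above; key names come from the
# lookups in the snippet, the values are example placeholders only.
config = {
    "model_dir": "run/",
    "data": {
        "eval_labels_file": "data/eval.tgt",
    },
    "eval": {
        "save_eval_predictions": True,
        "external_evaluators": ["BLEU"],
    },
}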
Example 2: _build_eval_spec
  def _build_eval_spec(self):
    if "eval" not in self._config:
      self._config["eval"] = {}

    eval_hooks = []
    if (self._config["eval"].get("save_eval_predictions", False)
        or self._config["eval"].get("external_evaluators") is not None):
      save_path = os.path.join(self._estimator.model_dir, "eval")
      if not os.path.isdir(save_path):
        os.makedirs(save_path)
      eval_hooks.append(hooks.SaveEvaluationPredictionHook(
          self._model,
          os.path.join(save_path, "predictions.txt"),
          post_evaluation_fn=external_evaluation_fn(
              self._config["eval"].get("external_evaluators"),
              self._config["data"]["eval_labels_file"],
              output_dir=self._estimator.model_dir)))

    eval_spec = tf.estimator.EvalSpec(
        input_fn=self._model.input_fn(
            tf.estimator.ModeKeys.EVAL,
            self._config["eval"].get("batch_size", 32),
            self._config["data"],
            self._config["data"]["eval_features_file"],
            num_threads=self._config["eval"].get("num_threads"),
            prefetch_buffer_size=self._config["eval"].get("prefetch_buffer_size"),
            labels_file=self._config["data"]["eval_labels_file"]),
        steps=None,
        hooks=eval_hooks,
        exporters=_make_exporters(
            self._config["eval"].get("exporters", "last"),
            self._model.serving_input_fn(self._config["data"])),
        throttle_secs=self._config["eval"].get("eval_delay", 18000))
    return eval_spec
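For reference, these are the eval-related keys this spec reads and the defaults visible in the code above (a sketch, not an exhaustive schema):

# Keys read from config["eval"] by _build_eval_spec, with the defaults used above.
eval_defaults = {
    "batch_size": 32,              # evaluation batch size
    "num_threads": None,           # passed through to the input pipeline
    "prefetch_buffer_size": None,  # passed through to the input pipeline
    "exporters": "last",           # forwarded to _make_exporters
    "eval_delay": 18000,           # throttle_secs between evaluations
}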
Example 3: _make_eval_prediction_hooks_fn
def _make_eval_prediction_hooks_fn(self):
  if (not self._config["eval"].get("save_eval_predictions", False)
      and self._config["eval"].get("external_evaluators") is None):
    return None
  save_path = os.path.join(self._config["model_dir"], "eval")
  if not tf.gfile.Exists(save_path):
    tf.gfile.MakeDirs(save_path)
  # Return a factory that builds the prediction saving hook, optionally
  # followed by the external evaluators, for a given set of predictions.
  return lambda predictions: [
      hooks.SaveEvaluationPredictionHook(
          self._model,
          os.path.join(save_path, "predictions.txt"),
          post_evaluation_fn=external_evaluation_fn(
              self._config["eval"].get("external_evaluators"),
              self._config["data"]["eval_labels_file"],
              output_dir=save_path),
          predictions=predictions)]
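The method returns either None or a factory taking the prediction tensors. A hypothetical caller (the names runner and predictions are assumed here for illustration) would consume it like this:

# Hypothetical usage of the factory returned above.
hooks_fn = runner._make_eval_prediction_hooks_fn()
eval_hooks = hooks_fn(predictions) if hooks_fn is not None else []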
Example 4: train
def train(estimator, model, config):
    """Runs training.

  Args:
    estimator: A `tf.estimator.Estimator`.
    model: A `opennmt.models.Model`.
    config: The configuration.
  """
    batch_size = config["train"]["batch_size"]
    prefetch_buffer_size = config["train"].get("prefetch_buffer_size",
                                               batch_size * 1000)
    num_parallel_process_calls = config["train"].get(
        "num_parallel_process_calls", multiprocessing.cpu_count())

    train_hooks = [
        hooks.LogParametersCountHook(),
        hooks.CountersHook(every_n_steps=estimator.config.save_summary_steps,
                           output_dir=estimator.model_dir)
    ]

    eval_hooks = []
    if config["train"].get("save_eval_predictions", False):
        save_path = os.path.join(estimator.model_dir, "eval")
        if not os.path.isdir(save_path):
            os.makedirs(save_path)
        eval_hooks.append(
            hooks.SaveEvaluationPredictionHook(
                model,
                os.path.join(save_path, "predictions.txt"),
                post_evaluation_fn=external_evaluation_fn(
                    config["train"].get("external_evaluators"),
                    config["data"]["eval_labels_file"],
                    output_dir=estimator.model_dir)))
    elif config["train"].get("external_evaluators") is not None:
        tf.logging.warning(
            "External evaluators only work when save_eval_predictions is enabled."
        )

    train_spec = tf.estimator.TrainSpec(
        input_fn=model.input_fn(
            tf.estimator.ModeKeys.TRAIN,
            batch_size,
            prefetch_buffer_size,
            num_parallel_process_calls,
            config["data"],
            config["data"]["train_features_file"],
            labels_file=config["data"]["train_labels_file"],
            num_buckets=config["train"].get("num_buckets", 5),
            sample_buffer_size=config["train"].get("sample_buffer_size", 1000000),
            maximum_features_length=config["train"].get("maximum_features_length"),
            maximum_labels_length=config["train"].get("maximum_labels_length")),
        max_steps=config["train"].get("train_steps"),
        hooks=train_hooks)

    eval_spec = tf.estimator.EvalSpec(
        input_fn=model.input_fn(
            tf.estimator.ModeKeys.EVAL,
            batch_size,
            prefetch_buffer_size,
            num_parallel_process_calls,
            config["data"],
            config["data"]["eval_features_file"],
            labels_file=config["data"]["eval_labels_file"]),
        steps=None,
        hooks=eval_hooks,
        exporters=tf.estimator.LatestExporter(
            "latest", model.serving_input_fn(config["data"])),
        throttle_secs=config["train"].get("eval_delay", 18000))

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
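In this variant every option, including the evaluation-related ones, is read from the "train" section of the configuration. A minimal configuration sketch under that assumption (key names from the code, values purely illustrative):

# Illustrative configuration for this train() variant; note that the eval
# options live under "train" and external_evaluators only run when
# save_eval_predictions is enabled.
config = {
    "data": {
        "train_features_file": "data/train.src",
        "train_labels_file": "data/train.tgt",
        "eval_features_file": "data/eval.src",
        "eval_labels_file": "data/eval.tgt",
    },
    "train": {
        "batch_size": 64,
        "train_steps": 100000,
        "save_eval_predictions": True,
        "external_evaluators": ["BLEU"],
        "eval_delay": 3600,
    },
}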
Example 5: train
def train(estimator, model, config):
    """Runs training.

  Args:
    estimator: A `tf.estimator.Estimator`.
    model: A `opennmt.models.Model`.
    config: The configuration.
  """
    if "eval" not in config:
        config["eval"] = {}

    train_hooks = [
        hooks.LogParametersCountHook(),
        hooks.CountersHook(every_n_steps=estimator.config.save_summary_steps,
                           output_dir=estimator.model_dir)
    ]

    eval_hooks = []
    if (config["eval"].get("save_eval_predictions", False)
            or config["eval"].get("external_evaluators") is not None):
        save_path = os.path.join(estimator.model_dir, "eval")
        if not os.path.isdir(save_path):
            os.makedirs(save_path)
        eval_hooks.append(
            hooks.SaveEvaluationPredictionHook(
                model,
                os.path.join(save_path, "predictions.txt"),
                post_evaluation_fn=external_evaluation_fn(
                    config["eval"].get("external_evaluators"),
                    config["data"]["eval_labels_file"],
                    output_dir=estimator.model_dir)))

    default_sample_buffer_size = 1000000
    if "sample_buffer_size" not in config["train"]:
        tf.logging.warn(
            "You did not set sample_buffer_size. By default, the "
            "training dataset is shuffled by chunk of %d examples. "
            "If your dataset is larger than this value and eval_delay "
            "is shorter than the training time of one epoch, a section "
            "of the dataset will be discarded. Consider setting "
            "sample_buffer_size to the size of your dataset." %
            default_sample_buffer_size)

    train_batch_size = config["train"]["batch_size"]
    train_batch_type = config["train"].get("batch_type", "examples")
    train_prefetch_buffer_size = config["train"].get(
        "prefetch_buffer_size",
        train_batch_size * (1000 if train_batch_type == "examples" else 50))
    train_num_parallel_process_calls = config["train"].get(
        "num_parallel_process_calls", multiprocessing.cpu_count())
    train_spec = tf.estimator.TrainSpec(
        input_fn=model.input_fn(
            tf.estimator.ModeKeys.TRAIN,
            train_batch_size,
            train_prefetch_buffer_size,
            train_num_parallel_process_calls,
            config["data"],
            config["data"]["train_features_file"],
            labels_file=config["data"]["train_labels_file"],
            batch_type=train_batch_type,
            bucket_width=config["train"].get("bucket_width", 5),
            sample_buffer_size=config["train"].get("sample_buffer_size",
                                                   default_sample_buffer_size),
            maximum_features_length=config["train"].get("maximum_features_length"),
            maximum_labels_length=config["train"].get("maximum_labels_length")),
        max_steps=config["train"].get("train_steps"),
        hooks=train_hooks)

    eval_batch_size = config["eval"].get(
        "batch_size",
        train_batch_size if train_batch_type == "examples" else 30)
    eval_prefetch_buffer_size = config["eval"].get("prefetch_buffer_size",
                                                   eval_batch_size * 10)
    eval_num_parallel_process_calls = config["eval"].get(
        "num_parallel_process_calls", train_num_parallel_process_calls)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=model.input_fn(
            tf.estimator.ModeKeys.EVAL,
            eval_batch_size,
            eval_prefetch_buffer_size,
            eval_num_parallel_process_calls,
            config["data"],
            config["data"]["eval_features_file"],
            labels_file=config["data"]["eval_labels_file"]),
        steps=None,
        hooks=eval_hooks,
        exporters=tf.estimator.LatestExporter(
            "latest", model.serving_input_fn(config["data"])),
        throttle_secs=config["eval"].get("eval_delay", 18000))

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
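Compared with the previous example, evaluation options now live in their own "eval" section, and the training pipeline gains batch_type, bucket_width and the sample_buffer_size warning. A configuration sketch along those lines (key names from the code, values purely illustrative):

# Illustrative configuration for this train() variant with a separate "eval" section.
config = {
    "data": {
        "train_features_file": "data/train.src",
        "train_labels_file": "data/train.tgt",
        "eval_features_file": "data/eval.src",
        "eval_labels_file": "data/eval.tgt",
    },
    "train": {
        "batch_size": 3072,
        "batch_type": "tokens",
        "bucket_width": 5,
        "sample_buffer_size": 5000000,  # ideally the size of the training set
        "train_steps": 500000,
    },
    "eval": {
        "batch_size": 30,
        "save_eval_predictions": True,
        "external_evaluators": ["BLEU"],
        "eval_delay": 7200,
    },
}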