Exemple #1
0
  def _read_config_files(self, run_paths):
    """Loads the projector config of every run that has a usable checkpoint.

    Args:
      run_paths: Dict mapping a run name to that run's log directory.

    Returns:
      A `(configs, config_fpaths)` tuple of dicts keyed by run name: the
      parsed `ProjectorConfig` and the path of the file it was read from.
      Runs with no config file, or whose checkpoint cannot be found, are
      left out of both dicts.
    """
    configs = {}
    config_fpaths = {}
    for run_name, logdir in run_paths.items():
      config_fpath = os.path.join(logdir, PROJECTOR_FILENAME)
      if not file_io.file_exists(config_fpath):
        # Skip runs that have no config file.
        continue
      # Read the config file.
      file_content = file_io.read_file_to_string(config_fpath).decode('utf-8')
      config = ProjectorConfig()
      text_format.Merge(file_content, config)

      if not config.model_checkpoint_path:
        # See if you can find a checkpoint file in the logdir.
        ckpt_path = latest_checkpoint(logdir)
        if not ckpt_path:
          # Or in the parent of logdir. The parent is `<logdir>/..`; putting
          # '../' in front of logdir would instead resolve relative to the
          # current working directory, and would be dropped altogether by
          # os.path.join when logdir is an absolute path.
          ckpt_path = latest_checkpoint(os.path.join(logdir, os.pardir))
          if not ckpt_path:
            logging.warning('Cannot find model checkpoint in %s', logdir)
            continue
        config.model_checkpoint_path = ckpt_path

      # Sanity check for the checkpoint file.
      if not file_io.file_exists(config.model_checkpoint_path):
        logging.warning('Checkpoint file %s not found',
                        config.model_checkpoint_path)
        continue
      configs[run_name] = config
      config_fpaths[run_name] = config_fpath
    return configs, config_fpaths
  def testRecoverSession(self):
    """Saves a checkpoint, then recovers it by directory and by file path."""
    # Create a checkpoint.
    checkpoint_dir = os.path.join(self.get_temp_dir(), "recover_session")
    try:
      gfile.DeleteRecursively(checkpoint_dir)
    except errors.OpError:
      pass  # Best-effort cleanup; failure to delete is deliberately ignored.
    gfile.MakeDirs(checkpoint_dir)

    with ops.Graph().as_default():
      v = variables.Variable(1, name="v")
      sm = session_manager.SessionManager(
          ready_op=variables.report_uninitialized_variables())
      saver = saver_lib.Saver({"v": v})
      sess, initialized = sm.recover_session(
          "", saver=saver, checkpoint_dir=checkpoint_dir)
      # Nothing has been saved yet, so there is nothing to recover from.
      self.assertFalse(initialized)
      sess.run(v.initializer)
      # assertEqual instead of the deprecated assertEquals alias, which was
      # removed from unittest in Python 3.12.
      self.assertEqual(1, sess.run(v))
      saver.save(sess,
                 os.path.join(checkpoint_dir, "recover_session_checkpoint"))
    self._test_recovered_variable(checkpoint_dir=checkpoint_dir)
    self._test_recovered_variable(
        checkpoint_filename_with_path=saver_lib.latest_checkpoint(
            checkpoint_dir))
    # Cannot set both checkpoint_dir and checkpoint_filename_with_path.
    with self.assertRaises(ValueError):
      self._test_recovered_variable(
          checkpoint_dir=checkpoint_dir,
          checkpoint_filename_with_path=saver_lib.latest_checkpoint(
              checkpoint_dir))
def _find_latest_checkpoint(dir_path):
  """Returns the newest checkpoint in `dir_path` or, failing that, its parent.

  Args:
    dir_path: Directory to look in first.

  Returns:
    Whatever `latest_checkpoint` reports for `dir_path` when that is truthy,
    otherwise its result for the parent directory. None if the lookup raises
    `errors.NotFoundError`.
  """
  parent_dir = os.path.join(dir_path, os.pardir)
  try:
    # The parent is consulted only when the directory itself yields nothing.
    return latest_checkpoint(dir_path) or latest_checkpoint(parent_dir)
  except errors.NotFoundError:
    return None
  def testEvalOpAndFinalOp(self):
    """Checks the eval op runs `num_evals` times and the hooks list is intact."""
    ckpt_dir = os.path.join(self.get_temp_dir(), 'eval_ops_and_final_ops')

    # A checkpoint is required, so train for exactly one step.
    self._train_model(ckpt_dir, num_steps=1)
    latest = saver.latest_checkpoint(ckpt_dir)

    # Build the model so that there is something to restore into.
    logistic_classifier(
        constant_op.constant(self._inputs, dtype=dtypes.float32))

    num_evals = 5
    final_increment = 9.0

    counter = local_variable(0.0, name='MyVar')
    bump_counter = state_ops.assign_add(counter, 1.0)
    counter_plus_increment = array_ops.identity(counter) + final_increment

    hooks_passed = [evaluation._StopAfterNEvalsHook(num_evals)]
    # Snapshot taken before the call, to compare with the list afterwards.
    hooks_snapshot = list(hooks_passed)
    results = evaluation._evaluate_once(
        checkpoint_path=latest,
        eval_ops=bump_counter,
        final_ops={'value': counter_plus_increment},
        hooks=hooks_passed)
    self.assertEqual(results['value'], num_evals + final_increment)
    self.assertEqual(hooks_snapshot, hooks_passed)
  def testEvaluateWithFiniteInputs(self):
    """Evaluates a trained model over one epoch of queued inputs."""
    checkpoint_dir = os.path.join(self.get_temp_dir(),
                                  'evaluate_with_finite_inputs')

    # Train a Model to completion:
    self._train_model(checkpoint_dir, num_steps=300)

    # Run evaluation. Inputs are fed through input producer for one epoch.
    all_inputs = constant_op.constant(self._inputs, dtype=dtypes.float32)
    all_labels = constant_op.constant(self._labels, dtype=dtypes.float32)

    single_input, single_label = training.slice_input_producer(
        [all_inputs, all_labels], num_epochs=1)
    inputs, labels = training.batch([single_input, single_label], batch_size=6,
                                    allow_smaller_final_batch=True)

    logits = logistic_classifier(inputs)
    predictions = math_ops.round(logits)

    accuracy, update_op = metrics.accuracy(
        predictions=predictions, labels=labels)

    checkpoint_path = saver.latest_checkpoint(checkpoint_dir)

    final_ops_values = evaluation._evaluate_once(
        checkpoint_path=checkpoint_path,
        eval_ops=update_op,
        final_ops={'accuracy': accuracy,
                   'eval_steps': evaluation._get_or_create_eval_step()},
        hooks=[evaluation._StopAfterNEvalsHook(None),])
    # assertGreater reports the actual accuracy on failure, which
    # assertTrue(a > b) would hide behind "False is not true".
    self.assertGreater(final_ops_values['accuracy'], .99)
    # Runs evaluation for 4 iterations. First 2 evaluate full batch of 6 inputs
    # each; the 3rd iter evaluates the remaining 4 inputs, and the last one
    # triggers an error which stops evaluation.
    self.assertEqual(final_ops_values['eval_steps'], 4)
def graph_def_from_checkpoint(checkpoint_dir, output_node_names):
  """Builds a constant-only GraphDef from the newest checkpoint.

  The meta graph stored next to the latest checkpoint is imported, the
  checkpoint is restored into a session, and the variables of the current
  default graph are converted to constants.

  Args:
    checkpoint_dir: Path to the checkpoints.
    output_node_names: List of name strings for the result nodes of the graph.

  Returns:
    A GraphDef from the latest checkpoint, with variables turned into
    constants.

  Raises:
    ValueError: if no checkpoint is found
  """
  latest = saver_lib.latest_checkpoint(checkpoint_dir)
  if latest is None:
    message = 'Could not find a checkpoint at: {0}.'.format(checkpoint_dir)
    raise ValueError(message)

  # The meta graph file sits next to the checkpoint, with a '.meta' suffix.
  meta_graph_path = latest + '.meta'
  restorer = saver_lib.import_meta_graph(meta_graph_path, clear_devices=True)
  with session.Session() as sess:
    restorer.restore(sess, latest)
    return graph_util.convert_variables_to_constants(
        sess, ops.get_default_graph().as_graph_def(), output_node_names)
  def export_fn(estimator, export_dir_base, checkpoint_path, eval_result=None):
    """Exports the Estimator as a SavedModel if the selector picks a model.

    Args:
      estimator: the Estimator to export.
      export_dir_base: A string containing a directory to write the exported
        graph and checkpoints.
      checkpoint_path: The checkpoint path to export. When falsy, the most
        recent checkpoint found within the estimator's model directory is
        used instead.
      eval_result: Evaluation result handed to the best-model selector
        together with the checkpoint path.

    Returns:
      The result of the export strategy's `export` call when the selector
      returns both a checkpoint path and an eval result; otherwise the empty
      string.
    """
    # TODO(b/67425018): switch to
    #    checkpoint_path = estimator.latest_checkpoint()
    #  as soon as contrib is cleaned up and we can thus be sure that
    #  estimator is a tf.estimator.Estimator and not a
    #  tf.contrib.learn.Estimator
    candidate_path = (
        checkpoint_path or saver.latest_checkpoint(estimator.model_dir))
    best_path, best_eval = best_model_selector.update(
        candidate_path, eval_result)

    if not best_path or best_eval is None:
      return ''

    # The export directory is named after the selected checkpoint file.
    export_dir = os.path.join(export_dir_base, os.path.basename(best_path))
    return best_model_export_strategy.export(
        estimator, export_dir, best_path, best_eval)
Exemple #8
0
  def _infer_model(
      self, input_fn, feed_fn=None, outputs=None, as_iterable=False):
    """Builds the prediction graph and runs inference from the last checkpoint.

    Args:
      input_fn: Input function used to obtain the features.
      feed_fn: Optional feed function forwarded to the inference helper.
      outputs: Optional collection of prediction keys to keep; when given,
        predictions whose key is not in it are dropped.
      as_iterable: Selects `_infer_model_as_iterable` when truthy, otherwise
        `_infer_model_single`.

    Returns:
      Whatever the selected inference helper returns.

    Raises:
      NotFittedError: If no checkpoint is found in the model directory.
      ValueError: If `outputs` filters out every prediction.
    """
    # A checkpoint must exist, i.e. the model has been trained.
    latest = saver.latest_checkpoint(self._model_dir)
    if not latest:
      raise NotFittedError("Couldn't find trained model at %s."
                           % self._model_dir)

    with ops.Graph().as_default() as g:
      random_seed.set_random_seed(self._config.tf_random_seed)
      contrib_framework.create_global_step(g)
      features = self._get_features_from_input_fn(input_fn)
      predictions = self._get_predict_ops(features)

      # A bare (non-dict) prediction is wrapped for uniform handling; the flag
      # tells the helper whether the caller expects a dict back.
      returns_dict = isinstance(predictions, dict)
      if not returns_dict:
        predictions = {'predictions': predictions}

      # Restrict to the requested outputs, if any were given.
      if outputs:
        available = predictions.keys()
        selected = {}
        for key, value in predictions.items():
          if key in outputs:
            selected[key] = value
        if not selected:
          raise ValueError('Expected to run at least one output from %s, '
                           'provided %s.' % (available, outputs))
        predictions = selected

      runner = (self._infer_model_as_iterable if as_iterable
                else self._infer_model_single)
      return runner(latest, predictions, feed_fn, returns_dict)
 def test_recovery(self):
   """Checks that restarted sessions resume from the last saved global step."""
   logdir = _test_dir(self.get_temp_dir(), 'test_recovery')
   with ops.Graph().as_default():
     global_step = variables_lib.get_or_create_global_step()
     increment_step = state_ops.assign_add(global_step, 1)
     scaffold = monitored_session.Scaffold()
     # Hook configured with save_steps=1 to write checkpoints into logdir.
     saver_hook = basic_session_run_hooks.CheckpointSaverHook(
         logdir, save_steps=1, scaffold=scaffold)
     first_creator = monitored_session.ChiefSessionCreator(
         scaffold, checkpoint_dir=logdir)
     with monitored_session.MonitoredSession(
         session_creator=first_creator, hooks=[saver_hook]) as sess:
       self.assertEqual(0, sess.run(global_step))
       self.assertEqual(1, sess.run(increment_step))
       self.assertEqual(2, sess.run(increment_step))

     # A restart pointed at the checkpoint directory recovers automatically.
     dir_creator = monitored_session.ChiefSessionCreator(
         scaffold, checkpoint_dir=logdir)
     with monitored_session.MonitoredSession(
         session_creator=dir_creator) as sess:
       self.assertEqual(2, sess.run(global_step))

     # So does a restart pointed directly at the latest checkpoint file.
     file_creator = monitored_session.ChiefSessionCreator(
         scaffold,
         checkpoint_filename_with_path=saver_lib.latest_checkpoint(logdir))
     with monitored_session.MonitoredSession(
         session_creator=file_creator) as sess:
       self.assertEqual(2, sess.run(global_step))
Exemple #10
0
def export_estimator(estimator, export_dir, input_fn=_default_input_fn,
                     signature_fn=_generic_signature_fn, default_batch_size=1,
                     exports_to_keep=None):
  """Exports the estimator's inference graph into `export_dir`.

  Args:
    estimator: Estimator to export
    export_dir: A string containing a directory to write the exported graph
      and checkpoints.
    input_fn: Function called with the estimator and a `Tensor` of `Example`
      strings; it parses them into the features passed to the model.
    signature_fn: Function called with the `Tensor` of `Example` strings, the
      features and the predictions; it returns the default and the named
      exporting signatures.
    default_batch_size: Default batch size of the `Example` placeholder.
    exports_to_keep: Number of exports to keep, or None to pass no limit on.
  """
  latest = tf_saver.latest_checkpoint(estimator._model_dir)
  with ops.Graph().as_default() as g:
    contrib_variables.create_global_step(g)
    serialized_examples = array_ops.placeholder(
        dtype=dtypes.string,
        shape=[default_batch_size],
        name='input_example_tensor')
    features = input_fn(estimator, serialized_examples)
    predictions = estimator._get_predict_ops(features)
    default_signature, named_graph_signatures = signature_fn(
        serialized_examples, features, predictions)
    # The numeric limit is turned into a gc policy only when one was given.
    keep_policy = (None if exports_to_keep is None
                   else gc.largest_export_versions(exports_to_keep))
    _export_graph(g, _get_saver(), latest, export_dir,
                  default_graph_signature=default_signature,
                  named_graph_signatures=named_graph_signatures,
                  exports_to_keep=keep_policy)
Exemple #11
0
 def create_session(self, checkpoint_dir):
     """Returns a MonitoredSession set up from the newest checkpoint.

     Args:
       checkpoint_dir: Directory searched for the latest checkpoint.
     """
     latest = saver.latest_checkpoint(checkpoint_dir)
     creator = training.ChiefSessionCreator(
         checkpoint_filename_with_path=latest,
         config=self._session_config())
     return training.MonitoredSession(session_creator=creator)
Exemple #12
0
  def every_n_step_end(self, step, outputs):
    """Evaluates the newest checkpoint and applies the early stopping rule.

    Args:
      step: Training step at which the monitor fired.
      outputs: Step outputs, forwarded unchanged to the parent class.

    Returns:
      True when early stopping decides that training should stop, False in
      every other case (including when evaluation is skipped).

    Raises:
      ValueError: If no estimator has been set, or if the early stopping
        metric is missing from the validation outputs.
    """
    super(ValidationMonitor, self).every_n_step_end(step, outputs)
    # TODO(mdan): The use of step below is probably misleading.
    # The code should probably use the step from the checkpoint, because
    # that's what is being evaluated.
    if self._estimator is None:
      raise ValueError("Missing call to set_estimator.")

    # Rate-limit how often the model directory is polled for checkpoints.
    now = time.time()
    if self._check_interval_secs is not None:
      previous_check = self._last_checkpoint_check_time
      if (previous_check is not None and
          now - previous_check <= self._check_interval_secs):
        logging.debug(
            "Skipping evaluation since less than %d seconds have passed since "
            "last check for a new checkpoint.", self._check_interval_secs)
        return False
    self._last_checkpoint_check_time = now

    # Only evaluate a checkpoint that exists and was not evaluated before.
    newest = saver_lib.latest_checkpoint(self._estimator.model_dir)
    if newest is None:
      logging.debug("Skipping evaluation since model has not been saved yet "
                    "at step %d.", step)
      return False
    if newest == self._latest_path:
      logging.debug("Skipping evaluation due to same checkpoint %s for step %d "
                    "as for step %d.", newest, step,
                    self._latest_path_step)
      return False
    self._latest_path = newest
    self._latest_path_step = step

    # Run evaluation and log it.
    results = self._evaluate_estimator()
    summary = ", ".join(
        "%s = %s" % (name, str(results[name])) for name in results)
    logging.info("Validation (step %d): %s", step, summary)

    # Early stopping logic.
    if self.early_stopping_rounds is None:
      return False

    if self.early_stopping_metric not in results:
      raise ValueError("Metric %s missing from outputs %s." %
                       (self.early_stopping_metric,
                        set(results.keys())))
    current_value = results[self.early_stopping_metric]
    # The first value always counts as the best; afterwards the comparison
    # direction depends on whether the metric is minimized or maximized.
    if self._best_value is None:
      improved = True
    elif self.early_stopping_metric_minimize:
      improved = current_value < self._best_value
    else:
      improved = current_value > self._best_value
    if improved:
      self._best_value = current_value
      self._best_metrics = copy.deepcopy(results)
      self._best_value_step = step

    if step - self._best_value_step >= self.early_stopping_rounds:
      logging.info("Stopping. Best step: {} with {} = {}.".format(
          self._best_value_step, self.early_stopping_metric,
          self._best_value))
      self._early_stopped = True
      return True
    return False
  def testMultiEvalStepIncrements(self):
    """Checks eval ops run half as often when they also bump the eval step."""
    ckpt_dir = os.path.join(self.get_temp_dir(), 'eval_ops_and_final_ops')

    # A checkpoint is required, so train for exactly one step.
    self._train_model(ckpt_dir, num_steps=1)
    latest = saver.latest_checkpoint(ckpt_dir)

    # Build the model so that there is something to restore into.
    logistic_classifier(
        constant_op.constant(self._inputs, dtype=dtypes.float32))

    num_evals = 6

    counter = local_variable(0.0, name='MyVar')
    # Besides counting, the eval ops increase the eval step one more time.
    bump_counter = state_ops.assign_add(counter, 1.0)
    bump_eval_step = state_ops.assign_add(
        evaluation._get_or_create_eval_step(), 1, use_locking=True)
    expected_updates = num_evals // 2

    counter_value = array_ops.identity(counter)

    results = evaluation._evaluate_once(
        checkpoint_path=latest,
        eval_ops=[bump_counter, bump_eval_step],
        final_ops={'value': counter_value},
        hooks=[evaluation._StopAfterNEvalsHook(num_evals)])
    self.assertEqual(results['value'], expected_updates)
 def testUsageGraph(self):
   """Expected usage when graph building."""
   with context.graph_mode():
     steps_per_run = 10
     ckpt_dir = self.get_temp_dir()
     ckpt_prefix = os.path.join(ckpt_dir, "ckpt")
     # Each pass stands for one restart of the same training job.
     for run_index in range(3):
       with ops.Graph().as_default():
         network = MyNetwork()
         optimizer = adam.AdamOptimizer(0.001)
         root = checkpointable_utils.Checkpoint(
             optimizer=optimizer, network=network,
             global_step=training_util.get_or_create_global_step())
         input_value = constant_op.constant([[3.]])
         train_op = optimizer.minimize(
             network(input_value),
             global_step=root.global_step)
         latest = core_saver.latest_checkpoint(ckpt_dir)
         with self.test_session(graph=ops.get_default_graph()) as sess:
           status = root.restore(save_path=latest)
           status.initialize_or_restore(session=sess)
           if latest is not None:
             status.assert_consumed()
           else:
             # No checkpoint is expected only on the very first run.
             self.assertEqual(0, run_index)
             with self.assertRaises(AssertionError):
               status.assert_consumed()
           for _ in range(steps_per_run):
             sess.run(train_op)
           root.save(file_prefix=ckpt_prefix, session=sess)
           self.assertEqual((run_index + 1) * steps_per_run,
                            sess.run(root.global_step))
           self.assertEqual(run_index + 1,
                            sess.run(root.save_counter))
 def testAgnosticUsage(self):
   """Graph/eager agnostic usage."""
   # Does create garbage when executing eagerly due to ops.Graph() creation.
   steps_per_run = 10
   ckpt_dir = self.get_temp_dir()
   ckpt_prefix = os.path.join(ckpt_dir, "ckpt")
   # Each pass stands for one restart of the same training job.
   for run_index in range(3):
     with ops.Graph().as_default(), self.test_session(
         graph=ops.get_default_graph()):
       network = MyNetwork()
       optimizer = adam.AdamOptimizer(0.001)
       root = checkpointable_utils.Checkpoint(
           optimizer=optimizer, network=network,
           global_step=training_util.get_or_create_global_step())
       latest = core_saver.latest_checkpoint(ckpt_dir)
       status = root.restore(save_path=latest)
       input_value = constant_op.constant([[3.]])
       loss_fn = functools.partial(network, input_value)
       run_train_step = functools.partial(
           optimizer.minimize, loss_fn, global_step=root.global_step)
       if context.in_graph_mode():
         # In graph mode, build the train op once and evaluate it per step.
         run_train_step = functools.partial(self.evaluate, run_train_step())
       status.initialize_or_restore()
       for _ in range(steps_per_run):
         run_train_step()
       root.save(file_prefix=ckpt_prefix)
       self.assertEqual((run_index + 1) * steps_per_run,
                        self.evaluate(root.global_step))
       self.assertEqual(run_index + 1,
                        self.evaluate(root.save_counter))
Exemple #16
0
  def _infer_model(self, x=None, input_fn=None, feed_fn=None, batch_size=None):
    """Runs inference from the latest checkpoint in the model directory.

    Args:
      x: Optional raw input; when given, `input_fn` and `feed_fn` are derived
        from it and replace the ones passed in.
      input_fn: Input function used to obtain the features.
      feed_fn: Optional function producing feed dicts; when None, inference
        runs without feeds.
      batch_size: Batch size used when converting `x`; None is treated as -1.

    Returns:
      A dict of predictions when the model's predict ops are a dict,
      otherwise the single predictions value.
    """
    if batch_size is None:
      batch_size = -1
    # Converts inputs into tf.DataFrame / tf.Series.
    if x is not None:
      input_fn, feed_fn = _get_predict_input_fn(x, None, batch_size)

    latest = saver.latest_checkpoint(self._model_dir)
    with ops.Graph().as_default() as g:
      random_seed.set_random_seed(self._config.tf_random_seed)
      contrib_framework.create_global_step(g)
      features = self._get_features_from_input_fn(input_fn)
      predictions = self._get_predict_ops(features)
      # A bare prediction is wrapped in a dict and unwrapped again on return.
      return_dict = isinstance(predictions, dict)
      if not return_dict:
        predictions = {'predictions': predictions}

      if feed_fn is None:
        preds = infer(latest, predictions)
      else:
        def _endless_feeds():
          # Keeps asking feed_fn for the next feed dict, without a stop
          # condition of its own.
          while True:
            yield feed_fn()
        batches = graph_actions.run_feeds(
            output_dict=predictions,
            feed_dicts=_endless_feeds(),
            restore_checkpoint_path=latest)
        # Stitch the per-feed outputs together along the first axis.
        preds = {
            key: np.concatenate([batch[key] for batch in batches], axis=0)
            for key in predictions
        }
      return preds if return_dict else preds['predictions']
Exemple #17
0
def create_session(checkpoint_path, n_cpu_threads=-1):
    """Creates a MonitoredSession.

    Args:
      checkpoint_path (string): Path either to checkpoint directory or
                                directly to a checkpoint file.
      n_cpu_threads (int): Number of CPU threads. If negative, we
                           assume either GPU decoding or that all
                           CPU cores can be used.
    Returns:
      A TensorFlow MonitoredSession.

    Raises:
      AttributeError: If creating the session raises a TensorFlow
        ``NotFoundError``.
    """
    try:
        if os.path.isdir(checkpoint_path):
            # A directory was given: use the newest checkpoint inside it.
            checkpoint_path = saver.latest_checkpoint(checkpoint_path)
        else:
            # Pass the path as a logging argument instead of formatting it
            # eagerly with `%`, so formatting is left to the logger.
            logging.info("%s is not a directory. Interpreting as direct "
                         "path to checkpoint...", checkpoint_path)
        return training.MonitoredSession(
            session_creator=training.ChiefSessionCreator(
                checkpoint_filename_with_path=checkpoint_path,
                config=session_config(n_cpu_threads)))
    except tf.errors.NotFoundError:
        logging.fatal("Could not find all variables of the computation "
            "graph in the T2T checkpoint file. This means that the "
            "checkpoint does not correspond to the model specified in "
            "SGNMT. Please double-check pred_src_vocab_size, "
            "pred_trg_vocab_size, and all the t2t_* parameters. "
            "Also make sure that the checkpoint exists and is readable")
        # The exception type is kept as AttributeError for existing callers.
        raise AttributeError("Could not initialize TF session.")
Exemple #18
0
  def _evaluate_model(self,
                      input_fn,
                      steps,
                      feed_fn=None,
                      metrics=None,
                      name=''):
    """Evaluates the latest checkpoint and returns the evaluation results.

    Args:
      input_fn: Function returning a `(features, targets)` pair.
      steps: Passed to `evaluate` as `max_steps`.
      feed_fn: Optional feed function forwarded to `evaluate`.
      metrics: Metric functions to use; None selects the default metric
        functions.
      name: Optional suffix for the evaluation output directory
        (`eval_<name>` instead of `eval`).

    Returns:
      The evaluation results, or None when the configured execution mode is
      not one of 'all', 'evaluate' or 'eval_evalset'.
    """
    if self._config.execution_mode not in ('all', 'evaluate', 'eval_evalset'):
      return

    latest = saver.latest_checkpoint(self._model_dir)
    eval_subdir = 'eval_' + name if name else 'eval'
    eval_dir = os.path.join(self._model_dir, eval_subdir)
    with ops.Graph().as_default() as g:
      random_seed.set_random_seed(self._config.tf_random_seed)
      global_step = contrib_framework.create_global_step(g)
      features, targets = input_fn()
      self._check_inputs(features, targets)
      if metrics is None:
        metric_fns = self._get_default_metric_functions()
      else:
        metric_fns = metrics
      eval_dict = self._get_eval_ops(features, targets, metric_fns)
      update_op, eval_dict = self._extract_metric_update_ops(eval_dict)
      # Only the first element of the returned pair is handed back.
      eval_results, _ = evaluate(
          graph=g,
          output_dir=eval_dir,
          checkpoint_path=latest,
          eval_dict=eval_dict,
          update_op=update_op,
          global_step_tensor=global_step,
          supervisor_master=self._config.master,
          feed_fn=feed_fn,
          max_steps=steps)
      return eval_results
Exemple #19
0
  def _infer_model(self,
                   x=None, input_fn=None, feed_fn=None,
                   batch_size=None, axis=None, proba=False):
    """Runs inference from the latest checkpoint, optionally feed by feed.

    Args:
      x: Optional raw input; when given, `input_fn` and `feed_fn` are derived
        from it and replace the ones passed in.
      input_fn: Function returning a `(features, targets)` pair; only the
        features are used.
      feed_fn: Optional function returning the next feed dict. Raising
        `StopIteration` or returning None ends the feeding loop.
      batch_size: Batch size used when converting `x`; None is treated as -1.
      axis: Not used by this method.
      proba: Not used by this method.

    Returns:
      A dict of predictions. With a `feed_fn`, the per-feed outputs of each
      key are concatenated along the first axis.
    """
    if batch_size is None:
      batch_size = -1
    # Converts inputs into tf.DataFrame / tf.Series.
    if x is not None:
      input_fn, feed_fn = _get_predict_input_fn(x, batch_size)

    latest = saver.latest_checkpoint(self._model_dir)
    with ops.Graph().as_default() as g:
      random_seed.set_random_seed(self._config.tf_random_seed)
      contrib_framework.create_global_step(g)
      features, _ = input_fn()
      predictions = self._get_predict_ops(features)
      if not isinstance(predictions, dict):
        predictions = {'predictions': predictions}
      # TODO(ipolosukhin): Support batching
      if feed_fn is None:
        return infer(latest, predictions)

      collected = {}
      while True:
        try:
          feed_dict = feed_fn()
        except StopIteration:
          break
        if feed_dict is None:
          break
        batch_outputs = infer(latest, predictions, feed_dict=feed_dict)
        for key in batch_outputs:
          collected.setdefault(key, []).append(batch_outputs[key])
      return {
          key: np.concatenate(chunks, axis=0)
          for key, chunks in collected.items()
      }
 def testDeferredRestorationUsageEager(self):
   """An idiomatic eager execution example."""
   steps_per_run = 10
   ckpt_dir = self.get_temp_dir()
   ckpt_prefix = os.path.join(ckpt_dir, "ckpt")
   # Will be saved with the checkpoint eventually; until then it is carried
   # over from one run to the next by hand.
   object_graph = None
   for run_index in range(3):
     with ops.Graph().as_default():
       network = MyNetwork()
       optimizer = CheckpointableAdam(0.001)
       root = Root(optimizer=optimizer, network=network)
       checkpointable.restore(
           save_path=core_saver.latest_checkpoint(ckpt_dir),
           root_checkpointable=root,
           object_graph_proto=object_graph)
       for _ in range(steps_per_run):
         # TODO(allenl): Use a Dataset and serialize/checkpoint it.
         input_value = constant_op.constant([[3.]])
         optimizer.minimize(
             lambda: network(input_value),  # pylint: disable=cell-var-from-loop
             global_step=root.global_step)
       object_graph, _ = checkpointable.save(
           file_prefix=ckpt_prefix,
           root_checkpointable=root)
       expected_step = (run_index + 1) * steps_per_run
       self.assertEqual(expected_step, root.global_step.numpy())
  def _restore_or_save_initial_ckpt(self, session):
    """Restores from the latest checkpoint, or saves an initial one.

    Args:
      session: Session used both to restore and to save.
    """
    # Ideally this would run in after_create_session, but there is currently
    # no way to enforce an order in which `SessionRunHooks` run, so the
    # `_DatasetInitializerHook` could run *after* this hook. That would be a
    # problem because:
    # 1. If a checkpoint exists and this hook restores it, the initializer
    #    hook will override it.
    # 2. If no checkpoint exists, this hook will try to save the iterator
    #    (presumably before it has been initialized), which will result in an
    #    exception.
    #
    # As a temporary fix there is an implicit contract between this hook and
    # the _DatasetInitializerHook:
    # 1. The _DatasetInitializerHook initializes the iterator in the call to
    #    after_create_session.
    # 2. This hook saves the iterator on the first call to `before_run()`,
    #    which is guaranteed to happen after `after_create_session()` of all
    #    hooks have been run.

    # pylint: disable=protected-access
    saver_hook = self._checkpoint_saver_hook
    existing_ckpt = saver_lib.latest_checkpoint(
        saver_hook._checkpoint_dir, latest_filename=self._latest_filename)
    if not existing_ckpt:
      # No checkpoint yet: the one saved here is the state at step
      # "global_step".
      # Note: We do not save the GraphDef or MetaGraphDef here.
      current_step = session.run(saver_hook._global_step_tensor)
      saver_hook._save(session, current_step)
      saver_hook._timer.update_last_triggered_step(current_step)
      return
    # A checkpoint exists: restore from it.
    saver_hook._get_saver().restore(session, existing_ckpt)
Exemple #22
0
  def predict(self, input_fn, predict_keys=None, hooks=None, checkpoint_path=None):
    """Yields predictions for the features produced by `input_fn`.

    Args:
      input_fn: Input function returning features which is a dictionary of
        string feature name to `Tensor` or `SparseTensor`. If it returns a
        tuple, first item is extracted as features. Prediction continues until
        `input_fn` raises an end-of-input exception (`OutOfRangeError` or
        `StopIteration`).
      predict_keys: list of `str`, name of the keys to predict. It is used if
        the `EstimatorSpec.predictions` is a `dict`. If `predict_keys` is used
        then rest of the predictions will be filtered from the dictionary. If
        `None`, returns all.
      hooks: List of `SessionRunHook` subclass instances. Used for callbacks
        inside the prediction call.
      checkpoint_path: Path of a specific checkpoint to predict. If `None`, the
        latest checkpoint in `model_dir` is used.

    Yields:
      Evaluated values of `predictions` tensors, one example at a time.

    Raises:
      ValueError: Could not find a trained model in model_dir.
      ValueError: if batch length of predictions are not same.
      ValueError: If there is a conflict between `predict_keys` and
        `predictions`. For example if `predict_keys` is not `None` but
        `EstimatorSpec.predictions` is not a `dict`.
    """
    hooks = _check_hooks_type(hooks)
    # Fall back to the newest checkpoint; fail if the model was never trained.
    restore_from = checkpoint_path or saver.latest_checkpoint(self._model_dir)
    if not restore_from:
      raise ValueError('Could not find trained model in model_dir: {}.'.format(
          self._model_dir))

    with ops.Graph().as_default() as g:
      random_seed.set_random_seed(self._config.tf_random_seed)
      training.create_global_step(g)
      features = self._get_features_from_input_fn(input_fn)
      estimator_spec = self._call_model_fn(features, None,
                                           model_fn_lib.ModeKeys.PREDICT)
      predictions = self._extract_keys(estimator_spec.predictions, predict_keys)
      session_creator = training.ChiefSessionCreator(
          checkpoint_filename_with_path=restore_from,
          scaffold=estimator_spec.scaffold,
          config=config_pb2.ConfigProto(allow_soft_placement=True))
      with training.MonitoredSession(
          session_creator=session_creator, hooks=hooks) as mon_sess:
        while not mon_sess.should_stop():
          batch = mon_sess.run(predictions)
          if isinstance(predictions, dict):
            # Split the batched dict into one dict per example.
            for i in range(self._extract_batch_length(batch)):
              yield {
                  key: value[i]
                  for key, value in six.iteritems(batch)
              }
          else:
            for pred in batch:
              yield pred
Exemple #23
0
def _save_first_checkpoint(keras_model, estimator, custom_objects,
                           keras_weights):
  """Save first checkpoint for the keras Estimator.

  Nothing is written when `estimator.model_dir` already holds a checkpoint.
  Otherwise a training-mode model is built in a fresh graph via
  `_clone_and_build_model`, optionally loaded with `keras_weights`, and saved
  as `keras_model.ckpt` inside `estimator.model_dir`.

  Args:
    keras_model: an instance of compiled keras model.
    estimator: keras estimator.
    custom_objects: Dictionary for custom objects.
    keras_weights: A flat list of Numpy arrays for weights of given keras_model.

  Returns:
    None. The function is used for its side effect of writing a checkpoint.
  """
  # Load weights and save to checkpoint if there is no checkpoint
  latest_path = saver_lib.latest_checkpoint(estimator.model_dir)
  if not latest_path:
    # Work in a throwaway graph so the caller's default graph is untouched.
    with ops.Graph().as_default():
      random_seed.set_random_seed(estimator.config.tf_random_seed)
      training_util.create_global_step()
      model = _clone_and_build_model(model_fn_lib.ModeKeys.TRAIN, keras_model,
                                     custom_objects)
      # save to checkpoint
      with session.Session(config=estimator._session_config) as sess:
        # An empty/None `keras_weights` leaves the built model's weights as is.
        if keras_weights:
          model.set_weights(keras_weights)
        # Make update ops and initialize all variables.
        if not model.train_function:
          # pylint: disable=protected-access
          model._make_train_function()
          K._initialize_variables(sess)
          # pylint: enable=protected-access
        saver = saver_lib.Saver()
        saver.save(sess, os.path.join(estimator.model_dir, 'keras_model.ckpt'))
Exemple #24
0
def wait_for_new_checkpoint(checkpoint_dir,
                            last_checkpoint,
                            seconds_to_sleep=1,
                            timeout=None):
  """Polls `checkpoint_dir` until a checkpoint other than the last one appears.

  Args:
    checkpoint_dir: Directory that checkpoints are written to.
    last_checkpoint: Path of the checkpoint that was handled most recently.
    seconds_to_sleep: Pause, in seconds, between two consecutive polls.
    timeout: Upper bound, in seconds, on the total wait. `None` means the
      function keeps polling until a new checkpoint shows up.

  Returns:
    The path of the newly found checkpoint, or None once the timeout would be
    exceeded by another sleep.
  """
  logging.info('Waiting for new checkpoint at %s', checkpoint_dir)
  deadline = None if timeout is None else time.time() + timeout
  while True:
    candidate = tf_saver.latest_checkpoint(checkpoint_dir)
    if candidate is not None and candidate != last_checkpoint:
      logging.info('Found new checkpoint at %s', candidate)
      return candidate
    # Give up early if sleeping once more would overshoot the deadline.
    if deadline is not None and time.time() + seconds_to_sleep > deadline:
      return None
    time.sleep(seconds_to_sleep)
Exemple #25
0
  def latest_checkpoint(self):
    """Looks up the most recently saved checkpoint under `model_dir`.

    Returns:
      Full path of the newest checkpoint, or `None` when `model_dir` holds no
      checkpoint.
    """
    newest = saver.latest_checkpoint(self.model_dir)
    return newest
Exemple #26
0
  def _evaluate_model(self,
                      input_fn,
                      hooks=None,
                      checkpoint_path=None,
                      name=''):
    """Evaluates the model using the training.evaluation library.

    Args:
      input_fn: Input function handed to
        `_get_features_and_labels_from_input_fn` to obtain features and labels.
      hooks: Hooks forwarded unchanged to the evaluation run.
      checkpoint_path: Checkpoint to evaluate. When falsy, the latest
        checkpoint found in `self._model_dir` is used.
      name: Optional suffix for the output directory; results go to `eval`
        when empty and to `eval_<name>` otherwise.

    Returns:
      The dictionary of evaluated metrics, which also carries the loss metric
      and the global step added by this method.

    Raises:
      ValueError: If no checkpoint is given and none exists in the model
        directory, or if the model's metrics already use the reserved loss or
        `global_step` keys.
    """
    # Check that model has been trained (if nothing has been set explicitly).
    if not checkpoint_path:
      latest_path = saver.latest_checkpoint(self._model_dir)
      if not latest_path:
        raise ValueError('Could not find trained model in model_dir: {}.'.
                         format(self._model_dir))
      checkpoint_path = latest_path

    # Setup output directory.
    eval_dir = os.path.join(self._model_dir, 'eval' if not name else
                            'eval_' + name)

    with ops.Graph().as_default() as g:
      random_seed.set_random_seed(self._config.tf_random_seed)
      global_step_tensor = self._create_and_assert_global_step(g)
      features, labels = self._get_features_and_labels_from_input_fn(
          input_fn, model_fn_lib.ModeKeys.EVAL)
      estimator_spec = self._call_model_fn(
          features, labels, model_fn_lib.ModeKeys.EVAL)

      # The loss key is reserved: the mean loss is always reported under it.
      if model_fn_lib.LOSS_METRIC_KEY in estimator_spec.eval_metric_ops:
        raise ValueError(
            'Metric with name "%s" is not allowed, because Estimator ' % (
                model_fn_lib.LOSS_METRIC_KEY) +
            'already defines a default metric with the same name.')
      estimator_spec.eval_metric_ops[
          model_fn_lib.LOSS_METRIC_KEY] = metrics_lib.mean(estimator_spec.loss)

      update_op, eval_dict = _extract_metric_update_ops(
          estimator_spec.eval_metric_ops)

      # The global step is reserved as well and is appended to the results.
      if ops.GraphKeys.GLOBAL_STEP in eval_dict:
        raise ValueError(
            'Metric with name `global_step` is not allowed, because Estimator '
            'already defines a default metric with the same name.')
      eval_dict[ops.GraphKeys.GLOBAL_STEP] = global_step_tensor

      eval_results = evaluation._evaluate_once(  # pylint: disable=protected-access
          checkpoint_path=checkpoint_path,
          master=self._config.evaluation_master,
          scaffold=estimator_spec.scaffold,
          eval_ops=update_op,
          final_ops=eval_dict,
          hooks=hooks,
          config=self._session_config)

      # Persist the metrics as summaries keyed by the evaluated global step.
      _write_dict_to_summary(
          output_dir=eval_dir,
          dictionary=eval_results,
          current_global_step=eval_results[ops.GraphKeys.GLOBAL_STEP])

    return eval_results
Exemple #27
0
def _export_estimator(estimator,
                      export_dir,
                      signature_fn,
                      input_fn,
                      default_batch_size,
                      exports_to_keep):
  """Exports the estimator's latest checkpoint together with its signatures.

  Args:
    estimator: Estimator whose `_model_dir` checkpoint and predict ops are
      exported.
    export_dir: Destination passed through to `_export_graph`.
    signature_fn: Optional callable `(examples, features, predictions)`
      returning `(default_signature, named_graph_signatures)`. When falsy, a
      signature function is chosen from the estimator's target column, falling
      back to `generic_signature_fn` if the estimator has no target column.
    input_fn: Optional callable `(estimator, examples)` returning features;
      defaults to `_default_input_fn`.
    default_batch_size: First dimension of the string placeholder named
      `input_example_tensor`.
    exports_to_keep: If not `None`, passed to `gc.largest_export_versions` and
      the result is forwarded to `_export_graph`.

  Raises:
    ValueError: If no `signature_fn` is given and the target column's problem
      type has no standard signature.
  """
  input_fn = input_fn or _default_input_fn
  checkpoint_path = tf_saver.latest_checkpoint(estimator._model_dir)
  with ops.Graph().as_default() as g:
    contrib_variables.create_global_step(g)
    examples = array_ops.placeholder(dtype=dtypes.string,
                                     shape=[default_batch_size],
                                     name='input_example_tensor')
    features = input_fn(estimator, examples)
    predictions = estimator._get_predict_ops(features)

    # Explicit signature_fn takes priority
    if signature_fn:
      default_signature, named_graph_signatures = signature_fn(examples,
                                                               features,
                                                               predictions)
    else:
      try:
        # Some estimators provide a target_column of known type
        target_column = estimator._get_target_column()
        problem_type = target_column.problem_type

        if problem_type == layers.ProblemType.CLASSIFICATION:
          signature_fn = classification_signature_fn
        elif problem_type == layers.ProblemType.LINEAR_REGRESSION:
          signature_fn = regression_signature_fn
        elif problem_type == layers.ProblemType.LOGISTIC_REGRESSION:
          signature_fn = logistic_regression_signature_fn
        else:
          raise ValueError(
              'signature_fn must be provided because the TargetColumn is a %s, '
              'which does not have a standard problem type and so cannot use a '
              'standard export signature.' % type(target_column).__name__)

        default_signature, named_graph_signatures = (
            signature_fn(examples, features, predictions))
      except AttributeError:
        # Estimators without a target column fall back to generic signatures.
        # The first fragment now ends with a space so the message no longer
        # reads "after2016-08-01".
        logging.warn(
            'Change warning: `signature_fn` will be required after '
            '2016-08-01.\n'
            'Using generic signatures for now.  To maintain this behavior, '
            'pass:\n'
            '  signature_fn=export.generic_signature_fn\n'
            'Also consider passing a regression or classification signature; '
            'see cl/126430915 for an example.')
        default_signature, named_graph_signatures = generic_signature_fn(
            examples, features, predictions)
    if exports_to_keep is not None:
      exports_to_keep = gc.largest_export_versions(exports_to_keep)
    _export_graph(g, _get_saver(), checkpoint_path, export_dir,
                  default_graph_signature=default_signature,
                  named_graph_signatures=named_graph_signatures,
                  exports_to_keep=exports_to_keep)
 def _read_vars(self, model_dir):
   """Restores the newest checkpoint and evaluates the `my_vars` collection.

   Args:
     model_dir: Directory holding the checkpoint and its `.meta` graph file.

   Returns:
     The evaluated values of everything registered in the `my_vars`
     collection of the imported graph.
   """
   with ops.Graph().as_default() as graph:
     latest = saver_lib.latest_checkpoint(model_dir)
     # The meta graph sits next to the checkpoint with a `.meta` suffix.
     saver_lib.import_meta_graph(latest + '.meta')
     restorer = saver_lib.Saver()
     with self.test_session(graph=graph) as sess:
       restorer.restore(sess, latest)
       collected = ops.get_collection('my_vars')
       return sess.run(collected)
Exemple #29
0
def _latest_checkpoints_changed(configs, run_path_pairs):
  """Returns true if the latest checkpoint has changed in any of the runs."""
  for run_name, logdir in run_path_pairs:
    if run_name not in configs:
      continue
    config = configs[run_name]
    if not config.model_checkpoint_path:
      continue

    # See if you can find a checkpoint file in the logdir.
    ckpt_path = latest_checkpoint(logdir)
    if not ckpt_path:
      # See if you can find a checkpoint in the parent of logdir.
      ckpt_path = latest_checkpoint(os.path.join(logdir, os.pardir))
      if not ckpt_path:
        continue
    if config.model_checkpoint_path != ckpt_path:
      return True
  return False
Exemple #30
0
  def _read_config_files(self, run_paths, logdir):
    """Builds a projector config for every run that has usable data.

    Args:
      run_paths: Dict mapping run name to that run's log directory. When it is
        empty, an entry `'.' -> logdir` is inserted into it (the dict is
        modified in place).
      logdir: Fallback directory used when `run_paths` is empty.

    Returns:
      A pair `(configs, config_fpaths)` of dicts keyed by run name: the parsed
      (or default) `ProjectorConfig` and the path of its config file. Runs
      with neither a checkpoint nor tensor files, or whose checkpoint does not
      exist, are left out.
    """
    # If there are no summary event files, the projector can still work,
    # thus treating the `logdir` as the model checkpoint directory.
    if not run_paths:
      run_paths['.'] = logdir

    configs = {}
    config_fpaths = {}
    for run_name, run_dir in run_paths.items():
      config = ProjectorConfig()
      config_fpath = os.path.join(run_dir, PROJECTOR_FILENAME)
      if file_io.file_exists(config_fpath):
        file_content = file_io.read_file_to_string(config_fpath).decode('utf-8')
        text_format.Merge(file_content, config)

      has_tensor_files = any(
          embedding.tensor_path for embedding in config.embeddings)

      if not config.model_checkpoint_path:
        # See if you can find a checkpoint file in the run directory.
        ckpt_path = latest_checkpoint(run_dir)
        if not ckpt_path:
          # Or in the parent of the run directory. The parent is
          # `<run_dir>/..`; joining '../' in front of the path would point
          # somewhere else (or be discarded for absolute paths).
          ckpt_path = latest_checkpoint(os.path.join(run_dir, os.pardir))
          if not ckpt_path and not has_tensor_files:
            logging.warning('Cannot find model checkpoint in %s', run_dir)
            continue
        if ckpt_path:
          config.model_checkpoint_path = ckpt_path

      # Sanity check for the checkpoint file.
      if (config.model_checkpoint_path and
          not checkpoint_exists(config.model_checkpoint_path)):
        logging.warning('Checkpoint file %s not found',
                        config.model_checkpoint_path)
        continue
      configs[run_name] = config
      config_fpaths[run_name] = config_fpath
    return configs, config_fpaths
Exemple #31
0
 def end(self, session=None):
     """Performs a final export once monitoring ends.

     The export is skipped when the estimator's model directory has no
     checkpoint yet. A `RuntimeError` raised by the export is logged and
     swallowed.

     Args:
       session: Forwarded unchanged to the parent class's `end`.
     """
     super(ExportMonitor, self).end(session=session)
     latest_path = saver_lib.latest_checkpoint(self._estimator.model_dir)
     if latest_path is None:
         logging.info(
             "Skipping export at the end since model has not been saved "
             "yet.")
         return
     try:
         # Remember where the export landed so it can be inspected later.
         self._last_export_dir = self._estimator.export(
             self.export_dir,
             exports_to_keep=self.exports_to_keep,
             signature_fn=self.signature_fn,
             input_fn=self._input_fn,
             default_batch_size=self._default_batch_size,
             input_feature_key=self._input_feature_key,
             use_deprecated_input_fn=self._use_deprecated_input_fn)
     except RuntimeError:
         # NOTE(review): presumably raised when this step was already
         # exported - confirm against `export`.
         logging.info("Skipping exporting for the same step.")
Exemple #32
0
    def _infer_model(self,
                     x=None,
                     input_fn=None,
                     feed_fn=None,
                     batch_size=None):
        """Runs prediction ops against the latest checkpoint.

        Args:
          x: Optional in-memory input. When given, it replaces `input_fn` and
            `feed_fn` with the pair built by `_get_predict_input_fn`.
          input_fn: Callable producing the feature tensors.
          feed_fn: Optional callable returning a feed dict per batch. Feeding
            stops when it raises `StopIteration` or returns `None`.
          batch_size: Batch size handed to `_get_predict_input_fn`; `None` is
            passed on as -1.

        Returns:
          The predictions: a dict when the predict ops are a dict, otherwise
          the single unwrapped result. With a `feed_fn`, per-batch outputs
          are concatenated along axis 0.
        """
        # Converts inputs into tf.DataFrame / tf.Series.
        if batch_size is None:
            batch_size = -1
        if x is not None:
            input_fn, feed_fn = _get_predict_input_fn(x, None, batch_size)

        checkpoint_path = saver.latest_checkpoint(self._model_dir)
        with ops.Graph().as_default() as g:
            random_seed.set_random_seed(self._config.tf_random_seed)
            contrib_framework.create_global_step(g)
            features = self._get_features_from_input_fn(input_fn)
            predictions = self._get_predict_ops(features)

            # Wrap a single output so the code below only deals with dicts.
            return_dict = isinstance(predictions, dict)
            if not return_dict:
                predictions = {'predictions': predictions}

            if feed_fn is None:
                preds = infer(checkpoint_path, predictions)
            else:
                batches = {}
                while True:
                    try:
                        feed_dict = feed_fn()
                    except StopIteration:
                        break
                    if feed_dict is None:
                        break
                    outputs = infer(checkpoint_path,
                                    predictions,
                                    feed_dict=feed_dict)
                    for key in outputs:
                        batches.setdefault(key, []).append(outputs[key])
                preds = {
                    key: np.concatenate(chunks, axis=0)
                    for key, chunks in batches.items()
                }

            return preds if return_dict else preds['predictions']
Exemple #33
0
def inference_run(model, hparams, output_dir):
    """Restores the latest checkpoint and decodes a file with it.

    Args:
      model: Model handed to `model_builder_inference`.
      hparams: Hyper-parameters used to build the model and passed on to
        `inference.decode_from_file`.
      output_dir: Directory searched for the latest checkpoint.

    Raises:
      NotFittedError: If `output_dir` contains no checkpoint.
    """

    # Build Model
    tf.logging.info("Build Model...")
    model_fn_inference = model_builder_inference(model, hparams=hparams)

    # Build Graph
    tf.logging.info("Build Graph...")
    checkpoint_path = saver.latest_checkpoint(output_dir)
    if not checkpoint_path:
        raise NotFittedError("Couldn't find trained model at %s." % output_dir)

    with ops.Graph().as_default() as g:
        tf.train.create_global_step(g)
        inputs_ph = tf.placeholder(tf.int32, [None, None])  ## batch_size
        features = {"inputs": inputs_ph}
        labels = None
        infer_ops = model_fn_inference(features,
                                       labels)  # predictions, None, None
        predictions = infer_ops[0]
        # The session is created once and reused by every `predict_func` call.
        # NOTE(review): nothing in this function closes it.
        mon_sess = tf.train.MonitoredSession(
            session_creator=tf.train.ChiefSessionCreator(
                checkpoint_filename_with_path=checkpoint_path,
                config=session_config(
                    gpu_mem_fraction=FLAGS.gpu_mem_fraction)))

    def predict_func(feed_fn=None):
        # Generator yielding one dict per example of the evaluated batch.
        # Despite its name, `feed_fn` is indexed like a mapping with an
        # "inputs" entry. The graph opened here is not otherwise used.
        with ops.Graph().as_default() as g:
            inputs = feed_fn["inputs"]
            feed = {inputs_ph: inputs}
            preds = mon_sess.run(predictions, feed)

            # The batch length is read off the first output's leading axis.
            first_tensor = list(preds.values())[0]
            batch_length = first_tensor.shape[0]
            for i in range(batch_length):
                yield {key: value[i] for key, value in six.iteritems(preds)}

    tf.logging.info("Begin Decoding...")
    inference.decode_from_file(predict_func, hparams, FLAGS.decode_from_file,
                               FLAGS.decode_to_file, FLAGS.decode_batch_size,
                               FLAGS.decode_beam_size,
                               FLAGS.decode_return_beams)
def predict(output_path, separator=",", mode="w+"):
    """Writes one mean predicted probability per test file to `output_path`.

    The inference subgraph is built in the default graph, the latest
    checkpoint in `MODEL_DIR` is restored, and every segment produced by
    `generate_test_segment(DATA_ROOT, "test")` is scored.

    Args:
      output_path: File that receives the `File`/`Class` table.
      separator: Column separator used for the header and every row.
      mode: Mode string used to open `output_path`.
    """
    print("Setting up inference subgraph")
    predict_input = tf.placeholder(dtype=tf.float32, shape=[None, WINDOW_SIZE, CHANNELS])
    batch_logits = inference(predict_input, is_training=False)
    predicted_probabilities = tf.nn.sigmoid(batch_logits)
    mean_prediction = tf.reduce_mean(predicted_probabilities)

    print("Restoring model from training with best validation accuracy")
    saver = tf.train.Saver()
    checkpoint_file = latest_checkpoint(MODEL_DIR)
    # The session is a context manager so it is closed even if restoring or
    # predicting fails; it was previously never closed.
    with tf.Session() as sess:
        print("Restoring the model from a checkpoint:\t%s" % checkpoint_file)
        saver.restore(sess, checkpoint_file)

        print("Predicting")
        with open(output_path, mode=mode) as file_stream:
            print("File", "Class", file=file_stream, sep=separator)
            for segment, file_name in generate_test_segment(DATA_ROOT, "test"):
                # `keep_prob` is not defined in this function; presumably a
                # module-level dropout placeholder - verify.
                predicted_probability = sess.run(mean_prediction, feed_dict={predict_input: segment, keep_prob: 1.})
                print(file_name, predicted_probability, sep=separator, file=file_stream)
def create_session():
  """Builds a MonitoredSession restored from `--checkpoint_path`.

  The flag may name a directory, in which case its latest checkpoint is used,
  or a checkpoint path directly.

  Returns:
    A `MonitoredSession` initialised from the resolved checkpoint.

  Raises:
    AttributeError: If the flag is unset, or if TensorFlow reports that the
      checkpoint lacks variables required by the graph.
  """
  if not FLAGS.checkpoint_path:
    raise AttributeError("Please set --checkpoint_path")
  try:
    location = FLAGS.checkpoint_path
    if os.path.isdir(location):
      checkpoint_path = saver.latest_checkpoint(location)
    else:
      checkpoint_path = location
      tf.logging.info("%s is not a directory. Interpreting as direct "
                      "path to checkpoint..." % checkpoint_path)
    creator = training.ChiefSessionCreator(
        checkpoint_filename_with_path=checkpoint_path,
        config=session_config())
    return training.MonitoredSession(session_creator=creator)
  except tf.errors.NotFoundError:
    tf.logging.fatal("Could not find all variables of the computation "
                     "graph in the T2T checkpoint file. This means that the "
                     "checkpoint does not correspond to the specified model")
    raise AttributeError("Could not initialize TF session.")
Exemple #36
0
def wait_for_new_checkpoint(checkpoint_dir,
                            last_checkpoint,
                            seconds_to_sleep=1):
  """Waits until a new checkpoint file is found.

  A directory that currently has no checkpoint at all counts as "nothing new
  yet", so the function keeps waiting instead of handing `None` back as if it
  were a checkpoint path.

  Args:
    checkpoint_dir: The directory in which checkpoints are saved.
    last_checkpoint: The last checkpoint path used.
    seconds_to_sleep: The number of seconds to sleep for before looking for a
      new checkpoint.

  Returns:
    a new checkpoint path.
  """
  while True:
    checkpoint_path = tf_saver.latest_checkpoint(checkpoint_dir)
    if checkpoint_path is None or checkpoint_path == last_checkpoint:
      time.sleep(seconds_to_sleep)
    else:
      return checkpoint_path
Exemple #37
0
def captcha2text(image_list, height=CAPTCHA_HEIGHT, width=CAPTCHA_WIDTH):
    """Decodes captcha images into text with the model stored in `./model`.

    Args:
      image_list: Batch of images fed to the network's input placeholder,
        whose second dimension is `height * width`.
      height: Image height used to build the network.
      width: Image width used to build the network.

    Returns:
      A list with one decoded text per image, or None (after printing a
      message) when the `./model` directory does not exist.
    """
    model_dir_present = isdir('./model')
    if not model_dir_present:
        print('Model directory does not exists.')
        return
    images = placeholder(float32, [None, height * width])
    dropout_keep = placeholder(float32)
    logits = cnn_graph(images, dropout_keep, (height, width))
    restorer = Saver()
    with Session() as sess:
        restorer.restore(sess, latest_checkpoint('./model/'))
        # One row of scores per captcha position; pick the best entry each.
        per_position = reshape(logits, [-1, CAPTCHA_LEN, len(CAPTCHA_LIST)])
        best_index = argmax(per_position, 2)
        decoded = sess.run(best_index,
                           feed_dict={images: image_list, dropout_keep: 1})
        return [vec2text(vector) for vector in decoded.tolist()]
Exemple #38
0
    def every_n_step_end(self, step, unused_outputs):
        """Evaluates the newest checkpoint and applies early stopping.

        Args:
          step: Current training step.
          unused_outputs: Ignored.

        Returns:
          True when training should stop because the monitored metric has not
          improved for `early_stopping_rounds` steps; False otherwise,
          including when the latest checkpoint was already evaluated.
        """
        # Check that we are not running evaluation on the same checkpoint.
        latest_path = saver.latest_checkpoint(self._estimator.model_dir)
        if latest_path == self._latest_path:
            logging.info(
                "Skipping evaluation due to same checkpoint %s for step %d "
                "as for step %d.", latest_path, step, self._latest_path_step)
            return False
        self._latest_path = latest_path
        self._latest_path_step = step

        # Run evaluation and log it.
        outputs = self._estimator.evaluate(x=self.x,
                                           y=self.y,
                                           input_fn=self.input_fn,
                                           batch_size=self.batch_size,
                                           metrics=self.metrics,
                                           name=self.name)
        stats = ["%s = %s" % (name, str(outputs[name])) for name in outputs]
        logging.info("Validation (step %d): %s" % (step, ", ".join(stats)))

        # Early stopping logic.
        if self.early_stopping_rounds is None:
            return False

        current = outputs[self.early_stopping_metric]
        if self._best_value is None:
            improved = True
        elif self.early_stopping_metric_minimize:
            improved = current < self._best_value
        else:
            improved = current > self._best_value
        if improved:
            self._best_value = current
            self._best_value_step = step

        if step - self._best_value_step >= self.early_stopping_rounds:
            logging.info("Stopping. Best step: {} with {} = {}.".format(
                self._best_value_step, self.early_stopping_metric,
                self._best_value))
            self._early_stopped = True
            return True
        return False
 def testUsageGraph(self):
     """Expected usage when graph building.

     Trains for `num_training_steps` in each of three continuations, each in a
     fresh graph, and checks that the global step carries over through the
     checkpoint written by the previous continuation.
     """
     with context.graph_mode():
         num_training_steps = 10
         checkpoint_directory = self.get_temp_dir()
         checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
         # Object graph returned by the previous save; None before any save.
         latest_object_graph = None
         for training_continuation in range(3):
             with ops.Graph().as_default():
                 network = MyNetwork()
                 optimizer = CheckpointableAdam(0.001)
                 root = Root(optimizer=optimizer, network=network)
                 input_value = constant_op.constant([[3.]])
                 train_op = optimizer.minimize(network(input_value),
                                               global_step=root.global_step)
                 init_op = variables.global_variables_initializer()
                 checkpoint_path = core_saver.latest_checkpoint(
                     checkpoint_directory)
                 with self.test_session(
                         graph=ops.get_default_graph()) as session:
                     if checkpoint_path is None:
                         # Only the very first continuation may start without
                         # a checkpoint.
                         self.assertEqual(0, training_continuation)
                         session.run(init_op)
                         # Another alternative would be to run initializers automatically
                         # if no checkpoint is being loaded. This would make deferred
                         # loading a bit more useful with graph execution.
                     else:
                         checkpointable.restore(
                             save_path=checkpoint_path,
                             root_checkpointable=root,
                             object_graph_proto=latest_object_graph,
                             session=session)
                     for _ in range(num_training_steps):
                         session.run(train_op)
                     latest_object_graph, _ = checkpointable.save(
                         file_prefix=checkpoint_prefix,
                         root_checkpointable=root,
                         session=session)
                     # The step count accumulates across continuations.
                     self.assertEqual(
                         (training_continuation + 1) * num_training_steps,
                         session.run(root.global_step))
Exemple #40
0
    def _evaluate_model(self,
                        input_fn,
                        steps,
                        feed_fn=None,
                        metrics=None,
                        name=''):
        """Evaluates the trained model through `graph_actions.evaluate`.

        Args:
          input_fn: Callable returning a `(features, targets)` pair.
          steps: Passed to `graph_actions.evaluate` as `max_steps`.
          feed_fn: Optional feed function forwarded to the evaluation.
          metrics: Metrics handed to `_get_eval_ops`.
          name: Optional suffix for the output directory; results go to `eval`
            when empty and to `eval_<name>` otherwise.

        Returns:
          A pair `(eval_results, current_global_step)`, or `(None, None)` when
          the config's `execution_mode` excludes evaluation.

        Raises:
          NotFittedError: If the model directory contains no checkpoint.
        """
        # TODO(wicke): Remove this once Model and associated code are gone.
        if (hasattr(self._config, 'execution_mode')
                and self._config.execution_mode
                not in ('all', 'evaluate', 'eval_evalset')):
            return None, None

        # Check that model has been trained.
        checkpoint_path = self._model_dir
        latest_path = saver.latest_checkpoint(checkpoint_path)
        if not latest_path:
            raise NotFittedError("Couldn't find trained model at %s." %
                                 checkpoint_path)
        # Setup output directory.
        eval_dir = os.path.join(self._model_dir,
                                'eval' if not name else 'eval_' + name)

        with ops.Graph().as_default() as g:
            random_seed.set_random_seed(self._config.tf_random_seed)
            global_step = contrib_framework.create_global_step(g)
            features, targets = input_fn()
            self._check_inputs(features, targets)
            eval_dict = self._get_eval_ops(features, targets, metrics)
            update_op, eval_dict = self._extract_metric_update_ops(eval_dict)
            # NOTE(review): `checkpoint_path` here is the model directory, not
            # `latest_path` - confirm that `graph_actions.evaluate` expects a
            # directory.
            eval_results, current_global_step = graph_actions.evaluate(
                graph=g,
                output_dir=eval_dir,
                checkpoint_path=checkpoint_path,
                eval_dict=eval_dict,
                update_op=update_op,
                global_step_tensor=global_step,
                supervisor_master=self._config.master,
                feed_fn=feed_fn,
                max_steps=steps)

            return eval_results, current_global_step
Exemple #41
0
    def _infer_model(self,
                     input_fn,
                     feed_fn=None,
                     outputs=None,
                     as_iterable=False):
        """Builds the prediction graph and runs it on the latest checkpoint.

        Args:
          input_fn: Callable producing the feature tensors.
          feed_fn: Optional feed function forwarded to the inference helper.
          outputs: Optional collection of prediction keys to keep; all others
            are dropped before running.
          as_iterable: When true, `_infer_model_as_iterable` produces the
            result; otherwise `_infer_model_single` does.

        Returns:
          Whatever the selected inference helper returns.

        Raises:
          NotFittedError: If the model directory contains no checkpoint.
          ValueError: If `outputs` is given but matches no prediction key.
        """
        # Check that model has been trained.
        checkpoint_path = saver.latest_checkpoint(self._model_dir)
        if not checkpoint_path:
            raise NotFittedError("Couldn't find trained model at %s." %
                                 self._model_dir)

        with ops.Graph().as_default() as g:
            random_seed.set_random_seed(self._config.tf_random_seed)
            contrib_framework.create_global_step(g)
            features = self._get_features_from_input_fn(input_fn)
            predictions = self._get_predict_ops(features)

            # A single output is wrapped in a dict; `return_dict` records
            # whether the caller should get a dict back.
            return_dict = isinstance(predictions, dict)
            if not return_dict:
                predictions = {'predictions': predictions}

            # Restrict the run to the requested outputs, if any were named.
            if outputs:
                existing_keys = predictions.keys()
                selected = {}
                for key, value in predictions.items():
                    if key in outputs:
                        selected[key] = value
                predictions = selected
                if not predictions:
                    raise ValueError(
                        'Expected to run at least one output from %s, '
                        'provided %s.' % (existing_keys, outputs))

            run_inference = (self._infer_model_as_iterable
                             if as_iterable else self._infer_model_single)
            return run_inference(checkpoint_path, predictions, feed_fn,
                                 return_dict)
 def testDeferredRestorationUsageEager(self):
   """An idiomatic eager execution example.

   Runs three training continuations. Each one restores from the latest
   checkpoint in the temp directory, trains for `num_training_steps`, saves,
   and checks that the optimizer step has accumulated across continuations.
   """
   num_training_steps = 10
   checkpoint_directory = self.get_temp_dir()
   checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
   for training_continuation in range(3):
     network = MyNetwork()
     optimizer = adam.AdamOptimizer(0.001)
     root = checkpointable_utils.Checkpoint(
         optimizer=optimizer, network=network,
         optimizer_step=training_util.get_or_create_global_step())
     # On the first continuation no checkpoint exists yet, so
     # `latest_checkpoint` yields None here.
     root.restore(core_saver.latest_checkpoint(checkpoint_directory))
     for _ in range(num_training_steps):
       # TODO(allenl): Use a Dataset and serialize/checkpoint it.
       input_value = constant_op.constant([[3.]])
       optimizer.minimize(
           lambda: network(input_value),  # pylint: disable=cell-var-from-loop
           global_step=root.optimizer_step)
     root.save(file_prefix=checkpoint_prefix)
     self.assertEqual((training_continuation + 1) * num_training_steps,
                      root.optimizer_step.numpy())
Exemple #43
0
 def create_session(self):
     """Builds a MonitoredSession restored from this predictor's checkpoint.

     `self._checkpoint_dir` may name a directory, in which case its latest
     checkpoint is used, or a checkpoint path directly.

     Returns:
       A `MonitoredSession` initialised from the resolved checkpoint.

     Raises:
       AttributeError: If TensorFlow reports that the checkpoint lacks
         variables required by the graph.
     """
     try:
         target = self._checkpoint_dir
         if os.path.isdir(target):
             checkpoint_path = saver.latest_checkpoint(target)
         else:
             checkpoint_path = target
             logging.info("%s is not a directory. Interpreting as direct "
                          "path to checkpoint..." % checkpoint_path)
         creator = training.ChiefSessionCreator(
             checkpoint_filename_with_path=checkpoint_path,
             config=self._session_config())
         return training.MonitoredSession(session_creator=creator)
     except tf.errors.NotFoundError:
         logging.fatal(
             "Could not find all variables of the computation "
             "graph in the T2T checkpoint file. This means that the "
             "checkpoint does not correspond to the model specified in "
             "SGNMT. Please double-check pred_src_vocab_size, "
             "pred_trg_vocab_size, and all the t2t_* parameters.")
         raise AttributeError("Could not initialize TF session.")
    def after_save(self, session, global_step_value):
        """Runs evaluation and export for a freshly written checkpoint.

        Nothing happens when the model directory has no checkpoint yet or
        when the latest checkpoint is the one handled last time.

        Args:
          session: Unused.
          global_step_value: Unused.
        """
        # Look the path up once and cache it to avoid duplicate searches on
        # GCS.
        logging.info("Checking for checkpoint in %s", self._model_dir)
        latest_path = saver.latest_checkpoint(self._model_dir)

        if not latest_path:
            logging.warning(
                "Skipping evaluation and export since model has not been "
                "saved yet.")
            return
        if latest_path == self._latest_path:
            logging.warning(
                "Skipping evaluation due to same latest checkpoint %s.",
                latest_path)
            return

        self._latest_path = latest_path
        self._eval_result = self._eval_fn(name="intermediate_export",
                                          checkpoint_path=latest_path)
        self._export_results = self._export_fn(self._eval_result,
                                               checkpoint_path=latest_path)
Exemple #45
0
 def testWithDefun(self):
   """Checks checkpoint save/restore when the model is called via `defun`.

   Runs three continuations in fresh graphs. The first assigns 42 to the
   model's first variable and saves; the later ones restore and assert both
   that the restore was fully consumed and that the value survived.
   """
   num_training_steps = 2
   checkpoint_directory = self.get_temp_dir()
   checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
   for training_continuation in range(3):
     with ops.Graph().as_default(), self.test_session(
         graph=ops.get_default_graph()), test_util.device(use_gpu=True):
       model = MyModel()
       # Don't actually train so we can test variable values
       optimizer = adam.AdamOptimizer(0.)
       root = checkpointable_utils.Checkpoint(
           optimizer=optimizer, model=model,
           global_step=training_util.get_or_create_global_step())
       checkpoint_path = core_saver.latest_checkpoint(checkpoint_directory)
       status = root.restore(save_path=checkpoint_path)
       def train_fn():
         @function.defun
         def _call_model(x):
           return model(x)
         with backprop.GradientTape() as tape:
           loss = _call_model(constant_op.constant([[3.]]))
         gradients = tape.gradient(loss, model.variables)
         return optimizer.apply_gradients(zip(gradients, model.variables),
                                          global_step=root.global_step)
       if not context.executing_eagerly():
         # In graph mode, build the train op once and re-run it on each call.
         train_fn = functools.partial(
             self.evaluate, train_fn())
       status.initialize_or_restore()
       for _ in range(num_training_steps):
         train_fn()
       if training_continuation > 0:
         status.assert_consumed()
         self.assertAllClose([[42.]], self.evaluate(model.variables[0]))
       else:
         # First continuation: plant the value later continuations look for.
         self.evaluate(model.variables[0].assign([[42.]]))
       root.save(file_prefix=checkpoint_prefix)
       self.assertEqual((training_continuation + 1) * num_training_steps,
                        self.evaluate(root.global_step))
       self.assertEqual(training_continuation + 1,
                        self.evaluate(root.save_counter))
Exemple #46
0
def _save_first_checkpoint(keras_model, custom_objects, config):
  """Writes the initial checkpoint for a keras-backed Estimator.

  If the `keras` subdirectory of `config.model_dir` already contains a
  checkpoint, that checkpoint is reused and nothing is written.

  Args:
    keras_model: an instance of compiled keras model.
    custom_objects: Dictionary for custom objects.
    config: Estimator config.

  Returns:
    The path where keras model checkpoint is saved.
  """
  # save checkpoint into subdirectory to allow warm start
  keras_model_dir = os.path.join(config.model_dir, 'keras')
  existing_ckpt = saver_lib.latest_checkpoint(keras_model_dir)
  if existing_ckpt:
    return existing_ckpt

  # No checkpoint yet: capture the current weights (if any were initialized)
  # so they can be copied into the cloned model below.
  if _any_weight_initialized(keras_model):
    initial_weights = keras_model.get_weights()
  else:
    initial_weights = None
  if not gfile.IsDirectory(keras_model_dir):
    gfile.MakeDirs(keras_model_dir)

  with ops.Graph().as_default():
    random_seed.set_random_seed(config.tf_random_seed)
    training_util.create_global_step()
    model = _clone_and_build_model(model_fn_lib.ModeKeys.TRAIN, keras_model,
                                   custom_objects)
    # save to checkpoint
    with session.Session(config=config.session_config) as sess:
      if initial_weights:
        model.set_weights(initial_weights)
      # Make update ops and initialize all variables.
      if not model.train_function:
        # pylint: disable=protected-access
        model._make_train_function()
        K._initialize_variables(sess)
        # pylint: enable=protected-access
      ckpt_saver = saver_lib.Saver()
      new_ckpt = os.path.join(keras_model_dir, 'keras_model.ckpt')
      ckpt_saver.save(sess, new_ckpt)
  return new_ckpt
    def testEvaluateWithFiniteInputs(self):
        """Evaluates a trained classifier over exactly one epoch of inputs.

        Trains for 300 steps, then feeds the data set once through an input
        producer in batches of six and checks the final accuracy and the
        number of evaluation steps that ran.
        """
        checkpoint_dir = os.path.join(self.get_temp_dir(),
                                      'evaluate_with_finite_inputs')

        # Train a Model to completion:
        self._train_model(checkpoint_dir, num_steps=300)

        # Run evaluation. Inputs are fed through input producer for one epoch.
        input_tensor = constant_op.constant(self._inputs,
                                            dtype=dtypes.float32)
        label_tensor = constant_op.constant(self._labels,
                                            dtype=dtypes.float32)
        example, example_label = training.slice_input_producer(
            [input_tensor, label_tensor], num_epochs=1)
        batch_inputs, batch_labels = training.batch(
            [example, example_label],
            batch_size=6,
            allow_smaller_final_batch=True)

        predictions = math_ops.round(logistic_classifier(batch_inputs))
        accuracy, update_op = metrics.accuracy(
            predictions=predictions, labels=batch_labels)

        checkpoint_path = saver.latest_checkpoint(checkpoint_dir)
        requested_final_ops = {
            'accuracy': accuracy,
            'eval_steps': evaluation._get_or_create_eval_step()
        }
        stop_hook = evaluation._StopAfterNEvalsHook(None)

        results = evaluation._evaluate_once(
            checkpoint_path=checkpoint_path,
            eval_ops=update_op,
            final_ops=requested_final_ops,
            hooks=[stop_hook])
        self.assertTrue(results['accuracy'] > .99)
        # Runs evaluation for 4 iterations. First 2 evaluate full batch of 6 inputs
        # each; the 3rd iter evaluates the remaining 4 inputs, and the last one
        # triggers an error which stops evaluation.
        self.assertEqual(results['eval_steps'], 4)
Exemple #48
0
    def before_run(self, run_context):
        """Dumps the graph and reloads the latest checkpoint, if one exists.

        Called before each call to run(). The graph dump (chief only) and the
        checkpoint reload happen on the first call only; later calls just
        register with the timer and request the global step.

        Args:
            run_context: A `SessionRunContext` object.

        Returns: A `SessionRunArgs` object containing global_step.
        """
        if self._first_call:
            # We do write graph and saver_def at the first call of before_run.
            # We cannot do this in begin, since we let other hooks to change graph and
            # add variables in begin. Graph is finalized after all begin calls.
            if self._is_chief:
                training_util.write_graph(
                    ops.get_default_graph().as_graph_def(add_shapes=True),
                    self._checkpoint_dir, "graph.pbtxt")
                # dump model details "model_analysis.txt"
                dump_model_analysis(self._checkpoint_dir)  # dump model configs
                graph = ops.get_default_graph()
                meta_graph_def = meta_graph.create_meta_graph_def(
                    graph_def=graph.as_graph_def(add_shapes=True),
                    saver_def=self._saver.saver_def)
                if self._summary_writer is not None:
                    self._summary_writer.add_graph(graph)
                    self._summary_writer.add_meta_graph(meta_graph_def)
                tf.logging.info("CheckpointSaverHook (before_run): dump graph...")

            # The checkpoint is only ever restored on the first call, so the
            # directory lookup is done here instead of before every run() call.
            checkpoint_path = saver_lib.latest_checkpoint(self._checkpoint_dir)
            if checkpoint_path:
                # reloading model
                self._saver.restore(run_context.session, checkpoint_path)
                gs = run_context.session.run(self._global_step)
                tf.logging.info(
                    "CheckpointSaverHook (before_run): reloading models and reset global_step={}"
                    .format(gs))
                StepTimer.reset_init_triggered_step(gs)
            self._first_call = False
        self._timer.register_before_run()
        return tf.train.SessionRunArgs(self._global_step)
Exemple #49
0
 def end(self, session=None):
   """Runs a final export when monitoring ends.

   Nothing is exported if the estimator's model directory has no checkpoint
   yet. A `RuntimeError` raised by the export is logged and swallowed.

   Args:
     session: Forwarded unchanged to the parent class's `end`.

   Raises:
     ValueError: If the estimator is a `core_estimator.Estimator`.
   """
   super(ExportMonitor, self).end(session=session)
   newest_checkpoint = saver_lib.latest_checkpoint(self._estimator.model_dir)
   if newest_checkpoint is None:
     logging.info(
         "Skipping export at the end since model has not been saved yet.")
     return
   if isinstance(self._estimator, core_estimator.Estimator):
     raise ValueError(
         "ExportMonitor does not support `tf.estimator.Estimator. `. "
         "Please pass an ExportStrategy to Experiment instead.")
   try:
     export_options = {
         "exports_to_keep": self.exports_to_keep,
         "signature_fn": self.signature_fn,
         "input_fn": self._input_fn,
         "default_batch_size": self._default_batch_size,
         "input_feature_key": self._input_feature_key,
         "use_deprecated_input_fn": self._use_deprecated_input_fn,
     }
     self._last_export_dir = self._estimator.export(
         self.export_dir, **export_options)
   except RuntimeError:
     logging.info("Skipping exporting for the same step.")
Exemple #50
0
def correlation_matrix(nb_batches, checkpoint_dir):
    """Runs the validation model and saves its logits and labels as numpy files.

    Parameters:
        nb_batches: Number of `session.run` calls whose outputs are collected.
        checkpoint_dir: Directory searched for the latest checkpoint of the
            model saved during training.

    Returns:
        Tuple `(posts_logits, posts_labels)`: the collected logits stacked with
        `np.vstack` and the collected labels joined with `np.hstack`. Both
        arrays are also written to `data/posts_logits.npy` and
        `data/posts_labels.npy`.
    """
    with tf.Graph().as_default():
        config = _CONFIG.copy()
        config['mode'] = 'validation'
        model = DeepSentiment(config)

        # Load model
        checkpoint_path = tf_saver.latest_checkpoint(checkpoint_dir)
        empty_scaffold = monitored_session.Scaffold(
            init_op=None, init_feed_dict=None, init_fn=None, saver=None)
        session_creator = monitored_session.ChiefSessionCreator(
            scaffold=empty_scaffold,
            checkpoint_filename_with_path=checkpoint_path,
            master='',
            config=None)

        fetches = [model.logits, model.labels]
        with monitored_session.MonitoredSession(  # Generate queue
                session_creator=session_creator, hooks=None) as session:
            batches = [session.run(fetches) for _ in range(nb_batches)]

    posts_logits = np.vstack([batch[0] for batch in batches])
    posts_labels = np.hstack([batch[1] for batch in batches])
    np.save('data/posts_logits.npy', posts_logits)
    np.save('data/posts_labels.npy', posts_labels)
    return posts_logits, posts_labels
  def _testSaveRestoreFromTensorsUtility(self, start, break_range, stop):
    """Saves a `from_tensors` iterator mid-stream and resumes it elsewhere.

    The first graph pulls elements for `range(start, break_range)` and saves
    the iterator state. A second graph, rebuilt from the exported meta graph,
    restores that state, pulls elements for `range(break_range, stop)` and
    then expects the iterator to be exhausted.

    Args:
      start: First index of the pre-save range.
      break_range: Index at which the iterator is saved and later resumed.
      stop: End index of the post-restore range.
    """
    ckpt_prefix = self._iterator_checkpoint_prefix()
    step = 0
    meta_filename = "%s-%d.meta" % (ckpt_prefix, step)

    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))

    def drain_and_check(sess, fetches, first, last):
      # Pull one element per index and compare it with the source tensors.
      for _ in range(first, last):
        produced = sess.run(fetches)
        for expected, actual in zip(components, produced):
          self.assertAllEqual(expected, actual)

    with ops.Graph().as_default() as save_graph:
      iterator = (
          dataset_ops.Dataset.from_tensors(components)
          .make_initializable_iterator())
      init_op = iterator.initializer
      get_next = iterator.get_next()
      saveable = iterator_ops.make_saveable_from_iterator(iterator)
      ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
      for tensor in nest.flatten(get_next):
        ops.add_to_collection("get_next", tensor)
      saver = saver_lib.Saver()
      with self.test_session(graph=save_graph) as sess:
        sess.run(init_op)
        drain_and_check(sess, get_next, start, break_range)
        saver.save(sess, ckpt_prefix, step)

    with ops.Graph().as_default() as restore_graph:
      saver = saver_lib.import_meta_graph(meta_filename)
      with self.test_session(graph=restore_graph) as sess:
        get_next = nest.pack_sequence_as(("a", "b", "c"),
                                         ops.get_collection("get_next"))
        saver.restore(sess, saver_lib.latest_checkpoint(self.get_temp_dir()))
        drain_and_check(sess, get_next, break_range, stop)
        with self.assertRaises(errors.OutOfRangeError):
          sess.run(get_next)
 def testUsageGraph(self):
     """Expected usage when graph building.

     Trains in three continuations that share one checkpoint directory. Each
     continuation restores the newest checkpoint (none exists on the first),
     runs ten training steps, saves, and checks the global step and the save
     counter.
     """
     with context.graph_mode():
         steps_per_run = 10
         ckpt_dir = self.get_temp_dir()
         ckpt_prefix = os.path.join(ckpt_dir, "ckpt")
         for run_index in range(3):
             with ops.Graph().as_default():
                 model = MyModel()
                 optimizer = adam.AdamOptimizer(0.001)
                 root = checkpointable_utils.Checkpoint(
                     optimizer=optimizer,
                     model=model,
                     global_step=training_util.get_or_create_global_step())
                 loss = model(constant_op.constant([[3.]]))
                 train_op = optimizer.minimize(
                     loss, global_step=root.global_step)
                 latest_ckpt = core_saver.latest_checkpoint(ckpt_dir)
                 with self.test_session(
                         graph=ops.get_default_graph()) as session:
                     status = root.restore(save_path=latest_ckpt)
                     status.initialize_or_restore(session=session)
                     if latest_ckpt is not None:
                         status.assert_consumed()
                     else:
                         # Only the very first continuation may start
                         # without a checkpoint.
                         self.assertEqual(0, run_index)
                         with self.assertRaises(AssertionError):
                             status.assert_consumed()
                     for _ in range(steps_per_run):
                         session.run(train_op)
                     root.save(file_prefix=ckpt_prefix, session=session)
                     expected_global_step = (run_index + 1) * steps_per_run
                     self.assertEqual(expected_global_step,
                                      session.run(root.global_step))
                     self.assertEqual(run_index + 1,
                                      session.run(root.save_counter))
Exemple #53
0
def evaluate(eval_file, model_dir, summary_dir, train_steps):
    """Evaluates the latest checkpoint and returns the average perplexity.

    Builds the evaluation graph, restores the newest checkpoint found in
    `model_dir`, and runs the perplexity op until the input raises
    `tf.errors.OutOfRangeError` or the coordinator asks to stop. The average
    is logged and passed to `write_to_summary` under the tag 'eval_ppl'.

    Args:
        eval_file: Passed as `filename` to `HRAN.create_input_layer`.
        model_dir: Directory searched for the latest checkpoint.
        summary_dir: Passed through to `write_to_summary`.
        train_steps: Passed through to `write_to_summary`.

    Returns:
        The sum of the fetched perplexities divided by the number of
        evaluation steps.

    Raises:
        ZeroDivisionError: If no evaluation step completed.
    """
    hp = hparam.create_hparam()

    eval_graph = tf.Graph()
    with eval_graph.as_default():
        input_features = HRAN.create_input_layer(
            mode=modekeys.EVAL, filename=eval_file, hp=hp)

        ppl = HRAN.impl(features=input_features, hp=hp, mode=modekeys.EVAL)

        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=1.0)
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        try:
            saver = tf.train.Saver()
            checkpoint = saver_lib.latest_checkpoint(model_dir)
            saver.restore(sess=sess, save_path=checkpoint)
            sess.run(tf.local_variables_initializer())

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            tf.logging.info('Begin evaluation')

            total_ppl = 0
            eval_step = 0
            try:
                while not coord.should_stop():
                    perplexity = sess.run(fetches=ppl)
                    total_ppl += perplexity
                    eval_step += 1
            except tf.errors.OutOfRangeError:
                # Treated as the regular end of evaluation.
                pass
            finally:
                coord.request_stop()
            coord.join(threads)

            # Computed on every exit path, so the return value is always bound
            # even when the loop ends through coord.should_stop().
            avg_ppl = total_ppl / eval_step
            tf.logging.info('Finish evaluation. The perplexity is {}'.format(avg_ppl))
            write_to_summary(summary_dir, 'eval_ppl', avg_ppl, train_steps)
            return avg_ppl
        finally:
            # Release the session's resources whether evaluation succeeded or not.
            sess.close()
Exemple #54
0
        def evaluate_and_export(self):
            """Evaluate and (maybe) export the current model.

            Evaluation is skipped when the model directory has no checkpoint
            yet, or when the newest checkpoint is the one evaluated last time.

            Returns:
              Evaluation results. Returns `None` if current round of
              evaluation is skipped or the estimator returns an empty result.
            """
            newest_ckpt = saver.latest_checkpoint(self._estimator.model_dir)

            if not newest_ckpt:
                skip_reason = ('Estimator is not trained yet. Will start an '
                               'evaluation when a checkpoint is ready.')
            elif newest_ckpt == self._previous_ckpt_path:
                skip_reason = (
                    'No new checkpoint ready for evaluation. Skip the current '
                    'evaluation pass as evaluation results are expected to be same '
                    'for the same checkpoint.')
            else:
                skip_reason = None

            if skip_reason is not None:
                self._log_err_msg(skip_reason)
                return None

            spec = self._eval_spec
            eval_result = self._estimator.evaluate(
                input_fn=spec.input_fn,
                steps=spec.steps,
                name=spec.name,
                checkpoint_path=newest_ckpt,
                hooks=spec.hooks)
            if not eval_result:
                self._log_err_msg('Estimator evaluate returns empty result.')
                return None

            # TODO(b/65169058): Adds export once export strategies are moved.

            # A successful pass resets the warning timer and records which
            # checkpoint was evaluated.
            self._last_warning_time = 0
            self._previous_ckpt_path = newest_ckpt
            return eval_result
Exemple #55
0
  def _evaluate_model(self, input_fn, steps, feed_fn=None, metrics=None):
    """Evaluates the latest checkpoint in a fresh graph.

    Does nothing (returns `None`) unless the configured execution mode is
    'all', 'evaluate' or 'eval_evalset'.

    Args:
      input_fn: Callable returning a `(features, targets)` pair.
      steps: Passed to `evaluate` as `max_steps`.
      feed_fn: Passed through to `evaluate`.
      metrics: Metric functions; when falsy the defaults from
        `_get_default_metric_functions` are used.

    Returns:
      The first element of the pair returned by `evaluate`, or `None` when
      evaluation is disabled for the current execution mode.
    """
    enabled_modes = ('all', 'evaluate', 'eval_evalset')
    if self._config.execution_mode not in enabled_modes:
      return

    checkpoint_path = saver.latest_checkpoint(self._model_dir)
    eval_dir = os.path.join(self._model_dir, 'eval')
    with ops.Graph().as_default() as graph:
      random_seed.set_random_seed(self._config.tf_random_seed)
      global_step = contrib_framework.create_global_step(graph)
      features, targets = input_fn()
      self._check_inputs(features, targets)
      metric_fns = metrics or self._get_default_metric_functions()
      eval_dict = self._get_eval_ops(features, targets, metric_fns)
      eval_results, _ = evaluate(
          graph=graph,
          output_dir=eval_dir,
          checkpoint_path=checkpoint_path,
          eval_dict=eval_dict,
          global_step_tensor=global_step,
          supervisor_master=self._config.master,
          feed_fn=feed_fn,
          max_steps=steps)
      return eval_results
Exemple #56
0
  def every_n_step_end(self, step, outputs):
    """Evaluates the newest checkpoint and decides whether to stop.

    Evaluation is skipped when no checkpoint exists yet or when the newest
    checkpoint was already evaluated. Otherwise the relative errors are
    printed and, on steps that pass the criteria-check interval test, the
    average error is compared with the best value recorded at the previous
    check.

    Args:
      step: Current step; used in messages and for the interval test.
      outputs: Forwarded unchanged to the parent class.

    Returns:
      `True` when the average error failed to improve by more than the
      minimal improvement threshold (the printed message announces a stop),
      `False` otherwise.

    Raises:
      ValueError: If no estimator has been set.
    """
    super(RegressionMonitor, self).every_n_step_end(step, outputs) # does it do anything now ?
    # TODO(mdan): The use of step below is probably misleading.
    # The code should probably use the step from the checkpoint, because
    # that's what is being evaluated.
    if self._estimator is None:
      raise ValueError("Missing call to set_estimator.")

    # Check that we are not running evaluation on the same checkpoint.
    latest_path = saver_lib.latest_checkpoint(self._estimator.model_dir)
    if latest_path is None:
      logging.debug("Skipping evaluation since model has not been saved yet "
                    "at step %d.", step)
      return False
    if latest_path == self._latest_path:
      logging.debug("Skipping evaluation due to same checkpoint %s for step %d "
                    "as for step %d.", latest_path, step,
                    self._latest_path_step)
      return False
    self._latest_path = latest_path
    self._latest_path_step = step

    # Run evaluation and log it.
    stats = evaluate(self._estimator, self.x, self.y)
    avg_err_pct = stats['relative_avg_err'] * 100
    max_err_pct = stats['relative_max_err'] * 100
    print("Validation (step %d): AVG_ERR: %s %%  MAX_ERR: %s %%" %
          (step, avg_err_pct, max_err_pct))

    check_due = (
        (step / 1000) % (self._criteria_check_interval / 1000) == 0)
    if not check_due:
      return False

    # Stopping after not receiving progress bigger than 0.01% after 10k steps.
    required_level = (
        self._past_best_big_checkpoint - self._minimal_improvement_treshold)
    if avg_err_pct > required_level:
      print("The relative average error is not improving. Stopping after %d steps" % step)
      return True

    print("The relative average error improved from %s %% to %s %% after 10k steps"
          % (self._past_best_big_checkpoint, avg_err_pct))
    self._past_best_big_checkpoint = avg_err_pct
    return False
def _save_first_checkpoint(keras_model, estimator, custom_objects,
                           keras_weights):
    """Save first checkpoint for the keras Estimator.

    Does nothing when `estimator.model_dir` already contains a checkpoint.
    Otherwise the keras model is cloned into a fresh graph, given
    `keras_weights`, and saved as `keras_model.ckpt` in that directory.

    Args:
      keras_model: an instance of compiled keras model.
      estimator: keras estimator.
      custom_objects: Dictionary for custom objects.
      keras_weights: A flat list of Numpy arrays for weights of given
        keras_model.

    Returns:
      None.
    """
    # Load weights and save to checkpoint if there is no checkpoint
    if saver_lib.latest_checkpoint(estimator.model_dir):
        return

    with ops.Graph().as_default():
        random_seed.set_random_seed(estimator.config.tf_random_seed)
        training_util.create_global_step()
        cloned = _clone_and_build_model(model_fn_lib.ModeKeys.TRAIN,
                                        keras_model, custom_objects)
        if isinstance(cloned, models.Sequential):
            cloned = cloned.model
        # save to checkpoint
        with session.Session(config=estimator._session_config) as sess:
            cloned.set_weights(keras_weights)
            # Make update ops and initialize all variables.
            if not cloned.train_function:
                # pylint: disable=protected-access
                cloned._make_train_function()
                K._initialize_variables(sess)
                # pylint: enable=protected-access
            ckpt_saver = saver_lib.Saver()
            target = os.path.join(estimator.model_dir, 'keras_model.ckpt')
            ckpt_saver.save(sess, target)
Exemple #58
0
    def predict(self,
                input_fn,
                predict_keys=None,
                hooks=None,
                checkpoint_path=None):
        """Returns predictions for given features.

        This is a generator: nothing runs (including argument validation)
        until the first item is requested.

        Args:
          input_fn: Input function returning features which is a dictionary of
            string feature name to `Tensor` or `SparseTensor`. If it returns a
            tuple, first item is extracted as features. Prediction continues
            until `input_fn` raises an end-of-input exception
            (`OutOfRangeError` or `StopIteration`).
          predict_keys: list of `str`, name of the keys to predict. It is used
            if the `EstimatorSpec.predictions` is a `dict`. If `predict_keys`
            is used then rest of the predictions will be filtered from the
            dictionary. If `None`, returns all.
          hooks: List of `SessionRunHook` subclass instances. Used for
            callbacks inside the prediction call.
          checkpoint_path: Path of a specific checkpoint to predict. If `None`,
            the latest checkpoint in `model_dir` is used.

        Yields:
          Evaluated values of `predictions` tensors.

        Raises:
          ValueError: Could not find a trained model in model_dir.
          ValueError: if batch length of predictions are not same.
          ValueError: If there is a conflict between `predict_keys` and
            `predictions`. For example if `predict_keys` is not `None` but
            `EstimatorSpec.predictions` is not a `dict`.
        """
        hooks = _check_hooks_type(hooks)
        # Check that model has been trained.
        checkpoint_path = (
            checkpoint_path or saver.latest_checkpoint(self._model_dir))
        if not checkpoint_path:
            raise ValueError(
                'Could not find trained model in model_dir: {}.'.format(
                    self._model_dir))

        with ops.Graph().as_default() as graph:
            random_seed.set_random_seed(self._config.tf_random_seed)
            training.create_global_step(graph)
            features = self._get_features_from_input_fn(input_fn)
            estimator_spec = self._call_model_fn(features, None,
                                                 model_fn_lib.ModeKeys.PREDICT)
            predictions = self._extract_keys(estimator_spec.predictions,
                                             predict_keys)
            # Dict predictions are yielded as one dict per example; anything
            # else is iterated directly.
            yields_dicts = isinstance(predictions, dict)
            session_creator = training.ChiefSessionCreator(
                checkpoint_filename_with_path=checkpoint_path,
                scaffold=estimator_spec.scaffold,
                config=self._session_config)
            with training.MonitoredSession(
                    session_creator=session_creator,
                    hooks=hooks) as mon_sess:
                while not mon_sess.should_stop():
                    batch = mon_sess.run(predictions)
                    if yields_dicts:
                        for row in range(self._extract_batch_length(batch)):
                            yield {
                                key: value[row]
                                for key, value in six.iteritems(batch)
                            }
                    else:
                        for pred in batch:
                            yield pred
Exemple #59
0
    def export_savedmodel(self,
                          export_dir_base,
                          serving_input_receiver_fn,
                          assets_extra=None,
                          as_text=False,
                          checkpoint_path=None):
        """Exports inference graph as a SavedModel into given dir.

        This method builds a new graph by first calling the
        serving_input_receiver_fn to obtain feature `Tensor`s, and then calling
        this `Estimator`'s model_fn to generate the model graph based on those
        features. It restores the given checkpoint (or, lacking that, the most
        recent checkpoint) into this graph in a fresh session.  Finally it
        creates a timestamped export directory below the given
        export_dir_base, and writes a `SavedModel` into it containing a single
        `MetaGraphDef` saved from this session.

        The exported `MetaGraphDef` will provide one `SignatureDef` for each
        element of the export_outputs dict returned from the model_fn, named
        using the same keys.  One of these keys is always
        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY, indicating which
        signature will be served when a serving request does not specify one.
        For each signature, the outputs are provided by the corresponding
        `ExportOutput`s, and the inputs are always the input receivers provided
        by the serving_input_receiver_fn.

        Extra assets may be written into the SavedModel via the assets_extra
        argument.  This should be a dict, where each key gives a destination
        path (including the filename) relative to the assets.extra directory.
        The corresponding value gives the full path of the source file to be
        copied. For example, the simple case of copying a single file without
        renaming it is specified as
        `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`.

        Args:
          export_dir_base: A string containing a directory in which to create
            timestamped subdirectories containing exported SavedModels.
          serving_input_receiver_fn: A function that takes no argument and
            returns a `ServingInputReceiver`.
          assets_extra: A dict specifying how to populate the assets.extra
            directory within the exported SavedModel, or `None` if no extra
            assets are needed.
          as_text: whether to write the SavedModel proto in text format.
          checkpoint_path: The checkpoint path to export.  If `None` (the
            default), the most recent checkpoint found within the model
            directory is chosen.

        Returns:
          The string path to the exported directory.

        Raises:
          ValueError: if no serving_input_receiver_fn is provided, no
            export_outputs are provided, or no checkpoint can be found.
        """
        if serving_input_receiver_fn is None:
            raise ValueError('serving_input_receiver_fn must be defined.')

        with ops.Graph().as_default() as graph:
            training.create_global_step(graph)
            random_seed.set_random_seed(self._config.tf_random_seed)
            receiver = serving_input_receiver_fn()

            # Call the model_fn and collect the export_outputs.
            spec = self._call_model_fn(
                features=receiver.features,
                labels=None,
                mode=model_fn_lib.ModeKeys.PREDICT)

            # Build the SignatureDefs from receivers and all outputs
            signature_def_map = build_all_signature_defs(
                receiver.receiver_tensors, spec.export_outputs)

            # Locate the latest checkpoint unless the caller named one.
            checkpoint_path = (
                checkpoint_path or saver.latest_checkpoint(self._model_dir))
            if not checkpoint_path:
                raise ValueError("Couldn't find trained model at %s." %
                                 self._model_dir)

            export_dir = get_timestamped_export_dir(export_dir_base)

            # TODO(soergel): Consider whether MonitoredSession makes sense here
            with tf_session.Session() as sess:
                restorer = spec.scaffold.saver or saver.Saver(sharded=True)
                restorer.restore(sess, checkpoint_path)

                # TODO(b/36111876): replace legacy_init_op with main_op mechanism
                # pylint: disable=protected-access
                local_init_op = (
                    spec.scaffold.local_init_op
                    or monitored_session.Scaffold._default_local_init_op())
                # pylint: enable=protected-access

                # Perform the export
                model_builder = saved_model_builder.SavedModelBuilder(
                    export_dir)
                asset_files = ops.get_collection(
                    ops.GraphKeys.ASSET_FILEPATHS)
                model_builder.add_meta_graph_and_variables(
                    sess, [tag_constants.SERVING],
                    signature_def_map=signature_def_map,
                    assets_collection=asset_files,
                    legacy_init_op=local_init_op)
                model_builder.save(as_text)

            # Add the extra assets
            if assets_extra:
                extra_root = os.path.join(
                    compat.as_bytes(export_dir),
                    compat.as_bytes('assets.extra'))
                for dest_relative, source in assets_extra.items():
                    target = os.path.join(
                        compat.as_bytes(extra_root),
                        compat.as_bytes(dest_relative))
                    # Create the destination's parent directory before copying.
                    gfile.MakeDirs(os.path.dirname(target))
                    gfile.Copy(source, target)

            return export_dir
  def _continuous_eval(self,
                       input_fn,
                       name,
                       delay_secs,
                       throttle_delay_secs,
                       evaluate_checkpoint_only_once=True,
                       continuous_eval_predicate_fn=None):
    """Run continuous eval.

    Runs infinite eval on the evaluation data set. This function starts
    evaluating after `delay_secs` seconds and then runs no more than one
    evaluation (with `self._eval_steps` steps each time) per
    `throttle_delay_secs`. If `train_steps` is not None, will return after
    global_step reaches `train_steps`.

    Args:
      input_fn: The input to use for this eval.
      name: A string appended to the folder name of evaluation results.
      delay_secs: Start evaluating after this many seconds. If None, defaults to
        self._eval_delay_secs.
      throttle_delay_secs: Do not re-evaluate unless the last evaluation was
        started at least this many seconds ago. If None, defaults to
        self._continuous_eval_throttle_secs.
      evaluate_checkpoint_only_once: Whether to skip evaluation of checkpoints
        that have already been evaluated. Default is `True`.
      continuous_eval_predicate_fn: A predicate function determining whether to
        continue eval after each iteration. `predicate_fn` takes the evaluation
        results as arguments. At the beginning of evaluation, the passed eval
        results will be None so it's expected that the predicate function
        handles that gracefully. When `predicate_fn` is not specified,
        continuous eval will run in an infinite loop (if `train_steps` is None)
        or exit once global step reaches `train_steps`.

    Raises:
      ValueError: if `continuous_eval_predicate_fn` is neither None nor
        callable.
    """
    if (continuous_eval_predicate_fn is not None and
        not callable(continuous_eval_predicate_fn)):
      raise ValueError(
          "`continuous_eval_predicate_fn` must be a callable, or None.")

    if delay_secs is None:
      delay_secs = self._eval_delay_secs
    if throttle_delay_secs is None:
      throttle_delay_secs = self._continuous_eval_throttle_secs

    if delay_secs:
      logging.info("Waiting %f secs before starting eval.", delay_secs)
      time.sleep(delay_secs)

    previous_path = None
    eval_result = None
    last_warning_time = 0
    while (not continuous_eval_predicate_fn or
           continuous_eval_predicate_fn(eval_result)):
      # Exit if we have already reached number of steps to train.
      if self._has_training_stopped(eval_result):
        logging.info("Exiting continuous eval, global_step=%s >= "
                     "train_step=%s",
                     eval_result[ops.GraphKeys.GLOBAL_STEP],
                     self._train_steps)
        return

      start = time.time()

      error_msg = None
      latest_path = saver.latest_checkpoint(self._estimator.model_dir)
      if not latest_path:
        error_msg = ("Estimator is not fitted yet. "
                     "Will start an evaluation when a checkpoint is ready.")
      elif evaluate_checkpoint_only_once and latest_path == previous_path:
        error_msg = "No new checkpoint ready for evaluation."

      if error_msg:
        # Print warning message every 10 mins.
        eval_result = {}
        if time.time() - last_warning_time > 600:
          logging.warning(error_msg)
          last_warning_time = time.time()
      else:
        eval_result = self._call_evaluate(input_fn=input_fn,
                                          steps=self._eval_steps,
                                          metrics=self._eval_metrics,
                                          name=name,
                                          checkpoint_path=latest_path,
                                          hooks=self._eval_hooks)
        # Ensure eval result is not None for next round of evaluation.
        if not eval_result:
          eval_result = {}

        self._maybe_export(eval_result, checkpoint_path=latest_path)

        # Clear warning timer and update last evaluated checkpoint
        last_warning_time = 0
        previous_path = latest_path

      duration = time.time() - start
      if duration < throttle_delay_secs:
        difference = throttle_delay_secs - duration
        logging.info("Waiting %f secs before starting next eval run.",
                     difference)
        time.sleep(difference)