Beispiel #1
0
    def __init__(self,
                 summary_op,
                 save_steps=100,
                 output_dir=None,
                 summary_writer=None,
                 scaffold=None):
        """Set up a `SummarySaver` monitor.

        Args:
          summary_op: `Tensor` of type `string` holding a serialized `Summary`
            protocol buffer, e.g. the output of TF summary methods such as
            `scalar_summary` or `merge_all_summaries`.
          save_steps: `int`, forwarded to the base class as `every_n_steps`
            (save summaries every N steps).
          output_dir: `string`, directory to write summaries to. Only consulted
            when no `summary_writer` is supplied.
          summary_writer: `SummaryWriter` to use. When it is `None` and a
            non-empty `output_dir` is given, a writer for that directory is
            created here.
          scaffold: `Scaffold` stored on the instance; intended as the source
            of the summary op when `summary_op` is not provided.
        """
        # TODO(ipolosukhin): Implement every N seconds.
        super(SummarySaver, self).__init__(every_n_steps=save_steps)
        self._summary_op = summary_op
        # Build our own writer only when the caller did not hand one in and
        # there is a directory to write to; otherwise keep the caller's value.
        needs_own_writer = summary_writer is None and output_dir
        self._summary_writer = (summary_io.SummaryWriter(output_dir)
                                if needs_own_writer else summary_writer)
        self._scaffold = scaffold
Beispiel #2
0
 def __init__(self, summary_op, save_steps=100, output_dir=None,
              summary_writer=None):
   """Set up a `SummarySaver` monitor.

   Args:
     summary_op: The summary op whose output this monitor saves.
     save_steps: Forwarded to the base class as `every_n_steps`.
     output_dir: Directory for summaries; only consulted when no
       `summary_writer` is supplied.
     summary_writer: Writer to use. When `None` and `output_dir` is
       non-empty, a `SummaryWriter` for `output_dir` is created here.
   """
   # TODO(ipolosukhin): Implement every N seconds.
   super(SummarySaver, self).__init__(every_n_steps=save_steps)
   self._summary_op = summary_op
   # Create a writer of our own only if none was passed and a directory exists.
   needs_own_writer = summary_writer is None and output_dir
   self._summary_writer = (summary_io.SummaryWriter(output_dir)
                           if needs_own_writer else summary_writer)
Beispiel #3
0
  def __init__(self, log_dir, summary_op=None, feed_dict=None):
    """Build the Summary Hook.

    Args:
      log_dir: Directory the summary writer is opened on.
      summary_op: The summary op to run. If left as `None`, then all summaries
        in the tf.GraphKeys.SUMMARIES collection are used.
      feed_dict: Optional feed dictionary to use when evaluating the
        summaries.
    """
    # Plain argument capture first; the two calls below keep their original
    # relative order (writer before global step).
    self._summary_op, self._feed_dict = summary_op, feed_dict
    writer = summary_io.SummaryWriter(log_dir)
    self._summary_writer = writer
    step = variables.get_or_create_global_step()
    self._global_step = step
Beispiel #4
0
def get_summary_writer(logdir):
    """Return the single `SummaryWriter` used for `logdir` in the current run.

    Writers are cached in the module-level `_SUMMARY_WRITERS` mapping, keyed by
    `logdir`, and access to that mapping is serialized with
    `_summary_writer_lock`.

    Args:
      logdir: str, folder to write summaries.

    Returns:
      The cached `SummaryWriter` for `logdir`, or a newly created one (bound
      to the current default graph) if none was registered for that directory.
    """
    # Using the lock as a context manager guarantees it is released even when
    # constructing the writer raises; a bare acquire()/release() pair would
    # leave the lock held forever in that case. The lookup for the return
    # value also happens while the lock is held.
    with _summary_writer_lock:
        if logdir not in _SUMMARY_WRITERS:
            _SUMMARY_WRITERS[logdir] = summary_io.SummaryWriter(
                logdir, graph=ops.get_default_graph())
        return _SUMMARY_WRITERS[logdir]
Beispiel #5
0
    def __init__(self,
                 graph=None,
                 ready_op=USE_DEFAULT,
                 is_chief=True,
                 init_op=USE_DEFAULT,
                 init_feed_dict=None,
                 local_init_op=USE_DEFAULT,
                 logdir=None,
                 summary_op=USE_DEFAULT,
                 saver=USE_DEFAULT,
                 global_step=USE_DEFAULT,
                 save_summaries_secs=120,
                 save_model_secs=600,
                 recovery_wait_secs=30,
                 stop_grace_secs=120,
                 checkpoint_basename="model.ckpt",
                 session_manager=None,
                 summary_writer=USE_DEFAULT,
                 init_fn=None):
        """Create a `Supervisor`.

        The graph is finalized at the end of construction, so it cannot be
        modified afterwards.

        Args:
          graph: A `Graph`.  The graph that the model will use.  Defaults to
            the default `Graph`.  The supervisor may add operations to the
            graph before creating a session, but the graph should not be
            modified by the caller after passing it to the supervisor.
          ready_op: `Operation` to check if the model is initialized.  This
            operation is run by supervisors in `prepare_or_wait_for_session()`
            to check if the model is ready to use. The model is considered
            ready if that operation succeeds.  Defaults to the operation
            returned from `tf.assert_variables_initialized()`.  If `None`, the
            model is not checked for readiness.
          is_chief: If True, create a chief supervisor in charge of
            initializing and restoring the model.  If False, create a
            supervisor that relies on a chief supervisor for inits and restore.
          init_op: `Operation`.  Used by chief supervisors to initialize the
            model when it can not be recovered.  Defaults to an `Operation`
            that initializes all variables.  If `None`, no initialization is
            done automatically unless you pass a value for `init_fn`, see
            below.
          init_feed_dict: A dictionary that maps `Tensor` objects to feed
            values.  This feed dictionary will be used when `init_op` is
            evaluated.
          local_init_op: `Operation`. Used by all supervisors to run
            initializations that should run for every new supervisor instance.
            By default these are table initializers and initializers for local
            variables.  If `None`, no further per supervisor-instance
            initialization is done automatically.
          logdir: A string.  Optional path to a directory where to checkpoint
            the model and log events for the visualizer.  Used by chief
            supervisors.  The directory will be created if it does not exist.
          summary_op: An `Operation` that returns a Summary for the event
            logs.  Used by chief supervisors if a `logdir` was specified.
            Defaults to the operation returned from merge_all_summaries().  If
            `None`, summaries are not computed automatically.
          saver: A Saver object.  Used by chief supervisors if a `logdir` was
            specified.  Defaults to the saver returned by Saver().
            If `None`, the model is not saved automatically.
          global_step: An integer Tensor of size 1 that counts steps.  The
            value from 'global_step' is used in summaries and checkpoint
            filenames.  Defaults to the op named 'global_step' in the graph if
            it exists, is of rank 1, size 1, and of type tf.int32 or tf.int64.
            If `None` the global step is not recorded in summaries and
            checkpoint files.  Used by chief supervisors if a `logdir` was
            specified.
          save_summaries_secs: Number of seconds between the computation of
            summaries for the event log.  Defaults to 120 seconds.  Pass 0 to
            disable summaries.
          save_model_secs: Number of seconds between the creation of model
            checkpoints.  Defaults to 600 seconds.  Pass 0 to disable
            checkpoints.
          recovery_wait_secs: Number of seconds between checks that the model
            is ready.  Used by supervisors when waiting for a chief supervisor
            to initialize or restore the model.  Defaults to 30 seconds.
          stop_grace_secs: Grace period, in seconds, given to running threads
            to stop when `stop()` is called.  Defaults to 120 seconds.
          checkpoint_basename: The basename for checkpoint saving; joined onto
            `logdir` to form the save path.
          session_manager: `SessionManager`, which manages Session creation
            and recovery. If it is `None`, a default `SessionManager` will be
            created with the set of arguments passed in for backwards
            compatibility.
          summary_writer: `SummaryWriter` to use or `USE_DEFAULT`.  Can be
            `None` to indicate that no summaries should be written.  With
            `USE_DEFAULT`, a writer on `logdir` is created only for chief
            supervisors that were given a `logdir`.
          init_fn: Optional callable used to initialize the model. Called
            after the optional `init_op` is called.  The callable must accept
            one argument, the session being initialized.
        """
        # Set default values of arguments.
        if graph is None:
            graph = ops.get_default_graph()
        # Run the _init_* helpers with `graph` installed as the default graph
        # (per the docstring, the supervisor may add ops to it).
        with graph.as_default():
            self._init_ready_op(ready_op=ready_op)
            self._init_init_op(init_op=init_op, init_feed_dict=init_feed_dict)
            self._init_local_init_op(local_init_op=local_init_op)
            self._init_saver(saver=saver)
            self._init_summary_op(summary_op=summary_op)
            self._init_global_step(global_step=global_step)
        self._graph = graph
        self._is_chief = is_chief
        self._coord = coordinator.Coordinator()
        self._started_threads = []
        self._recovery_wait_secs = recovery_wait_secs
        self._stop_grace_secs = stop_grace_secs
        self._init_fn = init_fn

        # Set all attributes related to checkpointing and writing events to None.
        # Afterwards, set them appropriately for chief supervisors, as these are
        # the only supervisors that can write checkpoints and events.
        self._logdir = None
        self._save_summaries_secs = None
        self._save_model_secs = None
        self._save_path = None
        self._summary_writer = None

        if self._is_chief:
            self._logdir = logdir
            self._save_summaries_secs = save_summaries_secs
            self._save_model_secs = save_model_secs
            # Without a logdir there is nowhere to checkpoint, so the save
            # path stays None.
            if self._logdir:
                self._save_path = os.path.join(self._logdir,
                                               checkpoint_basename)
            # USE_DEFAULT: create a writer only when a logdir is available.
            # Any other value (including None) is taken from the caller as is.
            if summary_writer is Supervisor.USE_DEFAULT:
                if self._logdir:
                    self._summary_writer = summary_io.SummaryWriter(
                        self._logdir)
            else:
                self._summary_writer = summary_writer

        # These run after the attributes above have been assigned.
        self._init_session_manager(session_manager=session_manager)
        self._verify_setup()
        # The graph is not allowed to change anymore.
        graph.finalize()
Beispiel #6
0
 def set_estimator(self, estimator):
     """Register `estimator` and make sure a summary writer exists.

     After delegating to the base class, a `SummaryWriter` on
     `estimator.model_dir` is created if no writer has been set yet.

     Args:
       estimator: The estimator being attached; its `model_dir` is used as
         the output directory for the fallback writer.
     """
     super(SummarySaver, self).set_estimator(estimator)
     # TODO(mdan): This line looks redundant.
     if self._summary_writer is not None:
         return
     fallback_dir = estimator.model_dir
     self._summary_writer = summary_io.SummaryWriter(fallback_dir)
Beispiel #7
0
 def set_estimator(self, estimator):
     """Register `estimator` and make sure a summary writer exists.

     After delegating to the base class, a `SummaryWriter` on
     `estimator.model_dir` is created if no writer has been set yet.

     Args:
       estimator: The estimator being attached; its `model_dir` is used as
         the output directory for the fallback writer.
     """
     super(SummarySaver, self).set_estimator(estimator)
     # A writer that is already set always wins.
     if self._summary_writer is not None:
         return
     fallback_dir = estimator.model_dir
     self._summary_writer = summary_io.SummaryWriter(fallback_dir)
Beispiel #8
0
def evaluation_loop(master,
                    checkpoint_dir,
                    logdir,
                    num_evals=1,
                    eval_op=None,
                    eval_op_feed_dict=None,
                    final_op=None,
                    final_op_feed_dict=None,
                    summary_op=_USE_DEFAULT,
                    summary_op_feed_dict=None,
                    variables_to_restore=None,
                    eval_interval_secs=60,
                    max_number_of_evaluations=None):
  """Runs TF-Slim's Evaluation Loop.

  Repeatedly waits for a new checkpoint in `checkpoint_dir`, restores it in a
  fresh managed session and runs `evaluation` on it. The loop only terminates
  once `max_number_of_evaluations` evaluations have been performed.

  Args:
    master: The BNS address of the TensorFlow master.
    checkpoint_dir: The directory where checkpoints are stored.
    logdir: The directory where the TensorFlow summaries are written to.
    num_evals: The number of times to run `eval_op`.
    eval_op: A operation run `num_evals` times.
    eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`.
    final_op: An operation to execute after all of the `eval_op` executions. The
      value of `final_op` is returned.
    final_op_feed_dict: A feed dictionary to use when executing `final_op`.
    summary_op: The summary_op to evaluate after running TF-Slims metric ops. By
      default the summary_op is set to tf.merge_all_summaries().
    summary_op_feed_dict: An optional feed dictionary to use when running the
      `summary_op`.
    variables_to_restore: A list of TensorFlow variables to restore during
      evaluation. If the argument is left as `None` then
      slim.variables.GetVariablesToRestore() is used.
    eval_interval_secs: The minimum number of seconds between evaluations.
    max_number_of_evaluations: the max number of iterations of the evaluation.
      If the value is left as 'None', the evaluation continues indefinitely.

  Returns:
    Whatever the last call to `evaluation` returned (documented as the value of
    `final_op`), once `max_number_of_evaluations` has been reached.
  """
  if summary_op == _USE_DEFAULT:
    summary_op = logging_ops.merge_all_summaries()

  global_step = variables.get_or_create_global_step()

  init_op = control_flow_ops.group(tf_variables.initialize_all_variables(),
                                   tf_variables.initialize_local_variables(),
                                   data_flow_ops.initialize_all_tables())

  saver = tf_saver.Saver(variables_to_restore or
                         variables.get_variables_to_restore())

  summary_writer = summary_io.SummaryWriter(logdir)

  # The supervisor only manages sessions here: summaries are written through
  # `summary_writer` by `evaluation`, so its own summary machinery is disabled.
  sv = supervisor.Supervisor(graph=ops.get_default_graph(),
                             logdir=logdir,
                             init_op=init_op,
                             summary_op=None,
                             summary_writer=None,
                             global_step=None,
                             saver=saver)

  last_checkpoint = None
  number_of_evaluations = 0
  # Result of the most recent evaluation; previously this was discarded even
  # though the docstring promised that the value of `final_op` is returned.
  final_op_value = None
  while True:
    last_checkpoint = wait_for_new_checkpoint(checkpoint_dir, last_checkpoint)
    start = time.time()
    logging.info('Starting evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S',
                                                           time.gmtime()))

    with sv.managed_session(master, start_standard_services=False) as sess:
      sv.saver.restore(sess, last_checkpoint)
      sv.start_queue_runners(sess)
      final_op_value = evaluation(sess,
                                  num_evals=num_evals,
                                  eval_op=eval_op,
                                  eval_op_feed_dict=eval_op_feed_dict,
                                  final_op=final_op,
                                  final_op_feed_dict=final_op_feed_dict,
                                  summary_op=summary_op,
                                  summary_op_feed_dict=summary_op_feed_dict,
                                  summary_writer=summary_writer,
                                  global_step=global_step)

    logging.info('Finished evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S',
                                                           time.gmtime()))
    number_of_evaluations += 1
    if (max_number_of_evaluations and
        number_of_evaluations >= max_number_of_evaluations):
      logging.info('Reached max_number_of_evaluations=%s. Exit',
                   max_number_of_evaluations)
      return final_op_value

    # Sleep off whatever is left of the interval so that consecutive
    # evaluations start at least `eval_interval_secs` apart.
    time_to_next_eval = start + eval_interval_secs - time.time()
    if time_to_next_eval > 0:
      time.sleep(time_to_next_eval)
def evaluation_loop(master,
                    checkpoint_dir,
                    logdir,
                    num_evals=1,
                    initial_op=None,
                    initial_op_feed_dict=None,
                    eval_op=None,
                    eval_op_feed_dict=None,
                    final_op=None,
                    final_op_feed_dict=None,
                    summary_op=_USE_DEFAULT,
                    summary_op_feed_dict=None,
                    variables_to_restore=None,
                    eval_interval_secs=60,
                    max_number_of_evaluations=None,
                    session_config=None,
                    timeout=None):
    """Runs TF-Slim's Evaluation Loop.

    For every checkpoint path yielded by `checkpoints_iterator`, restores the
    checkpoint in a fresh managed session and runs `evaluation` on it.

    Args:
      master: The BNS address of the TensorFlow master.
      checkpoint_dir: The directory where checkpoints are stored.
      logdir: The directory where the TensorFlow summaries are written to.
      num_evals: The number of times to run `eval_op`.
      initial_op: An operation run at the beginning of evaluation.
      initial_op_feed_dict: A feed dictionary to use when executing
        `initial_op`.
      eval_op: A operation run `num_evals` times.
      eval_op_feed_dict: The feed dictionary to use when executing the
        `eval_op`.
      final_op: An operation to execute after all of the `eval_op` executions.
        The value of `final_op` is returned.
      final_op_feed_dict: A feed dictionary to use when executing `final_op`.
      summary_op: The summary_op to evaluate after running TF-Slims metric ops.
        By default the summary_op is set to tf.summary.merge_all().
      summary_op_feed_dict: An optional feed dictionary to use when running the
        `summary_op`.
      variables_to_restore: A list of TensorFlow variables to restore during
        evaluation. If the argument is left as `None` then
        slim.variables.GetVariablesToRestore() is used.
      eval_interval_secs: The minimum number of seconds between evaluations.
      max_number_of_evaluations: the max number of iterations of the
        evaluation. If the value is left as 'None', the evaluation continues
        indefinitely.
      session_config: An instance of `tf.ConfigProto` that will be used to
        configure the `Session`. If left as `None`, the default will be used.
      timeout: The maximum amount of time to wait between checkpoints. If left
        as `None`, then the process will wait indefinitely.

    Returns:
      The value returned by the last `evaluation` call (the value of
      `final_op`), or `None` if `final_op` is `None` or if the checkpoint
      iterator was exhausted before any checkpoint could be evaluated.
    """
    if summary_op == _USE_DEFAULT:
        summary_op = summary.merge_all()

    global_step = variables.get_or_create_global_step()

    saver = tf_saver.Saver(variables_to_restore
                           or variables.get_variables_to_restore())

    summary_writer = summary_io.SummaryWriter(logdir)

    # The supervisor only manages sessions here: summaries are written through
    # `summary_writer` by `evaluation`, so its own summary machinery is
    # disabled.
    sv = supervisor.Supervisor(graph=ops.get_default_graph(),
                               logdir=logdir,
                               summary_op=None,
                               summary_writer=None,
                               global_step=None,
                               saver=saver)

    number_of_evaluations = 0
    # Must be bound before the loop: if the iterator yields nothing (e.g. the
    # wait for the first checkpoint times out) the final `return` would
    # otherwise raise UnboundLocalError.
    final_op_value = None
    for checkpoint_path in checkpoints_iterator(checkpoint_dir,
                                                eval_interval_secs, timeout):
        logging.info('Starting evaluation at ' +
                     time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))

        with sv.managed_session(master,
                                start_standard_services=False,
                                config=session_config) as sess:
            sv.saver.restore(sess, checkpoint_path)
            sv.start_queue_runners(sess)
            final_op_value = evaluation(
                sess,
                num_evals=num_evals,
                initial_op=initial_op,
                initial_op_feed_dict=initial_op_feed_dict,
                eval_op=eval_op,
                eval_op_feed_dict=eval_op_feed_dict,
                final_op=final_op,
                final_op_feed_dict=final_op_feed_dict,
                summary_op=summary_op,
                summary_op_feed_dict=summary_op_feed_dict,
                summary_writer=summary_writer,
                global_step=global_step)

        logging.info('Finished evaluation at ' +
                     time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))
        number_of_evaluations += 1
        if (max_number_of_evaluations
                and number_of_evaluations >= max_number_of_evaluations):
            logging.info('Reached max_number_of_evaluations=%s. Exit',
                         max_number_of_evaluations)
            return final_op_value

    logging.info(
        'Timed-out waiting for new checkpoint file. Exiting evaluation loop.')
    return final_op_value
Beispiel #10
0
def evaluate_once(master,
                  checkpoint_path,
                  logdir,
                  num_evals=1,
                  initial_op=None,
                  initial_op_feed_dict=None,
                  eval_op=None,
                  eval_op_feed_dict=None,
                  final_op=None,
                  final_op_feed_dict=None,
                  summary_op=_USE_DEFAULT,
                  summary_op_feed_dict=None,
                  variables_to_restore=None,
                  session_config=None):
    """Run a single evaluation pass on the checkpoint at `checkpoint_path`.

    Args:
      master: The BNS address of the TensorFlow master.
      checkpoint_path: Path of the checkpoint to restore before evaluating.
      logdir: Directory the TensorFlow summaries are written to.
      num_evals: How many times `eval_op` is run.
      initial_op: Operation run at the beginning of evaluation.
      initial_op_feed_dict: Feed dictionary used when executing `initial_op`.
      eval_op: Operation run `num_evals` times.
      eval_op_feed_dict: Feed dictionary used when executing `eval_op`.
      final_op: Operation executed after all `eval_op` executions; its value
        is returned.
      final_op_feed_dict: Feed dictionary used when executing `final_op`.
      summary_op: Summary op evaluated after running TF-Slim's metric ops.
        Defaults to `tf.summary.merge_all()`.
      summary_op_feed_dict: Optional feed dictionary used when running
        `summary_op`.
      variables_to_restore: List of TensorFlow variables to restore during
        evaluation. When left as `None`,
        slim.variables.GetVariablesToRestore() is used.
      session_config: `tf.ConfigProto` used to configure the `Session`; the
        default configuration is used when `None`.

    Returns:
      The value of `final_op` or `None` if `final_op` is `None`.
    """
    if summary_op == _USE_DEFAULT:
        summary_op = summary.merge_all()

    step_tensor = variables.get_or_create_global_step()

    restore_vars = (variables_to_restore
                    or variables.get_variables_to_restore())
    restorer = tf_saver.Saver(restore_vars,
                              write_version=saver_pb2.SaverDef.V1)

    writer = summary_io.SummaryWriter(logdir)

    # The supervisor gets no saver or summary machinery of its own: restoring
    # goes through `restorer` and summaries through `writer`.
    sup = supervisor.Supervisor(graph=ops.get_default_graph(),
                                logdir=logdir,
                                summary_op=None,
                                summary_writer=None,
                                global_step=None,
                                saver=None)

    # Everything `evaluation` needs besides the session, gathered up front.
    evaluation_kwargs = {
        'num_evals': num_evals,
        'initial_op': initial_op,
        'initial_op_feed_dict': initial_op_feed_dict,
        'eval_op': eval_op,
        'eval_op_feed_dict': eval_op_feed_dict,
        'final_op': final_op,
        'final_op_feed_dict': final_op_feed_dict,
        'summary_op': summary_op,
        'summary_op_feed_dict': summary_op_feed_dict,
        'summary_writer': writer,
        'global_step': step_tensor,
    }

    started_at = time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime())
    logging.info('Starting evaluation at ' + started_at)
    with sup.managed_session(master,
                             start_standard_services=False,
                             config=session_config) as sess:
        restorer.restore(sess, checkpoint_path)
        sup.start_queue_runners(sess)
        final_op_value = evaluation(sess, **evaluation_kwargs)

    finished_at = time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime())
    logging.info('Finished evaluation at ' + finished_at)

    return final_op_value