def __init__(self, summary_op, save_steps=100, output_dir=None, summary_writer=None, scaffold=None): """Initializes a `SummarySaver` monitor. Args: summary_op: `Tensor` of type `string`. A serialized `Summary` protocol buffer, as output by TF summary methods like `scalar_summary` or `merge_all_summaries`. save_steps: `int`, save summaries every N steps. See `EveryN`. output_dir: `string`, the directory to save the summaries to. Only used if no `summary_writer` is supplied. summary_writer: `SummaryWriter`. If `None` and an `output_dir` was passed, one will be created accordingly. scaffold: `Scaffold` to get summary_op if it's not provided. """ # TODO(ipolosukhin): Implement every N seconds. super(SummarySaver, self).__init__(every_n_steps=save_steps) self._summary_op = summary_op self._summary_writer = summary_writer if summary_writer is None and output_dir: self._summary_writer = summary_io.SummaryWriter(output_dir) self._scaffold = scaffold
def __init__(self, summary_op, save_steps=100, output_dir=None, summary_writer=None): # TODO(ipolosukhin): Implement every N seconds. super(SummarySaver, self).__init__(every_n_steps=save_steps) self._summary_op = summary_op self._summary_writer = summary_writer if summary_writer is None and output_dir: self._summary_writer = summary_io.SummaryWriter(output_dir)
def __init__(self, log_dir, summary_op=None, feed_dict=None): """Constructs the Summary Hook. Args: log_dir: The directory where the logs are saved to. summary_op: The summary op to run. If left as `None`, then all summaries in the tf.GraphKeys.SUMMARIES collection are used. feed_dict: An optional feed dictionary to use when evaluating the summaries. """ self._summary_op = summary_op self._feed_dict = feed_dict self._summary_writer = summary_io.SummaryWriter(log_dir) self._global_step = variables.get_or_create_global_step()
def get_summary_writer(logdir): """Returns single SummaryWriter per logdir in current run. Args: logdir: str, folder to write summaries. Returns: Existing `SummaryWriter` object or new one if never wrote to given directory. """ _summary_writer_lock.acquire() if logdir not in _SUMMARY_WRITERS: _SUMMARY_WRITERS[logdir] = summary_io.SummaryWriter( logdir, graph=ops.get_default_graph()) _summary_writer_lock.release() return _SUMMARY_WRITERS[logdir]
def __init__(self, graph=None, ready_op=USE_DEFAULT, is_chief=True, init_op=USE_DEFAULT, init_feed_dict=None, local_init_op=USE_DEFAULT, logdir=None, summary_op=USE_DEFAULT, saver=USE_DEFAULT, global_step=USE_DEFAULT, save_summaries_secs=120, save_model_secs=600, recovery_wait_secs=30, stop_grace_secs=120, checkpoint_basename="model.ckpt", session_manager=None, summary_writer=USE_DEFAULT, init_fn=None): """Create a `Supervisor`. Args: graph: A `Graph`. The graph that the model will use. Defaults to the default `Graph`. The supervisor may add operations to the graph before creating a session, but the graph should not be modified by the caller after passing it to the supervisor. ready_op: `Operation` to check if the model is initialized. This operation is run by supervisors in `prepare_or_wait_for_session()` to check if the model is ready to use. The model is considered ready if that operation succeeds. Defaults to the operation returned from `tf.assert_variables_initialized()` If `None`, the model is not checked for readiness. is_chief: If True, create a chief supervisor in charge of initializing and restoring the model. If False, create a supervisor that relies on a chief supervisor for inits and restore. init_op: `Operation`. Used by chief supervisors to initialize the model when it can not be recovered. Defaults to an `Operation` that initializes all variables. If `None`, no initialization is done automatically unless you pass a value for `init_fn`, see below. init_feed_dict: A dictionary that maps `Tensor` objects to feed values. This feed dictionary will be used when `init_op` is evaluated. local_init_op: `Operation`. Used by all supervisors to run initializations that should run for every new supervisor instance. By default these are table initializers and initializers for local variables. If `None`, no further per supervisor-instance initialization is done automatically. logdir: A string. Optional path to a directory where to checkpoint the model and log events for the visualizer. Used by chief supervisors. The directory will be created if it does not exist. summary_op: An `Operation` that returns a Summary for the event logs. Used by chief supervisors if a `logdir` was specified. Defaults to the operation returned from merge_all_summaries(). If `None`, summaries are not computed automatically. saver: A Saver object. Used by chief supervisors if a `logdir` was specified. Defaults to the saved returned by Saver(). If `None`, the model is not saved automatically. global_step: An integer Tensor of size 1 that counts steps. The value from 'global_step' is used in summaries and checkpoint filenames. Default to the op named 'global_step' in the graph if it exists, is of rank 1, size 1, and of type tf.int32 ot tf.int64. If `None` the global step is not recorded in summaries and checkpoint files. Used by chief supervisors if a `logdir` was specified. save_summaries_secs: Number of seconds between the computation of summaries for the event log. Defaults to 120 seconds. Pass 0 to disable summaries. save_model_secs: Number of seconds between the creation of model checkpoints. Defaults to 600 seconds. Pass 0 to disable checkpoints. recovery_wait_secs: Number of seconds between checks that the model is ready. Used by supervisors when waiting for a chief supervisor to initialize or restore the model. Defaults to 30 seconds. stop_grace_secs: Grace period, in seconds, given to running threads to stop when `stop()` is called. Defaults to 120 seconds. checkpoint_basename: The basename for checkpoint saving. session_manager: `SessionManager`, which manages Session creation and recovery. If it is `None`, a default `SessionManager` will be created with the set of arguments passed in for backwards compatibility. summary_writer: `SummaryWriter` to use or `USE_DEFAULT`. Can be `None` to indicate that no summaries should be written. init_fn: Optional callable used to initialize the model. Called after the optional `init_op` is called. The callable must accept one argument, the session being initialized. Returns: A `Supervisor`. """ # Set default values of arguments. if graph is None: graph = ops.get_default_graph() with graph.as_default(): self._init_ready_op(ready_op=ready_op) self._init_init_op(init_op=init_op, init_feed_dict=init_feed_dict) self._init_local_init_op(local_init_op=local_init_op) self._init_saver(saver=saver) self._init_summary_op(summary_op=summary_op) self._init_global_step(global_step=global_step) self._graph = graph self._is_chief = is_chief self._coord = coordinator.Coordinator() self._started_threads = [] self._recovery_wait_secs = recovery_wait_secs self._stop_grace_secs = stop_grace_secs self._init_fn = init_fn # Set all attributes related to checkpointing and writing events to None. # Afterwards, set them appropriately for chief supervisors, as these are # the only supervisors that can write checkpoints and events. self._logdir = None self._save_summaries_secs = None self._save_model_secs = None self._save_path = None self._summary_writer = None if self._is_chief: self._logdir = logdir self._save_summaries_secs = save_summaries_secs self._save_model_secs = save_model_secs if self._logdir: self._save_path = os.path.join(self._logdir, checkpoint_basename) if summary_writer is Supervisor.USE_DEFAULT: if self._logdir: self._summary_writer = summary_io.SummaryWriter( self._logdir) else: self._summary_writer = summary_writer self._init_session_manager(session_manager=session_manager) self._verify_setup() # The graph is not allowed to change anymore. graph.finalize()
def set_estimator(self, estimator): super(SummarySaver, self).set_estimator(estimator) # TODO(mdan): This line looks redundant. if self._summary_writer is None: self._summary_writer = summary_io.SummaryWriter( estimator.model_dir)
def set_estimator(self, estimator): super(SummarySaver, self).set_estimator(estimator) if self._summary_writer is None: self._summary_writer = summary_io.SummaryWriter( estimator.model_dir)
def evaluation_loop(master, checkpoint_dir, logdir, num_evals=1, eval_op=None, eval_op_feed_dict=None, final_op=None, final_op_feed_dict=None, summary_op=_USE_DEFAULT, summary_op_feed_dict=None, variables_to_restore=None, eval_interval_secs=60, max_number_of_evaluations=None): """Runs TF-Slim's Evaluation Loop. Args: master: The BNS address of the TensorFlow master. checkpoint_dir: The directory where checkpoints are stored. logdir: The directory where the TensorFlow summaries are written to. num_evals: The number of times to run `eval_op`. eval_op: A operation run `num_evals` times. eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`. final_op: An operation to execute after all of the `eval_op` executions. The value of `final_op` is returned. final_op_feed_dict: A feed dictionary to use when executing `final_op`. summary_op: The summary_op to evaluate after running TF-Slims metric ops. By default the summary_op is set to tf.merge_all_summaries(). summary_op_feed_dict: An optional feed dictionary to use when running the `summary_op`. variables_to_restore: A list of TensorFlow variables to restore during evaluation. If the argument is left as `None` then slim.variables.GetVariablesToRestore() is used. eval_interval_secs: The minimum number of seconds between evaluations. max_number_of_evaluations: the max number of iterations of the evaluation. If the value is left as 'None', the evaluation continues indefinitely. """ if summary_op == _USE_DEFAULT: summary_op = logging_ops.merge_all_summaries() global_step = variables.get_or_create_global_step() init_op = control_flow_ops.group(tf_variables.initialize_all_variables(), tf_variables.initialize_local_variables(), data_flow_ops.initialize_all_tables()) saver = tf_saver.Saver(variables_to_restore or variables.get_variables_to_restore()) summary_writer = summary_io.SummaryWriter(logdir) sv = supervisor.Supervisor(graph=ops.get_default_graph(), logdir=logdir, init_op=init_op, summary_op=None, summary_writer=None, global_step=None, saver=saver) last_checkpoint = None number_of_evaluations = 0 while True: last_checkpoint = wait_for_new_checkpoint(checkpoint_dir, last_checkpoint) start = time.time() logging.info('Starting evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime())) with sv.managed_session(master, start_standard_services=False) as sess: sv.saver.restore(sess, last_checkpoint) sv.start_queue_runners(sess) evaluation(sess, num_evals=num_evals, eval_op=eval_op, eval_op_feed_dict=eval_op_feed_dict, final_op=final_op, final_op_feed_dict=final_op_feed_dict, summary_op=summary_op, summary_op_feed_dict=summary_op_feed_dict, summary_writer=summary_writer, global_step=global_step) logging.info('Finished evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime())) number_of_evaluations += 1 if (max_number_of_evaluations and number_of_evaluations >= max_number_of_evaluations): logging.info('Reached max_number_of_evaluations=%s. Exit', max_number_of_evaluations) break time_to_next_eval = start + eval_interval_secs - time.time() if time_to_next_eval > 0: time.sleep(time_to_next_eval)
def evaluation_loop(master, checkpoint_dir, logdir, num_evals=1, initial_op=None, initial_op_feed_dict=None, eval_op=None, eval_op_feed_dict=None, final_op=None, final_op_feed_dict=None, summary_op=_USE_DEFAULT, summary_op_feed_dict=None, variables_to_restore=None, eval_interval_secs=60, max_number_of_evaluations=None, session_config=None, timeout=None): """Runs TF-Slim's Evaluation Loop. Args: master: The BNS address of the TensorFlow master. checkpoint_dir: The directory where checkpoints are stored. logdir: The directory where the TensorFlow summaries are written to. num_evals: The number of times to run `eval_op`. initial_op: An operation run at the beginning of evaluation. initial_op_feed_dict: A feed dictionary to use when executing `initial_op`. eval_op: A operation run `num_evals` times. eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`. final_op: An operation to execute after all of the `eval_op` executions. The value of `final_op` is returned. final_op_feed_dict: A feed dictionary to use when executing `final_op`. summary_op: The summary_op to evaluate after running TF-Slims metric ops. By default the summary_op is set to tf.summary.merge_all(). summary_op_feed_dict: An optional feed dictionary to use when running the `summary_op`. variables_to_restore: A list of TensorFlow variables to restore during evaluation. If the argument is left as `None` then slim.variables.GetVariablesToRestore() is used. eval_interval_secs: The minimum number of seconds between evaluations. max_number_of_evaluations: the max number of iterations of the evaluation. If the value is left as 'None', the evaluation continues indefinitely. session_config: An instance of `tf.ConfigProto` that will be used to configure the `Session`. If left as `None`, the default will be used. timeout: The maximum amount of time to wait between checkpoints. If left as `None`, then the process will wait indefinitely. Returns: The value of `final_op` or `None` if `final_op` is `None`. """ if summary_op == _USE_DEFAULT: summary_op = summary.merge_all() global_step = variables.get_or_create_global_step() saver = tf_saver.Saver(variables_to_restore or variables.get_variables_to_restore()) summary_writer = summary_io.SummaryWriter(logdir) sv = supervisor.Supervisor(graph=ops.get_default_graph(), logdir=logdir, summary_op=None, summary_writer=None, global_step=None, saver=saver) number_of_evaluations = 0 for checkpoint_path in checkpoints_iterator(checkpoint_dir, eval_interval_secs, timeout): logging.info('Starting evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime())) with sv.managed_session(master, start_standard_services=False, config=session_config) as sess: sv.saver.restore(sess, checkpoint_path) sv.start_queue_runners(sess) final_op_value = evaluation( sess, num_evals=num_evals, initial_op=initial_op, initial_op_feed_dict=initial_op_feed_dict, eval_op=eval_op, eval_op_feed_dict=eval_op_feed_dict, final_op=final_op, final_op_feed_dict=final_op_feed_dict, summary_op=summary_op, summary_op_feed_dict=summary_op_feed_dict, summary_writer=summary_writer, global_step=global_step) logging.info('Finished evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime())) number_of_evaluations += 1 if (max_number_of_evaluations and number_of_evaluations >= max_number_of_evaluations): logging.info('Reached max_number_of_evaluations=%s. Exit', max_number_of_evaluations) return final_op_value logging.info( 'Timed-out waiting for new checkpoint file. Exiting evaluation loop.') return final_op_value
def evaluate_once(master, checkpoint_path, logdir, num_evals=1, initial_op=None, initial_op_feed_dict=None, eval_op=None, eval_op_feed_dict=None, final_op=None, final_op_feed_dict=None, summary_op=_USE_DEFAULT, summary_op_feed_dict=None, variables_to_restore=None, session_config=None): """Evaluates the model at the given checkpoint path. Args: master: The BNS address of the TensorFlow master. checkpoint_path: The path to a checkpoint to use for evaluation. logdir: The directory where the TensorFlow summaries are written to. num_evals: The number of times to run `eval_op`. initial_op: An operation run at the beginning of evaluation. initial_op_feed_dict: A feed dictionary to use when executing `initial_op`. eval_op: A operation run `num_evals` times. eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`. final_op: An operation to execute after all of the `eval_op` executions. The value of `final_op` is returned. final_op_feed_dict: A feed dictionary to use when executing `final_op`. summary_op: The summary_op to evaluate after running TF-Slims metric ops. By default the summary_op is set to tf.summary.merge_all(). summary_op_feed_dict: An optional feed dictionary to use when running the `summary_op`. variables_to_restore: A list of TensorFlow variables to restore during evaluation. If the argument is left as `None` then slim.variables.GetVariablesToRestore() is used. session_config: An instance of `tf.ConfigProto` that will be used to configure the `Session`. If left as `None`, the default will be used. Returns: The value of `final_op` or `None` if `final_op` is `None`. """ if summary_op == _USE_DEFAULT: summary_op = summary.merge_all() global_step = variables.get_or_create_global_step() saver = tf_saver.Saver(variables_to_restore or variables.get_variables_to_restore(), write_version=saver_pb2.SaverDef.V1) summary_writer = summary_io.SummaryWriter(logdir) sv = supervisor.Supervisor(graph=ops.get_default_graph(), logdir=logdir, summary_op=None, summary_writer=None, global_step=None, saver=None) logging.info('Starting evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime())) with sv.managed_session(master, start_standard_services=False, config=session_config) as sess: saver.restore(sess, checkpoint_path) sv.start_queue_runners(sess) final_op_value = evaluation(sess, num_evals=num_evals, initial_op=initial_op, initial_op_feed_dict=initial_op_feed_dict, eval_op=eval_op, eval_op_feed_dict=eval_op_feed_dict, final_op=final_op, final_op_feed_dict=final_op_feed_dict, summary_op=summary_op, summary_op_feed_dict=summary_op_feed_dict, summary_writer=summary_writer, global_step=global_step) logging.info('Finished evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime())) return final_op_value