def _init_summary_op(self, summary_op=USE_DEFAULT): """Initializes summary_op. Args: summary_op: An Operation that returns a Summary for the event logs. If set to USE_DEFAULT, create an op that merges all the summaries. """ if summary_op is Supervisor.USE_DEFAULT: summary_op = self._get_first_op_from_collection(ops.GraphKeys.SUMMARY_OP) if summary_op is None: summary_op = _summary.merge_all() if summary_op is not None: ops.add_to_collection(ops.GraphKeys.SUMMARY_OP, summary_op) self._summary_op = summary_op
def evaluation_loop(master, checkpoint_dir, logdir, num_evals=1, initial_op=None, initial_op_feed_dict=None, eval_op=None, eval_op_feed_dict=None, final_op=None, final_op_feed_dict=None, summary_op=_USE_DEFAULT, summary_op_feed_dict=None, variables_to_restore=None, eval_interval_secs=60, max_number_of_evaluations=None, session_config=None, timeout=None): """Runs TF-Slim's Evaluation Loop. Args: master: The BNS address of the TensorFlow master. checkpoint_dir: The directory where checkpoints are stored. logdir: The directory where the TensorFlow summaries are written to. num_evals: The number of times to run `eval_op`. initial_op: An operation run at the beginning of evaluation. initial_op_feed_dict: A feed dictionary to use when executing `initial_op`. eval_op: A operation run `num_evals` times. eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`. final_op: An operation to execute after all of the `eval_op` executions. The value of `final_op` is returned. final_op_feed_dict: A feed dictionary to use when executing `final_op`. summary_op: The summary_op to evaluate after running TF-Slims metric ops. By default the summary_op is set to tf.summary.merge_all(). summary_op_feed_dict: An optional feed dictionary to use when running the `summary_op`. variables_to_restore: A list of TensorFlow variables to restore during evaluation. If the argument is left as `None` then slim.variables.GetVariablesToRestore() is used. eval_interval_secs: The minimum number of seconds between evaluations. max_number_of_evaluations: the max number of iterations of the evaluation. If the value is left as 'None', the evaluation continues indefinitely. session_config: An instance of `tf.ConfigProto` that will be used to configure the `Session`. If left as `None`, the default will be used. timeout: The maximum amount of time to wait between checkpoints. If left as `None`, then the process will wait indefinitely. Returns: The value of `final_op` or `None` if `final_op` is `None`. """ if summary_op == _USE_DEFAULT: summary_op = summary.merge_all() global_step = variables.get_or_create_global_step() saver = tf_saver.Saver(variables_to_restore or variables.get_variables_to_restore()) summary_writer = summary_io.SummaryWriter(logdir) sv = supervisor.Supervisor(graph=ops.get_default_graph(), logdir=logdir, summary_op=None, summary_writer=None, global_step=None, saver=saver) number_of_evaluations = 0 for checkpoint_path in checkpoints_iterator(checkpoint_dir, eval_interval_secs, timeout): logging.info('Starting evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime())) with sv.managed_session(master, start_standard_services=False, config=session_config) as sess: sv.saver.restore(sess, checkpoint_path) sv.start_queue_runners(sess) final_op_value = evaluation( sess, num_evals=num_evals, initial_op=initial_op, initial_op_feed_dict=initial_op_feed_dict, eval_op=eval_op, eval_op_feed_dict=eval_op_feed_dict, final_op=final_op, final_op_feed_dict=final_op_feed_dict, summary_op=summary_op, summary_op_feed_dict=summary_op_feed_dict, summary_writer=summary_writer, global_step=global_step) logging.info('Finished evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime())) number_of_evaluations += 1 if (max_number_of_evaluations and number_of_evaluations >= max_number_of_evaluations): logging.info('Reached max_number_of_evaluations=%s. Exit', max_number_of_evaluations) return final_op_value logging.info( 'Timed-out waiting for new checkpoint file. Exiting evaluation loop.') return final_op_value
def evaluate_once(master, checkpoint_path, logdir, num_evals=1, initial_op=None, initial_op_feed_dict=None, eval_op=None, eval_op_feed_dict=None, final_op=None, final_op_feed_dict=None, summary_op=_USE_DEFAULT, summary_op_feed_dict=None, variables_to_restore=None, session_config=None): """Evaluates the model at the given checkpoint path. Args: master: The BNS address of the TensorFlow master. checkpoint_path: The path to a checkpoint to use for evaluation. logdir: The directory where the TensorFlow summaries are written to. num_evals: The number of times to run `eval_op`. initial_op: An operation run at the beginning of evaluation. initial_op_feed_dict: A feed dictionary to use when executing `initial_op`. eval_op: A operation run `num_evals` times. eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`. final_op: An operation to execute after all of the `eval_op` executions. The value of `final_op` is returned. final_op_feed_dict: A feed dictionary to use when executing `final_op`. summary_op: The summary_op to evaluate after running TF-Slims metric ops. By default the summary_op is set to tf.summary.merge_all(). summary_op_feed_dict: An optional feed dictionary to use when running the `summary_op`. variables_to_restore: A list of TensorFlow variables to restore during evaluation. If the argument is left as `None` then slim.variables.GetVariablesToRestore() is used. session_config: An instance of `tf.ConfigProto` that will be used to configure the `Session`. If left as `None`, the default will be used. Returns: The value of `final_op` or `None` if `final_op` is `None`. """ if summary_op == _USE_DEFAULT: summary_op = summary.merge_all() global_step = variables.get_or_create_global_step() saver = tf_saver.Saver(variables_to_restore or variables.get_variables_to_restore(), write_version=saver_pb2.SaverDef.V1) summary_writer = summary_io.SummaryWriter(logdir) sv = supervisor.Supervisor(graph=ops.get_default_graph(), logdir=logdir, summary_op=None, summary_writer=None, global_step=None, saver=None) logging.info('Starting evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime())) with sv.managed_session(master, start_standard_services=False, config=session_config) as sess: saver.restore(sess, checkpoint_path) sv.start_queue_runners(sess) final_op_value = evaluation(sess, num_evals=num_evals, initial_op=initial_op, initial_op_feed_dict=initial_op_feed_dict, eval_op=eval_op, eval_op_feed_dict=eval_op_feed_dict, final_op=final_op, final_op_feed_dict=final_op_feed_dict, summary_op=summary_op, summary_op_feed_dict=summary_op_feed_dict, summary_writer=summary_writer, global_step=global_step) logging.info('Finished evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime())) return final_op_value
def train(train_op, logdir, train_step_fn=train_step, train_step_kwargs=_USE_DEFAULT, log_every_n_steps=1, graph=None, master='', is_chief=True, global_step=None, number_of_steps=None, init_op=_USE_DEFAULT, init_feed_dict=None, local_init_op=_USE_DEFAULT, init_fn=None, ready_op=_USE_DEFAULT, summary_op=_USE_DEFAULT, save_summaries_secs=600, summary_writer=_USE_DEFAULT, startup_delay_steps=0, saver=None, save_interval_secs=600, sync_optimizer=None, session_config=None, trace_every_n_steps=None): """Runs a training loop using a TensorFlow supervisor. When the sync_optimizer is supplied, gradient updates are applied synchronously. Otherwise, gradient updates are applied asynchronous. Args: train_op: A `Tensor` that, when executed, will apply the gradients and return the loss value. logdir: The directory where training logs are written to. If None, model checkpoints and summaries will not be written. train_step_fn: The function to call in order to execute a single gradient step. The function must have take exactly four arguments: the current session, the `train_op` `Tensor`, a global step `Tensor` and a dictionary. train_step_kwargs: A dictionary which is passed to the `train_step_fn`. By default, two `Boolean`, scalar ops called "should_stop" and "should_log" are provided. log_every_n_steps: The frequency, in terms of global steps, that the loss and global step and logged. graph: The graph to pass to the supervisor. If no graph is supplied the default graph is used. master: The BNS name of the tensorflow master. is_chief: Specifies whether or not the training is being run by the primary replica during replica training. global_step: The `Tensor` representing the global step. If left as `None`, then slim.variables.get_or_create_global_step() is used. number_of_steps: The max number of gradient steps to take during training. If the value is left as None, training proceeds indefinitely. init_op: The initialization operation. If left to its default value, then the session is initialized by calling `tf.global_variables_initializer()`. init_feed_dict: A feed dictionary to use when executing the `init_op`. local_init_op: The local initialization operation. If left to its default value, then the session is initialized by calling `tf.local_variables_initializer()` and `tf.initialize_all_tables()`. init_fn: An optional callable to be executed after `init_op` is called. The callable must accept one argument, the session being initialized. ready_op: Operation to check if the model is ready to use. If left to its default value, then the session checks for readiness by calling `tf.report_uninitialized_variables()`. summary_op: The summary operation. save_summaries_secs: How often, in seconds, to save summaries. summary_writer: `SummaryWriter` to use. Can be `None` to indicate that no summaries should be written. If unset, we create a SummaryWriter. startup_delay_steps: The number of steps to wait for before beginning. Note that this must be 0 if a sync_optimizer is supplied. saver: Saver to save checkpoints. If None, a default one will be created and used. save_interval_secs: How often, in seconds, to save the model to `logdir`. sync_optimizer: an instance of tf.train.SyncReplicasOptimizer. If the argument is supplied, gradient updates will be synchronous. If left as `None`, gradient updates will be asynchronous. session_config: An instance of `tf.ConfigProto` that will be used to configure the `Session`. If left as `None`, the default will be used. trace_every_n_steps: produce and save a `Timeline` in Chrome trace format and add it to the summaries every `trace_every_n_steps`. If None, no trace information will be produced or saved. Returns: the value of the loss function after training. Raises: ValueError: if `train_op` is empty or if `startup_delay_steps` is non-zero when `sync_optimizer` is supplied, if `number_of_steps` is negative, or if `trace_every_n_steps` is not `None` and no `logdir` is provided. """ if train_op is None: raise ValueError('train_op cannot be None.') if logdir is None: if summary_op != _USE_DEFAULT: raise ValueError('Cannot provide summary_op because logdir=None') if saver is not None: raise ValueError('Cannot provide saver because logdir=None') if trace_every_n_steps is not None: raise ValueError('Cannot provide trace_every_n_steps because ' 'logdir=None') if sync_optimizer is not None and startup_delay_steps > 0: raise ValueError( 'startup_delay_steps must be zero when sync_optimizer is supplied.' ) if number_of_steps is not None and number_of_steps <= 0: raise ValueError( '`number_of_steps` must be either None or a positive number.') graph = graph or ops.get_default_graph() with graph.as_default(): if global_step is None: global_step = variables.get_or_create_global_step() saver = saver or tf_saver.Saver() with ops.name_scope('init_ops'): if init_op == _USE_DEFAULT: init_op = tf_variables.global_variables_initializer() if ready_op == _USE_DEFAULT: ready_op = tf_variables.report_uninitialized_variables() if local_init_op == _USE_DEFAULT: local_init_op = control_flow_ops.group( tf_variables.local_variables_initializer(), data_flow_ops.initialize_all_tables()) if summary_op == _USE_DEFAULT: summary_op = summary.merge_all() if summary_writer == _USE_DEFAULT: summary_writer = supervisor.Supervisor.USE_DEFAULT cleanup_op = None if is_chief and sync_optimizer is not None: if not isinstance( sync_optimizer, (sync_replicas_optimizer.SyncReplicasOptimizer, sync_replicas_optimizer.SyncReplicasOptimizerV2)): raise ValueError( '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer or ' 'tf.train.SyncReplicasOptimizerV2.') # Need to create these BEFORE the supervisor finalizes the graph: with ops.control_dependencies([init_op]): init_tokens_op = sync_optimizer.get_init_tokens_op() init_op = init_tokens_op chief_queue_runner = sync_optimizer.get_chief_queue_runner() if isinstance(sync_optimizer, sync_replicas_optimizer.SyncReplicasOptimizer): cleanup_op = sync_optimizer.get_clean_up_op() if train_step_kwargs == _USE_DEFAULT: with ops.name_scope('train_step'): train_step_kwargs = {} if number_of_steps: should_stop_op = math_ops.greater_equal( global_step, number_of_steps) else: should_stop_op = constant_op.constant(False) train_step_kwargs['should_stop'] = should_stop_op train_step_kwargs['should_log'] = math_ops.equal( math_ops.mod(global_step, log_every_n_steps), 0) if is_chief and trace_every_n_steps is not None: train_step_kwargs['should_trace'] = math_ops.equal( math_ops.mod(global_step, trace_every_n_steps), 0) train_step_kwargs['logdir'] = logdir sv = supervisor.Supervisor(graph=graph, is_chief=is_chief, logdir=logdir, init_op=init_op, init_feed_dict=init_feed_dict, local_init_op=local_init_op, ready_op=ready_op, summary_op=summary_op, summary_writer=summary_writer, global_step=global_step, saver=saver, save_summaries_secs=save_summaries_secs, save_model_secs=save_interval_secs, init_fn=init_fn) if summary_writer is not None: train_step_kwargs['summary_writer'] = sv.summary_writer should_retry = True while should_retry: try: should_retry = False with sv.managed_session(master, start_standard_services=False, config=session_config) as sess: logging.info('Starting Session.') if is_chief: if logdir: sv.start_standard_services(sess) elif startup_delay_steps > 0: _wait_for_step( sess, global_step, min(startup_delay_steps, number_of_steps or sys.maxint)) sv.start_queue_runners(sess) logging.info('Starting Queues.') if is_chief and sync_optimizer is not None: sv.start_queue_runners(sess, [chief_queue_runner]) try: while not sv.should_stop(): total_loss, should_stop = train_step_fn( sess, train_op, global_step, train_step_kwargs) if should_stop: logging.info('Stopping Training.') break if logdir and sv.is_chief: logging.info( 'Finished training! Saving model to disk.') sv.saver.save(sess, sv.save_path, global_step=sv.global_step) except: if sv.is_chief and cleanup_op is not None: logging.info('About to execute sync_clean_up_op!') sess.run(cleanup_op) raise except errors.AbortedError: # Always re-run on AbortedError as it indicates a restart of one of the # distributed tensorflow servers. logging.info('Retrying training!') should_retry = True return total_loss
def evaluation_loop(master, checkpoint_dir, logdir, num_evals=1, initial_op=None, initial_op_feed_dict=None, eval_op=None, eval_op_feed_dict=None, final_op=None, final_op_feed_dict=None, summary_op=_USE_DEFAULT, summary_op_feed_dict=None, variables_to_restore=None, eval_interval_secs=60, max_number_of_evaluations=None, session_config=None, timeout=None): """Runs TF-Slim's Evaluation Loop. Args: master: The BNS address of the TensorFlow master. checkpoint_dir: The directory where checkpoints are stored. logdir: The directory where the TensorFlow summaries are written to. num_evals: The number of times to run `eval_op`. initial_op: An operation run at the beginning of evaluation. initial_op_feed_dict: A feed dictionary to use when executing `initial_op`. eval_op: A operation run `num_evals` times. eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`. final_op: An operation to execute after all of the `eval_op` executions. The value of `final_op` is returned. final_op_feed_dict: A feed dictionary to use when executing `final_op`. summary_op: The summary_op to evaluate after running TF-Slims metric ops. By default the summary_op is set to tf.summary.merge_all(). summary_op_feed_dict: An optional feed dictionary to use when running the `summary_op`. variables_to_restore: A list of TensorFlow variables to restore during evaluation. If the argument is left as `None` then slim.variables.GetVariablesToRestore() is used. eval_interval_secs: The minimum number of seconds between evaluations. max_number_of_evaluations: the max number of iterations of the evaluation. If the value is left as 'None', the evaluation continues indefinitely. session_config: An instance of `tf.ConfigProto` that will be used to configure the `Session`. If left as `None`, the default will be used. timeout: The maximum amount of time to wait between checkpoints. If left as `None`, then the process will wait indefinitely. Returns: The value of `final_op` or `None` if `final_op` is `None`. """ if summary_op == _USE_DEFAULT: summary_op = summary.merge_all() hooks = [ evaluation.StopAfterNEvalsHook(num_evals), ] if summary_op is not None: hooks.append( evaluation.SummaryAtEndHook(logdir, summary_op, summary_op_feed_dict)) saver = None if variables_to_restore is not None: saver = tf_saver.Saver(variables_to_restore) return evaluation.evaluate_repeatedly( checkpoint_dir, master=master, scaffold=monitored_session.Scaffold( init_op=initial_op, init_feed_dict=initial_op_feed_dict, saver=saver), eval_ops=eval_op, feed_dict=eval_op_feed_dict, final_ops=final_op, final_ops_feed_dict=final_op_feed_dict, eval_interval_secs=eval_interval_secs, hooks=hooks, config=session_config, max_number_of_evaluations=max_number_of_evaluations, timeout=timeout)
def evaluate_once(master, checkpoint_path, logdir, num_evals=1, initial_op=None, initial_op_feed_dict=None, eval_op=None, eval_op_feed_dict=None, final_op=None, final_op_feed_dict=None, summary_op=_USE_DEFAULT, summary_op_feed_dict=None, variables_to_restore=None, session_config=None): """Evaluates the model at the given checkpoint path. Args: master: The BNS address of the TensorFlow master. checkpoint_path: The path to a checkpoint to use for evaluation. logdir: The directory where the TensorFlow summaries are written to. num_evals: The number of times to run `eval_op`. initial_op: An operation run at the beginning of evaluation. initial_op_feed_dict: A feed dictionary to use when executing `initial_op`. eval_op: A operation run `num_evals` times. eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`. final_op: An operation to execute after all of the `eval_op` executions. The value of `final_op` is returned. final_op_feed_dict: A feed dictionary to use when executing `final_op`. summary_op: The summary_op to evaluate after running TF-Slims metric ops. By default the summary_op is set to tf.summary.merge_all(). summary_op_feed_dict: An optional feed dictionary to use when running the `summary_op`. variables_to_restore: A list of TensorFlow variables to restore during evaluation. If the argument is left as `None` then slim.variables.GetVariablesToRestore() is used. session_config: An instance of `tf.ConfigProto` that will be used to configure the `Session`. If left as `None`, the default will be used. Returns: The value of `final_op` or `None` if `final_op` is `None`. """ if summary_op == _USE_DEFAULT: summary_op = summary.merge_all() hooks = [ evaluation.StopAfterNEvalsHook(num_evals), ] if summary_op is not None: hooks.append( evaluation.SummaryAtEndHook(logdir, summary_op, summary_op_feed_dict)) saver = None if variables_to_restore is not None: saver = tf_saver.Saver(variables_to_restore) return evaluation.evaluate_once( checkpoint_path, master=master, scaffold=monitored_session.Scaffold( init_op=initial_op, init_feed_dict=initial_op_feed_dict, saver=saver), eval_ops=eval_op, feed_dict=eval_op_feed_dict, final_ops=final_op, final_ops_feed_dict=final_op_feed_dict, hooks=hooks, config=session_config)
def evaluation_loop(master, checkpoint_dir, logdir, num_evals=1, initial_op=None, initial_op_feed_dict=None, eval_op=None, eval_op_feed_dict=None, final_op=None, final_op_feed_dict=None, summary_op=_USE_DEFAULT, summary_op_feed_dict=None, variables_to_restore=None, eval_interval_secs=60, max_number_of_evaluations=None, session_config=None, timeout=None): """Runs TF-Slim's Evaluation Loop. Args: master: The BNS address of the TensorFlow master. checkpoint_dir: The directory where checkpoints are stored. logdir: The directory where the TensorFlow summaries are written to. num_evals: The number of times to run `eval_op`. initial_op: An operation run at the beginning of evaluation. initial_op_feed_dict: A feed dictionary to use when executing `initial_op`. eval_op: A operation run `num_evals` times. eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`. final_op: An operation to execute after all of the `eval_op` executions. The value of `final_op` is returned. final_op_feed_dict: A feed dictionary to use when executing `final_op`. summary_op: The summary_op to evaluate after running TF-Slims metric ops. By default the summary_op is set to tf.summary.merge_all(). summary_op_feed_dict: An optional feed dictionary to use when running the `summary_op`. variables_to_restore: A list of TensorFlow variables to restore during evaluation. If the argument is left as `None` then slim.variables.GetVariablesToRestore() is used. eval_interval_secs: The minimum number of seconds between evaluations. max_number_of_evaluations: the max number of iterations of the evaluation. If the value is left as 'None', the evaluation continues indefinitely. session_config: An instance of `tf.ConfigProto` that will be used to configure the `Session`. If left as `None`, the default will be used. timeout: The maximum amount of time to wait between checkpoints. If left as `None`, then the process will wait indefinitely. Returns: The value of `final_op` or `None` if `final_op` is `None`. """ if summary_op == _USE_DEFAULT: summary_op = summary.merge_all() global_step = variables.get_or_create_global_step() saver = tf_saver.Saver(variables_to_restore or variables.get_variables_to_restore()) summary_writer = summary_io.SummaryWriter(logdir) sv = supervisor.Supervisor(graph=ops.get_default_graph(), logdir=logdir, summary_op=None, summary_writer=None, global_step=None, saver=saver) number_of_evaluations = 0 for checkpoint_path in checkpoints_iterator(checkpoint_dir, eval_interval_secs, timeout): logging.info('Starting evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime())) with sv.managed_session( master, start_standard_services=False, config=session_config) as sess: sv.saver.restore(sess, checkpoint_path) sv.start_queue_runners(sess) final_op_value = evaluation(sess, num_evals=num_evals, initial_op=initial_op, initial_op_feed_dict=initial_op_feed_dict, eval_op=eval_op, eval_op_feed_dict=eval_op_feed_dict, final_op=final_op, final_op_feed_dict=final_op_feed_dict, summary_op=summary_op, summary_op_feed_dict=summary_op_feed_dict, summary_writer=summary_writer, global_step=global_step) logging.info('Finished evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime())) number_of_evaluations += 1 if (max_number_of_evaluations and number_of_evaluations >= max_number_of_evaluations): logging.info('Reached max_number_of_evaluations=%s. Exit', max_number_of_evaluations) return final_op_value logging.info( 'Timed-out waiting for new checkpoint file. Exiting evaluation loop.') return final_op_value
def evaluate_once(master, checkpoint_path, logdir, num_evals=1, initial_op=None, initial_op_feed_dict=None, eval_op=None, eval_op_feed_dict=None, final_op=None, final_op_feed_dict=None, summary_op=_USE_DEFAULT, summary_op_feed_dict=None, variables_to_restore=None, session_config=None): """Evaluates the model at the given checkpoint path. Args: master: The BNS address of the TensorFlow master. checkpoint_path: The path to a checkpoint to use for evaluation. logdir: The directory where the TensorFlow summaries are written to. num_evals: The number of times to run `eval_op`. initial_op: An operation run at the beginning of evaluation. initial_op_feed_dict: A feed dictionary to use when executing `initial_op`. eval_op: A operation run `num_evals` times. eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`. final_op: An operation to execute after all of the `eval_op` executions. The value of `final_op` is returned. final_op_feed_dict: A feed dictionary to use when executing `final_op`. summary_op: The summary_op to evaluate after running TF-Slims metric ops. By default the summary_op is set to tf.summary.merge_all(). summary_op_feed_dict: An optional feed dictionary to use when running the `summary_op`. variables_to_restore: A list of TensorFlow variables to restore during evaluation. If the argument is left as `None` then slim.variables.GetVariablesToRestore() is used. session_config: An instance of `tf.ConfigProto` that will be used to configure the `Session`. If left as `None`, the default will be used. Returns: The value of `final_op` or `None` if `final_op` is `None`. """ if summary_op == _USE_DEFAULT: summary_op = summary.merge_all() global_step = variables.get_or_create_global_step() saver = tf_saver.Saver( variables_to_restore or variables.get_variables_to_restore(), write_version=saver_pb2.SaverDef.V1) summary_writer = summary_io.SummaryWriter(logdir) sv = supervisor.Supervisor(graph=ops.get_default_graph(), logdir=logdir, summary_op=None, summary_writer=None, global_step=None, saver=None) logging.info('Starting evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime())) with sv.managed_session( master, start_standard_services=False, config=session_config) as sess: saver.restore(sess, checkpoint_path) sv.start_queue_runners(sess) final_op_value = evaluation(sess, num_evals=num_evals, initial_op=initial_op, initial_op_feed_dict=initial_op_feed_dict, eval_op=eval_op, eval_op_feed_dict=eval_op_feed_dict, final_op=final_op, final_op_feed_dict=final_op_feed_dict, summary_op=summary_op, summary_op_feed_dict=summary_op_feed_dict, summary_writer=summary_writer, global_step=global_step) logging.info('Finished evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime())) return final_op_value
def train(train_op, logdir, train_step_fn=train_step, train_step_kwargs=_USE_DEFAULT, log_every_n_steps=1, graph=None, master='', is_chief=True, global_step=None, number_of_steps=None, init_op=_USE_DEFAULT, init_feed_dict=None, local_init_op=_USE_DEFAULT, init_fn=None, ready_op=_USE_DEFAULT, summary_op=_USE_DEFAULT, save_summaries_secs=600, summary_writer=_USE_DEFAULT, startup_delay_steps=0, saver=None, save_interval_secs=600, sync_optimizer=None, session_config=None, trace_every_n_steps=None): """Runs a training loop using a TensorFlow supervisor. When the sync_optimizer is supplied, gradient updates are applied synchronously. Otherwise, gradient updates are applied asynchronous. Args: train_op: A `Tensor` that, when executed, will apply the gradients and return the loss value. logdir: The directory where training logs are written to. If None, model checkpoints and summaries will not be written. train_step_fn: The function to call in order to execute a single gradient step. The function must have take exactly four arguments: the current session, the `train_op` `Tensor`, a global step `Tensor` and a dictionary. train_step_kwargs: A dictionary which is passed to the `train_step_fn`. By default, two `Boolean`, scalar ops called "should_stop" and "should_log" are provided. log_every_n_steps: The frequency, in terms of global steps, that the loss and global step and logged. graph: The graph to pass to the supervisor. If no graph is supplied the default graph is used. master: The address of the tensorflow master. is_chief: Specifies whether or not the training is being run by the primary replica during replica training. global_step: The `Tensor` representing the global step. If left as `None`, then slim.variables.get_or_create_global_step() is used. number_of_steps: The max number of gradient steps to take during training. If the value is left as None, training proceeds indefinitely. init_op: The initialization operation. If left to its default value, then the session is initialized by calling `tf.global_variables_initializer()`. init_feed_dict: A feed dictionary to use when executing the `init_op`. local_init_op: The local initialization operation. If left to its default value, then the session is initialized by calling `tf.local_variables_initializer()` and `tf.initialize_all_tables()`. init_fn: An optional callable to be executed after `init_op` is called. The callable must accept one argument, the session being initialized. ready_op: Operation to check if the model is ready to use. If left to its default value, then the session checks for readiness by calling `tf.report_uninitialized_variables()`. summary_op: The summary operation. save_summaries_secs: How often, in seconds, to save summaries. summary_writer: `SummaryWriter` to use. Can be `None` to indicate that no summaries should be written. If unset, we create a SummaryWriter. startup_delay_steps: The number of steps to wait for before beginning. Note that this must be 0 if a sync_optimizer is supplied. saver: Saver to save checkpoints. If None, a default one will be created and used. save_interval_secs: How often, in seconds, to save the model to `logdir`. sync_optimizer: an instance of tf.train.SyncReplicasOptimizer. If the argument is supplied, gradient updates will be synchronous. If left as `None`, gradient updates will be asynchronous. session_config: An instance of `tf.ConfigProto` that will be used to configure the `Session`. If left as `None`, the default will be used. trace_every_n_steps: produce and save a `Timeline` in Chrome trace format and add it to the summaries every `trace_every_n_steps`. If None, no trace information will be produced or saved. Returns: the value of the loss function after training. Raises: ValueError: if `train_op` is empty or if `startup_delay_steps` is non-zero when `sync_optimizer` is supplied, if `number_of_steps` is negative, or if `trace_every_n_steps` is not `None` and no `logdir` is provided. """ if train_op is None: raise ValueError('train_op cannot be None.') if logdir is None: if summary_op != _USE_DEFAULT: raise ValueError('Cannot provide summary_op because logdir=None') if saver is not None: raise ValueError('Cannot provide saver because logdir=None') if trace_every_n_steps is not None: raise ValueError('Cannot provide trace_every_n_steps because ' 'logdir=None') if sync_optimizer is not None and startup_delay_steps > 0: raise ValueError( 'startup_delay_steps must be zero when sync_optimizer is supplied.') if number_of_steps is not None and number_of_steps <= 0: raise ValueError( '`number_of_steps` must be either None or a positive number.') graph = graph or ops.get_default_graph() with graph.as_default(): if global_step is None: global_step = variables.get_or_create_global_step() saver = saver or tf_saver.Saver() with ops.name_scope('init_ops'): if init_op == _USE_DEFAULT: init_op = tf_variables.global_variables_initializer() if ready_op == _USE_DEFAULT: ready_op = tf_variables.report_uninitialized_variables() if local_init_op == _USE_DEFAULT: local_init_op = control_flow_ops.group( tf_variables.local_variables_initializer(), data_flow_ops.initialize_all_tables()) if sync_optimizer is not None and isinstance( sync_optimizer, sync_replicas_optimizer.SyncReplicasOptimizer): with ops.control_dependencies([local_init_op] if local_init_op is not None else []): if is_chief: local_init_op = sync_optimizer.chief_init_op else: local_init_op = sync_optimizer.local_step_init_op ready_for_local_init_op = sync_optimizer.ready_for_local_init_op else: ready_for_local_init_op = None if summary_op == _USE_DEFAULT: summary_op = summary.merge_all() if summary_writer == _USE_DEFAULT: summary_writer = supervisor.Supervisor.USE_DEFAULT cleanup_op = None if is_chief and sync_optimizer is not None: if not isinstance(sync_optimizer, (sync_replicas_optimizer.SyncReplicasOptimizer)): raise ValueError( '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer.') # Need to create these BEFORE the supervisor finalizes the graph: init_tokens_op = sync_optimizer.get_init_tokens_op() chief_queue_runner = sync_optimizer.get_chief_queue_runner() if isinstance(sync_optimizer, sync_replicas_optimizer.SyncReplicasOptimizer): cleanup_op = sync_optimizer.get_clean_up_op() if train_step_kwargs == _USE_DEFAULT: with ops.name_scope('train_step'): train_step_kwargs = {} if number_of_steps: should_stop_op = math_ops.greater_equal(global_step, number_of_steps) else: should_stop_op = constant_op.constant(False) train_step_kwargs['should_stop'] = should_stop_op train_step_kwargs['should_log'] = math_ops.equal( math_ops.mod(global_step, log_every_n_steps), 0) if is_chief and trace_every_n_steps is not None: train_step_kwargs['should_trace'] = math_ops.equal( math_ops.mod(global_step, trace_every_n_steps), 0) train_step_kwargs['logdir'] = logdir sv = supervisor.Supervisor( graph=graph, is_chief=is_chief, logdir=logdir, init_op=init_op, init_feed_dict=init_feed_dict, local_init_op=local_init_op, ready_for_local_init_op=ready_for_local_init_op, ready_op=ready_op, summary_op=summary_op, summary_writer=summary_writer, global_step=global_step, saver=saver, save_summaries_secs=save_summaries_secs, save_model_secs=save_interval_secs, init_fn=init_fn) if summary_writer is not None: train_step_kwargs['summary_writer'] = sv.summary_writer should_retry = True while should_retry: try: should_retry = False with sv.managed_session( master, start_standard_services=False, config=session_config) as sess: logging.info('Starting Session.') if is_chief: if logdir: sv.start_standard_services(sess) elif startup_delay_steps > 0: _wait_for_step(sess, global_step, min(startup_delay_steps, number_of_steps or sys.maxint)) sv.start_queue_runners(sess) logging.info('Starting Queues.') if is_chief and sync_optimizer is not None: sv.start_queue_runners(sess, [chief_queue_runner]) sess.run(init_tokens_op) try: while not sv.should_stop(): total_loss, should_stop = train_step_fn( sess, train_op, global_step, train_step_kwargs) if should_stop: logging.info('Stopping Training.') break if logdir and sv.is_chief: logging.info('Finished training! Saving model to disk.') sv.saver.save(sess, sv.save_path, global_step=sv.global_step) except: if sv.is_chief and cleanup_op is not None: logging.info('About to execute sync_clean_up_op!') sess.run(cleanup_op) raise except errors.AbortedError: # Always re-run on AbortedError as it indicates a restart of one of the # distributed tensorflow servers. logging.info('Retrying training!') should_retry = True return total_loss
def begin(self): if self._summary_op is None: self._summary_op = summary.merge_all()
def unet_model_fn(features, labels, mode, params): tf.local_variables_initializer() loss, train_op, = None, None eval_metric_ops, training_hooks, evaluation_hooks = None, None, None predictions_dict = None unet = Unet(params=params) logits = unet.model(input_tensor=features['image']) y_pred = tf.math.softmax(logits, axis=-1) output_img = tf.expand_dims(tf.cast(tf.math.argmax(y_pred, axis=-1) * 255, dtype=tf.uint8), axis=-1) if mode in (estimator.ModeKeys.TRAIN, estimator.ModeKeys.EVAL): with tf.name_scope('Loss_Calculation'): loss = Losses(logits=logits, labels=labels['label']) loss = loss.custom_loss() with tf.name_scope('Dice_Score_Calculation'): dice = f1(labels=labels['label'], predictions=y_pred) with tf.name_scope('Images_{}'.format(mode)): with tf.name_scope('Reformat_Outputs'): label = tf.expand_dims(tf.cast(tf.argmax(labels['label'], -1) * 255, dtype=tf.uint8), axis=-1) image = tf.math.divide(features['image'] - tf.reduce_max(features['image'], [0, 1, 2]), tf.reduce_max(features['image'], [0, 1, 2]) - tf.reduce_min(features['image'], [0, 1, 2])) summary.image('1_Medical_Image', image, max_outputs=1) summary.image('2_Output', output_img, max_outputs=1) summary.image('3_Output_pred', tf.expand_dims(y_pred[:, :, :, 1], -1), max_outputs=1) summary.image('4_Output_label', label, max_outputs=1) if mode == estimator.ModeKeys.TRAIN: with tf.name_scope('Learning_Rate'): global_step = tf.compat.v1.train.get_or_create_global_step() learning_rate = tf.compat.v1.train.exponential_decay(params['lr'], global_step=global_step, decay_steps=params['decay_steps'], decay_rate=params['decay_rate'], staircase=False) with tf.name_scope('Optimizer_conf'): train_op = Adam(learning_rate=learning_rate).minimize(loss=loss, global_step=global_step) with tf.name_scope('Metrics'): summary.scalar('Output_DSC', dice[1]) summary.scalar('Learning_Rate', learning_rate) if mode == estimator.ModeKeys.EVAL: eval_metric_ops = {'Metrics/Output_DSC': dice} eval_summary_hook = tf.estimator.SummarySaverHook(output_dir=params['eval_path'], summary_op=summary.merge_all(), save_steps=params['eval_steps']) evaluation_hooks = [eval_summary_hook] if mode == estimator.ModeKeys.PREDICT: predictions_dict = {'image': features['image'], 'y_preds': y_pred[:, :, :, 1], 'output_img': output_img, 'path': features['path']} return estimator.EstimatorSpec(mode, predictions=predictions_dict, loss=loss, train_op=train_op, eval_metric_ops=eval_metric_ops, training_hooks=training_hooks, evaluation_hooks=evaluation_hooks)