Example #1
def train(
    train_op,
    logdir,
    log_every_n_steps=1,
    graph=None,
    master='',
    is_chief=True,
    global_step=None,
    number_of_steps=None,
    init_op=_USE_DEFAULT,
    init_feed_dict=None,
    init_fn=None,
    summary_op=_USE_DEFAULT,
    save_summaries_secs=600,
    startup_delay_steps=0,
    saver=None,
    save_interval_secs=600,
    sync_optimizer=None):
  """Runs a training loop using a TensorFlow supervisor.

  When `sync_optimizer` is supplied, gradient updates are applied
  synchronously. Otherwise, gradient updates are applied asynchronously.

  Args:
    train_op: A `Tensor` that, when executed, will apply the gradients and
      return the loss value.
    logdir: The directory to which training logs are written.
    log_every_n_steps: The frequency, in terms of global steps, at which the
      loss and global step are logged.
    graph: The graph to pass to the supervisor. If no graph is supplied the
      default graph is used.
    master: The BNS name of the TensorFlow master.
    is_chief: Specifies whether or not the training is being run by the primary
      replica during replica training.
    global_step: The `Tensor` representing the global step. If left as `None`,
      then slim.variables.get_or_create_global_step() is used.
    number_of_steps: The max number of gradient steps to take during training.
      If the value is left as None, training proceeds indefinitely.
    init_op: The initialization operation. If left as `_USE_DEFAULT`, an op
      that initializes all variables, local variables, and tables is used.
    init_feed_dict: A feed dictionary to use when executing the `init_op`.
    init_fn: An optional callable to be executed after `init_op` is called. The
      callable must accept one argument, the session being initialized.
    summary_op: The summary operation.
    save_summaries_secs: How often, in seconds, to save summaries.
    startup_delay_steps: The number of steps to wait before beginning training.
      Note that this must be 0 if a `sync_optimizer` is supplied.
    saver: A `Saver` used to save checkpoints. If `None`, a default one will be
      created and used.
    save_interval_secs: How often, in seconds, to save the model to `logdir`.
    sync_optimizer: An instance of `tf.train.SyncReplicasOptimizer`. If the
      argument is supplied, gradient updates will be synchronous. If left as
      `None`, gradient updates will be asynchronous.

  Returns:
    The value of the loss function after training.

  Raises:
    ValueError: if `train_op` is `None`, if `startup_delay_steps` is non-zero
      when `sync_optimizer` is supplied, or if `number_of_steps` is not `None`
      and not positive.
  """
  if train_op is None:
    raise ValueError('train_op cannot be None.')

  if sync_optimizer and startup_delay_steps > 0:
    raise ValueError(
        'startup_delay_steps must be zero when sync_optimizer is supplied.')

  if number_of_steps is not None and number_of_steps <= 0:
    raise ValueError(
        '`number_of_steps` must be either None or a positive number.')

  graph = graph or ops.get_default_graph()
  if global_step is None:
    global_step = variables.get_or_create_global_step()
  saver = saver or tf_saver.Saver()

  if init_op == _USE_DEFAULT:
    init_op = control_flow_ops.group(
        tf_variables.initialize_all_variables(),
        tf_variables.initialize_local_variables(),
        tf_variables.initialize_all_tables())

  if summary_op == _USE_DEFAULT:
    summary_op = logging_ops.merge_all_summaries()

  local_init_op = None
  cleanup_op = None

  if is_chief and sync_optimizer:
    if not isinstance(sync_optimizer,
                      sync_replicas_optimizer.SyncReplicasOptimizer):
      raise ValueError(
          '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer')

    # Need to create these BEFORE the supervisor finalizes the graph:
    local_init_op = sync_optimizer.get_init_tokens_op()
    chief_queue_runner = sync_optimizer.get_chief_queue_runner()
    cleanup_op = sync_optimizer.get_clean_up_op()

  if number_of_steps:
    should_stop_op = math_ops.greater_equal(global_step, number_of_steps)
  else:
    should_stop_op = constant_op.constant(False)

  should_log_op = math_ops.equal(
      math_ops.mod(global_step, log_every_n_steps), 0)

  sv = supervisor.Supervisor(
      graph=graph,
      is_chief=is_chief,
      logdir=logdir,
      init_op=init_op,
      init_feed_dict=init_feed_dict,
      local_init_op=local_init_op,
      summary_op=summary_op,
      global_step=global_step,
      saver=saver,
      save_summaries_secs=save_summaries_secs,
      save_model_secs=save_interval_secs,
      init_fn=init_fn)

  with sv.managed_session(master, start_standard_services=False) as sess:
    if is_chief:
      sv.start_standard_services(sess)
    elif startup_delay_steps > 0:
      _wait_for_step(sess, global_step,
                     min(startup_delay_steps, number_of_steps or sys.maxint))
    sv.start_queue_runners(sess)
    if is_chief and sync_optimizer:
      sv.start_queue_runners(sess, [chief_queue_runner])

    total_loss = train_loop(
        sv, sess, train_op, should_stop_op, should_log_op, global_step,
        cleanup_op)

    # This waits for service threads to finish.
    sv.stop()

    if sv.is_chief:
      logging.info('Finished training! Saving model to disk.')
      sv.saver.save(sess, sv.save_path, global_step=sv.global_step)

    return total_loss
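
For context, a minimal usage sketch of the `train` function above (not part of the original listing): `load_batch` and `build_model` are hypothetical stand-ins, and the `train_op` is built so that executing it applies the gradients and yields the loss value, as the docstring requires.

import tensorflow as tf

g = tf.Graph()
with g.as_default():
  images, labels = load_batch()          # hypothetical input pipeline
  predictions = build_model(images)      # hypothetical model function
  loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(predictions, labels))

  global_step = tf.Variable(0, name='global_step', trainable=False)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
  grad_updates = optimizer.minimize(loss, global_step=global_step)

  # Per the docstring, `train_op` must yield the loss after the update.
  with tf.control_dependencies([grad_updates]):
    train_op = tf.identity(loss)

  # Log every 100 steps, checkpoint on the default schedule, stop at 1000.
  final_loss = train(
      train_op,
      logdir='/tmp/train_logs',
      log_every_n_steps=100,
      global_step=global_step,
      number_of_steps=1000)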
Example #2
def train(
    train_op,
    logdir,
    log_every_n_steps=1,
    graph=None,
    master='',
    is_chief=True,
    global_step=None,
    number_of_steps=None,
    init_op=_USE_DEFAULT,
    init_feed_dict=None,
    init_fn=None,
    summary_op=_USE_DEFAULT,
    save_summaries_secs=600,
    startup_delay_steps=0,
    saver=None,
    save_interval_secs=600,
    sync_optimizer=None):
  """Runs a training loop using a TensorFlow supervisor.

  When `sync_optimizer` is supplied, gradient updates are applied
  synchronously. Otherwise, gradient updates are applied asynchronously.

  Args:
    train_op: A `Tensor` that, when executed, will apply the gradients and
      return the loss value.
    logdir: The directory to which training logs are written.
    log_every_n_steps: The frequency, in terms of global steps, at which the
      loss and global step are logged.
    graph: The graph to pass to the supervisor. If no graph is supplied the
      default graph is used.
    master: The BNS name of the TensorFlow master.
    is_chief: Specifies whether or not the training is being run by the primary
      replica during replica training.
    global_step: The `Tensor` representing the global step. If left as `None`,
      then slim.variables.get_or_create_global_step() is used.
    number_of_steps: The max number of gradient steps to take during training.
      If the value is left as None, training proceeds indefinitely.
    init_op: The initialization operation. If left as `_USE_DEFAULT`, an op
      that initializes all variables, local variables, and tables is used.
    init_feed_dict: A feed dictionary to use when executing the `init_op`.
    init_fn: An optional callable to be executed after `init_op` is called. The
      callable must accept one argument, the session being initialized.
    summary_op: The summary operation.
    save_summaries_secs: How often, in seconds, to save summaries.
    startup_delay_steps: The number of steps to wait before beginning training.
      Note that this must be 0 if a `sync_optimizer` is supplied.
    saver: A `Saver` used to save checkpoints. If `None`, a default one will be
      created and used.
    save_interval_secs: How often, in seconds, to save the model to `logdir`.
    sync_optimizer: An instance of `tf.train.SyncReplicasOptimizer`. If the
      argument is supplied, gradient updates will be synchronous. If left as
      `None`, gradient updates will be asynchronous.

  Returns:
    The value of the loss function after training.

  Raises:
    ValueError: if `train_op` is `None`, if `startup_delay_steps` is non-zero
      when `sync_optimizer` is supplied, or if `number_of_steps` is not `None`
      and not positive.
  """
  if train_op is None:
    raise ValueError('train_op cannot be None.')

  if sync_optimizer and startup_delay_steps > 0:
    raise ValueError(
        'startup_delay_steps must be zero when sync_optimizer is supplied.')

  if number_of_steps is not None and number_of_steps <= 0:
    raise ValueError(
        '`number_of_steps` must be either None or a positive number.')

  graph = graph or ops.get_default_graph()
  if global_step is None:
    global_step = variables.get_or_create_global_step()
  saver = saver or tf_saver.Saver()

  if init_op == _USE_DEFAULT:
    init_op = control_flow_ops.group(
        tf_variables.initialize_all_variables(),
        tf_variables.initialize_local_variables(),
        tf_variables.initialize_all_tables())

  if summary_op == _USE_DEFAULT:
    summary_op = logging_ops.merge_all_summaries()

  local_init_op = None
  cleanup_op = None

  if is_chief and sync_optimizer:
    if not isinstance(sync_optimizer,
                      sync_replicas_optimizer.SyncReplicasOptimizer):
      raise ValueError(
          '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer')

    # Need to create these BEFORE the supervisor finalizes the graph:
    local_init_op = sync_optimizer.get_init_tokens_op()
    chief_queue_runner = sync_optimizer.get_chief_queue_runner()
    cleanup_op = sync_optimizer.get_clean_up_op()

  if number_of_steps:
    # Need to subtract 1 since the check for greater/equality is done
    # concurrently with the increment of global_step.
    # TODO(nsilberman): add a dependency to ensure the order of operations.
    should_stop_op = math_ops.greater_equal(global_step, number_of_steps-1)
  else:
    should_stop_op = constant_op.constant(False)

  should_log_op = math_ops.equal(math_ops.mod(global_step, log_every_n_steps),
                                 0)

  sv = supervisor.Supervisor(
      graph=graph,
      is_chief=is_chief,
      logdir=logdir,
      init_op=init_op,
      init_feed_dict=init_feed_dict,
      local_init_op=local_init_op,
      summary_op=summary_op,
      global_step=global_step,
      saver=saver,
      save_summaries_secs=save_summaries_secs,
      save_model_secs=save_interval_secs,
      init_fn=init_fn)

  with sv.managed_session(master, start_standard_services=False) as sess:
    if is_chief:
      sv.start_standard_services(sess)
    elif startup_delay_steps > 0:
      _wait_for_step(sess, global_step,
                     min(startup_delay_steps, number_of_steps or sys.maxint))
    sv.start_queue_runners(sess)
    if is_chief and sync_optimizer:
      sv.start_queue_runners(sess, [chief_queue_runner])

    total_loss = train_loop(
        sv, sess, train_op, should_stop_op, should_log_op, global_step,
        cleanup_op)

    # This waits for service threads to finish.
    sv.stop()

    if sv.is_chief:
      logging.info('Finished training! Saving model to disk.')
      sv.saver.save(sess, sv.save_path, global_step=sv.global_step)

    return total_loss
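
A hedged sketch of the synchronous-replicas path (not from the original source). The `SyncReplicasOptimizer` constructor arguments varied across early TensorFlow releases, so the ones below are illustrative, and `FLAGS.master` / `FLAGS.task` are hypothetical flags; `load_batch` and `build_model` remain stand-ins.

import tensorflow as tf

g = tf.Graph()
with g.as_default():
  images, labels = load_batch()          # hypothetical input pipeline
  predictions = build_model(images)      # hypothetical model function
  loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(predictions, labels))
  global_step = tf.Variable(0, name='global_step', trainable=False)

  # Aggregate gradients from all replicas before applying a single update.
  # Argument names follow the early (V1) API implied by get_clean_up_op();
  # later versions changed this constructor.
  opt = tf.train.SyncReplicasOptimizer(
      tf.train.GradientDescentOptimizer(learning_rate=0.01),
      replicas_to_aggregate=4,
      replica_id=FLAGS.task,             # hypothetical flag
      total_num_replicas=4)
  grad_updates = opt.minimize(loss, global_step=global_step)
  with tf.control_dependencies([grad_updates]):
    train_op = tf.identity(loss)

  train(
      train_op,
      logdir='/tmp/train_logs',
      master=FLAGS.master,               # hypothetical flag
      is_chief=(FLAGS.task == 0),
      global_step=global_step,
      startup_delay_steps=0,             # must be 0 with a sync_optimizer
      sync_optimizer=opt)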
Example #3
def evaluation_loop(master,
                    checkpoint_dir,
                    logdir,
                    num_evals=1,
                    eval_op=None,
                    eval_op_feed_dict=None,
                    final_op=None,
                    final_op_feed_dict=None,
                    summary_op=None,
                    summary_op_feed_dict=None,
                    variables_to_restore=None,
                    eval_interval_secs=60):
    """Runs TF-Slim's Evaluation Loop.

  Args:
    master: The BNS address of the TensorFlow master.
    checkpoint_dir: The directory where checkpoints are stored.
    logdir: The directory where the TensorFlow summaries are written to.
    num_evals: The number of times to run `eval_op`.
    eval_op: A operation run `num_evals` times.
    eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`.
    final_op: An operation to execute after all of the `eval_op` executions. The
      value of `final_op` is returned.
    final_op_feed_dict: A feed dictionary to use when executing `final_op`.
    summary_op: The summary_op to evaluate after running TF-Slims metric ops.
    summary_op_feed_dict: An optional feed dictionary to use when running the
      `summary_op`.
    variables_to_restore: A list of TensorFlow variables to restore during
      evaluation. If the argument is left as `None` then
      slim.variables.GetVariablesToRestore() is used.
    eval_interval_secs: The minimum number of seconds between evaluations.
  """
    global_step = variables.get_or_create_global_step()

    init_op = control_flow_ops.group(tf_variables.initialize_all_variables(),
                                     tf_variables.initialize_local_variables(),
                                     tf_variables.initialize_all_tables())

    saver = tf_saver.Saver(variables_to_restore
                           or variables.get_variables_to_restore())

    summary_writer = summary_io.SummaryWriter(logdir)

    sv = supervisor.Supervisor(graph=ops.get_default_graph(),
                               logdir=logdir,
                               init_op=init_op,
                               summary_op=None,
                               summary_writer=None,
                               global_step=None,
                               saver=saver)

    last_checkpoint = None
    while True:
        last_checkpoint = wait_for_new_checkpoint(checkpoint_dir,
                                                  last_checkpoint)
        start = time.time()
        logging.info('Starting evaluation at ' +
                     time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))

        with sv.managed_session(master, start_standard_services=False) as sess:
            sv.start_queue_runners(sess)
            sv.saver.restore(sess, last_checkpoint)
            evaluation(sess,
                       num_evals=num_evals,
                       eval_op=eval_op,
                       eval_op_feed_dict=eval_op_feed_dict,
                       final_op=final_op,
                       final_op_feed_dict=final_op_feed_dict,
                       summary_op=summary_op,
                       summary_op_feed_dict=summary_op_feed_dict,
                       summary_writer=summary_writer,
                       global_step=global_step)

        logging.info('Finished evaluation at ' +
                     time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))
        time_to_next_eval = start + eval_interval_secs - time.time()
        if time_to_next_eval > 0:
            time.sleep(time_to_next_eval)
Example #4
def evaluation_loop(master, checkpoint_dir, logdir, num_evals=1,
                    eval_op=None, eval_op_feed_dict=None,
                    final_op=None, final_op_feed_dict=None, summary_op=None,
                    summary_op_feed_dict=None, variables_to_restore=None,
                    eval_interval_secs=60):
  """Runs TF-Slim's Evaluation Loop.

  Args:
    master: The BNS address of the TensorFlow master.
    checkpoint_dir: The directory where checkpoints are stored.
    logdir: The directory to which TensorFlow summaries are written.
    num_evals: The number of times to run `eval_op`.
    eval_op: An operation that is run `num_evals` times.
    eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`.
    final_op: An operation to execute after all of the `eval_op` executions. The
      value of `final_op` is returned.
    final_op_feed_dict: A feed dictionary to use when executing `final_op`.
    summary_op: The summary op to evaluate after running TF-Slim's metric ops.
    summary_op_feed_dict: An optional feed dictionary to use when running the
      `summary_op`.
    variables_to_restore: A list of TensorFlow variables to restore during
      evaluation. If the argument is left as `None`, then
      slim.variables.get_variables_to_restore() is used.
    eval_interval_secs: The minimum number of seconds between evaluations.
  """
  global_step = variables.get_or_create_global_step()

  init_op = control_flow_ops.group(
      tf_variables.initialize_all_variables(),
      tf_variables.initialize_local_variables(),
      tf_variables.initialize_all_tables())

  saver = tf_saver.Saver(
      variables_to_restore or variables.get_variables_to_restore())

  summary_writer = summary_io.SummaryWriter(logdir)

  sv = supervisor.Supervisor(
      graph=ops.get_default_graph(),
      logdir=logdir,
      init_op=init_op,
      summary_op=None,
      summary_writer=None,
      global_step=None,
      saver=saver)

  last_checkpoint = None
  while True:
    last_checkpoint = wait_for_new_checkpoint(checkpoint_dir, last_checkpoint)
    start = time.time()
    logging.info(
        'Starting evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S',
                                                  time.gmtime()))

    with sv.managed_session(master, start_standard_services=False) as sess:
      sv.start_queue_runners(sess)
      sv.saver.restore(sess, last_checkpoint)
      evaluation(
          sess,
          num_evals=num_evals,
          eval_op=eval_op,
          eval_op_feed_dict=eval_op_feed_dict,
          final_op=final_op,
          final_op_feed_dict=final_op_feed_dict,
          summary_op=summary_op,
          summary_op_feed_dict=summary_op_feed_dict,
          summary_writer=summary_writer,
          global_step=global_step)

    logging.info(
        'Finished evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S',
                                                  time.gmtime()))
    time_to_next_eval = start + eval_interval_secs - time.time()
    if time_to_next_eval > 0:
      time.sleep(time_to_next_eval)
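
Finally, a hedged usage sketch for `evaluation_loop` (not part of the original listing). The metric wiring uses the contrib streaming-metrics API of the same era, and `load_batch` / `build_model` are hypothetical stand-ins.

import tensorflow as tf

g = tf.Graph()
with g.as_default():
  images, labels = load_batch()          # hypothetical input pipeline
  predictions = build_model(images)      # hypothetical model function

  # A streaming metric: `update_op` accumulates statistics each time it is
  # run, so it is passed as `eval_op` and executed `num_evals` times.
  accuracy, update_op = tf.contrib.metrics.streaming_accuracy(
      tf.argmax(predictions, 1), tf.argmax(labels, 1))
  tf.scalar_summary('eval/accuracy', accuracy)

  evaluation_loop(
      master='',
      checkpoint_dir='/tmp/train_logs',  # checkpoints written by train()
      logdir='/tmp/eval_logs',
      num_evals=100,                     # batches per evaluation pass
      eval_op=update_op,
      summary_op=tf.merge_all_summaries(),
      eval_interval_secs=600)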