Example no. 1
def _train_internal(graph,
                    output_dir,
                    train_op,
                    loss_op,
                    global_step_tensor,
                    init_op,
                    init_feed_dict,
                    init_fn,
                    log_every_steps,
                    supervisor_is_chief,
                    supervisor_master,
                    supervisor_save_model_secs,
                    keep_checkpoint_max,
                    supervisor_save_summaries_steps,
                    feed_fn,
                    steps,
                    fail_on_nan_loss,
                    monitors,
                    max_steps):
  """See train."""
  if (steps is not None) and (max_steps is not None):
    raise ValueError('Can not provide both steps and max_steps.')
  if not output_dir:
    raise ValueError('Output directory should be non-empty %s.' % output_dir)
  if train_op is None:
    raise ValueError('Missing train_op.')
  if loss_op is None:
    raise ValueError('Missing loss_op.')

  with graph.as_default():
    global_step_tensor = contrib_variables.assert_or_get_global_step(
        graph, global_step_tensor)
    if global_step_tensor is None:
      raise ValueError('No "global_step" was provided or found in the graph.')

    # Get current step.
    try:
      start_step = load_variable(output_dir, global_step_tensor.name)
    except (errors.NotFoundError, ValueError):
      start_step = 0

    summary_writer = (get_summary_writer(output_dir)
                      if supervisor_is_chief else None)

    # Add default chief monitors if none were provided.
    if not monitors:
      monitors = monitors_lib.get_default_monitors(
          loss_op=loss_op,
          summary_op=logging_ops.get_summary_op(),
          save_summary_steps=supervisor_save_summaries_steps,
          summary_writer=summary_writer) if supervisor_is_chief else []

    # TODO(ipolosukhin): Replace all functionality of Supervisor
    # with Chief-Exclusive Monitors.
    if not supervisor_is_chief:
      # Prune the list of monitors to the ones runnable on all workers.
      monitors = [monitor for monitor in monitors if monitor.run_on_all_workers]

    if max_steps is None:
      max_steps = (start_step + steps) if steps else None
    # Start monitors, can create graph parts.
    for monitor in monitors:
      monitor.begin(max_steps=max_steps)

  supervisor = tf_supervisor.Supervisor(
      graph,
      init_op=init_op or tf_supervisor.Supervisor.USE_DEFAULT,
      init_feed_dict=init_feed_dict,
      is_chief=supervisor_is_chief,
      logdir=output_dir,
      saver=_make_saver(graph, keep_checkpoint_max),
      global_step=global_step_tensor,
      summary_op=None,
      summary_writer=summary_writer,
      save_model_secs=supervisor_save_model_secs,
      init_fn=init_fn)
  session = supervisor.PrepareSession(master=supervisor_master,
                                      start_standard_services=True)
  supervisor.StartQueueRunners(session)

  with session:
    get_current_step = lambda: session.run(global_step_tensor)

    start_step = get_current_step()
    last_step = start_step
    last_log_step = start_step
    loss_value = None
    logging.info('Training steps [%d,%s)', last_step, 'inf'
                 if max_steps is None else str(max_steps))

    excinfo = None
    try:
      while not supervisor.ShouldStop() and (
          (max_steps is None) or (last_step < max_steps)):
        start_time = time.time()
        feed_dict = feed_fn() if feed_fn is not None else None

        outputs, should_stop = _run_with_monitors(
            session, last_step + 1, [train_op, loss_op], feed_dict, monitors)

        loss_value = outputs[loss_op.name]
        if np.isnan(loss_value):
          failure_message = 'Model diverged with loss = NaN.'
          if fail_on_nan_loss:
            logging.error(failure_message)
            raise monitors_lib.NanLossDuringTrainingError()
          else:
            logging.warning(failure_message)

        if should_stop:
          break

        this_step = get_current_step()

        if this_step <= last_step:
          logging.error(
              'Global step was not incremented by train op at step %s'
              ': new step %d', last_step, this_step)

        last_step = this_step
        is_last_step = (max_steps is not None) and (last_step >= max_steps)
        if is_last_step or (last_step - last_log_step >= log_every_steps):
          logging.info(
              'training step %d, loss = %.5f (%.3f sec/batch).',
              last_step, loss_value, float(time.time() - start_time))
          last_log_step = last_step
    except errors.OutOfRangeError as e:
      logging.warn('Got exception during tf.learn training loop possibly '
                   'due to exhausted input queue %s.', e)
    except StopIteration:
      logging.info('Exhausted input iterator.')
    except BaseException as e:  # pylint: disable=broad-except
      # Hold on to any other exceptions while we try recording a final
      # checkpoint and summary.
      excinfo = sys.exc_info()
    finally:
      try:
        # Call supervisor.Stop() from within a try block because it re-raises
        # exceptions thrown by the supervised threads.
        supervisor.Stop(close_summary_writer=False)

        # Save one last checkpoint and summaries
        # TODO(wicke): This should be handled by Supervisor

        # In case we encountered an exception in the try block before we updated
        # last_step, update it here (again).
        last_step = get_current_step()
        if supervisor_is_chief:
          ckpt_path = supervisor.save_path
          logging.info('Saving checkpoint for step %d to checkpoint: %s.',
                       last_step, ckpt_path)
          supervisor.saver.save(session, ckpt_path, global_step=last_step)

          # Finish monitors.
          for monitor in monitors:
            monitor.end()

      # catch OutOfRangeError which is thrown when queue is out of data (and for
      # other reasons as well).
      except errors.OutOfRangeError as e:
        logging.warn('OutOfRangeError in tf.learn final checkpoint possibly '
                     'due to exhausted input queue. Note: summary_op is not '
                     'expected to trigger dequeues. %s.', e)
      except BaseException as e:  # pylint: disable=broad-except
        # If we don't already have an exception to re-raise, raise this one.
        if not excinfo:
          raise
        # Otherwise, log this one and raise the other in the finally block.
        logging.error('Got exception during tf.learn final checkpoint %s.', e)
      finally:
        if excinfo:
          reraise(*excinfo)
    return loss_value
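
Below is a minimal, hypothetical usage sketch that is not part of the example above. It assumes a TensorFlow 1.x build in which `_train_internal` is importable from `tensorflow.contrib.learn.python.learn.graph_actions`; the toy linear-regression graph and the '/tmp/toy_model' output directory are illustrative placeholders.

import tensorflow as tf
from tensorflow.contrib.learn.python.learn import graph_actions

graph = tf.Graph()
with graph.as_default():
  global_step = tf.contrib.framework.get_or_create_global_step()
  x = tf.random_normal([8, 1])
  y = 3.0 * x + 1.0
  w = tf.Variable([[0.0]], name='w')
  b = tf.Variable([0.0], name='b')
  loss_op = tf.reduce_mean(tf.square(tf.matmul(x, w) + b - y))
  train_op = tf.train.GradientDescentOptimizer(0.1).minimize(
      loss_op, global_step=global_step)

# _train_internal declares no defaults, so every argument is spelled out.
final_loss = graph_actions._train_internal(
    graph=graph,
    output_dir='/tmp/toy_model',  # hypothetical directory
    train_op=train_op,
    loss_op=loss_op,
    global_step_tensor=global_step,
    init_op=None,
    init_feed_dict=None,
    init_fn=None,
    log_every_steps=10,
    supervisor_is_chief=True,
    supervisor_master='',
    supervisor_save_model_secs=600,
    keep_checkpoint_max=5,
    supervisor_save_summaries_steps=100,
    feed_fn=None,
    steps=100,
    fail_on_nan_loss=True,
    monitors=None,
    max_steps=None)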
Example no. 2
def evaluate(graph,
             output_dir,
             checkpoint_path,
             eval_dict,
             update_op=None,
             global_step_tensor=None,
             supervisor_master='',
             log_every_steps=10,
             feed_fn=None,
             max_steps=None):
  """Evaluate a model loaded from a checkpoint.

  Given `graph`, a directory to write summaries to (`output_dir`), a checkpoint
  to restore variables from, and a `dict` of `Tensor`s to evaluate, run an eval
  loop for `max_steps` steps, or until an exception (generally, an
  end-of-input signal from a reader operation) is raised from running
  `eval_dict`.

  In each step of evaluation, all tensors in the `eval_dict` are evaluated, and
  every `log_every_steps` steps, they are logged. At the very end of evaluation,
  a summary is evaluated (finding the summary ops using `Supervisor`'s logic)
  and written to `output_dir`.

  Args:
    graph: A `Graph` to train. It is expected that this graph is not in use
      elsewhere.
    output_dir: A string containing the directory to write a summary to.
    checkpoint_path: A string containing the path to a checkpoint to restore.
      Can be `None` if the graph doesn't require loading any variables.
    eval_dict: A `dict` mapping string names to tensors to evaluate. It is
      evaluated in every logging step. The result of the final evaluation is
      returned. If `update_op` is None, then it's evaluated in every step. If
      `max_steps` is `None`, this should depend on a reader that will raise an
      end-of-input exception when the inputs are exhausted.
    update_op: A `Tensor` which is run in every step.
    global_step_tensor: A `Variable` containing the global step. If `None`,
      one is extracted from the graph using the same logic as in `Supervisor`.
      Used to place eval summaries on training curves.
    supervisor_master: The master string to use when preparing the session.
    log_every_steps: Integer. Output logs every `log_every_steps` evaluation
      steps. The logs contain the `eval_dict` and timing information.
    feed_fn: A function that is called every iteration to produce a `feed_dict`
      passed to `session.run` calls. Optional.
    max_steps: Integer. Evaluate `eval_dict` this many times.

  Returns:
    A tuple `(eval_results, global_step)`:
    eval_results: A `dict` mapping `string` to numeric values (`int`, `float`)
      that are the result of running eval_dict in the last step. `None` if no
      eval steps were run.
    global_step: The global step this evaluation corresponds to.

  Raises:
    ValueError: if `output_dir` is empty.
  """
  if not output_dir:
    raise ValueError('Output directory should be non-empty %s.' % output_dir)
  with graph.as_default():
    global_step_tensor = contrib_variables.assert_or_get_global_step(
        graph, global_step_tensor)

    # Create or get summary op, global_step and saver.
    saver = _get_saver()
    local_init_op = _get_local_init_op()
    ready_op = _get_ready_op()

    session_manager = session_manager_lib.SessionManager(
        local_init_op=local_init_op,
        ready_op=ready_op)
    session, initialized = session_manager.recover_session(
        master=supervisor_master,
        saver=saver,
        checkpoint_dir=checkpoint_path)

    # Start queue runners.
    coord = coordinator.Coordinator()
    threads = queue_runner.start_queue_runners(session, coord)

  with session:
    if not initialized:
      logging.warning('Failed to initialize from %s.', checkpoint_path)
      # TODO(ipolosukhin): This should be failing, but old code relies on that.
      session.run(variables.global_variables_initializer())
      if checkpoint_path:
        _restore_from_checkpoint(session, graph, checkpoint_path, saver)

    current_global_step = session.run(global_step_tensor)
    eval_results = None
    # TODO(amodei): Fix this to run through the eval set exactly once.
    step = 0
    eval_step = None
    feed_dict = None
    logging.info('Eval steps [%d,%s) for training step %d.', step,
                 'inf' if max_steps is None
                 else str(max_steps), current_global_step)
    try:
      try:
        while (max_steps is None) or (step < max_steps):
          step += 1
          start_time = time.time()
          feed_dict = feed_fn() if feed_fn is not None else None
          if update_op is not None:
            session.run(update_op, feed_dict=feed_dict)
          else:
            eval_results = session.run(eval_dict, feed_dict=feed_dict)
            eval_step = step

          # TODO(wicke): We should assert that the global step hasn't changed.
          if step % log_every_steps == 0:
            if eval_step is None or step != eval_step:
              eval_results = session.run(eval_dict, feed_dict=feed_dict)
              eval_step = step
            duration = time.time() - start_time
            logging.info('Results after %d steps (%.3f sec/batch): %s.',
                         step, float(duration),
                         _eval_results_to_str(eval_results))
      finally:
        if eval_results is None or step != eval_step:
          eval_results = session.run(eval_dict, feed_dict=feed_dict)
          eval_step = step
        # Stop session first, before queue runners.
        session.close()

        # Stop queue runners.
        try:
          coord.request_stop()
          coord.join(threads, stop_grace_period_secs=120)
        except (RuntimeError, errors.CancelledError) as e:
          logging.warning('Coordinator didn\'t stop cleanly: %s', e)

    # catch OutOfRangeError which is thrown when queue is out of data (and for
    # other reasons as well).
    except errors.OutOfRangeError as e:
      if max_steps is None:
        logging.info('Input queue is exhausted.')
      else:
        logging.warn('Input queue is exhausted: %s.', e)
    # catch StopIteration which is thrown when the DataReader is out of data.
    except StopIteration as e:
      if max_steps is None:
        logging.info('Input iterator is exhausted.')
      else:
        logging.warn('Input iterator is exhausted: %s.', e)

  # Save summaries for this evaluation.
  _write_summary_results(output_dir, eval_results, current_global_step)

  return eval_results, current_global_step
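
A hypothetical usage sketch follows (not part of the original example). It assumes the `evaluate` above is the one exported by `tensorflow.contrib.learn.python.learn.graph_actions` in a TensorFlow 1.x build, and that a matching training run has already written a checkpoint under the illustrative '/tmp/toy_model' directory; the streaming MSE metric shows how `eval_dict` and `update_op` divide the work.

import tensorflow as tf
from tensorflow.contrib.learn.python.learn import graph_actions

graph = tf.Graph()
with graph.as_default():
  tf.contrib.framework.get_or_create_global_step()
  x = tf.random_normal([8, 1])
  y = 3.0 * x + 1.0
  w = tf.Variable([[0.0]], name='w')
  b = tf.Variable([0.0], name='b')
  predictions = tf.matmul(x, w) + b
  # tf.metrics ops return a (value, update_op) pair: the update op is run on
  # every step, and the value tensor is read whenever results are logged.
  mse, mse_update = tf.metrics.mean_squared_error(
      labels=y, predictions=predictions)

eval_results, global_step = graph_actions.evaluate(
    graph=graph,
    output_dir='/tmp/toy_model/eval',  # hypothetical summary directory
    checkpoint_path=tf.train.latest_checkpoint('/tmp/toy_model'),
    eval_dict={'mse': mse},
    update_op=mse_update,
    max_steps=50)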
def evaluate(graph,
             output_dir,
             checkpoint_path,
             eval_dict,
             global_step_tensor=None,
             init_op=None,
             supervisor_master='',
             log_every_steps=10,
             max_steps=None):
  """Evaluate a model loaded from a checkpoint.

  Given `graph`, a directory to write summaries to (`output_dir`), a checkpoint
  to restore variables from, and a `dict` of `Tensor`s to evaluate, run an eval
  loop for `max_steps` steps.

  In each step of evaluation, all tensors in the `eval_dict` are evaluated, and
  every `log_every_steps` steps, they are logged. At the very end of evaluation,
  a summary is evaluated (finding the summary ops using `Supervisor`'s logic)
  and written to `output_dir`.

  Args:
    graph: A `Graph` to train. It is expected that this graph is not in use
      elsewhere.
    output_dir: A string containing the directory to write a summary to.
    checkpoint_path: A string containing the path to a checkpoint to restore.
      Can be `None` if the graph doesn't require loading any variables.
    eval_dict: A `dict` mapping string names to tensors to evaluate in every
      eval step.
    global_step_tensor: A `Variable` containing the global step. If `None`,
      one is extracted from the graph using the same logic as in `Supervisor`.
      Used to place eval summaries on training curves.
    init_op: An op that initializes the graph. If `None`, use `Supervisor`'s
      default.
    supervisor_master: The master string to use when preparing the session.
    log_every_steps: Integer. Output logs every `log_every_steps` evaluation
      steps. The logs contain the `eval_dict` and timing information.
    max_steps: Integer. Evaluate `eval_dict` this many times.

  Returns:
    A tuple `(eval_results, global_step)`:
    eval_results: A `dict` mapping `string` to numeric values (`int`, `float`)
      that are the eval results from the last step of the eval.  None if no
      eval steps were run.
    global_step: The global step this evaluation corresponds to.
  """
  global_step_tensor = contrib_variables.assert_or_get_global_step(
      graph, global_step_tensor)

  # TODO(wicke): Don't use supervisor here, or switch to output_dir=eval_dir.
  supervisor, session = _prepare_session(
      graph=graph,
      output_dir=None,  # Must be None to avoid writing an event file
      start_services=False,
      global_step_tensor=global_step_tensor,
      init_op=init_op,
      supervisor_is_chief=True,
      supervisor_master=supervisor_master,
      supervisor_save_model_secs=None,
      supervisor_save_summaries_secs=None)
  global_step_tensor = supervisor.global_step

  with session:
    if checkpoint_path:
      _restore_from_checkpoint(
          session, graph, checkpoint_path, supervisor.saver)

    current_global_step = session.run(global_step_tensor)
    eval_results = None
    # TODO(amodei): Fix this to run through the eval set exactly once.
    step = 0
    logging.info('Eval steps [%d,%s)', step, 'inf' if max_steps is None
                 else str(max_steps))
    try:
      try:
        while not supervisor.ShouldStop() and (
            (max_steps is None) or (step < max_steps)):
          start_time = time.time()
          eval_results = _run_dict(session, eval_dict)
          # TODO(wicke): We should assert that the global step hasn't changed.
          step += 1
          if step % log_every_steps == 0:
            duration = time.time() - start_time
            logging.info('Results after %d steps (%.3f sec/batch): %s.',
                         step, float(duration),
                         ', '.join('%s = %s' % (k, v)
                                   for k, v in eval_results.items()))
      finally:
        # Make our own summary writer and write a summary to the eval dir
        if supervisor.summary_op is not None:
          try:
            summary_writer = SummaryWriter(output_dir,
                                           graph_def=session.graph_def)

            summary_str = session.run(supervisor.summary_op)
            if summary_str:
              summary_writer.add_summary(summary_str, current_global_step)
          finally:
            summary_writer.close()

        # Call supervisor.Stop() from within a try block because it re-raises
        # exceptions thrown by the supervised threads.
        supervisor.Stop()
    # catch OutOfRangeError which is thrown when queue is out of data (and for
    # other reasons as well).
    except errors.OutOfRangeError as e:
      logging.warn('Input queue exhausted: %s.', e)

  return eval_results, current_global_step
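
The sketch below is hypothetical and assumes this Supervisor-based variant is the `evaluate` exported by an older graph_actions module. Its docstring allows `checkpoint_path` to be `None` when the graph needs no restored variables, so an explicit `init_op` handles initialization; the tensors and paths are illustrative.

import tensorflow as tf
from tensorflow.contrib.learn.python.learn import graph_actions

graph = tf.Graph()
with graph.as_default():
  tf.contrib.framework.get_or_create_global_step()
  values = tf.random_uniform([32])
  metrics = {'mean_value': tf.reduce_mean(values),
             'max_value': tf.reduce_max(values)}
  init_op = tf.global_variables_initializer()

eval_results, global_step = graph_actions.evaluate(
    graph=graph,
    output_dir='/tmp/toy_model/eval',  # hypothetical summary directory
    checkpoint_path=None,  # nothing to restore; init_op initializes instead
    eval_dict=metrics,
    init_op=init_op,
    max_steps=10)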
Example no. 4
def _monitored_train(graph,
                     output_dir,
                     train_op,
                     loss_op,
                     global_step_tensor=None,
                     init_op=None,
                     init_feed_dict=None,
                     init_fn=None,
                     log_every_steps=10,
                     supervisor_is_chief=True,
                     supervisor_master='',
                     supervisor_save_model_secs=600,
                     supervisor_save_model_steps=None,
                     keep_checkpoint_max=5,
                     supervisor_save_summaries_secs=None,
                     supervisor_save_summaries_steps=100,
                     feed_fn=None,
                     steps=None,
                     fail_on_nan_loss=True,
                     hooks=None,
                     max_steps=None):
  """Train a model via monitored_session.

  Given `graph`, a directory to write outputs to (`output_dir`), and some ops,
  run a training loop. The given `train_op` performs one step of training on the
  model. The `loss_op` represents the objective function of the training. It is
  expected to increment the `global_step_tensor`, a scalar integer tensor
  counting training steps. This function uses `Supervisor` to initialize the
  graph (from a checkpoint if one is available in `output_dir`), write summaries
  defined in the graph, and write regular checkpoints as defined by
  `supervisor_save_model_secs`.

  Training continues until `global_step_tensor` evaluates to `max_steps`, or, if
  `fail_on_nan_loss`, until `loss_op` evaluates to `NaN`. In that case the
  program is terminated with exit code 1.

  Args:
    graph: A graph to train. It is expected that this graph is not in use
      elsewhere.
    output_dir: A directory to write outputs to.
    train_op: An op that performs one training step when run.
    loss_op: A scalar loss tensor.
    global_step_tensor: A tensor representing the global step. If none is given,
      one is extracted from the graph using the same logic as in `Supervisor`.
    init_op: An op that initializes the graph. If `None`, use `Supervisor`'s
      default.
    init_feed_dict: A dictionary that maps `Tensor` objects to feed values.
      This feed dictionary will be used when `init_op` is evaluated.
    init_fn: Optional callable passed to Supervisor to initialize the model.
    log_every_steps: Output logs regularly. The logs contain timing data and the
      current loss. A `0` or negative value disables logging.
    supervisor_is_chief: Whether the current process is the chief supervisor in
      charge of restoring the model and running standard services.
    supervisor_master: The master string to use when preparing the session.
    supervisor_save_model_secs: Save checkpoints every this many seconds. Can
        not be specified with `supervisor_save_model_steps`.
    supervisor_save_model_steps: Save checkpoints every this many steps. Can not
        be specified with `supervisor_save_model_secs`.
    keep_checkpoint_max: The maximum number of recent checkpoint files to
      keep. As new files are created, older files are deleted. If None or 0,
      all checkpoint files are kept. This is simply passed as the max_to_keep
      arg to `tf.Saver` constructor.
    supervisor_save_summaries_secs: Save summaries every
      `supervisor_save_summaries_secs` seconds when training.
    supervisor_save_summaries_steps: Save summaries every
      `supervisor_save_summaries_steps` steps when training. Exactly one of
      `supervisor_save_summaries_steps` and `supervisor_save_summaries_secs`
      should be specified, and the other should be None.
    feed_fn: A function that is called every iteration to produce a `feed_dict`
      passed to `session.run` calls. Optional.
    steps: Trains for this many steps (e.g. current global step + `steps`).
    fail_on_nan_loss: If true, raise `NanLossDuringTrainingError` if `loss_op`
      evaluates to `NaN`. If false, continue training as if nothing happened.
    hooks: List of `SessionRunHook` subclass instances. Used for callbacks
      inside the training loop.
    max_steps: Number of total steps for which to train model. If `None`,
      train forever. Two calls of fit(steps=100) mean 200 training iterations.
      On the other hand, the second of two calls of fit(max_steps=100) will
      not do any iterations, since the first call already did all 100 steps.

  Returns:
    The final loss value.

  Raises:
    ValueError: If `output_dir`, `train_op`, `loss_op`, or `global_step_tensor`
      is not provided. See `tf.contrib.framework.get_global_step` for how we
      look up the latter if not provided explicitly.
    NanLossDuringTrainingError: If `fail_on_nan_loss` is `True`, and loss ever
      evaluates to `NaN`.
    ValueError: If both `steps` and `max_steps` are not `None`.
  """
  if (steps is not None) and (max_steps is not None):
    raise ValueError('Can not provide both steps and max_steps.')
  if not output_dir:
    raise ValueError('Output directory should be non-empty %s.' % output_dir)
  if train_op is None:
    raise ValueError('Missing train_op.')
  if loss_op is None:
    raise ValueError('Missing loss_op.')
  if hooks is None:
    hooks = []
  if not isinstance(hooks, list):
    raise ValueError('Hooks should be a list.')
  with graph.as_default():
    global_step_tensor = contrib_variables.assert_or_get_global_step(
        graph, global_step_tensor)
  if global_step_tensor is None:
    raise ValueError('No "global_step" was provided or found in the graph.')

  if max_steps is not None:
    try:
      start_step = load_variable(output_dir, global_step_tensor.name)
      if max_steps <= start_step:
        logging.info('Skipping training since max_steps was already reached.')
        return None
    except:  # pylint: disable=bare-except
      pass

  # Adapted SessionRunHooks such as ExportMonitor depend on the
  # CheckpointSaverHook being executed before they run.
  # The `hooks` param comprises deprecated monitor hooks (such as
  # ExportMonitor), so they are appended after the basic_session_run_hooks.
  all_hooks = []
  with graph.as_default():
    all_hooks.append(basic_session_run_hooks.NanTensorHook(
        loss_op, fail_on_nan_loss=fail_on_nan_loss))
    if log_every_steps > 0:
      all_hooks.append(basic_session_run_hooks.LoggingTensorHook({
          'loss': loss_op.name,
          'step': global_step_tensor.name
      }, every_n_iter=log_every_steps))

    def make_saver():
      return tf_saver.Saver(
          sharded=True, max_to_keep=keep_checkpoint_max, defer_build=True,
          write_version=saver_pb2.SaverDef.V1)

    scaffold = monitored_session.Scaffold(
        init_op=init_op,
        init_feed_dict=init_feed_dict,
        init_fn=init_fn,
        saver=monitored_session.Scaffold.get_or_default('saver',
                                                        ops.GraphKeys.SAVERS,
                                                        make_saver))

    if not supervisor_is_chief:
      session_creator = monitored_session.WorkerSessionCreator(
          scaffold=scaffold,
          master=supervisor_master)
    else:
      session_creator = monitored_session.ChiefSessionCreator(
          scaffold=scaffold,
          checkpoint_dir=output_dir,
          master=supervisor_master)
      summary_writer = summary_io.SummaryWriterCache.get(output_dir)
      all_hooks.append(
          basic_session_run_hooks.StepCounterHook(
              summary_writer=summary_writer))
      all_hooks.append(
          basic_session_run_hooks.SummarySaverHook(
              save_secs=supervisor_save_summaries_secs,
              save_steps=supervisor_save_summaries_steps,
              summary_writer=summary_writer,
              scaffold=scaffold))
      if (supervisor_save_model_secs is not None
          or supervisor_save_model_steps is not None):
        all_hooks.append(
            basic_session_run_hooks.CheckpointSaverHook(
                output_dir,
                save_secs=supervisor_save_model_secs,
                save_steps=supervisor_save_model_steps,
                scaffold=scaffold))

    if steps is not None or max_steps is not None:
      all_hooks.append(basic_session_run_hooks.StopAtStepHook(steps, max_steps))
    all_hooks.extend(hooks)

    with monitored_session.MonitoredSession(
        session_creator=session_creator,
        hooks=all_hooks) as super_sess:
      loss = None
      while not super_sess.should_stop():
        _, loss = super_sess.run([train_op, loss_op], feed_fn() if feed_fn else
                                 None)
    summary_io.SummaryWriterCache.clear()
    return loss
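
A minimal, hypothetical sketch of driving `_monitored_train` follows (not part of the source). It assumes a TensorFlow 1.x build where the function is importable from this graph_actions module; the toy graph, the '/tmp/toy_model' directory, and the extra LoggingTensorHook are illustrative.

import tensorflow as tf
from tensorflow.contrib.learn.python.learn import graph_actions

graph = tf.Graph()
with graph.as_default():
  global_step = tf.contrib.framework.get_or_create_global_step()
  x = tf.random_normal([8, 1])
  y = 3.0 * x + 1.0
  w = tf.Variable([[0.0]], name='w')
  loss_op = tf.reduce_mean(tf.square(tf.matmul(x, w) - y))
  train_op = tf.train.GradientDescentOptimizer(0.1).minimize(
      loss_op, global_step=global_step)
  # Any SessionRunHook can be passed in; it is appended after the built-in
  # NaN, logging, summary and checkpoint hooks.
  extra_hooks = [tf.train.LoggingTensorHook({'w': w}, every_n_iter=50)]

final_loss = graph_actions._monitored_train(
    graph=graph,
    output_dir='/tmp/toy_model',  # hypothetical directory
    train_op=train_op,
    loss_op=loss_op,
    steps=200,
    hooks=extra_hooks)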
Example no. 5
def _supervised_train(graph,
                      output_dir,
                      train_op,
                      loss_op,
                      global_step_tensor=None,
                      init_op=None,
                      init_feed_dict=None,
                      init_fn=None,
                      log_every_steps=10,
                      supervisor_is_chief=True,
                      supervisor_master='',
                      supervisor_save_model_steps=1000,
                      keep_checkpoint_max=5,
                      supervisor_save_summaries_steps=100,
                      feed_fn=None,
                      steps=None,
                      fail_on_nan_loss=True,
                      monitors=None,
                      max_steps=None):
  """Train a model via supervised_session.

  Given `graph`, a directory to write outputs to (`output_dir`), and some ops,
  run a training loop. The given `train_op` performs one step of training on the
  model. The `loss_op` represents the objective function of the training. It is
  expected to increment the `global_step_tensor`, a scalar integer tensor
  counting training steps. This function uses `Supervisor` to initialize the
  graph (from a checkpoint if one is available in `output_dir`), write summaries
  defined in the graph, and write regular checkpoints as defined by
  `supervisor_save_model_secs`.

  Training continues until `global_step_tensor` evaluates to `max_steps`, or, if
  `fail_on_nan_loss`, until `loss_op` evaluates to `NaN`. In that case the
  program is terminated with exit code 1.

  Args:
    graph: A graph to train. It is expected that this graph is not in use
      elsewhere.
    output_dir: A directory to write outputs to.
    train_op: An op that performs one training step when run.
    loss_op: A scalar loss tensor.
    global_step_tensor: A tensor representing the global step. If none is given,
      one is extracted from the graph using the same logic as in `Supervisor`.
    init_op: An op that initializes the graph. If `None`, use `Supervisor`'s
      default.
    init_feed_dict: A dictionary that maps `Tensor` objects to feed values.
      This feed dictionary will be used when `init_op` is evaluated.
    init_fn: Optional callable passed to Supervisor to initialize the model.
    log_every_steps: Output logs regularly. The logs contain timing data and the
      current loss.
    supervisor_is_chief: Whether the current process is the chief supervisor in
      charge of restoring the model and running standard services.
    supervisor_master: The master string to use when preparing the session.
    supervisor_save_model_steps: Save a checkpoint every
      `supervisor_save_model_steps` steps when training.
    keep_checkpoint_max: The maximum number of recent checkpoint files to
      keep. As new files are created, older files are deleted. If None or 0,
      all checkpoint files are kept. This is simply passed as the max_to_keep
      arg to tf.Saver constructor.
    supervisor_save_summaries_steps: Save summaries every
      `supervisor_save_summaries_steps` steps when training.
    feed_fn: A function that is called every iteration to produce a `feed_dict`
      passed to `session.run` calls. Optional.
    steps: Trains for this many steps (e.g. current global step + `steps`).
    fail_on_nan_loss: If true, raise `NanLossDuringTrainingError` if `loss_op`
      evaluates to `NaN`. If false, continue training as if nothing happened.
    monitors: List of `BaseMonitor` subclass instances. Used for callbacks
      inside the training loop.
    max_steps: Number of total steps for which to train model. If `None`,
      train forever. Two calls of fit(steps=100) mean 200 training iterations.
      On the other hand, the second of two calls of fit(max_steps=100) will
      not do any iterations, since the first call already did all 100 steps.

  Returns:
    The final loss value.

  Raises:
    ValueError: If `output_dir`, `train_op`, `loss_op`, or `global_step_tensor`
      is not provided. See `tf.contrib.framework.get_global_step` for how we
      look up the latter if not provided explicitly.
    NanLossDuringTrainingError: If `fail_on_nan_loss` is `True`, and loss ever
      evaluates to `NaN`.
    ValueError: If both `steps` and `max_steps` are not `None`.
  """
  if (steps is not None) and (max_steps is not None):
    raise ValueError('Can not provide both steps and max_steps.')
  if not output_dir:
    raise ValueError('Output directory should be non-empty %s.' % output_dir)
  if train_op is None:
    raise ValueError('Missing train_op.')
  if loss_op is None:
    raise ValueError('Missing loss_op.')
  if monitors is None:
    monitors = []
  if not isinstance(monitors, list):
    raise ValueError('Monitors should be a list.')
  with graph.as_default():
    global_step_tensor = contrib_variables.assert_or_get_global_step(
        graph, global_step_tensor)
  if global_step_tensor is None:
    raise ValueError('No "global_step" was provided or found in the graph.')

  if max_steps is not None:
    try:
      start_step = checkpoints.load_variable(output_dir,
                                             global_step_tensor.name)
      if max_steps <= start_step:
        logging.info('Skipping training since max_steps was already reached.')
        return None
    except:  # pylint: disable=bare-except
      pass

  with graph.as_default():
    # See question about adding the summary writer to the scaffold.
    if supervisor_is_chief:
      summary_writer = summary_writer_cache.SummaryWriterCache.get(output_dir)
      monitors.extend([
          monitors_lib.StepCounter(summary_writer=summary_writer),
          monitors_lib.NanLoss(loss_op,
                               fail_on_nan_loss=fail_on_nan_loss),
          monitors_lib.PrintTensor({'loss': loss_op.name},
                                   every_n=log_every_steps),
      ])

    # Finalize graph and add savers
    # TODO(ispir): remove keep_checkpoint_max from Scaffold interface
    scaffold = supervised_session.Scaffold(
        global_step_tensor=global_step_tensor,
        init_op=init_op,
        init_feed_dict=init_feed_dict,
        init_fn=init_fn,
        keep_checkpoint_max=keep_checkpoint_max)
    if supervisor_is_chief:
      if scaffold.summary_op is not None:
        monitors.append(monitors_lib.SummarySaver(
            scaffold.summary_op,
            save_steps=supervisor_save_summaries_steps,
            summary_writer=summary_writer))
      if supervisor_save_model_steps > 0:
        monitors.append(
            monitors_lib.CheckpointSaver(supervisor_save_model_steps,
                                         scaffold.saver, output_dir))

    if steps is not None or max_steps is not None:
      monitors.append(monitors_lib.StopAtStep(steps, max_steps))
    if not supervisor_is_chief:
      # Prune the list of monitors to the ones runnable on all workers.
      monitors = [monitor for monitor in monitors if monitor.run_on_all_workers]

    with supervised_session.SupervisedSession(supervisor_master,
                                              is_chief=supervisor_is_chief,
                                              checkpoint_dir=output_dir,
                                              monitors=monitors,
                                              scaffold=scaffold) as super_sess:
      loss = None
      while not super_sess.should_stop():
        _, loss = super_sess.run([train_op, loss_op], feed_fn() if feed_fn else
                                 None)
      return loss
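
The following is only a sketch, and a speculative one: it assumes a TensorFlow build in which the `supervised_session` module used by `_supervised_train` still exists and in which the function is importable from this graph_actions module; the toy graph and directory are illustrative. With `monitors=None`, the chief gets the default StepCounter, NanLoss, PrintTensor, SummarySaver, and CheckpointSaver monitors added by the code above.

import tensorflow as tf
from tensorflow.contrib.learn.python.learn import graph_actions

graph = tf.Graph()
with graph.as_default():
  global_step = tf.contrib.framework.get_or_create_global_step()
  x = tf.random_normal([8, 1])
  y = 3.0 * x + 1.0
  w = tf.Variable([[0.0]], name='w')
  loss_op = tf.reduce_mean(tf.square(tf.matmul(x, w) - y))
  train_op = tf.train.GradientDescentOptimizer(0.1).minimize(
      loss_op, global_step=global_step)

final_loss = graph_actions._supervised_train(
    graph=graph,
    output_dir='/tmp/toy_model',  # hypothetical directory
    train_op=train_op,
    loss_op=loss_op,
    steps=200)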
def train(graph,
          output_dir,
          train_op,
          loss_op,
          global_step_tensor=None,
          init_op=None,
          log_every_steps=10,
          supervisor_is_chief=True,
          supervisor_master='',
          supervisor_save_model_secs=600,
          supervisor_save_summaries_secs=10,
          max_steps=None,
          fail_on_nan_loss=True):
  """Train a model.

  Given `graph`, a directory to write outputs to (`output_dir`), and some ops,
  run a training loop. The given `train_op` performs one step of training on the
  model. The `loss_op` represents the objective function of the training. It is
  expected to increment the `global_step_tensor`, a scalar integer tensor
  counting training steps. This function uses `Supervisor` to initialize the
  graph (from a checkpoint if one is available in `output_dir`), write summaries
  defined in the graph, and write regular checkpoints as defined by
  `supervisor_save_model_secs`.

  Training continues until `global_step_tensor` evaluates to `max_steps`, or, if
  `fail_on_nan_loss`, until `loss_op` evaluates to `NaN`. In that case the
  program is terminated with exit code 1.

  Args:
    graph: A graph to train. It is expected that this graph is not in use
      elsewhere.
    output_dir: A directory to write outputs to.
    train_op: An op that performs one training step when run.
    loss_op: A scalar loss tensor.
    global_step_tensor: A tensor representing the global step. If none is given,
      one is extracted from the graph using the same logic as in `Supervisor`.
    init_op: An op that initializes the graph. If `None`, use `Supervisor`'s
      default.
    log_every_steps: Output logs regularly. The logs contain timing data and the
      current loss.
    supervisor_is_chief: Whether the current process is the chief supervisor in
      charge of restoring the model and running standard services.
    supervisor_master: The master string to use when preparing the session.
    supervisor_save_model_secs: Save a checkpoint every
      `supervisor_save_model_secs` seconds when training.
    supervisor_save_summaries_secs: Save summaries every
      `supervisor_save_summaries_secs` seconds when training.
    max_steps: Train until `global_step_tensor` evaluates to this value.
    fail_on_nan_loss: If true, raise `NanLossDuringTrainingError` if `loss_op`
      evaluates to `NaN`. If false, continue training as if nothing happened.

  Returns:
    The final loss value.

  Raises:
    ValueError: If `global_step_tensor` is not provided. See
        `tf.contrib.framework.get_global_step` for how we look it up if not
        provided explicitly.
    NanLossDuringTrainingError: If `fail_on_nan_loss` is `True`, and loss ever
        evaluates to `NaN`.
  """
  global_step_tensor = contrib_variables.assert_or_get_global_step(
      graph, global_step_tensor)
  if global_step_tensor is None:
    raise ValueError('No "global_step" was provided or found in the graph.')
  supervisor, session = _prepare_session(
      graph=graph,
      output_dir=output_dir,
      start_services=True,
      global_step_tensor=global_step_tensor,
      init_op=init_op,
      supervisor_is_chief=supervisor_is_chief,
      supervisor_master=supervisor_master,
      supervisor_save_model_secs=supervisor_save_model_secs,
      supervisor_save_summaries_secs=supervisor_save_summaries_secs)

  with session:
    get_current_step = lambda: session.run(global_step_tensor)

    start_step = get_current_step()
    last_step = start_step
    last_log_step = start_step
    loss_value = None
    logging.info('Training steps [%d,%s)', last_step, 'inf'
                 if max_steps is None else str(max_steps))
    try:
      try:
        while not supervisor.ShouldStop() and (
            (max_steps is None) or (last_step < max_steps)):
          start_time = time.time()
          _, loss_value = session.run([train_op, loss_op])
          if np.isnan(loss_value):
            failure_message = 'Model diverged with loss = NaN.'
            if fail_on_nan_loss:
              logging.error(failure_message)
              raise NanLossDuringTrainingError()
            else:
              logging.warning(failure_message)

          this_step = get_current_step()

          if this_step <= last_step:
            logging.error(
                'Global step was not incremented by train op at step %s'
                ': new step %d' % (last_step, this_step))

          last_step = this_step
          is_last_step = (max_steps is not None) and (last_step >= max_steps)
          if is_last_step or (last_step - last_log_step >= log_every_steps):
            logging.info(
                'training step %d, loss = %.5f (%.3f sec/batch).',
                last_step, loss_value, float(time.time() - start_time))
            last_log_step = last_step
      finally:
        # Call supervisor.Stop() from within a try block because it re-raises
        # exceptions thrown by the supervised threads.
        supervisor.Stop(close_summary_writer=False)

        # Save one last checkpoint and summaries
        # TODO(wicke): This should be handled by Supervisor

        # In case we encountered an exception in the try block before we updated
        # last_step, update it here (again).
        last_step = get_current_step()
        if supervisor_is_chief:
          ckpt_path = supervisor.save_path
          logging.info('Saving checkpoint for step %d to checkpoint: %s.' % (
              last_step, ckpt_path))
          supervisor.saver.save(session, ckpt_path, global_step=last_step)
          if supervisor.summary_op is not None:
            summary_strs = session.run(supervisor.summary_op)
            supervisor.summary_writer.add_summary(summary_strs, last_step)
            supervisor.summary_writer.add_session_log(
                SessionLog(status=SessionLog.STOP), last_step)
            supervisor.summary_writer.close()
    # catch OutOfRangeError which is thrown when queue is out of data (and for
    # other reasons as well).
    except errors.OutOfRangeError as e:
      logging.warn('Got exception during tf.learn training loop possibly '
                   'due to exhausted input queue %s.', e)
    return loss_value
Example no. 7
def train(graph,
          output_dir,
          train_op,
          loss_op,
          global_step_tensor=None,
          init_op=None,
          init_feed_dict=None,
          init_fn=None,
          log_every_steps=10,
          supervisor_is_chief=True,
          supervisor_master='',
          supervisor_save_model_secs=600,
          supervisor_save_summaries_steps=100,
          feed_fn=None,
          steps=None,
          fail_on_nan_loss=True,
          monitors=None):
    """Train a model.

  Given `graph`, a directory to write outputs to (`output_dir`), and some ops,
  run a training loop. The given `train_op` performs one step of training on the
  model. The `loss_op` represents the objective function of the training. It is
  expected to increment the `global_step_tensor`, a scalar integer tensor
  counting training steps. This function uses `Supervisor` to initialize the
  graph (from a checkpoint if one is available in `output_dir`), write summaries
  defined in the graph, and write regular checkpoints as defined by
  `supervisor_save_model_secs`.

  Training continues until `global_step_tensor` evaluates to `max_steps`, or, if
  `fail_on_nan_loss`, until `loss_op` evaluates to `NaN`. In that case the
  program is terminated with exit code 1.

  Args:
    graph: A graph to train. It is expected that this graph is not in use
      elsewhere.
    output_dir: A directory to write outputs to.
    train_op: An op that performs one training step when run.
    loss_op: A scalar loss tensor.
    global_step_tensor: A tensor representing the global step. If none is given,
      one is extracted from the graph using the same logic as in `Supervisor`.
    init_op: An op that initializes the graph. If `None`, use `Supervisor`'s
      default.
    init_feed_dict: A dictionary that maps `Tensor` objects to feed values.
      This feed dictionary will be used when `init_op` is evaluated.
    init_fn: Optional callable passed to Supervisor to initialize the model.
    log_every_steps: Output logs regularly. The logs contain timing data and the
      current loss.
    supervisor_is_chief: Whether the current process is the chief supervisor in
      charge of restoring the model and running standard services.
    supervisor_master: The master string to use when preparing the session.
    supervisor_save_model_secs: Save a checkpoint every
      `supervisor_save_model_secs` seconds when training.
    supervisor_save_summaries_steps: Save summaries every
      `supervisor_save_summaries_steps` steps when training.
    feed_fn: A function that is called every iteration to produce a `feed_dict`
      passed to `session.run` calls. Optional.
    steps: Trains for this many steps (e.g. current global step + `steps`).
    fail_on_nan_loss: If true, raise `NanLossDuringTrainingError` if `loss_op`
      evaluates to `NaN`. If false, continue training as if nothing happened.
    monitors: List of `BaseMonitor` subclass instances. Used for callbacks
      inside the training loop.

  Returns:
    The final loss value.

  Raises:
    ValueError: If `global_step_tensor` is not provided. See
        `tf.contrib.framework.get_global_step` for how we look it up if not
        provided explicitly.
    NanLossDuringTrainingError: If `fail_on_nan_loss` is `True`, and loss ever
        evaluates to `NaN`.
  """
    if not output_dir:
        raise ValueError('Output directory should be non-empty.')

    with graph.as_default():
        global_step_tensor = contrib_variables.assert_or_get_global_step(
            graph, global_step_tensor)
        if global_step_tensor is None:
            raise ValueError(
                'No "global_step" was provided or found in the graph.')

        # Get current step.
        try:
            start_step = checkpoints.load_variable(output_dir,
                                                   global_step_tensor.name)
        except (errors.NotFoundError, ValueError):
            start_step = 0

        summary_writer = (get_summary_writer(output_dir)
                          if supervisor_is_chief else None)

        # TODO(ipolosukhin): Replace all functionality of Supervisor with Monitors.
        if not supervisor_is_chief:
            # monitors should run only on the chief.
            monitors = []
        elif not monitors:
            monitors = monitors_lib.get_default_monitors(
                loss_op=loss_op,
                summary_op=logging_ops.get_summary_op(),
                save_summary_steps=supervisor_save_summaries_steps,
                summary_writer=summary_writer)

        # Start monitors, can create graph parts.
        for monitor in monitors:
            monitor.begin(max_steps=start_step + steps)

    supervisor = tf_supervisor.Supervisor(
        graph,
        init_op=init_op or tf_supervisor.Supervisor.USE_DEFAULT,
        init_feed_dict=init_feed_dict,
        is_chief=supervisor_is_chief,
        logdir=output_dir,
        saver=_make_saver(graph),
        global_step=global_step_tensor,
        summary_op=None,
        summary_writer=summary_writer,
        save_model_secs=supervisor_save_model_secs,
        init_fn=init_fn)
    session = supervisor.PrepareSession(master=supervisor_master,
                                        start_standard_services=True)
    supervisor.StartQueueRunners(session)

    with session:
        get_current_step = lambda: session.run(global_step_tensor)

        start_step = get_current_step()
        max_steps = start_step + steps
        last_step = start_step
        last_log_step = start_step
        loss_value = None
        logging.info('Training steps [%d,%s)', last_step,
                     'inf' if max_steps is None else str(max_steps))

        excinfo = None
        try:
            while not supervisor.ShouldStop() and ((max_steps is None) or
                                                   (last_step < max_steps)):
                start_time = time.time()
                feed_dict = feed_fn() if feed_fn is not None else None

                outputs, should_stop = _run_with_monitors(
                    session, last_step + 1, [train_op, loss_op], feed_dict,
                    monitors)

                loss_value = outputs[loss_op.name]
                if np.isnan(loss_value):
                    failure_message = 'Model diverged with loss = NaN.'
                    if fail_on_nan_loss:
                        logging.error(failure_message)
                        raise NanLossDuringTrainingError()
                    else:
                        logging.warning(failure_message)

                if should_stop:
                    break

                this_step = get_current_step()

                if this_step <= last_step:
                    logging.error(
                        'Global step was not incremented by train op at step %s'
                        ': new step %d', last_step, this_step)

                last_step = this_step
                is_last_step = (max_steps
                                is not None) and (last_step >= max_steps)
                if is_last_step or (last_step - last_log_step >=
                                    log_every_steps):
                    logging.info(
                        'training step %d, loss = %.5f (%.3f sec/batch).',
                        last_step, loss_value, float(time.time() - start_time))
                    last_log_step = last_step
        except errors.OutOfRangeError as e:
            logging.warn(
                'Got exception during tf.learn training loop possibly '
                'due to exhausted input queue %s.', e)
        except BaseException as e:  # pylint: disable=broad-except
            # Hold on to any other exceptions while we try recording a final
            # checkpoint and summary.
            excinfo = sys.exc_info()
        finally:
            try:
                # Call supervisor.Stop() from within a try block because it re-raises
                # exceptions thrown by the supervised threads.
                supervisor.Stop(close_summary_writer=False)

                # Save one last checkpoint and summaries
                # TODO(wicke): This should be handled by Supervisor

                # In case we encountered an exception in the try block before we updated
                # last_step, update it here (again).
                last_step = get_current_step()
                if supervisor_is_chief:
                    ckpt_path = supervisor.save_path
                    logging.info(
                        'Saving checkpoint for step %d to checkpoint: %s.',
                        last_step, ckpt_path)
                    supervisor.saver.save(session,
                                          ckpt_path,
                                          global_step=last_step)

                    # Finish monitors.
                    for monitor in monitors:
                        monitor.end()

            # catch OutOfRangeError which is thrown when queue is out of data (and for
            # other reasons as well).
            except errors.OutOfRangeError as e:
                logging.warn(
                    'OutOfRangeError in tf.learn final checkpoint possibly '
                    'due to exhausted input queue. Note: summary_op is not '
                    'expected to trigger dequeues. %s.', e)
            except BaseException as e:  # pylint: disable=broad-except
                # If we don't already have an exception to re-raise, raise this one.
                if not excinfo:
                    raise
                # Otherwise, log this one and raise the other in the finally block.
                logging.error(
                    'Got exception during tf.learn final checkpoint %s.', e)
            finally:
                if excinfo:
                    reraise(*excinfo)
        return loss_value
def train(graph,
          output_dir,
          train_op,
          loss_op,
          global_step_tensor=None,
          init_op=None,
          init_feed_dict=None,
          init_fn=None,
          log_every_steps=10,
          supervisor_is_chief=True,
          supervisor_master='',
          supervisor_save_model_secs=600,
          supervisor_save_summaries_steps=100,
          feed_fn=None,
          steps=None,
          fail_on_nan_loss=True,
          monitors=None):
  """Train a model.

  Given `graph`, a directory to write outputs to (`output_dir`), and some ops,
  run a training loop. The given `train_op` performs one step of training on the
  model. The `loss_op` represents the objective function of the training. It is
  expected to increment the `global_step_tensor`, a scalar integer tensor
  counting training steps. This function uses `Supervisor` to initialize the
  graph (from a checkpoint if one is available in `output_dir`), write summaries
  defined in the graph, and write regular checkpoints as defined by
  `supervisor_save_model_secs`.

  Training continues until `global_step_tensor` evaluates to `max_steps`, or, if
  `fail_on_nan_loss`, until `loss_op` evaluates to `NaN`. In that case the
  program is terminated with exit code 1.

  Args:
    graph: A graph to train. It is expected that this graph is not in use
      elsewhere.
    output_dir: A directory to write outputs to.
    train_op: An op that performs one training step when run.
    loss_op: A scalar loss tensor.
    global_step_tensor: A tensor representing the global step. If none is given,
      one is extracted from the graph using the same logic as in `Supervisor`.
    init_op: An op that initializes the graph. If `None`, use `Supervisor`'s
      default.
    init_feed_dict: A dictionary that maps `Tensor` objects to feed values.
      This feed dictionary will be used when `init_op` is evaluated.
    init_fn: Optional callable passed to Supervisor to initialize the model.
    log_every_steps: Output logs regularly. The logs contain timing data and the
      current loss.
    supervisor_is_chief: Whether the current process is the chief supervisor in
      charge of restoring the model and running standard services.
    supervisor_master: The master string to use when preparing the session.
    supervisor_save_model_secs: Save a checkpoint every
      `supervisor_save_model_secs` seconds when training.
    supervisor_save_summaries_steps: Save summaries every
      `supervisor_save_summaries_steps` steps when training.
    feed_fn: A function that is called every iteration to produce a `feed_dict`
      passed to `session.run` calls. Optional.
    steps: Trains for this many steps (e.g. current global step + `steps`).
    fail_on_nan_loss: If true, raise `NanLossDuringTrainingError` if `loss_op`
      evaluates to `NaN`. If false, continue training as if nothing happened.
    monitors: List of `BaseMonitor` subclass instances. Used for callbacks
      inside the training loop.

  Returns:
    The final loss value.

  Raises:
    ValueError: If `global_step_tensor` is not provided. See
        `tf.contrib.framework.get_global_step` for how we look it up if not
        provided explicitly.
    NanLossDuringTrainingError: If `fail_on_nan_loss` is `True`, and loss ever
        evaluates to `NaN`.
  """
  if not output_dir:
    raise ValueError('Output directory should be non-empty.')

  with graph.as_default():
    global_step_tensor = contrib_variables.assert_or_get_global_step(
        graph, global_step_tensor)
    if global_step_tensor is None:
      raise ValueError('No "global_step" was provided or found in the graph.')

    # Get current step.
    try:
      start_step = checkpoints.load_variable(
          output_dir, global_step_tensor.name)
    except (errors.NotFoundError, ValueError):
      start_step = 0

    summary_writer = (get_summary_writer(output_dir)
                      if supervisor_is_chief else None)

    # TODO(ipolosukhin): Replace all functionality of Supervisor with Monitors.
    if not supervisor_is_chief:
      # monitors should run only on the chief.
      monitors = []
    elif not monitors:
      monitors = monitors_lib.get_default_monitors(
          loss_op=loss_op,
          summary_op=logging_ops.get_summary_op(),
          save_summary_steps=supervisor_save_summaries_steps,
          summary_writer=summary_writer)

    max_steps = (start_step + steps) if steps else None
    # Start monitors, can create graph parts.
    for monitor in monitors:
      monitor.begin(max_steps=max_steps)

  supervisor = tf_supervisor.Supervisor(
      graph,
      init_op=init_op or tf_supervisor.Supervisor.USE_DEFAULT,
      init_feed_dict=init_feed_dict,
      is_chief=supervisor_is_chief,
      logdir=output_dir,
      saver=_make_saver(graph),
      global_step=global_step_tensor,
      summary_op=None,
      summary_writer=summary_writer,
      save_model_secs=supervisor_save_model_secs,
      init_fn=init_fn)
  session = supervisor.PrepareSession(master=supervisor_master,
                                      start_standard_services=True)
  supervisor.StartQueueRunners(session)

  with session:
    get_current_step = lambda: session.run(global_step_tensor)

    start_step = get_current_step()
    last_step = start_step
    last_log_step = start_step
    loss_value = None
    logging.info('Training steps [%d,%s)', last_step, 'inf'
                 if max_steps is None else str(max_steps))

    excinfo = None
    try:
      while not supervisor.ShouldStop() and (
          (max_steps is None) or (last_step < max_steps)):
        start_time = time.time()
        feed_dict = feed_fn() if feed_fn is not None else None

        outputs, should_stop = _run_with_monitors(
            session, last_step + 1, [train_op, loss_op], feed_dict, monitors)

        loss_value = outputs[loss_op.name]
        if np.isnan(loss_value):
          failure_message = 'Model diverged with loss = NaN.'
          if fail_on_nan_loss:
            logging.error(failure_message)
            raise NanLossDuringTrainingError()
          else:
            logging.warning(failure_message)

        if should_stop:
          break

        this_step = get_current_step()

        if this_step <= last_step:
          logging.error(
              'Global step was not incremented by train op at step %s'
              ': new step %d', last_step, this_step)

        last_step = this_step
        is_last_step = (max_steps is not None) and (last_step >= max_steps)
        if is_last_step or (last_step - last_log_step >= log_every_steps):
          logging.info(
              'training step %d, loss = %.5f (%.3f sec/batch).',
              last_step, loss_value, float(time.time() - start_time))
          last_log_step = last_step
    except errors.OutOfRangeError as e:
      logging.warn('Got exception during tf.learn training loop possibly '
                   'due to exhausted input queue %s.', e)
    except BaseException as e:  # pylint: disable=broad-except
      # Hold on to any other exceptions while we try recording a final
      # checkpoint and summary.
      excinfo = sys.exc_info()
    finally:
      try:
        # Call supervisor.Stop() from within a try block because it re-raises
        # exceptions thrown by the supervised threads.
        supervisor.Stop(close_summary_writer=False)

        # Save one last checkpoint and summaries
        # TODO(wicke): This should be handled by Supervisor

        # In case we encountered an exception in the try block before we updated
        # last_step, update it here (again).
        last_step = get_current_step()
        if supervisor_is_chief:
          ckpt_path = supervisor.save_path
          logging.info('Saving checkpoint for step %d to checkpoint: %s.',
                       last_step, ckpt_path)
          supervisor.saver.save(session, ckpt_path, global_step=last_step)

          # Finish monitors.
          for monitor in monitors:
            monitor.end()

      # catch OutOfRangeError which is thrown when queue is out of data (and for
      # other reasons as well).
      except errors.OutOfRangeError as e:
        logging.warn('OutOfRangeError in tf.learn final checkpoint possibly '
                     'due to exhausted input queue. Note: summary_op is not '
                     'expected to trigger dequeues. %s.', e)
      except BaseException as e:  # pylint: disable=broad-except
        # If we don't already have an exception to re-raise, raise this one.
        if not excinfo:
          raise
        # Otherwise, log this one and raise the other in the finally block.
        logging.error('Got exception during tf.learn final checkpoint %s.', e)
      finally:
        if excinfo:
          reraise(*excinfo)
    return loss_value
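
The finally block above keeps any training-loop exception alive with `sys.exc_info()` so that one last checkpoint can still be written, and only then re-raises it with `six.reraise`. A minimal, self-contained sketch of that hold-and-reraise pattern (the `save_final_checkpoint` helper is hypothetical):

import sys

from six import reraise


def save_final_checkpoint():
  # Hypothetical stand-in for the final checkpoint/summary bookkeeping above.
  print('saving final checkpoint')


def run_with_final_checkpoint(step_fn, num_steps):
  """Runs step_fn repeatedly; always tries a final checkpoint, re-raising later."""
  excinfo = None
  try:
    for _ in range(num_steps):
      step_fn()
  except BaseException:  # pylint: disable=broad-except
    # Hold on to the exception while we try to record a final checkpoint.
    excinfo = sys.exc_info()
  finally:
    try:
      save_final_checkpoint()
    finally:
      if excinfo:
        reraise(*excinfo)


if __name__ == '__main__':
  run_with_final_checkpoint(lambda: None, num_steps=3)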
Example n. 9
def evaluate(graph,
             output_dir,
             checkpoint_path,
             eval_dict,
             update_op=None,
             global_step_tensor=None,
             supervisor_master='',
             log_every_steps=10,
             feed_fn=None,
             max_steps=None):
  """Evaluate a model loaded from a checkpoint.

  Given `graph`, a directory to write summaries to (`output_dir`), a checkpoint
  to restore variables from, and a `dict` of `Tensor`s to evaluate, run an eval
  loop for `max_steps` steps.

  In each step of evaluation, all tensors in the `eval_dict` are evaluated, and
  every `log_every_steps` steps, they are logged. At the very end of evaluation,
  a summary is evaluated (finding the summary ops using `Supervisor`'s logic)
  and written to `output_dir`.

  Args:
    graph: A `Graph` to train. It is expected that this graph is not in use
      elsewhere.
    output_dir: A string containing the directory to write a summary to.
    checkpoint_path: A string containing the path to a checkpoint to restore.
      Can be `None` if the graph doesn't require loading any variables.
    eval_dict: A `dict` mapping string names to tensors to evaluate. It is
      evaluated in every logging step. The result of the final evaluation is
      returned. If update_op is None, then it's evaluated in every step.
    update_op: A `Tensor` which is run in every step.
    global_step_tensor: A `Variable` containing the global step. If `None`,
      one is extracted from the graph using the same logic as in `Supervisor`.
      Used to place eval summaries on training curves.
    supervisor_master: The master string to use when preparing the session.
    log_every_steps: Integer. Output logs every `log_every_steps` evaluation
      steps. The logs contain the `eval_dict` and timing information.
    feed_fn: A function that is called every iteration to produce a `feed_dict`
      passed to `session.run` calls. Optional.
    max_steps: Integer. Evaluate `eval_dict` this many times.

  Returns:
    A tuple `(eval_results, global_step)`:
    eval_results: A `dict` mapping `string` to numeric values (`int`, `float`)
      that are the result of running eval_dict in the last step. `None` if no
      eval steps were run.
    global_step: The global step this evaluation corresponds to.
  """
  global_step_tensor = contrib_variables.assert_or_get_global_step(
      graph, global_step_tensor)

  # Add scalar summaries for every tensor in evaluation dict if there is not
  # one existing already or it's a string.
  existing_tags = [tensor_util.constant_value(summary.op.inputs[0])
                   for summary in ops.get_collection(ops.GraphKeys.SUMMARIES)]
  for key, value in eval_dict.items():
    if key in existing_tags:
      continue
    if isinstance(value, ops.Tensor):
      summaries.summarize_tensor(value, tag=key)

  # Create or get summary op, global_step and saver.
  summary_op = logging_ops.get_summary_op()
  saver = _get_saver()
  local_init_op = _get_local_init_op()
  ready_op = _get_ready_op()

  session_manager = session_manager_lib.SessionManager(
      local_init_op=local_init_op,
      ready_op=ready_op)
  session, initialized = session_manager.recover_session(
      master=supervisor_master,
      saver=saver,
      checkpoint_dir=checkpoint_path)

  # Start queue runners.
  coord = coordinator.Coordinator()
  threads = _start_queue_runners(session, coord)

  with session:
    if not initialized:
      logging.warning('Failed to initialize from %s.', checkpoint_path)
      # TODO(ipolosukhin): This should be failing, but old code relies on that.
      session.run(variables.initialize_all_variables())
      if checkpoint_path:
        _restore_from_checkpoint(session, graph, checkpoint_path, saver)

    current_global_step = session.run(global_step_tensor)
    eval_results = None
    # TODO(amodei): Fix this to run through the eval set exactly once.
    step = 0
    logging.info('Eval steps [%d,%s) for training step %d.', step,
                 'inf' if max_steps is None
                 else str(max_steps), current_global_step)
    try:
      try:
        while (max_steps is None) or (step < max_steps):
          start_time = time.time()
          feed_dict = feed_fn() if feed_fn is not None else None
          eval_results = None
          if update_op is not None:
            session.run(update_op, feed_dict=feed_dict)
          else:
            eval_results = _run_dict(session, eval_dict, feed_dict=feed_dict)

          # TODO(wicke): We should assert that the global step hasn't changed.
          step += 1
          if step % log_every_steps == 0:
            if eval_results is None:
              eval_results = _run_dict(session, eval_dict, feed_dict=feed_dict)
            duration = time.time() - start_time
            logging.info('Results after %d steps (%.3f sec/batch): %s.',
                         step, float(duration),
                         ', '.join('%s = %s' % (k, v)
                                   for k, v in eval_results.items()))
      finally:
        if eval_results is None:
          eval_results = _run_dict(session, eval_dict, feed_dict=feed_dict)
        # Stop queue runners.
        coord.request_stop()
        coord.join(threads, stop_grace_period_secs=120)

        # Make our own summary writer and write a summary to the eval dir.
        # Only if feed_fn is not provided.
        # TODO(ipolosukhin): Convert evaluation to use streaming_metrics,
        # then we can save for non feed_fn as well.
        if summary_op is not None and feed_fn is None:
          summary_writer = None
          try:
            summary_writer = SummaryWriter(output_dir,
                                           graph_def=session.graph_def)

            summary_str = session.run(summary_op)
            if summary_str:
              summary_writer.add_summary(summary_str, current_global_step)
          finally:
            if summary_writer:
              summary_writer.close()
    # catch OutOfRangeError which is thrown when queue is out of data (and for
    # other reasons as well).
    except errors.OutOfRangeError as e:
      if max_steps is None:
        logging.info('Input queue is exhausted.')
      else:
        logging.warn('Input queue is exhausted: %s.', e)
    # catch StopIteration which is thrown if DataReader is out of data.
    except StopIteration as e:
      if max_steps is None:
        logging.info('Input iterator is exhausted.')
      else:
        logging.warn('Input iterator is exhausted: %s.', e)

  return eval_results, current_global_step
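
When `update_op` is given, the loop above runs only the update op on every step and evaluates `eval_dict` lazily: at logging steps and once more in the `finally` block. A small sketch of that control flow with plain callables standing in for `session.run`; the names here are illustrative, not part of the library:

def eval_loop(run_update, run_eval_dict, max_steps, log_every_steps=10):
  """Mirrors the loop above: update every step, evaluate only when logging."""
  eval_results = None
  for step in range(1, max_steps + 1):
    eval_results = None
    run_update()                      # like session.run(update_op)
    if step % log_every_steps == 0:
      eval_results = run_eval_dict()  # like _run_dict(session, eval_dict)
      print('Results after %d steps: %s' % (step, eval_results))
  if eval_results is None:
    eval_results = run_eval_dict()    # final evaluation, as in the finally block
  return eval_results


state = {'count': 0}
print(eval_loop(
    run_update=lambda: state.update(count=state['count'] + 1),
    run_eval_dict=lambda: {'count': state['count']},
    max_steps=25))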
Example n. 10
def evaluate(graph,
             output_dir,
             checkpoint_path,
             eval_dict,
             update_op=None,
             global_step_tensor=None,
             supervisor_master='',
             log_every_steps=10,
             feed_fn=None,
             max_steps=None):
    """Evaluate a model loaded from a checkpoint.

  Given `graph`, a directory to write summaries to (`output_dir`), a checkpoint
  to restore variables from, and a `dict` of `Tensor`s to evaluate, run an eval
  loop for `max_steps` steps.

  In each step of evaluation, all tensors in the `eval_dict` are evaluated, and
  every `log_every_steps` steps, they are logged. At the very end of evaluation,
  a summary is evaluated (finding the summary ops using `Supervisor`'s logic)
  and written to `output_dir`.

  Args:
    graph: A `Graph` to train. It is expected that this graph is not in use
      elsewhere.
    output_dir: A string containing the directory to write a summary to.
    checkpoint_path: A string containing the path to a checkpoint to restore.
      Can be `None` if the graph doesn't require loading any variables.
    eval_dict: A `dict` mapping string names to tensors to evaluate. It is
      evaluated in every logging step. The result of the final evaluation is
      returned. If update_op is None, then it's evaluated in every step.
    update_op: A `Tensor` which is run in every step.
    global_step_tensor: A `Variable` containing the global step. If `None`,
      one is extracted from the graph using the same logic as in `Supervisor`.
      Used to place eval summaries on training curves.
    supervisor_master: The master string to use when preparing the session.
    log_every_steps: Integer. Output logs every `log_every_steps` evaluation
      steps. The logs contain the `eval_dict` and timing information.
    feed_fn: A function that is called every iteration to produce a `feed_dict`
      passed to `session.run` calls. Optional.
    max_steps: Integer. Evaluate `eval_dict` this many times.

  Returns:
    A tuple `(eval_results, global_step)`:
    eval_results: A `dict` mapping `string` to numeric values (`int`, `float`)
      that are the result of running eval_dict in the last step. `None` if no
      eval steps were run.
    global_step: The global step this evaluation corresponds to.
  """
    global_step_tensor = contrib_variables.assert_or_get_global_step(
        graph, global_step_tensor)

    # Add scalar summaries for every tensor in evaluation dict if there is not
    # one existing already or it's a string.
    existing_tags = [
        tensor_util.constant_value(summary.op.inputs[0])
        for summary in ops.get_collection(ops.GraphKeys.SUMMARIES)
    ]
    for key, value in eval_dict.items():
        if key in existing_tags:
            continue
        if isinstance(value, ops.Tensor):
            summaries.summarize_tensor(value, tag=key)

    # Create or get summary op, global_step and saver.
    summary_op = logging_ops.get_summary_op()
    saver = _get_saver()
    local_init_op = _get_local_init_op()
    ready_op = _get_ready_op()

    session_manager = session_manager_lib.SessionManager(
        local_init_op=local_init_op, ready_op=ready_op)
    session, initialized = session_manager.recover_session(
        master=supervisor_master, saver=saver, checkpoint_dir=checkpoint_path)

    # Start queue runners.
    coord = coordinator.Coordinator()
    threads = _start_queue_runners(session, coord)

    with session:
        if not initialized:
            logging.warning('Failed to initialize from %s.', checkpoint_path)
            # TODO(ipolosukhin): This should be failing, but old code relies on that.
            session.run(variables.initialize_all_variables())
            if checkpoint_path:
                _restore_from_checkpoint(session, graph, checkpoint_path,
                                         saver)

        current_global_step = session.run(global_step_tensor)
        eval_results = None
        # TODO(amodei): Fix this to run through the eval set exactly once.
        step = 0
        logging.info('Eval steps [%d,%s) for training step %d.', step,
                     'inf' if max_steps is None else str(max_steps),
                     current_global_step)
        try:
            try:
                while (max_steps is None) or (step < max_steps):
                    start_time = time.time()
                    feed_dict = feed_fn() if feed_fn is not None else None
                    eval_results = None
                    if update_op is not None:
                        session.run(update_op, feed_dict=feed_dict)
                    else:
                        eval_results = _run_dict(session,
                                                 eval_dict,
                                                 feed_dict=feed_dict)

                    # TODO(wicke): We should assert that the global step hasn't changed.
                    step += 1
                    if step % log_every_steps == 0:
                        if eval_results is None:
                            eval_results = _run_dict(session,
                                                     eval_dict,
                                                     feed_dict=feed_dict)
                        duration = time.time() - start_time
                        logging.info(
                            'Results after %d steps (%.3f sec/batch): %s.',
                            step, float(duration),
                            ', '.join('%s = %s' % (k, v)
                                      for k, v in eval_results.items()))
            finally:
                if eval_results is None:
                    eval_results = _run_dict(session,
                                             eval_dict,
                                             feed_dict=feed_dict)
                # Stop queue runners.
                coord.request_stop()
                coord.join(threads, stop_grace_period_secs=120)

                # Make our own summary writer and write a summary to the eval dir.
                # Only if feed_fn is not provided.
                # TODO(ipolosukhin): Convert evaluation to use streaming_metrics,
                # then we can save for non feed_fn as well.
                if summary_op is not None and feed_fn is None:
                    summary_writer = None
                    try:
                        summary_writer = get_summary_writer(output_dir)
                        summary_str = session.run(summary_op)
                        if summary_str:
                            summary_writer.add_summary(summary_str,
                                                       current_global_step)
                    finally:
                        if summary_writer:
                            summary_writer.close()
        # catch OutOfRangeError which is thrown when queue is out of data (and for
        # other reasons as well).
        except errors.OutOfRangeError as e:
            if max_steps is None:
                logging.info('Input queue is exhausted.')
            else:
                logging.warn('Input queue is exhausted: %s.', e)
        # catch StopIteration which is thrown if DataReader is out of data.
        except StopIteration as e:
            if max_steps is None:
                logging.info('Input iterator is exhausted.')
            else:
                logging.warn('Input iterator is exhausted: %s.', e)

    return eval_results, current_global_step
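
Throughout these examples `feed_fn` is just a zero-argument callable that returns a fresh `feed_dict` for every `session.run` call, which is why each iteration does `feed_dict = feed_fn() if feed_fn is not None else None`. A minimal sketch of that contract, with string keys standing in for real placeholder `Tensor`s:

import itertools


def make_feed_fn(batches):
  """Builds a feed_fn that returns one feed_dict per call, cycling over batches."""
  iterator = itertools.cycle(batches)

  def feed_fn():
    features, labels = next(iterator)
    # With a real graph these keys would be placeholder Tensors.
    return {'features:0': features, 'labels:0': labels}

  return feed_fn


feed_fn = make_feed_fn([([1.0, 2.0], [0]), ([3.0, 4.0], [1])])
for _ in range(3):
  feed_dict = feed_fn() if feed_fn is not None else None
  print(feed_dict)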
Example n. 11
def _supervised_train(graph,
                      output_dir,
                      train_op,
                      loss_op,
                      global_step_tensor=None,
                      init_op=None,
                      init_feed_dict=None,
                      init_fn=None,
                      log_every_steps=10,
                      supervisor_is_chief=True,
                      supervisor_master='',
                      supervisor_save_model_secs=600,
                      keep_checkpoint_max=5,
                      supervisor_save_summaries_steps=100,
                      feed_fn=None,
                      steps=None,
                      fail_on_nan_loss=True,
                      monitors=None,
                      max_steps=None):
    """Train a model via supervised_session.

  Given `graph`, a directory to write outputs to (`output_dir`), and some ops,
  run a training loop. The given `train_op` performs one step of training on the
  model. The `loss_op` represents the objective function of the training. It is
  expected to increment the `global_step_tensor`, a scalar integer tensor
  counting training steps. This function uses `Supervisor` to initialize the
  graph (from a checkpoint if one is available in `output_dir`), write summaries
  defined in the graph, and write regular checkpoints as defined by
  `supervisor_save_model_secs`.

  Training continues until `global_step_tensor` evaluates to `max_steps`, or, if
  `fail_on_nan_loss`, until `loss_op` evaluates to `NaN`. In that case the
  program is terminated with exit code 1.

  Args:
    graph: A graph to train. It is expected that this graph is not in use
      elsewhere.
    output_dir: A directory to write outputs to.
    train_op: An op that performs one training step when run.
    loss_op: A scalar loss tensor.
    global_step_tensor: A tensor representing the global step. If none is given,
      one is extracted from the graph using the same logic as in `Supervisor`.
    init_op: An op that initializes the graph. If `None`, use `Supervisor`'s
      default.
    init_feed_dict: A dictionary that maps `Tensor` objects to feed values.
      This feed dictionary will be used when `init_op` is evaluated.
    init_fn: Optional callable passed to Supervisor to initialize the model.
    log_every_steps: Output logs regularly. The logs contain timing data and the
      current loss.
    supervisor_is_chief: Whether the current process is the chief supervisor in
      charge of restoring the model and running standard services.
    supervisor_master: The master string to use when preparing the session.
    supervisor_save_model_secs: Save a checkpoint every
      `supervisor_save_model_secs` seconds when training.
    keep_checkpoint_max: The maximum number of recent checkpoint files to
      keep. As new files are created, older files are deleted. If None or 0,
      all checkpoint files are kept. This is simply passed as the `max_to_keep`
      arg to the `tf.train.Saver` constructor.
    supervisor_save_summaries_steps: Save summaries every
      `supervisor_save_summaries_steps` steps when training.
    feed_fn: A function that is called every iteration to produce a `feed_dict`
      passed to `session.run` calls. Optional.
    steps: Trains for this many steps (e.g. current global step + `steps`).
    fail_on_nan_loss: If true, raise `NanLossDuringTrainingError` if `loss_op`
      evaluates to `NaN`. If false, continue training as if nothing happened.
    monitors: List of `BaseMonitor` subclass instances. Used for callbacks
      inside the training loop.
    max_steps: Number of total steps for which to train the model. If `None`,
      train forever. Two calls to `fit(steps=100)` mean 200 training iterations.
      On the other hand, two calls to `fit(max_steps=100)` mean the second call
      will not do any iterations, since the first call already did all 100 steps.

  Returns:
    The final loss value.

  Raises:
    ValueError: If `output_dir`, `train_op`, `loss_op`, or `global_step_tensor`
      is not provided. See `tf.contrib.framework.get_global_step` for how we
      look up the latter if not provided explicitly.
    NanLossDuringTrainingError: If `fail_on_nan_loss` is `True`, and loss ever
      evaluates to `NaN`.
    ValueError: If both `steps` and `max_steps` are not `None`.
  """
    if (steps is not None) and (max_steps is not None):
        raise ValueError('Can not provide both steps and max_steps.')
    if not output_dir:
        raise ValueError('Output directory should be non-empty %s.' %
                         output_dir)
    if train_op is None:
        raise ValueError('Missing train_op.')
    if loss_op is None:
        raise ValueError('Missing loss_op.')
    if monitors is None:
        monitors = []
    if not isinstance(monitors, list):
        raise ValueError('Monitors should be a list.')
    with graph.as_default():
        global_step_tensor = contrib_variables.assert_or_get_global_step(
            graph, global_step_tensor)
    if global_step_tensor is None:
        raise ValueError(
            'No "global_step" was provided or found in the graph.')

    if max_steps is not None:
        try:
            start_step = checkpoints.load_variable(output_dir,
                                                   global_step_tensor.name)
            if max_steps <= start_step:
                logging.info(
                    'Skipping training since max_steps has already been reached.')
                return None
        except:  # pylint: disable=bare-except
            pass

    with graph.as_default():
        # See question about adding the summary writer to the scaffold.
        if supervisor_is_chief:
            summary_writer = summary_writer_cache.SummaryWriterCache.get(
                output_dir)
            monitors.extend([
                monitors_lib.StepCounter(summary_writer=summary_writer),
                monitors_lib.NanLoss(loss_op,
                                     fail_on_nan_loss=fail_on_nan_loss),
                monitors_lib.PrintTensor({'loss': loss_op.name},
                                         every_n=log_every_steps),
            ])

        # Finalize graph and add savers
        # TODO(ispir): remove keep_checkpoint_max from Scaffold interface
        scaffold = supervised_session.Scaffold(
            global_step_tensor=global_step_tensor,
            init_op=init_op,
            init_feed_dict=init_feed_dict,
            init_fn=init_fn,
            keep_checkpoint_max=keep_checkpoint_max)
        if supervisor_is_chief:
            if scaffold.summary_op is not None:
                monitors.append(
                    monitors_lib.SummarySaver(
                        scaffold.summary_op,
                        save_steps=supervisor_save_summaries_steps,
                        summary_writer=summary_writer))
            if supervisor_save_model_secs:
                monitors.append(
                    monitors_lib.CheckpointSaver(
                        # Make CheckpointSaver use a timer or change arg to be steps.
                        3 * supervisor_save_summaries_steps,
                        scaffold.saver,
                        output_dir))

        if steps is not None or max_steps is not None:
            monitors.append(monitors_lib.StopAtStep(steps, max_steps))
        if not supervisor_is_chief:
            # Prune list of monitor to the ones runnable on all workers.
            monitors = [
                monitor for monitor in monitors if monitor.run_on_all_workers
            ]

        with supervised_session.SupervisedSession(
                supervisor_master,
                is_chief=supervisor_is_chief,
                checkpoint_dir=output_dir,
                monitors=monitors,
                scaffold=scaffold) as super_sess:
            loss = None
            while not super_sess.should_stop():
                _, loss = super_sess.run([train_op, loss_op],
                                         feed_fn() if feed_fn else None)
            return loss
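
The docstring above distinguishes the relative `steps` argument from the absolute `max_steps` argument. A small sketch of how the stop step follows from the restored global step under each option (the helper name is illustrative):

def target_step(start_step, steps=None, max_steps=None):
  """Derives the stop step the way the training loops above do."""
  if (steps is not None) and (max_steps is not None):
    raise ValueError('Can not provide both steps and max_steps.')
  if max_steps is not None:
    return max_steps  # Absolute target, independent of start_step.
  return (start_step + steps) if steps else None  # Relative target; None = forever.


# Two calls with steps=100: the second call trains another 100 steps.
print(target_step(start_step=0, steps=100))        # 100
print(target_step(start_step=100, steps=100))      # 200

# Two calls with max_steps=100: the second call has nothing left to do.
print(target_step(start_step=0, max_steps=100))    # 100
print(target_step(start_step=100, max_steps=100))  # 100, already reached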
Example n. 12
def evaluate(graph,
             output_dir,
             checkpoint_path,
             eval_dict,
             global_step_tensor=None,
             init_op=None,
             supervisor_master='',
             log_every_steps=10,
             feed_fn=None,
             max_steps=None):
  """Evaluate a model loaded from a checkpoint.

  Given `graph`, a directory to write summaries to (`output_dir`), a checkpoint
  to restore variables from, and a `dict` of `Tensor`s to evaluate, run an eval
  loop for `max_steps` steps.

  In each step of evaluation, all tensors in the `eval_dict` are evaluated, and
  every `log_every_steps` steps, they are logged. At the very end of evaluation,
  a summary is evaluated (finding the summary ops using `Supervisor`'s logic)
  and written to `output_dir`.

  Args:
    graph: A `Graph` to train. It is expected that this graph is not in use
      elsewhere.
    output_dir: A string containing the directory to write a summary to.
    checkpoint_path: A string containing the path to a checkpoint to restore.
      Can be `None` if the graph doesn't require loading any variables.
    eval_dict: A `dict` mapping string names to tensors to evaluate in every
      eval step.
    global_step_tensor: A `Variable` containing the global step. If `None`,
      one is extracted from the graph using the same logic as in `Supervisor`.
      Used to place eval summaries on training curves.
    init_op: An op that initializes the graph. If `None`, use `Supervisor`'s
      default.
    supervisor_master: The master string to use when preparing the session.
    log_every_steps: Integer. Output logs every `log_every_steps` evaluation
      steps. The logs contain the `eval_dict` and timing information.
    feed_fn: A function that is called every iteration to produce a `feed_dict`
      passed to `session.run` calls. Optional.
    max_steps: Integer. Evaluate `eval_dict` this many times.

  Returns:
    A tuple `(eval_results, global_step)`:
    eval_results: A `dict` mapping `string` to numeric values (`int`, `float`)
      that are the eval results from the last step of the eval.  None if no
      eval steps were run.
    global_step: The global step this evaluation corresponds to.
  """
  global_step_tensor = contrib_variables.assert_or_get_global_step(
      graph, global_step_tensor)

  # Add scalar summaries for every tensor in evaluation dict if there is not
  # one existing already or it's a string.
  existing_tags = [tensor_util.constant_value(summary.op.inputs[0])
                   for summary in ops.get_collection(ops.GraphKeys.SUMMARIES)]
  for key, value in eval_dict.items():
    if key in existing_tags:
      continue
    if isinstance(value, ops.Tensor):
      summaries.summarize_tensor(value, tag=key)

  # TODO(wicke): Don't use supervisor here, or switch to output_dir=eval_dir.
  supervisor, session = _prepare_session(
      graph=graph,
      output_dir=None,  # Must be None to avoid writing an event file
      start_services=False,
      global_step_tensor=global_step_tensor,
      init_op=init_op,
      supervisor_is_chief=True,
      supervisor_master=supervisor_master,
      supervisor_save_model_secs=None,
      supervisor_save_summaries_secs=None)
  global_step_tensor = supervisor.global_step

  with session:
    if checkpoint_path:
      _restore_from_checkpoint(
          session, graph, checkpoint_path, supervisor.saver)

    current_global_step = session.run(global_step_tensor)
    eval_results = None
    # TODO(amodei): Fix this to run through the eval set exactly once.
    step = 0
    logging.info('Eval steps [%d,%s)', step, 'inf' if max_steps is None
                 else str(max_steps))
    try:
      try:
        while not supervisor.ShouldStop() and (
            (max_steps is None) or (step < max_steps)):
          start_time = time.time()
          feed_dict = feed_fn() if feed_fn is not None else None
          eval_results = _run_dict(session, eval_dict, feed_dict=feed_dict)
          # TODO(wicke): We should assert that the global step hasn't changed.
          step += 1
          if step % log_every_steps == 0:
            duration = time.time() - start_time
            logging.info('Results after %d steps (%.3f sec/batch): %s.',
                         step, float(duration),
                         ', '.join('%s = %s' % (k, v)
                                   for k, v in eval_results.items()))
      finally:
        # Make our own summary writer and write a summary to the eval dir
        if supervisor.summary_op is not None:
          summary_writer = None
          try:
            summary_writer = SummaryWriter(output_dir,
                                           graph_def=session.graph_def)

            summary_str = session.run(supervisor.summary_op)
            if summary_str:
              summary_writer.add_summary(summary_str, current_global_step)
          finally:
            if summary_writer:
              summary_writer.close()

        # Call supervisor.Stop() from within a try block because it re-raises
        # exceptions thrown by the supervised threads.
        supervisor.Stop()
    # catch OutOfRangeError which is thrown when queue is out of data (and for
    # other reasons as well).
    except errors.OutOfRangeError as e:
      logging.warn('Input queue exhausted: %s.', e)

  return eval_results, current_global_step
Example n. 13
def train(graph,
          output_dir,
          train_op,
          loss_op,
          global_step_tensor=None,
          init_op=None,
          init_fn=None,
          log_every_steps=10,
          supervisor_is_chief=True,
          supervisor_master='',
          supervisor_save_model_secs=600,
          supervisor_save_summaries_secs=10,
          feed_fn=None,
          max_steps=None,
          fail_on_nan_loss=True):
  """Train a model.

  Given `graph`, a directory to write outputs to (`output_dir`), and some ops,
  run a training loop. The given `train_op` performs one step of training on the
  model. The `loss_op` represents the objective function of the training. It is
  expected to increment the `global_step_tensor`, a scalar integer tensor
  counting training steps. This function uses `Supervisor` to initialize the
  graph (from a checkpoint if one is available in `output_dir`), write summaries
  defined in the graph, and write regular checkpoints as defined by
  `supervisor_save_model_secs`.

  Training continues until `global_step_tensor` evaluates to `max_steps`, or, if
  `fail_on_nan_loss`, until `loss_op` evaluates to `NaN`. In that case the
  program is terminated with exit code 1.

  Args:
    graph: A graph to train. It is expected that this graph is not in use
      elsewhere.
    output_dir: A directory to write outputs to.
    train_op: An op that performs one training step when run.
    loss_op: A scalar loss tensor.
    global_step_tensor: A tensor representing the global step. If none is given,
      one is extracted from the graph using the same logic as in `Supervisor`.
    init_op: An op that initializes the graph. If `None`, use `Supervisor`'s
      default.
    init_fn: Optional callable passed to Supervisor to initialize the model.
    log_every_steps: Output logs regularly. The logs contain timing data and the
      current loss.
    supervisor_is_chief: Whether the current process is the chief supervisor in
      charge of restoring the model and running standard services.
    supervisor_master: The master string to use when preparing the session.
    supervisor_save_model_secs: Save a checkpoint every
      `supervisor_save_model_secs` seconds when training.
    supervisor_save_summaries_secs: Save summaries every
      `supervisor_save_summaries_secs` seconds when training.
    feed_fn: A function that is called every iteration to produce a `feed_dict`
      passed to `session.run` calls. Optional.
    max_steps: Train until `global_step_tensor` evaluates to this value.
    fail_on_nan_loss: If true, raise `NanLossDuringTrainingError` if `loss_op`
      evaluates to `NaN`. If false, continue training as if nothing happened.

  Returns:
    The final loss value.

  Raises:
    ValueError: If `global_step_tensor` is not provided. See
        `tf.contrib.framework.get_global_step` for how we look it up if not
        provided explicitly.
    NanLossDuringTrainingError: If `fail_on_nan_loss` is `True`, and loss ever
        evaluates to `NaN`.
  """
  global_step_tensor = contrib_variables.assert_or_get_global_step(
      graph, global_step_tensor)
  if global_step_tensor is None:
    raise ValueError('No "global_step" was provided or found in the graph.')
  supervisor, session = _prepare_session(
      graph=graph,
      output_dir=output_dir,
      start_services=True,
      global_step_tensor=global_step_tensor,
      init_op=init_op,
      init_fn=init_fn,
      supervisor_is_chief=supervisor_is_chief,
      supervisor_master=supervisor_master,
      supervisor_save_model_secs=supervisor_save_model_secs,
      supervisor_save_summaries_secs=supervisor_save_summaries_secs)

  with session:
    get_current_step = lambda: session.run(global_step_tensor)

    start_step = get_current_step()
    last_step = start_step
    last_log_step = start_step
    loss_value = None
    logging.info('Training steps [%d,%s)', last_step, 'inf'
                 if max_steps is None else str(max_steps))
    excinfo = None
    try:
      while not supervisor.ShouldStop() and (
          (max_steps is None) or (last_step < max_steps)):
        start_time = time.time()
        feed_dict = feed_fn() if feed_fn is not None else None
        _, loss_value = session.run([train_op, loss_op], feed_dict=feed_dict)
        if np.isnan(loss_value):
          failure_message = 'Model diverged with loss = NaN.'
          if fail_on_nan_loss:
            logging.error(failure_message)
            raise NanLossDuringTrainingError()
          else:
            logging.warning(failure_message)

        this_step = get_current_step()

        if this_step <= last_step:
          logging.error(
              'Global step was not incremented by train op at step %s'
              ': new step %d', last_step, this_step)

        last_step = this_step
        is_last_step = (max_steps is not None) and (last_step >= max_steps)
        if is_last_step or (last_step - last_log_step >= log_every_steps):
          logging.info(
              'training step %d, loss = %.5f (%.3f sec/batch).',
              last_step, loss_value, float(time.time() - start_time))
          last_log_step = last_step
    except errors.OutOfRangeError as e:
      logging.warn('Got exception during tf.learn training loop possibly '
                   'due to exhausted input queue %s.', e)
    except BaseException as e:  # pylint: disable=broad-except
      # Hold on to any other exceptions while we try recording a final
      # checkpoint and summary.
      excinfo = sys.exc_info()
    finally:
      try:
        # Call supervisor.Stop() from within a try block because it re-raises
        # exceptions thrown by the supervised threads.
        supervisor.Stop(close_summary_writer=False)

        # Save one last checkpoint and summaries
        # TODO(wicke): This should be handled by Supervisor

        # In case we encountered an exception in the try block before we updated
        # last_step, update it here (again).
        last_step = get_current_step()
        if supervisor_is_chief:
          ckpt_path = supervisor.save_path
          logging.info('Saving checkpoint for step %d to checkpoint: %s.',
                       last_step, ckpt_path)
          supervisor.saver.save(session, ckpt_path, global_step=last_step)
          if supervisor.summary_op is not None:
            summary_strs = session.run(supervisor.summary_op)
            supervisor.summary_writer.add_summary(summary_strs, last_step)
            supervisor.summary_writer.add_session_log(
                SessionLog(status=SessionLog.STOP), last_step)
            supervisor.summary_writer.close()
      # catch OutOfRangeError which is thrown when queue is out of data (and for
      # other reasons as well).
      except errors.OutOfRangeError as e:
        logging.warn('OutOfRangeError in tf.learn final checkpoint possibly '
                     'due to exhausted input queue. Note: summary_op is not '
                     'expected to trigger dequeues. %s.', e)
      except BaseException as e:  # pylint: disable=broad-except
        # If we don't already have an exception to re-raise, raise this one.
        if not excinfo:
          raise
        # Otherwise, log this one and raise the other in the finally block.
        logging.error('Got exception during tf.learn final checkpoint %s.', e)
      finally:
        if excinfo:
          reraise(*excinfo)
    return loss_value
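
The loop body above combines three small patterns: a NaN guard on the loss, a check that the train op actually advanced the global step, and log throttling via `last_log_step`. A condensed, pure-Python sketch of that skeleton, with a fake `train_step` standing in for `session.run([train_op, loss_op])`:

import logging
import time

import numpy as np

logging.basicConfig(level=logging.INFO)


def training_loop(train_step, get_current_step, max_steps,
                  log_every_steps=10, fail_on_nan_loss=True):
  last_step = get_current_step()
  last_log_step = last_step
  loss_value = None
  while last_step < max_steps:
    start_time = time.time()
    loss_value = train_step()
    if np.isnan(loss_value):
      # The library raises NanLossDuringTrainingError here; RuntimeError keeps
      # this sketch self-contained.
      if fail_on_nan_loss:
        raise RuntimeError('Model diverged with loss = NaN.')
      logging.warning('Model diverged with loss = NaN.')
    this_step = get_current_step()
    if this_step <= last_step:
      logging.error('Global step was not incremented by train op.')
    last_step = this_step
    is_last_step = last_step >= max_steps
    if is_last_step or (last_step - last_log_step >= log_every_steps):
      logging.info('training step %d, loss = %.5f (%.3f sec/batch).',
                   last_step, loss_value, time.time() - start_time)
      last_log_step = last_step
  return loss_value


state = {'step': 0}


def fake_train_step():
  state['step'] += 1
  return 1.0 / state['step']


print(training_loop(fake_train_step, lambda: state['step'], max_steps=30))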
Example n. 14
def evaluate(graph,
             output_dir,
             checkpoint_path,
             eval_dict,
             global_step_tensor=None,
             init_op=None,
             supervisor_master='',
             log_every_steps=10,
             max_steps=None,
             max_global_step=None,
             tuner=None,
             tuner_metric=None):
  """Evaluate a model loaded from a checkpoint.

  Given `graph`, a directory to write summaries to (`output_dir`), a checkpoint
  to restore variables from, and a `dict` of `Tensor`s to evaluate, run an eval
  loop for `max_steps` steps.

  In each step of evaluation, all tensors in the `eval_dict` are evaluated, and
  every `log_every_steps` steps, they are logged. At the very end of evaluation,
  a summary is evaluated (finding the summary ops using `Supervisor`'s logic)
  and written to `output_dir`.

  Args:
    graph: A `Graph` to train. It is expected that this graph is not in use
      elsewhere.
    output_dir: A string containing the directory to write a summary to.
    checkpoint_path: A string containing the path to a checkpoint to restore.
      Can be `None` if the graph doesn't require loading any variables.
    eval_dict: A `dict` mapping string names to tensors to evaluate in every
      eval step.
    global_step_tensor: A `Variable` containing the global step. If `None`,
      one is extracted from the graph using the same logic as in `Supervisor`.
      Used to place eval summaries on training curves.
    init_op: An op that initializes the graph. If `None`, use `Supervisor`'s
      default.
    supervisor_master: The master string to use when preparing the session.
    log_every_steps: Integer. Output logs every `log_every_steps` evaluation
      steps. The logs contain the `eval_dict` and timing information.
    max_steps: Integer. Evaluate `eval_dict` this many times.
    max_global_step: Integer.  If the global_step is larger than this, skip
      the eval and return None.
    tuner: A `Tuner` that will be notified of eval completion and updated
      with objective metrics.
    tuner_metric: A `string` that specifies the eval metric to report to
      `tuner`.

  Returns:
    A tuple `(eval_results, should_stop)`:
    eval_results: A `dict` mapping `string` to numeric values (`int`, `float`)
      that are the eval results from the last step of the eval.  None if no
      eval steps were run.
    should_stop: A `bool`, indicating whether it was detected that eval should
      stop.

  Raises:
    ValueError: if the caller specifies max_global_step without providing
      a global_step.
  """
  # TODO(mdan): Move the tuner code outside eval.
  # The func returns sufficient information for the tuner to use.
  if tuner and not tuner_metric:
    raise ValueError('A tuner metric must be specified when using a tuner.')
  global_step_tensor = contrib_variables.assert_or_get_global_step(
      graph, global_step_tensor)

  # TODO(wicke): Don't use supervisor here, or switch to output_dir=eval_dir.
  supervisor, session = _prepare_session(
      graph=graph,
      output_dir=None,  # Must be None to avoid writing an event file
      start_services=False,
      global_step_tensor=global_step_tensor,
      init_op=init_op,
      supervisor_is_chief=True,
      supervisor_master=supervisor_master,
      supervisor_save_model_secs=None,
      supervisor_save_summaries_secs=None)
  global_step_tensor = supervisor.global_step

  with session:
    if checkpoint_path:
      _restore_from_checkpoint(
          session, graph, checkpoint_path, supervisor.saver)

    # When detecting that eval should stop, we still run one final measurement.
    should_stop = False
    # Check the global step.
    current_global_step = session.run(global_step_tensor)
    if max_global_step and current_global_step >= max_global_step:
      should_stop = True
      # TODO(mdan): Add support for more complex stop conditions.
    # Check for stop requests, if a tuner is present.
    if tuner and tuner.should_experiment_stop():
      should_stop = True

    eval_results = None
    # TODO(amodei): Fix this to run through the eval set exactly once.
    step = 0
    logging.info('Eval steps [%d,%s)', step, 'inf' if max_steps is None
                 else str(max_steps))
    try:
      try:
        while not supervisor.ShouldStop() and (
            (max_steps is None) or (step < max_steps)):
          start_time = time.time()
          eval_results = _run_dict(session, eval_dict)
          step += 1
          if step % log_every_steps == 0:
            duration = time.time() - start_time
            logging.info('Results after %d steps (%.3f sec/batch): %s.',
                         step, float(duration),
                         ', '.join('%s = %s' % (k, v)
                                   for k, v in eval_results.items()))
      finally:
        # Make our own summary writer and write a summary to the eval dir
        if supervisor.summary_op is not None:
          summary_writer = None
          try:
            summary_writer = SummaryWriter(output_dir,
                                           graph_def=session.graph_def)

            summary_str = session.run(supervisor.summary_op)
            if summary_str:
              # TODO(wicke): We should assert that the global step hasn't
              #              changed.
              global_step = session.run(global_step_tensor)
              summary_writer.add_summary(summary_str, global_step)
          finally:
            if summary_writer:
              summary_writer.close()

        if tuner:
          if eval_results and eval_results[tuner_metric]:
            tuner.report_measure(
                float(eval_results[tuner_metric]),
                model_save_path=checkpoint_path,
                global_step=int(current_global_step))

        # Call supervisor.Stop() from within a try block because it re-raises
        # exceptions thrown by the supervised threads.
        supervisor.Stop()
    # catch OutOfRangeError which is thrown when queue is out of data (and for
    # other reasons as well).
    except errors.OutOfRangeError as e:
      logging.warn('Got exception during tf.learn training loop possibly '
                   'due to exhausted input queue %s.', e)

  if should_stop and tuner is not None:
    try:
      tuner.report_done()
    except Exception as e:
      # TODO(mdan): Vizier should allow softer application error reporting.
      tuner.report_done(infeasible=True, infeasible_reason=e)
      raise

  return eval_results, should_stop
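
The `tuner` object is only exercised through three calls in the example above: `should_experiment_stop()`, `report_measure(...)`, and `report_done(...)`. A minimal sketch of an object with that inferred, duck-typed surface (not an official `Tuner` class):

class LoggingTuner(object):
  """Implements just the calls that evaluate() makes on a tuner."""

  def __init__(self):
    self.measures = []

  def should_experiment_stop(self):
    # A real tuner would ask the tuning service whether to abandon this trial.
    return False

  def report_measure(self, objective_value, model_save_path=None,
                     global_step=None):
    self.measures.append((global_step, objective_value, model_save_path))

  def report_done(self, infeasible=False, infeasible_reason=None):
    print('trial done, infeasible=%s (%s), %d measures reported'
          % (infeasible, infeasible_reason, len(self.measures)))


tuner = LoggingTuner()
tuner.report_measure(0.87, model_save_path='/tmp/model.ckpt-1000',
                     global_step=1000)
tuner.report_done()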
Example n. 15
def evaluate(graph,
             output_dir,
             checkpoint_path,
             eval_dict,
             global_step_tensor=None,
             init_op=None,
             supervisor_master='',
             log_every_steps=10,
             max_steps=None,
             max_global_step=None,
             tuner=None,
             tuner_metric=None):
    """Evaluate a model loaded from a checkpoint.

  Given `graph`, a directory to write summaries to (`output_dir`), a checkpoint
  to restore variables from, and a `dict` of `Tensor`s to evaluate, run an eval
  loop for `max_steps` steps.

  In each step of evaluation, all tensors in the `eval_dict` are evaluated, and
  every `log_every_steps` steps, they are logged. At the very end of evaluation,
  a summary is evaluated (finding the summary ops using `Supervisor`'s logic)
  and written to `output_dir`.

  Args:
    graph: A `Graph` to train. It is expected that this graph is not in use
      elsewhere.
    output_dir: A string containing the directory to write a summary to.
    checkpoint_path: A string containing the path to a checkpoint to restore.
      Can be `None` if the graph doesn't require loading any variables.
    eval_dict: A `dict` mapping string names to tensors to evaluate in every
      eval step.
    global_step_tensor: A `Variable` containing the global step. If `None`,
      one is extracted from the graph using the same logic as in `Supervisor`.
      Used to place eval summaries on training curves.
    init_op: An op that initializes the graph. If `None`, use `Supervisor`'s
      default.
    supervisor_master: The master string to use when preparing the session.
    log_every_steps: Integer. Output logs every `log_every_steps` evaluation
      steps. The logs contain the `eval_dict` and timing information.
    max_steps: Integer. Evaluate `eval_dict` this many times.
    max_global_step: Integer.  If the global_step is larger than this, skip
      the eval and return None.
    tuner: A `Tuner` that will be notified of eval completion and updated
      with objective metrics.
    tuner_metric: A `string` that specifies the eval metric to report to
      `tuner`.

  Returns:
    A tuple `(eval_results, should_stop)`:
    eval_results: A `dict` mapping `string` to numeric values (`int`, `float`)
      that are the eval results from the last step of the eval.  None if no
      eval steps were run.
    should_stop: A `bool`, indicating whether it was detected that eval should
      stop.

  Raises:
    ValueError: if the caller specifies max_global_step without providing
      a global_step.
  """
    # TODO(mdan): Move the tuner code outside eval.
    # The func returns sufficient information for the tuner to use.
    if tuner and not tuner_metric:
        raise ValueError(
            'A tuner metric must be specified when using a tuner.')
    global_step_tensor = contrib_variables.assert_or_get_global_step(
        graph, global_step_tensor)

    # TODO(wicke): Don't use supervisor here, or switch to output_dir=eval_dir.
    supervisor, session = _prepare_session(
        graph=graph,
        output_dir=None,  # Must be None to avoid writing an event file
        start_services=False,
        global_step_tensor=global_step_tensor,
        init_op=init_op,
        supervisor_is_chief=True,
        supervisor_master=supervisor_master,
        supervisor_save_model_secs=None,
        supervisor_save_summaries_secs=None)
    global_step_tensor = supervisor.global_step

    with session:
        if checkpoint_path:
            _restore_from_checkpoint(session, graph, checkpoint_path,
                                     supervisor.saver)

        # When detecting that eval should stop, we still run one final measurement.
        should_stop = False
        # Check the global step.
        current_global_step = session.run(global_step_tensor)
        if max_global_step and current_global_step >= max_global_step:
            should_stop = True
            # TODO(mdan): Add support for more complex stop conditions.
        # Check for stop requests, if a tuner is present.
        if tuner and tuner.should_experiment_stop():
            should_stop = True

        eval_results = None
        # TODO(amodei): Fix this to run through the eval set exactly once.
        step = 0
        logging.info('Eval steps [%d,%s)', step,
                     'inf' if max_steps is None else str(max_steps))
        try:
            try:
                while not supervisor.ShouldStop() and ((max_steps is None) or
                                                       (step < max_steps)):
                    start_time = time.time()
                    eval_results = _run_dict(session, eval_dict)
                    step += 1
                    if step % log_every_steps == 0:
                        duration = time.time() - start_time
                        logging.info(
                            'Results after %d steps (%.3f sec/batch): %s.',
                            step, float(duration),
                            ', '.join('%s = %s' % (k, v)
                                      for k, v in eval_results.items()))
            finally:
                # Make our own summary writer and write a summary to the eval dir
                if supervisor.summary_op is not None:
                    summary_writer = None
                    try:
                        summary_writer = SummaryWriter(
                            output_dir, graph_def=session.graph_def)

                        summary_str = session.run(supervisor.summary_op)
                        if summary_str:
                            # TODO(wicke): We should assert that the global step hasn't
                            #              changed.
                            global_step = session.run(global_step_tensor)
                            summary_writer.add_summary(summary_str,
                                                       global_step)
                    finally:
                        if summary_writer:
                            summary_writer.close()

                if tuner:
                    if eval_results and eval_results[tuner_metric]:
                        tuner.report_measure(
                            float(eval_results[tuner_metric]),
                            model_save_path=checkpoint_path,
                            global_step=int(current_global_step))

                # Call supervisor.Stop() from within a try block because it re-raises
                # exceptions thrown by the supervised threads.
                supervisor.Stop()
        # catch OutOfRangeError which is thrown when queue is out of data (and for
        # other reasons as well).
        except errors.OutOfRangeError as e:
            logging.warn(
                'Got exception during tf.learn training loop possibly '
                'due to exhausted input queue %s.', e)

    if should_stop and tuner is not None:
        try:
            tuner.report_done()
        except Exception as e:
            # TODO(mdan): Vizier should allow softer application error reporting.
            tuner.report_done(infeasible=True, infeasible_reason=e)
            raise

    return eval_results, should_stop