Example #1
0
 def bias_(self):
   """Collects every bias tensor of the DNN from the model directory.

   Returns:
     A list holding, in order, one loaded value per entry of
     `self._hidden_units`, the logits bias and the centered bias.
   """
   biases = []
   for layer_index, _ in enumerate(self._hidden_units):
     layer_name = "dnn/hiddenlayer_%d/biases" % layer_index
     biases.append(load_variable(self._model_dir, name=layer_name))
   biases.append(load_variable(self._model_dir, name="dnn/logits/biases"))
   biases.append(load_variable(self._model_dir, name=_CENTERED_BIAS_WEIGHT))
   return biases
Example #2
0
 def bias_(self):
   """Collects the bias tensors of the DNN from the model directory.

   Returns:
     A list with one loaded value per entry of `self._hidden_units`, then the
     logits bias, and finally the centered bias when the estimator param
     `enable_centered_bias` is truthy.
   """
   model_dir = self._model_dir
   biases = [
       load_variable(model_dir, name=("dnn/hiddenlayer_%d/biases" % idx))
       for idx, _ in enumerate(self._hidden_units)
   ]
   biases.append(load_variable(model_dir, name="dnn/logits/biases"))
   # The centered bias only exists when the estimator was configured for it.
   if self._estimator.params["enable_centered_bias"]:
     biases.append(load_variable(model_dir, name=_CENTERED_BIAS_WEIGHT))
   return biases
    def get_bias(self, model_dir):
        """Loads the bias tensors of this model from a checkpoint directory.

        Args:
            model_dir: Directory where model parameters, graph and etc. are saved.

        Returns:
            A list with one loaded bias per entry of `self._hidden_units`,
            followed by the logits bias.
        """
        scope = self._scope
        biases = []
        for layer_index, _ in enumerate(self._hidden_units):
            layer_name = scope + "/hiddenlayer_%d/biases" % layer_index
            biases.append(load_variable(model_dir, name=layer_name))
        biases.append(load_variable(model_dir, name=(scope + "/logits/biases")))
        return biases
  def get_bias(self, model_dir):
    """Reads the hidden-layer and logits biases saved under `model_dir`.

    Args:
      model_dir: Directory where model parameters, graph and etc. are saved.

    Returns:
      A list of loaded biases: one per entry of `self._hidden_units`, then the
      logits bias as the last element.
    """
    hidden_biases = []
    for idx, _ in enumerate(self._hidden_units):
      variable_name = self._scope + "/hiddenlayer_%d/biases" % idx
      hidden_biases.append(load_variable(model_dir, name=variable_name))
    logits_bias = load_variable(model_dir, name=(self._scope + "/logits/biases"))
    return hidden_biases + [logits_bias]
Example #5
0
 def bias_(self):
     """Gathers the saved bias tensors of the DNN.

     Returns:
       A list made of the hidden-layer biases (one per entry of
       `self._hidden_units`), the logits bias, and the centered bias if the
       estimator param `enable_centered_bias` is truthy.
     """
     collected = []
     for layer_no, _ in enumerate(self._hidden_units):
         collected.append(
             load_variable(self._model_dir,
                           name=("dnn/hiddenlayer_%d/biases" % layer_no)))
     collected.append(load_variable(self._model_dir, name="dnn/logits/biases"))
     if not self._estimator.params["enable_centered_bias"]:
         return collected
     collected.append(load_variable(self._model_dir, name=_CENTERED_BIAS_WEIGHT))
     return collected
Example #6
0
    def train(self,
              env,
              first_update=35,
              update_frequency=10,
              episodes=None,
              steps=None,
              hooks=None,
              max_steps=None,
              max_episodes=None):
        """Trains a model given an environment.

        Args:
            env: `Environment` instance.
            first_update: `int`. First timestep to calculate the loss and train_op for the model.
                Must not be smaller than `self.memory.batch_size`.
            update_frequency: `int`. The frequency at which to calculate the loss and train_op.
            episodes: Number of episodes for which to train model. Forwarded to
                `_prepare_train`.
            steps: Number of steps for which to train model. If `None`, train forever.
                'steps' works incrementally. If you call two times fit(steps=10) then
                training occurs in total 20 steps. If you don't want to have incremental
                behaviour please set `max_steps` instead. If set, `max_steps` must be
                `None`.
            hooks: List of `BaseMonitor` subclass instances.
                Used for callbacks inside the training loop.
            max_steps: Number of total steps for which to train model. If `None`,
                train forever. If set, `steps` must be `None`.
            max_episodes: Number of total episodes for which to train model. If `None`,
                train forever. If set, `episodes` must be `None`.

            Two calls to `fit(steps=100)` means 200 training iterations.
            On the other hand, two calls to `fit(max_steps=100)` means
            that the second call will not do any iteration since first call did all 100 steps.

        Returns:
            `self`, for chaining.

        Raises:
            ValueError: If `first_update` is smaller than the memory batch size.
        """
        if first_update < self.memory.batch_size:
            raise ValueError(
                "Cannot update the model before gathering enough data")

        if max_steps is not None:
            try:
                start_step = load_variable(self._model_dir,
                                           ops.GraphKeys.GLOBAL_STEP)
                if max_steps <= start_step:
                    logging.info(
                        'Skipping training since max_steps has already saved.')
                    return self
            except Exception:  # pylint: disable=broad-except
                # Best effort: when the global step cannot be read, fall
                # through and train. KeyboardInterrupt / SystemExit are no
                # longer swallowed here.
                pass

        # Keyword arguments keep every value bound to the parameter of the
        # same name; the positional form shifted `steps`, `hooks`, `max_steps`
        # and `max_episodes` by one slot and never forwarded `episodes`.
        hooks = self._prepare_train(first_update=first_update,
                                    update_frequency=update_frequency,
                                    episodes=episodes,
                                    steps=steps,
                                    hooks=hooks,
                                    max_steps=max_steps,
                                    max_episodes=max_episodes)
        loss = self._train_model(env=env,
                                 first_update=first_update,
                                 update_frequency=update_frequency,
                                 hooks=hooks)
        logging.info('Loss for final step: %s.', loss)
        return self
Example #7
0
  def get_variable_value(self, name):
    """Looks up a saved variable by name in `self.model_dir`.

    Args:
      name: string, name of the tensor.

    Returns:
      The value `load_variable` produces for `name` (documented by the
      original author as a Numpy array).
    """
    checkpoint_dir = self.model_dir
    return load_variable(checkpoint_dir, name)
Example #8
0
 def weights_(self):
     """Loads the weights of the linear part, skipping bias and optimizer slots.

     Every variable listed in `self._model_dir` whose name starts with
     "linear/" is loaded, except "linear/bias_weight" and names that match
     the optimizer-slot pattern built from `self._optimizer.get_name()`.

     Returns:
       The single loaded value when exactly one variable qualifies, otherwise
       a dict mapping variable name to loaded value.
     """
     slot_pattern = r".*/" + self._optimizer.get_name() + r"(_\d)?$"
     loaded = {}
     for var_name, _ in list_variables(self._model_dir):
         if not var_name.startswith("linear/"):
             continue
         if var_name == "linear/bias_weight":
             continue
         if re.match(slot_pattern, var_name):
             continue
         loaded[var_name] = load_variable(self._model_dir, var_name)
     if len(loaded) == 1:
         return next(iter(loaded.values()))
     return loaded
  def get_bias(self, model_dir):
    """Loads the `bias_weight` variable stored under this model's scope.

    Args:
      model_dir: Directory where model parameters, graph and etc. are saved.

    Returns:
      The value loaded for `<scope>/bias_weight`.
    """
    bias_name = self._scope + "/bias_weight"
    return load_variable(model_dir, name=bias_name)
Example #10
0
    def get_variable_value(self, name):
        """Reads the saved value of a named variable from `self._model_dir`.

        Args:
            name: string, name of the tensor.

        Returns:
            Whatever `load_variable` returns for `name`.
        """
        model_dir = self._model_dir
        value = load_variable(model_dir, name)
        return value
Example #11
0
    def get_variable_value(self, name):
        """Fetches a variable's saved value from `self.model_dir`.

        Args:
            name: string, name of the tensor.

        Returns:
            The value `load_variable` yields for `name` (documented by the
            original author as a Numpy array).
        """
        variable_value = load_variable(self.model_dir, name)
        return variable_value
Example #12
0
  def get_variable_value(self, name):
    """Loads the value saved for variable `name` in `self._model_dir`.

    Args:
      name: string, name of the tensor.

    Returns:
      The result of `load_variable` for that name.
    """
    source_dir = self._model_dir
    return load_variable(source_dir, name)
  def get_bias(self, model_dir):
    """Reads this model's bias from a checkpoint directory.

    Args:
      model_dir: Directory where model parameters, graph and etc. are saved.

    Returns:
      The loaded value of the variable named `<self._scope>/bias_weight`.
    """
    variable_name = "".join([self._scope, "/bias_weight"])
    bias = load_variable(model_dir, name=variable_name)
    return bias
Example #14
0
 def _initialize_deep_lab_rgb_weights(self, fn):
   """Overwrites part of the first xception conv kernel with saved weights.

   The graph variable `xception_65/entry_flow/conv1_1/weights:0` keeps its
   current values except for the first three slices along its third axis
   (presumably the RGB input channels), which are replaced by the tensor of
   the same name loaded from `fn`.

   Args:
     fn: Checkpoint location handed to `load_variable`.
   """
   target_name = "xception_65/entry_flow/conv1_1/weights:0"
   matches = [v for v in tf.global_variables() if v.name == target_name]
   assert len(matches) == 1, len(matches)
   conv1_var = matches[0]
   pretrained = load_variable(fn, "xception_65/entry_flow/conv1_1/weights")
   # Start from the variable's current value so the slices beyond index 2
   # on the third axis are left untouched.
   merged = self.session.run(conv1_var)
   merged[:, :, :3, :] = pretrained
   feed = tf.placeholder(tf.float32)
   assign_op = tf.assign(conv1_var, feed)
   self.session.run(assign_op, feed_dict={feed: merged})
Example #15
0
 def weights_(self):
   """Returns the saved weights of the linear model.

   Variables under "linear/" are loaded from `self._model_dir`, leaving out
   "linear/bias_weight" and any name matching the optimizer-slot pattern
   derived from `self._optimizer.get_name()`.

   Returns:
     The lone loaded value if only one variable qualifies, else a dict keyed
     by variable name.
   """
   slot_regex = r".*/" + self._optimizer.get_name() + r"(_\d)?$"

   def _is_weight(var_name):
     # Same short-circuit order as the chained `and`: prefix, bias, regex.
     if not var_name.startswith("linear/"):
       return False
     if var_name == "linear/bias_weight":
       return False
     return not re.match(slot_regex, var_name)

   found = {}
   for var_name, _ in list_variables(self._model_dir):
     if _is_weight(var_name):
       found[var_name] = load_variable(self._model_dir, var_name)
   return next(iter(found.values())) if len(found) == 1 else found
Example #16
0
    def train(self, input_fn=None, steps=None, hooks=None, max_steps=None):
        """Trains a model given training data `x` predictions and `y` labels.

        Args:
            input_fn: Input function returning a tuple of:
                features - `Tensor` or dictionary of string feature name to `Tensor`.
                labels - `Tensor` or dictionary of `Tensor` with labels.
            steps: Number of steps for which to train model. If `None`, train forever.
                'steps' works incrementally. If you call two times fit(steps=10) then
                training occurs in total 20 steps. If you don't want to have incremental
                behaviour please set `max_steps` instead. If set, `max_steps` must be
                `None`.
            hooks: List of `BaseMonitor` subclass instances.
                Used for callbacks inside the training loop.
            max_steps: Number of total steps for which to train model. If `None`,
                train forever. If set, `steps` must be `None`.

            Two calls to `fit(steps=100)` means 200 training iterations.
            On the other hand, two calls to `fit(max_steps=100)` means
            that the second call will not do any iteration since first call did all 100 steps.

        Returns:
            `self`, for chaining.

        Raises:
            ValueError: If both `steps` and `max_steps` are given, or if either
                one is given and is not strictly positive.
        """
        if (steps is not None) and (max_steps is not None):
            raise ValueError('Can not provide both steps and max_steps.')
        if steps is not None and steps <= 0:
            raise ValueError('Must specify steps > 0, given: {}'.format(steps))
        if max_steps is not None and max_steps <= 0:
            raise ValueError(
                'Must specify max_steps > 0, given: {}'.format(max_steps))

        if max_steps is not None:
            try:
                start_step = load_variable(self._model_dir,
                                           ops.GraphKeys.GLOBAL_STEP)
                if max_steps <= start_step:
                    logging.info(
                        'Skipping training since max_steps has already saved.')
                    return self
            except Exception:  # pylint: disable=broad-except
                # Best effort: if the saved global step cannot be read, train
                # anyway. Narrowed from a bare `except:` so KeyboardInterrupt
                # and SystemExit propagate.
                pass

        hooks = self._check_hooks(hooks)
        if steps is not None or max_steps is not None:
            hooks.append(plx_hooks.StopAtStepHook(steps, max_steps))

        loss = self._train_model(input_fn=input_fn, hooks=hooks)
        logging.info('Loss for final step: %s.', loss)
        return self
Example #17
0
    def train(self, env, first_update=35, update_frequency=10, episodes=None, steps=None,
              hooks=None, max_steps=None, max_episodes=None):
        """Trains a model given an environment.

        Args:
            env: `Environment` instance.
            first_update: `int`. First timestep to calculate the loss and train_op for the model.
                `self.memory.can_sample(first_update)` must be truthy.
            update_frequency: `int`. The frequency at which to calculate the loss and train_op.
            episodes: Number of episodes for which to train model. Forwarded to
                `_prepare_train`.
            steps: Number of steps for which to train model. If `None`, train forever.
                'steps' works incrementally. If you call two times fit(steps=10) then
                training occurs in total 20 steps. If you don't want to have incremental
                behaviour please set `max_steps` instead. If set, `max_steps` must be
                `None`.
            hooks: List of `BaseMonitor` subclass instances.
                Used for callbacks inside the training loop.
            max_steps: Number of total steps for which to train model. If `None`,
                train forever. If set, `steps` must be `None`.
            max_episodes: Number of total episodes for which to train model. If `None`,
                train forever. If set, `episodes` must be `None`.

            Two calls to `fit(steps=100)` means 200 training iterations.
            On the other hand, two calls to `fit(max_steps=100)` means
            that the second call will not do any iteration since first call did all 100 steps.

        Returns:
            `self`, for chaining.

        Raises:
            ValueError: If the memory reports it cannot sample at `first_update`.
        """
        if not self.memory.can_sample(first_update):
            raise ValueError("Cannot update the model before gathering enough data")

        if max_steps is not None:
            try:
                start_step = load_variable(self._model_dir, ops.GraphKeys.GLOBAL_STEP)
                if max_steps <= start_step:
                    logging.info('Skipping training since max_steps has already saved.')
                    return self
            except Exception:  # pylint: disable=broad-except
                # Best effort: an unreadable global step means "train anyway".
                # KeyboardInterrupt / SystemExit now propagate.
                pass

        # Forward by keyword so each value reaches the parameter of the same
        # name and `episodes` is no longer dropped.
        # NOTE(review): assumes `_prepare_train` accepts `first_update` and
        # `update_frequency` keywords - confirm against the class in use.
        hooks = self._prepare_train(
            first_update=first_update, update_frequency=update_frequency,
            episodes=episodes, steps=steps, hooks=hooks,
            max_steps=max_steps, max_episodes=max_episodes)
        loss = self._train_model(env=env, first_update=first_update,
                                 update_frequency=update_frequency, hooks=hooks)
        logging.info('Loss for final step: %s.', loss)
        return self
Example #18
0
    def _prepare_train(self, episodes=None, steps=None,
                       hooks=None, max_steps=None, max_episodes=None):
        """Builds the hook list for a training run, adding an episode stop hook.

        The step-related arguments are first handled by the parent class'
        `_prepare_train`; an episode stop hook is then appended when an
        episode budget was supplied.

        Args:
            episodes: Number of episodes to train for, or `None`.
            steps: Number of steps to train for, or `None`.
            hooks: Hooks supplied by the caller, or `None`.
            max_steps: Total number of steps to train for, or `None`.
            max_episodes: Total number of episodes to train for, or `None`.

        Returns:
            The hooks to run during training.
        """
        hooks = super(BaseAgent, self)._prepare_train(steps=steps, hooks=hooks, max_steps=max_steps)

        if max_episodes is not None:
            try:
                start_episode = load_variable(self._model_dir, tf.GraphKeys.GLOBAL_EPISODE)
                if max_episodes <= start_episode:
                    logging.info('Skipping training since max_episode has already saved.')
                    # NOTE(review): this returns `self` from a helper whose
                    # other exit returns the hooks list - confirm how callers
                    # are expected to handle the skip case.
                    return self
            except Exception:  # pylint: disable=broad-except
                # Best effort: an unreadable episode counter means "train
                # anyway". KeyboardInterrupt / SystemExit now propagate.
                pass

        hooks = self._check_hooks(hooks)
        # The episode hook must depend on the episode budget; the previous
        # check looked at `steps`/`max_steps`, so an episode-only budget never
        # installed the hook.
        if episodes is not None or max_episodes is not None:
            hooks.append(plx_hooks.StopAtEpisodeHook(episodes, max_episodes))

        return hooks
  def get_weights(self, model_dir):
    """Loads the linear part's weights, one entry per feature variable.

    Args:
      model_dir: Directory where model parameters, graph and etc. are saved.

    Returns:
      The weights found under this model's scope, excluding the bias weight
      and optimizer variables. A single match is returned bare; otherwise a
      dict maps variable name to loaded value.
    """
    variable_names = [entry_name for entry_name, _ in list_variables(model_dir)]
    weights = {}
    slot_pattern = r".*/" + self._get_optimizer().get_name() + r"(_\d)?$"
    for variable_name in variable_names:
      if not variable_name.startswith(self._scope + "/"):
        continue
      if variable_name == self._scope + "/bias_weight":
        continue
      if re.match(slot_pattern, variable_name):
        continue
      weights[variable_name] = load_variable(model_dir, variable_name)
    if len(weights) == 1:
      return next(iter(weights.values()))
    return weights
Example #20
0
  def get_weights(self, model_dir):
    """Reads the per-feature weights of the linear part from `model_dir`.

    Args:
      model_dir: Directory where model parameters, graph and etc. are saved.

    Returns:
      The weights created by this model (without the optimizer weights and
      without the bias weight): the bare value when exactly one variable
      matches, otherwise a dict keyed by variable name.
    """
    names_in_checkpoint = [name for name, _ in list_variables(model_dir)]
    optimizer_slot_regex = (
        r".*/" + self._get_optimizer().get_name() + r"(_\d)?$")

    def _wanted(name):
      # Mirrors the original `and` chain, including its evaluation order.
      return (name.startswith(self._scope + "/") and
              name != self._scope + "/bias_weight" and
              not re.match(optimizer_slot_regex, name))

    selected = {}
    for name in names_in_checkpoint:
      if _wanted(name):
        selected[name] = load_variable(model_dir, name)
    return next(iter(selected.values())) if len(selected) == 1 else selected
Example #21
0
    def train(self, input_fn=None, steps=None, hooks=None, max_steps=None):
        """Trains a model given training data `x` predictions and `y` labels.

        Args:
            input_fn: Input function returning a tuple of:
                features - `Tensor` or dictionary of string feature name to `Tensor`.
                labels - `Tensor` or dictionary of `Tensor` with labels.
            steps: Number of steps for which to train model. If `None`, train forever.
                'steps' works incrementally. If you call two times fit(steps=10) then
                training occurs in total 20 steps. If you don't want to have incremental
                behaviour please set `max_steps` instead. If set, `max_steps` must be
                `None`.
            hooks: List of `BaseMonitor` subclass instances.
                Used for callbacks inside the training loop.
            max_steps: Number of total steps for which to train model. If `None`,
                train forever. If set, `steps` must be `None`.

            Two calls to `fit(steps=100)` means 200 training iterations.
            On the other hand, two calls to `fit(max_steps=100)` means
            that the second call will not do any iteration since first call did all 100 steps.

        Returns:
            `self`, for chaining.
        """
        if max_steps is not None:
            try:
                start_step = load_variable(self._model_dir, ops.GraphKeys.GLOBAL_STEP)
                if max_steps <= start_step:
                    logging.info('Skipping training since max_steps has already saved.')
                    return self
            except Exception:  # pylint: disable=broad-except
                # Best effort: if the saved global step cannot be read, train
                # anyway. Narrowed from a bare `except:` so KeyboardInterrupt
                # and SystemExit are not swallowed.
                pass

        hooks = self._prepare_train(steps, hooks, max_steps)
        loss = self._train_model(input_fn=input_fn, hooks=hooks)
        logging.info('Loss for final step: %s.', loss)
        return self
Example #22
0
    def _prepare_train(self,
                       first_update=35,
                       update_frequency=1,
                       episodes=None,
                       steps=None,
                       hooks=None,
                       max_steps=None,
                       max_episodes=None):
        if first_update < 0:
            raise ValueError("Must specify first_update > 0, given: {}".format(
                first_update))
        if update_frequency < 0:
            raise ValueError(
                "Must specify update_frequency > 0, given: {}".format(
                    update_frequency))
        hooks = super(Agent, self)._prepare_train(steps=steps,
                                                  hooks=hooks,
                                                  max_steps=max_steps)

        if max_episodes is not None:
            try:
                start_episode = load_variable(self._model_dir,
                                              ops.GraphKeys.GLOBAL_EPISODE)
                if max_episodes <= start_episode:
                    logging.info(
                        'Skipping training since max_episode has already saved.'
                    )
                    return self
            except:  # pylint: disable=bare-except
                pass

        hooks = self._check_hooks(hooks)
        if steps is not None or max_steps is not None:
            hooks.append(plx_hooks.StopAtEpisodeHook(episodes, max_episodes))

        return hooks
Example #23
0
 def get_variable_value(self, name):
   """Returns the value `load_variable` reads for `name` in `self.model_dir`."""
   checkpoint_dir = self.model_dir
   return load_variable(checkpoint_dir, name)
Example #24
0
def _save_kernel_grid(weights, filename, rows=16, cols=6):
    """Draws the first `rows * cols` filters of `weights` and saves the grid.

    Args:
        weights: Array indexed as `weights[:, :, :, i]` to obtain filter `i`.
        filename: Path of the image file to write.
        rows: Number of subplot rows.
        cols: Number of subplot columns.
    """
    fig, axes = plt.subplots(rows, cols)
    for i in range(rows * cols):
        tile = Image.fromarray(
            (weights[:, :, :, i] * 255).astype('uint8')).resize(
                (50, 50)).convert('LA')
        cell = axes[i // cols][i % cols]
        cell.axis('off')
        cell.imshow(tile)
    plt.axis('off')
    fig.savefig(filename)
    # Release the figure; several grids are produced during one run.
    plt.close(fig)


def main():
    """Trains the PASCAL classifier and snapshots its first conv filters.

    Training runs in five rounds (1 step, then 10000 steps each). The
    `conv2d/kernel` filters are written to `conv2d_<round>.png` after rounds
    0, 1 and 2, and to `conv2d_final.png` once all rounds are done.
    """
    args = parse_args()
    BATCH_SIZE = 10

    train_data, train_labels, train_weights = load_pascal(args.data_dir,
                                                          split='trainval')
    eval_data, eval_labels, eval_weights = load_pascal(args.data_dir,
                                                       split='test')

    pascal_classifier = tf.estimator.Estimator(model_fn=partial(
        cnn_model_fn, num_classes=eval_labels.shape[1]),
                                               model_dir="./alexnet_models/")
    # Evaluation input pipeline; built here but not used by this function.
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(x={
        "x": eval_data,
        "w": eval_weights
    },
                                                       y=eval_labels,
                                                       num_epochs=1,
                                                       shuffle=False)

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(x={
        "x": train_data,
        "w": train_weights
    },
                                                        y=train_labels,
                                                        batch_size=BATCH_SIZE,
                                                        num_epochs=None,
                                                        shuffle=True)
    for given_iter in range(0, 5):
        round_steps = 1 if given_iter == 0 else 10000
        pascal_classifier.train(input_fn=train_input_fn, steps=round_steps)
        if given_iter == 0:
            print([
                name for name, _ in list_variables(pascal_classifier.model_dir)
            ])
        if given_iter <= 2:
            # Each snapshot gets its own figure. Previously rounds 1 and 2
            # drew onto the axes of round 0's figure (`axrr` vs `axarr`) and
            # saved an empty one.
            weights = load_variable(pascal_classifier.model_dir,
                                    'conv2d/kernel')
            _save_kernel_grid(weights, 'conv2d_' + str(given_iter) + '.png')
    weights = load_variable(pascal_classifier.model_dir, 'conv2d/kernel')
    _save_kernel_grid(weights, 'conv2d_final.png')
Example #25
0
 def weights_(self):
   """Loads every weight matrix of the DNN from the model directory.

   Returns:
     A list with one loaded value per entry of `self._hidden_units`,
     followed by the logits weights.
   """
   weights = []
   for layer_index, _ in enumerate(self._hidden_units):
     layer_name = "dnn/hiddenlayer_%d/weights" % layer_index
     weights.append(load_variable(self._model_dir, name=layer_name))
   weights.append(load_variable(self._model_dir, name="dnn/logits/weights"))
   return weights
Example #26
0
def _train_internal(graph,
                    output_dir,
                    train_op,
                    loss_op,
                    global_step_tensor,
                    init_op,
                    init_feed_dict,
                    init_fn,
                    log_every_steps,
                    supervisor_is_chief,
                    supervisor_master,
                    supervisor_save_model_secs,
                    keep_checkpoint_max,
                    supervisor_save_summaries_steps,
                    feed_fn,
                    steps,
                    fail_on_nan_loss,
                    monitors,
                    max_steps):
  """Runs a `Supervisor`-driven training loop. See train.

  The loop runs `train_op` and `loss_op` through the monitors until the
  supervisor asks to stop, a monitor asks to stop, or the global step reaches
  `max_steps`. On the way out the chief saves one last checkpoint and ends
  the monitors.

  Args:
    graph: Graph in which monitors are started and which the supervisor uses.
    output_dir: Non-empty directory used as supervisor `logdir`, summary
      location and source of the previously saved global step.
    train_op: Op run on every iteration; must not be `None`.
    loss_op: Loss tensor fetched on every iteration; must not be `None`.
    global_step_tensor: Global step tensor, or `None` to look it up in `graph`.
    init_op: Init op for the supervisor; the supervisor default is used when
      falsy.
    init_feed_dict: Feed dict passed to the supervisor for initialization.
    init_fn: Init function passed to the supervisor.
    log_every_steps: Minimum number of steps between two progress log lines.
    supervisor_is_chief: Whether this worker writes summaries, installs the
      default monitors and saves the final checkpoint.
    supervisor_master: Master passed to `PrepareSession`.
    supervisor_save_model_secs: Passed to the supervisor as `save_model_secs`.
    keep_checkpoint_max: Passed to `_make_saver`.
    supervisor_save_summaries_steps: `save_summary_steps` of the default
      monitors.
    feed_fn: Optional callable invoked before each step to build a feed dict.
    steps: Number of additional steps to run; converted to `max_steps` by
      adding the step read from `output_dir`. Mutually exclusive with
      `max_steps`.
    fail_on_nan_loss: If true, a NaN loss raises
      `NanLossDuringTrainingError`; otherwise it is only logged.
    monitors: Monitors to run; defaults are created on the chief when empty.
    max_steps: Global step at which to stop, or `None`.

  Returns:
    The last loss value fetched, or `None` if no step was run.

  Raises:
    ValueError: If both `steps` and `max_steps` are given, if `output_dir` is
      empty, if `train_op` or `loss_op` is `None`, or if no global step can be
      found.
  """
  if (steps is not None) and (max_steps is not None):
    raise ValueError('Can not provide both steps and max_steps.')
  if not output_dir:
    raise ValueError('Output directory should be non-empty %s.' % output_dir)
  if train_op is None:
    raise ValueError('Missing train_op.')
  if loss_op is None:
    raise ValueError('Missing loss_op.')

  with graph.as_default():
    global_step_tensor = contrib_variables.assert_or_get_global_step(
        graph, global_step_tensor)
    if global_step_tensor is None:
      raise ValueError('No "global_step" was provided or found in the graph.')

    # Get current step.
    # Falls back to 0 when the step cannot be loaded from `output_dir`.
    try:
      start_step = load_variable(output_dir, global_step_tensor.name)
    except (errors.NotFoundError, ValueError):
      start_step = 0

    summary_writer = (get_summary_writer(output_dir)
                      if supervisor_is_chief else None)

    # Add default chief monitors if none were provided.
    if not monitors:
      monitors = monitors_lib.get_default_monitors(
          loss_op=loss_op,
          summary_op=logging_ops.get_summary_op(),
          save_summary_steps=supervisor_save_summaries_steps,
          summary_writer=summary_writer) if supervisor_is_chief else []

    # TODO(ipolosukhin): Replace all functionality of Supervisor
    # with Chief-Exclusive Monitors.
    if not supervisor_is_chief:
      # Prune list of monitor to the ones runnable on all workers.
      monitors = [monitor for monitor in monitors if monitor.run_on_all_workers]

    # Turn a relative `steps` budget into an absolute `max_steps` target.
    if max_steps is None:
      max_steps = (start_step + steps) if steps else None
    # Start monitors, can create graph parts.
    for monitor in monitors:
      monitor.begin(max_steps=max_steps)

  supervisor = tf_supervisor.Supervisor(
      graph,
      init_op=init_op or tf_supervisor.Supervisor.USE_DEFAULT,
      init_feed_dict=init_feed_dict,
      is_chief=supervisor_is_chief,
      logdir=output_dir,
      saver=_make_saver(graph, keep_checkpoint_max),
      global_step=global_step_tensor,
      summary_op=None,
      summary_writer=summary_writer,
      save_model_secs=supervisor_save_model_secs,
      init_fn=init_fn)
  session = supervisor.PrepareSession(master=supervisor_master,
                                      start_standard_services=True)
  supervisor.StartQueueRunners(session)

  with session:
    get_current_step = lambda: session.run(global_step_tensor)

    # From here on the step comes from the live session, replacing the value
    # read from `output_dir` above.
    start_step = get_current_step()
    last_step = start_step
    last_log_step = start_step
    loss_value = None
    logging.info('Training steps [%d,%s)', last_step, 'inf'
                 if max_steps is None else str(max_steps))

    # Holds an unexpected exception from the loop so it can be re-raised
    # after the final checkpoint attempt.
    excinfo = None
    try:
      while not supervisor.ShouldStop() and (
          (max_steps is None) or (last_step < max_steps)):
        start_time = time.time()
        feed_dict = feed_fn() if feed_fn is not None else None

        outputs, should_stop = _run_with_monitors(
            session, last_step + 1, [train_op, loss_op], feed_dict, monitors)

        loss_value = outputs[loss_op.name]
        if np.isnan(loss_value):
          failure_message = 'Model diverged with loss = NaN.'
          if fail_on_nan_loss:
            logging.error(failure_message)
            raise monitors_lib.NanLossDuringTrainingError()
          else:
            logging.warning(failure_message)

        if should_stop:
          break

        this_step = get_current_step()

        # Only logged, not fatal: the loop keeps going with the stale step.
        if this_step <= last_step:
          logging.error(
              'Global step was not incremented by train op at step %s'
              ': new step %d', last_step, this_step)

        last_step = this_step
        is_last_step = (max_steps is not None) and (last_step >= max_steps)
        if is_last_step or (last_step - last_log_step >= log_every_steps):
          logging.info(
              'training step %d, loss = %.5f (%.3f sec/batch).',
              last_step, loss_value, float(time.time() - start_time))
          last_log_step = last_step
    except errors.OutOfRangeError as e:
      logging.warn('Got exception during tf.learn training loop possibly '
                   'due to exhausted input queue %s.', e)
    except StopIteration:
      logging.info('Exhausted input iterarator.')
    except BaseException as e:  # pylint: disable=broad-except
      # Hold on to any other exceptions while we try recording a final
      # checkpoint and summary.
      excinfo = sys.exc_info()
    finally:
      try:
        # Call supervisor.Stop() from within a try block because it re-raises
        # exceptions thrown by the supervised threads.
        supervisor.Stop(close_summary_writer=False)

        # Save one last checkpoint and summaries
        # TODO(wicke): This should be handled by Supervisor

        # In case we encountered an exception in the try block before we updated
        # last_step, update it here (again).
        last_step = get_current_step()
        if supervisor_is_chief:
          ckpt_path = supervisor.save_path
          logging.info('Saving checkpoint for step %d to checkpoint: %s.',
                       last_step, ckpt_path)
          supervisor.saver.save(session, ckpt_path, global_step=last_step)

          # Finish monitors.
          for monitor in monitors:
            monitor.end()

      # catch OutOfRangeError which is thrown when queue is out of data (and for
      # other reasons as well).
      except errors.OutOfRangeError as e:
        logging.warn('OutOfRangeError in tf.learn final checkpoint possibly '
                     'due to exhausted input queue. Note: summary_op is not '
                     'expected to trigger dequeues. %s.', e)
      except BaseException as e:  # pylint: disable=broad-except
        # If we don't already have an exception to re-raise, raise this one.
        if not excinfo:
          raise
        # Otherwise, log this one and raise the other in the finally block.
        logging.error('Got exception during tf.learn final checkpoint %s.', e)
      finally:
        # The exception captured in the training loop takes precedence.
        if excinfo:
          reraise(*excinfo)
    return loss_value
Example #27
0
def _monitored_train(graph,
                     output_dir,
                     train_op,
                     loss_op,
                     global_step_tensor=None,
                     init_op=None,
                     init_feed_dict=None,
                     init_fn=None,
                     log_every_steps=10,
                     supervisor_is_chief=True,
                     supervisor_master='',
                     supervisor_save_model_secs=600,
                     supervisor_save_model_steps=None,
                     keep_checkpoint_max=5,
                     supervisor_save_summaries_secs=None,
                     supervisor_save_summaries_steps=100,
                     feed_fn=None,
                     steps=None,
                     fail_on_nan_loss=True,
                     hooks=None,
                     max_steps=None):
  """Train a model via monitored_session.

  Given `graph`, a directory to write outputs to (`output_dir`), and some ops,
  run a training loop inside a `MonitoredSession`. The given `train_op`
  performs one step of training on the model and is expected to increment the
  `global_step_tensor`, a scalar integer tensor counting training steps. The
  `loss_op` represents the objective function of the training.

  The session is built from a `Scaffold` (holding `init_op`, `init_feed_dict`,
  `init_fn` and a sharded saver) and a set of `SessionRunHook`s. On the chief,
  `output_dir` is used as the checkpoint directory, and hooks are installed
  that count steps, write summaries and write checkpoints according to the
  `supervisor_save_*` arguments.

  Training runs `train_op` and `loss_op` together until the session reports
  that it should stop. When `steps` or `max_steps` is given, a `StopAtStepHook`
  is installed to request that stop. A `NanTensorHook` watches `loss_op`; see
  `fail_on_nan_loss`.

  Args:
    graph: A graph to train. It is expected that this graph is not in use
      elsewhere.
    output_dir: A directory to write outputs to.
    train_op: An op that performs one training step when run.
    loss_op: A scalar loss tensor.
    global_step_tensor: A tensor representing the global step. If none is given,
      one is looked up in `graph` via `assert_or_get_global_step`.
    init_op: An op that initializes the graph. Passed to `Scaffold`; if `None`,
      the `Scaffold` default is used.
    init_feed_dict: A dictionary that maps `Tensor` objects to feed values.
      This feed dictionary will be used when `init_op` is evaluated.
    init_fn: Optional callable passed to `Scaffold` to initialize the model.
    log_every_steps: Output logs regularly. The logs contain the current loss
      and global step. A `0` or negative value disables logging.
    supervisor_is_chief: Whether the current process is the chief. Only the
      chief gets the step counter, summary and checkpoint hooks.
    supervisor_master: The master string to use when preparing the session.
    supervisor_save_model_secs: Save checkpoints every this many seconds. Can
      not be specified with `supervisor_save_model_steps`. If both this and
      `supervisor_save_model_steps` are `None`, no checkpoint hook is added.
    supervisor_save_model_steps: Save checkpoints every this many steps. Can not
      be specified with `supervisor_save_model_secs`.
    keep_checkpoint_max: The maximum number of recent checkpoint files to
      keep. As new files are created, older files are deleted. If None or 0,
      all checkpoint files are kept. This is simply passed as the max_to_keep
      arg to `tf.Saver` constructor.
    supervisor_save_summaries_secs: Save summaries every
      `supervisor_save_summaries_secs` seconds when training.
    supervisor_save_summaries_steps: Save summaries every
      `supervisor_save_summaries_steps` steps when training.
    feed_fn: A function that is called every iteration to produce a `feed_dict`
      passed to `session.run` calls. Optional.
    steps: Trains for this many steps (e.g. current global step + `steps`).
    fail_on_nan_loss: If true, raise `NanLossDuringTrainingError` if `loss_op`
      evaluates to `NaN`. If false, continue training as if nothing happened.
    hooks: List of `SessionRunHook` subclass instances. Used for callbacks
      inside the training loop. They are appended after the built-in hooks.
    max_steps: Number of total steps for which to train model. If `None`,
      train forever. Two calls fit(steps=100) means 200 training iterations.
      On the other hand two calls of fit(max_steps=100) means, second call
      will not do any iteration since first call did all 100 steps.

  Returns:
    The final loss value, or `None` if no training step was run (including the
    case where the global step saved in `output_dir` has already reached
    `max_steps`).

  Raises:
    ValueError: If both `steps` and `max_steps` are not `None`.
    ValueError: If `output_dir`, `train_op`, `loss_op`, or `global_step_tensor`
      is not provided, or if `hooks` is not a list. See
      `tf.contrib.framework.get_global_step` for how we look up the global
      step if not provided explicitly.
    NanLossDuringTrainingError: If `fail_on_nan_loss` is `True`, and loss ever
      evaluates to `NaN`.
  """
  if (steps is not None) and (max_steps is not None):
    raise ValueError('Can not provide both steps and max_steps.')
  if not output_dir:
    raise ValueError('Output directory should be non-empty %s.' % output_dir)
  if train_op is None:
    raise ValueError('Missing train_op.')
  if loss_op is None:
    raise ValueError('Missing loss_op.')
  if hooks is None:
    hooks = []
  if not isinstance(hooks, list):
    raise ValueError('Hooks should be a list.')
  with graph.as_default():
    global_step_tensor = contrib_variables.assert_or_get_global_step(
        graph, global_step_tensor)
  if global_step_tensor is None:
    raise ValueError('No "global_step" was provided or found in the graph.')

  if max_steps is not None:
    # Best-effort short circuit: if a previously saved global step already
    # reached max_steps there is nothing left to train. Any failure to read it
    # (e.g. no checkpoint yet) simply means we proceed with training.
    try:
      start_step = load_variable(output_dir, global_step_tensor.name)
      if max_steps <= start_step:
        logging.info('Skipping training since max_steps has already saved.')
        return None
    # Catch Exception rather than using a bare except so that
    # KeyboardInterrupt and SystemExit are not silently swallowed here.
    except Exception:  # pylint: disable=broad-except
      pass

  # Adapted SessionRunHooks such as ExportMonitor depend on the
  # CheckpointSaverHook to be executed before they should be executed.
  # The `hooks` param comprises of deprecated monitor hooks
  # (such as ExportMonitor). Appending them after the basic_session_run_hooks.
  all_hooks = []
  with graph.as_default():
    all_hooks.append(basic_session_run_hooks.NanTensorHook(
        loss_op, fail_on_nan_loss=fail_on_nan_loss))
    if log_every_steps > 0:
      all_hooks.append(basic_session_run_hooks.LoggingTensorHook({
          'loss': loss_op.name,
          'step': global_step_tensor.name
      }, every_n_iter=log_every_steps))

    def make_saver():
      return tf_saver.Saver(
          sharded=True, max_to_keep=keep_checkpoint_max, defer_build=True,
          write_version=saver_pb2.SaverDef.V1)

    # Reuse a saver already registered in the SAVERS collection if there is
    # one; otherwise make_saver supplies the default.
    scaffold = monitored_session.Scaffold(
        init_op=init_op,
        init_feed_dict=init_feed_dict,
        init_fn=init_fn,
        saver=monitored_session.Scaffold.get_or_default('saver',
                                                        ops.GraphKeys.SAVERS,
                                                        make_saver))

    if not supervisor_is_chief:
      session_creator = monitored_session.WorkerSessionCreator(
          scaffold=scaffold,
          master=supervisor_master)
    else:
      session_creator = monitored_session.ChiefSessionCreator(
          scaffold=scaffold,
          checkpoint_dir=output_dir,
          master=supervisor_master)
      # Only the chief writes step-rate stats, summaries and checkpoints.
      summary_writer = summary_io.SummaryWriterCache.get(output_dir)
      all_hooks.append(
          basic_session_run_hooks.StepCounterHook(
              summary_writer=summary_writer))
      all_hooks.append(
          basic_session_run_hooks.SummarySaverHook(
              save_secs=supervisor_save_summaries_secs,
              save_steps=supervisor_save_summaries_steps,
              summary_writer=summary_writer,
              scaffold=scaffold))
      if (supervisor_save_model_secs is not None
          or supervisor_save_model_steps is not None):
        all_hooks.append(
            basic_session_run_hooks.CheckpointSaverHook(
                output_dir,
                save_secs=supervisor_save_model_secs,
                save_steps=supervisor_save_model_steps,
                scaffold=scaffold))

    if steps is not None or max_steps is not None:
      all_hooks.append(basic_session_run_hooks.StopAtStepHook(steps, max_steps))
    # Caller-supplied hooks go last so they run after the built-in ones.
    all_hooks.extend(hooks)

    with monitored_session.MonitoredSession(
        session_creator=session_creator,
        hooks=all_hooks) as super_sess:
      # Stays None if the session asks to stop before the first step.
      loss = None
      while not super_sess.should_stop():
        _, loss = super_sess.run([train_op, loss_op], feed_fn() if feed_fn else
                                 None)
    summary_io.SummaryWriterCache.clear()
    return loss
Example #28
0
 def bias_(self):
     """Load the variable named "linear/bias_weight" from the model directory.

     Returns:
       Whatever `load_variable` yields for that variable name when reading
       from `self._model_dir`.
     """
     bias_variable_name = "linear/bias_weight"
     return load_variable(self._model_dir, name=bias_variable_name)
Example #29
0
 def weights_(self):
   """Load the hidden-layer and logits weight variables of the DNN.

   One variable named "dnn/hiddenlayer_<i>/weights" is loaded per entry of
   `self._hidden_units`, in order, followed by "dnn/logits/weights". All are
   read from `self._model_dir` via `load_variable`.

   Returns:
     A new list with the loaded hidden-layer weights followed by the logits
     weights as its last element.
   """
   collected = []
   for layer_index, _ in enumerate(self._hidden_units):
     variable_name = "dnn/hiddenlayer_%d/weights" % layer_index
     collected.append(load_variable(self._model_dir, name=variable_name))
   # The logits layer always comes after every hidden layer.
   collected.append(load_variable(self._model_dir, name="dnn/logits/weights"))
   return collected
Example #30
0
 def get_variable_value(self, name):
     """Look up a variable by name in this object's model directory.

     Args:
       name: Name of the variable, forwarded unchanged to `load_variable`.

     Returns:
       The result of `load_variable(self.model_dir, name)`.
     """
     variable_value = load_variable(self.model_dir, name)
     return variable_value
Example #31
0
 def bias_(self):
   """Fetch the "linear/bias_weight" variable stored under `self._model_dir`.

   Returns:
     The value produced by `load_variable` for that variable name.
   """
   bias_weight = load_variable(self._model_dir, name="linear/bias_weight")
   return bias_weight