def test_get_global_step(self):
   with ops.Graph().as_default() as g:
     self.assertIsNone(training_util.get_global_step())
     variables.VariableV1(
         0,
         trainable=False,
         dtype=dtypes.int32,
         name=ops.GraphKeys.GLOBAL_STEP)
     self._assert_global_step(
         training_util.get_global_step(), expected_dtype=dtypes.int32)
   self._assert_global_step(
       training_util.get_global_step(g), expected_dtype=dtypes.int32)
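A minimal sketch of the behavior this test exercises, assuming the TF 1.x-style tf.compat.v1 API (illustrative only, not part of the test): get_global_step() returns None until a variable named GraphKeys.GLOBAL_STEP exists, so callers normally create it with get_or_create_global_step() first.

import tensorflow.compat.v1 as tf

graph = tf.Graph()
with graph.as_default():
    assert tf.train.get_global_step() is None      # nothing registered yet
    tf.train.get_or_create_global_step()           # creates and registers the step
    assert tf.train.get_global_step() is not None  # now discoverable
# The optional graph argument also works from outside the context manager.
assert tf.train.get_global_step(graph) is not None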
Example #2
    def __init__(self,
                 checkpoint_dir,
                 display_steps=100,
                 maximum_train_steps=None,
                 do_summary=True,
                 is_chief=True):
        """ Initializes the hook.

        Args:
            checkpoint_dir: A string, base directory for the checkpoint files.
            display_steps: A python integer, display every N steps.
            maximum_train_steps: A python integer, the maximum training steps.
            do_summary: Whether to save summaries when displaying.
            is_chief: Whether this is the chief process.
        """

        tf.logging.info("Create DisplayHook.")
        self._checkpoint_dir = checkpoint_dir
        # display steps
        self._display_steps = display_steps
        self._maximum_train_steps = maximum_train_steps
        self._do_summary = do_summary
        self._is_chief = is_chief  # not used now

        # display values
        global_step = training_util.get_global_step()
        display_keys = ops.get_collection(Constants.DISPLAY_KEY_COLLECTION_NAME)
        display_values = ops.get_collection(Constants.DISPLAY_VALUE_COLLECTION_NAME)
        self._display_args = dict(zip(display_keys, display_values))
        self._display_args["global_step"] = global_step
        # timer & summary writer
        self._timer = None
        self._logging_timer = None
        self._summary_writer = None
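Many of the hooks in these examples share one pattern: look up the global-step tensor in begin(), ask the session to fetch it in before_run(), and act on the fetched value in after_run(). A hedged, self-contained sketch of that pattern (the class name EveryNStepsHook and its arguments are illustrative, not taken from the example above; assumes tf.compat.v1):

import tensorflow.compat.v1 as tf

class EveryNStepsHook(tf.train.SessionRunHook):
    """Illustrative hook that invokes a callback every n global steps."""

    def __init__(self, n, callback):
        self._n = n
        self._callback = callback

    def begin(self):
        # The graph exists by now, so the step tensor can be looked up.
        self._global_step_tensor = tf.train.get_global_step()
        if self._global_step_tensor is None:
            raise RuntimeError("Global step should be created to use EveryNStepsHook.")

    def before_run(self, run_context):
        # Fetch the global step alongside whatever the caller runs.
        return tf.train.SessionRunArgs(self._global_step_tensor)

    def after_run(self, run_context, run_values):
        step = run_values.results
        if step % self._n == 0:
            self._callback(step)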
Example #3
 def __init__(self):
   global_step = training_util.get_global_step()
   if global_step:
     self._global_step_incr_op = state_ops.assign_add(
         global_step, 1, name="global_step_incr").op
   else:
     self._global_step_incr_op = None
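A hedged sketch of driving such an increment op directly in a session (assuming tf.compat.v1 graph mode; the surrounding hook class is not reproduced here):

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()
global_step = tf.train.get_or_create_global_step()
# The same kind of increment op as in the example above.
global_step_incr_op = tf.assign_add(global_step, 1, name="global_step_incr").op

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(global_step_incr_op)
    sess.run(global_step_incr_op)
    print(sess.run(global_step))  # 2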
Example #4
 def function(tag, scope):
   if bad_color is None:
     bad_color_ = constant_op.constant([255, 0, 0, 255], dtype=dtypes.uint8)
   else:
     bad_color_ = bad_color
   gen_summary_ops.write_image_summary(
       context.context().summary_writer_resource,
       training_util.get_global_step(), tag, tensor, bad_color_, max_images,
       name=scope)
Example #5
  def after_create_session(self, training_session, coord):  # pylint: disable=unused-argument
    # N.B. We have to pull the global step here to avoid it being unavailable
    # at checkpoint time; the graph has been frozen at that point.
    if training_util.get_global_step() is None and self.saver() is not None:
      raise ValueError(
          'Saver defined but no global step.  Run `get_or_create_global_step()`'
          ' in your model definition to allow checkpointing.')

    with self._graph.as_default():
      logging.info('Installing graceful shutdown hook.')
      self._session = _clone_session(training_session, self._graph)
      self._workers = WorkerHeartbeatManager.from_devices(
          self._session, all_worker_devices(self._session))
      self._heartbeat_supported = self._workers.num_workers() > 0
      if self._heartbeat_supported:
        try:
          self._workers.configure(
              event_pb2.WorkerHeartbeatRequest(
                  shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR))
        except errors.InvalidArgumentError:
          logging.warn(
              'TPU device does not support heartbeats. Failure '
              'handling will be disabled.')
          self._heartbeat_supported = False
      else:
        logging.warn(
            'No workers support heartbeats. Failure handling will be disabled.')
Example #6
    def _ModelFn(features, labels, mode):
      if is_training:
        logits_out = self._BuildGraph(features)
      else:
        graph_def = self._GetGraphDef(use_trt, batch_size, model_dir)
        logits_out = importer.import_graph_def(
            graph_def,
            input_map={INPUT_NODE_NAME: features},
            return_elements=[OUTPUT_NODE_NAME + ':0'],
            name='')[0]

      loss = losses.sparse_softmax_cross_entropy(
          labels=labels, logits=logits_out)
      summary.scalar('loss', loss)

      classes_out = math_ops.argmax(logits_out, axis=1, name='classes_out')
      accuracy = metrics.accuracy(
          labels=labels, predictions=classes_out, name='acc_op')
      summary.scalar('accuracy', accuracy[1])

      if mode == ModeKeys.EVAL:
        return EstimatorSpec(
            mode, loss=loss, eval_metric_ops={'accuracy': accuracy})
      elif mode == ModeKeys.TRAIN:
        optimizer = AdamOptimizer(learning_rate=1e-2)
        train_op = optimizer.minimize(loss, global_step=get_global_step())
        return EstimatorSpec(mode, loss=loss, train_op=train_op)
Example #7
 def begin(self):
   self._last_saved_step = None
   self._request_summary = True
   self._global_step_tensor = training_util.get_global_step()
   if self._global_step_tensor is None:
     raise RuntimeError(
         "Global step should be created to use SummarySaverHook.")
Example #8
 def begin(self):
   self._global_step_tensor = training_util.get_global_step()
   if self._global_step_tensor is None:
     raise RuntimeError(
         "Global step should be created to use CheckpointSaverHook.")
   for l in self._listeners:
     l.begin()
Example #9
  def get_updates(self, loss, params):
    if distribute_lib.has_distribution_strategy():
      self.updates = []

      if not params:
        # After the model vars have been created, the second call to get_updates
        # is called with params as an empty list. This ensures that we call
        # compute_gradients with params=None.
        grads = self.optimizer.compute_gradients(loss)
      else:
        grads = self.optimizer.compute_gradients(loss, params)
      global_step = training_util.get_global_step()
      opt_update = self.optimizer.apply_gradients(grads, global_step)
    else:
      if not params:
        self.updates = [state_ops.assign_add(self.iterations, 1)]
        return self.updates

      # Updates list starts out empty because the iterations variable is
      # incremented in optimizer.apply_gradients()
      self.updates = []
      grads = self.optimizer.compute_gradients(loss, params)
      opt_update = self.optimizer.apply_gradients(
          grads, global_step=self.iterations)

    self.updates.append(opt_update)
    return self.updates
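The graph-mode branch above depends on apply_gradients incrementing whatever tensor is passed as global_step. A small sketch of that contract in isolation (assuming tf.compat.v1; the variable and loss are illustrative):

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()
w = tf.get_variable("w", initializer=1.0)
loss = tf.square(w)
global_step = tf.train.get_or_create_global_step()

opt = tf.train.GradientDescentOptimizer(0.1)
grads = opt.compute_gradients(loss, [w])
# apply_gradients bumps global_step by one each time the returned op runs.
train_op = opt.apply_gradients(grads, global_step=global_step)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(train_op)
    print(sess.run(global_step))  # 1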
Example #10
 def begin(self):
   self._last_saved_time = None
   self._last_saved_step = None
   self._global_step_tensor = training_util.get_global_step()
   if self._global_step_tensor is None:
     raise RuntimeError(
         "Global step should be created to use CheckpointSaverHook.")
Example #11
def record_summaries_every_n_global_steps(n):
  """Sets the should_record_summaries Tensor to true if global_step % n == 0."""
  collection_ref = ops.get_collection_ref(_SHOULD_RECORD_SUMMARIES_NAME)
  old = collection_ref[:]
  collection_ref[:] = [training_util.get_global_step() % n == 0]
  yield
  collection_ref[:] = old
Example #12
 def before_run(self, run_context):
   loss = (self.loss_op if self.loss_op is not None else
           run_context.session.graph.get_operation_by_name(
               LOSS_NAME).outputs[0])
   return session_run_hook.SessionRunArgs(
       {'global_step': training_util.get_global_step(),
        'current_loss': loss})
Example #13
 def begin(self):
   self._last_reported_time = None
   self._last_reported_step = None
   self._global_step_tensor = training_util.get_global_step()
   if self._global_step_tensor is None:
     raise RuntimeError(
         "Global step should be created to use StepCounterHook.")
Example #14
  def _train_op_fn(loss):
    """Returns the op to optimize the loss."""
    train_ops = []
    global_step = training_util.get_global_step()
    if dnn_logits is not None:
      train_ops.append(
          dnn_optimizer.minimize(
              loss,
              var_list=ops.get_collection(
                  ops.GraphKeys.TRAINABLE_VARIABLES,
                  scope=dnn_parent_scope)))
    if linear_logits is not None:
      train_ops.append(
          linear_optimizer.minimize(
              loss,
              var_list=ops.get_collection(
                  ops.GraphKeys.TRAINABLE_VARIABLES,
                  scope=linear_parent_scope)))

    train_op = control_flow_ops.group(*train_ops)
    with ops.control_dependencies([train_op]):
      with ops.colocate_with(global_step):
        return state_ops.assign_add(global_step, 1)

    return head.create_estimator_spec(
        features=features,
        mode=mode,
        labels=labels,
        train_op_fn=_train_op_fn,
        logits=logits)
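Several _train_op_fn variants in this listing follow the same idea: group the per-component minimize ops, then increment the global step exactly once after all of them have run. A condensed, hypothetical sketch of just that pattern (assuming tf.compat.v1; make_train_op and its arguments are illustrative):

import tensorflow.compat.v1 as tf

def make_train_op(loss, optimizers_and_var_lists):
    """Illustrative: several minimize ops, one global-step increment."""
    global_step = tf.train.get_global_step()
    train_ops = [
        opt.minimize(loss, var_list=var_list)  # note: no global_step passed here
        for opt, var_list in optimizers_and_var_lists
    ]
    grouped = tf.group(*train_ops)
    with tf.control_dependencies([grouped]):
        with tf.colocate_with(global_step):
            # Bump the step once, after every optimizer has applied its update.
            return tf.assign_add(global_step, 1)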
Example #15
 def begin(self):
   if self._summary_writer is None and self._output_dir:
     self._summary_writer = SummaryWriterCache.get(self._output_dir)
   self._global_step_tensor = training_util._get_or_create_global_step_read()  # pylint: disable=protected-access
   if self._global_step_tensor is None:
     raise RuntimeError(
         "Global step should be created to use StepCounterHook.")
   self._summary_tag = training_util.get_global_step().op.name + "/sec"
Example #16
  def compute_gradients(self, loss, *args, **kwargs):
    # Record current global step for worker.
    with ops.colocate_with(loss):
      self._local_step = training_util.get_global_step() + 0

    with ops.control_dependencies([self._local_step]):
      loss = gen_array_ops.identity(loss)
      return self._opt.compute_gradients(loss, *args, **kwargs)
Example #17
 def begin(self):
   self._summary_writer = SummaryWriterCache.get(self._checkpoint_dir)
   self._global_step_tensor = training_util.get_global_step()
   if self._global_step_tensor is None:
     raise RuntimeError(
         "Global step should be created to use CheckpointSaverHook.")
   for l in self._listeners:
     l.begin()
Example #18
 def begin(self):
   if self._summary_writer is None and self._output_dir:
     self._summary_writer = SummaryWriterCache.get(self._output_dir)
   self._global_step_tensor = training_util.get_global_step()
   if self._global_step_tensor is None:
     raise RuntimeError(
         "Global step should be created to use StepCounterHook.")
   self._summary_tag = self._global_step_tensor.op.name + "/sec"
Example #19
 def model_fn(features, labels):
   # dummy variable:
   _ = variables_lib.Variable([0.])
   _ = labels
   predictions = features["x"]
   loss = constant_op.constant([2.])
   update_global_step = training_util.get_global_step().assign_add(1)
   return predictions, loss, update_global_step
Example #20
def record_summaries_every_n_global_steps(n):
  """Sets the should_record_summaries Tensor to true if global_step % n == 0."""
  collection_ref = ops.get_collection_ref(_SHOULD_RECORD_SUMMARIES_NAME)
  old = collection_ref[:]
  with ops.device("cpu:0"):
    collection_ref[:] = [math_ops.equal(training_util.get_global_step() % n, 0)]
  yield
  collection_ref[:] = old
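Both versions of record_summaries_every_n_global_steps are bodies of a generator-based context manager: they swap the should-record condition into a collection, yield, and restore the previous value. A self-contained sketch of that swap-and-restore pattern (hypothetical names; assumes tf.compat.v1 and an existing global step):

import contextlib
import tensorflow.compat.v1 as tf

_SHOULD_RECORD = "should_record_summaries_demo"  # illustrative collection name

@contextlib.contextmanager
def record_every_n_steps(n):
    """Sketch: temporarily replace the recording condition, then restore it."""
    collection_ref = tf.get_collection_ref(_SHOULD_RECORD)
    old = collection_ref[:]
    collection_ref[:] = [tf.equal(tf.train.get_global_step() % n, 0)]
    try:
        yield
    finally:
        collection_ref[:] = old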
Example #21
 def begin(self):
   if self._summary_writer is None and self._output_dir:
     self._summary_writer = SummaryWriterCache.get(self._output_dir)
   self._next_step = None
   self._global_step_tensor = training_util.get_global_step()
   if self._global_step_tensor is None:
     raise RuntimeError(
         "Global step should be created to use SummarySaverHook.")
Example #22
  def begin(self):
    self._global_step_tensor = training_util.get_global_step()
    if self._global_step_tensor is None:
      raise RuntimeError("Global step should be created.")

    if self._override_global_step_value is not None:
      self._override_global_step_op = state_ops.assign(
          self._global_step_tensor, self._override_global_step_value)
Example #23
  def _train_op_fn(loss):
    global_step = training_util.get_global_step()
    assert global_step
    train_step = model.get_train_step(loss)

    with ops.control_dependencies(train_step):
      with ops.get_default_graph().colocate_with(global_step):
        return state_ops.assign_add(global_step, 1).op
Example #24
 def _train_op_fn(unused_loss):
   global_step = training_util.get_global_step()
   sdca_model, train_op = optimizer.get_train_step(
       columns_to_variables, weight_column_name, loss_type, features, labels,
       global_step)
   if update_weights_hook is not None:
     update_weights_hook.set_parameters(sdca_model, train_op)
   return train_op
Example #25
 def _train_op_fn(loss):
   global_step = training_util.get_global_step()
   my_vars = ops.get_collection(parent_scope)
   grads = gradients.gradients(loss, my_vars)
   if gradient_clip_norm:
     grads, _ = clip_ops.clip_by_global_norm(grads, gradient_clip_norm)
   return (_get_optimizer(optimizer).apply_gradients(
       zip(grads, my_vars), global_step=global_step))
Example #26
 def function(tag, scope):
   if bad_color is None:
     bad_color_ = constant_op.constant([255, 0, 0, 255], dtype=dtypes.uint8)
   else:
     bad_color_ = bad_color
   # Note the identity to move the tensor to the CPU.
   return gen_summary_ops.write_image_summary(
       context.context().summary_writer_resource,
       training_util.get_global_step(), tag, array_ops.identity(tensor),
       bad_color_,
       max_images, name=scope)
Example #27
 def record():
   if bad_color is None:
     bad_color_ = constant_op.constant([255, 0, 0, 255], dtype=dtypes.uint8)
   else:
     bad_color_ = bad_color
   with summary_op_util.summary_scope(
       name, family, values=[tensor]) as (tag, scope):
     gen_summary_ops.write_image_summary(
         context.context().summary_writer_resource,
         training_util.get_global_step(), tag, tensor, bad_color_, max_images,
         name=scope)
Example #28
 def function(tag, scope):
   gen_summary_ops.write_audio_summary(
       context.context().summary_writer_resource,
       training_util.get_global_step(),
       tag,
       tensor,
       sample_rate=sample_rate,
       max_outputs=max_outputs,
       name=scope)
Example #29
def linear_model_fn_with_model_fn_ops(features, labels, mode):
  """Same as linear_model_fn, but returns `ModelFnOps`."""
  assert mode in (model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
                  model_fn.ModeKeys.INFER)
  prediction, loss = (models.linear_regression_zero_init(features, labels))
  train_op = optimizers.optimize_loss(
      loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
  return model_fn.ModelFnOps(
      mode=mode, predictions=prediction, loss=loss, train_op=train_op)
Example #30
  def test1Workers2Period(self):
    num_workers = 2
    steps = 2
    num_ps = 1
    cluster, workers, _ = create_local_cluster(num_workers=num_workers,
                                               num_ps=num_ps)

    sessions, graphs, train_ops = _get_workers(num_workers,
                                               steps,
                                               workers)

    var_0 = graphs[0].get_tensor_by_name('v0:0')
    var_1 = graphs[0].get_tensor_by_name('v1:0')
    global_step = training_util.get_global_step(graphs[0])
    global_var_0 = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v0:0")
    global_var_1 = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v1:0")

    # Verify the initialized value.
    self.assertAllEqual(0.0, sessions[0].run(var_0))
    self.assertAllEqual(1.0, sessions[0].run(var_1))
    self.assertAllEqual(0.0, sessions[0].run(global_var_0))
    self.assertAllEqual(1.0, sessions[0].run(global_var_1))
    self.assertAllEqual(0, sessions[0].run(global_step))

    sessions[0].run(train_ops[0])
    sessions[1].run(train_ops[1])

    self.assertAllEqual(1.0, sessions[0].run(var_0))
    self.assertAllEqual(2.0, sessions[0].run(var_1))
    self.assertAllEqual(0.0, sessions[0].run(global_var_0))
    self.assertAllEqual(1.0, sessions[0].run(global_var_1))
    self.assertAllEqual(0, sessions[0].run(global_step))

    # iteration 2, global variable update
    thread_0 = self.checkedThread(
      target=self._run, args=(train_ops[0], sessions[0]))
    thread_1 = self.checkedThread(
      target=self._run, args=(train_ops[1], sessions[1]))
    thread_0.start()
    thread_1.start()
    thread_0.join()
    thread_1.join()

    self.assertAllEqual(3.0, sessions[0].run(var_0))
    self.assertAllEqual(4.0, sessions[0].run(var_1))
    self.assertAllEqual(3.0, sessions[0].run(global_var_0))
    self.assertAllEqual(4.0, sessions[0].run(global_var_1))
    self.assertAllEqual(1, sessions[0].run(global_step))

    # iteration 3
    sessions[0].run(train_ops[0])

    self.assertAllEqual(4.0, sessions[0].run(var_0))
    self.assertAllEqual(5.0, sessions[0].run(var_1))
    self.assertAllEqual(3.0, sessions[0].run(global_var_0))
    self.assertAllEqual(4.0, sessions[0].run(global_var_1))
    self.assertAllEqual(1, sessions[0].run(global_step))
Example #31
def _export_graph(graph, saver, checkpoint_path, export_dir,
                  default_graph_signature, named_graph_signatures,
                  exports_to_keep):
  """Exports graph via session_bundle, by creating a Session."""
  with graph.as_default():
    with tf_session.Session('') as session:
      variables.local_variables_initializer()
      lookup_ops.tables_initializer()
      saver.restore(session, checkpoint_path)

      export = exporter.Exporter(saver)
      export.init(
          init_op=control_flow_ops.group(
              variables.local_variables_initializer(),
              lookup_ops.tables_initializer()),
          default_graph_signature=default_graph_signature,
          named_graph_signatures=named_graph_signatures,
          assets_collection=ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS))
      return export.export(export_dir, training_util.get_global_step(),
                           session, exports_to_keep=exports_to_keep)
Example #32
 def testGlobalStepIsNotWrappedOnOneGPU(self, use_core_strategy):
     strategy, _, _ = create_test_objects(
         num_gpus=1, use_core_strategy=use_core_strategy)
     with ops.Graph().as_default(), strategy.scope():
         created_step = training_util.create_global_step()
         get_step = training_util.get_global_step()
         self.assertEqual(
             created_step,
             get_step,
             msg=('created_step %s type %s vs. get_step %s type %s' %
                  (id(created_step), created_step.__class__.__name__,
                   id(get_step), get_step.__class__.__name__)))
         self.assertIs(resource_variable_ops.ResourceVariable,
                       type(created_step))
         self.assertIs(resource_variable_ops.ResourceVariable,
                       type(get_step))
         # All variables have an _distribute_strategy parameter. Only variable
         # subclasses in distribution strategy expose it publicly.
         self.assertFalse(hasattr(strategy, 'distribute_strategy'))
         self.assertIs(strategy, created_step._distribute_strategy)
Example #33
 def _train_op_fn(loss):
     """Returns the op to optimize the loss."""
     train_ops = []
     global_step = training_util.get_global_step()
     if dnn_logits is not None:
         train_ops.append(
             dnn_optimizer.minimize(loss,
                                    var_list=ops.get_collection(
                                        ops.GraphKeys.TRAINABLE_VARIABLES,
                                        scope=dnn_parent_scope)))
     if fm_first_logits is not None or fm_second_logits is not None:
         train_ops.append(
             linear_optimizer.minimize(
                 loss,
                 var_list=ops.get_collection(
                     ops.GraphKeys.TRAINABLE_VARIABLES,
                     scope=fm_parent_scope)))
     train_op = control_flow_ops.group(*train_ops)
     with ops.control_dependencies([train_op]):
         return distribute_lib.increment_var(global_step)
Example #34
    def __init__(self,
                 checkpoint_dir,
                 save_checkpoint_steps=1000,
                 saver=None,
                 pretrain_model=None,
                 problem_name=None,
                 model_name="njunmt.models.SequenceToSequence",
                 do_summary=True,
                 is_chief=True):
        """ Initializes the hook.

        Args:
            checkpoint_dir: A string, base directory for the checkpoint files.
            save_checkpoint_steps: A python integer, save every N steps.
            saver: `Saver` object, used for saving.
            pretrain_model: The pretrained model dir.
            problem_name: A string.
            model_name: The model name.
            do_summary: Whether to save summaries.
            is_chief: Whether this is the chief process.
        """
        tf.logging.info("Create CheckpointSaverHook.")
        if saver is None:
            saver = get_saver_or_default(max_to_keep=8)  # pylint: disable=protected-access
        self._saver = saver
        self._checkpoint_dir = checkpoint_dir
        self._save_path = os.path.join(checkpoint_dir,
                                       Constants.MODEL_CKPT_FILENAME)
        self._pretrain_model = pretrain_model
        self._problem_name = problem_name
        self._model_name = model_name
        # save every n steps
        self._save_checkpoint_steps = save_checkpoint_steps
        # variable for session.run
        self._global_step = training_util.get_global_step()
        # for after create session
        self._do_summary = do_summary
        self._is_chief = is_chief
        # timer & summary writer
        self._timer = None
        self._summary_writer = None
Example #35
    def after_run(self, run_context, run_values):
        del run_values

        lame_workers = self._workers.lame_workers()
        if lame_workers:
            logging.info('ShutdownHook: lame workers found: %s', lame_workers)

            if self.saver():
                logging.info('ShutdownHook: saving checkpoint to %s',
                             self._checkpoint_prefix)
                self.saver().save(
                    run_context.session,
                    self._checkpoint_prefix,
                    global_step=training_util.get_global_step(),
                    write_state=True,
                )
            else:
                logging.info('ShutdownHook: no Saver defined.')

            for fn in self._on_shutdown_hooks:
                fn(run_context, self._workers, lame_workers)
Example #36
def create_estimator_spec_train_op(head_name,
                                   optimizer=None,
                                   train_op_fn=None,
                                   update_ops=None,
                                   regularized_training_loss=None):
    """Create train_op for estimator_spec."""
    with ops.name_scope(head_name, 'head'):
        if optimizer is not None:
            if train_op_fn is not None:
                raise ValueError(
                    'train_op_fn and optimizer cannot both be set.')
            train_op = optimizer.minimize(
                regularized_training_loss,
                global_step=training_util.get_global_step())
        elif train_op_fn is not None:
            train_op = train_op_fn(regularized_training_loss)
        else:
            raise ValueError('train_op_fn and optimizer cannot both be None.')
        if update_ops is not None:
            train_op = control_flow_ops.group(train_op, *update_ops)
        return train_op
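A hedged usage sketch of the optimizer branch of create_estimator_spec_train_op above (assumes tf.compat.v1 graph mode, a trainable variable, and an existing global step; the names demo_w and demo_head are illustrative):

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()
w = tf.get_variable("demo_w", initializer=1.0)
tf.train.get_or_create_global_step()
demo_loss = tf.square(w)

train_op = create_estimator_spec_train_op(
    head_name="demo_head",
    optimizer=tf.train.AdagradOptimizer(0.1),
    train_op_fn=None,  # must stay None when an optimizer is supplied
    update_ops=None,
    regularized_training_loss=demo_loss)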
Example #37
    def _train_op_fn(loss):
        """Returns the op to optimize the loss."""
        train_ops = []
        global_step = training_util.get_global_step()
        if dnn_logits is not None:
            train_ops.append(
                dnn_optimizer.minimize(loss,
                                       var_list=ops.get_collection(
                                           ops.GraphKeys.TRAINABLE_VARIABLES,
                                           scope=dnn_absolute_scope)))
        if linear_logits is not None:
            train_ops.append(
                linear_optimizer.minimize(
                    loss,
                    var_list=ops.get_collection(
                        ops.GraphKeys.TRAINABLE_VARIABLES,
                        scope=linear_absolute_scope)))

        train_op = control_flow_ops.group(*train_ops)
        with ops.control_dependencies([train_op]):
            return state_ops.assign_add(global_step, 1).op
Example #38
  def after_create_session(self, training_session, coord):  # pylint: disable=unused-argument
    # N.B. We have to pull the global step here to avoid it being unavailable
    # at checkpoint time; the graph has been frozen at that point.
    if training_util.get_global_step() is None and self.saver() is not None:
      raise ValueError(
          'Saver defined but no global step.  Run `get_or_create_global_step()`'
          ' in your model definition to allow checkpointing.')

    with self._graph.as_default():
      logging.info('Installing graceful shutdown hook.')
      self._session = _clone_session(training_session, self._graph)
      self._workers = WorkerHeartbeatManager.from_devices(
          self._session, all_worker_devices(self._session))
      self._heartbeat_supported = self._workers.num_workers() > 0
      if self._heartbeat_supported:
        self._workers.configure(
            event_pb2.WorkerHeartbeatRequest(
                shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR))
      else:
        logging.warn(
            'No workers support heartbeats. Failure handling will be disabled.')
Example #39
  def begin(self):
    #pylint: disable=W0201
    features = graph_utils.get_dict_from_collection("features")
    labels = graph_utils.get_dict_from_collection("labels")
    num_source_tokens = tf.reduce_sum(features["source_len"])
    num_target_tokens = tf.reduce_sum(labels["target_len"])

    self._tokens_last_step = 0
    self._global_step_tensor = training_util.get_global_step()
    self._num_tokens_tensor = num_source_tokens + num_target_tokens

    # Create a variable that stores how many tokens have been processed
    # Should be global for distributed training
    with tf.variable_scope("tokens_counter"):
      self._tokens_processed_var = tf.get_variable(
          name="count",
          shape=[],
          dtype=tf.int32,
          initializer=tf.constant_initializer(0, dtype=tf.int32))
      self._tokens_processed_add = tf.assign_add(
          self._tokens_processed_var, self._num_tokens_tensor)
Example #40
  def _make_training_op(training_loss):
    """Training op for the DNN linear combined model."""
    train_ops = []
    global_step = training_util.get_global_step()
    if dnn_logits is not None:
      train_ops.append(
          optimizers.optimize_loss(
              loss=training_loss,
              global_step=global_step,
              learning_rate=_DNN_LEARNING_RATE,
              optimizer=dnn_optimizer,
              gradient_multipliers=_extract_embedding_lr_multipliers(  # pylint: disable=protected-access
                  embedding_lr_multipliers, dnn_parent_scope,
                  dnn_input_scope.name),
              clip_gradients=gradient_clip_norm,
              variables=ops.get_collection(dnn_parent_scope),
              name=dnn_parent_scope,
              # Empty summaries, because head already logs "loss" summary.
              summaries=[],
              increment_global_step=not fix_global_step_increment_bug))
    if linear_logits is not None:
      train_ops.append(
          optimizers.optimize_loss(
              loss=training_loss,
              global_step=global_step,
              learning_rate=_linear_learning_rate(len(linear_feature_columns)),
              optimizer=linear_optimizer,
              clip_gradients=gradient_clip_norm,
              variables=ops.get_collection(linear_parent_scope),
              name=linear_parent_scope,
              # Empty summaries, because head already logs "loss" summary.
              summaries=[],
              increment_global_step=not fix_global_step_increment_bug))

    train_op = control_flow_ops.group(*train_ops)
    if fix_global_step_increment_bug:
      with ops.control_dependencies([train_op]):
        with ops.colocate_with(global_step):
          return state_ops.assign_add(global_step, 1).op
    return train_op
Example #41
def audio(name,
          tensor,
          sample_rate,
          max_outputs,
          family=None,
          global_step=None):
    """Writes an audio summary if possible."""
    if global_step is None:
        global_step = training_util.get_global_step()

    def function(tag, scope):
        # Note the identity to move the tensor to the CPU.
        return gen_summary_ops.write_audio_summary(
            context.context().summary_writer_resource,
            global_step,
            tag,
            array_ops.identity(tensor),
            sample_rate=sample_rate,
            max_outputs=max_outputs,
            name=scope)

    return summary_writer_function(name, tensor, function, family=family)
Example #42
    def test1WorkerNegativeStaleness(self):
        num_workers = 1
        sessions, graphs, train_ops = _get_workers(num_workers, -1)
        with graphs[0].as_default():
            sessions[0].run(variables.global_variables_initializer())
        global_step = training_util.get_global_step(graphs[0])
        var_0 = graphs[0].get_tensor_by_name('v0:0')
        var_1 = graphs[0].get_tensor_by_name('v1:0')
        stale_counter = graphs[0].get_tensor_by_name('stale_counter:0')
        # Verify the initialized value.
        self.assertAllEqual(0.0, sessions[0].run(var_0))
        self.assertAllEqual(1.0, sessions[0].run(var_1))
        self.assertAllEqual(0.0, sessions[0].run(stale_counter))
        self.assertAllEqual(0, sessions[0].run(global_step))

        sessions[0].run(train_ops[0])

        # Verify no updates because max staleness is negative.
        self.assertAllEqual(0, sessions[0].run(global_step))
        self.assertAllEqual(1.0, sessions[0].run(stale_counter))
        self.assertAllEqual(0.0, sessions[0].run(var_0))
        self.assertAllEqual(1.0, sessions[0].run(var_1))
Example #43
def _minimize_towers(tower_specs, optimizer):
    """Aggregate and apply gradients for computed losses."""
    grad_lists = {}
    for tower_spec in tower_specs:
        with ops_lib.device(tower_spec.loss.device):
            variables = variables_lib.trainable_variables()
            gradients = gradients_lib.gradients(tower_spec.loss, variables)

            for var, grad in zip(variables, gradients):
                if grad is not None:
                    grad_lists.setdefault(var, []).append(grad)

    aggregated_grads = []
    with ops_lib.name_scope('gradient_aggregating'):
        for var, grads in six.iteritems(grad_lists):
            grad = _compute_sum_on_device(grads, var.device)
            aggregated_grads.append((grad, var))

    train_op = optimizer.apply_gradients(
        aggregated_grads, global_step=training_util.get_global_step())

    return train_op
Example #44
 def _prepare_train_hooks(self,
                          epochs,
                          steps_per_epoch,
                          initial_epoch=0):
     saver_hooks = [h for h in self.train_hooks if isinstance(h, CkPtSaverHook)]
     if not saver_hooks:
         self.train_hooks.append(CkPtSaverHook(
             file_dir=self._checkpoint_dir,
             global_step_tensor=training_util.get_global_step(),
             save_steps=steps_per_epoch))
     if self.summary_ops:
         self.train_hooks.append(SummaryHook(
             save_steps=steps_per_epoch,
             summary_op=self.summary_ops,
             output_dir=self._checkpoint_dir + '/train'))
     self.train_hooks.append(ProgressHook(
         title='Training',
         target=steps_per_epoch,
         epochs=epochs,
         initial_epoch=initial_epoch,
         metric_names=self.metric_names,
         stateful_metric_names=self.stateful_metric_names))
Example #45
    def _merge_train(self, all_estimator_spec, optimizer, train_op_fn):
        """Merges list of `EstimatorSpec` for training.

    Args:
      all_estimator_spec: list of `EstimatorSpec` for the individual heads.
      optimizer: `Optimizer` instance to create train op. See
        `create_estimator_spec` documentation for more details.
      train_op_fn: Function to create train op. Used if `optimizer` is `None`.

    Returns:
      `EstimatorSpec` that merges all heads for TRAIN.

    Raises:
      ValueError: If both `train_op_fn` and `optimizer` are `None` in TRAIN
        mode.
    """
        losses = []
        metrics = {}
        for spec in all_estimator_spec:
            losses.append(spec.loss)
            # Metric keys already contain head.name.
            metrics.update(spec.eval_metric_ops or {})
        loss = _merge_losses(losses, self._head_weights)
        if optimizer is not None:
            if train_op_fn is not None:
                raise ValueError(
                    'train_op_fn and optimizer cannot both be set.')
            train_op = optimizer.minimize(
                loss, global_step=training_util.get_global_step())
        elif train_op_fn is not None:
            train_op = train_op_fn(loss)
        else:
            raise ValueError('train_op_fn and optimizer cannot both be None.')

        return model_fn.EstimatorSpec(mode=model_fn.ModeKeys.TRAIN,
                                      loss=loss,
                                      train_op=train_op,
                                      eval_metric_ops=metrics)
Example #46
def generic(name, tensor, metadata=None, family=None, global_step=None):
    """Writes a tensor summary if possible."""
    if global_step is None:
        global_step = training_util.get_global_step()

    def function(tag, scope):
        if metadata is None:
            serialized_metadata = constant_op.constant("")
        elif hasattr(metadata, "SerializeToString"):
            serialized_metadata = constant_op.constant(
                metadata.SerializeToString())
        else:
            serialized_metadata = metadata
        # Note the identity to move the tensor to the CPU.
        return gen_summary_ops.write_summary(
            context.context().summary_writer_resource,
            global_step,
            array_ops.identity(tensor),
            tag,
            serialized_metadata,
            name=scope)

    return summary_writer_function(name, tensor, function, family=family)
Example #47
    def __init__(self,
                 checkpoint_dir,
                 display_steps=100,
                 maximum_train_steps=None,
                 do_summary=True,
                 is_chief=True):
        """ Initializes the hook.

        Args:
            checkpoint_dir: A string, base directory for the checkpoint files.
            display_steps: A python integer, display every N steps.
            maximum_train_steps: A python integer, the maximum training steps.
            do_summary: Whether to save summaries when displaying.
            is_chief: Whether this is the chief process.
        """

        tf.logging.info("Create DisplayHook.")
        self._checkpoint_dir = checkpoint_dir
        # display steps
        self._display_steps = display_steps
        self._maximum_train_steps = maximum_train_steps
        self._do_summary = do_summary
        self._is_chief = is_chief  # not used now

        # display values
        global_step = training_util.get_global_step()
        display_keys = ops.get_collection(
            Constants.DISPLAY_KEY_COLLECTION_NAME)
        display_values = ops.get_collection(
            Constants.DISPLAY_VALUE_COLLECTION_NAME)
        self._display_args = dict(zip(display_keys, display_values))
        self._display_args["global_step"] = global_step
        # timer & summary writer
        self._timer = None
        self._logging_timer = None
        self._summary_writer = None
Example #48
 def model_fn(features, labels, mode):
   del features, labels
   global_step = training_util.get_global_step()
   if mode == model_fn_lib.ModeKeys.TRAIN:
     train_hook1 = basic_session_run_hooks.StepCounterHook(
         every_n_steps=1, output_dir=self.get_temp_dir())
     train_hook2 = tf.compat.v1.test.mock.MagicMock(
         wraps=tf.compat.v1.train.SessionRunHook(),
         spec=tf.compat.v1.train.SessionRunHook)
     return model_fn_lib.EstimatorSpec(
         mode,
         loss=tf.constant(1.),
         train_op=global_step.assign_add(1),
         training_hooks=[train_hook1, train_hook2])
   if mode == model_fn_lib.ModeKeys.EVAL:
     eval_hook1 = basic_session_run_hooks.StepCounterHook(
         every_n_steps=1, output_dir=self.get_temp_dir())
     eval_hook2 = tf.compat.v1.test.mock.MagicMock(
         wraps=tf.compat.v1.train.SessionRunHook(),
         spec=tf.compat.v1.train.SessionRunHook)
     return model_fn_lib.EstimatorSpec(
         mode=mode,
         loss=tf.constant(1.),
         evaluation_hooks=[eval_hook1, eval_hook2])
Example #49
def _kmeans_clustering_model_fn(features, labels, mode, params, config):
  """Model function for KMeansClustering estimator."""
  assert labels is None, labels
  (all_scores, model_predictions, losses,
   is_initialized, init_op, training_op) = clustering_ops.KMeans(
       _parse_tensor_or_dict(features),
       params.get('num_clusters'),
       initial_clusters=params.get('training_initial_clusters'),
       distance_metric=params.get('distance_metric'),
       use_mini_batch=params.get('use_mini_batch'),
       mini_batch_steps_per_iteration=params.get(
           'mini_batch_steps_per_iteration'),
       random_seed=params.get('random_seed'),
       kmeans_plus_plus_num_retries=params.get(
           'kmeans_plus_plus_num_retries')).training_graph()
  incr_step = state_ops.assign_add(training_util.get_global_step(), 1)
  loss = math_ops.reduce_sum(losses, name=KMeansClustering.LOSS_OP_NAME)
  summary.scalar('loss/raw', loss)
  training_op = with_dependencies([training_op, incr_step], loss)
  predictions = {
      KMeansClustering.ALL_SCORES: all_scores[0],
      KMeansClustering.CLUSTER_IDX: model_predictions[0],
  }
  eval_metric_ops = {KMeansClustering.SCORES: loss}
  training_hooks = [_InitializeClustersHook(
      init_op, is_initialized, config.is_chief)]
  relative_tolerance = params.get('relative_tolerance')
  if relative_tolerance is not None:
    training_hooks.append(_LossRelativeChangeHook(relative_tolerance))
  return ModelFnOps(
      mode=mode,
      predictions=predictions,
      eval_metric_ops=eval_metric_ops,
      loss=loss,
      train_op=training_op,
      training_hooks=training_hooks)
Example #50
    def get_updates(self, loss, params):
        if distribute_lib.has_distribution_strategy():
            self.updates = []

            if not params:
                # After the model vars have been created, the second call to get_updates
                # is called with params as an empty list. This ensures that we call
                # compute_gradients with params=None.
                grads = self.optimizer.compute_gradients(loss)
            else:
                grads = self.optimizer.compute_gradients(loss, params)
            global_step = training_util.get_global_step()
            opt_update = self.optimizer.apply_gradients(grads, global_step)
        else:
            self.updates = [state_ops.assign_add(self.iterations, 1)]
            if not params:
                return self.updates

            grads = self.optimizer.compute_gradients(loss, params)
            opt_update = self.optimizer.apply_gradients(
                grads, global_step=self.iterations)

        self.updates.append(opt_update)
        return self.updates
Example #51
 def _train_op_fn(loss):
     """Returns the op to optimize the loss."""
     return optimizer.minimize(
         loss, global_step=training_util.get_global_step())
Example #52
def model_builder(features,
                  labels,
                  mode,
                  params,
                  config,
                  output_type=ModelBuilderOutputType.MODEL_FN_OPS):
    """Multi-machine batch gradient descent tree model.

  Args:
    features: `Tensor` or `dict` of `Tensor` objects.
    labels: Labels used to train on.
    mode: Mode we are in. (TRAIN/EVAL/INFER)
    params: A dict of hyperparameters.
      The following hyperparameters are expected:
      * head: A `Head` instance.
      * learner_config: A config for the learner.
      * feature_columns: An iterable containing all the feature columns used by
          the model.
      * examples_per_layer: Number of examples to accumulate before growing a
          layer. It can also be a function that computes the number of examples
          based on the depth of the layer that's being built.
      * weight_column_name: The name of weight column.
      * center_bias: Whether a separate tree should be created for first fitting
          the bias.
    config: `RunConfig` of the estimator.
    output_type: Whether to return ModelFnOps (old interface) or EstimatorSpec
      (new interface).

  Returns:
    A `ModelFnOps` object.
  Raises:
    ValueError: if inputs are not valid.
  """
    head = params["head"]
    learner_config = params["learner_config"]
    examples_per_layer = params["examples_per_layer"]
    feature_columns = params["feature_columns"]
    weight_column_name = params["weight_column_name"]
    num_trees = params["num_trees"]
    use_core_libs = params["use_core_libs"]
    logits_modifier_function = params["logits_modifier_function"]
    output_leaf_index = params["output_leaf_index"]

    if features is None:
        raise ValueError("At least one feature must be specified.")

    if config is None:
        raise ValueError("Missing estimator RunConfig.")

    center_bias = params["center_bias"]

    if isinstance(features, ops.Tensor):
        features = {features.name: features}

    # Make a shallow copy of features to ensure downstream usage
    # is unaffected by modifications in the model function.
    training_features = copy.copy(features)
    training_features.pop(weight_column_name, None)
    global_step = training_util.get_global_step()
    with ops.device(global_step.device):
        ensemble_handle = model_ops.tree_ensemble_variable(
            stamp_token=0,
            tree_ensemble_config="",  # Initialize an empty ensemble.
            name="ensemble_model")

    # Create GBDT model.
    gbdt_model = gbdt_batch.GradientBoostedDecisionTreeModel(
        is_chief=config.is_chief,
        num_ps_replicas=config.num_ps_replicas,
        ensemble_handle=ensemble_handle,
        center_bias=center_bias,
        examples_per_layer=examples_per_layer,
        learner_config=learner_config,
        feature_columns=feature_columns,
        logits_dimension=head.logits_dimension,
        features=training_features,
        use_core_columns=use_core_libs,
        output_leaf_index=output_leaf_index)
    with ops.name_scope("gbdt", "gbdt_optimizer"):
        predictions_dict = gbdt_model.predict(mode)
        logits = predictions_dict["predictions"]
        if logits_modifier_function:
            logits = logits_modifier_function(logits, features, mode)

        def _train_op_fn(loss):
            """Returns the op to optimize the loss."""
            update_op = gbdt_model.train(loss, predictions_dict, labels)
            with ops.control_dependencies(
                [update_op]), (ops.colocate_with(global_step)):
                update_op = state_ops.assign_add(global_step, 1).op
                return update_op

    create_estimator_spec_op = getattr(head, "create_estimator_spec", None)

    training_hooks = []
    if num_trees:
        if center_bias:
            num_trees += 1

        finalized_trees, attempted_trees = gbdt_model.get_number_of_trees_tensor(
        )
        training_hooks.append(
            trainer_hooks.StopAfterNTrees(num_trees, attempted_trees,
                                          finalized_trees))

    if output_type == ModelBuilderOutputType.MODEL_FN_OPS:
        if use_core_libs and callable(create_estimator_spec_op):
            model_fn_ops = head.create_estimator_spec(features=features,
                                                      mode=mode,
                                                      labels=labels,
                                                      train_op_fn=_train_op_fn,
                                                      logits=logits)
            model_fn_ops = estimator_utils.estimator_spec_to_model_fn_ops(
                model_fn_ops)
        else:
            model_fn_ops = head.create_model_fn_ops(features=features,
                                                    mode=mode,
                                                    labels=labels,
                                                    train_op_fn=_train_op_fn,
                                                    logits=logits)

        if output_leaf_index and gbdt_batch.LEAF_INDEX in predictions_dict:
            model_fn_ops.predictions[gbdt_batch.LEAF_INDEX] = predictions_dict[
                gbdt_batch.LEAF_INDEX]

        model_fn_ops.training_hooks.extend(training_hooks)
        return model_fn_ops
    elif output_type == ModelBuilderOutputType.ESTIMATOR_SPEC:
        assert callable(create_estimator_spec_op)
        estimator_spec = head.create_estimator_spec(features=features,
                                                    mode=mode,
                                                    labels=labels,
                                                    train_op_fn=_train_op_fn,
                                                    logits=logits)

        estimator_spec = estimator_spec._replace(
            training_hooks=training_hooks +
            list(estimator_spec.training_hooks))
        return estimator_spec

    return model_fn_ops
Example #53
def ranking_model_builder(features,
                          labels,
                          mode,
                          params,
                          config,
                          output_type=ModelBuilderOutputType.MODEL_FN_OPS):
    """Multi-machine batch gradient descent tree model for ranking.

  Args:
    features: `Tensor` or `dict` of `Tensor` objects.
    labels: Labels used to train on.
    mode: Mode we are in. (TRAIN/EVAL/INFER)
    params: A dict of hyperparameters.
      The following hyperparameters are expected:
      * head: A `Head` instance.
      * learner_config: A config for the learner.
      * feature_columns: An iterable containing all the feature columns used by
          the model.
      * examples_per_layer: Number of examples to accumulate before growing a
          layer. It can also be a function that computes the number of examples
          based on the depth of the layer that's being built.
      * weight_column_name: The name of weight column.
      * center_bias: Whether a separate tree should be created for first fitting
          the bias.
      * ranking_model_pair_keys (Optional): Keys to distinguish between features
        for left and right part of the training pairs for ranking. For example,
        for an Example with features "a.f1" and "b.f1", the keys would be
        ("a", "b").
    config: `RunConfig` of the estimator.
    output_type: Whether to return ModelFnOps (old interface) or EstimatorSpec
      (new interface).


  Returns:
    A `ModelFnOps` object.
  Raises:
    ValueError: if inputs are not valid.
  """
    head = params["head"]
    learner_config = params["learner_config"]
    examples_per_layer = params["examples_per_layer"]
    feature_columns = params["feature_columns"]
    weight_column_name = params["weight_column_name"]
    num_trees = params["num_trees"]
    use_core_libs = params["use_core_libs"]
    logits_modifier_function = params["logits_modifier_function"]
    output_leaf_index = params["output_leaf_index"]
    ranking_model_pair_keys = params["ranking_model_pair_keys"]

    if features is None:
        raise ValueError("At least one feature must be specified.")

    if config is None:
        raise ValueError("Missing estimator RunConfig.")

    center_bias = params["center_bias"]

    if isinstance(features, ops.Tensor):
        features = {features.name: features}

    # Make a shallow copy of features to ensure downstream usage
    # is unaffected by modifications in the model function.
    training_features = copy.copy(features)
    training_features.pop(weight_column_name, None)
    global_step = training_util.get_global_step()
    with ops.device(global_step.device):
        ensemble_handle = model_ops.tree_ensemble_variable(
            stamp_token=0,
            tree_ensemble_config="",  # Initialize an empty ensemble.
            name="ensemble_model")

    # Extract the features.
    if mode == learn.ModeKeys.TRAIN or mode == learn.ModeKeys.EVAL:
        # For ranking pairwise training, we extract two sets of features.
        if len(ranking_model_pair_keys) != 2:
            raise ValueError("You must provide keys for ranking.")
        left_pair_key = ranking_model_pair_keys[0]
        right_pair_key = ranking_model_pair_keys[1]
        if left_pair_key is None or right_pair_key is None:
            raise ValueError("Both pair keys should be provided for ranking.")

        features_1 = {}
        features_2 = {}
        for name in training_features:
            feature = training_features[name]
            new_name = name[2:]
            if name.startswith(left_pair_key + "."):
                features_1[new_name] = feature
            else:
                assert name.startswith(right_pair_key + ".")
                features_2[new_name] = feature

        main_features = features_1
        supplementary_features = features_2
    else:
        # For non-ranking or inference ranking, we have only 1 set of features.
        main_features = training_features

    # Create GBDT model.
    gbdt_model_main = gbdt_batch.GradientBoostedDecisionTreeModel(
        is_chief=config.is_chief,
        num_ps_replicas=config.num_ps_replicas,
        ensemble_handle=ensemble_handle,
        center_bias=center_bias,
        examples_per_layer=examples_per_layer,
        learner_config=learner_config,
        feature_columns=feature_columns,
        logits_dimension=head.logits_dimension,
        features=main_features,
        use_core_columns=use_core_libs,
        output_leaf_index=output_leaf_index)

    with ops.name_scope("gbdt", "gbdt_optimizer"):
        # Logits for inference.
        if mode == learn.ModeKeys.INFER:
            predictions_dict = gbdt_model_main.predict(mode)
            logits = predictions_dict[gbdt_batch.PREDICTIONS]
            if logits_modifier_function:
                logits = logits_modifier_function(logits, features, mode)
        else:
            gbdt_model_supplementary = gbdt_batch.GradientBoostedDecisionTreeModel(
                is_chief=config.is_chief,
                num_ps_replicas=config.num_ps_replicas,
                ensemble_handle=ensemble_handle,
                center_bias=center_bias,
                examples_per_layer=examples_per_layer,
                learner_config=learner_config,
                feature_columns=feature_columns,
                logits_dimension=head.logits_dimension,
                features=supplementary_features,
                use_core_columns=use_core_libs,
                output_leaf_index=output_leaf_index)

            # Logits for train and eval.
            if not supplementary_features:
                raise ValueError("Features for ranking must be specified.")

            predictions_dict_1 = gbdt_model_main.predict(mode)
            predictions_1 = predictions_dict_1[gbdt_batch.PREDICTIONS]

            predictions_dict_2 = gbdt_model_supplementary.predict(mode)
            predictions_2 = predictions_dict_2[gbdt_batch.PREDICTIONS]

            logits = predictions_1 - predictions_2
            if logits_modifier_function:
                logits = logits_modifier_function(logits, features, mode)

            predictions_dict = predictions_dict_1
            predictions_dict[gbdt_batch.PREDICTIONS] = logits

        def _train_op_fn(loss):
            """Returns the op to optimize the loss."""
            update_op = gbdt_model_main.train(loss, predictions_dict, labels)
            with ops.control_dependencies(
                [update_op]), (ops.colocate_with(global_step)):
                update_op = state_ops.assign_add(global_step, 1).op
                return update_op

    create_estimator_spec_op = getattr(head, "create_estimator_spec", None)

    training_hooks = []
    if num_trees:
        if center_bias:
            num_trees += 1

        finalized_trees, attempted_trees = (
            gbdt_model_main.get_number_of_trees_tensor())
        training_hooks.append(
            trainer_hooks.StopAfterNTrees(num_trees, attempted_trees,
                                          finalized_trees))

    if output_type == ModelBuilderOutputType.MODEL_FN_OPS:
        if use_core_libs and callable(create_estimator_spec_op):
            model_fn_ops = head.create_estimator_spec(features=features,
                                                      mode=mode,
                                                      labels=labels,
                                                      train_op_fn=_train_op_fn,
                                                      logits=logits)
            model_fn_ops = estimator_utils.estimator_spec_to_model_fn_ops(
                model_fn_ops)
        else:
            model_fn_ops = head.create_model_fn_ops(features=features,
                                                    mode=mode,
                                                    labels=labels,
                                                    train_op_fn=_train_op_fn,
                                                    logits=logits)

        if output_leaf_index and gbdt_batch.LEAF_INDEX in predictions_dict:
            model_fn_ops.predictions[gbdt_batch.LEAF_INDEX] = predictions_dict[
                gbdt_batch.LEAF_INDEX]

        model_fn_ops.training_hooks.extend(training_hooks)
        return model_fn_ops

    elif output_type == ModelBuilderOutputType.ESTIMATOR_SPEC:
        assert callable(create_estimator_spec_op)
        estimator_spec = head.create_estimator_spec(features=features,
                                                    mode=mode,
                                                    labels=labels,
                                                    train_op_fn=_train_op_fn,
                                                    logits=logits)

        estimator_spec = estimator_spec._replace(
            training_hooks=training_hooks +
            list(estimator_spec.training_hooks))
        return estimator_spec

    return model_fn_ops
Example #54
 def begin(self):
     self._next_step = None
     self._global_step_tensor = training_util.get_global_step()
     if self._global_step_tensor is None:
         raise RuntimeError(
             "Global step should be created to use SummarySaverHook.")
Example #55
 def begin(self):
     self._global_step_tensor = training_util.get_global_step()
     if self._global_step_tensor is None:
         raise RuntimeError(
             "Global step should be created to use StepCounterHook.")
Example #56
 def end(self, session):
     last_step = session.run(training_util.get_global_step())
     if last_step != self._timer.last_triggered_step():
         self._save(last_step, session)
Example #57
    def model_fn(self, features, mode, config):
        """Model function for the estimator.

    Note that this does not take a `labels` arg. This works, but `input_fn` must
    return either `features` or, equivalently, `(features, None)`.

    Args:
      features: The input points. See `tf.estimator.Estimator`.
      mode: See `tf.estimator.Estimator`.
      config: See `tf.estimator.Estimator`.

    Returns:
      A `tf.estimator.EstimatorSpec` (see `tf.estimator.Estimator`) specifying
      this behavior:
        * `train_op`: Execute one mini-batch or full-batch run of Lloyd's
             algorithm.
        * `loss`: The sum of the squared distances from each input point to its
             closest center.
        * `eval_metric_ops`: Maps `SCORE` to `loss`.
        * `predictions`: Maps `ALL_DISTANCES` to the distance from each input
             point to each cluster center; maps `CLUSTER_INDEX` to the index of
             the closest cluster center for each input point.
    """
        # input_points is a single Tensor. Therefore, the sharding functionality
        # in clustering_ops is unused, and some of the values below are lists of a
        # single item.
        input_points = _parse_features_if_necessary(features,
                                                    self._feature_columns)

        # Let N = the number of input_points.
        # all_distances: A list of one matrix of shape (N, num_clusters). Each value
        #   is the distance from an input point to a cluster center.
        # model_predictions: A list of one vector of shape (N). Each value is the
        #   cluster id of an input point.
        # losses: Similar to cluster_idx but provides the distance to the cluster
        #   center.
        # is_initialized: scalar indicating whether the initial cluster centers
        #   have been chosen; see init_op.
        # init_op: an op to choose the initial cluster centers. A single worker
        #   repeatedly executes init_op until is_initialized becomes True.
        # training_op: an op that runs an iteration of training, either an entire
        #   Lloyd iteration or a mini-batch of a Lloyd iteration. Multiple workers
        #   may execute this op, but only after is_initialized becomes True.
        (all_distances, model_predictions, losses, is_initialized, init_op,
         training_op) = clustering_ops.KMeans(
             inputs=input_points,
             num_clusters=self._num_clusters,
             initial_clusters=self._initial_clusters,
             distance_metric=self._distance_metric,
             use_mini_batch=self._use_mini_batch,
             mini_batch_steps_per_iteration=self.
             _mini_batch_steps_per_iteration,
             random_seed=self._random_seed,
             kmeans_plus_plus_num_retries=self._kmeans_plus_plus_num_retries
         ).training_graph()

        loss = math_ops.reduce_sum(losses)
        summary.scalar('loss/raw', loss)

        incr_step = state_ops.assign_add(training_util.get_global_step(), 1)
        training_op = control_flow_ops.with_dependencies(
            [training_op, incr_step], loss)

        training_hooks = [
            _InitializeClustersHook(init_op, is_initialized, config.is_chief)
        ]
        if self._relative_tolerance is not None:
            training_hooks.append(
                _LossRelativeChangeHook(loss, self._relative_tolerance))

        export_outputs = {
            KMeansClustering.ALL_DISTANCES:
            export_output.PredictOutput(all_distances[0]),
            KMeansClustering.CLUSTER_INDEX:
            export_output.PredictOutput(model_predictions[0]),
            signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
            export_output.PredictOutput(model_predictions[0])
        }

        return model_fn_lib.EstimatorSpec(
            mode=mode,
            predictions={
                KMeansClustering.ALL_DISTANCES: all_distances[0],
                KMeansClustering.CLUSTER_INDEX: model_predictions[0],
            },
            loss=loss,
            train_op=training_op,
            eval_metric_ops={KMeansClustering.SCORE: metrics.mean(loss)},
            training_hooks=training_hooks,
            export_outputs=export_outputs)
Example #58
  def _model_fn_from_saved_model(self, features, labels, mode):
    """Load a SavedModel graph and return an EstimatorSpec."""
    # TODO(kathywu): Model function loads placeholders from the graph. Calling
    # export_all_saved_models creates another placeholder for the inputs, on top
    # of the original placeholders. There should be a way to avoid this.
    self._validate_mode(mode)

    g = ops.get_default_graph()
    if training_util.get_global_step(g) is not None:
      raise RuntimeError(
          'Graph must not contain a global step tensor before the SavedModel is'
          ' loaded. Please make sure that the input function does not create a '
          'global step.')

    # Extract SignatureDef for information about the input and output tensors.
    signature_def = self._get_signature_def_for_mode(mode)

    # Generate input map for replacing the inputs in the SavedModel graph with
    # the provided features and labels.
    input_map = _generate_input_map(signature_def, features, labels)

    # Create a list of the names of output tensors. When the graph is loaded,
    # names of the output tensors may be remapped. This ensures that the correct
    # tensors are returned in the EstimatorSpec.
    output_tensor_names = [
        value.name for value in six.itervalues(signature_def.outputs)]

    # Load the graph. `output_tensors` contains output `Tensors` in the
    # same order as the `output_tensor_names` list.
    tags = model_fn_lib.EXPORT_TAG_MAP[mode]
    _, output_tensors = self.saved_model_loader.load_graph(
        g, tags, input_map=input_map, return_elements=output_tensor_names)

    # Create a scaffold from the MetaGraphDef that contains ops to initialize
    # the graph. This should mirror the steps from _add_meta_graph_for_mode(),
    # which creates a MetaGraphDef from the EstimatorSpec's scaffold.
    scaffold = monitored_session.Scaffold(
        local_init_op=loader_impl._get_legacy_init_op_tensor(  # pylint: disable=protected-access
            self._get_meta_graph_def_for_mode(mode)))

    # Ensure that a global step tensor has been created.
    global_step_tensor = training_util.get_global_step(g)
    training_util.assert_global_step(global_step_tensor)

    # Extract values to return in the EstimatorSpec.
    output_map = dict(zip(output_tensor_names, output_tensors))
    outputs = {key: output_map[value.name]
               for key, value in six.iteritems(signature_def.outputs)}

    loss, predictions, metrics = _validate_and_extract_outputs(
        mode, outputs, signature_def.method_name)

    train_op = ops.get_collection(constants.TRAIN_OP_KEY)
    if len(train_op) > 1:
      raise RuntimeError('Multiple ops found in the train_op collection.')
    train_op = None if not train_op else train_op[0]

    _clear_saved_model_collections()
    return model_fn_lib.EstimatorSpec(
        scaffold=scaffold,
        mode=mode,
        loss=loss,
        train_op=train_op,
        predictions=predictions,
        eval_metric_ops=metrics)
Example #59
 def test_get_or_create_global_step(self):
     with ops.Graph().as_default() as g:
         self.assertIsNone(training_util.get_global_step())
         self._assert_global_step(training_util.get_or_create_global_step())
         self._assert_global_step(
             training_util.get_or_create_global_step(g))
Example #60
 def begin(self):
     self._global_step_tensor = get_global_step()  # pylint: disable=protected-access
     if self._global_step_tensor is None:
         raise RuntimeError(
             "Global step should be created to use EvalHook.")