Exemple #1
0
    def _Loop(self):
        """Runs the executor loop on TPU until training should stop.

        Initializes the TPU system (passing the TPU embedding config when
        `self._tpu_embedding` is set), restores and compiles every program,
        runs the table and local-variable initializers and `self._load_ops`,
        and then repeatedly selects and runs a program schedule. Returns when
        `self._ShouldStop` is true or when a schedule's `Run` returns a truthy
        value.
        """
        with self._cluster, tf.container(self._container_id), self._GetSession(
                cluster_def=self._cluster_def,
                disable_meta_optimizer=FLAGS.disable_meta_optimizer_in_executor
        ) as sess:
            # No embedding config is passed when there is no TPU embedding.
            config_proto = (self._tpu_embedding.config_proto
                            if self._tpu_embedding is not None else None)
            sess.run(tf.tpu.initialize_system(embedding_config=config_proto))

            # Initialize the variables first, if needed.
            for program in self._programs:
                program.RestoreIfNeeded(sess)
                program.Compile(sess)

            sess.run(self._initialize_tables)
            sess.run(self._initialize_local_vars)

            sess.run(self._load_ops)
            while True:
                global_step = sess.run(py_utils.GetGlobalStep())
                if self._ShouldStop(sess, global_step):
                    tf.logging.info('Training finished.')
                    # A final checkpoint is written unless ML Perf logging is on.
                    if not self._ml_perf_log:
                        self.save_only_checkpointer.Save(sess, global_step)
                    return

                # If a task is explicitly selected, only run the programs associated
                # with that task.
                if self._single_task_mode or self._model_task_name:
                    tf.logging.info('Single task mode: %s',
                                    self._model_task_name)
                    program_schedule = self._program_schedule_dict[
                        self._model_task_name]
                else:
                    # Otherwise, sample a task.
                    model_task = self.task_scheduler.Sample(global_step)
                    tf.logging.info('Sampled %s', model_task)
                    program_schedule = self._program_schedule_dict[model_task]

                done = program_schedule.Run(sess)
                if done:
                    tf.logging.info('Program schedule told us to stop.')
                    return

                # The global_step local variable above is a result of sess.run,
                # not a tf variable, so by the time we reach MaybeSave(...) below
                # py_utils.GetGlobalStep() is already ahead of it by
                #   (train_executions_per_eval * train_steps_per_loop)
                # steps, due to program_schedule.Run(sess).
                #
                if not self._ml_perf_log:
                    tf.logging.info('Retrieve params.')
                    sess.run(self._retrieve_ops)
                    tf.logging.info('Retrieve params done.')
                    self.save_only_checkpointer.MaybeSave(
                        sess, py_utils.GetGlobalStep())
Exemple #2
0
    def _Loop(self):
        """Runs the executor loop until training should stop.

        Restores and compiles every program, runs the table and
        local-variable initializers, and then repeatedly selects and runs a
        program schedule, saving a checkpoint after each run unless ML Perf
        logging is enabled. Returns when `self._ShouldStop` is true or when a
        schedule's `Run` returns a truthy value.
        """
        with tf.container(self._container_id), self._GetSession(
                cluster_def=self._cluster_def,
                disable_meta_optimizer=FLAGS.disable_meta_optimizer_in_executor
        ) as sess:
            # Initialize the variables first, if needed.
            for program in self._programs:
                program.RestoreIfNeeded(sess)
                program.Compile(sess)
            sess.run(self._initialize_tables)
            sess.run(self._initialize_local_vars)

            while True:
                global_step = sess.run(py_utils.GetGlobalStep())
                if self._ShouldStop(sess, global_step):
                    tf.logging.info('Training finished.')
                    # A final checkpoint is written unless ML Perf logging is on.
                    if not self._ml_perf_log:
                        self.save_only_checkpointer.Save(sess, global_step)
                    return

                # If a task is explicitly selected, only run the programs associated
                # with that task.
                if self._single_task_mode or self._model_task_name:
                    tf.logging.info('Single task mode: %s',
                                    self._model_task_name)
                    program_schedule = self._program_schedule_dict[
                        self._model_task_name]
                else:
                    # Otherwise, sample a task.
                    model_task = self.task_scheduler.Sample(global_step)
                    tf.logging.info('Sampled %s', model_task)
                    program_schedule = self._program_schedule_dict[model_task]

                done = program_schedule.Run(sess)
                if done:
                    tf.logging.info('Program schedule told us to stop.')
                    return

                # TODO(blee): More complex saving rules. Currently, we assume
                # we save after every task's program schedule execution.
                #
                # The global_step local variable above is a result of sess.run,
                # not a tf variable, so by the time we reach Save(...) below
                # py_utils.GetGlobalStep() is already ahead of it by
                #   (train_executions_per_eval * train_steps_per_loop)
                # steps, due to program_schedule.Run(sess).
                #
                if not self._ml_perf_log:
                    self.save_only_checkpointer.Save(sess,
                                                     py_utils.GetGlobalStep())
Exemple #3
0
 def Value(self):
   """Returns `decay` raised to floor(global_step / num_steps_per_decay)."""
   params = self.params
   step = tf.cast(py_utils.GetGlobalStep(), tf.float32)
   steps_per_decay = float(params.num_steps_per_decay)
   completed_periods = tf.floor(tf.div(step, steps_per_decay))
   return tf.pow(params.decay, completed_periods)
Exemple #4
0
    def Run(self, sess):
        """Runs one decode loop and writes the resulting summaries.

        Args:
          sess: session used for the checkpoint restore, the global step fetch
            and the metric ops.
        """
        tf.logging.info('Executing decode program for %s.', self._task_name)
        self._checkpointer.RestoreIfNeeded(sess)
        global_step = sess.run(py_utils.GetGlobalStep())

        # Feed inputs on a pool thread while this thread runs the metric ops.
        infeed_future = self._infeed_pool.apply_async(self._InfeedLoop,
                                                      args=(sess, ))
        dec_metrics = self._model_task.CreateDecoderMetrics()
        start_time = time.time()
        for step in range(self._steps_per_loop):
            self._model_task.PostProcessDecodeOut(sess.run(self.metrics),
                                                  dec_metrics)
            tf.logging.info(
                'step: %d %f' %
                (step, dec_metrics['num_samples_in_batch'].total_value))
        infeed_future.wait()

        samples_metric = dec_metrics['num_samples_in_batch']
        summaries = {}
        for name, metric in six.iteritems(dec_metrics):
            summaries[name] = metric.Summary(name)
        seconds_taken = time.time() - start_time
        examples_per_sec = samples_metric.total_value / seconds_taken
        rate_value = tf.Summary.Value(tag='examples/sec',
                                      simple_value=examples_per_sec)
        summaries['examples/sec'] = tf.Summary(value=[rate_value])
        self._WriteSummaries(os.path.basename(self._program_dir), global_step,
                             summaries)
Exemple #5
0
    def Run(self, sess):
        """Runs one train loop on TPU and reports its metrics.

        Starts the infeed loop on the pool, runs the TPU ops, records the step
        rate, writes one summary per eval metric and hands the results to the
        task's `ProcessFPropResults`.

        Args:
          sess: session used to run the TPU ops and fetch the step counters.
        """
        tf.logging.info('Executing train program for %s.', self._task_name)
        gsteps = py_utils.GetGlobalStep()

        infeed_future = self._infeed_pool.apply_async(self._InfeedLoop,
                                                      args=(sess, ))
        ary = sess.run(self.tpu_ops)
        infeed_future.wait()

        values = ary[0]
        outfeeds = ary[1]

        eval_metrics = self._eval_metrics.PackMetricsValues(values)

        task = self._model.GetTask()
        global_step, total_examples = sess.run(
            [gsteps, task.total_examples_var])
        self._RecordStepRate(global_step, total_examples)

        # Emit one summary per metric, in key order.
        for key, (val, _) in sorted(six.iteritems(eval_metrics)):
            self._SummarizeValue(global_step, key, val)

        task.ProcessFPropResults(sess, global_step, eval_metrics, outfeeds)
Exemple #6
0
def _GetSaveableVariablesDict(models):
    """Collects every model variable that should be saved.

    Args:
      models: a list of lingvo model objects.

    Returns:
      A dict mapping variable names (trailing `:0` stripped) to variables,
      plus a `global_step` entry holding the global step.

    Raises:
      RuntimeError: if there are variables with shared name (presumably
        raised by `py_utils.MergeDictsWithValueCheck`).
    """
    merged = {}
    for model in models:
        merged = py_utils.MergeDictsWithValueCheck(merged,
                                                   model.GetVariablesDict())

    # Drop the ':0' suffix from variable names so the keys stay backwards
    # compatible with graph mode checkpoint keys.
    saveable = {
        (name[:-2] if name[-2:] == ':0' else name): merged[name]
        for name in merged
    }
    saveable['global_step'] = py_utils.GetGlobalStep()
    return saveable
Exemple #7
0
    def _verify_timestep_counts(self, num_splits):
        """Checks the gradient norm and accumulator values of a dummy CNN.

        Builds the dummy pipeline CNN in a fresh graph, runs one forward and
        gradient computation, and asserts on the gradient norm, on every
        accumulator value and, when present, on the auxiliary logits' shape.

        Args:
          num_splits: forwarded to `_BuildDummyPipelineCnn`. With more than one
            split every accumulator is expected to equal the number of micro
            batches; otherwise it is expected to equal 1.
        """
        num_micro_batches = 8
        batch_size = 16
        with self.session(graph=tf.Graph()) as sess:
            py_utils.GetGlobalStep()
            tf.set_random_seed(1245)
            inputs = tf.random_uniform([batch_size, 8, 8, 1])
            net = _BuildDummyPipelineCnn(num_splits=num_splits,
                                         num_micro_batches=num_micro_batches)
            endpoints = net.FPropDefaultTheta(inputs)
            # FProp may return either the logits alone or a
            # (logits, aux_logits) pair.
            if isinstance(endpoints, (list, tuple)):
                logits, aux_logits = endpoints
            else:
                logits = endpoints
                aux_logits = None
            loss = tf.reduce_mean(logits)
            grads = tf.gradients(loss, tf.trainable_variables())
            grad_norm = tf.sqrt(py_utils.SumSquared(grads))
            ts = net.GetAccumulatorValues().Flatten()

            sess.run(tf.global_variables_initializer())
            grad_norm_val, ts_vals = sess.run([grad_norm, ts])
            self.assertNear(grad_norm_val, 0.269997, err=1.0e-6)
            # Accumulator values should be equal to number of time steps in pipeline.
            for ts_val in list(ts_vals):
                expected_ts = num_micro_batches if num_splits > 1 else 1
                self.assertEqual(ts_val, expected_ts)
            if aux_logits is not None:
                aux_logit_tensor = sess.run(aux_logits)
                self.assertEqual(aux_logit_tensor.shape, (batch_size, 8, 8, 1))
Exemple #8
0
 def Value(self):
     """Returns a piecewise-constant mix of the child schedules' values.

     The lookup position is the global step modulo `self._period`, compared
     against `self._boundaries`.
     """
     schedule_values = [schedule.Value() for schedule in self.schedules]
     step_in_period = tf.math.mod(py_utils.GetGlobalStep(), self._period)
     return py_utils.PiecewiseConstant(step_in_period, self._boundaries,
                                       schedule_values,
                                       schedule_values[0].dtype)
Exemple #9
0
  def Run(self, sess):
    """Runs one train loop on TPU and writes rate and metric summaries.

    Args:
      sess: session used to run the TPU ops and fetch the global step.
    """
    tf.logging.info('Executing train program for %s.', self._task_name)
    step_tensor = py_utils.GetGlobalStep()

    # Feed inputs on a pool thread while this thread runs the TPU ops.
    infeed_future = self._infeed_pool.apply_async(
        self._InfeedLoop, args=(sess,))
    tpu_results = sess.run(self.tpu_ops)
    infeed_future.wait()

    metric_values = tpu_results[0]
    outfeeds = tpu_results[1]

    eval_metrics = self._eval_metrics.PackMetricsValues(metric_values)

    global_step = sess.run(step_tensor)
    step_rate, example_rate = self._step_rate_tracker.ComputeStepRate(
        global_step, eval_metrics['num_samples_in_batch'][0])
    for tag, rate in (('global_step/sec', step_rate),
                      ('examples/sec', example_rate)):
      self._SummarizeValue(global_step, tag, rate)

    # One summary per metric, in key order.
    for key, (val, _) in sorted(six.iteritems(eval_metrics)):
      self._SummarizeValue(global_step, key, val)

    task = self._model.GetTask()
    task.ProcessFPropResults(sess, global_step, eval_metrics, outfeeds)
Exemple #10
0
  def Apply(self, lr, var_grad):
    """Accumulates gradients and applies them once every `accum_steps` steps.

    Args:
      lr: learning rate forwarded to the wrapped optimizer's `Apply`.
      var_grad: a structure of (variable, gradient) pairs that supports
        `Transform` and `Flatten`.

    Returns:
      A `tf.cond` op. When `global_step % accum_steps == accum_steps - 1` it
      applies the accumulated gradients scaled by `1 / accum_steps` through
      the wrapped optimizer and then zeroes the accumulators; otherwise it is
      a no-op group.
    """
    p = self.params

    def _Acc(vg):
      """Adds the gradient into a per-variable accumulator.

      Returns the (variable, accumulator) pair.
      """

      v, g = vg
      # The accumulator lives under the variable's own scope and is not
      # trainable.
      with tf.variable_scope(v.op.name):
        _, a = py_utils.CreateVariable(
            'grad_accumulator',
            py_utils.WeightParams(v.get_shape(),
                                  py_utils.WeightInit.Constant(0.0),
                                  self.params.dtype),
            trainable=False)
        a = tf.assign_add(a, g)

      return v, a

    # From here on var_grad pairs each variable with its updated accumulator.
    var_grad = var_grad.Transform(_Acc)

    def _ApplyAndReset():
      # The reset is placed under a control dependency on the optimizer
      # update, so accumulators are zeroed only after they were applied.
      with tf.control_dependencies([
          self._opt.Apply(
              lr, py_utils.ApplyGradMultiplier(var_grad, 1. / p.accum_steps))
      ]):
        return tf.group(
            *[tf.assign(a, tf.zeros_like(a)) for _, a in var_grad.Flatten()])

    return tf.cond(
        tf.equal(
            tf.mod(py_utils.GetGlobalStep(), p.accum_steps), p.accum_steps - 1),
        _ApplyAndReset, lambda: tf.group(tf.no_op()))
Exemple #11
0
    def InstantiateVariables(self):
        """Create variables for this layer and child layers.

        DO NOT OVERRIDE. Override self._CreateLayerVariables instead.

        Does nothing when variable creation has already started or finished
        for this layer.
        """
        if self._create_variables_status != _CreateLayerVariablesStatus.NOT_CALLED:
            return
        self._create_variables_status = _CreateLayerVariablesStatus.IN_PROGRESS

        # Push this layer on the creation stack; the finally block below checks
        # that the stack is restored to its size on entry.
        stack_size = len(_CREATE_VARIABLES_STACK.stack)
        _CREATE_VARIABLES_STACK.stack.append(self)
        try:
            self._global_step = py_utils.GetGlobalStep()
            self._CreateChildrenVariables()

            # Variable-free layers skip the theta registration and their own
            # variable creation.
            if not self._is_variable_free:
                self.AddExtraTheta('global_step', self._global_step)
                with self._SelfVariableScope():
                    for name, meta in list(self._variables_to_create.items()):
                        self._CreateVariableInternal(name, meta)
                    self._CreateLayerVariables()
        finally:
            assert _CREATE_VARIABLES_STACK.stack[-1] is self
            _CREATE_VARIABLES_STACK.stack.pop()
            assert len(_CREATE_VARIABLES_STACK.stack) == stack_size

        self._create_variables_status = _CreateLayerVariablesStatus.COMPLETED

        if not _CREATE_VARIABLES_STACK.stack:
            # Outermost layer just finished InstantiateVariables.
            self._VerifyVarsAndTheta()
Exemple #12
0
    def __init__(self, params):
        """Layer constructor.

        Sub-classes of BaseLayer should decorate their __init__ with
        @base_layer.initializer.

        Args:
          params: A params used to construct this layer.
        """
        cls_name = self.__class__.__name__
        assert params.name, (
            'Layer params for %s must have a "name"' % cls_name)
        self._params = params.Copy()
        tf.logging.debug('Creating layer %s with params: \n %s \n', cls_name,
                         str(params))

        # Variables owned by this layer, and the theta derived from them.
        self._private_vars = py_utils.NestedMap()
        self._private_theta = py_utils.NestedMap()

        # Children registered through CreateChild/CreateChildren.
        self._private_children = py_utils.NestedMap()
        # Every child layer created by this layer. For a well-formed layer
        # this matches self._private_children, i.e. all child layers are
        # created using CreateChild/CreateChildren.
        self._children_list = []

        # Extra thetas that do not directly correspond to any underlying
        # vars, e.g. the concatenated sharded variables.
        self._extra_theta = py_utils.NestedMap()
        # All registered accumulators.
        self._private_accumulators = py_utils.NestedMap()
        # Layer-private functions. Add with AddFunction.
        self._private_fns = {}

        self.AddExtraTheta('global_step', py_utils.GetGlobalStep())
Exemple #13
0
  def testDecoderFPropDeterministicAttentionDropout(self):
    """Verify that attention dropout is deterministic given fixed seeds.

    Runs the decoder forward pass at global step 0 and again at global step 1
    and compares loss and per-sequence loss against recorded values.
    """
    with self.session(use_gpu=False, graph=tf.Graph()) as sess:
      tf.set_random_seed(8372749040)
      p = self._DecoderParams(
          py_utils.VariationalNoiseParams(None, True, False, seed=1792))

      # Enable deterministic attention dropout without while-loop unrolling.
      p.use_while_loop_based_unrolling = False
      p.attention.atten_dropout_prob = 0.5
      p.attention.atten_dropout_deterministic = True

      loss, per_sequence_loss = self._testDecoderFPropHelper(params=p)
      global_step = py_utils.GetGlobalStep()
      tf.global_variables_initializer().run()
      loss_val, per_sequence_loss_val, global_steps_val = sess.run(
          [loss, per_sequence_loss, global_step])

      print('loss = ', loss_val, 'per sequence loss = ', per_sequence_loss_val)
      self.assertAllClose([3.587372, 15.0], loss_val)
      self.assertAllClose([14.171288, 9.965696, 10.221684, 19.451914],
                          per_sequence_loss_val)
      self.assertEqual(0, global_steps_val)

      # Run another step to test global_step and time_step are incremented
      # correctly.
      sess.run(tf.assign_add(global_step, 1))
      loss_val, per_sequence_loss_val, global_steps_val = sess.run(
          [loss, per_sequence_loss, global_step])

      # The expected values differ from the first run: the losses change
      # together with the global step.
      print('loss = ', loss_val, 'per sequence loss = ', per_sequence_loss_val)
      self.assertAllClose([3.626164, 15.0], loss_val)
      self.assertAllClose([14.70993, 10.572938, 10.516836, 18.592758],
                          per_sequence_loss_val)
      self.assertEqual(1, global_steps_val)
Exemple #14
0
    def Run(self, sess):
        """Runs one decode loop, writes summaries and finalizes the output.

        Args:
          sess: session used to fetch the global step and run the metric ops.
        """
        tf.logging.info('Executing decode program for %s.', self._task_name)
        global_step = sess.run(py_utils.GetGlobalStep())

        # Feed inputs on a pool thread while this thread runs the metric ops.
        infeed_future = self._infeed_pool.apply_async(self._InfeedLoop,
                                                      args=(sess, ))
        dec_metrics = self._model_task.CreateDecoderMetrics()
        start_time = time.time()
        all_decode_out = []
        for step in range(self._steps_per_loop):
            step_out = self._model_task.PostProcessDecodeOut(
                sess.run(self.metrics), dec_metrics)
            tf.logging.info(
                'step: %d %f' %
                (step, dec_metrics['num_samples_in_batch'].total_value))
            if step_out:
                all_decode_out.extend(step_out)
        infeed_future.wait()

        samples_metric = dec_metrics['num_samples_in_batch']
        summaries = {}
        for name, metric in six.iteritems(dec_metrics):
            summaries[name] = metric.Summary(name)
        seconds_taken = time.time() - start_time
        examples_per_sec = samples_metric.total_value / seconds_taken
        rate_value = tf.Summary.Value(tag='examples/sec',
                                      simple_value=examples_per_sec)
        summaries['examples/sec'] = tf.Summary(value=[rate_value])
        self._WriteSummaries(os.path.basename(self._program_dir), global_step,
                             summaries)

        # Hand the buffered per-step outputs to the task, together with a
        # step-stamped output path.
        out_path = os.path.join(self._program_dir,
                                'decoder_out_%09d' % global_step)
        finalize_args = base_model.DecodeFinalizeArgs(
            decode_out_path=out_path, decode_out=all_decode_out)
        self._model_task.DecodeFinalize(finalize_args)
Exemple #15
0
  def __init__(self, params):
    """Layer constructor.

    Sub-classes of BaseLayer should decorate their __init__ with
    @base_layer.initializer.

    Args:
      params: A params used to construct this layer.
    """
    assert params.name, (
        'Layer params for %s must have a "name"' % self.__class__.__name__)

    # Build the module name from the layer name: every run of characters
    # outside [a-zA-Z0-9_] becomes '_', then 'bbf_<ClassName>_' is prefixed.
    tf_module_name = params.name
    tf_module_name = re.sub('[^a-zA-Z0-9_]+', '_', tf_module_name)
    tf_module_name = 'bbf_' + self.__class__.__name__ + '_' + tf_module_name
    py_utils.NestedMap.CheckKey(tf_module_name)

    # initialize the base class.
    super(BaseLayer, self).__init__(tf_module_name)

    # Note AutoTracking doesn't work properly due to its inability to walk
    # through py_utils.NestedMap data structures which are used widely
    # throughout the Lingvo codebase. Also there seems to be some performance
    # hit in turning on auto-tracking in constructing graphs. For now, we
    # disable auto-tracking.
    # TODO(lingvo): Re-enable auto-tracking when fuller support is
    # added for key data structures used in Lingvo, and performance issue is
    # debugged more and understood better.
    self._setattr_tracking = False

    # The parent is the second-to-last entry of the layer stack, or None when
    # the stack holds at most one entry.
    self._parent = (
        _LAYER_STACK.layer_stack[-2]
        if len(_LAYER_STACK.layer_stack) > 1 else None)
    assert self._parent is not self
    self._params = params.Copy()
    tf.logging.debug('Creating layer %s with params: \n %s \n',
                     self.__class__.__name__, str(params))
    # Vars created by this layer.
    self._private_vars = py_utils.NestedMap()
    # Theta derived from this layer's vars.
    self._private_theta = py_utils.NestedMap()
    # Child layers created by this layer through CreateChild/CreateChildren.
    self._private_children = py_utils.NestedMap()
    # Child layers created by this layer. A well-formed layer should
    # have self._private_children equal to self._children_list. I.e.,
    # all child layers are created using CreateChild/CreateChildren.
    self._children_list = []
    # Extra thetas that do not directly correspond to any underlying vars.
    # For example, the concatenated sharded variables.
    self._extra_theta = py_utils.NestedMap()
    # All registered accumulators.
    self._private_accumulators = py_utils.NestedMap()
    # Layer-private functions. Add with AddFunction.
    self._private_fns = dict()
    # Mapping from variable names to its symbolic shape.
    # self._var_symbolic_shape_map['var_name'] will be a tuple of integers or
    # symbolic expressions, one for each dimension of the variable.
    self._var_symbolic_shape_map = dict()

    self.AddExtraTheta('global_step', py_utils.GetGlobalStep())
Exemple #16
0
    def Run(self, sess):
        """Runs one train loop on TPU and writes rate and metric summaries.

        Args:
          sess: session used to run the TPU ops and fetch the global step.

        Returns:
          Always False.
        """
        step_tensor = py_utils.GetGlobalStep()
        step_before = sess.run(step_tensor)
        self.SetStatusMessage('Executing train program at step %d' %
                              step_before)

        # Feed inputs on a pool thread while this thread runs the TPU ops.
        infeed_future = self._infeed_pool.apply_async(self._InfeedLoop,
                                                      args=(sess, ))
        tpu_results = sess.run(self.tpu_ops)
        infeed_future.wait()

        metric_values = tpu_results[0]
        outfeeds = tpu_results[1]

        self._eval_metrics.PackMetricsValues(metric_values)
        eval_metrics = self._eval_metrics.metrics

        # Fetch the step again now that the TPU ops have run.
        global_step = sess.run(step_tensor)
        samples_this_loop = (eval_metrics['num_samples_in_batch'][0] *
                             self._steps_per_loop)
        step_rate, example_rate, total_examples = (
            self._step_rate_tracker.ComputeStepRate(global_step,
                                                    samples_this_loop))
        rate_summaries = (
            ('global_step/sec', step_rate),
            ('examples/sec', example_rate),
            ('total_samples', total_examples),
        )
        for tag, value in rate_summaries:
            self._SummarizeValue(global_step, tag, value)

        # One summary per metric, in key order.
        for key, (val, _) in sorted(six.iteritems(eval_metrics)):
            self._SummarizeValue(global_step, key, val)

        task = self._model.GetTask()
        task.ProcessFPropResults(sess, global_step, eval_metrics, outfeeds)
        return False
Exemple #17
0
 def ApplyPruning(cls, pruning_hparams_dict, lstmobj, wm_pc, dtype, scope):  # pylint:disable=invalid-name
     """Applies the customized LSTM matrix compression to `lstmobj`.

     Calls `cls.Setup` with the global step first when no pruning object has
     been set on the class yet.
     """
     pruning_not_set_up = not cls._pruning_obj
     if pruning_not_set_up:
         global_step = py_utils.GetGlobalStep()
         cls.Setup(pruning_hparams_dict, global_step=global_step)

     weight_shape = wm_pc.shape
     return apply_customized_lstm_matrix_compression(
         cls._pruning_obj, py_utils.WeightParams, py_utils.WeightInit,
         lstmobj, weight_shape, dtype, scope)
Exemple #18
0
 def Value(self):
     """Returns multiplier * rsqrt(max(global_step - offset, warmup_steps))."""
     params = self.params
     current_step = tf.cast(py_utils.GetGlobalStep(), tf.float32)
     effective_step = tf.maximum(current_step - params.offset,
                                 params.warmup_steps)
     return tf.math.rsqrt(effective_step) * params.multiplier
Exemple #19
0
    def Run(self, sess):
        """Runs one decode loop, writes summaries and finalizes the output.

        When `self._ml_perf_log` is set, also emits ML Perf `eval_start`,
        `eval_stop` and `eval_accuracy` events, and a `run_stop` success event
        once the configured decoder metric exceeds its success threshold.

        Args:
          sess: session used to fetch the global step and run the metric ops.
        """
        tf.logging.info('Executing decode program for %s.', self._task_name)
        gsteps = py_utils.GetGlobalStep()
        global_step = sess.run(gsteps)

        if self._ml_perf_log:
            # `epoch` is only defined on this path; every later use is guarded
            # by the same flag.
            steps_per_epoch = self._ml_perf.steps_per_epoch
            epoch = int(global_step) // steps_per_epoch
            mlp_log.mlperf_print('eval_start',
                                 None,
                                 metadata={'epoch_num': (epoch + 1)})

        infeed_future = self._infeed_pool.apply_async(self._InfeedLoop,
                                                      args=(sess, ))
        dec_metrics = self._model_task.CreateDecoderMetrics()
        start_time = time.time()
        buffered_decode_out = []
        for i in range(self._steps_per_loop):
            metrics_values = sess.run(self.metrics)
            decode_out = self._model_task.PostProcessDecodeOut(
                metrics_values, dec_metrics)
            tf.logging.info(
                'step: %d %f' %
                (i, dec_metrics['num_samples_in_batch'].total_value))
            if decode_out:
                buffered_decode_out.extend(decode_out)
        infeed_future.wait()

        if self._ml_perf_log:
            mlp_log.mlperf_print('eval_stop',
                                 None,
                                 metadata={'epoch_num': (epoch + 1)})

        num_examples_metric = dec_metrics['num_samples_in_batch']
        summaries = {k: v.Summary(k) for k, v in six.iteritems(dec_metrics)}
        elapsed_secs = time.time() - start_time
        example_rate = num_examples_metric.total_value / elapsed_secs
        summaries['examples/sec'] = tf.Summary(value=[
            tf.Summary.Value(tag='examples/sec', simple_value=example_rate)
        ])
        self._WriteSummaries(os.path.basename(self._program_dir), global_step,
                             summaries)
        # Hand the buffered per-step outputs to the task, together with a
        # step-stamped output path.
        decode_out_path = os.path.join(self._program_dir,
                                       'decoder_out_%09d' % global_step)
        decode_finalize_args = base_model.DecodeFinalizeArgs(
            decode_out_path=decode_out_path, decode_out=buffered_decode_out)
        self._model_task.DecodeFinalize(decode_finalize_args)

        if self._ml_perf_log:
            mlperf_metric = self._ml_perf.decoder_metric_name
            mlperf_metric_value = dec_metrics[mlperf_metric].value
            # NOTE(review): eval_start / eval_stop above report `epoch + 1`
            # while eval_accuracy reports `epoch` -- confirm this is intended.
            mlp_log.mlperf_print('eval_accuracy',
                                 mlperf_metric_value,
                                 metadata={'epoch_num': epoch})
            if mlperf_metric_value > self._ml_perf.decoder_metric_success_threshold:
                tf.logging.info('ml_perf_final_threshold: %f exceeded',
                                self._ml_perf.decoder_metric_success_threshold)
                mlp_log.mlperf_print('run_stop',
                                     None,
                                     metadata={'status': 'success'})
Exemple #20
0
    def Run(self, sess):
        """Runs one decode loop and writes the resulting summaries.

        Args:
          sess: session used to fetch the global step and run the metric ops.

        Returns:
          Always False.
        """
        global_step = sess.run(py_utils.GetGlobalStep())
        self.SetStatusMessage('Executing decode program at step %d' %
                              global_step)

        # The infeed and decode loops run on the pool while this thread
        # collects the metrics.
        infeed_future = self._infeed_pool.apply_async(self._InfeedLoop,
                                                      args=(sess, ))
        decode_future = self._infeed_pool.apply_async(self._DecodeLoop,
                                                      args=(sess, ))

        dec_metrics = self._model_task.CreateDecoderMetrics()
        start_time = time.time()
        for _ in range(self._steps_per_loop):
            self._model_task.PostProcessDecodeOut(sess.run(self.metrics),
                                                  dec_metrics)
        decode_future.wait()
        infeed_future.wait()

        summaries = {}
        for name, metric in six.iteritems(dec_metrics):
            summaries[name] = metric.Summary(name)
        seconds_taken = time.time() - start_time
        samples_metric = dec_metrics['num_samples_in_batch']
        examples_per_sec = samples_metric.total_value / seconds_taken
        rate_value = tf.Summary.Value(tag='examples/sec',
                                      simple_value=examples_per_sec)
        summaries['examples/sec'] = tf.Summary(value=[rate_value])
        self._WriteSummaries(os.path.basename(self._program_dir), global_step,
                             summaries)

        return False
Exemple #21
0
  def Run(self, sess):
    """Runs one train loop on TPU, reports metrics and checkpoints.

    Restores from a checkpoint if needed, starts the infeed loop on the pool,
    runs the TPU ops, records the step rate, writes one summary per eval
    metric, hands the results to the task's `ProcessFPropResults`, and
    finally saves (or maybe-saves) a checkpoint.

    Args:
      sess: session used to restore, run the TPU ops and fetch the counters.
    """
    p = self.params
    self._checkpointer.RestoreIfNeeded(sess)
    gsteps = py_utils.GetGlobalStep()

    infeed_future = self._infeed_pool.apply_async(
        self._InfeedLoop, args=(sess,))
    ary = sess.run(self.tpu_ops)
    # Wait for the infeed loop to finish so it cannot still be running while
    # the checkpoint is written or overlap with the next call to Run.
    infeed_future.wait()

    values = ary[0]
    outfeeds = ary[1]

    eval_metrics = self._eval_metrics.PackMetricsValues(values)

    task = self._model.GetTask()
    global_step, total_examples = sess.run([gsteps, task.total_examples_var])
    self._RecordStepRate(global_step, total_examples)

    # Emit one summary per metric, in key order.
    for key, (val, _) in sorted(six.iteritems(eval_metrics)):
      self._SummarizeValue(global_step, key, val)

    task.ProcessFPropResults(sess, global_step, eval_metrics, outfeeds)
    if p.always_checkpoint_after_execution:
      self._checkpointer.Save(sess, gsteps)
    else:
      self._checkpointer.MaybeSave(sess, gsteps)
Exemple #22
0
  def _Loop(self):
    """Runs the executor loop until training should stop.

    Restores every program if needed, runs the table and local-variable
    initializers, and then repeatedly selects and runs a program schedule,
    saving a checkpoint after each run. Returns once `self._ShouldStop` is
    true, after writing a final checkpoint.
    """
    with tf.container(self._container_id), self._GetSession(
        cluster_def=self._cluster_def) as sess:
      # Initialize the variables first, if needed.
      for program in self._programs:
        program.RestoreIfNeeded(sess)
      sess.run(self._initialize_tables)
      sess.run(self._initialize_local_vars)

      while True:
        global_step = sess.run(py_utils.GetGlobalStep())

        if self._ShouldStop(sess, global_step):
          tf.logging.info('Training finished.')
          self.save_only_checkpointer.Save(sess, global_step)
          return

        # If a task is explicitly selected, only run the programs associated
        # with that task.
        if self._single_task_mode or self._model_task_name:
          tf.logging.info('Single task mode: %s', self._model_task_name)
          program_schedule = self._program_schedule_dict[self._model_task_name]
        else:
          # Otherwise, sample a task.
          model_task = self.task_scheduler.Sample(global_step)
          tf.logging.info('Sampled %s', model_task)
          program_schedule = self._program_schedule_dict[model_task]

        program_schedule.Run(sess)

        # TODO(blee): More complex saving rules. Currently, we assume
        # we save after every task's program schedule execution.
        #
        # `global_step` above was fetched before the schedule ran, so it is
        # stale by now. Re-read the step so the checkpoint is labelled with
        # the step the variables are actually at.
        current_step = sess.run(py_utils.GetGlobalStep())
        self.save_only_checkpointer.Save(sess, current_step)
Exemple #23
0
    def _LoopEnqueue(self, op, session_override=None):
        """Runs the enqueue op in a loop.

        The loop only returns once `self._dequeue_thread_complete` is set.
        When a stop condition or the `enqueue_max_steps` limit is reached it
        does not return; it sleeps and re-checks instead.

        Args:
          op: the enqueue op to run on every iteration.
          session_override: optional session to use instead of the one from
            `self._GetSession()`.
        """
        p = self.params
        sess = session_override or self._GetSession()

        with tf.container(self._container_id), sess:
            if self._initialize_tables is not None:
                sess.run(self._initialize_tables)
            gsteps = py_utils.GetGlobalStep()
            # Number of enqueues performed by this loop.
            local_enqueue_steps = 0

            # Global enqueue steps measures how many global steps have data enqueued
            # for already. We use this to terminate; note that the enqueue op may
            # hang in session.run if we do not terminate with this check.
            global_enqueue_steps = None

            tf.logging.info(
                'params.train.max_steps: %d, enqueue_max_steps: %d',
                p.train.max_steps, p.train.enqueue_max_steps)
            while True:
                if self._dequeue_thread_complete:
                    tf.logging.info(
                        'LoopEnqueue done since consuming thread is done.')
                    return

                global_step = sess.run(gsteps)
                # Seed the enqueue counter from the first observed global step.
                if global_enqueue_steps is None:
                    global_enqueue_steps = global_step
                if local_enqueue_steps % 1000 == 0:
                    tf.logging.info(
                        'Current global_enqueue_steps: %d, '
                        'local_enqueue_steps: %d, global_step: %d',
                        global_enqueue_steps, local_enqueue_steps, global_step)

                if py_utils.use_tpu():
                    # On TPU, round down to a multiple of tpu_steps_per_loop.
                    global_steps_with_available_data = int(
                        global_enqueue_steps // p.train.tpu_steps_per_loop *
                        p.train.tpu_steps_per_loop)
                else:
                    global_steps_with_available_data = global_enqueue_steps

                # Once a stop condition holds, idle here instead of returning;
                # the loop exits through the dequeue-thread check above.
                if (self._ShouldStop(sess, global_steps_with_available_data)
                        or self._ShouldStop(sess, global_step)):
                    tf.logging.info('Done. ShouldStop is True.')
                    tf.logging.info('Enqueue loop sleeping')
                    time.sleep(15)
                    continue
                # A non-positive enqueue_max_steps disables this limit.
                if (p.train.enqueue_max_steps > 0
                        and local_enqueue_steps >= p.train.enqueue_max_steps):
                    tf.logging.info('Done. train.enqueue_max_steps reached.')
                    tf.logging.info('Enqueue loop sleeping')
                    time.sleep(15)
                    continue
                local_enqueue_steps += 1

                # There are tpu_infeed_parallelism parallel threads enqueuing.
                # We account for all of them when updating global_enqueue_steps.
                global_enqueue_steps += p.input.tpu_infeed_parallelism

                sess.run([op])
Exemple #24
0
  def CreateVariables(self):
    """Creates the variables of this layer and of its child layers.

    DO NOT OVERRIDE. Override self._CreateVariables instead.

    Only the first call does any work; later calls return immediately.

    Raises:
      ValueError: if this layer is flagged as variable free but one of its
        children is not.
    """
    if self._create_variables_called:
      return
    self._create_variables_called = True

    self._global_step = py_utils.GetGlobalStep()

    if not self._is_variable_free:
      self.AddExtraTheta('global_step', self._global_step)
      self._CreateChildrenVariables()
      scope_name = py_utils.SanitizeScopeKey(self.params.name)
      with tf.variable_scope(scope_name, auxiliary_name_scope=False):
        # Iterate over a snapshot of the pending variables.
        pending = list(self._variables_to_create.items())
        for var_name, var_meta in pending:
          self._CreateVariable(var_name, var_meta)
        self._CreateVariables()
    else:
      # A variable-free layer must only contain variable-free children.
      offender = next(
          (c for c in self._children_list
           if not c._is_variable_free),  # pylint: disable=protected-access
          None)
      if offender is not None:
        raise ValueError(
            'Variable free layer %s(%s) child %s(%s) has variables.' %
            (self.params.name, self.params.cls, offender.params.name,
             offender.params.cls))
    self._VerifyVarsAndTheta()
Exemple #25
0
    def BuildDataSource(self, data_source_from_file_pattern_fn):
        """Read and return an input batch chosen by the current global step.

        Each entry of `p.boundaries` is compared against the global step with
        `tf.less`; the matching stage's datasource builds the batch, and the
        last datasource is the default when no comparison holds.

        Args:
          data_source_from_file_pattern_fn: a function to read and return input
            batch from a string file_pattern

        Returns:
          A NestedMap containing:
            data: a tuple of tf.Tensor or `.NestedMap` of tf.Tensor

        Raises:
          ValueError: inconsistent sizes between boundaries and
            datasource_params, specification of unsupported datasources, or
            out of order boundaries.
        """
        p = self.params
        stage_params = p.datasource_params
        boundaries = p.boundaries

        num_stages = len(stage_params)
        num_boundaries = len(boundaries)
        if num_stages != num_boundaries + 1:
            raise ValueError(
                'Expected p.datasource_params to have one more entry than '
                'p.boundaries. Found %d datasource_params, and %d boundaries' %
                (num_stages, num_boundaries))

        # Per-stage bprop filters are rejected; only empty ones are tolerated.
        for stage_p in stage_params:
            if ('bprop_variable_filters' in stage_p
                    and any(stage_p.bprop_variable_filters)):
                raise ValueError(
                    'CurriculumDataSource does not support distinct '
                    'bprop_variable_filters per stage.')

        # Each boundary must not exceed the one that follows it.
        for pos, (lower, upper) in enumerate(zip(boundaries, boundaries[1:])):
            if lower > upper:
                raise ValueError(
                    'Expected p.boundaries to monotonically increase, but '
                    'found %d > %d at position %d' % (lower, upper, pos))

        step = py_utils.GetGlobalStep()
        stages = [stage_p.Instantiate() for stage_p in stage_params]

        def _MakeBranch(stage_idx):
            """Returns a callable that builds the batch for one stage."""

            def _Branch():
                batch = stages[stage_idx].BuildDataSource(
                    data_source_from_file_pattern_fn)
                # The filters are set once on the combined result instead.
                batch.pop('bprop_variable_filters', None)
                return batch

            return _Branch

        cases = [(tf.less(step, tf.constant(boundary, dtype=step.dtype)),
                  _MakeBranch(stage_idx))
                 for stage_idx, boundary in enumerate(boundaries)]

        ret = tf.case(cases, default=_MakeBranch(-1))
        ret.bprop_variable_filters = p.bprop_variable_filters
        return ret
Exemple #26
0
    def Run(self, sess):
        """Runs the train/decode program alongside the infeed loop once.

        `self._TrainAndDecode` is started on `self._train_pool` and
        `self._InfeedLoop` on `self._infeed_pool`; both are waited on before
        the MLPerf results (if enabled) are reported.

        Args:
          sess: session used to read the global step; it is also handed to
            both worker functions.

        Returns:
          True when MLPerf logging is enabled, the decoder metric exceeds
          the success threshold, and `run_stop` had not been recorded before
          this call. False in every other case.
        """
        global_step = sess.run(py_utils.GetGlobalStep())
        self.dec_metrics = self._decode_model_task.CreateDecoderMetrics()

        # Start TPU program thread.
        train_future = self._train_pool.apply_async(self._TrainAndDecode,
                                                    args=(sess, ))

        if self._warmup_seconds > 0:
            # The first execution of the TPU program has a warm-up
            # so we delay feeding data yet as that's when the MLPerf timing
            # starts. This way, when we actually infeed, the TPU program
            # is immediately ready to execute/dequeue data.
            tf.logging.info('Waiting before first infeed.')
            time.sleep(self._warmup_seconds)
            # The delay is applied once; later calls skip it.
            self._warmup_seconds = 0

        if self._ml_perf_log:
            if not self._run_start:
                mlp_log.mlperf_print(key='init_stop', value=None)
                self._run_start = mlp_log.mlperf_print(key='run_start',
                                                       value=None)
            epoch = int(global_step) // self._ml_perf.steps_per_epoch
            if epoch > self._ml_perf_epoch:
                self._ml_perf_epoch = epoch
                block_metadata = {
                    'first_epoch_num': epoch + 1,
                    'epoch_count': 1
                }
                mlp_log.mlperf_print('block_start',
                                     None,
                                     metadata=block_metadata)
            self.SetStatusMessage('MLPerf epoch: %d' % self._ml_perf_epoch)

        # Start infeed thread.
        infeed_future = self._infeed_pool.apply_async(self._InfeedLoop,
                                                      args=(sess, ))

        infeed_future.wait()
        train_future.wait()

        if not self._ml_perf_log:
            return False

        # NOTE(review): 'eval_stop' reports epoch + 1 while 'eval_accuracy'
        # reports epoch -- confirm that this difference is intended.
        mlp_log.mlperf_print('eval_stop',
                             None,
                             metadata={'epoch_num': (epoch + 1)})
        metric_name = self._ml_perf.decoder_metric_name
        metric_value = float(self.dec_metrics[metric_name].value)
        mlp_log.mlperf_print('eval_accuracy',
                             metric_value,
                             metadata={'epoch_num': epoch})

        success_threshold = self._ml_perf.decoder_metric_success_threshold
        if metric_value > success_threshold:
            tf.logging.info('ml_perf_final_threshold: %f exceeded',
                            success_threshold)
            if not self._run_stop:
                self._run_stop = mlp_log.mlperf_print(
                    'run_stop', None, metadata={'status': 'success'})
                elapsed = self._run_stop - self._run_start
                self.SetStatusMessage('MLPerf run_time: %.2f' % elapsed)
                return True
        return False
Exemple #27
0
 def Value(self):
     """Returns the current learning rate decay.

     The result is `model_dim ** -0.5` times a linear warm-up factor
     `min(1, step / warmup_steps)` times `rsqrt(max(step, warmup_steps))`,
     where `step` is the global step cast to float32.
     """
     params = self.params
     step = tf.cast(py_utils.GetGlobalStep(), tf.float32)
     warmup = tf.cast(params.warmup_steps, tf.float32)
     # Ramps from 0 up to 1 over the warm-up period, then stays at 1.
     ramp = tf.minimum(1.0, step / warmup)
     # Constant during warm-up, inverse square root of the step afterwards.
     decay = tf.math.rsqrt(tf.maximum(step, warmup))
     dim_scale = params.model_dim**-0.5
     return dim_scale * ramp * decay
Exemple #28
0
 def Value(self):
   """Returns a cosine-shaped value between initial_value and final_value.

   The global step divided by `total_steps` is clipped at 1.0, so the
   result stays at `final_value` once `total_steps` has been reached.
   """
   params = self.params
   assert params.total_steps > 0
   with tf.name_scope(params.name):
     gap = params.initial_value - params.final_value
     step = tf.cast(py_utils.GetGlobalStep(), tf.float32)
     # Fraction of the schedule completed, capped at 1.0.
     progress = tf.minimum(1.0, step / params.total_steps)
     cosine = tf.cos(math.pi * progress)
     return params.final_value + 0.5 * gap * (1 + cosine)
Exemple #29
0
 def Value(self):
     """Returns the current schedule value.

     The value is `peak * min(step / warmup_steps, sqrt(warmup_steps / step))`
     with the global step clamped to at least 1 before the float32 cast.
     """
     params = self.params
     # Clamp to 1 so that the division by the step below is never by zero.
     clamped_step = tf.maximum(py_utils.GetGlobalStep(), 1)
     step = tf.cast(clamped_step, tf.float32)
     warmup = tf.cast(params.warmup_steps, tf.float32)
     ramp_up = step / warmup
     decay = tf.sqrt(warmup / step)
     return params.peak * tf.minimum(ramp_up, decay)
Exemple #30
0
def GetOverWriteGlobalStep(graph=None):
  """Returns the overriding global step tensor, or the regular global step.

  Args:
    graph: graph whose collection is inspected. When falsy, the result of
      `tf.get_default_graph()` is used.

  Returns:
    The single entry of the `_OVERWRITE_GLOBAL_STEP_COLLECTION` collection
    when it holds exactly one item; otherwise `py_utils.GetGlobalStep()`.
  """
  target_graph = graph or tf.get_default_graph()
  overrides = target_graph.get_collection_ref(
      _OVERWRITE_GLOBAL_STEP_COLLECTION)
  if len(overrides) != 1:
    return py_utils.GetGlobalStep()
  return overrides[0]