Example 1
    def _verify_timestep_counts(self, num_splits):
        num_micro_batches = 8
        batch_size = 16
        with self.session(graph=tf.Graph()) as sess:
            tf.set_random_seed(1245)
            inputs = tf.random_uniform([batch_size, 8, 8, 1], seed=12345)
            net = _BuildDummyPipelineCnn(num_splits=num_splits,
                                         num_micro_batches=num_micro_batches)
            endpoints = net.FPropDefaultTheta(inputs)
            if isinstance(endpoints, (list, tuple)):
                logits, aux_logits = endpoints
            else:
                logits = endpoints
                aux_logits = None
            loss = tf.reduce_mean(logits)
            grads = tf.gradients(loss, tf.trainable_variables())
            grad_norm = tf.sqrt(py_utils.SumSquared(grads))
            ts = net.GetAccumulatorValues().Flatten()

            sess.run(tf.global_variables_initializer())
            grad_norm_val, ts_vals = sess.run([grad_norm, ts])
            test_utils.CompareToGoldenSingleFloat(self, 0.268087,
                                                  grad_norm_val)
            # Accumulator values should be equal to the number of time steps in the pipeline.
            for ts_val in list(ts_vals):
                expected_ts = num_micro_batches if num_splits > 1 else 1
                self.assertEqual(ts_val, expected_ts)
            if aux_logits is not None:
                aux_logit_tensor = sess.run(aux_logits)
                self.assertEqual(aux_logit_tensor.shape, (batch_size, 8, 8, 1))
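All of the examples on this page use `tf.sqrt(py_utils.SumSquared(...))` to compute a global (L2) norm over a collection of tensors. As a mental model only, here is a minimal sketch of what such a helper computes, written as an assumed equivalent in plain TensorFlow ops; it is not Lingvo's actual implementation of `py_utils.SumSquared`.

import tensorflow as tf

def sum_squared_sketch(tensors):
    # Assumed behavior: sum of squared elements across every tensor in the list.
    return tf.add_n([tf.reduce_sum(tf.square(t)) for t in tensors])

# Used as in the test above (hypothetical `grads` list of gradient tensors):
# grad_norm = tf.sqrt(sum_squared_sketch(grads))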
Example 2
 def testParamValueSumSquared(self):
     with self.session(use_gpu=False, graph=tf.Graph()):
         p = self._testParams()
         mdl = p.Instantiate()
         mdl.FPropDefaultTheta()
         all_vars = tf.trainable_variables()
         py_utils.SumSquared(all_vars)
Example 3
def AddNormSummary(name, vs_gs):
    """"Returns and creates summary for norms of vs and their gradients gs.

  Args:
    name: A name string for summary.
    vs_gs: A `.NestedMap` or a list of `.NestedMap` of (variable, gradient).

  Returns:
    norm of variables, and norm of gradients.
  """
    flatten = py_utils.NestedMap(child=vs_gs).Flatten()
    v_norm = tf.sqrt(py_utils.SumSquared([v for (v, _) in flatten]))
    scalar('var_norm/%s' % name, v_norm)
    g_norm = tf.sqrt(py_utils.SumSquared([g for (_, g) in flatten]))
    scalar('grad_norm/%s' % name, g_norm)
    return v_norm, g_norm
Example 4
def AddNormSummary(name, vs_gs):
    """"Returns and creates summary for norms of vs and their gradients gs.

  Args:
    name: A name string for summary.
    vs_gs: A `.NestedMap` or a list of `.NestedMap` of (variable, gradient).

  Returns:
    norm of variables, and norm of gradients.
  """
    flatten = py_utils.Flatten(vs_gs)
    v_norm = tf.sqrt(py_utils.SumSquared([v for (v, _) in flatten]))
    g_norm = tf.sqrt(py_utils.SumSquared([g for (_, g) in flatten]))
    if py_utils.IsEagerMode():
        scalar_v2(f'var_norm/{name}', v_norm)
        scalar_v2(f'grad_norm/{name}', g_norm)
    else:
        scalar(f'var_norm/{name}', v_norm)
        scalar(f'grad_norm/{name}', g_norm)
    return v_norm, g_norm
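Examples 3 and 4 are two revisions of the same helper: it flattens the (variable, gradient) pairs, computes `sqrt(SumSquared(...))` over each half, and writes `var_norm/<name>` and `grad_norm/<name>` scalar summaries. A minimal usage sketch, mirroring the call pattern from Example 7 below (the `var_grads` NestedMap here is a placeholder, not taken from the snippets):

# Hypothetical usage; `var_grads` is assumed to be a `.NestedMap` whose values
# are (variable, gradient) pairs, as in Example 7.
for name, vg in var_grads.FlattenItems():
    v_norm, g_norm = AddNormSummary(name, py_utils.NestedMap(s=vg))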
Example 5
  def ApplyGradients(self, task_call_scope, feature_to_gradient_dict):
    """Apply tpu embedding gradient updates.

    Args:
      task_call_scope: The current task call scope name.
      feature_to_gradient_dict: A `py_utils.NestedMap` of: tpu embedding feature
        name -> gradient tensor for the embedding feature.

    Returns:
      The gradient update op and a dict of eval metrics.

    Raises:
      ValueError: if gradients have been applied before for the current task.
    """
    # TODO(laigd): we need a way to tell which task needs backprop, and whether
    # send gradient ops are created for that task.
    if task_call_scope in self._send_gradient_op_by_task:
      raise ValueError(
          f'Send gradient op for task {task_call_scope} already exists.')

    # Apply gradient multiplier schedule.
    grad_multiplier = self._gradient_multiplier_schedule.Value()
    feature_to_gradient_dict = feature_to_gradient_dict.Transform(
        lambda g: g * grad_multiplier)

    send_gradient_op = (
        self._tpu_embedding.generate_send_gradients_op(
            feature_to_gradient_dict, step=py_utils.GetGlobalStep()))
    self._send_gradient_op_by_task[task_call_scope] = send_gradient_op

    activations = self.GetActivations(task_call_scope).values()
    eval_metrics = {
        'tpu_embedding_activation_norm':
            (tf.sqrt(py_utils.SumSquared(activations)), tf.constant(1.0)),
        'tpu_embedding_grad_norm':
            (tf.sqrt(py_utils.SumSquared(feature_to_gradient_dict.Flatten())),
             tf.constant(1.0)),
        'tpu_embedding_gradient_multiplier':
            (grad_multiplier, tf.constant(1.0)),
    }
    return send_gradient_op, eval_metrics
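Note the shape of `eval_metrics` above: each entry maps a metric name to a `(value, weight)` pair, with a constant weight of 1.0, and the method returns that dict alongside the send-gradient op. A hedged caller-side sketch (the `embedding_layer`, `task_scope`, `grads`, and `train_op` names are placeholders, not part of the snippet):

# Hypothetical caller; `grads` is a py_utils.NestedMap of feature name -> gradient.
send_op, metrics = embedding_layer.ApplyGradients(task_scope, grads)
grad_norm, grad_norm_weight = metrics['tpu_embedding_grad_norm']  # (value, weight) pair
train_op = tf.group(train_op, send_op)  # ensure the send-gradient op runs each step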
Example 6
    def _verify_timestep_counts(self,
                                num_splits,
                                auto_partition=False,
                                micro_batch_size=None):
        num_micro_batches = 8
        batch_size = 16
        with self.session(graph=tf.Graph()) as sess:
            tf.random.set_seed(1245)
            inputs = tf.random.uniform([batch_size, 8, 8, 1], seed=12345)
            if auto_partition:
                layers = [
                    _SimpyLayer.Params().Set(name='layer_{}'.format(i))
                    for i in range(16)
                ]
                net = PipeliningLayer.Params().Set(
                    name='pipeline',
                    num_micro_batches=num_micro_batches,
                    cell_tpl=_Partition(layers, num_splits,
                                        tshape.Shape([batch_size, 8, 8,
                                                      1]))).Instantiate()
            else:
                net = _BuildDummyPipelineCnn(
                    num_splits=num_splits,
                    micro_batch_size=micro_batch_size,
                    num_micro_batches=num_micro_batches)
            endpoints = net.FPropDefaultTheta(inputs)
            if isinstance(endpoints, (list, tuple)):
                logits, aux_logits = endpoints
            else:
                logits = endpoints
                aux_logits = None
            loss = tf.reduce_mean(logits)
            grads = tf.gradients(loss, tf.trainable_variables())
            grad_norm = tf.sqrt(py_utils.SumSquared(grads))
            ts = net.GetAccumulatorValues().Flatten()

            sess.run(tf.global_variables_initializer())
            grad_norm_val, ts_vals = sess.run([grad_norm, ts])
            test_utils.CompareToGoldenSingleFloat(self, 0.268087,
                                                  grad_norm_val)
            # Accumulator values should be equal to the number of time steps in the pipeline.
            for ts_val in list(ts_vals):
                expected_ts = num_micro_batches if num_splits > 1 else 1
                self.assertEqual(ts_val, expected_ts)
            if aux_logits is not None:
                aux_logit_tensor = sess.run(aux_logits)
                self.assertEqual(aux_logit_tensor.shape, (batch_size, 8, 8, 1))
Example 7
    def ScaleGradients(self, var_grads, gradient_adjuster=None):
        """Scales gradients according to training params.

    Args:
      var_grads: a `.NestedMap` whose values are (var, grad) pairs.
      gradient_adjuster: if not None, a function that mutates a given var_grads.

    Returns:
      A `.NestedMap` containing:
      - has_nan_or_inf: a scalar of 0 or 1, indicating whether there is any NaN
        or Inf in input gradients.
      - final_var_grads: a `.NestedMap` whose values are (var, grad) pairs,
        where gradients have already been scaled.
      - grad_scale: the gradient scale. 0 if gradient updates should be skipped
        for the step. (Optional, only returned in case global norm clipping is
        used.)
    """
        p = self.params

        # Computes gradients' norm and adds their summaries. Note that all_grad_norm
        # may be nan, which may cause grad_scale to be nan.
        for name, vg in var_grads.FlattenItems():
            summary_utils.AddNormSummary(name + '/' + p.name,
                                         py_utils.NestedMap(s=vg))
        all_grad_norm = tf.sqrt(
            py_utils.SumSquared([
                g for (_, g) in py_utils.NestedMap(child=var_grads).Flatten()
            ]))
        all_var_norm = tf.sqrt(
            py_utils.SumSquared([
                v for (v, _) in py_utils.NestedMap(child=var_grads).Flatten()
            ]))
        grad_norm_is_nan_or_inf = tf.logical_or(tf.is_nan(all_grad_norm),
                                                tf.is_inf(all_grad_norm))

        # Optional gradient adjustment. Note that this happens after computing
        # all_grad_norm.
        if gradient_adjuster is not None:
            tf.logging.info('gradient_adjuster=%s', gradient_adjuster)
            var_grads = gradient_adjuster(var_grads)

        # Handles NaN/Inf gradients.
        has_nan_or_inf = py_utils.HasNanOrInfGradient(var_grads)
        # Grad norm can still be inf even if none of the individual grad is inf.
        has_nan_or_inf = tf.logical_or(has_nan_or_inf, grad_norm_is_nan_or_inf)

        return_values = py_utils.NestedMap()
        if p.clip_gradient_single_norm_to_value:
            # Currently using both types of clipping simultaneously is unsupported.
            if p.clip_gradient_norm_to_value:
                raise ValueError(
                    'Cannot use clip_gradient_single_norm_to_value=%f and '
                    'clip_gradient_norm_to_value=%f.' %
                    (p.clip_gradient_single_norm_to_value,
                     p.clip_gradient_norm_to_value))
            final_var_grads = py_utils.ApplyGradNormCliping(
                var_grads, p.clip_gradient_single_norm_to_value)

        else:
            grad_scale = self._GetGlobalGradScale(all_grad_norm,
                                                  has_nan_or_inf)
            self._AddEvalMetric('grad_norm/all', all_grad_norm,
                                tf.constant(1.0))
            self._AddEvalMetric('var_norm/all', all_var_norm, tf.constant(1.0))
            self._AddEvalMetric('grad_scale_all', grad_scale, tf.constant(1.0))
            final_var_grads = py_utils.ApplyGradMultiplier(
                var_grads, grad_scale)
            return_values.grad_scale = grad_scale

        return_values.has_nan_or_inf = has_nan_or_inf
        return_values.final_var_grads = final_var_grads
        return return_values
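When global-norm clipping is in effect, the branch above delegates the actual scale to `self._GetGlobalGradScale(all_grad_norm, has_nan_or_inf)`, whose implementation is not shown here. The sketch below is only the conventional global-norm clipping rule that such a helper typically applies (an assumption, not Lingvo's code), returning 0 so the step is skipped when any gradient is NaN or Inf:

def global_grad_scale_sketch(all_grad_norm, has_nan_or_inf, clip_norm):
    # Scale gradients so their global norm does not exceed clip_norm.
    scale = tf.minimum(1.0, clip_norm / tf.maximum(all_grad_norm, 1e-30))
    # Skip the update entirely (scale of 0) when any gradient is NaN or Inf.
    keep = 1.0 - tf.cast(has_nan_or_inf, tf.float32)
    return keep * scale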