def testScaleGradientsError(self):
  p = self.TestParams()
  p.train.clip_gradient_single_norm_to_value = 1.0
  p.train.clip_gradient_norm_to_value = 1.0
  task = p.Instantiate()
  var_a = task.theta.a
  var_grads = py_utils.NestedMap(
      a=py_utils.VarGrad(var_a, tf.ones_like(var_a)))
  self.assertRaises(ValueError, task.learners[0].ScaleGradients, var_grads)
Example #2
    def _ComputeLossesAndGradients(self, metrics, vmap):
        p = self.params
        vmap = self.GetTrainableVariables(vmap)

        for v in vmap.Flatten():
            tf.logging.info('%s: bprop variable: %s', p.name, v.name)

        def LossAndGradients(metric_name):
            """Returns (loss, var_grads) computed from metrics[metric_name]."""
            metric = metrics.get(metric_name, None)
            if metric is None:
                raise ValueError('Loss %s not found in metrics %s' %
                                 (metric_name, list(metrics.keys())))
            # TODO(b/154785713): pass (loss, loss_weight) to ComputeGradients().
            loss = metric[0]
            return metric, self.optimizer.ComputeGradients(
                loss,
                vmap,
                p.grad_aggregation_method,
                p.colocate_gradients_with_ops,
                p.gate_gradients,
                compute_gradients_fn=self._CustomComputeGradientsFn(),
                skip_zero_gradients=p.skip_zero_gradients,
                skip_none_gradients=False)

        loss_name = p.loss_name or p.name
        losses = []
        eval_metrics = {}
        if isinstance(loss_name, (list, tuple)):
            losses_and_grads = {}
            variables = None
            for metric_name in loss_name:
                loss_metric, var_grads = LossAndGradients(metric_name)
                losses_and_grads[metric_name] = py_utils.NestedMap(
                    loss_metric=loss_metric,
                    grads=tf.nest.map_structure(lambda vg: vg.grad, var_grads))
                current_vars = tf.nest.map_structure(lambda vg: vg.var,
                                                     var_grads)
                if variables is None:
                    variables = current_vars
                else:
                    tf.nest.assert_same_structure(variables, current_vars)
                losses.append(loss_metric[0])

            grads, eval_metrics = self.gradient_combiner.Combine(
                variables, losses_and_grads)
            var_grads = tf.nest.map_structure(
                lambda v, g: py_utils.VarGrad(var=v, grad=g), variables, grads)
        else:
            loss_metric, var_grads = LossAndGradients(loss_name)
            losses.append(loss_metric[0])

        return losses, py_utils.SkipNoneGradients(var_grads), eval_metrics
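A hedged sketch of the `metrics` structure this method consumes: each entry maps a metric name to a (value, weight) pair, and metric[0] is read as the loss. The names 'total_loss' and 'aux_loss' below are illustrative, not from the source.

import tensorflow as tf

# Illustrative metrics dict; the keys are made-up names.
total_loss = tf.constant(1.5)
aux_loss = tf.constant(0.25)
metrics = {
    'total_loss': (total_loss, tf.constant(1.0)),
    'aux_loss': (aux_loss, tf.constant(1.0)),
}
# Single-loss path: p.loss_name = 'total_loss' (or it falls back to p.name).
# Multi-loss path: p.loss_name = ('total_loss', 'aux_loss'), which also
# requires self.gradient_combiner to merge the per-loss gradients.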
Example #3
 def testScaleGradientsError(self):
   p = self.TestParams()
   p.input = base_input_generator.BaseSequenceInputGenerator.Params()
   p.train.clip_gradient_single_norm_to_value = 1.0
   p.train.clip_gradient_norm_to_value = 1.0
   task = p.Instantiate()
   task.CreateVariable(
       'a',
       py_utils.WeightParams(shape=[], init=py_utils.WeightInit.Constant(0)))
   var_a = task.theta.a
   var_grads = py_utils.NestedMap(
       a=py_utils.VarGrad(var_a, tf.ones_like(var_a)))
   self.assertRaises(ValueError, task.learners[0].ScaleGradients, var_grads)
Example #4
  def testScaleGradientsSingleTensorNorm(self):
    p = self.TestParams()
    p.train.clip_gradient_single_norm_to_value = 1.0
    p.train.clip_gradient_norm_to_value = None
    task = p.Instantiate()

    var_a = task.theta.a
    var_b = task.theta.b
    var_grads = py_utils.NestedMap(
        a=py_utils.VarGrad(var_a,
                           tf.ones_like(var_a) * 10.0),
        b=py_utils.VarGrad(var_b,
                           tf.ones_like(var_b) * 0.5))
    scaled_grads_map = task.learners[0].ScaleGradients(var_grads)

    FLAGS.enable_check_numerics = False
    with self.session():
      self.evaluate(tf.global_variables_initializer())

      # Each gradient is clipped independently to a norm of at most 1.
      self.assertAllClose(scaled_grads_map.final_var_grads.a[1].eval(), 1.0)
      self.assertAllClose(scaled_grads_map.final_var_grads.b[1].eval(), 0.5)
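The assertions rely on per-tensor clipping: each gradient is rescaled on its own so its norm is at most 1.0. A minimal standalone sketch of that behaviour (not the learner's actual implementation), assuming scalar gradients as in the test:

import tensorflow as tf

g_a = tf.constant(10.0)
g_b = tf.constant(0.5)
clipped_a = tf.clip_by_norm(g_a, 1.0)  # norm 10.0 is scaled down to 1.0
clipped_b = tf.clip_by_norm(g_b, 1.0)  # norm 0.5 is already under the cap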
Example #5
        def _Acc(vg):
            """Updating accumulators."""

            v, g = vg
            with tf.variable_scope(v.op.name):
                _, a = py_utils.CreateVariable(
                    'grad_accumulator',
                    py_utils.WeightParams(v.get_shape(),
                                          py_utils.WeightInit.Constant(0.0),
                                          self.params.dtype),
                    trainable=False)
                a = tf.assign_add(a, g)

            return py_utils.VarGrad(v, a)
Example #6
        def _Acc(vg):
            """Updating accumulators."""

            v, g = vg
            scope_name = v.name
            if scope_name.endswith(':0'):
                scope_name = scope_name[:-2]
            with tf.variable_scope(scope_name):
                a = py_utils.CreateVariable(
                    'grad_accumulator',
                    py_utils.WeightParams(v.get_shape(),
                                          py_utils.WeightInit.Constant(0.0),
                                          self.params.dtype),
                    trainable=False)
                a = tf.assign_add(a, g)

            return py_utils.VarGrad(v, a)
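A hedged usage sketch (not from the source): an accumulator like _Acc is typically mapped over every VarGrad in a NestedMap, for example to sum gradients across accumulation steps before a single optimizer update.

# Hypothetical usage over a var_grads NestedMap of VarGrad entries.
accumulated_var_grads = var_grads.Transform(
    lambda vg: _Acc((vg.var, vg.grad)))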
Example #7
  def testScaleGradientsInf(self):
    FLAGS.enable_check_numerics = False
    p = self.TestParams()
    task = p.Instantiate()
    var_a = task.theta.a
    # Infinite gradient.
    var_grads = py_utils.NestedMap(a=py_utils.VarGrad(var_a, tf.math.log(0.)))
    scaled_grads_map = task.learners[0].ScaleGradients(var_grads)

    with self.session():
      self.evaluate(tf.global_variables_initializer())
      self.assertEqual(0., scaled_grads_map.grad_scale.eval())
      # The final gradient must be finite.
      self.assertFalse(
          tf.math.is_nan(scaled_grads_map.final_var_grads.a[1]).eval())
      self.assertTrue(
          tf.math.is_finite(scaled_grads_map.final_var_grads.a[1]).eval())
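A minimal standalone sketch of the guard this test exercises (not the learner's actual code): when a gradient is non-finite, the gradient scale drops to 0 and non-finite values are zeroed out, so the step becomes a no-op instead of writing inf/NaN into the variables.

import tensorflow as tf

grad = tf.math.log(0.)                                   # -inf
finite = tf.math.is_finite(grad)
grad_scale = tf.cast(tf.reduce_all(finite), tf.float32)  # 0.0 here
safe_grad = grad_scale * tf.where(finite, grad, tf.zeros_like(grad))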
Example #8
    def testScaleGradientsCheckNumerics(self):
        """ScaleGradients when enable_check_numerics=True."""
        FLAGS.enable_check_numerics = True
        p = self.TestParams()
        task = p.Instantiate()
        var_a = task.theta.a
        # Make a NaN gradient.
        var_grads = py_utils.NestedMap(
            a=py_utils.VarGrad(var_a, 0. * tf.math.log(0.)))
        scaled_grads_map = task.learners[0].ScaleGradients(var_grads)

        with self.session():
            self.evaluate(tf.global_variables_initializer())
            self.assertEqual(0., scaled_grads_map.grad_scale.eval())
            # Fetching the gradient raises an exception with enable_check_numerics.
            with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
                                        'is not finite'):
                _ = scaled_grads_map.final_var_grads.a[1].eval()
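A hedged sketch of what enable_check_numerics implies here: the gradient is wrapped in a numerics check, so fetching a NaN value raises InvalidArgumentError instead of returning it silently. The standalone example below uses tf.debugging.check_numerics for illustration; it is not necessarily the exact call the learner makes.

import tensorflow as tf

g = 0. * tf.math.log(0.)  # NaN
# Evaluating `checked` raises InvalidArgumentError because g is NaN.
checked = tf.debugging.check_numerics(g, 'gradient is not finite')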
Example #9
  def testScaleGradientsInf(self):
    FLAGS.enable_check_numerics = False
    p = self.TestParams()
    p.input = base_input_generator.BaseSequenceInputGenerator.Params()
    task = p.Instantiate()
    task.CreateVariable(
        'a',
        py_utils.WeightParams(shape=[], init=py_utils.WeightInit.Constant(0)))
    var_a = task.theta.a
    # Infinite gradient.
    var_grads = py_utils.NestedMap(a=py_utils.VarGrad(var_a, tf.math.log(0.)))
    scaled_grads_map = task.learners[0].ScaleGradients(var_grads)

    with self.session():
      tf.global_variables_initializer().run()
      self.assertEqual(0., scaled_grads_map.grad_scale.eval())
      # The final gradient must be finite.
      self.assertFalse(
          tf.math.is_nan(scaled_grads_map.final_var_grads.a[1]).eval())
      self.assertTrue(
          tf.math.is_finite(scaled_grads_map.final_var_grads.a[1]).eval())
Example #10
  def testScaleGradientsCheckNumerics(self):
    """ScaleGradients when enable_check_numerics=True."""
    FLAGS.enable_check_numerics = True
    p = self.TestParams()
    p.input = base_input_generator.BaseSequenceInputGenerator.Params()
    task = p.Instantiate()
    task.CreateVariable(
        'a',
        py_utils.WeightParams(shape=[], init=py_utils.WeightInit.Constant(0)))
    var_a = task.theta.a
    # Make a NaN gradient.
    var_grads = py_utils.NestedMap(
        a=py_utils.VarGrad(var_a, 0. * tf.math.log(0.)))
    scaled_grads_map = task.learners[0].ScaleGradients(var_grads)

    with self.session():
      tf.global_variables_initializer().run()
      self.assertEqual(0., scaled_grads_map.grad_scale.eval())
      # Fetching the gradient raises an exception with enable_check_numerics.
      with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
                                  'is not finite'):
        _ = scaled_grads_map.final_var_grads.a[1].eval()
Example #11
    def _ComputeLossesAndGradients(self, metrics, vmap):
        p = self.params
        vmap = self.GetTrainableVariables(vmap)

        # Get tpu embedding activations to compute the gradients for.
        tpu_embedding_activations = py_utils.NestedMap()
        tpu_embedding_graph_collection = (
            py_utils.GetTpuEmbeddingGraphCollection())
        if tpu_embedding_graph_collection:
            tpu_embedding_collection = tpu_embedding_graph_collection[0]
            task_call_scope = py_utils.GetTaskCallScope()
            tpu_embedding_activations = py_utils.NestedMap(
                tpu_embedding_collection.GetActivations(task_call_scope) or {})
            # It's possible that task_call_scope is None and its mode is not set
            # in tpu_embedding_collection (e.g. in unit tests), but if the
            # activations are not empty, the mode must have been set.
            if tpu_embedding_activations and (
                    tpu_embedding_collection.ShouldStopGradient(
                        task_call_scope)):
                tpu_embedding_activations = py_utils.NestedMap()

        for v in vmap.Flatten():
            tf.logging.info('%s: bprop variable: %s', p.name, v.name)

        def LossAndGradients(metric_name):
            """Returns (loss, var_grads) computed from metrics[metric_name]."""
            metric = metrics.get(metric_name, None)
            if metric is None:
                raise ValueError('Loss %s not found in metrics %s' %
                                 (metric_name, list(metrics.keys())))
            # TODO(b/154785713): pass (loss, loss_weight) to ComputeGradients().
            loss = metric[0]
            return metric, self.optimizer.ComputeGradients(
                loss,
                vmap,
                p.grad_aggregation_method,
                p.colocate_gradients_with_ops,
                p.gate_gradients,
                compute_gradients_fn=self._CustomComputeGradientsFn(),
                skip_zero_gradients=p.skip_zero_gradients,
                skip_none_gradients=False,
                tpu_embedding_activations=tpu_embedding_activations)

        loss_name = p.loss_name or p.name
        losses = []
        eval_metrics = {}
        if isinstance(loss_name, (list, tuple)):
            assert not tpu_embedding_activations, (
                'TPU embedding does not support multiple losses currently.')
            losses_and_grads = {}
            variables = None
            for metric_name in loss_name:
                loss_metric, var_grads = LossAndGradients(metric_name)
                losses_and_grads[metric_name] = py_utils.NestedMap(
                    loss_metric=loss_metric,
                    grads=tf.nest.map_structure(lambda vg: vg.grad, var_grads))
                current_vars = tf.nest.map_structure(lambda vg: vg.var,
                                                     var_grads)
                if variables is None:
                    variables = current_vars
                else:
                    tf.nest.assert_same_structure(variables, current_vars)
                losses.append(loss_metric[0])

            grads, eval_metrics = self.gradient_combiner.Combine(
                variables, losses_and_grads)
            var_grads = tf.nest.map_structure(
                lambda v, g: py_utils.VarGrad(var=v, grad=g), variables, grads)
        else:
            loss_metric, var_grads = LossAndGradients(loss_name)
            losses.append(loss_metric[0])

        return losses, py_utils.SkipNoneGradients(var_grads), eval_metrics