def testScaleGradientsError(self):
  # Setting both per-tensor and global norm clipping is invalid and must
  # raise a ValueError.
  p = self.TestParams()
  p.train.clip_gradient_single_norm_to_value = 1.0
  p.train.clip_gradient_norm_to_value = 1.0
  task = p.Instantiate()
  var_a = task.theta.a
  var_grads = py_utils.NestedMap(
      a=py_utils.VarGrad(var_a, tf.ones_like(var_a)))
  self.assertRaises(ValueError, task.learners[0].ScaleGradients, var_grads)
def _ComputeLossesAndGradients(self, metrics, vmap):
  p = self.params
  vmap = self.GetTrainableVariables(vmap)

  for v in vmap.Flatten():
    tf.logging.info('%s: bprop variable: %s', p.name, v.name)

  def LossAndGradients(metric_name):
    """Returns (loss, var_grads) computed from metrics[metric_name]."""
    metric = metrics.get(metric_name, None)
    if metric is None:
      raise ValueError('Loss %s not found in metrics %s' %
                       (metric_name, list(metrics.keys())))
    # TODO(b/154785713): pass (loss, loss_weight) to ComputeGradients().
    loss = metric[0]
    return metric, self.optimizer.ComputeGradients(
        loss,
        vmap,
        p.grad_aggregation_method,
        p.colocate_gradients_with_ops,
        p.gate_gradients,
        compute_gradients_fn=self._CustomComputeGradientsFn(),
        skip_zero_gradients=p.skip_zero_gradients,
        skip_none_gradients=False)

  loss_name = p.loss_name or p.name
  losses = []
  eval_metrics = {}
  if isinstance(loss_name, (list, tuple)):
    losses_and_grads = {}
    variables = None
    for metric_name in loss_name:
      loss_metric, var_grads = LossAndGradients(metric_name)
      losses_and_grads[metric_name] = py_utils.NestedMap(
          loss_metric=loss_metric,
          grads=tf.nest.map_structure(lambda vg: vg.grad, var_grads))
      current_vars = tf.nest.map_structure(lambda vg: vg.var, var_grads)
      if variables is None:
        variables = current_vars
      else:
        tf.nest.assert_same_structure(variables, current_vars)
      losses.append(loss_metric[0])
    grads, eval_metrics = self.gradient_combiner.Combine(
        variables, losses_and_grads)
    var_grads = tf.nest.map_structure(
        lambda v, g: py_utils.VarGrad(var=v, grad=g), variables, grads)
  else:
    loss_metric, var_grads = LossAndGradients(loss_name)
    losses.append(loss_metric[0])

  return losses, py_utils.SkipNoneGradients(var_grads), eval_metrics
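# A minimal sketch (not from this codebase) of the gradient-combiner contract
# used by the multi-loss branch above: Combine() receives the shared variable
# structure plus a dict of per-loss NestedMaps carrying `loss_metric` and
# `grads`, and returns (combined_grads, eval_metrics). The mean combiner
# below is hypothetical and only illustrates the expected signature.
class MeanGradientCombiner:

  def Combine(self, variables, losses_and_grads):
    per_loss_grads = [lg.grads for lg in losses_and_grads.values()]
    num_losses = float(len(per_loss_grads))
    # Average the gradient for each variable across all losses.
    combined = tf.nest.map_structure(
        lambda *gs: tf.add_n(list(gs)) / num_losses, *per_loss_grads)
    return combined, {}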
def testScaleGradientsError(self):
  p = self.TestParams()
  p.input = base_input_generator.BaseSequenceInputGenerator.Params()
  p.train.clip_gradient_single_norm_to_value = 1.0
  p.train.clip_gradient_norm_to_value = 1.0
  task = p.Instantiate()
  task.CreateVariable(
      'a',
      py_utils.WeightParams(shape=[], init=py_utils.WeightInit.Constant(0)))
  var_a = task.theta.a
  var_grads = py_utils.NestedMap(
      a=py_utils.VarGrad(var_a, tf.ones_like(var_a)))
  self.assertRaises(ValueError, task.learners[0].ScaleGradients, var_grads)
def testScaleGradientsSingleTensorNorm(self):
  p = self.TestParams()
  p.train.clip_gradient_single_norm_to_value = 1.0
  p.train.clip_gradient_norm_to_value = None
  task = p.Instantiate()
  var_a = task.theta.a
  var_b = task.theta.b
  var_grads = py_utils.NestedMap(
      a=py_utils.VarGrad(var_a, tf.ones_like(var_a) * 10.0),
      b=py_utils.VarGrad(var_b, tf.ones_like(var_b) * 0.5))
  scaled_grads_map = task.learners[0].ScaleGradients(var_grads)

  FLAGS.enable_check_numerics = False
  with self.session():
    self.evaluate(tf.global_variables_initializer())
    # Each variable is clipped independently to a grad scale of 1.
    self.assertAllClose(scaled_grads_map.final_var_grads.a[1].eval(), 1.0)
    self.assertAllClose(scaled_grads_map.final_var_grads.b[1].eval(), 0.5)
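# For reference, a sketch of the per-tensor rule the test above exercises:
# with clip_gradient_single_norm_to_value=1.0, each gradient tensor is scaled
# by min(1, clip / norm(g)) independently of the others, so the norm-10
# gradient for `a` is scaled down to 1.0 while the norm-0.5 gradient for `b`
# passes through unchanged. tf.clip_by_norm applies the same rule (shown here
# only as an illustration, not as the learner's actual implementation).
g_a = tf.ones([]) * 10.0
g_b = tf.ones([]) * 0.5
clipped_a = tf.clip_by_norm(g_a, 1.0)  # -> 1.0
clipped_b = tf.clip_by_norm(g_b, 1.0)  # -> 0.5, already within the bound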
def _Acc(vg):
  """Updates the gradient accumulator for one (var, grad) pair."""
  v, g = vg
  with tf.variable_scope(v.op.name):
    _, a = py_utils.CreateVariable(
        'grad_accumulator',
        py_utils.WeightParams(v.get_shape(),
                              py_utils.WeightInit.Constant(0.0),
                              self.params.dtype),
        trainable=False)
    a = tf.assign_add(a, g)

  return py_utils.VarGrad(v, a)
def _Acc(vg):
  """Updates the gradient accumulator for one (var, grad) pair."""
  v, g = vg
  # Strip the ':0' tensor-output suffix so the scope matches the variable
  # name.
  scope_name = v.name
  if scope_name.endswith(':0'):
    scope_name = scope_name[:-2]
  with tf.variable_scope(scope_name):
    a = py_utils.CreateVariable(
        'grad_accumulator',
        py_utils.WeightParams(v.get_shape(),
                              py_utils.WeightInit.Constant(0.0),
                              self.params.dtype),
        trainable=False)
    a = tf.assign_add(a, g)

  return py_utils.VarGrad(v, a)
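# A hedged usage sketch for the accumulator above (the surrounding training
# loop is assumed, not shown in this snippet): mapping _Acc over a var_grads
# NestedMap yields VarGrads whose grad side is a running sum across
# micro-batches. An outer loop would apply the accumulated gradients once
# every N steps and then reset each 'grad_accumulator' variable to zero.
accumulated_var_grads = tf.nest.map_structure(_Acc, var_grads)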
def testScaleGradientsInf(self):
  FLAGS.enable_check_numerics = False
  p = self.TestParams()
  task = p.Instantiate()
  var_a = task.theta.a
  # Infinite gradient.
  var_grads = py_utils.NestedMap(a=py_utils.VarGrad(var_a, tf.math.log(0.)))
  scaled_grads_map = task.learners[0].ScaleGradients(var_grads)

  with self.session():
    self.evaluate(tf.global_variables_initializer())
    self.assertEqual(0., scaled_grads_map.grad_scale.eval())
    # The final gradient must be finite.
    self.assertFalse(
        tf.math.is_nan(scaled_grads_map.final_var_grads.a[1]).eval())
    self.assertTrue(
        tf.math.is_finite(scaled_grads_map.final_var_grads.a[1]).eval())
def testScaleGradientsCheckNumerics(self):
  """ScaleGradients when enable_check_numerics=True."""
  FLAGS.enable_check_numerics = True
  p = self.TestParams()
  task = p.Instantiate()
  var_a = task.theta.a
  # Make a NaN gradient.
  var_grads = py_utils.NestedMap(
      a=py_utils.VarGrad(var_a, 0. * tf.math.log(0.)))
  scaled_grads_map = task.learners[0].ScaleGradients(var_grads)

  with self.session():
    self.evaluate(tf.global_variables_initializer())
    self.assertEqual(0., scaled_grads_map.grad_scale.eval())
    # Fetching the gradient raises an exception with enable_check_numerics.
    with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
                                'is not finite'):
      _ = scaled_grads_map.final_var_grads.a[1].eval()
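# A simplified sketch of the behavior both tests above check (an
# illustration, not the actual ScaleGradients internals): when any gradient
# is non-finite, grad_scale collapses to 0 so the final gradients become
# zeros rather than NaN/Inf; with enable_check_numerics the fetch itself
# raises 'is not finite' instead of silently zeroing.
def _AllFiniteGradScale(grads):
  finite = [tf.reduce_all(tf.math.is_finite(g)) for g in grads]
  all_finite = tf.reduce_all(tf.stack(finite))
  return tf.cast(all_finite, tf.float32)  # 1.0 if all finite, else 0.0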
def testScaleGradientsInf(self):
  FLAGS.enable_check_numerics = False
  p = self.TestParams()
  p.input = base_input_generator.BaseSequenceInputGenerator.Params()
  task = p.Instantiate()
  task.CreateVariable(
      'a',
      py_utils.WeightParams(shape=[], init=py_utils.WeightInit.Constant(0)))
  var_a = task.theta.a
  # Infinite gradient.
  var_grads = py_utils.NestedMap(a=py_utils.VarGrad(var_a, tf.math.log(0.)))
  scaled_grads_map = task.learners[0].ScaleGradients(var_grads)

  with self.session():
    tf.global_variables_initializer().run()
    self.assertEqual(0., scaled_grads_map.grad_scale.eval())
    # The final gradient must be finite.
    self.assertFalse(
        tf.math.is_nan(scaled_grads_map.final_var_grads.a[1]).eval())
    self.assertTrue(
        tf.math.is_finite(scaled_grads_map.final_var_grads.a[1]).eval())
def testScaleGradientsCheckNumerics(self):
  """ScaleGradients when enable_check_numerics=True."""
  FLAGS.enable_check_numerics = True
  p = self.TestParams()
  p.input = base_input_generator.BaseSequenceInputGenerator.Params()
  task = p.Instantiate()
  task.CreateVariable(
      'a',
      py_utils.WeightParams(shape=[], init=py_utils.WeightInit.Constant(0)))
  var_a = task.theta.a
  # Make a NaN gradient.
  var_grads = py_utils.NestedMap(
      a=py_utils.VarGrad(var_a, 0. * tf.math.log(0.)))
  scaled_grads_map = task.learners[0].ScaleGradients(var_grads)

  with self.session():
    tf.global_variables_initializer().run()
    self.assertEqual(0., scaled_grads_map.grad_scale.eval())
    # Fetching the gradient raises an exception with enable_check_numerics.
    with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
                                'is not finite'):
      _ = scaled_grads_map.final_var_grads.a[1].eval()
def _ComputeLossesAndGradients(self, metrics, vmap):
  p = self.params
  vmap = self.GetTrainableVariables(vmap)

  # Get TPU embedding activations to compute the gradients for.
  tpu_embedding_activations = py_utils.NestedMap()
  tpu_embedding_graph_collection = py_utils.GetTpuEmbeddingGraphCollection()
  if tpu_embedding_graph_collection:
    tpu_embedding_collection = tpu_embedding_graph_collection[0]
    task_call_scope = py_utils.GetTaskCallScope()
    tpu_embedding_activations = py_utils.NestedMap(
        tpu_embedding_collection.GetActivations(task_call_scope) or {})
    # It's possible that task_call_scope is None and its mode is not set in
    # tpu_embedding_collection (e.g., in unit tests), but if the activations
    # are not empty, the mode must have been set.
    if tpu_embedding_activations and (
        tpu_embedding_collection.ShouldStopGradient(task_call_scope)):
      tpu_embedding_activations = py_utils.NestedMap()

  for v in vmap.Flatten():
    tf.logging.info('%s: bprop variable: %s', p.name, v.name)

  def LossAndGradients(metric_name):
    """Returns (loss, var_grads) computed from metrics[metric_name]."""
    metric = metrics.get(metric_name, None)
    if metric is None:
      raise ValueError('Loss %s not found in metrics %s' %
                       (metric_name, list(metrics.keys())))
    # TODO(b/154785713): pass (loss, loss_weight) to ComputeGradients().
    loss = metric[0]
    return metric, self.optimizer.ComputeGradients(
        loss,
        vmap,
        p.grad_aggregation_method,
        p.colocate_gradients_with_ops,
        p.gate_gradients,
        compute_gradients_fn=self._CustomComputeGradientsFn(),
        skip_zero_gradients=p.skip_zero_gradients,
        skip_none_gradients=False,
        tpu_embedding_activations=tpu_embedding_activations)

  loss_name = p.loss_name or p.name
  losses = []
  eval_metrics = {}
  if isinstance(loss_name, (list, tuple)):
    assert not tpu_embedding_activations, (
        'TPU embedding does not support multiple losses currently.')
    losses_and_grads = {}
    variables = None
    for metric_name in loss_name:
      loss_metric, var_grads = LossAndGradients(metric_name)
      losses_and_grads[metric_name] = py_utils.NestedMap(
          loss_metric=loss_metric,
          grads=tf.nest.map_structure(lambda vg: vg.grad, var_grads))
      current_vars = tf.nest.map_structure(lambda vg: vg.var, var_grads)
      if variables is None:
        variables = current_vars
      else:
        tf.nest.assert_same_structure(variables, current_vars)
      losses.append(loss_metric[0])
    grads, eval_metrics = self.gradient_combiner.Combine(
        variables, losses_and_grads)
    var_grads = tf.nest.map_structure(
        lambda v, g: py_utils.VarGrad(var=v, grad=g), variables, grads)
  else:
    loss_metric, var_grads = LossAndGradients(loss_name)
    losses.append(loss_metric[0])

  return losses, py_utils.SkipNoneGradients(var_grads), eval_metrics
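# Hedged usage sketch: `metrics` maps metric names to (value, weight) tuples,
# and p.loss_name selects which entries are treated as losses. The `learner`
# and `task` objects below are assumed to come from surrounding code; the
# names here are hypothetical, for illustration only.
metrics = py_utils.NestedMap(loss=(tf.constant(1.0), tf.constant(1.0)))
losses, var_grads, eval_metrics = learner._ComputeLossesAndGradients(
    metrics, task.vars)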