Esempio n. 1
0
    def testScaleGradientsCheckNumerics(self):
        """ScaleGradients when enable_check_numerics=True."""
        FLAGS.enable_check_numerics = True
        p = self.TestParams()
        p.input = base_input_generator.BaseSequenceInputGenerator.Params()
        task = p.Instantiate()
        task.CreateVariable(
            'a',
            py_utils.WeightParams(shape=[],
                                  init=py_utils.WeightInit.Constant(0)))
        var_a = task.theta.a
        # Make a NaN gradient.
        var_grads = py_utils.NestedMap(a=(var_a, 0. * tf.log(0.)))
        scaled_grads_map = task.learners[0].ScaleGradients(var_grads)

        with self.session():
            tf.global_variables_initializer().run()
            with self.assertRaisesRegexp(tf.errors.InvalidArgumentError,
                                         'is not finite'):
                self.assertTrue(scaled_grads_map.has_nan_or_inf.eval())
                self.assertEqual(0., scaled_grads_map.grad_scale.eval())
                # The final gradient must be finite.
                self.assertFalse(
                    tf.is_nan(scaled_grads_map.final_var_grads.a[1]).eval())
                self.assertTrue(
                    tf.is_finite(scaled_grads_map.final_var_grads.a[1]).eval())
Esempio n. 2
0
    def QuantizeTensors(self, t_name, ts, eval_only=False):
        p = self.params
        # Always straddle a real zero point.
        if p.is_eval:
            # At eval/inference time, use the memorized range.
            # Important: Don't capture these variables in training mode so as to
            # avoid extra/unnecessary captures.
            min_var = self._GetQStateVar(t_name, 'min')
            max_var = self._GetQStateVar(t_name, 'max')
            return [
                self._MaybeFakeQuant(t, min_var, max_var, num_bits=p.bits)
                for t in ts
            ]
        else:
            # At training time, use the batch calculated min/max.
            accumulator_name = self._GetAccumulatorNameForTensor(t_name)
            # Calculate min/max for all tensors.
            batch_min = 0.0
            batch_max = 0.0
            for t in ts:
                batch_min = tf.minimum(tf.reduce_min(t), batch_min)
                batch_max = tf.maximum(tf.reduce_max(t), batch_max)

            # New state.
            state1 = tf.stack([1.0, batch_min, batch_max])
            self.accumulators[accumulator_name].Update(state1)

            # Results.
            ts_out = []
            for i, t in enumerate(ts):
                if eval_only:
                    # If only quantizing at eval time, still record ranges as above
                    # but don't quantize.
                    quant_t = t
                else:
                    # If quantizing during training, skip quantization if it produces
                    # NANs. Sometimes early in the training process, things are unstable
                    # and ranges can produce numerical instability that makes it
                    # impossible to perform a fake_quant.
                    quant_t = self._MaybeFakeQuant(t,
                                                   batch_min,
                                                   batch_max,
                                                   num_bits=p.bits)
                    # TODO(laurenzo): Plumb quant_t_has_nans through state and report.
                    quant_t_has_nans = tf.is_nan(quant_t)
                    quant_t = tf.where(quant_t_has_nans, t, quant_t)
                ts_out.append(quant_t)
                summary_utils.histogram(
                    '%s/%s_%d' % (self._qvars_scope.name, t_name, i), t)
            return ts_out
Esempio n. 3
0
 def QuantizeWeight(self, w):
     p = self.params
     w_min = tf.reduce_min(w)
     w_max = tf.reduce_max(w)
     # NOTE: We force a small, non-zero range because otherwise, zero weights
     # can cause downstream inference engines to blow up.
     w_min = tf.minimum(w_min, -p.quantize_weight_epsilon)
     w_max = tf.maximum(w_max, p.quantize_weight_epsilon)
     quant_w = self._MaybeFakeQuant(w, w_min, w_max, num_bits=p.bits)
     if p.is_eval:
         return quant_w
     else:
         # If quantizing during training, skip quantization if it produces
         # NANs. Sometimes early in the training process, things are unstable
         # and ranges can produce numerical instability that makes it
         # impossible to perform a fake_quant.
         quant_w_has_nans = tf.is_nan(quant_w)
         return tf.where(quant_w_has_nans, w, quant_w)
Esempio n. 4
0
  def testScaleGradients(self):
    p = self.TestParams()
    p.input = base_input_generator.BaseSequenceInputGenerator.Params()
    task = p.Instantiate()
    task.CreateVariable(
        'a',
        py_utils.WeightParams(shape=[], init=py_utils.WeightInit.Constant(0)))
    var_a = task.theta.a
    var_grads = py_utils.NestedMap(a=(var_a, tf.ones_like(var_a)))
    scaled_grads_map = task.learners[0].ScaleGradients(var_grads)

    FLAGS.enable_check_numerics = False
    with self.session():
      tf.global_variables_initializer().run()
      self.assertEqual(1.0, scaled_grads_map.grad_scale.eval())
      # The final gradient must be finite.
      self.assertFalse(tf.is_nan(scaled_grads_map.final_var_grads.a[1]).eval())
      self.assertTrue(
          tf.is_finite(scaled_grads_map.final_var_grads.a[1]).eval())
Esempio n. 5
0
    def ScaleGradients(self, var_grads, gradient_adjuster=None):
        """Scales gradients according to training params.

    Args:
      var_grads: a `.NestedMap` whose values are (var, grad) pairs.
      gradient_adjuster: if not None, a function that mutates a given var_grads.

    Returns:
      A `.NestedMap` containing:
      - has_nan_or_inf: a scalar of 0 or 1, indicating whether there is any NaN
        or Inf in input gradients.
      - final_var_grads: a `.NestedMap` whose values are (var, grad) pairs,
        where gradients have already been scaled.
      - grad_scale: the gradient scale. 0 if gradient updates should be skipped
        for the step. (Optional, only returned in case global norm clipping is
        used.)
    """
        p = self.params

        # Computes gradients' norm and adds their summaries. Note that all_grad_norm
        # may be nan, which may cause grad_scale to be nan.
        for name, vg in var_grads.FlattenItems():
            summary_utils.AddNormSummary(name + '/' + p.name,
                                         py_utils.NestedMap(s=vg))
        all_grad_norm = tf.sqrt(
            py_utils.SumSquared([
                g for (_, g) in py_utils.NestedMap(child=var_grads).Flatten()
            ]))
        all_var_norm = tf.sqrt(
            py_utils.SumSquared([
                v for (v, _) in py_utils.NestedMap(child=var_grads).Flatten()
            ]))
        grad_norm_is_nan_or_inf = tf.logical_or(tf.is_nan(all_grad_norm),
                                                tf.is_inf(all_grad_norm))

        # Optional gradient adjustment. Note that this happens after computing
        # all_grad_norm.
        if gradient_adjuster is not None:
            tf.logging.info('gradient_adjuster=%s', gradient_adjuster)
            var_grads = gradient_adjuster(var_grads)

        # Handles NaN/Inf gradients.
        has_nan_or_inf = py_utils.HasNanOrInfGradient(var_grads)
        # Grad norm can still be inf even if none of the individual grad is inf.
        has_nan_or_inf = tf.logical_or(has_nan_or_inf, grad_norm_is_nan_or_inf)

        return_values = py_utils.NestedMap()
        if p.clip_gradient_single_norm_to_value:
            # Currently using both types of clipping simultaneously is unsupported.
            if p.clip_gradient_norm_to_value:
                raise ValueError(
                    'Cannot use clip_gradient_single_norm_to_value=%f and '
                    'clip_gradient_norm_to_value=%f.' %
                    (p.clip_gradient_single_norm_to_value,
                     p.clip_gradient_norm_to_value))
            final_var_grads = py_utils.ApplyGradNormCliping(
                var_grads, p.clip_gradient_single_norm_to_value)

        else:
            grad_scale = self._GetGlobalGradScale(all_grad_norm,
                                                  has_nan_or_inf)
            self._AddEvalMetric('grad_norm/all', all_grad_norm,
                                tf.constant(1.0))
            self._AddEvalMetric('var_norm/all', all_var_norm, tf.constant(1.0))
            self._AddEvalMetric('grad_scale_all', grad_scale, tf.constant(1.0))
            final_var_grads = py_utils.ApplyGradMultiplier(
                var_grads, grad_scale)
            return_values.grad_scale = grad_scale

        return_values.has_nan_or_inf = has_nan_or_inf
        return_values.final_var_grads = final_var_grads
        return return_values