Code example #1
File: base_model.py  Project: fanlu/lingvo
    def ScaleGradients(self, var_grads):
        """Scales gradients according to training params.

        Args:
          var_grads: a `.NestedMap` whose values are (var, grad) pairs.

        Returns:
          (has_nan_or_inf, grad_scale, final_var_grads).

          - has_nan_or_inf: a scalar of 0 or 1, indicating whether there is any NaN
            or Inf in input gradients.
          - grad_scale: the gradient scale. 0 if gradient updates should be skipped
            for the step.
          - final_var_grads: a `.NestedMap` whose values are (var, grad) pairs, where
            gradients have already been scaled.
        """
        p = self.params
        tp = p.train

        # Computes gradients' norm and adds their summaries. Note that all_grad_norm
        # may be nan, which may cause grad_scale to be nan.
        for name, vg in var_grads.FlattenItems():
            summary_utils.AddNormSummary(p, name, py_utils.NestedMap(s=vg))
        _, all_grad_norm = summary_utils.AddNormSummary(p, 'all', var_grads)
        grad_norm_is_nan_or_inf = tf.logical_or(tf.is_nan(all_grad_norm),
                                                tf.is_inf(all_grad_norm))

        # Optional gradient adjustment. Note that this happens after computing
        # all_grad_norm.
        var_grads = self.AdjustGradients(var_grads)

        # Handles NaN/Inf gradients.
        has_nan_or_inf = self._HasNanOrInf(var_grads)
        # Grad norm can still be inf even if none of the individual grads is inf.
        has_nan_or_inf = tf.logical_or(has_nan_or_inf, grad_norm_is_nan_or_inf)

        # Computes gradient's scale.
        grad_scale = tf.constant(1.0)
        if tp.clip_gradient_norm_to_value:
            # If all_grad_norm > tp.clip_gradient_norm_to_value, scales all
            # grads so that the scaled global norm equals
            # tp.clip_gradient_norm_to_value.
            grad_scale = tf.minimum(
                1.0, tp.clip_gradient_norm_to_value / all_grad_norm)

        if tp.grad_norm_to_clip_to_zero:
            # If all_grad_norm > tp.grad_norm_to_clip_to_zero, treats
            # grad_scale as 0. This way, we ignore this step.
            grad_scale *= tf.cast(all_grad_norm < tp.grad_norm_to_clip_to_zero,
                                  p.dtype)

        if tp.grad_norm_tracker:
            grad_scale *= self.grad_norm_tracker.FPropDefaultTheta(
                all_grad_norm, has_nan_or_inf)

        # Force grad_scale to be 0 if there is any NaN or Inf in gradients.
        grad_scale = tf.where(has_nan_or_inf, 0.0, grad_scale)

        summary_utils.scalar(p, 'grad_scale_all', grad_scale)
        final_var_grads = py_utils.ApplyGradMultiplier(var_grads, grad_scale)
        return has_nan_or_inf, grad_scale, final_var_grads
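
Note (added, not part of the scraped source): the scaling rule above can be summarized without lingvo. Below is a minimal NumPy sketch of the same arithmetic; the function name compute_grad_scale and the parameters clip_norm_to_value and norm_to_clip_to_zero are made up here and only mirror tp.clip_gradient_norm_to_value and tp.grad_norm_to_clip_to_zero, and the grad-norm-tracker path is omitted.

import numpy as np

def compute_grad_scale(grads, clip_norm_to_value=0.0, norm_to_clip_to_zero=0.0):
    """Sketch of ScaleGradients' scalar logic over plain NumPy arrays."""
    all_grad_norm = np.sqrt(sum(np.sum(np.square(g)) for g in grads))
    has_nan_or_inf = not np.isfinite(all_grad_norm)

    grad_scale = 1.0
    if clip_norm_to_value:
        # Shrink so that the scaled global norm is at most clip_norm_to_value.
        grad_scale = min(1.0, clip_norm_to_value / all_grad_norm)
    if norm_to_clip_to_zero:
        # A very large norm zeroes the scale, i.e. the step is effectively skipped.
        grad_scale *= float(all_grad_norm < norm_to_clip_to_zero)
    if has_nan_or_inf:
        # Mirrors tf.where(has_nan_or_inf, 0.0, grad_scale).
        grad_scale = 0.0

    return has_nan_or_inf, grad_scale, [g * grad_scale for g in grads]

grads = [np.full(3, 4.0), np.full((2, 2), 3.0)]
print(compute_grad_scale(grads, clip_norm_to_value=1.0))  # scale ~= 1 / 9.17
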
Code example #2
File: model.py  Project: zy1620454507/lingvo
    def BProp(self):
        super(RNMTModel, self).BProp()

        p = self.params
        if p.add_summary:
            vg = self._var_grads
            # Computes gradients' norm and adds their summaries.
            emb_grads = []
            rnn_grads = []
            atten_grads = []
            softmax_grads = []
            if 'enc' in vg:
                emb_grads += [vg.enc.emb] if 'emb' in vg.enc else []
                rnn_grads += [vg.enc.rnn] if 'rnn' in vg.enc else []
            if 'dec' in vg:
                emb_grads += [vg.dec.emb] if 'emb' in vg.dec else []
                rnn_grads += [vg.dec.frnn] if 'frnn' in vg.dec else []
                if 'softmax' in vg.dec:
                    softmax_grads += [vg.dec.softmax]
                if 'frnn_with_atten' in vg.dec:
                    if 'cell' in vg.dec.frnn_with_atten:
                        rnn_grads += [vg.dec.frnn_with_atten.cell]
                    if 'atten' in vg.dec.frnn_with_atten:
                        atten_grads += [vg.dec.frnn_with_atten.atten]

            if emb_grads:
                summary_utils.AddNormSummary('emb', emb_grads)
            if rnn_grads:
                summary_utils.AddNormSummary('lstm', rnn_grads)
            if atten_grads:
                summary_utils.AddNormSummary('atten', atten_grads)
            if softmax_grads:
                summary_utils.AddNormSummary('softmax', softmax_grads)
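
Added note: the method above only buckets gradient sub-trees by component before emitting per-group norm summaries. The following is a dictionary-based sketch of the same bucketing, with made-up keys and plain NumPy arrays standing in for the (var, grad) pairs; it is not lingvo code.

import numpy as np

# Hypothetical stand-in for self._var_grads: component -> name -> gradient array.
var_grads = {
    'enc': {'emb': np.ones(4), 'rnn': np.full(3, 2.0)},
    'dec': {'emb': np.ones(4), 'frnn': np.full(2, 3.0), 'softmax': np.ones(5)},
}

def group_norm(grads):
    # Global L2 norm over every array in the group, like a per-group norm summary.
    return float(np.sqrt(sum(np.sum(np.square(g)) for g in grads)))

groups = {'emb': [], 'lstm': [], 'atten': [], 'softmax': []}
enc, dec = var_grads.get('enc', {}), var_grads.get('dec', {})
groups['emb'] += [enc['emb']] if 'emb' in enc else []
groups['lstm'] += [enc['rnn']] if 'rnn' in enc else []
groups['emb'] += [dec['emb']] if 'emb' in dec else []
groups['lstm'] += [dec['frnn']] if 'frnn' in dec else []
groups['softmax'] += [dec['softmax']] if 'softmax' in dec else []

for name, grads in groups.items():
    if grads:  # only emit a summary when the group is non-empty
        print(name, group_norm(grads))
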
Code example #3
    def BProp(self):
        super(TransformerModel, self).BProp()
        # Computes gradients' norm and adds their summaries.
        p = self.params
        vg = self._var_grads
        emb_vg = py_utils.NestedMap()
        emb_vg.child = [vg.enc.token_emb, vg.dec.token_emb]

        # Note that positional embedding layer has no trainable variable
        # if its trainable_scaling is false.
        if 'position_emb' in vg.enc:
            emb_vg.child += [vg.enc.position_emb]
        if 'position_emb' in vg.dec:
            emb_vg.child += [vg.dec.position_emb]
        summary_utils.AddNormSummary(p, 'emb', emb_vg)
        summary_utils.AddNormSummary(
            p, 'atten', [vg.enc.transformer_stack.trans, vg.dec.trans])
        summary_utils.AddNormSummary(p, 'softmax', vg.dec.softmax)
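
Added note: AddNormSummary itself is not shown on this page, but both snippets above pass it nested containers of gradients, and example #4 below computes its norm as sqrt(SumSquared(...)) over a flattened NestedMap. The snippet below is a small, lingvo-free sketch of that flatten-then-norm step, using a plain dict/list structure in place of a NestedMap.

import numpy as np

def flatten(structure):
    # Recursively flattens nested dicts/lists of arrays, roughly like NestedMap.Flatten().
    if isinstance(structure, dict):
        return [leaf for value in structure.values() for leaf in flatten(value)]
    if isinstance(structure, (list, tuple)):
        return [leaf for value in structure for leaf in flatten(value)]
    return [structure]

def global_norm(structure):
    # sqrt of the sum of squares over every array in the (nested) structure.
    return float(np.sqrt(sum(np.sum(np.square(x)) for x in flatten(structure))))

# Hypothetical 'emb' group: token embeddings plus an optional positional embedding.
emb_vg = {'child': [np.ones(3), np.full(2, 2.0)]}
print(global_norm(emb_vg))  # sqrt(3*1 + 2*4) = sqrt(11)
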
Code example #4
File: learner.py  Project: lbxcfx/lingvo
    def ScaleGradients(self, var_grads, gradient_adjuster=None):
        """Scales gradients according to training params.

        Args:
          var_grads: a `.NestedMap` whose values are (var, grad) pairs.
          gradient_adjuster: if not None, a function that mutates a given var_grads.

        Returns:
          A `.NestedMap` containing:
          - has_nan_or_inf: a scalar of 0 or 1, indicating whether there is any NaN
            or Inf in input gradients.
          - final_var_grads: a `.NestedMap` whose values are (var, grad) pairs,
            where gradients have already been scaled.
          - grad_scale: the gradient scale. 0 if gradient updates should be skipped
            for the step. (Optional, only returned in case global norm clipping is
            used.)
        """
        p = self.params

        # Computes gradients' norm and adds their summaries. Note that all_grad_norm
        # may be nan, which may cause grad_scale to be nan.
        for name, vg in var_grads.FlattenItems():
            summary_utils.AddNormSummary(name + '/' + p.name,
                                         py_utils.NestedMap(s=vg))
        all_grad_norm = tf.sqrt(
            py_utils.SumSquared([
                g for (_, g) in py_utils.NestedMap(child=var_grads).Flatten()
            ]))
        all_var_norm = tf.sqrt(
            py_utils.SumSquared([
                v for (v, _) in py_utils.NestedMap(child=var_grads).Flatten()
            ]))
        grad_norm_is_nan_or_inf = tf.logical_or(tf.is_nan(all_grad_norm),
                                                tf.is_inf(all_grad_norm))

        # Optional gradient adjustment. Note that this happens after computing
        # all_grad_norm.
        if gradient_adjuster is not None:
            tf.logging.info('gradient_adjuster=%s', gradient_adjuster)
            var_grads = gradient_adjuster(var_grads)

        # Handles NaN/Inf gradients.
        has_nan_or_inf = py_utils.HasNanOrInfGradient(var_grads)
        # Grad norm can still be inf even if none of the individual grads is inf.
        has_nan_or_inf = tf.logical_or(has_nan_or_inf, grad_norm_is_nan_or_inf)

        return_values = py_utils.NestedMap()
        if p.clip_gradient_single_norm_to_value:
            # Currently using both types of clipping simultaneously is unsupported.
            if p.clip_gradient_norm_to_value:
                raise ValueError(
                    'Cannot use clip_gradient_single_norm_to_value=%f and '
                    'clip_gradient_norm_to_value=%f.' %
                    (p.clip_gradient_single_norm_to_value,
                     p.clip_gradient_norm_to_value))
            final_var_grads = py_utils.ApplyGradNormCliping(
                var_grads, p.clip_gradient_single_norm_to_value)

        else:
            grad_scale = self._GetGlobalGradScale(all_grad_norm,
                                                  has_nan_or_inf)
            self._AddEvalMetric('grad_norm/all', all_grad_norm,
                                tf.constant(1.0))
            self._AddEvalMetric('var_norm/all', all_var_norm, tf.constant(1.0))
            self._AddEvalMetric('grad_scale_all', grad_scale, tf.constant(1.0))
            final_var_grads = py_utils.ApplyGradMultiplier(
                var_grads, grad_scale)
            return_values.grad_scale = grad_scale

        return_values.has_nan_or_inf = has_nan_or_inf
        return_values.final_var_grads = final_var_grads
        return return_values
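
Added note: the branching above separates per-tensor clipping (clip_gradient_single_norm_to_value, applied via py_utils.ApplyGradNormCliping) from global-norm scaling (a single grad_scale applied to every gradient via ApplyGradMultiplier). The sketch below contrasts the two on NumPy arrays; the helper names are invented and the exact semantics of the lingvo helpers are assumed, not copied.

import numpy as np

def clip_each_to_norm(grads, max_norm):
    # Per-tensor clipping: every gradient is rescaled on its own.
    out = []
    for g in grads:
        norm = np.sqrt(np.sum(np.square(g)))
        out.append(g * min(1.0, max_norm / norm) if norm > 0 else g)
    return out

def scale_by_global_norm(grads, max_norm):
    # Global scaling: one shared grad_scale applied to all gradients.
    global_norm = np.sqrt(sum(np.sum(np.square(g)) for g in grads))
    grad_scale = min(1.0, max_norm / global_norm) if global_norm > 0 else 1.0
    return [g * grad_scale for g in grads], grad_scale

grads = [np.full(2, 3.0), np.full(3, 4.0)]
print([round(float(np.linalg.norm(g)), 3) for g in clip_each_to_norm(grads, 1.0)])  # each norm <= 1
scaled, grad_scale = scale_by_global_norm(grads, 1.0)
print(round(float(np.sqrt(sum(np.sum(np.square(g)) for g in scaled))), 3), round(float(grad_scale), 3))
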