def ScaleGradients(self, var_grads):
  """Scales gradients according to training params.

  Args:
    var_grads: a `.NestedMap` whose values are (var, grad) pairs.

  Returns:
    (has_nan_or_inf, grad_scale, final_var_grads).

    - has_nan_or_inf: a scalar of 0 or 1, indicating whether there is any NaN
      or Inf in the input gradients.
    - grad_scale: the gradient scale. 0 if gradient updates should be skipped
      for this step.
    - final_var_grads: a `.NestedMap` whose values are (var, grad) pairs, where
      gradients have already been scaled.
  """
  p = self.params
  tp = p.train

  # Computes the gradients' norms and adds their summaries. Note that
  # all_grad_norm may be nan, which may cause grad_scale to be nan.
  for name, vg in var_grads.FlattenItems():
    summary_utils.AddNormSummary(p, name, py_utils.NestedMap(s=vg))
  _, all_grad_norm = summary_utils.AddNormSummary(p, 'all', var_grads)
  grad_norm_is_nan_or_inf = tf.logical_or(
      tf.is_nan(all_grad_norm), tf.is_inf(all_grad_norm))

  # Optional gradient adjustment. Note that this happens after computing
  # all_grad_norm.
  var_grads = self.AdjustGradients(var_grads)

  # Handles NaN/Inf gradients.
  has_nan_or_inf = self._HasNanOrInf(var_grads)
  # The global grad norm can still be inf even if none of the individual
  # gradients is inf.
  has_nan_or_inf = tf.logical_or(has_nan_or_inf, grad_norm_is_nan_or_inf)

  # Computes the gradient scale.
  grad_scale = tf.constant(1.0)
  if tp.clip_gradient_norm_to_value:
    # If all_grad_norm > tp.clip_gradient_norm_to_value, scales all grads so
    # that the global norm equals tp.clip_gradient_norm_to_value.
    grad_scale = tf.minimum(1.0,
                            tp.clip_gradient_norm_to_value / all_grad_norm)
  if tp.grad_norm_to_clip_to_zero:
    # If all_grad_norm > tp.grad_norm_to_clip_to_zero, treats grad_scale as 0,
    # which effectively skips this step.
    grad_scale *= tf.cast(all_grad_norm < tp.grad_norm_to_clip_to_zero,
                          p.dtype)
  if tp.grad_norm_tracker:
    grad_scale *= self.grad_norm_tracker.FPropDefaultTheta(
        all_grad_norm, has_nan_or_inf)

  # Force grad_scale to be 0 if there is any NaN or Inf in the gradients.
  grad_scale = tf.where(has_nan_or_inf, 0.0, grad_scale)
  summary_utils.scalar(p, 'grad_scale_all', grad_scale)

  final_var_grads = py_utils.ApplyGradMultiplier(var_grads, grad_scale)
  return has_nan_or_inf, grad_scale, final_var_grads
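
# The following is a minimal, self-contained sketch of the same scaling idea
# (scale the whole gradient set by one factor derived from the global norm,
# and zero the step on NaN/Inf), written with plain TensorFlow 1.x ops rather
# than the helpers used above. The `clip_norm` and `zero_norm` parameters are
# illustrative stand-ins for tp.clip_gradient_norm_to_value and
# tp.grad_norm_to_clip_to_zero; this is not the library implementation.
import tensorflow as tf


def _scale_gradients_sketch(grads, clip_norm=1.0, zero_norm=0.0):
  """Returns (has_nan_or_inf, grad_scale, scaled_grads) for a list of grads."""
  all_grad_norm = tf.global_norm(grads)
  has_nan_or_inf = tf.logical_or(
      tf.is_nan(all_grad_norm), tf.is_inf(all_grad_norm))
  # Scale down so that the global norm is at most clip_norm.
  grad_scale = tf.minimum(1.0, clip_norm / all_grad_norm)
  if zero_norm:
    # Skip the step entirely if the norm exceeds zero_norm.
    grad_scale *= tf.cast(all_grad_norm < zero_norm, tf.float32)
  # Skip the step on NaN/Inf gradients.
  grad_scale = tf.where(has_nan_or_inf, 0.0, grad_scale)
  scaled_grads = [g * grad_scale for g in grads]
  return has_nan_or_inf, grad_scale, scaled_grads
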
def BProp(self):
  super(RNMTModel, self).BProp()
  p = self.params
  if p.add_summary:
    vg = self._var_grads
    # Computes gradients' norms and adds their summaries.
    emb_grads = []
    rnn_grads = []
    atten_grads = []
    softmax_grads = []
    if 'enc' in vg:
      emb_grads += [vg.enc.emb] if 'emb' in vg.enc else []
      rnn_grads += [vg.enc.rnn] if 'rnn' in vg.enc else []
    if 'dec' in vg:
      emb_grads += [vg.dec.emb] if 'emb' in vg.dec else []
      rnn_grads += [vg.dec.frnn] if 'frnn' in vg.dec else []
      softmax_grads += [vg.dec.softmax] if 'softmax' in vg.dec else []
      if 'frnn_with_atten' in vg.dec:
        if 'cell' in vg.dec.frnn_with_atten:
          rnn_grads += [vg.dec.frnn_with_atten.cell]
        if 'atten' in vg.dec.frnn_with_atten:
          atten_grads += [vg.dec.frnn_with_atten.atten]

    if emb_grads:
      summary_utils.AddNormSummary('emb', emb_grads)
    if rnn_grads:
      summary_utils.AddNormSummary('lstm', rnn_grads)
    if atten_grads:
      summary_utils.AddNormSummary('atten', atten_grads)
    if softmax_grads:
      summary_utils.AddNormSummary('softmax', softmax_grads)
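
# Illustrative-only sketch of the per-group norm summaries assembled above:
# gradients are bucketed by sub-module and one scalar summary is emitted per
# non-empty bucket. Written against plain TensorFlow 1.x; the group names and
# the `grads_by_group` dict are hypothetical, not part of the model's API.
import tensorflow as tf


def _add_group_norm_summaries_sketch(grads_by_group):
  """Adds a scalar summary `grad_norm/<group>` per group of gradient tensors."""
  for group_name, grads in grads_by_group.items():
    if not grads:
      continue  # Skip groups that contributed no gradients.
    tf.summary.scalar('grad_norm/' + group_name, tf.global_norm(grads))


# Example use, assuming lists of gradient tensors collected as above:
#   _add_group_norm_summaries_sketch({'emb': emb_grads, 'lstm': rnn_grads})
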
def BProp(self):
  super(TransformerModel, self).BProp()
  # Computes gradients' norms and adds their summaries.
  p = self.params
  vg = self._var_grads
  emb_vg = py_utils.NestedMap()
  emb_vg.child = [vg.enc.token_emb, vg.dec.token_emb]
  # Note that the positional embedding layer has no trainable variables
  # if its trainable_scaling is false.
  if 'position_emb' in vg.enc:
    emb_vg.child += [vg.enc.position_emb]
  if 'position_emb' in vg.dec:
    emb_vg.child += [vg.dec.position_emb]
  summary_utils.AddNormSummary(p, 'emb', emb_vg)
  summary_utils.AddNormSummary(
      p, 'atten', [vg.enc.transformer_stack.trans, vg.dec.trans])
  summary_utils.AddNormSummary(p, 'softmax', vg.dec.softmax)
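
# A small check of what the combined 'emb' summary measures: the norm over the
# token and (optional) positional embedding gradients together is
# sqrt(||token||^2 + ||pos||^2), i.e. one number for the whole group rather
# than one per tensor. Hypothetical constant tensors, plain TensorFlow 1.x.
import tensorflow as tf

tok_g = tf.constant([[3.0, 0.0]])           # ||token|| = 3
pos_g = tf.constant([4.0])                  # ||pos||   = 4
combined = tf.global_norm([tok_g, pos_g])   # sqrt(3^2 + 4^2) = 5
with tf.Session() as sess:
  print(sess.run(combined))  # 5.0
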
def ScaleGradients(self, var_grads, gradient_adjuster=None):
  """Scales gradients according to training params.

  Args:
    var_grads: a `.NestedMap` whose values are (var, grad) pairs.
    gradient_adjuster: if not None, a function that mutates a given var_grads.

  Returns:
    A `.NestedMap` containing:

    - has_nan_or_inf: a scalar of 0 or 1, indicating whether there is any NaN
      or Inf in the input gradients.
    - final_var_grads: a `.NestedMap` whose values are (var, grad) pairs, where
      gradients have already been scaled.
    - grad_scale: the gradient scale. 0 if gradient updates should be skipped
      for this step. (Optional; only returned when global norm clipping is
      used.)
  """
  p = self.params

  # Computes the gradients' norms and adds their summaries. Note that
  # all_grad_norm may be nan, which may cause grad_scale to be nan.
  for name, vg in var_grads.FlattenItems():
    summary_utils.AddNormSummary(name + '/' + p.name,
                                 py_utils.NestedMap(s=vg))
  all_grad_norm = tf.sqrt(
      py_utils.SumSquared(
          [g for (_, g) in py_utils.NestedMap(child=var_grads).Flatten()]))
  all_var_norm = tf.sqrt(
      py_utils.SumSquared(
          [v for (v, _) in py_utils.NestedMap(child=var_grads).Flatten()]))
  grad_norm_is_nan_or_inf = tf.logical_or(
      tf.is_nan(all_grad_norm), tf.is_inf(all_grad_norm))

  # Optional gradient adjustment. Note that this happens after computing
  # all_grad_norm.
  if gradient_adjuster is not None:
    tf.logging.info('gradient_adjuster=%s', gradient_adjuster)
    var_grads = gradient_adjuster(var_grads)

  # Handles NaN/Inf gradients.
  has_nan_or_inf = py_utils.HasNanOrInfGradient(var_grads)
  # The global grad norm can still be inf even if none of the individual
  # gradients is inf.
  has_nan_or_inf = tf.logical_or(has_nan_or_inf, grad_norm_is_nan_or_inf)

  return_values = py_utils.NestedMap()
  if p.clip_gradient_single_norm_to_value:
    # Currently using both types of clipping simultaneously is unsupported.
    if p.clip_gradient_norm_to_value:
      raise ValueError(
          'Cannot use clip_gradient_single_norm_to_value=%f and '
          'clip_gradient_norm_to_value=%f.' %
          (p.clip_gradient_single_norm_to_value,
           p.clip_gradient_norm_to_value))
    final_var_grads = py_utils.ApplyGradNormCliping(
        var_grads, p.clip_gradient_single_norm_to_value)
  else:
    grad_scale = self._GetGlobalGradScale(all_grad_norm, has_nan_or_inf)
    self._AddEvalMetric('grad_norm/all', all_grad_norm, tf.constant(1.0))
    self._AddEvalMetric('var_norm/all', all_var_norm, tf.constant(1.0))
    self._AddEvalMetric('grad_scale_all', grad_scale, tf.constant(1.0))
    final_var_grads = py_utils.ApplyGradMultiplier(var_grads, grad_scale)
    return_values.grad_scale = grad_scale

  return_values.has_nan_or_inf = has_nan_or_inf
  return_values.final_var_grads = final_var_grads
  return return_values
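
# Rough, standalone sketch contrasting the two clipping modes selected above:
# per-variable norm clipping (each gradient clipped independently) versus a
# single global scale shared by all gradients. Written with plain TensorFlow
# 1.x ops over a list of gradient tensors; the `single_norm` and
# `global_norm_limit` parameters are illustrative, not the layer's params.
import tensorflow as tf


def _clip_per_variable_sketch(grads, single_norm):
  # Each gradient is clipped to `single_norm` independently, so relative
  # magnitudes between variables may change.
  return [tf.clip_by_norm(g, single_norm) for g in grads]


def _clip_by_global_scale_sketch(grads, global_norm_limit):
  # All gradients share one scale, so the global norm is at most the limit
  # and relative magnitudes between variables are preserved.
  scale = tf.minimum(1.0, global_norm_limit / tf.global_norm(grads))
  return [g * scale for g in grads]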