def testCollectVarHistogram(self):
  """Smoke-tests CollectVarHistogram on gradients of a small test model."""
  with self.session(use_gpu=False, graph=tf.Graph()):
    params = self._testParams()
    model = params.Instantiate()
    model.FPropDefaultTheta()
    gradients = py_utils.ComputeGradients(model.loss, model.vars)
    summary_utils.CollectVarHistogram(gradients)
def AdjustGradients(self,
                    var_grads,
                    gradient_mask=None,
                    gradient_adjuster=None):
  """Adjusts gradients according to learner params.

  Args:
    var_grads: a `.NestedMap` whose values are (var, grad) pairs.
    gradient_mask: if not None, a dict mapping variable names to a 0/1 scalar.
    gradient_adjuster: if not None, a function that mutates a given var_grads.

  Returns:
    (var_grads, stats), where var_grads is a `.NestedMap` whose values
    (var, grad) pairs representing adjusted gradients, and stats is a
    `.NestedMap` containing 'has_nan_or_inf' and 'eval_metrics'.
  """
  params = self.params

  # Lp regularizers: each adds its loss term to the gradients and emits a
  # scalar summary of the regularization loss.
  if params.l2_regularizer_weight is not None:
    l2_loss, var_grads = py_utils.AdjustGradientsWithLpLoss(
        var_grads, params.l2_regularizer_weight, p=2.0)
    self._AddScalarSummary('l2_loss', l2_loss)
  if params.l1_regularizer_weight is not None:
    l1_loss, var_grads = py_utils.AdjustGradientsWithLpLoss(
        var_grads, params.l1_regularizer_weight, p=1.0)
    self._AddScalarSummary('l1_loss', l1_loss)

  # Gradient masking is applied only when a mask is provided.
  if gradient_mask:
    var_grads = py_utils.MaskGradients(var_grads, gradient_mask)

  # Gradient scaling / clipping; also reports whether any gradient was
  # NaN or Inf.
  scaled = self.ScaleGradients(var_grads, gradient_adjuster=gradient_adjuster)
  var_grads = scaled.final_var_grads

  # Histogram summary of the adjusted gradients.
  summary_utils.CollectVarHistogram(var_grads)
  stats = py_utils.NestedMap(
      has_nan_or_inf=scaled.has_nan_or_inf, eval_metrics=self._eval_metrics)
  return var_grads, stats
def AdjustGradients(self,
                    var_grads,
                    gradient_mask=None,
                    gradient_adjuster=None):
  """Adjusts gradients according to learner params.

  Args:
    var_grads: a `.NestedMap` whose values are (var, grad) pairs.
    gradient_mask: if not None, a dict mapping variable names to a 0/1 scalar.
    gradient_adjuster: if not None, a function that mutates a given var_grads.

  Returns:
    (var_grads, eval_metrics), where var_grads is a `.NestedMap` whose values
    (var, grad) pairs representing adjusted gradients.
  """
  params = self.params

  # Lp regularizers: each adds its loss term to the gradients and records
  # an eval metric with a constant weight of 1.0.
  if params.l2_regularizer_weight is not None:
    l2_loss, var_grads = py_utils.AdjustGradientsWithLpLoss(
        var_grads, params.l2_regularizer_weight, p=2.0)
    self._AddEvalMetric('l2_loss', l2_loss, tf.constant(1.0))
  if params.l1_regularizer_weight is not None:
    l1_loss, var_grads = py_utils.AdjustGradientsWithLpLoss(
        var_grads, params.l1_regularizer_weight, p=1.0)
    self._AddEvalMetric('l1_loss', l1_loss, tf.constant(1.0))

  # Gradient masking is applied only when a mask is provided.
  if gradient_mask:
    var_grads = py_utils.MaskGradients(var_grads, gradient_mask)

  # Gradient scaling (e.g. clipping) can be disabled via params.
  if params.scale_gradients:
    var_grads = self.ScaleGradients(
        var_grads, gradient_adjuster=gradient_adjuster).final_var_grads

  # Histogram summary of the adjusted gradients.
  summary_utils.CollectVarHistogram(var_grads)
  return var_grads, self._eval_metrics
def Apply(self, loss, vmap, gradient_mask=None, gradient_adjuster=None):
  """Computes updates on 'vmap' to optimize 'loss'.

  TODO(rpang): explore merging gradient_mask and gradient_adjuster.

  Args:
    loss: A scalar Tensor.
    vmap: A `.NestedMap` object containing variables to optimize.
    gradient_mask: if not None, a dict mapping variable names to a 0/1 scalar.
    gradient_adjuster: if not None, a function that mutates a given var_grads.

  Returns:
    (op, stats), where op is a tf.Operation to update variables and stats
    is a NestedMap containing 'has_nan_or_inf' and 'eval_metrics'.
  """
  # We apply gradients outside the name_scope to maintain backwards
  # compatibility on variables created by self.optimizer.Apply().
  p = self.params
  vmap = self.GetTrainableVariables(vmap)

  for v in vmap.Flatten():
    tf.logging.info('%s: bprop variable: %s', p.name, v.name)

  # Compute gradients.
  var_grads = self.optimizer.ComputeGradients(
      loss, vmap, p.grad_aggregation_method, p.colocate_gradients_with_ops,
      p.gate_gradients)

  # L2 regularizer.
  if p.l2_regularizer_weight is not None:
    l2_loss, var_grads = py_utils.AdjustGradientsWithLpLoss(
        var_grads, p.l2_regularizer_weight, p=2.0)
    self._AddScalarSummary('l2_loss', l2_loss)

  # L1 regularizer.
  if p.l1_regularizer_weight is not None:
    l1_loss, var_grads = py_utils.AdjustGradientsWithLpLoss(
        var_grads, p.l1_regularizer_weight, p=1.0)
    self._AddScalarSummary('l1_loss', l1_loss)

  # Mask gradients only if the mask is set.
  if gradient_mask:
    var_grads = py_utils.MaskGradients(var_grads, gradient_mask)

  # Apply gradient clipping. Pass gradient_adjuster by keyword to stay
  # consistent with the other ScaleGradients call sites in this file and to
  # avoid mis-binding if ScaleGradients gains additional positional
  # parameters.
  scaled_vars = self.ScaleGradients(
      var_grads, gradient_adjuster=gradient_adjuster)
  has_nan_or_inf = scaled_vars.has_nan_or_inf
  var_grads = scaled_vars.final_var_grads

  # Histogram summary of the adjusted gradients.
  summary_utils.CollectVarHistogram(var_grads)
  self._var_grads = var_grads

  # The learning-rate schedule requires a valid global step.
  assert self.theta.global_step is not None, self.theta
  lrs = self.lr_schedule.Value(self.theta.global_step)
  self._AddScalarSummary('lr_schedule', lrs)
  lr = p.learning_rate * lrs
  var_update_op = self.optimizer.Apply(lr, var_grads)
  stats = py_utils.NestedMap(
      has_nan_or_inf=has_nan_or_inf, eval_metrics=self._eval_metrics)
  return var_update_op, stats
def _BPropForVariables(self, vmap):
  """Constructs the backward graph for the given variables.

  Builds the complete training update and stores it in `self._train_op`:
  gradient computation, Lp regularization, per-input gradient masking,
  gradient scaling/clipping, the optimizer update, global-step increments,
  batch-norm updates, stats updates, post-training-step updates, and weight
  mask/threshold updates.

  Side effects:
    Sets `self._var_grads` and `self._train_op`.

  Args:
    vmap: a `.NestedMap` of variables.
  """
  p = self.params
  tp = p.train
  # Compute gradients.
  self._var_grads = py_utils.ComputeGradients(self.loss, vmap)
  # L2 regularizer: adds its loss term to the gradients and emits a summary.
  if tp.l2_regularizer_weight is not None:
    l2_loss, self._var_grads = py_utils.AdjustGradientsWithLpLoss(
        self._var_grads, tp.l2_regularizer_weight, p=2.0)
    summary_utils.scalar(p, 'l2_loss', l2_loss)
  # L1 regularizer.
  if tp.l1_regularizer_weight is not None:
    l1_loss, self._var_grads = py_utils.AdjustGradientsWithLpLoss(
        self._var_grads, tp.l1_regularizer_weight, p=1.0)
    summary_utils.scalar(p, 'l1_loss', l1_loss)
  # Mask gradients only if the mask is set; the one-hot input-source vector
  # selects which per-input mask applies to the current batch.
  if self._per_input_gradient_mask:
    bprop_onehot = self.input_generator.GetInputSourceOneHot()
    self._var_grads = py_utils.MaskGradients(
        self._var_grads, self._per_input_gradient_mask, bprop_onehot)
  # Apply gradient clipping; also reports whether any gradient was NaN/Inf.
  has_nan_or_inf, _, self._var_grads = self.ScaleGradients(self._var_grads)
  # Histogram summary of the adjusted gradients.
  summary_utils.CollectVarHistogram(p, self._var_grads)
  # Learning rate = base rate scaled by the schedule value at global step.
  lrs = self.lr_schedule.Value(self._global_step)
  summary_utils.scalar(p, 'lr_schedule', lrs)
  lr = tp.learning_rate * lrs
  var_update_op = self.optimizer.Apply(lr, self._var_grads)
  # Increment the shared global step (and the task-level one when present),
  # colocating each increment with its variable.
  increment_global_step_ops = []
  with tf.colocate_with(self._shared_global_step):
    increment_global_step_ops.append(
        tf.assign_add(self._shared_global_step, 1))
  if self._task_global_step:
    with tf.colocate_with(self._task_global_step):
      increment_global_step_ops.append(
          tf.assign_add(self._task_global_step, 1))
  increment_global_steps = tf.group(*increment_global_step_ops)
  # Only run batch-norm updates that are relevant to this loss.
  relevant_bn_updates, _ = py_utils.FindRelevantBatchNormUpdates(
      self.loss, tf.get_collection(py_utils.BATCH_NORM_UPDATES))
  batch_norm_updates = tf.group(*relevant_bn_updates)
  # Update stats.
  stats_updates = tf.group(
      self.IncrementTotalSamples(),
      self.IncrementTotalNans(tf.to_int32(has_nan_or_inf)))
  # Post training step update.
  post_training_step_updates = self.PostTrainingStepUpdate(self._global_step)
  # Get the op to update the weight masks and thresholds.
  mask_update_op = self._GetMaskUpdateOp()
  # TODO(rpang): try to structure _train_op as:
  # tf.cond(skip_step, <only update skip stats>, <all updates>)
  # so that we skip all other updates when a step is skipped.
  if p.contiguous:
    var_update_op = tf.group(var_update_op, self.last_state_group_op)
  # Group every update into a single op for the training loop to run.
  self._train_op = tf.group(
      var_update_op,
      batch_norm_updates,
      stats_updates,
      post_training_step_updates,
      increment_global_steps,
      mask_update_op,
      name='train')