# Example 1
 def testCollectVarHistogram(self):
     """Smoke test: CollectVarHistogram runs on the test model's gradients."""
     with self.session(use_gpu=False, graph=tf.Graph()):
         params = self._testParams()
         model = params.Instantiate()
         model.FPropDefaultTheta()
         grads = py_utils.ComputeGradients(model.loss, model.vars)
         summary_utils.CollectVarHistogram(grads)
# Example 2
    def AdjustGradients(self,
                        var_grads,
                        gradient_mask=None,
                        gradient_adjuster=None):
        """Adjusts gradients according to learner params.

        Args:
          var_grads: a `.NestedMap` whose values are (var, grad) pairs.
          gradient_mask: if not None, a dict mapping variable names to a 0/1
            scalar.
          gradient_adjuster: if not None, a function that mutates a given
            var_grads.

        Returns:
          (var_grads, stats), where var_grads is a `.NestedMap` whose values
          (var, grad) pairs representing adjusted gradients, and stats is a
          `.NestedMap` containing 'has_nan_or_inf' and 'eval_metrics'.
        """
        params = self.params

        # Optional L2 penalty on the gradients.
        if params.l2_regularizer_weight is not None:
            l2_loss, var_grads = py_utils.AdjustGradientsWithLpLoss(
                var_grads, params.l2_regularizer_weight, p=2.0)
            self._AddScalarSummary('l2_loss', l2_loss)

        # Optional L1 penalty on the gradients.
        if params.l1_regularizer_weight is not None:
            l1_loss, var_grads = py_utils.AdjustGradientsWithLpLoss(
                var_grads, params.l1_regularizer_weight, p=1.0)
            self._AddScalarSummary('l1_loss', l1_loss)

        # Zero out masked-off gradients, but only when a mask was supplied.
        if gradient_mask:
            var_grads = py_utils.MaskGradients(var_grads, gradient_mask)

        # Gradient scaling / clipping.
        scaled = self.ScaleGradients(
            var_grads, gradient_adjuster=gradient_adjuster)
        var_grads = scaled.final_var_grads

        # Per-variable histograms of the adjusted gradients.
        summary_utils.CollectVarHistogram(var_grads)
        stats = py_utils.NestedMap(
            has_nan_or_inf=scaled.has_nan_or_inf,
            eval_metrics=self._eval_metrics)
        return var_grads, stats
# Example 3
    def AdjustGradients(self,
                        var_grads,
                        gradient_mask=None,
                        gradient_adjuster=None):
        """Adjusts gradients according to learner params.

        Args:
          var_grads: a `.NestedMap` whose values are (var, grad) pairs.
          gradient_mask: if not None, a dict mapping variable names to a 0/1
            scalar.
          gradient_adjuster: if not None, a function that mutates a given
            var_grads.

        Returns:
          (var_grads, eval_metrics), where var_grads is a `.NestedMap` whose
          values (var, grad) pairs representing adjusted gradients.
        """
        params = self.params

        # Optional L2 penalty on the gradients; recorded as an eval metric
        # with unit weight.
        if params.l2_regularizer_weight is not None:
            l2_loss, var_grads = py_utils.AdjustGradientsWithLpLoss(
                var_grads, params.l2_regularizer_weight, p=2.0)
            self._AddEvalMetric('l2_loss', l2_loss, tf.constant(1.0))

        # Optional L1 penalty on the gradients; recorded as an eval metric
        # with unit weight.
        if params.l1_regularizer_weight is not None:
            l1_loss, var_grads = py_utils.AdjustGradientsWithLpLoss(
                var_grads, params.l1_regularizer_weight, p=1.0)
            self._AddEvalMetric('l1_loss', l1_loss, tf.constant(1.0))

        # Zero out masked-off gradients, but only when a mask was supplied.
        if gradient_mask:
            var_grads = py_utils.MaskGradients(var_grads, gradient_mask)

        # Gradient scaling (e.g. clipping), when enabled by params.
        if params.scale_gradients:
            scaled = self.ScaleGradients(
                var_grads, gradient_adjuster=gradient_adjuster)
            var_grads = scaled.final_var_grads

        # Per-variable histograms of the adjusted gradients.
        summary_utils.CollectVarHistogram(var_grads)
        return var_grads, self._eval_metrics
# Example 4
    def Apply(self, loss, vmap, gradient_mask=None, gradient_adjuster=None):
        """Computes updates on 'vmap' to optimize 'loss'.

        TODO(rpang): explore merging gradient_mask and gradient_adjuster.

        Args:
          loss: A scalar Tensor.
          vmap: A `.NestedMap` object containing variables to optimize.
          gradient_mask: if not None, a dict mapping variable names to a 0/1
            scalar.
          gradient_adjuster: if not None, a function that mutates a given
            var_grads.

        Returns:
          (op, stats), where op is a tf.Operation to update variables and stats
          is a NestedMap containing 'has_nan_or_inf' and 'eval_metrics'.
        """
        # We apply gradients outside the name_scope to maintain backwards
        # compatibility on variables created by self.optimizer.Apply().
        p = self.params

        vmap = self.GetTrainableVariables(vmap)

        for v in vmap.Flatten():
            tf.logging.info('%s: bprop variable: %s', p.name, v.name)

        # Compute gradients.
        var_grads = self.optimizer.ComputeGradients(
            loss, vmap, p.grad_aggregation_method,
            p.colocate_gradients_with_ops, p.gate_gradients)

        # L2 regularizer.
        if p.l2_regularizer_weight is not None:
            l2_loss, var_grads = py_utils.AdjustGradientsWithLpLoss(
                var_grads, p.l2_regularizer_weight, p=2.0)
            self._AddScalarSummary('l2_loss', l2_loss)

        # L1 regularizer.
        if p.l1_regularizer_weight is not None:
            l1_loss, var_grads = py_utils.AdjustGradientsWithLpLoss(
                var_grads, p.l1_regularizer_weight, p=1.0)
            self._AddScalarSummary('l1_loss', l1_loss)

        # Mask gradients only if the mask is set.
        if gradient_mask:
            var_grads = py_utils.MaskGradients(var_grads, gradient_mask)

        # Apply gradient clipping. Pass gradient_adjuster by keyword for
        # consistency with the other ScaleGradients call sites (and so this
        # keeps working if the parameter becomes keyword-only).
        scaled_vars = self.ScaleGradients(
            var_grads, gradient_adjuster=gradient_adjuster)
        has_nan_or_inf = scaled_vars.has_nan_or_inf
        var_grads = scaled_vars.final_var_grads

        # Histogram summary.
        summary_utils.CollectVarHistogram(var_grads)
        self._var_grads = var_grads

        # A learning-rate schedule requires a valid global step.
        assert self.theta.global_step is not None, self.theta
        lrs = self.lr_schedule.Value(self.theta.global_step)
        self._AddScalarSummary('lr_schedule', lrs)
        lr = p.learning_rate * lrs

        var_update_op = self.optimizer.Apply(lr, var_grads)

        stats = py_utils.NestedMap(has_nan_or_inf=has_nan_or_inf,
                                   eval_metrics=self._eval_metrics)
        return var_update_op, stats
# Example 5
  def _BPropForVariables(self, vmap):
    """Constructs the backward graph for the given variables.

    Builds gradient computation, regularization, masking, clipping and
    summaries, then assembles `self._train_op`, which applies the variable
    updates together with batch-norm, stats, mask and global-step updates.

    Args:
      vmap: a `.NestedMap` of variables.
    """
    p = self.params
    tp = p.train

    # Compute gradients.
    self._var_grads = py_utils.ComputeGradients(self.loss, vmap)

    # L2 regularizer.
    if tp.l2_regularizer_weight is not None:
      l2_loss, self._var_grads = py_utils.AdjustGradientsWithLpLoss(
          self._var_grads, tp.l2_regularizer_weight, p=2.0)
      summary_utils.scalar(p, 'l2_loss', l2_loss)

    # L1 regularizer.
    if tp.l1_regularizer_weight is not None:
      l1_loss, self._var_grads = py_utils.AdjustGradientsWithLpLoss(
          self._var_grads, tp.l1_regularizer_weight, p=1.0)
      summary_utils.scalar(p, 'l1_loss', l1_loss)

    # Mask gradients only if the mask is set.
    if self._per_input_gradient_mask:
      bprop_onehot = self.input_generator.GetInputSourceOneHot()
      self._var_grads = py_utils.MaskGradients(
          self._var_grads, self._per_input_gradient_mask, bprop_onehot)

    # Apply gradient clipping.
    has_nan_or_inf, _, self._var_grads = self.ScaleGradients(self._var_grads)

    # Histogram summary.
    summary_utils.CollectVarHistogram(p, self._var_grads)

    lrs = self.lr_schedule.Value(self._global_step)
    summary_utils.scalar(p, 'lr_schedule', lrs)
    lr = tp.learning_rate * lrs

    var_update_op = self.optimizer.Apply(lr, self._var_grads)

    # Increment the shared global step, and the task-specific one if present,
    # colocated with the step variables.
    increment_global_step_ops = []
    with tf.colocate_with(self._shared_global_step):
      increment_global_step_ops.append(
          tf.assign_add(self._shared_global_step, 1))
    if self._task_global_step:
      with tf.colocate_with(self._task_global_step):
        increment_global_step_ops.append(
            tf.assign_add(self._task_global_step, 1))
    increment_global_steps = tf.group(*increment_global_step_ops)

    # Only run the batch-norm updates that are relevant to this loss.
    relevant_bn_updates, _ = py_utils.FindRelevantBatchNormUpdates(
        self.loss, tf.get_collection(py_utils.BATCH_NORM_UPDATES))
    batch_norm_updates = tf.group(*relevant_bn_updates)

    # Update stats.
    stats_updates = tf.group(
        self.IncrementTotalSamples(),
        self.IncrementTotalNans(tf.to_int32(has_nan_or_inf)))

    # Post training step update.
    post_training_step_updates = self.PostTrainingStepUpdate(self._global_step)

    # Get the op to update the weight masks and thresholds.
    mask_update_op = self._GetMaskUpdateOp()

    # TODO(rpang): try to structure _train_op as:
    #   tf.cond(skip_step, <only update skip stats>, <all updates>)
    # so that we skip all other updates when a step is skipped.
    if p.contiguous:
      var_update_op = tf.group(var_update_op, self.last_state_group_op)

    self._train_op = tf.group(
        var_update_op,
        batch_norm_updates,
        stats_updates,
        post_training_step_updates,
        increment_global_steps,
        mask_update_op,
        name='train')