Ejemplo n.º 1
0
 def testGradientMult(self):
     """Builds the test model and applies a negative gradient multiplier.

     There are no assertions: the test only exercises gradient computation
     followed by `ApplyGradMultiplier` with a multiplier of -1.1.
     """
     with self.session(use_gpu=False, graph=tf.Graph()):
         model = self._testParams().Instantiate()
         model.FPropDefaultTheta()
         model_var_grads = py_utils.ComputeGradients(model.loss, model.vars)
         py_utils.ApplyGradMultiplier(model_var_grads, -1.1)
Ejemplo n.º 2
0
    def ScaleGradients(self, var_grads):
        """Computes one global gradient scale and applies it to `var_grads`.

        Args:
          var_grads: a `.NestedMap` whose values are (var, grad) pairs.

        Returns:
          A tuple (has_nan_or_inf, grad_scale, final_var_grads).

          - has_nan_or_inf: a boolean tensor, true when `self._HasNanOrInf`
            reports a problem in the adjusted gradients or when the overall
            gradient norm is NaN or Inf.
          - grad_scale: the multiplier applied to every gradient. It is forced
            to 0 when has_nan_or_inf is true, which skips the update for the
            step.
          - final_var_grads: the result of applying grad_scale to the adjusted
            var_grads via `py_utils.ApplyGradMultiplier`.
        """
        params = self.params
        train_params = params.train

        # Emit one norm summary per entry, then one for the whole map. The
        # overall norm may be NaN, which would make the scale NaN as well; that
        # case is handled by the final tf.where below.
        for key, var_grad in var_grads.FlattenItems():
            summary_utils.AddNormSummary(params, key,
                                         py_utils.NestedMap(s=var_grad))
        _, global_norm = summary_utils.AddNormSummary(params, 'all', var_grads)
        norm_is_nan_or_inf = tf.logical_or(
            tf.is_nan(global_norm), tf.is_inf(global_norm))

        # The adjustment hook deliberately runs after the norm was computed.
        var_grads = self.AdjustGradients(var_grads)

        # The norm can overflow to Inf even when every individual gradient is
        # finite, so both checks are combined.
        has_nan_or_inf = tf.logical_or(
            self._HasNanOrInf(var_grads), norm_is_nan_or_inf)

        grad_scale = tf.constant(1.0)

        clip_value = train_params.clip_gradient_norm_to_value
        if clip_value:
            # When the norm exceeds clip_value the scale becomes
            # clip_value / norm, i.e. the scaled norm equals clip_value.
            grad_scale = tf.minimum(1.0, clip_value / global_norm)

        zero_threshold = train_params.grad_norm_to_clip_to_zero
        if zero_threshold:
            # A norm at or above the threshold zeroes the scale, so the step
            # is ignored.
            norm_is_acceptable = tf.cast(global_norm < zero_threshold,
                                         params.dtype)
            grad_scale = grad_scale * norm_is_acceptable

        if train_params.grad_norm_tracker:
            grad_scale = grad_scale * self.grad_norm_tracker.FPropDefaultTheta(
                global_norm, has_nan_or_inf)

        # Any NaN/Inf overrides everything above with a zero scale.
        grad_scale = tf.where(has_nan_or_inf, 0.0, grad_scale)

        summary_utils.scalar(params, 'grad_scale_all', grad_scale)
        scaled_var_grads = py_utils.ApplyGradMultiplier(var_grads, grad_scale)
        return has_nan_or_inf, grad_scale, scaled_var_grads
Ejemplo n.º 3
0
 def _ApplyAndReset():
   """Applies the averaged gradients, then zeroes the second pair elements.

   The gradients in the enclosing `var_grad` are multiplied by
   1 / p.accum_steps and handed to the wrapped optimizer. Only after that
   update has run is the second element of every pair in `var_grad`
   (presumably the gradient accumulator variable; confirm against the
   enclosing method) assigned zeros.

   Returns:
     A grouped op containing all of the zeroing assignments.
   """
   averaged_var_grad = py_utils.ApplyGradMultiplier(var_grad,
                                                    1. / p.accum_steps)
   apply_op = self._opt.Apply(lr, averaged_var_grad)
   # The control dependency orders the resets after the optimizer update.
   with tf.control_dependencies([apply_op]):
     reset_ops = [
         tf.assign(acc, tf.zeros_like(acc)) for _, acc in var_grad.Flatten()
     ]
     return tf.group(*reset_ops)
Ejemplo n.º 4
0
    def _Apply1(proj_layer, opt):
      """Applies `opt` to `proj_layer` twice, once per input, at half weight.

      The inputs, paddings and learning rate come from the enclosing scope.

      Args:
        proj_layer: the layer whose variables are differentiated and updated.
        opt: the optimizer whose `Apply` receives the halved gradients.

      Returns:
        A tuple of the flattened layer variables, the var_grads of the first
        loss and the var_grads of the second loss; the two var_grads maps are
        passed through `Transform(tuple)`.
      """
      outputs = [
          proj_layer.FPropDefaultTheta(inputs1, in_padding1),
          proj_layer.FPropDefaultTheta(inputs2, in_padding2),
      ]
      losses = [tf.reduce_sum(output) for output in outputs]
      per_loss_var_grads = [
          py_utils.ComputeGradients(loss, proj_layer.vars) for loss in losses
      ]

      # The resulting update ops are intentionally discarded.
      for var_grads in per_loss_var_grads:
        opt.Apply(lr, py_utils.ApplyGradMultiplier(var_grads, 1. / 2.))

      flat_vars = proj_layer.vars.Flatten()
      first_grads, second_grads = [
          var_grads.Transform(tuple) for var_grads in per_loss_var_grads
      ]
      return flat_vars, first_grads, second_grads
Ejemplo n.º 5
0
    def testAccumulator(self):
        """Checks Accumulator(SGD) against explicitly averaged SGD updates.

        Two graphs are built around identically configured projection layers:
          - the first applies plain SGD twice, each time with half of the
            gradient computed from one of the two inputs;
          - the second wraps SGD in an Accumulator with accum_steps=2 and is
            fed the two inputs on consecutive steps.
        Initial weights, losses, gradients, accumulator contents and final
        weights are then compared.
        """
        np.random.seed(12345)
        np_input1 = np.random.normal(0.1, 0.5, [2, 4, 3])
        np.random.seed(12346)
        np_input2 = np.random.normal(0.1, 0.5, [2, 4, 3])
        lr = 1e-1

        def _MakeProjectionParams():
            """Returns the layer params used by both graphs."""
            proj_p = layers.ProjectionLayer.Params()
            proj_p.name = 'proj'
            proj_p.dtype = tf.float64
            proj_p.input_dim = 3
            proj_p.output_dim = 2
            proj_p.params_init = py_utils.WeightInit.Gaussian(0.01, 123456)

            proj_p.batch_norm = False
            return proj_p

        def _ReadFirstAccumulator():
            """Evaluates the 'grad_accumulator' variables; returns the first."""
            accumulators = [
                v for v in tf.global_variables()
                if 'grad_accumulator' in v.name
            ]
            return self.evaluate(accumulators)[0]

        # Graph 1: two chained SGD updates, each with half of one gradient.
        with self.session(use_gpu=True, graph=tf.Graph()) as sess:
            tf.random.set_seed(123456)
            sgd_layer = layers.ProjectionLayer(_MakeProjectionParams())
            inputs1 = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
            in_padding1 = tf.zeros([2, 4, 1], dtype=tf.float64)
            inputs2 = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
            in_padding2 = tf.zeros([2, 4, 1], dtype=tf.float64)
            output1 = sgd_layer.FPropDefaultTheta(inputs1, in_padding1)
            output2 = sgd_layer.FPropDefaultTheta(inputs2, in_padding2)
            loss1 = tf.reduce_sum(output1)
            loss2 = tf.reduce_sum(output2)
            var_grads1 = py_utils.ComputeGradients(loss1, sgd_layer.vars)
            var_grads2 = py_utils.ComputeGradients(loss2, sgd_layer.vars)
            sgd = optimizer.SGD.Params().Instantiate()
            # The second update is made to depend on the first so that
            # running it alone performs both.
            with tf.control_dependencies([loss1, loss2]):
                halved_grads1 = py_utils.ApplyGradMultiplier(
                    var_grads1, 1. / 2.)
                var_update_op1 = sgd.Apply(lr, halved_grads1)
                with tf.control_dependencies([var_update_op1]):
                    halved_grads2 = py_utils.ApplyGradMultiplier(
                        var_grads2, 1. / 2.)
                    var_update_op2 = sgd.Apply(lr, halved_grads2)

            both_inputs = {
                inputs1: np_input1,
                inputs2: np_input2,
            }
            self.evaluate(tf.global_variables_initializer())
            vars1 = self.evaluate(sgd_layer.vars.Flatten())
            loss_and_grad_fetches = [
                loss1,
                var_grads1.Transform(tuple),
                loss2,
                var_grads2.Transform(tuple),
            ]
            loss1_1, grads1_1, loss1_2, grads1_2 = sess.run(
                loss_and_grad_fetches, feed_dict=both_inputs)
            sess.run([var_update_op2], feed_dict=both_inputs)
            vars1_1 = self.evaluate(sgd_layer.vars.Flatten())

        # Graph 2: one Accumulator(SGD) update op, run once per input.
        with self.session(use_gpu=True, graph=tf.Graph()) as sess:
            tf.random.set_seed(123456)
            accum_layer = layers.ProjectionLayer(_MakeProjectionParams())
            in_padding = tf.zeros([2, 4, 1], dtype=tf.float64)
            inputs = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
            output = accum_layer.FPropDefaultTheta(inputs, in_padding)
            loss = tf.reduce_sum(output)
            var_grads = py_utils.ComputeGradients(loss, accum_layer.vars)
            accumulator_params = optimizer.Accumulator.Params().Set(
                accum_steps=2,
                dtype=tf.float64,
                optimizer_tpl=optimizer.SGD.Params())
            accumulator = accumulator_params.Instantiate()
            with cluster_factory.ForTestingWorker(add_summary=True):
                var_update_op = accumulator.Apply(lr, var_grads)
            increment_global_step_op = tf.assign_add(
                py_utils.GetOrCreateGlobalStepVar(), 1)

            first_input = {inputs: np_input1}
            second_input = {inputs: np_input2}

            self.evaluate(tf.global_variables_initializer())
            vars2 = self.evaluate(accum_layer.vars.Flatten())
            loss2_1, grads2_1 = sess.run(
                [loss, var_grads.Transform(tuple)], feed_dict=first_input)
            loss2_2, grads2_2 = sess.run(
                [loss, var_grads.Transform(tuple)], feed_dict=second_input)

            # Accumulator state before any update, after the first update and
            # after the second update.
            acc_0 = _ReadFirstAccumulator()
            sess.run([var_update_op], feed_dict=first_input)
            acc_1 = _ReadFirstAccumulator()
            vars2_intermediate = self.evaluate(accum_layer.vars.Flatten())
            self.evaluate(increment_global_step_op)
            sess.run([var_update_op], feed_dict=second_input)
            acc_2 = _ReadFirstAccumulator()
            vars2_1 = self.evaluate(accum_layer.vars.Flatten())

            summary = tf.Summary.FromString(
                self.evaluate(tf.summary.merge_all()))
            tf.logging.info(f'summary: {summary}')
            self.assertEqual(summary.value[0].tag, 'sgd_lr')

        # Both graphs start from the same weights.
        self.assertAllClose(vars1, vars2)

        # The accumulator is zero, then holds the first gradient, then is
        # zero again after the second update.
        self.assertAllClose(acc_0, np.zeros_like(acc_0))
        self.assertAllClose(acc_1, grads2_1['w'][1])
        self.assertAllClose(acc_2, np.zeros_like(acc_0))

        # Losses and gradients agree between the two graphs.
        self.assertAllClose(loss1_1, loss2_1)
        self.assertAllClose(loss1_2, loss2_2)
        self.assertAllClose(grads1_1, grads2_1)
        self.assertAllClose(grads1_2, grads2_2)

        # The first accumulator update leaves the weights untouched.
        self.assertAllClose(vars1, vars2_intermediate)

        # The first element of each fetched (var, grad) pair is the weight.
        self.assertAllClose(vars2[0], grads2_1['w'][0])
        self.assertAllClose(vars2[0], grads2_2['w'][0])

        # Final weights equal the initial weights minus lr times the mean of
        # the two gradients, in both graphs.
        mean_grad_step1 = 0.5 * lr * (grads1_1['w'][1] + grads1_2['w'][1])
        self.assertAllClose(vars1[0] - mean_grad_step1, vars1_1[0])

        mean_grad_step2 = 0.5 * lr * (grads2_1['w'][1] + grads2_2['w'][1])
        self.assertAllClose(vars2[0] - mean_grad_step2, vars2_1[0])

        self.assertAllClose(vars2, vars2_intermediate)
        self.assertAllClose(vars1_1, vars2_1)
Ejemplo n.º 6
0
    def ScaleGradients(self, var_grads, gradient_adjuster=None):
        """Clips or rescales the gradients in `var_grads` per the params.

        When `clip_gradient_single_norm_to_value` is set, the gradients are
        handed to `py_utils.ApplyGradNormCliping` with that value. Otherwise
        they are multiplied by the global scale from `_GetGlobalGradScale`,
        and the gradient norm, variable norm and scale are recorded as eval
        metrics.

        Args:
          var_grads: a `.NestedMap` whose values are (var, grad) pairs.
          gradient_adjuster: if not None, a callable that receives var_grads
            and returns the var_grads used from then on. It is invoked after
            the overall norms have been computed.

        Returns:
          A `.NestedMap` containing:
          - has_nan_or_inf: a boolean tensor, true when
            `py_utils.HasNanOrInfGradient` reports a problem in the (adjusted)
            gradients or when the overall gradient norm is NaN or Inf.
          - final_var_grads: a `.NestedMap` whose values are (var, grad) pairs,
            where gradients have already been clipped or scaled.
          - grad_scale: the global gradient scale. Only present when the
            global scaling path is taken.

        Raises:
          ValueError: if clip_gradient_single_norm_to_value and
            clip_gradient_norm_to_value are both set.
        """
        p = self.params

        # Per-entry norm summaries. The overall gradient norm computed below
        # may be NaN, which may in turn make grad_scale NaN.
        for key, var_grad in var_grads.FlattenItems():
            summary_utils.AddNormSummary(key + '/' + p.name,
                                         py_utils.NestedMap(s=var_grad))

        flat_pairs = py_utils.NestedMap(child=var_grads).Flatten()
        all_grad_norm = tf.sqrt(
            py_utils.SumSquared([grad for (_, grad) in flat_pairs]))
        all_var_norm = tf.sqrt(
            py_utils.SumSquared([var for (var, _) in flat_pairs]))
        norm_is_nan_or_inf = tf.logical_or(
            tf.is_nan(all_grad_norm), tf.is_inf(all_grad_norm))

        # The optional adjustment deliberately runs after the norms above.
        if gradient_adjuster is not None:
            tf.logging.info('gradient_adjuster=%s', gradient_adjuster)
            var_grads = gradient_adjuster(var_grads)

        # The norm can be Inf even if no individual gradient is, so both
        # checks are combined.
        has_nan_or_inf = tf.logical_or(
            py_utils.HasNanOrInfGradient(var_grads), norm_is_nan_or_inf)

        result = py_utils.NestedMap()
        single_norm_clip = p.clip_gradient_single_norm_to_value
        if single_norm_clip:
            # Using both kinds of clipping at once is not supported.
            if p.clip_gradient_norm_to_value:
                raise ValueError(
                    'Cannot use clip_gradient_single_norm_to_value=%f and '
                    'clip_gradient_norm_to_value=%f.' %
                    (single_norm_clip, p.clip_gradient_norm_to_value))
            final_var_grads = py_utils.ApplyGradNormCliping(
                var_grads, single_norm_clip)
        else:
            grad_scale = self._GetGlobalGradScale(all_grad_norm,
                                                  has_nan_or_inf)
            for metric_name, metric_value in (
                ('grad_norm/all', all_grad_norm),
                ('var_norm/all', all_var_norm),
                ('grad_scale_all', grad_scale),
            ):
                self._AddEvalMetric(metric_name, metric_value,
                                    tf.constant(1.0))
            final_var_grads = py_utils.ApplyGradMultiplier(
                var_grads, grad_scale)
            result.grad_scale = grad_scale

        result.has_nan_or_inf = has_nan_or_inf
        result.final_var_grads = final_var_grads
        return result
Ejemplo n.º 7
0
  def testAccumulator(self):
    """Checks Accumulator(SGD) against explicitly averaged SGD updates.

    Two graphs are built around identically configured projection layers:
      - g1 applies plain SGD twice, each time with half of the gradient
        computed from one of the two inputs;
      - g2 wraps SGD in an Accumulator with accum_steps=2 and is fed the two
        inputs on consecutive steps.
    Initial weights, losses, gradients, accumulator contents and final weights
    are then compared.
    """
    np.random.seed(12345)
    np_input1 = np.random.normal(0.1, 0.5, [2, 4, 3])
    np.random.seed(12346)
    np_input2 = np.random.normal(0.1, 0.5, [2, 4, 3])
    lr = 1e-1

    def _MakeProjectionParams():
      """Returns the layer params used by both graphs."""
      proj_p = layers.ProjectionLayer.Params()
      proj_p.name = 'proj'
      proj_p.dtype = tf.float64
      proj_p.input_dim = 3
      proj_p.output_dim = 2
      proj_p.params_init = py_utils.WeightInit.Gaussian(0.01, 123456)
      proj_p.is_eval = False
      proj_p.batch_norm = False
      return proj_p

    def _ReadFirstAccumulator(session):
      """Runs the 'grad_accumulator' variables and returns the first value."""
      accumulators = [
          v for v in tf.global_variables() if 'grad_accumulator' in v.name
      ]
      return session.run(accumulators)[0]

    # Graph 1: two chained SGD updates, each with half of one gradient.
    g1 = tf.Graph()
    with g1.as_default():
      tf.set_random_seed(123456)
      sgd_layer = layers.ProjectionLayer(_MakeProjectionParams())
      inputs1 = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
      in_padding1 = tf.zeros([2, 4, 1], dtype=tf.float64)
      inputs2 = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
      in_padding2 = tf.zeros([2, 4, 1], dtype=tf.float64)
      output1 = sgd_layer.FPropDefaultTheta(inputs1, in_padding1)
      output2 = sgd_layer.FPropDefaultTheta(inputs2, in_padding2)
      loss1 = tf.reduce_sum(output1)
      loss2 = tf.reduce_sum(output2)
      var_grads1 = py_utils.ComputeGradients(loss1, sgd_layer.vars)
      var_grads2 = py_utils.ComputeGradients(loss2, sgd_layer.vars)
      sgd_params = optimizer.SGD.Params().Set(add_summary=False)
      sgd = sgd_params.cls(sgd_params)
      # The second update is made to depend on the first so that running it
      # alone performs both.
      with tf.control_dependencies([loss1, loss2]):
        halved_grads1 = py_utils.ApplyGradMultiplier(var_grads1, 1. / 2.)
        var_update_op1 = sgd.Apply(lr, halved_grads1)
        with tf.control_dependencies([var_update_op1]):
          halved_grads2 = py_utils.ApplyGradMultiplier(var_grads2, 1. / 2.)
          var_update_op2 = sgd.Apply(lr, halved_grads2)
      init_op1 = tf.global_variables_initializer()

    with self.session(use_gpu=True, graph=g1) as sess:
      both_inputs = {
          inputs1: np_input1,
          inputs2: np_input2,
      }
      sess.run(init_op1)
      vars1 = sess.run(sgd_layer.vars.Flatten())
      loss1_1, grads1_1, loss1_2, grads1_2 = sess.run(
          [loss1, var_grads1, loss2, var_grads2], feed_dict=both_inputs)
      sess.run([var_update_op2], feed_dict=both_inputs)
      vars1_1 = sess.run(sgd_layer.vars.Flatten())

    # Graph 2: one Accumulator(SGD) update op, run once per input.
    g2 = tf.Graph()
    with g2.as_default():
      tf.set_random_seed(123456)
      accum_layer = layers.ProjectionLayer(_MakeProjectionParams())
      in_padding = tf.zeros([2, 4, 1], dtype=tf.float64)
      inputs = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
      output = accum_layer.FPropDefaultTheta(inputs, in_padding)
      loss = tf.reduce_sum(output)
      var_grads = py_utils.ComputeGradients(loss, accum_layer.vars)
      accumulator_params = optimizer.Accumulator.Params().Set(
          accum_steps=2,
          dtype=tf.float64,
          optimizer_tpl=optimizer.SGD.Params().Set(add_summary=False))
      accumulator = accumulator_params.cls(accumulator_params)
      var_update_op = accumulator.Apply(lr, var_grads)
      init_op2 = tf.global_variables_initializer()
      global_step = py_utils.GetOrCreateGlobalStep()
      increment_global_step_op = tf.assign_add(global_step, 1)

    with self.session(use_gpu=True, graph=g2) as sess:
      first_input = {inputs: np_input1}
      second_input = {inputs: np_input2}

      sess.run(init_op2)
      # The global step is fetched alongside the weights, but its value is
      # not used afterwards.
      vars2, _ = sess.run([accum_layer.vars.Flatten(), global_step])
      loss2_1, grads2_1 = sess.run([loss, var_grads], feed_dict=first_input)
      loss2_2, grads2_2 = sess.run([loss, var_grads], feed_dict=second_input)

      # Accumulator state before any update, after the first update and after
      # the second update.
      acc_0 = _ReadFirstAccumulator(sess)
      sess.run([var_update_op], feed_dict=first_input)
      acc_1 = _ReadFirstAccumulator(sess)
      vars2_intermediate = sess.run(accum_layer.vars.Flatten())
      sess.run(increment_global_step_op)
      sess.run([var_update_op], feed_dict=second_input)
      acc_2 = _ReadFirstAccumulator(sess)
      vars2_1 = sess.run(accum_layer.vars.Flatten())

    # Both graphs start from the same weights.
    self.assertAllClose(vars1, vars2)

    # The accumulator is zero, then holds the first gradient, then is zero
    # again after the second update.
    self.assertAllClose(acc_0, np.zeros_like(acc_0))
    self.assertAllClose(acc_1, grads2_1['w'][1])
    self.assertAllClose(acc_2, np.zeros_like(acc_0))

    # Losses and gradients agree between the two graphs.
    self.assertAllClose(loss1_1, loss2_1)
    self.assertAllClose(loss1_2, loss2_2)
    self.assertAllClose(grads1_1, grads2_1)
    self.assertAllClose(grads1_2, grads2_2)

    # The first accumulator update leaves the weights untouched.
    self.assertAllClose(vars1, vars2_intermediate)

    # The first element of each fetched (var, grad) pair is the weight.
    self.assertAllClose(vars2[0], grads2_1['w'][0])
    self.assertAllClose(vars2[0], grads2_2['w'][0])

    # Final weights equal the initial weights minus lr times the mean of the
    # two gradients, in both graphs.
    mean_grad_step1 = 0.5 * lr * (grads1_1['w'][1] + grads1_2['w'][1])
    self.assertAllClose(vars1[0] - mean_grad_step1, vars1_1[0])

    mean_grad_step2 = 0.5 * lr * (grads2_1['w'][1] + grads2_2['w'][1])
    self.assertAllClose(vars2[0] - mean_grad_step2, vars2_1[0])

    self.assertAllClose(vars2, vars2_intermediate)
    self.assertAllClose(vars1_1, vars2_1)