Example #1
    def testDynamicUpdate(self, strategy_fn):
        with strategy_fn().scope() as strategy:
            var = variables.Variable([1.0, 2.0])
            opt = gradient_descent.SGD(1.0)
            loss_scale = loss_scale_module.DynamicLossScale(
                initial_loss_scale=2, increment_period=1, multiplier=2)
            opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)

            # Test optimizer with finite gradients
            loss = lambda: var * 2.0 / strategy.num_replicas_in_sync
            run_fn = lambda: opt.minimize(loss, var_list=[var])
            run_op = strategy.experimental_run(run_fn)
            self.evaluate(variables.global_variables_initializer())
            self._run_if_in_graph_mode(run_op)
            # Gradient is 2, so variable will have 2 subtracted from it
            self.assertAllClose([-1.0, 0.0], self.evaluate(var))
            # Loss scale has doubled from 2 to 4
            self.assertEqual(4., self.evaluate(opt.loss_scale()))

            # Test optimizer with NaN gradients
            loss = lambda: var * float('NaN')
            run_fn = lambda: opt.minimize(loss, var_list=[var])
            run_op = strategy.experimental_run(run_fn)
            self._run_if_in_graph_mode(run_op)
            # Variable should not change from before, due to NaN gradients.
            self.assertAllClose(self.evaluate(var), [-1.0, 0.0])
            # Loss scale should halve due to the NaN gradients.
            self.assertEqual(2., self.evaluate(opt.loss_scale()))
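
The update rule exercised above can be summarized in a few lines of plain Python. This is a hedged sketch of the documented DynamicLossScale semantics, not TensorFlow's implementation: the scale is multiplied by multiplier after increment_period consecutive finite steps, and divided by it (never dropping below 1) whenever a non-finite gradient appears.

# Minimal sketch of the dynamic loss-scale update rule, assuming the
# documented DynamicLossScale semantics (not TensorFlow's actual code).
def update_scale(scale, good_steps, grad_is_finite,
                 increment_period=1, multiplier=2.0):
    if grad_is_finite:
        good_steps += 1
        if good_steps >= increment_period:
            return scale * multiplier, 0  # grow and reset the counter
        return scale, good_steps
    return max(scale / multiplier, 1.0), 0  # shrink on Inf/NaN

scale, steps = 2.0, 0
scale, steps = update_scale(scale, steps, grad_is_finite=True)   # 2 -> 4
scale, steps = update_scale(scale, steps, grad_is_finite=False)  # 4 -> 2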
Example #2
    def testDynamicLossScale(self, strategy_fn):
        strategy = strategy_fn()
        learning_rate = 2.
        expected_gradient = variables.Variable(learning_rate /
                                               strategy.num_replicas_in_sync)
        with strategy.scope():
            var = variables.Variable([5.0])
            opt = gradient_descent.SGD(learning_rate)
            loss_scale = loss_scale_module.DynamicLossScale(
                initial_loss_scale=2, increment_period=1, multiplier=2)
            opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
            self.assertEqual(
                loss_scale.initial_loss_scale % strategy.num_replicas_in_sync,
                0)

            run_fn = self._run_fn_with_grad_check(strategy, var, opt,
                                                  expected_gradient)
            run_op = strategy.experimental_run(run_fn)
            self.evaluate(variables.global_variables_initializer())
            self._run_if_in_graph_mode(run_op)
            # The loss is the identity of the variable. Therefore the gradient is 1,
            # and so the variable will be init_val - grad * lr == 5 - 1 * 2 == 3
            self.assertAllClose([3.], self.evaluate(var))

            # The loss scale has doubled, so the expected gradient is also doubled.
            self.evaluate(
                expected_gradient.assign(2 * learning_rate /
                                         strategy.num_replicas_in_sync))
            run_op = strategy.experimental_run(run_fn)
            self._run_if_in_graph_mode(run_op)
            # As before, 2 is subtracted from the variable, making its new
            # value 1.
            self.assertAllClose([1.], self.evaluate(var))
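
For reference, the gradient check above relies on the basic loss-scaling identity: the optimizer multiplies the loss by the scale before differentiation and divides the resulting gradients by the same scale, so the applied update is unchanged whenever no overflow occurs. A hedged sketch of that arithmetic in plain Python (not the optimizer's internals):

# d(scale * loss)/dvar == scale * dloss/dvar, so dividing the scaled
# gradient by the scale recovers the true gradient.
scale = 2.0
true_grad = 1.0                   # d(var)/d(var) for an identity loss
scaled_grad = scale * true_grad   # gradient of the scaled loss
assert scaled_grad / scale == true_grad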
Example #3
    def testSerializationWithBuiltInOptimizer(self, use_v1):
        opt = gradient_descent.SGD(2., momentum=0.5)
        if use_v1:
            loss_scale = tf_loss_scale_module.DynamicLossScale(
                initial_loss_scale=2., increment_period=3.)
            opt = loss_scale_optimizer.LossScaleOptimizerV1(opt, loss_scale)
        else:
            opt = loss_scale_optimizer.LossScaleOptimizer(
                opt, initial_scale=2., dynamic_growth_steps=3.)
        config = optimizers.serialize(opt)
        opt = optimizers.deserialize(config)
        # Force hyperparameters to be created
        opt.lr  # pylint: disable=pointless-statement
        self.evaluate(variables.global_variables_initializer())

        self.assertEqual(self.evaluate(opt.lr), 2.)
        self.assertEqual(self.evaluate(opt._optimizer.momentum), 0.5)
        self.assertEqual(self.evaluate(opt.loss_scale), 2.)
        self.assertEqual(opt.dynamic_growth_steps, 3.)
        self.assertTrue(opt.dynamic)
        # Deserializing a LossScaleOptimizer always results in a V2
        # LossScaleOptimizer, even if serialized with a LossScaleOptimizerV1.
        self.assertAllEqual(type(opt), loss_scale_optimizer.LossScaleOptimizer)

        # Ensure the optimizer can be used
        var = variables.Variable([5.0])
        run_op = self._run_fn_with_grad_check(
            distribution_strategy_context.get_strategy(), var, opt, 2)()
        self.evaluate(variables.global_variables_initializer())
        self._run_if_in_graph_mode(run_op)
        self.assertEqual(self.evaluate(var), [3.])
        self.assertEqual(self.evaluate(opt.dynamic_counter), 1)
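Example #4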
    def testDynamicLossScaleWithSlots(self, strategy_fn):
        with strategy_fn().scope() as strategy:
            var = variables.Variable([1.0, 2.0])
            # An SGD optimizer with momentum has slot variables.
            opt = gradient_descent.SGD(1.0, momentum=1.)
            initial_loss_scale = 2.
            loss_scale = loss_scale_module.DynamicLossScale(
                initial_loss_scale=initial_loss_scale,
                increment_period=1,
                multiplier=4)
            opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
            loss = lambda: var / strategy.num_replicas_in_sync
            run_fn = lambda: opt.minimize(loss, var_list=[var])
            run_op = strategy.experimental_run(run_fn)
            self.evaluate(variables.global_variables_initializer())
            self._run_if_in_graph_mode(run_op)
            # The momentum accumulator starts at 0 and the gradient is 1. The
            # accumulator is incremented by the gradient, so it is now 1. The
            # accumulator is then subtracted from the variable, so the variable
            # decreases by 1.
            self.assertAllClose([0.0, 1.0], self.evaluate(var))
            self.assertEqual(self.evaluate(opt.loss_scale()),
                             initial_loss_scale * 4)

            run_op = strategy.experimental_run(run_fn)
            self._run_if_in_graph_mode(run_op)
            # The momentum accumulator was 1 before this step and the gradient
            # is 1. The accumulator is incremented by the gradient, so it is
            # now 2. The accumulator is then subtracted from the variable, so
            # the variable decreases by 2.
            self.assertAllClose([-2., -1.], self.evaluate(var))
            self.assertEqual(self.evaluate(opt.loss_scale()),
                             initial_loss_scale * 16)
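Example #5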
  def testSerializationWithCustomOptimizer(self):
    class MySGD(gradient_descent.SGD):

      def __init__(self, *args, **kwargs):
        super(MySGD, self).__init__(*args, **kwargs)
        self.my_attribute = 123

    opt = MySGD(2., momentum=0.5)
    loss_scale = loss_scale_module.DynamicLossScale(
        initial_loss_scale=2., increment_period=3.,
        multiplier=4.)
    opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
    config = optimizers.serialize(opt)
    custom_objects = {'MySGD': MySGD}
    opt = optimizers.deserialize(config, custom_objects=custom_objects)
    # Force hyperparameters to be created
    opt.lr  # pylint: disable=pointless-statement
    self.evaluate(variables.global_variables_initializer())

    self.assertEqual(self.evaluate(opt.lr), 2.)
    self.assertEqual(self.evaluate(opt._optimizer.momentum), 0.5)
    self.assertEqual(self.evaluate(opt.loss_scale()), 2.)
    self.assertEqual(opt.loss_scale.increment_period, 3.)
    self.assertEqual(opt.loss_scale.multiplier, 4.)
    self.assertEqual(opt._optimizer.my_attribute, 123)
Example #6
    def test_dynamic_scale_to_one_on_non_finite_gradient_on_last_replica(
            self, use_tf_function):
        if context.num_gpus() < 1:
            # Requires the mirrored strategy to have two replicas: one on the CPU and
            # one on the GPU
            self.skipTest('Test requires at least 1 GPU')
        loss_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=32)
        strategy = create_mirrored_strategy()
        with strategy.scope():
            x = variables.Variable(3.0)

        def run_fn():
            with lsgt.LossScaleGradientTape(loss_scale) as g:
                # The gradient will be finite on the first replica, and infinite on the
                # second
                rep_ctx = distribution_strategy_context.get_replica_context()
                if rep_ctx.replica_id_in_sync_group == rep_ctx.num_replicas_in_sync - 1:
                    y = x * np.inf
                else:
                    y = x * 2
            return g.gradient(y, x)

        replica0_grad, replica1_grad = self._run_with_strategy(
            run_fn, strategy, use_tf_function)
        self.assertEqual(self.evaluate(loss_scale()), 1.0)
        self.assertEqual(replica0_grad, 2.0)
        self.assertEqual(replica1_grad, np.inf)
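Example #7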
  def from_config(cls, config, custom_objects=None):
    config = config.copy()  # Make a copy, since we mutate config
    config['optimizer'] = optimizers.deserialize(
        config['optimizer'], custom_objects=custom_objects)

    # If loss_scale is in config, we assume we are deserializing a
    # LossScaleOptimizer from TF 2.3 or below. Otherwise, we assume we are
    # deserializing a LossScaleOptimizer from TF 2.4 or above.
    if 'loss_scale' in config:
      config['loss_scale'] = keras_loss_scale_module.deserialize(
          config['loss_scale'])
      if (isinstance(config['loss_scale'], loss_scale_module.DynamicLossScale)
          and config['loss_scale'].multiplier != 2):
        raise ValueError('Cannot deserialize LossScaleOptimizer with a '
                         'DynamicLossScale whose multiplier is not 2. Got '
                         'DynamicLossScale: %s' % (config['loss_scale'],))
      return cls(**config)

    # We convert the config, as generated by LossScaleOptimizer.get_config, to a
    # version that can be passed to LossScaleOptimizerV1.__init__
    if config['dynamic']:
      config['loss_scale'] = loss_scale_module.DynamicLossScale(
          config['initial_scale'], config['dynamic_growth_steps'], multiplier=2)
    else:
      config['loss_scale'] = loss_scale_module.FixedLossScale(
          config['initial_scale'])

    del config['dynamic']
    del config['initial_scale']
    del config['dynamic_growth_steps']
    return cls(**config)
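
As a hedged illustration of the two branches above, these are the rough config shapes involved. The field values are hypothetical; the exact keys come from the respective get_config implementations described in the comments.

# TF 2.3-and-below style config: carries a serialized 'loss_scale'.
legacy_config = {
    'optimizer': {'class_name': 'SGD', 'config': {'learning_rate': 2.0}},
    'loss_scale': {'class_name': 'DynamicLossScale',
                   'config': {'initial_loss_scale': 2.0,
                              'increment_period': 3, 'multiplier': 2.0}},
}

# TF 2.4-and-above style config, as produced by LossScaleOptimizer.get_config.
new_config = {
    'optimizer': {'class_name': 'SGD', 'config': {'learning_rate': 2.0}},
    'dynamic': True,
    'initial_scale': 2.0,
    'dynamic_growth_steps': 3,
}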
Example #8
  def _test_helper(self,
                   inputs,
                   expected_outputs,
                   initial_loss_scale=1.,
                   increment_period=2,
                   multiplier=2):
    loss_scale = loss_scale_module.DynamicLossScale(
        initial_loss_scale=initial_loss_scale,
        increment_period=increment_period,
        multiplier=multiplier)
    itr = _get_example_iter(inputs)

    def update():
      is_finite = itr.get_next()
      grad = self._get_tensor(is_finite)
      update_op, should_apply_gradients = loss_scale.update([grad])
      assert_op = check_ops.assert_equal(should_apply_gradients, is_finite)
      if context.executing_eagerly():
        return
      with ops.control_dependencies([assert_op]):
        return array_ops.identity(update_op)

    actual_outputs = []

    if not context.executing_eagerly():
      update_op = update()
      self.evaluate(variables.global_variables_initializer())
    for _ in range(len(inputs)):
      if context.executing_eagerly():
        update()
      else:
        self.evaluate(update_op)
      actual_outputs.append(self.evaluate(loss_scale()))
    self.assertEqual(actual_outputs, expected_outputs)
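
A hedged example of how this helper might be invoked; the values are derived from the update rule rather than copied from the original test suite. With initial_loss_scale=1 and increment_period=2, two consecutive finite gradients are needed before the scale doubles.

# inputs are per-step "gradient is finite" flags; expected_outputs are the
# loss scale observed after each step (illustrative values).
self._test_helper(
    inputs=[True, True, True],
    expected_outputs=[1, 2, 2])  # the scale doubles only after 2 good steps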
Example #9
 def test_dynamic_scale_to_one_on_non_finite_gradient(
         self, non_finite_term):
     loss_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=32)
     x = constant_op.constant(1.0)
     with lsgt.LossScalingGradientTape(loss_scale) as g:
         g.watch(x)
         y = x * non_finite_term
     g.gradient(y, x)
     self.assertEqual(self.evaluate(loss_scale()), 1.0)
Example #10
 def test_serialization(self):
   loss_scale = loss_scale_module.DynamicLossScale(
       initial_loss_scale=1, increment_period=2, multiplier=3)
   config = loss_scale.get_config()
   loss_scale = loss_scale_module.DynamicLossScale.from_config(config)
   self.evaluate(variables.global_variables_initializer())
   self.assertEqual(self.evaluate(loss_scale()), 1)
   self.assertEqual(loss_scale.increment_period, 2)
   self.assertEqual(loss_scale.multiplier, 3)
Example #11
 def test_dynamic_loss_scaling_down_loop(self):
     loss_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=32)
     x = constant_op.constant(1.0)
     with lsgt.LossScalingGradientTape(loss_scale) as g:
         g.watch(x)
         y = x * (3.0 * (10**37))  # grad will be inf after scaling
     dy_dx = g.gradient(y, x)
     self.assertEqual(self.evaluate(loss_scale()), 8.0)
     self.assertAllClose(self.evaluate(dy_dx), (3.0 * (10**37)), atol=1e-06)
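
The final value of 8.0 follows from float32 range arithmetic: float32 overflows past roughly 3.4e38, and the tape halves the scale until the scaled gradient fits. A hedged walk-through (plain Python standing in for float32):

# Why the halving loop stops at 8 (float32 max is ~3.4e38):
grad = 3.0e37
for scale in (32, 16, 8):
    overflows = scale * grad > 3.4e38  # simulate float32 overflow
    print(scale, overflows)
# 32 * 3e37 = 9.6e38 -> inf in float32; halve
# 16 * 3e37 = 4.8e38 -> inf in float32; halve
#  8 * 3e37 = 2.4e38 -> finite; stop with the scale at 8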
Example #12
    def test_save_model_with_dynamic_loss_scaling(self, strategy_fn, h5=False):
        self._skip_if_strategy_unsupported(strategy_fn)
        # TODO(reedwm): Support and test saving model with a mixed_[b]float16 policy
        # as well.
        strategy = strategy_fn()
        if (isinstance(strategy, mirrored_strategy.MirroredStrategy)
                and not context.executing_eagerly()):
            # TODO(b/121381184): Enable running the test in this case.
            return

        # Create and run model.
        with strategy.scope():
            x = layers.Input(shape=(2, ), batch_size=2, dtype=dtypes.float32)
            y = mp_test_util.AddLayer()(x)
            model = models.Model(inputs=x, outputs=y)

            loss_scale = loss_scale_module.DynamicLossScale(
                initial_loss_scale=1., increment_period=2., multiplier=2.)
            opt = gradient_descent.SGD(1.)
            opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
            model.compile(
                optimizer=opt,
                loss='mse',
                run_eagerly=testing_utils.should_run_eagerly(),
                experimental_run_tf_function=(
                    testing_utils.should_run_tf_function()))
        # Run for 3 steps (6 examples with a batch size of 2)
        model.fit(np.zeros((6, 2)), np.zeros((6, 2)), batch_size=2)
        self.assertEqual(backend.get_value(loss_scale()), 2)
        self.assertEqual(backend.get_value(loss_scale._num_good_steps), 1)
        (weight, ) = model.trainable_weights
        orig_weight = backend.get_value(weight)

        # Save model weights.
        save_path = os.path.join(self.get_temp_dir(), 'model')
        model.save(save_path, save_format='h5' if h5 else 'tf')

        # Run model again for 1 step (2 examples with a batch size of 2)
        model.fit(np.zeros((2, 2)), np.zeros((2, 2)), batch_size=2)
        new_weight = backend.get_value(weight)
        self.assertNotEqual(new_weight, orig_weight)
        self.assertEqual(backend.get_value(loss_scale()), 4)
        self.assertEqual(backend.get_value(loss_scale._num_good_steps), 0)

        # Load model weights and ensure loss scale weights are restored.
        model = save.load_model(
            save_path, custom_objects={'AddLayer': mp_test_util.AddLayer})
        loss_scale = model.optimizer.loss_scale
        (weight, ) = model.trainable_weights
        loaded_weight = backend.get_value(weight)
        self.assertEqual(loaded_weight, orig_weight)
        # Currently the loss scale isn't always saved when the model is saved
        # with Model.save(). So we assert that the loss scale has either the
        # value it had when it was saved, or the value it was initialized with.
        # TODO(reedwm): Always save/restore the loss scale with Model.save().
        self.assertIn(backend.get_value(loss_scale()), (1, 2))
        self.assertIn(backend.get_value(loss_scale._num_good_steps), (0, 1))
Example #13
 def test_dynamic_loss_scaling_inf_target_post_scale(self):
     loss_scale = loss_scale_module.DynamicLossScale(
         initial_loss_scale=32.0)
     x = constant_op.constant(3.0 * (10**37))
     with lsgt.LossScalingGradientTape(loss_scale) as g:
         g.watch(x)
         y = x * 3.0  # target will be inf after scaling
     dy_dx = g.gradient(y, x)
     self.assertAllClose(self.evaluate(dy_dx), 3.0)
     self.assertEqual(self.evaluate(loss_scale()), 32.0)
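Example #14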
  def test_dynamic_scale_to_one_on_non_finite_gradient(
      self, strategy_fn, non_finite_term, use_tf_function):
    loss_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=32)
    strategy = strategy_fn()
    with strategy.scope():
      x = variables.Variable(3.0)
    def run_fn():
      with lsgt.LossScaleGradientTape(loss_scale) as g:
        y = x * non_finite_term
      g.gradient(y, x)

    self._run_with_strategy(run_fn, strategy, use_tf_function)
    self.assertEqual(self.evaluate(loss_scale()), 1.0)
Example #15
    def test_dynamic_scale_to_one_on_non_finite_gradient(
            self, strategy_fn, non_finite_term, use_tf_function):
        loss_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=32)

        def run_fn():
            x = constant_op.constant(1.0)
            with lsgt.LossScaleGradientTape(loss_scale) as g:
                g.watch(x)
                y = x * non_finite_term
            g.gradient(y, x)

        self._run_with_strategy(run_fn, strategy_fn(), use_tf_function)
        self.assertEqual(self.evaluate(loss_scale()), 1.0)
Example #16
 def test_repr(self, strategy_fn):
   with strategy_fn().scope():
     loss_scale = loss_scale_module.DynamicLossScale(
         initial_loss_scale=1, increment_period=2, multiplier=3)
     if context.executing_eagerly():
       self.assertEqual(repr(loss_scale),
                        'DynamicLossScale(current_loss_scale=1.0, '
                        'num_good_steps=0, initial_loss_scale=1.0, '
                        'increment_period=2, multiplier=3.0)')
     else:
       self.assertEqual(repr(loss_scale),
                        'DynamicLossScale(initial_loss_scale=1.0, '
                        'increment_period=2, multiplier=3.0)')
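Example #17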
  def test_dynamic_loss_scaling_down_loop(self, strategy_fn, use_tf_function):
    loss_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=32)
    strategy = strategy_fn()
    with strategy.scope():
      x = variables.Variable(3.0)
    def run_fn():
      with lsgt.LossScaleGradientTape(loss_scale) as g:
        y = x * (3.0 * (10**37))  # grad will be inf after scaling
      return g.gradient(y, x)

    dy_dx_list = self._run_with_strategy(run_fn, strategy, use_tf_function)
    self.assertEqual(self.evaluate(loss_scale()), 8.0)
    for dy_dx in dy_dx_list:
      self.assertAllClose(self.evaluate(dy_dx), (3.0 * (10**37)), atol=1e-06)
Example #18
def get(identifier):
    """Get a loss scale object."""
    if isinstance(identifier, dict):
        return deserialize(identifier)

    if isinstance(identifier, (int, float)):
        return loss_scale_module.FixedLossScale(identifier)
    if identifier == 'dynamic':
        return loss_scale_module.DynamicLossScale()
    if isinstance(identifier, loss_scale_module.LossScale):
        return identifier
    if identifier is None:
        return None
    raise ValueError('Could not interpret loss scale identifier: %s' %
                     identifier)
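
Hedged usage examples for get(), following its branches above; the dict keys shown are illustrative of a serialized FixedLossScale config.

get(512)        # -> FixedLossScale(512)
get('dynamic')  # -> DynamicLossScale()
get(None)       # -> None
get({'class_name': 'FixedLossScale',
     'config': {'loss_scale_value': 512.0}})  # -> deserialized FixedLossScale
get('bogus')    # raises ValueError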
Example #19
    def testSerializationWithBuiltInOptimizer(self):
        opt = gradient_descent.SGD(2., momentum=0.5)
        loss_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=2.,
                                                        increment_period=3.,
                                                        multiplier=4.)
        opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
        config = optimizers.serialize(opt)
        opt = optimizers.deserialize(config)
        # Force hyperparameters to be created
        opt.lr  # pylint: disable=pointless-statement
        self.evaluate(variables.global_variables_initializer())

        self.assertEqual(self.evaluate(opt.lr), 2.)
        self.assertEqual(self.evaluate(opt._optimizer.momentum), 0.5)
        self.assertEqual(self.evaluate(opt.loss_scale()), 2.)
        self.assertEqual(opt.loss_scale.increment_period, 3.)
        self.assertEqual(opt.loss_scale.multiplier, 4.)
Example #20
    def test_dynamic_loss_scaling_inf_target_post_scale(
            self, strategy_fn, use_tf_function):
        loss_scale = loss_scale_module.DynamicLossScale(
            initial_loss_scale=32.0)

        def run_fn():
            x = constant_op.constant(3.0 * (10**37))
            with lsgt.LossScaleGradientTape(loss_scale) as g:
                g.watch(x)
                y = x * 3.0  # target will be inf after scaling
            return g.gradient(y, x)

        dy_dx_list = self._run_with_strategy(run_fn, strategy_fn(),
                                             use_tf_function)
        self.assertEqual(self.evaluate(loss_scale()), 32.0)
        for dy_dx in dy_dx_list:
            self.assertAllClose(self.evaluate(dy_dx), 3.0)
Example #21
  def testWeightMethods(self):
    var = variables.Variable([1.0])
    opt = gradient_descent.SGD(1.0)
    initial_loss_scale = 2.
    loss_scale = loss_scale_module.DynamicLossScale(
        initial_loss_scale=initial_loss_scale, increment_period=1,
        multiplier=4)
    opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
    run_op = opt.minimize(lambda: var * 2, [var])
    self.evaluate(variables.global_variables_initializer())
    self._run_if_in_graph_mode(run_op)

    self.assertLen(opt.weights, 1)  # The 'iterations' weight
    self.assertEqual(self.evaluate(opt.weights[0]), 1)
    self.assertEqual(opt.get_weights()[0], 1)
    self.assertEqual(self.evaluate(opt.variables()[0]), 1)
    opt.set_weights([np.array(2.)])
    self.assertEqual(self.evaluate(opt.variables()[0]), 2)
Example #22
    def test_save_weights_with_dynamic_loss_scaling(self, strategy_fn):
        if testing_utils.should_run_distributed():
            self.skipTest('b/137397816')
        if not self._is_strategy_supported(strategy_fn):
            return
        strategy = strategy_fn()
        if (isinstance(strategy, mirrored_strategy.MirroredStrategy)
                and not context.executing_eagerly()):
            # TODO(b/121381184): Enable running the test in this case.
            return

        # Create and run model.
        with strategy.scope():
            x = layers.Input(shape=(2, ), batch_size=2, dtype=dtypes.float32)
            y = AddLayer(assert_type=dtypes.float32)(x)
            model = models.Model(inputs=x, outputs=y)

            loss_scale = loss_scale_module.DynamicLossScale(
                initial_loss_scale=1., increment_period=2., multiplier=2.)
            opt = gradient_descent.SGD(1.)
            opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
            model.compile(
                optimizer=opt,
                loss='mse',
                run_eagerly=testing_utils.should_run_eagerly(),
                run_distributed=testing_utils.should_run_distributed())
        # Run for 3 steps (6 examples with a batch size of 2)
        model.fit(np.zeros((6, 2)), np.zeros((6, 2)), batch_size=2)
        self.assertEqual(backend.get_value(loss_scale()), 2)
        self.assertEqual(backend.get_value(loss_scale._num_good_steps), 1)

        # Save model weights.
        save_prefix = os.path.join(self.get_temp_dir(), 'ckpt')
        model.save_weights(save_prefix)

        # Run model again for 1 step (2 examples with a batch size of 2)
        model.fit(np.zeros((2, 2)), np.zeros((2, 2)), batch_size=2)
        self.assertEqual(backend.get_value(loss_scale()), 4)
        self.assertEqual(backend.get_value(loss_scale._num_good_steps), 0)

        # Load model weights and ensure loss scale weights are restored.
        model.load_weights(save_prefix)
        self.assertEqual(backend.get_value(loss_scale()), 2)
        self.assertEqual(backend.get_value(loss_scale._num_good_steps), 1)
Example #23
    def testClipping(self, strategy_fn):
        strategy = strategy_fn()
        learning_rate = 2.
        for clip_type in ('clipnorm', 'global_clipnorm', 'clipvalue'):
            with strategy.scope(), self.subTest(clip_type=clip_type):
                var = variables.Variable([5.0])
                opt = gradient_descent.SGD(learning_rate, **{clip_type: 2.0})
                loss_scale = loss_scale_module.DynamicLossScale(
                    initial_loss_scale=2, increment_period=1, multiplier=2)
                opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
                self.assertEqual(getattr(opt, clip_type), 2.0)
                self.assertEqual(
                    loss_scale.initial_loss_scale %
                    strategy.num_replicas_in_sync, 0)

                loss = lambda: var * 4 / strategy.num_replicas_in_sync
                run_fn = lambda: opt.minimize(loss, var_list=[var])

                # Test running with clipped gradients
                run_op = strategy.experimental_run(run_fn)
                self.evaluate(variables.global_variables_initializer())
                self._run_if_in_graph_mode(run_op)
                # The gradient is 4 but is clipped to 2, so the variable will be
                # init_val - clipped_grad * lr == 5 - 2 * 2 == 1
                self.assertAllClose([1.], self.evaluate(var))
                self.assertEqual(self.evaluate(opt.loss_scale()), 4)

                # Test changing the clip amount and running again
                setattr(opt, clip_type, 3.0)
                run_op = strategy.experimental_run(run_fn)
                self._run_if_in_graph_mode(run_op)
                # The gradient is 4 but is clipped to 3, so the variable will be
                # prev_var - clipped_grad * lr == 1 - 3 * 2 == -5
                self.assertAllClose([-5.], self.evaluate(var))
                self.assertEqual(self.evaluate(opt.loss_scale()), 8)

                # Test Inf gradients are still skipped instead of being clipped
                loss = lambda: var * float('Inf')
                run_fn = lambda: opt.minimize(loss, var_list=[var])
                run_op = strategy.experimental_run(run_fn)
                self._run_if_in_graph_mode(run_op)
                self.assertAllClose([-5.],
                                    self.evaluate(var))  # Var does not change
                self.assertEqual(self.evaluate(opt.loss_scale()), 4)
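Example #24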
  def testDynamicLossScaleWithFloat16Loss(self, strategy_fn):
    strategy = strategy_fn()
    learning_rate = 2.
    with strategy.scope():
      var = variables.Variable([5.0])
      opt = gradient_descent.SGD(learning_rate)
      loss_scale = loss_scale_module.DynamicLossScale(
          initial_loss_scale=2, increment_period=1, multiplier=2)
      opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)

      def loss():
        return math_ops.cast(var / strategy.num_replicas_in_sync, 'float16')
      run_fn = lambda: opt.minimize(loss, var_list=[var])
      run_op = strategy.experimental_run(run_fn)
      self.evaluate(variables.global_variables_initializer())
      self._run_if_in_graph_mode(run_op)
      # The loss is the identity of the variable. Therefore the gradient is 1,
      # and so the variable will be init_val - grad * lr == 5 - 1 * 2 == 3
      self.assertAllClose([3.], self.evaluate(var))
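
The float16 cast above is the motivating case for loss scaling: gradients smaller than float16's subnormal floor (about 6e-8) flush to zero unless the loss is scaled up first. A hedged NumPy sketch of the underflow:

import numpy as np

small_grad = np.float16(1e-8)
print(small_grad)                 # 0.0 -- underflowed in float16
scaled = np.float16(1e-8 * 1024)  # scale the loss, hence the gradient
print(np.float32(scaled) / 1024)  # ~1e-8 recovered after unscaling in float32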
Example #25
    def testPassingV1LossScale(self, strategy_fn):
        strategy = strategy_fn()
        learning_rate = 2.
        with strategy.scope():
            # Test FixedLossScale
            var = variables.Variable([5.0])
            opt = gradient_descent.SGD(learning_rate)
            loss_scale = tf_loss_scale_module.FixedLossScale(2.)
            opt = loss_scale_optimizer.LossScaleOptimizerV1(opt, loss_scale)
            self.assertIsInstance(opt.loss_scale, ops.Tensor)
            self.evaluate(variables.global_variables_initializer())
            self.assertEqual(self.evaluate(opt.loss_scale), 2)
            run_fn = self._run_fn_with_grad_check(
                strategy, var, opt, 2 / strategy.num_replicas_in_sync)
            run_op = strategy.experimental_run(run_fn)
            self.evaluate(variables.global_variables_initializer())
            self._run_if_in_graph_mode(run_op)
            # The loss is the identity of the variable. Therefore the gradient is 1,
            # and so the variable will be init_val - grad * lr == 5 - 1 * 2 == 3
            self.assertAllClose([3.], self.evaluate(var))

            # Test DynamicLossScale
            var = variables.Variable([5.0])
            opt = gradient_descent.SGD(learning_rate)
            loss_scale = tf_loss_scale_module.DynamicLossScale(
                initial_loss_scale=4, increment_period=1, multiplier=2)
            loss_scale._current_loss_scale.assign(2)
            opt = loss_scale_optimizer.LossScaleOptimizerV1(opt, loss_scale)
            self.assertEqual(opt.initial_scale, 4)
            self.assertEqual(opt.dynamic_growth_steps, 1)
            self.evaluate(variables.global_variables_initializer())
            # The current loss scale is not copied, so the loss scale is
            # reinitialized to 4.
            self.assertEqual(self.evaluate(opt.loss_scale), 4)
            for s in strategy.experimental_local_results(opt.dynamic_counter):
                self.assertEqual(self.evaluate(s), 0)

            run_fn = self._run_fn_with_grad_check(
                strategy, var, opt, 4 / strategy.num_replicas_in_sync)
            run_op = strategy.experimental_run(run_fn)
            self.evaluate(variables.global_variables_initializer())
            self._run_if_in_graph_mode(run_op)
            self.assertAllClose([3.], self.evaluate(var))
Example #26
    def testCheckpoint(self, strategy_fn):
        strategy = strategy_fn()
        if (isinstance(strategy, mirrored_strategy.MirroredStrategy)
                and not context.executing_eagerly()):
            # TODO(b/121381184): Enable running the test in this case.
            return

        with self.test_session(), strategy.scope():
            # Build and run a simple model.
            var = variables.Variable([2.0])
            loss_scale = loss_scale_module.DynamicLossScale(
                initial_loss_scale=1., increment_period=2., multiplier=2.)
            opt = gradient_descent.SGD(1., momentum=1.)
            opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
            run_fn = lambda: opt.minimize(lambda: var + 1., var_list=[var])
            opt_op = strategy.experimental_run(run_fn)
            self.evaluate(variables.global_variables_initializer())
            self.evaluate(opt_op)
            self.assertEqual(self.evaluate(loss_scale()), 1.)
            self.assertEqual(self.evaluate(loss_scale._num_good_steps), 1)
            slot_var = opt._optimizer.get_slot(var, 'momentum')
            slot_value = self.evaluate(slot_var).item()

            # Save a checkpoint.
            checkpoint = trackable_utils.Checkpoint(optimizer=opt, var=var)
            prefix = os.path.join(self.get_temp_dir(), 'ckpt')
            save_path = checkpoint.save(prefix)

            # Run model again.
            self.evaluate(strategy.experimental_run(run_fn))
            self.assertEqual(self.evaluate(loss_scale()), 2.)
            self.assertEqual(self.evaluate(loss_scale._num_good_steps), 0)
            self.assertNotAlmostEqual(
                self.evaluate(slot_var).item(), slot_value)

            # Load checkpoint and ensure loss scale is back to its original value.
            status = checkpoint.restore(save_path)
            status.assert_consumed()
            status.run_restore_ops()
            self.assertEqual(self.evaluate(loss_scale()), 1.)
            self.assertEqual(self.evaluate(loss_scale._num_good_steps), 1)
            self.assertAlmostEqual(self.evaluate(slot_var).item(), slot_value)
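
Outside the test harness, the same round trip can be written against public TF 2.x APIs. This is a hedged sketch assuming eager mode and the TF 2.4+ Keras mixed-precision wrapper; the checkpoint path is hypothetical.

import tensorflow as tf

var = tf.Variable([2.0])
opt = tf.keras.mixed_precision.LossScaleOptimizer(
    tf.keras.optimizers.SGD(1.0, momentum=1.0))
ckpt = tf.train.Checkpoint(optimizer=opt, var=var)

opt.minimize(lambda: var + 1.0, var_list=[var])
path = ckpt.save('/tmp/lso_ckpt')                # hypothetical prefix

opt.minimize(lambda: var + 1.0, var_list=[var])  # state moves on
ckpt.restore(path)                               # loss-scale state restored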
Example #27
    def testDynamicLossScaleWithSlots(self, strategy_fn):
        strategy_obj = strategy_fn()
        if (isinstance(strategy_obj, mirrored_strategy.MirroredStrategy)
                and control_flow_v2_toggles.control_flow_v2_enabled()
                and not context.executing_eagerly()):
            self.skipTest('b/138667997')
        with strategy_obj.scope() as strategy:
            var = variables.Variable([1.0, 2.0])
            # An SGD optimizer with momentum has slot variables.
            opt = gradient_descent.SGD(1.0, momentum=1.)
            initial_loss_scale = 2.
            loss_scale = loss_scale_module.DynamicLossScale(
                initial_loss_scale=initial_loss_scale,
                increment_period=1,
                multiplier=4)
            opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
            loss = lambda: var / strategy.num_replicas_in_sync
            run_fn = lambda: opt.minimize(loss, var_list=[var])
            run_op = strategy.experimental_run(run_fn)
            self.evaluate(variables.global_variables_initializer())
            self._run_if_in_graph_mode(run_op)
            # The momentum accumulator starts at 0 and the gradient is 1. The
            # accumulator is incremented by the gradient, so it is now 1. The
            # accumulator is then subtracted from the variable, so the variable
            # decreases by 1.
            self.assertAllClose([0.0, 1.0], self.evaluate(var))
            self.assertEqual(self.evaluate(opt.loss_scale()),
                             initial_loss_scale * 4)

            run_op = strategy.experimental_run(run_fn)
            self._run_if_in_graph_mode(run_op)
            # The momentum accumulator was 1 before this step and the gradient
            # is 1. The accumulator is incremented by the gradient, so it is
            # now 2. The accumulator is then subtracted from the variable, so
            # the variable decreases by 2.
            self.assertAllClose([-2., -1.], self.evaluate(var))
            self.assertEqual(self.evaluate(opt.loss_scale()),
                             initial_loss_scale * 16)

            self.assertEqual(opt.get_slot_names(), ['momentum'])
Example #28
  def test_loss_scale(self):
    policy = mp_policy.Policy('float32')
    self.assertEqual(policy.loss_scale, None)

    policy = mp_policy.Policy('float32', loss_scale=None)
    self.assertEqual(policy.loss_scale, None)

    ls = loss_scale_module.DynamicLossScale()
    policy = mp_policy.Policy('float32', loss_scale=ls)
    self.assertIs(policy.loss_scale, ls)

    policy = mp_policy.Policy('float32', loss_scale='dynamic')
    self.assertIsInstance(policy.loss_scale, loss_scale_module.DynamicLossScale)

    policy = mp_policy.Policy('mixed_float16')
    self.assertIsInstance(policy.loss_scale, loss_scale_module.DynamicLossScale)

    policy = mp_policy.Policy('mixed_float16', loss_scale=None)
    self.assertEqual(policy.loss_scale, None)

    policy = mp_policy.Policy('mixed_bfloat16')
    self.assertEqual(policy.loss_scale, None)
Example #29
    def testPassingV1LossScaleErrors(self):
        opt = gradient_descent.SGD()
        loss_scale = tf_loss_scale_module.DynamicLossScale(multiplier=4)
        with self.assertRaisesRegex(
                ValueError, 'When passing a DynamicLossScale to "loss_scale", '
                'DynamicLossScale.multiplier must be 2. Got: '
                'DynamicLossScale'):
            loss_scale_optimizer.LossScaleOptimizerV1(opt, loss_scale)

        class MyLossScale(tf_loss_scale_module.LossScale):
            def __call__(self):
                return 1.

            def update(self, grads):
                return None, True

            def get_config(self):
                return {}

        with self.assertRaisesRegex(
                TypeError,
                'Passing a LossScale that is not a FixedLossScale or a '
                'DynamicLossScale is no longer supported. Got:'):
            loss_scale_optimizer.LossScaleOptimizerV1(opt, MyLossScale())
Example #30
    def testCheckpoint(self, strategy_fn, save_with_ls, restore_with_ls):
        class MySGD(gradient_descent.SGD):
            """A custom optimizer that tracks an extra variable."""
            def __init__(self, *args, **kwargs):
                super(MySGD, self).__init__(*args, **kwargs)
                self.my_var = variables.Variable(0.)
                self._track_trackable(self.my_var, 'my_var')

        strategy = strategy_fn()
        replicas = strategy.num_replicas_in_sync
        if (isinstance(strategy, mirrored_strategy.MirroredStrategy)
                and not context.executing_eagerly()):
            # TODO(b/121381184): Enable running the test in this case.
            return

        with self.test_session(), strategy.scope():
            # Build and run a simple model.
            var = variables.Variable([2.0])
            opt = inner_opt = MySGD(1., momentum=1.)
            if save_with_ls:
                loss_scale = loss_scale_module.DynamicLossScale(
                    initial_loss_scale=1., increment_period=2., multiplier=2.)
                opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
            run_fn = lambda: opt.minimize(lambda: var / replicas + 1.,
                                          var_list=[var])
            opt_op = strategy.experimental_run(run_fn)
            self.evaluate(variables.global_variables_initializer())
            self.evaluate(strategy.experimental_local_results(opt_op))

            # Assert values.
            self.assertEqual(self.evaluate(var), 1.)
            if save_with_ls:
                self.assertEqual(self.evaluate(loss_scale()), 1.)
                self.assertEqual(self.evaluate(loss_scale._num_good_steps), 1)
            slot_var = opt.get_slot(var, 'momentum')
            self.assertEqual(self.evaluate(slot_var).item(), -1)
            self.assertEqual(self.evaluate(opt.iterations), 1)

            # Set an optimizer variable to check that arbitrary optimizer
            # attributes can be saved/restored.
            self.evaluate(inner_opt.my_var.assign(1.))

            # Save a checkpoint.
            checkpoint = trackable_utils.Checkpoint(optimizer=opt, var=var)
            prefix = os.path.join(self.get_temp_dir(), 'ckpt')
            save_path = checkpoint.save(prefix)

            # Create new model
            var = variables.Variable([2.0])
            opt = inner_opt = MySGD(1., momentum=1.)
            if restore_with_ls:
                loss_scale = loss_scale_module.DynamicLossScale(
                    initial_loss_scale=1., increment_period=2., multiplier=2.)
                opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)

            # Restore new model.
            checkpoint = trackable_utils.Checkpoint(optimizer=opt, var=var)
            status = checkpoint.restore(save_path)
            if save_with_ls:
                status.assert_existing_objects_matched()
            else:
                status.assert_nontrivial_match()

            # Assert restored values. We can only assert in eager mode since the
            # variables are uninitialized in graph mode
            if context.executing_eagerly():
                self.assertEqual(self.evaluate(var), 1.)
                if save_with_ls and restore_with_ls:
                    self.assertEqual(self.evaluate(loss_scale()), 1.)
                    self.assertEqual(self.evaluate(loss_scale._num_good_steps),
                                     1)
                elif restore_with_ls:
                    self.assertEqual(self.evaluate(loss_scale()), 1.)
                    self.assertEqual(self.evaluate(loss_scale._num_good_steps),
                                     0)
                self.assertEqual(self.evaluate(opt.iterations), 1)

            # Run the model again.
            run_fn = lambda: opt.minimize(lambda: var / replicas + 1.,
                                          var_list=[var])
            opt_op = strategy.experimental_run(run_fn)

            # Assert new values.
            self.evaluate(variables.global_variables_initializer())
            status.run_restore_ops()
            self.evaluate(strategy.experimental_local_results(opt_op))
            self.assertEqual(self.evaluate(var), -1)
            slot_var = opt.get_slot(var, 'momentum')
            self.assertEqual(self.evaluate(slot_var).item(), -2)
            self.assertEqual(self.evaluate(opt.iterations), 2)
            self.assertEqual(self.evaluate(inner_opt.my_var), 1)

            # Restore model again to test restoring after slots are created
            status = checkpoint.restore(save_path)
            if save_with_ls and restore_with_ls:
                status.assert_consumed()
            elif save_with_ls:
                status.assert_existing_objects_matched()
            elif restore_with_ls:
                status.assert_nontrivial_match()
            status.run_restore_ops()
            self.assertEqual(self.evaluate(var), 1)
            self.assertEqual(self.evaluate(slot_var).item(), -1)
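
For reference, the values asserted above follow from Keras SGD's momentum rule (acc = momentum * acc - lr * grad; var = var + acc). A hedged recap with lr = 1, momentum = 1 and a total gradient of 1 (the loss is the variable plus a constant):

var, acc = 2.0, 0.0
for step in (1, 2):
    grad = 1.0
    acc = 1.0 * acc - 1.0 * grad  # momentum slot: -1 after step 1, -2 after 2
    var = var + acc               # variable: 1 after step 1, -1 after step 2
    print(step, var, acc)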