Exemple #1
0
    def testGetConfig(self):
        """Round-trips a LossScaleOptimizer through get_config/from_config."""
        sgd = gradient_descent.SGD(2., momentum=0.5)
        dynamic_scale = loss_scale_module.DynamicLossScale(
            initial_loss_scale=2., increment_period=3., multiplier=4.)
        original = loss_scale_optimizer.LossScaleOptimizer(sgd, dynamic_scale)
        restored = loss_scale_optimizer.LossScaleOptimizer.from_config(
            original.get_config())
        # Accessing `lr` forces the hyperparameters to be created.
        restored.lr  # pylint: disable=pointless-statement
        self.evaluate(variables.global_variables_initializer())

        # Values of the inner optimizer survive the round trip.
        self.assertEqual(self.evaluate(restored.lr), 2.)
        self.assertEqual(self.evaluate(restored._optimizer.momentum), 0.5)
        # Values of the dynamic loss scale survive the round trip.
        restored_scale = restored.loss_scale
        self.assertEqual(self.evaluate(restored_scale()), 2.)
        self.assertEqual(restored_scale.increment_period, 3.)
        self.assertEqual(restored_scale.multiplier, 4.)
def _wrap_optimizer(opt, loss_scale, use_v1_behavior):
  """Wraps an optimizer with a LossScaleOptimizer.

  Args:
    opt: The optimizer to wrap. Either an `optimizer.Optimizer` instance or an
      object whose class hierarchy contains a class named `OptimizerV2`.
    loss_scale: Forwarded unchanged as the second constructor argument of the
      wrapping optimizer.
    use_v1_behavior: Only selects the wording of the error raised when `opt`
      is of an unsupported type.

  Returns:
    A `MixedPrecisionLossScaleOptimizer` wrapping `opt` if `opt` is an
    `optimizer.Optimizer`; otherwise a Keras `LossScaleOptimizer` wrapping
    `opt`.

  Raises:
    ValueError: If `opt` is already a loss scale optimizer, or if it is
      neither of the supported optimizer types.
  """

  if isinstance(opt, loss_scale_optimizer_v1.MixedPrecisionLossScaleOptimizer):
    raise ValueError('"opt" must not already be an instance of a '
                     'MixedPrecisionLossScaleOptimizer. '
                     '`enable_mixed_precision_graph_rewrite` will '
                     'automatically wrap the optimizer with a '
                     'MixedPrecisionLossScaleOptimizer.')
  # To avoid a circular dependency, we cannot depend on tf.keras. Because
  # LossScaleOptimizer is in Keras, we cannot use isinstance, so instead check
  # the class name.
  if opt.__class__.__name__ == 'LossScaleOptimizer':
    raise ValueError('"opt" must not already be an instance of a '
                     'LossScaleOptimizer. '
                     '`enable_mixed_precision_graph_rewrite` will '
                     'automatically wrap the optimizer with a '
                     'LossScaleOptimizer.')

  if isinstance(opt, optimizer.Optimizer):
    # For convenience, we allow the V2 version of this function to wrap the V1
    # optimizer, even though we do not document this.
    return loss_scale_optimizer_v1.MixedPrecisionLossScaleOptimizer(opt,
                                                                    loss_scale)

  # Because we cannot depend on tf.keras, we see if `opt` is an instance of the
  # Keras OptimizerV2 class by checking the subclass names.
  base_class_names = [cls.__name__ for cls in tf_inspect.getmro(opt.__class__)]
  is_keras_optimizer_v2 = 'OptimizerV2' in base_class_names

  if is_keras_optimizer_v2:
    # Because we cannot depend on tf.keras, we cannot unconditionally do this
    # import. But since `opt` is a Keras OptimizerV2, we know keras is
    # importable, so it is safe to do this import. (Technically, it's possible
    # to have a dependency on OptimizerV2 and not LossScaleOptimizer, but this
    # is not done in practice).
    from tensorflow.python.keras.mixed_precision.experimental import loss_scale_optimizer as loss_scale_optimizer_v2  # pylint: disable=g-import-not-at-top
    return loss_scale_optimizer_v2.LossScaleOptimizer(opt, loss_scale)

  # `opt` is wrapped in a 1-tuple for formatting: with a bare `% opt`, a
  # tuple-valued `opt` would be unpacked as the format arguments and raise a
  # TypeError instead of the intended ValueError.
  if use_v1_behavior:
    raise ValueError('"opt" must be an instance of a tf.train.Optimizer or a '
                     'tf.keras.optimizers.Optimizer, but got: %s' % (opt,))
  raise ValueError('"opt" must be an instance of a '
                   'tf.keras.optimizers.Optimizer, but got: %s' % (opt,))
Exemple #3
0
    def test_fixed_loss_scaling(self, strategy_fn, cloning=True):
        """Fits a one-layer model once under a fixed loss scale.

        The gradient reaching the layer output is checked against
        `loss_scale / batch_size`, and `layer.v` is checked before and after
        the single `fit` call.
        """
        if testing_utils.should_run_distributed():
            self.skipTest('b/137397816')
        # Note: We do not test mixed precision in this method, only loss scaling.
        if not self._is_strategy_supported(strategy_fn):
            return
        batch_size = 4
        loss_scale = 8.
        with strategy_fn().scope():
            inputs = layers.Input(shape=(1, ), batch_size=batch_size)
            layer = AddLayer()
            outputs = layer(inputs)

            # The gradient of the layer output at this point is 1. With loss
            # scaling, the gradient is 'loss_scale'. We divide by the batch
            # size since the loss is averaged across batch elements.
            grad_check_fn = mp_test_util.create_identity_with_grad_check_fn(
                [loss_scale / batch_size])
            outputs = core.Lambda(grad_check_fn)(outputs)
            model = models.Model(inputs=inputs, outputs=outputs)

            def loss_fn(y_true, y_pred):
                # The labels are ignored; the loss is the mean prediction.
                del y_true
                return math_ops.reduce_mean(y_pred)

            sgd = gradient_descent.SGD(1.)
            wrapped_opt = loss_scale_optimizer.LossScaleOptimizer(
                sgd, loss_scale)
            model.compile(
                wrapped_opt,
                loss=loss_fn,
                cloning=cloning,
                run_eagerly=testing_utils.should_run_eagerly(),
                run_distributed=testing_utils.should_run_distributed())

        self.assertEqual(backend.eval(layer.v), 1)
        features = np.ones((batch_size, 1))
        labels = np.ones((batch_size, 1))
        dataset = dataset_ops.Dataset.from_tensor_slices(
            (features, labels)).batch(batch_size)
        model.fit(dataset)
        # Variable starts at 1, and should have gradient of 1 subtracted from it.
        self.assertEqual(backend.eval(layer.v), 0)
Exemple #4
0
 def testFixedLossScaleAppliedToLossWithMinimize(self, strategy_fn):
     """Checks a fixed loss scale is applied to the loss during minimize."""
     with strategy_fn().scope() as strategy:
         var = variables.Variable([5.0])
         sgd = gradient_descent.SGD(2.0)
         loss_scale = 10.
         wrapped_opt = loss_scale_optimizer.LossScaleOptimizer(
             sgd, loss_scale)
         replicas = strategy.num_replicas_in_sync
         # We need num_replicas_in_sync to divide loss_scale, otherwise
         # loss_scale / strategy.num_replicas_in_sync will not be exact, which
         # could lead to assertion failures due to rounding issues.
         self.assertEqual(loss_scale % replicas, 0)
         expected_grad = loss_scale / replicas
         run_fn = self._run_fn_with_grad_check(
             strategy, var, wrapped_opt, expected_grad)
         run_op = strategy.experimental_run(run_fn)
         self.evaluate(variables.global_variables_initializer())
         self._run_if_in_graph_mode(run_op)
         # The loss is the identity of the variable. Therefore the gradient
         # is 1, and so the variable will be
         # init_val - grad * lr == 5 - 1 * 2 == 3
         self.assertAllClose([3.], self.evaluate(var))
  def testWeightMethods(self):
    """Checks weights, get_weights, set_weights and variables on the wrapper."""
    var = variables.Variable([1.0])
    sgd = gradient_descent.SGD(1.0)
    dynamic_scale = loss_scale_module.DynamicLossScale(
        initial_loss_scale=2., increment_period=1, multiplier=4)
    opt = loss_scale_optimizer.LossScaleOptimizer(sgd, dynamic_scale)

    def loss():
      return var * 2

    run_op = opt.minimize(loss, [var])
    self.evaluate(variables.global_variables_initializer())
    self._run_if_in_graph_mode(run_op)

    self.assertLen(opt.weights, 1)  # The 'iterations' weight
    # All three accessors report the same single value after one step.
    self.assertEqual(self.evaluate(opt.weights[0]), 1)
    self.assertEqual(opt.get_weights()[0], 1)
    self.assertEqual(self.evaluate(opt.variables()[0]), 1)
    # A value written through set_weights is visible through variables().
    new_weights = [np.array(2.)]
    opt.set_weights(new_weights)
    self.assertEqual(self.evaluate(opt.variables()[0]), 2)
  def testDynamicLossScaleWithFloat16Loss(self, strategy_fn):
    """Minimizes a float16 loss under dynamic loss scaling and checks the step."""
    strategy = strategy_fn()
    learning_rate = 2.
    with strategy.scope():
      var = variables.Variable([5.0])
      sgd = gradient_descent.SGD(learning_rate)
      opt = loss_scale_optimizer.LossScaleOptimizer(
          sgd, initial_scale=2, dynamic_growth_steps=1)

      def loss():
        per_replica = var / strategy.num_replicas_in_sync
        return math_ops.cast(per_replica, 'float16')

      def run_fn():
        return opt.minimize(loss, var_list=[var])

      run_op = strategy.experimental_run(run_fn)
      self.evaluate(variables.global_variables_initializer())
      self._run_if_in_graph_mode(run_op)
      # The loss is the identity of the variable. Therefore the gradient is 1,
      # and so the variable will be init_val - grad * lr == 5 - 1 * 2 == 3
      self.assertAllClose([3.], self.evaluate(var))
Exemple #7
0
    def test_save_weights_with_dynamic_loss_scaling(self, strategy_fn):
        """Checks `load_weights` brings back the dynamic loss scale state."""
        if testing_utils.should_run_distributed():
            self.skipTest('b/137397816')
        if not self._is_strategy_supported(strategy_fn):
            return
        strategy = strategy_fn()
        is_mirrored = isinstance(strategy, mirrored_strategy.MirroredStrategy)
        if is_mirrored and not context.executing_eagerly():
            # TODO(b/121381184): Enable running the test in this case.
            return

        # Create and run model.
        with strategy.scope():
            inputs = layers.Input(
                shape=(2, ), batch_size=2, dtype=dtypes.float32)
            outputs = AddLayer(assert_type=dtypes.float32)(inputs)
            model = models.Model(inputs=inputs, outputs=outputs)

            loss_scale = loss_scale_module.DynamicLossScale(
                initial_loss_scale=1., increment_period=2., multiplier=2.)
            sgd = gradient_descent.SGD(1.)
            wrapped_opt = loss_scale_optimizer.LossScaleOptimizer(
                sgd, loss_scale)
            model.compile(
                optimizer=wrapped_opt,
                loss='mse',
                run_eagerly=testing_utils.should_run_eagerly(),
                run_distributed=testing_utils.should_run_distributed())

        def fit_zeros(num_examples):
            # Trains on all-zero inputs and targets with a batch size of 2.
            model.fit(np.zeros((num_examples, 2)),
                      np.zeros((num_examples, 2)),
                      batch_size=2)

        def assert_loss_scale_state(scale_value, good_steps):
            self.assertEqual(backend.get_value(loss_scale()), scale_value)
            self.assertEqual(
                backend.get_value(loss_scale._num_good_steps), good_steps)

        # Run for 3 steps (6 examples with a batch size of 2)
        fit_zeros(6)
        assert_loss_scale_state(2, 1)

        # Save model weights.
        save_prefix = os.path.join(self.get_temp_dir(), 'ckpt')
        model.save_weights(save_prefix)

        # Run model again for 1 step (2 examples with a batch size of 2)
        fit_zeros(2)
        assert_loss_scale_state(4, 0)

        # Load model weights and ensure loss scale weights are restored.
        model.load_weights(save_prefix)
        assert_loss_scale_state(2, 1)
Exemple #8
0
    def testClipping(self, strategy_fn):
        """Checks each clipping option on an SGD wrapped in LossScaleOptimizer.

        For every clip type the test runs a clipped step, changes the clip
        amount through the wrapper and runs again, then runs a step with an
        Inf loss and checks the variable is left unchanged.
        """
        strategy = strategy_fn()
        learning_rate = 2.
        for clip_type in ('clipnorm', 'global_clipnorm', 'clipvalue'):
            with strategy.scope(), self.subTest(clip_type=clip_type):
                var = variables.Variable([5.0])
                clip_kwargs = {clip_type: 2.0}
                sgd = gradient_descent.SGD(learning_rate, **clip_kwargs)
                loss_scale = loss_scale_module.DynamicLossScale(
                    initial_loss_scale=2, increment_period=1, multiplier=2)
                opt = loss_scale_optimizer.LossScaleOptimizer(sgd, loss_scale)
                self.assertEqual(getattr(opt, clip_type), 2.0)
                replicas = strategy.num_replicas_in_sync
                self.assertEqual(loss_scale.initial_loss_scale % replicas, 0)

                def finite_loss():
                    return var * 4 / strategy.num_replicas_in_sync

                def minimize_finite():
                    return opt.minimize(finite_loss, var_list=[var])

                # Test running with clipped gradients
                run_op = strategy.experimental_run(minimize_finite)
                self.evaluate(variables.global_variables_initializer())
                self._run_if_in_graph_mode(run_op)
                # The gradient is 4 but is clipped to 2, so the variable will
                # be init_val - clipped_grad * lr == 5 - 2 * 2 == 1
                self.assertAllClose([1.], self.evaluate(var))
                self.assertEqual(self.evaluate(opt.loss_scale()), 4)

                # Test changing the clip amount and running again
                setattr(opt, clip_type, 3.0)
                run_op = strategy.experimental_run(minimize_finite)
                self._run_if_in_graph_mode(run_op)
                # The gradient is 4 but is clipped to 3, so the variable will
                # be prev_var - clipped_grad * lr == 1 - 3 * 2 == -5
                self.assertAllClose([-5.], self.evaluate(var))
                self.assertEqual(self.evaluate(opt.loss_scale()), 8)

                def inf_loss():
                    return var * float('Inf')

                def minimize_inf():
                    return opt.minimize(inf_loss, var_list=[var])

                # Test Inf gradients are still skipped instead of being clipped
                run_op = strategy.experimental_run(minimize_inf)
                self._run_if_in_graph_mode(run_op)
                # Var does not change
                self.assertAllClose([-5.], self.evaluate(var))
                self.assertEqual(self.evaluate(opt.loss_scale()), 4)
Exemple #9
0
    def testCheckpoint(self, strategy_fn):
        """Checks a Checkpoint restores loss scale state and the momentum slot."""
        strategy = strategy_fn()
        is_mirrored = isinstance(strategy, mirrored_strategy.MirroredStrategy)
        if is_mirrored and not context.executing_eagerly():
            # TODO(b/121381184): Enable running the test in this case.
            return

        with self.test_session(), strategy.scope():
            # Build and run a simple model.
            var = variables.Variable([2.0])
            loss_scale = loss_scale_module.DynamicLossScale(
                initial_loss_scale=1., increment_period=2., multiplier=2.)
            sgd = gradient_descent.SGD(1., momentum=1.)
            opt = loss_scale_optimizer.LossScaleOptimizer(sgd, loss_scale)

            def loss():
                return var + 1.

            def run_fn():
                return opt.minimize(loss, var_list=[var])

            def assert_loss_scale_state(scale_value, good_steps):
                self.assertEqual(self.evaluate(loss_scale()), scale_value)
                self.assertEqual(
                    self.evaluate(loss_scale._num_good_steps), good_steps)

            first_step = strategy.experimental_run(run_fn)
            self.evaluate(variables.global_variables_initializer())
            self.evaluate(first_step)
            assert_loss_scale_state(1., 1)
            slot_var = opt._optimizer.get_slot(var, 'momentum')
            saved_slot_value = self.evaluate(slot_var).item()

            # Save a checkpoint.
            checkpoint = trackable_utils.Checkpoint(optimizer=opt, var=var)
            save_path = checkpoint.save(
                os.path.join(self.get_temp_dir(), 'ckpt'))

            # Run model again.
            self.evaluate(strategy.experimental_run(run_fn))
            assert_loss_scale_state(2., 0)
            self.assertNotAlmostEqual(
                self.evaluate(slot_var).item(), saved_slot_value)

            # Load checkpoint and ensure loss scale is back to its original
            # value.
            status = checkpoint.restore(save_path)
            status.assert_consumed()
            status.run_restore_ops()
            assert_loss_scale_state(1., 1)
            self.assertAlmostEqual(
                self.evaluate(slot_var).item(), saved_slot_value)
Exemple #10
0
    def testDynamicLossScaleWithSlots(self, strategy_fn):
        """Runs momentum SGD twice under dynamic loss scaling and checks slots."""
        strategy_obj = strategy_fn()
        if (isinstance(strategy_obj, mirrored_strategy.MirroredStrategy)
                and control_flow_v2_toggles.control_flow_v2_enabled()
                and not context.executing_eagerly()):
            self.skipTest('b/138667997')
        with strategy_obj.scope() as strategy:
            var = variables.Variable([1.0, 2.0])
            # An SGD optimizer with momentum has slot variables.
            sgd = gradient_descent.SGD(1.0, momentum=1.)
            initial_loss_scale = 2.
            dynamic_scale = loss_scale_module.DynamicLossScale(
                initial_loss_scale=initial_loss_scale,
                increment_period=1,
                multiplier=4)
            opt = loss_scale_optimizer.LossScaleOptimizer(sgd, dynamic_scale)

            def loss():
                return var / strategy.num_replicas_in_sync

            def run_fn():
                return opt.minimize(loss, var_list=[var])

            first_step = strategy.experimental_run(run_fn)
            self.evaluate(variables.global_variables_initializer())
            self._run_if_in_graph_mode(first_step)
            # The momentum accumulator starts at 0 and the gradient is 1. The
            # accumulator is incremented by the gradient, so it is now 1. Then
            # the variable is subtracted by the accumulator, so the variable
            # is subtracted by 1.
            self.assertAllClose([0.0, 1.0], self.evaluate(var))
            self.assertEqual(self.evaluate(opt.loss_scale()),
                             initial_loss_scale * 4)

            second_step = strategy.experimental_run(run_fn)
            self._run_if_in_graph_mode(second_step)
            # The momentum accumulator was 1 before this step and the gradient
            # is 1. The accumulator is incremented by the gradient, so it is
            # now 2. Then the variable is subtracted by the accumulator, so
            # the variable is subtracted by 2.
            self.assertAllClose([-2., -1.], self.evaluate(var))
            self.assertEqual(self.evaluate(opt.loss_scale()),
                             initial_loss_scale * 16)

            self.assertEqual(opt.get_slot_names(), ['momentum'])
Exemple #11
0
    def testSerializationWithCustomOptimizer(self):
        """Serializes and deserializes a wrapper around a custom SGD subclass."""
        # SGD subclass carrying one extra attribute set in its constructor.
        class MySGD(gradient_descent.SGD):
            def __init__(self, *args, **kwargs):
                super(MySGD, self).__init__(*args, **kwargs)
                self.my_attribute = 123

        inner = MySGD(2., momentum=0.5)
        wrapped = loss_scale_optimizer.LossScaleOptimizer(
            inner, initial_scale=2., dynamic_growth_steps=3.)
        serialized = optimizers.serialize(wrapped)
        restored = optimizers.deserialize(
            serialized, custom_objects={'MySGD': MySGD})
        # Accessing `lr` forces the hyperparameters to be created.
        restored.lr  # pylint: disable=pointless-statement
        self.evaluate(variables.global_variables_initializer())

        self.assertEqual(self.evaluate(restored.lr), 2.)
        restored_inner = restored._optimizer
        self.assertEqual(self.evaluate(restored_inner.momentum), 0.5)
        self.assertEqual(self.evaluate(restored.loss_scale), 2.)
        self.assertEqual(restored.dynamic_growth_steps, 3.)
        self.assertEqual(restored._optimizer.my_attribute, 123)
  def testIterationsIncremented(self, strategy_fn):
    """Checks `iterations` advances per minimize call, even with NaN gradients."""
    with strategy_fn().scope() as strategy:
      sgd = gradient_descent.SGD(1.0)
      opt = loss_scale_optimizer.LossScaleOptimizer(sgd, loss_scale='dynamic')
      var = variables.Variable([5.0])

      def finite_loss():
        return var * 2.0 / strategy.num_replicas_in_sync

      def minimize_finite():
        return opt.minimize(finite_loss, [var])

      # Test iterations is incremented in opt.minimize.
      run_op = strategy.experimental_run(minimize_finite)
      self.evaluate(variables.global_variables_initializer())
      self._run_if_in_graph_mode(run_op)
      self.assertEqual(self.evaluate(var), 3.0)  # Grad is 2, so var is 5 - 2
      self.assertEqual(self.evaluate(opt.iterations), 1)

      def nan_loss():
        return var * float('NaN')

      def minimize_nan():
        return opt.minimize(nan_loss, [var])

      # Test iterations is incremented in opt.minimize even if gradients aren't
      # applied to variables due to NaN gradients.
      run_op = strategy.experimental_run(minimize_nan)
      self._run_if_in_graph_mode(run_op)
      self.assertEqual(self.evaluate(var), 3.0)
      self.assertEqual(self.evaluate(opt.iterations), 2)
    def testApplyGradientsGetsUnwrappedTensors(self):
        """Checks the inner optimizer's apply_gradients receives raw Tensors."""
        # Tests that gradients passed to apply_gradients are not wrapped in a
        # DistributionStrategy wrapper, such as PerReplica, but instead are raw
        # Tensors. Optimizer subclasses that override apply_gradients() expect raw
        # Tensors, even though the base Optimizer can handle PerReplica gradients.

        outer_self = self

        class MyOptimizer(gradient_descent.SGD):
            def apply_gradients(self, grads_and_vars, name=None):
                # Materialize first: a one-shot iterable (e.g. a zip object)
                # would be exhausted by the check loop below and then reach
                # the base class empty.
                grads_and_vars = list(grads_and_vars)
                for grad, _ in grads_and_vars:
                    outer_self.assertIsInstance(grad, ops.Tensor)
                return super(MyOptimizer,
                             self).apply_gradients(grads_and_vars, name)

        with create_mirrored_strategy().scope() as strategy:
            var = variables.Variable([5.0])
            opt = MyOptimizer(learning_rate=1.0)
            opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale=1)
            loss = lambda: var * 2.0
            run_fn = lambda: opt.minimize(loss, [var])
            strategy.experimental_run(run_fn)
  def test_restore_old_loss_scale_checkpoint(self):
    """Loads a TF 2.2 checkpoint both before and after training the model."""
    # Ensure a checkpoint from TF 2.2 can be loaded. The checkpoint format
    # of LossScaleOptimizer changed, but old checkpoints can still be loaded
    sgd = gradient_descent.SGD(0.1, momentum=0.1)
    opt = loss_scale_optimizer.LossScaleOptimizer(sgd, 'dynamic')
    model = sequential.Sequential([core.Dense(2,)])

    # The checkpoint and expected values were obtained from the program in
    # testdata/BUILD.
    ckpt_dir = test.test_src_dir_path(
        'python/keras/mixed_precision/experimental/testdata/lso_ckpt_tf2.2')
    ckpt_path = os.path.join(ckpt_dir, 'ckpt')
    expected_kernel = np.array([[9.229685, 10.901115], [10.370763, 9.757362]])
    expected_slot = np.array([[10.049943, 9.917691], [10.049943, 9.917691]])

    def kernel_value():
      return self.evaluate(model.weights[0])

    def momentum_slot_value():
      return self.evaluate(opt.get_slot(model.weights[0], 'momentum'))

    def assert_checkpoint_values():
      self.assertAllClose(kernel_value(), expected_kernel)
      self.assertAllClose(momentum_slot_value(), expected_slot)
      self.assertEqual(self.evaluate(opt.loss_scale()), 32768)
      self.assertEqual(self.evaluate(opt.loss_scale._num_good_steps), 1)

    model.load_weights(ckpt_path)
    model.compile(opt, 'mse', run_eagerly=testing_utils.should_run_eagerly())
    model(np.zeros((2, 2)))  # Create model weights
    opt._create_all_weights(model.weights)
    assert_checkpoint_values()

    # Check restoring works even after the model is compiled and the weights
    # have been created.
    model.fit(np.random.normal(size=(2, 2)), np.random.normal(size=(2, 2)))
    self.assertNotAllClose(kernel_value(), expected_kernel)
    self.assertNotAllClose(momentum_slot_value(), expected_slot)
    model.load_weights(ckpt_path)
    assert_checkpoint_values()
 def testGetScaledLoss(self):
     """Checks get_scaled_loss multiplies the loss by the fixed loss scale."""
     sgd = gradient_descent.SGD(2.0)
     wrapped_opt = loss_scale_optimizer.LossScaleOptimizer(
         sgd, loss_scale=2.)
     scaled_loss = self.evaluate(wrapped_opt.get_scaled_loss(5.))
     self.assertEqual(10., scaled_loss)
Exemple #16
0
    def testCheckpoint(self, strategy_fn, save_with_ls, restore_with_ls):
        """Saves and restores a checkpoint with or without a LossScaleOptimizer.

        Args:
            strategy_fn: Callable returning the distribution strategy to use.
            save_with_ls: If true, the optimizer that writes the checkpoint is
                wrapped in a LossScaleOptimizer.
            restore_with_ls: If true, the optimizer that reads the checkpoint
                is wrapped in a LossScaleOptimizer.
        """
        class MySGD(gradient_descent.SGD):
            """A custom optimizer that tracks an extra variable."""
            def __init__(self, *args, **kwargs):
                super(MySGD, self).__init__(*args, **kwargs)
                self.my_var = variables.Variable(0.)
                self._track_trackable(self.my_var, 'my_var')

        strategy = strategy_fn()
        replicas = strategy.num_replicas_in_sync
        if (isinstance(strategy, mirrored_strategy.MirroredStrategy)
                and not context.executing_eagerly()):
            # TODO(b/121381184): Enable running the test in this case.
            return

        with self.test_session(), strategy.scope():
            # Build and run a simple model.
            var = variables.Variable([2.0])
            # `inner_opt` keeps a handle on the unwrapped optimizer so that
            # `my_var` can be reached even when `opt` is rebound to the wrapper.
            opt = inner_opt = MySGD(1., momentum=1.)
            if save_with_ls:
                loss_scale = loss_scale_module.DynamicLossScale(
                    initial_loss_scale=1., increment_period=2., multiplier=2.)
                opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
            run_fn = lambda: opt.minimize(lambda: var / replicas + 1.,
                                          var_list=[var])
            opt_op = strategy.experimental_run(run_fn)
            self.evaluate(variables.global_variables_initializer())
            self.evaluate(strategy.experimental_local_results(opt_op))

            # Assert values.
            self.assertEqual(self.evaluate(var), 1.)
            if save_with_ls:
                self.assertEqual(self.evaluate(loss_scale()), 1.)
                self.assertEqual(self.evaluate(loss_scale._num_good_steps), 1)
            slot_var = opt.get_slot(var, 'momentum')
            self.assertEqual(self.evaluate(slot_var).item(), -1)
            self.assertEqual(self.evaluate(opt.iterations), 1)

            # Set optimizer variable to check arbitrary optimizer attributes can be
            # saved/restored
            self.evaluate(inner_opt.my_var.assign(1.))

            # Save a checkpoint.
            checkpoint = trackable_utils.Checkpoint(optimizer=opt, var=var)
            prefix = os.path.join(self.get_temp_dir(), 'ckpt')
            save_path = checkpoint.save(prefix)

            # Create new model
            var = variables.Variable([2.0])
            opt = inner_opt = MySGD(1., momentum=1.)
            if restore_with_ls:
                loss_scale = loss_scale_module.DynamicLossScale(
                    initial_loss_scale=1., increment_period=2., multiplier=2.)
                opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)

            # Restore new model.
            checkpoint = trackable_utils.Checkpoint(optimizer=opt, var=var)
            status = checkpoint.restore(save_path)
            # The status assertion used depends on whether the checkpoint was
            # written with a LossScaleOptimizer.
            if save_with_ls:
                status.assert_existing_objects_matched()
            else:
                status.assert_nontrivial_match()

            # Assert restored values. We can only assert in eager mode since the
            # variables are uninitialized in graph mode
            if context.executing_eagerly():
                self.assertEqual(self.evaluate(var), 1.)
                if save_with_ls and restore_with_ls:
                    self.assertEqual(self.evaluate(loss_scale()), 1.)
                    self.assertEqual(self.evaluate(loss_scale._num_good_steps),
                                     1)
                elif restore_with_ls:
                    # The checkpoint was saved without a loss scale, so the
                    # fresh loss scale is expected to have zero good steps.
                    self.assertEqual(self.evaluate(loss_scale()), 1.)
                    self.assertEqual(self.evaluate(loss_scale._num_good_steps),
                                     0)
                self.assertEqual(self.evaluate(opt.iterations), 1)

            # Run the model again.
            run_fn = lambda: opt.minimize(lambda: var / replicas + 1.,
                                          var_list=[var])
            opt_op = strategy.experimental_run(run_fn)

            # Assert new values.
            self.evaluate(variables.global_variables_initializer())
            status.run_restore_ops()
            self.evaluate(strategy.experimental_local_results(opt_op))
            self.assertEqual(self.evaluate(var), -1)
            slot_var = opt.get_slot(var, 'momentum')
            self.assertEqual(self.evaluate(slot_var).item(), -2)
            self.assertEqual(self.evaluate(opt.iterations), 2)
            self.assertEqual(self.evaluate(inner_opt.my_var), 1)

            # Restore model again to test restoring after slots are created
            status = checkpoint.restore(save_path)
            if save_with_ls and restore_with_ls:
                status.assert_consumed()
            elif save_with_ls:
                status.assert_existing_objects_matched()
            elif restore_with_ls:
                status.assert_nontrivial_match()
            status.run_restore_ops()
            self.assertEqual(self.evaluate(var), 1)
            self.assertEqual(self.evaluate(slot_var).item(), -1)
Exemple #17
0
 def testPassingNoneToLossScale(self):
     """Checks that a None loss scale is rejected with a ValueError."""
     sgd = gradient_descent.SGD()
     expected_message = r'loss_scale cannot be None'
     with self.assertRaisesRegex(ValueError, expected_message):
         loss_scale_optimizer.LossScaleOptimizer(sgd, None)
Exemple #18
0
 def testIterations(self):
     """Checks `iterations` set on the wrapper is seen on both optimizers."""
     inner = gradient_descent.SGD(2.0)
     wrapper = loss_scale_optimizer.LossScaleOptimizer(inner, loss_scale=10.)
     wrapper.iterations = 7
     # The wrapper is checked first, then the wrapped optimizer.
     for checked_opt in (wrapper, inner):
         self.assertEqual(checked_opt.iterations, 7)
Exemple #19
0
    def test_dynamic_loss_scaling(self, strategy_fn, cloning=True):
        """Trains a float16 model under a DynamicLossScale across several fits.

        The test checks `layer.v` after each `fit` call, covering finite
        gradients, a step with NaN gradients, and a finite step afterwards.

        Args:
            strategy_fn: Callable returning the distribution strategy to use.
            cloning: Forwarded to `model.compile`.
        """
        if testing_utils.should_run_distributed():
            self.skipTest('b/137397816')
        if not self._is_strategy_supported(strategy_fn):
            return
        strategy = strategy_fn()
        initial_loss_scale = 2.
        batch_size = 4
        # Held in a backend variable so the expected value can be changed
        # between `fit` calls below.
        expected_gradient = backend.variable([initial_loss_scale / batch_size],
                                             dtype=dtypes.float16)
        # If this variable is set to True, the model below will have NaN gradients
        have_nan_gradients = backend.variable(False, dtype=dtypes.bool)
        with strategy.scope():
            with policy.policy_scope(policy.Policy('infer_float32_vars')):
                x = layers.Input(shape=(1, ),
                                 batch_size=batch_size,
                                 dtype=dtypes.float16)
                layer = AddLayer(assert_type=dtypes.float16)
                y = layer(x)
                identity_with_nan_grads = (
                    mp_test_util.create_identity_with_nan_gradients_fn(
                        have_nan_gradients))
                y = core.Lambda(identity_with_nan_grads)(y)
                identity_with_grad_check_fn = (
                    mp_test_util.create_identity_with_grad_check_fn(
                        expected_dtype=dtypes.float16,
                        expected_gradient=expected_gradient))
                y = core.Lambda(identity_with_grad_check_fn)(y)
                y = math_ops.cast(y, dtypes.float32)
                model = models.Model(inputs=x, outputs=y)

                # The labels are ignored; the loss is the mean prediction.
                def loss_fn(y_true, y_pred):
                    del y_true
                    return math_ops.reduce_mean(y_pred)

                opt = gradient_descent.SGD(1.)
                loss_scale = loss_scale_module.DynamicLossScale(
                    initial_loss_scale=initial_loss_scale, increment_period=2)
                opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
                model.compile(
                    opt,
                    loss=loss_fn,
                    cloning=cloning,
                    run_eagerly=testing_utils.should_run_eagerly(),
                    run_distributed=testing_utils.should_run_distributed())

        self.assertEqual(backend.eval(layer.v), 1)
        x = np.ones((batch_size, 1))
        y = np.ones((batch_size, 1))
        dataset = dataset_ops.Dataset.from_tensor_slices(
            (x, y)).batch(batch_size)
        model.fit(dataset)
        # The variables starts with 1 and has a gradient of 1, so will go down by 1
        # each step.
        self.assertEqual(backend.eval(layer.v), 0)

        model.fit(dataset)
        self.assertEqual(backend.eval(layer.v), -1)

        # There have been two steps without NaNs, so the loss scale will double
        backend.set_value(expected_gradient,
                          backend.get_value(expected_gradient * 2))
        model.fit(dataset)
        self.assertEqual(backend.eval(layer.v), -2)

        # Next test with NaN gradients.
        backend.set_value(have_nan_gradients, True)
        model.fit(dataset)
        # Variable should not be updated
        self.assertEqual(backend.eval(layer.v), -2)

        # Test with finite gradients again
        backend.set_value(have_nan_gradients, False)
        # The loss scale will be halved due to the NaNs, so the gradient will also
        # be halved
        backend.set_value(expected_gradient,
                          backend.get_value(expected_gradient / 2))
        model.fit(dataset)
        self.assertEqual(backend.eval(layer.v), -3)
Exemple #20
0
    def test_dynamic_loss_scaling(self,
                                  strategy_fn,
                                  pass_loss_scale_to_policy=False,
                                  get_config=False):
        """Fits a float16 model with a dynamic loss scale, checking every step.

        The DynamicLossScale is handed either to the mixed precision Policy or
        to a LossScaleOptimizer wrapping the SGD optimizer. The gradient
        flowing through the model is compared against `expected_gradient`, and
        the MultiplyLayer variable is checked after each call to `fit`.

        Args:
          strategy_fn: Called with no arguments to obtain the strategy whose
            scope the optimizer and model are created in.
          pass_loss_scale_to_policy: If True, the loss scale is passed to the
            Policy. Otherwise the Policy gets no loss scale and the optimizer
            is wrapped in a LossScaleOptimizer.
          get_config: If True, the model is recreated from its own config
            before it is compiled.
        """
        strategy = strategy_fn()
        batch_size = 4
        initial_loss_scale = 2.
        loss_scale = loss_scale_module.DynamicLossScale(
            initial_loss_scale=initial_loss_scale, increment_period=2)
        expected_gradient = backend.variable([initial_loss_scale / batch_size],
                                             dtype=dtypes.float16)
        # Setting this variable to True makes the model below produce NaN
        # gradients.
        have_nan_gradients = backend.variable(False, dtype=dtypes.bool)

        with strategy.scope():
            opt = gradient_descent.SGD(1.)
            policy_loss_scale = (loss_scale
                                 if pass_loss_scale_to_policy else None)
            mp_policy = policy.Policy('mixed_float16',
                                      loss_scale=policy_loss_scale)
            if not pass_loss_scale_to_policy:
                # The policy carries no loss scale, so the optimizer must.
                opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
            with policy.policy_scope(mp_policy):
                inputs = layers.Input(shape=(1, ),
                                      batch_size=batch_size,
                                      dtype=dtypes.float16)
                layer = mp_test_util.MultiplyLayer(assert_type=dtypes.float16)
                outputs = layer(inputs)
                nan_grads_fn = (
                    mp_test_util.create_identity_with_nan_gradients_fn(
                        have_nan_gradients))
                outputs = core.Lambda(nan_grads_fn)(outputs)
                grad_check_fn = (
                    mp_test_util.create_identity_with_grad_check_fn(
                        expected_dtype=dtypes.float16,
                        expected_gradient=expected_gradient))
                outputs = core.Lambda(grad_check_fn)(outputs)
                model = models.Model(inputs=inputs, outputs=outputs)
                if get_config:
                    model_config = model.get_config()
                    model = model.__class__.from_config(
                        model_config,
                        custom_objects={
                            'MultiplyLayer': mp_test_util.MultiplyLayer
                        })
                    # The rebuilt model has new layer objects; unpacking
                    # requires exactly one MultiplyLayer to be present.
                    multiply_layers = [
                        candidate for candidate in model.layers
                        if isinstance(candidate, mp_test_util.MultiplyLayer)
                    ]
                    (layer, ) = multiply_layers

                def loss_fn(y_true, y_pred):
                    del y_true
                    return math_ops.reduce_mean(y_pred)

                model.compile(opt,
                              loss=loss_fn,
                              run_eagerly=testing_utils.should_run_eagerly())

        self.assertEqual(backend.eval(layer.v), 1)
        features = np.ones((batch_size, 1))
        labels = np.ones((batch_size, 1))
        dataset = dataset_ops.Dataset.from_tensor_slices(
            (features, labels)).batch(batch_size)

        def fit_and_check(expected_value):
            # Runs one `fit` over the dataset, then checks the layer variable.
            model.fit(dataset)
            self.assertEqual(backend.eval(layer.v), expected_value)

        # The variable starts at 1 and has a gradient of 1, so it goes down by
        # 1 on each step.
        fit_and_check(0)
        fit_and_check(-1)

        # There have been two steps without NaNs, so the loss scale will
        # double.
        doubled_gradient = backend.get_value(expected_gradient * 2)
        backend.set_value(expected_gradient, doubled_gradient)
        fit_and_check(-2)

        # Next test with NaN gradients: the variable should not be updated.
        backend.set_value(have_nan_gradients, True)
        fit_and_check(-2)

        # Test with finite gradients again. The loss scale will be halved due
        # to the NaNs, so the gradient will also be halved.
        backend.set_value(have_nan_gradients, False)
        halved_gradient = backend.get_value(expected_gradient / 2)
        backend.set_value(expected_gradient, halved_gradient)
        fit_and_check(-3)
Exemple #21
0
    def test_advanced_model(self, strategy_fn, use_loss_scaling=False):
        """Trains a four-layer model that mixes the mixed precision features.

        The advanced model tests mixed-precision-related features that would
        occur in a resnet50 model. It tests a model that has:
          * Multiple layers, some which use auto-cast variables and some which
            do not.
          * Regularization on some variables and not others.
          * A fixed loss scale (if use_loss_scaling is True).

        Args:
          strategy_fn: Called with no arguments to obtain the strategy whose
            scope the model is built in.
          use_loss_scaling: If True, the optimizer is wrapped in a
            LossScaleOptimizer with a loss scale of 8 and the gradient is
            checked inside the model.
        """
        if testing_utils.should_run_distributed():
            self.skipTest('b/137397816')
        if not self._is_strategy_supported(strategy_fn):
            return

        strategy = strategy_fn()
        # Only read when use_loss_scaling is True.
        loss_scale = 8. if use_loss_scaling else None
        learning_rate = 2**-14

        with strategy.scope(), policy.policy_scope(
                policy.Policy('infer_float32_vars')):
            x = layers.Input(shape=(1, ), batch_size=2, dtype=dtypes.float16)
            add_layers = (
                AddLayer(assert_type=dtypes.float16,
                         regularizer=IdentityRegularizer(),
                         use_operator=True),
                AddLayerWithoutAutoCast(assert_type=dtypes.float16,
                                        use_operator=True),
                AddLayer(assert_type=dtypes.float16, use_operator=False),
                AddLayerWithoutAutoCast(assert_type=dtypes.float16,
                                        regularizer=IdentityRegularizer(),
                                        use_operator=False),
            )
            y = x
            for add_layer in add_layers:
                y = add_layer(y)
            if use_loss_scaling:
                # The gradient of 'y' at this point is 1. With loss scaling,
                # the gradient is 'loss_scale'. We divide by the batch size of
                # 2 since the loss is averaged across batch elements.
                grad_check_fn = (
                    mp_test_util.create_identity_with_grad_check_fn(
                        expected_dtype=dtypes.float16,
                        expected_gradient=[loss_scale / 2]))
                y = core.Lambda(grad_check_fn)(y)
            y = math_ops.cast(y, dtypes.float32)
            model = models.Model(inputs=x, outputs=y)

            def loss_fn(y_true, y_pred):
                self.assertEqual(y_true.dtype, dtypes.float32)
                self.assertEqual(y_pred.dtype, dtypes.float32)
                return math_ops.reduce_mean(y_pred)

            opt = gradient_descent.SGD(learning_rate)
            if use_loss_scaling:
                opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
            model.compile(
                opt,
                loss=loss_fn,
                run_eagerly=testing_utils.should_run_eagerly(),
                run_distributed=testing_utils.should_run_distributed())

        features = np.ones((2, 1))
        labels = np.ones((2, 1))
        dataset = dataset_ops.Dataset.from_tensor_slices(
            (features, labels)).batch(2)
        model.fit(dataset)
        for add_layer in add_layers:
            # A layer with a weight regularizer (non-empty `losses`) is
            # expected to drop by twice the learning rate; the others by once.
            num_updates = 2 if add_layer.losses else 1
            self.assertEqual(backend.eval(add_layer.v),
                             1 - num_updates * learning_rate)
 def testDynamicMustBeBool(self):
   """Checks that passing the string 'dynamic' as `dynamic` is a TypeError."""
   inner_opt = gradient_descent.SGD()
   expected_error = (
       '"dynamic" argument to LossScaleOptimizer.__init__ must be '
       "a bool, but got: 'dynamic'")
   with self.assertRaisesRegex(TypeError, expected_error):
     loss_scale_optimizer.LossScaleOptimizer(inner_opt, 'dynamic')
Exemple #23
0
    def testGetConfigDynamic(self, get_config, from_config):
        """Round-trips a dynamic loss scale optimizer config across versions.

        A config is obtained from a LossScaleOptimizerV1, a LossScaleOptimizer,
        or a hand-written dict in the format of the LossScaleOptimizer from
        TF 2.3. It is then restored into a LossScaleOptimizerV1 or a
        LossScaleOptimizer, whose attributes are checked before it is used for
        one step.

        Args:
          get_config: 'v1', 'v2' or 'tf2_3'; selects where the config comes
            from.
          from_config: 'v1' or 'v2'; selects the class that restores the
            config.
        """
        opt = gradient_descent.SGD(2., momentum=0.5)
        if get_config == 'v2':
            opt = loss_scale_optimizer.LossScaleOptimizer(
                opt, initial_scale=2, dynamic_growth_steps=3)
            config = opt.get_config()
        elif get_config == 'v1':
            loss_scale = tf_loss_scale_module.DynamicLossScale(
                initial_loss_scale=2, increment_period=3)
            opt = loss_scale_optimizer.LossScaleOptimizerV1(opt, loss_scale)
            config = opt.get_config()
        else:
            self.assertEqual(get_config, 'tf2_3')
            sgd_config = {
                'learning_rate': 2.0,
                'momentum': 0.5,
                'decay': 0.0,
                'nesterov': False,
                'name': 'SGD',
            }
            dynamic_loss_scale_config = {
                'initial_loss_scale': 2.0,
                'increment_period': 3,
                'multiplier': 2.0,
            }
            config = {
                'optimizer': {
                    'class_name': 'SGD',
                    'config': sgd_config,
                },
                'loss_scale': {
                    'class_name': 'DynamicLossScale',
                    'config': dynamic_loss_scale_config,
                },
            }

        # Pick the class that restores the config.
        if from_config == 'v1':
            restoring_cls = loss_scale_optimizer.LossScaleOptimizerV1
        else:
            self.assertEqual(from_config, 'v2')
            restoring_cls = loss_scale_optimizer.LossScaleOptimizer
        opt = restoring_cls.from_config(config)

        # Force hyperparameters to be created
        opt.lr  # pylint: disable=pointless-statement
        self.evaluate(variables.global_variables_initializer())

        # Test attributes on the optimizer
        outer_lr = self.evaluate(opt.lr)
        self.assertEqual(outer_lr, 2.)
        inner_lr = self.evaluate(opt._optimizer.lr)
        self.assertEqual(inner_lr, 2.)
        momentum = self.evaluate(opt.momentum)
        self.assertEqual(momentum, 0.5)
        current_scale = self.evaluate(opt.loss_scale)
        self.assertEqual(current_scale, 2.)
        self.assertEqual(opt.initial_scale, 2.)
        self.assertEqual(opt.dynamic_growth_steps, 3.)
        self.assertTrue(opt.dynamic)

        # Ensure the optimizer can be used
        var = variables.Variable([5.0])
        strategy = distribution_strategy_context.get_strategy()
        run_fn = self._run_fn_with_grad_check(strategy, var, opt, 2)
        run_op = run_fn()
        self.evaluate(variables.global_variables_initializer())
        self._run_if_in_graph_mode(run_op)
        self.assertEqual(self.evaluate(var), [3.])
        self.assertEqual(self.evaluate(opt.dynamic_counter), 1)
Exemple #24
0
    def _benchmark(self, gradient_type, num_gpus, mode, loss_scaling):
        """Benchmarks loss scaling.

        We run a simple model with several scalar variables. The loss is the
        sum of all variables. The model is simple because we want to measure
        only the performance of loss scaling, not the performance of the model
        itself.

        The measured per-iteration wall time is reported through
        `self.report_benchmark` under a name built from the four arguments.

        Args:
          gradient_type: "optimizer" or "gradient_tape". How gradients are
            computed. "optimizer" uses Optimizer.minimize. "gradient_tape"
            uses GradientTape.gradient along with
            LossScaleOptimizer.get_scaled_loss and
            LossScaleOptimizer.get_unscaled_gradients.
          num_gpus: The number of GPUs to use. Must be at least 1.
          mode: "eager" or "tf_function". "tf_function" causes all
            computations to be wrapped in a tf.function, while "eager" runs
            computations eagerly.
          loss_scaling: "fixed", "dynamic", or None. The type of loss scaling
            to use. None means use no loss scaling, which is useful as a
            baseline to see how much slower loss scaling is in comparison.
        """
        ls_str = loss_scaling or 'no_loss_scaling'
        name = '%s_%d_GPU_%s_%s' % (gradient_type, num_gpus, mode, ls_str)
        with context.eager_mode(), _get_strategy(num_gpus).scope() as strategy:
            opt = adam.Adam()
            if loss_scaling == 'fixed':
                loss_scale = loss_scale_module.FixedLossScale(2.)
            elif loss_scaling == 'dynamic':
                # Make increment_period so high that it's effectively infinite.
                # This means the loss scale will never change. Any performance
                # overhead from increasing/decreasing the loss scale is
                # typically negligible since it happens infrequently, so we
                # only benchmark the common case of the loss scale not
                # changing.
                increment_period = 1000000
                loss_scale = loss_scale_module.DynamicLossScale(
                    initial_loss_scale=2., increment_period=increment_period)
            else:
                assert loss_scaling is None
                loss_scale = None
            # Compare against None explicitly (as the gradient_tape branch
            # below does) rather than relying on the object's truthiness.
            if loss_scale is not None:
                opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)

            num_vars = 200
            num_warmup_iters = 1
            num_iters = 20
            # By using scalar variables, we reduce overhead of the actual GPU
            # work of multiplying variables, dividing gradients, and checking
            # gradients for NaNs. Measuring these overheads isn't very useful
            # as there is little we can do to reduce them (one such way would
            # be to fuse dividing gradients and checking them for NaNs). We
            # still have all other overheads, such as all-reducing the
            # `is_finite` values and having a tf.cond or tf.while_loop based on
            # whether gradients are NaNs. Currently, these other overheads are
            # much more significant than the GPU work.
            var_list = [
                variables.Variable(i, dtype='float32') for i in range(num_vars)
            ]

            def get_loss():
                return math_ops.add_n(var_list)

            if gradient_type == 'gradient_tape':
                if loss_scale is None:

                    def minimize_fn():
                        with backprop.GradientTape() as tape:
                            loss = get_loss()
                        grads = tape.gradient(loss, var_list)
                        return opt.apply_gradients(zip(grads, var_list))
                else:

                    def minimize_fn():
                        with backprop.GradientTape() as tape:
                            loss = get_loss()
                            scaled_loss = opt.get_scaled_loss(loss)
                        scaled_grads = tape.gradient(scaled_loss, var_list)
                        grads = opt.get_unscaled_gradients(scaled_grads)
                        return opt.apply_gradients(zip(grads, var_list))
            else:
                assert gradient_type == 'optimizer'

                def minimize_fn():
                    return opt.minimize(get_loss, var_list)

            def run_fn():
                strategy.run(minimize_fn)

            if mode == 'tf_function':
                run_fn = def_function.function(run_fn)

            # Warm-up iterations are run before timing starts and are not
            # included in the reported wall time.
            for _ in range(num_warmup_iters):
                run_fn()

            # perf_counter is monotonic and high-resolution, so the measured
            # interval is not distorted by system clock adjustments.
            start = time.perf_counter()
            for _ in range(num_iters):
                run_fn()
            end = time.perf_counter()
            self.report_benchmark(iters=num_iters,
                                  wall_time=(end - start) / num_iters,
                                  name=name)
 def testGetUnscaledGradients(self):
     """Checks get_unscaled_gradients with a loss scale of 2 and a None entry."""
     inner_opt = gradient_descent.SGD(2.0)
     opt = loss_scale_optimizer.LossScaleOptimizer(inner_opt, loss_scale=2)
     unscaled = []
     for grad in opt.get_unscaled_gradients([3., None, -4.]):
         # None entries are passed through as-is; everything else is
         # evaluated to a concrete value.
         unscaled.append(grad if grad is None else self.evaluate(grad))
     self.assertEqual([1.5, None, -2.], unscaled)