Ejemplo n.º 1
0
 def testInvalidArgsWithFixedLossScale(self):
     """Fixed loss scaling rejects missing initial_scale and growth steps."""
     sgd = gradient_descent.SGD()
     # Omitting initial_scale while dynamic=False is an error.
     expected = '"initial_scale" must be specified if "dynamic" is False'
     with self.assertRaisesRegex(ValueError, expected):
         loss_scale_optimizer.LossScaleOptimizer(sgd, dynamic=False)
     # Supplying dynamic_growth_steps with dynamic=False is also an error.
     expected = ('"dynamic_growth_steps" must be None if "dynamic" is '
                 'False, but got: 2')
     with self.assertRaisesRegex(ValueError, expected):
         loss_scale_optimizer.LossScaleOptimizer(
             sgd, dynamic=False, initial_scale=1, dynamic_growth_steps=2)
Ejemplo n.º 2
0
 def testDynamicMustBeBool(self):
     """A non-bool value for the `dynamic` argument raises TypeError."""
     inner = gradient_descent.SGD()
     message = ('"dynamic" argument to LossScaleOptimizer.__init__ must be '
                "a bool, but got: 'dynamic'")
     with self.assertRaisesRegex(TypeError, message):
         loss_scale_optimizer.LossScaleOptimizer(inner, 'dynamic')
Ejemplo n.º 3
0
    def testSerializationWithBuiltInOptimizer(self, use_v1):
        """Round-trips a LossScaleOptimizer through serialize/deserialize.

        Verifies hyperparameters, loss-scale state, and the optimizer type
        survive the round trip, and that the deserialized optimizer can
        still apply a training step.
        """
        opt = gradient_descent.SGD(2., momentum=0.5)
        if use_v1:
            loss_scale = tf.mixed_precision.experimental.DynamicLossScale(
                initial_loss_scale=2., increment_period=3.)
            opt = loss_scale_optimizer.LossScaleOptimizerV1(opt, loss_scale)
        else:
            opt = loss_scale_optimizer.LossScaleOptimizer(
                opt, initial_scale=2., dynamic_growth_steps=3.)
        config = optimizers.serialize(opt)
        opt = optimizers.deserialize(config)
        # Force hyperparameters to be created
        opt.lr  # pylint: disable=pointless-statement
        self.evaluate(tf.compat.v1.global_variables_initializer())

        self.assertEqual(self.evaluate(opt.lr), 2.)
        self.assertEqual(self.evaluate(opt.inner_optimizer.momentum), 0.5)
        self.assertEqual(self.evaluate(opt.loss_scale), 2.)
        self.assertEqual(opt.dynamic_growth_steps, 3.)
        # Bug fix: the original call was `assertTrue(opt.dynamic, 4.)`,
        # which passed `4.` as assertTrue's unused `msg` argument.
        self.assertTrue(opt.dynamic)
        # Deserializing a LossScaleOptimizer always results in a V2
        # LossScaleOptimizer, even if serialized with a LossScaleOptimizerV1.
        self.assertAllEqual(type(opt), loss_scale_optimizer.LossScaleOptimizer)

        # Ensure the optimizer can be used
        var = tf.Variable([5.0])
        run_op = self._run_fn_with_grad_check(tf.distribute.get_strategy(),
                                              var, opt, 2)()
        self.evaluate(tf.compat.v1.global_variables_initializer())
        self._run_if_in_graph_mode(run_op)
        self.assertEqual(self.evaluate(var), [3.])
        self.assertEqual(self.evaluate(opt.dynamic_counter), 1)
Ejemplo n.º 4
0
 def testUnsupportedStrategy(self):
     """CentralStorageStrategy is rejected at construction and at use time."""
     strategy = tf.distribute.experimental.CentralStorageStrategy()
     expected_error = (
         'Loss scaling is not supported with the tf.distribute.Strategy: '
         'CentralStorageStrategy. Try using a different Strategy, e.g. a '
         'MirroredStrategy')
     # Creating the optimizer inside the unsupported strategy's scope fails.
     with strategy.scope(), self.assertRaisesRegex(ValueError,
                                                   expected_error):
         loss_scale_optimizer.LossScaleOptimizer(gradient_descent.SGD())
     # Creating it outside the scope works, but running a step inside the
     # scope must still raise the same error.
     opt = loss_scale_optimizer.LossScaleOptimizer(gradient_descent.SGD())
     with strategy.scope():
         var = tf.Variable(1.0)
         loss = lambda: var * 2.0
         run_fn = lambda: opt.minimize(loss, [var])
         with self.assertRaisesRegex(ValueError, expected_error):
             strategy.experimental_run(run_fn)
Ejemplo n.º 5
0
    def testDynamicLossScaleWithSlots(self, strategy_fn):
        """Dynamic loss scaling works with an optimizer that has slot variables."""
        strategy_obj = strategy_fn()
        if (isinstance(strategy_obj, tf.distribute.MirroredStrategy)
                and tf.compat.v1.control_flow_v2_enabled()
                and not tf.executing_eagerly()):
            self.skipTest('b/138667997')
        with strategy_obj.scope() as strategy:
            var = tf.Variable([1.0, 2.0])
            # An SGD optimizer with momentum has slot variables.
            opt = gradient_descent.SGD(1.0, momentum=1.)
            initial_scale = 2.
            opt = loss_scale_optimizer.LossScaleOptimizer(
                opt, initial_scale=initial_scale, dynamic_growth_steps=1)
            loss = lambda: var / strategy.num_replicas_in_sync
            run_fn = lambda: opt.minimize(loss, var_list=[var])
            run_op = strategy.experimental_run(run_fn)
            self.evaluate(tf.compat.v1.global_variables_initializer())
            self._run_if_in_graph_mode(run_op)
            # The momentum accumulator starts at 0 and the gradient is 1. The
            # accumulator is incremented by the gradient, so it is now 1. Then the
            # variable is subtracted by the accumulator, so the variable is subtracted
            # by 1.
            self.assertAllClose([0.0, 1.0], self.evaluate(var))
            # dynamic_growth_steps=1, so every finite-gradient step doubles
            # the loss scale.
            self.assertEqual(self.evaluate(opt.loss_scale), initial_scale * 2)

            run_op = strategy.experimental_run(run_fn)
            self._run_if_in_graph_mode(run_op)
            # The momentum accumulator was 1 before this step and the gradient is 1.
            # The accumulator is incremented by the gradient, so it is now 2. Then the
            # variable is subtracted by the accumulator, so the variable is subtracted
            # by 2.
            self.assertAllClose([-2., -1.], self.evaluate(var))
            self.assertEqual(self.evaluate(opt.loss_scale), initial_scale * 4)

            self.assertEqual(opt.get_slot_names(), ['momentum'])
  def testApplyGradientsGetsUnwrappedTensors(self):
    """Gradients reaching apply_gradients are raw Tensors, not PerReplica."""
    # Tests that gradients passed to apply_gradients are not wrapped in a
    # DistributionStrategy wrapper, such as PerReplica, but instead are raw
    # Tensors. Optimizer subclasses that override apply_gradients() expect raw
    # Tensors, even though the base Optimizer can handle PerReplica gradients.

    outer_self = self

    # Subclass that asserts on the type of every gradient it receives.
    class MyOptimizer(gradient_descent.SGD):

      def apply_gradients(self,
                          grads_and_vars,
                          name=None,
                          experimental_aggregate_gradients=True):
        for grad, _ in grads_and_vars:
          outer_self.assertIsInstance(grad, tf.Tensor)
        return super(MyOptimizer,
                     self).apply_gradients(grads_and_vars, name,
                                           experimental_aggregate_gradients)

    with create_mirrored_strategy().scope() as strategy:
      var = tf.Variable([5.0])
      opt = MyOptimizer(learning_rate=1.0)
      opt = loss_scale_optimizer.LossScaleOptimizer(opt, dynamic=False,
                                                    initial_scale=1)
      loss = lambda: var * 2.0
      run_fn = lambda: opt.minimize(loss, [var])
      strategy.experimental_run(run_fn)
Ejemplo n.º 7
0
 def testDynamicLossScaleDefaultValues(self):
     """Default dynamic settings are initial_scale=2**15, 2000 growth steps."""
     lso = loss_scale_optimizer.LossScaleOptimizer(gradient_descent.SGD())
     self.assertEqual(lso.initial_scale, 2**15)
     self.assertEqual(lso.dynamic_growth_steps, 2000)
     self.evaluate(tf.compat.v1.global_variables_initializer())
     # The live loss-scale variable starts at the initial scale.
     self.assertEqual(self.evaluate(lso.loss_scale), 2**15)
Ejemplo n.º 8
0
    def testNanOnOneReplicaOnly(self):
        """A NaN gradient on a single replica skips the step for all replicas."""
        if not tf.test.is_gpu_available():
            self.skipTest('Test requires GPU')
        if (not tf.executing_eagerly()
                and not tf.compat.v1.control_flow_v2_enabled()):
            self.skipTest(
                'b/181283011: GradientTape does not work properly with '
                'V1 control flow, and opt.minimize uses GradientTape')
        with create_mirrored_strategy().scope() as strategy:
            var = tf.Variable([1.0, 2.0])
            opt = gradient_descent.SGD(1.0)
            opt = loss_scale_optimizer.LossScaleOptimizer(
                opt, initial_scale=2, dynamic_growth_steps=2)

            def loss():
                rep_id = (tf.distribute.get_replica_context().
                          replica_id_in_sync_group)
                # The last element of last replica's gradient is NaN.
                return tf.compat.v1.cond(
                    tf.constant(rep_id == 0), lambda: var * 2.,
                    lambda: var * tf.constant([1., float('NaN')]))

            run_fn = lambda: opt.minimize(loss, var_list=[var])
            run_op = strategy.experimental_run(run_fn)
            self.evaluate(tf.compat.v1.global_variables_initializer())
            self._run_if_in_graph_mode(run_op)
            # Variable should not change from before, due to NaN gradients.
            self.assertAllClose(self.evaluate(var), [1.0, 2.0])
            # Loss scale should half due to NaN gradients.
            self.assertEqual(1., self.evaluate(opt.loss_scale))
Ejemplo n.º 9
0
    def testDynamicUpdate(self, strategy_fn):
        """Loss scale doubles on finite gradients and halves on NaN gradients."""
        with strategy_fn().scope() as strategy:
            var = tf.Variable([1.0, 2.0])
            opt = gradient_descent.SGD(1.0)
            opt = loss_scale_optimizer.LossScaleOptimizer(
                opt, initial_scale=2, dynamic_growth_steps=1)

            # Test optimizer with finite gradients
            loss = lambda: var * 2.0 / strategy.num_replicas_in_sync
            run_fn = lambda: opt.minimize(loss, var_list=[var])
            run_op = strategy.experimental_run(run_fn)
            self.evaluate(tf.compat.v1.global_variables_initializer())
            self._run_if_in_graph_mode(run_op)
            # Gradient is 2, so variable will have 2 subtracted from it
            self.assertAllClose([-1.0, 0.0], self.evaluate(var))
            # Loss scale has doubled from 2 to 4
            self.assertEqual(4., self.evaluate(opt.loss_scale))

            # Test optimizer with NaN gradients
            loss = lambda: var * float('NaN')
            run_fn = lambda: opt.minimize(loss, var_list=[var])
            run_op = strategy.experimental_run(run_fn)
            self._run_if_in_graph_mode(run_op)
            # Variable should not change from before, due to NaN gradients.
            self.assertAllClose(self.evaluate(var), [-1.0, 0.0])
            # Loss scale should half due to NaN gradients.
            self.assertEqual(2., self.evaluate(opt.loss_scale))
Ejemplo n.º 10
0
    def testDynamicLossScale(self, strategy_fn):
        """Gradients are unscaled correctly as the dynamic loss scale grows."""
        strategy = strategy_fn()
        learning_rate = 2.
        # Holds the gradient value the grad-check helper expects to see; it is
        # updated below once the loss scale doubles.
        expected_gradient = tf.Variable(learning_rate /
                                        strategy.num_replicas_in_sync)
        with strategy.scope():
            var = tf.Variable([5.0])
            opt = gradient_descent.SGD(learning_rate)
            opt = loss_scale_optimizer.LossScaleOptimizer(
                opt, initial_scale=2, dynamic_growth_steps=1)
            self.assertEqual(opt.initial_scale, 2.)
            self.assertIsInstance(opt.initial_scale, float)
            self.assertEqual(opt.dynamic_growth_steps, 1)
            self.assertIsInstance(opt.dynamic_growth_steps, int)

            self.assertEqual(opt.initial_scale % strategy.num_replicas_in_sync,
                             0)
            run_fn = self._run_fn_with_grad_check(strategy, var, opt,
                                                  expected_gradient)
            run_op = strategy.experimental_run(run_fn)
            self.evaluate(tf.compat.v1.global_variables_initializer())
            self._run_if_in_graph_mode(run_op)
            # The loss is the identity of the variable. Therefore the gradient is 1,
            # and so the variable will be init_val - grad * lr == 5 - 1 * 2 == 3
            self.assertAllClose([3.], self.evaluate(var))

            # Loss scale will be double, so the expected gradient is also doubled.
            self.evaluate(
                expected_gradient.assign(2 * learning_rate /
                                         strategy.num_replicas_in_sync))
            run_op = strategy.experimental_run(run_fn)
            self._run_if_in_graph_mode(run_op)
            # As before, the 2 is subtracted from the variable, making it's new value
            # 1.
            self.assertAllClose([1.], self.evaluate(var))
Ejemplo n.º 11
0
  def test_save_slot_variables_with_autocast_vars(self,
                                                  strategy_fn,
                                                  var_name='v'):
    """Slot variables of AutoCastVariables save and restore correctly."""
    p = policy.Policy('mixed_float16')
    with strategy_fn().scope(), policy.policy_scope(p):
      x = layers.Input(shape=(2,), batch_size=2)
      # Having a var_name other than 'v' tests that a fixed bug (b/134713714)
      # does not reoccur. The bug was that a crash would occur when saving a
      # checkpoint where an AutoCastVariable with a slot variable would have a
      # different name than the layer attribute's name (layer.v in this case).
      layer = mp_test_util.MultiplyLayer(assert_type=tf.float16,
                                         var_name=var_name)
      y = layer(x)
      model = models.Model(inputs=x, outputs=y)
      opt = gradient_descent.SGD(1., 1.)
      opt = loss_scale_optimizer.LossScaleOptimizer(opt, dynamic=False,
                                                    initial_scale=1)
      model.compile(
          optimizer=opt,
          loss='mse',
          run_eagerly=testing_utils.should_run_eagerly())

    # Train one step, save the momentum slot, then train again so the slot
    # diverges from the saved value.
    model.fit(np.ones((2, 2)), np.zeros((2, 2)), batch_size=2)
    weights_file = os.path.join(self.get_temp_dir(), 'weights')
    model.save_weights(weights_file)
    saved_slot = backend.get_value(opt.get_slot(layer.v, 'momentum'))

    model.fit(np.ones((2, 2)), np.zeros((2, 2)), batch_size=2)
    new_slot = backend.get_value(opt.get_slot(layer.v, 'momentum'))
    self.assertNotEqual(new_slot, saved_slot)

    # Restoring brings the slot back to its saved value.
    model.load_weights(weights_file)
    restored_slot = backend.get_value(opt.get_slot(layer.v, 'momentum'))
    self.assertEqual(restored_slot, saved_slot)
 def testIterations(self):
   """Setting iterations on the wrapper also updates the inner optimizer."""
   opt = gradient_descent.SGD(2.0)
   lso = loss_scale_optimizer.LossScaleOptimizer(opt, dynamic=False,
                                                 initial_scale=10.)
   lso.iterations = 7
   self.assertEqual(lso.iterations, 7)
   # The iterations counter is shared with the wrapped optimizer.
   self.assertEqual(opt.iterations, 7)
Ejemplo n.º 13
0
    def test_save_model_with_dynamic_loss_scaling(self, strategy_fn, h5=False):
        """Model.save/load_model preserves weights; loss-scale state may reset."""
        # TODO(reedwm): Support and test saving model with a mixed_[b]float16 policy
        # as well.
        strategy = strategy_fn()
        if (isinstance(strategy, tf.distribute.MirroredStrategy)
                and not tf.executing_eagerly()):
            # TODO(b/121381184): Enable running the test in this case.
            return

        # Create and run model.
        with strategy.scope():
            x = layers.Input(shape=(2, ), batch_size=2, dtype=tf.float32)
            y = mp_test_util.MultiplyLayer()(x)
            model = models.Model(inputs=x, outputs=y)

            opt = gradient_descent.SGD(1.)
            opt = loss_scale_optimizer.LossScaleOptimizer(
                opt, initial_scale=1., dynamic_growth_steps=2.)
            model.compile(optimizer=opt,
                          loss='mse',
                          run_eagerly=test_utils.should_run_eagerly())
        # Run for 3 steps (6 examples with a batch size of 2)
        model.fit(np.ones((6, 2)), np.zeros((6, 2)), batch_size=2)
        self.assertEqual(backend.get_value(opt.loss_scale), 2)
        self.assertEqual(backend.get_value(opt.dynamic_counter), 1)
        (weight, ) = model.trainable_weights
        orig_weight = backend.get_value(weight)

        # Save model weights.
        save_path = os.path.join(self.get_temp_dir(), 'model')
        model.save(save_path, save_format='h5' if h5 else 'tf')

        # Run model again for 1 step (2 examples with a batch size of 2)
        model.fit(np.ones((2, 2)), np.zeros((2, 2)), batch_size=2)
        new_weight = backend.get_value(weight)
        self.assertNotEqual(new_weight, orig_weight)
        self.assertEqual(backend.get_value(opt.loss_scale), 4)
        self.assertEqual(backend.get_value(opt.dynamic_counter), 0)

        # Load model weights and ensure loss scale weights are restored.
        model = save.load_model(
            save_path,
            custom_objects={'MultiplyLayer': mp_test_util.MultiplyLayer})
        (weight, ) = model.trainable_weights
        loaded_weight = backend.get_value(weight)
        self.assertEqual(loaded_weight, orig_weight)
        # Currently the loss scale isn't always saved when the model is saved with
        # Model.save(). So we assert the loss scale either has the value when it was
        # saved, or the value it was initialized with.
        # TODO(reedwm): Always save/restore the loss scale with Model.save().
        self.assertIn(backend.get_value(model.optimizer.loss_scale), (1, 2))
        self.assertIn(backend.get_value(model.optimizer.dynamic_counter),
                      (0, 1))

        # Test optimizer attributes and type
        self.assertEqual(model.optimizer.initial_scale, 1.)
        self.assertEqual(model.optimizer.dynamic_growth_steps, 2.)
        self.assertEqual(type(model.optimizer),
                         loss_scale_optimizer.LossScaleOptimizer)
Ejemplo n.º 14
0
 def testDynamicAttrsWithFixedLossScale(self):
     """Dynamic-related attributes are False/None for a fixed loss scale."""
     lso = loss_scale_optimizer.LossScaleOptimizer(
         gradient_descent.SGD(), dynamic=False, initial_scale=2.)
     self.assertFalse(lso.dynamic)
     # Without dynamic scaling there is no counter and no growth period.
     self.assertIsNone(lso.dynamic_counter)
     self.assertIsNone(lso.dynamic_growth_steps)
  def testGetConfigFixed(self, config_version):
    """Fixed-scale configs from several versions restore to a working optimizer."""
    # Get a config from LossScaleOptimizer, LossScaleOptimizerV3, or the
    # LossScaleOptimizer from TF 2.3. Then restore the config into a
    # LossScaleOptimizer or LossScaleOptimizerV3
    if config_version == 'v2':
      opt = gradient_descent.SGD(2., momentum=0.5)
      opt = loss_scale_optimizer.LossScaleOptimizer(
          opt, dynamic=False, initial_scale=2)
      config = opt.get_config()
      opt = loss_scale_optimizer.LossScaleOptimizer.from_config(config)
    elif config_version == 'v3':
      opt = sgd_experimental.SGD(2., momentum=0.5)
      opt = loss_scale_optimizer.LossScaleOptimizerV3(
          opt, dynamic=False, initial_scale=2)
      config = opt.get_config()
      opt = loss_scale_optimizer.LossScaleOptimizerV3.from_config(config)
    else:
      self.assertEqual(config_version, 'tf2_3')
      # Hand-written dict replicating the TF 2.3 serialization format.
      config = {
          'optimizer': {
              'class_name': 'SGD',
              'config': {
                  'learning_rate': 2.0,
                  'momentum': 0.5,
                  'decay': 0.0,
                  'nesterov': False,
                  'name': 'SGD',
              }
          },
          'loss_scale': {
              'class_name': 'FixedLossScale',
              'config': {'loss_scale_value': 2.0}
          },
      }
      opt = loss_scale_optimizer.LossScaleOptimizer.from_config(config)

    # Force hyperparameters to be created
    opt.learning_rate  # pylint: disable=pointless-statement
    self.evaluate(tf.compat.v1.global_variables_initializer())

    # Test attributes on the optimizer
    self.assertEqual(self.evaluate(opt.learning_rate), 2.)
    self.assertEqual(self.evaluate(opt.inner_optimizer.learning_rate), 2.)
    self.assertEqual(self._eval_if_tensor(opt.inner_optimizer.momentum), 0.5)
    self.assertEqual(self.evaluate(opt.loss_scale), 2.)
    self.assertEqual(opt.initial_scale, 2.)
    self.assertIsNone(opt.dynamic_growth_steps)
    self.assertIsNone(opt.dynamic_counter)
    self.assertFalse(opt.dynamic)

    # Ensure the optimizer can be used
    var = tf.Variable([5.0])
    run_op = self._run_fn_with_grad_check(
        tf.distribute.get_strategy(), var, opt, 2)()
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self._run_if_in_graph_mode(run_op)
    self.assertEqual(self.evaluate(var), [3.])
 def testDir(self):
   """dir() lists hyperparameters and own attrs but not inner-opt attrs."""
   wrapper = loss_scale_optimizer.LossScaleOptimizer(gradient_descent.SGD())
   listing = dir(wrapper)
   # Hyperparameters delegated from the wrapped optimizer are listed.
   self.assertIn('learning_rate', listing)
   self.assertIn('lr', listing)
   # The wrapper's own attributes are listed.
   self.assertIn('minimize', listing)
   self.assertIn('loss_scale', listing)
   # Plain attributes of the inner optimizer are not exposed on the wrapper.
   self.assertNotIn('nesterov', listing)
   self.assertIn('nesterov', dir(wrapper.inner_optimizer))
 def test_optimizer_errors(self):
     """The graph-rewrite API rejects an already-wrapped LossScaleOptimizer."""
     opt = gradient_descent_v2.SGD(1.0)
     opt = loss_scale_optimizer_v2.LossScaleOptimizer(opt)
     with self.assertRaisesRegex(
             ValueError, '"opt" must not already be an instance of a '
             'LossScaleOptimizer.'):
         tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite(
             opt)
     # The failed call must not have enabled the graph rewrite.
     self.assertFalse(tf.config.optimizer.get_experimental_options().get(
         'auto_mixed_precision', False))
Ejemplo n.º 18
0
 def test_loss_scale_optimizer_overrides_policy_v1_loss_scale(self):
   """An explicit LossScaleOptimizer wins over the PolicyV1 loss scale."""
   with policy.policy_scope(policy.PolicyV1('float32', loss_scale=10.)):
     opt = gradient_descent.SGD(1.)
     opt = loss_scale_optimizer.LossScaleOptimizer(opt, dynamic=False,
                                                   initial_scale=5.)
     x = layers.Input(shape=(1,))
     y = mp_test_util.MultiplyLayer()(x)
     model = models.Model(x, y)
     model.compile(opt, loss='mse')
     # The optimizer's scale (5), not the policy's (10), is in effect.
     self.assertEqual(self.evaluate(model.optimizer.loss_scale), 5.)
Ejemplo n.º 19
0
    def test_restore_old_loss_scale_checkpoint(self):
        """A TF 2.2-format LossScaleOptimizer checkpoint still loads correctly."""
        # Ensure a checkpoint from TF 2.2 can be loaded. The checkpoint format
        # of LossScaleOptimizer changed, but old checkpoints can still be loaded
        opt = gradient_descent.SGD(0.1, momentum=0.1)
        opt = loss_scale_optimizer.LossScaleOptimizer(opt)
        model = sequential.Sequential(
            [
                core.Dense(
                    2,
                )
            ]
        )

        # The checkpoint and expected values were obtained from the program in
        # testdata/BUILD.
        ckpt_dir = os.path.join(
            flags.FLAGS["test_srcdir"].value,
            "org_keras/keras",
            "mixed_precision/testdata/lso_ckpt_tf2.2",
        )
        # ckpt_dir = test.test_src_dir_path(
        #     'python/keras/mixed_precision/testdata/lso_ckpt_tf2.2')
        model.load_weights(os.path.join(ckpt_dir, "ckpt"))
        model.compile(opt, "mse", run_eagerly=test_utils.should_run_eagerly())
        model(np.zeros((2, 2)))  # Create model weights
        opt._create_all_weights(model.weights)
        expected_kernel = np.array(
            [[9.229685, 10.901115], [10.370763, 9.757362]]
        )
        expected_slot = np.array([[10.049943, 9.917691], [10.049943, 9.917691]])
        self.assertAllClose(self.evaluate(model.weights[0]), expected_kernel)
        self.assertAllClose(
            self.evaluate(opt.get_slot(model.weights[0], "momentum")),
            expected_slot,
        )
        self.assertEqual(self.evaluate(opt.loss_scale), 32768)
        self.assertEqual(self.evaluate(opt.dynamic_counter), 1)

        # Check restoring works even after the model is compiled and the weights
        # have been created.
        model.fit(np.random.normal(size=(2, 2)), np.random.normal(size=(2, 2)))
        self.assertNotAllClose(self.evaluate(model.weights[0]), expected_kernel)
        self.assertNotAllClose(
            self.evaluate(opt.get_slot(model.weights[0], "momentum")),
            expected_slot,
        )
        model.load_weights(os.path.join(ckpt_dir, "ckpt"))
        self.assertAllClose(self.evaluate(model.weights[0]), expected_kernel)
        self.assertAllClose(
            self.evaluate(opt.get_slot(model.weights[0], "momentum")),
            expected_slot,
        )
        self.assertEqual(self.evaluate(opt.loss_scale), 32768)
        self.assertEqual(self.evaluate(opt.dynamic_counter), 1)
  def testHyperParametersExposed(self):
    """Inner-optimizer hyperparameters read and write through the wrapper."""
    with self.cached_session():
      opt = adam.Adam(learning_rate=1.0, beta_1=0.5, beta_2=0.9)
      lso = loss_scale_optimizer.LossScaleOptimizer(opt)
      # Force hyperparameters to be created
      opt.lr  # pylint: disable=pointless-statement
      self.evaluate(tf.compat.v1.global_variables_initializer())

      # Reads on the wrapper return the inner optimizer's hyperparameters.
      self.assertEqual(self.evaluate(lso.beta_1), 0.5)
      self.assertIsInstance(lso.beta_1, tf.Variable)
      self.assertEqual(self.evaluate(lso.lr), 1.0)
      self.assertIs(lso.lr, opt.lr)
      self.assertIs(lso.lr, lso.learning_rate)

      # Writes in either direction are visible on both objects.
      lso.beta_1 = 0.25
      self.assertEqual(self.evaluate(lso.beta_1), 0.25)
      self.assertEqual(self.evaluate(opt.beta_1), 0.25)
      self.assertIs(lso.beta_1, opt.beta_1)
      opt.beta_1 = 0.75
      self.assertEqual(self.evaluate(lso.beta_1), 0.75)
      self.assertEqual(self.evaluate(opt.beta_1), 0.75)
      self.assertIs(lso.beta_1, opt.beta_1)
      lso.lr = 2.0
      self.assertEqual(self.evaluate(lso.lr), 2.0)
      self.assertEqual(self.evaluate(lso.learning_rate), 2.0)
      self.assertEqual(self.evaluate(opt.lr), 2.0)
      self.assertEqual(self.evaluate(opt.learning_rate), 2.0)
      self.assertIs(lso.lr, opt.lr)

      # Test setting attribute that is both attribute on LossScaleOptimizer and
      # hyperparameter on wrapped optimizer.
      class MyOpt(gradient_descent.SGD):

        def __init__(self):
          super().__init__()
          self._set_hyper('loss_scale', 123.)

      opt = MyOpt()
      lso = loss_scale_optimizer.LossScaleOptimizer(opt)
      with self.assertRaises(AttributeError):
        lso.loss_scale = 2.
Ejemplo n.º 21
0
    def testArbitraryAttributesNotExposed(self):
        """Non-hyperparameter inner-optimizer attributes are not delegated."""
        opt = gradient_descent.SGD()
        lso = loss_scale_optimizer.LossScaleOptimizer(opt)
        self.assertFalse(opt.nesterov)
        with self.assertRaisesRegex(
                AttributeError,
                "'LossScaleOptimizer' object has no attribute 'nesterov'"):
            lso.nesterov  # pylint: disable=pointless-statement

        # Setting such an attribute affects only the wrapper, not the inner
        # optimizer.
        lso.nesterov = True
        self.assertTrue(lso.nesterov)
        self.assertFalse(opt.nesterov)
Ejemplo n.º 22
0
 def testGetUnscaledGradients(self):
     """get_unscaled_gradients divides by the scale and passes None through."""
     lso = loss_scale_optimizer.LossScaleOptimizer(
         gradient_descent.SGD(2.0), dynamic=False, initial_scale=2)
     scaled = [
         tf.convert_to_tensor(3.), None,
         tf.convert_to_tensor(-4., dtype='float16')
     ]
     # None entries (no gradient) are passed through unchanged.
     unscaled = [
         self.evaluate(g) if g is not None else g
         for g in lso.get_unscaled_gradients(scaled)
     ]
     self.assertEqual([1.5, None, -2.], unscaled)
Ejemplo n.º 23
0
 def testGetScaledLoss(self):
     """get_scaled_loss multiplies tensor or callable losses by the scale."""
     lso = loss_scale_optimizer.LossScaleOptimizer(
         gradient_descent.SGD(2.0), dynamic=False, initial_scale=2.)
     # float32 loss, passed directly and as a zero-argument callable.
     loss = tf.convert_to_tensor(5.)
     self.assertEqual(10., self.evaluate(lso.get_scaled_loss(loss)))
     self.assertEqual(
         10., self.evaluate(lso.get_scaled_loss(lambda: loss)()))
     # A float16 loss is scaled the same way.
     loss = tf.convert_to_tensor(5., dtype='float16')
     self.assertEqual(10., self.evaluate(lso.get_scaled_loss(loss)))
     self.assertEqual(
         10., self.evaluate(lso.get_scaled_loss(lambda: loss)()))
Ejemplo n.º 24
0
 def testGetUnscaledSparseGradients(self):
     """IndexedSlices gradients are unscaled value-wise, keeping structure."""
     lso = loss_scale_optimizer.LossScaleOptimizer(
         gradient_descent.SGD(2.0), dynamic=False, initial_scale=2)
     scaled = tf.IndexedSlices(
         tf.convert_to_tensor([[4., 2.], [8., 5.]]),
         tf.convert_to_tensor([1, 3], dtype='int32'),
         dense_shape=tf.convert_to_tensor([5, 2], dtype='int32'))
     (unscaled,) = lso.get_unscaled_gradients([scaled])
     # The result stays sparse; only the values are divided by the scale.
     self.assertIsInstance(unscaled, tf.IndexedSlices)
     self.assertAllEqual([[2., 1.], [4., 2.5]],
                         self.evaluate(unscaled.values))
 def testFixedLossScaleAppliedToLossWithGetGradients(self):
   """get_gradients in graph mode scales the loss by the fixed loss scale."""
   with tf.Graph().as_default():
     var = tf.Variable([2.0])
     opt = gradient_descent.SGD(1.0)
     loss_scale = 10.
     opt = loss_scale_optimizer.LossScaleOptimizer(opt, dynamic=False,
                                                   initial_scale=loss_scale)
     grad_check_fn = mp_test_util.create_identity_with_grad_check_fn(
         loss_scale)
     loss = grad_check_fn(var)
     run_op = opt.get_gradients(loss, [var])
     self.evaluate(tf.compat.v1.global_variables_initializer())
     # This will cause an assertion to run, as
     # mp_test_util.create_identity_with_grad_check_fn added an assertion op.
     self.evaluate(run_op)
  def testWeightMethods(self):
    """weights/get_weights/set_weights/variables stay consistent after a step."""
    with self.test_session():
      var = tf.Variable([1.0])
      opt = gradient_descent.SGD(1.0)
      opt = loss_scale_optimizer.LossScaleOptimizer(opt, initial_scale=2.,
                                                    dynamic_growth_steps=1)
      run_op = opt.minimize(lambda: var * 2, [var])
      self.evaluate(tf.compat.v1.global_variables_initializer())
      self._run_if_in_graph_mode(run_op)

      self.assertLen(opt.weights, 1)  # The 'iterations' weight
      self.assertEqual(self.evaluate(opt.weights[0]), 1)
      self.assertEqual(opt.get_weights()[0], 1)
      self.assertEqual(self.evaluate(opt.variables()[0]), 1)
      # set_weights writes through to the same underlying variable.
      opt.set_weights([np.array(2.)])
      self.assertEqual(self.evaluate(opt.variables()[0]), 2)
Ejemplo n.º 27
0
    def testDynamicLossScaleWithFloat16Loss(self, strategy_fn):
        """A float16 loss trains correctly under dynamic loss scaling."""
        strategy = strategy_fn()
        learning_rate = 2.
        with strategy.scope():
            var = tf.Variable([5.0])
            opt = gradient_descent.SGD(learning_rate)
            opt = loss_scale_optimizer.LossScaleOptimizer(
                opt, initial_scale=2, dynamic_growth_steps=1)

            # The loss is cast to float16 to exercise the half-precision path.
            def loss():
                return tf.cast(var / strategy.num_replicas_in_sync, 'float16')

            run_fn = lambda: opt.minimize(loss, var_list=[var])
            run_op = strategy.experimental_run(run_fn)
            self.evaluate(tf.compat.v1.global_variables_initializer())
            self._run_if_in_graph_mode(run_op)
            # The loss is the identity of the variable. Therefore the gradient is 1,
            # and so the variable will be init_val - grad * lr == 5 - 1 * 2 == 3
            self.assertAllClose([3.], self.evaluate(var))
Ejemplo n.º 28
0
    def test_save_weights_with_dynamic_loss_scaling(self, strategy_fn):
        """save_weights/load_weights round-trips the dynamic loss-scale state."""
        strategy = strategy_fn()
        if (
            isinstance(strategy, tf.distribute.MirroredStrategy)
            and not tf.executing_eagerly()
        ):
            # TODO(b/121381184): Enable running the test in this case.
            return

        # Create and run model.
        with strategy.scope():
            x = layers.Input(shape=(2,), batch_size=2, dtype=tf.float32)
            y = mp_test_util.MultiplyLayer(assert_type=tf.float32)(x)
            model = models.Model(inputs=x, outputs=y)

            opt = gradient_descent.SGD(1.0)
            opt = loss_scale_optimizer.LossScaleOptimizer(
                opt, initial_scale=1.0, dynamic_growth_steps=2.0
            )
            model.compile(
                optimizer=opt,
                loss="mse",
                run_eagerly=test_utils.should_run_eagerly(),
            )
        # Run for 3 steps (6 examples with a batch size of 2)
        model.fit(np.zeros((6, 2)), np.zeros((6, 2)), batch_size=2)
        self.assertEqual(backend.get_value(opt.loss_scale), 2)
        self.assertEqual(backend.get_value(opt.dynamic_counter), 1)

        # Save model weights.
        save_prefix = os.path.join(self.get_temp_dir(), "ckpt")
        model.save_weights(save_prefix)

        # Run model again for 1 step (2 examples with a batch size of 2)
        model.fit(np.zeros((2, 2)), np.zeros((2, 2)), batch_size=2)
        self.assertEqual(backend.get_value(opt.loss_scale), 4)
        self.assertEqual(backend.get_value(opt.dynamic_counter), 0)

        # Load model weights and ensure loss scale weights are restored.
        model.load_weights(save_prefix)
        self.assertEqual(backend.get_value(opt.loss_scale), 2)
        self.assertEqual(backend.get_value(opt.dynamic_counter), 1)
# --- Example 29 ---
    def test_compile_wraps_with_loss_scale_optimizer(self):
        """compile() auto-wraps optimizers in a LossScaleOptimizer only under
        the mixed_float16 policy; mixed_bfloat16 leaves them untouched."""
        x = layers.Input(shape=(1,))
        y = mp_test_util.MultiplyLayer()(x)

        with policy.policy_scope("mixed_float16"):
            # An optimizer instance is wrapped, keeping its hyperparameters.
            model = models.Model(x, y)
            model.compile(gradient_descent.SGD(1.0), "mse")
            wrapped = model.optimizer
            self.assertIsInstance(
                wrapped, loss_scale_optimizer.LossScaleOptimizer
            )
            self.assertEqual(backend.get_value(wrapped.learning_rate), 1.0)

            # A string-identified optimizer is wrapped as well.
            model = models.Model(x, y)
            model.compile("sgd", "mse")
            self.assertIsInstance(
                model.optimizer, loss_scale_optimizer.LossScaleOptimizer
            )

            # An explicit LossScaleOptimizer is taken as-is; no double wrap.
            lso = loss_scale_optimizer.LossScaleOptimizer(
                gradient_descent.SGD(1.0), dynamic_growth_steps=2
            )
            model = models.Model(x, y)
            model.compile(lso, "mse")
            self.assertIsInstance(
                model.optimizer, loss_scale_optimizer.LossScaleOptimizer
            )
            self.assertEqual(model.optimizer.dynamic_growth_steps, 2)

        with policy.policy_scope("mixed_bfloat16"):
            # bfloat16 does not need loss scaling, so no wrapping happens.
            model = models.Model(x, y)
            model.compile(gradient_descent.SGD(1.0), "mse")
            self.assertNotIsInstance(
                model.optimizer, loss_scale_optimizer.LossScaleOptimizer
            )
            self.assertIsInstance(model.optimizer, gradient_descent.SGD)
# --- Example 30 ---
    def testClipping(self, strategy_fn):
        """Gradient clipping composes with dynamic loss scaling.

        For each clipping flavor, verifies that (1) clipping applies to the
        unscaled gradient, (2) the clip attribute can be changed on the
        wrapper, and (3) non-finite gradients skip the update (and halve the
        loss scale) rather than being clipped to a finite value.
        """
        strategy = strategy_fn()
        learning_rate = 2.
        for clip_type in ('clipnorm', 'global_clipnorm', 'clipvalue'):
            with strategy.scope(), self.subTest(clip_type=clip_type):
                var = tf.Variable([5.0])
                opt = gradient_descent.SGD(learning_rate, **{clip_type: 2.0})
                opt = loss_scale_optimizer.LossScaleOptimizer(
                    opt, initial_scale=2, dynamic_growth_steps=1)
                # Clipping attributes are forwarded through the LSO wrapper.
                self.assertEqual(getattr(opt, clip_type), 2.0)
                # Sanity check: the per-replica division below stays exact.
                self.assertEqual(
                    opt.initial_scale % strategy.num_replicas_in_sync, 0)

                loss = lambda: var * 4 / strategy.num_replicas_in_sync
                run_fn = lambda: opt.minimize(loss, var_list=[var])

                # Test running with clipped gradients
                run_op = strategy.experimental_run(run_fn)
                self.evaluate(tf.compat.v1.global_variables_initializer())
                self._run_if_in_graph_mode(run_op)
                # The loss is the identity of the variable. Therefore the gradient is 1,
                # The gradient is 4 but is clipped to 2, so the variable will be
                # init_val - clipped_grad * lr == 5 - 2 * 2 == 1
                self.assertAllClose([1.], self.evaluate(var))
                # dynamic_growth_steps=1: one finite step doubles the scale.
                self.assertEqual(self.evaluate(opt.loss_scale), 4)

                # Test changing the clip amount and running again
                setattr(opt, clip_type, 3.0)
                run_op = strategy.experimental_run(run_fn)
                self._run_if_in_graph_mode(run_op)
                # The gradient is 4 but is clipped to 3, so the variable will be
                # prev_var - clipped_grad * lr == 1 - 3 * 2 == -5
                self.assertAllClose([-5.], self.evaluate(var))
                self.assertEqual(self.evaluate(opt.loss_scale), 8)

                # Test Inf gradients are still skipped instead of being clipped
                loss = lambda: var * float('Inf')
                run_fn = lambda: opt.minimize(loss, var_list=[var])
                run_op = strategy.experimental_run(run_fn)
                self._run_if_in_graph_mode(run_op)
                self.assertAllClose([-5.],
                                    self.evaluate(var))  # Var does not change
                # A non-finite step halves the scale instead of updating vars.
                self.assertEqual(self.evaluate(opt.loss_scale), 4)