コード例 #1
0
 def testErrorWrappingSameOptimizerMultipleTimes(self):
   """Wrapping the same inner optimizer twice must raise a ValueError."""
   sgd = gradient_descent.SGD()
   # First wrap succeeds and marks the inner optimizer as wrapped.
   loss_scale_optimizer.LossScaleOptimizer(sgd)
   expected = (
       '"inner_optimizer" is already wrapped by a LossScaleOptimizer.')
   with self.assertRaisesRegex(ValueError, expected):
     loss_scale_optimizer.LossScaleOptimizer(sgd)
コード例 #2
0
 def testInvalidArgsWithFixedLossScale(self):
   """Fixed loss scaling rejects a missing initial_scale and any growth steps."""
   # dynamic=False without initial_scale is an error.
   with self.assertRaisesRegex(
       ValueError, '"initial_scale" must be specified if "dynamic" is False'):
     loss_scale_optimizer.LossScaleOptimizer(
         gradient_descent.SGD(), dynamic=False)
   # dynamic_growth_steps only makes sense for dynamic loss scaling.
   with self.assertRaisesRegex(
       ValueError, '"dynamic_growth_steps" must be None if "dynamic" is '
                   'False, but got: 2'):
     loss_scale_optimizer.LossScaleOptimizer(
         gradient_descent.SGD(), dynamic=False, initial_scale=1,
         dynamic_growth_steps=2)
コード例 #3
0
 def testUnsupportedStrategy(self):
   """LossScaleOptimizer rejects CentralStorageStrategy at creation and at use."""
   strategy = central_storage_strategy.CentralStorageStrategy()
   expected_error = (
       'Loss scaling is not supported with the tf.distribute.Strategy: '
       'CentralStorageStrategy. Try using a different Strategy, e.g. a '
       'MirroredStrategy')
   # Creating the optimizer inside the unsupported strategy's scope fails.
   with strategy.scope(), self.assertRaisesRegex(ValueError, expected_error):
     loss_scale_optimizer.LossScaleOptimizer(gradient_descent.SGD())
   # Creating it outside the scope succeeds, but running it inside still fails.
   opt = loss_scale_optimizer.LossScaleOptimizer(gradient_descent.SGD())
   with strategy.scope():
     var = variables.Variable(1.0)
     loss = lambda: var * 2.0
     run_fn = lambda: opt.minimize(loss, [var])
     with self.assertRaisesRegex(ValueError, expected_error):
       strategy.experimental_run(run_fn)
コード例 #4
0
 def testDynamicAttrsWithFixedLossScale(self):
   """Dynamic-scaling attributes are disabled when the loss scale is fixed."""
   fixed_opt = loss_scale_optimizer.LossScaleOptimizer(
       gradient_descent.SGD(), dynamic=False, initial_scale=2.)
   self.assertFalse(fixed_opt.dynamic)
   self.assertIsNone(fixed_opt.dynamic_counter)
   self.assertIsNone(fixed_opt.dynamic_growth_steps)
コード例 #5
0
    def testDynamicLossScale(self, strategy_fn):
        """Runs two finite-gradient steps and checks the loss scale doubles.

        With dynamic_growth_steps=1, every successful step doubles the loss
        scale, so the expected per-replica gradient is doubled between steps.
        """
        strategy = strategy_fn()
        learning_rate = 2.
        expected_gradient = variables.Variable(learning_rate /
                                               strategy.num_replicas_in_sync)
        with strategy.scope():
            var = variables.Variable([5.0])
            opt = gradient_descent.SGD(learning_rate)
            opt = loss_scale_optimizer.LossScaleOptimizer(
                opt, initial_scale=2, dynamic_growth_steps=1)
            self.assertEqual(opt.initial_scale, 2.)
            self.assertIsInstance(opt.initial_scale, float)
            self.assertEqual(opt.dynamic_growth_steps, 1)
            self.assertIsInstance(opt.dynamic_growth_steps, int)

            # initial_scale must be divisible by the replica count so the
            # per-replica gradient check below is exact.
            self.assertEqual(opt.initial_scale % strategy.num_replicas_in_sync,
                             0)
            run_fn = self._run_fn_with_grad_check(strategy, var, opt,
                                                  expected_gradient)
            run_op = strategy.experimental_run(run_fn)
            self.evaluate(variables.global_variables_initializer())
            self._run_if_in_graph_mode(run_op)
            # The loss is the identity of the variable. Therefore the gradient is 1,
            # and so the variable will be init_val - grad * lr == 5 - 1 * 2 == 3
            self.assertAllClose([3.], self.evaluate(var))

            # Loss scale will be double, so the expected gradient is also doubled.
            self.evaluate(
                expected_gradient.assign(2 * learning_rate /
                                         strategy.num_replicas_in_sync))
            run_op = strategy.experimental_run(run_fn)
            self._run_if_in_graph_mode(run_op)
            # As before, the 2 is subtracted from the variable, making its new value
            # 1.
            self.assertAllClose([1.], self.evaluate(var))
コード例 #6
0
 def testDynamicLossScaleDefaultValues(self):
     """A dynamic LossScaleOptimizer starts with the documented defaults."""
     lso = loss_scale_optimizer.LossScaleOptimizer(gradient_descent.SGD())
     # Default initial scale is 2**15 and growth period is 2000 steps.
     self.assertEqual(lso.initial_scale, 2**15)
     self.assertEqual(lso.dynamic_growth_steps, 2000)
     self.evaluate(variables.global_variables_initializer())
     self.assertEqual(self.evaluate(lso.loss_scale), 2**15)
コード例 #7
0
    def testDynamicLossScaleWithSlots(self, strategy_fn):
        """Checks slot variables (momentum) behave correctly under dynamic scaling."""
        strategy_obj = strategy_fn()
        if (isinstance(strategy_obj, mirrored_strategy.MirroredStrategy)
                and control_flow_v2_toggles.control_flow_v2_enabled()
                and not context.executing_eagerly()):
            self.skipTest('b/138667997')
        with strategy_obj.scope() as strategy:
            var = variables.Variable([1.0, 2.0])
            # An SGD optimizer with momentum has slot variables.
            opt = gradient_descent.SGD(1.0, momentum=1.)
            initial_scale = 2.
            opt = loss_scale_optimizer.LossScaleOptimizer(
                opt, initial_scale=initial_scale, dynamic_growth_steps=1)
            loss = lambda: var / strategy.num_replicas_in_sync
            run_fn = lambda: opt.minimize(loss, var_list=[var])
            run_op = strategy.experimental_run(run_fn)
            self.evaluate(variables.global_variables_initializer())
            self._run_if_in_graph_mode(run_op)
            # The momentum accumulator starts at 0 and the gradient is 1. The
            # accumulator is incremented by the gradient, so it is now 1. Then the
            # variable is subtracted by the accumulator, so the variable is subtracted
            # by 1.
            self.assertAllClose([0.0, 1.0], self.evaluate(var))
            # With dynamic_growth_steps=1, every finite step doubles the scale.
            self.assertEqual(self.evaluate(opt.loss_scale), initial_scale * 2)

            run_op = strategy.experimental_run(run_fn)
            self._run_if_in_graph_mode(run_op)
            # The momentum accumulator was 1 before this step and the gradient is 1.
            # The accumulator is incremented by the gradient, so it is now 2. Then the
            # variable is subtracted by the accumulator, so the variable is subtracted
            # by 2.
            self.assertAllClose([-2., -1.], self.evaluate(var))
            self.assertEqual(self.evaluate(opt.loss_scale), initial_scale * 4)

            self.assertEqual(opt.get_slot_names(), ['momentum'])
コード例 #8
0
    def testSerializationWithBuiltInOptimizer(self, use_v1):
        """Round-trips a LossScaleOptimizer(V1) through (de)serialization.

        Serializes either a LossScaleOptimizerV1 or a LossScaleOptimizer,
        deserializes the config, and checks the hyperparameters, loss-scale
        state, and usability of the restored optimizer.
        """
        opt = gradient_descent.SGD(2., momentum=0.5)
        if use_v1:
            loss_scale = tf_loss_scale_module.DynamicLossScale(
                initial_loss_scale=2., increment_period=3.)
            opt = loss_scale_optimizer.LossScaleOptimizerV1(opt, loss_scale)
        else:
            opt = loss_scale_optimizer.LossScaleOptimizer(
                opt, initial_scale=2., dynamic_growth_steps=3.)
        config = optimizers.serialize(opt)
        opt = optimizers.deserialize(config)
        # Force hyperparameters to be created
        opt.lr  # pylint: disable=pointless-statement
        self.evaluate(variables.global_variables_initializer())

        self.assertEqual(self.evaluate(opt.lr), 2.)
        self.assertEqual(self.evaluate(opt.inner_optimizer.momentum), 0.5)
        self.assertEqual(self.evaluate(opt.loss_scale), 2.)
        self.assertEqual(opt.dynamic_growth_steps, 3.)
        # Bug fix: a stray second argument (4.) was previously passed here and
        # was silently consumed as assertTrue's `msg` parameter.
        self.assertTrue(opt.dynamic)
        # Deserializing a LossScaleOptimizer always results in a V2
        # LossScaleOptimizer, even if serialized with a LossScaleOptimizerV1.
        self.assertAllEqual(type(opt), loss_scale_optimizer.LossScaleOptimizer)

        # Ensure the optimizer can be used
        var = variables.Variable([5.0])
        run_op = self._run_fn_with_grad_check(
            distribution_strategy_context.get_strategy(), var, opt, 2)()
        self.evaluate(variables.global_variables_initializer())
        self._run_if_in_graph_mode(run_op)
        self.assertEqual(self.evaluate(var), [3.])
        self.assertEqual(self.evaluate(opt.dynamic_counter), 1)
コード例 #9
0
    def testNanOnOneReplicaOnly(self):
        """A NaN gradient on any single replica must skip the step everywhere."""
        if not test_util.is_gpu_available():
            self.skipTest('Test requires GPU')
        if (not context.executing_eagerly()
                and not control_flow_v2_toggles.control_flow_v2_enabled()):
            self.skipTest(
                'b/181283011: GradientTape does not work properly with '
                'V1 control flow, and opt.minimize uses GradientTape')
        with create_mirrored_strategy().scope() as strategy:
            var = variables.Variable([1.0, 2.0])
            opt = gradient_descent.SGD(1.0)
            opt = loss_scale_optimizer.LossScaleOptimizer(
                opt, initial_scale=2, dynamic_growth_steps=2)

            def loss():
                # Per-replica loss: only the non-zero replica emits a NaN.
                rep_id = (distribution_strategy_context.get_replica_context().
                          replica_id_in_sync_group)
                # The last element of last replica's gradient is NaN.
                return control_flow_ops.cond(
                    constant_op.constant(rep_id == 0), lambda: var * 2.,
                    lambda: var * constant_op.constant([1., float('NaN')]))

            run_fn = lambda: opt.minimize(loss, var_list=[var])
            run_op = strategy.experimental_run(run_fn)
            self.evaluate(variables.global_variables_initializer())
            self._run_if_in_graph_mode(run_op)
            # Variable should not change from before, due to NaN gradients.
            self.assertAllClose(self.evaluate(var), [1.0, 2.0])
            # Loss scale should half due to NaN gradients.
            self.assertEqual(1., self.evaluate(opt.loss_scale))
コード例 #10
0
 def testDynamicMustBeBool(self):
     """Passing a non-bool for "dynamic" raises a TypeError."""
     expected = ('"dynamic" argument to LossScaleOptimizer.__init__ must be '
                 "a bool, but got: 'dynamic'")
     with self.assertRaisesRegex(TypeError, expected):
         loss_scale_optimizer.LossScaleOptimizer(
             gradient_descent.SGD(), 'dynamic')
コード例 #11
0
    def testApplyGradientsGetsUnwrappedTensors(self):
        # Tests that gradients passed to apply_gradients are not wrapped in a
        # DistributionStrategy wrapper, such as PerReplica, but instead are raw
        # Tensors. Optimizer subclasses that override apply_gradients() expect raw
        # Tensors, even though the base Optimizer can handle PerReplica gradients.

        outer_self = self

        class MyOptimizer(gradient_descent.SGD):
            def apply_gradients(self,
                                grads_and_vars,
                                name=None,
                                experimental_aggregate_gradients=True):
                # Assert from inside the optimizer: each gradient must already
                # be a plain Tensor by the time apply_gradients is invoked.
                for grad, _ in grads_and_vars:
                    outer_self.assertIsInstance(grad, ops.Tensor)
                return super(MyOptimizer, self).apply_gradients(
                    grads_and_vars, name, experimental_aggregate_gradients)

        with create_mirrored_strategy().scope() as strategy:
            var = variables.Variable([5.0])
            opt = MyOptimizer(learning_rate=1.0)
            opt = loss_scale_optimizer.LossScaleOptimizer(opt,
                                                          dynamic=False,
                                                          initial_scale=1)
            loss = lambda: var * 2.0
            run_fn = lambda: opt.minimize(loss, [var])
            strategy.experimental_run(run_fn)
コード例 #12
0
 def testIterations(self):
   """Setting iterations on the wrapper is mirrored on the inner optimizer."""
   inner = gradient_descent.SGD(2.0)
   wrapper = loss_scale_optimizer.LossScaleOptimizer(
       inner, dynamic=False, initial_scale=10.)
   wrapper.iterations = 7
   self.assertEqual(wrapper.iterations, 7)
   self.assertEqual(inner.iterations, 7)
コード例 #13
0
    def test_save_slot_variables_with_autocast_vars(self,
                                                    strategy_fn,
                                                    var_name='v'):
        """Slot variables of AutoCastVariables save and restore correctly."""
        p = policy.Policy('mixed_float16')
        with strategy_fn().scope(), policy.policy_scope(p):
            x = layers.Input(shape=(2, ), batch_size=2)
            # Having a var_name other than 'v' tests that a fixed bug (b/134713714)
            # does not reoccur. The bug was that a crash would occur when saving a
            # checkpoint where an AutoCastVariable with a slot variable would have a
            # different name than the layer attribute's name (layer.v in this case).
            layer = mp_test_util.MultiplyLayer(assert_type=dtypes.float16,
                                               var_name=var_name)
            y = layer(x)
            model = models.Model(inputs=x, outputs=y)
            opt = gradient_descent.SGD(1., 1.)
            opt = loss_scale_optimizer.LossScaleOptimizer(opt,
                                                          dynamic=False,
                                                          initial_scale=1)
            model.compile(optimizer=opt,
                          loss='mse',
                          run_eagerly=testing_utils.should_run_eagerly())

        model.fit(np.ones((2, 2)), np.zeros((2, 2)), batch_size=2)
        weights_file = os.path.join(self.get_temp_dir(), 'weights')
        model.save_weights(weights_file)
        saved_slot = backend.get_value(opt.get_slot(layer.v, 'momentum'))

        # Train another step so the slot value diverges from the checkpoint.
        model.fit(np.ones((2, 2)), np.zeros((2, 2)), batch_size=2)
        new_slot = backend.get_value(opt.get_slot(layer.v, 'momentum'))
        self.assertNotEqual(new_slot, saved_slot)

        # Restoring the checkpoint must bring the slot value back.
        model.load_weights(weights_file)
        restored_slot = backend.get_value(opt.get_slot(layer.v, 'momentum'))
        self.assertEqual(restored_slot, saved_slot)
コード例 #14
0
    def testDynamicUpdate(self, strategy_fn):
        """Loss scale doubles on finite gradients and halves on NaN gradients."""
        with strategy_fn().scope() as strategy:
            var = variables.Variable([1.0, 2.0])
            opt = gradient_descent.SGD(1.0)
            opt = loss_scale_optimizer.LossScaleOptimizer(
                opt, initial_scale=2, dynamic_growth_steps=1)

            # Test optimizer with finite gradients
            loss = lambda: var * 2.0 / strategy.num_replicas_in_sync
            run_fn = lambda: opt.minimize(loss, var_list=[var])
            run_op = strategy.experimental_run(run_fn)
            self.evaluate(variables.global_variables_initializer())
            self._run_if_in_graph_mode(run_op)
            # Gradient is 2, so variable will have 2 subtracted from it
            self.assertAllClose([-1.0, 0.0], self.evaluate(var))
            # Loss scale has doubled from 2 to 4
            self.assertEqual(4., self.evaluate(opt.loss_scale))

            # Test optimizer with NaN gradients
            loss = lambda: var * float('NaN')
            run_fn = lambda: opt.minimize(loss, var_list=[var])
            run_op = strategy.experimental_run(run_fn)
            self._run_if_in_graph_mode(run_op)
            # Variable should not change from before, due to NaN gradients.
            self.assertAllClose(self.evaluate(var), [-1.0, 0.0])
            # Loss scale should half due to NaN gradients.
            self.assertEqual(2., self.evaluate(opt.loss_scale))
コード例 #15
0
  def testGetConfigFixed(self, get_config, from_config):
    """Round-trips fixed-loss-scale configs between V1, V2 and TF 2.3 formats."""
    # Get a config from LossScaleOptimizerV1, LossScaleOptimizer, or the
    # LossScaleOptimizer from TF 2.3. Then restore the config into a
    # LossScaleOptimizerV1 or LossScaleOptimizer
    opt = gradient_descent.SGD(2., momentum=0.5)
    if get_config == 'v1':
      opt = loss_scale_optimizer.LossScaleOptimizerV1(opt, 2)
      config = opt.get_config()
    elif get_config == 'v2':
      opt = loss_scale_optimizer.LossScaleOptimizer(
          opt, dynamic=False, initial_scale=2)
      config = opt.get_config()
    else:
      self.assertEqual(get_config, 'tf2_3')
      # Hard-coded config in the exact format TF 2.3 produced.
      config = {
          'optimizer': {
              'class_name': 'SGD',
              'config': {
                  'learning_rate': 2.0,
                  'momentum': 0.5,
                  'decay': 0.0,
                  'nesterov': False,
                  'name': 'SGD',
              }
          },
          'loss_scale': {
              'class_name': 'FixedLossScale',
              'config': {'loss_scale_value': 2.0}
          },
      }

    if from_config == 'v1':
      opt = loss_scale_optimizer.LossScaleOptimizerV1.from_config(config)
    else:
      self.assertEqual(from_config, 'v2')
      opt = loss_scale_optimizer.LossScaleOptimizer.from_config(config)

    # Force hyperparameters to be created
    opt.lr  # pylint: disable=pointless-statement
    self.evaluate(variables.global_variables_initializer())

    # Test attributes on the optimizer
    self.assertEqual(self.evaluate(opt.lr), 2.)
    self.assertEqual(self.evaluate(opt.inner_optimizer.lr), 2.)
    self.assertEqual(self.evaluate(opt.momentum), 0.5)
    self.assertEqual(self.evaluate(opt.loss_scale), 2.)
    self.assertEqual(opt.initial_scale, 2.)
    self.assertIsNone(opt.dynamic_growth_steps)
    self.assertIsNone(opt.dynamic_counter)
    self.assertFalse(opt.dynamic)

    # Ensure the optimizer can be used
    var = variables.Variable([5.0])
    run_op = self._run_fn_with_grad_check(
        distribution_strategy_context.get_strategy(), var, opt, 2)()
    self.evaluate(variables.global_variables_initializer())
    self._run_if_in_graph_mode(run_op)
    self.assertEqual(self.evaluate(var), [3.])
コード例 #16
0
 def testDir(self):
   """dir() on the wrapper lists hyperparameters but not inner-only attrs."""
   wrapper = loss_scale_optimizer.LossScaleOptimizer(gradient_descent.SGD())
   listed = dir(wrapper)
   # Hyperparameters of the inner optimizer are surfaced on the wrapper.
   self.assertIn('learning_rate', listed)
   self.assertIn('lr', listed)
   # Regular attributes of the wrapper itself.
   self.assertIn('minimize', listed)
   self.assertIn('loss_scale', listed)
   # Non-hyperparameter attributes of the inner optimizer stay hidden.
   self.assertNotIn('nesterov', listed)
   self.assertIn('nesterov', dir(wrapper.inner_optimizer))
コード例 #17
0
 def testGetScaledLoss(self):
   """get_scaled_loss multiplies tensor and callable losses by the scale."""
   lso = loss_scale_optimizer.LossScaleOptimizer(
       gradient_descent.SGD(2.0), dynamic=False, initial_scale=2.)
   loss_f32 = ops.convert_to_tensor_v2_with_dispatch(5.)
   self.assertEqual(10., self.evaluate(lso.get_scaled_loss(loss_f32)))
   self.assertEqual(
       10., self.evaluate(lso.get_scaled_loss(lambda: loss_f32)()))
   # A float16 loss is scaled the same way.
   loss_f16 = ops.convert_to_tensor_v2_with_dispatch(5., dtype='float16')
   self.assertEqual(10., self.evaluate(lso.get_scaled_loss(loss_f16)))
   self.assertEqual(
       10., self.evaluate(lso.get_scaled_loss(lambda: loss_f16)()))
コード例 #18
0
 def test_loss_scale_optimizer_overrides_policy_v1_loss_scale(self):
     """An explicit LossScaleOptimizer wins over the PolicyV1 loss_scale."""
     with policy.policy_scope(policy.PolicyV1('float32', loss_scale=10.)):
         opt = gradient_descent.SGD(1.)
         opt = loss_scale_optimizer.LossScaleOptimizer(opt,
                                                       dynamic=False,
                                                       initial_scale=5.)
         x = layers.Input(shape=(1, ))
         y = mp_test_util.MultiplyLayer()(x)
         model = models.Model(x, y)
         model.compile(opt, loss='mse')
         # The optimizer's scale (5), not the policy's (10), must be in effect.
         self.assertEqual(self.evaluate(model.optimizer.loss_scale), 5.)
コード例 #19
0
 def testGetUnscaledGradients(self):
   """get_unscaled_gradients divides by the scale and passes None through."""
   lso = loss_scale_optimizer.LossScaleOptimizer(
       gradient_descent.SGD(2.0), dynamic=False, initial_scale=2)
   scaled = [
       ops.convert_to_tensor_v2_with_dispatch(3.),
       None,
       ops.convert_to_tensor_v2_with_dispatch(-4., dtype='float16'),
   ]
   unscaled = []
   for g in lso.get_unscaled_gradients(scaled):
     unscaled.append(self.evaluate(g) if g is not None else None)
   self.assertEqual([1.5, None, -2.], unscaled)
コード例 #20
0
  def testHyperParametersExposed(self):
    """Inner-optimizer hyperparameters are readable and writable on the wrapper."""
    with self.cached_session():
      opt = adam.Adam(learning_rate=1.0, beta_1=0.5, beta_2=0.9)
      lso = loss_scale_optimizer.LossScaleOptimizer(opt)
      # Force hyperparameters to be created
      opt.lr  # pylint: disable=pointless-statement
      self.evaluate(variables.global_variables_initializer())

      # Reads through the wrapper return the inner optimizer's variables.
      self.assertEqual(self.evaluate(lso.beta_1), 0.5)
      self.assertIsInstance(lso.beta_1, variables.Variable)
      self.assertEqual(self.evaluate(lso.lr), 1.0)
      self.assertIs(lso.lr, opt.lr)
      self.assertIs(lso.lr, lso.learning_rate)

      # Writes through the wrapper are seen by the inner optimizer...
      lso.beta_1 = 0.25
      self.assertEqual(self.evaluate(lso.beta_1), 0.25)
      self.assertEqual(self.evaluate(opt.beta_1), 0.25)
      self.assertIs(lso.beta_1, opt.beta_1)
      # ...and writes on the inner optimizer are seen through the wrapper.
      opt.beta_1 = 0.75
      self.assertEqual(self.evaluate(lso.beta_1), 0.75)
      self.assertEqual(self.evaluate(opt.beta_1), 0.75)
      self.assertIs(lso.beta_1, opt.beta_1)
      lso.lr = 2.0
      self.assertEqual(self.evaluate(lso.lr), 2.0)
      self.assertEqual(self.evaluate(lso.learning_rate), 2.0)
      self.assertEqual(self.evaluate(opt.lr), 2.0)
      self.assertEqual(self.evaluate(opt.learning_rate), 2.0)
      self.assertIs(lso.lr, opt.lr)

      # Test setting attribute that is both attribute on LossScaleOptimizer and
      # hyperparameter on wrapped optimizer.
      class MyOpt(gradient_descent.SGD):

        def __init__(self):
          super().__init__()
          self._set_hyper('loss_scale', 123.)

      opt = MyOpt()
      lso = loss_scale_optimizer.LossScaleOptimizer(opt)
      with self.assertRaises(AttributeError):
        lso.loss_scale = 2.
コード例 #21
0
    def testArbitraryAttributesNotExposed(self):
        """Non-hyperparameter inner attributes are not forwarded by the wrapper."""
        inner = gradient_descent.SGD()
        lso = loss_scale_optimizer.LossScaleOptimizer(inner)
        self.assertFalse(inner.nesterov)
        with self.assertRaisesRegex(
                AttributeError,
                "'LossScaleOptimizer' object has no attribute 'nesterov'"):
            lso.nesterov  # pylint: disable=pointless-statement

        # Setting the attribute affects only the wrapper, not the inner opt.
        lso.nesterov = True
        self.assertTrue(lso.nesterov)
        self.assertFalse(inner.nesterov)
コード例 #22
0
 def testGetUnscaledSparseGradients(self):
   """get_unscaled_gradients unscales the values of IndexedSlices grads."""
   lso = loss_scale_optimizer.LossScaleOptimizer(
       gradient_descent.SGD(2.0), dynamic=False, initial_scale=2)
   values = ops.convert_to_tensor_v2_with_dispatch([[4., 2.], [8., 5.]])
   indices = ops.convert_to_tensor_v2_with_dispatch([1, 3], dtype='int32')
   dense_shape = ops.convert_to_tensor_v2_with_dispatch([5, 2], dtype='int32')
   scaled = indexed_slices.IndexedSlices(
       values, indices, dense_shape=dense_shape)
   unscaled = lso.get_unscaled_gradients([scaled])[0]
   # The sparse structure is preserved; only the values are divided.
   self.assertIsInstance(unscaled, indexed_slices.IndexedSlices)
   self.assertAllEqual([[2., 1.], [4., 2.5]], self.evaluate(unscaled.values))
コード例 #23
0
 def testFixedLossScaleAppliedToLossWithGetGradients(self):
     """get_gradients scales the loss up and the gradients back down."""
     with ops.Graph().as_default():
         var = variables.Variable([2.0])
         opt = gradient_descent.SGD(1.0)
         loss_scale = 10.
         opt = loss_scale_optimizer.LossScaleOptimizer(
             opt, dynamic=False, initial_scale=loss_scale)
         grad_check_fn = mp_test_util.create_identity_with_grad_check_fn(
             loss_scale)
         loss = grad_check_fn(var)
         run_op = opt.get_gradients(loss, [var])
         self.evaluate(variables.global_variables_initializer())
         # This will cause an assertion to run, as
         # mp_test_util.create_identity_with_grad_check_fn added an assertion op.
         self.evaluate(run_op)
コード例 #24
0
    def testWeightMethods(self):
        """weights/get_weights/set_weights/variables expose the iterations var."""
        with self.test_session():
            var = variables.Variable([1.0])
            opt = gradient_descent.SGD(1.0)
            opt = loss_scale_optimizer.LossScaleOptimizer(
                opt, initial_scale=2., dynamic_growth_steps=1)
            run_op = opt.minimize(lambda: var * 2, [var])
            self.evaluate(variables.global_variables_initializer())
            self._run_if_in_graph_mode(run_op)

            self.assertLen(opt.weights, 1)  # The 'iterations' weight
            self.assertEqual(self.evaluate(opt.weights[0]), 1)
            self.assertEqual(opt.get_weights()[0], 1)
            self.assertEqual(self.evaluate(opt.variables()[0]), 1)
            # set_weights writes back through the same interface.
            opt.set_weights([np.array(2.)])
            self.assertEqual(self.evaluate(opt.variables()[0]), 2)
コード例 #25
0
    def test_restore_old_loss_scale_checkpoint(self):
        """A TF 2.2-format LossScaleOptimizer checkpoint still restores."""
        # Ensure a checkpoint from TF 2.2 can be loaded. The checkpoint format
        # of LossScaleOptimizer changed, but old checkpoints can still be loaded
        opt = gradient_descent.SGD(0.1, momentum=0.1)
        opt = loss_scale_optimizer.LossScaleOptimizer(opt)
        model = sequential.Sequential([core.Dense(2, )])

        # The checkpoint and expected values were obtained from the program in
        # testdata/BUILD.
        ckpt_dir = os.path.join(flags.FLAGS['test_srcdir'].value,
                                'org_tensorflow/tensorflow/python/keras',
                                'mixed_precision/testdata/lso_ckpt_tf2.2')
        # ckpt_dir = test.test_src_dir_path(
        #     'python/keras/mixed_precision/testdata/lso_ckpt_tf2.2')
        model.load_weights(os.path.join(ckpt_dir, 'ckpt'))
        model.compile(opt,
                      'mse',
                      run_eagerly=testing_utils.should_run_eagerly())
        model(np.zeros((2, 2)))  # Create model weights
        opt._create_all_weights(model.weights)
        expected_kernel = np.array([[9.229685, 10.901115],
                                    [10.370763, 9.757362]])
        expected_slot = np.array([[10.049943, 9.917691], [10.049943,
                                                          9.917691]])
        self.assertAllClose(self.evaluate(model.weights[0]), expected_kernel)
        self.assertAllClose(
            self.evaluate(opt.get_slot(model.weights[0], 'momentum')),
            expected_slot)
        self.assertEqual(self.evaluate(opt.loss_scale), 32768)
        self.assertEqual(self.evaluate(opt.dynamic_counter), 1)

        # Check restoring works even after the model is compiled and the weights
        # have been created.
        model.fit(np.random.normal(size=(2, 2)), np.random.normal(size=(2, 2)))
        self.assertNotAllClose(self.evaluate(model.weights[0]),
                               expected_kernel)
        self.assertNotAllClose(
            self.evaluate(opt.get_slot(model.weights[0], 'momentum')),
            expected_slot)
        model.load_weights(os.path.join(ckpt_dir, 'ckpt'))
        self.assertAllClose(self.evaluate(model.weights[0]), expected_kernel)
        self.assertAllClose(
            self.evaluate(opt.get_slot(model.weights[0], 'momentum')),
            expected_slot)
        self.assertEqual(self.evaluate(opt.loss_scale), 32768)
        self.assertEqual(self.evaluate(opt.dynamic_counter), 1)
コード例 #26
0
  def testDynamicLossScaleWithFloat16Loss(self, strategy_fn):
    """Loss scaling works when the loss tensor is float16."""
    strategy = strategy_fn()
    learning_rate = 2.
    with strategy.scope():
      var = variables.Variable([5.0])
      opt = gradient_descent.SGD(learning_rate)
      opt = loss_scale_optimizer.LossScaleOptimizer(opt, initial_scale=2,
                                                    dynamic_growth_steps=1)

      def loss():
        # Cast to float16 to exercise scaling of a half-precision loss.
        return math_ops.cast(var / strategy.num_replicas_in_sync, 'float16')
      run_fn = lambda: opt.minimize(loss, var_list=[var])
      run_op = strategy.experimental_run(run_fn)
      self.evaluate(variables.global_variables_initializer())
      self._run_if_in_graph_mode(run_op)
      # The loss is the identity of the variable. Therefore the gradient is 1,
      # and so the variable will be init_val - grad * lr == 5 - 1 * 2 == 3
      self.assertAllClose([3.], self.evaluate(var))
コード例 #27
0
    def testClipping(self, strategy_fn):
        """Gradient clipping composes with dynamic loss scaling.

        Non-finite gradients must still skip the step rather than be clipped.
        """
        strategy = strategy_fn()
        learning_rate = 2.
        for clip_type in ('clipnorm', 'global_clipnorm', 'clipvalue'):
            with strategy.scope(), self.subTest(clip_type=clip_type):
                var = variables.Variable([5.0])
                opt = gradient_descent.SGD(learning_rate, **{clip_type: 2.0})
                opt = loss_scale_optimizer.LossScaleOptimizer(
                    opt, initial_scale=2, dynamic_growth_steps=1)
                self.assertEqual(getattr(opt, clip_type), 2.0)
                self.assertEqual(
                    opt.initial_scale % strategy.num_replicas_in_sync, 0)

                loss = lambda: var * 4 / strategy.num_replicas_in_sync
                run_fn = lambda: opt.minimize(loss, var_list=[var])

                # Test running with clipped gradients
                run_op = strategy.experimental_run(run_fn)
                self.evaluate(variables.global_variables_initializer())
                self._run_if_in_graph_mode(run_op)
                # The gradient is 4 but is clipped to 2, so the variable will be
                # init_val - clipped_grad * lr == 5 - 2 * 2 == 1
                self.assertAllClose([1.], self.evaluate(var))
                self.assertEqual(self.evaluate(opt.loss_scale), 4)

                # Test changing the clip amount and running again
                setattr(opt, clip_type, 3.0)
                run_op = strategy.experimental_run(run_fn)
                self._run_if_in_graph_mode(run_op)
                # The gradient is 4 but is clipped to 3, so the variable will be
                # prev_var - clipped_grad * lr == 1 - 3 * 2 == -5
                self.assertAllClose([-5.], self.evaluate(var))
                self.assertEqual(self.evaluate(opt.loss_scale), 8)

                # Test Inf gradients are still skipped instead of being clipped
                loss = lambda: var * float('Inf')
                run_fn = lambda: opt.minimize(loss, var_list=[var])
                run_op = strategy.experimental_run(run_fn)
                self._run_if_in_graph_mode(run_op)
                self.assertAllClose([-5.],
                                    self.evaluate(var))  # Var does not change
                self.assertEqual(self.evaluate(opt.loss_scale), 4)
コード例 #28
0
 def testFixedLossScaleAppliedToLossWithMinimize(self, strategy_fn):
     """minimize() scales the loss by the fixed scale and unscales gradients."""
     with strategy_fn().scope() as strategy:
         var = variables.Variable([5.0])
         opt = gradient_descent.SGD(2.0)
         loss_scale = 10.
         opt = loss_scale_optimizer.LossScaleOptimizer(
             opt, dynamic=False, initial_scale=loss_scale)
         self.assertEqual(self.evaluate(opt.loss_scale), loss_scale)
         self.assertIsInstance(opt.loss_scale, ops.Tensor)
         # We need num_replicas_in_sync to divide loss_scale, otherwise loss_scale
         # / strategy.num_replicas_in_sync will not be exact, which could lead to
         # assertion failures due to rounding issues.
         self.assertEqual(loss_scale % strategy.num_replicas_in_sync, 0)
         run_fn = self._run_fn_with_grad_check(
             strategy, var, opt, loss_scale / strategy.num_replicas_in_sync)
         run_op = strategy.experimental_run(run_fn)
         self.evaluate(variables.global_variables_initializer())
         self._run_if_in_graph_mode(run_op)
         # The loss is the identity of the variable. Therefore the gradient is 1,
         # and so the variable will be init_val - grad * lr == 5 - 1 * 2 == 3
         self.assertAllClose([3.], self.evaluate(var))
コード例 #29
0
    def test_fixed_loss_scaling(self, strategy_fn):
        """End-to-end Model.fit with a fixed loss scale; gradients unscale exactly."""
        # Note: We do not test mixed precision in this method, only loss scaling.
        loss_scale = 8.
        batch_size = 4
        with strategy_fn().scope():
            x = layers.Input(shape=(1, ), batch_size=batch_size)
            layer = mp_test_util.MultiplyLayer()
            y = layer(x)

            # The gradient of 'y' at this point is 1. With loss scaling, the gradient
            # is 'loss_scale'. We divide by the batch size since the loss is averaged
            # across batch elements.
            expected_gradient = loss_scale / batch_size
            identity_with_grad_check_fn = (
                mp_test_util.create_identity_with_grad_check_fn(
                    [expected_gradient]))
            y = core.Lambda(identity_with_grad_check_fn)(y)
            model = models.Model(inputs=x, outputs=y)

            def loss_fn(y_true, y_pred):
                # Mean of the predictions; the labels are irrelevant here.
                del y_true
                return math_ops.reduce_mean(y_pred)

            opt = gradient_descent.SGD(1.)
            opt = loss_scale_optimizer.LossScaleOptimizer(
                opt, dynamic=False, initial_scale=loss_scale)
            model.compile(opt,
                          loss=loss_fn,
                          run_eagerly=testing_utils.should_run_eagerly())

        self.assertEqual(backend.eval(layer.v), 1)
        x = np.ones((batch_size, 1))
        y = np.ones((batch_size, 1))
        dataset = dataset_ops.Dataset.from_tensor_slices(
            (x, y)).batch(batch_size)
        model.fit(dataset)
        # Variable starts at 1, and should have gradient of 1 subtracted from it.
        expected = 0
        self.assertEqual(backend.eval(layer.v), expected)
コード例 #30
0
    def testSerializationWithCustomOptimizer(self):
        """(De)serialization preserves a custom SGD subclass via custom_objects."""
        class MySGD(gradient_descent.SGD):
            def __init__(self, *args, **kwargs):
                super(MySGD, self).__init__(*args, **kwargs)
                # Attribute set in __init__; must survive the round trip.
                self.my_attribute = 123

        opt = MySGD(2., momentum=0.5)
        opt = loss_scale_optimizer.LossScaleOptimizer(opt,
                                                      initial_scale=2.,
                                                      dynamic_growth_steps=3.)
        config = optimizers.serialize(opt)
        custom_objects = {'MySGD': MySGD}
        opt = optimizers.deserialize(config, custom_objects=custom_objects)
        # Force hyperparameters to be created
        opt.lr  # pylint: disable=pointless-statement
        self.evaluate(variables.global_variables_initializer())

        self.assertEqual(self.evaluate(opt.lr), 2.)
        self.assertEqual(self.evaluate(opt.inner_optimizer.momentum), 0.5)
        self.assertEqual(self.evaluate(opt.loss_scale), 2.)
        self.assertEqual(opt.dynamic_growth_steps, 3.)
        self.assertEqual(opt.inner_optimizer.my_attribute, 123)