def testErrorWrappingSameOptimizerMultipleTimes(self):
  """Wrapping the same inner optimizer twice must raise a ValueError."""
  inner_opt = gradient_descent.SGD()
  # First wrap succeeds; the optimizer is now marked as wrapped.
  loss_scale_optimizer.LossScaleOptimizer(inner_opt)
  with self.assertRaisesRegex(
      ValueError,
      '"inner_optimizer" is already wrapped by a LossScaleOptimizer.'):
    loss_scale_optimizer.LossScaleOptimizer(inner_opt)
def testInvalidArgsWithFixedLossScale(self):
  """Invalid constructor argument combinations with dynamic=False raise."""
  opt = gradient_descent.SGD()
  # A fixed (non-dynamic) loss scale requires an explicit initial_scale.
  with self.assertRaisesRegex(
      ValueError,
      '"initial_scale" must be specified if "dynamic" is False'):
    loss_scale_optimizer.LossScaleOptimizer(opt, dynamic=False)
  opt = gradient_descent.SGD()
  # dynamic_growth_steps only makes sense for dynamic loss scaling.
  with self.assertRaisesRegex(
      ValueError, '"dynamic_growth_steps" must be None if "dynamic" is '
      'False, but got: 2'):
    loss_scale_optimizer.LossScaleOptimizer(
        opt, dynamic=False, initial_scale=1, dynamic_growth_steps=2)
def testUnsupportedStrategy(self):
  """CentralStorageStrategy is rejected at construction and at run time."""
  strategy = central_storage_strategy.CentralStorageStrategy()
  expected_error = (
      'Loss scaling is not supported with the tf.distribute.Strategy: '
      'CentralStorageStrategy. Try using a different Strategy, e.g. a '
      'MirroredStrategy')
  # Creating the optimizer inside the unsupported strategy's scope fails.
  with strategy.scope(), self.assertRaisesRegex(ValueError, expected_error):
    loss_scale_optimizer.LossScaleOptimizer(gradient_descent.SGD())
  # Creating it outside the scope but running it inside the scope also fails.
  opt = loss_scale_optimizer.LossScaleOptimizer(gradient_descent.SGD())
  with strategy.scope():
    var = variables.Variable(1.0)
    loss = lambda: var * 2.0
    run_fn = lambda: opt.minimize(loss, [var])
    with self.assertRaisesRegex(ValueError, expected_error):
      strategy.experimental_run(run_fn)
def testDynamicAttrsWithFixedLossScale(self):
  """A fixed loss scale reports dynamic=False and no dynamic attributes."""
  inner = gradient_descent.SGD()
  lso = loss_scale_optimizer.LossScaleOptimizer(
      inner, dynamic=False, initial_scale=2.)
  self.assertFalse(lso.dynamic)
  # The dynamic-only attributes are None when the loss scale is fixed.
  self.assertIsNone(lso.dynamic_counter)
  self.assertIsNone(lso.dynamic_growth_steps)
def testDynamicLossScale(self, strategy_fn):
  """Runs dynamic loss scaling end to end under a distribution strategy."""
  strategy = strategy_fn()
  learning_rate = 2.
  # The per-replica gradient the grad-check helper should observe.
  expected_gradient = variables.Variable(learning_rate /
                                         strategy.num_replicas_in_sync)
  with strategy.scope():
    var = variables.Variable([5.0])
    opt = gradient_descent.SGD(learning_rate)
    opt = loss_scale_optimizer.LossScaleOptimizer(
        opt, initial_scale=2, dynamic_growth_steps=1)
    # Constructor args are normalized to plain Python float/int.
    self.assertEqual(opt.initial_scale, 2.)
    self.assertIsInstance(opt.initial_scale, float)
    self.assertEqual(opt.dynamic_growth_steps, 1)
    self.assertIsInstance(opt.dynamic_growth_steps, int)

    self.assertEqual(opt.initial_scale % strategy.num_replicas_in_sync, 0)
    run_fn = self._run_fn_with_grad_check(strategy, var, opt,
                                          expected_gradient)
    run_op = strategy.experimental_run(run_fn)
    self.evaluate(variables.global_variables_initializer())
    self._run_if_in_graph_mode(run_op)
    # The loss is the identity of the variable. Therefore the gradient is 1,
    # and so the variable will be init_val - grad * lr == 5 - 1 * 2 == 3
    self.assertAllClose([3.], self.evaluate(var))

    # Loss scale will be double, so the expected gradient is also doubled.
    self.evaluate(
        expected_gradient.assign(2 * learning_rate /
                                 strategy.num_replicas_in_sync))
    run_op = strategy.experimental_run(run_fn)
    self._run_if_in_graph_mode(run_op)
    # As before, the 2 is subtracted from the variable, making its new value
    # 1.
    self.assertAllClose([1.], self.evaluate(var))
def testDynamicLossScaleDefaultValues(self):
  """Default dynamic scaling starts at 2**15 and grows every 2000 steps."""
  lso = loss_scale_optimizer.LossScaleOptimizer(gradient_descent.SGD())
  self.assertEqual(lso.initial_scale, 2**15)
  self.assertEqual(lso.dynamic_growth_steps, 2000)
  self.evaluate(variables.global_variables_initializer())
  # Before any steps run, the current scale equals the initial scale.
  self.assertEqual(self.evaluate(lso.loss_scale), 2**15)
def testDynamicLossScaleWithSlots(self, strategy_fn):
  """Tests dynamic loss scaling with an optimizer that has slot variables."""
  strategy_obj = strategy_fn()
  if (isinstance(strategy_obj, mirrored_strategy.MirroredStrategy) and
      control_flow_v2_toggles.control_flow_v2_enabled() and
      not context.executing_eagerly()):
    self.skipTest('b/138667997')
  with strategy_obj.scope() as strategy:
    var = variables.Variable([1.0, 2.0])
    # An SGD optimizer with momentum has slot variables.
    opt = gradient_descent.SGD(1.0, momentum=1.)
    initial_scale = 2.
    opt = loss_scale_optimizer.LossScaleOptimizer(
        opt, initial_scale=initial_scale, dynamic_growth_steps=1)
    loss = lambda: var / strategy.num_replicas_in_sync
    run_fn = lambda: opt.minimize(loss, var_list=[var])
    run_op = strategy.experimental_run(run_fn)
    self.evaluate(variables.global_variables_initializer())
    self._run_if_in_graph_mode(run_op)
    # The momentum accumulator starts at 0 and the gradient is 1. The
    # accumulator is incremented by the gradient, so it is now 1. Then the
    # variable is subtracted by the accumulator, so the variable is subtracted
    # by 1.
    self.assertAllClose([0.0, 1.0], self.evaluate(var))
    # With dynamic_growth_steps=1, one finite step doubles the loss scale.
    self.assertEqual(self.evaluate(opt.loss_scale), initial_scale * 2)

    run_op = strategy.experimental_run(run_fn)
    self._run_if_in_graph_mode(run_op)
    # The momentum accumulator was 1 before this step and the gradient is 1.
    # The accumulator is incremented by the gradient, so it is now 2. Then the
    # variable is subtracted by the accumulator, so the variable is subtracted
    # by 2.
    self.assertAllClose([-2., -1.], self.evaluate(var))
    self.assertEqual(self.evaluate(opt.loss_scale), initial_scale * 4)
    self.assertEqual(opt.get_slot_names(), ['momentum'])
def testSerializationWithBuiltInOptimizer(self, use_v1):
  """Serializing then deserializing a LossScaleOptimizer preserves state.

  Args:
    use_v1: if True, serialize a LossScaleOptimizerV1; otherwise a V2
      LossScaleOptimizer. Either way, deserialization yields a V2 optimizer.
  """
  opt = gradient_descent.SGD(2., momentum=0.5)
  if use_v1:
    loss_scale = tf_loss_scale_module.DynamicLossScale(
        initial_loss_scale=2., increment_period=3.)
    opt = loss_scale_optimizer.LossScaleOptimizerV1(opt, loss_scale)
  else:
    opt = loss_scale_optimizer.LossScaleOptimizer(
        opt, initial_scale=2., dynamic_growth_steps=3.)
  config = optimizers.serialize(opt)
  opt = optimizers.deserialize(config)
  # Force hyperparameters to be created
  opt.lr  # pylint: disable=pointless-statement
  self.evaluate(variables.global_variables_initializer())

  self.assertEqual(self.evaluate(opt.lr), 2.)
  self.assertEqual(self.evaluate(opt.inner_optimizer.momentum), 0.5)
  self.assertEqual(self.evaluate(opt.loss_scale), 2.)
  self.assertEqual(opt.dynamic_growth_steps, 3.)
  # Fixed: the original called `self.assertTrue(opt.dynamic, 4.)`, which
  # passed the stray `4.` as the ignored `msg` argument of assertTrue.
  self.assertTrue(opt.dynamic)
  # Deserializing a LossScaleOptimizer always results in a V2
  # LossScaleOptimizer, even if serialized with a LossScaleOptimizerV1.
  self.assertAllEqual(type(opt), loss_scale_optimizer.LossScaleOptimizer)

  # Ensure the optimizer can be used
  var = variables.Variable([5.0])
  run_op = self._run_fn_with_grad_check(
      distribution_strategy_context.get_strategy(), var, opt, 2)()
  self.evaluate(variables.global_variables_initializer())
  self._run_if_in_graph_mode(run_op)
  self.assertEqual(self.evaluate(var), [3.])
  self.assertEqual(self.evaluate(opt.dynamic_counter), 1)
def testNanOnOneReplicaOnly(self):
  """A NaN gradient on a single replica must skip the update on all of them."""
  if not test_util.is_gpu_available():
    self.skipTest('Test requires GPU')
  if (not context.executing_eagerly() and
      not control_flow_v2_toggles.control_flow_v2_enabled()):
    self.skipTest('b/181283011: GradientTape does not work properly with '
                  'V1 control flow, and opt.minimize uses GradientTape')
  with create_mirrored_strategy().scope() as strategy:
    var = variables.Variable([1.0, 2.0])
    opt = gradient_descent.SGD(1.0)
    opt = loss_scale_optimizer.LossScaleOptimizer(opt, initial_scale=2,
                                                  dynamic_growth_steps=2)

    def loss():
      rep_id = (distribution_strategy_context.get_replica_context().
                replica_id_in_sync_group)
      # The last element of last replica's gradient is NaN.
      return control_flow_ops.cond(
          constant_op.constant(rep_id == 0), lambda: var * 2.,
          lambda: var * constant_op.constant([1., float('NaN')]))

    run_fn = lambda: opt.minimize(loss, var_list=[var])
    run_op = strategy.experimental_run(run_fn)
    self.evaluate(variables.global_variables_initializer())
    self._run_if_in_graph_mode(run_op)
    # Variable should not change from before, due to NaN gradients.
    self.assertAllClose(self.evaluate(var), [1.0, 2.0])
    # Loss scale should half due to NaN gradients.
    self.assertEqual(1., self.evaluate(opt.loss_scale))
def testDynamicMustBeBool(self):
  """Constructing with a non-bool `dynamic` argument raises TypeError."""
  inner_opt = gradient_descent.SGD()
  expected_error = ('"dynamic" argument to LossScaleOptimizer.__init__ must '
                    "be a bool, but got: 'dynamic'")
  with self.assertRaisesRegex(TypeError, expected_error):
    loss_scale_optimizer.LossScaleOptimizer(inner_opt, 'dynamic')
def testApplyGradientsGetsUnwrappedTensors(self):
  # Tests that gradients passed to apply_gradients are not wrapped in a
  # DistributionStrategy wrapper, such as PerReplica, but instead are raw
  # Tensors. Optimizer subclasses that override apply_gradients() expect raw
  # Tensors, even though the base Optimizer can handle PerReplica gradients.

  outer_self = self

  class MyOptimizer(gradient_descent.SGD):
    # Overrides apply_gradients to assert each gradient is a raw Tensor
    # before delegating to the real SGD implementation.

    def apply_gradients(self,
                        grads_and_vars,
                        name=None,
                        experimental_aggregate_gradients=True):
      for grad, _ in grads_and_vars:
        outer_self.assertIsInstance(grad, ops.Tensor)
      return super(MyOptimizer,
                   self).apply_gradients(grads_and_vars, name,
                                         experimental_aggregate_gradients)

  with create_mirrored_strategy().scope() as strategy:
    var = variables.Variable([5.0])
    opt = MyOptimizer(learning_rate=1.0)
    opt = loss_scale_optimizer.LossScaleOptimizer(opt, dynamic=False,
                                                  initial_scale=1)
    loss = lambda: var * 2.0
    run_fn = lambda: opt.minimize(loss, [var])
    strategy.experimental_run(run_fn)
def testIterations(self):
  """Setting `iterations` on the wrapper also updates the inner optimizer."""
  inner_opt = gradient_descent.SGD(2.0)
  wrapper = loss_scale_optimizer.LossScaleOptimizer(
      inner_opt, dynamic=False, initial_scale=10.)
  wrapper.iterations = 7
  # The iteration counter is shared between wrapper and wrapped optimizer.
  self.assertEqual(wrapper.iterations, 7)
  self.assertEqual(inner_opt.iterations, 7)
def test_save_slot_variables_with_autocast_vars(self,
                                                strategy_fn,
                                                var_name='v'):
  """Slot variables of AutoCastVariables survive a checkpoint round trip."""
  p = policy.Policy('mixed_float16')
  with strategy_fn().scope(), policy.policy_scope(p):
    x = layers.Input(shape=(2,), batch_size=2)
    # Having a var_name other than 'v' tests that a fixed bug (b/134713714)
    # does not reoccur. The bug was that a crash would occur when saving a
    # checkpoint where an AutoCastVariable with a slot variable would have a
    # different name than the layer attribute's name (layer.v in this case).
    layer = mp_test_util.MultiplyLayer(assert_type=dtypes.float16,
                                       var_name=var_name)
    y = layer(x)
    model = models.Model(inputs=x, outputs=y)
    opt = gradient_descent.SGD(1., 1.)
    opt = loss_scale_optimizer.LossScaleOptimizer(opt, dynamic=False,
                                                  initial_scale=1)
    model.compile(optimizer=opt,
                  loss='mse',
                  run_eagerly=testing_utils.should_run_eagerly())

  # Train once, then snapshot the weights and the momentum slot value.
  model.fit(np.ones((2, 2)), np.zeros((2, 2)), batch_size=2)
  weights_file = os.path.join(self.get_temp_dir(), 'weights')
  model.save_weights(weights_file)
  saved_slot = backend.get_value(opt.get_slot(layer.v, 'momentum'))

  # Another training step changes the slot value.
  model.fit(np.ones((2, 2)), np.zeros((2, 2)), batch_size=2)
  new_slot = backend.get_value(opt.get_slot(layer.v, 'momentum'))
  self.assertNotEqual(new_slot, saved_slot)

  # Restoring the checkpoint must bring back the saved slot value.
  model.load_weights(weights_file)
  restored_slot = backend.get_value(opt.get_slot(layer.v, 'momentum'))
  self.assertEqual(restored_slot, saved_slot)
def testDynamicUpdate(self, strategy_fn):
  """Loss scale doubles on finite gradients and halves on NaN gradients."""
  with strategy_fn().scope() as strategy:
    var = variables.Variable([1.0, 2.0])
    opt = gradient_descent.SGD(1.0)
    opt = loss_scale_optimizer.LossScaleOptimizer(opt, initial_scale=2,
                                                  dynamic_growth_steps=1)

    # Test optimizer with finite gradients
    loss = lambda: var * 2.0 / strategy.num_replicas_in_sync
    run_fn = lambda: opt.minimize(loss, var_list=[var])
    run_op = strategy.experimental_run(run_fn)
    self.evaluate(variables.global_variables_initializer())
    self._run_if_in_graph_mode(run_op)
    # Gradient is 2, so variable will have 2 subtracted from it
    self.assertAllClose([-1.0, 0.0], self.evaluate(var))
    # Loss scale has doubled from 2 to 4
    self.assertEqual(4., self.evaluate(opt.loss_scale))

    # Test optimizer with NaN gradients
    loss = lambda: var * float('NaN')
    run_fn = lambda: opt.minimize(loss, var_list=[var])
    run_op = strategy.experimental_run(run_fn)
    self._run_if_in_graph_mode(run_op)
    # Variable should not change from before, due to NaN gradients.
    self.assertAllClose(self.evaluate(var), [-1.0, 0.0])
    # Loss scale should half due to NaN gradients.
    self.assertEqual(2., self.evaluate(opt.loss_scale))
def testGetConfigFixed(self, get_config, from_config):
  # Get a config from LossScaleOptimizerV1, LossScaleOptimizer, or the
  # LossScaleOptimizer from TF 2.3. Then restore the config into a
  # LossScaleOptimizerV1 or LossScaleOptimizer
  opt = gradient_descent.SGD(2., momentum=0.5)
  if get_config == 'v1':
    opt = loss_scale_optimizer.LossScaleOptimizerV1(opt, 2)
    config = opt.get_config()
  elif get_config == 'v2':
    opt = loss_scale_optimizer.LossScaleOptimizer(
        opt, dynamic=False, initial_scale=2)
    config = opt.get_config()
  else:
    self.assertEqual(get_config, 'tf2_3')
    # Hard-coded config in the format produced by TF 2.3, to ensure configs
    # saved by old TF versions can still be restored.
    config = {
        'optimizer': {
            'class_name': 'SGD',
            'config': {
                'learning_rate': 2.0,
                'momentum': 0.5,
                'decay': 0.0,
                'nesterov': False,
                'name': 'SGD',
            }
        },
        'loss_scale': {
            'class_name': 'FixedLossScale',
            'config': {'loss_scale_value': 2.0}
        },
    }

  if from_config == 'v1':
    opt = loss_scale_optimizer.LossScaleOptimizerV1.from_config(config)
  else:
    self.assertEqual(from_config, 'v2')
    opt = loss_scale_optimizer.LossScaleOptimizer.from_config(config)

  # Force hyperparameters to be created
  opt.lr  # pylint: disable=pointless-statement
  self.evaluate(variables.global_variables_initializer())

  # Test attributes on the optimizer
  self.assertEqual(self.evaluate(opt.lr), 2.)
  self.assertEqual(self.evaluate(opt.inner_optimizer.lr), 2.)
  self.assertEqual(self.evaluate(opt.momentum), 0.5)
  self.assertEqual(self.evaluate(opt.loss_scale), 2.)
  self.assertEqual(opt.initial_scale, 2.)
  self.assertIsNone(opt.dynamic_growth_steps)
  self.assertIsNone(opt.dynamic_counter)
  self.assertFalse(opt.dynamic)

  # Ensure the optimizer can be used
  var = variables.Variable([5.0])
  run_op = self._run_fn_with_grad_check(
      distribution_strategy_context.get_strategy(), var, opt, 2)()
  self.evaluate(variables.global_variables_initializer())
  self._run_if_in_graph_mode(run_op)
  self.assertEqual(self.evaluate(var), [3.])
def testDir(self):
  """dir() on the wrapper lists hyperparameters but not inner-only attrs."""
  wrapper = loss_scale_optimizer.LossScaleOptimizer(gradient_descent.SGD())
  listed = dir(wrapper)
  # Hyperparameters and the wrapper's own attributes all appear.
  for expected in ('learning_rate', 'lr', 'minimize', 'loss_scale'):
    self.assertIn(expected, listed)
  # 'nesterov' exists only on the wrapped optimizer, not the wrapper.
  self.assertNotIn('nesterov', listed)
  self.assertIn('nesterov', dir(wrapper.inner_optimizer))
def testGetScaledLoss(self):
  """get_scaled_loss multiplies tensor or callable losses by the scale."""
  lso = loss_scale_optimizer.LossScaleOptimizer(
      gradient_descent.SGD(2.0), dynamic=False, initial_scale=2.)
  # Both float32 and float16 losses are scaled by 2, whether passed as a
  # tensor or as a zero-argument callable returning the loss.
  for dtype in ('float32', 'float16'):
    loss = ops.convert_to_tensor_v2_with_dispatch(5., dtype=dtype)
    self.assertEqual(10., self.evaluate(lso.get_scaled_loss(loss)))
    self.assertEqual(10.,
                     self.evaluate(lso.get_scaled_loss(lambda: loss)()))
def test_loss_scale_optimizer_overrides_policy_v1_loss_scale(self):
  """An explicit LossScaleOptimizer takes precedence over the policy's scale."""
  with policy.policy_scope(policy.PolicyV1('float32', loss_scale=10.)):
    opt = gradient_descent.SGD(1.)
    opt = loss_scale_optimizer.LossScaleOptimizer(opt, dynamic=False,
                                                  initial_scale=5.)
    x = layers.Input(shape=(1,))
    y = mp_test_util.MultiplyLayer()(x)
    model = models.Model(x, y)
    model.compile(opt, loss='mse')
    # The optimizer's own scale (5), not the policy's (10), must win.
    self.assertEqual(self.evaluate(model.optimizer.loss_scale), 5.)
def testGetUnscaledGradients(self):
  """get_unscaled_gradients divides by the scale and passes None through."""
  lso = loss_scale_optimizer.LossScaleOptimizer(
      gradient_descent.SGD(2.0), dynamic=False, initial_scale=2)
  scaled = [
      ops.convert_to_tensor_v2_with_dispatch(3.),
      None,
      ops.convert_to_tensor_v2_with_dispatch(-4., dtype='float16'),
  ]
  unscaled = []
  for grad in lso.get_unscaled_gradients(scaled):
    unscaled.append(None if grad is None else self.evaluate(grad))
  # Each gradient is divided by the loss scale of 2; None is untouched.
  self.assertEqual([1.5, None, -2.], unscaled)
def testHyperParametersExposed(self):
  """Inner-optimizer hyperparameters are readable and settable on the LSO."""
  with self.cached_session():
    opt = adam.Adam(learning_rate=1.0, beta_1=0.5, beta_2=0.9)
    lso = loss_scale_optimizer.LossScaleOptimizer(opt)
    # Force hyperparameters to be created
    opt.lr  # pylint: disable=pointless-statement
    self.evaluate(variables.global_variables_initializer())

    # Reads through the wrapper proxy to the same underlying variables.
    self.assertEqual(self.evaluate(lso.beta_1), 0.5)
    self.assertIsInstance(lso.beta_1, variables.Variable)
    self.assertEqual(self.evaluate(lso.lr), 1.0)
    self.assertIs(lso.lr, opt.lr)
    self.assertIs(lso.lr, lso.learning_rate)

    # Writes through either side are visible on both sides.
    lso.beta_1 = 0.25
    self.assertEqual(self.evaluate(lso.beta_1), 0.25)
    self.assertEqual(self.evaluate(opt.beta_1), 0.25)
    self.assertIs(lso.beta_1, opt.beta_1)
    opt.beta_1 = 0.75
    self.assertEqual(self.evaluate(lso.beta_1), 0.75)
    self.assertEqual(self.evaluate(opt.beta_1), 0.75)
    self.assertIs(lso.beta_1, opt.beta_1)
    lso.lr = 2.0
    self.assertEqual(self.evaluate(lso.lr), 2.0)
    self.assertEqual(self.evaluate(lso.learning_rate), 2.0)
    self.assertEqual(self.evaluate(opt.lr), 2.0)
    self.assertEqual(self.evaluate(opt.learning_rate), 2.0)
    self.assertIs(lso.lr, opt.lr)

    # Test setting attribute that is both attribute on LossScaleOptimizer and
    # hyperparameter on wrapped optimizer.
    class MyOpt(gradient_descent.SGD):

      def __init__(self):
        super().__init__()
        self._set_hyper('loss_scale', 123.)

    opt = MyOpt()
    lso = loss_scale_optimizer.LossScaleOptimizer(opt)
    with self.assertRaises(AttributeError):
      lso.loss_scale = 2.
def testArbitraryAttributesNotExposed(self):
  """Non-hyperparameter attributes of the inner optimizer are not proxied."""
  inner_opt = gradient_descent.SGD()
  wrapper = loss_scale_optimizer.LossScaleOptimizer(inner_opt)
  self.assertFalse(inner_opt.nesterov)
  # Reading 'nesterov' through the wrapper fails: it is a plain attribute
  # of the inner optimizer, not a hyperparameter.
  expected_error = "'LossScaleOptimizer' object has no attribute 'nesterov'"
  with self.assertRaisesRegex(AttributeError, expected_error):
    wrapper.nesterov  # pylint: disable=pointless-statement
  # Assigning creates an attribute on the wrapper only; the inner optimizer
  # is left untouched.
  wrapper.nesterov = True
  self.assertTrue(wrapper.nesterov)
  self.assertFalse(inner_opt.nesterov)
def testGetUnscaledSparseGradients(self):
  """IndexedSlices gradients are unscaled value-wise, preserving the type."""
  lso = loss_scale_optimizer.LossScaleOptimizer(
      gradient_descent.SGD(2.0), dynamic=False, initial_scale=2)
  values = ops.convert_to_tensor_v2_with_dispatch([[4., 2.], [8., 5.]])
  indices = ops.convert_to_tensor_v2_with_dispatch([1, 3], dtype='int32')
  shape = ops.convert_to_tensor_v2_with_dispatch([5, 2], dtype='int32')
  scaled = indexed_slices.IndexedSlices(values, indices, dense_shape=shape)
  unscaled, = lso.get_unscaled_gradients([scaled])
  self.assertIsInstance(unscaled, indexed_slices.IndexedSlices)
  # Only the values are divided by the loss scale of 2.
  self.assertAllEqual([[2., 1.], [4., 2.5]],
                      self.evaluate(unscaled.values))
def testFixedLossScaleAppliedToLossWithGetGradients(self):
  """get_gradients scales the loss, then returns unscaled gradients."""
  with ops.Graph().as_default():
    var = variables.Variable([2.0])
    opt = gradient_descent.SGD(1.0)
    loss_scale = 10.
    opt = loss_scale_optimizer.LossScaleOptimizer(opt, dynamic=False,
                                                  initial_scale=loss_scale)
    # The helper asserts inside the graph that the backpropagated gradient
    # equals loss_scale.
    grad_check_fn = mp_test_util.create_identity_with_grad_check_fn(
        loss_scale)
    loss = grad_check_fn(var)
    run_op = opt.get_gradients(loss, [var])
    self.evaluate(variables.global_variables_initializer())
    # This will cause an assertion to run, as
    # mp_test_util.create_identity_with_grad_check_fn added an assertion op.
    self.evaluate(run_op)
def testWeightMethods(self):
  """weights/get_weights/set_weights/variables all work on the wrapper."""
  with self.test_session():
    var = variables.Variable([1.0])
    opt = gradient_descent.SGD(1.0)
    opt = loss_scale_optimizer.LossScaleOptimizer(opt, initial_scale=2.,
                                                  dynamic_growth_steps=1)
    run_op = opt.minimize(lambda: var * 2, [var])
    self.evaluate(variables.global_variables_initializer())
    self._run_if_in_graph_mode(run_op)

    self.assertLen(opt.weights, 1)  # The 'iterations' weight
    self.assertEqual(self.evaluate(opt.weights[0]), 1)
    self.assertEqual(opt.get_weights()[0], 1)
    self.assertEqual(self.evaluate(opt.variables()[0]), 1)
    opt.set_weights([np.array(2.)])
    self.assertEqual(self.evaluate(opt.variables()[0]), 2)
def test_restore_old_loss_scale_checkpoint(self):
  # Ensure a checkpoint from TF 2.2 can be loaded. The checkpoint format
  # of LossScaleOptimizer changed, but old checkpoints can still be loaded
  opt = gradient_descent.SGD(0.1, momentum=0.1)
  opt = loss_scale_optimizer.LossScaleOptimizer(opt)
  model = sequential.Sequential([core.Dense(2,)])

  # The checkpoint and expected values were obtained from the program in
  # testdata/BUILD.
  ckpt_dir = os.path.join(flags.FLAGS['test_srcdir'].value,
                          'org_tensorflow/tensorflow/python/keras',
                          'mixed_precision/testdata/lso_ckpt_tf2.2')
  # ckpt_dir = test.test_src_dir_path(
  #     'python/keras/mixed_precision/testdata/lso_ckpt_tf2.2')
  model.load_weights(os.path.join(ckpt_dir, 'ckpt'))
  model.compile(opt, 'mse', run_eagerly=testing_utils.should_run_eagerly())
  model(np.zeros((2, 2)))  # Create model weights
  opt._create_all_weights(model.weights)
  expected_kernel = np.array([[9.229685, 10.901115], [10.370763, 9.757362]])
  expected_slot = np.array([[10.049943, 9.917691], [10.049943, 9.917691]])
  self.assertAllClose(self.evaluate(model.weights[0]), expected_kernel)
  self.assertAllClose(
      self.evaluate(opt.get_slot(model.weights[0], 'momentum')),
      expected_slot)
  self.assertEqual(self.evaluate(opt.loss_scale), 32768)
  self.assertEqual(self.evaluate(opt.dynamic_counter), 1)

  # Check restoring works even after the model is compiled and the weights
  # have been created.
  model.fit(np.random.normal(size=(2, 2)), np.random.normal(size=(2, 2)))
  self.assertNotAllClose(self.evaluate(model.weights[0]), expected_kernel)
  self.assertNotAllClose(
      self.evaluate(opt.get_slot(model.weights[0], 'momentum')),
      expected_slot)
  model.load_weights(os.path.join(ckpt_dir, 'ckpt'))
  self.assertAllClose(self.evaluate(model.weights[0]), expected_kernel)
  self.assertAllClose(
      self.evaluate(opt.get_slot(model.weights[0], 'momentum')),
      expected_slot)
  self.assertEqual(self.evaluate(opt.loss_scale), 32768)
  self.assertEqual(self.evaluate(opt.dynamic_counter), 1)
def testDynamicLossScaleWithFloat16Loss(self, strategy_fn):
  """Dynamic loss scaling also works when the loss itself is float16."""
  strategy = strategy_fn()
  learning_rate = 2.
  with strategy.scope():
    var = variables.Variable([5.0])
    opt = gradient_descent.SGD(learning_rate)
    opt = loss_scale_optimizer.LossScaleOptimizer(opt, initial_scale=2,
                                                  dynamic_growth_steps=1)

    def loss():
      return math_ops.cast(var / strategy.num_replicas_in_sync, 'float16')

    run_fn = lambda: opt.minimize(loss, var_list=[var])
    run_op = strategy.experimental_run(run_fn)
    self.evaluate(variables.global_variables_initializer())
    self._run_if_in_graph_mode(run_op)
    # The loss is the identity of the variable. Therefore the gradient is 1,
    # and so the variable will be init_val - grad * lr == 5 - 1 * 2 == 3
    self.assertAllClose([3.], self.evaluate(var))
def testClipping(self, strategy_fn):
  """Clipping applies to unscaled gradients; Inf still skips the update."""
  strategy = strategy_fn()
  learning_rate = 2.
  for clip_type in ('clipnorm', 'global_clipnorm', 'clipvalue'):
    with strategy.scope(), self.subTest(clip_type=clip_type):
      var = variables.Variable([5.0])
      opt = gradient_descent.SGD(learning_rate, **{clip_type: 2.0})
      opt = loss_scale_optimizer.LossScaleOptimizer(opt, initial_scale=2,
                                                    dynamic_growth_steps=1)
      # The clip setting is forwarded to the wrapper.
      self.assertEqual(getattr(opt, clip_type), 2.0)
      self.assertEqual(opt.initial_scale % strategy.num_replicas_in_sync, 0)

      loss = lambda: var * 4 / strategy.num_replicas_in_sync
      run_fn = lambda: opt.minimize(loss, var_list=[var])

      # Test running with clipped gradients
      run_op = strategy.experimental_run(run_fn)
      self.evaluate(variables.global_variables_initializer())
      self._run_if_in_graph_mode(run_op)
      # The gradient is 4 but is clipped to 2, so the variable will be
      # init_val - clipped_grad * lr == 5 - 2 * 2 == 1
      self.assertAllClose([1.], self.evaluate(var))
      self.assertEqual(self.evaluate(opt.loss_scale), 4)

      # Test changing the clip amount and running again
      setattr(opt, clip_type, 3.0)
      run_op = strategy.experimental_run(run_fn)
      self._run_if_in_graph_mode(run_op)
      # The gradient is 4 but is clipped to 3, so the variable will be
      # prev_var - clipped_grad * lr == 1 - 3 * 2 == -5
      self.assertAllClose([-5.], self.evaluate(var))
      self.assertEqual(self.evaluate(opt.loss_scale), 8)

      # Test Inf gradients are still skipped instead of being clipped
      loss = lambda: var * float('Inf')
      run_fn = lambda: opt.minimize(loss, var_list=[var])
      run_op = strategy.experimental_run(run_fn)
      self._run_if_in_graph_mode(run_op)
      self.assertAllClose([-5.], self.evaluate(var))  # Var does not change
      self.assertEqual(self.evaluate(opt.loss_scale), 4)
def testFixedLossScaleAppliedToLossWithMinimize(self, strategy_fn):
  """minimize() scales the loss by a fixed scale and unscales the gradients."""
  with strategy_fn().scope() as strategy:
    var = variables.Variable([5.0])
    opt = gradient_descent.SGD(2.0)
    loss_scale = 10.
    opt = loss_scale_optimizer.LossScaleOptimizer(opt, dynamic=False,
                                                  initial_scale=loss_scale)
    self.assertEqual(self.evaluate(opt.loss_scale), loss_scale)
    self.assertIsInstance(opt.loss_scale, ops.Tensor)
    # We need num_replicas_in_sync to divide loss_scale, otherwise loss_scale
    # / strategy.num_replicas_in_sync will not be exact, which could lead to
    # assertion failures due to rounding issues.
    self.assertEqual(loss_scale % strategy.num_replicas_in_sync, 0)
    run_fn = self._run_fn_with_grad_check(
        strategy, var, opt, loss_scale / strategy.num_replicas_in_sync)
    run_op = strategy.experimental_run(run_fn)
    self.evaluate(variables.global_variables_initializer())
    self._run_if_in_graph_mode(run_op)
    # The loss is the identity of the variable. Therefore the gradient is 1,
    # and so the variable will be init_val - grad * lr == 5 - 1 * 2 == 3
    self.assertAllClose([3.], self.evaluate(var))
def test_fixed_loss_scaling(self, strategy_fn):
  """Fixed loss scaling scales gradients during model.fit training."""
  # Note: We do not test mixed precision in this method, only loss scaling.
  loss_scale = 8.
  batch_size = 4
  with strategy_fn().scope():
    x = layers.Input(shape=(1,), batch_size=batch_size)
    layer = mp_test_util.MultiplyLayer()
    y = layer(x)

    # The gradient of 'y' at this point is 1. With loss scaling, the gradient
    # is 'loss_scale'. We divide by the batch size since the loss is averaged
    # across batch elements.
    expected_gradient = loss_scale / batch_size
    identity_with_grad_check_fn = (
        mp_test_util.create_identity_with_grad_check_fn([expected_gradient]))
    y = core.Lambda(identity_with_grad_check_fn)(y)
    model = models.Model(inputs=x, outputs=y)

    def loss_fn(y_true, y_pred):
      del y_true
      return math_ops.reduce_mean(y_pred)

    opt = gradient_descent.SGD(1.)
    opt = loss_scale_optimizer.LossScaleOptimizer(opt, dynamic=False,
                                                  initial_scale=loss_scale)
    model.compile(opt, loss=loss_fn,
                  run_eagerly=testing_utils.should_run_eagerly())

  self.assertEqual(backend.eval(layer.v), 1)
  x = np.ones((batch_size, 1))
  y = np.ones((batch_size, 1))
  dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(batch_size)
  model.fit(dataset)
  # Variable starts at 1, and should have gradient of 1 subtracted from it.
  expected = 0
  self.assertEqual(backend.eval(layer.v), expected)
def testSerializationWithCustomOptimizer(self):
  """A custom inner optimizer round-trips through serialize/deserialize."""

  class MySGD(gradient_descent.SGD):
    # SGD subclass with an extra attribute, to verify custom state survives
    # the serialization round trip.

    def __init__(self, *args, **kwargs):
      super(MySGD, self).__init__(*args, **kwargs)
      self.my_attribute = 123

  opt = MySGD(2., momentum=0.5)
  opt = loss_scale_optimizer.LossScaleOptimizer(opt, initial_scale=2.,
                                                dynamic_growth_steps=3.)
  config = optimizers.serialize(opt)
  custom_objects = {'MySGD': MySGD}
  opt = optimizers.deserialize(config, custom_objects=custom_objects)
  # Force hyperparameters to be created
  opt.lr  # pylint: disable=pointless-statement
  self.evaluate(variables.global_variables_initializer())

  self.assertEqual(self.evaluate(opt.lr), 2.)
  self.assertEqual(self.evaluate(opt.inner_optimizer.momentum), 0.5)
  self.assertEqual(self.evaluate(opt.loss_scale), 2.)
  self.assertEqual(opt.dynamic_growth_steps, 3.)
  # The custom attribute is restored on the deserialized inner optimizer.
  self.assertEqual(opt.inner_optimizer.my_attribute, 123)