def testSerializationWithBuiltInOptimizer(self, use_v1):
  """Checks that serialize()/deserialize() round-trips a loss scale optimizer.

  An SGD(2., momentum=0.5) optimizer is wrapped with an initial loss scale of
  2 and a growth period of 3, serialized with `optimizers.serialize`, restored
  with `optimizers.deserialize`, and then inspected and run for one step.

  Args:
    use_v1: If truthy, the optimizer being serialized is a
      `LossScaleOptimizerV1` built from a `DynamicLossScale`; otherwise it is
      a `LossScaleOptimizer`.
  """
  opt = gradient_descent.SGD(2., momentum=0.5)
  if use_v1:
    loss_scale = tf_loss_scale_module.DynamicLossScale(
        initial_loss_scale=2., increment_period=3.)
    opt = loss_scale_optimizer.LossScaleOptimizerV1(opt, loss_scale)
  else:
    opt = loss_scale_optimizer.LossScaleOptimizer(
        opt, initial_scale=2., dynamic_growth_steps=3.)
  config = optimizers.serialize(opt)
  opt = optimizers.deserialize(config)
  # Force hyperparameters to be created
  opt.lr  # pylint: disable=pointless-statement
  self.evaluate(variables.global_variables_initializer())

  self.assertEqual(self.evaluate(opt.lr), 2.)
  self.assertEqual(self.evaluate(opt.inner_optimizer.momentum), 0.5)
  self.assertEqual(self.evaluate(opt.loss_scale), 2.)
  self.assertEqual(opt.dynamic_growth_steps, 3.)
  # assertTrue's second positional argument is the failure message, not an
  # expected value, so only the flag itself is passed.
  self.assertTrue(opt.dynamic)
  # Deserializing a LossScaleOptimizer always results in a V2
  # LossScaleOptimizer, even if serialized with a LossScaleOptimizerV1.
  self.assertIs(type(opt), loss_scale_optimizer.LossScaleOptimizer)

  # Ensure the optimizer can be used
  var = variables.Variable([5.0])
  run_op = self._run_fn_with_grad_check(
      distribution_strategy_context.get_strategy(), var, opt, 2)()
  self.evaluate(variables.global_variables_initializer())
  self._run_if_in_graph_mode(run_op)
  self.assertEqual(self.evaluate(var), [3.])
  self.assertEqual(self.evaluate(opt.dynamic_counter), 1)
def testGetConfigFixed(self, get_config, from_config):
  """Restores a fixed-loss-scale config and checks the resulting optimizer.

  Args:
    get_config: Where the config comes from. 'v1' takes it from a
      `LossScaleOptimizerV1`, 'v2' from a `LossScaleOptimizer`, and 'tf2_3'
      uses a hand-written dict in the TF 2.3 layout.
    from_config: Which class restores the config: 'v1' for
      `LossScaleOptimizerV1`, 'v2' for `LossScaleOptimizer`.
  """
  sgd = gradient_descent.SGD(2., momentum=0.5)
  if get_config == 'v1':
    source_opt = loss_scale_optimizer.LossScaleOptimizerV1(sgd, 2)
    config = source_opt.get_config()
  elif get_config == 'v2':
    source_opt = loss_scale_optimizer.LossScaleOptimizer(
        sgd, dynamic=False, initial_scale=2)
    config = source_opt.get_config()
  else:
    self.assertEqual(get_config, 'tf2_3')
    # Config written out by hand in the TF 2.3 layout.
    inner_config = {
        'class_name': 'SGD',
        'config': {
            'learning_rate': 2.0,
            'momentum': 0.5,
            'decay': 0.0,
            'nesterov': False,
            'name': 'SGD',
        }
    }
    loss_scale_config = {
        'class_name': 'FixedLossScale',
        'config': {'loss_scale_value': 2.0}
    }
    config = {
        'optimizer': inner_config,
        'loss_scale': loss_scale_config,
    }

  if from_config == 'v1':
    restored = loss_scale_optimizer.LossScaleOptimizerV1.from_config(config)
  else:
    self.assertEqual(from_config, 'v2')
    restored = loss_scale_optimizer.LossScaleOptimizer.from_config(config)

  # Touch `lr` so the hyperparameters exist before initialization.
  restored.lr  # pylint: disable=pointless-statement
  self.evaluate(variables.global_variables_initializer())

  # Hyperparameters are reachable on both the wrapper and the inner optimizer.
  self.assertEqual(self.evaluate(restored.lr), 2.)
  self.assertEqual(self.evaluate(restored.inner_optimizer.lr), 2.)
  self.assertEqual(self.evaluate(restored.momentum), 0.5)
  # Loss-scale attributes of a fixed (non-dynamic) optimizer.
  self.assertEqual(self.evaluate(restored.loss_scale), 2.)
  self.assertEqual(restored.initial_scale, 2.)
  self.assertIsNone(restored.dynamic_growth_steps)
  self.assertIsNone(restored.dynamic_counter)
  self.assertFalse(restored.dynamic)

  # The restored optimizer must still be able to take a step.
  weight = variables.Variable([5.0])
  current_strategy = distribution_strategy_context.get_strategy()
  step_op = self._run_fn_with_grad_check(
      current_strategy, weight, restored, 2)()
  self.evaluate(variables.global_variables_initializer())
  self._run_if_in_graph_mode(step_op)
  self.assertEqual(self.evaluate(weight), [3.])
def test_optimizer_errors(self):
  """Enabling the graph rewrite on a loss scale optimizer raises ValueError.

  Also checks that, after the failed call, the 'auto_mixed_precision'
  optimizer option is not reported as enabled.
  """
  already_wrapped = loss_scale_optimizer_v2.LossScaleOptimizerV1(
      gradient_descent_v2.SGD(1.0), 'dynamic')
  expected_message = ('"opt" must not already be an instance of a '
                      'LossScaleOptimizer.')
  with self.assertRaisesRegex(ValueError, expected_message):
    enable_mixed_precision_graph_rewrite(already_wrapped)

  experimental_options = config.get_optimizer_experimental_options()
  self.assertFalse(experimental_options.get('auto_mixed_precision', False))
def testPassingV1LossScale(self, strategy_fn):
  """Passes V1 `LossScale` objects to `LossScaleOptimizerV1` and trains a step.

  Covers a `FixedLossScale` and then a `DynamicLossScale`.

  Args:
    strategy_fn: Zero-argument callable returning the distribution strategy
      under which the test runs.
  """
  strategy = strategy_fn()
  learning_rate = 2.

  def step_and_check(variable, optimizer, grad_check_value):
    """Runs one step under `strategy` and expects `variable` to become 3."""
    step_fn = self._run_fn_with_grad_check(
        strategy, variable, optimizer, grad_check_value)
    step_op = strategy.experimental_run(step_fn)
    self.evaluate(variables.global_variables_initializer())
    self._run_if_in_graph_mode(step_op)
    # The loss is the identity of the variable, so the gradient is 1 and the
    # update is init_val - grad * lr == 5 - 1 * 2 == 3.
    self.assertAllClose([3.], self.evaluate(variable))

  with strategy.scope():
    # --- FixedLossScale ---
    var = variables.Variable([5.0])
    inner = gradient_descent.SGD(learning_rate)
    fixed_scale = tf_loss_scale_module.FixedLossScale(2.)
    opt = loss_scale_optimizer.LossScaleOptimizerV1(inner, fixed_scale)
    self.assertIsInstance(opt.loss_scale, ops.Tensor)
    self.evaluate(variables.global_variables_initializer())
    self.assertEqual(self.evaluate(opt.loss_scale), 2)
    step_and_check(var, opt, 2 / strategy.num_replicas_in_sync)

    # --- DynamicLossScale ---
    var = variables.Variable([5.0])
    inner = gradient_descent.SGD(learning_rate)
    dynamic_scale = tf_loss_scale_module.DynamicLossScale(
        initial_loss_scale=4, increment_period=1, multiplier=2)
    dynamic_scale._current_loss_scale.assign(2)
    opt = loss_scale_optimizer.LossScaleOptimizerV1(inner, dynamic_scale)
    self.assertEqual(opt.initial_scale, 4)
    self.assertEqual(opt.dynamic_growth_steps, 1)
    self.evaluate(variables.global_variables_initializer())
    # The current loss scale is not carried over, so the optimizer starts
    # again from the initial value of 4.
    self.assertEqual(self.evaluate(opt.loss_scale), 4)
    for replica_counter in strategy.experimental_local_results(
        opt.dynamic_counter):
      self.assertEqual(self.evaluate(replica_counter), 0)
    step_and_check(var, opt, 4 / strategy.num_replicas_in_sync)
def testV1Optimizer(self, strategy_fn):
  """Exercises `LossScaleOptimizerV1` with a fixed and a 'dynamic' loss scale.

  Args:
    strategy_fn: Zero-argument callable returning the distribution strategy
      under which the test runs.
  """
  strategy = strategy_fn()
  learning_rate = 2.

  def assert_counter_is_zero(optimizer):
    """Checks every local copy of the dynamic counter evaluates to 0."""
    for replica_counter in strategy.experimental_local_results(
        optimizer.dynamic_counter):
      self.assertEqual(self.evaluate(replica_counter), 0)

  with strategy.scope():
    # --- Fixed loss scale, given as a plain number ---
    var = variables.Variable([5.0])
    inner = gradient_descent.SGD(learning_rate)
    opt = loss_scale_optimizer.LossScaleOptimizerV1(inner, loss_scale=2)
    self.assertIsInstance(opt.loss_scale, ops.Tensor)
    self.evaluate(variables.global_variables_initializer())
    self.assertEqual(self.evaluate(opt.loss_scale), 2)
    self.assertEqual(opt.initial_scale, 2)
    self.assertIsNone(opt.dynamic_growth_steps)
    grad_check_value = 2 / strategy.num_replicas_in_sync
    step_fn = self._run_fn_with_grad_check(
        strategy, var, opt, grad_check_value)
    step_op = strategy.experimental_run(step_fn)
    self.evaluate(variables.global_variables_initializer())
    self._run_if_in_graph_mode(step_op)
    # The loss is the identity of the variable, so the gradient is 1 and the
    # update is init_val - grad * lr == 5 - 1 * 2 == 3.
    self.assertAllClose([3.], self.evaluate(var))

    # --- Dynamic loss scale, requested with the string 'dynamic' ---
    var = variables.Variable([5.0])
    inner = gradient_descent.SGD(learning_rate)
    opt = loss_scale_optimizer.LossScaleOptimizerV1(inner, 'dynamic')
    self.assertEqual(opt.initial_scale, 2**15)
    self.assertEqual(opt.dynamic_growth_steps, 2000)
    self.evaluate(variables.global_variables_initializer())
    self.assertEqual(self.evaluate(opt.loss_scale), 2**15)
    assert_counter_is_zero(opt)

    def nan_loss():
      """Loss that is NaN regardless of the variable's value."""
      return var * float('NaN')

    def minimize_step():
      return opt.minimize(nan_loss, var_list=[var])

    step_op = strategy.experimental_run(minimize_step)
    self.evaluate(variables.global_variables_initializer())
    self._run_if_in_graph_mode(step_op)
    # After the NaN step the variable is unchanged, the loss scale has gone
    # from 2**15 to 2**14, and the counter is still 0.
    self.assertAllClose([5.], self.evaluate(var))
    self.assertEqual(self.evaluate(opt.loss_scale), 2**14)
    assert_counter_is_zero(opt)
def testPassingV1LossScaleErrors(self):
  """Checks the errors `LossScaleOptimizerV1` raises for bad loss scales.

  A `DynamicLossScale` with multiplier 4 must raise ValueError, and a custom
  `LossScale` subclass must raise TypeError.
  """
  inner = gradient_descent.SGD()

  multiplier_message = ('When passing a DynamicLossScale to "loss_scale", '
                        'DynamicLossScale.multiplier must be 2. Got: '
                        'DynamicLossScale')
  bad_multiplier_scale = tf_loss_scale_module.DynamicLossScale(multiplier=4)
  with self.assertRaisesRegex(ValueError, multiplier_message):
    loss_scale_optimizer.LossScaleOptimizerV1(inner, bad_multiplier_scale)

  class MyLossScale(tf_loss_scale_module.LossScale):
    """Minimal custom loss scale used only to trigger the TypeError."""

    def __call__(self):
      return 1.

    def update(self, grads):
      return None, True

    def get_config(self):
      return {}

  custom_type_message = (
      'Passing a LossScale that is not a FixedLossScale or a '
      'DynamicLossScale is no longer supported. Got:')
  with self.assertRaisesRegex(TypeError, custom_type_message):
    loss_scale_optimizer.LossScaleOptimizerV1(inner, MyLossScale())
def test_save_model_with_dynamic_loss_scaling(
    self, strategy_fn, h5=False, use_v1_loss_scale_optimizer=False):
  """Saves and reloads a model compiled with a dynamic loss scale optimizer.

  Args:
    strategy_fn: Zero-argument callable returning the distribution strategy
      under which the model is built.
    h5: If truthy the model is saved with save_format='h5', otherwise 'tf'.
    use_v1_loss_scale_optimizer: If truthy the optimizer is a
      `LossScaleOptimizerV1` built from a `DynamicLossScale`; otherwise it is
      a `LossScaleOptimizer`.
  """
  # TODO(reedwm): Support and test saving model with a mixed_[b]float16 policy
  # as well.
  strategy = strategy_fn()
  is_mirrored = isinstance(strategy, mirrored_strategy.MirroredStrategy)
  if is_mirrored and not context.executing_eagerly():
    # TODO(b/121381184): Enable running the test in this case.
    return

  # Build and compile the model under the strategy.
  with strategy.scope():
    inputs = layers.Input(shape=(2,), batch_size=2, dtype=dtypes.float32)
    outputs = mp_test_util.MultiplyLayer()(inputs)
    model = models.Model(inputs=inputs, outputs=outputs)

    opt = gradient_descent.SGD(1.)
    if use_v1_loss_scale_optimizer:
      loss_scale = loss_scale_module.DynamicLossScale(
          initial_loss_scale=1., increment_period=2.)
      opt = loss_scale_optimizer.LossScaleOptimizerV1(opt, loss_scale)
    else:
      opt = loss_scale_optimizer.LossScaleOptimizer(
          opt, initial_scale=1., dynamic_growth_steps=2.)
    model.compile(
        optimizer=opt,
        loss='mse',
        run_eagerly=testing_utils.should_run_eagerly())

  # Three steps: 6 examples at a batch size of 2.
  model.fit(np.ones((6, 2)), np.zeros((6, 2)), batch_size=2)
  self.assertEqual(backend.get_value(opt.loss_scale), 2)
  self.assertEqual(backend.get_value(opt.dynamic_counter), 1)
  (weight,) = model.trainable_weights
  orig_weight = backend.get_value(weight)

  # Save the model in the requested format.
  if h5:
    save_format = 'h5'
  else:
    save_format = 'tf'
  save_path = os.path.join(self.get_temp_dir(), 'model')
  model.save(save_path, save_format=save_format)

  # One more step (2 examples at a batch size of 2) changes the weight and
  # moves the loss scale state on from what was saved.
  model.fit(np.ones((2, 2)), np.zeros((2, 2)), batch_size=2)
  new_weight = backend.get_value(weight)
  self.assertNotEqual(new_weight, orig_weight)
  self.assertEqual(backend.get_value(opt.loss_scale), 4)
  self.assertEqual(backend.get_value(opt.dynamic_counter), 0)

  # Reload the model and check the weight matches the value at save time.
  custom_objects = {'MultiplyLayer': mp_test_util.MultiplyLayer}
  model = save.load_model(save_path, custom_objects=custom_objects)
  (weight,) = model.trainable_weights
  loaded_weight = backend.get_value(weight)
  self.assertEqual(loaded_weight, orig_weight)
  # Currently the loss scale isn't always saved when the model is saved with
  # Model.save(). So accept either the value it had when saved or the value
  # it was initialized with.
  # TODO(reedwm): Always save/restore the loss scale with Model.save().
  loaded_opt = model.optimizer
  self.assertIn(backend.get_value(loaded_opt.loss_scale), (1, 2))
  self.assertIn(backend.get_value(loaded_opt.dynamic_counter), (0, 1))

  # Attributes and type of the restored optimizer.
  self.assertEqual(model.optimizer.initial_scale, 1.)
  self.assertEqual(model.optimizer.dynamic_growth_steps, 2.)
  self.assertEqual(type(model.optimizer),
                   loss_scale_optimizer.LossScaleOptimizer)
def test_dynamic_loss_scaling(self,
                              strategy_fn,
                              pass_loss_scale_to_policy=False,
                              get_config=False,
                              use_v1_loss_scale_optimizer=False):
  """Trains a mixed_float16 model while the dynamic loss scale changes.

  Args:
    strategy_fn: Zero-argument callable returning the distribution strategy
      under which the model is built.
    pass_loss_scale_to_policy: If truthy, a `DynamicLossScale` is handed to a
      `PolicyV1` and the SGD optimizer is left unwrapped.
    get_config: If truthy, the model is rebuilt from its own config before
      being compiled.
    use_v1_loss_scale_optimizer: If truthy (and the policy flag is not), the
      optimizer is wrapped in a `LossScaleOptimizerV1`.
  """
  strategy = strategy_fn()
  initial_loss_scale = 2.
  batch_size = 4
  expected_gradient = backend.variable(
      [initial_loss_scale / batch_size], dtype=dtypes.float16)
  # Setting this variable to True makes the model below produce NaN gradients.
  have_nan_gradients = backend.variable(False, dtype=dtypes.bool)

  with strategy.scope():
    opt = gradient_descent.SGD(1.)
    if pass_loss_scale_to_policy or use_v1_loss_scale_optimizer:
      loss_scale = loss_scale_module.DynamicLossScale(
          initial_loss_scale=initial_loss_scale, increment_period=2)
    if pass_loss_scale_to_policy:
      p = policy.PolicyV1('mixed_float16', loss_scale=loss_scale)
    else:
      p = policy.Policy('mixed_float16')
      if use_v1_loss_scale_optimizer:
        opt = loss_scale_optimizer.LossScaleOptimizerV1(opt, loss_scale)
      else:
        opt = loss_scale_optimizer.LossScaleOptimizer(
            opt, initial_scale=initial_loss_scale, dynamic_growth_steps=2)

    with policy.policy_scope(p):
      inputs = layers.Input(
          shape=(1,), batch_size=batch_size, dtype=dtypes.float16)
      layer = mp_test_util.MultiplyLayer(assert_type=dtypes.float16)
      outputs = layer(inputs)
      nan_grads_fn = mp_test_util.create_identity_with_nan_gradients_fn(
          have_nan_gradients)
      outputs = core.Lambda(nan_grads_fn)(outputs)
      grad_check_fn = mp_test_util.create_identity_with_grad_check_fn(
          expected_dtype=dtypes.float16,
          expected_gradient=expected_gradient)
      outputs = core.Lambda(grad_check_fn)(outputs)
      model = models.Model(inputs=inputs, outputs=outputs)
      if get_config:
        model_config = model.get_config()
        model = model.__class__.from_config(
            model_config,
            custom_objects={'MultiplyLayer': mp_test_util.MultiplyLayer})
        # Re-bind `layer` to the single MultiplyLayer of the rebuilt model.
        (layer,) = [
            candidate for candidate in model.layers
            if isinstance(candidate, mp_test_util.MultiplyLayer)
        ]

      def loss_fn(y_true, y_pred):
        del y_true
        return math_ops.reduce_mean(y_pred)

      model.compile(
          opt,
          loss=loss_fn,
          run_eagerly=testing_utils.should_run_eagerly())

  self.assertEqual(backend.eval(layer.v), 1)
  features = np.ones((batch_size, 1))
  labels = np.ones((batch_size, 1))
  dataset = dataset_ops.Dataset.from_tensor_slices(
      (features, labels)).batch(batch_size)

  def fit_and_expect(expected_value):
    """Fits once on `dataset` and checks the layer variable afterwards."""
    model.fit(dataset)
    self.assertEqual(backend.eval(layer.v), expected_value)

  # The variable starts at 1 and has a gradient of 1, so it drops by 1 on
  # every step.
  fit_and_expect(0)
  fit_and_expect(-1)

  # Two steps without NaNs have passed, so the loss scale doubles.
  backend.set_value(expected_gradient,
                    backend.get_value(expected_gradient * 2))
  fit_and_expect(-2)

  # With NaN gradients the variable must not be updated.
  backend.set_value(have_nan_gradients, True)
  fit_and_expect(-2)

  # Back to finite gradients. The NaN step halves the loss scale, so the
  # expected gradient is halved as well.
  backend.set_value(have_nan_gradients, False)
  backend.set_value(expected_gradient,
                    backend.get_value(expected_gradient / 2))
  fit_and_expect(-3)