def testDynamicUpdate(self, strategy_fn):
  """Checks the dynamic loss scale after a finite step and after a NaN step.

  Args:
    strategy_fn: Callable returning the distribution strategy to test under.
  """
  with strategy_fn().scope() as strategy:
    var = variables.Variable([1.0, 2.0])
    sgd = gradient_descent.SGD(1.0)
    dynamic_scale = loss_scale_module.DynamicLossScale(
        initial_loss_scale=2, increment_period=1, multiplier=2)
    opt = loss_scale_optimizer.LossScaleOptimizer(sgd, dynamic_scale)

    # Phase 1: a step whose gradients are finite.
    def finite_loss():
      return var * 2.0 / strategy.num_replicas_in_sync

    def finite_step():
      return opt.minimize(finite_loss, var_list=[var])

    finite_op = strategy.experimental_run(finite_step)
    self.evaluate(variables.global_variables_initializer())
    self._run_if_in_graph_mode(finite_op)
    # The gradient is 2, so 2 is subtracted from every element.
    self.assertAllClose([-1.0, 0.0], self.evaluate(var))
    # The loss scale is expected to have doubled, from 2 to 4.
    self.assertEqual(4., self.evaluate(opt.loss_scale()))

    # Phase 2: a step whose gradients are NaN.
    def nan_loss():
      return var * float('NaN')

    def nan_step():
      return opt.minimize(nan_loss, var_list=[var])

    nan_op = strategy.experimental_run(nan_step)
    self._run_if_in_graph_mode(nan_op)
    # NaN gradients must leave the variable untouched.
    self.assertAllClose(self.evaluate(var), [-1.0, 0.0])
    # NaN gradients are expected to halve the loss scale.
    self.assertEqual(2., self.evaluate(opt.loss_scale()))
def testDynamicLossScale(self, strategy_fn):
  """Runs two steps and checks the gradient seen under a growing loss scale.

  Args:
    strategy_fn: Callable returning the distribution strategy to test under.
  """
  strategy = strategy_fn()
  learning_rate = 2.
  expected_gradient = variables.Variable(
      learning_rate / strategy.num_replicas_in_sync)
  with strategy.scope():
    var = variables.Variable([5.0])
    sgd = gradient_descent.SGD(learning_rate)
    dynamic_scale = loss_scale_module.DynamicLossScale(
        initial_loss_scale=2, increment_period=1, multiplier=2)
    opt = loss_scale_optimizer.LossScaleOptimizer(sgd, dynamic_scale)
    remainder = (
        dynamic_scale.initial_loss_scale % strategy.num_replicas_in_sync)
    self.assertEqual(remainder, 0)

    step_fn = self._run_fn_with_grad_check(
        strategy, var, opt, expected_gradient)
    first_op = strategy.experimental_run(step_fn)
    self.evaluate(variables.global_variables_initializer())
    self._run_if_in_graph_mode(first_op)
    # The loss is the identity of the variable, so the gradient is 1 and the
    # variable becomes init_val - grad * lr == 5 - 1 * 2 == 3.
    self.assertAllClose([3.], self.evaluate(var))

    # The loss scale will have doubled, so the expected gradient doubles too.
    doubled_gradient = 2 * learning_rate / strategy.num_replicas_in_sync
    self.evaluate(expected_gradient.assign(doubled_gradient))
    second_op = strategy.experimental_run(step_fn)
    self._run_if_in_graph_mode(second_op)
    # As before, 2 is subtracted from the variable, making its new value 1.
    self.assertAllClose([1.], self.evaluate(var))
def testSerializationWithBuiltInOptimizer(self, use_v1):
  """Round-trips a loss-scale optimizer wrapping SGD through serialization.

  Args:
    use_v1: If true, the optimizer that gets serialized is a
      `LossScaleOptimizerV1` built from a `DynamicLossScale`; otherwise it is a
      `LossScaleOptimizer` built from `initial_scale`/`dynamic_growth_steps`.
  """
  opt = gradient_descent.SGD(2., momentum=0.5)
  if use_v1:
    loss_scale = tf_loss_scale_module.DynamicLossScale(
        initial_loss_scale=2., increment_period=3.)
    opt = loss_scale_optimizer.LossScaleOptimizerV1(opt, loss_scale)
  else:
    opt = loss_scale_optimizer.LossScaleOptimizer(
        opt, initial_scale=2., dynamic_growth_steps=3.)
  config = optimizers.serialize(opt)
  opt = optimizers.deserialize(config)
  # Force hyperparameters to be created
  opt.lr  # pylint: disable=pointless-statement
  self.evaluate(variables.global_variables_initializer())

  self.assertEqual(self.evaluate(opt.lr), 2.)
  self.assertEqual(self.evaluate(opt._optimizer.momentum), 0.5)
  self.assertEqual(self.evaluate(opt.loss_scale), 2.)
  self.assertEqual(opt.dynamic_growth_steps, 3.)
  # The second positional argument of assertTrue is the failure message, so
  # no stray numeric value is passed here.
  self.assertTrue(opt.dynamic)
  # Deserializing a LossScaleOptimizer always results in a V2
  # LossScaleOptimizer, even if serialized with a LossScaleOptimizerV1.
  self.assertAllEqual(type(opt), loss_scale_optimizer.LossScaleOptimizer)

  # Ensure the optimizer can be used
  var = variables.Variable([5.0])
  run_op = self._run_fn_with_grad_check(
      distribution_strategy_context.get_strategy(), var, opt, 2)()
  self.evaluate(variables.global_variables_initializer())
  self._run_if_in_graph_mode(run_op)
  self.assertEqual(self.evaluate(var), [3.])
  self.assertEqual(self.evaluate(opt.dynamic_counter), 1)
def testDynamicLossScaleWithSlots(self, strategy_fn):
  """Checks loss scaling with an optimizer that owns slot variables.

  Args:
    strategy_fn: Callable returning the distribution strategy to test under.
  """
  with strategy_fn().scope() as strategy:
    var = variables.Variable([1.0, 2.0])
    # Momentum SGD is used because it has slot variables.
    momentum_sgd = gradient_descent.SGD(1.0, momentum=1.)
    initial_loss_scale = 2.
    dynamic_scale = loss_scale_module.DynamicLossScale(
        initial_loss_scale=initial_loss_scale,
        increment_period=1,
        multiplier=4)
    opt = loss_scale_optimizer.LossScaleOptimizer(momentum_sgd, dynamic_scale)

    def loss():
      return var / strategy.num_replicas_in_sync

    def step():
      return opt.minimize(loss, var_list=[var])

    first_op = strategy.experimental_run(step)
    self.evaluate(variables.global_variables_initializer())
    self._run_if_in_graph_mode(first_op)
    # The momentum accumulator starts at 0 and the gradient is 1. The
    # accumulator grows by the gradient to 1, and that amount is subtracted
    # from the variable.
    self.assertAllClose([0.0, 1.0], self.evaluate(var))
    self.assertEqual(self.evaluate(opt.loss_scale()), initial_loss_scale * 4)

    second_op = strategy.experimental_run(step)
    self._run_if_in_graph_mode(second_op)
    # The accumulator was 1 and the gradient is 1 again, so the accumulator is
    # now 2, and 2 is subtracted from the variable.
    self.assertAllClose([-2., -1.], self.evaluate(var))
    self.assertEqual(self.evaluate(opt.loss_scale()), initial_loss_scale * 16)
def testSerializationWithCustomOptimizer(self):
  """Round-trips a LossScaleOptimizer that wraps a user-defined SGD subclass."""

  class MySGD(gradient_descent.SGD):

    def __init__(self, *args, **kwargs):
      super(MySGD, self).__init__(*args, **kwargs)
      self.my_attribute = 123

  inner = MySGD(2., momentum=0.5)
  dynamic_scale = loss_scale_module.DynamicLossScale(
      initial_loss_scale=2., increment_period=3., multiplier=4.)
  wrapped = loss_scale_optimizer.LossScaleOptimizer(inner, dynamic_scale)
  config = optimizers.serialize(wrapped)
  # The custom class must be supplied by name for deserialization.
  restored = optimizers.deserialize(
      config, custom_objects={'MySGD': MySGD})
  # Touch `lr` so that the hyperparameters get created.
  restored.lr  # pylint: disable=pointless-statement
  self.evaluate(variables.global_variables_initializer())

  self.assertEqual(self.evaluate(restored.lr), 2.)
  self.assertEqual(self.evaluate(restored._optimizer.momentum), 0.5)
  self.assertEqual(self.evaluate(restored.loss_scale()), 2.)
  self.assertEqual(restored.loss_scale.increment_period, 3.)
  self.assertEqual(restored.loss_scale.multiplier, 4.)
  self.assertEqual(restored._optimizer.my_attribute, 123)
def test_dynamic_scale_to_one_on_non_finite_gradient_on_last_replica(
    self, use_tf_function):
  """Checks the loss scale when only the last replica has an inf gradient.

  Args:
    use_tf_function: Forwarded to `self._run_with_strategy`.
  """
  if context.num_gpus() < 1:
    # The mirrored strategy needs two replicas: one on the CPU and one on the
    # GPU.
    self.skipTest('Test requires at least 1 GPU')
  dynamic_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=32)
  strategy = create_mirrored_strategy()
  with strategy.scope():
    x = variables.Variable(3.0)

  def run_fn():
    with lsgt.LossScaleGradientTape(dynamic_scale) as tape:
      # Finite gradient on the first replica, infinite on the second.
      replica_ctx = distribution_strategy_context.get_replica_context()
      last_replica_id = replica_ctx.num_replicas_in_sync - 1
      if replica_ctx.replica_id_in_sync_group == last_replica_id:
        y = x * np.inf
      else:
        y = x * 2
    return tape.gradient(y, x)

  per_replica_grads = self._run_with_strategy(
      run_fn, strategy, use_tf_function)
  replica0_grad, replica1_grad = per_replica_grads
  self.assertEqual(self.evaluate(dynamic_scale()), 1.0)
  self.assertEqual(replica0_grad, 2.0)
  self.assertEqual(replica1_grad, np.inf)
def from_config(cls, config, custom_objects=None):
  """Builds an instance of `cls` from a serialized optimizer config.

  Two config layouts are accepted: one carrying a 'loss_scale' entry and one
  carrying 'dynamic', 'initial_scale' and 'dynamic_growth_steps' entries.

  Args:
    config: Dict holding the serialized optimizer. It is copied, not mutated.
    custom_objects: Optional mapping forwarded to `optimizers.deserialize` for
      the inner optimizer.

  Returns:
    The result of `cls(**config)` after conversion.

  Raises:
    ValueError: If the config holds a DynamicLossScale whose multiplier is
      not 2.
  """
  config = config.copy()  # Work on a copy because entries are rewritten.
  config['optimizer'] = optimizers.deserialize(
      config['optimizer'], custom_objects=custom_objects)

  if 'loss_scale' not in config:
    # No 'loss_scale' key: assume the config came from TF 2.4 or above, as
    # produced by LossScaleOptimizer.get_config. Convert it into arguments
    # accepted by LossScaleOptimizerV1.__init__.
    if config['dynamic']:
      config['loss_scale'] = loss_scale_module.DynamicLossScale(
          config['initial_scale'], config['dynamic_growth_steps'],
          multiplier=2)
    else:
      config['loss_scale'] = loss_scale_module.FixedLossScale(
          config['initial_scale'])
    for consumed_key in ('dynamic', 'initial_scale', 'dynamic_growth_steps'):
      del config[consumed_key]
    return cls(**config)

  # A 'loss_scale' key is present: assume the config came from TF 2.3 or
  # below.
  restored_scale = keras_loss_scale_module.deserialize(config['loss_scale'])
  config['loss_scale'] = restored_scale
  is_dynamic = isinstance(restored_scale, loss_scale_module.DynamicLossScale)
  if is_dynamic and restored_scale.multiplier != 2:
    raise ValueError('Cannot deserialize LossScaleOptimizer with a '
                     'DynamicLossScale whose multiplier is not 2. Got '
                     'DynamicLossScale: %s' % (restored_scale,))
  return cls(**config)
def _test_helper(self,
                 inputs,
                 expected_outputs,
                 initial_loss_scale=1.,
                 increment_period=2,
                 multiplier=2):
  """Feeds finite/non-finite flags to a DynamicLossScale and checks the scale.

  Args:
    inputs: Sequence fed to `_get_example_iter`; each element is used as the
      `is_finite` flag for one update.
    expected_outputs: Expected loss scale value after each update.
    initial_loss_scale: Forwarded to `DynamicLossScale`.
    increment_period: Forwarded to `DynamicLossScale`.
    multiplier: Forwarded to `DynamicLossScale`.
  """
  dynamic_scale = loss_scale_module.DynamicLossScale(
      initial_loss_scale=initial_loss_scale,
      increment_period=increment_period,
      multiplier=multiplier)
  example_iter = _get_example_iter(inputs)

  def update():
    is_finite = example_iter.get_next()
    grad = self._get_tensor(is_finite)
    update_op, should_apply_gradients = dynamic_scale.update([grad])
    assert_op = check_ops.assert_equal(should_apply_gradients, is_finite)
    if context.executing_eagerly():
      return
    # In graph mode, make the returned op depend on the assertion.
    with ops.control_dependencies([assert_op]):
      return array_ops.identity(update_op)

  eager = context.executing_eagerly()
  if not eager:
    # In graph mode the update op is built once and then run repeatedly.
    graph_update_op = update()
    self.evaluate(variables.global_variables_initializer())

  actual_outputs = []
  for _ in range(len(inputs)):
    if eager:
      update()
    else:
      self.evaluate(graph_update_op)
    actual_outputs.append(self.evaluate(dynamic_scale()))
  self.assertEqual(actual_outputs, expected_outputs)
def test_dynamic_scale_to_one_on_non_finite_gradient(
    self, non_finite_term):
  """Checks the loss scale ends at 1 when the gradient is non-finite.

  Args:
    non_finite_term: Factor multiplied into the watched constant.
  """
  dynamic_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=32)
  one = constant_op.constant(1.0)
  with lsgt.LossScalingGradientTape(dynamic_scale) as tape:
    tape.watch(one)
    target = one * non_finite_term
  # Only the side effect on the loss scale matters; the gradient is unused.
  tape.gradient(target, one)
  final_scale = self.evaluate(dynamic_scale())
  self.assertEqual(final_scale, 1.0)
def test_serialization(self):
  """Round-trips a DynamicLossScale through get_config/from_config."""
  original = loss_scale_module.DynamicLossScale(
      initial_loss_scale=1, increment_period=2, multiplier=3)
  restored = loss_scale_module.DynamicLossScale.from_config(
      original.get_config())
  self.evaluate(variables.global_variables_initializer())
  # All three constructor arguments must survive the round trip.
  self.assertEqual(self.evaluate(restored()), 1)
  self.assertEqual(restored.increment_period, 2)
  self.assertEqual(restored.multiplier, 3)
def test_dynamic_loss_scaling_down_loop(self):
  """Checks the loss scale is lowered from 32 to 8 for a huge gradient."""
  dynamic_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=32)
  one = constant_op.constant(1.0)
  huge_factor = 3.0 * (10**37)
  with lsgt.LossScalingGradientTape(dynamic_scale) as tape:
    tape.watch(one)
    # The gradient will be inf after scaling.
    target = one * huge_factor
  grad = tape.gradient(target, one)
  self.assertEqual(self.evaluate(dynamic_scale()), 8.0)
  self.assertAllClose(self.evaluate(grad), huge_factor, atol=1e-06)
def test_save_model_with_dynamic_loss_scaling(self, strategy_fn, h5=False):
  """Saves and reloads a model trained with a dynamic loss scale.

  Args:
    strategy_fn: Callable returning the distribution strategy to test under.
    h5: If true the model is saved with save_format 'h5', otherwise 'tf'.
  """
  self._skip_if_strategy_unsupported(strategy_fn)
  # TODO(reedwm): Support and test saving model with a mixed_[b]float16 policy
  # as well.
  strategy = strategy_fn()
  is_mirrored = isinstance(strategy, mirrored_strategy.MirroredStrategy)
  if is_mirrored and not context.executing_eagerly():
    # TODO(b/121381184): Enable running the test in this case.
    return

  # Build and compile the model under the strategy.
  with strategy.scope():
    inputs = layers.Input(shape=(2,), batch_size=2, dtype=dtypes.float32)
    outputs = mp_test_util.AddLayer()(inputs)
    model = models.Model(inputs=inputs, outputs=outputs)

    loss_scale = loss_scale_module.DynamicLossScale(
        initial_loss_scale=1., increment_period=2., multiplier=2.)
    sgd = gradient_descent.SGD(1.)
    opt = loss_scale_optimizer.LossScaleOptimizer(sgd, loss_scale)
    model.compile(
        optimizer=opt,
        loss='mse',
        run_eagerly=testing_utils.should_run_eagerly(),
        experimental_run_tf_function=testing_utils.should_run_tf_function())

  # Train for 3 steps (6 examples with a batch size of 2).
  model.fit(np.zeros((6, 2)), np.zeros((6, 2)), batch_size=2)
  self.assertEqual(backend.get_value(loss_scale()), 2)
  self.assertEqual(backend.get_value(loss_scale._num_good_steps), 1)
  (weight,) = model.trainable_weights
  orig_weight = backend.get_value(weight)

  # Save the model.
  save_format = 'h5' if h5 else 'tf'
  save_path = os.path.join(self.get_temp_dir(), 'model')
  model.save(save_path, save_format=save_format)

  # Train for 1 more step (2 examples with a batch size of 2).
  model.fit(np.zeros((2, 2)), np.zeros((2, 2)), batch_size=2)
  new_weight = backend.get_value(weight)
  self.assertNotEqual(new_weight, orig_weight)
  self.assertEqual(backend.get_value(loss_scale()), 4)
  self.assertEqual(backend.get_value(loss_scale._num_good_steps), 0)

  # Reload the model and check that the weights were restored.
  model = save.load_model(
      save_path, custom_objects={'AddLayer': mp_test_util.AddLayer})
  loss_scale = model.optimizer.loss_scale
  (weight,) = model.trainable_weights
  loaded_weight = backend.get_value(weight)
  self.assertEqual(loaded_weight, orig_weight)
  # Currently the loss scale isn't always saved when the model is saved with
  # Model.save(). So the loss scale may hold either the value it had when it
  # was saved, or the value it was initialized with.
  # TODO(reedwm): Always save/restore the loss scale with Model.save().
  self.assertIn(backend.get_value(loss_scale()), (1, 2))
  self.assertIn(backend.get_value(loss_scale._num_good_steps), (0, 1))
def test_dynamic_loss_scaling_inf_target_post_scale(self):
  """Checks an inf scaled target alone does not change the loss scale."""
  dynamic_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=32.0)
  huge = constant_op.constant(3.0 * (10**37))
  with lsgt.LossScalingGradientTape(dynamic_scale) as tape:
    tape.watch(huge)
    # The target will be inf after scaling.
    target = huge * 3.0
  grad = tape.gradient(target, huge)
  self.assertAllClose(self.evaluate(grad), 3.0)
  self.assertEqual(self.evaluate(dynamic_scale()), 32.0)
def test_dynamic_scale_to_one_on_non_finite_gradient(
    self, strategy_fn, non_finite_term, use_tf_function):
  """Checks the loss scale ends at 1 for a non-finite variable gradient.

  Args:
    strategy_fn: Callable returning the distribution strategy to test under.
    non_finite_term: Factor multiplied into the variable.
    use_tf_function: Forwarded to `self._run_with_strategy`.
  """
  dynamic_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=32)
  strategy = strategy_fn()
  with strategy.scope():
    weight = variables.Variable(3.0)

  def run_fn():
    with lsgt.LossScaleGradientTape(dynamic_scale) as tape:
      target = weight * non_finite_term
    # The gradient is computed only for its effect on the loss scale.
    tape.gradient(target, weight)

  self._run_with_strategy(run_fn, strategy, use_tf_function)
  final_scale = self.evaluate(dynamic_scale())
  self.assertEqual(final_scale, 1.0)
def test_dynamic_scale_to_one_on_non_finite_gradient(
    self, strategy_fn, non_finite_term, use_tf_function):
  """Checks the loss scale ends at 1 for a non-finite constant gradient.

  Args:
    strategy_fn: Callable returning the distribution strategy to test under.
    non_finite_term: Factor multiplied into the watched constant.
    use_tf_function: Forwarded to `self._run_with_strategy`.
  """
  dynamic_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=32)

  def run_fn():
    one = constant_op.constant(1.0)
    with lsgt.LossScaleGradientTape(dynamic_scale) as tape:
      tape.watch(one)
      target = one * non_finite_term
    # The gradient is computed only for its effect on the loss scale.
    tape.gradient(target, one)

  strategy = strategy_fn()
  self._run_with_strategy(run_fn, strategy, use_tf_function)
  final_scale = self.evaluate(dynamic_scale())
  self.assertEqual(final_scale, 1.0)
def test_repr(self, strategy_fn):
  """Checks repr() of a DynamicLossScale in eager and in graph mode.

  Args:
    strategy_fn: Callable returning the distribution strategy to test under.
  """
  with strategy_fn().scope():
    dynamic_scale = loss_scale_module.DynamicLossScale(
        initial_loss_scale=1, increment_period=2, multiplier=3)
    if context.executing_eagerly():
      # In eager mode the expected repr also shows the current state.
      expected = ('DynamicLossScale(current_loss_scale=1.0, '
                  'num_good_steps=0, initial_loss_scale=1.0, '
                  'increment_period=2, multiplier=3.0)')
    else:
      expected = ('DynamicLossScale(initial_loss_scale=1.0, '
                  'increment_period=2, multiplier=3.0)')
    self.assertEqual(repr(dynamic_scale), expected)
def test_dynamic_loss_scaling_down_loop(self, strategy_fn, use_tf_function):
  """Checks the loss scale is lowered from 32 to 8 under a strategy.

  Args:
    strategy_fn: Callable returning the distribution strategy to test under.
    use_tf_function: Forwarded to `self._run_with_strategy`.
  """
  dynamic_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=32)
  strategy = strategy_fn()
  with strategy.scope():
    weight = variables.Variable(3.0)

  def run_fn():
    with lsgt.LossScaleGradientTape(dynamic_scale) as tape:
      # The gradient will be inf after scaling.
      target = weight * (3.0 * (10**37))
    return tape.gradient(target, weight)

  per_replica_grads = self._run_with_strategy(
      run_fn, strategy, use_tf_function)
  self.assertEqual(self.evaluate(dynamic_scale()), 8.0)
  expected_grad = 3.0 * (10**37)
  for grad in per_replica_grads:
    self.assertAllClose(self.evaluate(grad), expected_grad, atol=1e-06)
def get(identifier):
  """Get a loss scale object.

  Args:
    identifier: One of: None, a dict (passed to `deserialize`), an int or
      float (wrapped in a `FixedLossScale`), the string 'dynamic' (a new
      `DynamicLossScale`), or a `LossScale` instance (returned as is).

  Returns:
    A loss scale object, or None if `identifier` is None.

  Raises:
    ValueError: If `identifier` is none of the accepted forms.
  """
  if identifier is None:
    return None
  if isinstance(identifier, dict):
    return deserialize(identifier)
  if isinstance(identifier, (int, float)):
    return loss_scale_module.FixedLossScale(identifier)
  if identifier == 'dynamic':
    return loss_scale_module.DynamicLossScale()
  if isinstance(identifier, loss_scale_module.LossScale):
    return identifier
  # Wrap the operand in a 1-tuple: with a bare `% identifier`, a tuple
  # identifier would be unpacked as format arguments and raise TypeError
  # instead of the intended ValueError.
  raise ValueError('Could not interpret loss scale identifier: %s' %
                   (identifier,))
def testSerializationWithBuiltInOptimizer(self):
  """Round-trips a LossScaleOptimizer wrapping SGD through serialization."""
  sgd = gradient_descent.SGD(2., momentum=0.5)
  dynamic_scale = loss_scale_module.DynamicLossScale(
      initial_loss_scale=2., increment_period=3., multiplier=4.)
  wrapped = loss_scale_optimizer.LossScaleOptimizer(sgd, dynamic_scale)
  restored = optimizers.deserialize(optimizers.serialize(wrapped))
  # Touch `lr` so that the hyperparameters get created.
  restored.lr  # pylint: disable=pointless-statement
  self.evaluate(variables.global_variables_initializer())

  self.assertEqual(self.evaluate(restored.lr), 2.)
  self.assertEqual(self.evaluate(restored._optimizer.momentum), 0.5)
  self.assertEqual(self.evaluate(restored.loss_scale()), 2.)
  self.assertEqual(restored.loss_scale.increment_period, 3.)
  self.assertEqual(restored.loss_scale.multiplier, 4.)
def test_dynamic_loss_scaling_inf_target_post_scale(
    self, strategy_fn, use_tf_function):
  """Checks an inf scaled target keeps the loss scale at 32 under a strategy.

  Args:
    strategy_fn: Callable returning the distribution strategy to test under.
    use_tf_function: Forwarded to `self._run_with_strategy`.
  """
  dynamic_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=32.0)

  def run_fn():
    huge = constant_op.constant(3.0 * (10**37))
    with lsgt.LossScaleGradientTape(dynamic_scale) as tape:
      tape.watch(huge)
      # The target will be inf after scaling.
      target = huge * 3.0
    return tape.gradient(target, huge)

  strategy = strategy_fn()
  per_replica_grads = self._run_with_strategy(
      run_fn, strategy, use_tf_function)
  self.assertEqual(self.evaluate(dynamic_scale()), 32.0)
  for grad in per_replica_grads:
    self.assertAllClose(self.evaluate(grad), 3.0)
def testWeightMethods(self):
  """Checks weights, get_weights, variables and set_weights on the wrapper."""
  var = variables.Variable([1.0])
  sgd = gradient_descent.SGD(1.0)
  dynamic_scale = loss_scale_module.DynamicLossScale(
      initial_loss_scale=2., increment_period=1, multiplier=4)
  opt = loss_scale_optimizer.LossScaleOptimizer(sgd, dynamic_scale)

  def loss():
    return var * 2

  train_op = opt.minimize(loss, [var])
  self.evaluate(variables.global_variables_initializer())
  self._run_if_in_graph_mode(train_op)

  # The only weight is the 'iterations' counter, which is 1 after one step.
  self.assertLen(opt.weights, 1)
  self.assertEqual(self.evaluate(opt.weights[0]), 1)
  self.assertEqual(opt.get_weights()[0], 1)
  self.assertEqual(self.evaluate(opt.variables()[0]), 1)
  # set_weights must be visible through variables().
  opt.set_weights([np.array(2.)])
  self.assertEqual(self.evaluate(opt.variables()[0]), 2)
def test_save_weights_with_dynamic_loss_scaling(self, strategy_fn):
  """Checks save_weights/load_weights restore the dynamic loss scale state.

  Args:
    strategy_fn: Callable returning the distribution strategy to test under.
  """
  if testing_utils.should_run_distributed():
    self.skipTest('b/137397816')
  if not self._is_strategy_supported(strategy_fn):
    return
  strategy = strategy_fn()
  is_mirrored = isinstance(strategy, mirrored_strategy.MirroredStrategy)
  if is_mirrored and not context.executing_eagerly():
    # TODO(b/121381184): Enable running the test in this case.
    return

  # Build and compile the model under the strategy.
  with strategy.scope():
    inputs = layers.Input(shape=(2,), batch_size=2, dtype=dtypes.float32)
    outputs = AddLayer(assert_type=dtypes.float32)(inputs)
    model = models.Model(inputs=inputs, outputs=outputs)

    loss_scale = loss_scale_module.DynamicLossScale(
        initial_loss_scale=1., increment_period=2., multiplier=2.)
    sgd = gradient_descent.SGD(1.)
    opt = loss_scale_optimizer.LossScaleOptimizer(sgd, loss_scale)
    model.compile(
        optimizer=opt,
        loss='mse',
        run_eagerly=testing_utils.should_run_eagerly(),
        run_distributed=testing_utils.should_run_distributed())

  def assert_scale_state(expected_scale, expected_good_steps):
    self.assertEqual(backend.get_value(loss_scale()), expected_scale)
    self.assertEqual(
        backend.get_value(loss_scale._num_good_steps), expected_good_steps)

  # Train for 3 steps (6 examples with a batch size of 2).
  model.fit(np.zeros((6, 2)), np.zeros((6, 2)), batch_size=2)
  assert_scale_state(2, 1)

  # Save the model weights.
  save_prefix = os.path.join(self.get_temp_dir(), 'ckpt')
  model.save_weights(save_prefix)

  # Train for 1 more step (2 examples with a batch size of 2).
  model.fit(np.zeros((2, 2)), np.zeros((2, 2)), batch_size=2)
  assert_scale_state(4, 0)

  # Loading the weights must bring back the saved loss scale state.
  model.load_weights(save_prefix)
  assert_scale_state(2, 1)
def testClipping(self, strategy_fn):
  """Checks gradient clipping options work through the loss-scale wrapper.

  Each of 'clipnorm', 'global_clipnorm' and 'clipvalue' is exercised in its
  own subtest.

  Args:
    strategy_fn: Callable returning the distribution strategy to test under.
  """
  strategy = strategy_fn()
  learning_rate = 2.
  for clip_type in ('clipnorm', 'global_clipnorm', 'clipvalue'):
    with strategy.scope(), self.subTest(clip_type=clip_type):
      var = variables.Variable([5.0])
      clip_kwargs = {clip_type: 2.0}
      sgd = gradient_descent.SGD(learning_rate, **clip_kwargs)
      dynamic_scale = loss_scale_module.DynamicLossScale(
          initial_loss_scale=2, increment_period=1, multiplier=2)
      opt = loss_scale_optimizer.LossScaleOptimizer(sgd, dynamic_scale)
      self.assertEqual(getattr(opt, clip_type), 2.0)
      self.assertEqual(
          dynamic_scale.initial_loss_scale % strategy.num_replicas_in_sync, 0)

      def finite_loss():
        return var * 4 / strategy.num_replicas_in_sync

      def finite_step():
        return opt.minimize(finite_loss, var_list=[var])

      # First step: run with clipped gradients.
      step_op = strategy.experimental_run(finite_step)
      self.evaluate(variables.global_variables_initializer())
      self._run_if_in_graph_mode(step_op)
      # The gradient is 4 but is clipped to 2, so the variable becomes
      # init_val - clipped_grad * lr == 5 - 2 * 2 == 1
      self.assertAllClose([1.], self.evaluate(var))
      self.assertEqual(self.evaluate(opt.loss_scale()), 4)

      # Second step: change the clip amount and run again.
      setattr(opt, clip_type, 3.0)
      step_op = strategy.experimental_run(finite_step)
      self._run_if_in_graph_mode(step_op)
      # The gradient is 4 but is clipped to 3, so the variable becomes
      # prev_var - clipped_grad * lr == 1 - 3 * 2 == -5
      self.assertAllClose([-5.], self.evaluate(var))
      self.assertEqual(self.evaluate(opt.loss_scale()), 8)

      # Third step: Inf gradients must still be skipped, not clipped.
      def inf_loss():
        return var * float('Inf')

      def inf_step():
        return opt.minimize(inf_loss, var_list=[var])

      step_op = strategy.experimental_run(inf_step)
      self._run_if_in_graph_mode(step_op)
      self.assertAllClose([-5.], self.evaluate(var))  # Var does not change
      self.assertEqual(self.evaluate(opt.loss_scale()), 4)
def testDynamicLossScaleWithFloat16Loss(self, strategy_fn):
  """Checks one optimizer step when the loss is cast to float16.

  Args:
    strategy_fn: Callable returning the distribution strategy to test under.
  """
  strategy = strategy_fn()
  learning_rate = 2.
  with strategy.scope():
    var = variables.Variable([5.0])
    sgd = gradient_descent.SGD(learning_rate)
    dynamic_scale = loss_scale_module.DynamicLossScale(
        initial_loss_scale=2, increment_period=1, multiplier=2)
    opt = loss_scale_optimizer.LossScaleOptimizer(sgd, dynamic_scale)

    def loss():
      per_replica = var / strategy.num_replicas_in_sync
      return math_ops.cast(per_replica, 'float16')

    def step():
      return opt.minimize(loss, var_list=[var])

    step_op = strategy.experimental_run(step)
    self.evaluate(variables.global_variables_initializer())
    self._run_if_in_graph_mode(step_op)
    # The loss is the identity of the variable, so the gradient is 1 and the
    # variable becomes init_val - grad * lr == 5 - 1 * 2 == 3.
    self.assertAllClose([3.], self.evaluate(var))
def testPassingV1LossScale(self, strategy_fn):
  """Checks LossScaleOptimizerV1 accepts Fixed and Dynamic loss scales.

  Args:
    strategy_fn: Callable returning the distribution strategy to test under.
  """
  strategy = strategy_fn()
  learning_rate = 2.
  with strategy.scope():
    # Part 1: FixedLossScale.
    var = variables.Variable([5.0])
    sgd = gradient_descent.SGD(learning_rate)
    fixed_scale = tf_loss_scale_module.FixedLossScale(2.)
    opt = loss_scale_optimizer.LossScaleOptimizerV1(sgd, fixed_scale)
    self.assertIsInstance(opt.loss_scale, ops.Tensor)
    self.evaluate(variables.global_variables_initializer())
    self.assertEqual(self.evaluate(opt.loss_scale), 2)
    step_fn = self._run_fn_with_grad_check(
        strategy, var, opt, 2 / strategy.num_replicas_in_sync)
    step_op = strategy.experimental_run(step_fn)
    self.evaluate(variables.global_variables_initializer())
    self._run_if_in_graph_mode(step_op)
    # The loss is the identity of the variable, so the gradient is 1 and the
    # variable becomes init_val - grad * lr == 5 - 1 * 2 == 3.
    self.assertAllClose([3.], self.evaluate(var))

    # Part 2: DynamicLossScale.
    var = variables.Variable([5.0])
    sgd = gradient_descent.SGD(learning_rate)
    dynamic_scale = tf_loss_scale_module.DynamicLossScale(
        initial_loss_scale=4, increment_period=1, multiplier=2)
    dynamic_scale._current_loss_scale.assign(2)
    opt = loss_scale_optimizer.LossScaleOptimizerV1(sgd, dynamic_scale)
    self.assertEqual(opt.initial_scale, 4)
    self.assertEqual(opt.dynamic_growth_steps, 1)
    self.evaluate(variables.global_variables_initializer())
    # The current loss scale is not copied, so the loss scale is reinitialized
    # to 4.
    self.assertEqual(self.evaluate(opt.loss_scale), 4)
    local_counters = strategy.experimental_local_results(opt.dynamic_counter)
    for counter in local_counters:
      self.assertEqual(self.evaluate(counter), 0)

    step_fn = self._run_fn_with_grad_check(
        strategy, var, opt, 4 / strategy.num_replicas_in_sync)
    step_op = strategy.experimental_run(step_fn)
    self.evaluate(variables.global_variables_initializer())
    self._run_if_in_graph_mode(step_op)
    self.assertAllClose([3.], self.evaluate(var))
def testCheckpoint(self, strategy_fn):
  """Checks a checkpoint restores loss scale state and the momentum slot.

  Args:
    strategy_fn: Callable returning the distribution strategy to test under.
  """
  strategy = strategy_fn()
  is_mirrored = isinstance(strategy, mirrored_strategy.MirroredStrategy)
  if is_mirrored and not context.executing_eagerly():
    # TODO(b/121381184): Enable running the test in this case.
    return

  with self.test_session(), strategy.scope():
    # Build a simple model and run one step.
    var = variables.Variable([2.0])
    loss_scale = loss_scale_module.DynamicLossScale(
        initial_loss_scale=1., increment_period=2., multiplier=2.)
    momentum_sgd = gradient_descent.SGD(1., momentum=1.)
    opt = loss_scale_optimizer.LossScaleOptimizer(momentum_sgd, loss_scale)

    def step():
      return opt.minimize(lambda: var + 1., var_list=[var])

    first_op = strategy.experimental_run(step)
    self.evaluate(variables.global_variables_initializer())
    self.evaluate(first_op)
    self.assertEqual(self.evaluate(loss_scale()), 1.)
    self.assertEqual(self.evaluate(loss_scale._num_good_steps), 1)
    slot_var = opt._optimizer.get_slot(var, 'momentum')
    saved_slot_value = self.evaluate(slot_var).item()

    # Save a checkpoint.
    checkpoint = trackable_utils.Checkpoint(optimizer=opt, var=var)
    ckpt_prefix = os.path.join(self.get_temp_dir(), 'ckpt')
    save_path = checkpoint.save(ckpt_prefix)

    # Run a second step, which moves the state away from the saved values.
    self.evaluate(strategy.experimental_run(step))
    self.assertEqual(self.evaluate(loss_scale()), 2.)
    self.assertEqual(self.evaluate(loss_scale._num_good_steps), 0)
    self.assertNotAlmostEqual(
        self.evaluate(slot_var).item(), saved_slot_value)

    # Restore the checkpoint and check everything is back to the saved state.
    status = checkpoint.restore(save_path)
    status.assert_consumed()
    status.run_restore_ops()
    self.assertEqual(self.evaluate(loss_scale()), 1.)
    self.assertEqual(self.evaluate(loss_scale._num_good_steps), 1)
    self.assertAlmostEqual(self.evaluate(slot_var).item(), saved_slot_value)
def testDynamicLossScaleWithSlots(self, strategy_fn):
  """Checks loss scaling with an optimizer that owns slot variables.

  Args:
    strategy_fn: Callable returning the distribution strategy to test under.
  """
  strategy_obj = strategy_fn()
  if (isinstance(strategy_obj, mirrored_strategy.MirroredStrategy) and
      control_flow_v2_toggles.control_flow_v2_enabled() and
      not context.executing_eagerly()):
    self.skipTest('b/138667997')
  with strategy_obj.scope() as strategy:
    var = variables.Variable([1.0, 2.0])
    # Momentum SGD is used because it has slot variables.
    momentum_sgd = gradient_descent.SGD(1.0, momentum=1.)
    initial_loss_scale = 2.
    dynamic_scale = loss_scale_module.DynamicLossScale(
        initial_loss_scale=initial_loss_scale,
        increment_period=1,
        multiplier=4)
    opt = loss_scale_optimizer.LossScaleOptimizer(momentum_sgd, dynamic_scale)

    def loss():
      return var / strategy.num_replicas_in_sync

    def step():
      return opt.minimize(loss, var_list=[var])

    first_op = strategy.experimental_run(step)
    self.evaluate(variables.global_variables_initializer())
    self._run_if_in_graph_mode(first_op)
    # The momentum accumulator starts at 0 and the gradient is 1. The
    # accumulator grows by the gradient to 1, and that amount is subtracted
    # from the variable.
    self.assertAllClose([0.0, 1.0], self.evaluate(var))
    self.assertEqual(self.evaluate(opt.loss_scale()), initial_loss_scale * 4)

    second_op = strategy.experimental_run(step)
    self._run_if_in_graph_mode(second_op)
    # The accumulator was 1 and the gradient is 1 again, so the accumulator is
    # now 2, and 2 is subtracted from the variable.
    self.assertAllClose([-2., -1.], self.evaluate(var))
    self.assertEqual(self.evaluate(opt.loss_scale()), initial_loss_scale * 16)

    # The wrapper must report the inner optimizer's slot names.
    self.assertEqual(opt.get_slot_names(), ['momentum'])
def test_loss_scale(self):
  """Checks `Policy.loss_scale` for explicit and default loss scales."""
  dynamic_cls = loss_scale_module.DynamicLossScale

  # float32: no loss scale unless one is passed in.
  default_fp32 = mp_policy.Policy('float32')
  self.assertEqual(default_fp32.loss_scale, None)

  explicit_none_fp32 = mp_policy.Policy('float32', loss_scale=None)
  self.assertEqual(explicit_none_fp32.loss_scale, None)

  # An explicit LossScale instance is kept as the very same object.
  ls = dynamic_cls()
  with_instance = mp_policy.Policy('float32', loss_scale=ls)
  self.assertIs(with_instance.loss_scale, ls)

  with_string = mp_policy.Policy('float32', loss_scale='dynamic')
  self.assertIsInstance(with_string.loss_scale, dynamic_cls)

  # mixed_float16: dynamic loss scale unless None is passed explicitly.
  default_fp16 = mp_policy.Policy('mixed_float16')
  self.assertIsInstance(default_fp16.loss_scale, dynamic_cls)

  explicit_none_fp16 = mp_policy.Policy('mixed_float16', loss_scale=None)
  self.assertEqual(explicit_none_fp16.loss_scale, None)

  # mixed_bfloat16: no loss scale.
  default_bf16 = mp_policy.Policy('mixed_bfloat16')
  self.assertEqual(default_bf16.loss_scale, None)
def testPassingV1LossScaleErrors(self):
  """Checks LossScaleOptimizerV1 rejects unsupported loss scale objects."""
  sgd = gradient_descent.SGD()

  # A DynamicLossScale whose multiplier is not 2 must raise ValueError.
  bad_multiplier_scale = tf_loss_scale_module.DynamicLossScale(multiplier=4)
  expected_value_error = ('When passing a DynamicLossScale to "loss_scale", '
                          'DynamicLossScale.multiplier must be 2. Got: '
                          'DynamicLossScale')
  with self.assertRaisesRegex(ValueError, expected_value_error):
    loss_scale_optimizer.LossScaleOptimizerV1(sgd, bad_multiplier_scale)

  class MyLossScale(tf_loss_scale_module.LossScale):

    def __call__(self):
      return 1.

    def update(self, grads):
      return None, True

    def get_config(self):
      return {}

  # A custom LossScale subclass must raise TypeError.
  expected_type_error = (
      'Passing a LossScale that is not a FixedLossScale or a '
      'DynamicLossScale is no longer supported. Got:')
  with self.assertRaisesRegex(TypeError, expected_type_error):
    loss_scale_optimizer.LossScaleOptimizerV1(sgd, MyLossScale())
def testCheckpoint(self, strategy_fn, save_with_ls, restore_with_ls):
  """Saves a checkpoint from one model and restores it into a fresh one.

  Args:
    strategy_fn: Callable returning the distribution strategy to test under.
    save_with_ls: If true, the optimizer that gets saved is wrapped in a
      LossScaleOptimizer; otherwise the bare MySGD is saved.
    restore_with_ls: If true, the optimizer that is restored into is wrapped
      in a LossScaleOptimizer; otherwise it is a bare MySGD.
  """

  class MySGD(gradient_descent.SGD):
    """A custom optimizer that tracks an extra variable."""

    def __init__(self, *args, **kwargs):
      super(MySGD, self).__init__(*args, **kwargs)
      self.my_var = variables.Variable(0.)
      self._track_trackable(self.my_var, 'my_var')

  strategy = strategy_fn()
  replicas = strategy.num_replicas_in_sync
  if (isinstance(strategy, mirrored_strategy.MirroredStrategy) and
      not context.executing_eagerly()):
    # TODO(b/121381184): Enable running the test in this case.
    return

  with self.test_session(), strategy.scope():
    # Build and run a simple model.
    var = variables.Variable([2.0])
    # `inner_opt` keeps a handle on the MySGD instance even when `opt` is
    # rebound to the wrapping LossScaleOptimizer below.
    opt = inner_opt = MySGD(1., momentum=1.)
    if save_with_ls:
      loss_scale = loss_scale_module.DynamicLossScale(
          initial_loss_scale=1., increment_period=2., multiplier=2.)
      opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
    run_fn = lambda: opt.minimize(lambda: var / replicas + 1., var_list=[var])
    opt_op = strategy.experimental_run(run_fn)
    self.evaluate(variables.global_variables_initializer())
    self.evaluate(strategy.experimental_local_results(opt_op))

    # Assert values.
    self.assertEqual(self.evaluate(var), 1.)
    if save_with_ls:
      self.assertEqual(self.evaluate(loss_scale()), 1.)
      self.assertEqual(self.evaluate(loss_scale._num_good_steps), 1)
    slot_var = opt.get_slot(var, 'momentum')
    self.assertEqual(self.evaluate(slot_var).item(), -1)
    self.assertEqual(self.evaluate(opt.iterations), 1)

    # Set optimizer variable to check arbitrary optimizer attributes can be
    # saved/restored
    self.evaluate(inner_opt.my_var.assign(1.))

    # Save a checkpoint.
    checkpoint = trackable_utils.Checkpoint(optimizer=opt, var=var)
    prefix = os.path.join(self.get_temp_dir(), 'ckpt')
    save_path = checkpoint.save(prefix)

    # Create new model
    # NOTE: `var`, `opt`, `inner_opt` (and `loss_scale`, when
    # `restore_with_ls` is true) are rebound to fresh objects from here on.
    var = variables.Variable([2.0])
    opt = inner_opt = MySGD(1., momentum=1.)
    if restore_with_ls:
      loss_scale = loss_scale_module.DynamicLossScale(
          initial_loss_scale=1., increment_period=2., multiplier=2.)
      opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)

    # Restore new model.
    checkpoint = trackable_utils.Checkpoint(optimizer=opt, var=var)
    status = checkpoint.restore(save_path)
    if save_with_ls:
      status.assert_existing_objects_matched()
    else:
      status.assert_nontrivial_match()

    # Assert restored values. We can only assert in eager mode since the
    # variables are uninitialized in graph mode
    if context.executing_eagerly():
      self.assertEqual(self.evaluate(var), 1.)
      if save_with_ls and restore_with_ls:
        self.assertEqual(self.evaluate(loss_scale()), 1.)
        self.assertEqual(self.evaluate(loss_scale._num_good_steps), 1)
      elif restore_with_ls:
        # Nothing was saved for the loss scale, so the expected values are
        # the ones it was constructed with.
        self.assertEqual(self.evaluate(loss_scale()), 1.)
        self.assertEqual(self.evaluate(loss_scale._num_good_steps), 0)
      self.assertEqual(self.evaluate(opt.iterations), 1)

    # Run the model again.
    run_fn = lambda: opt.minimize(lambda: var / replicas + 1., var_list=[var])
    opt_op = strategy.experimental_run(run_fn)

    # Assert new values.
    # The restore ops are run after the initializer so that the restored
    # values are what the step below starts from.
    self.evaluate(variables.global_variables_initializer())
    status.run_restore_ops()
    self.evaluate(strategy.experimental_local_results(opt_op))
    self.assertEqual(self.evaluate(var), -1)
    slot_var = opt.get_slot(var, 'momentum')
    self.assertEqual(self.evaluate(slot_var).item(), -2)
    self.assertEqual(self.evaluate(opt.iterations), 2)
    self.assertEqual(self.evaluate(inner_opt.my_var), 1)

    # Restore model again to test restoring after slots are created
    status = checkpoint.restore(save_path)
    if save_with_ls and restore_with_ls:
      status.assert_consumed()
    elif save_with_ls:
      status.assert_existing_objects_matched()
    elif restore_with_ls:
      status.assert_nontrivial_match()
    status.run_restore_ops()
    self.assertEqual(self.evaluate(var), 1)
    self.assertEqual(self.evaluate(slot_var).item(), -1)