def _test_helper(self, inputs, expected_outputs, initial_loss_scale=1., increment_period=2, multiplier=2): loss_scale = loss_scale_module.DynamicLossScale( initial_loss_scale=initial_loss_scale, increment_period=increment_period, multiplier=multiplier) itr = _get_example_iter(inputs) def update(): is_finite = itr.get_next() grad = self._get_tensor(is_finite) update_op, should_apply_gradients = loss_scale.update([grad]) assert_op = check_ops.assert_equal(should_apply_gradients, is_finite) if context.executing_eagerly(): return with ops.control_dependencies([assert_op]): return array_ops.identity(update_op) actual_outputs = [] if not context.executing_eagerly(): update_op = update() self.evaluate(variables.global_variables_initializer()) for _ in range(len(inputs)): if context.executing_eagerly(): update() else: self.evaluate(update_op) actual_outputs.append(self.evaluate(loss_scale())) self.assertEqual(actual_outputs, expected_outputs)
def testDynamicLossScaleWithSlots(self, strategy_fn): with strategy_fn().scope() as strategy: var = variables.Variable([1.0, 2.0]) # An SGD optimizer with momentum has slot variables. opt = gradient_descent.SGD(1.0, momentum=1.) initial_loss_scale = 2. loss_scale = loss_scale_module.DynamicLossScale( initial_loss_scale=initial_loss_scale, increment_period=1, multiplier=4) opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale) loss = lambda: var / strategy.num_replicas_in_sync run_fn = lambda: opt.minimize(loss, var_list=[var]) run_op = strategy.experimental_run(run_fn) self.evaluate(variables.global_variables_initializer()) self._run_if_in_graph_mode(run_op) # The momentum accumulator starts at 0 and the gradient is 1. The # accumulator is incremented by the gradient, so it is now 1. Then the # variable is subtracted by the accumulator, so the variable is subtracted # by 1. self.assertAllClose([0.0, 1.0], self.evaluate(var)) self.assertEqual(self.evaluate(opt._loss_scale()), initial_loss_scale * 4) run_op = strategy.experimental_run(run_fn) self._run_if_in_graph_mode(run_op) # The momentum accumulator was 1 before this step and the gradient is 1. # The accumulator is incremented by the gradient, so it is now 2. Then the # variable is subtracted by the accumulator, so the variable is subtracted # by 2. self.assertAllClose([-2., -1.], self.evaluate(var)) self.assertEqual(self.evaluate(opt._loss_scale()), initial_loss_scale * 16)
def testDynamicUpdate(self, strategy_fn): with strategy_fn().scope() as strategy: var = variables.Variable([1.0, 2.0]) opt = gradient_descent.SGD(1.0) loss_scale = loss_scale_module.DynamicLossScale( initial_loss_scale=2, increment_period=1, multiplier=2) opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale) # Test optimizer with finite gradients loss = lambda: var * 2.0 / strategy.num_replicas_in_sync run_fn = lambda: opt.minimize(loss, var_list=[var]) run_op = strategy.experimental_run(run_fn) self.evaluate(variables.global_variables_initializer()) self._run_if_in_graph_mode(run_op) # Gradient is 2, so variable will have 2 subtracted from it self.assertAllClose([-1.0, 0.0], self.evaluate(var)) # Loss scale has doubled from 2 to 4 self.assertEqual(4., self.evaluate(opt._loss_scale())) # Test optimizer with NaN gradients loss = lambda: var * float('NaN') run_fn = lambda: opt.minimize(loss, var_list=[var]) run_op = strategy.experimental_run(run_fn) self._run_if_in_graph_mode(run_op) # Variable should not change from before, due to NaN gradients. self.assertAllClose(self.evaluate(var), [-1.0, 0.0]) # Loss scale should half due to NaN gradients. self.assertEqual(2., self.evaluate(opt._loss_scale()))
def testDynamicLossScale(self, strategy_fn): strategy = strategy_fn() learning_rate = 2. expected_gradient = resource_variable_ops.ResourceVariable( learning_rate / strategy.num_replicas_in_sync) with strategy.scope(): var = variables.Variable([5.0]) opt = gradient_descent.SGD(learning_rate) loss_scale = loss_scale_module.DynamicLossScale( initial_loss_scale=2, increment_period=1, multiplier=2) opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale) self.assertEqual( loss_scale.initial_loss_scale % strategy.num_replicas_in_sync, 0) run_fn = self._run_fn_with_grad_check(strategy, var, opt, expected_gradient) run_op = strategy.experimental_run(run_fn) self.evaluate(variables.global_variables_initializer()) self._run_if_in_graph_mode(run_op) # The loss is the identity of the variable. Therefore the gradient is 1, # and so the variable will be init_val - grad * lr == 5 - 1 * 2 == 3 self.assertAllClose([3.], self.evaluate(var)) # Loss scale will be double, so the expected gradient is also doubled. self.evaluate(expected_gradient.assign( 2 * learning_rate / strategy.num_replicas_in_sync)) run_op = strategy.experimental_run(run_fn) self._run_if_in_graph_mode(run_op) # As before, the 2 is subtracted from the variable, making it's new value # 1. self.assertAllClose([1.], self.evaluate(var))
def test_serialization(self): loss_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=1, increment_period=2, multiplier=3) config = loss_scale_module.serialize(loss_scale) loss_scale = loss_scale_module.deserialize(config) self.evaluate(variables.global_variables_initializer()) self.assertEqual(self.evaluate(loss_scale()), 1) self.assertEqual(loss_scale.increment_period, 2) self.assertEqual(loss_scale.multiplier, 3)
def testCheckpoint(self, strategy_fn): strategy = strategy_fn() if (isinstance(strategy, mirrored_strategy.MirroredStrategy) and not context.executing_eagerly()): # TODO(b/121381184): Enable running the test in this case. return with self.test_session(), strategy.scope(): # Build and run a simple model. var = variables.Variable([2.0]) loss_scale = loss_scale_module.DynamicLossScale( initial_loss_scale=1., increment_period=2., multiplier=2.) opt = gradient_descent.SGD(1., momentum=1.) opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale) run_fn = lambda: opt.minimize(lambda: var + 1., var_list=[var]) opt_op = strategy.experimental_run(run_fn) self.evaluate(variables.global_variables_initializer()) self.evaluate(opt_op) self.assertEqual(self.evaluate(loss_scale()), 1.) self.assertEqual(self.evaluate(loss_scale._num_good_steps), 1) slot_var = opt._optimizer.get_slot(var, 'momentum') slot_value = self.evaluate(slot_var).item() # Save a checkpoint. checkpoint = trackable_utils.Checkpoint(optimizer=opt, var=var) prefix = os.path.join(self.get_temp_dir(), 'ckpt') save_path = checkpoint.save(prefix) # Run model again. self.evaluate(strategy.experimental_run(run_fn)) self.assertEqual(self.evaluate(loss_scale()), 2.) self.assertEqual(self.evaluate(loss_scale._num_good_steps), 0) self.assertNotAlmostEqual(self.evaluate(slot_var).item(), slot_value) # Load checkpoint and ensure loss scale is back to it's original value. status = checkpoint.restore(save_path) status.assert_consumed() status.run_restore_ops() self.assertEqual(self.evaluate(loss_scale()), 1.) self.assertEqual(self.evaluate(loss_scale._num_good_steps), 1) self.assertAlmostEqual(self.evaluate(slot_var).item(), slot_value)
def test_save_weights_with_dynamic_loss_scaling(self, strategy_fn): with context.eager_mode(): strategy = strategy_fn() if (isinstance(strategy, mirrored_strategy.MirroredStrategy) and not context.executing_eagerly()): # TODO(b/121381184): Enable running the test in this case. return # Create and run model. with strategy.scope(): x = layers.Input(shape=(2, ), batch_size=2, dtype=dtypes.float32) y = AddLayer(assert_type=dtypes.float32)(x) model = models.Model(inputs=x, outputs=y) loss_scale = loss_scale_module.DynamicLossScale( initial_loss_scale=1., increment_period=2., multiplier=2.) opt = gradient_descent.SGD(1.) opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale) model.compile(optimizer=opt, loss='mse') # Run for 3 steps (6 examples with a batch size of 2) model.fit(np.zeros((6, 2)), np.zeros((6, 2)), batch_size=2) self.assertEqual(backend.get_value(loss_scale()), 2) self.assertEqual(backend.get_value(loss_scale._num_good_steps), 1) # Save model weights. save_prefix = os.path.join(self.get_temp_dir(), 'ckpt') model.save_weights(save_prefix) # Run model again for 1 step (2 examples with a batch size of 2) model.fit(np.zeros((2, 2)), np.zeros((2, 2)), batch_size=2) self.assertEqual(backend.get_value(loss_scale()), 4) self.assertEqual(backend.get_value(loss_scale._num_good_steps), 0) # Load model weights and ensure loss scale weights are restored. model.load_weights(save_prefix) self.assertEqual(backend.get_value(loss_scale()), 2) self.assertEqual(backend.get_value(loss_scale._num_good_steps), 1)
def test_dynamic_loss_scaling(self, strategy_fn, cloning=True): strategy = strategy_fn() initial_loss_scale = 2. batch_size = 4 expected_gradient = backend.variable([initial_loss_scale / batch_size], dtype=dtypes.float16) # If this variable is set to True, the model below will have NaN gradients have_nan_gradients = backend.variable(False, dtype=dtypes.bool) with strategy.scope(): with policy.policy_scope(policy.Policy('infer_float32_vars')): x = layers.Input(shape=(1, ), batch_size=batch_size, dtype=dtypes.float16) layer = AddLayer(assert_type=dtypes.float16) y = layer(x) identity_with_nan_grads = ( mp_test_util.create_identity_with_nan_gradients_fn( have_nan_gradients)) y = core.Lambda(identity_with_nan_grads)(y) identity_with_grad_check_fn = ( mp_test_util.create_identity_with_grad_check_fn( expected_dtype=dtypes.float16, expected_gradient=expected_gradient)) y = core.Lambda(identity_with_grad_check_fn)(y) y = math_ops.cast(y, dtypes.float32) model = models.Model(inputs=x, outputs=y) def loss_fn(y_true, y_pred): del y_true return math_ops.reduce_mean(y_pred) opt = gradient_descent.SGD(1.) loss_scale = loss_scale_module.DynamicLossScale( initial_loss_scale=initial_loss_scale, increment_period=2) opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale) model.compile(opt, loss=loss_fn, cloning=cloning) self.assertEqual(backend.eval(layer.v), 1) x = np.ones((batch_size, 1)) y = np.ones((batch_size, 1)) dataset = dataset_ops.Dataset.from_tensor_slices( (x, y)).batch(batch_size) model.fit(dataset) # The variables starts with 1 and has a gradient of 1, so will go down by 1 # each step. self.assertEqual(backend.eval(layer.v), 0) model.fit(dataset) self.assertEqual(backend.eval(layer.v), -1) # There have been two steps without NaNs, so the loss scale will double backend.set_value(expected_gradient, backend.get_value(expected_gradient * 2)) model.fit(dataset) self.assertEqual(backend.eval(layer.v), -2) # Next test with NaN gradients. backend.set_value(have_nan_gradients, True) model.fit(dataset) # Variable should not be updated self.assertEqual(backend.eval(layer.v), -2) # Test with finite gradients again backend.set_value(have_nan_gradients, False) # The loss scale will be halved due to the NaNs, so the gradient will also # be halved backend.set_value(expected_gradient, backend.get_value(expected_gradient / 2)) model.fit(dataset) self.assertEqual(backend.eval(layer.v), -3)
def test_get(self): scalar = loss_scale_module.get('dynamic') scalar2 = loss_scale_module.DynamicLossScale() self.assertEqual(scalar.initial_loss_scale, scalar2.initial_loss_scale) self.assertEqual(scalar.increment_period, scalar2.increment_period) self.assertEqual(scalar.multiplier, scalar2.multiplier)
def test_update_with_none_gradients(self): loss_scale = loss_scale_module.DynamicLossScale() loss_scale.update([None])