def testFixedLossScaleAppliedToLossWithGetGradients(self):
    """Run `get_gradients` on a fixed-scale optimizer inside a fresh graph.

    The loss is passed through a gradient-check identity built from the loss
    scale, and the ops returned by `get_gradients` are then evaluated.
    """
    fixed_scale = 10.0
    with tf.Graph().as_default():
        variable = tf.Variable([2.0])
        wrapped_opt = loss_scale_optimizer.LossScaleOptimizer(
            gradient_descent.SGD(1.0),
            dynamic=False,
            initial_scale=fixed_scale,
        )
        check_grad = mp_test_util.create_identity_with_grad_check_fn(
            fixed_scale
        )
        checked_loss = check_grad(variable)
        gradients_op = wrapped_opt.get_gradients(checked_loss, [variable])
        self.evaluate(tf.compat.v1.global_variables_initializer())
        # Evaluating the gradients also runs the assertion op that
        # mp_test_util.create_identity_with_grad_check_fn added.
        self.evaluate(gradients_op)
def test_fixed_loss_scaling(self, strategy_fn):
    """Fit one batch with a fixed loss scale and check gradient and weight.

    Only loss scaling is exercised here; mixed precision is not tested.

    Args:
        strategy_fn: Zero-argument callable returning the distribution
            strategy whose scope the model is built and compiled in.
    """
    batch_size = 4
    loss_scale = 8.0
    # The gradient of the model output is 1 at the point where it is checked.
    # Loss scaling turns that into `loss_scale`, and because the loss is
    # averaged across batch elements it is divided by the batch size.
    expected_gradient = loss_scale / batch_size

    def loss_fn(y_true, y_pred):
        # The targets are ignored; the loss is just the mean prediction.
        del y_true
        return tf.reduce_mean(y_pred)

    with strategy_fn().scope():
        inputs = layers.Input(shape=(1,), batch_size=batch_size)
        multiply_layer = mp_test_util.MultiplyLayer()
        outputs = multiply_layer(inputs)
        check_grad = mp_test_util.create_identity_with_grad_check_fn(
            [expected_gradient]
        )
        outputs = core.Lambda(check_grad)(outputs)
        model = models.Model(inputs=inputs, outputs=outputs)

        optimizer = loss_scale_optimizer.LossScaleOptimizer(
            gradient_descent.SGD(1.0),
            dynamic=False,
            initial_scale=loss_scale,
        )
        model.compile(
            optimizer,
            loss=loss_fn,
            run_eagerly=test_utils.should_run_eagerly(),
        )

    self.assertEqual(backend.eval(multiply_layer.v), 1)

    features = np.ones((batch_size, 1))
    targets = np.ones((batch_size, 1))
    slices = tf.data.Dataset.from_tensor_slices((features, targets))
    dataset = slices.batch(batch_size)
    model.fit(dataset)

    # The variable starts at 1 and has a gradient of 1 subtracted from it.
    self.assertEqual(backend.eval(multiply_layer.v), 0)
def test_dynamic_loss_scaling(self, strategy_fn, get_config=False):
    """Fit repeatedly with a dynamic loss scale, including a NaN step.

    The expected gradient and the NaN switch are backend variables so they
    can be changed between `fit` calls without rebuilding the model.

    Args:
        strategy_fn: Zero-argument callable returning the distribution
            strategy whose scope the optimizer and model are created in.
        get_config: If True, the model is rebuilt from its own config via
            `from_config` before it is compiled.
    """
    strategy = strategy_fn()
    batch_size = 4
    initial_loss_scale = 2.0
    expected_gradient = backend.variable(
        [initial_loss_scale / batch_size], dtype=tf.float16
    )
    # Setting this variable to True makes the model below produce NaN
    # gradients.
    have_nan_gradients = backend.variable(False, dtype=tf.bool)

    def loss_fn(y_true, y_pred):
        # The targets are ignored; the loss is just the mean prediction.
        del y_true
        return tf.reduce_mean(y_pred)

    with strategy.scope():
        optimizer = loss_scale_optimizer.LossScaleOptimizer(
            gradient_descent.SGD(1.0),
            initial_scale=initial_loss_scale,
            dynamic_growth_steps=2,
        )
        with policy.policy_scope("mixed_float16"):
            inputs = layers.Input(
                shape=(1,), batch_size=batch_size, dtype=tf.float16
            )
            multiply_layer = mp_test_util.MultiplyLayer(
                assert_type=tf.float16
            )
            outputs = multiply_layer(inputs)

            nan_grads_fn = mp_test_util.create_identity_with_nan_gradients_fn(
                have_nan_gradients
            )
            outputs = core.Lambda(nan_grads_fn)(outputs)

            check_grad = mp_test_util.create_identity_with_grad_check_fn(
                expected_dtype=tf.float16,
                expected_gradient=expected_gradient,
            )
            outputs = core.Lambda(check_grad)(outputs)

            model = models.Model(inputs=inputs, outputs=outputs)
            if get_config:
                model = model.__class__.from_config(
                    model.get_config(),
                    custom_objects={
                        "MultiplyLayer": mp_test_util.MultiplyLayer
                    },
                )
                # Re-bind to the single MultiplyLayer of the rebuilt model;
                # the unpacking fails unless there is exactly one.
                [multiply_layer] = [
                    candidate
                    for candidate in model.layers
                    if isinstance(candidate, mp_test_util.MultiplyLayer)
                ]

            model.compile(
                optimizer,
                loss=loss_fn,
                run_eagerly=test_utils.should_run_eagerly(),
            )

    self.assertEqual(backend.eval(multiply_layer.v), 1)

    features = np.ones((batch_size, 1))
    targets = np.ones((batch_size, 1))
    slices = tf.data.Dataset.from_tensor_slices((features, targets))
    dataset = slices.batch(batch_size)

    def fit_and_expect_weight(expected_value):
        # Fit on the dataset once, then compare the layer variable.
        model.fit(dataset)
        self.assertEqual(backend.eval(multiply_layer.v), expected_value)

    # The variable starts at 1 and has a gradient of 1, so it goes down by 1
    # on each step.
    fit_and_expect_weight(0)
    fit_and_expect_weight(-1)

    # There have been two steps without NaNs, so the loss scale doubles.
    doubled_gradient = backend.get_value(expected_gradient * 2)
    backend.set_value(expected_gradient, doubled_gradient)
    fit_and_expect_weight(-2)

    # Next test with NaN gradients: the variable should not be updated.
    backend.set_value(have_nan_gradients, True)
    fit_and_expect_weight(-2)

    # Test with finite gradients again. The loss scale is halved because of
    # the NaNs, so the expected gradient is halved as well.
    backend.set_value(have_nan_gradients, False)
    halved_gradient = backend.get_value(expected_gradient / 2)
    backend.set_value(expected_gradient, halved_gradient)
    fit_and_expect_weight(-3)
def test_advanced_model(self, strategy_fn, use_loss_scaling=False):
    """Fit a four-layer mixed-precision model and check every layer weight.

    The advanced model tests mixed-precision-related features that would
    occur in a resnet50 model. It has:
      * Multiple layers, some of which use auto-cast variables and some of
        which do not.
      * Regularization on some variables and not others.
      * A fixed loss scale (if `use_loss_scaling` is True).

    Args:
        strategy_fn: Zero-argument callable returning the distribution
            strategy whose scope the model is built and compiled in.
        use_loss_scaling: If True, wrap the optimizer in a fixed-scale
            `LossScaleOptimizer` and add a gradient check to the model.
    """
    strategy = strategy_fn()
    if use_loss_scaling:
        loss_scale = 8.0
    learning_rate = 2**-14

    def loss_fn(y_true, y_pred):
        # The targets are ignored; the loss is just the mean prediction.
        del y_true
        return tf.reduce_mean(y_pred)

    with strategy.scope():
        with policy.policy_scope(policy.Policy("mixed_float16")):
            inputs = layers.Input(shape=(1,), batch_size=2)
            multiply_layers = (
                mp_test_util.MultiplyLayer(
                    assert_type=tf.float16,
                    regularizer=mp_test_util.IdentityRegularizer(),
                    use_operator=True,
                ),
                mp_test_util.MultiplyLayerWithoutAutoCast(
                    assert_type=tf.float16, use_operator=True
                ),
                mp_test_util.MultiplyLayer(
                    assert_type=tf.float16, use_operator=False
                ),
                mp_test_util.MultiplyLayerWithoutAutoCast(
                    assert_type=tf.float16,
                    regularizer=mp_test_util.IdentityRegularizer(),
                    use_operator=False,
                ),
            )

            # Chain the layers in the order they were created.
            outputs = inputs
            for multiply_layer in multiply_layers:
                outputs = multiply_layer(outputs)

            if use_loss_scaling:
                # The gradient of the output at this point is 1. With loss
                # scaling, the gradient is `loss_scale`. It is divided by
                # the batch size of 2 since the loss is averaged across
                # batch elements.
                check_grad = mp_test_util.create_identity_with_grad_check_fn(
                    expected_dtype=tf.float16,
                    expected_gradient=[loss_scale / 2],
                )
                outputs = core.Lambda(check_grad)(outputs)

            model = models.Model(inputs=inputs, outputs=outputs)

            optimizer = gradient_descent.SGD(learning_rate)
            if use_loss_scaling:
                optimizer = loss_scale_optimizer.LossScaleOptimizer(
                    optimizer, dynamic=False, initial_scale=loss_scale
                )
            model.compile(
                optimizer,
                loss=loss_fn,
                run_eagerly=test_utils.should_run_eagerly(),
            )

    features = np.ones((2, 1))
    targets = np.ones((2, 1))
    slices = tf.data.Dataset.from_tensor_slices((features, targets))
    model.fit(slices.batch(2))

    for multiply_layer in multiply_layers:
        # A layer with a weight regularizer reports losses and is expected
        # at 1 - 2 * learning_rate; a layer without one at 1 - learning_rate.
        expected_weight = (
            1 - 2 * learning_rate
            if multiply_layer.losses
            else 1 - learning_rate
        )
        self.assertEqual(backend.eval(multiply_layer.v), expected_weight)
def _run_fn_with_grad_check(self, strategy, var, opt, expected_grad):
    """Build a callable that minimizes a gradient-checked loss on `var`.

    Args:
        strategy: Object whose `num_replicas_in_sync` divides the loss.
        var: The variable passed through the gradient-check identity and
            listed as the only entry of `var_list`.
        opt: Optimizer whose `minimize` is invoked by the returned callable.
        expected_grad: Value handed to
            `mp_test_util.create_identity_with_grad_check_fn`.

    Returns:
        A zero-argument callable returning
        `opt.minimize(loss, var_list=[var])`. Nothing is minimized until
        that callable is invoked.
    """
    check_grad = mp_test_util.create_identity_with_grad_check_fn(
        expected_grad
    )

    def replica_loss():
        # Divide the checked value by the replica count.
        return check_grad(var) / strategy.num_replicas_in_sync

    def run_fn():
        return opt.minimize(replica_loss, var_list=[var])

    return run_fn