def _test_checkpointing_layer_weights(self, strategy_fn,
                                      mixed_prec_when_saving,
                                      mixed_prec_when_loading):
  """Saves an AddLayer checkpoint and restores it into a fresh AddLayer.

  Mixed precision may be enabled when saving and disabled when loading, or
  vice versa. This is possible because variables are float32 regardless of
  whether mixed precision is enabled.

  Args:
    strategy_fn: Callable returning an object whose `scope()` is entered while
      each layer is created and built.
    mixed_prec_when_saving: If true, the saved layer uses the
      'infer_float32_vars' policy with float16 inputs; otherwise 'infer' with
      float32 inputs.
    mixed_prec_when_loading: Same choice, for the layer restored into.
  """
  mixed = ('infer_float32_vars', 'float16')
  plain = ('infer', 'float32')
  save_policy, save_input_dtype = mixed if mixed_prec_when_saving else plain
  load_policy, load_input_dtype = mixed if mixed_prec_when_loading else plain

  # Create a layer and save a checkpoint.
  inputs = constant_op.constant([1.], dtype=save_input_dtype)
  with strategy_fn().scope(), policy.policy_scope(save_policy):
    layer = AddLayer(assert_type=save_input_dtype)
    layer.build(())
  layer.set_weights([np.array(100.)])
  self.assertEqual(self.evaluate(layer(inputs)), 101.)
  checkpoint = trackable_utils.Checkpoint(layer=layer)
  ckpt_prefix = os.path.join(self.get_temp_dir(), 'ckpt')
  save_path = checkpoint.save(ckpt_prefix)

  # Create a new layer and restore the checkpoint.
  inputs = constant_op.constant([1.], dtype=load_input_dtype)
  with strategy_fn().scope(), policy.policy_scope(load_policy):
    layer = AddLayer(assert_type=load_input_dtype)
    layer.build(())
  layer.set_weights([np.array(200.)])
  self.assertEqual(self.evaluate(layer(inputs)), 201.)
  checkpoint = trackable_utils.Checkpoint(layer=layer)
  restore_status = checkpoint.restore(save_path)
  restore_status.assert_consumed().run_restore_ops()
  # The restored weight replaces the 200. set just above.
  self.assertEqual(layer.get_weights(), [100.])
  self.assertEqual(self.evaluate(layer(inputs)), 101.)
def test_policy_scope(self):
  """Checks nested policy scopes set, then restore, the global policy name."""
  outer_name, inner_name = 'infer_float32_vars', 'infer'

  def current_name():
    return mp_policy.global_policy().name

  with mp_policy.policy_scope(outer_name):
    self.assertEqual(current_name(), outer_name)
    with mp_policy.policy_scope(inner_name):
      self.assertEqual(current_name(), inner_name)
    # Leaving the inner scope brings back the outer policy.
    self.assertEqual(current_name(), outer_name)
  # Outside every scope the global policy is expected to be 'infer'.
  self.assertEqual(current_name(), 'infer')
def test_error_if_policy_is_set(self):
  """Checks the graph rewrite raises only under the 'mixed_float16' policy."""
  with policy.policy_scope('mixed_float16'):
    # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
    # which Python 3.12 removed from unittest.
    with self.assertRaisesRegex(
        ValueError, 'the global Keras dtype Policy has been set'):
      enable_mixed_precision_graph_rewrite(gradient_descent_v2.SGD(1.0))
  # Test no error is thrown when the policy is currently the default.
  enable_mixed_precision_graph_rewrite(gradient_descent_v2.SGD(1.0))
  # Test no error is thrown when the policy is a non-mixed policy.
  with policy.policy_scope('float64'):
    enable_mixed_precision_graph_rewrite(gradient_descent_v2.SGD(1.0))
def test_policy_scope(self):
  """Checks nested policy scopes set, then restore, the global policy name."""
  # The name expected outside any scope depends on the dtype behavior.
  default_policy = (
      'float32' if base_layer_utils.v2_dtype_behavior_enabled() else '_infer')

  def current_name():
    return mp_policy.global_policy().name

  with mp_policy.policy_scope('mixed_float16'):
    self.assertEqual(current_name(), 'mixed_float16')
    with mp_policy.policy_scope('_infer'):
      self.assertEqual(current_name(), '_infer')
    # Leaving the inner scope brings back the outer policy.
    self.assertEqual(current_name(), 'mixed_float16')
  self.assertEqual(current_name(), default_policy)
def test_v1_dtype_behavior(self):
  """Checks that entering a policy scope raises for each listed policy."""
  # Setting global policies is not allowed with V1 dtype behavior.
  expected_error = 'global policy can only be set in TensorFlow 2'
  for policy_name in ('_infer', 'float32', 'mixed_float16'):
    with self.assertRaisesRegex(ValueError, expected_error):
      with mp_policy.policy_scope(mp_policy.Policy(policy_name)):
        pass
def test_gradient(self, strategy_fn):
  """Checks one SGD step changes the layer variable by the learning rate.

  Args:
    strategy_fn: Callable returning an object whose `scope()` yields the
      strategy used to run the training step.
  """
  # Learning rate is small enough that if applied to a float16 variable, the
  # variable will not change. So this tests the learning rate is not applied
  # to a float16 value, but instead the float32 variable.
  learning_rate = 2 ** -14
  inputs = constant_op.constant([1.], dtype=dtypes.float16)
  with strategy_fn().scope() as strategy, policy.policy_scope(
      'infer_float32_vars'):
    layer = AddLayer(assert_type=dtypes.float16)

    def run_fn():
      with backprop.GradientTape() as tape:
        loss = layer(inputs)
        # Divide by num_replicas_in_sync, as the effective total loss is the
        # sum of each of the replica's losses.
        loss /= strategy.num_replicas_in_sync
      opt = gradient_descent.SGD(learning_rate)
      grad = tape.gradient(loss, layer.v)
      return opt.apply_gradients([(grad, layer.v)])

    train_op = strategy.experimental_run(run_fn)
    if not context.executing_eagerly():
      self.evaluate(variables.global_variables_initializer())
      self.evaluate(train_op)
    # The gradient with respect to the variable is 1. Since the variable is
    # initialized with 1 and the learning rate is 2**-14, the new variable
    # value should be: init_val - gradient * learning_rate, which is
    # 1 - 1 * 2**-14
    self.assertEqual(self.evaluate(layer.v), 1 - learning_rate)
def testMixedPrecision(self, required_gpus):
  """Runs the mixed precision client test under the 'mixed_float16' policy.

  Args:
    required_gpus: Forwarded as `num_gpus` to the between-graph client runner.
  """
  if test_util.is_xla_enabled():
    # Report a skip rather than returning silently, so the run does not count
    # this case as a pass.
    self.skipTest('Test gets NaNs with XLA')
  with policy.policy_scope('mixed_float16'):
    self._run_between_graph_clients(self._test_mixed_precision,
                                    self._cluster_spec,
                                    num_gpus=required_gpus)
def test_gradient(self, strategy_fn):
  """Checks one SGD step changes the layer variable by the learning rate.

  Args:
    strategy_fn: Callable returning an object whose `scope()` yields the
      strategy used to run the training step.
  """
  # Learning rate is small enough that if applied to a float16 variable, the
  # variable will not change. So this tests the learning rate is not applied
  # to a float16 value, but instead the float32 variable.
  learning_rate = 2**-14
  inputs = constant_op.constant([1.], dtype=dtypes.float16)
  with strategy_fn().scope() as strategy, policy.policy_scope(
      'infer_float32_vars'):
    layer = AddLayer(assert_type=dtypes.float16)

    def run_fn():
      with backprop.GradientTape() as tape:
        loss = layer(inputs)
        # Divide by num_replicas_in_sync, as the effective total loss is the
        # sum of each of the replica's losses.
        loss /= strategy.num_replicas_in_sync
      opt = gradient_descent.SGD(learning_rate)
      grad = tape.gradient(loss, layer.v)
      return opt.apply_gradients([(grad, layer.v)])

    train_op = strategy.experimental_run(run_fn)
    if not context.executing_eagerly():
      self.evaluate(variables.global_variables_initializer())
      self.evaluate(train_op)
    # The gradient with respect to the variable is 1. Since the variable is
    # initialized with 1 and the learning rate is 2**-14, the new variable
    # value should be: init_val - gradient * learning_rate, which is
    # 1 - 1 * 2**-14
    self.assertEqual(self.evaluate(layer.v), 1 - learning_rate)
def test_int32_with_float32_vars(self, strategy_fn):
  """Checks a layer created under the 'int32_with_float32_vars' policy.

  The policy int32_with_float32_vars is not useful at all (nor is any other
  non-float policy with float32 variables), but it exists for consistency,
  and so it is tested.

  Args:
    strategy_fn: Callable returning an object whose `scope()` is entered for
      the test body.
  """

  class IdentityLayerWithVar(base_layer.Layer):

    def build(self, _):
      self.v = self.add_weight('v', ())

    def call(self, inputs):
      # Variables are only casted to other floats, not ints
      var_copy = array_ops.identity(self.v)
      assert var_copy.dtype == 'float32'
      return array_ops.identity(inputs)

  policy_name = 'int32_with_float32_vars'
  int_input = constant_op.constant([1])
  with strategy_fn().scope():
    with policy.policy_scope(policy_name):
      layer = IdentityLayerWithVar()
      self.assertEqual(layer.dtype, dtypes.float32)
      self.assertEqual(layer._dtype_policy._name, policy_name)
      output = layer(int_input)
      self.assertEqual(layer.v.dtype, dtypes.float32)
      self.assertEqual(output.dtype, dtypes.int32)
def test_model(self, strategy_fn, use_operator=False, use_regularizer=False,
               cloning=True):
  """Fits a one-layer model once and checks the resulting variable value.

  Args:
    strategy_fn: Callable returning an object whose `scope()` is entered for
      model construction and fitting.
    use_operator: Forwarded to `AddLayer`.
    use_regularizer: If true, an `IdentityRegularizer` is attached to the
      layer and the expected update is doubled.
    cloning: Forwarded to `model.compile`.
  """
  # Learning rate is small enough that if applied to a float16 variable, the
  # variable will not change. So this tests the learning rate is not applied
  # to a float16 value, but instead the float32 variable.
  learning_rate = 2 ** -14
  regularizer = None
  if use_regularizer:
    regularizer = IdentityRegularizer()

  with strategy_fn().scope():
    with policy.policy_scope('infer_float32_vars'):
      model_input = layers.Input(shape=(1,), batch_size=2,
                                 dtype=dtypes.float16)
      layer = AddLayer(assert_type=dtypes.float16, use_operator=use_operator,
                       regularizer=regularizer)
      model_output = math_ops.cast(layer(model_input), dtypes.float32)
      model = models.Model(inputs=model_input, outputs=model_output)

      def loss_fn(y_true, y_pred):
        del y_true
        return math_ops.reduce_mean(y_pred)

      opt = gradient_descent.SGD(learning_rate)
      model.compile(opt, loss=loss_fn, cloning=cloning)

    self.assertEqual(backend.eval(layer.v), 1)
    features = np.ones((2, 1))
    targets = np.ones((2, 1))
    dataset = dataset_ops.Dataset.from_tensor_slices(
        (features, targets)).batch(2)
    model.fit(dataset)
    # Variable starts at 1, and should have gradient of 2 ** -14 subtracted
    # from it. The regularizer adds another 2 ** -14 to the gradient.
    expected = 1 - learning_rate
    if use_regularizer:
      expected -= learning_rate
    self.assertEqual(backend.eval(layer.v), expected)
def test_save_slot_variables_with_autocast_vars(self, strategy_fn,
                                                var_name='v'):
  """Checks the optimizer 'momentum' slot round-trips through saved weights.

  Args:
    strategy_fn: Callable returning an object whose `scope()` is entered for
      model construction.
    var_name: Forwarded to `AddLayer`. Having a var_name other than 'v' tests
      that a fixed bug (b/134713714) does not reoccur. The bug was that a
      crash would occur when saving a checkpoint where an AutoCastVariable
      with a slot variable would have a different name than the layer
      attribute's name (layer.v in this case).
  """
  if not self._is_strategy_supported(strategy_fn):
    return

  def momentum_slot():
    return backend.get_value(opt.get_slot(layer.v, 'momentum'))

  def fit_once():
    model.fit(np.zeros((2, 2)), np.zeros((2, 2)), batch_size=2)

  with strategy_fn().scope():
    with policy.policy_scope('infer_float32_vars'):
      model_input = layers.Input(shape=(2,), batch_size=2,
                                 dtype=dtypes.float16)
      layer = AddLayer(assert_type=dtypes.float16, var_name=var_name)
      model_output = math_ops.cast(layer(model_input), dtypes.float32)
      model = models.Model(inputs=model_input, outputs=model_output)
      opt = gradient_descent.SGD(1., 1.)
      model.compile(optimizer=opt, loss='mse',
                    run_eagerly=testing_utils.should_run_eagerly())

  fit_once()
  weights_file = os.path.join(self.get_temp_dir(), 'weights')
  model.save_weights(weights_file)
  saved_slot = momentum_slot()

  # A second fit must move the slot away from the saved value.
  fit_once()
  new_slot = momentum_slot()
  self.assertNotEqual(new_slot, saved_slot)

  model.load_weights(weights_file)
  restored_slot = momentum_slot()
  self.assertEqual(restored_slot, saved_slot)
def test_error_if_policy_is_set(self):
  """Checks the graph rewrite raises under the 'infer_float32_vars' policy."""
  with policy.policy_scope('infer_float32_vars'):
    # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
    # which Python 3.12 removed from unittest.
    with self.assertRaisesRegex(
        ValueError, 'a keras mixed precision Policy has been set'):
      enable_mixed_precision_graph_rewrite(gradient_descent_v2.SGD(1.0))
  # Test no error is thrown when the policy is currently the default.
  enable_mixed_precision_graph_rewrite(gradient_descent_v2.SGD(1.0))
def test_pass_invalid_optimizer_with_loss_scaling(self):
  """Checks compiling with `optimizers.SGD` raises under a loss-scale policy."""
  with policy.policy_scope(policy.Policy('float32', loss_scale=10.)):
    x = layers.Input(shape=(1,))
    y = AddLayer()(x)
    model = models.Model(x, y)
    # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
    # which Python 3.12 removed from unittest.
    with self.assertRaisesRegex(ValueError,
                                'optimizer" must be an instance of '):
      model.compile(optimizers.SGD(1.), 'mse')
def test_v1_dtype_behavior(self):
  """Checks which policies can be scoped with V1 dtype behavior."""
  # These policies are allowed with V1 dtype behavior
  with mp_policy.policy_scope(mp_policy.Policy('infer')):
    pass
  with mp_policy.policy_scope(mp_policy.Policy('infer_float32_vars')):
    pass

  # These policies are not allowed with V1 dtype behavior.
  # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias, which
  # Python 3.12 removed from unittest.
  with self.assertRaisesRegex(
      ValueError, 'the V2 layer dtype behavior must be enabled'):
    with mp_policy.policy_scope(mp_policy.Policy('float32')):
      pass
  with self.assertRaisesRegex(
      ValueError, 'the V2 layer dtype behavior must be enabled'):
    with mp_policy.policy_scope(
        mp_policy.Policy('float16_with_float32_vars')):
      pass
def test_functional_model_loss_dtype(self):
  """Checks a float32 loss added to a 'float16'-policy model stays float32."""
  with policy.policy_scope('float16'):
    model_input = layers.Input(shape=(1,))
    model_output = AddLayer()(model_input)
    model = models.Model(model_input, model_output)
    float32_loss = math_ops.cast(model_output, 'float32')
    model.add_loss(float32_loss)
    # The loss should not be casted to the policy's dtype.
    first_loss = model.losses[0]
    self.assertEqual(first_loss.dtype, 'float32')
def test_variable_not_casted_for_int_inputs(self, strategy_fn):
  """Checks an Embedding fed int32 input keeps float32 variables and output.

  Args:
    strategy_fn: Callable returning an object whose `scope()` is entered for
      the test body.
  """
  int_ids = constant_op.constant([[1]], dtype=dtypes.int32)
  with strategy_fn().scope(), policy.policy_scope('infer_float32_vars'):
    embedding = layers.Embedding(input_dim=10, output_dim=32)
    embedded = embedding(int_ids)
    self.assertEqual(embedding.embeddings.dtype, dtypes.float32)
    self.assertEqual(embedded.dtype, dtypes.float32)
def test_model(self, strategy_fn, use_operator=False, use_regularizer=False,
               policy_name='mixed_float16',
               experimental_run_tf_function=True):
  """Fits a small model once and checks the resulting variable value.

  Args:
    strategy_fn: Callable returning an object whose `scope()` is entered for
      model construction.
    use_operator: Forwarded to `AddLayer`.
    use_regularizer: If true, an `IdentityRegularizer` is attached to the
      layer and the expected update is doubled.
    policy_name: Name of the policy the model is built under.
    experimental_run_tf_function: Not read by the body; `model.compile` takes
      its value from `testing_utils.should_run_tf_function()` instead.
  """
  if not self._is_strategy_supported(strategy_fn, check_model_type=True):
    return

  # Learning rate is small enough that if applied to a float16 variable, the
  # variable will not change. So this tests the learning rate is not applied
  # to a float16 value, but instead the float32 variable.
  learning_rate = 2**-14
  regularizer = None
  if use_regularizer:
    regularizer = IdentityRegularizer()

  with strategy_fn().scope():
    # Pass loss_scale=None, as this test will fail if the DynamicLossScale
    # skips applying gradients for a step
    model_policy = policy.Policy(policy_name, loss_scale=None)
    with policy.policy_scope(model_policy):
      layer_list = []
      if testing_utils.get_model_type() == 'subclass':
        # Subclassed models do not have an Input layer, so the model does not
        # cast inputs to the Input layer's dtype. Therefore, we need to
        # manually insert a float16 cast.
        layer_list.append(
            layers.Lambda(lambda x: math_ops.cast(x, 'float16'),
                          input_shape=(1,)))
      layer = AddLayer(assert_type=dtypes.float16, use_operator=use_operator,
                       regularizer=regularizer, input_shape=(1,))
      cast_f32_layer = layers.Lambda(lambda x: math_ops.cast(x, 'float32'))
      layer_list += [layer, cast_f32_layer]
      model = testing_utils.get_model_from_layers(
          layer_list, input_shape=(1,), input_dtype=dtypes.float16)

      def loss_fn(y_true, y_pred):
        del y_true
        return math_ops.reduce_mean(y_pred)

      opt = gradient_descent.SGD(learning_rate)
      model.compile(
          opt,
          loss=loss_fn,
          run_eagerly=testing_utils.should_run_eagerly(),
          experimental_run_tf_function=testing_utils.should_run_tf_function())

  features = np.ones((2, 1))
  targets = np.ones((2, 1))
  dataset = dataset_ops.Dataset.from_tensor_slices(
      (features, targets)).batch(2)
  model.fit(dataset)
  # Variable starts at 1, and should have gradient of 2 ** -14 subtracted
  # from it. The regularizer adds another 2 ** -14 to the gradient.
  expected = 1 - learning_rate
  if use_regularizer:
    expected -= learning_rate
  self.assertEqual(backend.eval(layer.v), expected)
def test_loss_scale_optimizer_overrides_policy_loss_scale(self):
  """Checks the optimizer's loss scale (5.) wins over the policy's (10.)."""
  scaled_policy = policy.Policy('float32', loss_scale=10.)
  with policy.policy_scope(scaled_policy):
    base_opt = gradient_descent.SGD(1.)
    opt = loss_scale_optimizer.LossScaleOptimizer(base_opt, loss_scale=5.)
    model_input = layers.Input(shape=(1,))
    model_output = AddLayer()(model_input)
    model = models.Model(model_input, model_output)
    model.compile(opt, loss='mse')
    compiled_scale = self.evaluate(model.optimizer.loss_scale())
    self.assertEqual(compiled_scale, 5.)
def test_v1_dtype_behavior(self):
  """Checks which policies can be scoped with V1 dtype behavior."""
  # Only the "infer" policy is allowed with V1 dtype behavior
  with mp_policy.policy_scope(mp_policy.Policy('infer')):
    pass

  # Non-infer policies are not allowed with V1 dtype behavior.
  # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias, which
  # Python 3.12 removed from unittest.
  with self.assertRaisesRegex(
      ValueError,
      'global policy can only be set to a non-infer policy in TensorFlow 2'):
    with mp_policy.policy_scope(mp_policy.Policy('float32')):
      pass
  with self.assertRaisesRegex(
      ValueError,
      'global policy can only be set to a non-infer policy in TensorFlow 2'):
    with mp_policy.policy_scope(mp_policy.Policy('mixed_float16')):
      pass
def testMixedPrecision(self, num_gpus):
  """Runs the mixed precision client test under the 'mixed_float16' policy.

  The test is skipped when fewer than `num_gpus` GPUs are available or when
  XLA is enabled.

  Args:
    num_gpus: Number of GPUs required; also forwarded to the client runner.
  """
  # skipTest raises, so at most one of these branches ever runs.
  if num_gpus > context.num_gpus():
    self.skipTest('Not enough GPUs')
  elif test_util.is_xla_enabled():
    self.skipTest('Test gets NaNs with XLA')

  with policy.policy_scope('mixed_float16'):
    self._run_between_graph_clients(
        self._test_mixed_precision, self._cluster_spec, num_gpus=num_gpus)
def test_v1_dtype_behavior(self):
  """Checks which policies can be scoped with V1 dtype behavior."""
  # These policies are allowed with V1 dtype behavior
  with mp_policy.policy_scope(mp_policy.Policy('infer')):
    pass
  with mp_policy.policy_scope(mp_policy.Policy('infer_float32_vars')):
    pass

  # These policies are not allowed with V1 dtype behavior.
  # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias, which
  # Python 3.12 removed from unittest.
  with self.assertRaisesRegex(
      ValueError,
      'global policy can only be set to a non-infer policy in TensorFlow 2'):
    with mp_policy.policy_scope(mp_policy.Policy('float32')):
      pass
  with self.assertRaisesRegex(
      ValueError,
      'global policy can only be set to a non-infer policy in TensorFlow 2'):
    with mp_policy.policy_scope(
        mp_policy.Policy('float16_with_float32_vars')):
      pass
def test_lstm_model_correctness_mixed_precision(self, distribution, use_numpy,
                                                use_validation_data):
  """Runs the correctness test under a mixed precision policy.

  Args:
    distribution: Strategy under test; TPU strategies select 'mixed_bfloat16',
      anything else selects 'mixed_float16'.
    use_numpy: Forwarded to `run_correctness_test`.
    use_validation_data: Forwarded to `run_correctness_test`.
  """
  tpu_strategies = (tpu_strategy.TPUStrategy, tpu_strategy.TPUStrategyV1)
  policy_name = ('mixed_bfloat16'
                 if isinstance(distribution, tpu_strategies)
                 else 'mixed_float16')

  with policy.policy_scope(policy_name):
    self.run_correctness_test(distribution, use_numpy, use_validation_data)
def test_layer_calling_tf_function(self, strategy_fn):
  """Checks AddLayerWithFunction keeps a float32 variable and float16 output.

  Args:
    strategy_fn: Callable returning an object whose `scope()` is entered for
      the test body.
  """
  half_input = constant_op.constant([1.], dtype=dtypes.float16)
  with strategy_fn().scope(), policy.policy_scope('infer_float32_vars'):
    layer = AddLayerWithFunction(assert_type=dtypes.float16)
    output = layer(half_input)
    self.assertEqual(layer.v.dtype, dtypes.float32)
    self.assertEqual(output.dtype, dtypes.float16)
    self.evaluate(variables.global_variables_initializer())
    self.assertEqual(self.evaluate(output), 2.)
def test_layer_with_non_autocast_variable(self, strategy_fn):
  """Checks AddLayerWithoutAutoCast keeps a float32 variable, float16 output.

  Args:
    strategy_fn: Callable returning an object whose `scope()` is entered for
      the test body.
  """
  half_input = constant_op.constant([1.], dtype=dtypes.float16)
  with strategy_fn().scope(), policy.policy_scope('infer_float32_vars'):
    layer = AddLayerWithoutAutoCast(assert_type=dtypes.float16)
    output = layer(half_input)
    self.assertEqual(layer.v.dtype, dtypes.float32)
    self.assertEqual(output.dtype, dtypes.float16)
    self.evaluate(variables.global_variables_initializer())
    self.assertEqual(self.evaluate(output), 2.)
def test_layer_with_non_autocast_variable(self, strategy_fn):
  """Checks AddLayerWithoutAutoCast under the 'mixed_float16' policy.

  The input constant is created without an explicit dtype; the output is
  still expected to be float16 and the variable float32.

  Args:
    strategy_fn: Callable returning an object whose `scope()` is entered for
      the test body.
  """
  layer_input = constant_op.constant([1.])
  with strategy_fn().scope(), policy.policy_scope('mixed_float16'):
    layer = AddLayerWithoutAutoCast(assert_type=dtypes.float16)
    output = layer(layer_input)
    self.assertEqual(layer.v.dtype, dtypes.float32)
    self.assertEqual(output.dtype, dtypes.float16)
    self.evaluate(variables.global_variables_initializer())
    self.assertEqual(self.evaluate(output), 2.)
def test_pass_invalid_optimizer_with_loss_scaling(self):
  """Checks compiling with `optimizers.SGD` raises under a loss-scale policy.

  The expected error message differs between eager and graph execution.
  """
  with policy.policy_scope(policy.Policy('float32', loss_scale=10.)):
    x = layers.Input(shape=(1,))
    y = mp_test_util.MultiplyLayer()(x)
    model = models.Model(x, y)
    if context.executing_eagerly():
      error_msg = 'Use a `tf.keras` Optimizer instead'
    else:
      error_msg = 'optimizer" must be an instance of '
    # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
    # which Python 3.12 removed from unittest.
    with self.assertRaisesRegex(ValueError, error_msg):
      model.compile(optimizers.SGD(1.), 'mse')
def test_advanced_model(self, strategy_fn):
  """Fits a four-layer model once and checks each layer's variable.

  The advanced model tests mixed-precision-related features that would occur
  in a resnet50 model. It tests a model that has:
    * Multiple layers, some which use auto-cast variables and some which do
      not
    * Regularization on some variables and not others.

  Args:
    strategy_fn: Callable returning the strategy whose `scope()` is entered
      for model construction and fitting.
  """
  strategy = strategy_fn()
  learning_rate = 2**-14

  with strategy.scope():
    with policy.policy_scope(policy.Policy('infer_float32_vars')):
      model_input = layers.Input(shape=(), batch_size=2,
                                 dtype=dtypes.float16)
      layer1 = AddLayer(
          assert_type=dtypes.float16,
          regularizer=IdentityRegularizer(),
          use_operator=True)
      layer2 = AddLayerWithoutAutoCast(
          assert_type=dtypes.float16, use_operator=True)
      layer3 = AddLayer(assert_type=dtypes.float16, use_operator=False)
      layer4 = AddLayerWithoutAutoCast(
          assert_type=dtypes.float16,
          regularizer=IdentityRegularizer(),
          use_operator=False)
      all_layers = (layer1, layer2, layer3, layer4)

      # Chain the layers in order, then cast the result back to float32.
      activations = model_input
      for stacked_layer in all_layers:
        activations = stacked_layer(activations)
      model_output = math_ops.cast(activations, dtypes.float32)
      model = models.Model(inputs=model_input, outputs=model_output)

      def loss_fn(y_true, y_pred):
        self.assertEqual(y_true.dtype, dtypes.float32)
        self.assertEqual(y_pred.dtype, dtypes.float32)
        return math_ops.reduce_mean(y_pred)

      opt = gradient_descent.SGD(learning_rate)
      model.compile(opt, loss=loss_fn)

    features = np.ones((2, 1))
    targets = np.ones((2, 1))
    dataset = dataset_ops.Dataset.from_tensor_slices(
        (features, targets)).batch(2)
    model.fit(dataset)
    for layer in all_layers:
      if layer.losses:
        # Layer has weight regularizer
        expected = 1 - 2 * learning_rate
      else:
        # Layer does not have weight regularizer
        expected = 1 - learning_rate
      self.assertEqual(backend.eval(layer.v), expected)
def test_error_if_graph_rewrite_enabled(self):
  """Checks 'mixed_float16' cannot be set while the graph rewrite is enabled.

  The graph rewrite is always disabled again on exit, even if an assertion
  fails.
  """
  expected_error = ('cannot be set to "mixed_float16", .* the mixed '
                    'precision graph rewrite has already been enabled')
  try:
    mixed_precision.enable_mixed_precision_graph_rewrite(
        gradient_descent.SGD(1.))
    with self.assertRaisesRegex(ValueError, expected_error):
      mp_policy.set_policy('mixed_float16')
    # Non-mixed policies are allowed
    with mp_policy.policy_scope('float64'):
      pass
  finally:
    mixed_precision.disable_mixed_precision_graph_rewrite()
def test_model(self, strategy_fn, use_operator=False, use_regularizer=False,
               cloning=True):
  """Fits a one-layer model once and checks the resulting variable value.

  Skipped (b/137397816) when `testing_utils.should_run_distributed()` is
  true, and returns early for unsupported strategies.

  Args:
    strategy_fn: Callable returning an object whose `scope()` is entered for
      model construction and fitting.
    use_operator: Forwarded to `AddLayer`.
    use_regularizer: If true, an `IdentityRegularizer` is attached to the
      layer and the expected update is doubled.
    cloning: Forwarded to `model.compile`.
  """
  if testing_utils.should_run_distributed():
    self.skipTest('b/137397816')
  if not self._is_strategy_supported(strategy_fn):
    return

  # Learning rate is small enough that if applied to a float16 variable, the
  # variable will not change. So this tests the learning rate is not applied
  # to a float16 value, but instead the float32 variable.
  learning_rate = 2**-14
  regularizer = None
  if use_regularizer:
    regularizer = IdentityRegularizer()

  with strategy_fn().scope():
    with policy.policy_scope('infer_float32_vars'):
      model_input = layers.Input(shape=(1,), batch_size=2,
                                 dtype=dtypes.float16)
      layer = AddLayer(assert_type=dtypes.float16, use_operator=use_operator,
                       regularizer=regularizer)
      model_output = math_ops.cast(layer(model_input), dtypes.float32)
      model = models.Model(inputs=model_input, outputs=model_output)

      def loss_fn(y_true, y_pred):
        del y_true
        return math_ops.reduce_mean(y_pred)

      opt = gradient_descent.SGD(learning_rate)
      model.compile(
          opt,
          loss=loss_fn,
          cloning=cloning,
          run_eagerly=testing_utils.should_run_eagerly(),
          run_distributed=testing_utils.should_run_distributed())

    self.assertEqual(backend.eval(layer.v), 1)
    features = np.ones((2, 1))
    targets = np.ones((2, 1))
    dataset = dataset_ops.Dataset.from_tensor_slices(
        (features, targets)).batch(2)
    model.fit(dataset)
    # Variable starts at 1, and should have gradient of 2 ** -14 subtracted
    # from it. The regularizer adds another 2 ** -14 to the gradient.
    expected = 1 - learning_rate
    if use_regularizer:
      expected -= learning_rate
    self.assertEqual(backend.eval(layer.v), expected)
def test_advanced_model(self, strategy_fn):
  """Fits a four-layer model once and checks each layer's variable.

  The advanced model tests mixed-precision-related features that would occur
  in a resnet50 model. It tests a model that has:
    * Multiple layers, some which use auto-cast variables and some which do
      not
    * Regularization on some variables and not others.

  Args:
    strategy_fn: Callable returning the strategy whose `scope()` is entered
      for model construction and fitting.
  """
  strategy = strategy_fn()
  learning_rate = 2 ** -14

  with strategy.scope():
    with policy.policy_scope(policy.Policy('infer_float32_vars')):
      model_input = layers.Input(shape=(), batch_size=2,
                                 dtype=dtypes.float16)
      layer1 = AddLayer(
          assert_type=dtypes.float16,
          regularizer=IdentityRegularizer(),
          use_operator=True)
      layer2 = AddLayerWithoutAutoCast(
          assert_type=dtypes.float16, use_operator=True)
      layer3 = AddLayer(assert_type=dtypes.float16, use_operator=False)
      layer4 = AddLayerWithoutAutoCast(
          assert_type=dtypes.float16,
          regularizer=IdentityRegularizer(),
          use_operator=False)
      all_layers = (layer1, layer2, layer3, layer4)

      # Chain the layers in order, then cast the result back to float32.
      activations = model_input
      for stacked_layer in all_layers:
        activations = stacked_layer(activations)
      model_output = math_ops.cast(activations, dtypes.float32)
      model = models.Model(inputs=model_input, outputs=model_output)

      def loss_fn(y_true, y_pred):
        self.assertEqual(y_true.dtype, dtypes.float32)
        self.assertEqual(y_pred.dtype, dtypes.float32)
        return math_ops.reduce_mean(y_pred)

      opt = gradient_descent.SGD(learning_rate)
      model.compile(opt, loss=loss_fn)

    features = np.ones((2, 1))
    targets = np.ones((2, 1))
    dataset = dataset_ops.Dataset.from_tensor_slices(
        (features, targets)).batch(2)
    model.fit(dataset)
    for layer in all_layers:
      if layer.losses:
        # Layer has weight regularizer
        expected = 1 - 2 * learning_rate
      else:
        # Layer does not have weight regularizer
        expected = 1 - learning_rate
      self.assertEqual(backend.eval(layer.v), expected)
def test_mixed_policies_(self, strategy_fn):
  """Checks AddLayer under the 'mixed_float16' and 'mixed_bfloat16' policies.

  For each policy the layer dtype and variable dtype are expected to be
  float32 while the output has the policy's compute dtype.

  Args:
    strategy_fn: Callable returning an object whose `scope()` is entered for
      each policy.
  """
  for dtype in ('float16', 'bfloat16'):
    layer_input = constant_op.constant([1.])
    policy_name = 'mixed_' + dtype
    with strategy_fn().scope():
      with policy.policy_scope(policy_name):
        layer = mp_test_util.AddLayer(assert_type=dtype)
        self.assertEqual(layer.dtype, dtypes.float32)
        self.assertEqual(layer._dtype_policy._name, policy_name)
        output = layer(layer_input)
        self.assertEqual(layer.v.dtype, dtypes.float32)
        self.assertEqual(output.dtype, dtype)
        # Calling the layer must not have changed its dtype or policy.
        self.assertEqual(layer.dtype, dtypes.float32)
        self.assertEqual(layer._dtype_policy._name, policy_name)
        self.evaluate(variables.global_variables_initializer())
        self.assertEqual(self.evaluate(output), 2.)
def test_floating_point_policies_with_float32_vars(self, strategy_fn):
  """Checks AddLayer under each '<dtype>_with_float32_vars' policy.

  For each policy the layer dtype and variable dtype are expected to be
  float32 while the output has the dtype named by the policy.

  Args:
    strategy_fn: Callable returning an object whose `scope()` is entered for
      each policy.
  """
  for dtype in ('bfloat16', 'float16', 'float64'):
    layer_input = constant_op.constant([1.])
    policy_name = dtype + '_with_float32_vars'
    with strategy_fn().scope():
      with policy.policy_scope(policy_name):
        layer = AddLayer(assert_type=dtype)
        self.assertEqual(layer.dtype, dtypes.float32)
        self.assertEqual(layer._dtype_policy._name, policy_name)
        output = layer(layer_input)
        self.assertEqual(layer.v.dtype, dtypes.float32)
        self.assertEqual(output.dtype, dtype)
        # Calling the layer must not have changed its dtype or policy.
        self.assertEqual(layer.dtype, dtypes.float32)
        self.assertEqual(layer._dtype_policy._name, policy_name)
        self.evaluate(variables.global_variables_initializer())
        self.assertEqual(self.evaluate(output), 2.)
def test_passing_policy_to_layer(self, strategy_fn):
  """Checks a Policy passed as `dtype` applies to the layer it is passed to.

  Args:
    strategy_fn: Callable returning an object whose `scope()` is entered for
      the test body.
  """
  half_input = constant_op.constant([1.], dtype=dtypes.float16)
  with strategy_fn().scope():
    # Passing a Policy to 'dtype' sets the policy for that layer.
    mixed_policy = policy.Policy('mixed_float16')
    layer = mp_test_util.AddLayer(
        assert_type=dtypes.float16, dtype=mixed_policy)
    # layer.dtype refers to the variable dtype
    self.assertEqual(layer.dtype, dtypes.float32)
    layer(half_input)
    self.assertEqual(layer.v.dtype, dtypes.float32)

    with policy.policy_scope('mixed_float16'):
      # Passing a Policy to dtype overrides the global Policy
      float64_policy = policy.Policy('float64')
      layer = mp_test_util.AddLayer(
          assert_type=dtypes.float64, dtype=float64_policy)
      self.assertEqual(layer.dtype, 'float64')
      output = layer(half_input)
      self.assertEqual(output.dtype, dtypes.float64)
      self.assertEqual(layer.v.dtype, dtypes.float64)
def test_infer_with_float32_vars(self, strategy_fn):
  """Checks AddLayer under the 'infer_float32_vars' policy with float16 input.

  After the first call the layer's policy name is expected to be
  'float16_with_float32_vars'.

  Args:
    strategy_fn: Callable returning an object whose `scope()` is entered for
      the test body.
  """
  half_input = constant_op.constant([1.], dtype=dtypes.float16)
  with strategy_fn().scope():
    with policy.policy_scope('infer_float32_vars'):
      layer = AddLayer(assert_type=dtypes.float16)
      self.assertEqual(layer.dtype, dtypes.float32)
      output = layer(half_input)
      self.assertEqual(layer.v.dtype, dtypes.float32)
      self.assertEqual(output.dtype, dtypes.float16)
      self.assertEqual(layer.dtype, dtypes.float32)
      self.assertEqual(layer._dtype_policy._name,
                       'float16_with_float32_vars')
      self.evaluate(variables.global_variables_initializer())
      self.assertEqual(self.evaluate(output), 2.)

      if base_layer_utils.v2_dtype_behavior_enabled():
        # Layer should now cast inputs to float16
        float_input = constant_op.constant([1.], dtype=dtypes.float32)
        recast_output = layer(float_input)
        self.assertEqual(recast_output.dtype, dtypes.float16)
def test_passing_policy_to_layer(self, strategy_fn):
  """Checks a Policy passed as `dtype` applies to the layer it is passed to.

  Args:
    strategy_fn: Callable returning an object whose `scope()` is entered for
      the test body.
  """
  half_input = constant_op.constant([1.], dtype=dtypes.float16)
  with strategy_fn().scope():
    # Passing a Policy to 'dtype' sets the policy for that layer.
    float32_vars_policy = policy.Policy('infer_float32_vars')
    layer = AddLayer(assert_type=dtypes.float16, dtype=float32_vars_policy)
    # layer.dtype refers to the variable dtype
    self.assertEqual(layer.dtype, dtypes.float32)
    layer(half_input)
    self.assertEqual(layer.v.dtype, dtypes.float32)

    with policy.policy_scope('infer_float32_vars'):
      # Passing a Policy to dtype overrides the global Policy
      infer_policy = policy.Policy('infer')
      layer = AddLayer(assert_type=dtypes.float16, dtype=infer_policy)
      # layer dtype is not yet known
      self.assertEqual(layer.dtype, None)
      layer(half_input)
      self.assertEqual(layer.v.dtype, dtypes.float16)
      self.assertEqual(layer.dtype, dtypes.float16)
def test_checkpointing_layer_weights(self, strategy_fn):
  """Checks a layer weight round-trips through a checkpoint save and restore.

  Args:
    strategy_fn: Callable returning an object whose `scope()` is entered while
      the layer is created and built.
  """
  half_input = constant_op.constant([1.], dtype=dtypes.float16)
  with strategy_fn().scope(), policy.policy_scope('infer_float32_vars'):
    layer = AddLayer(assert_type=dtypes.float16)
    layer.build(())

  def layer_output():
    return self.evaluate(layer(half_input))

  layer.set_weights([np.array(100.)])
  self.assertEqual(layer_output(), 101.)
  checkpoint = trackable_utils.Checkpoint(layer=layer)
  ckpt_prefix = os.path.join(self.get_temp_dir(), 'ckpt')
  save_path = checkpoint.save(ckpt_prefix)

  # Overwrite the weight, then restore the saved value over it.
  layer.set_weights([np.array(200.)])
  self.assertEqual(layer_output(), 201.)
  restore_status = checkpoint.restore(save_path)
  restore_status.assert_consumed().run_restore_ops()
  self.assertEqual(layer.get_weights(), [100.])
  self.assertEqual(layer_output(), 101.)
def test_layer_regularizer_runs_in_float32(self, strategy_fn):
  """Checks the regularizer loss is float32 for both AddLayer variants.

  Args:
    strategy_fn: Callable returning an object whose `scope()` is entered for
      the test body.
  """
  half_input = constant_op.constant([1.], dtype=dtypes.float16)
  with strategy_fn().scope(), policy.policy_scope('infer_float32_vars'):
    # Test on AddLayer first, then on AddLayerWithoutAutoCast.
    for layer_cls in (AddLayer, AddLayerWithoutAutoCast):
      layer = layer_cls(assert_type=dtypes.float16,
                        regularizer=IdentityRegularizer())
      layer(half_input)
      # Exactly one loss is expected; unpacking fails otherwise.
      (regularizer_loss,) = layer.losses
      self.assertEqual(regularizer_loss.dtype, dtypes.float32)
      self.evaluate(variables.global_variables_initializer())
      self.assertEqual(self.evaluate(regularizer_loss), 1.)
def test_save_weights_with_autocast_vars(self, strategy_fn, h5=False):
  """Checks model weights round-trip through save_weights and load_weights.

  Args:
    strategy_fn: Callable returning an object whose `scope()` is entered for
      model construction.
    h5: If true, the weights file name gets a '.h5' suffix.
  """
  with strategy_fn().scope(), policy.policy_scope('infer_float32_vars'):
    model_input = layers.Input(shape=(1,), batch_size=2,
                               dtype=dtypes.float16)
    layer = AddLayer(assert_type=dtypes.float16)
    model_output = math_ops.cast(layer(model_input), dtypes.float32)
    model = models.Model(inputs=model_input, outputs=model_output)

  def model_output_value():
    return backend.get_value(model(features))

  model.set_weights([np.array(100.)])
  features = np.ones((2, 1), dtype=np.float16)
  self.assertAllClose(model_output_value(), features + 100.)
  file_name = 'weights' + ('.h5' if h5 else '')
  weights_file = os.path.join(self.get_temp_dir(), file_name)
  model.save_weights(weights_file)

  # Overwrite the weight, then load the saved value over it.
  model.set_weights([np.array(200.)])
  self.assertAllClose(model_output_value(), features + 200.)
  model.load_weights(weights_file)
  self.assertAllClose(model_output_value(), features + 100.)
  self.assertEqual(model.get_weights(), [np.array(100.)])
def test_advanced_model(self, strategy_fn, use_loss_scaling=False):
  """Fits a four-layer model once and checks each layer's variable.

  Args:
    strategy_fn: Callable returning the strategy whose `scope()` is entered
      for model construction and fitting.
    use_loss_scaling: If true, a fixed loss scale of 8. is used: a gradient
      check is inserted into the model and the optimizer is wrapped in a
      LossScaleOptimizer.
  """
  # The advanced model tests mixed-precision-related features that would occur
  # in a resnet50 model. It tests a model that has:
  #  * Multiple layers, some which use auto-cast variables and some which do
  #    not
  #  * Regularization on some variables and not others.
  #  * A fixed loss scale (if use_loss_scaling is True)
  strategy = strategy_fn()
  if use_loss_scaling:
    # Only bound when loss scaling is on; every later read is guarded by the
    # same flag.
    loss_scale = 8.
  learning_rate = 2 ** -14
  with strategy.scope():
    with policy.policy_scope(policy.Policy('infer_float32_vars')):
      x = layers.Input(shape=(1,), batch_size=2, dtype=dtypes.float16)
      layer1 = AddLayer(assert_type=dtypes.float16,
                        regularizer=IdentityRegularizer(), use_operator=True)
      layer2 = AddLayerWithoutAutoCast(assert_type=dtypes.float16,
                                       use_operator=True)
      layer3 = AddLayer(assert_type=dtypes.float16, use_operator=False)
      layer4 = AddLayerWithoutAutoCast(assert_type=dtypes.float16,
                                       regularizer=IdentityRegularizer(),
                                       use_operator=False)
      y = layer1(x)
      y = layer2(y)
      y = layer3(y)
      y = layer4(y)
      if use_loss_scaling:
        # The gradient of 'y' at this point is 1. With loss scaling, the
        # gradient is 'loss_scale'. We divide by the batch size of 2 since the
        # loss is averaged across batch elements.
        expected_gradient = loss_scale / 2
        identity_with_grad_check_fn = (
            mp_test_util.create_identity_with_grad_check_fn(
                expected_dtype=dtypes.float16,
                expected_gradient=[expected_gradient]))
        y = core.Lambda(identity_with_grad_check_fn)(y)
      y = math_ops.cast(y, dtypes.float32)
      model = models.Model(inputs=x, outputs=y)

      def loss_fn(y_true, y_pred):
        # Both tensors handed to the loss are expected to be float32.
        self.assertEqual(y_true.dtype, dtypes.float32)
        self.assertEqual(y_pred.dtype, dtypes.float32)
        return math_ops.reduce_mean(y_pred)

      opt = gradient_descent.SGD(learning_rate)
      if use_loss_scaling:
        opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
      model.compile(opt, loss=loss_fn)

    # x and y are rebound here from Keras tensors to numpy training data.
    x = np.ones((2, 1))
    y = np.ones((2, 1))
    dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(2)
    model.fit(dataset)
    for layer in (layer1, layer2, layer3, layer4):
      if layer.losses:
        # Layer has weight regularizer
        self.assertEqual(backend.eval(layer.v), 1 - 2 * learning_rate)
      else:
        # Layer does not have weight regularizer
        self.assertEqual(backend.eval(layer.v), 1 - learning_rate)
def test_dynamic_loss_scaling(self, strategy_fn, cloning=True):
  """Fits a model repeatedly while toggling NaN gradients.

  After each fit the layer variable is compared with the value expected for
  that step, and the gradient expected by the in-model gradient check is
  updated to follow the loss scale.

  Args:
    strategy_fn: Callable returning the strategy whose `scope()` is entered
      for model construction and fitting.
    cloning: Forwarded to `model.compile`.
  """
  strategy = strategy_fn()
  initial_loss_scale = 2.
  batch_size = 4
  # Held in a backend variable so it can be changed between fit() calls.
  expected_gradient = backend.variable([initial_loss_scale / batch_size],
                                       dtype=dtypes.float16)
  # If this variable is set to True, the model below will have NaN gradients
  have_nan_gradients = backend.variable(False, dtype=dtypes.bool)
  with strategy.scope():
    with policy.policy_scope(policy.Policy('infer_float32_vars')):
      x = layers.Input(shape=(1,), batch_size=batch_size,
                       dtype=dtypes.float16)
      layer = AddLayer(assert_type=dtypes.float16)
      y = layer(x)
      identity_with_nan_grads = (
          mp_test_util.create_identity_with_nan_gradients_fn(
              have_nan_gradients))
      y = core.Lambda(identity_with_nan_grads)(y)
      identity_with_grad_check_fn = (
          mp_test_util.create_identity_with_grad_check_fn(
              expected_dtype=dtypes.float16,
              expected_gradient=expected_gradient))
      y = core.Lambda(identity_with_grad_check_fn)(y)
      y = math_ops.cast(y, dtypes.float32)
      model = models.Model(inputs=x, outputs=y)

      def loss_fn(y_true, y_pred):
        # The targets are ignored; the loss is the mean prediction.
        del y_true
        return math_ops.reduce_mean(y_pred)

      opt = gradient_descent.SGD(1.)
      loss_scale = loss_scale_module.DynamicLossScale(
          initial_loss_scale=initial_loss_scale, increment_period=2)
      opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
      model.compile(opt, loss=loss_fn, cloning=cloning)

    self.assertEqual(backend.eval(layer.v), 1)
    # x and y are rebound here from Keras tensors to numpy training data.
    x = np.ones((batch_size, 1))
    y = np.ones((batch_size, 1))
    dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(batch_size)
    model.fit(dataset)
    # The variable starts with 1 and has a gradient of 1, so will go down by 1
    # each step.
    self.assertEqual(backend.eval(layer.v), 0)

    model.fit(dataset)
    self.assertEqual(backend.eval(layer.v), -1)

    # There have been two steps without NaNs, so the loss scale will double
    backend.set_value(expected_gradient,
                      backend.get_value(expected_gradient * 2))
    model.fit(dataset)
    self.assertEqual(backend.eval(layer.v), -2)

    # Next test with NaN gradients.
    backend.set_value(have_nan_gradients, True)
    model.fit(dataset)
    # Variable should not be updated
    self.assertEqual(backend.eval(layer.v), -2)

    # Test with finite gradients again
    backend.set_value(have_nan_gradients, False)
    # The loss scale will be halved due to the NaNs, so the gradient will also
    # be halved
    backend.set_value(expected_gradient,
                      backend.get_value(expected_gradient / 2))
    model.fit(dataset)
    self.assertEqual(backend.eval(layer.v), -3)