def testCompositeTypeSpecArgWithoutDtype(self):
  for assign_variant_dtype in [False, True]:
    # Create a Keras Input from the type spec.
    spec = TwoTensorsSpecNoOneDtype(
        (1, 2, 3), dtypes.float32, (1, 2, 3), dtypes.int64,
        assign_variant_dtype=assign_variant_dtype)
    x = input_layer_lib.Input(type_spec=spec)

    def lambda_fn(tensors):
      return (math_ops.cast(tensors.x, dtypes.float64) +
              math_ops.cast(tensors.y, dtypes.float64))

    # Verify you can construct and use a model with this input.
    model = functional.Functional(x, core.Lambda(lambda_fn)(x))

    # And that the model works.
    two_tensors = TwoTensors(
        array_ops.ones((1, 2, 3)) * 2.0, array_ops.ones((1, 2, 3)))
    self.assertAllEqual(model(two_tensors), lambda_fn(two_tensors))

    # Test serialization / deserialization.
    model = functional.Functional.from_config(model.get_config())
    self.assertAllEqual(model(two_tensors), lambda_fn(two_tensors))
    model = model_config.model_from_json(model.to_json())
    self.assertAllEqual(model(two_tensors), lambda_fn(two_tensors))
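# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original test): the same
# get_config()/from_config() and to_json()/model_from_json() round-trips as
# above, shown on a plain functional model through the public Keras API so it
# runs standalone. Assumes TF 2.x; the composite TwoTensors input is left out
# and the helper name is hypothetical.
# ---------------------------------------------------------------------------
def _config_roundtrip_sketch():
  import tensorflow as tf

  inputs = tf.keras.Input(shape=(3,))
  outputs = tf.keras.layers.Dense(2)(inputs)
  model = tf.keras.Model(inputs, outputs)

  # Rebuild the architecture from its config dict and from its JSON string.
  rebuilt = tf.keras.Model.from_config(model.get_config())
  rebuilt_from_json = tf.keras.models.model_from_json(model.to_json())
  return rebuilt, rebuilt_from_json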
def test_Bidirectional_ragged_input(self, merge_mode):
  if test.is_built_with_rocm():
    # Ragged tensors are not supported in the ROCm RNN implementation.
    self.skipTest('Test not supported on the ROCm platform')
  np.random.seed(100)
  rnn = keras.layers.LSTM
  units = 3
  x = ragged_factory_ops.constant(
      [[[1, 1, 1], [1, 1, 1]],
       [[1, 1, 1]],
       [[1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1]],
       [[1, 1, 1], [1, 1, 1], [1, 1, 1]]],
      ragged_rank=1)
  x = math_ops.cast(x, 'float32')

  # pylint: disable=g-long-lambda
  with self.cached_session():
    if merge_mode == 'ave':
      merge_func = lambda y, y_rev: (y + y_rev) / 2
    elif merge_mode == 'concat':
      merge_func = lambda y, y_rev: ragged_concat_ops.concat(
          (y, y_rev), axis=-1)
    elif merge_mode == 'mul':
      merge_func = lambda y, y_rev: (y * y_rev)
    # pylint: enable=g-long-lambda

    inputs = keras.Input(
        shape=(None, 3), batch_size=4, dtype='float32', ragged=True)
    layer = keras.layers.Bidirectional(
        rnn(units, return_sequences=True), merge_mode=merge_mode)
    f_merged = keras.backend.function([inputs], layer(inputs))
    f_forward = keras.backend.function([inputs], layer.forward_layer(inputs))

    # TODO(kaftan): after the KerasTensor refactor, TF op layers should work
    # with many composite tensors, and this shouldn't need to be a Lambda
    # layer.
    reverse_layer = core.Lambda(array_ops.reverse, arguments=dict(axis=[1]))
    f_backward = keras.backend.function(
        [inputs], reverse_layer(layer.backward_layer(inputs)))

    y_merged = f_merged(x)
    y_expected = merge_func(
        ragged_tensor.convert_to_tensor_or_ragged_tensor(f_forward(x)),
        ragged_tensor.convert_to_tensor_or_ragged_tensor(f_backward(x)))

    y_merged = ragged_tensor.convert_to_tensor_or_ragged_tensor(y_merged)
    self.assertAllClose(y_merged.flat_values, y_expected.flat_values)
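# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original test): what the three merge
# modes exercised above do to the forward and (already time-reversed) backward
# outputs, using plain NumPy arrays as stand-ins for the RNN outputs. The
# helper names are hypothetical.
# ---------------------------------------------------------------------------
def _merge_outputs_sketch(y, y_rev, merge_mode):
  """Combines forward/backward outputs the way Bidirectional's merge_mode does."""
  if merge_mode == 'ave':
    return (y + y_rev) / 2
  elif merge_mode == 'concat':
    return np.concatenate((y, y_rev), axis=-1)  # last-axis width doubles
  elif merge_mode == 'mul':
    return y * y_rev
  raise ValueError('Unsupported merge_mode: %r' % merge_mode)


def _merge_outputs_sketch_usage():
  y = np.ones((2, 4, 3))            # (batch, timesteps, units)
  y_rev = np.full((2, 4, 3), 2.0)
  assert _merge_outputs_sketch(y, y_rev, 'ave').shape == (2, 4, 3)
  assert _merge_outputs_sketch(y, y_rev, 'concat').shape == (2, 4, 6)
  assert (_merge_outputs_sketch(y, y_rev, 'mul') == 2.0).all()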
def test_fixed_loss_scaling(self, strategy_fn, cloning=True):
  if testing_utils.should_run_distributed():
    self.skipTest('b/137397816')
  # Note: We do not test mixed precision in this method, only loss scaling.
  if not self._is_strategy_supported(strategy_fn):
    return
  loss_scale = 8.
  batch_size = 4
  with strategy_fn().scope():
    x = layers.Input(shape=(1,), batch_size=batch_size)
    layer = AddLayer()
    y = layer(x)

    # The gradient of 'y' at this point is 1. With loss scaling, the gradient
    # is 'loss_scale'. We divide by the batch size since the loss is averaged
    # across batch elements.
    expected_gradient = loss_scale / batch_size
    identity_with_grad_check_fn = (
        mp_test_util.create_identity_with_grad_check_fn([expected_gradient]))
    y = core.Lambda(identity_with_grad_check_fn)(y)
    model = models.Model(inputs=x, outputs=y)

    def loss_fn(y_true, y_pred):
      del y_true
      return math_ops.reduce_mean(y_pred)

    opt = gradient_descent.SGD(1.)
    opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
    model.compile(
        opt,
        loss=loss_fn,
        cloning=cloning,
        run_eagerly=testing_utils.should_run_eagerly(),
        run_distributed=testing_utils.should_run_distributed())

  self.assertEqual(backend.eval(layer.v), 1)
  x = np.ones((batch_size, 1))
  y = np.ones((batch_size, 1))
  dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(batch_size)
  model.fit(dataset)
  # The variable starts at 1 and should have a gradient of 1 subtracted from it.
  expected = 0
  self.assertEqual(backend.eval(layer.v), expected)
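# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original test): why expected_gradient
# is loss_scale / batch_size above. The loss is the mean of y over the batch,
# so d(loss)/d(y_i) == 1 / batch_size, and scaling the loss by loss_scale
# scales every gradient by loss_scale. Uses the public TF 2.x eager API as an
# assumption; the helper name is hypothetical.
# ---------------------------------------------------------------------------
def _scaled_gradient_sketch(loss_scale=8., batch_size=4):
  import tensorflow as tf

  y = tf.ones((batch_size, 1))
  with tf.GradientTape() as tape:
    tape.watch(y)
    scaled_loss = loss_scale * tf.reduce_mean(y)
  grad = tape.gradient(scaled_loss, y)
  # Every entry equals loss_scale / batch_size (2.0 for the defaults above).
  return grad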
def test_adapt_preprocessing_stage_with_dict_input(self):
  x0 = Input(shape=(3,), name='x0')
  x1 = Input(shape=(4,), name='x1')
  x2 = Input(shape=(3, 5), name='x2')

  # The dimensions will mismatch if x1 is incorrectly placed.
  x1_sum = core.Lambda(
      lambda x: math_ops.reduce_sum(x, axis=-1, keepdims=True))(x1)
  x2_sum = core.Lambda(lambda x: math_ops.reduce_sum(x, axis=-1))(x2)

  l0 = PLMerge()
  y = l0([x0, x1_sum])

  l1 = PLMerge()
  y = l1([y, x2_sum])

  l2 = PLSplit()
  z, y = l2(y)

  stage = preprocessing_stage.FunctionalPreprocessingStage(
      {'x2': x2, 'x0': x0, 'x1': x1}, [y, z])
  stage.compile()

  # Test with a dict of NumPy arrays.
  one_array0 = np.ones((4, 3), dtype='float32')
  one_array1 = np.ones((4, 4), dtype='float32')
  one_array2 = np.ones((4, 3, 5), dtype='float32')
  adapt_data = {'x1': one_array1, 'x0': one_array0, 'x2': one_array2}
  stage.adapt(adapt_data)
  self.assertEqual(l0.adapt_count, 1)
  self.assertEqual(l1.adapt_count, 1)
  self.assertEqual(l2.adapt_count, 1)
  self.assertLessEqual(l0.adapt_time, l1.adapt_time)
  self.assertLessEqual(l1.adapt_time, l2.adapt_time)

  # Check call.
  y, z = stage({
      'x1': array_ops.constant(one_array1),
      'x2': array_ops.constant(one_array2),
      'x0': array_ops.constant(one_array0)
  })
  self.assertAllClose(y, np.zeros((4, 3), dtype='float32') + 9.)
  self.assertAllClose(z, np.zeros((4, 3), dtype='float32') + 11.)

  # Test with a list of NumPy arrays.
  adapt_data = [one_array0, one_array1, one_array2]
  stage.adapt(adapt_data)
  self.assertEqual(l0.adapt_count, 2)
  self.assertEqual(l1.adapt_count, 2)
  self.assertEqual(l2.adapt_count, 2)
  self.assertLessEqual(l0.adapt_time, l1.adapt_time)
  self.assertLessEqual(l1.adapt_time, l2.adapt_time)

  # Test with a flattened dataset.
  adapt_data = dataset_ops.Dataset.from_tensor_slices(
      (one_array0, one_array1, one_array2))
  adapt_data = adapt_data.batch(2)  # 2 batches of 2 samples
  stage.adapt(adapt_data)
  self.assertEqual(l0.adapt_count, 3)
  self.assertEqual(l1.adapt_count, 3)
  self.assertEqual(l2.adapt_count, 3)
  self.assertLessEqual(l0.adapt_time, l1.adapt_time)
  self.assertLessEqual(l1.adapt_time, l2.adapt_time)

  # Test with a dataset of dicts.
  adapt_data = dataset_ops.Dataset.from_tensor_slices({
      'x0': one_array0,
      'x2': one_array2,
      'x1': one_array1
  })
  adapt_data = adapt_data.batch(2)  # 2 batches of 2 samples
  stage.adapt(adapt_data)
  self.assertEqual(l0.adapt_count, 4)
  self.assertEqual(l1.adapt_count, 4)
  self.assertEqual(l2.adapt_count, 4)
  self.assertLessEqual(l0.adapt_time, l1.adapt_time)
  self.assertLessEqual(l1.adapt_time, l2.adapt_time)

  # Test that bad data raises an error.
  with self.assertRaisesRegex(ValueError, 'requires a '):
    stage.adapt(None)
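# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original test): the dict cases above
# rely on adapt/call data being matched to the stage's Inputs by name rather
# than by dict insertion order, which is why the deliberately shuffled keys
# still line up with x0/x1/x2. A minimal, framework-free illustration of that
# matching rule; the helper name is hypothetical.
# ---------------------------------------------------------------------------
def _order_by_input_name_sketch():
  input_names = ['x0', 'x1', 'x2']
  adapt_data = {'x1': 'array1', 'x0': 'array0', 'x2': 'array2'}
  ordered = [adapt_data[name] for name in input_names]
  assert ordered == ['array0', 'array1', 'array2']
  return ordered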
def test_dynamic_loss_scaling(self, strategy_fn, cloning=True):
  if testing_utils.should_run_distributed():
    self.skipTest('b/137397816')
  if not self._is_strategy_supported(strategy_fn):
    return
  strategy = strategy_fn()
  initial_loss_scale = 2.
  batch_size = 4
  expected_gradient = backend.variable([initial_loss_scale / batch_size],
                                       dtype=dtypes.float16)
  # If this variable is set to True, the model below will have NaN gradients.
  have_nan_gradients = backend.variable(False, dtype=dtypes.bool)
  with strategy.scope():
    with policy.policy_scope(policy.Policy('infer_float32_vars')):
      x = layers.Input(
          shape=(1,), batch_size=batch_size, dtype=dtypes.float16)
      layer = AddLayer(assert_type=dtypes.float16)
      y = layer(x)
      identity_with_nan_grads = (
          mp_test_util.create_identity_with_nan_gradients_fn(
              have_nan_gradients))
      y = core.Lambda(identity_with_nan_grads)(y)
      identity_with_grad_check_fn = (
          mp_test_util.create_identity_with_grad_check_fn(
              expected_dtype=dtypes.float16,
              expected_gradient=expected_gradient))
      y = core.Lambda(identity_with_grad_check_fn)(y)
      y = math_ops.cast(y, dtypes.float32)
      model = models.Model(inputs=x, outputs=y)

      def loss_fn(y_true, y_pred):
        del y_true
        return math_ops.reduce_mean(y_pred)

      opt = gradient_descent.SGD(1.)
      loss_scale = loss_scale_module.DynamicLossScale(
          initial_loss_scale=initial_loss_scale, increment_period=2)
      opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
      model.compile(
          opt,
          loss=loss_fn,
          cloning=cloning,
          run_eagerly=testing_utils.should_run_eagerly(),
          run_distributed=testing_utils.should_run_distributed())

  self.assertEqual(backend.eval(layer.v), 1)
  x = np.ones((batch_size, 1))
  y = np.ones((batch_size, 1))
  dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(batch_size)
  model.fit(dataset)
  # The variable starts at 1 and has a gradient of 1, so it goes down by 1
  # each step.
  self.assertEqual(backend.eval(layer.v), 0)

  model.fit(dataset)
  self.assertEqual(backend.eval(layer.v), -1)

  # There have been two steps without NaNs, so the loss scale will double.
  backend.set_value(expected_gradient,
                    backend.get_value(expected_gradient * 2))
  model.fit(dataset)
  self.assertEqual(backend.eval(layer.v), -2)

  # Next, test with NaN gradients.
  backend.set_value(have_nan_gradients, True)
  model.fit(dataset)
  # The variable should not be updated.
  self.assertEqual(backend.eval(layer.v), -2)

  # Test with finite gradients again.
  backend.set_value(have_nan_gradients, False)
  # The loss scale will be halved due to the NaNs, so the gradient will also
  # be halved.
  backend.set_value(expected_gradient,
                    backend.get_value(expected_gradient / 2))
  model.fit(dataset)
  self.assertEqual(backend.eval(layer.v), -3)
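# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original test): the doubling/halving
# pattern asserted above, as a toy pure-Python model of dynamic loss scaling.
# This is an assumption-level sketch of the policy the test relies on (double
# after `increment_period` consecutive finite-gradient steps, halve on
# non-finite gradients), not the real DynamicLossScale implementation; the
# class and helper names are hypothetical.
# ---------------------------------------------------------------------------
class _ToyDynamicLossScale(object):
  """Toy dynamic loss scale: doubles after N good steps, halves on NaN/Inf."""

  def __init__(self, initial_loss_scale=2., increment_period=2):
    self.loss_scale = initial_loss_scale
    self.increment_period = increment_period
    self._good_steps = 0

  def update(self, grads_are_finite):
    if grads_are_finite:
      self._good_steps += 1
      if self._good_steps >= self.increment_period:
        self.loss_scale *= 2.
        self._good_steps = 0
    else:
      # A non-finite gradient halves the scale and resets the good-step count.
      self.loss_scale /= 2.
      self._good_steps = 0
    return self.loss_scale


def _toy_dynamic_loss_scale_usage():
  scale = _ToyDynamicLossScale(initial_loss_scale=2., increment_period=2)
  assert scale.update(True) == 2.    # first finite step: unchanged
  assert scale.update(True) == 4.    # second finite step: doubled
  assert scale.update(False) == 2.   # non-finite gradients: halved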
def test_advanced_model(self, strategy_fn, use_loss_scaling=False):
  if testing_utils.should_run_distributed():
    self.skipTest('b/137397816')
  # The advanced model tests mixed-precision-related features that would occur
  # in a resnet50 model. It tests a model that has:
  #  * Multiple layers, some of which use auto-cast variables and some of
  #    which do not.
  #  * Regularization on some variables and not others.
  #  * A fixed loss scale (if use_loss_scaling is True).
  if not self._is_strategy_supported(strategy_fn):
    return
  strategy = strategy_fn()
  if use_loss_scaling:
    loss_scale = 8.
  learning_rate = 2**-14

  with strategy.scope():
    with policy.policy_scope(policy.Policy('infer_float32_vars')):
      x = layers.Input(shape=(1,), batch_size=2, dtype=dtypes.float16)
      layer1 = AddLayer(
          assert_type=dtypes.float16,
          regularizer=IdentityRegularizer(),
          use_operator=True)
      layer2 = AddLayerWithoutAutoCast(
          assert_type=dtypes.float16, use_operator=True)
      layer3 = AddLayer(assert_type=dtypes.float16, use_operator=False)
      layer4 = AddLayerWithoutAutoCast(
          assert_type=dtypes.float16,
          regularizer=IdentityRegularizer(),
          use_operator=False)
      y = layer1(x)
      y = layer2(y)
      y = layer3(y)
      y = layer4(y)
      if use_loss_scaling:
        # The gradient of 'y' at this point is 1. With loss scaling, the
        # gradient is 'loss_scale'. We divide by the batch size of 2 since the
        # loss is averaged across batch elements.
        expected_gradient = loss_scale / 2
        identity_with_grad_check_fn = (
            mp_test_util.create_identity_with_grad_check_fn(
                expected_dtype=dtypes.float16,
                expected_gradient=[expected_gradient]))
        y = core.Lambda(identity_with_grad_check_fn)(y)
      y = math_ops.cast(y, dtypes.float32)
      model = models.Model(inputs=x, outputs=y)

      def loss_fn(y_true, y_pred):
        self.assertEqual(y_true.dtype, dtypes.float32)
        self.assertEqual(y_pred.dtype, dtypes.float32)
        return math_ops.reduce_mean(y_pred)

      opt = gradient_descent.SGD(learning_rate)
      if use_loss_scaling:
        opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
      model.compile(
          opt,
          loss=loss_fn,
          run_eagerly=testing_utils.should_run_eagerly(),
          run_distributed=testing_utils.should_run_distributed())

  x = np.ones((2, 1))
  y = np.ones((2, 1))
  dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(2)
  model.fit(dataset)
  for layer in (layer1, layer2, layer3, layer4):
    if layer.losses:
      # The layer has a weight regularizer.
      self.assertEqual(backend.eval(layer.v), 1 - 2 * learning_rate)
    else:
      # The layer does not have a weight regularizer.
      self.assertEqual(backend.eval(layer.v), 1 - learning_rate)
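# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original test): why the assertions
# above expect 1 - 2 * learning_rate for the regularized layers and
# 1 - learning_rate for the others. Each additive layer contributes
# d(mean(y))/dv == 1 through the prediction path, and an identity regularizer
# (loss += v) contributes another 1, so a single SGD step moves v by
# -learning_rate or -2 * learning_rate. The helper names are hypothetical.
# ---------------------------------------------------------------------------
def _sgd_step_sketch(v, learning_rate, has_regularizer):
  prediction_grad = 1.0                                # d(mean(y))/dv
  regularizer_grad = 1.0 if has_regularizer else 0.0   # d(v)/dv
  return v - learning_rate * (prediction_grad + regularizer_grad)


def _sgd_step_sketch_usage():
  learning_rate = 2**-14
  assert _sgd_step_sketch(1.0, learning_rate, True) == 1 - 2 * learning_rate
  assert _sgd_step_sketch(1.0, learning_rate, False) == 1 - learning_rate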
def test_dynamic_loss_scaling(self,
                              strategy_fn,
                              pass_loss_scale_to_policy=False,
                              get_config=False):
  strategy = strategy_fn()
  initial_loss_scale = 2.
  batch_size = 4
  loss_scale = loss_scale_module.DynamicLossScale(
      initial_loss_scale=initial_loss_scale, increment_period=2)
  expected_gradient = backend.variable([initial_loss_scale / batch_size],
                                       dtype=dtypes.float16)
  # If this variable is set to True, the model below will have NaN gradients.
  have_nan_gradients = backend.variable(False, dtype=dtypes.bool)
  with strategy.scope():
    opt = gradient_descent.SGD(1.)
    if pass_loss_scale_to_policy:
      p = policy.Policy('mixed_float16', loss_scale=loss_scale)
    else:
      p = policy.Policy('mixed_float16', loss_scale=None)
      opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
    with policy.policy_scope(p):
      x = layers.Input(
          shape=(1,), batch_size=batch_size, dtype=dtypes.float16)
      layer = mp_test_util.MultiplyLayer(assert_type=dtypes.float16)
      y = layer(x)
      identity_with_nan_grads = (
          mp_test_util.create_identity_with_nan_gradients_fn(
              have_nan_gradients))
      y = core.Lambda(identity_with_nan_grads)(y)
      identity_with_grad_check_fn = (
          mp_test_util.create_identity_with_grad_check_fn(
              expected_dtype=dtypes.float16,
              expected_gradient=expected_gradient))
      y = core.Lambda(identity_with_grad_check_fn)(y)
      model = models.Model(inputs=x, outputs=y)
      if get_config:
        config = model.get_config()
        model = model.__class__.from_config(
            config,
            custom_objects={'MultiplyLayer': mp_test_util.MultiplyLayer})
        (layer,) = (layer for layer in model.layers
                    if isinstance(layer, mp_test_util.MultiplyLayer))

      def loss_fn(y_true, y_pred):
        del y_true
        return math_ops.reduce_mean(y_pred)

      model.compile(
          opt,
          loss=loss_fn,
          run_eagerly=testing_utils.should_run_eagerly())

  self.assertEqual(backend.eval(layer.v), 1)
  x = np.ones((batch_size, 1))
  y = np.ones((batch_size, 1))
  dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(batch_size)
  model.fit(dataset)
  # The variable starts at 1 and has a gradient of 1, so it goes down by 1
  # each step.
  self.assertEqual(backend.eval(layer.v), 0)

  model.fit(dataset)
  self.assertEqual(backend.eval(layer.v), -1)

  # There have been two steps without NaNs, so the loss scale will double.
  backend.set_value(expected_gradient,
                    backend.get_value(expected_gradient * 2))
  model.fit(dataset)
  self.assertEqual(backend.eval(layer.v), -2)

  # Next, test with NaN gradients.
  backend.set_value(have_nan_gradients, True)
  model.fit(dataset)
  # The variable should not be updated.
  self.assertEqual(backend.eval(layer.v), -2)

  # Test with finite gradients again.
  backend.set_value(have_nan_gradients, False)
  # The loss scale will be halved due to the NaNs, so the gradient will also
  # be halved.
  backend.set_value(expected_gradient,
                    backend.get_value(expected_gradient / 2))
  model.fit(dataset)
  self.assertEqual(backend.eval(layer.v), -3)