def test_wrap_optimizer_dynamic_loss_scale(self):
    """Graph rewrite wraps an optimizer in a dynamic LossScaleOptimizer.

    Covers both the "dynamic" shortcut string (default scale/period) and an
    explicit DynamicLossScale with custom initial scale and period.
    """
    opt = gradient_descent_v2.SGD(1.0)
    opt = tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite(
        opt, "dynamic"
    )
    self.assertIsInstance(opt, loss_scale_optimizer_v2.LossScaleOptimizer)
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.assertEqual(self.evaluate(opt.loss_scale), 2.0**15)
    self.assertTrue(opt.dynamic)
    # BUG FIX: these assertions previously used assertTrue(a, b), which
    # treats the second argument as a failure *message* and never compares
    # the two values, so they could not fail on a wrong attribute value.
    self.assertEqual(opt.initial_scale, 2.0**15)
    self.assertEqual(opt.dynamic_growth_steps, 2000)

    # Same rewrite, but with an explicit DynamicLossScale configuration.
    opt = gradient_descent_v2.SGD(1.0)
    opt = tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite(
        opt,
        tf.compat.v1.mixed_precision.DynamicLossScale(
            initial_loss_scale=4, increment_period=1000
        ),
    )
    self.assertIsInstance(opt, loss_scale_optimizer_v2.LossScaleOptimizer)
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.assertEqual(self.evaluate(opt.loss_scale), 4.0)
    self.assertTrue(opt.dynamic)
    # BUG FIX: assertTrue -> assertEqual, as above.
    self.assertEqual(opt.initial_scale, 4.0)
    self.assertEqual(opt.dynamic_growth_steps, 1000)
def test_wide_deep_model_backprop(self):
    """Gradients flow to both towers, each driven by its own optimizer.

    With zero-initialized kernels the prediction is 0 against target 3
    (1 + 2*1), so under MSE one SGD step moves each 1x1 kernel by
    lr * 6 * input: 0.6 for the linear tower (lr=0.1) and 1.8 for the
    DNN tower (lr=0.3) — the values asserted below.
    """
    with self.cached_session():
        linear_model = linear.LinearModel(units=1, kernel_initializer="zeros")
        dnn_model = sequential.Sequential(
            [core.Dense(units=1, kernel_initializer="zeros")])
        wide_deep_model = wide_deep.WideDeepModel(linear_model, dnn_model)
        linear_inp = np.array([[1.0]])
        dnn_inp = np.array([[1.0]])
        inputs = [linear_inp, dnn_inp]
        output = linear_inp + 2 * dnn_inp
        # One optimizer per tower: the list order matches (linear, dnn).
        linear_opt = gradient_descent.SGD(learning_rate=0.1)
        dnn_opt = gradient_descent.SGD(learning_rate=0.3)
        wide_deep_model.compile(
            optimizer=[linear_opt, dnn_opt],
            loss="mse",
            metrics=[],
            run_eagerly=test_utils.should_run_eagerly(),
        )
        self.evaluate(tf.compat.v1.global_variables_initializer())
        wide_deep_model.fit(inputs, output, epochs=1)
        self.assertAllClose(
            [[0.6]],
            self.evaluate(
                wide_deep_model.linear_model.dense_layers[0].kernel),
        )
        self.assertAllClose(
            [[1.8]],
            self.evaluate(wide_deep_model.dnn_model.layers[0].kernel),
        )
def testConstructMomentumWithLR(self):
    """The deprecated `lr` kwarg works and overrides `learning_rate`."""
    sgd_lr_only = gradient_descent.SGD(lr=1.0, momentum=0.9)
    sgd_both = gradient_descent.SGD(learning_rate=0.1, momentum=0.9, lr=1.0)
    sgd_lr_kwarg = gradient_descent.SGD(learning_rate=0.1, momentum=0.9)
    # In every construction path, `.lr` is backed by a tf.Variable.
    for optimizer in (sgd_lr_only, sgd_both, sgd_lr_kwarg):
        self.assertIsInstance(optimizer.lr, tf.Variable)
    self.evaluate(tf.compat.v1.global_variables_initializer())
    # When both kwargs are given, `lr` wins over `learning_rate`.
    self.assertAllClose(self.evaluate(sgd_lr_only.lr), 1.0)
    self.assertAllClose(self.evaluate(sgd_both.lr), 1.0)
    self.assertAllClose(self.evaluate(sgd_lr_kwarg.lr), 0.1)
def test_error_if_policy_is_set(self):
    """The graph rewrite is rejected while a mixed global policy is active."""
    rewrite = tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite
    with policy.policy_scope('mixed_float16'):
        with self.assertRaisesRegex(
                ValueError, 'the global Keras dtype Policy has been set'):
            rewrite(gradient_descent_v2.SGD(1.0))
    # No error when the global policy is back to the default.
    rewrite(gradient_descent_v2.SGD(1.0))
    # No error when the global policy is set but non-mixed.
    with policy.policy_scope('float64'):
        rewrite(gradient_descent_v2.SGD(1.0))
def testErrorWhenV3LsoWrapsV2Optimizer(self):
    """LossScaleOptimizerV3 must reject legacy (V2) Keras optimizers."""
    legacy_optimizer = gradient_descent.SGD()
    # NOTE: the regex reproduces the production message verbatim, including
    # its "expeirmental" typo — do not "fix" it here or the match breaks.
    expected_message = (
        'only the new experimental optimizer '
        'defined in keras/optimizer_expeirmental/optimizer.py can be '
        'passed')
    with self.assertRaisesRegex(TypeError, expected_message):
        loss_scale_optimizer.LossScaleOptimizerV3(legacy_optimizer)
def test_variable_run_argument(self, distribution):
    """Variables passed to `strategy.run` must remain tf.Variables.

    Previous behavior in TPUStrategy was to cast run() arguments to
    Tensors; `optimizer.apply_gradients` below would fail on Tensors.
    """
    with distribution.scope():
        optimizer = gradient_descent.SGD(0.1)
        net = core.Dense(1, trainable=True)
    dataset = tf.data.Dataset.from_tensors([[1.0]])
    dataset = dataset.repeat()
    # batch of 2 -> one example per replica in a 2-replica strategy.
    dataset = dataset.batch(2, drop_remainder=True)

    def replica_step(trainable_variables, features):
        # `trainable_variables` arrives through strategy.run; the point of
        # the test is that it is still a list of Variables here.
        with tf.GradientTape() as tape:
            net_out = net(features[0], training=True)
            loss = (net_out - 1.0) * (net_out - 1.0)
        gradients = tape.gradient(loss, trainable_variables)
        optimizer.apply_gradients(zip(gradients, trainable_variables))
        return loss

    @tf.function
    def step(features):
        per_replica_losses = distribution.run(
            replica_step,
            (net.trainable_variables, features),
        )
        loss = distribution.reduce(tf.distribute.ReduceOp.SUM,
                                   per_replica_losses,
                                   axis=None)
        return loss

    step(next(iter(dataset)))
def testBasicWithLearningRateInverseTimeDecaySerializeAndDeserialize(self):
    """SGD with an InverseTimeDecay schedule survives a config round-trip."""
    for dtype in (tf.half, tf.float32, tf.float64):
        schedule = learning_rate_schedule.InverseTimeDecay(
            3.0, decay_steps=1.0, decay_rate=0.5)
        original = gradient_descent.SGD(learning_rate=schedule)
        # Serialize then rebuild; the restored optimizer must behave the same.
        restored = gradient_descent.SGD.from_config(original.get_config())
        self._test_basic_sgd_with_learning_rate_decay(restored, dtype)
def testBasicWithLearningRateDecay(self):
    """SGD built with the legacy `decay` kwarg matches the decay reference."""
    for dtype in (tf.half, tf.float32, tf.float64):
        optimizer = gradient_descent.SGD(learning_rate=3.0, decay=0.5)
        self._test_basic_sgd_with_learning_rate_decay(optimizer, dtype)
def testSparseBasic(self):
    """SGD applies IndexedSlices gradients only to the referenced rows."""
    # TODO(tanzheny, omalleyt): Fix test in eager mode.
    with tf.Graph().as_default():
        for dtype in [tf.half, tf.float32, tf.float64]:
            var0 = tf.Variable([[1.0], [2.0]], dtype=dtype)
            var1 = tf.Variable([[3.0], [4.0]], dtype=dtype)
            # Sparse gradient touching only row 0 of var0.
            grads0 = tf.IndexedSlices(
                tf.constant([0.1], shape=[1, 1], dtype=dtype),
                tf.constant([0]),
                tf.constant([2, 1]),
            )
            # Sparse gradient touching only row 1 of var1.
            grads1 = tf.IndexedSlices(
                tf.constant([0.01], shape=[1, 1], dtype=dtype),
                tf.constant([1]),
                tf.constant([2, 1]),
            )
            sgd_op = gradient_descent.SGD(3.0).apply_gradients(
                zip([grads0, grads1], [var0, var1]))
            self.evaluate(tf.compat.v1.global_variables_initializer())
            # Run 1 step of sgd
            self.evaluate(sgd_op)
            # Validate updated params: only the indexed row moved, by
            # lr * grad; the other row is untouched.
            self.assertAllCloseAccordingToType([[1.0 - 3.0 * 0.1], [2.0]],
                                               self.evaluate(var0))
            self.assertAllCloseAccordingToType([[3.0], [4.0 - 3.0 * 0.01]],
                                               self.evaluate(var1))
def testMinimizeSparseResourceVariable(self):
    """minimize() handles gradients from a sparse embedding_lookup read."""
    # TODO(tanzheny, omalleyt): Fix test in eager mode.
    with tf.Graph().as_default():
        for dtype in [tf.half, tf.float32, tf.float64]:
            var0 = tf.Variable([[1.0, 2.0]], dtype=dtype)
            var1 = tf.Variable([3.0], dtype=dtype)
            x = tf.constant([[4.0], [5.0]], dtype=dtype)

            def loss():
                # pred = var0[0] . x + var1; loss = pred**2.
                pred = tf.matmul(
                    tf.compat.v1.nn.embedding_lookup([var0], [0]), x)
                pred += var1
                return pred * pred

            sgd_op = gradient_descent.SGD(1.0).minimize(loss, [var0, var1])
            self.evaluate(tf.compat.v1.global_variables_initializer())
            # Run 1 step of sgd
            self.evaluate(sgd_op)
            # Validate updated params against the hand-computed gradient:
            # d(pred^2)/d(pred) = 2*pred, chain-ruled through the matmul.
            np_pred = 1.0 * 4.0 + 2.0 * 5.0 + 3.0
            np_grad = 2 * np_pred
            self.assertAllCloseAccordingToType(
                [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]],
                self.evaluate(var0),
            )
            self.assertAllCloseAccordingToType([3.0 - np_grad],
                                               self.evaluate(var1))
def test_wide_deep_model_with_two_feature_columns(self):
    """Trains a WideDeepModel whose towers consume different feature columns."""
    vocab_list = ["alpha", "beta", "gamma"]
    vocab_val = [0.4, 0.6, 0.9]
    data = np.random.choice(vocab_list, size=256)
    y = np.zeros_like(data, dtype=np.float32)
    # Target is the per-vocab value plus small uniform noise, so the model
    # has a learnable (nearly deterministic) mapping.
    for vocab, val in zip(vocab_list, vocab_val):
        indices = np.where(data == vocab)
        y[indices] = val + np.random.uniform(
            low=-0.01, high=0.01, size=indices[0].shape)
    cat_column = tf.feature_column.categorical_column_with_vocabulary_list(
        key="symbol", vocabulary_list=vocab_list)
    ind_column = tf.feature_column.indicator_column(cat_column)
    emb_column = tf.feature_column.embedding_column(cat_column, dimension=5)
    # Wide tower: one-hot (indicator) features into a bias-free linear model.
    linear_feature_layer = dense_features_v2.DenseFeatures([ind_column])
    linear_model = linear.LinearModel(use_bias=False,
                                      kernel_initializer="zeros")
    combined_linear = sequential.Sequential(
        [linear_feature_layer, linear_model])
    # Deep tower: embedding features into a single dense unit.
    dnn_model = sequential.Sequential([core.Dense(units=1)])
    dnn_feature_layer = dense_features_v2.DenseFeatures([emb_column])
    combined_dnn = sequential.Sequential([dnn_feature_layer, dnn_model])
    wide_deep_model = wide_deep.WideDeepModel(combined_linear, combined_dnn)
    opt = gradient_descent.SGD(learning_rate=0.1)
    wide_deep_model.compile(opt, "mse", [],
                            run_eagerly=test_utils.should_run_eagerly())
    # Smoke-trains; the test passes if fit completes without error.
    wide_deep_model.fit(x={"symbol": data}, y=y, batch_size=32, epochs=10)
def test_custom_aggregation(self, distribution,
                            experimental_aggregate_gradients, expected):
    """apply_gradients honors the `experimental_aggregate_gradients` flag.

    Builds per-replica gradients by hand and checks the resulting variable
    values against the parameterized `expected` (aggregated vs. raw).
    """
    with distribution.scope():
        v = tf.Variable([0.0, 0.0])
        optimizer = gradient_descent.SGD(0.1)

    class PerReplica(values.DistributedValues):
        """Holds a map from replica to unsynchronized values."""

        @property
        def values(self):
            """Returns the per replica values."""
            return self._values

    @tf.function
    def optimize():
        # Place a different gradient tensor on each worker device so
        # aggregation (if enabled) has something to combine.
        with tf.device(distribution.extended.worker_devices[0]):
            v1 = tf.convert_to_tensor([1.0, 1.0])
        with tf.device(distribution.extended.worker_devices[1]):
            v2 = tf.convert_to_tensor([2.0, 2.0])
        grads = PerReplica([v1, v2])

        def step_fn(grads):
            optimizer.apply_gradients(
                [(grads, v)],
                experimental_aggregate_gradients=
                experimental_aggregate_gradients,
            )
            return v.read_value()

        return distribution.experimental_local_results(
            distribution.run(step_fn, args=(grads, )))

    self.assertAllClose(optimize(), expected)
def testConfig(self):
    """get_config/from_config round-trips all SGD hyperparameters."""
    opt = gradient_descent.SGD(learning_rate=1.0, momentum=0.9, nesterov=True)
    config = opt.get_config()
    opt2 = gradient_descent.SGD.from_config(config)
    lr = opt.lr
    lr2 = opt2.lr
    self.evaluate(tf.compat.v1.global_variables_initializer())
    # Round-trip before any hyper variables exist.
    self.assertAllClose(self.evaluate(lr), self.evaluate(lr2))
    self.assertAllClose(self.evaluate(opt._get_hyper("momentum")),
                        self.evaluate(opt2._get_hyper("momentum")))
    self.assertAllClose(self.evaluate(opt._get_hyper("decay")),
                        self.evaluate(opt2._get_hyper("decay")))
    var0 = tf.Variable([[1.0], [2.0]], dtype=tf.float32)
    loss = lambda: 3 * var0
    # learning rate variable created when calling minimize.
    opt.minimize(loss, [var0])
    self.evaluate(tf.compat.v1.global_variables_initializer())
    # Round-trip again now that the hyper variables have been created.
    config = opt.get_config()
    opt3 = gradient_descent.SGD.from_config(config)
    lr3 = opt3.lr
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.assertAllClose(self.evaluate(lr), self.evaluate(lr3))
    self.assertAllClose(self.evaluate(opt._get_hyper("momentum")),
                        self.evaluate(opt3._get_hyper("momentum")))
    self.assertAllClose(self.evaluate(opt._get_hyper("decay")),
                        self.evaluate(opt3._get_hyper("decay")))
    # Non-numeric config (nesterov flag) must survive the round-trip too.
    self.assertTrue(opt3.nesterov)
def testSharing(self):
    """Two apply_gradients ops from one SGD instance share momentum slots."""
    # TODO(tanzheny, omalleyt): Fix test in eager mode.
    with tf.Graph().as_default():
        for dtype in [tf.half, tf.float32, tf.float64]:
            var0 = tf.Variable([1.0, 2.0], dtype=dtype)
            var1 = tf.Variable([3.0, 4.0], dtype=dtype)
            grads0 = tf.constant([0.1, 0.1], dtype=dtype)
            grads1 = tf.constant([0.01, 0.01], dtype=dtype)
            mom_opt = gradient_descent.SGD(learning_rate=2.0, momentum=0.9)
            # Two distinct update ops built from the same optimizer; both
            # must read and write the same slot variables.
            mom_update1 = mom_opt.apply_gradients(
                zip([grads0, grads1], [var0, var1]))
            mom_update2 = mom_opt.apply_gradients(
                zip([grads0, grads1], [var0, var1]))
            self.evaluate(tf.compat.v1.global_variables_initializer())

            slot0 = mom_opt.get_slot(var0, "momentum")
            self.assertEqual(slot0.shape, var0.shape)
            slot1 = mom_opt.get_slot(var1, "momentum")
            self.assertEqual(slot1.shape, var1.shape)

            # Fetch params to validate initial values
            self.assertAllClose([1.0, 2.0], self.evaluate(var0))
            self.assertAllClose([3.0, 4.0], self.evaluate(var1))
            # Step 1: the momentum accumulators were 0. So we should see a
            # normal update: v -= grad * learning_rate
            self.evaluate(mom_update1)
            # Check that the momentum accumulators have been updated.
            self.assertAllCloseAccordingToType(np.array([-0.2, -0.2]),
                                               self.evaluate(slot0))
            self.assertAllCloseAccordingToType(np.array([-0.02, -0.02]),
                                               self.evaluate(slot1))
            # Check that the parameters have been updated.
            self.assertAllCloseAccordingToType(
                np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
                self.evaluate(var0))
            self.assertAllCloseAccordingToType(
                np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
                self.evaluate(var1))
            # Step 2: the second momentum accumulators contain the previous
            # update, proving the slots are shared between the two ops.
            self.evaluate(mom_update2)
            # Check that the momentum accumulators have been updated.
            self.assertAllCloseAccordingToType(
                np.array([(0.9 * (-0.2) - 2.0 * 0.1),
                          (0.9 * (-0.2) - 2.0 * 0.1)]),
                self.evaluate(slot0))
            self.assertAllCloseAccordingToType(
                np.array([(0.9 * (-0.02) - 2.0 * 0.01),
                          (0.9 * (-0.02) - 2.0 * 0.01)]),
                self.evaluate(slot1))
            # Check that the parameters have been updated.
            self.assertAllCloseAccordingToType(
                np.array([
                    1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
                    2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
                ]), self.evaluate(var0))
            self.assertAllCloseAccordingToType(
                np.array([
                    2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
                    3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
                ]), self.evaluate(var1))
def test_save_slot_variables_with_autocast_vars(self,
                                                strategy_fn,
                                                var_name='v'):
    """Slot variables of AutoCastVariables round-trip through checkpoints."""
    p = policy.Policy('mixed_float16')
    with strategy_fn().scope(), policy.policy_scope(p):
        x = layers.Input(shape=(2, ), batch_size=2)
        # Having a var_name other than 'v' tests that a fixed bug (b/134713714)
        # does not reoccur. The bug was that a crash would occur when saving a
        # checkpoint where an AutoCastVariable with a slot variable would have a
        # different name than the layer attribute's name (layer.v in this case).
        layer = mp_test_util.MultiplyLayer(assert_type=tf.float16,
                                           var_name=var_name)
        y = layer(x)
        model = models.Model(inputs=x, outputs=y)
        # Nonzero momentum so SGD actually creates a slot variable.
        opt = gradient_descent.SGD(1., 1.)
        opt = loss_scale_optimizer.LossScaleOptimizer(opt,
                                                      dynamic=False,
                                                      initial_scale=1)
        model.compile(optimizer=opt,
                      loss='mse',
                      run_eagerly=test_utils.should_run_eagerly())

        model.fit(np.ones((2, 2)), np.zeros((2, 2)), batch_size=2)
        weights_file = os.path.join(self.get_temp_dir(), 'weights')
        model.save_weights(weights_file)
        saved_slot = backend.get_value(opt.get_slot(layer.v, 'momentum'))

        # Train again so the slot drifts away from its saved value.
        model.fit(np.ones((2, 2)), np.zeros((2, 2)), batch_size=2)
        new_slot = backend.get_value(opt.get_slot(layer.v, 'momentum'))
        self.assertNotEqual(new_slot, saved_slot)

        # Restoring the weights must bring the slot back exactly.
        model.load_weights(weights_file)
        restored_slot = backend.get_value(opt.get_slot(layer.v, 'momentum'))
        self.assertEqual(restored_slot, saved_slot)
def testIterations(self):
    """Setting `iterations` on the LSO propagates to the wrapped optimizer."""
    inner = gradient_descent.SGD(2.0)
    wrapper = loss_scale_optimizer.LossScaleOptimizer(inner,
                                                      dynamic=False,
                                                      initial_scale=10.)
    wrapper.iterations = 7
    # Both views of the counter must agree after the assignment.
    self.assertEqual(wrapper.iterations, 7)
    self.assertEqual(inner.iterations, 7)
def test_model_with_fixed_input_dim(self):
    """Ensure that the batch_dim is removed when saving.

    When serving or retraining, it is important to reset the batch dim.
    This can be an issue inside of tf.function. See b/132783590 for context.
    """
    model = test_utils.get_small_mlp(10, 3, 5)
    mse = keras.losses.MeanSquaredError()
    sgd = gradient_descent.SGD()

    @tf.function
    def train_step(batch, targets):
        with tf.GradientTape() as tape:
            predictions = model(batch)
            loss = mse(targets, predictions)
        grads = tape.gradient(loss, model.trainable_variables)
        sgd.apply_gradients(zip(grads, model.trainable_variables))

    # One concrete call with batch size 8 before tracing the model.
    features = np.random.random((8, 5))
    labels = np.random.random((8, 3))
    train_step(features, labels)

    traced = saving_utils.trace_model_call(model)
    # The traced signature must carry a dynamic (None) batch dimension.
    self.assertEqual(
        traced.structured_input_signature[0][0].shape.as_list(),
        tf.TensorShape([None, 5]).as_list(),
    )
def get_mnist_model(input_shape):
    """Define a deterministically-initialized CNN model for MNIST testing."""
    inputs = keras.Input(shape=input_shape)
    hidden = keras.layers.Conv2D(
        32,
        kernel_size=(3, 3),
        activation="relu",
        # Fixed seed keeps initialization reproducible across runs.
        kernel_initializer=keras.initializers.TruncatedNormal(seed=99),
    )(inputs)
    hidden = keras.layers.BatchNormalization()(hidden)
    # Two parallel Flatten branches summed together.
    hidden = keras.layers.Flatten()(hidden) + keras.layers.Flatten()(hidden)
    outputs = keras.layers.Dense(
        10,
        activation="softmax",
        kernel_initializer=keras.initializers.TruncatedNormal(seed=99),
    )(hidden)
    model = keras.Model(inputs=inputs, outputs=outputs)
    # TODO(yuefengz): optimizer with slot variables doesn't work because of
    # optimizer's bug.
    # TODO(yuefengz): we should not allow non-v2 optimizer.
    model.compile(
        loss=keras.losses.sparse_categorical_crossentropy,
        optimizer=gradient_descent.SGD(learning_rate=0.001),
        metrics=["accuracy"],
    )
    return model
def testConfigWithLearningRateDecay(self):
    """Config round-trip preserves LearningRateSchedule learning rates."""
    with test_utils.use_gpu():
        var0 = tf.Variable([[1.0], [2.0]], dtype=tf.float32)
        for decay_schedule in [
                learning_rate_schedule.InverseTimeDecay(
                    0.5, decay_steps=1.0, decay_rate=0.1),
                learning_rate_schedule.PiecewiseConstantDecay(
                    [5], [1., .5])
        ]:
            step = 10
            opt = gradient_descent.SGD(decay_schedule)
            config = opt.get_config()
            opt2 = gradient_descent.SGD.from_config(config)
            # assert both are equal float values.
            self.assertAllEqual(
                decay_schedule(step),
                opt._get_hyper('learning_rate')(step))
            self.assertAllEqual(
                decay_schedule(step),
                opt2._get_hyper('learning_rate')(step))
            loss = lambda: 3 * var0
            # learning rate variable is created when calling minimize.
            opt.minimize(loss, [var0])
            self.evaluate(tf.compat.v1.global_variables_initializer())
            # Round-trip again after minimize has materialized the hyper.
            config = opt.get_config()
            opt3 = gradient_descent.SGD.from_config(config)
            self.assertAllEqual(
                self.evaluate(opt._get_hyper('learning_rate')(step)),
                opt3._get_hyper('learning_rate')(step))
def get_model(
    self,
    max_words=10,
    initial_weights=None,
    distribution=None,
    input_shapes=None,
):
    """Builds an embedding + pooling text classifier for correctness tests.

    Args:
      self: test-case instance; `self.use_distributed_dense` selects an
        extra TimeDistributed(Dense) layer.
      max_words: sequence length of the integer word-id input.
      initial_weights: optional weights to set for deterministic comparison.
      distribution: optional tf.distribute strategy to build the model under.
      input_shapes: unused; accepted for harness compatibility.

    Returns:
      A compiled Keras model.
    """
    del input_shapes
    with keras_correctness_test_base.MaybeDistributionScope(distribution):
        word_ids = keras.layers.Input(shape=(max_words, ),
                                      dtype=np.int32,
                                      name="words")
        word_embed = keras.layers.Embedding(input_dim=20,
                                            output_dim=10)(word_ids)
        if self.use_distributed_dense:
            word_embed = keras.layers.TimeDistributed(
                keras.layers.Dense(4))(word_embed)
        avg = keras.layers.GlobalAveragePooling1D()(word_embed)
        preds = keras.layers.Dense(2, activation="softmax")(avg)
        model = keras.Model(inputs=[word_ids], outputs=[preds])
        if initial_weights:
            model.set_weights(initial_weights)
        model.compile(
            optimizer=gradient_descent_keras.SGD(learning_rate=0.1),
            loss="sparse_categorical_crossentropy",
            metrics=["sparse_categorical_accuracy"],
        )
        return model
def test_gradient(self, strategy_fn):
    """LR is applied to the float32 variable, not the float16 compute value."""
    x = tf.constant([1.])
    with strategy_fn().scope() as strategy:
        with policy.policy_scope('mixed_float16'):
            layer = mp_test_util.MultiplyLayer(assert_type=tf.float16)
            # Learning rate is small enough that if applied to a float16
            # variable, the variable will not change. So this tests the
            # learning rate is not applied to a float16 value, but instead
            # the float32 variable.
            opt = gradient_descent.SGD(2**-14)

            def run_fn():
                with tf.GradientTape() as tape:
                    y = layer(x)
                    # Divide by num_replicas_in_sync, as the effective total
                    # loss is the sum of each of the replica's losses.
                    y /= strategy.num_replicas_in_sync
                grad = tape.gradient(y, layer.v)
                return opt.apply_gradients([(grad, layer.v)])

            op = strategy.experimental_run(run_fn)
            if not tf.executing_eagerly():
                self.evaluate(tf.compat.v1.global_variables_initializer())
                self.evaluate(op)
            # The gradient with respective to the variable is 1. Since the
            # variable is initialized with 1 and the learning rate is 2**-14,
            # the new variable value should be:
            # init_val - gradient * learning_rate, which is 1 - 1 * 2**-14
            self.assertEqual(self.evaluate(layer.v), 1 - 2**-14)
def test_wrap_optimizer_dynamic_loss_scale_errors(self):
    """Invalid loss scales passed to the graph rewrite raise clear errors."""
    opt = gradient_descent_v2.SGD(1.0)

    # A DynamicLossScale with a multiplier other than 2 is rejected.
    with self.assertRaisesRegex(
            ValueError, 'When passing a DynamicLossScale to "loss_scale", '
            'DynamicLossScale.multiplier must be 2. Got: '
            'DynamicLossScale'):
        tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite(
            opt,
            tf.compat.v1.mixed_precision.DynamicLossScale(multiplier=4.))

    class MyLossScale(tf.compat.v1.mixed_precision.LossScale):
        # Minimal LossScale subclass that is neither Fixed nor Dynamic.

        def __call__(self):
            return 1.

        def update(self, grads):
            return None, True

        def get_config(self):
            return {}

    # Arbitrary LossScale subclasses are rejected outright.
    with self.assertRaisesRegex(
            TypeError, 'Passing a LossScale that is not a FixedLossScale or a '
            'DynamicLossScale is not supported. Got:'):
        tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite(
            opt, MyLossScale())
def test_linear_model_with_feature_column(self):
    """LinearModel on indicator-column input learns the per-vocab targets."""
    vocab_list = ["alpha", "beta", "gamma"]
    vocab_val = [0.4, 0.6, 0.9]
    data = np.random.choice(vocab_list, size=256)
    y = np.zeros_like(data, dtype=np.float32)
    # Target is the per-vocab value plus small uniform noise.
    for vocab, val in zip(vocab_list, vocab_val):
        indices = np.where(data == vocab)
        y[indices] = val + np.random.uniform(
            low=-0.01, high=0.01, size=indices[0].shape
        )
    cat_column = tf.feature_column.categorical_column_with_vocabulary_list(
        key="symbol", vocabulary_list=vocab_list
    )
    ind_column = tf.feature_column.indicator_column(cat_column)
    dense_feature_layer = dense_features_v2.DenseFeatures([ind_column])
    # Bias-free zero-initialized kernel: the learned kernel rows directly
    # approximate the vocab values.
    linear_model = linear.LinearModel(
        use_bias=False, kernel_initializer="zeros"
    )
    combined = sequential.Sequential([dense_feature_layer, linear_model])
    opt = gradient_descent.SGD(learning_rate=0.1)
    combined.compile(opt, "mse", [])
    combined.fit(x={"symbol": data}, y=y, batch_size=32, epochs=10)
    self.assertAllClose(
        [[0.4], [0.6], [0.9]],
        combined.layers[1].dense_layers[0].kernel.numpy(),
        atol=0.01,
    )
def test_linear_model(self, distribution, use_dataset_creator, data_fn):
    """LinearModel trains to low loss under the given distribution strategy."""
    # ParameterServerStrategy constrains how data can be fed to model.fit.
    if (not use_dataset_creator) and isinstance(
            distribution, tf.distribute.experimental.ParameterServerStrategy):
        self.skipTest(
            "Parameter Server strategy requires dataset creator to be used in "
            "model.fit.")
    if (not tf.__internal__.tf2.enabled() and use_dataset_creator and
            isinstance(
                distribution,
                tf.distribute.experimental.ParameterServerStrategy)):
        self.skipTest(
            "Parameter Server strategy with dataset creator needs to be run when "
            "eager execution is enabled.")
    with distribution.scope():
        model = linear.LinearModel()
        opt = gradient_descent.SGD(learning_rate=0.1)
        model.compile(opt, "mse")
        if use_dataset_creator:
            x = dataset_creator.DatasetCreator(dataset_fn)
            hist = model.fit(x, epochs=3, steps_per_epoch=INPUT_SIZE)
        else:
            if data_fn == "numpy":
                inputs, output = get_numpy()
                hist = model.fit(inputs, output, epochs=3)
            else:
                hist = model.fit(get_dataset(), epochs=3)
        # Loss after the third epoch should have converged below 0.2.
        self.assertLess(hist.history["loss"][2], 0.2)
def test_wide_deep_model(self, distribution, use_dataset_creator, data_fn):
    """WideDeepModel with per-tower optimizers trains under a strategy."""
    # ParameterServerStrategy constrains how data can be fed to model.fit.
    if (not use_dataset_creator) and isinstance(
            distribution, tf.distribute.experimental.ParameterServerStrategy):
        self.skipTest(
            "Parameter Server strategy requires dataset creator to be used in "
            "model.fit.")
    if (not tf.__internal__.tf2.enabled() and use_dataset_creator and
            isinstance(
                distribution,
                tf.distribute.experimental.ParameterServerStrategy)):
        self.skipTest(
            "Parameter Server strategy with dataset creator needs to be run when "
            "eager execution is enabled.")
    with distribution.scope():
        linear_model = linear.LinearModel(units=1)
        dnn_model = sequential.Sequential([core.Dense(units=1)])
        wide_deep_model = wide_deep.WideDeepModel(linear_model, dnn_model)
        # Distinct optimizers per tower: SGD for linear, Adagrad for DNN.
        linear_opt = gradient_descent.SGD(learning_rate=0.05)
        dnn_opt = adagrad.Adagrad(learning_rate=0.1)
        wide_deep_model.compile(optimizer=[linear_opt, dnn_opt], loss="mse")
        if use_dataset_creator:
            x = dataset_creator.DatasetCreator(dataset_fn)
            hist = wide_deep_model.fit(x, epochs=3,
                                       steps_per_epoch=INPUT_SIZE)
        else:
            if data_fn == "numpy":
                inputs, output = get_numpy()
                hist = wide_deep_model.fit(inputs, output, epochs=3)
            else:
                hist = wide_deep_model.fit(get_dataset(), epochs=3)
        # Loss after the third epoch should have converged below 0.2.
        self.assertLess(hist.history["loss"][2], 0.2)
def get_model(self,
              initial_weights=None,
              distribution=None,
              input_shapes=None):
    """Builds a small image CNN; the batch-norm variant is chosen by
    `self.with_batch_norm` ('regular', 'sync', or anything else for none).
    """
    with keras_correctness_test_base.MaybeDistributionScope(distribution):
        image = keras.layers.Input(shape=(28, 28, 3), name='image')
        c1 = keras.layers.Conv2D(
            name='conv1',
            filters=16,
            kernel_size=(3, 3),
            strides=(4, 4),
            kernel_regularizer=keras.regularizers.l2(1e-4))(image)
        if self.with_batch_norm == 'regular':
            c1 = keras.layers.BatchNormalization(name='bn1')(c1)
        elif self.with_batch_norm == 'sync':
            # Test with parallel batch norms to verify all-reduce works OK.
            bn1 = keras.layers.SyncBatchNormalization(name='bn1')(c1)
            bn2 = keras.layers.SyncBatchNormalization(name='bn2')(c1)
            c1 = keras.layers.Add()([bn1, bn2])
        c1 = keras.layers.MaxPooling2D(pool_size=(2, 2))(c1)
        logits = keras.layers.Dense(10, activation='softmax', name='pred')(
            keras.layers.Flatten()(c1))
        model = keras.Model(inputs=[image], outputs=[logits])
        if initial_weights:
            model.set_weights(initial_weights)
        model.compile(optimizer=gradient_descent.SGD(learning_rate=0.1),
                      loss='sparse_categorical_crossentropy',
                      metrics=['sparse_categorical_accuracy'])
        return model
def get_model(self,
              initial_weights=None,
              distribution=None,
              input_shapes=None):
    """Builds a small nonlinear regression MLP under an optional strategy."""
    with keras_correctness_test_base.MaybeDistributionScope(distribution):
        # We add few non-linear layers to make it non-trivial.
        model = keras.Sequential([
            keras.layers.Dense(10, activation="relu", input_shape=(1, )),
            keras.layers.Dense(
                10,
                activation="relu",
                kernel_regularizer=keras.regularizers.l2(1e-4),
            ),
            keras.layers.Dense(10, activation="relu"),
            keras.layers.Dense(1),
        ])
        if initial_weights:
            model.set_weights(initial_weights)
        model.compile(
            loss=keras.losses.mean_squared_error,
            optimizer=gradient_descent_keras.SGD(0.05),
            metrics=["mse"],
        )
        return model
def test_save_model_with_dynamic_loss_scaling(self, strategy_fn, h5=False):
    """Saving/loading a model restores dynamic loss-scale optimizer state."""
    # TODO(reedwm): Support and test saving model with a mixed_[b]float16
    # policy as well.
    strategy = strategy_fn()
    if (isinstance(strategy, tf.distribute.MirroredStrategy) and
            not tf.executing_eagerly()):
        # TODO(b/121381184): Enable running the test in this case.
        return

    # Create and run model.
    with strategy.scope():
        x = layers.Input(shape=(2, ), batch_size=2, dtype=tf.float32)
        y = mp_test_util.MultiplyLayer()(x)
        model = models.Model(inputs=x, outputs=y)

        opt = gradient_descent.SGD(1.)
        opt = loss_scale_optimizer.LossScaleOptimizer(
            opt, initial_scale=1., dynamic_growth_steps=2.)
        model.compile(optimizer=opt,
                      loss='mse',
                      run_eagerly=test_utils.should_run_eagerly())
    # Run for 3 steps (6 examples with a batch size of 2)
    model.fit(np.ones((6, 2)), np.zeros((6, 2)), batch_size=2)
    # After 3 steps with growth every 2 steps: scale doubled once, and one
    # step has accumulated toward the next doubling.
    self.assertEqual(backend.get_value(opt.loss_scale), 2)
    self.assertEqual(backend.get_value(opt.dynamic_counter), 1)
    (weight, ) = model.trainable_weights
    orig_weight = backend.get_value(weight)

    # Save model weights.
    save_path = os.path.join(self.get_temp_dir(), 'model')
    model.save(save_path, save_format='h5' if h5 else 'tf')

    # Run model again for 1 step (2 examples with a batch size of 2)
    model.fit(np.ones((2, 2)), np.zeros((2, 2)), batch_size=2)
    new_weight = backend.get_value(weight)
    self.assertNotEqual(new_weight, orig_weight)
    # The extra step completes a growth period: scale doubles again.
    self.assertEqual(backend.get_value(opt.loss_scale), 4)
    self.assertEqual(backend.get_value(opt.dynamic_counter), 0)

    # Load model weights and ensure loss scale weights are restored.
    model = save.load_model(
        save_path,
        custom_objects={'MultiplyLayer': mp_test_util.MultiplyLayer})
    (weight, ) = model.trainable_weights
    loaded_weight = backend.get_value(weight)
    self.assertEqual(loaded_weight, orig_weight)
    # Currently the loss scale isn't always saved when the model is saved
    # with Model.save(). So we assert the loss scale either has the value
    # when it was saved, or the value it was initialized with.
    # TODO(reedwm): Always save/restore the loss scale with Model.save().
    self.assertIn(backend.get_value(model.optimizer.loss_scale), (1, 2))
    self.assertIn(backend.get_value(model.optimizer.dynamic_counter), (0, 1))

    # Test optimizer attributes and type
    self.assertEqual(model.optimizer.initial_scale, 1.)
    self.assertEqual(model.optimizer.dynamic_growth_steps, 2.)
    self.assertEqual(type(model.optimizer),
                     loss_scale_optimizer.LossScaleOptimizer)
def testIsInstance(self):
    """create_lso yields a BaseLossScaleOptimizer for both optimizer APIs."""
    # Experimental-API optimizer first, then the legacy V2 optimizer.
    for inner_optimizer in (sgd_experimental.SGD(), gradient_descent.SGD()):
        wrapped = create_lso(inner_optimizer)
        self.assertIsInstance(wrapped,
                              loss_scale_optimizer.BaseLossScaleOptimizer)
def testDir(self):
    """dir() on SGD exposes its hyperparameters and key attributes."""
    opt = gradient_descent.SGD(learning_rate=1.0, momentum=0.1)
    exposed = set(dir(opt))
    # Hyperparameters first, then plain attributes.
    for name in ('learning_rate', 'lr', 'momentum', 'nesterov', 'minimize'):
        self.assertIn(name, exposed)