def test_wide_deep_model_backprop(self):
  """Checks that one epoch of training updates both sub-model kernels.

  Both kernels start at zero, each branch is fed the single value 1.0 and the
  target is 1 + 2 * 1 = 3. After one epoch the linear kernel (SGD with
  learning rate 0.1) is expected to be 0.6 and the DNN kernel (SGD with
  learning rate 0.3) is expected to be 1.8.
  """
  with self.cached_session():
    wide = linear.LinearModel(units=1, kernel_initializer='zeros')
    deep = sequential.Sequential(
        [core.Dense(units=1, kernel_initializer='zeros')])
    model = wide_deep.WideDeepModel(wide, deep)

    linear_inp = np.array([1.])
    dnn_inp = np.array([1.])
    target = linear_inp + 2 * dnn_inp

    # One optimizer per branch: linear first, DNN second.
    per_branch_optimizers = [
        gradient_descent.SGD(learning_rate=.1),
        gradient_descent.SGD(learning_rate=.3),
    ]
    model.compile(
        optimizer=per_branch_optimizers,
        loss='mse',
        metrics=[],
        run_eagerly=testing_utils.should_run_eagerly())
    self.evaluate(tf.compat.v1.global_variables_initializer())
    model.fit([linear_inp, dnn_inp], target, epochs=1)

    linear_kernel = self.evaluate(model.linear_model.dense_layers[0].kernel)
    self.assertAllClose([[0.6]], linear_kernel)
    dnn_kernel = self.evaluate(model.dnn_model.layers[0].kernel)
    self.assertAllClose([[1.8]], dnn_kernel)
def test_error_if_policy_is_set(self):
  """The graph rewrite must raise only while a mixed policy is active."""
  expected_message = 'the global Keras dtype Policy has been set'

  with policy.policy_scope('mixed_float16'):
    with self.assertRaisesRegex(ValueError, expected_message):
      enable_mixed_precision_graph_rewrite(gradient_descent_v2.SGD(1.0))

  # With the default policy in effect, no error is expected.
  enable_mixed_precision_graph_rewrite(gradient_descent_v2.SGD(1.0))

  # A non-mixed policy must not trigger the error either.
  with policy.policy_scope('float64'):
    enable_mixed_precision_graph_rewrite(gradient_descent_v2.SGD(1.0))
def testConstructMomentumWithLR(self):
  """Checks how the `lr` and `learning_rate` constructor keywords interact.

  The expected values show that when both keywords are given, `lr` is the one
  that ends up in `opt.lr`.
  """
  # (optimizer, expected value of optimizer.lr)
  cases = (
      (gradient_descent.SGD(lr=1.0, momentum=0.9), 1.0),
      (gradient_descent.SGD(learning_rate=0.1, momentum=0.9, lr=1.0), 1.0),
      (gradient_descent.SGD(learning_rate=0.1, momentum=0.9), 0.1),
  )

  # Every optimizer must expose its learning rate as a variable.
  for optimizer, _ in cases:
    self.assertIsInstance(optimizer.lr, tf.Variable)

  self.evaluate(tf.compat.v1.global_variables_initializer())

  for optimizer, expected_lr in cases:
    self.assertAllClose(self.evaluate(optimizer.lr), expected_lr)
def test_custom_aggregation(self, distribution,
                            experimental_aggregate_gradients, expected):
  """Applies hand-built per-replica gradients and checks the local results.

  Args:
    distribution: the distribution strategy under test.
    experimental_aggregate_gradients: forwarded unchanged to
      `optimizer.apply_gradients`.
    expected: the values `optimize()` is expected to return.
  """
  with distribution.scope():
    v = tf.Variable([0., 0.])
    optimizer = gradient_descent.SGD(0.1)

  @tf.function
  def optimize():
    worker_devices = distribution.extended.worker_devices
    # Place one gradient tensor on each of the first two worker devices.
    with tf.compat.v1.device(worker_devices[0]):
      first_grad = tf.convert_to_tensor([1., 1.])
    with tf.compat.v1.device(worker_devices[1]):
      second_grad = tf.convert_to_tensor([2., 2.])
    per_replica_grads = values.PerReplica([first_grad, second_grad])

    def step_fn(grads):
      optimizer.apply_gradients(
          [(grads, v)],
          experimental_aggregate_gradients=experimental_aggregate_gradients)
      return v.read_value()

    step_result = distribution.run(step_fn, args=(per_replica_grads,))
    return distribution.experimental_local_results(step_result)

  self.assertAllClose(optimize(), expected)
def test_dataset_creator_usage_in_parameter_server_model_fit(self):
  """Runs `model.fit` with a DatasetCreator under ParameterServerStrategy.

  An in-process cluster (2 workers, 1 parameter server, plus a chief entry) is
  created and the test asserts that one loss value is recorded per epoch.
  """
  num_epochs = 10

  cluster_def = multi_worker_test_base.create_in_process_cluster(
      num_workers=2, num_ps=1, rpc_layer="grpc")
  chief_port = multi_worker_test_base.pick_unused_port()
  cluster_def["chief"] = ["localhost:%d" % chief_port]

  resolver = SimpleClusterResolver(ClusterSpec(cluster_def), rpc_layer="grpc")
  strategy = tf.distribute.experimental.ParameterServerStrategy(resolver)
  with strategy.scope():
    model = sequential.Sequential([core_layers.Dense(10)])
  model.compile(gradient_descent.SGD(), loss="mse")

  def dataset_fn(input_context):
    """Builds an endless, sharded, batched dataset for one input pipeline."""
    global_batch_size = 64
    batch_size = input_context.get_per_replica_batch_size(global_batch_size)
    return (tf.data.Dataset.from_tensors(([1.], [1.]))
            .repeat()
            .shard(input_context.num_input_pipelines,
                   input_context.input_pipeline_id)
            .batch(batch_size)
            .prefetch(2))

  history = model.fit(
      dataset_creator.DatasetCreator(dataset_fn),
      epochs=num_epochs,
      steps_per_epoch=10,
      verbose=0)
  self.assertLen(history.history["loss"], 10)
def test_linear_model(self, distribution, use_dataset_creator, data_fn):
  """Trains a LinearModel under `distribution` and checks the final loss.

  Args:
    distribution: the distribution strategy under test.
    use_dataset_creator: whether to feed `model.fit` with a DatasetCreator.
    data_fn: 'numpy' to train on numpy arrays; any other value trains on the
      dataset returned by `get_dataset()`. Only read when no DatasetCreator
      is used.
  """
  is_parameter_server = isinstance(
      distribution, tf.distribute.experimental.ParameterServerStrategy)
  if is_parameter_server and not use_dataset_creator:
    self.skipTest(
        'Parameter Server strategy requires dataset creator to be used in '
        'model.fit.')
  if (is_parameter_server and use_dataset_creator and
      not tf.__internal__.tf2.enabled()):
    self.skipTest(
        'Parameter Server strategy with dataset creator needs to be run when '
        'eager execution is enabled.')

  num_epochs = 5
  with distribution.scope():
    model = linear.LinearModel()
    model.compile(gradient_descent.SGD(learning_rate=0.1), 'mse')
    if use_dataset_creator:
      creator = dataset_creator.DatasetCreator(dataset_fn)
      hist = model.fit(creator, epochs=num_epochs, steps_per_epoch=INPUT_SIZE)
    else:
      if data_fn == 'numpy':
        inputs, output = get_numpy()
        hist = model.fit(inputs, output, epochs=num_epochs)
      else:
        hist = model.fit(get_dataset(), epochs=num_epochs)
      # NOTE(review): nesting was reconstructed from a collapsed source line;
      # the loss check is placed on the non-DatasetCreator path -- confirm.
      self.assertLess(hist.history['loss'][4], 0.2)
def testUnsupportedStrategy(self):
  """LossScaleOptimizer must reject CentralStorageStrategy.

  The error is expected both when the optimizer is constructed inside the
  strategy scope and when an optimizer built outside the scope is later used
  to minimize under the strategy.
  """
  strategy = tf.distribute.experimental.CentralStorageStrategy()
  expected_error = (
      'Loss scaling is not supported with the tf.distribute.Strategy: '
      'CentralStorageStrategy. Try using a different Strategy, e.g. a '
      'MirroredStrategy')

  # Case 1: construction inside the scope.
  with strategy.scope(), self.assertRaisesRegex(ValueError, expected_error):
    loss_scale_optimizer.LossScaleOptimizer(gradient_descent.SGD())

  # Case 2: construction outside the scope, use inside it.
  opt = loss_scale_optimizer.LossScaleOptimizer(gradient_descent.SGD())
  with strategy.scope():
    var = tf.Variable(1.0)

    def loss():
      return var * 2.0

    def run_fn():
      return opt.minimize(loss, [var])

    with self.assertRaisesRegex(ValueError, expected_error):
      strategy.experimental_run(run_fn)
def testMinimizeSparseResourceVariable(self):
  """Minimizes a loss that reads var0 through an embedding lookup.

  One SGD step with learning rate 1.0 is run for each dtype and the updated
  variables are compared against values computed by hand.
  """
  # TODO(tanzheny, omalleyt): Fix test in eager mode.
  with tf.Graph().as_default():
    for dtype in [tf.half, tf.float32, tf.float64]:
      var0 = tf.Variable([[1.0, 2.0]], dtype=dtype)
      var1 = tf.Variable([3.0], dtype=dtype)
      x = tf.constant([[4.0], [5.0]], dtype=dtype)

      def loss():
        looked_up = tf.compat.v1.nn.embedding_lookup([var0], [0])  # pylint: disable=cell-var-from-loop
        pred = tf.matmul(looked_up, x)  # pylint: disable=cell-var-from-loop
        pred += var1  # pylint: disable=cell-var-from-loop
        return pred * pred

      optimizer = gradient_descent.SGD(1.0)
      sgd_op = optimizer.minimize(loss, [var0, var1])
      self.evaluate(tf.compat.v1.global_variables_initializer())

      # A single optimization step.
      self.evaluate(sgd_op)

      # Hand-computed prediction and d(pred^2)/d(pred).
      np_pred = 1.0 * 4.0 + 2.0 * 5.0 + 3.0
      np_grad = 2 * np_pred
      expected_var0 = [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]]
      expected_var1 = [3.0 - np_grad]
      self.assertAllCloseAccordingToType(expected_var0, self.evaluate(var0))
      self.assertAllCloseAccordingToType(expected_var1, self.evaluate(var1))
def testSharing(self):
  """Runs two update ops built from one momentum optimizer.

  Both ops are created from the same optimizer instance; the test checks the
  "momentum" slots and the variables after each op has been run once.
  """
  # TODO(tanzheny, omalleyt): Fix test in eager mode.
  with tf.Graph().as_default():
    for dtype in [tf.half, tf.float32, tf.float64]:
      var0 = tf.Variable([1.0, 2.0], dtype=dtype)
      var1 = tf.Variable([3.0, 4.0], dtype=dtype)
      grads0 = tf.constant([0.1, 0.1], dtype=dtype)
      grads1 = tf.constant([0.01, 0.01], dtype=dtype)
      mom_opt = gradient_descent.SGD(learning_rate=2.0, momentum=0.9)
      first_update = mom_opt.apply_gradients(
          zip([grads0, grads1], [var0, var1]))
      second_update = mom_opt.apply_gradients(
          zip([grads0, grads1], [var0, var1]))
      self.evaluate(tf.compat.v1.global_variables_initializer())

      slot0 = mom_opt.get_slot(var0, "momentum")
      self.assertEqual(slot0.shape, var0.shape)
      slot1 = mom_opt.get_slot(var1, "momentum")
      self.assertEqual(slot1.shape, var1.shape)

      # Initial values, before any update has run.
      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
      self.assertAllClose([3.0, 4.0], self.evaluate(var1))

      # Step 1: the momentum accumulators were 0, so this is a plain update:
      # v -= grad * learning_rate
      self.evaluate(first_update)
      # Accumulators after step 1.
      self.assertAllCloseAccordingToType(
          np.array([-0.2, -0.2]), self.evaluate(slot0))
      self.assertAllCloseAccordingToType(
          np.array([-0.02, -0.02]), self.evaluate(slot1))
      # Parameters after step 1.
      expected_var0 = np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)])
      self.assertAllCloseAccordingToType(expected_var0, self.evaluate(var0))
      expected_var1 = np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)])
      self.assertAllCloseAccordingToType(expected_var1, self.evaluate(var1))

      # Step 2: the accumulators now hold the previous update.
      self.evaluate(second_update)
      # Accumulators after step 2.
      expected_slot0 = np.array([(0.9 * (-0.2) - 2.0 * 0.1),
                                 (0.9 * (-0.2) - 2.0 * 0.1)])
      self.assertAllCloseAccordingToType(expected_slot0, self.evaluate(slot0))
      expected_slot1 = np.array([(0.9 * (-0.02) - 2.0 * 0.01),
                                 (0.9 * (-0.02) - 2.0 * 0.01)])
      self.assertAllCloseAccordingToType(expected_slot1, self.evaluate(slot1))
      # Parameters after step 2.
      expected_var0 = np.array([
          1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
          2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
      ])
      self.assertAllCloseAccordingToType(expected_var0, self.evaluate(var0))
      expected_var1 = np.array([
          2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
          3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
      ])
      self.assertAllCloseAccordingToType(expected_var1, self.evaluate(var1))
def testDynamicLossScaleWithSlots(self, strategy_fn):
  """Dynamic loss scaling around an inner optimizer that owns slot variables.

  Args:
    strategy_fn: zero-argument callable returning the strategy under test.
  """
  strategy_obj = strategy_fn()
  known_broken = (
      isinstance(strategy_obj, tf.distribute.MirroredStrategy) and
      tf.compat.v1.control_flow_v2_enabled() and
      not tf.executing_eagerly())
  if known_broken:
    self.skipTest('b/138667997')

  with strategy_obj.scope() as strategy:
    var = tf.Variable([1.0, 2.0])
    initial_scale = 2.
    # SGD with momentum is used because it creates slot variables.
    inner_opt = gradient_descent.SGD(1.0, momentum=1.)
    opt = loss_scale_optimizer.LossScaleOptimizer(
        inner_opt, initial_scale=initial_scale, dynamic_growth_steps=1)

    def loss():
      return var / strategy.num_replicas_in_sync

    def run_fn():
      return opt.minimize(loss, var_list=[var])

    run_op = strategy.experimental_run(run_fn)
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self._run_if_in_graph_mode(run_op)
    # The momentum accumulator starts at 0 and the gradient is 1. The
    # accumulator grows by the gradient to 1, and the variable then has the
    # accumulator (1) subtracted from it.
    self.assertAllClose([0.0, 1.0], self.evaluate(var))
    self.assertEqual(self.evaluate(opt.loss_scale), initial_scale * 2)

    run_op = strategy.experimental_run(run_fn)
    self._run_if_in_graph_mode(run_op)
    # The accumulator was 1 and the gradient is again 1, so the accumulator
    # becomes 2 and the variable has 2 subtracted from it.
    self.assertAllClose([-2., -1.], self.evaluate(var))
    self.assertEqual(self.evaluate(opt.loss_scale), initial_scale * 4)

    self.assertEqual(opt.get_slot_names(), ['momentum'])
def testPassingV1LossScaleErrors(self):
  """LossScaleOptimizerV1 must reject unsupported loss-scale objects.

  Two cases are covered: a DynamicLossScale whose multiplier is not 2
  (ValueError) and a custom LossScale subclass (TypeError).
  """

  class MyLossScale(tf.mixed_precision.experimental.LossScale):
    """Minimal custom loss scale used only to trigger the TypeError."""

    def __call__(self):
      return 1.

    def update(self, grads):
      return None, True

    def get_config(self):
      return {}

  opt = gradient_descent.SGD()

  bad_multiplier_scale = tf.mixed_precision.experimental.DynamicLossScale(
      multiplier=4)
  bad_multiplier_message = (
      'When passing a DynamicLossScale to "loss_scale", '
      'DynamicLossScale.multiplier must be 2. Got: '
      'DynamicLossScale')
  with self.assertRaisesRegex(ValueError, bad_multiplier_message):
    loss_scale_optimizer.LossScaleOptimizerV1(opt, bad_multiplier_scale)

  custom_scale_message = (
      'Passing a LossScale that is not a FixedLossScale or a '
      'DynamicLossScale is no longer supported. Got:')
  with self.assertRaisesRegex(TypeError, custom_scale_message):
    loss_scale_optimizer.LossScaleOptimizerV1(opt, MyLossScale())
def testDynamicUpdate(self, strategy_fn):
  """Loss scale doubles after a finite step and halves after a NaN step.

  Args:
    strategy_fn: zero-argument callable returning the strategy under test.
  """
  with strategy_fn().scope() as strategy:
    var = tf.Variable([1.0, 2.0])
    inner_opt = gradient_descent.SGD(1.0)
    opt = loss_scale_optimizer.LossScaleOptimizer(
        inner_opt, initial_scale=2, dynamic_growth_steps=1)

    def finite_loss():
      return var * 2.0 / strategy.num_replicas_in_sync

    def nan_loss():
      return var * float('NaN')

    # Finite gradients.
    run_op = strategy.experimental_run(
        lambda: opt.minimize(finite_loss, var_list=[var]))
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self._run_if_in_graph_mode(run_op)
    # The gradient is 2, so 2 is subtracted from the variable.
    self.assertAllClose([-1.0, 0.0], self.evaluate(var))
    # The loss scale has doubled from 2 to 4.
    self.assertEqual(4., self.evaluate(opt.loss_scale))

    # NaN gradients.
    run_op = strategy.experimental_run(
        lambda: opt.minimize(nan_loss, var_list=[var]))
    self._run_if_in_graph_mode(run_op)
    # The variable must keep its previous value because of the NaN gradients.
    self.assertAllClose(self.evaluate(var), [-1.0, 0.0])
    # The loss scale should halve because of the NaN gradients.
    self.assertEqual(2., self.evaluate(opt.loss_scale))
def testNanOnOneReplicaOnly(self):
  """A NaN gradient on a single replica must cancel the whole update."""
  if not tf.test.is_gpu_available():
    self.skipTest('Test requires GPU')
  uses_v1_control_flow_in_graph = (
      not tf.executing_eagerly() and
      not tf.compat.v1.control_flow_v2_enabled())
  if uses_v1_control_flow_in_graph:
    self.skipTest(
        'b/181283011: GradientTape does not work properly with '
        'V1 control flow, and opt.minimize uses GradientTape')

  with create_mirrored_strategy().scope() as strategy:
    var = tf.Variable([1.0, 2.0])
    inner_opt = gradient_descent.SGD(1.0)
    opt = loss_scale_optimizer.LossScaleOptimizer(
        inner_opt, initial_scale=2, dynamic_growth_steps=2)

    def loss():
      replica_context = tf.distribute.get_replica_context()
      rep_id = replica_context.replica_id_in_sync_group
      # Replica 0 produces finite values; any other replica produces a NaN in
      # the last element.
      return tf.compat.v1.cond(
          tf.constant(rep_id == 0),
          lambda: var * 2.,
          lambda: var * tf.constant([1., float('NaN')]))

    def run_fn():
      return opt.minimize(loss, var_list=[var])

    run_op = strategy.experimental_run(run_fn)
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self._run_if_in_graph_mode(run_op)
    # The variable must keep its initial value because of the NaN gradients.
    self.assertAllClose(self.evaluate(var), [1.0, 2.0])
    # The loss scale should halve because of the NaN gradients.
    self.assertEqual(1., self.evaluate(opt.loss_scale))
def testDynamicLossScaleDefaultValues(self):
  """Checks the defaults of a LossScaleOptimizer built with no extra args."""
  opt = loss_scale_optimizer.LossScaleOptimizer(gradient_descent.SGD())

  self.assertEqual(opt.initial_scale, 2**15)
  self.assertEqual(opt.dynamic_growth_steps, 2000)

  self.evaluate(tf.compat.v1.global_variables_initializer())
  current_scale = self.evaluate(opt.loss_scale)
  self.assertEqual(current_scale, 2**15)
def testDynamicLossScale(self, strategy_fn):
  """Checks gradients and variable updates as the dynamic loss scale grows.

  Args:
    strategy_fn: zero-argument callable returning the strategy under test.
  """
  strategy = strategy_fn()
  learning_rate = 2.
  num_replicas = strategy.num_replicas_in_sync
  expected_gradient = tf.Variable(learning_rate / num_replicas)
  with strategy.scope():
    var = tf.Variable([5.0])
    inner_opt = gradient_descent.SGD(learning_rate)
    opt = loss_scale_optimizer.LossScaleOptimizer(
        inner_opt, initial_scale=2, dynamic_growth_steps=1)

    # Constructor arguments are exposed with the expected values and types.
    self.assertEqual(opt.initial_scale, 2.)
    self.assertIsInstance(opt.initial_scale, float)
    self.assertEqual(opt.dynamic_growth_steps, 1)
    self.assertIsInstance(opt.dynamic_growth_steps, int)

    self.assertEqual(opt.initial_scale % strategy.num_replicas_in_sync, 0)

    run_fn = self._run_fn_with_grad_check(strategy, var, opt,
                                          expected_gradient)
    run_op = strategy.experimental_run(run_fn)
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self._run_if_in_graph_mode(run_op)
    # The loss is the identity of the variable, so the gradient is 1 and the
    # variable becomes init_val - grad * lr == 5 - 1 * 2 == 3.
    self.assertAllClose([3.], self.evaluate(var))

    # The loss scale will have doubled, so the expected gradient doubles too.
    doubled_gradient = 2 * learning_rate / strategy.num_replicas_in_sync
    self.evaluate(expected_gradient.assign(doubled_gradient))
    run_op = strategy.experimental_run(run_fn)
    self._run_if_in_graph_mode(run_op)
    # As before, 2 is subtracted from the variable, making its new value 1.
    self.assertAllClose([1.], self.evaluate(var))
def testDynamicMustBeBool(self):
  """Passing a non-bool as the second positional argument raises TypeError."""
  expected_message = (
      '"dynamic" argument to LossScaleOptimizer.__init__ must be '
      "a bool, but got: 'dynamic'")
  inner_opt = gradient_descent.SGD()
  with self.assertRaisesRegex(TypeError, expected_message):
    loss_scale_optimizer.LossScaleOptimizer(inner_opt, 'dynamic')
def test_gradient(self, strategy_fn):
  """Checks that the optimizer update is applied to the float32 variable.

  Args:
    strategy_fn: zero-argument callable returning the strategy under test.
  """
  x = tf.constant([1.])
  with strategy_fn().scope() as strategy:
    with policy.policy_scope('mixed_float16'):
      layer = mp_test_util.MultiplyLayer(assert_type=tf.float16)
      # The learning rate is small enough that applying it to a float16
      # variable would leave that variable unchanged. A changed value
      # therefore shows the update hit the float32 variable instead.
      opt = gradient_descent.SGD(2**-14)

      def run_fn():
        with tf.GradientTape() as tape:
          y = layer(x)
          # The effective total loss is the sum of the per-replica losses, so
          # each replica's contribution is divided by the replica count.
          y /= strategy.num_replicas_in_sync
        grad = tape.gradient(y, layer.v)
        return opt.apply_gradients([(grad, layer.v)])

      op = strategy.experimental_run(run_fn)
      if not tf.executing_eagerly():
        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.evaluate(op)
      # The gradient with respect to the variable is 1. With an initial value
      # of 1 and a learning rate of 2**-14 the new value should be
      # init_val - gradient * learning_rate, i.e. 1 - 1 * 2**-14.
      updated_value = self.evaluate(layer.v)
      self.assertEqual(updated_value, 1 - 2**-14)
def testSerializationWithBuiltInOptimizer(self, use_v1):
  """Round-trips a loss-scale optimizer through serialize/deserialize.

  Args:
    use_v1: if true, the optimizer that gets serialized is a
      LossScaleOptimizerV1 wrapping a DynamicLossScale; otherwise it is a
      LossScaleOptimizer.
  """
  opt = gradient_descent.SGD(2., momentum=0.5)
  if use_v1:
    loss_scale = tf.mixed_precision.experimental.DynamicLossScale(
        initial_loss_scale=2., increment_period=3.)
    opt = loss_scale_optimizer.LossScaleOptimizerV1(opt, loss_scale)
  else:
    opt = loss_scale_optimizer.LossScaleOptimizer(
        opt, initial_scale=2., dynamic_growth_steps=3.)
  config = optimizers.serialize(opt)
  opt = optimizers.deserialize(config)
  # Force hyperparameters to be created
  opt.lr  # pylint: disable=pointless-statement
  self.evaluate(tf.compat.v1.global_variables_initializer())

  self.assertEqual(self.evaluate(opt.lr), 2.)
  self.assertEqual(self.evaluate(opt.inner_optimizer.momentum), 0.5)
  self.assertEqual(self.evaluate(opt.loss_scale), 2.)
  self.assertEqual(opt.dynamic_growth_steps, 3.)
  # The original call passed a stray `4.` here, which assertTrue treats as
  # the failure message rather than as a value to compare against.
  self.assertTrue(opt.dynamic)

  # Deserializing a LossScaleOptimizer always results in a V2
  # LossScaleOptimizer, even if serialized with a LossScaleOptimizerV1.
  self.assertIs(type(opt), loss_scale_optimizer.LossScaleOptimizer)

  # Ensure the optimizer can be used
  var = tf.Variable([5.0])
  run_op = self._run_fn_with_grad_check(tf.distribute.get_strategy(), var,
                                        opt, 2)()
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self._run_if_in_graph_mode(run_op)
  self.assertEqual(self.evaluate(var), [3.])
  self.assertEqual(self.evaluate(opt.dynamic_counter), 1)
def test_save_slot_variables_with_autocast_vars(self, strategy_fn,
                                                var_name='v'):
  """Saves and restores weights and checks the momentum slot round-trips.

  Args:
    strategy_fn: zero-argument callable returning the strategy under test.
    var_name: name given to the layer's variable.
  """
  mixed_policy = policy.Policy('mixed_float16')
  with strategy_fn().scope(), policy.policy_scope(mixed_policy):
    x = layers.Input(shape=(2,), batch_size=2)
    # A var_name other than 'v' guards against a fixed bug (b/134713714)
    # coming back: saving a checkpoint crashed when an AutoCastVariable with
    # a slot variable had a name different from the layer attribute's name
    # (layer.v in this case).
    layer = mp_test_util.MultiplyLayer(
        assert_type=tf.float16, var_name=var_name)
    y = layer(x)
    model = models.Model(inputs=x, outputs=y)
    inner_opt = gradient_descent.SGD(1., 1.)
    opt = loss_scale_optimizer.LossScaleOptimizer(
        inner_opt, dynamic=False, initial_scale=1)
    model.compile(
        optimizer=opt,
        loss='mse',
        run_eagerly=testing_utils.should_run_eagerly())

  def train_one_round():
    model.fit(np.ones((2, 2)), np.zeros((2, 2)), batch_size=2)

  def momentum_slot_value():
    return backend.get_value(opt.get_slot(layer.v, 'momentum'))

  train_one_round()
  weights_file = os.path.join(self.get_temp_dir(), 'weights')
  model.save_weights(weights_file)
  saved_slot = momentum_slot_value()

  # Further training must move the slot away from the saved value.
  train_one_round()
  new_slot = momentum_slot_value()
  self.assertNotEqual(new_slot, saved_slot)

  # Loading the checkpoint must bring the saved slot value back.
  model.load_weights(weights_file)
  restored_slot = momentum_slot_value()
  self.assertEqual(restored_slot, saved_slot)
def get_model(self, initial_weights=None, distribution=None,
              input_shapes=None):
  """Builds and compiles a small convolutional image classifier.

  Args:
    initial_weights: optional weights; when truthy they are loaded into the
      model with `set_weights` before compiling.
    distribution: passed to `MaybeDistributionScope` around model creation.
    input_shapes: unused; deleted immediately.

  Returns:
    The compiled `keras.Model`.
  """
  del input_shapes
  with keras_correctness_test_base.MaybeDistributionScope(distribution):
    image = keras.layers.Input(shape=(28, 28, 3), name='image')
    conv = keras.layers.Conv2D(
        name='conv1',
        filters=16,
        kernel_size=(3, 3),
        strides=(4, 4),
        kernel_regularizer=keras.regularizers.l2(1e-4))
    features = conv(image)

    batch_norm_mode = self.with_batch_norm
    if batch_norm_mode == 'regular':
      features = keras.layers.BatchNormalization(name='bn1')(features)
    elif batch_norm_mode == 'sync':
      # Two batch norms in parallel, to verify all-reduce works OK.
      bn1 = keras.layers.SyncBatchNormalization(name='bn1')(features)
      bn2 = keras.layers.SyncBatchNormalization(name='bn2')(features)
      features = keras.layers.Add()([bn1, bn2])

    features = keras.layers.MaxPooling2D(pool_size=(2, 2))(features)
    # The Dense layer is constructed before Flatten, as in the original
    # nested call expression.
    pred_layer = keras.layers.Dense(10, activation='softmax', name='pred')
    flattened = keras.layers.Flatten()(features)
    logits = pred_layer(flattened)
    model = keras.Model(inputs=[image], outputs=[logits])

    if initial_weights:
      model.set_weights(initial_weights)

    model.compile(
        optimizer=gradient_descent.SGD(learning_rate=0.1),
        loss='sparse_categorical_crossentropy',
        metrics=['sparse_categorical_accuracy'])

  return model
def testSparseBasicWithLearningRateDecay(self):
  """Applies sparse gradients twice with a decaying learning rate.

  The expected values use a step size of 3.0 for the first update and 2.0
  for the second.
  """
  # TODO(tanzheny, omalleyt): Fix test in eager mode.
  with tf.Graph().as_default():
    for dtype in [tf.half, tf.float32, tf.float64]:
      var0 = tf.Variable([[1.0], [2.0]], dtype=dtype)
      var1 = tf.Variable([[3.0], [4.0]], dtype=dtype)
      # grads0 touches row 0 of var0; grads1 touches row 1 of var1.
      grads0 = tf.IndexedSlices(
          tf.constant([0.1], shape=[1, 1], dtype=dtype),
          tf.constant([0]),
          tf.constant([2, 1]))
      grads1 = tf.IndexedSlices(
          tf.constant([0.01], shape=[1, 1], dtype=dtype),
          tf.constant([1]),
          tf.constant([2, 1]))
      optimizer = gradient_descent.SGD(3.0, decay=0.5)
      sgd_op = optimizer.apply_gradients(zip([grads0, grads1], [var0, var1]))
      self.evaluate(tf.compat.v1.global_variables_initializer())

      # First step of sgd.
      self.evaluate(sgd_op)
      self.assertAllCloseAccordingToType(
          [[1.0 - 3.0 * 0.1], [2.0]], self.evaluate(var0))
      self.assertAllCloseAccordingToType(
          [[3.0], [4.0 - 3.0 * 0.01]], self.evaluate(var1))

      # Second step of sgd.
      self.evaluate(sgd_op)
      self.assertAllCloseAccordingToType(
          [[1.0 - 3.0 * 0.1 - 2.0 * 0.1], [2.0]], self.evaluate(var0))
      self.assertAllCloseAccordingToType(
          [[3.0], [4.0 - 3.0 * 0.01 - 2.0 * 0.01]], self.evaluate(var1))
def testBasicWithLearningRateInverseTimeDecaySerializeAndDeserialize(self):
  """Runs the shared decay check on an SGD rebuilt from its own config.

  The optimizer is created with an InverseTimeDecay schedule and then
  round-tripped through `get_config` / `from_config` before use.
  """
  for dtype in [tf.half, tf.float32, tf.float64]:
    schedule = learning_rate_schedule.InverseTimeDecay(
        3.0, decay_steps=1.0, decay_rate=0.5)
    original = gradient_descent.SGD(learning_rate=schedule)
    restored = gradient_descent.SGD.from_config(original.get_config())
    self._test_basic_sgd_with_learning_rate_decay(restored, dtype)
def testConfig(self):
  """Checks SGD hyperparameters survive `get_config` / `from_config`.

  The round trip is exercised twice: once on a freshly built optimizer and
  once after `minimize` has been called on it.
  """
  hyper_names = ("momentum", "decay")

  def assert_same_hypers(other):
    for hyper in hyper_names:
      self.assertAllClose(
          self.evaluate(opt._get_hyper(hyper)),
          self.evaluate(other._get_hyper(hyper)))

  opt = gradient_descent.SGD(learning_rate=1.0, momentum=0.9, nesterov=True)
  opt2 = gradient_descent.SGD.from_config(opt.get_config())
  lr = opt.lr
  lr2 = opt2.lr
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllClose(self.evaluate(lr), self.evaluate(lr2))
  assert_same_hypers(opt2)

  var0 = tf.Variable([[1.0], [2.0]], dtype=tf.float32)

  def loss():
    return 3 * var0

  # learning rate variable created when calling minimize.
  opt.minimize(loss, [var0])
  self.evaluate(tf.compat.v1.global_variables_initializer())
  opt3 = gradient_descent.SGD.from_config(opt.get_config())
  lr3 = opt3.lr
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllClose(self.evaluate(lr), self.evaluate(lr3))
  assert_same_hypers(opt3)
  self.assertTrue(opt3.nesterov)
def test_wide_deep_model_with_two_feature_columns(self):
  """Fits a WideDeepModel whose branches read the same categorical feature.

  The linear branch consumes an indicator column and the DNN branch an
  embedding column, both derived from the 'symbol' vocabulary column. The
  test only checks that `fit` runs; it asserts nothing about the result.
  """
  vocab_list = ['alpha', 'beta', 'gamma']
  vocab_val = [0.4, 0.6, 0.9]
  data = np.random.choice(vocab_list, size=256)
  y = np.zeros_like(data, dtype=np.float32)
  # Each label is its token's base value plus a little uniform noise.
  for token, base_value in zip(vocab_list, vocab_val):
    matches = np.where(data == token)
    noise = np.random.uniform(low=-0.01, high=0.01, size=matches[0].shape)
    y[matches] = base_value + noise

  cat_column = tf.feature_column.categorical_column_with_vocabulary_list(
      key='symbol', vocabulary_list=vocab_list)
  ind_column = tf.feature_column.indicator_column(cat_column)
  emb_column = tf.feature_column.embedding_column(cat_column, dimension=5)

  # Wide (linear) branch.
  linear_feature_layer = dense_features_v2.DenseFeatures([ind_column])
  linear_model = linear.LinearModel(
      use_bias=False, kernel_initializer='zeros')
  combined_linear = sequential.Sequential(
      [linear_feature_layer, linear_model])

  # Deep (DNN) branch.
  dnn_model = sequential.Sequential([core.Dense(units=1)])
  dnn_feature_layer = dense_features_v2.DenseFeatures([emb_column])
  combined_dnn = sequential.Sequential([dnn_feature_layer, dnn_model])

  wide_deep_model = wide_deep.WideDeepModel(combined_linear, combined_dnn)
  wide_deep_model.compile(
      gradient_descent.SGD(learning_rate=0.1),
      'mse', [],
      run_eagerly=testing_utils.should_run_eagerly())
  wide_deep_model.fit(x={'symbol': data}, y=y, batch_size=32, epochs=10)
def testBasicWithLearningRateDecay(self):
  """Runs the shared decay check on SGD built with the `decay` argument."""
  for dtype in [tf.half, tf.float32, tf.float64]:
    optimizer = gradient_descent.SGD(learning_rate=3.0, decay=0.5)
    self._test_basic_sgd_with_learning_rate_decay(optimizer, dtype)
def test_custom_aggregation(self, distribution,
                            experimental_aggregate_gradients, expected):
  """Applies hand-built per-replica gradients and checks the local results.

  This variant wraps the gradients in a locally defined `PerReplica`
  subclass of `values.DistributedValues`.

  Args:
    distribution: the distribution strategy under test.
    experimental_aggregate_gradients: forwarded unchanged to
      `optimizer.apply_gradients`.
    expected: the values `optimize()` is expected to return.
  """
  with distribution.scope():
    v = tf.Variable([0., 0.])
    optimizer = gradient_descent.SGD(0.1)

  class PerReplica(values.DistributedValues):
    """Holds a map from replica to unsynchronized values."""

    @property
    def values(self):
      """Returns the per replica values."""
      return self._values

  @tf.function
  def optimize():
    worker_devices = distribution.extended.worker_devices
    # Place one gradient tensor on each of the first two worker devices.
    with tf.compat.v1.device(worker_devices[0]):
      first_grad = tf.convert_to_tensor([1., 1.])
    with tf.compat.v1.device(worker_devices[1]):
      second_grad = tf.convert_to_tensor([2., 2.])
    per_replica_grads = PerReplica([first_grad, second_grad])

    def step_fn(grads):
      optimizer.apply_gradients(
          [(grads, v)],
          experimental_aggregate_gradients=experimental_aggregate_gradients)
      return v.read_value()

    step_result = distribution.run(step_fn, args=(per_replica_grads,))
    return distribution.experimental_local_results(step_result)

  self.assertAllClose(optimize(), expected)
def test_model_with_fixed_input_dim(self):
  """Ensure that the batch_dim is removed when saving.

  When serving or retraining, it is important to reset the batch dim.
  This can be an issue inside of tf.function. See b/132783590 for context.
  """
  model = testing_utils.get_small_mlp(10, 3, 5)
  loss_object = keras.losses.MeanSquaredError()
  optimizer = gradient_descent.SGD()

  @tf.function
  def train_step(data, labels):
    with tf.GradientTape() as tape:
      loss = loss_object(labels, model(data))
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

  # Call the model once through tf.function with a concrete batch of 8.
  features = np.random.random((8, 5))
  targets = np.random.random((8, 3))
  train_step(features, targets)

  traced_call = saving_utils.trace_model_call(model)
  traced_shape = traced_call.input_signature[0].shape.as_list()
  self.assertEqual(traced_shape, tf.TensorShape([None, 5]).as_list())
def test_variable_run_argument(self, distribution):
  """Passes trainable variables to `distribution.run` as an argument.

  Args:
    distribution: the distribution strategy under test.
  """
  # Test that variables passed to run() remain variables. Previous behavior
  # in TPUStrategy was to cast to Tensor.
  with distribution.scope():
    optimizer = gradient_descent.SGD(0.1)
    net = core.Dense(1, trainable=True)

  dataset = (tf.data.Dataset.from_tensors([[1.]])
             .repeat()
             .batch(2, drop_remainder=True))

  def replica_step(trainable_variables, features):
    with tf.GradientTape() as tape:
      net_out = net(features[0], training=True)
      loss = (net_out - 1.0) * (net_out - 1.0)
    gradients = tape.gradient(loss, trainable_variables)
    optimizer.apply_gradients(zip(gradients, trainable_variables))
    return loss

  @tf.function
  def step(features):
    per_replica_losses = distribution.run(
        replica_step, (net.trainable_variables, features))
    return distribution.reduce(
        tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)

  first_batch = next(iter(dataset))
  step(first_batch)
def createTestModel(self, compile_model):
  """Returns a one-layer Sequential model, compiled only when requested.

  Args:
    compile_model: when truthy, the model is compiled with SGD, 'mse' loss
      and a CategoricalAccuracy metric before being returned.

  Returns:
    The `keras.Sequential` model.
  """
  model = keras.Sequential([keras.layers.Dense(10)])
  if not compile_model:
    return model
  model.compile(
      gradient_descent.SGD(),
      loss='mse',
      metrics=keras.metrics.CategoricalAccuracy())
  return model
def test_wide_deep_model(self, distribution, use_dataset_creator, data_fn):
  """Trains a WideDeepModel under `distribution` and checks the final loss.

  Args:
    distribution: the distribution strategy under test.
    use_dataset_creator: whether to feed `fit` with a DatasetCreator.
    data_fn: 'numpy' to train on numpy arrays; any other value trains on the
      dataset returned by `get_dataset()`. Only read when no DatasetCreator
      is used.
  """
  is_parameter_server = isinstance(
      distribution, tf.distribute.experimental.ParameterServerStrategy)
  if is_parameter_server and not use_dataset_creator:
    self.skipTest(
        'Parameter Server strategy requires dataset creator to be used in '
        'model.fit.')
  if (is_parameter_server and use_dataset_creator and
      not tf.__internal__.tf2.enabled()):
    self.skipTest(
        'Parameter Server strategy with dataset creator needs to be run when '
        'eager execution is enabled.')

  num_epochs = 5
  with distribution.scope():
    linear_model = linear.LinearModel(units=1)
    dnn_model = sequential.Sequential([core.Dense(units=1)])
    wide_deep_model = wide_deep.WideDeepModel(linear_model, dnn_model)
    # One optimizer per branch: linear first, DNN second.
    linear_opt = gradient_descent.SGD(learning_rate=0.05)
    dnn_opt = adagrad.Adagrad(learning_rate=0.1)
    wide_deep_model.compile(optimizer=[linear_opt, dnn_opt], loss='mse')

    if use_dataset_creator:
      creator = dataset_creator.DatasetCreator(dataset_fn)
      hist = wide_deep_model.fit(
          creator, epochs=num_epochs, steps_per_epoch=INPUT_SIZE)
    else:
      if data_fn == 'numpy':
        inputs, output = get_numpy()
        hist = wide_deep_model.fit(inputs, output, epochs=num_epochs)
      else:
        hist = wide_deep_model.fit(get_dataset(), epochs=num_epochs)
      # NOTE(review): nesting was reconstructed from a collapsed source line;
      # the loss check is placed on the non-DatasetCreator path -- confirm.
      self.assertLess(hist.history['loss'][4], 0.2)