def testJitCompile(self, strategy):
  # Test that the optimizer yields the same numerical results with
  # jit_compile on and off.
  with strategy.scope():
    optimizer_1 = adam_new.Adam(
        ema_option=optimizer_lib.EMAOption(
            use_ema=True, ema_overwrite_frequency=1))
    optimizer_2 = adam_new.Adam(
        jit_compile=True,
        ema_option=optimizer_lib.EMAOption(
            use_ema=True, ema_overwrite_frequency=1))
    model_1 = keras.Sequential([
        keras.layers.Input(shape=(2,)),
        keras.layers.Dense(5),
        keras.layers.Dense(1)
    ])
    model_2 = keras.models.clone_model(model_1)
    model_2.set_weights(model_1.get_weights())

    def per_worker_dataset_fn():

      def dataset_fn(_):
        x = np.random.rand(6, 2)
        y = [1, 1, 1, 0, 0, 0]
        ds = tf.data.Dataset.from_tensor_slices((x, y))
        ds = ds.repeat().batch(6)
        return ds

      return strategy.distribute_datasets_from_function(dataset_fn)

    ds = per_worker_dataset_fn()

    @tf.function
    def train_step(ds):

      def replica_fn(data):
        features, labels = data
        with tf.GradientTape() as tape:
          output_1 = model_1(features)
          loss_1 = keras.losses.MeanSquaredError(
              reduction=losses_utils.ReductionV2.NONE)(labels, output_1)
        grads_1 = tape.gradient(loss_1, model_1.trainable_variables)
        optimizer_1.apply_gradients(zip(grads_1, model_1.trainable_variables))

        with tf.GradientTape() as tape:
          output_2 = model_2(features)
          loss_2 = keras.losses.MeanSquaredError(
              reduction=losses_utils.ReductionV2.NONE)(labels, output_2)
        grads_2 = tape.gradient(loss_2, model_2.trainable_variables)
        optimizer_2.apply_gradients(zip(grads_2, model_2.trainable_variables))

      strategy.run(replica_fn, args=(next(iter(ds)),))

    for _ in range(3):
      train_step(ds)
    self.assertAllClose(model_1.trainable_variables[0][0],
                        model_2.trainable_variables[0][0])
def testCheckpointOptimizer(self):
  x = tf.Variable([[1.0, 2.0], [3.0, 4.0]], dtype=tf.float32)
  lr_schedule = learning_rate_schedule.ExponentialDecay(
      initial_learning_rate=1e-2, decay_steps=10000, decay_rate=0.9)
  optimizer_1 = adam_new.Adam(
      learning_rate=lr_schedule, beta_1=0.8, beta_2=0.888)
  grads = tf.convert_to_tensor([[1.0, 2.0], [3.0, 4.0]])
  for _ in range(1):
    optimizer_1.apply_gradients(zip([grads], [x]))

  # Then save the variable and optimizer to a checkpoint.
  checkpoint_1 = tf.train.Checkpoint(var=x, optimizer=optimizer_1)
  checkpoint_path = checkpoint_1.save(self.get_temp_dir())

  # Create a new optimizer and call restore on it (and x).
  x2 = tf.Variable([[0., 0.], [0., 0.]], dtype=x.dtype)
  optimizer_2 = adam_new.Adam(learning_rate=0.02, beta_1=0.7, beta_2=0.777)
  optimizer_2.build([x2])
  checkpoint_2 = tf.train.Checkpoint(var=x2, optimizer=optimizer_2)
  checkpoint_2.restore(checkpoint_path)

  self.assertTrue(
      (self.evaluate(optimizer_1._momentums._storage[0]) == self.evaluate(
          optimizer_2._momentums._storage[0])).all())
  self.assertEqual(
      self.evaluate(optimizer_1._iterations),
      self.evaluate(optimizer_2._iterations))
def testSetLearningRate(self):
  optimizer = adam_new.Adam(learning_rate=1.0)
  self.assertIsInstance(optimizer._learning_rate, tf.Variable)
  self.assertEqual(self.evaluate(optimizer.learning_rate), 1.0)
  optimizer.learning_rate = 2.0
  self.assertEqual(self.evaluate(optimizer.learning_rate), 2.0)
  # Test the legacy setter.
  optimizer.lr = 3.0
  self.assertEqual(self.evaluate(optimizer.learning_rate), 3.0)

  lr_schedule = learning_rate_schedule.ExponentialDecay(
      initial_learning_rate=1e-2, decay_steps=10000, decay_rate=0.9)
  optimizer = adam_new.Adam(learning_rate=lr_schedule)
  self.assertIsInstance(optimizer._learning_rate,
                        learning_rate_schedule.ExponentialDecay)
  self.assertEqual(optimizer.learning_rate, 0.01)
  # Test the legacy property.
  self.assertEqual(optimizer.lr, 0.01)

  x = tf.Variable([1.0, 2.0], dtype=tf.float32)
  grads = tf.convert_to_tensor([1.0, 2.0])
  for _ in range(2):
    optimizer.apply_gradients(zip([grads], [x]))
  self.assertTrue(optimizer.learning_rate < 0.01 and
                  optimizer.learning_rate > 0.00999)

  with self.assertRaisesRegex(TypeError,
                              "This optimizer was created with*"):
    optimizer.learning_rate = 2.0
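# The bounds asserted above can be checked by hand from ExponentialDecay's
# documented schedule, lr(step) = initial_learning_rate *
# decay_rate**(step / decay_steps). A quick standalone check of the value
# after the two apply_gradients calls (a sketch of the formula, not the
# schedule's implementation):
initial_lr, decay_rate, decay_steps = 1e-2, 0.9, 10000
lr_after_two_steps = initial_lr * decay_rate**(2 / decay_steps)
assert 0.00999 < lr_after_two_steps < 0.01  # ~0.0099998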
def testClipGlobalNorm(self):
  optimizer = adam_new.Adam(global_clipnorm=1)
  grad = [
      tf.cast([100.0, 100.0], dtype=tf.float32),
      tf.cast([100.0, 100.0], dtype=tf.float32)
  ]
  clipped_grad = optimizer._clip_gradients(grad)
  self.assertAllClose(clipped_grad[0], [0.5, 0.5])
  with self.assertRaisesRegex(ValueError, "At most one of*"):
    _ = adam_new.Adam(
        learning_rate=1, epsilon=0, global_clipnorm=1, clipnorm=1)
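# The 0.5 expected above can be verified by hand: with global_clipnorm=1,
# gradients whose combined L2 norm exceeds the cap are rescaled by
# clip_norm / global_norm (the documented semantics of
# tf.clip_by_global_norm). A minimal NumPy sketch of that math, not the
# optimizer's internals:
import numpy as np

grads = [np.array([100.0, 100.0]), np.array([100.0, 100.0])]
global_norm = np.sqrt(sum(np.sum(g**2) for g in grads))  # sqrt(4 * 100**2) = 200
clipped = [g * 1.0 / global_norm for g in grads]  # 100 / 200 = 0.5 per entry
assert np.allclose(clipped[0], [0.5, 0.5])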
def testGetAndFromConfig(self):
  optimizer = adam_new.Adam(
      learning_rate=np.float64(0.05),
      beta_1=0.7,
      beta_2=0.77,
      amsgrad=True,
      epsilon=0.001,
      clipnorm=0.5,
      use_ema=True,
      ema_momentum=0.5,
      ema_overwrite_frequency=50)
  config = optimizer.get_config()
  self.assertDictEqual(
      config, {
          "learning_rate": np.float32(0.05),
          "beta_1": 0.7,
          "beta_2": 0.77,
          "epsilon": 0.001,
          "amsgrad": True,
          "clipnorm": 0.5,
          "global_clipnorm": None,
          "clipvalue": None,
          "use_ema": True,
          "ema_momentum": 0.5,
          "ema_overwrite_frequency": 50,
          "jit_compile": False,
      })
  restored_optimizer = adam_new.Adam.from_config(config)
  self.assertDictEqual(restored_optimizer.get_config(),
                       optimizer.get_config())
def testGetAndFromConfig(self):
  gradients_clip_option = optimizer_lib.GradientsClipOption(clipnorm=0.5)
  ema_option = optimizer_lib.EMAOption(
      use_ema=True, ema_momentum=0.5, ema_overwrite_frequency=50)
  optimizer = adam_new.Adam(
      learning_rate=np.float64(0.05),
      beta_1=0.7,
      beta_2=0.77,
      amsgrad=True,
      epsilon=0.001,
      gradients_clip_option=gradients_clip_option,
      ema_option=ema_option)
  config = optimizer.get_config()
  self.assertDictEqual(
      config, {
          "learning_rate": np.float32(0.05),
          "beta_1": 0.7,
          "beta_2": 0.77,
          "epsilon": 0.001,
          "amsgrad": True,
          "gradients_clip_option": {
              "clipnorm": 0.5,
              "global_clipnorm": None,
              "clipvalue": None,
          },
          "ema_option": {
              "use_ema": True,
              "ema_momentum": 0.5,
              "ema_overwrite_frequency": 50,
          },
      })
  restored_optimizer = adam_new.Adam.from_config(config)
  self.assertDictEqual(restored_optimizer.get_config(),
                       optimizer.get_config())
def testPassingLegacyArgsRaiseWarning(self):
  with self.assertLogs(level="WARNING") as log_output:
    logging.set_verbosity(logging.WARNING)
    _ = adam_new.Adam(clipnorm=1, decay=0.5)
    expected_log = "decay is deprecated in"
    output = log_output[0][0].message
    self.assertTrue(re.search(expected_log, output))
def testSetIterations(self):
  optimizer = adam_new.Adam()
  optimizer.iterations = tf.Variable(2, dtype=tf.int32)
  self.assertEqual(optimizer.iterations, 2)

  var_list = [tf.Variable(2.0), tf.Variable(2.0)]
  grads = tf.convert_to_tensor([1.0, 1.0])
  optimizer.apply_gradients(zip(grads, var_list))
  self.assertEqual(optimizer.iterations, 3)
  with self.assertRaisesRegex(RuntimeError, "Cannot set*"):
    optimizer.iterations = 2
def testReturnAllOptimizerVariables(self):
  x = tf.Variable([[1.0, 2.0], [3.0, 4.0]], dtype=tf.float32)
  optimizer = adam_new.Adam()
  grads = tf.convert_to_tensor([[1.0, 2.0], [3.0, 4.0]])
  optimizer.apply_gradients(zip([grads], [x]))
  optimizer_variables = optimizer.variables
  all_names = [var._shared_name for var in optimizer_variables]
  self.assertLen(optimizer_variables, 4)
  self.assertCountEqual(
      all_names,
      ["iteration", "learning_rate", "Adam/m/Variable", "Adam/v/Variable"])
def testMovingAverageOptimizer(self):
  # Set Polyak averaging with ema_momentum=1 so that the moving average
  # always keeps the original value of the variables.
  ema_option = optimizer_lib.EMAOption(
      use_ema=True, ema_momentum=1, ema_overwrite_frequency=2)
  optimizer = adam_new.Adam(ema_option=ema_option)
  x = tf.Variable([1.0, 2.0], dtype=tf.float32)
  x_origin = tf.Variable(x)
  grads = tf.convert_to_tensor([1.0, 2.0])

  # First iteration: store the moving average, do not overwrite model vars.
  optimizer.apply_gradients(zip([grads], [x]))
  self.assertAllEqual(optimizer._model_variables_moving_average[0], x_origin)
  self.assertNotAllEqual(x, x_origin)

  # Second iteration: store the moving average and overwrite model vars.
  optimizer.apply_gradients(zip([grads], [x]))
  self.assertAllEqual(x, x_origin)
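# The overwrite logic above relies on the standard EMA recurrence. A minimal
# sketch of the assumed update rule (`ema_momentum` mirrors the EMAOption
# field; this is not the optimizer's implementation):
def ema_update(average, var, ema_momentum):
  # average <- average * momentum + var * (1 - momentum)
  return average * ema_momentum + var * (1 - ema_momentum)

# With ema_momentum=1 the second term vanishes, so the average stays frozen
# at its initial value, which is what testMovingAverageOptimizer asserts.
assert ema_update(1.0, 5.0, ema_momentum=1.0) == 1.0
assert ema_update(1.0, 5.0, ema_momentum=0.5) == 3.0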
def testGetConfig(self):
  optimizer = adam_new.Adam(
      learning_rate=np.float64(0.05),
      beta_1=0.7,
      beta_2=0.77,
      amsgrad=True,
      epsilon=0.001)
  config = optimizer.get_config()
  self.assertDictEqual(
      config, {
          "learning_rate": np.float32(0.05),
          "beta_1": 0.7,
          "beta_2": 0.77,
          "epsilon": 0.001,
          "amsgrad": True,
          "clipnorm": None,
          "global_clipnorm": None,
          "clipvalue": None,
      })
"RmsPropV1", lambda: tf.compat.v1.train.RMSPropOptimizer(0.001)) # TODO(shiningsun): consider adding the other v1 optimizers optimizers_v1 = [ gradient_descent_optimizer_v1_fn, adagrad_optimizer_v1_fn, ftrl_optimizer_v1_fn, rmsprop_optimizer_v1_fn ] adadelta_optimizer_keras_v2_fn = tf.__internal__.test.combinations.NamedObject( "AdadeltaKerasV2", lambda: adadelta_keras_v2.Adadelta(0.001)) adagrad_optimizer_keras_v2_fn = tf.__internal__.test.combinations.NamedObject( "AdagradKerasV2", lambda: adagrad_keras_v2.Adagrad(0.001)) adam_optimizer_keras_v2_fn = tf.__internal__.test.combinations.NamedObject( "AdamKerasV2", lambda: adam_keras_v2.Adam(0.001, epsilon=1.0)) adam_experimental_fn = tf.__internal__.test.combinations.NamedObject( "AdamExperimental", lambda: adam_experimental.Adam(0.001)) adamax_optimizer_keras_v2_fn = tf.__internal__.test.combinations.NamedObject( "AdamaxKerasV2", lambda: adamax_keras_v2.Adamax(0.001, epsilon=1.0)) nadam_optimizer_keras_v2_fn = tf.__internal__.test.combinations.NamedObject( "NadamKerasV2", lambda: nadam_keras_v2.Nadam(0.001, epsilon=1.0)) ftrl_optimizer_keras_v2_fn = tf.__internal__.test.combinations.NamedObject( "FtrlKerasV2", lambda: ftrl_keras_v2.Ftrl(0.001)) gradient_descent_optimizer_keras_v2_fn = tf.__internal__.test.combinations.NamedObject( "GradientDescentKerasV2", lambda: gradient_descent_keras_v2.SGD(0.001)) rmsprop_optimizer_keras_v2_fn = tf.__internal__.test.combinations.NamedObject( "RmsPropKerasV2", lambda: rmsprop_keras_v2.RMSprop(0.001)) # TODO(shiningsun): consider adding the other v2 optimizers optimizers_v2 = [ gradient_descent_optimizer_keras_v2_fn, adagrad_optimizer_keras_v2_fn ]
def testClipValue(self):
  gradients_clip_option = optimizer_lib.GradientsClipOption(clipvalue=1)
  optimizer = adam_new.Adam(gradients_clip_option=gradients_clip_option)
  grad = [tf.convert_to_tensor([100.0, 100.0])]
  clipped_grad = optimizer._clip_gradients(grad)
  self.assertAllEqual(clipped_grad[0], [1.0, 1.0])
def testClipValue(self):
  optimizer = adam_new.Adam(clipvalue=1)
  grad = [tf.convert_to_tensor([100.0, 100.0])]
  clipped_grad = optimizer._clip_gradients(grad)
  self.assertAllEqual(clipped_grad[0], [1.0, 1.0])
def testClipNorm(self):
  optimizer = adam_new.Adam(clipnorm=1)
  grad = [tf.convert_to_tensor([100.0, 100.0])]
  clipped_grad = optimizer._clip_gradients(grad)
  self.assertAllClose(clipped_grad[0], [2**0.5 / 2, 2**0.5 / 2])
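# The expected value follows from per-tensor norm clipping (the documented
# semantics of tf.clip_by_norm): a gradient whose L2 norm exceeds clipnorm is
# rescaled to have norm exactly clipnorm. A minimal NumPy sketch of that
# math, not the optimizer's internals:
import numpy as np

g = np.array([100.0, 100.0])
norm = np.linalg.norm(g)  # 100 * sqrt(2)
clipped = g * 1.0 / norm  # clipnorm=1, so each entry becomes 1/sqrt(2)
assert np.allclose(clipped, [2**0.5 / 2, 2**0.5 / 2])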
def testBuildIndexDict(self):
  optimizer = adam_new.Adam()
  var_list = [tf.Variable(0, name=f"var{i}") for i in range(10)]
  optimizer._build_index_dict(var_list)
  self.assertEqual(
      optimizer._index_dict[optimizer._var_key(var_list[7])], 7)
def testAddVariableFromReference(self):
  optimizer = adam_new.Adam()
  variable = optimizer.add_variable_from_reference(
      tf.Variable(1.0, name="tmp"), "test")
  self.assertEqual(variable._shared_name, "test/tmp")
  self.assertEqual(self.evaluate(variable), 0)
def testAdam(self):
  self._compare_numerical(
      adam_old.Adam(amsgrad=True), adam_new.Adam(amsgrad=True))
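# For reference, the update rule the two implementations are expected to
# agree on is standard Adam (Kingma & Ba, 2014), here with the AMSGrad
# variant keeping a running max of the second-moment estimate. A minimal
# NumPy sketch of one common formulation (hyperparameter names mirror the
# constructor; this is neither implementation, just the reference math):
import numpy as np

def adam_step(var, grad, m, v, vhat, t, lr=0.001, beta_1=0.9, beta_2=0.999,
              epsilon=1e-7, amsgrad=True):
  m = beta_1 * m + (1 - beta_1) * grad          # first-moment estimate
  v = beta_2 * v + (1 - beta_2) * grad**2       # second-moment estimate
  m_hat = m / (1 - beta_1**t)                   # bias correction
  v_hat = v / (1 - beta_2**t)
  if amsgrad:
    vhat = np.maximum(vhat, v_hat)              # running max of v_hat
    v_hat = vhat
  var = var - lr * m_hat / (np.sqrt(v_hat) + epsilon)
  return var, m, v, vhat

# One step on a toy variable:
var, m, v, vhat = np.array([1.0]), np.zeros(1), np.zeros(1), np.zeros(1)
var, m, v, vhat = adam_step(var, np.array([0.5]), m, v, vhat, t=1)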
STRATEGIES = [
    ds_combinations.mirrored_strategy_with_two_gpus,
    ds_combinations.tpu_strategy,
    ds_combinations.cloud_tpu_strategy,
    ds_combinations.multi_worker_mirrored_2x1_cpu,
    ds_combinations.multi_worker_mirrored_2x2_gpu,
    ds_combinations.central_storage_strategy_with_two_gpus,
]

adadelta_new_fn = tf.__internal__.test.combinations.NamedObject(
    "experimentaladadelta",
    lambda: adadelta_new.Adadelta(  # pylint: disable=g-long-lambda
        0.002, use_ema=True, ema_overwrite_frequency=None))
adagrad_new_fn = tf.__internal__.test.combinations.NamedObject(
    "experimentaladagrad", lambda: adagrad_new.Adagrad(0.002))
adam_new_fn = tf.__internal__.test.combinations.NamedObject(
    "experimentaladam", lambda: adam_new.Adam(0.002))
rmsprop_new_fn = tf.__internal__.test.combinations.NamedObject(
    "experimentalrmsprop", lambda: rmsprop_new.RMSprop(0.002))
sgd_new_fn = tf.__internal__.test.combinations.NamedObject(
    "experimentalsgdaverage",
    lambda: sgd_new.SGD(  # pylint: disable=g-long-lambda
        0.002, use_ema=True, ema_overwrite_frequency=1))

OPTIMIZER_FN = [
    adadelta_new_fn,
    adagrad_new_fn,
    adam_new_fn,
    rmsprop_new_fn,
    sgd_new_fn,
]
def testPassingLegacyClipnorm(self):
  optimizer = adam_new.Adam(clipnorm=1)
  self.assertEqual(optimizer._clipnorm, 1)