Example #1
    def testJitCompile(self, strategy):
        # Test the optimizer yields same numerical results when jit_compile is
        # on and off.
        with strategy.scope():
            optimizer_1 = adam_new.Adam(ema_option=optimizer_lib.EMAOption(
                use_ema=True, ema_overwrite_frequency=1))
            optimizer_2 = adam_new.Adam(jit_compile=True,
                                        ema_option=optimizer_lib.EMAOption(
                                            use_ema=True,
                                            ema_overwrite_frequency=1))
            model_1 = keras.Sequential([
                keras.layers.Input(shape=(2, )),
                keras.layers.Dense(5),
                keras.layers.Dense(1)
            ])
            model_2 = keras.models.clone_model(model_1)
            model_2.set_weights(model_1.get_weights())

            def per_worker_dataset_fn():
                def dataset_fn(_):
                    x = np.random.rand(6, 2)
                    y = [1, 1, 1, 0, 0, 0]
                    ds = tf.data.Dataset.from_tensor_slices((x, y))
                    ds = ds.repeat().batch(6)
                    return ds

                return strategy.distribute_datasets_from_function(dataset_fn)

            ds = per_worker_dataset_fn()

            @tf.function
            def train_step(ds):
                def replica_fn(data):
                    features, labels = data
                    with tf.GradientTape() as tape:
                        output_1 = model_1(features)
                        loss_1 = keras.losses.MeanSquaredError(
                            reduction=losses_utils.ReductionV2.NONE)(labels,
                                                                     output_1)
                    grads_1 = tape.gradient(loss_1,
                                            model_1.trainable_variables)
                    optimizer_1.apply_gradients(
                        zip(grads_1, model_1.trainable_variables))

                    with tf.GradientTape() as tape:
                        output_2 = model_2(features)
                        loss_2 = keras.losses.MeanSquaredError(
                            reduction=losses_utils.ReductionV2.NONE)(labels,
                                                                     output_2)
                    grads_2 = tape.gradient(loss_2,
                                            model_2.trainable_variables)
                    optimizer_2.apply_gradients(
                        zip(grads_2, model_2.trainable_variables))

                strategy.run(replica_fn, args=(next(iter(ds)), ))

            for _ in range(3):
                train_step(ds)
                self.assertAllClose(model_1.trainable_variables[0][0],
                                    model_2.trainable_variables[0][0])
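Note: the same equivalence can be sketched outside a distribution strategy. The snippet below is a hypothetical illustration using the public tf.keras.optimizers.Adam (assuming a recent TF release where it exposes jit_compile, mirroring the experimental adam_new.Adam above); it is not part of the test suite.

import tensorflow as tf

x_plain = tf.Variable([1.0, 2.0])
x_xla = tf.Variable([1.0, 2.0])
grads = tf.constant([0.5, 0.5])

opt_plain = tf.keras.optimizers.Adam(learning_rate=0.1)
opt_xla = tf.keras.optimizers.Adam(learning_rate=0.1, jit_compile=True)

for _ in range(3):
    opt_plain.apply_gradients([(grads, x_plain)])
    opt_xla.apply_gradients([(grads, x_xla)])

# The two variables are expected to agree to numerical tolerance.
print(x_plain.numpy(), x_xla.numpy())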
Example #2
  def testCheckpointOptimizer(self):
    x = tf.Variable([[1.0, 2.0], [3.0, 4.0]], dtype=tf.float32)
    lr_schedule = learning_rate_schedule.ExponentialDecay(
        initial_learning_rate=1e-2, decay_steps=10000, decay_rate=0.9)
    optimizer_1 = adam_new.Adam(
        learning_rate=lr_schedule, beta_1=0.8, beta_2=0.888)
    grads = tf.convert_to_tensor([[1.0, 2.0], [3.0, 4.0]])

    for _ in range(1):
      optimizer_1.apply_gradients(zip([grads], [x]))

    # Then save the variable and optimizer to a checkpoint.
    checkpoint_1 = tf.train.Checkpoint(var=x, optimizer=optimizer_1)
    checkpoint_path = checkpoint_1.save(self.get_temp_dir())

    # Create a new optimizer and call restore on it (and x)
    x2 = tf.Variable([[0., 0.], [0., 0.]], dtype=x.dtype)
    optimizer_2 = adam_new.Adam(learning_rate=0.02, beta_1=0.7, beta_2=0.777)
    optimizer_2.build([x2])
    checkpoint_2 = tf.train.Checkpoint(var=x2, optimizer=optimizer_2)
    checkpoint_2.restore(checkpoint_path)

    self.assertTrue(
        (self.evaluate(optimizer_1._momentums._storage[0]) == self.evaluate(
            optimizer_2._momentums._storage[0])).all())
    self.assertEqual(
        self.evaluate(optimizer_1._iterations),
        self.evaluate(optimizer_2._iterations))
Example #3
  def testSetLearningRate(self):
    optimizer = adam_new.Adam(learning_rate=1.0)
    self.assertIsInstance(optimizer._learning_rate, tf.Variable)
    self.assertEqual(self.evaluate(optimizer.learning_rate), 1.0)
    optimizer.learning_rate = 2.0
    self.assertEqual(self.evaluate(optimizer.learning_rate), 2.0)
    # Test the legacy setter.
    optimizer.lr = 3.0
    self.assertEqual(self.evaluate(optimizer.learning_rate), 3.0)

    lr_schedule = learning_rate_schedule.ExponentialDecay(
        initial_learning_rate=1e-2, decay_steps=10000, decay_rate=0.9)
    optimizer = adam_new.Adam(learning_rate=lr_schedule)
    self.assertIsInstance(optimizer._learning_rate,
                          learning_rate_schedule.ExponentialDecay)
    self.assertEqual(optimizer.learning_rate, 0.01)
    # Test the legacy property.
    self.assertEqual(optimizer.lr, 0.01)

    x = tf.Variable([1.0, 2.0], dtype=tf.float32)
    grads = tf.convert_to_tensor([1.0, 2.0])
    for _ in range(2):
      optimizer.apply_gradients(zip([grads], [x]))
    self.assertTrue(optimizer.learning_rate < 0.01 and
                    optimizer.learning_rate > 0.00999)
    with self.assertRaisesRegex(TypeError, "This optimizer was created with*"):
      optimizer.learning_rate = 2.0
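Note: the bound asserted just above (the learning rate drops slightly below 0.01 after two steps) follows from ExponentialDecay's formula lr(step) = initial_learning_rate * decay_rate ** (step / decay_steps). A standalone sketch with the public schedules API, assumed equivalent to the learning_rate_schedule module imported by the test:

import tensorflow as tf

schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-2, decay_steps=10000, decay_rate=0.9)
# 0.01 * 0.9 ** (2 / 10000) ~= 0.0099998, i.e. just below 0.01.
print(schedule(2).numpy())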
Example #4
  def testClipGlobalNorm(self):
    optimizer = adam_new.Adam(global_clipnorm=1)
    grad = [tf.cast([100.0, 100.0], dtype=tf.float32),
            tf.cast([100.0, 100.0], dtype=tf.float32)]
    clipped_grad = optimizer._clip_gradients(grad)
    self.assertAllClose(clipped_grad[0], [0.5, 0.5])

    with self.assertRaisesRegex(ValueError, "At most one of*"):
      _ = adam_new.Adam(
          learning_rate=1, epsilon=0, global_clipnorm=1, clipnorm=1)
Example #5
  def testSetLearningRate(self):
    optimizer = adam_new.Adam(learning_rate=1.0)
    self.assertIsInstance(optimizer._learning_rate, tf.Variable)
    self.assertEqual(self.evaluate(optimizer.learning_rate), 1.0)
    optimizer.learning_rate = 2.0
    self.assertEqual(self.evaluate(optimizer.learning_rate), 2.0)

    lr_schedule = learning_rate_schedule.ExponentialDecay(
        initial_learning_rate=1e-2, decay_steps=10000, decay_rate=0.9)
    optimizer = adam_new.Adam(learning_rate=lr_schedule)
    self.assertIsInstance(optimizer._learning_rate,
                          learning_rate_schedule.ExponentialDecay)
    with self.assertRaisesRegex(TypeError, "This optimizer was created with*"):
      optimizer.learning_rate = 2.0
Example #6
 def testGetAndFromConfig(self):
   optimizer = adam_new.Adam(
       learning_rate=np.float64(0.05),
       beta_1=0.7,
       beta_2=0.77,
       amsgrad=True,
       epsilon=0.001,
       clipnorm=0.5,
       use_ema=True,
       ema_momentum=0.5,
       ema_overwrite_frequency=50)
   config = optimizer.get_config()
   self.assertDictEqual(
       config, {
           "learning_rate": np.float32(0.05),
           "beta_1": 0.7,
           "beta_2": 0.77,
           "epsilon": 0.001,
           "amsgrad": True,
           "clipnorm": 0.5,
           "global_clipnorm": None,
           "clipvalue": None,
           "use_ema": True,
           "ema_momentum": 0.5,
           "ema_overwrite_frequency": 50,
           "jit_compile": False,
       })
   restored_optimizer = adam_new.Adam.from_config(config)
   self.assertDictEqual(restored_optimizer.get_config(),
                        optimizer.get_config())
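Note: the same get_config / from_config round trip can be reproduced with the public Keras Adam; the sketch below is an assumption for illustration only, since the test above targets the experimental adam_new.Adam, whose config additionally carries the clipping, EMA, and jit_compile fields shown in the dict.

import tensorflow as tf

opt = tf.keras.optimizers.Adam(learning_rate=0.05, beta_1=0.7, beta_2=0.77,
                               epsilon=0.001, amsgrad=True)
restored = tf.keras.optimizers.Adam.from_config(opt.get_config())
print(restored.get_config()["beta_1"])  # 0.7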
Example #7
 def testGetAndFromConfig(self):
     gradients_clip_option = optimizer_lib.GradientsClipOption(clipnorm=0.5)
     ema_option = optimizer_lib.EMAOption(use_ema=True,
                                          ema_momentum=0.5,
                                          ema_overwrite_frequency=50)
     optimizer = adam_new.Adam(learning_rate=np.float64(0.05),
                               beta_1=0.7,
                               beta_2=0.77,
                               amsgrad=True,
                               epsilon=0.001,
                               gradients_clip_option=gradients_clip_option,
                               ema_option=ema_option)
     config = optimizer.get_config()
     self.assertDictEqual(
         config, {
             "learning_rate": np.float32(0.05),
             "beta_1": 0.7,
             "beta_2": 0.77,
             "epsilon": 0.001,
             "amsgrad": True,
             "gradients_clip_option": {
                 "clipnorm": 0.5,
                 "global_clipnorm": None,
                 "clipvalue": None,
             },
             "ema_option": {
                 "use_ema": True,
                 "ema_momentum": 0.5,
                 "ema_overwrite_frequency": 50,
             }
         })
     restored_optimizer = adam_new.Adam.from_config(config)
     self.assertDictEqual(restored_optimizer.get_config(),
                          optimizer.get_config())
Example #8
  def testPassingLegacyArgsRaiseWarning(self):
    with self.assertLogs(level="WARNING") as log_output:
      logging.set_verbosity(logging.WARNING)
      _ = adam_new.Adam(clipnorm=1, decay=0.5)
      expected_log = "decay is deprecated in"
      output = log_output[0][0].message

      self.assertTrue(re.search(expected_log, output))
Example #9
 def testClipGlobalNorm(self):
   optimizer = adam_new.Adam(global_clipnorm=1)
   grad = [
       tf.cast([100.0, 100.0], dtype=tf.float32),
       tf.cast([100.0, 100.0], dtype=tf.float32)
   ]
   clipped_grad = optimizer._clip_gradients(grad)
   self.assertAllClose(clipped_grad[0], [0.5, 0.5])
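Note: the expected [0.5, 0.5] can be reproduced with tf.clip_by_global_norm, which applies the same scaling a global_clipnorm setting is meant to perform: the global norm of two [100., 100.] tensors is sqrt(4 * 100**2) = 200, so each entry is multiplied by 1 / 200.

import tensorflow as tf

grads = [tf.constant([100.0, 100.0]), tf.constant([100.0, 100.0])]
clipped, global_norm = tf.clip_by_global_norm(grads, clip_norm=1.0)
print(global_norm.numpy())  # 200.0
print(clipped[0].numpy())   # [0.5 0.5]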
Example #10
 def testSetIterations(self):
   optimizer = adam_new.Adam()
   optimizer.iterations = tf.Variable(2, dtype=tf.int32)
   self.assertEqual(optimizer.iterations, 2)
   var_list = [tf.Variable(2.0), tf.Variable(2.0)]
   grads = tf.convert_to_tensor([1.0, 1.0])
   optimizer.apply_gradients(zip(grads, var_list))
   self.assertEqual(optimizer.iterations, 3)
   with self.assertRaisesRegex(RuntimeError, "Cannot set*"):
     optimizer.iterations = 2
Example #11
 def testReturnAllOptimizerVariables(self):
   x = tf.Variable([[1.0, 2.0], [3.0, 4.0]], dtype=tf.float32)
   optimizer = adam_new.Adam()
   grads = tf.convert_to_tensor([[1.0, 2.0], [3.0, 4.0]])
   optimizer.apply_gradients(zip([grads], [x]))
   optimizer_variables = optimizer.variables
   all_names = [var._shared_name for var in optimizer_variables]
   self.assertLen(optimizer_variables, 4)
   self.assertCountEqual(
       all_names,
       ["iteration", "learning_rate", "Adam/m/Variable", "Adam/v/Variable"])
Example #12
  def testMovingAverageOptimizer(self):
    # We set polyak averaging with ema_momentum = 1 so that the
    #  moving average is always the original value of the variables.
    ema_option = optimizer_lib.EMAOption(
        use_ema=True, ema_momentum=1, ema_overwrite_frequency=2)
    optimizer = adam_new.Adam(ema_option=ema_option)
    x = tf.Variable([1.0, 2.0], dtype=tf.float32)
    x_origin = tf.Variable(x)
    grads = tf.convert_to_tensor([1.0, 2.0])
    # First iteration, we store the moving average, and do not do overriding.
    optimizer.apply_gradients(zip([grads], [x]))
    self.assertAllEqual(optimizer._model_variables_moving_average[0], x_origin)
    self.assertNotAllEqual(x, x_origin)

    # Second iteration, we store the moving average, and override model vars.
    optimizer.apply_gradients(zip([grads], [x]))
    self.assertAllEqual(x, x_origin)
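Note: the behavior relied on here (ema_momentum = 1 freezes the moving average at the original variable values) follows from the usual exponential-moving-average update rule. A pure-Python sketch with a hypothetical helper name:

def ema_update(average, variable, ema_momentum):
    # The average moves toward the variable by a factor of (1 - ema_momentum).
    return ema_momentum * average + (1.0 - ema_momentum) * variable

print(ema_update(1.0, 5.0, ema_momentum=1.0))  # 1.0 -- the average never moves
print(ema_update(1.0, 5.0, ema_momentum=0.5))  # 3.0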
Example #13
 def testGetConfig(self):
     optimizer = adam_new.Adam(learning_rate=np.float64(0.05),
                               beta_1=0.7,
                               beta_2=0.77,
                               amsgrad=True,
                               epsilon=0.001)
     config = optimizer.get_config()
     self.assertDictEqual(
         config, {
             "learning_rate": np.float32(0.05),
             "beta_1": 0.7,
             "beta_2": 0.77,
             "epsilon": 0.001,
             "amsgrad": True,
             "clipnorm": None,
             "global_clipnorm": None,
             "clipvalue": None,
         })
    "RmsPropV1", lambda: tf.compat.v1.train.RMSPropOptimizer(0.001))

# TODO(shiningsun): consider adding the other v1 optimizers
optimizers_v1 = [
    gradient_descent_optimizer_v1_fn, adagrad_optimizer_v1_fn,
    ftrl_optimizer_v1_fn, rmsprop_optimizer_v1_fn
]

adadelta_optimizer_keras_v2_fn = tf.__internal__.test.combinations.NamedObject(
    "AdadeltaKerasV2", lambda: adadelta_keras_v2.Adadelta(0.001))
adagrad_optimizer_keras_v2_fn = tf.__internal__.test.combinations.NamedObject(
    "AdagradKerasV2", lambda: adagrad_keras_v2.Adagrad(0.001))
adam_optimizer_keras_v2_fn = tf.__internal__.test.combinations.NamedObject(
    "AdamKerasV2", lambda: adam_keras_v2.Adam(0.001, epsilon=1.0))
adam_experimental_fn = tf.__internal__.test.combinations.NamedObject(
    "AdamExperimental", lambda: adam_experimental.Adam(0.001))
adamax_optimizer_keras_v2_fn = tf.__internal__.test.combinations.NamedObject(
    "AdamaxKerasV2", lambda: adamax_keras_v2.Adamax(0.001, epsilon=1.0))
nadam_optimizer_keras_v2_fn = tf.__internal__.test.combinations.NamedObject(
    "NadamKerasV2", lambda: nadam_keras_v2.Nadam(0.001, epsilon=1.0))
ftrl_optimizer_keras_v2_fn = tf.__internal__.test.combinations.NamedObject(
    "FtrlKerasV2", lambda: ftrl_keras_v2.Ftrl(0.001))
gradient_descent_optimizer_keras_v2_fn = tf.__internal__.test.combinations.NamedObject(
    "GradientDescentKerasV2", lambda: gradient_descent_keras_v2.SGD(0.001))
rmsprop_optimizer_keras_v2_fn = tf.__internal__.test.combinations.NamedObject(
    "RmsPropKerasV2", lambda: rmsprop_keras_v2.RMSprop(0.001))

# TODO(shiningsun): consider adding the other v2 optimizers
optimizers_v2 = [
    gradient_descent_optimizer_keras_v2_fn, adagrad_optimizer_keras_v2_fn
]
Example #15
 def testClipValue(self):
     gradients_clip_option = optimizer_lib.GradientsClipOption(clipvalue=1)
     optimizer = adam_new.Adam(gradients_clip_option=gradients_clip_option)
     grad = [tf.convert_to_tensor([100.0, 100.0])]
     clipped_grad = optimizer._clip_gradients(grad)
     self.assertAllEqual(clipped_grad[0], [1.0, 1.0])
Example #16
 def testClipValue(self):
   optimizer = adam_new.Adam(clipvalue=1)
   grad = [tf.convert_to_tensor([100.0, 100.0])]
   clipped_grad = optimizer._clip_gradients(grad)
   self.assertAllEqual(clipped_grad[0], [1.0, 1.0])
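Note: clipvalue clips each gradient entry elementwise into [-clipvalue, clipvalue], so [100., 100.] becomes [1., 1.]. The same result can be reproduced directly with tf.clip_by_value:

import tensorflow as tf

g = tf.constant([100.0, 100.0])
print(tf.clip_by_value(g, clip_value_min=-1.0, clip_value_max=1.0).numpy())  # [1. 1.]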
Example #17
 def testClipNorm(self):
   optimizer = adam_new.Adam(clipnorm=1)
   grad = [tf.convert_to_tensor([100.0, 100.0])]
   clipped_grad = optimizer._clip_gradients(grad)
   self.assertAllClose(clipped_grad[0], [2**0.5 / 2, 2**0.5 / 2])
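Note: [2**0.5 / 2, 2**0.5 / 2] is just [100., 100.] rescaled to unit L2 norm (its norm is 100 * sqrt(2)). The same result can be reproduced with tf.clip_by_norm:

import tensorflow as tf

g = tf.constant([100.0, 100.0])  # L2 norm = 100 * sqrt(2)
print(tf.clip_by_norm(g, clip_norm=1.0).numpy())  # ~[0.7071 0.7071]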
Example #18
 def testBuildIndexDict(self):
   optimizer = adam_new.Adam()
   var_list = [tf.Variable(0, name=f"var{i}") for i in range(10)]
   optimizer._build_index_dict(var_list)
   self.assertEqual(optimizer._index_dict[optimizer._var_key(var_list[7])], 7)
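Note: conceptually, the index dict checked here maps a variable's key to its position in the trainable-variables list so slot variables can be looked up by index. A pure-Python sketch, independent of the private _build_index_dict / _var_key APIs:

var_names = [f"var{i}" for i in range(10)]
index_dict = {name: position for position, name in enumerate(var_names)}
assert index_dict["var7"] == 7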
Example #19
 def testAddVariableFromReference(self):
   optimizer = adam_new.Adam()
   variable = optimizer.add_variable_from_reference(
       tf.Variable(1.0, name="tmp"), "test")
   self.assertEqual(variable._shared_name, "test/tmp")
   self.assertEqual(self.evaluate(variable), 0)
Example #20
 def testAdam(self):
   self._compare_numerical(
       adam_old.Adam(amsgrad=True), adam_new.Adam(amsgrad=True))
Example #21
    ds_combinations.mirrored_strategy_with_two_gpus,
    ds_combinations.tpu_strategy,
    ds_combinations.cloud_tpu_strategy,
    ds_combinations.multi_worker_mirrored_2x1_cpu,
    ds_combinations.multi_worker_mirrored_2x2_gpu,
    ds_combinations.central_storage_strategy_with_two_gpus,
]

adadelta_new_fn = tf.__internal__.test.combinations.NamedObject(
    "experimentaladadelta",
    lambda: adadelta_new.Adadelta(  # pylint: disable=g-long-lambda
        0.002, use_ema=True, ema_overwrite_frequency=None))
adagrad_new_fn = tf.__internal__.test.combinations.NamedObject(
    "experimentaladagrad", lambda: adagrad_new.Adagrad(0.002))
adam_new_fn = tf.__internal__.test.combinations.NamedObject(
    "experimentaladam", lambda: adam_new.Adam(0.002))
rmsprop_new_fn = tf.__internal__.test.combinations.NamedObject(
    "experimentalrmsprop", lambda: rmsprop_new.RMSprop(0.002))
sgd_new_fn = tf.__internal__.test.combinations.NamedObject(
    "experimentalsgdaverage",
    lambda: sgd_new.SGD(  # pylint: disable=g-long-lambda
        0.002, use_ema=True, ema_overwrite_frequency=1))

OPTIMIZER_FN = [
    adadelta_new_fn,
    adagrad_new_fn,
    adam_new_fn,
    rmsprop_new_fn,
    sgd_new_fn,
]
Example #22
 def testPassingLegacyClipnorm(self):
   optimizer = adam_new.Adam(clipnorm=1)
   self.assertEqual(optimizer._clipnorm, 1)