def _compile_keras_model(self, hp, model):
    """Compile ``model`` with an optimizer selected through hyperparameters.

    The optimizer name ("optimizer") and the learning rate ("learning_rate")
    are both drawn from ``hp.Choice``. For "adam_weight_decay" the learning
    rate follows a polynomial decay to 0.0 over the whole training run,
    optionally preceded by a warm-up phase; that branch reads
    ``self.num_samples``, ``self.batch_size`` and ``self.epochs``.

    Args:
        hp: Hyperparameter container exposing ``Choice``.
        model: The model to compile. It is compiled in place.

    Returns:
        The same ``model`` object, after ``model.compile`` has been called
        with the chosen optimizer, ``self._get_metrics()`` and
        ``self._get_loss()``.

    Raises:
        ValueError: If the selected optimizer name is not handled here.
    """
    # Specify hyperparameters from compile(...)
    optimizer_name = hp.Choice(
        "optimizer",
        ["adam", "sgd", "adam_weight_decay"],
        default="adam",
    )
    # TODO: add adadelta optimizer when it can optimize embedding layer on GPU.
    learning_rate = hp.Choice(
        "learning_rate",
        [1e-1, 1e-2, 1e-3, 1e-4, 2e-5, 1e-5],
        default=1e-3,
    )

    if optimizer_name == "adam":
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    elif optimizer_name == "sgd":
        optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
    elif optimizer_name == "adam_weight_decay":
        steps_per_epoch = int(self.num_samples / self.batch_size)
        # steps_per_epoch is 0 when there are fewer samples than one batch.
        # PolynomialDecay divides the step by decay_steps, so keep the total
        # at >= 1 to avoid a 0/0 learning rate.
        num_train_steps = max(1, steps_per_epoch * self.epochs)
        # Warm up over roughly the first 10% of all training steps.
        warmup_steps = int(
            self.epochs * self.num_samples * 0.1 / self.batch_size
        )

        lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
            initial_learning_rate=learning_rate,
            decay_steps=num_train_steps,
            end_learning_rate=0.0,
        )
        # Skip the warm-up wrapper entirely when it would span zero steps.
        if warmup_steps:
            lr_schedule = keras_layers.WarmUp(
                initial_learning_rate=learning_rate,
                decay_schedule_fn=lr_schedule,
                warmup_steps=warmup_steps,
            )

        optimizer = keras_layers.AdamWeightDecay(
            learning_rate=lr_schedule,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
        )
    else:
        # Fail with a clear message instead of hitting an unbound
        # ``optimizer`` at the compile call below.
        raise ValueError(f"Unsupported optimizer: {optimizer_name!r}")

    model.compile(
        optimizer=optimizer,
        metrics=self._get_metrics(),
        loss=self._get_loss(),
    )
    return model
def test_adam_weight_decay(tmp_path):
    """Smoke test: fit and save a model compiled with AdamWeightDecay.

    Builds a single Dense layer model, wraps a polynomial-decay schedule in
    WarmUp, trains for two epochs on random data and saves the model under
    ``tmp_path``. The test passes if none of these steps raises.
    """
    dense = tf.keras.layers.Dense(10, input_shape=(10,))
    model = tf.keras.Sequential([dense])

    # Decaying schedule that WarmUp hands over to once warm-up is finished.
    decay_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=0.1,
        decay_steps=100,
        end_learning_rate=0.0,
    )
    warmup_schedule = layer_module.WarmUp(
        initial_learning_rate=0.1,
        decay_schedule_fn=decay_schedule,
        warmup_steps=10,
    )

    no_decay_patterns = ["LayerNorm", "layer_norm", "bias"]
    optimizer = layer_module.AdamWeightDecay(
        learning_rate=warmup_schedule,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=no_decay_patterns,
    )
    model.compile(loss="mse", optimizer=optimizer)

    # Random inputs first, then random targets, both 100 rows of 10 values.
    features = np.random.rand(100, 10)
    targets = np.random.rand(100, 10)
    model.fit(features, targets, epochs=2)

    save_path = os.path.join(tmp_path, "model")
    model.save(save_path)