def create_optimizer(
    init_lr,
    num_train_steps,
    num_warmup_steps,
    learning_rate_type="polynomial",
    adam_beta_2=0.999,
    adam_epsilon=1e-06,
    weight_decay_rate=0.0,
    optimizer_type="adamw",
):
    """Creates an optimizer with learning rate schedule."""
    if optimizer_type == "adafactor":
        return AdafactorOptimizer(learning_rate=init_lr)

    if learning_rate_type == "linear":
        if num_warmup_steps:
            learning_rate_fn = WarmUp_Linear(
                initial_learning_rate=init_lr,
                num_training_steps=num_train_steps,
                warmup_steps=num_warmup_steps,
            )
    else:
        # Implements polynomial decay of the learning rate.
        learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
            initial_learning_rate=init_lr,
            decay_steps=num_train_steps,
            end_learning_rate=0.0,
        )
        if num_warmup_steps:
            learning_rate_fn = WarmUp(
                initial_learning_rate=init_lr,
                decay_schedule_fn=learning_rate_fn,
                warmup_steps=num_warmup_steps,
            )

    if optimizer_type == "adamw":
        logging.info("using Adamw optimizer")
        optimizer = AdamWeightDecay(
            learning_rate=learning_rate_fn,
            weight_decay_rate=weight_decay_rate,
            beta_1=0.9,
            beta_2=adam_beta_2,
            epsilon=adam_epsilon,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
        )
    elif optimizer_type == "lamb":
        logging.info("using Lamb optimizer")
        optimizer = tfa_optimizers.LAMB(
            learning_rate=learning_rate_fn,
            weight_decay_rate=weight_decay_rate,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
        )
    else:
        raise ValueError("Unsupported optimizer type: ", optimizer_type)
    return optimizer, learning_rate_fn
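# All of the variants below wrap their decay schedule in a `WarmUp` object that
# is imported from elsewhere (e.g. the TF Model Garden's
# official.nlp.optimization). For readers without that dependency, the class
# below is a minimal sketch of a linear-warmup wrapper with the same
# constructor arguments; it is an approximation for illustration, not the
# library implementation.
import tensorflow as tf


class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Sketch: ramps the LR from 0 to `initial_learning_rate` over
    `warmup_steps`, then hands off to `decay_schedule_fn`."""

    def __init__(self, initial_learning_rate, decay_schedule_fn, warmup_steps,
                 power=1.0, name=None):
        super().__init__()
        self.initial_learning_rate = initial_learning_rate
        self.decay_schedule_fn = decay_schedule_fn
        self.warmup_steps = warmup_steps
        self.power = power
        self.name = name

    def __call__(self, step):
        with tf.name_scope(self.name or "WarmUp"):
            global_step = tf.cast(step, tf.float32)
            warmup_steps = tf.cast(self.warmup_steps, tf.float32)
            # Fraction of warmup completed, raised to `power` (1.0 = linear).
            warmup_fraction = global_step / warmup_steps
            warmup_lr = self.initial_learning_rate * tf.math.pow(
                warmup_fraction, self.power)
            return tf.cond(
                global_step < warmup_steps,
                lambda: warmup_lr,
                lambda: self.decay_schedule_fn(step),
            )

    def get_config(self):
        return {
            "initial_learning_rate": self.initial_learning_rate,
            "decay_schedule_fn": self.decay_schedule_fn,
            "warmup_steps": self.warmup_steps,
            "power": self.power,
            "name": self.name,
        }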
def test_lamb_optimizer(self):
    params = {'optimizer': {'type': 'lamb'}}
    expected_optimizer_config = tfa_optimizers.LAMB().get_config()

    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()
    optimizer = opt_factory.build_optimizer(lr)

    self.assertIsInstance(optimizer, tfa_optimizers.LAMB)
    self.assertEqual(expected_optimizer_config, optimizer.get_config())
def create_optimizer(init_lr,
                     num_train_steps,
                     num_warmup_steps,
                     optimizer,
                     use_lr_schedule,
                     use_bias_correction_for_adamw=False):
    """Creates an optimizer with learning rate schedule.

    Extended based on official.nlp.optimization.create_optimizer.

    :param init_lr: Initial learning rate
    :param num_train_steps: Number of training steps
    :param num_warmup_steps: Number of warmup steps
    :param optimizer: Type of optimizer
    :param use_lr_schedule: Whether to use learning rate scheduling such as warmup and decay
    :param use_bias_correction_for_adamw: Whether to use bias correction in the AdamWeightDecay optimizer
    """
    lr_schedule = init_lr
    if use_lr_schedule:
        # Implements linear decay of the learning rate.
        lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
            initial_learning_rate=init_lr,
            decay_steps=num_train_steps,
            power=1.0,
            end_learning_rate=0.0)
        if num_warmup_steps:
            lr_schedule = WarmUp(initial_learning_rate=init_lr,
                                 decay_schedule_fn=lr_schedule,
                                 warmup_steps=num_warmup_steps)

    optimizer_dct = {
        'sgd': tf.keras.optimizers.SGD(learning_rate=lr_schedule),
        'adam': tf.keras.optimizers.Adam(learning_rate=lr_schedule),
        'adamw': AdamWeightDecay(
            learning_rate=lr_schedule,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'],
            use_bias_correction=use_bias_correction_for_adamw),
        'lamb': tfa_optimizers.LAMB(
            learning_rate=lr_schedule,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'])
    }
    return optimizer_dct[optimizer]
def create_optimizer(init_lr, num_train_steps, num_warmup_steps, optimizer_type="adam"): """Creates an optimizer with learning rate schedule.""" # Implements linear decay of the learning rate. if optimizer_type == "adam": power = 1.0 decayed_learning_rate_at_crossover_point = init_lr * ( (1.0 - float(num_warmup_steps) / float(num_train_steps))**power) else: power = 0.5 decayed_learning_rate_at_crossover_point = init_lr init_lr = init_lr * (init_lr / decayed_learning_rate_at_crossover_point) print( 'decayed_learning_rate_at_crossover_point = %e, adjusted_init_lr = %e' % (decayed_learning_rate_at_crossover_point, init_lr)) learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay( initial_learning_rate=init_lr, decay_steps=num_train_steps, end_learning_rate=0.0, power=power) if num_warmup_steps: learning_rate_fn = WarmUp(initial_learning_rate=init_lr, decay_schedule_fn=learning_rate_fn, warmup_steps=num_warmup_steps) if optimizer_type == 'adam': optimizer = AdamWeightDecay( learning_rate=learning_rate_fn, weight_decay_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-6, exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias']) else: skip_list = [ 'None' ] # to avoid exclude_from_layer_adaptation set to exclude_from_weight_decay if the arg is None optimizer = tfa_optimizers.LAMB( learning_rate=learning_rate_fn, weight_decay_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-6, exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'], exclude_from_layer_adaptation=skip_list) return optimizer
def create_optimizer(init_lr,
                     num_train_steps,
                     num_warmup_steps,
                     weight_decay_rate=0.01,
                     layerwise_lr_decay=-1,
                     n_transformer_layers=None,
                     clip_norm=1.0,
                     optimizer="adam",
                     skip_adaptive=False,
                     power=1.0,
                     beta_1=0.9,
                     beta_2=0.999,
                     end_lr=0.0):
    """Creates an optimizer with learning rate schedule."""
    # Implements linear decay of the learning rate.
    learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=init_lr,
        decay_steps=num_train_steps - num_warmup_steps,
        end_learning_rate=end_lr,
        power=power)
    if num_warmup_steps:
        learning_rate_fn = WarmUp(
            initial_learning_rate=init_lr,
            decay_schedule_fn=learning_rate_fn,
            warmup_steps=num_warmup_steps)

    layer_decay = None
    if layerwise_lr_decay > 0 and n_transformer_layers is not None:
        layer_decay = _get_layer_decay(layerwise_lr_decay, n_transformer_layers)

    if optimizer == "adam":
        optimizer = AdamWeightDecay(
            learning_rate=learning_rate_fn,
            weight_decay_rate=weight_decay_rate,
            layer_decay=layer_decay,
            beta_1=beta_1,
            beta_2=beta_2,
            epsilon=1e-6,
            exclude_from_weight_decay=["layer_norm", "bias", "LayerNorm"],
            clip_norm=clip_norm,
        )
    else:
        if skip_adaptive:
            skip_list = ["layer_norm", "bias", "LayerNorm"]
        else:
            skip_list = ["None"]
        log("Skip list for LAMB {}".format(skip_list))
        optimizer = tfa_optimizers.LAMB(
            learning_rate=learning_rate_fn,
            weight_decay_rate=weight_decay_rate,
            beta_1=beta_1,
            beta_2=beta_2,
            epsilon=1e-6,
            exclude_from_weight_decay=["layer_norm", "bias", "LayerNorm"],
            exclude_from_layer_adaptation=skip_list,
        )
    return optimizer
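# The variant above relies on a `_get_layer_decay` helper that is not included
# here. The sketch below shows one plausible shape for it, assuming it maps
# variable-name prefixes to multiplicative learning-rate factors so layers
# closer to the input get smaller updates; the exact key strings (embedding,
# encoder-layer, and head prefixes) depend on the model's variable names and
# are assumptions, not the original implementation.
import collections


def _get_layer_decay(layer_decay, n_layers):
    """Sketch: build {variable-name prefix: LR multiplier} for layerwise decay."""
    key_to_depths = collections.OrderedDict({
        "embeddings/": 0,                  # assumed embedding prefix
        "task_specific/": n_layers + 2,    # assumed head prefix
    })
    for layer in range(n_layers):
        key_to_depths["encoder/layer_%d/" % layer] = layer + 1
    # Deeper (closer-to-output) parameters get a factor nearer to 1.0.
    return {
        key: layer_decay ** (n_layers + 2 - depth)
        for key, depth in key_to_depths.items()
    }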
def create_optimizer(init_lr, num_train_steps, num_warmup_steps, end_lr=0.0, optimizer_type="adamw"): """Creates an optimizer with learning rate schedule.""" # Implements linear decay of the learning rate. lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay( initial_learning_rate=init_lr, decay_steps=num_train_steps, end_learning_rate=end_lr, ) if num_warmup_steps: lr_schedule = WarmUp( initial_learning_rate=init_lr, decay_schedule_fn=lr_schedule, warmup_steps=num_warmup_steps, ) if optimizer_type == "adamw": logging.info("using Adamw optimizer") optimizer = AdamWeightDecay( learning_rate=lr_schedule, weight_decay_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-6, exclude_from_weight_decay=["layer_norm", "bias"], ) elif optimizer_type == "lamb": logging.info("using Lamb optimizer") optimizer = tfa_optimizers.LAMB( learning_rate=lr_schedule, weight_decay_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-6, exclude_from_weight_decay=["layer_norm", "bias"], ) else: raise ValueError("Unsupported optimizer type: ", optimizer_type) return optimizer
def create_optimizer(init_lr,
                     num_train_steps,
                     num_warmup_steps,
                     end_lr=0.0,
                     optimizer_type='adamw',
                     beta_1=0.9,
                     poly_power=1.0):
    """Creates an optimizer with learning rate schedule."""
    # Implements linear decay of the learning rate.
    lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=init_lr,
        decay_steps=num_train_steps,
        end_learning_rate=end_lr,
        power=poly_power)
    if num_warmup_steps:
        lr_schedule = WarmUp(
            initial_learning_rate=init_lr,
            decay_schedule_fn=lr_schedule,
            warmup_steps=num_warmup_steps)

    if optimizer_type == 'adamw':
        logging.info('using Adamw optimizer')
        optimizer = AdamWeightDecay(
            learning_rate=lr_schedule,
            weight_decay_rate=0.01,
            beta_1=beta_1,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'])
    elif optimizer_type == 'lamb':
        logging.info('using Lamb optimizer')
        optimizer = tfa_optimizers.LAMB(
            learning_rate=lr_schedule,
            weight_decay_rate=0.01,
            beta_1=beta_1,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'])
    else:
        raise ValueError('Unsupported optimizer type: ', optimizer_type)
    return optimizer
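# A minimal usage sketch for any of the create_optimizer variants above,
# assuming the variant and its dependencies (AdamWeightDecay, WarmUp,
# tfa_optimizers) are importable from the surrounding project. The step counts,
# learning rate, and toy model are illustrative values, not recommendations.
import tensorflow as tf

num_train_steps = 10000
num_warmup_steps = 1000

# Warm up for the first 1000 steps, then decay polynomially to end_lr.
optimizer = create_optimizer(
    init_lr=5e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type="adamw",
)

# The returned optimizer is a regular Keras optimizer and can be passed
# straight to model.compile or used in a custom training loop.
model = tf.keras.Sequential([tf.keras.layers.Dense(2)])
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)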