def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu,
                     Global_step, optimizer="adamw", poly_power=1.0,
                     start_warmup_step=0):
  """Creates an optimizer training op.

  Builds a polynomially decayed learning rate (optionally overridden by a
  linear warmup), feeds it to AdamW or LAMB, clips the gradients of `loss`
  to a global norm of 1.0 and returns an op that applies them and then
  increments the global step.

  Args:
    loss: Tensor whose gradients w.r.t. `tf.trainable_variables()` are applied.
    init_lr: Initial (peak) learning rate; used as a scalar constant.
    num_train_steps: Number of decay steps for the polynomial schedule, which
      ends at a learning rate of 0.0.
    num_warmup_steps: Number of linear warmup steps; a falsy value disables
      warmup.
    use_tpu: If true, wraps the optimizer in `contrib_tpu.CrossShardOptimizer`.
    Global_step: Step variable that drives the schedule and is incremented by
      the returned op. Pass None to use
      `tf.train.get_or_create_global_step()`.
    optimizer: Either "adamw" or "lamb".
    poly_power: Power of the polynomial decay (1.0 gives linear decay).
    start_warmup_step: Global step at which the warmup ramp starts.

  Returns:
    An op grouping the gradient update with the global step increment.

  Raises:
    ValueError: If `optimizer` is not "adamw" or "lamb".
  """
  # Validate first so an unsupported name fails before any graph ops are
  # created.
  if optimizer not in ("adamw", "lamb"):
    raise ValueError("Not supported optimizer: %r" % (optimizer,))

  # Compare against None instead of relying on truthiness: calling bool() on
  # a graph-mode tensor / resource variable raises a TypeError.
  if Global_step is None:
    global_step = tf.train.get_or_create_global_step()
  else:
    global_step = Global_step

  learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

  # Implements polynomial decay of the learning rate (linear when
  # poly_power == 1.0).
  learning_rate = tf.train.polynomial_decay(
      learning_rate,
      global_step,
      num_train_steps,
      end_learning_rate=0.0,
      power=poly_power,
      cycle=False)

  # Implements linear warmup. I.e., if global_step - start_warmup_step <
  # num_warmup_steps, the learning rate will be
  # `(global_step - start_warmup_step)/num_warmup_steps * init_lr`.
  # NOTE(review): for global_step < start_warmup_step this expression is
  # negative; callers are presumably expected to start at or after
  # start_warmup_step -- confirm.
  if num_warmup_steps:
    tf.logging.info("++++++ warmup starts at step %s, for %s steps ++++++",
                    start_warmup_step, num_warmup_steps)
    global_steps_int = tf.cast(global_step, tf.int32)
    start_warm_int = tf.constant(start_warmup_step, dtype=tf.int32)
    global_steps_int = global_steps_int - start_warm_int
    warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

    global_steps_float = tf.cast(global_steps_int, tf.float32)
    warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

    warmup_percent_done = global_steps_float / warmup_steps_float
    warmup_learning_rate = init_lr * warmup_percent_done

    # is_warmup is 1.0 during warmup and 0.0 afterwards, so this selects
    # between the two schedules without a conditional op.
    is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
    learning_rate = ((1.0 - is_warmup) * learning_rate +
                     is_warmup * warmup_learning_rate)

  # It is OK to use this optimizer for fine-tuning, since this is how the
  # model was trained (note that the Adam m/v variables are NOT loaded from
  # init_checkpoint.)
  # It is OK to use AdamW for fine-tuning even if the model was trained with
  # LAMB. As reported in the public BERT GitHub repository, the learning rate
  # for SQuAD 1.1 fine-tuning is 3e-5, 4e-5 or 5e-5. For LAMB, users can use
  # 3e-4, 4e-4, or 5e-4 for a batch size of 64 in fine-tuning.
  optimizer_kwargs = dict(
      learning_rate=learning_rate,
      weight_decay_rate=0.01,
      beta_1=0.9,
      beta_2=0.999,
      epsilon=1e-6,
      exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
  if optimizer == "adamw":
    tf.logging.info("using adamw")
    opt = AdamWeightDecayOptimizer(**optimizer_kwargs)
  else:  # "lamb"; any other value was rejected above.
    tf.logging.info("using lamb")
    opt = lamb_optimizer.LAMBOptimizer(**optimizer_kwargs)

  if use_tpu:
    opt = contrib_tpu.CrossShardOptimizer(opt)

  tvars = tf.trainable_variables()
  grads = tf.gradients(loss, tvars)

  # This is how the model was pre-trained.
  (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)

  train_op = opt.apply_gradients(
      list(zip(grads, tvars)), global_step=global_step)

  # Normally the global step update is done inside of `apply_gradients`.
  # However, neither `AdamWeightDecayOptimizer` nor `LAMBOptimizer` do this.
  # But if you use a different optimizer, you should probably take this line
  # out.
  new_global_step = global_step + 1
  train_op = tf.group(train_op, [global_step.assign(new_global_step)])
  return train_op
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu,
                     optimizer="adamw", weight_decay_rate=0.01,
                     end_lr_rate=0.0001, use_layer_wise_warmup=False,
                     total_warmup_phases=0):
  """Creates an optimizer training op.

  Builds a linearly decayed learning rate (optionally overridden by a linear
  warmup), feeds it to AdamW or LAMB, clips the gradients of `loss` to a
  global norm of 1.0 and returns an op that applies them and then increments
  the global step. Trainable variables whose name starts with "teacher"
  receive no gradient.

  Args:
    loss: Tensor whose gradients w.r.t. the non-"teacher" trainable variables
      are applied.
    init_lr: Initial (peak) learning rate; used as a scalar constant.
    num_train_steps: Number of decay steps for the schedule; also forwarded
      to the optimizer constructor.
    num_warmup_steps: Number of linear warmup steps; a falsy value disables
      warmup.
    use_tpu: If true, wraps the optimizer in `tf.tpu.CrossShardOptimizer`.
    optimizer: Either "adamw" or "lamb".
    weight_decay_rate: Weight decay rate forwarded to the optimizer.
    end_lr_rate: Final learning rate expressed as a fraction of `init_lr`.
    use_layer_wise_warmup: Forwarded unchanged to the optimizer constructor.
    total_warmup_phases: Forwarded unchanged to the optimizer constructor.

  Returns:
    An op grouping the gradient update with the global step increment.

  Raises:
    ValueError: If `optimizer` is not "adamw" or "lamb".
  """
  # Validate first so an unsupported name fails before any graph ops are
  # created.
  if optimizer not in ("adamw", "lamb"):
    raise ValueError("Not supported optimizer: %r" % (optimizer,))

  global_step = tf.train.get_or_create_global_step()

  learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

  # Implements linear decay of the learning rate, down to
  # `init_lr * end_lr_rate`.
  learning_rate = tf.train.polynomial_decay(
      learning_rate,
      global_step,
      num_train_steps,
      end_learning_rate=learning_rate * end_lr_rate,
      power=1.0,
      cycle=False)

  # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
  # learning rate will be `global_step/num_warmup_steps * init_lr`.
  if num_warmup_steps:
    global_steps_int = tf.cast(global_step, tf.int32)
    warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

    global_steps_float = tf.cast(global_steps_int, tf.float32)
    warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

    warmup_percent_done = global_steps_float / warmup_steps_float
    warmup_learning_rate = init_lr * warmup_percent_done

    # is_warmup is 1.0 during warmup and 0.0 afterwards, so this selects
    # between the two schedules without a conditional op.
    is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
    learning_rate = ((1.0 - is_warmup) * learning_rate +
                     is_warmup * warmup_learning_rate)

  # It is recommended that you use this optimizer for fine tuning, since this
  # is how the model was trained (note that the Adam m/v variables are NOT
  # loaded from init_checkpoint.)
  optimizer_kwargs = dict(
      learning_rate=learning_rate,
      weight_decay_rate=weight_decay_rate,
      beta_1=0.9,
      beta_2=0.999,
      epsilon=1e-6,
      exclude_from_weight_decay=[
          "teacher", "LayerNorm", "layer_norm", "bias", "FakeLayerNorm"
      ],
      use_layer_wise_warmup=use_layer_wise_warmup,
      total_warmup_phases=total_warmup_phases,
      num_train_steps=num_train_steps)
  if optimizer == "adamw":
    opt = AdamWeightDecayOptimizer(**optimizer_kwargs)
  else:  # "lamb"; any other value was rejected above.
    opt = lamb_optimizer.LAMBOptimizer(**optimizer_kwargs)

  if use_tpu:
    opt = tf.tpu.CrossShardOptimizer(opt)

  # Split the trainable variables once: "teacher" variables get a None
  # gradient, everything else is differentiated.
  all_vars = tf.trainable_variables()
  tvars = [var for var in all_vars if not var.name.startswith("teacher")]
  ntvars = [var for var in all_vars if var.name.startswith("teacher")]

  grads = tf.gradients(loss, tvars, colocate_gradients_with_ops=True)

  # This is how the model was pre-trained.
  (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)

  ngrads = [None] * len(ntvars)

  # Materialize the pairs so the optimizer receives a re-iterable sequence
  # rather than a one-shot zip iterator.
  train_op = opt.apply_gradients(
      list(zip(grads + ngrads, tvars + ntvars)), global_step=global_step)

  # Normally the global step update is done inside of `apply_gradients`.
  # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
  # a different optimizer, you should probably take this line out.
  new_global_step = global_step + 1
  train_op = tf.group(train_op, [global_step.assign(new_global_step)])
  return train_op