def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
  """Build an Adafactor training op for `loss`.

  Args:
    loss: Scalar loss tensor to minimize.
    init_lr: Initial learning rate passed straight to Adafactor.
    num_train_steps: Unused; kept for signature compatibility with callers.
    num_warmup_steps: Unused; kept for signature compatibility with callers.
    use_tpu: If True, wrap the optimizer in a CrossShardOptimizer so
      gradients are averaged across TPU cores.

  Returns:
    A train_op that applies Adafactor updates and increments the global step.
  """
  # Settings mirror the gin defaults:
  #   beta1=0.0, clipping_threshold=1.0, decay_rate=None, epsilon1=1e-30,
  #   epsilon2=0.001, factored=True, min_dim_size_to_factor=128,
  #   multiply_by_parameter_scale=True
  step = tf.train.get_or_create_global_step()
  opt = adafactor.AdafactorOptimizer(
      learning_rate=init_lr,
      beta1=0.0,
      decay_rate=None,
      clipping_threshold=1.0,
      epsilon1=1e-30,
      epsilon2=0.001,
      factored=True,
      multiply_by_parameter_scale=True,
  )
  if use_tpu:
    opt = tf.contrib.tpu.CrossShardOptimizer(opt)
  variables = tf.trainable_variables()
  gradients = tf.gradients(loss, variables)
  return opt.apply_gradients(zip(gradients, variables), global_step=step)
def adafactor(learning_rate, hparams):
  """Return a tensor2tensor AdafactorOptimizer with library defaults.

  Args:
    learning_rate: Unused; Adafactor derives its own learning rate.
    hparams: Unused; kept so this matches the shared optimizer-factory
      signature.

  Returns:
    An `AdafactorOptimizer` instance from tensor2tensor.

  Raises:
    ImportError: If the tensor2tensor library is not installed.
  """
  try:
    from tensor2tensor.utils import adafactor as af
  except ImportError as exc:
    # Previously this printed the explanation and then re-raised the bare
    # ImportError class, losing the message; attach it to the exception and
    # chain the original cause instead. Also adds the missing space between
    # the two concatenated sentences.
    raise ImportError(
        "Adafactor requires the tensor2tensor library. "
        "Please run 'pip install tensor2tensor' to get Adafactor.") from exc
  del learning_rate
  del hparams
  return af.AdafactorOptimizer()
def testCallableLearningRate(self):
  """Adafactor should accept a zero-argument callable as the learning rate."""

  def learning_rate_fn():
    return 0.01

  optimizer = adafactor.AdafactorOptimizer(learning_rate=learning_rate_fn)
  var_a = tf.Variable([1., 2.])
  var_b = tf.Variable([3., 4.])
  with tf.GradientTape() as tape:
    tape.watch([var_a, var_b])
    loss = var_a * var_b
  grad_a, grad_b = tape.gradient(loss, [var_a, var_b])
  optimizer.apply_gradients(((grad_a, var_a), (grad_b, var_b)))
def get_optimizer(params, learning_rate):
  """Gets the optimizer based on the hparams and current mode (TPU vs. CPU/GPU).

  Args:
    params: A dictionary containing training hyperparameters. Reads
      'optimizer', 'use_tpu', and (depending on the optimizer) the
      'optimizer_*' / 'weight_decay_rate' keys.
    learning_rate: A float32 scalar.

  Returns:
    An optimizer instance, wrapped in a CrossShardOptimizer when
    params['use_tpu'] is set.

  Raises:
    ValueError: If params['optimizer'] names an unknown optimizer.
  """
  optimizer = None
  if params['optimizer'] == 'Adafactor':
    try:
      from tensor2tensor.utils import (
          adafactor,
      )  # pylint: disable=g-import-not-at-top
      optimizer = adafactor.AdafactorOptimizer(learning_rate=learning_rate)
    except ImportError:
      # Fall back to Adam rather than failing hard. (Fixed: the two
      # concatenated message fragments previously ran together as
      # "Adafactor.Defaulting".)
      logging.error('tensor2tensor not installed. Cannot use Adafactor. '
                    'Defaulting to Adam.')
      params['optimizer'] = 'Adam'
  # Deliberately not `elif`: the Adafactor fallback above rewrites
  # params['optimizer'] to 'Adam' so this branch can pick it up.
  if params['optimizer'] == 'Adam':
    optimizer = tf.compat.v1.train.AdamOptimizer(
        learning_rate,
        beta1=params['optimizer_beta1'],
        beta2=params['optimizer_beta2'],
        epsilon=params['optimizer_epsilon'],
    )
  if params['optimizer'] == 'AdamWeightDecay':
    optimizer = AdamWeightDecayOptimizer(
        learning_rate,
        weight_decay_rate=params['weight_decay_rate'],
        beta_1=params['optimizer_beta1'],
        beta_2=params['optimizer_beta2'],
        epsilon=params['optimizer_epsilon'],
        exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'],
    )
  if params['optimizer'] == 'SGD':
    optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate)
  if optimizer is None:
    raise ValueError('Unknown optimizer: {}.'.format(params['optimizer']))
  if params['use_tpu']:
    # Average the gradients across TPU cores.
    optimizer = tf.compat.v1.tpu.CrossShardOptimizer(optimizer)
  return optimizer
def __init__(self, params, learning_rate):
  """Builds the optimizer based on the hparams and current mode (TPU vs. CPU/GPU).

  The selected optimizer is stored on `self.optimizer` (this is a
  constructor, so nothing is returned).

  Args:
    params: A dictionary containing training hyperparameters. Reads
      "optimizer", "use_tpu", and (depending on the optimizer) the
      "optimizer_*" / "weight_decay_rate" keys.
    learning_rate: A float32 scalar.

  Raises:
    ValueError: If params["optimizer"] names an unknown optimizer.
  """
  optimizer = None
  if params["optimizer"] == "Adafactor":
    try:
      from tensor2tensor.utils import adafactor  # pylint: disable=g-import-not-at-top
      optimizer = adafactor.AdafactorOptimizer(learning_rate=learning_rate)
    except ImportError:
      # Fall back to Adam rather than failing hard. (Fixed: the two
      # concatenated message fragments previously ran together as
      # "Adafactor.Defaulting".)
      logging.error("tensor2tensor not installed. Cannot use Adafactor. "
                    "Defaulting to Adam.")
      params["optimizer"] = "Adam"
  # Deliberately not `elif`: the Adafactor fallback above rewrites
  # params["optimizer"] to "Adam" so this branch can pick it up.
  if params["optimizer"] == "Adam":
    optimizer = tf.compat.v1.train.AdamOptimizer(
        learning_rate,
        beta1=params["optimizer_beta1"],
        beta2=params["optimizer_beta2"],
        epsilon=params["optimizer_epsilon"])
  if params["optimizer"] == "AdamWeightDecay":
    # Use AdamWeightDecay.  (Comment translated from Korean.)
    optimizer = AdamWeightDecayOptimizer(
        learning_rate,
        weight_decay_rate=params["weight_decay_rate"],  # 0.01
        beta_1=params["optimizer_beta1"],  # 0.9
        beta_2=params["optimizer_beta2"],  # 0.999
        epsilon=params["optimizer_epsilon"],  # 1e-06
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
  if params["optimizer"] == "SGD":
    optimizer = tf.compat.v1.train.GradientDescentOptimizer(
        learning_rate)
  if optimizer is None:
    raise ValueError("Unknown optimizer: {}.".format(
        params["optimizer"]))
  if params["use_tpu"]:
    # Average the gradients across TPU cores.
    optimizer = tf.compat.v1.tpu.CrossShardOptimizer(optimizer)
  self.optimizer = optimizer
def model_fn(features, labels, mode, config, params):
  """Estimator model function (train / eval / predict).

  Args:
    features: Dict of input tensors produced by the input_fn.
    labels: Unused; targets travel inside `features`.
    mode: A tf.estimator.ModeKeys value.
    config: Unused RunConfig.
    params: Unused params dict.

  Returns:
    A tpu_estimator.TPUEstimatorSpec for the requested mode.

  NOTE(review): relies on closure variables from the enclosing scope
  (model_params, use_tpu, train_init_checkpoint, train_warmup_steps,
  model_dir, include_features_in_predictions, decode_keys).
  """
  # The Estimator API passes these, but everything needed comes from
  # `features` and the enclosing closure.
  del labels
  del config
  del params

  tf.get_variable_scope().set_initializer(
      tf.variance_scaling_initializer(
          1.0, mode="fan_avg", distribution="uniform"))

  # PREDICTION (e.g. evaluate)
  if mode == tf.estimator.ModeKeys.PREDICT:
    predictions, _, _ = model_params.estimator_prediction_fn(features)

    if include_features_in_predictions:
      predictions.update(features)

    if decode_keys:
      # Decode the raw ids into strings in prediction.
      def decode_host_call(tensor_dict):
        for key in decode_keys:
          predictions[key] = public_parsing_ops.decode(
              tensor_dict[key], model_params.vocab_filename,
              model_params.encoder_type)
        return tensor_dict

      contrib_tpu.outside_compilation(decode_host_call, predictions)
    return tpu_estimator.TPUEstimatorSpec(mode=mode, predictions=predictions)

  # TRAINING
  training = mode == tf.estimator.ModeKeys.TRAIN
  # use_tpu is false by default so the bfloat16 branch usually skips.
  if use_tpu and model_params.use_bfloat16:
    with contrib_tpu.bfloat16_scope():
      # BUG FIX: this branch previously bound the result to a variable named
      # `loss`, leaving `XENT_loss` (used by compute_gradients, the logging
      # hook, and both TPUEstimatorSpecs below) undefined whenever use_tpu
      # and use_bfloat16 were both enabled.
      XENT_loss, outputs = model_params.model()(features, training)
  else:
    XENT_loss, outputs = model_params.model()(features, training)

  # TPU requires outputs all have batch dimension and doesn't handle scalar.
  # Tile all scalars to 1 dimension vector.
  outputs = _tile_scalar_to_batch_size(outputs, model_params.batch_size)

  # Create optimizer and define learning rate
  if mode == tf.estimator.ModeKeys.TRAIN:
    init_lr = model_params.learning_rate
    global_step = tf.train.get_global_step()
    # Inverse-sqrt decay, with the step count floored at 10k.
    lr = init_lr / 0.01 * tf.rsqrt(
        tf.maximum(tf.to_float(global_step), 10000))
    if train_init_checkpoint:
      # Linear warmup over train_warmup_steps when fine-tuning from a
      # checkpoint.
      lr = tf.minimum(
          tf.to_float(global_step + 1) / train_warmup_steps * init_lr, lr)

    optimizer = adafactor.AdafactorOptimizer(
        learning_rate=lr,
        decay_rate=adafactor.adafactor_decay_rate_pow(0.8),
        beta1=0.0)
    if use_tpu:
      optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)

    # NOTE(review): a large body of commented-out RL experimentation
    # (i.i.d./decoder sampling, REINFORCE / RELAX / RISK losses, mixed
    # losses, and a second variance-reduction optimizer) was removed here;
    # recover it from version control if those experiments are revived.

    # Plain cross-entropy training step.
    list_of_gradient_variable_pairs = optimizer.compute_gradients(XENT_loss)
    train_op = optimizer.apply_gradients(
        list_of_gradient_variable_pairs, global_step=global_step)

    tf.logging.set_verbosity(tf.logging.INFO)
    logging_hook = tf.train.LoggingTensorHook(
        {
            "loss": XENT_loss,
            "learning_rate": lr,
            "global_step": global_step,
        },
        every_n_iter=5)

    # This is the configured estimator function that is returned to train
    # the model.
    return tpu_estimator.TPUEstimatorSpec(
        mode=mode,
        loss=XENT_loss,
        train_op=train_op,
        training_hooks=[logging_hook],
        scaffold_fn=_load_vars_from_checkpoint(use_tpu,
                                               train_init_checkpoint),
        host_call=add_scalars_to_summary(model_dir, {"learning_rate": lr}))

  # EVALUATION (evaluating the performance)
  if mode == tf.estimator.ModeKeys.EVAL:
    eval_metrics = model_params.estimator_eval_metrics_fn(features, outputs)
    return tpu_estimator.TPUEstimatorSpec(
        mode=mode, loss=XENT_loss, eval_metrics=eval_metrics)
def model_fn(features, labels, mode, config, params):
  """Estimator model function.

  Builds the prediction, training, or evaluation TPUEstimatorSpec
  depending on `mode`.

  NOTE(review): reads closure variables from the enclosing scope
  (model_params, use_tpu, train_init_checkpoint, train_warmup_steps,
  model_dir, include_features_in_predictions, decode_keys).
  """
  # The Estimator API passes these, but everything needed comes from
  # `features` and the enclosing closure.
  del labels
  del config
  del params

  tf.compat.v1.get_variable_scope().set_initializer(
      tf.compat.v1.variance_scaling_initializer(
          1.0, mode="fan_avg", distribution="uniform"))

  if mode == tf.estimator.ModeKeys.PREDICT:
    predictions = model_params.estimator_prediction_fn(features)

    if include_features_in_predictions:
      predictions.update(features)

    if decode_keys:
      # Decode the raw ids into strings in prediction.
      def decode_host_call(tensor_dict):
        for key in decode_keys:
          predictions[key] = public_parsing_ops.decode(
              tensor_dict[key], model_params.vocab_filename,
              model_params.encoder_type)
        return tensor_dict

      contrib_tpu.outside_compilation(decode_host_call, predictions)
    return contrib_tpu.TPUEstimatorSpec(mode=mode, predictions=predictions)

  training = mode == tf.estimator.ModeKeys.TRAIN
  if use_tpu and model_params.use_bfloat16:
    with contrib_tpu.bfloat16_scope():
      loss, outputs = model_params.model()(features, training)
  else:
    loss, outputs = model_params.model()(features, training)

  # TPU requires outputs all have batch dimension and doesn't handle scalar.
  # Tile all scalars to 1 dimension vector.
  outputs = _tile_scalar_to_batch_size(outputs, model_params.batch_size)

  if mode == tf.estimator.ModeKeys.TRAIN:
    init_lr = model_params.learning_rate
    global_step = tf.compat.v1.train.get_global_step()
    # Inverse-sqrt decay, with the step count floored at 10k.
    lr = init_lr / 0.01 * tf.math.rsqrt(
        tf.maximum(tf.cast(global_step, dtype=tf.float32), 10000))
    if train_init_checkpoint:
      # Linear warmup over train_warmup_steps when fine-tuning from a
      # checkpoint.
      lr = tf.minimum(
          tf.cast(global_step + 1, dtype=tf.float32) / train_warmup_steps *
          init_lr, lr)
    optimizer = adafactor.AdafactorOptimizer(
        learning_rate=lr,
        decay_rate=adafactor.adafactor_decay_rate_pow(0.8),
        beta1=0.0)
    if use_tpu:
      # Average the gradients across TPU cores.
      optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)
    train_op = optimizer.minimize(loss, global_step=global_step)

    return tpu_estimator.TPUEstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        scaffold_fn=_load_vars_from_checkpoint(use_tpu,
                                               train_init_checkpoint),
        host_call=add_scalars_to_summary(model_dir, {"learning_rate": lr}))

  if mode == tf.estimator.ModeKeys.EVAL:
    eval_metrics = model_params.estimator_eval_metrics_fn(features, outputs)
    return tpu_estimator.TPUEstimatorSpec(
        mode=mode, loss=loss, eval_metrics=eval_metrics)
def model_fn(features, labels, mode, params):
  """Estimator model function for transformer pretraining.

  NOTE(review): reads closure variables from the enclosing scope
  (vocab_size, hidden_size, filter_size, num_heads, num_encoder_layers,
  num_decoder_layers, label_smoothing, dropout, init_checkpoint, use_tpu,
  learning_rate, num_train_steps, num_warmup_steps).
  """
  tf.logging.info('*** Features ***')
  for name in sorted(features.keys()):
    tf.logging.info(' name = %s, shape = %s' % (name, features[name].shape))
  input_ids = features['input_ids']
  target_ids = features['target_ids']
  masked_lm_positions = features['masked_lm_positions']
  masked_lm_ids = features['masked_lm_ids']
  masked_lm_weights = features['masked_lm_weights']
  is_training = mode == tf.estimator.ModeKeys.TRAIN
  model = transformer.TransformerEncoderDecoderModel(
      vocab_size,
      hidden_size,
      filter_size,
      num_heads,
      num_encoder_layers,
      num_decoder_layers,
      label_smoothing,
      dropout,
  )
  loss, outputs = model({
      'inputs': input_ids,
      'targets': target_ids
  }, training=is_training)
  # NOTE(review): the masked-LM head is disabled below, but the EVAL branch
  # still references masked_lm_example_loss / masked_lm_log_probs from this
  # call — running EVAL as-is raises NameError. Re-enable this call or
  # remove those references.
  # (
  #     masked_lm_loss,
  #     masked_lm_example_loss,
  #     masked_lm_log_probs,
  # ) = get_masked_lm_output(
  #     model._context['memory'],
  #     model._embedding_layer.weights_VxD,
  #     masked_lm_positions,
  #     masked_lm_ids,
  #     masked_lm_weights,
  # )
  total_loss = loss
  tvars = tf.trainable_variables()
  initialized_variable_names = {}
  scaffold_fn = None
  if init_checkpoint:
    (
        assignment_map,
        initialized_variable_names,
    ) = get_assignment_map_from_checkpoint(tvars, init_checkpoint)
    if use_tpu:
      # On TPU, checkpoint restoration must happen inside the scaffold fn.
      def tpu_scaffold():
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        return tf.train.Scaffold()

      scaffold_fn = tpu_scaffold
    else:
      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
  tf.logging.info('**** Trainable Variables ****')
  for var in tvars:
    init_string = ''
    if var.name in initialized_variable_names:
      init_string = ', *INIT_FROM_CKPT*'
    tf.logging.info(' name = %s, shape = %s%s', var.name, var.shape,
                    init_string)
  output_spec = None
  if mode == tf.estimator.ModeKeys.TRAIN:
    init_lr = learning_rate
    global_step = tf.train.get_global_step()
    # Inverse-sqrt decay, with the step count floored at 10k.
    lr = (init_lr / 0.01 *
          tf.rsqrt(tf.maximum(tf.to_float(global_step), 10000)))
    optimizer = adafactor.AdafactorOptimizer(
        learning_rate=lr,
        decay_rate=adafactor.adafactor_decay_rate_pow(0.8),
        beta1=0.0,
    )
    if use_tpu:
      optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
    # NOTE(review): this Adafactor train_op is dead code — it is
    # unconditionally overwritten by optimization.create_optimizer below.
    # Keep one of the two paths; as written, only the second takes effect
    # (and the Adafactor ops above still add graph nodes needlessly).
    train_op = optimizer.minimize(loss, global_step=global_step)
    # global_step = tf.train.get_global_step()
    # lr = learning_rate_schedule_noam(
    #     global_step,
    #     total_train_steps = num_train_steps,
    #     warmup_steps = num_warmup_steps,
    # )
    # optimizer = adafactor.AdafactorOptimizer(
    #     learning_rate = lr, beta1 = 0.0
    # )
    # if use_tpu:
    #     optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
    # train_op = optimizer.minimize(loss, global_step = global_step)
    train_op = optimization.create_optimizer(
        total_loss,
        learning_rate,
        num_train_steps,
        num_warmup_steps,
        use_tpu,
    )
    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        scaffold_fn=scaffold_fn,
    )
  elif mode == tf.estimator.ModeKeys.EVAL:

    def metric_fn(
        masked_lm_example_loss,
        masked_lm_log_probs,
        masked_lm_ids,
        masked_lm_weights,
    ):
      """Computes the loss and accuracy of the model."""
      masked_lm_log_probs = tf.reshape(
          masked_lm_log_probs, [-1, masked_lm_log_probs.shape[-1]])
      masked_lm_predictions = tf.argmax(masked_lm_log_probs,
                                        axis=-1,
                                        output_type=tf.int32)
      masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1])
      masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
      masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
      masked_lm_accuracy = tf.metrics.accuracy(
          labels=masked_lm_ids,
          predictions=masked_lm_predictions,
          weights=masked_lm_weights,
      )
      masked_lm_mean_loss = tf.metrics.mean(
          values=masked_lm_example_loss, weights=masked_lm_weights)
      return {
          'masked_lm_accuracy': masked_lm_accuracy,
          'masked_lm_loss': masked_lm_mean_loss,
      }

    # NOTE(review): masked_lm_example_loss and masked_lm_log_probs are only
    # produced by the commented-out get_masked_lm_output call above, so this
    # raises NameError if EVAL is ever run.
    eval_metrics = (
        metric_fn,
        [
            masked_lm_example_loss,
            masked_lm_log_probs,
            masked_lm_ids,
            masked_lm_weights,
        ],
    )
    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        eval_metrics=eval_metrics,
        scaffold_fn=scaffold_fn,
    )
  else:
    raise ValueError('Only TRAIN and EVAL modes are supported: %s' % (mode))
  return output_spec
def model_fn(features, labels, mode, params):
  """Estimator model function for teacher-student distillation.

  Trains the student on a weighted mix of its task cross-entropy and a
  temperature-scaled distillation cross-entropy against the (frozen)
  teacher's softened targets; only non-student variables are restored from
  `init_checkpoint`.

  Returns:
    A tf.estimator.EstimatorSpec for TRAIN or EVAL.

  Raises:
    ValueError: If `mode` is neither TRAIN nor EVAL (previously this fell
      through to an UnboundLocalError on `estimator_spec`).

  NOTE(review): reads closure variables from the enclosing scope
  (Model, StudentModel, padded_cross_entropy_loss, distill_temperature,
  task_balance, init_checkpoint, num_warmup_steps).
  """
  X = features['inputs']
  Y = features['targets']
  is_training = mode == tf.estimator.ModeKeys.TRAIN
  model = Model(X, Y)
  student = StudentModel(X, Y)
  student_logits = student.logits[:, :, 0, 0]
  student_task_xent, weights = padded_cross_entropy_loss(
      student_logits, student.Y)
  # Softened teacher distribution; gradients must not flow into the teacher.
  teacher_targets = tf.nn.softmax(model.logits[:, :, 0, 0] /
                                  distill_temperature)
  student_distill_xent = tf.nn.softmax_cross_entropy_with_logits_v2(
      labels=tf.stop_gradient(teacher_targets),
      logits=student_logits / distill_temperature,
  )
  student_distill_xent = tf.reduce_sum(student_distill_xent * weights)
  # Standard T^2 scaling keeps gradient magnitudes comparable across
  # temperatures (Hinton et al., 2015).
  student_distill_xent *= distill_temperature**2
  phase_loss = task_balance * student_task_xent
  phase_loss += (1 - task_balance) * student_distill_xent
  loss = phase_loss / tf.reduce_sum(weights)
  task_loss = student_task_xent / tf.reduce_sum(weights)
  distill_loss = student_distill_xent / tf.reduce_sum(weights)
  # Named identities so the losses can be referenced by logging hooks.
  tf.identity(loss, 'total_loss')
  tf.identity(task_loss, 'task_loss')
  tf.identity(distill_loss, 'distill_loss')
  tf.summary.scalar('total_loss', loss)
  tf.summary.scalar('task_loss', task_loss)
  tf.summary.scalar('distill_loss', distill_loss)
  # Only teacher variables are restored; the student trains from scratch.
  tvars = [v for v in tf.trainable_variables() if 'student/' not in v.name]
  initialized_variable_names = {}
  scaffold_fn = None
  if init_checkpoint:
    (
        assignment_map,
        initialized_variable_names,
    ) = get_assignment_map_from_checkpoint(tvars, init_checkpoint)
    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
  if mode == tf.estimator.ModeKeys.TRAIN:
    global_step = tf.train.get_or_create_global_step()
    # Inverse-sqrt decay with the step count floored at num_warmup_steps.
    lr = tf.rsqrt(tf.maximum(tf.to_float(global_step), num_warmup_steps))
    optimizer = adafactor.AdafactorOptimizer(learning_rate=lr, beta1=0.0)
    train_op = optimizer.minimize(loss, global_step=global_step)
    estimator_spec = tf.estimator.EstimatorSpec(mode=mode,
                                                loss=loss,
                                                train_op=train_op)
  elif mode == tf.estimator.ModeKeys.EVAL:
    estimator_spec = tf.estimator.EstimatorSpec(
        mode=tf.estimator.ModeKeys.EVAL, loss=loss)
  else:
    # Fail loudly and consistently with the sibling model_fns instead of
    # returning an unbound name.
    raise ValueError('Only TRAIN and EVAL modes are supported: %s' % (mode))
  return estimator_spec
def model_fn(features, labels, mode, params):
  """Estimator model function for a BERT-config transformer.

  NOTE(review): reads closure variables from the enclosing scope
  (bert_config, modeling, padded_cross_entropy_loss, init_checkpoint,
  use_tpu, learning_rate).
  """
  tf.logging.info('*** Features ***')
  for name in sorted(features.keys()):
    tf.logging.info(' name = %s, shape = %s' % (name, features[name].shape))
  inputs = features['inputs']
  targets = features['targets']
  is_training = mode == tf.estimator.ModeKeys.TRAIN
  model = modeling.TransformerModel(bert_config)
  # llh / logits / pred_ids per position; the second tuple element is unused.
  (llh, logits, pred_ids), _ = model(inputs,
                                     target_ids=targets,
                                     training=is_training)
  # NOTE(review): assigned to a single name here, but the sibling model_fn
  # unpacks padded_cross_entropy_loss into (loss, weights) — confirm this
  # helper's return arity; if it returns a tuple, total_loss is not a
  # scalar tensor.
  total_loss = padded_cross_entropy_loss(
      logits,
      targets,
      bert_config['label_smoothing'],
      bert_config['vocab_size'],
  )
  tvars = tf.trainable_variables()
  initialized_variable_names = {}
  scaffold_fn = None
  if init_checkpoint:
    (
        assignment_map,
        initialized_variable_names,
    ) = get_assignment_map_from_checkpoint(tvars, init_checkpoint)
    if use_tpu:
      # On TPU, checkpoint restoration must happen inside the scaffold fn.
      def tpu_scaffold():
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        return tf.train.Scaffold()

      scaffold_fn = tpu_scaffold
    else:
      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
  tf.logging.info('**** Trainable Variables ****')
  print(initialized_variable_names)
  for var in tvars:
    init_string = ''
    if var.name in initialized_variable_names:
      init_string = ', *INIT_FROM_CKPT*'
    tf.logging.info(' name = %s, shape = %s%s', var.name, var.shape,
                    init_string)
  output_spec = None
  if mode == tf.estimator.ModeKeys.TRAIN:
    init_lr = learning_rate
    global_step = tf.train.get_global_step()
    # Inverse-sqrt decay, with the step count floored at 10k.
    lr = (init_lr / 0.01 *
          tf.rsqrt(tf.maximum(tf.to_float(global_step), 10000)))
    optimizer = adafactor.AdafactorOptimizer(
        learning_rate=lr,
        decay_rate=adafactor.adafactor_decay_rate_pow(0.8),
        beta1=0.0,
    )
    if use_tpu:
      optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
    train_op = optimizer.minimize(total_loss, global_step=global_step)
    # if not bert_config['use_bias']:
    #     logging.info('Fixing position embedding, i.e. not trainable.')
    #     posemb = 'pegasus/embeddings/position_embeddings'
    #     tvars = list(
    #         filter(lambda v: v.name.split(':')[0] != posemb, tvars)
    #     )
    # gradients = optimizer.compute_gradients(total_loss, tvars)
    # train_op = optimization.create_optimizer(
    #     total_loss,
    #     learning_rate,
    #     num_train_steps,
    #     num_warmup_steps,
    #     use_tpu,
    # )
    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        scaffold_fn=scaffold_fn,
    )
  elif mode == tf.estimator.ModeKeys.EVAL:
    # NOTE(review): eval_metrics is None, so EVAL reports only the loss.
    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        eval_metrics=None,
        scaffold_fn=scaffold_fn,
    )
  else:
    raise ValueError('Only TRAIN and EVAL modes are supported: %s' % (mode))
  return output_spec
def model_fn(features, labels, mode, params):
  """Estimator model function for an encoder-decoder transformer.

  NOTE(review): reads closure variables from the enclosing scope
  (vocab_size, hidden_size, filter_size, num_heads, num_encoder_layers,
  num_decoder_layers, label_smoothing, dropout, init_checkpoint, use_tpu,
  learning_rate).
  """
  tf.logging.info('*** Features ***')
  for name in sorted(features.keys()):
    tf.logging.info(' name = %s, shape = %s' % (name, features[name].shape))
  inputs = features['input_ids']
  targets = features['target_ids']
  is_training = mode == tf.estimator.ModeKeys.TRAIN
  model = transformer.TransformerEncoderDecoderModel(
      vocab_size,
      hidden_size,
      filter_size,
      num_heads,
      num_encoder_layers,
      num_decoder_layers,
      label_smoothing,
      dropout,
  )
  loss, outputs = model({
      'inputs': inputs,
      'targets': targets
  }, training=is_training)
  # total_loss aliases loss; minimize() below uses `loss`, the specs report
  # `total_loss` — same tensor either way.
  total_loss = loss
  tvars = tf.trainable_variables()
  initialized_variable_names = {}
  scaffold_fn = None
  if init_checkpoint:
    (
        assignment_map,
        initialized_variable_names,
    ) = get_assignment_map_from_checkpoint(tvars, init_checkpoint)
    if use_tpu:
      # On TPU, checkpoint restoration must happen inside the scaffold fn.
      def tpu_scaffold():
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        return tf.train.Scaffold()

      scaffold_fn = tpu_scaffold
    else:
      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
  tf.logging.info('**** Trainable Variables ****')
  for var in tvars:
    init_string = ''
    if var.name in initialized_variable_names:
      init_string = ', *INIT_FROM_CKPT*'
    tf.logging.info(' name = %s, shape = %s%s', var.name, var.shape,
                    init_string)
  output_spec = None
  if mode == tf.estimator.ModeKeys.TRAIN:
    init_lr = learning_rate
    global_step = tf.train.get_global_step()
    # Inverse-sqrt decay, with the step count floored at 10k.
    lr = (init_lr / 0.01 *
          tf.rsqrt(tf.maximum(tf.to_float(global_step), 10000)))
    optimizer = adafactor.AdafactorOptimizer(
        learning_rate=lr,
        decay_rate=adafactor.adafactor_decay_rate_pow(0.8),
        beta1=0.0,
    )
    if use_tpu:
      optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
    train_op = optimizer.minimize(loss, global_step=global_step)
    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        scaffold_fn=scaffold_fn,
    )
  elif mode == tf.estimator.ModeKeys.EVAL:
    # NOTE(review): no eval_metrics supplied, so EVAL reports only the loss.
    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode, loss=total_loss, scaffold_fn=scaffold_fn)
  else:
    raise ValueError('Only TRAIN and EVAL modes are supported: %s' % (mode))
  return output_spec