# NOTE: TensorFlow 1.x code (tf.contrib, tf.logging). This model_fn is assumed
# to be defined inside an enclosing model_fn_builder(...) closure that supplies
# vocab_size, hidden_size, filter_size, num_heads, num_encoder_layers,
# num_decoder_layers, label_smoothing, dropout, init_checkpoint, learning_rate,
# num_train_steps, num_warmup_steps and use_tpu, along with the project-level
# transformer, adafactor and optimization modules and the
# get_assignment_map_from_checkpoint / get_masked_lm_output helpers.
def model_fn(features, labels, mode, params):
  tf.logging.info('*** Features ***')
  for name in sorted(features.keys()):
    tf.logging.info('  name = %s, shape = %s' % (name, features[name].shape))

  input_ids = features['input_ids']
  target_ids = features['target_ids']
  masked_lm_positions = features['masked_lm_positions']
  masked_lm_ids = features['masked_lm_ids']
  masked_lm_weights = features['masked_lm_weights']

  is_training = mode == tf.estimator.ModeKeys.TRAIN

  model = transformer.TransformerEncoderDecoderModel(
      vocab_size,
      hidden_size,
      filter_size,
      num_heads,
      num_encoder_layers,
      num_decoder_layers,
      label_smoothing,
      dropout,
  )
  loss, outputs = model(
      {'inputs': input_ids, 'targets': target_ids}, training=is_training)

  # Masked-LM head over the encoder memory. This block was commented out in
  # the draft, but the EVAL branch below consumes masked_lm_example_loss and
  # masked_lm_log_probs, so it must stay enabled.
  (
      masked_lm_loss,
      masked_lm_example_loss,
      masked_lm_log_probs,
  ) = get_masked_lm_output(
      model._context['memory'],
      model._embedding_layer.weights_VxD,
      masked_lm_positions,
      masked_lm_ids,
      masked_lm_weights,
  )
  # Only the seq2seq loss drives training; the masked-LM loss is monitored
  # through the eval metrics.
  total_loss = loss

  tvars = tf.trainable_variables()
  initialized_variable_names = {}
  scaffold_fn = None
  if init_checkpoint:
    (
        assignment_map,
        initialized_variable_names,
    ) = get_assignment_map_from_checkpoint(tvars, init_checkpoint)
    if use_tpu:

      def tpu_scaffold():
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        return tf.train.Scaffold()

      scaffold_fn = tpu_scaffold
    else:
      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

  tf.logging.info('**** Trainable Variables ****')
  for var in tvars:
    init_string = ''
    if var.name in initialized_variable_names:
      init_string = ', *INIT_FROM_CKPT*'
    tf.logging.info('  name = %s, shape = %s%s', var.name, var.shape,
                    init_string)

  output_spec = None
  if mode == tf.estimator.ModeKeys.TRAIN:
    # Inverse-sqrt learning-rate decay (floored at step 10000), paired with
    # Adafactor.
    init_lr = learning_rate
    global_step = tf.train.get_global_step()
    lr = (
        init_lr / 0.01 *
        tf.rsqrt(tf.maximum(tf.to_float(global_step), 10000)))
    optimizer = adafactor.AdafactorOptimizer(
        learning_rate=lr,
        decay_rate=adafactor.adafactor_decay_rate_pow(0.8),
        beta1=0.0,
    )
    if use_tpu:
      optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
    train_op = optimizer.minimize(loss, global_step=global_step)

    # Alternative schedules/optimizers, kept for reference. In the draft the
    # optimization.create_optimizer() assignment below was active and silently
    # overwrote the Adafactor train_op above; only one should be enabled.
    # global_step = tf.train.get_global_step()
    # lr = learning_rate_schedule_noam(
    #     global_step,
    #     total_train_steps=num_train_steps,
    #     warmup_steps=num_warmup_steps,
    # )
    # optimizer = adafactor.AdafactorOptimizer(learning_rate=lr, beta1=0.0)
    # if use_tpu:
    #   optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
    # train_op = optimizer.minimize(loss, global_step=global_step)

    # train_op = optimization.create_optimizer(
    #     total_loss,
    #     learning_rate,
    #     num_train_steps,
    #     num_warmup_steps,
    #     use_tpu,
    # )

    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        scaffold_fn=scaffold_fn,
    )
  elif mode == tf.estimator.ModeKeys.EVAL:

    def metric_fn(
        masked_lm_example_loss,
        masked_lm_log_probs,
        masked_lm_ids,
        masked_lm_weights,
    ):
      """Computes the loss and accuracy of the model."""
      masked_lm_log_probs = tf.reshape(
          masked_lm_log_probs, [-1, masked_lm_log_probs.shape[-1]])
      masked_lm_predictions = tf.argmax(
          masked_lm_log_probs, axis=-1, output_type=tf.int32)
      masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1])
      masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
      masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
      masked_lm_accuracy = tf.metrics.accuracy(
          labels=masked_lm_ids,
          predictions=masked_lm_predictions,
          weights=masked_lm_weights,
      )
      masked_lm_mean_loss = tf.metrics.mean(
          values=masked_lm_example_loss, weights=masked_lm_weights)
      return {
          'masked_lm_accuracy': masked_lm_accuracy,
          'masked_lm_loss': masked_lm_mean_loss,
      }

    eval_metrics = (
        metric_fn,
        [
            masked_lm_example_loss,
            masked_lm_log_probs,
            masked_lm_ids,
            masked_lm_weights,
        ],
    )
    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        eval_metrics=eval_metrics,
        scaffold_fn=scaffold_fn,
    )
  else:
    raise ValueError('Only TRAIN and EVAL modes are supported: %s' % (mode))

  return output_spec
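# ----------------------------------------------------------------------------
# Usage sketch (an assumption for illustration, not part of the original
# file): one way to wrap a model_fn like the one above in a TPUEstimator.
# `make_estimator` and all literal values are hypothetical placeholders; the
# tf.contrib.tpu calls themselves are the standard TF 1.x API.
# ----------------------------------------------------------------------------
def make_estimator(model_fn, use_tpu=False, model_dir='/tmp/pretrain_ckpts'):
  run_config = tf.contrib.tpu.RunConfig(
      model_dir=model_dir,
      save_checkpoints_steps=1000,
      tpu_config=tf.contrib.tpu.TPUConfig(iterations_per_loop=1000),
  )
  # TPUEstimator falls back to CPU/GPU when use_tpu=False, which makes it
  # easy to smoke-test the model_fn locally before a TPU run.
  return tf.contrib.tpu.TPUEstimator(
      use_tpu=use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=8,
      eval_batch_size=8,
  )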
# Second model_fn variant: the same encoder-decoder setup, but without the
# masked-LM head and its eval metrics. The same closure-provided names as
# above are assumed.
def model_fn(features, labels, mode, params):
  tf.logging.info('*** Features ***')
  for name in sorted(features.keys()):
    tf.logging.info('  name = %s, shape = %s' % (name, features[name].shape))

  inputs = features['input_ids']
  targets = features['target_ids']
  is_training = mode == tf.estimator.ModeKeys.TRAIN

  model = transformer.TransformerEncoderDecoderModel(
      vocab_size,
      hidden_size,
      filter_size,
      num_heads,
      num_encoder_layers,
      num_decoder_layers,
      label_smoothing,
      dropout,
  )
  loss, outputs = model(
      {'inputs': inputs, 'targets': targets}, training=is_training)
  total_loss = loss

  tvars = tf.trainable_variables()
  initialized_variable_names = {}
  scaffold_fn = None
  if init_checkpoint:
    (
        assignment_map,
        initialized_variable_names,
    ) = get_assignment_map_from_checkpoint(tvars, init_checkpoint)
    if use_tpu:

      def tpu_scaffold():
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        return tf.train.Scaffold()

      scaffold_fn = tpu_scaffold
    else:
      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

  tf.logging.info('**** Trainable Variables ****')
  for var in tvars:
    init_string = ''
    if var.name in initialized_variable_names:
      init_string = ', *INIT_FROM_CKPT*'
    tf.logging.info('  name = %s, shape = %s%s', var.name, var.shape,
                    init_string)

  output_spec = None
  if mode == tf.estimator.ModeKeys.TRAIN:
    # Same inverse-sqrt Adafactor schedule as the model_fn above.
    init_lr = learning_rate
    global_step = tf.train.get_global_step()
    lr = (
        init_lr / 0.01 *
        tf.rsqrt(tf.maximum(tf.to_float(global_step), 10000)))
    optimizer = adafactor.AdafactorOptimizer(
        learning_rate=lr,
        decay_rate=adafactor.adafactor_decay_rate_pow(0.8),
        beta1=0.0,
    )
    if use_tpu:
      optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
    train_op = optimizer.minimize(loss, global_step=global_step)

    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        scaffold_fn=scaffold_fn,
    )
  elif mode == tf.estimator.ModeKeys.EVAL:
    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode, loss=total_loss, scaffold_fn=scaffold_fn)
  else:
    raise ValueError('Only TRAIN and EVAL modes are supported: %s' % (mode))

  return output_spec
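# ----------------------------------------------------------------------------
# Input-pipeline sketch (an assumption for illustration, not part of the
# original file): a minimal input_fn producing the 'input_ids'/'target_ids'
# features the model_fn above reads. The TFRecord layout, sequence lengths
# and file pattern are hypothetical; the tf.data / tf.parse_single_example
# calls are the standard TF 1.x API. TPUEstimator passes the per-shard batch
# size in params['batch_size'].
# ----------------------------------------------------------------------------
def make_input_fn(file_pattern, max_input_length=512, max_target_length=256,
                  is_training=True):

  def input_fn(params):
    batch_size = params['batch_size']

    def _parse(record):
      parsed = tf.parse_single_example(
          record,
          {
              'input_ids': tf.FixedLenFeature([max_input_length], tf.int64),
              'target_ids': tf.FixedLenFeature([max_target_length], tf.int64),
          })
      # tf.Example stores int64; the model consumes int32 token ids.
      return {name: tf.cast(t, tf.int32) for name, t in parsed.items()}

    dataset = tf.data.TFRecordDataset(tf.gfile.Glob(file_pattern))
    if is_training:
      dataset = dataset.repeat().shuffle(buffer_size=10000)
    dataset = dataset.map(_parse, num_parallel_calls=8)
    # drop_remainder=True keeps shapes static, which TPUs require.
    dataset = dataset.batch(batch_size, drop_remainder=True)
    return dataset.prefetch(2)

  return input_fn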