def model_fn(features, labels, mode, params): # pylint: disable=unused-argument """The `model_fn` for TPUEstimator.""" tf.logging.info("*** Features ***") for name in sorted(features.keys()): tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) input_ids = features["input_ids"] input_mask = features["input_mask"] segment_ids = features["segment_ids"] embeddings = self._create_model(model_config, run_config, input_ids, input_mask, segment_ids, model_type) scaffold_fn = model_utils.init_from_checkpoint(FLAGS) output_spec = None if mode == tf.estimator.ModeKeys.TRAIN: loss = tf.Variable(0.0, name="loss", dtype=tf.float32) train_op, _, _ = model_utils.get_train_op(FLAGS, loss) output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=loss, train_op=train_op, scaffold_fn=scaffold_fn) else: output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, predictions={ "embeddings": embeddings }, scaffold_fn=scaffold_fn) return output_spec
def model_fn(features, labels, mode, params): # pylint: disable=unused-argument """The `model_fn` for TPUEstimator.""" def metric_fn(label_ids, predict_ids): precision = tf.metrics.precision(labels=label_ids, predictions=predict_ids) recall = tf.metrics.recall(labels=label_ids, predictions=predict_ids) metric = { "precision": precision, "recall": recall, } return metric tf.logging.info("*** Features ***") for name in sorted(features.keys()): tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) input_ids = features["input_ids"] input_masks = features["input_masks"] segment_ids = features["segment_ids"] label_ids = features["label_ids"] if mode in [ tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL ] else None loss, predict_ids = self._create_model(input_ids, input_masks, segment_ids, label_ids, label_list, mode) scaffold_fn = model_utils.init_from_checkpoint(FLAGS) output_spec = None if mode == tf.estimator.ModeKeys.TRAIN: train_op, _, _ = model_utils.get_train_op(FLAGS, loss) output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=loss, train_op=train_op, scaffold_fn=scaffold_fn) elif mode == tf.estimator.ModeKeys.EVAL: masked_label_ids = self._get_masked_data(label_ids, label_list) masked_predict_ids = self._get_masked_data( predict_ids, label_list) eval_metrics = (metric_fn, [masked_label_ids, masked_predict_ids]) output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=loss, eval_metrics=eval_metrics, scaffold_fn=scaffold_fn) else: output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, predictions={"predict": predict_ids}, scaffold_fn=scaffold_fn) return output_spec
def model_fn(features, labels, mode, params): """doc.""" #### Training or Evaluation is_training = (mode == tf.estimator.ModeKeys.TRAIN) assert is_training #### Retrieve `mems` from `params["cache"]` mems = {} idx = 0 if FLAGS.mem_len > 0: mems["mems"] = params["cache"] #### Get loss from inputs total_loss, new_mems, monitor_dict = function_builder.get_loss( FLAGS, features, labels, mems, is_training) #### Turn `new_mems` into `new_cache` new_cache = [] if FLAGS.mem_len > 0: new_cache += new_mems["mems"] #### Check model parameters num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()]) tf.logging.info("#params: {}".format(num_params)) #### Configuring the optimizer train_op, learning_rate, gnorm = model_utils.get_train_op( FLAGS, total_loss) monitor_dict["lr"] = learning_rate monitor_dict["gnorm"] = gnorm #### Customized initial checkpoint scaffold_fn = model_utils.init_from_checkpoint(FLAGS, global_vars=True) #### Creating host calls host_call = function_builder.construct_scalar_host_call( monitor_dict=monitor_dict, model_dir=FLAGS.model_dir, prefix="train/", reduce_fn=tf.reduce_mean) #### Constucting training TPUEstimatorSpec with new cache. train_spec = tf.contrib.tpu.TPUEstimatorSpec(mode=mode, loss=total_loss, train_op=train_op, host_call=host_call, scaffold_fn=scaffold_fn) train_spec.cache = new_cache return train_spec
def __init__(self, config, FLAGS): self.config = config self.max_segment_len = config['max_segment_len'] self.max_span_width = config["max_span_width"] self.genres = {g: i for i, g in enumerate(config["genres"])} self.subtoken_maps = {} self.gold = {} self.eval_data = None # Load eval data lazily. self.FLAGS = FLAGS self.xlnet_config = xlnet.XLNetConfig( json_path=FLAGS.xlnet_config_file) print("################ spiece_model_file #############") print(FLAGS.spiece_model_file) self.tokenizer = spm.SentencePieceProcessor() self.tokenizer.load(FLAGS.spiece_model_file) input_props = [] input_props.append((tf.int32, [None, None])) # input_ids. input_props.append((tf.int32, [None, None])) # seg_ids. input_props.append((tf.float32, [None, None])) # input_mask input_props.append((tf.int32, [None])) # Text lengths. input_props.append((tf.int32, [None, None])) # Speaker IDs. input_props.append((tf.int32, [])) # Genre. input_props.append((tf.bool, [])) # Is training. input_props.append((tf.int32, [None])) # Gold starts. input_props.append((tf.int32, [None])) # Gold ends. input_props.append((tf.int32, [None])) # Cluster ids. input_props.append((tf.int32, [None])) # Sentence Map self.queue_input_tensors = [ tf.placeholder(dtype, shape) for dtype, shape in input_props ] dtypes, shapes = zip(*input_props) queue = tf.PaddingFIFOQueue(capacity=10, dtypes=dtypes, shapes=shapes) self.enqueue_op = queue.enqueue(self.queue_input_tensors) self.input_tensors = queue.dequeue() self.predictions, self.loss = self.get_predictions_and_loss( *self.input_tensors) ##### xlnet ###### scaffold_fn = model_utils.init_from_checkpoint(FLAGS) self.train_op, learning_rate, _ = model_utils.get_train_op( FLAGS, self.loss) self.global_step = tf.train.get_or_create_global_step()
def train(ps_device): ##### Get input function and model function train_input_fn, record_info_dict = data_utils.get_input_fn( tfrecord_dir=FLAGS.record_info_dir, split="train", bsz_per_host=FLAGS.train_batch_size, seq_len=FLAGS.seq_len, reuse_len=FLAGS.reuse_len, bi_data=FLAGS.bi_data, num_hosts=1, num_core_per_host=1, # set to one no matter how many GPUs perm_size=FLAGS.perm_size, mask_alpha=FLAGS.mask_alpha, mask_beta=FLAGS.mask_beta, uncased=FLAGS.uncased, num_passes=FLAGS.num_passes, use_bfloat16=FLAGS.use_bfloat16, num_predict=FLAGS.num_predict) # for key, info in record_info_dict.items(): tf.logging.info("num of batches {}".format(record_info_dict["num_batch"])) ##### Create input tensors / placeholders bsz_per_core = FLAGS.train_batch_size // FLAGS.num_core_per_host params = { "batch_size": FLAGS.train_batch_size # the whole batch } train_set = train_input_fn(params) example = train_set.make_one_shot_iterator().get_next() if FLAGS.num_core_per_host > 1: examples = [{} for _ in range(FLAGS.num_core_per_host)] for key in example.keys(): vals = tf.split(example[key], FLAGS.num_core_per_host, 0) for device_id in range(FLAGS.num_core_per_host): examples[device_id][key] = vals[device_id] else: examples = [example] ##### Create computational graph tower_mems, tower_losses, tower_new_mems, tower_grads_and_vars = [], [], [], [] for i in range(FLAGS.num_core_per_host): reuse = True if i > 0 else None with tf.device(assign_to_gpu(i, ps_device)), \ tf.variable_scope(tf.get_variable_scope(), reuse=reuse): # The mems for each tower is a dictionary mems_i = {} if FLAGS.mem_len: mems_i["mems"] = create_mems_tf(bsz_per_core) loss_i, new_mems_i, grads_and_vars_i = single_core_graph( is_training=True, features=examples[i], mems=mems_i) tower_mems.append(mems_i) tower_losses.append(loss_i) tower_new_mems.append(new_mems_i) tower_grads_and_vars.append(grads_and_vars_i) ## average losses and gradients across towers if len(tower_losses) > 1: loss = tf.add_n(tower_losses) / len(tower_losses) grads_and_vars = average_grads_and_vars(tower_grads_and_vars) else: loss = tower_losses[0] grads_and_vars = tower_grads_and_vars[0] ## get train op train_op, learning_rate, gnorm = model_utils.get_train_op( FLAGS, None, grads_and_vars=grads_and_vars) global_step = tf.train.get_global_step() ##### Training loop # initialize mems tower_mems_np = [] for i in range(FLAGS.num_core_per_host): mems_i_np = {} for key in tower_mems[i].keys(): mems_i_np[key] = initialize_mems_np(bsz_per_core) tower_mems_np.append(mems_i_np) saver = tf.train.Saver() gpu_options = tf.GPUOptions(allow_growth=True) model_utils.init_from_checkpoint(FLAGS, global_vars=True) with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)) as sess: sess.run(tf.global_variables_initializer()) fetches = [ loss, tower_new_mems, global_step, gnorm, learning_rate, train_op ] total_loss, prev_step = 0., -1 while True: feed_dict = {} for i in range(FLAGS.num_core_per_host): for key in tower_mems_np[i].keys(): for m, m_np in zip(tower_mems[i][key], tower_mems_np[i][key]): feed_dict[m] = m_np fetched = sess.run(fetches, feed_dict=feed_dict) loss_np, tower_mems_np, curr_step = fetched[:3] total_loss += loss_np if curr_step > 0 and curr_step % FLAGS.iterations == 0: curr_loss = total_loss / (curr_step - prev_step) tf.logging.info( "[{}] | gnorm {:.2f} lr {:8.6f} " "| loss {:.2f} | pplx {:>7.2f}, bpc {:>7.4f}".format( curr_step, fetched[-3], fetched[-2], curr_loss, math.exp(curr_loss), curr_loss / math.log(2))) total_loss, prev_step = 0., curr_step if curr_step > 0 and curr_step % FLAGS.save_steps == 0: save_path = os.path.join(FLAGS.model_dir, "model.ckpt") saver.save(sess, save_path) tf.logging.info("Model saved in path: {}".format(save_path)) if curr_step >= FLAGS.train_steps: break
def model_fn(features, labels, mode, params): #### Training or Evaluation is_training = (mode == tf.estimator.ModeKeys.TRAIN) # Get loss from inputs if FLAGS.is_regression: (total_loss, per_example_loss, logits) = function_builder.get_regression_loss( FLAGS, features, is_training) else: (total_loss, per_example_loss, logits, p) = function_builder.get_classification_loss( FLAGS, features, n_class, is_training) # Check model parameters num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()]) tf.logging.info('#params: {}'.format(num_params)) # load pretrained models scaffold_fn = model_utils.init_from_checkpoint(FLAGS) # Evaluation mode if mode == tf.estimator.ModeKeys.EVAL: assert FLAGS.num_hosts == 1 def metric_fn(per_example_loss, label_ids, logits, is_real_example): predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) eval_input_dict = { 'labels': label_ids, 'predictions': predictions, 'weights': is_real_example } accuracy = tf.metrics.accuracy(**eval_input_dict) """ change """ conf_mat = get_metrics_ops(label_ids, predictions, 3, is_real_example) loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example) return { 'eval_accuracy': accuracy, 'eval_loss': loss, "conf_mat": conf_mat } def regression_metric_fn(per_example_loss, label_ids, logits, is_real_example): loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example) pearsonr = tf.contrib.metrics.streaming_pearson_correlation( logits, label_ids, weights=is_real_example) return {'eval_loss': loss, 'eval_pearsonr': pearsonr} is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) # Constucting evaluation TPUEstimatorSpec with new cache. label_ids = tf.reshape(features['label_ids'], [-1]) if FLAGS.is_regression: metric_fn = regression_metric_fn else: metric_fn = metric_fn metric_args = [ per_example_loss, label_ids, logits, is_real_example ] if FLAGS.use_tpu: eval_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, eval_metrics=(metric_fn, metric_args), scaffold_fn=scaffold_fn) else: eval_spec = tf.estimator.EstimatorSpec( mode=mode, loss=total_loss, eval_metric_ops=metric_fn(*metric_args)) return eval_spec elif mode == tf.estimator.ModeKeys.PREDICT: label_ids = tf.reshape(features["label_ids"], [-1]) predictions = { "logits": logits, "labels": label_ids, "is_real": features["is_real_example"], "p": p, } if FLAGS.use_tpu: output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) else: output_spec = tf.estimator.EstimatorSpec( mode=mode, predictions=predictions) return output_spec # Configuring the optimizer train_op, learning_rate, _ = model_utils.get_train_op( FLAGS, total_loss) monitor_dict = {} monitor_dict["lr"] = learning_rate # Constucting training TPUEstimatorSpec with new cache. if FLAGS.use_tpu: # Creating host calls if not FLAGS.is_regression: label_ids = tf.reshape(features['label_ids'], [-1]) predictions = tf.argmax(logits, axis=-1, output_type=label_ids.dtype) is_correct = tf.equal(predictions, label_ids) accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32)) monitor_dict["accuracy"] = accuracy host_call = function_builder.construct_scalar_host_call( monitor_dict=monitor_dict, model_dir=FLAGS.model_dir, prefix="train/", reduce_fn=tf.reduce_mean) else: host_call = None train_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, host_call=host_call, scaffold_fn=scaffold_fn) else: train_spec = tf.estimator.EstimatorSpec(mode=mode, loss=total_loss, train_op=train_op) return train_spec
def model_fn(features, labels, mode, params): #### Training or Evaluation is_training = (mode == tf.estimator.ModeKeys.TRAIN) total_loss, per_example_loss, logits = function_builder.get_race_loss( FLAGS, features, is_training) #### Check model parameters num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()]) tf.logging.info('#params: {}'.format(num_params)) #### load pretrained models scaffold_fn = model_utils.init_from_checkpoint(FLAGS) #### Evaluation mode if mode == tf.estimator.ModeKeys.EVAL: assert FLAGS.num_hosts == 1 def metric_fn(per_example_loss, label_ids, logits, is_real_example): predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) eval_input_dict = { 'labels': label_ids, 'predictions': predictions, 'weights': is_real_example } accuracy = tf.metrics.accuracy(**eval_input_dict) loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example) return {'eval_accuracy': accuracy, 'eval_loss': loss} is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) #### Constucting evaluation TPUEstimatorSpec with new cache. label_ids = tf.reshape(features['label_ids'], [-1]) metric_args = [ per_example_loss, label_ids, logits, is_real_example ] if FLAGS.use_tpu: eval_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, eval_metrics=(metric_fn, metric_args), scaffold_fn=scaffold_fn) else: eval_spec = tf.estimator.EstimatorSpec( mode=mode, loss=total_loss, eval_metric_ops=metric_fn(*metric_args)) return eval_spec elif mode == tf.estimator.ModeKeys.PREDICT: label_ids = tf.reshape(features["label_ids"], [-1]) predictions = { "logits": logits, "labels": label_ids, "is_real": features["is_real_example"] } if FLAGS.use_tpu: output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) else: output_spec = tf.estimator.EstimatorSpec( mode=mode, predictions=predictions) return output_spec #### Configuring the optimizer train_op, learning_rate, _ = model_utils.get_train_op( FLAGS, total_loss) monitor_dict = {} monitor_dict["lr"] = learning_rate #### Constucting training TPUEstimatorSpec with new cache. if FLAGS.use_tpu: #### Creating host calls host_call = None train_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, host_call=host_call, scaffold_fn=scaffold_fn) else: train_spec = tf.estimator.EstimatorSpec(mode=mode, loss=total_loss, train_op=train_op) return train_spec
def model_fn(features, labels, mode, params): #### Training or Evaluation is_training = (mode == tf.estimator.ModeKeys.TRAIN) #### Get loss from inputs outputs = function_builder.get_qa_outputs(FLAGS, features, is_training) #### Check model parameters num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()]) tf.logging.info('#params: {}'.format(num_params)) scaffold_fn = None #### Evaluation mode if mode == tf.estimator.ModeKeys.PREDICT: if FLAGS.init_checkpoint: tf.logging.info( "init_checkpoint not being used in predict mode.") predictions = { "unique_ids": features["unique_ids"], "start_top_index": outputs["start_top_index"], "start_top_log_probs": outputs["start_top_log_probs"], "end_top_index": outputs["end_top_index"], "end_top_log_probs": outputs["end_top_log_probs"], "cls_logits": outputs["cls_logits"] } if FLAGS.use_tpu: output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) else: output_spec = tf.estimator.EstimatorSpec( mode=mode, predictions=predictions) return output_spec ### Compute loss seq_length = tf.shape(features["input_ids"])[1] def compute_loss(log_probs, positions): one_hot_positions = tf.one_hot(positions, depth=seq_length, dtype=tf.float32) loss = -tf.reduce_sum(one_hot_positions * log_probs, axis=-1) loss = tf.reduce_mean(loss) return loss start_loss = compute_loss(outputs["start_log_probs"], features["start_positions"]) end_loss = compute_loss(outputs["end_log_probs"], features["end_positions"]) total_loss = (start_loss + end_loss) * 0.5 cls_logits = outputs["cls_logits"] is_impossible = tf.reshape(features["is_impossible"], [-1]) regression_loss = tf.nn.sigmoid_cross_entropy_with_logits( labels=is_impossible, logits=cls_logits) regression_loss = tf.reduce_mean(regression_loss) # note(zhiliny): by default multiply the loss by 0.5 so that the scale is # comparable to start_loss and end_loss total_loss += regression_loss * 0.5 #### Configuring the optimizer train_op, learning_rate, _ = model_utils.get_train_op( FLAGS, total_loss) monitor_dict = {} monitor_dict["lr"] = learning_rate #### load pretrained models scaffold_fn = model_utils.init_from_checkpoint(FLAGS) #### Constucting training TPUEstimatorSpec with new cache. if FLAGS.use_tpu: host_call = function_builder.construct_scalar_host_call( monitor_dict=monitor_dict, model_dir=FLAGS.model_dir, prefix="train/", reduce_fn=tf.reduce_mean) train_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, host_call=host_call, scaffold_fn=scaffold_fn) else: train_spec = tf.estimator.EstimatorSpec(mode=mode, loss=total_loss, train_op=train_op) return train_spec