def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
  """The `model_fn` for TPUEstimator."""
  tf.logging.info("*** Features ***")
  for name in sorted(features.keys()):
    tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

  input_ids = features["input_ids"]
  input_mask = features["input_mask"]
  segment_ids = features["segment_ids"]

  embeddings = self._create_model(model_config, run_config, input_ids,
                                  input_mask, segment_ids, model_type)

  scaffold_fn = model_utils.init_from_checkpoint(FLAGS)

  output_spec = None
  if mode == tf.estimator.ModeKeys.TRAIN:
    # Dummy scalar loss so a train_op can be built; this model_fn is only
    # meaningful in predict mode, where the embeddings are returned.
    loss = tf.Variable(0.0, name="loss", dtype=tf.float32)
    train_op, _, _ = model_utils.get_train_op(FLAGS, loss)
    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode, loss=loss, train_op=train_op, scaffold_fn=scaffold_fn)
  else:
    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        predictions={"embeddings": embeddings},
        scaffold_fn=scaffold_fn)
  return output_spec
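# A minimal sketch (an addition, not part of the original source) of how a
# `model_fn` like the one above is typically handed to TPUEstimator.  The
# model_dir path and batch sizes below are illustrative assumptions only.
import tensorflow as tf

def build_estimator(model_fn):
  tpu_config = tf.contrib.tpu.TPUConfig(iterations_per_loop=1000)
  run_config = tf.contrib.tpu.RunConfig(
      model_dir="/tmp/model_dir",        # hypothetical path
      save_checkpoints_steps=1000,
      tpu_config=tpu_config)
  # TPUEstimator falls back to CPU/GPU execution when use_tpu=False.
  return tf.contrib.tpu.TPUEstimator(
      use_tpu=False,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=32,
      predict_batch_size=8)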
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
  """The `model_fn` for TPUEstimator."""
  def metric_fn(label_ids, predict_ids):
    precision = tf.metrics.precision(labels=label_ids, predictions=predict_ids)
    recall = tf.metrics.recall(labels=label_ids, predictions=predict_ids)
    metric = {
        "precision": precision,
        "recall": recall,
    }
    return metric

  tf.logging.info("*** Features ***")
  for name in sorted(features.keys()):
    tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

  input_ids = features["input_ids"]
  input_masks = features["input_masks"]
  segment_ids = features["segment_ids"]
  label_ids = features["label_ids"] if mode in [
      tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL
  ] else None

  loss, predict_ids = self._create_model(model_config, run_config, input_ids,
                                          input_masks, segment_ids, label_ids,
                                          label_list, mode)

  scaffold_fn = model_utils.init_from_checkpoint(FLAGS)

  output_spec = None
  if mode == tf.estimator.ModeKeys.TRAIN:
    train_op, _, _ = model_utils.get_train_op(FLAGS, loss)
    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode, loss=loss, train_op=train_op, scaffold_fn=scaffold_fn)
  elif mode == tf.estimator.ModeKeys.EVAL:
    masked_label_ids = self._get_masked_data(label_ids, label_list)
    masked_predict_ids = self._get_masked_data(predict_ids, label_list)
    eval_metrics = (metric_fn, [masked_label_ids, masked_predict_ids])
    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode, loss=loss, eval_metrics=eval_metrics, scaffold_fn=scaffold_fn)
  else:
    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        predictions={"predict": predict_ids},
        scaffold_fn=scaffold_fn)
  return output_spec
def model_fn(features, labels, mode, params):
  """doc."""
  #### Training or Evaluation
  is_training = mode == tf.estimator.ModeKeys.TRAIN
  assert is_training

  #### Retrieve `mems` from `params["cache"]`
  mems = {}
  idx = 0
  if FLAGS.mem_len > 0:
    mems['mems'] = params['cache']

  #### Get loss from inputs
  total_loss, new_mems, monitor_dict = function_builder.get_loss(
      FLAGS, features, labels, mems, is_training)

  #### Turn `new_mems` into `new_cache`
  new_cache = []
  if FLAGS.mem_len > 0:
    new_cache += new_mems['mems']

  #### Check model parameters
  num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
  tf.logging.info('#params: {}'.format(num_params))

  #### Configuring the optimizer
  train_op, learning_rate, gnorm = model_utils.get_train_op(FLAGS, total_loss)
  monitor_dict['lr'] = learning_rate
  monitor_dict['gnorm'] = gnorm

  #### Customized initial checkpoint
  scaffold_fn = model_utils.init_from_checkpoint(FLAGS, global_vars=True)

  #### Creating host calls
  host_call = function_builder.construct_scalar_host_call(
      monitor_dict=monitor_dict,
      model_dir=FLAGS.model_dir,
      prefix='train/',
      reduce_fn=tf.reduce_mean)

  #### Constructing training TPUEstimatorSpec with new cache.
  train_spec = tf.contrib.tpu.TPUEstimatorSpec(
      mode=mode,
      loss=total_loss,
      train_op=train_op,
      host_call=host_call,
      scaffold_fn=scaffold_fn)

  train_spec.cache = new_cache

  return train_spec
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
  """The `model_fn` for TPUEstimator."""
  def metric_fn(sent_label_ids, sent_predict_ids):
    sent_accuracy = tf.metrics.accuracy(
        labels=sent_label_ids, predictions=sent_predict_ids)
    metric = {
        "sent_accuracy": sent_accuracy,
    }
    return metric

  tf.logging.info("*** Features ***")
  for name in sorted(features.keys()):
    tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

  input_ids = features["input_ids"]
  input_masks = features["input_masks"]
  segment_ids = features["segment_ids"]
  sent_label_ids = features["sent_label_ids"] if mode in [
      tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL
  ] else None

  loss, sent_predict_ids, sent_predict_scores, sent_predict_probs = self._create_model(
      input_ids, input_masks, segment_ids, sent_label_ids, sent_label_list, mode)

  scaffold_fn = model_utils.init_from_checkpoint(FLAGS)

  output_spec = None
  if mode == tf.estimator.ModeKeys.TRAIN:
    train_op, _, _ = model_utils.get_train_op(FLAGS, loss)
    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode, loss=loss, train_op=train_op, scaffold_fn=scaffold_fn)
  elif mode == tf.estimator.ModeKeys.EVAL:
    eval_metrics = (metric_fn, [sent_label_ids, sent_predict_ids])
    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode, loss=loss, eval_metrics=eval_metrics, scaffold_fn=scaffold_fn)
  else:
    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        predictions={
            "sent_predict_id": sent_predict_ids,
            "sent_predict_score": sent_predict_scores,
            "sent_predict_prob": sent_predict_probs
        },
        scaffold_fn=scaffold_fn)
  return output_spec
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
  """The `model_fn` for TPUEstimator."""
  unique_ids = features["unique_ids"]
  inp = tf.transpose(features["input_ids"], [1, 0])
  seg_id = tf.transpose(features["segment_ids"], [1, 0])
  inp_mask = tf.transpose(features["input_mask"], [1, 0])

  xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
  # no need for dropout in prediction mode
  xlnet_config.dropout = 0.0
  xlnet_config.dropatt = 0.0

  run_config = xlnet.create_run_config(False, True, FLAGS)
  # no need for dropout in prediction mode
  run_config.dropout = 0.0
  run_config.dropatt = 0.0

  xlnet_model = xlnet.XLNetModel(
      xlnet_config=xlnet_config,
      run_config=run_config,
      input_ids=inp,
      seg_ids=seg_id,
      input_mask=inp_mask)

  # Check model parameters
  num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
  tf.logging.info('#params: {}'.format(num_params))

  # load pretrained models
  scaffold_fn = init_from_checkpoint(FLAGS)

  # Get a sequence output
  seq_out = xlnet_model.get_sequence_output()
  tokens = tf.transpose(seq_out, [1, 0, 2])

  predictions = {
      "unique_id": unique_ids,
      'tokens': tokens,
      'input_mask': tf.transpose(inp_mask, [1, 0])
  }

  if FLAGS.use_tpu:
    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
  else:
    output_spec = tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
  return output_spec
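# Usage sketch (an addition, not part of the original source): collecting the
# per-token embeddings that the predict-mode `model_fn` above emits.  The
# `estimator` and `predict_input_fn` objects are assumed to be built elsewhere.
import numpy as np

def collect_embeddings(estimator, predict_input_fn):
  all_embeddings = {}
  for result in estimator.predict(input_fn=predict_input_fn):
    unique_id = int(result["unique_id"])
    tokens = result["tokens"]        # [seq_len, hidden] for one example
    mask = np.asarray(result["input_mask"])  # XLNet convention: 1.0 marks padding
    all_embeddings[unique_id] = tokens[mask < 0.5]  # keep non-padded positions
  return all_embeddings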
def model_fn(features, labels, mode, params):
  """doc."""
  #### Training or Evaluation
  is_eval = mode == tf.estimator.ModeKeys.EVAL
  assert is_eval

  #### Retrieve `mems` from `params["cache"]`
  mems = {}
  idx = 0
  if FLAGS.mem_len > 0:
    mems['mems'] = params['cache']

  #### Get loss from inputs
  total_loss, total_accuracy, new_mems, monitor_dict = custom_function_builder.get_loss(
      FLAGS, features, labels, mems, False)

  #### Turn `new_mems` into `new_cache`
  new_cache = []
  if FLAGS.mem_len > 0:
    new_cache += new_mems['mems']

  #### Check model parameters
  num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
  tf.logging.info('#params: {}'.format(num_params))

  #### Configuring the optimizer
  train_op, learning_rate, gnorm = model_utils.get_train_op(FLAGS, total_loss)
  monitor_dict['lr'] = learning_rate
  monitor_dict['gnorm'] = gnorm

  #### Customized initial checkpoint
  scaffold_fn = model_utils.init_from_checkpoint(FLAGS, global_vars=True)

  # def metric_fn(accuracy):
  #   return
  #
  # eval_metrics = (metric_fn, [total_accuracy])

  # Note: `eval_metric_ops` expects (value_tensor, update_op) pairs, as
  # returned by the `tf.metrics.*` functions.
  output_spec = tf.estimator.EstimatorSpec(
      mode=mode,
      loss=total_loss,
      train_op=train_op,
      eval_metric_ops={'accuracy': total_accuracy},
      scaffold=scaffold_fn)

  return output_spec
def model_fn(features, labels, mode, params):
  #### Training or Evaluation
  is_training = (mode == tf.estimator.ModeKeys.TRAIN)

  #### Get loss from inputs
  #********************************************************************************************#
  bsz_per_core = tf.shape(features["input_ids"])[0]

  inp = tf.transpose(features["input_ids"], [1, 0])
  seg_id = tf.transpose(features["segment_ids"], [1, 0])
  inp_mask = tf.transpose(features["input_mask"], [1, 0])
  label_ids = features["label_ids"]

  xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
  run_config = xlnet.create_run_config(is_training, True, FLAGS)

  xlnet_model = xlnet.XLNetModel(
      xlnet_config=xlnet_config,
      run_config=run_config,
      input_ids=inp,
      seg_ids=seg_id,
      input_mask=inp_mask)

  # summary = xlnet_model.get_pooled_out(FLAGS.summary_type, FLAGS.use_summ_proj)

  # Get the token-level embeddings: [batch_size, seq_length, embedding_size]
  xlnet_model_out = xlnet_model.get_sequence_output()
  embedding = tf.transpose(xlnet_model_out, [1, 0, 2])
  max_seq_length = embedding.shape[1].value

  # Compute the true (unpadded) sequence lengths
  used = tf.sign(tf.abs(features["input_ids"]))
  lengths = tf.reduce_sum(used, reduction_indices=1)  # [batch_size] vector with the length of each sequence in the batch

  # Add the CRF output layer
  blstm_crf = BLSTM_CRF(
      embedded_chars=embedding,
      hidden_unit=10,
      cell_type="lstm",
      num_layers=1,
      dropout_rate=0.5,
      initializers=initializers,
      num_labels=n_class,
      seq_length=max_seq_length,
      labels=label_ids,
      lengths=lengths,
      is_training=is_training)
  total_loss, logits, trans, pred_ids = blstm_crf.add_blstm_crf_layer(crf_only=True)
  #********************************************************************************************#

  #### Check model parameters
  num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
  tf.logging.info('#params: {}'.format(num_params))

  #### load pretrained models
  scaffold_fn = model_utils.init_from_checkpoint(FLAGS)

  #### Evaluation mode
  if mode == tf.estimator.ModeKeys.EVAL:
    def metric_fn(label_ids, pred_ids):
      return {
          "eval_loss": tf.metrics.mean_squared_error(
              labels=label_ids, predictions=pred_ids),
      }

    eval_metrics = metric_fn(features["label_ids"], pred_ids)
    eval_spec = tf.estimator.EstimatorSpec(
        mode=mode, loss=total_loss, eval_metric_ops=eval_metrics)
    return eval_spec
  elif mode == tf.estimator.ModeKeys.PREDICT:
    predictions = {
        "logits": logits,
        "labels": label_ids,
        "pred_ids": pred_ids,
        "input_mask": features["input_mask"]
    }
    output_spec = tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
    return output_spec

  #### Configuring the optimizer
  train_op, learning_rate, _ = model_utils.get_train_op(FLAGS, total_loss)

  monitor_dict = {}
  monitor_dict["lr"] = learning_rate

  #### Constructing training EstimatorSpec
  train_spec = tf.estimator.EstimatorSpec(
      mode=mode, loss=total_loss, train_op=train_op)
  return train_spec
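# Small illustration (an addition, not in the original) of the sequence-length
# trick used above: tf.sign(tf.abs(input_ids)) is 1 for any non-zero id, so
# summing over the time axis gives the true length of each padded sequence.
# This assumes id 0 is the padding token.
import numpy as np

input_ids = np.array([[17, 5, 9, 0, 0],
                      [3, 8, 0, 0, 0]])
lengths = np.sign(np.abs(input_ids)).sum(axis=1)
print(lengths)  # -> [3 2]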
def train(ps_device): # Get input function and model function train_input_fn, record_info_dict = data_utils.get_input_fn( tfrecord_dir=FLAGS.record_info_dir, split="train", bsz_per_host=FLAGS.train_batch_size, seq_len=FLAGS.seq_len, reuse_len=FLAGS.reuse_len, bi_data=FLAGS.bi_data, num_hosts=1, num_core_per_host=1, # set to one no matter how many GPUs perm_size=FLAGS.perm_size, mask_alpha=FLAGS.mask_alpha, mask_beta=FLAGS.mask_beta, uncased=FLAGS.uncased, num_passes=FLAGS.num_passes, use_bfloat16=FLAGS.use_bfloat16, num_predict=FLAGS.num_predict) # for key, info in record_info_dict.items(): tf.logging.info("num of batches {}".format(record_info_dict["num_batch"])) # Create input tensors / placeholders bsz_per_core = FLAGS.train_batch_size // FLAGS.num_core_per_host params = { "batch_size": FLAGS.train_batch_size # the whole batch } train_set = train_input_fn(params) example = train_set.make_one_shot_iterator().get_next() if FLAGS.num_core_per_host > 1: examples = [{} for _ in range(FLAGS.num_core_per_host)] for key in example.keys(): vals = tf.split(example[key], FLAGS.num_core_per_host, 0) for device_id in range(FLAGS.num_core_per_host): examples[device_id][key] = vals[device_id] else: examples = [example] # Create computational graph tower_mems, tower_losses, tower_new_mems, tower_grads_and_vars = [], [], [], [] for i in range(FLAGS.num_core_per_host): reuse = True if i > 0 else None with tf.device(assign_to_gpu(i, ps_device)), \ tf.variable_scope(tf.get_variable_scope(), reuse=reuse): # The mems for each tower is a dictionary mems_i = {} if FLAGS.mem_len: mems_i["mems"] = create_mems_tf(bsz_per_core) loss_i, new_mems_i, grads_and_vars_i = single_core_graph( is_training=True, features=examples[i], mems=mems_i) tower_mems.append(mems_i) tower_losses.append(loss_i) tower_new_mems.append(new_mems_i) tower_grads_and_vars.append(grads_and_vars_i) # average losses and gradients across towers if len(tower_losses) > 1: loss = tf.add_n(tower_losses) / len(tower_losses) grads_and_vars = average_grads_and_vars(tower_grads_and_vars) else: loss = tower_losses[0] grads_and_vars = tower_grads_and_vars[0] # get train op train_op, learning_rate, gnorm = model_utils.get_train_op(FLAGS, None, grads_and_vars=grads_and_vars) global_step = tf.train.get_global_step() # Training loop # initialize mems tower_mems_np = [] for i in range(FLAGS.num_core_per_host): mems_i_np = {} for key in tower_mems[i].keys(): mems_i_np[key] = initialize_mems_np(bsz_per_core) tower_mems_np.append(mems_i_np) saver = tf.train.Saver() gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.97)#allow_growth=True) model_utils.init_from_checkpoint(FLAGS, global_vars=True) with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False, gpu_options=gpu_options)) as sess: sess.run(tf.global_variables_initializer()) sess.graph.finalize() run_metadata = tf.RunMetadata() options = tf.RunOptions(trace_level=tf.RunOptions.SOFTWARE_TRACE) dot_rep = graph_to_dot(tf.get_default_graph()) # s = Source(dot_rep, filename="test.gv", format="PNG") with open('profs/xln.dot', 'w') as fwr: fwr.write(str(dot_rep)) operations_tensors = {} operations_attributes = {} operations_names = tf.get_default_graph().get_operations() count1 = 0 count2 = 0 for operation in operations_names: operation_name = operation.name operations_info = tf.get_default_graph( ).get_operation_by_name(operation_name).values() try: operations_attributes[operation_name] = [] operations_attributes[operation_name].append( operation.type) 
operations_attributes[operation_name].append(tf.get_default_graph( ).get_tensor_by_name(operation_name + ':0').dtype._is_ref_dtype) except: pass if len(operations_info) > 0: if not (operations_info[0].shape.ndims is None): operation_shape = operations_info[0].shape.as_list( ) operation_dtype_size = operations_info[0].dtype.size if not (operation_dtype_size is None): operation_no_of_elements = 1 for dim in operation_shape: if not(dim is None): operation_no_of_elements = operation_no_of_elements * dim total_size = operation_no_of_elements * operation_dtype_size operations_tensors[operation_name] = total_size else: count1 = count1 + 1 else: count1 = count1 + 1 operations_tensors[operation_name] = -1 # print('no shape_1: ' + operation_name) # print('no shape_2: ' + str(operations_info)) # operation_namee = operation_name + ':0' # tensor = tf.get_default_graph().get_tensor_by_name(operation_namee) # print('no shape_3:' + str(tf.shape(tensor))) # print('no shape:' + str(tensor.get_shape())) else: # print('no info :' + operation_name) # operation_namee = operation.name + ':0' count2 = count2 + 1 operations_tensors[operation_name] = -1 # try: # tensor = tf.get_default_graph().get_tensor_by_name(operation_namee) # print(tensor) # print(tf.shape(tensor)) # except: # print('no tensor: ' + operation_namee) print(count1) print(count2) with open('./profs/tensors_sz_32.txt', 'w') as f: for tensor, size in operations_tensors.items(): f.write('"' + tensor + '"::' + str(size) + '\n') with open('./profs/operations_attributes.txt', 'w') as f: for op, attrs in operations_attributes.items(): strr = op for attr in attrs: strr += '::' + str(attr) strr += '\n' f.write(strr) fetches = [loss, tower_new_mems, global_step, gnorm, learning_rate, train_op] iter = 0 total_loss, prev_step = 0., -1 while True: feed_dict = {} for i in range(FLAGS.num_core_per_host): for key in tower_mems_np[i].keys(): for m, m_np in zip(tower_mems[i][key], tower_mems_np[i][key]): feed_dict[m] = m_np if iter % 10 == 7 or iter == 0: fetched = sess.run(fetches, feed_dict=feed_dict, options=options, run_metadata=run_metadata) #if iter > 0: profile(run_metadata, iter) else: t0 = time.time() fetched = sess.run(fetches, feed_dict=feed_dict) print(time.time() - t0) if iter == 0: mem_options = tf.profiler.ProfileOptionBuilder.time_and_memory() mem_options["min_bytes"] = 0 mem_options["min_micros"] = 0 mem_options["output"] = 'file:outfile=./profs/mem.txt' mem_options["select"] = ("bytes", "peak_bytes", "output_bytes", "residual_bytes") mem = tf.profiler.profile( tf.Graph(), run_meta=run_metadata, cmd="scope", options=mem_options) with open('profs/mem2.txt', 'w') as f: f.write(str(mem)) iter += 1 loss_np, tower_mems_np, curr_step = fetched[:3] total_loss += loss_np if curr_step > 0 and curr_step % FLAGS.iterations == 0: curr_loss = total_loss / (curr_step - prev_step) tf.logging.info("[{}] | gnorm {:.2f} lr {:8.6f} " "| loss {:.2f} | pplx {:>7.2f}, bpc {:>7.4f}".format( curr_step, fetched[-3], fetched[-2], curr_loss, math.exp(curr_loss), curr_loss / math.log(2))) total_loss, prev_step = 0., curr_step if curr_step > 0 and curr_step % FLAGS.save_steps == 0: save_path = os.path.join(FLAGS.model_dir, "model.ckpt") saver.save(sess, save_path) tf.logging.info("Model saved in path: {}".format(save_path)) if curr_step >= FLAGS.train_steps: break
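# Quick reference (an addition, not in the original) for the quantities printed
# in the training log above: for an average cross-entropy `loss` measured in
# nats, perplexity is exp(loss) and bits-per-token is loss / ln(2).
import math

loss = 2.3  # example average loss in nats
print("pplx = {:.2f}".format(math.exp(loss)))      # ~9.97
print("bpc  = {:.4f}".format(loss / math.log(2)))  # ~3.3185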
def model_fn(features, labels, mode, params): #### Training or Evaluation is_training = (mode == tf.estimator.ModeKeys.TRAIN) total_loss, per_example_loss, logits = function_builder.get_race_loss( FLAGS, features, is_training) #### Check model parameters num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()]) logger.info('#params: {}'.format(num_params)) #### load pretrained models scaffold_fn = model_utils.init_from_checkpoint(FLAGS) #### Evaluation mode if mode == tf.estimator.ModeKeys.EVAL: assert FLAGS.num_hosts == 1 def metric_fn(per_example_loss, label_ids, logits, is_real_example): predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) eval_input_dict = { 'labels': label_ids, 'predictions': predictions, 'weights': is_real_example } accuracy = tf.metrics.accuracy(**eval_input_dict) loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example) return { 'eval_accuracy': accuracy, 'eval_loss': loss} is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) #### Constucting evaluation TPUEstimatorSpec with new cache. label_ids = tf.reshape(features['label_ids'], [-1]) metric_args = [per_example_loss, label_ids, logits, is_real_example] if FLAGS.use_tpu: eval_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, eval_metrics=(metric_fn, metric_args), scaffold_fn=scaffold_fn) else: eval_spec = tf.estimator.EstimatorSpec( mode=mode, loss=total_loss, eval_metric_ops=metric_fn(*metric_args)) return eval_spec #### Configuring the optimizer train_op, learning_rate, _ = model_utils.get_train_op(FLAGS, total_loss) monitor_dict = {} monitor_dict["lr"] = learning_rate #### Constucting training TPUEstimatorSpec with new cache. if FLAGS.use_tpu: #### Creating host calls host_call = None train_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, host_call=host_call, scaffold_fn=scaffold_fn) else: train_spec = tf.estimator.EstimatorSpec( mode=mode, loss=total_loss, train_op=train_op) return train_spec
def model_fn(features, labels, mode, params): # ### Training or Evaluation is_training = (mode == tf.estimator.ModeKeys.TRAIN) return_dict = function_builder.get_classification_outputs( FLAGS, features, is_training) # per_example_loss = return_dict["per_example_loss"] cls_logits = return_dict["cls_logits"] # ### Check model parameters num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()]) logger.info('#params: {}'.format(num_params)) # ### load pretrained models scaffold_fn = model_utils.init_from_checkpoint(FLAGS) if mode == tf.estimator.ModeKeys.PREDICT: # label_ids = tf.reshape(features["cls"], [-1]) predictions = { "feature_id": features["feature_id"], "cls_logits": cls_logits, # "cls": label_ids, } if FLAGS.use_tpu: output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) else: output_spec = tf.estimator.EstimatorSpec( mode=mode, predictions=predictions) return output_spec def compute_loss(log_probs, positions, depth): one_hot_positions = tf.one_hot(positions, depth=depth, dtype=tf.float32) loss = -tf.reduce_sum(one_hot_positions * log_probs, axis=-1) loss = tf.reduce_mean(loss) return loss cls_log_probs = return_dict["cls_log_probs"] num_choices = FLAGS.num_choices if num_choices: num_classes = num_choices else: num_classes = FLAGS.num_classes total_loss = compute_loss(cls_log_probs, features["cls"], depth=num_classes) # ### Configuring the optimizer train_op, learning_rate, _ = model_utils.get_train_op( FLAGS, total_loss) monitor_dict = {'loss/cls': total_loss, "lr": learning_rate} # ### Constucting training TPUEstimatorSpec with new cache. if FLAGS.use_tpu: # ### Creating host calls if not FLAGS.is_regression: label_ids = tf.reshape(features['cls'], [-1]) predictions = tf.argmax(cls_logits, axis=-1, output_type=label_ids.dtype) is_correct = tf.equal(predictions, label_ids) accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32)) monitor_dict["accuracy"] = accuracy host_call = function_builder.construct_scalar_host_call( monitor_dict=monitor_dict, model_dir=FLAGS.model_dir, prefix="train/", reduce_fn=tf.reduce_mean) else: host_call = None train_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, host_call=host_call, scaffold_fn=scaffold_fn) else: train_spec = tf.estimator.EstimatorSpec(mode=mode, loss=total_loss, train_op=train_op) return train_spec
def model_fn(features, labels, mode, params): #### Training or Evaluation is_training = (mode == tf.estimator.ModeKeys.TRAIN) #### Get loss from inputs outputs = function_builder.get_qa_outputs(FLAGS, features, is_training) #### Check model parameters num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()]) tf.logging.info('#params: {}'.format(num_params)) scaffold_fn = None #### Evaluation mode if mode == tf.estimator.ModeKeys.PREDICT: if FLAGS.init_checkpoint: tf.logging.info("init_checkpoint not being used in predict mode.") predictions = { "unique_ids": features["unique_ids"], "start_top_index": outputs["start_top_index"], "start_top_log_probs": outputs["start_top_log_probs"], "end_top_index": outputs["end_top_index"], "end_top_log_probs": outputs["end_top_log_probs"], "cls_logits": outputs["cls_logits"] } if FLAGS.use_tpu: output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) else: output_spec = tf.estimator.EstimatorSpec( mode=mode, predictions=predictions) return output_spec ### Compute loss seq_length = tf.shape(features["input_ids"])[1] def compute_loss(log_probs, positions): one_hot_positions = tf.one_hot( positions, depth=seq_length, dtype=tf.float32) loss = - tf.reduce_sum(one_hot_positions * log_probs, axis=-1) loss = tf.reduce_mean(loss) return loss start_loss = compute_loss( outputs["start_log_probs"], features["start_positions"]) end_loss = compute_loss( outputs["end_log_probs"], features["end_positions"]) total_loss = (start_loss + end_loss) * 0.5 cls_logits = outputs["cls_logits"] is_impossible = tf.reshape(features["is_impossible"], [-1]) regression_loss = tf.nn.sigmoid_cross_entropy_with_logits( labels=is_impossible, logits=cls_logits) regression_loss = tf.reduce_mean(regression_loss) # note(zhiliny): by default multiply the loss by 0.5 so that the scale is # comparable to start_loss and end_loss total_loss += regression_loss * 0.5 #### Configuring the optimizer train_op, learning_rate, _ = model_utils.get_train_op(FLAGS, total_loss) monitor_dict = {} monitor_dict["lr"] = learning_rate #### load pretrained models scaffold_fn = model_utils.init_from_checkpoint(FLAGS) #### Constucting training TPUEstimatorSpec with new cache. if FLAGS.use_tpu: host_call = function_builder.construct_scalar_host_call( monitor_dict=monitor_dict, model_dir=FLAGS.model_dir, prefix="train/", reduce_fn=tf.reduce_mean) train_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, host_call=host_call, scaffold_fn=scaffold_fn) else: train_spec = tf.estimator.EstimatorSpec( mode=mode, loss=total_loss, train_op=train_op) return train_spec
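# Added note (not in the original): the one-hot construction in `compute_loss`
# above simply picks out the log-probability at the gold position, i.e. it is
# the usual negative log-likelihood averaged over the batch.  A tiny numpy check:
import numpy as np

log_probs = np.log([[0.7, 0.2, 0.1],
                    [0.1, 0.1, 0.8]])
positions = np.array([0, 2])
one_hot = np.eye(3)[positions]
loss = -(one_hot * log_probs).sum(axis=-1).mean()
print(np.isclose(loss, -(np.log(0.7) + np.log(0.8)) / 2))  # True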
def train(ps_device): ##### Get input function and model function train_input_fn, record_info_dict = data_utils.get_input_fn( info_dir=os.path.join(FLAGS.record_info_dir, "train"), split="train", bsz_per_host=FLAGS.train_batch_size, seq_len=FLAGS.seq_len, reuse_len=FLAGS.reuse_len, bi_data=FLAGS.bi_data, num_hosts=1, num_core_per_host=1, # set to one no matter how many GPUs perm_size=FLAGS.perm_size, mask_alpha=FLAGS.mask_alpha, mask_beta=FLAGS.mask_beta, use_bfloat16=FLAGS.use_bfloat16, num_predict=FLAGS.num_predict) valid_input_fn, record_info_dict_valid = data_utils.get_input_fn( info_dir=os.path.join(FLAGS.record_info_dir, "valid"), split="valid", bsz_per_host=FLAGS.train_batch_size, seq_len=FLAGS.seq_len, reuse_len=FLAGS.reuse_len, bi_data=FLAGS.bi_data, num_hosts=1, num_core_per_host=1, perm_size=FLAGS.perm_size, mask_alpha=FLAGS.mask_alpha, mask_beta=FLAGS.mask_beta, use_bfloat16=FLAGS.use_bfloat16, num_predict=FLAGS.num_predict) # for key, info in record_info_dict.items(): num_train_batches = record_info_dict["num_batch"] tf.logging.info("num of train batches {}".format( record_info_dict["num_batch"])) tf.logging.info("num of validation batches {}".format( record_info_dict_valid["num_batch"])) ##### Create input tensors / placeholders bsz_per_core = FLAGS.train_batch_size // FLAGS.num_core_per_host params = { "batch_size": FLAGS.train_batch_size # the whole batch } train_set = train_input_fn(params) valid_set = valid_input_fn(params) t_iter = train_set.make_initializable_iterator() example = t_iter.get_next() v_iter = valid_set.make_initializable_iterator() v_example = v_iter.get_next() if FLAGS.num_core_per_host > 1: # train set examples = [{} for _ in range(FLAGS.num_core_per_host)] for key in example.keys(): vals = tf.split(example[key], FLAGS.num_core_per_host, 0) for device_id in range(FLAGS.num_core_per_host): examples[device_id][key] = vals[device_id] # validation set v_examples = [{} for _ in range(FLAGS.num_core_per_host)] for key in v_example.keys(): vals = tf.split(v_example[key], FLAGS.num_core_per_host, 0) for device_id in range(FLAGS.num_core_per_host): v_examples[device_id][key] = vals[device_id] else: examples = [example] v_examples = [v_example] ##### Create computational graph tower_mems, tower_losses, tower_new_mems, tower_grads_and_vars = [], [], [], [] v_tower_mems, v_tower_losses, v_tower_new_mems = [], [], [] for i in range(FLAGS.num_core_per_host): reuse = True if i > 0 else None with tf.device(assign_to_gpu(i, ps_device)), \ tf.variable_scope(tf.get_variable_scope(), reuse=reuse): # The mems for each tower is a dictionary mems_i = {} v_mems_i = {} if FLAGS.mem_len: mems_i["mems"] = create_mems_tf(bsz_per_core) v_mems_i["mems"] = create_mems_tf(bsz_per_core) loss_i, new_mems_i, grads_and_vars_i = single_core_graph( is_training=True, features=examples[i], mems=mems_i) v_loss_i, v_new_mems_i = single_core_graph(is_training=False, features=v_examples[i], mems=v_mems_i) tower_mems.append(mems_i) tower_losses.append(loss_i) tower_new_mems.append(new_mems_i) tower_grads_and_vars.append(grads_and_vars_i) v_tower_mems.append(v_mems_i) v_tower_losses.append(v_loss_i) v_tower_new_mems.append(v_new_mems_i) ## average losses and gradients across towers if len(tower_losses) > 1: loss = tf.add_n(tower_losses) / len(tower_losses) grads_and_vars = average_grads_and_vars(tower_grads_and_vars) else: loss = tower_losses[0] grads_and_vars = tower_grads_and_vars[0] if len(v_tower_losses) > 1: v_loss = tf.add_n(v_tower_losses) / len(v_tower_losses) else: v_loss = 
v_tower_losses[0] ## get train op train_op, learning_rate, gnorm = model_utils.get_train_op( FLAGS, None, num_train_batches, grads_and_vars=grads_and_vars) global_step = tf.train.get_global_step() ##### Training loop # initialize mems tower_mems_np = [] v_tower_mems_np = [] for i in range(FLAGS.num_core_per_host): mems_i_np = {} v_mems_i_np = {} for key in tower_mems[i].keys(): mems_i_np[key] = initialize_mems_np(bsz_per_core) v_mems_i_np[key] = initialize_mems_np(bsz_per_core) tower_mems_np.append(mems_i_np) v_tower_mems_np.append(v_mems_i_np) saver = tf.train.Saver() gpu_options = tf.GPUOptions(allow_growth=True) model_utils.init_from_checkpoint(FLAGS, global_vars=True) # Create performance summaries for Tensorboard logging training_performance_summaries, valid_performance_summaries = tb.tensorboard_setup( ) with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)) as sess: sess.run(tf.global_variables_initializer()) # variables that are run in the session fetches = [ loss, tower_new_mems, global_step, gnorm, learning_rate, train_op ] v_fetches = [v_loss, v_tower_new_mems] # Create writers for Tensorboard logging info_dict = { "id": FLAGS.run_id, "n_layers": FLAGS.n_layers, "d_model": FLAGS.d_model, "n_heads": FLAGS.n_head } train_summary_writer, valid_summary_writer = tb.create_writers( sess, info_dict, logging_dir=FLAGS.tb_logging_dir) total_loss, prev_step = 0., -1 for i in range(FLAGS.epochs): # Train loop try: sess.run(t_iter.initializer) while True: feed_dict = {} for i in range(FLAGS.num_core_per_host): for key in tower_mems_np[i].keys(): for m, m_np in zip(tower_mems[i][key], tower_mems_np[i][key]): feed_dict[m] = m_np fetched = sess.run(fetches, feed_dict=feed_dict) loss_np, tower_mems_np, curr_step = fetched[:3] total_loss += loss_np print(curr_step) # Log training progress if curr_step > 0 and curr_step % FLAGS.log_steps == 0: curr_loss = total_loss / (curr_step - prev_step) summ = tb.run_train(sess, training_performance_summaries, curr_loss) train_summary_writer.add_summary(summ, curr_step) tf.logging.info( "[{}] | gnorm {:.2f} lr {:8.6f} " "| loss {:.2f} | pplx {:>7.2f}, bpc {:>7.4f}". format(curr_step, fetched[-3], fetched[-2], curr_loss, math.exp(curr_loss), curr_loss / math.log(2))) total_loss, prev_step = 0., curr_step # Save checkpoint if curr_step > 0 and FLAGS.save_steps is not None and curr_step % FLAGS.save_steps == 0: save_path = os.path.join(FLAGS.model_dir, "model.ckpt") saver.save(sess, save_path) tf.logging.info( "Model saved in path: {}".format(save_path)) except tf.errors.OutOfRangeError: pass # Validation loop try: sess.run(v_iter.initializer) v_total_loss, v_steps = 0., 0 while True: v_feed_dict = {} for i in range(FLAGS.num_core_per_host): for key in v_tower_mems_np[i].keys(): for m, m_np in zip(v_tower_mems[i][key], v_tower_mems_np[i][key]): v_feed_dict[m] = m_np v_fetched = sess.run(v_fetches, feed_dict=v_feed_dict) v_loss_np, v_tower_mems_np = v_fetched[:] v_total_loss += v_loss_np v_steps += 1 except tf.errors.OutOfRangeError: val_loss = v_total_loss / v_steps v_pplx = math.exp(val_loss) tf.logging.info( "Validation: [{}] | loss {:.2f} | pplx {:>7.2f}".format( curr_step, val_loss, v_pplx)) summ_valid = tb.run_valid(sess, valid_performance_summaries, val_loss, v_pplx) valid_summary_writer.add_summary(summ_valid, curr_step) tf.logging.info("------------ Epoch {} ------------".format(i))
def model_fn(features, labels, mode, params): #### Training or Evaluation is_training = (mode == tf.estimator.ModeKeys.TRAIN) #### Get loss from inputs if FLAGS.is_regression: (total_loss, per_example_loss, logits) = function_builder.get_regression_loss( FLAGS, features, is_training) else: (total_loss, per_example_loss, logits) = function_builder.get_classification_loss( FLAGS, features, n_class, is_training) #### Check model parameters num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()]) tf.logging.info('#params: {}'.format(num_params)) #### load pretrained models scaffold_fn = model_utils.init_from_checkpoint(FLAGS) #### Evaluation mode if mode == tf.estimator.ModeKeys.EVAL: assert FLAGS.num_hosts == 1 def metric_fn(per_example_loss, label_ids, logits, is_real_example): predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) eval_input_dict = { 'labels': label_ids, 'predictions': predictions, 'weights': is_real_example } accuracy = tf.metrics.accuracy(**eval_input_dict) loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example) f1 = tf.contrib.metrics.f1_score(label_ids, predictions) #print('Label ids object type: {}'.format(type(label_ids))) #print('Predictions object type: {}'.format(type(predictions))) ''' cm = tf.math.confusion_matrix(label_ids,predictions,num_classes=n_class) print('Converting confusion matrix into its values.') sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(tf.local_variables_initializer()) _cm = sess.run(cm) sess.close() print("Created value of confusion matrix: {}".format(_cm)) ''' ''' sess = tf.Session() #sess.run(tf.global_variables_initializer()) _cm = sess.run(cm) sess.close() print("Created value of confusion matrix: {}".format(_cm)) ''' ''' This giant part below was supposed to calculate f1 precision etc but it failed because eval() and run() gives error. Error: tensorflow.python.framework.errors_impl.FailedPreconditionError: GetNext() failed because the iterator has not been initialized. Ensure that you have run the initializer operation for this iterator before getting the next element. 
[[node IteratorGetNext (defined at content/drive/My Drive/thesis/xlnet/run_classifier.py:866 ''' ''' sess = tf.InteractiveSession() label_ids_np=label_ids.eval() predictions_np = predictions.eval() sess.close() print('Conversion succeeded.') print('Label_ids_np type: {}'.format(type(label_ids_np))) print('Predictions_np type: {}'.format(type(predictions_np))) sess = tf.InteractiveSession() print('Tf conversion: from {} to {} '.format(type(tf.constant([1,2,3])),type(tf.constant([1,2,3]).eval()))) sess.close() #precision, recall, f1, _ = precision_recall_fscore_support(label_ids, predictions, average="macro", labels=list(range(0,n_class))) #mcc = matthews_corrcoef(label_ids_np, predictions_np) sess = tf.get_default_session() with sess.as_default(): label_ids_np = label_ids.eval() predictions_np = predictions.eval() sess = tf.Session() sess.run(tf.global_variables_initializer()) label_ids_np = sess.run(label_ids) predictions_np = sess.run(predictions) sess.close() precision_macro = scu.get_precision_macro(label_ids_np,predictions_np) recall_macro = scu.get_recall_macro(label_ids_np,predictions_np) f1_macro = scu.get_f1_macro(label_ids_np,predictions_np) mcc = scu.get_mcc_score(label_ids_np,predictions_np) print(f1_macro) print(mcc) ''' return {'eval_accuracy': accuracy, 'eval_loss': loss, 'f1': f1} def regression_metric_fn(per_example_loss, label_ids, logits, is_real_example): loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example) pearsonr = tf.contrib.metrics.streaming_pearson_correlation( logits, label_ids, weights=is_real_example) return {'eval_loss': loss, 'eval_pearsonr': pearsonr} is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) #### Constucting evaluation TPUEstimatorSpec with new cache. label_ids = tf.reshape(features['label_ids'], [-1]) if FLAGS.is_regression: metric_fn = regression_metric_fn else: metric_fn = metric_fn metric_args = [ per_example_loss, label_ids, logits, is_real_example ] if FLAGS.use_tpu: eval_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, eval_metrics=(metric_fn, metric_args), scaffold_fn=scaffold_fn) else: eval_spec = tf.estimator.EstimatorSpec( mode=mode, loss=total_loss, eval_metric_ops=metric_fn(*metric_args)) return eval_spec elif mode == tf.estimator.ModeKeys.PREDICT: label_ids = tf.reshape(features["label_ids"], [-1]) predictions = { "logits": logits, "labels": label_ids, "is_real": features["is_real_example"] } if FLAGS.use_tpu: output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) else: output_spec = tf.estimator.EstimatorSpec( mode=mode, predictions=predictions) return output_spec #### Configuring the optimizer train_op, learning_rate, _ = model_utils.get_train_op( FLAGS, total_loss) monitor_dict = {} monitor_dict["lr"] = learning_rate #### Constucting training TPUEstimatorSpec with new cache. 
if FLAGS.use_tpu: #### Creating host calls if not FLAGS.is_regression: label_ids = tf.reshape(features['label_ids'], [-1]) predictions = tf.argmax(logits, axis=-1, output_type=label_ids.dtype) is_correct = tf.equal(predictions, label_ids) accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32)) monitor_dict["accuracy"] = accuracy host_call = function_builder.construct_scalar_host_call( monitor_dict=monitor_dict, model_dir=FLAGS.model_dir, prefix="train/", reduce_fn=tf.reduce_mean) else: host_call = None train_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, host_call=host_call, scaffold_fn=scaffold_fn) else: train_spec = tf.estimator.EstimatorSpec(mode=mode, loss=total_loss, train_op=train_op) return train_spec
def model_fn(features, labels, mode, params):
  # Training or Evaluation
  is_training = (mode == tf.estimator.ModeKeys.TRAIN)

  total_loss, per_example_loss, logits, label, mask = function_builder.get_crf_outputs(
      FLAGS, features, is_training)

  # Check model parameters
  num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
  tf.logging.info('#params: {}'.format(num_params))

  # predict mode
  if mode == tf.estimator.ModeKeys.PREDICT:
    predictions = {
        "logits": logits,
        "labels": label,
        'mask': features['input_mask'],
        'label_mask': mask
    }
    output_spec = tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
    return output_spec

  # Evaluation mode
  elif mode == tf.estimator.ModeKeys.EVAL:
    assert FLAGS.num_hosts == 1

    def metric_fn(per_example_loss, label_ids, logits, weight):
      eval_input_dict = {
          'labels': label_ids,
          'predictions': logits,
          'weights': weight
      }
      accuracy = tf.metrics.accuracy(**eval_input_dict)

      eval_input_dict = {
          'labels': tf.one_hot(label_ids, FLAGS.crf_classes),
          'predictions': tf.one_hot(logits, FLAGS.crf_classes),
          'weights': weight
      }
      f1 = tf.contrib.metrics.f1_score(**eval_input_dict)

      loss = tf.metrics.mean(values=per_example_loss)
      return {'eval_accuracy': accuracy, 'eval_loss': loss, 'f1': f1}

    metric_args = [per_example_loss, label, logits, mask]

    eval_spec = tf.estimator.EstimatorSpec(
        mode=mode, loss=total_loss, eval_metric_ops=metric_fn(*metric_args))
    return eval_spec

  # load pretrained models
  scaffold_fn = model_utils.init_from_checkpoint(FLAGS)

  #### Configuring the optimizer
  train_op, learning_rate, _ = model_utils.get_train_op(FLAGS, total_loss)

  train_spec = tf.estimator.EstimatorSpec(
      mode=mode, loss=total_loss, train_op=train_op)
  return train_spec
def main(_): if FLAGS.server_ip and FLAGS.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(FLAGS.server_ip, FLAGS.server_port), redirect_output=True) ptvsd.wait_for_attach() tf.set_random_seed(FLAGS.seed) np.random.seed(FLAGS.seed) tf.logging.set_verbosity(tf.logging.INFO) #### Validate flags if FLAGS.save_steps is not None: FLAGS.log_step_count_steps = min(FLAGS.log_step_count_steps, FLAGS.save_steps) if FLAGS.do_predict: predict_dir = FLAGS.predict_dir if not tf.gfile.Exists(predict_dir): tf.gfile.MakeDirs(predict_dir) processors = { "mnli_matched": MnliMatchedProcessor, "mnli_mismatched": MnliMismatchedProcessor, 'sts-b': StsbProcessor, 'imdb': ImdbProcessor, "yelp5": Yelp5Processor } if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( "At least one of `do_train`, `do_eval, `do_predict` or " "`do_submit` must be True.") if not tf.gfile.Exists(FLAGS.output_dir): tf.gfile.MakeDirs(FLAGS.output_dir) if not tf.gfile.Exists(FLAGS.model_dir): tf.gfile.MakeDirs(FLAGS.model_dir) # ########################### LOAD PT model # ########################### LOAD PT model # import torch # from pytorch_transformers import CONFIG_NAME, TF_WEIGHTS_NAME, XLNetTokenizer, XLNetConfig, XLNetForSequenceClassification # save_path = os.path.join(FLAGS.model_dir, TF_WEIGHTS_NAME) # tf.logging.info("Model loaded from path: {}".format(save_path)) # device = torch.device("cuda", 4) # config = XLNetConfig.from_pretrained('xlnet-large-cased', finetuning_task=u'sts-b') # config_path = os.path.join(FLAGS.model_dir, CONFIG_NAME) # config.to_json_file(config_path) # pt_model = XLNetForSequenceClassification.from_pretrained(FLAGS.model_dir, from_tf=True, num_labels=1) # pt_model.to(device) # pt_model = torch.nn.DataParallel(pt_model, device_ids=[4, 5, 6, 7]) # from torch.optim import Adam # optimizer = Adam(pt_model.parameters(), lr=0.001, betas=(0.9, 0.999), # eps=FLAGS.adam_epsilon, weight_decay=FLAGS.weight_decay, # amsgrad=False) # ########################### LOAD PT model # ########################### LOAD PT model task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() if not FLAGS.is_regression else None sp = spm.SentencePieceProcessor() sp.Load(FLAGS.spiece_model_file) def tokenize_fn(text): text = preprocess_text(text, lower=FLAGS.uncased) return encode_ids(sp, text) # run_config = model_utils.configure_tpu(FLAGS) # model_fn = get_model_fn(len(label_list) if label_list is not None else None) spm_basename = os.path.basename(FLAGS.spiece_model_file) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. 
# estimator = tf.estimator.Estimator( # model_fn=model_fn, # config=run_config) if FLAGS.do_train: train_file_base = "{}.len-{}.train.tf_record".format( spm_basename, FLAGS.max_seq_length) train_file = os.path.join(FLAGS.output_dir, train_file_base) tf.logging.info("Use tfrecord file {}".format(train_file)) train_examples = processor.get_train_examples(FLAGS.data_dir) tf.logging.info("Num of train samples: {}".format(len(train_examples))) file_based_convert_examples_to_features(train_examples, label_list, FLAGS.max_seq_length, tokenize_fn, train_file, FLAGS.num_passes) train_input_fn = file_based_input_fn_builder( input_file=train_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True) # estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_steps) ##### Create input tensors / placeholders bsz_per_core = FLAGS.train_batch_size // FLAGS.num_core_per_host params = { "batch_size": FLAGS.train_batch_size # the whole batch } train_set = train_input_fn(params) example = train_set.make_one_shot_iterator().get_next() if FLAGS.num_core_per_host > 1: examples = [{} for _ in range(FLAGS.num_core_per_host)] for key in example.keys(): vals = tf.split(example[key], FLAGS.num_core_per_host, 0) for device_id in range(FLAGS.num_core_per_host): examples[device_id][key] = vals[device_id] else: examples = [example] ##### Create computational graph tower_losses, tower_grads_and_vars, tower_inputs, tower_hidden_states, tower_logits = [], [], [], [], [] for i in range(FLAGS.num_core_per_host): reuse = True if i > 0 else None with tf.device(assign_to_gpu(i, "/gpu:0")), \ tf.variable_scope(tf.get_variable_scope(), reuse=reuse): loss_i, grads_and_vars_i, inputs_i, hidden_states_i, logits_i = single_core_graph( is_training=True, features=examples[i], label_list=label_list) tower_losses.append(loss_i) tower_grads_and_vars.append(grads_and_vars_i) tower_inputs.append(inputs_i) tower_hidden_states.append(hidden_states_i) tower_logits.append(logits_i) ## average losses and gradients across towers if len(tower_losses) > 1: loss = tf.add_n(tower_losses) / len(tower_losses) grads_and_vars = average_grads_and_vars(tower_grads_and_vars) inputs = dict((n, tf.concat([t[n] for t in tower_inputs], 0)) for n in tower_inputs[0]) hidden_states = list( tf.concat(t, 0) for t in zip(*tower_hidden_states)) logits = tf.concat(tower_logits, 0) else: loss = tower_losses[0] grads_and_vars = tower_grads_and_vars[0] inputs = tower_inputs[0] hidden_states = tower_hidden_states[0] logits = tower_logits[0] # Summaries merged = tf.summary.merge_all() ## get train op train_op, learning_rate, gnorm = model_utils.get_train_op( FLAGS, None, grads_and_vars=grads_and_vars) global_step = tf.train.get_global_step() ##### Training loop saver = tf.train.Saver(max_to_keep=FLAGS.max_save) gpu_options = tf.GPUOptions(allow_growth=True) #### load pretrained models model_utils.init_from_checkpoint(FLAGS, global_vars=True) writer = tf.summary.FileWriter(logdir=FLAGS.model_dir, graph=tf.get_default_graph()) with tf.Session(config=tf.ConfigProto( allow_soft_placement=True, gpu_options=gpu_options)) as sess: sess.run(tf.global_variables_initializer()) ######### ##### PYTORCH import torch from torch.optim import Adam from pytorch_transformers import CONFIG_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME, XLNetTokenizer, XLNetConfig, XLNetForSequenceClassification, BertAdam save_path = os.path.join(FLAGS.model_dir, TF_WEIGHTS_NAME + '-00') saver.save(sess, save_path) tf.logging.info("Model saved in path: {}".format(save_path)) device = 
torch.device("cuda", 4) config = XLNetConfig.from_pretrained('xlnet-large-cased', finetuning_task=u'sts-b', num_labels=1) tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') # pt_model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased', num_labels=1) pt_model = XLNetForSequenceClassification.from_pretrained( save_path, from_tf=True, config=config) pt_model.to(device) pt_model = torch.nn.DataParallel(pt_model, device_ids=[4, 5, 6, 7]) optimizer = Adam(pt_model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=FLAGS.adam_epsilon, weight_decay=FLAGS.weight_decay, amsgrad=False) # optimizer = BertAdam(pt_model.parameters(), lr=FLAGS.learning_rate, t_total=FLAGS.train_steps, warmup=FLAGS.warmup_steps / FLAGS.train_steps, # eps=FLAGS.adam_epsilon, weight_decay=FLAGS.weight_decay) ##### PYTORCH ######### fetches = [ loss, global_step, gnorm, learning_rate, train_op, merged, inputs, hidden_states, logits ] total_loss, total_loss_pt, prev_step, gnorm_pt = 0., 0., -1, 0.0 total_logits = None total_labels = None while True: feed_dict = {} # for i in range(FLAGS.num_core_per_host): # for key in tower_mems_np[i].keys(): # for m, m_np in zip(tower_mems[i][key], tower_mems_np[i][key]): # feed_dict[m] = m_np fetched = sess.run(fetches) loss_np, curr_step, gnorm_np, learning_rate_np, _, summary_np, inputs_np, hidden_states_np, logits_np = fetched total_loss += loss_np if total_logits is None: total_logits = logits_np total_labels = inputs_np['label_ids'] else: total_logits = np.append(total_logits, logits_np, axis=0) total_labels = np.append(total_labels, inputs_np['label_ids'], axis=0) ######### ##### PYTORCH f_inp = torch.tensor(inputs_np["input_ids"], dtype=torch.long, device=device) f_seg_id = torch.tensor(inputs_np["segment_ids"], dtype=torch.long, device=device) f_inp_mask = torch.tensor(inputs_np["input_mask"], dtype=torch.float, device=device) f_label = torch.tensor(inputs_np["label_ids"], dtype=torch.float, device=device) # with torch.no_grad(): # _, hidden_states_pt, _ = pt_model.transformer(f_inp, f_seg_id, f_inp_mask) # logits_pt, _ = pt_model(f_inp, token_type_ids=f_seg_id, input_mask=f_inp_mask) pt_model.train() outputs = pt_model(f_inp, token_type_ids=f_seg_id, input_mask=f_inp_mask, labels=f_label) loss_pt = outputs[0] loss_pt = loss_pt.mean() total_loss_pt += loss_pt.item() # # hidden_states_pt = list(t.detach().cpu().numpy() for t in hidden_states_pt) # # special_pt = special_pt.detach().cpu().numpy() # # Optimizer pt pt_model.zero_grad() loss_pt.backward() gnorm_pt = torch.nn.utils.clip_grad_norm_( pt_model.parameters(), FLAGS.clip) for param_group in optimizer.param_groups: param_group['lr'] = learning_rate_np optimizer.step() ##### PYTORCH ######### if curr_step > 0 and curr_step % FLAGS.log_step_count_steps == 0: curr_loss = total_loss / (curr_step - prev_step) curr_loss_pt = total_loss_pt / (curr_step - prev_step) tf.logging.info( "[{}] | gnorm {:.2f} lr {:8.6f} " "| loss {:.2f} | pplx {:>7.2f}, bpc {:>7.4f}".format( curr_step, gnorm_np, learning_rate_np, curr_loss, math.exp(curr_loss), curr_loss / math.log(2))) ######### ##### PYTORCH tf.logging.info( " PT [{}] | gnorm PT {:.2f} lr PT {:8.6f} " "| loss PT {:.2f} | pplx PT {:>7.2f}, bpc PT {:>7.4f}". 
format(curr_step, gnorm_pt, learning_rate_np, curr_loss_pt, math.exp(curr_loss_pt), curr_loss_pt / math.log(2))) ##### PYTORCH ######### total_loss, total_loss_pt, prev_step = 0., 0., curr_step writer.add_summary(summary_np, global_step=curr_step) if curr_step > 0 and curr_step % FLAGS.save_steps == 0: save_path = os.path.join(FLAGS.model_dir, "model.ckpt-{}".format(curr_step)) saver.save(sess, save_path) tf.logging.info( "Model saved in path: {}".format(save_path)) ######### ##### PYTORCH # Save a trained model, configuration and tokenizer model_to_save = pt_model.module if hasattr( pt_model, 'module') else pt_model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_dir = os.path.join( FLAGS.output_dir, "pytorch-ckpt-{}".format(curr_step)) if not tf.gfile.Exists(output_dir): tf.gfile.MakeDirs(output_dir) model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) tf.logging.info( "PyTorch Model saved in path: {}".format(output_dir)) ##### PYTORCH ######### if curr_step >= FLAGS.train_steps: break if FLAGS.do_eval: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. These do NOT count towards the metric (all tf.metrics # support a per-instance weight, and these get a weight of 0.0). # # Modified in XL: We also adopt the same mechanism for GPUs. while len(eval_examples) % FLAGS.eval_batch_size != 0: eval_examples.append(PaddingInputExample()) eval_file_base = "{}.len-{}.{}.eval.tf_record".format( spm_basename, FLAGS.max_seq_length, FLAGS.eval_split) eval_file = os.path.join(FLAGS.output_dir, eval_file_base) file_based_convert_examples_to_features(eval_examples, label_list, FLAGS.max_seq_length, tokenize_fn, eval_file) assert len(eval_examples) % FLAGS.eval_batch_size == 0 eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) eval_input_fn = file_based_input_fn_builder( input_file=eval_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=True) with tf.Session(config=tf.ConfigProto( allow_soft_placement=True, gpu_options=gpu_options)) as sess: sess.run(tf.global_variables_initializer()) ########################### LOAD PT model # import torch # from pytorch_transformers import CONFIG_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME, XLNetTokenizer, XLNetConfig, XLNetForSequenceClassification, BertAdam # save_path = os.path.join(FLAGS.model_dir, TF_WEIGHTS_NAME) # saver.save(sess, save_path) # tf.logging.info("Model saved in path: {}".format(save_path)) # device = torch.device("cuda", 4) # config = XLNetConfig.from_pretrained('xlnet-large-cased', finetuning_task=u'sts-b', num_labels=1) # tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') # config_path = os.path.join(FLAGS.model_dir, CONFIG_NAME) # config.to_json_file(config_path) # # pt_model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased', num_labels=1) # pt_model = XLNetForSequenceClassification.from_pretrained(FLAGS.model_dir, from_tf=True) # pt_model.to(device) # pt_model = torch.nn.DataParallel(pt_model, device_ids=[4, 5, 6, 7]) # from torch.optim import Adam # optimizer = Adam(pt_model.parameters(), lr=0.001, betas=(0.9, 0.999), # eps=FLAGS.adam_epsilon, weight_decay=FLAGS.weight_decay, # amsgrad=False) # optimizer = BertAdam(pt_model.parameters(), lr=FLAGS.learning_rate, t_total=FLAGS.train_steps, warmup=FLAGS.warmup_steps / 
FLAGS.train_steps, # eps=FLAGS.adam_epsilon, weight_decay=FLAGS.weight_decay) ##### PYTORCH ######### fetches = [ loss, global_step, gnorm, learning_rate, train_op, merged, inputs, hidden_states, logits ] total_loss, total_loss_pt, prev_step, gnorm_pt = 0., 0., -1, 0.0 total_logits = None total_labels = None while True: feed_dict = {} # for i in range(FLAGS.num_core_per_host): # for key in tower_mems_np[i].keys(): # for m, m_np in zip(tower_mems[i][key], tower_mems_np[i][key]): # feed_dict[m] = m_np fetched = sess.run(fetches) loss_np, curr_step, gnorm_np, learning_rate_np, _, summary_np, inputs_np, hidden_states_np, logits_np = fetched total_loss += loss_np if total_logits is None: total_logits = logits_np total_labels = inputs_np['label_ids'] else: total_logits = np.append(total_logits, logits_np, axis=0) total_labels = np.append(total_labels, inputs_np['label_ids'], axis=0)
def main(_):
  assert tf.gfile.Exists(FLAGS.init_checkpoint)

  if not tf.gfile.Exists(FLAGS.output_dir):
    tf.gfile.MakeDirs(FLAGS.output_dir)

  processor = SubLocProcessor()
  labels = processor.get_labels()

  train_examples = processor.get_train_examples(FLAGS.data_dir)
  test_examples = processor.get_test_examples(FLAGS.data_dir)

  train_file_path = os.path.join(FLAGS.output_dir,
                                 get_basename(FLAGS.max_seq_length, "train"))
  test_file_path = os.path.join(FLAGS.output_dir,
                                get_basename(FLAGS.max_seq_length, "test"))

  def tokenize_fn(text):
    text = preprocess_text(text)
    return encode_ids(text)

  # Create TF-Record for train examples
  file_based_convert_examples_to_features(train_examples, labels,
                                          FLAGS.max_seq_length, tokenize_fn,
                                          train_file_path)
  # Create TF-Record for test examples
  file_based_convert_examples_to_features(test_examples, labels,
                                          FLAGS.max_seq_length, tokenize_fn,
                                          test_file_path)

  train_set = get_dataset(train_file_path, FLAGS.max_seq_length, FLAGS.batch_size)
  train_iter = train_set.make_one_shot_iterator()
  example = train_iter.get_next()

  inp = tf.transpose(example["input_ids"], [1, 0])
  seg_id = tf.transpose(example["segment_ids"], [1, 0])
  inp_mask = tf.transpose(example["input_mask"], [1, 0])

  xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
  run_config = xlnet.create_run_config(False, True, FLAGS)

  xlnet_model = xlnet.XLNetModel(
      xlnet_config=xlnet_config,
      run_config=run_config,
      input_ids=inp,
      seg_ids=seg_id,
      input_mask=inp_mask)

  output = xlnet_model.get_sequence_output()

  init_from_checkpoint(FLAGS)

  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    try:
      while True:
        outs = sess.run(output)
        print(outs.shape)
    except tf.errors.OutOfRangeError:
      tf.logging.info("DONE")
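# Added note (not in the original): because the inputs above are transposed to
# time-major ([seq_len, batch]) before being fed to XLNetModel, each printed
# `outs.shape` is [max_seq_length, batch_size, d_model].  A small helper to
# recover batch-major embeddings if they are needed downstream:
import numpy as np

def to_batch_major(outs):
  # [seq_len, batch, hidden] -> [batch, seq_len, hidden]
  return np.transpose(outs, [1, 0, 2])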
def train(ps_device):
  ##### Get input function and model function
  train_input_fn, record_info_dict = data_utils.get_input_fn(
      tfrecord_dir=FLAGS.record_info_dir,
      split="train",
      bsz_per_host=FLAGS.train_batch_size,
      seq_len=FLAGS.seq_len,
      reuse_len=FLAGS.reuse_len,
      bi_data=FLAGS.bi_data,
      num_hosts=1,
      num_core_per_host=1,  # set to one no matter how many GPUs
      perm_size=FLAGS.perm_size,
      mask_alpha=FLAGS.mask_alpha,
      mask_beta=FLAGS.mask_beta,
      uncased=FLAGS.uncased,
      num_passes=FLAGS.num_passes,
      use_bfloat16=FLAGS.use_bfloat16,
      num_predict=FLAGS.num_predict)

  tf.compat.v1.logging.info("num of batches {}".format(
      record_info_dict["num_batch"]))

  ##### Create input tensors / placeholders
  bsz_per_core = FLAGS.train_batch_size // FLAGS.num_core_per_host

  params = {
      "batch_size": FLAGS.train_batch_size  # the whole batch
  }
  train_set = train_input_fn(params)

  example = train_set.make_one_shot_iterator().get_next()

  if FLAGS.num_core_per_host > 1:
    examples = [{} for _ in range(FLAGS.num_core_per_host)]
    for key in example.keys():
      vals = tf.split(example[key], FLAGS.num_core_per_host, 0)
      for device_id in range(FLAGS.num_core_per_host):
        examples[device_id][key] = vals[device_id]
  else:
    examples = [example]

  ##### Create computational graph
  tower_mems, tower_losses, tower_new_mems, tower_grads_and_vars = [], [], [], []

  for i in range(FLAGS.num_core_per_host):
    reuse = True if i > 0 else None
    with tf.device(assign_to_gpu(i, ps_device)), \
        tf.variable_scope(tf.get_variable_scope(), reuse=reuse):

      # The mems for each tower is a dictionary
      mems_i = {}
      if FLAGS.mem_len:
        mems_i["mems"] = create_mems_tf(bsz_per_core)

      loss_i, new_mems_i, grads_and_vars_i = single_core_graph(
          is_training=True,
          features=examples[i],
          mems=mems_i)

      tower_mems.append(mems_i)
      tower_losses.append(loss_i)
      tower_new_mems.append(new_mems_i)
      tower_grads_and_vars.append(grads_and_vars_i)

  ## average losses and gradients across towers
  if len(tower_losses) > 1:
    loss = tf.add_n(tower_losses) / len(tower_losses)
    grads_and_vars = average_grads_and_vars(tower_grads_and_vars)
  else:
    loss = tower_losses[0]
    grads_and_vars = tower_grads_and_vars[0]

  ## get train op
  train_op, learning_rate, gnorm = model_utils.get_train_op(
      FLAGS, None, grads_and_vars=grads_and_vars)
  global_step = tf.train.get_global_step()

  ##### Training loop
  # initialize mems
  tower_mems_np = []
  for i in range(FLAGS.num_core_per_host):
    mems_i_np = {}
    for key in tower_mems[i].keys():
      mems_i_np[key] = initialize_mems_np(bsz_per_core)
    tower_mems_np.append(mems_i_np)

  saver = tf.train.Saver()

  gpu_options = tf.GPUOptions(allow_growth=True)

  model_utils.init_from_checkpoint(FLAGS, global_vars=True)

  with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                        gpu_options=gpu_options)) as sess:
    sess.run(tf.global_variables_initializer())

    fetches = [loss, tower_new_mems, global_step, gnorm, learning_rate, train_op]

    total_loss, prev_step = 0., -1
    while True:
      feed_dict = {}
      for i in range(FLAGS.num_core_per_host):
        for key in tower_mems_np[i].keys():
          for m, m_np in zip(tower_mems[i][key], tower_mems_np[i][key]):
            feed_dict[m] = m_np

      fetched = sess.run(fetches, feed_dict=feed_dict)

      loss_np, tower_mems_np, curr_step = fetched[:3]
      total_loss += loss_np

      if curr_step > 0 and curr_step % FLAGS.iterations == 0:
        curr_loss = total_loss / (curr_step - prev_step)
        tf.compat.v1.logging.info(
            "[{}] | gnorm {:.2f} lr {:8.6f} "
            "| loss {:.2f} | pplx {:>7.2f}, bpc {:>7.4f}".format(
                curr_step, fetched[-3], fetched[-2], curr_loss,
                math.exp(curr_loss), curr_loss / math.log(2)))
        total_loss, prev_step = 0., curr_step

      if curr_step > 0 and curr_step % FLAGS.save_steps == 0:
        save_path = os.path.join(FLAGS.model_dir, "model.ckpt")
        saver.save(sess, save_path)
        tf.compat.v1.logging.info("Model saved in path: {}".format(save_path))

      if curr_step >= FLAGS.train_steps:
        break
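# A minimal sketch (an assumption, not from the original source) of an entry
# point that wires the GPU training loop above into a script. The helper name
# `_train_entry_point`, the "/gpu:0" parameter-server device string, and the
# use of tf.app.run are illustrative.
def _train_entry_point(unused_argv):
  del unused_argv  # unused by the sketch
  tf.logging.set_verbosity(tf.logging.INFO)
  train("/gpu:0")  # assumed ps_device; adjust to the local GPU setup

# if __name__ == "__main__":
#   tf.app.run(_train_entry_point)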
def main(): """Main function routine""" tf.logging.set_verbosity(tf.logging.INFO) # Text encoding sp = spm.SentencePieceProcessor() sp.Load(FLAGS.spiece_model_file) def tokenize_fn(text): text = preprocess_text(text, lower=FLAGS.uncased) return encode_ids(sp, text) # Temporary fix for context problem. pad_txt = """In 1991, the remains of Russian Tsar Nicholas II and his family (except for Alexei and Maria) are discovered. The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the remainder of the story. 1883 Western Siberia, a young Grigori Rasputin is asked by his father and a group of men to perform magic. Rasputin has a vision and denounces one of the men as a horse thief. Although his father initially slaps him for making such an accusation, Rasputin watches as the man is chased outside and beaten. Twenty years later, Rasputin sees a vision of the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous, with people, even a bishop, begging for his blessing. """ pad_ids = tokenize_fn(pad_txt) pad_ids.append(EOD_ID) to_special_symbol = {v: k for k, v in special_symbols.items()} def parse_ids(toks): """Uses sentencepiece to conver to text. Subsitute EOP_ID and EOD_ID with new lines, and rest with their names""" start = 0 sent = "" for i in range(len(toks)): if toks[i] in to_special_symbol: if start < i: sent += sp.decode_ids(toks[start:i]) if toks[i] in [EOD_ID, EOP_ID]: replace_by = "\n\n" else: replace_by = to_special_symbol[toks[i]] sent += f" {replace_by} " start = i + 1 if start < len(toks): sent += sp.decode_ids(toks[start:]) return sent if not FLAGS.bidirectional_eachstep: prediction_graph = prediction_graph_memory else: prediction_graph = prediction_graph_no_memory predictions, features = prediction_graph() gpu_options = tf.GPUOptions(allow_growth=True) model_utils.init_from_checkpoint(FLAGS, global_vars=False) with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)) as sess: sess.run(tf.global_variables_initializer()) def predict(examples): """Given a list of texts in examples return the result""" preprocessor = get_preprocessor(examples, tokenize_fn, pad_ids) dataset = get_input_dataset(preprocessor) example = dataset.make_one_shot_iterator().get_next() num_examples = len(examples) num_batches = int(np.ceil(num_examples / FLAGS.batch_size)) for _ in tqdm(range(num_batches)): inputs = sess.run(example) output, conf = sess.run( predictions, feed_dict={features[k]: v for k, v in inputs.items()}) for _output, _conf in zip(output, conf): yield _output, _conf if FLAGS.unconditional or FLAGS.interactive: tf.logging.info("Interactive flag received." 
" Ignoring input files if any.") while True: if FLAGS.unconditional: text = "" else: text = input("----PROMPT----\n") outputs = predict([text] * FLAGS.num_samples) for i, (output, _) in enumerate(outputs): out = parse_ids(output.tolist()) print("======SAMPLE {}======".format(i)) print(out) print("=====================") if FLAGS.unconditional: break else: assert FLAGS.input_file!="", "Please provide either an"\ " input file or set interactive flag for command line input" assert os.path.exists(FLAGS.input_file), FLAGS.input_file+\ " does not exists" with open(FLAGS.input_file) as f: texts = [] text = "" for line in f: if line.strip() == "": if text != "": # Removing the last <eop> of prompt # since it is not desired if text.endswith("<eop>"): text = text[:-5] texts.extend([text] * FLAGS.num_samples) text = "" continue text += re.sub(r'\n', '<eop>', line) if text != "": texts.extend([text] * FLAGS.num_samples) tf.logging.info("Got %s lines in the input file", len(texts) // FLAGS.num_samples) tf.logging.info("Sampling each line %s times", FLAGS.num_samples) outputs = iter(predict(texts)) with open(os.path.join(FLAGS.input_file + ".xlnet"), 'w') as f: for i in range(0, len(texts), FLAGS.num_samples): f.write("\n======Example {}=================\n".format(i)) f.write(texts[i]) for j in range(FLAGS.num_samples): output, _ = next(outputs) out = parse_ids(output.tolist()) f.write("\n======Example {} SAMPLE {}======\n".format( i, j)) f.write(out) f.write("\n==================================\n")
def model_fn(features, labels, mode, params):
  for name in sorted(features.keys()):
    logging.info("  name = %s, shape = %s" % (name, features[name].shape))

  input_ids = features["input_ids"]
  mask = features["input_mask"]
  segment_ids = features["segment_ids"]
  label_ids = features["label_ids"]

  #### Training or Evaluation
  is_training = (mode == tf.estimator.ModeKeys.TRAIN)

  #### Get loss from inputs
  total_loss, logits, predicts = function_builder.get_ner_loss(
      FLAGS, features, is_training, num_labels)

  #### Check model parameters
  num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
  tf.logging.info('#params: {}'.format(num_params))

  #### Load pretrained models
  scaffold_fn = model_utils.init_from_checkpoint(FLAGS)

  if mode == tf.estimator.ModeKeys.EVAL:
    def metric_fn(label_ids, logits, num_labels, mask):
      predictions = tf.math.argmax(logits, axis=-1, output_type=tf.int32)
      cm = metrics.streaming_confusion_matrix(
          label_ids, predictions, num_labels - 1, weights=mask)
      return {"confusion_matrix": cm}

    eval_metrics = (metric_fn, [label_ids, logits, num_labels, mask])
    eval_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        eval_metrics=eval_metrics,
        scaffold_fn=scaffold_fn)
    return eval_spec

  elif mode == tf.estimator.ModeKeys.PREDICT:
    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        predictions=predicts,
        scaffold_fn=scaffold_fn)
    return output_spec

  #### Configuring the optimizer
  train_op, learning_rate, _ = model_utils.get_train_op(FLAGS, total_loss)
  monitor_dict = {}
  monitor_dict["lr"] = learning_rate

  #### Constructing training TPUEstimatorSpec.
  train_spec = tf.contrib.tpu.TPUEstimatorSpec(
      mode=mode,
      loss=total_loss,
      train_op=train_op,
      scaffold_fn=scaffold_fn)
  return train_spec
def model_fn(features, labels, mode, params):
  #### Training or Evaluation
  is_training = (mode == tf.estimator.ModeKeys.TRAIN)

  #### Get loss from inputs
  if FLAGS.is_regression:
    (total_loss, per_example_loss,
     logits) = function_builder.get_regression_loss(FLAGS, features, is_training)
  else:
    flag_val_dict = {
        "dropout": FLAGS.dropout,
        "model_dir": FLAGS.model_dir,
        "data_dir": FLAGS.data_dir,
        "use_tpu": FLAGS.use_tpu,
        "num_core_per_host": FLAGS.num_core_per_host,
        "master": FLAGS.master,
        "iterations": FLAGS.iterations,
        "learning_rate": FLAGS.learning_rate,
        "train_batch_size": FLAGS.train_batch_size,
        "model_config_path": FLAGS.model_config_path,
    }

    for name in list(features.keys()):
      t = features[name]
      if t.dtype == tf.int64:
        t = tf.cast(t, tf.int32)
      features[name] = t

    tf.logging.info(json.dumps(flag_val_dict))

    (total_loss, per_example_loss, logits,
     probabilities) = function_builder.get_classification_loss(
         FLAGS, features, n_class, is_training)

  #### Check model parameters
  num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
  tf.logging.info('#params: {}'.format(num_params))

  #### Load pretrained models
  scaffold_fn = model_utils.init_from_checkpoint(FLAGS)

  #### Evaluation mode
  if mode == tf.estimator.ModeKeys.EVAL:
    assert FLAGS.num_hosts == 1

    def metric_fn(per_example_loss, label_ids, logits, is_real_example):
      predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
      eval_input_dict = {
          'labels': label_ids,
          'predictions': predictions,
          'weights': is_real_example
      }
      accuracy = tf.metrics.accuracy(**eval_input_dict)
      loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example)

      ###################################
      #   precision, recall, f1 score   #
      ###################################
      precision = metrics.precision(label_ids, predictions, 20, average="macro")
      recall = metrics.recall(label_ids, predictions, 20, average="macro")
      f = metrics.f1(label_ids, predictions, 20, average="macro")

      ###################################
      #        confusion matrix         #
      ###################################
      def eval_confusion_matrix(labels, predictions, num_classes):
        with tf.variable_scope("eval_confusion_matrix"):
          con_matrix = tf.confusion_matrix(
              labels=labels, predictions=predictions, num_classes=num_classes)
          con_matrix_sum = tf.Variable(
              tf.zeros(shape=(num_classes, num_classes), dtype=tf.int32),
              trainable=False,
              name="confusion_matrix_result",
              collections=[tf.GraphKeys.LOCAL_VARIABLES])
          update_op = tf.assign_add(con_matrix_sum, con_matrix)
          return tf.convert_to_tensor(con_matrix_sum), update_op

      return {
          'eval_accuracy': accuracy,
          'eval_loss': loss,
          "eval_precision": precision,
          "eval_recall": recall,
          "eval_f": f,
          "conf_mat": eval_confusion_matrix(label_ids, predictions,
                                            num_classes=20)
      }

    def regression_metric_fn(per_example_loss, label_ids, logits,
                             is_real_example):
      loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example)
      pearsonr = tf.contrib.metrics.streaming_pearson_correlation(
          logits, label_ids, weights=is_real_example)
      return {'eval_loss': loss, 'eval_pearsonr': pearsonr}

    is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32)

    #### Constructing evaluation TPUEstimatorSpec with new cache.
    label_ids = tf.reshape(features['label_ids'], [-1])

    if FLAGS.is_regression:
      metric_fn = regression_metric_fn
    else:
      metric_fn = metric_fn
    metric_args = [per_example_loss, label_ids, logits, is_real_example]

    if FLAGS.use_tpu:
      eval_spec = tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          eval_metrics=(metric_fn, metric_args),
          scaffold_fn=scaffold_fn)
    else:
      eval_spec = tf.estimator.EstimatorSpec(
          mode=mode,
          loss=total_loss,
          eval_metric_ops=metric_fn(*metric_args))
    return eval_spec

  elif mode == tf.estimator.ModeKeys.PREDICT:
    label_ids = tf.reshape(features["label_ids"], [-1])
    predictions = {
        "logits": logits,
        "labels": label_ids,
        # "is_real": features["is_real_example"]
    }
    if FLAGS.use_tpu:
      output_spec = tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          predictions={"probabilities": probabilities},
          scaffold_fn=scaffold_fn)
    else:
      output_spec = tf.estimator.EstimatorSpec(
          mode=mode, predictions={"probabilities": probabilities})
    return output_spec

  #### Configuring the optimizer
  train_op, learning_rate, _ = model_utils.get_train_op(FLAGS, total_loss)
  monitor_dict = {}
  monitor_dict["lr"] = learning_rate

  #### Constructing training TPUEstimatorSpec with new cache.
  if FLAGS.use_tpu:
    #### Creating host calls
    if not FLAGS.is_regression:
      label_ids = tf.reshape(features['label_ids'], [-1])
      predictions = tf.argmax(logits, axis=-1, output_type=label_ids.dtype)
      is_correct = tf.equal(predictions, label_ids)
      accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))
      monitor_dict["accuracy"] = accuracy

      host_call = function_builder.construct_scalar_host_call(
          monitor_dict=monitor_dict,
          model_dir=FLAGS.model_dir,
          prefix="train/",
          reduce_fn=tf.reduce_mean)
    else:
      host_call = None

    train_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        host_call=host_call,
        scaffold_fn=scaffold_fn)
  else:
    train_spec = tf.estimator.EstimatorSpec(
        mode=mode, loss=total_loss, train_op=train_op)

  return train_spec
def __init__(self, model_path):
  config_file = os.path.join(model_path, 'xlnet_config.json')
  spiece_file = os.path.join(model_path, 'spiece.model')
  ckpt_path = os.path.join(model_path, 'xlnet_model.ckpt')

  self.predict_batch_size = 100
  self.max_seq_length = 256
  self.max_predictions_per_seq = 20
  is_training = False

  # construct xlnet config and save to model_dir
  xlnet_config = xlnet.XLNetConfig(json_path=config_file)

  # construct run config from FLAGS
  # self.run_config = xlnet.create_run_config(is_training, False, FLAGS)
  run_config = xlnet.RunConfig(
      is_training, False, False, 0.0, 0.0,
      init="normal", init_range=0.1, init_std=0.02,
      mem_len=None, reuse_len=256, bi_data=False,
      clamp_len=-1, same_length=False)

  graph = tf.Graph()
  with graph.as_default():
    self.session = tf.Session()

    # shape: max_sentence_length x num_sentence
    self.input_ids = tf.placeholder(tf.int32, [None, None], name="input_ids")
    # shape: max_sentence_length x num_sentence
    self.seg_ids = tf.placeholder(tf.int32, [None, None], name="seg_ids")
    # shape: max_sentence_length x num_sentence
    self.input_mask = tf.placeholder(tf.int32, [None, None], name="input_mask")
    # shape: max_sentence_length x max_sentence_length x num_sentence
    self.perm_mask = tf.placeholder(tf.int32, [None, None, None],
                                    name="perm_mask")
    # shape: max_predictions_per_seq x max_sentence_length x num_sentence
    self.target_mapping = tf.placeholder(tf.int32, [None, None, None],
                                         name="target_mapping")
    # shape: max_sentence_length x num_sentence
    self.inp_q = tf.placeholder(tf.int32, [None, None], name="inp_q")
    # shape: max_sentence_length x num_sentence
    self.target = tf.placeholder(tf.int32, [None, None], name="target")

    xlnet_model = xlnet.XLNetModel(
        xlnet_config=xlnet_config,
        run_config=run_config,
        input_ids=self.input_ids,
        seg_ids=self.seg_ids,
        input_mask=self.input_mask,
        mems=None,
        perm_mask=self.perm_mask,
        target_mapping=self.target_mapping,
        inp_q=self.inp_q)

    output = xlnet_model.get_sequence_output()
    lookup_table = xlnet_model.get_embedding_table()
    initializer = xlnet_model.get_initializer()

    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
      # LM loss
      lm_loss, logits = modeling.lm_loss(
          hidden=output,
          target=self.target,
          n_token=xlnet_config.n_token,
          d_model=xlnet_config.d_model,
          initializer=initializer,
          lookup_table=lookup_table,
          tie_weight=True,
          bi_data=run_config.bi_data,
          use_tpu=run_config.use_tpu)

    # NOTE: the block below still references BERT-style attributes
    # (self.bert_config, model, self.masked_lm_positions, bert_ckpt, vocab_file)
    # that are never defined in this class; it is kept as in the original
    # source and would need to be adapted before it can run.
    self.masked_lm_example_loss = run_lm_predict.get_masked_lm_output(
        self.bert_config, model.get_sequence_output(),
        model.get_embedding_table(), self.masked_lm_positions,
        self.masked_lm_ids)

    # load the pretrained bert model parameters
    (assignment_map, initialized_variable_names
     ) = modeling.get_assignment_map_from_checkpoint(
         tf.trainable_variables(), bert_ckpt)
    tf.train.init_from_checkpoint(bert_ckpt, assignment_map)

    #### load pretrained models
    scaffold_fn = model_utils.init_from_checkpoint(FLAGS)

    self.session.run(tf.global_variables_initializer())

  self.tokenizer = tokenization.FullTokenizer(
      vocab_file=vocab_file, do_lower_case=True)
def model_fn(features, labels, mode, params):
  #### Training or Evaluation
  is_training = (mode == tf.estimator.ModeKeys.TRAIN)

  #### Get loss from inputs
  inp_ids = tf.transpose(features["input_ids"], [1, 0])
  inp_mask = tf.transpose(features["input_mask"], [1, 0])
  output_ids = features["output_ids"]
  output_mask = tf.transpose(features["output_mask"], [1, 0])

  # Define decoder inputs: id 2 stands for <S>, the initial decoder input.
  decoder_inputs = tf.concat(
      (tf.ones_like(output_ids[:, :1]) * 2, output_ids[:, :-1]), -1)
  decoder_inputs = tf.transpose(decoder_inputs, [1, 0])

  args = dict(
      FLAGS=FLAGS,
      is_training=is_training,
      inp_ids=inp_ids,
      inp_mask=inp_mask,
      source_ntoken=params.get("source_ntoken"),
      target_ntoken=params.get("target_ntoken"),
      output_mask=output_mask,
      output_ids=output_ids,
      decoder_inputs=decoder_inputs)
  s2sm = seq2seq_models.seq2seqmodel(**args)
  total_loss, per_example_loss, logits = (
      s2sm.total_loss, s2sm.per_example_loss, s2sm.logits)

  #### Check model parameters
  num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
  tf.logging.info('#params: {}'.format(num_params))

  #### Load pretrained models
  scaffold_fn = model_utils.init_from_checkpoint(FLAGS)

  #### Evaluation mode
  if mode == tf.estimator.ModeKeys.EVAL:
    assert FLAGS.num_hosts == 1

    def metric_fn(per_example_loss, logits, output_ids):
      predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
      istarget = tf.to_float(tf.not_equal(output_ids, 0))
      # Batch accuracy over non-padding positions, wrapped in tf.metrics.mean
      # so it is a valid (value, update_op) metric pair.
      accuracy = tf.reduce_sum(
          tf.to_float(tf.equal(predictions, output_ids)) * istarget /
          (tf.reduce_sum(istarget)))
      loss = tf.metrics.mean(values=per_example_loss)
      return {
          'eval_accuracy': tf.metrics.mean(accuracy),
          'eval_loss': loss,
      }

    #### Constructing evaluation EstimatorSpec.
    metric_args = [per_example_loss, logits, output_ids]
    eval_spec = tf.estimator.EstimatorSpec(
        mode=mode,
        loss=total_loss,
        eval_metric_ops=metric_fn(*metric_args))
    return eval_spec

  elif mode == tf.estimator.ModeKeys.PREDICT:
    pred = tf.argmax(logits, axis=-1, output_type=tf.int32)
    predictions = {
        "logits": logits,
        "pred": pred,
        "output_ids": output_ids,
    }
    output_spec = tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
    return output_spec

  #### Configuring the optimizer
  train_op, learning_rate, _ = model_utils.get_train_op(FLAGS, total_loss)
  monitor_dict = {}
  monitor_dict["lr"] = learning_rate

  #### Constructing training EstimatorSpec.
  train_spec = tf.estimator.EstimatorSpec(
      mode=mode, loss=total_loss, train_op=train_op)
  return train_spec
def model_fn(features, labels, mode, params):
  is_training = (mode == tf.estimator.ModeKeys.TRAIN)

  total_loss, per_example_loss, logits = create_model(
      FLAGS, features, is_training, num_labels)

  num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
  tf.logging.info('#params: {}'.format(num_params))

  scaffold_fn = model_utils.init_from_checkpoint(FLAGS)

  #### Evaluation mode
  if mode == tf.estimator.ModeKeys.EVAL:
    assert FLAGS.num_hosts == 1

    def metric_fn(per_example_loss, label_list, logits, input_mask):
      # Flip the mask so that padding positions get zero weight.
      input_mask *= -1
      input_mask += 1
      predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
      eval_input_dict = {
          'labels': label_list,
          'predictions': predictions,
          'weights': input_mask
      }
      accuracy = tf.metrics.accuracy(**eval_input_dict)
      loss = tf.metrics.mean(values=per_example_loss, weights=input_mask)
      return {
          'eval_accuracy': accuracy,
          'eval_loss': loss
      }

    input_mask = tf.cast(features['input_mask'], dtype=tf.float32)
    label_list = features['label_list']
    metric_args = [per_example_loss, label_list, logits, input_mask]

    if FLAGS.use_tpu:
      eval_spec = tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          eval_metrics=(metric_fn, metric_args),
          scaffold_fn=scaffold_fn)
    else:
      eval_spec = tf.estimator.EstimatorSpec(
          mode=mode,
          loss=total_loss,
          eval_metric_ops=metric_fn(*metric_args))
    return eval_spec

  #### Configuring the optimizer
  train_op, learning_rate, _ = model_utils.get_train_op(FLAGS, total_loss)
  monitor_dict = {}
  monitor_dict["lr"] = learning_rate

  #### Constructing training TPUEstimatorSpec.
  if FLAGS.use_tpu:
    #### Creating host calls
    host_call = None
    train_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        host_call=host_call,
        scaffold_fn=scaffold_fn)
  else:
    train_spec = tf.estimator.EstimatorSpec(
        mode=mode, loss=total_loss, train_op=train_op)
  return train_spec
def test(ps_device):
  test_input_fn, record_info_dict_test = data_utils.get_input_fn(
      info_dir=os.path.join(FLAGS.record_info_dir, "test"),
      split="test",
      bsz_per_host=FLAGS.test_batch_size,
      seq_len=FLAGS.seq_len,
      reuse_len=FLAGS.reuse_len,
      bi_data=FLAGS.bi_data,
      num_hosts=1,
      num_core_per_host=1,
      perm_size=FLAGS.perm_size,
      mask_alpha=FLAGS.mask_alpha,
      mask_beta=FLAGS.mask_beta,
      use_bfloat16=FLAGS.use_bfloat16,
      num_predict=FLAGS.num_predict)

  tf.logging.info("num of test batches {}".format(
      record_info_dict_test["num_batch"]))

  ##### Create input tensors / placeholders
  bsz_per_core = FLAGS.test_batch_size // FLAGS.num_core_per_host

  params = {
      "batch_size": FLAGS.test_batch_size  # the whole batch
  }
  test_set = test_input_fn(params)

  t_iter = test_set.make_initializable_iterator()
  t_example = t_iter.get_next()

  if FLAGS.num_core_per_host > 1:
    # split the test batch across cores
    t_examples = [{} for _ in range(FLAGS.num_core_per_host)]
    for key in t_example.keys():
      vals = tf.split(t_example[key], FLAGS.num_core_per_host, 0)
      for device_id in range(FLAGS.num_core_per_host):
        t_examples[device_id][key] = vals[device_id]
  else:
    t_examples = [t_example]

  ##### Create computational graph
  v_tower_mems, v_tower_losses, v_tower_new_mems = [], [], []

  for i in range(FLAGS.num_core_per_host):
    reuse = True if i > 0 else None
    with tf.device(assign_to_gpu(i, ps_device)), \
        tf.variable_scope(tf.get_variable_scope(), reuse=reuse):

      # The mems for each tower is a dictionary
      v_mems_i = {}
      if FLAGS.mem_len:
        v_mems_i["mems"] = create_mems_tf(bsz_per_core)

      v_loss_i, v_new_mems_i = single_core_graph(
          features=t_examples[i],
          mems=v_mems_i)

      v_tower_mems.append(v_mems_i)
      v_tower_losses.append(v_loss_i)
      v_tower_new_mems.append(v_new_mems_i)

  ## average losses across towers
  if len(v_tower_losses) > 1:
    v_loss = tf.add_n(v_tower_losses) / len(v_tower_losses)
  else:
    v_loss = v_tower_losses[0]

  gpu_options = tf.GPUOptions(allow_growth=True)

  model_utils.init_from_checkpoint(FLAGS, global_vars=True)

  # Create performance summaries for Tensorboard logging
  test_performance_summaries = tb.tensorboard_setup_test()

  with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                        gpu_options=gpu_options)) as sess:
    sess.run(tf.global_variables_initializer())

    # Create writers for Tensorboard logging
    test_summary_writer = tb.create_test_writer(
        sess, logging_dir=FLAGS.tb_logging_dir)

    # initialize mems
    v_tower_mems_np = []
    for i in range(FLAGS.num_core_per_host):
      v_mems_i_np = {}
      for key in v_tower_mems[i].keys():
        v_mems_i_np[key] = initialize_mems_np(bsz_per_core)
      v_tower_mems_np.append(v_mems_i_np)

    v_fetches = [v_loss, v_tower_new_mems]

    sess.run(t_iter.initializer)
    v_total_loss = 0.
    v_steps = 0

    try:
      while True:
        v_feed_dict = {}
        for i in range(FLAGS.num_core_per_host):
          for key in v_tower_mems_np[i].keys():
            for m, m_np in zip(v_tower_mems[i][key], v_tower_mems_np[i][key]):
              v_feed_dict[m] = m_np

        v_fetched = sess.run(v_fetches, feed_dict=v_feed_dict)
        v_loss_np, v_tower_mems_np = v_fetched[:]
        v_total_loss += v_loss_np
        v_steps += 1
        print(v_steps)
    except tf.errors.OutOfRangeError:
      test_loss = v_total_loss / v_steps
      t_pplx = math.exp(test_loss)
      tf.logging.info("Test: loss {:.2f} | pplx {:>7.2f}".format(
          test_loss, t_pplx))
      summ_test = tb.run_test(sess, test_performance_summaries, test_loss,
                              t_pplx)
      test_summary_writer.add_summary(summ_test, 1)
def model_fn(features, labels, mode, params):
  #### Training or Evaluation
  is_training = (mode == tf.estimator.ModeKeys.TRAIN)

  #### Get loss from inputs
  if FLAGS.is_regression:
    (total_loss, per_example_loss,
     logits) = function_builder.get_regression_loss(FLAGS, features, is_training)
  else:
    (total_loss, per_example_loss,
     logits) = function_builder.get_classification_loss(
         FLAGS, features, n_class, is_training)

  #### Check model parameters
  num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
  tf.logging.info('#params: {}'.format(num_params))

  #### Load pretrained models
  scaffold_fn = model_utils.init_from_checkpoint(FLAGS)

  #### Evaluation mode
  if mode == tf.estimator.ModeKeys.EVAL:
    assert FLAGS.num_hosts == 1

    def metric_fn(per_example_loss, label_ids, logits, is_real_example):
      predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
      eval_input_dict = {
          'labels': label_ids,
          'predictions': predictions,
          'weights': is_real_example
      }
      accuracy = tf.metrics.accuracy(**eval_input_dict)
      loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example)
      return {'eval_accuracy': accuracy, 'eval_loss': loss}

    def regression_metric_fn(per_example_loss, label_ids, logits,
                             is_real_example):
      loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example)
      pearsonr = tf.contrib.metrics.streaming_pearson_correlation(
          logits, label_ids, weights=is_real_example)
      return {'eval_loss': loss, 'eval_pearsonr': pearsonr}

    is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32)

    #### Constructing evaluation TPUEstimatorSpec with new cache.
    label_ids = tf.reshape(features['label_ids'], [-1])

    if FLAGS.is_regression:
      metric_fn = regression_metric_fn
    else:
      metric_fn = metric_fn
    metric_args = [per_example_loss, label_ids, logits, is_real_example]

    if FLAGS.use_tpu:
      eval_spec = tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          eval_metrics=(metric_fn, metric_args),
          scaffold_fn=scaffold_fn)
    else:
      eval_spec = tf.estimator.EstimatorSpec(
          mode=mode,
          loss=total_loss,
          eval_metric_ops=metric_fn(*metric_args))
    return eval_spec

  elif mode == tf.estimator.ModeKeys.PREDICT:
    label_ids = tf.reshape(features["label_ids"], [-1])
    predictions = {
        "logits": logits,
        "labels": label_ids,
        "is_real": features["is_real_example"]
    }
    if FLAGS.use_tpu:
      output_spec = tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
    else:
      output_spec = tf.estimator.EstimatorSpec(
          mode=mode, predictions=predictions)
    return output_spec

  #### Configuring the optimizer
  train_op, learning_rate, _ = model_utils.get_train_op(FLAGS, total_loss)
  monitor_dict = {}
  monitor_dict["lr"] = learning_rate

  #### Constructing training TPUEstimatorSpec with new cache.
  if FLAGS.use_tpu:
    #### Creating host calls
    if not FLAGS.is_regression:
      label_ids = tf.reshape(features['label_ids'], [-1])
      predictions = tf.argmax(logits, axis=-1, output_type=label_ids.dtype)
      is_correct = tf.equal(predictions, label_ids)
      accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))
      monitor_dict["accuracy"] = accuracy

      host_call = function_builder.construct_scalar_host_call(
          monitor_dict=monitor_dict,
          model_dir=FLAGS.model_dir,
          prefix="train/",
          reduce_fn=tf.reduce_mean)
    else:
      host_call = None

    train_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        host_call=host_call,
        scaffold_fn=scaffold_fn)
  else:
    train_spec = tf.estimator.EstimatorSpec(
        mode=mode, loss=total_loss, train_op=train_op)

  return train_spec
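# A minimal sketch (assumptions: the FLAGS values used below exist and
# `model_fn` refers to the classification model_fn above) of how such a
# model_fn is typically handed to a TF 1.x TPUEstimator. The helper name
# `_build_estimator` is illustrative, not the original run script.
def _build_estimator(model_fn):
  run_config = tf.contrib.tpu.RunConfig(
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=FLAGS.save_steps,
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations))
  return tf.contrib.tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size)

# estimator = _build_estimator(model_fn)
# estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_steps)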
def model_fn(features, labels, mode, params):
  """doc."""
  #### Training or Evaluation
  is_training = (mode == tf.estimator.ModeKeys.TRAIN)
  # assert is_training
  assert tf.gfile.Exists(logdir)

  #### Retrieve `mems` from `params["cache"]`
  mems = {}
  idx = 0
  if FLAGS.mem_len > 0:
    mems["mems"] = params["cache"]

  #### Get loss from inputs
  if is_training:
    total_loss, new_mems, monitor_dict = function_builder.get_loss(
        FLAGS, features, labels, mems, is_training)
  else:
    total_loss, batch_loss, batch_tgt_mask, new_mems = function_builder.get_loss(
        FLAGS, features, labels, mems, is_training)

  #### Turn `new_mems` into `new_cache`
  new_cache = []
  if FLAGS.mem_len > 0:
    new_cache += new_mems["mems"]

  #### Check model parameters
  num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
  tf.logging.info("#params: {}".format(num_params))

  #### Customized initial checkpoint
  scaffold_fn = model_utils.init_from_checkpoint(FLAGS, global_vars=True)

  if is_training:
    #### Configuring the optimizer
    train_op, learning_rate, gnorm = model_utils.get_train_op(
        FLAGS, total_loss, None)
    monitor_dict["gnorm"] = gnorm
    monitor_dict["lr"] = learning_rate
    monitor_dict["pplx"] = tf.math.exp(total_loss)

    # #### Creating host calls
    # host_call = function_builder.construct_scalar_host_call(
    #     monitor_dict=monitor_dict,
    #     log_dir=logdir,
    #     prefix="train/",
    #     reduce_fn=tf.reduce_mean)

    #### Constructing training TPUEstimatorSpec with new cache.
    train_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        # host_call=host_call,
        scaffold_fn=scaffold_fn)
    train_spec.cache = new_cache
    return train_spec
  else:
    #### Constructing validation TPUEstimatorSpec with new cache.
    eval_metrics = function_builder.construct_scalar_metric_fn(
        batch_loss, batch_tgt_mask)
    eval_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        eval_metrics=eval_metrics,
        scaffold_fn=scaffold_fn)
    eval_spec.cache = new_cache
    return eval_spec