def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """Build the `TPUEstimatorSpec` for the embedding model.

    In TRAIN mode the spec carries a zero-valued loss variable and the
    train op built from it. In every other mode the spec exposes the model
    output under the "embeddings" prediction key. `labels` and `params`
    are accepted but not read.
    """
    tf.logging.info("*** Features ***")
    for feature_name in sorted(features.keys()):
        tf.logging.info(" name = %s, shape = %s" %
                        (feature_name, features[feature_name].shape))

    embeddings = self._create_model(
        model_config,
        run_config,
        features["input_ids"],
        features["input_mask"],
        features["segment_ids"],
        model_type)

    scaffold_fn = model_utils.init_from_checkpoint(FLAGS)

    # Anything that is not training only needs the embeddings back.
    if mode != tf.estimator.ModeKeys.TRAIN:
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode,
            predictions={"embeddings": embeddings},
            scaffold_fn=scaffold_fn)

    # The loss is a constant-initialised variable, not derived from the
    # embeddings computed above.
    loss = tf.Variable(0.0, name="loss", dtype=tf.float32)
    train_op, _, _ = model_utils.get_train_op(FLAGS, loss)
    return tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        scaffold_fn=scaffold_fn)
def model_fn(features, labels, mode, params):
    """Build the `EstimatorSpec` for the entity-similarity model.

    TRAIN returns a spec with the multi-task loss and its train op; PREDICT
    returns a spec whose predictions hold the similarity op under
    "sim_op". Any other mode raises `NotImplementedError`, after the
    encoders have been built. `labels` and `params` are not read; labels
    are taken from `features['labels']`.
    """
    batch_size = tf.shape(features["entity_ids"])[0]
    query_len = tf.shape(features["entity_ids"])[1]
    # Regroup the candidate ids so the last axis matches the query length.
    features["entity_ids_list"] = tf.reshape(
        features["entity_ids_list"], [batch_size, -1, query_len])

    # Dropout behaves differently at training and prediction time, so the
    # encoders are told which graph is being built; weights are shared.
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    # Semantic vectors of the two inputs.
    emb_a, _ = create_embed_encoder(
        features['entity_ids'], is_training=is_training)
    emb_b, _ = create_embed_encoder(
        features['entity_ids_list'], is_training=is_training, is_normal=False)

    sim_op, sim_emb = tf_sim(emb_a, emb_b)

    if mode == tf.estimator.ModeKeys.TRAIN:
        total_loss = multi_loss(sim_op, sim_emb, features['labels'])
        train_op, learning_rate, _ = model_utils.get_train_op(FLAGS, total_loss)
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=total_loss, train_op=train_op)

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode, predictions={'sim_op': sim_op})

    # Evaluation is not supported by this model_fn.
    raise NotImplementedError
def score_fn(self):
    """Build the pairwise ranking loss and its train op.

    Scores come from `get_score`. For every ordered pair (i, j) a sigmoid
    cross-entropy is computed between the score difference `s_i - s_j` and
    the target `P_ij` derived from the label difference. Only pairs that
    share a `qid` and are not a sample paired with itself contribute.

    Sets `self.loss`, `self.num_pairs`, `self.score` and `self.train_op`,
    and records intermediate tensors in `self.debug_info`.

    NOTE(review): the `x - tf.transpose(x)` pattern assumes `self.label`,
    `self.qid` and `score` are column-shaped so the difference broadcasts
    to [n, n] -- confirm against the caller.
    """
    score, debug_info = get_score(
        self.feature_emb, self.feature_vec, self.is_training)
    self.debug_info['encoder_debug_info'] = debug_info

    # Pairwise label difference, clipped to [-1, 1].
    S_ij = self.label - tf.transpose(self.label)
    self.debug_info['label'] = self.label
    S_ij = tf.maximum(tf.minimum(1., S_ij), -1.)
    # Use a float literal: `(1 / 2)` is integer division under Python 2 and
    # would silently turn every target into 0.
    P_ij = 0.5 * (1 + S_ij)

    s_i_minus_s_j = score - tf.transpose(score)
    self.debug_info['s_i_minus_s_j'] = s_i_minus_s_j
    self.debug_info['P_ij'] = P_ij

    logloss = tf.nn.sigmoid_cross_entropy_with_logits(
        logits=s_i_minus_s_j, labels=P_ij)
    self.debug_info['logloss'] = logloss

    # Keep only the pairs that belong to the same group (equal qid).
    mask1 = tf.equal(self.qid - tf.transpose(self.qid), 0)
    self.debug_info['qid'] = self.qid
    mask1 = tf.cast(mask1, tf.float32)

    # Exclude the pair of a sample with itself (zero the diagonal).
    n = tf.shape(self.feature_emb)[0]
    mask2 = tf.ones([n, n]) - tf.diag(tf.ones([n]))
    mask = mask1 * mask2
    num_pairs = tf.reduce_sum(mask)
    self.debug_info['mask1'] = mask1
    self.debug_info['mask2'] = mask2
    self.debug_info['mask'] = mask

    # Avoid dividing by zero when the batch contains no valid pair.
    loss = tf.cond(
        tf.equal(num_pairs, 0),
        lambda: 0.,
        lambda: tf.reduce_sum(logloss * mask) / num_pairs)

    # Configure the optimizer.
    train_op, _, _ = model_utils.get_train_op(FLAGS, loss)

    self.loss, self.num_pairs, self.score, self.train_op = (
        loss, num_pairs, score, train_op)
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """Build the `TPUEstimatorSpec` for the labelling model.

    TRAIN returns loss and train op; EVAL returns loss plus precision and
    recall computed on masked labels/predictions; any other mode returns
    the predicted ids under the "predict" key. `labels` and `params` are
    not read; label ids are taken from `features["label_ids"]`.
    """

    def metric_fn(label_ids, predict_ids):
        """Return the tf.metrics precision and recall ops keyed by name."""
        return {
            "precision": tf.metrics.precision(
                labels=label_ids, predictions=predict_ids),
            "recall": tf.metrics.recall(
                labels=label_ids, predictions=predict_ids),
        }

    tf.logging.info("*** Features ***")
    for feature_name in sorted(features.keys()):
        tf.logging.info(" name = %s, shape = %s" %
                        (feature_name, features[feature_name].shape))

    input_ids = features["input_ids"]
    input_masks = features["input_masks"]
    segment_ids = features["segment_ids"]

    # Labels are only looked up when training or evaluating.
    has_labels = mode in [
        tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL
    ]
    if has_labels:
        label_ids = features["label_ids"]
    else:
        label_ids = None

    loss, predict_ids = self._create_model(
        model_config, run_config, input_ids, input_masks, segment_ids,
        label_ids, label_list, mode)

    scaffold_fn = model_utils.init_from_checkpoint(FLAGS)

    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op, _, _ = model_utils.get_train_op(FLAGS, loss)
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=loss,
            train_op=train_op,
            scaffold_fn=scaffold_fn)

    if mode == tf.estimator.ModeKeys.EVAL:
        masked_label_ids = self._get_masked_data(label_ids, label_list)
        masked_predict_ids = self._get_masked_data(predict_ids, label_list)
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=loss,
            eval_metrics=(metric_fn, [masked_label_ids, masked_predict_ids]),
            scaffold_fn=scaffold_fn)

    return tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        predictions={"predict": predict_ids},
        scaffold_fn=scaffold_fn)
def model_fn(features, labels, mode, params):
    """Build the training `TPUEstimatorSpec`, threading the memory cache.

    Only TRAIN mode is supported (asserted). When `FLAGS.mem_len > 0` the
    memories are read from `params['cache']` and the updated memories are
    attached to the returned spec as `train_spec.cache`; otherwise the
    attached cache is an empty list. Learning rate and gradient norm are
    added to the monitor dict that feeds the scalar host call.
    """
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    assert is_training

    # Retrieve `mems` from `params["cache"]`.
    mems = {}
    if FLAGS.mem_len > 0:
        mems['mems'] = params['cache']

    # Get loss from inputs.
    total_loss, new_mems, monitor_dict = function_builder.get_loss(
        FLAGS, features, labels, mems, is_training)

    # Turn `new_mems` into `new_cache`.
    new_cache = []
    if FLAGS.mem_len > 0:
        new_cache += new_mems['mems']

    # Log the number of trainable parameters.
    num_params = sum(np.prod(v.shape) for v in tf.trainable_variables())
    tf.logging.info('#params: {}'.format(num_params))

    # Configure the optimizer.
    train_op, learning_rate, gnorm = model_utils.get_train_op(
        FLAGS, total_loss)
    monitor_dict['lr'] = learning_rate
    monitor_dict['gnorm'] = gnorm

    # Customized initial checkpoint.
    scaffold_fn = model_utils.init_from_checkpoint(FLAGS, global_vars=True)

    # Host call that reports the monitored scalars under the "train/" prefix.
    host_call = function_builder.construct_scalar_host_call(
        monitor_dict=monitor_dict,
        model_dir=FLAGS.model_dir,
        prefix='train/',
        reduce_fn=tf.reduce_mean,
    )

    # Construct the training TPUEstimatorSpec and attach the new cache.
    train_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        host_call=host_call,
        scaffold_fn=scaffold_fn,
    )
    train_spec.cache = new_cache

    return train_spec
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """Build the `TPUEstimatorSpec` for the sentence classifier.

    TRAIN returns loss and train op; EVAL returns loss plus sentence
    accuracy; any other mode returns the predicted ids, scores and
    probabilities. `labels` and `params` are not read; label ids are taken
    from `features["sent_label_ids"]`.
    """

    def metric_fn(sent_label_ids, sent_predict_ids):
        """Return the tf.metrics accuracy op under "sent_accuracy"."""
        return {
            "sent_accuracy": tf.metrics.accuracy(
                labels=sent_label_ids, predictions=sent_predict_ids),
        }

    tf.logging.info("*** Features ***")
    for feature_name in sorted(features.keys()):
        tf.logging.info(" name = %s, shape = %s" %
                        (feature_name, features[feature_name].shape))

    input_ids = features["input_ids"]
    input_masks = features["input_masks"]
    segment_ids = features["segment_ids"]

    # Labels are only looked up when training or evaluating.
    has_labels = mode in [
        tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL
    ]
    if has_labels:
        sent_label_ids = features["sent_label_ids"]
    else:
        sent_label_ids = None

    model_outputs = self._create_model(
        input_ids, input_masks, segment_ids, sent_label_ids,
        sent_label_list, mode)
    loss, sent_predict_ids, sent_predict_scores, sent_predict_probs = (
        model_outputs)

    scaffold_fn = model_utils.init_from_checkpoint(FLAGS)

    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op, _, _ = model_utils.get_train_op(FLAGS, loss)
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=loss,
            train_op=train_op,
            scaffold_fn=scaffold_fn)

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=loss,
            eval_metrics=(metric_fn, [sent_label_ids, sent_predict_ids]),
            scaffold_fn=scaffold_fn)

    return tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        predictions={
            "sent_predict_id": sent_predict_ids,
            "sent_predict_score": sent_predict_scores,
            "sent_predict_prob": sent_predict_probs
        },
        scaffold_fn=scaffold_fn)
def xlnet_optimizer_layer(self):
    """Create the accuracy tensor, the train op and the saver.

    Sets `self.accuracy`, `self.train_op`, `self.learning_rate` and
    `self.saver`. As a side effect, overwrites `FLAGS.train_steps` (from
    the training-set length, batch size and epoch count) and
    `FLAGS.warmup_steps` (10% of the train steps) before the train op is
    built.
    """
    # Accuracy: fraction of positions where the arg-max over axis 2 of the
    # logits equals the target id.
    predicted_ids = tf.argmax(self.logits, 2)
    target_ids = tf.cast(self.targets, tf.int64)
    is_correct = tf.equal(predicted_ids, target_ids)
    self.accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))

    # The schedule lengths are written to FLAGS because get_train_op
    # receives FLAGS.
    FLAGS.train_steps = int(
        self.train_length / FLAGS.train_batch_size * self.max_epoch)
    FLAGS.warmup_steps = int(FLAGS.train_steps * 0.1)

    train_outputs = model_utils.get_train_op(FLAGS, self.loss)
    self.train_op, self.learning_rate, _ = train_outputs

    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)
def __init__(self, dictionary_size, embedding_size=_WORD_EMBED_SIZE,
             output_classes=3):
    """Build the classifier graph: embeddings -> RNN -> dense -> softmax.

    Args:
        dictionary_size: number of rows in the word-embedding table.
        embedding_size: width of each word embedding.
        output_classes: number of units in the final dense layer.

    Creates, inside a private `tf.Graph`, the input placeholders
    (`word_ids`, `word_ids_len`, `labels`), the prediction tensor
    `outputs`, the `loss`, the accuracy nodes (`acc_value`, `acc_op`,
    `acc_reset`), the merged `summary` and the `train_op`.
    """
    self.graph = tf.Graph()

    with self.graph.as_default():
        # Input placeholders; the batch dimension is left unspecified.
        self.word_ids = tf.placeholder(dtype=tf.int32, shape=(None, None))
        self.word_ids_len = tf.placeholder(dtype=tf.int32, shape=(None, ))
        self.labels = tf.placeholder(dtype=tf.int32, shape=(None, 1))

        # Word-embedding table and lookup.
        embedding_table = tf.get_variable(
            "word_embeddings", [dictionary_size, embedding_size])
        embedded_words = tf.nn.embedding_lookup(
            embedding_table, self.word_ids)

        # RNN on top of the embedded words.
        rnn_out = model_utils.rnn_vanilla(
            inputs=embedded_words,
            units=_RNN_UNITS,
            sequence_length=self.word_ids_len)

        # Predictions: arg-max of the softmax over the dense-layer logits.
        logits = tf.layers.dense(
            inputs=rnn_out, units=output_classes, activation=None)
        probabilities = tf.nn.softmax(logits=logits, axis=-1)
        self.outputs = tf.argmax(probabilities, axis=-1)

        # Loss.
        self.loss = tf.losses.sparse_softmax_cross_entropy(
            labels=self.labels, logits=logits)

        # Accuracy value, update op and reset op.
        accuracy_nodes = model_utils.accuracy(
            labels=self.labels, predictions=self.outputs)
        self.acc_value, self.acc_op, self.acc_reset = accuracy_nodes

        # Summaries for monitoring training progress.
        tf.summary.scalar("loss", self.loss)
        tf.summary.scalar("acc", self.acc_value)
        self.summary = tf.summary.merge_all()

        # Training operation.
        self.train_op = model_utils.get_train_op(
            self.loss, learning_rate=1e-4)
def model_fn(features, labels, mode, params):
    """Build the evaluation `EstimatorSpec`, reading the memory cache.

    Only EVAL mode is supported (asserted). When `FLAGS.mem_len > 0` the
    memories are read from `params['cache']`. The returned spec carries the
    total loss, an "accuracy" eval metric and, when a checkpoint scaffold
    is available, that scaffold.
    """
    is_eval = mode == tf.estimator.ModeKeys.EVAL
    assert is_eval

    # Retrieve `mems` from `params["cache"]`.
    mems = {}
    if FLAGS.mem_len > 0:
        mems['mems'] = params['cache']

    # Get loss and accuracy from inputs (graph built in non-training mode).
    total_loss, total_accuracy, new_mems, monitor_dict = (
        custom_function_builder.get_loss(FLAGS, features, labels, mems, False))

    # Turn `new_mems` into `new_cache`.
    # NOTE(review): `new_cache` is built but not attached to the spec here.
    new_cache = []
    if FLAGS.mem_len > 0:
        new_cache += new_mems['mems']

    # Log the number of trainable parameters.
    num_params = sum(np.prod(v.shape) for v in tf.trainable_variables())
    tf.logging.info('#params: {}'.format(num_params))

    # NOTE(review): the optimizer is still built in EVAL mode, as in the
    # original; confirm whether this is needed before removing it.
    train_op, learning_rate, gnorm = model_utils.get_train_op(
        FLAGS, total_loss)
    monitor_dict['lr'] = learning_rate
    monitor_dict['gnorm'] = gnorm

    # Customized initial checkpoint.
    scaffold_fn = model_utils.init_from_checkpoint(FLAGS, global_vars=True)
    # `EstimatorSpec.scaffold` takes a Scaffold instance, not a factory. If
    # the helper handed back a callable, resolve it; None or an instance is
    # passed through unchanged.
    scaffold = scaffold_fn() if callable(scaffold_fn) else scaffold_fn

    output_spec = tf.estimator.EstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        eval_metric_ops={'accuracy': total_accuracy},
        scaffold=scaffold,
    )

    return output_spec
def model_fn(features, labels, mode, params):
    """Build the `EstimatorSpec` for the XLNet + CRF sequence labeller.

    EVAL returns the CRF loss and an "eval_loss" metric, which is the mean
    squared error between label ids and predicted ids. PREDICT returns
    logits, labels, predicted ids and the input mask. Any other mode
    returns loss and train op. `labels` and `params` are not read; label
    ids are taken from `features["label_ids"]`.
    """
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    # The first two axes of every input are swapped before being handed to
    # XLNetModel.
    inp = tf.transpose(features["input_ids"], [1, 0])
    seg_id = tf.transpose(features["segment_ids"], [1, 0])
    inp_mask = tf.transpose(features["input_mask"], [1, 0])
    label_ids = features["label_ids"]

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)

    xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                   run_config=run_config,
                                   input_ids=inp,
                                   seg_ids=seg_id,
                                   input_mask=inp_mask)

    # Sequence output transposed back to
    # [batch_size, seq_length, embedding_size].
    xlnet_model_out = xlnet_model.get_sequence_output()
    embedding = tf.transpose(xlnet_model_out, [1, 0, 2])
    max_seq_length = embedding.shape[1].value

    # True sequence lengths: count the non-zero ids in each row, giving a
    # [batch_size] vector. Assumes id 0 is padding -- TODO confirm.
    used = tf.sign(tf.abs(features["input_ids"]))
    lengths = tf.reduce_sum(used, axis=1)

    # CRF output layer.
    blstm_crf = BLSTM_CRF(embedded_chars=embedding,
                          hidden_unit=10,
                          cell_type="lstm",
                          num_layers=1,
                          dropout_rate=0.5,
                          initializers=initializers,
                          num_labels=n_class,
                          seq_length=max_seq_length,
                          labels=label_ids,
                          lengths=lengths,
                          is_training=is_training)
    total_loss, logits, trans, pred_ids = blstm_crf.add_blstm_crf_layer(
        crf_only=True)

    # Log the number of trainable parameters.
    num_params = sum(np.prod(v.shape) for v in tf.trainable_variables())
    tf.logging.info('#params: {}'.format(num_params))

    # Load pretrained weights. The return value is not needed because only
    # plain EstimatorSpecs are built below.
    model_utils.init_from_checkpoint(FLAGS)

    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(label_ids, pred_ids):
            """Return the MSE between label ids and predicted ids."""
            return {
                "eval_loss":
                    tf.metrics.mean_squared_error(labels=label_ids,
                                                  predictions=pred_ids),
            }

        eval_metrics = metric_fn(features["label_ids"], pred_ids)
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=total_loss, eval_metric_ops=eval_metrics)

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            "logits": logits,
            "labels": label_ids,
            "pred_ids": pred_ids,
            "input_mask": features["input_mask"]
        }
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Configure the optimizer.
    train_op, _, _ = model_utils.get_train_op(FLAGS, total_loss)

    # Construct the training EstimatorSpec.
    return tf.estimator.EstimatorSpec(
        mode=mode, loss=total_loss, train_op=train_op)
def train(ps_device):
    """Build the multi-GPU training graph, dump profiling data, and train.

    Steps, in order:
      1. Build the training input pipeline and split each batch across
         `FLAGS.num_core_per_host` towers.
      2. Build one tower per core, average losses/gradients, and create the
         train op.
      3. Inside a session: write a DOT dump of the graph, per-operation
         output sizes and per-operation attributes under `profs/`.
      4. Run the training loop until `FLAGS.train_steps`, feeding each
         step's new memories back in, periodically logging and saving, and
         running some steps with software tracing enabled.

    Args:
        ps_device: passed to `assign_to_gpu` when placing each tower.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source. In particular confirm (a) which `if` each
    `count1` else-branch belongs to, and (b) whether
    `profile(run_metadata, iter)` was live code or part of the commented
    line -- it is kept commented here.
    """
    # Get input function and model function
    train_input_fn, record_info_dict = data_utils.get_input_fn(
        tfrecord_dir=FLAGS.record_info_dir,
        split="train",
        bsz_per_host=FLAGS.train_batch_size,
        seq_len=FLAGS.seq_len,
        reuse_len=FLAGS.reuse_len,
        bi_data=FLAGS.bi_data,
        num_hosts=1,
        num_core_per_host=1,  # set to one no matter how many GPUs
        perm_size=FLAGS.perm_size,
        mask_alpha=FLAGS.mask_alpha,
        mask_beta=FLAGS.mask_beta,
        uncased=FLAGS.uncased,
        num_passes=FLAGS.num_passes,
        use_bfloat16=FLAGS.use_bfloat16,
        num_predict=FLAGS.num_predict)

    # for key, info in record_info_dict.items():
    tf.logging.info("num of batches {}".format(record_info_dict["num_batch"]))

    # Create input tensors / placeholders
    bsz_per_core = FLAGS.train_batch_size // FLAGS.num_core_per_host

    params = {
        "batch_size": FLAGS.train_batch_size  # the whole batch
    }
    train_set = train_input_fn(params)

    example = train_set.make_one_shot_iterator().get_next()

    # Split every feature along axis 0 into one slice per tower.
    if FLAGS.num_core_per_host > 1:
        examples = [{} for _ in range(FLAGS.num_core_per_host)]
        for key in example.keys():
            vals = tf.split(example[key], FLAGS.num_core_per_host, 0)
            for device_id in range(FLAGS.num_core_per_host):
                examples[device_id][key] = vals[device_id]
    else:
        examples = [example]

    # Create computational graph
    tower_mems, tower_losses, tower_new_mems, tower_grads_and_vars = [], [], [], []

    for i in range(FLAGS.num_core_per_host):
        # Variables are created by the first tower and reused by the rest.
        reuse = True if i > 0 else None
        with tf.device(assign_to_gpu(i, ps_device)), \
                tf.variable_scope(tf.get_variable_scope(), reuse=reuse):

            # The mems for each tower is a dictionary
            mems_i = {}
            if FLAGS.mem_len:
                mems_i["mems"] = create_mems_tf(bsz_per_core)

            loss_i, new_mems_i, grads_and_vars_i = single_core_graph(
                is_training=True,
                features=examples[i],
                mems=mems_i)

            tower_mems.append(mems_i)
            tower_losses.append(loss_i)
            tower_new_mems.append(new_mems_i)
            tower_grads_and_vars.append(grads_and_vars_i)

    # average losses and gradients across towers
    if len(tower_losses) > 1:
        loss = tf.add_n(tower_losses) / len(tower_losses)
        grads_and_vars = average_grads_and_vars(tower_grads_and_vars)
    else:
        loss = tower_losses[0]
        grads_and_vars = tower_grads_and_vars[0]

    # get train op (gradients are supplied directly, so the loss slot is None)
    train_op, learning_rate, gnorm = model_utils.get_train_op(FLAGS, None,
        grads_and_vars=grads_and_vars)
    global_step = tf.train.get_global_step()

    # Training loop
    # initialize mems
    tower_mems_np = []
    for i in range(FLAGS.num_core_per_host):
        mems_i_np = {}
        for key in tower_mems[i].keys():
            mems_i_np[key] = initialize_mems_np(bsz_per_core)
        tower_mems_np.append(mems_i_np)

    saver = tf.train.Saver()

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.97)#allow_growth=True)

    model_utils.init_from_checkpoint(FLAGS, global_vars=True)

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False,
            gpu_options=gpu_options)) as sess:
        sess.run(tf.global_variables_initializer())
        # No graph ops may be added to the session graph after this point.
        sess.graph.finalize()
        # Reused for every traced sess.run call in the loop below.
        run_metadata = tf.RunMetadata()
        options = tf.RunOptions(trace_level=tf.RunOptions.SOFTWARE_TRACE)

        # Dump the graph in DOT format.
        dot_rep = graph_to_dot(tf.get_default_graph())
        # s = Source(dot_rep, filename="test.gv", format="PNG")
        with open('profs/xln.dot', 'w') as fwr:
            fwr.write(str(dot_rep))

        # operation name -> size in bytes of its first output (-1 if unknown)
        operations_tensors = {}
        # operation name -> [op type, whether output 0 has a ref dtype]
        operations_attributes = {}
        operations_names = tf.get_default_graph().get_operations()
        count1 = 0  # operations whose first output has unknown shape/dtype size
        count2 = 0  # operations with no outputs at all
        for operation in operations_names:
            operation_name = operation.name
            operations_info = tf.get_default_graph(
            ).get_operation_by_name(operation_name).values()

            # Best effort: if "<name>:0" cannot be looked up the entry keeps
            # whatever was appended before the failure.
            try:
                operations_attributes[operation_name] = []
                operations_attributes[operation_name].append(
                    operation.type)
                operations_attributes[operation_name].append(tf.get_default_graph(
                ).get_tensor_by_name(operation_name + ':0').dtype._is_ref_dtype)
            except:
                pass

            if len(operations_info) > 0:
                if not (operations_info[0].shape.ndims is None):
                    operation_shape = operations_info[0].shape.as_list(
                    )
                    operation_dtype_size = operations_info[0].dtype.size
                    if not (operation_dtype_size is None):
                        # Product of the known dimensions; unknown (None)
                        # dimensions are skipped.
                        operation_no_of_elements = 1
                        for dim in operation_shape:
                            if not(dim is None):
                                operation_no_of_elements = operation_no_of_elements * dim
                        total_size = operation_no_of_elements * operation_dtype_size
                        operations_tensors[operation_name] = total_size
                    else:
                        count1 = count1 + 1
                else:
                    count1 = count1 + 1
                    operations_tensors[operation_name] = -1

                    # print('no shape_1: ' + operation_name)
                    # print('no shape_2: ' + str(operations_info))
                    # operation_namee = operation_name + ':0'
                    # tensor = tf.get_default_graph().get_tensor_by_name(operation_namee)
                    # print('no shape_3:' + str(tf.shape(tensor)))
                    # print('no shape:' + str(tensor.get_shape()))

            else:
                # print('no info :' + operation_name)
                # operation_namee = operation.name + ':0'
                count2 = count2 + 1
                operations_tensors[operation_name] = -1

                # try:
                #     tensor = tf.get_default_graph().get_tensor_by_name(operation_namee)
                #     print(tensor)
                #     print(tf.shape(tensor))
                # except:
                #     print('no tensor: ' + operation_namee)
        print(count1)
        print(count2)

        # One line per operation: "<name>"::<size>
        with open('./profs/tensors_sz_32.txt', 'w') as f:
            for tensor, size in operations_tensors.items():
                f.write('"' + tensor + '"::' + str(size) + '\n')

        # One line per operation: <name>::<attr>::<attr>...
        with open('./profs/operations_attributes.txt', 'w') as f:
            for op, attrs in operations_attributes.items():
                strr = op
                for attr in attrs:
                    strr += '::' + str(attr)
                strr += '\n'
                f.write(strr)

        fetches = [loss, tower_new_mems, global_step, gnorm, learning_rate, train_op]
        iter = 0
        total_loss, prev_step = 0., -1
        while True:
            # Feed the previous step's memories into each tower's placeholders.
            feed_dict = {}
            for i in range(FLAGS.num_core_per_host):
                for key in tower_mems_np[i].keys():
                    for m, m_np in zip(tower_mems[i][key], tower_mems_np[i][key]):
                        feed_dict[m] = m_np
            # Trace the first step and every step with iter % 10 == 7; time
            # the rest.
            if iter % 10 == 7 or iter == 0:
                fetched = sess.run(fetches, feed_dict=feed_dict, options=options, run_metadata=run_metadata)
                #if iter > 0: profile(run_metadata, iter)
            else:
                t0 = time.time()
                fetched = sess.run(fetches, feed_dict=feed_dict)
                print(time.time() - t0)

            # After the first (traced) step, write a memory profile built
            # from run_metadata.
            if iter == 0:
                mem_options = tf.profiler.ProfileOptionBuilder.time_and_memory()
                mem_options["min_bytes"] = 0
                mem_options["min_micros"] = 0
                mem_options["output"] = 'file:outfile=./profs/mem.txt'
                mem_options["select"] = ("bytes", "peak_bytes", "output_bytes",
                                         "residual_bytes")
                # A fresh empty tf.Graph() is passed together with the run
                # metadata, not the session graph.
                mem = tf.profiler.profile(
                    tf.Graph(), run_meta=run_metadata, cmd="scope", options=mem_options)
                with open('profs/mem2.txt', 'w') as f:
                    f.write(str(mem))

            iter += 1

            # fetched = [loss, new mems, global step, gnorm, lr, train_op];
            # the new memories become the next step's inputs.
            loss_np, tower_mems_np, curr_step = fetched[:3]
            total_loss += loss_np

            if curr_step > 0 and curr_step % FLAGS.iterations == 0:
                # Average loss since the last report.
                curr_loss = total_loss / (curr_step - prev_step)
                tf.logging.info("[{}] | gnorm {:.2f} lr {:8.6f} "
                    "| loss {:.2f} | pplx {:>7.2f}, bpc {:>7.4f}".format(
                    curr_step, fetched[-3], fetched[-2],
                    curr_loss, math.exp(curr_loss), curr_loss / math.log(2)))
                total_loss, prev_step = 0., curr_step

            if curr_step > 0 and curr_step % FLAGS.save_steps == 0:
                save_path = os.path.join(FLAGS.model_dir, "model.ckpt")
                saver.save(sess, save_path)
                tf.logging.info("Model saved in path: {}".format(save_path))

            if curr_step >= FLAGS.train_steps:
                break
def model_fn(features, labels, mode, params):
    """Build the estimator spec for the RACE model.

    EVAL returns loss plus weighted accuracy and mean-loss metrics (and
    asserts `FLAGS.num_hosts == 1`); every other mode returns loss and
    train op. A `TPUEstimatorSpec` is produced when `FLAGS.use_tpu` is set,
    otherwise a plain `EstimatorSpec`. `labels` and `params` are not read.
    """
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    total_loss, per_example_loss, logits = function_builder.get_race_loss(
        FLAGS, features, is_training)

    # Log the number of trainable parameters.
    num_params = sum(np.prod(var.shape) for var in tf.trainable_variables())
    logger.info('#params: {}'.format(num_params))

    # Load pretrained weights.
    scaffold_fn = model_utils.init_from_checkpoint(FLAGS)

    if mode == tf.estimator.ModeKeys.EVAL:
        assert FLAGS.num_hosts == 1

        def metric_fn(per_example_loss, label_ids, logits, is_real_example):
            """Return accuracy and mean loss, both weighted by the mask."""
            predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
            accuracy = tf.metrics.accuracy(
                labels=label_ids,
                predictions=predictions,
                weights=is_real_example)
            loss = tf.metrics.mean(
                values=per_example_loss, weights=is_real_example)
            return {
                'eval_accuracy': accuracy,
                'eval_loss': loss}

        is_real_example = tf.cast(
            features["is_real_example"], dtype=tf.float32)
        label_ids = tf.reshape(features['label_ids'], [-1])
        metric_args = [per_example_loss, label_ids, logits, is_real_example]

        if not FLAGS.use_tpu:
            return tf.estimator.EstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metric_ops=metric_fn(*metric_args))

        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            eval_metrics=(metric_fn, metric_args),
            scaffold_fn=scaffold_fn)

    # Configure the optimizer.
    train_op, learning_rate, _ = model_utils.get_train_op(FLAGS, total_loss)

    monitor_dict = {"lr": learning_rate}

    if not FLAGS.use_tpu:
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=total_loss, train_op=train_op)

    # No host call is attached on TPU.
    return tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        host_call=None,
        scaffold_fn=scaffold_fn)
def model_fn(features, labels, mode, params):
    """Build the estimator spec for the classification head.

    PREDICT returns the feature ids and class logits. Every other mode
    computes a one-hot cross-entropy over `cls_log_probs` against
    `features["cls"]` and returns loss and train op. On TPU the training
    spec carries a scalar host call (loss, learning rate, accuracy) unless
    `FLAGS.is_regression` is set. `labels` and `params` are not read.
    """
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    return_dict = function_builder.get_classification_outputs(
        FLAGS, features, is_training)
    cls_logits = return_dict["cls_logits"]

    # Log the number of trainable parameters.
    num_params = sum(np.prod(var.shape) for var in tf.trainable_variables())
    logger.info('#params: {}'.format(num_params))

    # Load pretrained weights.
    scaffold_fn = model_utils.init_from_checkpoint(FLAGS)

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            "feature_id": features["feature_id"],
            "cls_logits": cls_logits,
        }
        if not FLAGS.use_tpu:
            return tf.estimator.EstimatorSpec(
                mode=mode, predictions=predictions)
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)

    def compute_loss(log_probs, positions, depth):
        """Mean negative log-probability of the one-hot target class."""
        one_hot_positions = tf.one_hot(positions, depth=depth,
                                       dtype=tf.float32)
        per_example = -tf.reduce_sum(one_hot_positions * log_probs, axis=-1)
        return tf.reduce_mean(per_example)

    cls_log_probs = return_dict["cls_log_probs"]

    # A truthy num_choices takes precedence over num_classes.
    num_choices = FLAGS.num_choices
    num_classes = num_choices if num_choices else FLAGS.num_classes

    total_loss = compute_loss(cls_log_probs, features["cls"],
                              depth=num_classes)

    # Configure the optimizer.
    train_op, learning_rate, _ = model_utils.get_train_op(FLAGS, total_loss)

    monitor_dict = {'loss/cls': total_loss, "lr": learning_rate}

    if not FLAGS.use_tpu:
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=total_loss, train_op=train_op)

    if FLAGS.is_regression:
        host_call = None
    else:
        # Batch accuracy is reported alongside loss and learning rate.
        label_ids = tf.reshape(features['cls'], [-1])
        predictions = tf.argmax(cls_logits, axis=-1,
                                output_type=label_ids.dtype)
        is_correct = tf.equal(predictions, label_ids)
        monitor_dict["accuracy"] = tf.reduce_mean(
            tf.cast(is_correct, tf.float32))

        host_call = function_builder.construct_scalar_host_call(
            monitor_dict=monitor_dict,
            model_dir=FLAGS.model_dir,
            prefix="train/",
            reduce_fn=tf.reduce_mean)

    return tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        host_call=host_call,
        scaffold_fn=scaffold_fn)
def model_fn(features, labels, mode, params):
    """Build the estimator spec for the question-answering model.

    PREDICT returns the unique ids, the top start/end indices and log
    probabilities, and the class logits; no checkpoint is loaded in that
    mode. Every other mode returns loss and train op, where the loss is the
    mean of the start and end span losses plus half the sigmoid
    cross-entropy on the `is_impossible` flag. `labels` and `params` are
    not read.
    """
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    outputs = function_builder.get_qa_outputs(FLAGS, features, is_training)

    # Log the number of trainable parameters.
    num_params = sum(np.prod(var.shape) for var in tf.trainable_variables())
    tf.logging.info('#params: {}'.format(num_params))

    if mode == tf.estimator.ModeKeys.PREDICT:
        if FLAGS.init_checkpoint:
            tf.logging.info("init_checkpoint not being used in predict mode.")

        predictions = {"unique_ids": features["unique_ids"]}
        for output_key in ("start_top_index", "start_top_log_probs",
                           "end_top_index", "end_top_log_probs",
                           "cls_logits"):
            predictions[output_key] = outputs[output_key]

        if not FLAGS.use_tpu:
            return tf.estimator.EstimatorSpec(
                mode=mode, predictions=predictions)
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, predictions=predictions, scaffold_fn=None)

    seq_length = tf.shape(features["input_ids"])[1]

    def compute_loss(log_probs, positions):
        """Mean negative log-probability of the one-hot target position."""
        one_hot_positions = tf.one_hot(
            positions, depth=seq_length, dtype=tf.float32)
        per_example = -tf.reduce_sum(one_hot_positions * log_probs, axis=-1)
        return tf.reduce_mean(per_example)

    start_loss = compute_loss(
        outputs["start_log_probs"], features["start_positions"])
    end_loss = compute_loss(
        outputs["end_log_probs"], features["end_positions"])
    span_loss = (start_loss + end_loss) * 0.5

    cls_logits = outputs["cls_logits"]
    is_impossible = tf.reshape(features["is_impossible"], [-1])
    regression_loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            labels=is_impossible, logits=cls_logits))

    # The answerability loss is scaled by 0.5 so its magnitude is
    # comparable to start_loss and end_loss.
    total_loss = span_loss + regression_loss * 0.5

    # Configure the optimizer.
    train_op, learning_rate, _ = model_utils.get_train_op(FLAGS, total_loss)

    monitor_dict = {"lr": learning_rate}

    # Load pretrained weights.
    scaffold_fn = model_utils.init_from_checkpoint(FLAGS)

    if not FLAGS.use_tpu:
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=total_loss, train_op=train_op)

    host_call = function_builder.construct_scalar_host_call(
        monitor_dict=monitor_dict,
        model_dir=FLAGS.model_dir,
        prefix="train/",
        reduce_fn=tf.reduce_mean)

    return tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        host_call=host_call,
        scaffold_fn=scaffold_fn)
def train(ps_device):
    """Train on one or more GPUs and validate after every epoch.

    Builds one training tower and one validation tower per core (sharing
    variables), averages losses and gradients across towers, and then runs
    `FLAGS.epochs` passes over the training iterator inside a raw
    `tf.Session`. After each pass the validation iterator is consumed once and
    the mean validation loss / perplexity is logged and written to
    TensorBoard.

    Args:
        ps_device: device string forwarded to `assign_to_gpu` for every tower.
    """
    ##### Get input function and model function
    # Arguments shared by the training and validation input pipelines.
    shared_input_kwargs = dict(
        bsz_per_host=FLAGS.train_batch_size,
        seq_len=FLAGS.seq_len,
        reuse_len=FLAGS.reuse_len,
        bi_data=FLAGS.bi_data,
        num_hosts=1,
        num_core_per_host=1,  # set to one no matter how many GPUs
        perm_size=FLAGS.perm_size,
        mask_alpha=FLAGS.mask_alpha,
        mask_beta=FLAGS.mask_beta,
        use_bfloat16=FLAGS.use_bfloat16,
        num_predict=FLAGS.num_predict)

    train_input_fn, record_info_dict = data_utils.get_input_fn(
        info_dir=os.path.join(FLAGS.record_info_dir, "train"),
        split="train",
        **shared_input_kwargs)
    valid_input_fn, record_info_dict_valid = data_utils.get_input_fn(
        info_dir=os.path.join(FLAGS.record_info_dir, "valid"),
        split="valid",
        **shared_input_kwargs)

    num_train_batches = record_info_dict["num_batch"]
    tf.logging.info("num of train batches {}".format(
        record_info_dict["num_batch"]))
    tf.logging.info("num of validation batches {}".format(
        record_info_dict_valid["num_batch"]))

    ##### Create input tensors / placeholders
    num_cores = FLAGS.num_core_per_host
    bsz_per_core = FLAGS.train_batch_size // num_cores

    params = {
        "batch_size": FLAGS.train_batch_size  # the whole batch
    }
    train_set = train_input_fn(params)
    valid_set = valid_input_fn(params)

    t_iter = train_set.make_initializable_iterator()
    example = t_iter.get_next()
    v_iter = valid_set.make_initializable_iterator()
    v_example = v_iter.get_next()

    def _split_across_cores(batch):
        """Split every tensor of `batch` along axis 0, one shard per core."""
        if num_cores <= 1:
            return [batch]
        shards = [{} for _ in range(num_cores)]
        for key in batch.keys():
            vals = tf.split(batch[key], num_cores, 0)
            for device_id in range(num_cores):
                shards[device_id][key] = vals[device_id]
        return shards

    examples = _split_across_cores(example)
    v_examples = _split_across_cores(v_example)

    ##### Create computational graph
    tower_mems, tower_losses, tower_new_mems, tower_grads_and_vars = \
        [], [], [], []
    v_tower_mems, v_tower_losses, v_tower_new_mems = [], [], []

    for core in range(num_cores):
        # Towers after the first one reuse the variables of the first.
        reuse = True if core > 0 else None
        with tf.device(assign_to_gpu(core, ps_device)), \
                tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
            # The mems for each tower is a dictionary
            mems_i = {}
            v_mems_i = {}
            if FLAGS.mem_len:
                mems_i["mems"] = create_mems_tf(bsz_per_core)
                v_mems_i["mems"] = create_mems_tf(bsz_per_core)

            loss_i, new_mems_i, grads_and_vars_i = single_core_graph(
                is_training=True, features=examples[core], mems=mems_i)
            v_loss_i, v_new_mems_i = single_core_graph(
                is_training=False, features=v_examples[core], mems=v_mems_i)

            tower_mems.append(mems_i)
            tower_losses.append(loss_i)
            tower_new_mems.append(new_mems_i)
            tower_grads_and_vars.append(grads_and_vars_i)

            v_tower_mems.append(v_mems_i)
            v_tower_losses.append(v_loss_i)
            v_tower_new_mems.append(v_new_mems_i)

    ## average losses and gradients across towers
    if len(tower_losses) > 1:
        loss = tf.add_n(tower_losses) / len(tower_losses)
        grads_and_vars = average_grads_and_vars(tower_grads_and_vars)
    else:
        loss = tower_losses[0]
        grads_and_vars = tower_grads_and_vars[0]

    if len(v_tower_losses) > 1:
        v_loss = tf.add_n(v_tower_losses) / len(v_tower_losses)
    else:
        v_loss = v_tower_losses[0]

    ## get train op
    train_op, learning_rate, gnorm = model_utils.get_train_op(
        FLAGS, None, num_train_batches, grads_and_vars=grads_and_vars)
    global_step = tf.train.get_global_step()

    ##### Training loop
    # initialize mems
    tower_mems_np = []
    v_tower_mems_np = []
    for core in range(num_cores):
        mems_i_np = {}
        v_mems_i_np = {}
        for key in tower_mems[core].keys():
            mems_i_np[key] = initialize_mems_np(bsz_per_core)
            v_mems_i_np[key] = initialize_mems_np(bsz_per_core)
        tower_mems_np.append(mems_i_np)
        v_tower_mems_np.append(v_mems_i_np)

    saver = tf.train.Saver()

    gpu_options = tf.GPUOptions(allow_growth=True)

    model_utils.init_from_checkpoint(FLAGS, global_vars=True)

    # Create performance summaries for Tensorboard logging
    training_performance_summaries, valid_performance_summaries = \
        tb.tensorboard_setup()

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                          gpu_options=gpu_options)) as sess:
        sess.run(tf.global_variables_initializer())

        # variables that are run in the session
        fetches = [
            loss, tower_new_mems, global_step, gnorm, learning_rate, train_op
        ]
        v_fetches = [v_loss, v_tower_new_mems]

        # Create writers for Tensorboard logging
        info_dict = {
            "id": FLAGS.run_id,
            "n_layers": FLAGS.n_layers,
            "d_model": FLAGS.d_model,
            "n_heads": FLAGS.n_head
        }
        train_summary_writer, valid_summary_writer = tb.create_writers(
            sess, info_dict, logging_dir=FLAGS.tb_logging_dir)

        total_loss, prev_step = 0., -1
        # Pre-bound so the validation log is well defined even when the
        # training iterator yields no batch.
        curr_step = 0

        # `epoch` and `core` are kept distinct: the original reused `i` for
        # both, so the epoch log printed the last core index instead.
        for epoch in range(FLAGS.epochs):

            # Train loop
            try:
                sess.run(t_iter.initializer)
                while True:
                    # Feed the memory produced by the previous step.
                    feed_dict = {}
                    for core in range(num_cores):
                        for key in tower_mems_np[core].keys():
                            for m, m_np in zip(tower_mems[core][key],
                                               tower_mems_np[core][key]):
                                feed_dict[m] = m_np

                    fetched = sess.run(fetches, feed_dict=feed_dict)
                    loss_np, tower_mems_np, curr_step = fetched[:3]
                    total_loss += loss_np

                    # Log training progress
                    if curr_step > 0 and curr_step % FLAGS.log_steps == 0:
                        curr_loss = total_loss / (curr_step - prev_step)
                        summ = tb.run_train(
                            sess, training_performance_summaries, curr_loss)
                        train_summary_writer.add_summary(summ, curr_step)
                        # fetched[-3] is gnorm, fetched[-2] the learning rate.
                        tf.logging.info(
                            "[{}] | gnorm {:.2f} lr {:8.6f} "
                            "| loss {:.2f} | pplx {:>7.2f}, bpc {:>7.4f}".
                            format(curr_step, fetched[-3], fetched[-2],
                                   curr_loss, math.exp(curr_loss),
                                   curr_loss / math.log(2)))
                        total_loss, prev_step = 0., curr_step

                    # Save checkpoint
                    if (curr_step > 0 and FLAGS.save_steps is not None
                            and curr_step % FLAGS.save_steps == 0):
                        save_path = os.path.join(FLAGS.model_dir, "model.ckpt")
                        saver.save(sess, save_path)
                        tf.logging.info(
                            "Model saved in path: {}".format(save_path))

            except tf.errors.OutOfRangeError:
                # Training iterator exhausted: the epoch is over.
                pass

            # Validation loop
            v_total_loss, v_steps = 0., 0
            try:
                sess.run(v_iter.initializer)
                while True:
                    v_feed_dict = {}
                    for core in range(num_cores):
                        for key in v_tower_mems_np[core].keys():
                            for m, m_np in zip(v_tower_mems[core][key],
                                               v_tower_mems_np[core][key]):
                                v_feed_dict[m] = m_np

                    v_fetched = sess.run(v_fetches, feed_dict=v_feed_dict)
                    v_loss_np, v_tower_mems_np = v_fetched[:]
                    v_total_loss += v_loss_np
                    v_steps += 1

            except tf.errors.OutOfRangeError:
                # Validation iterator exhausted.
                pass

            if v_steps > 0:
                val_loss = v_total_loss / v_steps
                v_pplx = math.exp(val_loss)
                tf.logging.info(
                    "Validation: [{}] | loss {:.2f} | pplx {:>7.2f}".format(
                        curr_step, val_loss, v_pplx))

                summ_valid = tb.run_valid(
                    sess, valid_performance_summaries, val_loss, v_pplx)
                valid_summary_writer.add_summary(summ_valid, curr_step)
            else:
                # Avoid a ZeroDivisionError on an empty validation set.
                tf.logging.warning(
                    "Validation: [{}] | no validation batches were "
                    "produced".format(curr_step))

            tf.logging.info("------------ Epoch {} ------------".format(epoch))
def model_fn(features, labels, mode, params):
    """Estimator `model_fn` for sequence classification / regression.

    The loss comes from `function_builder.get_regression_loss` when
    `FLAGS.is_regression` is set, otherwise from
    `function_builder.get_classification_loss` with `n_class` classes.

    * EVAL: returns loss plus metrics (accuracy / F1 for classification,
      Pearson correlation for regression), all weighted by
      `features["is_real_example"]`.
    * PREDICT: returns logits, labels and the real-example flag.
    * otherwise: returns the training spec.

    Args:
        features: dict of input tensors; "label_ids" and "is_real_example"
            are read here, the rest is consumed by `function_builder`.
        labels: unused.
        mode: a `tf.estimator.ModeKeys` value.
        params: unused.

    Returns:
        A `TPUEstimatorSpec` when `FLAGS.use_tpu` is set, otherwise an
        `EstimatorSpec`.
    """
    #### Training or Evaluation
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    #### Get loss from inputs
    if FLAGS.is_regression:
        (total_loss, per_example_loss, logits
         ) = function_builder.get_regression_loss(
             FLAGS, features, is_training)
    else:
        (total_loss, per_example_loss, logits
         ) = function_builder.get_classification_loss(
             FLAGS, features, n_class, is_training)

    #### Check model parameters
    num_params = sum(np.prod(v.shape) for v in tf.trainable_variables())
    tf.logging.info('#params: {}'.format(num_params))

    #### load pretrained models
    scaffold_fn = model_utils.init_from_checkpoint(FLAGS)

    #### Evaluation mode
    if mode == tf.estimator.ModeKeys.EVAL:
        assert FLAGS.num_hosts == 1

        def metric_fn(per_example_loss, label_ids, logits, is_real_example):
            """Classification metrics; padded examples carry weight 0."""
            predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
            eval_input_dict = {
                'labels': label_ids,
                'predictions': predictions,
                'weights': is_real_example
            }
            accuracy = tf.metrics.accuracy(**eval_input_dict)

            loss = tf.metrics.mean(values=per_example_loss,
                                   weights=is_real_example)
            # Weighted like accuracy and loss so that padding examples do not
            # count towards F1 (the weights were previously omitted).
            # NOTE(review): macro precision/recall/MCC cannot be computed here
            # by evaluating tensors eagerly; metric_fn runs at graph
            # construction time, before the input iterator is initialised.
            f1 = tf.contrib.metrics.f1_score(
                label_ids, predictions, weights=is_real_example)

            return {'eval_accuracy': accuracy, 'eval_loss': loss, 'f1': f1}

        def regression_metric_fn(per_example_loss, label_ids, logits,
                                 is_real_example):
            """Regression metrics: mean loss and Pearson correlation."""
            loss = tf.metrics.mean(values=per_example_loss,
                                   weights=is_real_example)
            pearsonr = tf.contrib.metrics.streaming_pearson_correlation(
                logits, label_ids, weights=is_real_example)
            return {'eval_loss': loss, 'eval_pearsonr': pearsonr}

        is_real_example = tf.cast(features["is_real_example"],
                                  dtype=tf.float32)

        #### Constructing evaluation TPUEstimatorSpec with new cache.
        label_ids = tf.reshape(features['label_ids'], [-1])

        eval_metric_fn = (regression_metric_fn if FLAGS.is_regression
                          else metric_fn)
        metric_args = [
            per_example_loss, label_ids, logits, is_real_example
        ]

        if FLAGS.use_tpu:
            eval_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metrics=(eval_metric_fn, metric_args),
                scaffold_fn=scaffold_fn)
        else:
            eval_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metric_ops=eval_metric_fn(*metric_args))

        return eval_spec

    elif mode == tf.estimator.ModeKeys.PREDICT:
        label_ids = tf.reshape(features["label_ids"], [-1])

        predictions = {
            "logits": logits,
            "labels": label_ids,
            "is_real": features["is_real_example"]
        }

        if FLAGS.use_tpu:
            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
        else:
            output_spec = tf.estimator.EstimatorSpec(
                mode=mode, predictions=predictions)
        return output_spec

    #### Configuring the optimizer
    train_op, learning_rate, _ = model_utils.get_train_op(
        FLAGS, total_loss)

    monitor_dict = {}
    monitor_dict["lr"] = learning_rate

    #### Constructing training TPUEstimatorSpec with new cache.
    if FLAGS.use_tpu:
        #### Creating host calls
        if not FLAGS.is_regression:
            # Training accuracy is monitored for classification only.
            label_ids = tf.reshape(features['label_ids'], [-1])
            predictions = tf.argmax(logits, axis=-1,
                                    output_type=label_ids.dtype)
            is_correct = tf.equal(predictions, label_ids)
            accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))

            monitor_dict["accuracy"] = accuracy

            host_call = function_builder.construct_scalar_host_call(
                monitor_dict=monitor_dict,
                model_dir=FLAGS.model_dir,
                prefix="train/",
                reduce_fn=tf.reduce_mean)
        else:
            host_call = None

        train_spec = tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, loss=total_loss, train_op=train_op,
            host_call=host_call, scaffold_fn=scaffold_fn)
    else:
        train_spec = tf.estimator.EstimatorSpec(
            mode=mode, loss=total_loss, train_op=train_op)

    return train_spec
def main(_):
    """Fine-tune in TensorFlow while mirroring every step in PyTorch.

    When `FLAGS.do_train` is set the function converts the training examples
    to a TFRecord file, builds one tower per core with `single_core_graph`,
    saves the freshly initialised TF weights, loads them into a
    `pytorch_transformers` `XLNetForSequenceClassification` model and then
    trains both models on the same batches with the same learning rate,
    logging both losses and periodically saving both checkpoints.

    Args:
        _: unused positional argument.

    Raises:
        ValueError: if none of `do_train` / `do_eval` / `do_predict` is set,
            or if `FLAGS.task_name` has no registered processor.
    """
    if FLAGS.server_ip and FLAGS.server_port:
        # Distant debugging - see
        # https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(FLAGS.server_ip, FLAGS.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    tf.set_random_seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)

    tf.logging.set_verbosity(tf.logging.INFO)

    #### Validate flags
    if FLAGS.save_steps is not None:
        FLAGS.log_step_count_steps = min(FLAGS.log_step_count_steps,
                                         FLAGS.save_steps)

    if FLAGS.do_predict:
        predict_dir = FLAGS.predict_dir
        if not tf.gfile.Exists(predict_dir):
            tf.gfile.MakeDirs(predict_dir)

    processors = {
        "mnli_matched": MnliMatchedProcessor,
        "mnli_mismatched": MnliMismatchedProcessor,
        'sts-b': StsbProcessor,
        'imdb': ImdbProcessor,
        "yelp5": Yelp5Processor
    }

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval, `do_predict` or "
            "`do_submit` must be True.")

    if not tf.gfile.Exists(FLAGS.output_dir):
        tf.gfile.MakeDirs(FLAGS.output_dir)
    if not tf.gfile.Exists(FLAGS.model_dir):
        tf.gfile.MakeDirs(FLAGS.model_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels() if not FLAGS.is_regression else None

    sp = spm.SentencePieceProcessor()
    sp.Load(FLAGS.spiece_model_file)

    def tokenize_fn(text):
        """Normalise `text` and encode it to SentencePiece ids."""
        text = preprocess_text(text, lower=FLAGS.uncased)
        return encode_ids(sp, text)

    spm_basename = os.path.basename(FLAGS.spiece_model_file)

    if FLAGS.do_train:
        train_file_base = "{}.len-{}.train.tf_record".format(
            spm_basename, FLAGS.max_seq_length)
        train_file = os.path.join(FLAGS.output_dir, train_file_base)
        tf.logging.info("Use tfrecord file {}".format(train_file))

        train_examples = processor.get_train_examples(FLAGS.data_dir)
        tf.logging.info("Num of train samples: {}".format(len(train_examples)))

        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenize_fn, train_file,
                                                FLAGS.num_passes)

        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)

        ##### Create input tensors / placeholders
        bsz_per_core = FLAGS.train_batch_size // FLAGS.num_core_per_host

        params = {
            "batch_size": FLAGS.train_batch_size  # the whole batch
        }
        train_set = train_input_fn(params)

        example = train_set.make_one_shot_iterator().get_next()

        if FLAGS.num_core_per_host > 1:
            # Shard every feature along the batch axis, one shard per core.
            examples = [{} for _ in range(FLAGS.num_core_per_host)]
            for key in example.keys():
                vals = tf.split(example[key], FLAGS.num_core_per_host, 0)
                for device_id in range(FLAGS.num_core_per_host):
                    examples[device_id][key] = vals[device_id]
        else:
            examples = [example]

        ##### Create computational graph
        (tower_losses, tower_grads_and_vars, tower_inputs,
         tower_hidden_states, tower_logits) = [], [], [], [], []

        for i in range(FLAGS.num_core_per_host):
            # Towers after the first one reuse the variables of the first.
            reuse = True if i > 0 else None
            with tf.device(assign_to_gpu(i, "/gpu:0")), \
                    tf.variable_scope(tf.get_variable_scope(), reuse=reuse):

                (loss_i, grads_and_vars_i, inputs_i, hidden_states_i,
                 logits_i) = single_core_graph(
                     is_training=True,
                     features=examples[i],
                     label_list=label_list)

                tower_losses.append(loss_i)
                tower_grads_and_vars.append(grads_and_vars_i)
                tower_inputs.append(inputs_i)
                tower_hidden_states.append(hidden_states_i)
                tower_logits.append(logits_i)

        ## average losses and gradients across towers
        if len(tower_losses) > 1:
            loss = tf.add_n(tower_losses) / len(tower_losses)
            grads_and_vars = average_grads_and_vars(tower_grads_and_vars)
            inputs = dict((n, tf.concat([t[n] for t in tower_inputs], 0))
                          for n in tower_inputs[0])
            hidden_states = list(
                tf.concat(t, 0) for t in zip(*tower_hidden_states))
            logits = tf.concat(tower_logits, 0)
        else:
            loss = tower_losses[0]
            grads_and_vars = tower_grads_and_vars[0]
            inputs = tower_inputs[0]
            hidden_states = tower_hidden_states[0]
            logits = tower_logits[0]

        # Summaries
        merged = tf.summary.merge_all()

        ## get train op
        train_op, learning_rate, gnorm = model_utils.get_train_op(
            FLAGS, None, grads_and_vars=grads_and_vars)
        global_step = tf.train.get_global_step()

        ##### Training loop
        saver = tf.train.Saver(max_to_keep=FLAGS.max_save)

        gpu_options = tf.GPUOptions(allow_growth=True)

        #### load pretrained models
        model_utils.init_from_checkpoint(FLAGS, global_vars=True)

        writer = tf.summary.FileWriter(logdir=FLAGS.model_dir,
                                       graph=tf.get_default_graph())
        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True, gpu_options=gpu_options)) as sess:
            sess.run(tf.global_variables_initializer())

            ######### ##### PYTORCH
            import torch
            from torch.optim import Adam
            from pytorch_transformers import CONFIG_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME, XLNetTokenizer, XLNetConfig, XLNetForSequenceClassification, BertAdam

            # Save the initial TF weights so the PyTorch model starts from
            # exactly the same parameters.
            save_path = os.path.join(FLAGS.model_dir, TF_WEIGHTS_NAME + '-00')
            saver.save(sess, save_path)
            tf.logging.info("Model saved in path: {}".format(save_path))

            # NOTE(review): the CUDA device index (4), the DataParallel
            # device ids (4-7), the 'xlnet-large-cased' config and the
            # 'sts-b' / num_labels=1 task are hard-coded.
            device = torch.device("cuda", 4)
            config = XLNetConfig.from_pretrained('xlnet-large-cased',
                                                 finetuning_task=u'sts-b',
                                                 num_labels=1)
            tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')

            pt_model = XLNetForSequenceClassification.from_pretrained(
                save_path, from_tf=True, config=config)
            pt_model.to(device)
            pt_model = torch.nn.DataParallel(pt_model,
                                             device_ids=[4, 5, 6, 7])

            # The learning rate given here is overwritten every step with
            # the value fetched from the TF schedule.
            optimizer = Adam(pt_model.parameters(),
                             lr=0.001,
                             betas=(0.9, 0.999),
                             eps=FLAGS.adam_epsilon,
                             weight_decay=FLAGS.weight_decay,
                             amsgrad=False)
            ##### PYTORCH #########

            fetches = [
                loss, global_step, gnorm, learning_rate, train_op, merged,
                inputs, hidden_states, logits
            ]

            total_loss, total_loss_pt, prev_step, gnorm_pt = 0., 0., -1, 0.0
            while True:
                fetched = sess.run(fetches)
                (loss_np, curr_step, gnorm_np, learning_rate_np, _,
                 summary_np, inputs_np, hidden_states_np,
                 logits_np) = fetched
                total_loss += loss_np
                # The per-step logits/labels are no longer accumulated: the
                # buffers were never read and grew with every step.

                ######### ##### PYTORCH
                f_inp = torch.tensor(inputs_np["input_ids"],
                                     dtype=torch.long,
                                     device=device)
                f_seg_id = torch.tensor(inputs_np["segment_ids"],
                                        dtype=torch.long,
                                        device=device)
                f_inp_mask = torch.tensor(inputs_np["input_mask"],
                                          dtype=torch.float,
                                          device=device)
                f_label = torch.tensor(inputs_np["label_ids"],
                                       dtype=torch.float,
                                       device=device)

                pt_model.train()
                outputs = pt_model(f_inp,
                                   token_type_ids=f_seg_id,
                                   input_mask=f_inp_mask,
                                   labels=f_label)
                loss_pt = outputs[0]
                # Reduce the loss returned by the DataParallel wrapper to a
                # scalar.
                loss_pt = loss_pt.mean()
                total_loss_pt += loss_pt.item()

                # Optimizer pt
                pt_model.zero_grad()
                loss_pt.backward()
                gnorm_pt = torch.nn.utils.clip_grad_norm_(
                    pt_model.parameters(), FLAGS.clip)
                # Follow the TF learning-rate schedule.
                for param_group in optimizer.param_groups:
                    param_group['lr'] = learning_rate_np
                optimizer.step()
                ##### PYTORCH #########

                if (curr_step > 0
                        and curr_step % FLAGS.log_step_count_steps == 0):
                    curr_loss = total_loss / (curr_step - prev_step)
                    curr_loss_pt = total_loss_pt / (curr_step - prev_step)
                    tf.logging.info(
                        "[{}] | gnorm {:.2f} lr {:8.6f} "
                        "| loss {:.2f} | pplx {:>7.2f}, bpc {:>7.4f}".format(
                            curr_step, gnorm_np, learning_rate_np, curr_loss,
                            math.exp(curr_loss), curr_loss / math.log(2)))

                    ######### ##### PYTORCH
                    tf.logging.info(
                        "  PT [{}] | gnorm PT {:.2f} lr PT {:8.6f} "
                        "| loss PT {:.2f} | pplx PT {:>7.2f}, bpc PT {:>7.4f}".
                        format(curr_step, gnorm_pt, learning_rate_np,
                               curr_loss_pt, math.exp(curr_loss_pt),
                               curr_loss_pt / math.log(2)))
                    ##### PYTORCH #########

                    total_loss, total_loss_pt, prev_step = 0., 0., curr_step
                    writer.add_summary(summary_np, global_step=curr_step)

                # save_steps is optional (see the flag validation above), so
                # guard against None before taking the modulus.
                if (curr_step > 0 and FLAGS.save_steps is not None
                        and curr_step % FLAGS.save_steps == 0):
                    save_path = os.path.join(FLAGS.model_dir,
                                             "model.ckpt-{}".format(curr_step))
                    saver.save(sess, save_path)
                    tf.logging.info(
                        "Model saved in path: {}".format(save_path))

                    ######### ##### PYTORCH
                    # Save a trained model, configuration and tokenizer
                    model_to_save = pt_model.module if hasattr(
                        pt_model,
                        'module') else pt_model  # Only save the model it-self
                    # If we save using the predefined names, we can load
                    # using `from_pretrained`
                    output_dir = os.path.join(
                        FLAGS.output_dir,
                        "pytorch-ckpt-{}".format(curr_step))
                    if not tf.gfile.Exists(output_dir):
                        tf.gfile.MakeDirs(output_dir)
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    tf.logging.info(
                        "PyTorch Model saved in path: {}".format(output_dir))
                    ##### PYTORCH #########

                if curr_step >= FLAGS.train_steps:
                    break

    if FLAGS.do_eval:
        # NOTE(review): this branch looks unfinished and is left as it was:
        #   * `eval_examples` is never assigned in this function;
        #   * `gpu_options` and the fetched tensors only exist when
        #     `do_train` ran first, and the fetches include `train_op`;
        #   * `eval_steps` / `eval_input_fn` are built but never used;
        #   * the loop below has no exit condition.
        #
        # TPU requires a fixed batch size for all batches, therefore the
        # number of examples must be a multiple of the batch size, or else
        # examples will get dropped. So we pad with fake examples which are
        # ignored later on. These do NOT count towards the metric (all
        # tf.metrics support a per-instance weight, and these get a weight
        # of 0.0).
        #
        # Modified in XL: We also adopt the same mechanism for GPUs.
        while len(eval_examples) % FLAGS.eval_batch_size != 0:
            eval_examples.append(PaddingInputExample())

        eval_file_base = "{}.len-{}.{}.eval.tf_record".format(
            spm_basename, FLAGS.max_seq_length, FLAGS.eval_split)
        eval_file = os.path.join(FLAGS.output_dir, eval_file_base)

        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenize_fn, eval_file)

        assert len(eval_examples) % FLAGS.eval_batch_size == 0
        eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=True)

        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True, gpu_options=gpu_options)) as sess:
            sess.run(tf.global_variables_initializer())

            fetches = [
                loss, global_step, gnorm, learning_rate, train_op, merged,
                inputs, hidden_states, logits
            ]

            total_loss, total_loss_pt, prev_step, gnorm_pt = 0., 0., -1, 0.0
            total_logits = None
            total_labels = None
            while True:
                fetched = sess.run(fetches)
                (loss_np, curr_step, gnorm_np, learning_rate_np, _,
                 summary_np, inputs_np, hidden_states_np,
                 logits_np) = fetched
                total_loss += loss_np

                if total_logits is None:
                    total_logits = logits_np
                    total_labels = inputs_np['label_ids']
                else:
                    total_logits = np.append(total_logits, logits_np, axis=0)
                    total_labels = np.append(total_labels,
                                             inputs_np['label_ids'],
                                             axis=0)
def model_fn(features, labels, mode, params):
    """Estimator `model_fn` for the (optionally decomposed, distilled) QA model.

    When `FLAGS.supervise` is set, a teacher network is built under the
    "teacher" variable scope with `FLAGS.sep_layer` temporarily forced to 0.
    The student is built with `get_decomposed_qa_outputs` when
    `FLAGS.decompose` is set, otherwise with `get_qa_outputs`.

    In PREDICT mode the start / end / cls logits are returned. Otherwise the
    cross-entropy loss is computed (mixed with the distillation and
    upper-layer losses when a teacher exists), variables are initialised from
    `FLAGS.init_checkpoint`, and the training spec is returned.

    Args:
        features: dict of input tensors; the keys read here are
            "feature_id", "answer_start", "answer_end" and "cls".
        labels: unused.
        mode: a `tf.estimator.ModeKeys` value.
        params: unused.

    Returns:
        A `TPUEstimatorSpec` when `FLAGS.use_tpu` is set, otherwise an
        `EstimatorSpec`.
    """
    # ### Training or Evaluation
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    # ### Get loss from inputs
    sep_layer = FLAGS.sep_layer
    if FLAGS.supervise:
        FLAGS.sep_layer = 0  # teacher sep at 0
        try:
            logger.info('supervise decompose layer: {}'.format(
                FLAGS.sep_layer))
            with tf.variable_scope("teacher", reuse=tf.AUTO_REUSE):
                teacher_outputs = get_decomposed_qa_outputs(
                    FLAGS, features, is_training)
        finally:
            # Always put the global flag back. It used to be restored only
            # on the `decompose` path, so it could stay at 0 for the rest of
            # the process.
            FLAGS.sep_layer = sep_layer
    else:
        teacher_outputs = None

    if FLAGS.decompose:
        logger.info('decompose at layer: {}'.format(FLAGS.sep_layer))
        outputs = get_decomposed_qa_outputs(FLAGS, features, is_training)
    else:
        logger.info('running in normal mode')
        outputs = get_qa_outputs(FLAGS, features, is_training)

    # ### Check model parameters
    num_params = sum(np.prod(v.shape) for v in tf.trainable_variables())
    logger.info('#params: {}'.format(num_params))

    scaffold_fn = None

    # ### Evaluation mode
    if mode == tf.estimator.ModeKeys.PREDICT:
        if FLAGS.init_checkpoint:
            logger.info(
                "init_checkpoint not being used in predict mode.")

        predictions = {
            "feature_id": features["feature_id"],
            "start_logits": outputs["start_logits"],
            "end_logits": outputs["end_logits"],
            "cls_logits": outputs["cls_logits"]
        }

        if FLAGS.use_tpu:
            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
        else:
            output_spec = tf.estimator.EstimatorSpec(
                mode=mode, predictions=predictions)
        return output_spec

    # ## Compute loss
    seq_length = FLAGS.max_seq_length

    def compute_loss(log_probs, positions, depth=seq_length):
        """Mean negative log-probability of the gold `positions`."""
        one_hot_positions = tf.one_hot(
            positions, depth=depth, dtype=tf.float32)

        loss = - tf.reduce_sum(one_hot_positions * log_probs, axis=-1)
        loss = tf.reduce_mean(loss)
        return loss

    start_loss = compute_loss(
        outputs["start_log_probs"], features["answer_start"])
    end_loss = compute_loss(
        outputs["end_log_probs"], features["answer_end"])

    total_loss = (start_loss + end_loss) * 0.5

    cls_loss = compute_loss(
        outputs["cls_log_probs"], features["cls"], depth=FLAGS.num_classes)

    # note(zhiliny): by default multiply the loss by 0.5 so that
    # the scale is comparable to start_loss and end_loss
    total_loss += cls_loss * 0.5
    monitor_dict = {"loss/start": start_loss,
                    "loss/end": end_loss,
                    "loss/cls": cls_loss,
                    'loss/ce': total_loss}

    if teacher_outputs is not None:
        # Mix the supervised loss with the two teacher-driven losses.
        ce_loss = total_loss
        gamma = FLAGS.ll_gamma
        alpha = FLAGS.dl_alpha
        beta = FLAGS.ul_beta
        # supervise upper and logits
        temp = FLAGS.temperature
        kd_loss = get_distill_loss(teacher_outputs, outputs, temp)
        mse_loss = get_upper_loss(teacher_outputs, outputs)
        monitor_dict["loss/kd"] = kd_loss
        monitor_dict["loss/mse"] = mse_loss
        total_loss = gamma * ce_loss + alpha * kd_loss + beta * mse_loss

    # ### Configuring the optimizer
    all_trainable_variables = tf.trainable_variables()

    # set fine tune scope
    tune_scopes = (FLAGS.tune_scopes.split(',')
                   if FLAGS.tune_scopes else None)
    logger.info('tune_scopes: {}'.format(tune_scopes))
    if tune_scopes is not None:
        # Only variables under the listed scopes are optimised.
        trainable_variables = []
        for scope in tune_scopes:
            trainable_variables.extend(tf.trainable_variables(scope))
    else:
        trainable_variables = all_trainable_variables

    init_scopes = (FLAGS.init_scopes.split(',')
                   if FLAGS.init_scopes else None)
    logger.info('init_scopes: {}'.format(init_scopes))
    if init_scopes is not None:
        # Only variables under the listed scopes are loaded from the ckpt.
        to_be_init_variables = []
        for scope in init_scopes:
            to_be_init_variables.extend(tf.trainable_variables(scope))
    else:
        to_be_init_variables = all_trainable_variables

    initialized_variable_names = {}
    # ### load pretrained models
    init_checkpoint = FLAGS.init_checkpoint
    if init_checkpoint:
        logger.info("Initialize from the ckpt {}".format(init_checkpoint))

        assign_map, initialized_variable_names = my_init_from_checkpoint(
            init_checkpoint, to_be_init_variables)

        if FLAGS.use_tpu:
            # On TPU the initialisation is wrapped in a scaffold function.
            def tpu_scaffold():
                tf.train.init_from_checkpoint(init_checkpoint, assign_map)
                return tf.train.Scaffold()

            scaffold_fn = tpu_scaffold
        else:
            tf.train.init_from_checkpoint(init_checkpoint, assign_map)

    logger.info("**** Initialized Variables ****")
    for var in to_be_init_variables:
        init_str = ""
        if var.name in initialized_variable_names:
            init_str = ", *INIT*"
        logger.info("  name=%s, shape=%s%s", var.name, var.shape,
                    init_str)

    if mode == tf.estimator.ModeKeys.TRAIN:
        logger.info("**** Trainable Variables ****")
        for var in trainable_variables:
            init_str = ""
            if var.name in initialized_variable_names:
                init_str = ", *INIT_AND_TRAINABLE*"
            logger.info("*TRAINABLE*  name=%s, shape=%s%s",
                        var.name, var.shape, init_str)

    train_op, learning_rate, _ = get_train_op(
        FLAGS, total_loss, trainable_variables=trainable_variables)
    monitor_dict["lr"] = learning_rate

    # ### Constructing training TPUEstimatorSpec with new cache.
    if FLAGS.use_tpu:
        host_call = construct_scalar_host_call(
            monitor_dict=monitor_dict,
            model_dir=FLAGS.model_dir,
            prefix="train/",
            reduce_fn=tf.reduce_mean)

        train_spec = tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, loss=total_loss, train_op=train_op,
            host_call=host_call, scaffold_fn=scaffold_fn)
    else:
        train_spec = tf.estimator.EstimatorSpec(
            mode=mode, loss=total_loss, train_op=train_op)

    return train_spec
def model_fn(features, labels, mode, params):
    """Build the `EstimatorSpec` for the CRF model in every Estimator mode.

    Args:
      features: dict of input tensors. `features['input_mask']` is read here;
        the whole dict is forwarded to `function_builder.get_crf_outputs`.
      labels: unused.
      mode: a `tf.estimator.ModeKeys` value.
      params: unused.

    Returns:
      A `tf.estimator.EstimatorSpec` for PREDICT, EVAL or (by fall-through)
      training.
    """
    training = mode == tf.estimator.ModeKeys.TRAIN
    crf_outputs = function_builder.get_crf_outputs(FLAGS, features, training)
    total_loss, per_example_loss, logits, label, mask = crf_outputs

    # Report the model size.
    num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
    tf.logging.info('#params: {}'.format(num_params))

    # ---- Prediction
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={
                "logits": logits,
                "labels": label,
                'mask': features['input_mask'],
                'label_mask': mask,
            })

    # ---- Evaluation
    if mode == tf.estimator.ModeKeys.EVAL:
        assert FLAGS.num_hosts == 1

        def metric_fn(per_example_loss, label_ids, logits, weight):
            """Return streaming accuracy, mean loss and F1."""
            # NOTE(review): `logits` is fed to tf.one_hot below, so it is
            # presumably already a tensor of integer tag ids -- confirm
            # against get_crf_outputs.
            accuracy = tf.metrics.accuracy(
                labels=label_ids, predictions=logits, weights=weight)
            f1 = tf.contrib.metrics.f1_score(
                labels=tf.one_hot(label_ids, FLAGS.crf_classes),
                predictions=tf.one_hot(logits, FLAGS.crf_classes),
                weights=weight)
            loss = tf.metrics.mean(values=per_example_loss)
            return {'eval_accuracy': accuracy, 'eval_loss': loss, 'f1': f1}

        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=total_loss,
            eval_metric_ops=metric_fn(per_example_loss, label, logits, mask))

    # ---- Training
    # The value returned here is not passed to the plain EstimatorSpec below;
    # the call itself is kept.
    model_utils.init_from_checkpoint(FLAGS)

    train_op, _, _ = model_utils.get_train_op(FLAGS, total_loss)
    return tf.estimator.EstimatorSpec(
        mode=mode, loss=total_loss, train_op=train_op)
def train(ps_device):
    """Build one graph tower per GPU core and run the session training loop.

    The full batch is read from a single input pipeline, split along axis 0
    across `FLAGS.num_core_per_host` towers, and the tower losses/gradients
    are averaged before the train op is created. The loop runs until the
    global step reaches `FLAGS.train_steps`, logging every
    `FLAGS.iterations` steps and saving every `FLAGS.save_steps` steps.

    Args:
      ps_device: forwarded to `assign_to_gpu` when choosing each tower's
        device.
    """
    # ---- Input pipeline
    train_input_fn, record_info_dict = data_utils.get_input_fn(
        tfrecord_dir=FLAGS.record_info_dir,
        split="train",
        bsz_per_host=FLAGS.train_batch_size,
        seq_len=FLAGS.seq_len,
        reuse_len=FLAGS.reuse_len,
        bi_data=FLAGS.bi_data,
        num_hosts=1,
        num_core_per_host=1,  # set to one no matter how many GPUs
        perm_size=FLAGS.perm_size,
        mask_alpha=FLAGS.mask_alpha,
        mask_beta=FLAGS.mask_beta,
        uncased=FLAGS.uncased,
        num_passes=FLAGS.num_passes,
        use_bfloat16=FLAGS.use_bfloat16,
        num_predict=FLAGS.num_predict)

    tf.compat.v1.logging.info(
        "num of batches {}".format(record_info_dict["num_batch"]))

    # ---- Input tensors, one shard per tower
    num_towers = FLAGS.num_core_per_host
    bsz_per_core = FLAGS.train_batch_size // num_towers

    # The input function receives the size of the whole batch.
    train_set = train_input_fn({"batch_size": FLAGS.train_batch_size})
    example = train_set.make_one_shot_iterator().get_next()

    if num_towers > 1:
        examples = [{} for _ in range(num_towers)]
        for key in example.keys():
            pieces = tf.split(example[key], num_towers, 0)
            for shard, piece in zip(examples, pieces):
                shard[key] = piece
    else:
        examples = [example]

    # ---- Computational graph
    tower_mems = []
    tower_losses = []
    tower_new_mems = []
    tower_grads_and_vars = []

    for tower_idx in range(num_towers):
        # Variables are created by the first tower and reused by the rest.
        reuse = (tower_idx > 0) or None
        with tf.device(assign_to_gpu(tower_idx, ps_device)), \
                tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
            # The mems for each tower is a dictionary.
            mems_i = {}
            if FLAGS.mem_len:
                mems_i["mems"] = create_mems_tf(bsz_per_core)

            loss_i, new_mems_i, grads_and_vars_i = single_core_graph(
                is_training=True,
                features=examples[tower_idx],
                mems=mems_i)

            tower_mems.append(mems_i)
            tower_losses.append(loss_i)
            tower_new_mems.append(new_mems_i)
            tower_grads_and_vars.append(grads_and_vars_i)

    # Average losses and gradients across towers.
    if len(tower_losses) > 1:
        loss = tf.add_n(tower_losses) / len(tower_losses)
        grads_and_vars = average_grads_and_vars(tower_grads_and_vars)
    else:
        loss = tower_losses[0]
        grads_and_vars = tower_grads_and_vars[0]

    # Train op is built from the pre-computed gradients, hence loss=None.
    train_op, learning_rate, gnorm = model_utils.get_train_op(
        FLAGS, None, grads_and_vars=grads_and_vars)
    global_step = tf.train.get_global_step()

    # ---- Initial numpy values for the memory placeholders
    tower_mems_np = []
    for tower_idx in range(num_towers):
        tower_mems_np.append(
            {key: initialize_mems_np(bsz_per_core)
             for key in tower_mems[tower_idx].keys()})

    saver = tf.train.Saver()
    gpu_options = tf.GPUOptions(allow_growth=True)
    model_utils.init_from_checkpoint(FLAGS, global_vars=True)

    session_config = tf.ConfigProto(
        allow_soft_placement=True, gpu_options=gpu_options)
    with tf.Session(config=session_config) as sess:
        sess.run(tf.global_variables_initializer())

        fetches = [loss, tower_new_mems, global_step, gnorm, learning_rate,
                   train_op]

        total_loss, prev_step = 0., -1
        while True:
            # Feed the memories fetched on the previous step back in.
            feed_dict = {}
            for placeholders, values in zip(tower_mems, tower_mems_np):
                for key in values.keys():
                    feed_dict.update(zip(placeholders[key], values[key]))

            fetched = sess.run(fetches, feed_dict=feed_dict)
            loss_np, tower_mems_np, curr_step, gnorm_np, lr_np = fetched[:5]
            total_loss += loss_np

            if curr_step > 0 and curr_step % FLAGS.iterations == 0:
                curr_loss = total_loss / (curr_step - prev_step)
                tf.compat.v1.logging.info(
                    "[{}] | gnorm {:.2f} lr {:8.6f} "
                    "| loss {:.2f} | pplx {:>7.2f}, bpc {:>7.4f}".format(
                        curr_step, gnorm_np, lr_np,
                        curr_loss, math.exp(curr_loss),
                        curr_loss / math.log(2)))
                total_loss, prev_step = 0., curr_step

            if curr_step > 0 and curr_step % FLAGS.save_steps == 0:
                save_path = os.path.join(FLAGS.model_dir, "model.ckpt")
                saver.save(sess, save_path)
                tf.compat.v1.logging.info(
                    "Model saved in path: {}".format(save_path))

            if curr_step >= FLAGS.train_steps:
                break
def model_fn(features, labels, mode, params):
    """Build the (TPU)EstimatorSpec for the classification/regression model.

    Args:
      features: dict of input tensors. In the classification path every
        int64 tensor in it is replaced in place by an int32 cast. The keys
        `label_ids` and `is_real_example` are read for evaluation, and
        `label_ids` for the TPU training host call.
      labels: unused.
      mode: a `tf.estimator.ModeKeys` value.
      params: unused.

    Returns:
      `tf.contrib.tpu.TPUEstimatorSpec` when `FLAGS.use_tpu` is set, otherwise
      `tf.estimator.EstimatorSpec`. In PREDICT mode the predictions dict
      holds `"probabilities"` for classification and `"logits"` for
      regression.
    """
    #### Training or Evaluation
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    #### Get loss from inputs
    if FLAGS.is_regression:
        (total_loss, per_example_loss, logits
         ) = function_builder.get_regression_loss(FLAGS, features, is_training)
        # The regression loss returns no probabilities; bind the name so the
        # PREDICT branch below cannot hit an unbound local.
        probabilities = None
    else:
        flag_val_dict = {
            "dropout": FLAGS.dropout,
            "model_dir": FLAGS.model_dir,
            "data_dir": FLAGS.data_dir,
            "use_tpu": FLAGS.use_tpu,
            "num_core_per_host": FLAGS.num_core_per_host,
            "master": FLAGS.master,
            "iterations": FLAGS.iterations,
            "learning_rate": FLAGS.learning_rate,
            "train_batch_size": FLAGS.train_batch_size,
            "model_config_path": FLAGS.model_config_path,
        }
        # Down-cast int64 features to int32 before they reach the model.
        for name in list(features.keys()):
            t = features[name]
            if t.dtype == tf.int64:
                t = tf.cast(t, tf.int32)
            features[name] = t
        tf.logging.info(json.dumps(flag_val_dict))
        (total_loss, per_example_loss, logits, probabilities
         ) = function_builder.get_classification_loss(
             FLAGS, features, n_class, is_training)

    #### Check model parameters
    num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
    tf.logging.info('#params: {}'.format(num_params))

    #### load pretrained models
    scaffold_fn = model_utils.init_from_checkpoint(FLAGS)

    #### Evaluation mode
    if mode == tf.estimator.ModeKeys.EVAL:
        assert FLAGS.num_hosts == 1

        def eval_confusion_matrix(labels, predictions, num_classes):
            """Return a (value, update_op) pair accumulating a confusion matrix."""
            with tf.variable_scope("eval_confusion_matrix"):
                con_matrix = tf.confusion_matrix(
                    labels=labels, predictions=predictions,
                    num_classes=num_classes)
                # Local variable, so the running sum is not a model variable.
                con_matrix_sum = tf.Variable(
                    tf.zeros(shape=(num_classes, num_classes),
                             dtype=tf.int32),
                    trainable=False,
                    name="confusion_matrix_result",
                    collections=[tf.GraphKeys.LOCAL_VARIABLES])
                update_op = tf.assign_add(con_matrix_sum, con_matrix)
                return tf.convert_to_tensor(con_matrix_sum), update_op

        def classification_metric_fn(per_example_loss, label_ids, logits,
                                     is_real_example):
            """Accuracy, loss, macro precision/recall/F1 and confusion matrix."""
            predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
            accuracy = tf.metrics.accuracy(
                labels=label_ids, predictions=predictions,
                weights=is_real_example)
            loss = tf.metrics.mean(
                values=per_example_loss, weights=is_real_example)

            # Use the same class count the loss was built with instead of a
            # hard-coded 20, so the metrics stay consistent with the model.
            precision = metrics.precision(
                label_ids, predictions, n_class, average="macro")
            recall = metrics.recall(
                label_ids, predictions, n_class, average="macro")
            f = metrics.f1(label_ids, predictions, n_class, average="macro")

            return {
                'eval_accuracy': accuracy,
                'eval_loss': loss,
                "eval_precision": precision,
                "eval_recall": recall,
                "eval_f": f,
                "conf_mat": eval_confusion_matrix(
                    label_ids, predictions, num_classes=n_class),
            }

        def regression_metric_fn(per_example_loss, label_ids, logits,
                                 is_real_example):
            """Mean loss and streaming Pearson correlation."""
            loss = tf.metrics.mean(
                values=per_example_loss, weights=is_real_example)
            pearsonr = tf.contrib.metrics.streaming_pearson_correlation(
                logits, label_ids, weights=is_real_example)
            return {'eval_loss': loss, 'eval_pearsonr': pearsonr}

        is_real_example = tf.cast(
            features["is_real_example"], dtype=tf.float32)

        #### Constructing evaluation TPUEstimatorSpec with new cache.
        label_ids = tf.reshape(features['label_ids'], [-1])

        if FLAGS.is_regression:
            metric_fn = regression_metric_fn
        else:
            metric_fn = classification_metric_fn
        metric_args = [per_example_loss, label_ids, logits, is_real_example]

        if FLAGS.use_tpu:
            eval_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metrics=(metric_fn, metric_args),
                scaffold_fn=scaffold_fn)
        else:
            eval_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metric_ops=metric_fn(*metric_args))
        return eval_spec

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Classification keeps its original {"probabilities": ...} output.
        # Regression previously crashed here, so it now exports its logits.
        if probabilities is not None:
            predictions = {"probabilities": probabilities}
        else:
            predictions = {"logits": logits}

        if FLAGS.use_tpu:
            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
        else:
            output_spec = tf.estimator.EstimatorSpec(
                mode=mode, predictions=predictions)
        return output_spec

    #### Configuring the optimizer
    train_op, learning_rate, _ = model_utils.get_train_op(FLAGS, total_loss)

    monitor_dict = {"lr": learning_rate}

    #### Constructing training TPUEstimatorSpec with new cache.
    if FLAGS.use_tpu:
        #### Creating host calls
        if not FLAGS.is_regression:
            label_ids = tf.reshape(features['label_ids'], [-1])
            predictions = tf.argmax(
                logits, axis=-1, output_type=label_ids.dtype)
            is_correct = tf.equal(predictions, label_ids)
            accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))

            monitor_dict["accuracy"] = accuracy

            host_call = function_builder.construct_scalar_host_call(
                monitor_dict=monitor_dict,
                model_dir=FLAGS.model_dir,
                prefix="train/",
                reduce_fn=tf.reduce_mean)
        else:
            host_call = None

        train_spec = tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, loss=total_loss, train_op=train_op,
            host_call=host_call, scaffold_fn=scaffold_fn)
    else:
        train_spec = tf.estimator.EstimatorSpec(
            mode=mode, loss=total_loss, train_op=train_op)

    return train_spec
def model_fn(features, labels, mode, params):
    """Build the `EstimatorSpec` for the sequence-to-sequence model.

    Args:
      features: dict with `input_ids`, `input_mask`, `output_ids` and
        `output_mask`. All but `output_ids` are transposed with perm [1, 0]
        before being handed to the model.
      labels: unused.
      mode: a `tf.estimator.ModeKeys` value.
      params: dict; `source_ntoken` and `target_ntoken` are read with `.get`.

    Returns:
      A `tf.estimator.EstimatorSpec` for EVAL, PREDICT or training.
    """
    #### Training or Evaluation
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    #### Get loss from inputs
    inp_ids = tf.transpose(features["input_ids"], [1, 0])
    inp_mask = tf.transpose(features["input_mask"], [1, 0])
    output_ids = features["output_ids"]
    output_mask = tf.transpose(features["output_mask"], [1, 0])

    # Decoder inputs are the targets shifted right by one position.
    # Id 2 stands for <S>, the decoder's initial input.
    decoder_inputs = tf.concat(
        (tf.ones_like(output_ids[:, :1]) * 2, output_ids[:, :-1]), -1)
    decoder_inputs = tf.transpose(decoder_inputs, [1, 0])

    args = dict(
        FLAGS=FLAGS,
        is_training=is_training,
        inp_ids=inp_ids,
        inp_mask=inp_mask,
        source_ntoken=params.get("source_ntoken"),
        target_ntoken=params.get("target_ntoken"),
        output_mask=output_mask,
        output_ids=output_ids,
        decoder_inputs=decoder_inputs)
    s2sm = seq2seq_models.seq2seqmodel(**args)
    total_loss = s2sm.total_loss
    per_example_loss = s2sm.per_example_loss
    logits = s2sm.logits

    #### Check model parameters
    num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
    tf.logging.info('#params: {}'.format(num_params))

    #### load pretrained models
    # The returned scaffold_fn is not used by the plain EstimatorSpecs below;
    # the call itself is kept.
    model_utils.init_from_checkpoint(FLAGS)

    #### Evaluation mode
    if mode == tf.estimator.ModeKeys.EVAL:
        assert FLAGS.num_hosts == 1

        def metric_fn(per_example_loss, logits, output_ids):
            """Return streaming token accuracy and mean loss."""
            predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
            # Positions whose target id is 0 get zero weight.
            # NOTE(review): id 0 is presumably padding -- confirm.
            istarget = tf.cast(tf.not_equal(output_ids, 0), tf.float32)
            # eval_metric_ops needs (value, update_op) pairs, so use the
            # streaming weighted accuracy instead of a bare tensor.
            accuracy = tf.metrics.accuracy(
                labels=output_ids, predictions=predictions, weights=istarget)
            loss = tf.metrics.mean(values=per_example_loss)
            return {
                'eval_accuracy': accuracy,
                'eval_loss': loss,
            }

        metric_args = [per_example_loss, logits, output_ids]
        # Metrics belong in `eval_metric_ops`; `export_outputs` only accepts
        # ExportOutput instances.
        eval_spec = tf.estimator.EstimatorSpec(
            mode=mode,
            loss=total_loss,
            eval_metric_ops=metric_fn(*metric_args))
        return eval_spec

    if mode == tf.estimator.ModeKeys.PREDICT:
        pred = tf.argmax(logits, axis=-1, output_type=tf.int32)
        predictions = {
            "logits": logits,
            "pred": pred,
            "output_ids": output_ids,
        }
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    #### Configuring the optimizer
    train_op, _, _ = model_utils.get_train_op(FLAGS, total_loss)

    #### Constructing training EstimatorSpec.
    train_spec = tf.estimator.EstimatorSpec(
        mode=mode, loss=total_loss, train_op=train_op)
    return train_spec
def model_fn(features, labels, mode, params):
    """Build the `TPUEstimatorSpec` for the NER model.

    Args:
      features: dict of input tensors; `input_ids`, `input_mask`,
        `segment_ids` and `label_ids` are looked up here and the whole dict
        is forwarded to `function_builder.get_ner_loss`.
      labels: unused.
      mode: a `tf.estimator.ModeKeys` value.
      params: unused.

    Returns:
      A `tf.contrib.tpu.TPUEstimatorSpec` for EVAL, PREDICT or training.
    """
    for feature_name in sorted(features.keys()):
        logging.info(" name = %s, shape = %s" %
                     (feature_name, features[feature_name].shape))

    # `input_ids` and `segment_ids` are not used below; the lookups are kept
    # so a missing key still raises KeyError at this point.
    input_ids = features["input_ids"]
    mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]

    # Loss, logits and predictions from the shared builder.
    training = mode == tf.estimator.ModeKeys.TRAIN
    total_loss, logits, predicts = function_builder.get_ner_loss(
        FLAGS, features, training, num_labels)

    # Report the model size.
    num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
    tf.logging.info('#params: {}'.format(num_params))

    # Pretrained-checkpoint initialisation.
    scaffold_fn = model_utils.init_from_checkpoint(FLAGS)

    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(label_ids, logits, num_labels, mask):
            """Return a streaming confusion matrix weighted by `mask`."""
            predicted_ids = tf.math.argmax(
                logits, axis=-1, output_type=tf.int32)
            # The matrix is built over `num_labels - 1` classes.
            confusion = metrics.streaming_confusion_matrix(
                label_ids, predicted_ids, num_labels - 1, weights=mask)
            return {"confusion_matrix": confusion}

        eval_metrics = (metric_fn, [label_ids, logits, num_labels, mask])
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            eval_metrics=eval_metrics,
            scaffold_fn=scaffold_fn)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # `predicts` is handed over as-is, not wrapped in a dict.
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, predictions=predicts, scaffold_fn=scaffold_fn)

    # Training: configure the optimizer.
    train_op, learning_rate, _ = model_utils.get_train_op(FLAGS, total_loss)

    monitor_dict = {"lr": learning_rate}

    return tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        scaffold_fn=scaffold_fn)
def model_fn(features, labels, mode, params):
    """Build the (TPU)EstimatorSpec for the model created by `create_model`.

    Only EVAL is special-cased; every other mode falls through to the
    training spec.

    Args:
      features: dict of input tensors; `input_mask` and `label_list` are read
        for evaluation and the whole dict is forwarded to `create_model`.
      labels: unused.
      mode: a `tf.estimator.ModeKeys` value.
      params: unused.

    Returns:
      `tf.contrib.tpu.TPUEstimatorSpec` when `FLAGS.use_tpu` is set, otherwise
      `tf.estimator.EstimatorSpec`.
    """
    training = mode == tf.estimator.ModeKeys.TRAIN
    total_loss, per_example_loss, logits = create_model(
        FLAGS, features, training, num_labels)

    num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
    tf.logging.info('#params: {}'.format(num_params))

    scaffold_fn = model_utils.init_from_checkpoint(FLAGS)

    # ---- Evaluation
    if mode == tf.estimator.ModeKeys.EVAL:
        assert FLAGS.num_hosts == 1

        def metric_fn(per_example_loss, label_list, logits, input_mask):
            """Return streaming accuracy and mean loss."""
            # Invert the mask (x -> 1 - x) before using it as weights.
            # NOTE(review): presumably 1 marks padding in `input_mask` --
            # confirm against the input pipeline.
            weights = input_mask * -1 + 1
            predicted = tf.argmax(logits, axis=-1, output_type=tf.int32)
            accuracy = tf.metrics.accuracy(
                labels=label_list, predictions=predicted, weights=weights)
            loss = tf.metrics.mean(values=per_example_loss, weights=weights)
            return {'eval_accuracy': accuracy, 'eval_loss': loss}

        metric_args = [
            per_example_loss,
            features['label_list'],
            logits,
            tf.cast(features['input_mask'], dtype=tf.float32),
        ]

        if FLAGS.use_tpu:
            return tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metrics=(metric_fn, metric_args),
                scaffold_fn=scaffold_fn)
        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=total_loss,
            eval_metric_ops=metric_fn(*metric_args))

    # ---- Training
    train_op, learning_rate, _ = model_utils.get_train_op(FLAGS, total_loss)

    monitor_dict = {"lr": learning_rate}

    if not FLAGS.use_tpu:
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=total_loss, train_op=train_op)

    # No host call is attached on TPU.
    return tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        host_call=None,
        scaffold_fn=scaffold_fn)
def model_fn(features, labels, mode, params):
    """Build the `EstimatorSpec` for a two-input XLNet pair classifier.

    Both inputs are encoded by XLNet under the shared `'xlnet'` variable
    scope. Their pooled outputs and the absolute difference are concatenated
    and fed to a 2-way dense layer.

    Args:
      features: dict with `X`, `segment`, `mask` for the first input, `X_b`,
        `segment_b`, `mask_b` for the second, and `label` (column 0 is used).
      labels: unused.
      mode: `tf.estimator.ModeKeys.TRAIN` or `tf.estimator.ModeKeys.EVAL`.
      params: unused.

    Returns:
      A `tf.estimator.EstimatorSpec` for the requested mode.

    Raises:
      NotImplementedError: if `mode` is neither TRAIN nor EVAL.
    """
    supported_modes = (tf.estimator.ModeKeys.TRAIN,
                       tf.estimator.ModeKeys.EVAL)
    if mode not in supported_modes:
        # Previously this surfaced as an UnboundLocalError on return.
        raise NotImplementedError(
            'model_fn only supports TRAIN and EVAL, got {}'.format(mode))

    kwargs = dict(
        # Follow the Estimator mode so dropout is not applied during EVAL.
        is_training=(mode == tf.estimator.ModeKeys.TRAIN),
        use_tpu=False,
        use_bfloat16=False,
        dropout=0.1,
        dropatt=0.1,
        init='normal',
        init_range=0.1,
        init_std=0.05,
        clamp_len=-1,
    )
    xlnet_parameters = xlnet.RunConfig(**kwargs)
    xlnet_config = xlnet.XLNetConfig(
        json_path='xlnet-base-29-03-2020/config.json'
    )

    training_parameters = dict(
        decay_method='poly',
        train_steps=num_train_steps,
        learning_rate=initial_learning_rate,
        warmup_steps=num_warmup_steps,
        min_lr_ratio=0.0,
        weight_decay=0.00,
        adam_epsilon=1e-8,
        num_core_per_host=1,
        lr_layer_decay_rate=1,
        use_tpu=False,
        use_bfloat16=False,
        dropout=0.1,
        dropatt=0.1,
        init='normal',
        init_range=0.1,
        init_std=0.05,
        clip=1.0,
        clamp_len=-1,
    )
    training_parameters = Parameter(**training_parameters)

    X = features['X']
    segment_ids = features['segment']
    input_masks = tf.cast(features['mask'], tf.float32)

    X_b = features['X_b']
    segment_ids_b = features['segment_b']
    input_masks_b = tf.cast(features['mask_b'], tf.float32)

    Y = features['label'][:, 0]

    # First input: creates the XLNet variables.
    with tf.compat.v1.variable_scope('xlnet', reuse=False):
        xlnet_model = xlnet.XLNetModel(
            xlnet_config=xlnet_config,
            run_config=xlnet_parameters,
            input_ids=tf.transpose(X, [1, 0]),
            seg_ids=tf.transpose(segment_ids, [1, 0]),
            input_mask=tf.transpose(input_masks, [1, 0]),
        )
        summary = xlnet_model.get_pooled_out('last', True)

    # Second input: reuses the same variables.
    with tf.compat.v1.variable_scope('xlnet', reuse=True):
        xlnet_model = xlnet.XLNetModel(
            xlnet_config=xlnet_config,
            run_config=xlnet_parameters,
            input_ids=tf.transpose(X_b, [1, 0]),
            seg_ids=tf.transpose(segment_ids_b, [1, 0]),
            input_mask=tf.transpose(input_masks_b, [1, 0]),
        )
        summary_b = xlnet_model.get_pooled_out('last', True)

    vectors_concat = [summary, summary_b, tf.abs(summary - summary_b)]
    vectors_concat = tf.concat(vectors_concat, axis=1)
    logits = tf.layers.dense(vectors_concat, 2)

    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=Y
        )
    )
    # Named identity ops expose the loss/accuracy tensors under fixed names.
    tf.identity(loss, 'train_loss')

    accuracy = tf.metrics.accuracy(
        labels=Y, predictions=tf.argmax(logits, axis=1)
    )
    tf.identity(accuracy[1], name='train_accuracy')

    # Initialise trainable variables from the pretrained checkpoint.
    tvars = tf.trainable_variables()
    init_checkpoint = 'xlnet-base-29-03-2020/model.ckpt-300000'
    assignment_map, initialized_variable_names = (
        get_assignment_map_from_checkpoint(tvars, init_checkpoint))
    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op, learning_rate, _ = model_utils.get_train_op(
            training_parameters, loss
        )
        tf.summary.scalar('learning_rate', learning_rate)
        estimator_spec = tf.estimator.EstimatorSpec(
            mode=mode, loss=loss, train_op=train_op
        )
    else:
        # Only EVAL can reach this branch because of the guard above.
        estimator_spec = tf.estimator.EstimatorSpec(
            mode=tf.estimator.ModeKeys.EVAL,
            loss=loss,
            eval_metric_ops={'accuracy': accuracy},
        )

    return estimator_spec
def model_fn(features, labels, mode, params):
    """Build the (TPU)EstimatorSpec for classification or regression.

    Args:
      features: dict of input tensors; `label_ids` and `is_real_example` are
        read here and the whole dict is forwarded to the loss builder.
      labels: unused.
      mode: a `tf.estimator.ModeKeys` value.
      params: unused.

    Returns:
      `tf.contrib.tpu.TPUEstimatorSpec` when `FLAGS.use_tpu` is set, otherwise
      `tf.estimator.EstimatorSpec`.
    """
    training = mode == tf.estimator.ModeKeys.TRAIN

    # ---- Loss
    if FLAGS.is_regression:
        loss_outputs = function_builder.get_regression_loss(
            FLAGS, features, training)
    else:
        loss_outputs = function_builder.get_classification_loss(
            FLAGS, features, n_class, training)
    total_loss, per_example_loss, logits = loss_outputs

    # ---- Model size
    num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
    tf.logging.info('#params: {}'.format(num_params))

    # ---- Pretrained checkpoint
    scaffold_fn = model_utils.init_from_checkpoint(FLAGS)

    # ---- Evaluation
    if mode == tf.estimator.ModeKeys.EVAL:
        assert FLAGS.num_hosts == 1

        def metric_fn(per_example_loss, label_ids, logits, is_real_example):
            """Streaming accuracy and mean loss, weighted by real examples."""
            predicted = tf.argmax(logits, axis=-1, output_type=tf.int32)
            accuracy = tf.metrics.accuracy(
                labels=label_ids,
                predictions=predicted,
                weights=is_real_example)
            loss = tf.metrics.mean(
                values=per_example_loss, weights=is_real_example)
            return {'eval_accuracy': accuracy, 'eval_loss': loss}

        def regression_metric_fn(per_example_loss, label_ids, logits,
                                 is_real_example):
            """Mean loss and streaming Pearson correlation."""
            loss = tf.metrics.mean(
                values=per_example_loss, weights=is_real_example)
            pearsonr = tf.contrib.metrics.streaming_pearson_correlation(
                logits, label_ids, weights=is_real_example)
            return {'eval_loss': loss, 'eval_pearsonr': pearsonr}

        is_real_example = tf.cast(
            features["is_real_example"], dtype=tf.float32)
        label_ids = tf.reshape(features['label_ids'], [-1])

        selected_metric_fn = (
            regression_metric_fn if FLAGS.is_regression else metric_fn)
        metric_args = [per_example_loss, label_ids, logits, is_real_example]

        if FLAGS.use_tpu:
            return tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metrics=(selected_metric_fn, metric_args),
                scaffold_fn=scaffold_fn)
        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=total_loss,
            eval_metric_ops=selected_metric_fn(*metric_args))

    # ---- Prediction
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            "logits": logits,
            "labels": tf.reshape(features["label_ids"], [-1]),
            "is_real": features["is_real_example"],
        }
        if FLAGS.use_tpu:
            return tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # ---- Training
    train_op, learning_rate, _ = model_utils.get_train_op(FLAGS, total_loss)

    monitor_dict = {"lr": learning_rate}

    if not FLAGS.use_tpu:
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=total_loss, train_op=train_op)

    # On TPU, classification additionally reports batch accuracy through a
    # host call; regression attaches none.
    if FLAGS.is_regression:
        host_call = None
    else:
        label_ids = tf.reshape(features['label_ids'], [-1])
        predicted = tf.argmax(logits, axis=-1, output_type=label_ids.dtype)
        hits = tf.equal(predicted, label_ids)
        monitor_dict["accuracy"] = tf.reduce_mean(tf.cast(hits, tf.float32))

        host_call = function_builder.construct_scalar_host_call(
            monitor_dict=monitor_dict,
            model_dir=FLAGS.model_dir,
            prefix="train/",
            reduce_fn=tf.reduce_mean)

    return tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        host_call=host_call,
        scaffold_fn=scaffold_fn)
def model_fn(features, labels, mode, params):
    """Build the TPUEstimatorSpec for the language model with cached memory.

    Args:
      features: forwarded to `function_builder.get_loss`.
      labels: forwarded to `function_builder.get_loss`.
      mode: a `tf.estimator.ModeKeys` value; TRAIN selects the training
        branch, anything else the evaluation branch.
      params: dict; `params["cache"]` is read when `FLAGS.mem_len > 0`.

    Returns:
      A `tf.contrib.tpu.TPUEstimatorSpec` whose `cache` attribute is set to
      the new memory list.
    """
    training = mode == tf.estimator.ModeKeys.TRAIN
    assert tf.gfile.Exists(logdir)

    # Retrieve `mems` from `params["cache"]`.
    use_mems = FLAGS.mem_len > 0
    mems = {"mems": params["cache"]} if use_mems else {}

    # Loss: the builder returns a different tuple in each mode.
    if training:
        total_loss, new_mems, monitor_dict = function_builder.get_loss(
            FLAGS, features, labels, mems, training)
    else:
        (total_loss, batch_loss, batch_tgt_mask, new_mems
         ) = function_builder.get_loss(
             FLAGS, features, labels, mems, training)

    # Turn `new_mems` into `new_cache`.
    new_cache = list(new_mems["mems"]) if use_mems else []

    # Report the model size.
    num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
    tf.logging.info("#params: {}".format(num_params))

    # Customized initial checkpoint.
    scaffold_fn = model_utils.init_from_checkpoint(FLAGS, global_vars=True)

    if not training:
        # Validation spec with the new cache.
        eval_metrics = function_builder.construct_scalar_metric_fn(
            batch_loss, batch_tgt_mask)
        eval_spec = tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            eval_metrics=eval_metrics,
            scaffold_fn=scaffold_fn)
        eval_spec.cache = new_cache
        return eval_spec

    # Optimizer.
    train_op, learning_rate, gnorm = model_utils.get_train_op(
        FLAGS, total_loss, None)
    monitor_dict["gnorm"] = gnorm
    monitor_dict["lr"] = learning_rate
    monitor_dict['pplx'] = tf.math.exp(total_loss)

    # The scalar host call (function_builder.construct_scalar_host_call with
    # monitor_dict, log_dir=logdir, prefix="train/",
    # reduce_fn=tf.reduce_mean) is disabled, so no host_call is attached.

    # Training spec with the new cache.
    train_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        scaffold_fn=scaffold_fn)
    train_spec.cache = new_cache
    return train_spec