def val_loop_s2s(model, loader):
    iterator = tqdm(loader, unit='Batch')
    total_f1 = 0.0
    for i_batch, sample_batched in enumerate(iterator):
        # iterator.set_description('Test Batch %i/%i' % (1, params.nof_epoch))
        test_batch_para = Variable(sample_batched["Context_Tensor"]).unsqueeze(2)
        test_batch_quest = Variable(sample_batched["Question_Tensor"]).unsqueeze(2)
        target_batch = Variable(sample_batched["Answer"]).unsqueeze(2)

        if USE_CUDA:
            test_batch_para = test_batch_para.cuda()
            test_batch_quest = test_batch_quest.cuda()

        # Model output has shape para_len * 3; columns 1 and 2 hold the start/end probabilities
        o = model(test_batch_para, test_batch_quest)
        start_probs = o[:, 1]
        end_probs = o[:, 2]
        start_pos = torch.argmax(start_probs).item()
        end_pos = torch.argmax(end_probs).item()

        # Flatten the (batch of one) context tensor into a plain list of token ids
        para = test_batch_para.tolist()
        para = [l for item in para[0] for l in item]

        target_start = target_batch.tolist()[0][0][0]
        target_end = target_batch.tolist()[0][1][0] + 1
        total_f1 += compute_f1(para[start_pos:end_pos + 1], para[target_start:target_end])

        if i_batch % 100 == 0:
            print(total_f1 / (i_batch + 1))

    print(f"Final Average F1 score (across {len(iterator)} examples): {total_f1 / len(iterator)}")
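# The loops in this file score predicted spans with compute_f1 over lists of token ids; the
# helper itself is not part of this excerpt. Below is a minimal stand-in sketch, assuming the
# usual SQuAD-style token-overlap F1 (precision/recall over the multiset of predicted vs. gold
# tokens). The project's actual compute_f1 may differ.
from collections import Counter


def compute_f1_sketch(pred_tokens, gold_tokens):
    """Token-overlap F1 between a predicted and a gold answer span (hypothetical helper)."""
    if not pred_tokens or not gold_tokens:
        # If either span is empty, F1 is 1.0 only when both are empty.
        return float(pred_tokens == gold_tokens)
    overlap = sum((Counter(pred_tokens) & Counter(gold_tokens)).values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(pred_tokens)
    recall = overlap / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)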
def test_loop(model, loader):
    model.eval()
    total_f1 = 0.0
    for i_batch, sample_batched in enumerate(loader):
        # iterator.set_description('Test Batch %i/%i' % (epoch+1, params.nof_epoch))
        test_batch_para = Variable(sample_batched["Context_Tensor"]).unsqueeze(2)
        test_batch_quest = Variable(sample_batched["Question_Tensor"]).unsqueeze(2)
        target_batch = Variable(sample_batched["Answer"]).unsqueeze(2)
        test_batch_quest_text = sample_batched["Question_Txt"]
        test_batch_para_text = sample_batched["Context_Txt"]

        if USE_CUDA:
            test_batch_para = test_batch_para.cuda()
            test_batch_quest = test_batch_quest.cuda()

        # The model returns per-token scores o and a pair of pointer indices p
        o, p = model(test_batch_para, test_batch_quest, test_batch_para_text, test_batch_quest_text)

        # Sort the two pointer indices so they form a valid (start, end) span
        p_ = p.tolist()[0]
        p_.sort()

        # Flatten the (batch of one) context tensor into a plain list of token ids
        para = test_batch_para.tolist()
        para = [l for item in para[0] for l in item]

        target_start = target_batch.tolist()[0][0][0]
        target_end = target_batch.tolist()[0][1][0] + 1
        total_f1 += compute_f1(para[p_[0]:p_[1] + 1], para[target_start:target_end])

        if i_batch % 100 == 0:
            print('Batch', i_batch, total_f1 / (i_batch + 1))

    print(f"Final Average F1 score (across {len(loader)} examples): {total_f1 / len(loader)}")
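# The loops above index only the first element of each batch (para[0], target_batch[...][0]),
# so they implicitly assume a DataLoader with batch_size=1. A minimal invocation sketch,
# assuming a Dataset that yields dicts with the keys consumed above ("Context_Tensor",
# "Question_Tensor", "Answer", "Question_Txt", "Context_Txt"); the dataset object itself is
# hypothetical and not part of this excerpt.
from torch.utils.data import DataLoader


def run_test_sketch(model, dataset):
    # batch_size must stay 1 for the evaluation loops in this file
    loader = DataLoader(dataset, batch_size=1, shuffle=False)
    test_loop(model, loader)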
def compute_metrics(actual, preds):
    """Compute all the metrics: AUROC, F1 and accuracy.

    Arguments:
        actual {list} -- the list of true labels
        preds {list} -- the list of predictions
    """
    f1 = eval.compute_f1(actual, preds)
    acc = eval.compute_acc(actual, preds)
    auroc = eval.compute_auroc(actual, preds)
    print("AUROC = ", auroc)
    print("F1 = ", f1)
    print("Accuracy = ", acc)
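# Quick usage sketch for compute_metrics with toy binary labels (not project results). It
# assumes eval.compute_auroc accepts hard 0/1 predictions; probability scores would normally
# be preferred for AUROC.
def _demo_compute_metrics():
    y_true = [1, 0, 1, 1, 0]
    y_pred = [1, 0, 0, 1, 0]
    compute_metrics(y_true, y_pred)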
def val_loop_lengthwise(model, loader):
    iterator = tqdm(loader, unit='Batch')
    total_f1_by_len = {}
    count_by_len = {}
    for i_batch, sample_batched in enumerate(iterator):
        # iterator.set_description('Test Batch %i/%i' % (1, params.nof_epoch))
        test_batch_para = Variable(sample_batched["Context_Tensor"]).unsqueeze(2)
        test_batch_quest = Variable(sample_batched["Question_Tensor"]).unsqueeze(2)
        target_batch = Variable(sample_batched["Answer"]).unsqueeze(2)
        test_batch_quest_text = sample_batched["Question_Txt"]
        test_batch_para_text = sample_batched["Context_Txt"]

        if USE_CUDA:
            test_batch_para = test_batch_para.cuda()
            test_batch_quest = test_batch_quest.cuda()

        # Model output has shape para_len * 3; columns 1 and 2 hold the start/end probabilities
        o = model(test_batch_para, test_batch_quest, test_batch_para_text, test_batch_quest_text)
        start_probs = o[:, 1]
        end_probs = o[:, 2]
        start_pos = torch.argmax(start_probs).item()
        end_pos = torch.argmax(end_probs).item()

        # Flatten the (batch of one) context tensor into a plain list of token ids
        para = test_batch_para.tolist()
        para = [l for item in para[0] for l in item]

        target_start = target_batch.tolist()[0][0][0]
        target_end = target_batch.tolist()[0][1][0] + 1
        # Bucket the gold answer by its length in tokens, with a bucket width of 5
        target_len = (int(target_end) - int(target_start)) // 5

        score = compute_f1(para[start_pos:end_pos + 1], para[target_start:target_end])
        if target_len not in total_f1_by_len:
            total_f1_by_len[target_len] = 0.0
            count_by_len[target_len] = 0
        total_f1_by_len[target_len] += score
        count_by_len[target_len] += 1

    print("Avg F1 by target length: ")
    for k, v in total_f1_by_len.items():
        print(f"Target_len = {k}, Count: {count_by_len[k]} {v / count_by_len[k]}")
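# val_loop_lengthwise buckets the gold answer length as target_len // 5, so bucket k covers
# answers of 5*k to 5*k + 4 tokens. A small helper (hypothetical, not part of the original
# code) that turns a bucket key back into its token-length range when reading the report:
def bucket_range(bucket_key, width=5):
    lo = bucket_key * width
    return f"{lo}-{lo + width - 1} tokens"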
def val_loop_s2s(model, loader):
    iterator = tqdm(loader, unit='Batch')
    total_f1 = 0.0
    goodExamples = 0
    badExamples = 0
    out = ""
    for i_batch, sample_batched in enumerate(iterator):
        # iterator.set_description('Test Batch %i/%i' % (1, params.nof_epoch))
        test_batch_para = Variable(sample_batched["Context_Tensor"]).unsqueeze(2)
        test_batch_quest = Variable(sample_batched["Question_Tensor"]).unsqueeze(2)
        target_batch = Variable(sample_batched["Answer"]).unsqueeze(2)
        test_batch_quest_text = sample_batched["Question_Txt"]
        test_batch_para_text = sample_batched["Context_Txt"]

        if USE_CUDA:
            test_batch_para = test_batch_para.cuda()
            test_batch_quest = test_batch_quest.cuda()

        # Model output has shape para_len * 3; columns 1 and 2 hold the start/end probabilities
        o = model(test_batch_para, test_batch_quest, test_batch_para_text, test_batch_quest_text)
        start_probs = o[:, 1]
        end_probs = o[:, 2]
        start_pos = torch.argmax(start_probs).item()
        end_pos = torch.argmax(end_probs).item()

        # Flatten the (batch of one) context tensor into a plain list of token ids
        para = test_batch_para.tolist()
        para = [l for item in para[0] for l in item]

        target_start = int(target_batch[0][0].item())
        target_end = int(target_batch[0][1].item()) + 1
        score = compute_f1(para[start_pos:end_pos + 1], para[target_start:target_end])

        predicted_answer = getText(para[start_pos:end_pos + 1])
        target_answer = getText(para[target_start:target_end])

        # Log up to 100 high-scoring (good) and 100 low-scoring (bad) examples
        is_good = score > 0.7 and goodExamples < 100
        is_bad = score < 0.4 and badExamples < 100
        if is_good or is_bad:
            out += ('\n' + 'Score: ' + str(score) + '\n' +
                    'Question: ' + str(test_batch_quest_text) + '\n' +
                    'Context: ' + str(test_batch_para_text) + '\n' +
                    'Predicted Answer\n' + str(predicted_answer) +
                    '\n Target Answer: ' + str(target_answer) + '\n')
            print('\n')
            print('Score', score)
            print('Question', test_batch_quest_text)
            print('Context', test_batch_para_text)
            print('Predicted Answer', predicted_answer)
            print('Target Answer', target_answer)
            print('\n')
            if is_good:
                goodExamples += 1
            else:
                badExamples += 1

        total_f1 += score
        # if i_batch % 100 == 0:
        #     print(total_f1 / (i_batch + 1))

    resultFile.write(out)
    print(f"Final Average F1 score (across {len(iterator)} examples): {total_f1 / len(iterator)}")
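# val_loop_s2s prints answers with getText, which is not part of this excerpt. A minimal
# sketch, assuming it maps token ids back to words through an id-to-word vocabulary; the
# explicit id2word argument here is hypothetical (the real helper may use a global vocab).
def get_text_sketch(token_ids, id2word):
    return " ".join(id2word.get(int(t), "<unk>") for t in token_ids)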
def main():
    if cfig.do_process:
        raw_filename = os.path.join(cfig.raw_data_base_dir, cfig.raw_data_filename)
        tag_dic, tagname_dic = get_tag_dict(cfig.tag_file)
        LABEL_COLUMNS = []
        for t in tagname_dic:
            LABEL_COLUMNS.append(tagname_dic[t])
        print("labels=", LABEL_COLUMNS)
        titles = ["sentence"]
        titles.extend(LABEL_COLUMNS)
        process_data(raw_filename, tag_dic, titles)
        exit()

    # Shared estimator configuration
    run_config = tf.estimator.RunConfig(
        model_dir=cfig.models_dir,
        save_summary_steps=cfig.save_summary_steps,
        keep_checkpoint_max=1,  # keep only one checkpoint
        save_checkpoints_steps=cfig.save_checkpoints_steps)

    bert_config = modeling.BertConfig.from_json_file(cfig.BERT_CONFIG)

    # Compute train and warmup steps from batch size
    num_train_steps = None
    num_warmup_steps = None
    is_train_data_fixed = True

    if cfig.do_train:
        # Change the path accordingly
        train_data_path = os.path.join(cfig.data_base_dir, cfig.train_file_name)
        train = pd.read_csv(train_data_path, delimiter="\t")
        # print(train.head())
        if cfig.is_use_all_train_data:
            # Train on the full training set; in this case there is no evaluation phase
            # (or rather, the evaluation result cannot be trusted).
            x_train = train
        else:
            if is_train_data_fixed:
                # Fixed train/dev split
                if not cfig.is_use_all_train_data:
                    x_train, x_val = train_test_split(train,
                                                      random_state=42,
                                                      train_size=cfig.train_val_ratio,
                                                      shuffle=True)
                    # With a fixed random_state the split is identical on every run
                    # (x_train[2] and x_val[2] should be the same each time).
                    # Save x_train and x_val so the PyTorch version can load
                    # train_spilt_0.9.csv and dev_spilt_0.9.csv directly for comparison.
                    train_path = os.path.join(cfig.data_base_dir, "train_spilt_0.9.csv")
                    dev_path = os.path.join(cfig.data_base_dir, "dev_spilt_0.9.csv")
                    # Do not write the row index; the data already carries one
                    x_train.to_csv(train_path, sep="\t", encoding="utf-8", index=False)
                    x_val.to_csv(dev_path, sep="\t", encoding="utf-8", index=False)
                else:
                    # No dev data is used
                    pass
            else:
                # The train and dev sets differ on every run
                x_train, x_val = train_test_split(train,
                                                  train_size=cfig.train_val_ratio,
                                                  shuffle=True)

        train_examples = create_examples(x_train)
        num_train_steps = int(len(train_examples) / cfig.train_batch_size * cfig.num_train_epochs)
        num_warmup_steps = int(num_train_steps * cfig.warmup_proportion)

    # Build the model function
    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=cfig.num_labels,
        init_checkpoint=cfig.BERT_INIT_CHKPNT,
        learning_rate=cfig.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=False,
        use_one_hot_embeddings=False)

    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=run_config,
        params={"batch_size": cfig.train_batch_size})

    tokenization.validate_case_matches_checkpoint(True, cfig.BERT_INIT_CHKPNT)
    tokenizer = tokenization.FullTokenizer(vocab_file=cfig.BERT_VOCAB, do_lower_case=True)
    # print(tokenizer.tokenize("这是一个例子而已,请注意"))

    if not os.path.exists(cfig.output_dir):
        os.makedirs(cfig.output_dir)

    if cfig.do_train:
        # Run training
        train_file = os.path.join(cfig.output_dir, "train.tf_record")
        if not os.path.exists(train_file):
            open(train_file, 'w').close()
        # Convert the training set to features
        file_based_convert_examples_to_features(train_examples, cfig.max_seq_length,
                                                tokenizer, train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", cfig.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        # Build the input function
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=cfig.max_seq_length,
            is_training=True,
            drop_remainder=True)
        print('Beginning Training!')
        current_time = datetime.now()
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
        print("Training took time ", datetime.now() - current_time)
    # val_data_path = os.path.join(cfig.data_base_dir, 'dev.csv')
    # x_val = pd.read_csv(val_data_path, delimiter="\t")
    if not cfig.is_use_all_train_data:
        # If all of the training data was used for training, no dev split exists for evaluation
        print("Beginning do evaluation!")
        eval_file = os.path.join(cfig.output_dir, "eval.tf_record")
        if not os.path.exists(eval_file):
            open(eval_file, 'w').close()
        eval_examples = create_examples(x_val)
        file_based_convert_examples_to_features(eval_examples, cfig.max_seq_length,
                                                tokenizer, eval_file)
        # This tells the estimator to run through the entire set.
        eval_steps = None
        eval_drop_remainder = False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=cfig.max_seq_length,
            is_training=False,
            drop_remainder=False)
        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
        output_eval_file = os.path.join(cfig.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if cfig.do_predict:
        print('Beginning Predictions!')
        tag_dic, tagname_dic = get_tag_dict(cfig.tag_file)
        # Run prediction. The test data was split off from the raw data beforehand,
        # so it remains independent of training.
        test_data_path = os.path.join(cfig.data_base_dir, cfig.test_file_name)  # here: dev.csv
        if cfig.test_data_format == "csv":
            test = pd.read_csv(test_data_path, delimiter="\t")
            # x_test = test[:10000]  # testing a small sample
            x_test = test
            x_test = x_test.reset_index(drop=True)
            # Note: the data is not read from the raw json format here
            predict_examples = create_examples(x_test, True)
        else:
            # json format; use the phase-one data for testing
            fin = open(test_data_path, "r", encoding="utf-8")
            predict_examples = create_examples(fin, True, cfig.test_data_format, tag_dic)

        test_features = convert_examples_to_features(predict_examples, cfig.max_seq_length,
                                                     tokenizer)
        current_time = datetime.now()
        predict_input_fn = input_fn_builder(features=test_features,
                                            seq_length=cfig.max_seq_length,
                                            is_training=False,
                                            drop_remainder=False)
        # ckp_path = os.path.join("./model_aug_bacthsize_4_learningrate_3e-5_epoch_3_laborscore_73_44", "best_model.ckpt")
        predictions = estimator.predict(predict_input_fn)  # , checkpoint_path=ckp_path
        print("Prediction took time ", datetime.now() - current_time)

        label_columns = []
        for t in tagname_dic:
            label_columns.append(tagname_dic[t])
        # output_df is a DataFrame; prob_list holds, for every example, the predicted
        # probability of each label
        output_df, prob_list = create_output(predictions, label_columns)

        # Threshold the per-label probabilities at 0.5 to get one-hot predictions
        preds_labels = []
        predic_one_hot_list = []
        for i in range(len(prob_list)):
            row_data = prob_list[i]  # a list of per-label probabilities
            if len(row_data) != cfig.num_labels:
                print("maybe error")
            array = np.array(row_data)
            predic_one_hot = np.where(array > 0.5, 1, 0)
            predic_one_hot_list.append(predic_one_hot)
            indexs = np.where(array >= 0.5)
            temp = []
            if len(indexs[0]) > 0:
                for j in indexs[0]:
                    temp.append(j + 1)  # note: label ids are shifted by +1 here
            preds_labels.append(temp)

        # Compare against the gold answers
        true_tags_count = 0
        predic_tags_count = 0
        all_qual_num = 0
        test_list = []
        for i in range(len(predict_examples)):
            # Iterate over the gold labels in the dataset
            one_exam = predict_examples[i]
            one_tags = one_exam.labels  # true labels: an array of 0s and 1s
            test_list.append(one_tags)
            # Compare row by row against predic_one_hot_list
            pred_rs = np.array(predic_one_hot_list[i])
            # pdb.set_trace()
            # predic_labels = preds_labels[i]
            if 1 in one_tags:
                # The gold label set is non-empty; count samples with at least one gold tag
                true_tags_count = true_tags_count + 1
            if 1 in pred_rs:
                # At least one tag was predicted
                predic_tags_count = predic_tags_count + 1
            if 1 in one_tags and 1 in pred_rs and (one_tags == pred_rs).all():
                # Non-empty prediction that matches the gold labels exactly
                all_qual_num = all_qual_num + 1
print("true_count={},predict_count={}".format(true_tags_count, predic_tags_count)) print("all_qual_num=", all_qual_num) # out_filename = "{}_output.json".format(cfig.task_type_name) # outf_file = os.path.join(cfig.output_dir, out_filename) # inf_path = os.path.join(labor_data_path, data_filename) # generate_pred_file(label_columns, labor_preds, inf_path, outf_file) # 从2个矩阵的角度计算score。默认计算得分的方式,是将结果填入填入原始json格式中的label字段,再与标准的比对 prediction_array = np.array(predic_one_hot_list) print(prediction_array.shape) # prediction_array = prediction_array.T # 行方向为样本,列方向为类别 # print(prediction_array.shape) test_array = np.array(test_list) print(test_array.shape) # test_array = test_array.T score_labor = eval.compute_f1(prediction_array, test_array, tag_dic, tagname_dic) print('score_labor', score_labor)