def evaluate_model(config: Config, model: BertCRF, batch_insts_ids, name: str, insts: List[Instance]):
    # evaluation
    metrics = np.asarray([0, 0, 0], dtype=int)
    batch_id = 0
    batch_size = config.batch_size
    for batch in batch_insts_ids:
        one_batch_insts = insts[batch_id * batch_size:(batch_id + 1) * batch_size]
        input_ids, input_seq_lens, annotation_mask, labels = batch
        input_masks = input_ids.gt(0)
        # get the predicted result
        batch_max_scores, batch_max_ids = model(input_ids,
                                                input_seq_lens=input_seq_lens,
                                                annotation_mask=annotation_mask,
                                                labels=None,
                                                attention_mask=input_masks)
        metrics += evaluate_batch_insts(batch_insts=one_batch_insts,
                                        batch_pred_ids=batch_max_ids,
                                        batch_gold_ids=batch[-1],
                                        word_seq_lens=batch[1],
                                        idx2label=config.idx2labels)
        batch_id += 1
    # calculate precision, recall and F1 from the [correct, predicted, gold] counts
    p, total_predict, total_entity = metrics[0], metrics[1], metrics[2]
    precision = p * 1.0 / total_predict * 100 if total_predict != 0 else 0
    recall = p * 1.0 / total_entity * 100 if total_entity != 0 else 0
    fscore = 2.0 * precision * recall / (precision + recall) if precision != 0 or recall != 0 else 0
    logging.info("[%s set] Precision: %.2f, Recall: %.2f, F1: %.2f" % (name, precision, recall, fscore))
    return [precision, recall, fscore]
def evaluate_model(config: Config, model: NNCRF_sl, batch_insts_ids, name: str, insts: List[Instance]):
    ## evaluation
    metrics = np.asarray([0, 0, 0], dtype=int)
    batch_id = 0
    batch_size = config.batch_size
    for batch in batch_insts_ids:
        one_batch_insts = insts[batch_id * batch_size:(batch_id + 1) * batch_size]
        batch_max_scores, batch_max_ids = model.decode(batch)
        metrics += evaluate_batch_insts(batch_insts=one_batch_insts,
                                        batch_pred_ids=batch_max_ids,
                                        batch_gold_ids=batch[-2],
                                        word_seq_lens=batch[1],
                                        idx2label=config.idx2labels)
        batch_id += 1
    p, total_predict, total_entity = metrics[0], metrics[1], metrics[2]
    precision = p * 1.0 / total_predict * 100 if total_predict != 0 else 0
    recall = p * 1.0 / total_entity * 100 if total_entity != 0 else 0
    fscore = 2.0 * precision * recall / (precision + recall) if precision != 0 or recall != 0 else 0
    print("[%s set] Precision: %.2f, Recall: %.2f, F1: %.2f" % (name, precision, recall, fscore), flush=True)
    return [precision, recall, fscore]
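# Both variants above accumulate a 3-element integer array from
# evaluate_batch_insts. The real helper lives in the repo's eval module;
# the sketch below only illustrates the assumed contract -- counts of
# [correct spans, predicted spans, gold spans] over the decoded label
# sequences -- using a hypothetical BIO span extractor:
import numpy as np

def _extract_spans_sketch(labels):
    # hypothetical helper: collect (start, end, type) spans from BIO tags
    spans, start = set(), -1
    for i, lab in enumerate(labels + ["O"]):
        if lab.startswith("B-"):
            if start != -1:
                spans.add((start, i - 1, labels[start][2:]))
            start = i
        elif not lab.startswith("I-") and start != -1:
            spans.add((start, i - 1, labels[start][2:]))
            start = -1
    return spans

def _evaluate_batch_insts_sketch(batch_pred_ids, batch_gold_ids, word_seq_lens, idx2label):
    num_correct = num_predicted = num_gold = 0
    for idx in range(len(word_seq_lens)):
        length = int(word_seq_lens[idx])
        pred = _extract_spans_sketch([idx2label[t] for t in batch_pred_ids[idx][:length].tolist()])
        gold = _extract_spans_sketch([idx2label[t] for t in batch_gold_ids[idx][:length].tolist()])
        num_correct += len(pred & gold)
        num_predicted += len(pred)
        num_gold += len(gold)
    return np.asarray([num_correct, num_predicted, num_gold], dtype=int)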
def evaluate_model(config: Config, model: NNCRF, batch_insts_ids, name: str, insts: List[Instance], print_each_type_metric: bool = False):
    ## evaluation
    p_dict, total_predict_dict, total_entity_dict = Counter(), Counter(), Counter()
    batch_id = 0
    batch_size = config.batch_size
    with torch.no_grad():
        for batch in batch_insts_ids:
            one_batch_insts = insts[batch_id * batch_size:(batch_id + 1) * batch_size]
            batch_max_scores, batch_max_ids = model.decode(**batch)
            batch_p, batch_predict, batch_total = evaluate_batch_insts(one_batch_insts,
                                                                       batch_max_ids,
                                                                       batch["labels"],
                                                                       batch["word_seq_lens"],
                                                                       config.idx2labels)
            p_dict += batch_p
            total_predict_dict += batch_predict
            total_entity_dict += batch_total
            batch_id += 1
    if print_each_type_metric:
        for key in total_entity_dict:
            precision_key, recall_key, fscore_key = get_metric(p_dict[key], total_entity_dict[key], total_predict_dict[key])
            print(f"[{key}] Prec.: {precision_key:.2f}, Rec.: {recall_key:.2f}, F1: {fscore_key:.2f}")
    total_p = sum(list(p_dict.values()))
    total_predict = sum(list(total_predict_dict.values()))
    total_entity = sum(list(total_entity_dict.values()))
    precision, recall, fscore = get_metric(total_p, total_entity, total_predict)
    print(colored(f"[{name} set Total] Prec.: {precision:.2f}, Rec.: {recall:.2f}, F1: {fscore:.2f}", 'blue'), flush=True)
    return [precision, recall, fscore]
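# This variant and the next delegate the P/R/F1 arithmetic to
# get_metric(p, total_entity, total_predict). A sketch consistent with
# those call sites -- note that the gold total precedes the predicted
# total in the argument order; the repo's own helper is not shown here:
def _get_metric_sketch(p_num: int, total_num: int, total_predicted_num: int):
    precision = p_num * 1.0 / total_predicted_num * 100 if total_predicted_num != 0 else 0
    recall = p_num * 1.0 / total_num * 100 if total_num != 0 else 0
    fscore = 2.0 * precision * recall / (precision + recall) if precision != 0 or recall != 0 else 0
    return precision, recall, fscore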
def evaluate_model(config: Config, model: NNCRF, batch_insts_ids, name: str, insts: List[Instance]):
    ## evaluation
    p_dict, total_predict_dict, total_entity_dict = Counter(), Counter(), Counter()
    batch_id = 0
    batch_size = config.batch_size
    for batch in batch_insts_ids:
        one_batch_insts = insts[batch_id * batch_size:(batch_id + 1) * batch_size]
        batch_max_scores, batch_max_ids = model.decode(batch)
        batch_p, batch_predict, batch_total = evaluate_batch_insts(one_batch_insts,
                                                                   batch_max_ids,
                                                                   batch[-1],
                                                                   batch[1],
                                                                   config.idx2labels,
                                                                   config.use_crf_layer)
        p_dict += batch_p
        total_predict_dict += batch_predict
        total_entity_dict += batch_total
        batch_id += 1
    # initialize to zeros so the new-type metrics are defined even when
    # config.new_type never appears among the gold entities
    precision_new_type, recall_new_type, fscore_new_type = 0, 0, 0
    for key in total_entity_dict:
        precision_key, recall_key, fscore_key = get_metric(p_dict[key], total_entity_dict[key], total_predict_dict[key])
        print("[%s] Prec.: %.2f, Rec.: %.2f, F1: %.2f" % (key, precision_key, recall_key, fscore_key))
        if key == config.new_type:
            precision_new_type, recall_new_type, fscore_new_type = precision_key, recall_key, fscore_key
    total_p = sum(list(p_dict.values()))
    total_predict = sum(list(total_predict_dict.values()))
    total_entity = sum(list(total_entity_dict.values()))
    precision, recall, fscore = get_metric(total_p, total_entity, total_predict)
    print(colored("[%s set Total] Prec.: %.2f, Rec.: %.2f, F1: %.2f" % (name, precision, recall, fscore), 'blue'), flush=True)
    if config.choose_by_new_type:
        return [precision_new_type, recall_new_type, fscore_new_type]
    else:
        return [precision, recall, fscore]
def evaluate_model(config: Config, model: NNCRF, batch_insts_ids, name: str, insts: List[Instance]):
    ## evaluation: exact-match and overlap (partial-match) span metrics
    metrics_exact = np.asarray([0, 0, 0], dtype=int)
    metrics_overlap = np.asarray([0, 0, 0], dtype=int)
    dict_exact = {}
    dict_overlap = {}
    batch_id = 0
    batch_size = config.batch_size
    for batch in batch_insts_ids:
        one_batch_insts = insts[batch_id * batch_size:(batch_id + 1) * batch_size]
        batch_max_scores, batch_max_ids = model.decode(batch)
        results = evaluate_batch_insts(one_batch_insts, batch_max_ids, batch[-1], batch[1], config.idx2labels)
        metrics_exact += results[0]
        metrics_overlap += results[1]
        for key in results[2]:
            if key not in dict_exact:
                dict_exact[key] = [0, 0, 0]
            dict_exact[key][0] += results[2][key][0]
            dict_exact[key][1] += results[2][key][1]
            dict_exact[key][2] += results[2][key][2]
        for key in results[3]:
            if key not in dict_overlap:
                dict_overlap[key] = [0, 0, 0]
            dict_overlap[key][0] += results[3][key][0]
            dict_overlap[key][1] += results[3][key][1]
            dict_overlap[key][2] += results[3][key][2]
        batch_id += 1
    p_exact, total_predict, total_entity = metrics_exact[0], metrics_exact[1], metrics_exact[2]
    precision_exact = p_exact * 1.0 / total_predict * 100 if total_predict != 0 else 0
    recall_exact = p_exact * 1.0 / total_entity * 100 if total_entity != 0 else 0
    fscore_exact = 2.0 * precision_exact * recall_exact / (precision_exact + recall_exact) if precision_exact != 0 or recall_exact != 0 else 0
    print("[%s set - Exact] Precision: %.2f, Recall: %.2f, F1: %.2f" % (name, precision_exact, recall_exact, fscore_exact), flush=True)
    # print_report(dict_exact)
    p_overlap, total_predict, total_entity = metrics_overlap[0], metrics_overlap[1], metrics_overlap[2]
    precision_overlap = p_overlap * 1.0 / total_predict * 100 if total_predict != 0 else 0
    recall_overlap = p_overlap * 1.0 / total_entity * 100 if total_entity != 0 else 0
    fscore_overlap = 2.0 * precision_overlap * recall_overlap / (precision_overlap + recall_overlap) if precision_overlap != 0 or recall_overlap != 0 else 0
    print("[%s set - Overlap] Precision: %.2f, Recall: %.2f, F1: %.2f" % (name, precision_overlap, recall_overlap, fscore_overlap), flush=True)
    # print_report(dict_overlap)
    return [precision_exact, recall_exact, fscore_exact], [precision_overlap, recall_overlap, fscore_overlap], dict_exact, dict_overlap
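# print_report is commented out above; a hypothetical sketch of such a
# per-type report, assuming each dict maps a label to [correct,
# predicted, gold] counts in the same order as metrics_exact (the real
# helper's signature is not visible in this file):
def _print_report_sketch(type_dict):
    for key, (correct, predicted, gold) in sorted(type_dict.items()):
        prec = correct * 100.0 / predicted if predicted != 0 else 0
        rec = correct * 100.0 / gold if gold != 0 else 0
        f1 = 2.0 * prec * rec / (prec + rec) if prec + rec != 0 else 0
        print("[%s] Prec.: %.2f, Rec.: %.2f, F1: %.2f" % (key, prec, rec, f1))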
def evaluate_model(config: Config, model: NNCRF, batch_insts_ids, name: str, insts: List[Instance]):
    ## evaluation
    tp, fp, tn, fn = 0, 0, 0, 0
    metrics = np.asarray([0, 0, 0], dtype=int)
    metrics_e2e = np.zeros((1, 3), dtype=int)
    pair_metrics = np.asarray([0, 0, 0], dtype=int)
    batch_idx = 0
    batch_size = config.batch_size
    for batch in batch_insts_ids:
        one_batch_insts = insts[batch_idx * batch_size:(batch_idx + 1) * batch_size]
        processed_batched_data = simple_batching(config, batch)
        batch_max_scores, batch_max_ids, pair_ids = model.decode(processed_batched_data)
        # Task 1: span-level metrics over the decoded label sequences
        metrics += evaluate_batch_insts(one_batch_insts, batch_max_ids,
                                        processed_batched_data[-6],
                                        processed_batched_data[2],
                                        config.idx2labels)
        # end-to-end metrics over both tasks
        metrics_e2e += evaluate_batch_insts_e2e(one_batch_insts, batch_max_ids,
                                                processed_batched_data[-6],
                                                processed_batched_data[2],
                                                config.idx2labels,
                                                processed_batched_data[-8],
                                                pair_ids,
                                                processed_batched_data[-1])
        # Task 2: binary pairing decisions, scored as a confusion matrix
        word_seq_lens = processed_batched_data[2].tolist()
        for batch_id in range(batch_max_ids.size()[0]):
            length = word_seq_lens[batch_id]
            gold = processed_batched_data[-6][batch_id][:length]
            # collect the positions of gold argument labels (S=2, B=3, E=4, I=5)
            s_id = (gold == 2).nonzero()
            b_id = (gold == 3).nonzero()
            e_id = (gold == 4).nonzero()
            i_id = (gold == 5).nonzero()
            gold_id = torch.cat([s_id, b_id, e_id, i_id]).squeeze(1)
            gold_id, _ = gold_id.sort(0, descending=False)
            gold_id = gold_id[gold_id < processed_batched_data[-1][batch_id]]
            # argu_id is retained from a disabled variant that also intersected
            # these positions with predicted argument positions
            argu_id = torch.LongTensor(list(set(gold_id.tolist())))
            one_batch_insts[batch_id].gold2 = processed_batched_data[-3][batch_id].tolist()
            one_batch_insts[batch_id].pred2 = pair_ids[batch_id].squeeze(2).tolist()
            pred2 = pair_ids[batch_id].squeeze(2)
            gold2 = processed_batched_data[-3][batch_id]
            # gold/pred entries are 0/1 with negative padding, so gold + pred
            # is 2 for a true positive and 0 for a true negative once the
            # padded (negative-sum) positions are dropped
            gold_pairs = gold2.flatten()
            pred_pairs = pred2.flatten()
            sum_table = gold_pairs + pred_pairs
            sum_table_sliced = sum_table[sum_table >= 0]
            tp_tmp = len(sum_table_sliced[sum_table_sliced == 2])
            tn_tmp = len(sum_table_sliced[sum_table_sliced == 0])
            tp += tp_tmp
            tn += tn_tmp
            ones = len(gold_pairs[gold_pairs == 1])
            zeros = len(gold_pairs[gold_pairs == 0])
            fp += (zeros - tn_tmp)
            fn += (ones - tp_tmp)
        batch_idx += 1
    print('tp, fp, fn, tn: ', tp, fp, fn, tn)
    precision_2 = 1.0 * tp / (tp + fp) * 100 if tp + fp != 0 else 0
    recall_2 = 1.0 * tp / (tp + fn) * 100 if tp + fn != 0 else 0
    f1_2 = 2.0 * precision_2 * recall_2 / (precision_2 + recall_2) if precision_2 + recall_2 != 0 else 0
    acc = 1.0 * (tp + tn) / (fp + fn + tp + tn) * 100 if fp + fn + tp + tn != 0 else 0
    p, total_predict, total_entity = metrics[0], metrics[1], metrics[2]
    precision = p * 1.0 / total_predict * 100 if total_predict != 0 else 0
    recall = p * 1.0 / total_entity * 100 if total_entity != 0 else 0
    fscore = 2.0 * precision * recall / (precision + recall) if precision != 0 or recall != 0 else 0
    p_e2e, total_predict_e2e, total_entity_e2e = metrics_e2e[:, 0], metrics_e2e[:, 1], metrics_e2e[:, 2]
    # guard against division by zero by mapping empty counts to sys.maxsize
    total_predict_e2e[total_predict_e2e == 0] = sys.maxsize
    total_entity_e2e[total_entity_e2e == 0] = sys.maxsize
    precision_e2e = p_e2e * 1.0 / total_predict_e2e * 100
    recall_e2e = p_e2e * 1.0 / total_entity_e2e * 100
    sum_e2e = precision_e2e + recall_e2e
    sum_e2e[sum_e2e == 0] = sys.maxsize
    fscore_e2e = 2.0 * precision_e2e * recall_e2e / sum_e2e
    print("Task1: ", p, total_predict, total_entity)
    print("Task1: [%s set] Precision: %.2f, Recall: %.2f, F1: %.2f" % (name, precision, recall, fscore), flush=True)
    print("Task2: [%s set] Precision: %.2f, Recall: %.2f, F1: %.2f, acc: %.2f" % (name, precision_2, recall_2, f1_2, acc), flush=True)
    percs = [0.9]
    # percs = [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9]
    for i in range(len(percs)):
        print("Overall ", percs[i],
              ": [%s set] Precision: %.2f, Recall: %.2f, F1: %.2f" % (name, precision_e2e[i], recall_e2e[i], fscore_e2e[i]),
              flush=True)
    return [precision, recall, fscore, precision_2, recall_2, f1_2, acc, precision_e2e, recall_e2e, fscore_e2e]
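# Task 2 above scores binary pairing decisions with a sum trick: gold and
# pred matrices hold 0/1 entries with negative padding, so gold + pred is
# 2 for a true positive and 0 for a true negative once padded positions
# (negative sums) are dropped. A self-contained sketch of that counting:
def _pair_confusion_sketch(gold2: torch.Tensor, pred2: torch.Tensor):
    gold_pairs, pred_pairs = gold2.flatten(), pred2.flatten()
    sum_table = gold_pairs + pred_pairs
    valid = sum_table[sum_table >= 0]       # keep unpadded positions
    tp = int((valid == 2).sum())            # gold 1, pred 1
    tn = int((valid == 0).sum())            # gold 0, pred 0
    ones = int((gold_pairs == 1).sum())     # gold positives
    zeros = int((gold_pairs == 0).sum())    # gold negatives
    return tp, zeros - tn, ones - tp, tn    # tp, fp, fn, tn

# e.g. gold [1, 0, -1] vs pred [1, 1, -1] -> (1, 1, 0, 0)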
def evaluate_model(config: Config, model: NNCRF, batch_insts_ids, name: str, insts: List[Instance]):
    ## evaluation
    i = 0
    metrics = np.zeros(26, dtype=int)
    batch_id = 0
    batch_size = config.batch_size
    for batch in batch_insts_ids:
        i += 1
        flag = 0
        one_batch_insts = insts[batch_id * batch_size:(batch_id + 1) * batch_size]
        batch_max_scores, batch_max_ids = model.decode(batch)
        # flag is raised on the penultimate batch and passed through to evaluate_batch_insts
        if i == len(batch_insts_ids) - 1:
            flag = 1
        metrics += evaluate_batch_insts(one_batch_insts, batch_max_ids, batch[-1], batch[1],
                                        config.idx2labels, config.use_crf_layer,
                                        config.test_kind, flag)
        batch_id += 1
    p, p_special, total_predict, total_entity, special_predict, special_entity = \
        metrics[0], metrics[1], metrics[2], metrics[3], metrics[4], metrics[5]
    # error-analysis counters accumulated by evaluate_batch_insts, keyed by
    # error type and by gold entity length
    wrong_prediction = {
        "BLater": metrics[6],
        "BEarlier": metrics[7],
        "ILater": metrics[8],
        "IEarlier": metrics[9],
        "O2misc": metrics[10],
        "misc2O": metrics[11],
        1: metrics[12],
        2: metrics[13],
        3: metrics[14],
        4: metrics[15],
        5: metrics[16],
        6: metrics[17],
        7: metrics[18],
        "length1": metrics[19],
        "length2": metrics[20],
        "length3": metrics[21],
        "length4": metrics[22],
        "length5": metrics[23],
        "length6": metrics[24],
        "length7": metrics[25],
    }
    precision = p * 1.0 / total_predict * 100 if total_predict != 0 else 0
    recall = p * 1.0 / total_entity * 100 if total_entity != 0 else 0
    fscore = 2.0 * precision * recall / (precision + recall) if precision != 0 or recall != 0 else 0
    precision_special = p_special * 1.0 / special_predict * 100 if special_predict != 0 else 0
    recall_special = p_special * 1.0 / special_entity * 100 if special_entity != 0 else 0
    fscore_special = 2.0 * precision_special * recall_special / (precision_special + recall_special) \
        if precision_special != 0 or recall_special != 0 else 0
    print("---[%s set] Precision: %.2f, Recall: %.2f, F1: %.2f" % (name, precision, recall, fscore), flush=True)
    print("---[%s of %s set] Precision: %.2f, Recall: %.2f, F1: %.2f" % (config.test_kind, name, precision_special, recall_special, fscore_special), flush=True)
    print(p_special, special_entity, special_predict)
    for inn in wrong_prediction.keys():
        if str(inn).startswith("length"):
            print(wrong_prediction[inn], end=" ")
    print()
    print(wrong_prediction)
    if name == "test":
        print()
    return [precision, recall, fscore]
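# A typical call site for these evaluators, sketched with illustrative
# names (dev_batches, dev_insts and best_model.pt are assumptions, not
# part of this file). Index 2 of the returned list is the F1 score:
def _run_dev_eval_sketch(config, model, dev_batches, dev_insts, best_dev_f1=0.0):
    model.eval()
    with torch.no_grad():
        precision, recall, fscore = evaluate_model(config, model, dev_batches, "dev", dev_insts)
    if fscore > best_dev_f1:
        best_dev_f1 = fscore
        torch.save(model.state_dict(), "best_model.pt")  # hypothetical checkpoint path
    return best_dev_f1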