import json
import os

from fever.scorer import fever_score
from prettytable import PrettyTable


def main(prediction_file, golden_file):
    path = os.getcwd()
    prediction_file = os.path.join(path, prediction_file)
    golden_file = os.path.join(path, golden_file)

    actual = []
    with open(golden_file, "r") as f:
        for line in f:
            actual.append(json.loads(line))

    predictions = []
    with open(prediction_file, "r") as f:
        for line in f:
            predictions.append(json.loads(line))

    assert len(predictions) == len(actual), \
        "The two files provided do not have the same number of lines"

    score, acc, precision, recall, f1 = fever_score(predictions, actual)

    tab = PrettyTable()
    tab.field_names = [
        "FEVER Score", "Label Accuracy", "Evidence Precision",
        "Evidence Recall", "Evidence F1"
    ]
    tab.add_row((round(score, 4), round(acc, 4), round(precision, 4),
                 round(recall, 4), round(f1, 4)))
    print(tab)
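# A minimal sketch of the instance shape fever_score consumes, distilled from
# the snippets below (the values here are illustrative only): gold "evidence"
# is a list of evidence sets, each a list of [annotation_id, evidence_id,
# page, sent_id] entries, while "predicted_evidence" holds [page, sent_id]
# pairs.
example_instance = {
    "label": "supports",
    "predicted_label": "supports",
    "evidence": [[[None, None, "page", 0]]],
    "predicted_evidence": [["page", 0]],
}
# With actual omitted, the scorer reads labels and evidence from the
# instances themselves.
score, acc, precision, recall, f1 = fever_score([example_instance])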
def main(args):
    # load the actual evaluation set - ground truth
    jlr = JSONLineReader()
    data_lines = jlr.read(args.actual_data_file)

    predictions_file_path = os.path.join(args.out_dir_rte,
                                         "refined_predictions.jsonl")
    submission_lines = jlr.read(predictions_file_path)

    sorted_lines = []
    for g1, line in enumerate(data_lines[:10]):  # debug
        instance = {}
        instance["id"] = line["id"]
        instance.update(submission_lines[g1])
        sorted_lines.append(instance)

    score, acc, precision, recall, f1 = fever_score(sorted_lines,
                                                    data_lines[:10])  # debug

    tab = PrettyTable()
    tab.field_names = [
        "FEVER Score", "Label Accuracy", "Evidence Precision",
        "Evidence Recall", "Evidence F1",
    ]
    tab.add_row((
        round(score, 4),
        round(acc, 4),
        round(precision, 4),
        round(recall, 4),
        round(f1, 4),
    ))
    print(tab)
def evaluate_sentence_selection(self, loader, labels):
    """
    Evaluate model on validation data

    Parameters
    ----------
    loader : data.DataLoader
        Data loader class containing validation data
    labels : dict
        Index to output class

    Returns
    -------
    list of dict
        Instances with "label" and "predicted_label" decoded to class names
    """
    jsons = []
    for i, batch in tqdm(enumerate(loader), total=len(loader)):
        X, y, json_list = batch
        for instance in json_list:
            # decode label indices back into output class names
            instance["predicted_label"] = labels[instance["predicted_label"]]
            instance["label"] = labels[instance["label"]]
            jsons.append(instance)

    strict_score, label_accuracy, precision, recall, f1 = fever_score(jsons)
    print(f"Fever score: {strict_score}")
    print(f"Label accuracy: {label_accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1: {f1}")
    return jsons
def main(evidence_file, golden_file):
    path = os.getcwd()
    evidence_file = os.path.join(path, evidence_file)
    golden_file = os.path.join(path, golden_file)

    actual = []
    with open(golden_file, "r") as f:
        for line in f:
            actual.append(json.loads(line))

    predictions = []
    with open(evidence_file, "r") as f:
        for i, line in enumerate(f):
            line = json.loads(line)
            # Oracle label: copying the gold label in means the score below
            # measures evidence retrieval only (the OFEVER upper bound).
            line["predicted_label"] = actual[i]["label"]
            line["predicted_evidence"] = list(
                map(lambda e: e[1][:2], line["predicted_sentences"]))
            predictions.append(line)

    assert len(predictions) == len(actual), \
        "The two files provided do not have the same number of lines"

    score, _, precision, recall, f1 = fever_score(predictions, actual)

    tab = PrettyTable()
    tab.field_names = [
        "OFEVER Score", "Evidence Precision", "Evidence Recall", "Evidence F1"
    ]
    tab.add_row((round(score, 4), round(precision, 4), round(recall, 4),
                 round(f1, 4)))
    print(tab)
def test_global_recall_partial_two_sents(self):
    instance = {
        "label": "supports",
        "predicted_label": "supports",
        "evidence": [[[None, None, "page", 0], [None, None, "page", 1]]],
        "predicted_evidence": [["page", 0], ["page", 1]]
    }
    _, _, _, r, _ = fever_score([instance], max_evidence=2)
    self.assertEqual(r, 1)

def test_strict_partial_zero(self):
    instance = {
        "label": "supports",
        "predicted_label": "supports",
        "evidence": [[[None, None, "page", 0], [None, None, "page", 1]]],
        "predicted_evidence": [["page", 0], ["page", 1]]
    }
    strict, _, _, _, _ = fever_score([instance], max_evidence=1)
    self.assertEqual(strict, 0)

def test_global_precision_partial_one_sent(self):
    instance = {
        "label": "supports",
        "predicted_label": "supports",
        "evidence": [[[None, None, "page", 0], [None, None, "page", 2]]],
        "predicted_evidence": [["page", 0], ["page", 1]]
    }
    _, _, p, _, _ = fever_score([instance], max_evidence=1)
    self.assertEqual(p, 1)

def test_non_modification(self):
    instance = {
        "label": "supports",
        "predicted_label": "supports",
        "evidence": [[[None, None, "page", 0], [None, None, "page", 1]]],
        "predicted_evidence": [["page", 0], ["page", 1]]
    }
    instance_copy = instance.copy()
    _, _, _, _, _ = fever_score([instance], max_evidence=0)
    self.assertEqual(instance_copy, instance)
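# Read together, these tests pin down the max_evidence semantics: the scorer
# only considers the first max_evidence predicted sentences, so truncating a
# complete two-sentence prediction to one keeps precision at 1 but drops the
# strict FEVER score to 0. A standalone sketch of the same behaviour
# (assuming fever_score is importable, as in the tests):
instance = {
    "label": "supports",
    "predicted_label": "supports",
    "evidence": [[[None, None, "page", 0], [None, None, "page", 1]]],
    "predicted_evidence": [["page", 0], ["page", 1]],
}
for k in (1, 2):
    strict, _, _, _, _ = fever_score([dict(instance)], max_evidence=k)
    print(k, strict)  # expected: 0.0 for k=1, 1.0 for k=2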
def train(model, optimizer, criterion, path, best_f1, epoch_num):
    if not model.training:
        model.train()
    epoch_loss = 0
    for i, batch in enumerate(train_iterator):
        model.train()
        optimizer.zero_grad()
        claims, sentences = batch.claim, batch.sentence
        predictions = model(claims, sentences)
        loss = criterion(predictions, batch.sent_label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        if (i + 1) % 5000 == 0:
            print(f'BATCH: {i + 1}')

    print("--------------------------------")
    print(f'BATCH: {i + 1}')
    print("loss", epoch_loss / (i + 1))

    file_data, fever_data = evaluate(model, dev_iterator, dev_path)
    fever_val, accuracy, precision, recall, f1score = fever_score(fever_data)
    print(f'Fever Score: {fever_val} | Accuracy: {accuracy}')
    print(f'Precision: {precision} | Recall: {recall} | F1Score: {f1score}')

    # Checkpoint every epoch; the best-F1 gate below is disabled, so best_f1
    # is passed through unchanged.
    # if f1score > best_f1:
    #     best_f1 = f1score
    print('Saving Model. . . ')
    torch.save(model.state_dict(), path + f'{epoch_num}_{f1score:0.3f}.pt')
    print('Model Saved Successfully!')
    pd.DataFrame(fever_data).to_csv(
        "/content/gdrive/My Drive/NLPWikiData/fever_data_output_E" +
        f'{epoch_num}.csv')
    pd.DataFrame(file_data).to_json(
        "/content/gdrive/My Drive/NLPWikiData/sen_pred_train_E" +
        f'{epoch_num}.jsonl', orient='records', lines=True)
    print('jsonl file saved for RTE')
    print("--------------------------------")
    print("--------------------------------")
    return epoch_loss / len(train_iterator), best_f1
def run_score(config):
    ids = []
    predicted_labels = []
    predicted_evidence = []
    actual = []

    with open(config['predicted_labels'], "r") as predictions_file:
        for line in predictions_file:
            predicted_labels.append(json.loads(line)["predicted"])

    with open(config['predicted_evidence'], "r") as predictions_file:
        for line in predictions_file:
            record = json.loads(line)
            predicted_evidence.append(record["predicted_sentences"][:5])
            ids.append(record["id"])

    predictions = []
    for instance_id, ev, label in zip(ids, predicted_evidence,
                                      predicted_labels):
        predictions.append({
            "id": instance_id,
            "predicted_evidence": ev,
            "predicted_label": label
        })
    save_submission_file(predictions, config['submission'])

    with open(config['actual_file'], "r") as actual_file:
        for line in actual_file:
            actual.append(json.loads(line))

    score, acc, precision, recall, f1 = fever_score(predictions, actual)
    save_simple_result(config['score_file'], score, acc, precision, recall)
    print_confusion_mat(predictions, actual)

    tab = PrettyTable()
    tab.field_names = [
        "FEVER Score", "Label Accuracy", "Evidence Precision",
        "Evidence Recall", "Evidence F1"
    ]
    tab.add_row((round(score, 4), round(acc, 4), round(precision, 4),
                 round(recall, 4), round(f1, 4)))
    print(tab)
def check(file, threshold, max_evidence=5):
    instances = []
    with open(file, 'rb') as fin:
        for line in fin:
            instance = json.loads(line.decode(ENCODING))
            # keep only predicted sentences whose score clears the threshold
            evidences = []
            for evidence in instance['predicted_evidence']:
                if float(evidence[2]) < threshold:
                    continue
                evidences.append([evidence[0], evidence[1]])
            instance['predicted_evidence'] = evidences
            instances.append(instance)

    strict_score, label_accuracy, precision, recall, f1 = fever_score(
        instances, actual=None, max_evidence=max_evidence)
    print('Evidence precision:', precision)
    print('Evidence recall:', recall)
    print('Evidence f1:', f1)
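# Hypothetical invocation of check() above; the file name and threshold are
# placeholders, and the JSONL is assumed to carry a confidence score at
# position 2 of each predicted_evidence entry, which is what the filter reads.
check('reranked_evidence.jsonl', threshold=0.5)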
def score_submission(predicted_labels_file, predicted_evidence_file,
                     actual_labels_file):
    predicted_labels = []
    predicted_evidence = []
    actual = []
    flatten = lambda l: [item for sublist in l for item in sublist]

    with open(predicted_labels_file, "r") as predictions_file:
        for line in predictions_file:
            predicted_labels.append(json.loads(line)["predicted_label"])

    with open(actual_labels_file, "r") as actual_file:
        for line in actual_file:
            actual.append(json.loads(line))

    with open(predicted_evidence_file, "r") as predictions_file:
        for line in predictions_file:
            line = json.loads(line)
            if "predicted_evidence" in line:
                predicted_evidence.append(line["predicted_evidence"])
            elif "predicted_sentences" in line:
                predicted_evidence.append(line["predicted_sentences"])
            else:
                # fall back to gold evidence, keeping only [page, sent_id]
                predicted_evidence.append(
                    [[e[2], e[3]] for e in flatten(line["evidence"])])

    predictions = []
    for ev, label in zip(predicted_evidence, predicted_labels):
        predictions.append({
            "predicted_evidence": ev,
            "predicted_label": label
        })

    sdata = list(fever_score(predictions, actual))
    sdata.append(len(predictions))
    return tuple(sdata)
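# Sketch of calling score_submission (the paths are placeholders): it returns
# the five fever_score values plus the number of predictions scored.
fever, acc, precision, recall, f1, n = score_submission(
    "predicted_labels.jsonl", "predicted_evidence.jsonl",
    "shared_task_dev.jsonl")
print(f"FEVER score {fever:.4f} over {n} claims")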
        fever_dict['predicted_evidence'] = [
            [x[3], int(x[1])] for x in sorted_predicted_sentences][:5]
        fever_dict['evidence'] = org_dev_evidence_list
        file_data.append(temp_data)
        fever_data.append(fever_dict)

    # pd.DataFrame(file_data).to_json(sen_pred_test_path, orient='records',
    #                                 lines=True)
    print('prob_count', prob_count)
    return file_data, fever_data


dev_file_data, dev_fever_data = evaluate(model, dev_iterator, dev_path)
pd.DataFrame(dev_file_data).to_json(sen_pred_dev_path, orient='records',
                                    lines=True)
pd.DataFrame(dev_fever_data).to_csv(dev_fever_data_path)
print("Done!")

dev_fever_val, dev_accuracy, dev_precision, dev_recall, f1score = fever_score(
    dev_fever_data)
print(f'Fever Score: {dev_fever_val} | Accuracy: {dev_accuracy}')
print(f'Precision: {dev_precision} | Recall: {dev_recall} | F1Score: {f1score}')

x = pd.DataFrame(dev_fever_data)
# blank out predicted evidence for NOT ENOUGH INFO predictions
i = x[x['predicted_label'] == 'NOT ENOUGH INFO'].index
x.loc[i, 'predicted_evidence'] = [[]]
y = x.to_dict('records')
# with open(args.predicted_evidence, "r") as predictions_file:
#     for line in predictions_file:
#         actual_labels2.append(json.loads(line)["label"])
#
# with open(args.actual, "r") as actual_file:
#     for line in actual_file:
#         actual_labels3.append(json.loads(line)["label"])
#
# for actual1, actual2, actual3 in zip(actual_labels1, actual_labels2,
#                                      actual_labels3):
#     assert actual1 == actual2 == actual3, \
#         "{}, {}, {}".format(actual1, actual2, actual3)

with open(args.actual, "r") as actual_file:
    for line in actual_file:
        actual.append(json.loads(line))

score, acc, precision, recall, f1 = fever_score(predictions, actual)
save_simple_result(args.score_file, score, acc, precision, recall)
print_confusion_mat(predictions, actual)

if args.err_analysis:
    save_wrong_instances(args.actual, args.predicted_labels,
                         args.predicted_evidence, args.err_analysis,
                         args.predicted_labels_supplement)

tab = PrettyTable()
tab.field_names = [
    "FEVER Score", "Label Accuracy", "Evidence Precision",
    "Evidence Recall", "Evidence F1"
]
tab.add_row((round(score, 4), round(acc, 4), round(precision, 4),
             round(recall, 4), round(f1, 4)))
def score2(all_expts):
    predictions, actual = list(zip(*all_expts))
    sdata = list(fever_score(predictions, actual))
    sdata.append(len(predictions))
    return tuple(sdata)
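# score2 expects (prediction, gold) pairs; a sketch assuming parallel lists
# of instance dicts named predictions and actual:
fever, acc, precision, recall, f1, n = score2(list(zip(predictions, actual)))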
def evaluate(self, loader, labels):
    """
    Evaluate model on validation data

    Parameters
    ----------
    loader : data.DataLoader
        Data loader class containing validation data
    labels : dict
        Index to output class

    Returns
    -------
    ([float], [float]):
        Loss history and running loss history
    """
    self.model.eval()
    jsons = []
    loss_history = []
    running_loss = 0.0
    running_loss_history = []

    # don't compute gradients
    with torch.no_grad():
        for i, batch in tqdm(enumerate(loader), total=len(loader)):
            # Split up the batch
            X, y, json_list = batch

            # Forward pass
            logits = self.model(X.to(self.device))
            og_shape = logits.shape

            # Reshape to be (sent len * batch size, output dim)
            logits = logits.view(-1, logits.shape[-1])

            # Compute loss & add to history (no backprop)
            loss = self.loss_fn(logits, y.view(-1).to(self.device))
            loss_history.append(loss.item())
            running_loss += (loss_history[-1] - running_loss) / (i + 1)
            running_loss_history.append(running_loss)

            # Softmax to normalize the class probabilities
            probs = torch.softmax(logits, dim=-1)

            # Get the output class from the probs;
            # also, reshape the predictions back to sentences
            predictions = torch.argmax(probs, dim=-1).reshape(og_shape[:-1])

            for pred, instance in zip(predictions.tolist(), json_list):
                # Majority vote over sentence-level predictions:
                # class 2 wins outright, else class 0, else 1 (NEI) on a tie
                c = Counter(pred)
                most_common = 2 if c[2] > c[0] else 0 if c[0] > c[2] else 1
                instance["predicted_label"] = labels[most_common]
                instance["label"] = labels[instance["label"]]
                jsons.append(instance)

    strict_score, label_accuracy, precision, recall, f1 = fever_score(jsons)
    print(f"Fever score: {strict_score}")
    print(f"Label accuracy: {label_accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1: {f1}")
    return loss_history, running_loss_history
    for e in evidence_set:
        unique_evidence.add((None, None, e[2], 0))
    new_evidence.append([list(i) for i in unique_evidence])
    j['evidence'] = new_evidence
    actual.append(j)
    if 'attack' in j:
        attacks[j['attack']].append(idx)

# for pe, j in zip(predicted_evidence, actual):
#     print(pe, j['evidence'])

predictions = []
for ev, label in zip(predicted_evidence, predicted_labels):
    predictions.append({"predicted_evidence": ev, "predicted_label": label})

score, acc, precision, recall, f1 = fever_score(predictions, actual)

tab = PrettyTable()
tab.field_names = [
    "FEVER Score", "Label Accuracy", "Evidence Precision",
    "Evidence Recall", "Evidence F1"
]
tab.add_row((round(score, 4), round(acc, 4), round(precision, 4),
             round(recall, 4), round(f1, 4)))
print(tab)

actually = [i['label'] for i in actual]
predicted = [i['predicted_label'] for i in predictions]
print(classification_report(actually, predicted))
"∆ Evidence Recall", " ∆ Evidence F1" ] for i in range(1, args.num_subs + 1): predictions = [] with open(args.prediction + " " + str(i), "r") as predictions_file: for line in predictions_file: predictions.append(json.loads(line)) print("Team {0}".format(i)) p1 = deepcopy(predictions) p2 = deepcopy(predictions) oscore, oacc, oprecision, orecall, of1 = fever_score(p1, actual_original) nscore, nacc, nprecision, nrecall, nf1 = fever_score(p2, actual_rescore) dscore, dacc, dprecision, drecall, df1 = nscore - oscore, nacc - oacc, nprecision - oprecision, nrecall - orecall, nf1 - of1 tab = PrettyTable() tab.field_names = [ "", "FEVER Score", "Label Accuracy", "Evidence Precision", "Evidence Recall", "Evidence F1" ] tab.add_row(("Original", round(oscore, 4), round(oacc, 4), round(oprecision, 4), round(orecall, 4), round(of1, 4))) tab.add_row(("Rescore", round(nscore, 4), round(nacc, 4), round(nprecision, 4), round(nrecall, 4), round(nf1, 4))) tab.add_row(("∆", round(dscore, 4), round(dacc, 4), round(dprecision, 4), round(drecall, 4), round(df1, 4))) deltatab.add_row((round(dscore, 4), round(dacc, 4), round(dprecision, 4),
        predicted_labels.append(json.loads(line)["predicted_label"])

with open(args.predicted_evidence, "r") as predictions_file:
    for line in predictions_file:
        line = json.loads(line)
        if "predicted_sentences" in line:
            predicted_evidence.append(line["predicted_sentences"])
        elif "predicted_evidence" in line:
            predicted_evidence.append(line["predicted_evidence"])
        elif "evidence" in line:
            all_evidence = []
            for evidence_group in line["evidence"]:
                all_evidence.extend(evidence_group)
            predicted_evidence.append(
                list(set((evidence[2], evidence[3])
                         for evidence in all_evidence)))

predictions = []
for ev, label in zip(predicted_evidence, predicted_labels):
    predictions.append({"predicted_evidence": ev, "predicted_label": label})

fever, acc, pr, rec, f1 = fever_score(predictions, actual)
print("FEVER Score: {}\n".format(fever))
print("Label Accuracy: {}\n".format(acc))
print("Evidence Precision: {}\n".format(pr))
print("Evidence Recall: {}\n".format(rec))
print("Evidence F1: {}\n".format(f1))