def evaluate(self, session, official_stdout=False):
    self.load_eval_data()

    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()

    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
        _, _, _, _, _, _, _, _, _, gold_starts, gold_ends, _ = tensorized_example
        feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
        candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores = session.run(self.predictions, feed_dict=feed_dict)
        predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
        coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)
        if example_num % 10 == 0:
            print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

    summary_dict = {}
    conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, official_stdout)
    average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
    summary_dict["Average F1 (conll)"] = average_f1
    print("Average F1 (conll): {:.2f}%".format(average_f1))

    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}%".format(f * 100))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))
    return util.make_summary(summary_dict), average_f1
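# The evaluate() variants in this collection all call a get_predicted_antecedents helper that
# is not included here. A minimal sketch of what such a helper typically looks like, mirroring
# the inline argmax decoding used in the ensemble snippets further down (column 0 of the score
# matrix is the dummy antecedent, hence the -1 shift); the function name and exact behavior are
# assumptions, not the original implementation:
import numpy as np

def get_predicted_antecedents_sketch(antecedents, antecedent_scores):
    # antecedents: (k, c) indices of candidate antecedents per top span
    # antecedent_scores: (k, c + 1) scores, with the dummy antecedent in column 0
    predicted_antecedents = []
    for i, index in enumerate(np.argmax(antecedent_scores, axis=1) - 1):
        if index < 0:
            predicted_antecedents.append(-1)  # no antecedent predicted for this span
        else:
            predicted_antecedents.append(antecedents[i, index])
    return predicted_antecedents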
def evaluate(self, model, tensor_examples, stored_info, step, official=False, conll_path=None, tb_writer=None):
    logger.info('Step %d: evaluating on %d samples...' % (step, len(tensor_examples)))
    model.to(self.device)
    evaluator = CorefEvaluator()
    doc_to_prediction = {}

    model.eval()
    for i, (doc_key, tensor_example) in enumerate(tensor_examples):
        gold_clusters = stored_info['gold'][doc_key]
        tensor_example = tensor_example[:7]  # Strip out gold
        example_gpu = [d.to(self.device) for d in tensor_example]
        with torch.no_grad():
            _, _, _, span_starts, span_ends, antecedent_idx, antecedent_scores = model(*example_gpu)
        span_starts, span_ends = span_starts.tolist(), span_ends.tolist()
        antecedent_idx, antecedent_scores = antecedent_idx.tolist(), antecedent_scores.tolist()
        predicted_clusters = model.update_evaluator(span_starts, span_ends, antecedent_idx, antecedent_scores, gold_clusters, evaluator)
        doc_to_prediction[doc_key] = predicted_clusters

    p, r, f = evaluator.get_prf()
    metrics = {'Eval_Avg_Precision': p * 100, 'Eval_Avg_Recall': r * 100, 'Eval_Avg_F1': f * 100}
    for name, score in metrics.items():
        logger.info('%s: %.2f' % (name, score))
        if tb_writer:
            tb_writer.add_scalar(name, score, step)

    if official:
        conll_results = conll.evaluate_conll(conll_path, doc_to_prediction, stored_info['subtoken_maps'])
        official_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
        logger.info('Official avg F1: %.4f' % official_f1)

    return f * 100, metrics
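# A hypothetical call site for the PyTorch-style evaluator above; the runner, config, and
# writer names here are assumptions and not part of the original code:
#
#   eval_f1, eval_metrics = runner.evaluate(model, tensor_examples, stored_info,
#                                           step=global_step, official=True,
#                                           conll_path=config['conll_eval_path'],
#                                           tb_writer=tb_writer)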
def evaluate(self, session, official_stdout=False):
    self.load_eval_data()

    def _k_to_tag(k):
        if k == -3:
            return "oracle"
        elif k == -2:
            return "actual"
        elif k == -1:
            return "exact"
        elif k == 0:
            return "threshold"
        else:
            return "{}%".format(k)

    mention_evaluators = {k: util.RetrievalEvaluator() for k in [-3, -2, -1, 0, 10, 15, 20, 25, 30, 40, 50]}

    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()

    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
        _, _, _, _, _, _, gold_starts, gold_ends, _, tag_labels, tag_seq, tag_loss_label = tensorized_example
        feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
        candidate_starts, candidate_ends, mention_scores, mention_starts, mention_ends, antecedents, antecedent_scores, tag_outputs, tag_seq = session.run(self.predictions, feed_dict=feed_dict)

        self.evaluate_mentions(candidate_starts, candidate_ends, mention_starts, mention_ends, mention_scores, gold_starts, gold_ends, example, mention_evaluators)

        predicted_antecedents = self.get_predicted_antecedents(antecedents, antecedent_scores)
        coref_predictions[example["doc_key"]] = self.evaluate_coref(mention_starts, mention_ends, predicted_antecedents, example["clusters"], coref_evaluator)

        if example_num % 10 == 0:
            print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))
            # print(tag_outputs)
            # print(tag_seq)

    summary_dict = {}
    for k, evaluator in sorted(mention_evaluators.items(), key=operator.itemgetter(0)):
        tags = ["{} @ {}".format(t, _k_to_tag(k)) for t in ("R", "P", "F")]
        results_to_print = []
        for t, v in zip(tags, evaluator.metrics()):
            results_to_print.append("{:<10}: {:.2f}".format(t, v))
            summary_dict[t] = v
        print(", ".join(results_to_print))

    conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, official_stdout)
    average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
    summary_dict["Average F1 (conll)"] = average_f1
    print("Average F1 (conll): {:.2f}%".format(average_f1))

    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}%".format(f * 100))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))
    return util.make_summary(summary_dict), average_f1
def evaluate(self, session, official_stdout=False):
    self.load_eval_data()

    def _k_to_tag(k):
        if k == -3:
            return "oracle"
        elif k == -2:
            return "actual"
        elif k == -1:
            return "exact"
        elif k == 0:
            return "threshold"
        else:
            return "{}%".format(k)

    mention_evaluators = {k: util.RetrievalEvaluator() for k in [-3, -2, -1, 0, 10, 15, 20, 25, 30, 40, 50]}

    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()

    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
        _, _, _, _, _, _, gold_starts, gold_ends, _ = tensorized_example
        feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
        candidate_starts, candidate_ends, mention_scores, mention_starts, mention_ends, antecedents, antecedent_scores = session.run(self.predictions, feed_dict=feed_dict)

        self.evaluate_mentions(candidate_starts, candidate_ends, mention_starts, mention_ends, mention_scores, gold_starts, gold_ends, example, mention_evaluators)

        predicted_antecedents = self.get_predicted_antecedents(antecedents, antecedent_scores)
        coref_predictions[example["doc_key"]] = self.evaluate_coref(mention_starts, mention_ends, predicted_antecedents, example["clusters"], coref_evaluator)

        if example_num % 10 == 0:
            print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

    summary_dict = {}
    for k, evaluator in sorted(mention_evaluators.items(), key=operator.itemgetter(0)):
        tags = ["{} @ {}".format(t, _k_to_tag(k)) for t in ("R", "P", "F")]
        results_to_print = []
        for t, v in zip(tags, evaluator.metrics()):
            results_to_print.append("{:<10}: {:.2f}".format(t, v))
            summary_dict[t] = v
        print(", ".join(results_to_print))

    conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, official_stdout)
    average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
    summary_dict["Average F1 (conll)"] = average_f1
    print("Average F1 (conll): {:.2f}%".format(average_f1))

    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}%".format(f * 100))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))
    return util.make_summary(summary_dict), average_f1
def conll_evaluate(l0_inputs, alphas, conll_eval_path, all_top_antecedent_scores):
    print("Compiling clusters and evaluators for conll suite")
    if isinstance(alphas, float) or isinstance(alphas, int):
        alphas = [alphas]
    coref_predictions = [{} for _ in alphas]
    coref_evaluators = [metrics.CorefEvaluator() for _ in alphas]
    subtoken_maps = {}

    with open(l0_inputs, "rb") as f:
        data_dicts = np.load(f, allow_pickle=True).item().get("data_dicts")

    for example_num, data_dict in enumerate(tqdm(data_dicts)):
        example = data_dict["example"]
        subtoken_maps[example["doc_key"]] = example["subtoken_map"]
        top_span_starts = data_dict["top_span_starts"]
        top_span_ends = data_dict["top_span_ends"]
        top_antecedents = data_dict["top_antecedents"]

        for i in range(len(alphas)):
            top_antecedent_scores = all_top_antecedent_scores[example["doc_key"]][i]
            predicted_antecedents = get_predicted_antecedents(top_antecedents, top_antecedent_scores)
            coref_predictions[i][example["doc_key"]] = evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluators[i])

    summary_dict = DD(list)
    for i in range(len(alphas)):
        print("\n*****************************")
        print("******* alpha = %f *******" % alphas[i])
        summary_dict["alpha"].append(alphas[i])

        conll_results = conll.evaluate_conll(conll_eval_path, coref_predictions[i], subtoken_maps, official_stdout=True)
        average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
        summary_dict["Average F1 (conll)"].append(average_f1)
        print("Average F1 (conll): {:.2f}%".format(average_f1))

        p, r, f = coref_evaluators[i].get_prf()
        summary_dict["Average F1 (py)"].append(f)
        print("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(subtoken_maps.keys())))
        summary_dict["Average precision (py)"].append(p)
        print("Average precision (py): {:.2f}%".format(p * 100))
        summary_dict["Average recall (py)"].append(r)
        print("Average recall (py): {:.2f}%".format(r * 100))

    return summary_dict
def evaluate(self, session, official_stdout=False, eval_mode=False):
    self.load_eval_data()

    coref_predictions = {}

    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
        _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example
        feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
        candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends, \
            top_antecedents, top_antecedent_scores = session.run(self.predictions, feed_dict=feed_dict)
        """
        candidate_starts: (num_words, max_span_width) start indices of all candidate spans
        candidate_ends: (num_words, max_span_width) end indices of all candidate spans
        candidate_mention_scores: (num_candidates,) score of each candidate span
        top_span_starts: (k,) start indices of the candidates kept after mention pruning
        top_span_ends: (k,) end indices of the candidates kept after mention pruning
        top_antecedents: (k, c) index of each candidate antecedent kept after coarse pruning
        top_antecedent_scores: (k, c) score of each candidate antecedent kept after coarse pruning
        """
        predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
        coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"])
        if (example_num + 1) % 100 == 0:
            print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

    summary_dict = {}
    if eval_mode:
        # On the test set, additionally re-score with the official CoNLL scorer.
        conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, self.subtoken_maps, official_stdout)
        average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
        summary_dict["Average F1 (conll)"] = average_f1
        print("Average F1 (conll): {:.2f}%".format(average_f1))

    p, r, f = self.coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(self.eval_data)))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))
    return util.make_summary(summary_dict), f
def evaluate(self, model, device, official_stdout=False, keys=None, eval_mode=False):
    self.load_eval_data()

    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    doc_keys = []

    with torch.no_grad():
        for example_num, example in enumerate(tqdm(self.eval_data, desc="Eval_Examples")):
            tensorized_example = model.tensorize_example(example, is_training=False)
            input_ids = torch.from_numpy(tensorized_example[0]).long().to(device)
            input_mask = torch.from_numpy(tensorized_example[1]).long().to(device)
            text_len = torch.from_numpy(tensorized_example[2]).long().to(device)
            speaker_ids = torch.from_numpy(tensorized_example[3]).long().to(device)
            genre = torch.tensor(tensorized_example[4]).long().to(device)
            is_training = tensorized_example[5]
            gold_starts = torch.from_numpy(tensorized_example[6]).long().to(device)
            gold_ends = torch.from_numpy(tensorized_example[7]).long().to(device)
            cluster_ids = torch.from_numpy(tensorized_example[8]).long().to(device)
            sentence_map = torch.Tensor(tensorized_example[9]).long().to(device)

            if keys is not None and example['doc_key'] not in keys:
                continue
            doc_keys.append(example['doc_key'])

            (candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends,
             top_antecedents, top_antecedent_scores), loss = model(input_ids, input_mask, text_len, speaker_ids,
                                                                   genre, is_training, gold_starts, gold_ends,
                                                                   cluster_ids, sentence_map)

            predicted_antecedents = self.get_predicted_antecedents(top_antecedents.cpu(), top_antecedent_scores.cpu())
            coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends,
                                                                        predicted_antecedents, example["clusters"],
                                                                        coref_evaluator)

    summary_dict = {}
    if eval_mode:
        conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, self.subtoken_maps, official_stdout)
        average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
        summary_dict["Average F1 (conll)"] = average_f1
        print("Average F1 (conll): {:.2f}%".format(average_f1))

    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(doc_keys)))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))
    return summary_dict, f
def evaluate(self, session, official_stdout=False):
    # self.load_eval_data()
    with open(self.config["inv_mapping"], 'rb') as handle:
        inv_mapping = pickle.load(handle)
    with open(self.config["eval_path"], 'rb') as handle:
        test = pickle.load(handle)

    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()

    for i in range(len(test)):
        if i == 191 or i == 217 or i == 225:
            continue
        example = test[i]
        file_name = example["doc_key"]
        inv_map = inv_mapping[file_name]
        tensorized_example = self.tensorize_example(example, is_training=False)
        _, _, _, _, _, _, gold_starts, gold_ends, _ = tensorized_example
        feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
        _, _, _, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores = session.run(self.predictions, feed_dict=feed_dict)
        top_span_starts = inv_map[top_span_starts]
        top_span_ends = inv_map[top_span_ends]
        predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
        coref_predictions[file_name] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)
        if i % 10 == 0:
            print("Evaluated {}/{} examples.".format(i + 1, len(test)))

    summary_dict = {}
    conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, official_stdout)
    average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
    summary_dict["Average F1 (conll)"] = average_f1
    print("Average F1 (conll): {:.2f}%".format(average_f1))

    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}%".format(f * 100))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))
def evaluate(self, session, official_stdout=False):
    self.load_eval_data()

    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    avg_loss = 0.0

    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
        _, _, _, _, _, _, _, _, _, gold_starts, gold_ends, _, _, _, _ = tensorized_example
        feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
        predictions, loss = session.run([self.predictions, self.loss], feed_dict=feed_dict)
        candidate_starts, candidate_ends, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores, _, _ = predictions
        predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
        coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)
        if example_num % 20 == 0:
            print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))
        avg_loss += loss

    avg_loss = avg_loss / len(self.eval_data)

    summary_dict = {}
    conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, official_stdout)
    cluster_result = {'prediction': coref_predictions, 'gold': official_stdout}
    with open('evaluate_result.pickle', 'wb') as handle:
        pickle.dump(cluster_result, handle, protocol=pickle.HIGHEST_PROTOCOL)

    average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
    summary_dict["Average F1 (conll)"] = average_f1
    print("Average F1 (conll): {:.2f}%".format(average_f1))

    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}%".format(f * 100))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))
    summary_dict["Validation loss"] = avg_loss
    print("Validation loss: {:.3f}".format(avg_loss))
    return util.make_summary(summary_dict), average_f1, avg_loss
def evaluate(model, eval_dataloader, data_path, conll_path, prediction_path, device):
    with open(data_path) as f:
        examples = [json.loads(jsonline) for jsonline in f.readlines()]

    model.eval()
    coref_predictions = {}
    subtoken_maps = {}
    coref_evaluator = metrics.CorefEvaluator(singleton=False)
    predicted_antecedents = []
    predicted_clusters = []

    with torch.no_grad():
        for i, (batch, example) in enumerate(zip(eval_dataloader, examples)):
            subtoken_maps[example['doc_key']] = example["subtoken_map"]
            doc_key = batch[0]
            assert doc_key == example["doc_key"], (doc_key, example["doc_key"])
            input_ids, input_mask, text_len, speaker_ids, genre, gold_starts, gold_ends, cluster_ids, sentence_map, \
                subtoken_map = [b.to(device) for b in batch[1:]]
            predictions, loss = model(input_ids, input_mask, text_len, speaker_ids, genre,
                                      gold_starts, gold_ends, cluster_ids, sentence_map, subtoken_map)
            (top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores, candidate_starts,
             candidate_ends, top_span_cluster_ids, top_span_mention_scores, candidate_mention_scores) = \
                [p.detach().cpu() for p in predictions]
            antecedents = get_predicted_antecedents(top_antecedents.numpy(), top_antecedent_scores.numpy())
            clusters = evaluate_coref(top_span_starts.numpy(), top_span_ends.numpy(), antecedents,
                                      example["clusters"], coref_evaluator, top_span_mention_scores)
            coref_predictions[example["doc_key"]] = clusters
            predicted_antecedents.append(antecedents)
            predicted_clusters.append(clusters)

    coref_p, coref_r, coref_f = coref_evaluator.get_prf()
    conll_results = conll.evaluate_conll(conll_path, prediction_path, coref_predictions, subtoken_maps, official_stdout=True)
    return coref_p, coref_r, coref_f, conll_results
def evaluate(self, session, global_step=None, official_stdout=False, keys=None, eval_mode=False, to_npy=None, from_npy=None, rsa_model=None):
    assert not (to_npy is not None and from_npy is not None), "cannot set both to_npy and from_npy at the same time!"
    self.load_eval_data()

    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    losses = []
    doc_keys = []
    num_evaluated = 0
    total_time = 0

    if to_npy:
        data_dicts = []
    if from_npy:
        with open(from_npy, "rb") as f:
            from_npy_dict = np.load(f)
            data_dicts = from_npy_dict.item().get("data_dicts")

    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
        _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example
        if from_npy is None:
            feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
            # if tensorized_example[0].shape[0] <= 9:
            if keys is not None and example['doc_key'] not in keys:
                # print('Skipping...', example['doc_key'], tensorized_example[0].shape)
                continue
            doc_keys.append(example['doc_key'])
            loss, (candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores) = \
                session.run([self.loss, self.predictions], feed_dict=feed_dict)
        else:
            data_dict = data_dicts[example_num]
            example = data_dict["example"]
            if keys is not None and example['doc_key'] not in keys:
                # print('Skipping...', example['doc_key'], tensorized_example[0].shape)
                continue
            doc_keys.append(example['doc_key'])
            tensorized_example = data_dict["tensorized_example"]
            loss = data_dict["loss"]
            top_span_starts = data_dict["top_span_starts"]
            top_span_ends = data_dict["top_span_ends"]
            top_antecedents = data_dict["top_antecedents"]
            top_antecedent_scores = data_dict["top_antecedent_scores"]

        # losses.append(session.run(self.loss, feed_dict=feed_dict))
        losses.append(loss)

        if rsa_model is not None:
            print("Running l1 for sentence %d" % example_num)
            start_time = time.time()
            top_antecedent_scores = rsa_model.l1(example, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores)
            duration = time.time() - start_time
            print("Finished sentence %d, took %.2f s" % (example_num, duration))
            total_time += duration
            num_evaluated += 1

        if to_npy:
            data_dict = {
                "example_num": example_num,
                "tensorized_example": tensorized_example,
                "example": example,
                "top_span_starts": top_span_starts,
                "top_span_ends": top_span_ends,
                "top_antecedents": top_antecedents,
                "top_antecedent_scores": top_antecedent_scores,
                "loss": loss,
            }
            data_dicts.append(data_dict)

        predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
        coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)
        if example_num % 10 == 0:
            print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

    summary_dict = {}
    if to_npy:
        dict_to_npy = {"data_dicts": data_dicts}

    if eval_mode:
        conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, self.subtoken_maps, official_stdout)
        average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
        summary_dict["Average F1 (conll)"] = average_f1
        if to_npy:
            dict_to_npy["Average F1 (conll)"] = average_f1
        print("Average F1 (conll): {:.2f}%".format(average_f1))

    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(doc_keys)))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))

    if to_npy:
        dict_to_npy["Average F1 (py)"] = f
        dict_to_npy["Average precision (py)"] = p
        dict_to_npy["Average recall (py)"] = r
        with open(to_npy, "wb") as f_to_npy:
            np.save(f_to_npy, dict_to_npy)

    if rsa_model:
        print("Ran rsa on %d sentences, avg time per sentence %.2f s" % (num_evaluated, total_time / num_evaluated))

    return util.make_summary(summary_dict), f
def evaluate(self, session, global_step=None, official_stdout=False, keys=None, eval_mode=False):
    self.load_eval_data()

    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    losses = []
    doc_keys = []
    num_evaluated = 0

    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
        _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example
        feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
        # if tensorized_example[0].shape[0] <= 9:
        #     if keys is not None and example['doc_key'] in keys:
        #         print('Skipping...', example['doc_key'], tensorized_example[0].shape)
        #         continue
        doc_keys.append(example['doc_key'])
        loss, (candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores) = session.run([self.loss, self.predictions], feed_dict=feed_dict)
        # losses.append(session.run(self.loss, feed_dict=feed_dict))
        losses.append(loss)
        predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
        coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)
        if example_num % 10 == 0:
            print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

    summary_dict = {}
    # with open('doc_keys_512.txt', 'w') as f:
    #     for key in doc_keys:
    #         f.write(key + '\n')
    if eval_mode:
        conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, self.subtoken_maps, official_stdout)
        average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
        summary_dict["Average F1 (conll)"] = average_f1
        print("Average F1 (conll): {:.2f}%".format(average_f1))

    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(doc_keys)))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))
    return util.make_summary(summary_dict), f
                top_antecedents, top_antecedent_scores)
            example["predicted_clusters"], _ = model.get_predicted_clusters(top_span_starts, top_span_ends, predicted_antecedents)
            coref_predictions[example["doc_key"]] = model.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)
            output_file.write(json.dumps(example))
            output_file.write("\n")
            if example_num % 100 == 0:
                print("Decoded {} examples.".format(example_num + 1))

    summary_dict = {}
    conll_results = conll.evaluate_conll(model.config["conll_eval_path"], coref_predictions, False)
    average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
    summary_dict["Average F1 (conll)"] = average_f1
    print("Average F1 (conll): {:.2f}%".format(average_f1))

    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}%".format(f * 100))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))
def eval_coref(config):
    """
    Validate the coreference resolution model.
    :param config: configuration parameters
    :return: None
    """
    model = CorefModel.from_pretrained(config["model_save_path"], coref_task_config=config)
    model.to(device)

    examples = model.get_eval_example()

    logger.info("********** Running Eval ****************")
    logger.info("  Num dev examples = %d", len(examples))

    model.eval()
    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    doc_keys = []
    keys = None

    with torch.no_grad():
        for example_num, example in enumerate(tqdm(examples, desc="Eval_Examples")):
            tensorized_example = model.tensorize_example(example, is_training=False)
            input_ids = torch.from_numpy(tensorized_example[0]).long().to(device)
            input_mask = torch.from_numpy(tensorized_example[1]).long().to(device)
            text_len = torch.from_numpy(tensorized_example[2]).long().to(device)
            speaker_ids = torch.from_numpy(tensorized_example[3]).long().to(device)
            genre = torch.tensor(tensorized_example[4]).long().to(device)
            is_training = tensorized_example[5]
            gold_starts = torch.from_numpy(tensorized_example[6]).long().to(device)
            gold_ends = torch.from_numpy(tensorized_example[7]).long().to(device)
            cluster_ids = torch.from_numpy(tensorized_example[8]).long().to(device)
            sentence_map = torch.Tensor(tensorized_example[9]).long().to(device)

            if keys is not None and example['doc_key'] not in keys:
                continue
            doc_keys.append(example['doc_key'])

            (candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends,
             top_antecedents, top_antecedent_scores), loss = model(input_ids, input_mask, text_len, speaker_ids,
                                                                   genre, is_training, gold_starts, gold_ends,
                                                                   cluster_ids, sentence_map)

            predicted_antecedents = model.get_predicted_antecedents(top_antecedents.cpu(), top_antecedent_scores.cpu())
            coref_predictions[example["doc_key"]] = model.evaluate_coref(top_span_starts, top_span_ends,
                                                                         predicted_antecedents, example["clusters"],
                                                                         coref_evaluator)

    official_stdout = True
    eval_mode = True
    summary_dict = {}
    if eval_mode:
        conll_results = conll.evaluate_conll(config["conll_eval_path"], coref_predictions, model.subtoken_maps, official_stdout)
        average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
        summary_dict["Average F1 (conll)"] = average_f1
        print("Average F1 (conll): {:.2f}%".format(average_f1))

    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(doc_keys)))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))
def evaluate(self, session, official_stdout=False):
    # self.load_eval_data()
    with open(self.config["inv_mapping"], 'rb') as handle:
        inv_mapping = pickle.load(handle)
    with open(self.config["eval_path"], 'rb') as handle:
        test = pickle.load(handle)

    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    swag_predictions = []
    swag_labels = []

    for i in range(len(test)):
        if i == 191 or i == 217 or i == 225:
            continue
        example = test[i]
        file_name = example["doc_key"]
        inv_map = inv_mapping[file_name]
        tensorized_example = self.tensorize_example(example, i, is_training=False)
        feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
        lee_predictions, swag_pred = session.run([self.predictions2, self.swag_predictions], feed_dict=feed_dict)
        _, _, _, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores = lee_predictions
        top_span_starts = inv_map[top_span_starts]
        top_span_ends = inv_map[top_span_ends]
        predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
        coref_predictions[file_name] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)

        # SWAG evaluation
        swag_label = tensorized_example[-1]
        swag_predictions.append(swag_pred[0])
        swag_labels.append(swag_label[0])

        if i % 10 == 0:
            print("Evaluated {}/{} examples.".format(i + 1, len(test)))

    # Compute the evaluation metrics from the collected predictions.
    summary_dict = {}
    try:
        conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, official_stdout)
        average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
        summary_dict["Average F1 (conll)"] = average_f1
        print("Average F1 (conll): {:.2f}%".format(average_f1))
    except Exception:
        print("unstable results")
        average_f1 = 0

    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}%".format(f * 100))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))

    print("Now evaluating SWAG")
    swag_accuracy = self.swag_evaluation(swag_predictions, swag_labels)
    print("Average SWAG accuracy is: {:.2f}%".format(swag_accuracy * 100))
    return util.make_summary(summary_dict), average_f1, swag_accuracy
def evaluate(self, model, prefix="", tb_writer=None, global_step=None, official=False):
    eval_dataset = get_dataset(self.args, tokenizer=self.tokenizer, evaluate=True)

    if self.eval_output_dir and not os.path.exists(self.eval_output_dir) and self.args.local_rank in [-1, 0]:
        os.makedirs(self.eval_output_dir)

    # Note that DistributedSampler samples randomly
    # eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
    eval_dataloader = BucketBatchSampler(eval_dataset, max_total_seq_len=self.args.max_total_seq_len, batch_size_1=True)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Examples number: %d", len(eval_dataset))
    model.eval()

    post_pruning_mention_evaluator = MentionEvaluator()
    mention_evaluator = MentionEvaluator()
    coref_evaluator = CorefEvaluator()
    losses = defaultdict(list)
    doc_to_prediction = {}
    doc_to_subtoken_map = {}

    for (doc_key, subtoken_maps), batch in eval_dataloader:
        batch = tuple(tensor.to(self.args.device) for tensor in batch)
        input_ids, attention_mask, start_entity_mentions_indices, end_entity_mentions_indices, start_antecedents_indices, end_antecedents_indices, gold_clusters = batch

        with torch.no_grad():
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            start_entity_mention_labels=start_entity_mentions_indices,
                            end_entity_mention_labels=end_entity_mentions_indices,
                            start_antecedent_labels=start_antecedents_indices,
                            end_antecedent_labels=end_antecedents_indices,
                            gold_clusters=gold_clusters,
                            return_all_outputs=True)

        loss_dict = outputs[-1]
        if self.args.n_gpu > 1:
            loss_dict = {key: val.mean() for key, val in loss_dict.items()}
        for key, val in loss_dict.items():
            losses[key].append(val.item())

        outputs = outputs[1:-1]
        batch_np = tuple(tensor.cpu().numpy() for tensor in batch)
        outputs_np = tuple(tensor.cpu().numpy() for tensor in outputs)
        for output in zip(*(batch_np + outputs_np)):
            gold_clusters = output[6]
            gold_clusters = extract_clusters(gold_clusters)
            mention_to_gold_clusters = extract_mentions_to_predicted_clusters_from_clusters(gold_clusters)
            gold_mentions = list(mention_to_gold_clusters.keys())

            starts, end_offsets, coref_logits, mention_logits = output[-4:]
            max_antecedents = np.argmax(coref_logits, axis=1).tolist()
            mention_to_antecedent = {((int(start), int(end)), (int(starts[max_antecedent]), int(end_offsets[max_antecedent])))
                                     for start, end, max_antecedent in zip(starts, end_offsets, max_antecedents)
                                     if max_antecedent < len(starts)}
            predicted_clusters, _ = extract_clusters_for_decode(mention_to_antecedent)
            candidate_mentions = list(zip(starts, end_offsets))

            mention_to_predicted_clusters = extract_mentions_to_predicted_clusters_from_clusters(predicted_clusters)
            predicted_mentions = list(mention_to_predicted_clusters.keys())
            post_pruning_mention_evaluator.update(candidate_mentions, gold_mentions)
            mention_evaluator.update(predicted_mentions, gold_mentions)
            coref_evaluator.update(predicted_clusters, gold_clusters, mention_to_predicted_clusters, mention_to_gold_clusters)
            doc_to_prediction[doc_key] = predicted_clusters
            doc_to_subtoken_map[doc_key] = subtoken_maps

    post_pruning_mention_precision, post_pruning_mentions_recall, post_pruning_mention_f1 = post_pruning_mention_evaluator.get_prf()
    mention_precision, mentions_recall, mention_f1 = mention_evaluator.get_prf()
    prec, rec, f1 = coref_evaluator.get_prf()

    results = [(key, sum(val) / len(val)) for key, val in losses.items()]
    results += [
        ("post pruning mention precision", post_pruning_mention_precision),
        ("post pruning mention recall", post_pruning_mentions_recall),
        ("post pruning mention f1", post_pruning_mention_f1),
        ("mention precision", mention_precision),
        ("mention recall", mentions_recall),
        ("mention f1", mention_f1),
        ("precision", prec),
        ("recall", rec),
        ("f1", f1)
    ]

    logger.info("***** Eval results {} *****".format(prefix))
    for key, values in results:
        if isinstance(values, float):
            logger.info(f"  {key} = {values:.3f}")
        else:
            logger.info(f"  {key} = {values}")
        if tb_writer is not None and global_step is not None:
            tb_writer.add_scalar(key, values, global_step)

    if self.eval_output_dir:
        output_eval_file = os.path.join(self.eval_output_dir, "eval_results.txt")
        with open(output_eval_file, "a") as writer:
            if prefix:
                writer.write(f'\n{prefix}:\n')
            for key, values in results:
                if isinstance(values, float):
                    writer.write(f"{key} = {values:.3f}\n")
                else:
                    writer.write(f"{key} = {values}\n")

    results = OrderedDict(results)
    results["experiment_name"] = self.args.experiment_name
    results["data"] = prefix
    with open(os.path.join(self.args.output_dir, "results.jsonl"), "a+") as f:
        f.write(json.dumps(results) + '\n')

    if official:
        with open(os.path.join(self.args.output_dir, "preds.jsonl"), "w") as f:
            f.write(json.dumps(doc_to_prediction) + '\n')
            f.write(json.dumps(doc_to_subtoken_map) + '\n')

        if self.args.conll_path_for_eval is not None:
            conll_results = evaluate_conll(self.args.conll_path_for_eval, doc_to_prediction, doc_to_subtoken_map)
            official_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
            logger.info('Official avg F1: %.4f' % official_f1)

    return results
def evaluate(self, session, global_step=None, official_stdout=False, keys=None, eval_mode=False):
    self.load_eval_data()

    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    losses = []
    doc_keys = []
    num_evaluated = 0

    ##################################################################################################
    ########## WE FURTHER REPORT THE RESULTS SEPARATELY FOR P-P, NP-NP, AND P-NP PAIRS ###############
    ##################################################################################################
    coref_predictions_pp = {}
    coref_predictions_pnp = {}
    coref_predictions_npnp = {}

    # span type
    coref_evaluator_pp = PairEvaluator()
    coref_evaluator_pnp = PairEvaluator()
    coref_evaluator_npnp = PairEvaluator()
    coref_evaluator_all = PairEvaluator()
    num_coref_pp = 0
    num_coref_pnp = 0
    num_coref_npnp = 0
    num_coref_all = 0

    # span frequency
    coref_evaluator_freq = PairEvaluator()
    coref_evaluator_rare = PairEvaluator()
    num_coref_freq = 0
    num_coref_rare = 0

    # pronoun type (demonstrative, possessive, third-person)
    coref_evaluators_type = dict()
    coref_evaluators_type["demo"], coref_evaluators_type["pos"], coref_evaluators_type["third"] = PairEvaluator(), PairEvaluator(), PairEvaluator()
    nums_coref_type = dict()
    nums_coref_type["demo"], nums_coref_type["pos"], nums_coref_type["third"] = 0, 0, 0

    count = 0
    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
        try:
            # count += 1
            # if count == 10:
            #     break
            _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example
            feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
            # if tensorized_example[0].shape[0] <= 9:
            if keys is not None and example['doc_key'] not in keys:
                # print('Skipping...', example['doc_key'], tensorized_example[0].shape)
                continue
            doc_keys.append(example['doc_key'])
            loss, (candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores) = session.run([self.loss, self.predictions], feed_dict=feed_dict)
            # losses.append(session.run(self.loss, feed_dict=feed_dict))
            losses.append(loss)
            predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
            coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)
            if example_num % 10 == 0:
                print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

            ######################################################################################
            # Evaluate on three different settings: NP-NP, NP-P, P-P by using a different cluster
            ######################################################################################
            # Span type
            flatten_sentences = util.flatten(example["sentences"])
            gold_pp_pairs, gold_pnp_pairs, gold_npnp_pairs, num_pp_pairs, num_pnp_pairs, num_npnp_pairs, num_relation = self.cluster_to_pairs(example["clusters"], flatten_sentences)
            # predicted_clusters = coref_predictions[example["doc_key"]]
            pred_pp_pairs, pred_pnp_pairs, pred_npnp_pairs, _, _, _, _ = self.cluster_to_pairs(coref_predictions[example["doc_key"]], flatten_sentences)

            # Span frequency
            gold_freq_pnp_pairs, gold_rare_pnp_pairs, num_freq_pairs, num_rare_pairs = self.cluster_to_pair_frequent(example["clusters"], flatten_sentences)
            pred_freq_pnp_pairs, pred_rare_pnp_pairs, _, _ = self.cluster_to_pair_frequent(coref_predictions[example["doc_key"]], flatten_sentences)

            # Pronoun type: demo, pos, third
            gold_type_pairs, gold_type_nums = self.cluster_to_pair_detailed_pronoun(example["clusters"], flatten_sentences)
            pred_type_pairs, pred_type_nums = self.cluster_to_pair_detailed_pronoun(coref_predictions[example["doc_key"]], flatten_sentences)
            for pron_type in ["demo", "pos", "third"]:
                coref_evaluators_type[pron_type].update(gold_type_pairs[pron_type], pred_type_pairs[pron_type])
                nums_coref_type[pron_type] += gold_type_nums[pron_type]

            all_gold = gold_pp_pairs + gold_pnp_pairs + gold_npnp_pairs
            all_pred = pred_pp_pairs + pred_pnp_pairs + pred_npnp_pairs

            coref_evaluator_pp.update(pred_pp_pairs, gold_pp_pairs)
            coref_evaluator_pnp.update(pred_pnp_pairs, gold_pnp_pairs)
            coref_evaluator_npnp.update(pred_npnp_pairs, gold_npnp_pairs)
            coref_evaluator_all.update(all_pred, all_gold)
            coref_evaluator_freq.update(pred_freq_pnp_pairs, gold_freq_pnp_pairs)
            coref_evaluator_rare.update(pred_rare_pnp_pairs, gold_rare_pnp_pairs)

            num_coref_pp += num_pp_pairs
            num_coref_pnp += num_pnp_pairs
            num_coref_npnp += num_npnp_pairs
            num_coref_all = num_coref_all + num_pp_pairs + num_pnp_pairs + num_npnp_pairs
            num_coref_freq += num_freq_pairs
            num_coref_rare += num_rare_pairs
        except Exception:
            # Skip examples that fail during evaluation.
            pass

    summary_dict = {}
    self.print_prf(coref_evaluator_pp, summary_dict, doc_keys, "PP", num_coref_pp)
    self.print_prf(coref_evaluator_pnp, summary_dict, doc_keys, "PNP", num_coref_pnp)
    self.print_prf(coref_evaluator_npnp, summary_dict, doc_keys, "NPNP", num_coref_npnp)
    self.print_prf(coref_evaluator_freq, summary_dict, doc_keys, "FREQ", num_coref_freq)
    self.print_prf(coref_evaluator_rare, summary_dict, doc_keys, "RARE", num_coref_rare)
    for pron_type in ["demo", "pos", "third"]:
        self.print_prf(coref_evaluators_type[pron_type], summary_dict, doc_keys, pron_type, nums_coref_type[pron_type])
    self.print_prf(coref_evaluator_all, summary_dict, doc_keys, "ALL_PAIRS", num_coref_all)

    #######################################################################################
    # summary_dict = {}
    print("The evaluation results for all clusters")
    print("The number of pairs is " + str(num_coref_all))
    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(doc_keys)))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))

    if eval_mode:
        conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, self.subtoken_maps, official_stdout)
        average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
        summary_dict["Average F1 (conll)"] = average_f1
        print("Average F1 (conll): {:.2f}%".format(average_f1))

    return util.make_summary(summary_dict), f
for example_num, (tensorized_example, example) in enumerate(model.eval_data):
    doc_key = example["doc_key"]
    mention_starts = mention_start_dict[doc_key]
    mention_ends = mention_end_dict[doc_key]
    antecedents = antecedents_dict[doc_key]
    antecedent_scores = mean_antecedent_scores[doc_key]

    predicted_antecedents = []
    for i, index in enumerate(np.argmax(antecedent_scores, axis=1) - 1):
        if index < 0:
            predicted_antecedents.append(-1)
        else:
            predicted_antecedents.append(antecedents[i, index])

    merged_predictions[doc_key] = model.evaluate_coref(mention_starts, mention_ends, predicted_antecedents, example["clusters"], coref_evaluator)

conll_results = conll.evaluate_conll(main_config["conll_eval_path"], merged_predictions, official_stdout=True)
average_f = sum(results["f"] for results in conll_results.values()) / len(conll_results)
average_r = sum(results["r"] for results in conll_results.values()) / len(conll_results)
average_p = sum(results["p"] for results in conll_results.values()) / len(conll_results)
print("Merged average F1 (conll): {:.2f}%".format(average_f))
print("Merged average Recall (conll): {:.2f}%".format(average_r))
print("Merged average Precision (conll): {:.2f}%".format(average_p))
        all_antecedent_scores[doc_key].append(antecedent_scores)
        if example_num % 10 == 0:
            print("Computed {}/{} examples.".format(example_num + 1, len(model.eval_data)))

mean_antecedent_scores = {doc_key: np.mean(s, 0) for doc_key, s in all_antecedent_scores.items()}

merged_predictions = {}
coref_evaluator = metrics.CorefEvaluator()

for example_num, (tensorized_example, example) in enumerate(model.eval_data):
    doc_key = example["doc_key"]
    mention_starts = mention_start_dict[doc_key]
    mention_ends = mention_end_dict[doc_key]
    antecedents = antecedents_dict[doc_key]
    antecedent_scores = mean_antecedent_scores[doc_key]

    predicted_antecedents = []
    for i, index in enumerate(np.argmax(antecedent_scores, axis=1) - 1):
        if index < 0:
            predicted_antecedents.append(-1)
        else:
            predicted_antecedents.append(antecedents[i, index])

    merged_predictions[doc_key] = model.evaluate_coref(mention_starts, mention_ends, predicted_antecedents, example["clusters"], coref_evaluator)

conll_results = conll.evaluate_conll(main_config["conll_eval_path"], merged_predictions, official_stdout=True)
average_f = sum(results["f"] for results in conll_results.values()) / len(conll_results)
average_r = sum(results["r"] for results in conll_results.values()) / len(conll_results)
average_p = sum(results["p"] for results in conll_results.values()) / len(conll_results)
print("Merged average F1 (conll): {:.2f}%".format(average_f))
print("Merged average Recall (conll): {:.2f}%".format(average_r))
print("Merged average Precision (conll): {:.2f}%".format(average_p))
def evaluate(self, session, global_step=None, official_stdout=False, keys=None, eval_mode=False):
    self.load_eval_data()

    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    losses = []
    doc_keys = []
    num_evaluated = 0

    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
        _, _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example
        feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
        # if tensorized_example[0].shape[0] <= 9:
        if keys is not None and example['doc_key'] not in keys:
            # print('Skipping...', example['doc_key'], tensorized_example[0].shape)
            continue
        doc_keys.append(example['doc_key'])
        loss, (candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores) = session.run([self.loss, self.predictions], feed_dict=feed_dict)
        # losses.append(session.run(self.loss, feed_dict=feed_dict))
        losses.append(loss)
        predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
        coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)
        if example_num % 10 == 0:
            print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

        ##### print mentions of both example["clusters"] and coref_predictions[example["doc_key"]] below ########
        print_clusters = False
        if print_clusters:
            comb_text = [word for sentence in example['sentences'] for word in sentence]
            print('#### Example Clusters: ####')
            for cluster in example['clusters']:
                mapped = []
                for mention in cluster:
                    mapped.append(self.convert_mention(mention, comb_text, example))
                print(mapped, end=",\n")
            print('#### Predicted Clusters: ####')
            for cluster in coref_predictions[example["doc_key"]]:
                mapped = []
                for mention in cluster:
                    mapped.append(self.convert_mention(mention, comb_text, example))
                print(mapped, end=",\n")

    summary_dict = {}
    if eval_mode:
        conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, self.subtoken_maps, official_stdout)
        average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
        summary_dict["Average F1 (conll)"] = average_f1
        print("Average F1 (conll): {:.2f}%".format(average_f1))

    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(doc_keys)))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))
    return util_xlnet.make_summary(summary_dict), f
def evaluate(self, session, global_step=None, official_stdout=False, keys=None, eval_mode=False, visualize=False):
    self.load_eval_data()

    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    losses = []
    doc_keys = []
    num_evaluated = 0
    visualize_list = []

    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
        _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example
        feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
        # if tensorized_example[0].shape[0] <= 9:
        if keys is not None and example['doc_key'] not in keys:
            # print('Skipping...', example['doc_key'], tensorized_example[0].shape)
            continue
        doc_keys.append(example['doc_key'])
        loss, (candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores) = session.run([self.loss, self.predictions], feed_dict=feed_dict)
        # losses.append(session.run(self.loss, feed_dict=feed_dict))
        losses.append(loss)
        predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
        predicted_clusters = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)
        coref_predictions[example["doc_key"]] = predicted_clusters
        # if example_num % 10 == 0:
        #     print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

        # Visualize antecedents
        if visualize:
            print('*****New Doc*****')
            subtokens = util.flatten(example['sentences'])
            span_list, antecedent_list = [], []
            for idx, antecedent_idx in enumerate(predicted_antecedents):
                if antecedent_idx == -1:
                    continue
                span_subtoken_idx = (top_span_starts[idx], top_span_ends[idx])
                span_str = ' '.join(subtokens[span_subtoken_idx[0]:span_subtoken_idx[1] + 1])
                antecedent_subtoken_idx = (top_span_starts[antecedent_idx], top_span_ends[antecedent_idx])
                antecedent_str = ' '.join(subtokens[antecedent_subtoken_idx[0]:antecedent_subtoken_idx[1] + 1])
                # print('%s ---> %s' % (span_str, antecedent_str))
                span_list.append(span_str)
                antecedent_list.append(antecedent_str)
            visualize_list.append((span_list, antecedent_list))

    summary_dict = {}
    if eval_mode:
        conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, self.subtoken_maps, official_stdout)
        average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
        summary_dict["Average F1 (conll)"] = average_f1
        print("Average F1 (conll): {:.2f}%".format(average_f1))

    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    logger.info("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(doc_keys)))
    summary_dict["Average precision (py)"] = p
    logger.info("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    logger.info("Average recall (py): {:.2f}%".format(r * 100))

    if visualize:
        # Use a distinct name for the file handle so the F1 score `f` is not overwritten before the return.
        with open('visualize.bin', 'wb') as f_out:
            pickle.dump(visualize_list, f_out)
        logger.info('Saved visualized list')

    return util.make_summary(summary_dict), f
def evaluate(self, session, official_stdout=False):
    self.load_eval_data()

    tp, fn, fp = 0, 0, 0
    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()

    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
        _, _, _, _, _, _, _, _, _, gold_starts, gold_ends, _, _, _, _ = tensorized_example
        feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
        top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores = session.run(self.predictions, feed_dict=feed_dict)
        predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
        coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)

        gold_mentions = set([(s, e) for cl in example["clusters"] for s, e in cl])
        pred_mentions = set([(s, e) for s, e in zip(top_span_starts, top_span_ends)])
        tp += len(gold_mentions & pred_mentions)
        fn += len(gold_mentions - pred_mentions)
        fp += len(pred_mentions - gold_mentions)

        if example_num % 10 == 0:
            print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

    m_r = float(tp) / (tp + fn)
    m_p = float(tp) / (tp + fp)
    m_f1 = 2.0 * m_r * m_p / (m_r + m_p)
    print("Mention F1: {:.2f}%".format(m_f1 * 100))
    print("Mention recall: {:.2f}%".format(m_r * 100))
    print("Mention precision: {:.2f}%".format(m_p * 100))

    summary_dict = {}
    if official_stdout:
        conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, official_stdout)
        average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
        summary_dict["Average F1 (conll)"] = average_f1
        print("Average F1 (conll): {:.2f}%".format(average_f1))

    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}%".format(f * 100))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))

    average_f1 = average_f1 if official_stdout else f * 100
    return util.make_summary(summary_dict), average_f1
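# Every snippet above funnels its predictions through an evaluate_coref helper that builds
# predicted clusters from the decoded antecedents and updates a CorefEvaluator, but the helper
# itself is not included in this collection. The following is a minimal sketch under the
# assumption that clusters are decoded by chaining each span to its predicted antecedent (as
# suggested by the get_predicted_clusters call in the decoding fragment above); the function
# name is hypothetical and the update call only mirrors the CorefEvaluator.update signature
# visible in the snippets.

def evaluate_coref_sketch(top_span_starts, top_span_ends, predicted_antecedents, gold_clusters, evaluator):
    # Decode predicted clusters by chaining spans to their predicted antecedents.
    mention_to_cluster_id = {}
    predicted_clusters = []
    for i, antecedent_idx in enumerate(predicted_antecedents):
        if antecedent_idx < 0:
            continue  # no antecedent predicted for this span
        antecedent = (int(top_span_starts[antecedent_idx]), int(top_span_ends[antecedent_idx]))
        mention = (int(top_span_starts[i]), int(top_span_ends[i]))
        if antecedent in mention_to_cluster_id:
            cluster_id = mention_to_cluster_id[antecedent]
        else:
            cluster_id = len(predicted_clusters)
            predicted_clusters.append([antecedent])
            mention_to_cluster_id[antecedent] = cluster_id
        predicted_clusters[cluster_id].append(mention)
        mention_to_cluster_id[mention] = cluster_id

    predicted_clusters = [tuple(cluster) for cluster in predicted_clusters]
    mention_to_predicted = {m: predicted_clusters[cid] for m, cid in mention_to_cluster_id.items()}

    gold_clusters = [tuple(tuple(m) for m in gc) for gc in gold_clusters]
    mention_to_gold = {m: gc for gc in gold_clusters for m in gc}

    # Same argument order as the coref_evaluator.update calls shown in the snippets above.
    evaluator.update(predicted_clusters, gold_clusters, mention_to_predicted, mention_to_gold)
    return predicted_clusters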