def evaluate(self, session, official_stdout=False):
    """Run coreference inference over the eval set and score GAP-style accuracy.

    For every example, predicted clusters are attached as 'predict_cluster';
    a per-example label is then derived (via the external `predict` helper)
    from the predicted clusters and the Pronoun/A/B mentions, and compared
    against the gold 'label' column.

    Returns:
        (summary_dict, accuracy): a dict holding the accuracy, and the
        accuracy itself.
    """
    self.load_eval_data()
    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    predictions = []
    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
        # Only gold span boundaries are unpacked by name; the full tuple is
        # fed to the graph through feed_dict below.
        _, _, _, _, _, _, _, _, _, gold_starts, gold_ends, _ = tensorized_example
        feed_dict = {i:t for i,t in zip(self.input_tensors, tensorized_example)}
        candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores = session.run(self.predictions, feed_dict=feed_dict)
        predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
        coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)
        # Mutates the example in place so the DataFrame below sees the
        # predicted clusters alongside the gold columns.
        example['predict_cluster'] = coref_predictions[example["doc_key"]]
        predictions.append(example)
        if example_num % 10 == 9:
            print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))
    df = pd.DataFrame(predictions)
    # `predict` (module-level helper) maps a row's predicted clusters plus the
    # Pronoun/A/B mention spans to a GAP label.
    df['predict'] = df[['predict_cluster', 'Pronoun_mention', 'A_mention', 'B_mention']].apply(predict, axis=1)
    accuracy = accuracy_score(df['predict'], df['label'])
    summary_dict = {}
    summary_dict['accuracy'] = accuracy
    print(accuracy)
    return summary_dict, accuracy
    # NOTE(review): the stray ''' below is preserved from the original source;
    # it appears to open a commented-out region — confirm before removing.
    '''
def evaluate(self, session, official_stdout=False):
    """Score the eval set with the official CoNLL scorer and python metrics.

    Returns:
        (summary, average_f1): a TF summary of the metrics and the CoNLL F1.
    """
    self.load_eval_data()
    evaluator = metrics.CorefEvaluator()
    doc_predictions = {}
    for n, (tensorized, ex) in enumerate(self.eval_data):
        _, _, _, _, _, _, _, _, _, gold_starts, gold_ends, _ = tensorized
        feed = dict(zip(self.input_tensors, tensorized))
        (candidate_starts, candidate_ends, candidate_mention_scores,
         top_span_starts, top_span_ends, top_antecedents,
         top_antecedent_scores) = session.run(self.predictions, feed_dict=feed)
        antecedents = self.get_predicted_antecedents(top_antecedents,
                                                     top_antecedent_scores)
        doc_predictions[ex["doc_key"]] = self.evaluate_coref(
            top_span_starts, top_span_ends, antecedents, ex["clusters"],
            evaluator)
        if n % 10 == 0:
            print("Evaluated {}/{} examples.".format(n + 1, len(self.eval_data)))
    summary = {}
    conll_results = conll.evaluate_conll(self.config["conll_eval_path"],
                                         doc_predictions, official_stdout)
    average_f1 = sum(res["f"] for res in conll_results.values()) / len(conll_results)
    summary["Average F1 (conll)"] = average_f1
    print("Average F1 (conll): {:.2f}%".format(average_f1))
    p, r, f = evaluator.get_prf()
    # Report the python-side metrics in the same order as the CoNLL block.
    for label, value in (("Average F1 (py)", f),
                         ("Average precision (py)", p),
                         ("Average recall (py)", r)):
        summary[label] = value
        print("{}: {:.2f}%".format(label, value * 100))
    return util.make_summary(summary), average_f1
def evaluate(model, eval_dataloader, data_path, device):
    """Evaluate `model` on the jsonlines file at `data_path`.

    Returns:
        (precision, recall, f1) from the coreference evaluator.
    """
    with open(data_path) as fh:
        examples = [json.loads(line) for line in fh.readlines()]
    model.eval()
    evaluator = metrics.CorefEvaluator(singleton=False)
    with torch.no_grad():
        for step, (batch, example) in enumerate(zip(eval_dataloader, examples)):
            doc_key = batch[0]
            # The dataloader and the jsonlines file must stay aligned.
            assert doc_key == example["doc_key"], (doc_key, example["doc_key"])
            (input_ids, input_mask, text_len, speaker_ids, genre, gold_starts,
             gold_ends, cluster_ids, sentence_map, subtoken_map) = [
                t.to(device) for t in batch[1:]]
            predictions, loss = model(input_ids, input_mask, text_len,
                                      speaker_ids, genre, gold_starts,
                                      gold_ends, cluster_ids, sentence_map,
                                      subtoken_map)
            (top_span_starts, top_span_ends, top_antecedents,
             top_antecedent_scores, candidate_starts, candidate_ends,
             top_span_cluster_ids, top_span_mention_scores,
             candidate_mention_scores) = [p.detach().cpu() for p in predictions]
            predicted_antecedents = get_predicted_antecedents(
                top_antecedents.numpy(), top_antecedent_scores.numpy())
            predicted_clusters = evaluate_coref(
                top_span_starts.numpy(), top_span_ends.numpy(),
                predicted_antecedents, example["clusters"], evaluator,
                top_span_mention_scores)
    return evaluator.get_prf()
def eval(self, path):
    """Evaluate coreference on the feed dicts built from `path`; print P/R/F1."""
    feed_dicts = self.get_feed_dict_list(path, False)
    evaluator = metrics.CorefEvaluator()
    for fd, clusters in feed_dicts:
        starts = fd[self.mention_starts]
        ends = fd[self.mention_ends]
        antecedents, pair_scores = self.sess.run(self.predictions, fd)
        # Shift the argmax by -1: a negative result selects no antecedent
        # (column 0 appears to be a dummy "no antecedent" score — confirm).
        best = np.argmax(pair_scores, axis=1) - 1
        predicted = [-1 if j < 0 else antecedents[i, j]
                     for i, j in enumerate(best)]
        self.evaluate_coref(starts, ends, predicted, clusters, evaluator)
    p, r, f = evaluator.get_prf()
    print("Average F1 (py): {:.2f}%".format(f * 100))
    print("Average precision (py): {:.2f}%".format(p * 100))
    print("Average recall (py): {:.2f}%".format(r * 100))
def evaluate(self, session, official_stdout=False):
    """Evaluate mention retrieval and coreference on the eval set.

    Reports mention-retrieval R/P/F at several candidate cutoffs (plus
    oracle/actual/exact/threshold modes), the official CoNLL F1, and the
    python-side coreference P/R/F1.

    Fix: modernized Python 2 `print` statements to `print()` calls for
    consistency with the rest of the file (which is Python 3).

    Returns:
        (summary, average_f1): TF summary of the metric dict and CoNLL F1.
    """
    self.load_eval_data()

    def _k_to_tag(k):
        # Negative/zero cutoffs are special retrieval modes; positive k is a
        # percentage of candidate spans kept.
        if k == -3:
            return "oracle"
        elif k == -2:
            return "actual"
        elif k == -1:
            return "exact"
        elif k == 0:
            return "threshold"
        else:
            return "{}%".format(k)

    mention_evaluators = {k: util.RetrievalEvaluator()
                          for k in [-3, -2, -1, 0, 10, 15, 20, 25, 30, 40, 50]}
    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
        _, _, _, _, _, _, gold_starts, gold_ends, _, tag_labels, tag_seq, tag_loss_label = tensorized_example
        feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
        (candidate_starts, candidate_ends, mention_scores, mention_starts,
         mention_ends, antecedents, antecedent_scores, tag_outputs,
         tag_seq) = session.run(self.predictions, feed_dict=feed_dict)
        self.evaluate_mentions(candidate_starts, candidate_ends, mention_starts,
                               mention_ends, mention_scores, gold_starts,
                               gold_ends, example, mention_evaluators)
        predicted_antecedents = self.get_predicted_antecedents(antecedents,
                                                               antecedent_scores)
        coref_predictions[example["doc_key"]] = self.evaluate_coref(
            mention_starts, mention_ends, predicted_antecedents,
            example["clusters"], coref_evaluator)
        if example_num % 10 == 0:
            print("Evaluated {}/{} examples.".format(example_num + 1,
                                                     len(self.eval_data)))
        # print(tag_outputs)
        # print(tag_seq)

    summary_dict = {}
    for k, evaluator in sorted(mention_evaluators.items(),
                               key=operator.itemgetter(0)):
        tags = ["{} @ {}".format(t, _k_to_tag(k)) for t in ("R", "P", "F")]
        results_to_print = []
        for t, v in zip(tags, evaluator.metrics()):
            results_to_print.append("{:<10}: {:.2f}".format(t, v))
            summary_dict[t] = v
        print(", ".join(results_to_print))

    conll_results = conll.evaluate_conll(self.config["conll_eval_path"],
                                         coref_predictions, official_stdout)
    average_f1 = sum(results["f"]
                     for results in conll_results.values()) / len(conll_results)
    summary_dict["Average F1 (conll)"] = average_f1
    print("Average F1 (conll): {:.2f}%".format(average_f1))
    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}%".format(f * 100))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))
    return util.make_summary(summary_dict), average_f1
def conll_evaluate(l0_inputs, alphas, conll_eval_path, all_top_antecedent_scores):
    """Run the CoNLL evaluation suite once per alpha value.

    Returns:
        A dict-of-lists summary with one metric entry appended per alpha.
    """
    print("Compiling clusters and evaluators for conll suite")
    if isinstance(alphas, float) or isinstance(alphas, int):
        alphas = [alphas]
    predictions_per_alpha = [{} for _ in alphas]
    evaluators = [metrics.CorefEvaluator() for _ in alphas]
    subtoken_maps = {}
    with open(l0_inputs, "rb") as f:
        data_dicts = np.load(f, allow_pickle=True).item().get("data_dicts")
    for data_dict in tqdm(data_dicts):
        example = data_dict["example"]
        doc_key = example["doc_key"]
        subtoken_maps[doc_key] = example["subtoken_map"]
        starts = data_dict["top_span_starts"]
        ends = data_dict["top_span_ends"]
        antecedents = data_dict["top_antecedents"]
        # Each alpha has its own score matrix for this document.
        for idx in range(len(alphas)):
            scores = all_top_antecedent_scores[doc_key][idx]
            predicted = get_predicted_antecedents(antecedents, scores)
            predictions_per_alpha[idx][doc_key] = evaluate_coref(
                starts, ends, predicted, example["clusters"], evaluators[idx])
    summary_dict = DD(list)
    for idx, alpha in enumerate(alphas):
        print("\n*****************************")
        print("******* alpha = %f *******" % alpha)
        summary_dict["alpha"].append(alpha)
        conll_results = conll.evaluate_conll(conll_eval_path,
                                             predictions_per_alpha[idx],
                                             subtoken_maps,
                                             official_stdout=True)
        average_f1 = sum(res["f"]
                         for res in conll_results.values()) / len(conll_results)
        summary_dict["Average F1 (conll)"].append(average_f1)
        print("Average F1 (conll): {:.2f}%".format(average_f1))
        p, r, f = evaluators[idx].get_prf()
        summary_dict["Average F1 (py)"].append(f)
        print("Average F1 (py): {:.2f}% on {} docs".format(
            f * 100, len(subtoken_maps.keys())))
        summary_dict["Average precision (py)"].append(p)
        print("Average precision (py): {:.2f}%".format(p * 100))
        summary_dict["Average recall (py)"].append(r)
        print("Average recall (py): {:.2f}%".format(r * 100))
    return summary_dict
def evaluate(self, model, device, official_stdout=False, keys=None, eval_mode=False):
    """Evaluate `model` over the eval set.

    Args:
        keys: optional whitelist of doc_keys; other documents are skipped.
        eval_mode: when True, also run the official CoNLL scorer.

    Returns:
        (summary_dict, f) where f is the python-side average F1.
    """
    self.load_eval_data()
    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    doc_keys = []
    with torch.no_grad():
        for example_num, example in enumerate(tqdm(self.eval_data, desc="Eval_Examples")):
            tensorized = model.tensorize_example(example, is_training=False)

            def _to_long(a):
                # numpy array -> long tensor on the target device
                return torch.from_numpy(a).long().to(device)

            input_ids = _to_long(tensorized[0])
            input_mask = _to_long(tensorized[1])
            text_len = _to_long(tensorized[2])
            speaker_ids = _to_long(tensorized[3])
            genre = torch.tensor(tensorized[4]).long().to(device)
            is_training = tensorized[5]
            gold_starts = _to_long(tensorized[6])
            gold_ends = _to_long(tensorized[7])
            cluster_ids = _to_long(tensorized[8])
            sentence_map = torch.Tensor(tensorized[9]).long().to(device)
            if keys is not None and example['doc_key'] not in keys:
                continue
            doc_keys.append(example['doc_key'])
            (candidate_starts, candidate_ends, candidate_mention_scores,
             top_span_starts, top_span_ends, top_antecedents,
             top_antecedent_scores), loss = model(
                input_ids, input_mask, text_len, speaker_ids, genre,
                is_training, gold_starts, gold_ends, cluster_ids, sentence_map)
            predicted_antecedents = self.get_predicted_antecedents(
                top_antecedents.cpu(), top_antecedent_scores.cpu())
            coref_predictions[example["doc_key"]] = self.evaluate_coref(
                top_span_starts, top_span_ends, predicted_antecedents,
                example["clusters"], coref_evaluator)
    summary_dict = {}
    if eval_mode:
        conll_results = conll.evaluate_conll(self.config["conll_eval_path"],
                                             coref_predictions,
                                             self.subtoken_maps, official_stdout)
        average_f1 = sum(res["f"]
                         for res in conll_results.values()) / len(conll_results)
        summary_dict["Average F1 (conll)"] = average_f1
        print("Average F1 (conll): {:.2f}%".format(average_f1))
    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(doc_keys)))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))
    return summary_dict, f
def evaluate(self, session, official_stdout=False):
    """Evaluate on the pickled eval set, mapping spans through inv_mapping.

    Span indices predicted over subtokens are mapped back to original token
    positions via the per-document inverse mapping before scoring.

    Fix: the original fell off the end and returned None; it now returns
    (summary, average_f1) like the sibling evaluate() implementations.

    Returns:
        (summary, average_f1): TF summary of the metrics and the CoNLL F1.
    """
    # self.load_eval_data()
    with open(self.config["inv_mapping"], 'rb') as handle:
        inv_mapping = pickle.load(handle)
    with open(self.config["eval_path"], 'rb') as handle:
        test = pickle.load(handle)
    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    for i in range(len(test)):
        # HACK: hard-coded skip of three documents — presumably known-bad
        # examples; TODO confirm and replace with doc_key-based filtering.
        if i == 191 or i == 217 or i == 225:
            continue
        example = test[i]
        file_name = example["doc_key"]
        inv_map = inv_mapping[file_name]
        tensorized_example = self.tensorize_example(example, is_training=False)
        _, _, _, _, _, _, gold_starts, gold_ends, _ = tensorized_example
        feed_dict = {ph: t for ph, t in zip(self.input_tensors, tensorized_example)}
        _, _, _, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores = session.run(
            self.predictions, feed_dict=feed_dict)
        # Map subtoken-level span indices back to original token indices.
        top_span_starts = inv_map[top_span_starts]
        top_span_ends = inv_map[top_span_ends]
        predicted_antecedents = self.get_predicted_antecedents(
            top_antecedents, top_antecedent_scores)
        coref_predictions[file_name] = self.evaluate_coref(
            top_span_starts, top_span_ends, predicted_antecedents,
            example["clusters"], coref_evaluator)
        if i % 10 == 0:
            print("Evaluated {}/{} examples.".format(i + 1, len(test)))
    summary_dict = {}
    conll_results = conll.evaluate_conll(self.config["conll_eval_path"],
                                         coref_predictions, official_stdout)
    average_f1 = sum(results["f"]
                     for results in conll_results.values()) / len(conll_results)
    summary_dict["Average F1 (conll)"] = average_f1
    print("Average F1 (conll): {:.2f}%".format(average_f1))
    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}%".format(f * 100))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))
    # Fix: return results instead of implicitly returning None.
    return util.make_summary(summary_dict), average_f1
def evaluate(self, session, official_stdout=False):
    """Evaluate the eval set, track validation loss, and pickle the clusters.

    Fix: `cluster_result['gold']` previously stored the `official_stdout`
    boolean flag; it now stores the gold clusters per doc_key, matching the
    'prediction' entry.

    Returns:
        (summary, average_f1, avg_loss): TF summary, CoNLL F1, mean eval loss.
    """
    self.load_eval_data()
    coref_predictions = {}
    gold_clusters = {}
    coref_evaluator = metrics.CorefEvaluator()
    avg_loss = 0.0
    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
        _, _, _, _, _, _, _, _, _, gold_starts, gold_ends, _, _, _, _ = tensorized_example
        feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
        predictions, loss = session.run([self.predictions, self.loss],
                                        feed_dict=feed_dict)
        (candidate_starts, candidate_ends, top_span_starts, top_span_ends,
         top_antecedents, top_antecedent_scores, _, _) = predictions
        predicted_antecedents = self.get_predicted_antecedents(
            top_antecedents, top_antecedent_scores)
        coref_predictions[example["doc_key"]] = self.evaluate_coref(
            top_span_starts, top_span_ends, predicted_antecedents,
            example["clusters"], coref_evaluator)
        gold_clusters[example["doc_key"]] = example["clusters"]
        if example_num % 20 == 0:
            print("Evaluated {}/{} examples.".format(example_num + 1,
                                                     len(self.eval_data)))
        avg_loss += loss
    avg_loss = avg_loss / len(self.eval_data)
    summary_dict = {}
    conll_results = conll.evaluate_conll(self.config["conll_eval_path"],
                                         coref_predictions, official_stdout)
    # Fix: store gold clusters (was the official_stdout flag).
    cluster_result = {'prediction': coref_predictions, 'gold': gold_clusters}
    with open('evaluate_result.pickle', 'wb') as handle:
        pickle.dump(cluster_result, handle, protocol=pickle.HIGHEST_PROTOCOL)
    average_f1 = sum(results["f"]
                     for results in conll_results.values()) / len(conll_results)
    summary_dict["Average F1 (conll)"] = average_f1
    print("Average F1 (conll): {:.2f}%".format(average_f1))
    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}%".format(f * 100))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))
    summary_dict["Validation loss"] = avg_loss
    print("Validation loss: {:.3f}".format(avg_loss))
    return util.make_summary(summary_dict), average_f1, avg_loss
def evaluate(gold_file, predicted_file):
    """Compare predicted vs gold cluster files; print coref and mention P/R/F."""
    metrics.INCLUDE_SINGLETONS = True
    eval_data = load_eval_data(gold_file, predicted_file)
    doc_predictions = {}
    evaluator = metrics.CorefEvaluator()
    for example in eval_data:
        doc_predictions[example["doc_key"]] = evaluate_coref(
            example['predicted_clusters'], example['clusters'], evaluator)
    mention_p, mention_r, mention_f = metrics.get_prf_mentions_for_all_documents(
        eval_data, doc_predictions)
    summary = {}
    p, r, f = evaluator.get_prf()
    average_f1 = f * 100
    summary["Average F1 (py)"] = average_f1
    print("Average F1 (py): {:.2f}%".format(average_f1))
    summary["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))
    average_mention_f1 = mention_f * 100
    summary["Average mention F1 (py)"] = average_mention_f1
    print("Average mention F1 (py): {:.2f}%".format(average_mention_f1))
    summary["Average mention precision (py)"] = mention_p
    print("Average mention precision (py): {:.2f}%".format(mention_p * 100))
    summary["Average mention recall (py)"] = mention_r
    print("Average mention recall (py): {:.2f}%".format(mention_r * 100))
def evaluate(self, data, transformer_model):
    """Evaluation routine: run the model over `data` and score coreference.

    Returns:
        (p, r, f) from the coreference evaluator.
    """
    evaluator = metrics.CorefEvaluator()
    with torch.no_grad():
        for batch_idx, batch in enumerate(data):
            (sentences_ids, sentences_masks, sentences_valid_masks,
             gold_clusters, speaker_ids, sentence_map, subtoken_map,
             genre) = batch
            (top_ant_scores, top_ant_index, top_spans_masks, top_spans_start,
             top_spans_end) = self.forward(
                sentences_ids, sentences_masks, sentences_valid_masks,
                speaker_ids, sentence_map, subtoken_map, genre,
                transformer_model)
            predicted_antecedents = self.get_predicted_antecedents(
                top_ant_index, top_ant_scores)
            # All top spans, as (start, end) tuples of python ints.
            spans = [(s.item(), e.item())
                     for s, e in zip(top_spans_start, top_spans_end)]
            gold = [tuple((m[0], m[1]) for m in gc) for gc in gold_clusters]
            mention_to_gold = {mention: gc for gc in gold for mention in gc}
            predicted_clusters, mention_to_predicted = self.get_predicted_clusters(
                spans, predicted_antecedents)
            evaluator.update(predicted_clusters, gold, mention_to_predicted,
                             mention_to_gold)
    return evaluator.get_prf()
def eval_coref(config):
    """Validate the coreference-resolution model over the dev examples.

    :param config: configuration parameters (model path, conll eval path, ...)
    :return: None (metrics are printed / collected in summary_dict only)
    """
    model = CorefModel.from_pretrained(config["model_save_path"],
                                       coref_task_config=config)
    model.to(device)
    examples = model.get_eval_example()
    logger.info("********** Running Eval ****************")
    logger.info(" Num dev examples = %d", len(examples))
    model.eval()
    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    doc_keys = []
    keys = None  # no doc_key filtering is applied in this eval path
    with torch.no_grad():
        for example_num, example in enumerate(
                tqdm(examples, desc="Eval_Examples")):
            tensorized_example = model.tensorize_example(example,
                                                         is_training=False)
            # Move each tensorized field to the target device.
            input_ids = torch.from_numpy(
                tensorized_example[0]).long().to(device)
            input_mask = torch.from_numpy(
                tensorized_example[1]).long().to(device)
            text_len = torch.from_numpy(
                tensorized_example[2]).long().to(device)
            speaker_ids = torch.from_numpy(
                tensorized_example[3]).long().to(device)
            genre = torch.tensor(tensorized_example[4]).long().to(device)
            is_training = tensorized_example[5]
            gold_starts = torch.from_numpy(
                tensorized_example[6]).long().to(device)
            gold_ends = torch.from_numpy(
                tensorized_example[7]).long().to(device)
            cluster_ids = torch.from_numpy(
                tensorized_example[8]).long().to(device)
            sentence_map = torch.Tensor(
                tensorized_example[9]).long().to(device)
            if keys is not None and example['doc_key'] not in keys:
                continue
            doc_keys.append(example['doc_key'])
            (candidate_starts, candidate_ends, candidate_mention_scores,
             top_span_starts, top_span_ends, top_antecedents,
             top_antecedent_scores), loss = model(input_ids, input_mask,
                                                  text_len, speaker_ids,
                                                  genre, is_training,
                                                  gold_starts, gold_ends,
                                                  cluster_ids, sentence_map)
            predicted_antecedents = model.get_predicted_antecedents(
                top_antecedents.cpu(), top_antecedent_scores.cpu())
            coref_predictions[example["doc_key"]] = model.evaluate_coref(
                top_span_starts, top_span_ends, predicted_antecedents,
                example["clusters"], coref_evaluator)
    # Both flags are hard-coded on, so the CoNLL scorer always runs here.
    official_stdout = True
    eval_mode = True
    summary_dict = {}
    if eval_mode:
        conll_results = conll.evaluate_conll(config["conll_eval_path"],
                                             coref_predictions,
                                             model.subtoken_maps,
                                             official_stdout)
        average_f1 = sum(
            results["f"]
            for results in conll_results.values()) / len(conll_results)
        summary_dict["Average F1 (conll)"] = average_f1
        print("Average F1 (conll): {:.2f}%".format(average_f1))
    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(doc_keys)))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))
def evaluate(self, session, official_stdout=False):
    """Evaluate coreference plus exact-match mention detection.

    Fix: guarded the mention P/R/F1 divisions against zero denominators
    (the original raised ZeroDivisionError on degenerate eval sets with no
    gold or no predicted mentions).

    Returns:
        (summary, average_f1): TF summary and the CoNLL F1 when
        `official_stdout` is truthy, otherwise the python-side F1 * 100.
    """
    self.load_eval_data()
    tp, fn, fp = 0, 0, 0
    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
        _, _, _, _, _, _, _, _, _, gold_starts, gold_ends, _, _, _, _ = tensorized_example
        feed_dict = {ph: t for ph, t in zip(self.input_tensors, tensorized_example)}
        top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores = session.run(
            self.predictions, feed_dict=feed_dict)
        predicted_antecedents = self.get_predicted_antecedents(
            top_antecedents, top_antecedent_scores)
        coref_predictions[example["doc_key"]] = self.evaluate_coref(
            top_span_starts, top_span_ends, predicted_antecedents,
            example["clusters"], coref_evaluator)
        # Exact-span mention-detection counts.
        gold_mentions = set((s, e) for cl in example["clusters"] for s, e in cl)
        pred_mentions = set((s, e) for s, e in zip(top_span_starts, top_span_ends))
        tp += len(gold_mentions & pred_mentions)
        fn += len(gold_mentions - pred_mentions)
        fp += len(pred_mentions - gold_mentions)
        if example_num % 10 == 0:
            print("Evaluated {}/{} examples.".format(example_num + 1,
                                                     len(self.eval_data)))
    # Fix: avoid division by zero when there are no gold/predicted mentions.
    m_r = float(tp) / (tp + fn) if (tp + fn) > 0 else 0.0
    m_p = float(tp) / (tp + fp) if (tp + fp) > 0 else 0.0
    m_f1 = 2.0 * m_r * m_p / (m_r + m_p) if (m_r + m_p) > 0 else 0.0
    print("Mention F1: {:.2f}%".format(m_f1 * 100))
    print("Mention recall: {:.2f}%".format(m_r * 100))
    print("Mention precision: {:.2f}%".format(m_p * 100))
    summary_dict = {}
    if official_stdout:
        conll_results = conll.evaluate_conll(self.config["conll_eval_path"],
                                             coref_predictions, official_stdout)
        average_f1 = sum(results["f"]
                         for results in conll_results.values()) / len(conll_results)
        summary_dict["Average F1 (conll)"] = average_f1
        print("Average F1 (conll): {:.2f}%".format(average_f1))
    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}%".format(f * 100))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))
    # `average_f1` only exists when the CoNLL scorer ran; the conditional
    # short-circuits to the python-side F1 otherwise.
    average_f1 = average_f1 if official_stdout else f * 100
    return util.make_summary(summary_dict), average_f1
def evaluate(self, session, global_step=None, official_stdout=False, keys=None,
             eval_mode=False, visualize=False):
    """Evaluate the eval set; optionally dump (span, antecedent) string pairs.

    Fix: the `with open(...) as f` in the visualize branch shadowed the F1
    score `f`, so with visualize=True the function returned a closed file
    object instead of the F1; the file handle is now named `vis_file`.

    Args:
        keys: optional whitelist of doc_keys; others are skipped.
        eval_mode: when True, also run the official CoNLL scorer.
        visualize: when True, pickle predicted (span, antecedent) pairs to
            'visualize.bin'.

    Returns:
        (summary, f) where f is the python-side average F1.
    """
    self.load_eval_data()
    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    losses = []
    doc_keys = []
    num_evaluated = 0
    visualize_list = []
    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
        _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example
        feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
        if keys is not None and example['doc_key'] not in keys:
            continue
        doc_keys.append(example['doc_key'])
        loss, (candidate_starts, candidate_ends, candidate_mention_scores,
               top_span_starts, top_span_ends, top_antecedents,
               top_antecedent_scores) = session.run(
                   [self.loss, self.predictions], feed_dict=feed_dict)
        losses.append(loss)
        predicted_antecedents = self.get_predicted_antecedents(
            top_antecedents, top_antecedent_scores)
        predicted_clusters = self.evaluate_coref(top_span_starts, top_span_ends,
                                                 predicted_antecedents,
                                                 example["clusters"],
                                                 coref_evaluator)
        coref_predictions[example["doc_key"]] = predicted_clusters
        # Collect human-readable (span, antecedent) pairs per document.
        if visualize:
            print('*****New Doc*****')
            subtokens = util.flatten(example['sentences'])
            span_list, antecedent_list = [], []
            for idx, antecedent_idx in enumerate(predicted_antecedents):
                if antecedent_idx == -1:
                    continue
                span_subtoken_idx = (top_span_starts[idx], top_span_ends[idx])
                span_str = ' '.join(
                    subtokens[span_subtoken_idx[0]:span_subtoken_idx[1] + 1])
                antecedent_subtoken_idx = (top_span_starts[antecedent_idx],
                                           top_span_ends[antecedent_idx])
                antecedent_str = ' '.join(
                    subtokens[antecedent_subtoken_idx[0]:
                              antecedent_subtoken_idx[1] + 1])
                span_list.append(span_str)
                antecedent_list.append(antecedent_str)
            visualize_list.append((span_list, antecedent_list))
    summary_dict = {}
    if eval_mode:
        conll_results = conll.evaluate_conll(
            self.config["conll_eval_path"], coref_predictions,
            self.subtoken_maps, official_stdout)
        average_f1 = sum(
            results["f"] for results in conll_results.values()) / len(conll_results)
        summary_dict["Average F1 (conll)"] = average_f1
        print("Average F1 (conll): {:.2f}%".format(average_f1))
    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    logger.info("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(doc_keys)))
    summary_dict["Average precision (py)"] = p
    logger.info("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    logger.info("Average recall (py): {:.2f}%".format(r * 100))
    if visualize:
        # Fix: do NOT name this handle `f` — that clobbered the F1 returned below.
        with open('visualize.bin', 'wb') as vis_file:
            pickle.dump(visualize_list, vis_file)
        logger.info('Saved visialized')
    return util.make_summary(summary_dict), f
def evaluate(self, session, official_stdout=False):
    """Joint coreference + SWAG evaluation on the pickled test set.

    Fix: replaced the bare `except:` around the CoNLL scorer with
    `except Exception` (the bare form also swallowed KeyboardInterrupt /
    SystemExit and hid the failure reason).

    Returns:
        (summary, average_f1, swag_accuracy).
    """
    # self.load_eval_data()
    with open(self.config["inv_mapping"], 'rb') as handle:
        inv_mapping = pickle.load(handle)
    with open(self.config["eval_path"], 'rb') as handle:
        test = pickle.load(handle)
    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    swag_predictions = []
    swag_labels = []
    for i in range(len(test)):
        # HACK: hard-coded skip of three documents — presumably known-bad
        # examples; TODO confirm.
        if i == 191 or i == 217 or i == 225:
            continue
        example = test[i]
        file_name = example["doc_key"]
        inv_map = inv_mapping[file_name]
        tensorized_example = self.tensorize_example(example, i, is_training=False)
        feed_dict = {ph: t for ph, t in zip(self.input_tensors, tensorized_example)}
        lee_predictions, swag_pred = session.run(
            [self.predictions2, self.swag_predictions], feed_dict=feed_dict)
        _, _, _, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores = lee_predictions
        # Map subtoken-level span indices back through the inverse mapping.
        top_span_starts = inv_map[top_span_starts]
        top_span_ends = inv_map[top_span_ends]
        predicted_antecedents = self.get_predicted_antecedents(
            top_antecedents, top_antecedent_scores)
        coref_predictions[file_name] = self.evaluate_coref(
            top_span_starts, top_span_ends, predicted_antecedents,
            example["clusters"], coref_evaluator)
        # SWAG evaluation
        swag_label = tensorized_example[-1]
        swag_predictions.append(swag_pred[0])
        swag_labels.append(swag_label[0])
        if i % 10 == 0:
            print("Evaluated {}/{} examples.".format(i + 1, len(test)))
    summary_dict = {}
    try:
        conll_results = conll.evaluate_conll(
            self.config["conll_eval_path"], coref_predictions, official_stdout)
        average_f1 = sum(
            results["f"] for results in conll_results.values()) / len(conll_results)
        summary_dict["Average F1 (conll)"] = average_f1
        print("Average F1 (conll): {:.2f}%".format(average_f1))
    except Exception as e:
        # Fix: narrowed from a bare except; surface the failure reason.
        print("unstable results", e)
        average_f1 = 0
    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}%".format(f * 100))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))
    print("Now evaluating SWAG")
    swag_accuracy = self.swag_evaluation(swag_predictions, swag_labels)
    print("Average SWAG accuracy is : {:.2f}%".format(swag_accuracy * 100))
    return util.make_summary(summary_dict), average_f1, swag_accuracy
def __init__(self, config):
    """Build the BERT-based coref model: input queue, predictions/loss, optimizer.

    Args:
        config: dict of hyperparameters and file paths (BERT config/vocab,
            TF/PyTorch checkpoints, learning rates, genres, ...).
    """
    self.config = config
    self.max_segment_len = config['max_segment_len']
    self.max_span_width = config["max_span_width"]
    self.genres = {g: i for i, g in enumerate(config["genres"])}
    self.subtoken_maps = {}
    self.gold = {}
    self.eval_data = None  # Load eval data lazily.
    self.dropout = None
    self.bert_config = modeling.BertConfig.from_json_file(
        config["bert_config_file"])
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=config['vocab_file'], do_lower_case=False)
    input_props = []
    input_props.append(
        (tf.int32, [None, None]))  # input_ids. (batch_size, seq_len)
    input_props.append(
        (tf.int32, [None, None]))  # input_mask (batch_size, seq_len)
    input_props.append((tf.int32, [None]))  # Text lengths.
    input_props.append(
        (tf.int32, [None, None]))  # Speaker IDs. (batch_size, seq_len)
    input_props.append(
        (tf.int32, []))  # Genre. Scalar ensures one genre per batch: a batch holds segments of a single document.
    input_props.append((tf.bool, []))  # Is training.
    input_props.append(
        (tf.int32, [None]))  # Gold starts: start indices of all mentions in the whole document.
    input_props.append((tf.int32, [None]))  # Gold ends: end indices of all mentions in the document.
    input_props.append(
        (tf.int32, [None]))  # Cluster ids for every mention in the document.
    input_props.append(
        (tf.int32, [None]))  # Sentence map: which sentence each token belongs to.
    self.queue_input_tensors = [
        tf.placeholder(dtype, shape) for dtype, shape in input_props
    ]
    dtypes, shapes = zip(*input_props)
    queue = tf.PaddingFIFOQueue(capacity=10, dtypes=dtypes,
                                shapes=shapes)  # capacity 10 — presumably queue depth, not batch size; confirm
    self.enqueue_op = queue.enqueue(self.queue_input_tensors)
    self.input_tensors = queue.dequeue()  # dequeue endpoint; distinct tensors from self.queue_input_tensors
    self.predictions, self.loss = self.get_predictions_and_loss(
        *self.input_tensors)
    # bert stuff
    tvars = tf.trainable_variables()
    # If you're using TF weights only, tf_checkpoint and init_checkpoint can be the same
    # Get the assignment map from the tensorflow checkpoint.
    # Depending on the extension, use TF/Pytorch to load weights.
    assignment_map, initialized_variable_names = modeling.get_assignment_map_from_checkpoint(
        tvars, config['tf_checkpoint'])
    init_from_checkpoint = tf.train.init_from_checkpoint if config[
        'init_checkpoint'].endswith(
            'ckpt') else load_from_pytorch_checkpoint
    init_from_checkpoint(config['init_checkpoint'], assignment_map)
    print("**** Trainable Variables ****")
    for var in tvars:
        init_string = ""
        if var.name in initialized_variable_names:
            init_string = ", *INIT_FROM_CKPT*"
        # tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
        #                 init_string)
        print(" name = %s, shape = %s%s" % (var.name, var.shape, init_string))
    num_train_steps = int(self.config['num_docs'] *
                          self.config['num_epochs'])  # num documents * num epochs
    num_warmup_steps = int(num_train_steps * 0.1)  # first 10% of steps are warm-up
    self.global_step = tf.train.get_or_create_global_step()
    # Custom optimizer with separate BERT vs task learning rates.
    self.train_op = optimization.create_custom_optimizer(
        tvars, self.loss, self.config['bert_learning_rate'],
        self.config['task_learning_rate'], num_train_steps, num_warmup_steps,
        False, self.global_step, freeze=-1,
        task_opt=self.config['task_optimizer'], eps=config['adam_eps'])
    self.coref_evaluator = metrics.CorefEvaluator()
def evaluate(self, name='test', saves_results=False):
    """Evaluate the PyTorch coreference model on the ``name`` split.

    Runs the model over batches from ``data_utils.gen_batches``, updates a
    CorefEvaluator with predicted vs. gold clusters, optionally tracks POS
    tagging accuracy, and finishes with the official CoNLL scorer.

    Args:
        name: dataset split identifier (e.g. 'test'), also used to locate
            the gold CoNLL file.
        saves_results: currently unused (the saving branch is commented out).

    Returns:
        The average CoNLL F1 over the split.
    """
    # from collections import Counter
    # span_len_cnts = Counter()
    with torch.no_grad():
        log('evaluating')
        evaluator = metrics.CorefEvaluator()
        self.model.eval()  # disable dropout/batchnorm updates
        batch_num = 0
        next_logging_pct = 10.
        start_time = time.time()
        cluster_predictions = {}
        avg_pos_acc = 0.
        for pct, example_idx, input_tensors, pos_tags, cand_mention_labels in data_utils.gen_batches(
                name):
            batch_num += 1
            (
                # [cand_num]
                cand_mention_scores,
                # [top_cand_num]
                top_start_idxes,
                # [top_cand_num]
                top_end_idxes,
                # [top_cand_num]
                top_span_cluster_ids,
                # [top_span_num, pruned_ant_num]
                top_ant_idxes_of_spans,
                # [top_cand_num, pruned_ant_num]
                top_ant_cluster_ids_of_spans,
                # # [top_cand_num, 1 + pruned_ant_num]
                # top_ant_scores_of_spans,
                # 4 * [top_cand_num, 1 + pruned_ant_num]
                list_of_top_ant_scores_of_spans,
                # [top_span_num, pruned_ant_num]
                top_ant_mask_of_spans,
                # [doc_len, pos_tag_num]
                pos_tag_logits,
                # [top_span_num, 1 + top_span_num], [top_span_num, top_span_num]
                full_fast_ant_scores_of_spans, full_ant_mask_of_spans) = self.model(*input_tensors)
            # Decode antecedent scores into predicted clusters.
            (
                top_start_idxes, top_end_idxes, predicted_ant_idxes,
                predicted_clusters, span_to_predicted_cluster
            ) = Runner.predict(
                # [cand_num]
                cand_mention_scores,
                # [top_cand_num]
                top_start_idxes,
                # [top_cand_num]
                top_end_idxes,
                # [top_cand_num]
                top_span_cluster_ids,
                # [top_span_num, pruned_ant_num]
                top_ant_idxes_of_spans,
                # [top_cand_num, pruned_ant_num]
                top_ant_cluster_ids_of_spans,
                # # [top_cand_num, 1 + pruned_ant_num]
                # top_ant_scores_of_spans,
                # 4 * [top_cand_num, 1 + pruned_ant_num]
                list_of_top_ant_scores_of_spans,
                # [top_span_num, pruned_ant_num]
                top_ant_mask_of_spans)
            # span_len_cnts.update((top_end_idxes - top_start_idxes + 1).tolist())
            if configs.predicts_pos_tags:
                # Accumulate per-batch POS accuracy; averaged after the loop.
                avg_pos_acc += Runner.compute_accuracy(
                    pos_tag_logits, pos_tags.cuda())
            gold_clusters = data_utils.get_gold_clusters(name, example_idx)
            # Convert to hashable tuples so spans can serve as dict keys.
            gold_clusters = [
                tuple(tuple(span) for span in cluster)
                for cluster in gold_clusters
            ]
            span_to_gold_cluster = {
                span: cluster
                for cluster in gold_clusters for span in cluster
            }
            evaluator.update(
                predicted=predicted_clusters,
                gold=gold_clusters,
                mention_to_predicted=span_to_predicted_cluster,
                mention_to_gold=span_to_gold_cluster)
            cluster_predictions[data_utils.get_doc_key(
                name, example_idx)] = predicted_clusters
            if pct >= next_logging_pct:
                na_str = 'N/A'
                log(f'{int(pct)}%,\ttime:\t{time.time() - start_time}\n'
                    f'pos_acc:\t{avg_pos_acc / batch_num if configs.predicts_pos_tags else na_str}\n'
                    f'f1:\t{evaluator.get_f1()}\n')
                # NOTE(review): first log fires at 10% but subsequent logs come
                # every 5% — confirm this asymmetry is intended.
                next_logging_pct += 5.
        epoch_precision, epoch_recall, epoch_f1 = evaluator.get_prf()
        # NOTE(review): assumes at least one batch was yielded; batch_num == 0
        # would raise ZeroDivisionError here.
        avg_pos_acc /= batch_num
        # Official CoNLL scorer against the gold .v4_gold_conll file.
        avg_conll_f1 = conll.compute_avg_conll_f1(
            f'{configs.data_dir}/{name}.english.v4_gold_conll',
            cluster_predictions, official_stdout=True)
        na_str = 'N/A'
        log(f'avg_valid_time:\t{time.time() - start_time}\n'
            f'pos_acc:\t{avg_pos_acc if configs.predicts_pos_tags else na_str}\n'
            f'precision:\t{epoch_precision}\n'
            f'recall:\t{epoch_recall}\n'
            f'f1:\t{epoch_f1}\n'
            f'conll_f1: {avg_conll_f1}')
        # if saves_results:
        #     data_utils.save_predictions(name, cluster_predictions)
        # if name == 'test' and configs.training:
        #     if avg_conll_f1 > self.max_f1:
        #         self.max_f1 = avg_conll_f1
        #         # self.save_ckpt()
        #     # max_f1_file = open(configs.max_f1_path)
        #     # if epoch_f1 > float(max_f1_file.readline().strip()):
        #     max_f1_file.close()
        #     max_f1_file = open(configs.max_f1_path, 'w')
        #     print(epoch_f1, file=max_f1_file)
        #     self.save_ckpt()
        #     # max_f1_file.close()
        # self.lr_scheduler.step(epoch_f1)
        # self.lr_scheduler.step(-avg_epoch_loss)
        return avg_conll_f1
def evaluate(self, session, global_step=None, official_stdout=False, keys=None, eval_mode=False):
    """Evaluate coreference with additional pairwise breakdowns.

    Beyond the standard CorefEvaluator scores, gold/predicted cluster pairs
    are bucketed and scored separately by:
      * span type: pronoun-pronoun (PP), pronoun-NP (PNP), NP-NP (NPNP);
      * span frequency: frequent vs. rare;
      * pronoun type: demonstrative ("demo"), possessive ("pos"), third person.

    Args:
        session: TF session used to run loss and predictions.
        global_step: unused; kept for interface compatibility.
        official_stdout: forwarded to the official CoNLL scorer.
        keys: optional collection of doc_keys; other documents are skipped.
        eval_mode: when True, also run the official CoNLL scorer.

    Returns:
        (summary, f): TF summary from util.make_summary and the python
        average F1 over evaluated documents.
    """
    self.load_eval_data()
    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    losses = []
    doc_keys = []
    num_evaluated = 0

    ##################################################################################################
    ################### WE FURTHER DETECT THE RESULTS SEPERATEDLY FOR P-P; NP-NP, P-NP ###############
    ##################################################################################################
    coref_predictions_pp = {}
    coref_predictions_pnp = {}
    coref_predictions_npnp = {}
    # span type
    coref_evaluator_pp = PairEvaluator()
    coref_evaluator_pnp = PairEvaluator()
    coref_evaluator_npnp = PairEvaluator()
    coref_evaluator_all = PairEvaluator()
    num_coref_pp = 0
    num_coref_pnp = 0
    num_coref_npnp = 0
    num_coref_all = 0
    # span freq
    coref_evaluator_freq = PairEvaluator()
    coref_evaluator_rare = PairEvaluator()
    num_coref_freq = 0
    num_coref_rare = 0
    # pron type
    coref_evaluators_type = dict()
    coref_evaluators_type["demo"], coref_evaluators_type["pos"], coref_evaluators_type["third"] = PairEvaluator(), PairEvaluator(), PairEvaluator()
    nums_coref_type = dict()
    nums_coref_type["demo"], nums_coref_type["pos"], nums_coref_type["third"] = 0, 0, 0

    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
        try:
            _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example
            feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
            if keys is not None and example['doc_key'] not in keys:
                # Restrict evaluation to the requested documents.
                continue
            doc_keys.append(example['doc_key'])
            loss, (candidate_starts, candidate_ends, candidate_mention_scores,
                   top_span_starts, top_span_ends, top_antecedents,
                   top_antecedent_scores) = session.run(
                       [self.loss, self.predictions], feed_dict=feed_dict)
            losses.append(loss)
            predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
            coref_predictions[example["doc_key"]] = self.evaluate_coref(
                top_span_starts, top_span_ends, predicted_antecedents,
                example["clusters"], coref_evaluator)
            if example_num % 10 == 0:
                print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

            #####################################################################################
            # Evaluate on three different settings: NP-NP, NP-P, P-P by using a different cluster
            #####################################################################################
            # Span Type
            flatten_sentences = util.flatten(example["sentences"])
            gold_pp_pairs, gold_pnp_pairs, gold_npnp_pairs, num_pp_pairs, num_pnp_pairs, num_npnp_pairs, num_relation = self.cluster_to_pairs(example["clusters"], flatten_sentences)
            pred_pp_pairs, pred_pnp_pairs, pred_npnp_pairs, _, _, _, _ = self.cluster_to_pairs(coref_predictions[example["doc_key"]], flatten_sentences)

            # Span Frequency
            gold_freq_pnp_pairs, gold_rare_pnp_pairs, num_freq_pairs, num_rare_pairs = self.cluster_to_pair_frequent(example["clusters"], flatten_sentences)
            pred_freq_pnp_pairs, pred_rare_pnp_pairs, _, _ = self.cluster_to_pair_frequent(coref_predictions[example["doc_key"]], flatten_sentences)

            # Pronoun type: demo, pos, third
            gold_type_pairs, gold_type_nums = self.cluster_to_pair_detailed_pronoun(example["clusters"], flatten_sentences)
            pred_type_pairs, pred_type_nums = self.cluster_to_pair_detailed_pronoun(coref_predictions[example["doc_key"]], flatten_sentences)
            # NOTE(review): pronoun-type evaluators receive (gold, pred) while the
            # span-type evaluators below receive (pred, gold) — confirm against
            # PairEvaluator.update's parameter order; one of the two looks swapped.
            for pron_type in ["demo", "pos", "third"]:
                coref_evaluators_type[pron_type].update(gold_type_pairs[pron_type], pred_type_pairs[pron_type])
                nums_coref_type[pron_type] += gold_type_nums[pron_type]

            all_gold = gold_pp_pairs + gold_pnp_pairs + gold_npnp_pairs
            all_pred = pred_pp_pairs + pred_pnp_pairs + pred_npnp_pairs
            coref_evaluator_pp.update(pred_pp_pairs, gold_pp_pairs)
            coref_evaluator_pnp.update(pred_pnp_pairs, gold_pnp_pairs)
            coref_evaluator_npnp.update(pred_npnp_pairs, gold_npnp_pairs)
            coref_evaluator_all.update(all_pred, all_gold)
            coref_evaluator_freq.update(pred_freq_pnp_pairs, gold_freq_pnp_pairs)
            coref_evaluator_rare.update(pred_rare_pnp_pairs, gold_rare_pnp_pairs)

            num_coref_pp += num_pp_pairs
            num_coref_pnp += num_pnp_pairs
            num_coref_npnp += num_npnp_pairs
            num_coref_all = num_coref_all + num_pp_pairs + num_pnp_pairs + num_npnp_pairs
            num_coref_freq += num_freq_pairs
            num_coref_rare += num_rare_pairs
        except Exception as e:
            # Best-effort evaluation: skip documents that fail, but say so
            # instead of swallowing the error silently (the old bare `except`
            # also hid KeyboardInterrupt/SystemExit).
            print("Skipping example {} ({}): {}".format(
                example_num, example.get("doc_key", "?"), e))

    summary_dict = {}
    self.print_prf(coref_evaluator_pp, summary_dict, doc_keys, "PP", num_coref_pp)
    self.print_prf(coref_evaluator_pnp, summary_dict, doc_keys, "PNP", num_coref_pnp)
    self.print_prf(coref_evaluator_npnp, summary_dict, doc_keys, "NPNP", num_coref_npnp)
    self.print_prf(coref_evaluator_freq, summary_dict, doc_keys, "FREQ", num_coref_freq)
    self.print_prf(coref_evaluator_rare, summary_dict, doc_keys, "RARE", num_coref_rare)
    for pron_type in ["demo", "pos", "third"]:
        self.print_prf(coref_evaluators_type[pron_type], summary_dict, doc_keys, pron_type, nums_coref_type[pron_type])
    self.print_prf(coref_evaluator_all, summary_dict, doc_keys, "ALL_PAIRS", num_coref_all)
    #######################################################################################

    print("The evaluation results for all clusters")
    print("The number of pairs is " + str(num_coref_all))
    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(doc_keys)))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))

    if eval_mode:
        conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, self.subtoken_maps, official_stdout)
        average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
        summary_dict["Average F1 (conll)"] = average_f1
        print("Average F1 (conll): {:.2f}%".format(average_f1))

    return util.make_summary(summary_dict), f
def evaluate(self, session, global_step=None, official_stdout=False, keys=None, eval_mode=False):
    """Run coreference evaluation over the cached eval set.

    Feeds each tensorized example through the model, decodes antecedents,
    accumulates predicted clusters per document, and reports python-side
    precision/recall/F1. When ``eval_mode`` is True the official CoNLL
    scorer is run as well.

    Returns:
        (summary, f): TF summary built by util.make_summary and the python
        average F1.
    """
    self.load_eval_data()

    evaluator = metrics.CorefEvaluator()
    clusters_by_doc = {}
    batch_losses = []
    evaluated_doc_keys = []

    for idx, (tensorized_example, example) in enumerate(self.eval_data):
        feed = dict(zip(self.input_tensors, tensorized_example))
        evaluated_doc_keys.append(example['doc_key'])

        batch_loss, outputs = session.run([self.loss, self.predictions], feed_dict=feed)
        # Only the span/antecedent outputs are needed for decoding.
        _, _, _, span_starts, span_ends, antecedents, antecedent_scores = outputs
        batch_losses.append(batch_loss)

        chosen = self.get_predicted_antecedents(antecedents, antecedent_scores)
        clusters_by_doc[example["doc_key"]] = self.evaluate_coref(
            span_starts, span_ends, chosen, example["clusters"], evaluator)

        if idx % 10 == 0:
            print("Evaluated {}/{} examples.".format(idx + 1, len(self.eval_data)))

    summary_dict = {}
    if eval_mode:
        conll_results = conll.evaluate_conll(
            self.config["conll_eval_path"], clusters_by_doc,
            self.subtoken_maps, official_stdout)
        average_f1 = sum(res["f"] for res in conll_results.values()) / len(conll_results)
        summary_dict["Average F1 (conll)"] = average_f1
        print("Average F1 (conll): {:.2f}%".format(average_f1))

    p, r, f = evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(evaluated_doc_keys)))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))
    return util.make_summary(summary_dict), f
def evaluate(self, session, global_step=None, official_stdout=False, keys=None, eval_mode=False):
    """Evaluate coreference on the eval set, optionally restricted to ``keys``.

    For each document (skipping any whose doc_key is not in ``keys`` when
    given), runs the model, decodes predicted clusters, and updates a
    CorefEvaluator. A disabled debug branch can print gold and predicted
    clusters as readable text. With ``eval_mode`` the official CoNLL scorer
    runs too.

    Returns:
        (summary, f): TF summary built by util_xlnet.make_summary and the
        python average F1.
    """
    self.load_eval_data()

    evaluator = metrics.CorefEvaluator()
    clusters_by_doc = {}
    batch_losses = []
    evaluated_doc_keys = []

    for idx, (tensorized_example, example) in enumerate(self.eval_data):
        doc_key = example['doc_key']
        feed = dict(zip(self.input_tensors, tensorized_example))
        if keys is not None and doc_key not in keys:
            # Restrict evaluation to the requested documents.
            continue
        evaluated_doc_keys.append(doc_key)

        batch_loss, outputs = session.run([self.loss, self.predictions], feed_dict=feed)
        _, _, _, span_starts, span_ends, antecedents, antecedent_scores = outputs
        batch_losses.append(batch_loss)

        chosen = self.get_predicted_antecedents(antecedents, antecedent_scores)
        clusters_by_doc[doc_key] = self.evaluate_coref(
            span_starts, span_ends, chosen, example["clusters"], evaluator)

        if idx % 10 == 0:
            print("Evaluated {}/{} examples.".format(idx + 1, len(self.eval_data)))

        # Debug aid (disabled): dump gold and predicted clusters as word spans.
        print_clusters = False
        if print_clusters:
            comb_text = [word for sentence in example['sentences'] for word in sentence]
            print('#### Example Clusters: ####')
            for cluster in example['clusters']:
                mapped = [self.convert_mention(m, comb_text, example) for m in cluster]
                print(mapped, end=",\n")
            print('#### Predicted Clusters: ####')
            for cluster in clusters_by_doc[doc_key]:
                mapped = [self.convert_mention(m, comb_text, example) for m in cluster]
                print(mapped, end=",\n")

    summary_dict = {}
    if eval_mode:
        conll_results = conll.evaluate_conll(
            self.config["conll_eval_path"], clusters_by_doc,
            self.subtoken_maps, official_stdout)
        average_f1 = sum(res["f"] for res in conll_results.values()) / len(conll_results)
        summary_dict["Average F1 (conll)"] = average_f1
        print("Average F1 (conll): {:.2f}%".format(average_f1))

    p, r, f = evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(evaluated_doc_keys)))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))
    return util_xlnet.make_summary(summary_dict), f
def evaluate(self, session, global_step=None, official_stdout=False, keys=None, eval_mode=False, to_npy=None, from_npy=None, rsa_model=None):
    """Evaluate coreference, optionally caching model outputs to/from .npy.

    Three modes:
      * normal: run the TF model per example;
      * ``to_npy``: additionally record per-example outputs and save them;
      * ``from_npy``: skip the TF model and replay recorded outputs.
    When ``rsa_model`` is given, its l1 listener rescoring replaces the raw
    antecedent scores before decoding.

    Args:
        session: TF session (unused when replaying from_npy).
        global_step: unused; kept for interface compatibility.
        official_stdout: forwarded to the official CoNLL scorer.
        keys: optional collection of doc_keys; other documents are skipped.
        eval_mode: when True, also run the official CoNLL scorer.
        to_npy: path to save recorded outputs + metrics (mutually exclusive
            with from_npy).
        from_npy: path to previously saved outputs to replay.
        rsa_model: optional RSA model with an ``l1`` rescoring method.

    Returns:
        (summary, f): TF summary from util.make_summary and the python
        average F1.
    """
    # Fixed message: the old text said "to be none", inverting the condition.
    assert not (to_npy is not None and from_npy is not None), "cannot set both to_npy and from_npy!"
    self.load_eval_data()
    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    losses = []
    doc_keys = []
    num_evaluated = 0
    total_time = 0
    if to_npy:
        data_dicts = []
    if from_npy:
        with open(from_npy, "rb") as f:
            # allow_pickle=True is required: the file stores a pickled dict
            # inside an object array (see the .item() call below). NumPy
            # >= 1.16.3 refuses to unpickle by default.
            from_npy_dict = np.load(f, allow_pickle=True)
            data_dicts = from_npy_dict.item().get("data_dicts")

    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
        _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example
        if from_npy is None:
            feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
            if keys is not None and example['doc_key'] not in keys:
                # Restrict evaluation to the requested documents.
                continue
            doc_keys.append(example['doc_key'])
            loss, (candidate_starts, candidate_ends, candidate_mention_scores,
                   top_span_starts, top_span_ends, top_antecedents,
                   top_antecedent_scores) = \
                session.run([self.loss, self.predictions], feed_dict=feed_dict)
        else:
            # Replay previously recorded outputs instead of running the model.
            data_dict = data_dicts[example_num]
            example = data_dict["example"]
            if keys is not None and example['doc_key'] not in keys:
                continue
            doc_keys.append(example['doc_key'])
            tensorized_example = data_dict["tensorized_example"]
            loss = data_dict["loss"]
            top_span_starts = data_dict["top_span_starts"]
            top_span_ends = data_dict["top_span_ends"]
            top_antecedents = data_dict["top_antecedents"]
            top_antecedent_scores = data_dict["top_antecedent_scores"]
        losses.append(loss)

        if rsa_model is not None:
            # Rescore antecedents with the RSA l1 listener (timed per example).
            print("Running l1 for sentence %d" % example_num)
            start_time = time.time()
            top_antecedent_scores = rsa_model.l1(example, top_span_starts, top_span_ends,
                                                 top_antecedents, top_antecedent_scores)
            duration = time.time() - start_time
            print("Finished sentence %d, took %.2f s" % (example_num, duration))
            total_time += duration
            num_evaluated += 1

        if to_npy:
            data_dict = {
                "example_num": example_num,
                "tensorized_example": tensorized_example,
                "example": example,
                "top_span_starts": top_span_starts,
                "top_span_ends": top_span_ends,
                "top_antecedents": top_antecedents,
                "top_antecedent_scores": top_antecedent_scores,
                "loss": loss,
            }
            data_dicts.append(data_dict)

        predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
        coref_predictions[example["doc_key"]] = self.evaluate_coref(
            top_span_starts, top_span_ends, predicted_antecedents,
            example["clusters"], coref_evaluator)
        if example_num % 10 == 0:
            print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

    summary_dict = {}
    if to_npy:
        dict_to_npy = {"data_dicts": data_dicts}
    if eval_mode:
        conll_results = conll.evaluate_conll(self.config["conll_eval_path"],
                                             coref_predictions,
                                             self.subtoken_maps,
                                             official_stdout)
        average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
        summary_dict["Average F1 (conll)"] = average_f1
        if to_npy:
            dict_to_npy["Average F1 (conll)"] = average_f1
        print("Average F1 (conll): {:.2f}%".format(average_f1))

    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(doc_keys)))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))

    if to_npy:
        dict_to_npy["Average F1 (py)"] = f
        dict_to_npy["Average precision (py)"] = p
        dict_to_npy["Average recall (py)"] = r
        with open(to_npy, "wb") as f_to_npy:
            np.save(f_to_npy, dict_to_npy)

    if rsa_model:
        # Fixed: the old code did `"..." % num_evaluated, total_time / num_evaluated`,
        # passing only one value to a two-placeholder format (TypeError), and
        # divided by zero when no sentence was rescored.
        if num_evaluated:
            print("Ran rsa on %d sentences, avg time per sentence %.2f s"
                  % (num_evaluated, total_time / num_evaluated))
        else:
            print("Ran rsa on 0 sentences")
    return util.make_summary(summary_dict), f
mention_end_dict[doc_key] = mention_ends antecedents_dict[doc_key] = antecedents all_antecedent_scores[doc_key].append(antecedent_scores) if example_num % 10 == 0: print "Computed {}/{} examples.".format( example_num + 1, len(model.eval_data)) mean_antecedent_scores = { doc_key: np.mean(s, 0) for doc_key, s in all_antecedent_scores.items() } merged_predictions = {} coref_evaluator = metrics.CorefEvaluator() for example_num, (tensorized_example, example) in enumerate(model.eval_data): doc_key = example["doc_key"] mention_starts = mention_start_dict[doc_key] mention_ends = mention_end_dict[doc_key] antecedents = antecedents_dict[doc_key] antecedent_scores = mean_antecedent_scores[doc_key] predicted_antecedents = [] for i, index in enumerate( np.argmax(antecedent_scores, axis=1) - 1): if index < 0: predicted_antecedents.append(-1) else: predicted_antecedents.append(antecedents[i, index]) merged_predictions[doc_key] = model.evaluate_coref(