def train():
  global max_f1
  with tf.Session(config=util.gpu_config()) as session:
    session.run(tf.global_variables_initializer())
    model.start_enqueue_thread(session)
    accumulated_loss = 0.0
    ckpt = tf.train.get_checkpoint_state(log_dir)
    if ckpt and ckpt.model_checkpoint_path:
      print("Restoring from: {}".format(ckpt.model_checkpoint_path))
      saver.restore(session, ckpt.model_checkpoint_path)
    initial_time = time.time()
    while True:
      tf_loss, tf_global_step, _ = session.run([model.loss, model.global_step, model.train_op])
      accumulated_loss += tf_loss
      if tf_global_step % report_frequency == 0:
        total_time = time.time() - initial_time
        steps_per_second = tf_global_step / total_time
        average_loss = accumulated_loss / report_frequency
        print("[{}] loss={:.2f}, steps/s={:.2f}".format(tf_global_step, average_loss, steps_per_second))
        writer.add_summary(util.make_summary({"loss": average_loss}), tf_global_step)
        accumulated_loss = 0.0
      if tf_global_step % save_frequency == 0:
        saver.save(session, os.path.join(log_dir, "model"), global_step=tf_global_step)
      if tf_global_step % eval_frequency == 0:
        eval_summary, eval_f1 = model.evaluate(session)
        if eval_f1 > max_f1:
          max_f1 = eval_f1
          util.copy_checkpoint(os.path.join(log_dir, "model-{}".format(tf_global_step)),
                               os.path.join(log_dir, "model.max.ckpt"))
        writer.add_summary(eval_summary, tf_global_step)
        writer.add_summary(util.make_summary({"max_eval_f1": max_f1}), tf_global_step)
        print("[{}] eval_f1={:.2f}, max_f1={:.2f}".format(tf_global_step, eval_f1, max_f1))
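# NOTE: several of these snippets call util.make_summary to log scalar metrics.
# A minimal sketch of what such a helper could look like in TF1 (an assumption,
# not necessarily the repo's exact code):
import tensorflow as tf

def make_summary(value_dict):
  # Wrap a dict of scalar metrics in a tf.Summary proto so it can be passed
  # to tf.summary.FileWriter.add_summary at a given global step.
  return tf.Summary(value=[
      tf.Summary.Value(tag=tag, simple_value=value)
      for tag, value in value_dict.items()
  ])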
def evaluate(self, session, official_stdout=False):
  self.load_eval_data()
  coref_predictions = {}
  coref_evaluator = metrics.CorefEvaluator()
  for example_num, (tensorized_example, example) in enumerate(self.eval_data):
    _, _, _, _, _, _, _, _, _, gold_starts, gold_ends, _ = tensorized_example
    feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
    candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores = session.run(self.predictions, feed_dict=feed_dict)
    predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
    coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)
    if example_num % 10 == 0:
      print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))
  summary_dict = {}
  conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, official_stdout)
  average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
  summary_dict["Average F1 (conll)"] = average_f1
  print("Average F1 (conll): {:.2f}%".format(average_f1))
  p, r, f = coref_evaluator.get_prf()
  summary_dict["Average F1 (py)"] = f
  print("Average F1 (py): {:.2f}%".format(f * 100))
  summary_dict["Average precision (py)"] = p
  print("Average precision (py): {:.2f}%".format(p * 100))
  summary_dict["Average recall (py)"] = r
  print("Average recall (py): {:.2f}%".format(r * 100))
  return util.make_summary(summary_dict), average_f1
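# NOTE: get_predicted_antecedents is used above but not shown. A minimal sketch
# under the usual convention that column 0 of antecedent_scores is the dummy
# antecedent (so a shifted index of -1 means "no antecedent"):
import numpy as np

def get_predicted_antecedents(antecedents, antecedent_scores):
  # antecedent_scores: (k, c + 1), where column 0 scores the dummy antecedent.
  predicted_antecedents = []
  for i, index in enumerate(np.argmax(antecedent_scores, axis=1) - 1):
    if index < 0:
      predicted_antecedents.append(-1)  # dummy wins: the span starts a new cluster
    else:
      predicted_antecedents.append(antecedents[i, index])
  return predicted_antecedents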
def evaluate(self, session, official_stdout=False):
  self.load_eval_data()

  def _k_to_tag(k):
    if k == -3:
      return "oracle"
    elif k == -2:
      return "actual"
    elif k == -1:
      return "exact"
    elif k == 0:
      return "threshold"
    else:
      return "{}%".format(k)

  mention_evaluators = {k: util.RetrievalEvaluator() for k in [-3, -2, -1, 0, 10, 15, 20, 25, 30, 40, 50]}
  coref_predictions = {}
  coref_evaluator = metrics.CorefEvaluator()
  for example_num, (tensorized_example, example) in enumerate(self.eval_data):
    _, _, _, _, _, _, gold_starts, gold_ends, _, tag_labels, tag_seq, tag_loss_label = tensorized_example
    feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
    candidate_starts, candidate_ends, mention_scores, mention_starts, mention_ends, antecedents, antecedent_scores, tag_outputs, tag_seq = session.run(self.predictions, feed_dict=feed_dict)
    self.evaluate_mentions(candidate_starts, candidate_ends, mention_starts, mention_ends, mention_scores, gold_starts, gold_ends, example, mention_evaluators)
    predicted_antecedents = self.get_predicted_antecedents(antecedents, antecedent_scores)
    coref_predictions[example["doc_key"]] = self.evaluate_coref(mention_starts, mention_ends, predicted_antecedents, example["clusters"], coref_evaluator)
    if example_num % 10 == 0:
      print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))
      # print(tag_outputs)
      # print(tag_seq)

  summary_dict = {}
  for k, evaluator in sorted(mention_evaluators.items(), key=operator.itemgetter(0)):
    tags = ["{} @ {}".format(t, _k_to_tag(k)) for t in ("R", "P", "F")]
    results_to_print = []
    for t, v in zip(tags, evaluator.metrics()):
      results_to_print.append("{:<10}: {:.2f}".format(t, v))
      summary_dict[t] = v
    print(", ".join(results_to_print))

  conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, official_stdout)
  average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
  summary_dict["Average F1 (conll)"] = average_f1
  print("Average F1 (conll): {:.2f}%".format(average_f1))
  p, r, f = coref_evaluator.get_prf()
  summary_dict["Average F1 (py)"] = f
  print("Average F1 (py): {:.2f}%".format(f * 100))
  summary_dict["Average precision (py)"] = p
  print("Average precision (py): {:.2f}%".format(p * 100))
  summary_dict["Average recall (py)"] = r
  print("Average recall (py): {:.2f}%".format(r * 100))
  return util.make_summary(summary_dict), average_f1
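# NOTE: the util.RetrievalEvaluator instances keyed by k above accumulate
# recall/precision of retrieved spans at different pruning budgets. A minimal
# sketch of the interface the code relies on (metrics() returning R, P, F);
# the internals here are an assumption:
class RetrievalEvaluator(object):
  # Accumulates micro-averaged retrieval statistics over documents.
  def __init__(self):
    self._num_correct = 0
    self._num_gold = 0
    self._num_predicted = 0

  def update(self, gold_set, predicted_set):
    self._num_correct += len(gold_set & predicted_set)
    self._num_gold += len(gold_set)
    self._num_predicted += len(predicted_set)

  def metrics(self):
    recall = self._num_correct / max(1, self._num_gold)
    precision = self._num_correct / max(1, self._num_predicted)
    f1 = 2 * precision * recall / max(1e-10, precision + recall)
    return recall, precision, f1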
def main_csv_reader(args):
  path_to_coffee = args.path_to_coffee
  path_to_matched = args.matched_json
  all_people_list = flat_list(list(read_csv_file(path_to_coffee)))
  matched_in_this_session = []
  error = False
  if path_to_matched:
    try:
      matched_people_json = read_json_file(path_to_matched)
      tuple_list = create_tuple_list(all_people_list, matched_people_json)
      sorted_people_list = sort_tuple_list(tuple_list)
    except Exception:
      raise ValueError('Only use the program-generated matched_people.json file')
  else:
    write_json_file()
    matched_people_json = read_json_file('matched_people.json')
    sorted_people_list = all_people_list
  unmatched_people = []
  for person in sorted_people_list:
    if person in matched_in_this_session:
      continue
    individual_match_list = invidual_preproc(person, all_people_list, matched_people_json, matched_in_this_session)
    if individual_match_list:
      matched_pair = coffee_roulette(person, individual_match_list)
      if matched_pair is not None:
        for matched_person in matched_pair:
          matched_in_this_session.append(matched_person)
      else:
        error = True
        break
    else:
      unmatched_people.append(person)
  if error is False:
    create_today_matched(matched_in_this_session)
    if unmatched_people:
      create_today_unmatched(unmatched_people)
    updated_json = update_current_json(matched_people_json, matched_in_this_session)
    summary = "\n{} Matches".format(date.today())
    summary = create_matched_people_string(matched_in_this_session, summary)
    summary_message, alone = make_summary(matched_in_this_session, unmatched_people, summary, "")
    summary += alone
    write_json_file(updated_json)
    write_txt_file(summary)
    print(summary_message)
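# NOTE: the script leans on I/O helpers (read_csv_file, flat_list) that are not
# shown. Hypothetical minimal versions, assuming the sign-up sheet is a plain
# CSV of names; these are illustrations, not the project's actual helpers:
import csv

def read_csv_file(path):
  # Yield each row of the sign-up CSV (hypothetical helper).
  with open(path, newline='') as f:
    for row in csv.reader(f):
      yield row

def flat_list(rows):
  # Flatten CSV rows into a single list of non-empty names.
  return [name.strip() for row in rows for name in row if name.strip()]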
def evaluate(self, session, official_stdout=False):
  self.load_eval_data()

  def _k_to_tag(k):
    if k == -3:
      return "oracle"
    elif k == -2:
      return "actual"
    elif k == -1:
      return "exact"
    elif k == 0:
      return "threshold"
    else:
      return "{}%".format(k)

  mention_evaluators = {k: util.RetrievalEvaluator() for k in [-3, -2, -1, 0, 10, 15, 20, 25, 30, 40, 50]}
  coref_predictions = {}
  coref_evaluator = metrics.CorefEvaluator()
  for example_num, (tensorized_example, example) in enumerate(self.eval_data):
    _, _, _, _, _, _, gold_starts, gold_ends, _ = tensorized_example
    feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
    candidate_starts, candidate_ends, mention_scores, mention_starts, mention_ends, antecedents, antecedent_scores = session.run(self.predictions, feed_dict=feed_dict)
    self.evaluate_mentions(candidate_starts, candidate_ends, mention_starts, mention_ends, mention_scores, gold_starts, gold_ends, example, mention_evaluators)
    predicted_antecedents = self.get_predicted_antecedents(antecedents, antecedent_scores)
    coref_predictions[example["doc_key"]] = self.evaluate_coref(mention_starts, mention_ends, predicted_antecedents, example["clusters"], coref_evaluator)
    if example_num % 10 == 0:
      print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

  summary_dict = {}
  for k, evaluator in sorted(mention_evaluators.items(), key=operator.itemgetter(0)):
    tags = ["{} @ {}".format(t, _k_to_tag(k)) for t in ("R", "P", "F")]
    results_to_print = []
    for t, v in zip(tags, evaluator.metrics()):
      results_to_print.append("{:<10}: {:.2f}".format(t, v))
      summary_dict[t] = v
    print(", ".join(results_to_print))

  conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, official_stdout)
  average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
  summary_dict["Average F1 (conll)"] = average_f1
  print("Average F1 (conll): {:.2f}%".format(average_f1))
  p, r, f = coref_evaluator.get_prf()
  summary_dict["Average F1 (py)"] = f
  print("Average F1 (py): {:.2f}%".format(f * 100))
  summary_dict["Average precision (py)"] = p
  print("Average precision (py): {:.2f}%".format(p * 100))
  summary_dict["Average recall (py)"] = r
  print("Average recall (py): {:.2f}%".format(r * 100))
  return util.make_summary(summary_dict), average_f1
def evaluate(self, session, official_stdout=False, eval_mode=False):
  self.load_eval_data()
  coref_predictions = {}
  for example_num, (tensorized_example, example) in enumerate(self.eval_data):
    _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example
    feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
    candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends, \
        top_antecedents, top_antecedent_scores = session.run(self.predictions, feed_dict=feed_dict)
    """
    candidate_starts: (num_words, max_span_width) start indices of all candidate spans
    candidate_ends: (num_words, max_span_width) end indices of all candidate spans
    candidate_mention_scores: (num_candidates,) scores of the candidate spans
    top_span_starts: (k,) start indices of the candidates kept after mention pruning
    top_span_ends: (k,) end indices of the candidates kept after mention pruning
    top_antecedents: (k, c) index of each candidate antecedent after coarse pruning
    top_antecedent_scores: (k, c) score of each candidate antecedent after coarse pruning
    """
    predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
    coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"])
    if (example_num + 1) % 100 == 0:
      print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

  summary_dict = {}
  if eval_mode:
    # When evaluating on the test set, also score with the official CoNLL script.
    conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, self.subtoken_maps, official_stdout)
    average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
    summary_dict["Average F1 (conll)"] = average_f1
    print("Average F1 (conll): {:.2f}%".format(average_f1))

  p, r, f = self.coref_evaluator.get_prf()
  summary_dict["Average F1 (py)"] = f
  print("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(self.eval_data)))
  summary_dict["Average precision (py)"] = p
  print("Average precision (py): {:.2f}%".format(p * 100))
  summary_dict["Average recall (py)"] = r
  print("Average recall (py): {:.2f}%".format(r * 100))
  return util.make_summary(summary_dict), f
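# NOTE: evaluate_coref converts span predictions into clusters and feeds them
# to the evaluator. A sketch of the usual shape of this method (the variant
# above uses a self.coref_evaluator member instead of an argument):
def evaluate_coref(self, top_span_starts, top_span_ends, predicted_antecedents, gold_clusters, evaluator):
  gold_clusters = [tuple(tuple(m) for m in gc) for gc in gold_clusters]
  mention_to_gold = {}
  for gc in gold_clusters:
    for mention in gc:
      mention_to_gold[mention] = gc
  predicted_clusters, mention_to_predicted = self.get_predicted_clusters(
      top_span_starts, top_span_ends, predicted_antecedents)
  evaluator.update(predicted_clusters, gold_clusters, mention_to_predicted, mention_to_gold)
  return predicted_clusters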
def evaluate(self, session, official_stdout=False):
  self.load_eval_data()
  coref_predictions = {}
  coref_evaluator = metrics.CorefEvaluator()
  avg_loss = 0.0
  for example_num, (tensorized_example, example) in enumerate(self.eval_data):
    _, _, _, _, _, _, _, _, _, gold_starts, gold_ends, _, _, _, _ = tensorized_example
    feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
    predictions, loss = session.run([self.predictions, self.loss], feed_dict=feed_dict)
    candidate_starts, candidate_ends, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores, _, _ = predictions
    predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
    coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)
    if example_num % 20 == 0:
      print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))
    avg_loss += loss
  avg_loss = avg_loss / len(self.eval_data)

  summary_dict = {}
  conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, official_stdout)
  # NOTE: this stores the official_stdout flag under "gold", not the gold clusters.
  cluster_result = {'prediction': coref_predictions, 'gold': official_stdout}
  with open('evaluate_result.pickle', 'wb') as handle:
    pickle.dump(cluster_result, handle, protocol=pickle.HIGHEST_PROTOCOL)
  average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
  summary_dict["Average F1 (conll)"] = average_f1
  print("Average F1 (conll): {:.2f}%".format(average_f1))
  p, r, f = coref_evaluator.get_prf()
  summary_dict["Average F1 (py)"] = f
  print("Average F1 (py): {:.2f}%".format(f * 100))
  summary_dict["Average precision (py)"] = p
  print("Average precision (py): {:.2f}%".format(p * 100))
  summary_dict["Average recall (py)"] = r
  print("Average recall (py): {:.2f}%".format(r * 100))
  summary_dict["Validation loss"] = avg_loss
  print("Validation loss: {:.3f}".format(avg_loss))
  return util.make_summary(summary_dict), average_f1, avg_loss
def evaluate(self, session, official_stdout=False):
  with open(self.config["inv_mapping"], 'rb') as handle:
    inv_mapping = pickle.load(handle)
  with open(self.config["eval_path"], 'rb') as handle:
    test = pickle.load(handle)
  coref_predictions = {}
  coref_evaluator = metrics.CorefEvaluator()
  for i in range(len(test)):
    example = test[i]
    file_name = example["doc_key"]
    inv_map = inv_mapping[file_name]
    tensorized_example = self.tensorize_example(example, is_training=False)
    feed_dict = dict(zip(self.input_tensors, tensorized_example))
    _, _, _, _, _, _, _, _, _, gold_starts, gold_ends, _, _, _ = tensorized_example
    # feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
    candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores = session.run(self.predictions, feed_dict=feed_dict)
    top_span_starts = inv_map[top_span_starts]
    top_span_ends = inv_map[top_span_ends]
    predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
    coref_predictions[file_name] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)
    if i % 10 == 0:
      print("Evaluated {}/{} examples.".format(i + 1, len(test)))

  summary_dict = {}
  conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, official_stdout)
  average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
  summary_dict["Average F1 (conll)"] = average_f1
  print("Average F1 (conll): {:.2f}%".format(average_f1))
  p, r, f = coref_evaluator.get_prf()
  summary_dict["Average F1 (py)"] = f
  print("Average F1 (py): {:.2f}%".format(f * 100))
  summary_dict["Average precision (py)"] = p
  print("Average precision (py): {:.2f}%".format(p * 100))
  summary_dict["Average recall (py)"] = r
  print("Average recall (py): {:.2f}%".format(r * 100))
  return util.make_summary(summary_dict), average_f1
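# NOTE: for completeness, a sketch of get_predicted_clusters, the step that
# turns per-span antecedent decisions into clusters by unioning each span with
# its predicted antecedent (internals assumed):
def get_predicted_clusters(self, top_span_starts, top_span_ends, predicted_antecedents):
  mention_to_predicted = {}
  predicted_clusters = []
  for i, predicted_index in enumerate(predicted_antecedents):
    if predicted_index < 0:
      continue  # dummy antecedent: no link for this span
    predicted_antecedent = (int(top_span_starts[predicted_index]), int(top_span_ends[predicted_index]))
    if predicted_antecedent in mention_to_predicted:
      cluster_id = mention_to_predicted[predicted_antecedent]
    else:
      cluster_id = len(predicted_clusters)
      predicted_clusters.append([predicted_antecedent])
      mention_to_predicted[predicted_antecedent] = cluster_id
    mention = (int(top_span_starts[i]), int(top_span_ends[i]))
    predicted_clusters[cluster_id].append(mention)
    mention_to_predicted[mention] = cluster_id
  predicted_clusters = [tuple(pc) for pc in predicted_clusters]
  mention_to_predicted = {m: predicted_clusters[cid] for m, cid in mention_to_predicted.items()}
  return predicted_clusters, mention_to_predicted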
def evaluate_mention_proposal(self, session, official_stdout=False, eval_mode=False):
  self.load_eval_data()
  summary_dict = {}
  tp = 0
  fp = 0
  fn = 0
  epsilon = 1e-10
  for example_num, (tensorized_example, example) in enumerate(self.eval_data):
    _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example
    feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
    pred_labels, gold_labels = session.run([self.pred_mention_labels, self.gold_mention_labels], feed_dict=feed_dict)
    tp += np.logical_and(pred_labels, gold_labels).sum()
    fp += np.logical_and(pred_labels, np.logical_not(gold_labels)).sum()
    fn += np.logical_and(np.logical_not(pred_labels), gold_labels).sum()
    if (example_num + 1) % 100 == 0:
      print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

  p = tp / (tp + fp + epsilon)
  r = tp / (tp + fn + epsilon)
  f = 2 * p * r / (p + r + epsilon)
  summary_dict["Average F1 (py)"] = f
  print("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(self.eval_data)))
  summary_dict["Average precision (py)"] = p
  print("Average precision (py): {:.2f}%".format(p * 100))
  summary_dict["Average recall (py)"] = r
  print("Average recall (py): {:.2f}%".format(r * 100))
  return util.make_summary(summary_dict), f
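# NOTE: the tp/fp/fn bookkeeping above works element-wise on boolean label
# arrays. A small self-contained check of the identities it relies on:
import numpy as np

pred = np.array([True, True, False, False])
gold = np.array([True, False, True, False])
tp = np.logical_and(pred, gold).sum()                  # 1: predicted and gold
fp = np.logical_and(pred, np.logical_not(gold)).sum()  # 1: predicted but not gold
fn = np.logical_and(np.logical_not(pred), gold).sum()  # 1: gold but missed
assert tp + fp == pred.sum() and tp + fn == gold.sum()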
def evaluate(self, session):
  self.load_eval_data()
  tp, fn, fp = 0, 0, 0
  for example_num, (tensorized_example, example) in enumerate(self.eval_data):
    feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
    top_span_starts, top_span_ends = session.run(self.predictions, feed_dict=feed_dict)
    gold_mentions = set([(m[0], m[1]) for cl in example["clusters"] for m in cl])
    pred_mentions = set([(s, e) for s, e in zip(top_span_starts, top_span_ends)])
    tp += len(gold_mentions & pred_mentions)
    fn += len(gold_mentions - pred_mentions)
    fp += len(pred_mentions - gold_mentions)
    if example_num % 10 == 0:
      print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))
  m_r = float(tp) / (tp + fn)
  m_p = float(tp) / (tp + fp)
  m_f1 = 2.0 * m_r * m_p / (m_r + m_p)
  print("Mention F1: {:.2f}%".format(m_f1 * 100))
  print("Mention recall: {:.2f}%".format(m_r * 100))
  print("Mention precision: {:.2f}%".format(m_p * 100))
  summary_dict = {}
  summary_dict["Mention F1"] = m_f1
  summary_dict["Mention recall"] = m_r
  summary_dict["Mention precision"] = m_p
  return util.make_summary(summary_dict), m_r
# print "use_gpu", use_gpu accumulated_loss += tf_loss acc_total_loss += total_loss acc_domain_loss += domain_loss acc_dmrm += domain_loss_reduce_mean if tf_global_step % report_frequency == 0: total_time = time.time() - initial_time steps_per_second = tf_global_step / total_time average_loss = accumulated_loss / report_frequency print "[{}] loss={:.2f}, steps/s={:.2f}".format( tf_global_step, average_loss, steps_per_second) writer.add_summary( util.make_summary({"original loss": average_loss}), tf_global_step) accumulated_loss = 0.0 average_domain_loss = acc_domain_loss / report_frequency print "[{}] domain_loss={:.2f}, steps/s={:.2f}".format( tf_global_step, average_domain_loss, steps_per_second) writer.add_summary( util.make_summary({"domain loss": average_domain_loss}), tf_global_step) acc_domain_loss = 0.0 average_domain_loss_rm = acc_dmrm / report_frequency print "[{}] domain_loss_reduce_mean={:.2f}, steps/s={:.2f}".format( tf_global_step, average_domain_loss_rm, steps_per_second) writer.add_summary(
    R = (reward_val * j) + 0.99 * R
    pg_reward[i][j] = R
feed_dict[model.pg_reward] = pg_reward + eps
tf_loss, tf_global_step, _ = session.run([model.loss, model.global_step, model.train_op], feed_dict=feed_dict)
accumulated_loss += tf_loss
if tf_global_step % report_frequency == 0:
  total_time = time.time() - initial_time
  steps_per_second = tf_global_step / total_time
  average_loss = accumulated_loss / report_frequency
  print("[{}] loss={:.2f}, steps/s={:.2f}".format(tf_global_step, average_loss, steps_per_second))
  writer.add_summary(util.make_summary({"loss": average_loss}), tf_global_step)
  accumulated_loss = 0.0
if tf_global_step % eval_frequency == 0:
  eval_frequency = report_frequency = np.random.randint(1, 11)
  saver.save(session, os.path.join(log_dir, "model"), global_step=tf_global_step)
  try:
    eval_summary, eval_f1 = model.evaluate(session)
  except Exception:
    # Most time is spent here, so there is a high chance that the timeout
    # exception from reward computation is caught here; retry once.
    eval_summary, eval_f1 = model.evaluate(session)
  if eval_f1 > max_f1:
    max_f1 = eval_f1
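# NOTE: the recursion at the top of this fragment (R = r + 0.99 * R) is the
# standard discounted-return accumulation used in policy-gradient training.
# A small stand-alone version, with names of my own choosing:
import numpy as np

def discounted_returns(rewards, gamma=0.99):
  # Walk right-to-left so that R_t = r_t + gamma * R_{t+1}.
  R = 0.0
  returns = np.zeros(len(rewards))
  for t in range(len(rewards) - 1, -1, -1):
    R = rewards[t] + gamma * R
    returns[t] = R
  return returns

print(discounted_returns([0.0, 0.0, 1.0]))  # approximately [0.9801, 0.99, 1.0]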
def main():
  config = util.initialize_from_env()
  report_frequency = config["report_frequency"]
  eval_frequency = config["eval_frequency"]
  model = util.get_model(config)
  saver = tf.train.Saver()
  log_dir = config["log_dir"]
  max_steps = config['num_epochs'] * config['num_docs']
  writer = tf.summary.FileWriter(log_dir, flush_secs=20)
  max_f1 = 0
  mode = 'w'
  with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    model.start_enqueue_thread(session)
    accumulated_loss = 0.0
    initial_step = 0
    ckpt = tf.train.get_checkpoint_state(log_dir)
    if ckpt and ckpt.model_checkpoint_path:
      print("Restoring from: {}".format(ckpt.model_checkpoint_path))
      saver.restore(session, ckpt.model_checkpoint_path)
      mode = 'a'
      initial_step = int(os.path.basename(ckpt.model_checkpoint_path).split('-')[1])
    fh = logging.FileHandler(os.path.join(log_dir, 'stdout.log'), mode=mode)
    fh.setFormatter(logging.Formatter(format))
    logger.addHandler(fh)
    initial_time = time.time()
    while True:
      tf_loss, tf_global_step, _ = session.run([model.loss, model.global_step, model.train_op])
      accumulated_loss += tf_loss
      # print('tf global_step', tf_global_step)
      if tf_global_step % report_frequency == 0:
        steps_per_second = (tf_global_step - initial_step) / (time.time() - initial_time)
        average_loss = accumulated_loss / report_frequency
        logger.info("[{}] loss={:.2f}, steps/s={:.2f}".format(tf_global_step, average_loss, steps_per_second))
        writer.add_summary(util.make_summary({"loss": average_loss}), tf_global_step)
        accumulated_loss = 0.0
      if tf_global_step % eval_frequency == 0:
        eval_summary, eval_f1 = model.evaluate(session)
        if eval_f1 > max_f1:
          max_f1 = eval_f1
          saver.save(session, os.path.join(log_dir, "model"), global_step=tf_global_step)
          util.copy_checkpoint(os.path.join(log_dir, "model-{}".format(tf_global_step)),
                               os.path.join(log_dir, "model.max.ckpt"))
        writer.add_summary(eval_summary, tf_global_step)
        writer.add_summary(util.make_summary({"max_eval_f1": max_f1}), tf_global_step)
        logger.info("[{}] eval_f1={:.4f}, max_f1={:.4f}".format(tf_global_step, eval_f1, max_f1))
      if tf_global_step > max_steps:
        break
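# NOTE: util.copy_checkpoint has to account for TF1 checkpoints being a family
# of files sharing a prefix. A plausible sketch (the shard suffix is an
# assumption about how the checkpoint was written):
import shutil

def copy_checkpoint(source, target):
  # A TF1 checkpoint "path" is a prefix; copy each component file.
  for ext in (".index", ".data-00000-of-00001"):
    shutil.copyfile(source + ext, target + ext)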
# Results on the training data.
if predict > config["result_metric"]:
  pred.append(1)
else:
  pred.append(0)
true.append(label)
if tf_global_step % report_frequency == 0:
  total_time = time.time() - initial_time
  steps_per_second = tf_global_step / total_time
  average_loss = accumulated_loss / report_frequency
  print("[{}] loss={:.2f}, steps/s={:.2f}".format(tf_global_step, average_loss, steps_per_second))
  writer.add_summary(util.make_summary({"loss": average_loss}), tf_global_step)
  accumulated_loss = 0.0
if tf_global_step % eval_frequency == 0:
  # Evaluation metrics on the training set.
  train_accuracy = metrics.accuracy_score(true, pred)
  train_precision_macro = metrics.precision_score(true, pred, average='macro')
  train_recall_macro = metrics.recall_score(true, pred, average='macro')
  train_f = metrics.f1_score(true, pred, average='macro')
  summary_dict = {}
def evaluate(self, session, global_step=None, official_stdout=False, keys=None, eval_mode=False):
  self.load_eval_data()
  coref_predictions = {}
  coref_evaluator = metrics.CorefEvaluator()
  losses = []
  doc_keys = []
  num_evaluated = 0

  ##################################################################################################
  ################## WE FURTHER REPORT THE RESULTS SEPARATELY FOR P-P, NP-NP, P-NP #################
  ##################################################################################################
  coref_predictions_pp = {}
  coref_predictions_pnp = {}
  coref_predictions_npnp = {}
  # Span type.
  coref_evaluator_pp = PairEvaluator()
  coref_evaluator_pnp = PairEvaluator()
  coref_evaluator_npnp = PairEvaluator()
  coref_evaluator_all = PairEvaluator()
  num_coref_pp = 0
  num_coref_pnp = 0
  num_coref_npnp = 0
  num_coref_all = 0
  # Span frequency.
  coref_evaluator_freq = PairEvaluator()
  coref_evaluator_rare = PairEvaluator()
  num_coref_freq = 0
  num_coref_rare = 0
  # Pronoun type.
  coref_evaluators_type = dict()
  coref_evaluators_type["demo"], coref_evaluators_type["pos"], coref_evaluators_type["third"] = PairEvaluator(), PairEvaluator(), PairEvaluator()
  nums_coref_type = dict()
  nums_coref_type["demo"], nums_coref_type["pos"], nums_coref_type["third"] = 0, 0, 0

  count = 0
  for example_num, (tensorized_example, example) in enumerate(self.eval_data):
    try:
      # count += 1
      # if count == 10:
      #   break
      _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example
      feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
      # if tensorized_example[0].shape[0] <= 9:
      if keys is not None and example['doc_key'] not in keys:
        # print('Skipping...', example['doc_key'], tensorized_example[0].shape)
        continue
      doc_keys.append(example['doc_key'])
      loss, (candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends,
             top_antecedents, top_antecedent_scores) = session.run([self.loss, self.predictions], feed_dict=feed_dict)
      # losses.append(session.run(self.loss, feed_dict=feed_dict))
      losses.append(loss)
      predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
      coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)
      if example_num % 10 == 0:
        print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

      #####################################################################################
      # Evaluate on three different settings: NP-NP, NP-P, P-P by using a different cluster
      #####################################################################################
      # Span type.
      flatten_sentences = util.flatten(example["sentences"])
      gold_pp_pairs, gold_pnp_pairs, gold_npnp_pairs, num_pp_pairs, num_pnp_pairs, num_npnp_pairs, num_relation = self.cluster_to_pairs(example["clusters"], flatten_sentences)
      # predicted_clusters = coref_predictions[example["doc_key"]]
      pred_pp_pairs, pred_pnp_pairs, pred_npnp_pairs, _, _, _, _ = self.cluster_to_pairs(coref_predictions[example["doc_key"]], flatten_sentences)
      # Span frequency.
      gold_freq_pnp_pairs, gold_rare_pnp_pairs, num_freq_pairs, num_rare_pairs = self.cluster_to_pair_frequent(example["clusters"], flatten_sentences)
      pred_freq_pnp_pairs, pred_rare_pnp_pairs, _, _ = self.cluster_to_pair_frequent(coref_predictions[example["doc_key"]], flatten_sentences)
      # Pronoun type: demonstrative, possessive, third-person.
      gold_type_pairs, gold_type_nums = self.cluster_to_pair_detailed_pronoun(example["clusters"], flatten_sentences)
      pred_type_pairs, pred_type_nums = self.cluster_to_pair_detailed_pronoun(coref_predictions[example["doc_key"]], flatten_sentences)
      for pron_type in ["demo", "pos", "third"]:
        # NOTE: the argument order here (gold, pred) differs from the update(pred, gold) calls below.
        coref_evaluators_type[pron_type].update(gold_type_pairs[pron_type], pred_type_pairs[pron_type])
        nums_coref_type[pron_type] += gold_type_nums[pron_type]
      all_gold = gold_pp_pairs + gold_pnp_pairs + gold_npnp_pairs
      all_pred = pred_pp_pairs + pred_pnp_pairs + pred_npnp_pairs
      coref_evaluator_pp.update(pred_pp_pairs, gold_pp_pairs)
      coref_evaluator_pnp.update(pred_pnp_pairs, gold_pnp_pairs)
      coref_evaluator_npnp.update(pred_npnp_pairs, gold_npnp_pairs)
      coref_evaluator_all.update(all_pred, all_gold)
      coref_evaluator_freq.update(pred_freq_pnp_pairs, gold_freq_pnp_pairs)
      coref_evaluator_rare.update(pred_rare_pnp_pairs, gold_rare_pnp_pairs)
      num_coref_pp += num_pp_pairs
      num_coref_pnp += num_pnp_pairs
      num_coref_npnp += num_npnp_pairs
      num_coref_all = num_coref_all + num_pp_pairs + num_pnp_pairs + num_npnp_pairs
      num_coref_freq += num_freq_pairs
      num_coref_rare += num_rare_pairs
    except Exception:
      # Skip examples that fail to evaluate.
      pass

  summary_dict = {}
  self.print_prf(coref_evaluator_pp, summary_dict, doc_keys, "PP", num_coref_pp)
  self.print_prf(coref_evaluator_pnp, summary_dict, doc_keys, "PNP", num_coref_pnp)
  self.print_prf(coref_evaluator_npnp, summary_dict, doc_keys, "NPNP", num_coref_npnp)
  self.print_prf(coref_evaluator_freq, summary_dict, doc_keys, "FREQ", num_coref_freq)
  self.print_prf(coref_evaluator_rare, summary_dict, doc_keys, "RARE", num_coref_rare)
  for pron_type in ["demo", "pos", "third"]:
    self.print_prf(coref_evaluators_type[pron_type], summary_dict, doc_keys, pron_type, nums_coref_type[pron_type])
  self.print_prf(coref_evaluator_all, summary_dict, doc_keys, "ALL_PAIRS", num_coref_all)
  #######################################################################################
  # summary_dict = {}
  print("The evaluation results for all clusters")
  print("The number of pairs is " + str(num_coref_all))
  p, r, f = coref_evaluator.get_prf()
  summary_dict["Average F1 (py)"] = f
  print("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(doc_keys)))
  summary_dict["Average precision (py)"] = p
  print("Average precision (py): {:.2f}%".format(p * 100))
  summary_dict["Average recall (py)"] = r
  print("Average recall (py): {:.2f}%".format(r * 100))
  if eval_mode:
    conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, self.subtoken_maps, official_stdout)
    average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
    summary_dict["Average F1 (conll)"] = average_f1
    print("Average F1 (conll): {:.2f}%".format(average_f1))
  return util.make_summary(summary_dict), f
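# NOTE: PairEvaluator above is consumed through update(pred_pairs, gold_pairs)
# and a P/R/F getter. A minimal set-based sketch consistent with that usage;
# the internals are an assumption:
class PairEvaluator(object):
  # Micro-averaged precision/recall/F1 over coreference pairs.
  def __init__(self):
    self.tp = 0
    self.num_pred = 0
    self.num_gold = 0

  def update(self, pred_pairs, gold_pairs):
    pred, gold = set(pred_pairs), set(gold_pairs)
    self.tp += len(pred & gold)
    self.num_pred += len(pred)
    self.num_gold += len(gold)

  def get_prf(self):
    p = self.tp / self.num_pred if self.num_pred else 0.0
    r = self.tp / self.num_gold if self.num_gold else 0.0
    f = 2 * p * r / (p + r) if p + r else 0.0
    return p, r, f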
def evaluate(self, session, official_stdout=False):
  self.load_eval_data()
  tp, fn, fp = 0, 0, 0
  coref_predictions = {}
  coref_evaluator = metrics.CorefEvaluator()
  for example_num, (tensorized_example, example) in enumerate(self.eval_data):
    _, _, _, _, _, _, _, _, _, gold_starts, gold_ends, _, _, _, _ = tensorized_example
    feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
    top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores = session.run(self.predictions, feed_dict=feed_dict)
    predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
    coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)
    gold_mentions = set([(s, e) for cl in example["clusters"] for s, e in cl])
    pred_mentions = set([(s, e) for s, e in zip(top_span_starts, top_span_ends)])
    tp += len(gold_mentions & pred_mentions)
    fn += len(gold_mentions - pred_mentions)
    fp += len(pred_mentions - gold_mentions)
    if example_num % 10 == 0:
      print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))
  m_r = float(tp) / (tp + fn)
  m_p = float(tp) / (tp + fp)
  m_f1 = 2.0 * m_r * m_p / (m_r + m_p)
  print("Mention F1: {:.2f}%".format(m_f1 * 100))
  print("Mention recall: {:.2f}%".format(m_r * 100))
  print("Mention precision: {:.2f}%".format(m_p * 100))
  summary_dict = {}
  if official_stdout:
    conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, official_stdout)
    average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
    summary_dict["Average F1 (conll)"] = average_f1
    print("Average F1 (conll): {:.2f}%".format(average_f1))
  p, r, f = coref_evaluator.get_prf()
  summary_dict["Average F1 (py)"] = f
  print("Average F1 (py): {:.2f}%".format(f * 100))
  summary_dict["Average precision (py)"] = p
  print("Average precision (py): {:.2f}%".format(p * 100))
  summary_dict["Average recall (py)"] = r
  print("Average recall (py): {:.2f}%".format(r * 100))
  average_f1 = average_f1 if official_stdout else f * 100
  return util.make_summary(summary_dict), average_f1
def evaluate(self, session, data, predictions, loss, official_stdout=False):
  if self.eval_data is None:
    self.eval_data, self.eval_tensors, self.coref_eval_data = data.load_eval_data()

  def _k_to_tag(k):
    if k == -3:
      return "oracle"
    elif k == -2:
      return "actual"
    elif k == -1:
      return "exact"
    elif k == 0:
      return "threshold"
    else:
      return "{}%".format(k)

  # Retrieval evaluators.
  arg_evaluators = {k: util.RetrievalEvaluator() for k in [-3, -2, -1, 30, 40, 50, 80, 100, 120, 150]}
  predicate_evaluators = {k: util.RetrievalEvaluator() for k in [-3, -2, -1, 10, 20, 30, 40, 50, 70]}
  mention_evaluators = {k: util.RetrievalEvaluator() for k in [-3, -2, -1, 10, 20, 30, 40, 50]}
  entity_evaluators = {k: util.RetrievalEvaluator() for k in [-3, -2, -1, 10, 20, 30, 40, 50, 70]}

  total_loss = 0
  total_num_predicates = 0
  total_gold_predicates = 0
  srl_comp_sents = 0
  srl_predictions = []
  ner_predictions = []
  rel_predictions = []
  coref_predictions = {}
  coref_evaluator = coref_metrics.CorefEvaluator()
  all_gold_predicates = []
  all_guessed_predicates = []
  start_time = time.time()
  debug_printer = debug_utils.DebugPrinter()

  # Simple analysis.
  unique_core_role_violations = 0
  continuation_role_violations = 0
  reference_role_violations = 0
  gold_u_violations = 0
  gold_c_violations = 0
  gold_r_violations = 0

  # Global sentence ID.
  rel_sent_id = 0
  srl_sent_id = 0

  for i, doc_tensors in enumerate(self.eval_tensors):
    feed_dict = dict(zip(data.input_tensors,
                         [pad_batch_tensors(doc_tensors, tn) for tn in data.input_names + data.label_names]))
    predict_names = []
    for tn in data.predict_names:
      if tn in predictions:
        predict_names.append(tn)
    predict_tensors = [predictions[tn] for tn in predict_names] + [loss]
    predict_tensors = session.run(predict_tensors, feed_dict=feed_dict)
    predict_dict = dict(zip(predict_names + ["loss"], predict_tensors))
    doc_size = len(doc_tensors)
    doc_example = self.coref_eval_data[i]
    sentences = doc_example["sentences"]
    decoded_predictions = inference_utils.mtl_decode(sentences, predict_dict, data.ner_labels_inv, data.rel_labels_inv, self.config)

    # Relation extraction.
    if "rel" in decoded_predictions:
      rel_predictions.extend(decoded_predictions["rel"])
      for j in range(len(sentences)):
        sent_example = self.eval_data[rel_sent_id][3]  # sentence, srl, ner, relations
        text_length = len(sentences[j])
        ne = predict_dict["num_entities"][j]
        gold_entities = set([])
        for rel in sent_example:
          gold_entities.update([rel[:2], rel[2:4]])
        srl_eval_utils.evaluate_retrieval(
            predict_dict["candidate_starts"][j], predict_dict["candidate_ends"][j],
            predict_dict["candidate_entity_scores"][j], predict_dict["entity_starts"][j][:ne],
            predict_dict["entity_ends"][j][:ne], gold_entities, text_length, entity_evaluators)
        rel_sent_id += 1

    if "ner" in decoded_predictions:
      ner_predictions.extend(decoded_predictions["ner"])

    if "predicted_clusters" in decoded_predictions:
      gold_clusters = [tuple(tuple(m) for m in gc) for gc in doc_example["clusters"]]
      gold_mentions = set([])
      mention_to_gold = {}
      for gc in gold_clusters:
        for mention in gc:
          mention_to_gold[mention] = gc
          gold_mentions.add(mention)
      coref_evaluator.update(decoded_predictions["predicted_clusters"], gold_clusters,
                             decoded_predictions["mention_to_predicted"], mention_to_gold)
      coref_predictions[doc_example["doc_key"]] = decoded_predictions["predicted_clusters"]
      # Evaluate retrieval.
      doc_text_length = sum([len(s) for s in sentences])
      srl_eval_utils.evaluate_retrieval(
          predict_dict["candidate_mention_starts"], predict_dict["candidate_mention_ends"],
          predict_dict["candidate_mention_scores"], predict_dict["mention_starts"],
          predict_dict["mention_ends"], gold_mentions, doc_text_length, mention_evaluators)

    total_loss += predict_dict["loss"]
    if (i + 1) % 50 == 0:
      print("Evaluated {}/{} documents.".format(i + 1, len(self.coref_eval_data)))

  debug_printer.close()
  summary_dict = {}
  task_to_f1 = {}  # From task name to F1.
  elapsed_time = time.time() - start_time
  sentences, gold_srl, gold_ner, gold_relations = zip(*self.eval_data)

  # Summarize results.
  if self.config["relation_weight"] > 0:
    precision, recall, f1 = srl_eval_utils.compute_relation_f1(sentences, gold_relations, rel_predictions)
    task_to_f1["relations"] = f1
    summary_dict["Relation F1"] = f1
    summary_dict["Relation precision"] = precision
    summary_dict["Relation recall"] = recall
    for k, evaluator in sorted(entity_evaluators.items(), key=operator.itemgetter(0)):
      tags = ["{} {} @ {}".format("Entities", t, _k_to_tag(k)) for t in ("R", "P", "F")]
      results_to_print = []
      for t, v in zip(tags, evaluator.metrics()):
        results_to_print.append("{:<10}: {:.4f}".format(t, v))
        summary_dict[t] = v
      print(", ".join(results_to_print))

  if self.config["ner_weight"] > 0:
    ner_precision, ner_recall, ner_f1, ul_ner_prec, ul_ner_recall, ul_ner_f1, ner_label_mat = (
        srl_eval_utils.compute_span_f1(gold_ner, ner_predictions, "NER"))
    summary_dict["NER F1"] = ner_f1
    summary_dict["NER precision"] = ner_precision
    summary_dict["NER recall"] = ner_recall
    summary_dict["Unlabeled NER F1"] = ul_ner_f1
    summary_dict["Unlabeled NER precision"] = ul_ner_prec
    summary_dict["Unlabeled NER recall"] = ul_ner_recall
    # Write NER prediction to IOB format and run official eval script.
    srl_eval_utils.print_to_iob2(sentences, gold_ner, ner_predictions, self.config["ner_conll_eval_path"])
    task_to_f1["ner"] = ner_f1
    # for label_pair, freq in ner_label_mat.most_common():
    #   if label_pair[0] != label_pair[1] and freq > 10:
    #     print("{}\t{}\t{}".format(label_pair[0], label_pair[1], freq))

  if self.config["coref_weight"] > 0:
    # conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, official_stdout)
    # coref_conll_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
    # summary_dict["Average F1 (conll)"] = coref_conll_f1
    # print("Average F1 (conll): {:.2f}%".format(coref_conll_f1))
    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average Coref F1 (py)"] = f
    print("Average F1 (py): {:.2f}%".format(f * 100))
    summary_dict["Average Coref precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average Coref recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))
    task_to_f1["coref"] = f * 100  # coref_conll_f1
    for k, evaluator in sorted(mention_evaluators.items(), key=operator.itemgetter(0)):
      tags = ["{} {} @ {}".format("Mentions", t, _k_to_tag(k)) for t in ("R", "P", "F")]
      results_to_print = []
      for t, v in zip(tags, evaluator.metrics()):
        results_to_print.append("{:<10}: {:.4f}".format(t, v))
        summary_dict[t] = v
      print(", ".join(results_to_print))

  summary_dict["Dev Loss"] = total_loss / len(self.coref_eval_data)
  print("Decoding took {}.".format(str(datetime.timedelta(seconds=int(elapsed_time)))))
  print("Decoding speed: {}/document, or {}/sentence.".format(
      str(datetime.timedelta(seconds=int(elapsed_time / len(self.coref_eval_data)))),
      str(datetime.timedelta(seconds=int(elapsed_time / len(self.eval_data))))))
  metric_names = self.config["main_metrics"].split("_")
  main_metric = sum([task_to_f1[t] for t in metric_names]) / len(metric_names)
  print("Combined metric ({}): {}".format(self.config["main_metrics"], main_metric))
  return util.make_summary(summary_dict), main_metric, task_to_f1
def evaluate(self, session, evaluation_data=None, official_stdout=False, mode='train', title_map=None):
  if evaluation_data:
    separate_data = list()
    for tmp_example in evaluation_data:
      tensorized_example = self.tensorize_pronoun_example(tmp_example, is_training=True)
      separate_data.append((tensorized_example, tmp_example))
  else:
    separate_data = self.eval_data
  all_coreference = 0
  predict_coreference = 0
  correct_predict_coreference = 0
  prediction_result = list()
  for example_num, (tensorized_example, example) in enumerate(separate_data):
    prediction_result_by_example = list()
    all_sentence = list()
    doc_id = example['doc_key']
    if mode == 'test' or mode == 'predict':
      print(title_map[doc_id])
    for s in example['sentences']:
      all_sentence += s
    _, _, _, _, _, _, _, _, _, _, gold_starts, gold_ends, number_features, candidate_NP_positions, \
        pronoun_positions, name_positions, status_positions, order_features, labels, _ = tensorized_example
    feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
    pronoun_coref_scores = session.run(self.predictions, feed_dict=feed_dict)
    pronoun_coref_scores = pronoun_coref_scores[0]  # [4, 4]
    if self.config["use_multi_span"]:
      gold_starts = tf.squeeze(gold_starts[:, :1], 1).eval()
      gold_ends = tf.squeeze(gold_ends[:, :1], 1).eval()
    for i, pronoun_coref_scores_by_example in enumerate(pronoun_coref_scores):
      current_pronoun_index = int(pronoun_positions[i][0])
      pronoun_position_start = int(gold_starts[current_pronoun_index])
      pronoun_position_end = int(gold_ends[current_pronoun_index]) + 1
      current_pronoun = ''.join(all_sentence[pronoun_position_start:pronoun_position_end])
      pronoun_coref_scores_by_example = pronoun_coref_scores_by_example[1:]  # [1, 3]
      # labels: [4, 3] bool
      prediction_result_by_example.append((pronoun_coref_scores_by_example.tolist(), labels[i]))
      for j, tmp_score in enumerate(pronoun_coref_scores_by_example.tolist()):
        current_candidate_index = int(candidate_NP_positions[i][j])
        candidate_positions_start = int(gold_starts[current_candidate_index])
        candidate_positions_end = int(gold_ends[current_candidate_index]) + 1
        current_candidate = ''.join(all_sentence[candidate_positions_start:candidate_positions_end])
        if tmp_score > 0:
          msg = '{} link to: {} ({},{}) \t'.format(current_pronoun, current_candidate, candidate_positions_start, candidate_positions_end)
          predict_coreference += 1
          if labels[i][j]:
            correct_predict_coreference += 1
            msg += 'True-predict' + '\t' + 'score: ' + str(tmp_score)
          else:
            msg += 'False-predict' + '\t' + 'score: ' + str(tmp_score)
          if mode == 'test' or mode == 'predict':
            print(msg)
      for l in labels[i]:
        if l:
          all_coreference += 1
    prediction_result.append(prediction_result_by_example)

  summary_dict = {}
  if mode == 'predict':
    summary_dict["Average F1 (py)"] = 0
    summary_dict["Average precision (py)"] = 0
    summary_dict["Average recall (py)"] = 0
    print('there is no positive prediction')
    f1 = 0
  else:
    if predict_coreference > 0:
      p = correct_predict_coreference / predict_coreference
      r = correct_predict_coreference / all_coreference
      f1 = 2 * p * r / (p + r)
      summary_dict["Average F1 (py)"] = f1
      print("Average F1 (py): {:.2f}%".format(f1 * 100))
      summary_dict["Average precision (py)"] = p
      print("Average precision (py): {:.2f}%".format(p * 100))
      summary_dict["Average recall (py)"] = r
      print("Average recall (py): {:.2f}%".format(r * 100))
    else:
      summary_dict["Average F1 (py)"] = 0
      summary_dict["Average precision (py)"] = 0
      summary_dict["Average recall (py)"] = 0
      print('there is no positive prediction')
      f1 = 0
  return util.make_summary(summary_dict), f1
while True:
  tf_loss, tf_global_step, _ = session.run([model.loss, model.global_step, model.train_op])
  accumulated_loss += tf_loss
  if tf_global_step % report_frequency == 0:
    steps_per_second = (tf_global_step - initial_step) / (time.time() - initial_time)
    average_loss = accumulated_loss / report_frequency
    print("[{}] loss={:.2f}, steps/s={:.2f}".format(tf_global_step, average_loss, steps_per_second))
    writer.add_summary(util.make_summary({
        "loss": average_loss,
        "learning_rate": session.run(model.learning_rate)
    }), tf_global_step)
    accumulated_loss = 0.0
    initial_time = time.time()
    initial_step = tf_global_step
  if tf_global_step % eval_frequency == 0:
    saver.save(session, os.path.join(log_dir, "model"), global_step=tf_global_step)
    eval_summary, eval_f1 = model.evaluate(session)
    if eval_f1 > max_f1:
      max_f1 = eval_f1
      util.copy_checkpoint(
if tf_global_step % report_frequency == 0:
  total_time = time.time() - initial_time
  steps_per_second = tf_global_step / total_time
  # avg_mention_loss = acc_mention_loss / report_frequency
  avg_tagging_loss = acc_tagging_loss / report_frequency
  print("[{}] tagging_loss={:.2f} steps/s={:.2f}".format(tf_global_step, avg_tagging_loss, steps_per_second))
  # '''
  print('----------------------------')
  print(x1)
  print("number of entities:%d" % max(list(x2)))
  print("tagging_loss:%f, mention_loss:NA, antecedent_loss:%f" % (x6, x8))
  print(list(x2))
  print(util.check_tags(x2))
  print(list(x3[0]))
  print(x4)
  print('----------------------------')
  # '''
  writer.add_summary(util.make_summary({"loss": avg_tagging_loss}), tf_global_step)
  # accumulated_loss = 0.0
  # acc_mention_loss = 0
  acc_tagging_loss = 0

# Ask for all the services to stop.
sv.stop()
    global_step=model.global_step,
    save_model_secs=120)

# The supervisor takes care of session initialization, restoring from
# a checkpoint, and closing when done or an error occurs.
with sv.managed_session(server.target) as session:
  model.start_enqueue_thread(session)
  accumulated_loss = 0.0
  initial_time = time.time()
  while not sv.should_stop():
    tf_loss, tf_global_step, _ = session.run([model.loss, model.global_step, model.train_op])
    accumulated_loss += tf_loss
    if tf_global_step % report_frequency == 0:
      total_time = time.time() - initial_time
      steps_per_second = tf_global_step / total_time
      average_loss = accumulated_loss / report_frequency
      print("[{}] loss={:.2f}, steps/s={:.2f}".format(tf_global_step, average_loss, steps_per_second))
      accumulated_loss = 0.0
      writer.add_summary(util.make_summary({
          "Train Loss": average_loss,
          "Steps per second": steps_per_second
      }), tf_global_step)

# Ask for all the services to stop.
sv.stop()
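# NOTE: this fragment opens mid-way through constructing a tf.train.Supervisor.
# A sketch of the surrounding distributed setup it implies; the cluster values
# below are placeholders, not the project's actual configuration:
import tensorflow as tf

ps_hosts = ["localhost:2222"]      # placeholder parameter servers
worker_hosts = ["localhost:2223"]  # placeholder workers
task_index = 0                     # placeholder task id
cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
server = tf.train.Server(cluster, job_name="worker", task_index=task_index)
sv = tf.train.Supervisor(is_chief=(task_index == 0),
                         logdir=log_dir,
                         global_step=model.global_step,  # the model's step tensor, as in the fragment
                         save_model_secs=120)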
saver.restore(session, ckpt.model_checkpoint_path)
initial_time = time.time()
print("We're reporting with frequency: %d" % report_frequency)
print("We're reporting with eval frequency: %d" % eval_frequency)
while True:
  tf_loss, tf_global_step, _ = session.run([model.loss, model.global_step1, model.train_op])
  accumulated_loss += tf_loss
  if tf_global_step % report_frequency == 0:
    total_time = time.time() - initial_time
    steps_per_second = tf_global_step / total_time
    average_loss = accumulated_loss / report_frequency
    print("Coreference [{}] loss={:.2f}, steps/s={:.2f}".format(tf_global_step, average_loss[0], steps_per_second))
    writer.add_summary(util.make_summary({"loss": average_loss}), tf_global_step)
    accumulated_loss = 0.0
  if tf_global_step % eval_frequency == 0:
    # saver.save(session, os.path.join(log_dir, "model"), global_step=tf_global_step)
    eval_summary, eval_f1, swag_accuracy = model.evaluate(session)
    if eval_f1 > max_f1:
      saver.save(session, os.path.join(log_dir, "model"), global_step=tf_global_step)
      max_f1 = eval_f1
      util.copy_checkpoint(os.path.join(log_dir, "model-{}".format(tf_global_step)),
                           os.path.join(log_dir, "model.max.ckpt"))
    if swag_accuracy > max_swag_acc:
      saver.save(session, os.path.join(log_dir, "model"), global_step=tf_global_step)
      max_swag_acc = swag_accuracy
      util.copy_checkpoint(os.path.join(log_dir, "model-{}".format(tf_global_step)),
                           os.path.join(log_dir, "model.max.ckpt"))
    writer.add_summary(eval_summary, tf_global_step)
def evaluate(self, session, global_step=None, official_stdout=False, keys=None, eval_mode=False, to_npy=None, from_npy=None, rsa_model=None):
  assert not (to_npy is not None and from_npy is not None), "cannot set both to_npy and from_npy!"
  self.load_eval_data()
  coref_predictions = {}
  coref_evaluator = metrics.CorefEvaluator()
  losses = []
  doc_keys = []
  num_evaluated = 0
  total_time = 0
  if to_npy:
    data_dicts = []
  if from_npy:
    with open(from_npy, "rb") as f:
      from_npy_dict = np.load(f)
    data_dicts = from_npy_dict.item().get("data_dicts")
  for example_num, (tensorized_example, example) in enumerate(self.eval_data):
    _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example
    if from_npy is None:
      feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
      # if tensorized_example[0].shape[0] <= 9:
      if keys is not None and example['doc_key'] not in keys:
        # print('Skipping...', example['doc_key'], tensorized_example[0].shape)
        continue
      doc_keys.append(example['doc_key'])
      loss, (candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends,
             top_antecedents, top_antecedent_scores) = session.run([self.loss, self.predictions], feed_dict=feed_dict)
    else:
      data_dict = data_dicts[example_num]
      example = data_dict["example"]
      if keys is not None and example['doc_key'] not in keys:
        # print('Skipping...', example['doc_key'], tensorized_example[0].shape)
        continue
      doc_keys.append(example['doc_key'])
      tensorized_example = data_dict["tensorized_example"]
      loss = data_dict["loss"]
      top_span_starts = data_dict["top_span_starts"]
      top_span_ends = data_dict["top_span_ends"]
      top_antecedents = data_dict["top_antecedents"]
      top_antecedent_scores = data_dict["top_antecedent_scores"]
    # losses.append(session.run(self.loss, feed_dict=feed_dict))
    losses.append(loss)
    if rsa_model is not None:
      print("Running l1 for sentence %d" % example_num)
      start_time = time.time()
      top_antecedent_scores = rsa_model.l1(example, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores)
      duration = time.time() - start_time
      print("Finished sentence %d, took %.2f s" % (example_num, duration))
      total_time += duration
      num_evaluated += 1
    if to_npy:
      data_dict = {
          "example_num": example_num,
          "tensorized_example": tensorized_example,
          "example": example,
          "top_span_starts": top_span_starts,
          "top_span_ends": top_span_ends,
          "top_antecedents": top_antecedents,
          "top_antecedent_scores": top_antecedent_scores,
          "loss": loss,
      }
      data_dicts.append(data_dict)
    predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
    coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)
    if example_num % 10 == 0:
      print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

  summary_dict = {}
  if to_npy:
    dict_to_npy = {"data_dicts": data_dicts}
  if eval_mode:
    conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, self.subtoken_maps, official_stdout)
    average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
    summary_dict["Average F1 (conll)"] = average_f1
    if to_npy:
      dict_to_npy["Average F1 (conll)"] = average_f1
    print("Average F1 (conll): {:.2f}%".format(average_f1))
  p, r, f = coref_evaluator.get_prf()
  summary_dict["Average F1 (py)"] = f
  print("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(doc_keys)))
  summary_dict["Average precision (py)"] = p
  print("Average precision (py): {:.2f}%".format(p * 100))
  summary_dict["Average recall (py)"] = r
  print("Average recall (py): {:.2f}%".format(r * 100))
  if to_npy:
    dict_to_npy["Average F1 (py)"] = f
    dict_to_npy["Average precision (py)"] = p
    dict_to_npy["Average recall (py)"] = r
    with open(to_npy, "wb") as f_to_npy:
      np.save(f_to_npy, dict_to_npy)
  if rsa_model:
    print("Ran rsa on %d sentences, avg time per sentence %.2f s" % (num_evaluated, total_time / num_evaluated))
  return util.make_summary(summary_dict), f
counter = 0
while True:
  random.shuffle(train_examples)
  for example in train_examples:
    tensorized_example = model.tensorize_example(example, is_training=True)
    feed_dict = dict(zip(model.input_tensors, tensorized_example))
    tf_loss, tf_global_step, _ = session.run([model.loss, model.global_step1, model.train_op], feed_dict=feed_dict)
    print(str(tf_global_step) + '\r', end='')
    # print(str(tf_global_step))
    accumulated_loss += tf_loss
    if tf_global_step % report_frequency == 0:
      total_time = time.time() - initial_time
      steps_per_second = tf_global_step / total_time
      average_loss = accumulated_loss / report_frequency
      print("[{}] loss={:.2f}, steps/s={:.2f}".format(tf_global_step, average_loss, steps_per_second))
      writer.add_summary(util.make_summary({"loss": average_loss}), tf_global_step)
      accumulated_loss = 0.0
    if tf_global_step % eval_frequency == 0:
      # saver.save(session, os.path.join(log_dir, "model"), global_step=tf_global_step)
      eval_summary, eval_f1 = model.evaluate(session)
      if eval_f1 > max_f1:
        saver.save(session, os.path.join(log_dir, "model"), global_step=tf_global_step)
        max_f1 = eval_f1
        util.copy_checkpoint(os.path.join(log_dir, "model-{}".format(tf_global_step)),
                             os.path.join(log_dir, "model.max.ckpt"))
      print("====")
except Exception as e:
  print(e)
def evaluate(self, session, eval_fold=-1, num_fold=-1, is_final_test=False):
  self.load_eval_data(eval_fold, num_fold)
  if "eval_on_test_part_only" in self.config and self.config["eval_on_test_part_only"]:
    eval_on_test_part_only = True
    print("Evaluate on the test part only!!!!")
  else:
    eval_on_test_part_only = False
  if num_fold > 1 and is_final_test:
    print("Evaluating %d/%d fold." % (eval_fold + 1, num_fold))
  tp, fn, fp = 0, 0, 0
  tpa, fna, fpa = 0, 0, 0
  for example_num, (tensorized_example, example) in enumerate(self.eval_data):
    _, _, _, _, _, _, _, _, gold_starts, gold_ends, cluster_ids, bridging_ante_cids, _, _ = tensorized_example
    feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
    predictions = session.run(self.predictions, feed_dict=feed_dict)
    pred_bridging_pairs, pred_bridging_anaphora = self.get_predicted_bridging_pairs(predictions)
    if self.config["has_multi_bridging_ant"]:
      # We follow Hou et al. in counting bridgings with multiple antecedents as correct
      # as long as any gold bridging antecedent is recovered (only for BASHI).
      gold_bridging_pairs, gold_bridging_anaphora = self.get_gold_bridging_pairs(
          gold_starts, gold_ends, cluster_ids, example["bridging_pairs"], pred_bridging_pairs)
    else:
      gold_bridging_pairs = set([(s, e, cid) for s, e, cid in zip(gold_starts, gold_ends, bridging_ante_cids) if cid > 0])
      gold_bridging_anaphora = set([(s, e) for s, e, cid in zip(gold_starts, gold_ends, bridging_ante_cids) if cid > 0])
    add2eval = True
    if eval_on_test_part_only and not example["doc_key"].endswith('_test'):
      add2eval = False
    if add2eval:
      tp += len(gold_bridging_pairs & pred_bridging_pairs)
      fn += len(gold_bridging_pairs - pred_bridging_pairs)
      fp += len(pred_bridging_pairs - gold_bridging_pairs)
      tpa += len(gold_bridging_anaphora & pred_bridging_anaphora)
      fna += len(gold_bridging_anaphora - pred_bridging_anaphora)
      fpa += len(pred_bridging_anaphora - gold_bridging_anaphora)
    if example_num % 10 == 0:
      print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

  bridging_recall = 0.0 if tp == 0 else float(tp) / (tp + fn)
  bridging_precision = 0.0 if tp == 0 else float(tp) / (tp + fp)
  bridging_f1 = 0.0 if bridging_precision == 0.0 else 2.0 * bridging_recall * bridging_precision / (bridging_recall + bridging_precision)
  bridging_anaphora_recall = 0.0 if tpa == 0 else float(tpa) / (tpa + fna)
  bridging_anaphora_precision = 0.0 if tpa == 0 else float(tpa) / (tpa + fpa)
  bridging_anaphora_f1 = 0.0 if bridging_anaphora_precision == 0.0 else 2.0 * bridging_anaphora_recall * bridging_anaphora_precision / (bridging_anaphora_recall + bridging_anaphora_precision)
  print("Bridging anaphora detection F1: {:.2f}%".format(bridging_anaphora_f1 * 100))
  print("Bridging anaphora detection recall: {:.2f}%".format(bridging_anaphora_recall * 100))
  print("Bridging anaphora detection precision: {:.2f}%".format(bridging_anaphora_precision * 100))
  print("Bridging F1: {:.2f}%".format(bridging_f1 * 100))
  print("Bridging recall: {:.2f}%".format(bridging_recall * 100))
  print("Bridging precision: {:.2f}%".format(bridging_precision * 100))
  summary_dict = {}
  summary_dict["Bridging anaphora detection F1"] = bridging_anaphora_f1
  summary_dict["Bridging anaphora detection recall"] = bridging_anaphora_recall
  summary_dict["Bridging anaphora detection precision"] = bridging_anaphora_precision
  summary_dict["Bridging F1"] = bridging_f1
  summary_dict["Bridging recall"] = bridging_recall
  summary_dict["Bridging precision"] = bridging_precision
  f1 = bridging_f1
  if is_final_test:
    return tp, fn, fp, tpa, fna, fpa
  return util.make_summary(summary_dict), f1 * 100
def evaluate(self, session, global_step=None, official_stdout=False, keys=None, eval_mode=False):
  self.load_eval_data()
  coref_predictions = {}
  coref_evaluator = metrics.CorefEvaluator()
  losses = []
  doc_keys = []
  num_evaluated = 0
  for example_num, (tensorized_example, example) in enumerate(self.eval_data):
    _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example
    feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
    # if tensorized_example[0].shape[0] <= 9:
    #   if keys is not None and example['doc_key'] in keys:
    #     print('Skipping...', example['doc_key'], tensorized_example[0].shape)
    #     continue
    doc_keys.append(example['doc_key'])
    loss, (candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends,
           top_antecedents, top_antecedent_scores) = session.run([self.loss, self.predictions], feed_dict=feed_dict)
    # losses.append(session.run(self.loss, feed_dict=feed_dict))
    losses.append(loss)
    predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
    coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)
    if example_num % 10 == 0:
      print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

  summary_dict = {}
  # with open('doc_keys_512.txt', 'w') as f:
  #   for key in doc_keys:
  #     f.write(key + '\n')
  if eval_mode:
    conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, self.subtoken_maps, official_stdout)
    average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
    summary_dict["Average F1 (conll)"] = average_f1
    print("Average F1 (conll): {:.2f}%".format(average_f1))
  p, r, f = coref_evaluator.get_prf()
  summary_dict["Average F1 (py)"] = f
  print("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(doc_keys)))
  summary_dict["Average precision (py)"] = p
  print("Average precision (py): {:.2f}%".format(p * 100))
  summary_dict["Average recall (py)"] = r
  print("Average recall (py): {:.2f}%".format(r * 100))
  return util.make_summary(summary_dict), f
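# NOTE: the average_f1 computed from conll_results is the standard CoNLL-2012
# score: the unweighted mean of the MUC, B-cubed, and CEAF-phi4 F1s. A toy
# illustration, with metric keys assumed to follow the reference scorer wrapper:
conll_results = {"muc": {"f": 80.4}, "bcub": {"f": 70.3}, "ceafe": {"f": 67.2}}
average_f1 = sum(r["f"] for r in conll_results.values()) / len(conll_results)
print("{:.2f}".format(average_f1))  # 72.63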
def evaluate(self, session, data, predictions, loss, official_stdout=False):
    if self.eval_data is None:
        self.eval_data, self.eval_tensors, self.coref_eval_data = data.load_eval_data()

    def _k_to_tag(k):
        if k == -3:
            return "oracle"
        elif k == -2:
            return "actual"
        elif k == -1:
            return "exact"
        elif k == 0:
            return "threshold"
        else:
            return "{}%".format(k)

    # Retrieval evaluators.
    arg_evaluators = {k: util.RetrievalEvaluator() for k in [-3, -2, -1, 30, 40, 50, 80, 100, 120, 150]}
    predicate_evaluators = {k: util.RetrievalEvaluator() for k in [-3, -2, -1, 10, 20, 30, 40, 50, 70]}

    total_loss = 0
    total_num_predicates = 0
    total_gold_predicates = 0
    srl_comp_sents = 0
    srl_predictions = []
    all_gold_predicates = []
    all_guessed_predicates = []
    start_time = time.time()
    sent_id = 0

    # Simple analysis.
    unique_core_role_violations = 0
    continuation_role_violations = 0
    reference_role_violations = 0
    gold_u_violations = 0
    gold_c_violations = 0
    gold_r_violations = 0

    # Go through document-level predictions.
    for i, doc_tensors in enumerate(self.eval_tensors):
        feed_dict = dict(zip(
            data.input_tensors,
            [pad_batch_tensors(doc_tensors, tn) for tn in data.input_names + data.label_names]))
        predict_names = []
        for tn in data.predict_names:
            if tn in predictions:
                predict_names.append(tn)
        predict_tensors = [predictions[tn] for tn in predict_names] + [loss]
        predict_tensors = session.run(predict_tensors, feed_dict=feed_dict)
        predict_dict = dict(zip(predict_names + ["loss"], predict_tensors))

        doc_size = len(doc_tensors)
        doc_example = self.coref_eval_data[i]
        sentences = doc_example["sentences"]
        decoded_predictions = inference_utils.srl_decode(sentences, predict_dict, data.srl_labels_inv, self.config)

        if "srl" in decoded_predictions:
            srl_predictions.extend(decoded_predictions["srl"])
            # Evaluate retrieval.
            word_offset = 0
            for j in range(len(sentences)):
                text_length = len(sentences[j])
                na = predict_dict["num_args"][j]
                np = predict_dict["num_preds"][j]  # Number of predicates (not numpy).
                sent_example = self.eval_data[sent_id]  # (sentence, srl, ner)
                gold_args = set()
                gold_preds = set()
                guessed_preds = set()
                for pred, args in sent_example[1].items():
                    filtered_args = [(a[0], a[1]) for a in args if a[2] not in ["V", "C-V"]]
                    if len(filtered_args) > 0:
                        gold_preds.add((pred, pred))
                        gold_args.update(filtered_args)
                for pred, args in decoded_predictions["srl"][j].items():
                    guessed_preds.add((pred, pred, "V"))
                all_gold_predicates.append([(p[0], p[1], "V") for p in gold_preds])
                all_guessed_predicates.append(guessed_preds)
                srl_eval_utils.evaluate_retrieval(
                    predict_dict["candidate_starts"][j], predict_dict["candidate_ends"][j],
                    predict_dict["candidate_arg_scores"][j], predict_dict["arg_starts"][j][:na],
                    predict_dict["arg_ends"][j][:na], gold_args, text_length, arg_evaluators)
                srl_eval_utils.evaluate_retrieval(
                    range(text_length), range(text_length), predict_dict["candidate_pred_scores"][j],
                    predict_dict["predicates"][j][:np], predict_dict["predicates"][j][:np], gold_preds,
                    text_length, predicate_evaluators)

                # TODO: Move elsewhere.
                u_violations, c_violations, r_violations = debug_utils.srl_constraint_tracker(
                    decoded_predictions["srl"][j])
                unique_core_role_violations += u_violations
                continuation_role_violations += c_violations
                reference_role_violations += r_violations
                total_num_predicates += len(decoded_predictions["srl"][j].keys())
                u_violations, c_violations, r_violations = debug_utils.srl_constraint_tracker(sent_example[1])
                gold_u_violations += u_violations
                gold_c_violations += c_violations
                gold_r_violations += r_violations
                total_gold_predicates += len(sent_example[1].keys())
                sent_id += 1
                word_offset += text_length

        total_loss += predict_dict["loss"]
        if (i + 1) % 50 == 0:
            print("Evaluated {}/{} documents.".format(i + 1, len(self.coref_eval_data)))

    summary_dict = {}
    task_to_f1 = {}  # From task name to F1.
    elapsed_time = time.time() - start_time

    sentences, gold_srl, gold_ner = zip(*self.eval_data)

    # Summarize results, evaluate entire dev set.
    precision, recall, f1, conll_precision, conll_recall, conll_f1, ul_prec, ul_recall, ul_f1, srl_label_mat, comp = (
        srl_eval_utils.compute_srl_f1(sentences, gold_srl, srl_predictions, self.config["srl_conll_eval_path"]))
    pid_precision, pred_recall, pid_f1, _, _, _, _ = srl_eval_utils.compute_span_f1(
        all_gold_predicates, all_guessed_predicates, "Predicate ID")
    task_to_f1["srl"] = conll_f1
    summary_dict["PAS F1"] = f1
    summary_dict["PAS precision"] = precision
    summary_dict["PAS recall"] = recall
    summary_dict["Unlabeled PAS F1"] = ul_f1
    summary_dict["Unlabeled PAS precision"] = ul_prec
    summary_dict["Unlabeled PAS recall"] = ul_recall
    summary_dict["CoNLL F1"] = conll_f1
    summary_dict["CoNLL precision"] = conll_precision
    summary_dict["CoNLL recall"] = conll_recall

    if total_num_predicates > 0:
        summary_dict["Unique core violations/Predicate"] = 1.0 * unique_core_role_violations / total_num_predicates
        summary_dict["Continuation violations/Predicate"] = 1.0 * continuation_role_violations / total_num_predicates
        summary_dict["Reference violations/Predicate"] = 1.0 * reference_role_violations / total_num_predicates

    print("Completely correct sentences: {} ({:.2f}%)".format(comp, 100.0 * comp / len(srl_predictions)))

    for k, evaluator in sorted(arg_evaluators.items(), key=operator.itemgetter(0)):
        tags = ["{} {} @ {}".format("Args", t, _k_to_tag(k)) for t in ("R", "P", "F")]
        results_to_print = []
        for t, v in zip(tags, evaluator.metrics()):
            results_to_print.append("{:<10}: {:.4f}".format(t, v))
            summary_dict[t] = v
        print(", ".join(results_to_print))

    for k, evaluator in sorted(predicate_evaluators.items(), key=operator.itemgetter(0)):
        tags = ["{} {} @ {}".format("Predicates", t, _k_to_tag(k)) for t in ("R", "P", "F")]
        results_to_print = []
        for t, v in zip(tags, evaluator.metrics()):
            results_to_print.append("{:<10}: {:.4f}".format(t, v))
            summary_dict[t] = v
        print(", ".join(results_to_print))

    if total_num_predicates > 0:
        print("Constraint violations: U: {} ({}), C: {} ({}), R: {} ({})".format(
            1.0 * unique_core_role_violations / total_num_predicates, unique_core_role_violations,
            1.0 * continuation_role_violations / total_num_predicates, continuation_role_violations,
            1.0 * reference_role_violations / total_num_predicates, reference_role_violations))
    if total_gold_predicates > 0:
        print("Gold constraint violations: U: {} ({}), C: {} ({}), R: {} ({})".format(
            1.0 * gold_u_violations / total_gold_predicates, gold_u_violations,
            1.0 * gold_c_violations / total_gold_predicates, gold_c_violations,
            1.0 * gold_r_violations / total_gold_predicates, gold_r_violations))

    # for label_pair, freq in srl_label_mat.most_common():
    #     if label_pair[0] != label_pair[1] and freq > 10:
    #         print("{}\t{}\t{}".format(label_pair[0], label_pair[1], freq))

    summary_dict["Dev Loss"] = total_loss / len(self.coref_eval_data)

    print("Decoding took {}.".format(str(datetime.timedelta(seconds=int(elapsed_time)))))
    print("Decoding speed: {}/document, or {}/sentence.".format(
        str(datetime.timedelta(seconds=int(elapsed_time / len(self.coref_eval_data)))),
        str(datetime.timedelta(seconds=int(elapsed_time / len(self.eval_data))))))

    metric_names = self.config["main_metrics"].split("_")
    main_metric = sum(task_to_f1[t] for t in metric_names) / len(metric_names)
    print("Combined metric ({}): {}".format(self.config["main_metrics"], main_metric))
    return util.make_summary(summary_dict), main_metric, task_to_f1
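# The arg/predicate evaluators above are util.RetrievalEvaluator instances
# keyed by a span budget k (rendered by _k_to_tag). As a rough sketch of what
# such an evaluator accumulates -- note metrics() must yield (R, P, F) to
# match the tag order used above -- here is a hypothetical simplification,
# not the real util.RetrievalEvaluator:
class RetrievalEvaluatorSketch(object):
    def __init__(self):
        self.num_correct = 0
        self.num_gold = 0
        self.num_predicted = 0

    def update(self, gold_spans, predicted_spans):
        self.num_correct += len(set(gold_spans) & set(predicted_spans))
        self.num_gold += len(gold_spans)
        self.num_predicted += len(predicted_spans)

    def metrics(self):
        r = float(self.num_correct) / max(1, self.num_gold)
        p = float(self.num_correct) / max(1, self.num_predicted)
        f = 0.0 if p + r == 0 else 2 * p * r / (p + r)
        return r, p, f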
def evaluate(self, session, is_final_test=False):
    self.load_eval_data()
    tp, fn, fp = 0, 0, 0
    start_time = time.time()
    num_words = 0
    sub_tp, sub_fn, sub_fp = [0] * self.num_types, [0] * self.num_types, [0] * self.num_types
    is_flat_ner = 'flat_ner' in self.config and self.config['flat_ner']
    total_preds = []
    total_golds = []
    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
        feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
        candidate_ner_scores = session.run(self.predictions, feed_dict=feed_dict)
        num_words += sum(len(tok) for tok in example["sentences"])
        gold_ners = set((sid, s, e, self.ner_maps[t])
                        for sid, ner in enumerate(example['ners']) for s, e, t in ner)
        pred_ners = self.get_pred_ner(example["sentences"], candidate_ner_scores, is_flat_ner)
        total_golds.append(list(gold_ners))
        total_preds.append(list(pred_ners))
        # print(pred_ners)
        tp += len(gold_ners & pred_ners)
        fn += len(gold_ners - pred_ners)
        fp += len(pred_ners - gold_ners)
        if is_final_test:
            for i in range(self.num_types):
                sub_gm = set((sid, s, e) for sid, s, e, t in gold_ners if t == i + 1)
                sub_pm = set((sid, s, e) for sid, s, e, t in pred_ners if t == i + 1)
                sub_tp[i] += len(sub_gm & sub_pm)
                sub_fn[i] += len(sub_gm - sub_pm)
                sub_fp[i] += len(sub_pm - sub_gm)
        if example_num % 10 == 0:
            print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))
    used_time = time.time() - start_time
    print("Time used: %d second, %.2f w/s " % (used_time, num_words * 1.0 / used_time))
    m_r = 0 if tp == 0 else float(tp) / (tp + fn)
    m_p = 0 if tp == 0 else float(tp) / (tp + fp)
    m_f1 = 0 if m_p == 0 else 2.0 * m_r * m_p / (m_r + m_p)
    print("Mention F1: {:.2f}%".format(m_f1 * 100))
    print("Mention recall: {:.2f}%".format(m_r * 100))
    print("Mention precision: {:.2f}%".format(m_p * 100))
    if is_final_test:
        print("****************SUB NER TYPES********************")
        for i in range(self.num_types):
            sub_r = 0 if sub_tp[i] == 0 else float(sub_tp[i]) / (sub_tp[i] + sub_fn[i])
            sub_p = 0 if sub_tp[i] == 0 else float(sub_tp[i]) / (sub_tp[i] + sub_fp[i])
            sub_f1 = 0 if sub_p == 0 else 2.0 * sub_r * sub_p / (sub_r + sub_p)
            print("{} F1: {:.2f}%".format(self.ner_types[i], sub_f1 * 100))
            print("{} recall: {:.2f}%".format(self.ner_types[i], sub_r * 100))
            print("{} precision: {:.2f}%".format(self.ner_types[i], sub_p * 100))
    summary_dict = {}
    summary_dict["Mention F1"] = m_f1
    summary_dict["Mention recall"] = m_r
    summary_dict["Mention precision"] = m_p
    return util.make_summary(summary_dict), m_f1, total_preds, total_golds
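# get_pred_ner above decodes candidate span scores into an entity set, with
# is_flat_ner toggling whether nesting is allowed. A hypothetical sketch of
# the usual greedy decoding: keep spans from highest to lowest score, and
# reject a span that overlaps a kept one (any overlap for flat NER, only
# crossing overlap for nested NER):
def greedy_decode_sketch(candidates, is_flat_ner):
    # candidates: list of ((sid, start, end, type), score) with type > 0.
    kept = []
    for (sid, s, e, t), _score in sorted(candidates, key=lambda c: -c[1]):
        clash = False
        for ksid, ks, ke, _ in kept:
            if sid == ksid and ks <= e and s <= ke:  # spans overlap
                crossing = ks < s <= ke < e or s < ks <= e < ke
                if is_flat_ner or crossing:
                    clash = True
                    break
        if not clash:
            kept.append((sid, s, e, t))
    return set(kept)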
def evaluate(self, session, global_step=None, official_stdout=False, keys=None, eval_mode=False, visualize=False):
    self.load_eval_data()
    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    losses = []
    doc_keys = []
    num_evaluated = 0
    visualize_list = []
    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
        _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example
        feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
        # if tensorized_example[0].shape[0] <= 9:
        if keys is not None and example['doc_key'] not in keys:
            # print('Skipping...', example['doc_key'], tensorized_example[0].shape)
            continue
        doc_keys.append(example['doc_key'])
        loss, (candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends,
               top_antecedents, top_antecedent_scores) = session.run([self.loss, self.predictions],
                                                                     feed_dict=feed_dict)
        # losses.append(session.run(self.loss, feed_dict=feed_dict))
        losses.append(loss)
        predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
        predicted_clusters = self.evaluate_coref(
            top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)
        coref_predictions[example["doc_key"]] = predicted_clusters
        # if example_num % 10 == 0:
        #     print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

        # Visualize antecedents.
        if visualize:
            print('*****New Doc*****')
            subtokens = util.flatten(example['sentences'])
            span_list, antecedent_list = [], []
            for idx, antecedent_idx in enumerate(predicted_antecedents):
                if antecedent_idx == -1:
                    continue
                span_subtoken_idx = (top_span_starts[idx], top_span_ends[idx])
                span_str = ' '.join(subtokens[span_subtoken_idx[0]:span_subtoken_idx[1] + 1])
                antecedent_subtoken_idx = (top_span_starts[antecedent_idx], top_span_ends[antecedent_idx])
                antecedent_str = ' '.join(subtokens[antecedent_subtoken_idx[0]:antecedent_subtoken_idx[1] + 1])
                # print('%s ---> %s' % (span_str, antecedent_str))
                span_list.append(span_str)
                antecedent_list.append(antecedent_str)
            visualize_list.append((span_list, antecedent_list))
    summary_dict = {}
    if eval_mode:
        conll_results = conll.evaluate_conll(
            self.config["conll_eval_path"], coref_predictions, self.subtoken_maps, official_stdout)
        average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
        summary_dict["Average F1 (conll)"] = average_f1
        print("Average F1 (conll): {:.2f}%".format(average_f1))
    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    logger.info("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(doc_keys)))
    summary_dict["Average precision (py)"] = p
    logger.info("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    logger.info("Average recall (py): {:.2f}%".format(r * 100))
    if visualize:
        # Use a distinct handle name so the F1 score `f` is not clobbered before the return.
        with open('visualize.bin', 'wb') as fout:
            pickle.dump(visualize_list, fout)
        logger.info('Saved visualization to visualize.bin')
    return util.make_summary(summary_dict), f
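# The visualize.bin dump written above can be inspected offline. A small usage
# sketch (the (span_list, antecedent_list) layout comes from the code above):
import pickle

with open('visualize.bin', 'rb') as fin:
    visualize_list = pickle.load(fin)
for span_list, antecedent_list in visualize_list[:3]:  # first few documents
    print('*****New Doc*****')
    for span, antecedent in zip(span_list, antecedent_list):
        print('%s ---> %s' % (span, antecedent))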
initial_time = time.time()
while True:
    tf_loss, tf_global_step, _ = session.run([model.loss, model.global_step, model.train_op])
    accumulated_loss += tf_loss

    if tf_global_step == 1 or tf_global_step % report_frequency == 0:
        total_time = time.time() - initial_time
        steps_per_second = tf_global_step / total_time
        average_loss = accumulated_loss / report_frequency
        print(f"[{tf_global_step}] loss={average_loss:.4f}, steps/s={steps_per_second:.2f}")
        writer.add_summary(util.make_summary({"loss": average_loss}), tf_global_step)
        accumulated_loss = 0.0

    if tf_global_step == 1 or tf_global_step % eval_frequency == 0:
        eval_summary, eval_f1 = model.evaluate(session)
        _ = session.run(model.update_max_f1)
        saver.save(session, os.path.join(log_dir, "model"), global_step=tf_global_step)
        if eval_f1 > max_f1:
            max_f1 = eval_f1
            util.copy_checkpoint(
                os.path.join(log_dir, "model-{}".format(tf_global_step)),
                os.path.join(log_dir, "model.max.ckpt"))
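# util.copy_checkpoint, used above to preserve the best model under a stable
# name, only needs to duplicate the checkpoint's shards. A minimal sketch,
# assuming single-shard TensorFlow V2 checkpoints (the real helper may cover
# more files, e.g. the .meta graph):
import shutil

def copy_checkpoint_sketch(source, target):
    for ext in (".index", ".data-00000-of-00001"):
        shutil.copyfile(source + ext, target + ext)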
def evaluate(self, session, official_stdout=False):
    # self.load_eval_data()
    with open(self.config["inv_mapping"], 'rb') as handle:
        inv_mapping = pickle.load(handle)
    with open(self.config["eval_path"], 'rb') as handle:
        test = pickle.load(handle)
    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    swag_predictions = []
    swag_labels = []
    for i in range(len(test)):
        # Skip examples that are known to break evaluation.
        if i == 191 or i == 217 or i == 225:
            continue
        example = test[i]
        file_name = example["doc_key"]
        inv_map = inv_mapping[file_name]
        tensorized_example = self.tensorize_example(example, i, is_training=False)
        feed_dict = {i: t for i, t in zip(self.input_tensors, tensorized_example)}
        lee_predictions, swag_pred = session.run([self.predictions2, self.swag_predictions], feed_dict=feed_dict)
        _, _, _, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores = lee_predictions
        # Map span indices back to the original tokenization.
        top_span_starts = inv_map[top_span_starts]
        top_span_ends = inv_map[top_span_ends]
        predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
        coref_predictions[file_name] = self.evaluate_coref(
            top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)
        # SWAG evaluation.
        swag_label = tensorized_example[-1]
        swag_predictions.append(swag_pred[0])
        swag_labels.append(swag_label[0])
        if i % 10 == 0:
            print("Evaluated {}/{} examples.".format(i + 1, len(test)))
    # All predictions collected; now score them.
    summary_dict = {}
    try:
        conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, official_stdout)
        average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
        summary_dict["Average F1 (conll)"] = average_f1
        print("Average F1 (conll): {:.2f}%".format(average_f1))
    except Exception:
        print("Unstable CoNLL results; skipping the official scorer.")
        average_f1 = 0
    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}%".format(f * 100))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))
    print("Now evaluating SWAG")
    swag_accuracy = self.swag_evaluation(swag_predictions, swag_labels)
    print("Average SWAG accuracy is: {:.2f}%".format(swag_accuracy * 100))
    return util.make_summary(summary_dict), average_f1, swag_accuracy
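# swag_evaluation above reduces the collected predictions and labels to an
# accuracy. A hypothetical sketch of that reduction, assuming swag_pred[0]
# is already an argmax'ed class index rather than a logit vector:
import numpy as np

def swag_accuracy_sketch(swag_predictions, swag_labels):
    preds = np.asarray(swag_predictions)
    labels = np.asarray(swag_labels)
    return float(np.mean(preds == labels))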