Example #1
    def train():
        global max_f1
        with tf.Session(config=util.gpu_config()) as session:
            session.run(tf.global_variables_initializer())
            model.start_enqueue_thread(session)
            accumulated_loss = 0.0

            ckpt = tf.train.get_checkpoint_state(log_dir)
            if ckpt and ckpt.model_checkpoint_path:
                print("Restoring from: {}".format(ckpt.model_checkpoint_path))
                saver.restore(session, ckpt.model_checkpoint_path)

            initial_time = time.time()

            while True:
                tf_loss, tf_global_step, _ = session.run(
                    [model.loss, model.global_step, model.train_op])
                accumulated_loss += tf_loss

                if tf_global_step % report_frequency == 0:
                    total_time = time.time() - initial_time
                    steps_per_second = tf_global_step / total_time

                    average_loss = accumulated_loss / report_frequency
                    print("[{}] loss={:.2f}, steps/s={:.2f}".format(
                        tf_global_step, average_loss, steps_per_second))
                    writer.add_summary(
                        util.make_summary({"loss": average_loss}),
                        tf_global_step)
                    accumulated_loss = 0.0

                if tf_global_step % save_frequency == 0:
                    saver.save(session,
                               os.path.join(log_dir, "model"),
                               global_step=tf_global_step)

                if tf_global_step % eval_frequency == 0:
                    eval_summary, eval_f1 = model.evaluate(session)

                    if eval_f1 > max_f1:
                        max_f1 = eval_f1
                        util.copy_checkpoint(
                            os.path.join(log_dir,
                                         "model-{}".format(tf_global_step)),
                            os.path.join(log_dir, "model.max.ckpt"))

                    writer.add_summary(eval_summary, tf_global_step)
                    writer.add_summary(
                        util.make_summary({"max_eval_f1": max_f1}),
                        tf_global_step)

                    print("[{}] evaL_f1={:.2f}, max_f1={:.2f}".format(
                        tf_global_step, eval_f1, max_f1))
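
Note on util.make_summary: the snippets in this collection log scalars through util.make_summary and writer.add_summary, but the helper itself is never shown. A minimal sketch of what such a helper typically looks like under the TF1 API used here (an assumption, not the source's implementation):

import tensorflow as tf

def make_summary(value_dict):
    # Wrap each scalar in a tf.Summary proto so it can be passed to
    # tf.summary.FileWriter.add_summary(summary, global_step), as done above.
    return tf.Summary(value=[
        tf.Summary.Value(tag=k, simple_value=float(v))
        for k, v in value_dict.items()
    ])
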
Example #2
  def evaluate(self, session, official_stdout=False):
    self.load_eval_data()

    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()

    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
      _, _, _, _, _, _, _, _, _, gold_starts, gold_ends, _ = tensorized_example
      feed_dict = {i:t for i,t in zip(self.input_tensors, tensorized_example)}
      candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores = session.run(self.predictions, feed_dict=feed_dict)
      predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
      coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)
      if example_num % 10 == 0:
        print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

    summary_dict = {}
    conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, official_stdout)
    average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
    summary_dict["Average F1 (conll)"] = average_f1
    print("Average F1 (conll): {:.2f}%".format(average_f1))

    p,r,f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}%".format(f * 100))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))

    return util.make_summary(summary_dict), average_f1
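
Most of the evaluate() variants in this collection decode antecedents with self.get_predicted_antecedents, which is not reproduced here. A plausible sketch, assuming the common convention that column 0 of top_antecedent_scores is the dummy "no antecedent" option:

import numpy as np

def get_predicted_antecedents(antecedents, antecedent_scores):
    # antecedent_scores: (k, c + 1) with the dummy score in column 0.
    # antecedents: (k, c) candidate antecedent indices for each of the k spans.
    predicted_antecedents = []
    for i, index in enumerate(np.argmax(antecedent_scores, axis=1) - 1):
        # index < 0 means the dummy won: the span starts its own cluster.
        predicted_antecedents.append(-1 if index < 0 else antecedents[i, index])
    return predicted_antecedents
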
Example #3
  def evaluate(self, session, official_stdout=False):
    self.load_eval_data()

    def _k_to_tag(k):
      if k == -3:
        return "oracle"
      elif k == -2:
        return "actual"
      elif k == -1:
        return "exact"
      elif k == 0:
        return "threshold"
      else:
        return "{}%".format(k)
    mention_evaluators = { k:util.RetrievalEvaluator() for k in [-3, -2, -1, 0, 10, 15, 20, 25, 30, 40, 50] }

    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()

    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
      _, _, _, _, _, _, gold_starts, gold_ends, _, tag_labels, tag_seq, tag_loss_label = tensorized_example
      feed_dict = {i:t for i,t in zip(self.input_tensors, tensorized_example)}
      candidate_starts, candidate_ends, mention_scores, mention_starts, mention_ends, antecedents, antecedent_scores, tag_outputs, tag_seq = session.run(self.predictions, feed_dict=feed_dict)

      self.evaluate_mentions(candidate_starts, candidate_ends, mention_starts, mention_ends, mention_scores, gold_starts, gold_ends, example, mention_evaluators)
      predicted_antecedents = self.get_predicted_antecedents(antecedents, antecedent_scores)

      coref_predictions[example["doc_key"]] = self.evaluate_coref(mention_starts, mention_ends, predicted_antecedents, example["clusters"], coref_evaluator)

      if example_num % 10 == 0:
        print "Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data))
        # print tag_outputs
        # print tag_seq

    summary_dict = {}
    for k, evaluator in sorted(mention_evaluators.items(), key=operator.itemgetter(0)):
      tags = ["{} @ {}".format(t, _k_to_tag(k)) for t in ("R", "P", "F")]
      results_to_print = []
      for t, v in zip(tags, evaluator.metrics()):
        results_to_print.append("{:<10}: {:.2f}".format(t, v))
        summary_dict[t] = v
      print ", ".join(results_to_print)

    conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, official_stdout)
    average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
    summary_dict["Average F1 (conll)"] = average_f1
    print "Average F1 (conll): {:.2f}%".format(average_f1)

    p,r,f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print "Average F1 (py): {:.2f}%".format(f * 100)
    summary_dict["Average precision (py)"] = p
    print "Average precision (py): {:.2f}%".format(p * 100)
    summary_dict["Average recall (py)"] = r
    print "Average recall (py): {:.2f}%".format(r * 100)

    return util.make_summary(summary_dict), average_f1
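
The _k_to_tag helper above (repeated in Example #5 and Example #17) only maps the evaluators' sentinel k values to human-readable labels. A compact equivalent and the tags it produces:

def _k_to_tag(k):
    return {-3: "oracle", -2: "actual", -1: "exact", 0: "threshold"}.get(k, "{}%".format(k))

print([_k_to_tag(k) for k in [-3, -2, -1, 0, 10, 15, 20, 25, 30, 40, 50]])
# ['oracle', 'actual', 'exact', 'threshold', '10%', '15%', '20%', '25%', '30%', '40%', '50%']
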
Example #4
def main_csv_reader(args):
    path_to_coffee = args.path_to_coffee
    path_to_matched = args.matched_json
    all_people_list = flat_list(list(read_csv_file(path_to_coffee)))
    matched_in_this_session = []
    error = False

    if path_to_matched:
        try:
            matched_people_json = read_json_file(path_to_matched)
            tuple_list = create_tuple_list(all_people_list,
                                           matched_people_json)
            sorted_people_list = sort_tuple_list(tuple_list)
        except Exception:
            raise ValueError(
                'Only use the program generated matched_people.json file')
    else:
        write_json_file()
        matched_people_json = read_json_file('matched_people.json')
        sorted_people_list = all_people_list

    unmatched_people = []

    for person in sorted_people_list:
        if person not in matched_in_this_session:
            individual_match_list = invidual_preproc(person, all_people_list,
                                                     matched_people_json,
                                                     matched_in_this_session)
            if individual_match_list:
                matched_pair = coffee_roulette(person, individual_match_list)
                if matched_pair is not None:
                    for person in matched_pair:
                        matched_in_this_session.append(person)
                else:
                    error = True
                    break
            else:
                unmatched_people.append(person)
        else:
            pass

    if error is False:
        create_today_matched(matched_in_this_session)
        if unmatched_people:
            create_today_unmatched(unmatched_people)

        updated_json = update_current_json(matched_people_json,
                                           matched_in_this_session)
        summary = "\n{} Matches".format(date.today())
        summary = create_matched_people_string(matched_in_this_session,
                                               summary)
        summary_message, alone = make_summary(matched_in_this_session,
                                              unmatched_people, summary, "")
        summary += alone
        write_json_file(updated_json)
        write_txt_file(summary)
        print(summary_message)
Example #5
  def evaluate(self, session, official_stdout=False):
    self.load_eval_data()

    def _k_to_tag(k):
      if k == -3:
        return "oracle"
      elif k == -2:
        return "actual"
      elif k == -1:
        return "exact"
      elif k == 0:
        return "threshold"
      else:
        return "{}%".format(k)
    mention_evaluators = { k:util.RetrievalEvaluator() for k in [-3, -2, -1, 0, 10, 15, 20, 25, 30, 40, 50] }

    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()

    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
      _, _, _, _, _, _, gold_starts, gold_ends, _ = tensorized_example
      feed_dict = {i:t for i,t in zip(self.input_tensors, tensorized_example)}
      candidate_starts, candidate_ends, mention_scores, mention_starts, mention_ends, antecedents, antecedent_scores = session.run(self.predictions, feed_dict=feed_dict)

      self.evaluate_mentions(candidate_starts, candidate_ends, mention_starts, mention_ends, mention_scores, gold_starts, gold_ends, example, mention_evaluators)
      predicted_antecedents = self.get_predicted_antecedents(antecedents, antecedent_scores)

      coref_predictions[example["doc_key"]] = self.evaluate_coref(mention_starts, mention_ends, predicted_antecedents, example["clusters"], coref_evaluator)

      if example_num % 10 == 0:
        print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

    summary_dict = {}
    for k, evaluator in sorted(mention_evaluators.items(), key=operator.itemgetter(0)):
      tags = ["{} @ {}".format(t, _k_to_tag(k)) for t in ("R", "P", "F")]
      results_to_print = []
      for t, v in zip(tags, evaluator.metrics()):
        results_to_print.append("{:<10}: {:.2f}".format(t, v))
        summary_dict[t] = v
      print(", ".join(results_to_print))

    conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, official_stdout)
    average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
    summary_dict["Average F1 (conll)"] = average_f1
    print("Average F1 (conll): {:.2f}%".format(average_f1))

    p,r,f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}%".format(f * 100))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))

    return util.make_summary(summary_dict), average_f1
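
Most snippets funnel predicted clusters into metrics.CorefEvaluator() and read back get_prf(), but the class is not part of this collection. As a rough sketch only (an assumption about its structure, not the source's code, with the update() signature taken from the call in Example #17), it wraps several cluster-level metrics and averages their precision/recall/F1:

class CorefEvaluator(object):
    """Sketch: wraps several metric evaluators (e.g. MUC, B-cubed, CEAF-e)
    and reports their averaged precision/recall/F1."""

    def __init__(self, evaluators):
        self.evaluators = evaluators  # each exposes update(...) and get_prf()

    def update(self, predicted_clusters, gold_clusters,
               mention_to_predicted, mention_to_gold):
        for e in self.evaluators:
            e.update(predicted_clusters, gold_clusters,
                     mention_to_predicted, mention_to_gold)

    def get_prf(self):
        # Average (precision, recall, f1) across the wrapped metrics.
        prfs = [e.get_prf() for e in self.evaluators]
        return tuple(sum(vals) / len(prfs) for vals in zip(*prfs))
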
Example #6
    def evaluate(self, session, official_stdout=False, eval_mode=False):
        self.load_eval_data()
        coref_predictions = {}

        for example_num, (tensorized_example,
                          example) in enumerate(self.eval_data):
            _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example
            feed_dict = {
                i: t
                for i, t in zip(self.input_tensors, tensorized_example)
            }
            candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends, \
                top_antecedents, top_antecedent_scores = session.run(self.predictions, feed_dict=feed_dict)
            """
            candidate_starts: (num_words, max_span_width) 所有候选span的start
            candidate_ends: (num_words, max_span_width) 所有候选span的end
            candidate_mention_scores: (num_candidates,) 候选答案的得分
            top_span_starts: (k, ) 筛选过mention之后的候选的start_index
            top_span_ends: (k, ) 筛选过mention之后的候选的end_index
            top_antecedents: (k, c) 粗筛过antecedent之后的每个候选antecedent的index
            top_antecedent_scores: (k, c) 粗筛过antecedent之后的每个候选antecedent的score
            """
            predicted_antecedents = self.get_predicted_antecedents(
                top_antecedents, top_antecedent_scores)
            coref_predictions[example["doc_key"]] = self.evaluate_coref(
                top_span_starts, top_span_ends, predicted_antecedents,
                example["clusters"])
            if (example_num + 1) % 100 == 0:
                print("Evaluated {}/{} examples.".format(
                    example_num + 1, len(self.eval_data)))

        summary_dict = {}
        if eval_mode:  # on the test set, also re-score with the official CoNLL scripts
            conll_results = conll.evaluate_conll(
                self.config["conll_eval_path"], coref_predictions,
                self.subtoken_maps, official_stdout)
            average_f1 = sum(
                results["f"]
                for results in conll_results.values()) / len(conll_results)
            summary_dict["Average F1 (conll)"] = average_f1
            print("Average F1 (conll): {:.2f}%".format(average_f1))

        p, r, f = self.coref_evaluator.get_prf()
        summary_dict["Average F1 (py)"] = f
        print("Average F1 (py): {:.2f}% on {} docs".format(
            f * 100, len(self.eval_data)))
        summary_dict["Average precision (py)"] = p
        print("Average precision (py): {:.2f}%".format(p * 100))
        summary_dict["Average recall (py)"] = r
        print("Average recall (py): {:.2f}%".format(r * 100))

        return util.make_summary(summary_dict), f
Example #7
  def evaluate(self, session, official_stdout=False):
    self.load_eval_data()

    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()

    avg_loss = 0.0
    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
      _, _, _, _, _, _, _, _, _, gold_starts, gold_ends, _, _, _, _ = tensorized_example
      feed_dict = {i:t for i,t in zip(self.input_tensors, tensorized_example)}
      
      predictions, loss = session.run([self.predictions, self.loss], feed_dict=feed_dict)
      candidate_starts, candidate_ends, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores, _, _ = predictions

      predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
      coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)

      if example_num % 20 == 0:
        print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))
      avg_loss += loss

    avg_loss = avg_loss / len(self.eval_data)
    summary_dict = {}
    conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, official_stdout)

    cluster_result = {'prediction':coref_predictions, 'gold':official_stdout}

    with open('evaluate_result.pickle', 'wb') as handle:
      pickle.dump(cluster_result, handle, protocol=pickle.HIGHEST_PROTOCOL)

    average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
    summary_dict["Average F1 (conll)"] = average_f1
    print("Average F1 (conll): {:.2f}%".format(average_f1))

    p,r,f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}%".format(f * 100))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))
    summary_dict["Validation loss"] = avg_loss
    print("Validation loss: {:.3f}".format(avg_loss))
    

    return util.make_summary(summary_dict), average_f1, avg_loss
Example #8
    def evaluate(self, session, official_stdout=False):
        with open(self.config["inv_mapping"], 'rb') as handle:
            inv_mapping = pickle.load(handle)
        with open(self.config["eval_path"], 'rb') as handle:
            test = pickle.load(handle)

        coref_predictions = {}
        coref_evaluator = metrics.CorefEvaluator()
        for i in range(len(test)):
            example = test[i]
            file_name = example["doc_key"]
            inv_map = inv_mapping[file_name]
            tensorized_example = self.tensorize_example(example,
                                                        is_training=False)
            feed_dict = dict(zip(self.input_tensors, tensorized_example))
            _, _, _, _, _, _, _, _, _, gold_starts, gold_ends, _, _, _ = tensorized_example
            # feed_dict = {i:t for i,t in zip(self.input_tensors, tensorized_example)}
            candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores = session.run(
                self.predictions, feed_dict=feed_dict)
            top_span_starts = inv_map[top_span_starts]
            top_span_ends = inv_map[top_span_ends]
            predicted_antecedents = self.get_predicted_antecedents(
                top_antecedents, top_antecedent_scores)
            coref_predictions[file_name] = self.evaluate_coref(
                top_span_starts, top_span_ends, predicted_antecedents,
                example["clusters"], coref_evaluator)
            if i % 10 == 0:
                print("Evaluated {}/{} examples.".format(i + 1, len(test)))
        summary_dict = {}
        conll_results = conll.evaluate_conll(self.config["conll_eval_path"],
                                             coref_predictions,
                                             official_stdout)
        average_f1 = sum(
            results["f"]
            for results in conll_results.values()) / len(conll_results)
        summary_dict["Average F1 (conll)"] = average_f1
        print("Average F1 (conll): {:.2f}%".format(average_f1))

        p, r, f = coref_evaluator.get_prf()
        summary_dict["Average F1 (py)"] = f
        print("Average F1 (py): {:.2f}%".format(f * 100))
        summary_dict["Average precision (py)"] = p
        print("Average precision (py): {:.2f}%".format(p * 100))
        summary_dict["Average recall (py)"] = r
        print("Average recall (py): {:.2f}%".format(r * 100))
        return util.make_summary(summary_dict), average_f1
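
Example #8 maps subtoken-level span indices back to original token positions with inv_map[top_span_starts]. A toy illustration of that fancy-indexing step, assuming inv_mapping stores one integer numpy array per document (values made up):

import numpy as np

inv_map = np.array([0, 0, 1, 2, 2, 3])   # position i -> original token index
top_span_starts = np.array([2, 4])
top_span_ends = np.array([3, 5])
print(inv_map[top_span_starts], inv_map[top_span_ends])  # [1 2] [2 3]
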
Example #9
    def evaluate_mention_proposal(self,
                                  session,
                                  official_stdout=False,
                                  eval_mode=False):
        self.load_eval_data()
        summary_dict = {}
        tp = 0
        fp = 0
        fn = 0
        epsilon = 1e-10
        for example_num, (tensorized_example,
                          example) in enumerate(self.eval_data):
            _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example
            feed_dict = {
                i: t
                for i, t in zip(self.input_tensors, tensorized_example)
            }
            pred_labels, gold_labels = session.run(
                [self.pred_mention_labels, self.gold_mention_labels],
                feed_dict=feed_dict)

            tp += np.logical_and(pred_labels, gold_labels).sum()
            fp += np.logical_and(pred_labels,
                                 np.logical_not(gold_labels)).sum()
            fn += np.logical_and(np.logical_not(pred_labels),
                                 gold_labels).sum()

            if (example_num + 1) % 100 == 0:
                print("Evaluated {}/{} examples.".format(
                    example_num + 1, len(self.eval_data)))

        p = tp / (tp + fp + epsilon)
        r = tp / (tp + fn + epsilon)
        f = 2 * p * r / (p + r + epsilon)
        summary_dict["Average F1 (py)"] = f
        print("Average F1 (py): {:.2f}% on {} docs".format(
            f * 100, len(self.eval_data)))
        summary_dict["Average precision (py)"] = p
        print("Average precision (py): {:.2f}%".format(p * 100))
        summary_dict["Average recall (py)"] = r
        print("Average recall (py): {:.2f}%".format(r * 100))

        return util.make_summary(summary_dict), f
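
A quick numeric check of the epsilon-smoothed precision/recall/F1 arithmetic used above (the counts are illustrative, not taken from the source):

tp, fp, fn, epsilon = 90, 10, 30, 1e-10
p = tp / (tp + fp + epsilon)       # ~0.90
r = tp / (tp + fn + epsilon)       # ~0.75
f = 2 * p * r / (p + r + epsilon)  # ~0.82
print("P={:.2f} R={:.2f} F1={:.2f}".format(p, r, f))
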
Example #10
    def evaluate(self, session):
        self.load_eval_data()

        tp, fn, fp = 0, 0, 0

        for example_num, (tensorized_example,
                          example) in enumerate(self.eval_data):
            feed_dict = {
                i: t
                for i, t in zip(self.input_tensors, tensorized_example)
            }
            top_span_starts, top_span_ends = session.run(self.predictions,
                                                         feed_dict=feed_dict)

            gold_mentions = set([(m[0], m[1]) for cl in example["clusters"]
                                 for m in cl])
            pred_mentions = set([
                (s, e) for s, e in zip(top_span_starts, top_span_ends)
            ])

            tp += len(gold_mentions & pred_mentions)
            fn += len(gold_mentions - pred_mentions)
            fp += len(pred_mentions - gold_mentions)

            if example_num % 10 == 0:
                print("Evaluated {}/{} examples.".format(
                    example_num + 1, len(self.eval_data)))

        m_r = float(tp) / (tp + fn)
        m_p = float(tp) / (tp + fp)
        m_f1 = 2.0 * m_r * m_p / (m_r + m_p)

        print("Mention F1: {:.2f}%".format(m_f1 * 100))
        print("Mention recall: {:.2f}%".format(m_r * 100))
        print("Mention precision: {:.2f}%".format(m_p * 100))

        summary_dict = {}
        summary_dict["Mention F1"] = m_f1
        summary_dict["Mention recall"] = m_r
        summary_dict["Mention precision"] = m_p

        return util.make_summary(summary_dict), m_r
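
The mention scoring above is plain set arithmetic over (start, end) pairs; a toy walk-through with made-up spans:

gold_mentions = {(0, 1), (4, 6), (9, 9)}
pred_mentions = {(0, 1), (4, 5), (9, 9)}
tp = len(gold_mentions & pred_mentions)  # 2 exact matches
fn = len(gold_mentions - pred_mentions)  # 1 gold span missed
fp = len(pred_mentions - gold_mentions)  # 1 spurious prediction
print(tp, fn, fp)                        # 2 1 1
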
Example #11
            # print "use_gpu", use_gpu
            accumulated_loss += tf_loss
            acc_total_loss += total_loss
            acc_domain_loss += domain_loss
            acc_dmrm += domain_loss_reduce_mean

            if tf_global_step % report_frequency == 0:
                total_time = time.time() - initial_time
                steps_per_second = tf_global_step / total_time

                average_loss = accumulated_loss / report_frequency
                print "[{}] loss={:.2f}, steps/s={:.2f}".format(
                    tf_global_step, average_loss, steps_per_second)
                writer.add_summary(
                    util.make_summary({"original loss": average_loss}),
                    tf_global_step)
                accumulated_loss = 0.0

                average_domain_loss = acc_domain_loss / report_frequency
                print "[{}] domain_loss={:.2f}, steps/s={:.2f}".format(
                    tf_global_step, average_domain_loss, steps_per_second)
                writer.add_summary(
                    util.make_summary({"domain loss": average_domain_loss}),
                    tf_global_step)
                acc_domain_loss = 0.0

                average_domain_loss_rm = acc_dmrm / report_frequency
                print "[{}] domain_loss_reduce_mean={:.2f}, steps/s={:.2f}".format(
                    tf_global_step, average_domain_loss_rm, steps_per_second)
                writer.add_summary(
Example #12
                    R = (reward_val * j) + 0.99 * R
                    pg_reward[i][j] = R

            feed_dict[model.pg_reward] = pg_reward + eps
            tf_loss, tf_global_step, _ = session.run([model.loss, model.global_step, model.train_op],
                                                     feed_dict=feed_dict)
            accumulated_loss += tf_loss

            if tf_global_step % report_frequency == 0:
                total_time = time.time() - initial_time
                steps_per_second = tf_global_step / total_time

                average_loss = accumulated_loss / report_frequency
                print("[{}] loss={:.2f}, steps/s={:.2f}".format(tf_global_step,
                                                                average_loss, steps_per_second))
                writer.add_summary(util.make_summary(
                    {"loss": average_loss}), tf_global_step)
                accumulated_loss = 0.0

            if tf_global_step % eval_frequency == 0:
                eval_frequency = report_frequency = np.random.randint(1, 11)
                saver.save(session, os.path.join(log_dir, "model"),
                           global_step=tf_global_step)
                try:
                    eval_summary, eval_f1 = model.evaluate(session)
                except:
                    # most time is spent here. so there is a high chance that
                    # the timeout exception from reward computation is caught here
                    eval_summary, eval_f1 = model.evaluate(session)

                if eval_f1 > max_f1:
                    max_f1 = eval_f1
Example #13
def main():
    config = util.initialize_from_env()

    report_frequency = config["report_frequency"]
    eval_frequency = config["eval_frequency"]

    model = util.get_model(config)
    saver = tf.train.Saver()

    log_dir = config["log_dir"]
    max_steps = config['num_epochs'] * config['num_docs']
    writer = tf.summary.FileWriter(log_dir, flush_secs=20)

    max_f1 = 0
    mode = 'w'

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        model.start_enqueue_thread(session)
        accumulated_loss = 0.0

        initial_step = 0
        ckpt = tf.train.get_checkpoint_state(log_dir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Restoring from: {}".format(ckpt.model_checkpoint_path))
            saver.restore(session, ckpt.model_checkpoint_path)
            mode = 'a'
            initial_step = int(
                os.path.basename(ckpt.model_checkpoint_path).split('-')[1])
        fh = logging.FileHandler(os.path.join(log_dir, 'stdout.log'),
                                 mode=mode)
        fh.setFormatter(logging.Formatter(format))
        logger.addHandler(fh)

        initial_time = time.time()
        while True:
            tf_loss, tf_global_step, _ = session.run(
                [model.loss, model.global_step, model.train_op])
            accumulated_loss += tf_loss
            # print('tf global_step', tf_global_step)

            if tf_global_step % report_frequency == 0:
                steps_per_second = (tf_global_step - initial_step) / (
                    time.time() - initial_time)

                average_loss = accumulated_loss / report_frequency
                logger.info("[{}] loss={:.2f}, steps/s={:.2f}".format(
                    tf_global_step, average_loss, steps_per_second))
                writer.add_summary(util.make_summary({"loss": average_loss}),
                                   tf_global_step)
                accumulated_loss = 0.0

            if tf_global_step % eval_frequency == 0:
                eval_summary, eval_f1 = model.evaluate(session)

                if eval_f1 > max_f1:
                    max_f1 = eval_f1
                    saver.save(session,
                               os.path.join(log_dir, "model"),
                               global_step=tf_global_step)
                    util.copy_checkpoint(
                        os.path.join(log_dir,
                                     "model-{}".format(tf_global_step)),
                        os.path.join(log_dir, "model.max.ckpt"))

                writer.add_summary(eval_summary, tf_global_step)
                writer.add_summary(util.make_summary({"max_eval_f1": max_f1}),
                                   tf_global_step)

                logger.info("[{}] evaL_f1={:.4f}, max_f1={:.4f}".format(
                    tf_global_step, eval_f1, max_f1))
                if tf_global_step > max_steps:
                    break
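
util.copy_checkpoint, used here and in Examples #1, #19 and #22 to keep a copy of the best-scoring weights, is also not shown. A minimal sketch, assuming the usual TF1 checkpoint file suffixes:

import shutil

def copy_checkpoint(source, target):
    # A TF1 checkpoint is a set of files sharing a prefix; copy the pieces
    # needed to restore the variables, e.g. into "model.max.ckpt".
    for ext in (".index", ".data-00000-of-00001"):
        shutil.copyfile(source + ext, target + ext)
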
Example #14
            # results on the training data
            if predict > config["result_metric"]:
                pred.append(1)
            else:
                pred.append(0)
            true.append(label)

            if tf_global_step % report_frequency == 0:
                total_time = time.time() - initial_time
                steps_per_second = tf_global_step / total_time

                average_loss = accumulated_loss / report_frequency
                print("[{}] loss={:.2f}, steps/s={:.2f}".format(
                    tf_global_step, average_loss, steps_per_second))
                writer.add_summary(util.make_summary({"loss": average_loss}),
                                   tf_global_step)

                accumulated_loss = 0.0

            if tf_global_step % eval_frequency == 0:

                # evaluation results on the training set
                train_accuracy = metrics.accuracy_score(true, pred)
                train_precision_macro = metrics.precision_score(
                    true, pred, average='macro')
                train_recall_macro = metrics.recall_score(true,
                                                          pred,
                                                          average='macro')
                train_f = metrics.f1_score(true, pred, average='macro')
                summary_dict = {}
Example #15
  def evaluate(self, session, global_step=None, official_stdout=False, keys=None, eval_mode=False):
    self.load_eval_data()

    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    losses = []
    doc_keys = []
    num_evaluated = 0


    ##################################################################################################
    ################### WE FURTHER REPORT THE RESULTS SEPARATELY FOR P-P, NP-NP, P-NP ################
    ##################################################################################################
    coref_predictions_pp = {}
    coref_predictions_pnp = {}
    coref_predictions_npnp = {}

    # span type
    coref_evaluator_pp = PairEvaluator()
    coref_evaluator_pnp = PairEvaluator()
    coref_evaluator_npnp = PairEvaluator()
    coref_evaluator_all = PairEvaluator()

    num_coref_pp = 0
    num_coref_pnp = 0
    num_coref_npnp = 0
    num_coref_all = 0

    # span freq
    coref_evaluator_freq = PairEvaluator()
    coref_evaluator_rare = PairEvaluator()
    
    num_coref_freq = 0
    num_coref_rare = 0

    # pron type
    coref_evaluators_type = dict()
    coref_evaluators_type["demo"], coref_evaluators_type["pos"], coref_evaluators_type["third"] = PairEvaluator(), PairEvaluator(), PairEvaluator()
    nums_coref_type = dict()
    nums_coref_type["demo"], nums_coref_type["pos"], nums_coref_type["third"] = 0, 0, 0

    count = 0 

    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
      try:
        # count += 1
        # if count == 10:
        #   break
        _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example
        feed_dict = {i:t for i,t in zip(self.input_tensors, tensorized_example)}
        # if tensorized_example[0].shape[0] <= 9:
        if keys is not None and example['doc_key'] not in keys:
          # print('Skipping...', example['doc_key'], tensorized_example[0].shape)
          continue
        doc_keys.append(example['doc_key'])
        loss, (candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores) = session.run([self.loss, self.predictions], feed_dict=feed_dict)
        # losses.append(session.run(self.loss, feed_dict=feed_dict))
        losses.append(loss)
        predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)

        coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)

        if example_num % 10 == 0:
          print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

        #####################################################################################
        # Evaluate on three different settings: NP-NP, NP-P, P-P by using a different cluster
        #####################################################################################

        # Span Type
        flatten_sentences = util.flatten(example["sentences"])
        gold_pp_pairs, gold_pnp_pairs, gold_npnp_pairs, num_pp_pairs, num_pnp_pairs, num_npnp_pairs, num_relation = self.cluster_to_pairs(example["clusters"], flatten_sentences)
        # predicted_clusters = coref_predictions[example["doc_key"]]
        pred_pp_pairs, pred_pnp_pairs, pred_npnp_pairs, _, _, _, _ = self.cluster_to_pairs(coref_predictions[example["doc_key"]], flatten_sentences)

        # Span Frequency
        gold_freq_pnp_pairs, gold_rare_pnp_pairs, num_freq_pairs, num_rare_pairs = self.cluster_to_pair_frequent(example["clusters"], flatten_sentences)
        pred_freq_pnp_pairs, pred_rare_pnp_pairs, _, _ = self.cluster_to_pair_frequent(coref_predictions[example["doc_key"]], flatten_sentences)

        # pronoun type demo, pos, third
        gold_type_pairs, gold_type_nums = self.cluster_to_pair_detailed_pronoun(example["clusters"], flatten_sentences)
        pred_type_pairs, pred_type_nums = self.cluster_to_pair_detailed_pronoun(coref_predictions[example["doc_key"]], flatten_sentences)
        
        for pron_type in ["demo", "pos", "third"]:
          coref_evaluators_type[pron_type].update(gold_type_pairs[pron_type], pred_type_pairs[pron_type])
          nums_coref_type[pron_type] += gold_type_nums[pron_type]

        all_gold = gold_pp_pairs + gold_pnp_pairs + gold_npnp_pairs
        all_pred = pred_pp_pairs + pred_pnp_pairs + pred_npnp_pairs

        coref_evaluator_pp.update(pred_pp_pairs, gold_pp_pairs)
        coref_evaluator_pnp.update(pred_pnp_pairs, gold_pnp_pairs)
        coref_evaluator_npnp.update(pred_npnp_pairs, gold_npnp_pairs)
        coref_evaluator_all.update(all_pred, all_gold)

        coref_evaluator_freq.update(pred_freq_pnp_pairs, gold_freq_pnp_pairs)
        coref_evaluator_rare.update(pred_rare_pnp_pairs, gold_rare_pnp_pairs)

        num_coref_pp += num_pp_pairs
        num_coref_pnp += num_pnp_pairs
        num_coref_npnp += num_npnp_pairs
        num_coref_all = num_coref_all + num_pp_pairs + num_pnp_pairs + num_npnp_pairs
        num_coref_freq += num_freq_pairs
        num_coref_rare += num_rare_pairs
      except Exception:
        # Skip documents that fail during evaluation.
        continue

    summary_dict = {}

    self.print_prf(coref_evaluator_pp, summary_dict, doc_keys, "PP", num_coref_pp)
    self.print_prf(coref_evaluator_pnp, summary_dict, doc_keys, "PNP", num_coref_pnp)
    self.print_prf(coref_evaluator_npnp, summary_dict, doc_keys, "NPNP", num_coref_npnp)

    self.print_prf(coref_evaluator_freq, summary_dict, doc_keys, "FREQ", num_coref_freq)
    self.print_prf(coref_evaluator_rare, summary_dict, doc_keys, "RARE", num_coref_rare)

    for pron_type in ["demo", "pos", "third"]:
      self.print_prf(coref_evaluators_type[pron_type], summary_dict, doc_keys, pron_type, nums_coref_type[pron_type])
    
    self.print_prf(coref_evaluator_all, summary_dict, doc_keys, "ALL_PAIRS", num_coref_all)

    #######################################################################################

    # summary_dict = {}

    print("The evaluatoin results for all clusters")
    print("The number of pairs is "+ str(num_coref_all))
    
    p,r,f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(doc_keys)))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))

    if eval_mode:
      conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, self.subtoken_maps, official_stdout)
      average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
      summary_dict["Average F1 (conll)"] = average_f1
      print("Average F1 (conll): {:.2f}%".format(average_f1))


    return util.make_summary(summary_dict), f
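
Example #15 scores pronoun/NP pairs with a PairEvaluator that is not included in this collection. A rough sketch of such an evaluator, assuming pairs are hashable span tuples; note that the snippet above passes (pred, gold) to update() for the span-type evaluators but (gold, pred) for the pronoun-type ones, so treat this signature as illustrative only:

class PairEvaluator(object):
    """Sketch: accumulates pair-level counts and reports precision/recall/F1."""

    def __init__(self):
        self.num_gold = 0
        self.num_pred = 0
        self.num_correct = 0

    def update(self, pred_pairs, gold_pairs):
        gold = set(gold_pairs)
        self.num_gold += len(gold)
        self.num_pred += len(pred_pairs)
        self.num_correct += sum(1 for pair in pred_pairs if pair in gold)

    def get_prf(self):
        p = self.num_correct / self.num_pred if self.num_pred else 0.0
        r = self.num_correct / self.num_gold if self.num_gold else 0.0
        f = 2 * p * r / (p + r) if p + r else 0.0
        return p, r, f
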
Example #16
    def evaluate(self, session, official_stdout=False):
        self.load_eval_data()

        tp, fn, fp = 0, 0, 0
        coref_predictions = {}
        coref_evaluator = metrics.CorefEvaluator()

        for example_num, (tensorized_example,
                          example) in enumerate(self.eval_data):
            _, _, _, _, _, _, _, _, _, gold_starts, gold_ends, _, _, _, _ = tensorized_example
            feed_dict = {
                i: t
                for i, t in zip(self.input_tensors, tensorized_example)
            }
            top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores = session.run(
                self.predictions, feed_dict=feed_dict)
            predicted_antecedents = self.get_predicted_antecedents(
                top_antecedents, top_antecedent_scores)
            coref_predictions[example["doc_key"]] = self.evaluate_coref(
                top_span_starts, top_span_ends, predicted_antecedents,
                example["clusters"], coref_evaluator)

            gold_mentions = set([(s, e) for cl in example["clusters"]
                                 for s, e in cl])
            pred_mentions = set([
                (s, e) for s, e in zip(top_span_starts, top_span_ends)
            ])
            tp += len(gold_mentions & pred_mentions)
            fn += len(gold_mentions - pred_mentions)
            fp += len(pred_mentions - gold_mentions)
            if example_num % 10 == 0:
                print("Evaluated {}/{} examples.".format(
                    example_num + 1, len(self.eval_data)))

        m_r = float(tp) / (tp + fn)
        m_p = float(tp) / (tp + fp)
        m_f1 = 2.0 * m_r * m_p / (m_r + m_p)
        print("Mention F1: {:.2f}%".format(m_f1 * 100))
        print("Mention recall: {:.2f}%".format(m_r * 100))
        print("Mention precision: {:.2f}%".format(m_p * 100))

        summary_dict = {}
        if official_stdout:
            conll_results = conll.evaluate_conll(
                self.config["conll_eval_path"], coref_predictions,
                official_stdout)
            average_f1 = sum(
                results["f"]
                for results in conll_results.values()) / len(conll_results)
            summary_dict["Average F1 (conll)"] = average_f1
            print("Average F1 (conll): {:.2f}%".format(average_f1))

        p, r, f = coref_evaluator.get_prf()
        summary_dict["Average F1 (py)"] = f
        print("Average F1 (py): {:.2f}%".format(f * 100))
        summary_dict["Average precision (py)"] = p
        print("Average precision (py): {:.2f}%".format(p * 100))
        summary_dict["Average recall (py)"] = r
        print("Average recall (py): {:.2f}%".format(r * 100))
        average_f1 = average_f1 if official_stdout else f * 100
        return util.make_summary(summary_dict), average_f1
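
Several snippets average conll_results the same way; this is the result shape that averaging assumes (metric names follow the usual CoNLL-2012 scorer categories, numbers are made up):

conll_results = {
    "muc":   {"p": 80.0, "r": 70.0, "f": 74.7},
    "bcub":  {"p": 70.0, "r": 60.0, "f": 64.6},
    "ceafe": {"p": 65.0, "r": 60.0, "f": 62.4},
}
average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
print("Average F1 (conll): {:.2f}%".format(average_f1))  # 67.23%
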
Example #17
  def evaluate(self, session, data, predictions, loss, official_stdout=False):
    if self.eval_data is None:
      self.eval_data, self.eval_tensors, self.coref_eval_data = data.load_eval_data()

    def _k_to_tag(k):
      if k == -3:
        return "oracle"
      elif k == -2:
        return "actual"
      elif k == -1:
        return "exact"
      elif k == 0:
        return "threshold"
      else:
        return "{}%".format(k)

    # Retrieval evaluators.
    arg_evaluators = { k:util.RetrievalEvaluator() for k in [-3, -2, -1, 30, 40, 50, 80, 100, 120, 150] }
    predicate_evaluators = { k:util.RetrievalEvaluator() for k in [-3, -2, -1, 10, 20, 30, 40, 50, 70] }
    mention_evaluators = { k:util.RetrievalEvaluator() for k in [-3, -2, -1, 10, 20, 30, 40, 50] }
    entity_evaluators = { k:util.RetrievalEvaluator() for k in [-3, -2, -1, 10, 20, 30, 40, 50, 70] }

    total_loss = 0
    total_num_predicates = 0
    total_gold_predicates = 0

    srl_comp_sents = 0
    srl_predictions = []
    ner_predictions = []
    rel_predictions = []
    coref_predictions = {}
    coref_evaluator = coref_metrics.CorefEvaluator()
    all_gold_predicates = []
    all_guessed_predicates = []

    start_time = time.time()
    debug_printer = debug_utils.DebugPrinter()

    # Simple analysis.
    unique_core_role_violations = 0
    continuation_role_violations = 0
    reference_role_violations = 0
    gold_u_violations = 0
    gold_c_violations = 0
    gold_r_violations = 0

    # Global sentence ID.
    rel_sent_id = 0
    srl_sent_id = 0

    for i, doc_tensors in enumerate(self.eval_tensors):
      feed_dict = dict(list(zip(
          data.input_tensors,
          [pad_batch_tensors(doc_tensors, tn) for tn in data.input_names + data.label_names])))
      predict_names = []
      for tn in data.predict_names:
        if tn in predictions:
          predict_names.append(tn)
      predict_tensors = [predictions[tn] for tn in predict_names] + [loss]
      predict_tensors = session.run(predict_tensors, feed_dict=feed_dict)
      predict_dict = dict(list(zip(predict_names + ["loss"], predict_tensors)))

      doc_size = len(doc_tensors)
      doc_example = self.coref_eval_data[i]
      sentences = doc_example["sentences"]
      decoded_predictions = inference_utils.mtl_decode(
          sentences, predict_dict, data.ner_labels_inv, data.rel_labels_inv,
          self.config)

      # Relation extraction.
      if "rel" in decoded_predictions:
        rel_predictions.extend(decoded_predictions["rel"])
        for j in range(len(sentences)):
          sent_example = self.eval_data[rel_sent_id][3]  # sentence, srl, ner, relations
          text_length = len(sentences[j])
          ne = predict_dict["num_entities"][j]
          gold_entities = set([])
          for rel in sent_example:
            gold_entities.update([rel[:2], rel[2:4]])
          srl_eval_utils.evaluate_retrieval(
              predict_dict["candidate_starts"][j], predict_dict["candidate_ends"][j],
              predict_dict["candidate_entity_scores"][j], predict_dict["entity_starts"][j][:ne],
              predict_dict["entity_ends"][j][:ne], gold_entities, text_length, entity_evaluators)
          rel_sent_id += 1


      if "ner" in decoded_predictions:
        ner_predictions.extend(decoded_predictions["ner"])

      if "predicted_clusters" in decoded_predictions:
        gold_clusters = [tuple(tuple(m) for m in gc) for gc in doc_example["clusters"]]
        gold_mentions = set([])
        mention_to_gold = {}
        for gc in gold_clusters:
          for mention in gc:
            mention_to_gold[mention] = gc
            gold_mentions.add(mention)
        coref_evaluator.update(decoded_predictions["predicted_clusters"], gold_clusters, decoded_predictions["mention_to_predicted"],
                               mention_to_gold)
        coref_predictions[doc_example["doc_key"]] = decoded_predictions["predicted_clusters"]
        
        # Evaluate retrieval.
        doc_text_length = sum([len(s) for s in sentences])
        srl_eval_utils.evaluate_retrieval(
            predict_dict["candidate_mention_starts"], predict_dict["candidate_mention_ends"],
            predict_dict["candidate_mention_scores"], predict_dict["mention_starts"], predict_dict["mention_ends"],
            gold_mentions, doc_text_length, mention_evaluators)

      total_loss += predict_dict["loss"]
      if (i + 1) % 50 == 0:
        print(("Evaluated {}/{} documents.".format(i + 1, len(self.coref_eval_data))))

    debug_printer.close()
    summary_dict = {}
    task_to_f1 = {}  # From task name to F1.
    elapsed_time = time.time() - start_time

    sentences, gold_srl, gold_ner, gold_relations = list(zip(*self.eval_data))

    # Summarize results.
    if self.config["relation_weight"] > 0:
      precision, recall, f1 = (
          srl_eval_utils.compute_relation_f1(sentences, gold_relations, rel_predictions))
      task_to_f1["relations"] = f1
      summary_dict["Relation F1"] = f1
      summary_dict["Relation precision"] = precision
      summary_dict["Relation recall"] = recall
      for k, evaluator in sorted(list(entity_evaluators.items()), key=operator.itemgetter(0)):
        tags = ["{} {} @ {}".format("Entities", t, _k_to_tag(k)) for t in ("R", "P", "F")]
        results_to_print = []
        for t, v in zip(tags, evaluator.metrics()):
          results_to_print.append("{:<10}: {:.4f}".format(t, v))
          summary_dict[t] = v
        print(", ".join(results_to_print))
  

    if self.config["ner_weight"] > 0:
      ner_precision, ner_recall, ner_f1, ul_ner_prec, ul_ner_recall, ul_ner_f1, ner_label_mat = (
          srl_eval_utils.compute_span_f1(gold_ner, ner_predictions, "NER"))
      summary_dict["NER F1"] = ner_f1
      summary_dict["NER precision"] = ner_precision
      summary_dict["NER recall"] = ner_recall
      summary_dict["Unlabeled NER F1"] = ul_ner_f1
      summary_dict["Unlabeled NER precision"] = ul_ner_prec
      summary_dict["Unlabeled NER recall"] = ul_ner_recall

      # Write NER prediction to IOB format and run official eval script.
      srl_eval_utils.print_to_iob2(sentences, gold_ner, ner_predictions, self.config["ner_conll_eval_path"])
      task_to_f1["ner"] = ner_f1
      #for label_pair, freq in ner_label_mat.most_common():
      #  if label_pair[0] != label_pair[1] and freq > 10:
      #    print ("{}\t{}\t{}".format(label_pair[0], label_pair[1], freq))


    if self.config["coref_weight"] > 0:
      #conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, official_stdout)
      #coref_conll_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
      #summary_dict["Average F1 (conll)"] = coref_conll_f1
      #print "Average F1 (conll): {:.2f}%".format(coref_conll_f1)

      p,r,f = coref_evaluator.get_prf()
      summary_dict["Average Coref F1 (py)"] = f
      print("Average F1 (py): {:.2f}%".format(f * 100))
      summary_dict["Average Coref precision (py)"] = p
      print("Average precision (py): {:.2f}%".format(p * 100))
      summary_dict["Average Coref recall (py)"] = r
      print("Average recall (py): {:.2f}%".format(r * 100))

      task_to_f1["coref"] = f * 100  # coref_conll_f1
      for k, evaluator in sorted(list(mention_evaluators.items()), key=operator.itemgetter(0)):
        tags = ["{} {} @ {}".format("Mentions", t, _k_to_tag(k)) for t in ("R", "P", "F")]
        results_to_print = []
        for t, v in zip(tags, evaluator.metrics()):
          results_to_print.append("{:<10}: {:.4f}".format(t, v))
          summary_dict[t] = v
        print(", ".join(results_to_print))

    summary_dict["Dev Loss"] = total_loss / len(self.coref_eval_data)

    print("Decoding took {}.".format(str(datetime.timedelta(seconds=int(elapsed_time)))))
    print("Decoding speed: {}/document, or {}/sentence.".format(
        str(datetime.timedelta(seconds=int(elapsed_time / len(self.coref_eval_data)))),
        str(datetime.timedelta(seconds=int(elapsed_time / len(self.eval_data))))
    ))

    metric_names = self.config["main_metrics"].split("_")
    main_metric = sum([task_to_f1[t] for t in metric_names]) / len(metric_names)
    print("Combined metric ({}): {}".format(self.config["main_metrics"], main_metric))

    return util.make_summary(summary_dict), main_metric, task_to_f1
Example #18
    def evaluate(self,
                 session,
                 evaluation_data=None,
                 official_stdout=False,
                 mode='train',
                 title_map=None):
        if evaluation_data:
            separate_data = list()
            for tmp_example in evaluation_data:
                tensorized_example = self.tensorize_pronoun_example(
                    tmp_example, is_training=True)
                separate_data.append((tensorized_example, tmp_example))
        else:
            separate_data = self.eval_data

        all_coreference = 0
        predict_coreference = 0
        correct_predict_coreference = 0

        prediction_result = list()
        for example_num, (tensorized_example,
                          example) in enumerate(separate_data):
            prediction_result_by_example = list()
            all_sentence = list()

            doc_id = example['doc_key']
            if mode == 'test' or mode == 'predict':
                print(title_map[doc_id])

            for s in example['sentences']:
                all_sentence += s

            _, _, _, _, _, _, _, _, _, _, gold_starts, gold_ends, number_features,  candidate_NP_positions, \
            pronoun_positions, name_positions, status_positions, order_features, labels, _ = tensorized_example

            feed_dict = {
                i: t
                for i, t in zip(self.input_tensors, tensorized_example)
            }
            pronoun_coref_scores = session.run(self.predictions,
                                               feed_dict=feed_dict)

            pronoun_coref_scores = pronoun_coref_scores[0]  # [4, 4]

            if self.config["use_multi_span"]:
                gold_starts = tf.squeeze(gold_starts[:, :1], 1).eval()
                gold_ends = tf.squeeze(gold_ends[:, :1], 1).eval()

            for i, pronoun_coref_scores_by_example in enumerate(
                    pronoun_coref_scores):
                current_pronoun_index = int(pronoun_positions[i][0])
                pronoun_position_start = int(
                    gold_starts[current_pronoun_index])
                pronoun_position_end = int(
                    gold_ends[current_pronoun_index]) + 1

                current_pronoun = ''.join(
                    all_sentence[pronoun_position_start:pronoun_position_end])

                pronoun_coref_scores_by_example = pronoun_coref_scores_by_example[
                    1:]  # [1,3]

                # labels [4, 3] bool
                prediction_result_by_example.append(
                    (pronoun_coref_scores_by_example.tolist(), labels[i]))

                for j, tmp_score in enumerate(
                        pronoun_coref_scores_by_example.tolist()):
                    current_candidate_index = int(candidate_NP_positions[i][j])
                    candidate_positions_start = int(
                        gold_starts[current_candidate_index])
                    candidate_positions_end = int(
                        gold_ends[current_candidate_index]) + 1
                    current_candidate = ''.join(all_sentence[
                        candidate_positions_start:candidate_positions_end])
                    if tmp_score > 0:
                        msg = '{} link to: {} ({},{}) \t'.format(
                            current_pronoun, current_candidate,
                            candidate_positions_start, candidate_positions_end)
                        predict_coreference += 1
                        if labels[i][j]:
                            correct_predict_coreference += 1
                            msg += 'True-predict' + '\t' + 'score: ' + str(
                                tmp_score)
                        else:
                            msg += 'False-predict' + '\t' + 'score: ' + str(
                                tmp_score)
                        if mode == 'test' or mode == 'predict':
                            print(msg)
                for l in labels[i]:
                    if l:
                        all_coreference += 1
            prediction_result.append(prediction_result_by_example)

        summary_dict = {}
        if mode == 'predict':
            summary_dict["Average F1 (py)"] = 0
            summary_dict["Average precision (py)"] = 0
            summary_dict["Average recall (py)"] = 0
            print('there is no positive prediction')
            f1 = 0
        else:
            if predict_coreference > 0:
                p = correct_predict_coreference / predict_coreference
                r = corrct_predict_coreference / all_coreference
                f1 = 2 * p * r / (p + r)
                summary_dict["Average F1 (py)"] = f1
                print("Average F1 (py): {:.2f}%".format(f1 * 100))
                summary_dict["Average precision (py)"] = p
                print("Average precision (py): {:.2f}%".format(p * 100))
                summary_dict["Average recall (py)"] = r
                print("Average recall (py): {:.2f}%".format(r * 100))
            else:
                summary_dict["Average F1 (py)"] = 0
                summary_dict["Average precision (py)"] = 0
                summary_dict["Average recall (py)"] = 0
                print('there is no positive prediction')
                f1 = 0

        return util.make_summary(summary_dict), f1
Example #19
        while True:
            tf_loss, tf_global_step, _ = session.run(
                [model.loss, model.global_step, model.train_op])
            accumulated_loss += tf_loss

            if tf_global_step % report_frequency == 0:
                steps_per_second = (tf_global_step - initial_step) / (
                    time.time() - initial_time)

                average_loss = accumulated_loss / report_frequency
                print("[{}] loss={:.2f}, steps/s={:.2f}".format(
                    tf_global_step, average_loss, steps_per_second))
                writer.add_summary(
                    util.make_summary({
                        "loss":
                        average_loss,
                        "learning_rate":
                        session.run(model.learning_rate)
                    }), tf_global_step)
                accumulated_loss = 0.0
                initial_time = time.time()
                initial_step = tf_global_step

            if tf_global_step % eval_frequency == 0:
                saver.save(session,
                           os.path.join(log_dir, "model"),
                           global_step=tf_global_step)
                eval_summary, eval_f1 = model.evaluate(session)

                if eval_f1 > max_f1:
                    max_f1 = eval_f1
                    util.copy_checkpoint(
Example #20
            if tf_global_step % report_frequency == 0:
                total_time = time.time() - initial_time
                steps_per_second = tf_global_step / total_time

                # avg_mention_loss = acc_mention_loss / report_frequency
                avg_tagging_loss = acc_tagging_loss / report_frequency
                print "[{}] tagging_loss={:.2f} steps/s={:.2f}".format(
                    tf_global_step, avg_tagging_loss, steps_per_second)
                # '''
                print '----------------------------'
                print x1
                print "number of entities:%d" % max(list(x2))
                print "tagging_loss:%f, mention_loss:NA, antecedent_loss:%f" % (
                    x6, x8)
                print list(x2)
                print util.check_tags(x2)
                print list(x3[0])
                print x4
                print '----------------------------'
                # '''

                writer.add_summary(
                    util.make_summary({"loss": avg_tagging_loss}),
                    tf_global_step)
                # accumulated_loss = 0.0
                # acc_mention_loss = 0
                acc_tagging_loss = 0

    # Ask for all the services to stop.
    sv.stop()
Example #21
                             global_step=model.global_step,
                             save_model_secs=120)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as session:
        model.start_enqueue_thread(session)
        accumulated_loss = 0.0
        initial_time = time.time()
        while not sv.should_stop():
            tf_loss, tf_global_step, _ = session.run(
                [model.loss, model.global_step, model.train_op])
            accumulated_loss += tf_loss

            if tf_global_step % report_frequency == 0:
                total_time = time.time() - initial_time
                steps_per_second = tf_global_step / total_time

                average_loss = accumulated_loss / report_frequency
                print("[{}] loss={:.2f}, steps/s={:.2f}".format(
                    tf_global_step, average_loss, steps_per_second))
                accumulated_loss = 0.0
                writer.add_summary(
                    util.make_summary({
                        "Train Loss": average_loss,
                        "Steps per second": steps_per_second
                    }))

    # Ask for all the services to stop.
    sv.stop()
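Esempio n. 21 leans on tf.train.Supervisor for session setup, checkpoint restore, and shutdown, as its comment notes. A minimal, self-contained sketch of that TF1 pattern; the /tmp/sv_demo log directory and the toy increment op are placeholders, not part of the snippet:

import tensorflow as tf

# The Supervisor owns initialization and periodic checkpointing; the loop
# only runs ops until should_stop() flips.
global_step = tf.train.get_or_create_global_step()
increment = tf.assign_add(global_step, 1)

sv = tf.train.Supervisor(logdir="/tmp/sv_demo",
                         global_step=global_step,
                         save_model_secs=120)
with sv.managed_session() as session:
    for _ in range(10):
        if sv.should_stop():
            break
        session.run(increment)
sv.stop()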
Esempio n. 22
0
      saver.restore(session, ckpt.model_checkpoint_path)

    initial_time = time.time()
    print("We're reporting with frequency: %d" % report_frequency)
    print("We're reporting with eval frequency: %d" % eval_frequency)
    while True:
        tf_loss, tf_global_step, _  = session.run([model.loss, model.global_step1, model.train_op])
        accumulated_loss += tf_loss

        if tf_global_step % report_frequency == 0:
          total_time = time.time() - initial_time
          steps_per_second = tf_global_step / total_time

          average_loss = accumulated_loss / report_frequency
          print("Coreference [{}] loss={:.2f}, steps/s={:.2f}".format(tf_global_step, average_loss[0], steps_per_second))
          writer.add_summary(util.make_summary({"loss": average_loss}), tf_global_step)
          accumulated_loss = 0.0
    
        if tf_global_step % eval_frequency  == 0:
          #saver.save(session, os.path.join(log_dir, "model"), global_step=tf_global_step)
          eval_summary, eval_f1, swag_accuracy = model.evaluate(session)
          if eval_f1 > max_f1:
            saver.save(session, os.path.join(log_dir, "model"), global_step=tf_global_step)
            max_f1 = eval_f1
            util.copy_checkpoint(os.path.join(log_dir, "model-{}".format(tf_global_step)), os.path.join(log_dir, "model.max.ckpt"))
          if swag_accuracy > max_swag_acc:
            saver.save(session, os.path.join(log_dir, "model"), global_step=tf_global_step)
            max_swag_acc = swag_accuracy
            util.copy_checkpoint(os.path.join(log_dir, "model-{}".format(tf_global_step)), os.path.join(log_dir, "model.max.ckpt"))

          writer.add_summary(eval_summary, tf_global_step)
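util.copy_checkpoint shows up in most of these loops whenever a new best model is found. A plausible sketch, assuming the helper simply duplicates every file belonging to a checkpoint prefix (the body is an assumption; the real util module may differ):

import glob
import shutil

def copy_checkpoint(source, target):
    # A TF checkpoint "path" is really a family of files (.index, .meta,
    # .data-*), so copy every file sharing the source prefix.
    for src_file in glob.glob(source + ".*"):
        suffix = src_file[len(source):]  # e.g. ".index"
        shutil.copyfile(src_file, target + suffix)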
Esempio n. 23
0
  def evaluate(self, session, global_step=None, official_stdout=False,
               keys=None, eval_mode=False, to_npy=None, from_npy=None,
               rsa_model=None):
    assert not (to_npy is not None and from_npy is not None), "cannot set both to_npy and from_npy at the same time!"

    self.load_eval_data()

    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    losses = []
    doc_keys = []
    num_evaluated = 0
    total_time = 0

    if to_npy:
      data_dicts = []
    if from_npy:
      with open(from_npy, "rb") as f:
        from_npy_dict = np.load(f)
        data_dicts = from_npy_dict.item().get("data_dicts")

    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
      _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example

      if from_npy is None:
        feed_dict = {i:t for i,t in zip(self.input_tensors, tensorized_example)}
        # if tensorized_example[0].shape[0] <= 9:
        if keys is not None and example['doc_key'] not in keys:
          # print('Skipping...', example['doc_key'], tensorized_example[0].shape)
          continue
        doc_keys.append(example['doc_key'])
        loss, (candidate_starts, candidate_ends, candidate_mention_scores,
               top_span_starts, top_span_ends,
               top_antecedents, top_antecedent_scores) = \
          session.run([self.loss, self.predictions], feed_dict=feed_dict)
      else:
        data_dict = data_dicts[example_num]
        example = data_dict["example"]

        if keys is not None and example['doc_key'] not in keys:
          # print('Skipping...', example['doc_key'], tensorized_example[0].shape)
          continue
        doc_keys.append(example['doc_key'])

        tensorized_example = data_dict["tensorized_example"]
        loss = data_dict["loss"]
        top_span_starts = data_dict["top_span_starts"]
        top_span_ends = data_dict["top_span_ends"]
        top_antecedents = data_dict["top_antecedents"]
        top_antecedent_scores = data_dict["top_antecedent_scores"]

      # losses.append(session.run(self.loss, feed_dict=feed_dict))
      losses.append(loss)

      if rsa_model is not None:
        print("Running l1 for sentence %d" % example_num)
        start_time = time.time()
        top_antecedent_scores = rsa_model.l1(example, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores)
        duration = time.time() - start_time
        print("Finished sentence %d, took %.2f s" % (example_num, duration))
        total_time += duration
        num_evaluated += 1

      if to_npy:
          data_dict = {
              "example_num": example_num,
              "tensorized_example": tensorized_example,
              "example": example,
              "top_span_starts": top_span_starts,
              "top_span_ends": top_span_ends,
              "top_antecedents": top_antecedents,
              "top_antecedent_scores": top_antecedent_scores,
              "loss": loss,
          }
          data_dicts.append(data_dict)

      predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
      coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)

      if example_num % 10 == 0:
        print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

    summary_dict = {}
    if to_npy:
      dict_to_npy = {"data_dicts": data_dicts}

    if eval_mode:
      conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, self.subtoken_maps, official_stdout)
      average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
      summary_dict["Average F1 (conll)"] = average_f1
      if to_npy:
        dict_to_npy["Average F1 (conll)"] = average_f1
      print("Average F1 (conll): {:.2f}%".format(average_f1))


    p,r,f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(doc_keys)))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))

    if to_npy:
      dict_to_npy["Average F1 (py)"] = f
      dict_to_npy["Average precision (py)"] = p
      dict_to_npy["Average recall (py)"] = r
      with open(to_npy, "wb") as f_to_npy:
        np.save(f_to_npy, dict_to_npy)

    if rsa_model:
        print("Ran rsa on %d sentences, avg time per sentence %.2f s" % num_evaluated, total_time / num_evaluated)

    return util.make_summary(summary_dict), f
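The to_npy / from_npy branches above cache the per-example predictions as a pickled dict via np.save and read them back with np.load(...).item(). An illustrative round trip (the eval_cache.npy file name is a placeholder; on NumPy 1.16.3+ the load additionally needs allow_pickle=True because the payload is a dict rather than a plain array):

import numpy as np

cache = {"data_dicts": [{"example_num": 0, "loss": 1.23}]}
with open("eval_cache.npy", "wb") as f_out:
    np.save(f_out, cache)

with open("eval_cache.npy", "rb") as f_in:
    # allow_pickle=True: the cache is a pickled Python dict, not an ndarray.
    loaded = np.load(f_in, allow_pickle=True).item()
data_dicts = loaded["data_dicts"]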
Esempio n. 24
0
      counter = 0
      while True:
        random.shuffle(train_examples)
        for example in train_examples:
          tensorized_example = model.tensorize_example(example, is_training=True)
          feed_dict = dict(zip(model.input_tensors, tensorized_example))
          tf_loss, tf_global_step, _ = session.run([model.loss, model.global_step1, model.train_op] , feed_dict = feed_dict)
          print(str(tf_global_step)+'\r',end='')
          # print(str(tf_global_step))
          accumulated_loss += tf_loss
          if tf_global_step % report_frequency == 0:
              total_time = time.time() - initial_time
              steps_per_second = tf_global_step / total_time
              average_loss = accumulated_loss / report_frequency
              print("[{}] loss={:.2f}, steps/s={:.2f}".format(tf_global_step, average_loss, steps_per_second))
              writer.add_summary(util.make_summary({"loss": average_loss}), tf_global_step)
              accumulated_loss = 0.0
          if tf_global_step % eval_frequency  == 0:
            #saver.save(session, os.path.join(log_dir, "model"), global_step=tf_global_step)
            eval_summary, eval_f1 = model.evaluate(session)
            if eval_f1 > max_f1:
              saver.save(session, os.path.join(log_dir, "model"), global_step=tf_global_step)
              max_f1 = eval_f1
              util.copy_checkpoint(os.path.join(log_dir, "model-{}".format(tf_global_step)), os.path.join(log_dir, "model.max.ckpt"))
              print("====")
  except Exception as e: 
      print(e)
      


Esempio n. 25
0
    def evaluate(self,
                 session,
                 eval_fold=-1,
                 num_fold=-1,
                 is_final_test=False):
        self.load_eval_data(eval_fold, num_fold)

        if "eval_on_test_part_only" in self.config and self.config[
                "eval_on_test_part_only"]:
            eval_on_test_part_only = True
            print("Evaluate on the test part only!!!!")
        else:
            eval_on_test_part_only = False

        if num_fold > 1 and is_final_test:
            print("Evaluating %d/%d fold." % (eval_fold + 1, num_fold))
        tp, fn, fp = 0, 0, 0
        tpa, fna, fpa = 0, 0, 0

        for example_num, (tensorized_example,
                          example) in enumerate(self.eval_data):
            _, _, _, _, _, _, _, _, gold_starts, gold_ends, cluster_ids, bridging_ante_cids, _, _ = tensorized_example
            feed_dict = {
                i: t
                for i, t in zip(self.input_tensors, tensorized_example)
            }
            predictions = session.run(self.predictions, feed_dict=feed_dict)

            pred_bridging_pairs, pred_bridging_anaphora = self.get_predicted_bridging_pairs(
                predictions)
            if self.config["has_multi_bridging_ant"]:
                # We follow Hou et al. and count bridgings with multiple antecedents as
                # correct as long as any gold bridging antecedent is recovered (only for BASHI).
                gold_bridging_pairs, gold_bridging_anaphora = self.get_gold_bridging_pairs(
                    gold_starts, gold_ends, cluster_ids,
                    example["bridging_pairs"], pred_bridging_pairs)
            else:
                gold_bridging_pairs = set([(s, e, cid) for s, e, cid in zip(
                    gold_starts, gold_ends, bridging_ante_cids) if cid > 0])
                gold_bridging_anaphora = set([(s, e) for s, e, cid in zip(
                    gold_starts, gold_ends, bridging_ante_cids) if cid > 0])

            add2eval = True
            if eval_on_test_part_only and not example["doc_key"].endswith(
                    '_test'):
                add2eval = False
            if add2eval:
                tp += len(gold_bridging_pairs & pred_bridging_pairs)
                fn += len(gold_bridging_pairs - pred_bridging_pairs)
                fp += len(pred_bridging_pairs - gold_bridging_pairs)

                tpa += len(gold_bridging_anaphora & pred_bridging_anaphora)
                fna += len(gold_bridging_anaphora - pred_bridging_anaphora)
                fpa += len(pred_bridging_anaphora - gold_bridging_anaphora)

            if example_num % 10 == 0:
                print("Evaluated {}/{} examples.".format(
                    example_num + 1, len(self.eval_data)))

        bridging_recall = 0.0 if tp == 0 else float(tp) / (tp + fn)
        bridging_precision = 0.0 if tp == 0 else float(tp) / (tp + fp)
        bridging_f1 = 0.0 if bridging_precision == 0.0 else 2.0 * bridging_recall * bridging_precision / (
            bridging_recall + bridging_precision)

        bridging_anaphora_recall = 0.0 if tpa == 0 else float(tpa) / (tpa +
                                                                      fna)
        bridging_anaphora_precision = 0.0 if tpa == 0 else float(tpa) / (tpa +
                                                                         fpa)
        bridging_anaphora_f1 = 0.0 if bridging_anaphora_precision == 0.0 else 2.0 * bridging_anaphora_recall * bridging_anaphora_precision / (
            bridging_anaphora_recall + bridging_anaphora_precision)

        print("Bridging anaphora detection F1: {:.2f}%".format(
            bridging_anaphora_f1 * 100))
        print("Bridging anaphora detection recall: {:.2f}%".format(
            bridging_anaphora_recall * 100))
        print("Bridging anaphora detection precision: {:.2f}%".format(
            bridging_anaphora_precision * 100))

        print("Bridging F1: {:.2f}%".format(bridging_f1 * 100))
        print("Bridging recall: {:.2f}%".format(bridging_recall * 100))
        print("Bridging precision: {:.2f}%".format(bridging_precision * 100))

        summary_dict = {}
        summary_dict["Bridging anaphora detection F1"] = bridging_anaphora_f1
        summary_dict[
            "Bridging anaphora detection recall"] = bridging_anaphora_recall
        summary_dict[
            "Bridging anaphora detection precision"] = bridging_anaphora_precision

        summary_dict["Bridging F1"] = bridging_f1
        summary_dict["Bridging recall"] = bridging_recall
        summary_dict["Bridging precision"] = bridging_precision
        f1 = bridging_f1

        if is_final_test:
            return tp, fn, fp, tpa, fna, fpa
        return util.make_summary(summary_dict), f1 * 100
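The bridging scores above (and the NER scores later in this collection) use the same zero-guarded precision/recall/F1 arithmetic. Factored into a small helper purely for clarity; the helper itself is illustrative and not part of the snippet:

def prf_from_counts(tp, fn, fp):
    # Zero-guarded precision/recall/F1, matching the computation above.
    recall = 0.0 if tp == 0 else float(tp) / (tp + fn)
    precision = 0.0 if tp == 0 else float(tp) / (tp + fp)
    f1 = 0.0 if precision == 0.0 else 2.0 * recall * precision / (recall + precision)
    return precision, recall, f1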
Esempio n. 26
0
    def evaluate(self,
                 session,
                 global_step=None,
                 official_stdout=False,
                 keys=None,
                 eval_mode=False):
        self.load_eval_data()

        coref_predictions = {}
        coref_evaluator = metrics.CorefEvaluator()
        losses = []
        doc_keys = []
        num_evaluated = 0

        for example_num, (tensorized_example,
                          example) in enumerate(self.eval_data):
            _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example
            feed_dict = {
                i: t
                for i, t in zip(self.input_tensors, tensorized_example)
            }
            # if tensorized_example[0].shape[0] <= 9:
            # if keys is not None and example['doc_key']  in keys:
            # print('Skipping...', example['doc_key'], tensorized_example[0].shape)
            # continue
            doc_keys.append(example['doc_key'])
            loss, (candidate_starts, candidate_ends, candidate_mention_scores,
                   top_span_starts, top_span_ends, top_antecedents,
                   top_antecedent_scores) = session.run(
                       [self.loss, self.predictions], feed_dict=feed_dict)
            # losses.append(session.run(self.loss, feed_dict=feed_dict))
            losses.append(loss)
            predicted_antecedents = self.get_predicted_antecedents(
                top_antecedents, top_antecedent_scores)
            coref_predictions[example["doc_key"]] = self.evaluate_coref(
                top_span_starts, top_span_ends, predicted_antecedents,
                example["clusters"], coref_evaluator)
            if example_num % 10 == 0:
                print("Evaluated {}/{} examples.".format(
                    example_num + 1, len(self.eval_data)))

        summary_dict = {}
        # with open('doc_keys_512.txt', 'w') as f:
        # for key in doc_keys:
        # f.write(key + '\n')
        if eval_mode:
            conll_results = conll.evaluate_conll(
                self.config["conll_eval_path"], coref_predictions,
                self.subtoken_maps, official_stdout)
            average_f1 = sum(
                results["f"]
                for results in conll_results.values()) / len(conll_results)
            summary_dict["Average F1 (conll)"] = average_f1
            print("Average F1 (conll): {:.2f}%".format(average_f1))

        p, r, f = coref_evaluator.get_prf()
        summary_dict["Average F1 (py)"] = f
        print("Average F1 (py): {:.2f}% on {} docs".format(
            f * 100, len(doc_keys)))
        summary_dict["Average precision (py)"] = p
        print("Average precision (py): {:.2f}%".format(p * 100))
        summary_dict["Average recall (py)"] = r
        print("Average recall (py): {:.2f}%".format(r * 100))

        return util.make_summary(summary_dict), f
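Several of these evaluate methods decode antecedents with self.get_predicted_antecedents. A sketch of the usual e2e-coref decoding convention, assuming column 0 of top_antecedent_scores is the dummy "no antecedent" option and antecedents is a 2-D index array:

import numpy as np

def get_predicted_antecedents(antecedents, antecedent_scores):
    # argmax - 1 maps the dummy column to -1, i.e. the span starts its own cluster.
    predicted_antecedents = []
    for i, index in enumerate(np.argmax(antecedent_scores, axis=1) - 1):
        predicted_antecedents.append(-1 if index < 0 else antecedents[i, index])
    return predicted_antecedents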
Esempio n. 27
0
    def evaluate(self,
                 session,
                 data,
                 predictions,
                 loss,
                 official_stdout=False):
        if self.eval_data is None:
            self.eval_data, self.eval_tensors, self.coref_eval_data = data.load_eval_data(
            )

        def _k_to_tag(k):
            if k == -3:
                return "oracle"
            elif k == -2:
                return "actual"
            elif k == -1:
                return "exact"
            elif k == 0:
                return "threshold"
            else:
                return "{}%".format(k)

        # Retrieval evaluators.
        arg_evaluators = {
            k: util.RetrievalEvaluator()
            for k in [-3, -2, -1, 30, 40, 50, 80, 100, 120, 150]
        }
        predicate_evaluators = {
            k: util.RetrievalEvaluator()
            for k in [-3, -2, -1, 10, 20, 30, 40, 50, 70]
        }
        total_loss = 0
        total_num_predicates = 0
        total_gold_predicates = 0
        srl_comp_sents = 0
        srl_predictions = []
        all_gold_predicates = []
        all_guessed_predicates = []
        start_time = time.time()
        sent_id = 0

        # Simple analysis.
        unique_core_role_violations = 0
        continuation_role_violations = 0
        reference_role_violations = 0
        gold_u_violations = 0
        gold_c_violations = 0
        gold_r_violations = 0

        # Go through document-level predictions.
        for i, doc_tensors in enumerate(self.eval_tensors):
            feed_dict = dict(
                zip(data.input_tensors, [
                    pad_batch_tensors(doc_tensors, tn)
                    for tn in data.input_names + data.label_names
                ]))
            predict_names = []
            for tn in data.predict_names:
                if tn in predictions:
                    predict_names.append(tn)
            predict_tensors = [predictions[tn]
                               for tn in predict_names] + [loss]
            predict_tensors = session.run(predict_tensors, feed_dict=feed_dict)
            predict_dict = dict(zip(predict_names + ["loss"], predict_tensors))

            doc_size = len(doc_tensors)
            doc_example = self.coref_eval_data[i]
            sentences = doc_example["sentences"]
            decoded_predictions = inference_utils.srl_decode(
                sentences, predict_dict, data.srl_labels_inv, self.config)

            if "srl" in decoded_predictions:
                srl_predictions.extend(decoded_predictions["srl"])
                # Evaluate retrieval.
                word_offset = 0
                for j in range(len(sentences)):
                    text_length = len(sentences[j])
                    na = predict_dict["num_args"][j]
                    np = predict_dict["num_preds"][j]
                    sent_example = self.eval_data[
                        sent_id]  # sentence, srl, ner
                    gold_args = set([])
                    gold_preds = set([])
                    guessed_preds = set([])
                    for pred, args in sent_example[1].iteritems():
                        filtered_args = [(a[0], a[1]) for a in args
                                         if a[2] not in ["V", "C-V"]]
                        if len(filtered_args) > 0:
                            gold_preds.add((pred, pred))
                            gold_args.update(filtered_args)
                    for pred, args in decoded_predictions["srl"][j].iteritems(
                    ):
                        guessed_preds.add((pred, pred, "V"))
                    all_gold_predicates.append([(p[0], p[1], "V")
                                                for p in gold_preds])
                    all_guessed_predicates.append(guessed_preds)

                    srl_eval_utils.evaluate_retrieval(
                        predict_dict["candidate_starts"][j],
                        predict_dict["candidate_ends"][j],
                        predict_dict["candidate_arg_scores"][j],
                        predict_dict["arg_starts"][j][:na],
                        predict_dict["arg_ends"][j][:na], gold_args,
                        text_length, arg_evaluators)
                    srl_eval_utils.evaluate_retrieval(
                        range(text_length), range(text_length),
                        predict_dict["candidate_pred_scores"][j],
                        predict_dict["predicates"][j][:np],
                        predict_dict["predicates"][j][:np], gold_preds,
                        text_length, predicate_evaluators)

                    # TODO: Move elsewhere.
                    u_violations, c_violations, r_violations = debug_utils.srl_constraint_tracker(
                        decoded_predictions["srl"][j])
                    unique_core_role_violations += u_violations
                    continuation_role_violations += c_violations
                    reference_role_violations += r_violations
                    total_num_predicates += len(
                        decoded_predictions["srl"][j].keys())
                    u_violations, c_violations, r_violations = debug_utils.srl_constraint_tracker(
                        sent_example[1])
                    gold_u_violations += u_violations
                    gold_c_violations += c_violations
                    gold_r_violations += r_violations
                    total_gold_predicates += len(sent_example[1].keys())
                    sent_id += 1
                    word_offset += text_length

            total_loss += predict_dict["loss"]
            if (i + 1) % 50 == 0:
                print("Evaluated {}/{} documents.".format(
                    i + 1, len(self.coref_eval_data)))

        summary_dict = {}
        task_to_f1 = {}  # From task name to F1.
        elapsed_time = time.time() - start_time

        sentences, gold_srl, gold_ner = zip(*self.eval_data)

        # Summarize results, evaluate entire dev set.
        precision, recall, f1, conll_precision, conll_recall, conll_f1, ul_prec, ul_recall, ul_f1, srl_label_mat, comp = (
            srl_eval_utils.compute_srl_f1(sentences, gold_srl, srl_predictions,
                                          self.config["srl_conll_eval_path"]))
        pid_precision, pred_recall, pid_f1, _, _, _, _ = srl_eval_utils.compute_span_f1(
            all_gold_predicates, all_guessed_predicates, "Predicate ID")
        task_to_f1["srl"] = conll_f1
        summary_dict["PAS F1"] = f1
        summary_dict["PAS precision"] = precision
        summary_dict["PAS recall"] = recall
        summary_dict["Unlabeled PAS F1"] = ul_f1
        summary_dict["Unlabeled PAS precision"] = ul_prec
        summary_dict["Unlabeled PAS recall"] = ul_recall
        summary_dict["CoNLL F1"] = conll_f1
        summary_dict["CoNLL precision"] = conll_precision
        summary_dict["CoNLL recall"] = conll_recall
        if total_num_predicates > 0:
            summary_dict[
                "Unique core violations/Predicate"] = 1.0 * unique_core_role_violations / total_num_predicates
            summary_dict[
                "Continuation violations/Predicate"] = 1.0 * continuation_role_violations / total_num_predicates
            summary_dict[
                "Reference violations/Predicate"] = 1.0 * reference_role_violations / total_num_predicates
        print "Completely correct sentences: {}/{}".format(
            comp, 100.0 * comp / len(srl_predictions))

        for k, evaluator in sorted(arg_evaluators.items(),
                                   key=operator.itemgetter(0)):
            tags = [
                "{} {} @ {}".format("Args", t, _k_to_tag(k))
                for t in ("R", "P", "F")
            ]
            results_to_print = []
            for t, v in zip(tags, evaluator.metrics()):
                results_to_print.append("{:<10}: {:.4f}".format(t, v))
                summary_dict[t] = v
            print ", ".join(results_to_print)

        for k, evaluator in sorted(predicate_evaluators.items(),
                                   key=operator.itemgetter(0)):
            tags = [
                "{} {} @ {}".format("Predicates", t, _k_to_tag(k))
                for t in ("R", "P", "F")
            ]
            results_to_print = []
            for t, v in zip(tags, evaluator.metrics()):
                results_to_print.append("{:<10}: {:.4f}".format(t, v))
                summary_dict[t] = v
            print ", ".join(results_to_print)

        if total_num_predicates > 0:
            print("Constraint voilations: U: {} ({}), C: {} ({}), R: {} ({})".
                  format(
                      1.0 * unique_core_role_violations / total_num_predicates,
                      unique_core_role_violations, 1.0 *
                      continuation_role_violations / total_num_predicates,
                      continuation_role_violations,
                      1.0 * reference_role_violations / total_num_predicates,
                      reference_role_violations))
        if total_gold_predicates > 0:
            print(
                "Gold constraint voilations: U: {} ({}), C: {} ({}), R: {} ({})"
                .format(1.0 * gold_u_violations / total_gold_predicates,
                        gold_u_violations,
                        1.0 * gold_c_violations / total_gold_predicates,
                        gold_c_violations,
                        1.0 * gold_r_violations / total_gold_predicates,
                        gold_r_violations))
        #for label_pair, freq in srl_label_mat.most_common():
        #  if label_pair[0] != label_pair[1] and freq > 10:
        #    print ("{}\t{}\t{}".format(label_pair[0], label_pair[1], freq))

        summary_dict["Dev Loss"] = total_loss / len(self.coref_eval_data)
        print "Decoding took {}.".format(
            str(datetime.timedelta(seconds=int(elapsed_time))))
        print "Decoding speed: {}/document, or {}/sentence.".format(
            str(
                datetime.timedelta(seconds=int(elapsed_time /
                                               len(self.coref_eval_data)))),
            str(
                datetime.timedelta(seconds=int(elapsed_time /
                                               len(self.eval_data)))))
        metric_names = self.config["main_metrics"].split("_")
        main_metric = sum([task_to_f1[t]
                           for t in metric_names]) / len(metric_names)
        print "Combined metric ({}): {}".format(self.config["main_metrics"],
                                                main_metric)
        return util.make_summary(summary_dict), main_metric, task_to_f1
Esempio n. 28
0
  def evaluate(self, session, is_final_test=False):
    self.load_eval_data()

    tp,fn,fp = 0,0,0
    start_time = time.time()
    num_words = 0
    sub_tp,sub_fn,sub_fp = [0] * self.num_types,[0]*self.num_types, [0]*self.num_types

    is_flat_ner = 'flat_ner' in self.config and self.config['flat_ner']
    total_preds = []
    total_golds = []
    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
      feed_dict = {i:t for i,t in zip(self.input_tensors, tensorized_example)}
      candidate_ner_scores = session.run(self.predictions, feed_dict=feed_dict)

      num_words += sum(len(tok) for tok in example["sentences"])


      gold_ners = set([(sid,s,e, self.ner_maps[t]) for sid, ner in enumerate(example['ners']) for s,e,t in ner])
      pred_ners = self.get_pred_ner(example["sentences"], candidate_ner_scores,is_flat_ner)
      total_golds.append(list(gold_ners))
      total_preds.append(list(pred_ners))
      #print(pred_ners)
      tp += len(gold_ners & pred_ners)
      fn += len(gold_ners - pred_ners)
      fp += len(pred_ners - gold_ners)

      if is_final_test:
        for i in range(self.num_types):
          sub_gm = set((sid,s,e) for sid,s,e,t in gold_ners if t ==i+1)
          sub_pm = set((sid,s,e) for sid,s,e,t in pred_ners if t == i+1)
          sub_tp[i] += len(sub_gm & sub_pm)
          sub_fn[i] += len(sub_gm - sub_pm)
          sub_fp[i] += len(sub_pm - sub_gm)


      if example_num % 10 == 0:
        print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

    used_time = time.time() - start_time
    print("Time used: %d second, %.2f w/s " % (used_time, num_words*1.0/used_time))

    m_r = 0 if tp == 0 else float(tp)/(tp+fn)
    m_p = 0 if tp == 0 else float(tp)/(tp+fp)
    m_f1 = 0 if m_p == 0 else 2.0*m_r*m_p/(m_r+m_p)

    print("Mention F1: {:.2f}%".format(m_f1*100))
    print("Mention recall: {:.2f}%".format(m_r*100))
    print("Mention precision: {:.2f}%".format(m_p*100))

    if is_final_test:
      print("****************SUB NER TYPES********************")
      for i in range(self.num_types):
        sub_r = 0 if sub_tp[i] == 0 else float(sub_tp[i]) / (sub_tp[i] + sub_fn[i])
        sub_p = 0 if sub_tp[i] == 0 else float(sub_tp[i]) / (sub_tp[i] + sub_fp[i])
        sub_f1 = 0 if sub_p == 0 else 2.0 * sub_r * sub_p / (sub_r + sub_p)

        print("{} F1: {:.2f}%".format(self.ner_types[i],sub_f1 * 100))
        print("{} recall: {:.2f}%".format(self.ner_types[i],sub_r * 100))
        print("{} precision: {:.2f}%".format(self.ner_types[i],sub_p * 100))

    summary_dict = {}
    summary_dict["Mention F1"] = m_f1
    summary_dict["Mention recall"] = m_r
    summary_dict["Mention precision"] = m_p

    return util.make_summary(summary_dict), m_f1, total_preds, total_golds
Esempio n. 29
0
    def evaluate(self,
                 session,
                 global_step=None,
                 official_stdout=False,
                 keys=None,
                 eval_mode=False,
                 visualize=False):
        self.load_eval_data()

        coref_predictions = {}
        coref_evaluator = metrics.CorefEvaluator()
        losses = []
        doc_keys = []
        num_evaluated = 0
        visualize_list = []

        for example_num, (tensorized_example,
                          example) in enumerate(self.eval_data):
            _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example
            feed_dict = {
                i: t
                for i, t in zip(self.input_tensors, tensorized_example)
            }
            # if tensorized_example[0].shape[0] <= 9:
            if keys is not None and example['doc_key'] not in keys:
                # print('Skipping...', example['doc_key'], tensorized_example[0].shape)
                continue
            doc_keys.append(example['doc_key'])
            loss, (candidate_starts, candidate_ends, candidate_mention_scores,
                   top_span_starts, top_span_ends, top_antecedents,
                   top_antecedent_scores) = session.run(
                       [self.loss, self.predictions], feed_dict=feed_dict)
            # losses.append(session.run(self.loss, feed_dict=feed_dict))
            losses.append(loss)
            predicted_antecedents = self.get_predicted_antecedents(
                top_antecedents, top_antecedent_scores)
            predicted_clusters = self.evaluate_coref(top_span_starts,
                                                     top_span_ends,
                                                     predicted_antecedents,
                                                     example["clusters"],
                                                     coref_evaluator)
            coref_predictions[example["doc_key"]] = predicted_clusters
            # if example_num % 10 == 0:
            #   print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

            # Visualize antecedents
            if visualize:
                print('*****New Doc*****')
                subtokens = util.flatten(example['sentences'])
                span_list, antecedent_list = [], []
                for idx, antecedent_idx in enumerate(predicted_antecedents):
                    if antecedent_idx == -1:
                        continue
                    span_subtoken_idx = (top_span_starts[idx],
                                         top_span_ends[idx])
                    span_str = ' '.join(
                        subtokens[span_subtoken_idx[0]:span_subtoken_idx[1] +
                                  1])

                    antecedent_subtoken_idx = (top_span_starts[antecedent_idx],
                                               top_span_ends[antecedent_idx])
                    antecedent_str = ' '.join(subtokens[
                        antecedent_subtoken_idx[0]:antecedent_subtoken_idx[1] +
                        1])

                    # print('%s ---> %s' % (span_str, antecedent_str))
                    span_list.append(span_str)
                    antecedent_list.append(antecedent_str)
                visualize_list.append((span_list, antecedent_list))

        summary_dict = {}
        if eval_mode:
            conll_results = conll.evaluate_conll(
                self.config["conll_eval_path"], coref_predictions,
                self.subtoken_maps, official_stdout)
            average_f1 = sum(
                results["f"]
                for results in conll_results.values()) / len(conll_results)
            summary_dict["Average F1 (conll)"] = average_f1
            print("Average F1 (conll): {:.2f}%".format(average_f1))

        p, r, f = coref_evaluator.get_prf()
        summary_dict["Average F1 (py)"] = f
        logger.info("Average F1 (py): {:.2f}% on {} docs".format(
            f * 100, len(doc_keys)))
        summary_dict["Average precision (py)"] = p
        logger.info("Average precision (py): {:.2f}%".format(p * 100))
        summary_dict["Average recall (py)"] = r
        logger.info("Average recall (py): {:.2f}%".format(r * 100))

        if visualize:
            with open('visualize.bin', 'wb') as f:
                pickle.dump(visualize_list, f)
            logger.info('Saved visualization to visualize.bin')

        return util.make_summary(summary_dict), f
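The visualize branch above pickles per-document (span_list, antecedent_list) pairs to visualize.bin. A small reader sketch for inspecting that dump, mirroring the structure written by the snippet:

import pickle

with open('visualize.bin', 'rb') as f:
    visualize_list = pickle.load(f)

for span_list, antecedent_list in visualize_list:
    # Each entry holds the predicted span strings and their antecedent strings.
    for span, antecedent in zip(span_list, antecedent_list):
        print('%s ---> %s' % (span, antecedent))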
Esempio n. 30
0
        initial_time = time.time()

        while True:
            tf_loss, tf_global_step, _ = session.run(
                [model.loss, model.global_step, model.train_op])
            accumulated_loss += tf_loss

            if tf_global_step == 1 or tf_global_step % report_frequency == 0:
                total_time = time.time() - initial_time
                steps_per_second = tf_global_step / total_time

                average_loss = accumulated_loss / report_frequency
                print(
                    f"[{tf_global_step}] loss={average_loss:.4f}, steps/s={steps_per_second:.2f}"
                )
                writer.add_summary(util.make_summary({"loss": average_loss}),
                                   tf_global_step)
                accumulated_loss = 0.0

            if tf_global_step == 1 or tf_global_step % eval_frequency == 0:
                eval_summary, eval_f1 = model.evaluate(session)
                _ = session.run(model.update_max_f1)
                saver.save(session,
                           os.path.join(log_dir, "model"),
                           global_step=tf_global_step)

                if eval_f1 > max_f1:
                    max_f1 = eval_f1
                    util.copy_checkpoint(
                        os.path.join(log_dir,
                                     "model-{}".format(tf_global_step)),
Esempio n. 31
0
    def evaluate(self, session, official_stdout=False):
        # self.load_eval_data()
        with open(self.config["inv_mapping"], 'rb') as handle:
            inv_mapping = pickle.load(handle)
        with open(self.config["eval_path"], 'rb') as handle:
            test = pickle.load(handle)
        coref_predictions = {}
        coref_evaluator = metrics.CorefEvaluator()
        swag_predictions = []
        swag_labels = []
        for i in range(len(test)):
            if i == 191 or i == 217 or i == 225:
                continue
            example = test[i]
            file_name = example["doc_key"]
            inv_map = inv_mapping[file_name]
            tensorized_example = self.tensorize_example(example,
                                                        i,
                                                        is_training=False)
            feed_dict = {
                i: t
                for i, t in zip(self.input_tensors, tensorized_example)
            }
            lee_predictions, swag_pred = session.run(
                [self.predictions2, self.swag_predictions],
                feed_dict=feed_dict)
            _, _, _, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores = lee_predictions
            top_span_starts = inv_map[top_span_starts]
            top_span_ends = inv_map[top_span_ends]
            predicted_antecedents = self.get_predicted_antecedents(
                top_antecedents, top_antecedent_scores)
            coref_predictions[file_name] = self.evaluate_coref(
                top_span_starts, top_span_ends, predicted_antecedents,
                example["clusters"], coref_evaluator)
            # SWAG evaluation
            swag_label = tensorized_example[-1]
            swag_predictions.append(swag_pred[0])
            swag_labels.append(swag_label[0])
            if i % 10 == 0:
                print("Evaluated {}/{} examples.".format(i + 1, len(test)))

        # Aggregate the collected predictions into coreference metrics.
        summary_dict = {}
        try:
            conll_results = conll.evaluate_conll(
                self.config["conll_eval_path"], coref_predictions,
                official_stdout)
            average_f1 = sum(
                results["f"]
                for results in conll_results.values()) / len(conll_results)

            summary_dict["Average F1 (conll)"] = average_f1
            print("Average F1 (conll): {:.2f}%".format(average_f1))
        except Exception:
            print("unstable results")
            average_f1 = 0
        p, r, f = coref_evaluator.get_prf()
        summary_dict["Average F1 (py)"] = f
        print("Average F1 (py): {:.2f}%".format(f * 100))
        summary_dict["Average precision (py)"] = p
        print("Average precision (py): {:.2f}%".format(p * 100))
        summary_dict["Average recall (py)"] = r
        print("Average recall (py): {:.2f}%".format(r * 100))
        print("Now evaluating SWAG")
        swag_accuracy = self.swag_evaluation(swag_predictions, swag_labels)
        print("Average SWAG accuracy is : {:.2f}%".format(swag_accuracy * 100))
        return util.make_summary(summary_dict), average_f1, swag_accuracy
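self.swag_evaluation above reduces the collected SWAG predictions and labels to a single accuracy. A plausible minimal implementation; the method body is an assumption, not the snippet's actual code:

import numpy as np

def swag_evaluation(swag_predictions, swag_labels):
    # Plain accuracy: fraction of examples whose predicted ending matches the label.
    predictions = np.asarray(swag_predictions)
    labels = np.asarray(swag_labels)
    return float(np.mean(predictions == labels))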