Example #1
  def evaluate(self, session, official_stdout=False):
    self.load_eval_data()

    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()

    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
      _, _, _, _, _, _, _, _, _, gold_starts, gold_ends, _ = tensorized_example
      feed_dict = {i:t for i,t in zip(self.input_tensors, tensorized_example)}
      candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores = session.run(self.predictions, feed_dict=feed_dict)
      predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
      coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)
      if example_num % 10 == 0:
        print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

    summary_dict = {}
    conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, official_stdout)
    average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
    summary_dict["Average F1 (conll)"] = average_f1
    print("Average F1 (conll): {:.2f}%".format(average_f1))

    p,r,f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}%".format(f * 100))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))

    return util.make_summary(summary_dict), average_f1
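
Most of these snippets delegate antecedent decoding to a get_predicted_antecedents helper that is not reproduced here. Examples #18 and #19 inline the same logic; a minimal standalone sketch of it (assuming, as in those examples, that column 0 of antecedent_scores belongs to the dummy "no antecedent" option) could look like this:

import numpy as np

def get_predicted_antecedents(antecedents, antecedent_scores):
    # antecedents: (k, c) indices of candidate antecedents for each of the k top spans.
    # antecedent_scores: (k, c + 1) scores, where column 0 is assumed to be the dummy antecedent.
    predicted_antecedents = []
    for i, index in enumerate(np.argmax(antecedent_scores, axis=1) - 1):
        if index < 0:
            # The dummy antecedent scored highest: this span starts its own cluster.
            predicted_antecedents.append(-1)
        else:
            predicted_antecedents.append(antecedents[i, index])
    return predicted_antecedents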
Example #2
    def evaluate(self, model, tensor_examples, stored_info, step, official=False, conll_path=None, tb_writer=None):
        logger.info('Step %d: evaluating on %d samples...' % (step, len(tensor_examples)))
        model.to(self.device)
        evaluator = CorefEvaluator()
        doc_to_prediction = {}

        model.eval()
        for i, (doc_key, tensor_example) in enumerate(tensor_examples):
            gold_clusters = stored_info['gold'][doc_key]
            tensor_example = tensor_example[:7]  # Strip out gold
            example_gpu = [d.to(self.device) for d in tensor_example]
            with torch.no_grad():
                _, _, _, span_starts, span_ends, antecedent_idx, antecedent_scores = model(*example_gpu)
            span_starts, span_ends = span_starts.tolist(), span_ends.tolist()
            antecedent_idx, antecedent_scores = antecedent_idx.tolist(), antecedent_scores.tolist()
            predicted_clusters = model.update_evaluator(span_starts, span_ends, antecedent_idx, antecedent_scores, gold_clusters, evaluator)
            doc_to_prediction[doc_key] = predicted_clusters

        p, r, f = evaluator.get_prf()
        metrics = {'Eval_Avg_Precision': p * 100, 'Eval_Avg_Recall': r * 100, 'Eval_Avg_F1': f * 100}
        for name, score in metrics.items():
            logger.info('%s: %.2f' % (name, score))
            if tb_writer:
                tb_writer.add_scalar(name, score, step)

        if official:
            conll_results = conll.evaluate_conll(conll_path, doc_to_prediction, stored_info['subtoken_maps'])
            official_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
            logger.info('Official avg F1: %.4f' % official_f1)

        return f * 100, metrics
Example #3
  def evaluate(self, session, official_stdout=False):
    self.load_eval_data()

    def _k_to_tag(k):
      if k == -3:
        return "oracle"
      elif k == -2:
        return "actual"
      elif k == -1:
        return "exact"
      elif k == 0:
        return "threshold"
      else:
        return "{}%".format(k)
    mention_evaluators = { k:util.RetrievalEvaluator() for k in [-3, -2, -1, 0, 10, 15, 20, 25, 30, 40, 50] }

    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()

    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
      _, _, _, _, _, _, gold_starts, gold_ends, _, tag_labels, tag_seq, tag_loss_label = tensorized_example
      feed_dict = {i:t for i,t in zip(self.input_tensors, tensorized_example)}
      candidate_starts, candidate_ends, mention_scores, mention_starts, mention_ends, antecedents, antecedent_scores, tag_outputs, tag_seq = session.run(self.predictions, feed_dict=feed_dict)

      self.evaluate_mentions(candidate_starts, candidate_ends, mention_starts, mention_ends, mention_scores, gold_starts, gold_ends, example, mention_evaluators)
      predicted_antecedents = self.get_predicted_antecedents(antecedents, antecedent_scores)

      coref_predictions[example["doc_key"]] = self.evaluate_coref(mention_starts, mention_ends, predicted_antecedents, example["clusters"], coref_evaluator)

      if example_num % 10 == 0:
        print "Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data))
        # print tag_outputs
        # print tag_seq

    summary_dict = {}
    for k, evaluator in sorted(mention_evaluators.items(), key=operator.itemgetter(0)):
      tags = ["{} @ {}".format(t, _k_to_tag(k)) for t in ("R", "P", "F")]
      results_to_print = []
      for t, v in zip(tags, evaluator.metrics()):
        results_to_print.append("{:<10}: {:.2f}".format(t, v))
        summary_dict[t] = v
      print ", ".join(results_to_print)

    conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, official_stdout)
    average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
    summary_dict["Average F1 (conll)"] = average_f1
    print "Average F1 (conll): {:.2f}%".format(average_f1)

    p,r,f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print "Average F1 (py): {:.2f}%".format(f * 100)
    summary_dict["Average precision (py)"] = p
    print "Average precision (py): {:.2f}%".format(p * 100)
    summary_dict["Average recall (py)"] = r
    print "Average recall (py): {:.2f}%".format(r * 100)

    return util.make_summary(summary_dict), average_f1
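
The evaluate_coref helper these snippets call is likewise not shown. Example #13 decodes clusters with get_predicted_clusters, and Example #16 updates a CorefEvaluator with predicted and gold clusters plus mention-to-cluster maps; a minimal sketch along those lines (an assumption, not any project's actual code) could be:

def get_predicted_clusters(top_span_starts, top_span_ends, predicted_antecedents):
    # Group spans into clusters by following each span's predicted antecedent link.
    mention_to_cluster_id = {}
    predicted_clusters = []
    for i, predicted_index in enumerate(predicted_antecedents):
        if predicted_index < 0:
            continue  # dummy antecedent: this span is not attached to anything
        antecedent = (int(top_span_starts[predicted_index]), int(top_span_ends[predicted_index]))
        if antecedent in mention_to_cluster_id:
            cluster_id = mention_to_cluster_id[antecedent]
        else:
            cluster_id = len(predicted_clusters)
            predicted_clusters.append([antecedent])
            mention_to_cluster_id[antecedent] = cluster_id
        mention = (int(top_span_starts[i]), int(top_span_ends[i]))
        predicted_clusters[cluster_id].append(mention)
        mention_to_cluster_id[mention] = cluster_id
    predicted_clusters = [tuple(cluster) for cluster in predicted_clusters]
    mention_to_predicted = {m: predicted_clusters[cid] for m, cid in mention_to_cluster_id.items()}
    return predicted_clusters, mention_to_predicted

def evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, gold_clusters, evaluator):
    # Gold clusters arrive as lists of [start, end] spans; make them hashable tuples.
    gold_clusters = [tuple(tuple(mention) for mention in gc) for gc in gold_clusters]
    mention_to_gold = {}
    for gc in gold_clusters:
        for mention in gc:
            mention_to_gold[mention] = gc
    predicted_clusters, mention_to_predicted = get_predicted_clusters(
        top_span_starts, top_span_ends, predicted_antecedents)
    # Same update call pattern as Example #16's CorefEvaluator usage.
    evaluator.update(predicted_clusters, gold_clusters, mention_to_predicted, mention_to_gold)
    return predicted_clusters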
Example #4
  def evaluate(self, session, official_stdout=False):
    self.load_eval_data()

    def _k_to_tag(k):
      if k == -3:
        return "oracle"
      elif k == -2:
        return "actual"
      elif k == -1:
        return "exact"
      elif k == 0:
        return "threshold"
      else:
        return "{}%".format(k)
    mention_evaluators = { k:util.RetrievalEvaluator() for k in [-3, -2, -1, 0, 10, 15, 20, 25, 30, 40, 50] }

    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()

    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
      _, _, _, _, _, _, gold_starts, gold_ends, _ = tensorized_example
      feed_dict = {i:t for i,t in zip(self.input_tensors, tensorized_example)}
      candidate_starts, candidate_ends, mention_scores, mention_starts, mention_ends, antecedents, antecedent_scores = session.run(self.predictions, feed_dict=feed_dict)

      self.evaluate_mentions(candidate_starts, candidate_ends, mention_starts, mention_ends, mention_scores, gold_starts, gold_ends, example, mention_evaluators)
      predicted_antecedents = self.get_predicted_antecedents(antecedents, antecedent_scores)

      coref_predictions[example["doc_key"]] = self.evaluate_coref(mention_starts, mention_ends, predicted_antecedents, example["clusters"], coref_evaluator)

      if example_num % 10 == 0:
        print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

    summary_dict = {}
    for k, evaluator in sorted(mention_evaluators.items(), key=operator.itemgetter(0)):
      tags = ["{} @ {}".format(t, _k_to_tag(k)) for t in ("R", "P", "F")]
      results_to_print = []
      for t, v in zip(tags, evaluator.metrics()):
        results_to_print.append("{:<10}: {:.2f}".format(t, v))
        summary_dict[t] = v
      print(", ".join(results_to_print))

    conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, official_stdout)
    average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
    summary_dict["Average F1 (conll)"] = average_f1
    print("Average F1 (conll): {:.2f}%".format(average_f1))

    p,r,f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}%".format(f * 100))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))

    return util.make_summary(summary_dict), average_f1
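
Examples #3 and #4 also bucket mention-retrieval quality with util.RetrievalEvaluator objects whose metrics() result is zipped against ("R", "P", "F"). The class itself is not shown; a minimal evaluator with that interface might look like the following (an illustrative sketch, not the project's implementation):

class RetrievalEvaluator(object):
    # Minimal sketch: tracks how many gold spans are recovered among retrieved candidates.
    def __init__(self):
        self._num_correct = 0
        self._num_gold = 0
        self._num_predicted = 0

    def update(self, gold_set, predicted_set):
        self._num_correct += len(gold_set & predicted_set)
        self._num_gold += len(gold_set)
        self._num_predicted += len(predicted_set)

    def metrics(self):
        # Returns (recall, precision, f1), matching the ("R", "P", "F") tags above.
        recall = self._num_correct / self._num_gold if self._num_gold else 0.0
        precision = self._num_correct / self._num_predicted if self._num_predicted else 0.0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
        return recall, precision, f1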
Example #5
def conll_evaluate(l0_inputs, alphas, conll_eval_path,
                   all_top_antecedent_scores):
    print("Compiling clusters and evaluators for conll suite")
    if isinstance(alphas, float) or isinstance(alphas, int):
        alphas = [alphas]
    coref_predictions = [{} for _ in alphas]
    coref_evaluators = [metrics.CorefEvaluator() for _ in alphas]
    subtoken_maps = {}

    with open(l0_inputs, "rb") as f:
        data_dicts = np.load(f, allow_pickle=True).item().get("data_dicts")

    for example_num, data_dict in enumerate(tqdm(data_dicts)):
        example = data_dict["example"]
        subtoken_maps[example["doc_key"]] = example["subtoken_map"]
        top_span_starts = data_dict["top_span_starts"]
        top_span_ends = data_dict["top_span_ends"]
        top_antecedents = data_dict["top_antecedents"]

        for i in range(len(alphas)):
            top_antecedent_scores = all_top_antecedent_scores[
                example["doc_key"]][i]
            predicted_antecedents = get_predicted_antecedents(
                top_antecedents, top_antecedent_scores)
            coref_predictions[i][example["doc_key"]] = evaluate_coref(
                top_span_starts, top_span_ends, predicted_antecedents,
                example["clusters"], coref_evaluators[i])

    summary_dict = DD(list)
    for i in range(len(alphas)):
        print("\n*****************************")
        print("******* alpha = %f *******" % alphas[i])
        summary_dict["alpha"].append(alphas[i])
        conll_results = conll.evaluate_conll(conll_eval_path,
                                             coref_predictions[i],
                                             subtoken_maps,
                                             official_stdout=True)
        average_f1 = sum(
            results["f"]
            for results in conll_results.values()) / len(conll_results)
        summary_dict["Average F1 (conll)"].append(average_f1)
        print("Average F1 (conll): {:.2f}%".format(average_f1))

        p, r, f = coref_evaluators[i].get_prf()
        summary_dict["Average F1 (py)"].append(f)
        print("Average F1 (py): {:.2f}% on {} docs".format(
            f * 100, len(subtoken_maps.keys())))
        summary_dict["Average precision (py)"].append(p)
        print("Average precision (py): {:.2f}%".format(p * 100))
        summary_dict["Average recall (py)"].append(r)
        print("Average recall (py): {:.2f}%".format(r * 100))

    return summary_dict
Example #6
    def evaluate(self, session, official_stdout=False, eval_mode=False):
        self.load_eval_data()
        coref_predictions = {}

        for example_num, (tensorized_example,
                          example) in enumerate(self.eval_data):
            _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example
            feed_dict = {
                i: t
                for i, t in zip(self.input_tensors, tensorized_example)
            }
            candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends, \
                top_antecedents, top_antecedent_scores = session.run(self.predictions, feed_dict=feed_dict)
            """
            candidate_starts: (num_words, max_span_width) 所有候选span的start
            candidate_ends: (num_words, max_span_width) 所有候选span的end
            candidate_mention_scores: (num_candidates,) 候选答案的得分
            top_span_starts: (k, ) 筛选过mention之后的候选的start_index
            top_span_ends: (k, ) 筛选过mention之后的候选的end_index
            top_antecedents: (k, c) 粗筛过antecedent之后的每个候选antecedent的index
            top_antecedent_scores: (k, c) 粗筛过antecedent之后的每个候选antecedent的score
            """
            predicted_antecedents = self.get_predicted_antecedents(
                top_antecedents, top_antecedent_scores)
            coref_predictions[example["doc_key"]] = self.evaluate_coref(
                top_span_starts, top_span_ends, predicted_antecedents,
                example["clusters"])
            if (example_num + 1) % 100 == 0:
                print("Evaluated {}/{} examples.".format(
                    example_num + 1, len(self.eval_data)))

        summary_dict = {}
        if eval_mode:  # when evaluating on the test set, re-run the official scorer as well
            conll_results = conll.evaluate_conll(
                self.config["conll_eval_path"], coref_predictions,
                self.subtoken_maps, official_stdout)
            average_f1 = sum(
                results["f"]
                for results in conll_results.values()) / len(conll_results)
            summary_dict["Average F1 (conll)"] = average_f1
            print("Average F1 (conll): {:.2f}%".format(average_f1))

        p, r, f = self.coref_evaluator.get_prf()
        summary_dict["Average F1 (py)"] = f
        print("Average F1 (py): {:.2f}% on {} docs".format(
            f * 100, len(self.eval_data)))
        summary_dict["Average precision (py)"] = p
        print("Average precision (py): {:.2f}%".format(p * 100))
        summary_dict["Average recall (py)"] = r
        print("Average recall (py): {:.2f}%".format(r * 100))

        return util.make_summary(summary_dict), f
Example #7
    def evaluate(self, model, device, official_stdout=False, keys=None, eval_mode=False):
        self.load_eval_data()

        coref_predictions = {}
        coref_evaluator = metrics.CorefEvaluator()
        doc_keys = []

        with torch.no_grad():
            for example_num, example in enumerate(tqdm(self.eval_data, desc="Eval_Examples")):
                tensorized_example = model.tensorize_example(example, is_training=False)

                input_ids = torch.from_numpy(tensorized_example[0]).long().to(device)
                input_mask = torch.from_numpy(tensorized_example[1]).long().to(device)
                text_len = torch.from_numpy(tensorized_example[2]).long().to(device)
                speaker_ids = torch.from_numpy(tensorized_example[3]).long().to(device)
                genre = torch.tensor(tensorized_example[4]).long().to(device)
                is_training = tensorized_example[5]
                gold_starts = torch.from_numpy(tensorized_example[6]).long().to(device)
                gold_ends = torch.from_numpy(tensorized_example[7]).long().to(device)
                cluster_ids = torch.from_numpy(tensorized_example[8]).long().to(device)
                sentence_map = torch.Tensor(tensorized_example[9]).long().to(device)

                if keys is not None and example['doc_key'] not in keys:
                    continue
                doc_keys.append(example['doc_key'])

                (candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends,
                 top_antecedents, top_antecedent_scores), loss = model(input_ids, input_mask, text_len, speaker_ids,
                                                genre, is_training, gold_starts, gold_ends, cluster_ids, sentence_map)

                predicted_antecedents = self.get_predicted_antecedents(top_antecedents.cpu(), top_antecedent_scores.cpu())
                coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends,
                                                                            predicted_antecedents, example["clusters"],
                                                                            coref_evaluator)

        summary_dict = {}
        if eval_mode:
            conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, self.subtoken_maps,
                                                 official_stdout)
            average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
            summary_dict["Average F1 (conll)"] = average_f1
            print("Average F1 (conll): {:.2f}%".format(average_f1))

        p, r, f = coref_evaluator.get_prf()
        summary_dict["Average F1 (py)"] = f
        print("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(doc_keys)))
        summary_dict["Average precision (py)"] = p
        print("Average precision (py): {:.2f}%".format(p * 100))
        summary_dict["Average recall (py)"] = r
        print("Average recall (py): {:.2f}%".format(r * 100))

        return summary_dict, f
Example #8
    def evaluate(self, session, official_stdout=False):
        # self.load_eval_data()
        with open(self.config["inv_mapping"], 'rb') as handle:
            inv_mapping = pickle.load(handle)
        with open(self.config["eval_path"], 'rb') as handle:
            test = pickle.load(handle)

        coref_predictions = {}
        coref_evaluator = metrics.CorefEvaluator()
        for i in range(len(test)):
            if i == 191 or i == 217 or i == 225:
                continue
            example = test[i]
            file_name = example["doc_key"]
            inv_map = inv_mapping[file_name]
            tensorized_example = self.tensorize_example(example,
                                                        is_training=False)
            _, _, _, _, _, _, gold_starts, gold_ends, _ = tensorized_example
            feed_dict = {
                i: t
                for i, t in zip(self.input_tensors, tensorized_example)
            }
            _, _, _, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores = session.run(
                self.predictions, feed_dict=feed_dict)
            top_span_starts = inv_map[top_span_starts]
            top_span_ends = inv_map[top_span_ends]
            predicted_antecedents = self.get_predicted_antecedents(
                top_antecedents, top_antecedent_scores)
            coref_predictions[file_name] = self.evaluate_coref(
                top_span_starts, top_span_ends, predicted_antecedents,
                example["clusters"], coref_evaluator)
            if i % 10 == 0:
                print("Evaluated {}/{} examples.".format(i + 1, len(test)))

        summary_dict = {}
        conll_results = conll.evaluate_conll(self.config["conll_eval_path"],
                                             coref_predictions,
                                             official_stdout)
        average_f1 = sum(
            results["f"]
            for results in conll_results.values()) / len(conll_results)

        summary_dict["Average F1 (conll)"] = average_f1
        print("Average F1 (conll): {:.2f}%".format(average_f1))
        p, r, f = coref_evaluator.get_prf()
        summary_dict["Average F1 (py)"] = f
        print("Average F1 (py): {:.2f}%".format(f * 100))
        summary_dict["Average precision (py)"] = p
        print("Average precision (py): {:.2f}%".format(p * 100))
        summary_dict["Average recall (py)"] = r
        print("Average recall (py): {:.2f}%".format(r * 100))
Example #9
  def evaluate(self, session, official_stdout=False):
    self.load_eval_data()

    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()

    avg_loss = 0.0
    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
      _, _, _, _, _, _, _, _, _, gold_starts, gold_ends, _, _, _, _ = tensorized_example
      feed_dict = {i:t for i,t in zip(self.input_tensors, tensorized_example)}
      
      predictions, loss = session.run([self.predictions, self.loss], feed_dict=feed_dict)
      candidate_starts, candidate_ends, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores, _, _ = predictions

      predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
      coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)

      if example_num % 20 == 0:
        print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))
      avg_loss += loss

    avg_loss = avg_loss / len(self.eval_data)
    summary_dict = {}
    conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, official_stdout)

    cluster_result = {'prediction':coref_predictions, 'gold':official_stdout}

    with open('evaluate_result.pickle', 'wb') as handle:
      pickle.dump(cluster_result, handle, protocol=pickle.HIGHEST_PROTOCOL)

    average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
    summary_dict["Average F1 (conll)"] = average_f1
    print("Average F1 (conll): {:.2f}%".format(average_f1))

    p,r,f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}%".format(f * 100))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))
    summary_dict["Validation loss"] = avg_loss
    print("Validation loss: {:.3f}".format(avg_loss))
    

    return util.make_summary(summary_dict), average_f1, avg_loss
Example #10
def evaluate(model, eval_dataloader, data_path, conll_path, prediction_path,
             device):
    with open(data_path) as f:
        examples = [json.loads(jsonline) for jsonline in f.readlines()]

    model.eval()
    coref_predictions = {}
    subtoken_maps = {}
    coref_evaluator = metrics.CorefEvaluator(singleton=False)
    predicted_antecedents = []
    predicted_clusters = []
    with torch.no_grad():
        for i, (batch, example) in enumerate(zip(eval_dataloader, examples)):
            subtoken_maps[example['doc_key']] = example["subtoken_map"]
            doc_key = batch[0]
            assert doc_key == example["doc_key"], (doc_key, example["doc_key"])
            input_ids, input_mask, text_len, speaker_ids, genre, gold_starts, gold_ends, cluster_ids, sentence_map, \
            subtoken_map = [b.to(device) for b in batch[1:]]

            predictions, loss = model(input_ids, input_mask, text_len,
                                      speaker_ids, genre, gold_starts,
                                      gold_ends, cluster_ids, sentence_map,
                                      subtoken_map)
            (top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores, candidate_starts, candidate_ends,
             top_span_cluster_ids, top_span_mention_scores, candidate_mention_scores) = \
                [p.detach().cpu() for p in predictions]

            antecedents = get_predicted_antecedents(
                top_antecedents.numpy(), top_antecedent_scores.numpy())
            clusters = evaluate_coref(top_span_starts.numpy(),
                                      top_span_ends.numpy(), antecedents,
                                      example["clusters"], coref_evaluator,
                                      top_span_mention_scores)
            coref_predictions[example["doc_key"]] = clusters
            predicted_antecedents.append(antecedents)
            predicted_clusters.append(clusters)

    coref_p, coref_r, coref_f = coref_evaluator.get_prf()
    conll_results = conll.evaluate_conll(conll_path,
                                         prediction_path,
                                         coref_predictions,
                                         subtoken_maps,
                                         official_stdout=True)

    return coref_p, coref_r, coref_f, conll_results
Example #11
  def evaluate(self, session, global_step=None, official_stdout=False,
               keys=None, eval_mode=False, to_npy=None, from_npy=None,
               rsa_model=None):
    assert not (to_npy is not None and from_npy is not None), "cannot set both to_npy and from_npy!"

    self.load_eval_data()

    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    losses = []
    doc_keys = []
    num_evaluated = 0
    total_time = 0

    if to_npy:
      data_dicts = []
    if from_npy:
      with open(from_npy, "rb") as f:
        from_npy_dict = np.load(f)
        data_dicts = from_npy_dict.item().get("data_dicts")

    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
      _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example

      if from_npy is None:
        feed_dict = {i:t for i,t in zip(self.input_tensors, tensorized_example)}
        # if tensorized_example[0].shape[0] <= 9:
        if keys is not None and example['doc_key'] not in keys:
          # print('Skipping...', example['doc_key'], tensorized_example[0].shape)
          continue
        doc_keys.append(example['doc_key'])
        loss, (candidate_starts, candidate_ends, candidate_mention_scores,
               top_span_starts, top_span_ends,
               top_antecedents, top_antecedent_scores) = \
          session.run([self.loss, self.predictions], feed_dict=feed_dict)
      else:
        data_dict = data_dicts[example_num]
        example = data_dict["example"]

        if keys is not None and example['doc_key'] not in keys:
          # print('Skipping...', example['doc_key'], tensorized_example[0].shape)
          continue
        doc_keys.append(example['doc_key'])

        tensorized_example = data_dict["tensorized_example"]
        loss = data_dict["loss"]
        top_span_starts = data_dict["top_span_starts"]
        top_span_ends = data_dict["top_span_ends"]
        top_antecedents = data_dict["top_antecedents"]
        top_antecedent_scores = data_dict["top_antecedent_scores"]

      # losses.append(session.run(self.loss, feed_dict=feed_dict))
      losses.append(loss)

      if rsa_model is not None:
        print("Running l1 for sentence %d" % example_num)
        start_time = time.time()
        top_antecedent_scores = rsa_model.l1(example, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores)
        duration = time.time() - start_time
        print("Finished sentence %d, took %.2f s" % (example_num, duration))
        total_time += duration
        num_evaluated += 1

      if to_npy:
          data_dict = {
              "example_num": example_num,
              "tensorized_example": tensorized_example,
              "example": example,
              "top_span_starts": top_span_starts,
              "top_span_ends": top_span_ends,
              "top_antecedents": top_antecedents,
              "top_antecedent_scores": top_antecedent_scores,
              "loss": loss,
          }
          data_dicts.append(data_dict)

      predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
      coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)

      if example_num % 10 == 0:
        print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

    summary_dict = {}
    if to_npy:
      dict_to_npy = {"data_dicts": data_dicts}

    if eval_mode:
      conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, self.subtoken_maps, official_stdout )
      average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
      summary_dict["Average F1 (conll)"] = average_f1
      if to_npy:
        dict_to_npy["Average F1 (conll)"] = average_f1
      print("Average F1 (conll): {:.2f}%".format(average_f1))


    p,r,f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(doc_keys)))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))

    if to_npy:
      dict_to_npy["Average F1 (py)"] = f
      dict_to_npy["Average precision (py)"] = p
      dict_to_npy["Average recall (py)"] = r
      with open(to_npy, "wb") as f_to_npy:
        np.save(f_to_npy, dict_to_npy)

    if rsa_model:
        print("Ran rsa on %d sentences, avg time per sentence %.2f s" % num_evaluated, total_time / num_evaluated)

    return util.make_summary(summary_dict), f
Example #12
    def evaluate(self,
                 session,
                 global_step=None,
                 official_stdout=False,
                 keys=None,
                 eval_mode=False):
        self.load_eval_data()

        coref_predictions = {}
        coref_evaluator = metrics.CorefEvaluator()
        losses = []
        doc_keys = []
        num_evaluated = 0

        for example_num, (tensorized_example,
                          example) in enumerate(self.eval_data):
            _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example
            feed_dict = {
                i: t
                for i, t in zip(self.input_tensors, tensorized_example)
            }
            # if tensorized_example[0].shape[0] <= 9:
            # if keys is not None and example['doc_key']  in keys:
            # print('Skipping...', example['doc_key'], tensorized_example[0].shape)
            # continue
            doc_keys.append(example['doc_key'])
            loss, (candidate_starts, candidate_ends, candidate_mention_scores,
                   top_span_starts, top_span_ends, top_antecedents,
                   top_antecedent_scores) = session.run(
                       [self.loss, self.predictions], feed_dict=feed_dict)
            # losses.append(session.run(self.loss, feed_dict=feed_dict))
            losses.append(loss)
            predicted_antecedents = self.get_predicted_antecedents(
                top_antecedents, top_antecedent_scores)
            coref_predictions[example["doc_key"]] = self.evaluate_coref(
                top_span_starts, top_span_ends, predicted_antecedents,
                example["clusters"], coref_evaluator)
            if example_num % 10 == 0:
                print("Evaluated {}/{} examples.".format(
                    example_num + 1, len(self.eval_data)))

        summary_dict = {}
        # with open('doc_keys_512.txt', 'w') as f:
        # for key in doc_keys:
        # f.write(key + '\n')
        if eval_mode:
            conll_results = conll.evaluate_conll(
                self.config["conll_eval_path"], coref_predictions,
                self.subtoken_maps, official_stdout)
            average_f1 = sum(
                results["f"]
                for results in conll_results.values()) / len(conll_results)
            summary_dict["Average F1 (conll)"] = average_f1
            print("Average F1 (conll): {:.2f}%".format(average_f1))

        p, r, f = coref_evaluator.get_prf()
        summary_dict["Average F1 (py)"] = f
        print("Average F1 (py): {:.2f}% on {} docs".format(
            f * 100, len(doc_keys)))
        summary_dict["Average precision (py)"] = p
        print("Average precision (py): {:.2f}%".format(p * 100))
        summary_dict["Average recall (py)"] = r
        print("Average recall (py): {:.2f}%".format(r * 100))

        return util.make_summary(summary_dict), f
Example #13
                        top_antecedents, top_antecedent_scores)
                    example[
                        "predicted_clusters"], _ = model.get_predicted_clusters(
                            top_span_starts, top_span_ends,
                            predicted_antecedents)
                    coref_predictions[
                        example["doc_key"]] = model.evaluate_coref(
                            top_span_starts, top_span_ends,
                            predicted_antecedents, example["clusters"],
                            coref_evaluator)

                    output_file.write(json.dumps(example))
                    output_file.write("\n")
                    if example_num % 100 == 0:
                        print("Decoded {} examples.".format(example_num + 1))
    summary_dict = {}
    conll_results = conll.evaluate_conll(model.config["conll_eval_path"],
                                         coref_predictions, False)
    average_f1 = sum(
        results["f"]
        for results in conll_results.values()) / len(conll_results)
    summary_dict["Average F1 (conll)"] = average_f1
    print("Average F1 (conll): {:.2f}%".format(average_f1))
    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}%".format(f * 100))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))
Example #14
def eval_coref(config):
    """
    指代消解模型验证
    :param config: 配置参数
    :return: None
    """
    model = CorefModel.from_pretrained(config["model_save_path"],
                                       coref_task_config=config)
    model.to(device)

    examples = model.get_eval_example()

    logger.info("********** Running Eval ****************")
    logger.info("  Num dev examples = %d", len(examples))

    model.eval()
    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    doc_keys = []
    keys = None
    with torch.no_grad():
        for example_num, example in enumerate(
                tqdm(examples, desc="Eval_Examples")):
            tensorized_example = model.tensorize_example(example,
                                                         is_training=False)

            input_ids = torch.from_numpy(
                tensorized_example[0]).long().to(device)
            input_mask = torch.from_numpy(
                tensorized_example[1]).long().to(device)
            text_len = torch.from_numpy(
                tensorized_example[2]).long().to(device)
            speaker_ids = torch.from_numpy(
                tensorized_example[3]).long().to(device)
            genre = torch.tensor(tensorized_example[4]).long().to(device)
            is_training = tensorized_example[5]
            gold_starts = torch.from_numpy(
                tensorized_example[6]).long().to(device)
            gold_ends = torch.from_numpy(
                tensorized_example[7]).long().to(device)
            cluster_ids = torch.from_numpy(
                tensorized_example[8]).long().to(device)
            sentence_map = torch.Tensor(
                tensorized_example[9]).long().to(device)

            if keys is not None and example['doc_key'] not in keys:
                continue
            doc_keys.append(example['doc_key'])

            (candidate_starts, candidate_ends, candidate_mention_scores,
             top_span_starts, top_span_ends, top_antecedents,
             top_antecedent_scores), loss = model(input_ids, input_mask,
                                                  text_len, speaker_ids, genre,
                                                  is_training, gold_starts,
                                                  gold_ends, cluster_ids,
                                                  sentence_map)

            predicted_antecedents = model.get_predicted_antecedents(
                top_antecedents.cpu(), top_antecedent_scores.cpu())
            coref_predictions[example["doc_key"]] = model.evaluate_coref(
                top_span_starts, top_span_ends, predicted_antecedents,
                example["clusters"], coref_evaluator)
    official_stdout = True
    eval_mode = True
    summary_dict = {}
    if eval_mode:
        conll_results = conll.evaluate_conll(config["conll_eval_path"],
                                             coref_predictions,
                                             model.subtoken_maps,
                                             official_stdout)
        average_f1 = sum(
            results["f"]
            for results in conll_results.values()) / len(conll_results)
        summary_dict["Average F1 (conll)"] = average_f1
        print("Average F1 (conll): {:.2f}%".format(average_f1))

    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(doc_keys)))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))
Example #15
    def evaluate(self, session, official_stdout=False):
        # self.load_eval_data()
        with open(self.config["inv_mapping"], 'rb') as handle:
            inv_mapping = pickle.load(handle)
        with open(self.config["eval_path"], 'rb') as handle:
            test = pickle.load(handle)
        coref_predictions = {}
        coref_evaluator = metrics.CorefEvaluator()
        swag_predictions = []
        swag_labels = []
        for i in range(len(test)):
            if i == 191 or i == 217 or i == 225:
                continue
            example = test[i]
            file_name = example["doc_key"]
            inv_map = inv_mapping[file_name]
            tensorized_example = self.tensorize_example(example,
                                                        i,
                                                        is_training=False)
            feed_dict = {
                i: t
                for i, t in zip(self.input_tensors, tensorized_example)
            }
            lee_predictions, swag_pred = session.run(
                [self.predictions2, self.swag_predictions],
                feed_dict=feed_dict)
            _, _, _, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores = lee_predictions
            top_span_starts = inv_map[top_span_starts]
            top_span_ends = inv_map[top_span_ends]
            predicted_antecedents = self.get_predicted_antecedents(
                top_antecedents, top_antecedent_scores)
            coref_predictions[file_name] = self.evaluate_coref(
                top_span_starts, top_span_ends, predicted_antecedents,
                example["clusters"], coref_evaluator)
            # SWAG evaluation
            swag_label = tensorized_example[-1]
            swag_predictions.append(swag_pred[0])
            swag_labels.append(swag_label[0])
            if i % 10 == 0:
                print("Evaluated {}/{} examples.".format(i + 1, len(test)))

        # Now compute summary metrics from the collected predictions.
        summary_dict = {}
        try:
            conll_results = conll.evaluate_conll(
                self.config["conll_eval_path"], coref_predictions,
                official_stdout)
            average_f1 = sum(
                results["f"]
                for results in conll_results.values()) / len(conll_results)

            summary_dict["Average F1 (conll)"] = average_f1
            print("Average F1 (conll): {:.2f}%".format(average_f1))
        except Exception:
            print("unstable results")
            average_f1 = 0
        p, r, f = coref_evaluator.get_prf()
        summary_dict["Average F1 (py)"] = f
        print("Average F1 (py): {:.2f}%".format(f * 100))
        summary_dict["Average precision (py)"] = p
        print("Average precision (py): {:.2f}%".format(p * 100))
        summary_dict["Average recall (py)"] = r
        print("Average recall (py): {:.2f}%".format(r * 100))
        print("Now evaluating SWAG")
        swag_accuracy = self.swag_evaluation(swag_predictions, swag_labels)
        print("Average SWAG accuracy is : {:.2f}%".format(swag_accuracy * 100))
        return util.make_summary(summary_dict), average_f1, swag_accuracy
Example #16
    def evaluate(self, model, prefix="", tb_writer=None, global_step=None, official=False):
        eval_dataset = get_dataset(self.args, tokenizer=self.tokenizer, evaluate=True)

        if self.eval_output_dir and not os.path.exists(self.eval_output_dir) and self.args.local_rank in [-1, 0]:
            os.makedirs(self.eval_output_dir)

        # Note that DistributedSampler samples randomly
        # eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
        eval_dataloader = BucketBatchSampler(eval_dataset, max_total_seq_len=self.args.max_total_seq_len, batch_size_1=True)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Examples number: %d", len(eval_dataset))
        model.eval()

        post_pruning_mention_evaluator = MentionEvaluator()
        mention_evaluator = MentionEvaluator()
        coref_evaluator = CorefEvaluator()
        losses = defaultdict(list)
        doc_to_prediction = {}
        doc_to_subtoken_map = {}
        for (doc_key, subtoken_maps), batch in eval_dataloader:

            batch = tuple(tensor.to(self.args.device) for tensor in batch)
            input_ids, attention_mask, start_entity_mentions_indices, end_entity_mentions_indices, start_antecedents_indices, end_antecedents_indices, gold_clusters = batch

            with torch.no_grad():
                outputs = model(input_ids=input_ids,
                                attention_mask=attention_mask,
                                start_entity_mention_labels=start_entity_mentions_indices,
                                end_entity_mention_labels=end_entity_mentions_indices,
                                start_antecedent_labels=start_antecedents_indices,
                                end_antecedent_labels=end_antecedents_indices,
                                gold_clusters=gold_clusters,
                                return_all_outputs=True)
                loss_dict = outputs[-1]

            if self.args.n_gpu > 1:
                loss_dict = {key: val.mean() for key, val in loss_dict.items()}

            for key, val in loss_dict.items():
                losses[key].append(val.item())

            outputs = outputs[1:-1]

            batch_np = tuple(tensor.cpu().numpy() for tensor in batch)
            outputs_np = tuple(tensor.cpu().numpy() for tensor in outputs)
            for output in zip(*(batch_np + outputs_np)):
                gold_clusters = output[6]
                gold_clusters = extract_clusters(gold_clusters)
                mention_to_gold_clusters = extract_mentions_to_predicted_clusters_from_clusters(gold_clusters)
                gold_mentions = list(mention_to_gold_clusters.keys())

                starts, end_offsets, coref_logits, mention_logits = output[-4:]

                max_antecedents = np.argmax(coref_logits, axis=1).tolist()
                mention_to_antecedent = {((int(start), int(end)), (int(starts[max_antecedent]), int(end_offsets[max_antecedent]))) for start, end, max_antecedent in
                                         zip(starts, end_offsets, max_antecedents) if max_antecedent < len(starts)}

                predicted_clusters, _ = extract_clusters_for_decode(mention_to_antecedent)
                candidate_mentions = list(zip(starts, end_offsets))

                mention_to_predicted_clusters = extract_mentions_to_predicted_clusters_from_clusters(predicted_clusters)
                predicted_mentions = list(mention_to_predicted_clusters.keys())
                post_pruning_mention_evaluator.update(candidate_mentions, gold_mentions)
                mention_evaluator.update(predicted_mentions, gold_mentions)
                coref_evaluator.update(predicted_clusters, gold_clusters, mention_to_predicted_clusters,
                                       mention_to_gold_clusters)
                doc_to_prediction[doc_key] = predicted_clusters
                doc_to_subtoken_map[doc_key] = subtoken_maps

        post_pruning_mention_precision, post_pruning_mentions_recall, post_pruning_mention_f1 = post_pruning_mention_evaluator.get_prf()
        mention_precision, mentions_recall, mention_f1 = mention_evaluator.get_prf()
        prec, rec, f1 = coref_evaluator.get_prf()

        results = [(key, sum(val) / len(val)) for key, val in losses.items()]
        results += [
            ("post pruning mention precision", post_pruning_mention_precision),
            ("post pruning mention recall", post_pruning_mentions_recall),
            ("post pruning mention f1", post_pruning_mention_f1),
            ("mention precision", mention_precision),
            ("mention recall", mentions_recall),
            ("mention f1", mention_f1),
            ("precision", prec),
            ("recall", rec),
            ("f1", f1)
        ]
        logger.info("***** Eval results {} *****".format(prefix))
        for key, values in results:
            if isinstance(values, float):
                logger.info(f"  {key} = {values:.3f}")
            else:
                logger.info(f"  {key} = {values}")
            if tb_writer is not None and global_step is not None:
                tb_writer.add_scalar(key, values, global_step)

        if self.eval_output_dir:
            output_eval_file = os.path.join(self.eval_output_dir, "eval_results.txt")
            with open(output_eval_file, "a") as writer:
                if prefix:
                    writer.write(f'\n{prefix}:\n')
                for key, values in results:
                    if isinstance(values, float):
                        writer.write(f"{key} = {values:.3f}\n")
                    else:
                        writer.write(f"{key} = {values}\n")

        results = OrderedDict(results)
        results["experiment_name"] = self.args.experiment_name
        results["data"] = prefix
        with open(os.path.join(self.args.output_dir, "results.jsonl"), "a+") as f:
            f.write(json.dumps(results) + '\n')

        if official:
            with open(os.path.join(self.args.output_dir, "preds.jsonl"), "w") as f:
                f.write(json.dumps(doc_to_prediction) + '\n')
                f.write(json.dumps(doc_to_subtoken_map) + '\n')

            if self.args.conll_path_for_eval is not None:
                conll_results = evaluate_conll(self.args.conll_path_for_eval, doc_to_prediction, doc_to_subtoken_map)
                official_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
                logger.info('Official avg F1: %.4f' % official_f1)

        return results
Example #17
  def evaluate(self, session, global_step=None, official_stdout=False, keys=None, eval_mode=False):
    self.load_eval_data()

    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    losses = []
    doc_keys = []
    num_evaluated= 0


    ##################################################################################################
    ################ WE FURTHER REPORT THE RESULTS SEPARATELY FOR P-P, NP-NP, AND P-NP PAIRS ################
    ##################################################################################################
    coref_predictions_pp = {}
    coref_predictions_pnp = {}
    coref_predictions_npnp = {}

    # span type
    coref_evaluator_pp = PairEvaluator()
    coref_evaluator_pnp = PairEvaluator()
    coref_evaluator_npnp = PairEvaluator()
    coref_evaluator_all = PairEvaluator()

    num_coref_pp = 0
    num_coref_pnp = 0
    num_coref_npnp = 0
    num_coref_all = 0

    # span freq
    coref_evaluator_freq = PairEvaluator()
    coref_evaluator_rare = PairEvaluator()
    
    num_coref_freq = 0
    num_coref_rare = 0

    # pron type
    coref_evaluators_type = dict()
    coref_evaluators_type["demo"], coref_evaluators_type["pos"], coref_evaluators_type["third"] = PairEvaluator(), PairEvaluator(), PairEvaluator()
    nums_coref_type = dict()
    nums_coref_type["demo"], nums_coref_type["pos"], nums_coref_type["third"] = 0, 0, 0

    count = 0 

    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
      try:
        # count += 1
        # if count == 10:
        #   break
        _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example
        feed_dict = {i:t for i,t in zip(self.input_tensors, tensorized_example)}
        # if tensorized_example[0].shape[0] <= 9:
        if keys is not None and example['doc_key'] not in keys:
          # print('Skipping...', example['doc_key'], tensorized_example[0].shape)
          continue
        doc_keys.append(example['doc_key'])
        loss, (candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores) = session.run([self.loss, self.predictions], feed_dict=feed_dict)
        # losses.append(session.run(self.loss, feed_dict=feed_dict))
        losses.append(loss)
        predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)

        coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)

        if example_num % 10 == 0:
          print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

        #####################################################################################
        # Evaluate on three different settings: NP-NP, NP-P, P-P by using a different cluster
        #####################################################################################

        # Span Type
        flatten_sentences = util.flatten(example["sentences"])
        gold_pp_pairs, gold_pnp_pairs, gold_npnp_pairs, num_pp_pairs, num_pnp_pairs, num_npnp_pairs, num_relation = self.cluster_to_pairs(example["clusters"], flatten_sentences)
        # predicted_clusters = coref_predictions[example["doc_key"]]
        pred_pp_pairs, pred_pnp_pairs, pred_npnp_pairs, _, _, _, _ = self.cluster_to_pairs(coref_predictions[example["doc_key"]], flatten_sentences)

        # Span Frequency
        gold_freq_pnp_pairs, gold_rare_pnp_pairs, num_freq_pairs, num_rare_pairs = self.cluster_to_pair_frequent(example["clusters"], flatten_sentences)
        pred_freq_pnp_pairs, pred_rare_pnp_pairs, _, _ = self.cluster_to_pair_frequent(coref_predictions[example["doc_key"]], flatten_sentences)

        # pronoun type demo, pos, third
        gold_type_pairs, gold_type_nums = self.cluster_to_pair_detailed_pronoun(example["clusters"], flatten_sentences)
        pred_type_pairs, pred_type_nums = self.cluster_to_pair_detailed_pronoun(coref_predictions[example["doc_key"]], flatten_sentences)
        
        for pron_type in ["demo", "pos", "third"]:
          coref_evaluators_type[pron_type].update(gold_type_pairs[pron_type], pred_type_pairs[pron_type])
          nums_coref_type[pron_type] += gold_type_nums[pron_type]

        all_gold = gold_pp_pairs + gold_pnp_pairs + gold_npnp_pairs
        all_pred = pred_pp_pairs + pred_pnp_pairs + pred_npnp_pairs

        coref_evaluator_pp.update(pred_pp_pairs, gold_pp_pairs)
        coref_evaluator_pnp.update(pred_pnp_pairs, gold_pnp_pairs)
        coref_evaluator_npnp.update(pred_npnp_pairs, gold_npnp_pairs)
        coref_evaluator_all.update(all_pred, all_gold)

        coref_evaluator_freq.update(pred_freq_pnp_pairs, gold_freq_pnp_pairs)
        coref_evaluator_rare.update(pred_rare_pnp_pairs, gold_rare_pnp_pairs)

        num_coref_pp += num_pp_pairs
        num_coref_pnp += num_pnp_pairs
        num_coref_npnp += num_npnp_pairs
        num_coref_all = num_coref_all + num_pp_pairs + num_pnp_pairs + num_npnp_pairs
        num_coref_freq += num_freq_pairs
        num_coref_rare += num_rare_pairs
      except Exception:
        pass  # skip examples that fail during evaluation

    summary_dict = {}

    self.print_prf(coref_evaluator_pp, summary_dict, doc_keys, "PP", num_coref_pp)
    self.print_prf(coref_evaluator_pnp, summary_dict, doc_keys, "PNP", num_coref_pnp)
    self.print_prf(coref_evaluator_npnp, summary_dict, doc_keys, "NPNP", num_coref_npnp)

    self.print_prf(coref_evaluator_freq, summary_dict, doc_keys, "FREQ", num_coref_freq)
    self.print_prf(coref_evaluator_rare, summary_dict, doc_keys, "RARE", num_coref_rare)

    for pron_type in ["demo", "pos", "third"]:
      self.print_prf(coref_evaluators_type[pron_type], summary_dict, doc_keys, pron_type, nums_coref_type[pron_type])
    
    self.print_prf(coref_evaluator_all, summary_dict, doc_keys, "ALL_PAIRS", num_coref_all)

    #######################################################################################

    # summary_dict = {}

    print("The evaluatoin results for all clusters")
    print("The number of pairs is "+ str(num_coref_all))
    
    p,r,f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(doc_keys)))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))

    if eval_mode:
      conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, self.subtoken_maps, official_stdout)
      average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
      summary_dict["Average F1 (conll)"] = average_f1
      print("Average F1 (conll): {:.2f}%".format(average_f1))


    return util.make_summary(summary_dict), f
Example #18
        for example_num, (tensorized_example,
                          example) in enumerate(model.eval_data):
            doc_key = example["doc_key"]
            mention_starts = mention_start_dict[doc_key]
            mention_ends = mention_end_dict[doc_key]
            antecedents = antecedents_dict[doc_key]
            antecedent_scores = mean_antecedent_scores[doc_key]
            predicted_antecedents = []
            for i, index in enumerate(
                    np.argmax(antecedent_scores, axis=1) - 1):
                if index < 0:
                    predicted_antecedents.append(-1)
                else:
                    predicted_antecedents.append(antecedents[i, index])
            merged_predictions[doc_key] = model.evaluate_coref(
                mention_starts, mention_ends, predicted_antecedents,
                example["clusters"], coref_evaluator)

    conll_results = conll.evaluate_conll(main_config["conll_eval_path"],
                                         merged_predictions,
                                         official_stdout=True)
    average_f = sum(results["f"]
                    for results in conll_results.values()) / len(conll_results)
    average_r = sum(results["r"]
                    for results in conll_results.values()) / len(conll_results)
    average_p = sum(results["p"]
                    for results in conll_results.values()) / len(conll_results)
    print "Merged average F1 (conll): {:.2f}%".format(average_f)
    print "Merged average Recall (conll): {:.2f}%".format(average_r)
    print "Merged average Precision (conll): {:.2f}%".format(average_p)
Example #19
        all_antecedent_scores[doc_key].append(antecedent_scores)

        if example_num % 10 == 0:
          print("Computed {}/{} examples.".format(example_num + 1, len(model.eval_data)))

    mean_antecedent_scores = { doc_key : np.mean(s, 0) for doc_key, s in all_antecedent_scores.items() }

    merged_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    for example_num, (tensorized_example, example) in enumerate(model.eval_data):
      doc_key = example["doc_key"]
      mention_starts = mention_start_dict[doc_key]
      mention_ends = mention_end_dict[doc_key]
      antecedents = antecedents_dict[doc_key]
      antecedent_scores = mean_antecedent_scores[doc_key]
      predicted_antecedents = []
      for i, index in enumerate(np.argmax(antecedent_scores, axis=1) - 1):
        if index < 0:
          predicted_antecedents.append(-1)
        else:
          predicted_antecedents.append(antecedents[i, index])
      merged_predictions[doc_key] = model.evaluate_coref(mention_starts, mention_ends, predicted_antecedents, example["clusters"], coref_evaluator)

  conll_results = conll.evaluate_conll(main_config["conll_eval_path"], merged_predictions, official_stdout=True)
  average_f = sum(results["f"] for results in conll_results.values()) / len(conll_results)
  average_r = sum(results["r"] for results in conll_results.values()) / len(conll_results)
  average_p = sum(results["p"] for results in conll_results.values()) / len(conll_results)
  print("Merged average F1 (conll): {:.2f}%".format(average_f))
  print("Merged average Recall (conll): {:.2f}%".format(average_r))
  print("Merged average Precision (conll): {:.2f}%".format(average_p))
Example #20
0
    def evaluate(self,
                 session,
                 global_step=None,
                 official_stdout=False,
                 keys=None,
                 eval_mode=False):
        self.load_eval_data()

        coref_predictions = {}
        coref_evaluator = metrics.CorefEvaluator()
        losses = []
        doc_keys = []
        num_evaluated = 0

        for example_num, (tensorized_example,
                          example) in enumerate(self.eval_data):
            _, _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example
            feed_dict = {
                i: t
                for i, t in zip(self.input_tensors, tensorized_example)
            }
            # if tensorized_example[0].shape[0] <= 9:
            if keys is not None and example['doc_key'] not in keys:
                # print('Skipping...', example['doc_key'], tensorized_example[0].shape)
                continue
            doc_keys.append(example['doc_key'])
            loss, (candidate_starts, candidate_ends, candidate_mention_scores,
                   top_span_starts, top_span_ends, top_antecedents,
                   top_antecedent_scores) = session.run(
                       [self.loss, self.predictions], feed_dict=feed_dict)
            # losses.append(session.run(self.loss, feed_dict=feed_dict))
            losses.append(loss)
            predicted_antecedents = self.get_predicted_antecedents(
                top_antecedents, top_antecedent_scores)

            coref_predictions[example["doc_key"]] = self.evaluate_coref(
                top_span_starts, top_span_ends, predicted_antecedents,
                example["clusters"], coref_evaluator)
            if example_num % 10 == 0:
                print("Evaluated {}/{} examples.".format(
                    example_num + 1, len(self.eval_data)))

                # Print the mentions of both example["clusters"] and coref_predictions[example["doc_key"]] below.
                print_clusters = False
                if print_clusters:
                    comb_text = [
                        word for sentence in example['sentences']
                        for word in sentence
                    ]
                    print('#### Example Clusters: ####')
                    for cluster in example['clusters']:
                        mapped = []
                        for mention in cluster:
                            mapped.append(
                                self.convert_mention(mention, comb_text,
                                                     example))
                        print(mapped, end=",\n")

                    print('#### Predicted Clusters: ####')
                    for cluster in coref_predictions[example["doc_key"]]:
                        mapped = []
                        for mention in cluster:
                            mapped.append(
                                self.convert_mention(mention, comb_text,
                                                     example))
                        print(mapped, end=",\n")

        summary_dict = {}
        if eval_mode:
            conll_results = conll.evaluate_conll(
                self.config["conll_eval_path"], coref_predictions,
                self.subtoken_maps, official_stdout)
            average_f1 = sum(
                results["f"]
                for results in conll_results.values()) / len(conll_results)
            summary_dict["Average F1 (conll)"] = average_f1
            print("Average F1 (conll): {:.2f}%".format(average_f1))

        p, r, f = coref_evaluator.get_prf()
        summary_dict["Average F1 (py)"] = f
        print("Average F1 (py): {:.2f}% on {} docs".format(
            f * 100, len(doc_keys)))
        summary_dict["Average precision (py)"] = p
        print("Average precision (py): {:.2f}%".format(p * 100))
        summary_dict["Average recall (py)"] = r
        print("Average recall (py): {:.2f}%".format(r * 100))

        return util_xlnet.make_summary(summary_dict), f
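
util_xlnet.make_summary is not shown in this example; in the e2e-coref family of codebases it is usually a one-liner that wraps the scalar dict as a TF1 TensorBoard summary. A hedged sketch of such a helper (an assumption, not code taken from this repository):

import tensorflow as tf  # TF1-style API, matching the session-based code above

def make_summary(value_dict):
    # Wrap each scalar metric as a TensorBoard summary value.
    return tf.Summary(value=[tf.Summary.Value(tag=name, simple_value=value)
                             for name, value in value_dict.items()])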
Example #21
0
    def evaluate(self,
                 session,
                 global_step=None,
                 official_stdout=False,
                 keys=None,
                 eval_mode=False,
                 visualize=False):
        self.load_eval_data()

        coref_predictions = {}
        coref_evaluator = metrics.CorefEvaluator()
        losses = []
        doc_keys = []
        num_evaluated = 0
        visualize_list = []

        for example_num, (tensorized_example,
                          example) in enumerate(self.eval_data):
            _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example
            feed_dict = {
                i: t
                for i, t in zip(self.input_tensors, tensorized_example)
            }
            # if tensorized_example[0].shape[0] <= 9:
            if keys is not None and example['doc_key'] not in keys:
                # print('Skipping...', example['doc_key'], tensorized_example[0].shape)
                continue
            doc_keys.append(example['doc_key'])
            loss, (candidate_starts, candidate_ends, candidate_mention_scores,
                   top_span_starts, top_span_ends, top_antecedents,
                   top_antecedent_scores) = session.run(
                       [self.loss, self.predictions], feed_dict=feed_dict)
            # losses.append(session.run(self.loss, feed_dict=feed_dict))
            losses.append(loss)
            predicted_antecedents = self.get_predicted_antecedents(
                top_antecedents, top_antecedent_scores)
            predicted_clusters = self.evaluate_coref(top_span_starts,
                                                     top_span_ends,
                                                     predicted_antecedents,
                                                     example["clusters"],
                                                     coref_evaluator)
            coref_predictions[example["doc_key"]] = predicted_clusters
            # if example_num % 10 == 0:
            #   print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

            # Visualize antecedents
            if visualize:
                print('*****New Doc*****')
                subtokens = util.flatten(example['sentences'])
                span_list, antecedent_list = [], []
                for idx, antecedent_idx in enumerate(predicted_antecedents):
                    if antecedent_idx == -1:
                        continue
                    span_subtoken_idx = (top_span_starts[idx],
                                         top_span_ends[idx])
                    span_str = ' '.join(
                        subtokens[span_subtoken_idx[0]:span_subtoken_idx[1] +
                                  1])

                    antecedent_subtoken_idx = (top_span_starts[antecedent_idx],
                                               top_span_ends[antecedent_idx])
                    antecedent_str = ' '.join(subtokens[
                        antecedent_subtoken_idx[0]:antecedent_subtoken_idx[1] +
                        1])

                    # print('%s ---> %s' % (span_str, antecedent_str))
                    span_list.append(span_str)
                    antecedent_list.append(antecedent_str)
                visualize_list.append((span_list, antecedent_list))

        summary_dict = {}
        if eval_mode:
            conll_results = conll.evaluate_conll(
                self.config["conll_eval_path"], coref_predictions,
                self.subtoken_maps, official_stdout)
            average_f1 = sum(
                results["f"]
                for results in conll_results.values()) / len(conll_results)
            summary_dict["Average F1 (conll)"] = average_f1
            print("Average F1 (conll): {:.2f}%".format(average_f1))

        p, r, f = coref_evaluator.get_prf()
        summary_dict["Average F1 (py)"] = f
        logger.info("Average F1 (py): {:.2f}% on {} docs".format(
            f * 100, len(doc_keys)))
        summary_dict["Average precision (py)"] = p
        logger.info("Average precision (py): {:.2f}%".format(p * 100))
        summary_dict["Average recall (py)"] = r
        logger.info("Average recall (py): {:.2f}%".format(r * 100))

        if visualize:
            with open('visualize.bin', 'wb') as f_out:  # avoid shadowing the F1 score `f` returned below
                pickle.dump(visualize_list, f_out)
            logger.info('Saved visualization to visualize.bin')

        return util.make_summary(summary_dict), f
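
For reference, the pickled visualize.bin written above holds one (span_list, antecedent_list) pair per document. A small follow-up sketch to load and inspect it (the file name comes from the code above; everything else is illustrative):

import pickle

with open('visualize.bin', 'rb') as f_in:
    visualize_list = pickle.load(f_in)

# Print the predicted antecedent links collected for the first few documents.
for span_list, antecedent_list in visualize_list[:3]:
    for span_str, antecedent_str in zip(span_list, antecedent_list):
        print('%s ---> %s' % (span_str, antecedent_str))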
Example #22
0
    def evaluate(self, session, official_stdout=False):
        self.load_eval_data()

        tp, fn, fp = 0, 0, 0
        coref_predictions = {}
        coref_evaluator = metrics.CorefEvaluator()

        for example_num, (tensorized_example,
                          example) in enumerate(self.eval_data):
            _, _, _, _, _, _, _, _, _, gold_starts, gold_ends, _, _, _, _ = tensorized_example
            feed_dict = {
                i: t
                for i, t in zip(self.input_tensors, tensorized_example)
            }
            top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores = session.run(
                self.predictions, feed_dict=feed_dict)
            predicted_antecedents = self.get_predicted_antecedents(
                top_antecedents, top_antecedent_scores)
            coref_predictions[example["doc_key"]] = self.evaluate_coref(
                top_span_starts, top_span_ends, predicted_antecedents,
                example["clusters"], coref_evaluator)

            gold_mentions = set([(s, e) for cl in example["clusters"]
                                 for s, e in cl])
            pred_mentions = set([
                (s, e) for s, e in zip(top_span_starts, top_span_ends)
            ])
            tp += len(gold_mentions & pred_mentions)
            fn += len(gold_mentions - pred_mentions)
            fp += len(pred_mentions - gold_mentions)
            if example_num % 10 == 0:
                print("Evaluated {}/{} examples.".format(
                    example_num + 1, len(self.eval_data)))

        m_r = float(tp) / (tp + fn)
        m_p = float(tp) / (tp + fp)
        m_f1 = 2.0 * m_r * m_p / (m_r + m_p)
        print("Mention F1: {:.2f}%".format(m_f1 * 100))
        print("Mention recall: {:.2f}%".format(m_r * 100))
        print("Mention precision: {:.2f}%".format(m_p * 100))

        summary_dict = {}
        if official_stdout:
            conll_results = conll.evaluate_conll(
                self.config["conll_eval_path"], coref_predictions,
                official_stdout)
            average_f1 = sum(
                results["f"]
                for results in conll_results.values()) / len(conll_results)
            summary_dict["Average F1 (conll)"] = average_f1
            print("Average F1 (conll): {:.2f}%".format(average_f1))

        p, r, f = coref_evaluator.get_prf()
        summary_dict["Average F1 (py)"] = f
        print("Average F1 (py): {:.2f}%".format(f * 100))
        summary_dict["Average precision (py)"] = p
        print("Average precision (py): {:.2f}%".format(p * 100))
        summary_dict["Average recall (py)"] = r
        print("Average recall (py): {:.2f}%".format(r * 100))
        average_f1 = average_f1 if official_stdout else f * 100
        return util.make_summary(summary_dict), average_f1
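
The mention-detection numbers in this example are plain set-based precision/recall/F1 over (start, end) spans, accumulated across documents. A minimal stand-alone sketch with toy spans:

gold_mentions = {(0, 1), (3, 5), (7, 7)}
pred_mentions = {(0, 1), (3, 4), (7, 7)}

tp = len(gold_mentions & pred_mentions)  # 2 spans matched exactly
fn = len(gold_mentions - pred_mentions)  # 1 gold span missed
fp = len(pred_mentions - gold_mentions)  # 1 spurious prediction
m_r = float(tp) / (tp + fn)
m_p = float(tp) / (tp + fp)
m_f1 = 2.0 * m_r * m_p / (m_r + m_p)
print("Mention F1: {:.2f}%".format(m_f1 * 100))  # 66.67%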