Example #1
  def evaluate(self, session, official_stdout=False):
    self.load_eval_data()

    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    predictions = []

    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
      _, _, _, _, _, _, _, _, _, gold_starts, gold_ends, _ = tensorized_example
      feed_dict = {i:t for i,t in zip(self.input_tensors, tensorized_example)}
      candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores = session.run(self.predictions, feed_dict=feed_dict)
      predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
      coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)
      example['predict_cluster'] = coref_predictions[example["doc_key"]]
      predictions.append(example)
      if example_num % 10 == 9:
        print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

    df = pd.DataFrame(predictions)
    df['predict'] = df[['predict_cluster', 'Pronoun_mention', 'A_mention', 'B_mention']].apply(predict, axis=1)
    accuracy = accuracy_score(df['predict'], df['label'])
    summary_dict = {}
    summary_dict['accuracy'] = accuracy
    print(accuracy)
    return summary_dict, accuracy

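Example #1 relies on a module-level predict helper that the snippet does not show. A minimal sketch of what such a row-wise function could look like, assuming GAP-style data where Pronoun_mention, A_mention and B_mention are (start, end) spans and the labels are "A", "B" or "NEITHER" (these names and the label set are assumptions, not taken from the original code):

def predict(row):
    # Hypothetical helper: decide whether the pronoun corefers with A, B, or neither,
    # based on the clusters predicted for this document.
    clusters = row['predict_cluster']
    pronoun = tuple(row['Pronoun_mention'])
    a_span = tuple(row['A_mention'])
    b_span = tuple(row['B_mention'])
    for cluster in clusters:
        spans = {tuple(mention) for mention in cluster}
        if pronoun in spans:
            if a_span in spans:
                return "A"
            if b_span in spans:
                return "B"
    return "NEITHER"
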
Example #2
  def evaluate(self, session, official_stdout=False):
    self.load_eval_data()

    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()

    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
      _, _, _, _, _, _, _, _, _, gold_starts, gold_ends, _ = tensorized_example
      feed_dict = {i:t for i,t in zip(self.input_tensors, tensorized_example)}
      candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores = session.run(self.predictions, feed_dict=feed_dict)
      predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
      coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)
      if example_num % 10 == 0:
        print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

    summary_dict = {}
    conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, official_stdout)
    average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
    summary_dict["Average F1 (conll)"] = average_f1
    print("Average F1 (conll): {:.2f}%".format(average_f1))

    p,r,f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}%".format(f * 100))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))

    return util.make_summary(summary_dict), average_f1
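Most of these examples call get_predicted_antecedents without showing it. Example #4 below inlines the same logic; pulled out as a standalone helper it is roughly the following sketch (column 0 of the score matrix is the dummy "no antecedent" option, hence the -1 offset):

import numpy as np

def get_predicted_antecedents(antecedents, antecedent_scores):
    # For each span, pick the highest-scoring antecedent candidate.
    # An argmax of 0 (index -1 after the offset) means the span has no
    # predicted antecedent and starts its own cluster.
    predicted_antecedents = []
    for i, index in enumerate(np.argmax(antecedent_scores, axis=1) - 1):
        if index < 0:
            predicted_antecedents.append(-1)
        else:
            predicted_antecedents.append(antecedents[i, index])
    return predicted_antecedents
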
Example #3
def evaluate(model, eval_dataloader, data_path, device):
    with open(data_path) as f:
        examples = [json.loads(jsonline) for jsonline in f.readlines()]

    model.eval()
    coref_evaluator = metrics.CorefEvaluator(singleton=False)
    with torch.no_grad():
        for i, (batch, example) in enumerate(zip(eval_dataloader, examples)):
            doc_key = batch[0]
            assert doc_key == example["doc_key"], (doc_key, example["doc_key"])
            input_ids, input_mask, text_len, speaker_ids, genre, gold_starts, gold_ends, cluster_ids, sentence_map, \
            subtoken_map = [b.to(device) for b in batch[1:]]
            predictions, loss = model(input_ids, input_mask, text_len, speaker_ids, genre, gold_starts, gold_ends,
                                      cluster_ids, sentence_map, subtoken_map)
            (top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores, candidate_starts, candidate_ends,
             top_span_cluster_ids, top_span_mention_scores, candidate_mention_scores) = \
                [p.detach().cpu() for p in predictions]

            predicted_antecedents = get_predicted_antecedents(top_antecedents.numpy(), top_antecedent_scores.numpy())
            predicted_clusters = evaluate_coref(top_span_starts.numpy(), top_span_ends.numpy(), predicted_antecedents,
                                                example["clusters"], coref_evaluator, top_span_mention_scores)

    coref_p, coref_r, coref_f = coref_evaluator.get_prf()

    return coref_p, coref_r, coref_f
Example #4
    def eval(self, path):
        eval_fd_list = self.get_feed_dict_list(path, False)
        coref_evaluator = metrics.CorefEvaluator()

        for fd, clusters in eval_fd_list:
            mention_starts, mention_ends = fd[self.mention_starts], fd[
                self.mention_ends]
            antecedents, mention_pair_scores = self.sess.run(
                self.predictions, fd)

            predicted_antecedents = []
            for i, index in enumerate(
                    np.argmax(mention_pair_scores, axis=1) - 1):
                if index < 0:
                    predicted_antecedents.append(-1)
                else:
                    predicted_antecedents.append(antecedents[i, index])

            self.evaluate_coref(mention_starts, mention_ends,
                                predicted_antecedents, clusters,
                                coref_evaluator)

        p, r, f = coref_evaluator.get_prf()
        print("Average F1 (py): {:.2f}%".format(f * 100))
        print("Average precision (py): {:.2f}%".format(p * 100))
        print("Average recall (py): {:.2f}%".format(r * 100))
Example #5
  def evaluate(self, session, official_stdout=False):
    self.load_eval_data()

    def _k_to_tag(k):
      if k == -3:
        return "oracle"
      elif k == -2:
        return "actual"
      elif k == -1:
        return "exact"
      elif k == 0:
        return "threshold"
      else:
        return "{}%".format(k)
    mention_evaluators = { k:util.RetrievalEvaluator() for k in [-3, -2, -1, 0, 10, 15, 20, 25, 30, 40, 50] }

    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()

    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
      _, _, _, _, _, _, gold_starts, gold_ends, _, tag_labels, tag_seq, tag_loss_label = tensorized_example
      feed_dict = {i:t for i,t in zip(self.input_tensors, tensorized_example)}
      candidate_starts, candidate_ends, mention_scores, mention_starts, mention_ends, antecedents, antecedent_scores, tag_outputs, tag_seq = session.run(self.predictions, feed_dict=feed_dict)

      self.evaluate_mentions(candidate_starts, candidate_ends, mention_starts, mention_ends, mention_scores, gold_starts, gold_ends, example, mention_evaluators)
      predicted_antecedents = self.get_predicted_antecedents(antecedents, antecedent_scores)

      coref_predictions[example["doc_key"]] = self.evaluate_coref(mention_starts, mention_ends, predicted_antecedents, example["clusters"], coref_evaluator)

      if example_num % 10 == 0:
        print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))
        # print tag_outputs
        # print tag_seq

    summary_dict = {}
    for k, evaluator in sorted(mention_evaluators.items(), key=operator.itemgetter(0)):
      tags = ["{} @ {}".format(t, _k_to_tag(k)) for t in ("R", "P", "F")]
      results_to_print = []
      for t, v in zip(tags, evaluator.metrics()):
        results_to_print.append("{:<10}: {:.2f}".format(t, v))
        summary_dict[t] = v
      print(", ".join(results_to_print))

    conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, official_stdout)
    average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
    summary_dict["Average F1 (conll)"] = average_f1
    print("Average F1 (conll): {:.2f}%".format(average_f1))

    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}%".format(f * 100))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))

    return util.make_summary(summary_dict), average_f1
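The TensorFlow-based examples return util.make_summary(summary_dict). In e2e-coref-style codebases this is usually a thin wrapper that turns the metric dict into a tf.Summary protobuf for TensorBoard; a sketch, assuming TF 1.x (treat this as an assumption about util, not a quote of it):

import tensorflow as tf

def make_summary(value_dict):
    # Wrap scalar metrics as TensorBoard summary values.
    return tf.Summary(value=[
        tf.Summary.Value(tag=k, simple_value=v) for k, v in value_dict.items()
    ])
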
Example #6
def conll_evaluate(l0_inputs, alphas, conll_eval_path,
                   all_top_antecedent_scores):
    print("Compiling clusters and evaluators for conll suite")
    if isinstance(alphas, float) or isinstance(alphas, int):
        alphas = [alphas]
    coref_predictions = [{} for _ in alphas]
    coref_evaluators = [metrics.CorefEvaluator() for _ in alphas]
    subtoken_maps = {}

    with open(l0_inputs, "rb") as f:
        data_dicts = np.load(f, allow_pickle=True).item().get("data_dicts")

    for example_num, data_dict in enumerate(tqdm(data_dicts)):
        example = data_dict["example"]
        subtoken_maps[example["doc_key"]] = example["subtoken_map"]
        top_span_starts = data_dict["top_span_starts"]
        top_span_ends = data_dict["top_span_ends"]
        top_antecedents = data_dict["top_antecedents"]

        for i in range(len(alphas)):
            top_antecedent_scores = all_top_antecedent_scores[
                example["doc_key"]][i]
            predicted_antecedents = get_predicted_antecedents(
                top_antecedents, top_antecedent_scores)
            coref_predictions[i][example["doc_key"]] = evaluate_coref(
                top_span_starts, top_span_ends, predicted_antecedents,
                example["clusters"], coref_evaluators[i])

    summary_dict = DD(list)
    for i in range(len(alphas)):
        print("\n*****************************")
        print("******* alpha = %f *******" % alphas[i])
        summary_dict["alpha"].append(alphas[i])
        conll_results = conll.evaluate_conll(conll_eval_path,
                                             coref_predictions[i],
                                             subtoken_maps,
                                             official_stdout=True)
        average_f1 = sum(
            results["f"]
            for results in conll_results.values()) / len(conll_results)
        summary_dict["Average F1 (conll)"].append(average_f1)
        print("Average F1 (conll): {:.2f}%".format(average_f1))

        p, r, f = coref_evaluators[i].get_prf()
        summary_dict["Average F1 (py)"].append(f)
        print("Average F1 (py): {:.2f}% on {} docs".format(
            f * 100, len(subtoken_maps.keys())))
        summary_dict["Average precision (py)"].append(p)
        print("Average precision (py): {:.2f}%".format(p * 100))
        summary_dict["Average recall (py)"].append(r)
        print("Average recall (py): {:.2f}%".format(r * 100))

    return summary_dict
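DD(list) in Example #6 is presumably just an alias for collections.defaultdict, so every metric key collects one value per alpha without any explicit key initialization:

from collections import defaultdict as DD  # assumed alias

summary_dict = DD(list)
summary_dict["alpha"].append(0.5)  # first access creates an empty list, so append never raises KeyError
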
Example #7
    def evaluate(self, model, device, official_stdout=False, keys=None, eval_mode=False):
        self.load_eval_data()

        coref_predictions = {}
        coref_evaluator = metrics.CorefEvaluator()
        doc_keys = []

        with torch.no_grad():
            for example_num, example in enumerate(tqdm(self.eval_data, desc="Eval_Examples")):
                tensorized_example = model.tensorize_example(example, is_training=False)

                input_ids = torch.from_numpy(tensorized_example[0]).long().to(device)
                input_mask = torch.from_numpy(tensorized_example[1]).long().to(device)
                text_len = torch.from_numpy(tensorized_example[2]).long().to(device)
                speaker_ids = torch.from_numpy(tensorized_example[3]).long().to(device)
                genre = torch.tensor(tensorized_example[4]).long().to(device)
                is_training = tensorized_example[5]
                gold_starts = torch.from_numpy(tensorized_example[6]).long().to(device)
                gold_ends = torch.from_numpy(tensorized_example[7]).long().to(device)
                cluster_ids = torch.from_numpy(tensorized_example[8]).long().to(device)
                sentence_map = torch.Tensor(tensorized_example[9]).long().to(device)

                if keys is not None and example['doc_key'] not in keys:
                    continue
                doc_keys.append(example['doc_key'])

                (candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends,
                 top_antecedents, top_antecedent_scores), loss = model(input_ids, input_mask, text_len, speaker_ids,
                                                genre, is_training, gold_starts, gold_ends, cluster_ids, sentence_map)

                predicted_antecedents = self.get_predicted_antecedents(top_antecedents.cpu(), top_antecedent_scores.cpu())
                coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends,
                                                                            predicted_antecedents, example["clusters"],
                                                                            coref_evaluator)

        summary_dict = {}
        if eval_mode:
            conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, self.subtoken_maps,
                                                 official_stdout)
            average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
            summary_dict["Average F1 (conll)"] = average_f1
            print("Average F1 (conll): {:.2f}%".format(average_f1))

        p, r, f = coref_evaluator.get_prf()
        summary_dict["Average F1 (py)"] = f
        print("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(doc_keys)))
        summary_dict["Average precision (py)"] = p
        print("Average precision (py): {:.2f}%".format(p * 100))
        summary_dict["Average recall (py)"] = r
        print("Average recall (py): {:.2f}%".format(r * 100))

        return summary_dict, f
Example #8
    def evaluate(self, session, official_stdout=False):
        # self.load_eval_data()
        with open(self.config["inv_mapping"], 'rb') as handle:
            inv_mapping = pickle.load(handle)
        with open(self.config["eval_path"], 'rb') as handle:
            test = pickle.load(handle)

        coref_predictions = {}
        coref_evaluator = metrics.CorefEvaluator()
        for i in range(len(test)):
            if i == 191 or i == 217 or i == 225:
                continue
            example = test[i]
            file_name = example["doc_key"]
            inv_map = inv_mapping[file_name]
            tensorized_example = self.tensorize_example(example,
                                                        is_training=False)
            _, _, _, _, _, _, gold_starts, gold_ends, _ = tensorized_example
            feed_dict = {
                i: t
                for i, t in zip(self.input_tensors, tensorized_example)
            }
            _, _, _, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores = session.run(
                self.predictions, feed_dict=feed_dict)
            top_span_starts = inv_map[top_span_starts]
            top_span_ends = inv_map[top_span_ends]
            predicted_antecedents = self.get_predicted_antecedents(
                top_antecedents, top_antecedent_scores)
            coref_predictions[file_name] = self.evaluate_coref(
                top_span_starts, top_span_ends, predicted_antecedents,
                example["clusters"], coref_evaluator)
            if i % 10 == 0:
                print("Evaluated {}/{} examples.".format(i + 1, len(test)))

        summary_dict = {}
        conll_results = conll.evaluate_conll(self.config["conll_eval_path"],
                                             coref_predictions,
                                             official_stdout)
        average_f1 = sum(
            results["f"]
            for results in conll_results.values()) / len(conll_results)

        summary_dict["Average F1 (conll)"] = average_f1
        print("Average F1 (conll): {:.2f}%".format(average_f1))
        p, r, f = coref_evaluator.get_prf()
        summary_dict["Average F1 (py)"] = f
        print("Average F1 (py): {:.2f}%".format(f * 100))
        summary_dict["Average precision (py)"] = p
        print("Average precision (py): {:.2f}%".format(p * 100))
        summary_dict["Average recall (py)"] = r
        print("Average recall (py): {:.2f}%".format(r * 100))
Example #9
  def evaluate(self, session, official_stdout=False):
    self.load_eval_data()

    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()

    avg_loss = 0.0
    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
      _, _, _, _, _, _, _, _, _, gold_starts, gold_ends, _, _, _, _ = tensorized_example
      feed_dict = {i:t for i,t in zip(self.input_tensors, tensorized_example)}
      
      predictions, loss = session.run([self.predictions, self.loss], feed_dict=feed_dict)
      candidate_starts, candidate_ends, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores, _, _ = predictions

      predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
      coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)

      if example_num % 20 == 0:
        print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))
      avg_loss += loss

    avg_loss = avg_loss / len(self.eval_data)
    summary_dict = {}
    conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, official_stdout)

    cluster_result = {'prediction':coref_predictions, 'gold':official_stdout}

    with open('evaluate_result.pickle', 'wb') as handle:
      pickle.dump(cluster_result, handle, protocol=pickle.HIGHEST_PROTOCOL)

    average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
    summary_dict["Average F1 (conll)"] = average_f1
    print("Average F1 (conll): {:.2f}%".format(average_f1))

    p,r,f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}%".format(f * 100))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))
    summary_dict["Validation loss"] = avg_loss
    print("Validation loss: {:.3f}".format(avg_loss))
    

    return util.make_summary(summary_dict), average_f1, avg_loss
Example #10
def evaluate(gold_file, predicted_file):

    metrics.INCLUDE_SINGLETONS = True

    eval_data = load_eval_data(gold_file, predicted_file)

    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()

    for example_num, example in enumerate(eval_data):

        coref_predictions[example["doc_key"]] = evaluate_coref(
            example['predicted_clusters'],
            example['clusters'],
            coref_evaluator,
        )

    mention_p, mention_r, mention_f = metrics.get_prf_mentions_for_all_documents(
        eval_data, coref_predictions)

    summary_dict = {}

    p, r, f = coref_evaluator.get_prf()
    average_f1 = f * 100
    summary_dict["Average F1 (py)"] = average_f1
    print("Average F1 (py): {:.2f}%".format(average_f1))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))

    average_mention_f1 = mention_f * 100
    summary_dict["Average mention F1 (py)"] = average_mention_f1
    print("Average mention F1 (py): {:.2f}%".format(average_mention_f1))
    summary_dict["Average mention precision (py)"] = mention_p
    print("Average mention precision (py): {:.2f}%".format(mention_p * 100))
    summary_dict["Average mention recall (py)"] = mention_r
    print("Average mention recall (py): {:.2f}%".format(mention_r * 100))
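Example #10 scores clusters that were already predicted offline, so its evaluate_coref only has to normalize both cluster sets and hand them to the evaluator. A sketch of such a helper, following the inline logic of Example #11 (the exact signature is an assumption):

def evaluate_coref(predicted_clusters, gold_clusters, evaluator):
    # Normalize mentions to hashable tuples, build mention -> cluster maps,
    # and let CorefEvaluator update its MUC / B-cubed / CEAF counts.
    gold_clusters = [tuple(tuple(m) for m in gc) for gc in gold_clusters]
    mention_to_gold = {m: gc for gc in gold_clusters for m in gc}
    predicted_clusters = [tuple(tuple(m) for m in pc) for pc in predicted_clusters]
    mention_to_predicted = {m: pc for pc in predicted_clusters for m in pc}
    evaluator.update(predicted_clusters, gold_clusters,
                     mention_to_predicted, mention_to_gold)
    return predicted_clusters
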
Example #11
    def evaluate(self, data, transformer_model):
        """ Evaluation function.
        """
        coref_evaluator = metrics.CorefEvaluator()

        with torch.no_grad():
            for idx, data_i in enumerate(data):
                sentences_ids, sentences_masks, sentences_valid_masks, gold_clusters, speaker_ids, sentence_map, subtoken_map, genre = data_i
                top_antecedents_score, top_antecedents_index, top_m_spans_masks, top_m_spans_start, top_m_spans_end = self.forward(
                    sentences_ids, sentences_masks, sentences_valid_masks,
                    speaker_ids, sentence_map, subtoken_map, genre,
                    transformer_model)
                predicted_antecedents = self.get_predicted_antecedents(
                    top_antecedents_index, top_antecedents_score)
                top_m_spans = list()
                for i in range(len(top_m_spans_start)):
                    top_m_spans.append(
                        tuple([
                            top_m_spans_start[i].item(),
                            top_m_spans_end[i].item()
                        ]))

                # all spans
                gold_clusters = [
                    tuple(tuple([m[0], m[1]]) for m in gc)
                    for gc in gold_clusters
                ]
                mention_to_gold = {}
                for gc in gold_clusters:
                    for mention in gc:
                        mention_to_gold[tuple(mention)] = gc
                predicted_clusters, mention_to_predicted = self.get_predicted_clusters(
                    top_m_spans, predicted_antecedents)
                coref_evaluator.update(predicted_clusters, gold_clusters,
                                       mention_to_predicted, mention_to_gold)

        return coref_evaluator.get_prf()
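Example #11 calls self.get_predicted_clusters, the usual span-level cluster construction in e2e-coref derivatives. A sketch of that helper, assuming spans is a list of (start, end) tuples aligned with predicted_antecedents:

def get_predicted_clusters(spans, predicted_antecedents):
    # Union each span with its predicted antecedent; -1 means "no antecedent".
    mention_to_cluster_id = {}
    clusters = []
    for i, predicted_index in enumerate(predicted_antecedents):
        if predicted_index < 0:
            continue
        antecedent = tuple(spans[predicted_index])
        cluster_id = mention_to_cluster_id.get(antecedent)
        if cluster_id is None:
            cluster_id = len(clusters)
            clusters.append([antecedent])
            mention_to_cluster_id[antecedent] = cluster_id
        mention = tuple(spans[i])
        clusters[cluster_id].append(mention)
        mention_to_cluster_id[mention] = cluster_id

    clusters = [tuple(cluster) for cluster in clusters]
    mention_to_predicted = {m: clusters[cid] for m, cid in mention_to_cluster_id.items()}
    return clusters, mention_to_predicted
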
Example #12
def eval_coref(config):
    """
    Evaluate the coreference resolution model.
    :param config: configuration parameters
    :return: None
    """
    model = CorefModel.from_pretrained(config["model_save_path"],
                                       coref_task_config=config)
    model.to(device)

    examples = model.get_eval_example()

    logger.info("********** Running Eval ****************")
    logger.info("  Num dev examples = %d", len(examples))

    model.eval()
    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    doc_keys = []
    keys = None
    with torch.no_grad():
        for example_num, example in enumerate(
                tqdm(examples, desc="Eval_Examples")):
            tensorized_example = model.tensorize_example(example,
                                                         is_training=False)

            input_ids = torch.from_numpy(
                tensorized_example[0]).long().to(device)
            input_mask = torch.from_numpy(
                tensorized_example[1]).long().to(device)
            text_len = torch.from_numpy(
                tensorized_example[2]).long().to(device)
            speaker_ids = torch.from_numpy(
                tensorized_example[3]).long().to(device)
            genre = torch.tensor(tensorized_example[4]).long().to(device)
            is_training = tensorized_example[5]
            gold_starts = torch.from_numpy(
                tensorized_example[6]).long().to(device)
            gold_ends = torch.from_numpy(
                tensorized_example[7]).long().to(device)
            cluster_ids = torch.from_numpy(
                tensorized_example[8]).long().to(device)
            sentence_map = torch.Tensor(
                tensorized_example[9]).long().to(device)

            if keys is not None and example['doc_key'] not in keys:
                continue
            doc_keys.append(example['doc_key'])

            (candidate_starts, candidate_ends, candidate_mention_scores,
             top_span_starts, top_span_ends, top_antecedents,
             top_antecedent_scores), loss = model(input_ids, input_mask,
                                                  text_len, speaker_ids, genre,
                                                  is_training, gold_starts,
                                                  gold_ends, cluster_ids,
                                                  sentence_map)

            predicted_antecedents = model.get_predicted_antecedents(
                top_antecedents.cpu(), top_antecedent_scores.cpu())
            coref_predictions[example["doc_key"]] = model.evaluate_coref(
                top_span_starts, top_span_ends, predicted_antecedents,
                example["clusters"], coref_evaluator)
    official_stdout = True
    eval_mode = True
    summary_dict = {}
    if eval_mode:
        conll_results = conll.evaluate_conll(config["conll_eval_path"],
                                             coref_predictions,
                                             model.subtoken_maps,
                                             official_stdout)
        average_f1 = sum(
            results["f"]
            for results in conll_results.values()) / len(conll_results)
        summary_dict["Average F1 (conll)"] = average_f1
        print("Average F1 (conll): {:.2f}%".format(average_f1))

    p, r, f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(doc_keys)))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))
Example #13
    def evaluate(self, session, official_stdout=False):
        self.load_eval_data()

        tp, fn, fp = 0, 0, 0
        coref_predictions = {}
        coref_evaluator = metrics.CorefEvaluator()

        for example_num, (tensorized_example,
                          example) in enumerate(self.eval_data):
            _, _, _, _, _, _, _, _, _, gold_starts, gold_ends, _, _, _, _ = tensorized_example
            feed_dict = {
                i: t
                for i, t in zip(self.input_tensors, tensorized_example)
            }
            top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores = session.run(
                self.predictions, feed_dict=feed_dict)
            predicted_antecedents = self.get_predicted_antecedents(
                top_antecedents, top_antecedent_scores)
            coref_predictions[example["doc_key"]] = self.evaluate_coref(
                top_span_starts, top_span_ends, predicted_antecedents,
                example["clusters"], coref_evaluator)

            gold_mentions = set([(s, e) for cl in example["clusters"]
                                 for s, e in cl])
            pred_mentions = set([
                (s, e) for s, e in zip(top_span_starts, top_span_ends)
            ])
            tp += len(gold_mentions & pred_mentions)
            fn += len(gold_mentions - pred_mentions)
            fp += len(pred_mentions - gold_mentions)
            if example_num % 10 == 0:
                print("Evaluated {}/{} examples.".format(
                    example_num + 1, len(self.eval_data)))

        m_r = float(tp) / (tp + fn)
        m_p = float(tp) / (tp + fp)
        m_f1 = 2.0 * m_r * m_p / (m_r + m_p)
        print("Mention F1: {:.2f}%".format(m_f1 * 100))
        print("Mention recall: {:.2f}%".format(m_r * 100))
        print("Mention precision: {:.2f}%".format(m_p * 100))

        summary_dict = {}
        if official_stdout:
            conll_results = conll.evaluate_conll(
                self.config["conll_eval_path"], coref_predictions,
                official_stdout)
            average_f1 = sum(
                results["f"]
                for results in conll_results.values()) / len(conll_results)
            summary_dict["Average F1 (conll)"] = average_f1
            print("Average F1 (conll): {:.2f}%".format(average_f1))

        p, r, f = coref_evaluator.get_prf()
        summary_dict["Average F1 (py)"] = f
        print("Average F1 (py): {:.2f}%".format(f * 100))
        summary_dict["Average precision (py)"] = p
        print("Average precision (py): {:.2f}%".format(p * 100))
        summary_dict["Average recall (py)"] = r
        print("Average recall (py): {:.2f}%".format(r * 100))
        average_f1 = average_f1 if official_stdout else f * 100
        return util.make_summary(summary_dict), average_f1
Example #14
    def evaluate(self,
                 session,
                 global_step=None,
                 official_stdout=False,
                 keys=None,
                 eval_mode=False,
                 visualize=False):
        self.load_eval_data()

        coref_predictions = {}
        coref_evaluator = metrics.CorefEvaluator()
        losses = []
        doc_keys = []
        num_evaluated = 0
        visualize_list = []

        for example_num, (tensorized_example,
                          example) in enumerate(self.eval_data):
            _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example
            feed_dict = {
                i: t
                for i, t in zip(self.input_tensors, tensorized_example)
            }
            # if tensorized_example[0].shape[0] <= 9:
            if keys is not None and example['doc_key'] not in keys:
                # print('Skipping...', example['doc_key'], tensorized_example[0].shape)
                continue
            doc_keys.append(example['doc_key'])
            loss, (candidate_starts, candidate_ends, candidate_mention_scores,
                   top_span_starts, top_span_ends, top_antecedents,
                   top_antecedent_scores) = session.run(
                       [self.loss, self.predictions], feed_dict=feed_dict)
            # losses.append(session.run(self.loss, feed_dict=feed_dict))
            losses.append(loss)
            predicted_antecedents = self.get_predicted_antecedents(
                top_antecedents, top_antecedent_scores)
            predicted_clusters = self.evaluate_coref(top_span_starts,
                                                     top_span_ends,
                                                     predicted_antecedents,
                                                     example["clusters"],
                                                     coref_evaluator)
            coref_predictions[example["doc_key"]] = predicted_clusters
            # if example_num % 10 == 0:
            #   print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

            # Visualize antecedents
            if visualize:
                print('*****New Doc*****')
                subtokens = util.flatten(example['sentences'])
                span_list, antecedent_list = [], []
                for idx, antecedent_idx in enumerate(predicted_antecedents):
                    if antecedent_idx == -1:
                        continue
                    span_subtoken_idx = (top_span_starts[idx],
                                         top_span_ends[idx])
                    span_str = ' '.join(
                        subtokens[span_subtoken_idx[0]:span_subtoken_idx[1] +
                                  1])

                    antecedent_subtoken_idx = (top_span_starts[antecedent_idx],
                                               top_span_ends[antecedent_idx])
                    antecedent_str = ' '.join(subtokens[
                        antecedent_subtoken_idx[0]:antecedent_subtoken_idx[1] +
                        1])

                    # print('%s ---> %s' % (span_str, antecedent_str))
                    span_list.append(span_str)
                    antecedent_list.append(antecedent_str)
                visualize_list.append((span_list, antecedent_list))

        summary_dict = {}
        if eval_mode:
            conll_results = conll.evaluate_conll(
                self.config["conll_eval_path"], coref_predictions,
                self.subtoken_maps, official_stdout)
            average_f1 = sum(
                results["f"]
                for results in conll_results.values()) / len(conll_results)
            summary_dict["Average F1 (conll)"] = average_f1
            print("Average F1 (conll): {:.2f}%".format(average_f1))

        p, r, f = coref_evaluator.get_prf()
        summary_dict["Average F1 (py)"] = f
        logger.info("Average F1 (py): {:.2f}% on {} docs".format(
            f * 100, len(doc_keys)))
        summary_dict["Average precision (py)"] = p
        logger.info("Average precision (py): {:.2f}%".format(p * 100))
        summary_dict["Average recall (py)"] = r
        logger.info("Average recall (py): {:.2f}%".format(r * 100))

        if visualize:
            with open('visualize.bin', 'wb') as f:
                pickle.dump(visualize_list, f)
            logger.info('Saved visualization')

        return util.make_summary(summary_dict), f
Example #15
    def evaluate(self, session, official_stdout=False):
        # self.load_eval_data()
        with open(self.config["inv_mapping"], 'rb') as handle:
            inv_mapping = pickle.load(handle)
        with open(self.config["eval_path"], 'rb') as handle:
            test = pickle.load(handle)
        coref_predictions = {}
        coref_evaluator = metrics.CorefEvaluator()
        swag_predictions = []
        swag_labels = []
        for i in range(len(test)):
            if i == 191 or i == 217 or i == 225:
                continue
            example = test[i]
            file_name = example["doc_key"]
            inv_map = inv_mapping[file_name]
            tensorized_example = self.tensorize_example(example,
                                                        i,
                                                        is_training=False)
            feed_dict = {
                i: t
                for i, t in zip(self.input_tensors, tensorized_example)
            }
            lee_predictions, swag_pred = session.run(
                [self.predictions2, self.swag_predictions],
                feed_dict=feed_dict)
            _, _, _, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores = lee_predictions
            top_span_starts = inv_map[top_span_starts]
            top_span_ends = inv_map[top_span_ends]
            predicted_antecedents = self.get_predicted_antecedents(
                top_antecedents, top_antecedent_scores)
            coref_predictions[file_name] = self.evaluate_coref(
                top_span_starts, top_span_ends, predicted_antecedents,
                example["clusters"], coref_evaluator)
            # SWAG evaluation
            swag_label = tensorized_example[-1]
            swag_predictions.append(swag_pred[0])
            swag_labels.append(swag_label[0])
            if i % 10 == 0:
                print("Evaluated {}/{} examples.".format(i + 1, len(test)))

        # And now you get the predictions.
        summary_dict = {}
        try:
            conll_results = conll.evaluate_conll(
                self.config["conll_eval_path"], coref_predictions,
                official_stdout)
            average_f1 = sum(
                results["f"]
                for results in conll_results.values()) / len(conll_results)

            summary_dict["Average F1 (conll)"] = average_f1
            print("Average F1 (conll): {:.2f}%".format(average_f1))
        except Exception:
            print("unstable results")
            average_f1 = 0
        p, r, f = coref_evaluator.get_prf()
        summary_dict["Average F1 (py)"] = f
        print("Average F1 (py): {:.2f}%".format(f * 100))
        summary_dict["Average precision (py)"] = p
        print("Average precision (py): {:.2f}%".format(p * 100))
        summary_dict["Average recall (py)"] = r
        print("Average recall (py): {:.2f}%".format(r * 100))
        print("Now evaluating SWAG")
        swag_accuracy = self.swag_evaluation(swag_predictions, swag_labels)
        print("Average SWAG accuracy is : {:.2f}%".format(swag_accuracy * 100))
        return util.make_summary(summary_dict), average_f1, swag_accuracy
Example #16
    def __init__(self, config):
        self.config = config
        self.max_segment_len = config['max_segment_len']
        self.max_span_width = config["max_span_width"]
        self.genres = {g: i for i, g in enumerate(config["genres"])}
        self.subtoken_maps = {}
        self.gold = {}
        self.eval_data = None  # Load eval data lazily.
        self.dropout = None
        self.bert_config = modeling.BertConfig.from_json_file(
            config["bert_config_file"])
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=config['vocab_file'], do_lower_case=False)

        input_props = []
        input_props.append(
            (tf.int32, [None, None]))  # input_ids. (batch_size, seq_len)
        input_props.append(
            (tf.int32, [None, None]))  # input_mask (batch_size, seq_len)
        input_props.append((tf.int32, [None]))  # Text lengths.
        input_props.append(
            (tf.int32, [None, None]))  # Speaker IDs.  (batch_size, seq_len)
        input_props.append(
            (tf.int32, []))  # Genre. Ensures the whole batch shares one genre, since multiple segments of one document can sit in the same batch.
        input_props.append((tf.bool, []))  # Is training.
        input_props.append(
            (tf.int32,
             [None]))  # Gold starts. Starts of all gold mentions in the whole document (not just one per instance).
        input_props.append((tf.int32, [None]))  # Gold ends. Ends of all gold mentions in the whole document.
        input_props.append(
            (tf.int32, [None]))  # Cluster ids. The cluster id of every gold mention in the document.
        input_props.append(
            (tf.int32, [None]))  # Sentence map. Which sentence each token in the document belongs to.

        self.queue_input_tensors = [
            tf.placeholder(dtype, shape) for dtype, shape in input_props
        ]
        dtypes, shapes = zip(*input_props)
        queue = tf.PaddingFIFOQueue(capacity=10, dtypes=dtypes,
                                    shapes=shapes)  # is capacity=10 the batch size?
        self.enqueue_op = queue.enqueue(self.queue_input_tensors)
        self.input_tensors = queue.dequeue()  # not the same as self.queue_input_tensors?

        self.predictions, self.loss = self.get_predictions_and_loss(
            *self.input_tensors)
        # bert stuff
        tvars = tf.trainable_variables()
        # If you're using TF weights only, tf_checkpoint and init_checkpoint can be the same
        # Get the assignment map from the tensorflow checkpoint.
        # Depending on the extension, use TF/Pytorch to load weights.
        assignment_map, initialized_variable_names = modeling.get_assignment_map_from_checkpoint(
            tvars, config['tf_checkpoint'])
        init_from_checkpoint = tf.train.init_from_checkpoint if config[
            'init_checkpoint'].endswith(
                'ckpt') else load_from_pytorch_checkpoint
        init_from_checkpoint(config['init_checkpoint'], assignment_map)
        print("**** Trainable Variables ****")
        for var in tvars:
            init_string = ""
            if var.name in initialized_variable_names:
                init_string = ", *INIT_FROM_CKPT*"
            # tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
            # init_string)
            print("  name = %s, shape = %s%s" %
                  (var.name, var.shape, init_string))

        num_train_steps = int(self.config['num_docs'] *
                              self.config['num_epochs'])  # number of documents * number of epochs
        num_warmup_steps = int(num_train_steps * 0.1)  # first 10% of steps are warm-up
        self.global_step = tf.train.get_or_create_global_step(
        )  # a different optimizer is built depending on the model
        self.train_op = optimization.create_custom_optimizer(
            tvars,
            self.loss,
            self.config['bert_learning_rate'],
            self.config['task_learning_rate'],
            num_train_steps,
            num_warmup_steps,
            False,
            self.global_step,
            freeze=-1,
            task_opt=self.config['task_optimizer'],
            eps=config['adam_eps'])
        self.coref_evaluator = metrics.CorefEvaluator()
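Example #16 only wires up the PaddingFIFOQueue; during training something still has to push tensorized documents through self.enqueue_op. A sketch of the usual feeder thread, assuming a train_examples list and a tensorize_example method like the ones used elsewhere in these examples (the name start_enqueue_thread is an assumption):

import random
import threading

def start_enqueue_thread(self, session, train_examples):
    # Tensorize each document and push it into the queue so that
    # self.input_tensors (the dequeue op) always has data during training.
    def _enqueue_loop():
        while True:
            random.shuffle(train_examples)
            for example in train_examples:
                tensorized = self.tensorize_example(example, is_training=True)
                feed_dict = dict(zip(self.queue_input_tensors, tensorized))
                session.run(self.enqueue_op, feed_dict=feed_dict)

    thread = threading.Thread(target=_enqueue_loop, daemon=True)
    thread.start()
    return thread
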
Example #17
    def evaluate(self, name='test', saves_results=False):
        # from collections import Counter
        # span_len_cnts = Counter()

        with torch.no_grad():
            log('evaluating')
            evaluator = metrics.CorefEvaluator()

            self.model.eval()
            batch_num = 0
            next_logging_pct = 10.
            start_time = time.time()
            cluster_predictions = {}
            avg_pos_acc = 0.

            for pct, example_idx, input_tensors, pos_tags, cand_mention_labels in data_utils.gen_batches(
                    name):
                batch_num += 1

                (
                    # [cand_num]
                    cand_mention_scores,
                    # [top_cand_num]
                    top_start_idxes,
                    # [top_cand_num]
                    top_end_idxes,
                    # [top_cand_num]
                    top_span_cluster_ids,
                    # [top_span_num, pruned_ant_num]
                    top_ant_idxes_of_spans,
                    # [top_cand_num, pruned_ant_num]
                    top_ant_cluster_ids_of_spans,
                    # # [top_cand_num, 1 + pruned_ant_num]
                    # top_ant_scores_of_spans,
                    # 4 * [top_cand_num, 1 + pruned_ant_num]
                    list_of_top_ant_scores_of_spans,
                    # [top_span_num, pruned_ant_num]
                    top_ant_mask_of_spans,
                    # [doc_len, pos_tag_num]
                    pos_tag_logits,
                    # [top_span_num, 1 + top_span_num], [top_span_num, top_span_num]
                    full_fast_ant_scores_of_spans,
                    full_ant_mask_of_spans) = self.model(*input_tensors)

                (
                    top_start_idxes, top_end_idxes, predicted_ant_idxes,
                    predicted_clusters, span_to_predicted_cluster
                ) = Runner.predict(
                    # [cand_num]
                    cand_mention_scores,
                    # [top_cand_num]
                    top_start_idxes,
                    # [top_cand_num]
                    top_end_idxes,
                    # [top_cand_num]
                    top_span_cluster_ids,
                    # [top_span_num, pruned_ant_num]
                    top_ant_idxes_of_spans,
                    # [top_cand_num, pruned_ant_num]
                    top_ant_cluster_ids_of_spans,
                    # # [top_cand_num, 1 + pruned_ant_num]
                    # top_ant_scores_of_spans,
                    # 4 * [top_cand_num, 1 + pruned_ant_num]
                    list_of_top_ant_scores_of_spans,
                    # [top_span_num, pruned_ant_num]
                    top_ant_mask_of_spans)

                # span_len_cnts.update((top_end_idxes - top_start_idxes + 1).tolist())

                if configs.predicts_pos_tags:
                    avg_pos_acc += Runner.compute_accuracy(
                        pos_tag_logits, pos_tags.cuda())

                gold_clusters = data_utils.get_gold_clusters(name, example_idx)
                gold_clusters = [
                    tuple(tuple(span) for span in cluster)
                    for cluster in gold_clusters
                ]
                span_to_gold_cluster = {
                    span: cluster
                    for cluster in gold_clusters for span in cluster
                }

                evaluator.update(
                    predicted=predicted_clusters,
                    gold=gold_clusters,
                    mention_to_predicted=span_to_predicted_cluster,
                    mention_to_gold=span_to_gold_cluster)
                cluster_predictions[data_utils.get_doc_key(
                    name, example_idx)] = predicted_clusters

                if pct >= next_logging_pct:
                    na_str = 'N/A'

                    log(f'{int(pct)}%,\ttime:\t{time.time() - start_time}\n'
                        f'pos_acc:\t{avg_pos_acc / batch_num if configs.predicts_pos_tags else na_str}\n'
                        f'f1:\t{evaluator.get_f1()}\n')
                    next_logging_pct += 5.

            epoch_precision, epoch_recall, epoch_f1 = evaluator.get_prf()

            avg_pos_acc /= batch_num

            avg_conll_f1 = conll.compute_avg_conll_f1(
                f'{configs.data_dir}/{name}.english.v4_gold_conll',
                cluster_predictions,
                official_stdout=True)

            na_str = 'N/A'

            log(f'avg_valid_time:\t{time.time() - start_time}\n'
                f'pos_acc:\t{avg_pos_acc if configs.predicts_pos_tags else na_str}\n'
                f'precision:\t{epoch_precision}\n'
                f'recall:\t{epoch_recall}\n'
                f'f1:\t{epoch_f1}\n'
                f'conll_f1: {avg_conll_f1}')

            # if saves_results:
            #     data_utils.save_predictions(name, cluster_predictions)

            # if name == 'test' and configs.training:
            #     if avg_conll_f1 > self.max_f1:
            #         self.max_f1 = avg_conll_f1
            #         # self.save_ckpt()
            #
            #         max_f1_file = open(configs.max_f1_path)
            #
            #         if epoch_f1 > float(max_f1_file.readline().strip()):
            #             max_f1_file.close()
            #             max_f1_file = open(configs.max_f1_path, 'w')
            #             print(epoch_f1, file=max_f1_file)
            #             self.save_ckpt()
            #
            #         max_f1_file.close()

            # self.lr_scheduler.step(epoch_f1)
            # self.lr_scheduler.step(-avg_epoch_loss)

            return avg_conll_f1
Example #18
  def evaluate(self, session, global_step=None, official_stdout=False, keys=None, eval_mode=False):
    self.load_eval_data()

    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    losses = []
    doc_keys = []
    num_evaluated= 0


    ##################################################################################################
    ################### WE FURTHER REPORT THE RESULTS SEPARATELY FOR P-P, NP-NP, P-NP ###############
    ##################################################################################################
    coref_predictions_pp = {}
    coref_predictions_pnp = {}
    coref_predictions_npnp = {}

    # span type
    coref_evaluator_pp = PairEvaluator()
    coref_evaluator_pnp = PairEvaluator()
    coref_evaluator_npnp = PairEvaluator()
    coref_evaluator_all = PairEvaluator()

    num_coref_pp = 0
    num_coref_pnp = 0
    num_coref_npnp = 0
    num_coref_all = 0

    # span freq
    coref_evaluator_freq = PairEvaluator()
    coref_evaluator_rare = PairEvaluator()
    
    num_coref_freq = 0
    num_coref_rare = 0

    # pron type
    coref_evaluators_type = dict()
    coref_evaluators_type["demo"], coref_evaluators_type["pos"], coref_evaluators_type["third"] = PairEvaluator(), PairEvaluator(), PairEvaluator()
    nums_coref_type = dict()
    nums_coref_type["demo"], nums_coref_type["pos"], nums_coref_type["third"] = 0, 0, 0

    count = 0 

    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
      try:
        # count += 1
        # if count == 10:
        #   break
        _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example
        feed_dict = {i:t for i,t in zip(self.input_tensors, tensorized_example)}
        # if tensorized_example[0].shape[0] <= 9:
        if keys is not None and example['doc_key'] not in keys:
          # print('Skipping...', example['doc_key'], tensorized_example[0].shape)
          continue
        doc_keys.append(example['doc_key'])
        loss, (candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores) = session.run([self.loss, self.predictions], feed_dict=feed_dict)
        # losses.append(session.run(self.loss, feed_dict=feed_dict))
        losses.append(loss)
        predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)

        coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)

        if example_num % 10 == 0:
          print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

        #####################################################################################
        # Evaluate on three different settings: NP-NP, NP-P, P-P by using a different cluster
        #####################################################################################

        # Span Type
        flatten_sentences = util.flatten(example["sentences"])
        gold_pp_pairs, gold_pnp_pairs, gold_npnp_pairs, num_pp_pairs, num_pnp_pairs, num_npnp_pairs, num_relation = self.cluster_to_pairs(example["clusters"], flatten_sentences)
        # predicted_clusters = coref_predictions[example["doc_key"]]
        pred_pp_pairs, pred_pnp_pairs, pred_npnp_pairs, _, _, _, _ = self.cluster_to_pairs(coref_predictions[example["doc_key"]], flatten_sentences)

        # Span Frequency
        gold_freq_pnp_pairs, gold_rare_pnp_pairs, num_freq_pairs, num_rare_pairs = self.cluster_to_pair_frequent(example["clusters"], flatten_sentences)
        pred_freq_pnp_pairs, pred_rare_pnp_pairs, _, _ = self.cluster_to_pair_frequent(coref_predictions[example["doc_key"]], flatten_sentences)

        # pronoun type demo, pos, third
        gold_type_pairs, gold_type_nums = self.cluster_to_pair_detailed_pronoun(example["clusters"], flatten_sentences)
        pred_type_pairs, pred_type_nums = self.cluster_to_pair_detailed_pronoun(coref_predictions[example["doc_key"]], flatten_sentences)
        
        for pron_type in ["demo", "pos", "third"]:
          coref_evaluators_type[pron_type].update(gold_type_pairs[pron_type], pred_type_pairs[pron_type])
          nums_coref_type[pron_type] += gold_type_nums[pron_type]

        all_gold = gold_pp_pairs + gold_pnp_pairs + gold_npnp_pairs
        all_pred = pred_pp_pairs + pred_pnp_pairs + pred_npnp_pairs

        coref_evaluator_pp.update(pred_pp_pairs, gold_pp_pairs)
        coref_evaluator_pnp.update(pred_pnp_pairs, gold_pnp_pairs)
        coref_evaluator_npnp.update(pred_npnp_pairs, gold_npnp_pairs)
        coref_evaluator_all.update(all_pred, all_gold)

        coref_evaluator_freq.update(pred_freq_pnp_pairs, gold_freq_pnp_pairs)
        coref_evaluator_rare.update(pred_rare_pnp_pairs, gold_rare_pnp_pairs)

        num_coref_pp += num_pp_pairs
        num_coref_pnp += num_pnp_pairs
        num_coref_npnp += num_npnp_pairs
        num_coref_all = num_coref_all + num_pp_pairs + num_pnp_pairs + num_npnp_pairs
        num_coref_freq += num_freq_pairs
        num_coref_rare += num_rare_pairs
      except Exception:
        pass  # skip examples that fail during evaluation

    summary_dict = {}

    self.print_prf(coref_evaluator_pp, summary_dict, doc_keys, "PP", num_coref_pp)
    self.print_prf(coref_evaluator_pnp, summary_dict, doc_keys, "PNP", num_coref_pnp)
    self.print_prf(coref_evaluator_npnp, summary_dict, doc_keys, "NPNP", num_coref_npnp)

    self.print_prf(coref_evaluator_freq, summary_dict, doc_keys, "FREQ", num_coref_freq)
    self.print_prf(coref_evaluator_rare, summary_dict, doc_keys, "RARE", num_coref_rare)

    for pron_type in ["demo", "pos", "third"]:
      self.print_prf(coref_evaluators_type[pron_type], summary_dict, doc_keys, pron_type, nums_coref_type[pron_type])
    
    self.print_prf(coref_evaluator_all, summary_dict, doc_keys, "ALL_PAIRS", num_coref_all)

    #######################################################################################

    # summary_dict = {}

    print("The evaluation results for all clusters")
    print("The number of pairs is " + str(num_coref_all))
    
    p,r,f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(doc_keys)))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))

    if eval_mode:
      conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, self.subtoken_maps, official_stdout)
      average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
      summary_dict["Average F1 (conll)"] = average_f1
      print("Average F1 (conll): {:.2f}%".format(average_f1))


    return util.make_summary(summary_dict), f
Example #19
    def evaluate(self,
                 session,
                 global_step=None,
                 official_stdout=False,
                 keys=None,
                 eval_mode=False):
        self.load_eval_data()

        coref_predictions = {}
        coref_evaluator = metrics.CorefEvaluator()
        losses = []
        doc_keys = []
        num_evaluated = 0

        for example_num, (tensorized_example,
                          example) in enumerate(self.eval_data):
            _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example
            feed_dict = {
                i: t
                for i, t in zip(self.input_tensors, tensorized_example)
            }
            # if tensorized_example[0].shape[0] <= 9:
            # if keys is not None and example['doc_key']  in keys:
            # print('Skipping...', example['doc_key'], tensorized_example[0].shape)
            # continue
            doc_keys.append(example['doc_key'])
            loss, (candidate_starts, candidate_ends, candidate_mention_scores,
                   top_span_starts, top_span_ends, top_antecedents,
                   top_antecedent_scores) = session.run(
                       [self.loss, self.predictions], feed_dict=feed_dict)
            # losses.append(session.run(self.loss, feed_dict=feed_dict))
            losses.append(loss)
            predicted_antecedents = self.get_predicted_antecedents(
                top_antecedents, top_antecedent_scores)
            coref_predictions[example["doc_key"]] = self.evaluate_coref(
                top_span_starts, top_span_ends, predicted_antecedents,
                example["clusters"], coref_evaluator)
            if example_num % 10 == 0:
                print("Evaluated {}/{} examples.".format(
                    example_num + 1, len(self.eval_data)))

        summary_dict = {}
        # with open('doc_keys_512.txt', 'w') as f:
        # for key in doc_keys:
        # f.write(key + '\n')
        if eval_mode:
            conll_results = conll.evaluate_conll(
                self.config["conll_eval_path"], coref_predictions,
                self.subtoken_maps, official_stdout)
            average_f1 = sum(
                results["f"]
                for results in conll_results.values()) / len(conll_results)
            summary_dict["Average F1 (conll)"] = average_f1
            print("Average F1 (conll): {:.2f}%".format(average_f1))

        p, r, f = coref_evaluator.get_prf()
        summary_dict["Average F1 (py)"] = f
        print("Average F1 (py): {:.2f}% on {} docs".format(
            f * 100, len(doc_keys)))
        summary_dict["Average precision (py)"] = p
        print("Average precision (py): {:.2f}%".format(p * 100))
        summary_dict["Average recall (py)"] = r
        print("Average recall (py): {:.2f}%".format(r * 100))

        return util.make_summary(summary_dict), f
Example #20
    def evaluate(self,
                 session,
                 global_step=None,
                 official_stdout=False,
                 keys=None,
                 eval_mode=False):
        self.load_eval_data()

        coref_predictions = {}
        coref_evaluator = metrics.CorefEvaluator()
        losses = []
        doc_keys = []
        num_evaluated = 0

        for example_num, (tensorized_example,
                          example) in enumerate(self.eval_data):
            _, _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example
            feed_dict = {
                i: t
                for i, t in zip(self.input_tensors, tensorized_example)
            }
            # if tensorized_example[0].shape[0] <= 9:
            if keys is not None and example['doc_key'] not in keys:
                # print('Skipping...', example['doc_key'], tensorized_example[0].shape)
                continue
            doc_keys.append(example['doc_key'])
            loss, (candidate_starts, candidate_ends, candidate_mention_scores,
                   top_span_starts, top_span_ends, top_antecedents,
                   top_antecedent_scores) = session.run(
                       [self.loss, self.predictions], feed_dict=feed_dict)
            # losses.append(session.run(self.loss, feed_dict=feed_dict))
            losses.append(loss)
            predicted_antecedents = self.get_predicted_antecedents(
                top_antecedents, top_antecedent_scores)

            coref_predictions[example["doc_key"]] = self.evaluate_coref(
                top_span_starts, top_span_ends, predicted_antecedents,
                example["clusters"], coref_evaluator)
            if example_num % 10 == 0:
                print("Evaluated {}/{} examples.".format(
                    example_num + 1, len(self.eval_data)))

                # Optionally print the mentions in both example["clusters"] and coref_predictions[example["doc_key"]].
                print_clusters = False
                if print_clusters:
                    comb_text = [
                        word for sentence in example['sentences']
                        for word in sentence
                    ]
                    print('#### Example Clusters: ####')
                    for cluster in example['clusters']:
                        mapped = []
                        for mention in cluster:
                            mapped.append(
                                self.convert_mention(mention, comb_text,
                                                     example))
                        print(mapped, end=",\n")

                    print('#### Predicted Clusters: ####')
                    for cluster in coref_predictions[example["doc_key"]]:
                        mapped = []
                        for mention in cluster:
                            mapped.append(
                                self.convert_mention(mention, comb_text,
                                                     example))
                        print(mapped, end=",\n")

        summary_dict = {}
        if eval_mode:
            conll_results = conll.evaluate_conll(
                self.config["conll_eval_path"], coref_predictions,
                self.subtoken_maps, official_stdout)
            average_f1 = sum(
                results["f"]
                for results in conll_results.values()) / len(conll_results)
            summary_dict["Average F1 (conll)"] = average_f1
            print("Average F1 (conll): {:.2f}%".format(average_f1))

        p, r, f = coref_evaluator.get_prf()
        summary_dict["Average F1 (py)"] = f
        print("Average F1 (py): {:.2f}% on {} docs".format(
            f * 100, len(doc_keys)))
        summary_dict["Average precision (py)"] = p
        print("Average precision (py): {:.2f}%".format(p * 100))
        summary_dict["Average recall (py)"] = r
        print("Average recall (py): {:.2f}%".format(r * 100))

        return util_xlnet.make_summary(summary_dict), f
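
evaluate_coref is also used but not shown in these examples. Judging from how its return value is handled (it is stored as the document's predicted clusters and it feeds the shared CorefEvaluator), a plausible sketch is the following; get_predicted_clusters and the evaluator.update signature are assumptions, not code confirmed by these snippets.

def evaluate_coref_sketch(top_span_starts, top_span_ends, predicted_antecedents,
                          gold_clusters, evaluator, get_predicted_clusters):
    # Normalise gold clusters to tuples of tuples so mentions can serve as dict keys.
    gold_clusters = [tuple(tuple(m) for m in gc) for gc in gold_clusters]
    mention_to_gold = {m: gc for gc in gold_clusters for m in gc}
    # get_predicted_clusters (assumed helper) follows the antecedent links and
    # groups the predicted spans into clusters.
    predicted_clusters, mention_to_predicted = get_predicted_clusters(
        top_span_starts, top_span_ends, predicted_antecedents)
    # Update the running evaluator and hand back the predicted clusters, which
    # is what ends up in coref_predictions[doc_key] above.
    evaluator.update(predicted_clusters, gold_clusters,
                     mention_to_predicted, mention_to_gold)
    return predicted_clusters
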
Example #21
0
  def evaluate(self, session, global_step=None, official_stdout=False,
               keys=None, eval_mode=False, to_npy=None, from_npy=None,
               rsa_model=None):
    assert not (to_npy is not None and from_npy is not None), "cannot set both to_npy and from_npy at the same time!"

    self.load_eval_data()

    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    losses = []
    doc_keys = []
    num_evaluated = 0
    total_time = 0

    # to_npy caches the model outputs to disk; from_npy replays a previous run
    # instead of querying the TensorFlow graph again.
    if to_npy:
      data_dicts = []
    if from_npy:
      with open(from_npy, "rb") as f:
        # The cache is a pickled dict, so allow_pickle is required on NumPy >= 1.16.3.
        from_npy_dict = np.load(f, allow_pickle=True)
        data_dicts = from_npy_dict.item().get("data_dicts")

    for example_num, (tensorized_example, example) in enumerate(self.eval_data):
      _, _, _, _, _, _, gold_starts, gold_ends, _, _ = tensorized_example

      if from_npy is None:
        feed_dict = {i:t for i,t in zip(self.input_tensors, tensorized_example)}
        # if tensorized_example[0].shape[0] <= 9:
        if keys is not None and example['doc_key'] not in keys:
          # print('Skipping...', example['doc_key'], tensorized_example[0].shape)
          continue
        doc_keys.append(example['doc_key'])
        loss, (candidate_starts, candidate_ends, candidate_mention_scores,
               top_span_starts, top_span_ends,
               top_antecedents, top_antecedent_scores) = \
          session.run([self.loss, self.predictions], feed_dict=feed_dict)
      else:
        data_dict = data_dicts[example_num]
        example = data_dict["example"]

        if keys is not None and example['doc_key'] not in keys:
          # print('Skipping...', example['doc_key'], tensorized_example[0].shape)
          continue
        doc_keys.append(example['doc_key'])

        tensorized_example = data_dict["tensorized_example"]
        loss = data_dict["loss"]
        top_span_starts = data_dict["top_span_starts"]
        top_span_ends = data_dict["top_span_ends"]
        top_antecedents = data_dict["top_antecedents"]
        top_antecedent_scores = data_dict["top_antecedent_scores"]

      # losses.append(session.run(self.loss, feed_dict=feed_dict))
      losses.append(loss)

      # Optionally re-score the candidate antecedents with the RSA model (l1 pass).
      if rsa_model is not None:
        print("Running l1 for sentence %d" % example_num)
        start_time = time.time()
        top_antecedent_scores = rsa_model.l1(example, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores)
        duration = time.time() - start_time
        print("Finished sentence %d, took %.2f s" % (example_num, duration))
        total_time += duration
        num_evaluated += 1

      if to_npy:
          data_dict = {
              "example_num": example_num,
              "tensorized_example": tensorized_example,
              "example": example,
              "top_span_starts": top_span_starts,
              "top_span_ends": top_span_ends,
              "top_antecedents": top_antecedents,
              "top_antecedent_scores": top_antecedent_scores,
              "loss": loss,
          }
          data_dicts.append(data_dict)

      predicted_antecedents = self.get_predicted_antecedents(top_antecedents, top_antecedent_scores)
      coref_predictions[example["doc_key"]] = self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, example["clusters"], coref_evaluator)

      if example_num % 10 == 0:
        print("Evaluated {}/{} examples.".format(example_num + 1, len(self.eval_data)))

    summary_dict = {}
    if to_npy:
      dict_to_npy = {"data_dicts": data_dicts}

    if eval_mode:
      conll_results = conll.evaluate_conll(self.config["conll_eval_path"], coref_predictions, self.subtoken_maps, official_stdout)
      average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
      summary_dict["Average F1 (conll)"] = average_f1
      if to_npy:
        dict_to_npy["Average F1 (conll)"] = average_f1
      print("Average F1 (conll): {:.2f}%".format(average_f1))


    p,r,f = coref_evaluator.get_prf()
    summary_dict["Average F1 (py)"] = f
    print("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(doc_keys)))
    summary_dict["Average precision (py)"] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["Average recall (py)"] = r
    print("Average recall (py): {:.2f}%".format(r * 100))

    if to_npy:
      dict_to_npy["Average F1 (py)"] = f
      dict_to_npy["Average precision (py)"] = p
      dict_to_npy["Average recall (py)"] = r
      with open(to_npy, "wb") as f_to_npy:
        np.save(f_to_npy, dict_to_npy)

    if rsa_model:
      print("Ran rsa on %d sentences, avg time per sentence %.2f s" %
            (num_evaluated, total_time / num_evaluated))

    return util.make_summary(summary_dict), f
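
A possible calling pattern for the caching flags in Example #21: run the TensorFlow model once and write its raw outputs with to_npy, then replay cheaper experiments such as RSA re-scoring from that cache with from_npy. The file name and the rsa_model instance below are placeholders, not names from the snippets.

# Hypothetical driver code; "cached_eval.npy" and rsa_model are placeholders.
# First pass: query the TensorFlow graph and cache its raw outputs.
summary, f1 = model.evaluate(session, eval_mode=True, to_npy="cached_eval.npy")

# Later passes: replay the cached outputs (the from_npy branch skips session.run)
# and optionally re-score the antecedents with the RSA model.
summary, f1_rsa = model.evaluate(session, eval_mode=True,
                                 from_npy="cached_eval.npy", rsa_model=rsa_model)
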
Example #22
0
                    mention_end_dict[doc_key] = mention_ends
                    antecedents_dict[doc_key] = antecedents

                all_antecedent_scores[doc_key].append(antecedent_scores)

                if example_num % 10 == 0:
                    print "Computed {}/{} examples.".format(
                        example_num + 1, len(model.eval_data))

        mean_antecedent_scores = {
            doc_key: np.mean(s, 0)
            for doc_key, s in all_antecedent_scores.items()
        }

        merged_predictions = {}
        coref_evaluator = metrics.CorefEvaluator()
        for example_num, (tensorized_example,
                          example) in enumerate(model.eval_data):
            doc_key = example["doc_key"]
            mention_starts = mention_start_dict[doc_key]
            mention_ends = mention_end_dict[doc_key]
            antecedents = antecedents_dict[doc_key]
            antecedent_scores = mean_antecedent_scores[doc_key]
            predicted_antecedents = []
            for i, index in enumerate(
                    np.argmax(antecedent_scores, axis=1) - 1):
                if index < 0:
                    predicted_antecedents.append(-1)
                else:
                    predicted_antecedents.append(antecedents[i, index])
            merged_predictions[doc_key] = model.evaluate_coref(