Example #1
0
def main():
    """Compare systems A and B on salient mention and salient cluster
    metrics, bucketing documents by citation-graph degree and running a
    paired bootstrap significance test per bucket.

    Results per bucket are collected into ``bucketed_eval_comparison`` and
    plotted with error bars at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--gold-file")
    parser.add_argument("--ner-file")
    parser.add_argument("--clusters-file-a",
                        help="Cluster predictions from system A")
    parser.add_argument("--salient-mentions-file-a",
                        help="Salient mentions from system A")
    parser.add_argument("--clusters-file-b",
                        help="Cluster predictions from system B")
    parser.add_argument("--salient-mentions-file-b",
                        help="Salient mentions from system B")
    parser.add_argument("--edge-degree-direction",
                        default="both",
                        choices=["both", "out", "in"],
                        type=str)
    parser.add_argument("--num-buckets", default=6, type=int)
    args = parser.parse_args()

    bucketed_eval_comparison = {}

    gold_data = load_jsonl(args.gold_file)
    for d in gold_data:
        merge_method_subrelations(d)
        d["clusters"] = d["coref"]
    predicted_ner = convert_to_dict(load_jsonl(args.ner_file))
    # Per-document mapping from each predicted span to its matching gold span.
    predicted_span_to_gold_span_map: Dict[str,
                                          Dict[tuple, tuple]] = ner_metrics(
                                              gold_data, predicted_ner)

    doc_ids = [doc["doc_id"] for doc in gold_data]
    # BUG FIX: --edge-degree-direction was accepted but never used; pass it
    # through so bucketing honors the requested edge direction.
    doc_buckets = bucket_documents_by_graph_degree(
        doc_ids,
        num_buckets=args.num_buckets,
        degree_direction=args.edge_degree_direction)

    # Hoisted out of the bucket loop: the prediction files do not change per
    # bucket, so load them once instead of re-reading them every iteration.
    predicted_salient_mentions_a = convert_to_dict(
        load_jsonl(args.salient_mentions_file_a))
    predicted_salient_mentions_b = convert_to_dict(
        load_jsonl(args.salient_mentions_file_b))

    for bucket_name, bucket_docs in doc_buckets:
        gold_data_in_bucket = [
            doc for doc in gold_data if doc["doc_id"] in bucket_docs
        ]
        print("\n")
        print(
            f"bucket: {bucket_name}, contains {len(gold_data_in_bucket)} documents"
        )

        preds_a, labels_a = salent_mentions_metrics(
            gold_data_in_bucket, predicted_salient_mentions_a)
        preds_b, labels_b = salent_mentions_metrics(
            gold_data_in_bucket, predicted_salient_mentions_b)
        # Both systems are scored against the same gold data, so the label
        # sequences must be identical.
        assert labels_a == labels_b
        gold_mentions = labels_a

        print(
            "Paired Bootstrap Comparison of System A and System B on salient mention metric:"
        )
        assert len(preds_a) == len(preds_b)
        assert len(preds_a) == len(gold_mentions)
        sys1_mention = list(preds_a)
        sys2_mention = list(preds_b)
        assert len(sys1_mention) == len(sys2_mention)
        sys1_summary, sys2_summary, p_value_lose, p_value_win = eval_with_paired_bootstrap(
            gold_mentions,
            sys1_mention,
            sys2_mention,
            num_samples=1000,
            sample_ratio=0.5,
            eval_type='f1',
            return_results=True)
        bucketed_eval_comparison[str(bucket_name)] = {
            "base": [list(sys1_summary), p_value_lose],
            "diff": [list(sys2_summary), p_value_win]
        }

        predicted_salient_clusters_a = convert_to_dict(
            load_jsonl(args.clusters_file_a))
        predicted_salient_clusters_b = convert_to_dict(
            load_jsonl(args.clusters_file_b))

        # Annotate gold cluster types (gold compared against itself here).
        get_types_of_clusters(convert_to_dict(gold_data_in_bucket),
                              convert_to_dict(gold_data_in_bucket))

        # BUG FIX: the header previously printed the salient-mention filenames
        # even though these are the cluster-prediction files.
        for filename, predicted_salient_clusters in zip(
            [args.clusters_file_a, args.clusters_file_b],
            [predicted_salient_clusters_a, predicted_salient_clusters_b]):
            print(f"\nMetrics for {filename}")
            for d, doc in predicted_salient_clusters.items():
                if 'clusters' not in doc:
                    merge_method_subrelations(doc)
                    # Keep only non-empty coref clusters.
                    doc['clusters'] = {
                        x: v
                        for x, v in doc['coref'].items() if len(v) > 0
                    }
            get_types_of_clusters(predicted_ner, predicted_salient_clusters)

        _, all_metrics_a = clustering_metrics(gold_data_in_bucket,
                                              predicted_salient_clusters_a,
                                              predicted_span_to_gold_span_map)
        _, all_metrics_b = clustering_metrics(gold_data_in_bucket,
                                              predicted_salient_clusters_b,
                                              predicted_span_to_gold_span_map)

        print(
            "Paired Bootstrap Comparison of System A and System B on salient cluster metric:"
        )
        # The bootstrap script expects a list of gold values, but here the
        # "system" values are already comparisons with gold, so just pass in
        # a list of Nones to satisfy the input.
        sys1_cluster = list(all_metrics_a["f1"])
        sys2_cluster = list(all_metrics_b["f1"])
        assert len(sys1_cluster) == len(sys2_cluster)

        gold = [None for _ in sys1_cluster]
        eval_with_paired_bootstrap(gold,
                                   sys1_cluster,
                                   sys2_cluster,
                                   num_samples=1000,
                                   sample_ratio=0.76,
                                   eval_type='avg')
    print(
        f"Bucket evaluations (diff):\n{json.dumps(bucketed_eval_comparison, indent=2)}"
    )
    draw_box_plot_with_error_bars(
        bucketed_eval_comparison,
        fname=
        f"/tmp/bucketed_salient_mention_eval_comparison_n_{args.num_buckets}.png"
    )
Example #2
0
def main():
    """Compare relation predictions from systems A and B, bucketed by
    document degree in the citation graph, using paired bootstrap tests on
    either the retrieval metric or the classification metric
    (selected via --metric-type).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--gold-file")
    parser.add_argument("--ner-file")
    parser.add_argument("--clusters-file")
    parser.add_argument("--relations-file-a", help="Relation predictions from system A")
    parser.add_argument("--thresh-a", default=None, type=float)
    # BUG FIX: this help text previously said "system A".
    parser.add_argument("--relations-file-b", help="Relation predictions from system B")
    parser.add_argument("--thresh-b", default=None, type=float)
    parser.add_argument("--edge-degree-direction", default="both", choices=["both", "out", "in"], type=str)
    parser.add_argument("--num-buckets", default=6, type=int)
    parser.add_argument("--metric-type", default="retrieval", choices=["retrieval", "classification"], type=str)

    args = parser.parse_args()

    processed_data_a = prepare_data(args.gold_file, args.ner_file, args.clusters_file, args.relations_file_a)
    gold_data_a, predicted_ner_a, predicted_salient_clusters_a, predicted_relations_a, predicted_cluster_to_gold_cluster_map_a = processed_data_a

    processed_data_b = prepare_data(args.gold_file, args.ner_file, args.clusters_file, args.relations_file_b)
    gold_data_b, predicted_ner_b, predicted_salient_clusters_b, predicted_relations_b, predicted_cluster_to_gold_cluster_map_b = processed_data_b

    # Both systems share every input except the relation predictions.
    assert gold_data_a == gold_data_b
    gold_data = gold_data_a
    assert predicted_ner_a == predicted_ner_b
    predicted_ner = predicted_ner_a
    assert predicted_salient_clusters_a == predicted_salient_clusters_b
    predicted_salient_clusters = predicted_salient_clusters_a

    doc_ids = [doc["doc_id"] for doc in gold_data]
    doc_buckets = bucket_documents_by_graph_degree(doc_ids, num_buckets=args.num_buckets, degree_direction=args.edge_degree_direction)

    for n in [4]:
        bucketed_eval_comparison = {}

        print("\n")
        print(f"n: {n}")
        for bucket_name, bucket_docs in doc_buckets:
            gold_data_in_bucket = [doc for doc in gold_data if doc["doc_id"] in bucket_docs]
            print("\n")
            print(f"bucket: {bucket_name}, contains {len(gold_data_in_bucket)} documents")
            retrieval_metrics_df_a, _, y_labels_a, y_preds_a, _, _, _ = compute_relations_metrics(
                                                    gold_data_in_bucket,
                                                    predicted_ner,
                                                    predicted_salient_clusters,
                                                    predicted_relations_a,
                                                    predicted_cluster_to_gold_cluster_map_a,
                                                    n=n,
                                                    thresh=args.thresh_a)
            retrieval_metrics_df_b, _, y_labels_b, y_preds_b, _, _, _ = compute_relations_metrics(
                                                    gold_data_in_bucket,
                                                    predicted_ner,
                                                    predicted_salient_clusters,
                                                    predicted_relations_b,
                                                    predicted_cluster_to_gold_cluster_map_b,
                                                    n=n,
                                                    thresh=args.thresh_b)
            # Both systems are scored against the same gold data, so their
            # label sequences must match.  (Previously this assert used
            # breakpoint() as its message -- a leftover debugging artifact.)
            assert y_labels_a == y_labels_b, "label mismatch between systems A and B"
            y_labels = y_labels_a
            assert len(y_preds_a) == len(y_preds_b)

            print("Paired Bootstrap Comparison of System A and System B on relation classification metric:")
            sys1_summary_class, sys2_summary_class, p_value_lose_class, p_value_win_class = eval_with_paired_bootstrap(y_labels, y_preds_a, y_preds_b,
                                    num_samples=5000, sample_ratio=0.50,
                                    eval_type='macro-f1', return_results=True)

            print("\n")
            print("Paired Bootstrap Comparison of System A and System B on relation retrieval metric:")
            # The bootstrap script expects a list of gold values, but here the "system" values are already
            # comparisons with gold, so just pass in a list of Nones to satisfy the input.
            sys1_retrieval = list(retrieval_metrics_df_a["f1"])
            sys2_retrieval = list(retrieval_metrics_df_b["f1"])
            assert len(sys1_retrieval) == len(sys2_retrieval)

            gold = [None for _ in sys1_retrieval]
            # Each bootstrap sample draws 75% of the per-document F1 scores.
            sys1_summary_ret, sys2_summary_ret, p_value_lose_ret, p_value_win_ret = eval_with_paired_bootstrap(gold, sys1_retrieval, sys2_retrieval,
                                    num_samples=1000, sample_ratio=0.75,
                                    eval_type='avg', return_results=True)
            if args.metric_type == "retrieval":
                bucketed_eval_comparison[str(bucket_name)] = {"base": [list(sys1_summary_ret), p_value_lose_ret], "diff": [list(sys2_summary_ret), p_value_win_ret]}
            else:
                bucketed_eval_comparison[str(bucket_name)] = {"base": [list(sys1_summary_class), p_value_lose_class], "diff": [list(sys2_summary_class), p_value_win_class]}
        print(f"Bucket evaluations (base):\n{json.dumps(bucketed_eval_comparison, indent=2)}")

        # BUG FIX: the x-axis label previously had a stray closing parenthesis.
        draw_box_plot_with_error_bars(bucketed_eval_comparison, 'Degree of documents in citation graph', 'Mean Retrieval F1 score', fname=f"/tmp/bucketed_eval_comparison_bucket_{args.metric_type}_{args.num_buckets}_n_{n}.png")
def _append_outcomes(f1metric, y_preds, y_labels):
    """Convert a SpanBasedF1Measure's TP/FP/FN counts into parallel binary
    prediction/label lists, appending to *y_preds* / *y_labels* in place.

    Order matters for the cross-system label comparison below: true
    positives, then false negatives, then false positives.
    """
    tps = sum(f1metric._true_positives.values())
    fps = sum(f1metric._false_positives.values())
    fns = sum(f1metric._false_negatives.values())

    # True positives: predicted 1, labeled 1.
    y_preds.extend([1] * tps)
    y_labels.extend([1] * tps)
    # False negatives: predicted 0, labeled 1.
    y_preds.extend([0] * fns)
    y_labels.extend([1] * fns)
    # False positives: predicted 1, labeled 0.
    y_preds.extend([1] * fps)
    y_labels.extend([0] * fps)


def main(args):
    """Compare NER predictions from systems A (baseline) and B (new) by
    converting per-document span TP/FP/FN counts into binary outcome lists,
    then running paired bootstrap tests on F1, precision, and recall.
    """
    gold_data = load_jsonl(args.gold_file)
    all_label_ner_values = [x['ner'] for x in gold_data]
    # entity[2] is the entity's label string.
    entity_labels = {
        entity[2] for document in all_label_ner_values for entity in document
    }

    f1metric_a = SpanBasedF1Measure(entity_labels=entity_labels)
    f1metric_b = SpanBasedF1Measure(entity_labels=entity_labels)
    predicted_ner_a = convert_to_dict(load_jsonl(args.ner_file_a))
    predicted_ner_b = convert_to_dict(load_jsonl(args.ner_file_b))

    assert len(gold_data) == len(predicted_ner_a)
    assert len(gold_data) == len(predicted_ner_b)

    y_preds_a = []
    y_labels_a = []
    y_preds_b = []
    y_labels_b = []
    for labeled_doc in gold_data:
        labeled_ner_values = labeled_doc['ner']
        doc_id = labeled_doc['doc_id']

        # Score each system on this document alone, then fold the counts
        # into the global outcome lists.
        f1metric_a.reset()
        f1metric_a(predicted_ner_a[doc_id]['ner'], labeled_ner_values)
        _append_outcomes(f1metric_a, y_preds_a, y_labels_a)

        f1metric_b.reset()
        f1metric_b(predicted_ner_b[doc_id]['ner'], labeled_ner_values)
        _append_outcomes(f1metric_b, y_preds_b, y_labels_b)

        # Add TNs to round things out: pad the shorter side with (0, 0)
        # entries so both systems' lists stay the same length per document.
        if len(y_labels_b) > len(y_labels_a):
            diff = len(y_labels_b) - len(y_labels_a)
            y_labels_a.extend([0] * diff)
            y_preds_a.extend([0] * diff)
        elif len(y_labels_a) > len(y_labels_b):
            diff = len(y_labels_a) - len(y_labels_b)
            y_labels_b.extend([0] * diff)
            y_preds_b.extend([0] * diff)

    # After padding, both systems must have identical label sequences.
    # (Previously this assert used breakpoint() as its message -- a leftover
    # debugging artifact.)
    assert y_labels_a == y_labels_b, "label mismatch between systems A and B"

    print("\nBaseline:")
    print(f"Overall F1: {f1_score(y_labels_a, y_preds_a)}")
    print(f"Overall Precision: {precision_score(y_labels_a, y_preds_a)}")
    print(f"Overall Recall: {recall_score(y_labels_a, y_preds_a)}")
    print("\nNew System:")
    print(f"Overall F1: {f1_score(y_labels_b, y_preds_b)}")
    print(f"Overall Precision: {precision_score(y_labels_b, y_preds_b)}")
    print(f"Overall Recall: {recall_score(y_labels_b, y_preds_b)}")

    print("Bootstrap (F1)")
    eval_with_paired_bootstrap(y_labels_a,
                               y_preds_a,
                               y_preds_b,
                               num_samples=1000,
                               sample_ratio=0.5,
                               eval_type='f1')

    print("\nBootstrap (Precision)")
    eval_with_paired_bootstrap(y_labels_a,
                               y_preds_a,
                               y_preds_b,
                               num_samples=1000,
                               sample_ratio=0.5,
                               eval_type='precision')

    print("\nBootstrap (Recall)")
    eval_with_paired_bootstrap(y_labels_a,
                               y_preds_a,
                               y_preds_b,
                               num_samples=1000,
                               sample_ratio=0.5,
                               eval_type='recall')
Example #4
0
def main():
    """Compare relation predictions from systems A and B using paired
    bootstrap tests on the relation classification metric (macro-F1) and the
    relation retrieval metric (mean per-document F1), for n in {2, 4}.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--gold-file")
    parser.add_argument("--ner-file")
    parser.add_argument("--clusters-file")
    parser.add_argument("--relations-file-a",
                        help="Relation predictions from system A")
    parser.add_argument("--thresh-a", default=None, type=float)
    # BUG FIX: this help text previously said "system A".
    parser.add_argument("--relations-file-b",
                        help="Relation predictions from system B")
    parser.add_argument("--thresh-b", default=None, type=float)

    args = parser.parse_args()

    processed_data_a = prepare_data(args.gold_file, args.ner_file,
                                    args.clusters_file, args.relations_file_a)
    gold_data_a, predicted_ner_a, predicted_salient_clusters_a, predicted_relations_a, predicted_cluster_to_gold_cluster_map_a = processed_data_a

    processed_data_b = prepare_data(args.gold_file, args.ner_file,
                                    args.clusters_file, args.relations_file_b)
    gold_data_b, predicted_ner_b, predicted_salient_clusters_b, predicted_relations_b, predicted_cluster_to_gold_cluster_map_b = processed_data_b

    # Both systems share every input except the relation predictions.
    assert gold_data_a == gold_data_b
    gold_data = gold_data_a
    assert predicted_ner_a == predicted_ner_b
    predicted_ner = predicted_ner_a
    assert predicted_salient_clusters_a == predicted_salient_clusters_b
    predicted_salient_clusters = predicted_salient_clusters_a

    for n in [2, 4]:
        print("\n")
        print(f"n: {n}")
        retrieval_metrics_df_a, _, y_labels_a, y_preds_a, _, _, _ = compute_relations_metrics(
            gold_data,
            predicted_ner,
            predicted_salient_clusters,
            predicted_relations_a,
            predicted_cluster_to_gold_cluster_map_a,
            n=n,
            thresh=args.thresh_a)
        retrieval_metrics_df_b, _, y_labels_b, y_preds_b, _, _, _ = compute_relations_metrics(
            gold_data,
            predicted_ner,
            predicted_salient_clusters,
            predicted_relations_b,
            predicted_cluster_to_gold_cluster_map_b,
            n=n,
            thresh=args.thresh_b)
        # Both systems are scored against the same gold data, so their label
        # sequences must match.  (Previously this assert used breakpoint()
        # as its message -- a leftover debugging artifact.)
        assert y_labels_a == y_labels_b, "label mismatch between systems A and B"
        y_labels = y_labels_a
        assert len(y_preds_a) == len(y_preds_b)

        print(
            "Paired Bootstrap Comparison of System A and System B on relation classification metric:"
        )
        eval_with_paired_bootstrap(y_labels,
                                   y_preds_a,
                                   y_preds_b,
                                   num_samples=10000,
                                   sample_ratio=0.50,
                                   eval_type='macro-f1')

        print("\n")
        print(
            "Paired Bootstrap Comparison of System A and System B on relation retrieval metric:"
        )
        # The bootstrap script expects a list of gold values, but here the
        # "system" values are already comparisons with gold, so just pass in
        # a list of Nones to satisfy the input.
        sys1_retrieval = list(retrieval_metrics_df_a["f1"])
        sys2_retrieval = list(retrieval_metrics_df_b["f1"])
        assert len(sys1_retrieval) == len(sys2_retrieval)

        gold = [None for _ in sys1_retrieval]
        # Each bootstrap sample draws 76% of the per-document F1 scores.
        eval_with_paired_bootstrap(gold,
                                   sys1_retrieval,
                                   sys2_retrieval,
                                   num_samples=1000,
                                   sample_ratio=0.76,
                                   eval_type='avg')
Example #5
0
def main():
    """Compare systems A and B on the salient mention metric and the salient
    cluster metric using paired bootstrap significance tests."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--gold-file")
    parser.add_argument("--ner-file")
    parser.add_argument("--clusters-file-a", help="Cluster predictions from system A")
    parser.add_argument("--salient-mentions-file-a", help="Salient mentions from system A")
    parser.add_argument("--clusters-file-b", help="Cluster predictions from system B")
    parser.add_argument("--salient-mentions-file-b", help="Salient mentions from system B")
    args = parser.parse_args()

    gold_data = load_jsonl(args.gold_file)
    for d in gold_data:
        merge_method_subrelations(d)
        d["clusters"] = d["coref"]
    predicted_ner = convert_to_dict(load_jsonl(args.ner_file))
    # Per-document mapping from each predicted span to its matching gold span.
    predicted_span_to_gold_span_map: Dict[str, Dict[tuple, tuple]] = ner_metrics(gold_data, predicted_ner)

    predicted_salient_mentions_a = convert_to_dict(load_jsonl(args.salient_mentions_file_a))
    preds_a, labels_a = salent_mentions_metrics(gold_data, predicted_salient_mentions_a)

    predicted_salient_mentions_b = convert_to_dict(load_jsonl(args.salient_mentions_file_b))
    preds_b, labels_b = salent_mentions_metrics(gold_data, predicted_salient_mentions_b)
    # Both systems are scored against the same gold data, so the label
    # sequences must be identical.
    assert labels_a == labels_b
    gold_mentions = labels_a

    print("Paired Bootstrap Comparison of System A and System B on salient mention metric:")
    assert len(preds_a) == len(preds_b)
    assert len(preds_a) == len(gold_mentions)
    sys1_mention = list(preds_a)
    sys2_mention = list(preds_b)
    assert len(sys1_mention) == len(sys2_mention)
    eval_with_paired_bootstrap(gold_mentions, sys1_mention, sys2_mention,
                               num_samples=1000, sample_ratio=0.5,
                               eval_type='f1')

    predicted_salient_clusters_a = convert_to_dict(load_jsonl(args.clusters_file_a))
    predicted_salient_clusters_b = convert_to_dict(load_jsonl(args.clusters_file_b))

    # Annotate gold cluster types (gold compared against itself here).
    get_types_of_clusters(convert_to_dict(gold_data), convert_to_dict(gold_data))

    # BUG FIX: the header previously printed the salient-mention filenames
    # even though these are the cluster-prediction files.
    for filename, predicted_salient_clusters in zip(
            [args.clusters_file_a, args.clusters_file_b],
            [predicted_salient_clusters_a, predicted_salient_clusters_b]):
        print(f"\nMetrics for {filename}")
        for d, doc in predicted_salient_clusters.items():
            if 'clusters' not in doc:
                merge_method_subrelations(doc)
                # Keep only non-empty coref clusters.
                doc['clusters'] = {x: v for x, v in doc['coref'].items() if len(v) > 0}
        get_types_of_clusters(predicted_ner, predicted_salient_clusters)

    _, all_metrics_a = clustering_metrics(
        gold_data, predicted_salient_clusters_a, predicted_span_to_gold_span_map
    )
    _, all_metrics_b = clustering_metrics(
        gold_data, predicted_salient_clusters_b, predicted_span_to_gold_span_map
    )

    print("Paired Bootstrap Comparison of System A and System B on salient cluster metric:")
    # The bootstrap script expects a list of gold values, but here the
    # "system" values are already comparisons with gold, so just pass in a
    # list of Nones to satisfy the input.
    sys1_cluster = list(all_metrics_a["f1"])
    sys2_cluster = list(all_metrics_b["f1"])
    assert len(sys1_cluster) == len(sys2_cluster)

    gold = [None for _ in sys1_cluster]
    eval_with_paired_bootstrap(gold, sys1_cluster, sys2_cluster,
                               num_samples=1000, sample_ratio=0.76,
                               eval_type='avg')