Example #1
def combine_span_and_cluster_file(span_file, cluster_file):
    spans = load_jsonl(span_file)
    clusters = {item['doc_id']: item for item in load_jsonl(cluster_file)}

    for doc in spans:
        if doc['doc_id'] not in clusters:
            continue

        if 'clusters' in clusters[doc['doc_id']]:
            doc['coref'] = clusters[doc['doc_id']]['clusters']
        else:
            merge_method_subrelations(clusters[doc['doc_id']])
            doc['coref'] = {
                x: v
                for x, v in clusters[doc['doc_id']]['coref'].items()
                if len(v) > 0
            }

        if 'n_ary_relations' in doc:
            del doc['n_ary_relations']

        if 'method_subrelations' in doc:
            del doc['method_subrelations']

    annotations_to_jsonl(spans, 'tmp_relation_42424242.jsonl')
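
The I/O helpers load_jsonl and annotations_to_jsonl are imported from elsewhere and not shown in these examples. A minimal sketch of what they are assumed to do, one JSON object per line (the repository's actual helpers may differ):

import json

def load_jsonl(path):
    # Read one JSON object per line into a list of dicts.
    with open(path) as f:
        return [json.loads(line) for line in f if line.strip()]

def annotations_to_jsonl(annotations, path):
    # Write each annotation dict back out as one JSON line.
    with open(path, 'w') as f:
        for ann in annotations:
            f.write(json.dumps(ann) + '\n')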
Example #2
def main(args):
    gold_data = load_jsonl(args.gold_file)
    for d in gold_data:
        merge_method_subrelations(d)
        d["clusters"] = d["coref"]

    predicted_ner = convert_to_dict(load_jsonl(args.ner_file))
    predicted_salient_clusters = convert_to_dict(load_jsonl(args.clusters_file))
    for doc in predicted_salient_clusters.values():
        if 'clusters' not in doc:
            merge_method_subrelations(doc)
            doc['clusters'] = {x: v for x, v in doc['coref'].items() if len(v) > 0}

    predicted_relations = convert_to_dict(load_jsonl(args.relations_file))

    predicted_span_to_gold_span_map: Dict[str, Dict[tuple, tuple]] = ner_metrics(gold_data, predicted_ner)
    get_types_of_clusters(predicted_ner, predicted_salient_clusters)
    get_types_of_clusters(convert_to_dict(gold_data), convert_to_dict(gold_data))
    predicted_cluster_to_gold_cluster_map = clustering_metrics(
        gold_data, predicted_salient_clusters, predicted_span_to_gold_span_map
    )

    for n in [2, 4]:
        all_metrics = []
        for types in combinations(used_entities, n):
            for doc in gold_data:
                predicted_data = predicted_relations[doc["doc_id"]]
                mapping = predicted_cluster_to_gold_cluster_map[doc["doc_id"]]

                relations = list({
                    tuple(mapping.get(v, v) for v in x[0])
                    for x in predicted_data["predicted_relations"]
                    if x[2] == 1
                })

                relations = [dict(zip(used_entities, x)) for x in relations]
                relations = {tuple((t, x[t]) for t in types) for x in relations}

                gold_relations = [tuple((t, x[t]) for t in types) for x in doc['n_ary_relations']]
                gold_relations = {x for x in gold_relations if has_all_mentions(doc, x)}

                matched = relations & gold_relations

                metrics = {
                    "p": len(matched) / (len(relations) + 1e-7),
                    "r": len(matched) / (len(gold_relations) + 1e-7),
                }
                metrics["f1"] = 2 * metrics["p"] * metrics["r"] / (metrics["p"] + metrics["r"] + 1e-7)

                if len(gold_relations) > 0:
                    all_metrics.append(metrics)

        all_metrics = pd.DataFrame(all_metrics)
        print(f"Relation Metrics n={n}")
        rln_metrics = all_metrics.describe().loc['mean'][['p', 'r', 'f1']]
        print(rln_metrics)
        # NOTE: the same path is rewritten for each n, so only the last n's
        # metrics remain in DECODING_METRICS_OUTFP.
        rln_metrics.to_json(os.environ["DECODING_METRICS_OUTFP"])
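
Two helpers recur throughout these examples: convert_to_dict, which indexes a list of documents by doc_id, and has_all_mentions, which keeps only gold relations whose entities actually have mentions in the document. Hedged sketches consistent with how they are called here (the real implementations may differ):

def convert_to_dict(data):
    # Index a list of document dicts by their doc_id.
    return {item['doc_id']: item for item in data}

def has_all_mentions(doc, relation):
    # relation is a tuple of (entity_type, cluster_name) pairs; keep it only
    # if every participating cluster has at least one mention in this doc.
    return all(len(doc['clusters'].get(cluster, [])) > 0 for _, cluster in relation)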
Example #3
def main(args):
    gold_data = load_jsonl(args.gold_file)
    for d in gold_data:
        merge_method_subrelations(d)
        d["clusters"] = d["coref"]

    predicted_salient_mentions = convert_to_dict(load_jsonl(args.salient_mentions_file))
    salent_mentions_metrics(gold_data, predicted_salient_mentions)

    predicted_ner = convert_to_dict(load_jsonl(args.ner_file))
    predicted_salient_clusters = convert_to_dict(load_jsonl(args.clusters_file))
    for doc in predicted_salient_clusters.values():
        if 'clusters' not in doc:
            merge_method_subrelations(doc)
            doc['clusters'] = {x: v for x, v in doc['coref'].items() if len(v) > 0}

    predicted_span_to_gold_span_map: Dict[str, Dict[tuple, tuple]] = ner_metrics(gold_data, predicted_ner)
    get_types_of_clusters(predicted_ner, predicted_salient_clusters)
    get_types_of_clusters(convert_to_dict(gold_data), convert_to_dict(gold_data))
    predicted_cluster_to_gold_cluster_map = clustering_metrics(
        gold_data, predicted_salient_clusters, predicted_span_to_gold_span_map
    )
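
salent_mentions_metrics (the spelling follows the source identifier) returns aligned per-mention prediction and gold-label lists, which the bootstrap comparisons in the later examples rely on. A rough sketch of the assumed contract; the 'saliency' field name and its (start, end, flag) layout are assumptions for illustration, not the repository's exact code:

def salent_mentions_metrics(gold_data, predicted_salient_mentions):
    # Align predicted saliency flags with gold flags, mention by mention.
    preds, labels = [], []
    for doc in gold_data:
        predicted_doc = predicted_salient_mentions[doc['doc_id']]
        for (s, e, gold_flag), (ps, pe, pred_flag) in zip(
                doc['saliency'], predicted_doc['saliency']):
            assert (s, e) == (ps, pe), "mentions assumed to be aligned"
            labels.append(gold_flag)
            preds.append(pred_flag)
    return preds, labels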
Example #4
def prepare_data(gold_file, ner_file, clusters_file, relations_file):
    gold_data = load_jsonl(gold_file)
    for d in gold_data:
        merge_method_subrelations(d)
        d["clusters"] = d["coref"]

    predicted_ner = convert_to_dict(load_jsonl(ner_file))
    predicted_salient_clusters = convert_to_dict(load_jsonl(clusters_file))
    for doc in predicted_salient_clusters.values():
        if 'clusters' not in doc:
            merge_method_subrelations(doc)
            doc['clusters'] = {x: v for x, v in doc['coref'].items() if len(v) > 0}

    predicted_relations = convert_to_dict(load_jsonl(relations_file))

    predicted_span_to_gold_span_map: Dict[str, Dict[tuple, tuple]] = ner_metrics(gold_data, predicted_ner)
    get_types_of_clusters(predicted_ner, predicted_salient_clusters)
    get_types_of_clusters(convert_to_dict(gold_data), convert_to_dict(gold_data))
    predicted_cluster_to_gold_cluster_map = match_predicted_clusters_with_gold(
        gold_data, predicted_salient_clusters, predicted_span_to_gold_span_map
    )
    return (gold_data, predicted_ner, predicted_salient_clusters,
            predicted_relations, predicted_cluster_to_gold_cluster_map)
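
A usage sketch for prepare_data; the file paths below are hypothetical placeholders:

# Hypothetical paths; substitute your own gold and prediction files.
(gold_data, predicted_ner, predicted_salient_clusters,
 predicted_relations, cluster_map) = prepare_data(
    'gold.jsonl', 'ner_predictions.jsonl',
    'cluster_predictions.jsonl', 'relation_predictions.jsonl')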
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--gold-file")
    parser.add_argument("--ner-file")
    parser.add_argument("--clusters-file-a",
                        help="Cluster predictions from system A")
    parser.add_argument("--salient-mentions-file-a",
                        help="Salient mentions from system A")
    parser.add_argument("--clusters-file-b",
                        help="Cluster predictions from system B")
    parser.add_argument("--salient-mentions-file-b",
                        help="Salient mentions from system B")
    parser.add_argument("--edge-degree-direction",
                        default="both",
                        choices=["both", "out", "in"],
                        type=str)
    parser.add_argument("--num-buckets", default=6, type=int)
    args = parser.parse_args()

    bucketed_eval_comparison = {}

    gold_data = load_jsonl(args.gold_file)
    for d in gold_data:
        merge_method_subrelations(d)
        d["clusters"] = d["coref"]
    predicted_ner = convert_to_dict(load_jsonl(args.ner_file))
    predicted_span_to_gold_span_map: Dict[str,
                                          Dict[tuple, tuple]] = ner_metrics(
                                              gold_data, predicted_ner)

    doc_ids = [doc["doc_id"] for doc in gold_data]
    doc_buckets = bucket_documents_by_graph_degree(
        doc_ids, num_buckets=args.num_buckets)

    for bucket_name, bucket_docs in doc_buckets:
        gold_data_in_bucket = [
            doc for doc in gold_data if doc["doc_id"] in bucket_docs
        ]
        print("\n")
        print(
            f"bucket: {bucket_name}, contains {len(gold_data_in_bucket)} documents"
        )

        predicted_salient_mentions_a = convert_to_dict(
            load_jsonl(args.salient_mentions_file_a))
        preds_a, labels_a = salent_mentions_metrics(
            gold_data_in_bucket, predicted_salient_mentions_a)

        predicted_salient_mentions_b = convert_to_dict(
            load_jsonl(args.salient_mentions_file_b))
        preds_b, labels_b = salent_mentions_metrics(
            gold_data_in_bucket, predicted_salient_mentions_b)
        assert labels_a == labels_b
        gold_mentions = labels_a

        print(
            "Paired Bootstrap Comparison of System A and System B on salient mention metric:"
        )
        # Here the per-mention gold saliency labels are passed directly as the
        # gold values for the bootstrap.
        assert len(preds_a) == len(preds_b)
        assert len(preds_a) == len(gold_mentions)
        sys1_mention = list(preds_a)
        sys2_mention = list(preds_b)
        assert len(sys1_mention) == len(sys2_mention)
        sys1_summary, sys2_summary, p_value_lose, p_value_win = eval_with_paired_bootstrap(
            gold_mentions,
            sys1_mention,
            sys2_mention,
            num_samples=1000,
            sample_ratio=0.5,
            eval_type='f1',
            return_results=True)
        bucketed_eval_comparison[str(bucket_name)] = {
            "base": [list(sys1_summary), p_value_lose],
            "diff": [list(sys2_summary), p_value_win]
        }

        predicted_salient_clusters_a = convert_to_dict(
            load_jsonl(args.clusters_file_a))
        predicted_salient_clusters_b = convert_to_dict(
            load_jsonl(args.clusters_file_b))

        get_types_of_clusters(convert_to_dict(gold_data_in_bucket),
                              convert_to_dict(gold_data_in_bucket))

        filenames = [args.clusters_file_a, args.clusters_file_b]
        for filename, predicted_salient_clusters in zip(
                filenames,
                [predicted_salient_clusters_a, predicted_salient_clusters_b]):
            print(f"\nMetrics for {filename}")
            for doc in predicted_salient_clusters.values():
                if 'clusters' not in doc:
                    merge_method_subrelations(doc)
                    doc['clusters'] = {
                        x: v
                        for x, v in doc['coref'].items() if len(v) > 0
                    }
            get_types_of_clusters(predicted_ner, predicted_salient_clusters)

        _, all_metrics_a = clustering_metrics(gold_data_in_bucket,
                                              predicted_salient_clusters_a,
                                              predicted_span_to_gold_span_map)
        _, all_metrics_b = clustering_metrics(gold_data_in_bucket,
                                              predicted_salient_clusters_b,
                                              predicted_span_to_gold_span_map)

        print(
            "Paired Bootstrap Comparison of System A and System B on salient cluster metric:"
        )
        # The bootstrap script expects a list of gold values, but here the "system" values are already
        # comparisons with gold, so just pass in a list of Nones to satisfy the input.
        sys1_cluster = list(all_metrics_a["f1"])
        sys2_cluster = list(all_metrics_b["f1"])
        assert len(sys1_cluster) == len(sys2_cluster)

        gold = [None for _ in sys1_cluster]
        # Each bootstrap sample draws sample_ratio (here 0.76) of the items.
        eval_with_paired_bootstrap(gold,
                                   sys1_cluster,
                                   sys2_cluster,
                                   num_samples=1000,
                                   sample_ratio=0.76,
                                   eval_type='avg')
    print(
        f"Bucket evaluations (diff):\n{json.dumps(bucketed_eval_comparison, indent=2)}"
    )
    draw_box_plot_with_error_bars(
        bucketed_eval_comparison,
        fname=f"/tmp/bucketed_salient_mention_eval_comparison_n_{args.num_buckets}.png"
    )
Example #6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--gold-file")
    parser.add_argument("--ner-file")
    parser.add_argument("--clusters-files-a",
                        help="Cluster predictions from system A",
                        nargs='+',
                        type=str)
    parser.add_argument("--salient-mentions-files-a",
                        help="Salient mentions from system A",
                        nargs='+',
                        type=str)
    parser.add_argument("--clusters-files-b",
                        help="Cluster predictions from system B",
                        nargs='+',
                        type=str)
    parser.add_argument("--salient-mentions-files-b",
                        help="Salient mentions from system B",
                        nargs='+',
                        type=str)
    args = parser.parse_args()

    gold_data = load_jsonl(args.gold_file)
    for d in gold_data:
        merge_method_subrelations(d)
        d["clusters"] = d["coref"]
    predicted_ner = convert_to_dict(load_jsonl(args.ner_file))
    predicted_span_to_gold_span_map: Dict[str,
                                          Dict[tuple, tuple]] = ner_metrics(
                                              gold_data, predicted_ner)

    salient_mention_predictions_a = []
    gold_mentions = None

    salient_mentions_files_a = list(args.salient_mentions_files_a)
    salient_mentions_files_b = list(args.salient_mentions_files_b)

    for salient_file_a in salient_mentions_files_a:
        predicted_salient_mentions_a = convert_to_dict(
            load_jsonl(salient_file_a))
        preds_a, labels_a = salent_mentions_metrics(
            gold_data, predicted_salient_mentions_a)
        if gold_mentions is None:
            gold_mentions = labels_a
        else:
            assert gold_mentions == labels_a
        assert len(gold_mentions) == len(labels_a)
        salient_mention_predictions_a.append(preds_a)

    print("\n")
    salient_mention_predictions_b = []
    for salient_file_b in salient_mentions_files_b:
        predicted_salient_mentions_b = convert_to_dict(
            load_jsonl(salient_file_b))
        preds_b, labels_b = salent_mentions_metrics(
            gold_data, predicted_salient_mentions_b)
        assert gold_mentions == labels_b
        assert len(gold_mentions) == len(preds_b)
        salient_mention_predictions_b.append(preds_b)

    for metric_type in ["f1", "precision", "recall"]:
        print(
            f"Paired Bootstrap Comparison of System A and System B on salient mention metric: {metric_type}"
        )
        sys1_mention_list = list(salient_mention_predictions_a)
        sys2_mention_list = list(salient_mention_predictions_b)
        eval_with_hierarchical_paired_bootstrap(gold_mentions,
                                                sys1_mention_list,
                                                sys2_mention_list,
                                                num_samples=1000,
                                                sample_ratio=0.5,
                                                eval_type=metric_type)

    get_types_of_clusters(convert_to_dict(gold_data),
                          convert_to_dict(gold_data))

    predicted_salient_clusters_a_list = [
        convert_to_dict(load_jsonl(x)) for x in args.clusters_files_a
    ]
    predicted_salient_clusters_b_list = [
        convert_to_dict(load_jsonl(x)) for x in args.clusters_files_b
    ]

    all_clusters = [
        predicted_salient_clusters_a_list, predicted_salient_clusters_b_list
    ]
    for clusters_set in all_clusters:
        for predicted_salient_clusters in clusters_set:
            for doc in predicted_salient_clusters.values():
                if 'clusters' not in doc:
                    merge_method_subrelations(doc)
                    doc['clusters'] = {
                        x: v
                        for x, v in doc['coref'].items() if len(v) > 0
                    }
            get_types_of_clusters(predicted_ner, predicted_salient_clusters)

    all_metrics_a_list = []
    preds_len = None
    for predicted_salient_clusters_a in predicted_salient_clusters_a_list:
        _, all_metrics_a = clustering_metrics(gold_data,
                                              predicted_salient_clusters_a,
                                              predicted_span_to_gold_span_map)
        all_metrics_a_list.append(all_metrics_a)
        if preds_len is None:
            preds_len = len(all_metrics_a)
        else:
            assert preds_len == len(all_metrics_a)

    all_metrics_b_list = []
    for predicted_salient_clusters_b in predicted_salient_clusters_b_list:
        _, all_metrics_b = clustering_metrics(gold_data,
                                              predicted_salient_clusters_b,
                                              predicted_span_to_gold_span_map)
        all_metrics_b_list.append(all_metrics_b)
        assert preds_len == len(all_metrics_b)

    print("\n")
    for metric_type in ["f1", "p", "r"]:
        print(
            f"Paired Bootstrap Comparison of System A and System B on salient cluster metric: {metric_type}"
        )
        # The bootstrap script expects a list of gold values, but here the "system" values are already
        # comparisons with gold, so just pass in a list of Nones to satisfy the input.
        sys1_cluster_list = [
            list(metrics_a[metric_type]) for metrics_a in all_metrics_a_list
        ]
        sys2_cluster_list = [
            list(metrics_b[metric_type]) for metrics_b in all_metrics_b_list
        ]

        gold = [None for _ in sys1_cluster_list[0]]
        # Each bootstrap sample draws sample_ratio (here 0.5) of the items.
        eval_with_hierarchical_paired_bootstrap(gold,
                                                sys1_cluster_list,
                                                sys2_cluster_list,
                                                num_samples=5000,
                                                sample_ratio=0.5,
                                                eval_type='avg')
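
eval_with_paired_bootstrap and its hierarchical variant are imported from elsewhere. The core idea is standard: repeatedly resample the paired per-item scores, score both systems on each resample, and count how often each system wins. A self-contained sketch of the 'avg' case under those assumptions (the real script also supports 'f1' and returns summary statistics):

import random

def paired_bootstrap_avg(sys1_scores, sys2_scores, num_samples=1000, sample_ratio=0.5):
    # sys1_scores and sys2_scores are paired per-item metric values.
    assert len(sys1_scores) == len(sys2_scores)
    n = max(1, int(len(sys1_scores) * sample_ratio))
    wins1 = wins2 = ties = 0
    for _ in range(num_samples):
        # Resample item indices with replacement; both systems see the same sample.
        idx = [random.randrange(len(sys1_scores)) for _ in range(n)]
        m1 = sum(sys1_scores[i] for i in idx) / n
        m2 = sum(sys2_scores[i] for i in idx) / n
        if m1 > m2:
            wins1 += 1
        elif m2 > m1:
            wins2 += 1
        else:
            ties += 1
    # The p-value for "system 2 is better" is the fraction of resamples it wins.
    return wins1 / num_samples, wins2 / num_samples, ties / num_samples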
Example #7
def evaluate(predicted_data, gold_data):
    p, r, f1 = 0, 0, 0
    gold_data = {x['doc_id']: x for x in gold_data}
    all_metrics = []

    for doc in predicted_data:
        predicted_doc = predicted_data[doc]
        gold_doc = gold_data[doc]
        merge_method_subrelations(gold_doc)
        gold_doc["clusters"] = gold_doc["coref"]

        gold_spans = [tuple(x) for x in gold_doc['ner']]
        predicted_spans = [tuple(x) for x in predicted_doc['ner']]

        for t in used_entities:
            typed_gold_spans = {x for x in gold_spans if x[2] == t}
            typed_predicted_spans = {x for x in predicted_spans if x[2] == t}

            matched = len(typed_gold_spans & typed_predicted_spans)
            tp = matched / (len(typed_predicted_spans) + 1e-7)
            tr = matched / (len(typed_gold_spans) + 1e-7)
            tf1 = 2 * tp * tr / (tp + tr + 1e-7)

            p += tp / (len(used_entities) * len(predicted_data))
            r += tr / (len(used_entities) * len(predicted_data))
            f1 += tf1 / (len(used_entities) * len(predicted_data))

        clusters = gold_doc['coref']
        span_to_cluster = {}
        for c, spans in clusters.items():
            for span in spans:
                span_to_cluster[tuple(span)] = c

        predicted_span_to_gold = {}
        for i, (s, e, t) in enumerate(predicted_spans):
            span = (s, e)
            predicted_span_to_gold[span] = (t, span, str(i))
            for sg, eg, tg in gold_spans:
                span_g = (sg, eg)
                if span_match(span, span_g) > 0.5:
                    predicted_span_to_gold[span] = (tg, span_g,
                                                    span_to_cluster.get(
                                                        span_g, str(i)))
                    break

        for types in combinations(used_entities, 2):
            gold_relations = [
                tuple((t, x[t]) for t in types)
                for x in gold_doc['n_ary_relations']
            ]
            gold_relations = {
                x for x in gold_relations if has_all_mentions(gold_doc, x)
            }

            predicted_relations = []
            for s1, s2 in predicted_doc['relations']:
                if s1 in predicted_span_to_gold and s2 in predicted_span_to_gold:
                    t1, span_1, c_1 = predicted_span_to_gold[s1]
                    t2, span_2, c_2 = predicted_span_to_gold[s2]

                    if t1 in types and t2 in types and t1 != t2:
                        rel = {t1: c_1, t2: c_2}
                        predicted_relations.append(
                            tuple([(t, rel[t]) for t in types]))

            predicted_relations = set(predicted_relations)

            matched = predicted_relations & gold_relations
            metrics = {
                "p": len(matched) / (len(predicted_relations) + 1e-7),
                "r": len(matched) / (len(gold_relations) + 1e-7),
            }
            metrics["f1"] = 2 * metrics["p"] * metrics["r"] / (
                metrics["p"] + metrics["r"] + 1e-7)

            if len(gold_relations) > 0:
                all_metrics.append(metrics)

    # p, r, f1 are macro-averaged over entity types and documents.
    print(f"NER metrics: p={p:.3f}, r={r:.3f}, f1={f1:.3f}")

    all_metrics = pd.DataFrame(all_metrics)
    print(f"Relation Metrics n={2}")
    print(all_metrics.describe().loc['mean'][['p', 'r', 'f1']])
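
span_match (used with a 0.5 threshold above) is assumed to measure token-offset overlap between a predicted span and a gold span, e.g. intersection over union; a sketch under that assumption:

def span_match(span_1, span_2):
    # Intersection-over-union of two (start, end) token spans.
    s1, e1 = span_1
    s2, e2 = span_2
    intersection = max(0, min(e1, e2) - max(s1, s2))
    union = max(e1, e2) - min(s1, s2)
    return intersection / (union + 1e-7)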
Example #8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--gold-file")
    parser.add_argument("--ner-file")
    parser.add_argument("--clusters-file-a", help="Cluster predictions from system A")
    parser.add_argument("--salient-mentions-file-a", help="Salient mentions from system A")
    parser.add_argument("--clusters-file-b", help="Cluster predictions from system B")
    parser.add_argument("--salient-mentions-file-b", help="Salient mentions from system B")
    args = parser.parse_args()

    gold_data = load_jsonl(args.gold_file)
    for d in gold_data:
        merge_method_subrelations(d)
        d["clusters"] = d["coref"]
    predicted_ner = convert_to_dict(load_jsonl(args.ner_file))
    predicted_span_to_gold_span_map: Dict[str, Dict[tuple, tuple]] = ner_metrics(gold_data, predicted_ner)

    predicted_salient_mentions_a = convert_to_dict(load_jsonl(args.salient_mentions_file_a))
    preds_a, labels_a = salent_mentions_metrics(gold_data, predicted_salient_mentions_a)

    predicted_salient_mentions_b = convert_to_dict(load_jsonl(args.salient_mentions_file_b))
    preds_b, labels_b = salent_mentions_metrics(gold_data, predicted_salient_mentions_b)
    assert labels_a == labels_b
    gold_mentions = labels_a

    print(f"Paired Bootstrap Comparison of System A and System B on salient mention metric:")
    # The bootstrap script expects a list of gold values, but here the "system" values are already 
    # comparisons with gold, so just pass in a list of Nones to satisfy the input.
    assert len(preds_a) == len(preds_b)
    assert len(preds_a) == len(gold_mentions)
    sys1_mention = list(preds_a)
    sys2_mention = list(preds_b)
    assert len(sys1_mention) == len(sys2_mention)
    eval_with_paired_bootstrap(gold_mentions, sys1_mention, sys2_mention,
                               num_samples=1000, sample_ratio=0.5,
                               eval_type='f1')

    predicted_salient_clusters_a = convert_to_dict(load_jsonl(args.clusters_file_a))
    predicted_salient_clusters_b = convert_to_dict(load_jsonl(args.clusters_file_b))

    get_types_of_clusters(convert_to_dict(gold_data), convert_to_dict(gold_data))

    filenames = [args.clusters_file_a, args.clusters_file_b]
    for filename, predicted_salient_clusters in zip(
            filenames, [predicted_salient_clusters_a, predicted_salient_clusters_b]):
        print(f"\nMetrics for {filename}")
        for doc in predicted_salient_clusters.values():
            if 'clusters' not in doc:
                merge_method_subrelations(doc)
                doc['clusters'] = {x: v for x, v in doc['coref'].items() if len(v) > 0}
        get_types_of_clusters(predicted_ner, predicted_salient_clusters)

    _, all_metrics_a = clustering_metrics(
        gold_data, predicted_salient_clusters_a, predicted_span_to_gold_span_map
    )
    _, all_metrics_b = clustering_metrics(
        gold_data, predicted_salient_clusters_b, predicted_span_to_gold_span_map
    )

    print(f"Paired Bootstrap Comparison of System A and System B on salient cluster metric:")
    # The bootstrap script expects a list of gold values, but here the "system" values are already 
    # comparisons with gold, so just pass in a list of Nones to satisfy the input.
    sys1_cluster = list(all_metrics_a["f1"])
    sys2_cluster = list(all_metrics_b["f1"])
    assert len(sys1_cluster) == len(sys2_cluster)

    gold = [None for _ in sys1_cluster]
    # Each bootstrap sample draws sample_ratio (here 0.76) of the items.
    eval_with_paired_bootstrap(gold, sys1_cluster, sys2_cluster,
                               num_samples=1000, sample_ratio=0.76,
                               eval_type='avg')
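
get_types_of_clusters is called for side effects in every example above; it plausibly attaches a type to each predicted cluster from the NER labels of its spans, e.g. by majority vote. A hedged sketch, not the repository's exact code; the 'ner' triple layout and the 'Empty' fallback are assumptions:

from collections import Counter

def get_types_of_clusters(predicted_ner, predicted_clusters):
    # Attach a 'types' map to each document: cluster name -> majority NER type
    # of the spans in that cluster. 'Empty' marks clusters with no typed span.
    for doc_id, doc in predicted_clusters.items():
        ner = {(s, e): t for s, e, t in predicted_ner[doc_id]['ner']}
        doc['types'] = {}
        for cluster, spans in doc['clusters'].items():
            types = Counter(ner[tuple(span)] for span in spans if tuple(span) in ner)
            doc['types'][cluster] = types.most_common(1)[0][0] if types else 'Empty'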