Code Example #1
def combine_span_and_cluster_file(span_file, cluster_file):
    # Attach each document's coreference clusters to its span annotations,
    # matching on doc_id.
    spans = load_jsonl(span_file)
    clusters = {item['doc_id']: item for item in load_jsonl(cluster_file)}

    for doc in spans:
        if doc['doc_id'] not in clusters:
            continue

        if 'clusters' in clusters[doc['doc_id']]:
            doc['coref'] = clusters[doc['doc_id']]['clusters']
        else:
            # Fall back to the raw coref field, dropping empty clusters.
            merge_method_subrelations(clusters[doc['doc_id']])
            doc['coref'] = {
                x: v
                for x, v in clusters[doc['doc_id']]['coref'].items()
                if len(v) > 0
            }

        if 'n_ary_relations' in doc:
            del doc['n_ary_relations']

        if 'method_subrelations' in doc:
            del doc['method_subrelations']

    annotations_to_jsonl(spans, 'tmp_relation_42424242.jsonl')
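
Every snippet on this page leans on a few I/O helpers from the SciREX codebase (load_jsonl, convert_to_dict, annotations_to_jsonl). Their real implementations live in the project; the stand-ins below are a minimal sketch assuming only the behavior the snippets actually depend on.

import json
from typing import Dict, List

def load_jsonl(path: str) -> List[dict]:
    # One JSON document per line.
    with open(path) as f:
        return [json.loads(line) for line in f if line.strip()]

def annotations_to_jsonl(docs: List[dict], path: str) -> None:
    # Inverse of load_jsonl: serialize one document per line.
    with open(path, "w") as f:
        for doc in docs:
            f.write(json.dumps(doc) + "\n")

def convert_to_dict(docs: List[dict]) -> Dict[str, dict]:
    # Key each document by its doc_id, which is how the evaluation
    # snippets look up predictions for a given gold document.
    return {doc["doc_id"]: doc for doc in docs}
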
Code Example #2
def main(args):
    gold_data = load_jsonl(args.gold_file)
    for d in gold_data:
        merge_method_subrelations(d)
        d["clusters"] = d["coref"]

    predicted_ner = convert_to_dict(load_jsonl(args.ner_file))
    predicted_salient_clusters = convert_to_dict(load_jsonl(args.clusters_file))
    for doc in predicted_salient_clusters.values():
        if 'clusters' not in doc:
            merge_method_subrelations(doc)
            doc['clusters'] = {x: v for x, v in doc['coref'].items() if len(v) > 0}

    predicted_relations = convert_to_dict(load_jsonl(args.relations_file))

    predicted_span_to_gold_span_map: Dict[str, Dict[tuple, tuple]] = ner_metrics(gold_data, predicted_ner)
    # Assign entity types to predicted clusters from predicted NER, and to
    # gold clusters from gold NER (hence gold_data passed as both arguments).
    get_types_of_clusters(predicted_ner, predicted_salient_clusters)
    get_types_of_clusters(convert_to_dict(gold_data), convert_to_dict(gold_data))
    predicted_cluster_to_gold_cluster_map = clustering_metrics(
        gold_data, predicted_salient_clusters, predicted_span_to_gold_span_map
    )

    for n in [2, 4]:
        all_metrics = []
        for types in combinations(used_entities, n):
            for doc in gold_data:
                predicted_data = predicted_relations[doc["doc_id"]]
                mapping = predicted_cluster_to_gold_cluster_map[doc["doc_id"]]

                # Map predicted cluster names onto their matched gold cluster
                # names, keeping only tuples predicted as positive relations.
                relations = list(set([
                    tuple([mapping.get(v, v) for v in x[0]])
                    for x in predicted_data["predicted_relations"]
                    if x[2] == 1
                ]))

                relations = [dict(zip(used_entities, x)) for x in relations]
                # Project each relation onto the n entity types under evaluation.
                relations = set([tuple((t, x[t]) for t in types) for x in relations])

                gold_relations = [tuple((t, x[t]) for t in types) for x in doc['n_ary_relations']]
                gold_relations = set([x for x in gold_relations if has_all_mentions(doc, x)])

                matched = relations & gold_relations

                metrics = {
                    "p": len(matched) / (len(relations) + 1e-7),
                    "r": len(matched) / (len(gold_relations) + 1e-7),
                }
                metrics["f1"] = 2 * metrics["p"] * metrics["r"] / (metrics["p"] + metrics["r"] + 1e-7)

                if len(gold_relations) > 0:
                    all_metrics.append(metrics)

        all_metrics = pd.DataFrame(all_metrics)
        print(f"Relation Metrics n={n}")
        rln_metrics = all_metrics.describe().loc['mean'][['p', 'r', 'f1']]
        print(rln_metrics)
        # Note: this path is rewritten on each pass through the n-loop, so the
        # file finally holds only the metrics for the largest n.
        rln_metrics.to_json(os.environ["DECODING_METRICS_OUTFP"])
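
The 1e-7 terms above are additive smoothing: precision and recall fall back to roughly zero, instead of raising ZeroDivisionError, when a document has no predicted or no gold relations. Pulled out as a standalone helper (a sketch, not a function from the repository):

def smoothed_prf(n_matched: int, n_predicted: int, n_gold: int,
                 eps: float = 1e-7) -> dict:
    # eps keeps empty denominators from blowing up; its effect on
    # non-empty counts is negligible.
    p = n_matched / (n_predicted + eps)
    r = n_matched / (n_gold + eps)
    return {"p": p, "r": r, "f1": 2 * p * r / (p + r + eps)}

# e.g. smoothed_prf(3, 5, 4) -> p = 0.6, r = 0.75, f1 ≈ 0.667
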
Code Example #3
File: ner_evaluate.py, Project: viswavi/SciREX
def main(args):
    gold_data = load_jsonl(args.gold_file)
    all_label_ner_values = [x['ner'] for x in gold_data]
    entity_labels = {
        entity[2] for document in all_label_ner_values for entity in document
    }

    f1metric = SpanBasedF1Measure(entity_labels=entity_labels)
    predicted_ner = convert_to_dict(load_jsonl(args.ner_file))

    assert len(gold_data) == len(predicted_ner)

    for labeled_doc in gold_data:
        labeled_ner_values = labeled_doc['ner']
        doc_id = labeled_doc['doc_id']
        if doc_id not in predicted_ner:
            raise ValueError(f"No predictions found for document {doc_id}")
        predicted_ner_values = predicted_ner[doc_id]['ner']
        f1metric(predicted_ner_values, labeled_ner_values)

    print(json.dumps(f1metric.get_metric(), indent=2))
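
main(args) only needs an object exposing gold_file and ner_file; a minimal driver (assumed here, since the original file's argument parsing is not part of the excerpt) would be:

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--gold-file", required=True)
    parser.add_argument("--ner-file", required=True)
    main(parser.parse_args())
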
Code Example #4
def main(args):
    gold_data = load_jsonl(args.gold_file)
    for d in gold_data:
        merge_method_subrelations(d)
        d["clusters"] = d["coref"]

    predicted_salient_mentions = convert_to_dict(load_jsonl(args.salient_mentions_file))
    salent_mentions_metrics(gold_data, predicted_salient_mentions)

    predicted_ner = convert_to_dict(load_jsonl(args.ner_file))
    predicted_salient_clusters = convert_to_dict(load_jsonl(args.clusters_file))
    for doc in predicted_salient_clusters.values():
        if 'clusters' not in doc:
            merge_method_subrelations(doc)
            doc['clusters'] = {x: v for x, v in doc['coref'].items() if len(v) > 0}

    predicted_span_to_gold_span_map: Dict[str, Dict[tuple, tuple]] = ner_metrics(gold_data, predicted_ner)
    get_types_of_clusters(predicted_ner, predicted_salient_clusters)
    get_types_of_clusters(convert_to_dict(gold_data), convert_to_dict(gold_data))
    predicted_cluster_to_gold_cluster_map = clustering_metrics(
        gold_data, predicted_salient_clusters, predicted_span_to_gold_span_map
    )
Code Example #5
def prepare_data(gold_file, ner_file, clusters_file, relations_file):
    gold_data = load_jsonl(gold_file)
    for d in gold_data:
        merge_method_subrelations(d)
        d["clusters"] = d["coref"]

    predicted_ner = convert_to_dict(load_jsonl(ner_file))
    predicted_salient_clusters = convert_to_dict(load_jsonl(clusters_file))
    for doc in predicted_salient_clusters.values():
        if 'clusters' not in doc:
            merge_method_subrelations(doc)
            doc['clusters'] = {x: v for x, v in doc['coref'].items() if len(v) > 0}

    predicted_relations = convert_to_dict(load_jsonl(relations_file))

    predicted_span_to_gold_span_map: Dict[str, Dict[tuple, tuple]] = ner_metrics(gold_data, predicted_ner)
    get_types_of_clusters(predicted_ner, predicted_salient_clusters)
    get_types_of_clusters(convert_to_dict(gold_data), convert_to_dict(gold_data))
    predicted_cluster_to_gold_cluster_map = match_predicted_clusters_with_gold(
        gold_data, predicted_salient_clusters, predicted_span_to_gold_span_map
    )
    return gold_data, predicted_ner, predicted_salient_clusters, predicted_relations, predicted_cluster_to_gold_cluster_map
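
Since prepare_data bundles everything the relation evaluation needs, a caller can unpack its five return values in one statement; the paths below are placeholders for illustration only:

(gold_data, predicted_ner, predicted_salient_clusters,
 predicted_relations, predicted_cluster_to_gold_cluster_map) = prepare_data(
    "gold.jsonl", "ner.jsonl", "clusters.jsonl", "relations.jsonl")
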
Code Example #6
# (Excerpt: the opening of this function, presumably score_scirex_model, is
#  cut off above; its tail mirrors score_dygie_model below.)
        metrics = compute_metrics(pc, gold_data[p])
        all_metrics.append(metrics)

    all_metrics = pd.DataFrame(all_metrics)
    print(all_metrics.describe())


def score_dygie_model(predictions, gold_data):
    gold_data = {x["doc_id"]: list(x["coref"].values()) for x in gold_data}
    predictions = {
        x["doc_key"]:
        [[(s, e + 1) for s, e in c] for c in x["predicted_clusters"]]
        for x in predictions
    }

    all_metrics = []
    for p, pc in predictions.items():
        metrics = compute_metrics(pc, gold_data[p])
        all_metrics.append(metrics)

    all_metrics = pd.DataFrame(all_metrics)
    print(all_metrics.describe())


if __name__ == "__main__":
    print("DyGIE")
    score_dygie_model(load_jsonl(argv[2]), load_jsonl(argv[1]))

    print("SciREX")
    score_scirex_model(load_jsonl(argv[3]), load_jsonl(argv[1]))
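
The (s, e + 1) rewrite in score_dygie_model is there because DyGIE emits inclusive span end indices while the gold clusters here use exclusive ends (that is the most natural reading of the + 1). A quick check of the conversion with made-up spans:

predicted_cluster = [(3, 5), (10, 10)]        # inclusive ends (DyGIE output)
converted = [(s, e + 1) for s, e in predicted_cluster]
assert converted == [(3, 6), (10, 11)]        # exclusive ends (gold convention)
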
Code Example #7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--gold-file")
    parser.add_argument("--ner-file")
    parser.add_argument("--clusters-file-a",
                        help="Cluster predictions from system A")
    parser.add_argument("--salient-mentions-file-a",
                        help="Salient mentions from system A")
    parser.add_argument("--clusters-file-b",
                        help="Cluster predictions from system B")
    parser.add_argument("--salient-mentions-file-b",
                        help="Salient mentions from system B")
    parser.add_argument("--edge-degree-direction",
                        default="both",
                        choices=["both", "out", "in"],
                        type=str)
    parser.add_argument("--num-buckets", default=6, type=int)
    args = parser.parse_args()

    bucketed_eval_comparison = {}

    gold_data = load_jsonl(args.gold_file)
    for d in gold_data:
        merge_method_subrelations(d)
        d["clusters"] = d["coref"]
    predicted_ner = convert_to_dict(load_jsonl(args.ner_file))
    predicted_span_to_gold_span_map: Dict[str, Dict[tuple, tuple]] = ner_metrics(
        gold_data, predicted_ner)

    doc_ids = [doc["doc_id"] for doc in gold_data]
    doc_buckets = bucket_documents_by_graph_degree(
        doc_ids, num_buckets=args.num_buckets)

    for bucket_name, bucket_docs in doc_buckets:
        gold_data_in_bucket = [
            doc for doc in gold_data if doc["doc_id"] in bucket_docs
        ]
        print("\n")
        print(
            f"bucket: {bucket_name}, contains {len(gold_data_in_bucket)} documents"
        )

        predicted_salient_mentions_a = convert_to_dict(
            load_jsonl(args.salient_mentions_file_a))
        preds_a, labels_a = salent_mentions_metrics(
            gold_data_in_bucket, predicted_salient_mentions_a)

        predicted_salient_mentions_b = convert_to_dict(
            load_jsonl(args.salient_mentions_file_b))
        preds_b, labels_b = salent_mentions_metrics(
            gold_data_in_bucket, predicted_salient_mentions_b)
        assert labels_a == labels_b
        gold_mentions = labels_a

        print(
            "Paired Bootstrap Comparison of System A and System B on salient mention metric:"
        )
        # Unlike the cluster metric below, the mention-level comparison has
        # real gold labels, so gold_mentions is passed to the bootstrap directly.
        assert len(preds_a) == len(preds_b)
        assert len(preds_a) == len(gold_mentions)
        sys1_mention = list(preds_a)
        sys2_mention = list(preds_b)
        assert len(sys1_mention) == len(sys2_mention)
        sys1_summary, sys2_summary, p_value_lose, p_value_win = eval_with_paired_bootstrap(
            gold_mentions,
            sys1_mention,
            sys2_mention,
            num_samples=1000,
            sample_ratio=0.5,
            eval_type='f1',
            return_results=True)
        bucketed_eval_comparison[str(bucket_name)] = {
            "base": [list(sys1_summary), p_value_lose],
            "diff": [list(sys2_summary), p_value_win]
        }

        predicted_salient_clusters_a = convert_to_dict(
            load_jsonl(args.clusters_file_a))
        predicted_salient_clusters_b = convert_to_dict(
            load_jsonl(args.clusters_file_b))

        get_types_of_clusters(convert_to_dict(gold_data_in_bucket),
                              convert_to_dict(gold_data_in_bucket))

        filenames = [
            args.salient_mentions_file_a, args.salient_mentions_file_b
        ]
        cluster_predictions = [
            predicted_salient_clusters_a, predicted_salient_clusters_b
        ]
        for filename, predicted_salient_clusters in zip(filenames,
                                                        cluster_predictions):
            print(f"\nMetrics for {filename}")
            for doc in predicted_salient_clusters.values():
                if 'clusters' not in doc:
                    merge_method_subrelations(doc)
                    doc['clusters'] = {
                        x: v
                        for x, v in doc['coref'].items() if len(v) > 0
                    }
            get_types_of_clusters(predicted_ner, predicted_salient_clusters)

        _, all_metrics_a = clustering_metrics(gold_data_in_bucket,
                                              predicted_salient_clusters_a,
                                              predicted_span_to_gold_span_map)
        _, all_metrics_b = clustering_metrics(gold_data_in_bucket,
                                              predicted_salient_clusters_b,
                                              predicted_span_to_gold_span_map)

        print(
            "Paired Bootstrap Comparison of System A and System B on salient cluster metric:"
        )
        # The bootstrap script expects a list of gold values, but here the "system" values are already
        # comparisons with gold, so just pass in a list of Nones to satisfy the input.
        sys1_cluster = list(all_metrics_a["f1"])
        sys2_cluster = list(all_metrics_b["f1"])
        assert len(sys1_cluster) == len(sys2_cluster)

        gold = [None for _ in sys1_cluster]
        # Each bootstrap sample draws 50 items.
        eval_with_paired_bootstrap(gold,
                                   sys1_cluster,
                                   sys2_cluster,
                                   num_samples=1000,
                                   sample_ratio=0.76,
                                   eval_type='avg')
    print(
        f"Bucket evaluations (diff):\n{json.dumps(bucketed_eval_comparison, indent=2)}"
    )
    draw_box_plot_with_error_bars(
        bucketed_eval_comparison,
        fname=f"/tmp/bucketed_salient_mention_eval_comparison_n_{args.num_buckets}.png"
    )
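
eval_with_paired_bootstrap comes from the project's bootstrap module and is not shown on this page. As a mental model only (a sketch under assumptions, not the project's implementation), a paired bootstrap for an averaged metric resamples the same indices for both systems and counts how often each one wins:

import random

def paired_bootstrap_avg(sys1, sys2, num_samples=1000, sample_ratio=0.5):
    # Resampling the *same* indices for both systems is what makes the
    # comparison paired: per-document difficulty cancels out.
    n = len(sys1)
    k = max(1, int(n * sample_ratio))
    wins1 = wins2 = 0
    for _ in range(num_samples):
        idx = [random.randrange(n) for _ in range(k)]
        mean1 = sum(sys1[i] for i in idx) / k
        mean2 = sum(sys2[i] for i in idx) / k
        wins1 += mean1 > mean2
        wins2 += mean2 > mean1
    # The fraction of samples in which a system loses serves as an
    # approximate p-value for the claim that it is the better system.
    return wins1 / num_samples, wins2 / num_samples
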
Code Example #8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--gold-file")
    parser.add_argument("--ner-file")
    parser.add_argument("--clusters-files-a",
                        help="Cluster predictions from system A",
                        nargs='+',
                        type=str)
    parser.add_argument("--salient-mentions-files-a",
                        help="Salient mentions from system A",
                        nargs='+',
                        type=str)
    parser.add_argument("--clusters-files-b",
                        help="Cluster predictions from system B",
                        nargs='+',
                        type=str)
    parser.add_argument("--salient-mentions-files-b",
                        help="Salient mentions from system B",
                        nargs='+',
                        type=str)
    args = parser.parse_args()

    gold_data = load_jsonl(args.gold_file)
    for d in gold_data:
        merge_method_subrelations(d)
        d["clusters"] = d["coref"]
    predicted_ner = convert_to_dict(load_jsonl(args.ner_file))
    predicted_span_to_gold_span_map: Dict[str, Dict[tuple, tuple]] = ner_metrics(
        gold_data, predicted_ner)

    salient_mention_predictions_a = []
    gold_mentions = None

    salient_mentions_files_a = list(args.salient_mentions_files_a)
    salient_mentions_files_b = list(args.salient_mentions_files_b)

    for salient_file_a in salient_mentions_files_a:
        predicted_salient_mentions_a = convert_to_dict(
            load_jsonl(salient_file_a))
        preds_a, labels_a = salent_mentions_metrics(
            gold_data, predicted_salient_mentions_a)
        if gold_mentions is None:
            gold_mentions = labels_a
        else:
            assert gold_mentions == labels_a
        assert len(gold_mentions) == len(labels_a)
        salient_mention_predictions_a.append(preds_a)

    print("\n")
    salient_mention_predictions_b = []
    for salient_file_b in salient_mentions_files_b:
        predicted_salient_mentions_b = convert_to_dict(
            load_jsonl(salient_file_b))
        preds_b, labels_b = salent_mentions_metrics(
            gold_data, predicted_salient_mentions_b)
        assert gold_mentions == labels_b
        assert len(gold_mentions) == len(preds_b)
        salient_mention_predictions_b.append(preds_b)

    for metric_type in ["f1", "precision", "recall"]:
        print(
            f"Paired Bootstrap Comparison of System A and System B on salient mention metric: {metric_type}"
        )
        sys1_mention_list = list(salient_mention_predictions_a)
        sys2_mention_list = list(salient_mention_predictions_b)
        eval_with_hierarchical_paired_bootstrap(gold_mentions,
                                                sys1_mention_list,
                                                sys2_mention_list,
                                                num_samples=1000,
                                                sample_ratio=0.5,
                                                eval_type=metric_type)

    get_types_of_clusters(convert_to_dict(gold_data),
                          convert_to_dict(gold_data))

    predicted_salient_clusters_a_list = [
        convert_to_dict(load_jsonl(x)) for x in args.clusters_files_a
    ]
    predicted_salient_clusters_b_list = [
        convert_to_dict(load_jsonl(x)) for x in args.clusters_files_b
    ]

    all_clusters = [
        predicted_salient_clusters_a_list, predicted_salient_clusters_b_list
    ]
    for clusters_set in all_clusters:
        for predicted_salient_clusters in clusters_set:
            for doc in predicted_salient_clusters.values():
                if 'clusters' not in doc:
                    merge_method_subrelations(doc)
                    doc['clusters'] = {
                        x: v
                        for x, v in doc['coref'].items() if len(v) > 0
                    }
            get_types_of_clusters(predicted_ner, predicted_salient_clusters)

    all_metrics_a_list = []
    preds_len = None
    for predicted_salient_clusters_a in predicted_salient_clusters_a_list:
        _, all_metrics_a = clustering_metrics(gold_data,
                                              predicted_salient_clusters_a,
                                              predicted_span_to_gold_span_map)
        all_metrics_a_list.append(all_metrics_a)
        if preds_len is None:
            preds_len = len(all_metrics_a)
        else:
            assert preds_len == len(all_metrics_a)

    all_metrics_b_list = []
    for predicted_salient_clusters_b in predicted_salient_clusters_b_list:
        _, all_metrics_b = clustering_metrics(gold_data,
                                              predicted_salient_clusters_b,
                                              predicted_span_to_gold_span_map)
        all_metrics_b_list.append(all_metrics_b)
        assert preds_len == len(all_metrics_b)

    print("\n")
    for metric_type in ["f1", "p", "r"]:
        print(
            f"Paired Bootstrap Comparison of System A and System B on salient cluster metric: {metric_type}"
        )
        # The bootstrap script expects a list of gold values, but here the "system" values are already
        # comparisons with gold, so just pass in a list of Nones to satisfy the input.
        sys1_cluster_list = [
            list(metrics_a[metric_type]) for metrics_a in all_metrics_a_list
        ]
        sys2_cluster_list = [
            list(metrics_b[metric_type]) for metrics_b in all_metrics_b_list
        ]

        gold = [None for _ in sys1_cluster_list[0]]
        # Each bootstrap sample draws 50 items.
        eval_with_hierarchical_paired_bootstrap(gold,
                                                sys1_cluster_list,
                                                sys2_cluster_list,
                                                num_samples=5000,
                                                sample_ratio=0.5,
                                                eval_type='avg')
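
The hierarchical variant receives several runs per system (one per random seed). Its exact resampling scheme lives in the project's bootstrap script; one plausible reading (an assumption, not the actual code) is that each bootstrap sample first draws a run per system and then resamples documents, so both seed variance and data variance enter the comparison:

import random

def hierarchical_paired_sample(runs_a, runs_b, sample_ratio=0.5):
    # runs_a / runs_b: one list of per-document scores per random seed.
    n = len(runs_a[0])
    k = max(1, int(n * sample_ratio))
    idx = [random.randrange(n) for _ in range(k)]
    run_a, run_b = random.choice(runs_a), random.choice(runs_b)
    # Score both systems on the same resampled documents (paired),
    # but with independently drawn seeds (hierarchical).
    return (sum(run_a[i] for i in idx) / k,
            sum(run_b[i] for i in idx) / k)
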
Code Example #9
# (Excerpt: the enclosing loops and the definitions of predicted_span_to_gold,
#  gold_relations, and all_metrics are cut off above.)
                    t1, span_1, c_1 = predicted_span_to_gold[s1]
                    t2, span_2, c_2 = predicted_span_to_gold[s2]

                    if t1 in types and t2 in types and t1 != t2:
                        rel = {t1: c_1, t2: c_2}
                        predicted_relations.append(
                            tuple([(t, rel[t]) for t in types]))

            predicted_relations = set(predicted_relations)

            matched = predicted_relations & gold_relations
            metrics = {
                "p": len(matched) / (len(predicted_relations) + 1e-7),
                "r": len(matched) / (len(gold_relations) + 1e-7),
            }
            metrics["f1"] = 2 * metrics["p"] * metrics["r"] / (
                metrics["p"] + metrics["r"] + 1e-7)

            if len(gold_relations) > 0:
                all_metrics.append(metrics)

    print(p, r, f1)  # p, r, f1 are defined in the elided portion above

    all_metrics = pd.DataFrame(all_metrics)
    print(f"Relation Metrics n={2}")
    print(all_metrics.describe().loc['mean'][['p', 'r', 'f1']])


if __name__ == '__main__':
    evaluate(convert_all_instances(load_jsonl(argv[1])), load_jsonl(argv[2]))
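
The intersection-based scoring treats every n-ary relation as a tuple of (type, cluster) pairs, so a prediction counts only if all slots match a gold relation exactly. A concrete illustration with invented clusters:

# Invented example: two predicted binary relations, one gold relation.
predicted_relations = {
    (("Method", "BERT"), ("Task", "NER")),
    (("Method", "BERT"), ("Task", "Parsing")),
}
gold_relations = {(("Method", "BERT"), ("Task", "NER"))}

matched = predicted_relations & gold_relations
print(len(matched), len(predicted_relations), len(gold_relations))
# 1 2 1  ->  p = 0.5, r = 1.0, f1 ≈ 0.667
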
Code Example #10
def main(args):
    gold_data = load_jsonl(args.gold_file)
    all_label_ner_values = [x['ner'] for x in gold_data]
    entity_labels = {
        entity[2] for document in all_label_ner_values for entity in document
    }

    f1metric_a = SpanBasedF1Measure(entity_labels=entity_labels)
    f1metric_b = SpanBasedF1Measure(entity_labels=entity_labels)
    predicted_ner_a = convert_to_dict(load_jsonl(args.ner_file_a))
    predicted_ner_b = convert_to_dict(load_jsonl(args.ner_file_b))

    assert len(gold_data) == len(predicted_ner_a)
    assert len(gold_data) == len(predicted_ner_b)

    y_preds_a = []
    y_labels_a = []
    y_preds_b = []
    y_labels_b = []
    for labeled_doc in gold_data:
        labeled_ner_values = labeled_doc['ner']
        doc_id = labeled_doc['doc_id']

        f1metric_a.reset()
        f1metric_a(predicted_ner_a[doc_id]['ner'], labeled_ner_values)

        # Read the per-label counters off the metric's internal state and
        # collapse them into document-level TP/FP/FN totals.
        tps = sum(f1metric_a._true_positives.values())
        fps = sum(f1metric_a._false_positives.values())
        fns = sum(f1metric_a._false_negatives.values())

        for _ in range(tps):
            y_preds_a.append(1)
            y_labels_a.append(1)

        for _ in range(fns):
            y_preds_a.append(0)
            y_labels_a.append(1)

        for _ in range(fps):
            y_preds_a.append(1)
            y_labels_a.append(0)

        f1metric_b.reset()
        f1metric_b(predicted_ner_b[doc_id]['ner'], labeled_ner_values)

        tps = sum(f1metric_b._true_positives.values())
        fps = sum(f1metric_b._false_positives.values())
        fns = sum(f1metric_b._false_negatives.values())

        for _ in range(tps):
            y_preds_b.append(1)
            y_labels_b.append(1)

        for _ in range(fns):
            y_preds_b.append(0)
            y_labels_b.append(1)

        for _ in range(fps):
            y_preds_b.append(1)
            y_labels_b.append(0)

        # Pad the shorter side with true negatives so both systems end up with
        # equal-length vectors; TNs leave precision, recall, and F1 unchanged.
        if len(y_labels_b) > len(y_labels_a):
            diff = len(y_labels_b) - len(y_labels_a)
            for _ in range(diff):
                y_labels_a.append(0)
                y_preds_a.append(0)
        elif len(y_labels_a) > len(y_labels_b):
            diff = len(y_labels_a) - len(y_labels_b)
            for _ in range(diff):
                y_labels_b.append(0)
                y_preds_b.append(0)

    assert y_labels_a == y_labels_b

    print(f"\nBaseline:")
    print(f"Overall F1: {f1_score(y_labels_a, y_preds_a)}")
    print(f"Overall Precision: {precision_score(y_labels_a, y_preds_a)}")
    print(f"Overall Recall: {recall_score(y_labels_a, y_preds_a)}")
    print(f"\nNew System:")
    print(f"Overall F1: {f1_score(y_labels_b, y_preds_b)}")
    print(f"Overall Precision: {precision_score(y_labels_b, y_preds_b)}")
    print(f"Overall Recall: {recall_score(y_labels_b, y_preds_b)}")

    print(f"Bootstrap (F1)")
    eval_with_paired_bootstrap(y_labels_a,
                               y_preds_a,
                               y_preds_b,
                               num_samples=1000,
                               sample_ratio=0.5,
                               eval_type='f1')

    print(f"\nBootstrap (Precision)")
    eval_with_paired_bootstrap(y_labels_a,
                               y_preds_a,
                               y_preds_b,
                               num_samples=1000,
                               sample_ratio=0.5,
                               eval_type='precision')

    print(f"\nBootstrap (Recall)")
    eval_with_paired_bootstrap(y_labels_a,
                               y_preds_a,
                               y_preds_b,
                               num_samples=1000,
                               sample_ratio=0.5,
                               eval_type='recall')
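
Expanding span-level TP/FP/FN counts into parallel binary label/prediction lists is what lets the script reuse sklearn's metrics and an item-level bootstrap; the true-negative padding merely equalizes vector lengths and cannot change precision, recall, or F1. A small worked case with invented counts:

from sklearn.metrics import f1_score, precision_score, recall_score

def counts_to_binary(tps, fps, fns):
    # One (label, pred) pair per counted span decision.
    labels = [1] * tps + [1] * fns + [0] * fps
    preds = [1] * tps + [0] * fns + [1] * fps
    return labels, preds

labels, preds = counts_to_binary(tps=3, fps=1, fns=2)
print(precision_score(labels, preds))  # 3 / (3 + 1) = 0.75
print(recall_score(labels, preds))     # 3 / (3 + 2) = 0.60
print(f1_score(labels, preds))         # ≈ 0.667
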
Code Example #11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--gold-file")
    parser.add_argument("--ner-file")
    parser.add_argument("--clusters-file-a", help="Cluster predictions from system A")
    parser.add_argument("--salient-mentions-file-a", help="Salient mentions from system A")
    parser.add_argument("--clusters-file-b", help="Cluster predictions from system B")
    parser.add_argument("--salient-mentions-file-b", help="Salient mentions from system B")
    args = parser.parse_args()

    gold_data = load_jsonl(args.gold_file)
    for d in gold_data:
        merge_method_subrelations(d)
        d["clusters"] = d["coref"]
    predicted_ner = convert_to_dict(load_jsonl(args.ner_file))
    predicted_span_to_gold_span_map: Dict[str, Dict[tuple, tuple]] = ner_metrics(gold_data, predicted_ner)

    predicted_salient_mentions_a = convert_to_dict(load_jsonl(args.salient_mentions_file_a))
    preds_a, labels_a = salent_mentions_metrics(gold_data, predicted_salient_mentions_a)

    predicted_salient_mentions_b = convert_to_dict(load_jsonl(args.salient_mentions_file_b))
    preds_b, labels_b = salent_mentions_metrics(gold_data, predicted_salient_mentions_b)
    assert labels_a == labels_b
    gold_mentions = labels_a

    print(f"Paired Bootstrap Comparison of System A and System B on salient mention metric:")
    # The bootstrap script expects a list of gold values, but here the "system" values are already 
    # comparisons with gold, so just pass in a list of Nones to satisfy the input.
    assert len(preds_a) == len(preds_b)
    assert len(preds_a) == len(gold_mentions)
    sys1_mention = list(preds_a)
    sys2_mention = list(preds_b)
    assert len(sys1_mention) == len(sys2_mention)
    eval_with_paired_bootstrap(gold_mentions, sys1_mention, sys2_mention,
                               num_samples=1000, sample_ratio=0.5,
                               eval_type='f1')

    predicted_salient_clusters_a = convert_to_dict(load_jsonl(args.clusters_file_a))
    predicted_salient_clusters_b = convert_to_dict(load_jsonl(args.clusters_file_b))

    get_types_of_clusters(convert_to_dict(gold_data), convert_to_dict(gold_data))

    filenames = [args.salient_mentions_file_a, args.salient_mentions_file_b]
    cluster_predictions = [predicted_salient_clusters_a, predicted_salient_clusters_b]
    for filename, predicted_salient_clusters in zip(filenames, cluster_predictions):
        print(f"\nMetrics for {filename}")
        for doc in predicted_salient_clusters.values():
            if 'clusters' not in doc:
                merge_method_subrelations(doc)
                doc['clusters'] = {x: v for x, v in doc['coref'].items() if len(v) > 0}
        get_types_of_clusters(predicted_ner, predicted_salient_clusters)

    _, all_metrics_a = clustering_metrics(
        gold_data, predicted_salient_clusters_a, predicted_span_to_gold_span_map
    )
    _, all_metrics_b = clustering_metrics(
        gold_data, predicted_salient_clusters_b, predicted_span_to_gold_span_map
    )

    print(f"Paired Bootstrap Comparison of System A and System B on salient cluster metric:")
    # The bootstrap script expects a list of gold values, but here the "system" values are already 
    # comparisons with gold, so just pass in a list of Nones to satisfy the input.
    sys1_cluster = list(all_metrics_a["f1"])
    sys2_cluster = list(all_metrics_b["f1"])
    assert len(sys1_cluster) == len(sys2_cluster)

    gold = [None for _ in sys1_cluster]
    # Each bootstrap sample draws 50 items.
    eval_with_paired_bootstrap(gold, sys1_cluster, sys2_cluster,
                               num_samples=1000, sample_ratio=0.76,
                               eval_type='avg')