コード例 #1
0
ファイル: 2_eval_pred.py プロジェクト: BlueBrain/Search
def _read_prediction_lines(path):
    """Return the predictions file as one list of IOB tags per line."""
    with open(path) as fp:
        return [line.strip().split() for line in fp]


def main():
    """Evaluate PATHWAY NER predictions at token, entity, and seqeval level.

    CLI arguments:
        predictions:  text file with whitespace-separated IOB tags,
                      one sentence per line.
        input_df_pkl: pickled DataFrame whose ``entity_type`` column
                      holds the gold per-sentence tag lists.

    Writes ``pathway_metrics_token.json`` and
    ``pathway_metrics_entity.json``, and prints all metrics to stdout.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("predictions")
    parser.add_argument("input_df_pkl")
    args = parser.parse_args()

    test_df = pd.read_pickle(args.input_df_pkl)

    # Read the predictions file once and reuse it for both the flat
    # (token-level) and per-sentence (seqeval) views; the original
    # re-opened and re-read the file a second time below.
    pred_lines = _read_prediction_lines(args.predictions)

    # Flatten with comprehensions instead of
    # reduce(lambda acc, l: acc + l, ..., []), which repeatedly copies
    # the accumulator and is quadratic in the number of sentences.
    y_true = pd.Series([tag for tags in test_df.entity_type for tag in tags])
    y_pred = pd.Series([tag for tags in pred_lines for tag in tags])

    print(len(y_true))
    print(len(y_pred))

    print("Token level")
    eval_d = dict(
        ner_report(y_true, y_pred, mode="token", return_dict=True)["PATHWAY"])
    with open("pathway_metrics_token.json", "w") as fp:
        json.dump(eval_d, fp)
        fp.write("\n")
    pprint(eval_d)

    print("Entity level")
    # correct_iob repairs invalid IOB transitions before entity matching.
    y_pred_corr = pd.Series(correct_iob(y_pred))
    eval_d = dict(
        ner_report(y_true, y_pred_corr, mode="entity",
                   return_dict=True)["PATHWAY"])
    with open("pathway_metrics_entity.json", "w") as fp:
        json.dump(eval_d, fp)
        fp.write("\n")
    pprint(eval_d)

    print("Seqeval")
    y_true = list(test_df.entity_type)
    y_pred = pred_lines

    from collections import Counter
    c = Counter()
    for tags in y_true:
        c.update(tags)
    print(c)

    # Plain token accuracy over aligned sentence pairs, as a sanity
    # check against seqeval's accuracy_score.
    total = sum(
        t1 == t2
        for s1, s2 in zip(y_true, y_pred)
        for t1, t2 in zip(s1, s2))
    acc = total / sum(len(s) for s in y_true)
    print("acc:", acc)
    print("acc_score:", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred, scheme=IOB2, mode="strict"))
    print(performance_measure(y_true, y_pred))
コード例 #2
0
ファイル: test_eval.py プロジェクト: KabyleAI/Search
def test_ner_report(ner_annotations, dataset, mode, etypes_map, dict_tp_fn_fp):
    """Check ner_report against hand-computed precision/recall/F1.

    ``dict_tp_fn_fp`` maps each entity type to its expected
    (true-positive, false-negative, false-positive) counts; the report's
    metrics are recomputed from those counts and compared.
    """
    report_str = ner_report(
        ner_annotations[dataset]["annotator_1"],
        ner_annotations[dataset]["annotator_2"],
        mode=mode,
        etypes_map=etypes_map,
        return_dict=False,
    )
    report_dict = ner_report(
        ner_annotations[dataset]["annotator_1"],
        ner_annotations[dataset]["annotator_2"],
        mode=mode,
        etypes_map=etypes_map,
        return_dict=True,
    )

    assert isinstance(report_str, str)
    assert isinstance(report_dict, OrderedDict)

    etypes = sorted(dict_tp_fn_fp.keys())
    assert list(report_dict.keys()) == etypes
    for etype in etypes:
        assert set(report_dict[etype].keys()) == {
            "precision",
            "recall",
            "f1-score",
            "support",
        }
        tp, fn, fp = dict_tp_fn_fp[etype]
        tot_true_pos = tp + fn
        tot_pred_pos = tp + fp
        prec_ = (tp / tot_pred_pos) if tot_pred_pos > 0 else 0
        recall_ = tp / tot_true_pos
        # Guard on the actual denominator (prec_ + recall_): the
        # original guarded on tot_pred_pos, which divides by zero when
        # tp == 0 while predictions exist (prec_ == recall_ == 0).
        f1_ = (2 * prec_ * recall_ / (prec_ + recall_)
               if (prec_ + recall_) > 0 else 0)
        np.testing.assert_almost_equal(prec_, report_dict[etype]["precision"])
        np.testing.assert_almost_equal(recall_, report_dict[etype]["recall"])
        np.testing.assert_almost_equal(f1_, report_dict[etype]["f1-score"])
        np.testing.assert_almost_equal(tot_true_pos,
                                       report_dict[etype]["support"])
コード例 #3
0
ファイル: eval.py プロジェクト: KabyleAI/Search
def main():
    """Evaluate a spaCy NER model against annotated sentences.

    Writes a single JSON object combining entity- and token-level
    metrics (keys prefixed ``entity_`` / ``token_``) for the entity
    type selected by ``args.etype``.

    NOTE(review): relies on a module-level ``args`` namespace
    (``etype``, ``annotation_files``, ``model``, ``output_file``) —
    confirm it is populated before main() runs.
    """
    print("Read params.yaml...")
    # Context manager so the config file handle is closed deterministically
    # (the original passed an unclosed open() directly to safe_load).
    with open("params.yaml") as params_file:
        params = yaml.safe_load(params_file)["eval"][args.etype]

    # Load and preprocess the annotations
    df = annotations2df(args.annotation_files.split(","))
    ner_model = spacy.load(args.model)

    # Predict per source, preserving the gold tokenization so that
    # predicted and gold rows align one-to-one.
    df_pred = []
    for source, df_ in df.groupby("source"):
        df_ = df_.sort_values(by="id", ignore_index=True)
        df_sentence = spacy2df(spacy_model=ner_model,
                               ground_truth_tokenization=df_["text"].to_list())
        df_sentence["id"] = df_["id"].values
        df_sentence["source"] = source
        df_pred.append(df_sentence)

    df_pred = pd.concat(
        df_pred, ignore_index=True).rename(columns={"class": "class_pred"})

    # Inner join keeps only tokens present in both gold and prediction.
    df = df.merge(df_pred, on=["source", "id", "text"], how="inner")

    df = remove_punctuation(df)

    iob_true = df["class"]
    iob_pred = df["class_pred"]

    output_file = pathlib.Path(args.output_file)
    with output_file.open("w") as f:
        all_metrics_dict = OrderedDict()
        for mode in ["entity", "token"]:
            metrics_dict = ner_report(
                iob_true,
                iob_pred,
                mode=mode,
                return_dict=True,
                etypes_map={args.etype: params["etype_name"]},
            )[args.etype]
            # Prefix each metric with its mode so both sets coexist in
            # one flat JSON object.
            metrics_dict = OrderedDict([(f"{mode}_{k}", v)
                                        for k, v in metrics_dict.items()])
            all_metrics_dict.update(metrics_dict)
        json.dump(all_metrics_dict, f)
コード例 #4
0
ファイル: eval.py プロジェクト: BlueBrain/Search
def main():
    """Evaluate a spaCy NER model at token level and save predictions.

    Saves the merged gold/predicted DataFrame to ``df_test_pred.pkl``
    and writes token-level metrics for ``args.etype`` as JSON to
    ``args.output_file``.

    NOTE(review): relies on a module-level ``args`` namespace
    (``annotation_files``, ``model``, ``etype``, ``output_file``) —
    confirm it is populated before main() runs.
    """
    # Load and preprocess the annotations
    print("Loading data and model")
    df = annotations2df(args.annotation_files.split(","))
    ner_model = spacy.load(args.model)

    print("Computing predictions")
    df_pred = []
    for source, df_ in df.groupby("source"):
        df_ = df_.sort_values(by="id", ignore_index=True)
        # Reuse the gold tokenization so predicted rows align with gold.
        df_sentence = spacy2df(spacy_model=ner_model,
                               ground_truth_tokenization=df_["text"].to_list())
        df_sentence["id"] = df_["id"].values
        df_sentence["source"] = source
        df_pred.append(df_sentence)

    print("Formatting predictions")
    df_pred = pd.concat(
        df_pred, ignore_index=True).rename(columns={"class": "class_pred"})
    df = df.merge(df_pred, on=["source", "id", "text"], how="inner")
    # NOTE(review): punctuation filtering is deliberately disabled here,
    # unlike the sibling eval script — confirm this is intentional.
    # df = remove_punctuation(df)
    iob_true = df["class"]
    iob_pred = df["class_pred"]

    print("Saving predictions")
    df.to_pickle("df_test_pred.pkl")

    print("Computing and saving metrics")
    output_file = pathlib.Path(args.output_file)
    metrics_dict = ner_report(
        iob_true,
        iob_pred,
        mode="token",
        return_dict=True,
    )
    metrics_dict = dict(metrics_dict[args.etype])
    with output_file.open("w") as f:
        json.dump(metrics_dict, f)
        f.write("\n")
    pprint(metrics_dict)
コード例 #5
0
ファイル: interrater.py プロジェクト: KabyleAI/Search
def main():
    """Compute inter-rater agreement between two annotators.

    Merges the sentences both annotators labeled, runs ner_report in
    entity and token mode with annotator 1 as reference, and writes one
    JSON file per entity type to ``args.output_dir``.
    """
    # Load both annotators' files and keep only the rows they share.
    df_a1 = annotations2df(args.annotations1.split(","))
    df_a2 = annotations2df(args.annotations2.split(","))
    df = df_a2.merge(
        df_a1,
        on=["source", "id", "text", "start_char", "end_char"],
        suffixes=("_annotator_2", "_annotator_1"),
        how="inner",
    )

    df = remove_punctuation(df)

    # Annotator 1 acts as the reference, annotator 2 as the prediction.
    iob_true = df["class_annotator_1"]
    iob_pred = df["class_annotator_2"]

    modes = ("entity", "token")
    reports = {
        mode: ner_report(iob_true, iob_pred, mode=mode, return_dict=True)
        for mode in modes
    }

    # One output file per entity type, combining both modes' metrics
    # under mode-prefixed keys (entity_* first, then token_*).
    for etype in reports["entity"]:
        combined = OrderedDict(
            (f"{mode}_{metric}", value)
            for mode in modes
            for metric, value in reports[mode][etype].items()
        )
        output_file = pathlib.Path(args.output_dir) / (etype.lower() + ".json")
        with output_file.open("w") as f:
            json.dump(combined, f)