Esempio n. 1
0
def conll_eval(system_file, gold_file):
    """Score a system CoNLL-U file against a gold CoNLL-U file.

    Both files are loaded with ``load_conllu_file`` and handed to
    ``evaluate``; the evaluation result is returned unchanged.
    """
    gold_ud = load_conllu_file(gold_file)
    system_ud = load_conllu_file(system_file)
    return evaluate(gold_ud, system_ud, None)
Esempio n. 2
0
def benchmark_model(model_name, test_data_path, ner_test_data):
    """Benchmark a model on UD parsing and on NER scoring.

    Loads the model via its module's ``load()`` entry point, runs it over the
    raw sentence text reconstructed from *test_data_path*, prints a DataFrame
    of CoNLL-17 UD evaluation metrics, then scores NER predictions on
    *ner_test_data* and prints the Scorer results.
    """
    with open(test_data_path) as fh:
        parsed_data = conllu.parse(fh.read())
        text = " ".join(sent.metadata["text"] for sent in parsed_data)

    module = importlib.import_module(model_name)
    nlp = module.load()

    system = conll17_ud_eval.load_conllu(StringIO(format_as_conllu(nlp(text), 1)))
    reference = conll17_ud_eval.load_conllu_file(test_data_path)

    evaluation = conll17_ud_eval.evaluate(reference, system)
    frame = pd.DataFrame({name: score.__dict__ for name, score in evaluation.items()}).T
    print(frame)

    iterator = DataIterator()
    scorer = Scorer()
    for sentence, annotation in list(iterator.tagged_sentences(ner_test_data)):
        gold_parse = GoldParse(nlp.make_doc(sentence), entities=annotation)
        prediction = nlp(sentence)
        scorer.score(prediction, gold_parse)

    print(scorer.scores)
Esempio n. 3
0
def score_las(filename_test, filename_gold):
    """Return the LAS F1 score of *filename_test* against *filename_gold*."""
    with open(filename_test, 'r') as test_fh:
        with open(filename_gold, 'r') as gold_fh:
            system = las.load_conllu(test_fh)
            reference = las.load_conllu(gold_fh)
            score = las.evaluate(reference, system)['LAS'].f1
    return score
Esempio n. 4
0
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
    """Parse raw text with *nlp*, write CoNLL-U to *sys_loc*, score vs gold.

    *limit* is accepted for interface compatibility but is not used.
    Returns the scores mapping from ``conll17_ud_eval.evaluate``.
    """
    with text_loc.open("r", encoding="utf8") as fh:
        parsed_docs = list(nlp.pipe(split_text(fh.read())))
    with sys_loc.open("w", encoding="utf8") as fh:
        write_conllu(parsed_docs, fh)
    with gold_loc.open("r", encoding="utf8") as gold_fh:
        gold_ud = conll17_ud_eval.load_conllu(gold_fh)
        with sys_loc.open("r", encoding="utf8") as sys_fh:
            sys_ud = conll17_ud_eval.load_conllu(sys_fh)
        return conll17_ud_eval.evaluate(gold_ud, sys_ud)
Esempio n. 5
0
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
    """Run *nlp* over the raw text file, serialize to CoNLL-U, and evaluate.

    The system output is written to *sys_loc*, re-read, and compared against
    the gold file. *limit* is unused and kept only for signature parity.
    """
    with text_loc.open("r", encoding="utf8") as text_file:
        raw_chunks = split_text(text_file.read())
        documents = list(nlp.pipe(raw_chunks))

    with sys_loc.open("w", encoding="utf8") as out_file:
        write_conllu(documents, out_file)

    with gold_loc.open("r", encoding="utf8") as gold_file:
        gold_ud = conll17_ud_eval.load_conllu(gold_file)
        with sys_loc.open("r", encoding="utf8") as sys_file:
            sys_ud = conll17_ud_eval.load_conllu(sys_file)
        scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
    return scores
Esempio n. 6
0
def benchmark_model(model_name, test_data_path):
    """Run the CoNLL-17 UD evaluation of *model_name* on a test treebank.

    Loads the model via its module's ``load()`` entry point, parses the raw
    sentence text reconstructed from *test_data_path*, and prints a DataFrame
    of the evaluation metrics (one row per metric).
    """
    with open(test_data_path) as fh:
        sentences = conllu.parse(fh.read())
        text = " ".join(sent.metadata["text"] for sent in sentences)

    module = importlib.import_module(model_name)
    nlp = module.load()

    system = conll17_ud_eval.load_conllu(StringIO(format_as_conllu(nlp(text), 1)))
    reference = conll17_ud_eval.load_conllu_file(test_data_path)

    evaluation = conll17_ud_eval.evaluate(reference, system)
    frame = pd.DataFrame({name: score.__dict__ for name, score in evaluation.items()}).T

    print(frame)
Esempio n. 7
0
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
    """Parse input (raw text or pre-tokenized CoNLL-U), write CoNLL-U, score.

    When *text_loc* is a ``.conllu`` file the Docs are rebuilt from the word
    column and each pipeline component is applied in turn; otherwise the raw
    text is split and parsed with the full pipeline. Returns ``(docs,
    scores)``. *limit* is unused and kept for signature compatibility.
    """
    if text_loc.parts[-1].endswith(".conllu"):
        # Pre-tokenized path: take word forms from column 1 of each sentence.
        docs = []
        with text_loc.open() as fh:
            for document in read_conllu(fh):
                for sentence in document:
                    forms = [row[1] for row in sentence]
                    docs.append(Doc(nlp.vocab, words=forms))
        for _name, component in nlp.pipeline:
            docs = list(component.pipe(docs))
    else:
        # Raw-text path: tokenize and parse with the whole pipeline.
        with text_loc.open("r", encoding="utf8") as fh:
            docs = list(nlp.pipe(split_text(fh.read())))

    with sys_loc.open("w", encoding="utf8") as out_fh:
        write_conllu(docs, out_fh)

    with gold_loc.open("r", encoding="utf8") as gold_fh:
        gold_ud = conll17_ud_eval.load_conllu(gold_fh)
        with sys_loc.open("r", encoding="utf8") as sys_fh:
            sys_ud = conll17_ud_eval.load_conllu(sys_fh)
        scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
    return docs, scores
Esempio n. 8
0
def run_single_eval(nlp, loading_time, print_name, text_path, gold_ud, tmp_output_path, out_file, print_header,
                    check_parse, print_freq_tasks):
    """Run an evaluation of the *nlp* model on a single treebank.

    Tokenizes the raw text at *text_path*, writes the parsed docs as CoNLL-U
    to *tmp_output_path* (deleted afterwards), scores them against *gold_ud*
    with ``conll17_ud_eval``, and appends one semicolon-separated CSV row
    (plus a header row when *print_header* is true) to the open *out_file*.

    *loading_time* and *print_name* are only reported, not used for scoring.
    *check_parse* selects between the EVAL_PARSE and EVAL_NO_PARSE metric
    sets; *print_freq_tasks* lists metric names for which frequent
    under-/over-segmentation examples are appended to the row.
    """
    with text_path.open(mode='r', encoding='utf-8') as f:
        flat_text = f.read()

    # STEP 1: tokenize text
    tokenization_start = time.time()
    texts = split_text(flat_text)
    docs = list(nlp.pipe(texts))
    tokenization_end = time.time()
    tokenization_time = tokenization_end - tokenization_start

    # STEP 2: record stats and timings
    # Throughput measured against the gold token count, not the system's.
    tokens_per_s = int(len(gold_ud.tokens) / tokenization_time)

    print_header_1 = ['date', 'text_path', 'gold_tokens', 'model', 'loading_time', 'tokenization_time', 'tokens_per_s']
    print_string_1 = [str(datetime.date.today()), text_path.name, len(gold_ud.tokens),
                      print_name, "%.2f" % loading_time, "%.2f" % tokenization_time, tokens_per_s]

    # STEP 3: evaluate predicted tokens and features
    # Round-trip through a temp file so load_conllu sees the same format the
    # official evaluator expects.
    with tmp_output_path.open(mode="w", encoding="utf8") as tmp_out_file:
        write_conllu(docs, tmp_out_file)
    with tmp_output_path.open(mode="r", encoding="utf8") as sys_file:
        sys_ud = conll17_ud_eval.load_conllu(sys_file, check_parse=check_parse)
    tmp_output_path.unlink()
    scores = conll17_ud_eval.evaluate(gold_ud, sys_ud, check_parse=check_parse)

    # STEP 4: format the scoring results
    eval_headers = EVAL_PARSE
    if not check_parse:
        eval_headers = EVAL_NO_PARSE

    for score_name in eval_headers:
        score = scores[score_name]
        print_string_1.extend(["%.2f" % score.precision,
                               "%.2f" % score.recall,
                               "%.2f" % score.f1])
        print_string_1.append("-" if score.aligned_accuracy is None else "%.2f" % score.aligned_accuracy)
        # NOTE(review): the None check is on under/oversegmented while the
        # printed value is under_perc/over_perc — presumably the percentages
        # are set iff the lists are; confirm against conll17_ud_eval.
        print_string_1.append("-" if score.undersegmented is None else "%.4f" % score.under_perc)
        print_string_1.append("-" if score.oversegmented is None else "%.4f" % score.over_perc)

        print_header_1.extend([score_name + '_p', score_name + '_r', score_name + '_F', score_name + '_acc',
                               score_name + '_under', score_name + '_over'])

        if score_name in print_freq_tasks:
            print_header_1.extend([score_name + '_word_under_ex', score_name + '_shape_under_ex',
                                   score_name + '_word_over_ex', score_name + '_shape_over_ex'])

            d_under_words = get_freq_tuples(score.undersegmented, PRINT_TOTAL)
            d_under_shapes = get_freq_tuples([word_shape(x) for x in score.undersegmented], PRINT_TOTAL)
            d_over_words = get_freq_tuples(score.oversegmented, PRINT_TOTAL)
            d_over_shapes = get_freq_tuples([word_shape(x) for x in score.oversegmented], PRINT_TOTAL)

            # The CSV uses ';' as its separator, so any ';' occurring inside
            # the example output is escaped to keep the row intact.
            print_string_1.append(
                str({k: v for k, v in d_under_words if v > PRINT_FREQ}).replace(";", "*SEMICOLON*"))
            print_string_1.append(
                str({k: v for k, v in d_under_shapes if v > PRINT_FREQ}).replace(";", "*SEMICOLON*"))
            print_string_1.append(
                str({k: v for k, v in d_over_words if v > PRINT_FREQ}).replace(";", "*SEMICOLON*"))
            print_string_1.append(
                str({k: v for k, v in d_over_shapes if v > PRINT_FREQ}).replace(";", "*SEMICOLON*"))

    # STEP 5: print the formatted results to CSV
    if print_header:
        out_file.write(';'.join(map(str, print_header_1)) + '\n')
    out_file.write(';'.join(map(str, print_string_1)) + '\n')
Esempio n. 9
0
def main():
    """Score system CoNLL-U outputs against gold treebanks and emit results.

    Reads ``metadata.json`` from the truth directory, evaluates each
    treebank's system output against its gold file, writes
    ``evaluation.prototext`` to the output directory, and prints one
    LAS/status line per treebank to stdout (status only to stderr).

    Fixes over the previous version: bare ``except:`` clauses are narrowed to
    ``except Exception`` so KeyboardInterrupt/SystemExit propagate, and the
    average computation no longer divides by zero when metadata is empty.
    """
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("truth",
                        type=str,
                        help="Directory name of the truth dataset.")
    parser.add_argument("system",
                        type=str,
                        help="Directory name of system output.")
    parser.add_argument("output",
                        type=str,
                        help="Directory name of the output directory.")
    args = parser.parse_args()

    # Load input dataset metadata.json
    with open(args.truth + "/metadata.json", "r") as metadata_file:
        metadata = json.load(metadata_file)

    # Evaluate and compute sum of all treebanks
    metrics = [
        "Tokens", "Sentences", "Words", "UPOS", "XPOS", "Feats", "AllTags",
        "Lemmas", "UAS", "LAS", "CLAS"
    ]
    treebanks = 0
    summation = {}
    results = []
    results_las = {}
    for entry in metadata:
        treebanks += 1

        ltcode, goldfile, outfile = entry['ltcode'], entry['goldfile'], entry[
            'outfile']

        # Load gold data; any failure here means the gold file itself is bad.
        try:
            gold = load_conllu_file(args.truth + "/" + goldfile)
        except Exception:
            results.append(
                (ltcode + "-Status", "Error: Cannot load gold file"))
            continue

        # Load system data, mapping known UDError messages to statuses.
        try:
            system = load_conllu_file(args.system + "/" + outfile)
        except UDError as e:
            if e.args[0].startswith("There is a cycle"):
                results.append(
                    (ltcode + "-Status",
                     "Error: There is a cycle in generated CoNLL-U file"))
                continue
            if e.args[0].startswith("There are multiple roots"):
                results.append((
                    ltcode + "-Status",
                    "Error: There are multiple roots in a sentence in generated CoNLL-U file"
                ))
                continue
            results.append((
                ltcode + "-Status",
                "Error: There is a format error (tabs, ID values, etc) in generated CoNLL-U file"
            ))
            continue
        except Exception:
            results.append((ltcode + "-Status",
                            "Error: Cannot open generated CoNLL-U file"))
            continue

        # Check for correctness: system text must match gold text exactly.
        if not system.characters:
            results.append(
                (ltcode + "-Status", "Error: The system file is empty"))
            continue
        if system.characters != gold.characters:
            results.append((
                ltcode + "-Status",
                "Error: The concatenation of tokens in gold file and in system file differ, system file has {} nonspace characters, which is approximately {}% of the gold file"
                .format(
                    len(system.characters),
                    int(100 * len(system.characters) / len(gold.characters)))))
            continue

        # Evaluate
        try:
            evaluation = evaluate(gold, system)
        except Exception:
            # Should not happen
            results.append((
                ltcode + "-Status",
                "Error: Cannot evaluate generated CoNLL-U file, internal error"
            ))
            continue

        # Generate output metrics and compute sum
        results.append(
            (ltcode + "-Status",
             "OK: Evaluated non-zero LAS F1 score" if evaluation["LAS"].f1 > 0
             else "Error: Evaluated zero LAS F1 score"))

        for metric in metrics:
            results.append((ltcode + "-" + metric + "-F1",
                            "{:.2f}".format(100 * evaluation[metric].f1)))
            summation[metric] = summation.get(metric,
                                              0) + evaluation[metric].f1
        results_las[ltcode] = evaluation["LAS"].f1

    # Compute averages; guard against an empty metadata list (treebanks == 0).
    for metric in reversed(metrics):
        results.insert(0, ("total-" + metric + "-F1", "{:.2f}".format(
            100 * summation.get(metric, 0) / max(treebanks, 1))))

    # Generate evaluation.prototext
    with open(args.output + "/evaluation.prototext", "w") as evaluation:
        for key, value in results:
            print('measure{{\n  key: "{}"\n  value: "{}"\n}}'.format(
                key, value),
                  file=evaluation)

    # Generate LAS-F1 + Status on stdout, Status on stderr
    for key, value in results:
        if not key.endswith("-Status"):
            continue

        ltcode = key[:-len("-Status")]
        print("{:13} LAS:{:6.2f} ({})".format(
            ltcode, 100 * results_las.get(ltcode, 0.), value),
              file=sys.stdout)
        print("{:13} {}".format(ltcode, value), file=sys.stderr)
Esempio n. 10
0
def main():
    """Score system CoNLL-U outputs against gold treebanks and emit results.

    Reads ``metadata.json`` from the truth directory, evaluates each
    treebank's system output against its gold file, writes
    ``evaluation.prototext`` to the output directory, and prints one
    LAS/status line per treebank to stdout (status only to stderr).

    Fixes over the previous version: bare ``except:`` clauses are narrowed to
    ``except Exception`` so KeyboardInterrupt/SystemExit propagate, and the
    average computation no longer divides by zero when metadata is empty.
    """
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("truth", type=str, help="Directory name of the truth dataset.")
    parser.add_argument("system", type=str, help="Directory name of system output.")
    parser.add_argument("output", type=str, help="Directory name of the output directory.")
    args = parser.parse_args()

    # Load input dataset metadata.json
    with open(args.truth + "/metadata.json","r") as metadata_file:
        metadata = json.load(metadata_file)

    # Evaluate and compute sum of all treebanks
    metrics = ["Tokens", "Sentences", "Words", "UPOS", "XPOS", "Feats", "AllTags", "Lemmas", "UAS", "LAS", "CLAS"]
    treebanks = 0
    summation = {}
    results = []
    results_las = {}
    for entry in metadata:
        treebanks += 1

        ltcode, goldfile, outfile = entry['ltcode'], entry['goldfile'], entry['outfile']

        # Load gold data; any failure here means the gold file itself is bad.
        try:
            gold = load_conllu_file(args.truth + "/" + goldfile)
        except Exception:
            results.append((ltcode+"-Status", "Error: Cannot load gold file"))
            continue

        # Load system data, mapping known UDError messages to statuses.
        try:
            system = load_conllu_file(args.system + "/" + outfile)
        except UDError as e:
            if e.args[0].startswith("There is a cycle"):
                results.append((ltcode+"-Status", "Error: There is a cycle in generated CoNLL-U file"))
                continue
            if e.args[0].startswith("There are multiple roots"):
                results.append((ltcode+"-Status", "Error: There are multiple roots in a sentence in generated CoNLL-U file"))
                continue
            results.append((ltcode+"-Status", "Error: There is a format error (tabs, ID values, etc) in generated CoNLL-U file"))
            continue
        except Exception:
            results.append((ltcode+"-Status", "Error: Cannot open generated CoNLL-U file"))
            continue

        # Check for correctness: system text must match gold text exactly.
        if not system.characters:
            results.append((ltcode+"-Status", "Error: The system file is empty"))
            continue
        if system.characters != gold.characters:
            results.append((ltcode+"-Status", "Error: The concatenation of tokens in gold file and in system file differ, system file has {} nonspace characters, which is approximately {}% of the gold file".format(len(system.characters), int(100 * len(system.characters) / len(gold.characters)))))
            continue

        # Evaluate
        try:
            evaluation = evaluate(gold, system)
        except Exception:
            # Should not happen
            results.append((ltcode+"-Status", "Error: Cannot evaluate generated CoNLL-U file, internal error"))
            continue

        # Generate output metrics and compute sum
        results.append((ltcode+"-Status", "OK: Evaluated non-zero LAS F1 score" if evaluation["LAS"].f1 > 0 else "Error: Evaluated zero LAS F1 score"))

        for metric in metrics:
            results.append((ltcode+"-"+metric+"-F1", "{:.2f}".format(100 * evaluation[metric].f1)))
            summation[metric] = summation.get(metric, 0) + evaluation[metric].f1
        results_las[ltcode] = evaluation["LAS"].f1

    # Compute averages; guard against an empty metadata list (treebanks == 0).
    for metric in reversed(metrics):
        results.insert(0, ("total-"+metric+"-F1", "{:.2f}".format(100 * summation.get(metric, 0) / max(treebanks, 1))))

    # Generate evaluation.prototext
    with open(args.output + "/evaluation.prototext", "w") as evaluation:
        for key, value in results:
            print('measure{{\n  key: "{}"\n  value: "{}"\n}}'.format(key, value), file=evaluation)

    # Generate LAS-F1 + Status on stdout, Status on stderr
    for key, value in results:
        if not key.endswith("-Status"):
            continue

        ltcode = key[:-len("-Status")]
        print("{:13} LAS:{:6.2f} ({})".format(ltcode, 100 * results_las.get(ltcode, 0.), value), file=sys.stdout)
        print("{:13} {}".format(ltcode, value), file=sys.stderr)