Beispiel #1
0
def test_corpus_macro_avg_f1_token_single_reference():
    """Compare corpus_f1_token with a hand-computed token F1 for a single reference."""
    references = ["A mother are accused of committing one local robbery"]
    hypothesis = "A mother and her teenage son are accused of committing one robbery"

    observed = corpus_f1_token([hypothesis], [references])

    # Tokens this test counts as matches between the hypothesis and the reference.
    matched = ['A', 'mother', 'are', 'accused', 'of', 'committing', 'one', 'robbery']
    n_matched = len(matched)

    precision = n_matched / len(hypothesis.split())
    recall = n_matched / len(references[0].split())
    # The expected value is the harmonic mean scaled by 100, i.e. on a 0-100 scale.
    expected = 100.0 * (2 * precision * recall / (precision + recall))

    assert observed == pytest.approx(expected)
Beispiel #2
0
def test_corpus_macro_avg_f1_token_multiple_references():
    """Compare corpus_f1_token with the highest hand-computed per-reference token F1."""
    references = [
        "A mother are accused of committing one local robbery",
        "A mother accused of committing robbery",
    ]
    hypothesis = "A mother and her teenage son are accused of committing one robbery"

    observed = corpus_f1_token([hypothesis], [references])

    n_hypothesis_tokens = len(hypothesis.split())

    def token_f1(matched, reference):
        # Harmonic mean of precision (vs. hypothesis) and recall (vs. reference).
        precision = len(matched) / n_hypothesis_tokens
        recall = len(matched) / len(reference.split())
        return 2 * precision * recall / (precision + recall)

    # Matching tokens the test assumes for each reference, in reference order.
    matched_per_reference = [
        ['A', 'mother', 'are', 'accused', 'of', 'committing', 'one', 'robbery'],
        ['A', 'mother', 'accused', 'of', 'committing', 'robbery'],
    ]
    per_reference_f1 = [
        token_f1(matched, reference)
        for matched, reference in zip(matched_per_reference, references)
    ]

    # The expected score keeps only the best-scoring reference, on a 0-100 scale.
    expected = 100.0 * np.max(per_reference_f1)

    assert observed == pytest.approx(expected)
Beispiel #3
0
def evaluate_system_output(
    test_set,
    sys_sents_path=None,
    orig_sents_path=None,
    refs_sents_paths=None,
    tokenizer='13a',
    lowercase=True,
    metrics=DEFAULT_METRICS,
    analysis=False,
    quality_estimation=False,
):
    '''
    Score a system output with the requested automatic metrics.

    Args:
        test_set: Forwarded to get_sys_sents and get_orig_and_refs_sents to
            select the evaluation data.
        sys_sents_path: Forwarded to get_sys_sents together with test_set.
        orig_sents_path: Forwarded to get_orig_and_refs_sents.
        refs_sents_paths: Forwarded to get_orig_and_refs_sents.
        tokenizer: Passed as `tokenizer=` to every metric function called here.
        lowercase: Passed as `lowercase=` to every metric function called here
            except corpus_fkgl.
        metrics: Container of metric names. Names handled here: 'bleu',
            'sent_bleu', 'sari', 'sari_legacy', 'samsa', 'fkgl', 'f1_token'.
            Any other name is ignored.
        analysis: When truthy, adds a 'word_level_analysis' entry.
        quality_estimation: When truthy, adds a 'quality_estimation' entry.

    Returns:
        A dict keyed by metric name holding whatever each metric function
        returned.
    '''
    sys_sents = get_sys_sents(test_set, sys_sents_path)
    orig_sents, refs_sents = get_orig_and_refs_sents(
        test_set, orig_sents_path, refs_sents_paths)

    # Preprocessing options shared by almost every metric call below.
    text_opts = {'tokenizer': tokenizer, 'lowercase': lowercase}

    scores = {}

    if 'bleu' in metrics:
        scores['bleu'] = corpus_bleu(sys_sents, refs_sents, force=True, **text_opts)

    if 'sent_bleu' in metrics:
        scores['sent_bleu'] = corpus_averaged_sentence_bleu(sys_sents, refs_sents, **text_opts)

    if 'sari' in metrics:
        scores['sari'] = corpus_sari(orig_sents, sys_sents, refs_sents, **text_opts)

    if 'sari_legacy' in metrics:
        scores['sari_legacy'] = corpus_sari(
            orig_sents, sys_sents, refs_sents, legacy=True, **text_opts)

    if 'samsa' in metrics:
        # Imported only on demand, so the module is not loaded unless SAMSA is requested.
        from easse.samsa import corpus_samsa
        scores['samsa'] = corpus_samsa(orig_sents, sys_sents, verbose=True, **text_opts)

    if 'fkgl' in metrics:
        # corpus_fkgl is only given the tokenizer, not the lowercase flag.
        scores['fkgl'] = corpus_fkgl(sys_sents, tokenizer=tokenizer)

    if 'f1_token' in metrics:
        scores['f1_token'] = corpus_f1_token(sys_sents, refs_sents, **text_opts)

    if analysis:
        scores['word_level_analysis'] = corpus_analyse_operations(
            orig_sents, sys_sents, refs_sents, verbose=False, as_str=True)

    if quality_estimation:
        scores['quality_estimation'] = corpus_quality_estimation(
            orig_sents, sys_sents, **text_opts)

    return scores
Beispiel #4
0
def evaluate_system_output(
    test_set,
    sys_sents_path=None,
    orig_sents_path=None,
    refs_sents_paths=None,
    tokenizer="13a",
    lowercase=True,
    metrics=DEFAULT_METRICS,
    analysis=False,
    quality_estimation=False,
):
    """
    Evaluate a system output with automatic metrics.

    Args:
        test_set: Forwarded to get_sys_sents and get_orig_and_refs_sents to
            select the evaluation data.
        sys_sents_path: Forwarded to get_sys_sents together with test_set.
        orig_sents_path: Forwarded to get_orig_and_refs_sents.
        refs_sents_paths: Forwarded to get_orig_and_refs_sents.
        tokenizer: Passed as `tokenizer=` to every metric function called here.
        lowercase: Passed as `lowercase=` to every metric function called here
            except corpus_fkgl.
        metrics: Iterable of metric names; each one must be in VALID_METRICS.
            Names handled here: "bleu", "sent_bleu", "sari", "sari_legacy",
            "sari_by_operation", "samsa", "fkgl", "f1_token", "bertscore".
        analysis: When truthy, adds a "word_level_analysis" entry.
        quality_estimation: When truthy, adds a "quality_estimation" entry.

    Returns:
        A dict of scores. Most metrics store one entry under their own name;
        "sari_by_operation" stores "sari_add", "sari_keep" and "sari_del";
        "bertscore" stores "bertscore_precision", "bertscore_recall" and
        "bertscore_f1".

    Raises:
        ValueError: If any requested metric is not in VALID_METRICS.
    """
    # Validate with an explicit exception rather than `assert`: assertions are
    # removed when Python runs with -O, which would let invalid metric names
    # through and silently yield an empty or partial result.
    for metric in metrics:
        if metric not in VALID_METRICS:
            raise ValueError(f'"{metric}" not a valid metric. Valid metrics: {VALID_METRICS}')

    sys_sents = get_sys_sents(test_set, sys_sents_path)
    orig_sents, refs_sents = get_orig_and_refs_sents(test_set, orig_sents_path,
                                                     refs_sents_paths)

    # compute each metric
    metrics_scores = {}
    if "bleu" in metrics:
        metrics_scores["bleu"] = corpus_bleu(
            sys_sents,
            refs_sents,
            force=True,
            tokenizer=tokenizer,
            lowercase=lowercase,
        )

    if "sent_bleu" in metrics:
        metrics_scores["sent_bleu"] = corpus_averaged_sentence_bleu(
            sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)

    if "sari" in metrics:
        metrics_scores["sari"] = corpus_sari(
            orig_sents,
            sys_sents,
            refs_sents,
            tokenizer=tokenizer,
            lowercase=lowercase,
        )

    if "sari_legacy" in metrics:
        metrics_scores["sari_legacy"] = corpus_sari(
            orig_sents,
            sys_sents,
            refs_sents,
            tokenizer=tokenizer,
            lowercase=lowercase,
            legacy=True,
        )

    if "sari_by_operation" in metrics:
        # One request expands into three separate entries in the result.
        (
            metrics_scores["sari_add"],
            metrics_scores["sari_keep"],
            metrics_scores["sari_del"],
        ) = get_corpus_sari_operation_scores(
            orig_sents,
            sys_sents,
            refs_sents,
            tokenizer=tokenizer,
            lowercase=lowercase,
        )

    if "samsa" in metrics:
        # Imported only on demand, so the module is not loaded unless SAMSA is requested.
        from easse.samsa import corpus_samsa

        metrics_scores["samsa"] = corpus_samsa(
            orig_sents,
            sys_sents,
            tokenizer=tokenizer,
            lowercase=lowercase,
            verbose=True,
        )

    if "fkgl" in metrics:
        # corpus_fkgl is only given the tokenizer, not the lowercase flag.
        metrics_scores["fkgl"] = corpus_fkgl(sys_sents, tokenizer=tokenizer)

    if "f1_token" in metrics:
        metrics_scores["f1_token"] = corpus_f1_token(sys_sents,
                                                     refs_sents,
                                                     tokenizer=tokenizer,
                                                     lowercase=lowercase)

    if "bertscore" in metrics:
        from easse.bertscore import corpus_bertscore  # Inline import to use EASSE without installing all dependencies
        (
            metrics_scores["bertscore_precision"],
            metrics_scores["bertscore_recall"],
            metrics_scores["bertscore_f1"],
        ) = corpus_bertscore(sys_sents,
                             refs_sents,
                             tokenizer=tokenizer,
                             lowercase=lowercase)

    if analysis:
        from easse.annotation.word_level import WordOperationAnnotator  # Inline import to use EASSE without installing all dependencies
        word_operation_annotator = WordOperationAnnotator(tokenizer=tokenizer,
                                                          lowercase=lowercase,
                                                          verbose=True)
        metrics_scores["word_level_analysis"] = word_operation_annotator.analyse_operations(
            orig_sents, sys_sents, refs_sents, as_str=True)

    if quality_estimation:
        metrics_scores["quality_estimation"] = corpus_quality_estimation(
            orig_sents, sys_sents, tokenizer=tokenizer, lowercase=lowercase)

    return metrics_scores