import numpy as np
import pytest

# corpus_f1_token is assumed to be importable from the EASSE package; the exact
# module path is not shown in this excerpt.


def test_corpus_macro_avg_f1_token_single_reference():
    ref_sents = ["A mother are accused of committing one local robbery"]
    sys_sent = "A mother and her teenage son are accused of committing one robbery"
    f1_token = corpus_f1_token([sys_sent], [ref_sents])
    # Tokens shared between the system output and the reference
    correct = ['A', 'mother', 'are', 'accused', 'of', 'committing', 'one', 'robbery']
    precision = len(correct) / len(sys_sent.split())
    recall = len(correct) / len(ref_sents[0].split())
    f1 = 2 * precision * recall / (precision + recall)
    f1 = 100.0 * f1
    assert f1_token == pytest.approx(f1)
def test_corpus_macro_avg_f1_token_multiple_references():
    ref_sents = [
        "A mother are accused of committing one local robbery",
        "A mother accused of committing robbery",
    ]
    sys_sent = "A mother and her teenage son are accused of committing one robbery"
    f1_token = corpus_f1_token([sys_sent], [ref_sents])
    # F1 against the first reference
    correct_ref1 = ['A', 'mother', 'are', 'accused', 'of', 'committing', 'one', 'robbery']
    precision_ref1 = len(correct_ref1) / len(sys_sent.split())
    recall_ref1 = len(correct_ref1) / len(ref_sents[0].split())
    f1_ref1 = 2 * precision_ref1 * recall_ref1 / (precision_ref1 + recall_ref1)
    # F1 against the second reference
    correct_ref2 = ['A', 'mother', 'accused', 'of', 'committing', 'robbery']
    precision_ref2 = len(correct_ref2) / len(sys_sent.split())
    recall_ref2 = len(correct_ref2) / len(ref_sents[1].split())
    f1_ref2 = 2 * precision_ref2 * recall_ref2 / (precision_ref2 + recall_ref2)
    # The metric keeps the best-matching reference for each system sentence
    f1 = 100.0 * np.max([f1_ref1, f1_ref2])
    assert f1_token == pytest.approx(f1)
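# The tests above assume the following behaviour for corpus_f1_token: per system
# sentence, token overlap with each reference yields a precision/recall/F1 triple,
# the best-scoring reference is kept, and the corpus score is the macro average
# scaled to 0-100. The sketch below illustrates that behaviour only; it is not the
# EASSE implementation (it ignores the tokenizer/lowercase options and counts
# overlap as a whitespace-token multiset intersection), and its names are made up.
from collections import Counter


def _sketch_f1_token(sys_sents, refs_sents_per_sys):
    scores = []
    for sys_sent, ref_sents in zip(sys_sents, refs_sents_per_sys):
        sys_tokens = sys_sent.split()
        best_f1 = 0.0
        for ref_sent in ref_sents:
            ref_tokens = ref_sent.split()
            # Number of tokens shared between system output and this reference
            overlap = sum((Counter(sys_tokens) & Counter(ref_tokens)).values())
            if overlap == 0:
                continue
            precision = overlap / len(sys_tokens)
            recall = overlap / len(ref_tokens)
            best_f1 = max(best_f1, 2 * precision * recall / (precision + recall))
        scores.append(100.0 * best_f1)
    # Macro average over all system sentences
    return sum(scores) / len(scores)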
def evaluate_system_output(
    test_set,
    sys_sents_path=None,
    orig_sents_path=None,
    refs_sents_paths=None,
    tokenizer='13a',
    lowercase=True,
    metrics=DEFAULT_METRICS,
    analysis=False,
    quality_estimation=False,
):
    '''
    Evaluate a system output with automatic metrics.
    '''
    sys_sents = get_sys_sents(test_set, sys_sents_path)
    orig_sents, refs_sents = get_orig_and_refs_sents(test_set, orig_sents_path, refs_sents_paths)

    # compute each metric
    metrics_scores = {}
    if 'bleu' in metrics:
        metrics_scores['bleu'] = corpus_bleu(sys_sents, refs_sents, force=True, tokenizer=tokenizer, lowercase=lowercase)

    if 'sent_bleu' in metrics:
        metrics_scores['sent_bleu'] = corpus_averaged_sentence_bleu(
            sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)

    if 'sari' in metrics:
        metrics_scores['sari'] = corpus_sari(orig_sents, sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)

    if 'sari_legacy' in metrics:
        metrics_scores['sari_legacy'] = corpus_sari(
            orig_sents, sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase, legacy=True)

    if 'samsa' in metrics:
        from easse.samsa import corpus_samsa
        metrics_scores['samsa'] = corpus_samsa(orig_sents, sys_sents, tokenizer=tokenizer, lowercase=lowercase, verbose=True)

    if 'fkgl' in metrics:
        metrics_scores['fkgl'] = corpus_fkgl(sys_sents, tokenizer=tokenizer)

    if 'f1_token' in metrics:
        metrics_scores['f1_token'] = corpus_f1_token(sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)

    if analysis:
        metrics_scores['word_level_analysis'] = corpus_analyse_operations(
            orig_sents, sys_sents, refs_sents, verbose=False, as_str=True)

    if quality_estimation:
        metrics_scores['quality_estimation'] = corpus_quality_estimation(
            orig_sents, sys_sents, tokenizer=tokenizer, lowercase=lowercase)

    return metrics_scores
def evaluate_system_output(
    test_set,
    sys_sents_path=None,
    orig_sents_path=None,
    refs_sents_paths=None,
    tokenizer="13a",
    lowercase=True,
    metrics=DEFAULT_METRICS,
    analysis=False,
    quality_estimation=False,
):
    """
    Evaluate a system output with automatic metrics.
    """
    for metric in metrics:
        assert metric in VALID_METRICS, f'"{metric}" is not a valid metric. Valid metrics: {VALID_METRICS}'

    sys_sents = get_sys_sents(test_set, sys_sents_path)
    orig_sents, refs_sents = get_orig_and_refs_sents(test_set, orig_sents_path, refs_sents_paths)

    # compute each metric
    metrics_scores = {}
    if "bleu" in metrics:
        metrics_scores["bleu"] = corpus_bleu(
            sys_sents,
            refs_sents,
            force=True,
            tokenizer=tokenizer,
            lowercase=lowercase,
        )

    if "sent_bleu" in metrics:
        metrics_scores["sent_bleu"] = corpus_averaged_sentence_bleu(
            sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)

    if "sari" in metrics:
        metrics_scores["sari"] = corpus_sari(
            orig_sents,
            sys_sents,
            refs_sents,
            tokenizer=tokenizer,
            lowercase=lowercase,
        )

    if "sari_legacy" in metrics:
        metrics_scores["sari_legacy"] = corpus_sari(
            orig_sents,
            sys_sents,
            refs_sents,
            tokenizer=tokenizer,
            lowercase=lowercase,
            legacy=True,
        )

    if "sari_by_operation" in metrics:
        (
            metrics_scores["sari_add"],
            metrics_scores["sari_keep"],
            metrics_scores["sari_del"],
        ) = get_corpus_sari_operation_scores(
            orig_sents,
            sys_sents,
            refs_sents,
            tokenizer=tokenizer,
            lowercase=lowercase,
        )

    if "samsa" in metrics:
        from easse.samsa import corpus_samsa

        metrics_scores["samsa"] = corpus_samsa(
            orig_sents,
            sys_sents,
            tokenizer=tokenizer,
            lowercase=lowercase,
            verbose=True,
        )

    if "fkgl" in metrics:
        metrics_scores["fkgl"] = corpus_fkgl(sys_sents, tokenizer=tokenizer)

    if "f1_token" in metrics:
        metrics_scores["f1_token"] = corpus_f1_token(sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)

    if "bertscore" in metrics:
        from easse.bertscore import corpus_bertscore  # Inline import to use EASSE without installing all dependencies

        (
            metrics_scores["bertscore_precision"],
            metrics_scores["bertscore_recall"],
            metrics_scores["bertscore_f1"],
        ) = corpus_bertscore(sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)

    if analysis:
        from easse.annotation.word_level import WordOperationAnnotator  # Inline import to use EASSE without installing all dependencies

        word_operation_annotator = WordOperationAnnotator(tokenizer=tokenizer, lowercase=lowercase, verbose=True)
        metrics_scores["word_level_analysis"] = word_operation_annotator.analyse_operations(
            orig_sents, sys_sents, refs_sents, as_str=True)

    if quality_estimation:
        metrics_scores["quality_estimation"] = corpus_quality_estimation(
            orig_sents, sys_sents, tokenizer=tokenizer, lowercase=lowercase)

    return metrics_scores
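# Hedged usage sketch for the function above. The test-set identifier and file
# path are illustrative placeholders, not values taken from this excerpt; the
# system output file is assumed to contain one sentence per line.
if __name__ == "__main__":
    scores = evaluate_system_output(
        test_set="turkcorpus_test",          # assumed EASSE test-set name
        sys_sents_path="system_output.txt",  # hypothetical path to system outputs
        metrics=["bleu", "sari", "fkgl"],
    )
    print(scores)  # e.g. {'bleu': ..., 'sari': ..., 'fkgl': ...}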