Example #1
0
def corpus_f1_token(sys_sents: List[str],
                    refs_sents: List[List[str]],
                    lowercase: bool = True,
                    tokenizer: str = '13a'):
    """Corpus-level token F1, scaled to 0-100.

    For every system sentence, compute token F1 against each of its
    references and keep the best one; the corpus score is the mean of
    these per-sentence maxima.
    """
    def overlapping_tokens(hyp_tokens, ref_tokens):
        # Multiset intersection: each token matched min(hyp count, ref count) times.
        return list((Counter(hyp_tokens) & Counter(ref_tokens)).elements())

    normalized_sys = [
        utils_prep.normalize(sent, lowercase, tokenizer) for sent in sys_sents
    ]
    normalized_refs = [
        [utils_prep.normalize(sent, lowercase, tokenizer) for sent in one_refs]
        for one_refs in refs_sents
    ]

    best_f1_per_sentence = []
    for hyp, *refs_for_sent in zip(normalized_sys, *normalized_refs):
        hyp_tokens = hyp.split()
        hyp_total = len(hyp_tokens)

        f1_candidates = []
        for ref in refs_for_sent:
            ref_tokens = ref.split()
            matched = len(overlapping_tokens(hyp_tokens, ref_tokens))
            _, _, f1 = compute_precision_recall_f1(matched, hyp_total,
                                                   len(ref_tokens))
            f1_candidates.append(f1)

        best_f1_per_sentence.append(np.max(f1_candidates))

    return 100. * np.mean(best_f1_per_sentence)
Example #2
0
def corpus_quality_estimation(orig_sentences: List[str],
                              sys_sentences: List[str],
                              lowercase: bool = False,
                              tokenizer: str = '13a'):
    """Reference-less quality-estimation statistics over a corpus.

    Each returned value is the corpus average of a pairwise metric
    between the normalized original and system sentences.
    """
    orig_sentences = [
        normalize(sent, lowercase, tokenizer) for sent in orig_sentences
    ]
    sys_sentences = [
        normalize(sent, lowercase, tokenizer) for sent in sys_sentences
    ]
    # Label -> per-pair metric function; all are averaged the same way.
    metric_fns = {
        'Compression ratio': get_compression_ratio,
        'Sentence splits': count_sentence_splits,
        'Levenshtein similarity': get_levenshtein_similarity,
        'Exact copies': is_exact_match,
        'Additions proportion': get_additions_proportion,
        'Deletions proportion': get_deletions_proportion,
        'Lexical complexity score':
        wrap_single_sentence_vectorizer(get_wordrank_score),
    }
    return {
        label: get_average(fn, orig_sentences, sys_sentences)
        for label, fn in metric_fns.items()
    }
Example #3
0
def get_corpus_sari_operation_scores(orig_sents: List[str], sys_sents: List[str], refs_sents: List[List[str]],
                                     lowercase: bool = True, tokenizer: str = '13a',
                                     legacy=False, use_f1_for_deletion=True, use_paper_version=False):
    """Return SARI (add, keep, delete) operation scores, each scaled to 0-100.

    The `legacy` parameter allows reproducing scores reported in previous work.
    It replicates a bug in the original JAVA implementation where only the system
    outputs and the reference sentences are further tokenized.
    In addition, it assumes that all sentences are already lowercased.
    """
    if legacy:
        # Legacy mode: leave orig_sents untouched and disable lowercasing everywhere.
        lowercase = False
    else:
        orig_sents = [
            utils_prep.normalize(sent, lowercase, tokenizer)
            for sent in orig_sents
        ]

    sys_sents = [
        utils_prep.normalize(sent, lowercase, tokenizer) for sent in sys_sents
    ]
    refs_sents = [
        [utils_prep.normalize(sent, lowercase, tokenizer) for sent in one_refs]
        for one_refs in refs_sents
    ]

    stats = compute_ngram_stats(orig_sents, sys_sents, refs_sents)

    # Micro averaging matches the original paper; macro averaging is the default.
    sari_fn = compute_micro_sari if use_paper_version else compute_macro_sari
    add_score, keep_score, del_score = sari_fn(*stats, use_f1_for_deletion=use_f1_for_deletion)
    return 100. * add_score, 100. * keep_score, 100. * del_score
Example #4
0
    def identify_operations(self, orig_sentences: List[str], simp_sentences: List[str]):
        """Label edit operations for each aligned original/simplified sentence pair.

        Returns two parallel lists of per-sentence label sequences: labels for
        the original sentences (default label "C") and labels for the
        simplified sentences (default label "O").
        """
        orig_sentences = [utils_prep.normalize(sent, self._lowercase, self._tokenizer) for sent in orig_sentences]
        simp_sentences = [utils_prep.normalize(sent, self._lowercase, self._tokenizer) for sent in simp_sentences]

        # Parse originals and simplifications in one batch, then split the result.
        parses = syntactic_parse_texts(
            orig_sentences + simp_sentences,
            with_constituency_parse=self._include_phrase_level,
            verbose=self._verbose,
        )
        n_orig = len(orig_sentences)
        orig_parses = parses[:n_orig]
        simp_parses = parses[n_orig:]

        orig_labels_per_sentence = []
        simp_labels_per_sentence = []
        quadruples = zip(orig_sentences, simp_sentences, orig_parses, simp_parses)
        for orig_sent, simp_sent, orig_parse, simp_parse in tqdm(quadruples, disable=(not self._verbose)):
            alignments = self._get_word_alignments(orig_sent, orig_parse, simp_sent, simp_parse)
            orig_annots, simp_annots = annotate_sentence(
                orig_sent.split(),
                simp_sent.split(),
                alignments,
                orig_parse,
                simp_parse,
            )
            orig_labels_per_sentence.append(_from_annots_to_labels(orig_annots, default_label="C"))
            simp_labels_per_sentence.append(_from_annots_to_labels(simp_annots, default_label="O"))

        return orig_labels_per_sentence, simp_labels_per_sentence
Example #5
0
def corpus_sari(orig_sents: List[str], sys_sents: List[str], refs_sents: List[List[str]],
                lowercase: bool = False, tokenizer: str = '13a'):
    """Corpus-level SARI score of system outputs against references.

    NOTE(review): orig_sents are NOT normalized here (the line is commented
    out below), while sys_sents and refs_sents are — presumably to replicate
    the original implementation's behavior where only system outputs and
    references are further tokenized; confirm this is intentional.
    """
    # orig_sents = [utils_prep.normalize(sent, lowercase, tokenizer) for sent in orig_sents]
    sys_sents = [utils_prep.normalize(sent, lowercase, tokenizer) for sent in sys_sents]
    refs_sents = [[utils_prep.normalize(sent, lowercase, tokenizer) for sent in ref_sents]
                      for ref_sents in refs_sents]

    # n-gram add/keep/delete statistics feed the macro-averaged SARI computation.
    stats = compute_ngram_stats(orig_sents, sys_sents, refs_sents)

    return compute_macro_sari(*stats, corpus_level=True)
Example #6
0
def get_bertscore_sentence_scores(
    sys_sents: List[str],
    refs_sents: List[List[str]],
    lowercase: bool = False,
    tokenizer: str = "13a",
):
    """Per-sentence BERTScore of system outputs against their references."""
    scorer = BERTScorer(lang="en", rescale_with_baseline=True)

    normalized_sys = [utils_prep.normalize(sent, lowercase, tokenizer) for sent in sys_sents]
    normalized_refs = [
        [utils_prep.normalize(sent, lowercase, tokenizer) for sent in one_refs]
        for one_refs in refs_sents
    ]
    # Transpose: from one list per reference set to one list of references per sentence.
    refs_per_sentence = [list(group) for group in zip(*normalized_refs)]

    return scorer.score(normalized_sys, refs_per_sentence)
Example #7
0
def get_samsa_sentence_scores(orig_sents: List[str], sys_sents: List[str], lowercase: bool = False, tokenizer: str = '13a',
                              verbose: bool = False):
    """Per-sentence SAMSA scores (each scaled to 0-100) of system outputs."""
    print('Warning: SAMSA metric is long to compute (120 sentences ~ 4min), disable it if you need fast evaluation.')

    # Source side: UCCA-parse the originals, then syntactically parse their scenes.
    orig_sents = [utils_prep.normalize(sent, lowercase, tokenizer) for sent in orig_sents]
    orig_ucca_passages = ucca_parse_texts(orig_sents)
    orig_synt_scenes = syntactic_parse_ucca_scenes(orig_ucca_passages, tokenize=False, sentence_split=False,
                                                   verbose=verbose)

    # System side: syntactic parse with sentence splitting enabled.
    sys_sents = [utils_prep.normalize(sent, lowercase, tokenizer) for sent in sys_sents]
    sys_sents_synt = syntactic_parse_texts(sys_sents, tokenize=False, sentence_split=True, verbose=verbose)

    aligned = zip(orig_ucca_passages, orig_synt_scenes, sys_sents_synt)
    return [
        100. * compute_samsa(passage, scenes, sys_synt)
        for passage, scenes, sys_synt in tqdm(aligned, disable=(not verbose))
    ]
Example #8
0
def corpus_bleu(
    sys_sents: List[str],
    refs_sents: List[List[str]],
    smooth_method: str = "exp",
    smooth_value: float = None,
    force: bool = True,
    lowercase: bool = False,
    tokenizer: str = "13a",
    effective_order: bool = False,
):
    """Corpus BLEU of system outputs against multiple reference sets.

    Normalization (tokenization/lowercasing) happens here, so the BLEU
    scorer itself is configured to neither tokenize nor lowercase.
    """
    def _norm(sent):
        return utils_prep.normalize(sent, lowercase, tokenizer)

    sys_sents = [_norm(sent) for sent in sys_sents]
    refs_sents = [[_norm(sent) for sent in one_refs] for one_refs in refs_sents]

    bleu_scorer = BLEU(
        lowercase=False,
        force=force,
        tokenize="none",
        smooth_method=smooth_method,
        smooth_value=smooth_value,
        effective_order=effective_order,
    )
    return bleu_scorer.corpus_score(sys_sents, refs_sents).score
Example #9
0
def corpus_samsa(orig_sentences: List[str], sys_outputs: List[str], lowercase: bool = False, tokenizer: str = '13a',
                 verbose: bool = False):
    """Corpus-level SAMSA score (scaled to 0-100).

    Computes the mean per-sentence SAMSA of each system output against its
    UCCA-parsed original sentence.

    Raises:
        ZeroDivisionError: if `orig_sentences` is empty.
    """
    # Fixed duplicated word in the user-facing warning ("if if" -> "if").
    print('Warning: SAMSA metric is long to compute, disable it if you need fast evaluation.')
    orig_sentences = [utils_prep.normalize(sent, lowercase, tokenizer) for sent in orig_sentences]
    orig_ucca_sents = ucca_parse_texts(orig_sentences)

    # System outputs get a syntactic parse with sentence splitting enabled.
    sys_outputs = [utils_prep.normalize(output, lowercase, tokenizer) for output in sys_outputs]
    sys_synt_outputs = syntactic_parse_texts(sys_outputs, tokenize=False, sentence_split=True, verbose=verbose)

    if verbose:
        print("Computing SAMSA score...")

    samsa_score = 0.0
    for orig_ucca, sys_synt in tqdm(zip(orig_ucca_sents, sys_synt_outputs), disable=(not verbose)):
        samsa_score += compute_samsa(orig_ucca, sys_synt)

    # Average over the number of source sentences.
    samsa_score /= len(orig_sentences)

    return 100. * samsa_score
Example #10
0
def corpus_bleu(sys_sents: List[str],
                refs_sents: List[List[str]],
                smooth_method: str = 'exp',
                smooth_value: float = None,
                force: bool = True,
                lowercase: bool = False,
                tokenizer: str = '13a',
                use_effective_order: bool = False):
    """Corpus BLEU via sacrebleu on pre-normalized sentences.

    Lowercasing and tokenization are applied here, so sacrebleu is told
    to skip both.
    """
    normalized_sys = [utils_prep.normalize(sent, lowercase, tokenizer)
                      for sent in sys_sents]
    normalized_refs = [[utils_prep.normalize(sent, lowercase, tokenizer)
                        for sent in one_refs]
                       for one_refs in refs_sents]

    result = sacrebleu.corpus_bleu(normalized_sys,
                                   normalized_refs,
                                   smooth_method,
                                   smooth_value,
                                   force,
                                   lowercase=False,
                                   tokenize='none',
                                   use_effective_order=use_effective_order)
    return result.score
Example #11
0
def corpus_fkgl(sentences: List[str], tokenizer: str = '13a'):
    """Flesch-Kincaid grade level accumulated over all normalized sentences."""
    fkgl = FKGLScorer()
    for sent in sentences:
        fkgl.add(normalize(sent, tokenizer=tokenizer))
    return fkgl.score()