Example #1
def get_total_metrics(sys1_path, sys2_path, ref_path, lowercase=False, max_order=1, sys1_name='sys1', sys2_name='sys2', filepath=None):
    with open(sys1_path, 'rt') as sys1_file:
        with open(sys2_path, 'rt') as sys2_file:
            with open(ref_path, 'rt') as ref_file:
                sys1_list = [line.strip() for line in sys1_file.readlines()]
                sys2_list = [line.strip() for line in sys2_file.readlines()]
                ref_list = [line.strip() for line in ref_file.readlines()]

    mf1_sys1 = sacrebleu.corpus_rebleu2(sys1_list, [ref_list], lowercase=lowercase, average='macro', max_order=max_order)
    mf1_sys2 = sacrebleu.corpus_rebleu2(sys2_list, [ref_list], lowercase=lowercase, average='macro', max_order=max_order)
    mf1_sys1_f1 = sacrebleu.corpus_rebleu2(sys1_list, [ref_list], lowercase=lowercase, average='macro', max_order=max_order, measure_name='f1')
    mf1_sys2_f1 = sacrebleu.corpus_rebleu2(sys2_list, [ref_list], lowercase=lowercase, average='macro', max_order=max_order, measure_name='f1')
    mf1_sys1_new = sacrebleu.corpus_rebleu2(sys1_list, [ref_list], average='macro', word_class=True, max_order=max_order)
    mf1_sys2_new = sacrebleu.corpus_rebleu2(sys2_list, [ref_list], average='macro', word_class=True, max_order=max_order)
    mf1_sys1_new_f1 = sacrebleu.corpus_rebleu2(sys1_list, [ref_list], average='macro', word_class=True, max_order=max_order, measure_name='f1')
    mf1_sys2_new_f1 = sacrebleu.corpus_rebleu2(sys2_list, [ref_list], average='macro', word_class=True, max_order=max_order, measure_name='f1')
    bleu_sys1 = sacrebleu.corpus_bleu(sys1_list, [ref_list], lowercase=lowercase)
    bleu_sys2 = sacrebleu.corpus_bleu(sys2_list, [ref_list], lowercase=lowercase)
    chrf_sys1 = sacrebleu.corpus_chrf(sys1_list, ref_list)
    chrf_sys2 = sacrebleu.corpus_chrf(sys2_list, ref_list)
    micro_perc_sys1, macro_perc_sys1, total_list_sys1, en_list_sys1, total_sys1, en_sys1 = get_percent_en(sys1_list)
    micro_perc_sys2, macro_perc_sys2, total_list_sys2, en_list_sys2, total_sys2, en_sys2 = get_percent_en(sys2_list)


    # bleurt_checkpoint = "/Users/weiqiuyou/Documents/USC_ISI/QUM/tools/bleurt/bleurt/bleurt-base-128"
    # scorer = score.BleurtScorer(bleurt_checkpoint)
    # bleurt_sys1 = np.mean(scorer.score(ref_list, sys1_list))
    # bleurt_sys2 = np.mean(scorer.score(ref_list, sys2_list))

    report = ''
    report += f'mf1_{sys1_name}: {mf1_sys1}\n'
    report += f'mf1_{sys2_name}: {mf1_sys2}\n'
    report += f'mf1_{sys1_name}_f1: {mf1_sys1_f1}\n'
    report += f'mf1_{sys2_name}_f1: {mf1_sys2_f1}\n'
    report += f'mf1_new_{sys1_name}: {mf1_sys1_new}\n'
    report += f'mf1_new_{sys2_name}: {mf1_sys2_new}\n'
    report += f'mf1_new_{sys1_name}_f1: {mf1_sys1_new_f1}\n'
    report += f'mf1_new_{sys2_name}_f1: {mf1_sys2_new_f1}\n'
    report += f'bleu_{sys1_name}: {bleu_sys1}\n'
    report += f'bleu_{sys2_name}: {bleu_sys2}\n'
    report += f'chrf_{sys1_name}: {chrf_sys1}\n'
    report += f'chrf_{sys2_name}: {chrf_sys2}\n'
    report += f'micro_perc_{sys1_name}: {micro_perc_sys1}\tmacro_perc_{sys1_name}: {macro_perc_sys1}\n'
    report += f'micro_perc_{sys2_name}: {micro_perc_sys2}\tmacro_perc_{sys2_name}: {macro_perc_sys2}\n'
    # report += f'bleurt_{sys1_name}: {bleurt_sys1}\n'
    # report += f'bleurt_{sys2_name}: {bleurt_sys2}\n'

    print(report)
    if filepath is not None:
        with open(filepath, 'wt') as output_file:
            output_file.write(report)
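
A minimal usage sketch for the function above, assuming each of the three files holds one detokenized segment per line; the paths and system names below are purely illustrative:

# Hypothetical paths; each file is expected to contain one segment per line.
get_total_metrics(
    'outputs/baseline.txt',          # sys1_path
    'outputs/improved.txt',          # sys2_path
    'data/test.ref.txt',             # ref_path
    lowercase=True,
    max_order=1,
    sys1_name='baseline',
    sys2_name='improved',
    filepath='reports/metrics.txt',  # also write the printed report to disk
)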
Example #2
def get_scores(enc_sources,
               enc_target_sents,
               model,
               device,
               tokenizersrc,
               tokenizertrg,
               search="greedy",
               n=4):
    """
    takes a list of sentences and their translations in string form and returns score objects
    model is the trained transformer model
    tokenizersrc and tokenizertrg are the SentencePiece (spm) models in the form "name.model"
    search is the decoding strategy, either greedy or beam search
    n is the beam width in beam search
    """
    model.eval()
    sp.load(tokenizertrg)
    targets = []
    outputs = []
    target_str = [sp.DecodeIds(sent.tolist()) for sent in enc_target_sents]
    output_str = []
    if search == "greedy":
        x = divide_chunks(enc_sources, 100)
        output_str = []
        for sents in x:
            print((len(output_str) / len(enc_sources)) * 100, end="\r")
            y = translate_enc_sentences(model,
                                        sents,
                                        device,
                                        tokenizertrg,
                                        max_length=150)
            output_str.extend(y)
        bleu = sacrebleu.corpus_bleu(output_str, [target_str])
        chrf = sacrebleu.corpus_chrf(output_str, [target_str])
        ter = sacrebleu.corpus_ter(output_str, [target_str])

        return bleu, chrf, ter
    elif search == "beam":
        # Decode each source individually with beam search. Assumes beam_search
        # accepts an encoded source sentence, like translate_enc_sentences above.
        for enc_source, enc_target in zip(enc_sources, enc_target_sents):
            prediction = beam_search(enc_source, device, tokenizersrc,
                                     tokenizertrg, n)
            target = sp.DecodeIds(enc_target.tolist())
            targets.append([target.split()])
            outputs.append(prediction.split())
            output_str.append(prediction)
            # target_str was already built from enc_target_sents above,
            # so the references are not appended again here.

    bleu = sacrebleu.corpus_bleu(output_str, [target_str])
    chrf = sacrebleu.corpus_chrf(output_str, [target_str])
    ter = sacrebleu.corpus_ter(output_str, [target_str])
    return bleu, chrf, ter
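
The three values returned above are sacrebleu score objects, so a caller typically reads their .score attributes. A sketch, assuming enc_sources and enc_target_sents were encoded elsewhere and that "src.model" / "trg.model" are hypothetical SentencePiece files:

# bleu, chrf and ter each expose a numeric .score field.
bleu, chrf, ter = get_scores(enc_sources, enc_target_sents, model, device,
                             "src.model", "trg.model", search="greedy")
print(f"BLEU {bleu.score:.2f}  chrF {chrf.score:.2f}  TER {ter.score:.2f}")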
Example #3
def compute_metrics(ref, hyp, hyp_order):
    refs = []
    hyps = []
    for id in hyp_order:
        for segment in hyp[id]:
            hyps.append(segment)
        try:
            for segment in ref[id]:
                refs.append(segment)
        except KeyError:
            sys.stderr.write('Error: there are no references for document' +
                             ' "' + id + '"\n')
            sys.exit(-1)

    metrics = []
    for n in range(len(hyps)):
        try:
            bleu = sacrebleu.corpus_bleu([hyps[n]], [[refs[n]]])
            chrf = sacrebleu.corpus_chrf([hyps[n]], [[refs[n]]])
        except EOFError:
            sys.stderr.write('Error: source and reference have different' +
                             ' lengths.\n')
            sys.exit(-1)
        metrics.append([bleu.score] + [chrf.score])
    return metrics
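
A usage sketch for compute_metrics, assuming ref and hyp map a document id to its list of segments (the ids and sentences are made up):

ref = {'doc1': ['The dog ran home.', 'It was late.']}
hyp = {'doc1': ['The dog runs home.', 'It was late.']}
for sent_bleu, sent_chrf in compute_metrics(ref, hyp, hyp_order=['doc1']):
    print(sent_bleu, sent_chrf)   # per-segment BLEU and chrF scores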
Example #4
def test_chrf_keep_whitespace(hypotheses, references, expected_score):
    score = sacrebleu.corpus_chrf(hypotheses, [references],
                                  char_order=6,
                                  word_order=0,
                                  beta=3,
                                  remove_whitespace=False).score
    assert abs(score - expected_score) < EPSILON
Example #5
def test_chrf_eff_order(hypotheses, references, expected_score):
    score = sacrebleu.corpus_chrf(hypotheses, [references],
                                  char_order=6,
                                  word_order=0,
                                  beta=3,
                                  eps_smoothing=False).score
    assert abs(score - expected_score) < EPSILON
Example #6
def test_chrf_keep_whitespace(hypotheses, references, expected_score):
    score = sacrebleu.corpus_chrf(hypotheses,
                                  references,
                                  6,
                                  3,
                                  remove_whitespace=False)
    assert abs(score - expected_score) < EPSILON
Example #7
def compute_metrics(hyp_dec_all,
                    ref_dec_all,
                    use_sacrebleu=True,
                    use_torchtext=True,
                    use_ter=False):
    metrics = {}

    # Sacrebleu
    if use_sacrebleu:
        metrics["sacrebleu_rawcorpusbleu"] = sacrebleu.raw_corpus_bleu(
            hyp_dec_all, [ref_dec_all]).score
        metrics["sacrebleu_bleu"] = sacrebleu.corpus_bleu(
            hyp_dec_all, [ref_dec_all]).score
        metrics["sacrebleu_chrf"] = sacrebleu.corpus_chrf(
            hyp_dec_all, [ref_dec_all]).score
        if use_ter:  # Quite slow
            metrics["sacrebleu_ter"] = sacrebleu.corpus_ter(
                hyp_dec_all, [ref_dec_all]).score

    # Torchtext
    if use_torchtext:
        m_bleu_score = bleu_score([x.split(" ") for x in hyp_dec_all],
                                  [[x.split(" ")] for x in ref_dec_all])
        metrics["torchtext_bleu"] = m_bleu_score * 100
    return metrics
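
A quick illustrative call (the torchtext bleu_score import is assumed to be in scope, as in the snippet):

hyp_dec_all = ['the cat sat on a mat']
ref_dec_all = ['the cat sat on the mat']
print(compute_metrics(hyp_dec_all, ref_dec_all, use_ter=False))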
Example #8
 def __call__(self, ref_path: str, hyp_path: str) -> float:
     ref_streams = load_ref_streams(ref_path, detok=True)
     sys_stream = load_sys_stream(hyp_path, detok=True)
     chrf3_score = sacrebleu.corpus_chrf(sys_stream,
                                         ref_streams,
                                         order=6,
                                         beta=3,
                                         remove_whitespace=True)
     return np.round(float(chrf3_score.score * 100), 2)
Example #9
def chrf(hypotheses, references):
    """
    Character F-score from sacrebleu

    :param hypotheses: list of hypotheses (strings)
    :param references: list of references (strings)
    :return:
    """
    return sacrebleu.corpus_chrf(hypotheses=hypotheses, references=references)
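
A short usage sketch with illustrative sentences; note that, depending on the installed sacrebleu version, the references may need to be wrapped in an extra list (compare the later examples that pass [references]):

hyps = ['the cat sat on the mat']
refs = ['the cat sat on a mat']
print(chrf(hyps, refs))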
Example #10
def chrf(hypotheses, references):
    """
    Character F-score from sacrebleu

    :param hypotheses: list of hypotheses (strings)
    :param references: list of references (strings)
    :return:
    """
    return sacrebleu.corpus_chrf(hypotheses=hypotheses, references=references)
Example #11
def raw_corpus_chrf(hypotheses: Iterable[str],
                    references: Iterable[str]) -> float:
    """
    Simple wrapper around sacreBLEU's chrF implementation, without tokenization.

    :param hypotheses: Hypotheses stream.
    :param references: Reference stream.
    :return: chrF score as float between 0 and 1.
    """
    return sacrebleu.corpus_chrf(hypotheses, [references]).score
Example #12
def raw_corpus_chrf(hypotheses: Iterable[str], references: Iterable[str]) -> float:
    """
    Simple wrapper around sacreBLEU's chrF implementation, without tokenization.

    :param hypotheses: Hypotheses stream.
    :param references: Reference stream.
    :return: chrF score as float between 0 and 1.
    """
    return sacrebleu.corpus_chrf(hypotheses, references, order=sacrebleu.CHRF_ORDER, beta=sacrebleu.CHRF_BETA,
                                 remove_whitespace=True)
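
For comparison, a rough sketch of the same wrapper written against the keyword-based API used by the test examples earlier in this listing (char_order, beta, remove_whitespace); the exact defaults behind CHRF_ORDER and CHRF_BETA vary by sacrebleu version, so the values below are only an assumption:

def raw_corpus_chrf_keywords(hypotheses, references):
    # Sketch only: chrF3-style settings (character 6-grams, beta=3),
    # mirroring the chrf3 examples elsewhere on this page.
    return sacrebleu.corpus_chrf(hypotheses, [references],
                                 char_order=6, beta=3,
                                 remove_whitespace=True).score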
Example #13
def calculate_score_report(sys, ref, score_only):

    chrf = sacrebleu.corpus_chrf(sys, ref)
    bleu = sacrebleu.corpus_bleu(sys, ref)

    prefix = 'BLEU = ' if score_only else ''

    print('#### Score Report ####')
    print(chrf)
    print('{}{}'.format(prefix, bleu.format(score_only=score_only)))
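
A usage sketch, assuming sys is a list of hypothesis strings and ref is already a list of reference streams (one inner list per reference set):

sys_lines = ['the cat sat on a mat']
ref_lines = [['the cat sat on the mat']]
calculate_score_report(sys_lines, ref_lines, score_only=True)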
Example #14
def chrf(items):
    """chrF++ is a tool for automatic evaluation of machine translation output
    based on character n-gram precision and recall enhanced with word n-grams.
    Source: https://github.com/m-popovic/chrF
    Paper: https://www.aclweb.org/anthology/W15-3049.pdf

    Higher is better  # TODO I think
    """
    refs = list(zip(*items))[0]
    preds = list(zip(*items))[1]
    refs, preds = _sacreformat(refs, preds)
    return sacrebleu.corpus_chrf(preds, refs).score
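
A usage sketch; items is assumed to be a list of (reference, prediction) pairs, matching the zip unpacking above, and _sacreformat is the harness helper that reshapes them for sacrebleu:

items = [('the cat sat on the mat', 'the cat sat on a mat')]
print(chrf(items))   # corpus-level chrF over the paired segments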
Example #15
def chrf(hypotheses, references, remove_whitespace=True):
    """
    Character F-score from sacrebleu

    :param hypotheses: list of hypotheses (strings)
    :param references: list of references (strings)
    :param remove_whitespace: (bool)
    :return:
    """
    return sacrebleu.corpus_chrf(hypotheses=hypotheses,
                                 references=[references],
                                 remove_whitespace=remove_whitespace).score
Example #16
def chrf(hypotheses, references, remove_whitespace=True):
    """
    Character F-score from sacrebleu

    :param hypotheses: list of hypotheses (strings)
    :param references: list of references (strings)
    :param remove_whitespace: (bool)
    :return: character f-score (0 <= chf <= 1)
             see Breaking Change in sacrebleu v2.0
    """
    score = sacrebleu.corpus_chrf(hypotheses=hypotheses,
                                  references=[references],
                                  remove_whitespace=remove_whitespace).score
    return score / 100
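
A quick check of the wrapper above with illustrative strings; since sacrebleu 2.0 reports chrF on a 0-100 scale, the division keeps the result in [0, 1]:

value = chrf(['the cat sat on the mat'], ['the cat sat on a mat'])
assert 0.0 <= value <= 1.0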
Example #17
 def evaluate_batch(self, summaries, references, aggregate=True):
     if aggregate:
         score = sacrebleu.corpus_chrf(summaries,
                                       references,
                                       order=self.ncorder,
                                       beta=self.beta)
         score_dict = {"chrf": score.score}
         return score_dict
     else:
         p = Pool(processes=self.n_workers)
         results = p.starmap(self.evaluate_example,
                             zip(summaries, references))
         p.close()
         return results
Example #18
def write_evals(writer, experiment, translation, file_path, ref, src):
    writer = SummaryWriter("runs/{}-{}".format(experiment, translation))

    steps = int(translation)
    output_path = "translations/{}/{}".format(experiment, translation)
    with open(output_path, "r", encoding="utf-8") as infile:
        system_output = [x.strip() for x in infile.readlines()]
        bleu = sacrebleu.corpus_bleu(system_output, [ref])
        chrf = sacrebleu.corpus_chrf(system_output, [ref])
        rhyme_score, copied, reconstructed = concurrent_score(system_output,
                                                                languages[experiment],
                                                                ref, src)
        print(experiment, translation, bleu.score, rhyme_score, copied, reconstructed)

    wall = os.stat(file_path).st_mtime
    writer.add_scalar(experiment + "/CHRF", chrf.score, global_step=steps, walltime=wall)
    writer.add_scalar(experiment + "/BLEU", bleu.score, global_step=steps, walltime=wall)
    writer.add_scalar(experiment + "/Rhyme", rhyme_score, global_step=steps, walltime=wall)
    writer.add_scalar(experiment + "/Copied", copied, global_step=steps, walltime=wall)
    writer.add_scalar(experiment + "/Reconstructed", reconstructed, global_step=steps, walltime=wall)
    writer.flush()
Example #19
def eval_measure(gold, sys, eval_type='bleu'):
    ''' Evaluation measure

    This takes in gold labels and system outputs and evaluates their
    accuracy. It currently supports:
    * Accuracy (acc), percentage of labels that match
    * Pearson's correlation coefficient (pearson)
    * BLEU score (bleu)
    * BLEU_detok, on detokenized references and translations, with internal tokenization
    :param gold: the correct labels (reference)
    :param sys: the system outputs (hypothesis)
    :param eval_type: The type of evaluation to do (bleu, chrf3, hlepor)
    '''
    if eval_type == EVAL_TYPE_BLEU:
        # make sure score is 0-based instead of 100-based
        return corpus_bleu(sys, [gold]).score / 100.
    elif eval_type == EVAL_TYPE_CHRF3:
        return corpus_chrf(sys, [gold], beta=3).score
    elif eval_type == EVAL_TYPE_HLEPOR:
        return hlepor_score(sys, gold)
    else:
        raise NotImplementedError('Unknown eval type in eval_measure: %s' % eval_type)
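
A usage sketch, assuming the EVAL_TYPE_* constants (not shown in the snippet) are plain strings such as 'bleu' and 'chrf3':

gold = ['the cat sat on the mat']
system = ['the cat sat on a mat']
print(eval_measure(gold, system, eval_type='bleu'))    # BLEU rescaled to 0-1
print(eval_measure(gold, system, eval_type='chrf3'))   # chrF with beta=3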
Example #20
 def chrf(self, hypo, groundtruth=None, lc=False):
     if groundtruth is None:
         ref = self._refs_for_sacre
     else:
         if isinstance(groundtruth[0], str):
             ref = [groundtruth]
         else:
             ref = groundtruth
     try:
         chrf = sacrebleu.corpus_chrf([(x.lower() if lc else x)
                                       for x in hypo],
                                      [[(x.lower() if lc else x) for x in y]
                                       for y in ref])
         return chrf.score
     except IndexError:
         logging.info("Found empty lines.")
         print(traceback.format_exc())
         return 0.
     except ZeroDivisionError:
         logging.info("Empty reference")
         print(traceback.format_exc())
         return 0.
Example #21
def validate(model,
             test_data,
             golden_file,
             beam_size=8,
             alpha=0.6,
             max_time_step=100):
    """For development Only"""
    pp = PostProcess()

    ref_stream = []
    for line in open(golden_file + '.input_clean'):
        if line.startswith('# ::tokens '):
            o = json.loads(line[len('# ::tokens '):].strip())
            ref_stream.append(' '.join(o).lower())
    # gold model output
    graph, gold_sys_stream, _, abstract = read_file(golden_file + '.preproc')
    ref_streams = [ref_stream]

    sys_stream = []
    for batch in test_data:
        res = generate_batch(model, batch, beam_size, alpha, max_time_step)
        sys_stream.extend(res['token'])

    assert len(sys_stream) == len(ref_stream)
    sys_stream = [
        pp.post_process(o, abstract[i], graph[i])
        for i, o in enumerate(sys_stream)
    ]

    bleu = sacrebleu.corpus_bleu(sys_stream,
                                 ref_streams,
                                 force=True,
                                 lowercase=True,
                                 tokenize='none').score
    chrf = sacrebleu.corpus_chrf(sys_stream, ref_stream)

    return bleu, chrf
Example #22
def validate(model, test_data, beam_size=8, alpha=0.6, max_time_step=100):
    """For development Only"""
    pp = PostProcess()

    ref_stream = []
    sys_stream = []
    for batch in test_data:
        res = generate_batch(model, batch, beam_size, alpha, max_time_step)
        sys_stream.extend(res['token'])
        ref_stream.extend(batch['target'])

    assert len(sys_stream) == len(ref_stream)
    sys_stream = [pp.post_process(o) for o in sys_stream]
    ref_stream = [' '.join(o) for o in ref_stream]
    ref_streams = [ref_stream]

    bleu = sacrebleu.corpus_bleu(sys_stream,
                                 ref_streams,
                                 force=True,
                                 lowercase=False,
                                 tokenize='none').score
    chrf = sacrebleu.corpus_chrf(sys_stream, ref_stream)

    return bleu, chrf
Example #23
 def score_corpus_multiprocess(self, hypothesis: List[str],
                               references: List[List[str]]) -> float:
     if self.n_workers == 1:
         corpus_score = sb.corpus_chrf(hypothesis, references[0]).score
     else:
         batches = list(
             self._batch(hypothesis, references, n_batches=self.n_workers))
         corpus_statistics = [0 for _ in range(sb.CHRF_ORDER * 3)]
         with ProcessPoolExecutor(max_workers=self.n_workers) as executor:
             futures = [
                 executor.submit(sb.get_corpus_statistics, b[0], b[1][0])
                 for b in batches
             ]
             progress = as_completed(futures)
             if self.verbose:
                 progress = tqdm(progress)
             for future in progress:
                 stats = future.result()
                 for i in range(sb.CHRF_ORDER * 3):
                     corpus_statistics[i] += stats[i]
         avg_precision, avg_recall = sb._avg_precision_and_recall(
             corpus_statistics, sb.CHRF_ORDER)
         corpus_score = sb._chrf(avg_precision, avg_recall)
     return corpus_score
Example #24
def raw_corpus_chrf(hypotheses: Iterable[str],
                    references: Iterable[str]) -> float:
    return sacrebleu.corpus_chrf(hypotheses, references,
                                 order=sacrebleu.CHRF_ORDER,
                                 beta=sacrebleu.CHRF_BETA,
                                 remove_whitespace=True)
Example #25
def test_chrf(hypotheses, references, expected_score):
    score = sacrebleu.corpus_chrf(hypotheses, [references], 6, 3).score
    assert abs(score - expected_score) < EPSILON
Example #26
#
# sys = ['The dog runs home hi.', 'the dog runs home hi.']
# ref = ['The dog ran home hi.', 'the dog runs home hello.']
# mf1 = sacrebleu.corpus_rebleu2(sys, [ref], average='macro', word_class=True)
# mf1_old = sacrebleu.corpus_rebleu2(sys, [ref], average='macro', word_class=False)
# print(mf1.score)
# print(mf1_old.score)


sys = ['The dog runs home now.']
ref = ['The dog ran home now.']
mf1 = sacrebleu.corpus_rebleu2(sys, [ref], average='macro', word_class=True)
mf1_f1 = sacrebleu.corpus_rebleu2(sys, [ref], average='macro', word_class=True, measure_name='f1')
mf1_old = sacrebleu.corpus_rebleu2(sys, [ref], average='macro', word_class=False)
mf1_old_f1 = sacrebleu.corpus_rebleu2(sys, [ref], average='macro', word_class=False, measure_name='f1')
chrf = sacrebleu.corpus_chrf(sys, ref)
print("sys:", sys)
print("ref:", ref)
print("mf1 new:", mf1.score)
print("mf1 f1 new:", mf1_f1.score)
print("mf1 old:", mf1_old.score)
print("mf1 f1 old:", mf1_old_f1.score)
print("chrf:", chrf)

print("-------------")

sys = ['The dog runs home now.']
ref = ['The dog runs home later.']
mf1 = sacrebleu.corpus_rebleu2(sys, [ref], average='macro', word_class=True)
mf1_f1 = sacrebleu.corpus_rebleu2(sys, [ref], average='macro', word_class=True, measure_name='f1')
mf1_old = sacrebleu.corpus_rebleu2(sys, [ref], average='macro', word_class=False)
Example #27
    prev = [' '.join(o) for o in pred_sys_stream]

    # choose one (gold or pred) and postprocess
    sys_stream = pred_sys_stream
    sys_stream = [
        pp.post_process(o, abstract[i], graph[i])
        for i, o in enumerate(sys_stream)
    ]

    bleu = sacrebleu.corpus_bleu(sys_stream,
                                 ref_streams,
                                 force=True,
                                 lowercase=True,
                                 tokenize='none').score
    chrf = sacrebleu.corpus_chrf(sys_stream, ref_stream)
    all_sent_chrf = [
        sacrebleu.sentence_chrf(x, y) for x, y in zip(sys_stream, ref_stream)
    ]
    avg_sent_chrf = sum(all_sent_chrf) / len(all_sent_chrf)
    if args.output:
        with open(args.pred_file + '.final', 'w') as fo:
            for x in sys_stream:
                fo.write(x + '\n')

        with open(args.pred_file + '.ref', 'w') as fo:
            for x in ref_stream:
                fo.write(x + '\n')
    print(avg_sent_chrf)
    print(bleu, chrf)
Example #28
 def score(self, src: List[str], cand: List[str],
           ref: List[str]) -> chrFResult:
     chrf = sacrebleu.corpus_chrf(cand, [ref])
     return chrFResult(chrf.score / 100, [], src, cand, ref, self.name)
Example #29
def score_individual_books(
    book_dict: dict,
    src_iso: str,
    predictions_detok_path: str,
    scorers: Set[str],
    config: Config,
    ref_projects: Set[str],
):
    overall_sys: List[str] = []
    book_scores: List[PairScore] = []

    for book in book_dict.keys():
        for trg_iso, book_tuple in book_dict[book].items():
            pair_sys = book_tuple[0]
            pair_refs = book_tuple[1]
            overall_sys.extend(pair_sys)

            bleu_score = None
            if "bleu" in scorers:
                bleu_score = sacrebleu.corpus_bleu(
                    pair_sys,
                    pair_refs,
                    lowercase=True,
                    tokenize=config.data.get("sacrebleu_tokenize", "13a"),
                )

            if "sentencebleu" in scorers:
                write_sentence_bleu(
                    predictions_detok_path,
                    pair_sys,
                    pair_refs,
                    lowercase=True,
                    tokenize=config.data.get("sacrebleu_tokenize", "13a"),
                )

            other_scores: Dict[str, float] = {}
            if "chrf3" in scorers:
                chrf3_score = sacrebleu.corpus_chrf(pair_sys,
                                                    pair_refs,
                                                    order=6,
                                                    beta=3,
                                                    remove_whitespace=True)
                other_scores["CHRF3"] = np.round(
                    float(chrf3_score.score * 100), 2)

            if "meteor" in scorers:
                meteor_score = compute_meteor_score(trg_iso, pair_sys,
                                                    pair_refs)
                if meteor_score is not None:
                    other_scores["METEOR"] = meteor_score

            if "wer" in scorers:
                wer_score = compute_wer_score(pair_sys,
                                              cast(List[str], pair_refs))
                if wer_score >= 0:
                    other_scores["WER"] = wer_score

            if "ter" in scorers:
                ter_score = compute_ter_score(pair_sys, pair_refs)
                if ter_score >= 0:
                    other_scores["TER"] = ter_score
            score = PairScore(book, src_iso, trg_iso, bleu_score,
                              len(pair_sys), ref_projects, other_scores)
            book_scores.append(score)
    return book_scores
Example #30
def test_checkpoint(
    config: Config,
    force_infer: bool,
    by_book: bool,
    ref_projects: Set[str],
    checkpoint_path: Path,
    step: int,
    scorers: Set[str],
    books: Set[int],
) -> List[PairScore]:
    config.set_seed()
    vref_paths: List[str] = []
    features_file_names: List[str] = []
    predictions_file_names: List[str] = []
    refs_patterns: List[str] = []
    predictions_detok_file_names: List[str] = []
    suffix_str = "_".join(map(lambda n: book_number_to_id(n), sorted(books)))
    if len(suffix_str) > 0:
        suffix_str += "-"
    suffix_str += "avg" if step == -1 else str(step)

    features_file_name = "test.src.txt"
    if (config.exp_dir / features_file_name).is_file():
        # all test data is stored in a single file
        vref_paths.append("test.vref.txt")
        features_file_names.append(features_file_name)
        predictions_file_names.append(f"test.trg-predictions.txt.{suffix_str}")
        refs_patterns.append("test.trg.detok*.txt")
        predictions_detok_file_names.append(
            f"test.trg-predictions.detok.txt.{suffix_str}")
    else:
        # test data is split into separate files
        for src_iso in sorted(config.src_isos):
            for trg_iso in sorted(config.trg_isos):
                if src_iso == trg_iso:
                    continue
                prefix = f"test.{src_iso}.{trg_iso}"
                features_file_name = f"{prefix}.src.txt"
                if (config.exp_dir / features_file_name).is_file():
                    vref_paths.append(f"{prefix}.vref.txt")
                    features_file_names.append(features_file_name)
                    predictions_file_names.append(
                        f"{prefix}.trg-predictions.txt.{suffix_str}")
                    refs_patterns.append(f"{prefix}.trg.detok*.txt")
                    predictions_detok_file_names.append(
                        f"{prefix}.trg-predictions.detok.txt.{suffix_str}")

    checkpoint_name = "averaged checkpoint" if step == -1 else f"checkpoint {step}"

    features_paths: List[Union[str, List[str]]] = []
    predictions_paths: List[str] = []
    for i in range(len(predictions_file_names)):
        predictions_path = config.exp_dir / predictions_file_names[i]
        if force_infer or not predictions_path.is_file():
            features_path = config.exp_dir / features_file_names[i]
            vref_path = config.exp_dir / vref_paths[i]
            if vref_path.is_file():
                features_paths.append([str(features_path), str(vref_path)])
            else:
                features_paths.append(str(features_path))
            predictions_paths.append(str(predictions_path))
    if len(predictions_paths) > 0:
        runner = create_runner(config)
        print(f"Inferencing {checkpoint_name}...")
        runner.infer_multiple(features_paths,
                              predictions_paths,
                              checkpoint_path=str(checkpoint_path))

    print(f"Scoring {checkpoint_name}...")
    default_src_iso = config.default_src_iso
    scores: List[PairScore] = []
    overall_sys: List[str] = []
    overall_refs: List[List[str]] = []
    for vref_file_name, features_file_name, predictions_file_name, refs_pattern, predictions_detok_file_name in zip(
            vref_paths, features_file_names, predictions_file_names,
            refs_patterns, predictions_detok_file_names):
        src_iso = default_src_iso
        if features_file_name != "test.src.txt":
            src_iso = features_file_name.split(".")[1]
        dataset, book_dict = load_test_data(
            vref_file_name,
            features_file_name,
            predictions_file_name,
            refs_pattern,
            predictions_detok_file_name,
            ref_projects,
            config,
            books,
            by_book,
        )

        for trg_iso, (pair_sys, pair_refs) in dataset.items():
            start_index = len(overall_sys)
            overall_sys.extend(pair_sys)
            for i, ref in enumerate(pair_refs):
                if i == len(overall_refs):
                    overall_refs.append([""] * start_index)
                overall_refs[i].extend(ref)
            # ensure that all refs are the same length as the sys
            for overall_ref in filter(lambda r: len(r) < len(overall_sys),
                                      overall_refs):
                overall_ref.extend([""] *
                                   (len(overall_sys) - len(overall_ref)))
            bleu_score = None
            if "bleu" in scorers:
                bleu_score = sacrebleu.corpus_bleu(
                    pair_sys,
                    cast(List[Iterable[str]], pair_refs),
                    lowercase=True,
                    tokenize=config.data.get("sacrebleu_tokenize", "13a"),
                )

            if "sentencebleu" in scorers:
                write_sentence_bleu(
                    predictions_detok_file_name,
                    pair_sys,
                    cast(List[List[str]], pair_refs),
                    lowercase=True,
                    tokenize=config.data.get("sacrebleu_tokenize", "13a"),
                )

            other_scores: Dict[str, float] = {}
            if "chrf3" in scorers:
                chrf3_score = sacrebleu.corpus_chrf(pair_sys,
                                                    cast(
                                                        List[Iterable[str]],
                                                        pair_refs),
                                                    order=6,
                                                    beta=3,
                                                    remove_whitespace=True)
                other_scores["CHRF3"] = np.round(
                    float(chrf3_score.score * 100), 2)

            if "meteor" in scorers:
                meteor_score = compute_meteor_score(
                    trg_iso, pair_sys, cast(List[Iterable[str]], pair_refs))
                if meteor_score is not None:
                    other_scores["METEOR"] = meteor_score

            if "wer" in scorers:
                wer_score = compute_wer_score(pair_sys,
                                              cast(List[str], pair_refs))
                if wer_score >= 0:
                    other_scores["WER"] = wer_score

            if "ter" in scorers:
                ter_score = compute_ter_score(
                    pair_sys, cast(List[Iterable[str]], pair_refs))
                if ter_score >= 0:
                    other_scores["TER"] = ter_score

            scores.append(
                PairScore("ALL", src_iso, trg_iso, bleu_score, len(pair_sys),
                          ref_projects, other_scores))
            if by_book is True:
                if len(book_dict) != 0:
                    book_scores = score_individual_books(
                        book_dict, src_iso, predictions_detok_file_name,
                        scorers, config, ref_projects)
                    scores.extend(book_scores)
                else:
                    print(
                        "Error: book_dict did not load correctly. Not scoring individual books."
                    )
    if len(config.src_isos) > 1 or len(config.trg_isos) > 1:
        bleu = sacrebleu.corpus_bleu(overall_sys,
                                     cast(List[Iterable[str]], overall_refs),
                                     lowercase=True)
        scores.append(
            PairScore("ALL", "ALL", "ALL", bleu, len(overall_sys),
                      ref_projects))

    scores_file_root = f"scores-{suffix_str}"
    if len(ref_projects) > 0:
        ref_projects_suffix = "_".join(sorted(ref_projects))
        scores_file_root += f"-{ref_projects_suffix}"
    with (config.exp_dir / f"{scores_file_root}.csv").open(
            "w", encoding="utf-8") as scores_file:
        if len(scores) > 0:
            scores[0].writeHeader(scores_file)
        for results in scores:
            results.write(scores_file)
    return scores