Example #1
def get_total_metrics(sys1_path, sys2_path, ref_path, lowercase=False, max_order=1, sys1_name='sys1', sys2_name='sys2', filepath=None):
    with open(sys1_path, 'rt') as sys1_file:
        with open(sys2_path, 'rt') as sys2_file:
            with open(ref_path, 'rt') as ref_file:
                sys1_list = [line.strip() for line in sys1_file.readlines()]
                sys2_list = [line.strip() for line in sys2_file.readlines()]
                ref_list = [line.strip() for line in ref_file.readlines()]

    mf1_sys1 = sacrebleu.corpus_rebleu2(sys1_list, [ref_list], lowercase=lowercase, average='macro', max_order=max_order)
    mf1_sys2 = sacrebleu.corpus_rebleu2(sys2_list, [ref_list], lowercase=lowercase, average='macro', max_order=max_order)
    mf1_sys1_f1 = sacrebleu.corpus_rebleu2(sys1_list, [ref_list], lowercase=lowercase, average='macro', max_order=max_order, measure_name='f1')
    mf1_sys2_f1 = sacrebleu.corpus_rebleu2(sys2_list, [ref_list], lowercase=lowercase, average='macro', max_order=max_order, measure_name='f1')
    mf1_sys1_new = sacrebleu.corpus_rebleu2(sys1_list, [ref_list], average='macro', word_class=True, max_order=max_order)
    mf1_sys2_new = sacrebleu.corpus_rebleu2(sys2_list, [ref_list], average='macro', word_class=True, max_order=max_order)
    mf1_sys1_new_f1 = sacrebleu.corpus_rebleu2(sys1_list, [ref_list], average='macro', word_class=True, max_order=max_order, measure_name='f1')
    mf1_sys2_new_f1 = sacrebleu.corpus_rebleu2(sys2_list, [ref_list], average='macro', word_class=True, max_order=max_order, measure_name='f1')
    bleu_sys1 = sacrebleu.corpus_bleu(sys1_list, [ref_list], lowercase=lowercase)
    bleu_sys2 = sacrebleu.corpus_bleu(sys2_list, [ref_list], lowercase=lowercase)
    chrf_sys1 = sacrebleu.corpus_chrf(sys1_list, ref_list)
    chrf_sys2 = sacrebleu.corpus_chrf(sys2_list, ref_list)
    micro_perc_sys1, macro_perc_sys1, total_list_sys1, en_list_sys1, total_sys1, en_sys1 = get_percent_en(sys1_list)
    micro_perc_sys2, macro_perc_sys2, total_list_sys2, en_list_sys2, total_sys2, en_sys2 = get_percent_en(sys2_list)


    # bleurt_checkpoint = "/Users/weiqiuyou/Documents/USC_ISI/QUM/tools/bleurt/bleurt/bleurt-base-128"
    # scorer = score.BleurtScorer(bleurt_checkpoint)
    # bleurt_sys1 = np.mean(scorer.score(ref_list, sys1_list))
    # bleurt_sys2 = np.mean(scorer.score(ref_list, sys2_list))

    report = ''
    report += f'mf1_{sys1_name}: {mf1_sys1}\n'
    report += f'mf1_{sys2_name}: {mf1_sys2}\n'
    report += f'mf1_{sys1_name}_f1: {mf1_sys1_f1}\n'
    report += f'mf1_{sys2_name}_f1: {mf1_sys2_f1}\n'
    report += f'mf1_new_{sys1_name}: {mf1_sys1_new}\n'
    report += f'mf1_new_{sys2_name}: {mf1_sys2_new}\n'
    report += f'mf1_new_{sys1_name}_f1: {mf1_sys1_new_f1}\n'
    report += f'mf1_new_{sys2_name}_f1: {mf1_sys2_new_f1}\n'
    report += f'bleu_{sys1_name}: {bleu_sys1}\n'
    report += f'bleu_{sys2_name}: {bleu_sys2}\n'
    report += f'chrf_{sys1_name}: {chrf_sys1}\n'
    report += f'chrf_{sys2_name}: {chrf_sys2}\n'
    report += f'micro_perc_{sys1_name}: {micro_perc_sys1}\tmacro_perc_{sys1_name}: {macro_perc_sys1}\n'
    report += f'micro_perc_{sys2_name}: {micro_perc_sys2}\tmacro_perc_{sys2_name}: {macro_perc_sys2}\n'
    # report += f'bleurt_{sys1_name}: {bleurt_sys1}\n'
    # report += f'bleurt_{sys2_name}: {bleurt_sys2}\n'

    print(report)
    if filepath is not None:
        with open(filepath, 'wt') as output_file:
            output_file.write(report)
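
A minimal usage sketch for the function above, assuming each of the three files holds one detokenized segment per line; the paths and system names below are purely illustrative:

# Hypothetical paths; each file is expected to contain one segment per line.
get_total_metrics(
    'outputs/baseline.txt',          # sys1_path
    'outputs/improved.txt',          # sys2_path
    'data/test.ref.txt',             # ref_path
    lowercase=True,
    max_order=1,
    sys1_name='baseline',
    sys2_name='improved',
    filepath='reports/metrics.txt',  # also write the printed report to disk
)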
Example #2
def get_scores(enc_sources,
               enc_target_sents,
               model,
               device,
               tokenizersrc,
               tokenizertrg,
               search="greedy",
               n=4):
    """
    takes a list of sentences and their translations in string form and returns score objects
    model is the trained transformer model
    tokenizersrc and tokenizertrg are the SentencePiece (spm) models in the form "name.model"
    search is the decoding strategy, either greedy or beam search
    n is the beam width in beam search
    """
    model.eval()
    sp.load(tokenizertrg)
    targets = []
    outputs = []
    target_str = [sp.DecodeIds(sent.tolist()) for sent in enc_target_sents]
    output_str = []
    if search == "greedy":
        x = divide_chunks(enc_sources, 100)
        output_str = []
        for sents in x:
            print((len(output_str) / len(enc_sources)) * 100, end="\r")
            y = translate_enc_sentences(model,
                                        sents,
                                        device,
                                        tokenizertrg,
                                        max_length=150)
            output_str.extend(y)
        bleu = sacrebleu.corpus_bleu(output_str, [target_str])
        chrf = sacrebleu.corpus_chrf(output_str, [target_str])
        ter = sacrebleu.corpus_ter(output_str, [target_str])

        return bleu, chrf, ter
    elif search == "beam":
        # Decode each source individually with beam search. Assumes beam_search
        # accepts an encoded source sentence, like translate_enc_sentences above.
        for enc_source, enc_target in zip(enc_sources, enc_target_sents):
            prediction = beam_search(enc_source, device, tokenizersrc,
                                     tokenizertrg, n)
            target = sp.DecodeIds(enc_target.tolist())
            targets.append([target.split()])
            outputs.append(prediction.split())
            output_str.append(prediction)
            # target_str was already built from enc_target_sents above,
            # so the references are not appended again here.

    bleu = sacrebleu.corpus_bleu(output_str, [target_str])
    chrf = sacrebleu.corpus_chrf(output_str, [target_str])
    ter = sacrebleu.corpus_ter(output_str, [target_str])
    return bleu, chrf, ter
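
The three values returned above are sacrebleu score objects, so a caller typically reads their .score attributes. A sketch, assuming enc_sources and enc_target_sents were encoded elsewhere and that "src.model" / "trg.model" are hypothetical SentencePiece files:

# bleu, chrf and ter each expose a numeric .score field.
bleu, chrf, ter = get_scores(enc_sources, enc_target_sents, model, device,
                             "src.model", "trg.model", search="greedy")
print(f"BLEU {bleu.score:.2f}  chrF {chrf.score:.2f}  TER {ter.score:.2f}")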
Example #3
def compute_metrics(ref, hyp, hyp_order):
    refs = []
    hyps = []
    for id in hyp_order:
        for segment in hyp[id]:
            hyps.append(segment)
        try:
            for segment in ref[id]:
                refs.append(segment)
        except KeyError:
            sys.stderr.write('Error: there are no references for document' +
                             ' "' + id + '"\n')
            sys.exit(-1)

    metrics = []
    for n in range(len(hyps)):
        try:
            bleu = sacrebleu.corpus_bleu([hyps[n]], [[refs[n]]])
            chrf = sacrebleu.corpus_chrf([hyps[n]], [[refs[n]]])
        except EOFError:
            sys.stderr.write('Error: source and reference have different' +
                             ' lengths.\n')
            sys.exit(-1)
        metrics.append([bleu.score] + [chrf.score])
    return metrics
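
A usage sketch for compute_metrics, assuming ref and hyp map a document id to its list of segments (the ids and sentences are made up):

ref = {'doc1': ['The dog ran home.', 'It was late.']}
hyp = {'doc1': ['The dog runs home.', 'It was late.']}
for sent_bleu, sent_chrf in compute_metrics(ref, hyp, hyp_order=['doc1']):
    print(sent_bleu, sent_chrf)   # per-segment BLEU and chrF scores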
Example #4
def test_chrf_keep_whitespace(hypotheses, references, expected_score):
    score = sacrebleu.corpus_chrf(hypotheses, [references],
                                  char_order=6,
                                  word_order=0,
                                  beta=3,
                                  remove_whitespace=False).score
    assert abs(score - expected_score) < EPSILON
Example #5
def test_chrf_eff_order(hypotheses, references, expected_score):
    score = sacrebleu.corpus_chrf(hypotheses, [references],
                                  char_order=6,
                                  word_order=0,
                                  beta=3,
                                  eps_smoothing=False).score
    assert abs(score - expected_score) < EPSILON
Example #6
def test_chrf_keep_whitespace(hypotheses, references, expected_score):
    score = sacrebleu.corpus_chrf(hypotheses,
                                  references,
                                  6,
                                  3,
                                  remove_whitespace=False)
    assert abs(score - expected_score) < EPSILON
Example #7
def compute_metrics(hyp_dec_all,
                    ref_dec_all,
                    use_sacrebleu=True,
                    use_torchtext=True,
                    use_ter=False):
    metrics = {}

    # Sacrebleu
    if use_sacrebleu:
        metrics["sacrebleu_rawcorpusbleu"] = sacrebleu.raw_corpus_bleu(
            hyp_dec_all, [ref_dec_all]).score
        metrics["sacrebleu_bleu"] = sacrebleu.corpus_bleu(
            hyp_dec_all, [ref_dec_all]).score
        metrics["sacrebleu_chrf"] = sacrebleu.corpus_chrf(
            hyp_dec_all, [ref_dec_all]).score
        if use_ter:  # Quite slow
            metrics["sacrebleu_ter"] = sacrebleu.corpus_ter(
                hyp_dec_all, [ref_dec_all]).score

    # Torchtext
    if use_torchtext:
        m_bleu_score = bleu_score([x.split(" ") for x in hyp_dec_all],
                                  [[x.split(" ")] for x in ref_dec_all])
        metrics["torchtext_bleu"] = m_bleu_score * 100
    return metrics
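
A quick illustrative call (the torchtext bleu_score import is assumed to be in scope, as in the snippet):

hyp_dec_all = ['the cat sat on a mat']
ref_dec_all = ['the cat sat on the mat']
print(compute_metrics(hyp_dec_all, ref_dec_all, use_ter=False))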
Example #8
 def __call__(self, ref_path: str, hyp_path: str) -> float:
     ref_streams = load_ref_streams(ref_path, detok=True)
     sys_stream = load_sys_stream(hyp_path, detok=True)
     chrf3_score = sacrebleu.corpus_chrf(sys_stream,
                                         ref_streams,
                                         order=6,
                                         beta=3,
                                         remove_whitespace=True)
     return np.round(float(chrf3_score.score * 100), 2)
Example #9
def chrf(hypotheses, references):
    """
    Character F-score from sacrebleu

    :param hypotheses: list of hypotheses (strings)
    :param references: list of references (strings)
    :return:
    """
    return sacrebleu.corpus_chrf(hypotheses=hypotheses, references=references)
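
A short usage sketch with illustrative sentences; note that, depending on the installed sacrebleu version, the references may need to be wrapped in an extra list (compare the later examples that pass [references]):

hyps = ['the cat sat on the mat']
refs = ['the cat sat on a mat']
print(chrf(hyps, refs))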
Example #10
def chrf(hypotheses, references):
    """
    Character F-score from sacrebleu

    :param hypotheses: list of hypotheses (strings)
    :param references: list of references (strings)
    :return:
    """
    return sacrebleu.corpus_chrf(hypotheses=hypotheses, references=references)
Example #11
def raw_corpus_chrf(hypotheses: Iterable[str],
                    references: Iterable[str]) -> float:
    """
    Simple wrapper around sacreBLEU's chrF implementation, without tokenization.

    :param hypotheses: Hypotheses stream.
    :param references: Reference stream.
    :return: chrF score as float between 0 and 1.
    """
    return sacrebleu.corpus_chrf(hypotheses, [references]).score
Example #12
def raw_corpus_chrf(hypotheses: Iterable[str], references: Iterable[str]) -> float:
    """
    Simple wrapper around sacreBLEU's chrF implementation, without tokenization.

    :param hypotheses: Hypotheses stream.
    :param references: Reference stream.
    :return: chrF score as float between 0 and 1.
    """
    return sacrebleu.corpus_chrf(hypotheses, references, order=sacrebleu.CHRF_ORDER, beta=sacrebleu.CHRF_BETA,
                                 remove_whitespace=True)
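
For comparison, a rough sketch of the same wrapper written against the keyword-based API used by the test examples earlier in this listing (char_order, beta, remove_whitespace); the exact defaults behind CHRF_ORDER and CHRF_BETA vary by sacrebleu version, so the values below are only an assumption:

def raw_corpus_chrf_keywords(hypotheses, references):
    # Sketch only: chrF3-style settings (character 6-grams, beta=3),
    # mirroring the chrf3 examples elsewhere on this page.
    return sacrebleu.corpus_chrf(hypotheses, [references],
                                 char_order=6, beta=3,
                                 remove_whitespace=True).score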
Example #13
def calculate_score_report(sys, ref, score_only):

    chrf = sacrebleu.corpus_chrf(sys, ref)
    bleu = sacrebleu.corpus_bleu(sys, ref)

    prefix = 'BLEU = ' if score_only else ''

    print('#### Score Report ####')
    print(chrf)
    print('{}{}'.format(prefix, bleu.format(score_only=score_only)))
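
A usage sketch, assuming sys is a list of hypothesis strings and ref is already a list of reference streams (one inner list per reference set):

sys_lines = ['the cat sat on a mat']
ref_lines = [['the cat sat on the mat']]
calculate_score_report(sys_lines, ref_lines, score_only=True)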
Example #14
def chrf(items):
    """chrF++ is a tool for automatic evaluation of machine translation output
    based on character n-gram precision and recall enhanced with word n-grams.
    Source: https://github.com/m-popovic/chrF
    Paper: https://www.aclweb.org/anthology/W15-3049.pdf

    Higher is better  # TODO I think
    """
    refs = list(zip(*items))[0]
    preds = list(zip(*items))[1]
    refs, preds = _sacreformat(refs, preds)
    return sacrebleu.corpus_chrf(preds, refs).score
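
A usage sketch; items is assumed to be a list of (reference, prediction) pairs, matching the zip unpacking above, and _sacreformat is the harness helper that reshapes them for sacrebleu:

items = [('the cat sat on the mat', 'the cat sat on a mat')]
print(chrf(items))   # corpus-level chrF over the paired segments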
Example #15
def chrf(hypotheses, references, remove_whitespace=True):
    """
    Character F-score from sacrebleu

    :param hypotheses: list of hypotheses (strings)
    :param references: list of references (strings)
    :param remove_whitespace: (bool)
    :return:
    """
    return sacrebleu.corpus_chrf(hypotheses=hypotheses,
                                 references=[references],
                                 remove_whitespace=remove_whitespace).score
Example #16
def chrf(hypotheses, references, remove_whitespace=True):
    """
    Character F-score from sacrebleu

    :param hypotheses: list of hypotheses (strings)
    :param references: list of references (strings)
    :param remove_whitespace: (bool)
    :return: character f-score (0 <= chf <= 1)
             see Breaking Change in sacrebleu v2.0
    """
    score = sacrebleu.corpus_chrf(hypotheses=hypotheses,
                                  references=[references],
                                  remove_whitespace=remove_whitespace).score
    return score / 100
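
A quick check of the wrapper above with illustrative strings; since sacrebleu 2.0 reports chrF on a 0-100 scale, the division keeps the result in [0, 1]:

value = chrf(['the cat sat on the mat'], ['the cat sat on a mat'])
assert 0.0 <= value <= 1.0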
Example #17
 def evaluate_batch(self, summaries, references, aggregate=True):
     if aggregate:
         score = sacrebleu.corpus_chrf(summaries,
                                       references,
                                       order=self.ncorder,
                                       beta=self.beta)
         score_dict = {"chrf": score.score}
         return score_dict
     else:
         p = Pool(processes=self.n_workers)
         results = p.starmap(self.evaluate_example,
                             zip(summaries, references))
         p.close()
         return results
Example #18
def write_evals(writer, experiment, translation, file_path, ref, src):
    writer = SummaryWriter("runs/{}-{}".format(experiment, translation))

    steps = int(translation)
    output_path = "translations/{}/{}".format(experiment, translation)
    with open(output_path, "r", encoding="utf-8") as infile:
        system_output = [x.strip() for x in infile.readlines()]
        bleu = sacrebleu.corpus_bleu(system_output, [ref])
        chrf = sacrebleu.corpus_chrf(system_output, [ref])
        rhyme_score, copied, reconstructed = concurrent_score(system_output,
                                                                languages[experiment],
                                                                ref, src)
        print(experiment, translation, bleu.score, rhyme_score, copied, reconstructed)

    wall = os.stat(file_path).st_mtime
    writer.add_scalar(experiment + "/CHRF", chrf.score, global_step=steps, walltime=wall)
    writer.add_scalar(experiment + "/BLEU", bleu.score, global_step=steps, walltime=wall)
    writer.add_scalar(experiment + "/Rhyme", rhyme_score, global_step=steps, walltime=wall)
    writer.add_scalar(experiment + "/Copied", copied, global_step=steps, walltime=wall)
    writer.add_scalar(experiment + "/Reconstructed", reconstructed, global_step=steps, walltime=wall)
    writer.flush()
Example #19
def eval_measure(gold, sys, eval_type='bleu'):
    ''' Evaluation measure

    This takes in gold labels and system outputs and evaluates their
    accuracy. It currently supports:
    * Accuracy (acc), percentage of labels that match
    * Pearson's correlation coefficient (pearson)
    * BLEU score (bleu)
    * BLEU_detok, on detokenized references and translations, with internal tokenization
    :param gold: the correct labels (reference)
    :param sys: the system outputs (hypothesis)
    :param eval_type: The type of evaluation to do (bleu, chrf3, hlepor)
    '''
    if eval_type == EVAL_TYPE_BLEU:
        # make sure score is 0-based instead of 100-based
        return corpus_bleu(sys, [gold]).score / 100.
    elif eval_type == EVAL_TYPE_CHRF3:
        return corpus_chrf(sys, [gold], beta=3).score
    elif eval_type == EVAL_TYPE_HLEPOR:
        return hlepor_score(sys, gold)
    else:
        raise NotImplementedError('Unknown eval type in eval_measure: %s' % eval_type)
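
A usage sketch, assuming the EVAL_TYPE_* constants (not shown in the snippet) are plain strings such as 'bleu' and 'chrf3':

gold = ['the cat sat on the mat']
system = ['the cat sat on a mat']
print(eval_measure(gold, system, eval_type='bleu'))    # BLEU rescaled to 0-1
print(eval_measure(gold, system, eval_type='chrf3'))   # chrF with beta=3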
Example #20
 def chrf(self, hypo, groundtruth=None, lc=False):
     if groundtruth is None:
         ref = self._refs_for_sacre
     else:
         if isinstance(groundtruth[0], str):
             ref = [groundtruth]
         else:
             ref = groundtruth
     try:
         chrf = sacrebleu.corpus_chrf([(x.lower() if lc else x)
                                       for x in hypo],
                                      [[(x.lower() if lc else x) for x in y]
                                       for y in ref])
         return chrf.score
     except IndexError:
         logging.info("Found empty lines.")
         print(traceback.format_exc())
         return 0.
     except ZeroDivisionError:
         logging.info("Empty reference")
         print(traceback.format_exc())
         return 0.
Example #21
def validate(model,
             test_data,
             golden_file,
             beam_size=8,
             alpha=0.6,
             max_time_step=100):
    """For development Only"""
    pp = PostProcess()

    ref_stream = []
    for line in open(golden_file + '.input_clean'):
        if line.startswith('# ::tokens '):
            o = json.loads(line[len('# ::tokens '):].strip())
            ref_stream.append(' '.join(o).lower())
    # gold model output
    graph, gold_sys_stream, _, abstract = read_file(golden_file + '.preproc')
    ref_streams = [ref_stream]

    sys_stream = []
    for batch in test_data:
        res = generate_batch(model, batch, beam_size, alpha, max_time_step)
        sys_stream.extend(res['token'])

    assert len(sys_stream) == len(ref_stream)
    sys_stream = [
        pp.post_process(o, abstract[i], graph[i])
        for i, o in enumerate(sys_stream)
    ]

    bleu = sacrebleu.corpus_bleu(sys_stream,
                                 ref_streams,
                                 force=True,
                                 lowercase=True,
                                 tokenize='none').score
    chrf = sacrebleu.corpus_chrf(sys_stream, ref_stream)

    return bleu, chrf
Example #22
def validate(model, test_data, beam_size=8, alpha=0.6, max_time_step=100):
    """For development Only"""
    pp = PostProcess()

    ref_stream = []
    sys_stream = []
    for batch in test_data:
        res = generate_batch(model, batch, beam_size, alpha, max_time_step)
        sys_stream.extend(res['token'])
        ref_stream.extend(batch['target'])

    assert len(sys_stream) == len(ref_stream)
    sys_stream = [pp.post_process(o) for o in sys_stream]
    ref_stream = [' '.join(o) for o in ref_stream]
    ref_streams = [ref_stream]

    bleu = sacrebleu.corpus_bleu(sys_stream,
                                 ref_streams,
                                 force=True,
                                 lowercase=False,
                                 tokenize='none').score
    chrf = sacrebleu.corpus_chrf(sys_stream, ref_stream)

    return bleu, chrf
Example #23
 def score_corpus_multiprocess(self, hypothesis: List[str],
                               references: List[List[str]]) -> float:
     if self.n_workers == 1:
         corpus_score = sb.corpus_chrf(hypothesis, references[0]).score
     else:
         batches = list(
             self._batch(hypothesis, references, n_batches=self.n_workers))
         corpus_statistics = [0 for _ in range(sb.CHRF_ORDER * 3)]
         with ProcessPoolExecutor(max_workers=self.n_workers) as executor:
             futures = [
                 executor.submit(sb.get_corpus_statistics, b[0], b[1][0])
                 for b in batches
             ]
             progress = as_completed(futures)
             if self.verbose:
                 progress = tqdm(progress)
             for future in progress:
                 stats = future.result()
                 for i in range(sb.CHRF_ORDER * 3):
                     corpus_statistics[i] += stats[i]
         avg_precision, avg_recall = sb._avg_precision_and_recall(
             corpus_statistics, sb.CHRF_ORDER)
         corpus_score = sb._chrf(avg_precision, avg_recall)
     return corpus_score
Example #24
def raw_corpus_chrf(hypotheses: Iterable[str],
                    references: Iterable[str]) -> float:
    return sacrebleu.corpus_chrf(hypotheses, references,
                                 order=sacrebleu.CHRF_ORDER,
                                 beta=sacrebleu.CHRF_BETA,
                                 remove_whitespace=True)
Example #25
def test_chrf(hypotheses, references, expected_score):
    score = sacrebleu.corpus_chrf(hypotheses, [references], 6, 3).score
    assert abs(score - expected_score) < EPSILON
Example #26
#
# sys = ['The dog runs home hi.', 'the dog runs home hi.']
# ref = ['The dog ran home hi.', 'the dog runs home hello.']
# mf1 = sacrebleu.corpus_rebleu2(sys, [ref], average='macro', word_class=True)
# mf1_old = sacrebleu.corpus_rebleu2(sys, [ref], average='macro', word_class=False)
# print(mf1.score)
# print(mf1_old.score)


sys = ['The dog runs home now.']
ref = ['The dog ran home now.']
mf1 = sacrebleu.corpus_rebleu2(sys, [ref], average='macro', word_class=True)
mf1_f1 = sacrebleu.corpus_rebleu2(sys, [ref], average='macro', word_class=True, measure_name='f1')
mf1_old = sacrebleu.corpus_rebleu2(sys, [ref], average='macro', word_class=False)
mf1_old_f1 = sacrebleu.corpus_rebleu2(sys, [ref], average='macro', word_class=False, measure_name='f1')
chrf = sacrebleu.corpus_chrf(sys, ref)
print("sys:", sys)
print("ref:", ref)
print("mf1 new:", mf1.score)
print("mf1 f1 new:", mf1_f1.score)
print("mf1 old:", mf1_old.score)
print("mf1 f1 old:", mf1_old_f1.score)
print("chrf:", chrf)

print("-------------")

sys = ['The dog runs home now.']
ref = ['The dog runs home later.']
mf1 = sacrebleu.corpus_rebleu2(sys, [ref], average='macro', word_class=True)
mf1_f1 = sacrebleu.corpus_rebleu2(sys, [ref], average='macro', word_class=True, measure_name='f1')
mf1_old = sacrebleu.corpus_rebleu2(sys, [ref], average='macro', word_class=False)
Example #27
    prev = [' '.join(o) for o in pred_sys_stream]

    # choose one (gold or pred) and postprocess
    sys_stream = pred_sys_stream
    sys_stream = [
        pp.post_process(o, abstract[i], graph[i])
        for i, o in enumerate(sys_stream)
    ]

    bleu = sacrebleu.corpus_bleu(sys_stream,
                                 ref_streams,
                                 force=True,
                                 lowercase=True,
                                 tokenize='none').score
    chrf = sacrebleu.corpus_chrf(sys_stream, ref_stream)
    all_sent_chrf = [
        sacrebleu.sentence_chrf(x, y) for x, y in zip(sys_stream, ref_stream)
    ]
    avg_sent_chrf = sum(all_sent_chrf) / len(all_sent_chrf)
    if args.output:
        with open(args.pred_file + '.final', 'w') as fo:
            for x in sys_stream:
                fo.write(x + '\n')

        with open(args.pred_file + '.ref', 'w') as fo:
            for x in ref_stream:
                fo.write(x + '\n')
    print(avg_sent_chrf)
    print(bleu, chrf)
Example #28
 def score(self, src: List[str], cand: List[str],
           ref: List[str]) -> chrFResult:
     chrf = sacrebleu.corpus_chrf(cand, [ref])
     return chrFResult(chrf.score / 100, [], src, cand, ref, self.name)
Example #29
def score_individual_books(
    book_dict: dict,
    src_iso: str,
    predictions_detok_path: str,
    scorers: Set[str],
    config: Config,
    ref_projects: Set[str],
):
    overall_sys: List[str] = []
    book_scores: List[PairScore] = []

    for book in book_dict.keys():
        for trg_iso, book_tuple in book_dict[book].items():
            pair_sys = book_tuple[0]
            pair_refs = book_tuple[1]
            overall_sys.extend(pair_sys)

            bleu_score = None
            if "bleu" in scorers:
                bleu_score = sacrebleu.corpus_bleu(
                    pair_sys,
                    pair_refs,
                    lowercase=True,
                    tokenize=config.data.get("sacrebleu_tokenize", "13a"),
                )

            if "sentencebleu" in scorers:
                write_sentence_bleu(
                    predictions_detok_path,
                    pair_sys,
                    pair_refs,
                    lowercase=True,
                    tokenize=config.data.get("sacrebleu_tokenize", "13a"),
                )

            other_scores: Dict[str, float] = {}
            if "chrf3" in scorers:
                chrf3_score = sacrebleu.corpus_chrf(pair_sys,
                                                    pair_refs,
                                                    order=6,
                                                    beta=3,
                                                    remove_whitespace=True)
                other_scores["CHRF3"] = np.round(
                    float(chrf3_score.score * 100), 2)

            if "meteor" in scorers:
                meteor_score = compute_meteor_score(trg_iso, pair_sys,
                                                    pair_refs)
                if meteor_score is not None:
                    other_scores["METEOR"] = meteor_score

            if "wer" in scorers:
                wer_score = compute_wer_score(pair_sys,
                                              cast(List[str], pair_refs))
                if wer_score >= 0:
                    other_scores["WER"] = wer_score

            if "ter" in scorers:
                ter_score = compute_ter_score(pair_sys, pair_refs)
                if ter_score >= 0:
                    other_scores["TER"] = ter_score
            score = PairScore(book, src_iso, trg_iso, bleu_score,
                              len(pair_sys), ref_projects, other_scores)
            book_scores.append(score)
    return book_scores
Example #30
def test_checkpoint(
    config: Config,
    force_infer: bool,
    by_book: bool,
    ref_projects: Set[str],
    checkpoint_path: Path,
    step: int,
    scorers: Set[str],
    books: Set[int],
) -> List[PairScore]:
    config.set_seed()
    vref_paths: List[str] = []
    features_file_names: List[str] = []
    predictions_file_names: List[str] = []
    refs_patterns: List[str] = []
    predictions_detok_file_names: List[str] = []
    suffix_str = "_".join(map(lambda n: book_number_to_id(n), sorted(books)))
    if len(suffix_str) > 0:
        suffix_str += "-"
    suffix_str += "avg" if step == -1 else str(step)

    features_file_name = "test.src.txt"
    if (config.exp_dir / features_file_name).is_file():
        # all test data is stored in a single file
        vref_paths.append("test.vref.txt")
        features_file_names.append(features_file_name)
        predictions_file_names.append(f"test.trg-predictions.txt.{suffix_str}")
        refs_patterns.append("test.trg.detok*.txt")
        predictions_detok_file_names.append(
            f"test.trg-predictions.detok.txt.{suffix_str}")
    else:
        # test data is split into separate files
        for src_iso in sorted(config.src_isos):
            for trg_iso in sorted(config.trg_isos):
                if src_iso == trg_iso:
                    continue
                prefix = f"test.{src_iso}.{trg_iso}"
                features_file_name = f"{prefix}.src.txt"
                if (config.exp_dir / features_file_name).is_file():
                    vref_paths.append(f"{prefix}.vref.txt")
                    features_file_names.append(features_file_name)
                    predictions_file_names.append(
                        f"{prefix}.trg-predictions.txt.{suffix_str}")
                    refs_patterns.append(f"{prefix}.trg.detok*.txt")
                    predictions_detok_file_names.append(
                        f"{prefix}.trg-predictions.detok.txt.{suffix_str}")

    checkpoint_name = "averaged checkpoint" if step == -1 else f"checkpoint {step}"

    features_paths: List[Union[str, List[str]]] = []
    predictions_paths: List[str] = []
    for i in range(len(predictions_file_names)):
        predictions_path = config.exp_dir / predictions_file_names[i]
        if force_infer or not predictions_path.is_file():
            features_path = config.exp_dir / features_file_names[i]
            vref_path = config.exp_dir / vref_paths[i]
            if vref_path.is_file():
                features_paths.append([str(features_path), str(vref_path)])
            else:
                features_paths.append(str(features_path))
            predictions_paths.append(str(predictions_path))
    if len(predictions_paths) > 0:
        runner = create_runner(config)
        print(f"Inferencing {checkpoint_name}...")
        runner.infer_multiple(features_paths,
                              predictions_paths,
                              checkpoint_path=str(checkpoint_path))

    print(f"Scoring {checkpoint_name}...")
    default_src_iso = config.default_src_iso
    scores: List[PairScore] = []
    overall_sys: List[str] = []
    overall_refs: List[List[str]] = []
    for vref_file_name, features_file_name, predictions_file_name, refs_pattern, predictions_detok_file_name in zip(
            vref_paths, features_file_names, predictions_file_names,
            refs_patterns, predictions_detok_file_names):
        src_iso = default_src_iso
        if features_file_name != "test.src.txt":
            src_iso = features_file_name.split(".")[1]
        dataset, book_dict = load_test_data(
            vref_file_name,
            features_file_name,
            predictions_file_name,
            refs_pattern,
            predictions_detok_file_name,
            ref_projects,
            config,
            books,
            by_book,
        )

        for trg_iso, (pair_sys, pair_refs) in dataset.items():
            start_index = len(overall_sys)
            overall_sys.extend(pair_sys)
            for i, ref in enumerate(pair_refs):
                if i == len(overall_refs):
                    overall_refs.append([""] * start_index)
                overall_refs[i].extend(ref)
            # ensure that all refs are the same length as the sys
            for overall_ref in filter(lambda r: len(r) < len(overall_sys),
                                      overall_refs):
                overall_ref.extend([""] *
                                   (len(overall_sys) - len(overall_ref)))
            bleu_score = None
            if "bleu" in scorers:
                bleu_score = sacrebleu.corpus_bleu(
                    pair_sys,
                    cast(List[Iterable[str]], pair_refs),
                    lowercase=True,
                    tokenize=config.data.get("sacrebleu_tokenize", "13a"),
                )

            if "sentencebleu" in scorers:
                write_sentence_bleu(
                    predictions_detok_file_name,
                    pair_sys,
                    cast(List[List[str]], pair_refs),
                    lowercase=True,
                    tokenize=config.data.get("sacrebleu_tokenize", "13a"),
                )

            other_scores: Dict[str, float] = {}
            if "chrf3" in scorers:
                chrf3_score = sacrebleu.corpus_chrf(pair_sys,
                                                    cast(
                                                        List[Iterable[str]],
                                                        pair_refs),
                                                    order=6,
                                                    beta=3,
                                                    remove_whitespace=True)
                other_scores["CHRF3"] = np.round(
                    float(chrf3_score.score * 100), 2)

            if "meteor" in scorers:
                meteor_score = compute_meteor_score(
                    trg_iso, pair_sys, cast(List[Iterable[str]], pair_refs))
                if meteor_score is not None:
                    other_scores["METEOR"] = meteor_score

            if "wer" in scorers:
                wer_score = compute_wer_score(pair_sys,
                                              cast(List[str], pair_refs))
                if wer_score >= 0:
                    other_scores["WER"] = wer_score

            if "ter" in scorers:
                ter_score = compute_ter_score(
                    pair_sys, cast(List[Iterable[str]], pair_refs))
                if ter_score >= 0:
                    other_scores["TER"] = ter_score

            scores.append(
                PairScore("ALL", src_iso, trg_iso, bleu_score, len(pair_sys),
                          ref_projects, other_scores))
            if by_book is True:
                if len(book_dict) != 0:
                    book_scores = score_individual_books(
                        book_dict, src_iso, predictions_detok_file_name,
                        scorers, config, ref_projects)
                    scores.extend(book_scores)
                else:
                    print(
                        "Error: book_dict did not load correctly. Not scoring individual books."
                    )
    if len(config.src_isos) > 1 or len(config.trg_isos) > 1:
        bleu = sacrebleu.corpus_bleu(overall_sys,
                                     cast(List[Iterable[str]], overall_refs),
                                     lowercase=True)
        scores.append(
            PairScore("ALL", "ALL", "ALL", bleu, len(overall_sys),
                      ref_projects))

    scores_file_root = f"scores-{suffix_str}"
    if len(ref_projects) > 0:
        ref_projects_suffix = "_".join(sorted(ref_projects))
        scores_file_root += f"-{ref_projects_suffix}"
    with (config.exp_dir / f"{scores_file_root}.csv").open(
            "w", encoding="utf-8") as scores_file:
        if len(scores) > 0:
            scores[0].writeHeader(scores_file)
        for results in scores:
            results.write(scores_file)
    return scores