Example #1
def test_degenerate_statistics(statistics, offset, expected_score):
    score = sacrebleu.compute_bleu(statistics[0].common,
                                   statistics[0].total,
                                   statistics[1],
                                   statistics[2],
                                   smooth_method='floor',
                                   smooth_value=offset).score / 100
    assert score == expected_score
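The `statistics` fixture itself is not shown. A minimal sketch of the shape the call implies, assuming a sacrebleu 1.x release that still exposes `compute_bleu` (the fixture values below are made up for illustration):

import collections
import sacrebleu

# Hypothetical stand-in for the fixture: per-order matched ("common") and
# produced ("total") n-gram counts, then hypothesis length and reference length.
Counts = collections.namedtuple("Counts", ["common", "total"])
statistics = (Counts(common=[0, 0, 0, 0], total=[5, 4, 3, 2]), 5, 6)

score = sacrebleu.compute_bleu(
    statistics[0].common, statistics[0].total, statistics[1], statistics[2],
    smooth_method="floor", smooth_value=0.1,
).score / 100
# With zero n-gram matches, floor smoothing keeps every precision nonzero,
# so the score stays small but defined.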
Example #2
def sentence_bleu(hypothesis, reference):
    bleu = _corpus_bleu(hypothesis, reference)
    for i in range(1, 4):
        bleu.counts[i] += 1
        bleu.totals[i] += 1
    bleu = compute_bleu(
        bleu.counts, bleu.totals,
        bleu.sys_len, bleu.ref_len, smooth_method='exp'
    )
    return bleu.score
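Here `_corpus_bleu` and `compute_bleu` come from the surrounding module (not shown); the loop applies add-one smoothing to the 2- to 4-gram statistics so a single short sentence does not collapse to BLEU 0. A rough self-contained equivalent using only the public sacrebleu 1.x API (the function name and structure below are illustrative, not the original helpers):

import sacrebleu

def sentence_bleu_add_one(hypothesis, reference):
    # Sufficient statistics for one sentence pair.
    stats = sacrebleu.corpus_bleu([hypothesis], [[reference]])
    counts = list(stats.counts)
    totals = list(stats.totals)
    # Add-one smoothing on the higher-order counts, mirroring the loop above.
    for i in range(1, 4):
        counts[i] += 1
        totals[i] += 1
    return sacrebleu.compute_bleu(
        counts, totals, stats.sys_len, stats.ref_len, smooth_method="exp"
    ).score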
Example #3
    def reduce_metrics(self, logging_outputs, criterion):
        super().reduce_metrics(logging_outputs, criterion)

        if self.cfg.eval_wer:
            zero = torch.scalar_tensor(0.0)
            num_char_errors = sum(
                log.get("_num_char_errors", zero) for log in logging_outputs
            )
            num_chars = sum(log.get("_num_chars", zero) for log in logging_outputs)
            num_word_errors = sum(
                log.get("_num_word_errors", zero) for log in logging_outputs
            )
            num_words = sum(log.get("_num_words", zero) for log in logging_outputs)
            metrics.log_scalar("_num_char_errors", num_char_errors)
            metrics.log_scalar("_num_chars", num_chars)
            metrics.log_scalar("_num_word_errors", num_word_errors)
            metrics.log_scalar("_num_words", num_words)
            if num_chars > 0:
                metrics.log_derived(
                    "uer",
                    lambda meters: meters["_num_char_errors"].sum
                    * 100.0
                    / meters["_num_chars"].sum
                    if meters["_num_chars"].sum > 0
                    else float("nan"),
                )
            if num_words > 0:
                metrics.log_derived(
                    "wer",
                    lambda meters: meters["_num_word_errors"].sum
                    * 100.0
                    / meters["_num_words"].sum
                    if meters["_num_words"].sum > 0
                    else float("nan"),
                )
        if self.cfg.eval_bleu:
            len_keys = ["_bleu_sys_len", "_bleu_ref_len"]
            count_keys = [f"_bleu_counts_{i}" for i in range(4)]
            total_keys = [f"_bleu_totals_{i}" for i in range(4)]
            for k in len_keys + count_keys + total_keys:
                metrics.log_scalar(k, sum(log.get(k, 0) for log in logging_outputs))

            import sacrebleu

            metrics.log_derived(
                "bleu",
                lambda meters: sacrebleu.compute_bleu(
                    correct=[meters[k].sum for k in count_keys],
                    total=[meters[k].sum for k in total_keys],
                    sys_len=meters["_bleu_sys_len"].sum,
                    ref_len=meters["_bleu_ref_len"].sum,
                    smooth_method="exp",
                ).score,
            )
Example #4
 def compute_bleu(meters):
     import inspect
     import sacrebleu
     fn_sig = inspect.getfullargspec(sacrebleu.compute_bleu)[0]
     if 'smooth_method' in fn_sig:
         smooth = {'smooth_method': 'exp'}
     else:
         smooth = {'smooth': 'exp'}
     bleu = sacrebleu.compute_bleu(
         correct=meters['_bleu_counts'].sum,
         total=meters['_bleu_totals'].sum,
         sys_len=meters['_bleu_sys_len'].sum,
         ref_len=meters['_bleu_ref_len'].sum,
         **smooth)
     return round(bleu.score, 2)
Example #5
 def compute_bleu(meters):
     import inspect
     import sacrebleu
     fn_sig = inspect.getfullargspec(sacrebleu.compute_bleu)[0]
     if "smooth_method" in fn_sig:
         smooth = {"smooth_method": "exp"}
     else:
         smooth = {"smooth": "exp"}
     bleu = sacrebleu.compute_bleu(
         correct=meters["_bleu_counts"].sum,
         total=meters["_bleu_totals"].sum,
         sys_len=meters["_bleu_sys_len"].sum,
         ref_len=meters["_bleu_ref_len"].sum,
         **smooth)
     return round(bleu.score, 2)
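Both versions inspect the signature of `compute_bleu` because older sacrebleu releases called the keyword `smooth` while later 1.x releases renamed it to `smooth_method`; the helper therefore works on either. A quick usage sketch showing the meter objects it expects (anything exposing a `.sum` attribute with the aggregated statistic; `FakeMeter` and the numbers are made up):

class FakeMeter:
    """Minimal stand-in for a meter: only the .sum attribute is read."""
    def __init__(self, value):
        self.sum = value

meters = {
    "_bleu_counts": FakeMeter([9, 6, 4, 2]),
    "_bleu_totals": FakeMeter([12, 11, 10, 9]),
    "_bleu_sys_len": FakeMeter(12),
    "_bleu_ref_len": FakeMeter(13),
}
print(compute_bleu(meters))  # corpus BLEU derived from the summed statistics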
Example #6
  def score_cached_corpus(self, sent_ids, cached_stats):
    """
    Score a corpus using SacreBLEU score with cache

    Args:
      sent_ids: The sentence ids for reference and output corpora
      cached_stats: A list of cached statistics

    Returns:
      A tuple containing a single value for the SacreBLEU score and a string summarizing auxiliary information
    """
    if len(cached_stats) == 0:
      return 0.0, None

    counts, totals, sys_len, ref_len = zip(*cached_stats)
    counts, totals, sys_len, ref_len = [np.sum(np.array(x)[sent_ids], 0) for x in [counts, totals, sys_len, ref_len]]

    return sacrebleu.compute_bleu(counts, totals, sys_len, ref_len, smooth_method=self.smooth_method, smooth_value=self.smooth_value, use_effective_order=self.use_effective_order).score, None
Example #7
def calc_bleu_from_stats(sentence_stats: pd.DataFrame) -> sacrebleu.BLEU:
    corpus_stats = sentence_stats.sum(axis=0)
    corpus_bleu = sacrebleu.compute_bleu(
        correct=[
            corpus_stats.correct_1_grams,
            corpus_stats.correct_2_grams,
            corpus_stats.correct_3_grams,
            corpus_stats.correct_4_grams,
        ],
        total=[
            corpus_stats.total_1_grams,
            corpus_stats.total_2_grams,
            corpus_stats.total_3_grams,
            corpus_stats.total_4_grams,
        ],
        sys_len=corpus_stats.translation_length,
        ref_len=corpus_stats.reference_length,
    )
    return corpus_bleu
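The per-sentence `sentence_stats` DataFrame is assumed to hold one row of BLEU sufficient statistics per segment, which the function sums column-wise before re-deriving a single corpus score. One possible way to build such a table (column names taken from the function above; everything else is an illustrative sketch on the 1.x API):

import pandas as pd
import sacrebleu

def sentence_stats_frame(hypotheses, references):
    # One row of sufficient statistics per hypothesis/reference pair,
    # using the column names calc_bleu_from_stats expects.
    rows = []
    for hyp, ref in zip(hypotheses, references):
        s = sacrebleu.corpus_bleu([hyp], [[ref]])
        row = {"translation_length": s.sys_len, "reference_length": s.ref_len}
        for n in range(4):
            row[f"correct_{n + 1}_grams"] = s.counts[n]
            row[f"total_{n + 1}_grams"] = s.totals[n]
        rows.append(row)
    return pd.DataFrame(rows)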
Example #8
 def score_corpus_multiprocess(self, hypothesis: List[str],
                               references: List[List[str]]) -> float:
     tokenizer = get_optional_dict(self.extra_args, 'bleu_tokenizer',
                                   'none')
     if self.n_workers == 1:
         corpus_score = sb.corpus_bleu(hypothesis,
                                       references,
                                       force=True,
                                       tokenize=tokenizer).score
     else:
         batches = list(
             self._batch(hypothesis, references, n_batches=self.n_workers))
         ref_len, sys_len = 0, 0
         correct = [0 for _ in range(sb.NGRAM_ORDER)]
         total = [0 for _ in range(sb.NGRAM_ORDER)]
         with ProcessPoolExecutor(max_workers=self.n_workers) as executor:
             futures = [
                 executor.submit(sb.corpus_bleu,
                                 b[0],
                                 b[1],
                                 force=True,
                                 tokenize=tokenizer) for b in batches
             ]
             progress = as_completed(futures)
             if self.verbose:
                 progress = tqdm(progress)
             for future in progress:
                 s = future.result()
                 ref_len += s.ref_len
                 sys_len += s.sys_len
                 for n in range(sb.NGRAM_ORDER):
                     correct[n] += s.counts[n]
                     total[n] += s.totals[n]
             corpus_score = sb.compute_bleu(correct,
                                            total,
                                            sys_len,
                                            ref_len,
                                            smooth_method='exp').score
     return corpus_score
Example #9
def corpus_bleu(sys_stream,
                ref_streams,
                smooth='exp',
                smooth_floor=0.0,
                force=False,
                lowercase=False,
                tokenize=sacrebleu.DEFAULT_TOKENIZER,
                use_effective_order=False) -> sacrebleu.BLEU:
    """Produces BLEU scores along with its sufficient statistics from a source
    against one or more references.

    :param sys_stream: The system stream (a sequence of segments)
    :param ref_streams: A list of one or more reference streams (each a
                        sequence of segments)
    :param smooth: The smoothing method to use
    :param smooth_floor: For 'floor' smoothing, the floor to use
    :param force: Ignore data that looks already tokenized
    :param lowercase: Lowercase the data
    :param tokenize: The tokenizer to use
    :return: a BLEU object containing everything you'd want
    """

    # Add some robustness to the input arguments
    if isinstance(sys_stream, str):
        sys_stream = [sys_stream]
    if isinstance(ref_streams, str):
        ref_streams = [[ref_streams]]

    sys_len = 0
    ref_len = 0

    correct = [0 for n in range(sacrebleu.NGRAM_ORDER)]
    total = [0 for n in range(sacrebleu.NGRAM_ORDER)]

    fhs = [sys_stream] + ref_streams
    for lines in zip_longest(*fhs):
        if None in lines:
            raise EOFError("Source and reference streams have different "
                           "lengths!")

        if lowercase:
            lines = [x.lower() for x in lines]

        output, *refs = [
            sacrebleu.TOKENIZERS[tokenize](x.rstrip()) for x in lines
        ]

        ref_ngrams, closest_diff, closest_len = sacrebleu.ref_stats(
            output, refs)

        sys_len += len(output.split())
        ref_len += closest_len

        sys_ngrams = sacrebleu.extract_ngrams(output)
        for ngram in sys_ngrams.keys():
            n = len(ngram.split())
            correct[n - 1] += min(sys_ngrams[ngram], ref_ngrams.get(ngram, 0))
            total[n - 1] += sys_ngrams[ngram]

    correct = all_reduce(correct)
    total = all_reduce(total)
    sys_len = all_reduce(sys_len)
    ref_len = all_reduce(ref_len)

    return sacrebleu.compute_bleu(correct, total, sys_len, ref_len, smooth,
                                  smooth_floor, use_effective_order)
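This variant relies on a few names from its surrounding module: `zip_longest` from `itertools`, the sacrebleu 1.x internals (`NGRAM_ORDER`, `TOKENIZERS`, `ref_stats`, `extract_ngrams`), and an `all_reduce` helper that sums each statistic across distributed workers before the final `compute_bleu` call. For single-process use, a no-op stand-in is enough (a sketch, not the original helper):

from itertools import zip_longest

def all_reduce(value):
    # With a single worker there is nothing to sum across processes.
    return value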
Example #10
def test_scoring(statistics, expected_score):
    score = sacrebleu.compute_bleu(statistics[0].common, statistics[0].total, statistics[1], statistics[2]).score / 100
    assert abs(score - expected_score) < EPSILON
Example #11
def compute_bleu(correct, total, hyp_len, ref_len):
    bleu = sacrebleu.compute_bleu(correct, total, hyp_len, ref_len)
    return bleu.score