def test_degenerate_statistics(statistics, offset, expected_score):
    score = sacrebleu.compute_bleu(
        statistics[0].common, statistics[0].total, statistics[1], statistics[2],
        smooth_method='floor', smooth_value=offset).score / 100
    assert score == expected_score
def sentence_bleu(hypothesis, reference):
    bleu = _corpus_bleu(hypothesis, reference)
    # Add-one smoothing on the 2- to 4-gram statistics (unigrams are left
    # untouched) so short sentences do not collapse to a zero score.
    for i in range(1, 4):
        bleu.counts[i] += 1
        bleu.totals[i] += 1
    bleu = compute_bleu(
        bleu.counts,
        bleu.totals,
        bleu.sys_len,
        bleu.ref_len,
        smooth_method='exp',
    )
    return bleu.score
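# Hedged usage sketch for sentence_bleu() above. The helper names are
# assumptions: _corpus_bleu is taken to wrap sacrebleu.corpus_bleu on a single
# hypothesis/reference pair, and compute_bleu to be sacrebleu's own top-level
# compute_bleu (sacrebleu 1.4.x, whose BLEU result carries mutable
# counts/totals lists). The example strings are made up.
import sacrebleu
from sacrebleu import compute_bleu

def _corpus_bleu(hypothesis, reference):
    return sacrebleu.corpus_bleu([hypothesis], [[reference]])

print(sentence_bleu("the cat sat on the mat",
                    "the cat is on the mat"))  # a float in [0, 100]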
def reduce_metrics(self, logging_outputs, criterion):
    # Runs inside a fairseq task; `metrics` is fairseq's metrics-logging
    # module and `torch` is imported at module scope.
    super().reduce_metrics(logging_outputs, criterion)
    if self.cfg.eval_wer:
        zero = torch.scalar_tensor(0.0)
        num_char_errors = sum(
            log.get("_num_char_errors", zero) for log in logging_outputs
        )
        num_chars = sum(log.get("_num_chars", zero) for log in logging_outputs)
        num_word_errors = sum(
            log.get("_num_word_errors", zero) for log in logging_outputs
        )
        num_words = sum(log.get("_num_words", zero) for log in logging_outputs)
        metrics.log_scalar("_num_char_errors", num_char_errors)
        metrics.log_scalar("_num_chars", num_chars)
        metrics.log_scalar("_num_word_errors", num_word_errors)
        metrics.log_scalar("_num_words", num_words)
        if num_chars > 0:
            metrics.log_derived(
                "uer",
                lambda meters: meters["_num_char_errors"].sum * 100.0
                / meters["_num_chars"].sum
                if meters["_num_chars"].sum > 0
                else float("nan"),
            )
        if num_words > 0:
            metrics.log_derived(
                "wer",
                lambda meters: meters["_num_word_errors"].sum * 100.0
                / meters["_num_words"].sum
                if meters["_num_words"].sum > 0
                else float("nan"),
            )
    if self.cfg.eval_bleu:
        # Log the raw sufficient statistics (per-order n-gram hits/totals and
        # lengths) and derive BLEU from their aggregated sums, so scores
        # combine correctly across batches and workers.
        len_keys = ["_bleu_sys_len", "_bleu_ref_len"]
        count_keys = [f"_bleu_counts_{i}" for i in range(4)]
        total_keys = [f"_bleu_totals_{i}" for i in range(4)]
        for k in len_keys + count_keys + total_keys:
            metrics.log_scalar(k, sum(log.get(k, 0) for log in logging_outputs))

        import sacrebleu

        metrics.log_derived(
            "bleu",
            lambda meters: sacrebleu.compute_bleu(
                correct=[meters[k].sum for k in count_keys],
                total=[meters[k].sum for k in total_keys],
                sys_len=meters["_bleu_sys_len"].sum,
                ref_len=meters["_bleu_ref_len"].sum,
                smooth_method="exp",
            ).score,
        )
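# A minimal sketch of the per-batch logging_output that reduce_metrics()
# above expects (the function name and use of sacrebleu.corpus_bleu here are
# assumptions, not fairseq's exact code): each step logs the four n-gram
# hit/total counts plus the two lengths under the same keys the task reads.
import sacrebleu

def make_logging_output(hyps, refs):
    bleu = sacrebleu.corpus_bleu(hyps, [refs])
    out = {"_bleu_sys_len": bleu.sys_len, "_bleu_ref_len": bleu.ref_len}
    for i in range(4):
        out[f"_bleu_counts_{i}"] = bleu.counts[i]
        out[f"_bleu_totals_{i}"] = bleu.totals[i]
    return out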
def compute_bleu(meters):
    import inspect
    import sacrebleu

    # sacrebleu renamed the `smooth` keyword to `smooth_method` around
    # version 1.4; inspect the installed signature and use whichever
    # spelling it accepts.
    fn_sig = inspect.getfullargspec(sacrebleu.compute_bleu)[0]
    if 'smooth_method' in fn_sig:
        smooth = {'smooth_method': 'exp'}
    else:
        smooth = {'smooth': 'exp'}
    bleu = sacrebleu.compute_bleu(
        correct=meters['_bleu_counts'].sum,
        total=meters['_bleu_totals'].sum,
        sys_len=meters['_bleu_sys_len'].sum,
        ref_len=meters['_bleu_ref_len'].sum,
        **smooth,
    )
    return round(bleu.score, 2)
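# Smoke test for the shim above using a stand-in meter type (AggMeter is made
# up; fairseq's real meters likewise expose a .sum). The statistics are
# invented, not from any real evaluation.
class AggMeter:
    def __init__(self, total):
        self.sum = total

meters = {
    "_bleu_counts": AggMeter([8, 5, 3, 2]),
    "_bleu_totals": AggMeter([10, 9, 8, 7]),
    "_bleu_sys_len": AggMeter(10),
    "_bleu_ref_len": AggMeter(11),
}
print(compute_bleu(meters))  # corpus BLEU rounded to two decimals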
def score_cached_corpus(self, sent_ids, cached_stats):
    """
    Score a corpus using SacreBLEU score with cache

    Args:
        sent_ids: The sentence ids for reference and output corpora
        cached_stats: A list of cached statistics

    Returns:
        A tuple containing a single value for the SacreBLEU score and a
        string summarizing auxiliary information
    """
    if len(cached_stats) == 0:
        return 0.0, None
    counts, totals, sys_len, ref_len = zip(*cached_stats)
    # Sum the per-sentence statistics over the selected sentence ids only.
    counts, totals, sys_len, ref_len = [
        np.sum(np.array(x)[sent_ids], 0)
        for x in [counts, totals, sys_len, ref_len]
    ]
    return sacrebleu.compute_bleu(
        counts, totals, sys_len, ref_len,
        smooth_method=self.smooth_method,
        smooth_value=self.smooth_value,
        use_effective_order=self.use_effective_order,
    ).score, None
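# One plausible way to build the cached_stats consumed above (an assumption;
# the real cache-building code is not shown here): one
# (counts, totals, sys_len, ref_len) tuple per sentence, so any subset of
# sentences can later be re-scored via the numpy sums.
import sacrebleu

def cache_stats(hypotheses, references):
    cached = []
    for hyp, ref in zip(hypotheses, references):
        bleu = sacrebleu.corpus_bleu([hyp], [[ref]])
        cached.append((bleu.counts, bleu.totals, bleu.sys_len, bleu.ref_len))
    return cached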
def calc_bleu_from_stats(sentence_stats: pd.DataFrame) -> sacrebleu.BLEU:
    corpus_stats = sentence_stats.sum(axis=0)
    corpus_bleu = sacrebleu.compute_bleu(
        correct=[
            corpus_stats.correct_1_grams,
            corpus_stats.correct_2_grams,
            corpus_stats.correct_3_grams,
            corpus_stats.correct_4_grams,
        ],
        total=[
            corpus_stats.total_1_grams,
            corpus_stats.total_2_grams,
            corpus_stats.total_3_grams,
            corpus_stats.total_4_grams,
        ],
        sys_len=corpus_stats.translation_length,
        ref_len=corpus_stats.reference_length,
    )
    return corpus_bleu
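# Illustrative input only: a two-row stats frame with the column names the
# function above assumes; all numbers are invented.
import pandas as pd

sentence_stats = pd.DataFrame([
    {"correct_1_grams": 5, "correct_2_grams": 3, "correct_3_grams": 2,
     "correct_4_grams": 1, "total_1_grams": 6, "total_2_grams": 5,
     "total_3_grams": 4, "total_4_grams": 3,
     "translation_length": 6, "reference_length": 7},
    {"correct_1_grams": 4, "correct_2_grams": 2, "correct_3_grams": 1,
     "correct_4_grams": 1, "total_1_grams": 5, "total_2_grams": 4,
     "total_3_grams": 3, "total_4_grams": 2,
     "translation_length": 5, "reference_length": 5},
])
print(calc_bleu_from_stats(sentence_stats).score)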
def score_corpus_multiprocess(self, hypothesis: List[str],
                              references: List[List[str]]) -> float:
    tokenizer = get_optional_dict(self.extra_args, 'bleu_tokenizer', 'none')
    if self.n_workers == 1:
        corpus_score = sb.corpus_bleu(
            hypothesis, references, force=True, tokenize=tokenizer).score
    else:
        batches = list(
            self._batch(hypothesis, references, n_batches=self.n_workers))
        ref_len, sys_len = 0, 0
        correct = [0 for _ in range(sb.NGRAM_ORDER)]
        total = [0 for _ in range(sb.NGRAM_ORDER)]
        with ProcessPoolExecutor(max_workers=self.n_workers) as executor:
            futures = [
                executor.submit(sb.corpus_bleu, b[0], b[1], force=True,
                                tokenize=tokenizer)
                for b in batches
            ]
            progress = as_completed(futures)
            if self.verbose:
                progress = tqdm(progress)
            # Merge each batch's sufficient statistics, then compute the
            # corpus score once at the end; BLEU is not an average of
            # per-batch scores.
            for future in progress:
                s = future.result()
                ref_len += s.ref_len
                sys_len += s.sys_len
                for n in range(sb.NGRAM_ORDER):
                    correct[n] += s.counts[n]
                    total[n] += s.totals[n]
        corpus_score = sb.compute_bleu(
            correct, total, sys_len, ref_len, smooth_method='exp').score
    return corpus_score
def corpus_bleu(sys_stream, ref_streams, smooth='exp', smooth_floor=0.0,
                force=False, lowercase=False,
                tokenize=sacrebleu.DEFAULT_TOKENIZER,
                use_effective_order=False) -> sacrebleu.BLEU:
    """Produces BLEU scores along with its sufficient statistics from a
    source against one or more references.

    :param sys_stream: The system stream (a sequence of segments)
    :param ref_streams: A list of one or more reference streams (each a
        sequence of segments)
    :param smooth: The smoothing method to use
    :param smooth_floor: For 'floor' smoothing, the floor to use
    :param force: Ignore data that looks already tokenized
    :param lowercase: Lowercase the data
    :param tokenize: The tokenizer to use
    :return: a BLEU object containing everything you'd want
    """
    # Add some robustness to the input arguments
    if isinstance(sys_stream, str):
        sys_stream = [sys_stream]
    if isinstance(ref_streams, str):
        ref_streams = [[ref_streams]]

    sys_len = 0
    ref_len = 0
    correct = [0 for n in range(sacrebleu.NGRAM_ORDER)]
    total = [0 for n in range(sacrebleu.NGRAM_ORDER)]

    fhs = [sys_stream] + ref_streams
    for lines in zip_longest(*fhs):
        if None in lines:
            raise EOFError("Source and reference streams have different "
                           "lengths!")
        if lowercase:
            lines = [x.lower() for x in lines]
        output, *refs = [
            sacrebleu.TOKENIZERS[tokenize](x.rstrip()) for x in lines
        ]
        ref_ngrams, closest_diff, closest_len = sacrebleu.ref_stats(
            output, refs)
        sys_len += len(output.split())
        ref_len += closest_len
        sys_ngrams = sacrebleu.extract_ngrams(output)
        for ngram in sys_ngrams.keys():
            n = len(ngram.split())
            correct[n - 1] += min(sys_ngrams[ngram], ref_ngrams.get(ngram, 0))
            total[n - 1] += sys_ngrams[ngram]

    # Sum the statistics across distributed workers before scoring;
    # all_reduce is assumed to be provided by the surrounding training code.
    correct = all_reduce(correct)
    total = all_reduce(total)
    sys_len = all_reduce(sys_len)
    ref_len = all_reduce(ref_len)

    return sacrebleu.compute_bleu(correct, total, sys_len, ref_len, smooth,
                                  smooth_floor, use_effective_order)
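# corpus_bleu() above relies on an all_reduce() helper defined elsewhere; a
# minimal torch.distributed version (an assumption, not the original code)
# that sums scalars or lists of counts across ranks and is a no-op when not
# running distributed:
import torch
import torch.distributed as dist

def all_reduce(value):
    if not (dist.is_available() and dist.is_initialized()):
        return value
    t = torch.tensor(value, dtype=torch.float64)
    dist.all_reduce(t, op=dist.ReduceOp.SUM)
    if isinstance(value, list):
        return [int(v) for v in t.tolist()]
    return int(t.item())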
def test_scoring(statistics, expected_score):
    score = sacrebleu.compute_bleu(
        statistics[0].common, statistics[0].total,
        statistics[1], statistics[2]).score / 100
    assert abs(score - expected_score) < EPSILON
def compute_bleu(correct, total, hyp_len, ref_len):
    bleu = sacrebleu.compute_bleu(correct, total, hyp_len, ref_len)
    return bleu.score