import warnings
from collections import Counter

import pytest
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction

from ignite.engine import Engine
from ignite.exceptions import NotComputableError
from ignite.metrics.nlp import Bleu

# `corpus`, `data`, `size`, and `update` are fixtures/helpers assumed to be
# defined elsewhere in this test module.


def _test(metric_device):
    engine = Engine(update)
    m = Bleu(ngram=4, smooth="smooth2", average="micro")
    m.attach(engine, "bleu")

    engine.run(data=list(range(size)), max_epochs=1)

    assert "bleu" in engine.state.metrics

    ref_bleu = 0
    references = []
    candidates = []
    for _candidates, _references in data:
        references.append(_references[0])
        candidates.append(_candidates[0])

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        ref_bleu += corpus_bleu(
            references,
            candidates,
            weights=[0.25, 0.25, 0.25, 0.25],
            smoothing_function=SmoothingFunction().method2,
        )

    assert pytest.approx(engine.state.metrics["bleu"]) == ref_bleu
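# A minimal sketch of the `update`/`data`/`size` helpers the engine tests in
# this file rely on. The real fixtures are defined elsewhere in the module;
# the helper name and batch shape below are assumptions for illustration, not
# the canonical definitions.
def _make_engine_fixtures(batches):
    data = list(batches)
    size = len(data)

    def update(engine, i):
        # Each batch is a (candidates, references) pair consumed by the
        # attached Bleu metric.
        return data[i]

    return data, size, update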
def test_bleu_batch_macro():
    bleu = Bleu(ngram=4)

    # Batch size 3
    hypotheses = [corpus.cand_1, corpus.cand_2a, corpus.cand_2b]
    refs = [corpus.references_1, corpus.references_2, corpus.references_2]
    bleu.update((hypotheses, refs))

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        reference_bleu_score = (
            sentence_bleu(refs[0], hypotheses[0])
            + sentence_bleu(refs[1], hypotheses[1])
            + sentence_bleu(refs[2], hypotheses[2])
        ) / 3
    assert pytest.approx(bleu.compute()) == reference_bleu_score

    # Updating sample by sample must yield the same macro average.
    value = 0
    for _hypotheses, _refs in zip(hypotheses, refs):
        value += bleu._sentence_bleu(_refs, _hypotheses)
        bleu.update(([_hypotheses], [_refs]))

    ref_1 = value / len(refs)
    ref_2 = bleu.compute()

    assert pytest.approx(ref_1) == reference_bleu_score
    assert pytest.approx(ref_2) == reference_bleu_score
def test_n_gram_counter(candidates, references):
    bleu = Bleu(ngram=4)
    hyp_length, ref_length = bleu._n_gram_counter([references], [candidates], Counter(), Counter())
    assert hyp_length == len(candidates)

    # The reference length must be the one closest to the hypothesis length,
    # with ties resolved in favor of the shorter reference.
    ref_lens = (len(reference) for reference in references)
    closest_ref_len = min(ref_lens, key=lambda ref_len: (abs(ref_len - len(candidates)), ref_len))
    assert ref_length == closest_ref_len
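# A hedged sanity check, not part of the original suite: the min() key above
# mirrors NLTK's closest-reference-length rule, which breaks length ties in
# favor of the shorter reference. The test name and toy lengths are made up
# for illustration.
def test_closest_ref_length_tie_break():
    hyp_len = 4
    ref_lens = [3, 5]  # both one token away from the hypothesis
    closest = min(ref_lens, key=lambda ref_len: (abs(ref_len - hyp_len), ref_len))
    assert closest == 3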
def test_accumulation_micro_bleu():
    bleu = Bleu(ngram=4, smooth="smooth2", average="micro")
    bleu.update(([corpus.cand_1], [corpus.references_1]))
    bleu.update(([corpus.cand_2a], [corpus.references_2]))
    bleu.update(([corpus.cand_2b], [corpus.references_2]))
    bleu.update(([corpus.cand_3], [corpus.references_2]))

    # Micro averaging must match a single corpus-level computation.
    value = bleu._corpus_bleu(
        [corpus.references_1, corpus.references_2, corpus.references_2, corpus.references_2],
        [corpus.cand_1, corpus.cand_2a, corpus.cand_2b, corpus.cand_3],
    )
    assert bleu.compute() == value
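# A hedged illustration on inline toy sentences (not the shared `corpus`
# fixture): "micro" pools n-gram counts over the whole corpus before scoring,
# as nltk's corpus_bleu does, while "macro" averages per-sentence scores, so
# the two generally disagree on multi-sentence input.
def test_micro_vs_macro_averaging():
    hyps = [["the", "cat", "sat"], ["a", "dog", "barked", "loudly"]]
    refs = [[["the", "cat", "sat", "down"]], [["the", "dog", "barked", "loudly"]]]
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        micro = corpus_bleu(refs, hyps, smoothing_function=SmoothingFunction().method2)
        macro = sum(
            sentence_bleu(r, h, smoothing_function=SmoothingFunction().method2)
            for h, r in zip(hyps, refs)
        ) / len(hyps)
    assert micro != pytest.approx(macro)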
def test_wrong_inputs():
    with pytest.raises(ValueError, match=r"ngram order must be greater than zero"):
        Bleu(ngram=0)

    with pytest.raises(ValueError, match=r"Smooth is not valid"):
        Bleu(smooth="fake")

    with pytest.raises(ValueError, match=r"nb of candidates should be equal to nb of reference lists"):
        Bleu()._corpus_bleu(references=[[0], [0]], candidates=[[0]])

    with pytest.raises(NotComputableError):
        Bleu().compute()
def _test(candidates, references, average, smooth="no_smooth", smooth_nltk_fn=None, ngram_range=8):
    for i in range(1, ngram_range):
        weights = tuple([1 / i] * i)
        bleu = Bleu(ngram=i, average=average, smooth=smooth)

        if average == "macro":
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                reference = sentence_bleu(
                    references[0], candidates[0], weights=weights, smoothing_function=smooth_nltk_fn
                )
            assert pytest.approx(reference) == bleu._sentence_bleu(references[0], candidates[0])

        elif average == "micro":
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                reference = corpus_bleu(
                    references, candidates, weights=weights, smoothing_function=smooth_nltk_fn
                )
            assert pytest.approx(reference) == bleu._corpus_bleu(references, candidates)

        bleu.update((candidates, references))
        assert pytest.approx(reference) == bleu.compute()
# Macro-average counterpart of the micro-average engine test above. In the
# full module, each `_test` helper is nested inside its own test function, so
# the repeated name does not clash.
def _test(metric_device):
    engine = Engine(update)
    m = Bleu(ngram=4, smooth="smooth2")
    m.attach(engine, "bleu")

    engine.run(data=list(range(size)), max_epochs=1)

    assert "bleu" in engine.state.metrics

    ref_bleu = 0
    for candidates, references in data:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            ref_bleu += sentence_bleu(
                references[0],
                candidates[0],
                weights=[0.25, 0.25, 0.25, 0.25],
                smoothing_function=SmoothingFunction().method2,
            )

    assert pytest.approx(engine.state.metrics["bleu"]) == ref_bleu / len(data)
def test_corpus_bleu(candidate, references):
    for i in range(1, 8):
        weights = tuple([1 / i] * i)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            reference = corpus_bleu(references, candidate, weights=weights)
        bleu = Bleu(ngram=i)
        assert pytest.approx(reference) == bleu._corpus_bleu(references, candidate)
        bleu.update((candidate[0], references[0]))
        assert pytest.approx(reference) == bleu.compute()
def test_corpus_bleu_nltk_smooth2(candidate, references):
    for i in range(1, 8):
        weights = tuple([1 / i] * i)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            reference = corpus_bleu(
                references, candidate, weights=weights, smoothing_function=SmoothingFunction().method2
            )
        bleu = Bleu(ngram=i, smooth="nltk_smooth2")
        assert pytest.approx(reference) == bleu._corpus_bleu(references, candidate)
        bleu.update((candidate[0], references[0]))
        assert pytest.approx(reference) == bleu.compute()
def test_bleu_batch_micro():
    bleu = Bleu(ngram=4, average="micro")

    # Batch size 3
    hypotheses = [corpus.cand_1, corpus.cand_2a, corpus.cand_2b]
    refs = [corpus.references_1, corpus.references_2, corpus.references_2]
    bleu.update((hypotheses, refs))

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        reference_bleu_score = corpus_bleu(refs, hypotheses)
    assert pytest.approx(bleu.compute()) == reference_bleu_score
    assert pytest.approx(bleu._corpus_bleu(refs, hypotheses)) == reference_bleu_score
def test_accumulation_macro_bleu():
    bleu = Bleu(ngram=4, smooth="smooth2")
    bleu.update(([corpus.cand_1], [corpus.references_1]))
    bleu.update(([corpus.cand_2a], [corpus.references_2]))
    bleu.update(([corpus.cand_2b], [corpus.references_2]))
    bleu.update(([corpus.cand_3], [corpus.references_2]))

    value = bleu._sentence_bleu(corpus.references_1, corpus.cand_1)
    value += bleu._sentence_bleu(corpus.references_2, corpus.cand_2a)
    value += bleu._sentence_bleu(corpus.references_2, corpus.cand_2b)
    value += bleu._sentence_bleu(corpus.references_2, corpus.cand_3)
    assert bleu.compute() == value / 4
def test_bleu():
    bleu = Bleu(ngram=4, smooth="smooth2")
    bleu.update((corpus.cand_1, corpus.references_1))
    bleu.update((corpus.cand_2a, corpus.references_2))
    bleu.update((corpus.cand_2b, corpus.references_2))
    bleu.update((corpus.cand_3, corpus.references_2))

    value = bleu._corpus_bleu([corpus.references_1], [corpus.cand_1])
    value += bleu._corpus_bleu([corpus.references_2], [corpus.cand_2a])
    value += bleu._corpus_bleu([corpus.references_2], [corpus.cand_2b])
    value += bleu._corpus_bleu([corpus.references_2], [corpus.cand_3])
    assert bleu.compute() == value / 4
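# A hedged end-to-end sketch mirroring test_bleu above, on inline toy data so
# it runs without the shared `corpus` fixture; it assumes the same
# single-(candidate, references) update format used in test_bleu. The score is
# only bounds-checked, not derived by hand.
def test_bleu_standalone_usage():
    bleu = Bleu(ngram=2, smooth="smooth1")
    bleu.update((["the", "cat", "sat"], [["the", "cat", "sat", "down"]]))
    score = bleu.compute()
    assert 0.0 < score <= 1.0

    # After a reset there is nothing to compute, as test_wrong_inputs shows.
    bleu.reset()
    with pytest.raises(NotComputableError):
        bleu.compute()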