    def test_bleu_scores(self, pair, min_bleu_score):
        # note: this test does not measure peak performance since it only evaluates a small batch,
        # but it should be enough to detect a regression in output quality
        mname = f"facebook/wmt19-{pair}"
        tokenizer = self.get_tokenizer(mname)
        model = self.get_model(mname)

        src_sentences = bleu_data[pair]["src"]
        tgt_sentences = bleu_data[pair]["tgt"]

        batch = tokenizer(src_sentences,
                          return_tensors="pt",
                          truncation=True,
                          padding="longest").to(torch_device)
        outputs = model.generate(
            input_ids=batch.input_ids,
            num_beams=8,
        )
        decoded_sentences = tokenizer.batch_decode(
            outputs,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False)
        scores = calculate_bleu(decoded_sentences, tgt_sentences)
        print(scores)
        self.assertGreaterEqual(scores["bleu"], min_bleu_score)
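For reference, all of the snippets on this page call a `calculate_bleu` helper defined in the surrounding example utilities. Below is a minimal sketch of such a helper, assuming it simply wraps sacrebleu's `corpus_bleu` (an assumption; the actual helper may differ):

# Hypothetical sketch, assuming calculate_bleu wraps sacrebleu's corpus_bleu
from sacrebleu import corpus_bleu

def calculate_bleu(output_lns, refs_lns, **kwargs) -> dict:
    # corpus_bleu expects a list of reference streams, hence the [refs_lns] wrapping
    return {"bleu": round(corpus_bleu(output_lns, [refs_lns], **kwargs).score, 4)}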
Example #2
    def calc_generative_metrics(self, preds, target) -> dict:
        return calculate_bleu(preds, target)
Example #3
def translation_metrics(pred: EvalPrediction) -> Dict:
    # decode predictions and labels back to text, then score them with BLEU
    pred_str, label_str = decode_pred(pred)
    bleu: Dict = calculate_bleu(pred_str, label_str)
    # also report the average generated length in non-padding tokens
    gen_len = np.mean(lmap(non_pad_len, pred.predictions))
    bleu.update({"gen_len": gen_len})
    return bleu
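Example #3 also relies on small helpers from the same utilities (`decode_pred`, `lmap`, `non_pad_len`). A rough sketch of the two generic ones, assuming `lmap` is just list(map(...)) and `non_pad_len` counts non-padding tokens (both assumptions based on how they are used above; the padding id is hypothetical):

# Hypothetical sketches of the helpers used above, not the original implementations
from typing import Callable, Iterable, List
import numpy as np

def lmap(f: Callable, x: Iterable) -> List:
    # apply f to every element and materialize the result as a list
    return list(map(f, x))

def non_pad_len(tokens: np.ndarray, pad_token_id: int = 1) -> int:
    # count tokens that are not the padding id (pad_token_id is an assumed default)
    return int(np.count_nonzero(tokens != pad_token_id))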