def test_newline_cnn_improvement():
    k = "rougeLsum"
    score = calculate_rouge(PRED, TGT, newline_sep=True, rouge_keys=[k])[k]
    score_no_sep = calculate_rouge(PRED,
                                   TGT,
                                   newline_sep=False,
                                   rouge_keys=[k])[k]
    assert score > score_no_sep
def test_single_sent_scores_dont_depend_on_newline_sep():
    pred = [
        "Her older sister, Margot Frank, died in 1945, a month earlier than previously thought.",
        'Marseille prosecutor says "so far no videos were used in the crash investigation" despite media reports .',
    ]
    tgt = [
        "Margot Frank, died in 1945, a month earlier than previously thought.",
        'Prosecutor: "No videos were used in the crash investigation" German papers say they saw a cell phone video of the final seconds on board Flight 9525.',
    ]
    assert calculate_rouge(pred, tgt, newline_sep=True) == calculate_rouge(
        pred, tgt, newline_sep=False)
def test_disaggregated_scores_are_determinstic():
    no_aggregation = calculate_rouge(PRED,
                                     TGT,
                                     bootstrap_aggregation=False,
                                     rouge_keys=["rouge2", "rougeL"])
    assert isinstance(no_aggregation, defaultdict)
    no_aggregation_just_r2 = calculate_rouge(PRED,
                                             TGT,
                                             bootstrap_aggregation=False,
                                             rouge_keys=["rouge2"])
    assert (pd.DataFrame(
        no_aggregation["rouge2"]).fmeasure.mean() == pd.DataFrame(
            no_aggregation_just_r2["rouge2"]).fmeasure.mean())
def test_pegasus_newline():

    pred = [
        """" "a person who has such a video needs to immediately give it to the investigators," prosecutor says .<n> "it is a very disturbing scene," editor-in-chief of bild online tells "erin burnett: outfront" """
    ]
    tgt = [
        """ Marseille prosecutor says "so far no videos were used in the crash investigation" despite media reports . Journalists at Bild and Paris Match are "very confident" the video clip is real, an editor says . Andreas Lubitz had informed his Lufthansa training school of an episode of severe depression, airline says ."""
    ]

    prev_score = calculate_rouge(pred,
                                 tgt,
                                 rouge_keys=["rougeLsum"],
                                 newline_sep=False)["rougeLsum"]
    new_score = calculate_rouge(pred, tgt,
                                rouge_keys=["rougeLsum"])["rougeLsum"]
    assert new_score > prev_score
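The PRED and TGT fixtures referenced in several of these tests are defined elsewhere in the test module and are not shown here. A minimal stand-in, assuming they are simply parallel lists of prediction and reference strings (the names are from the tests above, but the contents below are placeholders, not the originals):

PRED = [
    "a cat sat on the mat .",
    "the weather was sunny in Marseille on Tuesday .",
]
TGT = [
    "the cat was sitting on the mat .",
    "Marseille enjoyed sunny weather on Tuesday .",
]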
Example #5
 def _generative_step(self, batch: dict) -> dict:
     pad_token_id = self.tokenizer.pad_token_id
     source_ids, source_mask, y = SummarizationDataset.trim_seq2seq_batch(
         batch, pad_token_id)
     t0 = time.time()
     generated_ids = self.model.generate(
         input_ids=source_ids,
         attention_mask=source_mask,
         use_cache=True,
     )
     gen_time = (time.time() - t0) / source_ids.shape[0]  # average generation time per example in the batch
     preds = self.ids_to_clean_text(generated_ids)
     target = self.ids_to_clean_text(y)
     loss_tensors = self._step(batch)
     base_metrics = {
         name: loss
         for name, loss in zip(self.loss_names, loss_tensors)
     }
     rouge: Dict = calculate_rouge(preds, target)
     summ_len = np.mean(lmap(len, generated_ids))
     base_metrics.update(gen_time=gen_time,
                         summ_len=summ_len,
                         preds=preds,
                         target=target,
                         **rouge)
     return base_metrics
Example #6
def calculate_rouge_path(pred_path, tgt_path, save_path=None, **kwargs):
    """Kwargs will be passed to calculate_rouge"""
    pred_lns = [x.strip() for x in open(pred_path).readlines()]
    tgt_lns = [x.strip() for x in open(tgt_path).readlines()][:len(pred_lns)]
    metrics = calculate_rouge(pred_lns, tgt_lns, **kwargs)
    if save_path is not None:
        save_json(metrics, save_path)
    return metrics  # these print nicely
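A minimal usage sketch for this helper, assuming hypothetical files preds.txt and refs.txt with one summary per line; any extra keyword arguments, such as rouge_keys, are forwarded to calculate_rouge:

metrics = calculate_rouge_path(
    "preds.txt",                     # hypothetical predictions file, one summary per line
    "refs.txt",                      # hypothetical reference file, one summary per line
    save_path="rouge_metrics.json",  # optional: also write the scores to disk
    rouge_keys=["rouge1", "rouge2", "rougeL"],
)
print(metrics)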
Example #7
def main(args):
    with open(args.source_file, "r") as f:
        source_lines = [line.strip() for line in f if line.strip()]
    with open(args.target_file, "r") as f:
        target_lines = [line.strip() for line in f if line.strip()]
    assert len(source_lines) == len(target_lines)

    model_pred_lines = {}
    for model, pred_file in args.model_preds.items():
        with open(pred_file, "r") as f:
            model_pred_lines[model] = [
                line.strip() for line in f if line.strip()
            ]
        assert len(model_pred_lines[model]) == len(source_lines), (
            len(model_pred_lines), len(source_lines), pred_file)

    line_objs = []
    for filter_str in args.filters:
        obj = collections.OrderedDict()
        obj["filter_str"] = filter_str

        ids = filter_ids(source_lines, filter_str)
        obj["count"] = len(ids)

        for model, pred_lines in model_pred_lines.items():
            filtered_tgt_lines = select_lines_by_ids(target_lines, ids)
            filtered_pred_lines = select_lines_by_ids(pred_lines, ids)
            rouge_scores = calculate_rouge(filtered_pred_lines,
                                           filtered_tgt_lines)
            for rouge_key, score in rouge_scores.items():
                obj[f"{model}_{rouge_key}"] = score
        line_objs.append(obj)

    with open("./analysis.json", "w") as f:
        json.dump(line_objs, f)

    ordered_keys = line_objs[0].keys()
    with open(args.output_file, "w") as f:
        f.write("|".join(ordered_keys) + "\n")
        f.write("|---" * len(ordered_keys) + "|\n")
        for obj in line_objs:
            for key in ordered_keys:
                f.write("|")
                if key not in obj:
                    f.write("-")
                elif isinstance(obj[key], str):
                    f.write(obj[key])
                elif isinstance(obj[key], float):
                    f.write(f"{obj[key]:.4f}")
                elif isinstance(obj[key], int):
                    f.write(f"{obj[key]}")
                else:
                    print(type(obj[key]), obj[key])
                    raise ValueError("Unknown field")
            f.write("|\n")
Example #8
def evaluate(model, tokenizer, batch_size, trg_sent_len):
    valid_sampler = RandomSampler(valid_dataset)
    valid_dataloader = DataLoader(valid_dataset,
                                  sampler=valid_sampler,
                                  batch_size=batch_size)
    epoch_iterator = tqdm(valid_dataloader, desc="Iteration")
    # criterion = nn.CrossEntropyLoss(ignore_index=0)
    references = []
    hypotheses = []
    epoch_loss = 0

    model.eval()
    for step, batch in enumerate(epoch_iterator):
        text_tensor = batch[0].to(device)
        token_type_tensor = batch[1].to(device)
        question_tensor = batch[2].to(device)
        with torch.no_grad():
            output, attention, prediction = model(
                text_tensor,
                token_type_tensor,
                question_tensor,
                teacher_forcing_ratio=0)  # turn off teacher forcing
        rouge_l, r, h = calculate_rouge(prediction.cpu().tolist(),
                                        question_tensor.cpu().tolist(),
                                        tokenizer)
        references.extend(r)
        hypotheses.extend(h)
        # compute loss
        logit = output[1:].view(-1, output.shape[-1]).contiguous()
        # Find a way to avoid calling contiguous
        trg = question_tensor[:, 1:].reshape(-1).contiguous()

        # prediction = [(trg sent len - 1) * batch size, output dim]
        # trg = [(trg sent len - 1) * batch size]
        with torch.no_grad():
            loss = loss_calc(logit, trg)

        epoch_loss += loss.item()  # accumulate the scalar loss as a Python float
        if step % int(len(valid_dataloader) * 0.1) == 0:
            sample_t = tokenizer.decode(question_tensor[0].cpu().tolist(),
                                        True)
            sample_p = tokenizer.decode(prediction[0].cpu().tolist(), True)
            logger.info(
                f'Batch {step} loss: {loss.item()} ROUGE_L score: {rouge_l}\n'
                + f'Target {sample_t}\n' + f'Prediction {sample_p}\n\n')

    metrics_dict = nlgeval.compute_metrics(references, hypotheses)
    logger.info(metrics_dict)
    return epoch_loss / len(valid_dataloader), metrics_dict['ROUGE_L']
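Note that this example uses a different calculate_rouge variant from the others: it takes lists of token ids plus a tokenizer and returns a (rouge_l, references, hypotheses) triple. A minimal sketch of such a wrapper, assuming the rouge_score package is available and the tokenizer's decode accepts skip_special_tokens as its second argument (both are assumptions, not the original implementation):

from rouge_score import rouge_scorer

def calculate_rouge(pred_ids_batch, target_ids_batch, tokenizer):
    # decode token ids into text, dropping special tokens
    hypotheses = [tokenizer.decode(ids, True) for ids in pred_ids_batch]
    references = [tokenizer.decode(ids, True) for ids in target_ids_batch]
    # average ROUGE-L F1 over the batch
    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    scores = [scorer.score(ref, hyp)["rougeL"].fmeasure
              for ref, hyp in zip(references, hypotheses)]
    rouge_l = sum(scores) / max(len(scores), 1)
    return rouge_l, references, hypotheses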
Example #9
def run_generate():
    parser = argparse.ArgumentParser()
    parser.add_argument("input_path", type=str, help="like cnn_dm/test.source")
    parser.add_argument("output_path", type=str, help="where to save summaries")
    parser.add_argument("model_name", type=str, help="like facebook/bart-large-cnn,t5-base, etc.")
    parser.add_argument("--reference_path", type=str, required=False, help="like cnn_dm/test_reference_summaries.txt")
    parser.add_argument("--score_path", type=str, required=False, help="where to save the rouge score in json format")
    parser.add_argument("--device", type=str, required=False, default=DEFAULT_DEVICE, help="cuda, cuda:1, cpu etc.")
    parser.add_argument("--bs", type=int, default=8, required=False, help="batch size")
    parser.add_argument("--fp16", action="store_true")
    args = parser.parse_args()
    examples = [" " + x.rstrip() if "t5" in args.model_name else x.rstrip() for x in open(args.input_path).readlines()]

    generate_summaries(
        examples, args.output_path, args.model_name, batch_size=args.bs, device=args.device, fp16=args.fp16
    )
    if args.score_path is not None:
        output_lns = [x.rstrip() for x in open(args.output_path).readlines()]
        reference_lns = [x.rstrip() for x in open(args.reference_path).readlines()]

        rouge: dict = calculate_rouge(output_lns, reference_lns)

        json.dump(rouge, open(args.score_path, "w+"))
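Assuming this function lives in a script named run_eval.py (the script name is a guess based on the argparse help strings), a typical invocation that both generates summaries and writes ROUGE scores might look like:

python run_eval.py cnn_dm/test.source summaries.txt facebook/bart-large-cnn --reference_path cnn_dm/test_reference_summaries.txt --score_path rouge_scores.json --bs 16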
Example #10
 def calc_generative_metrics(self, preds, target) -> Dict:
     return calculate_rouge(preds, target)
Example #11
def test_newline_irrelevant_for_other_metrics():
    k = ["rouge1", "rouge2", "rougeL"]
    score_sep = calculate_rouge(PRED, TGT, newline_sep=True, rouge_keys=k)
    score_no_sep = calculate_rouge(PRED, TGT, newline_sep=False, rouge_keys=k)
    assert score_sep == score_no_sep
Example #12
 def summarization_metrics(pred: EvalPrediction) -> Dict:
     pred_str, label_str = decode_pred(pred)
     rouge: Dict = calculate_rouge(pred_str, label_str)
     summ_len = np.mean(lmap(non_pad_len, pred.predictions))
     rouge.update({"gen_len": summ_len})
     return rouge
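Across these examples, calculate_rouge consumes parallel lists of prediction and reference strings and returns a dict of ROUGE scores (or a defaultdict of per-example scores when bootstrap_aggregation=False). A minimal end-to-end sketch, assuming the function is importable from the seq2seq example utilities and that the aggregated result maps each requested rouge key to a single F-measure value:

from utils import calculate_rouge  # assumption: utils.py from the seq2seq examples directory

preds = ["a cat sat on the mat ."]
refs = ["the cat was sitting on the mat ."]

scores = calculate_rouge(preds, refs, rouge_keys=["rouge1", "rouge2", "rougeL"])
print(scores)  # e.g. {"rouge1": ..., "rouge2": ..., "rougeL": ...}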