def test_newline_cnn_improvement():
    k = "rougeLsum"
    score = calculate_rouge(PRED, TGT, newline_sep=True, rouge_keys=[k])[k]
    score_no_sep = calculate_rouge(PRED, TGT, newline_sep=False, rouge_keys=[k])[k]
    assert score > score_no_sep
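# The rouge tests in this section reference module-level PRED / TGT fixtures and a few
# imports that are not shown here. A minimal sketch of that preamble, assuming
# calculate_rouge is importable from a local utils module; the example sentences are
# illustrative placeholders, not the original fixture data:
from collections import defaultdict

import pandas as pd

from utils import calculate_rouge

PRED = [
    "Prosecutor says no videos were used in the crash investigation. He spoke to reporters on Tuesday.",
    "Margot Frank died in 1945, a month earlier than previously thought. Her sister Anne also died at Bergen-Belsen.",
]
TGT = [
    'Marseille prosecutor says "so far no videos were used in the crash investigation" despite media reports .',
    "Her older sister, Margot Frank, died in 1945, a month earlier than previously thought.",
]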
def test_single_sent_scores_dont_depend_on_newline_sep():
    pred = [
        "Her older sister, Margot Frank, died in 1945, a month earlier than previously thought.",
        'Marseille prosecutor says "so far no videos were used in the crash investigation" despite media reports .',
    ]
    tgt = [
        "Margot Frank, died in 1945, a month earlier than previously thought.",
        'Prosecutor: "No videos were used in the crash investigation" German papers say they saw a cell phone video of the final seconds on board Flight 9525.',
    ]
    assert calculate_rouge(pred, tgt, newline_sep=True) == calculate_rouge(pred, tgt, newline_sep=False)
def test_disaggregated_scores_are_deterministic():
    no_aggregation = calculate_rouge(PRED, TGT, bootstrap_aggregation=False, rouge_keys=["rouge2", "rougeL"])
    assert isinstance(no_aggregation, defaultdict)
    no_aggregation_just_r2 = calculate_rouge(PRED, TGT, bootstrap_aggregation=False, rouge_keys=["rouge2"])
    assert (
        pd.DataFrame(no_aggregation["rouge2"]).fmeasure.mean()
        == pd.DataFrame(no_aggregation_just_r2["rouge2"]).fmeasure.mean()
    )
def test_pegasus_newline():
    pred = [
        """" "a person who has such a video needs to immediately give it to the investigators," prosecutor says .<n> "it is a very disturbing scene," editor-in-chief of bild online tells "erin burnett: outfront" """
    ]
    tgt = [
        """ Marseille prosecutor says "so far no videos were used in the crash investigation" despite media reports . Journalists at Bild and Paris Match are "very confident" the video clip is real, an editor says . Andreas Lubitz had informed his Lufthansa training school of an episode of severe depression, airline says ."""
    ]
    prev_score = calculate_rouge(pred, tgt, rouge_keys=["rougeLsum"], newline_sep=False)["rougeLsum"]
    new_score = calculate_rouge(pred, tgt, rouge_keys=["rougeLsum"])["rougeLsum"]
    assert new_score > prev_score
def _generative_step(self, batch: dict) -> dict:
    pad_token_id = self.tokenizer.pad_token_id
    source_ids, source_mask, y = SummarizationDataset.trim_seq2seq_batch(batch, pad_token_id)
    t0 = time.time()
    generated_ids = self.model.generate(
        input_ids=source_ids,
        attention_mask=source_mask,
        use_cache=True,
    )
    # parentheses matter here: we want elapsed time per example, not t0 / batch_size
    gen_time = (time.time() - t0) / source_ids.shape[0]
    preds = self.ids_to_clean_text(generated_ids)
    target = self.ids_to_clean_text(y)
    loss_tensors = self._step(batch)
    base_metrics = {name: loss for name, loss in zip(self.loss_names, loss_tensors)}
    rouge: Dict = calculate_rouge(preds, target)
    summ_len = np.mean(lmap(len, generated_ids))
    base_metrics.update(gen_time=gen_time, summ_len=summ_len, preds=preds, target=target, **rouge)
    return base_metrics
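# _generative_step relies on an ids_to_clean_text helper to turn generated token ids
# back into stripped strings before scoring. A plausible sketch of such a helper,
# assuming self.tokenizer is a Hugging Face tokenizer with batch_decode; this is not
# necessarily the exact implementation used alongside the method above:
def ids_to_clean_text(self, generated_ids):
    gen_text = self.tokenizer.batch_decode(
        generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )
    return [t.strip() for t in gen_text]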
def calculate_rouge_path(pred_path, tgt_path, save_path=None, **kwargs):
    """Kwargs will be passed to calculate_rouge"""
    pred_lns = [x.strip() for x in open(pred_path).readlines()]
    tgt_lns = [x.strip() for x in open(tgt_path).readlines()][: len(pred_lns)]
    metrics = calculate_rouge(pred_lns, tgt_lns, **kwargs)
    if save_path is not None:
        save_json(metrics, save_path)
    return metrics  # these print nicely
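# Example usage of calculate_rouge_path; the file paths and the rouge_keys value are
# illustrative placeholders only:
metrics = calculate_rouge_path(
    "cnn_dm/test_generations.txt",
    "cnn_dm/test.target",
    save_path="rouge_scores.json",
    rouge_keys=["rouge1", "rouge2", "rougeLsum"],
)
print(metrics)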
def main(args):
    with open(args.source_file, "r") as f:
        source_lines = [line.strip() for line in f if line.strip()]
    with open(args.target_file, "r") as f:
        target_lines = [line.strip() for line in f if line.strip()]
    assert len(source_lines) == len(target_lines)

    model_pred_lines = {}
    for model, pred_file in args.model_preds.items():
        with open(pred_file, "r") as f:
            model_pred_lines[model] = [line.strip() for line in f if line.strip()]
        assert len(model_pred_lines[model]) == len(source_lines), (
            len(model_pred_lines),
            len(source_lines),
            pred_file,
        )

    line_objs = []
    for filter_str in args.filters:
        obj = collections.OrderedDict()
        obj["filter_str"] = filter_str
        ids = filter_ids(source_lines, filter_str)
        obj["count"] = len(ids)
        for model, pred_lines in model_pred_lines.items():
            filtered_tgt_lines = select_lines_by_ids(target_lines, ids)
            filtered_pred_lines = select_lines_by_ids(pred_lines, ids)
            rouge_scores = calculate_rouge(filtered_pred_lines, filtered_tgt_lines)
            for rouge_key, score in rouge_scores.items():
                obj[f"{model}_{rouge_key}"] = score
        line_objs.append(obj)

    with open("./analysis.json", "w") as f:
        json.dump(line_objs, f)

    ordered_keys = line_objs[0].keys()
    with open(args.output_file, "w") as f:
        f.write("|".join(ordered_keys) + "\n")
        f.write("|---" * len(ordered_keys) + "|\n")
        for obj in line_objs:
            for key in ordered_keys:
                f.write("|")
                if key not in obj:
                    f.write("-")
                elif isinstance(obj[key], str):
                    f.write(obj[key])
                elif isinstance(obj[key], float):
                    f.write(f"{obj[key]:.4f}")
                elif isinstance(obj[key], int):
                    f.write(f"{obj[key]}")
                else:
                    print(type(obj[key]), obj[key])
                    raise ValueError("Unknown field")
            f.write("|\n")
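# main() above assumes two helpers for slicing the evaluation set: filter_ids, which
# returns the indices of source lines matching a filter string, and select_lines_by_ids,
# which picks those indices out of a list. A minimal sketch, assuming a plain substring
# match is enough for the filter (the real filtering logic may differ):
def filter_ids(source_lines, filter_str):
    return [i for i, line in enumerate(source_lines) if filter_str in line]


def select_lines_by_ids(lines, ids):
    return [lines[i] for i in ids]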
def evaluate(model, tokenizer, batch_size, trg_sent_len):
    valid_sampler = RandomSampler(valid_dataset)
    valid_dataloader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=batch_size)
    epoch_iterator = tqdm(valid_dataloader, desc="Iteration")
    # criterion = nn.CrossEntropyLoss(ignore_index=0)
    references = []
    hypotheses = []
    epoch_loss = 0

    model.eval()
    for step, batch in enumerate(epoch_iterator):
        text_tensor = batch[0].to(device)
        token_type_tensor = batch[1].to(device)
        question_tensor = batch[2].to(device)

        with torch.no_grad():
            output, attention, prediction = model(
                text_tensor, token_type_tensor, question_tensor, teacher_forcing_ratio=0
            )  # turn off teacher forcing

        rouge_l, r, h = calculate_rouge(prediction.cpu().tolist(), question_tensor.cpu().tolist(), tokenizer)
        references.extend(r)
        hypotheses.extend(h)

        # compute loss
        logit = output[1:].view(-1, output.shape[-1]).contiguous()  # Find a way to avoid calling contiguous
        trg = question_tensor[:, 1:].reshape(-1).contiguous()
        # logit = [(trg sent len - 1) * batch size, output dim]
        # trg = [(trg sent len - 1) * batch size]
        with torch.no_grad():
            loss = loss_calc(logit, trg)
        epoch_loss += loss.item()  # accumulate a Python float, not a tensor

        # log a sample roughly every 10% of the epoch (guard against very small loaders)
        if step % max(1, int(len(valid_dataloader) * 0.1)) == 0:
            sample_t = tokenizer.decode(question_tensor[0].cpu().tolist(), True)
            sample_p = tokenizer.decode(prediction[0].cpu().tolist(), True)
            logger.info(
                f'Batch {step} loss: {loss.item()} ROUGE_L score: {rouge_l}\n'
                + f'Target {sample_t}\n'
                + f'Prediction {sample_p}\n\n'
            )

    metrics_dict = nlgeval.compute_metrics(references, hypotheses)
    logger.info(metrics_dict)
    return epoch_loss / len(valid_dataloader), metrics_dict['ROUGE_L']
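# evaluate() above calls a different calculate_rouge variant: it takes token-id lists and
# the tokenizer, and returns a batch ROUGE-L score plus the decoded reference and
# hypothesis strings. A rough sketch of what such a wrapper might look like, assuming the
# rouge_score package, batch-first id lists, and that tokenizer.decode(ids, True) skips
# special tokens as in the loop above; this is not necessarily the original implementation:
from rouge_score import rouge_scorer

_scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)


def calculate_rouge(pred_ids_batch, target_ids_batch, tokenizer):
    hypotheses = [tokenizer.decode(ids, True) for ids in pred_ids_batch]
    references = [tokenizer.decode(ids, True) for ids in target_ids_batch]
    scores = [
        _scorer.score(ref, hyp)["rougeL"].fmeasure
        for ref, hyp in zip(references, hypotheses)
    ]
    rouge_l = sum(scores) / max(len(scores), 1)
    return rouge_l, references, hypotheses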
def run_generate():
    parser = argparse.ArgumentParser()
    parser.add_argument("input_path", type=str, help="like cnn_dm/test.source")
    parser.add_argument("output_path", type=str, help="where to save summaries")
    parser.add_argument("model_name", type=str, help="like facebook/bart-large-cnn, t5-base, etc.")
    parser.add_argument("--reference_path", type=str, required=False, help="like cnn_dm/test_reference_summaries.txt")
    parser.add_argument("--score_path", type=str, required=False, help="where to save the rouge score in json format")
    parser.add_argument("--device", type=str, required=False, default=DEFAULT_DEVICE, help="cuda, cuda:1, cpu etc.")
    parser.add_argument("--bs", type=int, default=8, required=False, help="batch size")
    parser.add_argument("--fp16", action="store_true")
    args = parser.parse_args()

    examples = [" " + x.rstrip() if "t5" in args.model_name else x.rstrip() for x in open(args.input_path).readlines()]
    generate_summaries(
        examples, args.output_path, args.model_name, batch_size=args.bs, device=args.device, fp16=args.fp16
    )
    if args.score_path is not None:
        output_lns = [x.rstrip() for x in open(args.output_path).readlines()]
        reference_lns = [x.rstrip() for x in open(args.reference_path).readlines()]
        rouge: dict = calculate_rouge(output_lns, reference_lns)
        json.dump(rouge, open(args.score_path, "w+"))
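# Expected command-line usage of run_generate(), with placeholder paths; the script name
# and exact file names are assumptions for illustration:
#
#   python run_eval.py cnn_dm/test.source cnn_dm/test_generations.txt facebook/bart-large-cnn \
#       --reference_path cnn_dm/test.target --score_path cnn_dm/rouge.json --bs 16
if __name__ == "__main__":
    run_generate()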
def calc_generative_metrics(self, preds, target) -> Dict:
    return calculate_rouge(preds, target)
def test_newline_irrelevant_for_other_metrics():
    k = ["rouge1", "rouge2", "rougeL"]
    score_sep = calculate_rouge(PRED, TGT, newline_sep=True, rouge_keys=k)
    score_no_sep = calculate_rouge(PRED, TGT, newline_sep=False, rouge_keys=k)
    assert score_sep == score_no_sep
def summarization_metrics(pred: EvalPrediction) -> Dict:
    pred_str, label_str = decode_pred(pred)
    rouge: Dict = calculate_rouge(pred_str, label_str)
    summ_len = np.mean(lmap(non_pad_len, pred.predictions))
    rouge.update({"gen_len": summ_len})
    return rouge
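# summarization_metrics() depends on decode_pred and non_pad_len, which are not shown
# here. A plausible sketch, assuming a Hugging Face tokenizer named `tokenizer` is in
# scope and that generation length is counted as non-padding tokens; these are not
# necessarily the original helpers:
from typing import List, Tuple


def non_pad_len(tokens: np.ndarray) -> int:
    return int(np.count_nonzero(tokens != tokenizer.pad_token_id))


def decode_pred(pred: EvalPrediction) -> Tuple[List[str], List[str]]:
    pred_str = tokenizer.batch_decode(pred.predictions, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(pred.label_ids, skip_special_tokens=True)
    return [s.strip() for s in pred_str], [s.strip() for s in label_str]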