def text_eval(encoder,
              features_iter,
              model_dir,
              global_step,
              eval_tag,
              enable_logging,
              inputs_pattern="^inputs[0-9]*$",
              targets_key="targets",
              predictions_key="outputs",
              additional_keys=(),
              num_reserved=None):
    """Evaluates a set of text targets/predictions."""
    decode_fn = lambda x: ids2str(encoder, x, num_reserved)
    scorers_dict = {}
    scorers_dict[_ROUGE_METRIC] = rouge_scorer.RougeScorer(
        ["rouge1", "rouge2", "rougeL", "rougeLsum"], use_stemmer=True)
    scorers_dict[_BLEU_METRIC] = bleu_scorer.BleuScorer()
    scorers_dict[_REPETITION_METRIC] = repetition_scorer.RepetitionScorer(
        ["regs1", "regs2", "regs3", "regsTCR"])
    scorers_dict[_LENGTH_METRIC] = length_scorer.LengthScorer(["word", "char"])
    aggregators_dict = {k: scoring.BootstrapAggregator() for k in scorers_dict}

    with LogWriter(additional_keys, model_dir, global_step, eval_tag,
                   enable_logging) as log_writer:
        for i, features in enumerate(features_iter):
            inputs_list = []
            for k in sorted(features):
                if re.match(inputs_pattern, k):
                    single_inputs = decode_matrix(decode_fn, features[k])
                    if isinstance(single_inputs, list):
                        inputs_list.extend(single_inputs)
                    else:
                        inputs_list.append(single_inputs)
            inputs = "\n".join(inputs_list)
            targets = decode_fn(features[targets_key])
            preds = decode_fn(features[predictions_key])
            text_dict = {
                "inputs": inputs_list,
                "targets": targets,
                "predictions": preds
            }

            for key in additional_keys:
                if key == "selected_ids":
                    text_dict[key] = decode_selected_indices(decode_fn, features)
                else:
                    text_dict[key] = decode_matrix(decode_fn, features[key])

            log_writer.write(text_dict, i)
            for key, scorer in scorers_dict.items():
                scores_i = scorer.score(targets, preds)
                aggregators_dict[key].add_scores(scores_i)

    aggregates_dict = {k: v.aggregate() for k, v in aggregators_dict.items()}
    length_histograms = scorers_dict[_LENGTH_METRIC].histograms(as_string=True)
    _write_aggregates(model_dir, global_step, eval_tag, aggregates_dict,
                      length_histograms)
    _write_aggregate_summaries(model_dir, global_step, eval_tag, aggregates_dict)
def test_epoch_end(self, outputs):
    """
    Called at the end of a testing epoch: `PyTorch Lightning Documentation
    <https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.core.html#pytorch_lightning.core.LightningModule.test_epoch_end>`__

    Finds the mean of all the metrics logged by
    :meth:`~abstractive.AbstractiveSummarizer.test_step`.
    """
    avg_generation_time = torch.stack(
        [x["generation_time"] for x in outputs]
    ).mean()

    rouge_scores_log = {}

    if self.hparams.test_use_pyrouge:
        test_rouge("tmp", "save_pred.txt", "save_gold.txt")
    else:
        aggregator = scoring.BootstrapAggregator()
        rouge_scores_list = [
            rouge_score_set
            for batch_list in outputs
            for rouge_score_set in batch_list["rouge_scores"]
        ]
        for score in rouge_scores_list:
            aggregator.add_scores(score)

        # The aggregator returns a dictionary with keys corresponding to the rouge metric
        # and values that are `AggregateScore` objects. Each `AggregateScore` object is a
        # named tuple with a low, mid, and high value. Each value is a `Score` object, which
        # is also a named tuple, that contains the precision, recall, and fmeasure values.
        # For more info see the source code:
        # https://github.com/google-research/google-research/blob/master/rouge/scoring.py
        rouge_result = aggregator.aggregate()
        for metric, value in rouge_result.items():
            rouge_scores_log[metric + "-precision"] = value.mid.precision
            rouge_scores_log[metric + "-recall"] = value.mid.recall
            rouge_scores_log[metric + "-fmeasure"] = value.mid.fmeasure

    # Write the saved predictions and targets to file
    if self.hparams.save_percentage:
        predictions = [
            x["prediction"] for x in outputs if x["prediction"] is not None
        ]
        targets = [x["target"] for x in outputs if x["target"] is not None]
        output_test_predictions_file = os.path.join(
            self.hparams.default_root_dir, "test_predictions.txt"
        )
        output_test_targets_file = os.path.join(
            self.hparams.default_root_dir, "test_targets.txt"
        )
        with open(output_test_predictions_file, "w+") as p_writer, open(
            output_test_targets_file, "w+"
        ) as t_writer:
            for prediction, target in zip(predictions, targets):
                p_writer.writelines(s + "\n" for s in prediction)
                t_writer.writelines(s + "\n" for s in target)
            p_writer.close()
            t_writer.close()

    # Generate logs
    tqdm_dict = {"generation_time": avg_generation_time}
    log = {**rouge_scores_log, **tqdm_dict}
    result = {"progress_bar": tqdm_dict, "log": log}
    return result
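# A minimal sketch (not part of the source above) of the aggregation pattern the
# comment in `test_epoch_end` describes: per-example scores from rouge_scorer go
# into a BootstrapAggregator, and aggregate() returns AggregateScore named tuples
# whose low/mid/high fields are Score named tuples with precision/recall/fmeasure.
# The sentence pairs are made-up examples.
from rouge_score import rouge_scorer, scoring

_scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)
_aggregator = scoring.BootstrapAggregator()
for _target, _prediction in [("the cat sat on the mat", "the cat sat on a mat"),
                             ("a dog barked loudly", "the dog barked loudly")]:
    _aggregator.add_scores(_scorer.score(_target, _prediction))
_result = _aggregator.aggregate()
print(_result["rouge1"].mid.fmeasure)   # point estimate
print(_result["rouge1"].low.fmeasure)   # lower bound of the bootstrap interval
print(_result["rouge1"].high.fmeasure)  # upper bound of the bootstrap interval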
def _compute(self, preds, refs, rouge_types=None, use_agregator=True, use_stemmer=False):
    if rouge_types is None:
        rouge_types = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

    scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=use_stemmer)
    if use_agregator:
        aggregator = scoring.BootstrapAggregator()
    else:
        scores = []

    for r, p in zip(refs, preds):
        score = scorer.score(r, p)
        if use_agregator:
            aggregator.add_scores(score)
        else:
            scores.append(score)

    if use_agregator:
        y = aggregator.aggregate()
    else:
        y = {}
        for k in scores[0]:
            y[k] = list(score[k] for score in scores)

    return y
def _compute(self, predictions, references, rouge_types=None, use_agregator=True, use_stemmer=False):
    if rouge_types is None:
        rouge_types = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

    scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=use_stemmer)
    if use_agregator:
        aggregator = scoring.BootstrapAggregator()
    else:
        scores = []

    for ref, pred in zip(references, predictions):
        score = scorer.score(ref, pred)
        if use_agregator:
            aggregator.add_scores(score)
        else:
            scores.append(score)

    if use_agregator:
        result = aggregator.aggregate()
    else:
        result = {}
        for key in scores[0]:
            result[key] = list(score[key] for score in scores)

    return result
def test_aggregate(self):
    np.random.seed(0)
    types = ["regs1", "regs2", "regs3", "regsLCR", "regsTCR"]
    rs = repetition_scorer.RepetitionScorer(repetition_types=types)
    aggregator = scoring.BootstrapAggregator()
    for text in ["a a a b c b", "a b a b c b", "a b a b c a b c"]:
        aggregator.add_scores(rs.score("", text))
    aggregates = aggregator.aggregate()

    self.assertAlmostEqual(aggregates["regs1"].low.prediction_ratio, 5 / 6)
    self.assertAlmostEqual(aggregates["regs1"].high.prediction_ratio, 1)
    self.assertAlmostEqual(aggregates["regs1"].mid.prediction_ratio,
                           (5 / 6 + 5 / 6 + 1) / 3)
    self.assertAlmostEqual(aggregates["regs2"].low.prediction_ratio, 2 / 5)
    self.assertAlmostEqual(aggregates["regs2"].high.prediction_ratio, 5 / 7)
    self.assertAlmostEqual(aggregates["regs2"].mid.prediction_ratio,
                           (2 / 5 + 2 / 5 + 5 / 7) / 3)
    self.assertAlmostEqual(aggregates["regs3"].low.prediction_ratio, 0)
    self.assertAlmostEqual(aggregates["regs3"].high.prediction_ratio, 2 / 6)
    self.assertAlmostEqual(aggregates["regs3"].mid.prediction_ratio,
                           (0 + 0 + 2 / 6) / 3)
    self.assertAlmostEqual(aggregates["regsLCR"].low.prediction_ratio, 3 / 6)
    self.assertAlmostEqual(aggregates["regsLCR"].high.prediction_ratio, 6 / 8)
    self.assertAlmostEqual(aggregates["regsLCR"].mid.prediction_ratio,
                           (3 / 6 + 4 / 6 + 6 / 8) / 3)
    self.assertAlmostEqual(aggregates["regsTCR"].low.prediction_ratio, 3 / 6)
    self.assertAlmostEqual(aggregates["regsTCR"].high.prediction_ratio, 1)
    self.assertAlmostEqual(aggregates["regsTCR"].mid.prediction_ratio,
                           (3 / 6 + 4 / 6 + 1) / 3)
def rouge(self, refs, preds):
    """
    Returns `t5` style ROUGE scores. See the related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68

    :param refs: A `list` of reference `strs`.
    :param preds: A `list` of predicted `strs`.
    """
    rouge_types = ["rouge1", "rouge2", "rougeLsum"]
    scorer = rouge_scorer.RougeScorer(rouge_types)

    # Add newlines between sentences to correctly compute `rougeLsum`.
    def _prepare_summary(summary):
        summary = summary.replace(" . ", ".\n")
        return summary

    # Accumulate confidence intervals.
    aggregator = scoring.BootstrapAggregator()
    for ref, pred in zip(refs, preds):
        ref = _prepare_summary(ref)
        pred = _prepare_summary(pred)
        aggregator.add_scores(scorer.score(ref, pred))
    result = aggregator.aggregate()

    return {type: result[type].mid.fmeasure * 100 for type in rouge_types}
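# Illustrative sketch (assumes rouge_score is installed; the sentences are made
# up) of why `_prepare_summary` above inserts newlines: rougeLsum treats "\n" as
# the sentence boundary, so flat " . "-separated summaries are rewritten before
# scoring.
from rouge_score import rouge_scorer

_lsum_scorer = rouge_scorer.RougeScorer(["rougeLsum"])
_ref = "the cat sat on the mat . it then fell asleep ."
_pred = "the cat sat on a mat . it fell asleep quickly ."
_flat = _lsum_scorer.score(_ref, _pred)["rougeLsum"].fmeasure
_split = _lsum_scorer.score(_ref.replace(" . ", ".\n"),
                            _pred.replace(" . ", ".\n"))["rougeLsum"].fmeasure
print(_flat, _split)  # the sentence-split variant is what the method above reports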
def calculate_rouge(
    pred_lns: List[str],
    tgt_lns: List[str],
    use_stemmer=True,
    rouge_keys=ROUGE_KEYS,
    return_precision_and_recall=False,
    bootstrap_aggregation=True,
    newline_sep=True,
) -> Dict:
    scorer = rouge_scorer.RougeScorer(rouge_keys, use_stemmer=use_stemmer)
    aggregator = scoring.BootstrapAggregator()
    for pred, tgt in zip(tgt_lns, pred_lns):
        # rougeLsum expects "\n" separated sentences within a summary
        if newline_sep:
            pred = add_newline_to_end_of_each_sentence(pred)
            tgt = add_newline_to_end_of_each_sentence(tgt)
        scores = scorer.score(pred, tgt)
        aggregator.add_scores(scores)

    if bootstrap_aggregation:
        result = aggregator.aggregate()
        if return_precision_and_recall:
            return extract_rouge_mid_statistics(result)  # here we return dict
        else:
            return {k: round(v.mid.fmeasure * 100, 4) for k, v in result.items()}
    else:
        return aggregator._scores  # here we return defaultdict(list)
def _compute(self, predictions, references, use_agregator=True, use_stemmer=False):
    rouge_types = ["rougeL"]
    # Join each side into a single text and wrap it in a list so the whole
    # prediction set is scored against the whole reference set as one pair.
    # (Zipping two bare strings would iterate character by character.)
    predictions = [" ".join([str(p) for p in predictions])]
    references = [" ".join([str(r) for r in references])]
    scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=use_stemmer)
    if use_agregator:
        aggregator = scoring.BootstrapAggregator()
    else:
        scores = []

    for ref, pred in zip(references, predictions):
        score = scorer.score(ref, pred)
        if use_agregator:
            aggregator.add_scores(score)
        else:
            scores.append(score)

    if use_agregator:
        result = aggregator.aggregate()
    else:
        result = {}
        for key in scores[0]:
            result[key] = list(score[key] for score in scores)

    return result
def calculate_meteor(output_lns, reference_lns):
    aggregator = scoring.BootstrapAggregator()

    for reference_ln, output_ln in zip(reference_lns, output_lns):
        scores = meteor_score.single_meteor_score(reference_ln, output_ln)
        aggregator.add_scores({'meteor': scores})

    result = aggregator.aggregate()
    return {k: round(v.mid * 100, 4) for k, v in result.items()}
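# Side note on `calculate_meteor` above: BootstrapAggregator is not tied to
# ROUGE Score tuples. It accepts plain floats, and aggregate() then returns
# AggregateScore tuples whose low/mid/high fields are floats, which is why
# `v.mid * 100` works. A small sketch with made-up values:
from rouge_score import scoring

_meteor_agg = scoring.BootstrapAggregator()
for _value in [0.31, 0.42, 0.38]:
    _meteor_agg.add_scores({"meteor": _value})
_ci = _meteor_agg.aggregate()["meteor"]
print(_ci.low, _ci.mid, _ci.high)  # bootstrap confidence interval over the floats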
def _rouge_calculation(hypotheses, references1, references2=[], metrics=['rougeLsum']):
    """Internal function for rouge scoring.

    If two references are provided, the best score is chosen for each instance.

    Args:
        hypotheses: list of predicted long answers
        references1: list of references to score hypotheses against
        references2: optional list of references to score hypotheses against
        metrics: evaluation metric

    Returns:
        dictionary representation of rouge scores
    """
    if references2 == []:
        references2 = references1

    scorer = rouge_scorer.RougeScorer(metrics, use_stemmer=True)
    aggregator1 = scoring.BootstrapAggregator()
    aggregator2 = scoring.BootstrapAggregator()

    for i in range(len(hypotheses)):
        scores1 = scorer.score(references1[i], hypotheses[i])
        scores2 = scorer.score(references2[i], hypotheses[i])
        aggregator1.add_scores(scores1)
        aggregator2.add_scores(scores2)

    scores = {m: [] for m in metrics}

    for m in metrics:
        fmeasure1 = aggregator1.aggregate()[m].mid.fmeasure
        fmeasure2 = aggregator2.aggregate()[m].mid.fmeasure
        scores[m].append(max(fmeasure1, fmeasure2))

    for m in scores:
        scores[m] = 100 * sum(scores[m]) / len(scores[m])

    return scores
def calculate_rouge(output_lns: List[str], reference_lns: List[str], use_stemmer=True) -> Dict:
    scorer = rouge_scorer.RougeScorer(ROUGE_KEYS, use_stemmer=use_stemmer)
    aggregator = scoring.BootstrapAggregator()

    for reference_ln, output_ln in zip(reference_lns, output_lns):
        scores = scorer.score(reference_ln, output_ln)
        aggregator.add_scores(scores)

    result = aggregator.aggregate()
    return {k: v.mid.fmeasure for k, v in result.items()}
def calculate_rouge(predicted_txts, reference_txts,
                    rouge_keys=["rouge1", "rouge2", "rougeL"], use_stemmer=True):
    scorer = rouge_scorer.RougeScorer(rouge_keys, use_stemmer=use_stemmer)
    aggregator = scoring.BootstrapAggregator()

    for ref_text, pred_txt in zip(reference_txts, predicted_txts):
        scores = scorer.score(ref_text, pred_txt)
        aggregator.add_scores(scores)

    result = aggregator.aggregate()
    return result
def calculate_rouge(output_lns, reference_lns, score_path):
    score_file = Path(score_path).open("w")
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    aggregator = scoring.BootstrapAggregator()

    for reference_ln, output_ln in zip(reference_lns, output_lns):
        scores = scorer.score(reference_ln, output_ln)
        aggregator.add_scores(scores)

    result = aggregator.aggregate()
    score_file.write(
        "ROUGE_1: \n{} \n\n ROUGE_2: \n{} \n\n ROUGE_L: \n{} \n\n".format(
            result["rouge1"], result["rouge2"], result["rougeL"]))
def calculate_rouge(
    pred_lns: List[str],
    tgt_lns: List[str],
    use_stemmer=True,
    rouge_keys=ROUGE_KEYS,
    return_precision_and_recall=False,
    bootstrap_aggregation=True,
    newline_sep=True,
) -> Dict:
    """Calculate rouge using rouge_scorer package.

    Args:
        pred_lns: list of summaries generated by model
        tgt_lns: list of groundtruth summaries (e.g. contents of val.target)
        use_stemmer: Bool indicating whether Porter stemmer should be used to
            strip word suffixes to improve matching.
        rouge_keys: which metrics to compute, defaults to rouge1, rouge2, rougeL, rougeLsum
        return_precision_and_recall: (False) whether to also return precision and recall.
        bootstrap_aggregation: whether to do the typical bootstrap resampling of scores.
            Defaults to True, if False this function returns a
            ``collections.defaultdict[metric: list of values for each observation for each subscore]``
        newline_sep: (default=True) whether to add newline between sentences. This is essential
            for calculation of rougeL on multi sentence summaries (CNN/DM dataset).

    Returns:
        Dict[score: value] if aggregate else defaultdict(list) keyed by rouge_keys
    """
    scorer = rouge_scorer.RougeScorer(rouge_keys, use_stemmer=use_stemmer)
    aggregator = scoring.BootstrapAggregator()
    for pred, tgt in zip(tgt_lns, pred_lns):
        # rougeLsum expects "\n" separated sentences within a summary
        if newline_sep:
            pred = add_newline_to_end_of_each_sentence(pred)
            tgt = add_newline_to_end_of_each_sentence(tgt)  # change pred and tgt
        scores = scorer.score(pred, tgt)
        aggregator.add_scores(scores)

    if bootstrap_aggregation:
        result = aggregator.aggregate()
        if return_precision_and_recall:
            return extract_rouge_mid_statistics(result)  # here we return dict
        else:
            return {
                k: round(v.mid.fmeasure * 100, 4) for k, v in result.items()
            }
    else:
        return aggregator._scores  # here we return defaultdict(list)
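# Hypothetical usage of the `calculate_rouge` helper defined above. `ROUGE_KEYS`
# and `add_newline_to_end_of_each_sentence` are assumed to come from the
# surrounding module; the prediction/reference lists are made up.
_example_preds = ["the cat sat on the mat.", "a dog barked at the mailman."]
_example_golds = ["the cat sat on a mat.", "the dog barked at a mailman."]
_agg_scores = calculate_rouge(_example_preds, _example_golds)
# -> dict keyed by rouge type with rounded mid F1 values (scaled by 100)
_raw_scores = calculate_rouge(_example_preds, _example_golds, bootstrap_aggregation=False)
# -> defaultdict(list) of per-pair Score tuples, useful for custom statistics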
def evaluate_rouge_avg(hypotheses, references, type='f', use_progress_bar=False):
    metrics = ['rouge1', 'rouge2', 'rougeL']
    scorer = {}
    scorer["rouge"] = rouge_scorer.RougeScorer(metrics, use_stemmer=True)
    aggregators_dict = {k: scoring.BootstrapAggregator() for k in scorer}

    if len(hypotheses) < len(references):
        print(
            "Warning number of papers in submission file is smaller than ground truth file",
            file=sys.stderr)

    # import pdb;pdb.set_trace()
    hypotheses = list(hypotheses)
    references = list(references)

    if not use_progress_bar:
        for j, hyp in enumerate(hypotheses):
            submission_summary = hyp.replace('<q>', ' ')
            for key, scorr in scorer.items():
                scores_i = scorr.score(references[j].strip(), submission_summary)
                aggregators_dict[key].add_scores(scores_i)
        aggregates_dict = {
            k: v.aggregate() for k, v in aggregators_dict.items()
        }
        out_avg_scores = {}
        for k, v in sorted(aggregates_dict["rouge"].items()):
            out_avg_scores[k] = v.mid.fmeasure
    else:
        for j, hyp in tqdm(enumerate(hypotheses), total=len(hypotheses)):
            submission_summary = hyp.replace('<q>', ' ')
            for key, scorr in scorer.items():
                scores_i = scorr.score(references[j].strip(), submission_summary)
                aggregators_dict[key].add_scores(scores_i)
        aggregates_dict = {
            k: v.aggregate() for k, v in aggregators_dict.items()
        }
        out_avg_scores = {}
        for k, v in sorted(aggregates_dict["rouge"].items()):
            out_avg_scores[k] = v.mid.fmeasure

    return out_avg_scores['rouge1'], out_avg_scores['rouge2'], out_avg_scores['rougeL']
def calculate_rouge(output_lns: List[str], reference_lns: List[str]) -> Dict:
    scorer = rouge_scorer.RougeScorer(ROUGE_KEYS, use_stemmer=True)
    aggregator = scoring.BootstrapAggregator()

    for reference_ln, output_ln in zip(reference_lns, output_lns):
        scores = scorer.score(reference_ln, output_ln)
        aggregator.add_scores(scores)
        # with open(out_dir + '/cands.txt', 'a+') as c, open(out_dir + '/refs.txt', 'a+') as r, \
        #         open(out_dir + '/scores.txt', 'a+') as s:
        #     c.write(output_ln + '\n')
        #     r.write(reference_ln + '\n')
        #     s.write(str(scores['rouge1'].fmeasure) + '\n')

    result = aggregator.aggregate()
    return {k: v.mid.fmeasure for k, v in result.items()}
def text_eval(preds_file,
              model_dir,
              global_step: int = 0,
              eval_tag: str = "",
              enable_logging: bool = True):
    """Evaluates a set of text targets/predictions."""
    scorers_dict = {
        _ROUGE_METRIC:
            rouge_scorer.RougeScorer(
                ["rouge1", "rouge2", "rougeL", "rougeLsum"], use_stemmer=True),
        _BLEU_METRIC:
            bleu_scorer.BleuScorer(),
        _REPETITION_METRIC:
            repetition_scorer.RepetitionScorer(
                ["regs1", "regs2", "regs3", "regsTCR"]),
        _LENGTH_METRIC:
            length_scorer.LengthScorer(["word", "char"])
    }
    aggregators_dict = {k: scoring.BootstrapAggregator() for k in scorers_dict}

    with LogWriter((), model_dir, 0, "", enable_logging) as log_writer:
        with open(preds_file) as csv_file:
            reader = csv.DictReader(csv_file)
            for i, row in enumerate(reader):
                text_dict = {
                    "inputs": row['prompt'],
                    "targets": row['targets'],
                    "predictions": row['predictions']
                }
                log_writer.write(text_dict, i)
                for key, scorer in scorers_dict.items():
                    scores_i = scorer.score(row['targets'], row['predictions'])
                    aggregators_dict[key].add_scores(scores_i)

    aggregates_dict = {k: v.aggregate() for k, v in aggregators_dict.items()}
    length_histograms = scorers_dict[_LENGTH_METRIC].histograms(as_string=True)
    _write_aggregates(model_dir, global_step, eval_tag, aggregates_dict,
                      length_histograms)
    _write_aggregate_summaries(model_dir, global_step, eval_tag, aggregates_dict)
def calculate_rouge(output_lns: List[str], reference_lns: List[str],
                    cleaned_up_tokenization_spaces=False, use_stemmer=True) -> Dict:
    scorer = rouge_scorer.RougeScorer(ROUGE_KEYS, use_stemmer=use_stemmer)
    aggregator = scoring.BootstrapAggregator()
    split_txt = ". " if cleaned_up_tokenization_spaces else " . "

    for reference_ln, output_ln in zip(reference_lns, output_lns):
        # rouge_score expects \n separated sentences within a summary
        reference_ln_formatted = " . \n".join(reference_ln.split(". "))
        output_ln_formatted = " . \n".join(output_ln.split(split_txt))
        scores = scorer.score(reference_ln_formatted, output_ln_formatted)
        aggregator.add_scores(scores)

    result = aggregator.aggregate()
    return {k: round(v.mid.fmeasure * 100, 4) for k, v in result.items()}
def compute_rouge(predictions, references, rouge_types=None, use_stemmer=True):
    if rouge_types is None:
        rouge_types = ["rouge1", "rouge2", "rougeLsum"]

    scorer = rouge_scorer.RougeScorer(
        rouge_types=rouge_types, use_stemmer=use_stemmer)
    aggregator = scoring.BootstrapAggregator()

    for ref, pred in zip(references, predictions):
        score = scorer.score(ref, pred)
        aggregator.add_scores(score)

    result = aggregator.aggregate()
    result = {
        key: round(value.mid.fmeasure * 100, 4)
        for key, value in result.items()
    }
    return result
def rouge_scores(preds: List[List[torch.Tensor]], targets: List[List[torch.Tensor]],
                 tokenizer, use_stemmer=False, use_aggregator=False):
    # largely copied from https://github.com/huggingface/nlp/blob/master/metrics/rouge/rouge.py#L84
    rouge_types = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
    scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=use_stemmer)

    # Note: `refs` is filled from `preds` and `hyps` from `targets`, so the
    # names are swapped relative to their contents; `scorer.score(ref, pred)`
    # below therefore receives (prediction, target).
    refs, hyps = [], []
    for p, t in zip(preds, targets):
        assert len(p) == len(t)
        refs.extend(p)
        hyps.extend(t)

    if use_aggregator:
        aggregator = scoring.BootstrapAggregator()
        scores = None
    else:
        aggregator = None
        scores = []

    for ref, pred in zip(refs, hyps):
        if isinstance(ref, torch.Tensor):
            ref = tokenizer.decode(ref).lower()
        if isinstance(pred, torch.Tensor):
            pred = tokenizer.decode(pred).lower()
        score = scorer.score(ref, pred)
        if use_aggregator:
            aggregator.add_scores(score)
        else:
            scores.append(score)

    if use_aggregator:
        result = aggregator.aggregate()
    else:
        result = {}
        for key in scores[0]:
            result[key] = list(score[key] for score in scores)

    return result
def edit_rouge(targets, predictions):
    """Measures a variety of different ROUGE scores."""
    # We do not measure ROUGE-L for updates since LCS is likely entirely contained
    # in source.
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeLsum"])
    aggregator = scoring.BootstrapAggregator()

    for prediction, target in zip(predictions, targets):
        all_scores = {}

        target_additions = rendering_utils.extract_additions(
            source=target["normalized_inputs"],
            target=target["normalized_targets"],
        )
        target_additions = " ".join(target_additions)
        prediction_additions = rendering_utils.extract_additions(
            source=target["normalized_inputs"],
            target=prediction["normalized_targets"],
        )
        prediction_additions = " ".join(prediction_additions)

        addition_scores = scorer.score(
            target=target_additions,
            prediction=prediction_additions,
        )

        if target_additions.strip() or prediction_additions.strip():
            all_scores.update(
                {f"update_{k}": v for k, v in addition_scores.items()})
        else:
            all_scores.update(
                {f"update_{k}": 100.0 for k, _ in addition_scores.items()})

        aggregator.add_scores(all_scores)

    result = aggregator.aggregate()
    return {key: value.mid.fmeasure * 100 for key, value in result.items()}
def rouge_dict(gathered_dict, target_key='labels', prediction_key='predictions', score_keys=None):
    """Computes rouge score.

    Args:
        gathered_dict: dict holding the gathered evaluation outputs
        target_key: key of the list of target strings in `gathered_dict`
        prediction_key: key of the list of prediction strings in `gathered_dict`
        score_keys: list of strings with the keys to compute.

    Returns:
        dict with score_key: rouge score across all targets and predictions
    """
    targets = gathered_dict[target_key]
    predictions = gathered_dict[prediction_key]

    if score_keys is None:
        score_keys = ["rouge1", "rouge2", "rougeLsum"]
    scorer = rouge_scorer.RougeScorer(score_keys)
    aggregator = scoring.BootstrapAggregator()

    def _prepare_summary(summary):
        # Add newlines between sentences so that rougeLsum is computed correctly.
        summary = summary.replace(" . ", " .\n")
        return summary

    for prediction, target in zip(predictions, targets):
        target = _prepare_summary(target)
        prediction = _prepare_summary(prediction)
        aggregator.add_scores(
            scorer.score(target=target, prediction=prediction))

    result = aggregator.aggregate()
    return {
        key: {
            'score': result[key].mid.fmeasure * 100,
            'count': len(targets)
        } for key in score_keys
    }
def rouge(targets, predictions, score_keys=None):
    """Computes rouge score.

    Args:
        targets: list of strings
        predictions: list of strings
        score_keys: list of strings with the keys to compute.

    Returns:
        dict with score_key: rouge score across all targets and predictions
    """
    if score_keys is None:
        score_keys = ["rouge1", "rouge2", "rougeLsum"]
    scorer = rouge_scorer.RougeScorer(score_keys)
    aggregator = scoring.BootstrapAggregator()

    def _prepare_summary(summary):
        # Make sure the summary is not bytes-type
        summary = tf.compat.as_text(summary)
        # Add newlines between sentences so that rougeLsum is computed correctly.
        summary = summary.replace(" . ", " .\n")
        return summary

    for prediction, target in zip(predictions, targets):
        target = _prepare_summary(target)
        prediction = _prepare_summary(prediction)
        aggregator.add_scores(
            scorer.score(target=target, prediction=prediction))

    result = aggregator.aggregate()
    for key in score_keys:
        logging.info(
            "%s = %.2f, 95%% confidence [%.2f, %.2f]",
            key,
            result[key].mid.fmeasure * 100,
            result[key].low.fmeasure * 100,
            result[key].high.fmeasure * 100,
        )
    return {key: result[key].mid.fmeasure * 100 for key in score_keys}
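# The 95% interval logged above comes from bootstrap resampling of the
# per-example scores inside BootstrapAggregator. A sketch with made-up sentence
# pairs; the constructor arguments (confidence_interval, n_samples) follow the
# rouge_score scoring module, where they default to a 95% interval with 1000
# resamples.
from rouge_score import rouge_scorer, scoring

_r1_scorer = rouge_scorer.RougeScorer(["rouge1"])
_ci_agg = scoring.BootstrapAggregator(confidence_interval=0.9, n_samples=500)
for _t, _p in [("a b c d", "a b c e"), ("f g h", "f g h"), ("i j k", "i j")]:
    _ci_agg.add_scores(_r1_scorer.score(_t, _p))
_r1 = _ci_agg.aggregate()["rouge1"]
print(_r1.low.fmeasure, _r1.mid.fmeasure, _r1.high.fmeasure)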
def compute(self, predictions, references):
    rouge_types = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
    rouge = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=True)
    aggregator = scoring.BootstrapAggregator()

    # TODO expecting pretokenized data, do we want to imitate Rouge-155 tokenizer somehow?
    for refs, pred in zip(references.whitespace_tokenized, predictions.whitespace_tokenized):
        # ROUGE multi-ref jackknifing
        if len(refs) > 1:
            scores = []
            for ref in refs:
                scores.append(rouge.score(ref, pred))

            # get best score for all leave-one-out sets
            best_scores = []
            for leave in range(len(refs)):
                cur_scores = [s for s in scores]
                del cur_scores[leave]
                best_scores.append({
                    rouge_type: max([s[rouge_type] for s in cur_scores],
                                    key=lambda s: s.fmeasure)
                    for rouge_type in rouge_types
                })

            # average the leave-one-out bests to produce the final score
            score = {
                rouge_type: scoring.Score(
                    np.mean([b[rouge_type].precision for b in best_scores]),
                    np.mean([b[rouge_type].recall for b in best_scores]),
                    np.mean([b[rouge_type].fmeasure for b in best_scores]))
                for rouge_type in rouge_types
            }
        else:
            score = rouge.score(refs[0], pred)

        aggregator.add_scores(score)

    result = aggregator.aggregate()

    # convert the named tuples to plain nested dicts
    result = {
        rouge_type: {
            vtype: dict(val._asdict())
            for vtype, val in result[rouge_type]._asdict().items()
        }
        for rouge_type in rouge_types
    }
    return result
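# The jackknifing branch above builds `scoring.Score` tuples by hand and feeds
# them to the aggregator exactly like scores returned by RougeScorer. A small
# sketch of that pattern with made-up numbers:
from rouge_score import scoring

_jack_agg = scoring.BootstrapAggregator()
_jack_agg.add_scores({"rouge1": scoring.Score(precision=0.5, recall=0.4, fmeasure=0.44)})
_jack_agg.add_scores({"rouge1": scoring.Score(precision=0.7, recall=0.6, fmeasure=0.65)})
print(_jack_agg.aggregate()["rouge1"].mid.fmeasure)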
sources.append(source)
target_lists.append(targets)

# Exact and SARI scores
exact = score_lib.compute_exact_score(predictions, target_lists)
sari, keep, addition, deletion = score_lib.compute_sari_scores(
    sources, predictions, target_lists)
print(f'Exact score: {100*exact:.3f}')
print(f'SARI score: {100*sari:.3f}')
print(f' KEEP score: {100*keep:.3f}')
print(f' ADDITION score: {100*addition:.3f}')
print(f' DELETION score: {100*deletion:.3f}')

# ROUGE-L scores
scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
aggregator = scoring.BootstrapAggregator()
scores = []
for target, pred in zip(target_lists, predictions):
    aggregator.add_scores(scorer.score(target[0], pred))
aggregates = aggregator.aggregate()

print("\nROUGE scores:")
print("----------------------------------------------------------------")
print("score_type\t\tlow\t\tmid\t\thigh")
print("----------------------------------------------------------------")
for score_type, aggregate in sorted(aggregates.items()):
    print("%s-Recall: \t%f\t%f\t%f" %
          (score_type, aggregate.low.recall, aggregate.mid.recall,
           aggregate.high.recall))
def calculate_metrics(model):
    _, _, dev = preprocess_QG()
    sentences = pd.DataFrame(dev, columns=['Complex', 'Simple'])
    sentences = sentences.groupby(
        ['Complex']).agg(lambda x: tuple(x)).applymap(list).reset_index()

    questions = []
    with open("/content/tgt-dev.txt", 'r') as f:
        lines = f.readlines()
        for l in lines:
            questions.append(l[:-1])

    contexts = []
    with open("/content/src-dev.txt", 'r') as f:
        lines = f.readlines()
        for l in lines:
            contexts.append(l[:-1])

    filename = "/content/val.source"
    sep = '<sep>'
    with open(filename, 'r') as f:
        lines = f.readlines()

    rouge_scores = []
    sari_scores = []
    results = []
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    aggregator = scoring.BootstrapAggregator()

    for i, line in enumerate(tqdm(lines)):
        if sep in line:
            line = line[:line.find(sep) + len(sep)]
            s = ""
            try:
                s = run_model(line + sep)
                s = s[s.find(sep) + len(sep):]
            except:
                s = run_model(line + sep)
            results.append(s[0])
            ref_questions_idx = [
                i for i, cont in enumerate(contexts)
                if cont + sep == line[:-len(sep)]
            ]
            ref_questions = [questions[i] for i in ref_questions_idx]
            fm_ = [0]
            for r in ref_questions:
                scores = scorer.score(s, r)
                fm_.append(
                    [round(v.fmeasure * 100, 4) for k, v in scores.items()][0])
            rouge_scores.append(max(fm_))
        else:
            s = run_model(line)[0]
            results.append(s)
            ref = list(sentences.loc[sentences['Complex'].str.contains(
                line[:-len(sep)])]['Simple'])
            ref = [str(r[0]) for r in ref]
            sari_scores.append(SARIsent(line, s, ref))

        if i % 10 == 0 and i != 0:
            print('Current avg rouge {}, max = {}'.format(
                np.mean(rouge_scores), np.max(rouge_scores)))
            k = np.argmax(rouge_scores) * 2
            print('Max rouge for context {} Result of the model == {}'.format(
                lines[k], results[k]))
            print('\nCurrent avg sari {}, max = {}'.format(
                np.mean(sari_scores), np.max(sari_scores)))
            k = np.argmax(sari_scores) * 2 + 1
            print('Max sari for context {} Result of the model == {}'.format(
                lines[k], results[k]))

        if i > 1000:
            break