def extract_oracle(extract_list, summ_list):
  """Choose the sentence with maximum average rouge."""
  score_accum = []
  for e in extract_list:
    e_trunc = p2s_eval.get_summary_truncated(
        p2s_eval.get_summary_first_sentence(e),
        p2s_eval.TRUNC_LEN)  # get_summary_first_sentence may not be necessary
    accum_rouge_1r_trunc = 0
    for s in summ_list:
      # For computing accumulative rouge.
      s_e_trunc_score = my_rouge_scorer.score(s, e_trunc)
      accum_rouge_1r_trunc += s_e_trunc_score['rouge1'].recall
    score_accum.append(accum_rouge_1r_trunc)
  e_id_o = np.argmax(score_accum)
  e_o = extract_list[e_id_o]

  # Compute average rouge for the oracle sentence.
  agg = scoring.BootstrapAggregator()
  e_o_trunc = p2s_eval.get_summary_truncated(
      p2s_eval.get_summary_first_sentence(e_o),
      p2s_eval.TRUNC_LEN)  # get_summary_first_sentence may not be necessary
  for s in summ_list:
    e_o_trunc_score = my_rouge_scorer.score(s, e_o_trunc)
    agg.add_scores(e_o_trunc_score)
  agg_o = agg.aggregate()
  score_o = {
      rouge_type: agg_o[rouge_type].mid for rouge_type in agg_o  # mid=mean
  }
  nwords_o = p2s_eval.count_words(e_o)
  return (score_o, nwords_o, e_o)
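# Minimal usage sketch (illustrative only; assumes this module's
# my_rouge_scorer and the p2s_eval helpers are importable, and that
# my_rouge_scorer was built with 'rouge1' among its rouge types). The
# sentences below are hypothetical placeholders.
def _example_extract_oracle_usage():
  extract_list = [
      'Tom went to the park.',
      'He ran into an old friend there.',
  ]
  summ_list = [
      'Tom met an old friend at the park.',
      'Tom ran into a friend while out walking.',
  ]
  score_o, nwords_o, e_o = extract_oracle(extract_list, summ_list)
  print('oracle sentence: %s (%d words)' % (e_o, nwords_o))
  print('ROUGE-1 recall vs. human summaries: %.3f' % score_o['rouge1'].recall)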
def testAssertsOnInvalidRougeTypes(self):
  scorer = rouge_scorer.RougeScorer(["rougex"], False)
  with self.assertRaises(ValueError):
    io.compute_scores_and_write_to_csv(test_util.TARGETS_FILE,
                                       test_util.PREDICTIONS_FILE, "", scorer,
                                       scoring.BootstrapAggregator())
def main(argv):
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')
  with open(FLAGS.mturk_csv) as f:
    ignored = 0
    total = 0
    reader = csv.DictReader(f)
    scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
    aggregators = {}
    for i in range(5):
      aggregators[str(i)] = scoring.BootstrapAggregator()
    for row in reader:
      # TODO(peterjliu): Average across replicas for same example?
      total += 1
      sentences = re.split('[.!]', row[_STORY])
      sentences.pop()
      if len(sentences) != 5:
        # TODO(peterjliu): Just read sentences from raw csv file.
        logging.error('ignored %s %s', sentences, row[_STORY])
        ignored += 1
        continue
      summary = row[_SUMM]
      for i in range(5):
        aggregators[str(i)].add_scores(scorer.score(summary, sentences[i]))
    for i in range(5):
      print('ROUGE-1 for sentence-%d' % i)
      print(aggregators[str(i)].aggregate())
    logging.info('total %d, ignored %d', total, ignored)
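# Hedged illustration of the sentence splitting above (the story text is a
# hypothetical placeholder): splitting on '[.!]' leaves a trailing empty
# string after the final delimiter, which is why sentences.pop() is called.
#   re.split('[.!]', 'S1. S2. S3. S4. S5.')
#   # -> ['S1', ' S2', ' S3', ' S4', ' S5', '']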
def testConfidenceIntervalsAgainstRouge155WithStemmingMultiLine(self):
  scorer = rouge_scorer.RougeScorer(
      ["rouge1", "rouge2", "rougeLsum"], use_stemmer=True)
  aggregator = scoring.BootstrapAggregator()
  t_files = [
      os.path.join(test_util.PYROUGE_DIR, 'target_multi.%d.txt' % i)
      for i in range(0, 250)
  ]
  p_files = [
      os.path.join(test_util.PYROUGE_DIR, 'prediction_multi.%d.txt' % i)
      for i in range(0, 250)
  ]
  targets = [test_util.get_text(x) for x in t_files]
  predictions = [test_util.get_text(x) for x in p_files]
  assert len(targets) == len(predictions)
  assert len(targets) == 250
  for target, prediction in zip(targets, predictions):
    aggregator.add_scores(scorer.score(target, prediction))
  result = aggregator.aggregate()

  # Reference values generated with:
  # DIR = testdata/pyrouge_evaluate_plain_text_files
  # pyrouge_evaluate_plain_text_files -s $DIR -sfp "prediction_multi.(.*).txt"
  #   -m $DIR -mfp target_multi.#ID#.txt
  self.assertSimilarAggregates((0.58963, 0.59877, 0.60822),  # P
                               (0.37327, 0.38091, 0.38914),  # R
                               (0.45607, 0.46411, 0.47244),  # F
                               result["rouge1"])
  self.assertSimilarAggregates((0.35429, 0.36516, 0.37665),  # P
                               (0.22341, 0.23109, 0.23916),  # R
                               (0.27312, 0.28209, 0.29133),  # F
                               result["rouge2"])
  self.assertSimilarAggregates((0.58604, 0.59491, 0.60444),  # P
                               (0.37084, 0.37846, 0.38671),  # R
                               (0.45305, 0.46113, 0.46946),  # F
                               result["rougeLsum"])
def testMultipleRougeTypes(self):
  scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=False)
  aggregator = scoring.BootstrapAggregator()
  for target, prediction in zip(self.targets[:5], self.predictions[:5]):
    aggregator.add_scores(scorer.score(target, prediction))
  result = aggregator.aggregate()
  self.assertSameElements(list(result.keys()), ["rouge1", "rougeL"])
def testConfidenceIntervalsAgainstRouge155(self):
  scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=False)
  aggregator = scoring.BootstrapAggregator()
  for target, prediction in zip(self.targets, self.predictions):
    aggregator.add_scores(scorer.score(target, prediction))
  result = aggregator.aggregate()
  self.assertSimilarAggregates((0.48695, 0.49879, 0.51131),
                               (0.31106, 0.31950, 0.32849),
                               (0.37614, 0.38554, 0.39581),
                               result["rouge1"])
def testConfidenceIntervalsAgainstRouge155WithStemming(self):
  scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
  aggregator = scoring.BootstrapAggregator()
  for target, prediction in zip(self.targets, self.predictions):
    aggregator.add_scores(scorer.score(target, prediction))
  result = aggregator.aggregate()
  self.assertSimilarAggregates((0.51027, 0.52434, 0.53788),
                               (0.32563, 0.33580, 0.34548),
                               (0.39380, 0.40524, 0.41661),
                               result["rouge1"])
def main(argv): if len(argv) > 1: raise app.UsageError("Too many command-line arguments.") scorer = rouge_scorer.RougeScorer(FLAGS.rouge_types, FLAGS.use_stemmer) aggregator = scoring.BootstrapAggregator() if FLAGS.aggregate else None io.compute_scores_and_write_to_csv(FLAGS.target_filepattern, FLAGS.prediction_filepattern, FLAGS.output_filename, scorer, aggregator, delimiter=FLAGS.delimiter)
def testProducesValidOutput(self):
  with tempfile.NamedTemporaryFile() as output_file:
    output_filename = output_file.name
    scorer = rouge_scorer.RougeScorer(["rouge1"], False)
    io.compute_scores_and_write_to_csv(test_util.TARGETS_FILE,
                                       test_util.PREDICTIONS_FILE,
                                       output_filename, scorer,
                                       scoring.BootstrapAggregator())
    with open(output_filename) as f:
      csv_lines = f.readlines()
    output_types = tuple((line.split(",")[0] for line in csv_lines))
    self.assertEqual(output_types[0], "score_type")
    self.assertSameElements(output_types[1:],
                            ["rouge1-P", "rouge1-R", "rouge1-F"])
def extract_ave(e, summ_list):
  """Average rouge between the ith story sentence and the human summaries."""
  agg = scoring.BootstrapAggregator()
  e_trunc = p2s_eval.get_summary_truncated(
      p2s_eval.get_summary_first_sentence(e),
      p2s_eval.TRUNC_LEN)  # get_summary_first_sentence may not be necessary
  for s in summ_list:
    s_e_trunc_score = my_rouge_scorer.score(s, e_trunc)
    agg.add_scores(s_e_trunc_score)
  agg_ave = agg.aggregate()
  score_ave = {
      rouge_type: agg_ave[rouge_type].mid for rouge_type in agg_ave  # mid=mean
  }
  nwords_e = p2s_eval.count_words(e)
  return (score_ave, nwords_e)
def testConsistentPercentiles(self):
  aggregator = scoring.BootstrapAggregator(confidence_interval=0.9)
  aggregator.add_scores(
      {"rouge1": scoring.Score(precision=1, recall=1 / 3, fmeasure=1 / 2)})
  aggregator.add_scores(
      {"rouge1": scoring.Score(precision=0, recall=0, fmeasure=0)})
  aggregator.add_scores(
      {"rouge1": scoring.Score(precision=1, recall=1, fmeasure=1)})
  result = aggregator.aggregate()
  self.assertSimilarAggregates((1 / 3, 2 / 3, 3 / 3),
                               (1 / 9, 4 / 9, 7 / 9),
                               (1 / 6, 3 / 6, 5 / 6),
                               result["rouge1"],
                               delta=1e-8)
def testLargeConfidence(self):
  aggregator = scoring.BootstrapAggregator(confidence_interval=0.0)
  aggregator.add_scores(
      {"rouge1": scoring.Score(precision=1, recall=1 / 3, fmeasure=1 / 2)})
  aggregator.add_scores(
      {"rouge1": scoring.Score(precision=0, recall=0, fmeasure=0)})
  aggregator.add_scores(
      {"rouge1": scoring.Score(precision=1, recall=1, fmeasure=1)})
  result = aggregator.aggregate()
  self.assertSimilarAggregates((2 / 3, 2 / 3, 2 / 3),
                               (4 / 9, 4 / 9, 4 / 9),
                               (3 / 6, 3 / 6, 3 / 6),
                               result["rouge1"],
                               delta=1e-8)
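# Hedged note on the expected aggregates above (based on how the bootstrap
# aggregator is understood to work, not stated in the test itself): with
# confidence_interval=0.0 both percentile bounds collapse onto the 50th
# percentile of the bootstrap sample means, so low == mid == high. For the
# three added scores the medians of the resampled means are 2/3 (precision),
# 4/9 (recall) and 3/6 (fmeasure), matching the asserted triples.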
def human_ave(summ_list):
  """Average pairwise rouge between two human summaries."""
  agg = scoring.BootstrapAggregator()
  for s1_id, s1 in enumerate(summ_list):
    for s2_id, s2 in enumerate(summ_list):
      if s1_id >= s2_id:  # Only compute for s1_id < s2_id.
        continue
      s2_trunc = p2s_eval.get_summary_truncated(
          p2s_eval.get_summary_first_sentence(s2), p2s_eval.TRUNC_LEN)
      s1_s2_trunc_score = my_rouge_scorer.score(s1, s2_trunc)
      agg.add_scores(s1_s2_trunc_score)
  agg_ave = agg.aggregate()
  score_ave = {
      rouge_type: agg_ave[rouge_type].mid for rouge_type in agg_ave  # mid=mean
  }
  nwords_ave = np.mean([p2s_eval.count_words(s) for s in summ_list])
  return (score_ave, nwords_ave)
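# Hedged note on the pairing above: with three human summaries per example the
# double loop visits the C(3, 2) = 3 unordered pairs, scoring each pair once
# with the first summary as the reference and the truncated second summary as
# the prediction, so the reported mid values average those three pairwise
# scores.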
def compute_metrics(self, model_sent2summ):
  """Compute metrics.

  Args:
    model_sent2summ: dict of int list to list of summaries

  Returns:
    Metrics object.
  """
  # TODO(peterjliu): Check that we have the right number of examples.
  if len(list(model_sent2summ.keys())) != len(list(self.sent2summ.keys())):
    logging.info('number of keys mismatch: %d %d',
                 len(list(model_sent2summ.keys())),
                 len(list(self.sent2summ.keys())))
  targets = []
  predictions = []
  agg = scoring.BootstrapAggregator()
  t_agg = scoring.BootstrapAggregator()
  for first_sent, model_summary in six.iteritems(model_sent2summ):
    first_model_summary = get_summary_first_sentence(model_summary)
    trunc_model_summary = get_summary_truncated(first_model_summary, TRUNC_LEN)
    try:
      for s in self.sent2summ[first_sent]:
        agg.add_scores(
            self.rouge_scorer.score(
                s,  # reference first
                model_summary))
        t_agg.add_scores(
            self.rouge_scorer.score(
                s,  # reference first
                trunc_model_summary))
        targets.append(s)
        predictions.append(model_summary)
    except KeyError:
      logging.error('key not found %s', first_sent)
      raise Exception('key not found %s. %s' %
                      (str(first_sent), str(list(self.sent2summ.keys()))))
  rouge_scores = agg.aggregate()
  trunc_rouge_scores = t_agg.aggregate()

  m = Metrics()
  m.add_metric(Metrics.ROUGE1_F, rouge_scores['rouge1'].mid.fmeasure)
  m.add_metric(Metrics.ROUGE1_R, rouge_scores['rouge1'].mid.recall)
  m.add_metric(Metrics.ROUGE1_P, rouge_scores['rouge1'].mid.precision)
  m.add_metric(Metrics.ROUGE2_F, rouge_scores['rouge2'].mid.fmeasure)
  m.add_metric(Metrics.ROUGE2_R, rouge_scores['rouge2'].mid.recall)
  m.add_metric(Metrics.ROUGE2_P, rouge_scores['rouge2'].mid.precision)
  m.add_metric(Metrics.ROUGEL_F, rouge_scores['rougeL'].mid.fmeasure)
  m.add_metric(Metrics.ROUGEL_R, rouge_scores['rougeL'].mid.recall)
  m.add_metric(Metrics.ROUGEL_P, rouge_scores['rougeL'].mid.precision)
  # Truncated rouge.
  m.add_metric(Metrics.ROUGE1_R_TRUNC, trunc_rouge_scores['rouge1'].mid.recall)
  m.add_metric(Metrics.ROUGE2_R_TRUNC, trunc_rouge_scores['rouge2'].mid.recall)
  m.add_metric(Metrics.ROUGEL_R_TRUNC, trunc_rouge_scores['rougeL'].mid.recall)
  m.add_metric(Metrics.ROUGE1_F_TRUNC,
               trunc_rouge_scores['rouge1'].mid.fmeasure)
  m.add_metric(Metrics.ROUGE2_F_TRUNC,
               trunc_rouge_scores['rouge2'].mid.fmeasure)
  m.add_metric(Metrics.ROUGEL_F_TRUNC,
               trunc_rouge_scores['rougeL'].mid.fmeasure)
  m.add_metric(
      Metrics.PERIODS,
      sum([get_summary_n_periods(s) for s in predictions]) / len(predictions))
  m.add_metric(
      Metrics.WORDS,
      sum([count_words(s) for s in predictions]) / len(predictions))
  return m
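# Hedged illustration of the expected input shape (the keys and summaries are
# hypothetical; the docstring only says "dict of int list to list of
# summaries", so tuple keys of token ids are an assumption):
#   model_sent2summ = {
#       (17, 52, 9): ['model summary for the first story.'],
#       (4, 88, 23): ['model summary for the second story.'],
#   }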
def main(argv):
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')
  tf.io.gfile.mkdir(FLAGS.output_dir)

  data_file = os.path.join(
      FLAGS.data_dir,
      'rocstories_gt.' + six.ensure_str(FLAGS.eval_subset) + '.tfrecord')
  seq_ex_list = util.get_seq_exs(data_file)
  print('Input data %s' % data_file)

  # Human summary baselines.
  # We have 3 human summaries for each example, and 2 human performance
  # variants:
  #   1. 'a': average pairwise rouge between two summaries
  #   2. 'm': maximum pairwise rouge between any two summaries
  agg_human = {}
  nwords_human = {}
  for h in ['a', 'm']:
    agg_human[h] = scoring.BootstrapAggregator()
    nwords_human[h] = []

  # Extractive baselines.
  #   1. '0'..'4': rouge between the ith story sentence and the human summaries
  #   2. 'o': for each example, choose the sentence with maximum average rouge
  agg_extract = {}
  nwords_extract = {}
  for e in [str(x) for x in list(range(5))] + ['o']:
    agg_extract[e] = scoring.BootstrapAggregator()
    nwords_extract[e] = []

  # Evaluate each example against the human and extractive baselines.
  sent2oracle = {}
  for ex in seq_ex_list:
    summ_list = p2s_eval.get_summaries(ex)
    summ_list = [x.decode('utf-8') for x in summ_list]

    # Human eval.
    score, nwords = human_ave(summ_list)
    agg_human['a'].add_scores(score)
    nwords_human['a'].append(nwords)

    score, nwords = human_max(summ_list)
    agg_human['m'].add_scores(score)
    nwords_human['m'].append(nwords)

    # Extractive eval.
    extract_list = get_extracts(ex)
    extract_list = [x.decode('utf-8') for x in extract_list]
    for e_id, e in enumerate(extract_list):
      score, nwords = extract_ave(e, summ_list)
      agg_extract[str(e_id)].add_scores(score)
      nwords_extract[str(e_id)].append(nwords)

    score, nwords, e_o = extract_oracle(extract_list, summ_list)
    agg_extract['o'].add_scores(score)
    nwords_extract['o'].append(nwords)

    # Save story and oracle sentence for future use.
    first = p2s_eval.get_first_sentence(ex)
    if first in sent2oracle:
      logging.fatal('duplicate first sentence: %s', str(first))
    sent2oracle[first] = (' '.join(extract_list), e_o)  # (story, oracle)

  # Write each example and the corresponding oracle to disk.
  tk, _ = util.get_tokenizer_with_special(FLAGS.vocab_file, [])

  def detok(s):
    return tk.decode(util.strip_after_eos(s))

  keys_sorted = sorted(sent2oracle.keys(), key=detok)

  out_file = os.path.join(
      FLAGS.output_dir, 'rocstories_gt.' + six.ensure_str(FLAGS.eval_subset) +
      '.firstsent2oracle.txt')
  with tf.gfile.Open(out_file, 'w') as f:
    for k in keys_sorted:
      f.write('%s\n' % (sent2oracle[k][1]))

  # Print out rouge scores for human and extractive performance.
  print_agg_score('human average', agg_human['a'], nwords_human['a'])
  print_agg_score('human max', agg_human['m'], nwords_human['m'])
  for e_id in range(5):
    print_agg_score('extractive baseline{}'.format(e_id),
                    agg_extract[str(e_id)], nwords_extract[str(e_id)])
  print_agg_score('extractive oracle', agg_extract['o'], nwords_extract['o'])
def testAssertsOnInvalidInputFiles(self):
  scorer = rouge_scorer.RougeScorer(["rouge1"], False)
  with self.assertRaises(ValueError):
    io.compute_scores_and_write_to_csv("invalid*", "invalid*", "invalid",
                                       scorer, scoring.BootstrapAggregator())