def extract_oracle(extract_list, summ_list):
    """Choose sentence with maximum average rouge."""
    score_accum = []
    for e in extract_list:
        e_trunc = p2s_eval.get_summary_truncated(
            p2s_eval.get_summary_first_sentence(e), p2s_eval.TRUNC_LEN
        )  # get_summary_first_sentence may not be necessary

        accum_rouge_1r_trunc = 0
        for s in summ_list:
            s_e_trunc_score = my_rouge_scorer.score(s, e_trunc)
            # for computing accumulative rouge
            accum_rouge_1r_trunc += s_e_trunc_score['rouge1'].recall
        score_accum.append(accum_rouge_1r_trunc)
    e_id_o = np.argmax(score_accum)
    e_o = extract_list[e_id_o]

    # Compute average rouge for the oracle sentence
    agg = scoring.BootstrapAggregator()
    e_o_trunc = p2s_eval.get_summary_truncated(
        p2s_eval.get_summary_first_sentence(e_o),
        p2s_eval.TRUNC_LEN)  # get_summary_first_sentence may not be necessary
    for s in summ_list:
        e_o_trunc_score = my_rouge_scorer.score(s, e_o_trunc)
        agg.add_scores(e_o_trunc_score)
    agg_o = agg.aggregate()
    score_o = {
        rouge_type: agg_o[rouge_type].mid
        for rouge_type in agg_o  # mid=mean
    }
    nwords_o = p2s_eval.count_words(e_o)
    return (score_o, nwords_o, e_o)
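The helpers in these excerpts rely on module-level setup that is not shown. A minimal sketch of the assumed imports and globals, inferred from usage (the p2s_eval module belongs to the original project and is not reproduced here; the ROUGE types passed to the shared scorer are an assumption):

# Assumed module-level setup for the excerpts in this section; names are
# inferred from usage, not copied from the original file.
import numpy as np

# Depending on how the ROUGE package is installed, the import path may be
# 'rouge' instead of 'rouge_score'.
from rouge_score import rouge_scorer
from rouge_score import scoring

# p2s_eval (not shown) provides get_summary_first_sentence,
# get_summary_truncated, count_words and TRUNC_LEN.

# Shared scorer used by extract_oracle, extract_ave and human_ave below;
# the exact rouge types are an assumption.
my_rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'],
                                           use_stemmer=True)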
Example no. 2
  def testAssertsOnInvalidRougeTypes(self):
    scorer = rouge_scorer.RougeScorer(["rougex"], False)
    with self.assertRaises(ValueError):
      io.compute_scores_and_write_to_csv(test_util.TARGETS_FILE,
                                         test_util.PREDICTIONS_FILE, "",
                                         scorer,
                                         scoring.BootstrapAggregator())
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    with open(FLAGS.mturk_csv) as f:
        ignored = 0
        total = 0
        reader = csv.DictReader(f)

        scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
        aggregators = {}
        for i in range(5):
            aggregators[str(i)] = scoring.BootstrapAggregator()
        for row in reader:
            # TODO(peterjliu): Average across replicas for same example?
            total += 1
            sentences = re.split('[.!]', row[_STORY])
            sentences.pop()
            if len(sentences) != 5:
                # TODO(peterjliu): Just read sentences from raw csv file.
                logging.error('ignored %s %s', sentences, row[_STORY])
                ignored += 1
                continue
            summary = row[_SUMM]
            for i in range(5):
                aggregators[str(i)].add_scores(
                    scorer.score(summary, sentences[i]))
        for i in range(5):
            print('ROUGE-1 for sentence-%d' % i)
            print(aggregators[str(i)].aggregate())
        logging.info('total %d, ignored %d', total, ignored)
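This main() reads a Mechanical Turk results CSV and assumes absl flags and column-name constants defined elsewhere in the script. A hedged sketch of that setup; the flag help text and the _STORY/_SUMM column names are assumptions:

# Assumed flag and constant definitions for the MTurk CSV main() above.
import csv
import re

from absl import app
from absl import flags
from absl import logging

from rouge_score import rouge_scorer
from rouge_score import scoring

FLAGS = flags.FLAGS
flags.DEFINE_string('mturk_csv', None, 'MTurk results CSV to score.')

# Hypothetical CSV column names holding the 5-sentence story and the
# worker-written summary.
_STORY = 'Input.story'
_SUMM = 'Answer.summary'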
Example no. 4
  def testConfidenceIntervalsAgainstRouge155WithStemmingMultiLine(self):
    scorer = rouge_scorer.RougeScorer(
        ["rouge1", "rouge2", "rougeLsum"], use_stemmer=True)
    aggregator = scoring.BootstrapAggregator()
    t_files = [
        os.path.join(test_util.PYROUGE_DIR, 'target_multi.%d.txt' % i)
        for i in range(0, 250)
    ]
    p_files = [
        os.path.join(test_util.PYROUGE_DIR, 'prediction_multi.%d.txt' % i)
        for i in range(0, 250)
    ]

    targets = [test_util.get_text(x) for x in t_files]
    predictions = [test_util.get_text(x) for x in p_files]
    assert len(targets) == len(predictions)
    assert len(targets) == 250
    for target, prediction in zip(targets, predictions):
      aggregator.add_scores(scorer.score(target, prediction))
    result = aggregator.aggregate()

    # DIR = testdata/pyrouge_evaluate_plain_text_files
    #  pyrouge_evaluate_plain_text_files -s $DIR -sfp "prediction_multi.(.*).txt"
    #    -m $DIR -mfp target_multi.#ID#.txt
    self.assertSimilarAggregates((0.58963, 0.59877, 0.60822),    # P
                                 (0.37327, 0.38091, 0.38914),    # R
                                 (0.45607, 0.46411, 0.47244),    # F
                                 result["rouge1"])
    self.assertSimilarAggregates((0.35429, 0.36516, 0.37665),    # P
                                 (0.22341, 0.23109, 0.23916),    # R
                                 (0.27312, 0.28209, 0.29133),    # F
                                 result["rouge2"])
    self.assertSimilarAggregates((0.58604, 0.59491, 0.60444),    # P
                                 (0.37084, 0.37846, 0.38671),    # R
                                 (0.45305, 0.46113, 0.46946),    # F
                                 result["rougeLsum"])
Example no. 5
  def testMultipleRougeTypes(self):
    scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=False)
    aggregator = scoring.BootstrapAggregator()
    for target, prediction in zip(self.targets[:5], self.predictions[:5]):
      aggregator.add_scores(scorer.score(target, prediction))
    result = aggregator.aggregate()

    self.assertSameElements(list(result.keys()), ["rouge1", "rougeL"])

  def testConfidenceIntervalsAgainstRouge155(self):
    scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=False)
    aggregator = scoring.BootstrapAggregator()
    for target, prediction in zip(self.targets, self.predictions):
      aggregator.add_scores(scorer.score(target, prediction))
    result = aggregator.aggregate()

    self.assertSimilarAggregates(
        (0.48695, 0.49879, 0.51131), (0.31106, 0.31950, 0.32849),
        (0.37614, 0.38554, 0.39581), result["rouge1"])
Example no. 7
    def testConfidenceIntervalsAgainstRouge155WithStemming(self):
        scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
        aggregator = scoring.BootstrapAggregator()
        for target, prediction in zip(self.targets, self.predictions):
            aggregator.add_scores(scorer.score(target, prediction))
        result = aggregator.aggregate()

        self.assertSimilarAggregates(
            (0.51027, 0.52434, 0.53788), (0.32563, 0.33580, 0.34548),
            (0.39380, 0.40524, 0.41661), result["rouge1"])
Example no. 8
def main(argv):
    if len(argv) > 1:
        raise app.UsageError("Too many command-line arguments.")
    scorer = rouge_scorer.RougeScorer(FLAGS.rouge_types, FLAGS.use_stemmer)
    aggregator = scoring.BootstrapAggregator() if FLAGS.aggregate else None
    io.compute_scores_and_write_to_csv(FLAGS.target_filepattern,
                                       FLAGS.prediction_filepattern,
                                       FLAGS.output_filename,
                                       scorer,
                                       aggregator,
                                       delimiter=FLAGS.delimiter)
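The FLAGS used by this command-line entry point are defined alongside it in the ROUGE package; a sketch of definitions consistent with how main() uses them (defaults and help strings here are illustrative assumptions, not the library's exact text):

from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_string('target_filepattern', None, 'Files containing target text.')
flags.DEFINE_string('prediction_filepattern', None,
                    'Files containing prediction text.')
flags.DEFINE_string('output_filename', None, 'CSV file to write results to.')
flags.DEFINE_string('delimiter', '\n', 'Record delimiter inside each file.')
flags.DEFINE_list('rouge_types', ['rouge1', 'rouge2', 'rougeL'],
                  'ROUGE types to compute.')
flags.DEFINE_boolean('use_stemmer', False, 'Whether to apply Porter stemming.')
flags.DEFINE_boolean('aggregate', True,
                     'Whether to aggregate scores with bootstrap resampling.')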
Example no. 9
  def testProducesValidOutput(self):
    with tempfile.NamedTemporaryFile() as output_file:
      output_filename = output_file.name
      scorer = rouge_scorer.RougeScorer(["rouge1"], False)
      io.compute_scores_and_write_to_csv(test_util.TARGETS_FILE,
                                         test_util.PREDICTIONS_FILE,
                                         output_filename, scorer,
                                         scoring.BootstrapAggregator())
      with open(output_filename) as f:
        csv_lines = f.readlines()
      output_types = tuple(line.split(",")[0] for line in csv_lines)
      self.assertEqual(output_types[0], "score_type")
      self.assertSameElements(output_types[1:],
                              ["rouge1-P", "rouge1-R", "rouge1-F"])
def extract_ave(e, summ_list):
    """Average rouge between ith sentence and human summaries."""
    agg = scoring.BootstrapAggregator()
    e_trunc = p2s_eval.get_summary_truncated(
        p2s_eval.get_summary_first_sentence(e),
        p2s_eval.TRUNC_LEN)  # get_summary_first_sentence may not be necessary
    for s in summ_list:
        s_e_trunc_score = my_rouge_scorer.score(s, e_trunc)
        agg.add_scores(s_e_trunc_score)
    agg_ave = agg.aggregate()
    score_ave = {
        rouge_type: agg_ave[rouge_type].mid
        for rouge_type in agg_ave  # mid=mean
    }
    nwords_e = p2s_eval.count_words(e)
    return (score_ave, nwords_e)
    def testConsistentPercentiles(self):
        aggregator = scoring.BootstrapAggregator(confidence_interval=0.9)
        aggregator.add_scores({
            "rouge1":
            scoring.Score(precision=1, recall=1 / 3, fmeasure=1 / 2)
        })
        aggregator.add_scores(
            {"rouge1": scoring.Score(precision=0, recall=0, fmeasure=0)})
        aggregator.add_scores(
            {"rouge1": scoring.Score(precision=1, recall=1, fmeasure=1)})
        result = aggregator.aggregate()

        self.assertSimilarAggregates((1 / 3, 2 / 3, 3 / 3),
                                     (1 / 9, 4 / 9, 7 / 9),
                                     (1 / 6, 3 / 6, 5 / 6),
                                     result["rouge1"],
                                     delta=1e-8)
    def testLargeConfidence(self):
        aggregator = scoring.BootstrapAggregator(confidence_interval=0.0)
        aggregator.add_scores({
            "rouge1":
            scoring.Score(precision=1, recall=1 / 3, fmeasure=1 / 2)
        })
        aggregator.add_scores(
            {"rouge1": scoring.Score(precision=0, recall=0, fmeasure=0)})
        aggregator.add_scores(
            {"rouge1": scoring.Score(precision=1, recall=1, fmeasure=1)})
        result = aggregator.aggregate()

        self.assertSimilarAggregates((2 / 3, 2 / 3, 2 / 3),
                                     (4 / 9, 4 / 9, 4 / 9),
                                     (3 / 6, 3 / 6, 3 / 6),
                                     result["rouge1"],
                                     delta=1e-8)
def human_ave(summ_list):
    """Average pairwise rouge between two human summaries."""
    agg = scoring.BootstrapAggregator()
    for s1_id, s1 in enumerate(summ_list):
        for s2_id, s2 in enumerate(summ_list):
            if s1_id >= s2_id:  # only compute for s1_id < s2_id
                continue
            s2_trunc = p2s_eval.get_summary_truncated(
                p2s_eval.get_summary_first_sentence(s2), p2s_eval.TRUNC_LEN)
            s1_s2_trunc_score = my_rouge_scorer.score(s1, s2_trunc)
            agg.add_scores(s1_s2_trunc_score)
    agg_ave = agg.aggregate()
    score_ave = {
        rouge_type: agg_ave[rouge_type].mid
        for rouge_type in agg_ave  # mid=mean
    }
    nwords_ave = np.mean([p2s_eval.count_words(s) for s in summ_list])
    return (score_ave, nwords_ave)
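main() further below also calls human_max ('m': maximum pairwise rouge between any two summaries), which is not included in these excerpts. A minimal sketch by analogy with human_ave; the choice of ROUGE-1 recall as the selection criterion and the reported word count are assumptions:

def human_max(summ_list):
    """Maximum pairwise rouge between two human summaries (hypothetical sketch)."""
    # Mirrors human_ave, but keeps the scores of the best-scoring pair
    # instead of aggregating over all pairs.
    best_score = None
    best_recall = -1.0
    for s1_id, s1 in enumerate(summ_list):
        for s2_id, s2 in enumerate(summ_list):
            if s1_id >= s2_id:  # only compute for s1_id < s2_id
                continue
            s2_trunc = p2s_eval.get_summary_truncated(
                p2s_eval.get_summary_first_sentence(s2), p2s_eval.TRUNC_LEN)
            s1_s2_trunc_score = my_rouge_scorer.score(s1, s2_trunc)
            if s1_s2_trunc_score['rouge1'].recall > best_recall:
                best_recall = s1_s2_trunc_score['rouge1'].recall
                best_score = s1_s2_trunc_score
    # Reporting the mean summary length, like human_ave, is an assumption.
    nwords_max = np.mean([p2s_eval.count_words(s) for s in summ_list])
    return (best_score, nwords_max)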
Example no. 14
    def compute_metrics(self, model_sent2summ):
        """Compute metrics.

    Args:
      model_sent2summ: dict of int list to list of summaries

    Returns:
      Metrics object.
    """
        # TODO(peterjliu): Check that we have the right number of examples.
        if len(model_sent2summ) != len(self.sent2summ):
            logging.info('number of keys mismatch: %d %d',
                         len(model_sent2summ), len(self.sent2summ))

        targets = []
        predictions = []
        agg = scoring.BootstrapAggregator()
        t_agg = scoring.BootstrapAggregator()
        for first_sent, model_summary in six.iteritems(model_sent2summ):
            first_model_summary = get_summary_first_sentence(model_summary)
            trunc_model_summary = get_summary_truncated(
                first_model_summary, TRUNC_LEN)
            try:
                for s in self.sent2summ[first_sent]:
                    agg.add_scores(
                        self.rouge_scorer.score(
                            s,  # reference first
                            model_summary))
                    t_agg.add_scores(
                        self.rouge_scorer.score(
                            s,  # reference first
                            trunc_model_summary))
                    targets.append(s)
                    predictions.append(model_summary)
            except KeyError:
                logging.error('key not found %s', first_sent)
                raise Exception(
                    'key not found %s. %s' %
                    (str(first_sent), str(list(self.sent2summ.keys()))))

        rouge_scores = agg.aggregate()
        trunc_rouge_scores = t_agg.aggregate()

        m = Metrics()
        m.add_metric(Metrics.ROUGE1_F, rouge_scores['rouge1'].mid.fmeasure)
        m.add_metric(Metrics.ROUGE1_R, rouge_scores['rouge1'].mid.recall)
        m.add_metric(Metrics.ROUGE1_P, rouge_scores['rouge1'].mid.precision)

        m.add_metric(Metrics.ROUGE2_F, rouge_scores['rouge2'].mid.fmeasure)
        m.add_metric(Metrics.ROUGE2_R, rouge_scores['rouge2'].mid.recall)
        m.add_metric(Metrics.ROUGE2_P, rouge_scores['rouge2'].mid.precision)

        m.add_metric(Metrics.ROUGEL_F, rouge_scores['rougeL'].mid.fmeasure)
        m.add_metric(Metrics.ROUGEL_R, rouge_scores['rougeL'].mid.recall)
        m.add_metric(Metrics.ROUGEL_P, rouge_scores['rougeL'].mid.precision)

        # Truncated rouge
        m.add_metric(Metrics.ROUGE1_R_TRUNC,
                     trunc_rouge_scores['rouge1'].mid.recall)
        m.add_metric(Metrics.ROUGE2_R_TRUNC,
                     trunc_rouge_scores['rouge2'].mid.recall)
        m.add_metric(Metrics.ROUGEL_R_TRUNC,
                     trunc_rouge_scores['rougeL'].mid.recall)
        m.add_metric(Metrics.ROUGE1_F_TRUNC,
                     trunc_rouge_scores['rouge1'].mid.fmeasure)
        m.add_metric(Metrics.ROUGE2_F_TRUNC,
                     trunc_rouge_scores['rouge2'].mid.fmeasure)
        m.add_metric(Metrics.ROUGEL_F_TRUNC,
                     trunc_rouge_scores['rougeL'].mid.fmeasure)

        m.add_metric(
            Metrics.PERIODS,
            sum([get_summary_n_periods(s)
                 for s in predictions]) / len(predictions))
        m.add_metric(
            Metrics.WORDS,
            sum([count_words(s) for s in predictions]) / len(predictions))
        return m
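compute_metrics relies on small text helpers (get_summary_first_sentence, get_summary_truncated, get_summary_n_periods, count_words, TRUNC_LEN) defined elsewhere in the module. Rough, hypothetical reconstructions consistent with how they are used; the tokenization, sentence-splitting, and truncation-length choices are assumptions:

# Hypothetical reconstructions; the real module may tokenize or split
# sentences differently.
TRUNC_LEN = 30  # assumed truncation length, in words


def count_words(text):
    # Whitespace tokenization is an assumption.
    return len(text.split())


def get_summary_first_sentence(summary):
    # Assumes sentences end with '.'; keeps everything up to the first one.
    period_idx = summary.find('.')
    return summary if period_idx == -1 else summary[:period_idx + 1]


def get_summary_truncated(summary, trunc_len):
    # Keeps at most trunc_len whitespace-separated tokens.
    return ' '.join(summary.split()[:trunc_len])


def get_summary_n_periods(summary):
    # Used above as a rough proxy for the number of sentences.
    return summary.count('.')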
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    tf.io.gfile.mkdir(FLAGS.output_dir)

    data_file = os.path.join(
        FLAGS.data_dir,
        'rocstories_gt.' + six.ensure_str(FLAGS.eval_subset) + '.tfrecord')
    seq_ex_list = util.get_seq_exs(data_file)
    print('Input data %s' % data_file)

    # Human summary baselines.
    # We have 3 human summaries for each example, and
    # 2 human performance variants:
    #   1. 'a': average pairwise rouge between two summaries
    #   2. 'm': maximum pairwise rouge between any two summaries
    agg_human = {}
    nwords_human = {}
    for h in ['a', 'm']:
        agg_human[h] = scoring.BootstrapAggregator()
        nwords_human[h] = []

    # Extractive baselines
    #   1. '1','2','3','4','5': rouge between ith sentence and human summary
    #   2. 'o': for each example, choose sentence with maximum average rouge
    agg_extract = {}
    nwords_extract = {}
    for e in [str(x) for x in range(5)] + ['o']:
        agg_extract[e] = scoring.BootstrapAggregator()
        nwords_extract[e] = []

    # human performance
    sent2oracle = {}
    for ex in seq_ex_list:
        summ_list = p2s_eval.get_summaries(ex)
        summ_list = [x.decode('utf-8') for x in summ_list]

        # human eval
        score, nwords = human_ave(summ_list)
        agg_human['a'].add_scores(score)
        nwords_human['a'].append(nwords)

        score, nwords = human_max(summ_list)
        agg_human['m'].add_scores(score)
        nwords_human['m'].append(nwords)

        # extractive eval
        extract_list = get_extracts(ex)
        extract_list = [x.decode('utf-8') for x in extract_list]
        for e_id, e in enumerate(extract_list):
            score, nwords = extract_ave(e, summ_list)
            agg_extract[str(e_id)].add_scores(score)
            nwords_extract[str(e_id)].append(nwords)

        score, nwords, e_o = extract_oracle(extract_list, summ_list)
        agg_extract['o'].add_scores(score)
        nwords_extract['o'].append(nwords)

        # save story and oracle sentence for future use
        first = p2s_eval.get_first_sentence(ex)
        if first in sent2oracle:
            logging.fatal('duplicate first sentence: %s', str(first))
        sent2oracle[first] = (' '.join(extract_list), e_o)  # (story, oracle)

    # write each example and the corresponding oracle to disk
    tk, _ = util.get_tokenizer_with_special(FLAGS.vocab_file, [])

    def detok(s):
        return tk.decode(util.strip_after_eos(s))

    keys_sorted = sorted(sent2oracle.keys(), key=detok)

    out_file = os.path.join(
        FLAGS.output_dir, 'rocstories_gt.' +
        six.ensure_str(FLAGS.eval_subset) + '.firstsent2oracle.txt')
    with tf.io.gfile.GFile(out_file, 'w') as f:
        for k in keys_sorted:
            f.write('%s\n' % (sent2oracle[k][1]))

    # print out rouge scores for human performance
    print_agg_score('human average', agg_human['a'], nwords_human['a'])
    print_agg_score('human max', agg_human['m'], nwords_human['m'])
    for e_id in range(5):
        print_agg_score('extractive baseline{}'.format(e_id),
                        agg_extract[str(e_id)], nwords_extract[str(e_id)])
    print_agg_score('extractive oracle', agg_extract['o'], nwords_extract['o'])
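print_agg_score, used just above to report each baseline, is not part of these excerpts either. A minimal sketch consistent with its call sites (label, aggregator, list of word counts); the output format and the use of mid (mean) scores are assumptions:

def print_agg_score(label, agg, nwords):
    """Hypothetical sketch of the reporting helper called in main() above."""
    result = agg.aggregate()
    print('%s: avg words=%.1f' % (label, np.mean(nwords)))
    for rouge_type in sorted(result):
        mid = result[rouge_type].mid  # mid = mean over bootstrap samples
        print('  %s: P=%.4f R=%.4f F=%.4f' %
              (rouge_type, mid.precision, mid.recall, mid.fmeasure))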
Example no. 16
  def testAssertsOnInvalidInputFiles(self):
    scorer = rouge_scorer.RougeScorer(["rouge1"], False)
    with self.assertRaises(ValueError):
      io.compute_scores_and_write_to_csv("invalid*", "invalid*", "invalid",
                                         scorer, scoring.BootstrapAggregator())