Example #1
    def test_rouge(self):
        # Test the first several instances in the TAC 2008 data to ensure that
        # our computation of ROUGE matches the values released by NIST
        instances = ReferenceBasedDatasetReader().read(_summaries_file_path)
        metrics_list = JsonlReader(_metrics_file_path, Metrics).read()
        metric_names = ['rouge-1', 'rouge-2', 'rouge-3', 'rouge-4', 'rouge-l', 'rouge-su4', 'rouge-w-1.2']
        rouge = Rouge(max_ngram=4,
                      use_porter_stemmer=True,
                      remove_stopwords=False,
                      compute_rouge_l=True,
                      skip_bigram_gap_length=4,
                      wlcs_weight=1.2)

        peer_instances, peer_metrics = self._filter_by_type(instances, metrics_list, 'peer')
        reference_instances, reference_metrics = self._filter_by_type(instances, metrics_list, 'reference')

        num_to_check = 25
        actual_metrics_dicts = score_instances(peer_instances[:num_to_check], [rouge])
        for expected_metrics in peer_metrics[:num_to_check]:
            instance_id = expected_metrics.instance_id
            summarizer_id = expected_metrics.summarizer_id
            actual_metrics = actual_metrics_dicts[instance_id][summarizer_id]

            for metric in metric_names:
                assert actual_metrics.metrics[metric] == expected_metrics.metrics[metric]
                assert actual_metrics.metrics[metric + '_jk'] == expected_metrics.metrics[metric + '_jk']

        actual_metrics_dicts = score_instances(reference_instances[:num_to_check], [rouge])
        for expected_metrics in reference_metrics[:num_to_check]:
            instance_id = expected_metrics.instance_id
            summarizer_id = expected_metrics.summarizer_id
            actual_metrics = actual_metrics_dicts[instance_id][summarizer_id]

            for metric in metric_names:
                assert actual_metrics.metrics[metric + '_jk'] == expected_metrics.metrics[metric + '_jk']
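For reference, a sketch of the nested structure that the loops above index into; the instance and summarizer IDs below are hypothetical, real ones come from the TAC 2008 files:
# score_instances returns a dict keyed by instance_id, then summarizer_id.
# Each entry exposes a .metrics dict holding both the plain and the
# jackknifed ('_jk') variants of every ROUGE score, e.g.:
#   actual_metrics_dicts['d0801-A']['1'].metrics['rouge-2']
#   actual_metrics_dicts['d0801-A']['1'].metrics['rouge-2_jk']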
Example #2
    def test_hong2014(self):
        duc2004 = load_references(_duc2004_file_path)
        centroid = load_summaries(_centroid_file_path)

        use_porter_stemmer = True
        remove_stopwords = False
        compute_rouge_l = True
        max_words = 100
        rouge = Rouge(max_ngram=2,
                      use_porter_stemmer=use_porter_stemmer,
                      remove_stopwords=remove_stopwords,
                      max_words=max_words,
                      compute_rouge_l=compute_rouge_l)
        python_rouge = PythonRouge(use_porter_stemmer=use_porter_stemmer,
                                   remove_stopwords=remove_stopwords,
                                   max_words=max_words,
                                   compute_rouge_l=compute_rouge_l)
        expected_metrics, _ = rouge.evaluate(centroid, duc2004)
        actual_metrics, _ = python_rouge.evaluate(centroid, duc2004)
        assert math.isclose(expected_metrics['rouge-1']['precision'],
                            actual_metrics['python-rouge-1']['precision'],
                            abs_tol=1e-2)
        assert math.isclose(expected_metrics['rouge-1']['recall'],
                            actual_metrics['python-rouge-1']['recall'],
                            abs_tol=2e-2)
        assert math.isclose(expected_metrics['rouge-1']['f1'],
                            actual_metrics['python-rouge-1']['f1'],
                            abs_tol=2e-2)
        assert math.isclose(expected_metrics['rouge-2']['precision'],
                            actual_metrics['python-rouge-2']['precision'],
                            abs_tol=1e-2)
        assert math.isclose(expected_metrics['rouge-2']['recall'],
                            actual_metrics['python-rouge-2']['recall'],
                            abs_tol=1e-2)
        assert math.isclose(expected_metrics['rouge-2']['f1'],
                            actual_metrics['python-rouge-2']['f1'],
                            abs_tol=1e-2)
        # Rouge-L is a little further off, but still close enough that I'm not worried
        assert math.isclose(expected_metrics['rouge-l']['precision'],
                            actual_metrics['python-rouge-l']['precision'],
                            abs_tol=1e-1)
        assert math.isclose(expected_metrics['rouge-l']['recall'],
                            actual_metrics['python-rouge-l']['recall'],
                            abs_tol=1e-1)
        assert math.isclose(expected_metrics['rouge-l']['f1'],
                            actual_metrics['python-rouge-l']['f1'],
                            abs_tol=1e-1)
Example #3
    def test_python_rouge_multiling(self):
        use_porter_stemmer = True
        remove_stopwords = False
        compute_rouge_l = True
        max_words = 100

        rouge = Rouge(max_ngram=2,
                      use_porter_stemmer=use_porter_stemmer,
                      remove_stopwords=remove_stopwords,
                      max_words=max_words,
                      compute_rouge_l=compute_rouge_l)
        python_rouge = PythonRouge(use_porter_stemmer=use_porter_stemmer,
                                   remove_stopwords=remove_stopwords,
                                   max_words=max_words,
                                   compute_rouge_l=compute_rouge_l)
        expected_metrics, _ = rouge.evaluate(self.summaries,
                                             self.references_list)
        actual_metrics, _ = python_rouge.evaluate(self.summaries,
                                                  self.references_list)
        self.assert_same_as_rouge(actual_metrics, expected_metrics)
Example #4
    def test_task2_rouge(self):
        # Test the first several instances in the DUC 2007 data to ensure that
        # our computation of ROUGE matches the values released by NIST. For this year,
        # NIST did not release the raw output for the non-jackknifing runs, so
        # we cannot compare those scores
        instances = ReferenceBasedDatasetReader(
            _task2_summaries_file_path).read()
        metrics_list = JsonlReader(_task2_metrics_file_path, Metrics).read()
        metric_names = [
            'rouge-1', 'rouge-2', 'rouge-3', 'rouge-4', 'rouge-l', 'rouge-su4',
            'rouge-w-1.2'
        ]
        rouge = Rouge(max_ngram=4,
                      use_porter_stemmer=True,
                      remove_stopwords=False,
                      compute_rouge_l=True,
                      skip_bigram_gap_length=4,
                      wlcs_weight=1.2)

        peer_instances, peer_metrics = self._filter_by_type(
            instances, metrics_list, 'peer')
        reference_instances, reference_metrics = self._filter_by_type(
            instances, metrics_list, 'reference')

        num_to_check = 25
        actual_metrics_dicts = score_instances(peer_instances[:num_to_check],
                                               [rouge])
        for expected_metrics in peer_metrics[:num_to_check]:
            instance_id = expected_metrics.instance_id
            summarizer_id = expected_metrics.summarizer_id
            actual_metrics = actual_metrics_dicts[instance_id][summarizer_id]

            for metric in metric_names:
                assert actual_metrics.metrics[metric + '_jk'] == pytest.approx(
                    expected_metrics.metrics[metric + '_jk'], abs=1e-3)

        actual_metrics_dicts = score_instances(
            reference_instances[:num_to_check], [rouge])
        for expected_metrics in reference_metrics[:num_to_check]:
            instance_id = expected_metrics.instance_id
            summarizer_id = expected_metrics.summarizer_id
            actual_metrics = actual_metrics_dicts[instance_id][summarizer_id]

            for metric in metric_names:
                assert actual_metrics.metrics[
                    metric + '_jk'] == expected_metrics.metrics[metric + '_jk']
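As a small aside, pytest.approx with an absolute tolerance is what allows the small rounding differences in the comparison above; a minimal, self-contained illustration with made-up numbers:
import pytest

# Passes because |36.412 - 36.4118| = 0.0002 <= 1e-3.
assert 36.412 == pytest.approx(36.4118, abs=1e-3)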
Example #5
    def test_multi_all(self):
        duc2004 = load_references(_duc2004_file_path)
        centroid = load_summaries(_centroid_file_path)
        classy04 = load_summaries(_classy04_file_path)
        classy11 = load_summaries(_classy11_file_path)

        rouge = Rouge(max_words=100)

        summaries_list = list(zip(*[centroid, classy04, classy11]))
        metrics_lists = rouge.score_multi_all(summaries_list, duc2004)
        metrics_lists = list(zip(*metrics_lists))
        metrics_list = [rouge.aggregate(metrics_list) for metrics_list in metrics_lists]

        expected_metrics_list = []
        for dataset in [centroid, classy04, classy11]:
            expected_metrics_list.append(rouge.aggregate(rouge.score_all(dataset, duc2004)))

        assert metrics_list == expected_metrics_list
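A sketch of the shapes involved above, using the same names as the test; the point is that score_multi_all groups all systems' summaries per input instance, and transposing its output recovers what score_all would produce per system:
# summaries_list[i] == (centroid[i], classy04[i], classy11[i])   # all systems for instance i
# metrics_lists[i]  == [metrics for each of those three summaries of instance i]
# zip(*metrics_lists) regroups the scores by system, so each aggregate() call sees
# exactly the per-instance scores that score_all(system, duc2004) yields for that system.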
Example #6
def compute_rouge(preds, targets):
    """ Computes ROUGE-L for the generated sequences. """
    rouge = Rouge(compute_rouge_l=True)
    rouge_out = rouge.evaluate(preds, [[tgt] for tgt in targets])
    return rouge_out[0]['rouge-l']['f1']
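A minimal usage sketch for the helper above; the prediction and target strings are made up, and Rouge is the same sacrerouge wrapper used throughout (so the underlying ROUGE tooling must be set up):
# One generated summary per input, paired with a single reference each.
preds = ["the cat sat on the mat", "rain is expected on sunday"]
targets = ["a cat was sitting on the mat", "forecasters expect rain on sunday"]
rouge_l_f1 = compute_rouge(preds, targets)  # aggregated ROUGE-L F1 across both pairs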
Example #7
    def test_hong2014(self):
        """
        Tests to ensure that the Rouge scores for the summaries from Hong et al. 2014
        (http://www.lrec-conf.org/proceedings/lrec2014/pdf/1093_Paper.pdf) do not
        change. The hard-coded scores are very close to the scores reported in the paper.
        """
        duc2004 = load_references(_duc2004_file_path)
        centroid = load_summaries(_centroid_file_path)
        classy04 = load_summaries(_classy04_file_path)
        classy11 = load_summaries(_classy11_file_path)
        dpp = load_summaries(_dpp_file_path)
        freq_sum = load_summaries(_freq_sum_file_path)
        greedy_kl = load_summaries(_greedy_kl_file_path)
        icsi_summ = load_summaries(_icsi_summ_file_path)
        lexrank = load_summaries(_lexrank_file_path)
        occams_v = load_summaries(_occams_v_file_path)
        reg_sum = load_summaries(_reg_sum_file_path)
        submodular = load_summaries(_submodular_file_path)
        ts_sum = load_summaries(_ts_sum_file_path)

        rouge = Rouge(max_words=100)

        # Reported: 36.41, 7.97, 1.21
        metrics, _ = rouge.evaluate(centroid, duc2004)
        self.assertAlmostEqual(metrics['rouge-1']['recall'], 36.41, places=2)
        self.assertAlmostEqual(metrics['rouge-2']['recall'], 7.97, places=2)
        self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.21, places=2)

        # Reported: 37.62, 8.96, 1.51
        metrics, _ = rouge.evaluate(classy04, duc2004)
        self.assertAlmostEqual(metrics['rouge-1']['recall'], 37.61, places=2)
        self.assertAlmostEqual(metrics['rouge-2']['recall'], 8.96, places=2)
        self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.51, places=2)

        # Reported: 37.22, 9.20, 1.48
        metrics, _ = rouge.evaluate(classy11, duc2004)
        self.assertAlmostEqual(metrics['rouge-1']['recall'], 37.22, places=2)
        self.assertAlmostEqual(metrics['rouge-2']['recall'], 9.20, places=2)
        self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.48, places=2)

        # Reported: 39.79, 9.62, 1.57
        metrics, _ = rouge.evaluate(dpp, duc2004)
        self.assertAlmostEqual(metrics['rouge-1']['recall'], 39.79, places=2)
        self.assertAlmostEqual(metrics['rouge-2']['recall'], 9.62, places=2)
        self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.57, places=2)

        # Reported: 35.30, 8.11, 1.00
        metrics, _ = rouge.evaluate(freq_sum, duc2004)
        self.assertAlmostEqual(metrics['rouge-1']['recall'], 35.30, places=2)
        self.assertAlmostEqual(metrics['rouge-2']['recall'], 8.11, places=2)
        self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.00, places=2)

        # Reported: 37.98, 8.53, 1.26
        metrics, _ = rouge.evaluate(greedy_kl, duc2004)
        self.assertAlmostEqual(metrics['rouge-1']['recall'], 37.98, places=2)
        self.assertAlmostEqual(metrics['rouge-2']['recall'], 8.53, places=2)
        self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.26, places=2)

        # Reported: 38.41, 9.78, 1.73
        metrics, _ = rouge.evaluate(icsi_summ, duc2004)
        self.assertAlmostEqual(metrics['rouge-1']['recall'], 38.41, places=2)
        self.assertAlmostEqual(metrics['rouge-2']['recall'], 9.78, places=2)
        self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.73, places=2)

        # Reported: 35.95, 7.47, 0.82
        metrics, _ = rouge.evaluate(lexrank, duc2004)
        self.assertAlmostEqual(metrics['rouge-1']['recall'], 35.95, places=2)
        self.assertAlmostEqual(metrics['rouge-2']['recall'], 7.47, places=2)
        self.assertAlmostEqual(metrics['rouge-4']['recall'], 0.82, places=2)

        # Reported: 38.50, 9.76, 1.33
        metrics, _ = rouge.evaluate(occams_v, duc2004)
        self.assertAlmostEqual(metrics['rouge-1']['recall'], 38.50, places=2)
        self.assertAlmostEqual(metrics['rouge-2']['recall'], 9.76, places=2)
        self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.33, places=2)

        # Reported: 38.57, 9.75, 1.60
        metrics, _ = rouge.evaluate(reg_sum, duc2004)
        self.assertAlmostEqual(metrics['rouge-1']['recall'], 38.56, places=2)
        self.assertAlmostEqual(metrics['rouge-2']['recall'], 9.75, places=2)
        self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.60, places=2)

        # Reported: 39.18, 9.35, 1.39
        metrics, _ = rouge.evaluate(submodular, duc2004)
        self.assertAlmostEqual(metrics['rouge-1']['recall'], 39.18, places=2)
        self.assertAlmostEqual(metrics['rouge-2']['recall'], 9.35, places=2)
        self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.39, places=2)

        # Reported: 35.88, 8.15, 1.03
        metrics, _ = rouge.evaluate(ts_sum, duc2004)
        self.assertAlmostEqual(metrics['rouge-1']['recall'], 35.88, places=2)
        self.assertAlmostEqual(metrics['rouge-2']['recall'], 8.14, places=2)
        self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.03, places=2)
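The twelve blocks above all follow the same pattern, so an equivalent data-driven sketch is shown below; the first two systems' expected recalls are copied from the assertions above, and `systems` is an assumed name-to-summaries mapping built from the load_summaries calls:
expected_recalls = {
    'centroid': (36.41, 7.97, 1.21),
    'classy04': (37.61, 8.96, 1.51),
    # ...remaining systems follow the same pattern
}
for name, (r1, r2, r4) in expected_recalls.items():
    metrics, _ = rouge.evaluate(systems[name], duc2004)
    self.assertAlmostEqual(metrics['rouge-1']['recall'], r1, places=2)
    self.assertAlmostEqual(metrics['rouge-2']['recall'], r2, places=2)
    self.assertAlmostEqual(metrics['rouge-4']['recall'], r4, places=2)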
Example #8
    def test_python_rouge_correctness(self):
        summary = [
            "His tenacity holds despite the summary trials and harsh punishments for Xu, Wang Youcai and Qin Yongmin prominent party principals from the provinces who were sentenced to 11 and 12 years and despite threatening signs from the ruling Communist Party.",
            "The dissidents Xu Wenli, who was sentenced Monday to 13 years in prison, Wang Youcai, who received an 11-year sentence, and Qin Yongming, who was reported to have received 12 years were charged with subversion.",
            "As police moved against Xu's friends, labor rights campaigner Liu Nianchun was taken from a prison camp outside Beijing and, with his wife and daughter, was put on a plane to Canada and then New York, his first taste of freedom in more than 3 1/2 years."
        ]
        gold_summaries = [
            [
                "While China plans to sign the International Covenant on Civil and Political Rights at the U.N., it is still harassing and arresting human rights campaigners.",
                "Three prominent leaders of the China Democratic Party were put to trial and sentenced to 11-, 12- and 13-year prison terms.",
                "Germany and the U.S. condemned the arrests.",
                "A labor rights activist was released and exiled to the U.S. to blunt any opposition to Communist rule.",
                "U.S. policy to encourage trade and diplomacy in hope of democratic reforms evidences failure, but the U.S. is continuing its policy of encouragement.",
                "Friends of jailed dissidents state that they will continue to campaign for change."
            ],
            [
                "The US trade-driven policy of expanded ties encouraging Chinese democracy is questioned.",
                "China signed rights treaties and dissidents used new laws to set up China Democracy Party, but China violates the new laws by persecuting dissidents.",
                "It regularly frees activists from prison then exiles them so they lose local influence.",
                "It arrested an activist trying to register a rights monitoring group.",
                "CP leader Jiang's hard-line speech and publicity for activists sentenced to long prison terms signals a renewed Chinese crackdown.",
                "A rights activist expected to be sacrificed in the cause of democracy.",
                "Germany called China's sentencing of dissidents unacceptable."
            ],
            [
                "After 2 years of wooing the West by signing international accords, apparently relaxing controls on free speech, and releasing and exiling three dissenters, China cracked down against political dissent in Dec 1998.",
                "Leaders of the China Democracy Party (CDP) were arrested and three were sentenced to jail terms of 11 to 13 years.",
                "The West, including the US, UK and Germany, reacted strongly.",
                "Clinton's China policy of engagement was questioned.",
                "China's Jiang Zemin stated economic reform is not a prelude to democracy and vowed to crush any challenges to the Communist Party or \"social stability\".",
                "The CDP vowed to keep working, as more leaders awaited arrest."
            ],
            [
                "Xu Wenli, Wang Youchai, and Qin Yongmin, leading dissidents and prominent members of the China Democracy Party, were found guilty of subversion and sentenced to 13, 11, and 12 years in prison, respectively.",
                "Soon after the sentencing, China's president, Jiang Zemin, delivered speeches in which he asserted that Western political system must not be adopted and vowed to crush challenges to Communist Party rule.",
                "The harsh sentences and speeches signal a crackdown on dissent, but Zha Jianguo, another Democracy Party leader, says he will continue to push for change.",
                "Western nations condemned the sentences as violations of U.N. rights treaties signed by China."
            ]
        ]

        compute_rouge_l = True
        use_porter_stemmer = False
        remove_stopwords = False
        rouge = Rouge(max_ngram=2,
                      compute_rouge_l=compute_rouge_l,
                      use_porter_stemmer=use_porter_stemmer,
                      remove_stopwords=remove_stopwords)
        python_rouge = PythonRouge(compute_rouge_l=compute_rouge_l,
                                   use_porter_stemmer=use_porter_stemmer,
                                   remove_stopwords=remove_stopwords)
        expected_metrics = rouge.score(summary, gold_summaries)
        actual_metrics = python_rouge.score(summary, gold_summaries)
        self.assert_same_as_rouge(actual_metrics, expected_metrics)

        use_porter_stemmer = False
        remove_stopwords = True
        rouge = Rouge(max_ngram=2,
                      compute_rouge_l=compute_rouge_l,
                      use_porter_stemmer=use_porter_stemmer,
                      remove_stopwords=remove_stopwords)
        python_rouge = PythonRouge(compute_rouge_l=compute_rouge_l,
                                   use_porter_stemmer=use_porter_stemmer,
                                   remove_stopwords=remove_stopwords)
        expected_metrics = rouge.score(summary, gold_summaries)
        actual_metrics = python_rouge.score(summary, gold_summaries)
        self.assert_same_as_rouge(actual_metrics, expected_metrics)

        use_porter_stemmer = True
        remove_stopwords = False
        rouge = Rouge(max_ngram=2,
                      compute_rouge_l=compute_rouge_l,
                      use_porter_stemmer=use_porter_stemmer,
                      remove_stopwords=remove_stopwords)
        python_rouge = PythonRouge(compute_rouge_l=compute_rouge_l,
                                   use_porter_stemmer=use_porter_stemmer,
                                   remove_stopwords=remove_stopwords)
        expected_metrics = rouge.score(summary, gold_summaries)
        actual_metrics = python_rouge.score(summary, gold_summaries)
        self.assert_same_as_rouge(actual_metrics, expected_metrics)

        use_porter_stemmer = True
        remove_stopwords = True
        rouge = Rouge(max_ngram=2,
                      compute_rouge_l=compute_rouge_l,
                      use_porter_stemmer=use_porter_stemmer,
                      remove_stopwords=remove_stopwords)
        python_rouge = PythonRouge(compute_rouge_l=compute_rouge_l,
                                   use_porter_stemmer=use_porter_stemmer,
                                   remove_stopwords=remove_stopwords)
        expected_metrics = rouge.score(summary, gold_summaries)
        actual_metrics = python_rouge.score(summary, gold_summaries)
        self.assert_same_as_rouge(actual_metrics, expected_metrics)
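The four stemmer/stopword configurations above could equally be driven by a loop; a behavior-equivalent sketch over the same parameter grid:
from itertools import product

for use_porter_stemmer, remove_stopwords in product([False, True], repeat=2):
    rouge = Rouge(max_ngram=2,
                  compute_rouge_l=True,
                  use_porter_stemmer=use_porter_stemmer,
                  remove_stopwords=remove_stopwords)
    python_rouge = PythonRouge(compute_rouge_l=True,
                               use_porter_stemmer=use_porter_stemmer,
                               remove_stopwords=remove_stopwords)
    self.assert_same_as_rouge(python_rouge.score(summary, gold_summaries),
                              rouge.score(summary, gold_summaries))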
Example #9
 def test_rouge_order_invariant(self):
     metric = Rouge(max_words=100)
     self.assert_order_invariant(metric)
Example #10
 def test_rouge(self):
     # This is a regression test, not necessarily a test for correctness
     metric = Rouge(max_ngram=2, compute_rouge_l=True)
     expected_output = [
         {
             'rouge-1': {'recall': 40.516000000000005, 'precision': 41.699999999999996, 'f1': 41.099000000000004},
             'rouge-2': {'recall': 10.233, 'precision': 10.533, 'f1': 10.381},
             'rouge-l': {'recall': 36.258, 'precision': 37.317, 'f1': 36.78}
         },
         {
             'rouge-1': {'recall': 48.258, 'precision': 47.765, 'f1': 48.010000000000005},
             'rouge-2': {'recall': 19.301, 'precision': 19.103, 'f1': 19.200999999999997},
             'rouge-l': {'recall': 44.774, 'precision': 44.317, 'f1': 44.544}
         },
         {
             'rouge-1': {'recall': 49.416, 'precision': 48.659, 'f1': 49.035000000000004},
             'rouge-2': {'recall': 16.406000000000002, 'precision': 16.154, 'f1': 16.279},
             'rouge-l': {'recall': 45.72, 'precision': 45.019, 'f1': 45.367000000000004}
         },
         {
             'rouge-1': {'recall': 44.466, 'precision': 44.038, 'f1': 44.251000000000005},
             'rouge-2': {'recall': 11.891, 'precision': 11.776, 'f1': 11.833},
             'rouge-l': {'recall': 40.971000000000004, 'precision': 40.577000000000005, 'f1': 40.772999999999996}
         },
         {
             'rouge-1': {'recall': 42.403999999999996, 'precision': 41.473, 'f1': 41.933},
             'rouge-2': {'recall': 10.477, 'precision': 10.245999999999999, 'f1': 10.36},
             'rouge-l': {'recall': 37.649, 'precision': 36.822, 'f1': 37.230999999999995}
         },
         {
             'rouge-1': {'recall': 43.857, 'precision': 43.061, 'f1': 43.455},
             'rouge-2': {'recall': 13.395000000000001, 'precision': 13.150999999999998, 'f1': 13.272},
             'rouge-l': {'recall': 40.555, 'precision': 39.818, 'f1': 40.183}},
         {
             'rouge-1': {'recall': 52.39, 'precision': 51.568999999999996, 'f1': 51.976},
             'rouge-2': {'recall': 20.4, 'precision': 20.079, 'f1': 20.238},
             'rouge-l': {'recall': 47.211, 'precision': 46.471000000000004, 'f1': 46.838}},
         {
             'rouge-1': {'recall': 51.186, 'precision': 51.593999999999994, 'f1': 51.388999999999996},
             'rouge-2': {'recall': 20.238, 'precision': 20.4, 'f1': 20.319000000000003},
             'rouge-l': {'recall': 46.64, 'precision': 47.012, 'f1': 46.825}},
         {
             'rouge-1': {'recall': 38.635999999999996, 'precision': 52.641000000000005, 'f1': 44.564},
             'rouge-2': {'recall': 13.691, 'precision': 18.681, 'f1': 15.801000000000002},
             'rouge-l': {'recall': 35.829, 'precision': 48.815999999999995, 'f1': 41.326}
         },
         {
             'rouge-1': {'recall': 51.73799999999999, 'precision': 51.6, 'f1': 51.669},
             'rouge-2': {'recall': 23.49, 'precision': 23.427, 'f1': 23.458000000000002},
             'rouge-l': {'recall': 49.332, 'precision': 49.2, 'f1': 49.266}
         },
         {
             'rouge-1': {'recall': 48.79, 'precision': 48.016, 'f1': 48.4},
             'rouge-2': {'recall': 21.053, 'precision': 20.717, 'f1': 20.884},
             'rouge-l': {'recall': 47.782000000000004, 'precision': 47.024, 'f1': 47.4}
         },
         {
             'rouge-1': {'recall': 44.711, 'precision': 45.344, 'f1': 45.025},
             'rouge-2': {'recall': 15.03, 'precision': 15.244, 'f1': 15.136},
             'rouge-l': {'recall': 44.112, 'precision': 44.737, 'f1': 44.422}
         }
     ]
     super().assert_expected_output(metric, expected_output)