Example #1 — a test method checking that PythonRouge produces the same ROUGE-1/2/L scores as the reference Rouge class under every combination of stemming and stopword removal.
    def test_python_rouge_correctness(self):
        summary = [
            "His tenacity holds despite the summary trials and harsh punishments for Xu, Wang Youcai and Qin Yongmin prominent party principals from the provinces who were sentenced to 11 and 12 years and despite threatening signs from the ruling Communist Party.",
            "The dissidents Xu Wenli, who was sentenced Monday to 13 years in prison, Wang Youcai, who received an 11-year sentence, and Qin Yongming, who was reported to have received 12 years were charged with subversion.",
            "As police moved against Xu's friends, labor rights campaigner Liu Nianchun was taken from a prison camp outside Beijing and, with his wife and daughter, was put on a plane to Canada and then New York, his first taste of freedom in more than 3 1/2 years."
        ]
        gold_summaries = [
            [
                "While China plans to sign the International Covenant on Civil and Political Rights at the U.N., it is still harassing and arresting human rights campaigners.",
                "Three prominent leaders of the China Democratic Party were put to trial and sentenced to 11-, 12- and 13-year prison terms.",
                "Germany and the U.S. condemned the arrests.",
                "A labor rights activist was released and exiled to the U.S. to blunt any opposition to Communist rule.",
                "U.S. policy to encourage trade and diplomacy in hope of democratic reforms evidences failure, but the U.S. is continuing its policy of encouragement.",
                "Friends of jailed dissidents state that they will continue to campaign for change."
            ],
            [
                "The US trade-driven policy of expanded ties encouraging Chinese democracy is questioned.",
                "China signed rights treaties and dissidents used new laws to set up China Democracy Party, but China violates the new laws by persecuting dissidents.",
                "It regularly frees activists from prison then exiles them so they lose local influence.",
                "It arrested an activist trying to register a rights monitoring group.",
                "CP leader Jiang's hard-line speech and publicity for activists sentenced to long prison terms signals a renewed Chinese crackdown.",
                "A rights activist expected to be sacrificed in the cause of democracy.",
                "Germany called China's sentencing of dissidents unacceptable."
            ],
            [
                "After 2 years of wooing the West by signing international accords, apparently relaxing controls on free speech, and releasing and exiling three dissenters, China cracked down against political dissent in Dec 1998.",
                "Leaders of the China Democracy Party (CDP) were arrested and three were sentenced to jail terms of 11 to 13 years.",
                "The West, including the US, UK and Germany, reacted strongly.",
                "Clinton's China policy of engagement was questioned.",
                "China's Jiang Zemin stated economic reform is not a prelude to democracy and vowed to crush any challenges to the Communist Party or \"social stability\".",
                "The CDP vowed to keep working, as more leaders awaited arrest."
            ],
            [
                "Xu Wenli, Wang Youchai, and Qin Yongmin, leading dissidents and prominent members of the China Democracy Party, were found guilty of subversion and sentenced to 13, 11, and 12 years in prison, respectively.",
                "Soon after the sentencing, China's president, Jiang Zemin, delivered speeches in which he asserted that Western political system must not be adopted and vowed to crush challenges to Communist Party rule.",
                "The harsh sentences and speeches signal a crackdown on dissent, but Zha Jianguo, another Democracy Party leader, says he will continue to push for change.",
                "Western nations condemned the sentences as violations of U.N. rights treaties signed by China."
            ]
        ]

        compute_rouge_l = True
        # Check agreement under every combination of stemming and stopword
        # removal.
        for use_porter_stemmer, remove_stopwords in [(False, False),
                                                     (False, True),
                                                     (True, False),
                                                     (True, True)]:
            rouge = Rouge(max_ngram=2,
                          compute_rouge_l=compute_rouge_l,
                          use_porter_stemmer=use_porter_stemmer,
                          remove_stopwords=remove_stopwords)
            python_rouge = PythonRouge(compute_rouge_l=compute_rouge_l,
                                       use_porter_stemmer=use_porter_stemmer,
                                       remove_stopwords=remove_stopwords)
            expected_metrics = rouge.score(summary, gold_summaries)
            actual_metrics = python_rouge.score(summary, gold_summaries)
            self.assert_same_as_rouge(actual_metrics, expected_metrics)
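
The test above checks that two implementations agree but never spells out the quantity they compute. As a point of reference, here is a minimal, self-contained sketch of ROUGE-N precision, recall, and F1 over clipped n-gram counts; the tokenization and the helper names (_ngram_counts, rouge_n) are my own simplifications, not part of the library under test.

from collections import Counter
from typing import Dict, List


def _ngram_counts(tokens: List[str], n: int) -> Counter:
    # Multiset of n-grams in a token sequence.
    return Counter(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))


def rouge_n(summary: str, reference: str, n: int = 1) -> Dict[str, float]:
    # ROUGE-N with clipped matching and naive whitespace tokenization.
    sum_counts = _ngram_counts(summary.lower().split(), n)
    ref_counts = _ngram_counts(reference.lower().split(), n)
    # Each reference n-gram can be matched at most as often as it occurs.
    matches = sum((sum_counts & ref_counts).values())
    precision = matches / max(sum(sum_counts.values()), 1)
    recall = matches / max(sum(ref_counts.values()), 1)
    f1 = 2 * precision * recall / (precision + recall) if matches else 0.0
    return {'precision': precision, 'recall': recall, 'f1': f1}


print(rouge_n('the cat sat on the mat', 'the cat lay on the mat'))
# -> precision = recall = f1 = 5/6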
Example #2 — a question-answering-based summary metric that scores answered questions on exact match, token F1 (via unigram ROUGE), answerability, and human correctness, averaging bottom-up over answers, questions, prompts, and references.
class QAScoringMetric(Metric):
    def __init__(self) -> None:
        super().__init__(['summary'], ['answered_questions'],
                         jackknifer=AnsweredQuestionsJackknifer())
        # Unigram ROUGE with stemming and stopword removal serves as the
        # token-overlap F1 between a predicted answer and the gold answer.
        self.rouge = PythonRouge(ngram_orders=[1],
                                 remove_stopwords=True,
                                 use_porter_stemmer=True)

    def _calculate_exact_match(self, answer: str, prediction: str,
                               probability: float,
                               null_probability: float) -> Dict[str, float]:
        if prediction is None:
            em_null = 0.0
        else:
            em = int(prediction == answer)
            # Credit the match only when the model is more confident in its
            # answer than in abstaining (the null answer).
            em_null = 0.0
            if probability > null_probability:
                em_null = em

        return {
            'exact-match': em_null,
        }

    def _calculate_f1(self, answer: str, prediction: str, probability: float,
                      null_probability: float) -> Dict[str, float]:
        if prediction is None:
            f1_null = 0.0
        else:
            # Token-overlap F1 between prediction and answer, computed as
            # unigram ROUGE F1 (stemmed, stopwords removed).
            f1 = self.rouge.score(prediction, [answer])['python-rouge-1']['f1']
            # As with exact match, credit the score only when the answer is
            # more probable than the null answer.
            f1_null = 0.0
            if probability > null_probability:
                f1_null = f1

        return {
            'f1': f1_null,
        }

    def _calculate_is_answerable(self, probability: float,
                                 null_probability: float) -> Dict[str, float]:
        return {'is-answerable': int(probability > null_probability)}

    def _calculate_human_is_correct(self,
                                    is_correct: bool) -> Dict[str, float]:
        return {'human-is-correct': int(is_correct)}
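
    # Scores are averaged bottom-up: answers within a question
    # (_score_question), questions within a prompt (_score_prompt), prompts
    # within a reference (_score_reference), and references within an
    # instance (_score). Each level takes the plain mean of its MetricsDicts.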

    def _score_question(
            self, answered_questions: List[AnsweredQuestion]) -> MetricsDict:
        # Average over answers
        metrics = []
        for aq in answered_questions:
            em = self._calculate_exact_match(aq.answer, aq.prediction,
                                             aq.probability,
                                             aq.null_probability)
            f1 = self._calculate_f1(aq.answer, aq.prediction, aq.probability,
                                    aq.null_probability)
            is_answerable = self._calculate_is_answerable(
                aq.probability, aq.null_probability)
            human_is_correct = self._calculate_human_is_correct(aq.is_correct)
            metrics.append(
                MetricsDict(
                    dict(**em, **f1, **is_answerable, **human_is_correct)))
        return sum(metrics) / len(metrics)

    def _score_prompt(
            self, answered_questions: List[AnsweredQuestion]) -> MetricsDict:
        # Average over questions. groupby requires the input to be sorted by
        # question_id, which _score_reference guarantees.
        metrics = []
        for _, aqs in itertools.groupby(answered_questions,
                                        key=lambda aq: aq.question_id):
            metrics.append(self._score_question(list(aqs)))
        return sum(metrics) / len(metrics)

    def _score_reference(
            self, answered_questions: List[AnsweredQuestion]) -> MetricsDict:
        # Sort by prompt and then question in one pass to set up the groupby
        # calls below
        answered_questions.sort(key=lambda aq: (aq.prompt_id, aq.question_id))

        # Average over prompts. Prompt metrics are also bucketed by group_id,
        # but the buckets are never aggregated in this snippet.
        metrics = []
        group_to_metrics = defaultdict(list)
        for _, aqs in itertools.groupby(answered_questions,
                                        key=lambda aq: aq.prompt_id):
            aqs = list(aqs)
            prompt_metrics = self._score_prompt(aqs)
            metrics.append(prompt_metrics)

            # All questions of the same prompt should be in the same group, so we can just take
            # the first one
            if aqs[0].group_id is not None:
                group_to_metrics[aqs[0].group_id].append(prompt_metrics)

        metrics = sum(metrics) / len(metrics)
        return metrics

    def _score(
            self, answered_questions_list: List[List[AnsweredQuestion]]
    ) -> MetricsDict:
        # Average over references
        metrics = []
        for answered_questions in answered_questions_list:
            metrics.append(self._score_reference(answered_questions))
        final_metrics = sum(metrics) / len(metrics)
        return MetricsDict({'qa-eval': final_metrics})

    def score_multi_all(
        self, summaries_list: List[List[SummaryType]],
        answered_questions_lists: List[List[List[AnsweredQuestion]]]
    ) -> List[List[MetricsDict]]:
        metrics_list = []
        for summaries, answered_questions_list in zip(
                summaries_list, answered_questions_lists):
            metrics_list.append([])
            # The answered questions are shared by the whole group, so every
            # summary in it receives the same score.
            for _ in summaries:
                metrics_list[-1].append(self._score(answered_questions_list))
        return metrics_list
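
QAScoringMetric reads a handful of fields from an AnsweredQuestion record defined elsewhere in the codebase. The dataclass below is a hypothetical stand-in with exactly the fields the methods above access, and the call that follows is a sketch of how the metric composes when run alongside the class above, not its actual test harness.

from dataclasses import dataclass
from typing import Optional


@dataclass
class AnsweredQuestion:  # stand-in; the real definition lives in the codebase
    prompt_id: str
    question_id: str
    answer: str
    prediction: Optional[str]
    probability: float
    null_probability: float
    is_correct: bool
    group_id: Optional[str] = None


metric = QAScoringMetric()

# One instance with one summary and one reference holding a single question.
aq = AnsweredQuestion(prompt_id='p1', question_id='q1',
                      answer='Xu Wenli', prediction='Xu Wenli',
                      probability=0.9, null_probability=0.1,
                      is_correct=True)
metrics_list = metric.score_multi_all(
    summaries_list=[['A summary of the document.']],
    answered_questions_lists=[[[aq]]])
print(metrics_list[0][0]['qa-eval'])
# The prediction matches the answer and 0.9 > 0.1, so exact-match, f1,
# is-answerable, and human-is-correct should all come out as 1.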