Example #1
    def compute(self, guess: str, answers: Iterable[str]) -> F1Metric:
        if guess is None or answers is None:
            return F1Metric(0, 0)
        # reduce both the guess and the labels to their rare words before computing F1
        guess = RareWordF1Calculator._filter(self._freq_dist, self._cutoff_count, guess)
        answers = [
            RareWordF1Calculator._filter(self._freq_dist, self._cutoff_count, a)
            for a in answers
        ]
        if not any(len(a) for a in answers):
            # no rare words in any label; return an empty (0/0) metric
            return F1Metric(0, 0)
        return F1Metric.compute(guess, answers)
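The _filter helper referenced above is not shown here; the idea is that both the guess and the reference answers are stripped down to their rare words (tokens whose corpus frequency falls below a cutoff) before ordinary unigram F1 is computed. A minimal standalone sketch of that filtering step, assuming a plain collections.Counter frequency distribution, whitespace tokenization, and a hypothetical keep_rare_words helper; the real _filter may normalize and tokenize differently:

from collections import Counter

def keep_rare_words(freq_dist: Counter, cutoff_count: int, text: str) -> str:
    # keep only tokens whose corpus frequency is below the cutoff
    tokens = text.lower().split()
    return ' '.join(t for t in tokens if freq_dist.get(t, 0) < cutoff_count)

freq_dist = Counter({'the': 1000, 'i': 800, 'garden': 3})
print(keep_rare_words(freq_dist, 10, 'i like the garden'))  # -> 'like garden'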
    def test_custom_eval(self):
        """
        Test whether custom evaluation works.
        """
        with testing_utils.capture_output():
            parser = setup_args()
            opt = parser.parse_args([
                '--task',
                'wizard_of_wikipedia',
                '--datatype',
                'valid',
                '--label-type',
                'chosen_sent',
            ])
            teacher = create_task_agent_from_taskname(opt)[0]

        title = 'Gardening'
        # four dummy single-character candidates: ['f', 'o', 'u', 'r']
        cands = list('four')

        text = "Gardening\nI like Gardening, even when I've only been doing it for a short time."
        response = 'I live on a farm, we garden all year long, it is very relaxing.'
        checked_sent = (
            'Gardening is considered by many people to be a relaxing activity.'
        )
        checked_sent_label = f'{title}{TOKEN_KNOWLEDGE}{checked_sent}'

        retrieval_metric_keys = [
            'passage_r@1', 'passage_r@5', 'title_r@1', 'title_r@5'
        ]

        chosen_sent_teacher_action = Message({
            'text': text,
            'labels': [checked_sent_label],
            'title': [title],
            'checked_sentence': [checked_sent],
        })
        correct_chosen_sent_response = Message({
            'text': checked_sent_label,
            'title_candidates': [title] + cands,
            'text_candidates': [checked_sent_label] + cands,
        })
        top5_chosen_sent_response = Message({
            'text': f'hello{TOKEN_KNOWLEDGE}goodbye',
            'title_candidates': cands + [title],
            'text_candidates': cands + [checked_sent_label],
        })
        incorrect_chosen_sent_response = Message({
            'text': f'hello{TOKEN_KNOWLEDGE}goodbye',
            'title_candidates': cands,
            'text_candidates': cands,
        })

        response_teacher_action = Message({
            'text': text,
            'labels': [response],
            'checked_sentence': checked_sent
        })
        high_f1_response = Message({'text': checked_sent})
        low_f1_response = Message({'text': 'incorrect'})

        # 1) Test with correct top sentence
        teacher.reset_metrics()
        teacher.custom_evaluation(
            chosen_sent_teacher_action,
            [checked_sent_label],
            correct_chosen_sent_response,
        )
        report = teacher.report()
        for k in retrieval_metric_keys:
            assert k in report
            assert report[k] == AverageMetric(1)

        # 2) Test with top sentence in top 5
        teacher.reset_metrics()
        teacher.custom_evaluation(
            chosen_sent_teacher_action,
            [checked_sent_label],
            top5_chosen_sent_response,
        )
        report = teacher.report()
        for k in retrieval_metric_keys:
            assert k in report
            # only the recall@5 metrics should be credited here
            assert report[k] == (AverageMetric(1) if '5' in k else AverageMetric(0))

        # 3) Test with no top sentences
        teacher.reset_metrics()
        teacher.custom_evaluation(
            chosen_sent_teacher_action,
            [checked_sent_label],
            incorrect_chosen_sent_response,
        )
        report = teacher.report()
        for k in retrieval_metric_keys:
            assert k in report
            assert report[k] == AverageMetric(0)

        # 4) Test knowledge f1 with high f1
        teacher.label_type = 'response'
        teacher.reset_metrics()
        teacher.custom_evaluation(
            response_teacher_action, [response], high_f1_response
        )
        report = teacher.report()
        assert 'knowledge_f1' in report
        assert report['knowledge_f1'] == F1Metric(1)

        # 5) Test knowledge f1 with low f1
        teacher.reset_metrics()
        teacher.custom_evaluation(
            response_teacher_action, [response], low_f1_response
        )
        report = teacher.report()
        assert 'knowledge_f1' in report
        assert report['knowledge_f1'] == F1Metric(0)
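The knowledge_f1 assertions in cases 4 and 5 show the metric is 1 when the model's text matches the checked sentence and 0 when it shares no words with it. A quick standalone sketch of the metric comparison those assertions rely on, using ParlAI's F1Metric.compute (the same helper used in the compute method above), which scores a guess string against a list of answer strings:

from parlai.core.metrics import F1Metric

# perfect token overlap with the reference gives an F1 of exactly 1
assert F1Metric.compute('i like gardening', ['i like gardening']) == F1Metric(1)
# no token overlap gives an F1 of 0
assert F1Metric.compute('incorrect', ['i like gardening']) == F1Metric(0)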