Example #1
    def test_rouge(self):
        # Test the first several instances in the TAC 2008 data to ensure that
        # our computation of ROUGE matches the values released by NIST
        instances = ReferenceBasedDatasetReader().read(_summaries_file_path)
        metrics_list = JsonlReader(_metrics_file_path, Metrics).read()
        metric_names = ['rouge-1', 'rouge-2', 'rouge-3', 'rouge-4', 'rouge-l', 'rouge-su4', 'rouge-w-1.2']
        rouge = Rouge(max_ngram=4,
                      use_porter_stemmer=True,
                      remove_stopwords=False,
                      compute_rouge_l=True,
                      skip_bigram_gap_length=4,
                      wlcs_weight=1.2)

        peer_instances, peer_metrics = self._filter_by_type(instances, metrics_list, 'peer')
        reference_instances, reference_metrics = self._filter_by_type(instances, metrics_list, 'reference')

        num_to_check = 25
        actual_metrics_dicts = score_instances(peer_instances[:num_to_check], [rouge])
        for expected_metrics in peer_metrics[:num_to_check]:
            instance_id = expected_metrics.instance_id
            summarizer_id = expected_metrics.summarizer_id
            actual_metrics = actual_metrics_dicts[instance_id][summarizer_id]

            for metric in metric_names:
                assert actual_metrics.metrics[metric] == expected_metrics.metrics[metric]
                assert actual_metrics.metrics[metric + '_jk'] == expected_metrics.metrics[metric + '_jk']

        actual_metrics_dicts = score_instances(reference_instances[:num_to_check], [rouge])
        for expected_metrics in reference_metrics[:num_to_check]:
            instance_id = expected_metrics.instance_id
            summarizer_id = expected_metrics.summarizer_id
            actual_metrics = actual_metrics_dicts[instance_id][summarizer_id]

            for metric in metric_names:
                assert actual_metrics.metrics[metric + '_jk'] == expected_metrics.metrics[metric + '_jk']
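This test (and the next one) relies on a _filter_by_type helper defined elsewhere in the test class. A minimal sketch of what it might look like, assuming both the instances and the Metrics objects expose a summarizer_type attribute ('peer' or 'reference'); the attribute name is an assumption, not taken from the listings here:

    def _filter_by_type(self, instances, metrics_list, summarizer_type):
        # Keep only the instances and expected metrics whose summarizer type
        # matches the requested one ('peer' or 'reference'). The attribute name
        # `summarizer_type` is assumed, not confirmed by the code above.
        filtered_instances = [inst for inst in instances
                              if inst.summarizer_type == summarizer_type]
        filtered_metrics = [m for m in metrics_list
                            if m.summarizer_type == summarizer_type]
        return filtered_instances, filtered_metrics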
Example #2
    def test_task2_rouge(self):
        # Test the first several instances in the DUC 2007 data to ensure that
        # our computation of ROUGE matches the values released by NIST. For this
        # year, NIST did not release the raw output for the non-jackknifing runs,
        # so we cannot compare those scores.
        instances = ReferenceBasedDatasetReader(
            _task2_summaries_file_path).read()
        metrics_list = JsonlReader(_task2_metrics_file_path, Metrics).read()
        metric_names = [
            'rouge-1', 'rouge-2', 'rouge-3', 'rouge-4', 'rouge-l', 'rouge-su4',
            'rouge-w-1.2'
        ]
        rouge = Rouge(max_ngram=4,
                      use_porter_stemmer=True,
                      remove_stopwords=False,
                      compute_rouge_l=True,
                      skip_bigram_gap_length=4,
                      wlcs_weight=1.2)

        peer_instances, peer_metrics = self._filter_by_type(
            instances, metrics_list, 'peer')
        reference_instances, reference_metrics = self._filter_by_type(
            instances, metrics_list, 'reference')

        num_to_check = 25
        actual_metrics_dicts = score_instances(peer_instances[:num_to_check],
                                               [rouge])
        for expected_metrics in peer_metrics[:num_to_check]:
            instance_id = expected_metrics.instance_id
            summarizer_id = expected_metrics.summarizer_id
            actual_metrics = actual_metrics_dicts[instance_id][summarizer_id]

            for metric in metric_names:
                assert actual_metrics.metrics[metric + '_jk'] == pytest.approx(
                    expected_metrics.metrics[metric + '_jk'], abs=1e-3)

        actual_metrics_dicts = score_instances(
            reference_instances[:num_to_check], [rouge])
        for expected_metrics in reference_metrics[:num_to_check]:
            instance_id = expected_metrics.instance_id
            summarizer_id = expected_metrics.summarizer_id
            actual_metrics = actual_metrics_dicts[instance_id][summarizer_id]

            for metric in metric_names:
                assert actual_metrics.metrics[metric + '_jk'] == expected_metrics.metrics[metric + '_jk']
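The assertions in both tests index the result of score_instances as a nested mapping: first by instance_id, then by summarizer_id, with each entry exposing a metrics dict that holds both the plain and jackknifed ('_jk') ROUGE keys. A small illustration of that shape with invented identifiers and scores (not library code):

    from types import SimpleNamespace

    # The ids and values below are made up; only the nesting and the `metrics`
    # attribute mirror what the tests above rely on.
    actual_metrics_dicts = {
        'd0701': {                                  # instance_id (hypothetical)
            '10': SimpleNamespace(metrics={         # summarizer_id (hypothetical)
                'rouge-2': 0.1042,
                'rouge-2_jk': 0.1038,
            }),
        },
    }
    assert actual_metrics_dicts['d0701']['10'].metrics['rouge-2_jk'] == 0.1038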
Example #3
    def run_score(self, args: argparse.Namespace) -> None:
        prepare_global_logging(file_path=args.log_file, silent=args.silent)

        dataset_reader = get_dataset_reader_from_argument(args.dataset_reader)
        metric = get_metric_from_arguments(self.metric_type, args)
        input_files = args.input_files

        instances = dataset_reader.read(*input_files)
        metrics_dicts = score_instances(instances, [metric])

        save_score_results(metrics_dicts, args.output_jsonl, args.silent)
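run_score only reads a handful of attributes off the argparse.Namespace: log_file, silent, dataset_reader, input_files, and output_jsonl, plus whatever get_metric_from_arguments pulls for the metric itself. A hypothetical invocation sketch, assuming a registered reader name and placeholder file paths (none of these values come from the real CLI):

    import argparse

    # Attribute names mirror what run_score reads; the reader name and file
    # paths are placeholders, not values taken from the real command line.
    args = argparse.Namespace(
        log_file='scores.log',
        silent=False,
        dataset_reader='reference-based',
        input_files=['summaries.jsonl'],
        output_jsonl='scores.jsonl',
    )
    # command.run_score(args)  # `command` would be the metric's subcommand instance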