def test_rouge(self):
    # Test the first several instances in the TAC 2008 data to ensure that
    # our computation of ROUGE matches the values released by NIST
    instances = ReferenceBasedDatasetReader().read(_summaries_file_path)
    metrics_list = JsonlReader(_metrics_file_path, Metrics).read()
    metric_names = ['rouge-1', 'rouge-2', 'rouge-3', 'rouge-4',
                    'rouge-l', 'rouge-su4', 'rouge-w-1.2']
    rouge = Rouge(max_ngram=4,
                  use_porter_stemmer=True,
                  remove_stopwords=False,
                  compute_rouge_l=True,
                  skip_bigram_gap_length=4,
                  wlcs_weight=1.2)

    peer_instances, peer_metrics = self._filter_by_type(instances, metrics_list, 'peer')
    reference_instances, reference_metrics = self._filter_by_type(instances, metrics_list, 'reference')

    num_to_check = 25

    actual_metrics_dicts = score_instances(peer_instances[:num_to_check], [rouge])
    for expected_metrics in peer_metrics[:num_to_check]:
        instance_id = expected_metrics.instance_id
        summarizer_id = expected_metrics.summarizer_id
        actual_metrics = actual_metrics_dicts[instance_id][summarizer_id]
        for metric in metric_names:
            assert actual_metrics.metrics[metric] == expected_metrics.metrics[metric]
            assert actual_metrics.metrics[metric + '_jk'] == expected_metrics.metrics[metric + '_jk']

    actual_metrics_dicts = score_instances(reference_instances[:num_to_check], [rouge])
    for expected_metrics in reference_metrics[:num_to_check]:
        instance_id = expected_metrics.instance_id
        summarizer_id = expected_metrics.summarizer_id
        actual_metrics = actual_metrics_dicts[instance_id][summarizer_id]
        for metric in metric_names:
            assert actual_metrics.metrics[metric + '_jk'] == expected_metrics.metrics[metric + '_jk']
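# The tests in this section rely on a `_filter_by_type` helper that is not shown
# here. The following is a minimal sketch of what such a helper might look like;
# the attribute name `summarizer_type` is an assumption for illustration and is
# not taken from the original source.
def _filter_by_type(self, instances, metrics_list, summarizer_type):
    # Keep only the instances and expected metrics whose summarizer matches the
    # requested type ('peer' or 'reference'), preserving the original order
    filtered_instances = [instance for instance in instances
                          if instance.summarizer_type == summarizer_type]
    filtered_metrics = [metrics for metrics in metrics_list
                        if metrics.summarizer_type == summarizer_type]
    return filtered_instances, filtered_metrics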
def test_task2_rouge(self):
    # Test the first several instances in the DUC 2007 data to ensure that
    # our computation of ROUGE matches the values released by NIST. In this year,
    # they did not release the raw output for the non-jackknifing runs, so
    # we cannot compare those scores
    instances = ReferenceBasedDatasetReader().read(_task2_summaries_file_path)
    metrics_list = JsonlReader(_task2_metrics_file_path, Metrics).read()
    metric_names = ['rouge-1', 'rouge-2', 'rouge-3', 'rouge-4',
                    'rouge-l', 'rouge-su4', 'rouge-w-1.2']
    rouge = Rouge(max_ngram=4,
                  use_porter_stemmer=True,
                  remove_stopwords=False,
                  compute_rouge_l=True,
                  skip_bigram_gap_length=4,
                  wlcs_weight=1.2)

    peer_instances, peer_metrics = self._filter_by_type(instances, metrics_list, 'peer')
    reference_instances, reference_metrics = self._filter_by_type(instances, metrics_list, 'reference')

    num_to_check = 25

    actual_metrics_dicts = score_instances(peer_instances[:num_to_check], [rouge])
    for expected_metrics in peer_metrics[:num_to_check]:
        instance_id = expected_metrics.instance_id
        summarizer_id = expected_metrics.summarizer_id
        actual_metrics = actual_metrics_dicts[instance_id][summarizer_id]
        for metric in metric_names:
            assert actual_metrics.metrics[metric + '_jk'] == pytest.approx(expected_metrics.metrics[metric + '_jk'], abs=1e-3)

    actual_metrics_dicts = score_instances(reference_instances[:num_to_check], [rouge])
    for expected_metrics in reference_metrics[:num_to_check]:
        instance_id = expected_metrics.instance_id
        summarizer_id = expected_metrics.summarizer_id
        actual_metrics = actual_metrics_dicts[instance_id][summarizer_id]
        for metric in metric_names:
            assert actual_metrics.metrics[metric + '_jk'] == expected_metrics.metrics[metric + '_jk']
def run_score(self, args: argparse.Namespace) -> None:
    prepare_global_logging(file_path=args.log_file, silent=args.silent)

    # Construct the dataset reader and the metric from the command-line arguments
    dataset_reader = get_dataset_reader_from_argument(args.dataset_reader)
    metric = get_metric_from_arguments(self.metric_type, args)

    # Read the instances, score each one with the metric, and save the results
    input_files = args.input_files
    instances = dataset_reader.read(*input_files)
    metrics_dicts = score_instances(instances, [metric])
    save_score_results(metrics_dicts, args.output_jsonl, args.silent)
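# `run_score` expects an argparse.Namespace with the attributes accessed above
# (log_file, silent, dataset_reader, input_files, output_jsonl). One plausible way
# to wire it into an argparse sub-command is sketched below; the flag names and the
# 'score' sub-command name are assumptions for illustration, not the project's
# actual CLI definition.
def add_subparser(self, parser: argparse._SubParsersAction) -> None:
    subparser = parser.add_parser('score', help='Score summaries with a metric')
    subparser.add_argument('--dataset-reader', required=True)
    subparser.add_argument('--input-files', nargs='+', required=True)
    subparser.add_argument('--output-jsonl', required=True)
    subparser.add_argument('--log-file')
    subparser.add_argument('--silent', action='store_true', default=False)
    subparser.set_defaults(func=self.run_score)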