def score_multi_all(self,
                    summaries_list: List[List[SummaryField]],
                    references_list: List[List[ReferencesField]]) -> List[List[MetricsDict]]:
    # Just take the summaries themselves, not the fields
    summaries_list = [[field.summary for field in fields] for fields in summaries_list]
    references_list = [field.references for field in references_list]

    with TemporaryDirectory() as temp_dir:
        self._save_summaries(temp_dir, summaries_list, references_list)
        self._run_step1(temp_dir)
        self._run_step2(temp_dir)
        self._run_step3(temp_dir)
        stdout = self._run_step4(temp_dir)

        # There is a weird way to score a summary given multiple references in the
        # original code: they multiply the highest recall score by (num_references - 1),
        # add that to the second-to-last score, and divide by num_references.
        # (See https://github.com/igorbrigadir/ROUGE-BEwTE/blob/f69a85556c889b805c89c5c71d7b77a983e75a05/src/main/java/bewte/BEwT_E.java#L419)
        # I don't understand this because it depends on the order in which the summaries
        # are processed. We instead compute the average over the references.
        metrics_lists = self._parse_stdout(stdout)
        return metrics_lists
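# Note: the comment above opts for averaging over the references instead of the original
# order-dependent combination. The following is a minimal illustrative sketch of that
# averaging; the helper name `average_over_references` is hypothetical and not part of
# the metric class.
from typing import List

def average_over_references(per_reference_scores: List[float]) -> float:
    # One score per reference; the summary's score is the mean over the references.
    return sum(per_reference_scores) / len(per_reference_scores)

# Example: scores of 0.25, 0.5, and 0.75 against three references average to 0.5.
assert average_over_references([0.25, 0.5, 0.75]) == 0.5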
def score_multi_all(self,
                    summaries_list: List[List[SummaryType]],
                    references_list: List[List[ReferenceType]],
                    **kwargs) -> List[List[MetricsDict]]:
    summaries_list = self._flatten_summaries(summaries_list)
    references_list = self._flatten_summaries(references_list)

    logger.info(f'Serializing the summaries and references to a file')
    num_summaries = 0
    with TemporaryDirectory() as temp_dir:
        input_file = f'{temp_dir}/input.jsonl'
        output_file = f'{temp_dir}/output.jsonl'
        with JsonlWriter(input_file) as out:
            for summaries, references in zip(summaries_list, references_list):
                for summary in summaries:
                    out.write({'summary': summary, 'references': references})
                    num_summaries += 1
        logger.info(f'Wrote {num_summaries} (summary, references) pairs')

        commands = [f'cd {self.s3_root}/S3']
        if self.environment_name is not None:
            commands.append(f'source {os.environ["CONDA_INIT"]}')
            commands.append(f'conda activate {self.environment_name}')
        commands.append(f'python2.7 run_batch.py {input_file} {output_file} {self.embeddings_file} {self.model_dir}')
        command = ' && '.join(commands)

        logger.info(f'Running command: "{command}"')
        redirect = None if self.verbose else PIPE
        process = Popen(command, stdout=redirect, stderr=redirect, shell=True)
        process.communicate()

        scores = JsonlReader(output_file).read()
        assert len(scores) == num_summaries

        metrics_list = []
        index = 0
        for summaries in summaries_list:
            metrics_list.append([])
            for _ in summaries:
                metrics_list[-1].append(MetricsDict({
                    's3': {
                        'pyr': scores[index]['pyr'],
                        'resp': scores[index]['resp'],
                    }
                }))
                index += 1
        return metrics_list
def test_correlate_reference(self):
    # We have to use the TAC 2008 data because the MultiLing data is too small for
    # the bootstrapping to work
    with TemporaryDirectory() as temp_dir:
        command = [
            'python', '-m', 'sacrerouge', 'stat-sig-test',
            '--metrics-jsonl-files', _metrics_file_path,
            '--dependent-metric', 'overall_responsiveness',
            '--metric-A', 'rouge-1_jk_precision',
            '--metric-B', 'rouge-2_jk_recall',
            '--summarizer-type', 'all',
            '--hypothesis-test', 'bootstrap-both',
            '--output-file', f'{temp_dir}/correlations.json',
            '--random-seed', '6',
            '--silent'
        ]
        subprocess.run(command, check=True)

        correlations = json.load(open(f'{temp_dir}/correlations.json', 'r'))
        assert correlations['dependent_metric'] == 'overall_responsiveness'
        assert correlations['metric_A'] == 'rouge-1_jk_precision'
        assert correlations['metric_B'] == 'rouge-2_jk_recall'
        assert correlations['summarizer_type'] == 'all'
        assert correlations['test_method'] == 'bootstrap-both'
        self.assertAlmostEqual(correlations['alpha'], 0.05, places=4)
        assert correlations['two_tailed'] is False
        assert correlations['H0'] == 'r(rouge-1_jk_precision, overall_responsiveness) <= r(rouge-2_jk_recall, overall_responsiveness)'
        assert correlations['H1'] == 'r(rouge-1_jk_precision, overall_responsiveness) > r(rouge-2_jk_recall, overall_responsiveness)'

        assert correlations['summary_level']['pearson']['pvalue'] == 0.829
        assert correlations['summary_level']['pearson']['is_significant'] is False
        assert correlations['summary_level']['spearman']['pvalue'] == 0.938
        assert correlations['summary_level']['spearman']['is_significant'] is False
        assert correlations['summary_level']['kendall']['pvalue'] == 0.929
        assert correlations['summary_level']['kendall']['is_significant'] is False

        assert correlations['system_level']['pearson']['pvalue'] == 0.603
        assert correlations['system_level']['pearson']['is_significant'] is False
        assert correlations['system_level']['spearman']['pvalue'] == 0.945
        assert correlations['system_level']['spearman']['is_significant'] is False
        assert correlations['system_level']['kendall']['pvalue'] == 0.977
        assert correlations['system_level']['kendall']['is_significant'] is False

        assert correlations['global']['pearson']['pvalue'] == 0.49
        assert correlations['global']['pearson']['is_significant'] is False
        assert correlations['global']['spearman']['pvalue'] == 0.831
        assert correlations['global']['spearman']['is_significant'] is False
        assert correlations['global']['kendall']['pvalue'] == 0.811
        assert correlations['global']['kendall']['is_significant'] is False
def test_correlation(self):
    with TemporaryDirectory() as temp_dir:
        command = [
            'python', '-m', 'sacrerouge', 'correlate',
            '--metrics-jsonl-files', _metrics_file_path,
            '--metrics', 'chaganty2018_overall', 'chaganty2018_rouge-1_recall',
            '--summarizer-type', 'peer',
            '--output-file', f'{temp_dir}/correlations.json',
            '--silent'
        ]
        subprocess.run(command, check=True)

        correlations = json.load(open(f'{temp_dir}/correlations.json', 'r'))
def test_numeric_metric(self):
    with TemporaryDirectory() as temp_dir:
        output_file = f'{temp_dir}/metrics.jsonl'
        command = [
            'python', '-m', 'sacrerouge', 'score',
            _numeric_config_file_path,
            output_file
        ]
        process = Popen(command, stdout=PIPE, stderr=PIPE)
        process.communicate()

        metrics_list = JsonlReader(output_file, Metrics).read()
        assert len(metrics_list) == 5
        assert metrics_list[0].instance_id == 'D1'
        assert metrics_list[1].instance_id == 'D1'
        assert metrics_list[2].instance_id == 'D1'
        assert metrics_list[3].instance_id == 'D1'
        assert metrics_list[4].instance_id == 'D1'

        assert metrics_list[0].summarizer_id == '1'
        assert metrics_list[1].summarizer_id == '2'
        assert metrics_list[2].summarizer_id == 'A'
        assert metrics_list[3].summarizer_id == 'B'
        assert metrics_list[4].summarizer_id == 'C'

        assert metrics_list[0].summarizer_type == 'peer'
        assert metrics_list[1].summarizer_type == 'peer'
        assert metrics_list[2].summarizer_type == 'reference'
        assert metrics_list[3].summarizer_type == 'reference'
        assert metrics_list[4].summarizer_type == 'reference'

        # test = 1 * 10 + 1 * 100 + 1 * 1000 == 1110
        # test_jk = ((1 * 10 + 1 * 100) + (1 * 10 + 1 * 1000) + (1 * 100 + 1 * 1000)) / 3 == 740
        assert metrics_list[0].metrics == {'test': 1110, 'test_jk': 740}

        # test = 2 * 10 + 2 * 100 + 2 * 1000 == 2220
        # test_jk = ((2 * 10 + 2 * 100) + (2 * 10 + 2 * 1000) + (2 * 100 + 2 * 1000)) / 3 == 1480
        assert metrics_list[1].metrics == {'test': 2220, 'test_jk': 1480}

        # test_jk = 10 * 100 + 10 * 1000 == 11000
        assert metrics_list[2].metrics == {'test_jk': 11000}

        # test_jk = 100 * 10 + 100 * 1000 == 101000
        assert metrics_list[3].metrics == {'test_jk': 101000}

        # test_jk = 1000 * 10 + 1000 * 100 == 110000
        assert metrics_list[4].metrics == {'test_jk': 110000}
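# A worked version of the arithmetic in the comments above. The names `toy_metric` and
# `toy_metric_jk` are hypothetical (not the test's actual metric class): the toy metric
# multiplies the summary's value by each reference value and sums, and the jackknifed
# variant averages the metric over all leave-one-out subsets of the references.
from itertools import combinations
from typing import List

def toy_metric(summary: int, references: List[int]) -> int:
    return sum(summary * reference for reference in references)

def toy_metric_jk(summary: int, references: List[int]) -> float:
    subsets = list(combinations(references, len(references) - 1))
    return sum(toy_metric(summary, list(subset)) for subset in subsets) / len(subsets)

assert toy_metric(1, [10, 100, 1000]) == 1110
assert toy_metric_jk(1, [10, 100, 1000]) == 740
assert toy_metric_jk(2, [10, 100, 1000]) == 1480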
def test_numeric_metric(self):
    with TemporaryDirectory() as temp_dir:
        macro_file = f'{temp_dir}/macro.json'
        micro_file = f'{temp_dir}/micro.jsonl'
        command = [
            'python', '-m', 'sacrerouge', 'evaluate',
            '--config', _numeric_config_file_path,
            '--macro-output-json', macro_file,
            '--micro-output-jsonl', micro_file,
            '--silent'
        ]
        process = Popen(command, stdout=PIPE, stderr=PIPE)
        process.communicate()

        macro_metrics = json.load(open(macro_file, 'r'))
        micro_metrics_list = JsonlReader(micro_file, Metrics).read()

        assert macro_metrics == {'metrics': {'test': 45066}}

        assert len(micro_metrics_list) == 5
        assert micro_metrics_list[0].instance_id == 'D1'
        assert micro_metrics_list[1].instance_id == 'D1'
        assert micro_metrics_list[2].instance_id == 'D1'
        assert micro_metrics_list[3].instance_id == 'D1'
        assert micro_metrics_list[4].instance_id == 'D1'

        assert micro_metrics_list[0].summarizer_id == '1'
        assert micro_metrics_list[1].summarizer_id == '2'
        assert micro_metrics_list[2].summarizer_id == 'A'
        assert micro_metrics_list[3].summarizer_id == 'B'
        assert micro_metrics_list[4].summarizer_id == 'C'

        assert micro_metrics_list[0].summarizer_type == 'peer'
        assert micro_metrics_list[1].summarizer_type == 'peer'
        assert micro_metrics_list[2].summarizer_type == 'reference'
        assert micro_metrics_list[3].summarizer_type == 'reference'
        assert micro_metrics_list[4].summarizer_type == 'reference'

        assert micro_metrics_list[0].metrics == {'test': 1110}  # 1 * 10 + 1 * 100 + 1 * 1000
        assert micro_metrics_list[1].metrics == {'test': 2220}  # 2 * 10 + 2 * 100 + 2 * 1000
        assert micro_metrics_list[2].metrics == {'test': 11000}  # 10 * 100 + 10 * 1000
        assert micro_metrics_list[3].metrics == {'test': 101000}  # 100 * 10 + 100 * 1000
        assert micro_metrics_list[4].metrics == {'test': 110000}  # 1000 * 10 + 1000 * 100
def _run(self,
         summaries_list: List[List[SummaryType]],
         documents_list: List[List[str]]) -> Tuple[List[MetricsDict], List[List[MetricsDict]]]:
    with TemporaryDirectory() as temp_dir:
        mappings_file_path = f'{temp_dir}/mappings.txt'
        with open(mappings_file_path, 'w') as out:
            for i, (summaries, documents) in enumerate(zip(summaries_list, documents_list)):
                document_dir = f'{temp_dir}/documents/{i}'
                for j, document in enumerate(documents):
                    document_file_path = f'{document_dir}/{j}.txt'
                    self._save_summary_like(document, document_file_path)
                for j, summary in enumerate(summaries):
                    summary_file_path = f'{temp_dir}/summaries/{i}-{j}.txt'
                    self._save_summary_like(summary, summary_file_path)
                    out.write(f'{i} {j} {document_dir} {summary_file_path}\n')

        config_file_path = f'{temp_dir}/config'
        with open(config_file_path, 'w') as out:
            perform_stemming = 'Y' if self.use_stemmer else 'N'
            out.write(f'performStemming = {perform_stemming}\n')
            remove_stopwords = 'Y' if self.remove_stopwords else 'N'
            out.write(f'removeStopWords = {remove_stopwords}\n')
            out.write(f'stopFilePath = {self.data_dir}/smart_common_words.txt\n')
            out.write(f'divergence = Y\n')
            out.write(f'frequencyFeatures = Y\n')
            out.write(f'cosineOverlap = Y\n')
            out.write(f'topicWordFeatures = Y\n')
            out.write(f'backgroundCorpusFreqCounts = {self.data_dir}/bgFreqCounts.unstemmed.txt\n')
            out.write(f'backgroundIdfUnstemmed = {self.data_dir}/bgIdfValues.unstemmed.txt\n')
            out.write(f'backgroundIdfStemmed = {self.data_dir}/bgIdfValues.stemmed.txt\n')

        command = [
            'java', '-cp', self.jar_path,
            'edu.upenn.seas.simetrix.InputBasedEvaluation',
            mappings_file_path, config_file_path
        ]

        logger.info(f'Running SIMetrix command: "{command}"')
        process = Popen(command, stdout=PIPE, stderr=PIPE)
        stdout, stderr = process.communicate()
        if stderr:
            raise Exception(f'SIMetrix failed with stderr: {stderr.decode()}')

        macro_results = self._parse_macro_file(f'{temp_dir}/mappings.txt.ieval.macro')
        micro_results = self._parse_micro_file(f'{temp_dir}/mappings.txt.ieval.micro')
        return macro_results, micro_results
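# For reference, with `use_stemmer=True` and `remove_stopwords=False`, the config file
# written above would contain the following lines (here `/path/to/data` is a placeholder
# for `self.data_dir`, not a real path):
#
#   performStemming = Y
#   removeStopWords = N
#   stopFilePath = /path/to/data/smart_common_words.txt
#   divergence = Y
#   frequencyFeatures = Y
#   cosineOverlap = Y
#   topicWordFeatures = Y
#   backgroundCorpusFreqCounts = /path/to/data/bgFreqCounts.unstemmed.txt
#   backgroundIdfUnstemmed = /path/to/data/bgIdfValues.unstemmed.txt
#   backgroundIdfStemmed = /path/to/data/bgIdfValues.stemmed.txt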
def _run(self, summaries_list: List[List[SummaryType]]) -> List[List[MetricsDict]]:
    with TemporaryDirectory() as temp_dir:
        summaries_file = f'{temp_dir}/summaries.jsonl'
        predictions_file = f'{temp_dir}/predictions.json'

        # Save all of the summaries to a file
        with JsonlWriter(summaries_file) as out:
            for summaries in summaries_list:
                for summary in summaries:
                    out.write({'summary': self._flatten_summary(summary)})

        commands = [f'cd {self.sum_qe_root}']
        if self.environment_name:
            commands += [f'source activate {self.environment_name}']
        commands += [' '.join([
            'python', '-m', 'src.BERT_experiments.predict',
            summaries_file, self.model_file, predictions_file
        ])]

        redirect = None if self.verbose else PIPE
        process = Popen(' && '.join(commands), stdout=redirect, stderr=redirect, shell=True)
        stdout, stderr = process.communicate()

        predictions = json.loads(open(predictions_file, 'r').read())

        index = 0
        metrics_lists = []
        for summaries in summaries_list:
            metrics_lists.append([])
            for summary in summaries:
                preds = predictions[index]
                metrics_lists[-1].append(MetricsDict({
                    'SumQE': {
                        'Q1': preds[0],
                        'Q2': preds[1],
                        'Q3': preds[2],
                        'Q4': preds[3],
                        'Q5': preds[4]
                    }
                }))
                index += 1
        return metrics_lists
def _run(self,
         summaries_list: List[List[SummaryType]],
         references_list: List[List[SummaryType]]) -> List[List[MetricsDict]]:
    with TemporaryDirectory() as temp_dir:
        files_tsv_path = f'{temp_dir}/files.tsv'
        with open(files_tsv_path, 'w') as out:
            for i, (summaries, references) in enumerate(zip(summaries_list, references_list)):
                reference_filenames = []
                for j, reference in enumerate(references):
                    filename = f'{temp_dir}/references/{i}/{j}.txt'
                    self._save_summary(reference, filename)
                    reference_filenames.append(filename)

                peer_filenames = []
                for j, summary in enumerate(summaries):
                    filename = f'{temp_dir}/peers/{i}/{j}.txt'
                    self._save_summary(summary, filename)
                    peer_filenames.append(filename)

                out.write(f'{",".join(reference_filenames)}\t{",".join(peer_filenames)}\n')

        output_file = f'{temp_dir}/output.tsv'
        args = ' '.join([
            f'-files={files_tsv_path}',
            f'-output={output_file}',
            f'-minN={self.min_n}',
            f'-maxN={self.max_n}',
            f'-dwin={self.d_window}',
            f'-minScore={self.min_score}',
            f'-maxScore={self.max_score}'
        ])
        commands = [
            f'cd {self.autosummeng_root}',
            f'mvn exec:java@NPowERBatch -Dexec.args=\'{args}\''
        ]
        command = ' && '.join(commands)

        logger.info(f'Running AutoSummENG command: "{command}"')
        redirect = None if self.verbose else PIPE
        process = Popen(command, stdout=redirect, stderr=redirect, shell=True)
        stdout, stderr = process.communicate()

        return self._parse_output_file(output_file)
def test_plots(self):
    # Tests to ensure the plot files exist
    with TemporaryDirectory() as temp_dir:
        system_plot_file = f'{temp_dir}/system.pdf'
        global_plot_file = f'{temp_dir}/global.pdf'
        command = [
            'python', '-m', 'sacrerouge', 'correlate',
            '--metrics-jsonl-files', MULTILING_METRICS,
            '--metrics', 'rouge-1_jk_precision', 'grade',
            '--summarizer-type', 'all',
            '--output-file', f'{temp_dir}/correlations.json',
            '--silent',
            '--system-level-output-plot', system_plot_file,
            '--global-output-plot', global_plot_file
        ]
        subprocess.run(command, check=True)

        assert os.path.exists(system_plot_file)
        assert os.path.exists(global_plot_file)
def _run(self,
         summaries_list: List[List[SummaryType]],
         references_list: List[List[SummaryType]]) -> Tuple[MetricsDict, List[List[MetricsDict]]]:
    summaries_list = self._flatten_summaries(summaries_list)
    references_list = self._flatten_summaries(references_list)

    with TemporaryDirectory() as temp_dir:
        # As far as I can tell, the input only allows for one reference per instance,
        # so we need to write an instance for every (summary, reference) pair and then
        # aggregate the output
        summaries_file = f'{temp_dir}/summaries.txt'
        references_file = f'{temp_dir}/references.txt'

        index = 0
        tuple_to_indices = defaultdict(list)
        with open(summaries_file, 'w') as out_summaries:
            with open(references_file, 'w') as out_references:
                for i, (summaries, references) in enumerate(zip(summaries_list, references_list)):
                    for j, summary in enumerate(summaries):
                        for reference in references:
                            out_summaries.write(summary + '\n')
                            out_references.write(reference + '\n')
                            tuple_to_indices[(i, j)].append(index)
                            index += 1

        # Run Meteor
        command = [
            'java', '-jar', f'{self.meteor_root}/meteor-1.5/meteor-1.5.jar',
            summaries_file, references_file,
            '-l', 'en', '-norm'
        ]
        process = Popen(command, stdout=PIPE, stderr=PIPE)
        stdout, stderr = process.communicate()
        if stderr:
            raise Exception(f'Meteor failed with stderr: {stderr.decode()}')

        final_score, individual_scores = self._parse_meteor_stdout(stdout.decode())

        macro_metrics = MetricsDict({'METEOR': final_score})
        micro_metrics_list = self._aggregate_summary_scores(summaries_list, references_list, tuple_to_indices, individual_scores)
        return macro_metrics, micro_metrics_list
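# Because each (summary, reference) pair is written and scored independently, the
# per-summary METEOR score has to be recovered by aggregating the pairwise scores.
# Below is a minimal sketch of one way to do that, assuming a simple average over the
# references; it is an illustrative helper, not the class's actual
# _aggregate_summary_scores implementation.
from typing import Dict, List, Tuple

def aggregate_pairwise_scores(summaries_list: List[List[str]],
                              tuple_to_indices: Dict[Tuple[int, int], List[int]],
                              individual_scores: List[float]) -> List[List[float]]:
    scores_list = []
    for i, summaries in enumerate(summaries_list):
        scores_list.append([])
        for j, _ in enumerate(summaries):
            # Indices of all (summary, reference) pairs for summary j of instance i
            pair_indices = tuple_to_indices[(i, j)]
            pair_scores = [individual_scores[index] for index in pair_indices]
            scores_list[-1].append(sum(pair_scores) / len(pair_scores))
    return scores_list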
def test_evaluate(self):
    with TemporaryDirectory() as temp_dir:
        macro_file = f'{temp_dir}/macro.json'
        micro_file = f'{temp_dir}/micro.jsonl'
        command = [
            'python', '-m', 'sacrerouge', 'evaluate',
            _config_file_path,
            macro_file,
            micro_file,
            '--silent'
        ]
        process = Popen(command, stdout=PIPE, stderr=PIPE)
        stdout, stderr = process.communicate()

        macro_metrics = json.load(open(macro_file, 'r'))
        micro_metrics_list = JsonlReader(micro_file, Metrics).read()
        self._check_macro(macro_metrics)
        self._check_micro_list(micro_metrics_list)
def score_multi_all(self,
                    summaries_list: List[List[SummaryType]],
                    documents_list: List[List[DocumentType]],
                    **kwargs) -> List[List[MetricsDict]]:
    with TemporaryDirectory() as temp_dir:
        input_dir = f'{temp_dir}/input'
        output_file = f'{temp_dir}/output.json'

        logger.info(f'Serializing data to {input_dir}')
        os.makedirs(input_dir)
        for i, (summaries, documents) in enumerate(zip(summaries_list, documents_list)):
            instance_dir = f'{input_dir}/{i}'
            documents_dir = f'{instance_dir}/input_docs'
            summaries_dir = f'{instance_dir}/summaries'
            self._save_documents(documents, documents_dir)
            self._save_summaries(summaries, summaries_dir)

        commands = [f'cd {self.supert_root}']
        if self.environment_name is not None:
            commands.append(f'source {os.environ["CONDA_INIT"]}')
            commands.append(f'conda activate {self.environment_name}')
        commands.append(f'python run_batch.py {input_dir} {output_file}')
        command = ' && '.join(commands)

        logger.info(f'Running command: "{command}"')
        redirect = None if self.verbose else PIPE
        process = Popen(command, stdout=redirect, stderr=redirect, shell=True)
        process.communicate()

        logger.info(f'Loading output from {output_file}')
        output = json.loads(open(output_file, 'r').read())

        metrics_list = []
        for i, summaries in enumerate(summaries_list):
            metrics_list.append([])
            for j in range(len(summaries)):
                score = output[str(i)][str(j)]
                # SUPERT will output None if the summary was empty, so we replace that with a 0.0
                if score is None:
                    score = 0.0
                metrics_list[-1].append(MetricsDict({'supert': score}))
        return metrics_list
def test_command_line(self):
    # This is a regression test and does not test for accuracy
    with TemporaryDirectory() as temp_dir:
        with open(f'{temp_dir}/A.json', 'w') as out:
            out.write(json.dumps(self.correlations_A))
        with open(f'{temp_dir}/B.json', 'w') as out:
            out.write(json.dumps(self.correlations_B))

        command = [
            'python', '-m', 'sacrerouge', 'stat-sig-test',
            '--summary-level-correlations-A', f'{temp_dir}/A.json',
            '--summary-level-correlations-B', f'{temp_dir}/B.json',
            '--output-file', f'{temp_dir}/results.json',
            '--silent'
        ]
        subprocess.run(command, check=True)

        results = json.load(open(f'{temp_dir}/results.json', 'r'))
        self._verify_results(results)
def test_correlate_reference(self):
    # This is a regression test for the "correlate" command. It does not test if it's accurate.
    # TODO This needs to be a better test. There are too few summarization systems to get
    # interesting correlations.
    with TemporaryDirectory() as temp_dir:
        command = [
            'python', '-m', 'sacrerouge', 'correlate',
            '--metrics-jsonl-files', MULTILING_METRICS,
            '--metrics', 'rouge-1_jk_precision', 'grade',
            '--summarizer-type', 'reference',
            '--output-file', f'{temp_dir}/correlations.json',
            '--silent'
        ]
        subprocess.run(command, check=True)

        correlations = json.load(open(f'{temp_dir}/correlations.json', 'r'))
        assert correlations['metric1'] == 'rouge-1_jk_precision'
        assert correlations['metric2'] == 'grade'
        assert correlations['summarizer_type'] == 'reference'
        assert correlations['summary_level']['pearson']['r'] == pytest.approx(0.3333, abs=1e-4)
        assert correlations['summary_level']['spearman']['rho'] == pytest.approx(0.3333, abs=1e-4)
        assert correlations['summary_level']['kendall']['tau'] == pytest.approx(0.3333, abs=1e-4)
        assert correlations['summary_level']['num_summary_groups'] == 3
        assert correlations['system_level']['pearson']['r'] == pytest.approx(1.0, abs=1e-4)
        assert correlations['system_level']['spearman']['rho'] == pytest.approx(1.0, abs=1e-4)
        assert correlations['system_level']['kendall']['tau'] == pytest.approx(1.0, abs=1e-4)
        assert correlations['system_level']['num_summarizers'] == 2
        assert correlations['global']['pearson']['r'] == pytest.approx(0.6225273481541307, abs=1e-4)
        assert correlations['global']['spearman']['rho'] == pytest.approx(0.8285714285714287, abs=1e-4)
        assert correlations['global']['kendall']['tau'] == pytest.approx(0.7333333333333333, abs=1e-4)
        assert correlations['global']['num_summaries'] == 6
def test_evaluate(self):
    # This is a regression test and does not ensure correctness
    with TemporaryDirectory() as temp_dir:
        macro_file = f'{temp_dir}/macro.json'
        micro_file = f'{temp_dir}/micro.jsonl'
        command = [
            'python', '-m', 'sacrerouge', 'evaluate',
            '--config', _config_file_path,
            '--macro-output-json', macro_file,
            '--micro-output-jsonl', micro_file,
            '--silent'
        ]
        process = Popen(command, stdout=PIPE, stderr=PIPE)
        process.communicate()

        macro_metrics = json.load(open(macro_file, 'r'))
        micro_metrics_list = JsonlReader(micro_file, Metrics).read()
        self._check_macro(macro_metrics)
        self._check_micro_list(micro_metrics_list)
def test_all_summary_level_correlations(self):
    # This is a regression test for the "correlate" command. It does not test if it's accurate.
    with TemporaryDirectory() as temp_dir:
        command = [
            'python', '-m', 'sacrerouge', 'correlate',
            '--metrics-jsonl-files', MULTILING_METRICS,
            '--metrics', 'rouge-1_jk_precision', 'grade',
            '--summarizer-type', 'all',
            '--output-file', f'{temp_dir}/correlations.json',
            '--silent'
        ]
        subprocess.run(command, check=True)

        # Check the original correlations
        correlations = json.load(open(f'{temp_dir}/correlations.json', 'r'))
        assert correlations['metric1'] == 'rouge-1_jk_precision'
        assert correlations['metric2'] == 'grade'
        assert correlations['summarizer_type'] == 'all'
        assert correlations['summary_level']['pearson']['r'] == pytest.approx(0.4365526945989437, abs=1e-4)
        assert correlations['summary_level']['spearman']['rho'] == pytest.approx(0.3720759220056127, abs=1e-4)
        assert correlations['summary_level']['kendall']['tau'] == pytest.approx(0.1719691730561296, abs=1e-4)
        assert correlations['summary_level']['num_summary_groups'] == 3
        assert correlations['system_level']['pearson']['r'] == pytest.approx(0.28732601225892834, abs=1e-4)
        assert correlations['system_level']['spearman']['rho'] == pytest.approx(0.19999999999999998, abs=1e-4)
        assert correlations['system_level']['kendall']['tau'] == pytest.approx(0.0, abs=1e-4)
        assert correlations['system_level']['num_summarizers'] == 4
        assert correlations['global']['pearson']['r'] == pytest.approx(0.34183806349510004, abs=1e-4)
        assert correlations['global']['spearman']['rho'] == pytest.approx(0.4035707976004214, abs=1e-4)
        assert correlations['global']['kendall']['tau'] == pytest.approx(0.28603877677367767, abs=1e-4)
        assert correlations['global']['num_summaries'] == 12
def test_evaluate_default(self):
    # I manually ran evaluate with these parameters and this method checks to make sure
    # those values are equal to the output here
    with TemporaryDirectory() as temp_dir:
        macro_file = f'{temp_dir}/macro.json'
        micro_file = f'{temp_dir}/micro.jsonl'
        command = [
            'python', '-m', 'sacrerouge', 'python-rouge', 'evaluate',
            macro_file,
            micro_file,
            '--dataset-reader', 'reference-based',
            '--input-files', MULTILING_SUMMARIES,
            '--silent'
        ]
        process = Popen(command, stdout=PIPE, stderr=PIPE)
        process.communicate()

        macro_metrics = json.load(open(macro_file, 'r'))
        micro_metrics_list = JsonlReader(micro_file, Metrics).read()
        self._check_macro_default(macro_metrics)
        self._check_micro_list_default(micro_metrics_list)
def test_dang_2008_table_6_example(self):
    with TemporaryDirectory() as temp_dir:
        command = [
            'python', '-m', 'sacrerouge', 'correlate',
            '--metrics-jsonl-files', _metrics_file_path,
            '--metrics', 'overall_responsiveness', 'linguistic_quality',
            '--summarizer-type', 'reference',
            '--output-file', f'{temp_dir}/correlations.json',
            '--silent'
        ]
        subprocess.run(command, check=True)

        correlations = json.load(open(f'{temp_dir}/correlations.json', 'r'))
        assert correlations['system_level']['spearman']['rho'] == pytest.approx(0.910, 1e-2)
        assert correlations['system_level']['pearson']['r'] == pytest.approx(0.778, 1e-2)
        assert correlations['system_level']['num_summarizers'] == 8

        # Kendall's tau is not reported in the paper, but this should break if
        # anything changes in the code
        assert correlations['system_level']['kendall']['tau'] == pytest.approx(0.836, 1e-2)
def test_skip_calculations(self):
    # Ensures the flags to skip calculating specific correlations work
    with TemporaryDirectory() as temp_dir:
        command = [
            'python', '-m', 'sacrerouge', 'correlate',
            '--metrics-jsonl-files', MULTILING_METRICS,
            '--metrics', 'rouge-1_jk_precision', 'grade',
            '--summarizer-type', 'all',
            '--output-file', f'{temp_dir}/correlations.json',
            '--silent',
            '--skip-summary-level'
        ]
        subprocess.run(command, check=True)
        correlations = json.load(open(f'{temp_dir}/correlations.json', 'r'))
        assert 'summary_level' not in correlations

        command = [
            'python', '-m', 'sacrerouge', 'correlate',
            '--metrics-jsonl-files', MULTILING_METRICS,
            '--metrics', 'rouge-1_jk_precision', 'grade',
            '--summarizer-type', 'all',
            '--output-file', f'{temp_dir}/correlations.json',
            '--silent',
            '--skip-system-level'
        ]
        subprocess.run(command, check=True)
        correlations = json.load(open(f'{temp_dir}/correlations.json', 'r'))
        assert 'system_level' not in correlations

        command = [
            'python', '-m', 'sacrerouge', 'correlate',
            '--metrics-jsonl-files', MULTILING_METRICS,
            '--metrics', 'rouge-1_jk_precision', 'grade',
            '--summarizer-type', 'all',
            '--output-file', f'{temp_dir}/correlations.json',
            '--silent',
            '--skip-global'
        ]
        subprocess.run(command, check=True)
        correlations = json.load(open(f'{temp_dir}/correlations.json', 'r'))
        assert 'global' not in correlations
def test_correlate_peer(self):
    # This is a regression test for the "correlate" command. It does not test if it's accurate.
    # TODO This needs to be a better test. There are too few summarization systems to get
    # interesting correlations.
    with TemporaryDirectory() as temp_dir:
        command = [
            'python', '-m', 'sacrerouge', 'correlate',
            '--metrics-jsonl-files', MULTILING_METRICS,
            '--metrics', 'rouge-1_precision', 'grade',
            '--summarizer-type', 'peer',
            '--output-file', f'{temp_dir}/correlations.json',
            '--silent'
        ]
        subprocess.run(command, check=True)

        correlations = json.load(open(f'{temp_dir}/correlations.json', 'r'))
        assert correlations['summary_level']['pearson']['r'] == pytest.approx(-0.3333, abs=1e-4)
        assert correlations['summary_level']['spearman']['rho'] == pytest.approx(-0.3333, abs=1e-4)
        assert correlations['summary_level']['kendall']['tau'] == pytest.approx(-0.3333, abs=1e-4)
        assert correlations['summary_level']['num_summary_groups'] == 3
        assert correlations['system_level']['pearson']['r'] == pytest.approx(-1.0, abs=1e-4)
        assert correlations['system_level']['spearman']['rho'] == pytest.approx(-1.0, abs=1e-4)
        assert correlations['system_level']['kendall']['tau'] == pytest.approx(-1.0, abs=1e-4)
        assert correlations['system_level']['num_summarizers'] == 2
        assert correlations['global']['pearson']['r'] == pytest.approx(-0.33056857901135617, abs=1e-4)
        assert correlations['global']['spearman']['rho'] == pytest.approx(-0.3768511731740915, abs=1e-4)
        assert correlations['global']['kendall']['tau'] == pytest.approx(-0.27602622373694163, abs=1e-4)
        assert correlations['global']['num_summaries'] == 6
def score_multi_all(self,
                    summaries_list: List[List[SummaryType]],
                    references_list: List[List[ReferenceType]]) -> List[List[MetricsDict]]:
    with TemporaryDirectory() as temp_dir:
        temp_dir = os.path.abspath(temp_dir)
        summaries_file = f'{temp_dir}/summaries.json'
        questions_file = f'{temp_dir}/questions.jsonl'
        metadata_file = f'{temp_dir}/metadata.json'
        answers_file = f'{temp_dir}/answers.jsonl'

        instance_id_to_reference_ids = self._save_summaries(summaries_file, summaries_list, references_list)
        metadata = self._run_preprocess(summaries_file, questions_file, metadata_file)
        ids_to_scores = self._run_answer_questions(questions_file, answers_file)
        metrics_lists = self._get_metrics(summaries_list, references_list,
                                          instance_id_to_reference_ids, ids_to_scores, metadata)
        return metrics_lists
def _run(self, summaries_list: List[List[SummaryType]]) -> List[List[MetricsDict]]:
    with TemporaryDirectory() as temp_dir:
        summaries_file = f'{temp_dir}/summaries.jsonl'
        predictions_file = f'{temp_dir}/predictions.json'

        # Save all of the summaries to a file, keeping track of the indices
        # that are empty summaries
        empty_summaries = set()
        with JsonlWriter(summaries_file) as out:
            index = 0
            for summaries in summaries_list:
                for summary in summaries:
                    summary = self._flatten_summary(summary)
                    if len(summary) > 0:
                        out.write({'summary': summary})
                    else:
                        empty_summaries.add(index)
                    index += 1

        commands = [
            f'cd {self.sum_qe_root}',
            ' '.join([
                self.python_binary, '-m', 'src.BERT_experiments.predict',
                summaries_file, self.model_file, predictions_file
            ])
        ]
        command = ' && '.join(commands)

        logger.info(f'Running SumQE command: "{command}"')
        redirect = None if self.verbose else PIPE
        process = Popen(command, stdout=redirect, stderr=redirect, shell=True)
        stdout, stderr = process.communicate()

        predictions = json.loads(open(predictions_file, 'r').read())

        index = 0
        output_index = 0
        metrics_lists = []
        for summaries in summaries_list:
            metrics_lists.append([])
            for _ in summaries:
                if index in empty_summaries:
                    metrics_lists[-1].append(MetricsDict({
                        'SumQE': {'Q1': 0.0, 'Q2': 0.0, 'Q3': 0.0, 'Q4': 0.0, 'Q5': 0.0}
                    }))
                else:
                    preds = predictions[output_index]
                    metrics_lists[-1].append(MetricsDict({
                        'SumQE': {
                            'Q1': preds[0],
                            'Q2': preds[1],
                            'Q3': preds[2],
                            'Q4': preds[3],
                            'Q5': preds[4]
                        }
                    }))
                    output_index += 1
                index += 1
        return metrics_lists
def _run(self,
         summaries_list: List[List[SummaryType]],
         references_list: List[List[SummaryType]]) -> Tuple[List[MetricsDict], List[List[MetricsDict]]]:
    with TemporaryDirectory() as temp_dir:
        summary_filenames_list = []
        reference_filenames_list = []
        for i, (summaries, references) in enumerate(zip(summaries_list, references_list)):
            summary_filenames_list.append([])
            reference_filenames_list.append([])
            for j, summary in enumerate(summaries):
                summary_filename = f'{i}/model.{j}.txt'
                summary_filenames_list[-1].append(summary_filename)
                self._save_summary(summary, f'{temp_dir}/{summary_filename}')
            for j, reference in enumerate(references):
                symbol = chr(j + 65)
                reference_filename = f'{i}/gold.{symbol}.txt'
                reference_filenames_list[-1].append(reference_filename)
                self._save_summary(reference, f'{temp_dir}/{reference_filename}')

        config_filename = f'{temp_dir}/config.xml'
        self._save_config_file(config_filename, summary_filenames_list, reference_filenames_list)

        command = [
            self.rouge_script_location,
            '-e', self.rouge_eval_home,
            '-n', str(self.max_ngram),
            '-a', '-c', '95', '-r', '1000', '-p', '0.5', '-t', '0', '-d'
        ]
        if self.use_porter_stemmer:
            command += ['-m']
        if self.remove_stopwords:
            command += ['-s']
        if self.max_bytes is not None:
            command += ['-b', str(self.max_bytes)]
        if self.max_words is not None:
            command += ['-l', str(self.max_words)]
        if not self.compute_rouge_l:
            command += ['-x']
        if self.skip_bigram_gap_length is not None:
            command += ['-2', str(self.skip_bigram_gap_length), '-u']
        if self.wlcs_weight is not None:
            command += ['-w', str(self.wlcs_weight)]
        if self.scoring_function == 'average':
            command += ['-f', 'A']
        elif self.scoring_function == 'max':
            command += ['-f', 'B']
        else:
            raise Exception(f'Unrecognized scoring function: "{self.scoring_function}"')
        command += [config_filename]

        # We used to fail if anything was written to stderr, but ROUGE writes a warning
        # if the number of peers per reference set is different, which is expected in some
        # situations for us (if we just have more summaries to score for some reference
        # sets than others). Therefore, we no longer fail if stderr is not empty.
        logger.info(f'Running ROUGE command: "{" ".join(command)}"')
        process = Popen(command, stdout=PIPE, stderr=PIPE)
        stdout, stderr = process.communicate()

        macro_metrics_list, micro_metrics_lists = self._parse_rouge_stdout(stdout.decode())
        return macro_metrics_list, micro_metrics_lists
def test_all_summary_level_correlations(self):
    # This is a regression test for the "correlate" command. It does not test if it's accurate.
    with TemporaryDirectory() as temp_dir:
        command = [
            'python', '-m', 'sacrerouge', 'correlate',
            '--metrics-jsonl-files', MULTILING_METRICS,
            '--metrics', 'rouge-1_jk_precision', 'grade',
            '--summarizer-type', 'all',
            '--output-file', f'{temp_dir}/correlations.json',
            '--silent',
            '--summary-level-correlations-output', f'{temp_dir}/summary-level.json'
        ]
        subprocess.run(command, check=True)

        # Check the original correlations
        correlations = json.load(open(f'{temp_dir}/correlations.json', 'r'))
        assert correlations['summary_level']['pearson']['r'] == pytest.approx(0.4365526945989437, abs=1e-4)
        assert correlations['summary_level']['spearman']['rho'] == pytest.approx(0.3720759220056127, abs=1e-4)
        assert correlations['summary_level']['kendall']['tau'] == pytest.approx(0.1719691730561296, abs=1e-4)
        assert correlations['summary_level']['num_summary_groups'] == 3
        assert correlations['system_level']['pearson']['r'] == pytest.approx(0.28732601225892834, abs=1e-4)
        assert correlations['system_level']['spearman']['rho'] == pytest.approx(0.19999999999999998, abs=1e-4)
        assert correlations['system_level']['kendall']['tau'] == pytest.approx(0.0, abs=1e-4)
        assert correlations['system_level']['num_summarizers'] == 4
        assert correlations['global']['pearson']['r'] == pytest.approx(0.34183806349510004, abs=1e-4)
        assert correlations['global']['spearman']['rho'] == pytest.approx(0.4035707976004214, abs=1e-4)
        assert correlations['global']['kendall']['tau'] == pytest.approx(0.28603877677367767, abs=1e-4)
        assert correlations['global']['num_summaries'] == 12

        # Check the individual summary-level correlations
        summary_level = json.load(open(f'{temp_dir}/summary-level.json', 'r'))
        assert len(summary_level['pearson']) == 3
        assert summary_level['pearson']['M000'] == pytest.approx(0.3216337604513384, abs=1e-4)
        assert summary_level['pearson']['M001'] == pytest.approx(0.38969747442783453, abs=1e-4)
        assert summary_level['pearson']['M002'] == pytest.approx(0.598326848917658, abs=1e-4)
        assert len(summary_level['spearman']) == 3
        assert summary_level['spearman']['M000'] == pytest.approx(0.6000000000000001, abs=1e-4)
        assert summary_level['spearman']['M001'] == pytest.approx(0.316227766016838, abs=1e-4)
        assert summary_level['spearman']['M002'] == pytest.approx(0.19999999999999998, abs=1e-4)
        assert len(summary_level['kendall']) == 3
        assert summary_level['kendall']['M000'] == pytest.approx(0.3333333333333334, abs=1e-4)
        assert summary_level['kendall']['M001'] == pytest.approx(0.18257418583505539, abs=1e-4)
        assert summary_level['kendall']['M002'] == pytest.approx(0.0, abs=1e-4)
def score_multi_all(self,
                    summaries_list: List[List[SummaryType]],
                    references_list: List[List[ReferenceType]]) -> List[List[MetricsDict]]:
    with TemporaryDirectory() as temp_dir:
        # Save the summaries to a file. Each file has one summary per line.
        # For multiple references, each reference is used to evaluate the same
        # summary independently, so the system summary is repeated.
        candidate_file = f'{temp_dir}/candidates.txt'
        reference_file = f'{temp_dir}/references.txt'
        score_file = f'{temp_dir}/scores.txt'

        with open(candidate_file, 'w') as out_candidates:
            with open(reference_file, 'w') as out_references:
                for summaries, references in zip(summaries_list, references_list):
                    for summary in summaries:
                        for reference in references:
                            if isinstance(summary, list):
                                out_candidates.write(' '.join(summary) + '\n')
                            else:
                                out_candidates.write(summary + '\n')
                            if isinstance(reference, list):
                                out_references.write(' '.join(reference) + '\n')
                            else:
                                out_references.write(reference + '\n')

        # Run through BLEURT
        commands = [f'cd {self.bleurt_root}']
        if self.environment_name is not None:
            commands.append(f'source {os.environ["CONDA_INIT"]}')
            commands.append(f'conda activate {self.environment_name}')
        commands.append(f'python -m bleurt.score '
                        f'-candidate_file={candidate_file} '
                        f'-reference_file={reference_file} '
                        f'-bleurt_checkpoint={self.checkpoint_dir} '
                        f'-scores_file={score_file} '
                        f'-bleurt_batch_size={self.batch_size}')
        command = ' && '.join(commands)

        logger.info(f'Running command: "{command}"')
        redirect = None if self.verbose else PIPE
        process = Popen(command, stdout=redirect, stderr=redirect, shell=True)
        process.communicate()

        # Load the results
        scores = list(map(float, open(score_file, 'r').read().splitlines()))

        metrics_lists = []
        index = 0
        for summaries, references in zip(summaries_list, references_list):
            metrics_lists.append([])
            for _ in summaries:
                reference_scores = []
                for _ in references:
                    reference_scores.append(scores[index])
                    index += 1
                average = sum(reference_scores) / len(reference_scores)
                max_ = max(reference_scores)
                metrics_lists[-1].append(MetricsDict({'bleurt': {'average': average, 'max': max_}}))
        return metrics_lists
def test_score(self):
    with TemporaryDirectory() as temp_dir:
        output_file = f'{temp_dir}/metrics.jsonl'
        command = [
            'python', '-m', 'sacrerouge', 'score',
            _config_file_path,
            output_file
        ]
        process = Popen(command, stdout=PIPE, stderr=PIPE)
        stdout, stderr = process.communicate()

        instances = JsonlReader(_summaries_file_path).read()
        metrics_list = JsonlReader(output_file, Metrics).read()
        metrics_dicts = defaultdict(dict)
        assert len(instances) == len(metrics_list)
        for instance, metrics in zip(instances, metrics_list):
            assert metrics.instance_id == instance['instance_id']
            assert metrics.summarizer_id == instance['summarizer_id']
            assert metrics.summarizer_type == instance['summarizer_type']
            metrics_dicts[metrics.instance_id][metrics.summarizer_id] = metrics
            if metrics.summarizer_type == 'reference':
                assert 'python-rouge-1_jk' in metrics.metrics
                assert 'python-rouge-2_jk' in metrics.metrics
            else:
                assert 'python-rouge-1' in metrics.metrics
                assert 'python-rouge-2' in metrics.metrics
                assert 'python-rouge-1_jk' in metrics.metrics
                assert 'python-rouge-2_jk' in metrics.metrics

        # Test a couple of instances. I did not check to see if these are correct,
        # but the test will check if the results have changed
        assert metrics_dicts['d0801-A']['0'].metrics == {
            'python-rouge-1': {'precision': 29.444444444444446, 'recall': 26.700251889168765, 'f1': 28.005284015852048},
            'python-rouge-2': {'precision': 2.8089887640449436, 'recall': 2.5445292620865136, 'f1': 2.67022696929239},
            'python-rouge-1_jk': {'precision': 29.444444444444443, 'recall': 26.719572295067344, 'f1': 28.015250464050713},
            'python-rouge-2_jk': {'precision': 2.808988764044944, 'recall': 2.549772468714448, 'f1': 2.6730599647266313}
        }
        assert metrics_dicts['d0805-A']['B'].metrics == {
            'python-rouge-1_jk': {'precision': 37.84722222222222, 'recall': 36.21262458471761, 'f1': 37.011884550084886},
            'python-rouge-2_jk': {'precision': 9.12280701754386, 'recall': 8.724832214765101, 'f1': 8.919382504288166}
        }
def score_multi_all(self,
                    summaries_list: List[List[SummaryType]],
                    references_list: List[List[ReferenceType]]) -> List[List[MetricsDict]]:
    # The original code for PyrEval processes exactly 1 pyramid at a time. Therefore, the whole pipeline
    # needs to be run once per item in `references_list`. Each execution of the pipeline will load the
    # Stanford CoreNLP models and run them over the text. Loading the models takes a lot of time, and the
    # preprocessing of the same summary may run multiple times (for instance in jackknifing).
    #
    # To save time, our implementation passes all of the unique peer and reference summaries through the
    # preprocessing step of the pipeline at once, then runs the analysis step per-pyramid afterward. This
    # significantly increases the speed of the processing.

    # Identify the unique summaries so less preprocessing needs to be done
    summaries_list = self._flatten_summaries(summaries_list)
    references_list = self._flatten_summaries(references_list)
    all_summaries, summary_to_index = self._index_summaries(summaries_list, references_list)

    with TemporaryDirectory() as temp_dir:
        # First, clear the PyrEval directory in case the last run was messed up
        self._clean_directories()

        # All of the summaries are saved in the "peers" folder, even if they are references. The PyrEval
        # code normally runs separate steps to process the peer and model directories, which is slower
        # because it requires loading the Stanford models twice, but the preprocessing is the same.
        self._save_summaries(all_summaries, f'{self.pyreval_root}/Raw/peers')
        self._run_through_preprocessing()

        # The PyrEval code will create an xml for summary i called i.xml and a directory with more data for
        # that file. The directory names aren't consistent because they're created by enumerating glob
        # results (which I think are not always deterministically sorted, or I don't want to rely on the
        # assumption that they are sorted). So we have to figure out the mapping from the summary index to
        # the directory
        file_index_to_dir = self._map_file_index_to_directory(f'{self.pyreval_root}/Preprocess/peer_summaries')

        # All of the preprocessed summaries are now moved out of the PyrEval directory (or else they would
        # be used in the rest of the processing) to a temporary directory
        self._move_summaries_to_temp_dir(temp_dir)

        # Remove any extra data which could interfere with processing
        self._clean_directories()

        # Now build the pyramids and score
        metrics_dict_lists = []
        for i, (summaries, references) in enumerate(zip(summaries_list, references_list)):
            array_index_to_tgt_index = self._copy_summaries_for_processing(
                summaries, summary_to_index, file_index_to_dir,
                f'{temp_dir}/peers', f'{self.pyreval_root}/Preprocess/peer_summaries',
                False, True)
            self._copy_summaries_for_processing(
                references, summary_to_index, file_index_to_dir,
                f'{temp_dir}/peers', f'{self.pyreval_root}/Preprocess/wise_crowd_summaries',
                True, False)

            metrics_list = self._score_summaries(array_index_to_tgt_index)
            metrics_dict_lists.append(metrics_list)

            # Clean for the next iteration
            self._clean_directories()

        return metrics_dict_lists
def test_score_default(self):
    with TemporaryDirectory() as temp_dir:
        # I manually ran the scoring function with these parameters, and this test makes sure
        # those are equal to the output here
        output_file = f'{temp_dir}/metrics.jsonl'
        command = [
            'python', '-m', 'sacrerouge', 'python-rouge', 'score',
            '--output-jsonl', output_file,
            '--dataset-reader', 'reference-based',
            '--input-files', MULTILING_SUMMARIES,
            '--silent'
        ]
        process = Popen(command, stdout=PIPE, stderr=PIPE)
        process.communicate()

        instances = JsonlReader(MULTILING_SUMMARIES).read()
        metrics_list = JsonlReader(output_file, Metrics).read()
        metrics_dicts = defaultdict(dict)
        assert len(instances) == len(metrics_list)
        for instance, metrics in zip(instances, metrics_list):
            assert metrics.instance_id == instance['instance_id']
            assert metrics.summarizer_id == instance['summarizer_id']
            assert metrics.summarizer_type == instance['summarizer_type']
            metrics_dicts[metrics.instance_id][metrics.summarizer_id] = metrics
            if metrics.summarizer_type == 'reference':
                assert 'python-rouge-1_jk' in metrics.metrics
                assert 'python-rouge-2_jk' in metrics.metrics
            else:
                assert 'python-rouge-1' in metrics.metrics
                assert 'python-rouge-2' in metrics.metrics
                assert 'python-rouge-1_jk' in metrics.metrics
                assert 'python-rouge-2_jk' in metrics.metrics

        assert metrics_dicts['M000']['1'].metrics == {
            'python-rouge-1': {'precision': 41.699867197875164, 'recall': 40.516129032258064, 'f1': 41.09947643979057},
            'python-rouge-2': {'precision': 10.533333333333333, 'recall': 10.233160621761659, 'f1': 10.38107752956636},
            'python-rouge-1_jk': {'precision': 41.699867197875164, 'recall': 40.514662613316766, 'f1': 41.098355761265616},
            'python-rouge-2_jk': {'precision': 10.533333333333333, 'recall': 10.226158358122346, 'f1': 10.3773782079838}
        }
        assert metrics_dicts['M001']['B'].metrics == {
            'python-rouge-1_jk': {'precision': 51.59362549800797, 'recall': 51.18577075098815, 'f1': 51.3888888888889},
            'python-rouge-2_jk': {'precision': 20.4, 'recall': 20.238095238095237, 'f1': 20.318725099601597}
        }
def test_score_arguments(self):
    with TemporaryDirectory() as temp_dir:
        # I manually ran the scoring function with these parameters, and this test makes sure
        # those are equal to the output here
        output_file = f'{temp_dir}/metrics.jsonl'
        command = [
            'python', '-m', 'sacrerouge', 'python-rouge', 'score',
            '--output-jsonl', output_file,
            '--dataset-reader', 'reference-based',
            '--input-files', MULTILING_SUMMARIES,
            '--ngram_orders', '[3]',
            '--use_porter_stemmer', 'false',
            '--remove_stopwords', 'true',
            '--compute_rouge_l', 'true',
            '--silent'
        ]
        process = Popen(command, stdout=PIPE, stderr=PIPE)
        process.communicate()

        instances = JsonlReader(MULTILING_SUMMARIES).read()
        metrics_list = JsonlReader(output_file, Metrics).read()
        metrics_dicts = defaultdict(dict)
        assert len(instances) == len(metrics_list)
        for instance, metrics in zip(instances, metrics_list):
            assert metrics.instance_id == instance['instance_id']
            assert metrics.summarizer_id == instance['summarizer_id']
            assert metrics.summarizer_type == instance['summarizer_type']
            metrics_dicts[metrics.instance_id][metrics.summarizer_id] = metrics
            if metrics.summarizer_type == 'reference':
                assert 'python-rouge-3_jk' in metrics.metrics
                assert 'python-rouge-l_jk' in metrics.metrics
            else:
                assert 'python-rouge-3' in metrics.metrics
                assert 'python-rouge-l' in metrics.metrics
                assert 'python-rouge-3_jk' in metrics.metrics
                assert 'python-rouge-l_jk' in metrics.metrics

        assert metrics_dicts['M000']['1'].metrics == {
            'python-rouge-3': {'precision': 3.0952380952380953, 'recall': 3.110047846889952, 'f1': 3.1026252983293556},
            'python-rouge-l': {'precision': 20.657276995305164, 'recall': 20.754716981132077, 'f1': 20.705882352941174},
            'python-rouge-3_jk': {'precision': 3.095238095238095, 'recall': 3.073768703921825, 'f1': 3.0843425372732653},
            'python-rouge-l_jk': {'precision': 20.657276995305164, 'recall': 20.72789236755767, 'f1': 20.69018908745478}
        }
        assert metrics_dicts['M001']['B'].metrics == {
            'python-rouge-3_jk': {'precision': 3.75, 'recall': 3.4615384615384617, 'f1': 3.6},
            'python-rouge-l_jk': {'precision': 33.60655737704918, 'recall': 31.060606060606062, 'f1': 32.28346456692913}
        }