def save_data(summaries, metrics, output_dir: str):
    for language in summaries.keys():
        code = LANGUAGE_CODES[language]
        with JsonlWriter(f'{output_dir}/{code}.summaries.jsonl') as out_summaries:
            with JsonlWriter(f'{output_dir}/{code}.metrics.jsonl') as out_metrics:
                for instance_id in sorted(summaries[language].keys()):
                    for summarizer_id in sorted(summaries[language][instance_id].keys()):
                        summary = summaries[language][instance_id][summarizer_id]
                        instance_metrics = metrics[language][instance_id][summarizer_id]
                        references = get_references(summaries[language], instance_id, summarizer_id)
                        out_summaries.write({
                            'instance_id': instance_id,
                            'summarizer_id': summarizer_id,
                            'summarizer_type': summary['summarizer_type'],
                            'summary': summary,
                            'references': references
                        })
                        out_metrics.write(
                            Metrics(instance_id, summarizer_id,
                                    summary['summarizer_type'], instance_metrics))
def get_initial_micro_list(instances: List[EvalInstance]) -> List[Metrics]:
    micro_list = []
    for instance in instances:
        micro_list.append(
            Metrics(instance.instance_id, instance.summarizer_id,
                    instance.summarizer_type))
    return micro_list
def save_metrics(summaries: Dict[str, Dict[str, List[str]]],
                 metrics: Dict[str, Dict[str, List[int]]],
                 output_dir: str):
    with JsonlWriter(f'{output_dir}/task1.summaries.jsonl') as out_summaries:
        with JsonlWriter(f'{output_dir}/task1.metrics.jsonl') as out_metrics:
            for instance_id in sorted(summaries.keys()):
                for summarizer_id in summaries[instance_id].keys():
                    summary = summaries[instance_id][summarizer_id]
                    instance_metrics = metrics[instance_id][summarizer_id]
                    references = get_references(summaries, instance_id, summarizer_id)
                    out_summaries.write({
                        'instance_id': instance_id,
                        'summarizer_id': summarizer_id,
                        'summarizer_type': summary['summarizer_type'],
                        'summary': summary,
                        'references': references
                    })
                    out_metrics.write(
                        Metrics(instance_id, summarizer_id,
                                summary['summarizer_type'], instance_metrics))
def _load_generic_scores(input_file: str):
    data = json.load(open(input_file, 'r'))
    instances = []
    metrics_list = []
    documents = {}
    for i, instance in enumerate(data):
        document = instance['text'].strip()
        summary = instance['summary'].strip()
        scores = instance['scores']
        summarizer_id = str(i)
        if document not in documents:
            documents[document] = str(len(documents))
        instance_id = documents[document]

        instances.append({
            'instance_id': instance_id,
            'summarizer_id': summarizer_id,
            'summarizer_type': 'peer',
            'summary': {
                'text': summary
            },
            'document': {
                'text': document
            }
        })
        metrics_list.append(
            Metrics(instance_id, summarizer_id, 'peer',
                    MetricsDict({'generic_quality': scores})))
    return instances, metrics_list
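# Illustration only: a minimal input record in the shape _load_generic_scores
# reads ('text', 'summary', 'scores'). The values below are invented, and
# 'scores' is assumed to be a list of per-annotator generic-quality judgments.
_EXAMPLE_GENERIC_SCORES_INPUT = [
    {
        'text': 'The source document goes here.',
        'summary': 'A system-generated summary of the document.',
        'scores': [3, 4, 5]
    }
]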
def save_mds_metrics(summaries: Dict[str, Dict[str, List[str]]],
                     metrics: Dict[str, Dict[str, List[int]]],
                     task_name: str,
                     output_dir: str):
    for length in ['10', '50', '100', '200']:
        with JsonlWriter(f'{output_dir}/{task_name}.{length}.summaries.jsonl') as out_summaries:
            with JsonlWriter(f'{output_dir}/{task_name}.{length}.metrics.jsonl') as out_metrics:
                for instance_id in sorted(summaries.keys()):
                    for summarizer_id in summaries[instance_id][length].keys():
                        summary = summaries[instance_id][length][summarizer_id]
                        instance_metrics = metrics[instance_id][length][summarizer_id]
                        if len(instance_metrics) == 0:
                            continue
                        references = get_mds_references(summaries, instance_id, length, summarizer_id)
                        out_summaries.write({
                            'instance_id': instance_id,
                            'summarizer_id': summarizer_id,
                            'summarizer_type': summary['summarizer_type'],
                            'summary': summary,
                            'references': references
                        })
                        out_metrics.write(
                            Metrics(instance_id, summarizer_id,
                                    summary['summarizer_type'], instance_metrics))
def main(args):
    random.seed(args.random_seed)
    instances = JsonlReader(args.input_jsonl, Metrics).read()
    mapping = map_summarizer_id_to_results(instances)

    # Pick the specific references which are sampled for each instance_id. Be consistent
    # and use these to score all of the peer summaries
    instance_to_references = get_instance_id_to_references(instances)
    sampled_references = sample_references(instance_to_references, args.num_references)

    with JsonlWriter(args.output_file) as out:
        for (instance_id, summarizer_id, summarizer_type), metrics_dict in mapping.items():
            reference_ids = sampled_references[instance_id]
            metrics = sum([metrics_dict[reference_id] for reference_id in reference_ids]) / len(reference_ids)

            # Hacky: APES num_correct should be the sum, not the average, so undo the division
            if 'APES' in metrics:
                metrics['APES']['num_correct'] *= len(reference_ids)

            out.write(Metrics(instance_id, summarizer_id, summarizer_type, metrics))
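def _example_reference_averaging():
    # Worked illustration of the averaging step in main() above; all values are
    # invented and nothing here is part of the real pipeline. It assumes
    # MetricsDict supports sum() and scalar division exactly as main() uses
    # them, including on nested entries.
    per_reference = [
        MetricsDict({'rouge-1': {'recall': 40.0}, 'APES': {'num_correct': 3}}),
        MetricsDict({'rouge-1': {'recall': 50.0}, 'APES': {'num_correct': 5}}),
    ]
    averaged = sum(per_reference) / len(per_reference)
    # num_correct is now the average (4.0); multiplying by the number of
    # references restores the intended total (8), mirroring the
    # "undo the division" fix in main().
    averaged['APES']['num_correct'] *= len(per_reference)
    return averaged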
def _get_initial_metrics_dicts(instances: List[EvalInstance]) -> Dict[str, Dict[str, Metrics]]:
    metrics_dicts = defaultdict(dict)
    for instance in instances:
        metrics = Metrics(instance.instance_id, instance.summarizer_id,
                          instance.summarizer_type)
        metrics_dicts[instance.instance_id][instance.summarizer_id] = metrics
    return metrics_dicts
def save_summaries_and_metrics(summaries, metrics, output_dir: str):
    with JsonlWriter(f'{output_dir}/task1.A-B.summaries.jsonl') as out_summaries_A_B:
        with JsonlWriter(f'{output_dir}/task1.A.summaries.jsonl') as out_summaries_A:
            with JsonlWriter(f'{output_dir}/task1.B.summaries.jsonl') as out_summaries_B:
                with JsonlWriter(f'{output_dir}/task1.A-B.metrics.jsonl') as out_metrics_A_B:
                    with JsonlWriter(f'{output_dir}/task1.A.metrics.jsonl') as out_metrics_A:
                        with JsonlWriter(f'{output_dir}/task1.B.metrics.jsonl') as out_metrics_B:
                            for instance_id in sorted(summaries.keys()):
                                for summarizer_id in sorted(summaries[instance_id].keys()):
                                    summary_A = summaries[instance_id][summarizer_id]['A']
                                    summary_B = summaries[instance_id][summarizer_id]['B']
                                    references_A = get_references(summaries, instance_id, summarizer_id, 'A')
                                    references_B = get_references(summaries, instance_id, summarizer_id, 'B')
                                    metrics_A = metrics[instance_id]['A'][summarizer_id]
                                    metrics_B = metrics[instance_id]['B'][summarizer_id]

                                    # Metrics are written to the separate metrics files,
                                    # not embedded in the summary records
                                    summary_instance_A = {
                                        'instance_id': f'{instance_id}-A',
                                        'summarizer_id': summarizer_id,
                                        'summarizer_type': summary_A['summarizer_type'],
                                        'summary': summary_A,
                                        'references': references_A,
                                    }
                                    summary_instance_B = {
                                        'instance_id': f'{instance_id}-B',
                                        'summarizer_id': summarizer_id,
                                        'summarizer_type': summary_B['summarizer_type'],
                                        'summary': summary_B,
                                        'references': references_B,
                                    }
                                    metric_instance_A = Metrics(f'{instance_id}-A', summarizer_id,
                                                                summary_A['summarizer_type'], metrics_A)
                                    metric_instance_B = Metrics(f'{instance_id}-B', summarizer_id,
                                                                summary_B['summarizer_type'], metrics_B)

                                    out_summaries_A_B.write(summary_instance_A)
                                    out_summaries_A_B.write(summary_instance_B)
                                    out_summaries_A.write(summary_instance_A)
                                    out_summaries_B.write(summary_instance_B)
                                    out_metrics_A_B.write(metric_instance_A)
                                    out_metrics_A_B.write(metric_instance_B)
                                    out_metrics_A.write(metric_instance_A)
                                    out_metrics_B.write(metric_instance_B)
def test_convert_to_matrices(self):
    metrics_list = [
        Metrics('1', 'A', 'peer', {'m1': 1, 'm2': 2, 'm3': 3}),
        Metrics('2', 'A', 'peer', {'m1': 4, 'm2': 5}),
        Metrics('1', 'B', 'peer', {'m1': 6, 'm2': 7, 'm3': 8}),
        Metrics('2', 'B', 'peer', {'m1': 9, 'm2': 10, 'm3': 11}),
    ]
    m1 = convert_to_matrices(metrics_list, 'm1')
    np.testing.assert_array_equal(m1, [[1, 4], [6, 9]])

    m1, m2 = convert_to_matrices(metrics_list, 'm1', 'm2')
    np.testing.assert_array_equal(m1, [[1, 4], [6, 9]])
    np.testing.assert_array_equal(m2, [[2, 5], [7, 10]])

    m3 = convert_to_matrices(metrics_list, 'm3')
    np.testing.assert_array_equal(m3, [[3, np.nan], [8, 11]])

    metrics_list = [
        Metrics('1', 'A', 'peer', {'m1': 1, 'm2': 2}),
        Metrics('2', 'A', 'peer', {'m1': 4, 'm2': 5}),
        Metrics('1', 'B', 'peer', {'m1': 6, 'm2': 7}),
        Metrics('3', 'B', 'peer', {'m1': 2, 'm2': 9}),
    ]
    m1 = convert_to_matrices(metrics_list, 'm1')
    np.testing.assert_array_equal(m1, [[1, 4, np.nan], [6, np.nan, 2]])
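import numpy as np

def _convert_to_matrices_sketch(metrics_list, *metric_names):
    # Hypothetical re-implementation, written only to illustrate the layout the
    # test above expects from convert_to_matrices: one row per summarizer and
    # one column per instance, both in order of first appearance, with NaN for
    # missing (summarizer, instance) pairs. It assumes each Metrics object
    # exposes instance_id, summarizer_id, and a dict-like metrics attribute.
    summarizer_ids = []
    instance_ids = []
    for m in metrics_list:
        if m.summarizer_id not in summarizer_ids:
            summarizer_ids.append(m.summarizer_id)
        if m.instance_id not in instance_ids:
            instance_ids.append(m.instance_id)

    matrices = []
    for name in metric_names:
        matrix = np.full((len(summarizer_ids), len(instance_ids)), np.nan)
        for m in metrics_list:
            if name in m.metrics:
                row = summarizer_ids.index(m.summarizer_id)
                col = instance_ids.index(m.instance_id)
                matrix[row, col] = m.metrics[name]
        matrices.append(matrix)
    return matrices[0] if len(matrices) == 1 else tuple(matrices)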
def save_data(model_summaries, peer_summaries, metrics, output_dir):
    for language in model_summaries.keys():
        with JsonlWriter(f'{output_dir}/{language}.summaries.jsonl') as out_summaries:
            if language in metrics:
                out_metrics = open(f'{output_dir}/{language}.metrics.jsonl', 'w')
            else:
                out_metrics = None

            for instance_id in sorted(model_summaries[language].keys()):
                references = [model_summaries[language][instance_id][key]
                              for key in sorted(model_summaries[language][instance_id].keys())]

                for summarizer_id in sorted(peer_summaries[language][instance_id].keys()):
                    summary = peer_summaries[language][instance_id][summarizer_id]
                    out_summaries.write({
                        'instance_id': instance_id,
                        'summarizer_id': summarizer_id,
                        'summarizer_type': 'peer',
                        'summary': summary,
                        'references': references
                    })
                    if out_metrics is not None:
                        instance_metrics = metrics[language][instance_id][summarizer_id]
                        out_metrics.write(jsons.dumps(Metrics(instance_id, summarizer_id, 'peer', instance_metrics)) + '\n')

                for i, reference in enumerate(references):
                    summarizer_id = reference['summarizer_id']
                    out_summaries.write({
                        'instance_id': instance_id,
                        'summarizer_id': summarizer_id,
                        'summarizer_type': 'reference',
                        'summary': reference,
                        'references': references[:i] + references[i + 1:]
                    })
                    if out_metrics is not None:
                        instance_metrics = metrics[language][instance_id][summarizer_id]
                        out_metrics.write(jsons.dumps(Metrics(instance_id, summarizer_id, 'reference', instance_metrics)) + '\n')

            if out_metrics is not None:
                out_metrics.close()
def test_serialization(self):
    instance_id = 'd500'
    summarizer_id = '5'
    summarizer_type = 'peer'
    metrics_dict = {'a': 4, 'b': {'c': [1, 2]}}
    metrics = Metrics(instance_id, summarizer_id, summarizer_type, metrics_dict)

    serialized = jsons.dumps(metrics)
    assert serialized == '{"instance_id": "d500", "summarizer_id": "5", "summarizer_type": "peer", "metrics": {"a": 4, "b": {"c": [1, 2]}}}'

    deserialized = jsons.loads(serialized, Metrics)
    assert metrics == deserialized
    assert isinstance(deserialized.metrics, MetricsDict)
def _load_dailynews_aspects(input_file: str):
    data = json.load(open(input_file, 'r'))
    instances = []
    metrics_list = []
    documents = {}
    for i, instance in enumerate(data):
        document = instance['text'].strip()
        summary = instance['summary'].strip()
        fluent = instance['map_quality_scores']['Fluent']['scores']
        understandable = instance['map_quality_scores']['Understandable']['scores']
        informative = instance['map_quality_scores']['Informative']['scores']
        compact = instance['map_quality_scores']['Compact']['scores']
        overall = instance['map_quality_scores']['Overall']['scores']
        summarizer_id = str(i)
        if document not in documents:
            documents[document] = str(len(documents))
        instance_id = documents[document]

        instances.append({
            'instance_id': instance_id,
            'summarizer_id': summarizer_id,
            'summarizer_type': 'peer',
            'summary': {
                'text': summary
            },
            'document': {
                'text': document
            }
        })
        metrics_list.append(
            Metrics(instance_id, summarizer_id, 'peer',
                    MetricsDict({
                        'fluent': fluent,
                        'understandable': understandable,
                        'informative': informative,
                        'compact': compact,
                        'overall': overall,
                    })))
    return instances, metrics_list
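# Illustration only: a minimal input record in the shape _load_dailynews_aspects
# reads. All values are invented, and each aspect's 'scores' list is assumed to
# hold per-annotator judgments.
_EXAMPLE_DAILYNEWS_ASPECTS_INPUT = [
    {
        'text': 'The source article goes here.',
        'summary': 'A system-generated summary of the article.',
        'map_quality_scores': {
            'Fluent': {'scores': [4, 5]},
            'Understandable': {'scores': [5, 5]},
            'Informative': {'scores': [3, 4]},
            'Compact': {'scores': [4, 4]},
            'Overall': {'scores': [4, 4]}
        }
    }
]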
def convert_to_sacrerouge_instances_and_metrics(
        annotated_data: Dict,
        filtered_data: Dict,
        split: str) -> Tuple[List[Dict], List[Metrics]]:
    instances = []
    metrics_list = []
    for instance_dict in annotated_data.values():
        instance_id = str(instance_dict['doc_id'])
        for summarizer_id, summarizer_dict in instance_dict['system_summaries'].items():
            summarizer_id = summarizer_id[:-4]  # strip .txt
            if split == 'abs' and summarizer_id == 'bart_out':
                # This should not be included in the abstractive models. See the Readme
                # about this dataset
                continue

            document_text = filtered_data[summarizer_id][instance_id]['src']
            if '<t>' in document_text:
                document_text = split_into_sentences(document_text)
            assert len(document_text) > 0
            document = {'text': document_text}

            summary_sentences = split_into_sentences(filtered_data[summarizer_id][instance_id]['out'])
            assert len(summary_sentences) > 0
            summary = {'text': summary_sentences}

            reference_sentences = split_into_sentences(filtered_data[summarizer_id][instance_id]['ref'])
            assert len(reference_sentences) > 0
            reference = {
                'summarizer_id': 'ground-truth',
                'summarizer_type': 'reference',
                'text': reference_sentences
            }

            metrics_dict = MetricsDict({
                'rouge-1': {
                    'precision': float(summarizer_dict['scores']['rouge_1_precision']),
                    'recall': float(summarizer_dict['scores']['rouge_1_recall']),
                    'f1': float(summarizer_dict['scores']['rouge_1_f_score'])
                },
                'rouge-2': {
                    'precision': float(summarizer_dict['scores']['rouge_2_precision']),
                    'recall': float(summarizer_dict['scores']['rouge_2_recall']),
                    'f1': float(summarizer_dict['scores']['rouge_2_f_score'])
                },
                'rouge-l': {
                    'precision': float(summarizer_dict['scores']['rouge_l_precision']),
                    'recall': float(summarizer_dict['scores']['rouge_l_recall']),
                    'f1': float(summarizer_dict['scores']['rouge_l_f_score'])
                },
                'js-2': float(summarizer_dict['scores']['js-2']),
                'MoverScore': float(summarizer_dict['scores']['mover_score']),
                'bertscore': {
                    'precision': float(summarizer_dict['scores']['bert_precision_score']),
                    'recall': float(summarizer_dict['scores']['bert_recall_score']),
                    'f1': float(summarizer_dict['scores']['bert_precision_score']),
                },
                'litepyramid': {
                    'recall': float(summarizer_dict['scores']['litepyramid_recall'])
                }
            })

            instances.append({
                'instance_id': instance_id,
                'summarizer_id': summarizer_id,
                'summarizer_type': 'peer',
                'references': [reference],
                'summary': summary,
                'documents': [document]
            })
            metrics_list.append(
                Metrics(instance_id, summarizer_id, 'peer', metrics_dict))
    return instances, metrics_list
def load_judgments(file_path: str) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Metrics]]:
    summaries = []
    summaries_with_crowd = []
    metrics_list = []
    with JsonlReader(file_path) as f:
        for instance in f:
            instance_id = instance['id']
            summarizer_id = instance['model_id']
            filename = instance['filepath']
            summary = {'text': instance['decoded']}
            references = instance['references']
            expert_annotations = instance['expert_annotations']
            turker_annotations = instance['turker_annotations']
            document = instance['text']

            # It appears that the first reference is always the ground-truth and the others
            # are crowdsourced, although this is not fully confirmed; it is based only on
            # looking through a handful of examples.
            assert len(references) == 11
            references[0] = {
                'summarizer_id': 'ground-truth',
                'summarizer_type': 'reference',
                'text': references[0]
            }
            for i in range(1, 11):
                references[i] = {
                    'summarizer_id': f'turker-{i}',
                    'summarizer_type': 'reference',
                    'text': references[i]
                }

            summaries.append({
                'instance_id': instance_id,
                'summarizer_id': summarizer_id,
                'summarizer_type': 'peer',
                'file_path': filename,
                'document': {'text': document},
                'summary': summary,
                'references': [references[0]]
            })
            summaries_with_crowd.append({
                'instance_id': instance_id,
                'summarizer_id': summarizer_id,
                'summarizer_type': 'peer',
                'file_path': filename,
                'document': {'text': document},
                'summary': summary,
                'references': references
            })

            expert_metrics = MetricsDict({
                'coherence': [annotation['coherence'] for annotation in expert_annotations],
                'consistency': [annotation['consistency'] for annotation in expert_annotations],
                'fluency': [annotation['fluency'] for annotation in expert_annotations],
                'relevance': [annotation['relevance'] for annotation in expert_annotations]
            })
            turker_metrics = MetricsDict({
                'coherence': [annotation['coherence'] for annotation in turker_annotations],
                'consistency': [annotation['consistency'] for annotation in turker_annotations],
                'fluency': [annotation['fluency'] for annotation in turker_annotations],
                'relevance': [annotation['relevance'] for annotation in turker_annotations]
            })
            both = MetricsDict({
                'expert': expert_metrics,
                'turker': turker_metrics
            })
            metrics = Metrics(instance_id, summarizer_id, 'peer', both)
            metrics_list.append(metrics)
    return summaries, summaries_with_crowd, metrics_list
def load_judgments(file_path: str) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Metrics]]:
    summaries = []
    summaries_with_crowd = []
    metrics_list = []
    with JsonlReader(file_path) as f:
        for instance in f:
            instance_id = instance['id']
            summarizer_id = instance['model_id']
            filename = instance['filepath']
            summary = {'text': instance['decoded']}
            references = instance['references']
            expert_annotations = instance['expert_annotations']
            turker_annotations = instance['turker_annotations']
            document = instance['text']

            # The first reference is always the ground-truth
            # https://github.com/Yale-LILY/SummEval/issues/8
            assert len(references) == 11
            references[0] = {
                'summarizer_id': 'ground-truth',
                'summarizer_type': 'reference',
                'text': references[0]
            }
            for i in range(1, 11):
                references[i] = {
                    'summarizer_id': f'turker-{i}',
                    'summarizer_type': 'reference',
                    'text': references[i]
                }

            summaries.append({
                'instance_id': instance_id,
                'summarizer_id': summarizer_id,
                'summarizer_type': 'peer',
                'file_path': filename,
                'document': {'text': document},
                'summary': summary,
                'references': [references[0]]
            })
            summaries_with_crowd.append({
                'instance_id': instance_id,
                'summarizer_id': summarizer_id,
                'summarizer_type': 'peer',
                'file_path': filename,
                'document': {'text': document},
                'summary': summary,
                'references': references
            })

            expert_metrics = MetricsDict({
                'coherence': [annotation['coherence'] for annotation in expert_annotations],
                'consistency': [annotation['consistency'] for annotation in expert_annotations],
                'fluency': [annotation['fluency'] for annotation in expert_annotations],
                'relevance': [annotation['relevance'] for annotation in expert_annotations]
            })
            turker_metrics = MetricsDict({
                'coherence': [annotation['coherence'] for annotation in turker_annotations],
                'consistency': [annotation['consistency'] for annotation in turker_annotations],
                'fluency': [annotation['fluency'] for annotation in turker_annotations],
                'relevance': [annotation['relevance'] for annotation in turker_annotations]
            })
            both = MetricsDict({
                'expert': expert_metrics,
                'turker': turker_metrics
            })
            metrics = Metrics(instance_id, summarizer_id, 'peer', both)
            metrics_list.append(metrics)
    return summaries, summaries_with_crowd, metrics_list
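# Illustration only: the per-line record shape that load_judgments reads,
# reconstructed from the keys it accesses; every value below is invented.
# The 'references' list has 11 entries (1 ground-truth + 10 crowdsourced),
# matching the assertion in the loader.
_EXAMPLE_JUDGMENT_RECORD = {
    'id': 'dm-test-0001',
    'model_id': 'M0',
    'filepath': 'outputs/M0/0001.txt',
    'decoded': 'The generated summary text.',
    'text': 'The source document text.',
    'references': ['Ground-truth reference.'] + [f'Crowdsourced reference {i}.' for i in range(1, 11)],
    'expert_annotations': [
        {'coherence': 4, 'consistency': 5, 'fluency': 5, 'relevance': 4}
    ],
    'turker_annotations': [
        {'coherence': 3, 'consistency': 4, 'fluency': 4, 'relevance': 3}
    ]
}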