def score_responses(self):
    """Score argument (type-role-filler) responses per core document and metatype.

    For each core document, gold and system TRFs are fetched and aligned, then
    scored once per metatype group. One ArgumentMetricScore row is emitted per
    (document, metatype) that has at least one gold or system TRF. Rows are
    sorted, aggregate (summary) rows are appended via aggregate_scores, and the
    resulting ScorePrinter is stored on self.scores.
    """
    # Metatype filter groups: 'ALL' scores Events and Relations together.
    metatypes = {
        'ALL': ['Event', 'Relation'],
        'Event': ['Event'],
        'Relation': ['Relation']
    }
    scores = []
    for document_id in self.get('core_documents'):
        language = self.get('gold_responses').get('document_mappings').get(
            'documents').get(document_id).get('language')
        gold_trfs = self.get('document_type_role_fillers', 'gold', document_id)
        system_trfs = self.get('document_type_role_fillers', 'system', document_id)
        # Align system TRFs against gold TRFs before scoring (mutates state
        # read by the 'score' getter below).
        self.align_trfs(document_id, gold_trfs, system_trfs)
        for metatype_key in metatypes:
            num_gold_trf, num_system_trf, precision, recall, f1 = self.get(
                'score', gold_trfs, system_trfs, metatypes[metatype_key])
            # Skip metatype groups with nothing to score in this document.
            if num_gold_trf + num_system_trf == 0:
                continue
            score = ArgumentMetricScore(logger=self.logger,
                                        run_id=self.get('run_id'),
                                        document_id=document_id,
                                        language=language,
                                        metatype=metatype_key,
                                        precision=precision,
                                        recall=recall,
                                        f1=f1)
            scores.append(score)
    scores_printer = ScorePrinter(self.logger, self.printing_specs)
    for score in multisort(scores, (('document_id', False),
                                    ('metatype_sortkey', False))):
        scores_printer.add(score)
    # Append aggregate/summary rows for this metric.
    self.aggregate_scores(scores_printer, ArgumentMetricScore)
    self.scores = scores_printer
def score_responses(self): scores = ScorePrinter(self.logger, self.printing_specs, self.separator) # TODO: add details for query_id in self.queries_to_score: for document_id in self.document_mappings.get('core_documents'): num_submitted = 150 num_correct = 70 num_incorrect = 30 num_right = 60 num_wrong = 40 num_redundant = 10 num_pooled = 100 num_ignored = 50 num_ground_truth = 100 scoring_metric_1 = 0.8 scoring_metric_2 = 0.7 scoring_metric_3 = 0.6 score = ClassScore(self.logger, self.runid, query_id, document_id, num_submitted, num_correct, num_incorrect, num_right, num_wrong, num_redundant, num_pooled, num_ignored, num_ground_truth, scoring_metric_1, scoring_metric_2, scoring_metric_3) scores.add(score) self.scores = scores
def score_responses(self):
    """Score temporal (date) responses of gold clusters against aligned system clusters.

    For each gold Event/Relation cluster that carries a temporal constraint,
    computes a similarity between the gold date and the aligned system
    cluster's dates (0 when unaligned). Emits one TemporalMetricScore row per
    scored gold cluster, sorts the rows, appends aggregates, and stores the
    ScorePrinter on self.scores.
    """
    scores = []
    for document_id in self.get('core_documents'):
        # add scores corresponding to all gold clusters
        document = self.get('gold_responses').get('document_mappings').get('documents').get(document_id)
        language = document.get('language')
        document_gold_to_system = self.get('cluster_alignment').get('gold_to_system').get(document_id)
        for gold_cluster_id in document_gold_to_system if document_gold_to_system else []:
            system_cluster_id = document_gold_to_system.get(gold_cluster_id).get('aligned_to')
            aligned_similarity = document_gold_to_system.get(gold_cluster_id).get('aligned_similarity')
            similarity = 0
            # 'None' is the sentinel string for "no cluster" in the alignment.
            if gold_cluster_id == 'None':
                continue
            gold_cluster = self.get('cluster', 'gold', document_id, gold_cluster_id)
            metatype = gold_cluster.get('metatype')
            # Only Events and Relations carry temporal constraints.
            if metatype not in ['Event', 'Relation']:
                continue
            # Skip gold clusters with no temporal constraint annotated.
            if list(gold_cluster.get('dates').values())[0] is None:
                self.record_event('NO_TEMPORAL_CONSTRAINT', gold_cluster_id, document_id)
                continue
            if system_cluster_id != 'None':
                # An alignment with zero similarity should never occur.
                if aligned_similarity == 0:
                    self.record_event('DEFAULT_CRITICAL_ERROR', 'aligned_similarity=0')
                system_cluster = self.get('cluster', 'system', document_id, system_cluster_id)
                # Aligned clusters are expected to share a metatype.
                if system_cluster.get('metatype') != metatype:
                    self.record_event('UNEXPECTED_ALIGNED_CLUSTER_METATYPE',
                                      system_cluster.get('metatype'),
                                      system_cluster_id, metatype,
                                      gold_cluster_id)
                # Gold clusters are expected to have exactly one date entry;
                # only the first is scored.
                if len(gold_cluster.get('dates').keys()) > 1:
                    self.record_event('UNEXPECTED_NUM_DATES', gold_cluster_id, document_id)
                similarity = self.get('temporal_similarity',
                                      list(gold_cluster.get('dates').values())[0],
                                      list(system_cluster.get('dates').values()))
            score = TemporalMetricScore(logger=self.logger,
                                        run_id=self.get('run_id'),
                                        document_id=document_id,
                                        language=language,
                                        metatype=metatype,
                                        gold_cluster_id=gold_cluster_id,
                                        system_cluster_id=system_cluster_id,
                                        similarity=similarity)
            scores.append(score)
    scores_printer = ScorePrinter(self.logger, self.printing_specs)
    for score in multisort(scores, (('document_id', False),
                                    ('metatype', False),
                                    ('gold_cluster_id', False),
                                    ('system_cluster_id', False))):
        scores_printer.add(score)
    # Append aggregate/summary rows for this metric.
    self.aggregate_scores(scores_printer, TemporalMetricScore)
    self.scores = scores_printer
def score_responses(self):
    """Score coreference responses per core document and metatype.

    Computes precision/recall/F1 from similarity totals: precision divides the
    best (max) gold-system total similarity by the system's self-similarity,
    recall divides it by the gold's self-similarity. One CoreferenceMetricScore
    row is emitted per (document, metatype); rows are sorted, aggregates are
    appended, and the ScorePrinter is stored on self.scores.

    Fix: recall's division is now guarded like precision's — a document with
    no gold clusters of a given metatype yields recall 0 instead of raising
    ZeroDivisionError.
    """
    # Metatype filter groups: 'ALL' scores Entities and Events together.
    metatypes = {
        'ALL': ['Entity', 'Event'],
        'Entity': ['Entity'],
        'Event': ['Event']
    }
    scores = []
    for document_id in self.get('core_documents'):
        document = self.get('gold_responses').get('document_mappings').get(
            'documents').get(document_id)
        language = document.get('language')
        for metatype_key in metatypes:
            max_total_similarity = self.get('max_total_similarity',
                                            document_id,
                                            metatypes[metatype_key])
            total_self_similarity_gold = self.get('total_self_similarity',
                                                  'gold', document_id,
                                                  metatypes[metatype_key])
            total_self_similarity_system = self.get(
                'total_self_similarity', 'system', document_id,
                metatypes[metatype_key])
            # Guard all divisions: empty gold/system sides score 0, not crash.
            precision = max_total_similarity / total_self_similarity_system if total_self_similarity_system else 0
            recall = max_total_similarity / total_self_similarity_gold if total_self_similarity_gold else 0
            f1 = 2 * precision * recall / (
                precision + recall) if precision + recall else 0
            score = CoreferenceMetricScore(logger=self.logger,
                                           run_id=self.get('run_id'),
                                           document_id=document_id,
                                           language=language,
                                           metatype=metatype_key,
                                           precision=precision,
                                           recall=recall,
                                           f1=f1)
            scores.append(score)
    scores_printer = ScorePrinter(self.logger, self.printing_specs)
    for score in multisort(scores, (('document_id', False),
                                    ('metatype_sortkey', False))):
        scores_printer.add(score)
    # Append aggregate/summary rows for this metric.
    self.aggregate_scores(scores_printer, CoreferenceMetricScore)
    self.scores = scores_printer
def score_responses(self):
    """Score coreference responses per core document and report a mean F1 summary.

    For each core document, precision/recall/F1 are derived from similarity
    totals; a final 'Summary' row carries the mean F1 over all core documents.
    The populated ScorePrinter is stored on self.scores.

    Fixes: recall's division is guarded like precision's (documents with zero
    gold self-similarity score 0 instead of raising ZeroDivisionError), and
    the mean-F1 division is guarded against an empty core-document set.
    """
    scores = ScorePrinter(self.logger, self.printing_specs, self.separator)
    mean_f1 = 0
    for document_id in self.get('core_documents'):
        max_total_similarity = self.get('max_total_similarity', document_id)
        total_self_similarity_gold = self.get('total_self_similarity', 'gold', document_id)
        total_self_similarity_system = self.get('total_self_similarity', 'system', document_id)
        # Guard all divisions: empty gold/system sides score 0, not crash.
        precision = max_total_similarity / total_self_similarity_system if total_self_similarity_system else 0
        recall = max_total_similarity / total_self_similarity_gold if total_self_similarity_gold else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0
        score = CoreferenceMetricScore(self.logger, self.get('runid'), document_id, precision, recall, f1)
        mean_f1 += f1
        scores.add(score)
    # Mean over all core documents; 0 if there are none (avoids ZeroDivisionError).
    num_documents = len(self.get('core_documents'))
    mean_f1 = mean_f1 / num_documents if num_documents else 0
    mean_score = CoreferenceMetricScore(self.logger, self.get('runid'), 'Summary', '', '', mean_f1, summary=True)
    scores.add(mean_score)
    self.scores = scores
def score_responses(self):
    """Score across-document coreference responses per query, with a macro average.

    One AcrossDocumentsCoreferenceMetricScore row is emitted per query (its
    counts come from the 'counts' getter); an 'ALL-Macro' summary row carries
    the macro-averaged average precision over all queries. The populated
    ScorePrinter is stored on self.scores.
    """
    scores = []
    sum_average_precision = 0
    for query_id in self.get('queries_to_score'):
        entity_id = self.get('entity_id', query_id)
        counts = self.get('counts', query_id)
        sum_average_precision += counts['average_precision']
        score = AcrossDocumentsCoreferenceMetricScore(self.get('logger'),
                                                      run_id=self.get('run_id'),
                                                      query_id=query_id,
                                                      entity_id=entity_id,
                                                      **counts)
        scores.append(score)
    # Macro average: unweighted mean of per-query average precision.
    macro_counts = {'average_precision': sum_average_precision/len(self.get('queries_to_score'))}
    # Blank out all 'num_*' columns on the macro row — counts are not
    # meaningful when macro-averaging.
    for field_name in [s.get('name') for s in self.get('printing_specs') if s.get('name').startswith('num_')]:
        macro_counts[field_name] = macro_counts[field_name] if field_name in macro_counts else ''
    macro_average_score = AcrossDocumentsCoreferenceMetricScore(self.get('logger'),
                                                                run_id=self.get('run_id'),
                                                                query_id='ALL-Macro',
                                                                entity_id='Summary',
                                                                summary=True,
                                                                **macro_counts)
    scores_printer = ScorePrinter(self.logger, self.printing_specs)
    for score in multisort(scores, (('entity_id', False), ('query_id', False))):
        scores_printer.add(score)
    scores_printer.add(macro_average_score)
    self.scores = scores_printer
def score_responses(self):
    """Score coreference responses per document/metatype with language-keyed mean F1 summaries.

    Emits one CoreferenceMetricScore row per (document, metatype group) and,
    after sorting, appends a 'Summary' row per '<language>:<metatype>' key
    carrying the mean F1 for that combination (including 'ALL' buckets).

    NOTE(review): recall's division is unguarded, unlike precision's —
    total_self_similarity_gold of 0 would raise ZeroDivisionError; confirm
    gold self-similarity is always nonzero for core documents.
    """
    # Metatype filter groups: 'ALL' scores Entities and Events together.
    metatypes = {
        'ALL': ['Entity', 'Event'],
        'Entity': ['Entity'],
        'Event': ['Event']
    }
    scores = []
    mean_f1s = {}
    counts = {}
    for document_id in self.get('core_documents'):
        document = self.get('gold_responses').get('document_mappings').get(
            'documents').get(document_id)
        language = document.get('language')
        for metatype_key in metatypes:
            max_total_similarity = self.get('max_total_similarity',
                                            document_id,
                                            metatypes[metatype_key])
            total_self_similarity_gold = self.get('total_self_similarity',
                                                  'gold', document_id,
                                                  metatypes[metatype_key])
            total_self_similarity_system = self.get(
                'total_self_similarity', 'system', document_id,
                metatypes[metatype_key])
            precision = max_total_similarity / total_self_similarity_system if total_self_similarity_system else 0
            recall = max_total_similarity / total_self_similarity_gold
            f1 = 2 * precision * recall / (
                precision + recall) if precision + recall else 0
            score = CoreferenceMetricScore(self.logger, self.get('runid'),
                                           document_id, language,
                                           metatype_key, precision, recall,
                                           f1)
            # Accumulate per-language and overall ('ALL') mean-F1 buckets.
            for language_key in ['ALL', language]:
                key = '{language}:{metatype}'.format(language=language_key,
                                                     metatype=metatype_key)
                mean_f1s[key] = mean_f1s.get(key, 0) + f1
                counts[key] = counts.get(key, 0) + 1
            scores.append(score)
    scores_printer = ScorePrinter(self.logger, self.printing_specs, self.separator)
    for score in multisort(scores, (('document_id', False),
                                    ('metatype_sortkey', False))):
        scores_printer.add(score)
    # One summary row per language:metatype bucket, in self.order order.
    for key in sorted(mean_f1s, key=self.order):
        mean_f1 = mean_f1s[key] / counts[key] if counts[key] else 0
        language, metatype = key.split(':')
        mean_score = CoreferenceMetricScore(self.logger, self.get('runid'),
                                            'Summary', language, metatype,
                                            '', '', mean_f1, summary=True)
        scores_printer.add(mean_score)
    self.scores = scores_printer
def score_responses(self):
    """Score argument (type-role-filler) responses with language-keyed mean F1 summaries.

    For each core document, gold and system TRFs are aligned and scored once
    per metatype group; (document, metatype) rows with no TRFs on either side
    are skipped. After sorting, a 'Summary' row per '<language>:<metatype>'
    key (including 'ALL' buckets) carries the mean F1. The ScorePrinter is
    stored on self.scores.
    """
    # Metatype filter groups: 'ALL' scores Events and Relations together.
    metatypes = {
        'ALL': ['Event', 'Relation'],
        'Event': ['Event'],
        'Relation': ['Relation']
    }
    scores = []
    mean_f1s = {}
    counts = {}
    for document_id in self.get('core_documents'):
        language = self.get('gold_responses').get('document_mappings').get(
            'documents').get(document_id).get('language')
        gold_trfs = self.get('document_type_role_fillers', 'gold', document_id)
        system_trfs = self.get('document_type_role_fillers', 'system', document_id)
        # Align system TRFs against gold TRFs before scoring.
        self.align_trfs(document_id, gold_trfs, system_trfs)
        for metatype_key in metatypes:
            num_gold_trf, num_system_trf, precision, recall, f1 = self.get(
                'score', gold_trfs, system_trfs, metatypes[metatype_key])
            # Skip metatype groups with nothing to score in this document.
            if num_gold_trf + num_system_trf == 0:
                continue
            # Accumulate per-language and overall ('ALL') mean-F1 buckets.
            for language_key in ['ALL', language]:
                aggregate_key = '{language}:{metatype}'.format(
                    language=language_key, metatype=metatype_key)
                mean_f1s[aggregate_key] = mean_f1s.get(aggregate_key, 0) + f1
                counts[aggregate_key] = counts.get(aggregate_key, 0) + 1
            score = ArgumentMetricScore(self.logger, self.get('runid'),
                                        document_id, language, metatype_key,
                                        precision, recall, f1)
            scores.append(score)
    scores_printer = ScorePrinter(self.logger, self.printing_specs, self.separator)
    for score in multisort(scores, (('document_id', False),
                                    ('metatype_sortkey', False))):
        scores_printer.add(score)
    # One summary row per language:metatype bucket, in self.order order.
    for key in sorted(mean_f1s, key=self.order):
        mean_f1 = mean_f1s[key] / counts[key] if counts[key] else 0
        language, metatype = key.split(':')
        mean_score = ArgumentMetricScore(self.logger, self.get('runid'),
                                         'Summary', language, metatype,
                                         '', '', mean_f1, summary=True)
        scores_printer.add(mean_score)
    self.scores = scores_printer
def score_responses(self):
    """Score frame (role-filler) responses per cluster pair and report a mean F1 summary.

    For every gold cluster aligned to a system cluster, compares gold vs.
    system role:filler slot sets (system fillers are first mapped to their
    aligned gold clusters) via precision/recall/F1. Unaligned gold clusters
    and unaligned system clusters score 0. A final 'Summary' row carries the
    mean F1 over all scored rows. The ScorePrinter is stored on self.scores.
    """
    scores = ScorePrinter(self.logger, self.printing_specs, self.separator)
    mean_f1 = 0
    count = 0
    for document_id in self.get('core_documents'):
        # add scores corresponding to all gold clusters
        document_gold_to_system = self.get('cluster_alignment').get(
            'gold_to_system').get(document_id)
        document_system_to_gold = self.get('cluster_alignment').get(
            'system_to_gold').get(document_id)
        for gold_cluster_id in document_gold_to_system if document_gold_to_system else []:
            system_cluster_id = document_gold_to_system.get(
                gold_cluster_id).get('aligned_to')
            aligned_similarity = document_gold_to_system.get(
                gold_cluster_id).get('aligned_similarity')
            # An alignment with zero similarity should never occur.
            if system_cluster_id and system_cluster_id != 'None' and aligned_similarity == 0:
                self.record_event('DEFAULT_CRITICAL_ERROR', 'aligned_similarity=0')
            precision, recall, f1 = [0, 0, 0]
            if system_cluster_id and system_cluster_id != 'None':
                gold_cluster = self.get('gold_responses').get(
                    'document_clusters').get(document_id).get(gold_cluster_id)
                system_cluster = self.get('system_responses').get(
                    'document_clusters').get(document_id).get(system_cluster_id)
                # Only Event/Relation clusters carry frames; skip others.
                skip_flag = False
                for cluster in [gold_cluster, system_cluster]:
                    if cluster.get('metatype') not in ['Event', 'Relation']:
                        skip_flag = True
                if skip_flag:
                    continue
                gold_frame = self.get('gold_responses').get(
                    'document_frames').get(document_id).get(gold_cluster_id)
                gold_slot_fillers = {}
                if gold_frame is None:
                    self.record_event('MISSING_GOLD_FRAME',
                                      gold_cluster.get('metatype'),
                                      gold_cluster_id, document_id,
                                      self.get('code_location'))
                    continue
                # Gold slot set: one 'role:filler_cluster' key per filler.
                for role_name in gold_frame.get('role_fillers'):
                    for gold_filler_cluster_id in gold_frame.get(
                            'role_fillers').get(role_name):
                        gold_slot_fillers['{}:{}'.format(
                            role_name, gold_filler_cluster_id)] = 1
                system_frame = self.get('system_responses').get(
                    'document_frames').get(document_id).get(system_cluster_id)
                system_slot_fillers = {}
                # System slot set: fillers are translated to their aligned
                # gold clusters when aligned, else kept as-is.
                for role_name in system_frame.get('role_fillers'):
                    for system_filler_cluster_id in system_frame.get(
                            'role_fillers').get(role_name):
                        aligned_gold_filler_cluster_id = document_system_to_gold.get(
                            system_filler_cluster_id).get('aligned_to')
                        aligned_gold_filler_cluster_id_similarity = document_system_to_gold.get(
                            system_filler_cluster_id).get('aligned_similarity')
                        if aligned_gold_filler_cluster_id and aligned_gold_filler_cluster_id != 'None':
                            if aligned_gold_filler_cluster_id_similarity == 0:
                                self.record_event('DEFAULT_CRITICAL_ERROR',
                                                  'aligned_similarity=0')
                            system_slot_fillers['{}:{}'.format(
                                role_name, aligned_gold_filler_cluster_id)] = 1
                        else:
                            system_slot_fillers['{}:{}'.format(
                                role_name, system_filler_cluster_id)] = 1
                if len(gold_slot_fillers) and len(system_slot_fillers):
                    precision, recall, f1 = get_precision_recall_and_f1(
                        set(gold_slot_fillers.keys()),
                        set(system_slot_fillers.keys()))
            mean_f1 += f1
            count += 1
            score = FrameMetricScore(self.logger, self.get('runid'),
                                     document_id, gold_cluster_id,
                                     system_cluster_id, precision, recall,
                                     f1)
            scores.add(score)
        # add scores corresponding to unaligned system clusters
        for system_cluster_id in document_system_to_gold if document_system_to_gold else []:
            gold_cluster_id = document_system_to_gold.get(
                system_cluster_id).get('aligned_to')
            aligned_similarity = document_system_to_gold.get(
                system_cluster_id).get('aligned_similarity')
            if gold_cluster_id and gold_cluster_id != 'None' and aligned_similarity == 0:
                self.record_event('DEFAULT_CRITICAL_ERROR', 'aligned_similarity=0')
            # Aligned system clusters were already scored in the gold loop.
            if gold_cluster_id and gold_cluster_id != 'None':
                continue
            precision, recall, f1 = [0, 0, 0]
            count += 1
            score = FrameMetricScore(self.logger, self.get('runid'),
                                     document_id, gold_cluster_id,
                                     system_cluster_id, precision, recall,
                                     f1)
            scores.add(score)
    mean_f1 = mean_f1 / count if count else 0
    mean_score = FrameMetricScore(self.logger, self.get('runid'), 'Summary',
                                  '', '', '', '', mean_f1, summary=True)
    scores.add(mean_score)
    self.scores = scores
def score_responses(self):
    """Score frame (role-filler) responses per gold/system cluster pair.

    For every gold Event/Relation cluster, compares its gold role:filler slot
    set against the aligned system cluster's slot set (system fillers are
    first mapped to their aligned gold clusters); unaligned pairs score 0.
    Unaligned system Event/Relation clusters also get 0-score rows. Rows are
    sorted, aggregates appended, and the ScorePrinter stored on self.scores.
    """
    scores = []
    for document_id in self.get('core_documents'):
        # add scores corresponding to all gold clusters
        document = self.get('gold_responses').get('document_mappings').get(
            'documents').get(document_id)
        language = document.get('language')
        document_gold_to_system = self.get('cluster_alignment').get(
            'gold_to_system').get(document_id)
        document_system_to_gold = self.get('cluster_alignment').get(
            'system_to_gold').get(document_id)
        for gold_cluster_id in document_gold_to_system if document_gold_to_system else []:
            system_cluster_id = document_gold_to_system.get(
                gold_cluster_id).get('aligned_to')
            aligned_similarity = document_gold_to_system.get(
                gold_cluster_id).get('aligned_similarity')
            precision, recall, f1 = [0, 0, 0]
            # 'None' is the sentinel string for "no cluster" in the alignment.
            if gold_cluster_id == 'None':
                continue
            gold_cluster = self.get('cluster', 'gold', document_id,
                                    gold_cluster_id)
            metatype = gold_cluster.get('metatype')
            # Only Events and Relations carry frames.
            if metatype not in ['Event', 'Relation']:
                continue
            if system_cluster_id != 'None':
                # An alignment with zero similarity should never occur.
                if aligned_similarity == 0:
                    self.record_event('DEFAULT_CRITICAL_ERROR',
                                      'aligned_similarity=0')
                system_cluster = self.get('cluster', 'system', document_id,
                                          system_cluster_id)
                # Aligned clusters are expected to share a metatype.
                if system_cluster.get('metatype') != metatype:
                    self.record_event(
                        'UNEXPECTED_ALIGNED_CLUSTER_METATYPE',
                        system_cluster.get('metatype'), system_cluster_id,
                        metatype, gold_cluster_id)
            gold_frame = self.get('frame', 'gold', document_id,
                                  gold_cluster_id)
            gold_slot_fillers = {}
            if gold_frame is None or len(
                    gold_frame.get('role_fillers')) == 0:
                # A Relation without a gold frame is unexpected; Events may
                # legitimately lack one.
                if gold_cluster.get('metatype') == 'Relation':
                    self.record_event('MISSING_GOLD_FRAME',
                                      gold_cluster.get('metatype'),
                                      gold_cluster_id, document_id,
                                      self.get('code_location'))
                continue
            # Gold slot set: one 'role:filler_cluster' key per filler.
            for role_name in gold_frame.get('role_fillers'):
                for gold_filler_cluster_id in gold_frame.get(
                        'role_fillers').get(role_name):
                    gold_slot_fillers['{}:{}'.format(
                        role_name, gold_filler_cluster_id)] = 1
            system_frame = self.get('frame', 'system', document_id,
                                    system_cluster_id)
            if system_frame:
                system_slot_fillers = {}
                # System slot set: fillers are translated to their aligned
                # gold clusters when aligned, else kept as-is.
                for role_name in system_frame.get('role_fillers'):
                    for system_filler_cluster_id in system_frame.get(
                            'role_fillers').get(role_name):
                        aligned_gold_filler_cluster_id = document_system_to_gold.get(
                            system_filler_cluster_id).get('aligned_to')
                        aligned_gold_filler_cluster_id_similarity = document_system_to_gold.get(
                            system_filler_cluster_id).get(
                                'aligned_similarity')
                        if aligned_gold_filler_cluster_id != 'None':
                            if aligned_gold_filler_cluster_id_similarity == 0:
                                self.record_event(
                                    'DEFAULT_CRITICAL_ERROR',
                                    'aligned_similarity=0')
                            system_slot_fillers['{}:{}'.format(
                                role_name,
                                aligned_gold_filler_cluster_id)] = 1
                        else:
                            system_slot_fillers['{}:{}'.format(
                                role_name, system_filler_cluster_id)] = 1
                if len(gold_slot_fillers) and len(system_slot_fillers):
                    precision, recall, f1 = get_precision_recall_and_f1(
                        set(gold_slot_fillers.keys()),
                        set(system_slot_fillers.keys()))
            score = FrameMetricScore(logger=self.logger,
                                     run_id=self.get('run_id'),
                                     document_id=document_id,
                                     language=language,
                                     metatype=metatype,
                                     gold_cluster_id=gold_cluster_id,
                                     system_cluster_id=system_cluster_id,
                                     precision=precision,
                                     recall=recall,
                                     f1=f1)
            scores.append(score)
        # add scores corresponding to unaligned system clusters
        precision, recall, f1 = [0, 0, 0]
        for system_cluster_id in document_system_to_gold if document_system_to_gold else []:
            gold_cluster_id = document_system_to_gold.get(
                system_cluster_id).get('aligned_to')
            aligned_similarity = document_system_to_gold.get(
                system_cluster_id).get('aligned_similarity')
            if system_cluster_id != 'None':
                if gold_cluster_id == 'None':
                    metatype = self.get('cluster', 'system', document_id,
                                        system_cluster_id).get('metatype')
                    if metatype not in ['Event', 'Relation']:
                        continue
                    score = FrameMetricScore(
                        logger=self.logger,
                        run_id=self.get('run_id'),
                        document_id=document_id,
                        language=language,
                        metatype=metatype,
                        gold_cluster_id=gold_cluster_id,
                        system_cluster_id=system_cluster_id,
                        precision=precision,
                        recall=recall,
                        f1=f1)
                    scores.append(score)
                elif aligned_similarity == 0:
                    self.record_event('DEFAULT_CRITICAL_ERROR',
                                      'aligned_similarity=0')
    scores_printer = ScorePrinter(self.logger, self.printing_specs)
    for score in multisort(scores, (('document_id', False),
                                    ('metatype', False),
                                    ('gold_cluster_id', False),
                                    ('system_cluster_id', False))):
        scores_printer.add(score)
    # Append aggregate/summary rows for this metric.
    self.aggregate_scores(scores_printer, FrameMetricScore)
    self.scores = scores_printer
def score_responses(self):
    """Score cluster type responses per cluster pair and report a mean F1 summary.

    For every gold cluster aligned to a system cluster, compares the two
    clusters' expanded type sets via precision/recall/F1; unaligned gold
    clusters and unaligned system clusters score 0. A final 'Summary' row
    carries the mean F1 over all scored rows. The ScorePrinter is stored on
    self.scores.

    Fix: the mean-F1 division is now guarded (`if count else 0`), matching
    the sibling frame-metric scorer — previously an input with no scorable
    clusters raised ZeroDivisionError.
    """
    scores = ScorePrinter(self.logger, self.printing_specs, self.separator)
    mean_f1 = 0
    count = 0
    for document_id in self.get('core_documents'):
        # add scores corresponding to all gold clusters
        document_gold_to_system = self.get('cluster_alignment').get(
            'gold_to_system').get(document_id)
        for gold_cluster_id in document_gold_to_system if document_gold_to_system else []:
            system_cluster_id = document_gold_to_system.get(
                gold_cluster_id).get('aligned_to')
            aligned_similarity = document_gold_to_system.get(
                gold_cluster_id).get('aligned_similarity')
            # An alignment with zero similarity should never occur.
            if system_cluster_id and aligned_similarity == 0:
                self.record_event('DEFAULT_CRITICAL_ERROR',
                                  'aligned_similarity=0')
            precision, recall, f1 = [0, 0, 0]
            if system_cluster_id:
                gold_cluster_types = set(
                    self.get('gold_responses').get(
                        'document_clusters').get(document_id).get(
                            gold_cluster_id).get('all_expanded_types'))
                system_cluster_types = set()
                if document_id in self.get('system_responses').get(
                        'document_clusters'):
                    system_cluster_types = set(
                        self.get('system_responses').get(
                            'document_clusters').get(document_id).get(
                                system_cluster_id).get('all_expanded_types'))
                precision, recall, f1 = get_precision_recall_and_f1(
                    gold_cluster_types, system_cluster_types)
            mean_f1 += f1
            count += 1
            score = TypeMetricScore(self.logger, self.get('runid'),
                                    document_id, gold_cluster_id,
                                    system_cluster_id, precision, recall,
                                    f1)
            scores.add(score)
        # add scores unaligned system clusters
        document_system_to_gold = self.get('cluster_alignment').get(
            'system_to_gold').get(document_id)
        for system_cluster_id in document_system_to_gold if document_system_to_gold else []:
            gold_cluster_id = document_system_to_gold.get(
                system_cluster_id).get('aligned_to')
            aligned_similarity = document_system_to_gold.get(
                system_cluster_id).get('aligned_similarity')
            if gold_cluster_id and aligned_similarity == 0:
                self.record_event('DEFAULT_CRITICAL_ERROR',
                                  'aligned_similarity=0')
            # Aligned system clusters were already scored in the gold loop.
            if gold_cluster_id:
                continue
            precision, recall, f1 = [0, 0, 0]
            count += 1
            score = TypeMetricScore(self.logger, self.get('runid'),
                                    document_id, gold_cluster_id,
                                    system_cluster_id, precision, recall,
                                    f1)
            scores.add(score)
    # Guard against zero scored rows (was an unguarded division).
    mean_f1 = mean_f1 / count if count else 0
    mean_score = TypeMetricScore(self.logger, self.get('runid'), 'Summary',
                                 '', '', '', '', mean_f1, summary=True)
    scores.add(mean_score)
    self.scores = scores
def score_responses(self):
    """Score temporal (date) responses with language/metatype mean-similarity summaries.

    For each gold Event/Relation cluster carrying a temporal constraint,
    computes a similarity against the aligned system cluster's dates (0 when
    unaligned), accumulating means per '<language>:<metatype>' bucket
    (including 'ALL' buckets). After sorting, one 'Summary' row per bucket is
    appended. The ScorePrinter is stored on self.scores.
    """
    scores = []
    mean_similarities = {}
    counts = {}
    for document_id in self.get('core_documents'):
        # add scores corresponding to all gold clusters
        document = self.get('gold_responses').get('document_mappings').get(
            'documents').get(document_id)
        language = document.get('language')
        document_gold_to_system = self.get('cluster_alignment').get(
            'gold_to_system').get(document_id)
        for gold_cluster_id in document_gold_to_system if document_gold_to_system else []:
            system_cluster_id = document_gold_to_system.get(
                gold_cluster_id).get('aligned_to')
            aligned_similarity = document_gold_to_system.get(
                gold_cluster_id).get('aligned_similarity')
            similarity = 0
            # 'None' is the sentinel string for "no cluster" in the alignment.
            if gold_cluster_id == 'None':
                continue
            gold_cluster = self.get('cluster', 'gold', document_id,
                                    gold_cluster_id)
            metatype = gold_cluster.get('metatype')
            # Only Events and Relations carry temporal constraints.
            if metatype not in ['Event', 'Relation']:
                continue
            # Skip gold clusters with no temporal constraint annotated.
            if list(gold_cluster.get('dates').values())[0] is None:
                self.record_event('NO_TEMPORAL_CONSTRAINT', gold_cluster_id,
                                  document_id)
                continue
            if system_cluster_id != 'None':
                # An alignment with zero similarity should never occur.
                if aligned_similarity == 0:
                    self.record_event('DEFAULT_CRITICAL_ERROR',
                                      'aligned_similarity=0')
                system_cluster = self.get('cluster', 'system', document_id,
                                          system_cluster_id)
                # Aligned clusters are expected to share a metatype.
                if system_cluster.get('metatype') != metatype:
                    self.record_event(
                        'UNEXPECTED_ALIGNED_CLUSTER_METATYPE',
                        system_cluster.get('metatype'), system_cluster_id,
                        metatype, gold_cluster_id)
                # Gold clusters are expected to have exactly one date entry;
                # only the first is scored.
                if len(gold_cluster.get('dates').keys()) > 1:
                    self.record_event('UNEXPECTED_NUM_DATES',
                                      gold_cluster_id, document_id)
                similarity = self.get(
                    'temporal_similarity',
                    list(gold_cluster.get('dates').values())[0],
                    list(system_cluster.get('dates').values()))
            # Accumulate per-bucket mean similarities (incl. 'ALL' buckets).
            for metatype_key in ['ALL', metatype]:
                for language_key in ['ALL', language]:
                    key = '{language}:{metatype}'.format(
                        metatype=metatype_key, language=language_key)
                    mean_similarities[key] = mean_similarities.get(
                        key, 0) + similarity
                    counts[key] = counts.get(key, 0) + 1
            score = TemporalMetricScore(self.logger, self.get('runid'),
                                        document_id, language, metatype,
                                        gold_cluster_id, system_cluster_id,
                                        similarity)
            scores.append(score)
    scores_printer = ScorePrinter(self.logger, self.printing_specs, self.separator)
    for score in multisort(scores, (('document_id', False),
                                    ('metatype', False),
                                    ('gold_cluster_id', False),
                                    ('system_cluster_id', False))):
        scores_printer.add(score)
    # One summary row per language:metatype bucket, in self.order order.
    for key in sorted(mean_similarities, key=self.order):
        mean_similarity = mean_similarities[key] / counts[key] if counts[
            key] else 0
        language, metatype = key.split(':')
        mean_score = TemporalMetricScore(self.logger, self.get('runid'),
                                         'Summary', language, metatype, '',
                                         '', mean_similarity, summary=True)
        scores_printer.add(mean_score)
    self.scores = scores_printer
def score_responses(self):
    """Score cluster type responses (V1) per gold/system cluster pair.

    For every gold Entity/Event cluster aligned to a system cluster, augments
    both clusters' expanded type sets with the document's annotated-region
    types, then compares them via precision/recall/F1; unaligned gold
    clusters and unaligned system Entity/Event clusters score 0. Rows are
    sorted, aggregates appended, and the ScorePrinter stored on self.scores.
    """
    scores = []
    for document_id in self.get('core_documents'):
        # add scores corresponding to all gold clusters
        document = self.get('gold_responses').get('document_mappings').get('documents').get(document_id)
        language = document.get('language')
        # Log which types were annotated for this document (used below to
        # augment the type sets before scoring).
        self.record_event('ANNOTATED_TYPES_INFO', document_id,
                          ','.join(self.get('annotated_regions').get('types_annotated_for_document', document_id)))
        document_gold_to_system = self.get('cluster_alignment').get('gold_to_system').get(document_id)
        for gold_cluster_id in document_gold_to_system if document_gold_to_system else []:
            system_cluster_id = document_gold_to_system.get(gold_cluster_id).get('aligned_to')
            aligned_similarity = document_gold_to_system.get(gold_cluster_id).get('aligned_similarity')
            precision, recall, f1 = [0,0,0]
            # 'None' is the sentinel string for "no cluster" in the alignment.
            if gold_cluster_id == 'None':
                continue
            gold_cluster = self.get('gold_responses').get('document_clusters').get(document_id).get(gold_cluster_id)
            metatype = gold_cluster.get('metatype')
            # Only Entities and Events are scored by this metric.
            if metatype not in ['Entity', 'Event']:
                continue
            if system_cluster_id != 'None':
                # An alignment with zero similarity should never occur.
                if aligned_similarity == 0:
                    self.record_event('DEFAULT_CRITICAL_ERROR', 'aligned_similarity=0')
                system_cluster = self.get('cluster', 'system', document_id, system_cluster_id)
                # Aligned clusters are expected to share a metatype.
                if system_cluster.get('metatype') != metatype:
                    self.record_event('UNEXPECTED_ALIGNED_CLUSTER_METATYPE',
                                      system_cluster.get('metatype'),
                                      system_cluster_id, metatype,
                                      gold_cluster_id)
                gold_types = set(gold_cluster.get('all_expanded_types'))
                system_types = set()
                if document_id in self.get('system_responses').get('document_clusters'):
                    system_types = set(self.get('system_responses').get('document_clusters').get(document_id).get(system_cluster_id).get('all_expanded_types'))
                # Augment both sides with document-level annotated types.
                augmented_gold_types = self.get('augmented_types', document_id, gold_types)
                augmented_system_types = self.get('augmented_types', document_id, system_types)
                self.record_event('TYPE_METRIC_SCORE_INFO',
                                  self.__class__.__name__, 'TYPES_SUBMITTED',
                                  document_id, gold_cluster_id,
                                  ','.join(gold_types), system_cluster_id,
                                  ','.join(system_types))
                self.record_event('TYPE_METRIC_SCORE_INFO',
                                  self.__class__.__name__, 'TYPES_SCORED',
                                  document_id, gold_cluster_id,
                                  ','.join(augmented_gold_types),
                                  system_cluster_id,
                                  ','.join(augmented_system_types))
                precision, recall, f1 = get_precision_recall_and_f1(augmented_gold_types, augmented_system_types)
            score = TypeMetricScoreV1(logger=self.logger,
                                      run_id=self.get('run_id'),
                                      document_id=document_id,
                                      language=language,
                                      metatype=metatype,
                                      gold_cluster_id=gold_cluster_id,
                                      system_cluster_id=system_cluster_id,
                                      precision=precision,
                                      recall=recall,
                                      f1=f1)
            scores.append(score)
        # add scores unaligned system clusters
        document_system_to_gold = self.get('cluster_alignment').get('system_to_gold').get(document_id)
        for system_cluster_id in document_system_to_gold if document_system_to_gold else []:
            gold_cluster_id = document_system_to_gold.get(system_cluster_id).get('aligned_to')
            aligned_similarity = document_system_to_gold.get(system_cluster_id).get('aligned_similarity')
            if system_cluster_id != 'None':
                system_cluster = self.get('system_responses').get('document_clusters').get(document_id).get(system_cluster_id)
                metatype = system_cluster.get('metatype')
                if metatype not in ['Entity', 'Event']:
                    continue
                if gold_cluster_id == 'None':
                    # Unaligned system cluster: 0-score row.
                    precision, recall, f1 = [0,0,0]
                    score = TypeMetricScoreV1(logger=self.logger,
                                              run_id=self.get('run_id'),
                                              document_id=document_id,
                                              language=language,
                                              metatype=metatype,
                                              gold_cluster_id=gold_cluster_id,
                                              system_cluster_id=system_cluster_id,
                                              precision=precision,
                                              recall=recall,
                                              f1=f1)
                    scores.append(score)
                elif aligned_similarity == 0:
                    self.record_event('DEFAULT_CRITICAL_ERROR', 'aligned_similarity=0')
    scores_printer = ScorePrinter(self.logger, self.printing_specs)
    for score in multisort(scores, (('document_id', False),
                                    ('metatype', False),
                                    ('gold_cluster_id', False),
                                    ('system_cluster_id', False))):
        scores_printer.add(score)
    # Append aggregate/summary rows for this metric.
    self.aggregate_scores(scores_printer, TypeMetricScoreV1)
    self.scores = scores_printer
def score_responses(self):
    """Score frame (role-filler) responses with language/metatype mean-F1 summaries.

    For every gold Event/Relation cluster, compares gold vs. system
    role:filler slot sets (system fillers mapped to aligned gold clusters)
    via precision/recall/F1, accumulating means per '<language>:<metatype>'
    bucket (including 'ALL' buckets). Unaligned system Event/Relation
    clusters contribute 0-score rows. After sorting, one 'Summary' row per
    bucket is appended. The ScorePrinter is stored on self.scores.
    """
    scores = []
    mean_f1s = {}
    counts = {}
    for document_id in self.get('core_documents'):
        # add scores corresponding to all gold clusters
        document = self.get('gold_responses').get('document_mappings').get(
            'documents').get(document_id)
        language = document.get('language')
        document_gold_to_system = self.get('cluster_alignment').get(
            'gold_to_system').get(document_id)
        document_system_to_gold = self.get('cluster_alignment').get(
            'system_to_gold').get(document_id)
        for gold_cluster_id in document_gold_to_system if document_gold_to_system else []:
            system_cluster_id = document_gold_to_system.get(
                gold_cluster_id).get('aligned_to')
            aligned_similarity = document_gold_to_system.get(
                gold_cluster_id).get('aligned_similarity')
            precision, recall, f1 = [0, 0, 0]
            # 'None' is the sentinel string for "no cluster" in the alignment.
            if gold_cluster_id == 'None':
                continue
            gold_cluster = self.get('cluster', 'gold', document_id,
                                    gold_cluster_id)
            metatype = gold_cluster.get('metatype')
            # Only Events and Relations carry frames.
            if metatype not in ['Event', 'Relation']:
                continue
            if system_cluster_id != 'None':
                # An alignment with zero similarity should never occur.
                if aligned_similarity == 0:
                    self.record_event('DEFAULT_CRITICAL_ERROR',
                                      'aligned_similarity=0')
                system_cluster = self.get('cluster', 'system', document_id,
                                          system_cluster_id)
                # Aligned clusters are expected to share a metatype.
                if system_cluster.get('metatype') != metatype:
                    self.record_event(
                        'UNEXPECTED_ALIGNED_CLUSTER_METATYPE',
                        system_cluster.get('metatype'), system_cluster_id,
                        metatype, gold_cluster_id)
            gold_frame = self.get('frame', 'gold', document_id,
                                  gold_cluster_id)
            gold_slot_fillers = {}
            if gold_frame is None or len(
                    gold_frame.get('role_fillers')) == 0:
                # A Relation without a gold frame is unexpected; Events may
                # legitimately lack one.
                if gold_cluster.get('metatype') == 'Relation':
                    self.record_event('MISSING_GOLD_FRAME',
                                      gold_cluster.get('metatype'),
                                      gold_cluster_id, document_id,
                                      self.get('code_location'))
                continue
            # Gold slot set: one 'role:filler_cluster' key per filler.
            for role_name in gold_frame.get('role_fillers'):
                for gold_filler_cluster_id in gold_frame.get(
                        'role_fillers').get(role_name):
                    gold_slot_fillers['{}:{}'.format(
                        role_name, gold_filler_cluster_id)] = 1
            system_frame = self.get('frame', 'system', document_id,
                                    system_cluster_id)
            if system_frame:
                system_slot_fillers = {}
                # System slot set: fillers are translated to their aligned
                # gold clusters when aligned, else kept as-is.
                for role_name in system_frame.get('role_fillers'):
                    for system_filler_cluster_id in system_frame.get(
                            'role_fillers').get(role_name):
                        aligned_gold_filler_cluster_id = document_system_to_gold.get(
                            system_filler_cluster_id).get('aligned_to')
                        aligned_gold_filler_cluster_id_similarity = document_system_to_gold.get(
                            system_filler_cluster_id).get(
                                'aligned_similarity')
                        if aligned_gold_filler_cluster_id != 'None':
                            if aligned_gold_filler_cluster_id_similarity == 0:
                                self.record_event(
                                    'DEFAULT_CRITICAL_ERROR',
                                    'aligned_similarity=0')
                            system_slot_fillers['{}:{}'.format(
                                role_name,
                                aligned_gold_filler_cluster_id)] = 1
                        else:
                            system_slot_fillers['{}:{}'.format(
                                role_name, system_filler_cluster_id)] = 1
                if len(gold_slot_fillers) and len(system_slot_fillers):
                    precision, recall, f1 = get_precision_recall_and_f1(
                        set(gold_slot_fillers.keys()),
                        set(system_slot_fillers.keys()))
            # Accumulate per-bucket mean F1 (incl. 'ALL' buckets).
            for metatype_key in ['ALL', metatype]:
                for language_key in ['ALL', language]:
                    key = '{language}:{metatype}'.format(
                        metatype=metatype_key, language=language_key)
                    mean_f1s[key] = mean_f1s.get(key, 0) + f1
                    counts[key] = counts.get(key, 0) + 1
            score = FrameMetricScore(self.logger, self.get('runid'),
                                     document_id, language, metatype,
                                     gold_cluster_id, system_cluster_id,
                                     precision, recall, f1)
            scores.append(score)
        # add scores corresponding to unaligned system clusters
        precision, recall, f1 = [0, 0, 0]
        for system_cluster_id in document_system_to_gold if document_system_to_gold else []:
            gold_cluster_id = document_system_to_gold.get(
                system_cluster_id).get('aligned_to')
            aligned_similarity = document_system_to_gold.get(
                system_cluster_id).get('aligned_similarity')
            if system_cluster_id != 'None':
                if gold_cluster_id == 'None':
                    metatype = self.get('cluster', 'system', document_id,
                                        system_cluster_id).get('metatype')
                    if metatype not in ['Event', 'Relation']:
                        continue
                    # Unaligned system cluster: contributes f1=0 to buckets.
                    for metatype_key in ['ALL', metatype]:
                        for language_key in ['ALL', language]:
                            key = '{language}:{metatype}'.format(
                                metatype=metatype_key,
                                language=language_key)
                            mean_f1s[key] = mean_f1s.get(key, 0) + f1
                            counts[key] = counts.get(key, 0) + 1
                    score = FrameMetricScore(self.logger, self.get('runid'),
                                             document_id, language, metatype,
                                             gold_cluster_id,
                                             system_cluster_id, precision,
                                             recall, f1)
                    scores.append(score)
                elif aligned_similarity == 0:
                    self.record_event('DEFAULT_CRITICAL_ERROR',
                                      'aligned_similarity=0')
    scores_printer = ScorePrinter(self.logger, self.printing_specs, self.separator)
    for score in multisort(scores, (('document_id', False),
                                    ('metatype', False),
                                    ('gold_cluster_id', False),
                                    ('system_cluster_id', False))):
        scores_printer.add(score)
    # One summary row per language:metatype bucket, in self.order order.
    for key in sorted(mean_f1s, key=self.order):
        mean_f1 = mean_f1s[key] / counts[key] if counts[key] else 0
        language, metatype = key.split(':')
        mean_score = FrameMetricScore(self.logger, self.get('runid'),
                                      'Summary', language, metatype, '', '',
                                      '', '', mean_f1, summary=True)
        scores_printer.add(mean_score)
    self.scores = scores_printer
def score_responses(self):
    """Score system type assertions against gold, per document and cluster.

    For every core document, each gold cluster of metatype Entity or Event is
    scored against the system cluster it was aligned to (precision/recall/F1
    over augmented expanded-type sets); unaligned system clusters receive a
    zero score. Per-(language, metatype) mean F1s are accumulated, including
    'ALL' roll-ups, and appended as summary rows. The result is stored on
    ``self.scores`` as a populated ScorePrinter.
    """
    scores = []
    # Accumulators keyed by '{language}:{metatype}' (including 'ALL' buckets):
    # running F1 sums and counts used to compute the summary means at the end.
    mean_f1s = {}
    counts = {}
    for document_id in self.get('core_documents'):
        # add scores corresponding to all gold clusters
        document = self.get('gold_responses').get('document_mappings').get(
            'documents').get(document_id)
        language = document.get('language')
        self.record_event(
            'ANNOTATED_TYPES_INFO', document_id, ','.join(
                self.get('annotated_regions').get(
                    'types_annotated_for_document', document_id)))
        document_gold_to_system = self.get('cluster_alignment').get(
            'gold_to_system').get(document_id)
        # Alignment may be absent for a document; iterate nothing in that case.
        for gold_cluster_id in document_gold_to_system if document_gold_to_system else []:
            system_cluster_id = document_gold_to_system.get(
                gold_cluster_id).get('aligned_to')
            aligned_similarity = document_gold_to_system.get(
                gold_cluster_id).get('aligned_similarity')
            # Defaults stand when the gold cluster has no aligned system cluster.
            precision, recall, f1 = [0, 0, 0]
            if gold_cluster_id == 'None':
                continue
            gold_cluster = self.get('gold_responses').get(
                'document_clusters').get(document_id).get(gold_cluster_id)
            metatype = gold_cluster.get('metatype')
            # Only Entity and Event clusters are scored by this metric.
            if metatype not in ['Entity', 'Event']:
                continue
            if system_cluster_id != 'None':
                # An alignment with zero similarity should be impossible.
                if aligned_similarity == 0:
                    self.record_event('DEFAULT_CRITICAL_ERROR',
                                      'aligned_similarity=0')
                system_cluster = self.get('cluster', 'system', document_id,
                                          system_cluster_id)
                # Metatype mismatch across an alignment is logged, not fatal.
                if system_cluster.get('metatype') != metatype:
                    self.record_event(
                        'UNEXPECTED_ALIGNED_CLUSTER_METATYPE',
                        system_cluster.get('metatype'), system_cluster_id,
                        metatype, gold_cluster_id)
                gold_types = set(gold_cluster.get('all_expanded_types'))
                # System types default to empty if the document is missing
                # from the system responses entirely.
                system_types = set()
                if document_id in self.get('system_responses').get(
                        'document_clusters'):
                    system_types = set(
                        self.get('system_responses').
                        get('document_clusters').get(document_id).get(
                            system_cluster_id).get('all_expanded_types'))
                augmented_gold_types = self.get('augmented_types',
                                                document_id, gold_types)
                augmented_system_types = self.get('augmented_types',
                                                  document_id, system_types)
                # NOTE(review): event code says TEMPORAL_METRIC_SCORE_INFO
                # inside what appears to be the type-metric scorer — looks
                # like a copy-paste leftover; confirm against the event
                # specification table before renaming.
                self.record_event('TEMPORAL_METRIC_SCORE_INFO',
                                  'TYPES_SUBMITTED', document_id,
                                  gold_cluster_id, ','.join(gold_types),
                                  system_cluster_id, ','.join(system_types))
                self.record_event('TEMPORAL_METRIC_SCORE_INFO',
                                  'TYPES_SCORED', document_id,
                                  gold_cluster_id,
                                  ','.join(augmented_gold_types),
                                  system_cluster_id,
                                  ','.join(augmented_system_types))
                # Score the augmented (not raw) type sets.
                precision, recall, f1 = get_precision_recall_and_f1(
                    augmented_gold_types, augmented_system_types)
            # Fold this F1 into every applicable summary bucket, including
            # the 'ALL' roll-ups for both language and metatype.
            for metatype_key in ['ALL', metatype]:
                for language_key in ['ALL', language]:
                    key = '{language}:{metatype}'.format(
                        metatype=metatype_key, language=language_key)
                    mean_f1s[key] = mean_f1s.get(key, 0) + f1
                    counts[key] = counts.get(key, 0) + 1
            score = TypeMetricScore(self.logger, self.get('runid'),
                                    document_id, language, metatype,
                                    gold_cluster_id, system_cluster_id,
                                    precision, recall, f1)
            scores.append(score)
        # add scores unaligned system clusters
        document_system_to_gold = self.get('cluster_alignment').get(
            'system_to_gold').get(document_id)
        for system_cluster_id in document_system_to_gold if document_system_to_gold else []:
            gold_cluster_id = document_system_to_gold.get(
                system_cluster_id).get('aligned_to')
            aligned_similarity = document_system_to_gold.get(
                system_cluster_id).get('aligned_similarity')
            if system_cluster_id != 'None':
                system_cluster = self.get('system_responses').get(
                    'document_clusters').get(document_id).get(
                        system_cluster_id)
                metatype = system_cluster.get('metatype')
                if metatype not in ['Entity', 'Event']:
                    continue
                if gold_cluster_id == 'None':
                    # Unaligned system cluster: contributes a zero score
                    # (and zero F1 to every applicable summary bucket).
                    precision, recall, f1 = [0, 0, 0]
                    for metatype_key in ['ALL', metatype]:
                        for language_key in ['ALL', language]:
                            key = '{language}:{metatype}'.format(
                                metatype=metatype_key,
                                language=language_key)
                            mean_f1s[key] = mean_f1s.get(key, 0) + f1
                            counts[key] = counts.get(key, 0) + 1
                    score = TypeMetricScore(self.logger, self.get('runid'),
                                            document_id, language, metatype,
                                            gold_cluster_id,
                                            system_cluster_id, precision,
                                            recall, f1)
                    scores.append(score)
                elif aligned_similarity == 0:
                    self.record_event('DEFAULT_CRITICAL_ERROR',
                                      'aligned_similarity=0')
    scores_printer = ScorePrinter(self.logger, self.printing_specs,
                                  self.separator)
    # Emit per-cluster rows in a stable, human-readable order.
    for score in multisort(scores, (('document_id', False),
                                    ('metatype', False),
                                    ('gold_cluster_id', False),
                                    ('system_cluster_id', False))):
        scores_printer.add(score)
    # Append one summary row per (language, metatype) bucket with its mean F1.
    for key in sorted(mean_f1s, key=self.order):
        mean_f1 = mean_f1s[key] / counts[key] if counts[key] else 0
        language, metatype = key.split(':')
        mean_score = TypeMetricScore(self.logger, self.get('runid'),
                                     'Summary', language, metatype, '', '',
                                     '', '', mean_f1, summary=True)
        scores_printer.add(mean_score)
    self.scores = scores_printer