def __init__(self, logger, document_mappings, document_boundaries, annotated_regions, gold_mentions_filename, gold_edges_filename, system_mentions_filename, system_edges_filename):
    """
    Initialize the Clusters: record inputs, set up per-side storage for
    clusters and frames, then load the data and align gold to system.
    """
    super().__init__(logger)
    self.document_mappings = document_mappings
    self.document_boundaries = document_boundaries
    self.annotated_regions = annotated_regions
    # Record where each side's mentions and edges files live.
    self.filenames = Container(logger)
    for side, mentions_filename, edges_filename in (
            ('gold', gold_mentions_filename, gold_edges_filename),
            ('system', system_mentions_filename, system_edges_filename)):
        self.filenames.add(key=side, value={'mentions': mentions_filename,
                                            'edges': edges_filename})
    # One container of clusters and one of frames per side.
    self.clusters = {side: Container(logger) for side in ('gold', 'system')}
    self.frames = {side: Container(logger) for side in ('gold', 'system')}
    # Bidirectional cluster alignment, populated by align_clusters().
    self.alignment = {'gold_to_system': {}, 'system_to_gold': {}}
    self.load()
    self.align_clusters()
def load(self):
    """
    Read the per-document alignment tables (one '.tab' file per document)
    from the directory, filling the system_to_gold and gold_to_system maps.
    """
    logger = self.get('logger')
    directory = self.get('directory')
    # Deterministic file order so repeated runs produce identical containers.
    for filename in sorted(os.listdir(directory), key=str):
        document_id = filename.replace('.tab', '')
        path = '{}/{}'.format(directory, filename)
        for entry in FileHandler(logger, path):
            gold_cluster = entry.get('gold_cluster')
            system_cluster = entry.get('system_cluster')
            similarity = entry.get('similarity')
            # Per-document containers are created on demand in each direction.
            s2g = self.get('system_to_gold').get(document_id, default=Container(logger))
            g2s = self.get('gold_to_system').get(document_id, default=Container(logger))
            s2g.add(key=system_cluster,
                    value={'aligned_to': gold_cluster, 'aligned_similarity': similarity})
            g2s.add(key=gold_cluster,
                    value={'aligned_to': system_cluster, 'aligned_similarity': similarity})
def __init__(self, logger, similarities_directory):
    """
    Initialize the similarities store and load it from the given directory.
    """
    super().__init__(logger)
    self.directory = similarities_directory
    # One container mapping cluster keys to metatypes, plus one container
    # per side holding per-document self-similarities.
    for attribute in ('cluster_to_metatype', 'gold', 'system'):
        setattr(self, attribute, Container(logger))
    self.load()
def __init__(self, logger, document_mappings, document_boundaries, ID):
    """
    Initialize a cluster identified by ID.
    """
    super().__init__(logger)
    self.document_mappings = document_mappings
    self.document_boundaries = document_boundaries
    self.ID = ID
    self.metatype = None  # unknown until explicitly set
    # Types and mentions belonging to the cluster are collected later.
    self.types = Container(logger)
    self.mentions = Container(logger)
def __init__(self, logger, filename):
    """
    Initialize one container per top-level metatype and load the data
    from the given filename.
    """
    super().__init__(logger)
    self.logger = logger
    self.filename = filename
    # One container each for entities, relations and events.
    self.containers = {metatype: Container(logger)
                       for metatype in ('Entity', 'Relation', 'Event')}
    self.load_data()
def __init__(self, logger, ontology_type_mappings, slot_mappings, document_mappings, document_boundaries, path, runid):
    """
    Initialize the response set rooted at path for the run named runid,
    then load the responses.
    """
    super().__init__(logger)
    self.runid = runid
    self.path = path
    self.ontology_type_mappings = ontology_type_mappings
    self.slot_mappings = slot_mappings
    self.document_mappings = document_mappings
    self.document_boundaries = document_boundaries
    # Helpers applied to each entry while loading: validation, value
    # generation, and normalization.
    self.validator = Validator(logger)
    self.generator = Generator(logger)
    self.normalizer = Normalizer(logger)
    # Per-document clusters and frames, populated by load_responses().
    self.document_clusters = Container(logger)
    self.document_frames = Container(logger)
    self.load_responses()
def main(args):
    """
    The main program for generating AIF.
    """
    check_paths(args)
    logger = Logger(args.log, args.log_specifications_filename, sys.argv)
    # Document-level metadata.
    core_documents = CoreDocuments(logger, args.core_documents_filename)
    encodings = Encodings(logger, args.encodings_filename)
    document_mappings = DocumentMappings(logger, args.parent_children_filename,
                                         encodings, core_documents)
    # Boundary information, one object per modality.
    text_boundaries = TextBoundaries(logger, args.sentence_boundaries_filename)
    image_boundaries = ImageBoundaries(logger, args.image_boundaries_filename)
    video_boundaries = VideoBoundaries(logger, args.video_boundaries_filename)
    keyframe_boundaries = KeyFrameBoundaries(logger, args.keyframe_boundaries_filename)
    # Map LDC output-value type strings onto full ontology types.
    type_mappings = Container(logger)
    for entry in FileHandler(logger, args.type_mappings_filename):
        type_mappings.add(key=entry.get('full_type_ov'), value=entry.get('full_type'))
    slot_mappings = SlotMappings(logger, args.slot_mappings_filename)
    annotations = Annotations(logger, slot_mappings, document_mappings,
                              text_boundaries, image_boundaries, video_boundaries,
                              keyframe_boundaries, type_mappings, args.annotations,
                              load_video_time_offsets_flag=args.notime)
    # Generate the AIF and write it out.
    AIFGenerator(logger, annotations, args.nochannel, args.reference_kb_id).write_output(args.output)
    exit(ALLOK_EXIT_CODE)
def __init__(self, logger, document_mappings, document_boundaries, regions_filename):
    """
    Initialize the regions store and load it from regions_filename.
    """
    super().__init__(logger)
    self.filename = regions_filename
    self.document_mappings = document_mappings
    self.document_boundaries = document_boundaries
    # Regions read from the file, keyed per load()'s entry_to_key.
    self.regions = Container(logger)
    self.load()
def __init__(self, logger, ID):
    """
    Initializes this instance with its string identifier.
    """
    super().__init__(logger)
    self.ID = ID
    # Documents belonging to this object.
    self.documents = Container(logger)
def load(self):
    """
    Read per-document similarity tables ('.tab' files) from the directory.

    Only self-similarity rows are kept: same side (gold/gold or
    system/system), same cluster on both columns, and non-zero similarity.
    Each kept row also records the cluster's metatype under the key
    '<SIDE>:<cluster>'.
    """
    logger = self.get('logger')
    for filename in sorted(os.listdir(self.get('directory')), key=str):
        filename_including_path = '{}/{}'.format(self.get('directory'), filename)
        document_id = filename.replace('.tab', '')
        for entry in FileHandler(logger, filename_including_path):
            metatype = entry.get('metatype')
            system_or_gold1 = entry.get('system_or_gold1')
            system_or_gold2 = entry.get('system_or_gold2')
            cluster1 = entry.get('cluster1')
            cluster2 = entry.get('cluster2')
            similarity = entry.get('similarity')
            # Skip cross-side rows, cross-cluster rows, and zero similarity.
            if similarity == 0 or system_or_gold1 != system_or_gold2 or cluster1 != cluster2:
                continue
            # FIX: the original issued this identical add() twice in a row;
            # the redundant duplicate call has been removed.
            self.get('cluster_to_metatype').add(
                key='{}:{}'.format(system_or_gold1.upper(), cluster1),
                value=metatype)
            # Record the cluster's self-similarity for this document.
            self.get(system_or_gold1).get(document_id, default=Container(logger)).add(
                key=cluster1, value=similarity)
def __init__(self, logger, entities_ontology_filename, relations_ontology_filename, events_ontology_filename):
    """
    Initialize the ontology.
    """
    super().__init__(logger)
    self.entities_ontology_filename = entities_ontology_filename
    self.relations_ontology_filename = relations_ontology_filename
    self.events_ontology_filename = events_ontology_filename
    self.entities = Container(logger)
    self.relations = Container(logger)
    self.events = Container(logger)
    # Load each ontology section with its corresponding spec class.
    for name, spec_class, filename, container in (
            ('entities', EntitySpec, entities_ontology_filename, self.entities),
            ('relations', RelationSpec, relations_ontology_filename, self.relations),
            ('events', EventSpec, events_ontology_filename, self.events)):
        self.load(name, spec_class, filename, container)
def __init__(self, logger, task, arguments):
    """
    Initialize the scorer for the given task; every entry in arguments
    becomes an attribute of this object. Scoring runs immediately.
    """
    super().__init__(logger)
    self.task = task
    # Expose the caller-supplied arguments as attributes.
    for key, value in arguments.items():
        self.set(key, value)
    # Metrics are looked up per task from the class-level table.
    self.metrics = self.task_metrics[task]
    self.scores = Container(logger)
    self.score_responses()
def get_entry_to_spans(self, entry):
    """
    Build a Container of spans parsed from the entry's semicolon-separated
    'spans' field; each span is stored keyed by itself.
    """
    logger = self.get('logger')
    spans = Container(logger)
    for span_string in entry.get('spans').split(';'):
        span = string_to_span(logger, span_string, entry.get('where'))
        # NOTE(review): self.get('') with an empty-string key looks wrong —
        # it is unlikely to resolve to a modality value. This probably should
        # be something like self.get('modality') or a value derived from the
        # entry; confirm against the get() implementation and this method's
        # callers before relying on the 'modality' attribute of the spans.
        span.set('modality', self.get(''))
        spans.add(key=span, value=span)
    return spans
def __init__(self, logger, ontology_type_mappings, document_mappings, document_boundaries, regions_filename, strictness='strict'):
    """
    Initialize the annotated regions read from regions_filename.

    strictness selects how region checks are applied ('strict' by default).
    """
    super().__init__(logger)
    self.strictness = strictness
    self.filename = regions_filename
    self.ontology_type_mappings = ontology_type_mappings
    self.document_mappings = document_mappings
    self.document_boundaries = document_boundaries
    # Regions read from the file, populated by load().
    self.regions = Container(logger)
    self.load()
def load(self):
    """
    Populate self.regions from the file: for each entry, the spans are
    collected into an inner Container stored under the entry's key.
    """
    logger = self.get('logger')
    regions = self.get('regions')
    for entry in FileHandler(logger, self.get('filename')):
        key = self.get('entry_to_key', entry)
        # Create the per-key container the first time the key is seen.
        if key not in regions:
            regions.add(key=key, value=Container(logger))
        bucket = regions.get(key)
        for span in self.get('entry_to_spans', entry):
            bucket.add(key=span, value=span)
def __init__(self, logger, filename, encodings, core_documents=None):
    """
    Initialize the mapping between Document and DocumentElement.

    Arguments:
        logger (aida.Logger): the aida.Logger object
        filename (str): the parent-children file containing mapping between
            documents and document-elements.
        encodings (aida.Encodings)
        core_documents (aida.CoreDocuments)
    """
    self.logger = logger
    self.filename = filename
    self.fileheader = None  # set once the file has been read
    self.encodings = encodings
    self.core_documents = core_documents
    # Documents and their elements, filled in by load_data().
    self.documents = Container(logger)
    self.document_elements = Container(logger)
    self.load_data()
def get_frame(self, frame_id, entry):
    """
    Return the frame identified by frame_id for the entry's document,
    creating both the per-document container and the frame on first use.
    """
    logger = self.get('logger')
    document_id = entry.get('document_id')
    all_frames = self.get('document_frames')
    # Ensure the per-document frame container exists.
    if document_id not in all_frames:
        all_frames.add(key=document_id, value=Container(logger))
    frames_of_document = all_frames.get(document_id)
    # Ensure the frame itself exists.
    if frame_id not in frames_of_document:
        frames_of_document.add(key=frame_id,
                               value=EventOrRelationFrame(logger, frame_id,
                                                          entry.get('where')))
    return frames_of_document.get(frame_id)
def get_entity_assessments(self, the_query_id):
    """
    Collect assessments across all queries sharing the given query's entity.

    When two queries disagree on the assessment of the same key, a
    CONFLICTING_ASSESSMENTS event is recorded (first value wins).
    """
    entity_assessments = Container(self.get('logger'))
    entity_id = self.get('entity_id', the_query_id)
    queries_to_score = self.get('queries_to_score')
    for query_id in queries_to_score:
        # Only consider queries about the same entity.
        if queries_to_score.get(query_id).get('entity_id') != entity_id:
            continue
        query_assessments = self.get('assessments').get(query_id)
        for key in query_assessments:
            value = query_assessments.get(key)
            if key not in entity_assessments:
                entity_assessments.add(value, key)
            elif value.get('assessment') != entity_assessments.get(key).get('assessment'):
                self.record_event('CONFLICTING_ASSESSMENTS', key, query_id,
                                  entity_assessments.get(key).get('queryid'))
    return entity_assessments
def get_cluster(self, cluster_id, entry):
    """
    Return the cluster identified by cluster_id for the entry's document,
    creating both the per-document container and the cluster on first use.
    """
    logger = self.get('logger')
    document_id = entry.get('document_id')
    all_clusters = self.get('document_clusters')
    # Ensure the per-document cluster container exists.
    if document_id not in all_clusters:
        all_clusters.add(key=document_id, value=Container(logger))
    clusters_of_document = all_clusters.get(document_id)
    # Ensure the cluster itself exists.
    if cluster_id not in clusters_of_document:
        clusters_of_document.add(key=cluster_id,
                                 value=Cluster(logger,
                                               self.get('document_mappings'),
                                               self.get('document_boundaries'),
                                               cluster_id))
    return clusters_of_document.get(cluster_id)
def __init__(self, logger, slot_mappings, document_mappings, text_boundaries, image_boundaries, video_boundaries, keyframe_boundaries, type_mappings, annotations_dir, load_topic_ids=None, load_video_time_offsets_flag=True):
    """
    Initialize the Annotations.

    Arguments:
        logger (aida.Logger)
        slot_mappings (aida.SlotMappings)
        document_mappings (aida.DocumentMappings)
        text_boundaries (aida.TextBoundaries)
        image_boundaries (aida.ImageBoundaries)
        video_boundaries (aida.VideoBoundaries)
        keyframe_boundaries (aida.KeyFrameBoundaries)
        type_mappings (aida.Container)
        annotations_dir (str): The path to the annotations directory as
            received from LDC.
        load_topic_ids (None or list): Specify the topic IDs to load, or set
            it to None in order to load all the topics.
        load_video_time_offsets_flag (bool): Set it to True in order to load
            video time offsets, False otherwise.
    """
    super().__init__(logger)
    self.logger = logger
    self.annotations_dir = annotations_dir
    self.document_mappings = document_mappings
    self.text_boundaries = text_boundaries
    self.image_boundaries = image_boundaries
    self.video_boundaries = video_boundaries
    self.keyframe_boundaries = keyframe_boundaries
    self.type_mappings = type_mappings
    self.slot_mappings = slot_mappings
    self.load_video_time_offsets_flag = load_video_time_offsets_flag
    # None means "load everything": every topic under the data directory.
    self.load_topic_ids = (load_topic_ids if load_topic_ids is not None
                           else list(os.listdir(annotations_dir + "/data")))
    # Storage populated by load_annotations().
    self.mentions = {}
    self.nodes = {}
    self.subject_nodes = {}
    self.slots = Container(logger)
    self.load_annotations()
def get_document_type_role_fillers(self, system_or_gold, document_id):
    """
    Collect, for one document on the given side ('system' or 'gold'), the
    distinct (type, role, filler-cluster) combinations found in its frames,
    each paired with the predicate justifications that support it.
    """
    logger = self.get('logger')
    type_role_fillers = Container(logger)
    responses = self.get('{}_responses'.format(system_or_gold))
    # Documents without frames yield an empty container.
    if document_id in responses.get('document_frames'):
        for frame in responses.get('document_frames').get(
                document_id).values():
            metatype = frame.get('metatype')
            role_fillers = frame.get('role_fillers')
            for role_name in role_fillers:
                for filler_cluster_id in role_fillers.get(role_name):
                    for predicate_justification in role_fillers.get(
                            role_name).get(filler_cluster_id):
                        # The type this justification invokes for the role.
                        type_invoked = self.get('type_invoked',
                                                predicate_justification,
                                                role_name)
                        type_role_filler_string = '{type_invoked}_{role_name}:{filler_cluster_id}'.format(
                            type_invoked=type_invoked,
                            role_name=role_name,
                            filler_cluster_id=filler_cluster_id)
                        # One Object per distinct type/role/filler string;
                        # repeated combinations accumulate justifications.
                        type_role_filler = type_role_fillers.get(
                            type_role_filler_string, default=Object(logger))
                        type_role_filler.set('metatype', metatype)
                        type_role_filler.set('type', type_invoked)
                        type_role_filler.set('role_name', role_name)
                        type_role_filler.set('filler_cluster_id', filler_cluster_id)
                        # Lazily create the justification container.
                        if type_role_filler.get(
                                'predicate_justifications') is None:
                            type_role_filler.set(
                                'predicate_justifications', Container(logger))
                        type_role_filler.get('predicate_justifications'
                                             ).add(predicate_justification)
    return type_role_fillers
def __init__(self, logger, gold_responses, system_responses, cluster_alignment, cluster_self_similarities, separator=None):
    """
    Initialize the scorer over gold and system responses together with their
    cluster alignment and self-similarities, then score the responses.
    """
    super().__init__(logger)
    self.separator = separator
    self.gold_responses = gold_responses
    self.system_responses = system_responses
    self.cluster_alignment = cluster_alignment
    self.cluster_self_similarities = cluster_self_similarities
    # Scores computed by score_responses().
    self.scores = Container(logger)
    self.score_responses()
def __init__(self, logger, filename):
    """
    Initialize the slots object.

    Parameters:
        logger (logger): the logger object
        filename (str): the name of the file containing mappings between LDC
            internal slot code, and external slot name.

            The file contains tab separated values with header in first line.

            For example::

                slot_type_code  slot_type
                evt001arg01damagerdestroyer  ArtifactExistence.DamageDestroy_DamagerDestroyer
                evt002arg01damager  ArtifactExistence.DamageDestroy.Damage_Damager
    """
    super().__init__(logger)
    # Mapping is kept in both directions.
    self.mappings = {name: Container(logger)
                     for name in ('code_to_type', 'type_to_codes')}
    code_to_type = self.get('mappings').get('code_to_type')
    type_to_codes = self.get('mappings').get('type_to_codes')
    # Load the data and store the mapping in both directions.
    for entry in FileHandler(logger, filename):
        slot_type_code = entry.get('slot_type_code')
        slot_type = entry.get('slot_type')
        # Duplicate codes are reported but still overwrite the earlier value.
        if slot_type_code in code_to_type:
            logger.record_event('DUPLICATE_VALUE_IN_COLUMN', slot_type_code,
                                'slot_type_code', entry.get('where'))
        code_to_type.add(key=slot_type_code, value=slot_type)
        type_to_codes.get(slot_type, default=Container(logger)).add(
            key=slot_type_code, value=slot_type_code)
def __init__(self, logger, entry):
    """
    Initialize the specifications of a relation taken from the entry
    corresponding to a line as read from ontology.
    """
    super().__init__(logger, entry)
    self.arguments = Container(logger)
    self.annotation_id = entry.get('AnnotIndexID')
    self.type = entry.get('Type')
    self.subtype = entry.get('Subtype')
    self.subsubtype = entry.get('Sub-Subtype')
    self.type_ov = entry.get('Output Value for Type')
    self.subtype_ov = entry.get('Output Value for Subtype')
    self.subsubtype_ov = entry.get('Output Value for Sub-Subtype')
    # A relation has exactly two argument slots: arg1 and arg2.
    for arg_num in (1, 2):
        self.get('arguments').add(ArgumentSpec(logger, entry, arg_num),
                                  'arg{}'.format(arg_num))
def __init__(self, logger, ID):
    """
    Initializes the document instance, setting the logger, and optionally
    its ID.

    Arguments:
        logger (aida.Logger): the aida.Logger object
        ID (str): the string identifier of the document

    NOTE: the document contains a container to store document elements
    that comprise this document.
    """
    super().__init__(logger)
    self.logger = logger
    self.ID = ID
    # Elements (text/image/video parts) making up this document.
    self.document_elements = Container(logger)
def load_file(self, fh, schema):
    """
    Load entries from the open file handler into this container under the
    file's name, applying the schema to each entry: column renaming, value
    generation, normalization, and validation.

    Only entries whose document is a core document are retained.
    """
    logger = self.get('logger')
    filename = fh.get('filename')
    # Create the per-file container (carrying the file header) on first use.
    if not self.exists(filename):
        file_container = Container(logger)
        file_container.set('header', fh.get('header'))
        self.add(key=filename, value=file_container)
    for entry in fh:
        lineno = entry.get('lineno')
        entry.set('runid', self.get('runid'))
        entry.set('schema', schema)
        # Copy each header column's value onto the schema's column name,
        # position by position.
        for i in range(len(schema.get('columns'))):
            entry.set(
                schema.get('columns')[i],
                entry.get(entry.get('header').get('columns')[i]))
        valid = True
        # NOTE(review): 'attributes' is a name not defined in this method —
        # presumably a module-level dict of attribute specifications keyed
        # by name; confirm where it is defined.
        for attribute_name in attributes:
            attribute = attributes[attribute_name]
            # Sanity check: the spec's declared name must match its key.
            if attribute_name != attribute.get('name'):
                logger.record_event(
                    'DEFAULT_CRITICAL_ERROR',
                    'Mismatching name of attribute: {}'.format(
                        attribute_name), self.get_code_location())
            # skip if the attribute is not required for the given schema
            if not self.attribute_required(attribute, schema):
                continue
            # generate value for the attribute, if needed
            self.generate_value(attribute, entry)
            # normalize value
            normalizer_name = attribute.get('normalize')
            if normalizer_name:
                self.get('normalizer').normalize(self, normalizer_name, entry, attribute)
            # validate value; a single failed attribute marks the whole
            # entry invalid, but processing of other attributes continues
            validator_name = attribute.get('validate')
            if validator_name:
                valid_attribute = self.get('validator').validate(
                    self, validator_name, schema, entry, attribute)
                if not valid_attribute:
                    valid = False
        entry.set('valid', valid)
        # Keep the entry only if its document is a core document.
        if self.get('document_mappings').get('documents').get(
                entry.get('document_id')).get('is_core'):
            self.get(filename).add(key=str(lineno), value=entry)
def aggregate_scores(self, scores, score_class):
    """
    Group the given scores by (language, metatype) and append one summary
    aggregate score per group to the same scores container.
    """
    aggregates = {}
    for score in scores.values():
        languages = self.get('languages', score, scores)
        metatypes = self.get('metatypes', score, scores)
        for language in languages:
            for metatype in metatypes:
                group_by = language + ',' + metatype
                # Lazily create the aggregate for this (language, metatype).
                if group_by not in aggregates:
                    aggregates[group_by] = score_class(
                        self.get('logger'),
                        aggregate=True,
                        language=language,
                        metatype=metatype,
                        run_id=self.get('run_id'),
                        summary=True,
                        elements=Container(self.get('logger')))
                aggregate_scores = aggregates[group_by]
                aggregate_scores.get('elements').add(score)
    # Append summaries in self.order; NOTE(review): add() is called with the
    # score positionally and no key — presumably the container assigns a
    # default key in that case; confirm against Container.add's signature.
    for score in sorted(aggregates.values(), key=self.order):
        scores.add(score)
def load_task2_assessments(self):
    """
    Load task2 (zero-hop) assessments from the assessments directory.

    Each assessed (query, document, mention span) becomes an assessment
    entry stored under its query ID, keyed by 'docid:mention_span'.
    Correct entries whose FQEC is 'NIL' receive a generated equivalence
    class ID of the form 'NILG<num>'.
    """
    next_fqec_num = 1001
    generated_fqecs = {}
    path = '{}/data/zero-hop/*.tab'.format(self.assessments_dir)
    header = FileHeader(self.logger, "\t".join(assessments.get('task2').get('across_documents_coreference').get('columns')))
    for filename in glob.glob(path):
        for entry in FileHandler(self.logger, filename, header):
            queryid, docid, mention_span, assessment_read, fqec_read, where = map(
                lambda key: entry.get(key),
                ['queryid', 'docid', 'mention_span', 'assessment', 'fqec', 'where'])
            entity_id = self.get('queries_to_score').get(queryid).get('entity_id')
            assessment = self.normalize('assessment', assessment_read)
            query_and_document = '{}:{}'.format(queryid, docid)
            key = '{}:{}'.format(query_and_document, mention_span)
            # Report (but keep) repeated assessments of the same key.
            if self.exists(key):
                self.logger.record_event('MULTIPLE_ASSESSMENTS', key, where)
            fqec = fqec_read
            if fqec == 'NIL' and self.normalize('assessment', assessment) == 'CORRECT':
                if key not in generated_fqecs:
                    fqec = 'NILG{}'.format(next_fqec_num)
                    # BUGFIX: advance the counter so each generated FQEC is
                    # unique; previously every generated ID was 'NILG1001',
                    # which incorrectly merged distinct NIL entities into a
                    # single equivalence class.
                    next_fqec_num += 1
                    generated_fqecs[key] = fqec
                fqec = generated_fqecs[key]
            assessment_entry = Object(self.logger)
            assessment_entry.set('assessment', assessment)
            assessment_entry.set('docid', docid)
            assessment_entry.set('queryid', queryid)
            assessment_entry.set('mention_span', mention_span)
            assessment_entry.set('fqec_read', fqec_read)
            assessment_entry.set('fqec', fqec)
            assessment_entry.set('line', entry.get('line'))
            assessment_entry.set('where', where)
            # Store under the query, keyed by 'docid:mention_span'.
            if not self.exists(queryid):
                self.add(key=queryid, value=Container(self.get('logger')))
            self.get(queryid).add(key=':'.join(key.split(':')[1:]), value=assessment_entry)
            line = 'ENTITYID={} QUERYID={} DOCID={} MENTION={} ASSESSMENT={} FQEC_READ={} FQEC={}'.format(
                entity_id, queryid, docid, mention_span, assessment, fqec_read, fqec)
            self.logger.record_event('GROUND_TRUTH', line, where)
def __init__(self, logger, alignment_directory):
    """
    Initialize the cluster alignment read from alignment_directory.
    """
    super().__init__(logger)
    self.directory = alignment_directory
    # One per-document alignment container per direction.
    for direction in ('system_to_gold', 'gold_to_system'):
        setattr(self, direction, Container(logger))
    self.load()
def main(args):
    """
    Convert LDC region/type rows into a sorted, de-duplicated tab-separated
    output file, normalizing each span per its modality.
    """
    logger = Logger(args.log, args.log_specifications, sys.argv)
    # Map LDC output-value type strings onto propercased full ontology types.
    type_mappings = Container(logger)
    for entry in FileHandler(logger, args.ontology_type_mappings):
        type_mappings.add(key=entry.get('full_type_ov'), value=entry.get('full_type'))
    # Boundary lookups, one per modality.
    text_boundaries = TextBoundaries(logger, args.sentence_boundaries)
    image_boundaries = ImageBoundaries(logger, args.image_boundaries)
    video_boundaries = VideoBoundaries(logger, args.video_boundaries)
    keyframe_boundaries = KeyFrameBoundaries(logger, args.keyframe_boundaries)
    document_boundaries = {
        'text': text_boundaries,
        'image': image_boundaries,
        'keyframe': keyframe_boundaries,
        'video': video_boundaries
        }
    output = []
    for entry in FileHandler(logger, args.input):
        document_id = entry.get('root_doc_id')
        document_element_id = entry.get('doc_element_id')
        modality = entry.get('media_type')
        type = entry.get('type')
        subtype = entry.get('subtype')
        subsubtype = entry.get('subsubtype')
        full_type = '{type}.{subtype}.{subsubtype}'.format(
            type=type, subtype=subtype, subsubtype=subsubtype)
        # Drop 'unspecified' components before the ontology lookup.
        full_type_cleaned = full_type.replace('.unspecified', '')
        # NOTE(review): the second positional argument is presumably the
        # default returned when the key is absent — confirm against
        # Container.get's positional signature.
        propercased_full_type = type_mappings.get(full_type_cleaned, None)
        span_string = entry.get('span')
        keyframe_id = None
        keyframe_num = 0
        if span_string == 'ENTIRE_DOCUMENT_ELEMENT':
            # Whole-element spans expand to the element's document boundary.
            document_boundary = document_boundaries.get(modality).get(
                document_element_id)
            span_string = document_boundary.__str__()
        elif '-' in span_string:
            # Text offsets 'start-end' become '(start,0)-(end,0)'.
            start, end = span_string.split('-')
            span_string = '({start},0)-({end},0)'.format(start=start, end=end)
        elif '_' in span_string:
            # Keyframe IDs look like '<element>_<num>'; use the keyframe
            # boundary as the region.
            keyframe_id = span_string
            keyframe_num = span_string.split('_')[1]
            document_boundary = document_boundaries.get('keyframe').get(
                keyframe_id)
            span_string = document_boundary.__str__()
        else:
            # Unrecognized span formats produce an empty region.
            span_string = None
        output_object = {
            'document_id': document_id,
            'document_element_id': document_element_id,
            'keyframe_id': keyframe_id,
            'keyframe_num': int(keyframe_num),
            'modality': modality,
            'region': span_string,
            'type': propercased_full_type,
            }
        output.append(output_object)
    printed = {}
    fh = open(args.output, 'w')
    header = [
        'document_id', 'document_element_or_keyframe_id', 'modality',
        'region', 'type'
    ]
    fh.write('{}\n'.format('\t'.join(header)))
    # Emit rows in a stable multi-key order, skipping duplicate lines.
    for output_object in multisort(
            output, (('document_id', False), ('modality', False),
                     ('document_element_id', False), ('keyframe_num', False),
                     ('region', False), ('type', False))):
        line = get_line(output_object, header)
        if line not in printed:
            fh.write('{}\n'.format(line))
            printed[line] = 1
    fh.close()
    exit(ALLOK_EXIT_CODE)