def load_data(self):
    """
    Reads the file containing ontology entries, and stores each entry's full type,
    keyed by its ontology ID, in the container corresponding to its metatype.
    """
    fh = FileHandler(self.logger, self.filename)
    for entry in fh.get('entries'):
        metatype = entry.get('metatype')
        container = self.get('containers').get(metatype)
        container.add(key=entry.get('ontology_id'), value=entry.get('full_type'))
def load_data(self): """ Reads the file containing the mappings into Encodings. """ fh = FileHandler(self.logger, self.filename) for entry in fh.get('entries'): self.add(key=entry.get('encoding'), value=entry.get('modality'))
def load_data(self): """ Loads the data from the parent-children file into the DocumentMappings object. """ mappings = nested_dict() fh = FileHandler(self.logger, self.filename) self.fileheader = fh.get('header') for entry in fh: doceid = entry.get('doceid') docid = entry.get('docid') detype = entry.get('detype') delang = entry.get('lang_manual') mappings[doceid]['docids'][docid] = 1 mappings[doceid]['detype'] = detype mappings[doceid]['delang'] = delang.upper() for doceid in mappings: # TODO: next if doceid is n/a? delang = mappings[doceid]['delang'] detype = mappings[doceid]['detype'] modality = self.encodings.get(detype) for docid in mappings[doceid]['docids']: is_core = 0 if self.core_documents is not None and self.core_documents.exists(docid): is_core = 1 document = self.get('documents').get(docid, default=Document(self.logger, docid)) document.set('is_core', is_core) document_element = self.get('document_elements').get(doceid, default=DocumentElement(self.logger, doceid)) document_element.add_document(document) document_element.set('type', detype) document_element.set('modality', modality) document_element.set('language', delang) document.add_document_element(document_element)
def load_responses(self):
    def order(filename):
        filename_order_map = {
            'AIDA_P2_TA1_CM_A0001.rq.tsv': 1,
            'AIDA_P2_TA1_AM_A0001.rq.tsv': 2,
            'AIDA_P2_TA1_TM_A0001.rq.tsv': 3
            }
        if filename not in filename_order_map:
            print("Filename: '{}' not found in lookup".format(filename))
            exit()
        return filename_order_map[filename]
    logger = self.get('logger')
    for subdir in ['{}/{}'.format(self.get('path'), d) for d in os.listdir(self.get('path'))]:
        for filename in sorted(os.listdir(subdir), key=order):
            filename_including_path = '{}/{}'.format(subdir, filename)
            fh = FileHandler(logger, filename_including_path)
            schema = identify_file_schema(fh)
            if schema is None:
                logger.record_event('UNKNOWN_RESPONSE_FILE_TYPE', filename_including_path, self.get('code_location'))
            else:
                self.load_file(fh, schema)
def main(args): """ The main program for generating AIF """ check_paths(args) logger = Logger(args.log, args.log_specifications_filename, sys.argv) core_documents = CoreDocuments(logger, args.core_documents_filename) encodings = Encodings(logger, args.encodings_filename) document_mappings = DocumentMappings(logger, args.parent_children_filename, encodings, core_documents) text_boundaries = TextBoundaries(logger, args.sentence_boundaries_filename) image_boundaries = ImageBoundaries(logger, args.image_boundaries_filename) video_boundaries = VideoBoundaries(logger, args.video_boundaries_filename) keyframe_boundaries = KeyFrameBoundaries(logger, args.keyframe_boundaries_filename) type_mappings = Container(logger) for entry in FileHandler(logger, args.type_mappings_filename): type_mappings.add(key=entry.get('full_type_ov'), value=entry.get('full_type')) slot_mappings = SlotMappings(logger, args.slot_mappings_filename) annotations = Annotations(logger, slot_mappings, document_mappings, text_boundaries, image_boundaries, video_boundaries, keyframe_boundaries, type_mappings, args.annotations, load_video_time_offsets_flag=args.notime) generator = AIFGenerator(logger, annotations, args.nochannel, args.reference_kb_id) generator.write_output(args.output) exit(ALLOK_EXIT_CODE)
def load(self):
    logger = self.get('logger')
    for filename in sorted(os.listdir(self.get('directory')), key=str):
        filename_including_path = '{}/{}'.format(self.get('directory'), filename)
        document_id = filename.replace('.tab', '')
        for entry in FileHandler(logger, filename_including_path):
            system_cluster = entry.get('system_cluster')
            gold_cluster = entry.get('gold_cluster')
            similarity = entry.get('similarity')
            document_system_to_gold = self.get('system_to_gold').get(document_id, default=Container(logger))
            document_gold_to_system = self.get('gold_to_system').get(document_id, default=Container(logger))
            document_system_to_gold.add(key=system_cluster,
                                        value={'aligned_to': gold_cluster, 'aligned_similarity': similarity})
            document_gold_to_system.add(key=gold_cluster,
                                        value={'aligned_to': system_cluster, 'aligned_similarity': similarity})
def load(self):
    logger = self.get('logger')
    for filename in sorted(os.listdir(self.get('directory')), key=str):
        filename_including_path = '{}/{}'.format(self.get('directory'), filename)
        document_id = filename.replace('.tab', '')
        for entry in FileHandler(logger, filename_including_path):
            metatype = entry.get('metatype')
            system_or_gold1 = entry.get('system_or_gold1')
            system_or_gold2 = entry.get('system_or_gold2')
            cluster1 = entry.get('cluster1')
            cluster2 = entry.get('cluster2')
            similarity = entry.get('similarity')
            # keep only self-similarity entries (same side, same cluster) with a non-zero similarity
            if similarity == 0 or system_or_gold1 != system_or_gold2 or cluster1 != cluster2:
                continue
            self.get('cluster_to_metatype').add(key='{}:{}'.format(system_or_gold1.upper(), cluster1),
                                                value=metatype)
            self.get(system_or_gold1).get(document_id, default=Container(logger)).add(key=cluster1,
                                                                                      value=similarity)
def process_slots(self, filename, subjectmentionid_fieldname):
    """
    Processes the slots from the file specified using the argument 'filename', and stores
    the slots into the slots container.

    Note that this method is called for both relation slots and event slots, and the key
    field name, i.e. the subject mention ID fieldname, differs between the two: it is
    called 'relationmention_id' for relations and 'eventmention_id' for events. This is
    why the parameter subjectmentionid_fieldname is used to choose between the two,
    depending on whether we are processing relations or events.
    """
    for entry in FileHandler(self.logger, filename):
        subjectmention_id = entry.get(subjectmentionid_fieldname)
        slot_code = entry.get('slot_type')
        slot_type = self.get('slot_mappings').get('code_to_type', slot_code)
        argmention_id = entry.get('argmention_id')
        subject = self.get('mentions').get(subjectmention_id, None)
        argument = self.get('mentions').get(argmention_id, None)
        attribute = entry.get('attribute')
        if subject is None:
            self.get('logger').record_event('MISSING_ITEM_WITH_KEY', 'Mention', subjectmention_id, entry.get('where'))
            continue
        if argument is None:
            self.get('logger').record_event('MISSING_ITEM_WITH_KEY', 'Mention', argmention_id, entry.get('where'))
            continue
        slot = Slot(self.logger, subject, slot_code, slot_type, argument, attribute, entry.get('where'))
        subject.add_slot(slot)
        argument.add_slot(slot)
        self.get('slots').add_member(slot)
def main(args):
    logger = Logger(args.log, args.log_specifications, sys.argv)
    os.mkdir(args.sparql)
    columns = ['query_id', 'entrypoint_type', 'entrypoint', 'num_clusters', 'depth']
    queries_fh = open(args.queries, 'w')
    queries_fh.write('{}\n'.format('\t'.join(columns)))
    query_num = 0
    for entry in FileHandler(logger, args.input):
        query_num += 1
        values = {
            'depth'          : args.depth,
            'entrypoint_type': entry.get('entrypoint_type'),
            'entrypoint'     : entry.get('entrypoint'),
            'num_clusters'   : entry.get('num_clusters'),
            'query_id'       : '{prefix}{query_num}'.format(prefix=args.prefix, query_num=augment(query_num))
            }
        line = '\t'.join([values[column] for column in columns])
        queries_fh.write('{}\n'.format(line))
        sparql_query_fh = open('{dir}/{query_id}.rq'.format(dir=args.sparql, query_id=values['query_id']), 'w')
        sparql_query_fh.write(get_sparql(logger, values['query_id'], values['entrypoint_type'], values['entrypoint']))
        sparql_query_fh.close()
    queries_fh.close()
    exit(ALLOK_EXIT_CODE)
def __init__(self, logger, **kwargs):
    super().__init__(logger)
    for key in kwargs:
        self.set(key, kwargs[key])
    self.file_handler = FileHandler(logger, self.get('filename'))
    self.components = ClaimComponents(logger, file_handler=self.get('file_handler'))
def process_mentions(self, filename, key_fieldname):
    """
    Processes the mentions from the file specified using the argument 'filename', and
    stores the mentions into the mentions dictionary using the key taken from the entry
    using the argument 'key_fieldname'.
    """
    for entry in FileHandler(self.logger, filename):
        if entry.get('type') == 'personalsocial' and entry.get('subtype') == 'unspecified':
            entry.set('subtype', 'relationship')
        key = entry.get(key_fieldname)
        if self.mentions.get(key, None) is None:
            mention = Mention(self.logger,
                              self.document_mappings,
                              self.text_boundaries,
                              self.image_boundaries,
                              self.video_boundaries,
                              self.keyframe_boundaries,
                              self.type_mappings,
                              self.load_video_time_offsets_flag,
                              entry)
            if len(mention.get('document_spans')) > 0:
                self.mentions[key] = mention
            else:
                self.record_event('MISSING_SPAN_FOR_MENTION', key, entry.get('where'))
        else:
            self.record_event('DUPLICATE_VALUE_IN_COLUMN', key, key_fieldname, entry.get('where'))
def load(self): """ Load the file containing core documents into the container """ for entry in FileHandler(self.logger, self.filename): docid = entry.get('root_id') self.add(key=docid, value=docid)
def load(self):
    logger = self.get('logger')
    for entry in FileHandler(logger, self.get('filename')):
        key = self.get('entry_to_key', entry)
        if key not in self.get('regions'):
            self.get('regions').add(key=key, value=Container(logger))
        regions_by_key = self.get('regions').get(key)
        for span in self.get('entry_to_spans', entry):
            regions_by_key.add(key=span, value=span)
def load(self): """ load keyframe boundary information. """ for entry in FileHandler(self.logger, self.filename): start_x, start_y, end_x, end_y = [0, 0, 0, 0] if entry.get('wxh'): end_x, end_y = entry.get('wxh').split('x') self.add(key=entry.get('keyframeid'), value=DocumentBoundary(self.logger, start_x, start_y, end_x, end_y))
def parse_topics(self, condition, topics_file):
    logger = self.get('logger')
    header_columns = {
        'Condition5': ['topic_id', 'topic', 'subtopic', 'claim_template'],
        'Condition6': ['topic_id', 'topic', 'subtopic', 'claim_template'],
        'Condition7': ['topic_id', 'topic']
        }
    header = FileHeader(logger, '\t'.join(header_columns.get(condition)))
    topics = {}
    for entry in FileHandler(logger, topics_file, header=header):
        topics.setdefault(entry.get('topic_id'), []).append(entry)
    return topics
def merge_files(self, input_files, output_file):
    print('--merging ...')
    print('--input:{}'.format('\n'.join(input_files)))
    print('--output:{}'.format(output_file))
    header = None
    fhs = {}
    for filename_with_path in input_files:
        fh = FileHandler(self.get('logger'), filename_with_path, encoding='utf-8')
        if header is None:
            header = fh.get('header').get('line').strip()
        if header != fh.get('header').get('line').strip():
            self.record_event('DEFAULT_CRITICAL_ERROR', 'Input file headers do not match')
        fhs[filename_with_path] = fh
    with open(output_file, 'w', encoding='utf-8') as program_output:
        program_output.write('{header}\n'.format(header=header))
        for filename_with_path in fhs:
            fh = fhs[filename_with_path]
            for entry in fh:
                program_output.write('{line}'.format(line=entry.get('line')))
def load_frames(self, filetype):
    logger = self.get('logger')
    frames = self.get('frames').get(filetype)
    if self.get('filenames').get(filetype).get('edges') is None:
        return
    for entry in FileHandler(self.get('logger'), self.get('filenames').get(filetype).get('edges')):
        # get edge_id
        frame_id = entry.get('?subject')
        if not frames.exists(frame_id):
            frames.add(key=frame_id, value=EventOrRelationFrame(logger, frame_id, entry.get('where')))
        frame = frames.get(frame_id)
        frame.update(entry)
def load_mentions(self, filetype):
    logger = self.get('logger')
    clusters = self.get('clusters').get(filetype)
    if self.get('filenames').get(filetype).get('mentions') is None:
        return
    for entry in FileHandler(self.get('logger'), self.get('filenames').get(filetype).get('mentions')):
        cluster_id = entry.get('?cluster')
        if not clusters.exists(cluster_id):
            clusters.add(key=cluster_id,
                         value=Cluster(logger,
                                       self.get('document_mappings'),
                                       self.get('document_boundaries'),
                                       cluster_id))
        cluster = clusters.get(cluster_id)
        cluster.add(entry)
def load(self): """ load video boundary information. """ for entry in FileHandler(self.logger, self.filename): start_x, start_y, end_x, end_y = [0, 0, 0, 0] document_element_id = splitext(entry.get('video_filename'))[0] entry.set('document_element_id', document_element_id) if entry.get('length'): end_x = entry.get('length') self.add(key=entry.get('document_element_id'), value=DocumentBoundary(self.logger, start_x, start_y, end_x, end_y))
def load(self, ere, ere_spec, ere_filename, ere_container):
    """
    Load the file.

    Arguments:
        ere (str):
            One of the following: 'entities', 'relations', or 'events'.
        ere_spec (EntitySpec, RelationSpec, or EventSpec):
            One of the following: EntitySpec, RelationSpec, or EventSpec.
        ere_filename (str)
        ere_container (aida.Container):
            The container into which to load the file.
    """
    for entry in FileHandler(self.logger, ere_filename, encoding='ISO-8859-1'):
        ontology_id = entry.get('AnnotIndexID')
        ere_container.add(key=ontology_id, value=ere_spec(self.get('logger'), entry))
def load(self): """ Read the sentence boundary file to load document boundary information. """ for entry in FileHandler(self.logger, self.filename): doceid, start_char, end_char = map( lambda arg: entry.get(arg), 'doceid,start_char,end_char'.split(',')) document_boundary = self.get(doceid, default=DocumentBoundary( self.logger, start_char, 0, end_char, 0)) tb_start_char = document_boundary.get('start_x') tb_end_char = document_boundary.get('end_x') if int(start_char) < int(tb_start_char): document_boundary.set('start_x', start_char) if int(end_char) > int(tb_end_char): document_boundary.set('end_x', end_char)
def process_kb_linking(self, filename):
    """
    Processes the KB linking information.
    """
    for entry in FileHandler(self.logger, filename):
        kb_id_or_kb_ids = entry.get('kb_id')
        mention_id = entry.get('mention_id')
        mention = self.get('mentions').get(mention_id, None)
        if mention is None:
            self.record_event('MISSING_ITEM_WITH_KEY', 'Mention', mention_id, entry.get('where'))
            continue
        node_metatype = mention.get('node_metatype')
        for kb_id in kb_id_or_kb_ids.split('|'):
            node = self.get('nodes').get(kb_id, None)
            if node is None:
                node = Node(self.logger, kb_id, node_metatype, [mention])
                self.nodes[kb_id] = node
            else:
                node.add_mention(mention)
            mention.add_node(node)
def load_task2_assessments(self):
    next_fqec_num = 1001
    generated_fqecs = {}
    path = '{}/data/zero-hop/*.tab'.format(self.assessments_dir)
    header = FileHeader(self.logger,
                        "\t".join(assessments.get('task2').get('across_documents_coreference').get('columns')))
    for filename in glob.glob(path):
        for entry in FileHandler(self.logger, filename, header):
            queryid, docid, mention_span, assessment_read, fqec_read, where = map(
                lambda key: entry.get(key),
                ['queryid', 'docid', 'mention_span', 'assessment', 'fqec', 'where'])
            entity_id = self.get('queries_to_score').get(queryid).get('entity_id')
            assessment = self.normalize('assessment', assessment_read)
            query_and_document = '{}:{}'.format(queryid, docid)
            key = '{}:{}'.format(query_and_document, mention_span)
            if self.exists(key):
                self.logger.record_event('MULTIPLE_ASSESSMENTS', key, where)
            fqec = fqec_read
            if fqec == 'NIL' and self.normalize('assessment', assessment) == 'CORRECT':
                if key not in generated_fqecs:
                    fqec = 'NILG{}'.format(next_fqec_num)
                    # advance the counter so each generated equivalence class ID is unique
                    next_fqec_num += 1
                    generated_fqecs[key] = fqec
                fqec = generated_fqecs[key]
            assessment_entry = Object(self.logger)
            assessment_entry.set('assessment', assessment)
            assessment_entry.set('docid', docid)
            assessment_entry.set('queryid', queryid)
            assessment_entry.set('mention_span', mention_span)
            assessment_entry.set('fqec_read', fqec_read)
            assessment_entry.set('fqec', fqec)
            assessment_entry.set('line', entry.get('line'))
            assessment_entry.set('where', where)
            if not self.exists(queryid):
                self.add(key=queryid, value=Container(self.get('logger')))
            self.get(queryid).add(key=':'.join(key.split(':')[1:]), value=assessment_entry)
            line = 'ENTITYID={} QUERYID={} DOCID={} MENTION={} ASSESSMENT={} FQEC_READ={} FQEC={}'.format(
                entity_id, queryid, docid, mention_span, assessment, fqec_read, fqec)
            self.logger.record_event('GROUND_TRUTH', line, where)
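# Standalone sketch of the NILG equivalence-class generation used by the assessment
# loaders in this section: when an assessed-correct response has no gold equivalence
# class (fqec == 'NIL'), a fresh 'NILG<number>' ID is minted the first time a given
# key needs one and is reused on later occurrences of the same key (the keys below
# are hypothetical).
from itertools import count

_next_fqec_num = count(1001)
_generated_fqecs = {}

def generated_fqec(key):
    if key not in _generated_fqecs:
        _generated_fqecs[key] = 'NILG{}'.format(next(_next_fqec_num))
    return _generated_fqecs[key]

print(generated_fqec('Q1:D1:(1,0)-(5,0)'))   # -> NILG1001
print(generated_fqec('Q1:D1:(1,0)-(5,0)'))   # -> NILG1001 (reused)
print(generated_fqec('Q2:D7:(9,0)-(12,0)'))  # -> NILG1002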
def load_classquery_assessments(self):
    next_fqec_num = 1001
    generated_fqecs = {}
    query_type = 'ClassQuery'
    path = '{}/data/class/*/*.tab'.format(self.assessments_dir)
    header = FileHeader(self.logger, "\t".join(assessments.get(query_type).get('columns')))
    for filename in glob.glob(path):
        for entry in FileHandler(self.logger, filename, header):
            queryid, docid, mention_span, assessment_read, fqec_read, where = map(
                lambda key: entry.get(key),
                ['queryid', 'docid', 'mention_span', 'assessment', 'fqec', 'where'])
            assessment = self.normalize('assessment', assessment_read)
            query_and_document = '{}:{}'.format(queryid, docid)
            key = '{}:{}'.format(query_and_document, mention_span)
            if self.exists(key):
                self.logger.record_event('MULTIPLE_ASSESSMENTS', key, where)
            fqec = fqec_read
            if fqec == 'NIL' and self.normalize('assessment', assessment) == 'CORRECT':
                if key not in generated_fqecs:
                    fqec = 'NILG{}'.format(next_fqec_num)
                    # advance the counter so each generated equivalence class ID is unique
                    next_fqec_num += 1
                    generated_fqecs[key] = fqec
                fqec = generated_fqecs[key]
            assessment_entry = Object(self.logger)
            assessment_entry.set('assessment', assessment)
            assessment_entry.set('docid', docid)
            assessment_entry.set('queryid', queryid)
            assessment_entry.set('mention_span', mention_span)
            assessment_entry.set('fqec_read', fqec_read)
            assessment_entry.set('fqec', fqec)
            assessment_entry.set('where', where)
            if not self.exists(key):
                self.add(key=key, value=assessment_entry)
            line = 'QUERYID={} DOCID={} MENTION={} ASSESSMENT={} FQEC_READ={} FQEC={}'.format(
                queryid, docid, mention_span, assessment, fqec_read, fqec)
            self.logger.record_event('GROUND_TRUTH', line, where)
def __init__(self, logger, filename):
    """
    Initialize the slots object.

    Parameters:
        logger (logger):
            the logger object
        filename (str):
            the name of the file containing mappings between the LDC-internal slot code
            and the external slot name. The file contains tab-separated values with a
            header in the first line. For example:

                slot_type_code                slot_type
                evt001arg01damagerdestroyer   ArtifactExistence.DamageDestroy_DamagerDestroyer
                evt002arg01damager            ArtifactExistence.DamageDestroy.Damage_Damager
    """
    super().__init__(logger)
    self.mappings = {
        'code_to_type': Container(logger),
        'type_to_codes': Container(logger)
        }
    # load the data and store the mapping in a dictionary
    for entry in FileHandler(logger, filename):
        slot_type_code = entry.get('slot_type_code')
        slot_type = entry.get('slot_type')
        if slot_type_code in self.get('mappings').get('code_to_type'):
            logger.record_event('DUPLICATE_VALUE_IN_COLUMN', slot_type_code, 'slot_type_code', entry.get('where'))
        self.get('mappings').get('code_to_type').add(key=slot_type_code, value=slot_type)
        self.get('mappings').get('type_to_codes').get(slot_type, default=Container(logger)).add(
            key=slot_type_code, value=slot_type_code)
def __init__(self, logger, **kwargs):
    super().__init__(logger)
    for key in kwargs:
        self.set(key, kwargs[key])
    self.file_handler = FileHandler(logger, self.get('filename'))
def load_responses(self):
    def get_expanded_claim_relations(provided=None):
        claimrelations = set(['ontopic'])
        if provided is not None:
            claimrelations.add(provided)
        lookup = {
            'refuting': ['nonsupporting'],
            'supporting': ['nonrefuting'],
            'nonrefuting': ['supporting'],
            'nonsupporting': ['refuting'],
            'related': ['nonsupporting', 'nonrefuting']
            }
        # default to an empty list so that an absent or unmapped claim relation
        # does not raise when expanding
        claimrelations.update(lookup.get(provided, []))
        return claimrelations
    logger = self.get('logger')
    queries_to_pool = {}
    for entry in FileHandler(logger, self.get('queries_to_pool_file')):
        condition = entry.get('condition')
        query_id = entry.get('query_id')
        depth = entry.get('depth')
        queries_to_pool['{}:{}'.format(condition, query_id)] = depth
    runs_directory = self.get('input_dir')
    for entry in FileHandler(logger, self.get('runs_to_pool_file')):
        run_id = entry.get('run_id')
        arf_directory = os.path.join(runs_directory, run_id, 'ARF-output')
        for condition in os.listdir(arf_directory):
            condition_dir = os.path.join(arf_directory, condition)
            for query_id in os.listdir(condition_dir):
                condition_and_query = '{}:{}'.format(condition, query_id)
                if condition_and_query in queries_to_pool:
                    condition_and_query_dir = os.path.join(condition_dir, query_id)
                    depth_left = {}
                    for claim_relation_and_depth in queries_to_pool.get(condition_and_query).split(','):
                        c, d = claim_relation_and_depth.split(':')
                        depth_left[c] = int(d)
                    ranking_file = '{path}/{query_id}.ranking.tsv'.format(
                        path=condition_and_query_dir.replace('ARF-output', 'SPARQL-VALID-output'),
                        query_id=query_id)
                    ranks = {}
                    with open(ranking_file) as fh:
                        lineno = 0
                        for line in fh.readlines():
                            lineno += 1
                            elements = line.strip().split('\t')
                            claim_id, rank = elements[1], int(elements[2])
                            provided = elements[3] if condition == 'Condition5' else None
                            claim_relations = get_expanded_claim_relations(provided=provided)
                            if rank in ranks:
                                # critical error
                                self.record_event('DUPLICATE_RANK', {'filename': ranking_file, 'lineno': lineno})
                            ranks[rank] = {'claim_id': claim_id, 'claim_relations': claim_relations}
                    for rank in sorted(ranks):
                        claim_id = ranks.get(rank).get('claim_id')
                        claim_relations = ranks.get(rank).get('claim_relations')
                        include_in_pool = False
                        for claim_relation in claim_relations:
                            # treat relations missing from the query's depth spec as having no depth left
                            if depth_left.get(claim_relation, 0) > 0:
                                include_in_pool = True
                        if include_in_pool:
                            self.get('claims').add(condition, query_id, run_id, claim_id,
                                                   runs_directory, condition_and_query_dir, claim_relations)
                            for claim_relation in claim_relations:
                                if depth_left.get(claim_relation, 0) > 0:
                                    depth_left[claim_relation] -= 1
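# Standalone sketch of the pooling-depth bookkeeping in load_responses above
# (the depths and relations below are hypothetical): a claim enters the pool if any
# of its expanded claim relations still has depth left, and every such relation's
# remaining depth is then decremented.
depth_left = {'refuting': 1, 'supporting': 2}
claim_relations = {'ontopic', 'refuting', 'nonsupporting'}
include_in_pool = any(depth_left.get(r, 0) > 0 for r in claim_relations)
if include_in_pool:
    for r in claim_relations:
        if depth_left.get(r, 0) > 0:
            depth_left[r] -= 1
print(include_in_pool, depth_left)  # -> True {'refuting': 0, 'supporting': 2}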
def augment_file(self, input_file, output_file):
    print('--augmenting ...')
    print('--input:{}'.format(input_file))
    print('--output:{}'.format(output_file))
    missing_handles = ['[unknown]', '', '""']
    fh = FileHandler(self.get('logger'), input_file, encoding='utf-8')
    with open(output_file, 'w', encoding='utf-8') as program_output:
        program_output.write('{header}\n'.format(header=fh.get('header').get('line')))
        for entry in fh:
            line = entry.get('line')
            handle_text = entry.get('?objectc_handle')
            if handle_text is not None:
                if handle_text in missing_handles:
                    corrected_handle_text = self.get('handle_text', entry.get('?oinf_j_span'))
                    if corrected_handle_text:
                        entry.set('?objectc_handle', corrected_handle_text)
                        self.record_event('DEFAULT_INFO',
                                          "replacing missing handle '{}' with text '{}'".format(
                                              handle_text, corrected_handle_text),
                                          entry.get('where'))
                        line = '{}\n'.format('\t'.join(
                            [entry.get(column) for column in entry.get('header').get('columns')]))
                    else:
                        self.record_event('DEFAULT_INFO',
                                          "handle '{}' found to be missing but no replacements made".format(
                                              handle_text),
                                          entry.get('where'))
                elif len(handle_text.split(':')) == 3:
                    handle_span = handle_text
                    pattern = re.compile(r'^(\w+?):(\w+?):\((\S+),(\S+)\)-\((\S+),(\S+)\)$')
                    match = pattern.match(handle_span)
                    if match:
                        handle_text_from_span = self.get('handle_text', handle_span)
                        if handle_text_from_span:
                            entry.set('?objectc_handle', handle_text_from_span)
                            self.record_event('DEFAULT_INFO',
                                              "replacing handle span '{}' with text '{}'".format(
                                                  handle_span, handle_text_from_span),
                                              entry.get('where'))
                            line = '{}\n'.format('\t'.join(
                                [entry.get(column) for column in entry.get('header').get('columns')]))
                        else:
                            self.record_event('DEFAULT_INFO',
                                              "handle span '{}' found but not replaced with text".format(
                                                  handle_text),
                                              entry.get('where'))
            program_output.write('{line}'.format(line=line))
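# Standalone illustration of the handle-span pattern used in augment_file above
# (the span value shown is hypothetical): it matches strings of the form
# '<docid>:<doceid>:(<x1>,<y1>)-(<x2>,<y2>)'.
import re
pattern = re.compile(r'^(\w+?):(\w+?):\((\S+),(\S+)\)-\((\S+),(\S+)\)$')
match = pattern.match('DOC123:DE456:(12,0)-(47,0)')
print(bool(match))     # -> True
print(match.groups())  # -> ('DOC123', 'DE456', '12', '0', '47', '0')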
def main(args):
    logger = Logger(args.log, args.log_specifications, sys.argv)
    type_mappings = Container(logger)
    for entry in FileHandler(logger, args.ontology_type_mappings):
        type_mappings.add(key=entry.get('full_type_ov'), value=entry.get('full_type'))
    text_boundaries = TextBoundaries(logger, args.sentence_boundaries)
    image_boundaries = ImageBoundaries(logger, args.image_boundaries)
    video_boundaries = VideoBoundaries(logger, args.video_boundaries)
    keyframe_boundaries = KeyFrameBoundaries(logger, args.keyframe_boundaries)
    document_boundaries = {
        'text': text_boundaries,
        'image': image_boundaries,
        'keyframe': keyframe_boundaries,
        'video': video_boundaries
        }
    output = []
    for entry in FileHandler(logger, args.input):
        document_id = entry.get('root_doc_id')
        document_element_id = entry.get('doc_element_id')
        modality = entry.get('media_type')
        type = entry.get('type')
        subtype = entry.get('subtype')
        subsubtype = entry.get('subsubtype')
        full_type = '{type}.{subtype}.{subsubtype}'.format(type=type, subtype=subtype, subsubtype=subsubtype)
        full_type_cleaned = full_type.replace('.unspecified', '')
        propercased_full_type = type_mappings.get(full_type_cleaned, None)
        span_string = entry.get('span')
        keyframe_id = None
        keyframe_num = 0
        if span_string == 'ENTIRE_DOCUMENT_ELEMENT':
            document_boundary = document_boundaries.get(modality).get(document_element_id)
            span_string = document_boundary.__str__()
        elif '-' in span_string:
            start, end = span_string.split('-')
            span_string = '({start},0)-({end},0)'.format(start=start, end=end)
        elif '_' in span_string:
            keyframe_id = span_string
            keyframe_num = span_string.split('_')[1]
            document_boundary = document_boundaries.get('keyframe').get(keyframe_id)
            span_string = document_boundary.__str__()
        else:
            span_string = None
        output_object = {
            'document_id': document_id,
            'document_element_id': document_element_id,
            'keyframe_id': keyframe_id,
            'keyframe_num': int(keyframe_num),
            'modality': modality,
            'region': span_string,
            'type': propercased_full_type,
            }
        output.append(output_object)
    printed = {}
    fh = open(args.output, 'w')
    header = ['document_id', 'document_element_or_keyframe_id', 'modality', 'region', 'type']
    fh.write('{}\n'.format('\t'.join(header)))
    for output_object in multisort(output, (('document_id', False),
                                            ('modality', False),
                                            ('document_element_id', False),
                                            ('keyframe_num', False),
                                            ('region', False),
                                            ('type', False))):
        line = get_line(output_object, header)
        if line not in printed:
            fh.write('{}\n'.format(line))
            printed[line] = 1
    fh.close()
    exit(ALLOK_EXIT_CODE)