def load_data(self):
    """
    Reads the file and, for each entry, adds the ontology_id-to-full_type
    mapping to the container corresponding to the entry's metatype.
    """
    fh = FileHandler(self.logger, self.filename)
    for entry in fh.get('entries'):
        metatype = entry.get('metatype')
        container = self.get('containers').get(metatype)
        container.add(key=entry.get('ontology_id'),
                      value=entry.get('full_type'))
Example #2
 def load_data(self):
     """
     Reads the file containing the mappings into Encodings.
     """
     fh = FileHandler(self.logger, self.filename)
     for entry in fh.get('entries'):
         self.add(key=entry.get('encoding'), value=entry.get('modality'))
Example #3
 def load_data(self):
     """
     Loads the data from the parent-children file into the DocumentMappings object.
     """
     mappings = nested_dict()
     fh = FileHandler(self.logger, self.filename)
     self.fileheader = fh.get('header')
     for entry in fh:
         doceid = entry.get('doceid')
         docid = entry.get('docid')
         detype = entry.get('detype')
         delang = entry.get('lang_manual')
         mappings[doceid]['docids'][docid] = 1
         mappings[doceid]['detype'] = detype
         mappings[doceid]['delang'] = delang.upper()
     for doceid in mappings:
         # TODO: next if doceid is n/a?
         delang = mappings[doceid]['delang']
         detype = mappings[doceid]['detype']
         modality = self.encodings.get(detype)
         for docid in mappings[doceid]['docids']:
             is_core = 0
             if self.core_documents is not None and self.core_documents.exists(docid):
                 is_core = 1
             document = self.get('documents').get(docid, default=Document(self.logger, docid))
             document.set('is_core', is_core)
             document_element = self.get('document_elements').get(doceid, default=DocumentElement(self.logger, doceid))
             document_element.add_document(document)
             document_element.set('type', detype)
             document_element.set('modality', modality)
             document_element.set('language', delang)
             document.add_document_element(document_element)
Example #4
    def load_responses(self):
        def order(filename):
            filename_order_map = {
                'AIDA_P2_TA1_CM_A0001.rq.tsv': 1,
                'AIDA_P2_TA1_AM_A0001.rq.tsv': 2,
                'AIDA_P2_TA1_TM_A0001.rq.tsv': 3
            }
            if filename not in filename_order_map:
                print("Filename: '{}' not found in lookup".format(filename))
                exit(1)
            return filename_order_map[filename]

        logger = self.get('logger')
        for subdir in [
                '{}/{}'.format(self.get('path'), d)
                for d in os.listdir(self.get('path'))
        ]:
            for filename in sorted(os.listdir(subdir), key=order):
                filename_including_path = '{}/{}'.format(subdir, filename)
                fh = FileHandler(logger, filename_including_path)
                schema = identify_file_schema(fh)
                if schema is None:
                    logger.record_event('UNKNOWN_RESPONSE_FILE_TYPE',
                                        filename_including_path,
                                        self.get('code_location'))
                else:
                    self.load_file(fh, schema)
Example #5
def main(args):
    """
    The main program for generating AIF
    """
    check_paths(args)
    logger = Logger(args.log, args.log_specifications_filename, sys.argv)
    core_documents = CoreDocuments(logger, args.core_documents_filename)
    encodings = Encodings(logger, args.encodings_filename)
    document_mappings = DocumentMappings(logger, args.parent_children_filename,
                                         encodings, core_documents)
    text_boundaries = TextBoundaries(logger, args.sentence_boundaries_filename)
    image_boundaries = ImageBoundaries(logger, args.image_boundaries_filename)
    video_boundaries = VideoBoundaries(logger, args.video_boundaries_filename)
    keyframe_boundaries = KeyFrameBoundaries(logger,
                                             args.keyframe_boundaries_filename)
    type_mappings = Container(logger)
    for entry in FileHandler(logger, args.type_mappings_filename):
        type_mappings.add(key=entry.get('full_type_ov'),
                          value=entry.get('full_type'))
    slot_mappings = SlotMappings(logger, args.slot_mappings_filename)
    annotations = Annotations(logger,
                              slot_mappings,
                              document_mappings,
                              text_boundaries,
                              image_boundaries,
                              video_boundaries,
                              keyframe_boundaries,
                              type_mappings,
                              args.annotations,
                              load_video_time_offsets_flag=args.notime)
    generator = AIFGenerator(logger, annotations, args.nochannel,
                             args.reference_kb_id)
    generator.write_output(args.output)
    exit(ALLOK_EXIT_CODE)
Example #6
 def load(self):
     logger = self.get('logger')
     for filename in sorted(os.listdir(self.get('directory')), key=str):
         filename_including_path = '{}/{}'.format(self.get('directory'),
                                                  filename)
         document_id = filename.replace('.tab', '')
         for entry in FileHandler(logger, filename_including_path):
             system_cluster = entry.get('system_cluster')
             gold_cluster = entry.get('gold_cluster')
             similarity = entry.get('similarity')
             document_system_to_gold = self.get('system_to_gold').get(
                 document_id, default=Container(logger))
             document_gold_to_system = self.get('gold_to_system').get(
                 document_id, default=Container(logger))
             document_system_to_gold.add(key=system_cluster,
                                         value={
                                             'aligned_to': gold_cluster,
                                             'aligned_similarity':
                                             similarity
                                         })
             document_gold_to_system.add(key=gold_cluster,
                                         value={
                                             'aligned_to': system_cluster,
                                             'aligned_similarity':
                                             similarity
                                         })
Example #7
 def load(self):
     logger = self.get('logger')
     for filename in sorted(os.listdir(self.get('directory')), key=str):
         filename_including_path = '{}/{}'.format(self.get('directory'),
                                                  filename)
         document_id = filename.replace('.tab', '')
         for entry in FileHandler(logger, filename_including_path):
             metatype = entry.get('metatype')
             system_or_gold1 = entry.get('system_or_gold1')
             system_or_gold2 = entry.get('system_or_gold2')
             cluster1 = entry.get('cluster1')
             cluster2 = entry.get('cluster2')
             similarity = entry.get('similarity')
             if similarity == 0 or system_or_gold1 != system_or_gold2 or cluster1 != cluster2:
                 continue
             self.get('cluster_to_metatype').add(key='{}:{}'.format(
                 system_or_gold1.upper(), cluster1),
                                                 value=metatype)
             self.get(system_or_gold1).get(document_id,
                                           default=Container(logger)).add(
                                               key=cluster1,
                                               value=similarity)
Example #8
    def process_slots(self, filename, subjectmentionid_fieldname):
        """
        Processes the slots from the file specified using the argument 'filename',
        and store the slots into the slots container.

        Note that this method is called for both the relation slots, and event slots,
        and the key field name, i.e. the subject mention ID fieldname, is different.
        It is called 'relationmention_id' for relations, and 'eventmention_id' for
        events. This is why we use the parameter subjectmentionid_fieldname to choose
        between the two depending on where we are processing relations or events.
        """
        for entry in FileHandler(self.logger, filename):
            subjectmention_id = entry.get(subjectmentionid_fieldname)
            slot_code = entry.get('slot_type')
            slot_type = self.get('slot_mappings').get('code_to_type', slot_code)
            argmention_id = entry.get('argmention_id')
            subject = self.get('mentions').get(subjectmention_id, None)
            argument = self.get('mentions').get(argmention_id, None)
            attribute = entry.get('attribute')
            if subject is None:
                self.get('logger').record_event('MISSING_ITEM_WITH_KEY', 'Mention', subjectmention_id, entry.get('where'))
                continue
            if argument is None:
                self.get('logger').record_event('MISSING_ITEM_WITH_KEY', 'Mention', argmention_id, entry.get('where'))
                continue
            slot = Slot(self.logger, subject, slot_code, slot_type, argument, attribute, entry.get('where'))
            subject.add_slot(slot)
            argument.add_slot(slot)
            self.get('slots').add_member(slot)
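
A minimal usage sketch of the dual call pattern described in the docstring above; the field names come from the docstring, while the filenames are illustrative assumptions:

    # Hedged usage sketch: same method, different subject-mention key field.
    # The filenames below are illustrative assumptions.
    self.process_slots('relation_slots.tab', 'relationmention_id')  # relations
    self.process_slots('event_slots.tab', 'eventmention_id')        # events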
Example #9
def main(args):
    logger = Logger(args.log, args.log_specifications, sys.argv)

    os.mkdir(args.sparql)

    columns = ['query_id', 'entrypoint_type', 'entrypoint', 'num_clusters', 'depth']

    queries_fh = open(args.queries, 'w')
    queries_fh.write('{}\n'.format('\t'.join(columns)))
    query_num = 0
    for entry in FileHandler(logger, args.input):
        query_num += 1
        values = {
            'depth'          : args.depth,
            'entrypoint_type': entry.get('entrypoint_type'),
            'entrypoint'     : entry.get('entrypoint'),
            'num_clusters'   : entry.get('num_clusters'),
            'query_id'       : '{prefix}{query_num}'.format(prefix=args.prefix, query_num=augment(query_num))
            }
        line = '\t'.join([values[column] for column in columns])
        queries_fh.write('{}\n'.format(line))
        
        sparql_query_fh = open('{dir}/{query_id}.rq'.format(dir=args.sparql, query_id=values['query_id']), 'w')
        sparql_query_fh.write(get_sparql(logger,
                                         values['query_id'],
                                         values['entrypoint_type'],
                                         values['entrypoint']))
        sparql_query_fh.close()
        
    queries_fh.close()

    exit(ALLOK_EXIT_CODE)
Example #10
 def __init__(self, logger, **kwargs):
     super().__init__(logger)
     for key in kwargs:
         self.set(key, kwargs[key])
     self.file_handler = FileHandler(logger, self.get('filename'))
     self.components = ClaimComponents(
         logger, file_handler=self.get('file_handler'))
Example #11
 def process_mentions(self, filename, key_fieldname):
     """
     Processes the mentions from the file specified using the argument 'filename',
     and store the mentions into the mentions dictionary using the key taken
     from the entry using the argument 'key_fieldname'.
     """
     for entry in FileHandler(self.logger, filename):
         if entry.get('type') == 'personalsocial' and entry.get(
                 'subtype') == 'unspecified':
             entry.set('subtype', 'relationship')
         key = entry.get(key_fieldname)
         if self.mentions.get(key, None) is None:
             mention = Mention(self.logger, self.document_mappings,
                               self.text_boundaries, self.image_boundaries,
                               self.video_boundaries,
                               self.keyframe_boundaries, self.type_mappings,
                               self.load_video_time_offsets_flag, entry)
             if len(mention.get('document_spans')) > 0:
                 self.mentions[key] = mention
             else:
                 self.record_event('MISSING_SPAN_FOR_MENTION', key,
                                   entry.get('where'))
         else:
             self.record_event('DUPLICATE_VALUE_IN_COLUMN', key,
                               key_fieldname, entry.get('where'))
Example #12
 def load(self):
     """
     Load the file containing core documents into the container
     """
     for entry in FileHandler(self.logger, self.filename):
         docid = entry.get('root_id')
         self.add(key=docid, value=docid)
Example #13
 def load(self):
     logger = self.get('logger')
     for entry in FileHandler(logger, self.get('filename')):
         key = self.get('entry_to_key', entry)
         if key not in self.get('regions'):
             self.get('regions').add(key=key, value=Container(logger))
         regions_by_key = self.get('regions').get(key)
         for span in self.get('entry_to_spans', entry):
             regions_by_key.add(key=span, value=span)
Example #14
 def load(self):
     """
     load keyframe boundary information.
     """
     for entry in FileHandler(self.logger, self.filename):
         start_x, start_y, end_x, end_y = [0, 0, 0, 0]
         if entry.get('wxh'):
             end_x, end_y = entry.get('wxh').split('x')
         self.add(key=entry.get('keyframeid'),
                  value=DocumentBoundary(self.logger, start_x, start_y,
                                         end_x, end_y))
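
For illustration, the 'wxh' handling above splits a width-by-height string into two string-valued coordinates; a minimal sketch:

    # Hedged illustration of the 'wxh' split above: both parts remain strings.
    end_x, end_y = '1920x1080'.split('x')
    assert (end_x, end_y) == ('1920', '1080')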
Example #15
 def parse_topics(self, condition, topics_file):
     logger = self.get('logger')
     header_columns = {
         'Condition5': ['topic_id', 'topic', 'subtopic', 'claim_template'],
         'Condition6': ['topic_id', 'topic', 'subtopic', 'claim_template'],
         'Condition7': ['topic_id', 'topic']
     }
     header = FileHeader(logger, '\t'.join(header_columns.get(condition)))
     topics = {}
     for entry in FileHandler(logger, topics_file, header=header):
         topics.setdefault(entry.get('topic_id'), []).append(entry)
     return topics
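
A hedged call sketch for the method above; the topics file path is an illustrative assumption, and the header columns follow the per-condition table defined in the method:

    # Hedged usage sketch: group Condition7 entries (topic_id, topic) by topic_id.
    # The file path is an illustrative assumption.
    topics = self.parse_topics('Condition7', '/path/to/topics.tab')
    for topic_id, entries in topics.items():
        print(topic_id, len(entries))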
Example #16
 def merge_files(self, input_files, output_file):
     print('--merging ...')
     print('--input:{}'.format('\n'.join(input_files)))
     print('--output:{}'.format(output_file))
     header = None
     fhs = {}
     for filename_with_path in input_files:
         fh = FileHandler(self.get('logger'),
                          filename_with_path,
                          encoding='utf-8')
         if header is None:
             header = fh.get('header').get('line').strip()
         if header != fh.get('header').get('line').strip():
             self.record_event('DEFAULT_CRITICAL_ERROR',
                               'Input file headers do not match')
         fhs[filename_with_path] = fh
     with open(output_file, 'w', encoding='utf-8') as program_output:
         program_output.write('{header}\n'.format(header=header))
         for filename_with_path in fhs:
             fh = fhs[filename_with_path]
             for entry in fh:
                 program_output.write(
                     '{line}'.format(line=entry.get('line')))
Example #17
 def load_frames(self, filetype):
     logger = self.get('logger')
     frames = self.get('frames').get(filetype)
     if self.get('filenames').get(filetype).get('edges') is None: return
     for entry in FileHandler(
             self.get('logger'),
             self.get('filenames').get(filetype).get('edges')):
         # get edge_id
         frame_id = entry.get('?subject')
         if not frames.exists(frame_id):
             frames.add(key=frame_id,
                        value=EventOrRelationFrame(logger, frame_id,
                                                   entry.get('where')))
         frame = frames.get(frame_id)
         frame.update(entry)
Example #18
 def load_mentions(self, filetype):
     logger = self.get('logger')
     clusters = self.get('clusters').get(filetype)
     if self.get('filenames').get(filetype).get('mentions') is None: return
     for entry in FileHandler(
             self.get('logger'),
             self.get('filenames').get(filetype).get('mentions')):
         cluster_id = entry.get('?cluster')
         if not clusters.exists(cluster_id):
             clusters.add(key=cluster_id,
                          value=Cluster(logger,
                                        self.get('document_mappings'),
                                        self.get('document_boundaries'),
                                        cluster_id))
         cluster = clusters.get(cluster_id)
         cluster.add(entry)
Example #19
 def load(self):
     """
     load video boundary information.
     """
     for entry in FileHandler(self.logger, self.filename):
         start_x, start_y, end_x, end_y = [0, 0, 0, 0]
         document_element_id = splitext(entry.get('video_filename'))[0]
         entry.set('document_element_id', document_element_id)
         if entry.get('length'):
             end_x = entry.get('length')
         self.add(key=entry.get('document_element_id'),
                  value=DocumentBoundary(self.logger,
                                         start_x, 
                                         start_y, 
                                         end_x, 
                                         end_y))
Example #20
    def load(self, ere, ere_spec, ere_filename, ere_container):
        """
        Load the file.

        Arguments:
            ere (str):
                One of the following: 'entities', 'relations', or 'events'.
            ere_spec (EntitySpec, RelationSpec, or EventSpec):
                The spec class used to wrap each entry, matching 'ere'.
            ere_filename (str):
                The name of the file to load.
            ere_container (aida.Container):
                The container into which to load the file.
        """
        for entry in FileHandler(self.logger,
                                 ere_filename,
                                 encoding='ISO-8859-1'):
            ontology_id = entry.get('AnnotIndexID')
            ere_container.add(key=ontology_id,
                              value=ere_spec(self.get('logger'), entry))
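
A hedged sketch of how load might be invoked for each ERE type, per the docstring; the filenames and container keys are illustrative assumptions:

    # Hedged usage sketch following the docstring's argument description.
    # Filenames and container keys are illustrative assumptions.
    self.load('entities', EntitySpec, 'entities.tab', self.get('entities'))
    self.load('relations', RelationSpec, 'relations.tab', self.get('relations'))
    self.load('events', EventSpec, 'events.tab', self.get('events'))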
Example #21
 def load(self):
     """
     Read the sentence boundary file to load document boundary
     information.
     """
     for entry in FileHandler(self.logger, self.filename):
         doceid, start_char, end_char = map(
             lambda arg: entry.get(arg),
             'doceid,start_char,end_char'.split(','))
         document_boundary = self.get(doceid,
                                      default=DocumentBoundary(
                                          self.logger, start_char, 0,
                                          end_char, 0))
         tb_start_char = document_boundary.get('start_x')
         tb_end_char = document_boundary.get('end_x')
         if int(start_char) < int(tb_start_char):
             document_boundary.set('start_x', start_char)
         if int(end_char) > int(tb_end_char):
             document_boundary.set('end_x', end_char)
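
The loop above widens each document boundary to the minimum start and maximum end seen across entries; a minimal standalone sketch of that logic:

    # Hedged standalone sketch of the widening logic above, using a plain dict
    # in place of DocumentBoundary: entries (10, 50) then (0, 80) leave the
    # boundary at start 0, end 80.
    bounds = {'start_x': 10, 'end_x': 50}
    for start_char, end_char in [(10, 50), (0, 80)]:
        if int(start_char) < int(bounds['start_x']):
            bounds['start_x'] = start_char
        if int(end_char) > int(bounds['end_x']):
            bounds['end_x'] = end_char
    assert bounds == {'start_x': 0, 'end_x': 80}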
Example #22
 def process_kb_linking(self, filename):
     """
     Processes the KB linking information.
     """
     for entry in FileHandler(self.logger, filename):
         kb_id_or_kb_ids = entry.get('kb_id')
         mention_id = entry.get('mention_id')
         mention = self.get('mentions').get(mention_id, None)
         if mention is None:
             self.record_event('MISSING_ITEM_WITH_KEY', 'Mention', mention_id, entry.get('where'))
             continue
         node_metatype = mention.get('node_metatype')
         for kb_id in kb_id_or_kb_ids.split('|'):
             node = self.get('nodes').get(kb_id, None)
             if node is None:
                 node = Node(self.logger, kb_id, node_metatype, [mention])
                 self.nodes[kb_id] = node
             else:
                 node.add_mention(mention)
             mention.add_node(node)
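
Note that the 'kb_id' column may carry several IDs separated by '|'; a minimal illustration of the split above (the IDs are hypothetical):

    # Hedged illustration of the kb_id splitting above.
    assert 'kb1|kb2'.split('|') == ['kb1', 'kb2']  # mention linked to two KB nodes
    assert 'kb1'.split('|') == ['kb1']             # a single ID also works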
Example #23
    def load_task2_assessments(self):
        next_fqec_num = 1001
        generated_fqecs = {}
        path = '{}/data/zero-hop/*.tab'.format(self.assessments_dir)
        header = FileHeader(self.logger, "\t".join(assessments.get('task2').get('across_documents_coreference').get('columns')))
        for filename in glob.glob(path):
            for entry in FileHandler(self.logger, filename, header):
                queryid, docid, mention_span, assessment_read, fqec_read, where = map(
                    lambda key: entry.get(key),
                    ['queryid', 'docid', 'mention_span', 'assessment', 'fqec', 'where']
                    )
                entity_id = self.get('queries_to_score').get(queryid).get('entity_id')
                assessment = self.normalize('assessment', assessment_read)
                query_and_document = '{}:{}'.format(queryid, docid)
                key = '{}:{}'.format(query_and_document, mention_span)
                if self.exists(key):
                    self.logger.record_event('MULTIPLE_ASSESSMENTS', key, where)
                fqec = fqec_read
                if fqec == 'NIL' and self.normalize('assessment', assessment) == 'CORRECT':
                    if key not in generated_fqecs:
                        fqec = 'NILG{}'.format(next_fqec_num)
                        next_fqec_num += 1  # keep generated FQECs unique
                        generated_fqecs[key] = fqec
                    fqec = generated_fqecs[key]
                assessment_entry = Object(self.logger)
                assessment_entry.set('assessment', assessment)
                assessment_entry.set('docid', docid)
                assessment_entry.set('queryid', queryid)
                assessment_entry.set('mention_span', mention_span)
                assessment_entry.set('fqec_read', fqec_read)
                assessment_entry.set('fqec', fqec)
                assessment_entry.set('line', entry.get('line'))
                assessment_entry.set('where', where)

                if not self.exists(queryid):
                    self.add(key=queryid, value=Container(self.get('logger')))
                self.get(queryid).add(key=':'.join(key.split(':')[1:]), value=assessment_entry)

                line = 'ENTITYID={} QUERYID={} DOCID={} MENTION={} ASSESSMENT={} FQEC_READ={} FQEC={}'.format(
                    entity_id, queryid, docid, mention_span, assessment, fqec_read, fqec)
                self.logger.record_event('GROUND_TRUTH', line, where)
Example #24
 def load_classquery_assessments(self):
     next_fqec_num = 1001
     generated_fqecs = {}
     query_type = 'ClassQuery'
     path = '{}/data/class/*/*.tab'.format(self.assessments_dir)
     header = FileHeader(
         self.logger, "\t".join(assessments.get(query_type).get('columns')))
     for filename in glob.glob(path):
         for entry in FileHandler(self.logger, filename, header):
             queryid, docid, mention_span, assessment_read, fqec_read, where = map(
                 lambda key: entry.get(key), [
                     'queryid', 'docid', 'mention_span', 'assessment',
                     'fqec', 'where'
                 ])
             assessment = self.normalize('assessment', assessment_read)
             query_and_document = '{}:{}'.format(queryid, docid)
             key = '{}:{}'.format(query_and_document, mention_span)
             if self.exists(key):
                 self.logger.record_event('MULTIPLE_ASSESSMENTS', key,
                                          where)
             fqec = fqec_read
             if fqec == 'NIL' and self.normalize('assessment',
                                                 assessment) == 'CORRECT':
                  if key not in generated_fqecs:
                      fqec = 'NILG{}'.format(next_fqec_num)
                      next_fqec_num += 1  # keep generated FQECs unique
                      generated_fqecs[key] = fqec
                 fqec = generated_fqecs[key]
             assessment_entry = Object(self.logger)
             assessment_entry.set('assessment', assessment)
             assessment_entry.set('docid', docid)
             assessment_entry.set('queryid', queryid)
             assessment_entry.set('mention_span', mention_span)
             assessment_entry.set('fqec_read', fqec_read)
             assessment_entry.set('fqec', fqec)
             assessment_entry.set('where', where)
             if not self.exists(key):
                 self.add(key=key, value=assessment_entry)
             line = 'QUERYID={} DOCID={} MENTION={} ASSESSMENT={} FQEC_READ={} FQEC={}'.format(
                 queryid, docid, mention_span, assessment, fqec_read, fqec)
             self.logger.record_event('GROUND_TRUTH', line, where)
Example #25
 def __init__(self, logger, filename):
     """
     Initialize the slots object.
     
     Parameters:
         logger (logger):
             the logger object
         filename (str):
             the name of the file containing mappings between LDC 
             internal slot code, and external slot name.
         
     The file contains tab separated values with header in first line.
         
     For example,    
         slot_type_code    slot_type
         evt001arg01damagerdestroyer    ArtifactExistence.DamageDestroy_DamagerDestroyer
         evt002arg01damager    ArtifactExistence.DamageDestroy.Damage_Damager
     """
     super().__init__(logger)
     self.mappings = {
         'code_to_type': Container(logger),
         'type_to_codes': Container(logger)
     }
     # load the data and store the mapping in a dictionary
     for entry in FileHandler(logger, filename):
         slot_type_code = entry.get('slot_type_code')
         slot_type = entry.get('slot_type')
         if slot_type_code in self.get('mappings').get('code_to_type'):
             logger.record_event('DUPLICATE_VALUE_IN_COLUMN',
                                 slot_type_code, 'slot_type_code',
                                 entry.get('where'))
         self.get('mappings').get('code_to_type').add(key=slot_type_code,
                                                      value=slot_type)
         self.get('mappings').get('type_to_codes').get(
             slot_type, default=Container(logger)).add(key=slot_type_code,
                                                       value=slot_type_code)
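
A hedged lookup sketch based on the sample rows in the docstring; the filename is an illustrative assumption, and the access path mirrors the 'mappings' structure built above:

    # Hedged usage sketch; the filename is an illustrative assumption and the
    # expected value comes from the docstring's sample rows.
    slot_mappings = SlotMappings(logger, 'slot_mappings.tab')
    slot_type = slot_mappings.get('mappings').get('code_to_type').get('evt002arg01damager')
    # expected: 'ArtifactExistence.DamageDestroy.Damage_Damager'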
Example #26
 def __init__(self, logger, **kwargs):
     super().__init__(logger)
     for key in kwargs:
         self.set(key, kwargs[key])
     self.file_handler = FileHandler(logger, self.get('filename'))
Example #27
    def load_responses(self):
        def get_expanded_claim_relations(provided=None):
            claimrelations = set(['ontopic'])
            if provided is not None:
                claimrelations.add(provided)
                lookup = {
                    'refuting': ['nonsupporting'],
                    'supporting': ['nonrefuting'],
                    'nonrefuting': ['supporting'],
                    'nonsupporting': ['refuting'],
                    'related': ['nonsupporting', 'nonrefuting']
                }
                claimrelations.update(lookup.get(provided, []))
            return claimrelations

        logger = self.get('logger')
        queries_to_pool = {}
        for entry in FileHandler(logger, self.get('queries_to_pool_file')):
            condition = entry.get('condition')
            query_id = entry.get('query_id')
            depth = entry.get('depth')
            queries_to_pool['{}:{}'.format(condition, query_id)] = depth
        runs_directory = self.get('input_dir')
        for entry in FileHandler(logger, self.get('runs_to_pool_file')):
            run_id = entry.get('run_id')
            arf_directory = os.path.join(runs_directory, run_id, 'ARF-output')
            for condition in os.listdir(arf_directory):
                condition_dir = os.path.join(arf_directory, condition)
                for query_id in os.listdir(condition_dir):
                    condition_and_query = '{}:{}'.format(condition, query_id)
                    if condition_and_query in queries_to_pool:
                        condition_and_query_dir = os.path.join(
                            condition_dir, query_id)
                        depth_left = {}
                        for claim_relation_and_depth in queries_to_pool.get(
                                condition_and_query).split(','):
                            c, d = claim_relation_and_depth.split(':')
                            depth_left[c] = int(d)
                        ranking_file = '{path}/{query_id}.ranking.tsv'.format(
                            path=condition_and_query_dir.replace(
                                'ARF-output', 'SPARQL-VALID-output'),
                            query_id=query_id)
                        ranks = {}
                        with open(ranking_file) as fh:
                            lineno = 0
                            for line in fh.readlines():
                                lineno += 1
                                elements = line.strip().split('\t')
                                claim_id, rank = elements[1], int(elements[2])
                                provided = elements[
                                    3] if condition == 'Condition5' else None
                                claim_relations = get_expanded_claim_relations(
                                    provided=provided)
                                if rank in ranks:
                                    # critical error
                                    self.record_event('DUPLICATE_RANK', {
                                        'filename': ranking_file,
                                        'lineno': lineno
                                    })
                                ranks[rank] = {
                                    'claim_id': claim_id,
                                    'claim_relations': claim_relations
                                }
                        for rank in sorted(ranks):
                            claim_id = ranks.get(rank).get('claim_id')
                            claim_relations = ranks.get(rank).get(
                                'claim_relations')
                            include_in_pool = False
                            for claim_relation in claim_relations:
                                if depth_left.get(claim_relation, 0) > 0:
                                    include_in_pool = True
                            if include_in_pool:
                                self.get('claims').add(
                                    condition, query_id, run_id, claim_id,
                                    runs_directory, condition_and_query_dir,
                                    claim_relations)
                                for claim_relation in claim_relations:
                                    if depth_left.get(claim_relation, 0) > 0:
                                        depth_left[claim_relation] -= 1
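
For reference, a hedged illustration of what the inner get_expanded_claim_relations helper above returns for two of the Condition5 claim relations:

    # Hedged illustration of get_expanded_claim_relations (defined above):
    #   get_expanded_claim_relations('refuting')
    #       -> {'ontopic', 'refuting', 'nonsupporting'}
    #   get_expanded_claim_relations('related')
    #       -> {'ontopic', 'related', 'nonsupporting', 'nonrefuting'}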
Example #28
    def augment_file(self, input_file, output_file):
        print('--augmenting ...')
        print('--input:{}'.format(input_file))
        print('--output:{}'.format(output_file))

        missing_handles = ['[unknown]', '', '""']

        fh = FileHandler(self.get('logger'), input_file, encoding='utf-8')
        with open(output_file, 'w', encoding='utf-8') as program_output:
            program_output.write(
                '{header}\n'.format(header=fh.get('header').get('line')))
            for entry in fh:
                line = entry.get('line')
                handle_text = entry.get('?objectc_handle')
                if handle_text is not None:
                    if handle_text in missing_handles:
                        corrected_handle_text = self.get(
                            'handle_text', entry.get('?oinf_j_span'))
                        if corrected_handle_text:
                            entry.set('?objectc_handle', corrected_handle_text)
                            self.record_event(
                                'DEFAULT_INFO',
                                'replacing missing handle \'{}\' with text \'{}\''
                                .format(handle_text, corrected_handle_text),
                                entry.get('where'))
                            line = '{}\n'.format('\t'.join([
                                entry.get(column) for column in entry.get(
                                    'header').get('columns')
                            ]))
                        else:
                            self.record_event(
                                'DEFAULT_INFO',
                                "handle \'{}\' found to be missing but no replacements made"
                                .format(handle_text), entry.get('where'))
                    elif len(handle_text.split(':')) == 3:
                        handle_span = handle_text
                        pattern = re.compile(
                            r'^(\w+?):(\w+?):\((\S+),(\S+)\)-\((\S+),(\S+)\)$')
                        match = pattern.match(handle_span)
                        if match:
                            handle_text_from_span = self.get(
                                'handle_text', handle_span)
                            if handle_text_from_span:
                                entry.set('?objectc_handle',
                                          handle_text_from_span)
                                self.record_event(
                                    'DEFAULT_INFO',
                                    'replacing handle span \'{}\' with text \'{}\''
                                    .format(handle_span,
                                            handle_text_from_span),
                                    entry.get('where'))
                                line = '{}\n'.format('\t'.join([
                                    entry.get(column) for column in entry.get(
                                        'header').get('columns')
                                ]))
                            else:
                                self.record_event(
                                    'DEFAULT_INFO',
                                    "handle span \'{}\' found but not replaced with text"
                                    .format(handle_text), entry.get('where'))
                program_output.write('{line}'.format(line=line))
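
A hedged illustration of the handle-span pattern above; the document and element IDs are hypothetical:

    # Hedged illustration of the span pattern compiled above.
    import re
    pattern = re.compile(r'^(\w+?):(\w+?):\((\S+),(\S+)\)-\((\S+),(\S+)\)$')
    match = pattern.match('DOC1:DE1:(10,0)-(25,0)')  # hypothetical span string
    assert match and match.groups() == ('DOC1', 'DE1', '10', '0', '25', '0')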
Example #29
def main(args):
    logger = Logger(args.log, args.log_specifications, sys.argv)

    type_mappings = Container(logger)
    for entry in FileHandler(logger, args.ontology_type_mappings):
        type_mappings.add(key=entry.get('full_type_ov'),
                          value=entry.get('full_type'))

    text_boundaries = TextBoundaries(logger, args.sentence_boundaries)
    image_boundaries = ImageBoundaries(logger, args.image_boundaries)
    video_boundaries = VideoBoundaries(logger, args.video_boundaries)
    keyframe_boundaries = KeyFrameBoundaries(logger, args.keyframe_boundaries)
    document_boundaries = {
        'text': text_boundaries,
        'image': image_boundaries,
        'keyframe': keyframe_boundaries,
        'video': video_boundaries
    }

    output = []
    for entry in FileHandler(logger, args.input):
        document_id = entry.get('root_doc_id')
        document_element_id = entry.get('doc_element_id')
        modality = entry.get('media_type')
        type = entry.get('type')
        subtype = entry.get('subtype')
        subsubtype = entry.get('subsubtype')
        full_type = '{type}.{subtype}.{subsubtype}'.format(
            type=type, subtype=subtype, subsubtype=subsubtype)
        full_type_cleaned = full_type.replace('.unspecified', '')
        propercased_full_type = type_mappings.get(full_type_cleaned, None)
        span_string = entry.get('span')
        keyframe_id = None
        keyframe_num = 0
        if span_string == 'ENTIRE_DOCUMENT_ELEMENT':
            document_boundary = document_boundaries.get(modality).get(
                document_element_id)
        span_string = str(document_boundary)
        elif '-' in span_string:
            start, end = span_string.split('-')
            span_string = '({start},0)-({end},0)'.format(start=start, end=end)
        elif '_' in span_string:
            keyframe_id = span_string
            keyframe_num = span_string.split('_')[1]
            document_boundary = document_boundaries.get('keyframe').get(
                keyframe_id)
        span_string = str(document_boundary)
        else:
            span_string = None
        output_object = {
            'document_id': document_id,
            'document_element_id': document_element_id,
            'keyframe_id': keyframe_id,
            'keyframe_num': int(keyframe_num),
            'modality': modality,
            'region': span_string,
            'type': propercased_full_type,
        }
        output.append(output_object)

    printed = {}
    fh = open(args.output, 'w')
    header = [
        'document_id', 'document_element_or_keyframe_id', 'modality', 'region',
        'type'
    ]
    fh.write('{}\n'.format('\t'.join(header)))
    for output_object in multisort(
            output, (('document_id', False), ('modality', False),
                     ('document_element_id', False), ('keyframe_num', False),
                     ('region', False), ('type', False))):
        line = get_line(output_object, header)
        if line not in printed:
            fh.write('{}\n'.format(line))
            printed[line] = 1
    fh.close()
    exit(ALLOK_EXIT_CODE)