Example #1
def _json_dump(kb, filename):
    """
    Dumps the unified knowledgebase into a zipped JSON file.

    :param kb: the knowledgebase to serialize
    :param filename: file to write to
    :return:
    """
    with file_util.open(filename, mode='wb') as outfile:
        kb_dict = {
            'name': kb.name,
            'entities': [{
                'research_entity_id': entity.research_entity_id,
                'canonical_name': entity.canonical_name,
                'aliases': entity.aliases,
                'definition': entity.definition,
                'source_urls': entity.source_urls,
                'category': entity.category,
                'relation_ids': entity.relation_ids,
                'other_contexts': entity.other_contexts,
                'additional_details': entity.additional_details
            } for entity in kb.entities],
            'relations': [{
                'relation_id': rel_id,
                'relation_type': relation.relation_type,
                'entity_ids': relation.entity_ids,
                'symmetric': relation.symmetric,
                'labels': list(relation.labels) if relation.labels else None,
            } for rel_id, relation in enumerate(kb.relations)]
        }
        outfile.write(json.dumps(kb_dict).encode())
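
For context, a minimal sketch of loading such a dump back into a dict. The helper name _json_load is made up for this sketch, and it assumes the file is readable with the built-in open; in the original project, file_util.open may transparently handle compression or remote paths, so treat this as an illustration only.

import json

def _json_load(filename):
    # Read the bytes written by _json_dump and decode them back into a
    # plain dict with 'name', 'entities' and 'relations' keys.
    with open(filename, mode='rb') as infile:
        return json.loads(infile.read().decode())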
Example #2
    def setup_logging(self):
        """
        Initialize logging for this configuration.

        Output will be written to stderr and appended to the appropriate
        log files in the output directory for this config.
        :return:
        """
        log_level = getattr(logging, self.log_level.upper())
        setup_default_logging(log_level)
        logger = logging.getLogger()
        handler = logging.StreamHandler(
            file_util.open(self.output_file('LOG'), 'a'))
        handler.setFormatter(
            logging.Formatter(
                '%(levelname).1s%(asctime)-15s %(filename)s:%(lineno)d %(message)s',
            ))
        logger.addHandler(handler)

        logging.info('Initialized configuration (%s)', self.__class__.__name__)
        logging.info('Writing to: %s', self.output_dir())
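
A standalone sketch of what the format string above produces; setup_default_logging and self.output_file are project helpers and are not reproduced here, so plain stderr logging stands in for them.

import logging
import sys

logger = logging.getLogger('demo')
handler = logging.StreamHandler(sys.stderr)
handler.setFormatter(logging.Formatter(
    '%(levelname).1s%(asctime)-15s %(filename)s:%(lineno)d %(message)s'))
logger.addHandler(handler)
logger.setLevel(logging.INFO)

# Emits something like: I2024-05-01 12:00:00,000 demo.py:12 starting up
logger.info('starting up')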
Example #3
    def write(data, file_name, schema):
        '''
        Write the data to the file_name, with columns specified by the schema.
        '''
        with file_util.open(file_name, 'w') as fout:
            header = '\t'.join(schema) + '\n\n'
            fout.write(header)

            for doc in data:
                doc_start = '-DOCSTART-'
                doc_id = doc['doc_id']
                if doc_id is not None:
                    doc_start += ' ({0})'.format(doc_id)
                fout.write(doc_start + '\n\n')

                for sentence in doc['sents']:
                    for token in sentence:
                        line = '\t'.join(
                            ['{}'.format(token[col]) for col in schema]
                        ) + '\n'
                        fout.write(line)
                    # blank line after the sentence
                    fout.write('\n')
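
A hedged usage sketch for write, assuming it is exposed as a static method on the CoNLLIO class (as the read example below suggests), that each token is a dict keyed by the schema columns, and that file_util.open behaves like the built-in open for local paths; the file name and data are invented for illustration.

schema = ['token', 'pos']
data = [{
    'doc_id': 'doc1',
    'sents': [
        [{'token': 'Hello', 'pos': 'UH'},
         {'token': 'world', 'pos': 'NN'}],
    ],
}]
CoNLLIO.write(data, 'example.conll', schema)
# example.conll now holds a "token<TAB>pos" header, a blank line,
# "-DOCSTART- (doc1)", a blank line, one tab-separated line per token,
# and a blank line closing the sentence.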
Example #4
    def read(file_name, schema=None, column_transformers={}):
        '''
        Read the file.  The schema is inferred from the header in the file, or can be overridden
        by passing schema, a list of column names.

        column_transformers: a dict of column name -> callable functions
            used as a hook to transform the raw value for each column.  Use
            cases are changing type (e.g. str -> int), implementing label
            transformations, etc.
        '''
        with file_util.open(file_name, 'r') as fin:
            lines = fin.read().strip().split('\n')

        # Get the schema.
        header = lines[0]
        if header.startswith('-DOCSTART-'):
            if schema is None:
                raise ValueError(
                    "Didn't find a schema in the file. "
                    "Schema can be specified as an argument in the CoNLLIO.read method."
                )
        else:
            # the line after a schema line must be blank or -DOCSTART-
            if lines[1].strip() != '':
                raise ValueError(
                    'The header line must be followed by a blank line'
                )
            file_schema = header.strip().split()
            if schema is not None:
                if len(schema) != len(file_schema):
                    raise ValueError(
                        'The provided schema is not consistent with the number '
                        'of columns in the file.'
                    )
            else:
                schema = file_schema

        if header.startswith('-DOCSTART-'):
            start = 0
        else:
            start = 1

        ret = []
        raw_sentence = []
        doc_id = None
        sents = []
        for line in lines[start:]:
            if line.startswith('-DOCSTART-'):
                if len(raw_sentence) > 0:
                    # clear out the last sentence
                    parsed = CoNLLIO._parse_sentence(
                        raw_sentence, schema, column_transformers
                    )
                    sents.append(parsed)
                    raw_sentence = []

                # add sentences to the return for the previous document
                if len(sents) > 0:
                    ret.append({'doc_id': doc_id, 'sents': sents})
                    sents = []

                # update the doc_id
                doc_id = CoNLLIO._get_doc_id(line)

            elif line.strip() == '':
                # end of a sentence
                if len(raw_sentence) > 0:
                    parsed = CoNLLIO._parse_sentence(
                        raw_sentence, schema, column_transformers
                    )
                    sents.append(parsed)
                    raw_sentence = []
            else:
                # a token in a sentence
                raw_sentence.append(line.strip().split())

        # the last sentence
        if len(raw_sentence) > 0:
            parsed = CoNLLIO._parse_sentence(
                raw_sentence, schema, column_transformers
            )
            sents.append(parsed)
        if len(sents) > 0:
            ret.append({'doc_id': doc_id, 'sents': sents})

        if BILOU_FIELD_NAME in schema:
            # process entities and mentions
            return CoNLLIO._process_entities(
                ret, schema, BILOU_FIELD_NAME
            ), schema
        else:
            return CoNLLIO._process_entities(
                ret, schema, BILOU_FIELD_NAME_MENTIONS
            ), schema
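
And a matching sketch of reading the file back; the exact shape of the returned documents depends on CoNLLIO._parse_sentence and CoNLLIO._process_entities, which are not shown here, so only the call itself is illustrated.

docs, schema = CoNLLIO.read(
    'example.conll',
    column_transformers={'pos': str.lower},  # e.g. normalize tag case
)
# schema echoes the column names taken from the file header (or the
# schema argument, if one was passed); docs holds the parsed documents.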