def _json_dump(kb, filename):
    """
    Serialize a unified knowledgebase to a JSON file.

    :param kb: knowledgebase object exposing ``name``, ``entities`` and
        ``relations``
    :param filename: file to write to (opened via ``file_util``)
    :return:
    """
    # Build the entity records first, one dict per entity.
    entity_records = []
    for entity in kb.entities:
        entity_records.append({
            'research_entity_id': entity.research_entity_id,
            'canonical_name': entity.canonical_name,
            'aliases': entity.aliases,
            'definition': entity.definition,
            'source_urls': entity.source_urls,
            'category': entity.category,
            'relation_ids': entity.relation_ids,
            'other_contexts': entity.other_contexts,
            'additional_details': entity.additional_details
        })

    # Relations are keyed by their position in kb.relations.
    relation_records = []
    for rel_id, relation in enumerate(kb.relations):
        relation_records.append({
            'relation_id': rel_id,
            'relation_type': relation.relation_type,
            'entity_ids': relation.entity_ids,
            'symmetric': relation.symmetric,
            'labels': list(relation.labels) if relation.labels else None,
        })

    payload = {
        'name': kb.name,
        'entities': entity_records,
        'relations': relation_records
    }

    # File is opened in binary mode, so encode the JSON text before writing.
    with file_util.open(filename, mode='wb') as outfile:
        outfile.write(json.dumps(payload).encode())
def setup_logging(self):
    """
    Initialize logging for this configuration.

    Output will be written to stderr, and appended to the appropriate
    log files in the output directory for this config.

    :return:
    """
    level = getattr(logging, self.log_level.upper())
    setup_default_logging(level)

    # Attach an extra handler that appends to this config's LOG file.
    log_stream = file_util.open(self.output_file('LOG'), 'a')
    file_handler = logging.StreamHandler(log_stream)
    formatter = logging.Formatter(
        '%(levelname).1s%(asctime)-15s %(filename)s:%(lineno)d %(message)s',
    )
    file_handler.setFormatter(formatter)
    logging.getLogger().addHandler(file_handler)

    logging.info('Initialized configuration (%s)', self.__class__.__name__)
    logging.info('Writing to: %s', self.output_dir())
def write(data, file_name, schema):
    '''
    Write the data to the file_name, with columns specified by the schema.
    '''
    with file_util.open(file_name, 'w') as fout:
        # Header line (tab-separated column names) followed by a blank line.
        fout.write('\t'.join(schema) + '\n\n')

        for doc in data:
            # Document marker, optionally carrying the doc id.
            marker = '-DOCSTART-'
            doc_id = doc['doc_id']
            if doc_id is not None:
                marker += ' ({0})'.format(doc_id)
            fout.write(marker + '\n\n')

            for sentence in doc['sents']:
                rows = [
                    '\t'.join('{}'.format(token[col]) for col in schema) + '\n'
                    for token in sentence
                ]
                fout.writelines(rows)
                # blank line after the sentence
                fout.write('\n')
def read(file_name, schema=None, column_transformers=None):
    '''
    Read the file.  The schema is inferred from the header in the file,
    or can be over-ridden by passing schema, a list of column names.

    column_transformers: a dict of column name -> callable functions used
        a hook to transform the raw value for each column.  Use cases are
        changing type (e.g. str -> int), implementing label
        transformations, etc.

    Returns a tuple of (processed documents, schema).
    '''
    # FIX: the default used to be a mutable `{}`, which is shared across
    # all calls and can silently leak transformer state between them.
    # Use the None-sentinel idiom instead; behavior for callers is unchanged.
    if column_transformers is None:
        column_transformers = {}

    with file_util.open(file_name, 'r') as fin:
        lines = fin.read().strip().split('\n')

    # Get the schema.
    header = lines[0]
    if header.startswith('-DOCSTART-'):
        # No schema line in the file: the caller must supply one.
        if schema is None:
            raise ValueError(
                "Didn't find a schema in the file. "
                "Schema can be specified as an argument in the CoNLLIO.read method."
            )
    else:
        # the line after a schema line must be blank or -DOCSTART-
        if lines[1].strip() != '':
            raise ValueError(
                'The header line must be followed by a blank line'
            )
        file_schema = header.strip().split()
        if schema is not None:
            # A caller-supplied schema must match the file's column count.
            if len(schema) != len(file_schema):
                raise ValueError(
                    'The provided schema is not consistent with the number '
                    'of columns in the file.'
                )
        else:
            schema = file_schema

    # Skip the header line when the file contained one.
    if header.startswith('-DOCSTART-'):
        start = 0
    else:
        start = 1

    ret = []
    raw_sentence = []
    doc_id = None
    sents = []
    for line in lines[start:]:
        if line.startswith('-DOCSTART-'):
            if len(raw_sentence) > 0:
                # clear out the last sentence
                parsed = CoNLLIO._parse_sentence(
                    raw_sentence, schema, column_transformers
                )
                sents.append(parsed)
                raw_sentence = []
            # add sentences to the return for the previous document
            if len(sents) > 0:
                ret.append({'doc_id': doc_id, 'sents': sents})
                sents = []
            # update the doc_id
            doc_id = CoNLLIO._get_doc_id(line)
        elif line.strip() == '':
            # end of a sentence
            if len(raw_sentence) > 0:
                parsed = CoNLLIO._parse_sentence(
                    raw_sentence, schema, column_transformers
                )
                sents.append(parsed)
                raw_sentence = []
        else:
            # a token in a sentence
            raw_sentence.append(line.strip().split())

    # the last sentence
    if len(raw_sentence) > 0:
        parsed = CoNLLIO._parse_sentence(
            raw_sentence, schema, column_transformers
        )
        sents.append(parsed)
    if len(sents) > 0:
        ret.append({'doc_id': doc_id, 'sents': sents})

    if BILOU_FIELD_NAME in schema:
        # process entities and mentions
        return CoNLLIO._process_entities(
            ret, schema, BILOU_FIELD_NAME
        ), schema
    else:
        return CoNLLIO._process_entities(
            ret, schema, BILOU_FIELD_NAME_MENTIONS
        ), schema