def update_neo4j_parallel(results):
    """
    Function to create/update a neo4j database according to the nodes and
    edges generated by the create_neo4j_ functions, using multiple processes.
    Change settings.yaml values in the neo4j group of variables to match
    your needs.
    Input:
        - results: json-style dictionary. Check create_neo4j_ functions
        output for details
    Output:
        None, creates/merges the nodes into the wanted database
    """
    # Check that there is at least one Entity batch with values to persist
    found = False
    for key in ['nodes', 'edges']:
        for item in results[key]:
            if item['values'] and item['type'] == 'Entity':
                found = True
                break
        if found:
            break
    if not(found):
        time_log('NO NODES/EDGES FOUND! MOVING ON!')
        return 1
    try:
        N_THREADS = int(settings['num_cores'])
    except:
        N_THREADS = cpu_count()
    # results = {'nodes': [{'type': 'Entity', 'values': entities_nodes},
    #                      {'type': 'Article', 'values': articles_nodes}],
    #            'edges': [{'type': 'relation', 'values': relations_edges},
    #                      {'type': 'mention', 'values': entity_pmc_edges}]
    #            }
    par_res = [{'nodes': [{} for j in results['nodes']],
                'edges': [{} for j in results['edges']]}
               for i in xrange(N_THREADS)]
    # Create mini batches of the results
    for i, nodes in enumerate(results['nodes']):
        par_nodes = chunk_document_collection(nodes['values'], N_THREADS)
        for batch_num in xrange(N_THREADS):
            par_res[batch_num]['nodes'][i]['type'] = nodes['type']
            par_res[batch_num]['nodes'][i]['values'] = par_nodes[batch_num]
    for i, edges in enumerate(results['edges']):
        par_edges = chunk_document_collection(edges['values'], N_THREADS)
        for batch_num in xrange(N_THREADS):
            par_res[batch_num]['edges'][i]['type'] = edges['type']
            par_res[batch_num]['edges'][i]['values'] = par_edges[batch_num]
        len_col = " | ".join([str(len(b)) for b in par_edges])
        time_log('Will break the collection into batches of: %s %s edges!' % (len_col, edges['type']))
    pool = Pool(N_THREADS, maxtasksperchild=1)
    res = pool.map(update_neo4j_parallel_worker, par_res)
    pool.close()
    pool.join()
    del pool
    if sum(res) == N_THREADS:
        time_log('Completed parallel update of Neo4j!')
    else:
        time_log('Something wrong with the parallel execution?')
        time_log('Returned %d instead of %d' % (sum(res), N_THREADS))
    return 1

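# The worker passed to pool.map above is not shown in this file. Purely for
# orientation, a minimal sketch of what such a worker could look like is
# given below, assuming a py2neo connection configured through the neo4j
# group in settings.yaml (the 'uri' subkey is a hypothetical name) and the
# populate_* helpers defined further down in this module; the project's real
# worker may differ.
def _example_update_neo4j_parallel_worker(batch):
    """Sketch: push one batch of nodes/edges to neo4j, return 1 on success."""
    from py2neo import Graph  # assumed driver, matching the populate_* helpers
    graph = Graph(settings['neo4j']['uri'])  # hypothetical settings key
    for nodes in batch['nodes']:
        if nodes['values']:
            populate_nodes(graph, nodes['values'], nodes['type'])
    for edges in batch['edges']:
        if not edges['values']:
            continue
        if edges['type'] == 'relation':
            populate_relation_edges(graph, edges['values'])
        elif edges['type'] == 'mention':
            populate_mentioned_edges(graph, edges['values'])
        else:
            populate_new_edges(graph, edges['values'])
    return 1
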
def aggregate_relations(relations_edges):
    """
    Function to aggregate recurring entity:SEMREP_RELATION:entity relations.
    Input:
        - relations_edges: list, list of dicts as generated by the
        create_neo4j_ functions
    Output:
        - un_list: list, list of dicts with aggregated values for
        identical edges
    """
    uniques = {}
    c = 0
    for edge in relations_edges:
        cur_key = str(edge[':START_ID']) + '_' + str(edge[':TYPE']) + '_' + str(edge[':END_ID'])
        flag = False
        if cur_key in uniques:
            if not(edge['sent_id:string[]'] in uniques[cur_key]['sent_id:string[]']):
                for field in edge.keys():
                    if not(field in [':START_ID', ':TYPE', ':END_ID']):
                        uniques[cur_key][field] = uniques[cur_key][field] + ';' + edge[field]
                flag = True
        else:
            uniques[cur_key] = edge
        if flag:
            c += 1
    un_list = []
    time_log('Aggregated %d relations from %d in total' % (c, len(relations_edges)))
    for k, v in uniques.iteritems():
        un_list.append(v)
    return un_list

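# Minimal illustration of the aggregation above, using two made-up edges.
# The field names follow the neo4j-import style headers used throughout
# this module; the values themselves are invented for the example.
def _example_aggregate_relations():
    edges = [
        {':START_ID': 'C0011849', ':TYPE': 'TREATS', ':END_ID': 'C0020538',
         'sent_id:string[]': 'pmc1_sent1', 'negation:string[]': 'false'},
        {':START_ID': 'C0011849', ':TYPE': 'TREATS', ':END_ID': 'C0020538',
         'sent_id:string[]': 'pmc1_sent4', 'negation:string[]': 'false'},
    ]
    merged = aggregate_relations(edges)
    # One edge remains; the array-style fields are ';'-joined, e.g.
    # merged[0]['sent_id:string[]'] == 'pmc1_sent1;pmc1_sent4'
    return merged
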
def create_neo4j_results(json_, key='harvester'):
    """
    Helper function to call either the create_neo4j_harvester or the
    create_neo4j_edges function, according to the type of input.
    Input:
        - json_: dic, dictionary-json style generated from the
        parsers/extractors in the previous stages
        - key: str, string for denoting which create_neo4j_ function to use
    Output:
        - results: dic, json-style dictionary with keys 'nodes' and 'edges'
        containing a list of the transformed nodes and edges to be
        created/updated in neo4j. Each element in the list has a 'type'
        field denoting the type of the node/edge and a 'values' field
        containing the nodes/edges
    """
    if key == 'harvester':
        results = create_neo4j_harvester(json_)
    elif key == 'edges':
        results = create_neo4j_edges(json_)
    else:
        time_log('Type %s of data not yet supported!' % key)
        raise NotImplementedError
    return results

def save_mongo(json_):
    """
    Helper function to save edges/documents to mongo.
    Input:
        - json_: dic, json-style dictionary generated from the
        transformation modules in the previous phase. Must make sure that
        there is a field named as indicated in
        settings['out']['json']['itemfield'], where the edges/docs are
        stored. Specifically for the articles, they are replaced if another
        item with the same id is found in the collection.
    Output:
        None, just populates the database
    """
    uri = settings['out']['mongo']['uri']
    db_name = settings['out']['mongo']['db']
    collection_name = settings['out']['mongo']['collection']
    client = pymongo.MongoClient(uri)
    db = client[db_name]
    collection = db[collection_name]
    # Output id field
    idfield = settings['out']['json']['json_id_field']
    docs = json_[settings['out']['json']['itemfield']]
    for i, doc in enumerate(docs):
        if idfield in doc:
            result = collection.replace_one({'id': str(doc[idfield])}, doc, True)
        elif 'p' in doc:
            result = collection.insert_one(doc)
        else:
            time_log('Unknown type to persist to mongo')
            raise NotImplementedError
        if i % 100 == 0 and i > 99:
            time_log("Process: %d -- %0.2f %%" % (i, 100 * i / float(len(docs))))
    return 1

def aggregate_mentions(entity_pmc_edges):
    """
    Function to aggregate recurring entity:MENTIONED_IN:pmc relations.
    Input:
        - entity_pmc_edges: list, list of dicts as generated by the
        create_neo4j_ functions
    Output:
        - un_list: list, list of dicts with aggregated values for
        identical edges
    """
    uniques = {}
    c = 0
    for edge in entity_pmc_edges:
        cur_key = str(edge[':START_ID']) + '_' + str(edge[':END_ID'])
        flag = False
        if cur_key in uniques:
            uniques[cur_key]['score:float[]'] = uniques[cur_key]['score:float[]'] + ';' + edge['score:float[]']
            uniques[cur_key]['sent_id:string[]'] = uniques[cur_key]['sent_id:string[]'] + ';' + edge['sent_id:string[]']
            flag = True
        else:
            uniques[cur_key] = edge
        if flag:
            c += 1
    un_list = []
    time_log('Aggregated %d mentions from %d in total' % (c, len(entity_pmc_edges)))
    for k, v in uniques.iteritems():
        un_list.append(v)
    return un_list

def get_collection_count(source, type):
    """
    Helper function to get total collection length.
    Input:
        - source: str, value denoting where we will read from (e.g 'mongo')
        - type: str, value denoting what we will read (e.g. text, edges)
    Output:
        - N_collection: int, number of items in the collection
    """
    if source == 'file':
        inp_path = settings['load']['path']['file_path']
        # Document iterator field in the collection
        infield = settings['load'][type]['itemfield']
        with open(inp_path, 'r') as f:
            docs = ijson2.items(f, '%s.item' % infield)
            N_collection = 0
            for item in docs:
                N_collection += 1
    elif source == 'mongo':
        # input mongo variables from settings.yaml
        uri = settings['load']['mongo']['uri']
        db_name = settings['load']['mongo']['db']
        collection_name = settings['load']['mongo']['collection']
        client = pymongo.MongoClient(uri)
        db = client[db_name]
        collection = db[collection_name]
        N_collection = collection.count()
    else:
        time_log("Can't calculate total collection count for source type %s" % source)
        raise NotImplementedError
    return N_collection

def read(self, N=None, ind_=0):
    """
    Run the corresponding parsing function and return:
    Input:
        - N: int, the total number of items to iterate through
        - ind_: int, the starting point to read from
    Output:
        1) In case of batch or streaming processing:
            - json_: dict, the corresponding read batch
            - ind_: int, the index where the next iteration of readings
            should start from
        2) In case of loading the whole collection:
            - json_: dict, the corresponding collection
    """
    parallel_flag = str(settings['pipeline']['in']['parallel']) == 'True'
    stream_flag = str(settings['pipeline']['in']['stream']) == 'True'
    if parallel_flag or stream_flag:
        json_, ind_ = self.load(self.key, N, ind_)
        if json_:
            if self.parse:
                json_ = self.parse(json_)
            time_log('Completed Parsing. Read: %d documents!' % len(json_[settings['out']['json']['itemfield']]))
        return json_, ind_
    else:
        json_ = self.load(self.key)
        if self.parse:
            json_ = self.parse(json_)
        time_log('Completed Parsing. Read: %d documents!' % len(json_[settings['out']['json']['itemfield']]))
        return json_

def __init__(self, key, parser_key, name=None):
    """
    Initialization of the class.
    Input:
        - key: str, string denoting what extraction task is to take place
        - parser_key: str, string denoting what type of input to expect
        - name: str, optional string for the task to be printed
    """
    self.key = key
    self.parser_key = parser_key
    if self.key == 'semrep':
        if str(settings['pipeline']['in']['parallel']) == 'True':
            self.func = extract_semrep_parallel
            time_log('Will use multiprocessing for the semrep extraction!')
        else:
            self.func = extract_semrep
    elif self.key == 'metamap':
        self.func = extract_metamap
    elif self.key == 'reverb':
        raise NotImplementedError
    elif self.key == 'get_concepts_from_edges':
        if str(settings['pipeline']['in']['parallel']) == 'True':
            self.func = get_concepts_from_edges_parallel
        else:
            self.func = get_concepts_from_edges
    if name:
        self.name = name
    else:
        self.name = self.key

def read(self):
    """
    Run the corresponding parsing function and return the .json_
    dictionary result.
    """
    json_ = self.func()
    time_log('Completed Parsing. Read: %d documents!' % len(json_[settings['out']['json']['json_doc_field']]))
    return json_

def load_mongo_batches(key, N_collection, ind_=0):
    """
    Parse collection from mongo to be processed in streaming/parallel
    fashion. Fetches step = (batch_per_core x num_cores) documents,
    starting from ind_, and delivers them to the rest of the pipeline.
    Input:
        - key: str, the type of input to read
        - N_collection: int, total collection length
        - ind_: int, the starting point of the batch (or stream) to be read
    Output:
        - json_: dic, json-style dictionary with a field containing items
        - ind_ + step: int, the index where the next batch should start from
    """
    # input mongo variables from settings.yaml
    uri = settings['load']['mongo']['uri']
    db_name = settings['load']['mongo']['db']
    collection_name = settings['load']['mongo']['collection']
    client = pymongo.MongoClient(uri)
    db = client[db_name]
    collection = db[collection_name]
    # itemfield containing list of elements
    out_outfield = settings['out']['json']['itemfield']
    json_ = {out_outfield: []}
    stream_flag = str(settings['pipeline']['in']['stream']) == 'True'
    # batch size in case of a streaming environment is just one
    if stream_flag:
        step = 1
    # else N_THREADS * batch_per_core
    else:
        try:
            N_THREADS = int(settings['num_cores'])
        except:
            N_THREADS = cpu_count()
        try:
            batch_per_core = int(settings['batch_per_core'])
        except:
            batch_per_core = 100
        step = N_THREADS * batch_per_core
    time_log("Will start from %d/%d and read %d items" % (ind_, N_collection, step))
    # Never ask for more items than the collection holds
    if step > N_collection:
        step = N_collection
    cur = collection.find({}, skip=ind_, limit=step)
    c = 0
    for item in cur:
        del item['_id']
        c += 1
        json_[out_outfield].append(item)
    return json_, ind_ + step

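# Sketch of how the batch loader is typically driven (mirroring the batch
# loop in the pipeline run() method further down): count the collection
# once, then keep asking for the next slice until the index passes the
# total. The 'edges' key is just an example value; the settings keys are
# the same ones already used above.
def _example_iterate_mongo_batches():
    N = get_collection_count('mongo', 'edges')
    ind_ = 0
    while ind_ < N:
        batch, ind_ = load_mongo_batches('edges', N, ind_)
        # ... feed `batch` to the extract/transform/save phases here ...
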
def populate_nodes(graph, nodes, type_):
    """
    Function that actually calls the cypher query and populates the graph
    with nodes of type_, merging on already existing nodes on their id_.
    Input:
        - graph: py2neo.Graph, object representing the graph in neo4j.
        Using py2neo.
        - nodes: list, list of dics containing the attributes of each node
        - type_: str, type of the node to be merged
    Output: None, populates the db.
    """
    c = 0
    total_rel = 0
    time_log('~~~~~~ Will create nodes of type: %s ~~~~~~' % type_)
    for ent in nodes:
        c += 1
        quer = create_merge_query(ent, type_)
        f = graph.run(quer)
        total_rel += f.stats()['nodes_created']
        if c % 1000 == 0 and c > 999:
            time_log("Process: %d -- %0.2f %%" % (c, 100 * c / float(len(nodes))))
    time_log('#%s : %d' % (type_, c))
    time_log('Finally added %d new nodes!' % total_rel)

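# create_merge_query is defined elsewhere in the project. For orientation,
# a query in the spirit of what populate_nodes runs could look like the
# sketch below: MERGE on the node id so re-runs update rather than
# duplicate a node. The exact property handling of the real helper may
# differ; this is an assumption, not the project's implementation. The
# field names ('id:ID', 'label') follow the headers used in create_neo4j_csv.
def _example_merge_query(ent, type_):
    return """
    MERGE (n:%s {id: "%s"})
    ON CREATE SET n.label = "%s"
    ON MATCH SET n.label = "%s"
    """ % (type_, ent['id:ID'], ent['label'], ent['label'])
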
def run(self):
    """
    Run the configured pipeline: read the input (batch-by-batch when
    parallel/streaming is enabled, or as a whole collection otherwise),
    apply the 'trans' extractors and persist the results through the
    'out' dumpers.
    """
    parallel_flag = False
    stream_flag = False
    if 'parallel' in self.pipeline['in']:
        parallel_flag = True
    if 'stream' in self.pipeline['in']:
        stream_flag = True
    if parallel_flag or stream_flag:
        parser = Parser(self.pipeline['in']['source'], self.pipeline['in']['type'])
        ind_ = 0
        N = get_collection_count(parser.source, parser.key)
        while ind_ < N:
            old_ind = ind_
            json_all, ind_ = parser.read(N=N, ind_=ind_)
            outfield = settings['out']['json']['itemfield']
            if json_all:
                json_ = json_all
                for phase in self.phases:
                    dic = self.pipeline[phase]
                    if phase == 'trans':
                        for key, value in dic.iteritems():
                            if value:
                                extractor = Extractor(key, parser.key)
                                json_ = extractor.run(json_)
                    if phase == 'out':
                        for key, value in sorted(dic.iteritems()):
                            if value:
                                dumper = Dumper(key, parser.key)
                                dumper.save(json_)
            if ind_:
                time_log('Processed %d documents in parallel. We are at index %d!' % (ind_ - old_ind, ind_))
                proc = int(ind_ / float(N) * 100)
                if proc % 10 == 0 and proc > 0:
                    time_log('~' * 50)
                    time_log('We are at %d/%d documents processed -- %0.2f %%' % (ind_, N, proc))
                    time_log('~' * 50)
    else:
        parser = Parser(self.pipeline['in']['source'], self.pipeline['in']['type'])
        json_ = parser.read()
        for phase in self.phases:
            dic = self.pipeline[phase]
            if phase == 'trans':
                for key, value in dic.iteritems():
                    if value:
                        extractor = Extractor(key, parser.key)
                        json_ = extractor.run(json_)
            if phase == 'out':
                for key, value in sorted(dic.iteritems()):
                    if value:
                        dumper = Dumper(key, parser.key)
                        dumper.save(json_)

def update_mongo(json_):
    """
    Helper function to save the sentences found in the enriched articles
    in mongodb. Connects to a collection according to settings and then
    creates/updates the articles with the sentences found in them.
    Input:
        - json_: dic, json-style dictionary generated from the semrep
        extractor in the previous phase. Must make sure that there is a
        field named as indicated in settings['out']['json']['json_doc_field'],
        where the documents/articles are stored and each document/article
        has a field sents, as expected in the output of the semrep extractor.
    Output:
        None, just populates the database
    """
    uri = settings['mongo']['uri']
    db_name = settings['mongo']['db']
    collection_name = settings['mongo']['collection']
    client = pymongo.MongoClient(uri)
    db = client[db_name]
    collection = db[collection_name]
    new = 0
    upd = 0
    docs = json_[settings['out']['json']['json_doc_field']]
    for i, doc in enumerate(docs):
        cursor = collection.find({'id': doc['id']})
        sents = [{'sent_id': sent['sent_id'], 'text': sent['sent_text']} for sent in doc['sents']]
        if cursor.count() == 0:
            collection.insert_one({'id': doc['id'], 'sentences': sents})
            new += 1
        else:
            for mongo_doc in cursor:
                cur_sent = mongo_doc['sentences']
                cur_ids = [s['sent_id'] for s in cur_sent]
                new_sent = [s for s in sents if not(s['sent_id'] in cur_ids)]
                if new_sent:
                    cur_sent.extend(new_sent)
                    mongo_doc['sentences'] = cur_sent
                    collection.replace_one({'id': doc['id']}, mongo_doc)
                    upd += 1
        if i % 100 == 0 and i > 99:
            time_log("Process: %d -- %0.2f %%" % (i, 100 * i / float(len(docs))))
    time_log('Finally updated %d -- inserted %d documents!' % (upd, new))

def extract_semrep(json_, key):
    """
    Task function to parse and extract concepts from a json_ style dic,
    using the SemRep binary.
    Input:
        - json_ : dic, json-style dictionary generated from the Parse
        object related to the specific type of input
        - key : str, string denoting the type of medical text to read from.
        Used to find the correct paragraph in the settings.yaml file.
    Output:
        - json_ : dic, the previous json-style dictionary enriched with
        medical concepts
    """
    # outerfield for the documents in json
    if key == 'mongo':
        key = 'json'
    docfield = settings['out']['json']['itemfield']
    # textfield to read text from
    textfield = settings['out']['json']['json_text_field']
    N = len(json_[docfield])
    for i, doc in enumerate(json_[docfield]):
        text = doc[textfield]
        # Split very long texts into smaller chunks before calling SemRep
        if len(text) > 5000:
            chunks = create_text_batches(text)
            results = {'text': text, 'sents': []}
            sent_id = 0
            for chunk in chunks:
                tmp = semrep_wrapper(chunk)
                for sent in tmp['sents']:
                    sent['sent_id'] = sent_id
                    sent_id += 1
                    results['sents'].append(sent)
        else:
            results = semrep_wrapper(text)
        json_[docfield][i].update(results)
        proc = int(i / float(N) * 100)
        if proc % 10 == 0 and proc > 0:
            time_log('We are at %d/%d documents -- %0.2f %%' % (i, N, proc))
    return json_

def read(self, ind_=0):
    """
    Run the corresponding parsing function and return the .json_
    dictionary result.
    """
    parallel_flag = str(settings['pipeline']['in']['parallel']) == 'True'
    stream_flag = str(settings['pipeline']['in']['stream']) == 'True'
    if parallel_flag or stream_flag:
        json_, ind_, N = self.func(ind_)
        if json_:
            time_log('Completed Parsing. Read: %d documents!' % len(json_[settings['out']['json']['json_doc_field']]))
        return json_, ind_, N
    else:
        json_ = self.func()
        time_log('Completed Parsing. Read: %d documents!' % len(json_[settings['out']['json']['json_doc_field']]))
        return json_

def extract_semrep_parallel(json_, key):
    """
    Task function to parse and extract concepts from a json_ style dic,
    using the SemRep binary. It uses multiprocessing for efficiency.
    Input:
        - json_ : dic, json-style dictionary generated from the Parse
        object related to the specific type of input
        - key : str, string denoting the type of medical text to read from.
        Used to find the correct paragraph in the settings.yaml file.
    Output:
        - json_ : dic, the previous json-style dictionary enriched with
        medical concepts
    """
    # outerfield for the documents in json
    docfield = settings['out']['json']['itemfield']
    N = len(json_[docfield])
    try:
        N_THREADS = int(settings['num_cores'])
    except:
        N_THREADS = cpu_count()
    batches = chunk_document_collection(json_[docfield], N_THREADS)
    len_col = " | ".join([str(len(b)) for b in batches])
    time_log('Will break the collection into batches of: %s documents!' % len_col)
    batches = [{docfield: batch} for batch in batches]
    data = zip(batches, [key for batch in batches])
    pool = Pool(N_THREADS, maxtasksperchild=1)
    res = pool.map(semrep_parallel_worker, data)
    pool.close()
    pool.join()
    del pool
    tmp = {docfield: []}
    for batch_res in res:
        tmp[docfield].extend(batch_res[docfield])
    for i, sub_doc in enumerate(json_[docfield]):
        for sub_doc_new in tmp[docfield]:
            if sub_doc_new['id'] == sub_doc['id']:
                json_[docfield][i].update(sub_doc_new)
                break
    time_log('Completed multiprocessing extraction!')
    return json_

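# chunk_document_collection is defined elsewhere in the project; the
# parallel helpers above only rely on it returning N_THREADS roughly equal
# slices of a list. A minimal sketch with that behaviour (an assumption,
# not the actual helper) would be:
def _example_chunk_document_collection(docs, n_batches):
    batches = [[] for _ in xrange(n_batches)]
    for i, doc in enumerate(docs):
        # round-robin assignment keeps the batch sizes within one item of
        # each other, which is all the callers above need
        batches[i % n_batches].append(doc)
    return batches
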
def metamap_wrapper(text):
    """
    Function-wrapper for the metamap binary. Extracts concepts found in
    the text.
    !!!! REMEMBER TO START THE METAMAP TAGGER AND
         WordSense DISAMBIGUATION SERVER !!!!
    Input:
        - text: str, a piece of text or sentence
    Output:
        - a dictionary with key 'sents', whose value is a list of the
        concepts found per sentence
    """
    # Tokenize into sentences
    sents = sent_tokenize(text)
    # Load Metamap Instance
    mm = MetaMap.get_instance(settings['load']['path']['metamap'])
    concepts, errors = mm.extract_concepts(sents, range(len(sents)))
    # Keep the sentence ids
    ids = np.array([int(concept[0]) for concept in concepts])
    sentences = []
    for i in xrange(len(sents)):
        tmp = {'sent_id': i + 1, 'entities': [], 'relations': []}
        # Wanted concepts according to sentence
        wanted = np.where(ids == i)[0].tolist()
        for w_ind in wanted:
            w_conc = concepts[w_ind]
            if hasattr(w_conc, 'cui'):
                tmp_conc = {
                    'label': w_conc.preferred_name,
                    'cui': w_conc.cui,
                    'sem_types': w_conc.semtypes,
                    'score': w_conc.score
                }
                tmp['entities'].append(tmp_conc)
        sentences.append(tmp)
    if errors:
        time_log('Errors with extracting concepts!')
        time_log(errors)
    return {'sents': sentences, 'sent_text': text}

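# Example usage sketch (not run at import time): with the MetaMap tagger
# and WSD servers started as the docstring warns, a call like the one below
# is expected to return one entry per sentence under 'sents', each holding
# the concepts found. The sample sentence is invented.
def _example_metamap_wrapper_usage():
    out = metamap_wrapper('Aspirin is used to treat headaches.')
    for sent in out['sents']:
        for concept in sent['entities']:
            print concept['cui'], concept['label'], concept['score']
    return out
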
def __init__(self, source, key, name=None):
    """
    Initialization of the class.
    Attributes:
        - source: str, value denoting where we will read from (e.g 'mongo')
        - key: str, value denoting what we will read (e.g. text, edges)
        - name: str, the name is only for pretty-printing purposes.
    """
    self.source = source
    self.key = key
    parallel_flag = str(settings['pipeline']['in']['parallel']) == 'True'
    stream_flag = str(settings['pipeline']['in']['stream']) == 'True'
    if self.source == 'mongo':
        if parallel_flag or stream_flag:
            self.load = load_mongo_batches
        else:
            self.load = load_mongo
    elif self.source == 'file':
        if parallel_flag or stream_flag:
            self.load = load_file_batches
        else:
            self.load = load_file
    elif self.source == 'delete':
        self.load = parse_remove_edges
    else:
        time_log('Source to read was %s. Please change settings' % self.source)
        raise NotImplementedError
    if self.key == 'text':
        self.parse = parse_text
    elif self.key == 'med_red':
        self.parse = None
    elif self.key == 'edges':
        self.parse = None
    else:
        time_log('Type to read was %s. Please change settings' % self.key)
        raise NotImplementedError
    if name:
        self.name = name
    else:
        self.name = 'Type: %s From: %s' % (self.source, self.key)

def save_mongo(json_):
    """
    Helper function to save the sentences found in the enriched articles
    in mongodb. Connects to a collection according to settings and then
    creates/updates the articles with the sentences found in them.
    Input:
        - json_: dic, json-style dictionary generated from the semrep
        extractor in the previous phase. Must make sure that there is a
        field named as indicated in settings['out']['json']['json_doc_field'],
        where the documents/articles are stored and each document/article
        has a field sents, as expected in the output of the semrep extractor.
    Output:
        None, just populates the database
    """
    uri = settings['out']['mongo']['uri']
    db_name = settings['out']['mongo']['db']
    collection_name = settings['out']['mongo']['collection']
    client = pymongo.MongoClient(uri)
    db = client[db_name]
    collection = db[collection_name]
    # Output id field
    idfield = settings['out']['json']['json_id_field']
    docs = json_[settings['out']['json']['json_doc_field']]
    for i, doc in enumerate(docs):
        if idfield in doc:
            result = collection.replace_one({'id': str(doc[idfield])}, doc, True)
        elif 'p' in doc:
            result = collection.replace_one(
                {
                    'p': doc['p'],
                    's': doc['s'],
                    'o': doc['o']
                }, doc, True)
        else:
            time_log('Unknown type to persist to mongo')
            raise NotImplementedError
        if i % 100 == 0 and i > 99:
            time_log("Process: %d -- %0.2f %%" % (i, 100 * i / float(len(docs))))

def cui_to_uri(api_key, cui):
    """
    Function to map from cui to uri if possible. Uses the bioontology
    portal.
    Input:
        - api_key: str, api usage key, change it in settings.yaml
        - cui: str, cui of the entity we wish to map to a uri
    Output:
        - the uri found, in string format, or None
    """
    REST_URL = "http://data.bioontology.org"
    annotations = get_json_with_api(api_key, REST_URL + "/search?include_properties=true&q=" + urllib2.quote(cui))
    try:
        return annotations['collection'][0]['@id']
    except Exception, e:
        time_log('Could not map cui %s to a uri' % cui)
        time_log(e)
        return None

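# Example usage sketch: the key below is a placeholder, not a real
# BioPortal api key.
def _example_cui_to_uri_usage():
    uri = cui_to_uri('YOUR-BIOPORTAL-API-KEY', 'C0011849')  # C0011849: Diabetes Mellitus
    # Returns a bioontology '@id' string on success, or None when the
    # lookup fails.
    return uri
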
def save(self, json_):
    """
    Run the corresponding transformation and saving function and return
    the resulting json_ dictionary.
    """
    if type(json_) == dict:
        if self.transform:
            results = self.transform(json_, self.type_)
        else:
            results = json_
        json_ = self.func(results)
        time_log('Completed saving data. Results saved in:\n %s' % settings['out'][self.key]['out_path'])
    else:
        time_log('Unsupported type of json to work on!')
        time_log('Task : %s --- Type of json: %s' % (self.name, type(json_)))
        time_log(json_)
        json_ = {}
    return json_

def populate_new_edges(graph, new_edges):
    """
    Function to create/merge an unknown type of edge.
    Input:
        - graph: py2neo.Graph, object representing the graph in neo4j.
        Using py2neo.
        - new_edges: list, list of dics containing the attributes of each
        relation
    Output: None, populates the db.
    """
    c = 0
    total_rel = 0
    # field containing the type of the node for the subject
    sub_type = settings['load']['edges']['sub_type']
    # field containing the type of the node for the object
    obj_type = settings['load']['edges']['obj_type']
    for edge in new_edges:
        c += 1
        quer = """
        MATCH (a:%s {id:"%s"}), (b:%s {id:"%s"})
        MERGE (a)-[r:%s]->(b)
        """ % (sub_type, edge[':START_ID'], obj_type, edge[':END_ID'], edge[':TYPE'],)
        f = graph.run(quer)
        total_rel += f.stats()['relationships_created']
        if c % 1000 == 0 and c > 999:
            time_log("Process: %d -- %0.2f %%" % (c, 100 * c / float(len(new_edges))))
    time_log('#Edges: %d' % c)
    time_log('Finally added %d new edges!' % total_rel)

def get_concepts_from_edges_parallel(json_, key):
    """
    Same work as get_concepts_from_edges, but using multiprocessing for
    efficiency.
    Input:
        - json_: dict, json-style dictionary with a field containing
        relations
        - key : str, string denoting the type of medical text to read from.
        Used to find the correct paragraph in the settings.yaml file.
    Output:
        - json_: dict, the updated json-style dictionary where the
        relations in the list have been updated and each subject-object
        has been mapped to the corresponding concept
    """
    outfield = settings['load'][key]['itemfield']
    N = len(json_[outfield])
    try:
        N_THREADS = int(settings['num_cores'])
    except:
        N_THREADS = cpu_count()
    batches = chunk_document_collection(json_[outfield], N_THREADS)
    len_col = " | ".join([str(len(b)) for b in batches])
    time_log('Will break the edges into batches of: %s documents!' % len_col)
    batches = [{outfield: batch} for batch in batches]
    data = zip(batches, [key for batch in batches])
    pool = Pool(N_THREADS, maxtasksperchild=1)
    res = pool.map(edges_parallel_worker, data)
    pool.close()
    pool.join()
    del pool
    json_ = {outfield: []}
    for batch_res in res:
        json_[outfield].extend(batch_res[outfield])
    time_log('Completed multiprocessing extraction!')
    return json_

def run(self, json):
    """
    Run the corresponding extracting function and return the .json_
    dictionary result.
    """
    if type(json) == dict:
        json_ = self.func(json, self.parser_key)
        time_log('Completed extracting using %s!' % self.name)
    else:
        time_log('Unsupported type of json to work on!')
        time_log('Task : %s --- Type of json: %s' % (self.name, type(json)))
        time_log(json)
        json_ = {}
    return json_

def get_collection_count(source, type):
    """
    Helper function to get total collection length.
    Input:
        - source: str, value denoting where we will read from (e.g 'mongo')
        - type: str, value denoting what we will read (e.g. text, edges)
    Output:
        - N_collection: int, number of items in the collection
    """
    if source == 'mongo':
        # input mongo variables from settings.yaml
        uri = settings['load']['mongo']['uri']
        db_name = settings['load']['mongo']['db']
        collection_name = settings['load']['mongo']['collection']
        client = pymongo.MongoClient(uri)
        db = client[db_name]
        collection = db[collection_name]
        N_collection = collection.count()
    else:
        time_log("Can't calculate total collection count for source type %s" % source)
        raise NotImplementedError
    return N_collection

def save(self, json_):
    """
    Run the corresponding transformation and saving function and return
    the resulting json_ dictionary.
    """
    if type(json_) == dict:
        if self.transform:
            results = self.transform(json_, self.type_)
        else:
            results = json_
        json_ = self.func(results)
        if self.key == 'mongo_sentences':
            out_p = '/'.join([
                settings[self.key]['uri'],
                settings[self.key]['db'],
                settings[self.key]['collection']
            ])
            time_log('Completed saving data. Results saved in:\n %s' % out_p)
        else:
            time_log('Completed saving data. Results saved in:\n %s' % settings['out'][self.key]['out_path'])
    else:
        time_log('Unsupported type of json to work on!')
        time_log('Task : %s --- Type of json: %s' % (self.name, type(json_)))
        time_log(json_)
        json_ = {}
    return json_

def populate_mentioned_edges(graph, entity_pmc_edges):
    """
    Function to create/merge the mention edges between existing entities
    and articles.
    Input:
        - graph: py2neo.Graph, object representing the graph in neo4j.
        Using py2neo.
        - entity_pmc_edges: list, list of dics containing the attributes
        of each relation
    Output: None, populates the db.
    """
    c = 0
    total_rel = 0
    for edge in entity_pmc_edges:
        c += 1
        # Check whether this mention (same sentence) is already in the graph
        quer = """
        MATCH (a:Entity {id:"%s"}), (b:Article {id:"%s"})
        MATCH (a)-[r:%s]->(b)
        WHERE "%s" in r.sent_id
        RETURN r;
        """ % (edge[':START_ID'], edge[':END_ID'], edge[':TYPE'], edge['sent_id:string[]'])
        f = graph.run(quer)
        if len(f.data()) == 0 and edge[':START_ID'] and edge[':END_ID']:
            quer = create_edge_query(edge, 'Entity', 'Article')
            f = graph.run(quer)
            total_rel += f.stats()['relationships_created']
        if c % 1000 == 0 and c > 999:
            time_log("Process: %d -- %0.2f %%" % (c, 100 * c / float(len(entity_pmc_edges))))
    time_log('#Mentions: %d' % c)
    time_log('Finally added %d new mentions!' % total_rel)

def populate_new_edges(graph, new_edges):
    """
    Function to create/merge an unknown type of edge.
    Input:
        - graph: py2neo.Graph, object representing the graph in neo4j.
        Using py2neo.
        - new_edges: list, list of dics containing the attributes of each
        relation
    Output: None, populates the db.
    """
    c = 0
    total_rel = 0
    # field containing the type of the node for the subject
    sub_type = settings['load']['edges']['sub_type']
    # field containing the type of the node for the object
    obj_type = settings['load']['edges']['obj_type']
    for edge in new_edges:
        c += 1
        # Check whether this edge already exists for the current resource
        quer = """
        MATCH (a:%s {id:"%s"}), (b:%s {id:"%s"})
        MATCH (a)-[r:%s]->(b)
        WHERE ("%s" in r.resource)
        RETURN r;
        """ % (sub_type, edge[':START_ID'], obj_type, edge[':END_ID'],
               edge[':TYPE'], settings['neo4j']['resource'])
        f = graph.run(quer)
        if len(f.data()) == 0 and edge[':START_ID'] and edge[':END_ID']:
            quer = create_edge_query(edge, sub_type, obj_type)
            f = graph.run(quer)
            total_rel += f.stats()['relationships_created']
        if c % 1000 == 0 and c > 999:
            time_log("Process: %d -- %0.2f %%" % (c, 100 * c / float(len(new_edges))))
    time_log('#Edges: %d' % c)
    time_log('Finally added %d new edges!' % total_rel)

def populate_relation_edges(graph, relations_edges):
    """
    Function to create/merge the relation edges between existing entities.
    Input:
        - graph: py2neo.Graph, object representing the graph in neo4j.
        Using py2neo.
        - relations_edges: list, list of dics containing the attributes of
        each relation
    Output: None, populates the db.
    """
    c = 0
    total_rel = 0
    for edge in relations_edges:
        c += 1
        # Check whether this relation (same first sentence) already exists
        quer = """
        MATCH (a:Entity {id:"%s"}), (b:Entity {id:"%s"})
        MATCH (a)-[r:%s]->(b)
        WHERE "%s" in r.sent_id
        RETURN r;
        """ % (edge[':START_ID'], edge[':END_ID'], edge[':TYPE'],
               edge['sent_id:string[]'].split(';')[0])
        f = graph.run(quer)
        if len(f.data()) == 0:
            # Turn the ';'-separated array fields into cypher list literals
            subj_s = '['
            for i in edge['subject_sem_type:string[]'].split(';'):
                subj_s += '"' + i + '"' + ','
            subj_s = subj_s[:-1] + ']'
            obj_s = '['
            for i in edge['object_sem_type:string[]'].split(';'):
                obj_s += '"' + i + '"' + ','
            obj_s = obj_s[:-1] + ']'
            sent_s = '['
            for i in edge['sent_id:string[]'].split(';'):
                sent_s += '"' + i + '"' + ','
            sent_s = sent_s[:-1] + ']'
            neg_s = '['
            for i in edge['negation:string[]'].split(';'):
                neg_s += '"' + i + '"' + ','
            neg_s = neg_s[:-1] + ']'
            quer = """
            MATCH (a:Entity {id:"%s"}), (b:Entity {id:"%s"})
            MERGE (a)-[r:%s]->(b)
            ON MATCH SET r.subject_score = r.subject_score + %s,
            r.subject_sem_type = r.subject_sem_type + %s,
            r.object_score = r.object_score + %s,
            r.object_sem_type = r.object_sem_type + %s,
            r.sent_id = r.sent_id + %s,
            r.negation = r.negation + %s
            ON CREATE SET r.subject_score = %s, r.subject_sem_type = %s,
            r.object_score = %s, r.object_sem_type = %s,
            r.sent_id = %s, r.negation = %s
            """ % (edge[':START_ID'], edge[':END_ID'], edge[':TYPE'],
                   str([int(i) for i in edge['subject_score:float[]'].split(';')]), subj_s,
                   str([int(i) for i in edge['object_score:float[]'].split(';')]), obj_s,
                   sent_s, neg_s,
                   str([int(i) for i in edge['subject_score:float[]'].split(';')]), subj_s,
                   str([int(i) for i in edge['object_score:float[]'].split(';')]), obj_s,
                   sent_s, neg_s)
            f = graph.run(quer)
            total_rel += f.stats()['relationships_created']
        if c % 1000 == 0 and c > 999:
            time_log('Process: %d -- %0.2f %%' % (c, 100 * c / float(len(relations_edges))))
    time_log('#Relations :%d' % c)
    time_log('Finally added %d new relations!' % total_rel)

def create_neo4j_csv(results):
    """
    Create csv's for use by the neo4j import tool. Relies on the
    create_neo4j_ functions output and transforms it to a suitable format
    for automatic importing.
    Input:
        - results: dic, json-style dictionary. Check create_neo4j_
        function output for details
    Output:
        - None, just saves the documents in the allocated path as defined
        in settings.yaml
    """
    outpath = settings['out']['csv']['out_path']
    entities_nodes = None
    articles_nodes = None
    relations_edges = None
    entity_pmc_edges = None
    other_nodes = []
    other_edges = []
    for nodes in results['nodes']:
        if nodes['type'] == 'Entity':
            entities_nodes = nodes['values']
        elif nodes['type'] == 'Article':
            articles_nodes = nodes['values']
        else:
            other_nodes.extend(nodes['values'])
    for edges in results['edges']:
        if edges['type'] == 'relation':
            relations_edges = edges['values']
        elif edges['type'] == 'mention':
            entity_pmc_edges = edges['values']
        elif edges['type'] == 'NEW':
            other_edges.extend(edges['values'])
    dic_ = {
        'entities.csv': entities_nodes,
        'articles.csv': articles_nodes,
        'other_nodes.csv': other_nodes,
        'entities_pmc.csv': entity_pmc_edges,
        'relations.csv': relations_edges,
        'other_edges.csv': other_edges
    }
    dic_fields = {
        'entities.csv': ['id:ID', 'label', 'sem_types:string[]'],
        'articles.csv': ['id:ID', 'title', 'journal', 'sent_id:string[]'],
        'other_nodes.csv': ['id:ID'],
        'entities_pmc.csv': [':START_ID', 'score:float[]', 'sent_id:string[]', ':END_ID'],
        'relations.csv': [':START_ID', 'subject_score:float[]', 'subject_sem_type:string[]', ':TYPE',
                          'pred_type:string[]', 'object_score:float[]', 'object_sem_type:string[]',
                          'sent_id:string[]', 'negation:string[]', ':END_ID'],
        'other_edges.csv': [':START_ID', ':TYPE', ':END_ID']
    }
    for k, toCSV in dic_.iteritems():
        if toCSV:
            keys = toCSV[0].keys()
            out = os.path.join(outpath, k)
            with open(out, 'wb') as output_file:
                time_log("Created file %s" % k)
                dict_writer = csv2.DictWriter(output_file, fieldnames=dic_fields[k], encoding='utf-8')
                dict_writer.writeheader()
                dict_writer.writerows(toCSV)
    time_log('Created all documents needed')

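# Minimal sketch of how the function above is fed, using one made-up Entity
# node and one made-up mention edge (field names taken from dic_fields; the
# values themselves are invented).
def _example_create_neo4j_csv_input():
    results = {
        'nodes': [
            {'type': 'Entity',
             'values': [{'id:ID': 'C0011849', 'label': 'Diabetes Mellitus',
                         'sem_types:string[]': 'dsyn'}]},
            {'type': 'Article', 'values': []},
        ],
        'edges': [
            {'type': 'mention',
             'values': [{':START_ID': 'C0011849', 'score:float[]': '850',
                         'sent_id:string[]': 'pmc1_sent1', ':END_ID': 'pmc1'}]},
            {'type': 'relation', 'values': []},
        ],
    }
    create_neo4j_csv(results)
    # Writes entities.csv and entities_pmc.csv under
    # settings['out']['csv']['out_path'] with the headers listed above;
    # empty batches are skipped.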