def get_concepts_from_edges(json_, key):
    """
    Get concept-specific info related to an entity from a list
    containing relations. Each subject-object in the relations
    list is expressed in another data source (MESH, DRUGBANK etc)
    and their unique identifier is provided. Also, articles and new
    kinds of sub-obj are handled.
    Already fetched concepts are read from, and newly fetched ones are
    stored in, the MongoDB collection configured under
    settings['load']['mongo'].
    Input:
        - json_: dict,
        json-style dictionary with a field containing relations
        - key : str,
        string denoting the type of medical text to read from. Used to
        find the correct paragraph in the settings.yaml file.
    Output:
        - json_: dict,
        the same dictionary, where the relations field has been replaced
        by the expanded relations: one {'s', 'p', 'o'} entry for every
        (subject node, object node) combination of each original edge.
        Edges whose concepts could not be resolved are logged and skipped.
    """

    load_settings = settings['load'][key]
    # docfield containing list of elements containing the relations
    outfield = load_settings['itemfield']
    # field containing the source of the node for the subject
    sub_source = load_settings['sub_source']
    # field containing the source of the node for the object
    obj_source = load_settings['obj_source']
    mongo_settings = settings['load']['mongo']
    new_relations = []
    client = pymongo.MongoClient(mongo_settings['uri'])
    try:
        db = client[mongo_settings['db']]
        collection = db[mongo_settings['cache_collection']]
        # Pull the whole cache in memory once, so that the per-edge
        # look-ups do not hit the database.
        cache = dict(
            (item['key'], item['value']) for item in collection.find({})
        )
        edges = json_[outfield]
        N = len(edges)
        # Last percentage that was logged, so that every 10% step is
        # reported once instead of once per edge falling in that step.
        last_reported = 0
        for ii, triple in enumerate(edges):
            try:
                triple_subj = _resolve_concepts_mongo_cached(
                    triple['s'], sub_source, cache, collection)
                triple_obj = _resolve_concepts_mongo_cached(
                    triple['o'], obj_source, cache, collection)
                for subj, obj in product(triple_subj, triple_obj):
                    new_relations.append({
                        's': subj,
                        'p': triple['p'],
                        'o': obj
                    })
            except Exception as e:
                # Best effort: a failing edge is reported and skipped.
                # .get() is used so that an edge missing one of its keys
                # cannot make the handler itself raise.
                time_log(e)
                time_log('S: %s | P: %s | O: %s' % (
                    triple.get('s'), triple.get('p'), triple.get('o')))
                time_log(
                    'Skipped the above edge! Probably due to '
                    'concept-fetching errors!'
                )
            proc = int(ii / float(N) * 100)
            if proc % 10 == 0 and proc > last_reported:
                last_reported = proc
                time_log('We are at %d/%d edges transformed -- %0.2f %%'
                         % (ii, N, proc))
    finally:
        # Release the connection even if something unexpected is raised
        client.close()
    json_[outfield] = new_relations
    return json_


def _format_sem_types(sem_types):
    """
    Turn the semantic types of a concept into a ';'-delimited string.
    A list with more than one element is joined with ';', a value
    containing ',' is split on it and re-joined with ';'. Anything else
    (e.g. a single type) is returned untouched.
    """
    if isinstance(sem_types, list) and len(sem_types) > 1:
        return ';'.join(sem_types)
    if ',' in sem_types:
        return ';'.join(sem_types.split(','))
    return sem_types


def _concept_node(ent):
    """
    Build the node dictionary of a fetched concept, which must provide
    the 'cuid', 'label' and 'sem_types' keys.
    """
    sem_types = _format_sem_types(ent['sem_types'])
    return {
        'id:ID': ent['cuid'],
        'label': ent['label'],
        'sem_types:string[]': sem_types
    }


def _resolve_concepts_mongo_cached(identifier, source, cache, collection):
    """
    Map one end (subject or object) of a relation to a list of nodes.
    Input:
        - identifier: the id of the entity, as found in the relation
        - source: str,
        the source the identifier comes from. 'PMC', 'TEXT' and 'None'
        identifiers are used as-is, 'UMLS' ones are fetched with
        get_concept_from_cui and any other with get_concept_from_source
        - cache: dict,
        in-memory cache of fetched concepts, updated on a miss
        - collection: the MongoDB collection where a newly fetched
        concept is persisted
    Output:
        - list of node dictionaries
    """
    if source in ('PMC', 'TEXT', 'None'):
        return [{'id:ID': identifier}]
    if source == 'UMLS':
        if identifier in cache:
            ent = cache[identifier]
        else:
            ent = get_concept_from_cui(identifier)
            cache[identifier] = ent
            collection.insert_one({'key': identifier, 'value': ent})
            print('INSERTED in UMLS %s' % identifier)
        return [_concept_node(ent)]
    # Any other source may map a single identifier to many concepts
    if identifier in cache:
        ents = cache[identifier]
    else:
        ents = get_concept_from_source(identifier, source)
        cache[identifier] = ents
        collection.insert_one({'key': identifier, 'value': ents})
        print('INSERTED in other %s' % identifier)
    return [_concept_node(ent) for ent in ents]
def get_concepts_from_edges(json_, key):
    """
    Get concept-specific info related to an entity from a list
    containing relations. Each subject-object in the relations
    list is expressed in another data source (MESH, DRUGBANK etc)
    and their unique identifier is provided. Also, articles and new
    kinds of sub-obj are handled.
    Already fetched concepts are read from, and written back to, the
    json file found at settings['cache_path'].
    Input:
        - json_: dict,
        json-style dictionary with a field containing relations
        - key : str,
        string denoting the type of medical text to read from. Used to
        find the correct paragraph in the settings.yaml file.
    Output:
        - json_: dict,
        the same dictionary, where the relations field has been replaced
        by the expanded relations: one {'s', 'p', 'o'} entry for every
        (subject node, object node) combination of each original edge.
        Edges whose concepts could not be resolved are logged and skipped.
    """

    load_settings = settings['load'][key]
    # docfield containing list of elements containing the relations
    outfield = load_settings['edge_field']
    # field containing the source of the node for the subject
    sub_source = load_settings['sub_source']
    # field containing the source of the node for the object
    obj_source = load_settings['obj_source']
    new_relations = []
    cache_path = settings['cache_path']
    # Cache used to avoid retrieving the same concepts
    try:
        with open(cache_path, 'r') as f:
            cache = json.load(f)
    except IOError:
        # No readable cache file yet, start with an empty cache
        cache = {}
    edges = json_[outfield]
    N = len(edges)
    # Last percentage that was logged, so that every 10% step is
    # reported once instead of once per edge falling in that step.
    last_reported = 0
    try:
        for ii, triple in enumerate(edges):
            try:
                triple_subj = _resolve_concepts(
                    triple['s'], sub_source, cache)
                triple_obj = _resolve_concepts(
                    triple['o'], obj_source, cache)
                for subj, obj in product(triple_subj, triple_obj):
                    new_relations.append({
                        's': subj,
                        'p': triple['p'],
                        'o': obj
                    })
            except Exception as e:
                # Best effort: a failing edge is reported (including the
                # reason) and skipped. .get() is used so that an edge
                # missing one of its keys cannot make the handler raise.
                time_log(e)
                time_log('S: %s | P: %s | O: %s' % (
                    triple.get('s'), triple.get('p'), triple.get('o')))
                time_log(
                    'Skipped the above edge! Probably due to '
                    'concept-fetching errors!'
                )
            proc = int(ii / float(N) * 100)
            if proc % 10 == 0 and proc > last_reported:
                last_reported = proc
                time_log('We are at %d/%d edges transformed -- %0.2f %%'
                         % (ii, N, proc))
    finally:
        # Persist whatever was fetched, even if the loop was aborted,
        # so the concepts do not have to be retrieved again.
        with open(cache_path, 'w') as f:
            json.dump(cache, f, indent=2)
    json_[outfield] = new_relations
    return json_


def _format_sem_types(sem_types):
    """
    Turn the semantic types of a concept into a ';'-delimited string.
    A list with more than one element is joined with ';', a value
    containing ',' is split on it and re-joined with ';'. Anything else
    (e.g. a single type) is returned untouched.
    """
    if isinstance(sem_types, list) and len(sem_types) > 1:
        return ';'.join(sem_types)
    if ',' in sem_types:
        return ';'.join(sem_types.split(','))
    return sem_types


def _concept_node(ent):
    """
    Build the node dictionary of a fetched concept, which must provide
    the 'cuid', 'label' and 'sem_types' keys.
    """
    sem_types = _format_sem_types(ent['sem_types'])
    return {
        'id:ID': ent['cuid'],
        'label': ent['label'],
        'sem_types:string[]': sem_types
    }


def _resolve_concepts(identifier, source, cache):
    """
    Map one end (subject or object) of a relation to a list of nodes.
    Input:
        - identifier: the id of the entity, as found in the relation
        - source: str,
        the source the identifier comes from. 'PMC', 'TEXT' and 'None'
        identifiers are used as-is, 'UMLS' ones are fetched with
        get_concept_from_cui and any other with get_concept_from_source
        - cache: dict,
        in-memory cache of fetched concepts, updated on a miss
    Output:
        - list of node dictionaries
    """
    if source in ('PMC', 'TEXT', 'None'):
        return [{'id:ID': identifier}]
    if source == 'UMLS':
        if identifier in cache:
            ent = cache[identifier]
        else:
            ent = get_concept_from_cui(identifier)
            cache[identifier] = ent
        return [_concept_node(ent)]
    # Any other source may map a single identifier to many concepts
    if identifier in cache:
        ents = cache[identifier]
    else:
        ents = get_concept_from_source(identifier, source)
        cache[identifier] = ents
    return [_concept_node(ent) for ent in ents]