Example #1
0
def _format_sem_types(ent):
    """
    Build the value stored under 'sem_types:string[]' for a concept.
    Input:
        - ent: dict,
        concept entry carrying a 'sem_types' field
    Output:
        - the semantic types joined with ';' when the field is a list
        with more than one element or a comma-separated string,
        otherwise the field's value unchanged
    """
    sem_types = ent['sem_types']
    if isinstance(sem_types, list) and len(sem_types) > 1:
        return ';'.join(sem_types)
    if ',' in sem_types:
        return ';'.join(sem_types.split(','))
    return sem_types


def _concept_node(ent):
    """
    Turn a fetched concept entry into the node dictionary used in the
    transformed relations.
    """
    return {
        'id:ID': ent['cuid'],
        'label': ent['label'],
        'sem_types:string[]': _format_sem_types(ent)
    }


def _nodes_from_mongo_cache(identifier, source, cache, collection):
    """
    Resolve one side (subject or object) of an edge to a list of nodes.
    Input:
        - identifier: identifier of the entity as found in the edge
        - source: str,
        source of the identifier ('UMLS', 'PMC', 'TEXT', 'None' or any
        other value, which is passed on to get_concept_from_source)
        - cache: dict,
        in-memory cache mapping identifiers to fetched concepts,
        updated in place on a miss
        - collection: mongo collection where newly fetched concepts
        are also stored
    Output:
        - list of node dictionaries for this entity
    """
    # Articles / free text are not concepts, only their id is kept
    if source in ('PMC', 'TEXT', 'None'):
        return [{'id:ID': identifier}]
    if identifier in cache:
        fetched = cache[identifier]
    else:
        if source == 'UMLS':
            fetched = get_concept_from_cui(identifier)
        else:
            fetched = get_concept_from_source(identifier, source)
        cache[identifier] = fetched
        collection.insert_one({'key': identifier, 'value': fetched})
        if source == 'UMLS':
            print('INSERTED in UMLS %s' % identifier)
        else:
            print('INSERTED in other %s' % identifier)
    # A UMLS lookup yields a single concept, any other source yields
    # an iterable of concepts
    if source == 'UMLS':
        return [_concept_node(fetched)]
    return [_concept_node(ent) for ent in fetched]


def get_concepts_from_edges(json_, key):
    """
    Get concept-specific info related to an entity from a list
    containing relations. Each subject-object in the relations
    list is expressed in another data source (MESH, DRUGBANK etc)
    and their unique identifier is provided. Also, articles and new
    kinds of sub-obj are handled.
    Concepts already fetched are read from a mongo collection used as
    cache; concepts fetched during this call are added to it.
    Input:
        - json: dict,
        json-style dictionary with a field containing
        relations
        - key : str,
        string denoting the type of medical text to read from. Used to
        find the correct paragraph in the settings.yaml file.
    Output:
        - json: dict,
        the same dictionary, where the relations list has been replaced
        by the transformed relations: one relation for every
        combination of the nodes the subject and the object were
        mapped to. Edges that raise an error are logged and skipped.
    """

    load_settings = settings['load']
    # docfield containing list of elements containing the relations
    outfield = load_settings[key]['itemfield']
    # field containing the source of the node for the subject
    sub_source = load_settings[key]['sub_source']
    # field containing the source of the node for the object
    obj_source = load_settings[key]['obj_source']
    mongo_settings = load_settings['mongo']
    new_relations = []
    client = pymongo.MongoClient(mongo_settings['uri'])
    try:
        collection = client[mongo_settings['db']][
            mongo_settings['cache_collection']]
        # Load every cached concept in memory once
        cache = {}
        for item in collection.find({}):
            cache[item['key']] = item['value']
        triples = json_[outfield]
        N = len(triples)
        # Last percentage reported, so each step is logged only once
        last_logged = 0
        for ii, triple in enumerate(triples):
            try:
                triple_subj = _nodes_from_mongo_cache(
                    triple['s'], sub_source, cache, collection)
                triple_obj = _nodes_from_mongo_cache(
                    triple['o'], obj_source, cache, collection)
                for subj, obj in product(triple_subj, triple_obj):
                    new_relations.append({
                        's': subj,
                        'p': triple['p'],
                        'o': obj
                    })
            except Exception as e:
                # Best effort: a failing edge must not stop the rest.
                # .get is used so a missing key cannot make the handler
                # itself fail.
                time_log(e)
                time_log('S: %s | P: %s | O: %s' %
                         (triple.get('s'), triple.get('p'), triple.get('o')))
                time_log(
                    'Skipped the above edge! Probably due to concept-fetching errors!'
                )
            proc = int(ii / float(N) * 100)
            if proc % 10 == 0 and proc > last_logged:
                last_logged = proc
                time_log('We are at %d/%d edges transformed -- %0.2f %%' %
                         (ii, N, proc))
    finally:
        # Release the connection even if something above failed
        client.close()
    json_[outfield] = new_relations
    return json_
Example #2
0
def _format_sem_types(ent):
    """
    Build the value stored under 'sem_types:string[]' for a concept.
    Input:
        - ent: dict,
        concept entry carrying a 'sem_types' field
    Output:
        - the semantic types joined with ';' when the field is a list
        with more than one element or a comma-separated string,
        otherwise the field's value unchanged
    """
    sem_types = ent['sem_types']
    if isinstance(sem_types, list) and len(sem_types) > 1:
        return ';'.join(sem_types)
    if ',' in sem_types:
        return ';'.join(sem_types.split(','))
    return sem_types


def _concept_node(ent):
    """
    Turn a fetched concept entry into the node dictionary used in the
    transformed relations.
    """
    return {
        'id:ID': ent['cuid'],
        'label': ent['label'],
        'sem_types:string[]': _format_sem_types(ent)
    }


def _nodes_from_file_cache(identifier, source, cache):
    """
    Resolve one side (subject or object) of an edge to a list of nodes.
    Input:
        - identifier: identifier of the entity as found in the edge
        - source: str,
        source of the identifier ('UMLS', 'PMC', 'TEXT', 'None' or any
        other value, which is passed on to get_concept_from_source)
        - cache: dict,
        cache mapping identifiers to fetched concepts, updated in
        place on a miss
    Output:
        - list of node dictionaries for this entity
    """
    # Articles / free text are not concepts, only their id is kept
    if source in ('PMC', 'TEXT', 'None'):
        return [{'id:ID': identifier}]
    if identifier not in cache:
        if source == 'UMLS':
            cache[identifier] = get_concept_from_cui(identifier)
        else:
            cache[identifier] = get_concept_from_source(identifier, source)
    fetched = cache[identifier]
    # A UMLS lookup yields a single concept, any other source yields
    # an iterable of concepts
    if source == 'UMLS':
        return [_concept_node(fetched)]
    return [_concept_node(ent) for ent in fetched]


def get_concepts_from_edges(json_, key):
    """
    Get concept-specific info related to an entity from a list
    containing relations. Each subject-object in the relations
    list is expressed in another data source (MESH, DRUGBANK etc)
    and their unique identifier is provided. Also, articles and new
    kinds of sub-obj are handled.
    Concepts already fetched are read from the json file found at
    settings['cache_path']; the file is rewritten whenever new
    concepts were fetched while handling an edge.
    Input:
        - json: dict,
        json-style dictionary with a field containing
        relations
        - key : str,
        string denoting the type of medical text to read from. Used to
        find the correct paragraph in the settings.yaml file.
    Output:
        - json: dict,
        the same dictionary, where the relations list has been replaced
        by the transformed relations: one relation for every
        combination of the nodes the subject and the object were
        mapped to. Edges that raise an error are logged and skipped.
    """

    # docfield containing list of elements containing the relations
    outfield = settings['load'][key]['edge_field']
    # field containing the source of the node for the subject
    sub_source = settings['load'][key]['sub_source']
    # field containing the source of the node for the object
    obj_source = settings['load'][key]['obj_source']
    new_relations = []
    cache_path = settings['cache_path']
    # Cache used to avoid retrieving the same concepts
    try:
        with open(cache_path, 'r') as f:
            cache = json.load(f)
        # Number of entries known to be on disk
        persisted = len(cache)
    except (IOError, ValueError) as e:
        # Missing or unreadable cache file: start from scratch and force
        # a first write so that a valid file exists afterwards
        if isinstance(e, ValueError):
            time_log(e)
        cache = {}
        persisted = -1
    triples = json_[outfield]
    N = len(triples)
    # Last percentage reported, so each step is logged only once
    last_logged = 0
    for ii, triple in enumerate(triples):
        print(triple)
        try:
            triple_subj = _nodes_from_file_cache(
                triple['s'], sub_source, cache)
            triple_obj = _nodes_from_file_cache(
                triple['o'], obj_source, cache)
            for subj, obj in product(triple_subj, triple_obj):
                new_relations.append({
                    's': subj,
                    'p': triple['p'],
                    'o': obj
                })
        except Exception as e:
            # Best effort: a failing edge must not stop the rest.
            # .get is used so a missing key cannot make the handler
            # itself fail.
            time_log(e)
            time_log('S: %s | P: %s | O: %s' %
                     (triple.get('s'), triple.get('p'), triple.get('o')))
            time_log(
                'Skipped the above edge! Probably due to concept-fetching errors!'
            )
        proc = int(ii / float(N) * 100)
        if proc % 10 == 0 and proc > last_logged:
            last_logged = proc
            time_log('We are at %d/%d edges transformed -- %0.2f %%' %
                     (ii, N, proc))
        # The cache only ever grows, so a size change means there is
        # something new to save; otherwise skip the costly rewrite
        if len(cache) != persisted:
            with open(cache_path, 'w+') as f:
                json.dump(cache, f, indent=2)
            persisted = len(cache)
    json_[outfield] = new_relations
    return json_