Example 1
def update_neo4j_parallel(results):
    
    """
    Function to create/update a neo4j database according to the nodeg and edges
    generated by the create_neo4j_ functions. Change settings.yaml values in
    the neo4j group of variables to match your needs.
    Input:
        - results: 
        json-style dictionary. Check create_neo4j_ functions output for
        details
    Output: None, creates/merges the nodes to the wanted database
    """
    found = False
    for key in ['nodes', 'edges']:
        for item in results[key]:
            if item['values'] and item['type'] == 'Entity':
                found = True
                break
        if found:
            break
    if not(found):
        time_log('NO NODES/EDGES FOUND! MOVING ON!')
        return 1
        #c = raw_input()
        #if c=='q':
        #    exit()
        #else:
        #    return
    try:
        N_THREADS = int(settings['num_cores'])
    except:
        N_THREADS = cpu_count()
    # results = {'nodes': [{'type': 'Entity', 'values': entities_nodes}, {'type': 'Article', 'values': articles_nodes}],
    #            'edges': [{'type': 'relation', 'values': relations_edges}, {'type': 'mention', 'values': entity_pmc_edges}]
    #            }
    par_res = [{'nodes': [{} for j in results['nodes']], 'edges': [{} for j in results['edges']]} for i in xrange(N_THREADS)]
    # Create mini batches of the results
    for i, nodes in enumerate(results['nodes']):
        par_nodes = chunk_document_collection(nodes['values'], N_THREADS)
        for batch_num in xrange(N_THREADS):
            par_res[batch_num]['nodes'][i]['type'] = nodes['type']
            par_res[batch_num]['nodes'][i]['values'] = par_nodes[batch_num]
    for i, edges in enumerate(results['edges']):
        par_edges = chunk_document_collection(edges['values'], N_THREADS)
        for batch_num in xrange(N_THREADS):
            par_res[batch_num]['edges'][i]['type'] = edges['type']
            par_res[batch_num]['edges'][i]['values'] = par_edges[batch_num]
    len_col = " | ".join([str(len(b)) for b in par_edges])
    time_log('Will break the collection into batches of: %s  %s edges!' % (len_col, edges['type']))
    pool = Pool(N_THREADS, maxtasksperchild=1)
    res = pool.map(update_neo4j_parallel_worker, par_res)
    pool.close()
    pool.join()
    del pool
    if sum(res) == N_THREADS:
        time_log('Completed parallel update of Neo4j!')
    else:
        time_log('Something wrong with the parallel execution?')
        time_log('Returned %d instead of %d' % (sum(res), N_THREADS))
    return 1
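
For reference, a minimal sketch of the results structure this function expects, based on the commented example inside the function body; the concrete fields inside 'values' are illustrative placeholders, not the exact create_neo4j_ output:

results = {
    'nodes': [
        {'type': 'Entity', 'values': [{'id:ID': 'C0004057', 'label': 'aspirin'}]},
        {'type': 'Article', 'values': [{'id:ID': 'PMC123', 'title': 'A title'}]}
    ],
    'edges': [
        {'type': 'relation', 'values': [{':START_ID': 'C0004057', ':TYPE': 'TREATS', ':END_ID': 'C0018681'}]},
        {'type': 'mention', 'values': [{':START_ID': 'C0004057', ':END_ID': 'PMC123'}]}
    ]
}
update_neo4j_parallel(results)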
Example 2
def aggregate_relations(relations_edges):
    """
    Function to aggregate recurring entity:SEMREP_RELATION:entity relations.
    Input:
        - relations_edges: list,
        list of dicts as generated by create_neo4j_ functions
    Output:
        - relations_edges: list,
        list of dicts with aggregated values for identical edges
    """
    uniques = {}
    c = 0
    for edge in relations_edges:
        cur_key = str(edge[':START_ID'])+'_'+str(edge[':TYPE'])+'_'+str(edge[':END_ID'])
        flag = False
        if cur_key in uniques:
            if not(edge['sent_id:string[]'] in uniques[cur_key]['sent_id:string[]']):
                for field in edge.keys():
                    if not(field in [':START_ID', ':TYPE', ':END_ID']):
                        uniques[cur_key][field] = uniques[cur_key][field]+';'+edge[field]
                flag = True
        else:
            uniques[cur_key] = edge
        if flag:
            c += 1
    un_list = []
    time_log('Aggregated %d relations from %d in total' % (c, len(relations_edges)))
    for k, v in uniques.iteritems():
        un_list.append(v)
    return un_list
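
A quick illustration of the aggregation: two relations sharing the same START/TYPE/END triple but carrying different sentence ids collapse into a single edge whose array-style fields are ';'-joined (field names follow the neo4j-import conventions used throughout these examples):

edges = [
    {':START_ID': 'C1', ':TYPE': 'TREATS', ':END_ID': 'C2',
     'sent_id:string[]': 'doc1_1', 'negation:string[]': 'false'},
    {':START_ID': 'C1', ':TYPE': 'TREATS', ':END_ID': 'C2',
     'sent_id:string[]': 'doc1_4', 'negation:string[]': 'false'}
]
merged = aggregate_relations(edges)
# merged[0]['sent_id:string[]'] == 'doc1_1;doc1_4'
# merged[0]['negation:string[]'] == 'false;false'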
Example 3
def create_neo4j_results(json_, key='harvester'):
    """
    Helper function to call either the create_neo4j_harvester or the
    create_neo4j_edges function, according to the type of input.
    Input:
        - json_: dic,
        dictionary-json style generated from the parsers/extractors in the
        previous stages
        - key: str,
        string for denoting which create_neo4j_ function to use
    Output:
        - results: dic,
        json-style dictionary with keys 'nodes' and 'edges' containing
        a list of the transformed nodes and edges to be created/updated in
        neo4j. Each element in the list has a 'type' field denoting the type
    of the node/edge and a 'values' field containing the nodes/edges
    """
    if key == 'harvester':
        results = create_neo4j_harvester(json_)
    elif key == 'edges':
        results = create_neo4j_edges(json_)
    else:
        time_log('Type %s of data not yet supported!' % key)
        raise NotImplementedError
    return results
Example 4
def save_mongo(json_):
    """
    Helper function to save edges/documents to mongo.
    Input:
        - json_: dic,
        json-style dictionary generated from the transformation modules in the
        previous phase. Must make sure that there is a field named as indicated
        in settings['out']['json']['json_doc_field'], where the edges/docs
        are stored. Specifically for the articles, they are replaced if another
        item with the same id is found in the collection.
    Output:
        None, just populates the database

    """
    uri = settings['out']['mongo']['uri']
    db_name = settings['out']['mongo']['db']
    collection_name = settings['out']['mongo']['collection']
    client = pymongo.MongoClient(uri)
    db = client[db_name]
    collection = db[collection_name]
    # Output Idfield
    idfield = settings['out']['json']['json_id_field']
    docs = json_[settings['out']['json']['itemfield']]
    for i, doc in enumerate(docs):
        if idfield in doc:
            result = collection.replace_one({'id': str(doc[idfield])}, doc, True)
        elif 'p' in doc:
            result = collection.insert_one(doc)
        else:
            time_log('Unknown type to persist to mongo')
            raise NotImplementedError
        if i % 100 == 0 and i > 99:
            time_log("Process: %d -- %0.2f %%" % (i, 100*i/float(len(docs))))
    return 1
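
A standalone sketch of the upsert pattern used above, with placeholder connection details: replace_one with upsert=True (the positional True above) inserts the document when no match exists and replaces it otherwise.

import pymongo

client = pymongo.MongoClient('mongodb://localhost:27017')
coll = client['test_db']['articles']
doc = {'id': 'PMC123', 'title': 'Sample'}
coll.replace_one({'id': doc['id']}, doc, True)   # no match yet: inserts
doc['title'] = 'Sample (revised)'
coll.replace_one({'id': doc['id']}, doc, True)   # match found: replaces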
Example 5
def aggregate_mentions(entity_pmc_edges):
    """
    Function to aggregate recurring entity:MENTIONED_IN:pmc relations.
    Input:
        - entity_pmc_edges: list,
        list of dicts as generated by create_neo4j_ functions
    Output:
        - entity_pmc_edges: list,
        list of dicts with aggregated values for identical edges
    """
    uniques = {}
    c = 0
    for edge in entity_pmc_edges:
        cur_key = str(edge[':START_ID'])+'_'+str(edge[':END_ID'])
        flag = False
        if cur_key in uniques:
            uniques[cur_key]['score:float[]'] = uniques[cur_key]['score:float[]']+';'+edge['score:float[]']
            uniques[cur_key]['sent_id:string[]'] = uniques[cur_key]['sent_id:string[]']+';'+edge['sent_id:string[]']
            flag = True
        else:
            uniques[cur_key] = edge
        if flag:
            c += 1
    un_list = []
    time_log('Aggregated %d mentions from %d in total' % (c, len(entity_pmc_edges)))
    for k, v in uniques.iteritems():
        un_list.append(v)
    return un_list
Example 6
def get_collection_count(source, type):
    """
    Helper function to get total collection length.
    Input:
        - source: str, value denoting where we will read from (e.g. 'mongo')
        - type: str, value denoting what we will read (e.g. text, edges)
    Output:
        - N_collection: int,
        number of items in the collection
    """
    if source == 'file':
        inp_path = settings['load']['path']['file_path']
        # Document iterator field in the collection
        infield = settings['load'][type]['itemfield']
        with open(inp_path, 'r') as f:
            docs = ijson2.items(f, '%s.item' % infield)
            N_collection = 0
            for item in docs:
                N_collection += 1
    elif source == 'mongo':
        # input mongo variables from settings.yaml
        uri = settings['load']['mongo']['uri']
        db_name = settings['load']['mongo']['db']
        collection_name = settings['load']['mongo']['collection']
        client = pymongo.MongoClient(uri)
        db = client[db_name]
        collection = db[collection_name]
        N_collection = collection.count()
    else:
        time_log("Can't calculate total collection count for source type %s" %
                 settings['in']['source'])
        raise NotImplementedError
    return N_collection
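
A standalone sketch of the streaming count in the 'file' branch, assuming a JSON file shaped like {"documents": [{...}, ...]} and that ijson2 is simply the ijson package under an alias:

import ijson as ijson2

with open('data.json', 'r') as f:
    # iterate lazily over the array under 'documents' without loading it all
    N_collection = sum(1 for _ in ijson2.items(f, 'documents.item'))
print(N_collection)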
Example 7
    def read(self, N=None, ind_=0):
        """
        Run the corresponding parsing function and return:
        Input:
            - ind_: int, the starting point to read from
        Output:
        1) In case of the batch or streaming processing:
            - json_: dict, the corresponding read batch
            - N: int, the total number of items to iterate through
            - ind_: int, the index where the next iteration of readings
            should start from

        2) In case of loading the whole collection:
            - json_: dict, the corresponding collection
        """
        parallel_flag = str(settings['pipeline']['in']['parallel']) == 'True'
        stream_flag = str(settings['pipeline']['in']['stream']) == 'True'
        if parallel_flag or stream_flag:
            json_, ind_ = self.load(self.key, N, ind_)
            if json_:
                if self.parse:
                    json_ = self.parse(json_)
                time_log('Completed Parsing. Read: %d documents!' % len(json_[settings['out']['json']['itemfield']]))
            return json_, ind_
        else:
            json_ = self.load(self.key)
            if self.parse:
                json_ = self.parse(json_)
            time_log('Completed Parsing. Read: %d documents!' % len(json_[settings['out']['json']['itemfield']]))
            return json_
Example 8
    def __init__(self, key, parser_key, name=None):
        """
        Initialization of the class.
        Input:
            - key: str,
            string denoting what extraction task is to take place
            - parser_key: str,
            string denoting what type of input to expect
            - name: str,
            optional string for the task name to be printed
        """

        self.key = key
        self.parser_key = parser_key
        if self.key == 'semrep':
            if str(settings['pipeline']['in']['parallel']) == 'True':
                self.func = extract_semrep_parallel
                time_log('Will use multiprocessing for the semrep extraction!')
            else:
                self.func = extract_semrep
        elif self.key == 'metamap':
            self.func = extract_metamap
        elif self.key == 'reverb':
            raise NotImplementedError
        elif self.key == 'get_concepts_from_edges':
            if str(settings['pipeline']['in']['parallel']) == 'True':
                self.func = get_concepts_from_edges_parallel
            else:
                self.func = get_concepts_from_edges
        if name:
            self.name = name
        else:
            self.name = self.key
Example 9
    def read(self):
        """
        Run the corresponding parsing function and return the .json_
        dictionary result.
        """

        json_ = self.func()
        time_log('Completed Parsing. Read: %d documents!' %
                 len(json_[settings['out']['json']['json_doc_field']]))
        return json_
Example 10
def load_mongo_batches(key, N_collection, ind_=0):
    """
    Parse collection from mongo to be processed in streaming/parallel fashion.
    Fetches step = (batch_per_core x num_cores) documents starting from ind_
    and delivers them to the rest of the pipeline.
    Input:
        - key: str,
        the type of input to read
        - N_collection: int,
        total collection length
        - ind_: int,
        the starting point of the batch (or stream) to be read
    Output:
        - json_ : dic,
        json-style dictionary with a field containing
        items
    """
    # input file path from settings.yaml
    uri = settings['load']['mongo']['uri']
    db_name = settings['load']['mongo']['db']
    collection_name = settings['load']['mongo']['collection']
    client = pymongo.MongoClient(uri)
    db = client[db_name]
    collection = db[collection_name]
    # itemfield containing list of elements
    out_outfield = settings['out']['json']['itemfield']
    json_ = {out_outfield: []}
    stream_flag = str(settings['pipeline']['in']['stream']) == 'True'
    # batch size in case of a streaming environment is just one
    if stream_flag:
        step = 1
    # otherwise N_THREADS * batch_per_core
    else:
        try:
            N_THREADS = int(settings['num_cores'])
        except:
            N_THREADS = cpu_count()
        try:
            batch_per_core = int(settings['batch_per_core'])
        except:
            batch_per_core = 100
        step = N_THREADS * batch_per_core
    print ind_, step
    time_log("Will start from %d/%d and read %d items" % (ind_, N_collection, step))
    if step > N_collection:
        step = N_collection
    cur = collection.find({}, skip=ind_, limit=step)
    c = 0
    for item in cur:
        del item['_id']
        c += 1
        json_[out_outfield].append(item)
    return json_, ind_ + step
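
A hypothetical driver loop for load_mongo_batches, paging through the collection until the running index reaches the total count (process is a placeholder for the downstream trans/out phases):

N = get_collection_count('mongo', 'text')
ind_ = 0
while ind_ < N:
    batch, ind_ = load_mongo_batches('text', N, ind_)
    process(batch)  # placeholder for the rest of the pipeline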
Example 11
def populate_nodes(graph, nodes, type_):
    """
    Function that actually calls the cypher query and populates the graph
    with nodes of type_, merging on already existing nodes on their id_.
    Input:
        -graph: py2neo.Graph,
        object representing the graph in neo4j. Using py2neo.
        - nodes: list,
        list of dics containing the attributes of each node
        - type_: str,
        type of the node to be merged
    Output: None, populates the db.
    """
    c = 0
    total_rel = 0
    time_log('~~~~~~  Will create nodes of type: %s  ~~~~~~' % type_)
    for ent in nodes:
        c += 1
        quer = create_merge_query(ent, type_)
        f = graph.run(quer)
        total_rel += f.stats()['nodes_created']
        if c % 1000 == 0 and c > 999:
            time_log("Process: %d -- %0.2f %%" % (c, 100*c/float(len(nodes))))
    time_log('#%s : %d' % (type_, c))
    time_log('Finally added %d new nodes!' % total_rel) 
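
create_merge_query is not shown in these examples; for illustration only, a query of the shape it presumably emits, merging on id so that node creation stays idempotent across runs:

quer = """
MERGE (n:Entity {id: "C0004057"})
SET n.label = "aspirin", n.sem_types = "phsu"
"""
f = graph.run(quer)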
Example 12
    def run(self):
        parallel_flag = False
        stream_flag = False
        if 'parallel' in self.pipeline['in']:
            parallel_flag = True
        if 'stream' in self.pipeline['in']:
            stream_flag = True
        if parallel_flag or stream_flag:
            parser = Parser(self.pipeline['in']['source'],
                            self.pipeline['in']['type'])
            ind_ = 0
            N = get_collection_count(parser.source, parser.key)
            while ind_ < N:
                old_ind = ind_
                json_all, ind_ = parser.read(N=N, ind_=ind_)
                outfield = settings['out']['json']['itemfield']
                if json_all:
                    json_ = json_all
                    for phase in self.phases:
                        dic = self.pipeline[phase]
                        if phase == 'trans':
                            for key, value in dic.iteritems():
                                if value:
                                    extractor = Extractor(key, parser.key)
                                    json_ = extractor.run(json_)
                        if phase == 'out':
                            for key, value in sorted(dic.iteritems()):
                                if value:
                                    dumper = Dumper(key, parser.key)
                                    dumper.save(json_)
                if ind_:
                    time_log(
                        'Processed %d documents in parallel. We are at index %d!'
                        % (ind_ - old_ind, ind_))
                    proc = int(ind_ / float(N) * 100)
                    # progress report roughly every 10%
                    if proc % 10 == 0 and proc > 0:
                        time_log('~' * 50)
                        time_log(
                            'We are at %d/%d documents processed -- %0.2f %%' %
                            (ind_, N, proc))
                        time_log('~' * 50)
        else:
            parser = Parser(self.pipeline['in']['source'],
                            self.pipeline['in']['type'])
            json_ = parser.read()
            for phase in self.phases:
                dic = self.pipeline[phase]
                if phase == 'trans':
                    for key, value in dic.iteritems():
                        if value:
                            extractor = Extractor(key, parser.key)
                            json_ = extractor.run(json_)
                if phase == 'out':
                    for key, value in sorted(dic.iteritems()):
                        if value:
                            dumper = Dumper(key, parser.key)
                            dumper.save(json_)
Example 13
def update_mongo(json_):
    """
    Helper function to save the sentences found in the enriched articles in
    mongodb. Connecting to a collection according to settings and then
    creating/updating the articles with the sentences found in them.
    Input:
        - json_: dic,
        json-style dictionary generated from the semrep extractor in the
        previous phase. Must make sure that there is a field named as indicated
        in settings['out']['json']['json_doc_field'], where the documents/articles
        are stored and each document/article has a field sents, as expected
        in the output of the semrep extractor.
    Output:
        None, just populates the database

    """
    uri = settings['mongo']['uri']
    db_name = settings['mongo']['db']
    collection_name = settings['mongo']['collection']
    client = pymongo.MongoClient(uri)
    db = client[db_name]
    collection = db[collection_name]
    new = 0
    upd = 0
    docs = json_[settings['out']['json']['json_doc_field']]
    for i, doc in enumerate(docs):
        cursor = collection.find({'id': doc['id']})
        sents = [{'sent_id': sent['sent_id'], 'text': sent['sent_text']} for sent in doc['sents']]
        if cursor.count() == 0:
            collection.insert_one({'id': doc['id'], 'sentences': sents})
            new += 1
        else:
            for mongo_doc in cursor:
                cur_sent = mongo_doc['sentences']
                cur_ids = [s['sent_id'] for s in cur_sent]
                new_sent = [s for s in sents if not(s['sent_id'] in cur_ids)]
                if new_sent:
                    cur_sent.extend(new_sent)
                    mongo_doc['sentences'] = cur_sent
                    collection.replace_one({'id': doc['id']}, mongo_doc)
                    upd += 1
        if i % 100 == 0 and i > 99:
            time_log("Process: %d -- %0.2f %%" % (i, 100*i/float(len(docs))))
    time_log('Finally updated %d -- inserted %d documents!' % (upd, new))



        
Example 14
def extract_semrep(json_, key):
    """
    Task function to parse and extract concepts from json_ style dic, using
    the SemRep binary.
    Input:
        - json_ : dic,
        json-style dictionary generated from the Parse object related
        to the specific type of input
        - key : str,
        string denoting the type of medical text to read from. Used to
        find the correct paragraph in the settings.yaml file.
    Output:
        - json_ : dic,
        the previous json-style dictionary enriched with medical concepts
    """
    # outerfield for the documents in json
    if key == 'mongo':
        key = 'json'
    docfield = settings['out']['json']['itemfield']
    # textfield to read text from
    textfield = settings['out']['json']['json_text_field']
    N = len(json_[docfield])
    for i, doc in enumerate(json_[docfield]):
        print doc['id']
        text = doc[textfield]
        if len(text) > 5000:
            chunks = create_text_batches(text)
            results = {'text': text, 'sents': []}
            sent_id = 0
            # c = 0
            for chunk in chunks:
                # c += 1
                # print 'CHUNK %d' % c
                # print chunk
                # print '~'*50
                tmp = semrep_wrapper(chunk)
                for sent in tmp['sents']:
                    sent['sent_id'] = sent_id
                    sent_id += 1
                    results['sents'].append(sent)
        else:
            results = semrep_wrapper(text)
        json_[docfield][i].update(results)
        proc = int(i / float(N) * 100)
        if proc % 10 == 0 and proc > 0:
            time_log('We are at %d/%d documents -- %0.2f %%' % (i, N, proc))
    return json_
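
create_text_batches is not shown here; a minimal stand-in, assuming it only needs to split on sentence boundaries while keeping each chunk under the 5000-character threshold used above:

from nltk.tokenize import sent_tokenize

def create_text_batches(text, max_len=5000):
    chunks, cur = [], ''
    for sent in sent_tokenize(text):
        # start a new chunk when adding this sentence would exceed the limit
        if cur and len(cur) + len(sent) + 1 > max_len:
            chunks.append(cur)
            cur = ''
        cur = (cur + ' ' + sent).strip()
    if cur:
        chunks.append(cur)
    return chunks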
Example 15
    def read(self, ind_=0):
        """
        Run the corresponding parsing function and return the .json_
        dictionary result.
        """
        parallel_flag = str(settings['pipeline']['in']['parallel']) == 'True'
        stream_flag = str(settings['pipeline']['in']['stream']) == 'True'
        if parallel_flag or stream_flag:
            json_, ind_, N = self.func(ind_)
            if json_:
                time_log('Completed Parsing. Read: %d documents!' %
                         len(json_[settings['out']['json']['json_doc_field']]))
            return json_, ind_, N
        else:
            json_ = self.func()
            time_log('Completed Parsing. Read: %d documents!' %
                     len(json_[settings['out']['json']['json_doc_field']]))
            return json_
Example 16
def extract_semrep_parallel(json_, key):
    """
    Task function to parse and extract concepts from json_ style dic, using
    the SemRep binary. It uses multiprocessing for efficiency.
    Input:
        - json_ : dic,
        json-style dictionary generated from the Parse object related
        to the specific type of input
        - key : str,
        string denoting the type of medical text to read from. Used to
        find the correct paragraph in the settings.yaml file.
    Output:
        - json_ : dic,
        the previous json-style dictionary enriched with medical concepts
    """
    # outerfield for the documents in json
    docfield = settings['out']['json']['itemfield']
    N = len(json_[docfield])
    try:
        N_THREADS = int(settings['num_cores'])
    except:
        N_THREADS = cpu_count()
    batches = chunk_document_collection(json_[docfield], N_THREADS)
    len_col = " | ".join([str(len(b)) for b in batches])
    time_log('Will break the collection into batches of: %s documents!' %
             len_col)
    batches = [{docfield: batch} for batch in batches]
    data = zip(batches, [key for batch in batches])
    pool = Pool(N_THREADS, maxtasksperchild=1)
    res = pool.map(semrep_parallel_worker, data)
    pool.close()
    pool.join()
    del pool
    tmp = {docfield: []}
    for batch_res in res:
        tmp[docfield].extend(batch_res[docfield])
    for i, sub_doc in enumerate(json_[docfield]):
        for sub_doc_new in tmp[docfield]:
            if sub_doc_new['id'] == sub_doc['id']:
                json_[docfield][i].update(sub_doc_new)
                break
    time_log('Completed multiprocessing extraction!')
    return json_
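
semrep_parallel_worker is not shown in these examples; judging from the zip(batches, ...) packing above, it presumably unpacks one (batch, key) tuple and runs the serial extractor on it:

def semrep_parallel_worker(args):
    # args is one element of the zipped data list: (batch_dict, key)
    batch, key = args
    return extract_semrep(batch, key)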
Example 17
def metamap_wrapper(text):
    """
    Function-wrapper for metamap binary. Extracts concepts
    found in text.

    !!!! REMEMBER TO START THE METAMAP TAGGER AND
        WordSense DISAMBIGUATION SERVER !!!!
    
    Input:
        - text: str,
        a piece of text or sentence
    Output:
        - a dictionary with keys 'sents' (a list of per-sentence concept
        dicts) and 'sent_text' (the original text)
    """

    # Tokenize into sentences
    sents = sent_tokenize(text)
    # Load Metamap Instance
    mm = MetaMap.get_instance(settings['load']['path']['metamap'])
    concepts, errors = mm.extract_concepts(sents, range(len(sents)))
    # Keep the sentence ids
    ids = np.array([int(concept[0]) for concept in concepts])
    sentences = []
    for i in xrange(len(sents)):
        tmp = {'sent_id': i + 1, 'entities': [], 'relations': []}
        # Wanted concepts according to sentence
        wanted = np.where(ids == i)[0].tolist()
        for w_ind in wanted:
            w_conc = concepts[w_ind]
            if hasattr(w_conc, 'cui'):
                tmp_conc = {
                    'label': w_conc.preferred_name,
                    'cui': w_conc.cui,
                    'sem_types': w_conc.semtypes,
                    'score': w_conc.score
                }
                tmp['entities'].append(tmp_conc)
        sentences.append(tmp)
    if errors:
        time_log('Errors with extracting concepts!')
        time_log(errors)
    return {'sents': sentences, 'sent_text': text}
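
A usage sketch, assuming a local MetaMap installation with the tagger and WSD servers already running, as the docstring warns:

res = metamap_wrapper('Aspirin is widely used to treat headaches.')
for sent in res['sents']:
    for ent in sent['entities']:
        print('%s (%s) score=%s' % (ent['label'], ent['cui'], ent['score']))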
Example 18
    def __init__(self, source, key, name=None):
        """
        Initialization of the class.
        Attributes:
            - source: str, value denoting where we will read from (e.g 'mongo')
            - type: str, value denoting what we will read (e.g. text, edges)
            - name: str, The name is only for pretty-printing purposes.
        """

        self.source = source
        self.key = key
        parallel_flag = str(settings['pipeline']['in']['parallel']) == 'True'
        stream_flag = str(settings['pipeline']['in']['stream']) == 'True'
        if self.source == 'mongo':
            if parallel_flag or stream_flag:
                self.load = load_mongo_batches
            else:
                self.load = load_mongo
        elif self.source == 'file':
            if parallel_flag or stream_flag:
                self.load = load_file_batches
            else:
                self.load = load_file
        elif self.source == 'delete':
            self.load = parse_remove_edges
        else:
            time_log('Source to read was %s. Please change settings' %
                     self.source)
            raise NotImplementedError
        if self.key == 'text':
            self.parse = parse_text
        elif self.key == 'med_red':
            self.parse = None
        elif self.key == 'edges':
            self.parse = None
        else:
            time_log('Type to read was %s. Please change settings' % self.key)
            raise NotImplementedError
        if name:
            self.name = name
        else:
            self.name = 'Type: %s From : %s' % (self.source, self.key)
Example 19
def save_mongo(json_):
    """
    Helper function to save the sentences found in the enriched articles in
    mongodb. Connecting to a collection according to settings and then
    creating/updating the articles with the sentences found in them.
    Input:
        - json_: dic,
        json-style dictionary generated from the semrep extractor in the
        previous phase. Must make sure that there is a field named as indicated
        in settings['out']['json']['json_doc_field'], where the documents/articles
        are stored and each document/article has a field sents, as expected
        in the output of the semrep extractor.
    Output:
        None, just populates the database

    """
    uri = settings['out']['mongo']['uri']
    db_name = settings['out']['mongo']['db']
    collection_name = settings['out']['mongo']['collection']
    client = pymongo.MongoClient(uri)
    db = client[db_name]
    collection = db[collection_name]
    # Output Idfield
    idfield = settings['out']['json']['json_id_field']
    docs = json_[settings['out']['json']['json_doc_field']]
    for i, doc in enumerate(docs):
        if idfield in doc:
            result = collection.replace_one({'id': str(doc[idfield])}, doc,
                                            True)
        elif 'p' in doc:
            result = collection.replace_one(
                {
                    'p': doc['p'],
                    's': doc['s'],
                    'o': doc['o']
                }, doc, True)
        else:
            time_log('Unknown type to persist to mongo')
            raise NotImplementedError
        if i % 100 == 0 and i > 99:
            time_log("Process: %d -- %0.2f %%" %
                     (i, 100 * i / float(len(docs))))
Example 20
def cui_to_uri(api_key, cui):
    """
    Function to map from cui to uri if possible. Uses the BioOntology portal.
    Input:
        - api_key: str,
        API usage key; change it in settings.yaml
        - cui: str,
        cui of the entity we wish to map the uri
    Output:
        - the uri found in string format or None
    """

    REST_URL = "http://data.bioontology.org"
    annotations = get_json_with_api(api_key, REST_URL + "/search?include_properties=true&q=" + urllib2.quote(cui))
    try:
        return annotations['collection'][0]['@id']
    except Exception as e:
        time_log('Failed to map cui %s to a uri!' % cui)
        time_log(e)
        return None
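
get_json_with_api is not shown in these examples; a plausible sketch, assuming BioPortal's 'Authorization: apikey token=...' header convention (urllib2 matches the Python 2 style of the surrounding code):

import json
import urllib2

def get_json_with_api(api_key, url):
    # attach the API key header and decode the JSON response
    opener = urllib2.build_opener()
    opener.addheaders = [('Authorization', 'apikey token=' + api_key)]
    return json.loads(opener.open(url).read())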
Example 21
    def save(self, json_):
        if type(json_) == dict:
            if self.transform:
                results = self.transform(json_, self.type_)
            else:
                results = json_
            json_ = self.func(results)
            time_log('Completed saving data. Results saved in:\n %s' % settings['out'][self.key]['out_path'])
        else:
            time_log('Unsupported type of json to work on!')
            time_log('Task : %s  --- Type of json: %s' % (self.name, type(json_)))
            time_log(json_)
            json_ = {}
        return json_
Example 22
def populate_new_edges(graph, new_edges):
    """
    Function to create/merge an unknown type of edge.
    Input:
        - graph: py2neo.Graph,
        object representing the graph in neo4j. Using py2neo.
        - new_edges: list,
        list of dics containing the attributes of each relation
    Output: None, populates the db.
    """

    c = 0
    total_rel = 0
    # field containing the type of the node for the subject
    sub_type = settings['load']['edges']['sub_type']
    # field containing the type of the node for the object
    obj_type = settings['load']['edges']['obj_type']

    for edge in new_edges:
        c += 1
        quer = """
        MATCH (a:%s {id:"%s"}), (b:%s {id:"%s"})
        MERGE (a)-[r:%s]->(b)
        """ % (sub_type, edge[':START_ID'], obj_type, edge[':END_ID'], edge[':TYPE'],)
        f = graph.run(quer)
        total_rel += f.stats()['relationships_created']
        if c % 1000 == 0 and c > 999:
            time_log("Process: %d -- %0.2f %%" % (c, 100*c/float(len(new_edges))))
    time_log('#Edges: %d' % c)
    time_log('Finally added %d new edges!' % total_rel)
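
To make the string interpolation concrete: for a sample edge (identifiers illustrative) with sub_type 'Entity' and obj_type 'Article', the template above renders to this Cypher:

edge = {':START_ID': 'C0004057', ':TYPE': 'MENTIONED_IN', ':END_ID': 'PMC123'}
# Rendered query:
#   MATCH (a:Entity {id:"C0004057"}), (b:Article {id:"PMC123"})
#   MERGE (a)-[r:MENTIONED_IN]->(b)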
Example 23
def get_concepts_from_edges_parallel(json_, key):
    """
    Same work as get_concepts_from_edges. It uses multiprocessing
    for efficiency.
    Input:
        - json_: dict,
        json-style dictionary with a field containing
        relations
        - key : str,
        string denoting the type of medical text to read from. Used to
        find the correct paragraph in the settings.yaml file.
    Output:
        - json_: dict,
        the updated json-style dictionary where the relations
        in the list have been updated and each subject-object has been
        mapped to the corresponding concepts

    """
    outfield = settings['load'][key]['itemfield']
    N = len(json_[outfield])
    try:
        N_THREADS = int(settings['num_cores'])
    except:
        N_THREADS = cpu_count()
    batches = chunk_document_collection(json_[outfield], N_THREADS)
    len_col = " | ".join([str(len(b)) for b in batches])
    time_log('Will break the edges into batches of: %s documents!' % len_col)
    batches = [{outfield: batch} for batch in batches]
    data = zip(batches, [key for batch in batches])
    pool = Pool(N_THREADS, maxtasksperchild=1)
    res = pool.map(edges_parallel_worker, data)
    pool.close()
    pool.join()
    del pool
    json_ = {outfield: []}
    for batch_res in res:
        json_[outfield].extend(batch_res[outfield])
    time_log('Completed multiprocessing extraction!')
    return json_
Example 24
    def run(self, json):
        """
        Run the corresponding extracting function and return the .json_
        dictionary result.
        """

        if type(json) == dict:
            json_ = self.func(json, self.parser_key)
            time_log('Completed extracting using %s!' % self.name)
        else:
            time_log('Unsupported type of json to work on!')
            time_log('Task : %s  --- Type of json: %s' % (self.name, type(json)))
            time_log(json)
            json_ = {}
        return json_
Example 25
def get_collection_count(source, type):
    """
    Helper function to get total collection length.
    Input:
        - source: str, value denoting where we will read from (e.g. 'mongo')
        - type: str, value denoting what we will read (e.g. text, edges)
    Output:
        - N_collection: int,
        number of items in the collection
    """
    if source == 'mongo':
        # input mongo variables from settings.yaml
        uri = settings['load']['mongo']['uri']
        db_name = settings['load']['mongo']['db']
        collection_name = settings['load']['mongo']['collection']
        client = pymongo.MongoClient(uri)
        db = client[db_name]
        collection = db[collection_name]
        N_collection = collection.count()
    else:
        time_log("Can't calculate total collection count for source type %s" % settings['in']['source'])
        raise NotImplementedError
    return N_collection
Example 26
    def save(self, json_):
        if type(json_) == dict:
            if self.transform:
                results = self.transform(json_, self.type_)
            else:
                results = json_
            json_ = self.func(results)
            if self.key == 'mongo_sentences':
                out_p = '/'.join([
                    settings[self.key]['uri'], settings[self.key]['db'],
                    settings[self.key]['collection']
                ])
                time_log('Completed saving data. Results saved in:\n %s' %
                         out_p)
            else:
                time_log('Completed saving data. Results saved in:\n %s' %
                         settings['out'][self.key]['out_path'])
        else:
            time_log('Unsupported type of json to work on!')
            time_log('Task : %s  --- Type of json: %s' %
                     (self.name, type(json_)))
            time_log(json_)
            json_ = {}
        return json_
Example 27
def populate_mentioned_edges(graph, entity_pmc_edges):
    """
    Function to create/merge the relation edges between existing entities.
    Input:
        - graph: py2neo.Graph,
        object representing the graph in neo4j. Using py2neo.
        - entity_pmc_edges: list,
        list of dics containing the attributes of each relation
    Output: None, populates the db.
    """

    c = 0
    total_rel = 0
    for edge in entity_pmc_edges:
        c += 1
        quer = """
        Match (a:Entity {id:"%s"}), (b:Article {id:"%s"})
        MATCH (a)-[r:%s]->(b)
        WHERE "%s" in r.sent_id
        Return r;
        """ % (edge[':START_ID'], edge[':END_ID'], edge[':TYPE'],
               edge['sent_id:string[]'])
        f = graph.run(quer)
        if len(f.data()) == 0 and edge[':START_ID'] and edge[':END_ID']:
            quer = create_edge_query(edge, 'Entity', 'Article')
            # sent_s = '['
            # for i in edge['sent_id:string[]'].split(';'):
            #     sent_s += '"' + i + '"' + ','
            # sent_s = sent_s[:-1] + ']'
            # sent_res = '['
            # for i in edge['resource:string[]'].split(';'):
            #     sent_res += '"' + i + '"' + ','
            # sent_res = sent_res[:-1] + ']'
            # quer = """
            # Match (a:Entity {id:"%s"}), (b:Article {id:"%s"})
            # MERGE (a)-[r:MENTIONED_IN]->(b)
            # ON MATCH SET r.score = r.score + %s, r.sent_id = r.sent_id + %s, r.resource = r.resource + %s
            # ON CREATE SET r.score = %s, r.sent_id = %s, r.resource = %s
            # """ % (edge[':START_ID'], edge[':END_ID'],
            #        str([int(i) for i in edge['score:float[]'].split(';')]), sent_s, sent_res,
            #        str([int(i) for i in edge['score:float[]'].split(';')]), sent_s, sent_res)
            f = graph.run(quer)
            total_rel += f.stats()['relationships_created']
        if c % 1000 == 0 and c > 999:
            time_log("Process: %d -- %0.2f %%" %
                     (c, 100 * c / float(len(entity_pmc_edges))))
    time_log('#Mentions: %d' % c)
    time_log('Finally added %d new mentions!' % total_rel)
Example 28
def populate_new_edges(graph, new_edges):
    """
    Function to create/merge an unknown type of edge.
    Input:
        - graph: py2neo.Graph,
        object representing the graph in neo4j. Using py2neo.
        - new_edges: list,
        list of dics containing the attributes of each relation
    Output: None, populates the db.
    """

    c = 0
    total_rel = 0
    # field containing the type of the node for the subject
    sub_type = settings['load']['edges']['sub_type']
    # field containing the type of the node for the object
    obj_type = settings['load']['edges']['obj_type']
    for edge in new_edges:
        c += 1
        quer = """
        Match (a:%s {id:"%s"}), (b:%s {id:"%s"})
        MATCH (a)-[r:%s]->(b)
        WHERE ("%s" in r.resource)
        Return r;
        """ % (sub_type, edge[':START_ID'], obj_type, edge[':END_ID'],
               edge[':TYPE'], settings['neo4j']['resource'])
        f = graph.run(quer)
        if len(f.data()) == 0 and edge[':START_ID'] and edge[':END_ID']:
            quer = create_edge_query(edge, sub_type, obj_type)
            # sent_res = '['
            # for i in edge['resource:string[]'].split(';'):
            #     sent_res += '"' + i + '"' + ','
            # sent_res = sent_res[:-1] + ']'
            # quer = """
            # MATCH (a:%s {id:"%s"}), (b:%s {id:"%s"})
            # MERGE (a)-[r:%s]->(b)
            # ON MATCH SET r.resource = r.resource + %s
            # ON CREATE SET r.resource = %s
            # """ % (sub_type, edge[':START_ID'], obj_type, edge[':END_ID'],
            #        edge[':TYPE'], sent_res, sent_res)
            # print quer
            f = graph.run(quer)
            total_rel += f.stats()['relationships_created']
        if c % 1000 == 0 and c > 999:
            time_log("Process: %d -- %0.2f %%" %
                     (c, 100 * c / float(len(new_edges))))
    time_log('#Edges: %d' % c)
    time_log('Finally added %d new edges!' % total_rel)
Example 29
def populate_relation_edges(graph, relations_edges):
    """
    Function to create/merge the relation edges between existing entities.
    Input:
        - graph: py2neo.Graph,
        object representing the graph in neo4j. Using py2neo.
        - relations_edges: list,
        list of dics containing the attributes of each relation
    Output: None, populates the db.
    """
    c = 0
    total_rel = 0
    for edge in relations_edges:
        c += 1
        quer = """
        Match (a:Entity {id:"%s"}), (b:Entity {id:"%s"})
        MATCH (a)-[r:%s]->(b)
        WHERE "%s" in r.sent_id
        Return r;
        """ % (edge[':START_ID'], edge[':END_ID'], edge[':TYPE'], edge['sent_id:string[]'].split(';')[0])
        print quer
        f = graph.run(quer)
        if len(f.data()) == 0:
            subj_s = '['
            for i in edge['subject_sem_type:string[]'].split(';'):
                subj_s += '"' + i + '"' + ','
            subj_s = subj_s[:-1] + ']'
            obj_s = '['
            for i in edge['object_sem_type:string[]'].split(';'):
                obj_s += '"' + i + '"' + ','
            obj_s = obj_s[:-1] + ']'
            sent_s = '['
            for i in edge['sent_id:string[]'].split(';'):
                sent_s += '"' + i + '"' + ','
            sent_s = sent_s[:-1] + ']'
            neg_s = '['
            for i in edge['negation:string[]'].split(';'):
                neg_s += '"' + i + '"' + ','
            neg_s = neg_s[:-1] + ']'
            quer = """
            Match (a:Entity {id:"%s"}), (b:Entity {id:"%s"})
            MERGE (a)-[r:%s]->(b)
            ON MATCH SET r.subject_score = r.subject_score + %s, r.subject_sem_type = r.subject_sem_type + %s,
            r.object_score = r.object_score + %s, r.object_sem_type = r.object_sem_type + %s,
            r.sent_id = r.sent_id + %s, r.negation = r.negation + %s
            ON CREATE SET r.subject_score = %s, r.subject_sem_type =  %s,
            r.object_score =  %s, r.object_sem_type =  %s,
            r.sent_id =  %s, r.negation =  %s
            """ % (edge[':START_ID'], edge[':END_ID'], edge[':TYPE'], 
                   str([int(i) for i in edge['subject_score:float[]'].split(';')]), subj_s, 
                   str([int(i) for i in edge['object_score:float[]'].split(';')]), obj_s,
                 sent_s, neg_s, str([int(i) for i in edge['subject_score:float[]'].split(';')]), subj_s, 
                   str([int(i) for i in edge['object_score:float[]'].split(';')]), obj_s,
                 sent_s, neg_s)
            print quer
            print '~'*50
            f = graph.run(quer)
            total_rel += f.stats()['relationships_created']
        if c % 1000 == 0 and c > 999:
            time_log('Process: %d -- %0.2f %%' % (c, 100*c/float(len(relations_edges))))
    time_log('#Relations :%d' % c)
    time_log('Finally added %d new relations!' % total_rel)
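
The four hand-rolled loops above build Cypher list literals from ';'-separated strings; an equivalent helper (a sketch, not part of the original module) condenses the pattern:

def to_cypher_list(semicolon_sep):
    # 'ab;cd' -> '["ab","cd"]'
    return '[' + ','.join('"%s"' % v for v in semicolon_sep.split(';')) + ']'

subj_s = to_cypher_list(edge['subject_sem_type:string[]'])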
Example 30
def create_neo4j_csv(results):
    """
    Create csv's for use by the neo4j import tool. Relies on create_neo4j_ functions
    output and transforms it to suitable format for automatic importing.
    Input: 
        - results: dic,
        json-style dictionary. Check create_neo4j_ function output for
        details
    Output:
        - None just saves the documents in the allocated path as defined
        in settings.yaml 
    """
    outpath = settings['out']['csv']['out_path']
    entities_nodes = None
    articles_nodes = None
    relations_edges = None
    entity_pmc_edges = None
    other_nodes = []
    other_edges = []
    for nodes in results['nodes']:
        if nodes['type'] == 'Entity':
            entities_nodes = nodes['values']
        elif nodes['type'] == 'Article':
            articles_nodes = nodes['values']
        else:
            other_nodes.extend(nodes['values'])
    for edges in results['edges']:
        if edges['type'] == 'relation':
            relations_edges = edges['values']
        elif edges['type'] == 'mention':
            entity_pmc_edges = edges['values']
        elif edges['type'] == 'NEW':
            other_edges.extend(edges['values'])

    dic_ = {
        'entities.csv': entities_nodes,
        'articles.csv': articles_nodes,
        'other_nodes.csv': other_nodes,
        'entities_pmc.csv': entity_pmc_edges,
        'relations.csv': relations_edges,
        'other_edges.csv': other_edges
    }

    dic_fields = {
        'entities.csv': ['id:ID', 'label', 'sem_types:string[]'],
        'articles.csv': ['id:ID', 'title', 'journal', 'sent_id:string[]'],
        'other_nodes.csv': ['id:ID'],
        'entities_pmc.csv': [':START_ID', 'score:float[]', 'sent_id:string[]', ':END_ID'],
        'relations.csv': [':START_ID', 'subject_score:float[]', 'subject_sem_type:string[]', ':TYPE', 'pred_type:string[]', 'object_score:float[]', 'object_sem_type:string[]', 'sent_id:string[]', 'negation:string[]', ':END_ID'],
        'other_edges.csv': [':START_ID', ':TYPE', ':END_ID']
    }

    for k, toCSV in dic_.iteritems():
        if toCSV:
            keys = toCSV[0].keys()
            out = os.path.join(outpath, k)
            with open(out, 'wb') as output_file:
                time_log("Created file %s" % k)
                dict_writer = csv2.DictWriter(output_file, fieldnames=dic_fields[k], encoding='utf-8')
                dict_writer.writeheader()
                dict_writer.writerows(toCSV)
    time_log('Created all documents needed')