コード例 #1
0
 def normalize(self, node):
     """Pick the preferred identifier for ``node`` and prune its synonyms.

     'Preferred' is defined by the order of ``id_prefixes`` on the concept
     registered for ``node.type`` in ``self.concepts``: the first prefix for
     which the node has a synonym wins.  Within that prefix, synonyms whose
     label is not ``None`` are preferred, and ties are broken by sort order.

     Side effects on ``node``:
       * ``node.synonyms`` loses unlabeled duplicates of labeled identifiers
         and every synonym whose prefix is not in the concept's prefixes.
       * ``node.id`` is set to the chosen identifier (if any prefix matched).
       * ``node.name`` is set to the chosen label only when that label is
         neither ``None`` nor ``''``, so a missing label never erases an
         existing name.
     """
     # If the same identifier appears both with and without a label, drop the
     # unlabeled (None or empty-string) duplicate.
     labels_by_id = defaultdict(list)
     for labeledid in node.synonyms:
         labels_by_id[labeledid.identifier].append(labeledid.label)
     for lid, labels in labels_by_id.items():
         if len(labels) > 1 and (None in labels):
             node.synonyms.remove(LabeledID(identifier=lid, label=None))
         if len(labels) > 1 and ('' in labels):
             node.synonyms.remove(LabeledID(identifier=lid, label=''))
     # Prefixes allowed for this node type, in priority order.
     type_curies = self.concepts.get(node.type).id_prefixes
     # Group the remaining synonyms by their prefix.
     synonyms_by_curie = defaultdict(list)
     for s in node.synonyms:
         synonyms_by_curie[Text.get_curie(s.identifier)].append(s)
     for type_curie in type_curies:
         potential_identifiers = synonyms_by_curie.get(type_curie, [])
         if not potential_identifiers:
             continue
         if len(potential_identifiers) > 1:
             ids_with_labels = [
                 pi for pi in potential_identifiers if pi.label is not None
             ]
             if ids_with_labels:
                 potential_identifiers = ids_with_labels
             # Sort so the choice among several candidates is deterministic.
             potential_identifiers.sort()
         best = potential_identifiers[0]
         node.id = best.identifier
         # Only replace the name if we actually have a label.
         if best.label not in (None, ''):
             node.name = best.label
         break
     # Remove any synonyms with extraneous prefixes.  The point of this is not so much to remove
     # unknown prefixes, as to make sure that if we got e.g. a meddra, and we downcast it to a disease,
     # that we don't end up with HP's in the equivalent ids.
     bad_synonyms = set()
     for synonym in node.synonyms:
         if isinstance(synonym, LabeledID):
             prefix = Text.get_curie(synonym.identifier)
         else:
             prefix = Text.get_curie(synonym)
         if prefix not in type_curies:
             bad_synonyms.add(synonym)
     # Removal is a separate pass so node.synonyms is not mutated while iterated.
     for bs in bad_synonyms:
         node.synonyms.remove(bs)
     if node.id.startswith('DOID'):
         logger.warning("We are ending up with a DOID here")
         logger.warning(node.id)
         logger.warning(node.synonyms)
         logger.warning(node.type)
コード例 #2
0
def synonymize(node,gt):
    """Return HGNC synonyms for a gene node.

    If the node id has a UniProtKB prefix, the ids returned by
    ``gt.uniprot.get_synonyms`` are added to ``node.synonyms`` (with empty
    labels) and the first of them becomes ``node.id``.  When the id still has
    a UniProtKB prefix afterwards, an empty set is returned; otherwise the
    result of ``gt.hgnc.get_synonyms`` for the (possibly updated) id.

    Raises:
        Exception: if ``node.type`` is not ``node_types.GENE``.
    """
    if node.type != node_types.GENE:
        raise Exception("Incorrect node type")
    if Text.get_curie(node.id).upper() == 'UNIPROTKB':
        mapped_ids = gt.uniprot.get_synonyms(node.id)
        if len(mapped_ids) > 0:
            node.synonyms.update(
                [LabeledID(identifier=mapped, label='') for mapped in mapped_ids])
            node.id = mapped_ids[0]
    # Re-check the prefix: node.id may have been replaced just above.
    if Text.get_curie(node.id).upper() == 'UNIPROTKB':
        return set()
    return gt.hgnc.get_synonyms(node.id)
コード例 #3
0
    def map_concept_types(self, thing, object_type=None):
        """Expand a high level concept into the concrete types our data sources understand.

        Resolution order:
          1. ``self.guess_type`` on ``thing.identifier`` when both are truthy.
          2. Otherwise the ``self.concepts`` entry for ``thing.node_type``,
             with each element passed through ``self.vocab``.
          3. Otherwise the ``self.concepts`` entry for ``object_type``,
             defaulting to ``[object_type]``.
        Whatever was found is then replaced by ``[self.make_up_curie(prefix)]``
        when ``thing`` has an identifier with a non-empty prefix.
        """
        # First attempt: derive the type from the identifier itself.
        the_type = None
        if thing and thing.identifier:
            the_type = self.guess_type(thing.identifier)

        # Second attempt: candidate types from the (abstract) node type.
        if thing and not the_type:
            the_type = self.concepts.get(thing.node_type, None)
            if the_type:
                the_type = [self.vocab.get(t, t) for t in the_type]

        # A lone string is wrapped so the return type is always a list.
        if isinstance(the_type, str):
            the_type = [the_type]

        if the_type:
            result = the_type
        else:
            result = self.concepts.get(object_type, [object_type])

        curie = None
        if thing:
            curie = Text.get_curie(thing.identifier)
        if curie:
            result = [self.make_up_curie(curie)]

        return result
コード例 #4
0
def get_synonyms_with_curie_check(identifier, gt, distance=2):
    """Ask OXO for synonymous curies and labels, but only for prefixes OXO accepts.

    Returns an empty set when ``gt.oxo.is_valid_curie_prefix`` rejects the
    identifier's prefix; otherwise the result of
    ``gt.oxo.get_synonymous_curies_and_labels`` at the given ``distance``.
    """
    prefix = Text.get_curie(identifier)
    if not gt.oxo.is_valid_curie_prefix(prefix):
        return set()
    return gt.oxo.get_synonymous_curies_and_labels(identifier,
                                                   distance=distance)
コード例 #5
0
def future_test_disease_normalization(rosetta):
    """A DOID disease node should gain MONDO synonyms and end up with a MONDO id."""
    disease = KNode('DOID:4325', type=node_types.DISEASE)
    found = synonymize(disease, rosetta.core)
    print(found)
    disease.add_synonyms(found)
    # At least one MONDO synonym must have been attached.
    assert len(disease.get_synonyms_by_prefix('MONDO')) > 0
    assert Text.get_curie(disease.id) == 'MONDO'
コード例 #6
0
def test_disease_to_symptom(hetio):
    """Phenotypes for DOID:8778 must all be MESH ids and include MESH:D012877."""
    disease = KNode('DOID:8778', type=node_types.DISEASE)
    relations = hetio.disease_to_phenotype(disease)
    found_ids = [target.id for _, target in relations]
    # Every returned identifier must carry the MESH prefix.
    for found in found_ids:
        assert Text.get_curie(found) == 'MESH'
    assert 'MESH:D012877' in found_ids
コード例 #7
0
def test_gene_to_anatomy(hetio):
    """NCBIGENE:83752 should map to between 11 and 19 UBERON anatomy nodes."""
    gene = KNode('NCBIGENE:83752', type=node_types.GENE)
    relations = hetio.gene_to_anatomy(gene)
    assert 10 < len(relations) < 20
    found_ids = [target.id for _, target in relations]
    # Every returned identifier must carry the UBERON prefix.
    for found in found_ids:
        assert Text.get_curie(found) == 'UBERON'
    assert 'UBERON:0001007' in found_ids
コード例 #8
0
def test_gene_to_process(biolink):
    """The gene/function endpoint currently returns nothing for HGNC:6432.

    When this test fails, it will indicate that monarch fixed the mapping in
    the gene/function endpoint.  At that point, change the length check to
    ``> 0`` and take out all the UniProt jazz in the client.
    """
    kit_gene = KNode('HGNC:6432', type=node_types.GENE)
    found = biolink.gene_get_process_or_function(kit_gene)
    assert len(found) == 0
    # Shape checks for whenever results do start coming back.
    for _, target in found:
        assert target.type == node_types.BIOLOGICAL_PROCESS_OR_ACTIVITY
        assert Text.get_curie(target.id) == "GO"
コード例 #9
0
def test_gene_to_disease(hetio):
    """NCBIGENE:3855 should map to between 11 and 19 DOID diseases, including DOID:4606."""
    # KRT7 associated with bile duct cancer?
    gene = KNode('NCBIGENE:3855', type=node_types.GENE)
    relations = hetio.gene_to_disease(gene)
    assert 10 < len(relations) < 20
    found_ids = [target.id for _, target in relations]
    # Every returned identifier must carry the DOID prefix.
    for found in found_ids:
        assert Text.get_curie(found) == 'DOID'
    assert 'DOID:4606' in found_ids
コード例 #10
0
def test_anatomy_to_gene(hetio):
    """UBERON:0001007 should map only to NCBIGENE gene nodes, including NCBIGENE:83752."""
    anatomy = KNode('UBERON:0001007', type=node_types.ANATOMY)
    relations = hetio.anatomy_to_gene(anatomy)
    # First pass: every target is typed as a gene.
    found_types = [target.type for _, target in relations]
    for found_type in found_types:
        assert found_type == node_types.GENE
    # Second pass: every target id carries the NCBIGENE prefix.
    found_ids = [target.id for _, target in relations]
    for found in found_ids:
        assert Text.get_curie(found) == 'NCBIGENE'
    assert 'NCBIGENE:83752' in found_ids
コード例 #11
0
def test_mondo_synonymization_2(rosetta):
    """MONDO:0005737 should pick up exactly one DOID and at least one MESH synonym."""
    disease = KNode('MONDO:0005737', type=node_types.DISEASE)
    found = synonymize(disease, rosetta.core)
    assert len(found) > 1
    disease.add_synonyms(found)
    assert len(disease.get_synonyms_by_prefix('DOID')) == 1
    assert len(disease.get_synonyms_by_prefix('MESH')) > 0
    # The node keeps its MONDO identifier.
    assert Text.get_curie(disease.id) == 'MONDO'
コード例 #12
0
 def get_omni_identifier(self, node):
     """Return ``node.id`` if its prefix is in ``self.prefixes``, else ``None``.

     An unknown prefix is reported at debug level together with the node's
     id, type and synonyms.
     """
     # Let's start with just the 'best' identifier
     if Text.get_curie(node.id) in self.prefixes:
         return node.id
     logger.debug("What kinda tomfoolery is this?")
     logger.debug(f"{node.id} {node.type}")
     logger.debug(f"{node.synonyms}")
     return None
コード例 #13
0
 def enhance(self):
     """Enhance nodes,edges with good labels and properties.

     Every node of ``self.graph`` is passed to ``prepare_node_for_output``.
     A node whose identifier has the ``DOID`` prefix is treated as fatal: its
     identifier is printed and ``SystemExit`` is raised.
     """
     # TODO: it probably makes sense to push this stuff into the KNode itself
     # Function-scope import kept from the original, hoisted out of the loop
     # because it is loop-invariant.
     from greent.util import Text
     self.logger.debug('Enhancing nodes with labels')
     for node in self.graph.nodes():
         if Text.get_curie(node.identifier) == 'DOID':
             print('NOOO {}'.format(node.identifier))
             # Raise SystemExit directly: the exit() builtin comes from the
             # site module and is meant for the interactive shell only.
             raise SystemExit
         prepare_node_for_output(node, self.rosetta.core)
コード例 #14
0
def test_disease_to_phenotypes(biolink):
    """DOID:2841 (asthma) should yield 91-109 HP phenotypes, including HP:0012653."""
    asthma = KNode('DOID:2841', type=node_types.DISEASE)
    found = biolink.disease_get_phenotype(asthma)
    assert 90 < len(found) < 110
    found_ids = [target.id for _, target in found]
    # Every returned identifier must carry the HP prefix.
    for found_id in found_ids:
        assert Text.get_curie(found_id) == 'HP'
    # acute severe asthma should be in there.
    assert 'HP:0012653' in found_ids
コード例 #15
0
def synonymize(node,gt):
    """Gather synonyms for a node from MONDO (for MONDO ids only) and then OXO.

    Returns the union of both sources as a set.  For a MONDO id the MONDO
    synonyms are also written onto ``node.synonyms`` before OXO is consulted.
    """
    prefix = Text.get_curie(node.id)
    collected = set()
    if prefix == 'MONDO':
        collected.update(synonymize_with_MONDO(node,gt))
        # Deliberate: although the caller adds the synonyms to the node as
        # well, they are attached here first so the OXO synonymizer sees them.
        node.synonyms.update(collected)
    collected.update(synonymize_with_OXO(node,gt))
    return collected
コード例 #16
0
 def get_omni_identifier(self, node):
     """Build the URI form of ``node.id`` from ``self.prefix_to_uri``.

     Returns the mapped URI base concatenated with the un-prefixed part of
     ``node.id``, or ``None`` (after logging warnings with the node's id,
     type and synonyms) when the prefix has no entry in
     ``self.prefix_to_uri``.
     """
     # Let's start with just the 'best' identifier
     prefix = Text.get_curie(node.id)
     if prefix not in self.prefix_to_uri:
         # Logger.warn is a deprecated alias; use warning().
         logger.warning("What kinda tomfoolery is this?")
         logger.warning(f"{node.id} {node.type}")
         logger.warning(f"{node.synonyms}")
         return None
     return f'{self.prefix_to_uri[prefix]}{Text.un_curie(node.id)}'
コード例 #17
0
def double_check_for_mesh(node, new_synonyms, gt):
    """Make sure the node has a MESH synonym, searching further afield if not.

    If neither ``node.synonyms`` nor ``new_synonyms`` contains an entry with
    the ``MESH`` prefix, every known synonym is looked up through
    ``get_particular_synonyms`` (distance 3) and the MESH ids found are added
    to the node.  Returns ``None`` in all cases.
    """
    known = set(node.synonyms)
    known.update(new_synonyms)
    if any(Text.get_curie(entry) == 'MESH' for entry in known):
        return
    # No MESH among the known synonyms: ask for MESH equivalents of each one.
    mesh_ids = set()
    for entry in known:
        mesh_ids.update(get_particular_synonyms(entry, 'MESH', gt, distance=3))
    node.add_synonyms(mesh_ids)
コード例 #18
0
def synonymize_with_UniChem(node, gt):
    """Look up UniChem equivalents for the node's chemical synonyms.

    Only synonyms with a CHEMBL, CHEBI, DRUGBANK or PUBCHEM prefix are sent
    to ``gt.unichem.get_synonyms``.  Each id returned is wrapped in a
    ``LabeledID`` carrying the label of the synonym it was derived from.
    The node itself is not modified; the new synonyms are returned as a set.
    """
    logger.debug(" UniChem: {}".format(node.id))
    unichem_prefixes = ('CHEMBL', 'CHEBI', 'DRUGBANK', 'PUBCHEM')
    found = set()
    for known in node.synonyms:
        if Text.get_curie(known.identifier) not in unichem_prefixes:
            continue
        mapped_ids = gt.unichem.get_synonyms(known.identifier)
        found.update(
            [LabeledID(identifier=mapped, label=known.label)
             for mapped in mapped_ids])
    return found
コード例 #19
0
def test_mondo_synonymization(rosetta):
    """MONDO:0001982 should resolve to DOID:14504 and two specific MeSH ids."""
    # Niemann Pick Disease (not type C)
    disease = KNode('MONDO:0001982', type=node_types.DISEASE)
    found = synonymize(disease, rosetta.core)
    assert len(found) > 10
    disease.add_synonyms(found)
    doid_ids = disease.get_synonyms_by_prefix('DOID')
    assert len(doid_ids) == 1
    assert doid_ids.pop() == 'DOID:14504'
    mesh_ids = disease.get_synonyms_by_prefix('MESH')
    assert len(mesh_ids) == 2
    for expected in ('MeSH:D009542', 'MeSH:D052556'):
        assert expected in mesh_ids
    # The node keeps its MONDO identifier.
    assert Text.get_curie(disease.id) == 'MONDO'
コード例 #20
0
    def make_node(self, json_node):
        """Build a ``KNode`` from one node record, or ``None`` if it has no id.

        Reads the ``'id'``, ``'name'`` and ``'categories'`` keys of
        ``json_node``.  SMPDB identifiers are passed through
        ``self.normalize_smpdb_ids``.  The node type is looked up in
        ``self.concepts_hmdb2robo`` using the first category; a warning is
        logged when more than one category is present.
        """
        identifier = json_node['id']
        if identifier == '' or identifier is None:
            return None
        label = json_node['name']
        # Named ``categories`` so the local does not shadow the project's
        # ``node_types`` module name.
        categories = json_node['categories']
        if len(categories) > 1:
            logger.warning("Multiple Node Types from HMDB")
        # normalize smp ids
        if Text.get_curie(identifier) == 'SMPDB':
            identifier = self.normalize_smpdb_ids(identifier)
        node_type = self.concepts_hmdb2robo[categories[0]]

        return KNode(identifier, name=label, type=node_type)
コード例 #21
0
 def add_chemotext_terms(self,nodes):
     """For each mesh term in a node, find out what chemotext calls that thing so we can query for it.

     For every synonym of every node whose prefix is ``MESH``, the bare id is
     looked up via ``self.ctext.get_chemotext_term_from_meshid``.  Terms that
     are found are appended to ``self.identifier_to_label[node.id]``; misses
     are logged as warnings on the ``'application'`` logger.
     """
     # Fetch the logger once instead of on every log call.
     app_logger = logging.getLogger('application')
     app_logger.debug('{} nodes'.format(len(nodes) ))
     for node in nodes:
         app_logger.debug('node: {}'.format(node.id) )
         mesh_identifiers = [s for s in node.synonyms if Text.get_curie(s)=='MESH']
         for mesh_id in mesh_identifiers:
             app_logger.debug('  mesh_id: {}'.format(mesh_id) )
             bare_id = Text.un_curie(mesh_id)
             cterm = self.ctext.get_chemotext_term_from_meshid( bare_id )
             if cterm is None:
                 # warning() replaces the deprecated warn(); arguments are
                 # passed lazily so formatting only happens if emitted.
                 app_logger.warning("  Cannot find chemotext synonym for %s (%s) %s", bare_id, mesh_id, node.id)
             else:
                 app_logger.debug('  node: {}, label: {}, chemotext: {}'.format(node.id, bare_id, cterm) )
                 self.identifier_to_label[node.id].append(cterm)
コード例 #22
0
 def postgres_get_shared_pmids(self, id1, id2):
     """Return the pubmed ids that the two curies have in common.

     Joins the ``omnicorp`` tables named after each curie's prefix on
     ``pubmedid`` and returns the first column of every row as a list.
     Also updates the timing counters ``self.total_pair_call`` and
     ``self.npair``, logging a summary every 100 calls.
     """
     prefix1 = Text.get_curie(id1)
     prefix2 = Text.get_curie(id2)
     start = datetime.datetime.now()
     # NOTE(review): the table names are interpolated straight from the curie
     # prefixes (placeholders cannot bind identifiers).  Confirm that the ids
     # reaching this method always come from trusted sources.
     statement = f'''SELECT a.pubmedid 
        FROM omnicorp.{prefix1} a
        JOIN omnicorp.{prefix2} b ON a.pubmedid = b.pubmedid
        WHERE a.curie = %s
        AND b.curie = %s '''
     cur = self.conn.cursor()
     try:
         cur.execute(statement, (id1, id2))
         pmids = [x[0] for x in cur.fetchall()]
     finally:
         # Always release the cursor, even when the query raises.
         cur.close()
     end = datetime.datetime.now()
     self.total_pair_call += (end - start)
     logger.debug(
         f'Found {len(pmids)} shared ids in {end-start}. Total {self.total_pair_call}'
     )
     self.npair += 1
     if self.npair % 100 == 0:
         logger.info(
             f'NCalls: {self.npair} Total time: {self.total_pair_call}  Avg Time: {self.total_pair_call/self.npair}'
         )
     return pmids
コード例 #23
0
def test_gene_to_disease(biolink):
    """What do we get back for HBB"""
    hbb = KNode('HGNC:4827', type=node_types.GENE)
    relations = biolink.gene_get_disease(hbb)
    assert 20 < len(relations) < 40
    found_ids = [target.id for _, target in relations]
    # Every returned identifier must carry the MONDO prefix.
    for found in found_ids:
        assert Text.get_curie(found) == 'MONDO'
    # Sickle cell should be in there.
    assert 'MONDO:0011382' in found_ids
    # Check the standardized predicate attached to the edges.
    predicates = [edge.standard_predicate for edge, _ in relations]
    predicate_ids = {p.identifier for p in predicates}
    predicate_labels = {p.label for p in predicates}
    assert 'RO:0002607' in predicate_ids
    assert 'gene_associated_with_condition' in predicate_labels
コード例 #24
0
def synonymize(node, gt):
    """Gather synonyms for a node from UniChem and OXO.

    For a CHEMBL id UniChem runs first and its results are added to the node
    before OXO is consulted; for any other prefix OXO runs first, then
    UniChem, and the node is not modified here.  Returns the union as a set.
    """
    logger.debug("Synonymize: {}".format(node.id))
    prefix = Text.get_curie(node.id)
    collected = set()
    if prefix != 'CHEMBL':
        collected.update(synonymize_with_OXO(node, gt))
        collected.update(synonymize_with_UniChem(node, gt))
        return collected
    collected.update(synonymize_with_UniChem(node, gt))
    # OXO is going to troll the node's synonyms, so we want to add them now
    node.add_synonyms(collected)
    collected.update(synonymize_with_OXO(node, gt))
    return collected
コード例 #25
0
ファイル: hetio.py プロジェクト: stevencox/robokop-interfaces
 def gene_to_disease(self, gene):
     """Return ``(edge, disease_node)`` pairs for diseases linked to ``gene``.

     Only identifiers with an HGNC, UNIPROT or PHAROS prefix are queried;
     anything else yields an empty list.  The un-prefixed identifier is
     matched against ``g.name`` in the query, each row becomes an edge with
     predicate ``'affects'`` and a ``KNode`` of type ``node_types.DISEASE``.
     """
     if Text.get_curie(gene.identifier) not in ('HGNC', 'UNIPROT', 'PHAROS'):
         return []
     # NOTE(review): the identifier is formatted directly into the query
     # text; confirm that it can never contain a quote character.
     result = self.query(
         "MATCH (d:Disease)-[a1]-(g:Gene) WHERE g.name='{0}' RETURN a1,d".
         format(Text.un_curie(gene.identifier)),
         labels=['Disease'])
     # The leftover debug loop that printed the whole result once per row was
     # removed; the result is now traversed exactly once.
     return [(self.get_edge({'res': r}, predicate='affects'),
              KNode(r['identifier'], node_types.DISEASE)) for r in result]
コード例 #26
0
   def get_neighbor(self, input_id, output_type, subject=True):
       """Query the triplestore for entities of ``output_type`` linked to ``input_id``.

       Builds a SPARQL template over the ``nonredundant`` graph in which
       ``input_id`` is the subject of the triple (``subject=True``) or its
       object (``subject=False``), and restricts ``?output_id`` to subclasses
       of the root class registered for ``output_type`` in ``parents``.

       Returns whatever ``self.triplestore.query_template`` returns for the
       requested outputs ``output_id``, ``output_label``, ``p`` and ``pLabel``.

       Raises:
           KeyError: if ``output_type`` has no entry in ``parents``.
       """
       # Root ontology class used as the subClassOf constraint per output type.
       parents = {
           node_types.ANATOMICAL_ENTITY:
           "<http://purl.obolibrary.org/obo/UBERON_0001062>",
           node_types.DISEASE:
           "<http://purl.obolibrary.org/obo/MONDO_0000001>",
           node_types.MOLECULAR_ACTIVITY:
           "<http://purl.obolibrary.org/obo/GO_0003674>",
           node_types.BIOLOGICAL_PROCESS:
           "<http://purl.obolibrary.org/obo/GO_0008150>",
           node_types.CHEMICAL_SUBSTANCE:
           "<http://purl.obolibrary.org/obo/CHEBI_24431>",
           node_types.PHENOTYPIC_FEATURE:
           "<http://purl.obolibrary.org/obo/HP_0000118>"
       }
       # Declare a SPARQL prefix matching the input id's own prefix so the
       # curie can be used directly in the query.
       pref = Text.get_curie(input_id)
       obo_prefix = f'PREFIX {pref}: <http://purl.obolibrary.org/obo/{pref}_>'
       text = """
       PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
       """ + obo_prefix + """
       select distinct ?output_id ?output_label ?p ?pLabel 
       from <http://reasoner.renci.org/nonredundant>
       from <http://reasoner.renci.org/ontology>
       where {
           graph <http://reasoner.renci.org/nonredundant> {
       """
       # The direction flag decides which side of the triple the input sits on.
       if subject:
           text += '	 $input_id ?p ?output_id .'
       else:
           text += '  ?output_id ?p $input_id .'
       text += """
           }
           graph <http://reasoner.renci.org/ontology/closure> {
               ?output_id rdfs:subClassOf $parent .
           }
           ?output_id rdfs:label ?output_label .
 			?p rdfs:label ?pLabel .
       }
       """
       # $input_id and $parent in the template are filled from ``inputs``.
       results = self.triplestore.query_template(
           inputs={
               'input_id': input_id,
               'parent': parents[output_type]
           },
           outputs=['output_id', 'output_label', 'p', 'pLabel'],
           template_text=text)
       return results
コード例 #27
0
def test_drug_get_gene_other_table(pharos):
    """pharos should find the CHEMBL id among the synonyms and return HGNC genes."""
    drug = KNode('DB:FakeyName', type=node_types.CHEMICAL_SUBSTANCE)
    chembl_synonym = LabeledID(identifier='CHEMBL:CHEMBL3658657', label='blahbalh')
    drug.add_synonyms([chembl_synonym])
    found = pharos.drug_get_gene(drug)
    # we get results
    assert len(found) > 0
    # They are all gene nodes, and nothing else.
    found_types = {target.type for _, target in found}
    assert node_types.GENE in found_types
    assert len(found_types) == 1
    # All of the ids should be HGNC
    found_ids = [target.id for _, target in found]
    found_prefixes = {Text.get_curie(found_id) for found_id in found_ids}
    assert 'HGNC' in found_prefixes
    assert len(found_prefixes) == 1
    # NOTE(review): the original comment named PTGS2 (COX2) (HGNC:9605), but
    # the asserted id is HGNC:6871 - confirm which gene is intended.
    assert 'HGNC:6871' in found_ids
コード例 #28
0
 def get_synonyms(self, identifier):
     """Return the set of curies that the service behind ``self.url`` lists for ``identifier``.

     An empty set is returned when the identifier's prefix is not in
     ``self.curie_to_sourceid`` or when the response contains ``'error'``.
     Response entries whose ``src_id`` is not in ``self.sourceid_to_curie``
     are skipped.
     """
     prefix = Text.get_curie(identifier)
     if prefix not in self.curie_to_sourceid:
         return set()
     url = "{0}/src_compound_id/{1}/{2}".format(
         self.url, Text.un_curie(identifier), self.curie_to_sourceid[prefix])
     response = self.query(url)
     if 'error' in response:
         return set()
     found = set()
     for entry in response:
         source_id = entry['src_id']
         if source_id not in self.sourceid_to_curie:
             continue
         target_prefix = self.sourceid_to_curie[source_id]
         found.add(f'{target_prefix}:{entry["src_compound_id"]}')
     return found
コード例 #29
0
 def get_out_by_in(self,
                   input_node,
                   output_type,
                   prefixes,
                   subject=True,
                   object=True):
     """Return ``(edge, output_node)`` pairs for neighbours of ``input_node``.

     Every synonym of ``input_node`` with one of ``prefixes`` is sent to
     ``self.get_neighbor`` for each requested direction.  Rows are
     de-duplicated per direction on ``(predicate, output_id)``.

     Args:
         input_node: node whose synonyms are queried.
         output_type: node type of the neighbours to return.
         prefixes: curie prefixes selecting which synonyms to query.
         subject: include triples where the input is the subject.
         object: include triples where the input is the object.

     Both flags default to True, which queries both directions.  Each edge
     records the curie that actually produced its row as the input id.
     """
     returnresults = []
     # Name of the calling function, recorded on each edge.
     caller = f'uberongraph.{inspect.stack()[1][3]}'
     curies = set()
     for pre in prefixes:
         curies.update(input_node.get_synonyms_by_prefix(pre))
     directions = []
     if subject:
         directions.append('subject')
     if object:
         directions.append('object')
     for direction in directions:
         input_is_subject = direction == 'subject'
         done = set()
         for curie in curies:
             # Rows are processed while ``curie`` is still the id that was
             # queried, so the edge's input id matches its source.
             for r in self.get_neighbor(curie,
                                        output_type,
                                        subject=input_is_subject):
                 key = (r['p'], r['output_id'])
                 if key in done:
                     continue
                 predicate_curie = Text.obo_to_curie(r['p'])
                 prefix = Text.get_curie(predicate_curie)
                 # Upper-case the prefix, except for the ubergraph axioms one.
                 if prefix != 'ubergraph-axioms.ofn':
                     prefix = prefix.upper()
                 upper_cased_predicate_curie = prefix + ":" + Text.un_curie(
                     predicate_curie)
                 predicate = LabeledID(upper_cased_predicate_curie,
                                       r['pLabel'])
                 output_node = KNode(r['output_id'],
                                     type=output_type,
                                     name=r['output_label'])
                 if input_is_subject:
                     edge = self.create_edge(input_node, output_node, caller,
                                             curie, predicate)
                 else:
                     edge = self.create_edge(output_node, input_node, caller,
                                             curie, predicate)
                 done.add(key)
                 returnresults.append((edge, output_node))
     return returnresults
コード例 #30
0
 def disease_get_ancestors(self, disease_node):
     """Return ``(edge, ancestor_node)`` pairs for the superclasses of a MONDO disease.

     Nodes whose id does not have the ``MONDO`` prefix yield an empty list.
     Otherwise the triplestore is asked for every class the disease is a
     subclass of (itself restricted to subclasses of ``MONDO:0000001``),
     and each row other than the disease itself becomes a
     ``rdfs:subClassOf`` edge from the disease to the ancestor.
     """
     disease_id = disease_node.id
     if Text.get_curie(disease_id) != "MONDO":
         return []
     # Plain (non-f) string: the braces are literal SPARQL braces.
     sparql = """
     PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
     prefix MONDO: <http://purl.obolibrary.org/obo/MONDO_>
     select distinct ?output_id ?label
     where {
       graph <http://reasoner.renci.org/ontology/closure> {
         $disease_id  rdfs:subClassOf ?output_id .
         ?output_id rdfs:subClassOf MONDO:0000001 .
       }      
       
       graph <http://reasoner.renci.org/ontology>{
       ?output_id rdfs:label ?label.
       }
     }
     """
     rows = self.triplestore.query_template(
         template_text=sparql,
         inputs={'disease_id': disease_id},
         outputs=['output_id', 'label'])
     pairs = []
     for row in rows:
         ancestor = KNode(
             row['output_id'],
             label=row['label'],
             type=node_types.DISEASE_OR_PHENOTYPIC_FEATURE)
         # refrain from adding edge to the node itself
         if ancestor.id == disease_node.id:
             continue
         subclass_of = LabeledID(identifier='rdfs:subClassOf',
                                 label='subclass of')
         edge = self.create_edge(
             source_node=disease_node,
             target_node=ancestor,
             predicate=subclass_of,
             provided_by='uberongraph.disease_get_ancestors',
             input_id=disease_node.id)
         pairs.append((edge, ancestor))
     return pairs