def normalize(self, node):
    """Choose the best identifier for a node and prune its synonyms.

    Given a node, which will have many potential identifiers, choose the best
    identifier to be the node ID, where 'best' is defined by the order in
    which identifiers appear in the id prefix configurations within the
    concept model.

    The node is modified in place:

    * duplicate synonyms whose label is ``None`` or ``''`` are dropped when
      the same identifier occurs more than once,
    * ``node.id`` (and ``node.name`` when a label is available) is set from
      the first configured prefix that has at least one synonym,
    * synonyms whose prefix is not configured for ``node.type`` are removed.

    Args:
        node: object exposing ``synonyms`` (a mutable collection supporting
            ``remove``), ``type``, ``id`` and ``name``.
    """
    # If we have two synonyms with the same id, but one has no label, chuck it.
    labels_by_identifier = defaultdict(list)
    for labeled_id in node.synonyms:
        labels_by_identifier[labeled_id.identifier].append(labeled_id.label)
    for identifier, labels in labels_by_identifier.items():
        if len(labels) < 2:
            continue
        for empty_label in (None, ''):
            if empty_label in labels:
                node.synonyms.remove(
                    LabeledID(identifier=identifier, label=empty_label))

    # Prefixes allowed for this node type, in order of preference.
    type_curies = self.concepts.get(node.type).id_prefixes

    # Group the remaining synonyms by prefix so they can be looked up in
    # preference order.
    synonyms_by_curie = defaultdict(list)
    for synonym in node.synonyms:
        synonyms_by_curie[Text.get_curie(synonym.identifier)].append(synonym)

    for type_curie in type_curies:
        candidates = synonyms_by_curie.get(type_curie)
        if not candidates:
            continue
        if len(candidates) > 1:
            # Prefer candidates that carry a label; fall back to all of them.
            labeled = [c for c in candidates if c.label is not None]
            candidates = sorted(labeled or candidates)
        best = candidates[0]
        node.id = best.identifier
        # Only replace the name if we actually have a label: neither '' nor
        # None should clobber an existing name.
        if best.label:
            node.name = best.label
        break

    # Remove any synonyms with extraneous prefixes. The point of this is not
    # so much to remove unknown prefixes, as to make sure that if we got e.g.
    # a meddra, and we downcast it to a disease, that we don't end up with
    # HP's in the equivalent ids.
    bad_synonyms = set()
    for synonym in node.synonyms:
        if isinstance(synonym, LabeledID):
            prefix = Text.get_curie(synonym.identifier)
        else:
            prefix = Text.get_curie(synonym)
        if prefix not in type_curies:
            bad_synonyms.add(synonym)
    for bad_synonym in bad_synonyms:
        node.synonyms.remove(bad_synonym)

    if node.id.startswith('DOID'):
        logger.warning("We are ending up with a DOID here")
        logger.warning(node.id)
        logger.warning(node.synonyms)
        logger.warning(node.type)
def synonymize(node, gt):
    """Return gene synonyms for ``node``, swapping out a UniProtKB id first.

    When the node id has a UNIPROTKB prefix, the ids returned by
    ``gt.uniprot.get_synonyms`` are added to ``node.synonyms`` (with empty
    labels) and the first of them becomes ``node.id``. If the id is still
    UNIPROTKB afterwards an empty set is returned; otherwise the result of
    ``gt.hgnc.get_synonyms`` for the (possibly new) id is returned.

    Raises:
        Exception: if ``node.type`` is not ``node_types.GENE``.
    """
    if node.type != node_types.GENE:
        raise Exception("Incorrect node type")

    def has_uniprot_id():
        return Text.get_curie(node.id).upper() == 'UNIPROTKB'

    if has_uniprot_id():
        uniprot_synonyms = gt.uniprot.get_synonyms(node.id)
        if len(uniprot_synonyms) > 0:
            node.synonyms.update(
                [LabeledID(identifier=new_id, label='')
                 for new_id in uniprot_synonyms])
            node.id = uniprot_synonyms[0]

    # Re-check: the id may have been replaced just above.
    if has_uniprot_id():
        return set()
    return gt.hgnc.get_synonyms(node.id)
def map_concept_types(self, thing, object_type=None):
    """Expand high level concepts into concrete types our data sources understand.

    Resolution order, as implemented below:

    1. ``self.guess_type(thing.identifier)`` when ``thing`` has an identifier.
    2. Otherwise ``self.concepts[thing.node_type]``, each entry mapped through
       ``self.vocab`` (entries missing from the vocabulary are kept as-is).
    3. Otherwise ``self.concepts.get(object_type, [object_type])``.

    Whatever was found is finally replaced by
    ``[self.make_up_curie(curie)]`` when ``thing`` has a non-empty curie.

    Returns:
        The resolved types; a single string result is wrapped in a list.
    """
    # Try the CURIE approach.
    candidates = None
    if thing and thing.identifier:
        candidates = self.guess_type(thing.identifier)

    # If that didn't work, get candidate types based on the (abstract) node type.
    if thing and not candidates:
        candidates = self.concepts.get(thing.node_type, None)
        if candidates:
            # Attempt to map them down to IRIs.
            candidates = [self.vocab.get(name, name) for name in candidates]

    # If we've ended up with just one string, make it a list for conformity
    # of return type.
    if isinstance(candidates, str):
        candidates = [candidates]

    if candidates:
        result = candidates
    else:
        result = self.concepts.get(object_type, [object_type])

    curie = Text.get_curie(thing.identifier) if thing else None
    if curie:
        result = [self.make_up_curie(curie)]
    return result
def get_synonyms_with_curie_check(identifier, gt, distance=2):
    """Ask OXO for synonymous curies and labels, but only for prefixes it accepts.

    Args:
        identifier: curie to look up.
        gt: object exposing an ``oxo`` client.
        distance: forwarded to ``get_synonymous_curies_and_labels``.

    Returns:
        The OXO result, or an empty set when
        ``gt.oxo.is_valid_curie_prefix`` rejects the identifier's prefix.
    """
    prefix = Text.get_curie(identifier)
    if not gt.oxo.is_valid_curie_prefix(prefix):
        return set()
    return gt.oxo.get_synonymous_curies_and_labels(identifier,
                                                   distance=distance)
def future_test_disease_normalization(rosetta):
    """Synonymizing DOID:4325 should yield MONDO synonyms and a MONDO node id.

    The ``future_`` prefix keeps pytest from collecting this function.
    """
    disease = KNode('DOID:4325', type=node_types.DISEASE)
    found = synonymize(disease, rosetta.core)
    print(found)
    disease.add_synonyms(found)
    mondo_ids = disease.get_synonyms_by_prefix('MONDO')
    assert len(mondo_ids) > 0
    assert Text.get_curie(disease.id) == 'MONDO'
def test_disease_to_symptom(hetio):
    """Crohn's disease (DOID:8778) phenotypes are MESH ids incl. MESH:D012877."""
    crohns = KNode('DOID:8778', type=node_types.DISEASE)
    identifiers = [
        node.id for _edge, node in hetio.disease_to_phenotype(crohns)
    ]
    # Every returned identifier should carry the MESH prefix.
    assert all(Text.get_curie(ident) == 'MESH' for ident in identifiers)
    assert 'MESH:D012877' in identifiers
def test_gene_to_anatomy(hetio):
    """NCBIGENE:83752 maps to between 11 and 19 UBERON anatomy nodes."""
    gene = KNode('NCBIGENE:83752', type=node_types.GENE)
    relations = hetio.gene_to_anatomy(gene)
    assert 10 < len(relations) < 20
    identifiers = [node.id for _edge, node in relations]
    # Every returned identifier should carry the UBERON prefix.
    assert all(Text.get_curie(ident) == 'UBERON' for ident in identifiers)
    assert 'UBERON:0001007' in identifiers
def test_gene_to_process(biolink):
    """The gene/function endpoint currently returns nothing for HGNC:6432.

    When this test fails, it will indicate that monarch fixed the mapping in
    the gene/function endpoint. At that point, change to
    ``assert len(results) > 0`` and take out all the UniProt jazz in the
    client.
    """
    kit = KNode('HGNC:6432', type=node_types.GENE)
    results = biolink.gene_get_process_or_function(kit)
    assert len(results) == 0
    # Only exercised once the endpoint starts returning results.
    for _edge, process_node in results:
        assert process_node.type == node_types.BIOLOGICAL_PROCESS_OR_ACTIVITY
        assert Text.get_curie(process_node.id) == "GO"
def test_gene_to_disease(hetio):
    """KRT7 (NCBIGENE:3855) maps to 11-19 DOID diseases incl. DOID:4606."""
    krt7 = KNode('NCBIGENE:3855', type=node_types.GENE)
    relations = hetio.gene_to_disease(krt7)
    assert 10 < len(relations) < 20
    identifiers = [node.id for _edge, node in relations]
    # Every returned identifier should carry the DOID prefix.
    assert all(Text.get_curie(ident) == 'DOID' for ident in identifiers)
    assert 'DOID:4606' in identifiers
def test_anatomy_to_gene(hetio):
    """UBERON:0001007 maps only to NCBIGENE gene nodes, incl. NCBIGENE:83752."""
    anatomy = KNode('UBERON:0001007', type=node_types.ANATOMY)
    relations = hetio.anatomy_to_gene(anatomy)
    # All returned nodes are genes...
    assert all(node.type == node_types.GENE for _edge, node in relations)
    # ...identified by NCBIGENE curies.
    identifiers = [node.id for _edge, node in relations]
    assert all(Text.get_curie(ident) == 'NCBIGENE' for ident in identifiers)
    assert 'NCBIGENE:83752' in identifiers
def test_mondo_synonymization_2(rosetta):
    """MONDO:0005737 gains exactly one DOID and at least one MESH synonym."""
    disease = KNode('MONDO:0005737', type=node_types.DISEASE)
    found = synonymize(disease, rosetta.core)
    assert len(found) > 1
    disease.add_synonyms(found)
    assert len(disease.get_synonyms_by_prefix('DOID')) == 1
    assert len(disease.get_synonyms_by_prefix('MESH')) > 0
    # The node keeps its MONDO identifier.
    assert Text.get_curie(disease.id) == 'MONDO'
def get_omni_identifier(self, node):
    """Return ``node.id`` if its prefix is in ``self.prefixes``, else ``None``.

    An unrecognised prefix is reported at debug level together with the
    node's id, type and synonyms.
    """
    # Let's start with just the 'best' identifier.
    identifier = node.id
    if Text.get_curie(identifier) in self.prefixes:
        return identifier
    logger.debug("What kinda tomfoolery is this?")
    logger.debug(f"{node.id} {node.type}")
    logger.debug(f"{node.synonyms}")
    return None
def enhance(self):
    """Enhance nodes,edges with good labels and properties.

    Every node of ``self.graph`` is passed to ``prepare_node_for_output``
    together with ``self.rosetta.core``. If a node whose identifier has a
    DOID prefix is encountered, a message is printed and the process exits.

    Raises:
        SystemExit: when a DOID-prefixed node is found.
    """
    # TODO: it probably makes sense to push this stuff into the KNode itself
    self.logger.debug('Enhancing nodes with labels')
    # Imported once up front instead of on every loop iteration.
    from greent.util import Text
    for node in self.graph.nodes():
        if Text.get_curie(node.identifier) == 'DOID':
            print('NOOO {}'.format(node.identifier))
            # `exit()` is injected by the `site` module for interactive use
            # and may be missing (e.g. `python -S`); raising SystemExit
            # directly ends the process the same way with the same status.
            raise SystemExit
        prepare_node_for_output(node, self.rosetta.core)
def test_disease_to_phenotypes(biolink):
    """Asthma (DOID:2841) has 91-109 HP phenotypes, including HP:0012653."""
    asthma = KNode('DOID:2841', type=node_types.DISEASE)
    results = biolink.disease_get_phenotype(asthma)
    assert 90 < len(results) < 110
    identifiers = [node.id for _edge, node in results]
    # Every returned identifier should carry the HP prefix.
    assert all(Text.get_curie(ident) == 'HP' for ident in identifiers)
    # Acute severe asthma should be in there.
    assert 'HP:0012653' in identifiers
def synonymize(node, gt):
    """Collect synonyms for a node via MONDO (for MONDO ids) and then OXO.

    Returns:
        set: the union of the MONDO-derived synonyms (MONDO ids only) and the
        OXO-derived synonyms.
    """
    found = set()
    if Text.get_curie(node.id) == 'MONDO':
        found.update(synonymize_with_MONDO(node, gt))
        # You might think this is wrong, but it is right. Even though the
        # synonyms will get added to the node outside, we are also going to
        # add them here so that the OXO synonymizer will find them.
        node.synonyms.update(found)
    found.update(synonymize_with_OXO(node, gt))
    return found
def get_omni_identifier(self, node):
    """Build the full identifier for a node from its prefix's base URI.

    Returns:
        ``self.prefix_to_uri[prefix]`` followed by the un-curied node id, or
        ``None`` (after logging warnings with the node's id, type and
        synonyms) when the prefix has no entry in ``self.prefix_to_uri``.
    """
    # Let's start with just the 'best' identifier.
    prefix = Text.get_curie(node.id)
    if prefix not in self.prefix_to_uri:
        # `Logger.warn` is a deprecated alias; `warning` is the supported name.
        logger.warning("What kinda tomfoolery is this?")
        logger.warning(f"{node.id} {node.type}")
        logger.warning(f"{node.synonyms}")
        return None
    return f'{self.prefix_to_uri[prefix]}{Text.un_curie(node.id)}'
def double_check_for_mesh(node, new_synonyms, gt):
    """Look up MESH synonyms for a node, unless it already has one.

    The node's existing synonyms and ``new_synonyms`` are pooled. If any of
    them has a MESH prefix nothing happens. Otherwise every pooled synonym is
    passed to ``get_particular_synonyms`` (distance 3) and the combined
    result is added to the node.
    """
    known = set(node.synonyms)
    known.update(new_synonyms)
    if any(Text.get_curie(synonym) == 'MESH' for synonym in known):
        return
    # No MESH found: go looking for some.
    mesh_synonyms = set()
    for synonym in known:
        mesh_synonyms.update(
            get_particular_synonyms(synonym, 'MESH', gt, distance=3))
    node.add_synonyms(mesh_synonyms)
def synonymize_with_UniChem(node, gt):
    """Collect UniChem synonyms for a node's CHEMBL/CHEBI/DRUGBANK/PUBCHEM ids.

    Each identifier returned by ``gt.unichem.get_synonyms`` is wrapped in a
    ``LabeledID`` that reuses the label of the synonym it was derived from.
    The node itself is not modified.

    Returns:
        set: the new ``LabeledID`` synonyms.
    """
    logger.debug(" UniChem: {}".format(node.id))
    unichem_prefixes = ('CHEMBL', 'CHEBI', 'DRUGBANK', 'PUBCHEM')
    collected = set()
    for synonym in node.synonyms:
        if Text.get_curie(synonym.identifier) not in unichem_prefixes:
            continue
        collected.update([
            LabeledID(identifier=new_id, label=synonym.label)
            for new_id in gt.unichem.get_synonyms(synonym.identifier)
        ])
    return collected
def test_mondo_synonymization(rosetta):
    """Niemann Pick Disease (not type C) gets its DOID and two MeSH synonyms."""
    disease = KNode('MONDO:0001982', type=node_types.DISEASE)
    found = synonymize(disease, rosetta.core)
    assert len(found) > 10
    disease.add_synonyms(found)

    doids = disease.get_synonyms_by_prefix('DOID')
    assert len(doids) == 1
    assert doids.pop() == 'DOID:14504'

    meshes = disease.get_synonyms_by_prefix('MESH')
    assert len(meshes) == 2
    for expected in ('MeSH:D009542', 'MeSH:D052556'):
        assert expected in meshes

    # The node keeps its MONDO identifier.
    assert Text.get_curie(disease.id) == 'MONDO'
def make_node(self, json_node):
    """Build a ``KNode`` from a JSON node record.

    Args:
        json_node: mapping with the keys ``'id'``, ``'name'`` and
            ``'categories'``.

    Returns:
        A ``KNode`` typed by the first category (translated through
        ``self.concepts_hmdb2robo``), or ``None`` when the record has an
        empty or missing id. SMPDB identifiers are first passed through
        ``self.normalize_smpdb_ids``.
    """
    identifier = json_node['id']
    if identifier == '' or identifier is None:
        return None
    label = json_node['name']
    # Named `categories` rather than `node_types` so the local does not hide
    # a module-level name of that spelling.
    categories = json_node['categories']
    if len(categories) > 1:
        # Only the first category is used below.
        logger.warning("Multiple Node Types from HMDB")
    # Normalize SMPDB ids.
    if Text.get_curie(identifier) == 'SMPDB':
        identifier = self.normalize_smpdb_ids(identifier)
    node_type = self.concepts_hmdb2robo[categories[0]]
    return KNode(identifier, name=label, type=node_type)
def add_chemotext_terms(self, nodes):
    """For each mesh term in a node, find out what chemotext calls that thing
    so we can query for it.

    Every chemotext term found is appended to
    ``self.identifier_to_label[node.id]``; MESH ids without a chemotext term
    are reported with a warning.

    Args:
        nodes: sized iterable of objects exposing ``id`` and ``synonyms``.
    """
    # Look the logger up once; arguments are passed lazily so messages are
    # only formatted when the level is enabled.
    app_logger = logging.getLogger('application')
    app_logger.debug('%s nodes', len(nodes))
    for node in nodes:
        app_logger.debug('node: %s', node.id)
        mesh_identifiers = [
            synonym for synonym in node.synonyms
            if Text.get_curie(synonym) == 'MESH'
        ]
        for mesh_id in mesh_identifiers:
            app_logger.debug(' mesh_id: %s', mesh_id)
            bare_id = Text.un_curie(mesh_id)
            cterm = self.ctext.get_chemotext_term_from_meshid(bare_id)
            if cterm is None:
                app_logger.warning(
                    " Cannot find chemotext synonym for %s (%s) %s",
                    bare_id, mesh_id, node.id)
            else:
                app_logger.debug(' node: %s, label: %s, chemotext: %s',
                                 node.id, bare_id, cterm)
                self.identifier_to_label[node.id].append(cterm)
def postgres_get_shared_pmids(self, id1, id2):
    """Return the pubmed ids that the two curies have in common.

    Each curie is looked up in the ``omnicorp`` table named after its prefix
    and the two tables are joined on ``pubmedid``. Call count and cumulative
    time are tracked in ``self.npair`` / ``self.total_pair_call``, with a
    summary logged every 100 calls.

    Returns:
        list: first column of every matching row.
    """
    prefix1 = Text.get_curie(id1)
    prefix2 = Text.get_curie(id2)
    start = datetime.datetime.now()
    # NOTE(review): the table names come from the curie prefixes and are
    # interpolated into the SQL text (only the curie values are bound as
    # parameters). Confirm that callers never pass untrusted identifiers.
    statement = f'''SELECT a.pubmedid
       FROM omnicorp.{prefix1} a
       JOIN omnicorp.{prefix2} b ON a.pubmedid = b.pubmedid
       WHERE a.curie = %s
       AND b.curie = %s '''
    cur = self.conn.cursor()
    try:
        cur.execute(statement, (id1, id2))
        pmids = [row[0] for row in cur.fetchall()]
    finally:
        # Release the cursor even when the query fails.
        cur.close()
    end = datetime.datetime.now()
    self.total_pair_call += (end - start)
    logger.debug(
        f'Found {len(pmids)} shared ids in {end-start}. Total {self.total_pair_call}'
    )
    self.npair += 1
    if self.npair % 100 == 0:
        logger.info(
            f'NCalls: {self.npair} Total time: {self.total_pair_call} Avg Time: {self.total_pair_call/self.npair}'
        )
    return pmids
def test_gene_to_disease(biolink):
    """What do we get back for HBB"""
    hbb = KNode('HGNC:4827', type=node_types.GENE)
    relations = biolink.gene_get_disease(hbb)
    assert 20 < len(relations) < 40
    identifiers = [node.id for _relation, node in relations]
    # Every returned identifier should carry the MONDO prefix.
    assert all(Text.get_curie(ident) == 'MONDO' for ident in identifiers)
    # Sickle cell should be in there.
    assert 'MONDO:0011382' in identifiers
    predicates = [
        relation.standard_predicate for relation, _node in relations
    ]
    predicate_ids = {predicate.identifier for predicate in predicates}
    predicate_labels = {predicate.label for predicate in predicates}
    assert 'RO:0002607' in predicate_ids
    assert 'gene_associated_with_condition' in predicate_labels
def synonymize(node, gt):
    """Collect synonyms for a node from UniChem and OXO.

    For CHEMBL ids UniChem runs first and its results are added to the node
    before OXO runs. For every other prefix OXO runs first, then UniChem,
    and the node is not updated here.

    Returns:
        set: the union of the synonyms from both sources.
    """
    logger.debug("Synonymize: {}".format(node.id))
    synonyms = set()
    if Text.get_curie(node.id) != 'CHEMBL':
        synonyms.update(synonymize_with_OXO(node, gt))
        synonyms.update(synonymize_with_UniChem(node, gt))
        return synonyms
    synonyms.update(synonymize_with_UniChem(node, gt))
    # OXO is going to troll the node's synonyms, so we want to add them now.
    node.add_synonyms(synonyms)
    synonyms.update(synonymize_with_OXO(node, gt))
    return synonyms
def gene_to_disease(self, gene):
    """Return (edge, disease node) pairs for diseases related to a gene.

    Only genes whose identifier has an HGNC, UNIPROT or PHAROS prefix are
    queried; any other prefix yields an empty list.

    Returns:
        list: one ``(edge, KNode)`` tuple per query result, each edge built
        with the predicate ``'affects'``.
    """
    if Text.get_curie(gene.identifier) not in ['HGNC', 'UNIPROT', 'PHAROS']:
        return []
    # NOTE(review): the gene name is formatted straight into the query text;
    # confirm identifiers reaching this point are trusted.
    result = self.query(
        "MATCH (d:Disease)-[a1]-(g:Gene) WHERE g.name='{0}' RETURN a1,d".
        format(Text.un_curie(gene.identifier)),
        labels=['Disease'])
    # The leftover debug prints were dropped: they wrote every row to stdout
    # and iterated `result` before the rows were turned into edges.
    return [(self.get_edge({'res': r}, predicate='affects'),
             KNode(r['identifier'], node_types.DISEASE)) for r in result]
def get_neighbor(self, input_id, output_type, subject=True):
    """Query the triplestore for typed neighbours of ``input_id``.

    Args:
        input_id: curie; its prefix is declared as an OBO PREFIX in the query.
        output_type: node type; must be a key of the root-class table below.
        subject: when true ``input_id`` is the subject of the matched triple,
            otherwise it is the object.

    Returns:
        Whatever ``self.triplestore.query_template`` returns for the outputs
        ``output_id``, ``output_label``, ``p`` and ``pLabel``.

    Raises:
        KeyError: if ``output_type`` is not in the root-class table.
    """
    # Root class that results must be a subclass of, per output type.
    root_classes = {
        node_types.ANATOMICAL_ENTITY:
        "<http://purl.obolibrary.org/obo/UBERON_0001062>",
        node_types.DISEASE:
        "<http://purl.obolibrary.org/obo/MONDO_0000001>",
        node_types.MOLECULAR_ACTIVITY:
        "<http://purl.obolibrary.org/obo/GO_0003674>",
        node_types.BIOLOGICAL_PROCESS:
        "<http://purl.obolibrary.org/obo/GO_0008150>",
        node_types.CHEMICAL_SUBSTANCE:
        "<http://purl.obolibrary.org/obo/CHEBI_24431>",
        node_types.PHENOTYPIC_FEATURE:
        "<http://purl.obolibrary.org/obo/HP_0000118>",
    }
    pref = Text.get_curie(input_id)
    obo_prefix = f'PREFIX {pref}: <http://purl.obolibrary.org/obo/{pref}_>'

    header = " PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> "
    select_clause = (
        " select distinct ?output_id ?output_label ?p ?pLabel"
        " from <http://reasoner.renci.org/nonredundant>"
        " from <http://reasoner.renci.org/ontology>"
        " where {"
        " graph <http://reasoner.renci.org/nonredundant> {"
        " ")
    # The direction of the triple depends on which side the input is on.
    if subject:
        triple = ' $input_id ?p ?output_id .'
    else:
        triple = ' ?output_id ?p $input_id .'
    closing = (
        " }"
        " graph <http://reasoner.renci.org/ontology/closure> {"
        " ?output_id rdfs:subClassOf $parent ."
        " }"
        " ?output_id rdfs:label ?output_label ."
        " ?p rdfs:label ?pLabel ."
        " }"
        " ")
    template = ''.join((header, obo_prefix, select_clause, triple, closing))

    return self.triplestore.query_template(
        inputs={
            'input_id': input_id,
            'parent': root_classes[output_type]
        },
        outputs=['output_id', 'output_label', 'p', 'pLabel'],
        template_text=template)
def test_drug_get_gene_other_table(pharos):
    """Pharos should find the CHEMBL id among the node's synonyms."""
    drug = KNode('DB:FakeyName', type=node_types.CHEMICAL_SUBSTANCE)
    drug.add_synonyms(
        [LabeledID(identifier='CHEMBL:CHEMBL3658657', label='blahbalh')])
    results = pharos.drug_get_gene(drug)
    # We get results...
    assert len(results) > 0
    # ...and they are all gene nodes...
    assert {node.type for _edge, node in results} == {node_types.GENE}
    # ...identified exclusively by HGNC curies.
    identifiers = [node.id for _edge, node in results]
    assert {Text.get_curie(ident) for ident in identifiers} == {'HGNC'}
    # HGNC:6871 should be in there.
    assert 'HGNC:6871' in identifiers
def get_synonyms(self, identifier):
    """Return curies that the remote service lists for ``identifier``.

    The identifier's prefix is translated to a source id through
    ``self.curie_to_sourceid`` and the un-curied id is queried at
    ``<self.url>/src_compound_id/<id>/<source id>``.

    Returns:
        set: ``'<PREFIX>:<src_compound_id>'`` for every response entry whose
        ``src_id`` is known to ``self.sourceid_to_curie``. Empty when the
        prefix is not supported or the response contains ``'error'``.
    """
    prefix = Text.get_curie(identifier)
    if prefix not in self.curie_to_sourceid:
        return set()
    url = "{0}/src_compound_id/{1}/{2}".format(
        self.url, Text.un_curie(identifier), self.curie_to_sourceid[prefix])
    response = self.query(url)
    if 'error' in response:
        return set()
    synonyms = set()
    for entry in response:
        source_id = entry['src_id']
        if source_id not in self.sourceid_to_curie:
            continue
        target_prefix = self.sourceid_to_curie[source_id]
        synonyms.add(f'{target_prefix}:{entry["src_compound_id"]}')
    return synonyms
def get_out_by_in(self,
                  input_node,
                  output_type,
                  prefixes,
                  subject=True,
                  object=True):
    """Find neighbours of ``input_node`` and wrap them as (edge, node) pairs.

    Every synonym of ``input_node`` with one of ``prefixes`` is passed to
    ``self.get_neighbor``. Results are de-duplicated per direction on
    (predicate, output id).

    Args:
        input_node: node exposing ``get_synonyms_by_prefix``.
        output_type: node type of the neighbours to look for.
        prefixes: curie prefixes of the synonyms to query with.
        subject: include results where the input is the triple's subject.
        object: include results where the input is the triple's object.

    Returns:
        list: ``(edge, output_node)`` tuples, subject-direction results
        first. Each edge carries the curie that actually produced it.
    """
    # Name of the calling function, used as the edge source.
    caller = f'uberongraph.{inspect.stack()[1].function}'

    curies = set()
    for pre in prefixes:
        curies.update(input_node.get_synonyms_by_prefix(pre))

    # (direction enabled?, value of `subject` handed to get_neighbor)
    directions = ((subject, True), (object, False))

    returnresults = []
    for enabled, input_is_subject in directions:
        if not enabled:
            continue
        done = set()
        for curie in curies:
            for r in self.get_neighbor(curie, output_type,
                                       subject=input_is_subject):
                key = (r['p'], r['output_id'])
                if key in done:
                    continue
                done.add(key)
                predicate_curie = Text.obo_to_curie(r['p'])
                prefix = Text.get_curie(predicate_curie)
                # Every prefix is upper-cased except this one.
                if prefix != 'ubergraph-axioms.ofn':
                    prefix = prefix.upper()
                predicate = LabeledID(
                    prefix + ":" + Text.un_curie(predicate_curie),
                    r['pLabel'])
                output_node = KNode(r['output_id'],
                                    type=output_type,
                                    name=r['output_label'])
                if input_is_subject:
                    edge = self.create_edge(input_node, output_node, caller,
                                            curie, predicate)
                else:
                    edge = self.create_edge(output_node, input_node, caller,
                                            curie, predicate)
                returnresults.append((edge, output_node))
    return returnresults
def disease_get_ancestors(self, disease_node):
    """Return the MONDO ancestors of a disease as (edge, node) pairs.

    Only MONDO identifiers are supported; any other prefix yields an empty
    list. The disease itself is skipped if the query returns it.

    Returns:
        list: ``(edge, ancestor_node)`` tuples. Each edge runs from
        ``disease_node`` to the ancestor with an ``rdfs:subClassOf``
        predicate.
    """
    curie = disease_node.id
    if Text.get_curie(curie) != "MONDO":
        return []
    # Plain string, not an f-string: nothing is interpolated here, and
    # `$disease_id` is a placeholder passed on to `query_template`.
    query = """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    prefix MONDO: <http://purl.obolibrary.org/obo/MONDO_>
    select distinct ?output_id ?label
    where {
      graph <http://reasoner.renci.org/ontology/closure> {
        $disease_id rdfs:subClassOf ?output_id .
        ?output_id rdfs:subClassOf MONDO:0000001 .
      }
      graph <http://reasoner.renci.org/ontology>{
        ?output_id rdfs:label ?label.
      }
    }
    """
    results = self.triplestore.query_template(
        template_text=query,
        inputs={'disease_id': curie},
        outputs=['output_id', 'label'])
    # The same predicate applies to every edge.
    predicate = LabeledID(identifier='rdfs:subClassOf', label='subclass of')
    outputs = []
    for row in results:
        # NOTE(review): now passes the label as `name=` (was `label=`);
        # confirm KNode's keyword for the display name.
        ancestor_node = KNode(
            row['output_id'],
            name=row['label'],
            type=node_types.DISEASE_OR_PHENOTYPIC_FEATURE)
        if ancestor_node.id == disease_node.id:
            # Refrain from adding an edge to the node itself.
            continue
        edge = self.create_edge(
            source_node=disease_node,
            target_node=ancestor_node,
            predicate=predicate,
            provided_by='uberongraph.disease_get_ancestors',
            input_id=disease_node.id)
        outputs.append((edge, ancestor_node))
    return outputs