def term_to_term(self, node_a, node_b, limit=10000):
    """Link two nodes through their most semantically similar phrase pair.

    Candidate phrases are produced by ``self.generate_phrases`` from each
    node's label. Every pair of differing phrases is scored with
    ``self.chemotext2.get_semantic_similarity``; the highest-scoring pair is
    recorded on a support ``KEdge`` running from ``node_a`` to ``node_b``.

    ``limit`` is accepted but not used by this implementation.

    Returns:
        The ``KEdge``, or ``None`` when no pair scored above -1.
    """
    log = logging.getLogger('application')
    log.debug('chemotext2: "{}" to "{}"'.format(node_a.label, node_b.label))
    candidates_a = self.generate_phrases(node_a.label)
    candidates_b = self.generate_phrases(node_b.label)

    top_score = -1
    top_a = ''
    top_b = ''
    for phrase_a in candidates_a:
        for phrase_b in candidates_b:
            if phrase_a == phrase_b:
                # Identical phrases are skipped rather than scored.
                continue
            score = self.chemotext2.get_semantic_similarity(phrase_a, phrase_b)
            if score > top_score:
                top_score, top_a, top_b = score, phrase_a, phrase_b
            log.debug(' "{}"-"{}": {} ({})'.format(
                phrase_a, phrase_b, score, top_score))
    log.debug(' "{}"-"{}": {}'.format(top_a, top_b, top_score))

    if not top_score > -1:
        return None

    support = KEdge(
        'chemotext2',
        'term_to_term',
        {'similarity': top_score, 'terms': [top_a, top_b]},
        is_support=True)
    support.source_node = node_a
    support.target_node = node_b
    return support
def term_to_term(self, node_a, node_b, limit=10000):
    """Given two terms, find articles in chemotext that connect them.

    Every MeSH label of ``node_a`` is paired with every MeSH label of
    ``node_b`` (labels come from ``self.get_mesh_labels``) and a Cypher query
    is sent through ``self.ctext.query`` for each pair. All returned rows are
    accumulated into one publication list.

    Args:
        node_a: Source node of the resulting edge.
        node_b: Target node of the resulting edge.
        limit: Value substituted into the ``LIMIT`` clause of each query.

    Returns:
        A support ``KEdge`` carrying the articles under ``'publications'``,
        or ``None`` if nothing is found.
    """
    from datetime import datetime

    def cypher_quote(label):
        # Labels are embedded in a single-quoted Cypher literal; escape
        # backslashes and single quotes so a label such as "Crohn's disease"
        # cannot terminate the literal early.
        return str(label).replace('\\', '\\\\').replace("'", "\\'")

    template = ("MATCH (d:Term)-[r1]-(a:Art)-[r2]-(t:Term) "
                "WHERE d.name='%s' AND t.name='%s' RETURN a LIMIT %d")

    meshes_a = self.get_mesh_labels(node_a)
    meshes_b = self.get_mesh_labels(node_b)
    articles = []
    start = datetime.now()
    for label_a in meshes_a:
        for label_b in meshes_b:
            response = self.ctext.query(
                query=template % (cypher_quote(label_a),
                                  cypher_quote(label_b), limit))
            for result in response['results']:
                for data in result['data']:
                    articles.extend(data['row'])
    end = datetime.now()
    logging.getLogger('application').debug(
        'chemotext: {} to {}: {} ({})'.format(meshes_a, meshes_b,
                                              len(articles), end - start))
    if not articles:
        return None
    ke = KEdge('chemotext', 'term_to_term', {'publications': articles},
               is_support=True)
    ke.source_node = node_a
    ke.target_node = node_b
    return ke
def term_to_term(self, node_a, node_b, limit=10000):
    """Given two terms, find articles in chemotext that connect them.

    Every MeSH label of ``node_a`` is paired with every MeSH label of
    ``node_b`` (labels come from ``self.get_mesh_labels``) and a Cypher query
    is sent through ``self.ctext.query`` for each pair.

    Args:
        node_a: Source node; its ``id`` is logged.
        node_b: Target node; its ``id`` is logged.
        limit: Value substituted into the ``LIMIT`` clause of each query.

    Returns:
        ``None`` if nothing is found.

    Raises:
        RuntimeError: Whenever at least one article is found. The guard was
            placed in front of the ``KEdge`` construction on purpose and is
            kept until that constructor call has been reviewed.
    """
    from datetime import datetime

    def cypher_quote(label):
        # Labels are embedded in a single-quoted Cypher literal; escape
        # backslashes and single quotes so a label such as "Crohn's disease"
        # cannot terminate the literal early.
        return str(label).replace('\\', '\\\\').replace("'", "\\'")

    template = ("MATCH (d:Term)-[r1]-(a:Art)-[r2]-(t:Term) "
                "WHERE d.name='%s' AND t.name='%s' RETURN a LIMIT %d")

    log = logging.getLogger('application')
    log.debug('identifiers: {} to {}'.format(node_a.id, node_b.id))
    meshes_a = self.get_mesh_labels(node_a)
    meshes_b = self.get_mesh_labels(node_b)
    articles = []
    start = datetime.now()
    for label_a in meshes_a:
        for label_b in meshes_b:
            response = self.ctext.query(
                query=template % (cypher_quote(label_a),
                                  cypher_quote(label_b), limit))
            for result in response['results']:
                for data in result['data']:
                    articles.extend(data['row'])
    end = datetime.now()
    log.debug('chemotext: {} to {}: {} ({})'.format(
        meshes_a, meshes_b, len(articles), end - start))
    if not articles:
        return None

    pmids = [f'PMID:{x["pmid"]}' for x in articles]
    # Deliberate guard: everything below is an unverified draft and stays
    # unreachable until the constructor arguments are confirmed.
    raise RuntimeError('The following KEdge constructor syntax looks very suspect.')
    ke = KEdge('chemotext.term_to_term', dt.now(), 'chemotext:1',
               'literature_co-occurence', f'{node_a.id},{node_b.id}',
               'chemotext:1', 'literature_co-occurence',
               publications=pmids, is_support=True)
    ke.source_node = node_a
    ke.target_node = node_b
    return ke
def execute(self):
    """Run the user query and fold its results into the graph.

    For each (cypher, start, reversed-flag, lookup) tuple supplied by
    ``self.userquery`` a start node is created, tied to its lookup node with
    a ``'lookup'`` edge, and the cypher is sent to ``self.rosetta.graph``.
    The de-duplicated result is handed to ``self.add_edges``.
    """
    self.logger.debug('Executing Query')

    # GreenT wants a cypher query to find transitions, and a starting point.
    cypher_list = self.userquery.generate_cypher()
    start_list = self.userquery.get_start_node()
    reverse_list = self.userquery.get_reversed()
    lookup_list = self.userquery.get_lookups()

    work_items = zip(cypher_list, start_list, reverse_list, lookup_list)
    for cypher, start, reverse, lookup in work_items:
        input_name = Text.un_curie(lookup.identifier)
        for message in (start, 'CYPHER', cypher):
            self.logger.debug(message)

        identifier, ntype = start
        start_node = KNode(identifier, ntype, label=input_name)
        lookup_edge = KEdge('lookup', 'lookup')
        lookup_edge.source_node = lookup
        lookup_edge.target_node = start_node
        self.add_nonsynonymous_edge(lookup_edge)

        # Fire this to rosetta and collect the result.
        raw_edges = self.rosetta.graph([(None, start_node)], query=cypher)
        # The result contains duplicate edges; drop them, preserving order.
        unique_edges = list(OrderedDict.fromkeys(raw_edges))
        self.add_edges(unique_edges, reverse)

    self.logger.debug('Query Complete')
def make_edge(self, cooc_list, node_a, node_b):
    """Create a 'cdw' support edge from the first co-occurrence entry.

    Only ``cooc_list[0]`` is used. It is unpacked into a key and a
    properties mapping; the key is stored (as a list) under ``'icd9'`` in
    that mapping, which then becomes the edge's properties.

    Returns:
        The ``KEdge`` from ``node_a`` to ``node_b``.
    """
    first_entry = cooc_list[0]
    codes, details = first_entry
    # TODO: fix this up with details
    details['icd9'] = list(codes)
    edge = KEdge('cdw', 'term_to_term', details, is_support=True)
    edge.source_node, edge.target_node = node_a, node_b
    return edge
def make_edge(self, cooc_list, node_a, node_b):
    """Prepare a 'cdw' support edge from the first co-occurrence entry.

    Only ``cooc_list[0]`` is used. It is unpacked into a key and a
    properties mapping, and the key is stored (as a list) under ``'icd9'``
    in that mapping.

    Raises:
        RuntimeError: Always, after the mapping has been updated. The guard
            sits in front of the ``KEdge`` construction, so the code below it
            is currently unreachable.
    """
    first_entry = cooc_list[0]
    codes, details = first_entry
    # TODO: fix this up with details
    details['icd9'] = list(codes)
    raise RuntimeError(
        'The following KEdge constructor looks somewhat suspect.')
    edge = KEdge('cdw', 'term_to_term', details, is_support=True)
    edge.source_node, edge.target_node = node_a, node_b
    return edge
def new_edge(self, source, function, properties, source_node=None, target_node=None):
    """Placeholder edge factory that is currently disabled.

    Raises:
        RuntimeError: Always. The guard precedes the ``KEdge`` construction,
            so the statements after it never run.
    """
    guard_message = 'The following KEdge constructor looks very suspect.'
    raise RuntimeError(guard_message)
    created = KEdge(source, function, properties)
    created.source_node, created.target_node = source_node, target_node
    return created
def term_to_term(self, node_a, node_b):
    """Create a literature co-occurrence support edge between two nodes.

    Shared articles are fetched with ``self.omnicorp.get_shared_pmids``.
    Each article string is reduced to the text after its last ``/`` and
    prefixed with ``PMID:``.

    Returns:
        A support ``KEdge`` from ``node_a`` to ``node_b`` carrying the PMIDs
        as publications, or ``None`` when no articles are shared.
    """
    shared = self.omnicorp.get_shared_pmids(node_a, node_b)
    if not len(shared) > 0:
        return None

    pmids = ['PMID:' + uri.rsplit('/', 1)[-1] for uri in shared]
    edge = KEdge(
        'omnicorp.term_to_term',
        dt.now(),
        'omnicorp:1',
        'literature_co-occurence',
        f'{node_a.identifier},{node_b.identifier}',
        'omnicorp:1',
        'literature_co-occurence',
        publications=pmids,
        is_support=True)
    edge.source_node = node_a
    edge.target_node = node_b
    return edge
def process_associations(self, r, predicate, target_node_type, reverse=False):
    """Turn a biolink association response into (edge, node) pairs.

    Sometimes (as in pathway->genes) biolink returns the query as the
    object rather than the subject. With ``reverse=True`` the node is built
    from each association's subject instead of its object.

    Args:
        r: Response mapping with an ``'associations'`` list.
        predicate: Passed through to every ``KEdge``.
        target_node_type: Node type given to every ``KNode``.
        reverse: Pick the association's subject instead of its object.

    Returns:
        A list of ``(KEdge, KNode)`` tuples, one per association.
    """
    chosen_side = 'subject' if reverse else 'object'
    pairs = []
    for assoc in r['associations']:
        listed = assoc['publications'] if 'publications' in assoc else None
        if listed is None:
            pmids = []
        else:
            # Sometimes we get back something like "uniprotkb" instead of a
            # PMID; only ids whose first four characters are "PMID" are kept.
            pmids = [pub['id'] for pub in listed
                     if pub['id'][:4].upper() == 'PMID']

        entity = assoc[chosen_side]
        node = KNode(entity['id'], target_node_type, entity['label'])
        relation = {
            'typeid': assoc['relation']['id'],
            'label': assoc['relation']['label'],
        }
        edge = KEdge('biolink', predicate,
                     {'publications': pmids, 'relation': relation})
        pairs.append((edge, node))
    return pairs
def graph_get_pathways_by_gene(self, gene):  #reasoner
    """Look up KEGG pathways for a gene through the triplestore.

    The part of ``gene.identifier`` after the first ``:`` is upper-cased and
    substituted for ``$gene`` in the SPARQL template. Each response row
    yields a ``KEGG:<last path segment of keggPath>`` pathway node.

    Returns:
        A list of ``(KEdge, KNode)`` tuples.
    """
    sparql = """ prefix kegg: <http://chem2bio2rdf.org/kegg/resource/> prefix drugbank: <http://chem2bio2rdf.org/drugbank/resource/> prefix uniprot: <http://chem2bio2rdf.org/uniprot/resource/gene/> prefix ctd: <http://chem2bio2rdf.org/ctd/resource/> prefix mesh: <http://bio2rdf.org/mesh:> select ?drugGenericName ?uniprotGeneID ?pathwayName ?keggPath where { ?keggPath kegg:protein ?swissProtID ; kegg:Pathway_name ?pathwayName . ?keggInter kegg:cid ?pubchemCID . ?dbInter drugbank:GeneBank_ID ?geneBankID ; drugbank:SwissProt_ID ?swissProtID ; drugbank:gene ?uniprotGeneID . ?drugID drugbank:CID ?pubchemCID ; drugbank:Generic_Name ?drugGenericName . ?ctd_disease ctd:diseaseid ?diseaseID ; ctd:cid ?pubchemCID . values ( ?uniprotGeneID ) { ( uniprot:$gene ) } } LIMIT 2000"""
    gene_symbol = gene.identifier.split(':')[1].upper()
    rows = self.triplestore.query_template(
        inputs={"gene": gene_symbol},
        outputs=['keggPath'],
        template_text=sparql)

    pairs = []
    for row in rows:
        edge = KEdge('c2b2r', 'geneToPathway', {})
        pathway_id = row['keggPath'].rsplit('/', 1)[-1]
        node = KNode("KEGG:{0}".format(pathway_id), node_types.PATHWAY)
        pairs.append((edge, node))
    return pairs
def process_variant_to_gene_relationships(self, variant_nodes: list, writer: WriterDelegator):
    """Write variant-to-gene nodes and edges for a batch of variants.

    Relationships are fetched with
    ``self.genetics_services.get_variant_to_gene``. For every returned
    (edge, node) pair a gene ``KNode`` and a ``KEdge`` are built and written
    through ``writer``. Gene nodes are written once per id, tracked in
    ``self.written_genes``. When ``self.recreate_sv_node`` is set the
    sequence-variant node is written as well.
    """
    relationships = self.genetics_services.get_variant_to_gene(
        self.crawl_for_service, variant_nodes)
    for variant_id, edge_node_pairs in relationships.items():
        # Convert the simple edges and nodes to rags objects and write them
        # to the graph.
        for simple_edge, simple_node in edge_node_pairs:
            gene_node = KNode(simple_node.id,
                              type=simple_node.type,
                              name=simple_node.name,
                              properties=simple_node.properties)

            if self.recreate_sv_node:
                variant_node = KNode(variant_id, type=node_types.SEQUENCE_VARIANT)
                variant_node.add_export_labels([node_types.SEQUENCE_VARIANT])
                writer.write_node(variant_node)

            if gene_node.id not in self.written_genes:
                writer.write_node(gene_node)
                self.written_genes.add(gene_node.id)

            predicate = LabeledID(identifier=simple_edge.predicate_id,
                                  label=simple_edge.predicate_label)
            # standard_predicate is intentionally not passed here.
            gene_edge = KEdge(source_id=variant_id,
                              target_id=gene_node.id,
                              provided_by=simple_edge.provided_by,
                              ctime=simple_edge.ctime,
                              original_predicate=predicate,
                              input_id=simple_edge.input_id,
                              properties=simple_edge.properties)
            writer.write_edge(gene_edge)
        logger.info(f'added {len(edge_node_pairs)} variant relationships for {variant_id}')
def drug_get_gene(self, subject):
    """Get genes for a Pharos drug (ligand) id.

    The un-curied ``subject.identifier`` is used to fetch the ligand record
    from the Pharos REST API. Each link of kind ``ix.idg.models.Target`` is
    resolved to an HGNC identifier with ``self.target_to_hgnc``.

    Args:
        subject: Node whose ``identifier`` is a Pharos ligand curie.

    Returns:
        A list of ``(KEdge, KNode)`` tuples, one per target that could be
        resolved to HGNC. Unresolved targets are logged and skipped.
    """
    pharosid = Text.un_curie(subject.identifier)
    r = requests.get('https://pharos.nih.gov/idg/api/v1/ligands(%s)?view=full' % pharosid)
    result = r.json()
    resolved_edge_nodes = []
    for link in result['links']:
        if link['kind'] != 'ix.idg.models.Target':
            continue
        pharos_target_id = int(link['refid'])
        pharos_edge = KEdge('pharos', 'drug_get_gene',
                            {'properties': link['properties']})
        # Pharos returns target ids in its own numbering system. Collect
        # other names for it.
        hgnc = self.target_to_hgnc(pharos_target_id)
        if hgnc is not None:
            hgnc_node = KNode(hgnc, node_types.GENE)
            resolved_edge_nodes.append((pharos_edge, hgnc_node))
        else:
            # Logger.warn is deprecated (removed in Python 3.13); use
            # warning() with lazy %-arguments instead.
            logging.getLogger('application').warning(
                'Did not get HGNC for pharosID %d', pharos_target_id)
    return resolved_edge_nodes
def drugname_to_pharos(self, namenode):
    """Map a drug-name node to Pharos drug nodes.

    The un-curied ``namenode.identifier`` is passed to
    ``drugname_string_to_pharos_string``, which is iterated as
    ``(pharos id, pharos label)`` pairs.

    Returns:
        A list of ``(KEdge, KNode)`` tuples, one per Pharos match.
    """
    drugname = Text.un_curie(namenode.identifier)
    pharosids = drugname_string_to_pharos_string(drugname)
    # The accumulator was never initialised before, which raised NameError.
    results = []
    for pharosid, pharoslabel in pharosids:
        newnode = KNode(pharosid, node_types.DRUG, label=pharoslabel)
        newedge = KEdge('pharos', 'drugname_to_pharos', {})
        results.append((newedge, newnode))
    return results
def drugname_to_ctd(self, namenode):
    """Map a drug-name node to CTD drug nodes.

    The un-curied ``namenode.identifier`` is passed to
    ``self.drugname_string_to_ctd_string``; every id it yields becomes a
    drug node labelled with the original drug name.

    Returns:
        A list of ``(KEdge, KNode)`` tuples, one per CTD id.
    """
    drugname = Text.un_curie(namenode.identifier)
    ctdids = self.drugname_string_to_ctd_string(drugname)
    # The accumulator was never initialised before, which raised NameError.
    results = []
    for ctdid in ctdids:
        # The loop variable used to be named ``ctd`` while the body read
        # ``ctdid``; both now refer to the same name.
        newnode = KNode(ctdid, node_types.DRUG, label=drugname)
        newedge = KEdge('CTD', 'drugname_to_ctd', {})
        results.append((newedge, newnode))
    return results
def get_anatomy_by_cell_graph(self, cell_node):
    """Return anatomy nodes related to a cell node.

    ``self.cell_to_anatomy`` is queried with ``cell_node.identifier``. Each
    row's ``'anatomyID'`` is converted with ``Text.obo_to_curie`` and its
    ``'anatomyLabel'`` becomes the node label.

    Returns:
        A list of ``(KEdge, KNode)`` tuples.
    """
    pairs = []
    for row in self.cell_to_anatomy(cell_node.identifier):
        edge = KEdge('uberongraph', 'cellToAnatomy')
        anatomy = KNode(Text.obo_to_curie(row['anatomyID']), node_types.ANATOMY)
        anatomy.label = row['anatomyLabel']
        pairs.append((edge, anatomy))
    return pairs
def graph_get_genes_by_disease(self, disease):  #reasoner
    """Return gene nodes associated with a disease node.

    The part of ``disease.identifier`` after the first ``:`` is lower-cased
    and passed (in a one-element list) to
    ``self.get_genes_pathways_by_disease``. Each row yields a
    ``UNIPROT:<last path segment of uniprotGene>`` gene node and an edge
    carrying the row's ``keggPath``.

    Returns:
        A list of ``(KEdge, KNode)`` tuples.
    """
    disease_key = disease.identifier.split(':')[1].lower()
    rows = self.get_genes_pathways_by_disease([disease_key])
    pairs = []
    for row in rows:
        edge = KEdge('c2b2r', 'diseaseToGene', {'keggPath': row['keggPath']})
        uniprot_id = row['uniprotGene'].rsplit('/', 1)[-1]
        node = KNode("UNIPROT:{0}".format(uniprot_id), node_types.GENE)
        pairs.append((edge, node))
    return pairs
def callback(ch, method, properties, body):
    """Handle one queue message: flush the writer or write a graph.

    The decoded body is either the literal text ``flush``, which flushes
    ``writer``, or a JSON document with ``'nodes'`` and ``'edges'`` lists
    whose items are wrapped in ``KNode`` / ``KEdge`` and written.

    ``ch``, ``method`` and ``properties`` are not used.
    """
    message = body.decode()
    if isinstance(message, str) and message == 'flush':
        writer.flush()
        return

    payload = json.loads(message)
    for raw_node in payload['nodes']:
        writer.write_node(KNode(raw_node))
    for raw_edge in payload['edges']:
        writer.write_edge(KEdge(raw_edge))
def drugname_to_pharos(self, namenode):
    """Map a drug-name node to Pharos chemical-substance nodes.

    The un-curied ``namenode.id`` is passed to
    ``self.drugname_string_to_pharos_info``, which is iterated as
    ``(pharos id, pharos label)`` pairs.

    Returns:
        An empty list when there are no matches.

    Raises:
        RuntimeError: On the first match. The guard sits in front of the
            ``KEdge`` construction, so no edge is ever produced.
    """
    drug_text = Text.un_curie(namenode.id)
    matches = self.drugname_string_to_pharos_info(drug_text)
    pairs = []
    identifies = LabeledID(identifier='RDFS:id', label='identifies')
    for pharos_id, pharos_label in matches:
        substance = KNode(pharos_id,
                          type=node_types.CHEMICAL_SUBSTANCE,
                          name=pharos_label)
        raise RuntimeError('namenode.id is probably not a ctime...')
        link = KEdge(namenode, substance, 'pharos.drugname_to_pharos',
                     namenode.id, identifies)
        pairs.append((link, substance))
    return pairs
def process_pharos_response(r):
    """Collect (edge, pharos target id) pairs from a Pharos response.

    Links of kind ``ix.idg.models.Target`` are appended to
    ``original_edge_nodes`` from the enclosing scope; any other kind is
    logged at info level.

    Args:
        r: Response object exposing ``json()``.

    A body that is not valid JSON is skipped (best effort); it is now logged
    at debug level instead of being dropped silently.
    """
    # Only the parse can raise JSONDecodeError, so keep the try minimal.
    try:
        result = r.json()
    except JSONDecodeError as e:
        logger.debug('Skipping Pharos response that is not valid JSON: %s', e)
        return

    for link in result['links']:
        if link['kind'] != 'ix.idg.models.Target':
            logger.info('Pharos disease returning new kind: %s' % link['kind'])
        else:
            pharos_target_id = int(link['refid'])
            pharos_edge = KEdge('pharos', 'disease_get_gene',
                                {'properties': link['properties']})
            original_edge_nodes.append((pharos_edge, pharos_target_id))
def test_write_edges():
    """Writing the same edge twice queues it once unless creation is forced."""
    writer = BufferedWriter(rosetta_mock)
    edge = KEdge({
        'source_id': 'source:1',
        'target_id': 'target:1',
        'provided_by': 'test_write_edges',
    })
    edge.original_predicate = LabeledID(identifier='SEMMEDDB:CAUSES',
                                        label='semmed:causes')

    writer.write_edge(edge)
    recorded = writer.written_edges[edge.source_id][edge.target_id]
    assert recorded == {edge.original_predicate.identifier}
    assert len(writer.edge_queues) == 1

    # Try to write it twice: the edge queue should stay at one entry.
    writer.write_edge(edge)
    assert len(writer.edge_queues) == 1

    writer.write_edge(edge, force_create=True)
    assert len(writer.edge_queues) == 2
def get_edge(self, props=None, predicate=None, pmids=None):
    """Generate graph edges in a standard way.

    Propagates the information needed for scoring and semantic context by
    storing ``{'predicate': predicate, 'pmids': pmids}`` under
    ``props['stdprop']``.

    Args:
        props: Edge properties. A dict passed by the caller is updated in
            place, as before. When omitted, a fresh dict is created for this
            call (previously one default dict was shared and mutated by
            every call).
        predicate: Predicate describing the connection between subject and
            object. The edge is flagged as a synonym when this equals
            ``'synonym'``.
        pmids: PMIDs passed up for provenance and confidence scoring;
            defaults to a fresh empty list.

    Returns:
        A ``KEdge`` whose source is ``self.name``.

    Raises:
        ValueError: If ``props`` is given and is not a dict.
    """
    if props is None:
        props = {}
    elif not isinstance(props, dict):
        raise ValueError("Properties must be a dict")
    if pmids is None:
        pmids = []

    props['stdprop'] = {
        'predicate': predicate,
        'pmids': pmids,
    }
    return KEdge(self.name, predicate, props,
                 is_synonym=(predicate == 'synonym'))
def create_edge(self, source_node, target_node, provided_by, input_id, predicate, analysis_id=None, publications=None, url=None, properties=None):
    """Build a ``KEdge`` between two nodes, stamped with the current time.

    Args:
        source_node: Node whose ``id`` becomes the edge's ``source_id``.
        target_node: Node whose ``id`` becomes the edge's ``target_id``.
        provided_by: Name of the edge source; must not be ``None``.
        input_id: Passed through to the edge.
        predicate: Stored as the edge's ``original_predicate``. The standard
            predicate is not set here.
        analysis_id: Accepted but not used.
        publications: Publication list; defaults to a fresh empty list.
        url: Passed through to the edge.
        properties: Property dict; defaults to a fresh empty dict.

    Returns:
        The new ``KEdge``.

    Raises:
        ValueError: If ``provided_by`` is ``None``. The previous code raised
            a bare string, which Python 3 rejects with a TypeError.
    """
    if provided_by is None:
        raise ValueError('missing edge source')
    # Fresh containers per call so edges never share a default list/dict.
    if publications is None:
        publications = []
    if properties is None:
        properties = {}

    ctime = time.time()
    return KEdge(source_id=source_node.id,
                 target_id=target_node.id,
                 provided_by=provided_by,
                 ctime=ctime,
                 original_predicate=predicate,
                 input_id=input_id,
                 publications=publications,
                 url=url,
                 properties=properties)
def test_edge_changing_node_ids():
    """Flushed edges must carry the synonymized ids of their nodes."""
    writer = BufferedWriter(rosetta_mock)

    # Stand-in for the session's write_transaction used when flushing edges.
    def write_transaction_mock_edge(export_func, edges, edge_label, merge_edges):
        import os
        assert os.environ.get('MERGE_EDGES', False) == merge_edges
        # Make sure this is the right function.
        assert edge_label == 'causes'
        # Make sure we have our node id in there.
        assert export_func == export_edge_chunk
        first = edges[0]
        assert first.source_id == 'CHEBI:127682'
        assert first.target_id == 'NCBIGene:84125'
        print(edges)

    # Pass the mock tester to the writer and let it rip.
    source_node = KNode('PUBCHEM:44490445')
    target_node = KNode('HGNC:25708')
    causes = LabeledID(identifier='SEMMEDDB:CAUSES', label='semmed:causes')
    edge = KEdge({
        'source_id': source_node.id,
        'target_id': target_node.id,
        'provided_by': 'test_write_edges',
        'original_predicate': causes,
        'standard_predicate': None,
        'input_id': 'PUBCHEM:44490445',
        'publications': [],
    })
    writer.write_node(source_node)
    writer.write_node(target_node)

    # A mock for writing nodes; node export itself is not under test here.
    session_for_node = Mock()
    session_for_node.write_transaction = lambda export_func, node_list, labels: None
    writer.flush_nodes(session_for_node)
    expected_synonyms = {
        'PUBCHEM:44490445': 'CHEBI:127682',
        'HGNC:25708': 'NCBIGene:84125',
    }
    assert writer.synonym_map == expected_synonyms

    session_for_edge = Mock()
    session_for_edge.write_transaction = write_transaction_mock_edge
    writer.write_edge(edge)
    writer.flush_edges(session_for_edge)
def create_edge(self, source_node, target_node, provided_by, input_id, predicate, publications=None, url=None, properties=None):
    """Build a ``KEdge`` between two nodes with a standardized predicate.

    Args:
        source_node: Node whose ``id`` becomes the edge's ``source_id``.
        target_node: Node whose ``id`` becomes the edge's ``target_id``.
        provided_by: Name of the edge source; must not be ``None``.
        input_id: Passed through to the edge.
        predicate: Stored as ``original_predicate`` and also run through
            ``self.standardize_predicate`` together with both node ids.
        publications: Passed through to the edge.
        url: Passed through to the edge.
        properties: Passed through to the edge.

    Returns:
        The new ``KEdge``.

    Raises:
        ValueError: If ``provided_by`` is ``None``. The previous code raised
            a bare string, which Python 3 rejects with a TypeError.
    """
    # Validate first so no work is done for an edge that cannot be created.
    if provided_by is None:
        raise ValueError('missing edge source')

    ctime = time.time()
    standard_predicate = self.standardize_predicate(
        predicate, source_node.id, target_node.id)
    return KEdge(source_id=source_node.id,
                 target_id=target_node.id,
                 provided_by=provided_by,
                 ctime=ctime,
                 original_predicate=predicate,
                 standard_predicate=standard_predicate,
                 input_id=input_id,
                 publications=publications,
                 url=url,
                 properties=properties)
def get_drugs_by_condition_graph(self, conditions):
    """Return drug nodes associated with a condition node.

    ``self.get_drugs_by_condition`` is queried with
    ``conditions.identifier``. Each row yields an edge carrying the row's
    PubChem CID and disease PMIDs, and a drug node whose id is the last path
    segment of the row's ``drugID``.

    Returns:
        A list of ``(KEdge, KNode)`` tuples.
    """
    pairs = []
    for row in self.get_drugs_by_condition(conditions.identifier):
        evidence = {'cid': row['pubChemCID'], 'pmids': row['diseasePMIDs']}
        edge = KEdge('c2b2r', 'conditionToDrug', evidence)
        drug_id = row['drugID'].rsplit('/', 1)[-1]
        node = KNode(drug_id, node_types.DRUG, row['drugGenericName'])
        pairs.append((edge, node))
    return pairs
def generate_all_edges(self, nodelist):
    """Create literature co-occurrence support edges for a set of nodes.

    ``self.omnicorp.get_all_shared_pmids`` returns a mapping whose keys are
    indexed as node pairs and whose values are used as the publications of
    the corresponding edge.

    Returns:
        A list with one support ``KEdge`` per mapping entry.
    """
    shared = self.omnicorp.get_all_shared_pmids(nodelist)
    predicate = LabeledID(identifier='omnicorp:1',
                          label='literature_co-occurrence')
    edges = []
    for pair, pmids in shared.items():
        first, second = pair[0], pair[1]
        edges.append(
            KEdge(first,
                  second,
                  'omnicorp.term_to_term',
                  time.time(),
                  predicate,
                  predicate,
                  f'{first.id},{second.id}',
                  publications=pmids,
                  is_support=True))
    return edges
def disease_get_gene0(self, subject):
    """Collect Pharos target links for a disease.

    ``self.translate(subject)`` supplies Pharos disease ids; each one is
    fetched from the Pharos REST API. Links of kind
    ``ix.idg.models.Target`` are collected as ``(KEdge, target id)`` pairs;
    any other kind is logged at info level.

    NOTE(review): the visible code never returns the collected pairs, so the
    function returns ``None`` - confirm whether that is intended.
    """
    pharosids = self.translate(subject)
    print("pharos ids: {}".format(pharosids))
    original_edge_nodes = []
    for pharosid in pharosids:
        url = 'https://pharos.nih.gov/idg/api/v1/diseases(%s)?view=full' % pharosid
        logger.debug('pharos> ' + url)
        payload = requests.get(url).json()
        for link in payload['links']:
            if link['kind'] == 'ix.idg.models.Target':
                target_id = int(link['refid'])
                edge = KEdge('pharos', 'disease_get_gene',
                             {'properties': link['properties']})
                original_edge_nodes.append((edge, target_id))
            else:
                logger.info('Pharos disease returning new kind: %s' % link['kind'])
def generate_graph(size: int):
    """Yield ``size`` synthetic (source node, edge, target node) triplets.

    Triplet ``i`` links ``SOURCE:i`` to ``TARGER:i``; both nodes are chemical
    substances and the edge uses the ``RO:0000052`` / ``affects`` predicate.
    """
    for index in range(size):
        source_node = KNode(f'SOURCE:{index}', type=node_types.CHEMICAL_SUBSTANCE)
        target_node = KNode(f'TARGER:{index}', type=node_types.CHEMICAL_SUBSTANCE)
        edge = KEdge(
            source_id=source_node.id,
            target_id=target_node.id,
            provided_by='stress_tester',
            ctime='now',
            original_predicate=LabeledID('RO:0000052', 'affects'),
            input_id=source_node.id,
        )
        yield source_node, edge, target_node
def hgnc_to_uniprotkb(self, node):
    """Given a node representing an HGNC id, retrieve its UniProtKB ids.

    The numeric part of the identifier is looked up at
    ``<self.url>/hgnc_id/<id>`` and the ``uniprot_ids`` of the first
    returned document are turned into synonym edges and gene nodes.

    Returns:
        A list of ``(KEdge, KNode)`` tuples; empty when the response has no
        usable result.

    Raises:
        ValueError: If the node is not a gene or its identifier prefix is
            not ``HGNC``.
    """
    if node.node_type != node_types.GENE:
        raise ValueError('Node must be a gene')
    prefix_and_id = node.identifier.split(':')
    if prefix_and_id[0].upper() != 'HGNC':
        raise ValueError('Node must represent an HGNC identifier.')

    lookup_url = '{0}/hgnc_id/{1}'.format(self.url, prefix_and_id[1])
    payload = requests.get(lookup_url,
                           headers={'Accept': 'application/json'}).json()
    try:
        pairs = []
        for uniprot in payload['response']['docs'][0]['uniprot_ids']:
            edge = KEdge('hgnc', 'ncbigene_to_uniprotkb', is_synonym=True)
            gene = KNode(identifier='UNIPROTKB:{}'.format(uniprot),
                         node_type=node_types.GENE)
            pairs.append((edge, gene))
        return pairs
    except (IndexError, KeyError):
        # No results back.
        return []
def drug_to_gene(self, subject):
    """Get genes for a CTD drug id.

    The un-curied ``subject.identifier`` is looked up in
    ``self.drug_genes``. Each link yields a gene node for its ``gene_id``
    and an edge carrying the link's actions and publications.

    Returns:
        A list of ``(KEdge, KNode)`` tuples.
    """
    drug_key = Text.un_curie(subject.identifier)
    seen_actions = set()  # gathered across links; not part of the result
    pairs = []
    for link in self.drug_genes[drug_key]:
        gene_id = link['gene_id']
        details = {
            'actions': link['actions'],
            'publications': link['publications'],
        }
        seen_actions.update(link['actions'])
        pairs.append((
            KEdge('ctd', 'drug_get_gene', {'properties': details}),
            KNode(gene_id, node_types.GENE),
        ))
    return pairs