async def get_pubchem_data(self, pubchem_id, retries=0):
    """Get pubchem annotations for a curie.

    Honors PubChem's X-Throttling-Control header: sleeps briefly on
    'Yellow'/'Red' load status and, on 'Black' (blocked), backs off and
    retries up to 3 times before giving up and returning {}.

    :param pubchem_id: curie such as 'PUBCHEM:2244'; only the local part
        after the last ':' is sent to the API.
    :param retries: internal recursion counter; callers leave at 0.
    :return: extracted annotation dict, or {} when blocked past the
        retry budget.
    """
    conf = self.get_prefix_config('PUBCHEM')
    url = conf['url'] + pubchem_id.split(':')[-1]
    headers = {'Accept': 'application/json'}
    result = await self.async_get_raw_response(url, headers=headers)
    result_json = result['json']
    # pubmed api blocks if too many req are sent; the throttling header
    # looks like "Request Time Status: Green, Request Count Status: ..."
    throttle = result['headers']['X-Throttling-Control']
    throttle_warnings = {
        Text.snakify(value.split(':')[0].lower()): value.split(':')[1]
        for value in throttle.split(',') if ':' in value
    }
    time_status = throttle_warnings['request_time_status']
    count_status = throttle_warnings['request_count_status']
    if 'Yellow' in time_status or 'Yellow' in count_status:
        logger.warn('Pubchem requests reached Yellow')
        await asyncio.sleep(0.5)
    if 'Red' in time_status or 'Red' in count_status:
        logger.warn('Pubchem requests reached RED')
        await asyncio.sleep(2)
    if 'Black' in time_status or 'Black' in count_status:
        # back off progressively longer on each retry
        sleep_sec = 3 * (retries + 1)
        await asyncio.sleep(sleep_sec)
        # repeat call until retries reaches 3
        if retries < 3:
            return await self.get_pubchem_data(pubchem_id, retries + 1)
        # bug fix: this log call used to sit after an unconditional
        # `return {}` and was unreachable dead code
        logger.warn(f'retry limit exceed for {pubchem_id} , returning empty')
        return {}
    return self.extract_pubchem_data(result_json, conf['keys'])
async def get_chemical_roles(self, chebi_id):
    """Fetch every role assigned to a chebi id from the triple store.

    The result is returned keyed by the input chebi_id so that callers
    firing many requests concurrently can match each response back to
    the curie that produced it.
    """
    text = """ PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> PREFIX has_role: <http://purl.obolibrary.org/obo/RO_0000087> PREFIX chemical_entity: <http://purl.obolibrary.org/obo/CHEBI_24431> PREFIX CHEBI: <http://purl.obolibrary.org/obo/CHEBI_> SELECT DISTINCT ?role_label from <http://reasoner.renci.org/ontology> from <http://reasoner.renci.org/redundant> where { $chebi_id has_role: ?role. ?role rdfs:label ?role_label. GRAPH <http://reasoner.renci.org/ontology/closure> { ?role rdfs:subClassOf CHEBI:50906. } } """
    rows = await self.tripleStore.async_query_template(
        inputs={'chebi_id': chebi_id},
        outputs=['role_label'],
        template_text=text,
    )
    # normalize the labels in place to snake_case
    for row in rows:
        row['role_label'] = Text.snakify(row['role_label'])
    return {chebi_id: rows}
def get_transitions(self, graph, query):
    """Execute a cypher query and walk the results to build a set of
    transitions to execute.

    The query should be such that it returns a path
    (node0-relation0-node1-relation1-node2), and an array of the relation
    start nodes. For the path above, start nodes like (node0,node1) would
    indicate a unidirectional path, while (node0,node2) would indicate an
    end-based path meeting in the middle. Each node in the path can be
    described with an arbitrary node index; the index only points into
    the call map and carries no structural meaning.

    Returns:
        A list of plans, one per result row; each plan maps
        source node index -> target node index -> list of
        {"op", "link", "predicate"} transition dicts.
    """
    plans = []
    with graph.driver.session() as session:
        for record in session.run(query):
            node_ids = record['nodes']
            edge_map = record['edges']
            # start every (source, target) pair with an empty list
            transitions = {}
            for src in node_ids:
                transitions[src] = {dst: [] for dst in node_ids}
            for edge_id in edge_map:
                edge = edge_map[edge_id]
                # find the matching query-graph edge for this db edge
                qedge = next(q for q in self.query_graph['edges'] if q.id == edge_id)
                qtype = qedge.type
                if isinstance(qtype, list) and qtype:
                    predicate = [Text.snakify(t) for t in qtype]
                elif isinstance(qtype, str):
                    predicate = Text.snakify(qtype)
                else:
                    predicate = None
                transitions[edge['source']][edge['target']].append({
                    "op": edge['op'],
                    "link": edge['predicate'],
                    "predicate": predicate,
                })
            plans.append(transitions)
    return plans
def write_edge(self, edge):
    """Queue an edge for writing, skipping duplicates.

    Edges are deduplicated per (source_id, target_id) pair and buffered
    per predicate label; the buffers are flushed once any queue reaches
    the configured buffer size.
    """
    seen = self.written_edges[edge.source_id][edge.target_id]
    if edge in seen:
        return
    seen.add(edge)
    queue = self.edge_queues[Text.snakify(edge.standard_predicate.label)]
    queue.append(edge)
    # flush once the per-label buffer fills up
    if len(queue) >= self.edge_buffer_size:
        self.flush()
async def get_mondo_properties(self, mondo_curie):
    """Get the ancestors of a MONDO term from the Onto service and map
    the ones we are interested in (per the MONDO prefix config) to
    boolean properties.

    Returns {} when the response carries no 'superterms' entry.
    """
    conf = self.get_prefix_config('MONDO')
    response = await self.async_get_json(conf['url'] + mondo_curie)
    if 'superterms' not in response:
        return {}
    wanted = conf['keys']
    return {
        Text.snakify(wanted[ancestor]): True
        for ancestor in response['superterms']
        if ancestor in wanted
    }
def grab_edge_props(self, result):
    """Extract (predicate, pmids, props) from one query result row.

    NOTE(review): predicates of length 1 fall back to 'interacts_with'
    just like missing ones — preserved from the original; confirm the
    length check is intentional.
    """
    pred = result['pred']
    if pred is not None and len(pred) > 1:
        rel = Text.snakify(pred).lower()
    else:
        rel = 'interacts_with'
    predicate = LabeledID(identifier=f'GAMMA:{rel}', label=rel)
    pmids = []
    if 'pubmed_ids' in result and result['pubmed_ids'] is not None:
        # pipe-delimited pubmed id list -> PMID curies
        pmids = [f'PMID:{pid}' for pid in result['pubmed_ids'].split('|')]
    props = {}
    if result['affinity'] is not None:
        props['affinity'] = float(result['affinity'])
        props['affinity_parameter'] = result['affinity_parameter']
    return predicate, pmids, props
def process_op(self, link, source_node, history):
    """Execute one operation link against a source node and feed the
    results back into node processing.

    Results are looked up in the rosetta cache first; on a miss the op is
    executed, timed, and the result cached. Each (edge, node) result is
    filtered by excluded identifiers and by the link's predicate before
    being forwarded to process_node.

    :param link: dict with at least 'op' and 'predicate' keys.
    :param source_node: node whose id is embedded in the cache key and
        passed to the op.
    :param history: opaque traversal history forwarded to process_node.
    """
    op_name = link['op']
    key = f"{op_name}({Text.upper_curie(source_node.id)})"
    maxtime = timedelta(minutes=2)
    try:
        try:
            results = self.rosetta.cache.get(key)
        except Exception as e:
            # cache failures are treated as a miss, not an error
            # logger.warning(e)
            results = None
        if results is not None:
            logger.debug(f"cache hit: {key} size:{len(results)}")
        else:
            logger.debug(f"exec op: {key}")
            op = self.rosetta.get_ops(op_name)
            start = dt.now()
            results = op(source_node)
            end = dt.now()
            logger.debug(f'Call {key} took {end-start}')
            # flag unusually slow ops (over 2 minutes)
            if (end - start) > maxtime:
                logger.warn(f"Call {key} exceeded {maxtime}")
            self.rosetta.cache.set(key, results)
            logger.debug(f"cache.set-> {key} length:{len(results)}")
        logger.debug(f" {[node for _, node in results]}")
        # drop results whose node id is on the exclusion list
        results = list(
            filter(lambda x: x[1].id not in self.excluded_identifiers, results))
        for edge, node in results:
            edge_label = Text.snakify(edge.original_predicate.label)
            # forward only edges matching the link predicate (None means
            # accept any; the predicate may be a single label or a list)
            if link['predicate'] is None or edge_label == link[
                    'predicate'] or (isinstance(link['predicate'], list)
                                     and (edge_label in link['predicate'])):
                self.process_node(node, history, edge)
            else:
                pass
    except pika.exceptions.ChannelClosed:
        # broker channel loss must propagate so the consumer can recover
        traceback.print_exc()
        raise
    except Exception as e:
        # any other failure is logged and swallowed so one bad op does
        # not kill the whole crawl
        traceback.print_exc()
        log_text = f" -- {key}"
        logger.warning(f"Error invoking> {log_text}")
def make_edge(self, chem, gene, r, identifier, url):
    """Create a chem -> gene edge from a GtoPdb interaction record.

    The record's 'type' becomes the predicate (snakified, lowercased);
    affinity ranges such as '7.5 - 8.1' are averaged into a single float
    'affinity' property.

    :param r: interaction record with 'type', 'affinity', 'refs' and the
        copied-through keys below.
    :return: the created edge.
    """
    rel = Text.snakify(r['type']).lower()
    predicate = LabeledID(identifier=f'GAMMA:{rel}', label=rel)
    props = {x: r[x] for x in ['primaryTarget', 'affinityParameter', 'endogenous']}
    try:
        affins = [float(x.strip()) for x in r['affinity'].split('-')]
        if len(affins) > 0:
            props['affinity'] = sum(affins) / len(affins)
    # bug fix: was a bare `except:` which also swallowed SystemExit /
    # KeyboardInterrupt and hid programming errors; only parse failures
    # (None/odd affinity values) are expected here
    except (AttributeError, TypeError, ValueError):
        logger.debug(f"Can't parse affinity {r['affinity']}")
    edge = self.create_edge(
        chem, gene, 'gtopdb.ligand_to_gene', identifier, predicate,
        publications=[f"PMID:{x['pmid']}" for x in r['refs'] if x['pmid']],
        url=url, properties=props)
    return edge
def get_transitions_disconnected(self, graph, op_filter):
    """Build a transition map for a query of disconnected node pairs.

    Adjusted for the crawler: assumes the question contains uniform
    pairs of node types with no pair-to-pair connections, i.e.
    (a)->(b) (c)->(d) but no (b)->(c).

    :param op_filter: predicate called with each op name; only accepted
        ops are kept.
    :return: map of source node id -> target node id -> transitions
        (deduplicated by op name).
    """
    source_node = self.query_graph['nodes'][0].concept_cypher_signature('n0')
    target_node = self.query_graph['nodes'][1].concept_cypher_signature('n1')
    cypher = [f'MATCH {source_node}-[e]-> {target_node}']
    cypher += ['WHERE Exists(e.op) RETURN Collect(e) as edges']
    query = '\n'.join(cypher)
    edges = []
    with graph.driver.session() as session:
        for row in session.run(query):
            for edge in row['edges']:
                edges.append({
                    "op": edge['op'],
                    "link": edge['predicate'],
                    "predicate": Text.snakify(edge['type']) if edge['type'] else None,
                })
    # dedupe by op name, keeping only ops accepted by the filter;
    # computed once instead of once per query-graph edge
    filtered_ops = {e['op']: e for e in edges if op_filter(e['op'])}.values()
    p = {}
    for edge in self.query_graph['edges']:
        # bug fix: was `p[edge.source_id] = {}`, which clobbered earlier
        # targets whenever a source id appeared on more than one edge
        p.setdefault(edge.source_id, {})[edge.target_id] = filtered_ops
    return p
def sort_edges_by_label(edges):
    """Group edges by the snakified label of their standard predicate.

    :param edges: iterable of 3-tuples whose third element is a dict
        holding the edge object under 'object'.
    :return: defaultdict(list) mapping snakified label -> list of edges.
    """
    edges_by_label = defaultdict(list)
    # idiom fix: was `deque(map(lambda ...))`, consuming a map() purely
    # for its side effects; a plain loop says what it does
    for edge in edges:
        label = Text.snakify(edge[2]['object'].standard_predicate.label)
        edges_by_label[label].append(edge)
    return edges_by_label
def get_gene_by_drug(self, input_node):
    """Find human genes related to a drug via mychem/drugbank.

    Each DRUGBANK synonym of input_node is queried for enzymes, targets,
    carriers and transporters (restricted to organism == 'Humans'). The
    drugbank 'actions' strings (inhibitor, substrate, ...) become the
    edge predicates: actions relate what the drug does to the enzyme, so
    e.g. Alfuzosin (DB00346) being a 'substrate' for CYP34A implies it
    is metabolized by that enzyme. 'substrate' and 'carrier' reverse the
    edge direction (gene -> drug); everything else points drug -> gene.

    :return: list of (edge, gene_node) tuples.
    """
    response = []
    # actions that imply the gene acts on the drug, so the edge reverses
    reverse = ['substrate', 'carrier']
    # drugbank sections to scan, with their (original) debug messages
    categories = [
        ('enzymes', 'found enzymes'),
        ('transporters', 'found transporters'),
        ('carriers', 'found some carriers'),
        ('targets', 'found targets'),
    ]
    for drugbank_id in input_node.get_synonyms_by_prefix('DRUGBANK'):
        drugbank_id = Text.un_curie(drugbank_id)
        url = f'{self.url}chem/{drugbank_id}?fields=drugbank.enzymes,drugbank.targets,drugbank.carriers,drugbank.transporters'
        logger.debug(url)
        results = self.query(url)
        if 'drugbank' not in results:
            continue
        drugbank = results['drugbank']
        # collect candidate genes from every section, humans only
        # (was four copy-pasted filter blocks)
        genes = []
        for key, message in categories:
            if key in drugbank:
                logger.debug(message)
                genes += [x for x in drugbank[key]
                          if ('organism' in x) and (x['organism'] == 'Humans')]
        for gene in genes:
            # some genes are more like gene families and lack a uniprot
            # id or actions; we don't want those
            if 'actions' not in gene or 'uniprot' not in gene:
                continue
            # bug fix: was `type(gene['actions']) == type([])`
            actions = gene['actions'] if isinstance(gene['actions'], list) \
                else [gene['actions']]
            gene_node = KNode(f"UNIPROTKB:{gene['uniprot']}",
                              name=gene.get('gene_name', ''),
                              type=node_types.GENE)
            publications = [f'PMID:{x}' for x in gene['pmids']] \
                if 'pmids' in gene else []
            for action in actions:
                rel = Text.normalize_predicate(Text.snakify(action))
                predicate = LabeledID(identifier=f'GAMMA:{rel}', label=rel)
                if action in reverse:
                    # swap so the edge runs gene -> drug
                    source_node, target_node = gene_node, input_node
                else:
                    source_node, target_node = input_node, gene_node
                if predicate:
                    edge = self.create_edge(
                        source_node, target_node, 'mychem.get_gene_by_drug',
                        source_node.id, predicate, publications=publications)
                    response.append((edge, gene_node))
    return response