def from_dict(self, query_graph_dict):
    """
    Build a QueryGraph from its dictionary representation.

    :param query_graph_dict: container that may hold a "nodes" and/or an
        "edges" entry, each an iterable of per-item dictionaries
    :return: a new QueryGraph whose ``nodes`` and ``edges`` lists hold the
        results of ``QNode().from_dict`` and ``QEdge().from_dict``; an entry
        that is absent leaves the corresponding list empty
    """
    graph = QueryGraph()
    graph.nodes, graph.edges = [], []

    # Each section is optional in the incoming dictionary
    if "nodes" in query_graph_dict:
        graph.nodes.extend(
            QNode().from_dict(node_dict)
            for node_dict in query_graph_dict["nodes"]
        )
    if "edges" in query_graph_dict:
        graph.edges.extend(
            QEdge().from_dict(edge_dict)
            for edge_dict in query_graph_dict["edges"]
        )
    return graph
def _decorate_existing_edges(self):
    """
    Decorate the edges already in the knowledge graph with ICEES data.

    Edges are grouped by node pair (``_get_edges_by_node_pair``) so parallel
    edges share one lookup. For each pair whose source and target both have
    accepted synonyms, synonym combinations are tried in order until ICEES
    returns a p-value; that p-value is then appended as one EdgeAttribute to
    every edge of the pair.

    :return: self.response
    """
    kg = self.message.knowledge_graph
    log = self.response
    decorated_edge_count = 0

    # Work per node pair so that parallel edges are not queried more than once
    for parallel_edges in self._get_edges_by_node_pair(kg).values():
        source_id = parallel_edges[0].source_id
        target_id = parallel_edges[0].target_id
        source_synonyms = self._get_accepted_synonyms(source_id)
        target_synonyms = self._get_accepted_synonyms(target_id)
        if not (source_synonyms and target_synonyms):
            continue  # at least one end has no accepted synonyms

        # Try every source/target synonym combination until one gives a result
        for source_synonym, target_synonym in itertools.product(source_synonyms, target_synonyms):
            probe_qedge = QEdge(id="icees_e00", source_id=source_synonym, target_id=target_synonym)
            log.debug(f"Sending query to ICEES+ for {source_synonym}--{target_synonym}")
            p_value = self._get_icees_p_value_for_edge(probe_qedge, log)
            if p_value is None:
                continue

            decorated_edge_count += len(parallel_edges)
            icees_attribute = self._create_icees_edge_attribute(p_value)
            # The same attribute goes onto every edge sharing this source/target
            for kg_edge in parallel_edges:
                if not kg_edge.edge_attributes:
                    kg_edge.edge_attributes = []
                kg_edge.edge_attributes.append(icees_attribute)
            # First hit wins; remaining synonym combinations are not tried
            break

    if decorated_edge_count:
        log.info(f"Overlayed {decorated_edge_count} edges with exposures data from ICEES+")
    else:
        log.warning("Could not find ICEES+ exposures data for any edges in the KG")

    return self.response
def predict_drug_treats_disease(self):
    """
    Add the drug-disease treatment probability to the knowledge graph.

    Two modes, selected by ``self.parameters``:

    * ``virtual_relation_label`` present: for every (source, target) pair of
      KG nodes bound to ``source_qnode_id`` / ``target_qnode_id`` whose types
      are drug-like and disease-like respectively, add a virtual
      ``probably_treats`` edge carrying the probability, then add one
      matching QEdge to the query graph if any edge was added.
    * otherwise: decorate every existing KG edge that connects a drug-like
      node and a disease-like node (in either direction) with the
      probability as an extra edge attribute.

    A probability of 0 is the "nothing to report" default: no edge and no
    attribute is created for it. Pairs for which ``convert_to_trained_curies``
    returns None are skipped.

    :return: response
    """
    parameters = self.parameters
    self.response.debug("Computing drug disease treatment probability based on a machine learning model")
    self.response.info("Computing drug disease treatment probability based on a machine learning model: See [this publication](https://doi.org/10.1101/765305) for more details about how this is accomplished.")

    attribute_name = "probability_treats"
    attribute_type = "EDAM:data_0951"
    url = "https://doi.org/10.1101/765305"
    drug_types = ("drug", "chemical_substance")
    disease_types = ("disease", "phenotypic_feature")

    def is_drug(node_types):
        return any(label in node_types for label in drug_types)

    def is_disease(node_types):
        return any(label in node_types for label in disease_types)

    def max_treat_probability(drug_curie, disease_curie):
        """Best finite probability over all equivalent curie pairs.

        Returns None when either curie cannot be converted, and 0 when the
        model yields nothing usable (no pairs, non-list result, or no finite
        value), so that a result never leaks from one pair to the next.
        """
        drug_equivalents = self.convert_to_trained_curies(drug_curie)
        disease_equivalents = self.convert_to_trained_curies(disease_curie)
        if drug_equivalents is None or disease_equivalents is None:
            return None
        pairs = list(itertools.product(drug_equivalents, disease_equivalents))
        if not pairs:
            return 0
        all_probabilities = self.pred.prob_all(pairs)
        if not isinstance(all_probabilities, list):
            return 0
        # default=0 avoids a ValueError when every probability is NaN/inf
        return max((p for p in all_probabilities if np.isfinite(p)), default=0)

    def make_attribute(probability):
        return EdgeAttribute(type=attribute_type, name=attribute_name, value=str(probability), url=url)

    # if you want to add virtual edges, identify the source/targets, decorate the edges,
    # add them to the KG, and then add one to the QG corresponding to them
    if 'virtual_relation_label' in parameters:
        relation = parameters['virtual_relation_label']
        edge_type = "probably_treats"
        source_curies_to_decorate = set()
        target_curies_to_decorate = set()

        # identify the nodes that we should be adding virtual edges for
        # (the node type check is NOT done by ARAX_overlay, so do it here)
        for node in self.message.knowledge_graph.nodes:
            if not hasattr(node, 'qnode_ids'):
                continue
            if parameters['source_qnode_id'] in node.qnode_ids and is_drug(node.type):
                source_curies_to_decorate.add(node.id)
            if parameters['target_qnode_id'] in node.qnode_ids and is_disease(node.type):
                target_curies_to_decorate.add(node.id)

        added_flag = False  # set once at least one virtual edge was added
        for source_curie, target_curie in itertools.product(source_curies_to_decorate, target_curies_to_decorate):
            probability = max_treat_probability(source_curie, target_curie)
            if probability is None or probability == 0:
                continue
            added_flag = True
            edge_id = f"{relation}_{self.global_iter}"
            self.global_iter += 1
            edge = Edge(id=edge_id,
                        type=edge_type,
                        relation=relation,
                        source_id=source_curie,
                        target_id=target_curie,
                        is_defined_by="ARAX",
                        defined_datetime=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        provided_by="ARAX",
                        confidence=None,
                        weight=None,  # TODO: could make the actual value of the attribute
                        edge_attributes=[make_attribute(probability)],
                        qedge_ids=[relation])
            self.message.knowledge_graph.edges.append(edge)

        # Now add a q_edge to the query_graph since extra edges were added to the KG
        if added_flag:
            # TODO: ok to make the id and type the same thing?
            q_edge = QEdge(id=relation,
                           type=edge_type,
                           relation=relation,
                           source_id=parameters['source_qnode_id'],
                           target_id=parameters['target_qnode_id'])
            self.message.query_graph.edges.append(q_edge)
        return self.response

    # otherwise decorate each suitable edge already in the KG
    try:
        # map curies to types
        curie_to_type = {node.id: node.type for node in self.message.knowledge_graph.nodes}
        for edge in self.message.knowledge_graph.edges:
            # Make sure the edge_attributes are not None
            if not edge.edge_attributes:
                edge.edge_attributes = []
            source_types = curie_to_type[edge.source_id]
            target_types = curie_to_type[edge.target_id]
            # the model takes (drug, disease); the edge may point either way
            if is_drug(source_types) and is_disease(target_types):
                probability = max_treat_probability(edge.source_id, edge.target_id)
            elif is_drug(target_types) and is_disease(source_types):
                probability = max_treat_probability(edge.target_id, edge.source_id)
            else:
                continue
            if probability is None or probability == 0:
                continue
            edge.edge_attributes.append(make_attribute(probability))
    except Exception:
        tb = traceback.format_exc()
        error_type, _, _ = sys.exc_info()
        self.response.error(tb, error_code=error_type.__name__)
        self.response.error("Something went wrong adding the drug disease treatment probability")
    else:
        self.response.info("Drug disease treatment probability successfully added to edges")

    return self.response
def add_virtual_edge(self, name="", default=0.):
    """
    Generic function to add virtual edges to the KG and one matching edge to the QG.

    Every KG node bound to ``source_qnode_id`` is paired with every KG node
    bound to ``target_qnode_id``; a pair gets a virtual edge when
    ``make_edge_attribute_from_curies`` returns a truthy attribute for it.

    :param name: name of the functionality of the KP to use; also used to
        build the edge type ``has_<name>_with``
    :param default: passed through to ``make_edge_attribute_from_curies``
    """
    params = self.parameters
    source_curies = set()
    target_curies = set()
    # FIXME: Super hacky way to get around the fact that COHD can't map CHEMBL drugs
    names_by_curie = {}

    # collect the nodes that virtual edges should be added for
    for kg_node in self.message.knowledge_graph.nodes:
        if not hasattr(kg_node, 'qnode_ids'):
            continue
        if params['source_qnode_id'] in kg_node.qnode_ids:
            source_curies.add(kg_node.id)
            names_by_curie[kg_node.id] = kg_node.name
        if params['target_qnode_id'] in kg_node.qnode_ids:
            target_curies.add(kg_node.id)
            names_by_curie[kg_node.id] = kg_node.name

    any_edge_added = False
    for source_curie, target_curie in itertools.product(source_curies, target_curies):
        # create the edge attribute if it can be
        attribute = self.make_edge_attribute_from_curies(
            source_curie,
            target_curie,
            source_name=names_by_curie[source_curie],
            target_name=names_by_curie[target_curie],
            default=default,
            name=name)
        if not attribute:
            continue

        any_edge_added = True
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        label = params['virtual_relation_label']
        virtual_edge_id = f"{label}_{self.global_iter}"
        self.global_iter += 1
        virtual_edge = Edge(id=virtual_edge_id,
                            type=f"has_{name}_with",
                            relation=label,
                            source_id=source_curie,
                            target_id=target_curie,
                            is_defined_by="ARAX",
                            defined_datetime=timestamp,
                            provided_by="ARAX",
                            confidence=None,
                            weight=None,  # TODO: could make the actual value of the attribute
                            edge_attributes=[attribute],
                            qedge_ids=[label])
        self.message.knowledge_graph.edges.append(virtual_edge)

    # The QG needs one edge that corresponds to the virtual edges added above
    if any_edge_added:
        label = params['virtual_relation_label']
        # TODO: ok to make the id and type the same thing?
        virtual_qedge = QEdge(id=label,
                              type=f"has_{name}_with",
                              relation=label,
                              source_id=params['source_qnode_id'],
                              target_id=params['target_qnode_id'])
        self.message.query_graph.edges.append(virtual_qedge)
def fisher_exact_test(self):
    """
    Perform the Fisher's exact test to expand or decorate the knowledge graph.

    Parameters are read from ``self.parameters``:

    * ``source_qnode_id``, ``virtual_relation_label``, ``target_qnode_id``: required
    * ``rel_edge_id``: optional; restricts the KG edges considered to those
      fulfilling this query edge
    * ``top_n``: optional, cast to int; keep only the n smallest p-values
    * ``cutoff``: optional, cast to float; keep only p-values below it

    On success one virtual edge per (source node, kept target node) pair is
    appended to the message KG, each carrying the target's p-value as an
    edge attribute, and one matching QEdge is appended to the message QG.
    On any failure the problem is logged on ``self.response`` and the method
    returns early without adding edges.

    :return: response
    """

    def log_exception(summary):
        # Must be called from inside an ``except`` block: logs the active
        # traceback (tagged with the exception class name) plus a summary.
        tb = traceback.format_exc()
        error_type, _, _ = sys.exc_info()
        self.response.error(tb, error_code=error_type.__name__)
        self.response.error(summary)

    def report_worker_errors(results):
        # Workers hand back a list of messages instead of a value on failure;
        # a message is either a plain string or a (message, error_code) tuple.
        for failure in results:
            if not isinstance(failure, list):
                continue
            for msg in failure:
                if isinstance(msg, tuple):
                    self.response.error(msg[0], error_code=msg[1])
                else:
                    self.response.error(msg)

    self.response.info("Performing Fisher's Exact Test to add p-value to edge attribute of virtual edge")

    # check the input parameters
    if 'source_qnode_id' not in self.parameters:
        self.response.error("The argument 'source_qnode_id' is required for fisher_exact_test function")
        return self.response
    source_qnode_id = self.parameters['source_qnode_id']
    if 'virtual_relation_label' not in self.parameters:
        self.response.error("The argument 'virtual_relation_label' is required for fisher_exact_test function")
        return self.response
    virtual_relation_label = str(self.parameters['virtual_relation_label'])
    if 'target_qnode_id' not in self.parameters:
        self.response.error("The argument 'target_qnode_id' is required for fisher_exact_test function")
        return self.response
    target_qnode_id = self.parameters['target_qnode_id']
    rel_edge_id = self.parameters['rel_edge_id'] if 'rel_edge_id' in self.parameters else None
    top_n = int(self.parameters['top_n']) if 'top_n' in self.parameters else None
    cutoff = float(self.parameters['cutoff']) if 'cutoff' in self.parameters else None

    # initialize some variables
    nodes_info = {}
    edge_expand_kp = []       # is_defined_by of every KG edge used
    source_node_list = []     # KG node ids playing the source role
    target_node_dict = {}     # target KG node id -> set of connected source node ids
    size_of_target = {}       # target KG node id -> number of adjacent nodes in the KP
    source_node_exist = False
    target_node_exist = False
    query_edge_id = set()
    rel_edge_type = set()
    source_node_type = None
    target_node_type = None

    ## Check if source_qnode_id and target_qnode_id are in the Query Graph
    try:
        if len(self.message.query_graph.nodes) == 0:
            self.response.error("There is no query node in QG")
            return self.response
        for node in self.message.query_graph.nodes:
            if node.id == source_qnode_id:
                source_node_exist = True
                source_node_type = node.type
            elif node.id == target_qnode_id:
                target_node_exist = True
                target_node_type = node.type
    except Exception:
        log_exception("Something went wrong with retrieving nodes in message QG")
        return self.response

    if not source_node_exist:
        self.response.error(f"No query node with source qnode id {source_qnode_id} detected in QG for Fisher's Exact Test")
        return self.response
    if not target_node_exist:
        self.response.error(f"No query node with target qnode id {target_qnode_id} detected in QG for Fisher's Exact Test")
        return self.response

    ## Check if there is a query edge connected to both source_qnode_id and target_qnode_id in the Query Graph
    try:
        if len(self.message.query_graph.edges) == 0:
            self.response.error("There is no query edge in Query Graph")
            return self.response
        for edge in self.message.query_graph.edges:
            forward = edge.source_id == source_qnode_id and edge.target_id == target_qnode_id
            backward = edge.source_id == target_qnode_id and edge.target_id == source_qnode_id
            # only actual query edges (no relation set) are added
            if (forward or backward) and edge.relation is None:
                query_edge_id.add(edge.id)
    except Exception:
        log_exception("Something went wrong with retrieving edges in message QG")
        return self.response

    if len(query_edge_id) == 0:
        self.response.error(f"No query edge connected to both source node with qnode id {source_qnode_id} and target node with qnode id {target_qnode_id} detected in QG for Fisher's Exact Test")
        return self.response
    if rel_edge_id and rel_edge_id not in query_edge_id:
        self.response.error(f"No query edge with qedge id {rel_edge_id} connected to both source node with qnode id {source_qnode_id} and target node with qnode id {target_qnode_id} detected in QG for Fisher's Exact Test")
        return self.response

    ## loop over all nodes in KG and collect their node information
    try:
        for count, node in enumerate(self.message.knowledge_graph.nodes):
            nodes_info[node.id] = {'count': count, 'qnode_ids': node.qnode_ids, 'type': node.type[0], 'edge_index': []}
    except Exception:
        log_exception("Something went wrong with retrieving nodes in message KG")
        return self.response

    def record_pair(edge, source_curie, target_curie):
        # Register one usable KG edge between a source-role and a target-role node.
        edge_expand_kp.append(edge.is_defined_by)
        source_node_list.append(source_curie)
        target_node_dict.setdefault(target_curie, set()).add(source_curie)

    ## loop over all edges in KG and create source node list and target node dict based on source_qnode_id,
    ## target_qnode_id as well as rel_edge_id (optional, otherwise all edges are considered)
    try:
        for count, edge in enumerate(self.message.knowledge_graph.edges):
            if edge.provided_by == "ARAX":
                continue  # skip edges ARAX itself added
            ## record edge position in message.knowledge_graph
            nodes_info[edge.source_id]['edge_index'].append(count)
            nodes_info[edge.target_id]['edge_index'].append(count)
            source_end_qnode_ids = nodes_info[edge.source_id]['qnode_ids']
            if rel_edge_id:
                if rel_edge_id in edge.qedge_ids:
                    rel_edge_type.add(edge.type)
                    if source_qnode_id in source_end_qnode_ids:
                        record_pair(edge, edge.source_id, edge.target_id)
                    else:
                        # edge is stored in the opposite direction to the query
                        record_pair(edge, edge.target_id, edge.source_id)
            elif source_qnode_id in source_end_qnode_ids:
                if target_qnode_id in nodes_info[edge.target_id]['qnode_ids']:
                    record_pair(edge, edge.source_id, edge.target_id)
            elif target_qnode_id in source_end_qnode_ids:
                if source_qnode_id in nodes_info[edge.target_id]['qnode_ids']:
                    record_pair(edge, edge.target_id, edge.source_id)
    except Exception:
        log_exception("Something went wrong with retrieving edges in message KG")
        return self.response

    source_node_list = list(set(source_node_list))  ## remove the duplicate source node id

    ## check if there is no source node in message KG
    if len(source_node_list) == 0:
        self.response.error("No source node found in message KG for Fisher's Exact Test")
        return self.response

    ## check if there is no target node in message KG
    if len(target_node_dict) == 0:
        self.response.error("No target node found in message KG for Fisher's Exact Test")
        return self.response

    ## the node type of both query nodes must be specified in the Query Graph
    if source_node_type is None:
        self.response.error(f"Source node with qnode id {source_qnode_id} was set to None in Query Graph. Please specify the node type")
        return self.response
    if target_node_type is None:
        self.response.error(f"Target node with qnode id {target_qnode_id} was set to None in Query Graph. Please specify the node type")
        return self.response

    ## check how many kps were used in message KG. If more than one, the one with the max number of edges
    ## connected to both source nodes and target nodes is used
    occurrences = collections.Counter(edge_expand_kp)
    if len(occurrences) == 1:
        kp = edge_expand_kp[0]
    else:
        # On a tie, the provider that comes last in the Counter (i.e. was first
        # seen latest) wins, because the (count, position) tuples are compared.
        max_index = max((kp_count, position) for position, kp_count in enumerate(occurrences.values()))[1]
        kp = list(occurrences.keys())[max_index]
        self.response.debug(f"{occurrences}")
        self.response.warning(f"More than one knowledge provider was detected to be used for expanding the edges connected to both source node with qnode id {source_qnode_id} and target node with qnode id {target_qnode_id}")
        self.response.warning(f"The knowledge provider {kp} was used to calculate Fisher's exact test because it has the maximum number of edges both source node with qnode id {source_qnode_id} and target node with qnode id {target_qnode_id}")

    ## Print out some information used to calculate FET
    if len(source_node_list) == 1:
        self.response.debug(f"{len(source_node_list)} source node with qnode id {source_qnode_id} and node type {source_node_type} was found in message KG and used to calculate Fisher's Exact Test")
    else:
        self.response.debug(f"{len(source_node_list)} source nodes with qnode id {source_qnode_id} and node type {source_node_type} was found in message KG and used to calculate Fisher's Exact Test")
    if len(target_node_dict) == 1:
        self.response.debug(f"{len(target_node_dict)} target node with qnode id {target_qnode_id} and node type {target_node_type} was found in message KG and used to calculate Fisher's Exact Test")
    else:
        self.response.debug(f"{len(target_node_dict)} target nodes with qnode id {target_qnode_id} and node type {target_node_type} was found in message KG and used to calculate Fisher's Exact Test")

    # find all nodes with the same type of 'source_qnode_id' nodes in specified KP
    # ('ARAX/KG1','ARAX/KG2','BTE') that are adjacent to target nodes.
    # First decide which relation type (if any) restricts that search.
    if rel_edge_id and len(rel_edge_type) == 1:
        # the edges with rel_edge_id have exactly one type: use it
        rel_type = list(rel_edge_type)[0]
        self.response.debug(f"{kp} and edge relation type {rel_type} were used to calculate total adjacent nodes in Fisher's Exact Test")
    else:
        rel_type = None
        if rel_edge_id:
            # the edges with rel_edge_id do not share a single type: ignore the edge type
            self.response.warning(f"The edges with specified qedge id {rel_edge_id} have more than one type, we ignore the edge type and use all types to calculate Fisher's Exact Test")
        self.response.debug(f"{kp} was used to calculate total adjacent nodes in Fisher's Exact Test")

    if kp == "ARAX/KG1":
        # query adjacent nodes in one DSL command by providing a list of query nodes
        result = self.query_size_of_adjacent_nodes(node_curie=list(target_node_dict.keys()),
                                                   adjacent_type=source_node_type,
                                                   kp=kp,
                                                   rel_type=rel_type,
                                                   use_cypher_command=True)
        if result is None:
            return self.response  ## Something wrong happened for querying the adjacent nodes
        size_of_target = result
    else:
        # query adjacent nodes for the target nodes one by one in parallel
        parameter_list = [(node, source_node_type, kp, rel_type) for node in target_node_dict]
        try:
            # leaving the ``with`` block shuts the pool down; map() has
            # already returned every result by then
            with multiprocessing.Pool() as executor:
                target_count_res = list(executor.map(self._query_size_of_adjacent_nodes_parallel, parameter_list))
        except Exception:
            log_exception("Something went wrong with querying adjacent nodes in parallel")
            return self.response

        if any(isinstance(elem, list) for elem in target_count_res):
            report_worker_errors(target_count_res)
            return self.response  ## Something wrong happened for querying the adjacent nodes
        # map() preserves order, so results line up with target_node_dict
        for node, adjacent_count in zip(target_node_dict, target_count_res):
            size_of_target[node] = adjacent_count

    ## Based on KP detected in message KG, find the total number of node with the same type of source node.
    ## size_of_given_type_in_KP returning None means the lookup itself failed: stop.
    if kp == 'ARAX/KG1':
        ## Try cypher query first
        size_of_total = self.size_of_given_type_in_KP(node_type=source_node_type, use_cypher_command=True, kg='KG1')
        if size_of_total is None:
            return self.response
        if size_of_total != 0:
            self.response.debug("ARAX/KG1 and cypher query were used to calculate total number of node with the same type of source node in Fisher's Exact Test")
            self.response.debug(f"Total {size_of_total} nodes with node type {source_node_type} was found in ARAX/KG1")
        else:
            ## If cypher query finds nothing, then try kgNodeIndex
            size_of_total = self.size_of_given_type_in_KP(node_type=source_node_type, use_cypher_command=False, kg='KG1')
            if size_of_total is None:
                # Without this guard a failed lookup would reach the
                # contingency-table arithmetic below and raise a TypeError.
                return self.response
            if size_of_total == 0:
                self.response.error(f"KG1 has 0 node with the same type of source node with qnode id {source_qnode_id}")
                return self.response
            self.response.debug("ARAX/KG1 and kgNodeIndex were used to calculate total number of node with the same type of source node in Fisher's Exact Test")
            self.response.debug(f"Total {size_of_total} nodes with node type {source_node_type} was found in ARAX/KG1")
    elif kp == 'ARAX/KG2':
        ## check KG1 first as KG2 might have many duplicates. If KG1 is 0, then check KG2
        size_of_total = self.size_of_given_type_in_KP(node_type=source_node_type, use_cypher_command=True, kg='KG1')
        if size_of_total is None:
            return self.response
        if size_of_total != 0:
            self.response.warning(f"Although ARAX/KG2 was found to have the maximum number of edges connected to both {source_qnode_id} and {target_qnode_id}, ARAX/KG1 and cypher query were used to find the total number of nodes with the same type of source node with qnode id {source_qnode_id} as KG2 might have many duplicates")
            self.response.debug(f"Total {size_of_total} nodes with node type {source_node_type} was found in ARAX/KG1")
        else:
            ## If cypher query finds nothing, then try kgNodeIndex
            size_of_total = self.size_of_given_type_in_KP(node_type=source_node_type, use_cypher_command=False, kg='KG1')
            if size_of_total is None:
                return self.response
            if size_of_total != 0:
                self.response.warning(f"Although ARAX/KG2 was found to have the maximum number of edges connected to both {source_qnode_id} and {target_qnode_id}, ARAX/KG1 and kgNodeIndex were used to find the total number of nodes with the same type of source node with qnode id {source_qnode_id} as KG2 might have many duplicates")
                self.response.debug(f"Total {size_of_total} nodes with node type {source_node_type} was found in ARAX/KG1")
            else:
                size_of_total = self.size_of_given_type_in_KP(node_type=source_node_type, use_cypher_command=False, kg='KG2')
                if size_of_total is None:
                    return self.response
                if size_of_total == 0:
                    self.response.error(f"KG2 has 0 node with the same type of source node with qnode id {source_qnode_id}")
                    return self.response
                self.response.debug("ARAX/KG2 and kgNodeIndex were used to calculate total number of node with the same type of source node in Fisher's Exact Test")
                self.response.debug(f"Total {size_of_total} nodes with node type {source_node_type} was found in ARAX/KG2")
    else:
        self.response.error("Only KG1 or KG2 is allowable to calculate the Fisher's exact test temporally")
        return self.response

    size_of_query_sample = len(source_node_list)

    self.response.debug("Computing Fisher's Exact Test P-value")
    # calculate FET p-value for each target node in parallel; each tuple is
    # (target node, then the four cells of the 2x2 contingency table)
    parameter_list = []
    for node in target_node_dict:
        in_sample = len(target_node_dict[node])
        parameter_list.append((node,
                               in_sample,
                               size_of_target[node] - in_sample,
                               size_of_query_sample - in_sample,
                               (size_of_total - size_of_target[node]) - (size_of_query_sample - in_sample)))

    try:
        with multiprocessing.Pool() as executor:
            FETpvalue_list = list(executor.map(self._calculate_FET_pvalue_parallel, parameter_list))
    except Exception:
        log_exception("Something went wrong with computing Fisher's Exact Test P-value")
        return self.response

    if any(isinstance(elem, list) for elem in FETpvalue_list):
        report_worker_errors(FETpvalue_list)
        return self.response

    # sort ascending by p-value, then apply the optional filters
    output = dict(sorted(dict(FETpvalue_list).items(), key=lambda item: item[1]))
    if cutoff:
        output = {node: pvalue for node, pvalue in output.items() if pvalue < cutoff}
    if top_n:
        output = dict(list(output.items())[:top_n])

    # add the virtual edge with FET result to message KG
    self.response.debug("Adding virtual edge with FET result to message KG")
    virtual_edge_list = []
    for adj in target_node_dict:
        if adj not in output:
            continue  # filtered out by cutoff / top_n
        for node in target_node_dict[adj]:
            index = len(virtual_edge_list) + 1  # edge ids are numbered from 1
            virtual_edge_list.append(
                Edge(id=f"{virtual_relation_label}_{index}",
                     type='has_fisher_exact_test_p-value_with',
                     relation=virtual_relation_label,
                     source_id=node,
                     target_id=adj,
                     is_defined_by="ARAX",
                     defined_datetime=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                     provided_by="ARAX",
                     confidence=None,
                     weight=None,
                     edge_attributes=[EdgeAttribute(type="data:1669",
                                                    name="fisher_exact_test_p-value",
                                                    value=str(output[adj]),
                                                    url=None)],
                     qedge_ids=[virtual_relation_label]))

    self.message.knowledge_graph.edges.extend(virtual_edge_list)

    count = len(virtual_edge_list)
    self.response.debug(f"{count} new virtual edges were added to message KG")

    # add the virtual edge to message QG
    if count > 0:
        self.response.debug("Adding virtual edge to message QG")
        q_edge = QEdge(id=virtual_relation_label,
                       type="has_fisher_exact_test_p-value_with",
                       relation=virtual_relation_label,
                       source_id=source_qnode_id,
                       target_id=target_qnode_id)
        self.message.query_graph.edges.append(q_edge)
        self.response.debug("One virtual edge was added to message QG")

    return self.response
def _add_virtual_edges(self, source_qnode_id, target_qnode_id):
    """
    Add ICEES exposures data as virtual edges between nodes with the specified qnode IDs.

    Exactly one virtual edge is appended to the knowledge graph for every (source, target) node pair: it
    carries the ICEES p-value when one could be obtained for any combination of the pair's accepted
    synonyms, and a p-value of None ('empty' edge, see #1009) otherwise. A corresponding QEdge is then
    appended to the query graph.

    :param source_qnode_id: ID of the query-graph node whose KG nodes act as edge sources
    :param target_qnode_id: ID of the query-graph node whose KG nodes act as edge targets
    :return: None (the message's knowledge graph and query graph are modified in place)
    """
    knowledge_graph = self.message.knowledge_graph
    log = self.response
    nodes_by_qg_id = self._get_nodes_by_qg_id(knowledge_graph)
    # .get() yields None when no KG node fulfils a qnode ID; treat that as "no curies" instead of crashing
    source_curies = set(nodes_by_qg_id.get(source_qnode_id) or ())
    target_curies = set(nodes_by_qg_id.get(target_qnode_id) or ())

    # Look up the accepted synonyms once per curie rather than once per node pair; an empty/falsy
    # result means ICEES does not 'know' about that curie
    source_synonyms_by_curie = {curie: self._get_accepted_synonyms(curie) for curie in source_curies}
    target_synonyms_by_curie = {curie: self._get_accepted_synonyms(curie) for curie in target_curies}

    num_node_pairs_recognized = 0
    for source_curie, target_curie in itertools.product(source_curies, target_curies):
        p_value = None
        accepted_source_synonyms = source_synonyms_by_curie[source_curie]
        accepted_target_synonyms = target_synonyms_by_curie[target_curie]
        # Query ICEES only for synonyms it 'knows' about
        if accepted_source_synonyms and accepted_target_synonyms:
            for source_synonym, target_synonym in itertools.product(accepted_source_synonyms,
                                                                    accepted_target_synonyms):
                qedge = QEdge(id=f"icees_{source_synonym}--{target_synonym}",
                              source_id=source_synonym,
                              target_id=target_synonym)
                log.debug(f"Sending query to ICEES+ for {source_synonym}--{target_synonym}")
                p_value = self._get_icees_p_value_for_edge(qedge, log)
                if p_value is not None:
                    break  # Don't worry about checking remaining synonym combos if we got results
        if p_value is not None:
            num_node_pairs_recognized += 1
        # A p-value of None here produces the 'empty' virtual edge for pairs without any results (#1009);
        # pairs with data get only the real edge, never an additional empty one
        virtual_edge = self._create_icees_virtual_edge(source_curie, target_curie, p_value)
        knowledge_graph.edges.append(virtual_edge)

    # Add a qedge to the query graph that corresponds to our new virtual edges
    new_qedge = QEdge(id=self.virtual_relation_label,
                      source_id=source_qnode_id,
                      target_id=target_qnode_id,
                      type=self.icees_edge_type)
    self.message.query_graph.edges.append(new_qedge)

    if num_node_pairs_recognized:
        log.info(f"ICEES+ returned data for {num_node_pairs_recognized} node pairs")
    else:
        log.warning(f"Could not find ICEES+ exposures data for any {source_qnode_id}--{target_qnode_id} node pairs")
def add_qedge(self, message, input_parameters, describe=False):
    """
    Adds a new QEdge object to the QueryGraph inside the Message object

    :param message: Message object whose query_graph receives the new QEdge (a QueryGraph is created
        when the message has none); not used when describe is True
    :param input_parameters: dict with any of the keys 'id', 'source_id', 'target_id', 'type';
        'source_id' and 'target_id' are required and must match the id of an existing QNode
    :param describe: when True, no edge is added and a dict documenting the allowable parameters is
        returned instead of a Response
    :return: Response object with execution information
    :rtype: Response
    """

    # #### Internal documentation setup
    allowable_parameters = {
        'id': {
            'Any string that is unique among all QEdge id fields, with recommended format e00, e01, e02, etc.'
        },
        'source_id': {
            'id of the source QNode already present in the QueryGraph (e.g. n01, n02)'
        },
        'target_id': {
            'id of the target QNode already present in the QueryGraph (e.g. n01, n02)'
        },
        'type': {
            'Any valid Translator/BioLink relationship type (e.g. physically_interacts_with, participates_in)'
        },
    }
    if describe:
        # can't get this name at run-time, need to manually put it in per https://www.python.org/dev/peps/pep-3130/
        allowable_parameters['dsl_command'] = '`add_qedge()`'
        allowable_parameters['brief_description'] = (
            "The `add_qedge` method adds an additional QEdge to the QueryGraph in the Message object. "
            "Currently source_id and target_id QNodes must already be present in the QueryGraph. \n"
            "The specified type is not currently checked that it is a valid Translator/BioLink "
            "relationship type, but it should be."
        )
        return allowable_parameters

    #### Define a default response
    response = Response()
    self.response = response
    self.message = message

    #### Basic checks on arguments
    if not isinstance(input_parameters, dict):
        response.error("Provided parameters is not a dict", error_code="ParametersNotDict")
        return response

    #### Define a complete set of allowed parameters and their defaults
    parameters = {
        'id': None,
        'source_id': None,
        'target_id': None,
        'type': None,
    }

    #### Loop through the input_parameters and override the defaults and make sure they are allowed
    for key, value in input_parameters.items():
        if key not in parameters:
            response.error(f"Supplied parameter {key} is not permitted", error_code="UnknownParameter")
        else:
            parameters[key] = value

    #### Return if any of the parameters generated an error (showing not just the first one)
    if response.status != 'OK':
        return response

    #### Store these final parameters for convenience
    response.data['parameters'] = parameters
    self.parameters = parameters

    response.info(f"Adding a QueryEdge to Message with parameters {parameters}")

    #### Make sure there's a query_graph already here
    if message.query_graph is None:
        message.query_graph = QueryGraph()
        message.query_graph.nodes = []
        message.query_graph.edges = []
    if message.query_graph.edges is None:
        message.query_graph.edges = []

    #### Create a QEdge, using the supplied id or else the next free one
    qedge = QEdge()
    if parameters['id'] is not None:
        qedge.id = parameters['id']
    else:
        qedge.id = self.__get_next_free_edge_id()

    #### Get the set of available node ids (a QueryGraph whose nodes is None simply has no nodes)
    known_qnode_ids = {qnode.id for qnode in (message.query_graph.nodes or [])}

    #### Add the source_id and then the target_id; both are required and must refer to existing QNodes
    for endpoint, code_stem in (('source_id', 'SourceId'), ('target_id', 'TargetId')):
        qnode_id = parameters[endpoint]
        if qnode_id is None:
            response.error(f"While trying to add QEdge, {endpoint} is a required parameter",
                           error_code=f"Missing{code_stem}")
            return response
        if qnode_id not in known_qnode_ids:
            response.error(f"While trying to add QEdge, there is no QNode with id {qnode_id}",
                           error_code=f"Unknown{code_stem}")
            return response
        setattr(qedge, endpoint, qnode_id)

    #### Add the type if any. Need to verify it's an allowed type. FIXME
    if parameters['type'] is not None:
        qedge.type = parameters['type']

    #### Add it to the query_graph edge list
    message.query_graph.edges.append(qedge)

    #### Return the response
    return response
def compute_ngd(self):
    """
    Iterate over all the edges in the knowledge graph, compute the normalized google distance and stick that info
    on the edge_attributes

    When 'virtual_relation_label' is among the parameters, NGD is instead computed for every pair of nodes
    fulfilling 'source_qnode_id' / 'target_qnode_id'; each pair gets a new virtual edge in the knowledge graph
    and one matching QEdge is added to the query graph.

    The fast local lookup (calculate_ngd_fast) is tried first on the canonicalized curies; when it returns
    None the slower self.NGD.get_ngd_for_all fallback is used. A non-finite NGD is recorded as
    parameters['default_value'] for that pair only.

    :default: The default value to set for NGD if it returns a nan
    :return: response
    """
    if self.response.status != 'OK':  # Catches any errors that may have been logged during initialization
        self._close_database()
        return self.response
    parameters = self.parameters
    self.response.debug(f"Computing NGD")
    self.response.info(f"Computing the normalized Google distance: weighting edges based on source/target node "
                       f"co-occurrence frequency in PubMed abstracts")

    self.response.info("Converting CURIE identifiers to human readable names")
    node_curie_to_name = dict()
    try:
        for node in self.message.knowledge_graph.nodes:
            node_curie_to_name[node.id] = node.name
    except Exception:
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        self.response.error(f"Something went wrong when converting names")
        self.response.error(tb, error_code=error_type.__name__)

    attribute_name = "normalized_google_distance"
    attribute_type = "EDAM:data_2526"
    default_value = self.parameters['default_value']
    url = "https://arax.rtx.ai/api/rtx/v1/ui/#/PubmedMeshNgd"

    def build_ngd_attribute(curie_map, source_curie, target_curie, source_name, target_name):
        # Returns (NGD EdgeAttribute for this pair, True if the slow eUtils fallback had to be used)
        canonical_source_curie = curie_map.get(source_curie, source_curie)
        canonical_target_curie = curie_map.get(target_curie, target_curie)
        ngd_value = self.calculate_ngd_fast(canonical_source_curie, canonical_target_curie)
        used_slow = ngd_value is None
        if used_slow:
            ngd_value = self.NGD.get_ngd_for_all([source_curie, target_curie], [source_name, target_name])
            self.response.debug(f"Had to use eUtils to compute NGD between {source_name} "
                                f"({canonical_source_curie}) and {target_name} ({canonical_target_curie}). "
                                f"Value is: {ngd_value}")
        # Decide per pair: a non-finite NGD falls back to the default rather than to whatever value the
        # previous pair happened to have
        value = ngd_value if np.isfinite(ngd_value) else default_value
        attribute = EdgeAttribute(type=attribute_type, name=attribute_name, value=str(value), url=url)
        return attribute, used_slow

    # if you want to add virtual edges, identify the source/targets, decorate the edges, add them to the KG,
    # and then add one to the QG corresponding to them
    if 'virtual_relation_label' in parameters:
        source_curies_to_decorate = set()
        target_curies_to_decorate = set()
        curies_to_names = dict()
        # identify the nodes that we should be adding virtual edges for
        for node in self.message.knowledge_graph.nodes:
            if hasattr(node, 'qnode_ids'):
                if parameters['source_qnode_id'] in node.qnode_ids:
                    source_curies_to_decorate.add(node.id)
                    curies_to_names[node.id] = node.name
                if parameters['target_qnode_id'] in node.qnode_ids:
                    target_curies_to_decorate.add(node.id)
                    curies_to_names[node.id] = node.name

        # Convert these curies to their canonicalized curies (needed for the local NGD system)
        canonicalized_curie_map = self._get_canonical_curies_map(
            list(source_curies_to_decorate.union(target_curies_to_decorate)))
        self.load_curie_to_pmids_data(canonicalized_curie_map.values())
        added_flag = False  # check to see if any edges where added
        num_computed_total = 0
        num_computed_slow = 0
        self.response.debug(f"Looping through node pairs and calculating NGD values")
        # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute
        for (source_curie, target_curie) in itertools.product(source_curies_to_decorate,
                                                              target_curies_to_decorate):
            source_name = curies_to_names[source_curie]
            target_name = curies_to_names[target_curie]
            num_computed_total += 1
            edge_attribute, used_slow = build_ngd_attribute(canonicalized_curie_map, source_curie, target_curie,
                                                            source_name, target_name)
            if used_slow:
                num_computed_slow += 1
            if edge_attribute:
                added_flag = True
                # make the edge, add the attribute
                relation = parameters['virtual_relation_label']
                edge_id = f"{relation}_{self.global_iter}"
                self.global_iter += 1
                edge = Edge(id=edge_id,
                            type="has_normalized_google_distance_with",
                            relation=relation,
                            source_id=source_curie,
                            target_id=target_curie,
                            is_defined_by="ARAX",
                            defined_datetime=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                            provided_by="ARAX",
                            confidence=None,
                            weight=None,  # TODO: could make the actual value of the attribute
                            edge_attributes=[edge_attribute],
                            qedge_ids=[relation])
                self.message.knowledge_graph.edges.append(edge)

        # Now add a q_edge the query_graph since I've added an extra edge to the KG
        if added_flag:
            relation = parameters['virtual_relation_label']
            q_edge = QEdge(id=relation,
                           type="has_normalized_google_distance_with",
                           relation=relation,
                           source_id=parameters['source_qnode_id'],
                           target_id=parameters['target_qnode_id'])
            self.message.query_graph.edges.append(q_edge)

            self.response.info(f"NGD values successfully added to edges")
            # added_flag implies at least one pair was processed, so the division is safe here
            num_computed_fast = num_computed_total - num_computed_slow
            percent_computed_fast = round((num_computed_fast / num_computed_total) * 100)
            self.response.debug(f"Used fastNGD for {percent_computed_fast}% of edges "
                                f"({num_computed_fast} of {num_computed_total})")
    else:  # you want to add it for each edge in the KG
        # iterate over KG edges, add the information
        num_computed_total = 0
        num_computed_slow = 0
        try:
            # Map all nodes to their canonicalized curies in one batch (need canonical IDs for the local NGD system)
            canonicalized_curie_map = self._get_canonical_curies_map(
                [node.id for node in self.message.knowledge_graph.nodes])
            self.load_curie_to_pmids_data(canonicalized_curie_map.values())
            self.response.debug(f"Looping through edges and calculating NGD values")
            for edge in self.message.knowledge_graph.edges:
                # Make sure the edge_attributes are not None
                if not edge.edge_attributes:
                    edge.edge_attributes = []  # should be an array, but why not a list?
                # now go and actually get the NGD
                source_curie = edge.source_id
                target_curie = edge.target_id
                source_name = node_curie_to_name[source_curie]
                target_name = node_curie_to_name[target_curie]
                num_computed_total += 1
                ngd_edge_attribute, used_slow = build_ngd_attribute(canonicalized_curie_map, source_curie,
                                                                    target_curie, source_name, target_name)
                if used_slow:
                    num_computed_slow += 1
                edge.edge_attributes.append(ngd_edge_attribute)  # append it to the list of attributes
        except Exception:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(f"Something went wrong adding the NGD edge attributes")
        else:
            self.response.info(f"NGD values successfully added to edges")
            num_computed_fast = num_computed_total - num_computed_slow
            # A knowledge graph without edges must not trigger a division by zero
            if num_computed_total:
                percent_computed_fast = round((num_computed_fast / num_computed_total) * 100)
            else:
                percent_computed_fast = 0
            self.response.debug(f"Used fastNGD for {percent_computed_fast}% of edges "
                                f"({num_computed_fast} of {num_computed_total})")

    self._close_database()
    return self.response
def copy_qedge(old_qedge: QEdge) -> QEdge:
    """
    Build a fresh QEdge carrying the same property values as old_qedge.

    The properties copied are the keys reported by a new QEdge's to_dict(); each value is transferred by
    reference (no deep copy is made).

    :param old_qedge: the QEdge to duplicate
    :return: a new QEdge whose properties mirror old_qedge
    """
    duplicate = QEdge()
    # to_dict() is evaluated once, up front, and only its keys (the property names) are used
    property_names = duplicate.to_dict()
    for property_name in property_names:
        setattr(duplicate, property_name, getattr(old_qedge, property_name))
    return duplicate
def answer(source_node_ID, target_node_type, association_node_type, use_json=False, threshold=0.2, n=20):
    """
    Answers the question what X are similar to Y based on overlap of common Z nodes. X is target_node_type,
    Y is source_node_ID, Z is association_node_type. The relationships are automatically determined in
    SimilarNodesInCommon by looking for 1 hop relationships and poping the FIRST one (you are warned).

    If SimilarNodesInCommon reports an error, the error is printed (plain text, or as an error response when
    use_json is set) and nothing else is done.

    :param source_node_ID: actual name in the KG
    :param target_node_type: kinds of nodes you want returned
    :param association_node_type: kind of node you are computing the Jaccard overlap on
    :param use_json: print the results in standardized format
    :param threshold: only return results where jaccard is >= this threshold
    :param n: number of results to keep (default 20); the source node itself is dropped afterwards, so
        fewer than n results may be reported
    :return: None (results are printed, either as plain text or as the standardized response)
    """
    # Initialize the response class
    response = FormatOutput.FormatResponse(5)
    # add the column names for the row data
    response.message.table_column_names = ["source name", "source ID", "target name", "target ID", "Jaccard index"]

    # Initialize the similar nodes class
    similar_nodes_in_common = SimilarNodesInCommon.SimilarNodesInCommon()

    # get the description
    source_node_description = RU.get_node_property(source_node_ID, 'name')

    # get the source node label
    source_node_label = RU.get_node_property(source_node_ID, 'label')

    # Get the nodes in common
    node_jaccard_tuples_sorted, error_code, error_message = \
        similar_nodes_in_common.get_similar_nodes_in_common_source_target_association(
            source_node_ID, target_node_type, association_node_type, threshold)

    # check for an error BEFORE touching the results, which may not be a usable list when the lookup failed
    if error_code is not None or error_message is not None:
        if not use_json:
            print(error_message)
        else:
            response.add_error_message(error_code, error_message)
            response.print()
        return

    # reduce to the top n, then make sure that the input node isn't in the list
    if len(node_jaccard_tuples_sorted) > n:
        node_jaccard_tuples_sorted = node_jaccard_tuples_sorted[0:n]
    node_jaccard_tuples_sorted = [i for i in node_jaccard_tuples_sorted if i[0] != source_node_ID]

    #### If use_json not specified, then return results as a fairly plain list
    if not use_json:
        pieces = ["The %s's involving similar %ss as %s are: \n" % (
            target_node_type, association_node_type, source_node_description)]
        for other_disease_ID, jaccard in node_jaccard_tuples_sorted:
            pieces.append("%s\t%s\tJaccard %f\n" % (
                other_disease_ID, RU.get_node_property(other_disease_ID, 'name'), jaccard))
        print("".join(pieces))

    #### Else if use_json requested, return the results in the Translator standard API JSON format
    else:
        #### Create the QueryGraph for this type of question
        query_graph = QueryGraph()
        source_node = QNode()
        source_node.id = "n00"
        source_node.curie = source_node_ID
        source_node.type = source_node_label
        association_node = QNode()
        association_node.id = "n01"
        association_node.type = association_node_type
        association_node.is_set = True
        target_node = QNode()
        target_node.id = "n02"
        target_node.type = target_node_type
        query_graph.nodes = [source_node, association_node, target_node]

        # the edge types are left unset (they are not known at this point)
        edge1 = QEdge()
        edge1.id = "en00-n01"
        edge1.source_id = "n00"
        edge1.target_id = "n01"
        edge2 = QEdge()
        edge2.id = "en01-n02"
        edge2.source_id = "n01"
        edge2.target_id = "n02"
        query_graph.edges = [edge1, edge2]

        #### DONT Suppress the query_graph because we can now do the knowledge_map with v0.9.1
        response.message.query_graph = query_graph

        #### Create a mapping dict with the source curie and node types and edge types. This dict is used for
        #### reverse lookups by type for mapping to the QueryGraph. There is a potential point of failure here
        #### if there are duplicate node or edge types. FIXME
        response._type_map = dict()
        response._type_map[source_node.curie] = source_node.id
        response._type_map[association_node.type] = association_node.id
        response._type_map[target_node.type] = target_node.id
        response._type_map["e" + edge1.source_id + "-" + edge1.target_id] = edge1.id
        response._type_map["e" + edge2.source_id + "-" + edge2.target_id] = edge2.id

        #### Extract the sorted IDs from the list of tuples
        node_jaccard_ID_sorted = [node_id for node_id, jac in node_jaccard_tuples_sorted]

        # get the entire subgraph
        g = RU.return_subgraph_through_node_labels(source_node_ID, source_node_label, node_jaccard_ID_sorted,
                                                   target_node_type, [association_node_type], with_rel=[],
                                                   directed=False, debug=False)

        # extract the source_node_number
        for node, data in g.nodes(data=True):
            if data['properties']['id'] == source_node_ID:
                source_node_number = node
                break

        # Get all the target numbers
        target_id2numbers = dict()
        node_jaccard_ID_sorted_set = set(node_jaccard_ID_sorted)
        for node, data in g.nodes(data=True):
            if data['properties']['id'] in node_jaccard_ID_sorted_set:
                target_id2numbers[data['properties']['id']] = node

        for other_disease_ID, jaccard in node_jaccard_tuples_sorted:
            target_name = RU.get_node_property(other_disease_ID, 'name')
            to_print = "The %s %s involves similar %ss as %s with similarity value %f" % (
                target_node_type, target_name, association_node_type, source_node_description, jaccard)

            # get all the shortest paths between source and target
            all_paths = nx.all_shortest_paths(g, source_node_number, target_id2numbers[other_disease_ID])

            # get all the nodes on these paths
            rel_nodes = {node for path in all_paths for node in path}

            if rel_nodes:
                # extract the relevant subgraph
                sub_g = nx.subgraph(g, rel_nodes)

                # add it to the response
                res = response.add_subgraph(sub_g.nodes(data=True), sub_g.edges(data=True), to_print, jaccard,
                                            return_result=True)
                res.essence = "%s" % target_name  # populate with essence of question result
                res.essence_type = target_node_type
                row_data = []  # initialize the row data
                row_data.append("%s" % source_node_description)
                row_data.append("%s" % source_node_ID)
                row_data.append("%s" % target_name)
                row_data.append("%s" % other_disease_ID)
                row_data.append("%f" % jaccard)
                res.row_data = row_data
        response.print()
def answer(self, source_name, target_label, relationship_type, use_json=False, directed=False):
    """
    Answer a question of the type "What proteins does drug X target" in its general form: which nodes of
    type <target_label> are linked to the grounded node <source_name> through <relationship_type>. A direct
    one-hop path is tried first; when that yields nothing, a path through one 'subclass_of' intermediate
    node is tried.

    :param source_name: KG neo4j node name (eg "carbetocin")
    :param target_label: KG node label (eg. "protein")
    :param relationship_type: KG relationship type (eg. "physically_interacts_with")
    :param use_json: If the answer should be in the standardized JSON API output format
    :param directed: forwarded to RU.return_subgraph_paths_of_type
    :return: with use_json False, a list of dicts (keys 'type', 'name', 'desc', 'prob') describing the
        nodes found; with use_json True, the populated response object. An error response object is returned
        when no path exists or when more than one intermediate node is found.
    """
    # Get label/kind of node the source is
    source_label = RU.get_node_property(source_name, "label")

    # Get the subgraph (all targets along relationship), falling back to a path through 'subclass_of'
    has_intermediate_node = False
    try:
        g = RU.return_subgraph_paths_of_type(source_name, source_label, None, target_label,
                                             [relationship_type], directed=directed)
    except CustomExceptions.EmptyCypherError:
        has_intermediate_node = True
        try:
            g = RU.return_subgraph_paths_of_type(source_name, source_label, None, target_label,
                                                 ['subclass_of', relationship_type], directed=directed)
        except CustomExceptions.EmptyCypherError:
            error_message = "No path between %s and %s via relationship %s" % (
                source_name, target_label, relationship_type)
            no_path_response = FormatOutput.FormatResponse(3)
            no_path_response.add_error_message("NoPathsFound", error_message)
            return no_path_response

    # extract the source_node_number
    for graph_node, node_data in g.nodes(data=True):
        if node_data['properties']['id'] == source_name:
            source_node_number = graph_node
            break

    # Get all the target numbers (every node that is not the source)
    target_numbers = [graph_node for graph_node, node_data in g.nodes(data=True)
                      if node_data['properties']['id'] != source_name]

    # if there's an intermediate node, get the name
    if has_intermediate_node:
        neighbors = list(g.neighbors(source_node_number))
        if len(neighbors) > 1:
            ambiguous_response = FormatOutput.FormatResponse(3)
            ambiguous_response.add_error_message("AmbiguousPath", "More than one intermediate node")
            return ambiguous_response
        intermediate_node = neighbors.pop()

    #### If use_json not specified, then return results as a fairly plain list
    if not use_json:
        results_list = list()
        for target_number in target_numbers:
            node_data = g.nodes[target_number]
            label_candidates = list(set(node_data['labels']) - {'Base'})
            results_list.append({
                'type': label_candidates.pop(),
                'name': node_data['properties']['name'],
                'desc': node_data['properties']['name'],
                'prob': 1,  # All these are known to be true
            })
        return results_list

    #### Else if use_json requested, return the results in the Translator standard API JSON format
    response = FormatOutput.FormatResponse(3)  # it's a Q3 question
    response.message.table_column_names = ["source name", "source ID", "target name", "target ID"]
    source_description = g.nodes[source_node_number]['properties']['name']

    #### Create the QueryGraph for this type of question
    query_graph = QueryGraph()
    source_node = QNode()
    source_node.id = "n00"
    source_node.curie = g.nodes[source_node_number]['properties']['id']
    source_node.type = g.nodes[source_node_number]['properties']['category']
    target_node = QNode()
    target_node.id = "n01"
    target_node.type = target_label
    query_graph.nodes = [source_node, target_node]
    edge1 = QEdge()
    edge1.id = "e00"
    edge1.source_id = "n00"
    edge1.target_id = "n01"
    edge1.type = relationship_type
    query_graph.edges = [edge1]
    response.message.query_graph = query_graph

    #### Create a mapping dict with the source curie and the target type. This dict is used for reverse
    #### lookups by type for mapping to the QueryGraph.
    response._type_map = dict()
    response._type_map[source_node.curie] = source_node.id
    response._type_map[target_node.type] = target_node.id
    response._type_map[edge1.type] = edge1.id

    #### Loop over all the returned targets and put them into the response structure
    for target_number in target_numbers:
        target_description = g.nodes[target_number]['properties']['name']
        if has_intermediate_node:
            member_nodes = [source_node_number, intermediate_node, target_number]
        else:
            member_nodes = [source_node_number, target_number]
        subgraph = g.subgraph(member_nodes)
        summary = "%s and %s are connected by the relationship %s" % (
            source_description, target_description, relationship_type)
        res = response.add_subgraph(subgraph.nodes(data=True), subgraph.edges(data=True), summary, 1,
                                    return_result=True)
        res.essence = "%s" % target_description  # populate with essence of question result
        # populate with the type of the essence of question result
        res.essence_type = g.nodes[target_number]['properties']['category']
        res.row_data = [
            "%s" % source_description,
            "%s" % g.nodes[source_node_number]['properties']['id'],
            "%s" % target_description,
            "%s" % g.nodes[target_number]['properties']['id'],
        ]
    return response
def compute_ngd(self):
    """
    Iterate over all the edges in the knowledge graph, compute the normalized google distance and stick that info
    on the edge_attributes

    When 'virtual_relation_label' is among the parameters, NGD is instead computed for every pair of nodes
    fulfilling 'source_qnode_id' / 'target_qnode_id'; each pair gets a new virtual edge in the knowledge graph
    and one matching QEdge is added to the query graph.

    NGD values come from self.NGD.get_ngd_for_all_fast, which also reports which method ("fast" or "slow")
    produced the value. A non-finite NGD is recorded as parameters['default_value'] for that pair only.

    :default: The default value to set for NGD if it returns a nan
    :return: response
    """
    parameters = self.parameters
    self.response.debug(f"Computing NGD")
    self.response.info(f"Computing the normalized Google distance: weighting edges based on source/target node "
                       f"co-occurrence frequency in PubMed abstracts")

    self.response.info("Converting CURIE identifiers to human readable names")
    node_curie_to_name = dict()
    try:
        for node in self.message.knowledge_graph.nodes:
            node_curie_to_name[node.id] = node.name
    except Exception:
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        self.response.error(f"Something went wrong when converting names")
        self.response.error(tb, error_code=error_type.__name__)

    self.response.warning(f"Utilizing API calls to NCBI eUtils, so this may take a while...")
    attribute_name = "normalized_google_distance"
    attribute_type = "data:2526"
    default_value = self.parameters['default_value']
    url = "https://arax.rtx.ai/api/rtx/v1/ui/#/PubmedMeshNgd"
    ngd_method_counts = {"fast": 0, "slow": 0}

    def build_ngd_attribute(source_curie, target_curie, source_name, target_name):
        # Computes NGD for one node pair, tallies the method used and wraps the result in an EdgeAttribute
        ngd_value, method_used = self.NGD.get_ngd_for_all_fast([source_curie, target_curie],
                                                               [source_name, target_name])
        ngd_method_counts[method_used] += 1
        # Decide per pair: a non-finite NGD falls back to the default rather than to whatever value the
        # previous pair happened to have
        value = ngd_value if np.isfinite(ngd_value) else default_value
        return EdgeAttribute(type=attribute_type, name=attribute_name, value=str(value), url=url)

    # if you want to add virtual edges, identify the source/targets, decorate the edges, add them to the KG,
    # and then add one to the QG corresponding to them
    if 'virtual_relation_label' in parameters:
        source_curies_to_decorate = set()
        target_curies_to_decorate = set()
        curies_to_names = dict()
        # identify the nodes that we should be adding virtual edges for
        for node in self.message.knowledge_graph.nodes:
            if hasattr(node, 'qnode_ids'):
                if parameters['source_qnode_id'] in node.qnode_ids:
                    source_curies_to_decorate.add(node.id)
                    curies_to_names[node.id] = node.name
                if parameters['target_qnode_id'] in node.qnode_ids:
                    target_curies_to_decorate.add(node.id)
                    curies_to_names[node.id] = node.name
        added_flag = False  # check to see if any edges where added
        # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute
        for (source_curie, target_curie) in itertools.product(source_curies_to_decorate,
                                                              target_curies_to_decorate):
            source_name = curies_to_names[source_curie]
            target_name = curies_to_names[target_curie]
            self.response.debug(f"Computing NGD between {source_name} and {target_name}")
            edge_attribute = build_ngd_attribute(source_curie, target_curie, source_name, target_name)
            if edge_attribute:
                added_flag = True
                # make the edge, add the attribute
                relation = parameters['virtual_relation_label']
                edge_id = f"{relation}_{self.global_iter}"
                self.global_iter += 1
                edge = Edge(id=edge_id,
                            type="has_normalized_google_distance_with",
                            relation=relation,
                            source_id=source_curie,
                            target_id=target_curie,
                            is_defined_by="ARAX",
                            defined_datetime=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                            provided_by="ARAX",
                            confidence=None,
                            weight=None,  # TODO: could make the actual value of the attribute
                            edge_attributes=[edge_attribute],
                            qedge_ids=[relation])
                self.message.knowledge_graph.edges.append(edge)

        # Now add a q_edge the query_graph since I've added an extra edge to the KG
        if added_flag:
            relation = parameters['virtual_relation_label']
            q_edge = QEdge(id=relation,
                           type="has_normalized_google_distance_with",
                           relation=relation,
                           source_id=parameters['source_qnode_id'],
                           target_id=parameters['target_qnode_id'])
            self.message.query_graph.edges.append(q_edge)
    else:  # you want to add it for each edge in the KG
        # iterate over KG edges, add the information
        try:
            for edge in self.message.knowledge_graph.edges:
                # Make sure the edge_attributes are not None
                if not edge.edge_attributes:
                    edge.edge_attributes = []  # should be an array, but why not a list?
                # now go and actually get the NGD
                source_curie = edge.source_id
                target_curie = edge.target_id
                source_name = node_curie_to_name[source_curie]
                target_name = node_curie_to_name[target_curie]
                ngd_edge_attribute = build_ngd_attribute(source_curie, target_curie, source_name, target_name)
                edge.edge_attributes.append(ngd_edge_attribute)  # append it to the list of attributes
        except Exception:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(f"Something went wrong adding the NGD edge attributes")
        else:
            self.response.info(f"NGD values successfully added to edges")

    self.response.debug(f"Used fast NGD for {ngd_method_counts['fast']} edges, back-up NGD method for {ngd_method_counts['slow']}")
    return self.response
def compute_jaccard(self):
    """Overlay Jaccard-index virtual edges from the start node to each end node.

    Knowledge-graph nodes are classified by the query-node ids found in
    ``self.parameters`` (``start_node_id``, ``intermediate_node_id``,
    ``end_node_id``).  For every end node the index is the number of
    *distinct* intermediate nodes that share an edge with it (in either
    direction) divided by the total number of intermediate nodes.  One
    virtual edge per end node is appended to the knowledge graph, and one
    query edge named after ``parameters['virtual_relation_label']`` is
    appended to the query graph.

    Returns:
        ``self.response``, on success and on failure alike.  Failures are
        logged on the response via ``error()`` rather than raised.
    """
    message = self.message
    parameters = self.parameters
    self.response.debug("Computing Jaccard distance and adding this information as virtual edges")
    self.response.info("Computing Jaccard distance and adding this information as virtual edges")
    self.response.info("Getting all relevant nodes")
    # TODO: should I check that they're connected to the start node, or just assume that they are?
    # TODO: For now, assume that they are
    try:
        intermediate_nodes = set()
        # keys are end node curies, values are the sets of intermediate node ids adjacent to them
        end_node_to_intermediate_node_set = dict()
        # Stays None when no KG node is bound to the start query node
        source_node_id = None
        for node in message.knowledge_graph.nodes:
            if parameters['intermediate_node_id'] in node.qnode_ids:
                intermediate_nodes.add(node.id)
            if parameters['start_node_id'] in node.qnode_ids:
                source_node_id = node.id
            if parameters['end_node_id'] in node.qnode_ids:
                end_node_to_intermediate_node_set[node.id] = set()

        # Collect, for each end node, the intermediate nodes it touches.
        # TODO: Here, I won't care which direction the edges are pointing
        for edge in message.knowledge_graph.edges:
            if edge.source_id in intermediate_nodes:  # source is intermediate
                if edge.target_id in end_node_to_intermediate_node_set:
                    # Store only the node id: storing (id, edge type) pairs would count an
                    # intermediate node once per edge type and could push the index above 1.
                    end_node_to_intermediate_node_set[edge.target_id].add(edge.source_id)
            elif edge.target_id in intermediate_nodes:  # target is intermediate
                if edge.source_id in end_node_to_intermediate_node_set:
                    end_node_to_intermediate_node_set[edge.source_id].add(edge.target_id)

        # Compute the index per end node.  With end nodes present but no intermediate
        # nodes the division raises ZeroDivisionError, which the handler below reports.
        denom = len(intermediate_nodes)
        end_node_to_jaccard = dict()
        for end_node_id, adjacent_intermediates in end_node_to_intermediate_node_set.items():
            # TODO: add code here if you care about edge types
            end_node_to_jaccard[end_node_id] = len(adjacent_intermediates) / float(denom)

        # Properties shared by every virtual edge
        now = datetime.now()
        #edge_type = parameters['virtual_edge_type']
        edge_type = 'has_jaccard_index_with'
        qedge_ids = [parameters['virtual_relation_label']]
        relation = parameters['virtual_relation_label']
        is_defined_by = "ARAX"
        defined_datetime = now.strftime("%Y-%m-%d %H:%M:%S")
        provided_by = "ARAX"
        confidence = None
        weight = None  # TODO: could make the jaccard index the weight

        # Edge attribute properties
        attribute_type = 'data:1772'
        name = "jaccard_index"
        url = None

        if source_node_id is None:
            # Without a start node there is nothing to anchor the virtual edges to,
            # so warn and skip them instead of failing later on an unbound name.
            self.response.warning(
                f"Source node id: {parameters['start_node_id']} not found in the KG. Perhaps the KG is empty?")
        else:
            # now actually add the virtual edges in
            for j_iter, (end_node_id, value) in enumerate(end_node_to_jaccard.items()):
                edge_attribute = EdgeAttribute(type=attribute_type, name=name, value=value, url=url)
                edge = Edge(id=f"J{j_iter}", type=edge_type, relation=relation,
                            source_id=source_node_id, target_id=end_node_id,
                            is_defined_by=is_defined_by, defined_datetime=defined_datetime,
                            provided_by=provided_by, confidence=confidence, weight=weight,
                            edge_attributes=[edge_attribute], qedge_ids=qedge_ids)
                message.knowledge_graph.edges.append(edge)

        # Now add a q_edge the query_graph since I've added an extra edge to the KG
        q_edge = QEdge(id=relation, type=edge_type, relation=relation,
                       source_id=parameters['start_node_id'],
                       target_id=parameters['end_node_id'])  # TODO: ok to make the id and type the same thing?
        self.message.query_graph.edges.append(q_edge)
    except Exception as err:
        # Report the failure on the response; callers inspect the response rather than catch.
        tb = traceback.format_exc()
        self.response.error("Something went wrong when computing the Jaccard index")
        self.response.error(tb, error_code=type(err).__name__)
    return self.response
def interpret_query_graph(self, query):
    """Try to interpret a QueryGraph and convert it into something RTX can process

    Args:
        query: mapping whose ``["message"]["query_graph"]`` entry holds a
            ``"nodes"`` list and an ``"edges"`` list of node/edge dicts.

    Returns:
        A dict that always carries ``message_code`` and ``code_description``.
        When ``message_code`` is ``"OK"`` it also carries ``id``, ``terms``,
        ``original_question`` and ``restated_question``; the single-hop case
        additionally carries ``execution_string``, a command line in which
        the source, target and edge values are wrapped in single quotes.
    """

    #### Create a default response dict
    response = {
        "message_code": "InternalError",
        "code_description": "interpret_query_graph exited abnormally"
    }

    query_graph = query["message"]["query_graph"]
    nodes = query_graph["nodes"]
    edges = query_graph["edges"]
    n_nodes = len(nodes)
    n_edges = len(edges)
    eprint("DEBUG: n_nodes = %d, n_edges = %d" % (n_nodes, n_edges))

    #### Handle impossible cases
    if n_nodes == 0:
        return {
            "message_code": "QueryGraphZeroNodes",
            "code_description": "Submitted QueryGraph has 0 nodes. At least 1 node is required"
        }
    if n_nodes == 1 and n_edges > 0:
        return {
            "message_code": "QueryGraphTooManyEdges",
            "code_description": "Submitted QueryGraph may not have edges if there is only one node"
        }
    if n_nodes == 2 and n_edges > 1:
        return {
            "message_code": "QueryGraphTooManyEdges",
            "code_description": "Submitted QueryGraph may not have more than 1 edge if there are only 2 nodes"
        }
    if n_nodes > 2:
        return {
            "message_code": "UnsupportedQueryGraph",
            "code_description": "Submitted QueryGraph may currently only have 1 or 2 node. Support for 3 or more nodes coming soon."
        }

    #### Handle the single node case
    if n_nodes == 1:
        response = {
            "message_code": "OK",
            "code_description": "Interpreted QueryGraph as single node Q0"
        }
        response["id"] = "Q0"
        entity = nodes[0]["curie"]
        eprint("DEBUG: Q0 - entity = %s" % entity)
        response["terms"] = {"term": entity}
        response["original_question"] = "Submitted QueryGraph"
        response["restated_question"] = "What is %s?" % entity
        return response

    #### Handle the 2 node case
    if n_nodes == 2:
        eprint("DEBUG: Handling the 2-node case")
        source_type = None
        source_name = None
        target_type = None
        edge_type = None

        #### Loop through nodes trying to figure out which is the source and target
        for qnode in nodes:
            node = QNode.from_dict(qnode)

            if node.type == "gene":
                if node.curie is None:
                    node.type = "protein"
                else:
                    return {
                        "message_code": "UnsupportedNodeType",
                        "code_description": "At least one of the nodes in the QueryGraph is a specific gene, which cannot be handled at the moment, a generic gene type with no curie is translated into a protein by RTX."
                    }

            if node.curie is None:
                #### A node without a curie can only be the (typed) target
                if node.type is None:
                    return {
                        "message_code": "UnderspecifiedNode",
                        "code_description": "At least one of the nodes in the QueryGraph has neither a CURIE nor a type. It must have one of those."
                    }
                if target_type is not None:
                    return {
                        "message_code": "TooManyTargets",
                        "code_description": "Both nodes have only types and are interpreted as targets. At least one node must have an exact identity."
                    }
                target_type = node.type
            else:
                #### A node with a curie is the source.
                #### re.search, not re.match: match() only inspects position 0 and would
                #### let a quote further inside the value through to the shell command.
                if re.search(r"'", node.curie):
                    return {
                        "message_code": "IllegalCharacters",
                        "code_description": "Node type contains one or more illegal characters."
                    }
                if source_name is not None:
                    return {
                        "message_code": "OverspecifiedQueryGraph",
                        "code_description": "All nodes in the QueryGraph have exact identities, so there is really nothing left to query."
                    }
                if node.type is None:
                    return {
                        "message_code": "UnderspecifiedSourceNode",
                        "code_description": "The source node must have a type in addition to a curie."
                    }
                source_name = node.curie
                source_type = node.type

        #### Loop over the edges (should be just 1), ensuring that it has a type and recording it
        for qedge in edges:
            edge = QEdge.from_dict(qedge)
            if edge.type is None:
                return {
                    "message_code": "EdgeWithNoType",
                    "code_description": "At least one edge has no type. All edges must have a type."
                }
            edge_type = edge.type

        #### With two nodes and no edge there is no relationship to query; report it
        #### instead of letting the sanitation below fail on a None edge type
        if edge_type is None:
            return {
                "message_code": "QueryGraphZeroEdges",
                "code_description": "Submitted QueryGraph has 2 nodes but no edges. Exactly 1 edge is required"
            }

        #### Perform a crude sanitation of the input parameters to make sure the shell command won't fail or cause harm.
        #### Every value is embedded in single quotes, so a single quote anywhere in it must be rejected.
        if any(re.search(r"'", value) for value in (edge_type, target_type, source_name)):
            return {
                "message_code": "IllegalCharacters",
                "code_description": "The input query_graph entities contain one or more illegal characters."
            }

        #### Create the necessary components to hand off the queries to Q3Solution.py
        response = {
            "message_code": "OK",
            "code_description": "Interpreted QueryGraph as a single hop question"
        }
        response["id"] = "1hop"
        response["terms"] = {
            source_type: source_name,
            "target_label": target_type,
            "rel_type": edge_type
        }
        response["original_question"] = "Submitted QueryGraph"
        response["restated_question"] = "Which %s(s) are connected to the %s %s via edge type %s?" % (
            target_type, source_type, source_name, edge_type)
        #response["execution_string"] = "Q3Solution.py -s '%s' -t '%s' -r '%s' -j --directed" % (source_name,target_type,edge_type)
        response["execution_string"] = "Q3Solution.py -s '%s' -t '%s' -r '%s' -j" % (
            source_name, target_type, edge_type)
        return response

    #### Safety net: every node count is handled above, so this is the default InternalError
    return response