def _merge_answer_into_message_kg(answer_dict_kg: DictKnowledgeGraph, dict_kg: DictKnowledgeGraph, log: Response):
    """
    Merge an answer KG (from the current edge/node expansion) into the overarching KG.

    :param answer_dict_kg: The knowledge graph (organized by QG IDs) whose nodes and edges should be merged in.
    :param dict_kg: The overarching knowledge graph; it receives every node/edge via add_node()/add_edge().
    :param log: The response object used for logging.
    :return: None (dict_kg is updated through its own add_node()/add_edge() methods).
    """
    log.debug("Merging answer into Message.KnowledgeGraph")
    # Nodes first, each filed under the qnode ID it was returned for
    for qnode_id, answer_nodes in answer_dict_kg.nodes_by_qg_id.items():
        for answer_node in answer_nodes.values():
            dict_kg.add_node(answer_node, qnode_id)
    # Then edges, each filed under the qedge ID it was returned for
    for qedge_id, answer_edges in answer_dict_kg.edges_by_qg_id.items():
        for answer_edge in answer_edges.values():
            dict_kg.add_edge(answer_edge, qedge_id)
def answer_one_hop_query(self, query_graph: QueryGraph) -> Tuple[DictKnowledgeGraph, Dict[str, Dict[str, str]]]:
    """
    Answer a one-hop (single-edge) query using BTE.

    :param query_graph: A Reasoner API standard query graph.
    :return: A tuple containing:
        1. an (almost) Reasoner API standard knowledge graph containing all of the nodes and edges returned as
           results for the query. (Dictionary version, organized by QG IDs.)
        2. a map of which nodes fulfilled which qnode_ids for each edge. Example:
           {'KG1:111221': {'n00': 'DOID:111', 'n01': 'HP:124'}, 'KG1:111223': {'n00': 'DOID:111', 'n01': 'HP:126'}}
        The map is left empty whenever the log status is not 'OK' after a step or the query graph is not fulfilled.
    """
    parameters = self.response.data['parameters']
    enforce_directionality = parameters.get('enforce_directionality')
    continue_if_no_results = parameters.get('continue_if_no_results')
    log = self.response
    answer_kg = DictKnowledgeGraph()
    edge_to_nodes_map = {}
    valid_bte_inputs_dict = self._get_valid_bte_inputs_dict()

    # Validate our input to make sure it will work with BTE
    qedge, input_qnode, output_qnode = self._validate_and_pre_process_input(
        query_graph=query_graph,
        valid_bte_inputs_dict=valid_bte_inputs_dict,
        enforce_directionality=enforce_directionality,
        log=log)
    if log.status != 'OK':
        return answer_kg, edge_to_nodes_map

    # Use BTE to answer the query
    answer_kg, accepted_curies = self._answer_query_using_bte(
        input_qnode=input_qnode,
        output_qnode=output_qnode,
        qedge=qedge,
        answer_kg=answer_kg,
        valid_bte_inputs_dict=valid_bte_inputs_dict,
        log=log)
    if log.status != 'OK':
        return answer_kg, edge_to_nodes_map

    # Hack to achieve a curie-to-curie query, if necessary (only relevant when both ends were given curies)
    if eu.qg_is_fulfilled(query_graph, answer_kg):
        if input_qnode.curie and output_qnode.curie:
            answer_kg = self._prune_answers_to_achieve_curie_to_curie_query(answer_kg, output_qnode, qedge)

    # Report our findings (fulfillment is re-checked because pruning may have altered the answer KG)
    if not eu.qg_is_fulfilled(query_graph, answer_kg):
        self._log_proper_no_results_message(accepted_curies,
                                            continue_if_no_results,
                                            valid_bte_inputs_dict['curie_prefixes'],
                                            log)
        return answer_kg, edge_to_nodes_map

    answer_kg = eu.switch_kg_to_arax_curie_format(answer_kg)
    edge_to_nodes_map = self._create_edge_to_nodes_map(answer_kg, input_qnode.id, output_qnode.id)
    return answer_kg, edge_to_nodes_map
def _expand_node(self, qnode_id: str, kp_to_use: str, continue_if_no_results: bool, query_graph: QueryGraph,
                 use_synonyms: bool, synonym_handling: str, log: Response) -> DictKnowledgeGraph:
    """
    Expand a single query node using the specified knowledge provider.

    :param qnode_id: ID of the query node (within query_graph) to expand.
    :param kp_to_use: The knowledge provider to query; only "ARAX/KG1" and "ARAX/KG2" are accepted here.
    :param continue_if_no_results: When False, an answer with no nodes for this qnode is logged as an error.
    :param query_graph: The query graph containing the qnode.
    :param use_synonyms: Whether to add curie synonyms to (a copy of) the qnode before querying.
    :param synonym_handling: Nodes are deduplicated unless this is 'add_all'.
    :param log: The response object used for logging and status tracking.
    :return: The answer knowledge graph (empty if an error was logged before the KP was queried).
    """
    log.debug(f"Expanding node {qnode_id} using {kp_to_use}")
    original_qnode = eu.get_query_node(query_graph, qnode_id)
    answer_kg = DictKnowledgeGraph()
    if log.status != 'OK':
        return answer_kg
    if not original_qnode.curie:
        log.error("Cannot expand a single query node if it doesn't have a curie", error_code="InvalidQuery")
        return answer_kg

    # Synonyms and type adjustments are applied to a copy of the query node
    qnode = eu.copy_qnode(original_qnode)
    if use_synonyms:
        self._add_curie_synonyms_to_query_nodes(qnodes=[qnode], log=log, kp=kp_to_use)
    if qnode.type in ["protein", "gene"]:
        # A qnode typed as either protein or gene is queried as both
        qnode.type = ["protein", "gene"]
    log.debug(f"Modified query node is: {qnode.to_dict()}")

    # Answer the query using the proper KP
    supported_kps = ["ARAX/KG1", "ARAX/KG2"]
    if kp_to_use not in supported_kps:
        log.error(f"Invalid knowledge provider: {kp_to_use}. Valid options for single-node queries are "
                  f"{', '.join(supported_kps)}", error_code="InvalidKP")
        return answer_kg

    from Expand.kg_querier import KGQuerier
    answer_kg = KGQuerier(log, kp_to_use).answer_single_node_query(qnode)
    log.info(f"Query for node {qnode.id} returned results ({eu.get_printable_counts_by_qg_id(answer_kg)})")

    # Make sure all qnodes have been fulfilled (unless we're continuing if no results)
    if log.status == 'OK' and not continue_if_no_results:
        nodes_by_qg_id = answer_kg.nodes_by_qg_id
        qnode_is_fulfilled = qnode.id in nodes_by_qg_id and nodes_by_qg_id[qnode.id]
        if not qnode_is_fulfilled:
            log.error(f"Returned answer KG does not contain any results for QNode {qnode.id}",
                      error_code="UnfulfilledQGID")
            return answer_kg

    if synonym_handling != 'add_all':
        # No edges are involved in a single-node query, so the returned edge-to-nodes map is not needed
        answer_kg, _ = self._deduplicate_nodes(dict_kg=answer_kg, edge_to_nodes_map={}, log=log)
    return answer_kg
def _load_answers_into_kg(self, neo4j_results: List[Dict[str, List[Dict[str, any]]]], kp: str, query_graph: QueryGraph, log: Response) -> Tuple[DictKnowledgeGraph, Dict[str, Dict[str, str]]]:
    """
    Convert the raw neo4j results for a one-hop query into a knowledge graph plus an edge-to-nodes map.

    :param neo4j_results: Query results; only the first element is read. Its keys are column names of the form
                          'nodes_<qnode_id>' / 'edges_<qedge_id>' and its values are lists of neo4j nodes/edges.
    :param kp: The knowledge provider the results came from ("KG2" edges use the KG2 converter; anything else
               uses the KG1 converter).
    :param query_graph: The one-hop query graph that was answered.
    :param log: The response object used for logging.
    :return: A tuple of (knowledge graph organized by QG IDs, map of edge ID -> {qnode_id: node ID}).
    """
    log.debug(f"Processing query results for edge {query_graph.edges[0].id}")
    final_kg = DictKnowledgeGraph()
    edge_to_nodes_map = {}
    if kp == "KG1":
        node_uuid_to_curie_dict = self._build_node_uuid_to_curie_dict(neo4j_results[0])
    else:
        node_uuid_to_curie_dict = {}

    results_table = neo4j_results[0]
    for column_name in list(results_table):
        if column_name.startswith('nodes'):
            # Load answer nodes into our knowledge graph (example column name: 'nodes_n00')
            qnode_id_for_column = column_name.replace("nodes_", "", 1)
            for neo4j_node in results_table.get(column_name):
                final_kg.add_node(self._convert_neo4j_node_to_swagger_node(neo4j_node, kp), qnode_id_for_column)
        elif column_name.startswith('edges'):
            # Load answer edges into our knowledge graph (example column name: 'edges_e01')
            qedge_id_for_column = column_name.replace("edges_", "", 1)
            for neo4j_edge in results_table.get(column_name):
                if kp == "KG2":
                    swagger_edge = self._convert_kg2_edge_to_swagger_edge(neo4j_edge)
                else:
                    swagger_edge = self._convert_kg1_edge_to_swagger_edge(neo4j_edge, node_uuid_to_curie_dict)

                # Record which of this edge's nodes correspond to which qnode_id
                nodes_for_edge = edge_to_nodes_map.setdefault(swagger_edge.id, {})
                nodes_for_edge.update({qnode.id: neo4j_edge.get(qnode.id) for qnode in query_graph.nodes})

                # Finally add the current edge to our answer knowledge graph
                final_kg.add_edge(swagger_edge, qedge_id_for_column)

    return final_kg, edge_to_nodes_map
def answer_single_node_query(self, qnode: QNode) -> DictKnowledgeGraph:
    """
    This function answers a single-node (edge-less) query by looking the node up in this querier's neo4j.

    :param qnode: The query node to look up. Its curie may be a single string or a collection of curies.
    :return: A knowledge graph (dictionary version, organized by QG IDs) holding the matching nodes under
             qnode.id. It is returned empty if the log status is not 'OK' after running the query, or if the
             query produced no results (logged as a warning or an error depending on continue_if_no_results).
    """
    continue_if_no_results = self.response.data['parameters']['continue_if_no_results']
    kp = self.kp
    log = self.response
    final_kg = DictKnowledgeGraph()

    # Build and run a cypher query to get this node/nodes
    # NOTE(review): the curie is interpolated directly into the cypher text because _run_cypher_query() is
    # handed a finished query string - confirm curies are validated upstream before they reach this point
    if isinstance(qnode.curie, str):
        where_clause = f"{qnode.id}.id='{qnode.curie}'"
    else:
        where_clause = f"{qnode.id}.id in {qnode.curie}"
    cypher_query = f"MATCH {self._get_cypher_for_query_node(qnode)} WHERE {where_clause} RETURN {qnode.id}"
    log.info(f"Sending cypher query for node {qnode.id} to {kp} neo4j")
    results = self._run_cypher_query(cypher_query, kp, log)

    # If the query itself failed, don't pile a misleading "no paths" message on top of the real error
    if log.status != 'OK':
        return final_kg

    if not results:
        if continue_if_no_results:
            log.warning(f"No paths were found in {kp} satisfying this query graph")
        else:
            log.error(f"No paths were found in {kp} satisfying this query graph", error_code="NoResults")
        # Nothing to load (this also avoids iterating over a None result)
        return final_kg

    # Load the results into swagger object model and add to our answer knowledge graph
    for result in results:
        neo4j_node = result.get(qnode.id)
        swagger_node = self._convert_neo4j_node_to_swagger_node(neo4j_node, kp)
        final_kg.add_node(swagger_node, qnode.id)

    return final_kg
def answer_one_hop_query(self, query_graph: QueryGraph) -> Tuple[DictKnowledgeGraph, Dict[str, Dict[str, str]]]:
    """
    This function answers a one-hop (single-edge) query using either KG1 or KG2.

    :param query_graph: A Reasoner API standard query graph.
    :return: A tuple containing:
        1. an (almost) Reasoner API standard knowledge graph containing all of the nodes and edges returned as
           results for the query. (Dictionary version, organized by QG IDs.)
        2. a map of which nodes fulfilled which qnode_ids for each edge. Example:
           {'KG1:111221': {'n00': 'DOID:111', 'n01': 'HP:124'}, 'KG1:111223': {'n00': 'DOID:111', 'n01': 'HP:126'}}
        Both are returned empty if the query graph does not have exactly one edge and two nodes, or if the log
        status is not 'OK' after building or running the cypher query.
    """
    log = self.response
    enforce_directionality = self.response.data['parameters']['enforce_directionality']
    continue_if_no_results = self.response.data['parameters']['continue_if_no_results']
    kp = self.kp
    final_kg = DictKnowledgeGraph()
    edge_to_nodes_map = dict()

    # Verify this is a valid one-hop query graph
    if len(query_graph.edges) != 1:
        log.error(f"KGQuerier.answer_one_hop_query() was passed a query graph that is not one-hop: "
                  f"{query_graph.to_dict()}", error_code="InvalidQuery")
        return final_kg, edge_to_nodes_map
    if len(query_graph.nodes) != 2:
        # This check also catches graphs with fewer than two nodes, so the message must not say "more than"
        log.error(f"KGQuerier.answer_one_hop_query() was passed a query graph that does not have exactly two "
                  f"nodes: {query_graph.to_dict()}", error_code="InvalidQuery")
        return final_kg, edge_to_nodes_map
    qedge_id = query_graph.edges[0].id

    # Run the actual query and process results
    cypher_query = self._convert_one_hop_query_graph_to_cypher_query(query_graph, enforce_directionality, kp, log)
    if log.status != 'OK':
        return final_kg, edge_to_nodes_map
    neo4j_results = self._answer_one_hop_query_using_neo4j(cypher_query, qedge_id, kp, continue_if_no_results, log)
    if log.status != 'OK':
        return final_kg, edge_to_nodes_map

    # Whatever was loaded is returned as-is; callers inspect log.status to detect a failure in this last step
    final_kg, edge_to_nodes_map = self._load_answers_into_kg(neo4j_results, kp, query_graph, log)
    return final_kg, edge_to_nodes_map
def _deduplicate_nodes(dict_kg: DictKnowledgeGraph, edge_to_nodes_map: Dict[str, Dict[str, str]],
                       log: Response) -> Tuple[DictKnowledgeGraph, Dict[str, Dict[str, str]]]:
    """
    Collapse the nodes in dict_kg onto their preferred curies and re-point the edges (and the edge-to-nodes
    map) at those preferred curies.

    :param dict_kg: The answer knowledge graph, organized by QG IDs. Its node and edge objects are updated in
                    place (id/name/type, source_id/target_id) and re-used in the returned knowledge graph.
    :param edge_to_nodes_map: Map of edge ID -> {qnode_id: node ID}; edges without an entry are tolerated.
    :param log: The response object used for logging and status tracking.
    :return: A tuple of (deduplicated knowledge graph, updated edge-to-nodes map). If the log status is not 'OK'
             after looking up preferred curies, or an edge's node has no mapping, whatever has been built so far
             is returned (an error is logged in the latter case).
    """
    log.debug(f"Deduplicating nodes")
    deduplicated_kg = DictKnowledgeGraph(nodes={qnode_id: dict() for qnode_id in dict_kg.nodes_by_qg_id},
                                         edges={qedge_id: dict() for qedge_id in dict_kg.edges_by_qg_id})
    updated_edge_to_nodes_map = {edge_id: dict() for edge_id in edge_to_nodes_map}
    curie_mappings = dict()  # original node ID -> preferred curie
    preferred_details = dict()  # original node ID -> (preferred name, preferred type)

    # First deduplicate the nodes
    for qnode_id, nodes in dict_kg.nodes_by_qg_id.items():
        # Load preferred curie info from NodeSynonymizer for nodes we haven't seen before
        unmapped_node_ids = set(nodes).difference(curie_mappings)
        log.debug(f"Getting preferred curies for {qnode_id} nodes returned in this step")
        canonicalized_nodes = eu.get_preferred_curies(list(unmapped_node_ids), log) if unmapped_node_ids else dict()
        if log.status != 'OK':
            return deduplicated_kg, updated_edge_to_nodes_map

        # Every node under this qnode ID is considered (not only the newly mapped ones), so a node that was
        # already seen under an earlier qnode ID is still carried over into this qnode ID's bucket
        for node_id, node in nodes.items():
            if node_id in unmapped_node_ids:
                # Figure out the preferred curie/name for this node
                canonicalized_node = canonicalized_nodes.get(node_id)
                if canonicalized_node:
                    preferred_curie = canonicalized_node.get('preferred_curie', node_id)
                    preferred_name = canonicalized_node.get('preferred_name', node.name)
                    preferred_type = eu.convert_string_or_list_to_list(
                        canonicalized_node.get('preferred_type', node.type))
                else:
                    # Means the NodeSynonymizer didn't recognize this curie
                    preferred_curie = node_id
                    preferred_name = node.name
                    preferred_type = node.type
                curie_mappings[node_id] = preferred_curie
                preferred_details[node_id] = (preferred_name, preferred_type)
            else:
                # Seen under an earlier qnode ID; reuse what was determined then
                preferred_curie = curie_mappings[node_id]
                preferred_name, preferred_type = preferred_details[node_id]

            # Add this node into our deduplicated KG as necessary
            # TODO: merge certain fields, like uri?
            if preferred_curie not in deduplicated_kg.nodes_by_qg_id[qnode_id]:
                node.id = preferred_curie
                node.name = preferred_name
                node.type = preferred_type
                deduplicated_kg.add_node(node, qnode_id)

    # Then update the edges to reflect changes made to the nodes
    for qedge_id, edges in dict_kg.edges_by_qg_id.items():
        for edge_id, edge in edges.items():
            preferred_source_id = curie_mappings.get(edge.source_id)
            preferred_target_id = curie_mappings.get(edge.target_id)
            if not preferred_source_id or not preferred_target_id:
                # The edge is left untouched here rather than having its endpoints overwritten with None
                log.error(f"Could not find preferred curie mappings for edge {edge_id}'s node(s)")
                return deduplicated_kg, updated_edge_to_nodes_map
            edge.source_id = preferred_source_id
            edge.target_id = preferred_target_id
            deduplicated_kg.add_edge(edge, qedge_id)

            # Update the edge-to-node map for this edge (used down the line for pruning); an edge that has no
            # entry in the incoming map simply contributes nothing instead of raising a KeyError
            for qnode_id, corresponding_node_id in edge_to_nodes_map.get(edge_id, {}).items():
                updated_edge_to_nodes_map.setdefault(edge_id, dict())[qnode_id] = \
                    curie_mappings.get(corresponding_node_id)

    log.debug(f"After deduplication, answer KG counts are: {eu.get_printable_counts_by_qg_id(deduplicated_kg)}")
    return deduplicated_kg, updated_edge_to_nodes_map
def _expand_edge(self, qedge: QEdge, kp_to_use: str, dict_kg: DictKnowledgeGraph, continue_if_no_results: bool,
                 query_graph: QueryGraph, use_synonyms: bool, synonym_handling: str,
                 log: Response) -> Tuple[DictKnowledgeGraph, Dict[str, Dict[str, str]]]:
    """
    Answer a single-edge (one-hop) query using the specified knowledge provider.

    :param qedge: The query edge to expand.
    :param kp_to_use: One of "ARAX/KG1", "ARAX/KG2", "BTE", "COHD" or "NGD".
    :param dict_kg: The overarching knowledge graph built up by prior expansion steps.
    :param continue_if_no_results: When True, an unfulfilled query is logged as a warning instead of an error.
    :param query_graph: The full query graph that qedge belongs to.
    :param use_synonyms: Passed along when building the query graph for this edge.
    :param synonym_handling: Nodes are deduplicated unless this is 'add_all'.
    :param log: The response object used for logging and status tracking.
    :return: A tuple of (answer knowledge graph organized by QG IDs, map of edge ID -> {qnode_id: node ID}).
    """
    log.info(f"Expanding edge {qedge.id} using {kp_to_use}")
    answer_kg = DictKnowledgeGraph()
    edge_to_nodes_map = {}

    # Create a query graph for this edge (that uses synonyms as well as curies found in prior steps)
    edge_query_graph = self._get_query_graph_for_edge(qedge, query_graph, dict_kg, use_synonyms, kp_to_use, log)
    if log.status != 'OK':
        return answer_kg, edge_to_nodes_map

    qnodes_with_curies = [qnode for qnode in edge_query_graph.nodes if qnode.curie]
    if not any(qnodes_with_curies):
        log.error("Cannot expand an edge for which neither end has any curies. (Could not find curies to use from "
                  "a prior expand step, and neither qnode has a curie specified.)", error_code="InvalidQuery")
        return answer_kg, edge_to_nodes_map

    valid_kps = ["ARAX/KG1", "ARAX/KG2", "BTE", "COHD", "NGD"]
    if kp_to_use not in valid_kps:
        log.error(f"Invalid knowledge provider: {kp_to_use}. Valid options are {', '.join(valid_kps)}",
                  error_code="InvalidKP")
        return answer_kg, edge_to_nodes_map

    # Only the querier module for the chosen KP is imported
    if kp_to_use == 'BTE':
        from Expand.bte_querier import BTEQuerier
        kp_querier = BTEQuerier(log)
    elif kp_to_use == 'COHD':
        from Expand.COHD_querier import COHDQuerier
        kp_querier = COHDQuerier(log)
    elif kp_to_use == 'NGD':
        from Expand.ngd_querier import NGDQuerier
        kp_querier = NGDQuerier(log)
    else:
        from Expand.kg_querier import KGQuerier
        kp_querier = KGQuerier(log, kp_to_use)

    answer_kg, edge_to_nodes_map = kp_querier.answer_one_hop_query(edge_query_graph)
    if log.status != 'OK':
        return answer_kg, edge_to_nodes_map
    log.debug(f"Query for edge {qedge.id} returned results ({eu.get_printable_counts_by_qg_id(answer_kg)})")

    # Do some post-processing (deduplicate nodes, remove self-edges..)
    if synonym_handling != 'add_all':
        answer_kg, edge_to_nodes_map = self._deduplicate_nodes(answer_kg, edge_to_nodes_map, log)
    if eu.qg_is_fulfilled(edge_query_graph, answer_kg):
        answer_kg = self._remove_self_edges(answer_kg, edge_to_nodes_map, qedge.id, edge_query_graph.nodes, log)

    # Make sure our query has been fulfilled (unless we're continuing if no results)
    if not eu.qg_is_fulfilled(edge_query_graph, answer_kg):
        no_paths_message = f"No paths were found in {kp_to_use} satisfying this query graph"
        if continue_if_no_results:
            log.warning(no_paths_message)
        else:
            log.error(no_paths_message, error_code="NoResults")

    return answer_kg, edge_to_nodes_map
def answer_one_hop_query(self, query_graph: QueryGraph) -> Tuple[DictKnowledgeGraph, Dict[str, Dict[str, str]]]:
    """
    Answer a one-hop (single-edge) query using NGD (with the assistance of KG2).

    Candidate node pairs are first found by running the same one-hop query against ARAX/KG2; an NGD edge is then
    created for each KG2 edge whose computed NGD value is below 0.5.

    :param query_graph: A Reasoner API standard query graph.
    :return: A tuple containing:
        1. an (almost) Reasoner API standard knowledge graph containing all of the nodes and edges returned as
           results for the query. (Dictionary version, organized by QG IDs.)
        2. a map of which nodes fulfilled which qnode_ids for each edge. Example:
           {'KG1:111221': {'n00': 'DOID:111', 'n01': 'HP:124'}, 'KG1:111223': {'n00': 'DOID:111', 'n01': 'HP:126'}}
    """
    log = self.response
    continue_if_no_results = self.response.data['parameters']['continue_if_no_results']
    final_kg = DictKnowledgeGraph()
    edge_to_nodes_map = {}

    # Verify this is a valid one-hop query graph
    self._verify_one_hop_query_graph_is_valid(query_graph, log)
    if log.status != 'OK':
        return final_kg, edge_to_nodes_map

    # Find potential answers using KG2
    qedge = query_graph.edges[0]
    source_qnode = next(qnode for qnode in query_graph.nodes if qnode.id == qedge.source_id)
    target_qnode = next(qnode for qnode in query_graph.nodes if qnode.id == qedge.target_id)

    # Build the DSL parameter strings, leaving out any piece the helpers return as empty
    qedge_params = [f"id={qedge.id}",
                    f"source_id={source_qnode.id}",
                    f"target_id={target_qnode.id}",
                    self._get_dsl_qedge_type_str(qedge)]
    qedge_params_str = ", ".join(param for param in qedge_params if param)
    source_params = [f"id={source_qnode.id}",
                     self._get_dsl_qnode_curie_str(source_qnode),
                     self._get_dsl_qnode_type_str(source_qnode)]
    source_params_str = ", ".join(param for param in source_params if param)
    target_params = [f"id={target_qnode.id}",
                     self._get_dsl_qnode_curie_str(target_qnode),
                     self._get_dsl_qnode_type_str(target_qnode)]
    target_params_str = ", ".join(param for param in target_params if param)

    actions_list = [
        f"add_qnode({source_params_str})",
        f"add_qnode({target_params_str})",
        f"add_qedge({qedge_params_str})",
        "expand(kp=ARAX/KG2)",
        "return(message=true, store=false)",
    ]
    kg2_answer_kg = self._run_arax_query(actions_list, log)
    if log.status != 'OK':
        return final_kg, edge_to_nodes_map

    # Go through those answers from KG2 and calculate ngd for each edge
    kg2_edges_by_id = {edge.id: edge for edge in kg2_answer_kg.edges}
    kg2_nodes_by_id = {node.id: node for node in kg2_answer_kg.nodes}
    self.cngd.load_curie_to_pmids_data(kg2_nodes_by_id)
    ngd_info_by_kg2_edge_id = {}
    for kg2_edge in kg2_edges_by_id.values():
        # These are already canonicalized (default behavior)
        first_node = kg2_nodes_by_id.get(kg2_edge.source_id)
        second_node = kg2_nodes_by_id.get(kg2_edge.target_id)
        # Figure out which node corresponds to source qnode (don't necessarily match b/c query was bidirectional)
        edge_matches_query_direction = (source_qnode.id in first_node.qnode_ids
                                        and target_qnode.id in second_node.qnode_ids)
        if edge_matches_query_direction:
            ngd_source_id, ngd_target_id = first_node.id, second_node.id
        else:
            ngd_source_id, ngd_target_id = second_node.id, first_node.id
        ngd_info_by_kg2_edge_id[kg2_edge.id] = {
            "ngd_value": self.cngd.calculate_ngd_fast(ngd_source_id, ngd_target_id),
            "source_id": ngd_source_id,
            "target_id": ngd_target_id,
        }

    # Create edges for those from KG2 found to have a low enough ngd value
    for ngd_info in ngd_info_by_kg2_edge_id.values():
        ngd_value = ngd_info['ngd_value']
        # TODO: Make determination of the threshold much more sophisticated
        if ngd_value is None or not ngd_value < 0.5:
            continue
        ngd_edge = self._create_ngd_edge(ngd_value, ngd_info["source_id"], ngd_info["target_id"])
        ngd_source_node = self._create_ngd_node(kg2_nodes_by_id.get(ngd_edge.source_id))
        ngd_target_node = self._create_ngd_node(kg2_nodes_by_id.get(ngd_edge.target_id))
        final_kg.add_edge(ngd_edge, qedge.id)
        final_kg.add_node(ngd_source_node, source_qnode.id)
        final_kg.add_node(ngd_target_node, target_qnode.id)
        edge_to_nodes_map[ngd_edge.id] = {
            source_qnode.id: ngd_source_node.id,
            target_qnode.id: ngd_target_node.id,
        }

    if not eu.qg_is_fulfilled(query_graph, final_kg):
        no_paths_message = "No paths were found satisfying this query graph using NGD"
        if continue_if_no_results:
            log.warning(no_paths_message)
        else:
            log.error(no_paths_message, error_code="NoResults")

    return final_kg, edge_to_nodes_map
def _add_answers_to_kg(self, answer_kg: DictKnowledgeGraph, reasoner_std_response: Dict[str, any], input_qnode_id: str, output_qnode_id: str, qedge_id: str, log: Response) -> DictKnowledgeGraph:
    """
    Load the nodes and edges from a BTE response into the answer knowledge graph.

    :param answer_kg: The knowledge graph to add the BTE answers to.
    :param reasoner_std_response: The BTE response; its 'results' and 'knowledge_graph' ('nodes'/'edges')
                                  entries are read.
    :param input_qnode_id: The qnode ID that BTE's "n0" corresponds to.
    :param output_qnode_id: The qnode ID that BTE's "n1" corresponds to.
    :param qedge_id: The qedge ID that BTE's "e1" corresponds to.
    :param log: The response object used for logging.
    :return: The answer knowledge graph (returned early, after logging an error, if a BTE qg_id can't be mapped).
    """
    kg_to_qg_ids_dict = self._build_kg_to_qg_id_dict(reasoner_std_response['results'])
    bte_edges = reasoner_std_response['knowledge_graph']['edges']
    if not bte_edges:
        # Nothing is loaded (not even nodes) when BTE returned no edges
        return answer_kg

    remapped_node_ids = {}
    log.debug(f"Got results back from BTE for this query "
              f"({len(bte_edges)} edges)")

    # Map the returned BTE qg_ids back to the original qnode_ids in our query graph
    qnode_ids_by_bte_qg_id = {"n0": input_qnode_id, "n1": output_qnode_id}

    for bte_node in reasoner_std_response['knowledge_graph']['nodes']:
        swagger_node = Node()
        bte_node_id = bte_node.get('id')
        swagger_node.name = bte_node.get('name')
        snake_case_type = eu.convert_string_to_snake_case(bte_node.get('type'))
        swagger_node.type = eu.convert_string_or_list_to_list(snake_case_type)

        bte_qg_id = kg_to_qg_ids_dict['nodes'].get(bte_node_id)
        if bte_qg_id not in qnode_ids_by_bte_qg_id:
            log.error("Could not map BTE qg_id to ARAX qnode_id", error_code="UnknownQGID")
            return answer_kg
        qnode_id = qnode_ids_by_bte_qg_id[bte_qg_id]

        # Find and use the preferred equivalent identifier for this node (if it's an output node)
        if qnode_id != output_qnode_id:
            swagger_node.id = bte_node_id
        elif bte_node_id in remapped_node_ids:
            swagger_node.id = remapped_node_ids.get(bte_node_id)
        else:
            equivalent_curies = []
            for prefix, local_ids in bte_node.get('equivalent_identifiers').items():
                for local_id in local_ids:
                    equivalent_curies.append(f"{prefix}:{eu.get_curie_local_id(local_id)}")
            swagger_node.id = self._get_best_equivalent_bte_curie(equivalent_curies, swagger_node.type[0])
            remapped_node_ids[bte_node_id] = swagger_node.id

        answer_kg.add_node(swagger_node, qnode_id)

    for bte_edge in bte_edges:
        swagger_edge = Edge()
        swagger_edge.id = bte_edge.get("id")
        swagger_edge.type = bte_edge.get('type')
        # Point the edge at the remapped node IDs where output nodes were given a preferred identifier
        bte_source_id = bte_edge.get('source_id')
        bte_target_id = bte_edge.get('target_id')
        swagger_edge.source_id = remapped_node_ids.get(bte_source_id, bte_source_id)
        swagger_edge.target_id = remapped_node_ids.get(bte_target_id, bte_target_id)
        swagger_edge.is_defined_by = "BTE"
        swagger_edge.provided_by = bte_edge.get('edge_source')

        # Map the returned BTE qg_id back to the original qedge_id in our query graph
        bte_qg_id = kg_to_qg_ids_dict['edges'].get(swagger_edge.id)
        if bte_qg_id != "e1":
            log.error("Could not map BTE qg_id to ARAX qedge_id", error_code="UnknownQGID")
            return answer_kg
        answer_kg.add_edge(swagger_edge, qedge_id)

    return answer_kg