def get_canonical_curies_dict(curie: Union[str, List[str]], log: ARAXResponse) -> Dict[str, Dict[str, str]]:
    """Fetch canonical curie info for one or more curies from the NodeSynonymizer.

    Args:
        curie: A single curie or a list of curies.
        log: Response object that receives debug/warning/error messages.

    Returns:
        Whatever ``NodeSynonymizer.get_canonical_curies()`` returned, or an
        empty dict when the synonymizer raised an exception or returned
        ``None`` (an error is logged in both of those cases).
    """
    curie_list = convert_string_or_list_to_list(curie)
    try:
        node_synonymizer = NodeSynonymizer()
        log.debug(f"Sending NodeSynonymizer.get_canonical_curies() a list of {len(curie_list)} curies")
        canonical_info = node_synonymizer.get_canonical_curies(curie_list)
        log.debug(f"Got response back from NodeSynonymizer")
    except Exception:
        tb = traceback.format_exc()
        exception_class = sys.exc_info()[0]
        log.error(f"Encountered a problem using NodeSynonymizer: {tb}", error_code=exception_class.__name__)
        return {}

    if canonical_info is None:
        log.error(f"NodeSynonymizer returned None", error_code="NodeNormalizationIssue")
        return {}

    # Input curies that came back with an empty/missing entry are reported, not dropped
    curies_without_info = {input_curie for input_curie, info in canonical_info.items() if not info}
    if curies_without_info:
        log.warning(f"NodeSynonymizer did not return canonical info for: {curies_without_info}")
    return canonical_info
def _add_inverted_predicates(qg: QueryGraph, log: ARAXResponse) -> QueryGraph:
    """Add the Biolink inverse of each predicate to the query graph's first qedge.

    The Biolink Model yaml file is downloaded from GitHub and consulted for an
    ``inverse`` entry for each predicate on the qedge. If the model cannot be
    downloaded (connection problem, timeout, or non-200 status), a warning is
    logged and the query graph is returned unchanged.

    Args:
        qg: Query graph whose first qedge's predicates are expanded in place.
        log: Response object that receives debug/warning messages.

    Returns:
        The same query graph object that was passed in.
    """
    # For now, we'll consider BOTH predicates in an inverse pair (TODO: later tailor to what we know is in KG2)
    qedge = next(iter(qg.edges.values()))
    biolink_model_url = "https://raw.githubusercontent.com/biolink/biolink-model/master/biolink-model.yaml"
    try:
        # Without a timeout a stalled connection would block the caller indefinitely
        response = requests.get(biolink_model_url, timeout=30)
    except requests.exceptions.RequestException as request_error:
        log.warning(f"Cannot check for inverse predicates: Failed to load Biolink Model yaml file. "
                    f"({request_error})")
        return qg

    if response.status_code != 200:
        log.warning(f"Cannot check for inverse predicates: Failed to load Biolink Model yaml file. "
                    f"(Page gave status {response.status_code}.)")
        return qg

    qedge.predicate = eu.convert_to_list(qedge.predicate)
    biolink_model = yaml.safe_load(response.text)
    inverse_predicates = set()
    for predicate in qedge.predicate:
        english_predicate = predicate.split(":")[-1].replace("_", " ")  # Converts to 'subclass of' format
        biolink_predicate_info = biolink_model["slots"].get(english_predicate)
        if biolink_predicate_info and "inverse" in biolink_predicate_info:
            english_inverse_predicate = biolink_predicate_info["inverse"]
            machine_inverse_predicate = f"biolink:{english_inverse_predicate.replace(' ', '_')}"
            inverse_predicates.add(machine_inverse_predicate)
            log.debug(f"Found inverse predicate for {predicate}: {machine_inverse_predicate}")
    qedge.predicate = list(set(qedge.predicate).union(inverse_predicates))
    return qg
def get_preferred_categories(curie: Union[str, List[str]], log: ARAXResponse) -> Optional[List[str]]:
    """Return the preferred categories the NodeSynonymizer reports for the given curie(s).

    Args:
        curie: A single curie or a list of curies.
        log: Response object that receives debug/warning/error messages.

    Returns:
        The distinct preferred categories of the recognized curies. Falls back
        to ``["biolink:NamedThing"]`` when none could be found, and to an empty
        list (with an error logged) when the synonymizer returned ``None``.
    """
    curies = convert_to_list(curie)
    synonymizer = NodeSynonymizer()
    log.debug(f"Sending NodeSynonymizer.get_canonical_curies() a list of {len(curies)} curies")
    canonical_curies_dict = synonymizer.get_canonical_curies(curies)
    log.debug(f"Got response back from NodeSynonymizer")

    if canonical_curies_dict is None:
        log.error(f"NodeSynonymizer returned None", error_code="NodeNormalizationIssue")
        return []

    recognized_input_curies = {input_curie for input_curie in canonical_curies_dict
                               if canonical_curies_dict.get(input_curie)}
    unrecognized_curies = set(curies).difference(recognized_input_curies)
    if unrecognized_curies:
        log.warning(f"NodeSynonymizer did not recognize: {unrecognized_curies}")

    # An entry lacking 'preferred_category' yields None from .get(); drop such values so
    # the result only ever contains real category strings (otherwise we'd return [None])
    preferred_categories = {canonical_curies_dict[recognized_curie].get('preferred_category')
                            for recognized_curie in recognized_input_curies}
    preferred_categories = {category for category in preferred_categories if category}
    if preferred_categories:
        return list(preferred_categories)

    log.warning(f"Unable to find any preferred categories; will default to biolink:NamedThing")
    return ["biolink:NamedThing"]
def _answer_query_using_bte(self, input_qnode_key: str, output_qnode_key: str, qg: QueryGraph,
                            answer_kg: QGOrganizedKnowledgeGraph, valid_bte_inputs_dict: Dict[str, Set[str]],
                            log: ARAXResponse) -> Tuple[QGOrganizedKnowledgeGraph, Set[str]]:
    """Send a single-edge query to BioThings Explorer, one input curie at a time.

    Args:
        input_qnode_key: Key of the qnode whose curies are used as BTE inputs.
        output_qnode_key: Key of the qnode on the other end of the edge.
        qg: Query graph; its first qedge is the one that gets queried.
        answer_kg: Knowledge graph that answers are added to.
        valid_bte_inputs_dict: Must contain a 'curie_prefixes' entry listing the
            curie prefixes that are sent to BTE; other curies are skipped.
        log: Response object that receives debug/error messages.

    Returns:
        The (possibly extended) answer knowledge graph and the set of input
        curies that were actually sent to BTE. If a BTE call raises, an error
        is logged and whatever was gathered so far is returned.
    """
    accepted_curies = set()
    qedge_key = next(qedge_key for qedge_key in qg.edges)
    qedge = qg.edges[qedge_key]
    input_qnode = qg.nodes[input_qnode_key]
    output_qnode = qg.nodes[output_qnode_key]

    # Send this single-edge query to BTE, input curie by input curie (adding findings to our answer KG as we go)
    for curie in input_qnode.id:
        # Consider all different combinations of qnode types (can be multiple if gene/protein)
        for input_qnode_category, output_qnode_category in itertools.product(input_qnode.category,
                                                                             output_qnode.category):
            if eu.get_curie_prefix(curie) in valid_bte_inputs_dict['curie_prefixes']:
                accepted_curies.add(curie)
                loop = None
                try:
                    loop = asyncio.new_event_loop()
                    seqd = SingleEdgeQueryDispatcher(input_cls=input_qnode_category,
                                                     output_cls=output_qnode_category,
                                                     pred=qedge.predicate,
                                                     input_id=eu.get_curie_prefix(curie),
                                                     values=eu.get_curie_local_id(curie),
                                                     loop=loop)
                    log.debug(f"Sending query to BTE: {curie}-{qedge.predicate if qedge.predicate else ''}->{output_qnode_category}")
                    seqd.query()
                    reasoner_std_response = seqd.to_reasoner_std()
                except Exception:
                    trace_back = traceback.format_exc()
                    error_type, error, _ = sys.exc_info()
                    log.error(f"Encountered a problem while using BioThings Explorer. {trace_back}",
                              error_code=error_type.__name__)
                    return answer_kg, accepted_curies
                else:
                    answer_kg = self._add_answers_to_kg(answer_kg, reasoner_std_response, input_qnode_key,
                                                        output_qnode_key, qedge_key, log)
                finally:
                    # Each query gets its own event loop; close it so loops are not leaked
                    if loop is not None:
                        loop.close()
    return answer_kg, accepted_curies
def _load_answers_into_kg(self, neo4j_results: List[Dict[str, List[Dict[str, any]]]], kg_name: str,
                          qg: QueryGraph, log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
    """Convert the first row of neo4j results into a QG-organized knowledge graph.

    Columns whose name starts with 'nodes' (e.g. 'nodes_n00') are loaded as
    nodes under the qnode key in the column name; columns starting with
    'edges' (e.g. 'edges_e01') are loaded as edges under the qedge key.

    Args:
        neo4j_results: Query results; only the first element is read.
        kg_name: Name of the knowledge graph that was queried. For "KG1" a
            node UUID-to-curie lookup is built and handed to the edge converter.
        qg: Query graph (its first qedge key is only used for logging).
        log: Response object that receives debug messages.

    Returns:
        The populated knowledge graph.
    """
    log.debug(f"Processing query results for edge {next(iter(qg.edges))}")
    answer_kg = QGOrganizedKnowledgeGraph()
    results_table = neo4j_results[0]
    uuid_to_curie = self._build_node_uuid_to_curie_dict(results_table) if kg_name == "KG1" else dict()

    for column_name in list(results_table):
        if column_name.startswith('nodes'):
            # Example column name: 'nodes_n00'
            qnode_key = column_name.replace("nodes_", "", 1)
            for raw_node in results_table.get(column_name):
                node_key, node = self._convert_neo4j_node_to_trapi_node(raw_node, kg_name)
                answer_kg.add_node(node_key, node, qnode_key)
        elif column_name.startswith('edges'):
            # Example column name: 'edges_e01'
            qedge_key = column_name.replace("edges_", "", 1)
            for raw_edge in results_table.get(column_name):
                edge_key, edge = self._convert_neo4j_edge_to_trapi_edge(raw_edge, uuid_to_curie, kg_name)
                answer_kg.add_edge(edge_key, edge, qedge_key)
    return answer_kg
def QGI_test5():
    """Translate a forked (non-linear) query graph to ARAXi and print the commands.

    Returns the response early (after printing it) if translation fails.
    """
    # This is to test forked/non-linear queries (currently not working properly)
    qnodes = {
        "n0": {"categories": ["biolink:Gene"]},
        "n1": {"ids": ["CHEBI:45783"], "categories": ["biolink:ChemicalEntity"]},
        "n2": {"ids": ["MONDO:0005301"], "categories": ["biolink:Disease"]},
        "n3": {"categories": ["biolink:ChemicalEntity"]},
    }
    qedges = {
        "e01": {"subject": "n0", "object": "n1", "predicates": ["biolink:related_to"]},
        "e02": {"subject": "n0", "object": "n2", "predicates": ["biolink:related_to"]},
        "e03": {"subject": "n0", "object": "n3", "predicates": ["biolink:related_to"]},
    }
    input_query_graph = {"message": {"query_graph": {"nodes": qnodes, "edges": qedges}}}

    #### Create a template Message
    response = ARAXResponse()
    messenger = ARAXMessenger()
    messenger.create_envelope(response)

    message = ARAXMessenger().from_dict(input_query_graph['message'])
    response.envelope.message.query_graph = message.query_graph

    #### Translate the query graph into ARAXi commands
    interpreter = ARAXQueryGraphInterpreter()
    interpreter.translate_to_araxi(response)
    if response.status != 'OK':
        print(response.show(level=ARAXResponse.DEBUG))
        return response

    for araxi_command in response.data['araxi_commands']:
        print(f"  - {araxi_command}")
def _convert_one_hop_query_graph_to_cypher_query(self, qg: QueryGraph, enforce_directionality: bool,
                                                 log: ARAXResponse) -> str:
    """Build a cypher query for a one-hop query graph.

    The query has MATCH, (optional) WHERE, WITH and RETURN clauses and returns
    three columns: 'nodes_<subject key>', 'nodes_<object key>' and
    'edges_<qedge key>'.

    Args:
        qg: Query graph; its first qedge is the one converted.
        enforce_directionality: Passed through to the qedge cypher helper.
        log: Response object that receives debug/error messages.

    Returns:
        The cypher query string, or an empty string if anything raised while
        building it (an error is logged in that case).
    """
    qedge_key = next(iter(qg.edges))
    qedge = qg.edges[qedge_key]
    log.debug(f"Generating cypher for edge {qedge_key} query graph")
    try:
        subject_key = qedge.subject
        object_key = qedge.object

        # MATCH clause
        edge_pattern = self._get_cypher_for_query_edge(qedge_key, qg, enforce_directionality)
        subject_pattern = self._get_cypher_for_query_node(subject_key, qg)
        object_pattern = self._get_cypher_for_query_node(object_key, qg)
        match_clause = f"MATCH {subject_pattern}{edge_pattern}{object_pattern}"

        # WHERE clause
        conditions = []
        for qnode_key in (subject_key, object_key):
            qnode = qg.nodes[qnode_key]
            if qnode.id and isinstance(qnode.id, list) and len(qnode.id) > 1:
                conditions.append(f"{qnode_key}.id in {qnode.id}")
            if not qnode.category:
                continue
            qnode.category = eu.convert_to_list(qnode.category)
            if len(qnode.category) > 1:
                # Create where fragment that looks like 'n00:biolink:Disease OR n00:biolink:PhenotypicFeature..'
                label_checks = ' OR '.join(f"{qnode_key}:`{category}`" for category in qnode.category)
                conditions.append(f"({label_checks})")
        if conditions:
            where_clause = f"WHERE {' AND '.join(conditions)}"
        else:
            where_clause = ""

        # WITH clause
        subject_column = f"nodes_{subject_key}"
        object_column = f"nodes_{object_key}"
        edge_column = f"edges_{qedge_key}"
        # This grabs the edge's ID and a record of which of its nodes correspond to which qnode ID
        edge_projection = (f"{{.*, id:ID({qedge_key}), {subject_key}:{subject_key}.id, "
                           f"{object_key}:{object_key}.id}}")
        with_clause = (f"WITH collect(distinct {subject_key}) as {subject_column}, "
                       f"collect(distinct {object_key}) as {object_column}, "
                       f"collect(distinct {qedge_key}{edge_projection}) as {edge_column}")

        # RETURN clause
        return_clause = f"RETURN {subject_column}, {object_column}, {edge_column}"

        return f"{match_clause} {where_clause} {with_clause} {return_clause}"
    except Exception:
        tb = traceback.format_exc()
        exception_class = sys.exc_info()[0]
        log.error(f"Problem generating cypher for query. {tb}", error_code=exception_class.__name__)
        return ""
def _send_query_to_kp(self, query_graph: QueryGraph, log: ARAXResponse) -> Dict[str, any]:
    """Query the KP once per input curie and merge the responses.

    Args:
        query_graph: Single-edge query graph to send.
        log: Response object that receives debug/warning messages.

    Returns:
        The first response that contained results, with the knowledge graph
        nodes/edges and results of later responses appended to it. An empty
        dict if no response contained results.
    """
    # Send query to their API (stripping down qnode/qedges to only the properties they like)
    stripped_qnodes = []
    for qnode_key, qnode in query_graph.nodes.items():
        slim_qnode = {'id': qnode_key, 'type': qnode.category}
        if qnode.id:
            slim_qnode['curie'] = qnode.id
        stripped_qnodes.append(slim_qnode)

    qedge_key = next(iter(query_graph.edges))  # Our query graph is single-edge
    qedge = query_graph.edges[qedge_key]
    stripped_qedge = {'id': qedge_key,
                      'source_id': qedge.subject,
                      'target_id': qedge.object,
                      'type': list(self.accepted_edge_types)[0]}

    source_stripped_qnode = next(slim_qnode for slim_qnode in stripped_qnodes
                                 if slim_qnode['id'] == qedge.subject)
    input_curies = eu.convert_string_or_list_to_list(source_stripped_qnode['curie'])
    request_headers = {'accept': 'application/json'}
    combined_response = dict()

    # Until we have batch querying, ping them one-by-one for each input curie
    for input_curie in input_curies:
        log.debug(f"Sending {qedge_key} query to {self.kp_name} for {input_curie}")
        source_stripped_qnode['curie'] = input_curie
        request_body = {'message': {'query_graph': {'nodes': stripped_qnodes,
                                                    'edges': [stripped_qedge]}}}
        kp_response = requests.post(self.kp_query_endpoint, json=request_body, headers=request_headers)
        if kp_response.status_code != 200:
            log.warning(f"{self.kp_name} KP API returned response of {kp_response.status_code}")
            continue

        kp_response_json = kp_response.json()
        if not kp_response_json.get('results'):
            continue
        if not combined_response:
            # The first non-empty response becomes the base that later ones are merged into
            combined_response = kp_response_json
            continue

        combined_kg = combined_response['knowledge_graph']
        incoming_kg = kp_response_json['knowledge_graph']
        combined_kg['nodes'] += incoming_kg['nodes']
        combined_kg['edges'] += incoming_kg['edges']
        combined_response['results'] += kp_response_json['results']
    return combined_response
def _run_arax_query(request_body: dict, log: ARAXResponse) -> Tuple[ARAXResponse, Message]:
    """Run a sub-query through ARAXQuery in "RTXKG2" mode.

    Args:
        request_body: The request handed to ``ARAXQuery.query()``.
        log: Response object; an error is logged on it if the sub-query's
            status is not 'OK'.

    Returns:
        The sub-query's response and the ARAXQuery object's message.
    """
    arax_query = ARAXQuery()
    response = arax_query.query(request_body, mode="RTXKG2")
    query_failed = response.status != 'OK'
    if query_failed:
        log.error(f"Encountered an error running ARAXQuery within Expand: {response.show(level=response.DEBUG)}")
    return response, arax_query.message
def test_example1():
    """Assess a two-edge query graph with QueryGraphInfo and print what it found.

    Returns the response early (after printing it) if the assessment fails.
    """
    qedges = {
        "e00": {"subject": "n00", "object": "n01"},
        "e01": {"subject": "n00", "object": "n01", "predicate": "biolink:contraindicated_for", "exclude": True},
    }
    qnodes = {
        "n00": {"id": "MONDO:0001627", "category": "biolink:Disease"},
        "n01": {"category": "biolink:ChemicalSubstance"},
    }
    query_graph = {"edges": qedges, "nodes": qnodes}

    from ARAX_messenger import ARAXMessenger
    response = ARAXResponse()
    messenger = ARAXMessenger()
    messenger.create_envelope(response)
    response.envelope.message.query_graph = QueryGraph().from_dict(query_graph)

    query_graph_info = QueryGraphInfo()
    result = query_graph_info.assess(response.envelope.message)
    response.merge(result)
    if result.status != 'OK':
        print(response.show(level=ARAXResponse.DEBUG))
        return response

    # Gather the assessed properties under their own names for display
    reported_properties = ('n_nodes', 'n_edges', 'is_bifurcated_graph', 'start_node', 'node_info',
                           'edge_info', 'node_order', 'edge_order', 'node_category_map',
                           'edge_predicate_map')
    query_graph_info_dict = {property_name: getattr(query_graph_info, property_name)
                             for property_name in reported_properties}
    printable_info = ast.literal_eval(repr(query_graph_info_dict))
    print(json.dumps(printable_info, sort_keys=True, indent=2))
def _run_arax_query(actions_list: List[str], log: ARAXResponse) -> Tuple[ARAXResponse, Message]:
    """Run a list of actions as a sub-query through ARAXQuery.

    Args:
        actions_list: Actions placed under ``operations.actions`` of the request.
        log: Response object; an error is logged on it if the sub-query's
            status is not 'OK'.

    Returns:
        The sub-query's response and the ARAXQuery object's message.
    """
    arax_query = ARAXQuery()
    request = {"operations": {"actions": actions_list}}
    response = arax_query.query(request)
    query_failed = response.status != 'OK'
    if query_failed:
        log.error(f"Encountered an error running ARAXQuery within Expand: {response.show(level=response.DEBUG)}")
    return response, arax_query.message
def QGI_test3():
    """Translate a two-hop query graph to ARAXi, then print the commands, response and message.

    Returns the response early (after printing it) if translation fails.
    """
    qnodes = {
        "n00": {"id": "MONDO:0002715"},
        "n01": {"category": "biolink:ChemicalSubstance"},
        "n02": {"category": "biolink:Gene"},
    }
    qedges = {
        "e00": {"predicate": "biolink:correlated_with", "subject": "n00", "object": "n01"},
        "e01": {"predicate": "biolink:related_to", "subject": "n01", "object": "n02"},
    }
    input_query_graph = {"message": {"query_graph": {"nodes": qnodes, "edges": qedges}}}

    #### Create a template Message
    response = ARAXResponse()
    messenger = ARAXMessenger()
    messenger.create_envelope(response)

    message = ARAXMessenger().from_dict(input_query_graph['message'])
    response.envelope.message.query_graph = message.query_graph

    #### Translate the query graph into ARAXi commands
    interpreter = ARAXQueryGraphInterpreter()
    interpreter.translate_to_araxi(response)
    if response.status != 'OK':
        print(response.show(level=ARAXResponse.DEBUG))
        return response

    for araxi_command in response.data['araxi_commands']:
        print(f"  - {araxi_command}")

    #### Show the final result
    print('-------------------------')
    print(response.show(level=ARAXResponse.DEBUG))
    print(json.dumps(message.to_dict(), sort_keys=True, indent=2))
def apply(self, input_message, input_parameters):
    """Apply the filter to a message using the supplied parameters.

    Args:
        input_message: Message to filter; stored on ``self.message``.
        input_parameters: Dict of overrides for the allowed parameters
            ('maximum_results', 'minimum_confidence', 'start_node').

    Returns:
        An ARAXResponse. It carries an error if ``input_parameters`` is not a
        dict or contains a key that is not an allowed parameter.
    """
    #### Define a default response
    response = ARAXResponse()
    self.response = response
    self.message = input_message

    #### Basic checks on arguments
    if not isinstance(input_parameters, dict):
        response.error("Provided parameters is not a dict", error_code="ParametersNotDict")
        return response

    #### Define a complete set of allowed parameters and their defaults
    parameters = {
        'maximum_results': None,
        'minimum_confidence': None,
        'start_node': 1,
    }

    #### Override the defaults with the supplied values, flagging any that are not allowed
    for key, value in input_parameters.items():
        if key in parameters:
            parameters[key] = value
            continue
        response.error(f"Supplied parameter {key} is not permitted", error_code="UnknownParameter")

    #### Return if any of the parameters generated an error (showing not just the first one)
    if response.status != 'OK':
        return response

    #### Store these final parameters for convenience
    response.data['parameters'] = parameters
    self.parameters = parameters

    #### Now apply the filters. Order of operations is probably quite important
    #### Scalar value filters probably come first like minimum_confidence, then complex logic filters
    #### based on edge or node properties, and then finally maximum_results
    response.debug(f"Applying filter to Message with parameters {parameters}")

    #### First, as a test, blow away the results and see if we can recompute them
    #message.n_results = 0
    #message.results = []
    #self.__recompute_results()

    #### Apply scalar value filters first to do easy things and reduce the problem
    # TODO

    #### Complex logic filters probably come next. These may be hard
    # TODO

    #### Finally, if the maximum_results parameter is set, then limit the number of results to that last
    maximum_results = parameters['maximum_results']
    if maximum_results is not None:
        self.__apply_maximum_results_filter(maximum_results)

    #### Return the response
    return response
def determine_virtual_qedge_option_group(subject_qnode_key: str, object_qnode_key: str, query_graph: QueryGraph,
                                         log: ARAXResponse) -> Optional[str]:
    """Determine what option group ID a virtual qedge between two qnodes should have.

    Args:
        subject_qnode_key: Key of the virtual qedge's subject qnode.
        object_qnode_key: Key of the virtual qedge's object qnode.
        query_graph: Query graph containing the qnodes.
        log: Response object; receives an error if the two qnodes belong to
            different option groups.

    Returns:
        The single option group ID shared by the qnodes (or held by just one
        of them), otherwise ``None``.
    """
    endpoint_keys = {subject_qnode_key, object_qnode_key}
    option_group_ids = set()
    for qnode_key, qnode in query_graph.nodes.items():
        if qnode_key in endpoint_keys and qnode.option_group_id:
            option_group_ids.add(qnode.option_group_id)

    if not option_group_ids:
        return None
    if len(option_group_ids) == 1:
        return next(iter(option_group_ids))
    log.error(f"Cannot add a virtual qedge between two qnodes that belong to different option groups {option_group_ids}",
              error_code="InvalidQEdge")
    return None
def _answer_query_using_neo4j(self, cypher_query: str, qedge_key: str, kg_name: str,
                              log: ARAXResponse) -> List[Dict[str, List[Dict[str, any]]]]:
    """Run a cypher query against the named neo4j knowledge graph.

    Args:
        cypher_query: The cypher query to run.
        qedge_key: Key of the qedge being answered (used for logging only).
        kg_name: Name of the knowledge graph to query.
        log: Response object that receives the info message and is passed on
            to the query runner.

    Returns:
        Whatever ``self._run_cypher_query()`` returned, unmodified.
    """
    log.info(f"Sending cypher query for edge {qedge_key} to {kg_name} neo4j")
    # NOTE: a per-column length tally used to be computed here but was never used;
    # its only observable effect was an IndexError when the result list was empty.
    return self._run_cypher_query(cypher_query, kg_name, log)
def test_add_qnode_bad_parameters():
    """add_qnode must reject each malformed parameter set with the expected error code."""
    response = ARAXResponse()
    messenger = ARAXMessenger()
    messenger.create_envelope(response)
    assert response.status == 'OK'

    bad_parameters_list = [
        {'parameters': ['ids', 'PICKLES:123'], 'error_code': 'ParametersNotDict'},
        {'parameters': {'pickles': 'on the side'}, 'error_code': 'UnknownParameter'},
        {'parameters': {'ids': 'n2', 'category': 'biolink:Disease'}, 'error_code': 'UnknownParameter'},
    ]

    # Every case starts from a fresh copy of the pristine response
    pristine_response = copy.deepcopy(response)
    for bad_case in bad_parameters_list:
        response = copy.deepcopy(pristine_response)
        message = response.envelope.message
        print(bad_case)
        messenger.add_qnode(response, bad_case['parameters'])
        assert response.status == 'ERROR'
        assert len(message.query_graph.nodes) == 0
        assert response.error_code == bad_case['error_code']
def test_add_qedge_duplicate_key():
    """Adding a second qedge under an existing key must fail with QEdgeDuplicateKey."""
    response = ARAXResponse()
    messenger = ARAXMessenger()
    messenger.create_envelope(response)
    assert response.status == 'OK'
    message = response.envelope.message

    # Set up two qnodes joined by qedge 'e00'
    messenger.add_qnode(response, {'key': 'n00', 'ids': ['CHEMBL.COMPOUND:CHEMBL112']})
    messenger.add_qnode(response, {'key': 'n01', 'categories': ['biolink:Protein']})
    messenger.add_qedge(response, {'key': 'e00', 'subject': 'n00', 'object': 'n01'})
    assert response.status == 'OK'

    # Re-using the key 'e00' is what should trigger the error
    duplicate_qedge = {'key': 'e00', 'subject': 'n00', 'object': 'n01', 'predicates': ['biolink:treats']}
    messenger.add_qedge(response, duplicate_qedge)
    printable_edges = ast.literal_eval(repr(message.query_graph.edges))
    print(json.dumps(printable_edges, sort_keys=True, indent=2))

    assert response.status == 'ERROR'
    assert isinstance(message.query_graph.nodes, dict)
    assert len(message.query_graph.edges) == 1
    assert response.error_code == 'QEdgeDuplicateKey'
def QGI_test4():
    """Translate a one-hop gene-to-disease query graph to ARAXi and print the commands.

    Returns the response early (after printing it) if translation fails.
    """
    qnodes = {
        "n00": {"categories": ["biolink:Gene"], "is_set": False},
        "n01": {"ids": ["MONDO:0018177"], "categories": ["biolink:Disease"], "is_set": False},
    }
    qedges = {
        "e00": {"subject": "n00", "object": "n01", "exclude": False},
    }
    input_query_graph = {"message": {"query_graph": {"nodes": qnodes, "edges": qedges}}}

    #### Create a template Message
    response = ARAXResponse()
    messenger = ARAXMessenger()
    messenger.create_envelope(response)

    message = ARAXMessenger().from_dict(input_query_graph['message'])
    response.envelope.message.query_graph = message.query_graph

    #### Translate the query graph into ARAXi commands
    interpreter = ARAXQueryGraphInterpreter()
    interpreter.translate_to_araxi(response)
    if response.status != 'OK':
        print(response.show(level=ARAXResponse.DEBUG))
        return response

    for araxi_command in response.data['araxi_commands']:
        print(f"  - {araxi_command}")
def __init__(self, log: Optional[ARAXResponse] = None):
    """Set up file paths, the KP list, the timeout record, the meta map and a BiolinkHelper.

    Args:
        log: Response object used for logging. When omitted, a new
            ARAXResponse is created for this instance. (A default of
            ``ARAXResponse()`` in the signature would be evaluated only once
            and then shared by every instance built without an explicit log.)
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    self.meta_map_path = f"{module_dir}/meta_map_v2.pickle"
    self.timeout_record_path = f"{module_dir}/kp_timeout_record.pickle"
    self.log = log if log is not None else ARAXResponse()
    self.all_kps = eu.get_all_kps()
    self.timeout_record = self._load_timeout_record()
    self.meta_map = self._load_meta_map()
    self.biolink_helper = BiolinkHelper()
def test_create_message_basic():
    """A freshly created envelope has an OK status and the expected type and schema version."""
    response = ARAXResponse()
    messenger = ARAXMessenger()
    messenger.create_envelope(response)
    assert response.status == 'OK'

    envelope = response.envelope
    message = envelope.message
    assert envelope.type == 'translator_reasoner_response'
    assert envelope.schema_version == '1.0.0'
def check_for_canonical_predicates(kg: QGOrganizedKnowledgeGraph, kp_name: str,
                                   log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
    """Flip every edge in the knowledge graph whose predicate is not the canonical one.

    Args:
        kg: Knowledge graph whose edges are checked (and flipped via
            ``flip_edge`` where needed).
        kp_name: Name of the KP the answer came from (used in the warning).
        log: Response object; receives one warning listing all non-canonical
            predicates that were encountered.

    Returns:
        The same knowledge graph object that was passed in.
    """
    offending_predicates = set()
    biolink_helper = BiolinkHelper()
    for edges in kg.edges_by_qg_id.values():
        for edge in edges.values():
            canonical_predicate = biolink_helper.get_canonical_predicates(edge.predicate)[0]
            if canonical_predicate == edge.predicate:
                continue
            offending_predicates.add(edge.predicate)
            flip_edge(edge, canonical_predicate)

    if offending_predicates:
        log.warning(f"{kp_name}: Found edges in {kp_name}'s answer that use non-canonical "
                    f"predicates: {offending_predicates}. I corrected these.")
    return kg
def update_results_with_overlay_edge(subject_knode_key: str, object_knode_key: str, kedge_key: str,
                                     message: Message, log: ARAXResponse):
    """Bind an overlay edge into every result that contains both of its nodes.

    For each result and each of its qedge keys, the edge is appended to that
    qedge's bindings when it is not already bound there and both
    ``subject_knode_key`` and ``object_knode_key`` appear among the node
    bindings of the qedge's subject and object qnodes.

    Args:
        subject_knode_key: Knowledge graph key of the overlay edge's subject node.
        object_knode_key: Knowledge graph key of the overlay edge's object node.
        kedge_key: Knowledge graph key of the overlay edge.
        message: Message whose results are modified in place.
        log: Response object that receives warnings/errors. Any exception is
            caught and logged with error code "UncaughtError".
    """
    try:
        new_edge_binding = EdgeBinding(id=kedge_key)
        for result in message.results:
            for qedge_key in result.edge_bindings.keys():
                already_bound_edge_keys = {edge_binding.id for edge_binding in result.edge_bindings[qedge_key]}
                if kedge_key in already_bound_edge_keys:
                    continue
                if qedge_key not in message.query_graph.edges:
                    log.warning(f"Encountered a result edge binding which does not exist in the query graph")
                    continue
                qedge = message.query_graph.edges[qedge_key]
                result_nodes = {node_binding.id for node_binding in result.node_bindings[qedge.subject]}
                result_nodes.update(node_binding.id for node_binding in result.node_bindings[qedge.object])
                if subject_knode_key in result_nodes and object_knode_key in result_nodes:
                    result.edge_bindings[qedge_key].append(new_edge_binding)
    except Exception:
        # 'Exception' rather than a bare except, so KeyboardInterrupt/SystemExit still propagate
        tb = traceback.format_exc()
        log.error(f"Error encountered when modifying results with overlay edge "
                  f"({subject_knode_key})-{kedge_key}-({object_knode_key}):\n{tb}",
                  error_code="UncaughtError")
def _load_answers_into_kg(self, neo4j_results: List[Dict[str, List[Dict[str, any]]]], kg_name: str,
                          qg: QueryGraph, log: ARAXResponse
                          ) -> Tuple[QGOrganizedKnowledgeGraph, Dict[str, Dict[str, str]]]:
    """Convert the first row of neo4j results into a knowledge graph plus an edge-to-nodes map.

    Columns whose name starts with 'nodes' (e.g. 'nodes_n00') are loaded as
    nodes under the qnode key in the column name; columns starting with
    'edges' (e.g. 'edges_e01') are loaded as edges under the qedge key.

    Args:
        neo4j_results: Query results; only the first element is read.
        kg_name: Name of the knowledge graph that was queried. For "KG1" a
            node UUID-to-curie lookup is built and handed to the edge converter.
        qg: Query graph; its qnode keys are looked up on every neo4j edge.
        log: Response object that receives debug messages.

    Returns:
        The populated knowledge graph, and a dict mapping each edge key to a
        dict of ``{qnode_key: value stored under that key on the neo4j edge}``.
    """
    log.debug(f"Processing query results for edge {next(iter(qg.edges))}")
    answer_kg = QGOrganizedKnowledgeGraph()
    edge_to_nodes_map = dict()
    results_table = neo4j_results[0]
    uuid_to_curie = self._build_node_uuid_to_curie_dict(results_table) if kg_name == "KG1" else dict()

    for column_name in list(results_table):
        if column_name.startswith('nodes'):
            # Example column name: 'nodes_n00'
            qnode_key_for_column = column_name.replace("nodes_", "", 1)
            for raw_node in results_table.get(column_name):
                node_key, node = self._convert_neo4j_node_to_swagger_node(raw_node, kg_name)
                answer_kg.add_node(node_key, node, qnode_key_for_column)
        elif column_name.startswith('edges'):
            # Example column name: 'edges_e01'
            qedge_key_for_column = column_name.replace("edges_", "", 1)
            for raw_edge in results_table.get(column_name):
                edge_key, edge = self._convert_neo4j_edge_to_swagger_edge(raw_edge, uuid_to_curie, kg_name)

                # Record which of this edge's nodes correspond to which qnode_key
                nodes_for_edge = edge_to_nodes_map.setdefault(edge_key, dict())
                for qnode_key in qg.nodes:
                    nodes_for_edge[qnode_key] = raw_edge.get(qnode_key)

                # Finally add the current edge to our answer knowledge graph
                answer_kg.add_edge(edge_key, edge, qedge_key_for_column)
    return answer_kg, edge_to_nodes_map
def _prune_highly_connected_nodes(kg: QGOrganizedKnowledgeGraph, qedge_key: str, input_curies: Set[str],
                                  input_qnode_key: str, max_edges_per_input_curie: int,
                                  log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
    """Cap the number of answer edges kept per input curie, then drop orphaned nodes.

    Args:
        kg: Knowledge graph to prune in place.
        qedge_key: Key of the qedge whose edges are pruned.
        input_curies: Curies that were used as query inputs.
        input_qnode_key: Key of the qnode the input curies fulfil.
        max_edges_per_input_curie: Maximum number of edges kept per input curie.
        log: Response object that receives debug messages.

    Returns:
        The same knowledge graph object that was passed in.
    """
    # First create a lookup of which edges belong to which input curies
    edge_keys_by_input_curie = defaultdict(set)
    for edge_key, edge in kg.edges_by_qg_id[qedge_key].items():
        for endpoint in (edge.subject, edge.object):
            if endpoint in input_curies:
                edge_keys_by_input_curie[endpoint].add(edge_key)

    # Then prune down highly-connected nodes (delete edges per input curie in excess of some set limit)
    for node_key, connected_edge_keys in edge_keys_by_input_curie.items():
        shuffled_edge_keys = list(connected_edge_keys)
        if len(shuffled_edge_keys) <= max_edges_per_input_curie:
            continue
        random.shuffle(shuffled_edge_keys)  # Make it random which edges we keep for this input curie
        edge_keys_to_remove = shuffled_edge_keys[max_edges_per_input_curie:]
        log.debug(f"Randomly removing {len(edge_keys_to_remove)} edges from answer for input curie {node_key}")
        for surplus_edge_key in edge_keys_to_remove:
            kg.edges_by_qg_id[qedge_key].pop(surplus_edge_key, None)

        # Document that not all answers for this input curie are included
        node = kg.nodes_by_qg_id[input_qnode_key].get(node_key)
        if not node:
            continue
        if not node.attributes:
            node.attributes = []
        already_flagged = any(attribute.attribute_type_id == "biolink:incomplete_result_set"
                              for attribute in node.attributes)
        if already_flagged:
            continue
        incomplete_note = (f"This attribute indicates that not all "
                           f"nodes/edges returned as answers for this input "
                           f"curie were included in the final answer due to "
                           f"size limitations. {max_edges_per_input_curie} "
                           f"edges for this input curie were kept.")
        node.attributes.append(Attribute(attribute_type_id="biolink:incomplete_result_set",  # TODO: request this as actual biolink item?
                                         value_type_id="metatype:Boolean",
                                         value=True,
                                         attribute_source="infores:rtx-kg2",
                                         description=incomplete_note))

    # Then delete any nodes orphaned by removal of edges
    node_keys_still_in_use = kg.get_all_node_keys_used_by_edges()
    for qnode_key, nodes in kg.nodes_by_qg_id.items():
        orphan_node_keys = set(nodes).difference(node_keys_still_in_use)
        if not orphan_node_keys:
            continue
        log.debug(f"Removing {len(orphan_node_keys)} {qnode_key} nodes orphaned by the above step")
        for orphan_node_key in orphan_node_keys:
            del kg.nodes_by_qg_id[qnode_key][orphan_node_key]
    return kg
def test_create_message_node_edge_types():
    """A fresh message stores knowledge graph and query graph nodes/edges as dicts."""
    response = ARAXResponse()
    messenger = ARAXMessenger()
    messenger.create_envelope(response)
    assert response.status == 'OK'

    message = response.envelope.message
    knowledge_graph = message.knowledge_graph
    assert isinstance(knowledge_graph.nodes, dict)
    assert isinstance(knowledge_graph.edges, dict)
    query_graph = message.query_graph
    assert isinstance(query_graph.nodes, dict)
    assert isinstance(query_graph.edges, dict)
def test_add_qnode_type():
    """Adding a qnode by category creates node 'n00' carrying that category."""
    response = ARAXResponse()
    messenger = ARAXMessenger()
    messenger.create_envelope(response)
    assert response.status == 'OK'
    message = response.envelope.message

    qnode_parameters = {'category': 'biolink:Protein'}
    messenger.add_qnode(response, qnode_parameters)

    assert response.status == 'OK'
    assert isinstance(message.query_graph.nodes, dict)
    assert len(message.query_graph.nodes) == 1
    assert message.query_graph.nodes['n00'].category == 'biolink:Protein'
def test_add_qnode_name():
    """Adding a qnode by name resolves 'acetaminophen' to its curie on node 'n00'."""
    response = ARAXResponse()
    messenger = ARAXMessenger()
    messenger.create_envelope(response)
    assert response.status == 'OK'
    message = response.envelope.message

    qnode_parameters = {'name': 'acetaminophen'}
    messenger.add_qnode(response, qnode_parameters)

    assert response.status == 'OK'
    assert isinstance(message.query_graph.nodes, dict)
    assert len(message.query_graph.nodes) == 1
    assert message.query_graph.nodes['n00'].id == 'CHEMBL.COMPOUND:CHEMBL112'
def test_add_qnode_curie_list():
    """Adding a qnode with a list of two curies keeps both on node 'n00'."""
    response = ARAXResponse()
    messenger = ARAXMessenger()
    messenger.create_envelope(response)
    assert response.status == 'OK'
    message = response.envelope.message

    qnode_parameters = {'id': ['UniProtKB:P14136', 'UniProtKB:P35579']}
    messenger.add_qnode(response, qnode_parameters)

    assert response.status == 'OK'
    assert isinstance(message.query_graph.nodes, dict)
    assert len(message.query_graph.nodes) == 1
    assert len(message.query_graph.nodes['n00'].id) == 2
def test_add_qnode_basic():
    """Adding a qnode with no parameters creates node 'n00' with no id."""
    response = ARAXResponse()
    messenger = ARAXMessenger()
    messenger.create_envelope(response)
    assert response.status == 'OK'
    message = response.envelope.message

    messenger.add_qnode(response, {})

    assert response.status == 'OK'
    assert isinstance(message.query_graph.nodes, dict)
    assert len(message.query_graph.nodes) == 1
    # Identity check: '== None' can be fooled by a custom __eq__ and is flagged by PEP 8
    assert message.query_graph.nodes['n00'].id is None
def test_add_qnode_bad_name():
    """Adding a qnode with an unresolvable name fails and adds no node."""
    response = ARAXResponse()
    messenger = ARAXMessenger()
    messenger.create_envelope(response)
    assert response.status == 'OK'
    message = response.envelope.message

    qnode_parameters = {'name': 'Big Bird'}
    messenger.add_qnode(response, qnode_parameters)

    assert response.status == 'ERROR'
    assert isinstance(message.query_graph.nodes, dict)
    assert len(message.query_graph.nodes) == 0
    assert response.error_code == 'UnresolvableNodeName'