def _add_inverted_predicates(qg: QueryGraph, log: ARAXResponse) -> QueryGraph:
    """
    Expand the predicates on the query graph's first qedge so that they also include each predicate's
    Biolink inverse.

    For now, BOTH predicates in an inverse pair are considered (TODO: later tailor to what we know is in KG2).
    The Biolink Model yaml file is downloaded on every call. If it cannot be retrieved (network failure,
    timeout, or non-200 status), a warning is logged and the query graph is returned unmodified.

    :param qg: Query graph whose first qedge has its predicate list expanded in place.
    :param log: Response object that debug/warning messages are written to.
    :return: The same query graph object that was passed in.
    """
    biolink_yaml_url = "https://raw.githubusercontent.com/biolink/biolink-model/master/biolink-model.yaml"

    qedge = next(iter(qg.edges.values()), None)
    if qedge is None:
        # An edgeless query graph has no predicates to expand
        return qg

    # A timeout keeps an unresponsive server from hanging the whole query; connection-level failures are
    # treated the same way as a bad status code (best-effort: warn and carry on without inverses)
    try:
        response = requests.get(biolink_yaml_url, timeout=30)
    except requests.exceptions.RequestException as error:
        log.warning(f"Cannot check for inverse predicates: Failed to load Biolink Model yaml file. "
                    f"(Request failed: {error})")
        return qg

    if response.status_code != 200:
        log.warning(f"Cannot check for inverse predicates: Failed to load Biolink Model yaml file. "
                    f"(Page gave status {response.status_code}.)")
        return qg

    qedge.predicate = eu.convert_to_list(qedge.predicate)
    biolink_model = yaml.safe_load(response.text)
    inverse_predicates = set()
    for predicate in qedge.predicate:
        # Biolink slot names are in 'subclass of' format, while predicates look like 'biolink:subclass_of'
        english_predicate = predicate.split(":")[-1].replace("_", " ")
        biolink_predicate_info = biolink_model["slots"].get(english_predicate)
        if biolink_predicate_info and "inverse" in biolink_predicate_info:
            english_inverse_predicate = biolink_predicate_info["inverse"]
            machine_inverse_predicate = f"biolink:{english_inverse_predicate.replace(' ', '_')}"
            inverse_predicates.add(machine_inverse_predicate)
            log.debug(f"Found inverse predicate for {predicate}: {machine_inverse_predicate}")
    qedge.predicate = list(set(qedge.predicate).union(inverse_predicates))
    return qg
def _prune_answers_to_achieve_curie_to_curie_query(kg: QGOrganizedKnowledgeGraph, output_qnode_key: str,
                                                   qg: QueryGraph) -> QGOrganizedKnowledgeGraph:
    """
    Work-around for BTE's limitation that it can only answer (node with curie)-->(non-specific node)
    queries. The non-specific query is run first; this function then discards every answer node that
    is not one of the curies requested for the 'output' qnode, along with the edges pointing at them.

    :param kg: Answer knowledge graph, organized by QG ID; pruned in place.
    :param output_qnode_key: Key of the qnode acting as the 'output' node of the query.
    :param qg: One-hop query graph the answers were produced for.
    :return: The same knowledge graph object, after pruning.
    """
    output_qnode = qg.nodes[output_qnode_key]
    qedge_key = next(iter(qg.edges))

    # Work out which 'output' answer nodes are not among the curies that were actually asked for
    wanted_curies = set(eu.convert_to_list(output_qnode.id))
    answer_nodes = kg.nodes_by_qg_id[output_qnode_key]
    unwanted_node_keys = set(answer_nodes) - wanted_curies
    for unwanted_key in unwanted_node_keys:
        answer_nodes.pop(unwanted_key)

    # Drop the edges that led to those nodes (for BTE the edge object always holds the output node ID)
    answer_edges = kg.edges_by_qg_id[qedge_key]
    orphaned_edge_keys = {edge_key for edge_key, edge in answer_edges.items()
                          if edge.object in unwanted_node_keys}
    for orphaned_key in orphaned_edge_keys:
        answer_edges.pop(orphaned_key)

    return kg
def _convert_one_hop_query_graph_to_cypher_query(self, qg: QueryGraph, enforce_directionality: bool,
                                                 log: ARAXResponse) -> str:
    """
    Assemble a single cypher query (MATCH/WHERE/WITH/RETURN) for the first qedge of a query graph.

    Side effect: a non-empty qnode category is converted to list form on the qnode itself.

    :param qg: Query graph; only its first qedge and that qedge's subject/object qnodes are used.
    :param enforce_directionality: Passed through to the qedge cypher helper.
    :param log: Response object for debug/error messages.
    :return: The cypher query string, or an empty string if anything went wrong (the error is logged).
    """
    qedge_key = next(iter(qg.edges))
    qedge = qg.edges[qedge_key]
    log.debug(f"Generating cypher for edge {qedge_key} query graph")

    try:
        subject_key = qedge.subject
        object_key = qedge.object

        # MATCH clause: (subject)-[edge]-(object)
        edge_pattern = self._get_cypher_for_query_edge(qedge_key, qg, enforce_directionality)
        subject_pattern = self._get_cypher_for_query_node(subject_key, qg)
        object_pattern = self._get_cypher_for_query_node(object_key, qg)
        match_clause = f"MATCH {subject_pattern}{edge_pattern}{object_pattern}"

        # WHERE clause: only needed for qnodes with multiple curies and/or multiple categories
        # NOTE(review): the curie list is interpolated using its Python repr - confirm curies never contain quotes
        conditions = []
        for qnode_key in (subject_key, object_key):
            qnode = qg.nodes[qnode_key]
            if qnode.id and isinstance(qnode.id, list) and len(qnode.id) > 1:
                conditions.append(f"{qnode_key}.id in {qnode.id}")
            if not qnode.category:
                continue
            qnode.category = eu.convert_to_list(qnode.category)
            if len(qnode.category) > 1:
                # Produces something like '(n00:`biolink:Disease` OR n00:`biolink:PhenotypicFeature`)'
                label_checks = " OR ".join(f"{qnode_key}:`{category}`" for category in qnode.category)
                conditions.append(f"({label_checks})")
        if conditions:
            where_clause = "WHERE " + " AND ".join(conditions)
        else:
            where_clause = ""

        # WITH clause: collect distinct nodes/edges into one column per QG ID
        subject_column = f"nodes_{subject_key}"
        object_column = f"nodes_{object_key}"
        edge_column = f"edges_{qedge_key}"
        # Each edge is returned with its ID plus a record of which of its nodes fulfils which qnode
        edge_projection = f"{{.*, id:ID({qedge_key}), {subject_key}:{subject_key}.id, {object_key}:{object_key}.id}}"
        with_clause = (f"WITH collect(distinct {subject_key}) as {subject_column}, "
                       f"collect(distinct {object_key}) as {object_column}, "
                       f"collect(distinct {qedge_key}{edge_projection}) as {edge_column}")

        # RETURN clause
        return_clause = f"RETURN {subject_column}, {object_column}, {edge_column}"

        return " ".join([match_clause, where_clause, with_clause, return_clause])
    except Exception as error:
        log.error(f"Problem generating cypher for query. {traceback.format_exc()}",
                  error_code=type(error).__name__)
        return ""
def _add_answers_to_kg(self, answer_kg: QGOrganizedKnowledgeGraph, reasoner_std_response: Dict[str, any],
                       input_qnode_key: str, output_qnode_key: str, qedge_key: str,
                       log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
    """
    Copy the nodes and edges from a BTE response into the answer knowledge graph.

    BTE's own QG IDs ('n0', 'n1', 'e1') are mapped back onto the caller's qnode/qedge keys. Output nodes
    are re-keyed to their preferred equivalent curie, and edges are re-pointed at the new keys. If a
    node or edge is bound to an unexpected BTE QG ID, an error is logged and whatever has been added
    so far is returned.

    :param answer_kg: Knowledge graph (organized by QG ID) that answers are added to.
    :param reasoner_std_response: BTE response; its 'results' and 'knowledge_graph' entries are read.
    :param input_qnode_key: Our key for the qnode BTE calls 'n0'.
    :param output_qnode_key: Our key for the qnode BTE calls 'n1'.
    :param qedge_key: Our key for the qedge BTE calls 'e1'.
    :param log: Response object for debug/error messages.
    :return: The same answer_kg object.
    """
    kg_to_qg_ids_dict = self._build_kg_to_qg_id_dict(reasoner_std_response['results'])
    bte_kg = reasoner_std_response['knowledge_graph']
    bte_edges = bte_kg['edges']
    if not bte_edges:
        return answer_kg

    preferred_key_by_bte_key = dict()
    log.debug(f"Got results back from BTE for this query ({len(bte_edges)} edges)")

    for bte_node in bte_kg['nodes']:
        swagger_node = Node()
        bte_node_key = bte_node.get('id')
        swagger_node.name = bte_node.get('name')
        snake_case_type = eu.convert_string_to_snake_case(bte_node.get('type'))
        swagger_node.category = eu.convert_to_list(snake_case_type)

        # Translate BTE's qg_id for this node into the matching qnode_key from our query graph
        bte_qg_id = kg_to_qg_ids_dict['nodes'].get(bte_node_key)
        if bte_qg_id == "n0":
            qnode_key = input_qnode_key
        elif bte_qg_id == "n1":
            qnode_key = output_qnode_key
        else:
            log.error("Could not map BTE qg_id to ARAX qnode_key", error_code="UnknownQGID")
            return answer_kg

        # Only output nodes get re-keyed to their preferred equivalent identifier
        if qnode_key != output_qnode_key:
            swagger_node_key = bte_node_key
        elif bte_node_key in preferred_key_by_bte_key:
            swagger_node_key = preferred_key_by_bte_key.get(bte_node_key)
        else:
            equivalent_curies = []
            for prefix, local_ids in bte_node.get('equivalent_identifiers').items():
                for local_id in local_ids:
                    equivalent_curies.append(f"{prefix}:{eu.get_curie_local_id(local_id)}")
            swagger_node_key = self._get_best_equivalent_bte_curie(equivalent_curies, swagger_node.category[0])
            preferred_key_by_bte_key[bte_node_key] = swagger_node_key

        answer_kg.add_node(swagger_node_key, swagger_node, qnode_key)

    for bte_edge in bte_edges:
        swagger_edge = Edge()
        swagger_edge_key = bte_edge.get("id")
        swagger_edge.predicate = bte_edge.get('type')
        # Point the edge at the re-keyed node when there is one, otherwise at BTE's original node key
        source_id = bte_edge.get('source_id')
        target_id = bte_edge.get('target_id')
        swagger_edge.subject = preferred_key_by_bte_key.get(source_id, source_id)
        swagger_edge.object = preferred_key_by_bte_key.get(target_id, target_id)
        provided_by_attribute = Attribute(name="provided_by",
                                          value=bte_edge.get('edge_source'),
                                          type=eu.get_attribute_type("provided_by"))
        is_defined_by_attribute = Attribute(name="is_defined_by",
                                            value="BTE",
                                            type=eu.get_attribute_type("is_defined_by"))
        swagger_edge.attributes = [provided_by_attribute, is_defined_by_attribute]

        # Translate BTE's qg_id for this edge into the qedge_key from our query graph
        bte_qg_id = kg_to_qg_ids_dict['edges'].get(swagger_edge_key)
        if bte_qg_id != "e1":
            log.error("Could not map BTE qg_id to ARAX qedge_key", error_code="UnknownQGID")
            return answer_kg

        answer_kg.add_edge(swagger_edge_key, swagger_edge, qedge_key)

    return answer_kg
def _convert_kg1_node_to_swagger_node(self, neo4j_node: Dict[str, any]) -> Tuple[str, Node]:
    """
    Build a swagger Node from a KG1 neo4j node record.

    :param neo4j_node: Node properties as a dict; 'id', 'name' and 'category' are read directly, and
                       'symbol', 'description' and 'uri' are handed to the attribute-creation helper.
    :return: Tuple of (node key, Node object).
    """
    attribute_property_names = ["symbol", "description", "uri"]

    swagger_node = Node()
    swagger_node.name = neo4j_node.get('name')
    swagger_node.category = eu.convert_to_list(neo4j_node.get('category'))
    swagger_node.attributes = self._create_swagger_attributes(attribute_property_names, neo4j_node)
    return neo4j_node.get('id'), swagger_node
def _get_supported_prefixes(self, categories: List[str], kp: str) -> Set[str]:
    """
    Collect the curie prefixes a KP supports for the given categories and their descendants.

    :param categories: Category or categories to look up (converted to list form first).
    :param kp: Key of the KP in self.meta_map.
    :return: Set of upper-cased prefixes listed under the KP's "prefixes" entry for those categories.
    """
    biolink_helper = BiolinkHelper()
    expanded_categories = biolink_helper.get_descendants(eu.convert_to_list(categories), include_mixins=False)

    supported_prefixes = set()
    for category in expanded_categories:
        # Categories the KP lists no prefixes for simply contribute nothing
        for prefix in self.meta_map[kp]["prefixes"].get(category, set()):
            supported_prefixes.add(prefix.upper())
    return supported_prefixes
def _build_kg_to_qg_id_dict(results: Dict[str, any]) -> Dict[str, Dict[str, List[str]]]:
    """
    Index a results object's bindings by knowledge graph ID.

    :param results: Dict with 'node_bindings' and 'edge_bindings' entries, each an iterable of bindings
                    that carry a 'kg_id' and a 'qg_id'.
    :return: {'nodes': {kg node id: qg_id}, 'edges': {kg edge id: qg_id}}. An edge binding's 'kg_id' is
             converted to a list first, and each of its IDs is mapped to that binding's qg_id.
    """
    qg_id_by_node_key = {binding['kg_id']: binding['qg_id'] for binding in results['node_bindings']}

    qg_id_by_edge_key = dict()
    for binding in results['edge_bindings']:
        kg_edge_ids = eu.convert_to_list(binding['kg_id'])
        bound_qg_id = binding['qg_id']
        qg_id_by_edge_key.update(dict.fromkeys(kg_edge_ids, bound_qg_id))

    return {'nodes': qg_id_by_node_key, 'edges': qg_id_by_edge_key}
def get_desirable_equivalent_curies(self, curies: List[str], categories: Optional[List[str]], kp: str) -> List[str]:
    """
    Swap each input curie for equivalent curie(s) whose prefix the KP supports.

    If the KP has no meta info or no prefix info, the input curies are returned as they are. Otherwise,
    for each input curie, equivalents with the same prefix as the input are preferred; failing that,
    the equivalents for some other supported prefix are used. Input curies with no supported
    equivalent are left out of the result (a warning lists them).

    :param curies: Curies to convert.
    :param categories: Categories used to determine which prefixes the KP supports.
    :param kp: Key of the KP in self.meta_map.
    :return: List of curies to send to the KP.
    """
    self.log.debug(f"{kp}: Converting curies in the QG to kinds that {kp} can answer")

    # Without prefix info for this KP there is nothing to convert against
    if not self.meta_map.get(kp):
        self.log.warning(f"{kp}: Somehow missing meta info for {kp}. Cannot do curie prefix conversion; will send "
                         f"curies as they are.")
        return curies
    if not self.meta_map[kp].get("prefixes"):
        self.log.warning(f"{kp}: No supported prefix info is available for {kp}. Will send curies as they are.")
        return curies

    supported_prefixes = self._get_supported_prefixes(eu.convert_to_list(categories), kp)
    self.log.debug(f"{kp}: Prefixes {kp} supports for categories {categories} (and descendants) are: "
                   f"{supported_prefixes}")

    curies_for_kp = set()
    curies_without_match = set()
    for input_curie, equivalent_curies in eu.get_curie_synonyms_dict(curies).items():
        input_prefix = self._get_uppercase_prefix(input_curie)

        # Bucket this curie's supported equivalents by their prefix
        supported_by_prefix = defaultdict(set)
        for equivalent_curie in equivalent_curies:
            equivalent_prefix = self._get_uppercase_prefix(equivalent_curie)
            if equivalent_prefix in supported_prefixes:
                supported_by_prefix[equivalent_prefix].add(equivalent_curie)

        if not supported_by_prefix:
            curies_without_match.add(input_curie)
        elif input_prefix in supported_by_prefix:
            # Stick with the input curie's own prefix when the KP supports it
            curies_for_kp.update(supported_by_prefix[input_prefix])
        else:
            # Otherwise take the curies for whichever supported prefix comes first
            curies_for_kp.update(next(iter(supported_by_prefix.values())))

    if curies_without_match:
        self.log.warning(f"{kp}: Could not find curies with prefixes {kp} prefers for these curies: "
                         f"{curies_without_match}; will not send these to KP")
    return list(curies_for_kp)
def _convert_kg2c_node_to_trapi_node(self, neo4j_node: Dict[str, any]) -> Tuple[str, Node]:
    """
    Build a TRAPI Node from a KG2c neo4j node record.

    :param neo4j_node: Node properties as a dict; 'id', 'name' and 'category' are read directly, and the
                       remaining listed properties are handed to the attribute-creation helper.
    :return: Tuple of (node key, Node object).
    """
    # Additional KG2c node properties that get turned into TRAPI Attribute objects
    attribute_property_names = [
        "iri",
        "description",
        "all_names",
        "all_categories",
        "expanded_categories",
        "equivalent_curies",
        "publications",
    ]

    trapi_node = Node()
    trapi_node.name = neo4j_node.get('name')
    trapi_node.category = eu.convert_to_list(neo4j_node.get('category'))
    trapi_node.attributes = self._create_trapi_attributes(attribute_property_names, neo4j_node)
    return neo4j_node.get('id'), trapi_node
def _convert_kg2_node_to_trapi_node(self, neo4j_node: Dict[str, any]) -> Tuple[str, Node]:
    """
    Build a TRAPI Node from a KG2 neo4j node record.

    :param neo4j_node: Node properties as a dict; 'id', 'name' and 'category' are read directly, and the
                       remaining listed properties are handed to the attribute-creation helper.
    :return: Tuple of (node key, Node object).
    """
    # Additional KG2 node properties that get turned into TRAPI Attribute objects
    attribute_property_names = [
        "iri",
        "full_name",
        "description",
        "publications",
        "synonym",
        "provided_by",
        "deprecated",
        "update_date",
    ]

    trapi_node = Node()
    trapi_node.name = neo4j_node.get('name')
    trapi_node.category = eu.convert_to_list(neo4j_node.get('category'))
    trapi_node.attributes = self._create_trapi_attributes(attribute_property_names, neo4j_node)
    return neo4j_node.get('id'), trapi_node
def _convert_kg2_node_to_swagger_node(self, neo4j_node: Dict[str, any]) -> Tuple[str, Node]:
    """
    Build a swagger Node from a KG2 neo4j node record.

    The node's category is taken from the record's 'category_label' property; the record's 'category'
    property is passed along with the other properties to the attribute-creation helper.

    :param neo4j_node: Node properties as a dict.
    :return: Tuple of (node key, Node object).
    """
    # Additional KG2 node properties that get turned into swagger Attribute objects
    attribute_property_names = [
        "full_name",
        "description",
        "iri",
        "publications",
        "synonym",
        "category",
        "provided_by",
        "deprecated",
        "update_date",
    ]

    swagger_node = Node()
    swagger_node.name = neo4j_node.get('name')
    swagger_node.category = eu.convert_to_list(neo4j_node.get('category_label'))
    swagger_node.attributes = self._create_swagger_attributes(attribute_property_names, neo4j_node)
    return neo4j_node.get('id'), swagger_node
def make_qg_use_supported_prefixes(self, qg: QueryGraph, kp_name: str, log: ARAXResponse) -> Optional[QueryGraph]:
    """
    Rewrite the curies on each qnode so that they use prefixes the given KP supports.

    For "infores:rtx-kg2" the curies are simply replaced with their canonical curies. For any other KP,
    curies are converted only when at least one of them uses a prefix the KP does not support.

    NOTE(review): messages go to both self.log and the log parameter, exactly as before - confirm
    whether these are meant to be the same log.

    :param qg: Query graph; qnode ids are modified in place.
    :param kp_name: Name of the KP the query graph is being tailored for.
    :param log: Response object for this call's messages.
    :return: The same query graph, or None if some qnode ends up with no curies the KP supports.
    """
    for qnode_key, qnode in qg.nodes.items():
        if not qnode.ids:
            continue

        if kp_name == "infores:rtx-kg2":
            # Just convert them into canonical curies
            qnode.ids = eu.get_canonical_curies_list(qnode.ids, log)
            continue

        # Otherwise figure out which kind of curies the KP wants
        categories = eu.convert_to_list(qnode.categories)
        supported_prefixes = self._get_supported_prefixes(categories, kp_name)
        used_prefixes = {self._get_uppercase_prefix(curie) for curie in qnode.ids}

        # Conversion is only needed if some curie uses an unsupported prefix
        if used_prefixes.issubset(supported_prefixes):
            self.log.debug(f"{kp_name}: All {qnode_key} curies use prefix(es) {kp_name} supports; no "
                           f"conversion necessary")
            continue

        self.log.debug(f"{kp_name}: One or more {qnode_key} curies use a prefix {kp_name} doesn't "
                       f"support; will convert these")
        converted_curies = self.get_desirable_equivalent_curies(qnode.ids, qnode.categories, kp_name)
        if not converted_curies:
            log.info(f"{kp_name} cannot answer the query because no equivalent curies were found "
                     f"with prefixes it supports for qnode {qnode_key}. Original curies were: "
                     f"{qnode.ids}")
            return None

        log.debug(f"{kp_name}: Converted {qnode_key}'s {len(qnode.ids)} curies to a list of "
                  f"{len(converted_curies)} curies tailored for {kp_name}")
        qnode.ids = converted_curies

    return qg
def _validate_and_pre_process_input(qg: QueryGraph, valid_bte_inputs_dict: Dict[str, Set[str]],
                                    enforce_directionality: bool, use_synonyms: bool,
                                    log: ARAXResponse) -> Tuple[str, str]:
    """
    Check that a query graph is something BTE can answer and massage it into the form BTE expects.

    The query graph is modified in place: unsupported predicates are dropped from the qedge, qnode
    categories are converted to pascal case, curies are (optionally) replaced by their synonyms, and the
    input qnode's curies are converted to BTE format.

    :param qg: Query graph to validate; must have exactly one edge and two nodes.
    :param valid_bte_inputs_dict: Allowed values, under the keys 'predicates' and 'node_categories'.
    :param enforce_directionality: If True the qedge's subject is the input node and its object the
                                   output node; otherwise the first qnode with a curie is the input.
    :param use_synonyms: Whether to swap each qnode's curies for their synonyms.
    :param log: Response object for warning/error messages.
    :return: Tuple of (input qnode key, output qnode key), or ("", "") if validation failed (in which
             case an error has been logged).
    """
    # Make sure we have a valid one-hop query graph
    if len(qg.edges) != 1 or len(qg.nodes) != 2:
        log.error(f"BTE can only accept one-hop query graphs (your QG has {len(qg.nodes)} nodes and "
                  f"{len(qg.edges)} edges)", error_code="InvalidQueryGraph")
        return "", ""
    qedge_key = next(iter(qg.edges))
    qedge = qg.edges[qedge_key]

    # Make sure at least one of our qnodes has a curie
    if not any(qnode.id for qnode in qg.nodes.values()):
        log.error(f"Neither qnode for qedge {qedge_key} has a curie specified. BTE requires that at least one of "
                  f"them has a curie. Your query graph is: {qg.to_dict()}", error_code="UnsupportedQueryForKP")
        return "", ""

    # Figure out which query node is input vs. output
    if enforce_directionality:
        input_qnode_key = qedge.subject
        output_qnode_key = qedge.object
    else:
        input_qnode_key = next(qnode_key for qnode_key, qnode in qg.nodes.items() if qnode.id)
        output_qnode_key = next(qnode_key for qnode_key in qg.nodes if qnode_key != input_qnode_key)
        log.warning(f"BTE cannot do bidirectional queries; the query for this edge will be directed, going: "
                    f"{input_qnode_key}-->{output_qnode_key}")
    input_qnode = qg.nodes[input_qnode_key]
    output_qnode = qg.nodes[output_qnode_key]

    # Make sure predicate is allowed
    if qedge.predicate:
        # Normalize to a list first: calling set() on a bare string predicate would split it into characters
        requested_predicates = set(eu.convert_to_list(qedge.predicate))
        accepted_predicates = requested_predicates.intersection(valid_bte_inputs_dict['predicates'])
        unaccepted_predicates = requested_predicates.difference(accepted_predicates)
        if not accepted_predicates:
            # Throw an error if none of the predicates are supported
            log.error(f"BTE does not accept predicate(s) {qedge.predicate}. Valid options are "
                      f"{valid_bte_inputs_dict['predicates']}", error_code="UnsupportedQueryForKP")
            return "", ""
        elif unaccepted_predicates:
            # Give a warning if only some of the predicates are supported
            log.warning(f"Some of qedge {qedge_key}'s predicates are not accepted by BTE: {unaccepted_predicates}."
                        f" Valid options are: {valid_bte_inputs_dict['predicates']}")
        qedge.predicate = list(accepted_predicates)

    # Process qnode types (convert to preferred format, make sure allowed)
    input_qnode.category = [eu.convert_string_to_pascal_case(node_category)
                            for node_category in eu.convert_to_list(input_qnode.category)]
    output_qnode.category = [eu.convert_string_to_pascal_case(node_category)
                             for node_category in eu.convert_to_list(output_qnode.category)]
    qnodes_missing_type = [qnode_key for qnode_key in [input_qnode_key, output_qnode_key]
                           if not qg.nodes[qnode_key].category]
    if qnodes_missing_type:
        log.error(f"BTE requires every query node to have a category. QNode(s) missing a category: "
                  f"{', '.join(qnodes_missing_type)}", error_code="InvalidInput")
        return "", ""
    invalid_qnode_categories = [node_category for qnode in [input_qnode, output_qnode]
                                for node_category in qnode.category
                                if node_category not in valid_bte_inputs_dict['node_categories']]
    if invalid_qnode_categories:
        log.error(f"BTE does not accept QNode category(s): {', '.join(invalid_qnode_categories)}. Valid options are "
                  f"{valid_bte_inputs_dict['node_categories']}", error_code="InvalidInput")
        return "", ""

    # Sub in curie synonyms as appropriate
    if use_synonyms:
        for qnode in [input_qnode, output_qnode]:
            if qnode.id:
                qnode.id = eu.get_curie_synonyms(qnode.id, log)

    # Make sure our input node curies are in list form and use prefixes BTE prefers
    input_curie_list = eu.convert_to_list(input_qnode.id)
    input_qnode.id = [eu.convert_curie_to_bte_format(curie) for curie in input_curie_list]

    return input_qnode_key, output_qnode_key
def _convert_kg2c_plover_node_to_trapi_node(node_tuple: list) -> Node:
    """
    Build a TRAPI Node from a KG2c Plover node record.

    :param node_tuple: Sequence whose first item is used as the node's name and whose second item is
                       used (in list form) as its categories.
    :return: The new Node object.
    """
    node_name = node_tuple[0]
    node_categories = eu.convert_to_list(node_tuple[1])
    return Node(name=node_name, categories=node_categories)