Ejemplo n.º 1
0
    def _convert_kg2_node_to_swagger_node(self, neo4j_node):
        """
        Build a swagger Node out of a KG2 neo4j node.

        :param neo4j_node: mapping of KG2 node properties (only read via '.get()').
        :return: a new Node whose 'node_attributes' holds whatever
                 self._create_swagger_attributes() produces for the extra KG2 properties.
        """
        swagger_node = Node()

        # Plain property copies (KG2 keeps the URI under the 'iri' property)
        for node_field, kg2_property in (('id', 'id'), ('name', 'name'),
                                         ('description', 'description'),
                                         ('uri', 'iri')):
            setattr(swagger_node, node_field, neo4j_node.get(kg2_property))
        swagger_node.node_attributes = []

        # The node type is always stored in list form
        swagger_node.type = eu.convert_string_or_list_to_list(
            neo4j_node.get('category_label'))

        # For UniProt-prefixed nodes with no symbol yet, the KG2 'name' becomes the symbol and
        # 'full_name' becomes the name
        if swagger_node.symbol is None:
            if swagger_node.id.lower().startswith("uniprot"):
                swagger_node.symbol = neo4j_node.get('name')
                swagger_node.name = neo4j_node.get('full_name')

        # Every remaining KG2 property of interest is attached as a node attribute
        extra_property_names = [
            'publications',
            'synonym',
            'category',
            'provided_by',
            'deprecated',
            'update_date',
        ]
        extra_attributes = self._create_swagger_attributes(
            "node", extra_property_names, neo4j_node)
        swagger_node.node_attributes += extra_attributes

        return swagger_node
Ejemplo n.º 2
0
    def _prune_answers_to_achieve_curie_to_curie_query(
            kg: DictKnowledgeGraph, output_qnode: QNode,
            qedge: QEdge) -> DictKnowledgeGraph:
        """
        Work around BTE only supporting (node with curie)-->(non-specific node) queries: after running the
        non-specific query, drop every answer node for the 'output' qnode whose ID is not one of the curies
        that were actually requested, along with the answer edges pointing at those nodes.

        :param kg: answer knowledge graph; modified in place and also returned.
        :param output_qnode: query node whose 'curie' (string or list) lists the wanted output node IDs.
        :param qedge: query edge whose answer edges are filtered.
        :return: the same 'kg' object, after pruning.
        """
        # Work out which 'output' answer nodes were not asked for, and discard them
        wanted_curies = set(
            eu.convert_string_or_list_to_list(output_qnode.curie))
        output_nodes = kg.nodes_by_qg_id[output_qnode.id]
        unwanted_node_ids = set(output_nodes).difference(wanted_curies)
        for unwanted_node_id in unwanted_node_ids:
            output_nodes.pop(unwanted_node_id)

        # Discard the edges that led to them (for BTE, target_id always holds the output node ID)
        answer_edges = kg.edges_by_qg_id[qedge.id]
        orphaned_edge_ids = {
            edge_id
            for edge_id, edge in answer_edges.items()
            if edge.target_id in unwanted_node_ids
        }
        for orphaned_edge_id in orphaned_edge_ids:
            answer_edges.pop(orphaned_edge_id)

        return kg
Ejemplo n.º 3
0
 def _override_qnode_types_as_needed(self,
                                     query_graph: QueryGraph) -> QueryGraph:
     """
     Replace each qnode's categories with the KP-specific overrides, where one exists.

     Every qnode's 'category' is rewritten in place as a de-duplicated list; categories without an
     entry in self.node_category_overrides_for_kp are kept unchanged.

     :param query_graph: query graph whose 'nodes' mapping is updated in place.
     :return: the same query graph object.
     """
     for qnode in query_graph.nodes.values():
         replacement_categories = set()
         for original_category in eu.convert_string_or_list_to_list(
                 qnode.category):
             replacement_categories.add(
                 self.node_category_overrides_for_kp.get(
                     original_category, original_category))
         qnode.category = list(replacement_categories)
     return query_graph
Ejemplo n.º 4
0
 def _build_kg_to_qg_id_dict(results):
     kg_to_qg_ids = {'nodes': dict(), 'edges': dict()}
     for node_binding in results['node_bindings']:
         node_id = node_binding['kg_id']
         qnode_id = node_binding['qg_id']
         kg_to_qg_ids['nodes'][node_id] = qnode_id
     for edge_binding in results['edge_bindings']:
         edge_ids = eu.convert_string_or_list_to_list(edge_binding['kg_id'])
         qedge_ids = edge_binding['qg_id']
         for kg_id in edge_ids:
             kg_to_qg_ids['edges'][kg_id] = qedge_ids
     return kg_to_qg_ids
Ejemplo n.º 5
0
    def _convert_kg1_node_to_swagger_node(neo4j_node: Dict[str, any]) -> Node:
        """
        Build a swagger Node out of a KG1 neo4j node.

        :param neo4j_node: mapping of KG1 node properties (only read via '.get()').
        :return: a new Node with an empty 'node_attributes' list and its 'type' in list form.
        """
        swagger_node = Node()

        # These Node fields share their name with the KG1 property they are copied from
        for shared_name in ('id', 'name', 'symbol', 'description', 'uri'):
            setattr(swagger_node, shared_name, neo4j_node.get(shared_name))
        swagger_node.node_attributes = []

        swagger_node.type = eu.convert_string_or_list_to_list(
            neo4j_node.get('category'))

        return swagger_node
Ejemplo n.º 6
0
 def _pre_process_query_graph(self, query_graph: QueryGraph,
                              log: ARAXResponse) -> QueryGraph:
     """
     Adapt a query graph to what this KP accepts, modifying its qnodes in place.

     For every qnode: its categories are mapped through self.node_category_overrides_for_kp and
     narrowed to a single category accepted by the KP (an error is logged and processing stops if
     none is accepted); then, if the qnode has an id, it is replaced by the synonym curie(s) using
     the KP's preferred prefix for that category, when any exist.

     :param query_graph: query graph whose 'nodes' mapping is updated in place.
     :param log: response object used for error/warning/debug messages.
     :return: the same query graph object (possibly only partially processed if an error was logged).
     """
     for qnode_key, qnode in query_graph.nodes.items():
         # Translate the categories into the KP's vocabulary and keep only the ones it accepts
         formatted_qnode_categories = set()
         for qnode_category in eu.convert_string_or_list_to_list(
                 qnode.category):
             formatted_qnode_categories.add(
                 self.node_category_overrides_for_kp.get(
                     qnode_category, qnode_category))
         usable_categories = formatted_qnode_categories.intersection(
             self.accepted_node_categories)
         if not usable_categories:
             log.error(
                 f"{self.kp_name} can only be used for queries involving {self.accepted_node_categories} "
                 f"and QNode {qnode_key} has category '{qnode.category}'",
                 error_code="UnsupportedQueryForKP")
             return query_graph
         # Only a single category is kept (the first one the set yields)
         qnode.category = next(iter(usable_categories))

         # Nothing further to do for qnodes without a curie
         if not qnode.id:
             continue

         # Swap the curie(s) for equivalent ones carrying the prefix the KP prefers for this category
         synonym_curies = eu.get_curie_synonyms(qnode.id, log)
         preferred_prefix = self.kp_preferred_prefixes[qnode.category]
         curie_start = f"{preferred_prefix}:"
         matching_curies = [
             curie for curie in synonym_curies
             if curie.startswith(curie_start)
         ]
         if not matching_curies:
             log.warning(
                 f"Could not convert qnode {qnode_key} curie(s) to preferred prefix ({preferred_prefix})"
             )
             continue
         # A lone match is stored as a plain string rather than a one-item list
         qnode.id = matching_curies[0] if len(
             matching_curies) == 1 else matching_curies
         log.debug(f"Converted qnode {qnode_key} curie to {qnode.id}")
     return query_graph
Ejemplo n.º 7
0
    def apply(self, input_message, input_parameters, response=None):
        """
        Run Expand on a message: fill its knowledge graph with answers for the requested query
        edges and/or query nodes.

        :param input_message: message whose 'query_graph' is read and whose 'knowledge_graph' is replaced.
        :param input_parameters: dict of user-supplied parameters; keys not already present in
                                 self.parameters are rejected, and "true"/"false" strings (any case)
                                 are turned into booleans.
        :param response: response object to log into; a fresh Response is created when None.
        :return: the response object; callers should inspect its 'status' ('OK' on success).
        """
        response = Response() if response is None else response
        self.response = response
        self.message = input_message

        # Basic checks on arguments
        if not isinstance(input_parameters, dict):
            response.error("Provided parameters is not a dict",
                           error_code="ParametersNotDict")
            return response

        # Start from the allowed parameters with their defaults, then layer the user's values on top
        parameters = self.parameters
        parameters.update({
            'kp': "ARAX/KG1",
            'enforce_directionality': False,
            'use_synonyms': True,
            'synonym_handling': 'map_back',
            'continue_if_no_results': False,
        })
        boolean_words = {"true": True, "false": False}
        for key, value in input_parameters.items():
            if not key or key in parameters:
                if type(value) is str:
                    value = boolean_words.get(value.lower(), value)
                parameters[key] = value
            else:
                response.error(f"Supplied parameter {key} is not permitted",
                               error_code="UnknownParameter")

        # With nothing specified, expand the whole query graph: all edges plus any orphan nodes
        if not (parameters['edge_id'] or parameters['node_id']):
            parameters['edge_id'] = [
                qedge.id for qedge in self.message.query_graph.edges
            ]
            parameters['node_id'] = self._get_orphan_query_node_ids(
                self.message.query_graph)

        if response.status != 'OK':
            return response

        response.data['parameters'] = parameters
        self.parameters = parameters

        # Do the actual expansion
        response.debug(
            f"Applying Expand to Message with parameters {parameters}")
        input_edge_ids = eu.convert_string_or_list_to_list(
            parameters['edge_id'])
        input_node_ids = eu.convert_string_or_list_to_list(
            parameters['node_id'])
        kp_to_use = self.parameters['kp']
        continue_if_no_results = self.parameters['continue_if_no_results']
        full_query_graph = self.message.query_graph

        # Work on a dictionary version of the knowledge graph (faster processing)
        dict_kg = eu.convert_standard_kg_to_dict_kg(
            self.message.knowledge_graph)

        # Expand any specified edges
        if input_edge_ids:
            query_sub_graph = self._extract_query_subgraph(
                input_edge_ids, full_query_graph)
            if response.status != 'OK':
                return response
            self.response.debug(
                f"Query graph for this Expand() call is: {query_sub_graph.to_dict()}"
            )

            # Go one edge at a time (much faster for neo4j queries, and allows easy integration with BTE)
            node_usages_by_edges_map = dict()
            for qedge in self._get_order_to_expand_edges_in(query_sub_graph):
                answer_kg, edge_node_usage_map = self._expand_edge(
                    qedge, kp_to_use, dict_kg, continue_if_no_results,
                    full_query_graph)
                if response.status != 'OK':
                    return response
                node_usages_by_edges_map[qedge.id] = edge_node_usage_map

                self._process_and_merge_answer(answer_kg, dict_kg)
                if response.status != 'OK':
                    return response

                self._prune_dead_end_paths(dict_kg, query_sub_graph,
                                           node_usages_by_edges_map)
                if response.status != 'OK':
                    return response

        # Expand any specified nodes
        for qnode_id in input_node_ids or []:
            answer_kg = self._expand_node(qnode_id, kp_to_use,
                                          continue_if_no_results,
                                          full_query_graph)
            if response.status != 'OK':
                return response

            self._process_and_merge_answer(answer_kg, dict_kg)
            if response.status != 'OK':
                return response

        # Put the knowledge graph back into API standard format and report its size
        kg = eu.convert_dict_kg_to_standard_kg(dict_kg)
        self.message.knowledge_graph = kg
        response.info(
            f"After Expand, Message.KnowledgeGraph has {len(kg.nodes)} nodes and {len(kg.edges)} edges"
        )
        return response
Ejemplo n.º 8
0
    def _deduplicate_nodes(
            dict_kg: DictKnowledgeGraph,
            edge_to_nodes_map: Dict[str, Dict[str, str]], log: Response
    ) -> Tuple[DictKnowledgeGraph, Dict[str, Dict[str, str]]]:
        """
        Collapse answer nodes onto their preferred curies and rewire the edges accordingly.

        Node and edge objects from 'dict_kg' are modified in place (ids, names, types, source/target
        ids) and re-added to a new DictKnowledgeGraph.

        :param dict_kg: answer knowledge graph, organized by query graph ID.
        :param edge_to_nodes_map: for each edge ID, a map of qnode ID to the node ID fulfilling it.
        :param log: response object used for logging; a non-'OK' status aborts the deduplication.
        :return: (deduplicated knowledge graph, edge-to-nodes map expressed in preferred curies). On
                 error, both are returned in whatever partially-filled state they had reached.
        """
        log.debug("Deduplicating nodes")
        deduplicated_kg = DictKnowledgeGraph(
            nodes={qnode_id: dict() for qnode_id in dict_kg.nodes_by_qg_id},
            edges={qedge_id: dict() for qedge_id in dict_kg.edges_by_qg_id})
        updated_edge_to_nodes_map = {
            edge_id: dict() for edge_id in edge_to_nodes_map
        }
        curie_mappings = dict()  # Original node ID -> preferred curie

        # Pass 1: deduplicate the nodes
        for qnode_id, nodes in dict_kg.nodes_by_qg_id.items():
            # Only nodes without a mapping yet are looked up (and processed) here
            # NOTE(review): a node ID already mapped under an earlier qnode is skipped entirely for
            # this qnode, so it is not added under this qnode_id - confirm that is intended
            unmapped_node_ids = set(nodes).difference(set(curie_mappings))
            log.debug(
                f"Getting preferred curies for {qnode_id} nodes returned in this step"
            )
            if unmapped_node_ids:
                canonicalized_nodes = eu.get_preferred_curies(
                    list(unmapped_node_ids), log)
            else:
                canonicalized_nodes = dict()
            if log.status != 'OK':
                return deduplicated_kg, updated_edge_to_nodes_map

            for node_id in unmapped_node_ids:
                node = nodes.get(node_id)
                canonical_info = canonicalized_nodes.get(node_id)
                if not canonical_info:
                    # The NodeSynonymizer didn't recognize this curie, so the node keeps its own info
                    preferred_curie = node_id
                    preferred_name = node.name
                    preferred_type = node.type
                else:
                    preferred_curie = canonical_info.get(
                        'preferred_curie', node_id)
                    preferred_name = canonical_info.get(
                        'preferred_name', node.name)
                    preferred_type = eu.convert_string_or_list_to_list(
                        canonical_info.get('preferred_type', node.type))
                curie_mappings[node_id] = preferred_curie

                # Only the first node seen for a preferred curie is kept # TODO: merge certain fields, like uri?
                if preferred_curie in deduplicated_kg.nodes_by_qg_id[qnode_id]:
                    continue
                node.id = preferred_curie
                node.name = preferred_name
                node.type = preferred_type
                deduplicated_kg.add_node(node, qnode_id)

        # Pass 2: point the edges at the preferred curies
        for qedge_id, edges in dict_kg.edges_by_qg_id.items():
            for edge_id, edge in edges.items():
                edge.source_id = curie_mappings.get(edge.source_id)
                edge.target_id = curie_mappings.get(edge.target_id)
                if not (edge.source_id and edge.target_id):
                    log.error(
                        f"Could not find preferred curie mappings for edge {edge_id}'s node(s)"
                    )
                    return deduplicated_kg, updated_edge_to_nodes_map
                deduplicated_kg.add_edge(edge, qedge_id)

                # Keep the edge-to-node map in sync (used down the line for pruning)
                updated_edge_to_nodes_map[edge_id].update({
                    qnode_id: curie_mappings.get(corresponding_node_id)
                    for qnode_id, corresponding_node_id in
                    edge_to_nodes_map[edge_id].items()
                })

        log.debug(
            f"After deduplication, answer KG counts are: {eu.get_printable_counts_by_qg_id(deduplicated_kg)}"
        )
        return deduplicated_kg, updated_edge_to_nodes_map
Ejemplo n.º 9
0
 def _send_query_to_kp(self, query_graph: QueryGraph,
                       log: ARAXResponse) -> Dict[str, any]:
     """
     Query the KP's API once per input curie and merge the answers into a single message.

     :param query_graph: single-edge query graph; only the first qedge is sent.
     :param log: response object used for debug/warning messages.
     :return: the merged KP 'message' dict ('knowledge_graph' nodes/edges and 'results' combined
              across all queries that returned results), or an empty dict if none did.
     """
     # Strip the qnodes/qedge down to only the properties the KP likes
     stripped_qnodes = dict()
     for qnode_key, qnode in query_graph.nodes.items():
         stripped_qnodes[qnode_key] = {'category': qnode.category}
         if qnode.id:
             stripped_qnodes[qnode_key]['id'] = qnode.id

     qedge_key = next(iter(query_graph.edges))  # Our query graph is single-edge
     qedge = query_graph.edges[qedge_key]
     if qedge.predicate:
         predicate = qedge.predicate
     else:
         predicate = list(self.accepted_edge_types)[0]
     stripped_qedge = {
         'subject': qedge.subject,
         'object': qedge.object,
         'predicate': predicate,
     }
     if predicate not in self.accepted_edge_types:
         log.warning(
             f"{self.kp_name} only accepts the following edge types: {self.accepted_edge_types}"
         )

     # NOTE(review): this lookup raises KeyError when the subject qnode has no id
     source_stripped_qnode = stripped_qnodes[qedge.subject]
     input_curies = eu.convert_string_or_list_to_list(
         source_stripped_qnode['id'])

     # The request body holds a reference to 'source_stripped_qnode', so setting its 'id' inside
     # the loop changes what gets sent for each query
     request_body = {
         'message': {
             'query_graph': {
                 'nodes': stripped_qnodes,
                 'edges': {
                     qedge_key: stripped_qedge
                 }
             }
         }
     }
     combined_message = dict()
     for input_curie in input_curies:  # Until we have batch querying, ping them one-by-one for each input curie
         log.debug(
             f"Sending {qedge_key} query to {self.kp_name} for {input_curie}"
         )
         source_stripped_qnode['id'] = input_curie
         # NOTE(review): no timeout is passed to requests.post
         kp_response = requests.post(self.kp_query_endpoint,
                                     json=request_body,
                                     headers={'accept': 'application/json'})
         if kp_response.status_code != 200:
             log.warning(
                 f"{self.kp_name} KP API returned response of {kp_response.status_code}: {kp_response.text}"
             )
             continue

         kp_message = kp_response.json()["message"]
         if not kp_message.get('results'):
             continue
         if not combined_message:
             # The first message with results becomes the base that later ones are merged into
             combined_message = kp_message
             continue
         for kg_section in ('nodes', 'edges'):
             combined_message['knowledge_graph'][kg_section].update(
                 kp_message['knowledge_graph'][kg_section])
         combined_message['results'] += kp_message['results']
     return combined_message
Ejemplo n.º 10
0
    def _validate_and_pre_process_input(self, query_graph, valid_bte_inputs_dict, enforce_directionality):
        # Make sure we have a valid one-hop query graph
        if len(query_graph.edges) != 1 or len(query_graph.nodes) != 2:
            self.response.error(f"BTE can only accept one-hop query graphs (your QG has {len(query_graph.nodes)} "
                                f"nodes and {len(query_graph.edges)} edges)", error_code="InvalidQueryGraph")
            return None, None, None
        qedge = query_graph.edges[0]

        # Make sure at least one of our qnodes has a curie
        qnodes_with_curies = [qnode for qnode in query_graph.nodes if qnode.curie]
        if not qnodes_with_curies:
            self.response.error(f"Neither qnode for qedge {qedge.id} has a curie specified. BTE requires that at least"
                                f" one of them has a curie. Your query graph is: {query_graph.to_dict()}")
            return None, None, None

        # Figure out which query node is input vs. output and validate which qnodes have curies
        if enforce_directionality:
            input_qnode = next(qnode for qnode in query_graph.nodes if qnode.id == qedge.source_id)
            output_qnode = next(qnode for qnode in query_graph.nodes if qnode.id == qedge.target_id)
        else:
            qnodes_with_curies = [qnode for qnode in query_graph.nodes if qnode.curie]
            input_qnode = qnodes_with_curies[0] if qnodes_with_curies else None
            output_qnode = next(qnode for qnode in query_graph.nodes if qnode.id != input_qnode.id)
        if not input_qnode.curie:
            self.response.error(f"BTE cannot expand edges with a non-specific (curie-less) source node (source node is:"
                                f" {input_qnode.to_dict()})", error_code="InvalidInput")
        elif not enforce_directionality:
            self.response.warning(f"BTE cannot do bidirectional queries; the query for this edge will be directed, "
                                  f"going: {input_qnode.id}-->{output_qnode.id}")
        if self.response.status != 'OK':
            return None, None, None

        # Make sure predicate is allowed
        if qedge.type not in valid_bte_inputs_dict['predicates'] and qedge.type is not None:
            self.response.error(f"BTE does not accept predicate '{qedge.type}'. Valid options are "
                                f"{valid_bte_inputs_dict['predicates']}", error_code="InvalidInput")
            return None, None, None

        # Process qnode types (guess one if none provided, convert to preferred format, make sure allowed)
        if not input_qnode.type:
            input_qnode.type = eu.guess_qnode_type(input_qnode.curie, self.response)
        if not output_qnode.type:
            output_qnode.type = eu.guess_qnode_type(output_qnode.curie, self.response)
        input_qnode.type = eu.convert_string_to_pascal_case(input_qnode.type)
        output_qnode.type = eu.convert_string_to_pascal_case(output_qnode.type)
        qnodes_missing_type = [qnode.id for qnode in [input_qnode, output_qnode] if not qnode.type]
        if qnodes_missing_type:
            self.response.error(f"BTE requires every query node to have a type. QNode(s) missing a type: "
                                f"{', '.join(qnodes_missing_type)}", error_code="InvalidInput")
            return None, None, None
        invalid_qnode_types = [qnode.type for qnode in [input_qnode, output_qnode] if qnode.type not in valid_bte_inputs_dict['node_types']]
        if invalid_qnode_types:
            self.response.error(f"BTE does not accept QNode type(s): {', '.join(invalid_qnode_types)}. Valid options are"
                                f" {valid_bte_inputs_dict['node_types']}", error_code="InvalidInput")
            return None, None, None

        # Make sure our input node curies are in list form and use prefixes BTE prefers
        input_curie_list = eu.convert_string_or_list_to_list(input_qnode.curie)
        input_qnode.curie = [eu.convert_curie_to_bte_format(curie) for curie in input_curie_list]

        return qedge, input_qnode, output_qnode
Ejemplo n.º 11
0
    def _add_answers_to_kg(self, answer_kg: DictKnowledgeGraph,
                           reasoner_std_response: Dict[str, any],
                           input_qnode_id: str, output_qnode_id: str,
                           qedge_id: str, log: Response) -> DictKnowledgeGraph:
        """
        Load the nodes and edges of a BTE response into the answer knowledge graph.

        BTE's own query graph IDs ("n0"/"n1" for nodes, "e1" for the edge) are translated back to
        the caller's qnode/qedge IDs, and output nodes are re-identified with the curie chosen by
        self._get_best_equivalent_bte_curie().

        :param answer_kg: knowledge graph to add to (modified in place and returned).
        :param reasoner_std_response: BTE response dict with 'results' and 'knowledge_graph' entries.
        :param input_qnode_id: our qnode ID standing for BTE's "n0".
        :param output_qnode_id: our qnode ID standing for BTE's "n1".
        :param qedge_id: our qedge ID that all returned edges are filed under.
        :param log: response object used for debug/error messages.
        :return: 'answer_kg'; if an unknown BTE qg_id is met, an error is logged and the graph is
                 returned as filled so far.
        """
        kg_to_qg_ids_dict = self._build_kg_to_qg_id_dict(
            reasoner_std_response['results'])
        bte_edges = reasoner_std_response['knowledge_graph']['edges']
        if not bte_edges:
            return answer_kg

        remapped_node_ids = dict()  # BTE node ID -> ID we assigned to that (output) node
        log.debug(
            f"Got results back from BTE for this query ({len(bte_edges)} edges)"
        )

        for bte_node in reasoner_std_response['knowledge_graph']['nodes']:
            swagger_node = Node()
            bte_node_id = bte_node.get('id')
            swagger_node.name = bte_node.get('name')
            snake_case_type = eu.convert_string_to_snake_case(
                bte_node.get('type'))
            swagger_node.type = eu.convert_string_or_list_to_list(
                snake_case_type)

            # Translate BTE's qg_id into the matching qnode_id from our query graph
            bte_qg_id = kg_to_qg_ids_dict['nodes'].get(bte_node_id)
            if bte_qg_id == "n0":
                qnode_id = input_qnode_id
            elif bte_qg_id == "n1":
                qnode_id = output_qnode_id
            else:
                log.error("Could not map BTE qg_id to ARAX qnode_id",
                          error_code="UnknownQGID")
                return answer_kg

            # Output nodes get the preferred equivalent identifier; other nodes keep BTE's ID
            if qnode_id != output_qnode_id:
                swagger_node.id = bte_node_id
            elif bte_node_id in remapped_node_ids:
                swagger_node.id = remapped_node_ids.get(bte_node_id)
            else:
                equivalent_curies = []
                for prefix, local_ids in bte_node.get(
                        'equivalent_identifiers').items():
                    for local_id in local_ids:
                        equivalent_curies.append(
                            f"{prefix}:{eu.get_curie_local_id(local_id)}")
                swagger_node.id = self._get_best_equivalent_bte_curie(
                    equivalent_curies, swagger_node.type[0])
                remapped_node_ids[bte_node_id] = swagger_node.id

            answer_kg.add_node(swagger_node, qnode_id)

        for bte_edge in bte_edges:
            bte_source_id = bte_edge.get('source_id')
            bte_target_id = bte_edge.get('target_id')

            swagger_edge = Edge()
            swagger_edge.id = bte_edge.get("id")
            swagger_edge.type = bte_edge.get('type')
            # Endpoints follow any node ID remapping done above
            swagger_edge.source_id = remapped_node_ids.get(
                bte_source_id, bte_source_id)
            swagger_edge.target_id = remapped_node_ids.get(
                bte_target_id, bte_target_id)
            swagger_edge.is_defined_by = "BTE"
            swagger_edge.provided_by = bte_edge.get('edge_source')

            # Every edge must be bound to BTE's "e1", which corresponds to our qedge_id
            if kg_to_qg_ids_dict['edges'].get(swagger_edge.id) != "e1":
                log.error("Could not map BTE qg_id to ARAX qedge_id",
                          error_code="UnknownQGID")
                return answer_kg
            answer_kg.add_edge(swagger_edge, qedge_id)

        return answer_kg