Ejemplo n.º 1
0
    def _load_answers_into_kg(self, neo4j_results: List[Dict[str, List[Dict[str, any]]]], kp: str,  query_graph: QueryGraph,
                              log: Response) -> Tuple[DictKnowledgeGraph, Dict[str, Dict[str, str]]]:
        """
        Convert the first Neo4j results table into an answer knowledge graph organized by QG IDs.

        :param neo4j_results: Query results; only the first element is read. Its keys are column names such as
            'nodes_n00' or 'edges_e01', each holding the Neo4j nodes/edges returned for that qnode/qedge.
        :param kp: Knowledge provider name; "KG2" and "KG1" select different edge conversions, and the
            node-UUID-to-curie lookup is only built for "KG1".
        :param query_graph: The query graph being answered; its nodes' IDs are looked up on each Neo4j edge.
        :param log: Response object used for logging.
        :return: A tuple of (answer KG, map of edge ID -> {qnode ID -> value stored on the Neo4j edge under
            that qnode ID}).
        """
        log.debug(f"Processing query results for edge {query_graph.edges[0].id}")
        answer_kg = DictKnowledgeGraph()
        nodes_by_edge = dict()
        if kp == "KG1":
            uuid_to_curie = self._build_node_uuid_to_curie_dict(neo4j_results[0])
        else:
            uuid_to_curie = dict()

        table = neo4j_results[0]
        for column in list(table):
            if column.startswith('nodes'):  # Example column name: 'nodes_n00'
                qnode_id = column.replace("nodes_", "", 1)
                for raw_node in table.get(column):
                    converted_node = self._convert_neo4j_node_to_swagger_node(raw_node, kp)
                    answer_kg.add_node(converted_node, qnode_id)
                continue

            if not column.startswith('edges'):  # Example column name: 'edges_e01'
                continue

            qedge_id = column.replace("edges_", "", 1)
            for raw_edge in table.get(column):
                if kp == "KG2":
                    converted_edge = self._convert_kg2_edge_to_swagger_edge(raw_edge)
                else:
                    converted_edge = self._convert_kg1_edge_to_swagger_edge(raw_edge, uuid_to_curie)

                # Remember which value this edge carries for each qnode ID in the query graph
                fulfilled_by = nodes_by_edge.setdefault(converted_edge.id, dict())
                for qnode in query_graph.nodes:
                    fulfilled_by[qnode.id] = raw_edge.get(qnode.id)

                answer_kg.add_edge(converted_edge, qedge_id)

        return answer_kg, nodes_by_edge
Ejemplo n.º 2
0
 def _merge_answer_into_message_kg(answer_dict_kg: DictKnowledgeGraph,
                                   dict_kg: DictKnowledgeGraph,
                                   log: Response):
     """
     Merge an answer KG (from the current edge/node expansion) into the overarching KG.

     Every node and edge in answer_dict_kg is passed to dict_kg.add_node / dict_kg.add_edge under the
     same qnode/qedge ID it was filed under in the answer KG.

     :param answer_dict_kg: The answer KG to read from (organized by QG IDs).
     :param dict_kg: The KG that receives the nodes and edges; modified via its add_* methods.
     :param log: Response object used for logging.
     """
     log.debug("Merging answer into Message.KnowledgeGraph")
     for qnode_id, answer_nodes in answer_dict_kg.nodes_by_qg_id.items():
         for answer_node in answer_nodes.values():
             dict_kg.add_node(answer_node, qnode_id)
     for qedge_id, answer_edges in answer_dict_kg.edges_by_qg_id.items():
         for answer_edge in answer_edges.values():
             dict_kg.add_edge(answer_edge, qedge_id)
Ejemplo n.º 3
0
    def _deduplicate_nodes(
            dict_kg: DictKnowledgeGraph,
            edge_to_nodes_map: Dict[str, Dict[str, str]], log: Response
    ) -> Tuple[DictKnowledgeGraph, Dict[str, Dict[str, str]]]:
        """
        Collapse nodes onto their preferred curies and rewrite edges and the edge-to-nodes map to match.

        :param dict_kg: Answer KG organized by QG IDs. Its node and edge objects are modified in place
            (id/name/type for nodes, source_id/target_id for edges) and reused in the returned KG.
        :param edge_to_nodes_map: For each edge ID, a map of qnode ID -> node ID that fulfilled it.
        :param log: Response object used for logging; its status is checked after each synonym lookup.
        :return: A tuple of (deduplicated KG, edge-to-nodes map expressed in preferred curies). If
            log.status is not 'OK' after a lookup, or an edge refers to a node with no mapping, the
            partially filled results are returned early.
        """
        log.debug("Deduplicating nodes")
        deduplicated_kg = DictKnowledgeGraph(
            nodes={qnode_id: dict()
                   for qnode_id in dict_kg.nodes_by_qg_id},
            edges={qedge_id: dict()
                   for qedge_id in dict_kg.edges_by_qg_id})
        updated_edge_to_nodes_map = {
            edge_id: dict()
            for edge_id in edge_to_nodes_map
        }
        curie_mappings = dict()  # original node ID -> preferred curie
        preferred_info = dict()  # original node ID -> (preferred name, preferred type)

        # First deduplicate the nodes
        for qnode_id, nodes in dict_kg.nodes_by_qg_id.items():
            # Load preferred curie info from NodeSynonymizer for nodes we haven't seen before
            unmapped_node_ids = set(nodes).difference(curie_mappings)
            log.debug(
                f"Getting preferred curies for {qnode_id} nodes returned in this step"
            )
            canonicalized_nodes = eu.get_preferred_curies(
                list(unmapped_node_ids), log) if unmapped_node_ids else dict()
            if log.status != 'OK':
                return deduplicated_kg, updated_edge_to_nodes_map

            for node_id in unmapped_node_ids:
                # Figure out the preferred curie/name/type for this node
                node = nodes.get(node_id)
                canonicalized_node = canonicalized_nodes.get(node_id)
                if canonicalized_node:
                    preferred_curie = canonicalized_node.get(
                        'preferred_curie', node_id)
                    preferred_name = canonicalized_node.get(
                        'preferred_name', node.name)
                    preferred_type = eu.convert_string_or_list_to_list(
                        canonicalized_node.get('preferred_type', node.type))
                else:
                    # Means the NodeSynonymizer didn't recognize this curie
                    preferred_curie = node_id
                    preferred_name = node.name
                    preferred_type = node.type
                curie_mappings[node_id] = preferred_curie
                preferred_info[node_id] = (preferred_name, preferred_type)

            # Add every node filed under this qnode_id, not only the newly mapped ones: a node that was
            # already mapped while processing an earlier qnode_id must still appear under this one.
            # TODO: merge certain fields, like uri?
            for node_id, node in nodes.items():
                preferred_curie = curie_mappings[node_id]
                if preferred_curie not in deduplicated_kg.nodes_by_qg_id[
                        qnode_id]:
                    preferred_name, preferred_type = preferred_info[node_id]
                    node.id = preferred_curie
                    node.name = preferred_name
                    node.type = preferred_type
                    deduplicated_kg.add_node(node, qnode_id)

        # Then update the edges to reflect changes made to the nodes
        for qedge_id, edges in dict_kg.edges_by_qg_id.items():
            for edge_id, edge in edges.items():
                # Resolve into locals first so a failed lookup doesn't overwrite the edge's IDs with None
                preferred_source_id = curie_mappings.get(edge.source_id)
                preferred_target_id = curie_mappings.get(edge.target_id)
                if not preferred_source_id or not preferred_target_id:
                    log.error(
                        f"Could not find preferred curie mappings for edge {edge_id}'s node(s)"
                    )
                    return deduplicated_kg, updated_edge_to_nodes_map
                edge.source_id = preferred_source_id
                edge.target_id = preferred_target_id
                deduplicated_kg.add_edge(edge, qedge_id)

                # Update the edge-to-node map for this edge (used down the line for pruning)
                for qnode_id, corresponding_node_id in edge_to_nodes_map[
                        edge_id].items():
                    updated_edge_to_nodes_map[edge_id][
                        qnode_id] = curie_mappings.get(corresponding_node_id)

        log.debug(
            f"After deduplication, answer KG counts are: {eu.get_printable_counts_by_qg_id(deduplicated_kg)}"
        )
        return deduplicated_kg, updated_edge_to_nodes_map
Ejemplo n.º 4
0
    def answer_one_hop_query(
        self, query_graph: QueryGraph
    ) -> Tuple[DictKnowledgeGraph, Dict[str, Dict[str, str]]]:
        """
        Answer a one-hop (single-edge) query using NGD, with KG2 supplying the candidate node pairs.

        The query graph is first run against ARAX/KG2; an NGD value is then calculated for the node pair
        of each returned KG2 edge, and an NGD edge (plus its two nodes) is created for every pair whose
        value is below 0.5.

        :param query_graph: A Reasoner API standard query graph.
        :return: A tuple containing:
            1. an (almost) Reasoner API standard knowledge graph containing all of the nodes and edges
               returned as results for the query. (Dictionary version, organized by QG IDs.)
            2. a map of which nodes fulfilled which qnode_ids for each edge. Example:
              {'KG1:111221': {'n00': 'DOID:111', 'n01': 'HP:124'}, 'KG1:111223': {'n00': 'DOID:111', 'n01': 'HP:126'}}
            Both are returned empty if the log status is not 'OK' after validation or after the KG2 query.
        """
        log = self.response
        continue_if_no_results = self.response.data['parameters'][
            'continue_if_no_results']
        answer_kg = DictKnowledgeGraph()
        nodes_by_edge = dict()

        def join_dsl_params(pieces):
            # Drop empty pieces, then comma-separate what is left
            return ", ".join([piece for piece in pieces if piece])

        # Verify this is a valid one-hop query graph
        self._verify_one_hop_query_graph_is_valid(query_graph, log)
        if log.status != 'OK':
            return answer_kg, nodes_by_edge

        # Find potential answers using KG2
        qedge = query_graph.edges[0]
        source_qnode = next(candidate for candidate in query_graph.nodes
                            if candidate.id == qedge.source_id)
        target_qnode = next(candidate for candidate in query_graph.nodes
                            if candidate.id == qedge.target_id)
        qedge_dsl = join_dsl_params([
            f"id={qedge.id}",
            f"source_id={source_qnode.id}",
            f"target_id={target_qnode.id}",
            self._get_dsl_qedge_type_str(qedge),
        ])
        source_dsl = join_dsl_params([
            f"id={source_qnode.id}",
            self._get_dsl_qnode_curie_str(source_qnode),
            self._get_dsl_qnode_type_str(source_qnode),
        ])
        target_dsl = join_dsl_params([
            f"id={target_qnode.id}",
            self._get_dsl_qnode_curie_str(target_qnode),
            self._get_dsl_qnode_type_str(target_qnode),
        ])
        kg2_answer_kg = self._run_arax_query([
            f"add_qnode({source_dsl})",
            f"add_qnode({target_dsl})",
            f"add_qedge({qedge_dsl})",
            "expand(kp=ARAX/KG2)",
            "return(message=true, store=false)",
        ], log)
        if log.status != 'OK':
            return answer_kg, nodes_by_edge

        # Go through those answers from KG2 and calculate ngd for each edge
        kg2_edges_by_id = {edge.id: edge for edge in kg2_answer_kg.edges}
        kg2_nodes_by_id = {node.id: node for node in kg2_answer_kg.nodes}
        self.cngd.load_curie_to_pmids_data(kg2_nodes_by_id)
        ngd_info_by_edge = dict()
        for kg2_edge in kg2_edges_by_id.values():
            # These are already canonicalized (default behavior)
            first_node = kg2_nodes_by_id.get(kg2_edge.source_id)
            second_node = kg2_nodes_by_id.get(kg2_edge.target_id)
            # Figure out which node corresponds to source qnode (don't necessarily match b/c query was bidirectional)
            if source_qnode.id in first_node.qnode_ids and target_qnode.id in second_node.qnode_ids:
                pair_source_id, pair_target_id = first_node.id, second_node.id
            else:
                pair_source_id, pair_target_id = second_node.id, first_node.id
            pair_ngd = self.cngd.calculate_ngd_fast(pair_source_id,
                                                    pair_target_id)
            ngd_info_by_edge[kg2_edge.id] = {
                "ngd_value": pair_ngd,
                "source_id": pair_source_id,
                "target_id": pair_target_id
            }

        # Create edges for those from KG2 found to have a low enough ngd value
        for ngd_info in ngd_info_by_edge.values():
            ngd_value = ngd_info['ngd_value']
            # TODO: Make determination of the threshold much more sophisticated
            if ngd_value is None or not ngd_value < 0.5:
                continue
            ngd_edge = self._create_ngd_edge(ngd_value,
                                             ngd_info["source_id"],
                                             ngd_info["target_id"])
            ngd_source_node = self._create_ngd_node(
                kg2_nodes_by_id.get(ngd_edge.source_id))
            ngd_target_node = self._create_ngd_node(
                kg2_nodes_by_id.get(ngd_edge.target_id))
            answer_kg.add_edge(ngd_edge, qedge.id)
            answer_kg.add_node(ngd_source_node, source_qnode.id)
            answer_kg.add_node(ngd_target_node, target_qnode.id)
            nodes_by_edge[ngd_edge.id] = {
                source_qnode.id: ngd_source_node.id,
                target_qnode.id: ngd_target_node.id
            }

        if not eu.qg_is_fulfilled(query_graph, answer_kg):
            no_results_message = "No paths were found satisfying this query graph using NGD"
            if continue_if_no_results:
                log.warning(no_results_message)
            else:
                log.error(no_results_message, error_code="NoResults")

        return answer_kg, nodes_by_edge
Ejemplo n.º 5
0
    def _add_answers_to_kg(self, answer_kg: DictKnowledgeGraph,
                           reasoner_std_response: Dict[str, any],
                           input_qnode_id: str, output_qnode_id: str,
                           qedge_id: str, log: Response) -> DictKnowledgeGraph:
        """
        Load the nodes and edges of a BTE response into the answer KG under our own QG IDs.

        :param answer_kg: The KG to add nodes/edges to; it is also the return value.
        :param reasoner_std_response: Response dict; its 'results' and 'knowledge_graph' ('nodes', 'edges')
            entries are read.
        :param input_qnode_id: The qnode ID that BTE's "n0" is mapped back to.
        :param output_qnode_id: The qnode ID that BTE's "n1" is mapped back to.
        :param qedge_id: The qedge ID that BTE's "e1" edges are filed under.
        :param log: Response object used for logging.
        :return: answer_kg. Nothing is added when the response has no edges; if a node or edge cannot be
            mapped back to a QG ID, an error is logged and answer_kg is returned as filled so far.
        """
        kg_to_qg_ids_dict = self._build_kg_to_qg_id_dict(
            reasoner_std_response['results'])
        bte_kg = reasoner_std_response['knowledge_graph']
        bte_edges = bte_kg['edges']
        if not bte_edges:
            return answer_kg

        preferred_ids = dict()  # BTE node ID -> ID chosen for it (output nodes only)
        log.debug(
            f"Got results back from BTE for this query ({len(bte_edges)} edges)"
        )

        for bte_node in bte_kg['nodes']:
            converted_node = Node()
            bte_node_id = bte_node.get('id')
            converted_node.name = bte_node.get('name')
            snake_case_type = eu.convert_string_to_snake_case(bte_node.get('type'))
            converted_node.type = eu.convert_string_or_list_to_list(snake_case_type)

            # Map the returned BTE qg_ids back to the original qnode_ids in our query graph
            bte_qg_id = kg_to_qg_ids_dict['nodes'].get(bte_node_id)
            if bte_qg_id == "n0":
                qnode_id = input_qnode_id
            elif bte_qg_id == "n1":
                qnode_id = output_qnode_id
            else:
                log.error("Could not map BTE qg_id to ARAX qnode_id",
                          error_code="UnknownQGID")
                return answer_kg

            # Find and use the preferred equivalent identifier for this node (if it's an output node)
            if qnode_id != output_qnode_id:
                converted_node.id = bte_node_id
            elif bte_node_id in preferred_ids:
                converted_node.id = preferred_ids.get(bte_node_id)
            else:
                equivalent_curies = []
                for prefix, local_ids in bte_node.get('equivalent_identifiers').items():
                    for local_id in local_ids:
                        equivalent_curies.append(
                            f"{prefix}:{eu.get_curie_local_id(local_id)}")
                converted_node.id = self._get_best_equivalent_bte_curie(
                    equivalent_curies, converted_node.type[0])
                preferred_ids[bte_node_id] = converted_node.id

            answer_kg.add_node(converted_node, qnode_id)

        for bte_edge in bte_edges:
            converted_edge = Edge()
            converted_edge.id = bte_edge.get("id")
            converted_edge.type = bte_edge.get('type')
            # Point the edge at the remapped node IDs where a remapping exists
            bte_source_id = bte_edge.get('source_id')
            converted_edge.source_id = preferred_ids.get(bte_source_id, bte_source_id)
            bte_target_id = bte_edge.get('target_id')
            converted_edge.target_id = preferred_ids.get(bte_target_id, bte_target_id)
            converted_edge.is_defined_by = "BTE"
            converted_edge.provided_by = bte_edge.get('edge_source')
            # Map the returned BTE qg_id back to the original qedge_id in our query graph
            bte_qg_id = kg_to_qg_ids_dict['edges'].get(converted_edge.id)
            if bte_qg_id != "e1":
                log.error("Could not map BTE qg_id to ARAX qedge_id",
                          error_code="UnknownQGID")
                return answer_kg
            answer_kg.add_edge(converted_edge, qedge_id)

        return answer_kg