Example #1
def get_canonical_curies_dict(curie: Union[str, List[str]],
                              log: ARAXResponse) -> Dict[str, Dict[str, str]]:
    curies = convert_string_or_list_to_list(curie)
    try:
        synonymizer = NodeSynonymizer()
        log.debug(
            f"Sending NodeSynonymizer.get_canonical_curies() a list of {len(curies)} curies"
        )
        canonical_curies_dict = synonymizer.get_canonical_curies(curies)
        log.debug(f"Got response back from NodeSynonymizer")
    except Exception:
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        log.error(f"Encountered a problem using NodeSynonymizer: {tb}",
                  error_code=error_type.__name__)
        return {}
    else:
        if canonical_curies_dict is not None:
            unrecognized_curies = {
                input_curie
                for input_curie in canonical_curies_dict
                if not canonical_curies_dict.get(input_curie)
            }
            if unrecognized_curies:
                log.warning(
                    f"NodeSynonymizer did not return canonical info for: {unrecognized_curies}"
                )
            return canonical_curies_dict
        else:
            log.error(f"NodeSynonymizer returned None",
                      error_code="NodeNormalizationIssue")
            return {}
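A minimal usage sketch for the helper above. The CURIE values are hypothetical, and the ARAX_response import path is assumed rather than taken from this page:

# Hypothetical usage of get_canonical_curies_dict() defined above
from ARAX_response import ARAXResponse  # assumed import path

log = ARAXResponse()
canonical_info = get_canonical_curies_dict(["DOID:14330", "CHEMBL.COMPOUND:CHEMBL112"], log)
for input_curie, info in canonical_info.items():
    if info:  # Entries the synonymizer didn't recognize come back empty/None
        print(input_curie, "->", info.get("preferred_curie"))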
Example #2
    def _load_answers_into_kg(self, neo4j_results: List[Dict[str, List[Dict[str, any]]]],
                              kg_name: str, qg: QueryGraph,
                              log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
        log.debug(
            f"Processing query results for edge {next(qedge_key for qedge_key in qg.edges)}"
        )
        final_kg = QGOrganizedKnowledgeGraph()
        node_uuid_to_curie_dict = self._build_node_uuid_to_curie_dict(
            neo4j_results[0]) if kg_name == "KG1" else dict()

        results_table = neo4j_results[0]
        column_names = [column_name for column_name in results_table]
        for column_name in column_names:
            # Load answer nodes into our knowledge graph
            if column_name.startswith('nodes'):  # Example column name: 'nodes_n00'
                column_qnode_key = column_name.replace("nodes_", "", 1)
                for neo4j_node in results_table.get(column_name):
                    node_key, node = self._convert_neo4j_node_to_trapi_node(
                        neo4j_node, kg_name)
                    final_kg.add_node(node_key, node, column_qnode_key)
            # Load answer edges into our knowledge graph
            elif column_name.startswith('edges'):  # Example column name: 'edges_e01'
                column_qedge_key = column_name.replace("edges_", "", 1)
                for neo4j_edge in results_table.get(column_name):
                    edge_key, edge = self._convert_neo4j_edge_to_trapi_edge(
                        neo4j_edge, node_uuid_to_curie_dict, kg_name)
                    final_kg.add_edge(edge_key, edge, column_qedge_key)

        return final_kg
Example #3
def get_canonical_curies_list(curie: Union[str, List[str]], log: ARAXResponse) -> List[str]:
    curies = convert_to_list(curie)
    try:
        synonymizer = NodeSynonymizer()
        log.debug(f"Sending NodeSynonymizer.get_canonical_curies() a list of {len(curies)} curies")
        canonical_curies_dict = synonymizer.get_canonical_curies(curies)
        log.debug(f"Got response back from NodeSynonymizer")
    except Exception:
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        log.error(f"Encountered a problem using NodeSynonymizer: {tb}", error_code=error_type.__name__)
        return []
    else:
        if canonical_curies_dict is not None:
            recognized_input_curies = {input_curie for input_curie in canonical_curies_dict if canonical_curies_dict.get(input_curie)}
            unrecognized_curies = set(curies).difference(recognized_input_curies)
            if unrecognized_curies:
                log.warning(f"NodeSynonymizer did not return canonical info for: {unrecognized_curies}")
            canonical_curies = {canonical_curies_dict[recognized_curie].get('preferred_curie') for recognized_curie in recognized_input_curies}
            # Include any original curies we weren't able to find a canonical version for
            canonical_curies.update(unrecognized_curies)
            if not canonical_curies:
                log.error(f"Final list of canonical curies is empty. This shouldn't happen!", error_code="CanonicalCurieIssue")
            return list(canonical_curies)
        else:
            log.error(f"NodeSynonymizer returned None", error_code="NodeNormalizationIssue")
            return []
Example #4
 def _add_inverted_predicates(qg: QueryGraph,
                              log: ARAXResponse) -> QueryGraph:
     # For now, we'll consider BOTH predicates in an inverse pair (TODO: later tailor to what we know is in KG2)
     qedge = next(qedge for qedge in qg.edges.values())
     response = requests.get(
         "https://raw.githubusercontent.com/biolink/biolink-model/master/biolink-model.yaml"
     )
     if response.status_code == 200:
         qedge.predicate = eu.convert_to_list(qedge.predicate)
         biolink_model = yaml.safe_load(response.text)
         inverse_predicates = set()
         for predicate in qedge.predicate:
             english_predicate = predicate.split(":")[-1].replace(
                 "_", " ")  # Converts to 'subclass of' format
             biolink_predicate_info = biolink_model["slots"].get(
                 english_predicate)
             if biolink_predicate_info and "inverse" in biolink_predicate_info:
                 english_inverse_predicate = biolink_predicate_info[
                     "inverse"]
                 machine_inverse_predicate = f"biolink:{english_inverse_predicate.replace(' ', '_')}"
                 inverse_predicates.add(machine_inverse_predicate)
                 log.debug(
                     f"Found inverse predicate for {predicate}: {machine_inverse_predicate}"
                 )
         qedge.predicate = list(
             set(qedge.predicate).union(inverse_predicates))
     else:
         log.warning(
             f"Cannot check for inverse predicates: Failed to load Biolink Model yaml file. "
             f"(Page gave status {response.status_code}.)")
     return qg
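The string handling above is easy to exercise in isolation. A minimal sketch, using a tiny stand-in dict instead of the real parsed biolink-model.yaml:

# Stand-in for the parsed Biolink Model yaml (the real file is downloaded in the snippet above)
biolink_model = {"slots": {"treats": {"inverse": "treated by"}}}

predicate = "biolink:treats"
english_predicate = predicate.split(":")[-1].replace("_", " ")  # 'biolink:treats' -> 'treats'
slot_info = biolink_model["slots"].get(english_predicate, {})
if "inverse" in slot_info:
    inverse_predicate = f"biolink:{slot_info['inverse'].replace(' ', '_')}"
    print(inverse_predicate)  # biolink:treated_by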
Example #5
 def _answer_query_using_bte(self, input_qnode_key: str, output_qnode_key: str, qg: QueryGraph,
                             answer_kg: QGOrganizedKnowledgeGraph, valid_bte_inputs_dict: Dict[str, Set[str]],
                             log: ARAXResponse) -> Tuple[QGOrganizedKnowledgeGraph, Set[str]]:
     accepted_curies = set()
     qedge_key = next(qedge_key for qedge_key in qg.edges)
     qedge = qg.edges[qedge_key]
     input_qnode = qg.nodes[input_qnode_key]
     output_qnode = qg.nodes[output_qnode_key]
     # Send this single-edge query to BTE, input curie by input curie (adding findings to our answer KG as we go)
     for curie in input_qnode.id:
         # Consider all different combinations of qnode types (can be multiple if gene/protein)
         for input_qnode_category, output_qnode_category in itertools.product(input_qnode.category, output_qnode.category):
             if eu.get_curie_prefix(curie) in valid_bte_inputs_dict['curie_prefixes']:
                 accepted_curies.add(curie)
                 try:
                     loop = asyncio.new_event_loop()
                     seqd = SingleEdgeQueryDispatcher(input_cls=input_qnode_category,
                                                      output_cls=output_qnode_category,
                                                      pred=qedge.predicate,
                                                      input_id=eu.get_curie_prefix(curie),
                                                      values=eu.get_curie_local_id(curie),
                                                      loop=loop)
                     log.debug(f"Sending query to BTE: {curie}-{qedge.predicate if qedge.predicate else ''}->{output_qnode_category}")
                     seqd.query()
                     reasoner_std_response = seqd.to_reasoner_std()
                 except Exception:
                     trace_back = traceback.format_exc()
                     error_type, error, _ = sys.exc_info()
                     log.error(f"Encountered a problem while using BioThings Explorer. {trace_back}",
                               error_code=error_type.__name__)
                     return answer_kg, accepted_curies
                 else:
                     answer_kg = self._add_answers_to_kg(answer_kg, reasoner_std_response, input_qnode_key, output_qnode_key, qedge_key, log)
     return answer_kg, accepted_curies
Example #6
def get_preferred_categories(curie: Union[str, List[str]],
                             log: ARAXResponse) -> Optional[List[str]]:
    curies = convert_to_list(curie)
    synonymizer = NodeSynonymizer()
    log.debug(
        f"Sending NodeSynonymizer.get_canonical_curies() a list of {len(curies)} curies"
    )
    canonical_curies_dict = synonymizer.get_canonical_curies(curies)
    log.debug(f"Got response back from NodeSynonymizer")
    if canonical_curies_dict is not None:
        recognized_input_curies = {
            input_curie
            for input_curie in canonical_curies_dict
            if canonical_curies_dict.get(input_curie)
        }
        unrecognized_curies = set(curies).difference(recognized_input_curies)
        if unrecognized_curies:
            log.warning(
                f"NodeSynonymizer did not recognize: {unrecognized_curies}")
        preferred_categories = {
            canonical_curies_dict[recognized_curie].get('preferred_category')
            for recognized_curie in recognized_input_curies
        }
        if preferred_categories:
            return list(preferred_categories)
        else:
            log.warning(
                f"Unable to find any preferred categories; will default to biolink:NamedThing"
            )
            return ["biolink:NamedThing"]
    else:
        log.error(f"NodeSynonymizer returned None",
                  error_code="NodeNormalizationIssue")
        return []
Example #7
    def apply(self, input_message, input_parameters):

        #### Define a default response
        response = ARAXResponse()
        self.response = response
        self.message = input_message

        #### Basic checks on arguments
        if not isinstance(input_parameters, dict):
            response.error("Provided parameters is not a dict",
                           error_code="ParametersNotDict")
            return response

        #### Define a complete set of allowed parameters and their defaults
        parameters = {
            'maximum_results': None,
            'minimum_confidence': None,
            'start_node': 1
        }

        #### Loop through the input_parameters and override the defaults and make sure they are allowed
        for key, value in input_parameters.items():
            if key not in parameters:
                response.error(f"Supplied parameter {key} is not permitted",
                               error_code="UnknownParameter")
            else:
                parameters[key] = value
        #### Return if any of the parameters generated an error (showing not just the first one)
        if response.status != 'OK':
            return response

        #### Store these final parameters for convenience
        response.data['parameters'] = parameters
        self.parameters = parameters

        #### Now apply the filters. Order of operations is probably quite important
        #### Scalar value filters probably come first like minimum_confidence, then complex logic filters
        #### based on edge or node properties, and then finally maximum_results
        response.debug(
            f"Applying filter to Message with parameters {parameters}")

        #### First, as a test, blow away the results and see if we can recompute them
        #message.n_results = 0
        #message.results = []
        #self.__recompute_results()

        #### Apply scalar value filters first to do easy things and reduce the problem
        # TODO

        #### Complex logic filters probably come next. These may be hard
        # TODO

        #### Finally, if the maximum_results parameter is set, limit the number of results to that value (as the last step)
        if parameters['maximum_results'] is not None:
            self.__apply_maximum_results_filter(parameters['maximum_results'])

        #### Return the response
        return response
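The defaults/override/unknown-key pattern in apply() can be exercised on its own. A self-contained sketch with invented names (not part of the ARAX code above):

# Standalone sketch of the parameter-handling pattern used in apply()
def merge_parameters(input_parameters: dict):
    defaults = {'maximum_results': None, 'minimum_confidence': None, 'start_node': 1}
    errors = []
    for key, value in input_parameters.items():
        if key not in defaults:
            errors.append(f"Supplied parameter {key} is not permitted")
        else:
            defaults[key] = value
    return defaults, errors

params, errors = merge_parameters({'maximum_results': 100, 'bogus': True})
print(params)  # {'maximum_results': 100, 'minimum_confidence': None, 'start_node': 1}
print(errors)  # ['Supplied parameter bogus is not permitted']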
Example #8
    def _convert_one_hop_query_graph_to_cypher_query(
            self, qg: QueryGraph, enforce_directionality: bool,
            log: ARAXResponse) -> str:
        qedge_key = next(qedge_key for qedge_key in qg.edges)
        qedge = qg.edges[qedge_key]
        log.debug(f"Generating cypher for edge {qedge_key} query graph")
        try:
            # Build the match clause
            subject_qnode_key = qedge.subject
            object_qnode_key = qedge.object
            qedge_cypher = self._get_cypher_for_query_edge(
                qedge_key, qg, enforce_directionality)
            source_qnode_cypher = self._get_cypher_for_query_node(
                subject_qnode_key, qg)
            target_qnode_cypher = self._get_cypher_for_query_node(
                object_qnode_key, qg)
            match_clause = f"MATCH {source_qnode_cypher}{qedge_cypher}{target_qnode_cypher}"

            # Build the where clause
            where_fragments = []
            for qnode_key in [subject_qnode_key, object_qnode_key]:
                qnode = qg.nodes[qnode_key]
                if qnode.id and isinstance(qnode.id, list) and len(qnode.id) > 1:
                    where_fragments.append(f"{qnode_key}.id in {qnode.id}")
                if qnode.category:
                    qnode.category = eu.convert_to_list(qnode.category)
                    if len(qnode.category) > 1:
                        # Create where fragment that looks like 'n00:biolink:Disease OR n00:biolink:PhenotypicFeature..'
                        category_sub_fragments = [
                            f"{qnode_key}:`{category}`"
                            for category in qnode.category
                        ]
                        category_where_fragment = f"({' OR '.join(category_sub_fragments)})"
                        where_fragments.append(category_where_fragment)
            where_clause = f"WHERE {' AND '.join(where_fragments)}" if where_fragments else ""

            # Build the with clause
            source_qnode_col_name = f"nodes_{subject_qnode_key}"
            target_qnode_col_name = f"nodes_{object_qnode_key}"
            qedge_col_name = f"edges_{qedge_key}"
            # This line grabs the edge's ID and a record of which of its nodes correspond to which qnode ID
            extra_edge_properties = "{.*, " + f"id:ID({qedge_key}), {subject_qnode_key}:{subject_qnode_key}.id, {object_qnode_key}:{object_qnode_key}.id" + "}"
            with_clause = f"WITH collect(distinct {subject_qnode_key}) as {source_qnode_col_name}, " \
                          f"collect(distinct {object_qnode_key}) as {target_qnode_col_name}, " \
                          f"collect(distinct {qedge_key}{extra_edge_properties}) as {qedge_col_name}"

            # Build the return clause
            return_clause = f"RETURN {source_qnode_col_name}, {target_qnode_col_name}, {qedge_col_name}"

            cypher_query = f"{match_clause} {where_clause} {with_clause} {return_clause}"
            return cypher_query
        except Exception:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            log.error(f"Problem generating cypher for query. {tb}",
                      error_code=error_type.__name__)
            return ""
Example #9
 def _send_query_to_kp(self, query_graph: QueryGraph,
                       log: ARAXResponse) -> Dict[str, any]:
     # Send query to their API (stripping down qnode/qedges to only the properties they like)
     stripped_qnodes = []
     for qnode_key, qnode in query_graph.nodes.items():
         stripped_qnode = {'id': qnode_key, 'type': qnode.category}
         if qnode.id:
             stripped_qnode['curie'] = qnode.id
         stripped_qnodes.append(stripped_qnode)
     qedge_key = next(qedge_key for qedge_key in
                      query_graph.edges)  # Our query graph is single-edge
     qedge = query_graph.edges[qedge_key]
     stripped_qedge = {
         'id': qedge_key,
         'source_id': qedge.subject,
         'target_id': qedge.object,
         'type': list(self.accepted_edge_types)[0]
     }
     source_stripped_qnode = next(qnode for qnode in stripped_qnodes
                                  if qnode['id'] == qedge.subject)
     input_curies = eu.convert_string_or_list_to_list(
         source_stripped_qnode['curie'])
     combined_response = dict()
     for input_curie in input_curies:  # Until we have batch querying, ping them one-by-one for each input curie
         log.debug(
             f"Sending {qedge_key} query to {self.kp_name} for {input_curie}"
         )
         source_stripped_qnode['curie'] = input_curie
         kp_response = requests.post(self.kp_query_endpoint,
                                     json={
                                         'message': {
                                             'query_graph': {
                                                 'nodes': stripped_qnodes,
                                                 'edges': [stripped_qedge]
                                             }
                                         }
                                     },
                                     headers={'accept': 'application/json'})
         if kp_response.status_code != 200:
             log.warning(
                 f"{self.kp_name} KP API returned response of {kp_response.status_code}"
             )
         else:
             kp_response_json = kp_response.json()
             if kp_response_json.get('results'):
                 if not combined_response:
                     combined_response = kp_response_json
                 else:
                     combined_response['knowledge_graph']['nodes'] += kp_response_json['knowledge_graph']['nodes']
                     combined_response['knowledge_graph']['edges'] += kp_response_json['knowledge_graph']['edges']
                     combined_response['results'] += kp_response_json['results']
     return combined_response
Example #10
    def _add_answers_to_kg(self, answer_kg: QGOrganizedKnowledgeGraph, reasoner_std_response: Dict[str, any],
                           input_qnode_key: str, output_qnode_key: str, qedge_key: str, log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
        kg_to_qg_ids_dict = self._build_kg_to_qg_id_dict(reasoner_std_response['results'])
        if reasoner_std_response['knowledge_graph']['edges']:
            remapped_node_keys = dict()
            log.debug(f"Got results back from BTE for this query "
                      f"({len(reasoner_std_response['knowledge_graph']['edges'])} edges)")

            for node in reasoner_std_response['knowledge_graph']['nodes']:
                swagger_node = Node()
                bte_node_key = node.get('id')
                swagger_node.name = node.get('name')
                swagger_node.category = eu.convert_to_list(eu.convert_string_to_snake_case(node.get('type')))

                # Map the returned BTE qg_ids back to the original qnode_keys in our query graph
                bte_qg_id = kg_to_qg_ids_dict['nodes'].get(bte_node_key)
                if bte_qg_id == "n0":
                    qnode_key = input_qnode_key
                elif bte_qg_id == "n1":
                    qnode_key = output_qnode_key
                else:
                    log.error("Could not map BTE qg_id to ARAX qnode_key", error_code="UnknownQGID")
                    return answer_kg

                # Find and use the preferred equivalent identifier for this node (if it's an output node)
                if qnode_key == output_qnode_key:
                    if bte_node_key in remapped_node_keys:
                        swagger_node_key = remapped_node_keys.get(bte_node_key)
                    else:
                        equivalent_curies = [f"{prefix}:{eu.get_curie_local_id(local_id)}" for prefix, local_ids in
                                             node.get('equivalent_identifiers').items() for local_id in local_ids]
                        swagger_node_key = self._get_best_equivalent_bte_curie(equivalent_curies, swagger_node.category[0])
                        remapped_node_keys[bte_node_key] = swagger_node_key
                else:
                    swagger_node_key = bte_node_key

                answer_kg.add_node(swagger_node_key, swagger_node, qnode_key)

            for edge in reasoner_std_response['knowledge_graph']['edges']:
                swagger_edge = Edge()
                swagger_edge_key = edge.get("id")
                swagger_edge.predicate = edge.get('type')
                swagger_edge.subject = remapped_node_keys.get(edge.get('source_id'), edge.get('source_id'))
                swagger_edge.object = remapped_node_keys.get(edge.get('target_id'), edge.get('target_id'))
                swagger_edge.attributes = [Attribute(name="provided_by", value=edge.get('edge_source'), type=eu.get_attribute_type("provided_by")),
                                           Attribute(name="is_defined_by", value="BTE", type=eu.get_attribute_type("is_defined_by"))]
                # Map the returned BTE qg_id back to the original qedge_key in our query graph
                bte_qg_id = kg_to_qg_ids_dict['edges'].get(swagger_edge_key)
                if bte_qg_id != "e1":
                    log.error("Could not map BTE qg_id to ARAX qedge_key", error_code="UnknownQGID")
                    return answer_kg
                answer_kg.add_edge(swagger_edge_key, swagger_edge, qedge_key)

        return answer_kg
Example #11
def get_curie_names(curie: Union[str, List[str]],
                    log: ARAXResponse) -> Dict[str, str]:
    curies = convert_to_list(curie)
    synonymizer = NodeSynonymizer()
    log.debug(
        f"Looking up names for {len(curies)} input curies using NodeSynonymizer"
    )
    synonymizer_info = synonymizer.get_normalizer_results(curies)
    curie_to_name_map = dict()
    if synonymizer_info:
        recognized_input_curies = {
            input_curie
            for input_curie in synonymizer_info
            if synonymizer_info.get(input_curie)
        }
        unrecognized_curies = set(curies).difference(recognized_input_curies)
        if unrecognized_curies:
            log.warning(
                f"NodeSynonymizer did not recognize: {unrecognized_curies}")
        input_curies_without_matching_node = set()
        for input_curie in recognized_input_curies:
            equivalent_nodes = synonymizer_info[input_curie]["nodes"]
            # Find the 'node' in the synonymizer corresponding to this curie
            input_curie_nodes = [
                node for node in equivalent_nodes
                if node["identifier"] == input_curie
            ]
            if not input_curie_nodes:
                # Try looking for slight variation (KG2 vs. SRI discrepancy): "KEGG:C02700" vs. "KEGG.COMPOUND:C02700"
                input_curie_stripped = input_curie.replace(".COMPOUND", "")
                input_curie_nodes = [
                    node for node in equivalent_nodes
                    if node["identifier"] == input_curie_stripped
                ]
            # Record the name for this input curie
            if input_curie_nodes:
                curie_to_name_map[input_curie] = input_curie_nodes[0].get("label")
            else:
                input_curies_without_matching_node.add(input_curie)
        if input_curies_without_matching_node:
            log.warning(
                f"No matching nodes found in NodeSynonymizer for these input curies: "
                f"{input_curies_without_matching_node}. Cannot determine their specific names."
            )
    else:
        log.error(f"NodeSynonymizer returned None",
                  error_code="NodeNormalizationIssue")
    return curie_to_name_map
Example #12
 def _answer_query_using_plover(
     qg: QueryGraph, log: ARAXResponse
 ) -> Tuple[Dict[str, Dict[str, Set[Union[str, int]]]], int]:
     rtxc = RTXConfiguration()
     rtxc.live = "Production"
     log.debug(f"Sending query to Plover")
     response = requests.post(f"{rtxc.plover_url}/query",
                              json=qg.to_dict(),
                              headers={'accept': 'application/json'})
     if response.status_code == 200:
         log.debug(f"Got response back from Plover")
         return response.json(), response.status_code
     else:
         log.warning(
             f"Plover returned a status code of {response.status_code}. Response was: {response.text}"
         )
         return dict(), response.status_code
Example #13
    def _load_answers_into_kg(
        self, neo4j_results: List[Dict[str, List[Dict[str, any]]]],
        kg_name: str, qg: QueryGraph, log: ARAXResponse
    ) -> Tuple[QGOrganizedKnowledgeGraph, Dict[str, Dict[str, str]]]:
        log.debug(
            f"Processing query results for edge {next(qedge_key for qedge_key in qg.edges)}"
        )
        final_kg = QGOrganizedKnowledgeGraph()
        edge_to_nodes_map = dict()
        node_uuid_to_curie_dict = self._build_node_uuid_to_curie_dict(
            neo4j_results[0]) if kg_name == "KG1" else dict()

        results_table = neo4j_results[0]
        column_names = [column_name for column_name in results_table]
        for column_name in column_names:
            # Load answer nodes into our knowledge graph
            if column_name.startswith('nodes'):  # Example column name: 'nodes_n00'
                column_qnode_key = column_name.replace("nodes_", "", 1)
                for neo4j_node in results_table.get(column_name):
                    swagger_node_key, swagger_node = self._convert_neo4j_node_to_swagger_node(
                        neo4j_node, kg_name)
                    final_kg.add_node(swagger_node_key, swagger_node,
                                      column_qnode_key)
            # Load answer edges into our knowledge graph
            elif column_name.startswith('edges'):  # Example column name: 'edges_e01'
                column_qedge_key = column_name.replace("edges_", "", 1)
                for neo4j_edge in results_table.get(column_name):
                    swagger_edge_key, swagger_edge = self._convert_neo4j_edge_to_swagger_edge(
                        neo4j_edge, node_uuid_to_curie_dict, kg_name)

                    # Record which of this edge's nodes correspond to which qnode_key
                    if swagger_edge_key not in edge_to_nodes_map:
                        edge_to_nodes_map[swagger_edge_key] = dict()
                    for qnode_key in qg.nodes:
                        edge_to_nodes_map[swagger_edge_key][
                            qnode_key] = neo4j_edge.get(qnode_key)

                    # Finally add the current edge to our answer knowledge graph
                    final_kg.add_edge(swagger_edge_key, swagger_edge,
                                      column_qedge_key)

        return final_kg, edge_to_nodes_map
Example #14
 def _prune_highly_connected_nodes(kg: QGOrganizedKnowledgeGraph, qedge_key: str, input_curies: Set[str],
                                   input_qnode_key: str, max_edges_per_input_curie: int, log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
     # First create a lookup of which edges belong to which input curies
     input_nodes_to_edges_dict = defaultdict(set)
     for edge_key, edge in kg.edges_by_qg_id[qedge_key].items():
         if edge.subject in input_curies:
             input_nodes_to_edges_dict[edge.subject].add(edge_key)
         if edge.object in input_curies:
             input_nodes_to_edges_dict[edge.object].add(edge_key)
     # Then prune down highly-connected nodes (delete edges per input curie in excess of some set limit)
     for node_key, connected_edge_keys in input_nodes_to_edges_dict.items():
         connected_edge_keys_list = list(connected_edge_keys)
         if len(connected_edge_keys_list) > max_edges_per_input_curie:
             random.shuffle(connected_edge_keys_list)  # Make it random which edges we keep for this input curie
             edge_keys_to_remove = connected_edge_keys_list[max_edges_per_input_curie:]
             log.debug(f"Randomly removing {len(edge_keys_to_remove)} edges from answer for input curie {node_key}")
             for edge_key in edge_keys_to_remove:
                 kg.edges_by_qg_id[qedge_key].pop(edge_key, None)
             # Document that not all answers for this input curie are included
             node = kg.nodes_by_qg_id[input_qnode_key].get(node_key)
             if node:
                 if not node.attributes:
                     node.attributes = []
                 if not any(attribute.attribute_type_id == "biolink:incomplete_result_set"
                            for attribute in node.attributes):
                     node.attributes.append(Attribute(attribute_type_id="biolink:incomplete_result_set",  # TODO: request this as actual biolink item?
                                                      value_type_id="metatype:Boolean",
                                                      value=True,
                                                      attribute_source="infores:rtx-kg2",
                                                      description=f"This attribute indicates that not all "
                                                                  f"nodes/edges returned as answers for this input "
                                                                  f"curie were included in the final answer due to "
                                                                  f"size limitations. {max_edges_per_input_curie} "
                                                                  f"edges for this input curie were kept."))
     # Then delete any nodes orphaned by removal of edges
     node_keys_used_by_edges = kg.get_all_node_keys_used_by_edges()
     for qnode_key, nodes in kg.nodes_by_qg_id.items():
         orphan_node_keys = set(nodes).difference(node_keys_used_by_edges)
         if orphan_node_keys:
             log.debug(f"Removing {len(orphan_node_keys)} {qnode_key} nodes orphaned by the above step")
             for orphan_node_key in orphan_node_keys:
                 del kg.nodes_by_qg_id[qnode_key][orphan_node_key]
     return kg
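The per-curie edge cap used above can be illustrated with plain dictionaries. A simplified, self-contained sketch of the same idea (names and data invented for illustration):

import random
from collections import defaultdict

def cap_edges_per_curie(edges: dict, input_curies: set, max_per_curie: int) -> dict:
    # edges maps edge_key -> (subject, object); keep at most max_per_curie edges per input curie
    curie_to_edges = defaultdict(set)
    for edge_key, (subject, obj) in edges.items():
        if subject in input_curies:
            curie_to_edges[subject].add(edge_key)
        if obj in input_curies:
            curie_to_edges[obj].add(edge_key)
    for curie, edge_keys in curie_to_edges.items():
        edge_keys = list(edge_keys)
        if len(edge_keys) > max_per_curie:
            random.shuffle(edge_keys)  # make it random which edges survive
            for edge_key in edge_keys[max_per_curie:]:
                edges.pop(edge_key, None)
    return edges

edges = {"e1": ("DOID:1", "CHEBI:1"), "e2": ("DOID:1", "CHEBI:2"), "e3": ("DOID:1", "CHEBI:3")}
print(cap_edges_per_curie(edges, {"DOID:1"}, max_per_curie=2))  # two of the three edges remain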
Example #15
def get_node_pairs_to_overlay(subject_qnode_key: str, object_qnode_key: str, query_graph: QueryGraph,
                              knowledge_graph: KnowledgeGraph, log: ARAXResponse) -> Set[Tuple[str, str]]:
    """
    This function determines which combinations of subject/object nodes in the KG need to be overlaid (e.g., have a
    virtual edge added between them). It makes use of Resultify to determine which combinations of subject and object
    nodes may actually appear together in the same Results. (See issue #1069.) If it fails to narrow down the node
    pairs for whatever reason, it defaults to returning all possible combinations of subject/object nodes.
    """
    log.debug(f"Narrowing down {subject_qnode_key}--{object_qnode_key} node pairs to overlay")
    kg_nodes_by_qg_id = get_node_ids_by_qg_id(knowledge_graph)
    kg_edges_by_qg_id = get_edge_ids_by_qg_id(knowledge_graph)
    # Grab the portion of the QG already 'expanded' (aka, present in the KG)
    sub_query_graph = QueryGraph(nodes={key:qnode for key, qnode in query_graph.nodes.items() if key in set(kg_nodes_by_qg_id)},
                                 edges={key:qedge for key, qedge in query_graph.edges.items() if key in set(kg_edges_by_qg_id)})

    # Compute results using Resultify so we can see which nodes appear in the same results
    resultifier = ARAXResultify()
    sub_response = ARAXResponse()
    sub_response.envelope = Response()
    sub_response.envelope.message = Message()
    sub_message = sub_response.envelope.message
    sub_message.query_graph = sub_query_graph
    sub_message.knowledge_graph = KnowledgeGraph(nodes=knowledge_graph.nodes.copy(),
                                                 edges=knowledge_graph.edges.copy())
    #sub_response.envelope.message = sub_message
    resultify_response = resultifier.apply(sub_response, {})

    # Figure out which node pairs appear together in one or more results
    if resultify_response.status == 'OK':
        node_pairs = set()
        for result in sub_message.results:
            subject_curies_in_this_result = {node_binding.id for key, node_binding_list in result.node_bindings.items() for node_binding in node_binding_list if
                                            key == subject_qnode_key}
            object_curies_in_this_result = {node_binding.id for key, node_binding_list in result.node_bindings.items() for node_binding in node_binding_list if
                                            key == object_qnode_key}
            pairs_in_this_result = set(itertools.product(subject_curies_in_this_result, object_curies_in_this_result))
            node_pairs = node_pairs.union(pairs_in_this_result)
        log.debug(f"Identified {len(node_pairs)} node pairs to overlay (with help of resultify)")
        if node_pairs:
            return node_pairs
    # Fall back to the old (O(n^2)) method of using all combinations of subject/object nodes in the KG
    log.warning(f"Failed to narrow down node pairs to overlay; defaulting to all possible combinations")
    return set(itertools.product(kg_nodes_by_qg_id[subject_qnode_key], kg_nodes_by_qg_id[object_qnode_key]))
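The fallback on the last line is just the Cartesian product of the two qnodes' curies, e.g.:

import itertools

subject_curies = {"DOID:1", "DOID:2"}
object_curies = {"CHEBI:10"}
print(set(itertools.product(subject_curies, object_curies)))
# e.g. {('DOID:1', 'CHEBI:10'), ('DOID:2', 'CHEBI:10')}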
Example #16
 def make_qg_use_supported_prefixes(
         self, qg: QueryGraph, kp_name: str,
         log: ARAXResponse) -> Optional[QueryGraph]:
     for qnode_key, qnode in qg.nodes.items():
         if qnode.ids:
             if kp_name == "infores:rtx-kg2":
                 # Just convert them into canonical curies
                 qnode.ids = eu.get_canonical_curies_list(qnode.ids, log)
             else:
                 # Otherwise figure out which kind of curies KPs want
                 categories = eu.convert_to_list(qnode.categories)
                 supported_prefixes = self._get_supported_prefixes(
                     categories, kp_name)
                 used_prefixes = {
                     self._get_uppercase_prefix(curie)
                     for curie in qnode.ids
                 }
                 # Only convert curie(s) if any use an unsupported prefix
                 if used_prefixes.issubset(supported_prefixes):
                     self.log.debug(
                         f"{kp_name}: All {qnode_key} curies use prefix(es) {kp_name} supports; no "
                         f"conversion necessary")
                 else:
                     self.log.debug(
                         f"{kp_name}: One or more {qnode_key} curies use a prefix {kp_name} doesn't "
                         f"support; will convert these")
                     converted_curies = self.get_desirable_equivalent_curies(
                         qnode.ids, qnode.categories, kp_name)
                     if converted_curies:
                         log.debug(
                             f"{kp_name}: Converted {qnode_key}'s {len(qnode.ids)} curies to a list of "
                             f"{len(converted_curies)} curies tailored for {kp_name}"
                         )
                         qnode.ids = converted_curies
                     else:
                         log.info(
                             f"{kp_name} cannot answer the query because no equivalent curies were found "
                             f"with prefixes it supports for qnode {qnode_key}. Original curies were: "
                             f"{qnode.ids}")
                         return None
     return qg
Example #17
 def _answer_query_using_plover(qg: QueryGraph, log: ARAXResponse) -> Tuple[Dict[str, Dict[str, Union[set, dict]]], int]:
     rtxc = RTXConfiguration()
     rtxc.live = "Production"
     # First prep the query graph (requires some minor additions for Plover)
     dict_qg = qg.to_dict()
     dict_qg["include_metadata"] = True  # Ask plover to return node/edge objects (not just IDs)
     dict_qg["respect_predicate_symmetry"] = True  # Ignore direction for symmetric predicate, enforce for asymmetric
     # Allow subclass_of reasoning for qnodes with a small number of curies
     for qnode in dict_qg["nodes"].values():
         if qnode.get("ids") and len(qnode["ids"]) < 5:
             if "allow_subclasses" not in qnode or qnode["allow_subclasses"] is None:
                 qnode["allow_subclasses"] = True
     # Then send the actual query
     response = requests.post(f"{rtxc.plover_url}/query", json=dict_qg, timeout=60,
                              headers={'accept': 'application/json'})
     if response.status_code == 200:
         log.debug(f"Got response back from Plover")
         return response.json(), response.status_code
     else:
         log.warning(f"Plover returned a status code of {response.status_code}. Response was: {response.text}")
         return dict(), response.status_code
Example #18
def get_curie_synonyms(curie: Union[str, List[str]],
                       log: ARAXResponse) -> List[str]:
    curies = convert_string_or_list_to_list(curie)
    try:
        synonymizer = NodeSynonymizer()
        log.debug(
            f"Sending NodeSynonymizer.get_equivalent_nodes() a list of {len(curies)} curies"
        )
        equivalent_curies_dict = synonymizer.get_equivalent_nodes(
            curies, kg_name="KG2")
        log.debug(f"Got response back from NodeSynonymizer")
    except Exception:
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        log.error(f"Encountered a problem using NodeSynonymizer: {tb}",
                  error_code=error_type.__name__)
        return []
    else:
        if equivalent_curies_dict is not None:
            curies_missing_info = {
                curie
                for curie in equivalent_curies_dict
                if not equivalent_curies_dict.get(curie)
            }
            if curies_missing_info:
                log.warning(
                    f"NodeSynonymizer did not find any equivalent curies for: {curies_missing_info}"
                )
            equivalent_curies = {
                curie
                for curie_dict in equivalent_curies_dict.values() if curie_dict
                for curie in curie_dict
            }
            all_curies = equivalent_curies.union(set(
                curies))  # Make sure even curies without synonyms are included
            return sorted(list(all_curies))
        else:
            log.error(f"NodeSynonymizer returned None",
                      error_code="NodeNormalizationIssue")
            return []
Example #19
    def decorate_nodes(self, response: ARAXResponse) -> ARAXResponse:
        message = response.envelope.message
        response.debug(f"Decorating nodes with metadata from KG2c")

        # Get connected to the local KG2c sqlite database
        connection, cursor = self._connect_to_kg2c_sqlite()

        # Extract the KG2c nodes from sqlite
        response.debug(f"Looking up corresponding KG2c nodes in sqlite")
        node_attributes_ordered = list(self.node_attributes)
        node_cols_str = ", ".join([
            f"N.{property_name}" for property_name in node_attributes_ordered
        ])
        node_keys = set(
            node_key.replace("'", "''")
            for node_key in message.knowledge_graph.nodes)  # Escape quotes
        node_keys_str = "','".join(
            node_keys)  # SQL wants ('node1', 'node2') format for string lists
        sql_query = f"SELECT N.id, {node_cols_str} " \
                    f"FROM nodes AS N " \
                    f"WHERE N.id IN ('{node_keys_str}')"
        cursor.execute(sql_query)
        rows = cursor.fetchall()
        cursor.close()
        connection.close()

        # Decorate nodes in the KG with info in these KG2c nodes
        response.debug(f"Adding attributes to nodes in the KG")
        for row in rows:
            # First create the attributes for this KG2c node
            node_id = row[0]
            trapi_node = message.knowledge_graph.nodes[node_id]
            kg2c_node_attributes = []
            for index, property_name in enumerate(node_attributes_ordered):
                value = self._load_property(
                    property_name,
                    row[index + 1])  # Add one to account for 'id' column
                if value:
                    kg2c_node_attributes.append(
                        self.create_attribute(property_name, value))

            # Then decorate the TRAPI node with those attributes it doesn't already have
            existing_attribute_triples = {
                self._get_attribute_triple(attribute)
                for attribute in trapi_node.attributes
            } if trapi_node.attributes else set()
            novel_attributes = [
                attribute for attribute in kg2c_node_attributes
                if self._get_attribute_triple(attribute) not in
                existing_attribute_triples
            ]
            if trapi_node.attributes:
                trapi_node.attributes += novel_attributes
            else:
                trapi_node.attributes = novel_attributes

        return response
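The sqlite IN-clause construction used above can be checked in isolation (the node keys here are just examples):

node_keys = {"CHEBI:45783", "DOID:14330"}
escaped_keys = {key.replace("'", "''") for key in node_keys}  # Escape single quotes for SQL
node_keys_str = "','".join(escaped_keys)
sql_query = f"SELECT N.id FROM nodes AS N WHERE N.id IN ('{node_keys_str}')"
print(sql_query)  # e.g. SELECT N.id FROM nodes AS N WHERE N.id IN ('DOID:14330','CHEBI:45783')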
Example #20
 def _pre_process_query_graph(self, query_graph: QueryGraph,
                              log: ARAXResponse) -> QueryGraph:
     for qnode_key, qnode in query_graph.nodes.items():
         # Convert node types to preferred format and verify we can do this query
         formatted_qnode_categories = {
             self.node_category_overrides_for_kp.get(
                 qnode_category, qnode_category)
             for qnode_category in eu.convert_string_or_list_to_list(
                 qnode.category)
         }
         accepted_qnode_categories = formatted_qnode_categories.intersection(
             self.accepted_node_categories)
         if not accepted_qnode_categories:
             log.error(
                 f"{self.kp_name} can only be used for queries involving {self.accepted_node_categories} "
                 f"and QNode {qnode_key} has category '{qnode.category}'",
                 error_code="UnsupportedQueryForKP")
             return query_graph
         else:
             qnode.category = list(accepted_qnode_categories)[0]
         # Convert curies to equivalent curies accepted by the KP (depending on qnode type)
         if qnode.id:
             equivalent_curies = eu.get_curie_synonyms(qnode.id, log)
             desired_curies = [
                 curie for curie in equivalent_curies if curie.startswith(
                     f"{self.kp_preferred_prefixes[qnode.category]}:")
             ]
             if desired_curies:
                 qnode.id = desired_curies if len(
                     desired_curies) > 1 else desired_curies[0]
                 log.debug(
                     f"Converted qnode {qnode_key} curie to {qnode.id}")
             else:
                 log.warning(
                     f"Could not convert qnode {qnode_key} curie(s) to preferred prefix ({self.kp_preferred_prefixes[qnode.category]})"
                 )
     return query_graph
Example #21
 def _load_plover_answer_into_object_model(self, plover_answer: Dict[str, Dict[str, Union[set, dict]]],
                                           log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
     answer_kg = QGOrganizedKnowledgeGraph()
     # Load returned nodes into TRAPI object model
     for qnode_key, nodes in plover_answer["nodes"].items():
         num_nodes = len(nodes)
         log.debug(f"Loading {num_nodes} {qnode_key} nodes into TRAPI object model")
         start = time.time()
         for node_key, node_tuple in nodes.items():
             node = self._convert_kg2c_plover_node_to_trapi_node(node_tuple)
             answer_kg.add_node(node_key, node, qnode_key)
         log.debug(f"Loading {num_nodes} {qnode_key} nodes into TRAPI object model took "
                   f"{round(time.time() - start, 2)} seconds")
     # Load returned edges into TRAPI object model
     for qedge_key, edges in plover_answer["edges"].items():
         num_edges = len(edges)
         log.debug(f"Loading {num_edges} edges into TRAPI object model")
         start = time.time()
         for edge_key, edge_tuple in edges.items():
             edge = self._convert_kg2c_plover_edge_to_trapi_edge(edge_tuple)
             answer_kg.add_edge(edge_key, edge, qedge_key)
         log.debug(f"Loading {num_edges} {qedge_key} edges into TRAPI object model took "
                   f"{round(time.time() - start, 2)} seconds")
     return answer_kg
Example #22
    def assess(self, message):

        #### Define a default response
        response = ARAXResponse()
        self.response = response
        self.message = message
        response.debug(f"Assessing the QueryGraph for basic information")

        #### Get shorter handles
        query_graph = message.query_graph
        nodes = query_graph.nodes
        edges = query_graph.edges

        #### Store number of nodes and edges
        self.n_nodes = len(nodes)
        self.n_edges = len(edges)
        response.debug(f"Found {self.n_nodes} nodes and {self.n_edges} edges")

        #### Handle impossible cases
        if self.n_nodes == 0:
            response.error(
                "QueryGraph has 0 nodes. At least 1 node is required",
                error_code="QueryGraphZeroNodes")
            return response
        if self.n_nodes == 1 and self.n_edges > 0:
            response.error(
                "QueryGraph may not have edges if there is only one node",
                error_code="QueryGraphTooManyEdges")
            return response
        #if self.n_nodes == 2 and self.n_edges > 1:
        #    response.error("QueryGraph may not have more than 1 edge if there are only 2 nodes", error_code="QueryGraphTooManyEdges")
        #    return response

        #### Loop through nodes computing some stats
        node_info = {}
        self.node_category_map = {}
        for key, qnode in nodes.items():
            node_info[key] = {
                'key': key,
                'node_object': qnode,
                'has_id': False,
                'category': qnode.category,
                'has_category': False,
                'is_set': False,
                'n_edges': 0,
                'n_links': 0,
                'is_connected': False,
                'edges': [],
                'edge_dict': {}
            }
            if qnode.id is not None:
                node_info[key]['has_id'] = True

                #### If the user did not specify a category, but there is a curie, try to figure out the category
                if node_info[key]['category'] is None:
                    synonymizer = NodeSynonymizer()
                    curie = qnode.id
                    curies_list = qnode.id
                    if isinstance(qnode.id, list):
                        curie = qnode.id[0]
                    else:
                        curies_list = [qnode.id]

                    canonical_curies = synonymizer.get_canonical_curies(
                        curies=curies_list, return_all_categories=True)
                    if curie in canonical_curies and 'preferred_type' in canonical_curies[curie]:
                        node_info[key]['has_category'] = True
                        node_info[key]['category'] = canonical_curies[curie]['preferred_type']

            if qnode.category is not None:
                node_info[key]['has_category'] = True

            #if qnode.is_set is not None: node_info[key]['is_set'] = True
            if key is None:
                response.error(
                    "QueryGraph has a node with null key. This is not permitted",
                    error_code="QueryGraphNodeWithNoId")
                return response

            #### Remap the node categories from unsupported to supported
            if qnode.category is not None:
                qnode.category = self.remap_node_category(qnode.category)

            #### Store lookup of categories
            warning_counter = 0
            if qnode.category is None or (isinstance(qnode.category, list)
                                          and len(qnode.category) == 0):
                if warning_counter == 0:
                    #response.debug("QueryGraph has nodes with no category. This may cause problems with results inference later")
                    pass
                warning_counter += 1
                self.node_category_map['unknown'] = key
            else:
                category = qnode.category
                if isinstance(qnode.category, list):
                    category = qnode.category[0]  # FIXME this is a hack prior to proper list handling
                self.node_category_map[category] = key

        #### Loop through edges computing some stats
        edge_info = {}
        self.edge_predicate_map = {}
        unique_links = {}

        #### Ignore special informational edges for now.
        virtual_edge_predicates = {
            'has_normalized_google_distance_with': 1,
            'has_fisher_exact_test_p-value_with': 1,
            'has_jaccard_index_with': 1,
            'probably_treats': 1,
            'has_paired_concept_frequency_with': 1,
            'has_observed_expected_ratio_with': 1,
            'has_chi_square_with': 1
        }

        for key, qedge in edges.items():

            predicate = qedge.predicate
            if isinstance(predicate, list):
                if len(predicate) == 0:
                    predicate = None
                else:
                    predicate = predicate[0]  # FIXME Hack before dealing with predicates as lists!

            if predicate is not None and predicate in virtual_edge_predicates:
                continue

            edge_info[key] = {
                'key': key,
                'has_predicate': False,
                'subject': qedge.subject,
                'object': qedge.object,
                'predicate': None
            }
            if predicate is not None:
                edge_info[key]['has_predicate'] = True
                edge_info[key]['predicate'] = predicate

            if key is None:
                response.error(
                    "QueryGraph has a edge with null key. This is not permitted",
                    error_code="QueryGraphEdgeWithNoKey")
                return response

            #### Create a unique node link string
            link_string = ','.join(sorted([qedge.subject, qedge.object]))
            if link_string not in unique_links:
                node_info[qedge.subject]['n_links'] += 1
                node_info[qedge.object]['n_links'] += 1
                unique_links[link_string] = 1
                #print(link_string)

            node_info[qedge.subject]['n_edges'] += 1
            node_info[qedge.object]['n_edges'] += 1
            node_info[qedge.subject]['is_connected'] = True
            node_info[qedge.object]['is_connected'] = True
            #node_info[qedge.subject]['edges'].append(edge_info[key])
            #node_info[qedge.object]['edges'].append(edge_info[key])
            node_info[qedge.subject]['edges'].append(edge_info[key])
            node_info[qedge.object]['edges'].append(edge_info[key])
            node_info[qedge.subject]['edge_dict'][key] = edge_info[key]
            node_info[qedge.object]['edge_dict'][key] = edge_info[key]

            #### Store lookup of predicates
            warning_counter = 0
            edge_predicate = 'any'
            if predicate is None:
                if warning_counter == 0:
                    response.debug(
                        "QueryGraph has edges with no predicate. This may cause problems with results inference later"
                    )
                warning_counter += 1
            else:
                edge_predicate = predicate

            #### It's not clear yet whether we need to store the whole sentence or just the predicate
            #predicate_encoding = f"{node_info[qedge.subject]['predicate']}---{edge_predicate}---{node_info[qedge.object]['predicate']}"
            predicate_encoding = edge_predicate
            self.edge_predicate_map[predicate_encoding] = key

        #### Loop through the nodes again, trying to identify the start_node and the end_node
        singletons = []
        for node_id, node_data in node_info.items():
            if node_data['n_links'] < 2:
                singletons.append(node_data)
            elif node_data['n_links'] > 2:
                self.is_bifurcated_graph = True
                response.warning(
                    "QueryGraph appears to have a fork in it. This might cause trouble"
                )

        #### If this doesn't produce any singletons, then try curie based selection
        if len(singletons) == 0:
            for node_id, node_data in node_info.items():
                if node_data['has_id']:
                    singletons.append(node_data)

        #### If this doesn't produce any singletons, then we don't know how to continue
        if len(singletons) == 0:
            response.error("Unable to understand the query graph",
                           error_code="QueryGraphCircular")
            return response

        #### Try to identify the start_node and the end_node
        start_node = singletons[0]
        if len(nodes) == 1:
            # Just a single node, fine
            pass
        elif len(singletons) < 2:
            response.warning(
                "QueryGraph appears to be circular or has a strange geometry. This might cause trouble"
            )
        elif len(singletons) > 2:
            response.warning(
                "QueryGraph appears to have a fork in it. This might cause trouble"
            )
        else:
            if singletons[0]['has_id'] is True and singletons[1]['has_id'] is False:
                start_node = singletons[0]
            elif singletons[0]['has_id'] is False and singletons[1]['has_id'] is True:
                start_node = singletons[1]
            else:
                start_node = singletons[0]
        #### Hmm, that's not very robust against odd graphs. This needs work. FIXME

        self.node_info = node_info
        self.edge_info = edge_info
        self.start_node = start_node

        current_node = start_node
        node_order = [start_node]
        edge_order = []
        edges = current_node['edges']
        debug = False

        while 1:
            if debug:
                tmp = {
                    'astate': '1',
                    'current_node': current_node,
                    'node_order': node_order,
                    'edge_order': edge_order,
                    'edges': edges
                }
                print(
                    json.dumps(ast.literal_eval(repr(tmp)),
                               sort_keys=True,
                               indent=2))
                print(
                    '=================================================================================='
                )
                tmp = input()

            if len(edges) == 0:
                break
            #if len(edges) > 1:
            if current_node['n_links'] > 1:
                response.error(
                    f"Help, two edges at A583. Don't know what to do: {current_node['n_links']}",
                    error_code="InteralErrorA583")
                return response
            edge_order.append(edges[0])
            previous_node = current_node
            if edges[0]['subject'] == current_node['key']:
                current_node = node_info[edges[0]['object']]
            elif edges[0]['object'] == current_node['key']:
                current_node = node_info[edges[0]['subject']]
            else:
                response.error("Help, edge error A584. Don't know what to do",
                               error_code="InteralErrorA584")
                return response
            node_order.append(current_node)

            #tmp = { 'astate': '2', 'current_node': current_node, 'node_order': node_order, 'edge_order': edge_order, 'edges': edges }
            #print(json.dumps(ast.literal_eval(repr(tmp)),sort_keys=True,indent=2))
            #print('==================================================================================')
            #tmp = input()

            edges = current_node['edges']
            new_edges = []
            for edge in edges:
                key = edge['key']
                if key not in previous_node['edge_dict']:
                    new_edges.append(edge)
            edges = new_edges
            if len(edges) == 0:
                break
            #tmp = { 'astate': '3', 'current_node': current_node, 'node_order': node_order, 'edge_order': edge_order, 'edges': edges }
            #print(json.dumps(ast.literal_eval(repr(tmp)),sort_keys=True,indent=2))
            #print('==================================================================================')
            #tmp = input()

        self.node_order = node_order
        self.edge_order = edge_order

        # Create a text rendering of the QueryGraph geometry for matching against a template
        self.query_graph_templates = {
            'simple': '',
            'detailed': {
                'n_nodes': len(node_order),
                'components': []
            }
        }
        node_index = 0
        edge_index = 0
        #print(json.dumps(ast.literal_eval(repr(node_order)),sort_keys=True,indent=2))
        for node in node_order:
            component_id = f"n{node_index:02}"
            content = ''
            component = {
                'component_type': 'node',
                'component_id': component_id,
                'has_id': node['has_id'],
                'has_category': node['has_category'],
                'category_value': None
            }
            self.query_graph_templates['detailed']['components'].append(
                component)
            if node['has_id']:
                content = 'id'
            elif node['has_category'] and node[
                    'node_object'].category is not None:
                content = f"category={node['node_object'].category}"
                component['category_value'] = node['node_object'].category
            elif node['has_category']:
                content = 'category'
            template_part = f"{component_id}({content})"
            self.query_graph_templates['simple'] += template_part

            # Since queries with intermediate nodes that are not is_set=true tend to blow up, for now, make them is_set=true unless explicitly set to false
            if node_index > 0 and node_index < (self.n_nodes - 1):
                if 'is_set' not in node or node['is_set'] is None:
                    node['node_object'].is_set = True
                    response.warning(
                        f"Setting unspecified is_set to true for {node['key']} because this will probably lead to a happier result"
                    )
                elif node['is_set'] is True:
                    response.debug(
                        f"Value for is_set is already true for {node['key']} so that's good"
                    )
                elif node['is_set'] is False:
                    #response.info(f"Value for is_set is set to false for intermediate node {node['key']}. This could lead to weird results. Consider setting it to true")
                    response.info(
                        f"Value for is_set is false for intermediate node {node['key']}. Setting to true because this will probably lead to a happier result"
                    )
                    node['node_object'].is_set = True
                #else:
                #    response.error(f"Unrecognized value is_set='{node['is_set']}' for {node['key']}. This should be true or false")

            node_index += 1
            if node_index < self.n_nodes:
                #print(json.dumps(ast.literal_eval(repr(node)),sort_keys=True,indent=2))

                #### Extract the has_predicate and predicate_value from the edges of the node
                #### This could fail if there are two edges coming out of the node FIXME
                has_predicate = False
                predicate_value = None
                if 'edges' in node:
                    for related_edge in node['edges']:
                        if related_edge['subject'] == node['key']:
                            has_predicate = related_edge['has_predicate']
                            if has_predicate is True and 'predicate' in related_edge:
                                predicate_value = related_edge['predicate']

                component_id = f"e{edge_index:02}"
                template_part = f"-{component_id}()-"
                self.query_graph_templates['simple'] += template_part
                component = {
                    'component_type': 'edge',
                    'component_id': component_id,
                    'has_id': False,
                    'has_predicate': has_predicate,
                    'predicate_value': predicate_value
                }
                self.query_graph_templates['detailed']['components'].append(
                    component)
                edge_index += 1

        response.debug(
            f"The QueryGraph reference template is: {self.query_graph_templates['simple']}"
        )

        #tmp = { 'node_info': node_info, 'edge_info': edge_info, 'start_node': start_node, 'n_nodes': self.n_nodes, 'n_edges': self.n_edges,
        #    'is_bifurcated_graph': self.is_bifurcated_graph, 'node_order': node_order, 'edge_order': edge_order }
        #print(json.dumps(ast.literal_eval(repr(tmp)),sort_keys=True,indent=2))
        #sys.exit(0)

        #### Return the response
        return response
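
The following is a minimal, standalone sketch of how the 'simple' template string built above ends up looking; the plain dicts are illustrative stand-ins for the real node_info entries, not the actual objects:

# Illustrative sketch (not part of the original class): assemble the 'simple'
# QueryGraph template string for a hypothetical two-hop linear query graph.
node_order = [
    {'has_id': True,  'has_category': False, 'category_value': None},
    {'has_id': False, 'has_category': True,  'category_value': 'biolink:Protein'},
    {'has_id': False, 'has_category': True,  'category_value': None},
]

simple_template = ''
for node_index, node in enumerate(node_order):
    component_id = f"n{node_index:02}"
    if node['has_id']:
        content = 'id'
    elif node['has_category'] and node['category_value'] is not None:
        content = f"category={node['category_value']}"
    elif node['has_category']:
        content = 'category'
    else:
        content = ''
    simple_template += f"{component_id}({content})"
    if node_index < len(node_order) - 1:
        simple_template += f"-e{node_index:02}()-"

print(simple_template)
# n00(id)-e00()-n01(category=biolink:Protein)-e01()-n02(category)
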
Example #23
0
    def _grab_nodes_and_edges_from_sqlite(
            self, plover_answer: Dict[str, Dict[str, Set[Union[str, int]]]],
            kg_name: str, log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
        # Get connected to the local sqlite database (look up its path using database manager-friendly method)
        path_list = os.path.realpath(__file__).split(os.path.sep)
        rtx_index = path_list.index("RTX")
        rtxc = RTXConfiguration()
        sqlite_dir_path = os.path.sep.join([
            *path_list[:(rtx_index + 1)], 'code', 'ARAX', 'KnowledgeSources',
            'KG2c'
        ])
        sqlite_name = rtxc.kg2c_sqlite_path.split('/')[-1]
        sqlite_file_path = f"{sqlite_dir_path}{os.path.sep}{sqlite_name}"
        connection = sqlite3.connect(sqlite_file_path)
        cursor = connection.cursor()
        answer_kg = QGOrganizedKnowledgeGraph()

        # Grab the node objects from sqlite corresponding to the returned node IDs
        num_nodes = sum(
            [len(nodes) for nodes in plover_answer["nodes"].values()])
        start = time.time()
        for qnode_key, node_keys in plover_answer["nodes"].items():
            node_keys_str = "','".join(
                node_keys
            )  # SQL wants ('node1', 'node2') format for string lists
            sql_query = f"SELECT N.node " \
                        f"FROM nodes AS N " \
                        f"WHERE N.id IN ('{node_keys_str}')"
            log.debug(
                f"Looking up {len(plover_answer['nodes'][qnode_key])} returned {qnode_key} node IDs in KG2c sqlite"
            )
            cursor.execute(sql_query)
            rows = cursor.fetchall()
            for row in rows:
                node_as_dict = ujson.loads(row[0])
                node_key, node = self._convert_neo4j_node_to_trapi_node(
                    node_as_dict, kg_name)
                answer_kg.add_node(node_key, node, qnode_key)
        log.debug(
            f"Grabbing {num_nodes} nodes from sqlite and loading into object model took "
            f"{round(time.time() - start, 2)} seconds")

        # Grab the edge objects from sqlite corresponding to the returned edge IDs
        num_edges = sum(
            [len(edges) for edges in plover_answer["edges"].values()])
        start = time.time()
        for qedge_key, edge_keys in plover_answer["edges"].items():
            edge_keys_str = ",".join(
                str(edge_key)
                for edge_key in edge_keys)  # SQL wants (1, 2) format int lists
            sql_query = f"SELECT E.edge " \
                        f"FROM edges AS E " \
                        f"WHERE E.id IN ({edge_keys_str})"
            log.debug(
                f"Looking up {len(plover_answer['edges'][qedge_key])} returned {qedge_key} edge IDs in KG2c sqlite"
            )
            cursor.execute(sql_query)
            rows = cursor.fetchall()
            for row in rows:
                edge_as_dict = ujson.loads(row[0])
                edge_key, edge = self._convert_neo4j_edge_to_trapi_edge(
                    edge_as_dict, dict(), kg_name)
                answer_kg.add_edge(edge_key, edge, qedge_key)
        log.debug(
            f"Grabbing {num_edges} edges from sqlite and loading into object model took "
            f"{round(time.time() - start, 2)} seconds")

        cursor.close()
        connection.close()
        return answer_kg
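
As a minimal illustration of the batched sqlite lookup pattern used above, here is a standalone sketch against an in-memory database; the nodes(id, node) schema is an assumption chosen to mirror the query in the method, and json is used in place of ujson:

# Illustrative sketch (not part of the original class): batch-fetch node JSON
# blobs by id using the same IN ('a','b',...) construction as above.
import json
import sqlite3

connection = sqlite3.connect(":memory:")
cursor = connection.cursor()
cursor.execute("CREATE TABLE nodes (id TEXT PRIMARY KEY, node TEXT)")
cursor.executemany("INSERT INTO nodes VALUES (?, ?)",
                   [("CHEMBL.COMPOUND:CHEMBL112", json.dumps({"name": "acetaminophen"})),
                    ("MONDO:0007254", json.dumps({"name": "breast cancer"}))])

node_keys = {"CHEMBL.COMPOUND:CHEMBL112", "MONDO:0007254"}
node_keys_str = "','".join(node_keys)  # SQL wants ('node1','node2') format for string lists
cursor.execute(f"SELECT N.node FROM nodes AS N WHERE N.id IN ('{node_keys_str}')")
for row in cursor.fetchall():
    print(json.loads(row[0]))

cursor.close()
connection.close()
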
Example #24
0
    def decorate_edges(self,
                       response: ARAXResponse,
                       kind: Optional[str] = "RTX-KG2") -> ARAXResponse:
        """
        Decorates edges with publication sentences and any other available EPC info.
        kind: The kind of edges to decorate, either: "NGD" or "RTX-KG2". For NGD edges, publications info attributes
        are added. For RTX-KG2 edges, attributes for all EPC properties are added.
        """
        kg = response.envelope.message.knowledge_graph
        response.debug(f"Decorating edges with EPC info from KG2c")
        supported_kinds = {"RTX-KG2", "NGD"}
        if kind not in supported_kinds:
            response.error(
                f"Supported values for ARAXDecorator.decorate_edges()'s 'kind' parameter are: "
                f"{supported_kinds}")
            return response

        # Figure out which edges we need to decorate
        if kind == "RTX-KG2":
            edge_keys_to_decorate = {
                edge_id
                for edge_id, edge in kg.edges.items()
                if edge.attributes and any(
                    attribute.value == self.kg2_infores_curie and attribute.
                    attribute_type_id == "biolink:aggregator_knowledge_source"
                    for attribute in edge.attributes)
            }
        else:
            edge_keys_to_decorate = {
                edge_id
                for edge_id, edge in kg.edges.items() if edge.predicate ==
                "biolink:has_normalized_google_distance_with"
            }
        if not edge_keys_to_decorate:
            response.debug(f"Could not identify any {kind} edges to decorate")
        else:
            response.debug(
                f"Identified {len(edge_keys_to_decorate)} edges to decorate")

        # Determine the search keys for these edges that we need to look up in sqlite
        search_key_to_edge_keys_map = defaultdict(set)
        if kind == "NGD":  # For now only NGD/overlay will use this mode
            for edge_key in edge_keys_to_decorate:
                edge = kg.edges[edge_key]
                search_key = f"{edge.subject}--{edge.object}"
                search_key_to_edge_keys_map[search_key].add(edge_key)
            search_key_column = "node_pair"
        else:  # This is the mode used for decorating KG2 edges (or other KPs' edges)
            for edge_key in edge_keys_to_decorate:
                edge = kg.edges[edge_key]
                search_key = f"{edge.subject}--{edge.predicate}--{edge.object}"
                search_key_to_edge_keys_map[search_key].add(edge_key)
            search_key_column = "triple"

        # Extract the proper entries from sqlite
        connection, cursor = self._connect_to_kg2c_sqlite()
        response.debug(f"Looking up EPC edge info in KG2c sqlite")
        edge_attributes_ordered = list(self.edge_attributes)
        edge_cols_str = ", ".join([
            f"E.{property_name}" for property_name in edge_attributes_ordered
        ])
        search_keys_set = set(
            search_key.replace("'", "''") for search_key in set(
                search_key_to_edge_keys_map))  # Escape quotes
        search_keys_str = "','".join(
            search_keys_set
        )  # SQL wants ('node1', 'node2') format for string lists
        sql_query = f"SELECT E.{search_key_column}, {edge_cols_str} " \
                    f"FROM edges AS E " \
                    f"WHERE E.{search_key_column} IN ('{search_keys_str}')"
        cursor.execute(sql_query)
        rows = cursor.fetchall()
        cursor.close()
        connection.close()
        response.debug(f"Got {len(rows)} rows back from KG2c sqlite")

        response.debug(f"Adding attributes to edges in the KG")
        # Create a helper lookup map for easy access to returned rows
        search_key_to_kg2c_edge_tuples_map = defaultdict(list)
        for row in rows:
            search_key = row[0]
            search_key_to_kg2c_edge_tuples_map[search_key].append(row)

        attribute_type_id_map = {
            property_name: self.create_attribute(property_name,
                                                 "something").attribute_type_id
            for property_name in set(self.edge_attributes).difference(
                {"knowledge_source"})
        }
        for search_key, kg2c_edge_tuples in search_key_to_kg2c_edge_tuples_map.items(
        ):
            # Join the property values found for all edges matching the given search key
            merged_kg2c_properties = {
                property_name: None
                for property_name in edge_attributes_ordered
            }
            for kg2c_edge_tuple in kg2c_edge_tuples:
                for index, property_name in enumerate(edge_attributes_ordered):
                    raw_value = kg2c_edge_tuple[index + 1]
                    if raw_value:  # Skip empty attributes
                        value = self._load_property(property_name, raw_value)
                        if not merged_kg2c_properties.get(property_name):
                            merged_kg2c_properties[property_name] = set(
                            ) if isinstance(value, list) else dict()
                        if isinstance(value, list):
                            merged_kg2c_properties[property_name].update(
                                set(value))
                        else:
                            merged_kg2c_properties[property_name].update(value)
            joined_knowledge_sources = list(
                merged_kg2c_properties["knowledge_source"]
            ) if merged_kg2c_properties.get("knowledge_source") else set()
            knowledge_source = joined_knowledge_sources[0] if len(
                joined_knowledge_sources) == 1 else None
            joined_kg2_ids = list(
                merged_kg2c_properties["kg2_ids"]
            ) if merged_kg2c_properties.get("kg2_ids") else set()
            joined_publications = list(
                merged_kg2c_properties["publications"]
            ) if merged_kg2c_properties.get("publications") else set()
            joined_publications_info = merged_kg2c_properties[
                "publications_info"] if merged_kg2c_properties.get(
                    "publications_info") else dict()

            # Add the joined attributes to each of the edges with the given search key (as needed)
            corresponding_bare_edge_keys = search_key_to_edge_keys_map[
                search_key]
            for edge_key in corresponding_bare_edge_keys:
                bare_edge = kg.edges[edge_key]
                existing_attribute_type_ids = {
                    attribute.attribute_type_id
                    for attribute in bare_edge.attributes
                } if bare_edge.attributes else set()
                new_attributes = []
                # Create KG2 edge-specific attributes
                if kind == "RTX-KG2":
                    if attribute_type_id_map[
                            "kg2_ids"] not in existing_attribute_type_ids:
                        new_attributes.append(
                            self.create_attribute("kg2_ids",
                                                  list(joined_kg2_ids)))
                    if joined_publications and attribute_type_id_map[
                            "publications"] not in existing_attribute_type_ids:
                        new_attributes.append(
                            self.create_attribute(
                                "publications",
                                list(joined_publications),
                                attribute_source=knowledge_source))
                # Create attributes that belong on both KG2 and NGD edges
                if joined_publications_info and attribute_type_id_map[
                        "publications_info"] not in existing_attribute_type_ids:
                    new_attributes.append(
                        self.create_attribute(
                            "publications_info",
                            joined_publications_info,
                            attribute_source=knowledge_source))
                # Actually tack the new attributes onto the edge
                if new_attributes:
                    if not bare_edge.attributes:
                        bare_edge.attributes = new_attributes
                    else:
                        bare_edge.attributes += new_attributes

        return response
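
To make the grouping step above concrete, here is a small standalone sketch of how edges are keyed by their subject--predicate--object triple so that one sqlite row can decorate every KG edge sharing that triple; the Edge namedtuple is a stand-in for the real TRAPI Edge object:

# Illustrative sketch (not part of the original class): build the
# search_key -> edge_keys map used to join sqlite rows back onto KG edges.
from collections import defaultdict, namedtuple

Edge = namedtuple("Edge", ["subject", "predicate", "object"])
kg_edges = {
    "e1": Edge("CHEMBL.COMPOUND:CHEMBL112", "biolink:treats", "MONDO:0002691"),
    "e2": Edge("CHEMBL.COMPOUND:CHEMBL112", "biolink:treats", "MONDO:0002691"),
    "e3": Edge("CHEMBL.COMPOUND:CHEMBL112", "biolink:related_to", "MONDO:0002691"),
}

search_key_to_edge_keys_map = defaultdict(set)
for edge_key, edge in kg_edges.items():
    search_key = f"{edge.subject}--{edge.predicate}--{edge.object}"
    search_key_to_edge_keys_map[search_key].add(edge_key)

for search_key, edge_keys in search_key_to_edge_keys_map.items():
    print(search_key, "->", sorted(edge_keys))
# CHEMBL.COMPOUND:CHEMBL112--biolink:treats--MONDO:0002691 -> ['e1', 'e2']
# CHEMBL.COMPOUND:CHEMBL112--biolink:related_to--MONDO:0002691 -> ['e3']
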
Example #25
0
    def parse(self, input_actions):

        #### Define a default response
        response = ARAXResponse()
        response.info(f"Parsing input actions list")

        #### Basic error checking of the input_actions
        if not isinstance(input_actions, list):
            response.error("Provided input actions is not a list",
                           error_code="ActionsNotList")
            return response
        if len(input_actions) == 0:
            response.error("Provided input actions is an empty list",
                           error_code="ActionsListEmpty")
            return response

        #### Iterate through the list, checking the items
        actions = []
        n_lines = 1
        for action in input_actions:
            response.debug(f"Parsing action: {action}")

            # If this line is empty, then skip
            match = re.match(r"\s*$", action)
            if match:
                continue

            # If this line begins with a #, it is a comment, then skip
            match = re.match(r"#", action)
            if match:
                continue

            #### First look for a naked command without parentheses
            match = re.match(r"\s*([A-Za-z_]+)\s*$", action)
            if match is not None:
                action = {
                    "line": n_lines,
                    "command": match.group(1),
                    "parameters": None
                }
                actions.append(action)

            #### Then look for and parse a command with parentheses and a comma-separated parameter list
            if match is None:
                match = re.match(r"\s*([A-Za-z_]+)\((.*)\)\s*$", action)
                if match is not None:
                    command = match.group(1)
                    param_string = match.group(2)

                    #### Split the parameters on comma and process those
                    param_string_list = re.split(",", param_string)
                    parameters = {}

                    #### If a value is of the form key=[value1,value2] special code is needed to recompose that
                    mode = 'normal'
                    list_buffer = []
                    key = ''
                    for param_item in param_string_list:
                        param_item = param_item.strip()
                        if mode == 'normal':

                            #### Split on the first = only (might be = in the value)
                            values = re.split("=", param_item, 1)
                            key = values[0]
                            #### If there isn't a value after an =, then just set to string true
                            value = 'true'
                            if len(values) > 1:
                                value = values[1]
                            key = key.strip()
                            value = value.strip()

                            #### If the value begins with a "[", then this is a list
                            match = re.match(r"\[(.+)$", value)
                            if match:
                                #### If it also ends with a "]", then this is a list of one element
                                match2 = re.match(r"\[(.*)\]$", value)
                                if match2:
                                    if match2.group(1) == '':
                                        parameters[key] = []
                                    else:
                                        parameters[key] = [match2.group(1)]
                                else:
                                    mode = 'in_list'
                                    list_buffer = [match.group(1)]
                            else:
                                parameters[key] = value

                        #### Special processing if we're in the middle of a list
                        elif mode == 'in_list':
                            match = re.match(r"(.*)\]$", param_item)
                            if match:
                                mode = 'normal'
                                list_buffer.append(match.group(1))
                                parameters[key] = list_buffer
                            else:
                                list_buffer.append(param_item)
                        else:
                            eprint("Inconceivable!")
                    if mode == 'in_list':
                        parameters[key] = list_buffer

                    #### Store the parsed result in a dict and add to the list
                    action = {
                        "line": n_lines,
                        "command": command,
                        "parameters": parameters
                    }
                    actions.append(action)
                else:
                    response.error(f"Unable to parse action {action}",
                                   error_code="ActionsListEmpty")
            n_lines += 1

        #### Put the actions in the response data envelope and return
        response.data["actions"] = actions
        return response
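
For reference, a minimal standalone sketch of the same regex-based parsing applied to a single hypothetical ARAXi action line (the action text is made up, and this simplified version skips the multi-element 'in_list' handling shown above):

# Illustrative sketch (not part of the original class): parse one action line
# into a command and a parameters dict using the regexes from the method above.
import re

action = "add_qnode(ids=[CHEMBL.COMPOUND:CHEMBL112], key=n00)"
match = re.match(r"\s*([A-Za-z_]+)\((.*)\)\s*$", action)
command = match.group(1)
parameters = {}
for param_item in re.split(",", match.group(2)):
    key, _, value = param_item.strip().partition("=")
    value = value.strip() or 'true'  # bare keys default to the string 'true'
    list_match = re.match(r"\[(.*)\]$", value)
    parameters[key.strip()] = [list_match.group(1)] if list_match else value

print(command, parameters)
# add_qnode {'ids': ['CHEMBL.COMPOUND:CHEMBL112'], 'key': 'n00'}
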
Example #26
0
    def _convert_one_hop_query_graph_to_cypher_query(
            self, qg: QueryGraph, enforce_directionality: bool, kg_name: str,
            log: ARAXResponse) -> str:
        qedge_key = next(qedge_key for qedge_key in qg.edges)
        qedge = qg.edges[qedge_key]
        log.debug(f"Generating cypher for edge {qedge_key} query graph")
        try:
            # Build the match clause
            subject_qnode_key = qedge.subject
            object_qnode_key = qedge.object
            qedge_cypher = self._get_cypher_for_query_edge(
                qedge_key, qg, enforce_directionality)
            source_qnode_cypher = self._get_cypher_for_query_node(
                subject_qnode_key, qg, kg_name)
            target_qnode_cypher = self._get_cypher_for_query_node(
                object_qnode_key, qg, kg_name)
            match_clause = f"MATCH {source_qnode_cypher}{qedge_cypher}{target_qnode_cypher}"

            # Build the where clause
            where_fragments = []
            for qnode_key in [subject_qnode_key, object_qnode_key]:
                qnode = qg.nodes[qnode_key]
                if qnode.id and isinstance(qnode.id,
                                           list) and len(qnode.id) > 1:
                    where_fragments.append(f"{qnode_key}.id in {qnode.id}")
                if qnode.category:
                    # Only inspect the 'all_categories' field if we're using KG2c
                    if kg_name == "KG2c":
                        category_fragments = [
                            f"'{category}' in {qnode_key}.types"
                            for category in qnode.category
                        ]
                        joined_category_fragments = " OR ".join(
                            category_fragments)
                        category_where_clause = joined_category_fragments if len(
                            category_fragments
                        ) < 2 else f"({joined_category_fragments})"
                        where_fragments.append(category_where_clause)
                    # Otherwise add a simple where condition if we have multiple categories
                    elif len(qnode.category) > 1:
                        if kg_name == "KG2":
                            node_category_property = "category_label"
                        else:
                            node_category_property = "category"
                        where_fragments.append(
                            f"{qnode_key}.{node_category_property} in {qnode.category}"
                        )

            if where_fragments:
                where_clause = f"WHERE {' AND '.join(where_fragments)}"
            else:
                where_clause = ""

            # Build the with clause
            source_qnode_col_name = f"nodes_{subject_qnode_key}"
            target_qnode_col_name = f"nodes_{object_qnode_key}"
            qedge_col_name = f"edges_{qedge_key}"
            # This line grabs the edge's ID and a record of which of its nodes correspond to which qnode ID
            extra_edge_properties = "{.*, " + f"id:ID({qedge_key}), {subject_qnode_key}:{subject_qnode_key}.id, {object_qnode_key}:{object_qnode_key}.id" + "}"
            with_clause = f"WITH collect(distinct {subject_qnode_key}) as {source_qnode_col_name}, " \
                          f"collect(distinct {object_qnode_key}) as {target_qnode_col_name}, " \
                          f"collect(distinct {qedge_key}{extra_edge_properties}) as {qedge_col_name}"

            # Build the return clause
            return_clause = f"RETURN {source_qnode_col_name}, {target_qnode_col_name}, {qedge_col_name}"

            cypher_query = f"{match_clause} {where_clause} {with_clause} {return_clause}"
            return cypher_query
        except Exception:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            log.error(f"Problem generating cypher for query. {tb}",
                      error_code=error_type.__name__)
            return ""
Example #27
0
    def _answer_query_using_CHP_client(
            self, query_graph: QueryGraph,
            log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
        qedge_key = next(qedge_key for qedge_key in query_graph.edges)
        log.debug(
            f"Processing query results for edge {qedge_key} by using CHP client"
        )
        final_kg = QGOrganizedKnowledgeGraph()
        gene_label_list = ['gene']
        drug_label_list = ['drug', 'chemicalsubstance']
        # use for checking the requirement
        source_pass_nodes = None
        source_category = None
        target_pass_nodes = None
        target_category = None

        qedge = query_graph.edges[qedge_key]
        source_qnode_key = qedge.subject
        target_qnode_key = qedge.object
        source_qnode = query_graph.nodes[source_qnode_key]
        target_qnode = query_graph.nodes[target_qnode_key]

        # check if both ends of edge have no curie
        if (source_qnode.id is None) and (target_qnode.id is None):
            log.error(f"Both ends of edge {qedge_key} are None",
                      error_code="BadEdge")
            return final_kg

        # check if the query nodes are drug or disease
        if source_qnode.id is not None:

            if type(source_qnode.id) is str:
                source_pass_nodes = [source_qnode.id]
            else:
                source_pass_nodes = source_qnode.id
            has_error, pass_nodes, not_pass_nodes = self._check_id(
                source_qnode.id, log)
            if has_error:
                return final_kg
            else:
                if len(not_pass_nodes) == 0 and len(pass_nodes) != 0:
                    source_pass_nodes = pass_nodes
                elif len(not_pass_nodes) != 0 and len(pass_nodes) != 0:
                    source_pass_nodes = pass_nodes
                    if len(not_pass_nodes) == 1:
                        log.warning(
                            f"The curie id of {not_pass_nodes[0]} is not allowable according to the CHP client"
                        )
                    else:
                        log.warning(
                            f"The curie ids of nodes {not_pass_nodes} are not allowable according to the CHP client"
                        )
                else:
                    if type(source_qnode.id) is str:
                        log.error(
                            f"The curie id of {source_qnode.id} is not allowable according to the CHP client",
                            error_code="NotAllowable")
                        return final_kg
                    else:
                        log.error(
                            f"The curie ids of {source_qnode.id} are not allowable according to the CHP client",
                            error_code="NotAllowable")
                        return final_kg
        else:
            category = source_qnode.category[0].replace(
                'biolink:', '').replace('_', '').lower()
            source_category = category
            if (category in drug_label_list) or (category in gene_label_list):
                source_category = category
            else:
                log.error(
                    f"The category of query node {source_qnode_key} is not supported by the CHP client; it must be drug/chemical_substance or gene",
                    error_code="CategoryError")
                return final_kg

        if target_qnode.id is not None:

            if type(target_qnode.id) is str:
                target_pass_nodes = [target_qnode.id]
            else:
                target_pass_nodes = target_qnode.id
            has_error, pass_nodes, not_pass_nodes = self._check_id(
                target_qnode.id, log)
            if has_error:
                return final_kg
            else:
                if len(not_pass_nodes) == 0 and len(pass_nodes) != 0:
                    target_pass_nodes = pass_nodes
                elif len(not_pass_nodes) != 0 and len(pass_nodes) != 0:
                    target_pass_nodes = pass_nodes
                    if len(not_pass_nodes) == 1:
                        log.warning(
                            f"The curie id of {not_pass_nodes[0]} is not allowable according to the CHP client"
                        )
                    else:
                        log.warning(
                            f"The curie ids of nodes {not_pass_nodes} are not allowable according to the CHP client"
                        )
                else:
                    if type(target_qnode.id) is str:
                        log.error(
                            f"The curie id of {target_qnode.id} is not allowable according to the CHP client",
                            error_code="NotAllowable")
                        return final_kg
                    else:
                        log.error(
                            f"The curie ids of {target_qnode.id} are not allowable according to the CHP client",
                            error_code="NotAllowable")
                        return final_kg
        else:
            category = target_qnode.category[0].replace(
                'biolink:', '').replace('_', '').lower()
            target_category = category
            if (category in drug_label_list) or (category in gene_label_list):
                target_category = category
            else:
                log.error(
                    f"The category of query node {target_qnode_key} is not supported by the CHP client; it must be drug/chemical_substance or gene",
                    error_code="CategoryError")
                return final_kg

        if (source_pass_nodes is None) and (target_pass_nodes is None):
            return final_kg

        elif (source_pass_nodes is not None) and (target_pass_nodes
                                                  is not None):
            source_dict = dict()
            target_dict = dict()
            if source_pass_nodes[0] in self.allowable_drug_curies:
                source_category_temp = 'drug'
            else:
                source_category_temp = 'gene'
            if target_pass_nodes[0] in self.allowable_drug_curies:
                target_category_temp = 'drug'
            else:
                target_category_temp = 'gene'
            if source_category_temp == target_category_temp:
                log.error(
                    f"The query nodes at both ends of the edge are of the same type ({source_category_temp})",
                    error_code="CategoryError")
                return final_kg
            else:
                for (source_curie, target_curie) in itertools.product(
                        source_pass_nodes, target_pass_nodes):

                    if source_category_temp == 'drug':
                        source_curie_temp = source_curie.replace(
                            'CHEMBL.COMPOUND:', 'CHEMBL:')
                        # Let's build a simple single query
                        q = build_query(genes=[target_curie],
                                        therapeutic=source_curie_temp,
                                        disease='MONDO:0007254',
                                        outcome=('EFO:0000714', '>=',
                                                 self.CHP_survival_threshold))

                        response = self.client.query(q)
                        max_probability = self.client.get_outcome_prob(
                            response)
                        swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                            target_curie, source_curie, "paired_with",
                            max_probability)
                    else:
                        target_curie_temp = target_curie.replace(
                            'CHEMBL.COMPOUND:', 'CHEMBL:')
                        # Let's build a simple single query
                        q = build_query(genes=[source_curie],
                                        therapeutic=target_curie_temp,
                                        disease='MONDO:0007254',
                                        outcome=('EFO:0000714', '>=',
                                                 self.CHP_survival_threshold))

                        response = self.client.query(q)
                        max_probability = self.client.get_outcome_prob(
                            response)
                        swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                            source_curie, target_curie, "paired_with",
                            max_probability)

                    source_dict[source_curie] = source_qnode_key
                    target_dict[target_curie] = target_qnode_key

                    # Finally add the current edge to our answer knowledge graph
                    final_kg.add_edge(swagger_edge_key, swagger_edge,
                                      qedge_key)

                # Add the nodes to our answer knowledge graph
                if len(source_dict) != 0:
                    for source_curie in source_dict:
                        swagger_node_key, swagger_node = self._convert_to_swagger_node(
                            source_curie)
                        final_kg.add_node(swagger_node_key, swagger_node,
                                          source_dict[source_curie])
                if len(target_dict) != 0:
                    for target_curie in target_dict:
                        swagger_node_key, swagger_node = self._convert_to_swagger_node(
                            target_curie)
                        final_kg.add_node(swagger_node_key, swagger_node,
                                          target_dict[target_curie])

                return final_kg

        elif source_pass_nodes is not None:
            source_dict = dict()
            target_dict = dict()

            if source_pass_nodes[0] in self.allowable_drug_curies:
                source_category_temp = 'drug'
            else:
                source_category_temp = 'gene'
            if target_category in drug_label_list:
                target_category_temp = 'drug'
            else:
                target_category_temp = 'gene'
            if source_category_temp == target_category_temp:
                log.error(
                    f"The query nodes at both ends of the edge are of the same type ({source_category_temp})",
                    error_code="CategoryError")
                return final_kg
            else:
                if source_category_temp == 'drug':
                    for source_curie in source_pass_nodes:

                        genes = [
                            curie for curie in self.allowable_gene_curies
                            if self.synonymizer.get_canonical_curies(curie)
                            [curie] is not None and target_category in [
                                category.replace('biolink:', '').replace(
                                    '_', '').lower() for category in list(
                                        self.synonymizer.get_canonical_curies(
                                            curie, return_all_categories=True)
                                        [curie]['all_categories'].keys())
                            ]
                        ]
                        therapeutic = source_curie.replace(
                            'CHEMBL.COMPOUND:', 'CHEMBL:')
                        disease = 'MONDO:0007254'
                        outcome = ('EFO:0000714', '>=',
                                   self.CHP_survival_threshold)

                        queries = []
                        for gene in genes:
                            queries.append(
                                build_query(
                                    genes=[gene],
                                    therapeutic=therapeutic,
                                    disease=disease,
                                    outcome=outcome,
                                ))

                        # use the query_all endpoint to run the batch of queries
                        res = self.client.query_all(queries)

                        for result, gene in zip(res["message"], genes):
                            prob = self.client.get_outcome_prob(result)
                            swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                                gene, source_curie, "paired_with", prob)

                            source_dict[source_curie] = source_qnode_key
                            target_dict[gene] = target_qnode_key

                            # Finally add the current edge to our answer knowledge graph
                            final_kg.add_edge(swagger_edge_key, swagger_edge,
                                              qedge_key)
                else:
                    for source_curie in source_pass_nodes:

                        genes = [source_curie]
                        therapeutic = [
                            curie.replace('CHEMBL.COMPOUND:', 'CHEMBL:')
                            for curie in self.allowable_drug_curies
                            if self.synonymizer.get_canonical_curies(
                                curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:'))
                            [curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:')]
                            is not None and target_category in [
                                category.replace('biolink:', '').replace(
                                    '_', '').lower()
                                for category in list(
                                    self.synonymizer.get_canonical_curies(
                                        curie.replace('CHEMBL:',
                                                      'CHEMBL.COMPOUND:'),
                                        return_all_categories=True)[
                                            curie.replace(
                                                'CHEMBL:', 'CHEMBL.COMPOUND:')]
                                    ['all_categories'].keys())
                            ]
                        ]
                        disease = 'MONDO:0007254'
                        outcome = ('EFO:0000714', '>=',
                                   self.CHP_survival_threshold)

                        queries = []
                        for drug in therapeutic:
                            queries.append(
                                build_query(
                                    genes=genes,
                                    therapeutic=drug,
                                    disease=disease,
                                    outcome=outcome,
                                ))

                        # use the query_all endpoint to run the batch of queries
                        res = self.client.query_all(queries)

                        for result, drug in zip(res["message"], therapeutic):
                            drug = drug.replace('CHEMBL:', 'CHEMBL.COMPOUND:')
                            prob = self.client.get_outcome_prob(result)
                            swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                                source_curie, drug, "paired_with", prob)

                            source_dict[source_curie] = source_qnode_key
                            target_dict[drug] = target_qnode_key

                            # Finally add the current edge to our answer knowledge graph
                            final_kg.add_edge(swagger_edge_key, swagger_edge,
                                              qedge_key)

                # Add the nodes to our answer knowledge graph
                if len(source_dict) != 0:
                    for source_curie in source_dict:
                        swagger_node_key, swagger_node = self._convert_to_swagger_node(
                            source_curie)
                        final_kg.add_node(swagger_node_key, swagger_node,
                                          source_dict[source_curie])
                if len(target_dict) != 0:
                    for target_curie in target_dict:
                        swagger_node_key, swagger_node = self._convert_to_swagger_node(
                            target_curie)
                        final_kg.add_node(swagger_node_key, swagger_node,
                                          target_dict[target_curie])

                return final_kg
        else:
            source_dict = dict()
            target_dict = dict()

            if target_pass_nodes[0] in self.allowable_drug_curies:
                target_category_temp = 'drug'
            else:
                target_category_temp = 'gene'
            if source_category in drug_label_list:
                source_category_temp = 'drug'
            else:
                source_category_temp = 'gene'
            if source_category_temp == target_category_temp:
                log.error(
                    f"The query nodes at both ends of the edge are of the same type ({source_category_temp})",
                    error_code="CategoryError")
                return final_kg
            else:
                if target_category_temp == 'drug':
                    for target_curie in target_pass_nodes:

                        genes = [
                            curie for curie in self.allowable_gene_curies
                            if self.synonymizer.get_canonical_curies(curie)
                            [curie] is not None and source_category in [
                                category.replace('biolink:', '').replace(
                                    '_', '').lower() for category in list(
                                        self.synonymizer.get_canonical_curies(
                                            curie, return_all_categories=True)
                                        [curie]['all_categories'].keys())
                            ]
                        ]
                        therapeutic = target_curie.replace(
                            'CHEMBL.COMPOUND:', 'CHEMBL:')
                        disease = 'MONDO:0007254'
                        outcome = ('EFO:0000714', '>=',
                                   self.CHP_survival_threshold)

                        queries = []
                        for gene in genes:
                            queries.append(
                                build_query(
                                    genes=[gene],
                                    therapeutic=therapeutic,
                                    disease=disease,
                                    outcome=outcome,
                                ))

                        # use the query_all endpoint to run the batch of queries
                        res = self.client.query_all(queries)

                        for result, gene in zip(res["message"], genes):
                            prob = self.client.get_outcome_prob(result)
                            swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                                gene, target_curie, "paired_with", prob)

                            source_dict[gene] = source_qnode_key
                            target_dict[target_curie] = target_qnode_key

                            # Finally add the current edge to our answer knowledge graph
                            final_kg.add_edge(swagger_edge_key, swagger_edge,
                                              qedge_key)

                else:
                    for target_curie in target_pass_nodes:

                        genes = [target_curie]
                        therapeutic = [
                            curie.replace('CHEMBL.COMPOUND:', 'CHEMBL:')
                            for curie in self.allowable_drug_curies
                            if self.synonymizer.get_canonical_curies(
                                curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:'))
                            [curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:')]
                            is not None and source_category in [
                                category.replace('biolink:', '').replace(
                                    '_', '').lower()
                                for category in list(
                                    self.synonymizer.get_canonical_curies(
                                        curie.replace('CHEMBL:',
                                                      'CHEMBL.COMPOUND:'),
                                        return_all_categories=True)[
                                            curie.replace(
                                                'CHEMBL:', 'CHEMBL.COMPOUND:')]
                                    ['all_categories'].keys())
                            ]
                        ]
                        disease = 'MONDO:0007254'
                        outcome = ('EFO:0000714', '>=',
                                   self.CHP_survival_threshold)

                        queries = []
                        for drug in therapeutic:
                            queries.append(
                                build_query(
                                    genes=genes,
                                    therapeutic=drug,
                                    disease=disease,
                                    outcome=outcome,
                                ))

                        # use the query_all endpoint to run the batch of queries
                        res = self.client.query_all(queries)

                        for result, drug in zip(res["message"], therapeutic):
                            drug = drug.replace('CHEMBL:', 'CHEMBL.COMPOUND:')
                            prob = self.client.get_outcome_prob(result)
                            swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                                target_curie, drug, "paired_with", prob)

                            source_dict[drug] = source_qnode_key
                            target_dict[target_curie] = target_qnode_key

                            # Finally add the current edge to our answer knowledge graph
                            final_kg.add_edge(swagger_edge_key, swagger_edge,
                                              qedge_key)

                # Add the nodes to our answer knowledge graph
                if len(source_dict) != 0:
                    for source_curie in source_dict:
                        swagger_node_key, swagger_node = self._convert_to_swagger_node(
                            source_curie)
                        final_kg.add_node(swagger_node_key, swagger_node,
                                          source_dict[source_curie])
                if len(target_dict) != 0:
                    for target_curie in target_dict:
                        swagger_node_key, swagger_node = self._convert_to_swagger_node(
                            target_curie)
                        final_kg.add_node(swagger_node_key, swagger_node,
                                          target_dict[target_curie])

                return final_kg
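
One small but easy-to-miss detail in the method above is the curie prefix handling: the CHP client expects 'CHEMBL:' prefixes while the knowledge graph uses 'CHEMBL.COMPOUND:', so curies are converted before querying and restored afterwards. A minimal sketch of that round trip (the curie is illustrative):

# Illustrative sketch (not part of the original class): CHEMBL curie prefix
# conversion used when talking to the CHP client and mapping results back.
kg_curie = "CHEMBL.COMPOUND:CHEMBL112"
chp_curie = kg_curie.replace("CHEMBL.COMPOUND:", "CHEMBL:")   # sent to the CHP client
restored = chp_curie.replace("CHEMBL:", "CHEMBL.COMPOUND:")   # mapped back for the KG

assert restored == kg_curie
print(chp_curie, restored)
# CHEMBL:CHEMBL112 CHEMBL.COMPOUND:CHEMBL112
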
Example #28
0
def create_results(
    qg: QueryGraph,
    kg: QGOrganizedKnowledgeGraph,
    log: ARAXResponse,
    overlay_fet: bool = False,
    rank_results: bool = False,
    qnode_key_to_prune: Optional[str] = None,
) -> Response:
    regular_format_kg = convert_qg_organized_kg_to_standard_kg(kg)
    resultifier = ARAXResultify()
    prune_response = ARAXResponse()
    prune_response.envelope = Response()
    prune_response.envelope.message = Message()
    prune_message = prune_response.envelope.message
    prune_message.query_graph = qg
    prune_message.knowledge_graph = regular_format_kg
    if overlay_fet:
        log.debug(
            f"Using FET to assess quality of intermediate answers in Expand")
        connected_qedges = [
            qedge for qedge in qg.edges.values()
            if qedge.subject == qnode_key_to_prune
            or qedge.object == qnode_key_to_prune
        ]
        qnode_pairs_to_overlay = {
            (qedge.subject if qedge.subject != qnode_key_to_prune else
             qedge.object, qnode_key_to_prune)
            for qedge in connected_qedges
        }
        for qnode_pair in qnode_pairs_to_overlay:
            pair_string_id = f"{qnode_pair[0]}-->{qnode_pair[1]}"
            log.debug(f"Overlaying FET for {pair_string_id} (from Expand)")
            fet_qedge_key = f"FET{pair_string_id}"
            try:
                overlayer = ARAXOverlay()
                params = {
                    "action": "fisher_exact_test",
                    "subject_qnode_key": qnode_pair[0],
                    "object_qnode_key": qnode_pair[1],
                    "virtual_relation_label": fet_qedge_key
                }
                overlayer.apply(prune_response, params)
            except Exception as error:
                exception_type, exception_value, exception_traceback = sys.exc_info(
                )
                log.warning(
                    f"An uncaught error occurred when overlaying with FET during Expand's pruning: {error}: "
                    f"{repr(traceback.format_exception(exception_type, exception_value, exception_traceback))}"
                )
            if prune_response.status != "OK":
                log.warning(
                    f"FET produced an error when Expand tried to use it to prune the KG. "
                    f"Log was: {prune_response.show()}")
                log.debug(f"Will continue pruning without overlaying FET")
                # Get rid of any FET edges that might be in the KG/QG, since this step failed
                remove_edges_with_qedge_key(
                    prune_response.envelope.message.knowledge_graph,
                    fet_qedge_key)
                qg.edges.pop(fet_qedge_key, None)
                prune_response.status = "OK"  # Clear this so we can continue without overlaying
            else:
                if fet_qedge_key in qg.edges:
                    qg.edges[
                        fet_qedge_key].option_group_id = f"FET_VIRTUAL_GROUP_{pair_string_id}"
                else:
                    log.warning(
                        f"Attempted to overlay FET from Expand, but it didn't work. Pruning without it."
                    )

    # Create results and rank them as appropriate
    log.debug(f"Calling Resultify from Expand for pruning")
    resultifier.apply(prune_response, {})
    if rank_results:
        try:
            log.debug(f"Ranking Expand's intermediate pruning results")
            ranker = ARAXRanker()
            ranker.aggregate_scores_dmk(prune_response)
        except Exception as error:
            exception_type, exception_value, exception_traceback = sys.exc_info(
            )
            log.error(
                f"An uncaught error occurred when attempting to rank results during Expand's pruning: "
                f"{error}: {repr(traceback.format_exception(exception_type, exception_value, exception_traceback))}."
                f"Log was: {prune_response.show()}",
                error_code="UncaughtARAXiError")
            # Give any unranked results a score of 0
            for result in prune_response.envelope.message.results:
                if result.score is None:
                    result.score = 0
    return prune_response
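
# Illustrative sketch (not part of the functions above or below): how create_results()
# derives the qnode pairs to overlay with FET for the qnode being pruned. The QEdge
# namedtuple and the qnode keys are stand-ins for the real TRAPI objects.
from collections import namedtuple

QEdge = namedtuple("QEdge", ["subject", "object"])
qg_edges = {"e00": QEdge("n00", "n01"), "e01": QEdge("n01", "n02")}
qnode_key_to_prune = "n01"

connected_qedges = [qedge for qedge in qg_edges.values()
                    if qnode_key_to_prune in (qedge.subject, qedge.object)]
qnode_pairs_to_overlay = {
    (qedge.subject if qedge.subject != qnode_key_to_prune else qedge.object,
     qnode_key_to_prune)
    for qedge in connected_qedges}

print(qnode_pairs_to_overlay)  # e.g. {('n00', 'n01'), ('n02', 'n01')}
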
    def add_query_graph_tags(self, message, query_graph_info):

        #### Define a default response
        response = ARAXResponse()
        self.response = response
        self.message = message
        response.debug(f"Adding temporary QueryGraph ids to KnowledgeGraph")

        #### Get shorter handles
        knowledge_graph = message.knowledge_graph
        nodes = knowledge_graph.nodes
        edges = knowledge_graph.edges

        #### Loop through nodes adding qnode_ids
        for key, node in nodes.items():

            #### If there is not qnode_id, then determine what it should be and add it
            if node.qnode_id is None:
                categories = node.category

                #### Find a matching category in the QueryGraph for this node
                if categories is None:
                    response.error(
                        f"KnowledgeGraph node {key} does not have a category. This should never happen",
                        error_code="NodeMissingCategory")
                    return response
                n_found_categories = 0
                found_category = None
                for node_category in categories:
                    if node_category in query_graph_info.node_category_map:
                        n_found_categories += 1
                        found_category = node_category

                #### If we did not find exactly one matching category, error out
                if n_found_categories == 0:
                    response.error(
                        f"Tried to find categories '{categories}' for KnowledgeGraph node {key} in query_graph_info, but did not find any",
                        error_code="NodeCategoryMissingInQueryGraph")
                    return response
                elif n_found_categories > 1:
                    response.error(
                        f"Tried to find categories '{categories}' for KnowledgeGraph node {key} in query_graph_info, and found multiple matches. This is ambiguous",
                        error_code="MultipleNodeCategoriesInQueryGraph")
                    return response

                #### Else add it
                node.qnode_id = query_graph_info.node_category_map[
                    found_category]

        #### Loop through the edges adding qedge_ids
        for key, edge in edges.items():

            #### Check to see if there is already a qedge_id attribute on the edge
            if edge.qedge_id is None:

                #### If there isn't a predicate or can't find it in the query_graph, error out
                if edge.predicate is None:
                    response.error(
                        f"KnowledgeGraph edge {key} does not have a predicate. This should never happen",
                        error_code="EdgeMissingPredicate")
                    return response
                if edge.predicate not in query_graph_info.edge_predicate_map:
                    response.error(
                        f"Tried to find predicate '{edge.predicate}' for KnowledgeGraph edge {key} in query_graph_info, but did not find it",
                        error_code="EdgePredicateMissingInQueryGraph")
                    return response

                #### Else add it
                edge.qedge_id = query_graph_info.edge_predicate_map[edge.predicate]

        #### Return the response
        return response
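
The core of add_query_graph_tags is the rule that exactly one of a node's categories must appear in the query graph's category map. Below is a minimal sketch of that matching rule, using plain dicts and exceptions instead of ARAXResponse; resolve_qnode_id and the sample map are hypothetical, not ARAX API.

def resolve_qnode_id(categories, node_category_map):
    # Hypothetical stand-in for the matching logic above: exactly one of the
    # node's categories must appear in the query graph's category map.
    if categories is None:
        raise ValueError("node has no category")
    matches = [c for c in categories if c in node_category_map]
    if len(matches) == 0:
        raise ValueError(f"no category in {categories} appears in the query graph")
    if len(matches) > 1:
        raise ValueError(f"multiple categories in {categories} match the query graph; ambiguous")
    return node_category_map[matches[0]]

# Example usage with a made-up category map
node_category_map = {"biolink:Drug": "n00", "biolink:Disease": "n01"}
print(resolve_qnode_id(["biolink:Drug"], node_category_map))  # -> n00
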
    def check_for_query_graph_tags(self, message, query_graph_info):

        #### Define a default response
        response = ARAXResponse()
        self.response = response
        self.message = message
        response.debug(f"Checking KnowledgeGraph for QueryGraph tags")

        #### Get shorter handles
        knowledge_graph = message.knowledge_graph
        nodes = knowledge_graph.nodes
        edges = knowledge_graph.edges

        #### Store number of nodes and edges
        self.n_nodes = len(nodes)
        self.n_edges = len(edges)
        response.debug(f"Found {self.n_nodes} nodes and {self.n_edges} edges")

        #### Clear the maps
        self.node_map = {'by_qnode_id': {}}
        self.edge_map = {'by_qedge_id': {}}

        #### Loop through nodes computing some stats
        n_nodes_with_query_graph_ids = 0
        for key, node in nodes.items():
            if node.qnode_id is None:
                continue
            n_nodes_with_query_graph_ids += 1

            #### Place an entry in the node_map
            if node.qnode_id not in self.node_map['by_qnode_id']:
                self.node_map['by_qnode_id'][node.qnode_id] = {}
            self.node_map['by_qnode_id'][node.qnode_id][key] = 1

        #### Tally the stats
        if n_nodes_with_query_graph_ids == self.n_nodes:
            self.query_graph_id_node_status = 'all nodes have query_graph_ids'
        elif n_nodes_with_query_graph_ids == 0:
            self.query_graph_id_node_status = 'no nodes have query_graph_ids'
        else:
            self.query_graph_id_node_status = 'only some nodes have query_graph_ids'
        response.info(
            f"In the KnowledgeGraph, {self.query_graph_id_node_status}")

        #### Loop through edges computing some stats
        n_edges_with_query_graph_ids = 0
        for key, edge in edges.items():
            if edge.qedge_id is None:
                continue
            n_edges_with_query_graph_ids += 1

            #### Place an entry in the edge_map
            if edge.qedge_id not in self.edge_map['by_qedge_id']:
                self.edge_map['by_qedge_id'][edge.qedge_id] = {}
            self.edge_map['by_qedge_id'][edge.qedge_id][key] = 1

        if n_edges_with_query_graph_ids == self.n_edges:
            self.query_graph_id_edge_status = 'all edges have query_graph_ids'
        elif n_edges_with_query_graph_ids == 0:
            self.query_graph_id_edge_status = 'no edges have query_graph_ids'
        else:
            self.query_graph_id_edge_status = 'only some edges have query_graph_ids'
        response.info(
            f"In the KnowledgeGraph, {self.query_graph_id_edge_status}")

        #### Return the response
        return response
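
The node-side bookkeeping in check_for_query_graph_tags reduces to building a by_qnode_id lookup and classifying coverage as all, some, or none. Below is a minimal sketch under that reading; tally_qnode_id_coverage and the plain {node_key: qnode_id} input are hypothetical simplifications of the class above, not part of ARAX.

def tally_qnode_id_coverage(nodes):
    # Hypothetical stand-in for the node loop above: build the by_qnode_id
    # lookup and report whether all, some, or no nodes carry a qnode_id.
    node_map = {'by_qnode_id': {}}
    n_tagged = 0
    for key, qnode_id in nodes.items():
        if qnode_id is None:
            continue
        n_tagged += 1
        node_map['by_qnode_id'].setdefault(qnode_id, {})[key] = 1
    if n_tagged == len(nodes):
        status = 'all nodes have query_graph_ids'
    elif n_tagged == 0:
        status = 'no nodes have query_graph_ids'
    else:
        status = 'only some nodes have query_graph_ids'
    return node_map, status

# Example usage with a made-up knowledge graph: one tagged node, one untagged
print(tally_qnode_id_coverage({"CHEMBL.COMPOUND:CHEMBL25": "n00", "MONDO:0005148": None}))
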