Beispiel #1
0
    def _convert_kg1_edge_to_swagger_edge(self, neo4j_edge: Dict[str, any], node_uuid_to_curie_dict: Dict[str, str]) -> Edge:
        """
        Translate one KG1 neo4j edge record into a swagger Edge.

        :param neo4j_edge: property dict of a single edge; the keys read here are
            "predicate", "source_node_uuid", "target_node_uuid", "id", "relation",
            "provided_by" and "probability"
        :param node_uuid_to_curie_dict: lookup used to turn the edge's source/target
            node UUIDs into the ids stored on the swagger edge
        :return: the populated Edge
        :raises KeyError: if either endpoint UUID is absent from node_uuid_to_curie_dict
        """
        # Resolve both endpoints first; source is looked up before target.
        source_curie = node_uuid_to_curie_dict[neo4j_edge.get("source_node_uuid")]
        target_curie = node_uuid_to_curie_dict[neo4j_edge.get("target_node_uuid")]

        kg1_edge = Edge()
        kg1_edge.id = f"KG1:{neo4j_edge.get('id')}"
        kg1_edge.type = neo4j_edge.get("predicate")
        kg1_edge.relation = neo4j_edge.get("relation")
        kg1_edge.source_id = source_curie
        kg1_edge.target_id = target_curie
        kg1_edge.provided_by = neo4j_edge.get("provided_by")
        kg1_edge.is_defined_by = "ARAX/KG1"

        # Attributes are only attached for a truthy probability (a missing,
        # None or zero value leaves edge_attributes untouched).
        probability = neo4j_edge.get("probability")
        if probability:
            kg1_edge.edge_attributes = self._create_swagger_attributes("edge", ["probability"], neo4j_edge)
        return kg1_edge
Beispiel #2
0
    def _convert_kg1_edge_to_swagger_edge(self, neo4j_edge,
                                          node_uuid_to_curie_dict):
        """
        Translate one KG1 neo4j edge record into a swagger Edge.

        :param neo4j_edge: property dict of a single edge; the keys read here are
            'predicate', 'source_node_uuid', 'target_node_uuid', 'relation',
            'provided_by' and 'probability'
        :param node_uuid_to_curie_dict: lookup used to turn the edge's source/target
            node UUIDs into the ids stored on the swagger edge
        :return: the populated Edge
        :raises KeyError: if either endpoint UUID is absent from node_uuid_to_curie_dict
        """
        source_uuid = neo4j_edge.get('source_node_uuid')
        target_uuid = neo4j_edge.get('target_node_uuid')

        kg1_edge = Edge()
        kg1_edge.type = neo4j_edge.get('predicate')
        kg1_edge.source_id = node_uuid_to_curie_dict[source_uuid]
        kg1_edge.target_id = node_uuid_to_curie_dict[target_uuid]
        # The id is derived from the edge itself, so it is built once type and
        # both endpoints are in place and before the remaining fields are set.
        kg1_edge.id = self._create_edge_id(kg1_edge)
        kg1_edge.relation = neo4j_edge.get('relation')
        kg1_edge.provided_by = neo4j_edge.get('provided_by')
        kg1_edge.is_defined_by = "ARAX/KG1"

        # Attributes are only attached for a truthy probability.
        probability = neo4j_edge.get('probability')
        if probability:
            kg1_edge.edge_attributes = self._create_swagger_attributes(
                "edge", ['probability'], neo4j_edge)
        return kg1_edge
Beispiel #3
0
 def _create_ngd_edge(self, ngd_value: float, source_id: str,
                      target_id: str) -> Edge:
     """
     Build an ARAX-provided edge carrying a single NGD attribute.

     :param ngd_value: value stored in the edge's one EdgeAttribute
     :param source_id: id of the edge's source node
     :param target_id: id of the edge's target node
     :return: the populated Edge, whose id has the form
         "NGD:<source_id>--<edge type>--<target_id>"
     """
     edge = Edge()
     edge.type = self.ngd_edge_type
     edge.source_id, edge.target_id = source_id, target_id
     # The type is read back from the edge, exactly as it was stored above.
     edge.id = f"NGD:{source_id}--{edge.type}--{target_id}"
     edge.provided_by = edge.is_defined_by = "ARAX"

     ngd_attribute = EdgeAttribute(name=self.ngd_edge_attribute_name,
                                   type=self.ngd_edge_attribute_type,
                                   value=ngd_value,
                                   url=self.ngd_edge_attribute_url)
     edge.edge_attributes = [ngd_attribute]
     return edge
 def _create_icees_virtual_edge(self, source_curie, target_curie, p_value):
     """
     Build the virtual edge linking two curies for an ICEES+ result.

     :param source_curie: id of the edge's source node
     :param target_curie: id of the edge's target node
     :param p_value: passed to _create_icees_edge_attribute to build the
         edge's single attribute
     :return: a new Edge with id "ICEES:<source_curie>--<target_curie>"
     """
     # Collected in the same order the Edge constructor receives them.
     edge_fields = {
         "id": f"ICEES:{source_curie}--{target_curie}",
         "type": self.icees_edge_type,
         "source_id": source_curie,
         "target_id": target_curie,
         "is_defined_by": "ARAX",
         "provided_by": "ICEES+",
         # The virtual relation label doubles as the edge's only qedge id.
         "relation": self.virtual_relation_label,
         "qedge_ids": [self.virtual_relation_label],
         "edge_attributes": [self._create_icees_edge_attribute(p_value)],
     }
     return Edge(**edge_fields)
Beispiel #5
0
    def _add_answers_to_kg(self, answer_kg, reasoner_std_response, input_qnode_id, output_qnode_id, qedge_id):
        """
        Copy the nodes and edges of a BTE reasoner response into answer_kg.

        Nodes bound to "n0" are filed under input_qnode_id and nodes bound to "n1"
        under output_qnode_id; every edge must be bound to "e1" and is filed under
        qedge_id. Nodes filed under output_qnode_id have their id replaced by the
        curie chosen by eu.get_best_equivalent_curie, and edge endpoints are
        rewritten to match.

        :param answer_kg: knowledge graph container handed to eu.add_node_to_kg / eu.add_edge_to_kg
        :param reasoner_std_response: BTE response dict with 'results' and 'knowledge_graph' keys
        :param input_qnode_id: ARAX qnode id corresponding to BTE's "n0"
        :param output_qnode_id: ARAX qnode id corresponding to BTE's "n1"
        :param qedge_id: ARAX qedge id corresponding to BTE's "e1"
        :return: answer_kg (possibly only partially filled if an unknown qg id was
            met, in which case an error is logged on self.response)
        """
        qg_ids_by_kg_id = self._build_kg_to_qg_id_dict(reasoner_std_response['results'])
        bte_edges = reasoner_std_response['knowledge_graph']['edges']
        if not bte_edges:
            # Nothing is copied (not even nodes) when the response has no edges.
            return answer_kg

        self.response.debug(f"Got results back from BTE for this query "
                            f"({len(bte_edges)} edges)")

        # BTE node id -> preferred curie, for nodes whose id was swapped out
        preferred_curies = {}
        for bte_node in reasoner_std_response['knowledge_graph']['nodes']:
            bte_node_id = bte_node.get('id')
            arax_node = Node()
            arax_node.name = bte_node.get('name')
            arax_node.type = eu.convert_string_to_snake_case(bte_node.get('type'))

            # Translate BTE's qg id for this node into our own qnode id
            node_qg_id = qg_ids_by_kg_id['nodes'].get(bte_node_id)
            if node_qg_id not in ("n0", "n1"):
                self.response.error("Could not map BTE qg_id to ARAX qnode_id", error_code="UnknownQGID")
                return answer_kg
            qnode_id = input_qnode_id if node_qg_id == "n0" else output_qnode_id

            if qnode_id == output_qnode_id:
                # 'Output' nodes get their preferred equivalent identifier, computed once per BTE id
                if bte_node_id not in preferred_curies:
                    equivalent_curies = []
                    for prefix, local_ids in bte_node.get('equivalent_identifiers').items():
                        for local_id in local_ids:
                            equivalent_curies.append(f"{prefix}:{eu.get_curie_local_id(local_id)}")
                    preferred_curies[bte_node_id] = eu.get_best_equivalent_curie(equivalent_curies, arax_node.type)
                arax_node.id = preferred_curies[bte_node_id]
            else:
                arax_node.id = bte_node_id

            eu.add_node_to_kg(answer_kg, arax_node, qnode_id)

        for bte_edge in bte_edges:
            original_source = bte_edge.get('source_id')
            original_target = bte_edge.get('target_id')

            arax_edge = Edge()
            arax_edge.id = bte_edge.get("id")
            arax_edge.type = bte_edge.get('type')
            # Endpoints follow any node id that was swapped for its preferred curie
            arax_edge.source_id = preferred_curies.get(original_source, original_source)
            arax_edge.target_id = preferred_curies.get(original_target, original_target)
            arax_edge.is_defined_by = "BTE"
            arax_edge.provided_by = bte_edge.get('edge_source')

            # Translate BTE's qg id for this edge into our own qedge id
            edge_qg_id = qg_ids_by_kg_id['edges'].get(arax_edge.id)
            if edge_qg_id != "e1":
                self.response.error("Could not map BTE qg_id to ARAX qedge_id", error_code="UnknownQGID")
                return answer_kg
            eu.add_edge_to_kg(answer_kg, arax_edge, qedge_id)

        return answer_kg
Beispiel #6
0
    def add_subgraph(self,
                     nodes,
                     edges,
                     description,
                     confidence,
                     return_result=False,
                     suppress_bindings=False):
        """
        Add one result, built from a subgraph, to the message.

        New nodes and edges are appended to self.message.knowledge_graph
        (tracked through self._node_ids / self._edge_ids / self._edge_counter),
        a Result is appended to self._results and self.message.results, and
        self._num_results and self.message.code_description are updated.

        :param nodes: iterable of (key, data) pairs (g.nodes(data=True)); data must
            provide 'labels' and 'properties' with 'name', 'UUID', 'accession',
            'uri' and 'id' ('description' is optional)
        :param edges: iterable of (u, v, data) triples (g.edges(data=True)); data must
            provide 'type' and 'properties' with 'provided_by', 'source_node_uuid'
            and 'target_node_uuid'
        :param description: stored as the result's description
        :param confidence: stored as the result's confidence
        :param return_result: when truthy the new Result is returned, otherwise
            the method returns None
        :param suppress_bindings: exactly False attaches node/edge bindings to the
            result; exactly True attaches the subgraph as result_graph instead
        :return: the new Result if return_result is truthy, else None
        :raises Exception: if the query-graph key found for an edge endpoint in
            self._type_map is falsy
        """

        # Get the relevant info from the nodes and edges
        node_keys = []
        node_descriptions = dict()
        node_names = dict()
        node_labels = dict()
        node_uuids = dict()
        node_accessions = dict()
        node_iris = dict()
        node_uuids2iri = dict()
        node_curies = dict()
        node_uuids2curie = dict()
        for u, data in nodes:
            node_keys.append(u)
            if 'description' in data['properties']:
                node_descriptions[u] = data['properties']['description']
            else:
                # Note: the placeholder is the string "None", not the None object
                node_descriptions[u] = "None"
            node_names[u] = data['properties']['name']
            # Use one label other than 'Base'; going through a set means the pick is
            # arbitrary when several remain, and [0] raises IndexError when none remain
            node_labels[u] = list(set(data['labels']).difference({'Base'}))[0]
            # NOTE(review): node_uuids is filled here but not read again in this method
            node_uuids[u] = data['properties']['UUID']
            node_accessions[u] = data['properties']['accession']
            node_iris[u] = data['properties']['uri']
            node_uuids2iri[data['properties']
                           ['UUID']] = data['properties']['uri']
            curie_id = data['properties']['id']
            # CHEMBL ids get the prefix repeated in the local part: "CHEMBL:<x>" -> "CHEMBL:CHEMBL<x>"
            if curie_id.split(':')[0].upper() == "CHEMBL":
                curie_id = "CHEMBL:CHEMBL" + curie_id.split(':')[1]
            node_uuids2curie[data['properties']['UUID']] = curie_id
            node_curies[
                u] = curie_id  # These are the actual CURIE ids, e.g. UBERON:00000941 (uri is the web address)

        # NOTE(review): every edge dict below is keyed by (u, v), so if `edges` ever
        # contains parallel edges between the same pair, later ones overwrite earlier
        # ones while edge_keys still lists the pair once per edge -- confirm whether
        # the input can be a multigraph.
        edge_keys = []
        edge_types = dict()
        edge_source_db = dict()
        edge_source_iri = dict()
        edge_target_iri = dict()
        edge_source_curie = dict()
        edge_target_curie = dict()
        edge_ids = dict()
        for u, v, data in edges:
            edge_keys.append((u, v))
            edge_types[(u, v)] = data['type']
            edge_source_db[(u, v)] = data['properties']['provided_by']
            # Endpoints are resolved through the node UUIDs collected above (KeyError
            # if an edge references a node that was not in `nodes`)
            edge_source_iri[(
                u, v)] = node_uuids2iri[data['properties']['source_node_uuid']]
            edge_target_iri[(
                u, v)] = node_uuids2iri[data['properties']['target_node_uuid']]
            # NOTE(review): edge_source_curie, edge_target_curie and edge_ids are
            # filled here but not read again in this method
            edge_source_curie[(
                u,
                v)] = node_uuids2curie[data['properties']['source_node_uuid']]
            edge_target_curie[(
                u,
                v)] = node_uuids2curie[data['properties']['target_node_uuid']]
            edge_ids[(u, v)] = data['properties']['provided_by']  # FIXME: stores provided_by, not an id

        # For each node, populate the relevant information
        node_objects = []
        node_iris_to_node_object = dict()
        for node_key in node_keys:
            node = Node()
            node.id = node_curies[node_key]
            node.type = [node_labels[node_key]]
            node.name = node_names[node_key]
            node.uri = node_iris[node_key]
            node.accession = node_accessions[node_key]
            node.description = node_descriptions[node_key]
            node_objects.append(node)
            node_iris_to_node_object[node_iris[node_key]] = node

            #### Add this node to the master knowledge graph (only the first time its id is seen)
            if node.id not in self._node_ids:
                self.message.knowledge_graph.nodes.append(node)
                self._node_ids[node.id] = node.type[
                    0]  # Just take the first of potentially several FIXME

        #### Create the bindings lists
        node_bindings = list()
        edge_bindings = list()

        # for each edge, create an edge between them
        edge_objects = []
        for u, v in edge_keys:
            edge = Edge()
            # edge.id is assigned further down, once the edge has been de-duplicated
            edge.type = edge_types[(u, v)]
            edge.source_id = node_iris_to_node_object[edge_source_iri[(u,
                                                                       v)]].id
            edge.target_id = node_iris_to_node_object[edge_target_iri[(u,
                                                                       v)]].id
            # The edge is appended before its remaining fields are set; the list
            # holds the same object, so the later assignments are still visible
            edge_objects.append(edge)
            edge.is_defined_by = "RTX"
            edge.provided_by = edge_source_db[(u, v)]
            # Not populated here: attribute_list, confidence, evidence_type,
            # publications, qualifiers, relation

            #### Add this edge to the master knowledge graph
            # Edges are de-duplicated on "<source> -<type>- <target>"; a new edge gets
            # the next counter value as its id, a repeat reuses the recorded id
            edge_str = "%s -%s- %s" % (edge.source_id, edge.type,
                                       edge.target_id)
            if edge_str not in self._edge_ids:
                self.message.knowledge_graph.edges.append(edge)
                edge.id = "%d" % self._edge_counter
                self._edge_ids[edge_str] = edge.id
                self._edge_counter += 1
            else:
                edge.id = self._edge_ids[edge_str]

            #### Try to figure out how the source fits into the query_graph for the bindings
            # A mapping for the specific node id wins over one for the node's type
            source_type = self._node_ids[edge.source_id]
            if edge.source_id in self._type_map:
                source_knowledge_map_key = self._type_map[edge.source_id]
            else:
                # NOTE(review): if _type_map is a plain dict this lookup raises KeyError
                # before the falsy check below is reached -- confirm its type
                source_knowledge_map_key = self._type_map[source_type]
            if not source_knowledge_map_key:
                eprint(
                    "Expected to find '%s' in the response._type_map, but did not"
                    % source_type)
                raise Exception(
                    "Expected to find '%s' in the response._type_map, but did not"
                    % source_type)

            # Bindings are kept as a flat list with no de-duplication, so a node that
            # takes part in several edges is appended once per edge
            node_bindings.append(
                NodeBinding(qg_id=source_knowledge_map_key,
                            kg_id=edge.source_id))

            #### Try to figure out how the target fits into the query_graph for the knowledge map
            target_type = self._node_ids[edge.target_id]
            if edge.target_id in self._type_map:
                target_knowledge_map_key = self._type_map[edge.target_id]
            else:
                target_knowledge_map_key = self._type_map[target_type]
            if not target_knowledge_map_key:
                eprint(
                    "ERROR: Expected to find '%s' in the response._type_map, but did not"
                    % target_type)
                raise Exception(
                    "Expected to find '%s' in the response._type_map, but did not"
                    % target_type)

            node_bindings.append(
                NodeBinding(qg_id=target_knowledge_map_key,
                            kg_id=edge.target_id))

            #### Try to figure out how the edge fits into the query_graph for the knowledge map
            # Lookup order: the edge type itself, then "e<source>-<target>", then
            # "e<target>-<source>". Unlike the node lookups above, a miss is only
            # logged and the binding is recorded under the qg_id "ERROR".
            source_target_key = "e" + source_knowledge_map_key + "-" + target_knowledge_map_key
            target_source_key = "e" + target_knowledge_map_key + "-" + source_knowledge_map_key
            if edge.type in self._type_map:
                knowledge_map_key = self._type_map[edge.type]
            elif source_target_key in self._type_map:
                knowledge_map_key = source_target_key
            elif target_source_key in self._type_map:
                knowledge_map_key = target_source_key
            else:
                eprint(
                    "ERROR: Expected to find '%s' or '%s' or '%s' in the response._type_map, but did not"
                    % (edge.type, source_target_key, target_source_key))
                knowledge_map_key = "ERROR"

            edge_bindings.append(
                EdgeBinding(qg_id=knowledge_map_key, kg_id=edge.id))

        # Create the result (potential answer)
        result1 = Result()
        result1.reasoner_id = "RTX"
        result1.description = description
        result1.confidence = confidence
        # Identity comparison: only the literal False attaches the bindings
        if suppress_bindings is False:
            result1.node_bindings = node_bindings
            result1.edge_bindings = edge_bindings

        # Create a KnowledgeGraph object and put the list of nodes and edges into it
        #### This is still legal, but is redundant with the knowledge map, so it is
        #### only attached when the bindings are suppressed
        knowledge_graph = KnowledgeGraph()
        knowledge_graph.nodes = node_objects
        knowledge_graph.edges = edge_objects
        if suppress_bindings is True:
            result1.result_graph = knowledge_graph

        # Put the result (potential answer) into the message
        self._results.append(result1)
        self.message.results = self._results

        # Increment the number of results
        self._num_results += 1
        if self._num_results == 1:
            self.message.code_description = "%s result found" % self._num_results
        else:
            self.message.code_description = "%s results found" % self._num_results

        #### Finish and return the result if requested
        if return_result:
            return result1
        else:
            pass
Beispiel #7
0
    def add_split_results(self, knowledge_graph, result_bindings):
        """
        Populate the object model with a raw knowledge_graph and its result_bindings
        (initially from QueryGraphReasoner).

        Every node and edge is appended to self.message.knowledge_graph,
        self.message.results is replaced by one Result per entry of
        result_bindings, and self.message.code_description is updated.

        :param knowledge_graph: dict with "nodes" and "edges" lists of dicts
            (native RTX KG dump); every node dict must carry a "category" key
        :param result_bindings: list of dicts, each with a "nodes" and an "edges"
            entry (native format from QueryGraphReasoner)
        :return: an empty tuple
        """

        #### Add the knowledge_graph nodes
        regular_node_attributes = ("id", "uri", "name", "description", "symbol")
        for input_node in knowledge_graph["nodes"]:
            node = Node()
            # Copy over only the attributes that the input actually provides
            for attribute in regular_node_attributes:
                if attribute in input_node:
                    setattr(node, attribute, input_node[attribute])
            node.type = [input_node["category"]]
            #node.node_attributes = FIXME
            self.message.knowledge_graph.nodes.append(node)

        #### Add the knowledge_graph edges
        # The original list ended in two empty-string placeholders; they are dropped
        # so an input edge with a "" key can no longer trigger setattr(edge, "", ...)
        regular_edge_attributes = (
            "id", "type", "relation", "source_id", "target_id",
            "is_defined_by", "defined_datetime", "provided_by", "weight",
            "evidence_type", "qualifiers", "negated",
        )
        for input_edge in knowledge_graph["edges"]:
            edge = Edge()
            for attribute in regular_edge_attributes:
                if attribute in input_edge:
                    setattr(edge, attribute, input_edge[attribute])
            # The input calls it "probability"; the object model calls it confidence
            if "probability" in input_edge:
                edge.confidence = input_edge["probability"]
            #edge.edge_attributes = FIXME
            #edge.publications = FIXME
            self.message.knowledge_graph.edges.append(edge)

        #### Add each result
        self.message.results = []
        for input_result in result_bindings:
            result = Result()
            result.description = "No description available"
            result.essence = "?"
            result.confidence = 1.0
            result.result_type = "individual query answer"
            result.reasoner_id = "RTX"
            result.result_graph = None
            # Bindings are passed through as provided, without conversion
            result.node_bindings = input_result["nodes"]
            result.edge_bindings = input_result["edges"]
            self.message.results.append(result)

        #### Set the code_description
        n_results = len(result_bindings)
        plural = "" if n_results == 1 else "s"
        self.message.code_description = f"{n_results} result{plural} found"

        #### Complete normally
        return ()
Beispiel #8
0
    def fisher_exact_test(self):
        """
        Peform the fisher's exact test to expand or decorate the knowledge graph
        :return: response
        """

        self.response.info(
            f"Performing Fisher's Exact Test to add p-value to edge attribute of virtual edge"
        )

        # check the input parameters
        if 'source_qnode_id' not in self.parameters:
            self.response.error(
                f"The argument 'source_qnode_id' is required for fisher_exact_test function"
            )
            return self.response
        else:
            source_qnode_id = self.parameters['source_qnode_id']
        if 'virtual_relation_label' not in self.parameters:
            self.response.error(
                f"The argument 'virtual_relation_label' is required for fisher_exact_test function"
            )
            return self.response
        else:
            virtual_relation_label = str(
                self.parameters['virtual_relation_label'])
        if 'target_qnode_id' not in self.parameters:
            self.response.error(
                f"The argument 'target_qnode_id' is required for fisher_exact_test function"
            )
            return self.response
        else:
            target_qnode_id = self.parameters['target_qnode_id']
        rel_edge_id = self.parameters[
            'rel_edge_id'] if 'rel_edge_id' in self.parameters else None
        top_n = int(
            self.parameters['top_n']) if 'top_n' in self.parameters else None
        cutoff = float(
            self.parameters['cutoff']) if 'cutoff' in self.parameters else None

        # initialize some variables
        nodes_info = {}
        edge_expand_kp = []
        source_node_list = []
        target_node_dict = {}
        size_of_target = {}
        source_node_exist = False
        target_node_exist = False
        query_edge_id = set()
        rel_edge_type = set()
        source_node_type = None
        target_node_type = None

        ## Check if source_qnode_id and target_qnode_id are in the Query Graph
        try:
            if len(self.message.query_graph.nodes) != 0:
                for node in self.message.query_graph.nodes:
                    if node.id == source_qnode_id:
                        source_node_exist = True
                        source_node_type = node.type
                    elif node.id == target_qnode_id:
                        target_node_exist = True
                        target_node_type = node.type
                    else:
                        pass
            else:
                self.response.error(f"There is no query node in QG")
                return self.response
        except:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(
                f"Something went wrong with retrieving nodes in message QG")
            return self.response

        if source_node_exist:
            if target_node_exist:
                pass
            else:
                self.response.error(
                    f"No query node with target qnode id {target_qnode_id} detected in QG for Fisher's Exact Test"
                )
                return self.response
        else:
            self.response.error(
                f"No query node with source qnode id {source_qnode_id} detected in QG for Fisher's Exact Test"
            )
            return self.response

        ## Check if there is a query edge connected to both source_qnode_id and target_qnode_id in the Query Graph
        try:
            if len(self.message.query_graph.edges) != 0:
                for edge in self.message.query_graph.edges:
                    if edge.source_id == source_qnode_id and edge.target_id == target_qnode_id and edge.relation == None:
                        query_edge_id.update(
                            [edge.id])  # only actual query edge is added
                    elif edge.source_id == target_qnode_id and edge.target_id == source_qnode_id and edge.relation == None:
                        query_edge_id.update(
                            [edge.id])  # only actual query edge is added
                    else:
                        continue
            else:
                self.response.error(f"There is no query edge in Query Graph")
                return self.response
        except:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(
                f"Something went wrong with retrieving edges in message QG")
            return self.response

        if len(query_edge_id) != 0:
            if rel_edge_id:
                if rel_edge_id in query_edge_id:
                    pass
                else:
                    self.response.error(
                        f"No query edge with qedge id {rel_edge_id} connected to both source node with qnode id {source_qnode_id} and target node with qnode id {target_qnode_id} detected in QG for Fisher's Exact Test"
                    )
                    return self.response
            else:
                pass
        else:
            self.response.error(
                f"No query edge connected to both source node with qnode id {source_qnode_id} and target node with qnode id {target_qnode_id} detected in QG for Fisher's Exact Test"
            )
            return self.response

        ## loop over all nodes in KG and collect their node information
        try:
            count = 0
            for node in self.message.knowledge_graph.nodes:
                nodes_info[node.id] = {
                    'count': count,
                    'qnode_ids': node.qnode_ids,
                    'type': node.type[0],
                    'edge_index': []
                }
                count = count + 1
        except:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(
                f"Something went wrong with retrieving nodes in message KG")
            return self.response

        ## loop over all edges in KG and create source node list and target node dict based on source_qnode_id, target_qnode_id as well as rel_edge_id (optional, otherwise all edges are considered)
        try:
            count = 0
            for edge in self.message.knowledge_graph.edges:
                if edge.provided_by != "ARAX":

                    nodes_info[edge.source_id]['edge_index'].append(count)
                    nodes_info[edge.target_id]['edge_index'].append(count)

                    if rel_edge_id:
                        if rel_edge_id in edge.qedge_ids:
                            if source_qnode_id in nodes_info[
                                    edge.source_id]['qnode_ids']:
                                edge_expand_kp.append(edge.is_defined_by)
                                rel_edge_type.update([edge.type])
                                source_node_list.append(edge.source_id)
                                if edge.target_id not in target_node_dict.keys(
                                ):
                                    target_node_dict[edge.target_id] = {
                                        edge.source_id
                                    }
                                else:
                                    target_node_dict[edge.target_id].update(
                                        [edge.source_id])
                            else:
                                edge_expand_kp.append(edge.is_defined_by)
                                rel_edge_type.update([edge.type])
                                source_node_list.append(edge.target_id)
                                if edge.source_id not in target_node_dict.keys(
                                ):
                                    target_node_dict[edge.source_id] = {
                                        edge.target_id
                                    }
                                else:
                                    target_node_dict[edge.source_id].update(
                                        [edge.target_id])
                        else:
                            pass
                    else:
                        if source_qnode_id in nodes_info[
                                edge.source_id]['qnode_ids']:
                            if target_qnode_id in nodes_info[
                                    edge.target_id]['qnode_ids']:
                                edge_expand_kp.append(edge.is_defined_by)
                                source_node_list.append(edge.source_id)
                                if edge.target_id not in target_node_dict.keys(
                                ):
                                    target_node_dict[edge.target_id] = {
                                        edge.source_id
                                    }
                                else:
                                    target_node_dict[edge.target_id].update(
                                        [edge.source_id])

                            else:
                                pass
                        elif target_qnode_id in nodes_info[
                                edge.source_id]['qnode_ids']:
                            if source_qnode_id in nodes_info[
                                    edge.target_id]['qnode_ids']:
                                edge_expand_kp.append(edge.is_defined_by)
                                source_node_list.append(edge.target_id)
                                if edge.source_id not in target_node_dict.keys(
                                ):
                                    target_node_dict[edge.source_id] = {
                                        edge.target_id
                                    }
                                else:
                                    target_node_dict[edge.source_id].update(
                                        [edge.target_id])

                            else:
                                pass
                        else:
                            pass

                else:
                    pass

                count = count + 1  ## record edge position in message.knowledge_graph

        except:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(
                f"Something went wrong with retrieving edges in message KG")
            return self.response

        source_node_list = list(
            set(source_node_list))  ## remove the duplicate source node id

        ## check if there is no source node in message KG
        if len(source_node_list) == 0:
            self.response.error(
                f"No source node found in message KG for Fisher's Exact Test")
            return self.response

        ## check if there is no target node in message KG
        if len(target_node_dict) == 0:
            self.response.error(
                f"No target node found in message KG for Fisher's Exact Test")
            return self.response

        ## check if source node has more than one type. If so, throw an error
        if source_node_type is None:
            self.response.error(
                f"Source node with qnode id {source_qnode_id} was set to None in Query Graph. Please specify the node type"
            )
            return self.response
        else:
            pass

        ## check if target node has more than one type. If so, throw an error
        if target_node_type is None:
            self.response.error(
                f"Target node with qnode id {target_qnode_id} was set to None in Query Graph. Please specify the node type"
            )
            return self.response
        else:
            pass

        ##check how many kps were used in message KG. If more than one, the one with the max number of edges connnected to both source nodes and target nodes was used
        if len(collections.Counter(edge_expand_kp)) == 1:
            kp = edge_expand_kp[0]
        else:
            occurrences = collections.Counter(edge_expand_kp)
            max_index = max(
                [(value, index)
                 for index, value in enumerate(occurrences.values())]
            )[1]  # if there are more than one kp having the maximum number of edges, then the last one based on alphabetical order will be chosen.
            kp = list(occurrences.keys())[max_index]
            self.response.debug(f"{occurrences}")
            self.response.warning(
                f"More than one knowledge provider was detected to be used for expanding the edges connected to both source node with qnode id {source_qnode_id} and target node with qnode id {target_qnode_id}"
            )
            self.response.warning(
                f"The knowledge provider {kp} was used to calculate Fisher's exact test because it has the maximum number of edges both source node with qnode id {source_qnode_id} and target node with qnode id {target_qnode_id}"
            )

        ## Print out some information used to calculate FET
        if len(source_node_list) == 1:
            self.response.debug(
                f"{len(source_node_list)} source node with qnode id {source_qnode_id} and node type {source_node_type} was found in message KG and used to calculate Fisher's Exact Test"
            )
        else:
            self.response.debug(
                f"{len(source_node_list)} source nodes with qnode id {source_qnode_id} and node type {source_node_type} was found in message KG and used to calculate Fisher's Exact Test"
            )
        if len(target_node_dict) == 1:
            self.response.debug(
                f"{len(target_node_dict)} target node with qnode id {target_qnode_id} and node type {target_node_type} was found in message KG and used to calculate Fisher's Exact Test"
            )
        else:
            self.response.debug(
                f"{len(target_node_dict)} target nodes with qnode id {target_qnode_id} and node type {target_node_type} was found in message KG and used to calculate Fisher's Exact Test"
            )

        # find all nodes with the same type of 'source_qnode_id' nodes in specified KP ('ARAX/KG1','ARAX/KG2','BTE') that are adjacent to target nodes
        if kp == "ARAX/KG1":
            # query adjacent node in one DSL command by providing a list of query nodes to add_qnode()
            if rel_edge_id:
                if len(
                        rel_edge_type
                ) == 1:  # if the edge with rel_edge_id has only type, we use this rel_edge_type to find all source nodes in KP
                    self.response.debug(
                        f"{kp} and edge relation type {list(rel_edge_type)[0]} were used to calculate total adjacent nodes in Fisher's Exact Test"
                    )
                    result = self.query_size_of_adjacent_nodes(
                        node_curie=list(target_node_dict.keys()),
                        adjacent_type=source_node_type,
                        kp=kp,
                        rel_type=list(rel_edge_type)[0],
                        use_cypher_command=True)
                else:  # if the edge with rel_edge_id has more than one type, we ignore the edge type and use all types to find all source nodes in KP
                    self.response.warning(
                        f"The edges with specified qedge id {rel_edge_id} have more than one type, we ignore the edge type and use all types to calculate Fisher's Exact Test"
                    )
                    self.response.debug(
                        f"{kp} was used to calculate total adjacent nodes in Fisher's Exact Test"
                    )
                    result = self.query_size_of_adjacent_nodes(
                        node_curie=list(target_node_dict.keys()),
                        adjacent_type=source_node_type,
                        kp=kp,
                        rel_type=None,
                        use_cypher_command=True)
            else:  # if no rel_edge_id is specified, we ignore the edge type and use all types to find all source nodes in KP
                self.response.debug(
                    f"{kp} was used to calculate total adjacent nodes in Fisher's Exact Test"
                )
                result = self.query_size_of_adjacent_nodes(
                    node_curie=list(target_node_dict.keys()),
                    adjacent_type=source_node_type,
                    kp=kp,
                    rel_type=None,
                    use_cypher_command=True)

            if result is None:
                return self.response  ## Something wrong happened for querying the adjacent nodes
            else:
                size_of_target = result
        else:
            # query adjacent node for query nodes one by one in parallel
            if rel_edge_id:
                if len(
                        rel_edge_type
                ) == 1:  # if the edge with rel_edge_id has only type, we use this rel_edge_type to find all source nodes in KP
                    self.response.debug(
                        f"{kp} and edge relation type {list(rel_edge_type)[0]} were used to calculate total adjacent nodes in Fisher's Exact Test"
                    )
                    parameter_list = [
                        (node, source_node_type, kp, list(rel_edge_type)[0])
                        for node in list(target_node_dict.keys())
                    ]
                else:  # if the edge with rel_edge_id has more than one type, we ignore the edge type and use all types to find all source nodes in KP
                    self.response.warning(
                        f"The edges with specified qedge id {rel_edge_id} have more than one type, we ignore the edge type and use all types to calculate Fisher's Exact Test"
                    )
                    self.response.debug(
                        f"{kp} was used to calculate total adjacent nodes in Fisher's Exact Test"
                    )
                    parameter_list = [(node, source_node_type, kp, None)
                                      for node in list(target_node_dict.keys())
                                      ]
            else:  # if no rel_edge_id is specified, we ignore the edge type and use all types to find all source nodes in KP
                self.response.debug(
                    f"{kp} was used to calculate total adjacent nodes in Fisher's Exact Test"
                )
                parameter_list = [(node, source_node_type, kp, None)
                                  for node in list(target_node_dict.keys())]

            ## get the count of all nodes with the type of 'source_qnode_id' nodes in KP for each target node in parallel
            try:
                with multiprocessing.Pool() as executor:
                    target_count_res = [
                        elem for elem in executor.map(
                            self._query_size_of_adjacent_nodes_parallel,
                            parameter_list)
                    ]
                    executor.close()
            except:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(
                    f"Something went wrong with querying adjacent nodes in parallel"
                )
                return self.response

            if any([type(elem) is list for elem in target_count_res]):
                for msg in [
                        elem2 for elem1 in target_count_res
                        if type(elem1) is list for elem2 in elem1
                ]:
                    if type(msg) is tuple:
                        self.response.error(msg[0], error_code=msg[1])
                    else:
                        self.response.error(msg)
                return self.response  ## Something wrong happened for querying the adjacent nodes
            else:
                for index in range(len(target_node_dict)):
                    node = list(target_node_dict.keys())[index]
                    size_of_target[node] = target_count_res[index]

        ## Based on KP detected in message KG, find the total number of node with the same type of source node
        if kp == 'ARAX/KG1':
            size_of_total = self.size_of_given_type_in_KP(
                node_type=source_node_type, use_cypher_command=True,
                kg='KG1')  ## Try cypher query first
            if size_of_total is not None:
                if size_of_total != 0:
                    self.response.debug(
                        f"ARAX/KG1 and cypher query were used to calculate total number of node with the same type of source node in Fisher's Exact Test"
                    )
                    self.response.debug(
                        f"Total {size_of_total} nodes with node type {source_node_type} was found in ARAX/KG1"
                    )
                    pass
                else:
                    size_of_total = self.size_of_given_type_in_KP(
                        node_type=source_node_type,
                        use_cypher_command=False,
                        kg='KG1'
                    )  ## If cypher query fails, then try kgNodeIndex
                    if size_of_total == 0:
                        self.response.error(
                            f"KG1 has 0 node with the same type of source node with qnode id {source_qnode_id}"
                        )
                        return self.response
                    else:
                        self.response.debug(
                            f"ARAX/KG1 and kgNodeIndex were used to calculate total number of node with the same type of source node in Fisher's Exact Test"
                        )
                        self.response.debug(
                            f"Total {size_of_total} nodes with node type {source_node_type} was found in ARAX/KG1"
                        )
                        pass
            else:
                return self.response  ## Something wrong happened for querying total number of node with the same type of source node

        elif kp == 'ARAX/KG2':
            ## check KG1 first as KG2 might have many duplicates. If KG1 is 0, then check KG2
            size_of_total = self.size_of_given_type_in_KP(
                node_type=source_node_type, use_cypher_command=True,
                kg='KG1')  ## Try cypher query first
            if size_of_total is not None:
                if size_of_total != 0:
                    self.response.warning(
                        f"Although ARAX/KG2 was found to have the maximum number of edges connected to both {source_qnode_id} and {target_qnode_id}, ARAX/KG1 and cypher query were used to find the total number of nodes with the same type of source node with qnode id {source_qnode_id} as KG2 might have many duplicates"
                    )
                    self.response.debug(
                        f"Total {size_of_total} nodes with node type {source_node_type} was found in ARAX/KG1"
                    )
                    pass
                else:
                    size_of_total = self.size_of_given_type_in_KP(
                        node_type=source_node_type,
                        use_cypher_command=False,
                        kg='KG1'
                    )  ## If cypher query fails, then try kgNodeIndex
                    if size_of_total is not None:
                        if size_of_total != 0:
                            self.response.warning(
                                f"Although ARAX/KG2 was found to have the maximum number of edges connected to both {source_qnode_id} and {target_qnode_id}, ARAX/KG1 and kgNodeIndex were used to find the total number of nodes with the same type of source node with qnode id {source_qnode_id} as KG2 might have many duplicates"
                            )
                            self.response.debug(
                                f"Total {size_of_total} nodes with node type {source_node_type} was found in ARAX/KG1"
                            )
                            pass
                        else:
                            size_of_total = self.size_of_given_type_in_KP(
                                node_type=source_node_type,
                                use_cypher_command=False,
                                kg='KG2')
                            if size_of_total is None:
                                return self.response  ## Something wrong happened for querying total number of node with the same type of source node
                            elif size_of_total == 0:
                                self.response.error(
                                    f"KG2 has 0 node with the same type of source node with qnode id {source_qnode_id}"
                                )
                                return self.response
                            else:
                                self.response.debug(
                                    f"ARAX/KG2 and kgNodeIndex were used to calculate total number of node with the same type of source node in Fisher's Exact Test"
                                )
                                self.response.debug(
                                    f"Total {size_of_total} nodes with node type {source_node_type} was found in ARAX/KG2"
                                )
                                pass
                    else:
                        return self.response  ## Something wrong happened for querying total number of node with the same type of source node
            else:
                return self.response  ## Something wrong happened for querying total number of node with the same type of source node
        else:
            self.response.error(
                f"Only KG1 or KG2 is allowable to calculate the Fisher's exact test temporally"
            )
            return self.response

        size_of_query_sample = len(source_node_list)

        self.response.debug(f"Computing Fisher's Exact Test P-value")
        # calculate FET p-value for each target node in parallel
        parameter_list = [
            (node, len(target_node_dict[node]),
             size_of_target[node] - len(target_node_dict[node]),
             size_of_query_sample - len(target_node_dict[node]),
             (size_of_total - size_of_target[node]) -
             (size_of_query_sample - len(target_node_dict[node])))
            for node in target_node_dict
        ]

        try:
            with multiprocessing.Pool() as executor:
                FETpvalue_list = [
                    elem for elem in executor.map(
                        self._calculate_FET_pvalue_parallel, parameter_list)
                ]
                executor.close()
        except:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(
                f"Something went wrong with computing Fisher's Exact Test P-value"
            )
            return self.response

        if any([type(elem) is list for elem in FETpvalue_list]):
            for msg in [
                    elem2 for elem1 in FETpvalue_list if type(elem1) is list
                    for elem2 in elem1
            ]:
                if type(msg) is tuple:
                    self.response.error(msg[0], error_code=msg[1])
                else:
                    self.response.error(msg)
            return self.response
        else:
            output = dict(FETpvalue_list)

        # check if the results need to be filtered
        output = dict(sorted(output.items(), key=lambda x: x[1]))
        if cutoff:
            output = dict(filter(lambda x: x[1] < cutoff, output.items()))
        else:
            pass
        if top_n:
            output = dict(list(output.items())[:top_n])
        else:
            pass

        # add the virtual edge with FET result to message KG
        self.response.debug(
            f"Adding virtual edge with FET result to message KG")

        virtual_edge_list = [
            Edge(id=f"{value[0]}_{index}",
                 type='has_fisher_exact_test_p-value_with',
                 relation=value[0],
                 source_id=value[2],
                 target_id=value[3],
                 is_defined_by="ARAX",
                 defined_datetime=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                 provided_by="ARAX",
                 confidence=None,
                 weight=None,
                 edge_attributes=[
                     EdgeAttribute(type="data:1669",
                                   name="fisher_exact_test_p-value",
                                   value=str(value[1]),
                                   url=None)
                 ],
                 qedge_ids=[value[0]]) for index, value in enumerate(
                     [(virtual_relation_label, output[adj], node, adj)
                      for adj in target_node_dict if adj in output.keys()
                      for node in target_node_dict[adj]], 1)
        ]

        self.message.knowledge_graph.edges.extend(virtual_edge_list)

        count = len(virtual_edge_list)

        self.response.debug(
            f"{count} new virtual edges were added to message KG")

        # add the virtual edge to message QG
        if count > 0:
            self.response.debug(f"Adding virtual edge to message QG")
            edge_type = "has_fisher_exact_test_p-value_with"
            q_edge = QEdge(id=virtual_relation_label,
                           type=edge_type,
                           relation=virtual_relation_label,
                           source_id=source_qnode_id,
                           target_id=target_qnode_id)
            self.message.query_graph.edges.append(q_edge)
            self.response.debug(f"One virtual edge was added to message QG")

        return self.response
Beispiel #9
0
    def compute_ngd(self):
        """
        Compute the normalized Google distance (NGD) for node pairs in the knowledge graph and stick
        that info on the edge_attributes.

        Two modes, selected by ``self.parameters``:

        * ``'virtual_relation_label'`` present: for every (source, target) pair of KG nodes whose
          ``qnode_ids`` contain ``parameters['source_qnode_id']`` / ``parameters['target_qnode_id']``,
          a new virtual edge carrying the NGD attribute is appended to the KG; if at least one edge
          was added, a matching QEdge is appended to the query graph.
        * otherwise: an NGD attribute is appended to every edge already in the KG.

        For each pair ``self.calculate_ngd_fast`` is tried first on the canonicalized curies; when it
        returns None, ``self.NGD.get_ngd_for_all`` is used instead.

        :default: ``parameters['default_value']`` is stored whenever the computed NGD is not finite
        :return: response
        """
        if self.response.status != 'OK':  # Catches any errors that may have been logged during initialization
            self._close_database()
            return self.response
        parameters = self.parameters
        self.response.debug(f"Computing NGD")
        self.response.info(
            f"Computing the normalized Google distance: weighting edges based on source/target node "
            f"co-occurrence frequency in PubMed abstracts")

        self.response.info(
            "Converting CURIE identifiers to human readable names")
        node_curie_to_name = dict()
        try:
            for node in self.message.knowledge_graph.nodes:
                node_curie_to_name[node.id] = node.name
        except Exception as err:
            # Best effort: the failure is logged on the response and processing continues
            tb = traceback.format_exc()
            self.response.error(f"Something went wrong when converting names")
            self.response.error(tb, error_code=type(err).__name__)

        attribute_name = "normalized_google_distance"
        attribute_type = "EDAM:data_2526"
        default_value = parameters['default_value']
        attribute_url = "https://arax.rtx.ai/api/rtx/v1/ui/#/PubmedMeshNgd"

        def ngd_value_for_pair(curie_map, source_curie, target_curie, source_name, target_name):
            """Return (value to store, whether the slow fallback was needed) for one node pair."""
            canonical_source_curie = curie_map.get(source_curie, source_curie)
            canonical_target_curie = curie_map.get(target_curie, target_curie)
            ngd_value = self.calculate_ngd_fast(canonical_source_curie,
                                                canonical_target_curie)
            used_slow_path = ngd_value is None
            if used_slow_path:
                ngd_value = self.NGD.get_ngd_for_all(
                    [source_curie, target_curie],
                    [source_name, target_name])
                self.response.debug(
                    f"Had to use eUtils to compute NGD between {source_name} "
                    f"({canonical_source_curie}) and {target_name} ({canonical_target_curie}). "
                    f"Value is: {ngd_value}")
            # The default is chosen per pair, so a finite value from an earlier pair can never
            # leak into a later pair whose NGD is not finite.
            if np.isfinite(ngd_value):
                return ngd_value, used_slow_path
            return default_value, used_slow_path

        def report_fast_fraction(num_computed_total, num_computed_slow):
            """Log how many of the NGD values came from the fast path."""
            self.response.info(f"NGD values successfully added to edges")
            if num_computed_total == 0:
                # Nothing was processed; skip the percentage instead of dividing by zero
                self.response.debug(f"No NGD values needed to be computed")
                return
            num_computed_fast = num_computed_total - num_computed_slow
            percent_computed_fast = round(
                (num_computed_fast / num_computed_total) * 100)
            self.response.debug(
                f"Used fastNGD for {percent_computed_fast}% of edges "
                f"({num_computed_fast} of {num_computed_total})")

        def add_virtual_edges():
            """Add one NGD-decorated virtual edge per (source, target) node pair, plus the QG edge."""
            source_curies_to_decorate = set()
            target_curies_to_decorate = set()
            curies_to_names = dict()
            # identify the nodes that we should be adding virtual edges for
            for node in self.message.knowledge_graph.nodes:
                if hasattr(node, 'qnode_ids'):
                    if parameters['source_qnode_id'] in node.qnode_ids:
                        source_curies_to_decorate.add(node.id)
                        curies_to_names[node.id] = node.name
                    if parameters['target_qnode_id'] in node.qnode_ids:
                        target_curies_to_decorate.add(node.id)
                        curies_to_names[node.id] = node.name

            # Convert these curies to their canonicalized curies (needed for the local NGD system)
            canonicalized_curie_map = self._get_canonical_curies_map(
                list(source_curies_to_decorate.union(
                    target_curies_to_decorate)))
            self.load_curie_to_pmids_data(canonicalized_curie_map.values())
            added_flag = False  # check to see if any edges were added
            num_computed_total = 0
            num_computed_slow = 0
            edge_type = "has_normalized_google_distance_with"
            self.response.debug(
                f"Looping through node pairs and calculating NGD values")
            # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute
            for source_curie, target_curie in itertools.product(
                    source_curies_to_decorate, target_curies_to_decorate):
                source_name = curies_to_names[source_curie]
                target_name = curies_to_names[target_curie]
                num_computed_total += 1
                value, used_slow_path = ngd_value_for_pair(
                    canonicalized_curie_map, source_curie, target_curie,
                    source_name, target_name)
                if used_slow_path:
                    num_computed_slow += 1
                edge_attribute = EdgeAttribute(
                    type=attribute_type, name=attribute_name,
                    value=str(value),
                    url=attribute_url)  # populate the NGD edge attribute

                # now actually add the virtual edge in
                relation = parameters['virtual_relation_label']
                edge_id = f"{relation}_{self.global_iter}"
                self.global_iter += 1
                edge = Edge(id=edge_id,
                            type=edge_type,
                            relation=relation,
                            source_id=source_curie,
                            target_id=target_curie,
                            is_defined_by="ARAX",
                            defined_datetime=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                            provided_by="ARAX",
                            confidence=None,
                            weight=None,  # TODO: could make the actual value of the attribute
                            edge_attributes=[edge_attribute],
                            qedge_ids=[relation])
                self.message.knowledge_graph.edges.append(edge)
                added_flag = True

            # Now add a q_edge to the query_graph since extra edges were added to the KG
            if added_flag:
                relation = parameters['virtual_relation_label']
                q_edge = QEdge(id=relation,
                               type=edge_type,
                               relation=relation,
                               source_id=parameters['source_qnode_id'],
                               target_id=parameters['target_qnode_id'])
                self.message.query_graph.edges.append(q_edge)

            report_fast_fraction(num_computed_total, num_computed_slow)

        def decorate_existing_edges():
            """Append an NGD attribute to every edge already present in the KG."""
            try:
                # Map all nodes to their canonicalized curies in one batch (need canonical IDs for the local NGD system)
                canonicalized_curie_map = self._get_canonical_curies_map(
                    [node.id for node in self.message.knowledge_graph.nodes])
                self.load_curie_to_pmids_data(canonicalized_curie_map.values())
                num_computed_total = 0
                num_computed_slow = 0
                self.response.debug(
                    f"Looping through edges and calculating NGD values")
                for edge in self.message.knowledge_graph.edges:
                    # Make sure the edge_attributes are not None
                    if not edge.edge_attributes:
                        edge.edge_attributes = []
                    source_curie = edge.source_id
                    target_curie = edge.target_id
                    source_name = node_curie_to_name[source_curie]
                    target_name = node_curie_to_name[target_curie]
                    num_computed_total += 1
                    value, used_slow_path = ngd_value_for_pair(
                        canonicalized_curie_map, source_curie, target_curie,
                        source_name, target_name)
                    if used_slow_path:
                        num_computed_slow += 1
                    edge.edge_attributes.append(
                        EdgeAttribute(type=attribute_type,
                                      name=attribute_name,
                                      value=str(value),
                                      url=attribute_url))
            except Exception as err:
                tb = traceback.format_exc()
                self.response.error(tb, error_code=type(err).__name__)
                self.response.error(
                    f"Something went wrong adding the NGD edge attributes")
            else:
                report_fast_fraction(num_computed_total, num_computed_slow)

        # Both modes must release the database and hand back the response; the finally clause
        # also covers exceptions escaping the virtual-edge mode.
        try:
            if 'virtual_relation_label' in parameters:
                add_virtual_edges()
            else:
                decorate_existing_edges()
        finally:
            self._close_database()
        return self.response
Beispiel #10
0
    def compute_jaccard(self):
        message = self.message
        parameters = self.parameters
        self.response.debug(f"Computing Jaccard distance and adding this information as virtual edges")
        self.response.info(f"Computing Jaccard distance and adding this information as virtual edges")

        self.response.info("Getting all relevant nodes")
        # TODO: should I check that they're connected to the start node, or just assume that they are?
        # TODO: For now, assume that they are
        try:
            intermediate_nodes = set()
            end_node_to_intermediate_node_set = dict()  # keys will be end node curies, values will be tuples the (intermediate curie ids, edge_type)
            for node in message.knowledge_graph.nodes:
                if parameters['intermediate_node_id'] in node.qnode_ids:
                    intermediate_nodes.add(node.id)  # add the intermediate node by it's identifier
                # also look for the source node id
                if parameters['start_node_id'] in node.qnode_ids:
                    source_node_id = node.id
                if parameters['end_node_id'] in node.qnode_ids:
                    end_node_to_intermediate_node_set[node.id] = set()

            # now iterate over the edges to look for the ones we need to add  # TODO: Here, I won't care which direction the edges are pointing
            for edge in message.knowledge_graph.edges:
                if edge.source_id in intermediate_nodes:  # if source is intermediate
                    if edge.target_id in end_node_to_intermediate_node_set:
                        end_node_to_intermediate_node_set[edge.target_id].add((edge.source_id, edge.type))  # add source
                elif edge.target_id in intermediate_nodes:  # if target is intermediate
                    if edge.source_id in end_node_to_intermediate_node_set:
                        end_node_to_intermediate_node_set[edge.source_id].add((edge.target_id, edge.type))  # add target

            # now compute the actual jaccard indexes
            denom = len(intermediate_nodes)
            end_node_to_jaccard = dict()
            for end_node_id in end_node_to_intermediate_node_set:
                # TODO: add code here if you care about edge types
                numerator = len(end_node_to_intermediate_node_set[end_node_id])
                jacc = numerator / float(denom)
                end_node_to_jaccard[end_node_id] = jacc

            # now add them all as virtual edges

            # edge properties
            j_iter = 0
            now = datetime.now()
            #edge_type = parameters['virtual_edge_type']
            edge_type = 'has_jaccard_index_with'
            qedge_ids = [parameters['virtual_relation_label']]
            relation = parameters['virtual_relation_label']
            is_defined_by = "ARAX"
            defined_datetime = now.strftime("%Y-%m-%d %H:%M:%S")
            provided_by = "ARAX"
            confidence = None
            weight = None  # TODO: could make the jaccard index the weight
            try:
                source_id = source_node_id
            except:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.warning(
                    f"Source node id: {parameters['start_node_id']} not found in the KG. Perhaps the KG is empty?")
                #self.response.error(tb, error_code=error_type.__name__)

            # edge attribute properties
            description = f"Jaccard index based on intermediate query nodes {parameters['intermediate_node_id']}"
            attribute_type = 'data:1772'
            name = "jaccard_index"
            url = None

            # now actually add the virtual edges in
            for end_node_id, value in end_node_to_jaccard.items():
                edge_attribute = EdgeAttribute(type=attribute_type, name=name, value=value, url=url)
                id = f"J{j_iter}"
                j_iter += 1
                target_id = end_node_id
                edge = Edge(id=id, type=edge_type, relation=relation, source_id=source_id, target_id=target_id,
                            is_defined_by=is_defined_by, defined_datetime=defined_datetime, provided_by=provided_by,
                            confidence=confidence, weight=weight, edge_attributes=[edge_attribute], qedge_ids=qedge_ids)
                message.knowledge_graph.edges.append(edge)

            # Now add a q_edge the query_graph since I've added an extra edge to the KG
            q_edge = QEdge(id=relation, type=edge_type, relation=relation, source_id=parameters['start_node_id'], target_id=parameters['end_node_id'])  # TODO: ok to make the id and type the same thing?
            self.message.query_graph.edges.append(q_edge)

            return self.response
        except:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(f"Something went wrong when computing the Jaccard index")
            self.response.error(tb, error_code=error_type.__name__)
Beispiel #11
0
    def add_neighborhood_graph(self, nodes, edges, confidence=None):
        """
        Append a "neighborhood graph" result built from a networkx neo4j subgraph.

        One Node is created per entry of ``nodes`` and one Edge per entry of ``edges``; they are wrapped
        in a ResultGraph, attached to a new Result, and that Result is appended to the response's
        result list.

        :param nodes: nodes in the subgraph (g.nodes(data=True)); each item is (key, data) where data
            holds 'labels' and a 'properties' dict with 'name', 'UUID', 'accession', 'uri', 'id' and
            optionally 'description'
        :param edges: edges in the subgraph (g.edges(data=True)); each item is (u, v, data) where data
            holds 'type' and a 'properties' dict with 'provided_by', 'source_node_uuid', 'target_node_uuid'
        :param confidence: value stored on the result's ``confidence`` field (default None)
        :return: none
        :raises KeyError: if a required property is missing or an edge references an unknown node UUID
        """
        # Build the Node objects and remember which CURIE each node UUID maps to
        node_objects = []
        node_uuids2curie = dict()
        for _, data in nodes:
            properties = data['properties']

            # These are the actual CURIE IDS eg UBERON:00000941 (uri is the web address)
            curie_id = properties['id']
            curie_parts = curie_id.split(':')
            if curie_parts[0].upper() == "CHEMBL":
                curie_id = "CHEMBL:CHEMBL" + curie_parts[1]

            node = Node()
            node.id = curie_id
            # Every node also carries the generic 'Base' label; report one of the remaining labels
            node.type = list(set(data['labels']).difference({'Base'}))[0]
            node.name = properties['name']
            node.uri = properties['uri']
            node.accession = properties['accession']
            # A missing description is reported as the string "None"
            node.description = properties.get('description', "None")
            node_objects.append(node)
            node_uuids2curie[properties['UUID']] = curie_id

        # Build one Edge per input edge. Each edge is built straight from its own data rather than
        # via dicts keyed by (u, v), so parallel edges between the same node pair keep their own
        # type and provenance instead of all receiving those of the last one.
        edge_objects = []
        for _, _, data in edges:
            properties = data['properties']
            edge = Edge()
            edge.type = data['type']
            edge.source_id = node_uuids2curie[properties['source_node_uuid']]
            edge.target_id = node_uuids2curie[properties['target_node_uuid']]
            edge.provided_by = properties['provided_by']
            edge.is_defined_by = "RTX"
            edge_objects.append(edge)

        # Create the result (potential answer)
        result1 = Result()
        result1.text = (
            "This is a subgraph extracted from the full RTX knowledge graph, including nodes and edges relevant to the query."
            " This is not an answer to the query per se, but rather an opportunity to examine a small region of the RTX knowledge graph for further study. "
            "Formal answers to the query are below."
        )
        result1.confidence = confidence
        result1.result_type = "neighborhood graph"

        # Create a ResultGraph object and put the list of nodes and edges into it
        result_graph = ResultGraph()
        result_graph.node_list = node_objects
        result_graph.edge_list = edge_objects

        # Put the ResultGraph into the result (potential answer)
        result1.result_graph = result_graph

        # Put the result (potential answer) into the response
        self._result_list.append(result1)
        self.response.result_list = self._result_list
Beispiel #12
0
    def _convert_kg2_edge_to_swagger_edge(self, neo4j_edge: Dict[str, any]) -> Edge:
        """
        Translate a KG2 neo4j edge (a dict of edge properties) into a swagger Edge.

        The 'publications' and 'negated' properties are parsed with ast.literal_eval, and a fixed set
        of extra KG2 properties is attached as swagger EdgeAttribute objects.

        :param neo4j_edge: property dict of the KG2 edge
        :return: the populated swagger Edge
        """
        # Additional properties on KG2 edges that become swagger EdgeAttribute objects
        # TODO: fix issues coming from strange characters in 'publications_info'! (EOF error)
        extra_property_names = ["relation_curie", "simplified_relation_curie", "simplified_relation",
                                "edge_label"]

        kg2_edge = Edge()
        kg2_edge.id = f"KG2:{neo4j_edge.get('id')}"
        kg2_edge.type = neo4j_edge.get("simplified_edge_label")
        kg2_edge.source_id = neo4j_edge.get("subject")
        kg2_edge.target_id = neo4j_edge.get("object")
        kg2_edge.relation = neo4j_edge.get("relation")

        raw_publications = neo4j_edge.get("publications")
        kg2_edge.publications = ast.literal_eval(raw_publications)

        # Temporary hack until provided_by is fixed in KG2
        raw_provided_by = neo4j_edge.get("provided_by")
        kg2_edge.provided_by = self._convert_strange_provided_by_field_to_list(raw_provided_by)

        raw_negated = neo4j_edge.get("negated")
        kg2_edge.negated = ast.literal_eval(raw_negated)
        kg2_edge.is_defined_by = "ARAX/KG2"

        kg2_edge.edge_attributes = []
        kg2_edge.edge_attributes += self._create_swagger_attributes("edge", extra_property_names, neo4j_edge)
        return kg2_edge
Beispiel #13
0
    def _convert_kg2_edge_to_swagger_edge(self, neo4j_edge):
        """
        Translate a KG2 neo4j edge (a dict of edge properties) into a swagger Edge.

        The edge id is produced by self._create_edge_id once type, source and target are set. The
        'publications' and 'negated' properties are parsed with ast.literal_eval, and a fixed set of
        extra KG2 properties is attached as swagger EdgeAttribute objects.

        :param neo4j_edge: property dict of the KG2 edge
        :return: the populated swagger Edge
        """
        # Additional properties on KG2 edges that become swagger EdgeAttribute objects
        # TODO: fix issues coming from strange characters in 'publications_info'! (EOF error)
        extra_property_names = [
            'relation_curie',
            'simplified_relation_curie',
            'simplified_relation',
            'edge_label',
        ]

        kg2_edge = Edge()
        kg2_edge.type = neo4j_edge.get('simplified_edge_label')
        kg2_edge.source_id = neo4j_edge.get('subject')
        kg2_edge.target_id = neo4j_edge.get('object')
        # The id is created only after type/source/target are populated on the edge
        kg2_edge.id = self._create_edge_id(kg2_edge)
        kg2_edge.relation = neo4j_edge.get('relation')

        raw_publications = neo4j_edge.get('publications')
        kg2_edge.publications = ast.literal_eval(raw_publications)

        # Temporary hack until provided_by is fixed in KG2
        raw_provided_by = neo4j_edge.get('provided_by')
        kg2_edge.provided_by = self._convert_strange_provided_by_field_to_list(raw_provided_by)

        raw_negated = neo4j_edge.get('negated')
        kg2_edge.negated = ast.literal_eval(raw_negated)
        kg2_edge.is_defined_by = "ARAX/KG2"

        kg2_edge.edge_attributes = []
        kg2_edge.edge_attributes += self._create_swagger_attributes("edge", extra_property_names, neo4j_edge)
        return kg2_edge
Beispiel #14
0
    def add_subgraph(self, nodes, edges, plain_text, confidence):
        """
        Append a result built from a networkx neo4j subgraph and update the result count message.

        One Node is created per entry of ``nodes`` and one Edge per entry of ``edges``; they are wrapped
        in a ResultGraph, attached to a new Result, and that Result is appended to the response's
        result list. The result counter is then incremented and the response message refreshed.

        :param nodes: nodes in the subgraph (g.nodes(data=True)); each item is (key, data) where data
            holds 'labels' and a 'properties' dict with 'description', 'name', 'UUID', 'accession'
            and 'curie_id'
        :param edges: edges in the subgraph (g.edges(data=True)); each item is (u, v, data) where data
            holds 'type' and a 'properties' dict with 'sourcedb', 'source_node_uuid', 'target_node_uuid'
        :param plain_text: text stored on the result's ``text`` field
        :param confidence: value stored on the result's ``confidence`` field
        :return: none
        :raises KeyError: if a required property is missing or an edge references an unknown node UUID
        """
        # Build the Node objects and remember which CURIE each node UUID maps to
        node_objects = []
        node_uuids2curie = dict()
        for _, data in nodes:
            properties = data['properties']
            node = Node()
            node.id = properties['curie_id']
            # Every node also carries the generic 'Base' label; report one of the remaining labels
            node.type = list(set(data['labels']).difference({'Base'}))[0]
            node.name = properties['name']
            node.accession = properties['accession']
            node.description = properties['description']
            node_objects.append(node)
            node_uuids2curie[properties['UUID']] = node.id

        # Build one Edge per input edge. Each edge is built straight from its own data rather than
        # via dicts keyed by (u, v), so parallel edges between the same node pair keep their own
        # type and source database instead of all receiving those of the last one. Endpoints are
        # resolved UUID -> CURIE directly, which is the id assigned to the Node objects above.
        edge_objects = []
        for _, _, data in edges:
            properties = data['properties']
            edge = Edge()
            edge.type = data['type']
            edge.source_id = node_uuids2curie[properties['source_node_uuid']]
            edge.target_id = node_uuids2curie[properties['target_node_uuid']]
            # TODO: check with eric if this really should be a list and if it should contain the source DB('s)
            edge.origin_list = [properties['sourcedb']]
            edge_objects.append(edge)

        # Create the result (potential answer)
        result1 = Result()
        result1.text = plain_text
        result1.confidence = confidence

        # Create a ResultGraph object and put the list of nodes and edges into it
        result_graph = ResultGraph()
        result_graph.node_list = node_objects
        result_graph.edge_list = edge_objects

        # Put the ResultGraph into the result (potential answer)
        result1.result_graph = result_graph

        # Put the result (potential answer) into the response
        self._result_list.append(result1)
        self.response.result_list = self._result_list

        # Increment the number of results and report it (singular for exactly one result)
        self._num_results += 1
        noun = "result" if self._num_results == 1 else "results"
        self.response.message = f"{self._num_results} {noun} found"
Beispiel #15
0
    def compute_ngd(self):
        """
        Decorate the knowledge graph with normalized Google distance (NGD) edge attributes.

        If 'virtual_relation_label' is present in the parameters, a virtual edge carrying the NGD
        attribute is added between every (source, target) pair of knowledge graph nodes bound to
        'source_qnode_id' / 'target_qnode_id', and a matching QEdge is appended to the query graph
        when at least one such edge was added. Otherwise the NGD attribute is appended to the
        edge_attributes of every existing knowledge graph edge.

        :default: parameters['default_value'] is the value used for NGD whenever the computed
            distance is not finite (e.g. nan)
        :return: response
        """
        parameters = self.parameters
        self.response.debug(f"Computing NGD")
        self.response.info(f"Computing the normalized Google distance: weighting edges based on source/target node "
                           f"co-occurrence frequency in PubMed abstracts")

        self.response.info("Converting CURIE identifiers to human readable names")
        node_curie_to_name = dict()
        try:
            for node in self.message.knowledge_graph.nodes:
                node_curie_to_name[node.id] = node.name
        except Exception as err:
            tb = traceback.format_exc()
            self.response.error(f"Something went wrong when converting names")
            self.response.error(tb, error_code=type(err).__name__)

        self.response.warning(f"Utilizing API calls to NCBI eUtils, so this may take a while...")
        attribute_name = "normalized_google_distance"
        attribute_type = "data:2526"
        default_value = parameters['default_value']
        url = "https://arax.rtx.ai/api/rtx/v1/ui/#/PubmedMeshNgd"
        ngd_method_counts = {"fast": 0, "slow": 0}

        def make_ngd_attribute(source_curie, target_curie, source_name, target_name):
            """Compute the NGD for one node pair and wrap it in an EdgeAttribute."""
            ngd_value, method_used = self.NGD.get_ngd_for_all_fast([source_curie, target_curie],
                                                                   [source_name, target_name])
            ngd_method_counts[method_used] += 1
            # Fall back to the default for THIS pair only; a finite value from an earlier pair
            # must never leak into a later pair whose NGD is not finite.
            value = ngd_value if np.isfinite(ngd_value) else default_value
            return EdgeAttribute(type=attribute_type, name=attribute_name, value=str(value), url=url)

        # if you want to add virtual edges, identify the source/targets, decorate the edges, add them to the KG, and then add one to the QG corresponding to them
        if 'virtual_relation_label' in parameters:
            relation = parameters['virtual_relation_label']
            edge_type = "has_normalized_google_distance_with"
            source_curies_to_decorate = set()
            target_curies_to_decorate = set()
            curies_to_names = dict()
            # identify the nodes that we should be adding virtual edges for
            for node in self.message.knowledge_graph.nodes:
                if hasattr(node, 'qnode_ids'):
                    if parameters['source_qnode_id'] in node.qnode_ids:
                        source_curies_to_decorate.add(node.id)
                        curies_to_names[node.id] = node.name
                    if parameters['target_qnode_id'] in node.qnode_ids:
                        target_curies_to_decorate.add(node.id)
                        curies_to_names[node.id] = node.name

            added_flag = False  # check to see if any edges were added
            # iterate over all pairs of these nodes, add the virtual edge, decorate with the NGD attribute
            for source_curie, target_curie in itertools.product(source_curies_to_decorate, target_curies_to_decorate):
                source_name = curies_to_names[source_curie]
                target_name = curies_to_names[target_curie]
                self.response.debug(f"Computing NGD between {source_name} and {target_name}")
                edge_attribute = make_ngd_attribute(source_curie, target_curie, source_name, target_name)
                added_flag = True

                # now actually add the virtual edge in
                edge_id = f"{relation}_{self.global_iter}"
                self.global_iter += 1
                edge = Edge(id=edge_id, type=edge_type, relation=relation, source_id=source_curie,
                            target_id=target_curie,
                            is_defined_by="ARAX", defined_datetime=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                            provided_by="ARAX",
                            confidence=None, weight=None,  # TODO: could make the actual value of the attribute
                            edge_attributes=[edge_attribute], qedge_ids=[relation])
                self.message.knowledge_graph.edges.append(edge)

            # Now add a q_edge the query_graph since I've added an extra edge to the KG
            if added_flag:
                q_edge = QEdge(id=relation, type=edge_type, relation=relation,
                               source_id=parameters['source_qnode_id'],
                               target_id=parameters['target_qnode_id'])
                self.message.query_graph.edges.append(q_edge)
        else:  # you want to add it for each edge in the KG
            # iterate over KG edges, add the information
            try:
                for edge in self.message.knowledge_graph.edges:
                    # Make sure the edge_attributes are not None
                    if not edge.edge_attributes:
                        edge.edge_attributes = []  # should be an array, but why not a list?
                    # now go and actually get the NGD
                    source_curie = edge.source_id
                    target_curie = edge.target_id
                    ngd_edge_attribute = make_ngd_attribute(source_curie, target_curie,
                                                            node_curie_to_name[source_curie],
                                                            node_curie_to_name[target_curie])
                    edge.edge_attributes.append(ngd_edge_attribute)  # append it to the list of attributes
            except Exception as err:
                tb = traceback.format_exc()
                self.response.error(tb, error_code=type(err).__name__)
                self.response.error(f"Something went wrong adding the NGD edge attributes")
            else:
                self.response.info(f"NGD values successfully added to edges")
                self.response.debug(f"Used fast NGD for {ngd_method_counts['fast']} edges, back-up NGD method for {ngd_method_counts['slow']}")

        # Both branches hand the response back to the caller
        return self.response
    def add_virtual_edge(self, name="", default=0.):
        """
        Generic function to add virtual edges to the KG and a matching edge to the QG.

        For every (source, target) pair of knowledge graph nodes bound to the 'source_qnode_id' and
        'target_qnode_id' parameters, self.make_edge_attribute_from_curies is asked for an edge
        attribute; when it returns a truthy attribute, a virtual edge of type ``has_{name}_with``
        carrying that attribute is appended to the knowledge graph. If at least one edge was added,
        a QEdge with the 'virtual_relation_label' parameter as id and relation is appended to the
        query graph.

        :name: name of the functionality of the KP to use
        :default: default value forwarded to self.make_edge_attribute_from_curies
        """
        params = self.parameters
        kg = self.message.knowledge_graph

        # identify the nodes that we should be adding virtual edges for
        source_curies = set()
        target_curies = set()
        # FIXME: Super hacky way to get around the fact that COHD can't map CHEMBL drugs
        names_by_curie = dict()
        for kg_node in kg.nodes:
            if not hasattr(kg_node, 'qnode_ids'):
                continue
            if params['source_qnode_id'] in kg_node.qnode_ids:
                source_curies.add(kg_node.id)
                names_by_curie[kg_node.id] = kg_node.name
            if params['target_qnode_id'] in kg_node.qnode_ids:
                target_curies.add(kg_node.id)
                names_by_curie[kg_node.id] = kg_node.name

        virtual_edge_type = f"has_{name}_with"
        any_edge_added = False  # check to see if any edges were added

        # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute
        for source_curie, target_curie in itertools.product(source_curies, target_curies):
            # create the edge attribute if it can be
            attribute = self.make_edge_attribute_from_curies(
                source_curie,
                target_curie,
                source_name=names_by_curie[source_curie],
                target_name=names_by_curie[target_curie],
                default=default,
                name=name)
            if not attribute:
                continue
            any_edge_added = True

            # make the edge, add the attribute
            timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            relation_label = params['virtual_relation_label']
            virtual_edge_id = f"{relation_label}_{self.global_iter}"
            self.global_iter += 1
            virtual_edge = Edge(
                id=virtual_edge_id,
                type=virtual_edge_type,
                relation=relation_label,
                source_id=source_curie,
                target_id=target_curie,
                is_defined_by="ARAX",
                defined_datetime=timestamp,
                provided_by="ARAX",
                confidence=None,
                weight=None,  # TODO: could make the actual value of the attribute
                edge_attributes=[attribute],
                qedge_ids=[relation_label])
            kg.edges.append(virtual_edge)

        # Nothing was added to the KG, so the QG stays untouched
        if not any_edge_added:
            return

        # Now add a q_edge the query_graph since I've added an extra edge to the KG
        relation_label = params['virtual_relation_label']
        # TODO: ok to make the id and type the same thing?
        self.message.query_graph.edges.append(
            QEdge(id=relation_label,
                  type=virtual_edge_type,
                  relation=relation_label,
                  source_id=params['source_qnode_id'],
                  target_id=params['target_qnode_id']))
Beispiel #17
0
    def predict_drug_treats_disease(self):
        """
        Iterate over all the edges in the knowledge graph, add the drug-disease treatment probability for appropriate edges
        on the edge_attributes
        :return: response
        """
        parameters = self.parameters
        self.response.debug(f"Computing drug disease treatment probability based on a machine learning model")
        self.response.info(f"Computing drug disease treatment probability based on a machine learning model: See [this publication](https://doi.org/10.1101/765305) for more details about how this is accomplished.")

        attribute_name = "probability_treats"
        attribute_type = "EDAM:data_0951"
        value = 0  # this will be the default value. If the model returns 0, or the default is there, don't include that edge
        url = "https://doi.org/10.1101/765305"

        # if you want to add virtual edges, identify the source/targets, decorate the edges, add them to the KG, and then add one to the QG corresponding to them
        if 'virtual_relation_label' in parameters:
            source_curies_to_decorate = set()
            target_curies_to_decorate = set()
            # identify the nodes that we should be adding virtual edges for
            for node in self.message.knowledge_graph.nodes:
                if hasattr(node, 'qnode_ids'):
                    if parameters['source_qnode_id'] in node.qnode_ids:
                        if "drug" in node.type or "chemical_substance" in node.type:  # this is now NOT checked by ARAX_overlay
                            source_curies_to_decorate.add(node.id)
                    if parameters['target_qnode_id'] in node.qnode_ids:
                        if "disease" in node.type or "phenotypic_feature" in node.type:  # this is now NOT checked by ARAX_overlay
                            target_curies_to_decorate.add(node.id)

            added_flag = False  # check to see if any edges where added
            # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute

            for (source_curie, target_curie) in itertools.product(source_curies_to_decorate, target_curies_to_decorate):
                # create the edge attribute if it can be
                # loop over all equivalent curies and take the highest probability

                max_probability = 0
                converted_source_curie = self.convert_to_trained_curies(source_curie)
                converted_target_curie = self.convert_to_trained_curies(target_curie)
                if converted_source_curie is None or converted_target_curie is None:
                    continue
                res = list(itertools.product(converted_source_curie, converted_target_curie))
                if len(res) != 0:
                    all_probabilities = self.pred.prob_all(res)
                    if isinstance(all_probabilities, list):
                        max_probability = max([value for value in all_probabilities if np.isfinite(value)])

                value = max_probability

                #probability = self.pred.prob_single('ChEMBL:' + source_curie[22:], target_curie)  # FIXME: when this was trained, it was ChEMBL:123, not CHEMBL.COMPOUND:CHEMBL123
                #if probability and np.isfinite(probability):  # finite, that's ok, otherwise, stay with default
                #    value = probability[0]
                edge_attribute = EdgeAttribute(type=attribute_type, name=attribute_name, value=str(value), url=url)  # populate the edge attribute
                if edge_attribute and value != 0:
                    added_flag = True
                    # make the edge, add the attribute

                    # edge properties
                    now = datetime.now()
                    edge_type = "probably_treats"
                    qedge_ids = [parameters['virtual_relation_label']]
                    relation = parameters['virtual_relation_label']
                    is_defined_by = "ARAX"
                    defined_datetime = now.strftime("%Y-%m-%d %H:%M:%S")
                    provided_by = "ARAX"
                    confidence = None
                    weight = None  # TODO: could make the actual value of the attribute
                    source_id = source_curie
                    target_id = target_curie

                    # now actually add the virtual edges in
                    id = f"{relation}_{self.global_iter}"
                    self.global_iter += 1
                    edge = Edge(id=id, type=edge_type, relation=relation, source_id=source_id,
                                target_id=target_id,
                                is_defined_by=is_defined_by, defined_datetime=defined_datetime,
                                provided_by=provided_by,
                                confidence=confidence, weight=weight, edge_attributes=[edge_attribute], qedge_ids=qedge_ids)
                    self.message.knowledge_graph.edges.append(edge)

            # Now add a q_edge the query_graph since I've added an extra edge to the KG
            if added_flag:
                edge_type = "probably_treats"
                relation = parameters['virtual_relation_label']
                qedge_id = parameters['virtual_relation_label']
                q_edge = QEdge(id=relation, type=edge_type, relation=relation,
                               source_id=parameters['source_qnode_id'], target_id=parameters['target_qnode_id'])  # TODO: ok to make the id and type the same thing?
                self.message.query_graph.edges.append(q_edge)
            return self.response

        else:  # you want to add it for each edge in the KG
            # iterate over KG edges, add the information
            try:
                # map curies to types
                curie_to_type = dict()
                for node in self.message.knowledge_graph.nodes:
                    curie_to_type[node.id] = node.type
                # then iterate over the edges and decorate if appropriate
                for edge in self.message.knowledge_graph.edges:
                    # Make sure the edge_attributes are not None
                    if not edge.edge_attributes:
                        edge.edge_attributes = []  # should be an array, but why not a list?
                    # now go and actually get the NGD
                    source_curie = edge.source_id
                    target_curie = edge.target_id
                    source_types = curie_to_type[source_curie]
                    target_types = curie_to_type[target_curie]
                    if (("drug" in source_types) or ("chemical_substance" in source_types)) and (("disease" in target_types) or ("phenotypic_feature" in target_types)):
                        temp_value = 0
                        # loop over all pairs of equivalent curies and take the highest probability

                        max_probability = 0
                        converted_source_curie = self.convert_to_trained_curies(source_curie)
                        converted_target_curie = self.convert_to_trained_curies(target_curie)
                        if converted_source_curie is None or converted_target_curie is None:
                            continue
                        res = list(itertools.product(converted_source_curie, converted_target_curie))
                        if len(res) != 0:
                            all_probabilities = self.pred.prob_all(res)
                            if isinstance(all_probabilities, list):
                                max_probability = max([value for value in all_probabilities if np.isfinite(value)])

                        value = max_probability

                        #probability = self.pred.prob_single('ChEMBL:' + source_curie[22:], target_curie)  # FIXME: when this was trained, it was ChEMBL:123, not CHEMBL.COMPOUND:CHEMBL123
                        #if probability and np.isfinite(probability):  # finite, that's ok, otherwise, stay with default
                        #    value = probability[0]
                    elif (("drug" in target_types) or ("chemical_substance" in target_types)) and (("disease" in source_types) or ("phenotypic_feature" in source_types)):
                        #probability = self.pred.prob_single('ChEMBL:' + target_curie[22:], source_curie)  # FIXME: when this was trained, it was ChEMBL:123, not CHEMBL.COMPOUND:CHEMBL123
                        #if probability and np.isfinite(probability):  # finite, that's ok, otherwise, stay with default
                        #    value = probability[0]

                        max_probability = 0
                        converted_source_curie = self.convert_to_trained_curies(source_curie)
                        converted_target_curie = self.convert_to_trained_curies(target_curie)
                        if converted_source_curie is None or converted_target_curie is None:
                            continue
                        res = list(itertools.product(converted_target_curie, converted_source_curie))
                        if len(res) != 0:
                            all_probabilities = self.pred.prob_all(res)
                            if isinstance(all_probabilities, list):
                                max_probability = max([value for value in all_probabilities if np.isfinite(value)])

                        value = max_probability

                    else:
                        continue
                    if value != 0:
                        edge_attribute = EdgeAttribute(type=attribute_type, name=attribute_name, value=str(value), url=url)  # populate the attribute
                        edge.edge_attributes.append(edge_attribute)  # append it to the list of attributes
            except:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Something went wrong adding the drug disease treatment probability")
            else:
                self.response.info(f"Drug disease treatment probability successfully added to edges")

            return self.response
Beispiel #18
0
 def _remap_edge(edge: Edge, new_curie: str, old_curie: str) -> Edge:
     """
     Swap ``new_curie`` for ``old_curie`` on whichever endpoints of ``edge`` carry it.

     :param edge: edge whose ``source_id`` / ``target_id`` are inspected and updated in place
     :param new_curie: curie to look for on the edge's endpoints
     :param old_curie: curie written over every endpoint equal to ``new_curie``
     :return: the same ``edge`` object that was passed in
     """
     # Source is handled before target, each endpoint independently of the other
     for endpoint in ("source_id", "target_id"):
         if getattr(edge, endpoint) == new_curie:
             setattr(edge, endpoint, old_curie)
     return edge
Beispiel #19
0
    def add_subgraph(self,
                     nodes,
                     edges,
                     plain_text,
                     confidence,
                     return_result=False):
        """
        Populate the object model from a networkx neo4j subgraph and record it as one result.

        :param nodes: nodes in the subgraph (g.nodes(data=True)); every ``data`` dict must
            carry ``labels`` and a ``properties`` dict holding ``name``, ``UUID``,
            ``accession``, ``uri`` and ``id`` (``description`` is optional)
        :param edges: edges in the subgraph (g.edges(data=True)); every ``data`` dict must
            carry ``type`` and a ``properties`` dict holding ``provided_by``,
            ``source_node_uuid`` and ``target_node_uuid``
        :param plain_text: text stored on the new result
        :param confidence: confidence stored on the new result
        :param return_result: when true, hand the newly created Result back to the caller
        :return: the new Result if ``return_result`` is true, otherwise None
        :raises KeyError: if a required key is missing, or an edge refers to a node UUID
            that is not among ``nodes``
        :raises IndexError: if a node has no label other than 'Base'
        """

        # For each node, populate the relevant information
        node_objects = []
        uuid_to_curie = dict()
        for _, data in nodes:
            properties = data['properties']

            # These are the actual CURIE IDS eg UBERON:00000941 (uri is the web address)
            curie_id = properties['id']
            if curie_id.split(':')[0].upper() == "CHEMBL":
                # CHEMBL curies are rewritten to the CHEMBL:CHEMBL<suffix> form
                curie_id = "CHEMBL:CHEMBL" + curie_id.split(':')[1]
            uuid_to_curie[properties['UUID']] = curie_id

            node = Node()
            node.id = curie_id
            # Use a label other than 'Base'; the labels go through a set, so the
            # choice is arbitrary when more than one such label is present
            node.type = list(set(data['labels']).difference({'Base'}))[0]
            node.name = properties['name']
            node.uri = properties['uri']
            node.accession = properties['accession']
            if 'description' in properties:
                node.description = properties['description']
            else:
                node.description = "None"
            node_objects.append(node)

        # For each edge, create an edge between its endpoints.
        # Each Edge is built straight from its own data (not via lookups keyed on the
        # (u, v) pair) so that parallel edges between the same two nodes keep their own
        # type and provided_by, and endpoints are resolved by node UUID so that nodes
        # sharing a uri cannot be confused with one another.
        edge_objects = []
        for _, _, data in edges:
            properties = data['properties']
            edge = Edge()
            edge.type = data['type']
            edge.source_id = uuid_to_curie[properties['source_node_uuid']]
            edge.target_id = uuid_to_curie[properties['target_node_uuid']]
            edge.is_defined_by = "RTX"
            # TODO: check with eric whether the source DB should (also) go in an origin list
            edge.provided_by = properties['provided_by']
            edge_objects.append(edge)

        # Create the result (potential answer)
        result1 = Result()
        result1.text = plain_text
        result1.confidence = confidence

        # Create a ResultGraph object and put the list of nodes and edges into it
        result_graph = ResultGraph()
        result_graph.node_list = node_objects
        result_graph.edge_list = edge_objects

        # Put the ResultGraph into the result (potential answer)
        result1.result_graph = result_graph

        # Put the result (potential answer) into the response
        self._result_list.append(result1)
        self.response.result_list = self._result_list

        # Increment the number of results and refresh the summary message
        self._num_results += 1
        noun = "result" if self._num_results == 1 else "results"
        self.response.message = f"{self._num_results} {noun} found"

        if return_result:
            return result1
        return None
Beispiel #20
0
    def test1(self):
        """
        Build a hard-coded example Response and print it.

        The response holds a single result whose graph consists of one disease
        node, one protein node (with one node attribute) and one edge from the
        disease to the protein (with one origin carrying one edge attribute).
        ``self`` is not used; nothing is returned.
        """

        #### Create the response object and fill it with attributes about the response
        response = Response()
        response.context = "http://translator.ncats.io"
        response.id = "http://rtx.ncats.io/api/v1/response/1234"
        response.type = "medical_translator_query_response"
        response.tool_version = "RTX 0.4"
        response.schema_version = "0.5"
        # Timestamp is the current time as returned by datetime.now(), formatted as a string
        response.datetime = datetime.datetime.now().strftime(
            "%Y-%m-%d %H:%M:%S")
        response.original_question_text = "what proteins are affected by sickle cell anemia"
        response.restated_question_text = "Which proteins are affected by sickle cell anemia?"
        response.result_code = "OK"
        response.message = "1 result found"

        #### Create a disease node
        node1 = Node()
        node1.id = "http://omim.org/entry/603903"
        node1.type = "disease"
        node1.name = "sickle cell anemia"
        node1.accession = "OMIM:603903"
        node1.description = "A disease characterized by chronic hemolytic anemia..."

        #### Create a protein node
        node2 = Node()
        node2.id = "https://www.uniprot.org/uniprot/P00738"
        node2.type = "protein"
        node2.name = "Haptoglobin"
        node2.symbol = "HP"
        node2.accession = "UNIPROT:P00738"
        node2.description = "Haptoglobin captures, and combines with free plasma hemoglobin..."

        #### Create a node attribute and attach it to the protein node
        node2attribute1 = NodeAttribute()
        node2attribute1.type = "comment"
        node2attribute1.name = "Complex_description"
        node2attribute1.value = "The Hemoglobin/haptoglobin complex is composed of a haptoglobin dimer bound to two hemoglobin alpha-beta dimers"
        node2.node_attributes = [node2attribute1]

        #### Create an edge between these 2 nodes (disease node -> protein node)
        edge1 = Edge()
        edge1.type = "is_caused_by_a_defect_in"
        edge1.source_id = node1.id
        edge1.target_id = node2.id
        edge1.confidence = 1.0

        #### Add an origin and property for the edge
        origin1 = Origin()
        origin1.id = "https://api.monarchinitiative.org/api/bioentity/disease/OMIM:603903/genes/"
        origin1.type = "Monarch_BioLink_API_Relationship"

        #### Add an attribute; it hangs off the origin, and the origin hangs off the edge
        attribute1 = EdgeAttribute()
        attribute1.type = "PubMed_article"
        attribute1.name = "Orthopaedic Manifestations of Sickle Cell Disease"
        attribute1.value = None
        attribute1.url = "https://www.ncbi.nlm.nih.gov/pubmed/29309293"
        origin1.attribute_list = [attribute1]
        edge1.origin_list = [origin1]

        #### Create the first result (potential answer)
        result1 = Result()
        result1.id = "http://rtx.ncats.io/api/v1/response/1234/result/2345"
        result1.text = "A free text description of this result"
        result1.confidence = 0.932

        #### Create a ResultGraph object and put the list of nodes and edges into it
        result_graph = ResultGraph()
        result_graph.node_list = [node1, node2]
        result_graph.edge_list = [edge1]

        #### Put the ResultGraph into the first result (potential answer)
        result1.result_graph = result_graph

        #### Put the first result (potential answer) into the response
        result_list = [result1]
        response.result_list = result_list

        #### Print the assembled response
        print(response)