Example 1
 def answer(source_list,
            source_type,
            target_type,
            use_json=False,
            num_show=20,
            rel_type=None):
     """
     Answers the question 'what pathways are most enriched by $protein_list?'
     :param source_list: A list of source node ids
     :param source_type: The source node label
     :param target_type: The target node label
     :param use_json: bool, use JSON output
     :param num_show: int, number to display
     :return: none
     """
     if RU.does_connect(source_list, source_type, target_type) != 1:
         error_message = "I found no %s connected to any element of %s" % (
             target_type, str(source_list))
         if not use_json:
             print(error_message)
             return
         else:
             error_code = "NoPathsFound"
             response = FormatOutput.FormatResponse(3)
             response.add_error_message(error_code, error_message)
             response.print()
             return
     (target_dict, target_list) = RU.top_n_fisher_exact(source_list,
                                                        source_type,
                                                        target_type,
                                                        n=num_show,
                                                        rel_type=rel_type)
     target_list.reverse()
     return (target_dict, target_list)
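The heavy lifting above is delegated to RU.top_n_fisher_exact, which lives in the project's ReasoningUtilities module and is not shown here. As a rough, hypothetical illustration of the kind of enrichment test it presumably performs, here is a minimal sketch using scipy.stats.fisher_exact; the 2x2 contingency table construction and the variable names are assumptions for illustration, not the project's actual query logic.

# Hypothetical sketch of a Fisher-exact enrichment test (not the RTX implementation).
from scipy.stats import fisher_exact

def enrichment_p_value(n_sources_hitting_target, n_sources,
                       n_background_hitting_target, n_background):
    # 2x2 contingency table:
    #                  hits target   does not hit target
    #   source list        a                b
    #   background         c                d
    a = n_sources_hitting_target
    b = n_sources - a
    c = n_background_hitting_target
    d = n_background - c
    _, p_value = fisher_exact([[a, b], [c, d]], alternative="greater")
    return p_value

# Example: 8 of 10 source proteins touch the pathway vs. 50 of 10000 background proteins.
print(enrichment_p_value(8, 10, 50, 10000))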
Example 2
def other_connection_types():
    # Besides direct disease->phenotype connections, here is a list of other possible connections
    # one is parent of
    print("one")
    node_label_list = [disease_label, "phenotypic_feature"]
    relationship_label_list = ["subclass_of", "has_phenotype", "has_phenotype"]
    node_of_interest_position = 1
    print(
        RU.count_nodes_of_type_on_path_of_type_to_label(
            disease_ID,
            disease_label,
            target_label,
            node_label_list,
            relationship_label_list,
            node_of_interest_position,
            debug=True))
    names2counts, names2nodes = RU.count_nodes_of_type_on_path_of_type_to_label(
        disease_ID, disease_label, target_label, node_label_list,
        relationship_label_list, node_of_interest_position)
    for ID in names2counts.keys():
        if names2counts[ID] / float(
                len(disease_phenotypes_set)
        ) >= threshold:  # if it's below this threshold, no way the Jaccard index will be large enough
            other_disease_IDs_to_intersection_counts[ID] = names2counts[ID]

    # other is parent of
    print("other")
    node_label_list = ["phenotypic_feature", target_label]
    relationship_label_list = ["has_phenotype", "has_phenotype", "subclass_of"]
    node_of_interest_position = 0
    names2counts, names2nodes = RU.count_nodes_of_type_on_path_of_type_to_label(
        disease_ID, disease_label, target_label, node_label_list,
        relationship_label_list, node_of_interest_position)
    for ID in names2counts.keys():
        if names2counts[ID] / float(
                len(disease_phenotypes_set)
        ) >= threshold:  # if it's below this threshold, no way the Jaccard index will be large enough
            other_disease_IDs_to_intersection_counts[ID] = names2counts[ID]

    # Both is parent of
    print("both")
    node_label_list = [disease_label, "phenotypic_feature", target_label]
    relationship_label_list = [
        "subclass_of", "has_phenotype", "has_phenotype", "subclass_of"
    ]
    node_of_interest_position = 1
    names2counts, names2nodes = RU.count_nodes_of_type_on_path_of_type_to_label(
        disease_ID, disease_label, target_label, node_label_list,
        relationship_label_list, node_of_interest_position)
    for ID in names2counts.keys():
        if names2counts[ID] / float(
                len(disease_phenotypes_set)
        ) >= threshold:  # if it's below this threshold, no way the Jaccard index will be large enough
            other_disease_IDs_to_intersection_counts[ID] = names2counts[ID]
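The pruning step above (and its twins in the later examples) rests on a simple bound: because |A ∪ B| >= |A|, the Jaccard index |A ∩ B| / |A ∪ B| can never exceed |A ∩ B| / |A|, so any candidate whose intersection count divided by the source's phenotype count is already below the threshold can be discarded before the full union is ever computed. A small self-contained illustration (the set contents are made up):

# Illustration of the prefilter: intersection/|A| bounds the Jaccard index from above.
source_phenotypes = {"HP:1", "HP:2", "HP:3", "HP:4"}        # A
candidate_phenotypes = {"HP:4", "HP:5", "HP:6"}             # B
threshold = 0.5

intersection = len(source_phenotypes & candidate_phenotypes)                    # 1
upper_bound = intersection / float(len(source_phenotypes))                      # 1/4 = 0.25
jaccard = intersection / float(len(source_phenotypes | candidate_phenotypes))   # 1/6 ~= 0.17

assert jaccard <= upper_bound
if upper_bound < threshold:
    print("candidate can be discarded without ever computing the union")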
Example 3
	def restate_question(self, input_parameters):
		"""
		Restates a question.
		:param parameters: a dictionary with keys given by self.parameters.keys()
		:return: string
		"""
		# First, get rid of the Nones since they substitute in an ugly way
		parameters = dict()
		for key, value in input_parameters.items():
			if value is not None:
				parameters[key] = value

		# Try to get the description of each node
		parameters_as_descriptions = dict()
		if parameters:
			for parameter in parameters:
				try:
					description = RU.get_node_property(parameters[parameter], 'description')
				except Exception:
					description = parameters[parameter]
				parameters_as_descriptions[parameter] = description

		# Lastly, make the template substitution
		if parameters_as_descriptions:
			restated = self.restated_question_template.safe_substitute(parameters_as_descriptions)
		else:
			restated = self.restated_question_template.safe_substitute({})
		return restated
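restate_question relies on safe_substitute, so restated_question_template is presumably a string.Template: safe_substitute fills in whatever placeholders it can and leaves the rest untouched instead of raising KeyError, which is why dropping the None parameters above is safe. A minimal standalone illustration (the template text is made up):

from string import Template

# Hypothetical template similar in spirit to self.restated_question_template.
template = Template("What $target_label are connected to $source_name?")

# Only one placeholder is known; safe_substitute leaves the other one as-is.
print(template.safe_substitute({"source_name": "malaria"}))
# -> What $target_label are connected to malaria?

print(template.safe_substitute({"source_name": "malaria", "target_label": "proteins"}))
# -> What proteins are connected to malaria?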
Example 4
 def describe(self):
     output = "Answers questions of the form: 'What proteins does tranilast target?' and 'What genes are affected by " \
        "Fanconi anemia?'" + "\n"
     output += "You can ask: 'What X does Y Z?' where X is one of the following: \n"
     for label in RU.get_node_labels():
         output = output + label + "\n"
     output += "\n The term Y is any of the nodes that are in our graph (currently " + str(
         RU.count_nodes()) + " nodes in total). \n"
     output += "\n The term Z is any relationship of the following kind: \n"
     for rel in RU.get_relationship_types():
         rel_split = rel.split("_")
         for term in rel_split:
             output += term + " "
         output += "\n"
     output += "Assumes that Z directly connects X and Y."
     return output
Example 5
def test_correct_question():
	"""
	The point of this test is to form a set of sentences, match them against all question corpora, and make sure the
	correct question template is matched.
	:return: None
	"""
	# get a random selection of nodes
	property_to_nodes = dict()
	for label in RU.get_node_labels():
		nodes = RU.get_random_nodes(label, property="description")
		property_to_nodes[label] = nodes

	# import the questions
	questions = []
	with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'Questions.tsv'), 'r') as fid:
		for line in fid.readlines():
			if line[0] == "#":
				pass
			else:
				questions.append(Question(line))

	# form the corpora
	corpora = [q.corpus for q in questions]

	for q in questions:
		# populate the sentence template
		parameters = dict()

		# ignore the what is question
		if q.parameter_names and q.parameter_names[0] != "term":
			for label in q.parameter_names:
				node = random.choice(property_to_nodes[label])
				parameters[label] = node
			input_sentence = q.restate_question(parameters)
			input_sentence = input_sentence.strip(string.punctuation)

			# Run it against all the questions
			(corpus_index, similarity) = wd.find_corpus(input_sentence, corpora)
			if questions[corpus_index].restated_question_template.template != q.restated_question_template.template:
				temp_parameters = questions[corpus_index].get_parameters(input_sentence)
				# test if the parameters were populated
				if all([val is not None for val in temp_parameters.values()]):
					print("Bad classification! input: %s\n matched template: %s" % (input_sentence, questions[corpus_index].restated_question_template.template))
					print(questions[corpus_index].get_parameters(input_sentence))
Example 6
	def get_similar_nodes_in_common_parameters(node_ID, target_node_label, association_node_label):
		"""
		This function will get the parameters for get_similar_nodes_in_common based on target node, target label, and association label
		:param node_ID: source node ID (name in KG)
		:param target_label: the node types that you want returned
		:param association_node_label: the association node (node in common between source and target) type
		:return: dict, error_code, error_message (dict keys input_node_ID, input_node_label, association_node_label, input_association_relationship,
				target_association_relationship, target_node_label)
		"""
		# Check if node exists
		if not RU.node_exists_with_property(node_ID, 'id'):
			error_message = "Sorry, the disease %s is not yet in our knowledge graph." % node_ID
			error_code = "DiseaseNotFound"
			return dict(), error_code, error_message

		# Get label/kind of node the source is
		input_node_label = RU.get_node_property(node_ID, "label")
		input_node_ID = node_ID

		# Get relationship between source and association label
		rels = RU.get_relationship_types_between(input_node_ID, input_node_label, "", association_node_label, max_path_len=1)
		# TODO: there could be multiple relationship types, for now, let's just pop one
		if not rels:
			error_code = "NoRelationship"
			error_message = "Sorry, the %s %s is not connected to any %s." % (input_node_label, input_node_ID, association_node_label)
			parent = RU.get_one_hop_target(input_node_label, input_node_ID, input_node_label, "subclass_of", direction="r")
			if parent:
				parent = parent.pop()
				error_message += "\n Note that %s is a parent of %s, so you might try that instead." % (
				RU.get_node_property(parent, 'name'), RU.get_node_property(input_node_ID, 'name'))
			return dict(), error_code, error_message
		input_association_relationship = rels.pop()

		# Get relationship between target and association label
		rels = RU.get_relationship_types_between("", target_node_label, "", association_node_label, max_path_len=1)
		if not rels:
			error_code = "NoRelationship"
			error_message = "Sorry, no %s is not connected to any %s." % (target_node_label, association_node_label)
			return dict(), error_code, error_message
		target_association_relationship = rels.pop()
		# TODO: kludgy fix for microRNA's having multiple relationship types, only one of which shows up frequently
		if target_association_relationship == "gene_mutations_contribute_to":
			target_association_relationship = "gene_associated_with_condition"

		# populate the arguments
		arguments = dict(input_node_ID=input_node_ID,
						input_node_label=input_node_label,
						association_node_label=association_node_label,
						input_association_relationship=input_association_relationship,
						target_association_relationship=target_association_relationship,
						target_node_label=target_node_label)
		return arguments, None, None
Example 7
    def answer(source_node_ID,
               target_node_type,
               association_node_type,
               use_json=False,
               threshold=0.2,
               n=20):
        """
		Answers the question what X are similar to Y based on overlap of common Z nodes. X is target_node_type,
		Y is source_node_ID, Z is association_node_type. The relationships are automatically determined in
		SimilarNodesInCommon by looking for 1 hop relationships and poping the FIRST one (you are warned).
		:param source_node_ID: actual name in the KG
		:param target_node_type: kinds of nodes you want returned
		:param association_node_type: kind of node you are computing the Jaccard overlap on
		:param use_json: print the results in standardized format
		:param threshold: only return results where jaccard is >= this threshold
		:param n: number of results to return (default 20)
		:return: reponse (or printed text)
		"""

        # Initialize the response class
        response = FormatOutput.FormatResponse(5)
        # add the column names for the row data
        response.message.table_column_names = [
            "source name", "source ID", "target name", "target ID",
            "Jaccard index"
        ]

        # Initialize the similar nodes class
        similar_nodes_in_common = SimilarNodesInCommon.SimilarNodesInCommon()

        # get the description
        source_node_description = RU.get_node_property(source_node_ID, 'name')

        # get the source node label
        source_node_label = RU.get_node_property(source_node_ID, 'label')

        # Get the nodes in common
        node_jaccard_tuples_sorted, error_code, error_message = similar_nodes_in_common.get_similar_nodes_in_common_source_target_association(
            source_node_ID, target_node_type, association_node_type, threshold)

        # reduce to the top n results
        if len(node_jaccard_tuples_sorted) > n:
            node_jaccard_tuples_sorted = node_jaccard_tuples_sorted[0:n]

        # make sure that the input node isn't in the list
        node_jaccard_tuples_sorted = [
            i for i in node_jaccard_tuples_sorted if i[0] != source_node_ID
        ]

        # check for an error
        if error_code is not None or error_message is not None:
            if not use_json:
                print(error_message)
                return
            else:
                response.add_error_message(error_code, error_message)
                response.print()
                return

        #### If use_json not specified, then return results as a fairly plain list
        if not use_json:
            to_print = "The %s's involving similar %ss as %s are: \n" % (
                target_node_type, association_node_type,
                source_node_description)
            for other_disease_ID, jaccard in node_jaccard_tuples_sorted:
                to_print += "%s\t%s\tJaccard %f\n" % (
                    other_disease_ID,
                    RU.get_node_property(other_disease_ID, 'name'), jaccard)
            print(to_print)

        #### Else if use_json requested, return the results in the Translator standard API JSON format
        else:

            #### Create the QueryGraph for this type of question
            query_graph = QueryGraph()
            source_node = QNode()
            source_node.id = "n00"
            source_node.curie = source_node_ID
            source_node.type = source_node_label
            association_node = QNode()
            association_node.id = "n01"
            association_node.type = association_node_type
            association_node.is_set = True
            target_node = QNode()
            target_node.id = "n02"
            target_node.type = target_node_type
            query_graph.nodes = [source_node, association_node, target_node]

            #source_association_relationship_type = "unknown1"
            edge1 = QEdge()
            edge1.id = "en00-n01"
            edge1.source_id = "n00"
            edge1.target_id = "n01"
            #edge1.type = source_association_relationship_type

            #association_target_relationship_type = "unknown2"
            edge2 = QEdge()
            edge2.id = "en01-n02"
            edge2.source_id = "n01"
            edge2.target_id = "n02"
            #edge2.type = association_target_relationship_type

            query_graph.edges = [edge1, edge2]

            #### Do not suppress the query_graph, since with v0.9.1 we can now produce the knowledge_map
            response.message.query_graph = query_graph

            #### Create a mapping dict with the source curie and node types and edge types. This dict is used for reverse lookups by type
            #### for mapping to the QueryGraph. There is a potential point of failure here if there are duplicate node or edge types. FIXME
            response._type_map = dict()
            response._type_map[source_node.curie] = source_node.id
            response._type_map[association_node.type] = association_node.id
            response._type_map[target_node.type] = target_node.id
            response._type_map["e" + edge1.source_id + "-" +
                               edge1.target_id] = edge1.id
            response._type_map["e" + edge2.source_id + "-" +
                               edge2.target_id] = edge2.id

            #### Extract the sorted IDs from the list of tuples
            node_jaccard_ID_sorted = [
                id for id, jac in node_jaccard_tuples_sorted
            ]

            # print(RU.return_subgraph_through_node_labels(source_node_ID, source_node_label, node_jaccard_ID_sorted, target_node_type,
            #										[association_node_type], with_rel=[], directed=True, debug=True))

            # get the entire subgraph
            g = RU.return_subgraph_through_node_labels(source_node_ID,
                                                       source_node_label,
                                                       node_jaccard_ID_sorted,
                                                       target_node_type,
                                                       [association_node_type],
                                                       with_rel=[],
                                                       directed=False,
                                                       debug=False)

            # extract the source_node_number
            for node, data in g.nodes(data=True):
                if data['properties']['id'] == source_node_ID:
                    source_node_number = node
                    break

            # Get all the target numbers
            target_id2numbers = dict()
            node_jaccard_ID_sorted_set = set(node_jaccard_ID_sorted)
            for node, data in g.nodes(data=True):
                if data['properties']['id'] in node_jaccard_ID_sorted_set:
                    target_id2numbers[data['properties']['id']] = node

            for other_disease_ID, jaccard in node_jaccard_tuples_sorted:
                target_name = RU.get_node_property(other_disease_ID, 'name')
                to_print = "The %s %s involves similar %ss as %s with similarity value %f" % (
                    target_node_type, target_name, association_node_type,
                    source_node_description, jaccard)

                # get all the shortest paths between source and target
                all_paths = nx.all_shortest_paths(
                    g, source_node_number, target_id2numbers[other_disease_ID])

                # get all the nodes on these paths
                rel_nodes = set()
                for path in all_paths:
                    for node in path:
                        rel_nodes.add(node)

                if rel_nodes:
                    # extract the relevant subgraph
                    sub_g = nx.subgraph(g, rel_nodes)

                    # add it to the response
                    res = response.add_subgraph(sub_g.nodes(data=True),
                                                sub_g.edges(data=True),
                                                to_print,
                                                jaccard,
                                                return_result=True)
                    res.essence = "%s" % target_name  # populate with essence of question result
                    res.essence_type = target_node_type
                    row_data = []  # initialize the row data
                    row_data.append("%s" % source_node_description)
                    row_data.append("%s" % source_node_ID)
                    row_data.append("%s" % target_name)
                    row_data.append("%s" % other_disease_ID)
                    row_data.append("%f" % jaccard)
                    res.row_data = row_data

            response.print()
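The subgraph extraction in the JSON branch above uses two standard networkx calls: nx.all_shortest_paths to enumerate every shortest source-target path, and nx.subgraph to take the induced subgraph on the nodes those paths touch. A minimal standalone sketch of that pattern, on a made-up toy graph rather than the KG subgraph returned by RU.return_subgraph_through_node_labels:

import networkx as nx

# Toy graph standing in for the knowledge-graph subgraph.
g = nx.Graph()
g.add_edges_from([("source", "pheno1"), ("source", "pheno2"),
                  ("pheno1", "target"), ("pheno2", "target"),
                  ("pheno2", "unrelated")])

# Collect every node that lies on some shortest path between source and target ...
rel_nodes = set()
for path in nx.all_shortest_paths(g, "source", "target"):
    rel_nodes.update(path)

# ... and keep only the induced subgraph on those nodes.
sub_g = nx.subgraph(g, rel_nodes)
print(sorted(sub_g.nodes()))    # ['pheno1', 'pheno2', 'source', 'target']
print(sub_g.number_of_edges())  # 4 ('unrelated' and its edge are dropped)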
Example 8
def old_answer(disease_ID, use_json=False, threshold=0.2):
    # This is about 5 times slower than the current answer, but is a bit clearer in how it's coded
    # Initialize the response class
    response = FormatOutput.FormatResponse(4)

    # Check if node exists
    if not RU.node_exists_with_property(disease_ID, 'name'):
        error_message = "Sorry, the disease %s is not yet in our knowledge graph." % disease_ID
        error_code = "DiseaseNotFound"
        if not use_json:
            print(error_message)
            return 1
        else:
            response.add_error_message(error_code, error_message)
            response.print()
            return 1

    # Get label/kind of node the source is
    disease_label = RU.get_node_property(disease_ID, "label")
    if disease_label != "disease" and disease_label != "disease":
        error_message = "Sorry, the input has label %s and needs to be one of: disease, disease." \
            " Please try a different term" % disease_label
        error_code = "NotADisease"
        if not use_json:
            print(error_message)
            return 1
        else:
            response.add_error_message(error_code, error_message)
            response.print()
            return 1

    # get the description
    disease_description = RU.get_node_property(disease_ID, 'description')

    # get the phenotypes associated to the disease
    disease_phenotypes = RU.get_one_hop_target(disease_label, disease_ID,
                                               "phenotypic_feature",
                                               "has_phenotype")

    # Look more steps out if we didn't find any phenotypes via a direct edge
    if not disease_phenotypes:
        for max_path_len in range(2, 5):
            disease_phenotypes = RU.get_node_names_of_type_connected_to_target(
                disease_label,
                disease_ID,
                "phenotypic_feature",
                max_path_len=max_path_len,
                direction="u")
            if disease_phenotypes:
                break
    # print("Total of %d phenotypes" % len(disease_phenotypes))

    # Make sure you actually picked up at least one phenotype
    if not disease_phenotypes:
        error_message = "No phenotypes found for this disease."
        error_code = "NoPhenotypesFound"
        if not use_json:
            print(error_message)
            return 1
        else:
            response.add_error_message(error_code, error_message)
            response.print()
            return 1
    disease_phenotypes_set = set(disease_phenotypes)

    # get all the other disease that connect and get the phenotypes in common
    other_disease_IDs_to_intersection_counts = dict()
    for target_label in ["disease", "disease"]:

        # direct connection
        # print("direct")
        node_label_list = ["phenotypic_feature"]
        relationship_label_list = ["has_phenotype", "has_phenotype"]
        node_of_interest_position = 0
        names2counts, names2nodes = RU.count_nodes_of_type_on_path_of_type_to_label(
            disease_ID, disease_label, target_label, node_label_list,
            relationship_label_list, node_of_interest_position)
        for ID in names2counts.keys():
            if names2counts[ID] / float(
                    len(disease_phenotypes_set)
            ) >= threshold:  # if it's below this threshold, no way the Jaccard index will be large enough
                other_disease_IDs_to_intersection_counts[ID] = names2counts[ID]

    if not other_disease_IDs_to_intersection_counts:
        error_code = "NoDiseasesFound"
        error_message = "No diseases were found with similarity crossing the threshold of %f." % threshold
        parent = RU.get_one_hop_target(disease_label, disease_ID,
                                       disease_label, "subclass_of")
        if parent:
            parent = parent.pop()
            error_message += "\n Note that %s is a parent disease to %s, so you might try that instead." % (
                RU.get_node_property(parent,
                                     'description'), disease_description)
        if not use_json:
            print(error_message)
            return 1
        else:
            response.add_error_message(error_code, error_message)
            response.print()
            return 1

    # print("Total number of other diseases %d" % len(list(other_disease_IDs_to_intersection_counts.keys())))
    # Now for each of the diseases in here, compute the actual Jaccard index
    disease_jaccard_tuples = []
    # i = 0
    for other_disease_ID in other_disease_IDs_to_intersection_counts.keys():
        # print(i)
        # i += 1
        # print(other_disease_ID)
        # get the phenotypes associated to the disease
        if other_disease_ID.split(":")[0] == "DOID":
            other_disease_label = "disease"
        if other_disease_ID.split(":")[0] == "OMIM":
            other_disease_label = "disease"
        other_disease_phenotypes = RU.get_one_hop_target(
            other_disease_label, other_disease_ID, "phenotypic_feature",
            "has_phenotype")

        # Look more steps out if we didn't find any phenotypes via a direct edge
        if not other_disease_phenotypes:
            for max_path_len in range(2, 5):
                other_disease_phenotypes = RU.get_node_names_of_type_connected_to_target(
                    other_disease_label,
                    other_disease_ID,
                    "phenotypic_feature",
                    max_path_len=max_path_len,
                    direction="u")
                if other_disease_phenotypes:
                    break

        # compute the Jaccard index
        if not other_disease_phenotypes:
            jaccard = 0
        else:
            other_disease_phenotypes_set = set(other_disease_phenotypes)
            jaccard = other_disease_IDs_to_intersection_counts[
                other_disease_ID] / float(
                    len(
                        list(
                            disease_phenotypes_set.union(
                                other_disease_phenotypes_set))))
        # print("jaccard %f" % jaccard)
        if jaccard > threshold:
            disease_jaccard_tuples.append((other_disease_ID, jaccard))

    # Format the results.
    # Maybe nothing passed the threshold
    if not disease_jaccard_tuples:
        error_code = "NoDiseasesFound"
        error_message = "No diseases were found with similarity crossing the threshold of %f." % threshold
        parent = RU.get_one_hop_target(disease_label, disease_ID,
                                       disease_label, "subclass_of")
        if parent:
            parent = parent.pop()
            error_message += "\n Note that %s is a parent disease to %s, so you might try that instead." % (
                RU.get_node_property(parent,
                                     'description'), disease_description)
        if not use_json:
            print(error_message)
            return 1
        else:
            response.add_error_message(error_code, error_message)
            response.print()
            return 1

    # Otherwise there are results to return, first sort them largest to smallest
    disease_jaccard_tuples_sorted = [(x, y) for x, y in sorted(
        disease_jaccard_tuples, key=lambda pair: pair[1], reverse=True)]
    if not use_json:
        to_print = "The diseases similar to %s are: \n" % disease_description
        for other_disease_ID, jaccard in disease_jaccard_tuples_sorted:
            to_print += "%s\t%s\tJaccard %f\n" % (
                other_disease_ID,
                RU.get_node_property(other_disease_ID, 'description'), jaccard)
        print(to_print)
    else:
        for other_disease_ID, jaccard in disease_jaccard_tuples_sorted:
            to_print = "%s is similar to the disease %s with similarity value %f" % (
                disease_description,
                RU.get_node_property(other_disease_ID, 'description'), jaccard)
            g = RU.get_node_as_graph(other_disease_ID)
            response.add_subgraph(g.nodes(data=True), g.edges(data=True),
                                  to_print, jaccard)
        response.print()
Example 9
	def get_similar_nodes_in_common(input_node_ID, input_node_label, association_node_label, input_association_relationship,
				target_association_relationship, target_node_label, threshold=0.2):
		"""
		This function returns the nodes that are associated with an input node based on Jaccard index similarity of
		shared intermediate nodes
		:param input_node_ID: input node ID (in KG)
		:param input_node_label: label of the input node
		:param association_node_label: what kind of node you want to calculate the Jaccard index with
		:param input_association_relationship: how the input node is connected to the association nodes
		:param target_association_relationship: how the target node is connected to the association node
		:param target_node_label: what kind of target nodes to return
		:param threshold: only return target nodes whose Jaccard index exceeds this threshold
		:return: a list of (target node, Jaccard index) tuples sorted by decreasing Jaccard index, an error_code, and an error_message
		"""
		# get the description
		input_node_description = RU.get_node_property(input_node_ID, 'name')

		# get the nodes associated to the input node
		input_node_associated_nodes = RU.get_one_hop_target(input_node_label, input_node_ID, association_node_label,
															input_association_relationship)

		# Look more steps out if we didn't find any associated nodes via a direct edge
		if not input_node_associated_nodes:
			for max_path_len in range(2, 5):
				input_node_associated_nodes = RU.get_node_names_of_type_connected_to_target(input_node_label, input_node_ID,
																			association_node_label,
																			max_path_len=max_path_len,
																			direction="u")
				if input_node_associated_nodes:
					break

		# Make sure you actually picked up at least one associated node
		if not input_node_associated_nodes:
			error_code = "NoNodesFound"
			error_message = "No %s found for %s." % (association_node_label, input_node_description)
			return [], error_code, error_message

		input_node_associated_nodes_set = set(input_node_associated_nodes)

		# get all the other nodes that connect and count the association nodes in common
		# direct connection
		node_label_list = [association_node_label]
		relationship_label_list = [input_association_relationship, target_association_relationship]
		node_of_interest_position = 0
		other_node_IDs_to_intersection_counts = dict()
		#if target_node_label == "disease" or target_node_label == "disease":
		#	target_labels = ["disease", "disease"]
		#else:
		target_labels = [target_node_label]
		for target_label in target_labels:
			names2counts, names2nodes = RU.count_nodes_of_type_on_path_of_type_to_label(input_node_ID, input_node_label,
																						target_label, node_label_list,
																						relationship_label_list,
																						node_of_interest_position)
			for ID in names2counts.keys():
				if names2counts[ID] / float(len(
						input_node_associated_nodes_set)) >= threshold:  # if it's below this threshold, no way the Jaccard index will be large enough
					other_node_IDs_to_intersection_counts[ID] = names2counts[ID]

		# check if any other associated nodes passed the threshold
		if not other_node_IDs_to_intersection_counts:
			error_code = "NoNodesFound"
			error_message = "No %s were found with similarity crossing the threshold of %f." % (target_node_label, threshold)
			parent = RU.get_one_hop_target(input_node_label, input_node_ID, input_node_label, "subclass_of", direction="r")
			if parent:
				parent = parent.pop()
				error_message += "\n Note that %s is a parent of %s, so you might try that instead." % (
				RU.get_node_property(parent, 'name'), input_node_description)
			return [], error_code, error_message

		# Now for each of the nodes connecting to source, count number of association nodes
		node_label_list = [association_node_label]
		relationship_label_list = [input_association_relationship, target_association_relationship]
		node_of_interest_position = 0
		other_node_counts = dict()
		for target_label in target_labels:
			temp_other_counts = RU.count_nodes_of_type_for_nodes_that_connect_to_label(input_node_ID, input_node_label,
																					   target_label, node_label_list,
																					   relationship_label_list,
																					   node_of_interest_position)
			# add it to the dictionary
			for key in temp_other_counts.keys():
				other_node_counts[key] = temp_other_counts[key]

		# then compute the jaccard index
		node_jaccard_tuples = []
		for other_node_ID in other_node_counts.keys():
			jaccard = 0
			if other_node_ID in other_node_IDs_to_intersection_counts:
				union_card = len(input_node_associated_nodes) + other_node_counts[other_node_ID] - \
							other_node_IDs_to_intersection_counts[other_node_ID]
				jaccard = other_node_IDs_to_intersection_counts[other_node_ID] / float(union_card)
			if jaccard > threshold:
				node_jaccard_tuples.append((other_node_ID, jaccard))

		# Format the results.
		# Maybe nothing passed the threshold
		if not node_jaccard_tuples:
			error_code = "NoNodesFound"
			error_message = "No %s's were found with similarity crossing the threshold of %f." % (target_node_label, threshold)
			parent = RU.get_one_hop_target(input_node_label, input_node_ID, input_node_label, "subclass_of", direction="r")
			if parent:
				parent = parent.pop()
				error_message += "\n Note that %s is a parent of %s, so you might try that instead." % (RU.get_node_property(parent, 'description'), input_node_description)
			return [], error_code, error_message

		# Otherwise there are results to return, first sort them largest to smallest
		node_jaccard_tuples_sorted = [(x, y) for x, y in
										sorted(node_jaccard_tuples, key=lambda pair: pair[1], reverse=True)]

		return node_jaccard_tuples_sorted, None, None
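The Jaccard computation above never materializes the union set; it uses the identity |A ∪ B| = |A| + |B| - |A ∩ B|, so three counts are enough. A small self-contained check of that identity (the sets are made up):

# |A ∪ B| = |A| + |B| - |A ∩ B|, so the Jaccard index can be computed from three counts.
def jaccard_from_counts(count_a, count_b, count_intersection):
    union_card = count_a + count_b - count_intersection
    return count_intersection / float(union_card)

a = {"GO:1", "GO:2", "GO:3"}
b = {"GO:2", "GO:3", "GO:4", "GO:5"}

from_counts = jaccard_from_counts(len(a), len(b), len(a & b))
from_sets = len(a & b) / float(len(a | b))
assert abs(from_counts - from_sets) < 1e-12  # both give 2/5 = 0.4
print(from_counts)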
Example 10
    def answer(self, disease_ID, use_json=False, threshold=0.2):
        """
		Answer the question: which other diseases have phenotype-overlap (Jaccard) similarity >= threshold (default 0.2)
		with the given disease_ID?
		:param disease_ID: KG disease name (eg. DOID:8398)
		:param use_json: use the standardized output format
		:param threshold: only include diseases with Jaccard index above this
		:return: None (print to stdout), unless there's an error, then return 1
		"""
        # Initialize the response class
        response = FormatOutput.FormatResponse(4)

        # Check if node exists
        if not RU.node_exists_with_property(disease_ID, 'name'):
            error_message = "Sorry, the disease %s is not yet in our knowledge graph." % disease_ID
            error_code = "DiseaseNotFound"
            if not use_json:
                print(error_message)
                return 1
            else:
                response.add_error_message(error_code, error_message)
                response.print()
                return 1

        # Get label/kind of node the source is
        disease_label = RU.get_node_property(disease_ID, "label")
        if disease_label != "disease" and disease_label != "disease":
            error_message = "Sorry, the input has label %s and needs to be one of: disease, disease." \
                " Please try a different term" % disease_label
            error_code = "NotADisease"
            if not use_json:
                print(error_message)
                return 1
            else:
                response.add_error_message(error_code, error_message)
                response.print()
                return 1

        # get the description
        disease_description = RU.get_node_property(disease_ID, 'description')

        # get the phenotypes associated to the disease
        disease_phenotypes = RU.get_one_hop_target(disease_label, disease_ID,
                                                   "phenotypic_feature",
                                                   "has_phenotype")

        # Look more steps out if we didn't find any phenotypes via a direct edge
        if not disease_phenotypes:
            for max_path_len in range(2, 5):
                disease_phenotypes = RU.get_node_names_of_type_connected_to_target(
                    disease_label,
                    disease_ID,
                    "phenotypic_feature",
                    max_path_len=max_path_len,
                    direction="u")
                if disease_phenotypes:
                    break

        # Make sure you actually picked up at least one phenotype
        if not disease_phenotypes:
            error_message = "No phenotypes found for this disease."
            error_code = "NoPhenotypesFound"
            if not use_json:
                print(error_message)
                return 1
            else:
                response.add_error_message(error_code, error_message)
                response.print()
                return 1
        disease_phenotypes_set = set(disease_phenotypes)

        # get all the other disease that connect and get the phenotypes in common
        # direct connection
        node_label_list = ["phenotypic_feature"]
        relationship_label_list = ["has_phenotype", "has_phenotype"]
        node_of_interest_position = 0
        other_disease_IDs_to_intersection_counts = dict()
        for target_label in ["disease", "disease"]:
            names2counts, names2nodes = RU.count_nodes_of_type_on_path_of_type_to_label(
                disease_ID, disease_label, target_label, node_label_list,
                relationship_label_list, node_of_interest_position)
            for ID in names2counts.keys():
                if names2counts[ID] / float(
                        len(disease_phenotypes_set)
                ) >= threshold:  # if it's below this threshold, no way the Jaccard index will be large enough
                    other_disease_IDs_to_intersection_counts[
                        ID] = names2counts[ID]

        # check if any other diseases passed the threshold
        if not other_disease_IDs_to_intersection_counts:
            error_code = "NoDiseasesFound"
            error_message = "No diseases were found with similarity crossing the threshold of %f." % threshold
            parent = RU.get_one_hop_target(disease_label,
                                           disease_ID,
                                           disease_label,
                                           "subclass_of",
                                           direction="r")
            if parent:
                parent = parent.pop()
                error_message += "\n Note that %s is a parent disease to %s, so you might try that instead." % (
                    RU.get_node_property(parent,
                                         'description'), disease_description)
            if not use_json:
                print(error_message)
                return 1
            else:
                response.add_error_message(error_code, error_message)
                response.print()
                return 1

        # Now for each of the diseases connecting to source, count number of phenotypes
        node_label_list = ["phenotypic_feature"]
        relationship_label_list = ["has_phenotype", "has_phenotype"]
        node_of_interest_position = 0
        # DOID and OMIM diseases share the "disease" label, so a single call covers both
        other_disease_counts = RU.count_nodes_of_type_for_nodes_that_connect_to_label(
            disease_ID, disease_label, "disease", node_label_list,
            relationship_label_list, node_of_interest_position)

        # then compute the jaccard index
        disease_jaccard_tuples = []
        for other_disease_ID in other_disease_counts.keys():
            jaccard = 0
            if other_disease_ID in other_disease_IDs_to_intersection_counts:
                union_card = len(disease_phenotypes) + other_disease_counts[other_disease_ID] - \
                    other_disease_IDs_to_intersection_counts[other_disease_ID]
                jaccard = other_disease_IDs_to_intersection_counts[
                    other_disease_ID] / float(union_card)
            if jaccard > threshold:
                disease_jaccard_tuples.append((other_disease_ID, jaccard))

        # Format the results.
        # Maybe nothing passed the threshold
        if not disease_jaccard_tuples:
            error_code = "NoDiseasesFound"
            error_message = "No diseases were found with similarity crossing the threshold of %f." % threshold
            parent = RU.get_one_hop_target(disease_label,
                                           disease_ID,
                                           disease_label,
                                           "subclass_of",
                                           direction="r")
            if parent:
                parent = parent.pop()
                error_message += "\n Note that %s is a parent disease to %s, so you might try that instead." % (
                    RU.get_node_property(parent,
                                         'description'), disease_description)
            if not use_json:
                print(error_message)
                return 1
            else:
                response.add_error_message(error_code, error_message)
                response.print()
                return 1

        # Otherwise there are results to return, first sort them largest to smallest
        disease_jaccard_tuples_sorted = [(x, y) for x, y in sorted(
            disease_jaccard_tuples, key=lambda pair: pair[1], reverse=True)]
        if not use_json:
            to_print = "The diseases with phenotypes similar to %s are: \n" % disease_description
            for other_disease_ID, jaccard in disease_jaccard_tuples_sorted:
                to_print += "%s\t%s\tJaccard %f\n" % (
                    other_disease_ID,
                    RU.get_node_property(other_disease_ID,
                                         'description'), jaccard)
            print(to_print)
        else:
            for other_disease_ID, jaccard in disease_jaccard_tuples_sorted:
                to_print = "%s is phenotypically similar to the disease %s with similarity value %f" % (
                    disease_description,
                    RU.get_node_property(other_disease_ID,
                                         'description'), jaccard)
                g = RU.get_node_as_graph(other_disease_ID)
                response.add_subgraph(g.nodes(data=True), g.edges(data=True),
                                      to_print, jaccard)
            response.print()
Example 11
    def answer(self, entity, use_json=False):
        """
		Answer a question of the type "What is X?" in a general way.
		:param entity: KG neo4j node name (eg "carbetocin")
		:param use_json: If the answer should be in Translator standardized API output format
		:return: a description and type of the node
		"""

        #### See if this entity is in the KG via the index
        eprint("Looking up '%s' in KgNodeIndex" % entity)
        kgNodeIndex = KGNodeIndex()
        curies = kgNodeIndex.get_curies(entity)

        #### If not in the KG, then return no information
        if not curies:
            if not use_json:
                return None
            else:
                error_code = "TermNotFound"
                error_message = "This concept is not in our knowledge graph"
                response = FormatOutput.FormatResponse(0)
                response.add_error_message(error_code, error_message)
                return response.message

        # Get label/kind of node the source is
        eprint("Getting properties for '%s'" % curies[0])
        properties = RU.get_node_properties(curies[0])
        eprint("Properties are:")
        eprint(properties)

        #### By default, return the results just as a plain simple list of data structures
        if not use_json:
            return properties

        #### Or, if requested, format the output as the standardized API output format
        else:
            #### Create a stub Message object
            response = FormatOutput.FormatResponse(0)
            response.message.table_column_names = [
                "id", "type", "name", "description", "uri"
            ]
            response.message.code_description = None

            #### Create a Node object and fill it
            node1 = Node()
            node1.id = properties["id"]
            node1.uri = properties["uri"]
            node1.type = [properties["category"]]
            node1.name = properties["name"]
            node1.description = properties["description"]

            #### Create the first result (potential answer)
            result1 = Result()
            result1.id = "http://arax.ncats.io/api/v1/result/0000"
            result1.description = "The term %s is in our knowledge graph and is defined as %s" % (
                properties["name"], properties["description"])
            result1.confidence = 1.0
            result1.essence = properties["name"]
            result1.essence_type = properties["category"]
            node_types = ",".join(node1.type)
            result1.row_data = [
                node1.id, node_types, node1.name, node1.description, node1.uri
            ]

            #### Create a KnowledgeGraph object and put the list of nodes and edges into it
            result_graph = KnowledgeGraph()
            result_graph.nodes = [node1]
            result_graph.edges = []

            #### Put the ResultGraph into the first result (potential answer)
            result1.result_graph = result_graph

            #### Put the first result (potential answer) into the message
            results = [result1]
            response.message.results = results

            #### Also put the union of all result_graph components into the top Message KnowledgeGraph
            #### Normally the knowledge_graph will be much more complex than this, but take a shortcut for this single-node result
            response.message.knowledge_graph = result_graph

            #### Also manufacture a query_graph post hoc
            qnode1 = QNode()
            qnode1.id = "n00"
            qnode1.curie = properties["id"]
            qnode1.type = None
            query_graph = QueryGraph()
            query_graph.nodes = [qnode1]
            query_graph.edges = []
            response.message.query_graph = query_graph

            #### Create the corresponding knowledge_map
            node_binding = NodeBinding(qg_id="n00", kg_id=properties["id"])
            result1.node_bindings = [node_binding]
            result1.edge_bindings = []

            #eprint(response.message)
            return response.message
Example 12
    def answer(disease_id,
               use_json=False,
               num_show=20,
               rev=True,
               normalize=False):
        """
		"""

        # Initialize the response class
        response = FormatOutput.FormatResponse(6)

        # get the description
        disease_description = RU.get_node_property(disease_id, 'name')

        # get the subgraph of all the phenotype nodes connecting to the disease
        try:
            g = RU.return_subgraph_paths_of_type(disease_id,
                                                 "disease",
                                                 None,
                                                 "phenotypic_feature",
                                                 ["has_phenotype"],
                                                 directed=False)
        except CustomExceptions.EmptyCypherError:
            error_code = "EmptyGraph"
            error_message = "Sorry, but there are no phenotypes associated to %s" % disease_description
            response.add_error_message(error_code, error_message)
            response.print()
            return 1

        # decorate with cohd data
        RU.weight_graph_with_cohd_frequency(
            g, normalized=normalize
        )  # TODO: check if normalized on returns better results

        # sort the phenotypes by frequency
        names = nx.get_node_attributes(g, 'names')
        labels = nx.get_node_attributes(g, 'labels')
        descriptions = nx.get_node_attributes(g, 'description')

        # get the node corresponding to the disease
        disease_node = None
        for node in names.keys():
            if names[node] == disease_id:
                disease_node = node

        # get all the nodes and the frequencies in one place
        node_freq_tuples = []
        for node in names.keys():
            if "phenotypic_feature" == list(set(labels[node]) -
                                            {"Base"}).pop():
                # get the corresponding edge frequency (try both directions)
                edge_data = g.get_edge_data(disease_node, node)
                if "cohd_freq" in edge_data and isinstance(
                        edge_data["cohd_freq"], float):
                    freq = edge_data["cohd_freq"]
                else:
                    edge_data = g.get_edge_data(node, disease_node)
                    if "cohd_freq" in edge_data and isinstance(
                            edge_data["cohd_freq"], float):
                        freq = edge_data["cohd_freq"]
                    else:
                        freq = 0
                node_freq_tuples.append((node, freq))

        # sort the node freqs
        node_freq_tuples_sorted = sorted(node_freq_tuples,
                                         key=lambda x: x[1],
                                         reverse=rev)

        # reduce to top n
        node_freq_tuples_sorted_top_n = node_freq_tuples_sorted
        if len(node_freq_tuples_sorted_top_n) > num_show:
            node_freq_tuples_sorted_top_n = node_freq_tuples_sorted_top_n[
                0:num_show]

        # good nodes
        good_nodes = set([tup[0] for tup in node_freq_tuples_sorted_top_n])
        good_nodes.add(disease_node)

        # all nodes
        all_nodes = set([tup[0] for tup in node_freq_tuples_sorted])

        # remove the other nodes from the graph
        g.remove_nodes_from(all_nodes - good_nodes)

        # return the results
        if not use_json:
            if rev:
                to_print = "The most common phenotypes "
            else:
                to_print = "The least common phenotypes "
            to_print += "associated with %s, according to the Columbia Open Health Data, are:\n" % disease_description
            for node, freq in node_freq_tuples_sorted_top_n:
                to_print += "phenotype: %s\t frequency %f \n" % (
                    descriptions[node], freq)
            print(to_print)
        else:
            for node, freq in node_freq_tuples_sorted_top_n:
                to_print = "According to the Columbia Open Health Data, %s has the phenotype %s with frequency %f." % (
                    disease_description, descriptions[node], freq)
                sub_g = nx.subgraph(g, [disease_node, node])
                # add it to the response
                response.add_subgraph(sub_g.nodes(data=True),
                                      sub_g.edges(data=True), to_print, freq)
            response.print()
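The ranking above reads the COHD frequency off each disease-phenotype edge with g.get_edge_data, which returns the edge's attribute dict, or None when no edge exists in the queried direction, hence the fallback to the reverse direction. A minimal sketch of that access pattern on a made-up directed toy graph (the cohd_freq attribute name is kept purely for illustration):

import networkx as nx

g = nx.DiGraph()
g.add_edge("disease", "phenoA", cohd_freq=0.30)
g.add_edge("phenoB", "disease", cohd_freq=0.05)  # stored in the opposite direction

def edge_frequency(graph, u, v, default=0.0):
    # get_edge_data returns None when there is no u->v edge, so try both directions.
    data = graph.get_edge_data(u, v) or graph.get_edge_data(v, u) or {}
    freq = data.get("cohd_freq", default)
    return freq if isinstance(freq, float) else default

freqs = [(p, edge_frequency(g, "disease", p)) for p in ["phenoA", "phenoB"]]
print(sorted(freqs, key=lambda t: t[1], reverse=True))
# -> [('phenoA', 0.3), ('phenoB', 0.05)]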
Example 13
    def answer(self, query_graph, TxltrApiFormat=False):
        """
        Answer a question based on the input query_graph:
        :param query_graph: QueryGraph object
        :param TxltrApiFormat: Set to true if the answer should be in Translator standardized API output format
        :return: Result of the query in native or API format
        """

        #### Create a stub Message object
        response = FormatOutput.FormatResponse(0)

        #### Include the original query_graph in the envelope
        response.message.query_graph = query_graph

        #### Perform some basic validation of the query graph before sending to the server
        result = self.validate_query_graph(query_graph)
        if result["message_code"] != "OK":
            response.add_error_message(result["message_code"],
                                       result["code_description"])
            return (response.message)

        #### Insert some dummy question stuff
        response.message.original_question = "Input via Query Graph"
        response.message.restated_question = "No restatement for QueryGraph yet"

        #### Preprocess query_graph object
        query_graph, sort_flags, res_limit, ascending_flag = self.preprocess_query_graph(
            query_graph)

        #### Interpret the query_graph object to create a cypher query and encode the result in a response
        try:
            query_gen = RU.get_cypher_from_question_graph(
                {'question_graph': query_graph})
            answer_graph_cypher = query_gen.cypher_query_answer_map()
            knowledge_graph_cypher = query_gen.cypher_query_knowledge_graph()
        except Exception as error:
            response.add_error_message("CypherGenerationError", format(error))
            return (response.message)

        #### The Robokop code renames stuff in the query_graph for strange reasons. Rename them back.
        #### It would be better to not make the changes in the first place. FIXME
        #for node in response.message.query_graph["nodes"]:
        #    node["node_id"] = node["id"]
        #    node.pop("id", None)
        #for edge in response.message.query_graph["edges"]:
        #    edge["edge_id"] = edge["id"]
        #    edge.pop("id", None)

        #### Execute the cypher to obtain results[]. Return an error if there are no results, or otherwise extract the list
        try:
            with RU.driver.session() as session:
                result = session.run(answer_graph_cypher)
            answer_graph_list = result.data()
        except Exception as error:
            response.add_error_message("QueryGraphError", format(error))
            return (response.message)

        if len(answer_graph_list) == 0:
            response.add_error_message(
                "NoPathsFound",
                "No paths satisfying this query graph were found")
            return (response.message)

        #### Execute the knowledge_graph cypher. Return an error if there are no results, or otherwise extract the dict
        try:
            with RU.driver.session() as session:
                result = session.run(knowledge_graph_cypher)
            result_data = result.data()
        except Exception as error:
            response.add_error_message("QueryGraphError", format(error))
            return (response.message)

        if len(result_data) == 0:
            response.add_error_message(
                "NoPathsFound",
                "No paths satisfying this query graph were found")
            return (response.message)
        knowledge_graph_dict = result_data[0]

        #### If TxltrApiFormat was not specified, just return a single data structure with the results
        if not TxltrApiFormat:
            return {
                'answer_subgraphs': answer_graph_list,
                'knowledge_graph': knowledge_graph_dict
            }

        #### Add the knowledge_graph and bindings to the Message
        response.add_split_results(knowledge_graph_dict, answer_graph_list)
        #response.message.table_column_names = [ "id", "type", "name", "description", "uri" ]
        #response.message.code_description = None

        #### Enrich the Message Results with some inferred information
        response.infer_result_information()

        #### Return the final result message
        return (response.message)
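The two Cypher executions above appear to use the standard neo4j Python driver behind RU.driver. Outside of the project, the same run-and-collect pattern looks roughly like the sketch below; the URI, credentials, and query are placeholders, not the project's configuration.

# Rough standalone equivalent of the session.run(...).data() calls above.
from neo4j import GraphDatabase

driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))
cypher = "MATCH (n) RETURN n.id AS id LIMIT 5"  # placeholder query

with driver.session() as session:
    result = session.run(cypher)
    rows = result.data()  # list of dicts, one per record

print(rows)
driver.close()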
Example 14
def main():
    parser = argparse.ArgumentParser(
        description=
        "Answers questions of the form: 'what pathways are most enriched by $protein_list?'",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-s',
                        '--source',
                        type=str,
                        help="source curie ID",
                        default="UniProtKB:Q96M43")
    parser.add_argument('-t',
                        '--target',
                        type=str,
                        help="target node type",
                        default="pathway")
    parser.add_argument('-y',
                        '--type',
                        type=str,
                        help="source node type",
                        default="protein")
    parser.add_argument(
        '-j',
        '--json',
        action='store_true',
        help=
        'Flag specifying that results should be printed in JSON format (to stdout)',
        default=False)
    parser.add_argument(
        '-r',
        '--rel_type',
        type=str,
        help='Only do the Fisher exact test along edges of this type',
        default=None)
    parser.add_argument(
        '--describe',
        action='store_true',
        help='Print a description of the question to stdout and quit',
        default=False)
    parser.add_argument('--num_show',
                        type=int,
                        help='Maximum number of results to return',
                        default=20)

    # Parse and check args
    args = parser.parse_args()
    source_arg = args.source
    target_type = args.target
    source_type = args.type
    use_json = args.json
    describe_flag = args.describe
    num_show = args.num_show
    rel_type = args.rel_type

    if source_arg[0] == "[":
        if "','" not in source_arg:
            source_arg = source_arg.replace(",", "','").replace("[",
                                                                "['").replace(
                                                                    "]", "']")
        source_list = ast.literal_eval(source_arg)
        source_list_strip = []
        for source in source_list:
            source_list_strip.append(source.strip())
        source_list = source_list_strip
    else:
        source_list = [source_arg]
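    # (Illustrative note, not in the original: --source accepts either a bare curie
    #  such as "UniProtKB:Q96M43" or a bracketed list such as
    #  "[UniProtKB:Q96M43, UniProtKB:P12345]"; the bracketed form is quoted above and
    #  parsed with ast.literal_eval into a list of stripped curie strings.)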

    # Initialize the question class
    Q = QuestionFisher()

    if describe_flag:
        res = Q.describe()
        print(res)
    else:
        # Initialize the response class
        response = FormatOutput.FormatResponse(6)
        response.response.table_column_names = [
            "target name", "target ID", "P value"
        ]
        graph_weight_tuples = []

        q_answer = Q.answer(source_list,
                            source_type,
                            target_type,
                            use_json=use_json,
                            num_show=num_show,
                            rel_type=rel_type)

        if not q_answer:  # if q_answer == None
            return None  # All messages printed out; safe to quit

        p_dict, target_list = q_answer

        # print out the results
        if not use_json:
            for target_name in target_list:
                target_description = RU.get_node_property(
                    target_name, "name", node_label=target_type)
                print("%s %f" % (target_description, p_dict[target_name]))
        else:
            #response.response.table_column_names = ["source name", "source ID", "target name", "target ID", "path weight",
            #                                        "target source google distance",
            #                                        "ML probability target treats source"]
            for target_name in target_list:
                target_description = RU.get_node_property(
                    target_name, "name", node_label=target_type)
                target_id_old_curie = target_name.replace(
                    "CHEMBL.COMPOUND:CHEMBL", "ChEMBL:")
                confidence = p_dict[target_name]
                # populate the graph
                graph = RU.get_graph_from_nodes([target_name])
                res = response.add_subgraph(
                    graph.nodes(data=True),
                    graph.edges(data=True),
                    "The target %s is enriched by %s." %
                    (target_description, str(source_list)),
                    confidence,
                    return_result=True)
                res.essence = "%s" % target_description  # populate with essence of question result
                row_data = []  # initialize the row data
                #row_data.append("%s" % source_description)
                #row_data.append("%s" % source_id)
                row_data.append("%s" % target_description)
                row_data.append("%s" % target_name)
                row_data.append("%f" % confidence)
                #row_data.append("%f" % gd)
                #row_data.append("%f" % prob)
                res.row_data = row_data
            response.print()
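The enrichment computed by Q.answer() above comes from RU.top_n_fisher_exact. As a self-contained illustration (an assumption about the underlying statistic, not the RTX implementation itself), a Fisher exact enrichment p-value can be computed from a 2x2 contingency table with scipy; enrichment_p_value is a hypothetical helper:

from scipy.stats import fisher_exact

def enrichment_p_value(source_nodes, nodes_linked_to_target, universe_size):
    """One-sided Fisher exact p-value for over-representation of a target's
    neighbours among the source nodes (hypothetical helper for illustration)."""
    source_nodes = set(source_nodes)
    linked = set(nodes_linked_to_target)
    a = len(source_nodes & linked)     # source nodes linked to the target
    b = len(source_nodes) - a          # source nodes not linked
    c = len(linked) - a                # other nodes linked to the target
    d = universe_size - a - b - c      # everything else
    _, p_value = fisher_exact([[a, b], [c, d]], alternative="greater")
    return p_value

# e.g. enrichment_p_value(["P1", "P2", "P3"], ["P1", "P2", "P9"], universe_size=20000)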
Example 15
	def get_parameters(self, input_question):
		"""
		Given the input_question, try to extract the proper parameters
		:param input_question: plain text input question
		:return: a dictionary (with keys self.parameter_names), values either None or the KG node names
		"""
		parameters = dict()
		for parameter in self.parameter_names:
			parameters[parameter] = None

		# The "what is a X?" questions are of a completely different form and are handled separately
		if self.parameter_names == ["term"]:
			# Next, see if it's a "what is" question
			term = None
			input_question = re.sub("\?", "", input_question)
			input_question = re.sub("^\s+", "", input_question)
			input_question = re.sub("\s+$", "", input_question)
			input_question = input_question.lower()
			match = re.match("what is\s*(a|an)?\s+(.+)", input_question, re.I)
			if match:
				term = match.group(2)
				term = re.sub("^\s+", "", term)
				term = re.sub("\s+$", "", term)
				parameters["term"] = term
				return parameters
			match = re.match("what are (.+)", input_question, re.I)
			if match:
				term = match.group(1)
				term = re.sub("^\s+", "", term)
				term = re.sub("\s+$", "", term)
				parameters["term"] = term
				return parameters
			else:
				return parameters
		else:  # Otherwise, it's a standard question template
			# get all n-tuples of words in the question (largest to smallest)
			blocks = []
			question_tokenized = nltk.word_tokenize(input_question, "english")
			# Tokenizers have a bad habit of splitting on \', so fix it
			question_tokenized_no_apos_split = []
			for ind, block in enumerate(question_tokenized):
				if block[0] == "'" and ind > 0:  # the tokenizer split on apostrophe
					question_tokenized_no_apos_split[ind - 1] += question_tokenized[ind]
				else:
					question_tokenized_no_apos_split.append(block)
			question_tokenized = question_tokenized_no_apos_split

			for block_size in range(1, len(question_tokenized)):
				for i in range(len(question_tokenized) - block_size + 1):
					block = " ".join(question_tokenized[i:(i + block_size)])
					blocks.append(block)
			blocks = list(reversed(blocks))

			# Look for anything that could be a node name
			candidate_node_names = []
			found_blocks = []  # keep track of the already found blocks TODO: this will cause problems when you ask something like "how are malaria and mixed malaria different?"
			for block in blocks:
				nodes = find_node_name(block)
				if nodes:
					if all([block not in b for b in found_blocks]):  # only add it if it's not a substring of an already-found block
						candidate_node_names.extend(nodes)
						found_blocks.append(block)
						#print(block)

			# Get the node labels for the found nodes
			candidate_node_names_labels = set()  # set automatically deduplicates for me
			for node in candidate_node_names:
				node_label = RU.get_node_property(node, "label")  # TODO: Arnab's UMLS lookup
				candidate_node_names_labels.add((node, node_label))

			# turn it back into a list for indexing
			candidate_node_names_labels = list(candidate_node_names_labels)

			# For each of the parameter names, make sure it only shows up once, and if so, populate it
			for parameter_name in self.parameter_names:
				parameter_name_positions = []
				pos = 0
				for node, node_label in candidate_node_names_labels:
					if node_label == parameter_name:
						parameter_name_positions.append(pos)
					pos += 1
				if len(parameter_name_positions) > 1:
					raise CustomExceptions.MultipleTerms(parameter_name, [candidate_node_names_labels[pos][0] for pos in parameter_name_positions])
				elif len(parameter_name_positions) == 0:
					pass
				else:  # There's exactly one term
					pos = parameter_name_positions.pop()
					parameters[parameter_name] = candidate_node_names_labels[pos][0]

			# Throw in the extra parameters
			for key, value in self.other_parameters.items():
				parameters[key] = value
			return parameters
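A minimal, self-contained sketch of the largest-to-smallest phrase ("block") enumeration used in get_parameters() above, with a plain str.split() standing in for nltk.word_tokenize (the tokenizer choice and the example question are only illustrations):

def candidate_blocks(question):
    """Enumerate word n-grams of the question, longest phrases first,
    mirroring the block generation above (block sizes 1 to len-1)."""
    tokens = question.rstrip("?").split()
    blocks = []
    for block_size in range(1, len(tokens)):
        for i in range(len(tokens) - block_size + 1):
            blocks.append(" ".join(tokens[i:i + block_size]))
    return list(reversed(blocks))

# candidate_blocks("what genes does malaria affect?")
# -> ['genes does malaria affect', 'what genes does malaria', 'does malaria affect', ...]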
Example 16
    def answer(self,
               source_name,
               target_label,
               relationship_type,
               use_json=False,
               directed=False):
        """
		Answer a question of the type "What proteins does drug X target", generalized to:
		 what <node X type> does <node Y grounded> <relationship Z>, which can be answered in one hop in the KG (increasing the step size if necessary).
		:param query_terms: a triple consisting of a source node name (KG neo4j node name), the target label (KG neo4j
		"node label"), and the relationship type (KG neo4j "Relationship type")
		:param source_name: KG neo4j node name (eg "carbetocin")
		:param target_label: KG node label (eg. "protein")
		:param relationship_type: KG relationship type (eg. "physically_interacts_with")
		:param use_json: If the answer should be in Eric's Json standardized API output format
		:return: list of dictionaries containing the nodes that are one hop (along relationship type) that connect source to target.
		"""
        # Get label/kind of node the source is
        source_label = RU.get_node_property(source_name, "label")

        # Get the subgraph (all targets along relationship)
        has_intermediate_node = False
        try:
            g = RU.return_subgraph_paths_of_type(source_name,
                                                 source_label,
                                                 None,
                                                 target_label,
                                                 [relationship_type],
                                                 directed=directed)
        except CustomExceptions.EmptyCypherError:
            try:
                has_intermediate_node = True
                g = RU.return_subgraph_paths_of_type(
                    source_name,
                    source_label,
                    None,
                    target_label, ['subclass_of', relationship_type],
                    directed=directed)
            except CustomExceptions.EmptyCypherError:
                error_message = "No path between %s and %s via relationship %s" % (
                    source_name, target_label, relationship_type)
                error_code = "NoPathsFound"
                response = FormatOutput.FormatResponse(3)
                response.add_error_message(error_code, error_message)
                return response

        # extract the source_node_number
        for node, data in g.nodes(data=True):
            if data['properties']['id'] == source_name:
                source_node_number = node
                break

        # Get all the target numbers
        target_numbers = []
        for node, data in g.nodes(data=True):
            if data['properties']['id'] != source_name:
                target_numbers.append(node)

        # if there's an intermediate node, get the name
        if has_intermediate_node:
            neighbors = list(g.neighbors(source_node_number))
            if len(neighbors) > 1:
                error_message = "More than one intermediate node"
                error_code = "AmbiguousPath"
                response = FormatOutput.FormatResponse(3)
                response.add_error_message(error_code, error_message)
                return response
            else:
                intermediate_node = neighbors.pop()

        #### If use_json not specified, then return results as a fairly plain list
        if not use_json:
            results_list = list()
            for target_number in target_numbers:
                data = g.nodes[target_number]
                results_list.append({
                    'type':
                    list(set(data['labels']) - {'Base'}).pop(),
                    'name':
                    data['properties']['name'],
                    'desc':
                    data['properties']['name'],
                    'prob':
                    1
                })  # All these are known to be true
            return results_list

        #### Else if use_json requested, return the results in the Translator standard API JSON format
        else:
            response = FormatOutput.FormatResponse(3)  # it's a Q3 question
            response.message.table_column_names = [
                "source name", "source ID", "target name", "target ID"
            ]
            source_description = g.nodes[source_node_number]['properties'][
                'name']

            #### Create the QueryGraph for this type of question
            query_graph = QueryGraph()
            source_node = QNode()
            source_node.id = "n00"
            source_node.curie = g.nodes[source_node_number]['properties']['id']
            source_node.type = g.nodes[source_node_number]['properties'][
                'category']
            target_node = QNode()
            target_node.id = "n01"
            target_node.type = target_label
            query_graph.nodes = [source_node, target_node]
            edge1 = QEdge()
            edge1.id = "e00"
            edge1.source_id = "n00"
            edge1.target_id = "n01"
            edge1.type = relationship_type
            query_graph.edges = [edge1]
            response.message.query_graph = query_graph

            #### Create a mapping dict with the source curie and the target type. This dict is used for reverse lookups by type
            #### for mapping to the QueryGraph.
            response._type_map = dict()
            response._type_map[source_node.curie] = source_node.id
            response._type_map[target_node.type] = target_node.id
            response._type_map[edge1.type] = edge1.id

            #### Loop over all the returned targets and put them into the response structure
            for target_number in target_numbers:
                target_description = g.nodes[target_number]['properties'][
                    'name']
                if not has_intermediate_node:
                    subgraph = g.subgraph([source_node_number, target_number])
                else:
                    subgraph = g.subgraph(
                        [source_node_number, intermediate_node, target_number])
                res = response.add_subgraph(
                    subgraph.nodes(data=True),
                    subgraph.edges(data=True),
                    "%s and %s are connected by the relationship %s" %
                    (source_description, target_description,
                     relationship_type),
                    1,
                    return_result=True)
                res.essence = "%s" % target_description  # populate with essence of question result
                res.essence_type = g.nodes[target_number]['properties'][
                    'category']  # populate with the type of the essence of question result
                row_data = []  # initialize the row data
                row_data.append("%s" % source_description)
                row_data.append(
                    "%s" % g.nodes[source_node_number]['properties']['id'])
                row_data.append("%s" % target_description)
                row_data.append("%s" %
                                g.nodes[target_number]['properties']['id'])
                res.row_data = row_data
            return response
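A small, self-contained networkx sketch (not from the original module, with made-up node properties and curies) of the bookkeeping done above: locating the node whose stored id matches the source, treating every other node as a target, and cutting out per-pair subgraphs. The 'properties'/'labels' layout mirrors what the code above expects from RU.return_subgraph_paths_of_type, which is an assumption here.

import networkx as nx

g = nx.Graph()
g.add_node(0, properties={"id": "CHEMBL.COMPOUND:CHEMBL0000000", "name": "example drug"},
           labels=["chemical_substance", "Base"])
g.add_node(1, properties={"id": "UniProtKB:P00000", "name": "example protein"},
           labels=["protein", "Base"])
g.add_edge(0, 1, type="physically_interacts_with")

source_name = "CHEMBL.COMPOUND:CHEMBL0000000"
source_node_number = next(n for n, d in g.nodes(data=True)
                          if d["properties"]["id"] == source_name)
target_numbers = [n for n, d in g.nodes(data=True)
                  if d["properties"]["id"] != source_name]
for target_number in target_numbers:
    pair_subgraph = g.subgraph([source_node_number, target_number])
    node_type = list(set(g.nodes[target_number]["labels"]) - {"Base"}).pop()
    print(node_type, g.nodes[target_number]["properties"]["name"], pair_subgraph.number_of_edges())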
Example 17
    def answer(self,
               source_name,
               target_label,
               relationship_type,
               use_json=False):
        """
		Answer a question of the type "What proteins does drug X target", generalized to:
		 what <node X type> does <node Y grounded> <relationship Z>, which can be answered in one hop in the KG (increasing the step size if necessary).
		:param query_terms: a triple consisting of a source node name (KG neo4j node name), the target label (KG neo4j
		"node label"), and the relationship type (KG neo4j "Relationship type")
		:param source_name: KG neo4j node name (eg "carbetocin")
		:param target_label: KG node label (eg. "protein")
		:param relationship_type: KG relationship type (eg. "directly_interacts_with")
		:param use_json: If the answer should be in Eric's Json standardized API output format
		:return: list of dictionaries containing the nodes that are one hop (along relationship type) that connect source to target.
		"""
        # Get label/kind of node the source is
        source_label = RU.get_node_property(source_name, "label")

        # Get the subgraph (all targets along relationship)
        has_intermediate_node = False
        try:
            g = RU.return_subgraph_paths_of_type(source_name,
                                                 source_label,
                                                 None,
                                                 target_label,
                                                 [relationship_type],
                                                 directed=False)
        except CustomExceptions.EmptyCypherError:
            try:
                has_intermediate_node = True
                g = RU.return_subgraph_paths_of_type(
                    source_name,
                    source_label,
                    None,
                    target_label, ['subclass_of', relationship_type],
                    directed=False)
            except CustomExceptions.EmptyCypherError:
                error_message = "No path between %s and %s via relationship %s" % (
                    source_name, target_label, relationship_type)
                error_code = "NoPathsFound"
                response = FormatOutput.FormatResponse(3)
                response.add_error_message(error_code, error_message)
                return response

        # extract the source_node_number
        for node, data in g.nodes(data=True):
            if data['properties']['name'] == source_name:
                source_node_number = node
                break

        # Get all the target numbers
        target_numbers = []
        for node, data in g.nodes(data=True):
            if data['properties']['name'] != source_name:
                target_numbers.append(node)

        # if there's an intermediate node, get the name
        if has_intermediate_node:
            neighbors = list(g.neighbors(source_node_number))
            if len(neighbors) > 1:
                error_message = "More than one intermediate node"
                error_code = "AmbiguousPath"
                response = FormatOutput.FormatResponse(3)
                response.add_error_message(error_code, error_message)
                return response
            else:
                intermediate_node = neighbors.pop()

        # Format the results.
        if not use_json:
            results_list = list()
            for target_number in target_numbers:
                data = g.nodes[target_number]  # g.node was removed in NetworkX 2.4; use g.nodes
                results_list.append({
                    'type':
                    list(set(data['labels']) - {'Base'}).pop(),
                    'name':
                    data['properties']['name'],
                    'desc':
                    data['properties']['description'],
                    'prob':
                    1
                })  # All these are known to be true
            return results_list
        else:  # You want the standardized API output format
            response = FormatOutput.FormatResponse(3)  # it's a Q3 question
            source_description = g.nodes[source_node_number]['properties'][
                'description']
            for target_number in target_numbers:
                target_description = g.nodes[target_number]['properties'][
                    'description']
                if not has_intermediate_node:
                    subgraph = g.subgraph([source_node_number, target_number])
                else:
                    subgraph = g.subgraph(
                        [source_node_number, intermediate_node, target_number])
                response.add_subgraph(
                    subgraph.nodes(data=True), subgraph.edges(data=True),
                    "%s and %s are connected by the relationship %s" %
                    (source_description, target_description,
                     relationship_type), 1)
            return response
    def answer(disease_id, use_json=False, num_show=25):

        num_input_disease_symptoms = 25  # number of representative symptoms of the disease to keep
        num_omim_keep = 25  # number of genetic conditions to keep
        num_protein_keep = 25  # number of implicated proteins to keep
        num_pathways_keep = 25  # number of pathways to keep
        num_pathway_proteins_selected = 25  # number of proteins enriched for the above pathways to select
        num_drugs_keep = 2 * num_show  # number of drugs that target those proteins to keep
        num_paths = 2  # number of paths to keep for each drug selected

        # Initialize the response class
        response = FormatOutput.FormatResponse(6)
        response.response.table_column_names = [
            "disease name", "disease ID", "drug name", "drug ID", "confidence"
        ]

        # get the description of the disease
        disease_description = RU.get_node_property(disease_id, 'name')

        # Find symptoms of disease
        # symptoms = RU.get_one_hop_target("disease", disease_id, "phenotypic_feature", "has_phenotype")
        # symptoms_set = set(symptoms)
        (symptoms_dict,
         symptoms) = RU.top_n_fisher_exact([disease_id],
                                           "disease",
                                           "phenotypic_feature",
                                           rel_type="has_phenotype",
                                           n=num_input_disease_symptoms)
        symptoms_set = set(symptoms)
        # check for an error
        if not symptoms_set:
            error_message = "I found no phenotypic_features for %s." % disease_description
            if not use_json:
                print(error_message)
                return
            else:
                error_code = "NoPathsFound"
                response = FormatOutput.FormatResponse(3)
                response.add_error_message(error_code, error_message)
                response.print()
                return

        # Find diseases enriched for that phenotype
        path_type = [
            "gene_mutations_contribute_to", "protein", "participates_in",
            "pathway", "participates_in", "protein",
            "physically_interacts_with", "chemical_substance"
        ]
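        # (Illustrative note, not in the original: path_type is passed as on_path so that,
        #  presumably, the Fisher test only keeps diseases that can be extended along
        #  disease -> protein -> pathway -> protein -> chemical_substance, i.e. diseases
        #  that can eventually reach a candidate drug.)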
        (genetic_diseases_dict,
         genetic_diseases_selected) = RU.top_n_fisher_exact(
             symptoms,
             "phenotypic_feature",
             "disease",
             rel_type="has_phenotype",
             n=num_omim_keep,
             curie_prefix="OMIM",
             on_path=path_type,
             exclude=disease_id)

        if not genetic_diseases_selected:
            error_message = "I found no diseases connected to phenotypes of %s." % disease_description
            if not use_json:
                print(error_message)
                return
            else:
                error_code = "NoPathsFound"
                response = FormatOutput.FormatResponse(3)
                response.add_error_message(error_code, error_message)
                response.print()
                return

        # find the most representative proteins in these diseases
        path_type = [
            "participates_in", "pathway", "participates_in", "protein",
            "physically_interacts_with", "chemical_substance"
        ]
        (implicated_proteins_dict,
         implicated_proteins_selected) = RU.top_n_fisher_exact(
             genetic_diseases_selected,
             "disease",
             "protein",
             rel_type="gene_mutations_contribute_to",
             n=num_protein_keep,
             on_path=path_type)

        if not implicated_proteins_selected:
            error_message = "I found no proteins connected to diseases connected to phenotypes of %s." % disease_description
            if not use_json:
                print(error_message)
                return
            else:
                error_code = "NoPathsFound"
                response = FormatOutput.FormatResponse(3)
                response.add_error_message(error_code, error_message)
                response.print()
                return

        # find enriched pathways from those proteins
        path_type = [
            "participates_in", "protein", "physically_interacts_with",
            "chemical_substance"
        ]
        (pathways_selected_dict, pathways_selected) = RU.top_n_fisher_exact(
            implicated_proteins_selected,
            "protein",
            "pathway",
            rel_type="participates_in",
            n=num_pathways_keep,
            on_path=path_type)

        if not pathways_selected:
            error_message = "I found no pathways connected to proteins connected to diseases connected to phenotypes of %s." % disease_description
            if not use_json:
                print(error_message)
                return
            else:
                error_code = "NoPathsFound"
                response = FormatOutput.FormatResponse(3)
                response.add_error_message(error_code, error_message)
                response.print()
                return

        # find proteins enriched for those pathways
        path_type = ["physically_interacts_with", "chemical_substance"]
        (pathway_proteins_dict,
         pathway_proteins_selected) = RU.top_n_fisher_exact(
             pathways_selected,
             "pathway",
             "protein",
             rel_type="participates_in",
             n=num_pathway_proteins_selected,
             on_path=path_type)

        if not pathway_proteins_selected:
            error_message = "I found no proteins connected to pathways connected to proteins connected to diseases connected to phenotypes of %s." % disease_description
            if not use_json:
                print(error_message)
                return
            else:
                error_code = "NoPathsFound"
                response = FormatOutput.FormatResponse(3)
                response.add_error_message(error_code, error_message)
                response.print()
                return

        # find drugs enriched for targeting those proteins
        (drugs_selected_dict, drugs_selected) = RU.top_n_fisher_exact(
            pathway_proteins_selected,
            "protein",
            "chemical_substance",
            rel_type="physically_interacts_with",
            n=num_drugs_keep)

        if not drugs_selected:
            error_message = "I found no drugs connected toproteins connected to pathways connected to proteins connected to diseases connected to phenotypes of %s." % disease_description
            if not use_json:
                print(error_message)
                return
            else:
                error_code = "NoPathsFound"
                response = FormatOutput.FormatResponse(3)
                response.add_error_message(error_code, error_message)
                response.print()
                return

        path_type = [
            "disease", "has_phenotype", "phenotypic_feature", "has_phenotype",
            "disease", "gene_mutations_contribute_to", "protein",
            "participates_in", "pathway", "participates_in", "protein",
            "physically_interacts_with", "chemical_substance"
        ]
        g = RU.get_subgraph_through_node_sets_known_relationships(
            path_type, [[disease_id], symptoms, genetic_diseases_selected,
                        implicated_proteins_selected, pathways_selected,
                        pathway_proteins_selected, drugs_selected],
            directed=True)

        graph_weight_tuples = []
        for drug in drugs_selected:
            # get the relevant subgraph from this drug back to the input disease
            node_types = [
                "disease", "phenotypic_feature", "disease", "protein",
                "pathway", "protein", "chemical_substance"
            ]
            drug_pathway_protein_neighbors = RU.one_hope_neighbors_of_type(
                g, drug, 'protein', 'R')
            drug_pathway_neighbors = set()
            for protein in drug_pathway_protein_neighbors:
                drug_pathway_neighbors.update(
                    RU.one_hope_neighbors_of_type(g, protein, 'pathway', 'R'))
            drug_protein_neighbors = set()
            for pathway in drug_pathway_neighbors:
                drug_protein_neighbors.update(
                    RU.one_hope_neighbors_of_type(g, pathway, 'protein', 'L'))
            drug_disease_neighbors = set()
            for protein in drug_protein_neighbors:
                drug_disease_neighbors.update(
                    RU.one_hope_neighbors_of_type(g, protein, 'disease', 'R'))
            drug_phenotype = set()
            for disease in drug_disease_neighbors:
                drug_phenotype.update(
                    RU.one_hope_neighbors_of_type(g, disease,
                                                  'phenotypic_feature', 'R'))
            g2 = RU.get_subgraph_through_node_sets_known_relationships(
                path_type,
                [[disease_id], drug_phenotype, drug_disease_neighbors,
                 drug_protein_neighbors, drug_pathway_neighbors,
                 drug_pathway_protein_neighbors, [drug]],
                directed=False)
            drug_id_old_curie = drug.replace("CHEMBL.COMPOUND:CHEMBL",
                                             "ChEMBL:")
            # Machine learning probability of "treats"
            prob = p.prob_single(drug_id_old_curie, disease_id)
            if not prob:
                prob = -1
            else:
                prob = prob[0]
            graph_weight_tuples.append((g2, prob, drug))  # use the per-drug subgraph g2 built above, not the full graph g

        # sort by the path weight
        graph_weight_tuples.sort(key=lambda x: x[1], reverse=True)

        # print out the results
        if not use_json:
            num_shown = 0
            for graph, weight, drug_id in graph_weight_tuples:
                num_shown += 1
                if num_shown > num_show:
                    break
                drug_description = RU.get_node_property(
                    drug_id, "name", node_label="chemical_substance")
                drug_id_old_curie = drug_id.replace("CHEMBL.COMPOUND:CHEMBL",
                                                    "ChEMBL:")
                # Machine learning probability of "treats"
                prob = p.prob_single(drug_id_old_curie, disease_id)
                if not prob:
                    prob = -1
                else:
                    prob = prob[0]
                print("%s %f %f" % (drug_description, weight, prob))
        else:
            # add the neighborhood graph
            response.add_neighborhood_graph(g.nodes(data=True),
                                            g.edges(data=True))
            response.response.table_column_names = [
                "disease name", "disease ID", "drug name", "drug ID",
                "path weight", "drug disease google distance",
                "ML probability drug treats disease"
            ]
            num_shown = 0
            for graph, weight, drug_id in graph_weight_tuples:
                num_shown += 1
                if num_shown > num_show:
                    break
                drug_description = RU.get_node_property(
                    drug_id, "name", node_label="chemical_substance")
                drug_id_old_curie = drug_id.replace("CHEMBL.COMPOUND:CHEMBL",
                                                    "ChEMBL:")
                # Machine learning probability of "treats"
                prob = p.prob_single(drug_id_old_curie, disease_id)
                if not prob:
                    prob = -1
                else:
                    prob = prob[0]
                confidence = prob
                # Google distance
                gd = NormGoogleDistance.get_ngd_for_all(
                    [drug_id, disease_id],
                    [drug_description, disease_description])
                # populate the graph
                res = response.add_subgraph(
                    graph.nodes(data=True),
                    graph.edges(data=True),
                    "The drug %s is predicted to treat %s." %
                    (drug_description, disease_description),
                    confidence,
                    return_result=True)
                res.essence = "%s" % drug_description  # populate with essence of question result
                row_data = []  # initialize the row data
                row_data.append("%s" % disease_description)
                row_data.append("%s" % disease_id)
                row_data.append("%s" % drug_description)
                row_data.append("%s" % drug_id)
                row_data.append("%f" % weight)
                row_data.append("%f" % gd)
                row_data.append("%f" % prob)
                res.row_data = row_data
            response.print()
Example 19
    def answer(disease_id, use_json=False, num_show=25):

        num_input_disease_symptoms = 25  # number of representative symptoms of the disease to keep
        num_omim_keep = 25  # number of genetic conditions to keep
        num_protein_keep = 25  # number of implicated proteins to keep
        num_pathways_keep = 25  # number of pathways to keep
        num_pathway_proteins_selected = 25  # number of proteins enriched for the above pathways to select
        num_drugs_keep = num_show  # number of drugs that target those proteins to keep
        num_paths = 2  # number of paths to keep for each drug selected

        # Initialize the response class
        response = FormatOutput.FormatResponse(6)
        response.response.table_column_names = [
            "disease name", "disease ID", "drug name", "drug ID", "confidence"
        ]

        # get the description of the disease
        disease_description = RU.get_node_property(disease_id, 'name')

        # Find symptoms of disease
        # symptoms = RU.get_one_hop_target("disease", disease_id, "phenotypic_feature", "has_phenotype")
        # symptoms_set = set(symptoms)
        (symptoms_dict,
         symptoms) = RU.top_n_fisher_exact([disease_id],
                                           "disease",
                                           "phenotypic_feature",
                                           rel_type="has_phenotype",
                                           n=num_input_disease_symptoms)
        symptoms_set = set(symptoms)
        # check for an error
        if not symptoms_set:
            error_message = "I found no phenotypic_features for %s." % disease_description
            if not use_json:
                print(error_message)
                return
            else:
                error_code = "NoPathsFound"
                response = FormatOutput.FormatResponse(3)
                response.add_error_message(error_code, error_message)
                response.print()
                return

        # Find diseases enriched for that phenotype
        path_type = [
            "gene_mutations_contribute_to", "protein", "participates_in",
            "pathway", "participates_in", "protein",
            "physically_interacts_with", "chemical_substance"
        ]
        (genetic_diseases_dict,
         genetic_diseases_selected) = RU.top_n_fisher_exact(
             symptoms,
             "phenotypic_feature",
             "disease",
             rel_type="has_phenotype",
             n=num_omim_keep,
             curie_prefix="OMIM",
             on_path=path_type,
             exclude=disease_id)

        if not genetic_diseases_selected:
            error_message = "I found no diseases connected to phenotypes of %s." % disease_description
            if not use_json:
                print(error_message)
                return
            else:
                error_code = "NoPathsFound"
                response = FormatOutput.FormatResponse(3)
                response.add_error_message(error_code, error_message)
                response.print()
                return

        # find the most representative proteins in these diseases
        path_type = [
            "participates_in", "pathway", "participates_in", "protein",
            "physically_interacts_with", "chemical_substance"
        ]
        (implicated_proteins_dict,
         implicated_proteins_selected) = RU.top_n_fisher_exact(
             genetic_diseases_selected,
             "disease",
             "protein",
             rel_type="gene_mutations_contribute_to",
             n=num_protein_keep,
             on_path=path_type)

        if not implicated_proteins_selected:
            error_message = "I found no proteins connected to diseases connected to phenotypes of %s." % disease_description
            if not use_json:
                print(error_message)
                return
            else:
                error_code = "NoPathsFound"
                response = FormatOutput.FormatResponse(3)
                response.add_error_message(error_code, error_message)
                response.print()
                return

        # find enriched pathways from those proteins
        path_type = [
            "participates_in", "protein", "physically_interacts_with",
            "chemical_substance"
        ]
        (pathways_selected_dict, pathways_selected) = RU.top_n_fisher_exact(
            implicated_proteins_selected,
            "protein",
            "pathway",
            rel_type="participates_in",
            n=num_pathways_keep,
            on_path=path_type)

        if not pathways_selected:
            error_message = "I found no pathways connected to proteins connected to diseases connected to phenotypes of %s." % disease_description
            if not use_json:
                print(error_message)
                return
            else:
                error_code = "NoPathsFound"
                response = FormatOutput.FormatResponse(3)
                response.add_error_message(error_code, error_message)
                response.print()
                return

        # find proteins enriched for those pathways
        path_type = ["physically_interacts_with", "chemical_substance"]
        (pathway_proteins_dict,
         pathway_proteins_selected) = RU.top_n_fisher_exact(
             pathways_selected,
             "pathway",
             "protein",
             rel_type="participates_in",
             n=num_pathway_proteins_selected,
             on_path=path_type)

        if not pathway_proteins_selected:
            error_message = "I found no proteins connected to pathways connected to proteins connected to diseases connected to phenotypes of %s." % disease_description
            if not use_json:
                print(error_message)
                return
            else:
                error_code = "NoPathsFound"
                response = FormatOutput.FormatResponse(3)
                response.add_error_message(error_code, error_message)
                response.print()
                return

        # find drugs enriched for targeting those proteins
        (drugs_selected_dict, drugs_selected) = RU.top_n_fisher_exact(
            pathway_proteins_selected,
            "protein",
            "chemical_substance",
            rel_type="physically_interacts_with",
            n=num_drugs_keep)

        if not drugs_selected:
            error_message = "I found no drugs connected toproteins connected to pathways connected to proteins connected to diseases connected to phenotypes of %s." % disease_description
            if not use_json:
                print(error_message)
                return
            else:
                error_code = "NoPathsFound"
                response = FormatOutput.FormatResponse(3)
                response.add_error_message(error_code, error_message)
                response.print()
                return

        # Next, find the most likely paths
        # extract the relevant subgraph
        path_type = [
            "disease", "has_phenotype", "phenotypic_feature", "has_phenotype",
            "disease", "gene_mutations_contribute_to", "protein",
            "participates_in", "pathway", "participates_in", "protein",
            "physically_interacts_with", "chemical_substance"
        ]
        g = RU.get_subgraph_through_node_sets_known_relationships(
            path_type, [[disease_id], symptoms, genetic_diseases_selected,
                        implicated_proteins_selected, pathways_selected,
                        pathway_proteins_selected, drugs_selected])

        # decorate graph with fisher p-values
        # get dict of id to nx nodes
        nx_node_to_id = nx.get_node_attributes(g, "names")
        nx_id_to_node = dict()
        # reverse the dictionary
        for node in nx_node_to_id.keys():
            id = nx_node_to_id[node]
            nx_id_to_node[id] = node

        i = 0
        for u, v, d in g.edges(data=True):
            u_id = nx_node_to_id[u]
            v_id = nx_node_to_id[v]
            # decorate correct nodes
            # input disease to symptoms, decorated by symptom p-value
            if (u_id in symptoms_set
                    and v_id == disease_id) or (v_id in symptoms_set
                                                and u_id == disease_id):
                try:
                    d["p_value"] = symptoms_dict[v_id]
                except:
                    d["p_value"] = symptoms_dict[u_id]
                continue
            # symptom to disease, decorated by disease p-value
            if (u_id in symptoms_set and v_id in genetic_diseases_dict) or (
                    v_id in symptoms_set and u_id in genetic_diseases_dict):
                try:
                    d["p_value"] = genetic_diseases_dict[v_id]
                except:
                    d["p_value"] = genetic_diseases_dict[u_id]
                continue
            # disease to protein
            if (u_id in genetic_diseases_dict
                    and v_id in implicated_proteins_dict) or (
                        v_id in genetic_diseases_dict
                        and u_id in implicated_proteins_dict):
                try:
                    d["p_value"] = implicated_proteins_dict[v_id]
                except:
                    d["p_value"] = implicated_proteins_dict[u_id]
                continue
            # protein to pathway
            if (u_id in implicated_proteins_dict
                    and v_id in pathways_selected_dict) or (
                        v_id in implicated_proteins_dict
                        and u_id in pathways_selected_dict):
                try:
                    d["p_value"] = pathways_selected_dict[v_id]
                except:
                    d["p_value"] = pathways_selected_dict[u_id]
                continue
            # pathway to protein
            if (u_id in pathways_selected_dict
                    and v_id in pathway_proteins_dict) or (
                        v_id in pathways_selected_dict
                        and u_id in pathway_proteins_dict):
                try:
                    d["p_value"] = pathway_proteins_dict[v_id]
                except:
                    d["p_value"] = pathway_proteins_dict[u_id]
                continue
            # protein to drug
            if (u_id in pathway_proteins_dict and v_id in drugs_selected_dict
                ) or (v_id in pathway_proteins_dict
                      and u_id in drugs_selected_dict):
                try:
                    d["p_value"] = drugs_selected_dict[v_id]
                except:
                    d["p_value"] = drugs_selected_dict[u_id]
                continue
            # otherwise, stick a p_value of 1
            d["p_value"] = 1

        # decorate with COHD data
        RU.weight_disease_phenotype_by_cohd(
            g, max_phenotype_oxo_dist=2, default_value=1
        )  # automatically pulls it out to top-level property

        # decorate with drug->target binding probability
        RU.weight_graph_with_property(
            g, "probability", default_value=1,
            transformation=lambda x: x)  # pulls it out to top level property

        # transform the graph properties so they all point the same direction
        # will be finding shortest paths, so make 0=bad, 1=good transform to 0=good, 1=bad
        RU.transform_graph_weight(
            g,
            "cohd_freq",
            default_value=0,
            transformation=lambda x: 1 / float(x + .001) - 1 / (1 + .001))
        RU.transform_graph_weight(
            g,
            "probability",
            default_value=0,
            transformation=lambda x: 1 / float(x + .001) - 1 / (1 + .001))

        # merge the graph properties (additively)
        RU.merge_graph_properties(g, ["p_value", "cohd_freq", "probability"],
                                  "merged",
                                  operation=lambda x, y: x + y)
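        # (Illustrative note, not in the original: the two transformations above map a "good"
        #  value of 1 to 0 and a "bad" value of 0 to roughly 999 (1/(0+.001) - 1/(1+.001)),
        #  so that small means good for cohd_freq and probability just as it already does for
        #  p_value; the merged edge property is then the sum of the three, and the
        #  shortest-path search below therefore prefers low-p-value, high-frequency,
        #  high-probability edges.)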

        graph_weight_tuples = []
        for drug in drugs_selected:
            decorated_paths, decorated_path_edges, path_lengths = RU.get_top_shortest_paths(
                g, disease_id, drug, num_paths, property='merged')
            for path_ind in range(num_paths):
                g2 = nx.Graph()
                path = decorated_paths[path_ind]
                for node_prop in path:
                    node_uuid = node_prop['properties']['UUID']
                    g2.add_node(node_uuid, **node_prop)

                path = decorated_path_edges[path_ind]
                for edge_prop in path:
                    source_uuid = edge_prop['properties']['source_node_uuid']
                    target_uuid = edge_prop['properties']['target_node_uuid']
                    g2.add_edge(source_uuid, target_uuid, **edge_prop)
                graph_weight_tuples.append((g2, path_lengths[path_ind], drug))

        # sort by the path weight
        graph_weight_tuples.sort(key=lambda x: x[1])

        # print out the results
        if not use_json:
            for graph, weight, drug_id in graph_weight_tuples:
                drug_description = RU.get_node_property(
                    drug_id, "name", node_label="chemical_substance")
                print("%s %f" % (drug_description, weight))
        else:
            response.response.table_column_names = [
                "disease name", "disease ID", "drug name", "drug ID",
                "path weight", "drug disease google distance",
                "ML probability drug treats disease"
            ]
            for graph, weight, drug_id in graph_weight_tuples:
                drug_description = RU.get_node_property(
                    drug_id, "name", node_label="chemical_substance")
                drug_id_old_curie = drug_id.replace("CHEMBL.COMPOUND:CHEMBL",
                                                    "ChEMBL:")
                # Machine learning probability of "treats"
                prob = p.prob_single(drug_id_old_curie, disease_id)
                if not prob:
                    prob = -1
                else:
                    prob = prob[0]
                confidence = prob
                # Google distance
                gd = NormGoogleDistance.get_ngd_for_all(
                    [drug_id, disease_id],
                    [drug_description, disease_description])
                # populate the graph
                res = response.add_subgraph(
                    graph.nodes(data=True),
                    graph.edges(data=True),
                    "The drug %s is predicted to treat %s." %
                    (drug_description, disease_description),
                    confidence,
                    return_result=True)
                res.essence = "%s" % drug_description  # populate with essence of question result
                row_data = []  # initialize the row data
                row_data.append("%s" % disease_description)
                row_data.append("%s" % disease_id)
                row_data.append("%s" % drug_description)
                row_data.append("%s" % drug_id)
                row_data.append("%f" % weight)
                row_data.append("%f" % gd)
                row_data.append("%f" % prob)
                res.row_data = row_data
            response.print()
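A self-contained sketch of what RU.get_top_shortest_paths plausibly computes (an assumption, not the actual RTX implementation): the k lightest simple paths between two nodes under the merged edge weight, using networkx's shortest_simple_paths.

import itertools
import networkx as nx

def top_k_weighted_paths(graph, source, target, k, weight="merged"):
    """Return up to k simple paths from source to target, lightest first,
    together with their total edge weight under the given property."""
    ranked = []
    path_generator = nx.shortest_simple_paths(graph, source, target, weight=weight)
    for path in itertools.islice(path_generator, k):
        total = sum(graph[u][v][weight] for u, v in zip(path, path[1:]))
        ranked.append((path, total))
    return ranked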
Example 20
    def answer(source_node_ID,
               target_node_type,
               association_node_type,
               use_json=False,
               threshold=0.2,
               n=20):
        """
		Answers the question what X are similar to Y based on overlap of common Z nodes. X is target_node_type,
		Y is source_node_ID, Z is association_node_type. The relationships are automatically determined in
		SimilarNodesInCommon by looking for 1-hop relationships and popping the FIRST one (you are warned).
		:param source_node_ID: actual name in the KG
		:param target_node_type: kinds of nodes you want returned
		:param association_node_type: kind of node you are computing the Jaccard overlap on
		:param use_json: print the results in standardized format
		:param threshold: only return results where jaccard is >= this threshold
		:param n: number of results to return (default 20)
		:return: response (or printed text)
		"""

        # Initialize the response class
        response = FormatOutput.FormatResponse(5)

        # Initialize the similar nodes class
        similar_nodes_in_common = SimilarNodesInCommon.SimilarNodesInCommon()

        # get the description
        source_node_description = RU.get_node_property(source_node_ID,
                                                       'description')

        # get the source node label
        source_node_label = RU.get_node_property(source_node_ID, 'label')

        # Get the nodes in common
        node_jaccard_tuples_sorted, error_code, error_message = similar_nodes_in_common.get_similar_nodes_in_common_source_target_association(
            source_node_ID, target_node_type, association_node_type, threshold)

        # reduce to the top n results
        if len(node_jaccard_tuples_sorted) > n:
            node_jaccard_tuples_sorted = node_jaccard_tuples_sorted[0:n]

        # make sure that the input node isn't in the list
        node_jaccard_tuples_sorted = [
            i for i in node_jaccard_tuples_sorted if i[0] != source_node_ID
        ]

        # check for an error
        if error_code is not None or error_message is not None:
            if not use_json:
                print(error_message)
                return
            else:
                response.add_error_message(error_code, error_message)
                response.print()
                return

        # Otherwise return the results
        if not use_json:
            to_print = "The %s's involving similar %s's as %s are: \n" % (
                target_node_type, association_node_type,
                source_node_description)
            for other_disease_ID, jaccard in node_jaccard_tuples_sorted:
                to_print += "%s\t%s\tJaccard %f\n" % (
                    other_disease_ID,
                    RU.get_node_property(other_disease_ID,
                                         'description'), jaccard)
            print(to_print)
        else:
            node_jaccard_ID_sorted = [
                id for id, jac in node_jaccard_tuples_sorted
            ]

            # print(RU.return_subgraph_through_node_labels(source_node_ID, source_node_label, node_jaccard_ID_sorted, target_node_type,
            #										[association_node_type], with_rel=[], directed=True, debug=True))

            # get the entire subgraph
            g = RU.return_subgraph_through_node_labels(source_node_ID,
                                                       source_node_label,
                                                       node_jaccard_ID_sorted,
                                                       target_node_type,
                                                       [association_node_type],
                                                       with_rel=[],
                                                       directed=False,
                                                       debug=False)

            # extract the source_node_number
            for node, data in g.nodes(data=True):
                if data['properties']['name'] == source_node_ID:
                    source_node_number = node
                    break

            # Get all the target numbers
            target_id2numbers = dict()
            node_jaccard_ID_sorted_set = set(node_jaccard_ID_sorted)
            for node, data in g.nodes(data=True):
                if data['properties']['name'] in node_jaccard_ID_sorted_set:
                    target_id2numbers[data['properties']['name']] = node

            for other_disease_ID, jaccard in node_jaccard_tuples_sorted:
                to_print = "The %s %s involves similar %s's as %s with similarity value %f" % (
                    target_node_type,
                    RU.get_node_property(other_disease_ID, 'description'),
                    association_node_type, source_node_description, jaccard)

                # get all the shortest paths between source and target
                all_paths = nx.all_shortest_paths(
                    g, source_node_number, target_id2numbers[other_disease_ID])

                # get all the nodes on these paths
                try:
                    rel_nodes = set()
                    for path in all_paths:
                        for node in path:
                            rel_nodes.add(node)

                    if rel_nodes:
                        # extract the relevant subgraph
                        sub_g = nx.subgraph(g, rel_nodes)

                        # add it to the response
                        response.add_subgraph(sub_g.nodes(data=True),
                                              sub_g.edges(data=True), to_print,
                                              jaccard)
                except:
                    pass
            response.print()
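The similarity used above is the Jaccard index over shared association nodes (e.g. phenotypes). A minimal, self-contained sketch with made-up phenotype identifiers:

def jaccard_index(set_a, set_b):
    """Jaccard index |A intersect B| / |A union B| between two sets of node ids."""
    set_a, set_b = set(set_a), set(set_b)
    union = set_a | set_b
    return len(set_a & set_b) / len(union) if union else 0.0

# e.g. two diseases sharing 3 of 5 distinct phenotypes:
# jaccard_index({"HP:1", "HP:2", "HP:3", "HP:4"}, {"HP:2", "HP:3", "HP:4", "HP:5"})  -> 0.6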
Example 21
    def answer(disease_id, use_json=False, num_show=20):
        num_diseases_to_select = 10  # number of diseases with shared phenotypes to keep
        num_omim_keep = 10  # number of genetic conditions to keep
        num_proteins_keep = 10  # number of proteins implicated in diseases to keep
        num_pathways_keep = 10  # number of relevant pathways to keep
        num_proteins_in_pathways_keep = 10  # number of proteins in those pathways to keep
        num_drugs_keep = 10  # number of drugs that target those proteins to keep

        # The kinds of paths we're looking for
        path_type = [
            "gene_mutations_contribute_to", "protein", "participates_in",
            "pathway", "participates_in", "protein",
            "physically_interacts_with", "chemical_substance"
        ]

        # Initialize the response class
        response = FormatOutput.FormatResponse(6)

        # get the description of the disease
        disease_description = RU.get_node_property(disease_id, 'name')

        # What are the defining symptoms of the disease?
        # get diseases that have many raw symptoms in common
        # select top N of them
        # get subgraph of these with the input disease
        # weight by COHD data
        # pick diseases with maximal (since frequency) average distance i.e. maximal expected graph distance

        # get disease that have many raw symptoms in common
        similar_nodes_in_common = SimilarNodesInCommon.SimilarNodesInCommon()
        node_jaccard_tuples_sorted, error_code, error_message = similar_nodes_in_common.get_similar_nodes_in_common_source_target_association(
            disease_id, "disease", "phenotypic_feature", 0)

        # select the omims
        diseases_selected = []
        for n, j in node_jaccard_tuples_sorted:
            if n.split(":")[0] == "OMIM":
                diseases_selected.append(n)

        # if we found no genetic conditions, add error message and quit
        if not diseases_selected:
            response.add_error_message(
                "NoGeneticConditions",
                "There appears to be no genetic conditions with phenotypes in common with %s"
                % disease_description)
            response.print()
            return

        # subset to top N omims that actually have the relationship types that we want:
        num_selected = 0
        diseases_selected_on_desired_path = []
        for selected_disease in diseases_selected:
            if RU.paths_of_type_source_fixed_target_free_exists(
                    selected_disease, "disease", path_type, limit=1):
                diseases_selected_on_desired_path.append(selected_disease)
                num_selected += 1
            if num_selected >= num_omim_keep:
                break

        diseases_selected = diseases_selected_on_desired_path

        # Find most representative symptoms by consulting COHD. TODO: see if this actually helps anything
        # get subgraph of these with the input disease
        # get all symptoms of input disease
        # all_symptoms = set()
        # for selected_disease in diseases_selected:
        #	intermediate_phenotypes = RU.get_intermediate_node_ids(disease_id, "disease", "has_phenotype", "phenotypic_feature", "has_phenotype", selected_disease, "disease")
        #	all_symptoms.update(intermediate_phenotypes)
        # turn it back into a list
        # all_symptoms = list(all_symptoms)
        # get the subgraph of all relevant symptoms, the omims selected, and the input disease
        # g = RU.get_graph_from_nodes(all_symptoms + diseases_selected + [disease_id], edges=True)

        # weight by COHD data (if you want to)
        # RU.weight_disease_phenotype_by_cohd(g, max_phenotype_oxo_dist=2)

        # sort by COHD freq
        # disease_path_weight_sorted = RU.get_sorted_path_weights_disease_to_disease(g, disease_id)
        # genetic_diseases_selected = []
        # num_omim = 0
        # for id, weight in disease_path_weight_sorted:
        #	if id.split(":")[0] == "OMIM":
        #		genetic_diseases_selected.append(id)
        #		num_omim += 1
        #	if num_omim >= num_omim_keep:
        #		break

        # in the meantime, use them all
        genetic_diseases_selected = diseases_selected

        # select representative diseases
        # Do nothing for now (use all of them)

        # get drugs that are connected along the paths we want and count how many such paths there are (see the tallying sketch after this example)
        genetic_diseases_to_chemical_substance_dict = dict()
        for selected_disease in genetic_diseases_selected:
            res = RU.count_paths_of_type_source_fixed_target_free(
                selected_disease, "disease", path_type, limit=num_drugs_keep)
            # add it to our dictionary
            genetic_diseases_to_chemical_substance_dict[selected_disease] = res

        # get the unique drugs
        drug_counts_tuples = [
            item
            for items in genetic_diseases_to_chemical_substance_dict.values()
            for item in items
        ]
        drugs_path_counts = dict()
        for drug, count in drug_counts_tuples:
            if drug not in drugs_path_counts:
                drugs_path_counts[drug] = count
            else:
                drugs_path_counts[drug] += count

        # put them as tuples in a list, sorted by the ones with the most paths
        drugs_path_counts_tuples = []
        for drug in drugs_path_counts.keys():
            count = drugs_path_counts[drug]
            drugs_path_counts_tuples.append((drug, count))
        drugs_path_counts_tuples.sort(key=lambda x: x[1], reverse=True)

        if not use_json:
            #for drug, count in drugs_path_counts_tuples:
            #	name = RU.get_node_property(drug, "name", node_label="chemical_substance")
            #	print("%s (%s): %d" % (name, drug, count))
            print("source,target")
            for drug, count in drugs_path_counts_tuples:
                # presumably converts an id like "CHEMBL:CHEMBL123" into the old-style curie "ChEMBL:123"
                drug_old_curie = drug.split(":")[1].replace("L", "L:").replace(
                    "H", "h")
                print("%s,%s" % (drug_old_curie, disease_id))
Example n. 22
0
    def answer(drug_id,
               use_json=False,
               num_show=20,
               rev=True,
               conservative=True):
        """
		Answers the question 'what diseases does $drug commonly treat?'
		:param drug_id: KG drug node id
		:param use_json: bool, use JSON output
		:param num_show: int, number to display
		:param rev: bool, if True order results by most frequent
		:param conservative: bool, True if using exact matches, False if using any synonyms returned by COHD
		:return: none
		"""

        # Initialize the response class
        response = FormatOutput.FormatResponse(6)

        # get the description
        drug_description = RU.get_node_property(drug_id,
                                                'name',
                                                name_type='id')

        # Get the conditions that COHD says it's used to treat
        conditions_treated = COHDUtilities.get_conditions_treating(
            drug_description, conservative=conservative)

        # sort the conditions by concept count (a mocked sketch of this step follows this example)
        ids_counts = []
        for id in conditions_treated:
            cond = conditions_treated[id]
            ids_counts.append((id, cond['concept_count']))

        ids_counts_sorted = sorted(ids_counts, key=lambda x: x[1], reverse=rev)
        ids_sorted = [i[0] for i in ids_counts_sorted]

        # reduce to top n
        ids_sorted_top_n = ids_sorted
        if len(ids_sorted_top_n) > num_show:
            ids_sorted_top_n = ids_sorted_top_n[0:num_show]

        # return the results
        if not use_json:
            if rev:
                to_print = "The most common conditions "
            else:
                to_print = "The least common conditions "
            to_print += "treated with %s, according to the Columbia Open Health Data, are:\n" % drug_description
            for id in ids_sorted_top_n:
                to_print += "condition: %s\t count %d \t frequency %f \n" % (
                    conditions_treated[id]['associated_concept_name'],
                    conditions_treated[id]['concept_count'],
                    conditions_treated[id]['concept_frequency'])
            print(to_print)
        else:
            #  otherwise, you want a JSON output
            #  Attempt to map the COHD names to the KG (this takes some time). TODO: find further speed improvements
            drug_as_graph = RU.get_node_as_graph(drug_id)
            drug_node_info = list(drug_as_graph.nodes(data=True))[0][1]
            id_to_KG_name = dict()
            id_to_name = dict()
            id_to_count = dict()
            id_to_frequency = dict()
            id_to_id = dict()

            # Map IDs to all relevant values, trying the exact and lower-cased
            # COHD name against both the phenotypic_feature and disease labels
            # until one resolves to a node in the KG
            for id in ids_sorted_top_n:
                id_to_name[id] = conditions_treated[id]['associated_concept_name']
                id_to_count[id] = conditions_treated[id]['concept_count']
                id_to_frequency[id] = conditions_treated[id]['concept_frequency']
                id_to_KG_name[id] = None
                for candidate_name, label in [(id_to_name[id], "phenotypic_feature"),
                                              (id_to_name[id], "disease"),
                                              (id_to_name[id].lower(), "phenotypic_feature"),
                                              (id_to_name[id].lower(), "disease")]:
                    try:
                        id_to_KG_name[id] = RU.get_id_from_property(candidate_name, 'name', label=label)
                        id_to_id[id_to_KG_name[id]] = id
                        break
                    except:
                        continue

            # get the graph (one call) of all the nodes that were mapped
            KG_names = []
            for id in ids_sorted_top_n:
                if id_to_KG_name[id] is not None:
                    KG_names.append(id_to_KG_name[id])

            if not KG_names:
                error_message = "Sorry, Columbia Open Health Data has no data on the use of %s" % drug_description
                error_code = "EmptyResult"
                response.add_error_message(error_code, error_message)
                response.print()
                return 1

            all_conditions_graph = RU.get_graph_from_nodes(KG_names)

            # Get the info of the mapped nodes
            id_to_info = dict()
            for u, data in all_conditions_graph.nodes(data=True):
                id = data['properties']['id']
                id = id_to_id[id]
                id_to_info[id] = data

            # for each condition, return the results (with the nice sub-graph if the cohd id's were mapped)
            for id in ids_sorted_top_n:
                if id_to_KG_name[id] is not None:
                    to_print = "According to the Columbia Open Health Data, %s is used to treat patients with the condition %s with frequency " \
                         "%f out of all patients treated with %s (count=%d)." % (
                    drug_description, id_to_name[id], id_to_frequency[id], drug_description, id_to_count[id])
                    nodes = []
                    disease_node_info = id_to_info[id]
                    nodes.append((2, disease_node_info))
                    nodes.append((1, drug_node_info))
                    edges = [(1, 2, {
                        'id': 3,
                        'properties': {
                            'is_defined_by': 'RTX',
                            'predicate': 'treats',
                            'provided_by': 'COHD',
                            'relation': 'treats',
                            'seed_node_uuid': '-1',
                            'source_node_uuid': drug_node_info['properties']['UUID'],
                            'target_node_uuid': disease_node_info['properties']['UUID']
                        },
                        'type': 'treats'
                    })]
                    response.add_subgraph(nodes, edges, to_print,
                                          id_to_frequency[id])
                else:
                    to_print = "According to the Columbia Open Health Data, %s is used to treat patients with the condition %s with frequency " \
                      "%f out of all patients treated with %s (count=%d). This condition is not in our " \
                      "Knowledge graph, so no graph is shown." % (
                     drug_description, id_to_name[id], id_to_frequency[id], drug_description, id_to_count[id])
                    g = RU.get_node_as_graph(drug_id)
                    response.add_subgraph(g.nodes(data=True),
                                          g.edges(data=True), to_print,
                                          id_to_frequency[id])
            response.print()
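
Both branches above depend on the shape of the COHD result: a dict keyed by concept id whose values carry 'associated_concept_name', 'concept_count', and 'concept_frequency'. A minimal, self-contained sketch of the sort-by-count / top-n selection, with a mocked conditions_treated dict (all ids and numbers are placeholders):

conditions_treated = {
    "cohd:placeholder1": {"associated_concept_name": "condition A", "concept_count": 120, "concept_frequency": 0.012},
    "cohd:placeholder2": {"associated_concept_name": "condition B", "concept_count": 45, "concept_frequency": 0.004},
    "cohd:placeholder3": {"associated_concept_name": "condition C", "concept_count": 300, "concept_frequency": 0.031},
}
num_show = 2
ids_sorted_top_n = sorted(conditions_treated,
                          key=lambda c: conditions_treated[c]["concept_count"],
                          reverse=True)[:num_show]
print(ids_sorted_top_n)  # ['cohd:placeholder3', 'cohd:placeholder1']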
Example n. 23
0
    def answer(tissue_id,
               input_protein_list,
               use_json=False,
               num_show=20,
               rev=True):
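        """
        Answers the question 'which proteins correlate most (or least) with $input_protein_list in the tissue $tissue_id?', according to Lil'GIM.
        :param tissue_id: KG anatomical_entity node id
        :param input_protein_list: list of KG protein node ids
        :param use_json: bool, use JSON output
        :param num_show: int, number to display
        :param rev: bool, if True order by largest correlation
        :return: none
        """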

        # Initialize the response class
        response = FormatOutput.FormatResponse(6)

        # Make sure everything exists in the graph
        if not RU.node_exists_with_property(tissue_id, "id"):
            tissue_id = RU.get_node_property(tissue_id,
                                             "id",
                                             node_label="anatomical_entity")

        for i in range(len(input_protein_list)):
            id = input_protein_list[i]
            if not RU.node_exists_with_property(id, "id"):
                input_protein_list[i] = RU.get_node_property(
                    id, "id", node_label="protein")

        # Initialize the QueryLilGIM class
        q = QueryLilGIM.QueryLilGIM()

        # get the description
        tissue_description = RU.get_node_property(
            tissue_id, 'name', node_label="anatomical_entity")

        # Get the correlated proteins
        try:
            correlated_proteins_dict = q.query_neighbor_genes_for_gene_set_in_a_given_anatomy(
                tissue_id, tuple(input_protein_list))
            #correlated_proteins_dict = {'UniProtKB:Q99618': 0.4276333333333333, 'UniProtKB:Q92698': 0.464, 'UniProtKB:P56282': 0.5810000000000001, 'UniProtKB:P49454': 0.4441, 'UniProtKB:P49642': 0.5188333333333334, 'UniProtKB:Q9BZD4': 0.5042666666666668, 'UniProtKB:P38398': 0.4464, 'UniProtKB:Q9BXL8': 0.5009, 'UniProtKB:P42166': 0.4263000000000001, 'UniProtKB:Q96CS2': 0.5844333333333332, 'UniProtKB:Q9BQP7': 0.4903333333333333, 'UniProtKB:O95997': 0.4743333333333333, 'UniProtKB:Q9H4K1': 0.4709, 'UniProtKB:Q9H967': 0.5646666666666667, 'UniProtKB:Q12834': 0.4478, 'UniProtKB:Q71F23': 0.4361, 'UniProtKB:Q9UQ84': 0.4800666666666666, 'UniProtKB:Q9NSP4': 0.4347}
        except:
            error_message = "Lil'GIM is experiencing a problem."
            error_code = "LilGIMerror"
            response.add_error_message(error_code, error_message)
            response.print()
            return 1

        # as a list of tuples
        correlated_proteins_tupes = []
        for k, v in correlated_proteins_dict.items():
            correlated_proteins_tupes.append((k, v))

        # sort by correlation value
        correlated_proteins_tupes_sorted = sorted(correlated_proteins_tupes,
                                                  key=lambda x: x[1],
                                                  reverse=rev)
        correlated_proteins_tupes_sorted = correlated_proteins_tupes_sorted[
            0:num_show]
        correlated_proteins_tupes = correlated_proteins_tupes_sorted

        # return the results (the name formatting below is mirrored by the join_names sketch after this example)
        if not use_json:
            try:
                protein_descriptions = RU.get_node_property(
                    input_protein_list[0],
                    "name",
                    node_label="protein",
                    name_type="id")
            except:
                protein_descriptions = input_protein_list[0]
            for id in input_protein_list[1:-1]:
                protein_descriptions += ", "
                try:
                    protein_descriptions += RU.get_node_property(
                        id, "name", node_label="protein", name_type="id")
                except:
                    protein_descriptions += id
            if len(input_protein_list) > 1:
                try:
                    protein_descriptions += ", and %s" % RU.get_node_property(
                        input_protein_list[-1],
                        "name",
                        node_label="protein",
                        name_type="id")
                except:
                    protein_descriptions += ", and %s" % input_protein_list[-1]
            if rev:
                to_print = "In the tissue: %s, the proteins that correlate most with %s" % (
                    tissue_description, protein_descriptions)
            else:
                to_print = "In the tissue: %s, the proteins that correlate least with %s" % (
                    tissue_description, protein_descriptions)
            to_print += " according to Lil'GIM, are:\n"
            for id, val in correlated_proteins_tupes_sorted:
                try:
                    to_print += "protein: %s\t correlation %f\n" % (
                        RU.get_node_property(
                            id, "name", node_label="protein",
                            name_type="id"), val)
                except:
                    to_print += "protein: %s\t correlation %f\n" % (id, val)
            print(to_print)
        else:
            #  otherwise, you want a JSON output
            protein_descriptions = []
            is_in_KG_list = []
            for protein, corr in correlated_proteins_tupes:
                try:
                    description = RU.get_node_property(protein,
                                                       "name",
                                                       node_label="protein",
                                                       name_type="id")
                    protein_descriptions.append(description)
                    is_in_KG_list.append(True)
                except:
                    protein_description = protein
                    protein_descriptions.append(protein_description)
                    is_in_KG_list.append(False)

            # just get the ones that are actually in the KG. TODO: do something with the ones that are not in the KG
            correlated_proteins_tupes_in_KG = []
            for i in range(len(correlated_proteins_tupes)):
                if is_in_KG_list[i]:
                    correlated_proteins_tupes_in_KG.append(
                        correlated_proteins_tupes[i])

            # Return the results
            full_g = RU.get_graph_from_nodes(
                [id for id, val in correlated_proteins_tupes_in_KG],
                node_property_label="id")
            id2node = dict()
            for nx_id, node in full_g.nodes(data=True):
                id2node[node['properties']['id']] = node
            for id, corr in correlated_proteins_tupes_in_KG:
                to_print = "In the tissue: %s, the protein %s has correlation %f with the given list of proteins." % (
                    tissue_description,
                    RU.get_node_property(
                        id, "name", node_label="protein",
                        name_type="id"), corr)
                response.add_subgraph([(id, id2node[id])], [], to_print, corr)
            response.print()
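
The protein-description strings in the plain-text branch above are assembled element by element ("A, B, and C", falling back to the raw id when a KG name lookup fails). A small, self-contained helper that mirrors just the joining logic (the KG name lookups are omitted and the names are placeholders):

def join_names(names):
    # "A", "A, and B", "A, B, and C", ... matching the formatting built above
    if not names:
        return ""
    if len(names) == 1:
        return names[0]
    return ", ".join(names[:-1]) + ", and " + names[-1]

print(join_names(["protein A", "protein B", "protein C"]))
# protein A, protein B, and protein C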