Ejemplo n.º 1
0
	def get_similar_nodes_in_common_parameters(node_ID, target_node_label, association_node_label):
		"""
		Build the parameters for get_similar_nodes_in_common from a source node, a target label, and an association label.

		:param node_ID: source node ID (checked against the 'id' property in the KG)
		:param target_node_label: the node types that you want returned
		:param association_node_label: the association node (node in common between source and target) type
		:return: tuple (arguments, error_code, error_message). On success, arguments is a dict with keys
				input_node_ID, input_node_label, association_node_label, input_association_relationship,
				target_association_relationship, target_node_label, and both error values are None.
				On failure, arguments is an empty dict and error_code is "DiseaseNotFound" or "NoRelationship".
		"""
		# Bail out early when the source node is not in the graph at all
		if not RU.node_exists_with_property(node_ID, 'id'):
			error_message = "Sorry, the disease %s is not yet in our knowledge graph." % node_ID
			error_code = "DiseaseNotFound"
			return dict(), error_code, error_message

		# Get label/kind of node the source is
		input_node_label = RU.get_node_property(node_ID, "label")
		input_node_ID = node_ID

		# Get relationship between source and association label
		rels = RU.get_relationship_types_between(input_node_ID, input_node_label, "", association_node_label, max_path_len=1)
		if not rels:
			error_code = "NoRelationship"
			error_message = "Sorry, the %s %s is not connected to any %s." % (input_node_label, input_node_ID, association_node_label)
			# Suggest a parent node (via subclass_of) that the caller could query instead
			parent = RU.get_one_hop_target(input_node_label, input_node_ID, input_node_label, "subclass_of", direction="r")
			if parent:
				parent = parent.pop()
				error_message += "\n Note that %s is a parent of %s, so you might try that instead." % (
					RU.get_node_property(parent, 'name'), RU.get_node_property(input_node_ID, 'name'))
			return dict(), error_code, error_message
		# TODO: there could be multiple relationship types, for now, let's just pop one
		input_association_relationship = rels.pop()

		# Get relationship between target and association label
		rels = RU.get_relationship_types_between("", target_node_label, "", association_node_label, max_path_len=1)
		if not rels:
			error_code = "NoRelationship"
			# Fixed wording: the original message contained a double negative ("no ... is not connected")
			error_message = "Sorry, no %s is connected to any %s." % (target_node_label, association_node_label)
			return dict(), error_code, error_message
		target_association_relationship = rels.pop()
		# TODO: kludgy fix for microRNA's having multiple relationship types, only one of which shows up frequently
		if target_association_relationship == "gene_mutations_contribute_to":
			target_association_relationship = "gene_associated_with_condition"

		# populate the arguments
		arguments = dict(input_node_ID=input_node_ID,
						input_node_label=input_node_label,
						association_node_label=association_node_label,
						input_association_relationship=input_association_relationship,
						target_association_relationship=target_association_relationship,
						target_node_label=target_node_label)
		return arguments, None, None
Ejemplo n.º 2
0
def old_answer(disease_ID, use_json=False, threshold=0.2):
    """
    Print the diseases whose phenotype overlap with the given disease has a Jaccard index above the threshold.

    This is about 5 times slower than the current answer, but is a bit clearer in how it's coded.

    :param disease_ID: KG identifier of the source disease (checked against the 'name' property)
    :param use_json: when True, report through FormatOutput.FormatResponse instead of plain text
    :param threshold: only diseases with Jaccard index strictly greater than this are reported
    :return: None when results were printed, 1 when an error was reported
    """
    # Initialize the response class
    response = FormatOutput.FormatResponse(4)

    def _fail(error_code, error_message):
        """Report an error in the selected output format and return the error status 1."""
        if not use_json:
            print(error_message)
        else:
            response.add_error_message(error_code, error_message)
            response.print()
        return 1

    def _phenotypes_of(node_label, node_ID):
        """Return phenotypes one hop away, widening the search to paths of length 2-4 if none are found."""
        found = RU.get_one_hop_target(node_label, node_ID,
                                      "phenotypic_feature", "has_phenotype")
        if found == []:
            for max_path_len in range(2, 5):
                found = RU.get_node_names_of_type_connected_to_target(
                    node_label,
                    node_ID,
                    "phenotypic_feature",
                    max_path_len=max_path_len,
                    direction="u")
                if found:
                    break
        return found

    def _fail_no_diseases():
        """Report that nothing crossed the threshold, suggesting a parent disease when one exists."""
        error_message = "No diseases were found with similarity crossing the threshold of %f." % threshold
        # NOTE(review): the newer answer() passes direction="r" to this lookup; confirm whether it is needed here.
        parents = RU.get_one_hop_target(disease_label, disease_ID,
                                        disease_label, "subclass_of")
        # Only pop once we know there is a parent; popping an empty result would raise
        if parents:
            parent = parents.pop()
            error_message += "\n Note that %s is a parent disease to %s, so you might try that instead." % (
                RU.get_node_property(parent, 'description'), disease_description)
        return _fail("NoDiseasesFound", error_message)

    # Check if node exists
    if not RU.node_exists_with_property(disease_ID, 'name'):
        return _fail(
            "DiseaseNotFound",
            "Sorry, the disease %s is not yet in our knowledge graph." % disease_ID)

    # Get label/kind of node the source is
    disease_label = RU.get_node_property(disease_ID, "label")
    if disease_label != "disease":
        return _fail(
            "NotADisease",
            "Sorry, the input has label %s and needs to be one of: disease, disease."
            " Please try a different term" % disease_label)

    # get the description
    disease_description = RU.get_node_property(disease_ID, 'description')

    # get the phenotypes associated to the disease
    disease_phenotypes = _phenotypes_of(disease_label, disease_ID)

    # Make sure you actually picked up at least one phenotype
    if not disease_phenotypes:
        return _fail("NoPhenotypesFound", "No phenotypes found for this disease.")
    disease_phenotypes_set = set(disease_phenotypes)

    # get all the other diseases that connect directly and count the phenotypes in common
    # (the original ran this identical query twice, once per duplicated "disease" label)
    names2counts, _ = RU.count_nodes_of_type_on_path_of_type_to_label(
        disease_ID, disease_label, "disease", ["phenotypic_feature"],
        ["has_phenotype", "has_phenotype"], 0)
    # if the overlap ratio is below the threshold, no way the Jaccard index will be large enough
    other_disease_IDs_to_intersection_counts = {
        ID: count
        for ID, count in names2counts.items()
        if count / len(disease_phenotypes_set) >= threshold
    }

    if not other_disease_IDs_to_intersection_counts:
        return _fail_no_diseases()

    # Now for each of the diseases in here, compute the actual Jaccard index
    disease_jaccard_tuples = []
    for other_disease_ID, shared_count in other_disease_IDs_to_intersection_counts.items():
        # Both the DOID and OMIM prefixes map to the "disease" label, so assign it unconditionally;
        # the original left the label unbound (or stale) for any other prefix.
        other_disease_label = "disease"
        other_disease_phenotypes = _phenotypes_of(other_disease_label, other_disease_ID)

        # compute the Jaccard index
        if not other_disease_phenotypes:
            jaccard = 0
        else:
            union_size = len(disease_phenotypes_set.union(other_disease_phenotypes))
            jaccard = shared_count / union_size
        if jaccard > threshold:
            disease_jaccard_tuples.append((other_disease_ID, jaccard))

    # Maybe nothing passed the threshold
    if not disease_jaccard_tuples:
        return _fail_no_diseases()

    # Otherwise there are results to return, first sort them largest to smallest
    disease_jaccard_tuples_sorted = sorted(
        disease_jaccard_tuples, key=lambda pair: pair[1], reverse=True)
    if not use_json:
        to_print = "The diseases similar to %s are: \n" % disease_description
        for other_disease_ID, jaccard in disease_jaccard_tuples_sorted:
            to_print += "%s\t%s\tJaccard %f\n" % (
                other_disease_ID,
                RU.get_node_property(other_disease_ID, 'description'), jaccard)
        print(to_print)
    else:
        for other_disease_ID, jaccard in disease_jaccard_tuples_sorted:
            to_print = "%s is similar to the disease %s with similarity value %f" % (
                disease_description,
                RU.get_node_property(other_disease_ID, 'description'), jaccard)
            g = RU.get_node_as_graph(other_disease_ID)
            response.add_subgraph(g.nodes(data=True), g.edges(data=True),
                                  to_print, jaccard)
        response.print()
Ejemplo n.º 3
0
    def answer(self, disease_ID, use_json=False, threshold=0.2):
        """
        Answer the question: what other diseases have Jaccard similarity above the threshold
        with the given disease_ID (in terms of phenotype overlap).

        :param disease_ID: KG disease name (eg. DOID:8398)
        :param use_json: use the standardized output format
        :param threshold: only include diseases with Jaccard index strictly above this
        :return: None (print to stdout), unless there's an error, then return 1
        """
        # Initialize the response class
        response = FormatOutput.FormatResponse(4)

        def _fail(error_code, error_message):
            """Report an error in the selected output format and return the error status 1."""
            if not use_json:
                print(error_message)
            else:
                response.add_error_message(error_code, error_message)
                response.print()
            return 1

        def _fail_no_diseases():
            """Report that nothing crossed the threshold, suggesting a parent disease when one exists."""
            error_message = "No diseases were found with similarity crossing the threshold of %f." % threshold
            parents = RU.get_one_hop_target(disease_label,
                                            disease_ID,
                                            disease_label,
                                            "subclass_of",
                                            direction="r")
            if parents:
                parent = parents.pop()
                error_message += "\n Note that %s is a parent disease to %s, so you might try that instead." % (
                    RU.get_node_property(parent, 'description'), disease_description)
            return _fail("NoDiseasesFound", error_message)

        # Check if node exists
        if not RU.node_exists_with_property(disease_ID, 'name'):
            return _fail(
                "DiseaseNotFound",
                "Sorry, the disease %s is not yet in our knowledge graph." % disease_ID)

        # Get label/kind of node the source is
        disease_label = RU.get_node_property(disease_ID, "label")
        if disease_label != "disease":
            return _fail(
                "NotADisease",
                "Sorry, the input has label %s and needs to be one of: disease, disease."
                " Please try a different term" % disease_label)

        # get the description
        disease_description = RU.get_node_property(disease_ID, 'description')

        # get the phenotypes associated to the disease
        disease_phenotypes = RU.get_one_hop_target(disease_label, disease_ID,
                                                   "phenotypic_feature",
                                                   "has_phenotype")

        # Look more steps beyond if we didn't get any phenotypes at one hop
        if disease_phenotypes == []:
            for max_path_len in range(2, 5):
                disease_phenotypes = RU.get_node_names_of_type_connected_to_target(
                    disease_label,
                    disease_ID,
                    "phenotypic_feature",
                    max_path_len=max_path_len,
                    direction="u")
                if disease_phenotypes:
                    break

        # Make sure you actually picked up at least one phenotype
        if not disease_phenotypes:
            return _fail("NoPhenotypesFound", "No phenotypes found for this disease.")
        disease_phenotypes_set = set(disease_phenotypes)
        num_source_phenotypes = len(disease_phenotypes_set)

        # Path description shared by both graph queries below:
        # disease -has_phenotype-> phenotypic_feature <-has_phenotype- disease
        node_label_list = ["phenotypic_feature"]
        relationship_label_list = ["has_phenotype", "has_phenotype"]
        node_of_interest_position = 0

        # get all the other diseases that connect and count the phenotypes in common
        # (the original repeated this identical query once per duplicated "disease" label)
        names2counts, _ = RU.count_nodes_of_type_on_path_of_type_to_label(
            disease_ID, disease_label, "disease", node_label_list,
            relationship_label_list, node_of_interest_position)
        # if the overlap ratio is below the threshold, no way the Jaccard index will be large enough
        other_disease_IDs_to_intersection_counts = {
            ID: count
            for ID, count in names2counts.items()
            if count / num_source_phenotypes >= threshold
        }

        # check if any other diseases passed the threshold
        if not other_disease_IDs_to_intersection_counts:
            return _fail_no_diseases()

        # Now for each of the diseases connecting to source, count number of phenotypes
        # (the original issued this call twice with identical arguments and unioned the results)
        other_disease_counts = RU.count_nodes_of_type_for_nodes_that_connect_to_label(
            disease_ID, disease_label, "disease", node_label_list,
            relationship_label_list, node_of_interest_position)

        # then compute the jaccard index
        disease_jaccard_tuples = []
        for other_disease_ID, other_count in other_disease_counts.items():
            jaccard = 0
            if other_disease_ID in other_disease_IDs_to_intersection_counts:
                shared_count = other_disease_IDs_to_intersection_counts[other_disease_ID]
                # |A union B| = |A| + |B| - |A intersect B|; use the de-duplicated phenotype count for |A|
                union_card = num_source_phenotypes + other_count - shared_count
                jaccard = shared_count / union_card
            if jaccard > threshold:
                disease_jaccard_tuples.append((other_disease_ID, jaccard))

        # Format the results.
        # Maybe nothing passed the threshold
        if not disease_jaccard_tuples:
            return _fail_no_diseases()

        # Otherwise there are results to return, first sort them largest to smallest
        disease_jaccard_tuples_sorted = sorted(
            disease_jaccard_tuples, key=lambda pair: pair[1], reverse=True)
        if not use_json:
            to_print = "The diseases with phenotypes similar to %s are: \n" % disease_description
            for other_disease_ID, jaccard in disease_jaccard_tuples_sorted:
                to_print += "%s\t%s\tJaccard %f\n" % (
                    other_disease_ID,
                    RU.get_node_property(other_disease_ID,
                                         'description'), jaccard)
            print(to_print)
        else:
            for other_disease_ID, jaccard in disease_jaccard_tuples_sorted:
                to_print = "%s is phenotypically similar to the disease %s with similarity value %f" % (
                    disease_description,
                    RU.get_node_property(other_disease_ID,
                                         'description'), jaccard)
                g = RU.get_node_as_graph(other_disease_ID)
                response.add_subgraph(g.nodes(data=True), g.edges(data=True),
                                      to_print, jaccard)
            response.print()
Ejemplo n.º 4
0
    def answer(tissue_id,
               input_protein_list,
               use_json=False,
               num_show=20,
               rev=True):
        """
        Report the proteins that Lil'GIM correlates most (or least) with a set of proteins in a tissue.

        :param tissue_id: KG id of the tissue; if no node has this 'id', it is resolved through
            RU.get_node_property(..., node_label="anatomical_entity")
        :param input_protein_list: protein identifiers, resolved the same way with node_label="protein";
            the caller's list is not modified
        :param use_json: when True, report through FormatOutput.FormatResponse instead of plain text
        :param num_show: maximum number of correlated proteins to report
        :param rev: True sorts by correlation descending ("most"), False ascending ("least")
        :return: None when results were printed, 1 when the Lil'GIM query failed
        """
        # Initialize the response class
        response = FormatOutput.FormatResponse(6)

        def _display_name(protein_id):
            """Return the KG name of a protein, falling back to the raw id when the lookup fails."""
            try:
                return RU.get_node_property(protein_id,
                                            "name",
                                            node_label="protein",
                                            name_type="id")
            except Exception:
                return protein_id

        # Make sure everything exists in the graph
        if not RU.node_exists_with_property(tissue_id, "id"):
            tissue_id = RU.get_node_property(tissue_id,
                                             "id",
                                             node_label="anatomical_entity")

        # Resolve into a new list so the caller's argument is left untouched
        protein_ids = [
            protein_id if RU.node_exists_with_property(protein_id, "id") else
            RU.get_node_property(protein_id, "id", node_label="protein")
            for protein_id in input_protein_list
        ]

        # Initialize the QueryLilGim class
        q = QueryLilGIM.QueryLilGIM()

        # get the description
        tissue_description = RU.get_node_property(
            tissue_id, 'name', node_label="anatomical_entity")

        # Get the correlated proteins
        try:
            correlated_proteins_dict = q.query_neighbor_genes_for_gene_set_in_a_given_anatomy(
                tissue_id, tuple(protein_ids))
        except Exception:
            # Any failure of the external service is reported uniformly (always via the response object)
            error_message = "Lil'GIM is experiencing a problem."
            error_code = "LilGIMerror"
            response.add_error_message(error_code, error_message)
            response.print()
            return 1

        # sort the (protein, correlation) pairs by correlation and keep the first num_show
        top_correlated = sorted(correlated_proteins_dict.items(),
                                key=lambda pair: pair[1],
                                reverse=rev)[:num_show]

        # return the results
        if not use_json:
            # Build "A", "A, and B", or "A, B, and C"; an empty input list yields an empty string
            input_names = [_display_name(protein_id) for protein_id in protein_ids]
            if len(input_names) > 1:
                protein_descriptions = "%s, and %s" % (
                    ", ".join(input_names[:-1]), input_names[-1])
            else:
                protein_descriptions = "".join(input_names)

            if rev:
                to_print = "In the tissue: %s, the proteins that correlate most with %s" % (
                    tissue_description, protein_descriptions)
            else:
                to_print = "In the tissue: %s, the proteins that correlate least with %s" % (
                    tissue_description, protein_descriptions)
            to_print += " according to Lil'GIM, are:\n"
            for protein_id, val in top_correlated:
                to_print += "protein: %s\t correlation %f\n" % (
                    _display_name(protein_id), val)
            print(to_print)
        else:
            #  otherwise, you want a JSON output
            # just get the ones that are actually in the KG. TODO: do something with the ones that are not in the KG
            names_in_KG = dict()
            for protein_id, _ in top_correlated:
                try:
                    names_in_KG[protein_id] = RU.get_node_property(
                        protein_id,
                        "name",
                        node_label="protein",
                        name_type="id")
                except Exception:
                    # a failed name lookup is treated as "not in the KG"
                    continue
            correlated_in_KG = [(protein_id, corr)
                                for protein_id, corr in top_correlated
                                if protein_id in names_in_KG]

            # Return the results
            full_g = RU.get_graph_from_nodes(
                [protein_id for protein_id, _ in correlated_in_KG],
                node_property_label="id")
            id2node = dict()
            for _, node in full_g.nodes(data=True):
                id2node[node['properties']['id']] = node
            for protein_id, corr in correlated_in_KG:
                to_print = "In the tissue: %s, the protein %s has correlation %f with the given list of proteins." % (
                    tissue_description, names_in_KG[protein_id], corr)
                response.add_subgraph([(protein_id, id2node[protein_id])], [],
                                      to_print, corr)
            response.print()