def get_similar_nodes_in_common_parameters(node_ID, target_node_label, association_node_label):
    """
    Build the argument dictionary for get_similar_nodes_in_common from a source node,
    a target label, and an association label.

    :param node_ID: source node ID (looked up via the 'id' property in the KG)
    :param target_node_label: the node type that you want returned
    :param association_node_label: the association node type (node in common between source and target)
    :return: (arguments, error_code, error_message). On success ``arguments`` is a dict with keys
        input_node_ID, input_node_label, association_node_label, input_association_relationship,
        target_association_relationship, target_node_label and both error fields are None.
        On failure ``arguments`` is an empty dict and the error fields are strings.
    """
    # Check if node exists
    if not RU.node_exists_with_property(node_ID, 'id'):
        error_message = "Sorry, the disease %s is not yet in our knowledge graph." % node_ID
        return {}, "DiseaseNotFound", error_message

    # Get label/kind of node the source is
    input_node_label = RU.get_node_property(node_ID, "label")
    input_node_ID = node_ID

    # Get relationship between source and association label
    rels = RU.get_relationship_types_between(
        input_node_ID, input_node_label, "", association_node_label, max_path_len=1)
    if not rels:
        error_message = "Sorry, the %s %s is not connected to any %s." % (
            input_node_label, input_node_ID, association_node_label)
        # Offer the parent node (via subclass_of) as an alternative, when there is one
        parent = RU.get_one_hop_target(
            input_node_label, input_node_ID, input_node_label, "subclass_of", direction="r")
        if parent:
            parent = parent.pop()
            error_message += "\n Note that %s is a parent of %s, so you might try that instead." % (
                RU.get_node_property(parent, 'name'),
                RU.get_node_property(input_node_ID, 'name'))
        return {}, "NoRelationship", error_message
    # TODO: there could be multiple relationship types, for now, let's just pop one
    input_association_relationship = rels.pop()

    # Get relationship between target and association label
    rels = RU.get_relationship_types_between(
        "", target_node_label, "", association_node_label, max_path_len=1)
    if not rels:
        # The previous wording ("no %s is not connected to any %s") was a double negative
        # that stated the opposite of the actual problem.
        error_message = "Sorry, no %s is connected to any %s." % (
            target_node_label, association_node_label)
        return {}, "NoRelationship", error_message
    target_association_relationship = rels.pop()

    # TODO: kludgy fix for microRNA's having multiple relationship types,
    # only one of which shows up frequently
    if target_association_relationship == "gene_mutations_contribute_to":
        target_association_relationship = "gene_associated_with_condition"

    # populate the arguments
    arguments = dict(
        input_node_ID=input_node_ID,
        input_node_label=input_node_label,
        association_node_label=association_node_label,
        input_association_relationship=input_association_relationship,
        target_association_relationship=target_association_relationship,
        target_node_label=target_node_label)
    return arguments, None, None
def old_answer(disease_ID, use_json=False, threshold=0.2):
    """
    Report the diseases whose phenotype sets have a Jaccard index above ``threshold``
    with the phenotypes of ``disease_ID``.

    This is about 5 times slower than the current answer, but is a bit clearer in how it's coded.

    :param disease_ID: KG identifier of the source disease
    :param use_json: when True, results and errors go through the FormatOutput response object;
        otherwise they are printed as plain text
    :param threshold: only diseases with Jaccard index strictly above this value are reported
    :return: None on success (output is printed), 1 on any error
    """
    # Initialize the response class
    response = FormatOutput.FormatResponse(4)

    def report_error(error_code, error_message):
        """Emit the error in the selected output mode and return the error status 1."""
        if use_json:
            response.add_error_message(error_code, error_message)
            response.print()
        else:
            print(error_message)
        return 1

    def phenotypes_of(label, node_ID):
        """Return phenotypes one hop from the node, widening to paths of length 2-4 if none."""
        found = RU.get_one_hop_target(label, node_ID, "phenotypic_feature", "has_phenotype")
        # Look more steps beyond if we didn't get any direct phenotypes
        if found == []:
            for max_path_len in range(2, 5):
                found = RU.get_node_names_of_type_connected_to_target(
                    label, node_ID, "phenotypic_feature",
                    max_path_len=max_path_len, direction="u")
                if found:
                    break
        return found

    def report_no_diseases_found():
        """Report that nothing crossed the threshold, suggesting the parent disease if any."""
        error_message = "No diseases were found with similarity crossing the threshold of %f." % threshold
        # NOTE(review): the newer `answer` passes direction="r" to this lookup; the call is
        # left as it was here because the helper's default direction is not visible.
        parents = RU.get_one_hop_target(disease_label, disease_ID, disease_label, "subclass_of")
        # Only pop once we know there is something to pop (popping an empty result raises),
        # and pass a single node, not the whole collection, to get_node_property.
        if parents:
            parent = parents.pop()
            error_message += "\n Note that %s is a parent disease to %s, so you might try that instead." % (
                RU.get_node_property(parent, 'description'), disease_description)
        return report_error("NoDiseasesFound", error_message)

    # Check if node exists
    if not RU.node_exists_with_property(disease_ID, 'name'):
        return report_error(
            "DiseaseNotFound",
            "Sorry, the disease %s is not yet in our knowledge graph." % disease_ID)

    # Get label/kind of node the source is
    disease_label = RU.get_node_property(disease_ID, "label")
    if disease_label != "disease":
        return report_error(
            "NotADisease",
            ("Sorry, the input has label %s and needs to be one of: disease, disease."
             " Please try a different term") % disease_label)

    # get the description
    disease_description = RU.get_node_property(disease_ID, 'description')

    # get the phenotypes associated to the disease
    disease_phenotypes = phenotypes_of(disease_label, disease_ID)

    # Make sure you actually picked up at least one phenotype
    if not disease_phenotypes:
        return report_error("NoPhenotypesFound", "No phenotypes found for this disease.")
    disease_phenotypes_set = set(disease_phenotypes)

    # Get all the other diseases that share phenotypes, with the number of phenotypes in common.
    # The query used to be issued twice with identical arguments; once is sufficient.
    names2counts, _names2nodes = RU.count_nodes_of_type_on_path_of_type_to_label(
        disease_ID, disease_label, "disease",
        ["phenotypic_feature"], ["has_phenotype", "has_phenotype"], 0)
    num_source_phenotypes = float(len(disease_phenotypes_set))
    # if it's below this threshold, no way the Jaccard index will be large enough
    other_disease_IDs_to_intersection_counts = {
        ID: count for ID, count in names2counts.items()
        if count / num_source_phenotypes >= threshold
    }

    if not other_disease_IDs_to_intersection_counts:
        return report_no_diseases_found()

    # Now for each of the diseases in here, compute the actual Jaccard index
    disease_jaccard_tuples = []
    for other_disease_ID, intersection_count in other_disease_IDs_to_intersection_counts.items():
        # Every candidate came from a query for "disease" nodes, so the label is known
        # regardless of the ID prefix (it was previously unbound for non DOID/OMIM IDs).
        other_disease_phenotypes = phenotypes_of("disease", other_disease_ID)
        if not other_disease_phenotypes:
            jaccard = 0
        else:
            union = disease_phenotypes_set.union(set(other_disease_phenotypes))
            jaccard = intersection_count / float(len(union))
        if jaccard > threshold:
            disease_jaccard_tuples.append((other_disease_ID, jaccard))

    # Format the results.
    # Maybe nothing passed the threshold
    if not disease_jaccard_tuples:
        return report_no_diseases_found()

    # Otherwise there are results to return, first sort them largest to smallest
    disease_jaccard_tuples_sorted = sorted(
        disease_jaccard_tuples, key=lambda pair: pair[1], reverse=True)
    if not use_json:
        lines = ["The diseases similar to %s are: \n" % disease_description]
        for other_disease_ID, jaccard in disease_jaccard_tuples_sorted:
            lines.append("%s\t%s\tJaccard %f\n" % (
                other_disease_ID,
                RU.get_node_property(other_disease_ID, 'description'),
                jaccard))
        print("".join(lines))
    else:
        for other_disease_ID, jaccard in disease_jaccard_tuples_sorted:
            to_print = "%s is similar to the disease %s with similarity value %f" % (
                disease_description,
                RU.get_node_property(other_disease_ID, 'description'),
                jaccard)
            g = RU.get_node_as_graph(other_disease_ID)
            response.add_subgraph(g.nodes(data=True), g.edges(data=True), to_print, jaccard)
        response.print()
def answer(self, disease_ID, use_json=False, threshold=0.2):
    """
    Answer the question: what other diseases have phenotype-overlap similarity
    (Jaccard index) above ``threshold`` with the given disease_ID.

    :param disease_ID: KG disease name (eg. DOID:8398)
    :param use_json: use the standardized output format
    :param threshold: only include diseases with Jaccard index strictly above this
    :return: None (print to stdout), unless there's an error, then return 1
    """
    # Initialize the response class
    response = FormatOutput.FormatResponse(4)

    def report_error(error_code, error_message):
        """Emit the error in the selected output mode and return the error status 1."""
        if use_json:
            response.add_error_message(error_code, error_message)
            response.print()
        else:
            print(error_message)
        return 1

    def report_no_diseases_found():
        """Report that nothing crossed the threshold, suggesting the parent disease if any."""
        error_message = "No diseases were found with similarity crossing the threshold of %f." % threshold
        parent = RU.get_one_hop_target(
            disease_label, disease_ID, disease_label, "subclass_of", direction="r")
        if parent:
            parent = parent.pop()
            error_message += "\n Note that %s is a parent disease to %s, so you might try that instead." % (
                RU.get_node_property(parent, 'description'), disease_description)
        return report_error("NoDiseasesFound", error_message)

    # Check if node exists
    if not RU.node_exists_with_property(disease_ID, 'name'):
        return report_error(
            "DiseaseNotFound",
            "Sorry, the disease %s is not yet in our knowledge graph." % disease_ID)

    # Get label/kind of node the source is
    disease_label = RU.get_node_property(disease_ID, "label")
    if disease_label != "disease":
        return report_error(
            "NotADisease",
            ("Sorry, the input has label %s and needs to be one of: disease, disease."
             " Please try a different term") % disease_label)

    # get the description
    disease_description = RU.get_node_property(disease_ID, 'description')

    # get the phenotypes associated to the disease
    disease_phenotypes = RU.get_one_hop_target(
        disease_label, disease_ID, "phenotypic_feature", "has_phenotype")

    # Look more steps beyond if we didn't get any direct phenotypes
    if disease_phenotypes == []:
        for max_path_len in range(2, 5):
            disease_phenotypes = RU.get_node_names_of_type_connected_to_target(
                disease_label, disease_ID, "phenotypic_feature",
                max_path_len=max_path_len, direction="u")
            if disease_phenotypes:
                break

    # Make sure you actually picked up at least one phenotype
    if not disease_phenotypes:
        return report_error("NoPhenotypesFound", "No phenotypes found for this disease.")
    num_source_phenotypes = len(set(disease_phenotypes))

    # Path description shared by both count queries: disease -has_phenotype-> phenotype <-has_phenotype- disease
    node_label_list = ["phenotypic_feature"]
    relationship_label_list = ["has_phenotype", "has_phenotype"]
    node_of_interest_position = 0

    # Get all the other diseases that connect, with the number of phenotypes in common.
    # This query used to be issued twice with identical arguments; once is sufficient.
    names2counts, _names2nodes = RU.count_nodes_of_type_on_path_of_type_to_label(
        disease_ID, disease_label, "disease", node_label_list,
        relationship_label_list, node_of_interest_position)
    # if it's below this threshold, no way the Jaccard index will be large enough
    other_disease_IDs_to_intersection_counts = {
        ID: count for ID, count in names2counts.items()
        if count / float(num_source_phenotypes) >= threshold
    }

    # check if any other diseases passed the threshold
    if not other_disease_IDs_to_intersection_counts:
        return report_no_diseases_found()

    # Now for each of the diseases connecting to source, count number of phenotypes.
    # The former separate "DOID" and "OMIM" queries were byte-identical calls, so the
    # union of their results is just the result of a single call.
    other_disease_counts = RU.count_nodes_of_type_for_nodes_that_connect_to_label(
        disease_ID, disease_label, "disease", node_label_list,
        relationship_label_list, node_of_interest_position)

    # then compute the jaccard index
    disease_jaccard_tuples = []
    for other_disease_ID, other_count in other_disease_counts.items():
        jaccard = 0
        if other_disease_ID in other_disease_IDs_to_intersection_counts:
            intersection_count = other_disease_IDs_to_intersection_counts[other_disease_ID]
            # |A u B| = |A| + |B| - |A n B|; |A| is the number of *distinct* source
            # phenotypes, matching the set-based pre-filter above.
            union_card = num_source_phenotypes + other_count - intersection_count
            jaccard = intersection_count / float(union_card)
        if jaccard > threshold:
            disease_jaccard_tuples.append((other_disease_ID, jaccard))

    # Format the results.
    # Maybe nothing passed the threshold (this path previously forgot to print the
    # response in JSON mode, so the error was silently dropped)
    if not disease_jaccard_tuples:
        return report_no_diseases_found()

    # Otherwise there are results to return, first sort them largest to smallest
    disease_jaccard_tuples_sorted = sorted(
        disease_jaccard_tuples, key=lambda pair: pair[1], reverse=True)
    if not use_json:
        lines = ["The diseases with phenotypes similar to %s are: \n" % disease_description]
        for other_disease_ID, jaccard in disease_jaccard_tuples_sorted:
            lines.append("%s\t%s\tJaccard %f\n" % (
                other_disease_ID,
                RU.get_node_property(other_disease_ID, 'description'),
                jaccard))
        print("".join(lines))
    else:
        for other_disease_ID, jaccard in disease_jaccard_tuples_sorted:
            to_print = "%s is phenotypically similar to the disease %s with similarity value %f" % (
                disease_description,
                RU.get_node_property(other_disease_ID, 'description'),
                jaccard)
            g = RU.get_node_as_graph(other_disease_ID)
            response.add_subgraph(g.nodes(data=True), g.edges(data=True), to_print, jaccard)
        response.print()
def answer(tissue_id, input_protein_list, use_json=False, num_show=20, rev=True):
    """
    Report the proteins whose expression correlates most (or least) with a given
    set of proteins in a given tissue, according to Lil'GIM.

    :param tissue_id: anatomical entity; if no node has this value as its 'id' property,
        it is resolved to an id through RU.get_node_property
    :param input_protein_list: proteins to correlate against (resolved to ids the same way);
        the caller's list is not modified
    :param use_json: use the standardized output format instead of plain text
    :param num_show: maximum number of correlated proteins to report
    :param rev: True to report the most correlated proteins, False for the least correlated
    :return: None (print to stdout), unless Lil'GIM fails, then return 1
    """
    # Initialize the response class
    response = FormatOutput.FormatResponse(6)

    def protein_name(protein_id):
        """Return (display name, found_in_KG); falls back to the raw id when lookup fails."""
        try:
            name = RU.get_node_property(
                protein_id, "name", node_label="protein", name_type="id")
        except Exception:
            # best effort: proteins returned by Lil'GIM need not be in the KG
            return protein_id, False
        return name, True

    # Make sure everything exists in the graph
    if not RU.node_exists_with_property(tissue_id, "id"):
        tissue_id = RU.get_node_property(tissue_id, "id", node_label="anatomical_entity")
    # Build a new list rather than overwriting entries of the caller's list
    protein_ids = [
        protein_id if RU.node_exists_with_property(protein_id, "id")
        else RU.get_node_property(protein_id, "id", node_label="protein")
        for protein_id in input_protein_list
    ]

    # Initialize the QueryLilGim class
    q = QueryLilGIM.QueryLilGIM()

    # get the description
    tissue_description = RU.get_node_property(
        tissue_id, 'name', node_label="anatomical_entity")

    # Get the correlated proteins
    try:
        correlated_proteins_dict = q.query_neighbor_genes_for_gene_set_in_a_given_anatomy(
            tissue_id, tuple(protein_ids))
    except Exception:
        # Any failure of the external service is reported as a single error; unlike a bare
        # `except:` this still lets KeyboardInterrupt/SystemExit propagate.
        response.add_error_message("LilGIMerror", "Lil'GIM is experiencing a problem.")
        response.print()
        return 1

    # as a list of (protein, correlation) tuples, sorted by correlation, truncated to num_show
    correlated_proteins_tupes_sorted = sorted(
        correlated_proteins_dict.items(), key=lambda pair: pair[1], reverse=rev)[0:num_show]

    # return the results
    if not use_json:
        # Render the input proteins as "a", "a, and b", "a, b, and c", ...
        input_names = [protein_name(protein_id)[0] for protein_id in protein_ids]
        if len(input_names) > 1:
            protein_descriptions = ", ".join(input_names[:-1]) + ", and %s" % input_names[-1]
        elif input_names:
            protein_descriptions = input_names[0]
        else:
            protein_descriptions = ""  # empty input used to raise IndexError here

        if rev:
            to_print = "In the tissue: %s, the proteins that correlate most with %s" % (
                tissue_description, protein_descriptions)
        else:
            to_print = "In the tissue: %s, the proteins that correlate least with %s" % (
                tissue_description, protein_descriptions)
        lines = [to_print, " according to Lil'GIM, are:\n"]
        for protein_id, val in correlated_proteins_tupes_sorted:
            lines.append("protein: %s\t correlation %f\n" % (protein_name(protein_id)[0], val))
        print("".join(lines))
    else:
        # otherwise, you want a JSON output
        # just get the ones that are actually in the KG.
        # TODO: do something with the ones that are not in the KG
        correlated_proteins_in_KG = []
        for protein_id, corr in correlated_proteins_tupes_sorted:
            name, in_KG = protein_name(protein_id)
            if in_KG:
                # keep the name so it does not have to be queried a second time below
                correlated_proteins_in_KG.append((protein_id, corr, name))

        # Return the results
        full_g = RU.get_graph_from_nodes(
            [protein_id for protein_id, _corr, _name in correlated_proteins_in_KG],
            node_property_label="id")
        id2node = {
            node['properties']['id']: node
            for _nx_id, node in full_g.nodes(data=True)
        }
        for protein_id, corr, name in correlated_proteins_in_KG:
            to_print = "In the tissue: %s, the protein %s has correlation %f with the given list of proteins." % (
                tissue_description, name, corr)
            response.add_subgraph([(protein_id, id2node[protein_id])], [], to_print, corr)
        response.print()