def get_similar_nodes_in_common_parameters(node_ID, target_node_label, association_node_label):
    """
    Build the arguments for get_similar_nodes_in_common from a source node, a target label,
    and an association label.

    :param node_ID: source node ID (name in KG)
    :param target_node_label: the node types that you want returned
    :param association_node_label: the association node (node in common between source and target) type
    :return: dict, error_code, error_message (dict keys input_node_ID, input_node_label,
        association_node_label, input_association_relationship, target_association_relationship,
        target_node_label). On failure the dict is empty and error_code/error_message are strings;
        on success error_code and error_message are both None.
    """
    # Check if node exists
    if not RU.node_exists_with_property(node_ID, 'id'):
        error_message = "Sorry, the disease %s is not yet in our knowledge graph." % node_ID
        error_code = "DiseaseNotFound"
        return dict(), error_code, error_message

    # Get label/kind of node the source is
    input_node_label = RU.get_node_property(node_ID, "label")
    input_node_ID = node_ID

    # Get relationship between source and association label
    rels = RU.get_relationship_types_between(input_node_ID, input_node_label, "", association_node_label,
                                             max_path_len=1)
    if not rels:
        error_code = "NoRelationship"
        error_message = "Sorry, the %s %s is not connected to any %s." % (
            input_node_label, input_node_ID, association_node_label)
        # Suggest a "subclass_of" neighbour of the same label as an alternative input, if there is one
        parent = RU.get_one_hop_target(input_node_label, input_node_ID, input_node_label, "subclass_of",
                                       direction="r")
        if parent:
            parent = parent.pop()
            error_message += "\n Note that %s is a parent of %s, so you might try that instead." % (
                RU.get_node_property(parent, 'name'), RU.get_node_property(input_node_ID, 'name'))
        return dict(), error_code, error_message
    # TODO: there could be multiple relationship types, for now, let's just pop one
    input_association_relationship = rels.pop()

    # Get relationship between target and association label
    rels = RU.get_relationship_types_between("", target_node_label, "", association_node_label, max_path_len=1)
    if not rels:
        error_code = "NoRelationship"
        # The original wording ("no %s is not connected") was a double negative
        error_message = "Sorry, no %s is connected to any %s." % (target_node_label, association_node_label)
        return dict(), error_code, error_message
    target_association_relationship = rels.pop()

    # TODO: kludgy fix for microRNA's having multiple relationship types, only one of which shows up frequently
    if target_association_relationship == "gene_mutations_contribute_to":
        target_association_relationship = "gene_associated_with_condition"

    # populate the arguments
    arguments = dict(input_node_ID=input_node_ID,
                     input_node_label=input_node_label,
                     association_node_label=association_node_label,
                     input_association_relationship=input_association_relationship,
                     target_association_relationship=target_association_relationship,
                     target_node_label=target_node_label)
    return arguments, None, None
def old_answer(disease_ID, use_json=False, threshold=0.2):
    """
    Legacy implementation of answer: report the diseases whose phenotype sets have a Jaccard index
    above the threshold with the phenotypes of disease_ID.

    :param disease_ID: KG disease name
    :param use_json: if True, report through the FormatOutput response object instead of plain printing
    :param threshold: only include diseases with Jaccard index above this
    :return: None (print to stdout), unless there's an error, then return 1
    """
    # This is about 5 times slower than the current answer, but is a bit clearer in how it's coded
    # Initialize the response class
    response = FormatOutput.FormatResponse(4)

    def report_error(error_code, error_message):
        # Emit the error in the requested output format and hand back the error return value
        if not use_json:
            print(error_message)
        else:
            response.add_error_message(error_code, error_message)
            response.print()
        return 1

    def phenotypes_of(node_label, node_ID):
        # Direct has_phenotype neighbours first; if there are none, widen the search one hop at a time
        phenotypes = RU.get_one_hop_target(node_label, node_ID, "phenotypic_feature", "has_phenotype")
        if phenotypes == []:
            for max_path_len in range(2, 5):
                phenotypes = RU.get_node_names_of_type_connected_to_target(
                    node_label, node_ID, "phenotypic_feature", max_path_len=max_path_len, direction="u")
                if phenotypes:
                    break
        return phenotypes

    # Check if node exists
    if not RU.node_exists_with_property(disease_ID, 'name'):
        return report_error("DiseaseNotFound",
                            "Sorry, the disease %s is not yet in our knowledge graph." % disease_ID)

    # Get label/kind of node the source is
    disease_label = RU.get_node_property(disease_ID, "label")
    if disease_label != "disease":
        return report_error("NotADisease",
                            "Sorry, the input has label %s and needs to be one of: disease, disease."
                            " Please try a different term" % disease_label)

    # get the description
    disease_description = RU.get_node_property(disease_ID, 'description')

    def report_no_diseases_found():
        # Shared by both "nothing crossed the threshold" exits; appends a parent hint when one exists
        error_message = "No diseases were found with similarity crossing the threshold of %f." % threshold
        parents = RU.get_one_hop_target(disease_label, disease_ID, disease_label, "subclass_of")
        # Only pop once we know the result is non-empty, and look up a single parent (not the collection)
        if parents:
            parent = parents.pop()
            error_message += "\n Note that %s is a parent disease to %s, so you might try that instead." % (
                RU.get_node_property(parent, 'description'), disease_description)
        return report_error("NoDiseasesFound", error_message)

    # get the phenotypes associated to the disease
    disease_phenotypes = phenotypes_of(disease_label, disease_ID)

    # Make sure you actually picked up at least one phenotype
    if not disease_phenotypes:
        return report_error("NoPhenotypesFound", "No phenotypes found for this disease.")
    disease_phenotypes_set = set(disease_phenotypes)

    # get all the other disease that connect and get the phenotypes in common (direct connection)
    node_label_list = ["phenotypic_feature"]
    relationship_label_list = ["has_phenotype", "has_phenotype"]
    node_of_interest_position = 0
    other_disease_IDs_to_intersection_counts = dict()
    # The original iterated over two identical labels, repeating the same query; one pass is enough
    for target_label in ("disease",):
        names2counts, _ = RU.count_nodes_of_type_on_path_of_type_to_label(
            disease_ID, disease_label, target_label, node_label_list, relationship_label_list,
            node_of_interest_position)
        for ID, count in names2counts.items():
            # if it's below this threshold, no way the Jaccard index will be large enough
            if count / float(len(disease_phenotypes_set)) >= threshold:
                other_disease_IDs_to_intersection_counts[ID] = count

    if not other_disease_IDs_to_intersection_counts:
        return report_no_diseases_found()

    # Now for each of the diseases in here, compute the actual Jaccard index
    disease_jaccard_tuples = []
    for other_disease_ID, intersection_count in other_disease_IDs_to_intersection_counts.items():
        # Both ID prefixes the original distinguished (DOID, OMIM) mapped to the same label, and any
        # other prefix left the label unbound, so assign it unconditionally
        other_disease_label = "disease"
        other_disease_phenotypes = phenotypes_of(other_disease_label, other_disease_ID)

        # compute the Jaccard index
        if not other_disease_phenotypes:
            jaccard = 0
        else:
            union = disease_phenotypes_set.union(other_disease_phenotypes)
            jaccard = intersection_count / float(len(union))
        if jaccard > threshold:
            disease_jaccard_tuples.append((other_disease_ID, jaccard))

    # Format the results.
    # Maybe nothing passed the threshold
    if not disease_jaccard_tuples:
        return report_no_diseases_found()

    # Otherwise there are results to return, first sort them largest to smallest
    disease_jaccard_tuples_sorted = sorted(disease_jaccard_tuples, key=lambda pair: pair[1], reverse=True)
    if not use_json:
        to_print = "The diseases similar to %s are: \n" % disease_description
        for other_disease_ID, jaccard in disease_jaccard_tuples_sorted:
            to_print += "%s\t%s\tJaccard %f\n" % (
                other_disease_ID, RU.get_node_property(other_disease_ID, 'description'), jaccard)
        print(to_print)
    else:
        for other_disease_ID, jaccard in disease_jaccard_tuples_sorted:
            to_print = "%s is similar to the disease %s with similarity value %f" % (
                disease_description, RU.get_node_property(other_disease_ID, 'description'), jaccard)
            g = RU.get_node_as_graph(other_disease_ID)
            response.add_subgraph(g.nodes(data=True), g.edges(data=True), to_print, jaccard)
        response.print()
def answer(self, disease_ID, use_json=False, threshold=0.2):
    """
    Answer the question: what other diseases have a Jaccard similarity above the threshold with the
    given disease_ID (in terms of phenotype overlap)

    :param disease_ID: KG disease name (eg. DOID:8398)
    :param use_json: use the standardized output format
    :param threshold: only include diseases with Jaccard index above this
    :return: None (print to stdout), unless there's an error, then return 1
    """
    # Initialize the response class
    response = FormatOutput.FormatResponse(4)

    def report_error(error_code, error_message):
        # Emit the error in the requested output format and hand back the error return value
        if not use_json:
            print(error_message)
        else:
            response.add_error_message(error_code, error_message)
            response.print()
        return 1

    # Check if node exists
    if not RU.node_exists_with_property(disease_ID, 'name'):
        return report_error("DiseaseNotFound",
                            "Sorry, the disease %s is not yet in our knowledge graph." % disease_ID)

    # Get label/kind of node the source is
    disease_label = RU.get_node_property(disease_ID, "label")
    if disease_label != "disease":
        return report_error("NotADisease",
                            "Sorry, the input has label %s and needs to be one of: disease, disease."
                            " Please try a different term" % disease_label)

    # get the description
    disease_description = RU.get_node_property(disease_ID, 'description')

    def report_no_diseases_found():
        # Shared by both "nothing crossed the threshold" exits; appends a parent hint when one exists.
        # Going through report_error means the JSON response is printed on every such exit.
        error_message = "No diseases were found with similarity crossing the threshold of %f." % threshold
        parent = RU.get_one_hop_target(disease_label, disease_ID, disease_label, "subclass_of", direction="r")
        if parent:
            parent = parent.pop()
            error_message += "\n Note that %s is a parent disease to %s, so you might try that instead." % (
                RU.get_node_property(parent, 'description'), disease_description)
        return report_error("NoDiseasesFound", error_message)

    # get the phenotypes associated to the disease
    disease_phenotypes = RU.get_one_hop_target(disease_label, disease_ID, "phenotypic_feature", "has_phenotype")

    # Look more steps beyond if we didn't get any direct phenotypes
    if disease_phenotypes == []:
        for max_path_len in range(2, 5):
            disease_phenotypes = RU.get_node_names_of_type_connected_to_target(
                disease_label, disease_ID, "phenotypic_feature", max_path_len=max_path_len, direction="u")
            if disease_phenotypes:
                break

    # Make sure you actually picked up at least one phenotype
    if not disease_phenotypes:
        return report_error("NoPhenotypesFound", "No phenotypes found for this disease.")
    disease_phenotypes_set = set(disease_phenotypes)

    # get all the other disease that connect and get the phenotypes in common (direct connection)
    node_label_list = ["phenotypic_feature"]
    relationship_label_list = ["has_phenotype", "has_phenotype"]
    node_of_interest_position = 0
    # The original listed the same label twice and so issued every query twice; one pass is enough
    target_labels = ("disease",)
    other_disease_IDs_to_intersection_counts = dict()
    for target_label in target_labels:
        names2counts, _ = RU.count_nodes_of_type_on_path_of_type_to_label(
            disease_ID, disease_label, target_label, node_label_list, relationship_label_list,
            node_of_interest_position)
        for ID, count in names2counts.items():
            # if it's below this threshold, no way the Jaccard index will be large enough
            if count / float(len(disease_phenotypes_set)) >= threshold:
                other_disease_IDs_to_intersection_counts[ID] = count

    # check if any other diseases passed the threshold
    if not other_disease_IDs_to_intersection_counts:
        return report_no_diseases_found()

    # Now for each of the diseases connecting to source, count number of phenotypes
    other_disease_counts = dict()
    for target_label in target_labels:
        other_disease_counts.update(RU.count_nodes_of_type_for_nodes_that_connect_to_label(
            disease_ID, disease_label, target_label, node_label_list, relationship_label_list,
            node_of_interest_position))

    # then compute the jaccard index
    disease_jaccard_tuples = []
    for other_disease_ID, other_count in other_disease_counts.items():
        jaccard = 0
        if other_disease_ID in other_disease_IDs_to_intersection_counts:
            intersection_count = other_disease_IDs_to_intersection_counts[other_disease_ID]
            # |A union B| = |A| + |B| - |A intersect B|
            union_card = len(disease_phenotypes) + other_count - intersection_count
            jaccard = intersection_count / float(union_card)
        if jaccard > threshold:
            disease_jaccard_tuples.append((other_disease_ID, jaccard))

    # Format the results.
    # Maybe nothing passed the threshold
    if not disease_jaccard_tuples:
        return report_no_diseases_found()

    # Otherwise there are results to return, first sort them largest to smallest
    disease_jaccard_tuples_sorted = sorted(disease_jaccard_tuples, key=lambda pair: pair[1], reverse=True)
    if not use_json:
        to_print = "The diseases with phenotypes similar to %s are: \n" % disease_description
        for other_disease_ID, jaccard in disease_jaccard_tuples_sorted:
            to_print += "%s\t%s\tJaccard %f\n" % (
                other_disease_ID, RU.get_node_property(other_disease_ID, 'description'), jaccard)
        print(to_print)
    else:
        for other_disease_ID, jaccard in disease_jaccard_tuples_sorted:
            to_print = "%s is phenotypically similar to the disease %s with similarity value %f" % (
                disease_description, RU.get_node_property(other_disease_ID, 'description'), jaccard)
            g = RU.get_node_as_graph(other_disease_ID)
            response.add_subgraph(g.nodes(data=True), g.edges(data=True), to_print, jaccard)
        response.print()
def get_similar_nodes_in_common(input_node_ID, input_node_label, association_node_label,
                                input_association_relationship, target_association_relationship,
                                target_node_label, threshold=0.2):
    """
    This function returns the nodes that are associated with an input node based on Jaccard index
    similarity of shared intermediate nodes

    :param input_node_ID: input node ID (in KG)
    :param input_node_label: label of the input node
    :param association_node_label: what kind of node you want to calculate the Jaccard index with
    :param input_association_relationship: how the input node is connected to the association nodes
    :param target_association_relationship: how the target node is connected to the association node
    :param target_node_label: what kind of target nodes to return
    :param threshold: only target nodes with Jaccard index above this are returned
    :return: a list of tuples, an error_code, and an error_message. tuple[0] is a target node with
        tuple[1] jaccard index based on association nodes; the list is sorted by decreasing Jaccard
        index. On error the list is empty; on success error_code and error_message are None.
    """
    # get the description
    input_node_description = RU.get_node_property(input_node_ID, 'name')

    def with_parent_hint(error_message):
        # Append a suggestion to try a "subclass_of" neighbour of the same label, if one exists.
        # The 'name' property is used so the hint matches how the input node itself is described.
        parent = RU.get_one_hop_target(input_node_label, input_node_ID, input_node_label, "subclass_of",
                                       direction="r")
        if parent:
            parent = parent.pop()
            error_message += "\n Note that %s is a parent of %s, so you might try that instead." % (
                RU.get_node_property(parent, 'name'), input_node_description)
        return error_message

    # get the nodes associated to the input node
    input_node_associated_nodes = RU.get_one_hop_target(input_node_label, input_node_ID, association_node_label,
                                                        input_association_relationship)

    # Look more steps beyond if we didn't get any direct association nodes
    if input_node_associated_nodes == []:
        for max_path_len in range(2, 5):
            input_node_associated_nodes = RU.get_node_names_of_type_connected_to_target(
                input_node_label, input_node_ID, association_node_label,
                max_path_len=max_path_len, direction="u")
            if input_node_associated_nodes:
                break

    # Make sure you actually picked up at least one associated node
    if not input_node_associated_nodes:
        error_code = "NoNodesFound"
        error_message = "No %s found for %s." % (association_node_label, input_node_description)
        return [], error_code, error_message

    input_node_associated_nodes_set = set(input_node_associated_nodes)

    # get all the other nodes that connect and get the association nodes in common (direct connection)
    node_label_list = [association_node_label]
    relationship_label_list = [input_association_relationship, target_association_relationship]
    node_of_interest_position = 0
    target_labels = [target_node_label]
    other_node_IDs_to_intersection_counts = dict()
    for target_label in target_labels:
        names2counts, _ = RU.count_nodes_of_type_on_path_of_type_to_label(
            input_node_ID, input_node_label, target_label, node_label_list, relationship_label_list,
            node_of_interest_position)
        for ID, count in names2counts.items():
            # if it's below this threshold, no way the Jaccard index will be large enough
            if count / float(len(input_node_associated_nodes_set)) >= threshold:
                other_node_IDs_to_intersection_counts[ID] = count

    # check if any other associated nodes passed the threshold
    if not other_node_IDs_to_intersection_counts:
        error_code = "NoNodesFound"
        error_message = with_parent_hint(
            "No %s were found with similarity crossing the threshold of %f." % (target_node_label, threshold))
        return [], error_code, error_message

    # Now for each of the nodes connecting to source, count number of association nodes
    other_node_counts = dict()
    for target_label in target_labels:
        other_node_counts.update(RU.count_nodes_of_type_for_nodes_that_connect_to_label(
            input_node_ID, input_node_label, target_label, node_label_list, relationship_label_list,
            node_of_interest_position))

    # then compute the jaccard index
    node_jaccard_tuples = []
    for other_node_ID, other_count in other_node_counts.items():
        jaccard = 0
        if other_node_ID in other_node_IDs_to_intersection_counts:
            intersection_count = other_node_IDs_to_intersection_counts[other_node_ID]
            # |A union B| = |A| + |B| - |A intersect B|
            union_card = len(input_node_associated_nodes) + other_count - intersection_count
            jaccard = intersection_count / float(union_card)
        if jaccard > threshold:
            node_jaccard_tuples.append((other_node_ID, jaccard))

    # Format the results.
    # Maybe nothing passed the threshold
    if not node_jaccard_tuples:
        error_code = "NoNodesFound"
        error_message = with_parent_hint(
            "No %s's were found with similarity crossing the threshold of %f." % (target_node_label, threshold))
        return [], error_code, error_message

    # Otherwise there are results to return, first sort them largest to smallest
    node_jaccard_tuples_sorted = sorted(node_jaccard_tuples, key=lambda pair: pair[1], reverse=True)
    return node_jaccard_tuples_sorted, None, None