def old_answer(disease_ID, use_json=False, threshold=0.2):
    """
    Report the diseases whose phenotypes overlap with those of ``disease_ID``.

    This is about 5 times slower than the current answer, but is a bit clearer
    in how it's coded: the Jaccard index is computed from the explicit phenotype
    sets of every candidate disease.

    :param disease_ID: KG disease name
    :param use_json: bool, report through the FormatResponse object instead of plain text
    :param threshold: float, only diseases with a Jaccard index strictly above this are reported
    :return: None when results were printed, 1 when an error was reported
    """
    # Initialize the response class
    response = FormatOutput.FormatResponse(4)

    def fail(error_code, error_message):
        """Report an error in the selected output format and return the error status."""
        if use_json:
            response.add_error_message(error_code, error_message)
            response.print()
        else:
            print(error_message)
        return 1

    def phenotypes_of(node_label, node_ID):
        """Return the phenotypes one hop away, widening the path length (2..4) if there are none."""
        phenotypes = RU.get_one_hop_target(node_label, node_ID, "phenotypic_feature", "has_phenotype")
        if not phenotypes:
            for max_path_len in range(2, 5):
                phenotypes = RU.get_node_names_of_type_connected_to_target(
                    node_label, node_ID, "phenotypic_feature",
                    max_path_len=max_path_len, direction="u")
                if phenotypes:
                    break
        return phenotypes

    def fail_no_diseases_found():
        """Report that nothing crossed the threshold, suggesting a parent disease if one exists."""
        error_message = "No diseases were found with similarity crossing the threshold of %f." % threshold
        # NOTE(review): the newer disease-similarity answer passes direction="r" to
        # this lookup; confirm whether this call needs it as well.
        parents = RU.get_one_hop_target(disease_label, disease_ID, disease_label, "subclass_of")
        # Only pop once we know there is a parent; popping an empty result raises.
        if parents:
            parent = parents.pop()
            error_message += "\n Note that %s is a parent disease to %s, so you might try that instead." % (
                RU.get_node_property(parent, 'description'), disease_description)
        return fail("NoDiseasesFound", error_message)

    # Check if node exists
    if not RU.node_exists_with_property(disease_ID, 'name'):
        return fail("DiseaseNotFound",
                    "Sorry, the disease %s is not yet in our knowledge graph." % disease_ID)

    # Get label/kind of node the source is
    disease_label = RU.get_node_property(disease_ID, "label")
    if disease_label != "disease":
        return fail("NotADisease",
                    "Sorry, the input has label %s and needs to be one of: disease, disease."
                    " Please try a different term" % disease_label)

    # get the description
    disease_description = RU.get_node_property(disease_ID, 'description')

    # get the phenotypes associated to the disease, and make sure there is at least one
    disease_phenotypes = phenotypes_of(disease_label, disease_ID)
    if not disease_phenotypes:
        return fail("NoPhenotypesFound", "No phenotypes found for this disease.")
    disease_phenotypes_set = set(disease_phenotypes)
    phenotype_total = float(len(disease_phenotypes_set))

    # get all the other diseases that connect and the number of phenotypes in common.
    # The query used to be repeated for two labels that are now both "disease",
    # so a single call is enough.
    names2counts, _ = RU.count_nodes_of_type_on_path_of_type_to_label(
        disease_ID, disease_label, "disease", ["phenotypic_feature"],
        ["has_phenotype", "has_phenotype"], 0)
    # if it's below this threshold, no way the Jaccard index will be large enough
    other_disease_IDs_to_intersection_counts = {
        ID: count for ID, count in names2counts.items()
        if count / phenotype_total >= threshold
    }
    if not other_disease_IDs_to_intersection_counts:
        return fail_no_diseases_found()

    # Now for each of the diseases in here, compute the actual Jaccard index
    disease_jaccard_tuples = []
    for other_disease_ID, intersection_count in other_disease_IDs_to_intersection_counts.items():
        # DOID and OMIM identifiers both carry the "disease" label, so the label
        # no longer depends on the ID prefix (and can no longer be left unbound).
        other_disease_phenotypes = phenotypes_of("disease", other_disease_ID)
        if other_disease_phenotypes:
            union_size = len(disease_phenotypes_set.union(other_disease_phenotypes))
            jaccard = intersection_count / float(union_size)
        else:
            jaccard = 0
        if jaccard > threshold:
            disease_jaccard_tuples.append((other_disease_ID, jaccard))

    # Maybe nothing passed the threshold
    if not disease_jaccard_tuples:
        return fail_no_diseases_found()

    # Otherwise there are results to return, first sort them largest to smallest
    disease_jaccard_tuples_sorted = sorted(
        disease_jaccard_tuples, key=lambda pair: pair[1], reverse=True)
    if not use_json:
        rows = [
            "%s\t%s\tJaccard %f\n" % (
                other_disease_ID, RU.get_node_property(other_disease_ID, 'description'), jaccard)
            for other_disease_ID, jaccard in disease_jaccard_tuples_sorted
        ]
        print("The diseases similar to %s are: \n" % disease_description + "".join(rows))
    else:
        for other_disease_ID, jaccard in disease_jaccard_tuples_sorted:
            to_print = "%s is similar to the disease %s with similarity value %f" % (
                disease_description, RU.get_node_property(other_disease_ID, 'description'), jaccard)
            g = RU.get_node_as_graph(other_disease_ID)
            response.add_subgraph(g.nodes(data=True), g.edges(data=True), to_print, jaccard)
        response.print()
def answer(drug_id, use_json=False, num_show=20, rev=True, conservative=True):
    """
    Answers the question 'what diseases does $drug commonly treat?'

    :param drug_id: KG drug node name
    :param use_json: bool, use JSON output
    :param num_show: int, number to display
    :param rev: bool. order by most frequent
    :param conservative: bool, True if using exact matches, False if using any synonyms returned by COHD
    :return: None, or 1 when JSON output is requested and none of the conditions
        could be mapped to the knowledge graph
    """
    # Initialize the response class
    response = FormatOutput.FormatResponse(6)

    # get the description
    drug_description = RU.get_node_property(drug_id, 'name', name_type='id')

    # Get the conditions that COHD says it's used to treat
    conditions_treated = COHDUtilities.get_conditions_treating(
        drug_description, conservative=conservative)

    # sort the conditions by frequency and reduce to the top n
    ids_sorted = sorted(
        conditions_treated,
        key=lambda cohd_id: conditions_treated[cohd_id]['concept_count'],
        reverse=rev)
    ids_sorted_top_n = ids_sorted[:num_show]

    # plain-text output
    if not use_json:
        header = "The most common conditions " if rev else "The least common conditions "
        header += "treated with %s, according to the Columbia Open Health Data, are:\n" % drug_description
        rows = [
            "condition: %s\t count %d \t frequency %f \n" % (
                conditions_treated[cohd_id]['associated_concept_name'],
                conditions_treated[cohd_id]['concept_count'],
                conditions_treated[cohd_id]['concept_frequency'])
            for cohd_id in ids_sorted_top_n
        ]
        print(header + "".join(rows))
        return None

    # otherwise, you want a JSON output
    # Attempt to map the COHD names to the KG (this takes some time).
    # TODO: find further speed improvements
    drug_as_graph = RU.get_node_as_graph(drug_id)
    drug_node_info = list(drug_as_graph.nodes(data=True))[0][1]

    # Lookups are tried in this order: the name as given, then lower-cased,
    # each first as a phenotypic feature and then as a disease.
    lookup_attempts = (
        (False, "phenotypic_feature"),
        (False, "disease"),
        (True, "phenotypic_feature"),
        (True, "disease"),
    )
    id_to_KG_name = dict()
    for cohd_id in ids_sorted_top_n:
        concept_name = conditions_treated[cohd_id]['associated_concept_name']
        id_to_KG_name[cohd_id] = None
        for lowercase, label in lookup_attempts:
            try:
                query = concept_name.lower() if lowercase else concept_name
                kg_name = RU.get_id_from_property(query, 'name', label=label)
            except Exception:
                # Best effort: a failed lookup just means we try the next variant.
                # (Not a bare except, so Ctrl-C / SystemExit still propagate.)
                continue
            if kg_name is None:
                # A None result cannot be used as a mapping, so keep trying.
                continue
            id_to_KG_name[cohd_id] = kg_name
            break

    # get the graph (one call) of all the nodes that were mapped
    KG_names = [
        id_to_KG_name[cohd_id] for cohd_id in ids_sorted_top_n
        if id_to_KG_name[cohd_id] is not None
    ]
    if not KG_names:
        error_message = "Sorry, Columbia Open Health Data has no data on the use of %s" % drug_description
        error_code = "EmptyResult"
        response.add_error_message(error_code, error_message)
        response.print()
        return 1
    all_conditions_graph = RU.get_graph_from_nodes(KG_names)

    # Get the info of the mapped nodes, keyed by KG id so that several COHD
    # conditions mapping to the same KG node can all find their node data.
    kg_name_to_info = dict()
    for _, data in all_conditions_graph.nodes(data=True):
        kg_name_to_info[data['properties']['id']] = data

    # for each condition, return the results (with the nice sub-graph if the cohd id's were mapped)
    for cohd_id in ids_sorted_top_n:
        condition = conditions_treated[cohd_id]
        frequency = condition['concept_frequency']
        to_print = "According to the Columbia Open Health Data, %s is used to treat patients with the condition %s with frequency " \
                   "%f out of all patients treated with %s (count=%d)." % (
                       drug_description, condition['associated_concept_name'], frequency,
                       drug_description, condition['concept_count'])

        kg_name = id_to_KG_name[cohd_id]
        disease_node_info = kg_name_to_info.get(kg_name) if kg_name is not None else None
        if disease_node_info is not None:
            nodes = [(2, disease_node_info), (1, drug_node_info)]
            edges = [(1, 2, {
                'id': 3,
                'properties': {
                    'is_defined_by': 'RTX',
                    'predicate': 'treats',
                    'provided_by': 'COHD',
                    'relation': 'treats',
                    'seed_node_uuid': '-1',
                    'source_node_uuid': drug_node_info['properties']['UUID'],
                    'target_node_uuid': disease_node_info['properties']['UUID']
                },
                'type': 'treats'
            })]
            response.add_subgraph(nodes, edges, to_print, frequency)
        else:
            to_print += " This condition is not in our Knowledge graph, so no graph is shown."
            # Reuse the drug graph fetched above instead of querying it again
            # for every unmapped condition.
            response.add_subgraph(
                drug_as_graph.nodes(data=True), drug_as_graph.edges(data=True),
                to_print, frequency)
    response.print()
    return None
def answer(self, disease_ID, use_json=False, threshold=0.2):
    """
    Answer the question: what other diseases have a Jaccard index above
    ``threshold`` with the given disease_ID (in terms of phenotype overlap)

    :param disease_ID: KG disease name (eg. DOID:8398)
    :param use_json: use the standardized output format
    :param threshold: only include diseases with Jaccard index above this
    :return: None (print to stdout), unless there's an error, then return 1
    """
    # Initialize the response class
    response = FormatOutput.FormatResponse(4)

    def fail(error_code, error_message):
        """Report an error in the selected output format and return the error status."""
        if use_json:
            response.add_error_message(error_code, error_message)
            response.print()
        else:
            print(error_message)
        return 1

    def fail_no_diseases_found():
        """Report that nothing crossed the threshold, suggesting a parent disease if one exists."""
        error_message = "No diseases were found with similarity crossing the threshold of %f." % threshold
        parents = RU.get_one_hop_target(disease_label, disease_ID, disease_label, "subclass_of", direction="r")
        if parents:
            parent = parents.pop()
            error_message += "\n Note that %s is a parent disease to %s, so you might try that instead." % (
                RU.get_node_property(parent, 'description'), disease_description)
        return fail("NoDiseasesFound", error_message)

    # Check if node exists
    if not RU.node_exists_with_property(disease_ID, 'name'):
        return fail("DiseaseNotFound",
                    "Sorry, the disease %s is not yet in our knowledge graph." % disease_ID)

    # Get label/kind of node the source is
    disease_label = RU.get_node_property(disease_ID, "label")
    if disease_label != "disease":
        return fail("NotADisease",
                    "Sorry, the input has label %s and needs to be one of: disease, disease."
                    " Please try a different term" % disease_label)

    # get the description
    disease_description = RU.get_node_property(disease_ID, 'description')

    # get the phenotypes associated to the disease
    disease_phenotypes = RU.get_one_hop_target(disease_label, disease_ID, "phenotypic_feature", "has_phenotype")

    # Look more steps beyond if we didn't get any direct phenotypes
    if not disease_phenotypes:
        for max_path_len in range(2, 5):
            disease_phenotypes = RU.get_node_names_of_type_connected_to_target(
                disease_label, disease_ID, "phenotypic_feature",
                max_path_len=max_path_len, direction="u")
            if disease_phenotypes:
                break

    # Make sure you actually picked up at least one phenotype
    if not disease_phenotypes:
        return fail("NoPhenotypesFound", "No phenotypes found for this disease.")
    disease_phenotypes_set = set(disease_phenotypes)
    # Use the de-duplicated count everywhere so that the pre-filter and the
    # Jaccard union agree even if the lookup returned repeated phenotypes.
    phenotype_total = len(disease_phenotypes_set)

    # get all the other diseases that connect and the phenotypes in common
    # (direct connection: disease -has_phenotype-> phenotype <-has_phenotype- disease)
    node_label_list = ["phenotypic_feature"]
    relationship_label_list = ["has_phenotype", "has_phenotype"]
    node_of_interest_position = 0
    # The two target labels that used to be queried are now both "disease",
    # so one query returns everything the repeated queries did.
    names2counts, _ = RU.count_nodes_of_type_on_path_of_type_to_label(
        disease_ID, disease_label, "disease", node_label_list,
        relationship_label_list, node_of_interest_position)
    # if it's below this threshold, no way the Jaccard index will be large enough
    other_disease_IDs_to_intersection_counts = {
        ID: count for ID, count in names2counts.items()
        if count / float(phenotype_total) >= threshold
    }

    # check if any other diseases passed the threshold
    if not other_disease_IDs_to_intersection_counts:
        return fail_no_diseases_found()

    # Now for each of the diseases connecting to source, count number of phenotypes
    # (again a single query, since both former labels are "disease")
    other_disease_counts = RU.count_nodes_of_type_for_nodes_that_connect_to_label(
        disease_ID, disease_label, "disease", node_label_list,
        relationship_label_list, node_of_interest_position)

    # then compute the jaccard index: |A & B| / (|A| + |B| - |A & B|)
    disease_jaccard_tuples = []
    for other_disease_ID, other_count in other_disease_counts.items():
        jaccard = 0
        if other_disease_ID in other_disease_IDs_to_intersection_counts:
            shared = other_disease_IDs_to_intersection_counts[other_disease_ID]
            union_card = phenotype_total + other_count - shared
            jaccard = shared / float(union_card)
        if jaccard > threshold:
            disease_jaccard_tuples.append((other_disease_ID, jaccard))

    # Format the results.
    # Maybe nothing passed the threshold
    if not disease_jaccard_tuples:
        return fail_no_diseases_found()

    # Otherwise there are results to return, first sort them largest to smallest
    disease_jaccard_tuples_sorted = sorted(
        disease_jaccard_tuples, key=lambda pair: pair[1], reverse=True)
    if not use_json:
        rows = [
            "%s\t%s\tJaccard %f\n" % (
                other_disease_ID, RU.get_node_property(other_disease_ID, 'description'), jaccard)
            for other_disease_ID, jaccard in disease_jaccard_tuples_sorted
        ]
        print("The diseases with phenotypes similar to %s are: \n" % disease_description + "".join(rows))
    else:
        for other_disease_ID, jaccard in disease_jaccard_tuples_sorted:
            to_print = "%s is phenotypically similar to the disease %s with similarity value %f" % (
                disease_description, RU.get_node_property(other_disease_ID, 'description'), jaccard)
            g = RU.get_node_as_graph(other_disease_ID)
            response.add_subgraph(g.nodes(data=True), g.edges(data=True), to_print, jaccard)
        response.print()