Example #1
0
def old_answer(disease_ID, use_json=False, threshold=0.2):
    """
    Report the diseases whose phenotypes overlap with those of disease_ID.

    Similarity is the Jaccard index between the two phenotype sets.
    This is about 5 times slower than the current answer, but is a bit
    clearer in how it's coded.

    :param disease_ID: knowledge-graph identifier of the source disease
    :param use_json: if True, report through FormatOutput.FormatResponse
        instead of printing plain text
    :param threshold: only diseases with a Jaccard index strictly above this
        value are reported
    :return: None on success (results are printed), 1 if an error was reported
    """
    # Initialize the response class
    response = FormatOutput.FormatResponse(4)

    def fail(error_code, error_message):
        # Report the error through the channel the caller asked for; every
        # error path returns 1.
        if use_json:
            response.add_error_message(error_code, error_message)
            response.print()
        else:
            print(error_message)
        return 1

    def phenotypes_of(node_label, node_ID):
        # Phenotypes directly attached to the node, falling back to
        # progressively longer paths when the one-hop query returns nothing.
        found = RU.get_one_hop_target(node_label, node_ID,
                                      "phenotypic_feature", "has_phenotype")
        if found == []:
            for max_path_len in range(2, 5):
                found = RU.get_node_names_of_type_connected_to_target(
                    node_label,
                    node_ID,
                    "phenotypic_feature",
                    max_path_len=max_path_len,
                    direction="u")
                if found:
                    break
        return found

    def no_similar_diseases():
        # Shared "nothing crossed the threshold" error, with a hint about a
        # parent disease when one exists.
        error_message = "No diseases were found with similarity crossing the threshold of %f." % threshold
        parents = RU.get_one_hop_target(disease_label, disease_ID,
                                        disease_label, "subclass_of")
        # Only pop once we know there is something to pop: the lookup may
        # legitimately come back empty.
        if parents:
            parent = parents.pop()
            error_message += "\n Note that %s is a parent disease to %s, so you might try that instead." % (
                RU.get_node_property(parent, 'description'),
                disease_description)
        return fail("NoDiseasesFound", error_message)

    # Check if node exists
    if not RU.node_exists_with_property(disease_ID, 'name'):
        return fail(
            "DiseaseNotFound",
            "Sorry, the disease %s is not yet in our knowledge graph." % disease_ID)

    # Get label/kind of node the source is
    disease_label = RU.get_node_property(disease_ID, "label")
    if disease_label != "disease":
        return fail(
            "NotADisease",
            "Sorry, the input has label %s and needs to be one of: disease, disease."
            " Please try a different term" % disease_label)

    # get the description
    disease_description = RU.get_node_property(disease_ID, 'description')

    # get the phenotypes associated to the disease
    disease_phenotypes = phenotypes_of(disease_label, disease_ID)

    # Make sure you actually picked up at least one phenotype
    if not disease_phenotypes:
        return fail("NoPhenotypesFound", "No phenotypes found for this disease.")
    disease_phenotypes_set = set(disease_phenotypes)

    # get all the other diseases that connect and the number of phenotypes in common
    node_label_list = ["phenotypic_feature"]
    relationship_label_list = ["has_phenotype", "has_phenotype"]
    node_of_interest_position = 0
    # Every disease node carries the same label, so one query per distinct
    # label is enough.
    target_labels = ("disease",)
    other_disease_IDs_to_intersection_counts = dict()
    for target_label in target_labels:
        names2counts, _ = RU.count_nodes_of_type_on_path_of_type_to_label(
            disease_ID, disease_label, target_label, node_label_list,
            relationship_label_list, node_of_interest_position)
        for ID in names2counts.keys():
            # if it's below this threshold, no way the Jaccard index will be large enough
            if names2counts[ID] / float(len(disease_phenotypes_set)) >= threshold:
                other_disease_IDs_to_intersection_counts[ID] = names2counts[ID]

    if not other_disease_IDs_to_intersection_counts:
        return no_similar_diseases()

    # Now for each of the diseases in here, compute the actual Jaccard index
    # All candidates were reached through the "disease" target label, so the
    # label does not depend on the identifier prefix.
    other_disease_label = "disease"
    disease_jaccard_tuples = []
    for other_disease_ID, shared_count in other_disease_IDs_to_intersection_counts.items():
        other_disease_phenotypes = phenotypes_of(other_disease_label,
                                                 other_disease_ID)

        # compute the Jaccard index
        if not other_disease_phenotypes:
            jaccard = 0
        else:
            union_size = len(
                disease_phenotypes_set.union(other_disease_phenotypes))
            jaccard = shared_count / float(union_size)
        if jaccard > threshold:
            disease_jaccard_tuples.append((other_disease_ID, jaccard))

    # Format the results.
    # Maybe nothing passed the threshold
    if not disease_jaccard_tuples:
        return no_similar_diseases()

    # Otherwise there are results to return, first sort them largest to smallest
    disease_jaccard_tuples_sorted = sorted(disease_jaccard_tuples,
                                           key=lambda pair: pair[1],
                                           reverse=True)
    if not use_json:
        header = "The diseases similar to %s are: \n" % disease_description
        rows = [
            "%s\t%s\tJaccard %f\n" % (
                other_disease_ID,
                RU.get_node_property(other_disease_ID, 'description'),
                jaccard)
            for other_disease_ID, jaccard in disease_jaccard_tuples_sorted
        ]
        print(header + "".join(rows))
    else:
        for other_disease_ID, jaccard in disease_jaccard_tuples_sorted:
            to_print = "%s is similar to the disease %s with similarity value %f" % (
                disease_description,
                RU.get_node_property(other_disease_ID, 'description'), jaccard)
            g = RU.get_node_as_graph(other_disease_ID)
            response.add_subgraph(g.nodes(data=True), g.edges(data=True),
                                  to_print, jaccard)
        response.print()
Example #2
0
    def answer(drug_id,
               use_json=False,
               num_show=20,
               rev=True,
               conservative=True):
        """
        Answers the question 'what diseases does $drug commonly treat?'

        :param drug_id: identifier of the drug node in the knowledge graph
        :param use_json: bool, use JSON output
        :param num_show: int, maximum number of conditions to display
        :param rev: bool, True to list the most frequent conditions first,
            False to list the least frequent first
        :param conservative: bool, True if using exact matches, False if using any synonyms returned by COHD
        :return: None (results are printed); 1 in JSON mode when none of the
            conditions could be mapped to the knowledge graph
        """

        # Initialize the response class
        response = FormatOutput.FormatResponse(6)

        # get the description
        drug_description = RU.get_node_property(drug_id,
                                                'name',
                                                name_type='id')

        # Get the conditions that COHD says it's used to treat
        conditions_treated = COHDUtilities.get_conditions_treating(
            drug_description, conservative=conservative)

        # sort the conditions by frequency (sorted() is stable, so ties keep
        # the order in which COHD returned them) and reduce to the top n
        cohd_ids_sorted = sorted(
            conditions_treated,
            key=lambda cohd_id: conditions_treated[cohd_id]['concept_count'],
            reverse=rev)
        top_cohd_ids = cohd_ids_sorted[:num_show]

        # return the results
        if not use_json:
            if rev:
                to_print = "The most common conditions "
            else:
                to_print = "The least common conditions "
            to_print += "treated with %s, according to the Columbia Open Health Data, are:\n" % drug_description
            to_print += "".join(
                "condition: %s\t count %d \t frequency %f \n" % (
                    conditions_treated[cohd_id]['associated_concept_name'],
                    conditions_treated[cohd_id]['concept_count'],
                    conditions_treated[cohd_id]['concept_frequency'])
                for cohd_id in top_cohd_ids)
            print(to_print)
            return

        #  otherwise, you want a JSON output
        #  Attempt to map the COHD names to the KG (this takes some time). TODO: find further speed improvements
        drug_as_graph = RU.get_node_as_graph(drug_id)
        drug_node_info = list(drug_as_graph.nodes(data=True))[0][1]
        cohd_id_to_KG_id = dict()
        cohd_id_to_name = dict()
        cohd_id_to_count = dict()
        cohd_id_to_frequency = dict()
        KG_id_to_cohd_id = dict()

        # Lookup variants tried in order until one succeeds:
        # (lower-case the name first?, KG node label)
        lookup_variants = (
            (False, "phenotypic_feature"),
            (False, "disease"),
            (True, "phenotypic_feature"),
            (True, "disease"),
        )

        # Map ID's to all relevant values
        for cohd_id in top_cohd_ids:
            condition = conditions_treated[cohd_id]
            condition_name = condition['associated_concept_name']
            cohd_id_to_name[cohd_id] = condition_name
            cohd_id_to_count[cohd_id] = condition['concept_count']
            cohd_id_to_frequency[cohd_id] = condition['concept_frequency']
            cohd_id_to_KG_id[cohd_id] = None
            for use_lowercase, node_label in lookup_variants:
                try:
                    lookup_name = condition_name.lower() if use_lowercase else condition_name
                    cohd_id_to_KG_id[cohd_id] = RU.get_id_from_property(
                        lookup_name, 'name', label=node_label)
                    KG_id_to_cohd_id[cohd_id_to_KG_id[cohd_id]] = cohd_id
                except Exception:
                    # Best-effort mapping: a failed lookup just means the next
                    # variant is tried. Exception (rather than a bare except)
                    # lets KeyboardInterrupt/SystemExit propagate.
                    continue
                break

        # get the graph (one call) of all the nodes that were mapped
        KG_names = [
            cohd_id_to_KG_id[cohd_id] for cohd_id in top_cohd_ids
            if cohd_id_to_KG_id[cohd_id] is not None
        ]

        # NOTE(review): this also fires when COHD returned conditions but none
        # of them could be mapped to the KG, in which case the message is
        # misleading and the unmapped branch below is never reached - confirm
        # whether that is intended.
        if not KG_names:
            error_message = "Sorry, Columbia Open Health Data has no data on the use of %s" % drug_description
            error_code = "EmptyResult"
            response.add_error_message(error_code, error_message)
            response.print()
            return 1

        all_conditions_graph = RU.get_graph_from_nodes(KG_names)

        # Get the info of the mapped nodes, keyed by the COHD id
        cohd_id_to_info = dict()
        for _, data in all_conditions_graph.nodes(data=True):
            KG_id = data['properties']['id']
            cohd_id_to_info[KG_id_to_cohd_id[KG_id]] = data

        # for each condition, return the results (with the nice sub-graph if the cohd id's were mapped)
        for cohd_id in top_cohd_ids:
            text_values = (drug_description, cohd_id_to_name[cohd_id],
                           cohd_id_to_frequency[cohd_id], drug_description,
                           cohd_id_to_count[cohd_id])
            if cohd_id_to_KG_id[cohd_id] is not None:
                to_print = "According to the Columbia Open Health Data, %s is used to treat patients with the condition %s with frequency " \
                    "%f out of all patients treated with %s (count=%d)." % text_values
                disease_node_info = cohd_id_to_info[cohd_id]
                nodes = [(2, disease_node_info), (1, drug_node_info)]
                # Synthetic "treats" edge from the drug (1) to the condition (2)
                edges = [(1, 2, {
                    'id': 3,
                    'properties': {
                        'is_defined_by': 'RTX',
                        'predicate': 'treats',
                        'provided_by': 'COHD',
                        'relation': 'treats',
                        'seed_node_uuid': '-1',
                        'source_node_uuid': drug_node_info['properties']['UUID'],
                        'target_node_uuid': disease_node_info['properties']['UUID']
                    },
                    'type': 'treats'
                })]
                response.add_subgraph(nodes, edges, to_print,
                                      cohd_id_to_frequency[cohd_id])
            else:
                to_print = "According to the Columbia Open Health Data, %s is used to treat patients with the condition %s with frequency " \
                    "%f out of all patients treated with %s (count=%d). This condition is not in our " \
                    "Knowledge graph, so no graph is shown." % text_values
                g = RU.get_node_as_graph(drug_id)
                response.add_subgraph(g.nodes(data=True),
                                      g.edges(data=True), to_print,
                                      cohd_id_to_frequency[cohd_id])
        response.print()
Example #3
0
    def answer(self, disease_ID, use_json=False, threshold=0.2):
        """
        Answer the question: what other diseases have a Jaccard similarity above
        the threshold with the given disease_ID (in terms of phenotype overlap)

        :param disease_ID: KG disease name (eg. DOID:8398)
        :param use_json: use the standardized output format
        :param threshold: only include diseases with Jaccard index strictly above this
        :return: None (print to stdout), unless there's an error, then return 1
        """
        # Initialize the response class
        response = FormatOutput.FormatResponse(4)

        def fail(error_code, error_message):
            # Report the error through the channel the caller asked for; every
            # error path returns 1.
            if use_json:
                response.add_error_message(error_code, error_message)
                response.print()
            else:
                print(error_message)
            return 1

        def no_similar_diseases():
            # Shared "nothing crossed the threshold" error, with a hint about
            # a parent disease when one exists.
            error_message = "No diseases were found with similarity crossing the threshold of %f." % threshold
            parents = RU.get_one_hop_target(disease_label,
                                            disease_ID,
                                            disease_label,
                                            "subclass_of",
                                            direction="r")
            if parents:
                parent = parents.pop()
                error_message += "\n Note that %s is a parent disease to %s, so you might try that instead." % (
                    RU.get_node_property(parent, 'description'),
                    disease_description)
            return fail("NoDiseasesFound", error_message)

        # Check if node exists
        if not RU.node_exists_with_property(disease_ID, 'name'):
            return fail(
                "DiseaseNotFound",
                "Sorry, the disease %s is not yet in our knowledge graph." % disease_ID)

        # Get label/kind of node the source is
        disease_label = RU.get_node_property(disease_ID, "label")
        if disease_label != "disease":
            return fail(
                "NotADisease",
                "Sorry, the input has label %s and needs to be one of: disease, disease."
                " Please try a different term" % disease_label)

        # get the description
        disease_description = RU.get_node_property(disease_ID, 'description')

        # get the phenotypes associated to the disease
        disease_phenotypes = RU.get_one_hop_target(disease_label, disease_ID,
                                                   "phenotypic_feature",
                                                   "has_phenotype")

        # Look more steps beyond if we didn't get any phenotypes
        if disease_phenotypes == []:
            for max_path_len in range(2, 5):
                disease_phenotypes = RU.get_node_names_of_type_connected_to_target(
                    disease_label,
                    disease_ID,
                    "phenotypic_feature",
                    max_path_len=max_path_len,
                    direction="u")
                if disease_phenotypes:
                    break

        # Make sure you actually picked up at least one phenotype
        if not disease_phenotypes:
            return fail("NoPhenotypesFound",
                        "No phenotypes found for this disease.")
        disease_phenotypes_set = set(disease_phenotypes)
        num_disease_phenotypes = len(disease_phenotypes_set)

        # Path description shared by both graph queries below
        node_label_list = ["phenotypic_feature"]
        relationship_label_list = ["has_phenotype", "has_phenotype"]
        node_of_interest_position = 0
        # Every disease node carries the same label, so one query per distinct
        # label is enough.
        target_labels = ("disease",)

        # get all the other diseases that connect and the number of phenotypes in common
        other_disease_IDs_to_intersection_counts = dict()
        for target_label in target_labels:
            names2counts, _ = RU.count_nodes_of_type_on_path_of_type_to_label(
                disease_ID, disease_label, target_label, node_label_list,
                relationship_label_list, node_of_interest_position)
            for ID in names2counts.keys():
                # if it's below this threshold, no way the Jaccard index will be large enough
                if names2counts[ID] / float(num_disease_phenotypes) >= threshold:
                    other_disease_IDs_to_intersection_counts[ID] = names2counts[ID]

        # check if any other diseases passed the threshold
        if not other_disease_IDs_to_intersection_counts:
            return no_similar_diseases()

        # Now for each of the diseases connecting to source, count number of phenotypes
        other_disease_counts = dict()
        for target_label in target_labels:
            other_disease_counts.update(
                RU.count_nodes_of_type_for_nodes_that_connect_to_label(
                    disease_ID, disease_label, target_label, node_label_list,
                    relationship_label_list, node_of_interest_position))

        # then compute the jaccard index
        disease_jaccard_tuples = []
        for other_disease_ID, other_count in other_disease_counts.items():
            jaccard = 0
            if other_disease_ID in other_disease_IDs_to_intersection_counts:
                shared_count = other_disease_IDs_to_intersection_counts[other_disease_ID]
                # |A u B| = |A| + |B| - |A n B|; |A| must be the size of the
                # de-duplicated phenotype set, matching the pre-filter above.
                union_card = num_disease_phenotypes + other_count - shared_count
                jaccard = shared_count / float(union_card)
            if jaccard > threshold:
                disease_jaccard_tuples.append((other_disease_ID, jaccard))

        # Format the results.
        # Maybe nothing passed the threshold
        if not disease_jaccard_tuples:
            return no_similar_diseases()

        # Otherwise there are results to return, first sort them largest to smallest
        disease_jaccard_tuples_sorted = sorted(disease_jaccard_tuples,
                                               key=lambda pair: pair[1],
                                               reverse=True)
        if not use_json:
            header = "The diseases with phenotypes similar to %s are: \n" % disease_description
            rows = [
                "%s\t%s\tJaccard %f\n" % (
                    other_disease_ID,
                    RU.get_node_property(other_disease_ID, 'description'),
                    jaccard)
                for other_disease_ID, jaccard in disease_jaccard_tuples_sorted
            ]
            print(header + "".join(rows))
        else:
            for other_disease_ID, jaccard in disease_jaccard_tuples_sorted:
                to_print = "%s is phenotypically similar to the disease %s with similarity value %f" % (
                    disease_description,
                    RU.get_node_property(other_disease_ID,
                                         'description'), jaccard)
                g = RU.get_node_as_graph(other_disease_ID)
                response.add_subgraph(g.nodes(data=True), g.edges(data=True),
                                      to_print, jaccard)
            response.print()