コード例 #1
0
	def get_similar_nodes_in_common_parameters(node_ID, target_node_label, association_node_label):
		"""
		Work out the arguments that get_similar_nodes_in_common needs, given a source node, the kind of node
		wanted back, and the kind of node the source and target should have in common.
		:param node_ID: source node ID (looked up through the 'id' property in the KG)
		:param target_node_label: the node type that you want returned
		:param association_node_label: the association node (node in common between source and target) type
		:return: dict, error_code, error_message. On success the dict has keys input_node_ID, input_node_label,
				association_node_label, input_association_relationship, target_association_relationship and
				target_node_label, and both error values are None. On failure the dict is empty and error_code
				is "DiseaseNotFound" (unknown node) or "NoRelationship" (no one-hop relationship type found).
		"""
		# Check if node exists
		if not RU.node_exists_with_property(node_ID, 'id'):
			error_message = "Sorry, the disease %s is not yet in our knowledge graph." % node_ID
			return dict(), "DiseaseNotFound", error_message

		# Get label/kind of node the source is
		input_node_label = RU.get_node_property(node_ID, "label")
		input_node_ID = node_ID

		# Get relationship between source and association label
		source_rels = RU.get_relationship_types_between(input_node_ID, input_node_label, "", association_node_label, max_path_len=1)
		if not source_rels:
			error_message = "Sorry, the %s %s is not connected to any %s." % (input_node_label, input_node_ID, association_node_label)
			# If the node has a parent (subclass_of), suggest it as an alternative query
			parents = RU.get_one_hop_target(input_node_label, input_node_ID, input_node_label, "subclass_of", direction="r")
			if parents:
				parent = parents.pop()
				error_message += "\n Note that %s is a parent of %s, so you might try that instead." % (
					RU.get_node_property(parent, 'name'), RU.get_node_property(input_node_ID, 'name'))
			return dict(), "NoRelationship", error_message
		# TODO: there could be multiple relationship types, for now, let's just pop one
		input_association_relationship = source_rels.pop()

		# Get relationship between target and association label
		target_rels = RU.get_relationship_types_between("", target_node_label, "", association_node_label, max_path_len=1)
		if not target_rels:
			error_message = "Sorry, no %s is connected to any %s." % (target_node_label, association_node_label)
			return dict(), "NoRelationship", error_message
		target_association_relationship = target_rels.pop()
		# TODO: kludgy fix for microRNA's having multiple relationship types, only one of which shows up frequently
		if target_association_relationship == "gene_mutations_contribute_to":
			target_association_relationship = "gene_associated_with_condition"

		# populate the arguments
		arguments = dict(input_node_ID=input_node_ID,
						input_node_label=input_node_label,
						association_node_label=association_node_label,
						input_association_relationship=input_association_relationship,
						target_association_relationship=target_association_relationship,
						target_node_label=target_node_label)
		return arguments, None, None
コード例 #2
0
def old_answer(disease_ID, use_json=False, threshold=0.2):
    """
    Print the diseases whose phenotype sets have a Jaccard index above ``threshold`` with ``disease_ID``.

    Per the original author's note, this is about 5 times slower than the current answer, but is a bit
    clearer in how it's coded: it fetches every candidate's phenotypes and builds the set union explicitly.

    :param disease_ID: KG disease identifier (checked against the 'name' node property)
    :param use_json: if True, report results and errors through FormatOutput.FormatResponse instead of plain text
    :param threshold: only diseases with a Jaccard index strictly above this are reported
    :return: None after printing the results, or 1 if an error was reported
    """
    # Initialize the response class
    response = FormatOutput.FormatResponse(4)

    def report_error(error_code, error_message):
        # Emit the error in the requested output mode; every error path returns 1
        if not use_json:
            print(error_message)
        else:
            response.add_error_message(error_code, error_message)
            response.print()
        return 1

    def phenotypes_of(label, node_ID):
        # Direct has_phenotype neighbours first; if there are none, widen the search to paths of length 2..4
        found = RU.get_one_hop_target(label, node_ID, "phenotypic_feature", "has_phenotype")
        if not found:
            for max_path_len in range(2, 5):
                found = RU.get_node_names_of_type_connected_to_target(
                    label,
                    node_ID,
                    "phenotypic_feature",
                    max_path_len=max_path_len,
                    direction="u")
                if found:
                    break
        return found

    # Check if node exists
    if not RU.node_exists_with_property(disease_ID, 'name'):
        return report_error(
            "DiseaseNotFound",
            "Sorry, the disease %s is not yet in our knowledge graph." % disease_ID)

    # Get label/kind of node the source is
    disease_label = RU.get_node_property(disease_ID, "label")
    if disease_label != "disease":
        error_message = "Sorry, the input has label %s and needs to be one of: disease, disease." \
            " Please try a different term" % disease_label
        return report_error("NotADisease", error_message)

    # get the description
    disease_description = RU.get_node_property(disease_ID, 'description')

    def report_no_diseases_found():
        # Shared "nothing crossed the threshold" error, with a hint to try the parent disease if one exists
        error_message = "No diseases were found with similarity crossing the threshold of %f." % threshold
        # NOTE(review): the newer answer() passes direction="r" to this lookup; confirm whether it is needed here
        parents = RU.get_one_hop_target(disease_label, disease_ID,
                                        disease_label, "subclass_of")
        if parents:
            # only pop once we know there is at least one parent
            parent = parents.pop()
            error_message += "\n Note that %s is a parent disease to %s, so you might try that instead." % (
                RU.get_node_property(parent, 'description'), disease_description)
        return report_error("NoDiseasesFound", error_message)

    # get the phenotypes associated to the disease
    disease_phenotypes = phenotypes_of(disease_label, disease_ID)

    # Make sure you actually picked up at least one phenotype
    if not disease_phenotypes:
        return report_error("NoPhenotypesFound", "No phenotypes found for this disease.")
    disease_phenotypes_set = set(disease_phenotypes)
    num_phenotypes = float(len(disease_phenotypes_set))

    # get all the other diseases that connect directly through a shared phenotype, with the number of
    # phenotypes in common
    node_label_list = ["phenotypic_feature"]
    relationship_label_list = ["has_phenotype", "has_phenotype"]
    node_of_interest_position = 0
    names2counts, _ = RU.count_nodes_of_type_on_path_of_type_to_label(
        disease_ID, disease_label, "disease", node_label_list,
        relationship_label_list, node_of_interest_position)
    # if the shared fraction is below the threshold, no way the Jaccard index will be large enough
    other_disease_IDs_to_intersection_counts = {
        other_ID: shared
        for other_ID, shared in names2counts.items()
        if shared / num_phenotypes >= threshold
    }

    if not other_disease_IDs_to_intersection_counts:
        return report_no_diseases_found()

    # Now for each of the diseases in here, compute the actual Jaccard index
    disease_jaccard_tuples = []
    for other_disease_ID, shared in other_disease_IDs_to_intersection_counts.items():
        # every candidate came back from a query for "disease" nodes, whatever its ID prefix
        other_disease_phenotypes = phenotypes_of("disease", other_disease_ID)

        # compute the Jaccard index
        if not other_disease_phenotypes:
            jaccard = 0
        else:
            union = disease_phenotypes_set | set(other_disease_phenotypes)
            jaccard = shared / float(len(union))
        if jaccard > threshold:
            disease_jaccard_tuples.append((other_disease_ID, jaccard))

    # Format the results.
    # Maybe nothing passed the threshold
    if not disease_jaccard_tuples:
        return report_no_diseases_found()

    # Otherwise there are results to return, first sort them largest to smallest
    disease_jaccard_tuples_sorted = sorted(
        disease_jaccard_tuples, key=lambda pair: pair[1], reverse=True)
    if not use_json:
        to_print = "The diseases similar to %s are: \n" % disease_description
        for other_disease_ID, jaccard in disease_jaccard_tuples_sorted:
            to_print += "%s\t%s\tJaccard %f\n" % (
                other_disease_ID,
                RU.get_node_property(other_disease_ID, 'description'), jaccard)
        print(to_print)
    else:
        for other_disease_ID, jaccard in disease_jaccard_tuples_sorted:
            to_print = "%s is similar to the disease %s with similarity value %f" % (
                disease_description,
                RU.get_node_property(other_disease_ID, 'description'), jaccard)
            g = RU.get_node_as_graph(other_disease_ID)
            response.add_subgraph(g.nodes(data=True), g.edges(data=True),
                                  to_print, jaccard)
        response.print()
コード例 #3
0
    def answer(self, disease_ID, use_json=False, threshold=0.2):
        """
        Answer the question: what other diseases have Jaccard similarity above the threshold with the
        given disease_ID (in terms of phenotype overlap)
        :param disease_ID: KG disease name (eg. DOID:8398)
        :param use_json: use the standardized output format
        :param threshold: only include diseases with Jaccard index strictly above this
        :return: None (print to stdout), unless there's an error, then return 1
        """
        # Initialize the response class
        response = FormatOutput.FormatResponse(4)

        def report_error(error_code, error_message):
            # Emit the error in the requested output mode; every error path returns 1
            if not use_json:
                print(error_message)
            else:
                response.add_error_message(error_code, error_message)
                response.print()
            return 1

        # Check if node exists
        if not RU.node_exists_with_property(disease_ID, 'name'):
            return report_error(
                "DiseaseNotFound",
                "Sorry, the disease %s is not yet in our knowledge graph." % disease_ID)

        # Get label/kind of node the source is
        disease_label = RU.get_node_property(disease_ID, "label")
        if disease_label != "disease":
            error_message = "Sorry, the input has label %s and needs to be one of: disease, disease." \
                " Please try a different term" % disease_label
            return report_error("NotADisease", error_message)

        # get the description
        disease_description = RU.get_node_property(disease_ID, 'description')

        def report_no_diseases_found():
            # Shared "nothing crossed the threshold" error, with a hint to try the parent disease if one exists
            error_message = "No diseases were found with similarity crossing the threshold of %f." % threshold
            parents = RU.get_one_hop_target(disease_label,
                                            disease_ID,
                                            disease_label,
                                            "subclass_of",
                                            direction="r")
            if parents:
                parent = parents.pop()
                error_message += "\n Note that %s is a parent disease to %s, so you might try that instead." % (
                    RU.get_node_property(parent, 'description'), disease_description)
            return report_error("NoDiseasesFound", error_message)

        # get the phenotypes associated to the disease
        disease_phenotypes = RU.get_one_hop_target(disease_label, disease_ID,
                                                   "phenotypic_feature",
                                                   "has_phenotype")

        # Look more steps beyond if the one-hop lookup found no phenotypes
        if not disease_phenotypes:
            for max_path_len in range(2, 5):
                disease_phenotypes = RU.get_node_names_of_type_connected_to_target(
                    disease_label,
                    disease_ID,
                    "phenotypic_feature",
                    max_path_len=max_path_len,
                    direction="u")
                if disease_phenotypes:
                    break

        # Make sure you actually picked up at least one phenotype
        if not disease_phenotypes:
            return report_error("NoPhenotypesFound", "No phenotypes found for this disease.")
        # distinct phenotypes only: both the pre-filter and the union size below need the set cardinality
        num_phenotypes = len(set(disease_phenotypes))

        # get all the other diseases that connect directly through a shared phenotype, with the number of
        # phenotypes in common
        node_label_list = ["phenotypic_feature"]
        relationship_label_list = ["has_phenotype", "has_phenotype"]
        node_of_interest_position = 0
        names2counts, _ = RU.count_nodes_of_type_on_path_of_type_to_label(
            disease_ID, disease_label, "disease", node_label_list,
            relationship_label_list, node_of_interest_position)
        # if the shared fraction is below the threshold, no way the Jaccard index will be large enough
        other_disease_IDs_to_intersection_counts = {
            other_ID: shared
            for other_ID, shared in names2counts.items()
            if shared / float(num_phenotypes) >= threshold
        }

        # check if any other diseases passed the threshold
        if not other_disease_IDs_to_intersection_counts:
            return report_no_diseases_found()

        # Now for each of the diseases connecting to source, count number of phenotypes
        other_disease_counts = RU.count_nodes_of_type_for_nodes_that_connect_to_label(
            disease_ID, disease_label, "disease", node_label_list,
            relationship_label_list, node_of_interest_position)

        # then compute the jaccard index as |A n B| / (|A| + |B| - |A n B|)
        disease_jaccard_tuples = []
        for other_disease_ID, other_count in other_disease_counts.items():
            jaccard = 0
            if other_disease_ID in other_disease_IDs_to_intersection_counts:
                shared = other_disease_IDs_to_intersection_counts[other_disease_ID]
                union_card = num_phenotypes + other_count - shared
                jaccard = shared / float(union_card)
            if jaccard > threshold:
                disease_jaccard_tuples.append((other_disease_ID, jaccard))

        # Format the results.
        # Maybe nothing passed the threshold
        if not disease_jaccard_tuples:
            return report_no_diseases_found()

        # Otherwise there are results to return, first sort them largest to smallest
        disease_jaccard_tuples_sorted = sorted(
            disease_jaccard_tuples, key=lambda pair: pair[1], reverse=True)
        if not use_json:
            to_print = "The diseases with phenotypes similar to %s are: \n" % disease_description
            for other_disease_ID, jaccard in disease_jaccard_tuples_sorted:
                to_print += "%s\t%s\tJaccard %f\n" % (
                    other_disease_ID,
                    RU.get_node_property(other_disease_ID,
                                         'description'), jaccard)
            print(to_print)
        else:
            for other_disease_ID, jaccard in disease_jaccard_tuples_sorted:
                to_print = "%s is phenotypically similar to the disease %s with similarity value %f" % (
                    disease_description,
                    RU.get_node_property(other_disease_ID,
                                         'description'), jaccard)
                g = RU.get_node_as_graph(other_disease_ID)
                response.add_subgraph(g.nodes(data=True), g.edges(data=True),
                                      to_print, jaccard)
            response.print()
コード例 #4
0
	def get_similar_nodes_in_common(input_node_ID, input_node_label, association_node_label, input_association_relationship,
				target_association_relationship, target_node_label, threshold=0.2):
		"""
		This function returns the nodes that are associated with an input node based on Jaccard index similarity of
		shared intermediate nodes
		:param input_node_ID: input node ID (in KG)
		:param input_node_label: label of the input node
		:param association_node_label: what kind of node you want to calculate the Jaccard index with
		:param input_association_relationship: how the input node is connected to the association nodes
		:param target_association_relationship: how the target node is connected to the association node
		:param target_node_label: what kind of target nodes to return
		:param threshold: only target nodes with a Jaccard index strictly above this are returned
		:return: a list of tuples, an error_code, and an error_message. tuple[0] is a target node with tuple[1] jaccard
				index based on association nodes, sorted largest first. On failure the list is empty and error_code
				is "NoNodesFound"; on success both error values are None.
		"""
		# human-readable name of the input node, used in the messages below
		input_node_description = RU.get_node_property(input_node_ID, 'name')

		def no_nodes_found(error_message):
			# Build the failure triple, adding a "try the parent" hint when the input node has a subclass_of parent
			parents = RU.get_one_hop_target(input_node_label, input_node_ID, input_node_label, "subclass_of", direction="r")
			if parents:
				parent = parents.pop()
				error_message += "\n Note that %s is a parent of %s, so you might try that instead." % (
					RU.get_node_property(parent, 'name'), input_node_description)
			return [], "NoNodesFound", error_message

		# get the nodes associated to the input node
		input_node_associated_nodes = RU.get_one_hop_target(input_node_label, input_node_ID, association_node_label,
															input_association_relationship)

		# Look more steps beyond if the one-hop lookup found no association nodes
		if not input_node_associated_nodes:
			for max_path_len in range(2, 5):
				input_node_associated_nodes = RU.get_node_names_of_type_connected_to_target(input_node_label, input_node_ID,
																			association_node_label,
																			max_path_len=max_path_len,
																			direction="u")
				if input_node_associated_nodes:
					break

		# Make sure you actually picked up at least one associated node
		if not input_node_associated_nodes:
			return [], "NoNodesFound", "No %s found for %s." % (association_node_label, input_node_description)

		# distinct association nodes only: both the pre-filter and the union size below need the set cardinality
		num_associated = len(set(input_node_associated_nodes))

		# get all the other nodes that connect directly through a shared association node, with the number of
		# association nodes in common
		node_label_list = [association_node_label]
		relationship_label_list = [input_association_relationship, target_association_relationship]
		node_of_interest_position = 0
		names2counts, _ = RU.count_nodes_of_type_on_path_of_type_to_label(input_node_ID, input_node_label,
																		target_node_label, node_label_list,
																		relationship_label_list,
																		node_of_interest_position)
		# if the shared fraction is below the threshold, no way the Jaccard index will be large enough
		other_node_IDs_to_intersection_counts = {
			other_ID: shared
			for other_ID, shared in names2counts.items()
			if shared / float(num_associated) >= threshold
		}

		# check if any other associated nodes passed the threshold
		if not other_node_IDs_to_intersection_counts:
			return no_nodes_found(
				"No %s were found with similarity crossing the threshold of %f." % (target_node_label, threshold))

		# Now for each of the nodes connecting to source, count number of association nodes
		other_node_counts = RU.count_nodes_of_type_for_nodes_that_connect_to_label(input_node_ID, input_node_label,
																				target_node_label, node_label_list,
																				relationship_label_list,
																				node_of_interest_position)

		# then compute the jaccard index as |A n B| / (|A| + |B| - |A n B|)
		node_jaccard_tuples = []
		for other_node_ID, other_count in other_node_counts.items():
			jaccard = 0
			if other_node_ID in other_node_IDs_to_intersection_counts:
				shared = other_node_IDs_to_intersection_counts[other_node_ID]
				union_card = num_associated + other_count - shared
				jaccard = shared / float(union_card)
			if jaccard > threshold:
				node_jaccard_tuples.append((other_node_ID, jaccard))

		# Format the results.
		# Maybe nothing passed the threshold
		if not node_jaccard_tuples:
			return no_nodes_found(
				"No %s's were found with similarity crossing the threshold of %f." % (target_node_label, threshold))

		# Otherwise there are results to return, first sort them largest to smallest
		node_jaccard_tuples_sorted = sorted(node_jaccard_tuples, key=lambda pair: pair[1], reverse=True)

		return node_jaccard_tuples_sorted, None, None