def main():
    """Command-line entry point for the Fisher-exact-test enrichment question.

    Parses the command line and then either prints the question description
    (``--describe``) or asks ``QuestionFisher`` for the target nodes enriched
    by the given source node(s). Results are written to stdout: as plain
    ``"<target name> <p value>"`` lines by default, or through
    ``FormatOutput.FormatResponse`` when ``--json`` is given.

    ``--source`` accepts either a single curie ID or a bracketed,
    comma-separated list such as ``[UniProtKB:Q96M43,UniProtKB:P12345]``;
    the elements may be unquoted or single-quoted.

    :return: None
    """
    parser = argparse.ArgumentParser(
        description="Answers questions of the form: 'what pathways are most enriched by $protein_list?'",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-s', '--source', type=str, help="source curie ID", default="UniProtKB:Q96M43")
    parser.add_argument('-t', '--target', type=str, help="target node type", default="pathway")
    parser.add_argument('-y', '--type', type=str, help="source node type", default="protein")
    parser.add_argument('-j', '--json', action='store_true',
                        help='Flag specifying that results should be printed in JSON format (to stdout)',
                        default=False)
    parser.add_argument('-r', '--rel_type', type=str,
                        help='Only do the Fisher exact test along edges of this type',
                        default=None)
    parser.add_argument('--describe', action='store_true',
                        help='Print a description of the question to stdout and quit',
                        default=False)
    parser.add_argument('--num_show', type=int, help='Maximum number of results to return', default=20)

    # Parse and check args
    args = parser.parse_args()
    source_arg = args.source
    target_type = args.target
    source_type = args.type
    use_json = args.json
    describe_flag = args.describe
    num_show = args.num_show
    rel_type = args.rel_type

    # startswith() instead of source_arg[0] so that an empty --source value
    # is treated as a single (empty) ID rather than raising IndexError.
    if source_arg.startswith("["):
        if "','" not in source_arg:
            # Unquoted list: wrap every element in single quotes so the text
            # can be parsed as a Python list of string literals.
            source_arg = source_arg.replace(",", "','").replace("[", "['").replace("]", "']")
        source_list = [source.strip() for source in ast.literal_eval(source_arg)]
    else:
        source_list = [source_arg]

    # Initialize the question class
    question = QuestionFisher()

    if describe_flag:
        print(question.describe())
        return None

    # Initialize the response class
    response = FormatOutput.FormatResponse(6)
    response.response.table_column_names = ["target name", "target ID", "P value"]

    q_answer = question.answer(source_list, source_type, target_type,
                               use_json=use_json, num_show=num_show, rel_type=rel_type)
    if not q_answer:
        # Nothing to report; all messages have already been printed, safe to quit.
        return None
    p_dict, target_list = q_answer

    # Plain-text output: one "<name> <p value>" line per target.
    if not use_json:
        for target_name in target_list:
            target_description = RU.get_node_property(target_name, "name", node_label=target_type)
            print("%s %f" % (target_description, p_dict[target_name]))
        return None

    # JSON output: one result (sub-graph + table row) per target.
    for target_name in target_list:
        target_description = RU.get_node_property(target_name, "name", node_label=target_type)
        confidence = p_dict[target_name]

        # populate the graph
        graph = RU.get_graph_from_nodes([target_name])
        res = response.add_subgraph(
            graph.nodes(data=True),
            graph.edges(data=True),
            "The target %s is enriched by %s." % (target_description, str(source_list)),
            confidence,
            return_result=True)
        res.essence = "%s" % target_description  # populate with essence of question result

        # Row order matches table_column_names set above.
        res.row_data = [
            "%s" % target_description,
            "%s" % target_name,
            "%f" % confidence,
        ]
    response.print()
    return None
def answer(drug_id, use_json=False, num_show=20, rev=True, conservative=True):
    """
    Answers the question 'what diseases does $drug commonly treat?'

    Conditions are obtained from Columbia Open Health Data (COHD) for the
    drug's name, ordered by their 'concept_count', and truncated to
    ``num_show`` entries. Output is printed (plain text, or via
    ``FormatOutput.FormatResponse`` when ``use_json`` is set).

    :param drug_id: identifier of the drug node in the KG (its name is looked up with name_type='id')
    :param use_json: bool, use JSON output
    :param num_show: int, number to display
    :param rev: bool. order by most frequent
    :param conservative: bool, True if using exact matches, False if using any synonyms returned by COHD
    :return: 1 if JSON output was requested and none of the conditions could be mapped to a KG node
        (an error response is printed in that case); None otherwise
    """
    # Initialize the response class
    response = FormatOutput.FormatResponse(6)

    # get the description
    drug_description = RU.get_node_property(drug_id, 'name', name_type='id')

    # Get the conditions that COHD says it's used to treat
    conditions_treated = COHDUtilities.get_conditions_treating(drug_description, conservative=conservative)

    # sort the conditions by frequency and reduce to top n
    ids_sorted = sorted(
        conditions_treated,
        key=lambda cohd_id: conditions_treated[cohd_id]['concept_count'],
        reverse=rev)
    ids_sorted_top_n = ids_sorted[:num_show]

    # Plain-text output
    if not use_json:
        if rev:
            to_print = "The most common conditions "
        else:
            to_print = "The least common conditions "
        to_print += "treated with %s, according to the Columbia Open Health Data, are:\n" % drug_description
        to_print += "".join(
            "condition: %s\t count %d \t frequency %f \n" % (
                conditions_treated[cohd_id]['associated_concept_name'],
                conditions_treated[cohd_id]['concept_count'],
                conditions_treated[cohd_id]['concept_frequency'])
            for cohd_id in ids_sorted_top_n)
        print(to_print)
        return None

    # Otherwise, you want a JSON output.
    # Attempt to map the COHD names to the KG (this takes some time).
    # TODO: find further speed improvements
    drug_as_graph = RU.get_node_as_graph(drug_id)
    drug_node_info = list(drug_as_graph.nodes(data=True))[0][1]

    # Lookups tried in order for every condition: (lower-case the name?, KG node label).
    kg_lookups = (
        (False, "phenotypic_feature"),
        (False, "disease"),
        (True, "phenotypic_feature"),
        (True, "disease"),
    )

    id_to_KG_name = dict()  # COHD id -> KG node id (None when unmapped)
    id_to_name = dict()
    id_to_count = dict()
    id_to_frequency = dict()
    id_to_id = dict()  # KG node id -> COHD id

    # Map ID's to all relevant values
    for cohd_id in ids_sorted_top_n:
        condition = conditions_treated[cohd_id]
        id_to_name[cohd_id] = condition['associated_concept_name']
        id_to_count[cohd_id] = condition['concept_count']
        id_to_frequency[cohd_id] = condition['concept_frequency']
        id_to_KG_name[cohd_id] = None
        for lowercase, label in kg_lookups:
            try:
                lookup_name = id_to_name[cohd_id].lower() if lowercase else id_to_name[cohd_id]
                id_to_KG_name[cohd_id] = RU.get_id_from_property(lookup_name, 'name', label=label)
                id_to_id[id_to_KG_name[cohd_id]] = cohd_id
            except Exception:
                # Best effort: a failed lookup just means we try the next variant.
                continue
            break

    # get the graph (one call) of all the nodes that were mapped
    KG_names = [
        id_to_KG_name[cohd_id]
        for cohd_id in ids_sorted_top_n
        if id_to_KG_name[cohd_id] is not None
    ]

    if not KG_names:
        error_message = "Sorry, Columbia Open Health Data has no data on the use of %s" % drug_description
        error_code = "EmptyResult"
        response.add_error_message(error_code, error_message)
        response.print()
        return 1

    all_conditions_graph = RU.get_graph_from_nodes(KG_names)

    # Get the info of the mapped nodes, keyed by COHD id
    id_to_info = dict()
    for _, data in all_conditions_graph.nodes(data=True):
        id_to_info[id_to_id[data['properties']['id']]] = data

    # for each condition, return the results (with the nice sub-graph if the cohd id's were mapped)
    for cohd_id in ids_sorted_top_n:
        if id_to_KG_name[cohd_id] is not None:
            to_print = "According to the Columbia Open Health Data, %s is used to treat patients with the condition %s with frequency " \
                       "%f out of all patients treated with %s (count=%d)." % (
                           drug_description, id_to_name[cohd_id], id_to_frequency[cohd_id],
                           drug_description, id_to_count[cohd_id])
            disease_node_info = id_to_info[cohd_id]
            nodes = [(2, disease_node_info), (1, drug_node_info)]
            # Synthetic "treats" edge from the drug (node 1) to the condition (node 2).
            edges = [(1, 2, {
                'id': 3,
                'properties': {
                    'is_defined_by': 'RTX',
                    'predicate': 'treats',
                    'provided_by': 'COHD',
                    'relation': 'treats',
                    'seed_node_uuid': '-1',
                    'source_node_uuid': drug_node_info['properties']['UUID'],
                    'target_node_uuid': disease_node_info['properties']['UUID']
                },
                'type': 'treats'
            })]
            response.add_subgraph(nodes, edges, to_print, id_to_frequency[cohd_id])
        else:
            to_print = "According to the Columbia Open Health Data, %s is used to treat patients with the condition %s with frequency " \
                       "%f out of all patients treated with %s (count=%d). This condition is not in our " \
                       "Knowledge graph, so no graph is shown." % (
                           drug_description, id_to_name[cohd_id], id_to_frequency[cohd_id],
                           drug_description, id_to_count[cohd_id])
            # Only the drug node itself can be shown for an unmapped condition.
            g = RU.get_node_as_graph(drug_id)
            response.add_subgraph(g.nodes(data=True), g.edges(data=True), to_print, id_to_frequency[cohd_id])
    response.print()
    return None
def answer(tissue_id, input_protein_list, use_json=False, num_show=20, rev=True):
    """
    Answers the question 'in $tissue, which proteins correlate most (or least) with $input_protein_list?'
    according to Lil'GIM.

    Output is printed (plain text, or via ``FormatOutput.FormatResponse``
    when ``use_json`` is set). In JSON mode only proteins whose name can be
    looked up in the KG are reported.

    :param tissue_id: tissue identifier; if no KG node has it as its "id" property, it is resolved to an id
        through RU.get_node_property(..., node_label="anatomical_entity")
    :param input_protein_list: sequence of protein identifiers, resolved the same way with
        node_label="protein". The sequence itself is not modified.
    :param use_json: bool, use JSON output
    :param num_show: int, number to display
    :param rev: bool. True orders by highest correlation first, False by lowest first
    :return: 1 if the Lil'GIM query failed (an error response is printed); None otherwise
    """

    def protein_name_or_id(protein_id):
        # Human-readable name when the KG lookup succeeds, otherwise the raw id.
        try:
            return RU.get_node_property(protein_id, "name", node_label="protein", name_type="id")
        except Exception:
            return protein_id

    # Initialize the response class
    response = FormatOutput.FormatResponse(6)

    # Make sure everything exists in the graph
    if not RU.node_exists_with_property(tissue_id, "id"):
        tissue_id = RU.get_node_property(tissue_id, "id", node_label="anatomical_entity")

    # Resolve into a new list so the caller's sequence is left untouched.
    protein_ids = [
        protein if RU.node_exists_with_property(protein, "id")
        else RU.get_node_property(protein, "id", node_label="protein")
        for protein in input_protein_list
    ]

    # Initialize the QueryLilGim class
    q = QueryLilGIM.QueryLilGIM()

    # get the description
    tissue_description = RU.get_node_property(tissue_id, 'name', node_label="anatomical_entity")

    # Get the correlated proteins
    try:
        correlated_proteins_dict = q.query_neighbor_genes_for_gene_set_in_a_given_anatomy(
            tissue_id, tuple(protein_ids))
    except Exception:
        error_message = "Lil'GIM is experiencing a problem."
        error_code = "LilGIMerror"
        response.add_error_message(error_code, error_message)
        response.print()
        return 1

    # (protein id, correlation) tuples, sorted by correlation and reduced to top n
    top_correlations = sorted(
        correlated_proteins_dict.items(), key=lambda item: item[1], reverse=rev)[0:num_show]

    # Plain-text output
    if not use_json:
        names = [protein_name_or_id(protein_id) for protein_id in protein_ids]
        if not names:
            # Empty input: nothing to list (previously an IndexError).
            protein_descriptions = ""
        elif len(names) == 1:
            protein_descriptions = names[0]
        else:
            # "A, and B" for two names, "A, B, and C" for three or more.
            protein_descriptions = ", ".join(names[:-1]) + ", and %s" % names[-1]

        if rev:
            to_print = "In the tissue: %s, the proteins that correlate most with %s" % (
                tissue_description, protein_descriptions)
        else:
            to_print = "In the tissue: %s, the proteins that correlate least with %s" % (
                tissue_description, protein_descriptions)
        to_print += " according to Lil'GIM, are:\n"
        to_print += "".join(
            "protein: %s\t correlation %f\n" % (protein_name_or_id(protein_id), val)
            for protein_id, val in top_correlations)
        print(to_print)
        return None

    # Otherwise, you want a JSON output.
    # Just keep the ones that are actually in the KG (name lookup succeeds).
    # TODO: do something with the ones that are not in the KG
    correlations_in_KG = []
    for protein_id, corr in top_correlations:
        try:
            description = RU.get_node_property(
                protein_id, "name", node_label="protein", name_type="id")
        except Exception:
            continue
        # Keep the description so it does not have to be queried a second time below.
        correlations_in_KG.append((protein_id, corr, description))

    # Return the results
    full_g = RU.get_graph_from_nodes(
        [protein_id for protein_id, _, _ in correlations_in_KG],
        node_property_label="id")
    id2node = dict()
    for _, node in full_g.nodes(data=True):
        id2node[node['properties']['id']] = node
    for protein_id, corr, description in correlations_in_KG:
        to_print = "In the tissue: %s, the protein %s has correlation %f with the given list of proteins." % (
            tissue_description, description, corr)
        response.add_subgraph([(protein_id, id2node[protein_id])], [], to_print, corr)
    response.print()
    return None