def get_connection_normalizing_count(gene_list, node_type_list):
    """Count the direct connections from each gene to a set of node types.

    Each symbol in ``gene_list`` is looked up with ``ht.query`` and matched
    case-insensitively against the ``SYMBOL`` field of the returned ``Gene``
    entries. For a matched gene, one direct (no intermediate node)
    ``FindConnection`` query is run per entry of ``node_type_list`` and the
    row counts of the resulting tables are summed.

    Args:
        gene_list: iterable of gene symbols (strings).
        node_type_list: iterable of output node types passed to
            ``FindConnection`` as ``output_obj``.

    Returns:
        dict mapping every gene symbol to its total row count, or to the
        string ``'Unknown'`` when no ``Gene`` entry matched the symbol.
    """
    # dictionary that keeps track of all connections from a gene to any node type
    connection_dict = {}
    for gene_symbol in gene_list:
        gene = None
        for candidate in ht.query(gene_symbol)['Gene']:
            if candidate['SYMBOL'].lower() == gene_symbol.lower():
                # No early exit: when several entries match, the last one is
                # used, exactly as before.
                gene = candidate

        if gene is None:
            print(gene_symbol + ' could not be found')
            connection_dict[gene_symbol] = 'Unknown'
            continue

        count = 0
        for x in node_type_list:
            try:
                ## only look at direct connections
                fc = FindConnection(input_obj=gene,
                                    output_obj=x,
                                    intermediate_nodes=None)
                fc.connect(verbose=False)
                count += fc.display_table_view().shape[0]
            except Exception:
                # Best effort: a failing node type is reported and skipped.
                # `Exception` (not a bare except) lets Ctrl-C / SystemExit
                # propagate.
                print("gene " + str(gene_symbol) + " for node intermediate " +
                      str(x) + " failed")
        connection_dict[gene_symbol] = count
    return connection_dict
def get_disease_to_gene_results(disease_input):
    """Summarise the genes directly connected to a disease.

    Runs one direct (no intermediate node) ``FindConnection`` query from
    ``disease_input`` to ``'Gene'`` and drops rows whose ``output_id``
    contains ``'UMLS'``.

    Args:
        disease_input: object passed to ``FindConnection`` as ``input_obj``.

    Returns:
        dict with three entries:
          * ``"sorted_disease_to_genes"``: gene name -> number of rows it
            appears in, ordered by ascending count.
          * ``"one_step_genes_pub_counts"``: gene name -> number of
            comma-separated entries found in ``pred1_pubmed`` summed over
            the gene's rows (rows where it is ``None`` contribute 0).
          * ``"disease_to_genes_list"``: gene names in the reverse of the
            order above (most frequent first).
    """
    # Function-scope import so the block does not rely on the file header.
    from collections import Counter

    disease_to_gene_results = {}

    # directly related
    fc = FindConnection(input_obj=disease_input,
                        output_obj='Gene',
                        intermediate_nodes=None)
    fc.connect(verbose=False)
    disease_to_genes = fc.display_table_view()
    disease_to_genes = disease_to_genes[
        ~disease_to_genes['output_id'].str.contains('UMLS')]

    # keep track of number of occurrences from direct disease -> gene connection
    print("running disease -> gene")
    # Counter is a single pass and keeps first-seen key order, so the stable
    # sort below orders ties exactly like the former list.count() version.
    occurrences = Counter(list(disease_to_genes["output_name"]))
    sorted_disease_to_genes = dict(
        sorted(occurrences.items(), key=lambda item: item[1]))
    disease_to_gene_results[
        "sorted_disease_to_genes"] = sorted_disease_to_genes

    one_step_genes_pub_counts = {}
    for _, row in disease_to_genes.iterrows():
        pubmed = row["pred1_pubmed"]
        # N comma-separated ids contain N - 1 commas.
        current_pubcount = pubmed.count(",") + 1 if pubmed is not None else 0
        gene_name = row["output_name"]
        one_step_genes_pub_counts[gene_name] = (
            one_step_genes_pub_counts.get(gene_name, 0) + current_pubcount)
    disease_to_gene_results[
        "one_step_genes_pub_counts"] = one_step_genes_pub_counts

    disease_to_gene_results["disease_to_genes_list"] = list(
        reversed(list(sorted_disease_to_genes)))
    return disease_to_gene_results
def predict_many(input_object, intermediate_node_list, output_type):
    """Run one single-intermediate FindConnection query per intermediate type.

    Args:
        input_object: object passed to ``FindConnection`` as ``input_obj``.
        intermediate_node_list: iterable of intermediate node types; each is
            queried on its own as ``intermediate_nodes=[inter]``.
        output_type: value passed to ``FindConnection`` as ``output_obj``.

    Returns:
        The ``pd.concat`` of every non-empty result table, or ``None`` when
        no query produced any rows (including when every query failed).
    """
    df_list = []
    for inter in intermediate_node_list:
        try:
            print("Intermediate Node type running:")
            print(inter)
            fc = FindConnection(input_obj=input_object,
                                output_obj=output_type,
                                intermediate_nodes=[inter])
            fc.connect(verbose=False)
            df = fc.display_table_view()
            if df.shape[0] > 0:
                df_list.append(df)
        except Exception:
            # Best effort: one failing intermediate type must not abort the
            # others. `Exception` (not a bare except) lets Ctrl-C propagate.
            print("FAILED")

    if not df_list:
        return None
    return pd.concat(df_list)
def get(self):
    """Handle a GET "connect" request and reply with the connections found.

    Reads three query arguments:
      * ``input_obj`` and ``output_obj``: JSON-encoded values, decoded with
        ``tornado.escape.json_decode``.
      * ``intermediate_nodes``: a Python literal (e.g. a list or ``None``),
        parsed with ``ast.literal_eval``, which only evaluates literals and
        does not execute code.

    Responds with status 200 and ``{"data": [...], "log": ...}`` when at
    least one connection row exists, otherwise with status 404 and an
    ``{"error": ...}`` body.
    """
    input_obj = self.get_query_argument('input_obj')
    output_obj = self.get_query_argument('output_obj')
    print("executing connect query: ", self.request.uri)
    intermediate_nodes = self.get_query_argument('intermediate_nodes')

    if isinstance(input_obj, str):
        input_obj = tornado.escape.json_decode(input_obj)
    if isinstance(output_obj, str):
        output_obj = tornado.escape.json_decode(output_obj)
    if isinstance(intermediate_nodes, str):
        intermediate_nodes = ast.literal_eval(intermediate_nodes)

    fc = FindConnection(input_obj=input_obj,
                        output_obj=output_obj,
                        intermediate_nodes=intermediate_nodes,
                        registry=reg)
    fc.connect()
    df = fc.display_table_view()

    if df.empty:
        res = []
    else:
        # Use the returned frame instead of `inplace=True` on a column slice,
        # which can trigger pandas' SettingWithCopyWarning.
        df = df[[
            'input', 'pred1', 'pred1_api', 'node1_name', 'node1_type',
            'pred2', 'pred2_api', 'output_name'
        ]].drop_duplicates()
        res = df.to_dict('records')

    if res:
        self.set_status(200)
        self.write(
            tornado.escape.json_encode({
                'data': res,
                'log': fc.fc.log
            }))
    else:
        self.set_status(404)
        self.write(json.dumps({'error': "Unable to find any connection"}))
    self.finish()
# Select the correct representation of depression
depression = depression_hint["Disease"][0]
print(depression)
print()

# help(FindConnection.__init__)
fc = FindConnection(input_obj=depression,
                    output_obj=tbi,
                    intermediate_nodes="BiologicalEntity")

# BTE finding connection
fc.connect(verbose=True)
print()

print("Displaying and filter results")
# Displaying and filter results
df = fc.display_table_view()

# because UMLS is not currently well-integrated in our ID-to-object translation system, removing UMLS-only entries here
# Raw string: "\d" in a normal literal is an invalid escape sequence that
# newer Python versions warn about. The pattern value itself is unchanged.
patternDel = r"^UMLS:C\d+"
# NOTE(review): `filter` shadows the builtin; the name is kept because later
# parts of the script may still refer to it.
filter = df.node1_id.str.contains(patternDel)
df = df[~filter]
fc.to_graphml("TBI.graphml")
fc.to_reasoner_std()
print(df.shape)
# Cap the sample size: sampling 10 rows without replacement raises ValueError
# when fewer than 10 rows survive the filter.
df.sample(min(10, len(df)))

# Which diseases are mentioned the most
mentioned = df.node1_name.value_counts().head(10)
print(mentioned)
def _collect_direct_gene_connections(gene_list, output_type):
    """Return the concatenated direct gene -> ``output_type`` tables, or None.

    Each gene symbol is resolved with ``ht.query(symbol)["Gene"][0]``; a
    symbol whose lookup or query fails is reported and skipped.
    """
    frames = []
    for symbol in gene_list:
        try:
            gene = ht.query(symbol)["Gene"][0]
            fc = FindConnection(input_obj=gene,
                                output_obj=output_type,
                                intermediate_nodes=None)
            fc.connect(verbose=False)
            table = fc.display_table_view()
            if table.shape[0] > 0:
                frames.append(table)
        except Exception:
            # Best effort per gene; `Exception` lets Ctrl-C propagate.
            print(str(symbol) + " FAILED")
    return pd.concat(frames) if frames else None


def _lookup_term_names(prefixed_ids, url_prefix):
    """Map the ``_id`` of each fetched record to its lower-cased ``name``.

    ``prefixed_ids`` are strings such as ``"HP:0001250"``; the part after the
    first colon is appended to ``url_prefix`` and fetched with
    ``requests.get``. Records lacking ``_id`` or ``name`` are skipped.
    """
    names = {}
    # dict.fromkeys de-duplicates while keeping first-seen order.
    for prefixed_id in dict.fromkeys(prefixed_ids):
        local_id = prefixed_id.split(':')[1]
        res = requests.get(url_prefix + local_id).json()
        if '_id' in res and 'name' in res:
            names[res['_id']] = res['name'].lower()
    return names


def determined_genes_to_symptoms(gene_list, symptom_list):
    """Collect gene connections whose outputs resemble the given symptoms.

    For every gene in ``gene_list``, direct connections to phenotypic
    features, biological processes and diseases are queried. Each category is
    narrowed to the rows selected by ``get_similar_phen_indices``,
    ``get_similar_bp_indices`` and ``get_similar_disease_indices`` (called
    with ``symptom_list`` and a 0.95 threshold argument). Phenotype rows
    whose ``output_name`` is a key of the fetched HP-id -> name mapping get
    that name instead.

    Args:
        gene_list: iterable of gene symbols.
        symptom_list: symptoms forwarded to the similarity helpers.

    Returns:
        DataFrame concatenating the bioprocess, phenotype and disease rows
        (in that order) with ``output_name`` lower-cased. A category without
        any connection is left out; if no category has connections, an empty
        DataFrame with an ``output_name`` column is returned.
    """
    # gene -> phenotypic feature nodes
    print("Genes -> PhenotypicFeatures")
    phen_top = None
    top_gene_to_phenotypicFeature = _collect_direct_gene_connections(
        gene_list, 'PhenotypicFeature')
    if top_gene_to_phenotypicFeature is not None:
        output_names = top_gene_to_phenotypicFeature["output_name"]
        ## Get names for HP ids
        HP_dict = _lookup_term_names(
            output_names[output_names.str.contains("HP:", regex=False)],
            'https://biothings.ncats.io/hpo/phenotype/HP%3A')
        phen_indices = get_similar_phen_indices(list(output_names),
                                                symptom_list, 0.95, HP_dict)
        phen_top = top_gene_to_phenotypicFeature.iloc[phen_indices, :].copy()
        # Column-level assignment: writing through `.iloc[i]["output_name"]`
        # targets a temporary row object and is not guaranteed to stick.
        phen_top["output_name"] = phen_top["output_name"].map(
            lambda name: HP_dict.get(name, name))

    # gene -> bioprocess
    print("Genes -> Bioprocesses")
    bioprocess_top = None
    top_gene_to_bioprocesses = _collect_direct_gene_connections(
        gene_list, 'BiologicalProcess')
    if top_gene_to_bioprocesses is not None:
        output_names = top_gene_to_bioprocesses["output_name"]
        go_dict = _lookup_term_names(
            output_names[output_names.str.contains("go:", regex=False)],
            'https://biothings.ncats.io/go_bp/geneset/GO%3A')
        bp_indices = get_similar_bp_indices(list(output_names), symptom_list,
                                            0.95, go_dict)
        bioprocess_top = top_gene_to_bioprocesses.iloc[bp_indices, :]

    # Genes -> disease type "symptoms"
    print("Genes -> Diseases")
    relevant_top_gene_to_diseases = None
    top_gene_to_diseases = _collect_direct_gene_connections(
        gene_list, 'Disease')
    if top_gene_to_diseases is not None:
        disease_indices = get_similar_disease_indices(
            list(top_gene_to_diseases["output_name"]), symptom_list, 0.95)
        relevant_top_gene_to_diseases = top_gene_to_diseases.iloc[
            disease_indices, :]

    ## make dataframe with all genes -> symptoms
    available = [
        frame for frame in (bioprocess_top, phen_top,
                            relevant_top_gene_to_diseases)
        if frame is not None
    ]
    if not available:
        # Previously this path failed on an unbound variable.
        return pd.DataFrame(columns=["output_name"])
    all_gene_connections = pd.concat(available)
    all_gene_connections["output_name"] = all_gene_connections[
        "output_name"].str.lower()
    return all_gene_connections
def _direct_connection_table(node, output_type):
    """Run a direct (no intermediate) FindConnection query; return its table."""
    fc = FindConnection(input_obj=node,
                        output_obj=output_type,
                        intermediate_nodes=None)
    fc.connect(verbose=False)
    return fc.display_table_view()


def get_symtpom_prevalence(hp_symptom_dict, disease_name):
    """Annotate each symptom with its number of outgoing edges.

    For every key of ``hp_symptom_dict`` the function sums the rows of direct
    connections to ``'Gene'`` and ``'Disease'``, ignoring rows whose
    ``output_name`` equals ``disease_name``, starting from:
      1. the first ``PhenotypicFeature`` entry returned by ``ht.query(key)``;
      2. every ``Disease`` / ``BiologicalProcess`` entry whose ``name``
         matches (case-insensitively) one of ``hp_symptom_dict[key]["names"]``.
         For ``Disease`` the lookup uses the phenotype's ``UMLS`` value when
         step 1 found one, otherwise the name itself.

    Failed lookups and queries are reported with ``"Nope"`` (queries) or
    treated as empty (lookups) and do not stop processing.

    Args:
        hp_symptom_dict: dict keyed by symptom id; each value is a dict with
            a ``"names"`` iterable. It is modified in place.
        disease_name: output name to exclude from the edge counts.

    Returns:
        The same ``hp_symptom_dict`` with ``"edges_out_count"`` set on every
        entry.
    """
    for key in hp_symptom_dict:
        print(key)
        edges_out_count = 0
        UMLS = ''

        # --- phenotype node -------------------------------------------------
        a = ht.query(key)['PhenotypicFeature']
        if len(a) > 0:
            # Query only when the lookup produced a node, so a node left over
            # from a previous symptom can never be reused here.
            b = a[0]
            if 'UMLS' in b:
                UMLS = b['UMLS']
            try:
                df = _direct_connection_table(b, 'Gene')
                print('gene')
                print(df.shape)
                if df.shape[0] > 0:
                    print("OKKKk")
                    df = df[df["output_name"] != disease_name]
                    edges_out_count = edges_out_count + df.shape[0]
                    print(edges_out_count)
            except Exception:
                print("Nope")
            try:
                df = _direct_connection_table(b, 'Disease')
                print(df.shape)
                if df.shape[0] > 0:
                    print("ok edge phen to dis")
                    df = df[df["output_name"] != disease_name]
                    edges_out_count = edges_out_count + df.shape[0]
                    print(edges_out_count)
            except Exception:
                print("Nope")

        # --- disease / biological-process nodes matched by name -------------
        for y in ('Disease', 'BiologicalProcess'):
            for z in hp_symptom_dict[key]["names"]:
                use_umls = y == 'Disease' and len(UMLS) > 0
                try:
                    a = ht.query(UMLS if use_umls else z)[y]
                except Exception:
                    a = []
                for b in a:
                    if b['name'].lower() != z.lower():
                        continue
                    for target in ('Gene', 'Disease'):
                        try:
                            df = _direct_connection_table(b, target)
                            if df.shape[0] > 0:
                                df = df[df["output_name"] != disease_name]
                                edges_out_count = (edges_out_count +
                                                   df.shape[0])
                        except Exception:
                            print("Nope")

        print("edges out")
        print(edges_out_count)
        hp_symptom_dict[key]["edges_out_count"] = edges_out_count
    return hp_symptom_dict