def cluster_genes_heatprop(seed_genes, cluster_x_y_z):
    '''
    Return the part of a cluster network that is hottest after a network
    propagation simulation started from the seed genes.

    inputs:
        - seed_genes: genes from which to initiate heat propagation simulation
        - cluster_x_y_z: mapping whose 'cluster' entry is the cluster graph
          (only its .subgraph() method is used directly here; the graph is
          otherwise handed to network_prop)

    returns:
        - the subgraph of cluster_x_y_z['cluster'] induced by the (at most)
          500 nodes with the largest propagated heat
    '''
    G_cluster = cluster_x_y_z['cluster']

    # calculate the degree-normalized adjacency matrix
    Wprime = network_prop.normalized_adj_matrix(G_cluster, weighted=True)

    # run the network_propagation simulation starting from the seed genes
    Fnew = network_prop.network_propagation(G_cluster, Wprime, seed_genes)

    # sort heat vector Fnew, hottest first.
    # Series.sort() no longer exists in pandas; sort_values() is its
    # replacement (assumes Fnew is a pandas Series, as .head() below implies).
    Fnew = Fnew.sort_values(ascending=False)

    # keep only the hottest nodes
    hottest_nodes = list(Fnew.head(500).index)
    return G_cluster.subgraph(hottest_nodes)
def drug_gene_heatprop(seed_genes,path_to_DB_file,path_to_cluster_file,plot_flag=False):
    '''
    Function to establish drugs potentially related to an input gene list,
    using network propagation methods

    inputs:
        - seed_genes: genes from which to initiate heat propagation simulation
        - path_to_DB_file: path to drug bank file, including filename
        - path_to_cluster_file: path to cluster file, including filename
        - plot_flag: should we plot the subnetwork with heat overlaid? Default False

    returns:
        - gene_drug_df: the result of find_drugs_from_hot_genes for the sorted
          heat vector (presumably a DataFrame, going by the name)
    '''
    # load and parse the drug-bank file into a dict
    DBdict = load_DB_data(path_to_DB_file)

    # make a network out of drug-gene interactions:
    # one (drugbank_id, gene name) edge per entry in each drug's node_list
    DB_el = []
    for drug in DBdict.values():
        DB_el.extend((drug['drugbank_id'], node['name']) for node in drug['node_list'])

    G_DB = nx.Graph()
    G_DB.add_edges_from(DB_el)

    G_cluster = load_cluster_data(path_to_cluster_file)

    # calculate the degree-normalized adjacency matrix
    Wprime = network_prop.normalized_adj_matrix(G_cluster, weighted=True)

    # run the network_propagation simulation starting from the seed genes
    Fnew = network_prop.network_propagation(G_cluster, Wprime, seed_genes)

    # sort heat vector Fnew, hottest first.
    # Series.sort() no longer exists in pandas; sort_values() replaces it
    # (assumes Fnew is a pandas Series -- it is indexed by node label below).
    Fnew = Fnew.sort_values(ascending=False)

    # if plot_flag is on plot the cluster genes with heat overlaid
    if plot_flag:
        pos = nx.spring_layout(G_cluster)
        # list() so the heat lookup also works when nodes() returns a view
        cluster_nodes = list(G_cluster.nodes())

        plt.figure(figsize=(10, 10))
        nx.draw_networkx_edges(G_cluster, pos=pos, alpha=.03)
        # colour scale is capped at a tenth of the maximum heat
        nx.draw_networkx_nodes(G_cluster, pos=pos, node_size=20, alpha=.8,
                               node_color=Fnew[cluster_nodes], cmap='jet',
                               vmin=0, vmax=np.max(Fnew) / 10)
        # seed genes are drawn on top, larger and in red
        nx.draw_networkx_nodes(G_cluster, pos=pos, nodelist=seed_genes,
                               node_size=50, alpha=.7, node_color='red',
                               linewidths=2)
        # a boolean is required: the string 'off' is truthy and would
        # switch the grid on in current matplotlib
        plt.grid(False)
        plt.title('Sample subnetwork: post-heat propagation', fontsize=16)

    # find the drugs related to hot genes
    gene_drug_df = find_drugs_from_hot_genes(Fnew, G_DB, seed_genes, keep_seed_genes=True)

    return gene_drug_df
def drug_gene_heatprop(seed_genes, cluster_x_y_z, plot_flag=False):
    '''
    Function to establish drugs potentially related to an input gene list,
    using network propagation methods

    inputs:
        - seed_genes: genes from which to initiate heat propagation simulation
        - cluster_x_y_z: the already-loaded cluster graph (used here with the
          networkx layout/drawing functions and passed to network_prop)
        - plot_flag: should we plot the subnetwork with heat overlaid? Default False

    returns:
        - gene_drug_df: the result of find_drugs_from_hot_genes for the sorted
          heat vector (presumably a DataFrame, going by the name)
    '''
    # Build the drug-gene interaction network.
    # NOTE(review): DB_el is not defined in this function; it is expected to
    # be a module-level list of (drugbank_id, gene name) edges prepared
    # elsewhere in this file -- confirm it is loaded before calling.
    G_DB = nx.Graph()
    G_DB.add_edges_from(DB_el)

    G_cluster = cluster_x_y_z

    # calculate the degree-normalized adjacency matrix
    Wprime = network_prop.normalized_adj_matrix(G_cluster, weighted=True)

    # run the network_propagation simulation starting from the seed genes
    Fnew = network_prop.network_propagation(G_cluster, Wprime, seed_genes)

    # sort heat vector Fnew, hottest first
    Fnew.sort_values(inplace=True, ascending=False)

    # if plot_flag is on plot the cluster genes with heat overlaid
    if plot_flag:
        pos = nx.spring_layout(G_cluster)
        # list() so the heat lookup also works when nodes() returns a view
        cluster_nodes = list(G_cluster.nodes())

        plt.figure(figsize=(10, 10))
        nx.draw_networkx_edges(G_cluster, pos=pos, alpha=.03)
        # colour scale is capped at a tenth of the maximum heat
        nx.draw_networkx_nodes(G_cluster, pos=pos, node_size=20, alpha=.8,
                               node_color=Fnew[cluster_nodes], cmap='jet',
                               vmin=0, vmax=np.max(Fnew) / 10)
        # seed genes are drawn on top, larger and in red
        nx.draw_networkx_nodes(G_cluster, pos=pos, nodelist=seed_genes,
                               node_size=50, alpha=.7, node_color='red',
                               linewidths=2)
        # a boolean is required: the string 'off' is truthy and would
        # switch the grid on in current matplotlib
        plt.grid(False)
        plt.title('Sample subnetwork: post-heat propagation', fontsize=16)

    # find the drugs related to hot genes
    gene_drug_df = find_drugs_from_hot_genes(Fnew, G_DB, seed_genes, keep_seed_genes=True)

    return gene_drug_df
def main(num_reps=10,
         seed_gene_file='HC_genes/ASD_HC_no_shared_200114.tsv',
         int_file='../interactomes/G_PCnet.gpickle',
         out_name='ASD',
         rand_method='degree_binning',
         single_or_double='single',
         save_fnew_rand=False):
    '''
    Calculate z-scores for heat propagation

    Inputs:
        num_reps: number of randomizations (converted with int(), so a
            string is accepted)
        seed_gene_file: tab-separated file with an 'Unnamed: 0' index column
            and a 'seed_genes' column whose first cell holds the gene list
            written out as text, e.g. "['A', 'B']"
        int_file: location of interactome to use (gpickle format)
        out_name: identifier used in the output file names (files are
            written to the current directory)
        rand_method: type of randomization, passed on to the z-score routine
        single_or_double: 'single' for one seed set; 'double' is not
            currently functional
        save_fnew_rand: True or the string 'True' to also save the
            randomized heat vectors (this can be a large file)

    python netprop_zscore.py 10 HC_genes/ASD_HC_no_shared_200114.tsv ../interactomes/G_PCnet.gpickle ASD degree_binning single False
    '''
    print('number of randomizations = ' + str(num_reps))
    print('background interactome = ' + int_file)
    print('randomization method = ' + rand_method)
    print('single or double = ' + single_or_double)
    # str() is needed: the default is the boolean False, which cannot be
    # concatenated to a string directly
    print('save Fnew rand = ' + str(save_fnew_rand))

    num_reps = int(num_reps)
    # accept both a real boolean and the string form used in the example call
    save_fnew_rand = str(save_fnew_rand) == 'True'

    # load interactome and select focal interactome
    # NOTE: a gpickle is a pickle -- only load interactome files from a
    # trusted source. read_gpickle is not available in networkx >= 3.0.
    Gint = nx.read_gpickle(int_file)
    if 'None' in Gint:
        Gint.remove_node('None')

    # load HC genes: the first 'seed_genes' cell is a list rendered as text,
    # so strip the brackets, split on ', ' and remove the quotes of each item
    HC_genes_temp = pd.read_csv(seed_gene_file, sep='\t', index_col='Unnamed: 0')
    raw_seed_cell = HC_genes_temp['seed_genes'].tolist()[0]
    seed_HC = [str(g[1:-1]).strip("'") for g in raw_seed_cell[1:-1].split(', ')]

    print(seed_gene_file + ':')
    print(len(seed_HC))
    # keep only the seed genes present in the interactome; list() so numpy
    # receives a plain sequence even when nodes() returns a view
    seed_HC = list(np.intersect1d(list(Gint.nodes()), seed_HC))
    print(len(seed_HC))

    # calculate the z-score
    # calc Wprime from Gint
    Wprime = network_prop.normalized_adj_matrix(Gint, conserve_heat=True)

    z_file = 'z_' + out_name + '_' + str(num_reps) + '_reps_' + rand_method + '.tsv'

    if single_or_double == 'single':
        # calculate z-scores from a single set of seed genes
        print('calculating z-scores: ' + seed_gene_file)
        z_seed, Fnew_rand_seed = calc_zscore_heat(Gint, Wprime, seed_HC,
                                                  num_reps=num_reps,
                                                  rand_method=rand_method)
        z_seed.to_csv(z_file, sep='\t')
        if save_fnew_rand:
            # save out the vector of randoms (this can be a large file)
            pd.DataFrame(Fnew_rand_seed).to_csv(
                'Fnew_' + out_name + '_rand' + str(num_reps) + '_reps_' + rand_method + '.tsv',
                sep='\t')

    elif single_or_double == 'double':
        # calculate z-scores from two sets of seed genes:
        # --- not currently functional ----
        # NOTE(review): ASD_HC and CHD_HC are not defined in this function;
        # unless they exist at module level this branch raises NameError.
        print('calculating ASD-CHD z-scores')
        z_ASD_CHD, Fnew_rand_ASD_CHD = calc_zscore_heat_double(
            Gint, Wprime, ASD_HC, CHD_HC,
            num_reps=num_reps, rand_method=rand_method)
        z_ASD_CHD.to_csv(z_file, sep='\t')
def main(num_reps=10,
         seed_gene_file='HC_genes/example_seed.tsv',
         int_file='../interactomes/G_PCnet.gpickle',
         out_name='ASD',
         rand_method='degree_binning',
         single_or_double='single'):
    '''
    Calculate z-scores for heat propagation

    Inputs:
        num_reps: number of randomizations (converted with int(), so a
            string is accepted)
        seed_gene_file: location of file containing seed genes: tab-separated,
            with an 'Unnamed: 0' index column and a 'seed_genes' column whose
            first cell holds the gene list written out as text,
            e.g. "['A', 'B']" (clunky format due to historical reasons...
            need to improve)
        int_file: location of interactome to use (gpickle format)
        out_name: identifier for output files (currently saves in current
            directory... need to update to allow setting of save location)
        rand_method: type of randomization (default = 'degree_binning',
            alternate method 'degree_ks_test' deprecated)
        single_or_double: single network prop or double network prop.
            (default = 'single'. 'double' is deprecated)

    python netprop_zscore.py 10 HC_genes/example_seed.tsv ../interactomes/G_PCnet.gpickle ASD degree_binning single
    '''
    print('number of randomizations = ' + str(num_reps))
    print('background interactome = ' + int_file)
    print('randomization method = ' + rand_method)
    print('single or double = ' + single_or_double)

    num_reps = int(num_reps)

    # load interactome and select focal interactome
    # NOTE: a gpickle is a pickle -- only load interactome files from a
    # trusted source. read_gpickle is not available in networkx >= 3.0.
    Gint = nx.read_gpickle(int_file)
    if 'None' in Gint:
        Gint.remove_node('None')

    # load HC genes: the first 'seed_genes' cell is a list rendered as text,
    # so strip the brackets, split on ', ' and remove the quotes of each item
    HC_genes_temp = pd.read_csv(seed_gene_file, sep='\t', index_col='Unnamed: 0')
    raw_seed_cell = HC_genes_temp['seed_genes'].tolist()[0]
    seed_HC = [str(g[1:-1]).strip("'") for g in raw_seed_cell[1:-1].split(', ')]

    print(seed_gene_file + ':')
    print(len(seed_HC))
    # keep only the seed genes present in the interactome; list() so numpy
    # receives a plain sequence even when nodes() returns a view
    seed_HC = list(np.intersect1d(list(Gint.nodes()), seed_HC))
    print(len(seed_HC))

    # calculate the z-score
    # calc Wprime from Gint
    Wprime = network_prop.normalized_adj_matrix(Gint, conserve_heat=True)

    z_file = 'z_' + out_name + '_' + str(num_reps) + '_reps_' + rand_method + '.tsv'

    if single_or_double == 'single':
        # calculate z-scores from a single set of seed genes
        print('calculating z-scores: ' + seed_gene_file)
        # the randomized heat vectors (second return value) are not saved
        z_seed, Fnew_rand_seed = calc_zscore_heat(Gint, Wprime, seed_HC,
                                                  num_reps=num_reps,
                                                  rand_method=rand_method)
        z_seed.to_csv(z_file, sep='\t')

    elif single_or_double == 'double':
        # calculate z-scores from two sets of seed genes:
        # --- keeping for completeness, but currently not functional ----
        # NOTE(review): ASD_HC and CHD_HC are not defined in this function;
        # unless they exist at module level this branch raises NameError.
        print('calculating ASD-CHD z-scores')
        z_ASD_CHD, Fnew_rand_ASD_CHD = calc_zscore_heat_double(
            Gint, Wprime, ASD_HC, CHD_HC,
            num_reps=num_reps, rand_method=rand_method)
        z_ASD_CHD.to_csv(z_file, sep='\t')