def cluster_genes_heatprop(seed_genes, cluster_x_y_z, num_top_genes=500):
    '''
    Return the subnetwork made of the hottest genes after running network
    propagation from a set of seed genes.

    inputs:
        - seed_genes: genes from which to initiate heat propagation simulation
        - cluster_x_y_z: mapping whose 'cluster' entry is the graph to propagate
          heat over (it is handed to network_prop and must provide .subgraph();
          presumably a networkx graph -- confirm against the caller)
        - num_top_genes: how many of the highest-heat nodes to keep. Default 500

    returns:
        - the subgraph of cluster_x_y_z['cluster'] induced by the
          num_top_genes nodes with the highest propagated heat
    '''
    G_cluster = cluster_x_y_z['cluster']

    # calculate the degree-normalized adjacency matrix
    Wprime = network_prop.normalized_adj_matrix(G_cluster, weighted=True)

    # run the network_propagation simulation starting from the seed genes
    Fnew = network_prop.network_propagation(G_cluster, Wprime, seed_genes)

    # sort heat vector Fnew, hottest first.  Series.sort() no longer exists in
    # pandas; sort_values() is the supported spelling.
    Fnew = Fnew.sort_values(ascending=False)

    hottest_genes = list(Fnew.head(num_top_genes).index)
    return G_cluster.subgraph(hottest_genes)
def drug_gene_heatprop(seed_genes, path_to_DB_file, path_to_cluster_file, plot_flag=False):
    '''
    Function to establish drugs potentially related to an input gene list,
    using network propagation methods

    NOTE(review): this file defines a second function with the same name
    further down; at import time that later definition replaces this one.

    inputs:
        - seed_genes: genes from which to initiate heat propagation simulation
        - path_to_DB_file: path to drug bank file, including filename
        - path_to_cluster_file: path to cluster file, including filename
        - plot_flag: should we plot the subnetwork with heat overlaid? Default False

    returns:
        - the result of find_drugs_from_hot_genes for the propagated heat
          vector (presumably a DataFrame of gene/drug associations)
    '''
    # load and parse the drug-bank file into a dict
    DBdict = load_DB_data(path_to_DB_file)

    # make a network out of drug-gene interactions: one edge per
    # (drugbank id, target gene name) pair
    DB_el = [(drug['drugbank_id'], node['name'])
             for drug in DBdict.values()
             for node in drug['node_list']]
    G_DB = nx.Graph()
    G_DB.add_edges_from(DB_el)

    G_cluster = load_cluster_data(path_to_cluster_file)

    # calculate the degree-normalized adjacency matrix
    Wprime = network_prop.normalized_adj_matrix(G_cluster, weighted=True)

    # run the network_propagation simulation starting from the seed genes
    Fnew = network_prop.network_propagation(G_cluster, Wprime, seed_genes)

    # sort heat vector Fnew, hottest first.  Series.sort() no longer exists in
    # pandas; sort_values() is the supported spelling.
    Fnew = Fnew.sort_values(ascending=False)

    # if plot_flag is on plot the cluster genes with heat overlaid
    if plot_flag:
        pos = nx.spring_layout(G_cluster)
        plt.figure(figsize=(10, 10))
        nx.draw_networkx_edges(G_cluster, pos=pos, alpha=.03)
        # index with a plain list so the colours follow node order on every
        # networkx version; the colour scale saturates at a tenth of max heat
        nx.draw_networkx_nodes(G_cluster, pos=pos, node_size=20, alpha=.8,
                               node_color=Fnew[list(G_cluster.nodes())], cmap='jet',
                               vmin=0, vmax=np.max(Fnew) / 10)
        # seed genes are drawn on top, larger and in red
        nx.draw_networkx_nodes(G_cluster, pos=pos, nodelist=seed_genes, node_size=50,
                               alpha=.7, node_color='red', linewidths=2)
        # the string 'off' is truthy and would switch the grid ON
        plt.grid(False)
        plt.title('Sample subnetwork: post-heat propagation', fontsize=16)

    # find the drugs related to hot genes
    gene_drug_df = find_drugs_from_hot_genes(Fnew, G_DB, seed_genes, keep_seed_genes=True)

    return gene_drug_df
def drug_gene_heatprop(seed_genes, cluster_x_y_z, plot_flag=False):
    '''
    Function to establish drugs potentially related to an input gene list,
    using network propagation methods

    The drug-gene edge list is read from the module-level name DB_el, which
    is not defined in this function and must be populated before calling.

    inputs:
        - seed_genes: genes from which to initiate heat propagation simulation
        - cluster_x_y_z: already-loaded cluster graph to propagate heat over
          (passed directly to network_prop and networkx drawing functions)
        - plot_flag: should we plot the subnetwork with heat overlaid? Default False

    returns:
        - the result of find_drugs_from_hot_genes for the propagated heat
          vector (presumably a DataFrame of gene/drug associations)
    '''
    # make a network out of drug-gene interactions
    G_DB = nx.Graph()
    G_DB.add_edges_from(DB_el)

    G_cluster = cluster_x_y_z

    # calculate the degree-normalized adjacency matrix
    Wprime = network_prop.normalized_adj_matrix(G_cluster, weighted=True)

    # run the network_propagation simulation starting from the seed genes
    Fnew = network_prop.network_propagation(G_cluster, Wprime, seed_genes)

    # sort heat vector Fnew, hottest first
    Fnew = Fnew.sort_values(ascending=False)

    # if plot_flag is on plot the cluster genes with heat overlaid
    if plot_flag:
        pos = nx.spring_layout(G_cluster)
        plt.figure(figsize=(10, 10))
        nx.draw_networkx_edges(G_cluster, pos=pos, alpha=.03)
        # index with a plain list so the colours follow node order on every
        # networkx version; the colour scale saturates at a tenth of max heat
        nx.draw_networkx_nodes(G_cluster, pos=pos, node_size=20, alpha=.8,
                               node_color=Fnew[list(G_cluster.nodes())], cmap='jet',
                               vmin=0, vmax=np.max(Fnew) / 10)
        # seed genes are drawn on top, larger and in red
        nx.draw_networkx_nodes(G_cluster, pos=pos, nodelist=seed_genes, node_size=50,
                               alpha=.7, node_color='red', linewidths=2)
        # the string 'off' is truthy and would switch the grid ON
        plt.grid(False)
        plt.title('Sample subnetwork: post-heat propagation', fontsize=16)

    # find the drugs related to hot genes
    gene_drug_df = find_drugs_from_hot_genes(Fnew, G_DB, seed_genes, keep_seed_genes=True)

    return gene_drug_df
def calc_zscore_heat_double(Gint, Wprime, genes_D1, genes_D2, num_reps=10, ks_sig=0.3, rand_method='degree_binning'):
    '''
    Helper function to calculate the z-score of heat values from two input sets of genes

    The observed statistic is the element-wise product of the heat vectors
    propagated from each gene set.  The null distribution comes from num_reps
    replicates in which every seed gene is replaced by a gene drawn from the
    same degree bin.

    inputs:
        - Gint: interaction graph (networkx)
        - Wprime: normalized adjacency matrix passed to network_prop.network_propagation
        - genes_D1, genes_D2: the two seed gene sets; genes absent from Gint are dropped
        - num_reps: number of randomized replicates. Default 10
        - ks_sig: unused here; kept so the signature matches calc_zscore_heat
        - rand_method: 'degree_binning'. (this is the only option for now,
          'degree_ks_test' is deprecated)

    returns:
        - z_score_both: z-score of log(observed heat product) against the
          log of the randomized heat products
        - Fnew_rand_both: array of shape (num_reps, number of heat values)
          holding the randomized heat products (NaN at the random seeds)

    raises:
        - ValueError: if rand_method is not 'degree_binning'.  Previously this
          silently returned an all-zero null matrix and NaN z-scores.
    '''
    if rand_method != 'degree_binning':
        raise ValueError("unsupported rand_method %r; only 'degree_binning' is available"
                         % (rand_method,))

    # materialize the node collection: numpy cannot reliably convert the
    # view object returned by newer networkx versions
    graph_nodes = list(Gint.nodes())

    seed_D1 = list(np.intersect1d(list(genes_D1), graph_nodes))
    Fnew_D1 = network_prop.network_propagation(Gint, Wprime, seed_D1, alpha=.5, num_its=20)

    seed_D2 = list(np.intersect1d(list(genes_D2), graph_nodes))
    Fnew_D2 = network_prop.network_propagation(Gint, Wprime, seed_D2, alpha=.5, num_its=20)

    # NOTE(review): the z-score below subtracts positionally, so this assumes
    # Fnew_D1 and Fnew_D2 share the same index order -- confirm
    Fnew_both = Fnew_D1 * Fnew_D2
    heat_index = Fnew_D1.index.tolist()

    Fnew_rand_both = np.zeros([num_reps, len(Fnew_both)])

    bins = get_degree_binning(Gint, 10)
    min_degree, max_degree, genes_binned = zip(*bins)
    bin_df = pd.DataFrame({
        'min_degree': min_degree,
        'max_degree': max_degree,
        'genes_binned': genes_binned
    })

    def _degree_bin(gene):
        # list of genes whose degree falls in the same bin as `gene`
        degree_temp = nx.degree(Gint, gene)
        in_bin = (bin_df['min_degree'] <= degree_temp) & (bin_df['max_degree'] >= degree_temp)
        return bin_df[in_bin]['genes_binned'].tolist()[0]

    # the bin of each seed gene never changes between replicates, so look it
    # up once instead of re-filtering the DataFrame in every replicate
    bins_D1 = [_degree_bin(g) for g in seed_D1]
    bins_D2 = [_degree_bin(g) for g in seed_D2]

    def _draw_seeds(seed_bins):
        # one degree-matched random gene per seed (duplicates are possible)
        drawn = []
        for genes_temp in seed_bins:
            np.random.shuffle(genes_temp)  # shuffles the bin in place
            drawn.append(genes_temp[0])
        return drawn

    for r in range(num_reps):
        if (r % 50) == 0:
            print(r)

        # UPDATE 1/30/18 -- sample from degree bins
        seed_D1_random = _draw_seeds(bins_D1)
        seed_D2_random = _draw_seeds(bins_D2)

        Fnew_rand_1 = network_prop.network_propagation(Gint, Wprime, seed_D1_random,
                                                       alpha=.5, num_its=20)
        Fnew_rand_1.loc[seed_D1_random] = np.nan  # set seeds to nan so they don't bias results

        Fnew_rand_2 = network_prop.network_propagation(Gint, Wprime, seed_D2_random,
                                                       alpha=.5, num_its=20)
        Fnew_rand_2.loc[seed_D2_random] = np.nan  # set seeds to nan so they don't bias results

        Fnew_rand_both[r] = (Fnew_rand_1 * Fnew_rand_2).loc[heat_index]

    log_rand = np.log(Fnew_rand_both)
    z_score_both = (np.log(Fnew_both) - np.nanmean(log_rand, axis=0)) / np.nanstd(log_rand, axis=0)

    return z_score_both, Fnew_rand_both
def calc_zscore_heat(Gint, Wprime, genes_D1, num_reps=10, ks_sig=0.3, rand_method='degree_binning'):
    '''
    Helper function to calculate the z-score of heat values from one input set of genes

    The observed heat vector is compared against num_reps replicates
    propagated from randomly chosen, degree-matched seed sets.

    inputs:
        - Gint: interaction graph (networkx)
        - Wprime: normalized adjacency matrix passed to network_prop.network_propagation
        - genes_D1: seed gene set; genes absent from Gint are dropped
        - num_reps: number of randomized replicates. Default 10
        - ks_sig: only used by 'degree_ks_test' -- random seed sets are redrawn
          until the KS-test p-value comparing their degrees with the real
          seeds' degrees is at least ks_sig
        - rand_method: 'degree_ks_test', or 'degree_binning'. select the type
          of randomization

    returns:
        - z_score_D1: z-score of log(observed heat) against the log of the
          randomized heats
        - Fnew_rand_D1: array of shape (num_reps, number of heat values)
          holding the randomized heats (NaN at the random seeds)

    raises:
        - ValueError: if rand_method is not one of the two supported values.
          Previously this silently returned an all-zero null matrix and NaN
          z-scores.
    '''
    if rand_method not in ('degree_ks_test', 'degree_binning'):
        raise ValueError("unsupported rand_method %r; expected 'degree_ks_test' or "
                         "'degree_binning'" % (rand_method,))

    # materialize the node collection: the view object returned by newer
    # networkx versions can be neither shuffled nor converted by numpy
    graph_nodes = list(Gint.nodes())

    seed_D1 = list(np.intersect1d(list(genes_D1), graph_nodes))
    Fnew_D1 = network_prop.network_propagation(Gint, Wprime, seed_D1, alpha=.5, num_its=20)
    heat_index = Fnew_D1.index.tolist()

    Fnew_rand_D1 = np.zeros([num_reps, len(Fnew_D1)])

    if rand_method == 'degree_ks_test':
        # dict() gives a node -> degree mapping on both old (dict) and new
        # (DegreeView) networkx return types
        seed_degrees = pd.Series(dict(Gint.degree(seed_D1)))

        for r in range(num_reps):
            if (r % 50) == 0:
                print(r)

            # UPDATE 8/23/17 -- replace with randomly selecting seed nodes,
            # checking for degree distribution equivalence
            p = 0
            # resample until degree distributions are not significantly different
            while p < ks_sig:
                seed_D1_random = list(graph_nodes)
                np.random.shuffle(seed_D1_random)
                seed_D1_random = seed_D1_random[0:len(seed_D1)]
                ks_stat, p = scipy.stats.ks_2samp(
                    seed_degrees,
                    pd.Series(dict(Gint.degree(seed_D1_random))))

            Fnew_rand_tmp = network_prop.network_propagation(Gint, Wprime, seed_D1_random,
                                                             alpha=.5, num_its=20)
            Fnew_rand_tmp.loc[seed_D1_random] = np.nan  # set seeds to nan so they don't bias results
            Fnew_rand_D1[r] = Fnew_rand_tmp.loc[heat_index]

    else:  # 'degree_binning'
        bins = get_degree_binning(Gint, 10)
        min_degree, max_degree, genes_binned = zip(*bins)
        bin_df = pd.DataFrame({
            'min_degree': min_degree,
            'max_degree': max_degree,
            'genes_binned': genes_binned
        })

        # create a lookup table for degree and index
        actual_degree_to_bin_df_idx = {}
        for i in range(0, bin_df['max_degree'].max() + 1):
            idx_temp = bin_df[(bin_df['min_degree'].lt(i + 1)) &
                              (bin_df['max_degree'].gt(i - 1))].index.tolist()
            if len(idx_temp) > 0:  # there are some degrees which aren't represented in the graph
                actual_degree_to_bin_df_idx[i] = idx_temp[0]

        # the bin of each seed gene never changes between replicates, so
        # resolve it once up front
        seed_bins = [
            bin_df.loc[actual_degree_to_bin_df_idx[nx.degree(Gint, g)]]['genes_binned']
            for g in seed_D1
        ]

        for r in range(num_reps):
            if (r % 50) == 0:
                print(r)

            # UPDATE 1/30/18 -- sample from degree bins
            seed_D1_random = []
            already_drawn = set()  # O(1) duplicate checks
            for genes_temp in seed_bins:
                np.random.shuffle(genes_temp)  # shuffles the bin in place
                # make sure the gene isn't already in the list.  If every gene
                # of this bin has already been drawn no reshuffle can help, so
                # accept a duplicate rather than loop forever.
                if not already_drawn.issuperset(genes_temp):
                    while genes_temp[0] in already_drawn:
                        np.random.shuffle(genes_temp)
                seed_D1_random.append(genes_temp[0])
                already_drawn.add(genes_temp[0])

            Fnew_rand_tmp = network_prop.network_propagation(Gint, Wprime, seed_D1_random,
                                                             alpha=.5, num_its=20)
            Fnew_rand_tmp.loc[seed_D1_random] = np.nan  # set seeds to nan so they don't bias results
            Fnew_rand_D1[r] = Fnew_rand_tmp.loc[heat_index]

    log_rand = np.log(Fnew_rand_D1)
    z_score_D1 = (np.log(Fnew_D1) - np.nanmean(log_rand, axis=0)) / np.nanstd(log_rand, axis=0)

    return z_score_D1, Fnew_rand_D1