コード例 #1
0
def cluster_genes_heatprop(seed_genes, cluster_x_y_z, num_top_genes=500):
    '''
    Function to extract the hottest part of a cluster network, using network propagation methods

    inputs:
        - seed_genes:  genes from which to initiate heat propagation simulation
        - cluster_x_y_z:  mapping whose 'cluster' entry holds the cluster graph
                          (presumably a weighted networkx graph -- confirm against caller)
        - num_top_genes: how many of the hottest genes to keep in the returned subgraph. Default 500

    returns:
        - the subgraph of the cluster graph restricted to the num_top_genes hottest genes

    '''

    G_cluster = cluster_x_y_z['cluster']

    # calculate the degree-normalized adjacency matrix
    Wprime = network_prop.normalized_adj_matrix(G_cluster, weighted=True)

    # run the network_propagation simulation starting from the seed genes
    Fnew = network_prop.network_propagation(G_cluster, Wprime, seed_genes)

    # sort heat vector Fnew, hottest first.  Series.sort() is not available in
    # current pandas; sort_values() is the supported spelling.
    Fnew = Fnew.sort_values(ascending=False)

    return G_cluster.subgraph(Fnew.head(num_top_genes).keys())
コード例 #2
0
def drug_gene_heatprop(seed_genes,path_to_DB_file,path_to_cluster_file,plot_flag=False):

    '''
    Function to establish drugs potentially related to an input gene list, using network propagation methods

    inputs:
        - seed_genes:  genes from which to initiate heat propagation simulation
        - path_to_DB_file:  path to drug bank file, including filename
        - path_to_cluster_file: path to cluster file, including filename
        - plot_flag: should we plot the subnetwork with heat overlaid? Default False

    returns:
        - gene_drug_df: whatever find_drugs_from_hot_genes returns for the sorted heat vector
                        (presumably a gene/drug table -- confirm against that helper)

    '''

    # load and parse the drug-bank file into a dict
    DBdict = load_DB_data(path_to_DB_file)

    # make a network out of drug-gene interactions: one edge per (drug id, target name) pair
    DB_el = [(drug['drugbank_id'], target['name'])
             for drug in DBdict.values()
             for target in drug['node_list']]

    G_DB = nx.Graph()
    G_DB.add_edges_from(DB_el)

    G_cluster = load_cluster_data(path_to_cluster_file)

    # calculate the degree-normalized adjacency matrix
    Wprime = network_prop.normalized_adj_matrix(G_cluster,weighted=True)

    # run the network_propagation simulation starting from the seed genes
    Fnew = network_prop.network_propagation(G_cluster,Wprime,seed_genes)

    # sort heat vector Fnew, hottest first.  Series.sort() is not available in
    # current pandas; sort_values() is the supported spelling.
    Fnew = Fnew.sort_values(ascending=False)

    # if plot_flag is on plot the cluster genes with heat overlaid
    if plot_flag:
        pos = nx.spring_layout(G_cluster)

        # materialise the node order once so the colours line up with the drawn
        # nodes even when nodes() returns a view rather than a list
        cluster_nodes = list(G_cluster.nodes())

        plt.figure(figsize=(10,10))
        nx.draw_networkx_edges(G_cluster,pos=pos,alpha=.03)
        nx.draw_networkx_nodes(G_cluster,pos=pos,node_size=20,alpha=.8,node_color=Fnew[cluster_nodes],cmap='jet',
                               vmin=0,vmax=np.max(Fnew)/10)
        nx.draw_networkx_nodes(G_cluster,pos=pos,nodelist=seed_genes,node_size=50,alpha=.7,node_color='red',linewidths=2)

        # pass a real boolean: the string 'off' is truthy and would not hide the grid
        plt.grid(False)
        plt.title('Sample subnetwork: post-heat propagation',fontsize=16)

    # find the drugs related to hot genes
    gene_drug_df = find_drugs_from_hot_genes(Fnew,G_DB,seed_genes,keep_seed_genes =True)

    return gene_drug_df
    
    
コード例 #3
0
def drug_gene_heatprop(seed_genes, cluster_x_y_z, plot_flag=False):
    '''
    Function to establish drugs potentially related to an input gene list, using network propagation methods

    inputs:
        - seed_genes:  genes from which to initiate heat propagation simulation
        - cluster_x_y_z:  already-loaded cluster graph to propagate heat over
        - plot_flag: should we plot the subnetwork with heat overlaid? Default False

    returns:
        - gene_drug_df: whatever find_drugs_from_hot_genes returns for the sorted heat vector
                        (presumably a gene/drug table -- confirm against that helper)

    NOTE: the drug-gene edge list is read from the name DB_el, which is not
    defined in this function; it must already exist at module level
    (presumably filled by load_DB_el() -- TODO confirm).

    '''

    # make a network out of the preloaded drug-gene interactions
    G_DB = nx.Graph()
    G_DB.add_edges_from(DB_el)

    G_cluster = cluster_x_y_z

    # calculate the degree-normalized adjacency matrix
    Wprime = network_prop.normalized_adj_matrix(G_cluster, weighted=True)

    # run the network_propagation simulation starting from the seed genes
    Fnew = network_prop.network_propagation(G_cluster, Wprime, seed_genes)

    # sort heat vector Fnew, hottest first
    Fnew = Fnew.sort_values(ascending=False)

    # if plot_flag is on plot the cluster genes with heat overlaid
    if plot_flag:
        pos = nx.spring_layout(G_cluster)

        # materialise the node order once so the colours line up with the drawn
        # nodes even when nodes() returns a view rather than a list
        cluster_nodes = list(G_cluster.nodes())

        plt.figure(figsize=(10, 10))
        nx.draw_networkx_edges(G_cluster, pos=pos, alpha=.03)
        nx.draw_networkx_nodes(G_cluster,
                               pos=pos,
                               node_size=20,
                               alpha=.8,
                               node_color=Fnew[cluster_nodes],
                               cmap='jet',
                               vmin=0,
                               vmax=np.max(Fnew) / 10)
        nx.draw_networkx_nodes(G_cluster,
                               pos=pos,
                               nodelist=seed_genes,
                               node_size=50,
                               alpha=.7,
                               node_color='red',
                               linewidths=2)

        # pass a real boolean: the string 'off' is truthy and would not hide the grid
        plt.grid(False)
        plt.title('Sample subnetwork: post-heat propagation', fontsize=16)

    # find the drugs related to hot genes
    gene_drug_df = find_drugs_from_hot_genes(Fnew,
                                             G_DB,
                                             seed_genes,
                                             keep_seed_genes=True)

    return gene_drug_df
コード例 #4
0
def calc_zscore_heat_double(Gint,
                            Wprime,
                            genes_D1,
                            genes_D2,
                            num_reps=10,
                            ks_sig=0.3,
                            rand_method='degree_binning'):
    '''
    Helper function to calculate the z-score of heat values from two input sets of genes

    inputs:
        - Gint:  graph to propagate heat over
        - Wprime:  normalized adjacency matrix handed to network_prop.network_propagation
        - genes_D1, genes_D2:  the two seed gene sets; genes missing from Gint are dropped
        - num_reps:  number of randomized seed sets to draw. Default 10
        - ks_sig:  unused here; kept so the signature matches calc_zscore_heat
        - rand_method:  'degree_binning' is the only supported option
                        ('degree_ks_test' is deprecated)

    returns:
        - z_score_both:  z-score of the log of the product of the two heat vectors,
                         relative to the log heat products of the randomized seed sets
        - Fnew_rand_both:  num_reps x num_nodes array of randomized heat products
                           (NaN wherever a node was one of the random seeds)

    raises:
        - ValueError if rand_method is not 'degree_binning'.  Previously any other
          value skipped the randomization and silently produced meaningless z-scores.
    '''
    if rand_method != 'degree_binning':
        raise ValueError(
            "unsupported rand_method %r: only 'degree_binning' is available"
            % (rand_method, ))

    # list() so the intersection works whether nodes() returns a list or a view
    graph_nodes = list(Gint.nodes())

    seed_D1 = list(np.intersect1d(list(genes_D1), graph_nodes))
    Fnew_D1 = network_prop.network_propagation(Gint,
                                               Wprime,
                                               seed_D1,
                                               alpha=.5,
                                               num_its=20)

    seed_D2 = list(np.intersect1d(list(genes_D2), graph_nodes))
    Fnew_D2 = network_prop.network_propagation(Gint,
                                               Wprime,
                                               seed_D2,
                                               alpha=.5,
                                               num_its=20)

    Fnew_both = Fnew_D1 * Fnew_D2

    # each bin is a (min_degree, max_degree, genes) triple
    bins = get_degree_binning(Gint, 10)
    min_degree, max_degree, genes_binned = zip(*bins)
    bin_df = pd.DataFrame({
        'min_degree': min_degree,
        'max_degree': max_degree,
        'genes_binned': genes_binned
    })

    # the bin of each seed never changes, so look it up once instead of every rep
    pools_D1 = _bin_pools_for_seeds(Gint, bin_df, seed_D1)
    pools_D2 = _bin_pools_for_seeds(Gint, bin_df, seed_D2)

    focal_index = Fnew_D1.index.tolist()
    Fnew_rand_both = np.zeros([num_reps, len(Fnew_both)])
    for r in range(num_reps):
        if (r % 50) == 0:
            print(r)

        # UPDATE 1/30/18 -- sample from degree bins
        seed_D1_random = _draw_one_per_pool(pools_D1)
        seed_D2_random = _draw_one_per_pool(pools_D2)

        Fnew_rand_1 = network_prop.network_propagation(Gint,
                                                       Wprime,
                                                       seed_D1_random,
                                                       alpha=.5,
                                                       num_its=20)
        # set seeds to nan so they don't bias results
        Fnew_rand_1.loc[seed_D1_random] = np.nan

        Fnew_rand_2 = network_prop.network_propagation(Gint,
                                                       Wprime,
                                                       seed_D2_random,
                                                       alpha=.5,
                                                       num_its=20)
        # set seeds to nan so they don't bias results
        Fnew_rand_2.loc[seed_D2_random] = np.nan

        # align the random heat product with the node order of the real heat vector
        Fnew_rand_both[r] = (Fnew_rand_1 * Fnew_rand_2).loc[focal_index]

    log_rand = np.log(Fnew_rand_both)
    z_score_both = (np.log(Fnew_both) -
                    np.nanmean(log_rand, axis=0)) / np.nanstd(log_rand, axis=0)

    return z_score_both, Fnew_rand_both


def _bin_pools_for_seeds(Gint, bin_df, seeds):
    '''Return, for each seed gene, the gene list of the first degree bin containing its degree.'''
    pools = []
    for g in seeds:
        degree_temp = nx.degree(Gint, g)
        in_bin = ((bin_df['min_degree'] <= degree_temp)
                  & (bin_df['max_degree'] >= degree_temp))
        pools.append(bin_df[in_bin]['genes_binned'].tolist()[0])
    return pools


def _draw_one_per_pool(pools):
    '''Shuffle each pool in place and take its first gene; the same gene may be drawn twice.'''
    drawn = []
    for pool in pools:
        np.random.shuffle(pool)
        drawn.append(pool[0])
    return drawn
コード例 #5
0
def calc_zscore_heat(Gint,
                     Wprime,
                     genes_D1,
                     num_reps=10,
                     ks_sig=0.3,
                     rand_method='degree_binning'):
    '''
    Helper function to calculate the z-score of heat values from one input set of genes

    inputs:
        - Gint:  graph to propagate heat over
        - Wprime:  normalized adjacency matrix handed to network_prop.network_propagation
        - genes_D1:  seed gene set; genes missing from Gint are dropped
        - num_reps:  number of randomized seed sets to draw. Default 10
        - ks_sig:  only used by 'degree_ks_test': random seed sets are redrawn until the
                   KS-test p-value comparing their degrees with the real seeds reaches ks_sig
        - rand_method:  'degree_ks_test' or 'degree_binning'.  select the type of randomization

    returns:
        - z_score_D1:  z-score of the log heat of every node, relative to the log heat
                       obtained from the randomized seed sets
        - Fnew_rand_D1:  num_reps x num_nodes array of randomized heat values
                         (NaN wherever a node was one of the random seeds)

    raises:
        - ValueError if rand_method is not one of the two supported options.  Previously
          an unknown value skipped the randomization and silently produced meaningless z-scores.
    '''
    if rand_method not in ('degree_ks_test', 'degree_binning'):
        raise ValueError(
            "unsupported rand_method %r: expected 'degree_ks_test' or 'degree_binning'"
            % (rand_method, ))

    # list() so the intersection works whether nodes() returns a list or a view
    seed_D1 = list(np.intersect1d(list(genes_D1), list(Gint.nodes())))
    Fnew_D1 = network_prop.network_propagation(Gint,
                                               Wprime,
                                               seed_D1,
                                               alpha=.5,
                                               num_its=20)

    if rand_method == 'degree_binning':
        # each bin is a (min_degree, max_degree, genes) triple
        bins = get_degree_binning(Gint, 10)
        min_degree, max_degree, genes_binned = zip(*bins)
        bin_df = pd.DataFrame({
            'min_degree': min_degree,
            'max_degree': max_degree,
            'genes_binned': genes_binned
        })
        # create a lookup table for degree and index
        actual_degree_to_bin_df_idx = {}
        for i in range(0, bin_df['max_degree'].max() + 1):
            idx_temp = bin_df[(bin_df['min_degree'].lt(i + 1))
                              & (bin_df['max_degree'].gt(i - 1))].index.tolist()
            # there are some degrees which aren't represented in the graph
            if len(idx_temp) > 0:
                actual_degree_to_bin_df_idx[i] = idx_temp[0]

        # the bin of each seed never changes, so resolve it once instead of every rep
        seed_pools = [
            bin_df.loc[actual_degree_to_bin_df_idx[nx.degree(
                Gint, g)]]['genes_binned'] for g in seed_D1
        ]

    focal_index = Fnew_D1.index.tolist()
    Fnew_rand_D1 = np.zeros([num_reps, len(Fnew_D1)])
    for r in range(num_reps):
        if (r % 50) == 0:
            print(r)

        if rand_method == 'degree_ks_test':
            # UPDATE 8/23/17 -- randomly select seed nodes, checking for degree distribution equivalence
            seed_D1_random = _draw_ks_matched_seeds(Gint, seed_D1, ks_sig)
        else:
            # UPDATE 1/30/18 -- sample from degree bins
            seed_D1_random = _draw_distinct_binned_seeds(seed_pools)

        Fnew_rand_tmp = network_prop.network_propagation(Gint,
                                                         Wprime,
                                                         seed_D1_random,
                                                         alpha=.5,
                                                         num_its=20)
        # set seeds to nan so they don't bias results
        Fnew_rand_tmp.loc[seed_D1_random] = np.nan
        # align the random heat with the node order of the real heat vector
        Fnew_rand_D1[r] = Fnew_rand_tmp.loc[focal_index]

    log_rand = np.log(Fnew_rand_D1)
    z_score_D1 = (np.log(Fnew_D1) -
                  np.nanmean(log_rand, axis=0)) / np.nanstd(log_rand, axis=0)

    return z_score_D1, Fnew_rand_D1


def _draw_ks_matched_seeds(Gint, seed_D1, ks_sig):
    '''Draw len(seed_D1) random nodes, redrawing until the KS-test p-value vs. the seed degrees reaches ks_sig.'''
    # dict() makes the degree lookup a plain node -> degree mapping whether
    # degree() returns a dict or a view
    focal_degrees = pd.Series(dict(Gint.degree(seed_D1)))
    while True:
        # copy into a list: a node view cannot be shuffled in place
        candidates = list(Gint.nodes())
        np.random.shuffle(candidates)
        candidates = candidates[0:len(seed_D1)]
        _, p = scipy.stats.ks_2samp(
            focal_degrees, pd.Series(dict(Gint.degree(candidates))))
        # keep resampling while the degree distributions are significantly different
        if p >= ks_sig:
            return candidates


def _draw_distinct_binned_seeds(seed_pools):
    '''Shuffle each pool in place and take its first gene, reshuffling while that gene was already drawn.'''
    drawn = []
    for pool in seed_pools:
        np.random.shuffle(pool)
        # make sure the gene isn't already in the list
        while pool[0] in drawn:
            np.random.shuffle(pool)
        drawn.append(pool[0])
    return drawn