Example #1
def modular_ntwrk():
    G_025 = nx.algorithms.community.LFR_benchmark_graph(n=250,
                                                        tau1=3,
                                                        tau2=1.5,
                                                        mu=0.25,
                                                        average_degree=5,
                                                        min_community=20,
                                                        seed=10)
    np.savetxt(pt.get_path() + '/data/modular_ntwrk_mu_025.txt',
               nx.to_numpy_matrix(G_025),
               delimiter="\t")

    G_015 = nx.algorithms.community.LFR_benchmark_graph(n=250,
                                                        tau1=3,
                                                        tau2=1.5,
                                                        mu=0.15,
                                                        average_degree=5,
                                                        min_community=20,
                                                        seed=10)
    np.savetxt(pt.get_path() + '/data/modular_ntwrk_mu_015.txt',
               nx.to_numpy_matrix(G_015),
               delimiter="\t")

    G_010 = nx.algorithms.community.LFR_benchmark_graph(n=250,
                                                        tau1=3,
                                                        tau2=1.5,
                                                        mu=0.10,  # matches the _mu_010 output file
                                                        average_degree=5,
                                                        min_community=20,
                                                        seed=10)
    np.savetxt(pt.get_path() + '/data/modular_ntwrk_mu_010.txt',
               nx.to_numpy_matrix(G_010),
               delimiter="\t")
    communities = {frozenset(G_010.nodes[v]['community']) for v in G_010}  # unique LFR community node sets
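For downstream use, a saved adjacency matrix can be reloaded and turned back into a graph; a minimal sketch, assuming pt is the project's parevol_tools module as imported in Example #18:

import numpy as np
import networkx as nx
import parevol_tools as pt

# reload a saved LFR adjacency matrix and rebuild the graph
adj = np.loadtxt(pt.get_path() + '/data/modular_ntwrk_mu_025.txt', delimiter='\t')
G = nx.from_numpy_array(adj)
print(G.number_of_nodes(), G.number_of_edges())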
Example #2
def run_ntwrk_cov_sims(var=1, cov=0.25):
    df_out = open(
        pt.get_path() + '/data/simulations/cov_ntwrk_euc_pos_only_010.txt',
        'w')
    n_pops = 20
    n_genes = 250
    lambda_genes = np.random.gamma(shape=3, scale=1, size=n_genes)
    df_out.write('\t'.join(['Cov', 'Iteration', 'z_score']) + '\n')
    C = np.loadtxt(pt.get_path() + '/data/modular_ntwrk_mu_010.txt',
                   delimiter='\t')
    C = C * cov
    np.fill_diagonal(C, var)
    for i in range(100):
        test_cov = np.stack(
            [get_count_pop(lambda_genes, cov=C) for x in range(n_pops)],
            axis=0)
        X = pt.hellinger_transform(test_cov)
        pca = PCA()
        pca_fit = pca.fit_transform(X)
        euc_dist = pt.get_mean_pairwise_euc_distance(pca_fit)
        sim_eucs = []
        for j in range(1000):
            X_j = pt.hellinger_transform(pt.random_matrix(test_cov))
            pca_fit_j = pca.fit_transform(X_j)
            sim_eucs.append(pt.get_mean_pairwise_euc_distance(pca_fit_j))
        z_score = (euc_dist - np.mean(sim_eucs)) / np.std(sim_eucs)
        print(str(cov), ' ', str(i), ' ', str(z_score))
        df_out.write('\t'.join([str(cov), str(i), str(z_score)]) + '\n')

    df_out.close()
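The adjacency-to-covariance construction above (a shared covariance on every edge, a common variance on the diagonal) can be checked for positive definiteness in isolation; a toy sketch:

import numpy as np
import networkx as nx

A = nx.to_numpy_array(nx.barabasi_albert_graph(20, 2, seed=1))
cov, var = 0.25, 1.0
C = A * cov                # shared covariance on every edge
np.fill_diagonal(C, var)   # common variance on the diagonal
print(np.all(np.linalg.eigvalsh(C) > 0))  # positive definite?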
Example #3
def poisson_power_G(alpha = 0.05):
    fig = plt.figure()
    df = pd.read_csv(pt.get_path() + '/data/simulations/ba_cov_G_sims.txt', sep='\t')
    covs = np.sort(list(set(df.Cov.values)))
    Gs = np.sort(list(set(df.G.values)))  # gene counts
    colors = ['powderblue',  'royalblue', 'navy']
    for i, cov in enumerate(covs):
        powers = []
        for G in Gs:
            df_cov = df[ (df['Cov'] == cov) & (df['G'] == G) ]
            p = df_cov['dist_percent'].values
            p_sig = [p_i for p_i in p if p_i >= (1-alpha)]
            powers.append(len(p_sig) / len(p))
        plt.plot(np.asarray(Gs), np.asarray(powers), linestyle='--', marker='o', color=colors[i], label=r'$\mathrm{cov}=$' + str(cov))

    plt.tight_layout()
    plt.legend(loc='upper left', fontsize=14)
    plt.xlabel('Number of genes, '+ r'$\mathrm{log}_{2}$', fontsize = 16)
    plt.xscale('log', base=2)
    plt.axhline(0.05, color = 'dimgrey', lw = 2, ls = '--')
    plt.ylabel(r'$ \mathrm{P}\left ( \mathrm{reject} \; H_{0}   \mid H_{1} \;   \mathrm{is}\, \mathrm{true}, \, \alpha=0.05 \right ) $', fontsize = 16)
    fig_name = pt.get_path() + '/figs/poisson_power_G.png'
    fig.savefig(fig_name, bbox_inches = "tight", pad_inches = 0.4, dpi = 600)
    plt.close()
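The power computation here and in the later figure functions is just the fraction of simulated percentile scores landing in the rejection region; distilled into a stand-alone helper (an illustration, not a pt function):

import numpy as np

def empirical_power(percentiles, alpha=0.05):
    # fraction of simulated percentile scores at or above 1 - alpha
    p = np.asarray(percentiles)
    return np.mean(p >= 1 - alpha)

print(empirical_power([0.99, 0.80, 0.97, 0.50]))  # 0.5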
Example #4
def rndm_sample_tenaillon(iter1=1000, iter2=1000):
    df_path = pt.get_path() + '/data/Tenaillon_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    df_np = df.values
    gene_names = df.columns.values
    n_rows = df_np.shape[0]
    df_out = open(pt.get_path() + '/data/Tenaillon_et_al/sample_size_sim.txt',
                  'w')
    df_out.write(
        '\t'.join(['N', 'G', 'Iteration', 'dist_percent', 'z_score']) + '\n')
    Ns = list(range(2, 40, 2))
    for N in Ns:
        for i in range(iter1):
            # sample N populations with replacement
            df_np_i = df_np[np.random.randint(n_rows, size=N), :]
            gene_bool = np.all(df_np_i == 0, axis=0)
            # keep the names of genes with at least one mutation in the subsample
            gene_names_i = list(
                compress(gene_names, list(map(operator.not_, gene_bool))))
            df_np_i = df_np_i[:, ~gene_bool]
            np.seterr(divide='ignore')
            df_np_i_delta = pt.likelihood_matrix_array(
                df_np_i, gene_names_i,
                'Tenaillon_et_al').get_likelihood_matrix()
            X = pt.hellinger_transform(df_np_i_delta)
            pca = PCA()
            pca_fit = pca.fit_transform(X)
            euc_dist = pt.get_mean_pairwise_euc_distance(pca_fit)
            euc_dists = []
            for j in range(iter2):
                df_np_i_j = pt.get_random_matrix(df_np_i)
                np.seterr(divide='ignore')
                df_np_i_j_delta = pt.likelihood_matrix_array(
                    df_np_i_j, gene_names_i,
                    'Tenaillon_et_al').get_likelihood_matrix()
                X_j = pt.hellinger_transform(df_np_i_j_delta)
                pca_fit_j = pca.fit_transform(X_j)
                euc_dists.append(pt.get_mean_pairwise_euc_distance(pca_fit_j))

            G = df_np_i.shape[1]
            euc_percent = len([k for k in euc_dists if k < euc_dist
                               ]) / len(euc_dists)
            z_score = (euc_dist - np.mean(euc_dists)) / np.std(euc_dists)
            print(str(N), str(G), str(i), str(euc_percent), str(z_score))
            df_out.write('\t'.join([
                str(N), str(G),
                str(i), str(euc_percent),
                str(z_score)
            ]) + '\n')

    df_out.close()
Example #5
def run_all_sims():
    df_out = open(pt.get_path() + '/data/simulations/cov_euc.txt', 'w')
    n_pops = 20
    n_genes = 50
    lambda_genes = np.random.gamma(shape=3, scale=1, size=n_genes)
    covs = [0.5, 0, -0.5]
    df_out.write('\t'.join(['Covariance', 'Iteration', 'z_score']) + '\n')
    for cov in covs:
        for i in range(100):
            print(str(cov) + ' ' + str(i))
            test_cov = np.stack(
                [get_count_pop(lambda_genes, cov=cov) for x in range(n_pops)],
                axis=0)
            X = pt.hellinger_transform(test_cov)
            pca = PCA()
            pca_fit = pca.fit_transform(X)
            euc_dist = pt.get_euclidean_distance(pca_fit)
            sim_eucs = []
            for j in range(1000):
                #if j % 100 == 0:
                #    print(j)
                X_j = pt.hellinger_transform(pt.random_matrix(test_cov))
                pca_fit_j = pca.fit_transform(X_j)
                sim_eucs.append(pt.get_euclidean_distance(pca_fit_j))
            z_score = (euc_dist - np.mean(sim_eucs)) / np.std(sim_eucs)

            df_out.write('\t'.join([str(cov), str(i), str(z_score)]) + '\n')

    df_out.close()
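The simulation loops above all standardize an observed statistic against its permutation null; the pattern pulled out on its own (an illustration, not a pt function):

import numpy as np

def permutation_z_score(observed, null_draws):
    # standardize the observed statistic against the permutation null
    null_draws = np.asarray(null_draws, dtype=float)
    return (observed - null_draws.mean()) / null_draws.std()

print(permutation_z_score(2.0, [0.0, 1.0, 2.0, 1.0]))  # ~1.41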
Example #6
def run_block_cov_sims():
    df_out = open(
        pt.get_path() + '/data/simulations/cov_block_euc_pos_only.txt', 'w')
    n_pops = 20
    n_genes = 50
    lambda_genes = np.random.gamma(shape=3, scale=1, size=n_genes)
    df_out.write('\t'.join(['Cov', 'Iteration', 'z_score']) + '\n')
    #covs = [0.1, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9]
    covs = [-0.9]
    for cov in covs:
        C = get_block_cov(n_genes, pos_cov=cov, neg_cov=cov)
        print(np.all(np.linalg.eigvals(C) > 0))  # C must be positive definite
        print(C)
        for i in range(100):
            test_cov = np.stack(
                [get_count_pop(lambda_genes, cov=C) for x in range(n_pops)],
                axis=0)
            X = pt.hellinger_transform(test_cov)
            pca = PCA()
            pca_fit = pca.fit_transform(X)
            euc_dist = pt.get_mean_pairwise_euc_distance(pca_fit)
            sim_eucs = []
            for j in range(1000):
                X_j = pt.hellinger_transform(pt.random_matrix(test_cov))
                pca_fit_j = pca.fit_transform(X_j)
                sim_eucs.append(pt.get_mean_pairwise_euc_distance(pca_fit_j))
            z_score = (euc_dist - np.mean(sim_eucs)) / np.std(sim_eucs)
            print(str(cov), ' ', str(i), ' ', str(z_score))
            df_out.write('\t'.join([str(cov), str(i), str(z_score)]) + '\n')

    df_out.close()
Example #7
def get_network_cov(cov=1 / 9, var=1):
    ntwrk = np.loadtxt(pt.get_path() + '/data/disassoc_network_eq.txt',
                       delimiter='\t')
    ntwrk = ntwrk * cov
    np.fill_diagonal(ntwrk, var)

    # Gershgorin circle theorem sets a limit on the off-diagonal covariance:
    # the matrix is positive definite when |cov| * max degree < var
    # https://math.stackexchange.com/questions/2378428/how-to-create-a-positive-definite-covariance-matrix-from-an-adjacency-matrix
    graph = nx.barabasi_albert_graph(50, 5)
    graph_np = nx.to_numpy_matrix(graph)
    graph_np = graph_np * cov
    np.fill_diagonal(graph_np, 1)

    print(np.linalg.eigvals(ntwrk))
    print(np.all(np.linalg.eigvals(ntwrk) > 0))  # positive definite?
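The Gershgorin bound in the comments can be verified directly: with unit variances, the matrix built from an adjacency matrix is strictly diagonally dominant, hence positive definite, whenever |cov| times the maximum degree is below 1. A minimal check:

import numpy as np
import networkx as nx

graph = nx.barabasi_albert_graph(50, 5, seed=2)
A = nx.to_numpy_array(graph)
d_max = A.sum(axis=1).max()   # maximum degree
cov = 0.9 / d_max             # inside the Gershgorin bound |cov| < 1 / d_max
C = A * cov
np.fill_diagonal(C, 1.0)
print(np.all(np.linalg.eigvalsh(C) > 0))  # True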
Example #8
def run_ba_cov_neutral_sims(shape=1,
                            scale=1,
                            G=50,
                            N=50,
                            iter1=1000,
                            iter2=1000):
    df_out = open(pt.get_path() + '/data/simulations/ba_cov_neutral_sims.txt',
                  'w')
    df_out.write('\t'.join([
        'N', 'G', 'lambda_mean', 'lambda_neutral', 'Cov', 'Iteration',
        'dist_percent'
    ]) + '\n')
    covs = [0.2]
    mean_gamma = shape * scale
    neutral_range = np.logspace(-2, 1, num=20, endpoint=True, base=10.0)
    neutral_range = neutral_range[::-1]
    for neutral_ in neutral_range:
        for cov in covs:
            for i in range(iter1):
                C = pt.get_ba_cov_matrix(G, cov)
                lambda_genes = np.random.gamma(shape=shape,
                                               scale=scale,
                                               size=G)
                lambda_genes_null = np.asarray([neutral_] * G)
                test_cov_adapt = np.stack(
                    [pt.get_count_pop(lambda_genes, C=C) for x in range(N)],
                    axis=0)
                # null counts drawn with an identity covariance (diagonal of ones)
                test_cov_neutral = np.stack([
                    pt.get_count_pop(lambda_genes_null, C=np.identity(G))
                    for x in range(N)
                ],
                                            axis=0)
                test_cov = test_cov_adapt + test_cov_neutral

                X = pt.hellinger_transform(test_cov)
                pca = PCA()
                pca_fit = pca.fit_transform(X)
                euc_dist = pt.get_mean_pairwise_euc_distance(pca_fit)
                euc_dists = []
                for j in range(iter2):
                    X_j = pt.hellinger_transform(
                        pt.get_random_matrix(test_cov))
                    pca_fit_j = pca.fit_transform(X_j)
                    euc_dists.append(
                        pt.get_mean_pairwise_euc_distance(pca_fit_j))
                euc_percent = len([k for k in euc_dists if k < euc_dist
                                   ]) / len(euc_dists)
                print(neutral_, cov, i, euc_percent)
                df_out.write('\t'.join([
                    str(N),
                    str(G),
                    str(mean_gamma),
                    str(neutral_),
                    str(cov),
                    str(i),
                    str(euc_percent)
                ]) + '\n')
    df_out.close()
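The adaptive and neutral count matrices are summed; for independent Poisson draws this is the usual superposition property, equivalent to a single Poisson draw with summed rates. A minimal illustration that ignores the covariance structure pt.get_count_pop imposes:

import numpy as np

rng = np.random.default_rng(1)
G = 50
lam_adapt = rng.gamma(shape=1.0, scale=1.0, size=G)  # gene-specific adaptive rates
lam_neutral = np.full(G, 0.5)                        # shared neutral rate
counts = rng.poisson(lam_adapt) + rng.poisson(lam_neutral)  # ~ Poisson(lam_adapt + lam_neutral)
print(counts[:10])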
Example #9
def tenaillon_fitness_hist():
    gene_by_pop_path = pt.get_path() + '/data/Tenaillon_et_al/gene_by_pop.txt'
    gene_by_pop = pd.read_csv(gene_by_pop_path, sep = '\t', header = 'infer', index_col = 0)
    fitness_path = pt.get_path() + '/data/Tenaillon_et_al/fitness.csv'
    fitness = pd.read_csv(fitness_path, sep = ',', header = 'infer', index_col = 0)
    # select fitness values from lines that were sequenced
    fitness_subset = fitness.loc[gene_by_pop.index.values]
    fitness_np = fitness_subset['W (avg)'].values
    fitness_np = fitness_np[np.logical_not(np.isnan(fitness_np))]

    kde = pt.get_kde(fitness_np)

    fig = plt.figure()
    plt.plot(kde[0], kde[1])
    plt.xlabel("Fitness", fontsize = 18)
    plt.ylabel("Frequency", fontsize = 18)
    fig.tight_layout()
    plot_path = pt.get_path() + '/figs/tenaillon_fitness.png'
    fig.savefig(plot_path, bbox_inches = "tight", pad_inches = 0.4, dpi = 600)
    plt.close()
Example #10
def run_pca_sample_size_permutation(iter=10000, analysis='PCA', k=3):
    df_path = pt.get_path() + '/data/Tenaillon_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    df_array = df.values
    sample_sizes = np.linspace(2, df.shape[0], num=20, dtype=int)
    df_out = open(
        pt.get_path() + '/data/Tenaillon_et_al/sample_size_permute_' +
        analysis + '.txt', 'w')
    column_headers = [
        'Sample_size', 'Iteration', 'MCD', 'mean_angle', 'delta_L'
    ]
    df_out.write('\t'.join(column_headers) + '\n')
    for sample_size in sample_sizes:
        print("Sample size = " + str(sample_size))
        for i in range(iter):
            print("Sample size = " + str(sample_size) + ' Iteration = ' +
                  str(i))
            df_sample = df.sample(n=sample_size)
            #df_sample = df_sample.loc[:, (df_sample != 0).any(axis=0)]
            df_sample_delta = pt.likelihood_matrix(
                df_sample, 'Tenaillon_et_al').get_likelihood_matrix()
            df_sample_delta = df_sample_delta.loc[:,
                                                  (df_sample_delta != 0).any(
                                                      axis=0)]
            X = pt.hellinger_transform(df_sample_delta)
            pca = PCA()
            df_sample_delta_out = pca.fit_transform(X)
            mcd = pt.get_mean_centroid_distance(df_sample_delta_out, k=k)
            mean_angle = pt.get_mean_angle(df_sample_delta_out, k=k)
            mean_length = pt.get_euclidean_distance(df_sample_delta_out, k=k)

            df_out.write('\t'.join([
                str(sample_size),
                str(i),
                str(mcd),
                str(mean_angle),
                str(mean_length)
            ]) + '\n')

    df_out.close()
Example #11
def power_figs(alpha=0.05):
    df = pd.read_csv(pt.get_path() + '/data/simulations/cov_ba_ntwrk_ev.txt',
                     sep='\t')
    fig = plt.figure()
    covs = [0.05, 0.1, 0.15, 0.2]
    measures = [
        'euc_percent', 'eig_percent', 'mcd_percent_k1', 'mcd_percent_k3'
    ]
    colors = ['powderblue', 'skyblue', 'royalblue', 'blue', 'navy']
    labels = ['euclidean distance', 'eigenanalysis', 'mcd 1', 'mcd 1-3']

    for i, measure in enumerate(measures):
        powers = []
        for cov in covs:
            df_cov = df[df['Cov'] == cov]
            p = df_cov[measure].values
            p_sig = [p_i for p_i in p if p_i >= (1 - alpha)]
            powers.append(len(p_sig) / len(p))
        print(powers)
        plt.plot(np.asarray(covs),
                 np.asarray(powers),
                 linestyle='--',
                 marker='o',
                 color=colors[i],
                 label=labels[i])
    plt.legend(loc='lower right')
    plt.xlabel('Covariance', fontsize=16)
    plt.ylabel(
        r'$ \mathrm{P}\left ( \mathrm{reject} \; H_{0}   \mid H_{1} \;   \mathrm{is}\, \mathrm{true}, \, \alpha=0.05 \right ) $',
        fontsize=16)
    plt.tight_layout()
    fig_name = pt.get_path() + '/figs/power_method.png'
    fig.savefig(fig_name, bbox_inches="tight", pad_inches=0.4, dpi=600)
    plt.close()
Example #12
def poisson_neutral_fig(alpha = 0.05):
    df = pd.read_csv(pt.get_path() + '/data/simulations/ba_cov_neutral_sims.txt', sep='\t')
    neuts = np.sort(list(set(df.lambda_neutral.values)))
    cov = 0.2
    powers = []
    for neut in neuts:
        df_neut = df[ (df['lambda_neutral'] == neut)  ]
        p = df_neut.dist_percent.values
        p_sig = [p_i for p_i in p if p_i >= (1-alpha)]
        powers.append(len(p_sig) / len(p))
    fig = plt.figure()
    plt.plot(np.asarray(1 / neuts), np.asarray(powers), linestyle='--', marker='o', color='royalblue', label=r'$\mathrm{cov}=$' + str(cov))

    plt.tight_layout()
    plt.legend(loc='upper left', fontsize=14)
    plt.xscale('log', base=10)

    plt.xlabel("Adaptive vs. non-adaptive substitution rate, " + r'$\frac{ \left \langle \lambda \right \rangle }{\lambda_{0}}$', fontsize = 16)
    plt.axhline(0.05, color = 'dimgrey', lw = 2, ls = '--')
    plt.ylabel(r'$ \mathrm{P}\left ( \mathrm{reject} \; H_{0}   \mid H_{1} \;   \mathrm{is}\, \mathrm{true}, \, \alpha=0.05 \right ) $', fontsize = 16)
    fig_name = pt.get_path() + '/figs/poisson_power_neutral.png'
    fig.savefig(fig_name, bbox_inches = "tight", pad_inches = 0.4, dpi = 600)
    plt.close()
Example #13
def get_fig():
    df = pd.read_csv(pt.get_path() + '/data/simulations/cov_ba_ntwrk_ev.txt',
                     sep='\t')
    x = df.Cov.values
    y = df.euc_z_score.values
    #print(np.mean(y))

    fig = plt.figure()
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    x_slope = np.linspace(0, 1, 1000)
    y_slope = intercept + (slope * x_slope)

    plt.scatter(x, y, c='#175ac6', marker = 'o', s = 70, \
        edgecolors='#244162', linewidth = 0.6, alpha = 0.2, zorder=2)
    plt.plot(x_slope, y_slope, c='k', lw=2)
    plt.axhline(y=0, color='red', lw=2, linestyle='--')
    plt.xlabel('Covariance')
    plt.ylabel('Z-score')

    plt.tight_layout()
    fig_name = pt.get_path() + '/figs/cov_ba_ntwrk_ev.png'
    fig.savefig(fig_name, bbox_inches="tight", pad_inches=0.4, dpi=600)
    plt.close()
Example #14
def run_ba_cov_sims(gene_list, pop_list, out_name, iter1=1000, iter2=1000):
    df_out = open(pt.get_path() + '/data/simulations/' + out_name + '.txt',
                  'w')
    df_out.write('\t'.join(['N', 'G', 'Cov', 'Iteration', 'dist_percent']) +
                 '\n')
    covs = [0.1, 0.15, 0.2]
    for G in gene_list:
        for N in pop_list:
            for cov in covs:
                for i in range(iter1):
                    C = pt.get_ba_cov_matrix(G, cov)
                    while True:
                        lambda_genes = np.random.gamma(shape=1,
                                                       scale=1,
                                                       size=G)
                        test_cov = np.stack([
                            pt.get_count_pop(lambda_genes, cov=C)
                            for x in range(N)
                        ],
                                            axis=0)
                        # resample until no population has an all-zero row
                        # (the Hellinger transform divides by row sums)
                        if not np.any(test_cov.sum(axis=1) == 0):
                            break
                    X = pt.hellinger_transform(test_cov)
                    pca = PCA()
                    pca_fit = pca.fit_transform(X)
                    euc_dist = pt.get_mean_pairwise_euc_distance(pca_fit)
                    euc_dists = []
                    for j in range(iter2):
                        X_j = pt.hellinger_transform(
                            pt.get_random_matrix(test_cov))
                        pca_fit_j = pca.fit_transform(X_j)
                        euc_dists.append(
                            pt.get_mean_pairwise_euc_distance(pca_fit_j))
                    euc_percent = len([k for k in euc_dists if k < euc_dist
                                       ]) / len(euc_dists)
                    print(N, G, cov, i, euc_percent)
                    df_out.write('\t'.join(
                        [str(N),
                         str(G),
                         str(cov),
                         str(i),
                         str(euc_percent)]) + '\n')
    df_out.close()
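Every example funnels the count matrix through pt.hellinger_transform before PCA. The Hellinger transform (Legendre & Gallagher 2001) is the square root of row-wise relative abundances, which is presumably what the pt helper computes; a stand-alone version:

import numpy as np

def hellinger_transform(counts):
    # square root of row-wise relative abundances; rows must have nonzero
    # sums, which is why the loop above resamples all-zero populations
    counts = np.asarray(counts, dtype=float)
    return np.sqrt(counts / counts.sum(axis=1, keepdims=True))

print(hellinger_transform([[1, 3], [2, 2]]))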
Example #15
def simulate(N, Ks, G, alphas, mu, iter=100):
    # Ks and alphas are lists; mu is a single value
    df_out = open(pt.get_path() + '/data/simulations/test.txt', 'w')
    header = ['N', 'K', 'Gene', 'Alpha', 'Mu', 'Muts', 'Iter']
    df_out.write('\t'.join(header) + '\n')
    for K in Ks:
        for alpha in alphas:
            for i in range(iter):
                print('K = ' + str(K), 'alpha = ' + str(alpha),
                      'iter = ' + str(i))
                sim_gene_dict = simulate_NK(N, K, G, alpha,
                                            mu).get_directed_graph()
                for key, value in sim_gene_dict.items():
                    sim_out = [N, K, key, alpha, mu, value, i]
                    sim_out = [str(x) for x in sim_out]
                    df_out.write('\t'.join(sim_out) + '\n')

    df_out.close()
Example #16
def get_correlated_rndm_ntwrk_original(nodes=10, m=2, rho=0.5):
    # rewire toward assortative (rho > 0) or disassortative (rho < 0) mixing
    assortative = rho > 0
    assort_ = []
    graph = nx.barabasi_albert_graph(nodes, m)
    graph_np = nx.to_numpy_matrix(graph)
    count = 0
    current_rho = 0
    rejected_counts = 0
    rejected_counts = 0
    while abs(current_rho) < abs(rho):

        def get_two_edges(graph_array):
            d = nx.to_dict_of_dicts(nx.from_numpy_matrix(graph_array),
                                    edge_data=1)
            l0_n0 = random.sample(list(d), 1)[0]
            l0_list = list(d[l0_n0])
            l0_n1 = random.sample(l0_list, 1)[0]

            def get_second_edge(d, l0_n0, l0_n1):
                l1_list = [i for i in list(d) if i not in [l0_n0, l0_n1]]
                l1 = []
                while len(l1) != 2:
                    l1_n0 = random.sample(list(l1_list), 1)[0]
                    l1_n1_list = d[l1_n0]
                    l1_n1_list = [
                        i for i in l1_n1_list if i not in [l0_n0, l0_n1]
                    ]
                    if len(l1_n1_list) > 0:
                        l1_n1 = random.sample(list(l1_n1_list), 1)[0]
                        l1.extend([l1_n0, l1_n1])
                return l1

            # get two links, make sure all four nodes are unique
            link1 = get_second_edge(d, l0_n0, l0_n1)
            row_sums = np.asarray(np.sum(graph_array, axis=0))[0]
            node_edge_counts = [(l0_n0, row_sums[l0_n0]),
                                (l0_n1, row_sums[l0_n1]),
                                (link1[0], row_sums[link1[0]]),
                                (link1[1], row_sums[link1[1]])]
            return node_edge_counts

        edges = get_two_edges(graph_np)
        # check whether the proposed new edges already exist
        if graph_np[edges[0][0],edges[3][0]] == 1 or \
            graph_np[edges[3][0],edges[0][0]] == 1 or \
            graph_np[edges[2][0],edges[1][0]] == 1 or \
            graph_np[edges[1][0],edges[2][0]] == 1:
            continue

        disc = (edges[0][1] - edges[2][1]) * \
                (edges[3][1] - edges[1][1])
        if (assortative and disc > 0) or (not assortative and disc < 0):
            graph_np[edges[0][0], edges[1][0]] = 0
            graph_np[edges[1][0], edges[0][0]] = 0
            graph_np[edges[2][0], edges[3][0]] = 0
            graph_np[edges[3][0], edges[2][0]] = 0

            graph_np[edges[0][0], edges[3][0]] = 1
            graph_np[edges[3][0], edges[0][0]] = 1
            graph_np[edges[2][0], edges[1][0]] = 1
            graph_np[edges[1][0], edges[2][0]] = 1

            assort_.append(
                nx.degree_assortativity_coefficient(
                    nx.from_numpy_matrix(graph_np)))
            count += 1
            current_rho = nx.degree_assortativity_coefficient(
                nx.from_numpy_matrix(graph_np))

            print(current_rho, rejected_counts)

        else:
            rejected_counts += 1

    if assortative:
        txt_name = 'assoc_network_eq'
    else:
        txt_name = 'disassoc_network_eq'
    np.savetxt(pt.get_path() + '/data/' + txt_name + '.txt',
               graph_np.astype(int),
               delimiter="\t")
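networkx ships a degree-preserving swap, so the rewiring above can also be sketched as accept/reject moves around nx.double_edge_swap; a compact alternative built on the same idea (not the author's implementation):

import networkx as nx

def tune_assortativity(nodes=50, m=2, rho_target=-0.5, max_steps=20000):
    # degree-preserving rewiring toward a target degree assortativity
    G = nx.barabasi_albert_graph(nodes, m, seed=3)
    rho = nx.degree_assortativity_coefficient(G)
    for _ in range(max_steps):
        H = G.copy()
        try:
            nx.double_edge_swap(H, nswap=1, max_tries=100)
        except nx.NetworkXAlgorithmError:
            continue
        rho_new = nx.degree_assortativity_coefficient(H)
        # keep the swap only if it moves the coefficient toward the target
        if abs(rho_new - rho_target) < abs(rho - rho_target):
            G, rho = H, rho_new
        if abs(rho) >= abs(rho_target):
            break
    return G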
Example #17
def hist_tenaillon_multi(k = 3):
    df_path = pt.get_path() + '/data/Tenaillon_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep = '\t', header = 'infer', index_col = 0)
    df_delta = pt.likelihood_matrix(df, 'Tenaillon_et_al').get_likelihood_matrix()
    X = pt.hellinger_transform(df_delta)
    pca = PCA()
    df_out = pca.fit_transform(X)

    df_null_path = pt.get_path() + '/data/Tenaillon_et_al/permute_PCA.txt'
    df_null = pd.read_csv(df_null_path, sep = '\t', header = 'infer', index_col = 0)

    mean_angle = pt.get_mean_angle(df_out, k = k)
    mcd = pt.get_mean_centroid_distance(df_out, k=k)
    #mean_length = pt.get_euclidean_distance(df_out, k=k)
    mean_dist = pt.get_mean_pairwise_euc_distance(df_out, k=k)
    x_stat = pt.get_x_stat(pca.explained_variance_[:-1])

    fig = plt.figure()

    ax1 = plt.subplot2grid((2, 2), (0, 0), colspan=1)
    ax1.axhline(y=0, color='k', linestyle=':', alpha = 0.8, zorder=1)
    ax1.axvline(x=0, color='k', linestyle=':', alpha = 0.8, zorder=2)
    ax1.scatter(0, 0, marker = "o", edgecolors='none', c = 'darkgray', s = 120, zorder=3)
    ax1.scatter(df_out[:,0], df_out[:,1], marker = "o", edgecolors='#244162', c = '#175ac6', alpha = 0.4, s = 60, zorder=4)

    ax1.set_xlim([-0.75,0.75])
    ax1.set_ylim([-0.75,0.75])
    ax1.set_xlabel('PCA 1 (' + str(round(pca.explained_variance_ratio_[0] * 100, 1)) + '%)' , fontsize = 14)
    ax1.set_ylabel('PCA 2 (' + str(round(pca.explained_variance_ratio_[1] * 100, 1)) + '%)' , fontsize = 14)


    ax2 = plt.subplot2grid((2, 2), (0, 1), colspan=1)
    mcd_list = df_null.MCD.tolist()
    ax2.hist(mcd_list,bins=30, weights=np.zeros_like(mcd_list) + 1. / len(mcd_list), alpha=0.8, color = '#175ac6')
    ax2.axvline(mcd, color = 'red', lw = 3)
    ax2.set_xlabel("Mean centroid distance, " + r'$ \left \langle \delta_{c}  \right \rangle$', fontsize = 14)
    ax2.set_ylabel("Frequency", fontsize = 16)

    mcd_list.append(mcd)
    relative_position_mcd = sorted(mcd_list).index(mcd) / (len(mcd_list) -1)
    if relative_position_mcd > 0.5:
        p_score_mcd = 1 - relative_position_mcd
    else:
        p_score_mcd = relative_position_mcd
    print('mean centroid distance p-score = ' + str(round(p_score_mcd, 3)))
    ax2.text(0.366, 0.088, r'$p < 0.05$', fontsize = 10)

    ax3 = plt.subplot2grid((2, 2), (1, 0), colspan=1)
    delta_L_list = df_null.mean_dist.tolist()
    ax3.hist(delta_L_list,bins=30, weights=np.zeros_like(delta_L_list) + 1. / len(delta_L_list), alpha=0.8, color = '#175ac6')
    ax3.axvline(mean_dist, color = 'red', lw = 3)
    ax3.set_xlabel("Mean pair-wise \n Euclidean distance, " + r'$   \left \langle   d \right  \rangle$', fontsize = 14)
    ax3.set_ylabel("Frequency", fontsize = 16)

    delta_L_list.append(mean_dist)
    relative_position_delta_L = sorted(delta_L_list).index(mean_dist) / (len(delta_L_list) -1)
    if relative_position_delta_L > 0.5:
        p_score_delta_L = 1 - relative_position_delta_L
    else:
        p_score_delta_L = relative_position_delta_L
    print('mean difference in distances p-score = ' + str(round(p_score_delta_L, 3)))
    ax3.text(0.50, 0.09, r'$p < 0.05$', fontsize = 10)



    ax4 = plt.subplot2grid((2, 2), (1, 1), colspan=1)
    ax4_values = df_null.x_stat.values
    ax4_values = ax4_values[np.logical_not(np.isnan(ax4_values))]
    ax4.hist(ax4_values, bins=30, weights=np.zeros_like(ax4_values) + 1. / len(ax4_values), alpha=0.8, color = '#175ac6')
    print(np.mean(ax4_values))
    print(stats.mode(ax4_values))

    ax4.axvline(x_stat, color = 'red', lw = 3)
    ax4.set_xlabel(r'$F_{1}$', fontsize = 14)
    ax4.set_ylabel("Frequency", fontsize = 16)

    x_stat_list = ax4_values.tolist()
    x_stat_list.append(x_stat)
    relative_position_x_stat = sorted(x_stat_list).index(x_stat) / (len(x_stat_list) - 1)
    print(x_stat)
    print(len([x for x in x_stat_list if x > x_stat]) / len(x_stat_list))
    if relative_position_x_stat > 0.5:
        p_score_x_stat = 1 - relative_position_x_stat
    else:
        p_score_x_stat = relative_position_x_stat
    print('F_{1} statistic p-score = ' + str(round(p_score_x_stat, 3)))
    ax4.text(19.1, 0.09, r'$p \nless  0.05$', fontsize = 10)

    plt.tight_layout()
    fig_name = pt.get_path() + '/figs/fig1.png'
    fig.savefig(fig_name, bbox_inches = "tight", pad_inches = 0.4, dpi = 600)
    plt.close()
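The repeated rank-position arithmetic above is a folded (two-sided) empirical p-score; distilled into one helper for clarity (an illustration, not a pt function):

def two_sided_empirical_p(observed, null_values):
    # rank the observed statistic within its null and fold the tail
    pooled = sorted(list(null_values) + [observed])
    position = pooled.index(observed) / (len(pooled) - 1)
    return min(position, 1 - position)

print(two_sided_empirical_p(9.0, [1.0, 2.0, 3.0, 4.0]))  # 0.0, most extreme rank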
Example #18
import pickle
import operator
import sys
import random
import copy
from itertools import compress
import numpy as np
import pandas as pd
import parevol_tools as pt
import clean_data as cd

import matplotlib.pyplot as plt

from scipy import stats

df_non_path = pt.get_path() + "/data/Tenaillon_et_al/gene_by_pop_nonsyn.txt"
df_non = pd.read_csv(df_non_path, sep="\t", header="infer", index_col=0)
genes_non = df_non.columns.to_list()
df_non_np = df_non.values
df_non_np = np.transpose(df_non_np)

mean_all = []
var_all = []

for gene in df_non_np:

    # skip genes with mutations in fewer than five populations
    if sum(gene > 0) < 5:
        continue

    mean_all.append(np.mean(gene))
    var_all.append(np.var(gene))
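A natural follow-up to the per-gene means and variances collected above is a log-log regression of variance on mean (a Taylor's-law-style check; that this is the intended analysis is an assumption):

slope, intercept, r_value, p_value, std_err = stats.linregress(
    np.log10(mean_all), np.log10(var_all))
print(slope, r_value ** 2)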
Example #19
def plot_permutation(dataset = 'good', analysis = 'PCA', alpha = 0.05):
    df_path = pt.get_path() + '/data/Good_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep = '\t', header = 'infer', index_col = 0)
    to_exclude = pt.complete_nonmutator_lines()
    to_exclude.append('p5')
    df_nonmut = df[df.index.str.contains('|'.join( to_exclude))]
    # remove columns with all zeros
    df_nonmut = df_nonmut.loc[:, (df_nonmut != 0).any(axis=0)]
    df_delta = pt.likelihood_matrix(df_nonmut, 'Good_et_al').get_likelihood_matrix()
    if analysis == 'PCA':
        X = pt.hellinger_transform(df_delta)
        pca = PCA()
        df_out = pca.fit_transform(X)
    elif analysis == 'cMDS':
        df_delta_bc = np.sqrt(pt.get_scipy_bray_curtis(df_delta.values))
        df_out = pt.cmdscale(df_delta_bc)[0]

    time_points = [ int(x.split('_')[1]) for x in df_nonmut.index.values]
    time_points_set = sorted(list(set([ int(x.split('_')[1]) for x in df_nonmut.index.values])))

    df_rndm_delta_out = pd.DataFrame(data=df_out, index=df_delta.index)
    mcds = []
    for tp in time_points_set:
        df_rndm_delta_out_tp = df_rndm_delta_out[df_rndm_delta_out.index.str.contains('_' + str(tp))]
        mcds.append(pt.get_mean_pairwise_euc_distance(df_rndm_delta_out_tp.values, k=3))

    mcd_perm_path = pt.get_path() + '/data/Good_et_al/permute_' + analysis + '.txt'
    mcd_perm = pd.read_csv(mcd_perm_path, sep = '\t', header = 'infer', index_col = 0)
    mcd_perm_x = np.sort(list(set(mcd_perm.Generation.tolist())))
    lower_ci = []
    upper_ci = []
    mean_mcds = []
    std_mcds = []
    lower_z_ci = []
    upper_z_ci = []
    for x in mcd_perm_x:
        mcd_perm_y = mcd_perm.loc[mcd_perm['Generation'] == x]
        mcd_perm_y_sort = np.sort(mcd_perm_y.mean_dist.tolist())
        mean_mcd_perm_y = np.mean(mcd_perm_y_sort)
        std_mcd_perm_y = np.std(mcd_perm_y_sort)
        mean_mcds.append(mean_mcd_perm_y)
        std_mcds.append(std_mcd_perm_y)
        lower_ci.append(mean_mcd_perm_y - mcd_perm_y_sort[int(len(mcd_perm_y_sort) * alpha)])
        upper_ci.append(abs(mean_mcd_perm_y - mcd_perm_y_sort[int(len(mcd_perm_y_sort) * (1 - alpha))]))
        # z-scores
        mcd_perm_y_sort_z = [ ((i - mean_mcd_perm_y) /  std_mcd_perm_y) for i in mcd_perm_y_sort]
        lower_z_ci.append(abs(mcd_perm_y_sort_z[int(len(mcd_perm_y_sort_z) * alpha)]))
        upper_z_ci.append(abs(mcd_perm_y_sort_z[int(len(mcd_perm_y_sort_z) * (1 - alpha))]))

    fig = plt.figure()

    plt.figure(1)
    plt.subplot(211)
    plt.errorbar(mcd_perm_x, mean_mcds, yerr = [lower_ci, upper_ci], fmt = 'o', alpha = 0.5, \
        barsabove = True, marker = '.', mfc = 'k', mec = 'k', c = 'k', zorder=1)
    plt.scatter(time_points_set, mcds, c='#175ac6', marker = 'o', s = 70, \
        edgecolors='#244162', linewidth = 0.6, alpha = 0.5, zorder=2)#, edgecolors='none')

    plt.ylabel("Mean pair-wise \n Euclidean \n distance, " + r'$   \left \langle   d \right  \rangle$', fontsize = 14)


    plt.figure(1)
    plt.subplot(212)
    plt.errorbar(mcd_perm_x, [0] * len(mcd_perm_x), yerr = [lower_z_ci, upper_z_ci], fmt = 'o', alpha = 0.5, \
        barsabove = True, marker = '.', mfc = 'k', mec = 'k', c = 'k', zorder=1)
    # zip mean, std, and measured values to make z-scores
    zip_list = list(zip(mean_mcds, std_mcds, mcds))
    z_scores = [((i[2] - i[0]) / i[1]) for i in zip_list ]
    plt.scatter(time_points_set, z_scores, c='#175ac6', marker = 'o', s = 70, \
        edgecolors='#244162', linewidth = 0.6, alpha = 0.5, zorder=2)#, edgecolors='none')
    plt.ylim(-2.2, 2.2)
    plt.xlabel("Time (generations)", fontsize = 16)

    plt.ylabel("Standardized mean \n pair-wise Euclidean \n distance, " + r'$   z_{\left \langle   d \right  \rangle}$', fontsize = 14)

    fig.tight_layout()
    fig.savefig(pt.get_path() + '/figs/permutation_scatter_good.png', bbox_inches = "tight", pad_inches = 0.4, dpi = 600)
    plt.close()
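The sorted-index arithmetic used for the confidence bands is equivalent to taking percentiles of the permutation distribution; for reference, with a toy null:

import numpy as np

rng = np.random.default_rng(0)
null = rng.normal(size=1000)
lower, upper = np.percentile(null, [5, 95])  # alpha = 0.05 on each tail
print(lower, upper)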
Example #20

def probability_absence(gene, N, mut_counts_dict, zeros=True):
    # probability that a focal gene is hit by none of N mutations, given each
    # gene's mean share of mutations: (summed share of all other genes) ** N
    if zeros:
        mean_relative_muts_denom = sum( [mut_counts_dict[g]["mean_relative_muts"] for g in mut_counts_dict.keys()] )
        mean_relative_muts_num = sum([mut_counts_dict[g]["mean_relative_muts"] for g in mut_counts_dict.keys() if g != gene])

    else:
        mean_relative_muts_denom = sum([mut_counts_dict[g]["mean_relative_muts_no_zeros"] for g in mut_counts_dict.keys()])
        mean_relative_muts_num = sum([mut_counts_dict[g]["mean_relative_muts_no_zeros"] for g in mut_counts_dict.keys() if g != gene])

    return (mean_relative_muts_num / mean_relative_muts_denom) ** N


df_non_path = pt.get_path() + "/data/Tenaillon_et_al/gene_by_pop_nonsyn.txt"
df_non = pd.read_csv(df_non_path, sep="\t", header="infer", index_col=0)
genes_non = df_non.columns.to_list()
df_non_np = df_non.values
df_non_np = np.transpose(df_non_np)

locus_tags_non = map_tenaillon_genes_to_locus_tags(genes_non)

mut_counts_non_dict = get_mut_counts_dict(df_non_np, locus_tags_non)

# df_syn_path = pt.get_path() + '/data/Tenaillon_et_al/gene_by_pop_syn.txt'
# df_syn = pd.read_csv(df_syn_path, sep = '\t', header = 'infer', index_col = 0)
# genes_syn = df_syn.columns.to_list()
# df_syn_np = df_syn.values
# df_syn_np = np.transpose(df_syn_np)
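A toy run of probability_absence with a hypothetical three-gene mut_counts_dict (the real dictionary comes from get_mut_counts_dict above):

mut_counts_dict_toy = {
    'geneA': {'mean_relative_muts': 0.5},
    'geneB': {'mean_relative_muts': 0.3},
    'geneC': {'mean_relative_muts': 0.2},
}
# P(geneA escapes all 10 mutations) = ((0.3 + 0.2) / 1.0) ** 10 = 0.5 ** 10
print(probability_absence('geneA', 10, mut_counts_dict_toy))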
Example #21
def run_pca_permutation(iter=10000, analysis='PCA', dataset='tenaillon'):
    if dataset == 'tenaillon':
        k = 3
        df_path = pt.get_path() + '/data/Tenaillon_et_al/gene_by_pop.txt'
        df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
        df_array = df.values
        df_out = open(
            pt.get_path() + '/data/Tenaillon_et_al/permute_' + analysis +
            '.txt', 'w')
        column_headers = [
            'Iteration', 'MCD', 'mean_angle', 'mean_dist', 'delta_L', 'x_stat'
        ]
        df_out.write('\t'.join(column_headers) + '\n')
        for i in range(iter):
            print(i)
            df_rndm = pd.DataFrame(data=pt.random_matrix(df_array),
                                   index=df.index,
                                   columns=df.columns)
            df_rndm_delta = pt.likelihood_matrix(
                df_rndm, 'Tenaillon_et_al').get_likelihood_matrix()
            if analysis == 'PCA':
                X = pt.hellinger_transform(df_rndm_delta)
                pca = PCA()
                df_rndm_delta_out = pca.fit_transform(X)
                #df_pca = pd.DataFrame(data=X_pca, index=df.index)
            mean_angle = pt.get_mean_angle(df_rndm_delta_out, k=k)
            mcd = pt.get_mean_centroid_distance(df_rndm_delta_out, k=k)
            mean_length = pt.get_euc_magnitude_diff(df_rndm_delta_out, k=k)
            mean_dist = pt.get_mean_pairwise_euc_distance(df_rndm_delta_out,
                                                          k=k)
            x_stat = pt.get_x_stat(pca.explained_variance_[:-1])
            df_out.write('\t'.join([
                str(i),
                str(mcd),
                str(mean_angle),
                str(mean_dist),
                str(mean_length),
                str(x_stat)
            ]) + '\n')
        df_out.close()

    elif dataset == 'good':
        k = 5
        df_path = pt.get_path() + '/data/Good_et_al/gene_by_pop.txt'
        df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
        to_exclude = pt.complete_nonmutator_lines()
        to_exclude.append('p5')
        df_nonmut = df[df.index.str.contains('|'.join(to_exclude))]
        # remove columns with all zeros
        df_nonmut = df_nonmut.loc[:, (df_nonmut != 0).any(axis=0)]
        time_points = [int(x.split('_')[1]) for x in df_nonmut.index.values]
        time_points_set = sorted(
            list(set([int(x.split('_')[1]) for x in df_nonmut.index.values])))
        df_nonmut_array = df_nonmut.values
        time_points_positions = {}
        for x in time_points_set:
            time_points_positions[x] = [
                i for i, j in enumerate(time_points) if j == x
            ]
        df_final = df_nonmut.iloc[time_points_positions[time_points_set[-1]]]

        df_out = open(
            pt.get_path() + '/data/Good_et_al/permute_' + analysis + '.txt',
            'w')
        column_headers = [
            'Iteration', 'Generation', 'MCD', 'mean_angle', 'delta_L',
            'mean_dist'
        ]
        df_out.write('\t'.join(column_headers) + '\n')
        for i in range(iter):
            print("Iteration " + str(i))
            matrix_0 = df_nonmut.iloc[time_points_positions[
                time_points_set[0]]]
            matrix_0_rndm = pt.random_matrix(matrix_0.values)
            df_rndm_list = [
                pd.DataFrame(data=matrix_0_rndm,
                             index=matrix_0.index,
                             columns=matrix_0.columns)
            ]
            # accumulate independently permuted increments between consecutive
            # time points, starting from the permuted first time point
            for j, tp in enumerate(time_points_set):
                if j == 0:
                    continue
                df_tp_minus1 = df_nonmut[df_nonmut.index.str.contains(
                    '_' + str(time_points_set[j - 1]))]
                df_tp = df_nonmut[df_nonmut.index.str.contains('_' + str(tp))]
                matrix_diff = df_tp.values - df_tp_minus1.values
                matrix_0_rndm = matrix_0_rndm + pt.random_matrix(matrix_diff)
                df_0_rndm = pd.DataFrame(data=matrix_0_rndm,
                                         index=df_tp.index,
                                         columns=df_tp.columns)
                df_rndm_list.append(df_0_rndm)

            df_rndm = pd.concat(df_rndm_list)
            df_rndm_delta = pt.likelihood_matrix(
                df_rndm, 'Good_et_al').get_likelihood_matrix()
            if analysis == 'PCA':
                X = pt.hellinger_transform(df_rndm_delta)
                pca = PCA()
                matrix_rndm_delta_out = pca.fit_transform(X)
            elif analysis == 'cMDS':
                matrix_rndm_delta_bc = np.sqrt(
                    pt.get_bray_curtis(df_rndm_delta.values))
                matrix_rndm_delta_out = pt.cmdscale(matrix_rndm_delta_bc)[0]
            else:
                print("Analysis argument not accepted")
                continue

            df_rndm_delta_out = pd.DataFrame(data=matrix_rndm_delta_out,
                                             index=df_rndm_delta.index)
            for tp in time_points_set:
                df_rndm_delta_out_tp = df_rndm_delta_out[
                    df_rndm_delta_out.index.str.contains('_' + str(tp))]
                df_rndm_delta_out_tp_matrix = df_rndm_delta_out_tp.values
                mean_angle = pt.get_mean_angle(df_rndm_delta_out_tp_matrix,
                                               k=k)
                mcd = pt.get_mean_centroid_distance(
                    df_rndm_delta_out_tp_matrix, k=k)
                mean_length = pt.get_euc_magnitude_diff(
                    df_rndm_delta_out_tp_matrix, k=k)
                mean_dist = pt.get_mean_pairwise_euc_distance(
                    df_rndm_delta_out_tp_matrix, k=k)
                df_out.write('\t'.join([
                    str(i),
                    str(tp),
                    str(mcd),
                    str(mean_angle),
                    str(mean_length),
                    str(mean_dist)
                ]) + '\n')

        df_out.close()
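The time-series null in the 'good' branch permutes the first time point and then adds independently permuted increments between consecutive time points; the scheme in isolation, with permute standing in for pt.random_matrix:

def permuted_trajectory(mats, permute):
    # mats: count matrices ordered by time point; permute: a matrix shuffler
    rndm = [permute(mats[0])]
    for t in range(1, len(mats)):
        rndm.append(rndm[-1] + permute(mats[t] - mats[t - 1]))
    return rndm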
Example #22
def calculate_subsampled_mae(df_np, df_genes, mut_counts_dict, subsamples=1, name="non"):

    population_idx = np.arange(0, df_np.shape[1], 1)
    n_subsamples = np.arange(10, df_np.shape[1], 5)

    mae_dict = {}

    for n_i in n_subsamples:

        sys.stdout.write("%d populations......\n" % n_i)

        mae_dict[n_i] = {}

        mean_absolute_error_all = []
        mean_absolute_error_all_geometric = []

        for subsample in range(subsamples):

            population_idx_subsample = np.random.choice(population_idx, size=n_i, replace=False)

            df_np_subsample = df_np[:, population_idx_subsample]

            observed_occupancies_subsample, predicted_occupancies_subsample = get_predicted_observed_occupancies(df_np_subsample, df_genes, mut_counts_dict)

            predicted_occupancies_subsample = predicted_occupancies_subsample[observed_occupancies_subsample > 0]
            observed_occupancies_subsample = observed_occupancies_subsample[observed_occupancies_subsample > 0]

            mean_absolute_error_i = np.mean(np.absolute(observed_occupancies_subsample - predicted_occupancies_subsample) )
            mean_absolute_error_all.append(mean_absolute_error_i)

            # geometric
            observed_occupancies_subsample_geometric, predicted_occupancies_subsample_geometric = get_predicted_observed_occupancies_geometric(df_np_subsample, df_genes, mut_counts_dict)
            predicted_occupancies_subsample_geometric = predicted_occupancies_subsample_geometric[observed_occupancies_subsample_geometric > 0]
            observed_occupancies_subsample_geometric = observed_occupancies_subsample_geometric[observed_occupancies_subsample_geometric > 0]

            mean_absolute_error_i_geometric = np.mean(np.absolute(observed_occupancies_subsample_geometric - predicted_occupancies_subsample_geometric) )
            mean_absolute_error_all_geometric.append(mean_absolute_error_i_geometric)


        mean_absolute_error_all = np.asarray(mean_absolute_error_all)
        mean_absolute_error_all_geometric = np.asarray(mean_absolute_error_all_geometric)

        mae_dict[n_i]["mae_mean"] = np.mean(mean_absolute_error_all)
        mae_dict[n_i]["mae_025"] = np.percentile(mean_absolute_error_all, 2.5)
        mae_dict[n_i]["mae_975"] = np.percentile(mean_absolute_error_all, 97.5)

        mae_dict[n_i]["mae_mean_geometric"] = np.mean(mean_absolute_error_all_geometric)
        mae_dict[n_i]["mae_025_geometric"] = np.percentile(mean_absolute_error_all_geometric, 2.5)
        mae_dict[n_i]["mae_975_geometric"] = np.percentile(mean_absolute_error_all_geometric, 97.5)

    sys.stdout.write("Dumping pickle......\n")
    file_name = "%s/data/Tenaillon_et_al/subsample_poisson_occupancy_%s.pickle" % (pt.get_path(), name)
    with open(file_name, "wb") as handle:
        pickle.dump(mae_dict, handle)
    sys.stdout.write("Done!\n")
Example #23
def run_ba_ntwk_cov_sims():
    df_out = open(pt.get_path() + '/data/simulations/cov_ba_ntwrk_ev.txt', 'w')
    n_pops = 100
    n_genes = 50
    ntwk = nx.barabasi_albert_graph(n_genes, 2)
    ntwk_np = nx.to_numpy_matrix(ntwk)
    lambda_genes = np.random.gamma(shape=3, scale=1, size=n_genes)
    df_out.write('\t'.join([
        'Cov', 'Iteration', 'euc_z_score', 'euc_percent', 'eig_percent',
        'mcd_percent_k1', 'mcd_percent_k3'
    ]) + '\n')
    covs = [0.05, 0.1, 0.15, 0.2]
    for cov in covs:
        C = ntwk_np * cov
        np.fill_diagonal(C, 1)
        for i in range(1000):
            test_cov = np.stack(
                [get_count_pop(lambda_genes, cov=C) for x in range(n_pops)],
                axis=0)
            X = pt.hellinger_transform(test_cov)
            pca = PCA()
            pca_fit = pca.fit_transform(X)
            euc_dist = pt.get_mean_pairwise_euc_distance(pca_fit)
            euc_dists = []
            eig = pt.get_x_stat(pca.explained_variance_[:-1])
            mcd_k1 = pt.get_mean_centroid_distance(pca_fit, k=1)
            mcd_k3 = pt.get_mean_centroid_distance(pca_fit, k=3)
            eigs = []
            centroid_dists_k1 = []
            centroid_dists_k3 = []
            for j in range(1000):
                X_j = pt.hellinger_transform(pt.random_matrix(test_cov))
                pca_fit_j = pca.fit_transform(X_j)
                euc_dists.append(pt.get_mean_pairwise_euc_distance(pca_fit_j))
                centroid_dists_k1.append(
                    pt.get_mean_centroid_distance(pca_fit_j, k=1))
                centroid_dists_k3.append(
                    pt.get_mean_centroid_distance(pca_fit_j, k=3))
                # pca was refit on X_j above, so these are null eigenvalues
                eigs.append(pt.get_x_stat(pca.explained_variance_[:-1]))
            z_score = (euc_dist - np.mean(euc_dists)) / np.std(euc_dists)
            euc_percent = len([k for k in euc_dists if k < euc_dist
                               ]) / len(euc_dists)
            eig_percent = len([k for k in eigs if k < eig]) / len(eigs)
            centroid_percent_k1 = len([
                k for k in centroid_dists_k1 if k < mcd_k1
            ]) / len(centroid_dists_k1)
            centroid_percent_k3 = len([
                k for k in centroid_dists_k3 if k < mcd_k3
            ]) / len(centroid_dists_k3)
            print(cov, i, z_score, euc_percent, eig_percent)
            df_out.write('\t'.join([
                str(cov),
                str(i),
                str(z_score),
                str(euc_percent),
                str(eig_percent),
                str(centroid_percent_k1),
                str(centroid_percent_k3)
            ]) + '\n')


    df_out.close()