Example #1
def wannier_hist(iter=10000):
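    # Permutation test: compare the observed F_2 statistic (and its two
    # variance components V_1 and V_2) for the two Wannier et al. genotypes
    # against a null distribution built from `iter` randomized mutation
    # matrices. `pt` and `cd` are assumed to be the ParEvol repo's
    # parevol_tools and clean_data helper modules.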
    par_dir = os.path.expanduser("~/GitHub/ParEvol")  # avoid shadowing the built-in dir()
    df1 = pd.read_csv(par_dir + '/data/Wannier_et_al/C321.deltaA_mutation_table_clean.txt', sep='\t', index_col=0)
    df2 = pd.read_csv(par_dir + '/data/Wannier_et_al/C321_mutation_table_clean.txt', sep='\t', index_col=0)
    # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent
    df = pd.concat([df1, df2], sort=False)
    df = df.fillna(0)
    df_np = df.values
    gene_names = df.columns.values
    N1 = df1.shape[0]
    N2 = df2.shape[0]
    df_np_delta = cd.likelihood_matrix_array(df_np, gene_names, 'Wannier_et_al').get_likelihood_matrix()
    F2_all = pt.get_F_2(df_np_delta, N1, N2)
    print(F2_all)
    F2 = F2_all[0]
    V1 = F2_all[1]
    V2 = F2_all[2]

    F2_null = []
    V1_null = []
    V2_null = []
    for i in range(iter):
        if i % 1000 == 0:
            print(i)
        df_np_i = pt.get_random_matrix(df_np)
        np.seterr(divide='ignore')
        df_np_i_delta = cd.likelihood_matrix_array(df_np_i, gene_names, 'Wannier_et_al').get_likelihood_matrix()
        F2_all_iter = pt.get_F_2(df_np_i_delta, N1, N2)
        F2_null.append(F2_all_iter[0])
        V1_null.append(F2_all_iter[1])
        V2_null.append(F2_all_iter[2])

    fig = plt.figure()
    #plt.hist(F2_null, bins=30, weights=np.zeros_like(F2_null) + 1. / len(F2_null), alpha=0.8, color = '#175ac6')
    plt.hist(F2_null, bins=30, alpha=0.8, color = '#175ac6')
    plt.axvline(F2, color = 'red', lw = 3)
    plt.xlabel(r'$ F_{2}$', fontsize = 20)
    plt.ylabel("Frequency", fontsize = 12)
    fig.tight_layout()
    fig.savefig(os.path.expanduser("~/GitHub/ParEvol") + '/figs/test_hist_F.png', bbox_inches = "tight", pad_inches = 0.4, dpi = 600)
    plt.close()


    fig = plt.figure()
    plt.hist(V1_null, bins=30, alpha=0.8, color = '#175ac6')
    plt.axvline(V1, color = 'red', lw = 3)
    plt.xlabel(r'$ V_{1}$', fontsize = 20)
    plt.ylabel("Frequency", fontsize = 12)
    fig.tight_layout()
    fig.savefig(os.path.expanduser("~/GitHub/ParEvol") + '/figs/test_hist_F_V1.png', bbox_inches = "tight", pad_inches = 0.4, dpi = 600)
    plt.close()

    fig = plt.figure()
    plt.hist(V2_null, bins=30, alpha=0.8, color = '#175ac6')
    #print(V2_null)
    plt.axvline(V2, color = 'red', lw = 3)
    plt.xlabel(r'$ V_{2}$', fontsize = 20)
    plt.ylabel("Frequency", fontsize = 12)
    fig.tight_layout()
    fig.savefig(os.path.expanduser("~/GitHub/ParEvol") + '/figs/test_hist_F_V2.png', bbox_inches = "tight", pad_inches = 0.4, dpi = 600)
    plt.close()
Example #2
def rndm_sample_tenaillon(iter1=1000, iter2=10000):
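    # For each random subsample of N populations, compare the observed mean
    # pairwise Euclidean distance in PCA space against `iter2` permuted
    # matrices, recording the empirical percentile and z-score. Results are
    # written to dist_sample_size.txt.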
    df_path = os.path.expanduser(
        "~/GitHub/ParEvol") + '/data/Tenaillon_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    df_np = df.values
    gene_names = df.columns.values
    n_rows = list(range(df_np.shape[0]))
    df_out = open(
        os.path.expanduser("~/GitHub/ParEvol") +
        '/data/Tenaillon_et_al/dist_sample_size.txt', 'w')
    df_out.write(
        '\t'.join(['N', 'G', 'Iteration', 'dist_percent', 'z_score']) + '\n')
    #Ns = list(range(4, 40 +2, 2))
    Ns = [40]
    pca = PCA()
    for N in Ns:
        for i in range(iter1):
            df_np_i = df_np[
                np.random.choice(n_rows, size=N, replace=False, p=None), :]
            gene_bool = np.all(df_np_i == 0, axis=0)
            # keep only genes (columns) with at least one mutation in the subsample
            gene_names_i = list(
                compress(gene_names, list(map(operator.not_, gene_bool))))
            df_np_i = df_np_i[:, ~gene_bool]
            np.seterr(divide='ignore')
            df_np_i_delta = cd.likelihood_matrix_array(
                df_np_i, gene_names_i,
                'Tenaillon_et_al').get_likelihood_matrix()
            df_np_i_delta = df_np_i_delta / df_np_i_delta.sum(axis=1)[:, None]
            X = pt.get_mean_center(df_np_i_delta)
            pca_fit = pca.fit_transform(X)
            euc_dist = pt.get_mean_pairwise_euc_distance(pca_fit)
            euc_dists = []
            for j in range(iter2):
                df_np_i_j = pt.get_random_matrix(df_np_i)
                np.seterr(divide='ignore')
                df_np_i_j_delta = cd.likelihood_matrix_array(
                    df_np_i_j, gene_names_i,
                    'Tenaillon_et_al').get_likelihood_matrix()
                df_np_i_j_delta = df_np_i_j_delta / df_np_i_j_delta.sum(
                    axis=1)[:, None]
                X_j = pt.get_mean_center(df_np_i_j_delta)
                pca_fit_j = pca.fit_transform(X_j)
                euc_dists.append(pt.get_mean_pairwise_euc_distance(pca_fit_j))

            G = df_np_i.shape[1]
            euc_percent = len([k for k in euc_dists if k > euc_dist
                               ]) / len(euc_dists)
            z_score = (euc_dist - np.mean(euc_dists)) / np.std(euc_dists)
            print(str(N), str(i), str(G), str(euc_percent), str(z_score))
            df_out.write('\t'.join([
                str(N), str(G),
                str(i), str(euc_percent),
                str(z_score)
            ]) + '\n')

    df_out.close()
Example #3
def rndm_sample_tenaillon(k_eval=3, iter1=20, iter2=1000, sample_bs = 10, iter_bs=10000):
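    # Power analysis: for each sample size N, draw `iter1` random subsamples,
    # run a permutation test (`iter2` permutations) on the mean pairwise
    # distance over the first k_eval PCA axes, and bootstrap the fraction of
    # p-values below 0.05 to get a 95% confidence interval on the power.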
    df_path = mydir + '/data/Tenaillon_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep = '\t', header = 'infer', index_col = 0)
    df_np = df.values
    gene_names = df.columns.values
    n_rows = list(range(df_np.shape[0]))
    df_out = open(mydir + '/data/Tenaillon_et_al/power_sample_size.txt', 'w')
    df_out.write('\t'.join(['N', 'G', 'Power', 'Power_025', 'Power_975']) + '\n')

    Ns = [20, 30]
    #Ns = list(range(20, n_rows, 4))
    for N in Ns:
        p_values = []
        #z_scores = []
        G_list = []
        for i in range(iter1):
            df_np_i = df_np[np.random.choice(n_rows, size=N, replace=False, p=None), :]
            gene_bool = np.all(df_np_i == 0, axis=0)
            # keep only genes (columns) with at least one mutation in the subsample
            gene_names_i = list(compress(gene_names, list(map(operator.not_, gene_bool))))
            G_list.append(len(gene_names_i))
            df_np_i = df_np_i[:, ~gene_bool]
            np.seterr(divide='ignore')
            df_np_i_delta = cd.likelihood_matrix_array(df_np_i, gene_names_i, 'Tenaillon_et_al').get_likelihood_matrix()
            X = df_np_i_delta / df_np_i_delta.sum(axis=1)[:, None]
            X -= np.mean(X, axis=0)
            pca = PCA()
            pca_X = pca.fit_transform(X)
            mpd = pt.get_mean_pairwise_euc_distance(pca_X, k=k_eval)
            mpd_null = []
            for j in range(iter2):
                df_np_i_j = pt.get_random_matrix(df_np_i)
                np.seterr(divide='ignore')
                df_np_i_j_delta = cd.likelihood_matrix_array(df_np_i_j, gene_names_i, 'Tenaillon_et_al').get_likelihood_matrix()
                X_j = df_np_i_j_delta / df_np_i_j_delta.sum(axis=1)[:, None]
                X_j -= np.mean(X_j, axis=0)
                pca_X_j = pca.fit_transform(X_j)
                mpd_null.append(pt.get_mean_pairwise_euc_distance(pca_X_j, k=k_eval))
            p_values.append(len([m for m in mpd_null if m > mpd]) / len(mpd_null))
            #z_scores.append((mpd - np.mean(mpd_null)) / np.std(mpd_null))

        power = len([n for n in p_values if n < 0.05]) / len(p_values)
        print(p_values)
        power_bootstrap = []
        for p in range(iter_bs):
            p_values_sample = random.sample(p_values, sample_bs)
            power_sample = len([n for n in p_values_sample if n < 0.05]) / len(p_values_sample)
            power_bootstrap.append(power_sample)
        power_bootstrap.sort()
        # return number of genes, power, power lower, power upper
        #return  power, power_bootstrap[int(10000*0.025)], power_bootstrap[int(10000*0.975)]
        df_out.write('\t'.join([str(N), str(np.mean(G_list)), str(power), str(power_bootstrap[int(iter_bs*0.025)]), str(power_bootstrap[int(iter_bs*0.975)])]) + '\n')

    df_out.close()
Example #4
def rndm_sample_tenaillon(N,
                          df_np,
                          gene_names,
                          n_rows,
                          k=3,
                          iter1=100,
                          iter2=1000,
                          sample_bs=10,
                          iter_bs=10000):
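    # Single-N variant of the power analysis above: takes the data matrix and
    # sample size directly and returns (N, mean G, power, CI bounds) rather
    # than writing a file, presumably so each N can be run as a separate
    # worker.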
    p_values = []
    #z_scores = []
    G_list = []
    for i in range(iter1):
        df_np_i = df_np[
            np.random.choice(n_rows, size=N, replace=False, p=None), :]
        gene_bool = np.all(df_np_i == 0, axis=0)
        # keep only genes (columns) with at least one mutation in the subsample
        gene_names_i = list(
            compress(gene_names, list(map(operator.not_, gene_bool))))
        G_list.append(len(gene_names_i))
        df_np_i = df_np_i[:, ~gene_bool]
        np.seterr(divide='ignore')
        df_np_i_delta = cd.likelihood_matrix_array(
            df_np_i, gene_names_i, 'Tenaillon_et_al').get_likelihood_matrix()
        X = df_np_i_delta / df_np_i_delta.sum(axis=1)[:, None]
        e_vals, a_mat = pt.pca_np(X)
        euc_dist = pt.get_mean_pairwise_euc_distance(a_mat, k=k)
        print(euc_dist)
        euc_dists = []
        for j in range(iter2):
            df_np_i_j = pt.get_random_matrix(df_np_i)
            np.seterr(divide='ignore')
            df_np_i_j_delta = cd.likelihood_matrix_array(
                df_np_i_j, gene_names_i,
                'Tenaillon_et_al').get_likelihood_matrix()
            X_j = df_np_i_j_delta / df_np_i_j_delta.sum(axis=1)[:, None]
            e_vals_j, a_mat_j = pt.pca_np(X_j)
            euc_dists.append(pt.get_mean_pairwise_euc_distance(a_mat_j, k=k))
        p_values.append(
            len([m for m in euc_dists if m > euc_dist]) / len(euc_dists))
        #z_scores.append((euc_dist - np.mean(euc_dists)) / np.std(euc_dists))

    power = len([n for n in p_values if n < 0.05]) / len(p_values)
    print(p_values)
    power_bootstrap = []
    for p in range(iter_bs):
        p_values_sample = random.sample(p_values, sample_bs)
        power_sample = len([n for n in p_values_sample if n < 0.05
                            ]) / len(p_values_sample)
        power_bootstrap.append(power_sample)
    power_bootstrap.sort()
    # return number of genes, power, power lower, power upper
    # use iter_bs rather than a hardcoded 10000 so the CI indices track the
    # bootstrap iteration count
    return N, np.mean(G_list), power, power_bootstrap[int(
        iter_bs * 0.025)], power_bootstrap[int(iter_bs * 0.975)]
Example #5
def gene_svd_tenaillon(iter=10000):
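    # Per-gene test: each gene's squared loading norm ("variance
    # contribution") on the first three SVD axes is z-scored against `iter`
    # permuted matrices; genes with empirical p < 0.05 are printed and all
    # results written to gene_z_scores.txt.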
    df_path = os.path.expanduser(
        "~/GitHub/ParEvol") + '/data/Tenaillon_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    gene_names = df.columns.tolist()
    df_np = df.values
    df_np_delta = cd.likelihood_matrix_array(
        df_np, gene_names, 'Tenaillon_et_al').get_likelihood_matrix()
    df_np_delta = df_np_delta / df_np_delta.sum(axis=1)[:, None]
    X = pt.get_mean_center(df_np_delta)
    # scipy.sparse.linalg.svds (assumed here) returns V in transposed form,
    # with no guaranteed ordering of the k singular values
    U, s, V_T = svds(X, k=3)
    # apply another transposition to calculate basis matrix
    F = (V_T.T @ np.diag(s)) / np.sqrt(X.shape[0] - 1)
    vars = np.linalg.norm(F, axis=1)**2
    vars_null_list = []
    for i in range(iter):
        if i % 1000 == 0:
            print("Iteration " + str(i))
        df_np_i = pt.get_random_matrix(df_np)
        np.seterr(divide='ignore')
        df_np_i_delta = cd.likelihood_matrix_array(
            df_np_i, gene_names, 'Tenaillon_et_al').get_likelihood_matrix()
        df_np_i_delta = df_np_i_delta / df_np_i_delta.sum(axis=1)[:, None]
        X_j = pt.get_mean_center(df_np_i_delta)
        U_i, s_i, V_i_T = svds(X_j, k=3)
        F_i = (V_i_T.T @ np.diag(s_i)) / np.sqrt(X_j.shape[0] - 1)
        vars_null_list.append(np.linalg.norm(F_i, axis=1)**2)

    vars_null = np.stack(vars_null_list)
    vars_null_mean = np.mean(vars_null, axis=0)
    vars_null_std = np.std(vars_null, axis=0)
    z_scores = (vars - vars_null_mean) / vars_null_std
    p_values = []
    # calculate p values
    for k, column in enumerate(vars_null.T):
        column_greater = [x for x in column if x > vars[k]]
        p_values.append(len(column_greater) / iter)

    label_z_scores = list(zip(gene_names, z_scores, p_values))
    label_sig_z_scores = [x for x in label_z_scores if x[2] < 0.05]
    print(label_sig_z_scores)

    df_out = open(
        os.path.expanduser("~/GitHub/ParEvol") +
        '/data/Tenaillon_et_al/gene_z_scores.txt', 'w')
    df_out.write('\t'.join(['Gene', 'z_score', 'p_value']) + '\n')
    for label_z_score in label_z_scores:
        df_out.write('\t'.join([
            str(label_z_score[0]),
            str(label_z_score[1]),
            str(label_z_score[2])
        ]) + '\n')
    df_out.close()
Example #6
def time_partition_ltee(k=5, iter=100):
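    # For each LTEE time point t, split each population's mutations into
    # those present by t and those gained between t and the final time point,
    # then z-score the mean pairwise distances of the two groups in the top
    # PCA axes against `iter` permuted matrices.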
    df_path = os.path.expanduser(
        "~/GitHub/ParEvol") + '/data/Good_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    to_include = pt.complete_nonmutator_lines()
    df_nonmut = df[df.index.str.contains('|'.join(to_include))]
    # remove columns with all zeros
    df_nonmut = df_nonmut.loc[:, (df_nonmut != 0).any(axis=0)]
    # make sure it's sorted
    df_nonmut.sort_index(inplace=True)

    time_points = [int(x.split('_')[1]) for x in df_nonmut.index.values]
    time_points_set = sorted(set(time_points))
    time_points_positions = {}
    for x in time_points_set:
        time_points_positions[x] = [
            i for i, j in enumerate(time_points) if j == x
        ]
    t_final_df = df_nonmut.iloc[time_points_positions[max(time_points_set)]]
    t_final_np = t_final_df.values
    gene_names = df_nonmut.columns.tolist()

    df_out = open(
        os.path.expanduser("~/GitHub/ParEvol") +
        '/data/Good_et_al/time_partition_z_scores.txt', 'w')
    df_out.write(
        '\t'.join(['Time', 'Time_less_z_score', 'Time_greater_z_score']) +
        '\n')
    for time_point in time_points_set:
        # very few mutations after generation 50000
        if time_point > 50000:
            continue
        print("Time point " + str(time_point))
        t_i_df = df_nonmut.iloc[time_points_positions[time_point]]
        t_i_np = t_i_df.values
        # remove rows with all zeros
        t_i_np_zeros = np.where(~t_i_np.any(axis=1))[0]
        n_zeros_t_i_np = len(t_i_np_zeros)
        if n_zeros_t_i_np > 0:
            t_i_np = np.delete(t_i_np, t_i_np_zeros, axis=0)

        t_i_to_final_np = t_final_np - t_i_np
        # remove rows with all zeros
        t_i_to_final_np_zeros = np.where(~t_i_to_final_np.any(axis=1))[0]
        n_zeros_t_i_to_final_np = len(t_i_to_final_np_zeros)
        if n_zeros_t_i_to_final_np > 0:
            t_i_to_final_np = np.delete(t_i_to_final_np,
                                        t_i_to_final_np_zeros,
                                        axis=0)

        t_concat = np.concatenate((t_i_np, t_i_to_final_np), axis=0)
        t_norm = cd.likelihood_matrix_array(
            t_concat, gene_names, 'Good_et_al').get_likelihood_matrix()
        t_norm_rel = t_norm / t_norm.sum(axis=1, keepdims=True)

        # first five axes
        e_vals, e_vecs = pt.pca_np(t_norm_rel)
        # assuming pt.pca_np follows numpy.linalg.eigh's convention (column
        # v[:, i] pairs with eigenvalue w[i], in ascending order), the top k
        # axes are the last k columns; [:, -1 - k:-1] would skip the top axis
        e_vecs_k5 = e_vecs[:, -k:]
        # split at the row count of the first block: rows with zero mutations
        # were removed above, so the t_i block has 5 - n_zeros_t_i_np rows
        e_vec_t_less = e_vecs_k5[:5 - n_zeros_t_i_np, :]
        e_vec_t_greater = e_vecs_k5[5 - n_zeros_t_i_np:, :]

        dist_t_less = pt.get_mean_pairwise_euc_distance(e_vec_t_less, k=k)
        dist_t_greater = pt.get_mean_pairwise_euc_distance(e_vec_t_greater,
                                                           k=k)

        dist_t_less_list = []
        dist_t_greater_list = []

        for i in range(iter):
            t_i_np_rndm = pt.get_random_matrix(t_i_np)
            t_i_to_final_np_rndm = pt.get_random_matrix(t_i_to_final_np)
            t_rndm_concat = np.concatenate((t_i_np_rndm, t_i_to_final_np_rndm),
                                           axis=0)

            t_rndm_norm = cd.likelihood_matrix_array(
                t_rndm_concat, gene_names,
                'Good_et_al').get_likelihood_matrix()
            t_rndm_norm_rel = t_rndm_norm / t_rndm_norm.sum(axis=1,
                                                            keepdims=True)
            # first five axes (same conventions as above)
            e_vals_rndm, e_vecs_rndm = pt.pca_np(t_rndm_norm_rel)
            e_vecs_rndm_k5 = e_vecs_rndm[:, -k:]
            # use the same split point as the observed data so rows removed
            # for having zero mutations are handled consistently
            e_vec_t_less_rndm = e_vecs_rndm_k5[:5 - n_zeros_t_i_np, :]
            e_vec_t_greater_rndm = e_vecs_rndm_k5[5 - n_zeros_t_i_np:, :]

            dist_t_less_rndm = pt.get_mean_pairwise_euc_distance(
                e_vec_t_less_rndm, k=k)
            dist_t_greater_rndm = pt.get_mean_pairwise_euc_distance(
                e_vec_t_greater_rndm, k=k)

            dist_t_less_list.append(dist_t_less_rndm)
            dist_t_greater_list.append(dist_t_greater_rndm)

        z_score_less = (dist_t_less -
                        np.mean(dist_t_less_list)) / np.std(dist_t_less_list)
        z_score_greater = (dist_t_greater - np.mean(dist_t_greater_list)
                           ) / np.std(dist_t_greater_list)

        df_out.write('\t'.join(
            [str(time_point),
             str(z_score_less),
             str(z_score_greater)]) + '\n')

    df_out.close()
Example #7
def time_partition_ltee(k=5, iter=1000):
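    # Variant of the time partition analysis above: reports the observed mean
    # pairwise distances for the two groups and their difference, with 95%
    # CIs taken from the permutation null, instead of z-scores.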
    df_path = mydir + '/data/Good_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    to_include = pt.complete_nonmutator_lines()
    df_nonmut = df[df.index.str.contains('|'.join(to_include))]
    # remove columns with all zeros
    df_nonmut = df_nonmut.loc[:, (df_nonmut != 0).any(axis=0)]
    # make sure it's sorted
    df_nonmut.sort_index(inplace=True)

    time_points = [int(x.split('_')[1]) for x in df_nonmut.index.values]
    time_points_set = sorted(set(time_points))
    time_points_positions = {}
    for x in time_points_set:
        time_points_positions[x] = [
            i for i, j in enumerate(time_points) if j == x
        ]
    t_final_df = df_nonmut.iloc[time_points_positions[max(time_points_set)]]
    t_final_np = t_final_df.values
    gene_names = df_nonmut.columns.tolist()

    df_out = open(mydir + '/data/Good_et_al/time_partition_z_scores.txt', 'w')
    df_out.write('\t'.join([
        'Time', 'less_mpd', 'greater_mpd', 'delta_mpd', 'less_mpd_025',
        'less_mpd_975', 'greater_mpd_025', 'greater_mpd_975', 'delta_mpd_025',
        'delta_mpd_975'
    ]) + '\n')
    for time_point in time_points_set:
        # very few mutations after generation 50000
        if time_point > 50000:
            continue
        print("Time point " + str(time_point))
        t_i_df = df_nonmut.iloc[time_points_positions[time_point]]
        t_i_np = t_i_df.values
        # remove rows with all zeros
        t_i_np_zeros = np.where(~t_i_np.any(axis=1))[0]
        n_zeros_t_i_np = len(t_i_np_zeros)
        if n_zeros_t_i_np > 0:
            t_i_np = np.delete(t_i_np, t_i_np_zeros, axis=0)

        t_i_to_final_np = t_final_np - t_i_np
        # remove rows with all zeros
        t_i_to_final_np_zeros = np.where(~t_i_to_final_np.any(axis=1))[0]
        n_zeros_t_i_to_final_np = len(t_i_to_final_np_zeros)
        if n_zeros_t_i_to_final_np > 0:
            t_i_to_final_np = np.delete(t_i_to_final_np,
                                        t_i_to_final_np_zeros,
                                        axis=0)

        t_concat = np.concatenate((t_i_np, t_i_to_final_np), axis=0)
        t_norm = cd.likelihood_matrix_array(
            t_concat, gene_names, 'Good_et_al').get_likelihood_matrix()
        t_norm_rel = t_norm / t_norm.sum(axis=1, keepdims=True)
        t_norm_rel -= np.mean(t_norm_rel, axis=0)
        pca = PCA()
        t_norm_rel_pca = pca.fit_transform(t_norm_rel)
        # sklearn's PCA orders axes by descending explained variance, so the
        # top k axes are the first k columns ([:, -1 - k:-1] would take the
        # lowest-variance axes)
        t_norm_rel_pca_k5 = t_norm_rel_pca[:, :k]
        # account for rows with zero mutations: the t_i block has
        # 5 - n_zeros_t_i_np rows, so split there
        dist_t_less = pt.get_mean_pairwise_euc_distance(
            t_norm_rel_pca_k5[:5 - n_zeros_t_i_np, :], k=k)
        dist_t_greater = pt.get_mean_pairwise_euc_distance(
            t_norm_rel_pca_k5[5 - n_zeros_t_i_np:, :], k=k)
        dist_t_change = dist_t_greater - dist_t_less
        #F_t = pt.get_F_2(t_norm_rel_pca_k5, 5-n_zeros_t_i_np, 5-n_zeros_t_i_to_final_np)[0]
        dist_t_less_list = []
        dist_t_greater_list = []
        dist_t_change_list = []
        #F_t_list = []
        for i in range(iter):
            if i % 1000 == 0:
                print("Iteration " + str(i))
            t_i_np_rndm = pt.get_random_matrix(t_i_np)
            t_i_to_final_np_rndm = pt.get_random_matrix(t_i_to_final_np)
            t_rndm_concat = np.concatenate((t_i_np_rndm, t_i_to_final_np_rndm),
                                           axis=0)
            t_rndm_norm = cd.likelihood_matrix_array(
                t_rndm_concat, gene_names,
                'Good_et_al').get_likelihood_matrix()
            t_rndm_norm_rel = t_rndm_norm / t_rndm_norm.sum(axis=1,
                                                            keepdims=True)
            t_rndm_norm_rel -= np.mean(t_rndm_norm_rel, axis=0)
            t_rndm_norm_rel_pca = pca.fit_transform(t_rndm_norm_rel)
            # first k axes, with the same split point as the observed data
            t_rndm_norm_rel_pca_k5 = t_rndm_norm_rel_pca[:, :k]
            dist_t_less_rndm = pt.get_mean_pairwise_euc_distance(
                t_rndm_norm_rel_pca_k5[:5 - n_zeros_t_i_np, :], k=k)
            dist_t_greater_rndm = pt.get_mean_pairwise_euc_distance(
                t_rndm_norm_rel_pca_k5[5 - n_zeros_t_i_np:, :], k=k)
            dist_t_change_list.append(dist_t_greater_rndm - dist_t_less_rndm)
            dist_t_less_list.append(dist_t_less_rndm)
            dist_t_greater_list.append(dist_t_greater_rndm)
            #F_t_list.append(pt.get_F_2(t_rndm_norm_rel_pca, 5-n_zeros_t_i_np, 5-n_zeros_t_i_to_final_np)[0])

        dist_t_change_list.sort()
        dist_t_greater_list.sort()
        dist_t_less_list.sort()
        #F_t_list.sort()
        # get 95% CIs
        dist_t_change_025 = dist_t_change_list[int(iter * 0.025)]
        dist_t_change_975 = dist_t_change_list[int(iter * 0.975)]
        dist_t_greater_025 = dist_t_greater_list[int(iter * 0.025)]
        dist_t_greater_975 = dist_t_greater_list[int(iter * 0.975)]
        dist_t_less_025 = dist_t_less_list[int(iter * 0.025)]
        dist_t_less_975 = dist_t_less_list[int(iter * 0.975)]
        #F_t_025 = F_t_list[int(iter*0.025)]
        #F_t_975 = F_t_list[int(iter*0.975)]
        df_out.write('\t'.join([str(time_point), str(dist_t_less), str(dist_t_greater), \
                                str(dist_t_change), str(dist_t_less_025), str(dist_t_less_975), \
                                str(dist_t_greater_025), str(dist_t_greater_975), \
                                str(dist_t_change_025), str(dist_t_change_975)]) + '\n')

    df_out.close()
Example #8
from __future__ import division
import os, pickle, operator
import random
from itertools import compress
import numpy as np
import pandas as pd
import multiprocessing as mp
from functools import partial
from sklearn.metrics.pairwise import euclidean_distances
#from asa159 import rcont2
from scipy import linalg as LA

from sklearn.decomposition import PCA

import clean_data as cd

df_path = os.path.expanduser("~/GitHub/ParEvol") + '/data/Tenaillon_et_al/gene_by_pop.txt'
df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
df_np = df.values
gene_names = df.columns.values
df_np_delta = cd.likelihood_matrix_array(
    df_np, gene_names, 'Tenaillon_et_al').get_likelihood_matrix()

X = df_np_delta / df_np_delta.sum(axis=1)[:, None]
X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
#X = X - np.mean(X, axis = 0)

pca = PCA()
# fit() alone returns the estimator, not coordinates; fit_transform() gives
# the projected scores, which the variable name suggests was intended
pca_X = pca.fit_transform(X)
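
# hypothetical follow-up: inspect how much variance the leading axes capture
print(pca.explained_variance_ratio_[:5])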
Example #9
# get_predicted_observed_occupancies_geometric is assumed to be defined
# earlier in the original script
observed_occupancies_geometric_non, predicted_occupancies_geometric_non = get_predicted_observed_occupancies_geometric(df_non_np, locus_tags_non, mut_counts_non_dict)

# mean relative error of the predicted occupancies; the non-geometric arrays
# (observed_occupancies_non, predicted_occupancies_non) are likewise assumed
# to be defined earlier in the original script
print(np.mean(np.absolute(observed_occupancies_non - predicted_occupancies_non) / observed_occupancies_non))

print(np.mean(np.absolute(observed_occupancies_geometric_non - predicted_occupancies_geometric_non) / observed_occupancies_geometric_non))

df_path = pt.get_path() + "/data/Tenaillon_et_al/gene_by_pop_nonsyn.txt"
df = pd.read_csv(df_path, sep="\t", header="infer", index_col=0)
df_np = df.values
gene_names = df.columns.values
n_rows = list(range(df_np.shape[0]))

df_np_delta = cd.likelihood_matrix_array(df_np, gene_names, "Tenaillon_et_al").get_likelihood_matrix()

X = df_np_delta / df_np_delta.sum(axis=1)[:, None]
X = X - np.mean(X, axis=0)
# cov = np.cov(X.T)
# ev, eig = np.linalg.eig(cov)
pca = PCA()
pca_fit = pca.fit_transform(X)
# L = pt.get_L_stat(max(ev), N, cov.shape[0])
eig = pt.get_x_stat(pca.explained_variance_[:-1], n_features=X.shape[1])

eig_null = []
iter = 10000  # iteration count assumed; not given in the original fragment
for j in range(iter):
    df_np_j = pt.get_random_matrix(df_np)
    np.seterr(divide="ignore")
    df_np_j_delta = cd.likelihood_matrix_array(df_np_j, gene_names, "Tenaillon_et_al").get_likelihood_matrix()
    # the fragment ends here; the rest of the loop mirrors the observed computation
    X_j = df_np_j_delta / df_np_j_delta.sum(axis=1)[:, None]
    X_j -= np.mean(X_j, axis=0)
    pca.fit(X_j)
    eig_null.append(pt.get_x_stat(pca.explained_variance_[:-1], n_features=X_j.shape[1]))
Example #10
def ltee_convergence(alpha = 0.05, k = 5):
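    # Track convergence in the LTEE nonmutator lines: project all
    # population/time-point samples into PCA space and compute the mean
    # pairwise Euclidean distance among populations at each time point.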
    df_path = os.path.expanduser("~/GitHub/ParEvol") + '/data/Good_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep = '\t', header = 'infer', index_col = 0)
    to_keep = pt.complete_nonmutator_lines()
    #to_keep.append('p5')
    to_keep.remove('p5')
    df_nonmut = df[df.index.str.contains('|'.join( to_keep))]
    # remove columns with all zeros
    df_nonmut = df_nonmut.loc[:, (df_nonmut != 0).any(axis=0)]
    gene_names = df_nonmut.columns.tolist()
    sample_names = df_nonmut.index.tolist()
    df_delta = cd.likelihood_matrix_array(df_nonmut, gene_names, 'Good_et_al').get_likelihood_matrix()
    df_delta = df_delta / df_delta.sum(axis=1)[:, None]
    X = pt.get_mean_center(df_delta)

    pca = PCA()
    df_out = pca.fit_transform(X)

    time_points = [int(x.split('_')[1]) for x in df_nonmut.index.values]
    time_points_set = sorted(set(time_points))
    colors = np.linspace(min(time_points_set),max(time_points_set),len(time_points_set))
    color_dict = dict(zip(time_points_set, colors))

    df_pca = pd.DataFrame(data=df_out, index=sample_names)
    mean_dist = []
    for tp in time_points_set:
        # endswith avoids matching e.g. '_500' inside '_5000'
        df_pca_tp = df_pca[df_pca.index.str.endswith('_' + str(tp))]
        mean_dist.append(pt.get_mean_pairwise_euc_distance(df_pca_tp.values, k=k))



    # the scatter plot must be drawn here: fig is used by tight_layout and
    # savefig at the end of the function
    fig = plt.figure()
    plt.scatter(time_points_set, mean_dist, marker="o", edgecolors='#244162', c='#175ac6', alpha=0.4, s=60, zorder=4)
    plt.xlabel("Time", fontsize=14)
    plt.ylabel("Mean euclidean distance", fontsize=12)

    #plt.figure(1)
    #plt.subplot(313)
    #plt.errorbar(perm_gens, mean_L, yerr = [lower_ci_L, upper_ci_L], fmt = 'o', alpha = 0.5, \
    #    barsabove = True, marker = '.', mfc = 'k', mec = 'k', c = 'k', zorder=1)
    #plt.scatter(time_points_set, Ls, c='#175ac6', marker = 'o', s = 70, \
    #    edgecolors='#244162', linewidth = 0.6, alpha = 0.5, zorder=2)#, edgecolors='none')

    #for pop in to_keep:
    #    pop_df_pca = df_pca[df_pca.index.str.contains(pop)]
    #    c_list = [ color_dict[int(x.split('_')[1])] for x in pop_df_pca.index.values]
    #    if pt.nonmutator_shapes()[pop] == 'p2':
    #        size = 50
    #    else:
    #        size = 80
    #    plt.scatter(pop_df_pca.values[:,0], pop_df_pca.values[:,1], \
    #        c=c_list, cmap = cm.Blues, vmin=min(time_points_set), vmax=max(time_points_set), \
    #        marker = pt.nonmutator_shapes()[pop], s = size, edgecolors='#244162', \
    #        linewidth = 0.6,  zorder=4, alpha=0.7)#, edgecolors='none')

    #c = plt.colorbar()
    #c.set_label("Generations", size=18)
    #plt.xlabel('PCA 1 (' + str(round(pca.explained_variance_ratio_[0],3)*100) + '%)' , fontsize = 16)
    #plt.ylabel('PCA 2 (' + str(round(pca.explained_variance_ratio_[1],3)*100) + '%)' , fontsize = 16)
    #plt.xlim([-0.4,0.4])
    #plt.ylim([-0.4,0.4])
    fig.tight_layout()
    fig.savefig(os.path.expanduser("~/GitHub/ParEvol") + '/figs/ltee_convergence.png', bbox_inches = "tight", pad_inches = 0.4, dpi = 600)
    plt.close()