def _save_null_hist(null_values, observed, xlabel, out_path):
    """Save a histogram of a null distribution with the observed value marked.

    Helper extracted from three copy-pasted plotting blocks; one figure per
    statistic, observed value drawn as a red vertical line.
    """
    fig = plt.figure()
    plt.hist(null_values, bins=30, alpha=0.8, color='#175ac6')
    plt.axvline(observed, color='red', lw=3)
    plt.xlabel(xlabel, fontsize=20)
    plt.ylabel("Frequency", fontsize=12)
    fig.tight_layout()
    fig.savefig(out_path, bbox_inches="tight", pad_inches=0.4, dpi=600)
    plt.close()


def wannier_hist(iter=10000):
    """Permutation test of the F_2, V_1 and V_2 statistics for Wannier et al.

    Loads the C321.deltaA and C321 mutation tables, computes the observed
    statistics on the likelihood-transformed combined matrix, builds null
    distributions from `iter` randomized matrices, and writes one histogram
    figure per statistic under figs/.

    NOTE: the parameter name `iter` shadows the builtin; kept unchanged for
    backward compatibility with keyword callers.
    """
    base_dir = os.path.expanduser("~/GitHub/ParEvol")
    df1 = pd.read_csv(base_dir + '/data/Wannier_et_al/C321.deltaA_mutation_table_clean.txt',
                      sep='\t', index_col=0)
    df2 = pd.read_csv(base_dir + '/data/Wannier_et_al/C321_mutation_table_clean.txt',
                      sep='\t', index_col=0)
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the supported, behavior-equivalent API.
    df = pd.concat([df1, df2], sort=False).fillna(0)
    df_np = df.values
    gene_names = df.columns.values
    N1 = df1.shape[0]
    N2 = df2.shape[0]
    df_np_delta = cd.likelihood_matrix_array(
        df_np, gene_names, 'Wannier_et_al').get_likelihood_matrix()
    F2_all = pt.get_F_2(df_np_delta, N1, N2)
    print(F2_all)
    F2 = F2_all[0]
    V1 = F2_all[1]
    V2 = F2_all[2]
    # Hoisted out of the permutation loop: seterr sets global numpy state,
    # calling it once is sufficient.
    np.seterr(divide='ignore')
    F2_null = []
    V1_null = []
    V2_null = []
    for i in range(iter):
        if i % 1000 == 0:
            print(i)
        df_np_i = pt.get_random_matrix(df_np)
        df_np_i_delta = cd.likelihood_matrix_array(
            df_np_i, gene_names, 'Wannier_et_al').get_likelihood_matrix()
        F2_all_iter = pt.get_F_2(df_np_i_delta, N1, N2)
        F2_null.append(F2_all_iter[0])
        V1_null.append(F2_all_iter[1])
        V2_null.append(F2_all_iter[2])
    figs_dir = base_dir + '/figs'
    _save_null_hist(F2_null, F2, r'$ F_{2}$', figs_dir + '/test_hist_F.png')
    _save_null_hist(V1_null, V1, r'$ V_{1}$', figs_dir + '/test_hist_F_V1.png')
    _save_null_hist(V2_null, V2, r'$ V_{2}$', figs_dir + '/test_hist_F_V2.png')
def rndm_sample_tenaillon(iter1=1000, iter2=10000):
    """Subsample significance analysis for the Tenaillon et al. dataset.

    For each sample size N: draw `iter1` random subsamples of N populations,
    compute the mean pairwise Euclidean distance in PCA space of the
    likelihood-transformed matrix, and compare it against `iter2` randomized
    matrices. Writes N, gene count, iteration, permutation percentile and
    z-score to dist_sample_size.txt.
    """
    base_dir = os.path.expanduser("~/GitHub/ParEvol")
    df_path = base_dir + '/data/Tenaillon_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    df_np = df.values
    gene_names = df.columns.values
    n_rows = list(range(df_np.shape[0]))
    # Hoisted: seterr sets global numpy state, once is enough.
    np.seterr(divide='ignore')
    #Ns = list(range(4, 40 +2, 2))
    Ns = [40]
    pca = PCA()
    # Context manager guarantees the output file is closed even if an
    # iteration raises (the original left the handle open on error).
    with open(base_dir + '/data/Tenaillon_et_al/dist_sample_size.txt', 'w') as df_out:
        df_out.write('\t'.join(['N', 'G', 'Iteration', 'dist_percent', 'z_score']) + '\n')
        for N in Ns:
            for i in range(iter1):
                df_np_i = df_np[np.random.choice(n_rows, size=N, replace=False, p=None), :]
                gene_bool = np.all(df_np_i == 0, axis=0)
                # flip around to select genes with at least one mutation
                gene_names_i = list(compress(gene_names, list(map(operator.not_, gene_bool))))
                # Reuse gene_bool instead of recomputing the all-zero-column mask.
                df_np_i = df_np_i[:, ~gene_bool]
                df_np_i_delta = cd.likelihood_matrix_array(
                    df_np_i, gene_names_i, 'Tenaillon_et_al').get_likelihood_matrix()
                df_np_i_delta = df_np_i_delta / df_np_i_delta.sum(axis=1)[:, None]
                X = pt.get_mean_center(df_np_i_delta)
                pca_fit = pca.fit_transform(X)
                euc_dist = pt.get_mean_pairwise_euc_distance(pca_fit)
                euc_dists = []
                for j in range(iter2):
                    df_np_i_j = pt.get_random_matrix(df_np_i)
                    df_np_i_j_delta = cd.likelihood_matrix_array(
                        df_np_i_j, gene_names_i, 'Tenaillon_et_al').get_likelihood_matrix()
                    df_np_i_j_delta = df_np_i_j_delta / df_np_i_j_delta.sum(axis=1)[:, None]
                    X_j = pt.get_mean_center(df_np_i_j_delta)
                    pca_fit_j = pca.fit_transform(X_j)
                    euc_dists.append(pt.get_mean_pairwise_euc_distance(pca_fit_j))
                G = df_np_i.shape[1]
                # One-sided permutation percentile: fraction of null draws
                # exceeding the observed distance.
                euc_percent = len([k for k in euc_dists if k > euc_dist]) / len(euc_dists)
                z_score = (euc_dist - np.mean(euc_dists)) / np.std(euc_dists)
                print(str(N), str(i), str(G), str(euc_percent), str(z_score))
                df_out.write('\t'.join([str(N), str(G), str(i),
                                        str(euc_percent), str(z_score)]) + '\n')
def rndm_sample_tenaillon(k_eval=3, iter1=20, iter2=1000, sample_bs=10, iter_bs=10000):
    """Power analysis of the permutation test on Tenaillon et al. subsamples.

    For each sample size N: draw `iter1` random subsamples, compute for each a
    permutation p-value (`iter2` permutations) of the mean pairwise distance
    on the first `k_eval` PCA axes, then bootstrap (`iter_bs` resamples of
    size `sample_bs`) a 95% CI around the fraction of significant subsamples
    ("power"). Writes power_sample_size.txt.
    """
    df_path = mydir + '/data/Tenaillon_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    df_np = df.values
    gene_names = df.columns.values
    n_rows = list(range(df_np.shape[0]))
    # Hoisted: seterr is global numpy state; PCA() is stateless until fit,
    # so one instance can be reused across all subsamples and permutations.
    np.seterr(divide='ignore')
    pca = PCA()
    Ns = [20, 30]
    #Ns = list(range(20, n_rows, 4))
    # Context manager guarantees the output file is closed on error.
    with open(mydir + '/data/Tenaillon_et_al/power_sample_size.txt', 'w') as df_out:
        df_out.write('\t'.join(['N', 'G', 'Power', 'Power_025', 'Power_975']) + '\n')
        for N in Ns:
            p_values = []
            G_list = []
            for i in range(iter1):
                df_np_i = df_np[np.random.choice(n_rows, size=N, replace=False, p=None), :]
                gene_bool = np.all(df_np_i == 0, axis=0)
                # flip around to select genes with at least one mutation
                gene_names_i = list(compress(gene_names, list(map(operator.not_, gene_bool))))
                G_list.append(len(gene_names_i))
                df_np_i = df_np_i[:, ~gene_bool]
                df_np_i_delta = cd.likelihood_matrix_array(
                    df_np_i, gene_names_i, 'Tenaillon_et_al').get_likelihood_matrix()
                X = df_np_i_delta / df_np_i_delta.sum(axis=1)[:, None]
                X -= np.mean(X, axis=0)
                pca_X = pca.fit_transform(X)
                mpd = pt.get_mean_pairwise_euc_distance(pca_X, k=k_eval)
                mpd_null = []
                for j in range(iter2):
                    df_np_i_j = pt.get_random_matrix(df_np_i)
                    df_np_i_j_delta = cd.likelihood_matrix_array(
                        df_np_i_j, gene_names_i, 'Tenaillon_et_al').get_likelihood_matrix()
                    X_j = df_np_i_j_delta / df_np_i_j_delta.sum(axis=1)[:, None]
                    X_j -= np.mean(X_j, axis=0)
                    pca_X_j = pca.fit_transform(X_j)
                    mpd_null.append(pt.get_mean_pairwise_euc_distance(pca_X_j, k=k_eval))
                # One-sided permutation p-value for this subsample.
                p_values.append(len([m for m in mpd_null if m > mpd]) / len(mpd_null))
            # (Removed the duplicated print of p_values in the original.)
            print(p_values)
            power = len([n for n in p_values if n < 0.05]) / len(p_values)
            power_bootstrap = []
            for p in range(iter_bs):
                p_values_sample = random.sample(p_values, sample_bs)
                power_bootstrap.append(
                    len([n for n in p_values_sample if n < 0.05]) / len(p_values_sample))
            power_bootstrap.sort()
            # 95% bootstrap CI on power from the sorted resampled estimates.
            df_out.write('\t'.join([str(N), str(np.mean(G_list)), str(power),
                                    str(power_bootstrap[int(iter_bs * 0.025)]),
                                    str(power_bootstrap[int(iter_bs * 0.975)])]) + '\n')
def rndm_sample_tenaillon(N, df_np, gene_names, n_rows, k=3, iter1=100,
                          iter2=1000, sample_bs=10, iter_bs=10000):
    """Power-analysis worker for a single subsample size N (parallelizable).

    For `iter1` random subsamples of N populations: compute the observed mean
    pairwise distance on the first `k` PCA axes of the likelihood-transformed
    matrix, a permutation p-value from `iter2` randomized matrices, then
    bootstrap (`iter_bs` resamples of size `sample_bs`) a 95% CI around the
    fraction of significant subsamples.

    Returns (N, mean gene count, power, power 2.5th pct, power 97.5th pct).
    """
    # Hoisted: seterr sets global numpy state, once is enough.
    np.seterr(divide='ignore')
    p_values = []
    G_list = []
    for i in range(iter1):
        df_np_i = df_np[np.random.choice(n_rows, size=N, replace=False, p=None), :]
        gene_bool = np.all(df_np_i == 0, axis=0)
        # flip around to select genes with at least one mutation
        gene_names_i = list(compress(gene_names, list(map(operator.not_, gene_bool))))
        G_list.append(len(gene_names_i))
        df_np_i = df_np_i[:, ~gene_bool]
        df_np_i_delta = cd.likelihood_matrix_array(
            df_np_i, gene_names_i, 'Tenaillon_et_al').get_likelihood_matrix()
        X = df_np_i_delta / df_np_i_delta.sum(axis=1)[:, None]
        e_vals, a_mat = pt.pca_np(X)
        euc_dist = pt.get_mean_pairwise_euc_distance(a_mat, k=k)
        print(euc_dist)
        euc_dists = []
        for j in range(iter2):
            df_np_i_j = pt.get_random_matrix(df_np_i)
            df_np_i_j_delta = cd.likelihood_matrix_array(
                df_np_i_j, gene_names_i, 'Tenaillon_et_al').get_likelihood_matrix()
            X_j = df_np_i_j_delta / df_np_i_j_delta.sum(axis=1)[:, None]
            e_vals_j, a_mat_j = pt.pca_np(X_j)
            euc_dists.append(pt.get_mean_pairwise_euc_distance(a_mat_j, k=k))
        # One-sided permutation p-value for this subsample.
        p_values.append(len([m for m in euc_dists if m > euc_dist]) / len(euc_dists))
    power = len([pv for pv in p_values if pv < 0.05]) / len(p_values)
    print(p_values)
    power_bootstrap = []
    for p in range(iter_bs):
        p_values_sample = random.sample(p_values, sample_bs)
        power_bootstrap.append(
            len([pv for pv in p_values_sample if pv < 0.05]) / len(p_values_sample))
    power_bootstrap.sort()
    # BUG FIX: the CI indices were hard-coded as int(10000 * 0.025/0.975)
    # regardless of iter_bs, giving wrong percentiles (or an IndexError) for
    # any non-default bootstrap count. Use iter_bs so the CI tracks the
    # actual number of bootstrap resamples.
    return (N, np.mean(G_list), power,
            power_bootstrap[int(iter_bs * 0.025)],
            power_bootstrap[int(iter_bs * 0.975)])
def gene_svd_tenaillon(iter=10000):
    """Permutation test of per-gene loading magnitudes for Tenaillon et al.

    Projects the likelihood-transformed gene-by-population matrix onto its
    first three singular axes, scores each gene by the squared norm of its
    row in the scaled basis matrix F, and compares against `iter` randomized
    matrices. Writes per-gene z-scores and permutation p-values to
    gene_z_scores.txt.

    NOTE: the parameter name `iter` shadows the builtin; kept unchanged for
    backward compatibility with keyword callers.
    """
    base_dir = os.path.expanduser("~/GitHub/ParEvol")
    df_path = base_dir + '/data/Tenaillon_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    gene_names = df.columns.tolist()
    df_np = df.values
    df_np_delta = cd.likelihood_matrix_array(
        df_np, gene_names, 'Tenaillon_et_al').get_likelihood_matrix()
    df_np_delta = df_np_delta / df_np_delta.sum(axis=1)[:, None]
    X = pt.get_mean_center(df_np_delta)
    # scipy's svds returns the V matrix in transposed form; transpose back to
    # build the basis matrix F (one row per gene). Row norms are invariant to
    # the (arbitrary) ordering of the k components.
    U, s, V_T = svds(X, k=3)
    F = (V_T.T @ np.diag(s)) / np.sqrt(X.shape[0] - 1)
    # Renamed from `vars` — that shadowed the builtin.
    gene_vars = np.linalg.norm(F, axis=1)**2
    # Hoisted out of the loop: seterr sets global numpy state.
    np.seterr(divide='ignore')
    vars_null_list = []
    for i in range(iter):
        if i % 1000 == 0:
            print("Iteration " + str(i))
        df_np_i = pt.get_random_matrix(df_np)
        df_np_i_delta = cd.likelihood_matrix_array(
            df_np_i, gene_names, 'Tenaillon_et_al').get_likelihood_matrix()
        df_np_i_delta = df_np_i_delta / df_np_i_delta.sum(axis=1)[:, None]
        X_j = pt.get_mean_center(df_np_i_delta)
        U_i, s_i, V_i_T = svds(X_j, k=3)
        F_i = (V_i_T.T @ np.diag(s_i)) / np.sqrt(X_j.shape[0] - 1)
        vars_null_list.append(np.linalg.norm(F_i, axis=1)**2)
    vars_null = np.stack(vars_null_list)
    vars_null_mean = np.mean(vars_null, axis=0)
    vars_null_std = np.std(vars_null, axis=0)
    z_scores = (gene_vars - vars_null_mean) / vars_null_std
    # One-sided permutation p-value per gene: fraction of null draws
    # exceeding the observed squared loading norm.
    p_values = []
    for g, column in enumerate(vars_null.T):
        column_greater = [x for x in column if x > gene_vars[g]]
        p_values.append(len(column_greater) / iter)
    label_z_scores = list(zip(gene_names, z_scores, p_values))
    label_sig_z_scores = [x for x in label_z_scores if x[2] < 0.05]
    print(label_sig_z_scores)
    # Context manager guarantees closure even if a write fails.
    with open(base_dir + '/data/Tenaillon_et_al/gene_z_scores.txt', 'w') as df_out:
        df_out.write('\t'.join(['Gene', 'z_score', 'p_score']) + '\n')
        for label_z_score in label_z_scores:
            df_out.write('\t'.join([str(label_z_score[0]), str(label_z_score[1]),
                                    str(label_z_score[2])]) + '\n')
def time_partition_ltee(k=5, iter=100):
    """Partition LTEE mutation trajectories at each time point and z-score them.

    For every sampled generation t (up to 50000), splits each non-mutator
    population's mutation counts into "up to t" and "t to final", projects the
    concatenated likelihood-transformed matrix onto eigenvectors via
    pt.pca_np, and compares the mean pairwise distance of each partition
    against `iter` randomized matrices. Writes one z-score pair per time
    point to time_partition_z_scores.txt.

    NOTE: the parameter name `iter` shadows the builtin (kept as-is).
    """
    df_path = os.path.expanduser(
        "~/GitHub/ParEvol") + '/data/Good_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    # Restrict to the complete non-mutator populations.
    to_include = pt.complete_nonmutator_lines()
    df_nonmut = df[df.index.str.contains('|'.join(to_include))]
    # remove columns (genes) with all zeros
    df_nonmut = df_nonmut.loc[:, (df_nonmut != 0).any(axis=0)]
    # make sure it's sorted so row order is deterministic
    df_nonmut.sort_index(inplace=True)
    # Row labels look like "<pop>_<generation>"; extract the generation.
    time_points = [int(x.split('_')[1]) for x in df_nonmut.index.values]
    time_points_set = sorted(
        list(set([int(x.split('_')[1]) for x in df_nonmut.index.values])))
    # Map each generation to the row positions sampled at that generation.
    time_points_positions = {}
    for x in time_points_set:
        time_points_positions[x] = [
            i for i, j in enumerate(time_points) if j == x
        ]
    # Counts at the final sampled generation (per population).
    t_final_df = df_nonmut.iloc[time_points_positions[max(time_points_set)]]
    t_final_np = t_final_df.values
    gene_names = df_nonmut.columns.tolist()
    df_out = open(
        os.path.expanduser("~/GitHub/ParEvol") +
        '/data/Good_et_al/time_partition_z_scores.txt', 'w')
    df_out.write(
        '\t'.join(['Time', 'Time_less_z_score', 'Time_greater_z_score']) + '\n')
    for time_point in time_points_set:
        # very few mutations after generation 50000
        if time_point > 50000:
            continue
        print("Time point " + str(time_point))
        t_i_df = df_nonmut.iloc[time_points_positions[time_point]]
        t_i_np = t_i_df.values
        # remove rows (populations) with all zeros at this time point
        t_i_np_zeros = np.where(~t_i_np.any(axis=1))[0]
        n_zeros_t_i_np = len(t_i_np_zeros)
        if n_zeros_t_i_np > 0:
            t_i_np = np.delete(t_i_np, t_i_np_zeros, axis=0)
        # Mutations accumulated between this time point and the final one.
        # NOTE(review): if rows were deleted above, t_i_np has fewer rows than
        # t_final_np; this subtraction then relies on broadcasting or fails —
        # confirm zero rows never occur in practice for t <= 50000.
        t_i_to_final_np = t_final_np - t_i_np
        # remove rows with all zeros in the "after" partition
        t_i_to_final_np_zeros = np.where(~t_i_to_final_np.any(axis=1))[0]
        n_zeros_t_i_to_final_np = len(t_i_to_final_np_zeros)
        if n_zeros_t_i_to_final_np > 0:
            t_i_to_final_np = np.delete(t_i_to_final_np,
                                        t_i_to_final_np_zeros,
                                        axis=0)
        # Stack "before" rows on top of "after" rows and transform jointly.
        t_concat = np.concatenate((t_i_np, t_i_to_final_np), axis=0)
        t_norm = cd.likelihood_matrix_array(
            t_concat, gene_names, 'Good_et_al').get_likelihood_matrix()
        # Row-normalize to relative contributions per sample.
        t_norm_rel = t_norm / t_norm.sum(axis=1, keepdims=True)
        # first five axes
        e_vals, e_vecs = pt.pca_np(t_norm_rel)
        # The column v[:, i] is the normalized eigenvector corresponding to the eigenvalue w[i]
        # Take k eigenvectors from the top end (excluding the very last column).
        e_vecs_k5 = e_vecs[:, -1 - k:-1]
        # account for rows with zero mutations; the "5" is presumably the
        # number of non-mutator populations — TODO confirm.
        e_vec_t_less = e_vecs_k5[:5 - n_zeros_t_i_np, :]
        # NOTE(review): the start index uses n_zeros_t_i_to_final_np, but the
        # "greater" rows begin after the 5 - n_zeros_t_i_np "less" rows —
        # verify this offset is intended.
        e_vec_t_greater = e_vecs_k5[5 - n_zeros_t_i_to_final_np:, :]
        dist_t_less = pt.get_mean_pairwise_euc_distance(e_vec_t_less, k=k)
        dist_t_greater = pt.get_mean_pairwise_euc_distance(e_vec_t_greater, k=k)
        # Null distributions from randomized matrices.
        dist_t_less_list = []
        dist_t_greater_list = []
        for i in range(iter):
            t_i_np_rndm = pt.get_random_matrix(t_i_np)
            t_i_to_final_np_rndm = pt.get_random_matrix(t_i_to_final_np)
            t_rndm_concat = np.concatenate((t_i_np_rndm, t_i_to_final_np_rndm),
                                           axis=0)
            t_rndm_norm = cd.likelihood_matrix_array(
                t_rndm_concat, gene_names, 'Good_et_al').get_likelihood_matrix()
            t_rndm_norm_rel = t_rndm_norm / t_rndm_norm.sum(axis=1,
                                                            keepdims=True)
            # first five axes
            e_vals_rndm, e_vecs_rndm = pt.pca_np(t_rndm_norm_rel)
            # The column v[:, i] is the normalized eigenvector corresponding to the eigenvalue w[i]
            e_vecs_rndm_k5 = e_vecs_rndm[:, -1 - k:-1]
            # NOTE(review): the null split uses a fixed 5 here, not the
            # zero-adjusted counts used for the observed split — confirm.
            e_vec_t_less_rndm = e_vecs_rndm_k5[:5, :]
            e_vec_t_greater_rndm = e_vecs_rndm_k5[5:, :]
            dist_t_less_rndm = pt.get_mean_pairwise_euc_distance(
                e_vec_t_less_rndm, k=k)
            dist_t_greater_rndm = pt.get_mean_pairwise_euc_distance(
                e_vec_t_greater_rndm, k=k)
            dist_t_less_list.append(dist_t_less_rndm)
            dist_t_greater_list.append(dist_t_greater_rndm)
        # Standardize observed distances against their null distributions.
        z_score_less = (dist_t_less -
                        np.mean(dist_t_less_list)) / np.std(dist_t_less_list)
        z_score_greater = (dist_t_greater - np.mean(dist_t_greater_list)
                           ) / np.std(dist_t_greater_list)
        df_out.write('\t'.join(
            [str(time_point), str(z_score_less), str(z_score_greater)]) + '\n')
    df_out.close()
def time_partition_ltee(k=5, iter=1000):
    """Partition LTEE trajectories at each time point; report MPDs with 95% CIs.

    Variant of the time-partition analysis that uses sklearn PCA instead of
    pt.pca_np and reports, per time point: the mean pairwise distance (MPD)
    of the "before t" and "after t" partitions, their difference, and 95%
    permutation CIs for all three, written to time_partition_z_scores.txt.

    NOTE: the parameter name `iter` shadows the builtin (kept as-is).
    """
    df_path = mydir + '/data/Good_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    # Restrict to the complete non-mutator populations.
    to_include = pt.complete_nonmutator_lines()
    df_nonmut = df[df.index.str.contains('|'.join(to_include))]
    # remove columns (genes) with all zeros
    df_nonmut = df_nonmut.loc[:, (df_nonmut != 0).any(axis=0)]
    # make sure it's sorted so row order is deterministic
    df_nonmut.sort_index(inplace=True)
    # Row labels look like "<pop>_<generation>"; extract the generation.
    time_points = [int(x.split('_')[1]) for x in df_nonmut.index.values]
    time_points_set = sorted(
        list(set([int(x.split('_')[1]) for x in df_nonmut.index.values])))
    # Map each generation to the row positions sampled at that generation.
    time_points_positions = {}
    for x in time_points_set:
        time_points_positions[x] = [
            i for i, j in enumerate(time_points) if j == x
        ]
    # Counts at the final sampled generation (per population).
    t_final_df = df_nonmut.iloc[time_points_positions[max(time_points_set)]]
    t_final_np = t_final_df.values
    gene_names = df_nonmut.columns.tolist()
    df_out = open(mydir + '/data/Good_et_al/time_partition_z_scores.txt', 'w')
    # NOTE(review): 'less_mbd' / 'mbd' look like typos for 'mpd' but are kept
    # verbatim — downstream parsers may rely on these exact column names.
    df_out.write('\t'.join([
        'Time', 'less_mbd', 'greater_mpd', 'delta_mpd', 'less_mbd_025',
        'less_mbd_975', 'greater_mpd_025', 'greater_mpd_975', 'delta_mpd_025',
        'delta_mpd_975'
    ]) + '\n')
    for time_point in time_points_set:
        # very few mutations after generation 50000
        if time_point > 50000:
            continue
        print("Time point " + str(time_point))
        t_i_df = df_nonmut.iloc[time_points_positions[time_point]]
        t_i_np = t_i_df.values
        # remove rows (populations) with all zeros at this time point
        t_i_np_zeros = np.where(~t_i_np.any(axis=1))[0]
        n_zeros_t_i_np = len(t_i_np_zeros)
        if n_zeros_t_i_np > 0:
            t_i_np = np.delete(t_i_np, t_i_np_zeros, axis=0)
        # Mutations accumulated between this time point and the final one.
        # NOTE(review): if rows were deleted above, shapes of t_final_np and
        # t_i_np differ — confirm zero rows never occur for t <= 50000.
        t_i_to_final_np = t_final_np - t_i_np
        # remove rows with all zeros in the "after" partition
        t_i_to_final_np_zeros = np.where(~t_i_to_final_np.any(axis=1))[0]
        n_zeros_t_i_to_final_np = len(t_i_to_final_np_zeros)
        if n_zeros_t_i_to_final_np > 0:
            t_i_to_final_np = np.delete(t_i_to_final_np,
                                        t_i_to_final_np_zeros,
                                        axis=0)
        # Stack "before" rows on top of "after" rows and transform jointly.
        t_concat = np.concatenate((t_i_np, t_i_to_final_np), axis=0)
        t_norm = cd.likelihood_matrix_array(
            t_concat, gene_names, 'Good_et_al').get_likelihood_matrix()
        # Row-normalize, then column-center before PCA.
        t_norm_rel = t_norm / t_norm.sum(axis=1, keepdims=True)
        t_norm_rel -= np.mean(t_norm_rel, axis=0)
        pca = PCA()
        t_norm_rel_pca = pca.fit_transform(t_norm_rel)
        # NOTE(review): sklearn PCA orders components by decreasing variance,
        # so [-1 - k:-1] selects the LAST k (lowest-variance) axes and drops
        # the final column — verify the leading axes were not intended.
        t_norm_rel_pca_k5 = t_norm_rel_pca[:, -1 - k:-1]
        # account for rows with zero mutations; the "5" is presumably the
        # number of non-mutator populations — TODO confirm.
        dist_t_less = pt.get_mean_pairwise_euc_distance(
            t_norm_rel_pca_k5[:5 - n_zeros_t_i_np, :], k=k)
        # NOTE(review): start index uses n_zeros_t_i_to_final_np rather than
        # the length of the "less" partition — verify this offset is intended.
        dist_t_greater = pt.get_mean_pairwise_euc_distance(
            t_norm_rel_pca_k5[5 - n_zeros_t_i_to_final_np:, :], k=k)
        dist_t_change = dist_t_greater - dist_t_less
        #F_t = pt.get_F_2(t_norm_rel_pca_k5, 5-n_zeros_t_i_np, 5-n_zeros_t_i_to_final_np)[0]
        # Null distributions from randomized matrices.
        dist_t_less_list = []
        dist_t_greater_list = []
        dist_t_change_list = []
        #F_t_list = []
        for i in range(iter):
            if i % 1000 == 0:
                print("Iteration " + str(i))
            t_i_np_rndm = pt.get_random_matrix(t_i_np)
            t_i_to_final_np_rndm = pt.get_random_matrix(t_i_to_final_np)
            t_rndm_concat = np.concatenate(
                (t_i_np_rndm, t_i_to_final_np_rndm), axis=0)
            t_rndm_norm = cd.likelihood_matrix_array(
                t_rndm_concat, gene_names, 'Good_et_al').get_likelihood_matrix()
            t_rndm_norm_rel = t_rndm_norm / t_rndm_norm.sum(axis=1,
                                                            keepdims=True)
            t_rndm_norm_rel -= np.mean(t_rndm_norm_rel, axis=0)
            t_rndm_norm_rel_pca = pca.fit_transform(t_rndm_norm_rel)
            # first five axes
            t_rndm_norm_rel_pca_k5 = t_rndm_norm_rel_pca[:, -1 - k:-1]
            dist_t_less_rndm = pt.get_mean_pairwise_euc_distance(
                t_rndm_norm_rel_pca_k5[:5 - n_zeros_t_i_np, :], k=k)
            dist_t_greater_rndm = pt.get_mean_pairwise_euc_distance(
                t_rndm_norm_rel_pca_k5[5 - n_zeros_t_i_to_final_np:, :], k=k)
            dist_t_change_list.append(dist_t_greater_rndm - dist_t_less_rndm)
            dist_t_less_list.append(dist_t_less_rndm)
            dist_t_greater_list.append(dist_t_greater_rndm)
            #F_t_list.append(pt.get_F_2(t_rndm_norm_rel_pca, 5-n_zeros_t_i_np, 5-n_zeros_t_i_to_final_np)[0])
        # Sort each null distribution to read off empirical percentiles.
        dist_t_change_list.sort()
        dist_t_greater_list.sort()
        dist_t_less_list.sort()
        #F_t_list.sort()
        # get 95% CIs
        dist_t_change_025 = dist_t_change_list[int(iter * 0.025)]
        dist_t_change_975 = dist_t_change_list[int(iter * 0.975)]
        dist_t_greater_025 = dist_t_greater_list[int(iter * 0.025)]
        dist_t_greater_975 = dist_t_greater_list[int(iter * 0.975)]
        dist_t_less_025 = dist_t_less_list[int(iter * 0.025)]
        dist_t_less_975 = dist_t_less_list[int(iter * 0.975)]
        #F_t_025 = F_t_list[int(iter*0.025)]
        #F_t_975 = F_t_list[int(iter*0.975)]
        df_out.write('\t'.join([str(time_point), str(dist_t_less), str(dist_t_greater), \
            str(dist_t_change), str(dist_t_less_025), str(dist_t_less_975), \
            str(dist_t_greater_025), str(dist_t_greater_975), \
            str(dist_t_change_025), str(dist_t_change_975)]) + '\n')
    df_out.close()
from __future__ import division import os, pickle, operator import random from itertools import compress import numpy as np import pandas as pd import multiprocessing as mp from functools import partial from sklearn.metrics.pairwise import euclidean_distances #from asa159 import rcont2 from scipy import linalg as LA from sklearn.decomposition import PCA import clean_data as cd df_path = '/Users/WRShoemaker/GitHub/ParEvol/data/Tenaillon_et_al/gene_by_pop.txt' #df_path = mydir + '/data/Tenaillon_et_al/gene_by_pop.txt' df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0) df_np = df.values gene_names = df.columns.values df_np_delta = cd.likelihood_matrix_array( df_np, gene_names, 'Tenaillon_et_al').get_likelihood_matrix() X = df_np_delta / df_np_delta.sum(axis=1)[:, None] X = (X - np.mean(X, axis=0)) / np.std(X, axis=0) #X = X - np.mean(X, axis = 0) pca = PCA() pca_X = pca.fit(X)
# NOTE(review): this is a fragment of a larger routine — it references names
# defined outside the visible span (df_non_np, locus_tags_non,
# mut_counts_non_dict, observed_occupancies_non, predicted_occupancies_non,
# pt.get_path, iter) and the permutation loop at the bottom is truncated.
# Documented in place; do not run stand-alone.

# Geometric-model occupancy predictions for the non-mutator data — TODO
# confirm semantics against the definition of
# get_predicted_observed_occupancies_geometric.
observed_occupancies_geometric_non, predicted_occupancies_geometric_non = get_predicted_observed_occupancies_geometric(df_non_np, locus_tags_non, mut_counts_non_dict)
# Mean relative error of the (presumably earlier-computed) baseline predictions.
print(
    np.mean(np.absolute(observed_occupancies_non - predicted_occupancies_non
                        ) / observed_occupancies_non))
# Mean relative error of the geometric-model predictions.
print(
    np.mean(np.absolute(observed_occupancies_geometric_non -
                        predicted_occupancies_geometric_non
                        ) / observed_occupancies_geometric_non))
# Load the nonsynonymous gene-by-population matrix.
df_path = pt.get_path() + "/data/Tenaillon_et_al/gene_by_pop_nonsyn.txt"
df = pd.read_csv(df_path, sep="\t", header="infer", index_col=0)
df_np = df.values
gene_names = df.columns.values
n_rows = list(range(df_np.shape[0]))
df_np_delta = cd.likelihood_matrix_array(df_np, gene_names, "Tenaillon_et_al").get_likelihood_matrix()
# Row-normalize, then column-center before PCA.
X = df_np_delta / df_np_delta.sum(axis=1)[:, None]
X = X - np.mean(X, axis=0)
# cov = np.cov(X.T)
# ev, eig = np.linalg.eig(cov)
pca = PCA()
pca_fit = pca.fit_transform(X)
# L = pt.get_L_stat(max(ev), N, cov.shape[0])
# Observed eigenvalue test statistic (last component excluded).
eig = pt.get_x_stat(pca.explained_variance_[:-1], n_features=X.shape[1])
eig_null = []
# NOTE(review): `iter` here must be a variable from the enclosing (unseen)
# scope — as written it would be the builtin and range() would fail.
for j in range(iter):
    df_np_j = pt.get_random_matrix(df_np)
    np.seterr(divide="ignore")
    df_np_j_delta = cd.likelihood_matrix_array(df_np_j, gene_names, "Tenaillon_et_al").get_likelihood_matrix()
def ltee_convergence(alpha = 0.05, k = 5):
    """Plot mean pairwise PCA distance of LTEE non-mutator lines over time.

    Loads the Good et al. gene-by-population matrix, drops population 'p5',
    likelihood-transforms and mean-centers the data, runs PCA, and for each
    sampled generation computes the mean pairwise Euclidean distance over the
    first `k` axes. Saves a scatter of distance vs. time to
    figs/ltee_convergence.png.

    NOTE: `alpha` is currently unused; kept for interface compatibility.
    """
    df_path = os.path.expanduser("~/GitHub/ParEvol") + '/data/Good_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    to_keep = pt.complete_nonmutator_lines()
    #to_keep.append('p5')
    to_keep.remove('p5')
    df_nonmut = df[df.index.str.contains('|'.join(to_keep))]
    # remove columns (genes) with all zeros
    df_nonmut = df_nonmut.loc[:, (df_nonmut != 0).any(axis=0)]
    gene_names = df_nonmut.columns.tolist()
    sample_names = df_nonmut.index.tolist()
    df_delta = cd.likelihood_matrix_array(df_nonmut, gene_names,
                                          'Good_et_al').get_likelihood_matrix()
    df_delta = df_delta / df_delta.sum(axis=1)[:, None]
    X = pt.get_mean_center(df_delta)
    pca = PCA()
    df_out = pca.fit_transform(X)
    # Row labels look like "<pop>_<generation>"; extract the generations.
    time_points = [int(x.split('_')[1]) for x in df_nonmut.index.values]
    time_points_set = sorted(
        list(set([int(x.split('_')[1]) for x in df_nonmut.index.values])))
    df_pca = pd.DataFrame(data=df_out, index=sample_names)
    # Mean pairwise distance on the first k PCA axes for each generation.
    mean_dist = []
    for tp in time_points_set:
        df_pca_tp = df_pca[df_pca.index.str.contains('_' + str(tp))]
        mean_dist.append(pt.get_mean_pairwise_euc_distance(df_pca_tp.values, k=k))
    # BUG FIX: `fig = plt.figure()` had been commented out while
    # fig.tight_layout()/fig.savefig() still executed, raising NameError.
    # Restore the figure and the mean-distance scatter the commented-out
    # block described.
    fig = plt.figure()
    plt.scatter(time_points_set, mean_dist, marker="o", edgecolors='#244162',
                c='#175ac6', alpha=0.4, s=60, zorder=4)
    plt.xlabel("Time", fontsize=14)
    plt.ylabel("Mean euclidean distance", fontsize=12)
    fig.tight_layout()
    fig.savefig(os.path.expanduser("~/GitHub/ParEvol") + '/figs/ltee_convergence.png',
                bbox_inches="tight", pad_inches=0.4, dpi=600)
    plt.close()