def run_all_sims():
    """Run the scalar-covariance simulations and write one z-score per run.

    For each covariance in ``[0.5, 0, -0.5]`` and each of 100 replicates:

    1. a count matrix is built by stacking 20 calls to ``get_count_pop``
       (all sharing the same gamma-distributed ``lambda_genes``),
    2. it is passed through ``pt.hellinger_transform`` and a PCA, and
       summarised with ``pt.get_euclidean_distance``,
    3. the same statistic is recomputed on 1000 matrices produced by
       ``pt.random_matrix``, and the observed value is converted to a
       z-score using the mean and standard deviation of that sample.

    Rows are written tab-separated (``Covariance``, ``Iteration``,
    ``z_score``) to ``<pt.get_path()>/data/simulations/cov_euc.txt``.
    Progress is printed to stdout.  Returns ``None``.
    """
    n_pops = 20
    n_genes = 50
    n_replicates = 100
    n_randomisations = 1000
    covs = [0.5, 0, -0.5]
    out_path = pt.get_path() + '/data/simulations/cov_euc.txt'

    # The context manager closes (and flushes) the file even if one of the
    # simulation steps raises, so rows already written are not lost.
    with open(out_path, 'w') as df_out:
        lambda_genes = np.random.gamma(shape=3, scale=1, size=n_genes)
        df_out.write('\t'.join(['Covariance', 'Iteration', 'z_score']) + '\n')
        for cov in covs:
            for i in range(n_replicates):
                print(str(cov) + ' ' + str(i))
                test_cov = np.stack(
                    [get_count_pop(lambda_genes, cov=cov)
                     for _ in range(n_pops)],
                    axis=0)
                X = pt.hellinger_transform(test_cov)
                pca = PCA()
                pca_fit = pca.fit_transform(X)
                euc_dist = pt.get_euclidean_distance(pca_fit)

                # Same statistic on the randomised matrices; the PCA object
                # is simply refit on every randomised matrix.
                sim_eucs = []
                for _ in range(n_randomisations):
                    X_j = pt.hellinger_transform(pt.random_matrix(test_cov))
                    pca_fit_j = pca.fit_transform(X_j)
                    sim_eucs.append(pt.get_euclidean_distance(pca_fit_j))

                z_score = (euc_dist - np.mean(sim_eucs)) / np.std(sim_eucs)
                df_out.write(
                    '\t'.join([str(cov), str(i), str(z_score)]) + '\n')
def run_ntwrk_cov_sims(var=1, cov=0.25):
    """Run the network-covariance simulations and write one z-score per run.

    The matrix loaded from ``data/modular_ntwrk_mu_010.txt`` is multiplied
    by ``cov`` and its diagonal is set to ``var``; the result is passed as
    the ``cov`` argument of ``get_count_pop``.  For each of 100 replicates
    a count matrix (20 stacked ``get_count_pop`` draws over 250 genes) is
    Hellinger-transformed via ``pt.hellinger_transform``, run through PCA
    and summarised with ``pt.get_mean_pairwise_euc_distance``.  The observed
    value is converted to a z-score against the same statistic computed on
    1000 matrices produced by ``pt.random_matrix``.

    Rows (``Cov``, ``Iteration``, ``z_score``) are written tab-separated to
    ``<pt.get_path()>/data/simulations/cov_ntwrk_euc_pos_only_010.txt`` and
    echoed to stdout.

    Args:
        var: value placed on the diagonal of the covariance matrix.
        cov: scalar multiplied into every entry of the loaded matrix.

    Returns:
        None.
    """
    n_pops = 20
    n_genes = 250
    n_replicates = 100
    n_randomisations = 1000
    base_path = pt.get_path()
    out_path = base_path + '/data/simulations/cov_ntwrk_euc_pos_only_010.txt'

    # The context manager closes the output even if loading the network or
    # a simulation step raises part-way through.
    with open(out_path, 'w') as df_out:
        lambda_genes = np.random.gamma(shape=3, scale=1, size=n_genes)
        df_out.write('\t'.join(['Cov', 'Iteration', 'z_score']) + '\n')

        C = np.loadtxt(base_path + '/data/modular_ntwrk_mu_010.txt',
                       delimiter='\t')
        C = C * cov
        np.fill_diagonal(C, var)

        for i in range(n_replicates):
            test_cov = np.stack(
                [get_count_pop(lambda_genes, cov=C) for _ in range(n_pops)],
                axis=0)
            X = pt.hellinger_transform(test_cov)
            pca = PCA()
            pca_fit = pca.fit_transform(X)
            euc_dist = pt.get_mean_pairwise_euc_distance(pca_fit)

            sim_eucs = []
            for _ in range(n_randomisations):
                X_j = pt.hellinger_transform(pt.random_matrix(test_cov))
                pca_fit_j = pca.fit_transform(X_j)
                sim_eucs.append(pt.get_mean_pairwise_euc_distance(pca_fit_j))

            z_score = (euc_dist - np.mean(sim_eucs)) / np.std(sim_eucs)
            print(str(cov), ' ', str(i), ' ', str(z_score))
            df_out.write('\t'.join([str(cov), str(i), str(z_score)]) + '\n')
def run_block_cov_sims():
    """Run the block-covariance simulations and write one z-score per run.

    For each covariance value in ``covs`` a matrix is built with
    ``get_block_cov(n_genes, pos_cov=cov, neg_cov=cov)``.  Whether all of
    its eigenvalues are positive, and the matrix itself, are printed as a
    sanity check.  Then, for each of 100 replicates, a count matrix (20
    stacked ``get_count_pop`` draws over 50 genes) is passed through
    ``pt.hellinger_transform`` and PCA and summarised with
    ``pt.get_mean_pairwise_euc_distance``; the observed value is converted
    to a z-score against the same statistic on 1000 matrices produced by
    ``pt.random_matrix``.

    Rows (``Cov``, ``Iteration``, ``z_score``) are written tab-separated to
    ``<pt.get_path()>/data/simulations/cov_block_euc_pos_only.txt`` and
    echoed to stdout.  Returns ``None``.
    """
    n_pops = 20
    n_genes = 50
    n_replicates = 100
    n_randomisations = 1000
    # Previously explored values: 0.1, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9.
    covs = [-0.9]
    out_path = pt.get_path() + '/data/simulations/cov_block_euc_pos_only.txt'

    # The context manager closes the output even if a simulation step raises.
    with open(out_path, 'w') as df_out:
        lambda_genes = np.random.gamma(shape=3, scale=1, size=n_genes)
        df_out.write('\t'.join(['Cov', 'Iteration', 'z_score']) + '\n')

        for cov in covs:
            C = get_block_cov(n_genes, pos_cov=cov, neg_cov=cov)
            # Diagnostic only: reports whether every eigenvalue is positive.
            print(np.all(np.linalg.eigvals(C) > 0))
            print(C)

            for i in range(n_replicates):
                test_cov = np.stack(
                    [get_count_pop(lambda_genes, cov=C)
                     for _ in range(n_pops)],
                    axis=0)
                X = pt.hellinger_transform(test_cov)
                pca = PCA()
                pca_fit = pca.fit_transform(X)
                euc_dist = pt.get_mean_pairwise_euc_distance(pca_fit)

                sim_eucs = []
                for _ in range(n_randomisations):
                    X_j = pt.hellinger_transform(pt.random_matrix(test_cov))
                    pca_fit_j = pca.fit_transform(X_j)
                    sim_eucs.append(
                        pt.get_mean_pairwise_euc_distance(pca_fit_j))

                z_score = (euc_dist - np.mean(sim_eucs)) / np.std(sim_eucs)
                print(str(cov), ' ', str(i), ' ', str(z_score))
                df_out.write(
                    '\t'.join([str(cov), str(i), str(z_score)]) + '\n')
def two_treats_sim(iter1=1000, iter2=1000, alpha=0.05):
    """Estimate the power of the pairwise F test for two simulated treatments.

    For every reshuffle size in ``[0, 5, 10, 15, 20]`` the function runs
    ``iter1`` simulations.  In each one, gamma-distributed rates are drawn
    for 10 genes; treatment 1 uses those rates, treatment 2 uses the same
    rates after the first ``reshuf`` entries have been shuffled in place.
    Ten populations per treatment each receive 20 mutations sampled with
    replacement in proportion to the rates.  The combined count matrix is
    passed through ``pt.hellinger_transform`` and PCA, and
    ``get_F_stat_pairwise`` gives the observed statistic.  A p-value is
    obtained from ``iter2`` matrices produced by ``pt.random_matrix`` as
    ``(#null > observed + 1) / (iter2 + 1)``.

    The reported power for a reshuffle size is
    ``(#p-values < alpha + 1) / (iter1 + 1)`` and is printed to stdout.

    NOTE: slicing caps the shuffle at the number of genes, so reshuffle
    sizes 15 and 20 shuffle all 10 rates, the same as size 10.

    Args:
        iter1: number of simulated datasets per reshuffle size.
        iter2: number of randomised matrices per simulated dataset.
        alpha: significance threshold applied to the p-values.

    Returns:
        None.
    """
    n_genes = 10
    n_pops_a = n_pops_b = 10
    gamma_shape = 1
    gamma_scale = 1
    n_muts_a = n_muts_b = 20

    for reshuf in [0, 5, 10, 15, 20]:
        p_values = []
        for _ in range(iter1):
            base_rates = np.random.gamma(
                gamma_shape, scale=gamma_scale, size=n_genes)
            rates_a = base_rates.copy()
            # The slice is a view, so this permutes base_rates in place.
            shuffle(base_rates[:reshuf])
            rates_b = base_rates.copy()

            probs_a = rates_a / sum(rates_a)
            probs_b = rates_b / sum(rates_b)

            # Treatment 1 populations are all drawn before treatment 2.
            counts_a = []
            for _pop in range(n_pops_a):
                draws = np.random.choice(
                    n_genes, size=n_muts_a, replace=True, p=probs_a)
                counts_a.append(Counter(draws))
            counts_b = []
            for _pop in range(n_pops_b):
                draws = np.random.choice(
                    n_genes, size=n_muts_b, replace=True, p=probs_b)
                counts_b.append(Counter(draws))

            # Genes missing from a population's Counter become NaN on
            # concatenation and are then filled with zero counts.
            combined = pd.concat(
                [pd.DataFrame(counts_a), pd.DataFrame(counts_b)])
            count_matrix = combined.fillna(0).values

            # Row indices of each treatment within count_matrix.
            groups = [
                np.arange(n_pops_a),
                np.arange(n_pops_a, n_pops_a + n_pops_b),
            ]

            pca = PCA()
            observed_fit = pca.fit_transform(
                pt.hellinger_transform(count_matrix))
            F = get_F_stat_pairwise(observed_fit, groups)

            n_exceeding = 0
            for _perm in range(iter2):
                shuffled_counts = pt.random_matrix(count_matrix)
                null_fit = pca.fit_transform(
                    pt.hellinger_transform(shuffled_counts))
                if get_F_stat_pairwise(null_fit, groups) > F:
                    n_exceeding += 1

            p_values.append((n_exceeding + 1) / (iter2 + 1))

        n_significant = sum(1 for p in p_values if p < alpha)
        power = (n_significant + 1) / (iter1 + 1)
        print('Reshuffle = ' + str(reshuf) + ', Power ' + str(power))
def _rows_at_time_point(frame, time_point):
    """Return the rows of ``frame`` whose index label contains ``_<time_point>``."""
    # The negative look-ahead rejects labels where the digits continue, so
    # a short time point (e.g. 500) does not also select longer ones that
    # merely start with the same digits (e.g. 5000).
    pattern = '_' + str(time_point) + r'(?!\d)'
    return frame[frame.index.str.contains(pattern)]


def _permute_tenaillon(n_iter, analysis):
    """Write per-iteration statistics for the randomised Tenaillon matrix."""
    # x_stat is computed from the PCA's explained variance, so PCA is the
    # only analysis this dataset supports.  Fail before touching any file.
    if analysis != 'PCA':
        raise ValueError(
            "analysis must be 'PCA' for dataset='tenaillon', got %r"
            % (analysis,))

    k = 3
    data_dir = pt.get_path() + '/data/Tenaillon_et_al/'
    df = pd.read_csv(data_dir + 'gene_by_pop.txt', sep='\t', header='infer',
                     index_col=0)
    df_array = df.values
    column_headers = [
        'Iteration', 'MCD', 'mean_angle', 'mean_dist', 'delta_L', 'x_stat'
    ]

    with open(data_dir + 'permute_' + analysis + '.txt', 'w') as df_out:
        df_out.write('\t'.join(column_headers) + '\n')
        for i in range(n_iter):
            print(i)
            df_rndm = pd.DataFrame(data=pt.random_matrix(df_array),
                                   index=df.index, columns=df.columns)
            df_rndm_delta = pt.likelihood_matrix(
                df_rndm, 'Tenaillon_et_al').get_likelihood_matrix()

            X = pt.hellinger_transform(df_rndm_delta)
            pca = PCA()
            df_rndm_delta_out = pca.fit_transform(X)

            mean_angle = pt.get_mean_angle(df_rndm_delta_out, k=k)
            mcd = pt.get_mean_centroid_distance(df_rndm_delta_out, k=k)
            mean_length = pt.get_euc_magnitude_diff(df_rndm_delta_out, k=k)
            mean_dist = pt.get_mean_pairwise_euc_distance(
                df_rndm_delta_out, k=k)
            x_stat = pt.get_x_stat(pca.explained_variance_[:-1])

            # mean_length is reported under the 'delta_L' column.
            df_out.write('\t'.join([
                str(i), str(mcd), str(mean_angle), str(mean_dist),
                str(mean_length), str(x_stat)
            ]) + '\n')


def _permute_good(n_iter, analysis):
    """Write per-iteration, per-time-point statistics for the Good data."""
    k = 5
    data_dir = pt.get_path() + '/data/Good_et_al/'
    df = pd.read_csv(data_dir + 'gene_by_pop.txt', sep='\t', header='infer',
                     index_col=0)

    # Copy before appending so the list returned by pt is not mutated.
    lines_to_keep = list(pt.complete_nonmutator_lines())
    lines_to_keep.append('p5')
    df_nonmut = df[df.index.str.contains('|'.join(lines_to_keep))]
    # Remove columns with all zeros.
    df_nonmut = df_nonmut.loc[:, (df_nonmut != 0).any(axis=0)]

    # Index labels carry the time point as their second '_'-separated field.
    time_points = [int(x.split('_')[1]) for x in df_nonmut.index.values]
    time_points_set = sorted(set(time_points))
    first_tp = time_points_set[0]
    first_rows = [pos for pos, tp in enumerate(time_points) if tp == first_tp]
    matrix_0 = df_nonmut.iloc[first_rows]

    column_headers = [
        'Iteration', 'Generation', 'MCD', 'mean_angle', 'delta_L',
        'mean_dist'
    ]

    with open(data_dir + 'permute_' + analysis + '.txt', 'w') as df_out:
        df_out.write('\t'.join(column_headers) + '\n')
        for i in range(n_iter):
            print("Iteration " + str(i))

            # Randomise the first time point, then add a randomised
            # increment for every later time step.
            matrix_rndm = pt.random_matrix(matrix_0.values)
            df_rndm_list = [
                pd.DataFrame(data=matrix_rndm, index=matrix_0.index,
                             columns=matrix_0.columns)
            ]
            for prev_tp, tp in zip(time_points_set, time_points_set[1:]):
                df_tp_minus1 = _rows_at_time_point(df_nonmut, prev_tp)
                df_tp = _rows_at_time_point(df_nonmut, tp)
                matrix_diff = df_tp.values - df_tp_minus1.values
                # Deliberately not "+=": a fresh array per step keeps the
                # frames already stored in df_rndm_list independent.
                matrix_rndm = matrix_rndm + pt.random_matrix(matrix_diff)
                df_rndm_list.append(
                    pd.DataFrame(data=matrix_rndm, index=df_tp.index,
                                 columns=df_tp.columns))

            df_rndm = pd.concat(df_rndm_list)
            df_rndm_delta = pt.likelihood_matrix(
                df_rndm, 'Good_et_al').get_likelihood_matrix()

            if analysis == 'PCA':
                X = pt.hellinger_transform(df_rndm_delta)
                pca = PCA()
                matrix_rndm_delta_out = pca.fit_transform(X)
            elif analysis == 'cMDS':
                matrix_rndm_delta_bc = np.sqrt(
                    pt.get_bray_curtis(df_rndm_delta.values))
                matrix_rndm_delta_out = pt.cmdscale(matrix_rndm_delta_bc)[0]
            else:
                print("Analysis argument not accepted")
                continue

            df_rndm_delta_out = pd.DataFrame(data=matrix_rndm_delta_out,
                                             index=df_rndm_delta.index)
            for tp in time_points_set:
                tp_matrix = _rows_at_time_point(df_rndm_delta_out, tp).values
                mean_angle = pt.get_mean_angle(tp_matrix, k=k)
                mcd = pt.get_mean_centroid_distance(tp_matrix, k=k)
                mean_length = pt.get_euc_magnitude_diff(tp_matrix, k=k)
                mean_dist = pt.get_mean_pairwise_euc_distance(tp_matrix, k=k)
                # mean_length is reported under the 'delta_L' column.
                df_out.write('\t'.join([
                    str(i), str(tp), str(mcd), str(mean_angle),
                    str(mean_length), str(mean_dist)
                ]) + '\n')


def run_pca_permutation(iter=10000, analysis='PCA', dataset='tenaillon'):
    """Write ordination statistics for randomised gene-by-population matrices.

    ``dataset='tenaillon'``
        Reads ``data/Tenaillon_et_al/gene_by_pop.txt``.  Each iteration
        randomises the matrix with ``pt.random_matrix``, converts it with
        ``pt.likelihood_matrix(...).get_likelihood_matrix()``, applies
        ``pt.hellinger_transform`` and PCA, and writes one row of
        statistics (computed with ``k=3``) to
        ``data/Tenaillon_et_al/permute_<analysis>.txt``.  Only
        ``analysis='PCA'`` is supported.

    ``dataset='good'``
        Reads ``data/Good_et_al/gene_by_pop.txt`` and keeps the rows whose
        label matches ``pt.complete_nonmutator_lines()`` or ``'p5'``, then
        drops all-zero columns.  Each iteration randomises the first time
        point and the differences between consecutive time points, sums
        them cumulatively, and ordinates the result with PCA or, for
        ``analysis='cMDS'``, classical MDS on square-root Bray-Curtis
        distances.  One row per time point (statistics computed with
        ``k=5``) is written to ``data/Good_et_al/permute_<analysis>.txt``.
        Any other ``analysis`` prints a message on every iteration and
        writes only the header.

    Any other ``dataset`` value is a no-op.

    Args:
        iter: number of randomisations to run.
        analysis: ``'PCA'`` or, for the ``'good'`` dataset only, ``'cMDS'``.
        dataset: ``'tenaillon'`` or ``'good'``.

    Returns:
        None.

    Raises:
        ValueError: if ``dataset='tenaillon'`` and ``analysis`` is not
            ``'PCA'``.
    """
    if dataset == 'tenaillon':
        _permute_tenaillon(iter, analysis)
    elif dataset == 'good':
        _permute_good(iter, analysis)
def _fraction_below(values, threshold):
    """Return the fraction of ``values`` strictly smaller than ``threshold``."""
    return sum(1 for v in values if v < threshold) / len(values)


def run_ba_ntwk_cov_sims():
    """Run the Barabasi-Albert network covariance simulations.

    One graph is generated with ``nx.barabasi_albert_graph(50, 2)``.  For
    each covariance in ``[0.05, 0.1, 0.15, 0.2]`` its adjacency matrix is
    multiplied by that value, the diagonal is set to 1, and the result is
    passed as the ``cov`` argument of ``get_count_pop``.  For each of 1000
    replicates a count matrix (100 stacked ``get_count_pop`` draws) is
    passed through ``pt.hellinger_transform`` and PCA, and four statistics
    are computed: mean pairwise Euclidean distance, ``pt.get_x_stat`` of
    the explained variances (last component dropped), and mean centroid
    distance at ``k=1`` and ``k=3``.  The same statistics are recomputed on
    1000 matrices produced by ``pt.random_matrix``.

    Each output row holds the covariance, the replicate index, the z-score
    of the Euclidean statistic, and for each of the four statistics the
    fraction of randomised values strictly below the observed one.  Rows
    are written tab-separated to
    ``<pt.get_path()>/data/simulations/cov_ba_ntwrk_ev.txt``; a summary of
    each replicate is printed to stdout.  Returns ``None``.
    """
    n_pops = 100
    n_genes = 50
    n_replicates = 1000
    n_randomisations = 1000
    covs = [0.05, 0.1, 0.15, 0.2]
    out_path = pt.get_path() + '/data/simulations/cov_ba_ntwrk_ev.txt'

    # The context manager closes the output even if a simulation step raises.
    with open(out_path, 'w') as df_out:
        ntwk = nx.barabasi_albert_graph(n_genes, 2)
        # nx.to_numpy_matrix was removed in networkx 3.0.  Fall back to the
        # equivalent construction so the adjacency stays an np.matrix.
        to_numpy_matrix = getattr(nx, 'to_numpy_matrix', None)
        if to_numpy_matrix is not None:
            ntwk_np = to_numpy_matrix(ntwk)
        else:
            ntwk_np = np.asmatrix(nx.to_numpy_array(ntwk))

        lambda_genes = np.random.gamma(shape=3, scale=1, size=n_genes)
        df_out.write('\t'.join([
            'Cov', 'Iteration', 'euc_z_score', 'euc_percent', 'eig_percent',
            'mcd_percent_k1', 'mcd_percent_k3'
        ]) + '\n')

        for cov in covs:
            # Scalar multiplication, so this scales every adjacency entry.
            C = ntwk_np * cov
            np.fill_diagonal(C, 1)

            for i in range(n_replicates):
                test_cov = np.stack(
                    [get_count_pop(lambda_genes, cov=C)
                     for _ in range(n_pops)],
                    axis=0)
                X = pt.hellinger_transform(test_cov)
                pca = PCA()
                pca_fit = pca.fit_transform(X)

                # Observed statistics.  They must be read before the PCA
                # object is refit on the randomised matrices below.
                euc_dist = pt.get_mean_pairwise_euc_distance(pca_fit)
                eig = pt.get_x_stat(pca.explained_variance_[:-1])
                mcd_k1 = pt.get_mean_centroid_distance(pca_fit, k=1)
                mcd_k3 = pt.get_mean_centroid_distance(pca_fit, k=3)

                euc_dists = []
                eigs = []
                centroid_dists_k1 = []
                centroid_dists_k3 = []
                for _ in range(n_randomisations):
                    X_j = pt.hellinger_transform(pt.random_matrix(test_cov))
                    pca_fit_j = pca.fit_transform(X_j)
                    euc_dists.append(
                        pt.get_mean_pairwise_euc_distance(pca_fit_j))
                    centroid_dists_k1.append(
                        pt.get_mean_centroid_distance(pca_fit_j, k=1))
                    centroid_dists_k3.append(
                        pt.get_mean_centroid_distance(pca_fit_j, k=3))
                    eigs.append(pt.get_x_stat(pca.explained_variance_[:-1]))

                z_score = (euc_dist - np.mean(euc_dists)) / np.std(euc_dists)
                euc_percent = _fraction_below(euc_dists, euc_dist)
                eig_percent = _fraction_below(eigs, eig)
                centroid_percent_k1 = _fraction_below(
                    centroid_dists_k1, mcd_k1)
                centroid_percent_k3 = _fraction_below(
                    centroid_dists_k3, mcd_k3)

                print(cov, i, z_score, euc_percent, eig_percent)
                df_out.write('\t'.join([
                    str(cov), str(i), str(z_score), str(euc_percent),
                    str(eig_percent), str(centroid_percent_k1),
                    str(centroid_percent_k3)
                ]) + '\n')