def run_ba_cov_neutral_sims(shape=1, scale=1, G=50, N=50, iter1=1000, iter2=1000):
    """Simulate correlated + neutral count matrices and score them against a null.

    For every neutral rate on a descending log-spaced grid (10 down to 0.01,
    20 values) and every covariance in the hard-coded list ``[0.2]``, ``iter1``
    count matrices are simulated.  Each matrix is the element-wise sum of

    * ``N`` draws of ``pt.get_count_pop`` using gamma-distributed per-gene
      rates and the matrix returned by ``pt.get_ba_cov_matrix(G, cov)``, and
    * ``N`` draws of ``pt.get_count_pop`` using the constant neutral rate and
      an identity matrix.

    The observed mean pairwise distance of the PCA scores is then compared to
    ``iter2`` matrices produced by ``pt.get_random_matrix``; the fraction of
    null distances strictly smaller than the observed one is written out.

    Output is a tab-separated file at
    ``pt.get_path() + '/data/simulations/ba_cov_neutral_sims.txt'`` with the
    columns N, G, lamba_mean, lambda_neutral, Cov, Iteration, dist_percent.

    Args:
        shape: Shape parameter of the gamma distribution for the gene rates.
        scale: Scale parameter of the gamma distribution for the gene rates.
        G: Number of genes (columns of the count matrix).
        N: Number of populations (rows of the count matrix).
        iter1: Number of simulated matrices per parameter combination.
        iter2: Number of null matrices per simulated matrix.
    """
    header = ['N', 'G', 'lamba_mean', 'lambda_neutral', 'Cov', 'Iteration',
              'dist_percent']
    covs = [0.2]
    mean_gamma = shape * scale
    # Largest neutral rate first.
    neutral_range = np.logspace(-2, 1, num=20, endpoint=True, base=10.0)[::-1]
    out_path = pt.get_path() + '/data/simulations/ba_cov_neutral_sims.txt'
    # The context manager closes the file even if a simulation step raises.
    with open(out_path, 'w') as df_out:
        df_out.write('\t'.join(header) + '\n')
        for neutral_ in neutral_range:
            for cov in covs:
                for i in range(iter1):
                    C = pt.get_ba_cov_matrix(G, cov)
                    lambda_genes = np.random.gamma(shape=shape, scale=scale,
                                                   size=G)
                    lambda_genes_null = np.asarray([neutral_] * G)
                    test_cov_adapt = np.stack(
                        [pt.get_count_pop(lambda_genes, C=C)
                         for _ in range(N)], axis=0)
                    # Identity matrix (ones on the diagonal) for the neutral part.
                    test_cov_neutral = np.stack(
                        [pt.get_count_pop(lambda_genes_null, C=np.identity(G))
                         for _ in range(N)], axis=0)
                    test_cov = test_cov_adapt + test_cov_neutral

                    pca = PCA()
                    pca_fit = pca.fit_transform(pt.hellinger_transform(test_cov))
                    euc_dist = pt.get_mean_pairwise_euc_distance(pca_fit)

                    # Null distribution; the same PCA object is refit each time.
                    euc_dists = []
                    for _ in range(iter2):
                        X_j = pt.hellinger_transform(
                            pt.get_random_matrix(test_cov))
                        pca_fit_j = pca.fit_transform(X_j)
                        euc_dists.append(
                            pt.get_mean_pairwise_euc_distance(pca_fit_j))
                    n_smaller = sum(1 for k in euc_dists if k < euc_dist)
                    euc_percent = n_smaller / len(euc_dists)

                    print(neutral_, cov, i, euc_percent)
                    record = [N, G, mean_gamma, neutral_, cov, i, euc_percent]
                    df_out.write('\t'.join(map(str, record)) + '\n')
def run_cov_neutral_sims(out_name, covs=[0.1, 0.15, 0.2], shape=1, scale=1, G=50, N=50, iter1=1000, iter2=1000):
    """Simulate correlated + neutral count matrices and test each against a null.

    For every neutral rate on a descending log-spaced grid (10 down to 0.01,
    20 values) and every covariance in ``covs``, ``iter1`` count matrices are
    simulated as the sum of a correlated component (gamma-distributed gene
    rates, matrix from ``pt.get_ba_cov_matrix``) and a neutral component
    (constant rate, identity matrix).  A draw is repeated until no row of the
    summed matrix is all zero; all-zero columns are then dropped and the
    matrix is handed to ``pt.matrix_vs_null_one_treat(test_cov, iter2)``.

    Output is a tab-separated file at ``out_name`` with the columns
    N, G, lamba_mean, lambda_neutral, Cov, Iteration, dist_percent, z_score.

    Args:
        out_name: Path of the output file (opened for writing).
        covs: Covariance values to simulate; only iterated, never mutated.
        shape: Shape parameter of the gamma distribution for the gene rates.
        scale: Scale parameter of the gamma distribution for the gene rates.
        G: Number of genes (columns).
        N: Number of populations (rows).
        iter1: Number of simulated matrices per parameter combination.
        iter2: Iteration count forwarded to ``pt.matrix_vs_null_one_treat``.
    """
    header = ['N', 'G', 'lamba_mean', 'lambda_neutral', 'Cov', 'Iteration',
              'dist_percent', 'z_score']
    mean_gamma = shape * scale
    # Largest neutral rate first.
    neutral_range = np.logspace(-2, 1, num=20, endpoint=True, base=10.0)[::-1]
    # The context manager closes the file even if a simulation step raises.
    with open(out_name, 'w') as df_out:
        df_out.write('\t'.join(header) + '\n')
        for neutral_ in neutral_range:
            for cov in covs:
                for i in range(iter1):
                    C = pt.get_ba_cov_matrix(G, cov)
                    # Redraw until every population (row) has at least one count.
                    while True:
                        lambda_genes = np.random.gamma(shape=shape,
                                                       scale=scale, size=G)
                        lambda_genes_null = np.asarray([neutral_] * G)
                        test_cov_adapt = np.stack(
                            [pt.get_count_pop(lambda_genes, C=C)
                             for _ in range(N)], axis=0)
                        # Identity matrix (ones on the diagonal) for the
                        # neutral part.
                        test_cov_neutral = np.stack(
                            [pt.get_count_pop(lambda_genes_null,
                                              C=np.identity(G))
                             for _ in range(N)], axis=0)
                        test_cov = test_cov_adapt + test_cov_neutral
                        if not np.any(test_cov.sum(axis=1) == 0):
                            break
                    # Drop genes (columns) that received no counts at all.
                    test_cov = test_cov[:, ~np.all(test_cov == 0, axis=0)]
                    euc_percent, z_score = pt.matrix_vs_null_one_treat(
                        test_cov, iter2)
                    record = [N, G, mean_gamma, neutral_, cov, i,
                              euc_percent, z_score]
                    df_out.write('\t'.join(map(str, record)) + '\n')
                # Progress report once per (neutral rate, covariance) pair.
                print(neutral_, cov)
def run_ba_cov_sims(gene_list, pop_list, out_name, iter1=1000, iter2=1000):
    """Simulate correlated count matrices over a grid of sizes and covariances.

    For every gene count in ``gene_list``, population count in ``pop_list``
    and covariance in the hard-coded list ``[0.1, 0.15, 0.2]``, ``iter1``
    count matrices are simulated with unit-shape, unit-scale gamma gene rates.
    A draw is repeated until no row is all zero.  The mean pairwise distance
    of the PCA scores is compared with ``iter2`` matrices produced by
    ``pt.get_random_matrix``; the fraction of null distances strictly smaller
    than the observed distance is written out.

    Output is a tab-separated file at
    ``pt.get_path() + '/data/simulations/' + out_name + '.txt'`` with the
    columns N, G, Cov, Iteration, dist_percent.

    Args:
        gene_list: Iterable of gene counts (G) to simulate.
        pop_list: Iterable of population counts (N) to simulate.
        out_name: Base name of the output file, without extension.
        iter1: Number of simulated matrices per parameter combination.
        iter2: Number of null matrices per simulated matrix.
    """
    covs = [0.1, 0.15, 0.2]
    out_path = pt.get_path() + '/data/simulations/' + out_name + '.txt'
    # The context manager closes the file even if a simulation step raises.
    with open(out_path, 'w') as df_out:
        df_out.write(
            '\t'.join(['N', 'G', 'Cov', 'Iteration', 'dist_percent']) + '\n')
        for G in gene_list:
            for N in pop_list:
                for cov in covs:
                    for i in range(iter1):
                        C = pt.get_ba_cov_matrix(G, cov)
                        # Redraw until every population (row) has a count.
                        while True:
                            lambda_genes = np.random.gamma(shape=1, scale=1,
                                                           size=G)
                            # NOTE(review): sibling functions pass this matrix
                            # as ``C=``; confirm ``cov=`` matches the current
                            # signature of pt.get_count_pop.
                            test_cov = np.stack(
                                [pt.get_count_pop(lambda_genes, cov=C)
                                 for _ in range(N)], axis=0)
                            if not np.any(test_cov.sum(axis=1) == 0):
                                break

                        pca = PCA()
                        pca_fit = pca.fit_transform(
                            pt.hellinger_transform(test_cov))
                        euc_dist = pt.get_mean_pairwise_euc_distance(pca_fit)

                        # Null distribution; the same PCA object is refit.
                        euc_dists = []
                        for _ in range(iter2):
                            X_j = pt.hellinger_transform(
                                pt.get_random_matrix(test_cov))
                            pca_fit_j = pca.fit_transform(X_j)
                            euc_dists.append(
                                pt.get_mean_pairwise_euc_distance(pca_fit_j))
                        n_smaller = sum(1 for k in euc_dists if k < euc_dist)
                        euc_percent = n_smaller / len(euc_dists)

                        print(N, G, cov, i, euc_percent)
                        record = [N, G, cov, i, euc_percent]
                        df_out.write('\t'.join(map(str, record)) + '\n')
def run_cov_rho_sims(out_name, covs=[0.1, 0.15, 0.2], rhos=[-0.2, 0, 0.2], shape=1, scale=1, G=50, N=50, iter1=10, iter2=1000):
    """Simulate count matrices over a grid of covariance and rho values.

    For every covariance in ``covs`` and every target rho in ``rhos``,
    ``iter1`` matrices are simulated.  ``pt.get_ba_cov_matrix`` is called with
    ``n_genes``, ``cov`` and ``rho`` and is expected to return a pair
    ``(C, rho_estimated)``.  A draw is repeated until no row of the count
    matrix is all zero; all-zero columns are then dropped and the matrix is
    handed to ``pt.matrix_vs_null_one_treat(test_cov, iter2)``.

    Output is a tab-separated file at ``out_name`` with the columns
    N, G, Cov, Rho_goal, Rho_estimated, Iteration, dist_percent, z_score.

    Args:
        out_name: Path of the output file (opened for writing).
        covs: Covariance values to simulate; only iterated, never mutated.
        rhos: Target rho values to simulate; only iterated, never mutated.
        shape: Shape parameter of the gamma distribution for the gene rates.
        scale: Scale parameter of the gamma distribution for the gene rates.
        G: Number of genes (columns).
        N: Number of populations (rows).
        iter1: Number of simulated matrices per parameter combination.
        iter2: Iteration count forwarded to ``pt.matrix_vs_null_one_treat``.
    """
    header = ['N', 'G', 'Cov', 'Rho_goal', 'Rho_estimated', 'Iteration',
              'dist_percent', 'z_score']
    # The context manager closes the file even if a simulation step raises.
    with open(out_name, 'w') as df_out:
        df_out.write('\t'.join(header) + '\n')
        for cov in covs:
            for rho in rhos:
                for i in range(iter1):
                    C, rho_estimated = pt.get_ba_cov_matrix(
                        n_genes=G, cov=cov, rho=rho)
                    # Redraw until every population (row) has a count.
                    while True:
                        # Honour the caller's shape/scale; these were
                        # previously hard-coded to 1, which silently ignored
                        # the parameters (defaults give the same draws).
                        lambda_genes = np.random.gamma(shape=shape,
                                                       scale=scale, size=G)
                        test_cov = np.stack(
                            [pt.get_count_pop(lambda_genes, C=C)
                             for _ in range(N)], axis=0)
                        if not np.any(test_cov.sum(axis=1) == 0):
                            break
                    # Drop genes (columns) that received no counts at all.
                    test_cov = test_cov[:, ~np.all(test_cov == 0, axis=0)]
                    euc_percent, z_score = pt.matrix_vs_null_one_treat(
                        test_cov, iter2)
                    record = [N, G, cov, rho, rho_estimated, i,
                              euc_percent, z_score]
                    df_out.write('\t'.join(map(str, record)) + '\n')
                    print(N, G, cov, rho, rho_estimated, i)
def run_cov_dist_sims_unequal(out_name, to_reshuffle=[5], N1=20, N2=20, covs_12=[0.05], G=100, shape=1, scale=1, iter1=10, iter2=1000):
    """Simulate two treatments whose gene rates differ by swapped pairs.

    For every value in ``to_reshuffle`` and every covariance in ``covs_12``,
    ``iter1`` data sets are simulated.  Both treatments start from the same
    gamma-distributed rate vector; in the second treatment adjacent rates are
    swapped pairwise at indices ``0, 2, 4, ... < reshuf``.  ``N1`` and ``N2``
    populations are drawn with ``pt.get_count_pop`` and the draw is repeated
    until neither treatment contains an all-zero row.  The stacked matrix,
    with all-zero columns dropped, is passed to
    ``pt.matrix_vs_null_two_treats``.

    Output is a tab-separated file at ``out_name`` with the columns
    N1, N2, G, Reshuf, Cov, Iteration, Euc_dist, F_2_percent, F_2_z_score,
    V_1_percent, V_1_z_score, V_2_percent, V_2_z_score.

    Args:
        out_name: Path of the output file (opened for writing).
        to_reshuffle: Upper index bounds for the pairwise swaps; each value
            must leave ``j + 1`` inside the rate vector for every swapped j.
        N1: Number of populations in treatment 1.
        N2: Number of populations in treatment 2.
        covs_12: Covariance values to simulate.
        G: Number of genes (columns).
        shape: Shape parameter of the gamma distribution for the gene rates.
        scale: Scale parameter of the gamma distribution for the gene rates.
        iter1: Number of simulated data sets per parameter combination.
        iter2: Passed as ``iter`` to ``pt.matrix_vs_null_two_treats``.
    """
    header = ['N1', 'N2', 'G', 'Reshuf', 'Cov', 'Iteration', 'Euc_dist',
              'F_2_percent', 'F_2_z_score', 'V_1_percent', 'V_1_z_score',
              'V_2_percent', 'V_2_z_score']
    # The context manager closes the file even if a simulation step raises.
    with open(out_name, 'w') as df_out:
        df_out.write('\t'.join(header) + '\n')
        # TODO: rewrite the covariance-matrix code (original note was left
        # unfinished).
        for reshuf in to_reshuffle:
            # Iterate the ``covs_12`` parameter; the previous code looped over
            # an undefined name ``covs`` and never used this argument.
            for cov in covs_12:
                reshuf_list = []
                for i in range(iter1):
                    C = pt.get_ba_cov_matrix(G, cov)
                    while True:
                        rates = np.random.gamma(shape, scale=scale, size=G)
                        rates1 = rates.copy()
                        rates2 = rates.copy()
                        # TODO: fix this so the same pairs are not resampled.
                        for j in range(0, reshuf, 2):
                            rates2[j], rates2[j + 1] = rates2[j + 1], rates2[j]
                        counts1 = np.stack(
                            [pt.get_count_pop(rates1, C=C)
                             for _ in range(N1)], axis=0)
                        counts2 = np.stack(
                            [pt.get_count_pop(rates2, C=C)
                             for _ in range(N2)], axis=0)
                        # Accept only when BOTH treatments are free of empty
                        # rows (the earlier ``or`` accepted a draw when just
                        # one of them was).
                        empty_row_1 = np.any(counts1.sum(axis=1) == 0)
                        empty_row_2 = np.any(counts2.sum(axis=1) == 0)
                        if not (empty_row_1 or empty_row_2):
                            break

                    euc_dist = np.linalg.norm(rates1 - rates2)
                    count_matrix = np.concatenate((counts1, counts2), axis=0)
                    # Drop genes (columns) that received no counts at all.
                    count_matrix = count_matrix[
                        :, ~np.all(count_matrix == 0, axis=0)]
                    (F_2_percent, F_2_z_score,
                     V_1_percent, V_1_z_score,
                     V_2_percent, V_2_z_score) = pt.matrix_vs_null_two_treats(
                         count_matrix, N1, N2, iter=iter2)
                    reshuf_list.append(euc_dist)

                    print(reshuf, cov, i, F_2_percent, F_2_z_score, euc_dist,
                          V_1_percent, V_2_percent)
                    record = [N1, N2, G, reshuf, cov, i, euc_dist,
                              F_2_percent, F_2_z_score,
                              V_1_percent, V_1_z_score,
                              V_2_percent, V_2_z_score]
                    df_out.write('\t'.join(map(str, record)) + '\n')
                # Mean rate-vector distance for this covariance value.
                print(cov, np.mean(reshuf_list))
def run_ba_ntwk_cluster_sims(iter1=1000, iter2=1000, cov=0.2):
    """Compare five test statistics across values of the network parameter p.

    For each ``p`` in ``[0, 0.2, 0.4, 0.6, 0.8, 1]``, ``iter1`` count matrices
    (100 populations x 50 genes) are simulated from gamma(shape=3, scale=1)
    gene rates and the matrix returned by
    ``pt.get_ba_cov_matrix(n_genes, cov=cov, p=p)``, which is expected to
    return a pair ``(C, cc)``.  Each matrix is converted to row-wise relative
    abundances, column-centred and run through PCA.  Five statistics are
    computed (Eig, MCD_k1, MCD_k3, MPD_k1, MPD_k3) and compared to ``iter2``
    matrices from ``pt.get_random_matrix``, giving a p-value (fraction of null
    values strictly greater than the observed value) and a z-score per
    statistic.

    Per ``p`` and statistic, one row is written with: the mean of ``cc`` and
    the 2.5% / 97.5% points of 10000 bootstrap means (resample size 50), the
    fraction of p-values below 0.05 ("Power") with the interval returned by
    ``get_bootstrap_power_ci``, and the mean z-score with the interval
    returned by ``get_bootstrap_ci``.

    Output is a tab-separated file at
    ``mydir + '/data/simulations/cov_ba_ntwrk_cluster_methods.txt'``.

    Args:
        iter1: Number of simulated matrices per value of ``p``.
        iter2: Number of null matrices per simulated matrix.
        cov: Covariance value forwarded to ``pt.get_ba_cov_matrix``.
    """
    n_pops = 100
    n_genes = 50
    n_boot = 10000
    ps = [0, 0.2, 0.4, 0.6, 0.8, 1]
    # Order fixes both the bootstrap call order and the output row order.
    methods = ['Eig', 'MCD_k1', 'MCD_k3', 'MPD_k1', 'MPD_k3']
    header = ['Prob', 'CC_mean', 'CC_025', 'CC_975', 'Method', 'Power',
              'Power_025', 'Power_975', 'Z_mean', 'Z_025', 'Z_975']

    def _centered_relative(counts):
        # Row-normalise to relative abundances, then centre each column.
        rel = counts / counts.sum(axis=1)[:, None]
        rel -= np.mean(rel, axis=0)
        return rel

    def _statistics(pca_model, scores):
        # The five test statistics for one fitted PCA.
        return {
            'Eig': pt.get_x_stat(pca_model.explained_variance_[:-1],
                                 n_features=n_genes),
            'MCD_k1': pt.get_mean_centroid_distance(scores, k=1),
            'MCD_k3': pt.get_mean_centroid_distance(scores, k=3),
            'MPD_k1': pt.get_mean_pairwise_euc_distance(scores, k=1),
            'MPD_k3': pt.get_mean_pairwise_euc_distance(scores, k=3),
        }

    out_path = mydir + '/data/simulations/cov_ba_ntwrk_cluster_methods.txt'
    # The context manager closes the file even if a simulation step raises.
    with open(out_path, 'w') as df_out:
        df_out.write('\t'.join(header) + '\n')
        for p in ps:
            p_values = {m: [] for m in methods}
            z_scores = {m: [] for m in methods}
            cc_list = []
            for i in range(iter1):
                if i % 100 == 0:
                    # Report the current p (previously printed the whole list).
                    print(p, i)
                lambda_genes = np.random.gamma(shape=3, scale=1, size=n_genes)
                C, cc = pt.get_ba_cov_matrix(n_genes, cov=cov, p=p)
                test_cov = np.stack(
                    [pt.get_count_pop(lambda_genes, cov=C)
                     for _ in range(n_pops)], axis=0)
                pca = PCA()
                pca_fit = pca.fit_transform(_centered_relative(test_cov))
                observed = _statistics(pca, pca_fit)

                # Null distribution: a fresh PCA is fit to every random matrix.
                nulls = {m: [] for m in methods}
                for _ in range(iter2):
                    test_cov_rndm = pt.get_random_matrix(test_cov)
                    pca_j = PCA()
                    pca_fit_j = pca_j.fit_transform(
                        _centered_relative(test_cov_rndm))
                    for m, value in _statistics(pca_j, pca_fit_j).items():
                        nulls[m].append(value)

                cc_list.append(cc)
                for m in methods:
                    n_greater = sum(1 for k in nulls[m] if k > observed[m])
                    # The null holds iter2 values, so that is the denominator
                    # (the earlier code divided by iter1).
                    p_values[m].append(n_greater / iter2)
                    z_scores[m].append(
                        (observed[m] - np.mean(nulls[m])) / np.std(nulls[m]))

            # Bootstrap interval for the mean of cc (resample size fixed at 50).
            cc_mean = np.mean(cc_list)
            cc_bs_mean_list = sorted(
                np.mean(np.random.choice(cc_list, size=50, replace=True))
                for _ in range(n_boot))
            cc_975 = cc_bs_mean_list[int(0.975 * n_boot)]
            cc_025 = cc_bs_mean_list[int(0.025 * n_boot)]

            # Power = share of simulations with p < 0.05.
            power = {}
            power_ci = {}
            for m in methods:
                power[m] = sum(1 for q in p_values[m] if q < 0.05) / iter1
                power_ci[m] = get_bootstrap_power_ci(p_values[m])
            z_ci = {}
            for m in methods:
                z_ci[m] = get_bootstrap_ci(z_scores[m])

            for m in methods:
                power_025, power_975 = power_ci[m]
                z_025, z_975 = z_ci[m]
                record = [p, cc_mean, cc_025, cc_975, m, power[m], power_025,
                          power_975, np.mean(z_scores[m]), z_025, z_975]
                df_out.write('\t'.join(map(str, record)) + '\n')
def run_ba_ntwk_cov_sims(iter1=1000, iter2=1000, n_pops=100, n_genes=50):
    """Compare five test statistics across covariance values.

    For each covariance in ``[0.05, 0.1, 0.15, 0.2]``, ``iter1`` count
    matrices (``n_pops`` x ``n_genes``) are simulated from
    gamma(shape=3, scale=1) gene rates and the matrix returned by
    ``pt.get_ba_cov_matrix(n_genes, cov=cov)``.  Each matrix is converted to
    row-wise relative abundances, column-centred and run through PCA.  Five
    statistics are computed (Eig, MCD_k1, MCD_k3, MPD_k1, MPD_k3) and compared
    to ``iter2`` matrices from ``pt.get_random_matrix``, giving a p-value
    (fraction of null values strictly greater than the observed value) and a
    z-score per statistic.

    Per covariance and statistic, one row is written with the fraction of
    p-values below 0.05 ("Power") and the interval returned by
    ``get_bootstrap_power_ci``, plus the mean z-score and the interval
    returned by ``get_bootstrap_ci``.

    Output is a tab-separated file at
    ``mydir + '/data/simulations/cov_ba_ntwrk_methods.txt'``.

    Args:
        iter1: Number of simulated matrices per covariance value.
        iter2: Number of null matrices per simulated matrix.
        n_pops: Number of populations (rows).
        n_genes: Number of genes (columns).
    """
    covs = [0.05, 0.1, 0.15, 0.2]
    # Order fixes both the bootstrap call order and the output row order.
    methods = ['Eig', 'MCD_k1', 'MCD_k3', 'MPD_k1', 'MPD_k3']
    header = ['Cov', 'Method', 'Power', 'Power_025', 'Power_975', 'Z_mean',
              'Z_025', 'Z_975']

    def _centered_relative(counts):
        # Row-normalise to relative abundances, then centre each column.
        rel = counts / counts.sum(axis=1)[:, None]
        rel -= np.mean(rel, axis=0)
        return rel

    def _statistics(pca_model, scores):
        # The five test statistics for one fitted PCA.
        return {
            'Eig': pt.get_x_stat(pca_model.explained_variance_[:-1],
                                 n_features=n_genes),
            'MCD_k1': pt.get_mean_centroid_distance(scores, k=1),
            'MCD_k3': pt.get_mean_centroid_distance(scores, k=3),
            'MPD_k1': pt.get_mean_pairwise_euc_distance(scores, k=1),
            'MPD_k3': pt.get_mean_pairwise_euc_distance(scores, k=3),
        }

    out_path = mydir + '/data/simulations/cov_ba_ntwrk_methods.txt'
    # The context manager closes the file even if a simulation step raises.
    with open(out_path, 'w') as df_out:
        df_out.write('\t'.join(header) + '\n')
        for cov in covs:
            p_values = {m: [] for m in methods}
            z_scores = {m: [] for m in methods}
            for i in range(iter1):
                if i % 100 == 0:
                    print(cov, i)
                lambda_genes = np.random.gamma(shape=3, scale=1, size=n_genes)
                C = pt.get_ba_cov_matrix(n_genes, cov=cov)
                test_cov = np.stack(
                    [pt.get_count_pop(lambda_genes, cov=C)
                     for _ in range(n_pops)], axis=0)
                pca = PCA()
                pca_fit = pca.fit_transform(_centered_relative(test_cov))
                observed = _statistics(pca, pca_fit)

                # Null distribution: a fresh PCA is fit to every random matrix.
                nulls = {m: [] for m in methods}
                for _ in range(iter2):
                    test_cov_rndm = pt.get_random_matrix(test_cov)
                    pca_j = PCA()
                    pca_fit_j = pca_j.fit_transform(
                        _centered_relative(test_cov_rndm))
                    for m, value in _statistics(pca_j, pca_fit_j).items():
                        nulls[m].append(value)

                for m in methods:
                    n_greater = sum(1 for k in nulls[m] if k > observed[m])
                    # The null holds iter2 values, so that is the denominator
                    # (the earlier code divided by iter1).
                    p_values[m].append(n_greater / iter2)
                    z_scores[m].append(
                        (observed[m] - np.mean(nulls[m])) / np.std(nulls[m]))

            # Power = share of simulations with p < 0.05.
            power = {}
            power_ci = {}
            for m in methods:
                power[m] = sum(1 for q in p_values[m] if q < 0.05) / iter1
                power_ci[m] = get_bootstrap_power_ci(p_values[m])
            z_ci = {}
            for m in methods:
                z_ci[m] = get_bootstrap_ci(z_scores[m])

            for m in methods:
                power_025, power_975 = power_ci[m]
                z_025, z_975 = z_ci[m]
                record = [cov, m, power[m], power_025, power_975,
                          np.mean(z_scores[m]), z_025, z_975]
                df_out.write('\t'.join(map(str, record)) + '\n')