def modular_ntwrk():
    """Generate three LFR benchmark graphs (varying the mixing parameter mu)
    and save their adjacency matrices as tab-delimited files under /data.

    All graphs share n=250 nodes, tau1=3, tau2=1.5, average_degree=5,
    min_community=20, and seed=10; only mu differs between the calls.
    """
    G_025 = nx.algorithms.community.LFR_benchmark_graph(n=250, tau1=3, tau2=1.5, mu=0.25, average_degree=5, min_community=20, seed=10)
    np.savetxt(pt.get_path() + '/data/modular_ntwrk_mu_025.txt', nx.to_numpy_matrix(G_025), delimiter="\t")
    G_015 = nx.algorithms.community.LFR_benchmark_graph(n=250, tau1=3, tau2=1.5, mu=0.15, average_degree=5, min_community=20, seed=10)
    np.savetxt(pt.get_path() + '/data/modular_ntwrk_mu_015.txt', nx.to_numpy_matrix(G_015), delimiter="\t")
    # NOTE(review): mu=0.01 here but the output file is named "mu_010" --
    # possibly mu=0.10 was intended; confirm before regenerating the data.
    G_010 = nx.algorithms.community.LFR_benchmark_graph(n=250, tau1=3, tau2=1.5, mu=0.01, average_degree=5, min_community=20, seed=10)
    np.savetxt(pt.get_path() + '/data/modular_ntwrk_mu_010.txt', nx.to_numpy_matrix(G_010), delimiter="\t")
    # Community sets of the last graph; computed but never used or returned.
    communities = {frozenset(G_010.nodes[v]['community']) for v in G_010}
def run_ntwrk_cov_sims(var=1, cov=0.25):
    """Simulate count matrices whose gene covariance follows the mu_010
    modular network, and record one permutation z-score per iteration.

    For each of 100 iterations: draw a population-by-gene count matrix,
    compute the mean pairwise Euclidean distance in PCA space, compare it
    to 1000 permuted matrices, and write the resulting z-score.
    """
    out_file = open(pt.get_path() + '/data/simulations/cov_ntwrk_euc_pos_only_010.txt', 'w')
    n_pops = 20
    n_genes = 250
    lambda_genes = np.random.gamma(shape=3, scale=1, size=n_genes)
    out_file.write('\t'.join(['Cov', 'Iteration', 'z_score']) + '\n')
    # Turn the network adjacency matrix into a covariance matrix.
    C = np.loadtxt(pt.get_path() + '/data/modular_ntwrk_mu_010.txt', delimiter='\t')
    C = C * cov
    np.fill_diagonal(C, var)
    for iteration in range(100):
        counts = np.stack([get_count_pop(lambda_genes, cov=C) for _ in range(n_pops)], axis=0)
        pca = PCA()
        observed = pt.get_mean_pairwise_euc_distance(pca.fit_transform(pt.hellinger_transform(counts)))
        null_dists = []
        for _ in range(1000):
            permuted = pt.hellinger_transform(pt.random_matrix(counts))
            null_dists.append(pt.get_mean_pairwise_euc_distance(pca.fit_transform(permuted)))
        z_score = (observed - np.mean(null_dists)) / np.std(null_dists)
        print(str(cov), ' ', str(iteration), ' ', str(z_score))
        out_file.write('\t'.join([str(cov), str(iteration), str(z_score)]) + '\n')
    out_file.close()
def poisson_power_G(alpha=0.05):
    """Plot statistical power versus number of genes (G) for each covariance
    level in the BA-covariance simulation output, saving poisson_power_G.png.

    Power is the fraction of simulations whose `dist_percent` is at or above
    the 1 - alpha quantile.
    """
    fig = plt.figure()
    df = pd.read_csv(pt.get_path() + '/data/simulations/ba_cov_G_sims.txt', sep='\t')
    covs = np.sort(list(set(df.Cov.values)))
    Ns = np.sort(list(set(df.G.values)))
    colors = ['powderblue', 'royalblue', 'navy']
    threshold = 1 - alpha
    for color_idx, cov_level in enumerate(covs):
        powers = []
        for n_genes in Ns:
            subset = df[(df['Cov'] == cov_level) & (df['G'] == n_genes)]
            percents = subset['dist_percent'].values
            n_sig = len([v for v in percents if v >= threshold])
            powers.append(n_sig / len(percents))
        plt.plot(np.asarray(Ns), np.asarray(powers), linestyle='--', marker='o', color=colors[color_idx], label=r'$\mathrm{cov}=$' + str(cov_level))
    plt.tight_layout()
    plt.legend(loc='upper left', fontsize=14)
    plt.xlabel('Number of genes, ' + r'$\mathrm{log}_{2}$', fontsize=16)
    # NOTE(review): `basex` was removed in matplotlib >= 3.3 (now `base`);
    # kept as-is for compatibility with the environment this was written for.
    plt.xscale('log', basex=2)
    plt.axhline(0.05, color='dimgrey', lw=2, ls='--')
    plt.ylabel(r'$ \mathrm{P}\left ( \mathrm{reject} \; H_{0} \mid H_{1} \; \mathrm{is}\, \mathrm{true}, \, \alpha=0.05 \right ) $', fontsize=16)
    fig_name = pt.get_path() + '/figs/poisson_power_G.png'
    fig.savefig(fig_name, bbox_inches="tight", pad_inches=0.4, dpi=600)
    plt.close()
def rndm_sample_tenaillon(iter1=1000, iter2=1000):
    """Significance of multivariate divergence in the Tenaillon et al.
    gene-by-population matrix as a function of sample size N.

    For each N (2..38 in steps of 2) and each of `iter1` resamples:
    bootstrap N populations, compute the mean pairwise Euclidean distance
    in PCA space of the likelihood-transformed matrix, and compare it
    against `iter2` permuted matrices.  Writes one row per resample:
    N, non-zero gene count G, iteration, the percentile of the observed
    distance in the null, and its z-score.
    """
    df_path = pt.get_path() + '/data/Tenaillon_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    df_np = df.values
    gene_names = df.columns.values
    n_rows = df_np.shape[0]
    df_out = open(pt.get_path() + '/data/Tenaillon_et_al/sample_size_sim.txt', 'w')
    df_out.write('\t'.join(['N', 'G', 'Iteration', 'dist_percent', 'z_score']) + '\n')
    Ns = list(range(2, 40, 2))
    for N in Ns:
        for i in range(iter1):
            #df_np_i = df_np[np.random.choice(n_rows, N, replace=False), :]
            #df_np_i = df_np_i[: , ~np.all(df_np_i == 0, axis=0)]
            #df_i = df.sample(N)
            # np.random.randint samples rows WITH replacement (bootstrap).
            df_np_i = df_np[np.random.randint(n_rows, size=N), :]
            gene_bool = np.all(df_np_i == 0, axis=0)
            # flip around to select gene_size
            gene_names_i = list(compress(gene_names, list(map(operator.not_, gene_bool))))
            # Drop genes with zero counts across the resampled populations.
            df_np_i = df_np_i[:, ~np.all(df_np_i == 0, axis=0)]
            #df_i = df_i.loc[:, (df_i != 0).any(axis=0)]
            np.seterr(divide='ignore')
            df_np_i_delta = pt.likelihood_matrix_array(df_np_i, gene_names_i, 'Tenaillon_et_al').get_likelihood_matrix()
            X = pt.hellinger_transform(df_np_i_delta)
            pca = PCA()
            pca_fit = pca.fit_transform(X)
            euc_dist = pt.get_mean_pairwise_euc_distance(pca_fit)
            euc_dists = []
            for j in range(iter2):
                #df_np_i_j = pt.random_matrix(df_np_i)
                df_np_i_j = pt.get_random_matrix(df_np_i)
                np.seterr(divide='ignore')
                df_np_i_j_delta = pt.likelihood_matrix_array(df_np_i_j, gene_names_i, 'Tenaillon_et_al').get_likelihood_matrix()
                #df_i_j = pd.DataFrame(data=pt.random_matrix(df_np_i_j), index=df_i.index, columns=df_i.columns)
                #df_i_j_delta = pt.likelihood_matrix(df_i_j, 'Tenaillon_et_al').get_likelihood_matrix()
                X_j = pt.hellinger_transform(df_np_i_j_delta)
                pca_fit_j = pca.fit_transform(X_j)
                euc_dists.append(pt.get_mean_pairwise_euc_distance(pca_fit_j))
            G = df_np_i.shape[1]
            # Fraction of null distances below the observed distance.
            euc_percent = len([k for k in euc_dists if k < euc_dist ]) / len(euc_dists)
            z_score = (euc_dist - np.mean(euc_dists)) / np.std(euc_dists)
            print(str(N), str(i), str(G), str(euc_percent), str(z_score))
            df_out.write('\t'.join([ str(N), str(G), str(i), str(euc_percent), str(z_score) ]) + '\n')
    df_out.close()
def run_all_sims():
    """Run Euclidean-distance permutation simulations for three scalar gene
    covariances (0.5, 0, -0.5) and write one z-score row per iteration.
    """
    output = open(pt.get_path() + '/data/simulations/cov_euc.txt', 'w')
    n_pops = 20
    n_genes = 50
    lambda_genes = np.random.gamma(shape=3, scale=1, size=n_genes)
    output.write('\t'.join(['Covariance', 'Iteration', 'z_score']) + '\n')
    for cov in [0.5, 0, -0.5]:
        for iteration in range(100):
            print(str(cov) + ' ' + str(iteration))
            counts = np.stack([get_count_pop(lambda_genes, cov=cov) for _ in range(n_pops)], axis=0)
            pca = PCA()
            observed = pt.get_euclidean_distance(pca.fit_transform(pt.hellinger_transform(counts)))
            null_dists = []
            for _ in range(1000):
                shuffled = pt.hellinger_transform(pt.random_matrix(counts))
                null_dists.append(pt.get_euclidean_distance(pca.fit_transform(shuffled)))
            z_score = (observed - np.mean(null_dists)) / np.std(null_dists)
            output.write('\t'.join([str(cov), str(iteration), str(z_score)]) + '\n')
    output.close()
def run_block_cov_sims():
    """Simulate count matrices under a block covariance structure and record
    the z-score of the observed mean pairwise Euclidean distance (PCA space)
    against 1000 permuted matrices, per covariance level and iteration.
    """
    df_out = open(pt.get_path() + '/data/simulations/cov_block_euc_pos_only.txt', 'w')
    n_pops = 20
    n_genes = 50
    lambda_genes = np.random.gamma(shape=3, scale=1, size=n_genes)
    df_out.write('\t'.join(['Cov', 'Iteration', 'z_score']) + '\n')
    #covs = [0.1, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9]
    covs = [-0.9]
    for cov in covs:
        C = get_block_cov(n_genes, pos_cov=cov, neg_cov=cov)
        # Diagnostic: check the covariance matrix is positive definite.
        print(np.all(np.linalg.eigvals(C) > 0))
        print(C)
        for i in range(100):
            test_cov = np.stack([get_count_pop(lambda_genes, cov=C) for x in range(n_pops)], axis=0)
            X = pt.hellinger_transform(test_cov)
            pca = PCA()
            pca_fit = pca.fit_transform(X)
            euc_dist = pt.get_mean_pairwise_euc_distance(pca_fit)
            sim_eucs = []
            for j in range(1000):
                X_j = pt.hellinger_transform(pt.random_matrix(test_cov))
                pca_fit_j = pca.fit_transform(X_j)
                sim_eucs.append(pt.get_mean_pairwise_euc_distance(pca_fit_j))
            z_score = (euc_dist - np.mean(sim_eucs)) / np.std(sim_eucs)
            print(str(cov), ' ', str(i), ' ', str(z_score))
            df_out.write('\t'.join([str(cov), str(i), str(z_score)]) + '\n')
    df_out.close()
def get_network_cov(cov=1 / 9, var=1):
    """Exploratory diagnostic: load the disassortative network adjacency
    matrix, scale it into a covariance matrix, and print its eigenvalues to
    check positive definiteness.  Returns None; nothing is saved.

    cov : scaling applied to off-diagonal (edge) entries.
    var : value written on the diagonal.
    """
    #df = pd.read_csv(pt.get_path() + '/data/disassoc_network_eq.txt', sep='\t', header=None)
    #df = df.astype(int)
    ntwrk = np.loadtxt(pt.get_path() + '/data/disassoc_network_eq.txt', delimiter='\t')  #, dtype='int')
    #print(np.mean(np.sum(ntwrk, axis =1)))
    ntwrk = ntwrk * cov
    np.fill_diagonal(ntwrk, var)
    # Gershgorin circle theorem sets limit on covariance
    # https://math.stackexchange.com/questions/2378428/how-to-create-a-positive-definite-covariance-matrix-from-an-adjacency-matrix
    # Side experiment on a fresh Barabasi-Albert graph; the result is built
    # but only the commented-out prints ever inspected it.
    graph = nx.barabasi_albert_graph(50, 5)
    graph_np = nx.to_numpy_matrix(graph)
    #print(np.sum(graph_np, axis =1))
    graph_np = graph_np * cov
    np.fill_diagonal(graph_np, 1)
    #print(np.linalg.eigvals(graph_np))
    #print(np.all(np.linalg.eigvals(graph_np) > 0))
    #graph_np = graph_np * 0.49
    #np.fill_diagonal(graph_np, 1)
    #print(ntwrk)
    print(np.linalg.eigvals(ntwrk))
    print(np.all(np.linalg.eigvals(ntwrk) > 0))
def run_ba_cov_neutral_sims(shape=1, scale=1, G=50, N=50, iter1=1000, iter2=1000):
    """Power simulations mixing adaptive (BA-covariance) and neutral
    (independent, equal-rate) mutation counts.

    For each neutral rate (log-spaced 10**-2..10**1, iterated in descending
    order) and each covariance level, run `iter1` iterations: simulate an
    adaptive count matrix with covariance C plus a neutral matrix with
    identity covariance, then record where the observed mean pairwise
    Euclidean distance (PCA space) falls among `iter2` permuted matrices.

    shape, scale : gamma parameters for per-gene adaptive rates.
    G, N : number of genes and populations.
    iter1, iter2 : simulation and permutation iteration counts.
    """
    df_out = open(pt.get_path() + '/data/simulations/ba_cov_neutral_sims.txt', 'w')
    # NOTE(review): 'lamba_mean' is misspelled in this header; downstream
    # readers may key on the misspelling, so fix both in tandem if ever.
    df_out.write('\t'.join(['N', 'G', 'lamba_mean', 'lambda_neutral', 'Cov', 'Iteration', 'dist_percent']) + '\n')
    covs = [0.2]
    mean_gamma = shape * scale
    neutral_range = np.logspace(-2, 1, num=20, endpoint=True, base=10.0)
    neutral_range = neutral_range[::-1]
    for neutral_ in neutral_range:
        for cov in covs:
            for i in range(iter1):
                C = pt.get_ba_cov_matrix(G, cov)
                lambda_genes = np.random.gamma(shape=shape, scale=scale, size=G)
                lambda_genes_null = np.asarray([neutral_] * G)
                test_cov_adapt = np.stack([pt.get_count_pop(lambda_genes, C=C) for x in range(N)], axis=0)
                # matrix with diaganol values equal to one
                test_cov_neutral = np.stack([pt.get_count_pop(lambda_genes_null, C=np.identity(G)) for x in range(N)], axis=0)
                test_cov = test_cov_adapt + test_cov_neutral
                X = pt.hellinger_transform(test_cov)
                pca = PCA()
                pca_fit = pca.fit_transform(X)
                euc_dist = pt.get_mean_pairwise_euc_distance(pca_fit)
                euc_dists = []
                for j in range(iter2):
                    #X_j = pt.hellinger_transform(pt.random_matrix(test_cov))
                    X_j = pt.hellinger_transform(pt.get_random_matrix(test_cov))
                    pca_fit_j = pca.fit_transform(X_j)
                    euc_dists.append(pt.get_mean_pairwise_euc_distance(pca_fit_j))
                # Percentile of the observed distance within the null.
                euc_percent = len([k for k in euc_dists if k < euc_dist ]) / len(euc_dists)
                print(neutral_, cov, i, euc_percent)
                df_out.write('\t'.join([ str(N), str(G), str(mean_gamma), str(neutral_), str(cov), str(i), str(euc_percent) ]) + '\n')
    df_out.close()
def tenaillon_fitness_hist():
    """Plot a kernel density estimate of fitness ('W (avg)') for the
    Tenaillon et al. lines that were sequenced, and save the figure to
    figs/tenaillon_fitness.png.

    Bug fix: the original used `fitness.ix[...]`, an indexer removed in
    pandas 1.0.  `.reindex` reproduces the label-based selection and yields
    NaN rows for labels missing from `fitness`, which are dropped below.
    """
    gene_by_pop_path = pt.get_path() + '/data/Tenaillon_et_al/gene_by_pop.txt'
    gene_by_pop = pd.read_csv(gene_by_pop_path, sep=',', header='infer', index_col=0) if False else pd.read_csv(gene_by_pop_path, sep='\t', header='infer', index_col=0)
    fitness_path = pt.get_path() + '/data/Tenaillon_et_al/fitness.csv'
    fitness = pd.read_csv(fitness_path, sep=',', header='infer', index_col=0)
    # select fitness values from lines that were sequenced
    fitness_subset = fitness.reindex(gene_by_pop.index.values)
    fitness_np = fitness_subset['W (avg)'].values
    # Drop lines without a fitness measurement (NaN).
    fitness_np = fitness_np[np.logical_not(np.isnan(fitness_np))]
    kde = pt.get_kde(fitness_np)
    fig = plt.figure()
    plt.plot(kde[0], kde[1])
    plt.xlabel("Fitness", fontsize=18)
    plt.ylabel("Frequency", fontsize=18)
    fig.tight_layout()
    plot_path = pt.get_path() + '/figs/tenaillon_fitness.png'
    fig.savefig(plot_path, bbox_inches="tight", pad_inches=0.4, dpi=600)
    plt.close()
def run_pca_sample_size_permutation(iter=10000, analysis='PCA', k=3):
    """Permutation test of PCA summary statistics as a function of sample
    size for the Tenaillon et al. gene-by-population matrix.

    For each of 20 sample sizes (2 .. n_populations) and `iter` iterations:
    draw a random subset of populations, apply the likelihood and Hellinger
    transforms, run PCA, and record the mean centroid distance, mean angle,
    and mean Euclidean length over the first `k` axes.  Results go to
    sample_size_permute_<analysis>.txt.

    Bug fix: removed the unused `df_array = df.as_matrix()` line --
    DataFrame.as_matrix() no longer exists in pandas >= 1.0 and the
    resulting array was never used.
    """
    df_path = pt.get_path() + '/data/Tenaillon_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    sample_sizes = np.linspace(2, df.shape[0], num=20, dtype=int)
    df_out = open(pt.get_path() + '/data/Tenaillon_et_al/sample_size_permute_' + analysis + '.txt', 'w')
    column_headers = ['Sample_size', 'Iteration', 'MCD', 'mean_angle', 'delta_L']
    df_out.write('\t'.join(column_headers) + '\n')
    for sample_size in sample_sizes:
        print("Sample size = " + str(sample_size))
        for i in range(iter):
            print("Sample size = " + str(sample_size) + ' Iteration = ' + str(i))
            df_sample = df.sample(n=sample_size)
            df_sample_delta = pt.likelihood_matrix(df_sample, 'Tenaillon_et_al').get_likelihood_matrix()
            # Drop genes that are all-zero within the sampled populations.
            df_sample_delta = df_sample_delta.loc[:, (df_sample_delta != 0).any(axis=0)]
            X = pt.hellinger_transform(df_sample_delta)
            pca = PCA()
            df_sample_delta_out = pca.fit_transform(X)
            mcd = pt.get_mean_centroid_distance(df_sample_delta_out, k=k)
            mean_angle = pt.get_mean_angle(df_sample_delta_out, k=k)
            mean_length = pt.get_euclidean_distance(df_sample_delta_out, k=k)
            df_out.write('\t'.join([str(sample_size), str(i), str(mcd), str(mean_angle), str(mean_length)]) + '\n')
    df_out.close()
def power_figs(alpha=0.05):
    """Plot the power of each test statistic (Euclidean distance,
    eigenanalysis, and two MCD variants) across covariance levels, and save
    the figure as figs/power_method.png.
    """
    df = pd.read_csv(pt.get_path() + '/data/simulations/cov_ba_ntwrk_ev.txt', sep='\t')
    fig = plt.figure()
    covs = [0.05, 0.1, 0.15, 0.2]
    measures = ['euc_percent', 'eig_percent', 'mcd_percent_k1', 'mcd_percent_k3']
    colors = ['powderblue', 'skyblue', 'royalblue', 'blue', 'navy']
    labels = ['euclidean distance', 'eigenanalysis', 'mcd 1', 'mcd 1-3']
    cutoff = 1 - alpha
    for measure, label, color in zip(measures, labels, colors):
        powers = []
        for cov in covs:
            values = df[df['Cov'] == cov][measure].values
            significant = [v for v in values if v >= cutoff]
            powers.append(len(significant) / len(values))
        print(powers)
        plt.plot(np.asarray(covs), np.asarray(powers), linestyle='--', marker='o', color=color, label=label)
    plt.legend(loc='lower right')
    plt.xlabel('Covariance', fontsize=16)
    plt.ylabel(r'$ \mathrm{P}\left ( \mathrm{reject} \; H_{0} \mid H_{1} \; \mathrm{is}\, \mathrm{true}, \, \alpha=0.05 \right ) $', fontsize=16)
    plt.tight_layout()
    fig_name = pt.get_path() + '/figs/power_method.png'
    fig.savefig(fig_name, bbox_inches="tight", pad_inches=0.4, dpi=600)
    plt.close()
def poisson_neutral_fig(alpha=0.05):
    """Plot power against the ratio of adaptive to neutral substitution rates
    from the BA-covariance neutral simulations, and save the figure as
    figs/poisson_power_neutral.png.
    """
    df = pd.read_csv(pt.get_path() + '/data/simulations/ba_cov_neutral_sims.txt', sep='\t')
    neuts = np.sort(list(set(df.lambda_neutral.values)))
    cov = 0.2
    cutoff = 1 - alpha
    powers = []
    for neut in neuts:
        percents = df[df['lambda_neutral'] == neut].dist_percent.values
        significant = [v for v in percents if v >= cutoff]
        powers.append(len(significant) / len(percents))
    fig = plt.figure()
    plt.plot(np.asarray(1 / neuts), np.asarray(powers), linestyle='--', marker='o', color='royalblue', label=r'$\mathrm{cov}=$' + str(cov))
    plt.tight_layout()
    plt.legend(loc='upper left', fontsize=14)
    # NOTE(review): `basex` was removed in matplotlib >= 3.3 (now `base`);
    # kept as-is for compatibility with the environment this was written for.
    plt.xscale('log', basex=10)
    plt.xlabel("Adaptive vs. non-adaptive substitution rate, " + r'$\frac{ \left \langle \lambda \right \rangle }{\lambda_{0}}$', fontsize=16)
    plt.axhline(0.05, color='dimgrey', lw=2, ls='--')
    plt.ylabel(r'$ \mathrm{P}\left ( \mathrm{reject} \; H_{0} \mid H_{1} \; \mathrm{is}\, \mathrm{true}, \, \alpha=0.05 \right ) $', fontsize=16)
    fig_name = pt.get_path() + '/figs/poisson_power_neutral.png'
    fig.savefig(fig_name, bbox_inches="tight", pad_inches=0.4, dpi=600)
    plt.close()
def get_fig():
    """Scatter the permutation z-score against covariance for the BA network
    simulations, overlay an ordinary least-squares fit, and save the figure
    as figs/cov_ba_ntwrk_ev.png.
    """
    df = pd.read_csv(pt.get_path() + '/data/simulations/cov_ba_ntwrk_ev.txt', sep='\t')
    cov_vals = df.Cov.values
    z_vals = df.euc_z_score.values
    fig = plt.figure()
    slope, intercept, r_value, p_value, std_err = stats.linregress(cov_vals, z_vals)
    grid = np.linspace(0, 1, 1000)
    fitted = intercept + (slope * grid)
    plt.scatter(cov_vals, z_vals, c='#175ac6', marker='o', s=70,
                edgecolors='#244162', linewidth=0.6, alpha=0.2, zorder=2)
    plt.plot(grid, fitted, c='k', lw=2)
    plt.axhline(y=0, color='red', lw=2, linestyle='--')
    plt.xlabel('Covariance')
    plt.ylabel('Z-score')
    plt.tight_layout()
    fig_name = pt.get_path() + '/figs/cov_ba_ntwrk_ev.png'
    fig.savefig(fig_name, bbox_inches="tight", pad_inches=0.4, dpi=600)
    plt.close()
def run_ba_cov_sims(gene_list, pop_list, out_name, iter1=1000, iter2=1000):
    """Power simulations over Barabasi-Albert covariance matrices.

    For each combination of gene count G, population count N, and covariance
    level: simulate `iter1` count matrices (re-drawing until no population
    has an all-zero row), then locate the observed mean pairwise Euclidean
    distance (PCA space) within `iter2` permutations.

    gene_list, pop_list : iterables of G and N values to simulate.
    out_name : basename of the output file under /data/simulations.
    """
    df_out = open(pt.get_path() + '/data/simulations/' + out_name + '.txt', 'w')
    df_out.write('\t'.join(['N', 'G', 'Cov', 'Iteration', 'dist_percent']) + '\n')
    covs = [0.1, 0.15, 0.2]
    for G in gene_list:
        for N in pop_list:
            for cov in covs:
                for i in range(iter1):
                    C = pt.get_ba_cov_matrix(G, cov)
                    # Re-draw until every population has at least one count.
                    while True:
                        lambda_genes = np.random.gamma(shape=1, scale=1, size=G)
                        test_cov = np.stack([pt.get_count_pop(lambda_genes, cov=C) for x in range(N)], axis=0)
                        #test_cov_row_sum = test_cov.sum(axis=1)
                        if (np.any(test_cov.sum(axis=1) == 0)) == False:
                            break
                        #if np.count_nonzero(test_cov_row_sum) == len(test_cov_row_sum):
                        #    break
                    X = pt.hellinger_transform(test_cov)
                    pca = PCA()
                    pca_fit = pca.fit_transform(X)
                    euc_dist = pt.get_mean_pairwise_euc_distance(pca_fit)
                    euc_dists = []
                    for j in range(iter2):
                        X_j = pt.hellinger_transform(pt.get_random_matrix(test_cov))
                        #X_j = pt.hellinger_transform(pt.random_matrix(test_cov))
                        pca_fit_j = pca.fit_transform(X_j)
                        euc_dists.append(pt.get_mean_pairwise_euc_distance(pca_fit_j))
                    # Percentile of the observed distance within the null.
                    euc_percent = len([k for k in euc_dists if k < euc_dist ]) / len(euc_dists)
                    print(N, G, cov, i, euc_percent)
                    df_out.write('\t'.join([str(N), str(G), str(cov), str(i), str(euc_percent)]) + '\n')
    df_out.close()
def simulate(N, Ks, G, alphas, mu, iter=100):
    """Run NK-model simulations for every (K, alpha) combination and write
    per-gene mutation counts to data/simulations/test.txt.

    Ks and alphas are iterables; N, G, and mu pass through to simulate_NK
    unchanged.  `iter` replicates are run per combination.
    """
    output = open(pt.get_path() + '/data/simulations/test.txt', 'w')
    output.write('\t'.join(['N', 'K', 'Gene', 'Alpha', 'Mu', 'Muts', 'Iter']) + '\n')
    for K in Ks:
        for alpha in alphas:
            for rep in range(iter):
                print('K = ' + str(K), 'alpha = ' + str(alpha), 'iter = ' + str(rep))
                gene_muts = simulate_NK(N, K, G, alpha, mu).get_directed_graph()
                for gene, muts in gene_muts.items():
                    row = map(str, [N, K, gene, alpha, mu, muts, rep])
                    output.write('\t'.join(row) + '\n')
    output.close()
def get_correlated_rndm_ntwrk_original(nodes=10, m=2, rho=0.5, assortative=False):
    """Rewire a Barabasi-Albert graph with degree-preserving double-edge
    swaps until its degree assortativity reaches |rho|, then save the
    adjacency matrix as a tab-delimited integer file under /data.

    A candidate swap is accepted only when the degree product `disc` has
    the sign matching the requested direction; rejected swaps are counted
    but leave the graph unchanged.

    Parameters
    ----------
    nodes, m : Barabasi-Albert parameters (node count, edges per new node).
    rho : target absolute degree assortativity.
    assortative : accept swaps that increase (True) or decrease (False)
        assortativity.  Bug fix: this was previously an undefined free
        variable, raising NameError at runtime; it is now a keyword
        parameter.  The False default produces 'disassoc_network_eq.txt',
        the file consumed elsewhere in this module.
    """
    def pick_two_edges(adj):
        # Sample two disjoint edges; return [(node, degree), ...] for the
        # four distinct endpoints.
        d = nx.to_dict_of_dicts(nx.from_numpy_matrix(adj), edge_data=1)
        e0_a = random.sample(list(d), 1)[0]
        e0_b = random.sample(list(d[e0_a]), 1)[0]
        # Second edge: both endpoints must differ from the first edge's.
        candidates = [i for i in list(d) if i not in [e0_a, e0_b]]
        pair = []
        while len(pair) != 2:
            e1_a = random.sample(list(candidates), 1)[0]
            neighbors = [i for i in d[e1_a] if i not in [e0_a, e0_b]]
            if len(neighbors) > 0:
                pair.extend([e1_a, random.sample(list(neighbors), 1)[0]])
        degrees = np.asarray(np.sum(adj, axis=0))[0]
        return [(n, degrees[n]) for n in (e0_a, e0_b, pair[0], pair[1])]

    assort_ = []
    graph_np = nx.to_numpy_matrix(nx.barabasi_albert_graph(nodes, m))
    count = 0
    current_rho = 0
    rejected_counts = 0
    while abs(current_rho) < abs(rho):
        edges = pick_two_edges(graph_np)
        # Reject the proposal if either replacement edge already exists.
        if graph_np[edges[0][0], edges[3][0]] == 1 or \
                graph_np[edges[3][0], edges[0][0]] == 1 or \
                graph_np[edges[2][0], edges[1][0]] == 1 or \
                graph_np[edges[1][0], edges[2][0]] == 1:
            continue
        # Sign of `disc` says whether the swap raises or lowers assortativity.
        disc = (edges[0][1] - edges[2][1]) * (edges[3][1] - edges[1][1])
        if (assortative and disc > 0) or (not assortative and disc < 0):
            # Remove the old edge pair and insert the swapped pair (both
            # directions, since the adjacency matrix is symmetric).
            graph_np[edges[0][0], edges[1][0]] = 0
            graph_np[edges[1][0], edges[0][0]] = 0
            graph_np[edges[2][0], edges[3][0]] = 0
            graph_np[edges[3][0], edges[2][0]] = 0
            graph_np[edges[0][0], edges[3][0]] = 1
            graph_np[edges[3][0], edges[0][0]] = 1
            graph_np[edges[2][0], edges[1][0]] = 1
            graph_np[edges[1][0], edges[2][0]] = 1
            current_rho = nx.degree_assortativity_coefficient(nx.from_numpy_matrix(graph_np))
            assort_.append(current_rho)
            count += 1
            print(current_rho, rejected_counts)
        else:
            rejected_counts += 1
    txt_name = 'assoc_network_eq' if assortative else 'disassoc_network_eq'
    np.savetxt(pt.get_path() + '/data/' + txt_name + '.txt', graph_np.astype(int), delimiter="\t")
def hist_tenaillon_multi(k = 3):
    """Figure 1: PCA ordination of the Tenaillon et al. gene-by-population
    matrix alongside null histograms from the permutation results.

    Panels: (1) the first two PCA axes; (2) mean centroid distance versus
    its null; (3) mean pairwise Euclidean distance versus its null; (4) the
    F1 eigenvalue statistic versus its null.  Empirical p-scores are
    printed to stdout and the figure is saved as figs/fig1.png.
    """
    df_path = pt.get_path() + '/data/Tenaillon_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep = '\t', header = 'infer', index_col = 0)
    df_delta = pt.likelihood_matrix(df, 'Tenaillon_et_al').get_likelihood_matrix()
    X = pt.hellinger_transform(df_delta)
    pca = PCA()
    df_out = pca.fit_transform(X)
    df_null_path = pt.get_path() + '/data/Tenaillon_et_al/permute_PCA.txt'
    df_null = pd.read_csv(df_null_path, sep = '\t', header = 'infer', index_col = 0)
    # Observed summary statistics over the first k PCA axes.
    mean_angle = pt.get_mean_angle(df_out, k = k)
    mcd = pt.get_mean_centroid_distance(df_out, k=k)
    #mean_length = pt.get_euclidean_distance(df_out, k=k)
    mean_dist = pt.get_mean_pairwise_euc_distance(df_out, k=k)
    x_stat = pt.get_x_stat(pca.explained_variance_[:-1])
    fig = plt.figure()
    # Panel 1: populations on the first two PCA axes.
    ax1 = plt.subplot2grid((2, 2), (0, 0), colspan=1)
    ax1.axhline(y=0, color='k', linestyle=':', alpha = 0.8, zorder=1)
    ax1.axvline(x=0, color='k', linestyle=':', alpha = 0.8, zorder=2)
    ax1.scatter(0, 0, marker = "o", edgecolors='none', c = 'darkgray', s = 120, zorder=3)
    ax1.scatter(df_out[:,0], df_out[:,1], marker = "o", edgecolors='#244162', c = '#175ac6', alpha = 0.4, s = 60, zorder=4)
    ax1.set_xlim([-0.75,0.75])
    ax1.set_ylim([-0.75,0.75])
    ax1.set_xlabel('PCA 1 (' + str(round(pca.explained_variance_ratio_[0],3)*100) + '%)' , fontsize = 14)
    ax1.set_ylabel('PCA 2 (' + str(round(pca.explained_variance_ratio_[1],3)*100) + '%)' , fontsize = 14)
    # Panel 2: null distribution of the mean centroid distance.
    ax2 = plt.subplot2grid((2, 2), (0, 1), colspan=1)
    mcd_list = df_null.MCD.tolist()
    #ax2.hist(mcd_list, bins=30, histtype='stepfilled', normed=True, alpha=0.6, color='b')
    ax2.hist(mcd_list,bins=30, weights=np.zeros_like(mcd_list) + 1. / len(mcd_list), alpha=0.8, color = '#175ac6')
    ax2.axvline(mcd, color = 'red', lw = 3)
    ax2.set_xlabel("Mean centroid distance, " + r'$ \left \langle \delta_{c} \right \rangle$', fontsize = 14)
    ax2.set_ylabel("Frequency", fontsize = 16)
    # Empirical two-tailed p-score from the observed value's rank in the null.
    mcd_list.append(mcd)
    relative_position_mcd = sorted(mcd_list).index(mcd) / (len(mcd_list) -1)
    if relative_position_mcd > 0.5:
        p_score_mcd = 1 - relative_position_mcd
    else:
        p_score_mcd = relative_position_mcd
    print('mean centroid distance p-score = ' + str(round(p_score_mcd, 3)))
    ax2.text(0.366, 0.088, r'$p < 0.05$', fontsize = 10)
    # Panel 3: null distribution of the mean pairwise Euclidean distance.
    ax3 = plt.subplot2grid((2, 2), (1, 0), colspan=1)
    delta_L_list = df_null.mean_dist.tolist()
    #ax3.hist(delta_L_list, bins=30, histtype='stepfilled', normed=True, alpha=0.6, color='b')
    ax3.hist(delta_L_list,bins=30, weights=np.zeros_like(delta_L_list) + 1. / len(delta_L_list), alpha=0.8, color = '#175ac6')
    ax3.axvline(mean_dist, color = 'red', lw = 3)
    ax3.set_xlabel("Mean pair-wise \n Euclidean distance, " + r'$ \left \langle d \right \rangle$', fontsize = 14)
    ax3.set_ylabel("Frequency", fontsize = 16)
    delta_L_list.append(mean_dist)
    relative_position_delta_L = sorted(delta_L_list).index(mean_dist) / (len(delta_L_list) -1)
    if relative_position_delta_L > 0.5:
        p_score_delta_L = 1 - relative_position_delta_L
    else:
        p_score_delta_L = relative_position_delta_L
    print('mean difference in distances p-score = ' + str(round(p_score_delta_L, 3)))
    ax3.text(0.50, 0.09, r'$p < 0.05$', fontsize = 10)
    # Panel 4: null distribution of the F1 statistic (NaNs dropped first).
    ax4 = plt.subplot2grid((2, 2), (1, 1), colspan=1)
    ax4_values = df_null.x_stat.values
    ax4_values = ax4_values[np.logical_not(np.isnan(ax4_values))]
    #ax4.hist(ax4_values, bins=30, histtype='stepfilled', normed=True, alpha=0.6, color='b')
    ax4.hist(ax4_values, bins=30, weights=np.zeros_like(ax4_values) + 1. / len(ax4_values), alpha=0.8, color = '#175ac6')
    print(np.mean(ax4_values))
    print(stats.mode(ax4_values))
    ax4.axvline(x_stat, color = 'red', lw = 3)
    ax4.set_xlabel(r'$F_{1}$', fontsize = 14)
    ax4.set_ylabel("Frequency", fontsize = 16)
    # NOTE(review): the p-score below ranks `mean_angle` within the F1
    # (x_stat) null distribution -- it looks like `x_stat` was intended
    # here; confirm before relying on the printed p-score.
    mean_angle_list = ax4_values.tolist()
    mean_angle_list.append(mean_angle)
    relative_position_angle = sorted(mean_angle_list).index(mean_angle) / (len(mean_angle_list) -1)
    print(x_stat)
    print( len([x for x in mean_angle_list if x > x_stat])/ sum(mean_angle_list) )
    if relative_position_angle > 0.5:
        p_score_angle = 1 - relative_position_angle
    else:
        p_score_angle = relative_position_angle
    print('F_{1} statistic p-score = ' + str(round(p_score_angle, 3)))
    ax4.text(19.1, 0.09, r'$p \nless 0.05$', fontsize = 10)
    plt.tight_layout()
    fig_name = pt.get_path() + '/figs/fig1.png'
    fig.savefig(fig_name, bbox_inches = "tight", pad_inches = 0.4, dpi = 600)
    plt.close()
import pickle
import operator
import sys
import random
import copy
from itertools import compress
import numpy as np
import pandas as pd
import parevol_tools as pt
import clean_data as cd
import matplotlib.pyplot as plt
from scipy import stats

# Module-level exploration: per-gene mean and variance of nonsynonymous
# mutation counts, restricted to genes hit in at least 5 populations.
df_non_path = pt.get_path() + "/data/Tenaillon_et_al/gene_by_pop_nonsyn.txt"
df_non = pd.read_csv(df_non_path, sep="\t", header="infer", index_col=0)
genes_non = df_non.columns.to_list()
df_non_np = df_non.values
# Transpose so each row is a gene and each column a population.
df_non_np = np.transpose(df_non_np)
mean_all = []
var_all = []
for gene in df_non_np:
    # Require the gene to be mutated in at least 5 populations.
    if sum(gene > 0) < 5:
        continue
    mean_all.append(np.mean(gene))
    var_all.append(np.var(gene))
def plot_permutation(dataset='good', analysis='PCA', alpha=0.05):
    """Plot observed versus permuted mean pairwise Euclidean distances over
    time for the Good et al. nonmutator lines.

    Top panel: observed distances per generation against the permutation
    mean with empirical (alpha, 1-alpha) intervals.  Bottom panel: the same
    comparison standardized to z-scores.  Saved as
    figs/permutation_scatter_good.png.

    Bug fix: DataFrame.as_matrix() was removed in pandas 1.0; replaced with
    the equivalent `.values` attribute (two call sites).
    """
    df_path = pt.get_path() + '/data/Good_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    to_exclude = pt.complete_nonmutator_lines()
    to_exclude.append('p5')
    df_nonmut = df[df.index.str.contains('|'.join(to_exclude))]
    # remove columns with all zeros
    df_nonmut = df_nonmut.loc[:, (df_nonmut != 0).any(axis=0)]
    df_delta = pt.likelihood_matrix(df_nonmut, 'Good_et_al').get_likelihood_matrix()
    if analysis == 'PCA':
        X = pt.hellinger_transform(df_delta)
        pca = PCA()
        df_out = pca.fit_transform(X)
    elif analysis == 'cMDS':
        df_delta_bc = np.sqrt(pt.get_scipy_bray_curtis(df_delta.values))
        df_out = pt.cmdscale(df_delta_bc)[0]
    # Row labels encode the generation as the second '_'-separated field.
    time_points = [int(x.split('_')[1]) for x in df_nonmut.index.values]
    time_points_set = sorted(list(set([int(x.split('_')[1]) for x in df_nonmut.index.values])))
    df_rndm_delta_out = pd.DataFrame(data=df_out, index=df_delta.index)
    mcds = []
    for tp in time_points_set:
        df_rndm_delta_out_tp = df_rndm_delta_out[df_rndm_delta_out.index.str.contains('_' + str(tp))]
        mcds.append(pt.get_mean_pairwise_euc_distance(df_rndm_delta_out_tp.values, k=3))
    mcd_perm_path = pt.get_path() + '/data/Good_et_al/permute_' + analysis + '.txt'
    mcd_perm = pd.read_csv(mcd_perm_path, sep='\t', header='infer', index_col=0)
    mcd_perm_x = np.sort(list(set(mcd_perm.Generation.tolist())))
    lower_ci = []
    upper_ci = []
    mean_mcds = []
    std_mcds = []
    lower_z_ci = []
    upper_z_ci = []
    for x in mcd_perm_x:
        mcd_perm_y = mcd_perm.loc[mcd_perm['Generation'] == x]
        mcd_perm_y_sort = np.sort(mcd_perm_y.mean_dist.tolist())
        mean_mcd_perm_y = np.mean(mcd_perm_y_sort)
        std_mcd_perm_y = np.std(mcd_perm_y_sort)
        mean_mcds.append(mean_mcd_perm_y)
        std_mcds.append(std_mcd_perm_y)
        # Empirical interval bounds from the sorted permutation distances.
        lower_ci.append(mean_mcd_perm_y - mcd_perm_y_sort[int(len(mcd_perm_y_sort) * alpha)])
        upper_ci.append(abs(mean_mcd_perm_y - mcd_perm_y_sort[int(len(mcd_perm_y_sort) * (1 - alpha))]))
        # z-scores
        mcd_perm_y_sort_z = [((i - mean_mcd_perm_y) / std_mcd_perm_y) for i in mcd_perm_y_sort]
        lower_z_ci.append(abs(mcd_perm_y_sort_z[int(len(mcd_perm_y_sort_z) * alpha)]))
        upper_z_ci.append(abs(mcd_perm_y_sort_z[int(len(mcd_perm_y_sort_z) * (1 - alpha))]))
    fig = plt.figure()
    plt.figure(1)
    plt.subplot(211)
    plt.errorbar(mcd_perm_x, mean_mcds, yerr=[lower_ci, upper_ci], fmt='o', alpha=0.5,
                 barsabove=True, marker='.', mfc='k', mec='k', c='k', zorder=1)
    plt.scatter(time_points_set, mcds, c='#175ac6', marker='o', s=70,
                edgecolors='#244162', linewidth=0.6, alpha=0.5, zorder=2)
    plt.ylabel("Mean pair-wise \n Euclidean \n distance, " + r'$ \left \langle d \right \rangle$', fontsize=14)
    plt.figure(1)
    plt.subplot(212)
    plt.errorbar(mcd_perm_x, [0] * len(mcd_perm_x), yerr=[lower_z_ci, upper_z_ci], fmt='o', alpha=0.5,
                 barsabove=True, marker='.', mfc='k', mec='k', c='k', zorder=1)
    # zip mean, std, and measured values to make z-scores
    zip_list = list(zip(mean_mcds, std_mcds, mcds))
    z_scores = [((i[2] - i[0]) / i[1]) for i in zip_list]
    plt.scatter(time_points_set, z_scores, c='#175ac6', marker='o', s=70,
                edgecolors='#244162', linewidth=0.6, alpha=0.5, zorder=2)
    plt.ylim(-2.2, 2.2)
    plt.xlabel("Time (generations)", fontsize=16)
    plt.ylabel("Standardized mean \n pair-wise Euclidean \n distance, " + r'$ z_{\left \langle d \right \rangle}$', fontsize=14)
    fig.tight_layout()
    fig.savefig(pt.get_path() + '/figs/permutation_scatter_good.png', bbox_inches="tight", pad_inches=0.4, dpi=600)
    plt.close()
def probability_absence(gene, N, mut_counts_dict, zeros=True):
    """Probability that `gene` receives zero mutations across N mutational
    events, given each gene's mean relative mutation rate.

    The per-event chance of missing `gene` is the summed rate of every
    other gene over the total rate; raising it to the Nth power treats the
    events as independent.  `zeros` selects which rate estimate to use
    (including or excluding zero-count populations).
    """
    key = "mean_relative_muts" if zeros == True else "mean_relative_muts_no_zeros"
    total_rate = sum(mut_counts_dict[g][key] for g in mut_counts_dict.keys())
    rate_without_gene = sum(mut_counts_dict[g][key] for g in mut_counts_dict.keys() if g != gene)
    return (rate_without_gene / total_rate) ** N


# Load the nonsynonymous gene-by-population matrix and build the per-gene
# mutation-count dictionary used by the occupancy predictions.
df_non_path = pt.get_path() + "/data/Tenaillon_et_al/gene_by_pop_nonsyn.txt"
df_non = pd.read_csv(df_non_path, sep="\t", header="infer", index_col=0)
genes_non = df_non.columns.to_list()
df_non_np = df_non.values
df_non_np = np.transpose(df_non_np)
locus_tags_non = map_tenaillon_genes_to_locus_tags(genes_non)
mut_counts_non_dict = get_mut_counts_dict(df_non_np, locus_tags_non)

# df_syn_path = pt.get_path() + '/data/Tenaillon_et_al/gene_by_pop_syn.txt'
# df_syn = pd.read_csv(df_syn_path, sep = '\t', header = 'infer', index_col = 0)
# genes_syn = df_syn.columns.to_list()
# df_syn_np = df_syn.values
# df_syn_np = np.transpose(df_syn_np)
def run_pca_permutation(iter=10000, analysis='PCA', dataset='tenaillon'):
    """Permutation tests of ordination summary statistics.

    dataset='tenaillon': permute the full gene-by-population matrix `iter`
    times and record MCD, mean angle, mean pairwise distance, Euclidean
    magnitude difference, and the F1 eigenvalue statistic per iteration.

    dataset='good': permute the mutation increments between consecutive
    time points (preserving temporal structure), then record the same
    statistics per generation per iteration.

    Bug fix: DataFrame.as_matrix() was removed in pandas 1.0; replaced with
    the equivalent `.values` attribute throughout.  Also removed two unused
    locals (`df_nonmut_array`, `df_final`).
    """
    if dataset == 'tenaillon':
        k = 3
        df_path = pt.get_path() + '/data/Tenaillon_et_al/gene_by_pop.txt'
        df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
        df_array = df.values
        df_out = open(pt.get_path() + '/data/Tenaillon_et_al/permute_' + analysis + '.txt', 'w')
        column_headers = ['Iteration', 'MCD', 'mean_angle', 'mean_dist', 'delta_L', 'x_stat']
        df_out.write('\t'.join(column_headers) + '\n')
        for i in range(iter):
            print(i)
            df_rndm = pd.DataFrame(data=pt.random_matrix(df_array), index=df.index, columns=df.columns)
            df_rndm_delta = pt.likelihood_matrix(df_rndm, 'Tenaillon_et_al').get_likelihood_matrix()
            if analysis == 'PCA':
                X = pt.hellinger_transform(df_rndm_delta)
                pca = PCA()
                df_rndm_delta_out = pca.fit_transform(X)
                mean_angle = pt.get_mean_angle(df_rndm_delta_out, k=k)
                mcd = pt.get_mean_centroid_distance(df_rndm_delta_out, k=k)
                mean_length = pt.get_euc_magnitude_diff(df_rndm_delta_out, k=k)
                mean_dist = pt.get_mean_pairwise_euc_distance(df_rndm_delta_out, k=k)
                x_stat = pt.get_x_stat(pca.explained_variance_[:-1])
                df_out.write('\t'.join([str(i), str(mcd), str(mean_angle), str(mean_dist), str(mean_length), str(x_stat)]) + '\n')
        df_out.close()
    elif dataset == 'good':
        k = 5
        df_path = pt.get_path() + '/data/Good_et_al/gene_by_pop.txt'
        df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
        to_exclude = pt.complete_nonmutator_lines()
        to_exclude.append('p5')
        df_nonmut = df[df.index.str.contains('|'.join(to_exclude))]
        # remove columns with all zeros
        df_nonmut = df_nonmut.loc[:, (df_nonmut != 0).any(axis=0)]
        # Row labels encode the generation as the second '_'-separated field.
        time_points = [int(x.split('_')[1]) for x in df_nonmut.index.values]
        time_points_set = sorted(list(set([int(x.split('_')[1]) for x in df_nonmut.index.values])))
        time_points_positions = {}
        for x in time_points_set:
            time_points_positions[x] = [i for i, j in enumerate(time_points) if j == x]
        df_out = open(pt.get_path() + '/data/Good_et_al/permute_' + analysis + '.txt', 'w')
        column_headers = ['Iteration', 'Generation', 'MCD', 'mean_angle', 'delta_L', 'mean_dist']
        df_out.write('\t'.join(column_headers) + '\n')
        for i in range(iter):
            print("Iteration " + str(i))
            matrix_0 = df_nonmut.iloc[time_points_positions[time_points_set[0]]]
            matrix_0_rndm = pt.random_matrix(matrix_0.values)
            df_rndm_list = [pd.DataFrame(data=matrix_0_rndm, index=matrix_0.index, columns=matrix_0.columns)]
            # skip first time step
            for j, tp in enumerate(time_points_set[0:]):
                if j == 0:
                    continue
                df_tp_minus1 = df_nonmut[df_nonmut.index.str.contains('_' + str(time_points_set[j - 1]))]
                df_tp = df_nonmut[df_nonmut.index.str.contains('_' + str(tp))]
                # Permute only the increment between consecutive time points,
                # then accumulate it onto the running permuted matrix.
                matrix_diff = df_tp.values - df_tp_minus1.values
                matrix_0_rndm = matrix_0_rndm + pt.random_matrix(matrix_diff)
                df_0_rndm = pd.DataFrame(data=matrix_0_rndm, index=df_tp.index, columns=df_tp.columns)
                df_rndm_list.append(df_0_rndm)
            df_rndm = pd.concat(df_rndm_list)
            df_rndm_delta = pt.likelihood_matrix(df_rndm, 'Good_et_al').get_likelihood_matrix()
            if analysis == 'PCA':
                X = pt.hellinger_transform(df_rndm_delta)
                pca = PCA()
                matrix_rndm_delta_out = pca.fit_transform(X)
            elif analysis == 'cMDS':
                matrix_rndm_delta_bc = np.sqrt(pt.get_bray_curtis(df_rndm_delta.values))
                matrix_rndm_delta_out = pt.cmdscale(matrix_rndm_delta_bc)[0]
            else:
                print("Analysis argument not accepted")
                continue
            df_rndm_delta_out = pd.DataFrame(data=matrix_rndm_delta_out, index=df_rndm_delta.index)
            for tp in time_points_set:
                df_rndm_delta_out_tp = df_rndm_delta_out[df_rndm_delta_out.index.str.contains('_' + str(tp))]
                df_rndm_delta_out_tp_matrix = df_rndm_delta_out_tp.values
                mean_angle = pt.get_mean_angle(df_rndm_delta_out_tp_matrix, k=k)
                mcd = pt.get_mean_centroid_distance(df_rndm_delta_out_tp_matrix, k=k)
                mean_length = pt.get_euc_magnitude_diff(df_rndm_delta_out_tp_matrix, k=k)
                mean_dist = pt.get_mean_pairwise_euc_distance(df_rndm_delta_out_tp_matrix, k=k)
                df_out.write('\t'.join([str(i), str(tp), str(mcd), str(mean_angle), str(mean_length), str(mean_dist)]) + '\n')
        df_out.close()
def calculate_subsampled_mae(df_np, df_genes, mut_counts_dict, subsamples=1, name="non"):
    """Estimate how mean absolute error (MAE) of predicted gene occupancies
    scales with the number of populations sampled.

    For each subsample size n_i (10, 15, ..., up to the number of populations),
    draws `subsamples` random population subsets without replacement, computes
    the MAE between observed and predicted occupancies under both the Poisson
    and geometric models, and stores the mean and 2.5/97.5 percentiles of each
    MAE distribution. Results are pickled to
    data/Tenaillon_et_al/subsample_poisson_occupancy_<name>.pickle.

    Parameters
    ----------
    df_np : 2-D numpy array of mutation counts; columns are populations
        (rows presumably genes aligned with df_genes — TODO confirm with caller).
    df_genes : gene identifiers passed through to the occupancy predictors.
    mut_counts_dict : mapping consumed by the occupancy-prediction helpers.
    subsamples : number of random subsamples drawn per subsample size.
    name : suffix for the output pickle filename.
    """
    def _mae(observed, predicted):
        # MAE restricted to genes actually observed in the subsample
        # (occupancy > 0); predictions for unobserved genes are dropped.
        keep = observed > 0
        return np.mean(np.absolute(observed[keep] - predicted[keep]))

    population_idx = np.arange(0, df_np.shape[1], 1)
    n_subsamples = np.arange(10, df_np.shape[1], 5)
    mae_dict = {}
    for n_i in n_subsamples:
        sys.stdout.write("%d populations......\n" % n_i)
        mae_dict[n_i] = {}
        mae_all_poisson = []
        mae_all_geometric = []
        for subsample in range(subsamples):
            # draw n_i populations (columns) without replacement
            population_idx_subsample = np.random.choice(population_idx, size=n_i, replace=False)
            df_np_subsample = df_np[:, population_idx_subsample]
            # Poisson model
            observed, predicted = get_predicted_observed_occupancies(
                df_np_subsample, df_genes, mut_counts_dict)
            mae_all_poisson.append(_mae(observed, predicted))
            # geometric model
            observed_g, predicted_g = get_predicted_observed_occupancies_geometric(
                df_np_subsample, df_genes, mut_counts_dict)
            mae_all_geometric.append(_mae(observed_g, predicted_g))
        mae_all_poisson = np.asarray(mae_all_poisson)
        mae_all_geometric = np.asarray(mae_all_geometric)
        mae_dict[n_i]["mae_mean"] = np.mean(mae_all_poisson)
        mae_dict[n_i]["mae_025"] = np.percentile(mae_all_poisson, 2.5)
        mae_dict[n_i]["mae_975"] = np.percentile(mae_all_poisson, 97.5)
        mae_dict[n_i]["mae_mean_geometric"] = np.mean(mae_all_geometric)
        mae_dict[n_i]["mae_025_geometric"] = np.percentile(mae_all_geometric, 2.5)
        mae_dict[n_i]["mae_975_geometric"] = np.percentile(mae_all_geometric, 97.5)
    sys.stdout.write("Dumping pickle......\n")
    file_name = "%s/data/Tenaillon_et_al/subsample_poisson_occupancy_%s.pickle" % (pt.get_path(), name)
    with open(file_name, "wb") as handle:
        pickle.dump(mae_dict, handle)
    sys.stdout.write("Done!\n")
def run_ba_ntwk_cov_sims():
    """Permutation-null simulations of covariance structured by a
    Barabasi-Albert network.

    Builds a BA graph over n_genes genes and uses its adjacency matrix
    (off-diagonal entries scaled by each `cov` value, diagonal set to 1) as
    the covariance for simulated count data across n_pops populations. For
    each of 1000 replicates per cov, compares observed PCA summary statistics
    (mean pairwise Euclidean distance, eigenvalue statistic, mean centroid
    distance at k=1 and k=3) against 1000 row-shuffled null matrices, and
    writes one row per replicate to data/simulations/cov_ba_ntwrk_ev.txt.
    """
    n_pops = 100
    n_genes = 50
    ntwk = nx.barabasi_albert_graph(n_genes, 2)
    # NOTE(review): to_numpy_matrix is removed in networkx >= 3.0;
    # to_numpy_array is the drop-in replacement — confirm pinned version.
    ntwk_np = nx.to_numpy_matrix(ntwk)
    # gene-specific rate parameters, drawn once and shared across cov settings
    lambda_genes = np.random.gamma(shape=3, scale=1, size=n_genes)
    covs = [0.05, 0.1, 0.15, 0.2]
    # context manager guarantees the output file is closed even if one of the
    # (long-running) simulation iterations raises
    with open(pt.get_path() + '/data/simulations/cov_ba_ntwrk_ev.txt', 'w') as df_out:
        df_out.write('\t'.join([
            'Cov', 'Iteration', 'euc_z_score', 'euc_percent', 'eig_percent',
            'mcd_percent_k1', 'mcd_percent_k3'
        ]) + '\n')
        for cov in covs:
            # covariance matrix: network edges carry covariance `cov`, unit variance
            C = ntwk_np * cov
            np.fill_diagonal(C, 1)
            for i in range(1000):
                test_cov = np.stack(
                    [get_count_pop(lambda_genes, cov=C) for x in range(n_pops)],
                    axis=0)
                X = pt.hellinger_transform(test_cov)
                pca = PCA()
                pca_fit = pca.fit_transform(X)
                # observed summary statistics
                euc_dist = pt.get_mean_pairwise_euc_distance(pca_fit)
                eig = pt.get_x_stat(pca.explained_variance_[:-1])
                mcd_k1 = pt.get_mean_centroid_distance(pca_fit, k=1)
                mcd_k3 = pt.get_mean_centroid_distance(pca_fit, k=3)
                # null distributions from permuted matrices
                euc_dists = []
                eigs = []
                centroid_dists_k1 = []
                centroid_dists_k3 = []
                for j in range(1000):
                    X_j = pt.hellinger_transform(pt.random_matrix(test_cov))
                    pca_fit_j = pca.fit_transform(X_j)
                    euc_dists.append(pt.get_mean_pairwise_euc_distance(pca_fit_j))
                    centroid_dists_k1.append(
                        pt.get_mean_centroid_distance(pca_fit_j, k=1))
                    centroid_dists_k3.append(
                        pt.get_mean_centroid_distance(pca_fit_j, k=3))
                    # pca was just refit on the permuted matrix, so this is
                    # the eigenvalue statistic of the null replicate
                    eigs.append(pt.get_x_stat(pca.explained_variance_[:-1]))
                z_score = (euc_dist - np.mean(euc_dists)) / np.std(euc_dists)
                euc_percent = len([k for k in euc_dists
                                   if k < euc_dist]) / len(euc_dists)
                eig_percent = len([k for k in eigs if k < eig]) / len(eigs)
                centroid_percent_k1 = len([
                    k for k in centroid_dists_k1 if k < mcd_k1
                ]) / len(centroid_dists_k1)
                centroid_percent_k3 = len([
                    k for k in centroid_dists_k3 if k < mcd_k3
                ]) / len(centroid_dists_k3)
                print(cov, i, z_score, euc_percent, eig_percent)
                df_out.write('\t'.join([
                    str(cov), str(i), str(z_score), str(euc_percent),
                    str(eig_percent), str(centroid_percent_k1),
                    str(centroid_percent_k3)
                ]) + '\n')