def plot_distance():
    """Plot mean distance against network size for the Good et al. networks.

    Reads the tab-separated ``network_features.txt`` table, scatters its
    ``d_mean`` column against its ``N`` column, and overlays two dashed
    reference curves evaluated on the integers ``10 .. max(N) - 1``:
    ``ln(N) / ln(ln(N))`` in red and ``ln(N)`` in black.  The figure is
    written to ``/figs/good_N_vs_d.png``.
    """
    features = pd.read_csv(
        nt.get_path() + '/data/Good_et_al/network_features.txt',
        sep='\t', header='infer')
    fig = plt.figure()
    sizes = features.N.values
    mean_distances = features.d_mean.values
    plt.scatter(sizes, mean_distances, c='#175ac6', marker='o', s=120,
                edgecolors='#244162', linewidth=0.6, alpha=0.9, zorder=3)

    # Reference curves (the original locals were named after the
    # Barabasi-Albert and random-network expectations).
    size_grid = list(range(10, max(sizes)))
    log_over_loglog = []
    plain_log = []
    for n in size_grid:
        log_over_loglog.append(np.log(n) / np.log(np.log(n)))
        plain_log.append(np.log(n))
    plt.plot(size_grid, log_over_loglog, c='r', lw=2.5, ls='--')
    plt.plot(size_grid, plain_log, c='k', lw=2.5, ls='--')

    plt.xlabel('Network size, ' + r'$N$', fontsize=18)
    plt.ylabel('Mean distance, ' + r'$\left \langle d \right \rangle$',
               fontsize=16)
    fig.tight_layout()
    fig.savefig(nt.get_path() + '/figs/good_N_vs_d.png',
                bbox_inches="tight", pad_inches=0.4, dpi=600)
    plt.close()
def clean_tenaillon_et_al(self):
    """Write a cleaned copy of Tenaillon et al. table S2.

    The first five lines of ``1212986tableS2.csv`` and any line whose first
    field is empty are skipped.  Every remaining line is copied to
    ``1212986tableS2_clean.csv`` with six extra fields appended that label
    the meaning of columns 4-9 for that row.  The labels are chosen by the
    key ``<column 4>_<column 7>`` and fall back to the ``'Genic'`` labels
    when that key is unknown.

    Both files are handled by ``with`` so they are closed even if a
    malformed line raises.
    """
    in_path = nt.get_path() + 'data/Tenaillon_et_al/1212986tableS2.csv'
    out_path = nt.get_path() + 'data/Tenaillon_et_al/1212986tableS2_clean.csv'
    header = ['Lines', 'Position', 'Type', 'Change', 'Genic_status', 'Gene_nb',
              'Gene_name', 'Effect', 'Site_affected', 'Length',
              'Genic_type', 'Gene_nb_type', 'Gene_name_type',
              'Effect_type', 'Site_affected_type', 'Length_type']
    # For genic rows, check whether genic + '_' + 7th column value is a key;
    # if not, the 'Genic' entry is used.
    head_type = {
        'Genic': ['Genic', 'Gene_nb', 'Gene_Name', 'Effect',
                  'codon_affected', 'gene_length_in_codon'],
        'Genic_Large_Deletion': ['Genic', 'Gene_nb', 'Gene_Name',
                                 'Large_Deletion', 'bp_deleted_in_Gene',
                                 'gene_length_bp'],
        'Genic_RNA': ['Genic', 'Gene_nb', 'Gene_Name', 'RNA', 'bp_affected',
                      'gene_length_bp'],
        'Intergenic_Intergenic': ['Intergenic', 'Previous_Gene_nb',
                                  'Previous_Gene_Name_distance_bp', 'Effect',
                                  'Next_Gene_Name_distance_bp',
                                  'Intergenic_type'],
        'Multigenic_Multigenic': ['Multigenic', 'First_Gene_nb',
                                  'First_Gene_Name', 'Effect', 'Last_Gene_nb',
                                  'Last_Gene_Name'],
    }
    # The input is opened first so a missing input no longer truncates an
    # existing cleaned file.
    with open(in_path, 'r') as table_in, open(out_path, 'w') as table_out:
        table_out.write(','.join(header) + '\n')
        for i, line in enumerate(table_in):
            fields = line.strip().split(',')
            # str.split always returns at least one field, so only the
            # preamble and empty-first-field checks are needed.
            if i < 5 or len(fields[0]) == 0:
                continue
            line_type = fields[4] + '_' + fields[7]
            labels = head_type.get(line_type, head_type['Genic'])
            table_out.write(','.join(fields + labels) + '\n')
def get_likelihood_matrices():
    """Build and save likelihood matrices for both Good et al. tables.

    Each gene-by-population table (fixed, then polymorphic) is read, passed
    through ``nt.likelihood_matrix(..., 'Good_et_al').get_likelihood_matrix()``
    and written next to its source with a ``_delta`` suffix.
    """
    jobs = (
        ('/data/Good_et_al/gene_by_pop.txt',
         '/data/Good_et_al/gene_by_pop_delta.txt'),
        ('/data/Good_et_al/gene_by_pop_poly.txt',
         '/data/Good_et_al/gene_by_pop_poly_delta.txt'),
    )
    for source_rel, target_rel in jobs:
        source_path = nt.get_path() + source_rel
        counts = pd.read_csv(source_path, sep='\t', header='infer',
                             index_col=0)
        builder = nt.likelihood_matrix(counts, 'Good_et_al')
        delta = builder.get_likelihood_matrix()
        target_path = nt.get_path() + target_rel
        delta.to_csv(target_path, sep='\t', index=True)
def plot_t_k_node():
    """Plot node degree k(t) over time for the ten highest-degree nodes.

    Every network file in ``networks_naive`` is read.  The time point is the
    second token of the file name when split on ``_`` and ``.``, and each
    node's degree is the number of non-zero entries in its row minus one
    (the diagonal).  The ten nodes with the largest degree summed over all
    time points are scattered on log-log axes and saved to
    ``/figs/good_t_vs_k_node.png``.
    """
    network_dir = nt.get_path() + '/data/Good_et_al/networks_naive/'
    node_dict = {}
    for filename in os.listdir(network_dir):
        # Skip the macOS folder metadata file, as the other loaders do.
        if filename == '.DS_Store':
            continue
        df = pd.read_csv(network_dir + filename, sep='\t', header='infer',
                         index_col=0)
        time = re.split('[_.]', filename)[1]
        node_dict[time] = {
            index: sum(i != 0 for i in row.values) - 1
            for index, row in df.iterrows()
        }
    node_df = pd.DataFrame.from_dict(node_dict)
    idx = node_df.sum(axis=1).sort_values(ascending=False).head(10).index
    # DataFrame.ix was removed in pandas 1.0; idx holds labels, so .loc is
    # the equivalent selector.
    node_df_idx = node_df.loc[idx]
    colors = ['firebrick', 'darkorange', 'gold', 'darkgreen', 'palegreen',
              'navy', 'royalblue', 'black', 'teal', 'dimgrey']
    fig = plt.figure()
    # head(10) yields at most ten rows, one per colour.
    for color, (index, row) in zip(colors, node_df_idx.iterrows()):
        print(index)
        row = row.dropna()
        row_xy = list(zip([int(x) for x in row.index.values], row.values))
        row_xy.sort(key=lambda tup: tup[1])  # sorts in place
        row_x_sort = [x[0] for x in row_xy]
        row_y_sort = [x[1] for x in row_xy]
        plt.scatter(row_x_sort, row_y_sort, c=color, marker='o', s=120,
                    edgecolors='k', linewidth=0.6, alpha=0.9)
    plt.xlabel('Time (generations)', fontsize=18)
    plt.ylabel('k(t)', fontsize=18)
    plt.xlim(2500, 60000)
    plt.ylim(1, 200)
    plt.xscale('log')
    plt.yscale('log')
    fig.tight_layout()
    fig.savefig(nt.get_path() + '/figs/good_t_vs_k_node.png',
                bbox_inches="tight", pad_inches=0.4, dpi=600)
    plt.close()
def plot_edge_dist():
    """Plot the degree distribution of the Tenaillon et al. network.

    A node's degree is the number of positive entries in its row minus one;
    nodes with degree zero are left out.  Relative degree frequencies are
    scattered on log-log axes with a Poisson curve (mean = mean degree) for
    k = 1..99, and the figure is saved to ``/figs/edge_dist.png``.
    """
    network = pd.read_csv(nt.get_path() + '/data/Tenaillon_et_al/network.txt',
                          sep='\t', header='infer', index_col=0)
    k_list = []
    for _, row in network.iterrows():
        degree = sum(i > 0 for i in row.values) - 1
        if degree > 0:
            k_list.append(degree)

    # Turn absolute counts into relative frequencies.
    degree_counts = dict(Counter(k_list))
    n_nodes = sum(degree_counts.values())
    degree_freq = {k: v / n_nodes for k, v in degree_counts.items()}

    k_mean = np.mean(k_list)
    print("mean k = " + str(k_mean))
    print("N = " + str(network.shape[0]))
    x = list(degree_freq.keys())
    y = list(degree_freq.values())

    x_poisson = list(range(1, 100))
    y_poisson = []
    for k in x_poisson:
        y_poisson.append(
            math.exp(-k_mean) * ((k_mean**k) / math.factorial(k)))

    fig = plt.figure()
    plt.scatter(x, y, marker="o", edgecolors='none', c='darkgray', s=120,
                zorder=3)
    plt.plot(x_poisson, y_poisson)
    plt.xlabel("Number of edges, k", fontsize=16)
    plt.ylabel("Frequency", fontsize=16)
    plt.xscale('log')
    plt.yscale('log')
    plt.ylim(0.001, 1)
    fig.tight_layout()
    fig.savefig(nt.get_path() + '/figs/edge_dist.png', bbox_inches="tight",
                pad_inches=0.4, dpi=600)
    plt.close()
def get_naive_good_network():
    """Reconstruct one naive network per time point for Good et al.

    Rows of ``gene_by_pop.txt`` are labelled ``<population>_<time>``.  Only
    rows whose label matches one of ``nt.complete_nonmutator_lines()`` or
    ``'p5'`` are kept, and all-zero columns are dropped.  For each distinct
    time point the matching rows (again without all-zero columns) are passed
    to ``nt.reconstruct_naive_network`` and the result is written to
    ``networks_naive/network_<time>.txt``.

    Rows are matched to a time point by comparing the parsed integer, not by
    substring.  The previous ``str.contains('_' + str(time_point))`` test
    also selected later time points sharing the prefix (``'_250'`` matched
    ``'_2500'`` and ``'_25000'``).
    """
    out_directory = nt.get_path() + '/data/Good_et_al/networks_naive'
    df_path = nt.get_path() + '/data/Good_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep = '\t', header = 'infer', index_col = 0)
    # Copy before extending so the list owned by nt is not mutated.
    lines_to_keep = list(nt.complete_nonmutator_lines()) + ['p5']
    df_nonmut = df[df.index.str.contains('|'.join(lines_to_keep))]
    # remove columns with all zeros
    df_nonmut = df_nonmut.loc[:, (df_nonmut != 0).any(axis=0)]
    row_times = [int(x.split('_')[1]) for x in df_nonmut.index.values]
    for time_point in sorted(set(row_times)):
        print(time_point)
        at_time_point = [t == time_point for t in row_times]
        df_time_point = df_nonmut.loc[at_time_point]
        df_time_point = df_time_point.loc[:, (df_time_point != 0).any(axis=0)]
        network_tp = nt.reconstruct_naive_network(df_time_point)
        network_tp.to_csv(
            out_directory + '/network_' + str(time_point) + '.txt',
            sep = '\t', index = True)
def run_network_permutation_rndm(iter = 1000, include_kmax = True):
    """Summarise ``iter`` randomised versions of the Tenaillon et al. network.

    The ``kpsD`` row and column are always dropped from the observed
    network.  For every iteration ``nt.get_random_network_edges`` produces a
    randomised network, and one tab-separated row is written to
    ``permute_network_rndm_nokmax.txt`` with its maximum degree, mean
    degree, mean clustering coefficient, mean clustering coefficient over
    strictly positive values, and ``nt.networkx_distance``.

    Parameters
    ----------
    iter : int
        Number of randomised networks to generate.
    include_kmax : bool
        Accepted for backward compatibility; the visible code never reads it.

    Notes
    -----
    Links between a node's neighbours are now counted in the randomised
    network.  The previous version counted them in the observed network
    while taking the neighbour set from the randomised one.  This assumes
    ``df_rndm`` uses the same labels on rows and columns, which the original
    code already relied on when dropping the diagonal entry by row label.
    """
    df_path = nt.get_path() + '/data/Tenaillon_et_al/network.txt'
    df = pd.read_csv(df_path, sep = '\t', header = 'infer', index_col = 0)
    df = df.drop('kpsD', axis=0)
    df = df.drop('kpsD', axis=1)
    out_path = nt.get_path() + '/data/Tenaillon_et_al/permute_network_rndm_nokmax.txt'
    columns = ['Iteration', 'k_max', 'k_mean', 'C_mean', 'C_mean_no1or2', 'd_mean']
    # The context manager closes the file even when an iteration raises.
    with open(out_path, 'w') as df_out:
        df_out.write('\t'.join(columns) + '\n')
        for i in range(iter):
            print("Iteration " + str(i))
            df_rndm = nt.get_random_network_edges(df)
            k_i_rndm = []
            C_i_rndm_list = []
            for index, row in df_rndm.iterrows():
                # degree = non-zero entries minus the diagonal
                k_row = sum(v != 0 for v in row.values) - 1
                k_i_rndm.append(k_row)
                if k_row in (0, 1):
                    C_i_rndm_list.append(float(0))
                    continue
                # Boolean masking replaces Series.nonzero (removed in
                # pandas 1.0); the node itself is dropped from the set.
                neighbours = row[row != 0].drop(labels = [index]).index.tolist()
                L_i = 0
                for gene in neighbours:
                    others = list(neighbours)
                    others.remove(gene)
                    L_i += int((df_rndm.loc[[gene]][others].values != 0).sum())
                # we don't multiply L_i by a factor of 2 bc we're double
                # counting edges
                C_i_rndm_list.append(L_i / (k_row * (k_row - 1)))
            k_max = max(k_i_rndm)
            k_mean = np.mean(k_i_rndm)
            C_mean = np.mean(C_i_rndm_list)
            C_mean_no1or2 = np.mean([c for c in C_i_rndm_list if c > 0])
            distance_df = nt.networkx_distance(df_rndm)
            df_out.write('\t'.join([str(i), str(k_max), str(k_mean),
                                    str(C_mean), str(C_mean_no1or2),
                                    str(distance_df)]) + '\n')
def plot_cluster_dist():
    """Plot a kernel density estimate of the clustering coefficients C_i.

    Reads the ``C_i`` column of ``network_CCs.txt``, selects a
    ``KernelDensity`` bandwidth from 50 values in [0.1, 10] by 20-fold
    cross-validation, evaluates the density on 1000 points in [0, 2.5],
    rescales it to sum to one, and saves the curve to ``/figs/C_dist.png``.

    The x axis is the clustering coefficient and the y axis the normalised
    density.  The previous labels ("Number of edges, k" on x and
    "Clustering coefficient" on y) did not match the plotted data.
    """
    df_path = nt.get_path() + '/data/Tenaillon_et_al/network_CCs.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    # cluster kde
    C_i = df.C_i.values
    grid_ = GridSearchCV(KernelDensity(),
                         {'bandwidth': np.linspace(0.1, 10, 50)},
                         cv=20)  # 20-fold cross-validation
    grid_.fit(C_i[:, None])
    x_grid_ = np.linspace(0, 2.5, 1000)
    kde_ = grid_.best_estimator_
    pdf_ = np.exp(kde_.score_samples(x_grid_[:, None]))
    # Normalise once; the total used to be recomputed for every element.
    pdf_total = sum(pdf_)
    pdf_ = [x / pdf_total for x in pdf_]
    fig = plt.figure()
    plt.plot(x_grid_, pdf_)
    plt.xlabel("Clustering coefficient, " + r'$C_{i}$', fontsize=16)
    plt.ylabel("Frequency", fontsize=16)
    fig.tight_layout()
    fig.savefig(nt.get_path() + '/figs/C_dist.png', bbox_inches="tight",
                pad_inches=0.4, dpi=600)
    plt.close()
def run_network_permutation_ba():
    """Fit ``nt.continuumBarabasiAlbert`` to the degree distribution.

    The header of ``permute_network_ba.txt`` is written (no data rows are
    produced by the visible code).  Degrees are the non-zero entries per row
    minus one, keeping positive degrees only.  The largest degree is removed
    from the distribution, the remaining frequencies are renormalised, and
    the model is fitted from eight starting points.  The parameters of the
    fit with the lowest AIC are printed.

    The renormalisation subtracts the actual number of nodes that had the
    maximum degree.  The previous code subtracted a hard-coded 1, which is
    only right when a single node attains the maximum.
    """
    df_path = nt.get_path() + '/data/Tenaillon_et_al/network.txt'
    df = pd.read_csv(df_path, sep = '\t', header = 'infer', index_col = 0)
    out_path = nt.get_path() + '/data/Tenaillon_et_al/permute_network_ba.txt'
    # Write and close right away so the handle is not held open (or leaked
    # on error) while the models are being fitted.
    with open(out_path, 'w') as df_out:
        df_out.write('\t'.join(['Iteration', 'k_max', 'k_mean', 'C_mean',
                                'C_mean_no1or2', 'd_mean']) + '\n')
    k_list = []
    for index, row in df.iterrows():
        k_row = sum(i != 0 for i in row.values) - 1
        if k_row > 0:
            k_list.append(k_row)
    count_k_list = Counter(k_list)
    count_k_list_sum = sum(count_k_list.values())
    n_at_max = count_k_list.pop(max(count_k_list), 0)
    n_remaining = count_k_list_sum - n_at_max
    x_no_max = list(count_k_list.keys())
    y_no_max = [(i / n_remaining) for i in count_k_list.values()]
    model_no_max = nt.continuumBarabasiAlbert(x_no_max, y_no_max)
    m_start = [1, 2, 3, 4]
    z_start = [-2, -0.5]
    results = [model_no_max.fit(start_params = [m, z])
               for m in m_start for z in z_start]
    AICs = [result.aic for result in results]
    best = results[AICs.index(min(AICs))]
    # Both calls are kept from the original although their results are not
    # used further here.
    best_CI_FIC = nt.CI_FIC(best)
    best_CI = best.conf_int()
    best_params = best.params
    print(best_params)
def get_good_network_features(reconstruct = 'naive'):
    """Write per-time-point summary features of the Good et al. networks.

    For every network file in the chosen directory one tab-separated row is
    written with the generation (second file-name token when split on ``_``
    and ``.``), the number of nodes, the maximum and mean ``k_i``, the mean
    ``C_i``, the mean ``C_i`` over nodes with ``k_i >= 2``, and
    ``nt.networkx_distance`` of the network.  ``k_i`` and ``C_i`` come from
    the matching ``network_<reconstruct>_CCs.txt`` table.

    Parameters
    ----------
    reconstruct : str
        ``'naive'`` or ``'BIC'``; selects the input directory, the
        clustering table and the output file.

    Raises
    ------
    ValueError
        If ``reconstruct`` is neither ``'naive'`` nor ``'BIC'``.  This used
        to fail later with an unbound local variable.
    """
    sources = {
        'naive': ('/data/Good_et_al/networks_naive/',
                  '/data/Good_et_al/network_naive_features.txt',
                  '/data/Good_et_al/network_naive_CCs.txt'),
        'BIC': ('/data/Good_et_al/networks_BIC/',
                '/data/Good_et_al/network_BIC_features.txt',
                '/data/Good_et_al/network_BIC_CCs.txt'),
    }
    if reconstruct not in sources:
        raise ValueError(
            "Argument reconstruct not recognized: " + repr(reconstruct))
    directory_rel, features_rel, clust_rel = sources[reconstruct]
    directory = nt.get_path() + directory_rel
    # Read the input before opening the output so a missing clustering
    # table does not leave a truncated features file behind.
    df_clust = pd.read_csv(nt.get_path() + clust_rel, sep = '\t',
                           header = 'infer')
    df_out_columns = ['Generations', 'N', 'k_max', 'k_mean', 'C_mean',
                      'C_mean_no1or2', 'd_mean']
    with open(nt.get_path() + features_rel, 'w') as df_out:
        df_out.write('\t'.join(df_out_columns) + '\n')
        for filename in os.listdir(directory):
            if filename == '.DS_Store':
                continue
            df = pd.read_csv(directory + filename, sep = '\t',
                             header = 'infer', index_col = 0)
            time = re.split('[_.]', filename)[1]
            df_clust_time = df_clust.loc[df_clust['Generations'] == int(time)]
            N = df.shape[0]
            k_max = max(df_clust_time.k_i.values)
            k_mean = np.mean(df_clust_time.k_i.values)
            C_mean = np.mean(df_clust_time.C_i.values)
            C_mean_no1or2 = np.mean(
                df_clust_time.loc[df_clust_time['k_i'] >= 2].C_i.values)
            distance_df = nt.networkx_distance(df)
            print(time)
            print(distance_df)
            row = [str(time), str(N), str(k_max), str(k_mean), str(C_mean),
                   str(C_mean_no1or2), str(distance_df)]
            df_out.write('\t'.join(row) + '\n')
def plot_nodes_over_time():
    """Plot network size against time for the BIC-reconstructed networks.

    The number of rows of each network file in ``networks_BIC`` is scattered
    against the generation parsed from its file name (second token when
    split on ``_`` and ``.``).  A reference line that starts one above the
    smallest network and grows by one node per distinct time point is
    overlaid.  The y axis is logarithmic and the figure is saved to
    ``/figs/good_N_vs_time.png``.
    """
    directory = nt.get_path() + '/data/Good_et_al/networks_BIC/'
    time_nodes = []
    for filename in os.listdir(directory):
        # Skip the macOS folder metadata file, as the other loaders do; it
        # is not a network table and has no numeric time token.
        if filename == '.DS_Store':
            continue
        df = pd.read_csv(directory + filename, sep='\t', header='infer',
                         index_col=0)
        time = re.split('[_.]', filename)[1]
        time_nodes.append((int(time), df.shape[0]))
    time_nodes_sorted = sorted(time_nodes, key=lambda tup: tup[0])
    x = [i[0] for i in time_nodes_sorted]
    y = [i[1] for i in time_nodes_sorted]
    x_pred = sorted(set(x))
    # Hoisted out of the comprehension; guarded so an empty directory
    # behaves as before (min() used to be reached only for non-empty input).
    y_floor = min(y) if y else 0
    y_pred = [y_floor + step + 1 for step in range(len(x_pred))]
    fig = plt.figure()
    plt.scatter(x, y, marker="o", edgecolors='#244162', c='#175ac6', s=120,
                zorder=3)
    plt.plot(x_pred, y_pred)
    plt.xlabel("Time (generations)", fontsize=18)
    plt.ylabel('Network size, ' + r'$N$', fontsize=18)
    plt.ylim(5, 500)
    plt.yscale('log')
    fig.tight_layout()
    fig.savefig(nt.get_path() + '/figs/good_N_vs_time.png',
                bbox_inches="tight", pad_inches=0.4, dpi=600)
    plt.close()
def pop_by_gene_tenaillon(self):
    """Build the Tenaillon et al. gene-by-population table and gene sizes.

    Reads ``1212986tableS2_clean.csv``, skipping the header line,
    ``Intergenic`` rows (column 4) and rows whose column 9 is not all
    digits.  For each remaining row the mutation count for (gene name in
    column 6, population in column 0) is incremented.  The gene length is
    taken from column 9, multiplied by three when the last column is
    ``gene_length_in_codon`` and used as is for ``gene_length_bp``.

    Outputs: ``gene_by_pop.txt`` (tab-separated counts, missing cells
    filled with 0) and ``gene_size_dict.txt`` (pickled ``{gene: length}``).

    A row whose units label is neither of the two known values still
    contributes to the counts but no longer records a gene size.
    Previously it reused the length of the preceding row, or raised on the
    first data row.
    """
    pop_by_gene_dict = {}
    gene_size_dict = {}
    df_in = nt.get_path() + 'data/Tenaillon_et_al/1212986tableS2_clean.csv'
    with open(df_in, 'r') as table:
        for i, line in enumerate(table):
            line_split = line.strip().split(',')
            if (i == 0) or \
                (line_split[4] == 'Intergenic') or \
                (not line_split[9].isdigit()):
                continue
            gene_length_units = line_split[-1]
            gene_name = line_split[6]
            pop_name = line_split[0]
            if gene_length_units == 'gene_length_in_codon':
                gene_length = int(line_split[9]) * 3
            elif gene_length_units == 'gene_length_bp':
                gene_length = int(line_split[9])
            else:
                gene_length = None
            # Only the first known length seen for a gene is kept.
            if gene_length is not None and gene_name not in gene_size_dict:
                gene_size_dict[gene_name] = gene_length
            pop_counts = pop_by_gene_dict.setdefault(gene_name, {})
            pop_counts[pop_name] = pop_counts.get(pop_name, 0) + 1
    df = pd.DataFrame.from_dict(pop_by_gene_dict)
    df = df.fillna(0)
    df_out = nt.get_path() + 'data/Tenaillon_et_al/gene_by_pop.txt'
    df.to_csv(df_out, sep='\t', index=True)
    gene_size_dict_out = nt.get_path() + 'data/Tenaillon_et_al/gene_size_dict.txt'
    with open(gene_size_dict_out, 'wb') as handle:
        pickle.dump(gene_size_dict, handle)
def reformat_convergence_matrix(self, mut_type='F'):
    """Turn the Good et al. convergence matrix into a count table.

    ``self.parse_convergence_matrix`` supplies, per gene, a ``'mutations'``
    mapping from population name to a list of records.  ``record[0]`` is
    read as a time and ``record[1]`` as a state code (the original comment
    gives ``{'A':0,'E':1,'F':2,'P':3}``).

    * ``mut_type='F'`` keeps state 2.  Each kept record adds one to
      ``<pop>_<t>`` for its own time point and every later time point seen
      anywhere in the matrix.  Output: ``gene_by_pop.txt``.
    * ``mut_type='P'`` keeps state 3.  Each kept record adds one to
      ``<pop>_<t>`` for its own time point only.  Output:
      ``gene_by_pop_poly.txt``.

    The table (missing cells filled with 0, all-zero columns dropped) is
    written tab-separated.

    Raises
    ------
    ValueError
        If ``mut_type`` is neither ``'F'`` nor ``'P'``.  This used to print
        a message per population and then fail on an unbound output path.
    """
    settings = {
        'F': (2, 'data/Good_et_al/gene_by_pop.txt'),
        'P': (3, 'data/Good_et_al/gene_by_pop_poly.txt'),
    }
    if mut_type not in settings:
        raise ValueError("Argument mut_type not recognized")
    state_code, out_rel = settings[mut_type]

    conv_dict = self.parse_convergence_matrix(
        nt.get_path() + "data/Good_et_al/gene_convergence_matrix.txt")
    time_points = sorted({
        int(mutation[0])
        for gene_data in conv_dict.values()
        for mutations in gene_data['mutations'].values()
        for mutation in mutations
    })
    # Position of every time point, so the "this and all later" slice does
    # not need a linear list.index() scan per mutation.
    first_index = {t: i for i, t in enumerate(time_points)}

    new_dict = {}
    for gene_name, gene_data in conv_dict.items():
        counts = new_dict.setdefault(gene_name, {})
        for pop_name, mutations in gene_data['mutations'].items():
            # Keep the time ordering of the original so keys are inserted
            # in the same order.
            ordered = sorted(mutations, key=lambda tup: tup[0])
            for mutation in ordered:
                if int(mutation[1]) != state_code:
                    continue
                # int() matches how time_points was built.
                time = int(mutation[0])
                if mut_type == 'F':
                    affected = time_points[first_index[time]:]
                else:
                    affected = (time,)
                for time_point in affected:
                    pop_time = pop_name + '_' + str(int(time_point))
                    counts[pop_time] = counts.get(pop_time, 0) + 1

    df = pd.DataFrame.from_dict(new_dict)
    df = df.fillna(0)
    df = df.loc[:, (df != 0).any(axis=0)]
    df.to_csv(nt.get_path() + out_rel, sep='\t', index=True)
def plot_kmax_over_time():
    """Plot log10 maximum degree against log10 time with an OLS fit.

    For each network file in ``networks_BIC`` the maximum column-wise count
    of truthy entries is taken as k_max, and the generation is the second
    file-name token when split on ``_`` and ``.``.  ``kmax ~ t`` is fitted
    by ordinary least squares on the log10 values.  The scatter, the fitted
    line, a shaded band from columns 6-7 of the statsmodels summary table
    and text annotations of the fit are saved to
    ``/figs/good_kmax_vs_time.png``.
    """
    directory = nt.get_path() + '/data/Good_et_al/networks_BIC/'
    time_kmax = []
    for filename in os.listdir(directory):
        # Skip the macOS folder metadata file, as the other loaders do.
        if filename == '.DS_Store':
            continue
        df = pd.read_csv(directory + filename, sep='\t', header='infer',
                         index_col=0)
        time = re.split('[_.]', filename)[1]
        time_kmax.append((int(time),
                          max(df.astype(bool).sum(axis=0).values)))
    time_kmax_sorted = sorted(time_kmax, key=lambda tup: tup[0])
    x = np.log10([i[0] for i in time_kmax_sorted])
    y = np.log10([i[1] for i in time_kmax_sorted])

    fig = plt.figure()
    plt.scatter(x, y, c='#175ac6', marker='o', s=120,
                edgecolors='#244162', linewidth=0.6, alpha=0.8, zorder=3)

    # using some code from ken locey, will cite later
    df = pd.DataFrame({'t': list(x)})
    df['kmax'] = list(y)
    f = smf.ols('kmax ~ t', df).fit()
    R2 = f.rsquared
    # .iloc: params is a labelled Series, and positional [] on it is
    # deprecated in recent pandas.
    intercept = f.params.iloc[0]
    slope = f.params.iloc[1]
    X = np.linspace(min(x), max(x), 1000)
    Y = f.predict(exog=dict(t=X))
    st, data, ss2 = summary_table(f, alpha=0.05)
    print(ss2)
    pred_ci_low, pred_ci_upp = data[:, 6:8].T
    slope_to_gamme = (1 / slope) + 1
    plt.fill_between(x, pred_ci_low, pred_ci_upp, color='#175ac6', lw=0.5,
                     alpha=0.2)
    plt.text(2.4, 2.05,
             r'$k_{max}$' + ' = ' + str(round(10**intercept, 2)) + '*' +
             r'$t^ \frac{1}{\,' + str(round(slope_to_gamme, 2)) + '- 1}$',
             fontsize=12, color='k', alpha=0.9)
    plt.text(2.4, 1.94, r'$r^2$' + ' = ' + str("%.2f" % R2), fontsize=12,
             color='0.2')
    # Only color= is passed: giving both c= and color= (aliases of one
    # another) is rejected by current Matplotlib.
    plt.plot(X.tolist(), Y.tolist(), '--', lw=2, alpha=0.8, color='k',
             label='Power-law')
    plt.xlabel("Time (generations), " + r'$\mathrm{log}_{10}$', fontsize=18)
    plt.ylabel(r'$k_{max}, \; \mathrm{log}_{10}$', fontsize=18)
    fig.tight_layout()
    fig.savefig(nt.get_path() + '/figs/good_kmax_vs_time.png',
                bbox_inches="tight", pad_inches=0.4, dpi=600)
    plt.close()
def _count_links_among_neighbours(df, index, row):
    """Count non-zero entries of ``df`` that join two neighbours of ``index``.

    Neighbours are the labels of the non-zero entries of ``row`` with
    ``index`` itself removed.  Every link is seen from both of its ends, so
    the returned count is twice the number of distinct links.
    """
    # Boolean masking replaces Series.nonzero (removed in pandas 1.0).
    neighbours = row[row != 0].drop(labels = [index]).index.tolist()
    links = 0
    for gene in neighbours:
        others = list(neighbours)
        others.remove(gene)
        links += int((df.loc[[gene]][others].values != 0).sum())
    return links


def get_network_clustering_coefficients(dataset = 'good', kmax = True, reconstruct = 'naive'):
    """Write the degree and local clustering coefficient of every node.

    A node's degree ``k_i`` is the number of non-zero entries in its row
    minus one (the diagonal).  ``C_i`` is 0 when ``k_i`` is 0 or 1 and
    otherwise the doubly-counted number of links among its neighbours
    divided by ``k_i * (k_i - 1)``.

    Parameters
    ----------
    dataset : str
        ``'tenaillon'`` processes the single Tenaillon et al. network and
        writes ``Gene, k_i, C_i``.  ``'good'`` processes every file in the
        Good et al. network directory and writes
        ``Generations, Gene, k_i, C_i``.  Any other value does nothing, as
        before.
    kmax : bool
        Tenaillon only.  When equal to ``False`` the ``kpsD`` row and column
        are dropped and the output goes to ``network_CCs_no_kmax.txt``
        instead of ``network_CCs.txt``.
    reconstruct : str
        Good only.  ``'naive'`` or ``'BIC'``; selects the input directory
        and output file.

    Raises
    ------
    ValueError
        If ``dataset == 'good'`` and ``reconstruct`` is not recognised.
        This used to fail with an unbound local variable.
    """
    if dataset == 'tenaillon':
        # df is a pandas dataframe containing network interactions
        df_path = nt.get_path() + '/data/Tenaillon_et_al/network.txt'
        df = pd.read_csv(df_path, sep = '\t', header = 'infer', index_col = 0)
        if kmax == False:  # noqa: E712 - keeps the original comparison
            df = df.drop('kpsD', axis=0)
            df = df.drop('kpsD', axis=1)
            out_path = nt.get_path() + '/data/Tenaillon_et_al/network_CCs_no_kmax.txt'
        else:
            out_path = nt.get_path() + '/data/Tenaillon_et_al/network_CCs.txt'
        with open(out_path, 'w') as df_out:
            df_out.write('\t'.join(['Gene', 'k_i', 'C_i']) + '\n')
            for index, row in df.iterrows():
                k_row = sum(i != 0 for i in row.values) - 1
                if k_row in (0, 1):
                    C_i = 0
                else:
                    L_i = _count_links_among_neighbours(df, index, row)
                    # we don't multiply L_i by a factor of 2 bc we're
                    # double counting edges
                    C_i = L_i / (k_row * (k_row - 1))
                df_out.write('\t'.join([index, str(k_row), str(C_i)]) + '\n')

    elif dataset == 'good':
        sources = {
            'naive': ('/data/Good_et_al/networks_naive/',
                      '/data/Good_et_al/network_naive_CCs.txt'),
            'BIC': ('/data/Good_et_al/networks_BIC/',
                    '/data/Good_et_al/network_BIC_CCs.txt'),
        }
        if reconstruct not in sources:
            raise ValueError(
                "Argument reconstruct not recognized: " + repr(reconstruct))
        directory = nt.get_path() + sources[reconstruct][0]
        with open(nt.get_path() + sources[reconstruct][1], 'w') as df_out:
            df_out.write('\t'.join(['Generations', 'Gene', 'k_i', 'C_i']) + '\n')
            for filename in os.listdir(directory):
                if filename == '.DS_Store':
                    continue
                df = pd.read_csv(directory + filename, sep = '\t',
                                 header = 'infer', index_col = 0)
                time = re.split('[_.]', filename)[1]
                print(time)
                for index, row in df.iterrows():
                    k_row = sum(i != 0 for i in row.values) - 1
                    if k_row in (0, 1):
                        C_i = float(0)
                    else:
                        L_i = _count_links_among_neighbours(df, index, row)
                        # we don't multiply L_i by a factor of 2 bc we're
                        # double counting edges
                        C_i = L_i / (k_row * (k_row - 1))
                    df_out.write('\t'.join([str(time), index, str(k_row),
                                            str(C_i)]) + '\n')
def fig6():
    """Draw the four-panel figure 6 for the naive Good et al. networks.

    Panels (2 x 2 grid):

    1. Network size (rows per network file) against generation.
    2. Maximum degree against generation on log-log axes, with an OLS fit of
       log10(k_max) on log10(t), a shaded band taken from columns 6-7 of
       the statsmodels summary table, and text annotations of the fit.
    3. Mean clustering coefficient against network size, with the
       lowest-AIC ``nt.clusterBarabasiAlbert`` fit drawn via
       ``nt.cluster_BA``.
    4. Mean distance against network size, with the lowest-AIC
       ``nt.distanceBarabasiAlbert`` fit drawn via ``nt.distance_BA``.

    Panels 3 and 4 read ``network_naive_features.txt``.  The figure is saved
    to ``/figs/fig6.png``.
    """
    network_dir = nt.get_path() + '/data/Good_et_al/networks_naive/'
    time_nodes = []
    time_kmax = []
    for filename in os.listdir(network_dir):
        if filename == '.DS_Store':
            continue
        df = pd.read_csv(network_dir + filename, sep='\t', header='infer',
                         index_col=0)
        time = re.split('[_.]', filename)[1]
        time_nodes.append((int(time), df.shape[0]))
        time_kmax.append((int(time),
                          max(df.astype(bool).sum(axis=0).values)))

    fig = plt.figure()

    # ---- panel 1: network size over time ----
    ax1 = plt.subplot2grid((2, 2), (0, 0), colspan=1)
    time_nodes_sorted = sorted(time_nodes, key=lambda tup: tup[0])
    x_nodes = [i[0] for i in time_nodes_sorted]
    y_nodes = [i[1] for i in time_nodes_sorted]
    ax1.scatter(x_nodes, y_nodes, marker="o", edgecolors='#244162',
                c='#175ac6', s=80, zorder=3, alpha=0.6)
    ax1.set_xlabel("Time (generations)", fontsize=14)
    ax1.set_ylabel('Network size, ' + r'$N$', fontsize=14)
    ax1.set_ylim(5, 500)

    # ---- panel 2: k_max over time with log-log regression ----
    ax2 = plt.subplot2grid((2, 2), (0, 1), colspan=1)
    time_kmax_sorted = sorted(time_kmax, key=lambda tup: tup[0])
    x_kmax = np.log10([i[0] for i in time_kmax_sorted])
    y_kmax = np.log10([i[1] for i in time_kmax_sorted])

    # The below regression code is from the GitHub repository
    # ScalingMicroBiodiversity and is licensed under a
    # GNU General Public License v3.0.
    # https://github.com/klocey/ScalingMicroBiodiversity
    df_regression = pd.DataFrame({'t': list(x_kmax)})
    df_regression['kmax'] = list(y_kmax)
    f = smf.ols('kmax ~ t', df_regression).fit()
    R2 = f.rsquared
    # .iloc: params is a labelled Series, and positional [] on it is
    # deprecated in recent pandas.
    intercept = f.params.iloc[0]
    slope = f.params.iloc[1]
    X = np.linspace(min(x_kmax), max(x_kmax), 1000)
    Y = f.predict(exog=dict(t=X))
    print(min(x_kmax), max(y_kmax))
    st, data, ss2 = summary_table(f, alpha=0.05)
    pred_ci_low, pred_ci_upp = data[:, 6:8].T
    slope_to_gamme = (1 / slope) + 1

    # Values were fitted in log10 space; transform back for the log axes.
    t_linear = [10**i for i in x_kmax]
    ax2.scatter(t_linear, [10**i for i in y_kmax], c='#175ac6', marker='o',
                s=80, edgecolors='#244162', linewidth=0.6, alpha=0.6,
                zorder=1)
    ax2.fill_between(t_linear, [10**i for i in pred_ci_low],
                     [10**i for i in pred_ci_upp], color='#175ac6', lw=0.5,
                     alpha=0.2, zorder=2)
    ax2.text(250, 100,
             r'$k_{max}$' + ' = ' + str(round(10**intercept, 2)) + '*' +
             r'$t^ \frac{1}{\,' + str(round(slope_to_gamme, 2)) + '- 1}$',
             fontsize=9, color='k', alpha=0.9)
    ax2.text(250, 60, r'$r^2$' + ' = ' + str("%.2f" % R2), fontsize=9,
             color='0.2')
    # Only color= is passed: giving both c= and color= (aliases of one
    # another) is rejected by current Matplotlib.
    ax2.plot([10**i for i in X.tolist()], [10**i for i in Y.tolist()], '--',
             lw=2, alpha=0.8, color='k', label='Power-law', zorder=2)
    ax2.set_xlabel("Time (generations)", fontsize=14)
    ax2.set_ylabel(r'$k_{max}$', fontsize=14)
    ax2.set_xscale('log')
    ax2.set_yscale('log')

    # ---- panel 3: mean clustering coefficient vs. network size ----
    ax3 = plt.subplot2grid((2, 2), (1, 0), colspan=1)
    df_net_feats_path = nt.get_path() + '/data/Good_et_al/network_naive_features.txt'
    df_net_feats = pd.read_csv(df_net_feats_path, sep='\t', header='infer')
    x_C = df_net_feats.N.values
    y_C = df_net_feats.C_mean.values
    ax3.scatter(x_C, y_C, c='#175ac6', marker='o', s=80,
                edgecolors='#244162', linewidth=0.6, alpha=0.6, zorder=1)
    model = nt.clusterBarabasiAlbert(x_C, y_C)
    b0_start = [0.01, 0.1, 1, 10]
    z_start = [-2, -0.5]
    results = [model.fit(start_params=[b0, z])
               for b0 in b0_start for z in z_start]
    AICs = [result.aic for result in results]
    best = results[AICs.index(min(AICs))]
    # Both calls are kept from the original although their results are not
    # used further here.
    best_CI_FIC = nt.CI_FIC(best)
    best_CI = best.conf_int()
    best_params = best.params
    barabasi_albert_range_C_ll = nt.cluster_BA(np.sort(x_C), best_params[0])
    ax3.plot(np.sort(x_C), barabasi_albert_range_C_ll, c='k', lw=2.5,
             ls='--', zorder=2)
    ax3.set_xlabel('Network size, ' + r'$N$', fontsize=14)
    ax3.set_ylabel('Mean clustering \ncoefficient, ' +
                   r'$\left \langle C \right \rangle$', fontsize=14)
    ax3.set_yscale('log')
    ax3.set_ylim(0.05, 1.5)

    # ---- panel 4: mean distance vs. network size ----
    ax4 = plt.subplot2grid((2, 2), (1, 1), colspan=1)
    x_d = df_net_feats.N.values
    y_d = df_net_feats.d_mean.values
    ax4.scatter(x_d, y_d, c='#175ac6', marker='o', s=80,
                edgecolors='#244162', linewidth=0.6, alpha=0.6, zorder=1)
    model_d = nt.distanceBarabasiAlbert(x_d, y_d)
    results_d = [model_d.fit(start_params=[b0, z])
                 for b0 in b0_start for z in z_start]
    AICs_d = [result_d.aic for result_d in results_d]
    best_d = results_d[AICs_d.index(min(AICs_d))]
    best_CI_FIC_d = nt.CI_FIC(best_d)
    best_CI_d = best_d.conf_int()
    best_d_params = best_d.params
    barabasi_albert_range_d_ll = nt.distance_BA(np.sort(x_d),
                                                best_d_params[0])
    # The curve is plotted against the x values it was evaluated on (x_d).
    # The original passed x_C, which holds the same column today.
    ax4.plot(np.sort(x_d), barabasi_albert_range_d_ll, c='k', lw=2.5,
             ls='--', zorder=2)
    ax4.set_xlabel('Network size, ' + r'$N$', fontsize=14)
    ax4.set_ylabel('Mean distance, ' + r'$\left \langle d \right \rangle$',
                   fontsize=14)

    plt.tight_layout()
    fig_name = nt.get_path() + '/figs/fig6.png'
    fig.savefig(fig_name, bbox_inches="tight", pad_inches=0.4, dpi=600)
    plt.close()
def plot_end_network():
    """Load the naive Good et al. network for generation 62750 and print it."""
    final_network = pd.read_csv(
        nt.get_path() + '/data/Good_et_al/networks_naive/network_62750.txt',
        sep='\t',
        header='infer',
        index_col=0,
    )
    print(final_network)
import networkx as nx
import pandas as pd
import network_tools as nt
#import matplotlib
#matplotlib.use('TkAgg')
import matplotlib.pyplot as plt

# Load the naive Good et al. network for generation 55250 and turn its
# value matrix into a networkx graph.
df = pd.read_csv(nt.get_path() + '/data/Good_et_al/networks_naive/network_55250.txt',
                 index_col=0, sep = '\t')
df_values = df.values
# nx.from_numpy_matrix was removed in NetworkX 3.0; from_numpy_array is its
# replacement for plain 2-D arrays.  Fall back to the old name on releases
# that do not have the new one.
if hasattr(nx, 'from_numpy_array'):
    G = nx.from_numpy_array(df_values)
else:
    G = nx.from_numpy_matrix(df_values)
#print(G)
#nx.draw(G)
#plt.savefig(nt.get_path() + '/figs/ntwrk_good.png', format="PNG")
def fig4():
    """Draw the four-panel figure 4 for the Tenaillon et al. network.

    Panels (2 x 2 grid):

    1. Degree distribution of the full network (log-log) with a binomial
       curve parameterised for the network without ``kpsD``.
    2. Histogram of null ``k_max`` values with the observed maximum degree
       of the network without ``kpsD`` marked.
    3. Histogram of null mean clustering coefficients with the observed
       mean (nodes with ``k_i >= 2``) marked.
    4. Histogram of null mean distances with the observed distance marked.

    Null values come from ``permute_network.txt`` and observed clustering
    values from ``network_CCs_no_kmax.txt``.  For panels 2-4 a p-score
    (relative rank of the observed value among the null values, folded at
    0.5) is printed.  The figure is saved to ``/figs/fig4.png``.
    """

    def positive_degrees(network):
        # Degree = non-zero entries in a row minus the diagonal; nodes with
        # no edges are left out.
        degrees = []
        for _, row in network.iterrows():
            k_row = sum(i != 0 for i in row.values) - 1
            if k_row > 0:
                degrees.append(k_row)
        return degrees

    def frequency_hist(axis, values):
        # The weights make the bar heights sum to one.
        axis.hist(values, bins=30,
                  weights=np.zeros_like(values) + 1. / len(values),
                  alpha=0.8, color='#175ac6')

    def folded_rank(null_values, observed):
        # Appends the observed value to the null list (as the original did)
        # and returns its relative rank folded at 0.5.
        null_values.append(observed)
        position = sorted(null_values).index(observed) / (
            len(null_values) - 1)
        if position > 0.5:
            return 1 - position
        return position

    df_path = nt.get_path() + '/data/Tenaillon_et_al/network.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    df_C_path = nt.get_path() + '/data/Tenaillon_et_al/network_CCs_no_kmax.txt'
    df_C = pd.read_csv(df_C_path, sep='\t', header='infer', index_col=0)
    kmax_df = max(df_C.k_i.values)
    mean_C_df = np.mean(df_C.loc[df_C['k_i'] >= 2].C_i.values)
    df_null_path = nt.get_path() + '/data/Tenaillon_et_al/permute_network.txt'
    df_null = pd.read_csv(df_null_path, sep='\t', header='infer', index_col=0)

    df_no_max = df.copy().drop('kpsD', axis=0).drop('kpsD', axis=1)
    dist_df = nt.networkx_distance(df_no_max)

    C_mean_null = [x for x in df_null.C_mean_no1or2.tolist()
                   if str(x) != 'nan']
    d_mean_null = df_null.d_mean.tolist()
    k_max_null = df_null.k_max.tolist()

    fig = plt.figure()

    # ---- panel 1: degree distribution ----
    ax1 = plt.subplot2grid((2, 2), (0, 0), colspan=1)
    k_list = positive_degrees(df)
    k_list_no_max = positive_degrees(df_no_max)
    degree_counts = Counter(k_list)
    n_counted = sum(degree_counts.values())
    degree_x = list(degree_counts.keys())
    degree_y = [(c / n_counted) for c in degree_counts.values()]
    ax1.scatter(degree_x, degree_y, marker="o", edgecolors='#244162',
                c='#175ac6', alpha=0.4, s=60, zorder=4)

    n_nodes = df.shape[0]
    # NOTE(review): sum(k) counts every undirected edge twice, so these look
    # like twice the edge density - confirm this is intended.
    p = sum(k_list) / ((n_nodes * (n_nodes - 1)) / 2)
    # NOTE(review): 181 is presumably the degree of the dropped 'kpsD' node
    # in the current data - confirm before reusing with other input.
    p_no_max = sum([i for i in k_list if i != 181]) / (
        ((n_nodes - 1) * (n_nodes - 2)) / 2)
    binom_x = np.arange(0, max(k_list))
    binom_y = binom.pmf(binom_x, n_nodes - 1, p)  # computed but not drawn
    binom_y_noMax = binom.pmf(binom_x, n_nodes - 2, p_no_max)
    ax1.plot(binom_x, binom_y_noMax, c='k', lw=2.5, ls='--', zorder=2)
    ax1.set_xlim([0.5, 400])
    ax1.set_ylim([0.001, 1])
    ax1.set_xscale('log')
    ax1.set_yscale('log')
    ax1.set_xlabel(r'$k_{i}$', fontsize=14)
    ax1.set_ylabel("Frequency", fontsize=14)

    # ---- panel 2: null k_max ----
    ax2 = plt.subplot2grid((2, 2), (0, 1), colspan=1)
    frequency_hist(ax2, k_max_null)
    ax2.axvline(max(k_list_no_max), color='red', lw=2, ls='--')
    ax2.set_xlabel(r'$k_{max}$', fontsize=14)
    ax2.set_ylabel("Frequency", fontsize=14)
    p_score_k_max = folded_rank(k_max_null, kmax_df)
    print('kmax p-score = ' + str(round(p_score_k_max, 3)))

    # ---- panel 3: null mean clustering coefficient ----
    ax3 = plt.subplot2grid((2, 2), (1, 0), colspan=1)
    frequency_hist(ax3, C_mean_null)
    ax3.axvline(mean_C_df, color='red', lw=2, ls='--')
    ax3.set_xlabel('Mean clustering coefficient, ' +
                   r'$\left \langle C \right \rangle$', fontsize=14)
    ax3.set_ylabel("Frequency", fontsize=14)
    p_score_mean_C = folded_rank(C_mean_null, mean_C_df)
    print('mean C p-score = ' + str(round(p_score_mean_C, 3)))

    # ---- panel 4: null mean distance ----
    ax4 = plt.subplot2grid((2, 2), (1, 1), colspan=1)
    frequency_hist(ax4, d_mean_null)
    ax4.axvline(dist_df, color='red', lw=2, ls='--')
    ax4.set_xlabel('Mean distance, ' + r'$\left \langle d \right \rangle$',
                   fontsize=14)
    ax4.set_ylabel("Frequency", fontsize=14)
    p_score_d_mean = folded_rank(d_mean_null, dist_df)
    print('mean pairwise distance p-score = ' + str(round(p_score_d_mean, 3)))

    plt.tight_layout()
    fig_name = nt.get_path() + '/figs/fig4.png'
    fig.savefig(fig_name, bbox_inches="tight", pad_inches=0.4, dpi=600)
    plt.close()