def gini_pred_vs_obs_plot(filename=None):
    """Scatter predicted against observed Gini coefficients.

    Reads the module-level ``pred_ginis`` and ``obs_ginis`` sequences, adds a
    dashed y = x reference line over [0, 1], and writes a fixed
    ``r^2 = 0.25`` annotation (a literal; nothing is computed here).

    filename -- handed to ``maybesave`` unchanged.
    """
    plt.scatter(np.array(pred_ginis), np.array(obs_ginis))
    plt.xlabel("Predicted Gini coeff")
    plt.ylabel("Observed Gini coeff")
    plt.text(0.8, 0.2, "r^2 = 0.25")
    # The keyword was previously misspelled ``lienstyle``, which is not a
    # Line2D property and makes plt.plot fail.
    plt.plot([0, 1], [0, 1], linestyle='--')
    maybesave(filename)
def plot_results_dict_gini_qq(results_dict,filename=None): bios = [] maxents = [] uniforms = [] for i,k in enumerate(results_dict): g1,g2,tf = k.split("_") genome = g1 + "_" + g2 bio_motif = extract_tfdf_sites(genome,tf) bio_ic = motif_ic(bio_motif) bio_gini = motif_gini(bio_motif) d = results_dict[k] bios.append(bio_gini) maxents.append(mean(d['maxent']['motif_gini'])) uniforms.append(mean(d['uniform']['motif_gini'])) plt.scatter(bios,maxents,label='ME') plt.scatter(bios,uniforms,label='TURS',color='g') minval = min(bios+maxents+uniforms) maxval = max(bios+maxents+uniforms) plt.plot([minval,maxval],[minval,maxval],linestyle='--') plt.xlabel("Observed Gini Coefficient") plt.ylabel("Mean Sampled Gini Coefficient") plt.legend(loc='upper left') print "bio vs maxent:",pearsonr(bios,maxents) print "bio vs uniform:",pearsonr(bios,uniforms) maybesave(filename)
def main(prok_motifs, euk_motifs, filename='motif_summary_stats.eps'):
    """Draw a seaborn pair plot of per-motif summary statistics.

    Builds a DataFrame with one row per motif (eukaryotic motifs first, then
    prokaryotic) and the columns 'Domain', 'N' (log base 10 of the number of
    sites), 'L (bp)' (length of the first site), 'IC (bits)' and 'IGC', then
    plots it with ``sns.pairplot`` coloured by domain.

    prok_motifs, euk_motifs -- lists of motifs; each motif is indexed and
                               measured with ``len``.
    filename                -- handed to ``maybesave`` unchanged.
    """
    sns.set(style="darkgrid", color_codes=True)
    all_motifs = euk_motifs + prok_motifs
    ln10 = log(10)

    df = pd.DataFrame()
    df['Domain'] = ["Eukaryotic"] * len(euk_motifs) + ["Prokaryotic"] * len(prok_motifs)
    df['N'] = [log(len(m)) / ln10 for m in all_motifs]
    df['L (bp)'] = [len(m[0]) for m in all_motifs]
    df['IC (bits)'] = [motif_ic(m) for m in all_motifs]
    df['IGC'] = [motif_gini(m) for m in all_motifs]

    pg = sns.pairplot(df, hue='Domain', markers=['s', 'o'], palette='cubehelix')

    # Clamp the IGC axis (last column as x, last row as y) to a common range.
    for k in range(4):
        pg.axes[k][3].set_xlim(-0.01, 0.6)
        pg.axes[3][k].set_ylim(-0.01, 0.6)

    # 'N' holds log10 values, so label its ticks as powers of ten.
    decades = range(1, 5)
    decade_labels = ["$10^%i$" % d for d in decades]
    top_left = pg.axes[0][0]
    top_left.set_yticks(decades)
    top_left.set_yticklabels(decade_labels)
    bottom_left = pg.axes[3][0]
    bottom_left.set_xticks(decades)
    bottom_left.set_xticklabels(decade_labels)
    maybesave(filename)
def redo_ic_igc_plot(filename=None): xmin, xmax, ymin, ymax = 0, 0.6, 0, 0.6 marker_size = 40 sns.set_style('white') #sns.set_style('darkgrid') plt.subplot(1,2,1) # plt.xlim(0,0.4) # plt.ylim(0,0.6) plt.xlim(xmin, xmax) plt.ylim(ymin, ymax) for x,y,p in zip(prok_maxent_ginis, prok_ginis, prok_patterns): plt.scatter(x,y,color=color_dict[p],marker=marker_dict[p],s=marker_size) plt.plot([0,1],[0,1],linestyle='--',color='black') print "prok maxent" print pearsonr(prok_maxent_ginis,prok_ginis) plt.xlabel("MaxEnt IGC",fontsize='large') plt.ylabel("Prokaryotic IGC",fontsize='large') plt.subplot(1,2,2) # plt.xlim(0,0.4) # plt.ylim(0,0.6) plt.xlim(xmin, xmax) plt.ylim(ymin, ymax) print "euk maxent:" print pearsonr(euk_maxent_ginis,euk_ginis) for x,y,p in zip(euk_maxent_ginis, euk_ginis, euk_patterns): plt.scatter(x,y,color=color_dict[p],marker=marker_dict[p],s=marker_size) plt.plot([0,1],[0,1],linestyle='--',color='black') plt.ylabel("Eukaroytic IGC",fontsize='large') plt.xlabel("MaxEnt IGC",fontsize='large') maybesave(filename)
def sigma_Ne_contour_plot(filename=None): sigmas = np.linspace(0,5,20) Nes = np.linspace(1,20,20) L = 10 n = 50 copies = 10*n trials = 100 motifss = [[[(sample_motif(sigma, Ne, L, copies, n)) for i in range(trials)] for sigma in sigmas] for Ne in tqdm(Nes)] occ_M = [[expected_occupancy(sigma, Ne, L, copies) for sigma in sigmas] for Ne in tqdm(Nes)] print "ic_M" ic_M = mmap(lambda ms:mean(map(motif_ic,ms)),motifss) print "gini_M" gini_M = mmap(lambda ms:mean(map(motif_gini,ms)),motifss) print "mi_M" mi_M = mmap(lambda ms:mean(map(total_motif_mi,ms)),tqdm(motifss)) plt.subplot(2,2,1) plt.contourf(sigmas,Nes,occ_M,cmap='jet') plt.colorbar() plt.subplot(2,2,2) plt.contourf(sigmas,Nes,ic_M,cmap='jet') plt.colorbar() plt.subplot(2,2,3) plt.contourf(sigmas,Nes,gini_M,cmap='jet') plt.colorbar() plt.subplot(2,2,4) plt.contourf(sigmas,Nes,mi_M,cmap='jet') plt.colorbar() maybesave(filename)
def interpret_biological_experiment2(results_dict, filename=None):
    """Box-plot sampled statistics per TF with the biological value overlaid.

    ``results_dict`` maps TF names to dicts that hold a 'bio' entry plus one
    entry per sampling method; each of those maps statistic names to values.
    The figure is a grid with one row per sampling method and one column per
    statistic (both sorted by name). Within a panel the TFs are ordered by
    the biological value of that panel's statistic; the sampled values are
    drawn with ``plt.boxplot`` and the biological values as red triangles.

    results_dict -- nested dict as described above; the first value is used
                    to discover the method and statistic names.
    filename     -- handed to ``maybesave`` unchanged.
    """
    first_entry = results_dict.values()[0]
    spoof_names = sorted([k for k in first_entry if not k == 'bio'])
    stat_names = sorted(first_entry['bio'])
    num_cols = len(stat_names)

    def order_tfs_by(stat_name):
        return sorted(results_dict.keys(),
                      key=lambda tf: results_dict[tf]["bio"][stat_name])

    for spoof_idx, spoof_name in enumerate(spoof_names):
        for stat_idx, stat_name in enumerate(stat_names):
            # Row-major panel index. The row stride was previously the
            # literal 3, which only lines up when there are exactly three
            # statistics.
            plt.subplot(len(spoof_names), num_cols,
                        spoof_idx * num_cols + stat_idx + 1)
            if spoof_idx == 0 and stat_idx == 0:
                plt.ylabel("MaxEnt Sampling")
            if spoof_idx == 1 and stat_idx == 0:
                plt.ylabel("Uniform Sampling")
            ordered_tfs = order_tfs_by(stat_name)
            sampled = [results_dict[tf][spoof_name][stat_name] for tf in ordered_tfs]
            observed = [results_dict[tf]["bio"][stat_name] for tf in ordered_tfs]
            plt.boxplot(sampled)
            # Boxes sit at x = 1..len(ordered_tfs); put the markers there too.
            plt.scatter(range(1, len(observed) + 1), observed,
                        marker='^', color='r')
            plt.title(stat_name)
            # Hide x ticks and tick labels. Booleans are used instead of the
            # 'off' strings, which newer matplotlib releases do not treat as
            # False.
            plt.tick_params(
                axis='x',           # changes apply to the x-axis
                which='both',       # both major and minor ticks are affected
                bottom=False,       # ticks along the bottom edge are off
                top=False,          # ticks along the top edge are off
                labelbottom=False)
    maybesave(filename)
def L_vs_sigma_plot(filename=None, with_bio=False):
    """Plot site length L against sigma with two reference curves.

    Draws ``crit_L`` over 1000 sigma values in [0.1, 10] ("Binding
    Transition") and a dashed horizontal line at ``log(G, 2) / 2`` ("Info
    Theory Threshold"), where ``G`` is read from module scope. With
    ``with_bio`` set, motifs loaded through ``extract_motif_object_from_tfdf``
    are added as points at (mean ``sd`` over the ``make_pssm`` output, length
    of the first site).

    filename -- handed to ``maybesave`` unchanged.
    with_bio -- when true, overlay the biological motifs.
    """
    min_sigma = 0.1
    max_sigma = 10
    plt.xlim(0, max_sigma)
    plt.ylim(0, 60)
    plt.plot(*pl(crit_L, np.linspace(min_sigma, max_sigma, 1000)),
             label="Binding Transition")
    threshold = log(G, 2) / 2
    plt.plot([min_sigma, max_sigma], [threshold, threshold],
             linestyle='--', label="Info Theory Threshold")
    if with_bio:
        tfdf = extract_motif_object_from_tfdf()
        motifs = [getattr(tfdf, tf) for tf in tfdf.tfs]
        # Only lengths and sigmas are plotted. Site counts, IC, IC density,
        # Gini and MI density used to be computed here as well but were never
        # read, so that work has been dropped.
        Ls = [len(motif[0]) for motif in motifs]
        sigmas = [mean(map(sd, make_pssm(motif))) for motif in motifs]
        plt.scatter(sigmas, Ls, label="Biological Motifs")
    plt.xlabel("sigma")
    plt.ylabel("L")
    plt.legend()
    maybesave(filename)
def sigma_Ne_contour_plot(filename=None): sigmas = np.linspace(0, 5, 20) Nes = np.linspace(1, 20, 20) L = 10 n = 50 copies = 10 * n trials = 100 motifss = [[[(sample_motif(sigma, Ne, L, copies, n)) for i in range(trials)] for sigma in sigmas] for Ne in tqdm(Nes)] occ_M = [[expected_occupancy(sigma, Ne, L, copies) for sigma in sigmas] for Ne in tqdm(Nes)] print "ic_M" ic_M = mmap(lambda ms: mean(map(motif_ic, ms)), motifss) print "gini_M" gini_M = mmap(lambda ms: mean(map(motif_gini, ms)), motifss) print "mi_M" mi_M = mmap(lambda ms: mean(map(total_motif_mi, ms)), tqdm(motifss)) plt.subplot(2, 2, 1) plt.contourf(sigmas, Nes, occ_M, cmap='jet') plt.colorbar() plt.subplot(2, 2, 2) plt.contourf(sigmas, Nes, ic_M, cmap='jet') plt.colorbar() plt.subplot(2, 2, 3) plt.contourf(sigmas, Nes, gini_M, cmap='jet') plt.colorbar() plt.subplot(2, 2, 4) plt.contourf(sigmas, Nes, mi_M, cmap='jet') plt.colorbar() maybesave(filename)
def gini_vs_mi_comparison(filename=None):
    """Scatter Gini coefficient against per-pair MI for two motif sets.

    The left panel uses the module-level ``bio_motifs`` and the right panel
    the module-level ``jaspar_motifs``. For each set ``motif_gini``,
    ``total_motif_mi`` and ``motif_mi_pp`` are evaluated; only the Gini and
    ``motif_mi_pp`` values are plotted.

    filename -- handed to ``maybesave`` unchanged.
    """
    sys.path.append("/home/pat/jaspar")
    from parse_jaspar import euk_motifs
    # Cap every imported motif at 200 sites.
    # NOTE(review): this capped list is never read again in this function;
    # the eukaryotic panel is built from ``jaspar_motifs`` instead. Confirm
    # which collection was intended.
    capped = []
    for motif in euk_motifs:
        if len(motif) <= 200:
            capped.append(motif)
        else:
            capped.append(sample(200,motif,replace=False))
    euk_motifs = capped

    def stats_for(motifs):
        ginis = map(motif_gini,motifs)
        # NOTE(review): the total-MI list is computed but not plotted.
        mis = map(total_motif_mi,tqdm(motifs))
        mipps = map(motif_mi_pp,tqdm(motifs))
        return ginis, mis, mipps

    prok_ginis, prok_mis, prok_mipps = stats_for(bio_motifs)
    eu_ginis, eu_mis, eu_mipps = stats_for(jaspar_motifs)

    plt.subplot(1,2,1)
    plt.scatter(prok_ginis,prok_mipps)
    plt.xlim(-.1,.7)
    plt.ylim(-0.1,0.7)
    plt.xlabel("Gini Coefficient")
    plt.ylabel("MI (bits / column pair)")
    plt.title("Prokaryotic Motifs")

    # Only the left panel carries a y label.
    plt.subplot(1,2,2)
    plt.scatter(eu_ginis,eu_mipps)
    plt.xlim(-.1,.7)
    plt.ylim(-0.1,0.7)
    plt.xlabel("Gini Coefficient")
    plt.title("Eukaryotic Motifs")

    plt.suptitle("Mutual Information vs Gini Coefficient")
    maybesave(filename)
def make_plot(filename=None): trials_per_iteration = 3 iterations = [10**i for i in [0,1,2,3,4]] n = 50 L = 10 desired_ic = 10 tv = 0.01 correction_per_col = 3/(2*log(2)*n) desired_ic_for_beta = desired_ic + L * correction_per_col beta = find_beta_for_mean_motif_ic(n,L,desired_ic_for_beta) epsilon = 0.1 alpha = exp(-2*beta*epsilon) opt_iterations = int(ceil(log(tv)/log(1-alpha))) opt_epsilon = 1/(2*beta) print "optimum iterations:", opt_iterations print "optimum epsilon:", opt_epsilon results = {} for iteration in iterations: print "starting on:", iteration motifs = [uniform_motif_with_ic_imh(n,L,desired_ic,epsilon=epsilon,beta=beta,iterations=iteration)[-1] for trial in trange(trials_per_iteration)] ics = map(motif_ic, motifs) results[iteration] = ics opt_ics = [uniform_motif_imh_tv(n,L,desired_ic,beta=beta,epsilon=epsilon) for trial in range(trials_per_iteration)] icss = [results[iteration] for iteration in iterations] plt.boxplot(icss + [opt_ics]) maybesave(filename)
def bio_detector_experiment_prok_euk(filename=None, pickle_filename=None):
    """Draw a ROC curve from Gini percentiles of real and spoofed motifs.

    Uses data from prok_euk_ic_gini_experiment; Figure 4 in Gini Paper.

    With ``pickle_filename`` left as None the percentiles are computed: spoof
    sets are loaded from "prok_euk_ic_gini_experiment.pkl", each biological
    motif's Gini coefficient is passed to ``percentile`` together with the
    Gini coefficients of its MaxEnt spoofs (positives), and the first spoof
    of each set is scored the same way against its own set (negative
    controls). The four percentile lists are then pickled to
    "bio_detector_experiment_prok_euk.pkl". Otherwise the four lists are
    loaded from ``pickle_filename``.

    filename        -- handed to ``maybesave`` unchanged.
    pickle_filename -- path of a pickle holding
                       (prok_ps, euk_ps, prok_neg_ps, euk_neg_ps), or None to
                       recompute them.
    """
    if pickle_filename is None:
        prok_motifs = bio_motifs
        # Cap each eukaryotic motif at 200 sites. The result needs its own
        # name: assigning to ``euk_motifs`` here made that name local to the
        # function, so reading the module-level list raised
        # UnboundLocalError.
        euk_downmotifs = [motif if len(motif) <= 200
                          else sample(200, motif, replace=False)
                          for motif in euk_motifs]
        with open("prok_euk_ic_gini_experiment.pkl") as f:
            (prok_maxents, prok_uniforms,
             euk_maxents, euk_uniforms) = cPickle.load(f)

        def positive_ps(motifs, spoof_sets):
            # Percentile of each biological Gini within its spoofs' Ginis.
            return [percentile(motif_gini(motif), map(motif_gini, spoofs))
                    for motif, spoofs in zip(motifs, spoof_sets)]

        def negative_ps(spoof_sets):
            # Same score for the first spoof of each set, as a control.
            return [percentile(motif_gini(spoofs[0]), map(motif_gini, spoofs))
                    for spoofs in spoof_sets]

        prok_ps = positive_ps(prok_motifs, prok_maxents)
        prok_neg_ps = negative_ps(prok_maxents)
        euk_ps = positive_ps(euk_downmotifs, euk_maxents)
        euk_neg_ps = negative_ps(euk_maxents)
        with open("bio_detector_experiment_prok_euk.pkl", 'w') as f:
            cPickle.dump((prok_ps, euk_ps, prok_neg_ps, euk_neg_ps), f)
    else:
        with open(pickle_filename) as f:
            (prok_ps, euk_ps, prok_neg_ps, euk_neg_ps) = cPickle.load(f)
    sns.set_style('white')
    sns.set_palette(sns.cubehelix_palette(3))
    roc_curve(prok_ps + euk_ps, prok_neg_ps + euk_neg_ps, color='black')
    plt.xlabel("FPR", fontsize='large')
    plt.ylabel("TPR", fontsize='large')
    maybesave(filename)
def on_off_experiment2(num_motifs=100,filename="gini-vs-mi-correlation-in-on-off-spoofs.pdf"):
    """compare MI vs Gini on biological_motifs

    For every motif exposed by the module-level ``tfdf``, ``num_motifs``
    spoofs are generated with ``spoof_on_off_motif`` and the Pearson
    correlation between their Gini coefficients and total MI values is
    computed. The figure plots each correlation coefficient against its
    p-value on a log y axis, with a dashed line at the level returned by
    ``fdr``.

    num_motifs -- number of spoofs requested per biological motif.
    filename   -- handed to ``maybesave`` unchanged.
    """
    bio_motifs = [getattr(tfdf,tf) for tf in tfdf.tfs]
    spoofses = [spoof_on_off_motif(motif,num_motifs=num_motifs,trials=1)
                for motif in bio_motifs]
    spoof_ginises = mmap(motif_gini,tqdm(spoofses))
    spoof_mises = mmap(total_motif_mi,tqdm(spoofses))
    cors, ps = [],[]
    # Iterate over the statistics computed just above; the loop used to read
    # ``ginises`` and ``mises``, names this function never defines.
    for ginis, mis in zip(spoof_ginises, spoof_mises):
        cor, p = pearsonr(ginis,mis)
        cors.append(cor)
        ps.append(p)
    q = fdr(ps)
    # plt.scatter takes no ``filename`` keyword; saving is left to maybesave.
    plt.scatter(cors,ps)
    plt.plot([-1,1],[q,q],linestyle='--',label="FDR-Adjusted Significance Level")
    plt.semilogy()
    plt.legend()
    plt.xlabel("Pearson Correlation Coefficient")
    plt.ylabel("P value")
    plt.xlim([-1,1])
    plt.ylim([10**-4,1+1])
    plt.title("Gini-MI Correlation Coefficient vs. P-value for On-Off Simulations from Prokaryotic Motifs")
    maybesave(filename)
def make_occupancy_figure():
    """Plot the occupancy matrix averaged over 1000 ``occ2`` draws.

    Calls ``occ_matrix`` with G = 5 * 10**6 and 100 sigma values in [0, 10],
    using an occupancy function that averages 1000 ``occ2`` evaluations, then
    shows the result with ``plot_matrix`` and hands the fixed name
    "basic_occupancy1000.png" to ``maybesave``.
    """
    def averaged_occupancy(*args):
        # Average repeated occ2 evaluations for the same arguments.
        return mean(occ2(*args) for _ in range(1000))

    sigma_values = np.linspace(0, 10, 100)
    occupancies = occ_matrix(G=5 * 10**6,
                             occ_f=averaged_occupancy,
                             sigmas=sigma_values)
    plot_matrix(occupancies, show=False)
    plt.title("Mean Occupancy of Random Gaussian Ensembles of PWMs")
    maybesave("basic_occupancy1000.png")
def plot_matrix_chain(mc, true_ll, filename=None):
    """Plot a chain's log-likelihood trace against a reference value.

    mc       -- sequence of chain states; element [1] of each state is
                plotted as its log-likelihood.
    true_ll  -- reference value drawn as a dashed horizontal line.
    filename -- handed to ``maybesave`` unchanged.
    """
    trace = [state[1] for state in mc]
    reference = [true_ll] * len(trace)
    plt.plot(trace)
    plt.plot(reference, label='True Log-likelihood', linestyle='--')
    plt.xlabel("Iteration")
    plt.ylabel("Log-likelihood")
    plt.legend(loc='lower right')
    maybesave(filename)
def graph_acceptance_ratio(filename):
    """Plot ``ncp`` against copy number for a random site population.

    Draws 5,000,000 values of exp(-gauss(0, 5)), normalizes them, and
    evaluates ``ncp(ps, i)`` for i = 0..10, passing each value through
    ``show``. The results are plotted on a log y axis.

    filename -- handed to ``maybesave`` unchanged (no default).
    """
    weights = [exp(-random.gauss(0,5)) for _ in xrange(5000000)]
    ps = normalize(weights)
    ars = []
    for copy_number in range(11):
        ars.append(show(ncp(ps, copy_number)))
    plt.plot(ars)
    plt.semilogy()
    plt.xlabel("Copy Number")
    plt.ylabel("Acceptance Ratio")
    plt.title("Acceptance Ratio vs. Copy number for 5*10^6 LN(0,5) sites")
    maybesave(filename)
def discrete_parallelogram_plot(filename=None):
    """Scatter IC against pairwise MI for MaxEnt motifs.

    Calls ``maxent_motifs_with_ic(200, 10, ic, 10)`` for 100 target IC values
    evenly spaced over [0.5, 19.5], concatenates the results, and plots
    ``motif_ic`` against ``total_motif_mi`` for every motif.

    filename -- handed to ``maybesave`` unchanged.
    """
    target_ics = np.linspace(0.5,19.5,100)
    batches = [maxent_motifs_with_ic(200,10,ic,10) for ic in tqdm(target_ics)]
    motifs = concat(batches)
    ics = map(motif_ic,motifs)
    mis = map(total_motif_mi,motifs)
    plt.scatter(ics,mis)
    plt.xlabel("IC (bits)")
    plt.ylabel("Pairwise MI (bits)")
    plt.title("IC vs Pairwise MI for MaxEnt Motifs")
    maybesave(filename)
def mi_pred_vs_obs_plot(filename=None):
    """Scatter predicted against observed MI density.

    Divides the module-level ``pred_mis`` and ``obs_mis`` element-wise by
    ``sizes``, plots one against the other with a dashed y = x line over
    [0, 0.3], and writes a fixed ``r^2 = 0.98`` annotation (a literal;
    nothing is computed here).

    filename -- handed to ``maybesave`` unchanged.
    """
    denominators = np.array(sizes)
    predicted_density = np.array(pred_mis) / denominators
    observed_density = np.array(obs_mis) / denominators
    plt.scatter(predicted_density, observed_density)
    plt.xlabel("Predicted MI Density (bits/comparison)")
    plt.ylabel("Observed MI Density (bits/comparison)")
    plt.plot([0, 0.3], [0, 0.3], linestyle='--')
    plt.text(0.05, 0.2, "r^2 = 0.98")
    maybesave(filename)
def viz_sample(sample, filename=None):
    """Visualize a sample trajectory.

    The top panel shows the transposed trajectory as an image and the bottom
    panel the ``hamiltonian`` of every element of ``sample`` in order.

    sample   -- sequence of states; also passed whole to ``transpose``.
    filename -- handed to ``maybesave`` unchanged.
    """
    # Top panel: states as an image.
    plt.subplot(211)
    image = transpose(sample)
    plt.imshow(image, interpolation='nearest', aspect='auto')
    plt.ylabel("Position")

    # Bottom panel: energy per iteration.
    plt.subplot(212)
    energies = map(hamiltonian, sample)
    plt.plot(energies)
    plt.ylabel("Energy")
    plt.xlabel("Iteration")
    maybesave(filename)
def mu_approx_fig(filename=None):
    """Plot exact and approximate chemical potential against copy number.

    Evaluates ``mu_from`` ("Exact") and ``approx_mu`` ("Approx") with
    sigma = 1 and L = 10 at 100 copy numbers evenly spaced over [1, 10**5],
    using the module-level ``G``, and draws both curves on a log x axis.

    filename -- handed to ``maybesave`` unchanged.
    """
    sigma = 1
    L = 10
    copy_range = np.linspace(1,10**5,100)

    def exact(copy_num):
        return mu_from(G,sigma,L,copy_num=copy_num)

    def approximate(copy_num):
        return approx_mu(G,sigma,L,copy_num=copy_num)

    plt.plot(*pl(exact,copy_range),label="Exact")
    plt.plot(*pl(approximate,copy_range),label="Approx")
    plt.xlabel("Copy number")
    # Raw string so the backslash reaches matplotlib's mathtext untouched.
    plt.ylabel(r"$\mu$")
    plt.semilogx()
    # 'ul' is not a location string matplotlib recognizes; spell it out.
    plt.legend(loc='upper left')
    plt.title("Exact vs. Approximate Chemical Potential")
    maybesave(filename)
def bio_detector_experiment(filename=None):
    """use high Gini to detect biological motifs

    Positives: each motif in the module-level ``bio_motifs`` is scored by
    passing its Gini coefficient and the Gini coefficients of 100 MaxEnt
    spoofs of it to ``percentile``. Negative controls: the first spoof of
    each motif is scored the same way against 100 fresh spoofs of itself.
    Both score lists are handed to ``roc_curve``.

    filename -- handed to ``maybesave`` unchanged.
    """
    def spoof_each(motifs):
        return [spoof_motifs_maxent(motif,num_motifs=100) for motif in tqdm(motifs)]

    # Positives: biological motifs against their own spoofs.
    bio_ginis = map(motif_gini, bio_motifs)
    maxent_spoofs = spoof_each(bio_motifs)
    maxent_ginis = mmap(motif_gini, maxent_spoofs)
    ps = zipWith(percentile,bio_ginis, maxent_ginis)

    # Negative controls: one spoof per motif against spoofs of that spoof.
    neg_controls = map(first, maxent_spoofs)
    neg_control_spoofs = spoof_each(neg_controls)
    neg_control_ginis = map(motif_gini,neg_controls)
    neg_control_spoof_ginis = mmap(motif_gini, neg_control_spoofs)
    nc_ps = zipWith(percentile,neg_control_ginis, neg_control_spoof_ginis)

    roc_curve(ps, nc_ps)
    plt.xlabel("FPR",fontsize='large')
    plt.ylabel("TPR",fontsize='large')
    maybesave(filename)
def plot_matrices(*args, **kwargs): print kwargs n = len(args) for i, arg in enumerate(args): plt.subplot(1, n, i + 1) if "labels" in kwargs: plt.title(kwargs["labels"][i]) plt.imshow(arg, interpolation='none') #plot_matrix(arg,show=False,xlabel=None,ylabel=None) plt.xlabel("Binding Site Length") plt.ylabel("Standard Deviation of Weight Matrix") plt.colorbar(label='Occupancy') fname = kwargs.get('fname', None) maybesave(fname)
def make_sigma_infty_asymptote_figure():
    """Plot mean occupancy against site length for sigma = 100.

    For lengths 1..19 the mean of 100 ``occ2(sigma, L, G=5 * 10**6)`` draws
    is plotted, together with a dashed vertical line at 11.12 ("Predicted
    Critical Length") and a dashed horizontal line at 0.5. The fixed name
    "sigma_infty_asymptote.png" is handed to ``maybesave``.
    """
    lengths = range(1, 20)
    sigma = 100

    def mean_occupancy(L):
        return mean(occ2(sigma, L, G=5 * 10**6) for _ in range(100))

    plt.plot(*pl(mean_occupancy, lengths), label='Occupancy')
    plt.ylabel("Occupancy")
    plt.xlabel("Length")
    plt.plot([11.12, 11.12], [0, 1], linestyle='--',
             label='Predicted Critical Length')
    half_line = [0.5 for _ in lengths]
    plt.plot(lengths, half_line, linestyle='--', label="occ = 1/2")
    plt.legend(loc='upper left')
    plt.title("Mean Occupancy for sigma = 100")
    maybesave("sigma_infty_asymptote.png")
def plot_results_dict_gini_vs_ic(results_dict, filename=None):
    """Scatter IC against Gini for biological and sampled motifs.

    Each key of ``results_dict`` is split on "_" into three fields; the first
    two form the genome name and the third the TF name passed to
    ``extract_tfdf_sites``. Per key three points are drawn: the biological
    motif (blue), the mean of the 'maxent' samples (green) and the mean of
    the 'uniform' samples (red).

    results_dict -- mapping from "<g1>_<g2>_<tf>" keys to dicts with 'maxent'
                    and 'uniform' sub-dicts holding 'motif_ic' and
                    'motif_gini' entries.
    filename     -- handed to ``maybesave`` unchanged.
    """
    for position, key in enumerate(results_dict):
        g1, g2, tf = key.split("_")
        bio_motif = extract_tfdf_sites(g1 + "_" + g2, tf)
        bio_ic = motif_ic(bio_motif)
        bio_gini = motif_gini(bio_motif)
        sampled = results_dict[key]
        maxent = sampled['maxent']
        uniform = sampled['uniform']
        # Only the first key carries non-empty labels, so each series gets a
        # single legend entry.
        is_first = (position == 0)
        plt.scatter(bio_ic, bio_gini, color='b',
                    label="Bio" if is_first else "")
        plt.scatter(mean(maxent['motif_ic']), mean(maxent['motif_gini']),
                    color='g', label='ME' if is_first else "")
        plt.scatter(mean(uniform['motif_ic']), mean(uniform['motif_gini']),
                    color='r', label="TURS" if is_first else "")
    plt.xlabel("IC (bits)")
    plt.ylabel("Gini Coefficient")
    plt.legend()
    maybesave(filename)
def interpret_chain(chain, motif, filename=None):
    """Plot several per-iteration diagnostics of a parameter chain.

    Every series is passed through ``logmod`` before plotting. The series
    are: mean ``score_seq`` of the motif's sites under element [0] of each
    state, elements [1] and [2] of each state, ``log_fhat``, ``log_ZM_hack``,
    their difference, and the mean of ``occs``.

    chain    -- sequence of states; each is indexed at [0], [1] and [2] and
                also passed whole to ``log_fhat``, ``log_ZM_hack`` and
                ``occs``.
    motif    -- collection of sites; its length is passed to ``log_ZM_hack``.
    filename -- handed to ``maybesave`` unchanged.
    """
    site_count = len(motif)
    log_fhats = [log_fhat(state,motif) for state in chain]
    log_Zs = [log_ZM_hack(state,site_count) for state in chain]
    log_ps = [numerator - log_Z for (numerator, log_Z) in zip(log_fhats, log_Zs)]

    def plot_series(values, label):
        plt.plot(map(logmod, values), label=label)

    plot_series([mean(score_seq(state[0],site) for site in motif) for state in chain],
                "Mean Site Energy (kBT)")
    plot_series([state[1] for state in chain], "$\mu$ (kBT)")
    plot_series([state[2] for state in chain], "$Ne$")
    plot_series(log_fhats, "log fhat")
    plot_series(log_Zs, "log_ZM")
    plot_series(log_ps, "log p")
    plot_series([mean(occs(state, motif)) for state in chain], "Mean Occupancy")
    plt.legend(loc='right',fontsize='large')
    plt.xlabel("Iteration",fontsize='large')
    maybesave(filename)
def mu_approx_fig(filename=None):
    """Compare ``mu_from`` with ``approx_mu`` across copy numbers.

    Both functions are evaluated with sigma = 1, L = 10 and the module-level
    ``G`` at 100 copy numbers evenly spaced over [1, 10**5]; the two curves
    are labelled "Exact" and "Approx" and drawn on a log x axis.

    filename -- handed to ``maybesave`` unchanged.
    """
    sigma = 1
    L = 10
    copy_range = np.linspace(1, 10**5, 100)
    curves = [
        ("Exact", lambda copy_num: mu_from(G, sigma, L, copy_num=copy_num)),
        ("Approx", lambda copy_num: approx_mu(G, sigma, L, copy_num=copy_num)),
    ]
    for label, mu_of_copy_num in curves:
        plt.plot(*pl(mu_of_copy_num, copy_range), label=label)
    plt.xlabel("Copy number")
    # Raw string keeps the backslash literal for mathtext.
    plt.ylabel(r"$\mu$")
    plt.semilogx()
    # Legend locations must be spelled out; 'ul' is not an accepted value.
    plt.legend(loc='upper left')
    plt.title("Exact vs. Approximate Chemical Potential")
    maybesave(filename)
def rfreq_rseq_experiment(obj, filename="rfreq_vs_rseq_in_sefas_collection.png"):
    """Scatter log2(G / n) against motif information content.

    For every name in ``obj.tfs`` the motif is fetched with ``getattr`` and
    plotted at (log2 of G over its number of sites, ``motif_ic``), with
    G = 5.0 * 10**6. A dashed diagonal from (0, 0) to (20, 20) is labelled
    'Theory' and a dashed vertical line at log2(G / 500) 'Maximum Plausible
    Regulon Size'.

    obj      -- object with a ``tfs`` sequence of attribute names, each
                naming a motif attribute.
    filename -- handed to ``maybesave`` unchanged.
    """
    G = 5.0*10**6
    min_rfreq = log2(G/500)
    motifs = [getattr(obj,tf) for tf in obj.tfs]
    Rfreqs = [log(G/len(motif),2) for motif in motifs]
    Rseqs = [motif_ic(motif) for motif in motifs]
    plt.scatter(Rfreqs,Rseqs)
    plt.xlabel("log(G/n) (bits)")
    plt.ylabel("Motif Information Content (bits)")
    plt.plot([0,20],[0,20],linestyle='--',label='Theory')
    plt.plot([min_rfreq,min_rfreq],[0,30],linestyle='--',
             label='Maximum Plausible Regulon Size')
    plt.title("Motif Information Content vs. Search Difficulty")
    plt.legend(loc='upper left')
    maybesave(filename)
def on_off_experiment2(num_motifs=100,
                       filename="gini-vs-mi-correlation-in-on-off-spoofs.pdf"):
    """compare MI vs Gini on biological_motifs

    Generates ``num_motifs`` on-off spoofs per motif found on the
    module-level ``tfdf``, correlates the spoofs' Gini coefficients with
    their total MI using ``pearsonr``, and plots correlation coefficient
    against p-value (log y axis). A dashed horizontal line marks the level
    returned by ``fdr`` for the collected p-values.

    num_motifs -- number of spoofs requested per biological motif.
    filename   -- handed to ``maybesave`` unchanged.
    """
    bio_motifs = [getattr(tfdf, tf) for tf in tfdf.tfs]
    spoofses = [
        spoof_on_off_motif(motif, num_motifs=num_motifs, trials=1)
        for motif in bio_motifs
    ]
    spoof_ginises = mmap(motif_gini, tqdm(spoofses))
    spoof_mises = mmap(total_motif_mi, tqdm(spoofses))

    # Correlate per motif. This loop previously iterated over ``ginises`` and
    # ``mises``, which are not defined in this function; the lists computed
    # above are the ones to pair up.
    cors, ps = [], []
    for ginis, mis in zip(spoof_ginises, spoof_mises):
        cor, p = pearsonr(ginis, mis)
        cors.append(cor)
        ps.append(p)
    q = fdr(ps)

    # No ``filename`` keyword here: plt.scatter does not accept one, and
    # maybesave below receives the filename instead.
    plt.scatter(cors, ps)
    plt.plot([-1, 1], [q, q], linestyle='--',
             label="FDR-Adjusted Significance Level")
    plt.semilogy()
    plt.legend()
    plt.xlabel("Pearson Correlation Coefficient")
    plt.ylabel("P value")
    plt.xlim([-1, 1])
    plt.ylim([10**-4, 1 + 1])
    plt.title(
        "Gini-MI Correlation Coefficient vs. P-value for On-Off Simulations from Prokaryotic Motifs"
    )
    maybesave(filename)
def controlling_for_gc_experiment():
    """Compare mean replicate IGC with and without the GC-controlled spoofer.

    Generates 1000 spoofs per motif with ``spoof_maxent_motifs`` and with
    ``spoof_maxent_motifs_gc`` for the module-level ``prok_motifs`` and for
    ``euk_motifs`` after ``downsample(200, ...)``, pickles the four spoof
    collections to the working directory, and scatters the mean Gini of the
    plain spoofs against the mean Gini of the GC-controlled spoofs in one
    panel per domain. The fixed name "control-gc.eps" is handed to
    ``maybesave``.
    """
    euk_downmotifs = [downsample(200, motif) for motif in euk_motifs]

    def spoof_all(spoofer, motifs):
        return [spoofer(motif, 1000) for motif in tqdm(motifs)]

    prok_spoofses = spoof_all(spoof_maxent_motifs, prok_motifs)
    euk_spoofses = spoof_all(spoof_maxent_motifs, euk_downmotifs)
    prok_spoofses_gc = spoof_all(spoof_maxent_motifs_gc, prok_motifs)
    euk_spoofses_gc = spoof_all(spoof_maxent_motifs_gc, euk_downmotifs)

    # Persist every spoof collection before analysing it.
    outputs = [
        ("prok_spoofses.pkl", prok_spoofses),
        ("euk_spoofses.pkl", euk_spoofses),
        ("prok_spoofses_gc.pkl", prok_spoofses_gc),
        ("euk_spoofses_gc.pkl", euk_spoofses_gc),
    ]
    for path, payload in outputs:
        with open(path,'w') as f:
            cPickle.dump(payload, f)

    # NOTE(review): these two lists are computed but never plotted below.
    prok_ginis = map(motif_gini, prok_motifs)
    euk_ginis = map(motif_gini, euk_downmotifs)

    def mean_ginis(spoofses):
        return [mean(map(motif_gini, spoofs)) for spoofs in tqdm(spoofses)]

    prok_spoof_ginis = mean_ginis(prok_spoofses)
    euk_spoof_ginis = mean_ginis(euk_spoofses)
    prok_spoof_gc_ginis = mean_ginis(prok_spoofses_gc)
    euk_spoof_gc_ginis = mean_ginis(euk_spoofses_gc)

    sns.set_style('white')
    palette = sns.cubehelix_palette(3)
    sns.set_palette(palette)

    def draw_panel(position, xs, ys, title, **scatter_extras):
        plt.subplot(1,2,position)
        plt.plot([0,0.5], [0,0.5], linestyle='--', color='black')
        plt.scatter(xs, ys, color=palette[1], edgecolor='black', **scatter_extras)
        plt.xlim(0, 0.5)
        plt.ylim(0, 0.5)
        plt.xlabel("Mean Replicate IGC")
        plt.ylabel("Mean %GC-controlled Replicate IGC")
        plt.title(title)

    # Only the prokaryotic scatter carries a legend label.
    draw_panel(1, prok_spoof_ginis, prok_spoof_gc_ginis, "Prokaryotic Motifs",
               label='Prokaryotic Motifs')
    draw_panel(2, euk_spoof_ginis, euk_spoof_gc_ginis, "Eukaryotic Motifs")
    maybesave("control-gc.eps")
def interpret_chain(chain, motif, filename=None):
    """Overlay ``logmod``-transformed diagnostics for each chain iteration.

    Plotted series, in order: mean site score (``score_seq`` with element
    [0] of the state), element [1], element [2], ``log_fhat``,
    ``log_ZM_hack``, the difference of the previous two, and mean ``occs``.

    chain    -- sequence of states, indexed at [0], [1], [2] and also passed
                whole to ``log_fhat``, ``log_ZM_hack`` and ``occs``.
    motif    -- collection of sites; ``len(motif)`` goes to ``log_ZM_hack``.
    filename -- handed to ``maybesave`` unchanged.
    """
    N = len(motif)
    log_fhats = []
    log_Zs = []
    for theta in chain:
        log_fhats.append(log_fhat(theta, motif))
    for theta in chain:
        log_Zs.append(log_ZM_hack(theta, N))
    log_ps = [a - b for (a, b) in zip(log_fhats, log_Zs)]

    mean_energies = [mean(score_seq(x[0], site) for site in motif)
                     for x in chain]
    plt.plot(map(logmod, mean_energies), label="Mean Site Energy (kBT)")

    mus = [x[1] for x in chain]
    plt.plot(map(logmod, mus), label="$\mu$ (kBT)")

    nes = [x[2] for x in chain]
    plt.plot(map(logmod, nes), label="$Ne$")

    plt.plot(map(logmod, log_fhats), label="log fhat")
    plt.plot(map(logmod, log_Zs), label="log_ZM")
    plt.plot(map(logmod, log_ps), label="log p")

    mean_occupancies = [mean(occs(x, motif)) for x in chain]
    plt.plot(map(logmod, mean_occupancies), label="Mean Occupancy")

    plt.legend(loc='right', fontsize='large')
    plt.xlabel("Iteration", fontsize='large')
    maybesave(filename)
def make_sigma_0_figure(sigma=0.1, fname="sigma_0.png"): G = 5 * 10**6 def critical_L(sigma): return log(G) / (sigma * (1 - sigma / 2.0)) Lstar = critical_L(sigma) print "Lstar:", Lstar Ls = range(1, int(2 * Lstar)) plt.plot(*pl( lambda L: mean(occ2(sigma, L, G=5 * 10**6) for i in range(100)), Ls), label='Occupancy') plt.ylabel("Occupancy") plt.xlabel("Length") plt.plot([Lstar, Lstar], [0, 1], linestyle='--', label='Predicted Critical Length') plt.plot(Ls, [0.5] * len(Ls), linestyle='--', label="occ = 1/2") plt.legend(loc='upper left') plt.title("Mean Occupancy for sigma = %s" % sigma) maybesave(fname)
def illustrate_rho_partitioning(filename=None, Ne=2): n = 10 L = 10 sigma = 1 nu = Ne - 1 rhos = range(n * L + 1) log_fs = [predict_log_f(rho, n, L, sigma=sigma) for rho in rhos] log_ws = [log_w(n, L, rho) for rho in rhos] mean_ics = [mean_ic_from_rho(rho, n, L) for rho in rhos] plt.plot(map(exp, log_fs), label="Mean Fitness") plt.plot(map(exp, log_ws), label="Degeneracy") plt.plot(mean_ics, label="Mean IC") ps = normalize(map(exp, [nu * lf + lw for lf, lw in zip(log_fs, log_ws)])) integrand = [ic * p for ic, p in zip(mean_ics, ps)] print sum(ps) plt.plot(ps, label="Probability") plt.plot(integrand, label="Integrand") plt.semilogy() plt.xlabel("Rho (Mutational distance from optimal genotype)") plt.legend() maybesave(filename)
def interpret_results_dict(results_dict, filename=None, annotate=False): ic_in_range = 0 ic_lower = 0 ic_upper = 0 fnames = "motif_ic motif_gini total_motif_mi".split() rel_tfs = [tf for tf in tfdf.tfs if motif_ic(getattr(tfdf, tf)) > 5] for tf in rel_tfs: motif = getattr(tfdf, tf) print tf for fname_idx, fname in enumerate(fnames): f = eval(fname) bio_stat = f(motif) lb, ub = mean_ci(results_dict[tf][fname]) in_range = (lb <= bio_stat <= ub) if fname == 'motif_ic': ic_in_range += (lb < bio_stat < ub) ic_lower += (bio_stat < lb) ic_upper += (ub < bio_stat) print fname, bio_stat, "(%1.2f, %1.2f)" % (lb, ub), in_range print "motif ic in range:", ic_in_range / float(len(rel_tfs)) print "motif ic lower:", ic_lower / float(len(rel_tfs)) print "motif ic higher:", ic_upper / float(len(rel_tfs)) for fname_idx, fname in enumerate(fnames): f = eval(fname) plt.subplot(1, len(fnames), fname_idx + 1) plt.title(fname) bio_stats = [f(getattr(tfdf, tf)) for tf in rel_tfs] sim_stats = [mean(results_dict[tf][fname]) for tf in rel_tfs] pred_obs(zip(bio_stats, sim_stats), show=False) if annotate: for s, xy in zip(rel_tfs, zip(bio_stats, sim_stats)): plt.annotate(s=s, xy=xy) plt.xlabel("Biological Value") plt.ylabel("Simulated Value") r, p = pearsonr(bio_stats, sim_stats) print fname, r, r**2, p plt.tight_layout() maybesave(filename)
def occ_matrix_analysis(n=10, occ_matrices=None, filename=None):
    """Show one occupancy heat map per Ne value on a shared grid of axes.

    When ``occ_matrices`` is None it is built with ``predict_stat`` over 25
    Ne values in [1, 5], 50 sigma values in [0, 20] and lengths 1..29, using
    ``mean_occ_from_rho`` as the statistic. Each matrix is transposed,
    flipped vertically and drawn with a colour scale fixed to [0, 1]; a
    single colour bar is attached to all axes.

    n            -- passed to ``predict_stat`` and ``mean_occ_from_rho``.
    occ_matrices -- precomputed matrices indexed [Ne][sigma][L], or None.
    filename     -- handed to ``maybesave`` unchanged.

    Returns the (possibly newly computed) ``occ_matrices``.
    """
    Ls = range(1, 30)
    sigmas = np.linspace(0, 20, 50)
    Nes = np.linspace(1, 5, 25)
    num_plots = len(Nes)
    # Smallest square grid that holds one panel per Ne value.
    rc = int(ceil(sqrt(num_plots)))

    if occ_matrices is None:
        occ_matrices = []
        for Ne in tqdm(Nes):
            per_sigma = []
            for sigma in sigmas:
                per_length = []
                for L in Ls:
                    per_length.append(predict_stat(
                        n, L, sigma=sigma, Ne=Ne,
                        T=lambda rho: mean_occ_from_rho(rho, n, L, sigma=sigma)))
                per_sigma.append(per_length)
            occ_matrices.append(per_sigma)

    fig, axes = plt.subplots(nrows=rc, ncols=rc, sharex=True, sharey=True)
    for idx, ax in zip(range(num_plots), axes.flat):
        panel = np.matrix(occ_matrices[idx]).transpose()[::-1]
        im = ax.imshow(panel, interpolation='none', aspect='auto',
                       vmin=0, vmax=1)
        ax.axis('off')
    # The last image supplies the mappable; every panel uses the same
    # vmin/vmax, so one colour bar serves them all.
    fig.colorbar(im, ax=axes.ravel().tolist())
    maybesave(filename)
    return occ_matrices
def prokaryotic_gini_comparison(filename=None):
    """spoof prokaryotic motifs using maxent, uniform and GLE evosims,
    showing gini is higher in GLE than in maxent, uniform

    For every motif in the module-level ``bio_motifs``, spoofs are generated
    with each sampler and the mean ``motif_gini`` per motif is computed for
    the MaxEnt, uniform and GLE spoofs. The left panel scatters MaxEnt
    against GLE and the right panel uniform against GLE.

    filename -- handed to ``maybesave`` unchanged.
    """
    maxent_spoofs = [spoof_motifs_maxent(motif,10,verbose=True)
                     for motif in tqdm(bio_motifs,desc='bio_motifs')]
    uniform_spoofs = [spoof_motifs_uniform(motif,10,verbose=True)
                      for motif in tqdm(bio_motifs,desc='bio_motifs')]
    # NOTE(review): the on-off spoofs are generated but never used below;
    # confirm whether a third comparison panel was intended.
    oo_spoofs = [spoof_motifs_oo(motif,10)
                 for motif in tqdm(bio_motifs,desc='bio_motifs')]
    gle_spoofs = [concat([spoof_motif_gle(motif,10,verbose=True) for i in range(1)])
                  for motif in tqdm(bio_motifs,desc='bio_motifs')]
    maxent_ginis = [mean(map(motif_gini,spoofs)) for spoofs in maxent_spoofs]
    uniform_ginis = [mean(map(motif_gini,spoofs)) for spoofs in uniform_spoofs]
    gle_ginis = [mean(map(motif_gini,spoofs)) for spoofs in gle_spoofs]
    plt.subplot(1,2,1)
    # Call scatter through plt, as the rest of this file does, instead of
    # relying on a bare ``scatter`` name being in scope.
    plt.scatter(maxent_ginis,gle_ginis)
    plt.xlabel("MaxEnt")
    plt.ylabel("GLE")
    plt.subplot(1,2,2)
    plt.xlabel("TU")
    plt.scatter(uniform_ginis,gle_ginis)
    plt.suptitle("Gini Coefficients for GLE Simulations vs. MaxEnt, TU Distributions")
    maybesave(filename)
def make_bin_col_plot(filename=None):
    """Draw two ``plot_bin_col`` curves, sorted (blue) and unsorted (green).

    Both calls use the same arguments, 10,
    ``interpolate(.01, 10, 100)`` and 100, and differ only in ``sort`` and
    ``color``.

    filename -- handed to ``maybesave`` unchanged.
    """
    variants = ((True, 'b'), (False, 'g'))
    for sort_flag, colour in variants:
        plot_bin_col(10, interpolate(.01,10,100), 100, sort=sort_flag, color=colour)
    maybesave(filename)
def main_experiment(generate_data=False):
    """Compare observed motif statistics with four spoofing methods.

    Draws a 4x4 grid saved as "spoof-statistics-rmsd.pdf".  Rows are the
    spoofing methods (MaxEnt, PSFM, APW, Bayes); columns are prokaryotic IC
    per column, eukaryotic IC per column, prokaryotic MI per column pair and
    eukaryotic MI per column pair.  Each panel scatters the mean statistic of
    the spoofs (x) against the observed statistic (y) and is annotated with
    r^2 and the RMSD between the two.

    Prokaryotic motifs come from the module-level ``prok_motifs``;
    eukaryotic motifs are subsampled from the module-level ``euk_motifs``.

    Parameters:
        generate_data: when true, regenerate every spoof set and write each
            one to "<name>.pkl" in the working directory; otherwise load the
            sets from those files.

    Changes relative to the previous copy-pasted version:
      * observed eukaryotic IC per column is computed from the subsampled
        motifs that the spoofs were derived from, not from ``euk_motifs``;
      * axis limits are set before the scatter call on every panel;
      * the bottom "Euk IC" panel gets the same tick treatment as the bottom
        "Euk MI" panel instead of matplotlib's default ticks;
      * unused total-IC lists are no longer computed.

    NOTE: this file defines ``main_experiment`` a second time further down;
    at import time that later definition replaces this one.
    """
    spoof_keys = ["chains", "bayes_spoofs", "maxent_spoofs", "psfm_spoofs",
                  "apw_spoofs"]
    pickled = (["prok_" + key for key in spoof_keys] + ["euk_submotifs"] +
               ["euk_" + key for key in spoof_keys])

    def build_spoofs(motifs):
        """Return the posterior chains and every kind of spoof for motifs."""
        iterations = 10000
        chains = [posterior_chain2(motif, iterations=iterations)
                  for motif in tqdm(motifs)]
        # Start halfway through the requested iterations, then thin by 500.
        bayes_spoofs = [[motif_from_theta(theta, len(motif))
                         for theta in tqdm(chain[iterations // 2::500])]
                        for chain, motif in tqdm(zip(chains, motifs))]
        psfms = [psfm_from_motif(motif, pc=1 / 4.0) for motif in motifs]
        psfm_spoofs = [[[sample_from_psfm(psfm) for _ in range(len(motif))]
                        for _ in range(10)]
                       for psfm, motif in zip(psfms, motifs)]
        maxent_spoofs = [spoof_maxent_motifs(motif, 10)
                         for motif in tqdm(motifs)]
        apws = [code_from_motif(motif, pc=1 / 16.0) for motif in tqdm(motifs)]
        apw_spoofs = [[[sample_site(apw) for _ in range(len(motif))]
                       for _ in range(10)]
                      for apw, motif in tqdm(zip(apws, motifs))]
        return {"chains": chains,
                "bayes_spoofs": bayes_spoofs,
                "maxent_spoofs": maxent_spoofs,
                "psfm_spoofs": psfm_spoofs,
                "apw_spoofs": apw_spoofs}

    data = {}
    if generate_data:
        for key, value in build_spoofs(prok_motifs).items():
            data["prok_" + key] = value
        data["euk_submotifs"] = [subsample(motif) for motif in euk_motifs]
        for key, value in build_spoofs(data["euk_submotifs"]).items():
            data["euk_" + key] = value
        for name in pickled:
            with open(name + ".pkl", 'w') as f:
                cPickle.dump(data[name], f)
    else:
        # NOTE: unpickling can execute arbitrary code; only load files that
        # were written by a previous generate_data=True run.
        for name in pickled:
            with open(name + ".pkl") as f:
                data[name] = cPickle.load(f)

    # The eukaryotic spoofs were derived from the subsampled motifs, so the
    # observed statistics must be computed from those same subsamples.
    observed_motifs = {"prok": prok_motifs, "euk": data["euk_submotifs"]}

    #ic_min, ic_max, mi_min, mi_max = 4.5, 25, -0.1, 0.7
    ic_min, ic_max, mi_min, mi_max = -.1, 2.6, -0.05, 1
    #ic_xtext, ic_ytext, mi_xtext, mi_ytext = 5, 20, -0.05, 0.5
    ic_xtext, ic_ytext, mi_xtext, mi_ytext = -0.05, 2.2, -0.05, 0.85
    mi_xticks = [0, 0.25, 0.5, 0.75, 1]
    ic_yticks = [0, 0.5, 1, 1.5, 2]

    # (domain, x label, statistic, axis limits, text anchor, tick positions),
    # in left-to-right column order.
    columns = [
        ("prok", "Prok IC", motif_ic_per_col,
         (ic_min, ic_max), (ic_xtext, ic_ytext), ic_yticks),
        ("euk", "Euk IC", motif_ic_per_col,
         (ic_min, ic_max), (ic_xtext, ic_ytext), ic_yticks),
        ("prok", "Prok MI", mi_per_col,
         (mi_min, mi_max), (mi_xtext, mi_ytext), mi_xticks),
        ("euk", "Euk MI", mi_per_col,
         (mi_min, mi_max), (mi_xtext, mi_ytext), mi_xticks),
    ]
    # (spoof-set key, y label), in top-to-bottom row order.
    rows = [("maxent", "MaxEnt"), ("psfm", "PSFM"), ("apw", "APW"),
            ("bayes", "Bayes")]

    sns.set_style('dark')
    for col, (domain, col_label, stat, limits, text_at, ticks) in \
            enumerate(columns):
        observed = [stat(motif) for motif in tqdm(observed_motifs[domain])]
        for row, (method, row_label) in enumerate(rows):
            spoof_sets = data["%s_%s_spoofs" % (domain, method)]
            spoofed = [mean([stat(spoof) for spoof in spoofs])
                       for spoofs in tqdm(spoof_sets)]

            plt.subplot(4, 4, 4 * row + col + 1)
            is_bottom = row == len(rows) - 1
            # Only the bottom row carries x ticks and an x label.
            if is_bottom:
                plt.xticks(ticks, ticks)
                plt.xlabel(col_label, fontsize='large')
            else:
                plt.xticks([])
            # Eukaryotic panels sit right of a prokaryotic panel with the
            # same limits, so their y ticks are hidden.
            if domain == "prok":
                plt.yticks(ticks, ticks)
            else:
                plt.yticks([])
            if col == 0:
                plt.ylabel(row_label, fontsize='large')
            plt.xlim(*limits)
            plt.ylim(*limits)

            # Spoofed mean on x, observed value on y.
            r, p = scatter(spoofed, observed)
            rmsd = sqrt(mean([(x - y) ** 2
                              for x, y in zip(observed, spoofed)]))
            text_x, text_y = text_at
            plt.text(text_x, text_y, s='$r^2$ = %1.3f' % (r**2))
            plt.text(text_x, text_y * 0.8, s='$RMSD$ = %1.3f' % rmsd)

    plt.tight_layout()
    maybesave("spoof-statistics-rmsd.pdf")
def interpret_gle_evo_sim_spoofs(bio_motifs_, spoofs_, filename=None):
    """Scatter observed against simulated IC, Gini and pairwise MI.

    One point is drawn per simulated motif.  Points are coloured by the
    pattern that ``find_pattern`` reports for the corresponding biological
    motif (direct-repeat green, inverted-repeat blue, single-box red).

    Parameters:
        bio_motifs_: the biological motifs.
        spoofs_: ``spoofs_[i]`` holds all simulated motifs for
            ``bio_motifs_[i]``.  Every entry is assumed to hold as many
            motifs as ``spoofs_[0]``; an AssertionError is raised when the
            total count does not match.
        filename: forwarded to ``maybesave``.

    Observed statistics are computed once per biological motif and then
    repeated for each trial, rather than recomputed for every trial.
    """
    # assume that structure of spoofs is such that all spoofs for
    # bio_motifs[0] are contained in spoofs[0]
    trials_per_motif = len(spoofs_[0])
    sim_motifs = concat(spoofs_)

    def per_trial(values):
        """Repeat each per-motif value once for every simulated trial."""
        return [value for value in values for _ in range(trials_per_motif)]

    expected = len(bio_motifs_) * trials_per_motif
    print("%d %d" % (expected, len(sim_motifs)))
    assert expected == len(sim_motifs)

    bio_ics = per_trial([motif_ic(motif) for motif in bio_motifs_])
    sim_ics = [motif_ic(motif) for motif in sim_motifs]
    bio_ginis = per_trial([motif_gini(motif) for motif in bio_motifs_])
    sim_ginis = [motif_gini(motif) for motif in sim_motifs]

    print("finding mutual information")
    bio_lens = [len(motif[0]) for motif in bio_motifs_]
    bio_mis = per_trial([total_motif_mi(motif) / choose(l, 2)
                         for (l, motif) in tqdm(zip(bio_lens, bio_motifs_))])
    # Simulated MI is normalised by the number of column pairs of the
    # matching biological motif.
    sim_mis = [total_motif_mi(motif) / choose(l, 2)
               for (l, motif) in tqdm(zip(per_trial(bio_lens), sim_motifs))]

    print("finding motif structures")
    pattern_colors = {'direct-repeat': 'g', 'inverted-repeat': 'b',
                      'single-box': 'r'}
    colors = per_trial([pattern_colors[find_pattern(motif)[0]]
                        for motif in tqdm(bio_motifs_)])

    panels = [("Motif IC (bits)", bio_ics, sim_ics),
              ("Gini Coefficient", bio_ginis, sim_ginis),
              ("Pairwise MI per pair (bits)", bio_mis, sim_mis)]
    for index, (title, bio_values, sim_values) in enumerate(panels):
        plt.subplot(1, 3, index + 1)
        plt.title(title)
        scatter(bio_values, sim_values, color=colors, line_color='black')
        # Same limits on both axes.
        limits = find_limits(bio_values, sim_values)
        plt.xlim(*limits)
        plt.ylim(*limits)
        if index == 0:
            plt.ylabel("Simulated")
        elif index == 1:
            plt.xlabel("Observed")
    plt.legend()
    plt.tight_layout()
    maybesave(filename)
def main_experiment(generate_data=False):
    """Compare observed motif statistics with four spoofing methods.

    Draws a 4x4 grid saved as "spoof-statistics-rmsd.pdf".  Rows are the
    spoofing methods (MaxEnt, PSFM, APW, Bayes); columns are prokaryotic IC
    per column, eukaryotic IC per column, prokaryotic MI per column pair and
    eukaryotic MI per column pair.  Each panel scatters the mean statistic of
    the spoofs (x) against the observed statistic (y) and is annotated with
    r^2 and the RMSD between the two.

    Prokaryotic motifs come from the module-level ``prok_motifs``;
    eukaryotic motifs are subsampled from the module-level ``euk_motifs``.

    Parameters:
        generate_data: when true, regenerate every spoof set and write each
            one to "<name>.pkl" in the working directory; otherwise load the
            sets from those files.

    Changes relative to the previous copy-pasted version:
      * observed eukaryotic IC per column is computed from the subsampled
        motifs that the spoofs were derived from, not from ``euk_motifs``;
      * axis limits are set before the scatter call on every panel;
      * the bottom "Euk IC" panel gets the same tick treatment as the bottom
        "Euk MI" panel instead of matplotlib's default ticks;
      * unused total-IC lists are no longer computed.

    NOTE: an earlier definition of ``main_experiment`` appears above in this
    file; this later one is the definition that stays bound after import.
    """
    spoof_keys = ["chains", "bayes_spoofs", "maxent_spoofs", "psfm_spoofs",
                  "apw_spoofs"]
    pickled = (["prok_" + key for key in spoof_keys] + ["euk_submotifs"] +
               ["euk_" + key for key in spoof_keys])

    def build_spoofs(motifs):
        """Return the posterior chains and every kind of spoof for motifs."""
        iterations = 10000
        chains = [posterior_chain2(motif, iterations=iterations)
                  for motif in tqdm(motifs)]
        # Start halfway through the requested iterations, then thin by 500.
        bayes_spoofs = [[motif_from_theta(theta, len(motif))
                         for theta in tqdm(chain[iterations // 2::500])]
                        for chain, motif in tqdm(zip(chains, motifs))]
        psfms = [psfm_from_motif(motif, pc=1 / 4.0) for motif in motifs]
        psfm_spoofs = [[[sample_from_psfm(psfm) for _ in range(len(motif))]
                        for _ in range(10)]
                       for psfm, motif in zip(psfms, motifs)]
        maxent_spoofs = [spoof_maxent_motifs(motif, 10)
                         for motif in tqdm(motifs)]
        apws = [code_from_motif(motif, pc=1 / 16.0) for motif in tqdm(motifs)]
        apw_spoofs = [[[sample_site(apw) for _ in range(len(motif))]
                       for _ in range(10)]
                      for apw, motif in tqdm(zip(apws, motifs))]
        return {"chains": chains,
                "bayes_spoofs": bayes_spoofs,
                "maxent_spoofs": maxent_spoofs,
                "psfm_spoofs": psfm_spoofs,
                "apw_spoofs": apw_spoofs}

    data = {}
    if generate_data:
        for key, value in build_spoofs(prok_motifs).items():
            data["prok_" + key] = value
        data["euk_submotifs"] = [subsample(motif) for motif in euk_motifs]
        for key, value in build_spoofs(data["euk_submotifs"]).items():
            data["euk_" + key] = value
        for name in pickled:
            with open(name + ".pkl", 'w') as f:
                cPickle.dump(data[name], f)
    else:
        # NOTE: unpickling can execute arbitrary code; only load files that
        # were written by a previous generate_data=True run.
        for name in pickled:
            with open(name + ".pkl") as f:
                data[name] = cPickle.load(f)

    # The eukaryotic spoofs were derived from the subsampled motifs, so the
    # observed statistics must be computed from those same subsamples.
    observed_motifs = {"prok": prok_motifs, "euk": data["euk_submotifs"]}

    #ic_min, ic_max, mi_min, mi_max = 4.5, 25, -0.1, 0.7
    ic_min, ic_max, mi_min, mi_max = -.1, 2.6, -0.05, 1
    #ic_xtext, ic_ytext, mi_xtext, mi_ytext = 5, 20, -0.05, 0.5
    ic_xtext, ic_ytext, mi_xtext, mi_ytext = -0.05, 2.2, -0.05, 0.85
    mi_xticks = [0, 0.25, 0.5, 0.75, 1]
    ic_yticks = [0, 0.5, 1, 1.5, 2]

    # (domain, x label, statistic, axis limits, text anchor, tick positions),
    # in left-to-right column order.
    columns = [
        ("prok", "Prok IC", motif_ic_per_col,
         (ic_min, ic_max), (ic_xtext, ic_ytext), ic_yticks),
        ("euk", "Euk IC", motif_ic_per_col,
         (ic_min, ic_max), (ic_xtext, ic_ytext), ic_yticks),
        ("prok", "Prok MI", mi_per_col,
         (mi_min, mi_max), (mi_xtext, mi_ytext), mi_xticks),
        ("euk", "Euk MI", mi_per_col,
         (mi_min, mi_max), (mi_xtext, mi_ytext), mi_xticks),
    ]
    # (spoof-set key, y label), in top-to-bottom row order.
    rows = [("maxent", "MaxEnt"), ("psfm", "PSFM"), ("apw", "APW"),
            ("bayes", "Bayes")]

    sns.set_style('dark')
    for col, (domain, col_label, stat, limits, text_at, ticks) in \
            enumerate(columns):
        observed = [stat(motif) for motif in tqdm(observed_motifs[domain])]
        for row, (method, row_label) in enumerate(rows):
            spoof_sets = data["%s_%s_spoofs" % (domain, method)]
            spoofed = [mean([stat(spoof) for spoof in spoofs])
                       for spoofs in tqdm(spoof_sets)]

            plt.subplot(4, 4, 4 * row + col + 1)
            is_bottom = row == len(rows) - 1
            # Only the bottom row carries x ticks and an x label.
            if is_bottom:
                plt.xticks(ticks, ticks)
                plt.xlabel(col_label, fontsize='large')
            else:
                plt.xticks([])
            # Eukaryotic panels sit right of a prokaryotic panel with the
            # same limits, so their y ticks are hidden.
            if domain == "prok":
                plt.yticks(ticks, ticks)
            else:
                plt.yticks([])
            if col == 0:
                plt.ylabel(row_label, fontsize='large')
            plt.xlim(*limits)
            plt.ylim(*limits)

            # Spoofed mean on x, observed value on y.
            r, p = scatter(spoofed, observed)
            rmsd = sqrt(mean([(x - y) ** 2
                              for x, y in zip(observed, spoofed)]))
            text_x, text_y = text_at
            plt.text(text_x, text_y, s='$r^2$ = %1.3f' % (r**2))
            plt.text(text_x, text_y * 0.8, s='$RMSD$ = %1.3f' % rmsd)

    plt.tight_layout()
    maybesave("spoof-statistics-rmsd.pdf")
def plot_fragments(fragments, filename=None):
    """Plot ``map_fragments(fragments)`` as a labelled line and save it.

    The axes are labelled "Genomic coordinate" / "Read Density"; ``filename``
    is forwarded to ``maybesave``.
    """
    read_map = map_fragments(fragments)
    plt.plot(read_map)
    plt.title("Simulated ChIP-Seq Read Map")
    plt.xlabel("Genomic coordinate")
    plt.ylabel("Read Density")
    maybesave(filename)
def prok_euk_ic_gini_experiment(filename=None, pickle_filename=None):
    """Plot observed vs. spoofed Gini coefficients (figure 3 in gini paper).

    Four panels: prokaryotic and eukaryotic motifs, each against the mean
    Gini coefficient of their MaxEnt spoofs and of their uniform ("TU")
    spoofs.  Marker colour and shape encode the pattern reported by
    ``find_pattern``.  The Pearson correlation of each panel is printed.

    Parameters:
        filename: forwarded to ``maybesave``.
        pickle_filename: when given, all data is loaded from this pickle
            (the layout written to "prok_euk_ic_gini_all_data.pkl").  When
            None, everything is recomputed: 100 spoofs per motif and method,
            with the raw spoofs written to "prok_euk_ic_gini_experiment.pkl"
            and the summary data to "prok_euk_ic_gini_all_data.pkl".
    """
    if pickle_filename is None:
        # Machine-specific location of the JASPAR parser that provides the
        # eukaryotic motifs.
        sys.path.append("/home/pat/jaspar")
        from parse_jaspar import euk_motifs
        prok_motifs = bio_motifs
        # Cap eukaryotic motifs at 200 sites, sampling without replacement.
        euk_motifs = [motif if len(motif) <= 200
                      else sample(200, motif, replace=False)
                      for motif in euk_motifs]

        print("prok maxents")
        prok_maxents = [spoof_motifs_maxent(motif, num_motifs=100)
                        for motif in tqdm(prok_motifs)]
        print("prok uniforms")
        prok_uniforms = [spoof_motifs_uniform(motif, num_motifs=100)
                         for motif in tqdm(prok_motifs)]
        print("euk maxents")
        euk_maxents = [spoof_motifs_maxent(motif, num_motifs=100)
                       for motif in tqdm(euk_motifs)]
        print("euk uniforms")
        euk_uniforms = [spoof_motifs_uniform(motif, num_motifs=100)
                        for motif in tqdm(euk_motifs)]
        # Save the raw spoofs first; they are the expensive part.
        with open("prok_euk_ic_gini_experiment.pkl", 'w') as f:
            cPickle.dump((prok_maxents, prok_uniforms,
                          euk_maxents, euk_uniforms), f)

        def spoof_means(stat, spoof_sets):
            """Mean of ``stat`` over each motif's set of spoofs."""
            return [mean([stat(motif) for motif in motifs])
                    for motifs in spoof_sets]

        prok_ics = [motif_ic(motif) for motif in prok_motifs]
        prok_ginis = [motif_gini(motif) for motif in prok_motifs]
        euk_ics = [motif_ic(motif) for motif in euk_motifs]
        euk_ginis = [motif_gini(motif) for motif in euk_motifs]
        prok_maxent_ics = spoof_means(motif_ic, prok_maxents)
        prok_maxent_ginis = spoof_means(motif_gini, prok_maxents)
        prok_uniform_ics = spoof_means(motif_ic, prok_uniforms)
        prok_uniform_ginis = spoof_means(motif_gini, prok_uniforms)
        euk_maxent_ics = spoof_means(motif_ic, euk_maxents)
        euk_maxent_ginis = spoof_means(motif_gini, euk_maxents)
        euk_uniform_ics = spoof_means(motif_ic, euk_uniforms)
        euk_uniform_ginis = spoof_means(motif_gini, euk_uniforms)
        prok_patterns = [find_pattern(motif)[0]
                         for motif in tqdm(prok_motifs)]
        euk_patterns = [find_pattern(motif)[0] for motif in tqdm(euk_motifs)]

        with open("prok_euk_ic_gini_all_data.pkl", 'w') as f:
            cPickle.dump((prok_motifs, euk_motifs,
                          prok_ics, prok_ginis,
                          prok_maxent_ics, prok_maxent_ginis,
                          prok_uniform_ics, prok_uniform_ginis,
                          euk_ics, euk_ginis,
                          euk_maxent_ics, euk_maxent_ginis,
                          euk_uniform_ics, euk_uniform_ginis,
                          prok_patterns, euk_patterns), f)
    else:
        # NOTE: unpickling can execute arbitrary code; only pass files that
        # were written by this function.
        with open(pickle_filename) as f:
            (prok_motifs, euk_motifs,
             prok_ics, prok_ginis,
             prok_maxent_ics, prok_maxent_ginis,
             prok_uniform_ics, prok_uniform_ginis,
             euk_ics, euk_ginis,
             euk_maxent_ics, euk_maxent_ginis,
             euk_uniform_ics, euk_uniform_ginis,
             prok_patterns, euk_patterns) = cPickle.load(f)

    pattern_names = "direct-repeat inverted-repeat single-box".split()
    color_dict = dict(zip(pattern_names, sns.cubehelix_palette(3)))
    marker_dict = dict(zip(pattern_names, "o x ^".split()))
    xmin, xmax, ymin, ymax = 0, 0.6, 0, 0.6
    marker_size = 10

    # (printed heading, spoofed Gini, observed Gini, patterns, x label,
    #  y label), in subplot order.  Headings are printed verbatim.
    panels = [
        ("prok maxent", prok_maxent_ginis, prok_ginis, prok_patterns,
         None, "Prokaryotic IGC"),
        ("prok uniform", prok_uniform_ginis, prok_ginis, prok_patterns,
         None, None),
        ("euk maxent:", euk_maxent_ginis, euk_ginis, euk_patterns,
         "MaxEnt IGC", "Eukaryotic IGC"),
        ("euk uniform", euk_uniform_ginis, euk_ginis, euk_patterns,
         "TU IGC", None),
    ]

    sns.set_style('white')
    for index, (heading, spoofed, observed, patterns, x_label, y_label) in \
            enumerate(panels):
        plt.subplot(2, 2, index + 1)
        plt.xlim(xmin, xmax)
        plt.ylim(ymin, ymax)
        print(heading)
        print(pearsonr(spoofed, observed))
        # Points are drawn one at a time because each needs its own marker.
        for x, y, p in zip(spoofed, observed, patterns):
            plt.scatter(x, y, color=color_dict[p], marker=marker_dict[p],
                        s=marker_size)
        # y = x reference line.
        plt.plot([0, 1], [0, 1], linestyle='--', color='black')
        if x_label is not None:
            plt.xlabel(x_label, fontsize='large')
        if y_label is not None:
            plt.ylabel(y_label, fontsize='large')
    maybesave(filename)
def grand_spoofing_experiment(prok_motifs, euk_motifs):
    """Plot observed vs. spoof-predicted per-column MI for three samplers.

    For every motif, 10 spoofed motifs are drawn from each of
    ``spoof_maxent_motifs``, ``spoof_motif_cftp_occ`` and
    ``spoof_oo_motifs``.  Each collection of spoofs is pickled into the
    current working directory, the mean ``mi_per_col`` over each motif's
    spoofs is taken as the "predicted" MI, and a 1x3 figure (one panel per
    sampler; prokaryotic points in the default color, eukaryotic in green)
    is handed to ``maybesave("mi-spoof-plot.eps")``.

    Args:
        prok_motifs: sequence of prokaryotic motifs, used as-is.
        euk_motifs: sequence of eukaryotic motifs; each is passed through
            ``subsample`` before spoofing and before computing observed MI.

    Side effects:
        Writes the files ``prok_maxent_spoofs``, ``euk_maxent_spoofs``,
        ``prok_cftp_spoofs``, ``euk_cftp_spoofs``, ``prok_oo_spoofs`` and
        ``euk_oo_spoofs``, and draws on the current matplotlib figure.
    """
    spoofs_per_motif = 10

    # should we subsample once or each time??
    # NOTE(review): as written, every use of a eukaryotic motif calls
    # subsample() again, so (assuming subsample is random -- TODO confirm)
    # the spoofs and the observed MI come from different subsamples.
    prok_maxent_spoofs = [spoof_maxent_motifs(motif, spoofs_per_motif)
                          for motif in tqdm(prok_motifs)]
    euk_maxent_spoofs = [spoof_maxent_motifs(subsample(motif), spoofs_per_motif)
                         for motif in tqdm(euk_motifs)]
    prok_cftp_spoofs = [spoof_motif_cftp_occ(motif, spoofs_per_motif)
                        for motif in tqdm(prok_motifs)]
    euk_cftp_spoofs = [spoof_motif_cftp_occ(subsample(motif), spoofs_per_motif)
                       for motif in tqdm(euk_motifs)]
    prok_oo_spoofs = [spoof_oo_motifs(motif, spoofs_per_motif)
                      for motif in tqdm(prok_motifs)]
    # NOTE(review): the two *_oo_occ_spoofs results are never pickled or
    # plotted, and the eukaryotic one is the only euk sampler call that
    # skips subsample().  They are still computed so that a run performs
    # the same work as before; drop them once confirmed unnecessary.
    prok_oo_occ_spoofs = [spoof_oo_motifs_occ(motif, spoofs_per_motif)
                          for motif in tqdm(prok_motifs)]
    euk_oo_spoofs = [spoof_oo_motifs(subsample(motif), spoofs_per_motif)
                     for motif in tqdm(euk_motifs)]
    euk_oo_occ_spoofs = [spoof_oo_motifs_occ(motif, spoofs_per_motif)
                         for motif in tqdm(euk_motifs)]

    saved_spoofs = [
        ("prok_maxent_spoofs", prok_maxent_spoofs),
        ("euk_maxent_spoofs", euk_maxent_spoofs),
        ("prok_cftp_spoofs", prok_cftp_spoofs),
        ("euk_cftp_spoofs", euk_cftp_spoofs),
        ("prok_oo_spoofs", prok_oo_spoofs),
        ("euk_oo_spoofs", euk_oo_spoofs),
    ]
    for path, spoofs in saved_spoofs:
        # Pickle streams are binary data; text mode can corrupt them on
        # platforms that translate line endings.
        with open(path, 'wb') as f:
            cPickle.dump(spoofs, f)

    # The freshly computed spoofs are used directly from here on.  The
    # previous code re-read them from differently named files (some with a
    # ".pkl" suffix that was never written), which either raised IOError or
    # silently replaced the new results with stale ones.

    def mean_spoof_mis(spoofs_by_motif):
        # One predicted MI per motif: the mean mi_per_col over its spoofs.
        return [mean([mi_per_col(spoof) for spoof in spoofs])
                for spoofs in tqdm(spoofs_by_motif)]

    prok_mis = [mi_per_col(motif) for motif in prok_motifs]
    prok_maxent_mis = mean_spoof_mis(prok_maxent_spoofs)
    subsampled_euk_motifs = [subsample(motif) for motif in euk_motifs]
    euk_mis = [mi_per_col(motif) for motif in subsampled_euk_motifs]
    euk_maxent_mis = mean_spoof_mis(euk_maxent_spoofs)
    prok_cftp_mis = mean_spoof_mis(prok_cftp_spoofs)
    euk_cftp_mis = mean_spoof_mis(euk_cftp_spoofs)
    prok_oo_mis = mean_spoof_mis(prok_oo_spoofs)
    euk_oo_mis = mean_spoof_mis(euk_oo_spoofs)

    panels = [
        ("MaxEnt", prok_maxent_mis, euk_maxent_mis),
        ("Gaussian Linear Ensemble", prok_cftp_mis, euk_cftp_mis),
        ("Match-Mismatch", prok_oo_mis, euk_oo_mis),
    ]
    for panel_index, (title, prok_predicted, euk_predicted) in enumerate(panels):
        plt.subplot(1, 3, panel_index + 1)
        scatter(prok_predicted, prok_mis)
        scatter(euk_predicted, euk_mis, color='g')
        plt.xlabel("Predicted MI", fontsize='large')
        plt.ylabel("Observed MI", fontsize='large')
        plt.title(title, fontsize='large')
    plt.tight_layout()
    maybesave("mi-spoof-plot.eps")
def _canonical_digram_frequencies(tests, motifs, q, canonical_digrams):
    """Tally canonical digrams over the column pairs of ``motifs``.

    Returns three ``(frequencies, yerr)`` pairs, each ordered like
    ``canonical_digrams``: all column pairs, adjacent column pairs, and
    adjacent column pairs whose test value is ``<= q``.  ``yerr`` is the
    1.96-sigma binomial half-width ``1.96 * sqrt(p * (1 - p) / N)``.
    """
    all_counts = defaultdict(int)
    adj_counts = defaultdict(int)
    corr_counts = defaultdict(int)
    for motif_tests, motif in tqdm(zip(tests, motifs)):
        column_pairs = choose2(list(enumerate(transpose((motif)))))
        for test, ((i, coli), (j, colj)) in zip(motif_tests, column_pairs):
            for bi, bj in transpose((coli, colj)):
                # Fold each digram onto the smaller of itself and wc() of
                # itself, so both orientations share one key.
                digram = (bi, bj)
                rev_comp = tuple(wc(digram))
                if digram > rev_comp:
                    digram = rev_comp
                all_counts[digram] += 1
                if j == i + 1:
                    adj_counts[digram] += 1
                    # NOTE(review): the threshold check is nested under the
                    # adjacency check to match the "Corr. Adj." legend
                    # label; the source's indentation was lost, so confirm
                    # this nesting against the authoritative copy.
                    if test <= q:
                        corr_counts[digram] += 1

    def frequencies_with_ci(counts):
        # A category with no observations makes 1.0 / total raise
        # ZeroDivisionError (if normalize() does not fail first).
        total = float(sum(counts.values()))
        ps = normalize([counts[dg] for dg in canonical_digrams])
        yerr = [1.96 * sqrt(1.0 / total * p * (1 - p)) for p in ps]
        return ps, yerr

    return (frequencies_with_ci(all_counts),
            frequencies_with_ci(adj_counts),
            frequencies_with_ci(corr_counts))


def _plot_digram_panel(subplot_code, canonical_digrams, series, palette,
                       ylabel, ymax):
    """Draw one grouped-bar panel of digram frequencies with error bars."""
    (ps, yerr), (adj_ps, adj_yerr), (corr_ps, corr_yerr) = series
    positions = list(range(len(canonical_digrams)))
    ax = plt.subplot(subplot_code)
    plt.bar([x - 0.2 for x in positions], ps, label="All Column-Pairs",
            width=0.2, yerr=yerr, color=palette[0])
    plt.bar(positions, adj_ps, label="Adj. Column-Pairs",
            width=0.2, yerr=adj_yerr, color=palette[1])
    plt.bar([x + 0.2 for x in positions], corr_ps, alpha=1, capstyle='butt',
            label="Corr. Adj. Column-Pairs", width=0.2, yerr=corr_yerr,
            color=palette[3])
    ax.set_xticks(positions)
    ax.set_xticklabels(["".join(dg) for dg in canonical_digrams],
                       fontsize='large')
    plt.xlim(-0.5, 10.5)
    plt.ylim(0, ymax)
    plt.ylabel(ylabel, fontsize='large')
    plt.legend(loc='upper right')


def analyze_correlated_digrams_canonical(prok_tests, euk_tests, filename=None):
    """Plot canonical digram frequencies for prokaryotic and eukaryotic motifs.

    Digrams over "ACGT" are reduced to canonical form (the smaller of a
    digram and ``wc`` of it).  For each domain the frequency of every
    canonical digram is shown for all column pairs, adjacent column pairs,
    and adjacent column pairs passing the ``fdr`` threshold, as grouped
    bars with 1.96-sigma binomial error bars.  Prokaryotes go in the top
    panel, eukaryotes in the bottom one.

    NOTE: the motifs themselves are not parameters; this function reads the
    module-level names ``prok_motifs`` and ``euk_motifs``, which must line
    up element-for-element with ``prok_tests`` and ``euk_tests``.

    Args:
        prok_tests: per-motif sequences of test values, one value per
            column pair in ``choose2`` order, for ``prok_motifs``.
        euk_tests: the same for ``euk_motifs``.
        filename: forwarded to ``maybesave``.
    """
    digrams = [(b1, b2) for b1 in "ACGT" for b2 in "ACGT"]
    canonical_digrams = sorted(set(min(dg, tuple(wc(dg))) for dg in digrams))
    # Thresholds are derived from all test values of a domain pooled together.
    prok_q = fdr(concat(prok_tests))
    euk_q = fdr(concat(euk_tests))

    prok_series = _canonical_digram_frequencies(
        prok_tests, prok_motifs, prok_q, canonical_digrams)
    euk_series = _canonical_digram_frequencies(
        euk_tests, euk_motifs, euk_q, canonical_digrams)

    palette = sns.cubehelix_palette(4)
    _plot_digram_panel(211, canonical_digrams, prok_series, palette,
                       "Prokaryotic Frequency", 0.3)
    _plot_digram_panel(212, canonical_digrams, euk_series, palette,
                       "Eukaryotic Frequency", 0.2)
    maybesave(filename)
def plot_energy_matrix(matrix,filename=None):
    """Show ``matrix`` as an image after shifting each row by its maximum.

    Every entry has the maximum of its own row subtracted (so the largest
    entry of each row becomes 0), the shifted rows are passed through
    ``transpose``, and the result is drawn with nearest-neighbour
    interpolation plus a colorbar.  ``filename`` is forwarded to
    ``maybesave``.
    """
    def shift_by_row_max(row):
        # Subtract the row's own maximum from each of its entries.
        return [entry - max(row) for entry in row]

    shifted_rows = [shift_by_row_max(row) for row in matrix]
    image = transpose(shifted_rows)
    plt.imshow(image, interpolation='nearest')
    plt.colorbar()
    maybesave(filename)