def cumsum_test():
    """Plot cumulative ArcA read density under score- and probability-based orderings.

    Combines the two read-density tracks returned by density_from_reads,
    scores the genome and its wc() counterpart with the ArcA PSSM, and plots
    cumsum() of the combined density re-ordered (via rslice) by descending
    score and by descending Boltzmann probability.  Five shuffled copies of
    the density are plotted in red as controls.
    """
    reads = get_arca_reads(1000000)
    strand_rdm = density_from_reads(reads, G)
    pssm = make_pssm(Escherichia_coli.ArcA)
    comb_rdm = strand_rdm[0] + strand_rdm[1]

    # NOTE(review): other functions in this file unpack score_genome_np's
    # result as a (fwd, rev) pair; here it is used as a single array per
    # call -- confirm which contract is current.
    print("fwd_scores")
    fwd = score_genome_np(pssm, genome)
    print("rev_scores")
    rev = score_genome_np(pssm, wc(genome))

    # Log of the summed exponentiated scores of both calls.
    scores = np.log(np.exp(fwd) + np.exp(rev))
    weights = np.exp(scores)
    probs = weights / np.sum(weights)

    # Both orderings run from greatest to least.
    print("sorting scores")
    by_score = sorted_indices(scores)[::-1]
    print("sorting probs")
    by_prob = sorted_indices(probs)[::-1]

    for label, ordering in (("scores", by_score), ("boltzmann probs", by_prob)):
        plt.plot(cumsum(rslice(comb_rdm, ordering)), label=label)

    # Controls: the same densities in random order (re-shuffled in place
    # each round, so every control starts from the previous permutation).
    shuffled = list(comb_rdm)
    n_controls = 5
    for round_idx in range(n_controls):
        print(round_idx)
        random.shuffle(shuffled)
        plt.plot(cumsum(shuffled), color='r')

    plt.legend(loc=0)
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.show()
def L_vs_sigma_plot(filename=None, with_bio=False):
    """Plot motif length L against sigma together with two reference curves.

    Draws crit_L over sigma in [0.1, 10] ("Binding Transition") and a dashed
    horizontal line at log(G, 2) / 2 ("Info Theory Threshold").  When
    `with_bio` is true, the motifs exposed by
    extract_motif_object_from_tfdf() are overlaid as a scatter of
    (mean sd over the motif's PSSM rows, motif length).

    Args:
        filename: passed straight to maybesave() once the figure is drawn.
        with_bio: overlay the biological motifs when true.
    """
    if with_bio:
        tfdf = extract_motif_object_from_tfdf()
        motifs = [getattr(tfdf, tf) for tf in tfdf.tfs]
        # Only length and sigma are plotted; the other per-motif statistics
        # the function used to compute here were never used.
        Ls = [len(motif[0]) for motif in motifs]
        sigmas = [mean([sd(row) for row in make_pssm(motif)])
                  for motif in motifs]

    min_sigma = 0.1
    max_sigma = 10
    plt.xlim(0, max_sigma)
    plt.ylim(0, 60)
    plt.plot(*pl(crit_L, np.linspace(min_sigma, max_sigma, 1000)),
             label="Binding Transition")
    info_threshold = log(G, 2) / 2
    plt.plot([min_sigma, max_sigma], [info_threshold, info_threshold],
             linestyle='--', label="Info Theory Threshold")
    # Disabled curve, kept for reference:
    # plt.plot(*pl(lambda sigma:log(G)/sigma,np.linspace(min_sigma,max_sigma,1000)),
    #          linestyle='--',label="Zero Discrimination Asymptote")
    if with_bio:
        plt.scatter(sigmas, Ls, label="Biological Motifs")
    plt.xlabel("sigma")
    plt.ylabel("L")
    plt.legend()
    maybesave(filename)
def validate_sample_motif_neglect_fg2(iterations=50000):
    """Compare fg_neglect sampling to MCMC.

    Draws `iterations` motifs with sample_motif_neglect_fg and a chain from
    sella_hirsch_mh (both at Ne = 2.375, on the negated LexA PSSM), then
    scatters log fitness and IC against Hamming distance from the ringer
    motif.  Sampled motifs use the default colour; the chain is green.
    """
    bio_motif = Escherichia_coli.LexA
    n, L = len(bio_motif), len(bio_motif[0])
    # Sign-flipped PSSM entries.
    matrix = [[-ep for ep in row] for row in make_pssm(bio_motif)]
    ringer = ringer_motif(matrix, n)
    Ne = 2.375

    def summarize(motifs):
        # Per-motif (rho, log fitness, IC), computed in that order.
        rhos = [motif_hamming_distance(ringer, m) for m in tqdm(motifs)]
        log_fs = [log_fitness(matrix, m, G) for m in tqdm(motifs)]
        ics = map(motif_ic, motifs)
        return rhos, log_fs, ics

    random_motifs = [sample_motif_neglect_fg(matrix, n, Ne)
                     for _ in trange(iterations)]
    random_rhos, random_log_fs, random_ics = summarize(random_motifs)

    _, chain = sella_hirsch_mh(matrix=matrix, init="ringer", Ne=Ne, n=n,
                               iterations=iterations)
    chain_rhos, chain_log_fs, chain_ics = summarize(chain)

    panels = (
        ("log fitness", random_log_fs, chain_log_fs),
        ("IC", random_ics, chain_ics),
    )
    for position, (ylabel, sampled, mcmc) in enumerate(panels, 1):
        plt.subplot(1, 2, position)
        plt.scatter(random_rhos, sampled)
        plt.scatter(chain_rhos, mcmc, color='g')
        plt.xlabel("rho")
        plt.ylabel(ylabel)
def make_ecoli_df():
    """Return a DataFrame of per-TF motif statistics for Escherichia_coli.

    One row per name in Escherichia_coli.tfs (used as the index), with
    columns:
        L            -- length of the first site
        n            -- number of sites
        sigma        -- mean of sd() over the rows of the motif's PSSM
        motif_ic     -- motif_ic(sites)
        info_density -- motif_ic(sites) / L
    """
    Ls = []
    ns = []
    sigmas = []
    motif_ics = []
    motif_ics_per_base = []
    for tf in Escherichia_coli.tfs:
        sites = getattr(Escherichia_coli, tf)
        L = len(sites[0])
        # Compute the information content once and reuse it for both columns.
        ic = motif_ic(sites)
        Ls.append(L)
        ns.append(len(sites))
        sigmas.append(mean([sd(row) for row in make_pssm(sites)]))
        motif_ics.append(ic)
        motif_ics_per_base.append(ic / float(L))
    return pd.DataFrame(
        {
            "L": Ls,
            "n": ns,
            "sigma": sigmas,
            "motif_ic": motif_ics,
            "info_density": motif_ics_per_base
        },
        index=Escherichia_coli.tfs)
def test_estimate_stationary_statistic_ref_framework():
    """Plot and return predicted vs. observed motif_ic across a range of Ne.

    Runs test_estimate_stationary_statistic_ref on the LexA PSSM for ten Ne
    values evenly spaced over [1, 5]; each call is expected to yield a
    (predicted, observed) pair.  Both series are plotted against Ne.

    Returns:
        (pred, obs) as produced by transpose() of the per-Ne pairs.
    """
    lexa_sites = Escherichia_coli.LexA
    matrix = make_pssm(lexa_sites)
    n = len(lexa_sites)
    Nes = np.linspace(1, 5, 10)

    pairs = []
    for Ne in Nes:
        pairs.append(
            test_estimate_stationary_statistic_ref(matrix, n, Ne, T=motif_ic))
    pred, obs = transpose(pairs)

    for series in (pred, obs):
        plt.plot(Nes, series)
    return pred, obs
def spoof_motif(motif, T):
    """Predict statistic T for a motif at the Ne that reproduces its IC.

    Finds, by bisection over Ne in [0.01, 5], the Ne at which predict_stat's
    mean IC (via mean_ic_from_rho) equals motif_ic(motif), then returns
    predict_stat evaluated with the caller's statistic T at that Ne.

    Args:
        motif: collection of equal-length sites.
        T: statistic forwarded to predict_stat as its `T` argument.

    Returns:
        Whatever predict_stat returns for (n, L, sigma, Ne, T=T).
    """
    n = len(motif)
    L = len(motif[0])
    bio_ic = motif_ic(motif)
    sigma = 2 * mean([sd(row) for row in make_pssm(motif)])  # XXX REVISIT THIS ISSUE

    def ic_excess(Ne):
        # Predicted mean IC at this Ne minus the motif's actual IC.
        predicted_ic = predict_stat(
            n, L, sigma, Ne,
            G=5 * 10**6,
            T=lambda rho: mean_ic_from_rho(rho, n, L))
        return predicted_ic - bio_ic

    Ne = bisect_interval(ic_excess, 0.01, 5)
    # Pass T by keyword: the fit above supplies both G= and T= by keyword, so
    # a fifth positional argument is not guaranteed to bind to T.
    # NOTE(review): the fit uses G=5 * 10**6 while this call relies on
    # predict_stat's default G -- confirm the two are meant to agree.
    return predict_stat(n, L, sigma, Ne, T=T)
def arca_motif_comparison():
    """Overlay ArcA read densities, known-site locations and PSSM scores.

    Plots, on the current axes: the two read-density tracks from
    density_from_reads, an indicator track that is 1 at every genome offset
    where a known ArcA site (or its wc() counterpart) matches, and the
    combined log score of both score tracks.
    """
    reads = get_arca_reads()
    true_rdm = density_from_reads(reads, G)
    pssm = make_pssm(Escherichia_coli.ArcA)
    for track in (0, 1):
        plt.plot(true_rdm[track])

    fwd, rev = score_genome_np(pssm, genome)
    scores = np.log(np.exp(fwd) + np.exp(rev))

    # Every known site together with its wc() counterpart.
    sites = concat([(site, wc(site)) for site in Escherichia_coli.ArcA])

    # Mark the start offset of each match of each site in the genome.
    hit_starts = []
    for site in sites:
        for match in re.finditer(site, genome):
            hit_starts.append(match.start(0))
    site_track = np.zeros(G)
    for start in hit_starts:
        site_track[start] = 1

    plt.plot(site_track)
    plt.plot(scores)
def validate_sample_motif_neglect_fg():
    """Compare fg_neglect sampling to random mutation.

    Indeed shows better fitness at a given rho.  Two curves of log fitness
    against Hamming distance from the ringer motif are drawn for LexA:

      1. the ringer mutated k times, for k in range(n * L);
      2. motifs drawn by sample_motif_neglect_fg for n * L values of Ne
         evenly spaced over [1, 10].
    """
    bio_motif = Escherichia_coli.LexA
    n = len(bio_motif)
    L = len(bio_motif[0])
    # Sign-flipped PSSM entries.
    matrix = [[-ep for ep in row] for row in make_pssm(bio_motif)]
    ringer = ringer_motif(matrix, n)

    random_motifs = [mutate_motif_k_times(ringer, k) for k in range(n * L)]
    random_motifs2 = [sample_motif_neglect_fg(matrix, n, Ne)
                      for Ne in np.linspace(1, 10, n * L)]

    random_rhos = [motif_hamming_distance(ringer, motif)
                   for motif in random_motifs]
    random_log_fs = [log_fitness(matrix, motif, G) for motif in random_motifs]
    random_rhos2 = [motif_hamming_distance(ringer, motif)
                    for motif in random_motifs2]
    random_log_fs2 = [log_fitness(matrix, motif, G)
                      for motif in random_motifs2]

    plt.plot(random_rhos, random_log_fs)
    plt.plot(random_rhos2, random_log_fs2)
    # A third plot of random_rhos3 / random_log_fs3 used to follow, but those
    # names were never defined and the call raised NameError.
    plt.xlabel("rho")
    plt.ylabel("log fitness")
def Ne_from_motif(bio_motif, interp_rounds, iterations=50000):
    """Given a motif, return Ne that matches mean IC.

    Procedure:
      1. f(Ne) runs sella_hirsch_mh for `iterations` steps and returns the
         mean motif_ic over the second half of the chain minus the motif's
         own IC.
      2. Ne_max starts at 5 and is doubled until f(Ne_max) is non-negative.
      3. f is sampled at `interp_rounds` evenly spaced Ne values in
         [1, Ne_max] and smoothed with gaussian_interp.
      4. The interpolant's root on [1, Ne_max] is found by bisect_interval.

    Args:
        bio_motif: collection of equal-length sites.
        interp_rounds: number of Ne values at which f is sampled.
        iterations: chain length for every sella_hirsch_mh run.

    Returns:
        The Ne at which the smoothed f crosses zero.
    """
    bio_ic = motif_ic(bio_motif)
    n = len(bio_motif)
    # Sign-flipped PSSM entries.
    matrix = [[-ep for ep in row] for row in make_pssm(bio_motif)]
    print(len(matrix))

    def f(Ne, iterations=iterations):
        print("Ne %s" % (Ne,))
        _, chain = sella_hirsch_mh(matrix=matrix, n=n, Ne=Ne,
                                   iterations=iterations, init='ringer')
        # Discard the first half of the chain; // keeps the slice index an
        # integer regardless of division semantics.
        second_half = chain[iterations // 2:]
        return mean([motif_ic(motif) for motif in second_half]) - bio_ic

    Ne_min = 1
    Ne_max = 5
    while f(Ne_max) < 0:
        print("increasing Ne max")
        Ne_max *= 2
    xs, ys = transpose([(Ne, f(Ne))
                        for Ne in np.linspace(Ne_min, Ne_max, interp_rounds)])

    # Now find an interpolant.  We want the smallest sigma of the gaussian
    # interpolant whose curve has exactly one inflection point on the grid.
    grid = np.linspace(Ne_min, Ne_max, 100)
    for i, sigma in enumerate(np.linspace(0.01, 1, 100)):
        print("%s %s" % (i, sigma))
        interp = gaussian_interp(xs, ys, sigma=sigma)
        if num_inflection_points([interp(x) for x in grid]) == 1:
            print("found 1 inflection point")
            break
    else:
        # No sigma qualified; as before, the widest interpolant is used,
        # but say so rather than continuing silently.
        print("warning: no interpolant with exactly 1 inflection point; "
              "using largest sigma")
    print(sigma)
    Ne = bisect_interval(interp, Ne_min, Ne_max)
    return Ne
def power_law_exploration():
    """Are the read densities for a given bin power-law distributed?

    Groups the combined read density at each of the G genome positions by
    the integer-truncated combined PSSM score at that position, then plots,
    for every score bin, the relative frequency of each distinct density
    value on log-log axes.
    """
    print("getting arca reads")
    reads = get_arca_reads(1000000)
    print("computing read density map")
    strand_rdm = density_from_reads(reads, G)
    comb_rdm = strand_rdm[0] + strand_rdm[1]
    pssm = make_pssm(Escherichia_coli.ArcA)

    print("scoring")
    fwd, rev = score_genome_np(pssm, genome)
    scores = np.log(np.exp(fwd) + np.exp(rev))

    densities_by_bin = defaultdict(list)
    print("tabulating")
    for pos in xrange(G):
        # int() truncates toward zero, so each bin is one integer score.
        densities_by_bin[int(scores[pos])].append(comb_rdm[pos])

    print("plotting")
    for score_bin in sorted(densities_by_bin):
        tally = Counter(densities_by_bin[score_bin])
        total = float(sum(tally.values()))
        levels = sorted(tally)
        freqs = [tally[level] / total for level in levels]
        plt.plot(levels, freqs, label=score_bin)
    plt.loglog()
    plt.show()
def jackknife_distribution(motif):
    """Return leave-one-out scores for every site in `motif`.

    For each index i, a PSSM is built (make_pssm) from all sites except
    motif[i], and motif[i] is scored against it with score_seq.

    Returns:
        A list of scores, one per site, in the motif's order.
    """
    def held_out_score(held_out):
        others = [site for idx, site in enumerate(motif) if idx != held_out]
        return score_seq(make_pssm(others), motif[held_out])

    return [held_out_score(i) for i in range(len(motif))]
def _plot_length_vs_sigma(panel, sigmas, lengths, labels, sigma_space,
                          crit_lambs_actual):
    """Draw one annotated (sigma, length) scatter with the critical-length curves."""
    plt.subplot(1, 6, panel)
    plt.scatter(sigmas, lengths)
    for length, sigma, label in zip(lengths, sigmas, labels):
        plt.annotate(label, xy=(sigma, length))
    for genome_size in (5 * 10**6, 4.5 * 10**6):
        # Bind genome_size as a default so the lambda does not depend on the
        # loop variable's later value.
        plt.plot(*pl(lambda sigma, size=genome_size:
                     critical_lamb(sigma, G=size), sigma_space))
    plt.plot(sigma_space, crit_lambs_actual)


def _plot_predicted_vs_observed(panel, predicted, observed, title):
    """Draw one predicted-vs-observed length scatter and print its Pearson r."""
    plt.subplot(1, 6, panel)
    plt.scatter(predicted, observed)
    plt.xlabel("Predicted Length")
    plt.ylabel("Observed Length")
    plt.plot([0, 30], [0, 30])  # identity line for reference
    plt.title(title)
    print("%s %s" % (title, pearsonr(predicted, observed)))


def make_ecoli_sigma_L_plot():
    """Plot E. coli motif lengths against sigma and against predicted lengths.

    For every TF in Escherichia_coli.tfs this collects the site length L, the
    adjusted length L + log2(n) (n = number of sites) and sigma (mean of sd()
    over the rows of the motif's PSSM).  Six panels are drawn in a 1x6 grid:

      1-2. sigma vs L and sigma vs adjusted L, annotated with TF names and
           overlaid with critical_lamb (G = 5e6 and 4.5e6) and
           critical_lamb_actual (G = 4.5e6, trials=100) over sigma in
           [0.1, 3].
      3-6. observed L / adjusted L against lengths predicted from each TF's
           sigma by critical_lamb and by critical_lamb_actual, with the
           Pearson correlation printed for each panel.

    Returns:
        (Ls, sigmas): lists of site lengths and sigmas in TF order.
    """
    Ls = []
    Ls_adj = []
    sigmas = []
    labels = []
    for tf in Escherichia_coli.tfs:
        sites = getattr(Escherichia_coli, tf)
        L = len(sites[0])
        Ls.append(L)
        Ls_adj.append(L + log2(len(sites)))
        sigmas.append(mean([sd(row) for row in make_pssm(sites)]))
        labels.append(tf)

    sigma_space = np.linspace(0.1, 3, 10)
    # Materialised as a list because the curve is drawn in two panels.
    crit_lambs_actual = [
        critical_lamb_actual(sigma, G=4.5 * 10**6, trials=100)
        for sigma in tqdm(sigma_space)
    ]

    _plot_length_vs_sigma(1, sigmas, Ls, labels, sigma_space,
                          crit_lambs_actual)
    _plot_length_vs_sigma(2, sigmas, Ls_adj, labels, sigma_space,
                          crit_lambs_actual)

    preds = [critical_lamb(sigma, G=4.5 * 10**6) for sigma in tqdm(sigmas)]
    preds_actual = [
        critical_lamb_actual(sigma, G=4.5 * 10**6, trials=100)
        for sigma in tqdm(sigmas)
    ]

    comparisons = (
        ("Preds vs Ls", preds, Ls),
        ("Preds vs Ls_adj", preds, Ls_adj),
        ("Preds_actual vs Ls", preds_actual, Ls),
        ("Preds_actual vs Ls_adj", preds_actual, Ls_adj),
    )
    for panel, (title, predicted, observed) in enumerate(comparisons, 3):
        _plot_predicted_vs_observed(panel, predicted, observed, title)
    return Ls, sigmas