def sigma_Ne_contour_plot(filename=None):
    """Draw a 2x2 grid of filled contour plots over a (sigma, Ne) grid.

    sigma takes 20 evenly spaced values in [0, 5] and Ne takes 20 evenly
    spaced values in [1, 20].  For every grid point, 100 motifs are drawn
    with ``sample_motif`` (L = 10, n = 50, copies = 10 * n).  The panels
    show, in order: ``expected_occupancy``, mean ``motif_ic``, mean
    ``motif_gini`` and mean ``total_motif_mi`` over those motifs.  The
    finished figure is handed to ``maybesave(filename)``.
    """
    sigma_grid = np.linspace(0, 5, 20)
    Ne_grid = np.linspace(1, 20, 20)
    L = 10
    n = 50
    copies = 10 * n
    trials = 100

    # Rows are indexed by Ne, columns by sigma; each cell holds `trials` motifs.
    motifss = []
    for Ne in tqdm(Ne_grid):
        motif_row = []
        for sigma in sigma_grid:
            motif_row.append([sample_motif(sigma, Ne, L, copies, n)
                              for _ in range(trials)])
        motifss.append(motif_row)

    occ_M = []
    for Ne in tqdm(Ne_grid):
        occ_M.append([expected_occupancy(sigma, Ne, L, copies)
                      for sigma in sigma_grid])

    def cell_mean(statistic):
        # Average `statistic` over the motifs sampled in a single grid cell.
        return lambda ms: mean([statistic(motif) for motif in ms])

    print("ic_M")
    ic_M = mmap(cell_mean(motif_ic), motifss)
    print("gini_M")
    gini_M = mmap(cell_mean(motif_gini), motifss)
    print("mi_M")
    mi_M = mmap(cell_mean(total_motif_mi), tqdm(motifss))

    for panel, values in enumerate([occ_M, ic_M, gini_M, mi_M], 1):
        plt.subplot(2, 2, panel)
        plt.contourf(sigma_grid, Ne_grid, values, cmap='jet')
        plt.colorbar()
    maybesave(filename)
def sigma_Ne_contour_plot(filename=None):
    """Contour-plot four motif statistics as functions of sigma and Ne.

    The grid is 20 sigma values in [0, 5] by 20 Ne values in [1, 20].
    Each cell is summarised from 100 motifs drawn by ``sample_motif``
    with L = 10, n = 50 and copies = 10 * n.  Panels 1-4 of the 2x2
    figure are ``expected_occupancy``, mean ``motif_ic``, mean
    ``motif_gini`` and mean ``total_motif_mi``; the figure is then passed
    to ``maybesave(filename)``.
    """
    sigmas = np.linspace(0, 5, 20)
    Nes = np.linspace(1, 20, 20)
    L, n, trials = 10, 50, 100
    copies = 10 * n

    def sample_cell(sigma, Ne):
        # All motifs for one (sigma, Ne) grid point.
        return [sample_motif(sigma, Ne, L, copies, n) for _ in range(trials)]

    motifss = [[sample_cell(sigma, Ne) for sigma in sigmas]
               for Ne in tqdm(Nes)]
    occ_M = [[expected_occupancy(sigma, Ne, L, copies) for sigma in sigmas]
             for Ne in tqdm(Nes)]

    # (progress label, per-motif statistic, wrap the rows in tqdm?)
    summaries = (
        ("ic_M", motif_ic, False),
        ("gini_M", motif_gini, False),
        ("mi_M", total_motif_mi, True),
    )
    grids = [occ_M]
    for label, statistic, with_progress in summaries:
        print(label)
        rows = tqdm(motifss) if with_progress else motifss
        grids.append(
            mmap(lambda ms, stat=statistic: mean([stat(m) for m in ms]), rows))

    for panel, grid in enumerate(grids, 1):
        plt.subplot(2, 2, panel)
        plt.contourf(sigmas, Nes, grid, cmap='jet')
        plt.colorbar()
    maybesave(filename)
def on_off_experiment2(num_motifs=100,
                       filename="gini-vs-mi-correlation-in-on-off-spoofs.pdf"):
    """Compare MI vs Gini on on-off spoofs of the biological motifs.

    For every motif named in ``tfdf.tfs`` a collection of spoofs is
    generated with ``spoof_on_off_motif``.  The Gini and total MI of each
    spoof are computed, and the Pearson correlation between the two
    statistics is taken per biological motif.  Correlation coefficients
    are scattered against their p-values on a log y-axis, with a dashed
    horizontal line at the value returned by ``fdr(ps)``.

    Args:
        num_motifs: number of spoofs requested per biological motif.
        filename: passed to ``maybesave`` once the figure is drawn.
    """
    bio_motifs = [getattr(tfdf, tf) for tf in tfdf.tfs]
    spoofses = [spoof_on_off_motif(motif, num_motifs=num_motifs, trials=1)
                for motif in bio_motifs]
    spoof_ginises = mmap(motif_gini, tqdm(spoofses))
    spoof_mises = mmap(total_motif_mi, tqdm(spoofses))

    cors, ps = [], []
    # Iterate over the statistics computed above; the previous names
    # `ginises` / `mises` were never defined and raised NameError.
    for ginis, mis in zip(spoof_ginises, spoof_mises):
        cor, p = pearsonr(ginis, mis)
        cors.append(cor)
        ps.append(p)
    q = fdr(ps)

    # scatter() accepts no `filename` keyword; saving is left to maybesave.
    plt.scatter(cors, ps)
    plt.plot([-1, 1], [q, q], linestyle='--',
             label="FDR-Adjusted Significance Level")
    plt.semilogy()
    plt.legend()
    plt.xlabel("Pearson Correlation Coefficient")
    plt.ylabel("P value")
    plt.xlim([-1, 1])
    plt.ylim([10**-4, 1 + 1])
    plt.title("Gini-MI Correlation Coefficient vs. P-value for On-Off Simulations from Prokaryotic Motifs")
    maybesave(filename)
def bio_detector_experiment(filename=None):
    """Use high Gini to detect biological motifs and plot the ROC curve.

    Positives: each motif in ``bio_motifs`` is scored with ``percentile``
    of its Gini against the Ginis of 100 maxent spoofs of that motif.
    Negative controls: the first spoof of every motif is scored the same
    way against 100 maxent spoofs of itself.  Both score lists are passed
    to ``roc_curve`` and the figure is handed to ``maybesave(filename)``.
    """
    spoofs_per_motif = 100

    def spoof_all(motifs):
        # One batch of maxent spoofs per motif, with a progress bar.
        return [spoof_motifs_maxent(motif, num_motifs=spoofs_per_motif)
                for motif in tqdm(motifs)]

    bio_ginis = [motif_gini(motif) for motif in bio_motifs]
    maxent_spoofs = spoof_all(bio_motifs)
    maxent_ginis = mmap(motif_gini, maxent_spoofs)
    ps = zipWith(percentile, bio_ginis, maxent_ginis)

    neg_controls = [first(spoofs) for spoofs in maxent_spoofs]
    neg_control_spoofs = spoof_all(neg_controls)
    neg_control_ginis = [motif_gini(motif) for motif in neg_controls]
    nc_ps = zipWith(percentile, neg_control_ginis,
                    mmap(motif_gini, neg_control_spoofs))

    roc_curve(ps, nc_ps)
    plt.xlabel("FPR", fontsize='large')
    plt.ylabel("TPR", fontsize='large')
    maybesave(filename)
def visualize_stationary_sum(matrix, n, Ne, T, samples_per_bin=100):
    """Plot the per-bin terms of the stationary-expectation sum for T.

    Bin k (k = 0 .. n*L - 1) holds ``samples_per_bin`` motifs obtained by
    ``mutate_motif_k_times(ringer, k)``.  The term for a bin is the mean
    over its motifs of ``exp(nu*log_fitness + log_rho_weight) * T(motif)``
    with nu = Ne - 1.  Z is the sum over bins of the same mean without
    the T factor.  Prints ``sum(terms) / Z`` and plots the terms against
    the bin index.

    NOTE(review): ``G`` is not defined in this function; it is presumably
    a module-level constant -- confirm.
    """
    L = len(matrix)
    nu = Ne - 1
    num_bins = n * L
    ringer = ringer_motif(matrix, n)

    motifss = [[mutate_motif_k_times(ringer, k)
                for _ in range(samples_per_bin)]
               for k in trange(num_bins)]
    log_fss = mmap(lambda motif: log_fitness(matrix, motif, G),
                   tqdm(motifss))
    Tss = mmap(T, tqdm(motifss))
    log_ws = [log_rho_weight(rho, n, L) for rho in range(num_bins)]

    terms = []
    for log_w, log_fs, Ts in zip(log_ws, log_fss, Tss):
        terms.append(mean(exp(nu * log_f + log_w) * t
                          for log_f, t in zip(log_fs, Ts)))

    Z = 0
    for log_w, log_fs, Ts in zip(log_ws, log_fss, Tss):
        Z += mean(exp(nu * log_f + log_w) for log_f, _ in zip(log_fs, Ts))

    print(sum(terms) / Z)
    plt.plot(range(num_bins), terms)
def uniform_motif_with_ic_imh_ref(n, L, desired_ic, epsilon=0.1,
                                  iterations=None, verbose=False,
                                  num_chains=8):
    """Sample motifs whose IC lies within epsilon of desired_ic by MH.

    Proposals do not depend on the current state: each column's count
    vector is drawn from a distribution derived from ``beta``, where
    ``beta`` is solved for a target IC of
    ``desired_ic + L * 3 / (2 * log(2) * n)``.  The target log-density is
    0 inside the IC window and -10.0**100 outside it.

    Args:
        n: number of sites per motif.
        L: number of columns per motif.
        desired_ic: centre of the accepted IC window.
        epsilon: half-width of the accepted IC window.
        iterations: if truthy, run one chain of this length; otherwise
            run ``num_chains`` chains until the Gelman-Rubin criterion
            is met.
        verbose: accepted but not used by this function.
        num_chains: number of parallel chains in Gelman-Rubin mode.

    Returns:
        The chain returned by ``mh`` when ``iterations`` is truthy;
        otherwise the list of chains once R_hat of their ICs is < 1.1.
    """
    correction_per_col = 3 / (2 * log(2) * n)
    desired_ic_for_beta = desired_ic + L * correction_per_col
    beta = find_beta_for_mean_motif_ic(n, L, desired_ic_for_beta)
    ps = count_ps_from_beta(n, beta)
    count_sampler = inverse_cdf_sampler(enumerate_counts(n), ps)

    def propose(motif):
        # Independence proposal: the current motif is ignored.  All counts
        # are drawn before any column is built from them.
        counts = [count_sampler() for _ in range(L)]
        cols = [sample_col_from_count(count) for count in counts]
        return ["".join(site) for site in transpose(cols)]

    def log_proposal_density(motif_p, motif):
        return beta * motif_ic(motif_p)

    def log_target(motif):
        if abs(motif_ic(motif) - desired_ic) < epsilon:
            return 0
        return -10.0**100

    def in_window(motif):
        return log_target(motif) > -1

    def fresh_proposal():
        return propose(None)

    def run_chain(start, length):
        return mh(log_target, proposal=propose, dprop=log_proposal_density,
                  x0=start, iterations=length, use_log=True, verbose=False)

    if iterations:
        start = sample_until(in_window, fresh_proposal, 1)[0]
        return run_chain(start, iterations)

    # No fixed length given: use the Gelman-Rubin criterion.
    starts = sample_until(in_window, fresh_proposal, num_chains)
    round_length = 100
    chains = [[] for _ in range(num_chains)]
    while True:
        for chain, start in zip(chains, starts):
            chain.extend(run_chain(start, round_length))
        ic_chains = mmap(motif_ic, chains)
        R_hat, neff = gelman_rubin(ic_chains)
        if R_hat < 1.1:
            return chains
        # Not converged: resume every chain from its last state, twice as long.
        starts = [chain[-1] for chain in chains]
        round_length *= 2
def cv_experiment(motifs, target='uniform'):
    """See whether js_psfm outperforms the ML PSFM under cross-validation.

    For every motif, each (train, test) split yielded by ``cv(motif)`` is
    used to fit a log-PSFM from ``psfm_from_motif`` and one from
    ``js_psfm``.  Each model's mean ``score_seq`` over the held-out sites
    is recorded.  The per-motif averages across splits are printed,
    together with whether the JS model scored higher.

    Args:
        motifs: iterable of motifs.
        target: forwarded to ``js_psfm``.

    Returns:
        (all_mls, all_js): per-motif average held-out scores for the ML
        and JS models respectively.
    """
    all_mls = []
    all_js = []
    for motif in motifs:
        fold_scores = []
        for train, test in cv(motif):
            ml_mat = mmap(log, psfm_from_motif(train))
            js_mat = mmap(log, js_psfm(train, target=target))
            ml_ll = mean(score_seq(ml_mat, site) for site in test)
            js_ll = mean(score_seq(js_mat, site) for site in test)
            fold_scores.append((ml_ll, js_ll))
        avg_ml_ll = mean([ml for ml, _ in fold_scores])
        avg_js_ll = mean([js for _, js in fold_scores])
        all_mls.append(avg_ml_ll)
        all_js.append(avg_js_ll)
        print("%s %s %s" % (avg_ml_ll, avg_js_ll, avg_ml_ll < avg_js_ll))
    return all_mls, all_js
def on_off_experiment2(num_motifs=100,
                       filename="gini-vs-mi-correlation-in-on-off-spoofs.pdf"):
    """Compare MI vs Gini on on-off spoofs of the biological motifs.

    Every motif named in ``tfdf.tfs`` is spoofed with
    ``spoof_on_off_motif``.  For each biological motif the Pearson
    correlation between the spoofs' Gini and total MI is computed.  The
    correlations are plotted against their p-values (log y-axis) next to
    a dashed line at the value returned by ``fdr(ps)``.

    Args:
        num_motifs: number of spoofs requested per biological motif.
        filename: passed to ``maybesave`` once the figure is drawn.
    """
    bio_motifs = [getattr(tfdf, tf) for tf in tfdf.tfs]
    spoofses = [
        spoof_on_off_motif(motif, num_motifs=num_motifs, trials=1)
        for motif in bio_motifs
    ]
    spoof_ginises = mmap(motif_gini, tqdm(spoofses))
    spoof_mises = mmap(total_motif_mi, tqdm(spoofses))

    # Pair up the per-motif statistics computed above; the loop used to
    # reference `ginises` / `mises`, which do not exist.
    correlations = [pearsonr(ginis, mis)
                    for ginis, mis in zip(spoof_ginises, spoof_mises)]
    cors = [cor for cor, _ in correlations]
    ps = [p for _, p in correlations]
    q = fdr(ps)

    # `filename` is not a scatter() keyword; maybesave handles the saving.
    plt.scatter(cors, ps)
    plt.plot([-1, 1], [q, q],
             linestyle='--',
             label="FDR-Adjusted Significance Level")
    plt.semilogy()
    plt.legend()
    plt.xlabel("Pearson Correlation Coefficient")
    plt.ylabel("P value")
    plt.xlim([-1, 1])
    plt.ylim([10**-4, 1 + 1])
    plt.title(
        "Gini-MI Correlation Coefficient vs. P-value for On-Off Simulations from Prokaryotic Motifs"
    )
    maybesave(filename)
def make_clusters_with_k(motif, k):
    """Split the sites of `motif` into k clusters by hard reassignment.

    Sites start in uniformly random clusters.  Each round fits one
    log-PSSM per cluster (pseudocount 1) and a mixture weight equal to the
    cluster's share of the sites, evaluates the mixture log-likelihood of
    the whole motif, and then moves every site to the cluster whose PSSM
    scores it highest.  Iteration stops when the log-likelihood repeats
    exactly from one round to the next.

    The mixture weights are re-estimated after every reassignment so that
    the log-likelihood always describes the current clusters; they were
    previously frozen at the random initial partition.

    Args:
        motif: non-empty sequence of equal-length sites.
        k: number of clusters.

    Returns:
        (clusters, log_likelihood): the final list of k site lists and
        the mixture log-likelihood of the model fitted to them.
    """
    print("k: %s" % (k,))
    L = len(motif[0])
    N = float(len(motif))
    clusters = [[] for _ in range(k)]
    print("len clusters: %s" % (len(clusters),))
    for site in motif:
        clusters[random.randrange(k)].append(site)
    print("finished initializing")

    def fit(current_clusters):
        # One log-PSSM and one mixture weight per cluster.
        fitted_pssms = [mmap(log, psfm_from_motif_(cluster, L, pc=1))
                        for cluster in current_clusters]
        fitted_alphas = [len(cluster) / N for cluster in current_clusters]
        return fitted_pssms, fitted_alphas

    def log_likelihood(pssms, alphas):
        return sum(
            log(sum(alpha * exp(score_seq(pssm, site))
                    for alpha, pssm in zip(alphas, pssms)))
            for site in motif)

    pssms, alphas = fit(clusters)
    # None can never equal a real log-likelihood, so the first round always runs.
    last_ll = None
    while True:
        cur_ll = log_likelihood(pssms, alphas)
        print("log likelihood: %s" % (cur_ll,))
        if cur_ll == last_ll:
            break
        last_ll = cur_ll
        clusters = [[] for _ in range(k)]
        for site in motif:
            best = argmax([score_seq(pssm, site) for pssm in pssms])
            clusters[best].append(site)
        pssms, alphas = fit(clusters)
    # Nothing has changed since cur_ll was computed, so it is the final value.
    return clusters, cur_ll
def estimate_stationary_statistic(matrix, n, Ne, T, samples_per_bin=10):
    """Given matrix, Ne and statistic T, estimate <T> under the stationary
    distribution, working in log space throughout.

    Bin rho (rho = 0 .. n*L - 1) holds ``samples_per_bin`` motifs obtained
    by ``mutate_motif_k_times(ringer, rho)``.  With nu = Ne - 1 and
    Boltzmann weight f**nu, the estimate is

        sum_rho rho_weight(rho) * mean(T * f**nu)
        -----------------------------------------
        sum_rho rho_weight(rho) * mean(f**nu)

    evaluated via ``logsum`` / ``logmean`` on log-weights.

    T must be strictly positive on the sampled motifs, since its
    logarithm is taken.

    NOTE(review): ``G`` is not defined in this function; it is presumably
    a module-level constant -- confirm.

    Args:
        matrix: energy matrix; its length gives the motif width L.
        n: number of sites per motif.
        Ne: effective population size (nu = Ne - 1).
        T: statistic, called on one motif at a time.
        samples_per_bin: motifs sampled per mutation count.

    Returns:
        The estimated expectation of T.
    """
    L = len(matrix)
    N = n * L
    nu = Ne - 1
    ringer = ringer_motif(matrix, n)
    all_sampless = [[mutate_motif_k_times(ringer, k)
                     for _ in range(samples_per_bin)]
                    for k in trange(N)]
    Tss = mmap(T, all_sampless)
    log_fss = mmap(lambda motif: log_fitness(matrix, motif, G),
                   all_sampless)
    # log of f**nu, kept in log space to avoid under/overflow.
    log_bz_weightss = [[nu * log_f for log_f in log_fs]
                       for log_fs in log_fss]
    # The per-bin weight is shared by numerator and denominator.
    log_rho_weights = [log_rho_weight(rho, n, L) for rho in range(N)]

    log_Z = logsum([
        logmean(log_bz_weights) + log_rho_w
        for log_rho_w, log_bz_weights in zip(log_rho_weights,
                                             log_bz_weightss)
    ])
    # log(t * w) is computed as log(t) + log(w): exponentiating the weight
    # first would under/overflow for large |nu * log_f|.
    log_numerator = logsum([
        logmean([log(t) + log_bz_w
                 for t, log_bz_w in zip(ts, log_bz_weights)]) + log_rho_w
        for log_rho_w, ts, log_bz_weights in zip(log_rho_weights, Tss,
                                                 log_bz_weightss)
    ])
    return exp(log_numerator - log_Z)
def estimate_stationary_statistic(matrix, n, Ne, T, samples_per_bin=10):
    """Given matrix, Ne and statistic T, estimate <T> under the stationary
    distribution by importance sampling perturbations from the ringer.

    Bin rho (rho = 0 .. n*L - 1) holds ``samples_per_bin`` motifs obtained
    by ``mutate_motif_k_times(ringer, rho)``.  Every sample gets the
    weight ``fitness ** (Ne - 1)`` and every bin the weight ``4 ** rho``.
    The result is the bin-weighted sum of the per-bin means of
    ``T * weight / Z``, where Z is the bin-weighted sum of the per-bin
    mean weights.

    NOTE(review): ``G`` is not defined in this function; it is presumably
    a module-level constant -- confirm.
    """
    L = len(matrix)
    num_bins = n * L
    nu = Ne - 1
    ringer = ringer_motif(matrix, n)

    all_sampless = []
    for k in trange(num_bins):
        all_sampless.append([mutate_motif_k_times(ringer, k)
                             for _ in range(samples_per_bin)])

    Tss = mmap(T, all_sampless)
    fss = mmap(lambda motif: fitness(matrix, motif, G), all_sampless)
    # Original author's note: this would be better expressed as
    # exp(nu*log(f)), but that form had numeric issues.
    bz_weightss = [[f ** nu for f in fs] for fs in fss]

    Z = 0
    for rho, bz_weights in enumerate(bz_weightss):
        Z += mean(bz_weights) * 4**rho

    estimate = 0
    for rho, (ts, bz_weights) in enumerate(zip(Tss, bz_weightss)):
        estimate += 4**rho * mean(t * bz_weight / Z
                                  for t, bz_weight in zip(ts, bz_weights))
    return estimate
def site_sampling_methods_study(n=50, num_motifs=10, plot=True):
    """Validate that the three proposed sampling methods

        brute force
        rejection sampling
        metropolis hastings

    do in fact sample from the same distribution.

    One matrix is drawn with ``sample_matrix(10, 1)``.  ``num_motifs``
    motifs of ``n`` sites are then sampled with each method (Ne = 5,
    mu = -10) while being timed.  Prints ``mean_ci`` of the motif ICs
    per method, the time per motif, and ``mannwhitneyu`` for every pair
    of IC samples; optionally draws a boxplot of the ICs.
    """
    L = 10
    sigma = 1
    matrix = sample_matrix(L, sigma)
    Ne = 5
    mu = -10

    def timed(label, sample_all):
        # Announce the method, run it, and return (motifs, elapsed seconds).
        print(label)
        started = time.time()
        motifs = sample_all()
        return motifs, time.time() - started

    bf_motifs, bf_time = timed(
        "bf",
        lambda: [sample_motif_bf(matrix, mu, Ne, n, verbose=True)
                 for _ in trange(num_motifs)])
    ar_motifs, ar_time = timed(
        "ar",
        lambda: [sample_motif_ar(matrix, mu, Ne, n)
                 for _ in range(num_motifs)])
    mh_motifs, mh_time = timed(
        "mh",
        lambda: [sample_motif_mh(matrix, mu, Ne, n)
                 for _ in range(num_motifs)])

    icss = mmap(motif_ic, [bf_motifs, ar_motifs, mh_motifs])
    ic_summaries = [mean_ci(ics) for ics in icss]
    print("ics: %s" % (ic_summaries,))
    per_motif_times = [elapsed / num_motifs
                       for elapsed in (bf_time, ar_time, mh_time)]
    print("time per motif: %s" % (per_motif_times,))
    if plot:
        plt.boxplot(icss)
    for xs, ys in choose2(icss):
        print(mannwhitneyu(xs, ys))
def uniform_motif_with_ic_rw(n, L, desired_ic, epsilon=0.1, p=None,
                             iterations=None, num_chains=8, x0=None,
                             beta=None):
    """Random-walk MH over motifs whose IC is within epsilon of desired_ic.

    Proposals come from ``mutate_motif_p(motif, p)``; the target is the
    indicator of ``abs(motif_ic(motif) - desired_ic) < epsilon``.
    Starting states, when needed, are taken from
    ``uniform_motif_with_ic_imh`` run for one iteration.

    ``beta`` is forwarded to ``uniform_motif_with_ic_imh`` in every mode;
    it used to be forwarded only when ``iterations`` was an int and was
    silently dropped otherwise.

    Args:
        n: number of sites per motif.
        L: number of columns per motif.
        desired_ic: centre of the accepted IC window.
        epsilon: half-width of the accepted IC window.
        p: mutation probability handed to ``mutate_motif_p``; defaults to
            2.0 / (n * L).
        iterations: an int for a fixed-length chain, the string
            "harmonic" for int(n*L*harmonic(n*L) / 0.2) iterations, or
            anything else to run ``num_chains`` chains until the
            Gelman-Rubin criterion is met.
        num_chains: number of chains in Gelman-Rubin mode.
        x0: optional starting motif for the single-chain modes; ignored
            in Gelman-Rubin mode.
        beta: forwarded to ``uniform_motif_with_ic_imh`` when a starting
            state has to be sampled.

    Returns:
        A single chain in the int / "harmonic" modes, otherwise the list
        of chains once R_hat of their ICs drops below 1.1.
    """
    if p is None:
        p = 2.0 / (n * L)

    def Q(motif):
        return mutate_motif_p(motif, p)

    def f(motif):
        return abs(motif_ic(motif) - desired_ic) < epsilon

    def initial_state():
        # One in-window motif from the independence sampler.
        return uniform_motif_with_ic_imh(n, L, desired_ic, epsilon=epsilon,
                                         iterations=1, beta=beta)[0]

    if type(iterations) is int:
        if x0 is None:
            x0 = initial_state()
        return mh(f, proposal=Q, x0=x0, iterations=iterations)

    if iterations == "harmonic":
        ar = 1.0 / 5
        iterations = int(n * L * harmonic(n * L) / ar)
        print("iterations: %s" % (iterations,))
        if x0 is None:
            x0 = initial_state()
        return mh(f, proposal=Q, x0=x0, iterations=iterations)

    # Otherwise: use the Gelman-Rubin criterion over several chains.
    x0s = [initial_state() for _ in range(num_chains)]
    iterations = 100
    chains = [[] for _ in range(num_chains)]
    while True:
        for chain, start in zip(chains, x0s):
            chain.extend(mh(f, proposal=Q, x0=start, iterations=iterations,
                            verbose=False))
        ic_chains = mmap(motif_ic, chains)
        R_hat, neff = gelman_rubin(ic_chains)
        if R_hat < 1.1:
            return chains
        # Not converged: continue every chain from its last state, twice as long.
        x0s = [chain[-1] for chain in chains]
        iterations *= 2
def normalize_matrix(xss):
    """Divide every entry of the matrix `xss` by the sum of all entries.

    The grand total is converted to float before dividing, and the
    result is built with ``mmap``.
    """
    grand_total = float(sum(sum(row) for row in xss))

    def scale(entry):
        return entry / grand_total

    return mmap(scale, xss)