def avg_ic_from_theta(theta, N, L, trials=3):
    """Return the mean ``motif_ic`` of motifs sampled under ``theta``.

    ``theta`` is unpacked as ``(sigma, mu, Ne)``.  ``trials`` matrices are
    drawn with ``sample_matrix(L, sigma)``; one motif is then sampled from
    each with ``sample_motif_cftp(matrix, mu, Ne, N)`` and the resulting
    ``motif_ic`` values are averaged with ``mean``.
    """
    sigma, mu, Ne = theta
    # All matrices are drawn before any motif is sampled, and all motifs are
    # sampled before any is scored, so the three passes stay separate.
    drawn_matrices = []
    for _ in xrange(trials):
        drawn_matrices.append(sample_matrix(L, sigma))
    drawn_motifs = []
    for mat in drawn_matrices:
        drawn_motifs.append(sample_motif_cftp(mat, mu, Ne, N))
    return mean([motif_ic(m) for m in drawn_motifs])
def sample_motifs_evo_ic(motif, iterations=1000, verbose=False, theta=None):
    """Sample parameter triples targeting ``motif``'s IC, then one motif each.

    The number of sites ``N``, the site length ``L`` and the target value
    ``motif_ic(motif)`` are read off ``motif`` and handed to
    ``evo_ic_sample_motif2``.  Every ``(sigma, mu, Ne)`` triple in the chain
    it returns is then used to draw a fresh matrix and sample a motif of
    ``N`` sites from it.

    Args:
        motif: sequence of equal-length sites; must be non-empty.
        iterations: forwarded to ``evo_ic_sample_motif2``.
        verbose: forwarded to ``evo_ic_sample_motif2``.
        theta: forwarded to ``evo_ic_sample_motif2``.

    Returns:
        ``(chain, motifs)`` where ``motifs[i]`` was sampled from ``chain[i]``.
    """
    N = len(motif)
    L = len(motif[0])
    des_ic = motif_ic(motif)
    # Pass the caller's flag through; hard-coding False here would make the
    # ``verbose`` parameter of this function a no-op.
    chain = evo_ic_sample_motif2(
        N, L, des_ic, iterations=iterations, verbose=verbose, theta=theta)
    motifs = [
        sample_motif_cftp(sample_matrix(L, sigma), mu, Ne, N)
        for (sigma, mu, Ne) in tqdm(chain)
    ]
    return chain, motifs
def experiment2_():
    """Compare ``mh`` fitness traces of a ``sample_code`` model and a PSSM.

    A code is drawn with ``sample_code(L, sigma)`` and scored on 10000
    random sites; the ``sd`` of those scores sets the per-column sigma of a
    PSSM drawn with ``sample_matrix``.  One ``mh`` chain is run per model
    with target ``1 / (1 + exp(ep - mu))**(Ne - 1)`` and the target is then
    re-evaluated on every chain entry.

    Returns:
        ``(apw_fits, linear_fits)``: the target value at each entry of the
        code-model chain and of the PSSM chain respectively.
    """
    L = 10
    sigma = 1
    mu = -10
    Ne = 2
    # Use ``sigma`` rather than repeating the literal so the constant has a
    # single definition.
    code = sample_code(L, sigma)
    sites = [random_site(L) for i in xrange(10000)]
    site_sigma = sd([score(code, site) for site in sites])
    # Presumably chosen so that L columns of this sigma reproduce the spread
    # measured for the code model -- TODO confirm against sample_matrix.
    pssm = sample_matrix(L, sqrt(site_sigma**2 / L))

    def phat(ep):
        # Shared by both models; only the way ``ep`` is computed differs.
        return 1 / (1 + exp(ep - mu))**(Ne - 1)

    def apw_phat(site):
        return phat(score(code, site))

    def linear_phat(site):
        return phat(score_seq(pssm, site))

    apw_chain = mh(apw_phat, proposal=mutate_site, x0=random_site(L))
    linear_chain = mh(linear_phat, proposal=mutate_site, x0=random_site(L))
    apw_fits = map(apw_phat, apw_chain)
    linear_fits = map(linear_phat, linear_chain)
    return apw_fits, linear_fits
def experiment3(trials=10):
    """Scatter geometric-mean occupancy against site-score spread, two models.

    For ``trials`` random ``sample_code`` models ("apw") and ``trials`` random
    ``sample_matrix`` PSSMs ("linear"), the x value is the ``sd`` of the
    model's score over 10000 random sites.  The y value comes from one ``mh``
    chain per model whose target is ``1 / (1 + exp(ep - mu))**(Ne - 1)`` and
    whose ``capture_state`` callback is ``1 / (1 + exp(ep - mu))``; the first
    chain entry is dropped and the geometric mean of the rest is taken.
    Both point sets are drawn with ``plt.scatter`` on a log-scaled y axis.
    Nothing is returned.

    Args:
        trials: number of models drawn for each of the two model kinds.
    """
    mu = -10
    Ne = 5
    L = 10
    sigma = 1
    codes = [sample_code(L, sigma) for i in range(trials)]
    pssms = [sample_matrix(L, sigma) for i in range(trials)]
    sites = [random_site(L) for i in xrange(10000)]
    apw_site_sigmas = [
        sd([score(code, site) for site in sites]) for code in codes
    ]
    linear_site_sigmas = [
        sd([score_seq(pssm, site) for site in sites]) for pssm in pssms
    ]

    def phat(ep):
        return 1 / (1 + exp(ep - mu))**(Ne - 1)

    def occ(ep):
        return 1 / (1 + exp(ep - mu))

    def geometric_mean_occ(energy):
        """Run one chain for the scoring function ``energy``; summarise it."""
        chain = mh(lambda s: phat(energy(s)),
                   proposal=mutate_site,
                   x0=random_site(L),
                   capture_state=lambda s: occ(energy(s)))
        # The logs are base 10, so the inverse must be a power of 10;
        # exp() of a log10 average is not a geometric mean.
        return 10 ** mean(map(log10, chain[1:]))

    apw_mean_fits = [
        geometric_mean_occ(lambda s, code=code: score(code, s))
        for code in tqdm(codes)
    ]
    linear_mean_fits = [
        geometric_mean_occ(lambda s, pssm=pssm: score_seq(pssm, s))
        for pssm in tqdm(pssms)
    ]
    plt.scatter(apw_site_sigmas, apw_mean_fits, label='apw')
    plt.scatter(linear_site_sigmas, linear_mean_fits, color='g', label='linear')
    plt.semilogy()
    plt.legend(loc='lower right')
def experiment3(trials=10):
    """Scatter geometric-mean occupancy against site-score spread, two models.

    ``trials`` ``sample_code`` models ("apw") and ``trials`` ``sample_matrix``
    PSSMs ("linear") are drawn.  Each model's x coordinate is the ``sd`` of
    its score over a shared set of 10000 random sites.  Its y coordinate is
    the geometric mean of the values recorded by one ``mh`` chain (first
    entry dropped) whose target is ``1 / (1 + exp(ep - mu))**(Ne - 1)`` and
    whose ``capture_state`` callback is ``1 / (1 + exp(ep - mu))``.  The two
    point sets are plotted with ``plt.scatter`` on a log-scaled y axis.
    Nothing is returned.

    Args:
        trials: number of models drawn for each of the two model kinds.
    """
    mu = -10
    Ne = 5
    L = 10
    sigma = 1
    codes = [sample_code(L, sigma) for i in range(trials)]
    pssms = [sample_matrix(L, sigma) for i in range(trials)]
    sites = [random_site(L) for i in xrange(10000)]
    apw_site_sigmas = [
        sd([score(code, site) for site in sites]) for code in codes
    ]
    linear_site_sigmas = [
        sd([score_seq(pssm, site) for site in sites]) for pssm in pssms
    ]

    def phat(ep):
        return 1 / (1 + exp(ep - mu))**(Ne - 1)

    def occ(ep):
        return 1 / (1 + exp(ep - mu))

    def chain_geometric_mean(energy):
        """Return the geometric mean of one chain's recorded occupancies."""
        recorded = mh(lambda s: phat(energy(s)),
                      proposal=mutate_site,
                      x0=random_site(L),
                      capture_state=lambda s: occ(energy(s)))[1:]
        # log10 must be undone with a power of 10; pairing it with exp()
        # yields a value that is not the geometric mean.
        return 10 ** mean(map(log10, recorded))

    apw_mean_fits = [
        chain_geometric_mean(lambda s, code=code: score(code, s))
        for code in tqdm(codes)
    ]
    linear_mean_fits = [
        chain_geometric_mean(lambda s, pssm=pssm: score_seq(pssm, s))
        for pssm in tqdm(pssms)
    ]
    plt.scatter(apw_site_sigmas, apw_mean_fits, label='apw')
    plt.scatter(linear_site_sigmas, linear_mean_fits, color='g', label='linear')
    plt.semilogy()
    plt.legend(loc='lower right')
def avg_ic_from_theta(theta, N, L, trials=3):
    """Average ``motif_ic`` over ``trials`` motifs sampled under ``theta``.

    ``theta`` is a ``(sigma, mu, Ne)`` triple.  Each trial uses its own
    matrix from ``sample_matrix(L, sigma)`` and one motif from
    ``sample_motif_cftp(matrix, mu, Ne, N)``.
    """
    sigma, mu, Ne = theta
    # Three separate passes: draw all matrices, then all motifs, then score.
    mats = []
    for _ in xrange(trials):
        mats.append(sample_matrix(L, sigma))
    sampled = []
    for mat in mats:
        sampled.append(sample_motif_cftp(mat, mu, Ne, N))
    ic_values = [motif_ic(m) for m in sampled]
    return mean(ic_values)
def degradation_experiment():
    """Determine whether linear or pairwise models are more resistant to degradation.

    One matrix (``sample_matrix``) and one code (``sample_code``) are drawn.
    A motif is sampled for the matrix with ``sample_motif_cftp`` and for the
    code by taking the last entry of ``sample_pw_motif_mh``.  Each motif's
    log fitness, ``sum(-nu * log(1 + exp(ep - mu)))`` over its sites, is
    evaluated once as a baseline and once for each of 100 independent
    ``mutate_motif`` variants of the original motif.

    Returns:
        ``(li_base_fit, li_mut_fits, pw_base_fit, pw_mut_fits)``: baseline
        and list of mutant log fitnesses for the matrix model, then the same
        for the code model.
    """
    L = 10
    N = 50
    Ne = 5
    nu = Ne - 1
    sigma = 1
    mu = -10
    matrix = sample_matrix(L, sigma)
    code = sample_code(L, sigma)
    li_motif = sample_motif_cftp(matrix, mu, Ne, N)
    pw_motif = sample_pw_motif_mh(code, N, Ne, mu, iterations=100000)[-1]

    def log_fitness(eps):
        # -nu*log(1+exp(.)) equals log(1/(1+exp(.))**nu) but never forms the
        # intermediate power, which can overflow for large ep.
        return sum(-nu * log(1 + exp(ep - mu)) for ep in eps)

    def li_log_fitness(motif):
        return log_fitness([score_seq(matrix, site) for site in motif])

    def pw_log_fitness(motif):
        # Site energies are taken as -log of pw_prob_sites' per-site values.
        return log_fitness([-log(x) for x in pw_prob_sites(motif, code)])

    li_base_fit = li_log_fitness(li_motif)
    li_mut_fits = [li_log_fitness(mutate_motif(li_motif)) for i in range(100)]
    pw_base_fit = pw_log_fitness(pw_motif)
    pw_mut_fits = [pw_log_fitness(mutate_motif(pw_motif)) for i in range(100)]
    # Hand the measurements back; otherwise they vanish with the frame.
    return li_base_fit, li_mut_fits, pw_base_fit, pw_mut_fits
def linear_fit(sigma, mu, Ne):
    """Return the mean recorded ``linear_occ`` over the tail of one ``mh`` chain.

    A PSSM is drawn with ``sample_matrix(L, sigma)`` (``L`` is read from the
    enclosing scope).  The chain's target is
    ``1 / (1 + exp(ep - mu))**(Ne - 1)`` with ``ep = score_seq(pssm, site)``;
    ``capture_state`` is ``linear_occ(pssm, mu, site)``.  The first 25000
    chain entries are discarded before averaging.
    """
    pssm = sample_matrix(L, sigma)

    def target(site):
        energy = score_seq(pssm, site)
        return 1 / (1 + exp(energy - mu))**(Ne - 1)

    def record(site):
        return linear_occ(pssm, mu, site)

    full_chain = mh(target,
                    proposal=mutate_site,
                    x0=random_site(L),
                    capture_state=record)
    return mean(full_chain[25000:])
def f(theta): sigma, mu, Ne = theta matrices = [sample_matrix(L, sigma) for i in xrange(trials)] motifs = [sample_motif_cftp(matrix, mu, Ne, N) for matrix in matrices] ics = map(motif_ic,motifs) ic = mean(ics) print "sigma, mu, Ne:", sigma, mu, Ne print "mean IC:", ic return exp(-beta*(ic - des_ic)**2)
def f(theta): sigma, mu, Ne = theta matrices = [sample_matrix(L, sigma) for i in xrange(trials)] motifs = [sample_motif_cftp(matrix, mu, Ne, N) for matrix in matrices] ics = map(motif_ic, motifs) ic = mean(ics) print "sigma, mu, Ne:", sigma, mu, Ne print "mean IC:", ic return exp(-beta * (ic - des_ic)**2)
def test_log_ZS_gaussian(L, sigma=1):
    """Evaluate ``log_ZS_analytic`` and ``log_ZS_gaussian`` on one random input.

    A matrix is drawn with ``sample_matrix(L, sigma)``, ``mu`` uniformly
    from [-10, 10) and ``Ne`` uniformly from [1, 3).  The comparison against
    ``log_ZS_importance`` is currently disabled.

    Returns:
        ``(ans_analytic, ans_gaussian)`` for the same ``(matrix, mu, Ne)``.
    """
    matrix = sample_matrix(L, sigma)
    mu = random.random() * 20 - 10
    Ne = random.random() * 2 + 1
    params = (matrix, mu, Ne)
    return log_ZS_analytic(params), log_ZS_gaussian(params)
def test_log_ZS_gaussian(L, sigma=1):
    """Return the analytic and gaussian ``log_ZS`` values for a random input.

    The input is ``(matrix, mu, Ne)`` with ``matrix`` from
    ``sample_matrix(L, sigma)``, ``mu`` uniform on [-10, 10) and ``Ne``
    uniform on [1, 3).  The ``log_ZS_importance`` comparison is switched off.

    Returns:
        ``(ans_analytic, ans_gaussian)``.
    """
    matrix = sample_matrix(L, sigma)
    mu = random.random() * 20 - 10
    Ne = random.random() * 2 + 1
    shared_input = (matrix, mu, Ne)
    analytic = log_ZS_analytic(shared_input)
    gaussian = log_ZS_gaussian(shared_input)
    return analytic, gaussian
def resample_from_post_chain(chain, N):
    """Given chain of the form [(mat, mu, Ne)], perform the reduction
    mat -> sigma -> mat' -> motif'.

    Each matrix in the chain is summarised by ``sigma_from_matrix``, a new
    matrix of the same length is drawn from that sigma with
    ``sample_matrix``, and a motif of ``N`` sites is sampled from the new
    matrix using the chain entry's own ``mu`` and ``Ne``.

    Conclusion (from the original author): heavily underestimates IC.

    Args:
        chain: sequence of ``(mat, mu, Ne)`` triples.
        N: forwarded to ``sample_motif_cftp`` as the motif size.

    Returns:
        One motif per chain entry, in chain order; ``[]`` for an empty chain.
    """
    # An empty chain has no first matrix to read L from.
    if len(chain) == 0:
        return []
    L = len(chain[0][0])
    sigmas = [sigma_from_matrix(mat) for (mat, _, _) in chain]
    matrices = [sample_matrix(L, sigma) for sigma in sigmas]
    motifs = [
        sample_motif_cftp(matrix, mu, Ne, N)
        for matrix, (_, mu, Ne) in tqdm(zip(matrices, chain))
    ]
    return motifs
def linear_fit(sigma, mu, Ne):
    """Average the recorded ``linear_occ`` values of one ``mh`` chain's tail.

    ``L`` is read from the enclosing scope.  The PSSM comes from
    ``sample_matrix(L, sigma)``; the chain target is
    ``1 / (1 + exp(score_seq(pssm, site) - mu))**(Ne - 1)`` and the value
    recorded per step is ``linear_occ(pssm, mu, site)``.  Entries before
    index 25000 are dropped.
    """
    pssm = sample_matrix(L, sigma)

    def fitness(site):
        ep = score_seq(pssm, site)
        return 1 / (1 + exp(ep - mu))**(Ne - 1)

    def occupancy(site):
        return linear_occ(pssm, mu, site)

    recorded = mh(fitness,
                  proposal=mutate_site,
                  x0=random_site(L),
                  capture_state=occupancy)
    tail = recorded[25000:]
    return mean(tail)
def experiment1():
    """Compute mean occupancies on a grid of (Ne, sigma) settings.

    ``Nes``, ``sigmas`` and ``n`` are read from the enclosing scope.  For
    each ``Ne`` a row of length-10 matrices is drawn, one per ``sigma``.
    Ten motifs of ``n`` sites are sampled per matrix with
    ``mu = approx_mu(matrix, 10 * n)``, and the ``mean_occupancy`` of those
    ten motifs is averaged.

    Returns:
        ``occs``, a list of rows (one per ``Ne``) of averaged occupancies
        (one per ``sigma``).
    """
    def sample_motifs(matrix, Ne):
        return [
            sample_motif_cftp(matrix, approx_mu(matrix, 10 * n), Ne, n)
            for _ in range(10)
        ]

    matrices = [[sample_matrix(10, sigma) for sigma in sigmas] for _ in Nes]
    # Pair every row with the Ne at the same position in ``Nes``.  Relying on
    # a loop variable left over from building ``matrices`` would apply one
    # single Ne (the last one) to all rows.
    motifses = [
        [sample_motifs(matrix, Ne) for matrix in matrix_row]
        for (Ne, matrix_row) in tqdm(zip(Nes, matrices))
    ]
    occs = [[
        mean(
            mean_occupancy(matrix, m, approx_mu(matrix, 10 * n))
            for m in motifs)
        for (matrix, motifs) in zip(matrix_row, motif_row)
    ] for (matrix_row, motif_row) in zip(matrices, motifses)]
    return occs
def resample_from_post_chain(chain, N):
    """Given chain of the form [(mat, mu, Ne)], perform the reduction
    mat -> sigma -> mat' -> motif'.

    Every chain matrix is reduced to ``sigma_from_matrix(mat)``; a fresh
    matrix of the same length is drawn from that sigma, and a motif of ``N``
    sites is sampled from it with the entry's own ``mu`` and ``Ne``.

    Conclusion (from the original author): heavily underestimates IC.

    Args:
        chain: sequence of ``(mat, mu, Ne)`` triples.
        N: forwarded to ``sample_motif_cftp`` as the motif size.

    Returns:
        One motif per chain entry, in chain order; ``[]`` for an empty chain.
    """
    # Without at least one entry there is no matrix to take L from.
    if len(chain) == 0:
        return []
    L = len(chain[0][0])
    sigmas = [sigma_from_matrix(mat) for (mat, _, _) in chain]
    matrices = [sample_matrix(L, sigma) for sigma in sigmas]
    return [
        sample_motif_cftp(matrix, mu, Ne, N)
        for matrix, (_, mu, Ne) in tqdm(zip(matrices, chain))
    ]
def test_predict_ic(trials=100): pred_ics = [] obs_ics = [] for trial in trange(trials): sigma = random.random() * 5 + 0.1 L = random.randrange(5, 15) matrix = sample_matrix(L, sigma) mu = random.random() * (-20) Ne = random.random() * 5 + 1 pred_ic = predict_ic(matrix, mu, Ne) obs_ic = motif_ic(sample_motif_cftp(matrix, mu, Ne, n=100)) pred_ics.append(pred_ic) obs_ics.append(obs_ic) r, p = scatter(pred_ics, obs_ics) print r, p
def sample_motifs_evo_ic(motif, iterations=1000, verbose=False, theta=None):
    """Sample parameter triples targeting ``motif``'s IC, then one motif each.

    ``N = len(motif)``, ``L = len(motif[0])`` and ``motif_ic(motif)`` are
    passed to ``evo_ic_sample_motif2``.  For each ``(sigma, mu, Ne)`` in the
    chain it returns, a matrix is drawn with ``sample_matrix(L, sigma)`` and
    a motif of ``N`` sites is sampled from it.

    Args:
        motif: sequence of equal-length sites; must be non-empty.
        iterations: forwarded to ``evo_ic_sample_motif2``.
        verbose: forwarded to ``evo_ic_sample_motif2``.
        theta: forwarded to ``evo_ic_sample_motif2``.

    Returns:
        ``(chain, motifs)`` with ``motifs[i]`` sampled from ``chain[i]``.
    """
    N = len(motif)
    L = len(motif[0])
    des_ic = motif_ic(motif)
    # Honour the caller's ``verbose`` instead of always passing False.
    chain = evo_ic_sample_motif2(N,
                                 L,
                                 des_ic,
                                 iterations=iterations,
                                 verbose=verbose,
                                 theta=theta)
    motifs = [
        sample_motif_cftp(sample_matrix(L, sigma), mu, Ne, N)
        for (sigma, mu, Ne) in tqdm(chain)
    ]
    return chain, motifs
def predict_ic_from_theta(theta, L):
    """Build importance-sampling terms for a matrix drawn under ``theta``.

    ``theta`` is unpacked as ``(sigma, mu, Ne)``.  A matrix is drawn with
    ``sample_matrix(L, sigma)`` and ``bisect_interval`` searches [-20, 20]
    for the ``lamb`` at which the expected score under
    ``psfm_from_matrix(matrix, lamb)`` equals
    ``des_ep = max(mu - log(Ne - 1), ep_min + 1)``, where ``ep_min`` is the
    sum of the per-column minima.  100 sites are drawn from that PSFM and,
    per site, ``log_ps = -nu * log(1 + exp(score - mu))`` and ``log_qs``
    (the site's score under the element-wise log of the PSFM) are computed.

    NOTE(review): the function stops after computing ``log_ps`` and
    ``log_qs`` and returns None; how they are meant to be combined into an
    IC prediction is not visible here.
    """
    sigma, mu, Ne = theta
    nu = Ne - 1
    ep_star = mu - log(Ne - 1)
    matrix = sample_matrix(L, sigma)
    ep_min = sum(map(min, matrix))
    des_ep = max(ep_star, ep_min + 1)

    def f(lamb):
        # Expected score under the PSFM at this lamb, minus the target.
        trial_psfm = psfm_from_matrix(matrix, lamb)
        return sum(
            sum(ep * p for ep, p in zip(eps, ps))
            for eps, ps in zip(matrix, trial_psfm)) - des_ep

    # Solve for lamb first, then build the PSFM it defines: the PSFM made
    # inside f() is local to f and not available out here.
    lamb = bisect_interval(f, -20, 20)
    psfm = psfm_from_matrix(matrix, lamb)
    log_psfm = [[log(p) for p in ps] for ps in psfm]
    sites = [sample_from_psfm(psfm) for i in range(100)]
    log_ps = [-nu * log(1 + exp(score_seq(matrix, site) - mu)) for site in sites]
    log_qs = [score_seq(log_psfm, site) for site in sites]
def L_sigma_plot(mu=-10):
    """Show sampled and closed-form occupancy as images over (sigma, L).

    ``L`` runs over ``range(1, 30)`` and ``sigma`` over
    ``np.linspace(0, 20, 100)``.  The left image is the mean, over 10 freshly
    drawn matrices, of ``1 / (1 + exp(ep - mu))`` where ``ep`` is the score
    of the first site of ``ringer_motif(matrix, 1)``.  The right image is
    ``1 / (1 + exp(-L * sigma - mu))``; a colorbar is added after it.
    """
    def occupancy(matrix):
        best_site = ringer_motif(matrix, 1)[0]
        return 1 / (1 + exp(score_seq(matrix, best_site) - mu))

    def predicted(L, sigma):
        return 1 / (1 + exp(-L * sigma - mu))

    Ls = range(1, 30)
    sigmas = np.linspace(0, 20, 100)

    # Rows index sigma, columns index L.
    occ_matrix = []
    for sigma in tqdm(sigmas):
        occ_row = []
        for L in Ls:
            occ_row.append(
                mean(occupancy(sample_matrix(L, sigma)) for i in range(10)))
        occ_matrix.append(occ_row)

    pred_matrix = []
    for sigma in sigmas:
        pred_matrix.append([predicted(L, sigma) for L in Ls])

    plt.subplot(1, 2, 1)
    plt.imshow(occ_matrix, interpolation='none', aspect='auto')
    plt.subplot(1, 2, 2)
    plt.imshow(pred_matrix, interpolation='none', aspect='auto')
    plt.colorbar()
def main_experiment(samples=30, iterations=10000, delta_ic=0.1): results_dict = {} for tf_idx, tf in enumerate(tfdf.tfs): print "starting on:", tf motif = getattr(tfdf, tf) if motif_ic(motif) < 5: print "excluding", tf, "for low IC" continue bio_ic = motif_ic(motif) n = len(motif) L = len(motif[0]) matrix = matrix_from_motif(motif) sigma = sigma_from_matrix(matrix) mu = approximate_mu(matrix, n, G) Ne = estimate_Ne(matrix, mu, n, bio_ic) spoofs = [] ar = 0 spoof_trials = 0.0 while len(spoofs) < samples: spoof_trials += 1 matrix, chain = sella_hirsch_mh(Ne=Ne, mu=mu, n=1, matrix=sample_matrix(L, sigma), init='ringer', iterations=iterations) spoof_motif = concat( [random.choice(chain[iterations / 2:]) for i in range(n)]) if abs(motif_ic(spoof_motif) - bio_ic) < delta_ic: spoofs.append(spoof_motif) ar += 1 print "spoof acceptance rate:", ar / spoof_trials, len( spoofs), samples, spoof_trials #spoofs = [chain[-1] for (spoof_matrix,chain,Ne) in [spoof_motif(motif,Ne) for i in range(samples)]] results_dict[tf] = { fname: map(eval(fname), spoofs) for fname in "motif_ic motif_gini total_motif_mi".split() } print "finished:", tf, "(%s/%s)" % (tf_idx, len(tfdf.tfs)) print bio_ic, mean_ci(results_dict[tf]['motif_ic']) return results_dict
def predict_ic_from_theta(theta, L):
    """Build importance-sampling terms for a matrix drawn under ``theta``.

    ``theta`` is a ``(sigma, mu, Ne)`` triple.  After drawing a matrix with
    ``sample_matrix(L, sigma)``, ``bisect_interval`` is used on [-20, 20] to
    find the ``lamb`` for which the expected score under
    ``psfm_from_matrix(matrix, lamb)`` matches
    ``des_ep = max(mu - log(Ne - 1), ep_min + 1)`` (``ep_min`` being the sum
    of per-column minima).  100 sites are sampled from the resulting PSFM;
    ``log_ps`` holds ``-nu * log(1 + exp(score - mu))`` per site and
    ``log_qs`` the site's score under the element-wise log of the PSFM.

    NOTE(review): nothing is returned after ``log_ps`` / ``log_qs`` are
    computed; the final combination step is not visible here.
    """
    sigma, mu, Ne = theta
    nu = Ne - 1
    ep_star = mu - log(Ne - 1)
    matrix = sample_matrix(L, sigma)
    ep_min = sum(map(min, matrix))
    des_ep = max(ep_star, ep_min + 1)

    def f(lamb):
        # Root of this function is the lamb giving expected score des_ep.
        candidate = psfm_from_matrix(matrix, lamb)
        return sum(
            sum(ep * p for ep, p in zip(eps, ps))
            for eps, ps in zip(matrix, candidate)) - des_ep

    # The PSFM must be rebuilt from the solved lamb: the one created inside
    # f() never leaves f's scope, so it cannot be used directly below.
    lamb = bisect_interval(f, -20, 20)
    psfm = psfm_from_matrix(matrix, lamb)
    log_psfm = [[log(p) for p in ps] for ps in psfm]
    sites = [sample_from_psfm(psfm) for i in range(100)]
    log_ps = [
        -nu * log(1 + exp(score_seq(matrix, site) - mu)) for site in sites
    ]
    log_qs = [score_seq(log_psfm, site) for site in sites]
def sella_hirsch_mh_penalize_mu(Ne=5, n=16, L=16, G=5 * 10**6, sigma=1, alpha=0.01, init="random", matrix=None, x0=None, iterations=50000, p=None): print "p:", p if matrix is None: matrix = sample_matrix(L, sigma) if x0 is None: if init == "random": x0 = (random_motif(L, n), random.gauss(0, 1)) elif init == "ringer": x0 = (ringer_motif(matrix, n), random.gauss(0, 1)) elif init == "anti_ringer": x0 = (anti_ringer_motif(matrix, n), random.gauss(0, 1)) else: x0 = init if p is None: p = 1.0 / (n * L) nu = Ne - 1 def log_f((motif, mu)): return nu * log_fitness_penalize_mu(matrix, motif, mu, alpha) def prop((motif, mu)): motif_p = mutate_motif_p(motif, p) # probability of mutation per basepair mu_p = mu + random.gauss(0, 0.1) return motif_p, mu_p chain = mh(log_f, prop, x0, use_log=True, iterations=iterations) return matrix, chain
def sella_hirsch_mh(Ne=5, n=16, L=16, sigma=1, mu=0, init="random", matrix=None, x0=None, iterations=50000, p=None): print "p:", p if matrix is None: matrix = sample_matrix(L, sigma) else: L = len(matrix) if x0 is None: if init == "random": x0 = random_motif(L, n) elif init == "ringer": x0 = ringer_motif(matrix, n) elif init == "anti_ringer": x0 = anti_ringer_motif(matrix, n) else: x0 = init if p is None: p = 1.0 / (n * L) nu = Ne - 1 def log_f(motif): return nu * log_fitness(matrix, motif, mu) def prop(motif): motif_p = mutate_motif_p(motif, p) # probability of mutation per basepair return motif_p chain = mh(log_f, prop, x0, use_log=True, iterations=iterations) return matrix, chain
def spoof_motif(motif, Ne=None, iterations=10000): matrix = matrix_from_motif(motif) L = len(motif[0]) n = len(motif) sigma = sigma_from_matrix(matrix) spoof_matrix = sample_matrix(L, sigma) bio_ic = motif_ic(motif) # this method of reading site_mu, site_sigma off of motif is slightly suspect... site_mu = site_mu_from_matrix(matrix_from_motif(motif)) site_sigma = site_sigma_from_matrix(matrix_from_motif(motif)) # now need to find mu, nu n = len(motif) assumed_copies = 10 * n mu = approximate_mu(matrix, assumed_copies, G) spoof_mu = approximate_mu(spoof_matrix, assumed_copies, G) if Ne is None: Ne = estimate_Ne(spoof_matrix, spoof_mu, n, bio_ic) print "chose Ne:", Ne spoof_matrix, chain = sella_hirsch_mh(Ne=Ne, matrix=spoof_matrix, mu=mu, n=n) return spoof_matrix, chain, Ne
def experiment2_():
    """Compare ``mh`` fitness traces of a ``sample_code`` model and a PSSM.

    A code is drawn with ``sample_code(L, sigma)`` and scored on 10000
    random sites; the ``sd`` of those scores fixes the per-column sigma of
    a PSSM drawn with ``sample_matrix``.  Each model gets one ``mh`` chain
    whose target is ``1 / (1 + exp(ep - mu))**(Ne - 1)``, and the target is
    re-evaluated on every chain entry.

    Returns:
        ``(apw_fits, linear_fits)``: target values along the code-model
        chain and along the PSSM chain.
    """
    L = 10
    sigma = 1
    mu = -10
    Ne = 2
    # ``sigma`` is used here instead of a repeated literal 1.
    code = sample_code(L, sigma)
    sites = [random_site(L) for i in xrange(10000)]
    site_sigma = sd([score(code, site) for site in sites])
    # Presumably so that L columns of this sigma give the same score spread
    # as the code model -- TODO confirm against sample_matrix.
    pssm = sample_matrix(L, sqrt(site_sigma**2 / L))

    def phat(ep):
        # One definition of the target; the models differ only in ``ep``.
        return 1 / (1 + exp(ep - mu))**(Ne - 1)

    def apw_phat(site):
        return phat(score(code, site))

    def linear_phat(site):
        return phat(score_seq(pssm, site))

    apw_chain = mh(apw_phat, proposal=mutate_site, x0=random_site(L))
    linear_chain = mh(linear_phat, proposal=mutate_site, x0=random_site(L))
    apw_fits = map(apw_phat, apw_chain)
    linear_fits = map(linear_phat, linear_chain)
    return apw_fits, linear_fits
def eps_from_theta(theta, L, N=100):
    """Return the site scores of one motif sampled under ``theta``.

    Args:
        theta: ``(sigma, mu, Ne)`` triple.
        L: passed to ``sample_matrix`` together with ``sigma``.
        N: passed to ``sample_motif_cftp`` as the motif size.

    Returns:
        List of ``score_seq(matrix, site)`` for every site in the motif.
    """
    # Unpack the parameters; without this sigma, mu and Ne are unbound here.
    sigma, mu, Ne = theta
    matrix = sample_matrix(L, sigma)
    motif = sample_motif_cftp(matrix, mu, Ne, N)
    eps = [score_seq(matrix, site) for site in motif]
    return eps
def predict_ic_from_theta(theta, L, num_matrices=3):
    """Average ``predict_ic`` over ``num_matrices`` freshly drawn matrices.

    ``theta`` is unpacked as ``(sigma, mu, Ne)``; every prediction uses its
    own ``sample_matrix(L, sigma)`` and ``N=100``.
    """
    sigma, mu, Ne = theta

    def one_prediction():
        fresh_matrix = sample_matrix(L, sigma)
        return predict_ic(fresh_matrix, mu, Ne, N=100)

    return mean(one_prediction() for _ in range(num_matrices))
def sample_pair(sigma, Ne):
    """Draw a matrix and a motif sampled from it.

    ``L`` and ``n`` are read from the enclosing scope.  ``mu`` is
    ``approx_mu(matrix, 10 * n)``.

    Returns:
        ``(matrix, motif)``.
    """
    pssm = sample_matrix(L, sigma)
    chem_pot = approx_mu(pssm, 10 * n)
    sampled = sample_motif_cftp(pssm, chem_pot, Ne, n)
    return pssm, sampled
def sample_mean_occ(sigma, Ne):
    """Return ``mean_occupancy`` for one freshly sampled matrix/motif pair.

    ``L`` and ``n`` are read from the enclosing scope.  ``mu`` is
    ``approx_mu(matrix, 10 * n)`` and is used both for sampling the motif
    and for the occupancy calculation.
    """
    pssm = sample_matrix(L, sigma)
    chem_pot = approx_mu(pssm, 10 * n)
    sampled = sample_motif_cftp(pssm, chem_pot, Ne, n)
    return mean_occupancy(pssm, sampled, chem_pot)
def observe_ic_from_theta(theta, L, num_matrices=3):
    """Average the ``motif_ic`` of motifs sampled under ``theta``.

    ``theta`` is unpacked as ``(sigma, mu, Ne)``.  Each of the
    ``num_matrices`` observations draws its own ``sample_matrix(L, sigma)``
    and samples a motif from it with ``n=100``.
    """
    sigma, mu, Ne = theta

    def one_observation():
        fresh_matrix = sample_matrix(L, sigma)
        sampled = sample_motif_cftp(fresh_matrix, mu, Ne, n=100)
        return motif_ic(sampled)

    return mean(one_observation() for _ in range(num_matrices))
def observe_ic_from_theta(theta, L, num_matrices=3):
    """Return the mean observed ``motif_ic`` under ``theta``.

    ``theta`` is a ``(sigma, mu, Ne)`` triple.  ``num_matrices`` motifs are
    scored, each sampled with ``n=100`` from a separately drawn
    ``sample_matrix(L, sigma)``.
    """
    sigma, mu, Ne = theta

    def observed_ic():
        pssm = sample_matrix(L, sigma)
        return motif_ic(sample_motif_cftp(pssm, mu, Ne, n=100))

    return mean(observed_ic() for _ in range(num_matrices))
def predict_ic_from_theta(theta, L, num_matrices=3):
    """Return the mean ``predict_ic`` value under ``theta``.

    ``theta`` is a ``(sigma, mu, Ne)`` triple.  ``predict_ic`` is called
    ``num_matrices`` times with ``N=100``, each time on a separately drawn
    ``sample_matrix(L, sigma)``.
    """
    sigma, mu, Ne = theta

    def predicted_ic():
        pssm = sample_matrix(L, sigma)
        return predict_ic(pssm, mu, Ne, N=100)

    return mean(predicted_ic() for _ in range(num_matrices))
def random_genotype(n, L, linear_sigma, pairwise_sigma, copies):
    """Build a random ``(motif, copies, (pwm, pairwise_weights))`` genotype.

    ``motif`` comes from ``random_motif(L, n)`` and ``pwm`` from
    ``sample_matrix(L, linear_sigma)``.  ``pairwise_weights`` is a list of
    ``L - 1`` tables, each 4x4, filled with
    ``random.gauss(0, pairwise_sigma)`` draws.  ``copies`` is passed through
    unchanged.
    """
    motif = random_motif(L, n)
    pwm = sample_matrix(L, linear_sigma)
    pairwise_weights = []
    for _ in range(L - 1):
        table = []
        for _ in range(4):
            table.append([random.gauss(0, pairwise_sigma) for _ in range(4)])
        pairwise_weights.append(table)
    return motif, copies, (pwm, pairwise_weights)