def avg_ic_from_theta(theta, N, L, trials=3):
    """Return the mean motif IC over `trials` independently sampled matrices.

    theta -- triple unpacked as (sigma, mu, Ne)
    N     -- number of sites passed to sample_motif_cftp for each motif
    L     -- width passed to sample_matrix
    trials -- how many matrices (and therefore motifs) are sampled
    """
    sigma, mu, Ne = theta
    # All matrices are drawn before any motif is sampled, so the sequence of
    # sampler calls matches the original two-pass structure.
    sampled_matrices = [sample_matrix(L, sigma) for _ in xrange(trials)]
    sampled_motifs = []
    for mat in sampled_matrices:
        sampled_motifs.append(sample_motif_cftp(mat, mu, Ne, N))
    return mean([motif_ic(m) for m in sampled_motifs])
def sample_motifs_evo_ic(motif, iterations=1000, verbose=False, theta=None):
    """Run evo_ic_sample_motif2 against the IC of `motif` and sample motifs.

    The dimensions (N sites of width L) and the target IC are taken from
    `motif`.  Each element of the resulting chain is unpacked as
    (sigma, mu, Ne); for each one a fresh matrix is drawn with
    sample_matrix(L, sigma) and a motif of N sites is drawn from it.

    motif      -- reference motif; must be non-empty (motif[0] is read)
    iterations -- forwarded to evo_ic_sample_motif2
    verbose    -- forwarded to evo_ic_sample_motif2 (previously the call
                  hard-coded verbose=False, so this argument had no effect)
    theta      -- forwarded to evo_ic_sample_motif2

    Returns (chain, motifs), with one motif per chain element.
    """
    N = len(motif)
    L = len(motif[0])
    des_ic = motif_ic(motif)
    chain = evo_ic_sample_motif2(N, L, des_ic, iterations=iterations,
                                 verbose=verbose, theta=theta)
    motifs = [sample_motif_cftp(sample_matrix(L, sigma), mu, Ne, N)
              for (sigma, mu, Ne) in tqdm(chain)]
    return chain, motifs
def degradation_experiment():
    """Determine whether linear or pairwise models are more resistant to
    degradation.

    One linear motif (from a sampled matrix) and one pairwise motif (last
    element of the sample_pw_motif_mh output) are generated with the fixed
    parameters below.  The log fitness of each is computed, together with the
    log fitness of 100 independently mutated copies (mutate_motif is applied
    to the unmutated motif each time).

    Returns (li_base_fit, li_mut_fits, pw_base_fit, pw_mut_fits).  These
    values were previously computed and then discarded.
    """
    L = 10
    N = 50
    Ne = 5
    nu = Ne - 1
    sigma = 1
    mu = -10
    matrix = sample_matrix(L, sigma)
    code = sample_code(L, sigma)
    li_motif = sample_motif_cftp(matrix, mu, Ne, N)
    pw_motif = sample_pw_motif_mh(code, N, Ne, mu, iterations=100000)[-1]

    def log_fitness_from_eps(eps):
        # -nu * log(1 + exp(ep - mu)) is algebraically the same as
        # log(1 / (1 + exp(ep - mu))**nu) but avoids raising the intermediate
        # to the nu-th power, which overflows (or underflows to log(0)) far
        # earlier for large ep - mu.
        return sum(-nu * log(1 + exp(ep - mu)) for ep in eps)

    def li_log_fitness(motif):
        # Site energies come from scoring each site against the matrix.
        return log_fitness_from_eps(score_seq(matrix, site) for site in motif)

    def pw_log_fitness(motif):
        # Site energies are the negative logs of the values returned by
        # pw_prob_sites for this motif under `code`.
        return log_fitness_from_eps(
            -log(p) for p in pw_prob_sites(motif, code))

    li_base_fit = li_log_fitness(li_motif)
    li_mut_fits = [li_log_fitness(mutate_motif(li_motif)) for _ in range(100)]
    pw_base_fit = pw_log_fitness(pw_motif)
    pw_mut_fits = [pw_log_fitness(mutate_motif(pw_motif)) for _ in range(100)]
    return li_base_fit, li_mut_fits, pw_base_fit, pw_mut_fits
def avg_ic_from_theta(theta, N, L, trials=3):
    """Average motif_ic over motifs sampled from `trials` random matrices.

    theta is unpacked as (sigma, mu, Ne).  Each matrix comes from
    sample_matrix(L, sigma); each motif has N sites and comes from
    sample_motif_cftp(matrix, mu, Ne, N).
    """
    sigma, mu, Ne = theta
    # First pass: draw all matrices.  Second pass: one motif per matrix.
    drawn = [sample_matrix(L, sigma) for _ in xrange(trials)]
    motif_samples = [sample_motif_cftp(mat, mu, Ne, N) for mat in drawn]
    ic_values = [motif_ic(sample) for sample in motif_samples]
    return mean(ic_values)
def f(theta): sigma, mu, Ne = theta matrices = [sample_matrix(L, sigma) for i in xrange(trials)] motifs = [sample_motif_cftp(matrix, mu, Ne, N) for matrix in matrices] ics = map(motif_ic,motifs) ic = mean(ics) print "sigma, mu, Ne:", sigma, mu, Ne print "mean IC:", ic return exp(-beta*(ic - des_ic)**2)
def sample_mis(L, sigma, copy_factor, Ne, N, trials=100):
    """Return a list of motif_mi values, one per independently sampled motif.

    For every trial a matrix is drawn with sample_matrix(L, sigma), mu is
    obtained from approx_mu(matrix, copy_factor * N), and a motif of N sites
    is sampled and scored with motif_mi.
    """
    def one_trial():
        # One full draw: matrix -> mu -> motif -> MI.
        mat = sample_matrix(L, sigma)
        chem_pot = approx_mu(mat, copy_factor * N)
        return motif_mi(sample_motif_cftp(mat, chem_pot, Ne, N))

    return [one_trial() for _ in trange(trials)]
def f(theta): sigma, mu, Ne = theta matrices = [sample_matrix(L, sigma) for i in xrange(trials)] motifs = [sample_motif_cftp(matrix, mu, Ne, N) for matrix in matrices] ics = map(motif_ic, motifs) ic = mean(ics) print "sigma, mu, Ne:", sigma, mu, Ne print "mean IC:", ic return exp(-beta * (ic - des_ic)**2)
def resample_from_post_chain(chain, N):
    """Resample one motif per chain element via mat -> sigma -> mat' -> motif'.

    chain is a sequence of (mat, mu, Ne) triples.  Each matrix is reduced to
    a sigma with sigma_from_matrix, a new matrix of the same width is drawn
    from that sigma, and a motif of N sites is sampled from the new matrix
    using the element's own mu and Ne.

    Author's conclusion: heavily underestimates IC.
    """
    width = len(chain[0][0])
    sigmas = [sigma_from_matrix(mat) for (mat, _mu, _Ne) in chain]
    redrawn = [sample_matrix(width, s) for s in sigmas]
    motifs = []
    for new_mat, (_, mu, Ne) in tqdm(zip(redrawn, chain)):
        motifs.append(sample_motif_cftp(new_mat, mu, Ne, N))
    return motifs
def resample_from_post_chain(chain, N):
    """Given a chain of (mat, mu, Ne), perform the reduction
    mat -> sigma -> mat' -> motif' and return the list of motifs.

    The width of the redrawn matrices is that of the first chain matrix.
    Author's conclusion: heavily underestimates IC.
    """
    L = len(chain[0][0])
    # Stage 1: summarise every matrix by its sigma.
    sigma_per_entry = [sigma_from_matrix(mat) for (mat, _mu, _Ne) in chain]
    # Stage 2: draw a replacement matrix for every sigma.
    replacement_matrices = [sample_matrix(L, sg) for sg in sigma_per_entry]
    # Stage 3: sample a motif from each replacement with the entry's mu, Ne.
    paired = zip(replacement_matrices, chain)
    return [sample_motif_cftp(mat, mu, Ne, N)
            for mat, (_, mu, Ne) in tqdm(paired)]
def test_predict_ic(trials=100): pred_ics = [] obs_ics = [] for trial in trange(trials): sigma = random.random() * 5 + 0.1 L = random.randrange(5, 15) matrix = sample_matrix(L, sigma) mu = random.random() * (-20) Ne = random.random() * 5 + 1 pred_ic = predict_ic(matrix, mu, Ne) obs_ic = motif_ic(sample_motif_cftp(matrix, mu, Ne, n=100)) pred_ics.append(pred_ic) obs_ics.append(obs_ic) r, p = scatter(pred_ics, obs_ics) print r, p
def sample_motifs_evo_ic(motif, iterations=1000, verbose=False, theta=None):
    """Run evo_ic_sample_motif2 against the IC of `motif` and sample motifs.

    N (number of sites), L (site width) and the target IC are derived from
    `motif`.  Every chain element is unpacked as (sigma, mu, Ne); a fresh
    matrix is drawn from sigma and a motif of N sites is sampled from it.

    motif      -- reference motif; must be non-empty (motif[0] is read)
    iterations -- forwarded to evo_ic_sample_motif2
    verbose    -- forwarded to evo_ic_sample_motif2 (the call used to pass a
                  hard-coded verbose=False, silently ignoring this argument)
    theta      -- forwarded to evo_ic_sample_motif2

    Returns (chain, motifs), with one motif per chain element.
    """
    N = len(motif)
    L = len(motif[0])
    des_ic = motif_ic(motif)
    chain = evo_ic_sample_motif2(
        N, L, des_ic, iterations=iterations, verbose=verbose, theta=theta)
    motifs = [
        sample_motif_cftp(sample_matrix(L, sigma), mu, Ne, N)
        for (sigma, mu, Ne) in tqdm(chain)
    ]
    return chain, motifs
def posterior_chain(motif, iterations=50000, theta0=None, sigma=1, num_spoof_sites='N', verbose=False): """do MH with doubly intractable MCMC one-point estimator""" L = len(motif[0]) N = len(motif) if num_spoof_sites == 'N': num_spoof_sites = N # should this be N or 1? if theta0 is None: matrix0 = [[0, 0, 0, 0] for i in range(L)] mu0 = -10 Ne0 = 3 theta = (matrix0, mu0, Ne0) else: theta = theta0 log_f_theta = log_fhat(theta, motif) chain = [] acceptances = 0 for it in trange(iterations): theta_p = prop2(theta, sigma) log_f_theta_p = log_fhat(theta_p, motif) matrix_p, mu_p, Ne_p = theta_p xp = sample_motif_cftp(matrix_p, mu_p, Ne_p, num_spoof_sites) log_Z = log_fhat(theta, xp) log_Z_p = log_fhat(theta_p, xp) log_ar = log_f_theta_p - log_f_theta + N / num_spoof_sites * (log_Z - log_Z_p) if log(random.random()) < log_ar: theta = theta_p log_f_theta = log_f_theta_p log_Z = log_Z_p acceptances += 1 chain.append(theta) if verbose: print "log(f), log_Z:", log_f_theta, log_Z print "mean_ep:", mean(score_seq(theta[0], site) for site in motif) print "mean_occ:", mean(occs(theta, motif)) print "mu, Ne:", theta[1], theta[2] print "acceptances:", acceptances / float(it + 1) return chain
def sanity_check(trials=1000):
    """Plot analytic vs. sampled distributions of the per-site 'A' count.

    Uses a fixed 10-column matrix whose rows are [-2, 0, 0, 0], mu = -10 and
    Ne = 2.  The analytic curve is the normalised exp of a per-k log weight;
    the MH and CFTP curves are the empirical frequencies of site.count('A')
    over `trials` sites from sample_motif_mh and sample_motif_cftp.  The
    three curves are drawn with plt.plot and the axes are labelled.
    """
    L = 10
    matrix = [[-2, 0, 0, 0] for _ in range(L)]
    mu = -10
    Ne = 2
    nu = Ne - 1

    def log_match_phat(k):
        # Terms are added left to right in the same order as the original
        # expression so the floating-point result is unchanged.
        fitness_term = -nu * log(1 + exp(-2 * k - mu))
        return (fitness_term + log_choose(L, k) + k * log(1 / 4.0)
                + (L - k) * log(3 / 4.0))

    def match_frequencies(sites):
        # Empirical frequency of each possible 'A' count 0..L.
        tally = Counter(site.count('A') for site in sites)
        return [tally[k] / float(trials) for k in range(L + 1)]

    log_match_phats = [log_match_phat(k) for k in range(L + 1)]
    match_ps = normalize([exp(lp) for lp in log_match_phats])
    mh_match_ps = match_frequencies(sample_motif_mh(matrix, mu, Ne, trials))
    cftp_match_ps = match_frequencies(
        sample_motif_cftp(matrix, mu, Ne, trials))
    plt.plot(match_ps, label="Analytic")
    plt.plot(mh_match_ps, label="MH")
    plt.plot(cftp_match_ps, label="CFTP")
    plt.xlabel("Matches")
    plt.ylabel("Frequency")
def posterior_chain(motif, iterations=50000, theta0=None, sigma=1, num_spoof_sites='N', verbose=False): """do MH with doubly intractable MCMC one-point estimator""" L = len(motif[0]) N = len(motif) if num_spoof_sites == 'N': num_spoof_sites = N # should this be N or 1? if theta0 is None: matrix0 = [[0,0,0,0] for i in range(L)] mu0 = -10 Ne0 = 3 theta = (matrix0, mu0, Ne0) else: theta = theta0 log_f_theta = log_fhat(theta, motif) chain = [] acceptances = 0 for it in trange(iterations): theta_p = prop2(theta, sigma) log_f_theta_p = log_fhat(theta_p, motif) matrix_p, mu_p, Ne_p = theta_p xp = sample_motif_cftp(matrix_p, mu_p, Ne_p, num_spoof_sites) log_Z = log_fhat(theta, xp) log_Z_p = log_fhat(theta_p, xp) log_ar = log_f_theta_p - log_f_theta + N/num_spoof_sites * (log_Z - log_Z_p) if log(random.random()) < log_ar: theta = theta_p log_f_theta = log_f_theta_p log_Z = log_Z_p acceptances += 1 chain.append(theta) if verbose: print "log(f), log_Z:", log_f_theta, log_Z print "mean_ep:", mean(score_seq(theta[0],site) for site in motif) print "mean_occ:", mean(occs(theta, motif)) print "mu, Ne:", theta[1], theta[2] print "acceptances:", acceptances/float(it+1) return chain
def sanity_check(trials=1000):
    """Overlay the analytic 'A'-count distribution with MH and CFTP samples.

    Fixed setup: L = 10, every matrix row is [-2, 0, 0, 0], mu = -10, Ne = 2.
    The analytic probabilities are normalize() applied to exp of the per-k
    log weights; the sampled curves are frequencies of site.count('A') among
    `trials` sites from sample_motif_mh and from sample_motif_cftp.  Only
    plotting calls are made; nothing is returned.
    """
    L = 10
    mu = -10
    Ne = 2
    nu = Ne - 1
    matrix = [[-2, 0, 0, 0] for _ in range(L)]
    ks = range(L + 1)

    # Analytic curve.  The summation order of the four terms is kept as in
    # the original expression to leave rounding untouched.
    log_match_phats = []
    for k in ks:
        log_match_phats.append(
            -nu * log(1 + exp(-2 * k - mu)) + log_choose(L, k)
            + k * log(1 / 4.0) + (L - k) * log(3 / 4.0))
    match_ps = normalize([exp(v) for v in log_match_phats])

    # Metropolis-Hastings sample.
    mh_motif = sample_motif_mh(matrix, mu, Ne, trials)
    mh_counts = Counter(site.count('A') for site in mh_motif)
    mh_match_ps = [mh_counts[k] / float(trials) for k in ks]

    # CFTP sample.
    cftp_motif = sample_motif_cftp(matrix, mu, Ne, trials)
    cftp_counts = Counter(site.count('A') for site in cftp_motif)
    cftp_match_ps = [cftp_counts[k] / float(trials) for k in ks]

    for series, name in ((match_ps, "Analytic"),
                         (mh_match_ps, "MH"),
                         (cftp_match_ps, "CFTP")):
        plt.plot(series, label=name)
    plt.xlabel("Matches")
    plt.ylabel("Frequency")
def sample_motif(theta):
    """Sample a motif from a (sigma, cf, Ne) triple.

    A matrix is drawn with sample_matrix(L, sigma), mu is obtained from
    approx_mu(matrix, cf * N), and a motif of N sites is sampled with
    sample_motif_cftp.  L and N are read from the enclosing scope.

    theta -- a 3-element sequence (sigma, cf, Ne).  The original signature
             used a tuple parameter, which is Python-2-only syntax; callers
             still pass the same single positional argument.
    """
    sigma, cf, Ne = theta
    matrix = sample_matrix(L, sigma)
    mu = approx_mu(matrix, cf * N)
    return sample_motif_cftp(matrix, mu, Ne, N)
def f(theta):
    """Return exp(-beta * (IC - des_ic)**2) for one motif sampled at theta.

    theta is unpacked as (matrix, mu, Ne).  N, beta and des_ic are read from
    the enclosing scope.
    """
    matrix, mu, Ne = theta
    sampled_ic = motif_ic(sample_motif_cftp(matrix, mu, Ne, N))
    deviation = sampled_ic - des_ic
    return exp(-beta * deviation**2)
def eps_from_theta(theta, L, N=100):
    """Return the score_seq value of every site in a motif sampled at theta.

    theta -- triple unpacked as (sigma, mu, Ne).  The original body used
             these three names without ever unpacking theta, so it either
             raised NameError or silently picked up unrelated globals.
    L     -- width passed to sample_matrix
    N     -- number of sites to sample

    Returns a list with one score per sampled site, computed against the
    same matrix the motif was sampled from.
    """
    sigma, mu, Ne = theta
    matrix = sample_matrix(L, sigma)
    motif = sample_motif_cftp(matrix, mu, Ne, N)
    eps = [score_seq(matrix, site) for site in motif]
    return eps
theta = theta_p log_f_theta = log_f_theta_p log_Z = log_Z_p acceptances += 1 chain.append(theta) if verbose: print "log(f), log_Z:", log_f_theta, log_Z print "mean_ep:", mean(score_seq(theta[0], site) for site in motif) print "mean_occ:", mean(occs(theta, motif)) print "mu, Ne:", theta[1], theta[2] print "acceptances:", acceptances / float(it + 1) return chain def motif_from_theta((matrix, mu, Ne), N): return sample_motif_cftp(matrix, mu, Ne, N) def logmod(x): return sign(x) * log(abs(x) + 1) def interpret_chain(chain, motif, filename=None): N = len(motif) log_fhats = [log_fhat(theta, motif) for theta in chain] log_Zs = [log_ZM_hack(theta, N) for theta in chain] log_ps = [lf - log_Z for (lf, log_Z) in zip(log_fhats, log_Zs)] plt.plot( map(logmod, [mean(score_seq(x[0], site) for site in motif) for x in chain]), label="Mean Site Energy (kBT)")
def observe_ic_from_theta(theta, L, num_matrices=3):
    """Return the mean motif_ic over `num_matrices` sampled matrices.

    theta is unpacked as (sigma, mu, Ne).  For each matrix drawn with
    sample_matrix(L, sigma), a 100-site motif is sampled and its IC taken.
    """
    sigma, mu, Ne = theta

    def observed_ics():
        # Lazily yields one IC per freshly drawn matrix; mean() consumes it.
        for _ in range(num_matrices):
            matrix = sample_matrix(L, sigma)
            yield motif_ic(sample_motif_cftp(matrix, mu, Ne, n=100))

    return mean(observed_ics())
def f(theta):
    """Score theta by the squared gap between a sampled motif's IC and des_ic.

    theta is unpacked as (matrix, mu, Ne); N, beta and des_ic come from the
    enclosing scope.  Returns exp(-beta * gap**2).
    """
    matrix, mu, Ne = theta
    motif = sample_motif_cftp(matrix, mu, Ne, N)
    gap = motif_ic(motif) - des_ic
    return exp(-beta * gap**2)
def observe_ic_from_theta(theta, L, num_matrices=3):
    """Average the IC of 100-site motifs sampled from random matrices.

    theta is unpacked as (sigma, mu, Ne).  `num_matrices` matrices of width
    L are drawn, one motif is sampled from each, and the mean motif_ic is
    returned.
    """
    sigma, mu, Ne = theta

    def one_ic():
        # Draw a matrix, sample a motif from it, and measure its IC.
        fresh_matrix = sample_matrix(L, sigma)
        return motif_ic(sample_motif_cftp(fresh_matrix, mu, Ne, n=100))

    # A generator is handed to mean(), exactly as in the original call.
    return mean(one_ic() for _ in range(num_matrices))
if log(random.random()) < log_ar: theta = theta_p log_f_theta = log_f_theta_p log_Z = log_Z_p acceptances += 1 chain.append(theta) if verbose: print "log(f), log_Z:", log_f_theta, log_Z print "mean_ep:", mean(score_seq(theta[0],site) for site in motif) print "mean_occ:", mean(occs(theta, motif)) print "mu, Ne:", theta[1], theta[2] print "acceptances:", acceptances/float(it+1) return chain def motif_from_theta((matrix, mu, Ne), N): return sample_motif_cftp(matrix, mu, Ne, N) def logmod(x): return sign(x)*log(abs(x) + 1) def interpret_chain(chain, motif, filename=None): N = len(motif) log_fhats = [log_fhat(theta,motif) for theta in chain] log_Zs = [log_ZM_hack(theta,N) for theta in chain] log_ps = [lf - log_Z for (lf, log_Z) in zip(log_fhats, log_Zs)] plt.plot(map(logmod, [mean(score_seq(x[0],site) for site in motif) for x in chain]), label="Mean Site Energy (kBT)") plt.plot(map(logmod, [x[1] for x in chain]),label="$\mu$ (kBT)") plt.plot(map(logmod, [x[2] for x in chain]),label="$Ne$") plt.plot(map(logmod, log_fhats),label="log fhat") plt.plot(map(logmod, log_Zs),label="log_ZM")