def restriction_of_range_half_site_experiment(motif):
    """is energy of first half-site negatively correlated with energy of second half-site?

    Builds a matrix from `motif` with `matrix_from_motif`, scores the first
    and second halves of every site separately with `score_seq`, and returns
    whatever `pearsonr` returns for the two lists of half-site scores.  For an
    odd site length the second half is one column longer than the first.
    """
    L = len(motif[0])
    # Explicit floor division: the split point is used as a slice index and
    # must stay an int even where `/` means true division.
    half = L // 2
    mat = matrix_from_motif(motif)
    eps1 = [score_seq(mat[:half], site[:half]) for site in motif]
    eps2 = [score_seq(mat[half:], site[half:]) for site in motif]
    return pearsonr(eps1, eps2)
def restriction_of_range_motif_spoof_experiment(motifs):
    """Score each motif's own sites and its spoofed sites with the motif's matrix.

    For every motif (iterated under a tqdm progress bar) a matrix is derived
    with `matrix_from_motif`; the motif's sites and the sites returned by
    `spoof_psfm(motif, pc=0)` are both scored against that matrix.

    Returns `(all_eps, all_spoof_eps)`: two lists holding one list of scores
    per motif, in input order.
    """
    real_scores = []
    spoof_scores = []
    for motif in tqdm(motifs):
        energy_matrix = matrix_from_motif(motif)
        own = [score_seq(energy_matrix, s) for s in motif]
        spoofed_sites = spoof_psfm(motif, pc=0)
        fake = [score_seq(energy_matrix, s) for s in spoofed_sites]
        real_scores.append(own)
        spoof_scores.append(fake)
    return real_scores, spoof_scores
def predict_ic_from_theta(theta, L):
    """Set up importance-sampling quantities for a random matrix drawn under theta.

    theta is unpacked as (sigma, mu, Ne).  A matrix is drawn with
    `sample_matrix(L, sigma)`; a tilt `lamb` is found by bisection on
    [-20, 20] so that the expected site score under
    `psfm_from_matrix(matrix, lamb)` equals the desired score; 100 sites are
    then sampled from that PSFM.  For those sites the log weights
    -nu*log(1 + exp(ep - mu)) (`log_ps`) and the log PSFM probabilities
    (`log_qs`) are computed.

    NOTE(review): nothing is returned; the block looks unfinished -- confirm
    what should be derived from log_ps / log_qs.
    """
    sigma, mu, Ne = theta
    nu = Ne - 1
    ep_star = mu - log(Ne - 1)
    matrix = sample_matrix(L, sigma)
    ep_min = sum(map(min, matrix))
    # Never aim below (sum of the row minima) + 1.
    des_ep = max(ep_star, ep_min + 1)

    def f(lamb):
        # Expected site score under the lamb-tilted PSFM, minus the target.
        psfm = psfm_from_matrix(matrix, lamb)
        return sum([sum(ep * p for ep, p in zip(eps, ps))
                    for eps, ps in zip(matrix, psfm)]) - des_ep

    lamb = bisect_interval(f, -20, 20)
    # `psfm` inside f() is local to f(); build it here at the fitted tilt so
    # the lines below do not hit a NameError.
    psfm = psfm_from_matrix(matrix, lamb)
    log_psfm = [[log(p) for p in ps] for ps in psfm]
    sites = [sample_from_psfm(psfm) for i in range(100)]
    log_ps = [-nu * log(1 + exp(score_seq(matrix, site) - mu))
              for site in sites]
    log_qs = [score_seq(log_psfm, site) for site in sites]
def sample_motif_ar_tilted(matrix, mu, Ne, N):
    """Collect N sites by accept/reject over uniformly random candidate sites.

    Candidates come from `random_site(L)` and are kept with probability
    phat(ep) / phat(mode), where phat(ep) = (1 / (1 + exp(ep - mu)))**(Ne - 1),
    ep is the candidate's `score_seq` value, and `mode` is the root of
    `d_density` found by bisection on [-100, 100] (moved to ep_min + 1 when it
    falls below the sum of the row minima).  Returns the list of kept sites.

    NOTE(review): the acceptance normaliser was referenced as `pmode` without
    ever being assigned; it is taken to be phat(mode) here -- confirm.
    Candidates scoring below `mode` get a ratio above 1 and are always kept.
    NOTE(review): the tilted PSFM fitted below is computed but not used for
    proposals.
    """
    nu = Ne - 1
    L = len(matrix)
    ep_min = sum(map(min, matrix))
    ep_max = sum(map(max, matrix))
    site_sigma = site_sigma_from_matrix(matrix)
    phat = lambda ep: (1 / (1 + exp(ep - mu)))**(Ne - 1)
    density = lambda ep: (phat(ep) * dnorm(ep, 0, site_sigma)
                          * (ep_min <= ep <= ep_max))
    d_density = lambda ep: ep / site_sigma**2 + nu / (1 + exp(mu - ep))
    mode = bisect_interval(d_density, -100, 100)
    if mode < ep_min:
        # don't want mode right on the nose of ep_min for sampling purposes,
        # so offset it a bit
        mode = ep_min + 1
    dmode = density(mode)
    pmode = phat(mode)
    # calculate mean epsilon via rejection sampling
    motif = []

    def mean_ep(lamb):
        psfm = psfm_from_matrix(matrix, lamb=lamb)
        return sum([ep * p
                    for (mat_row, psfm_row) in zip(matrix, psfm)
                    for (ep, p) in zip(mat_row, psfm_row)])

    lamb = bisect_interval(lambda l: mean_ep(l) - mode, -20, 20)
    tilted_psfm = psfm_from_matrix(matrix, lamb=lamb)
    log_tilted_psfm = [map(log, row) for row in tilted_psfm]
    while len(motif) < N:
        site = random_site(L)
        ep = score_seq(matrix, site)
        if random.random() < phat(ep) / pmode:
            motif.append(site)
    return motif
def experiment3(trials=10):
    """Scatter geometric-mean sampled occupancy against site-score spread.

    Draws `trials` codes (`sample_code`) and `trials` matrices
    (`sample_matrix`) with L = 10 and sigma = 1, and measures the standard
    deviation (`sd`) of each model's scores over 10000 random sites.  For each
    model an `mh` chain is run with target 1/(1+exp(ep-mu))**(Ne-1)
    (mu = -10, Ne = 5), capturing the occupancy 1/(1+exp(ep-mu)) of each
    state; the first captured value is dropped and the geometric mean of the
    rest is plotted on a log y-axis ('apw' vs 'linear').
    """
    mu = -10
    Ne = 5
    L = 10
    sigma = 1
    codes = [sample_code(L, sigma) for i in range(trials)]
    pssms = [sample_matrix(L, sigma) for i in range(trials)]
    sites = [random_site(L) for i in xrange(10000)]
    apw_site_sigmas = [sd([score(code, site) for site in sites])
                       for code in codes]
    linear_site_sigmas = [sd([score_seq(pssm, site) for site in sites])
                          for pssm in pssms]

    def apw_phat(code, site):
        ep = score(code, site)
        return 1 / (1 + exp(ep - mu))**(Ne - 1)

    def apw_occ(code, site):
        ep = score(code, site)
        return 1 / (1 + exp(ep - mu))

    def linear_phat(pssm, site):
        ep = score_seq(pssm, site)
        return 1 / (1 + exp(ep - mu))**(Ne - 1)

    def linear_occ(pssm, site):
        ep = score_seq(pssm, site)
        return 1 / (1 + exp(ep - mu))

    def geometric_mean_occ(phat, occ, model):
        # exp(mean(log x)) is the geometric mean; the log base must match the
        # exponential, so the natural log is used (not log10).
        occupancies = mh(lambda s: phat(model, s),
                         proposal=mutate_site,
                         x0=random_site(L),
                         capture_state=lambda s: occ(model, s))[1:]
        return exp(mean(map(log, occupancies)))

    apw_mean_fits = [geometric_mean_occ(apw_phat, apw_occ, code)
                     for code in tqdm(codes)]
    linear_mean_fits = [geometric_mean_occ(linear_phat, linear_occ, pssm)
                        for pssm in tqdm(pssms)]
    plt.scatter(apw_site_sigmas, apw_mean_fits, label='apw')
    plt.scatter(linear_site_sigmas, linear_mean_fits, color='g',
                label='linear')
    plt.semilogy()
    plt.legend(loc='lower right')
def fitness(matrix, motif, G):
    """multiplicative fitness of occupancy over all sites

    Each site's weight exp(-score) is divided by the partition function (the
    sum of all site weights plus `Zb_from_matrix(matrix, G)`); the product of
    those ratios is returned via `prod`.
    """
    boltzmann_weights = [exp(-score_seq(matrix, site)) for site in motif]
    background = Zb_from_matrix(matrix, G)
    partition = sum(boltzmann_weights) + background
    return prod(weight / partition for weight in boltzmann_weights)
def sample_motif_ar_tilted(matrix, mu, Ne, N):
    """Collect N sites by accept/reject over uniformly random candidate sites.

    Candidates come from `random_site(L)` and are kept with probability
    phat(ep) / phat(mode), where phat(ep) = (1 / (1 + exp(ep - mu)))**(Ne - 1),
    ep is the candidate's `score_seq` value, and `mode` is the root of
    `d_density` found by bisection on [-100, 100] (moved to ep_min + 1 when it
    falls below the sum of the row minima).  Returns the list of kept sites.

    NOTE(review): the acceptance normaliser was referenced as `pmode` without
    ever being assigned; it is taken to be phat(mode) here -- confirm.
    Candidates scoring below `mode` get a ratio above 1 and are always kept.
    NOTE(review): the tilted PSFM fitted below is computed but not used for
    proposals.
    """
    nu = Ne - 1
    L = len(matrix)
    ep_min = sum(map(min, matrix))
    ep_max = sum(map(max, matrix))
    site_sigma = site_sigma_from_matrix(matrix)
    phat = lambda ep: (1 / (1 + exp(ep - mu)))**(Ne - 1)
    density = lambda ep: (phat(ep) * dnorm(ep, 0, site_sigma)
                          * (ep_min <= ep <= ep_max))
    d_density = lambda ep: ep / site_sigma**2 + nu / (1 + exp(mu - ep))
    mode = bisect_interval(d_density, -100, 100)
    if mode < ep_min:
        # don't want mode right on the nose of ep_min for sampling purposes,
        # so offset it a bit
        mode = ep_min + 1
    dmode = density(mode)
    pmode = phat(mode)
    # calculate mean epsilon via rejection sampling
    motif = []

    def mean_ep(lamb):
        psfm = psfm_from_matrix(matrix, lamb=lamb)
        return sum([ep * p
                    for (mat_row, psfm_row) in zip(matrix, psfm)
                    for (ep, p) in zip(mat_row, psfm_row)])

    lamb = bisect_interval(lambda l: mean_ep(l) - mode, -20, 20)
    tilted_psfm = psfm_from_matrix(matrix, lamb=lamb)
    log_tilted_psfm = [map(log, row) for row in tilted_psfm]
    while len(motif) < N:
        site = random_site(L)
        ep = score_seq(matrix, site)
        if random.random() < phat(ep) / pmode:
            motif.append(site)
    return motif
def restriction_of_range_loo_experiment(motif):
    """can energy of a given position be predicted from energy of remaining bases?

    For every position j (printed as it is processed) and every site, the
    predicted score of position j is the mean full-site score minus the score
    of the site with position j removed; the observed score is that of
    position j alone.  Returns a flat list of (pred_ep, obs_ep) pairs ordered
    by position, then by site.
    """
    width = len(motif[0])
    mat = matrix_from_motif(motif)
    mean_ep = mean([score_seq(mat, site) for site in motif])
    results = []
    for j in range(width):
        print(j)
        rest_mat = mat[:j] + mat[j + 1:]
        for site in motif:
            rest_score = score_seq(rest_mat, site[:j] + site[j + 1:])
            column_score = score_seq([mat[j]], [site[j]])
            results.append((mean_ep - rest_score, column_score))
    return results
def experiment3(trials=10):
    """Scatter geometric-mean sampled occupancy against site-score spread.

    Draws `trials` codes (`sample_code`) and `trials` matrices
    (`sample_matrix`) with L = 10 and sigma = 1, and measures the standard
    deviation (`sd`) of each model's scores over 10000 random sites.  For each
    model an `mh` chain is run with target 1/(1+exp(ep-mu))**(Ne-1)
    (mu = -10, Ne = 5), capturing the occupancy 1/(1+exp(ep-mu)) of each
    state; the first captured value is dropped and the geometric mean of the
    rest is plotted on a log y-axis ('apw' vs 'linear').
    """
    mu = -10
    Ne = 5
    L = 10
    sigma = 1
    codes = [sample_code(L, sigma) for i in range(trials)]
    pssms = [sample_matrix(L, sigma) for i in range(trials)]
    sites = [random_site(L) for i in xrange(10000)]
    apw_site_sigmas = [sd([score(code, site) for site in sites])
                       for code in codes]
    linear_site_sigmas = [sd([score_seq(pssm, site) for site in sites])
                          for pssm in pssms]

    def apw_phat(code, site):
        ep = score(code, site)
        return 1 / (1 + exp(ep - mu))**(Ne - 1)

    def apw_occ(code, site):
        ep = score(code, site)
        return 1 / (1 + exp(ep - mu))

    def linear_phat(pssm, site):
        ep = score_seq(pssm, site)
        return 1 / (1 + exp(ep - mu))**(Ne - 1)

    def linear_occ(pssm, site):
        ep = score_seq(pssm, site)
        return 1 / (1 + exp(ep - mu))

    def geometric_mean_occ(phat, occ, model):
        # exp(mean(log x)) is the geometric mean; the log base must match the
        # exponential, so the natural log is used (not log10).
        occupancies = mh(lambda s: phat(model, s),
                         proposal=mutate_site,
                         x0=random_site(L),
                         capture_state=lambda s: occ(model, s))[1:]
        return exp(mean(map(log, occupancies)))

    apw_mean_fits = [geometric_mean_occ(apw_phat, apw_occ, code)
                     for code in tqdm(codes)]
    linear_mean_fits = [geometric_mean_occ(linear_phat, linear_occ, pssm)
                        for pssm in tqdm(pssms)]
    plt.scatter(apw_site_sigmas, apw_mean_fits, label='apw')
    plt.scatter(linear_site_sigmas, linear_mean_fits, color='g',
                label='linear')
    plt.semilogy()
    plt.legend(loc='lower right')
def cv_experiment(motifs, target='uniform'): """see if js_psfm outperforms ml_psfm in 10x cv""" all_mls, all_js = [], [] for motif in motifs: ml_lls = [] js_lls = [] for train, test in cv(motif): ml_mat = mmap(log, psfm_from_motif(train)) js_mat = mmap(log, js_psfm(train, target=target)) ml_ll = mean(score_seq(ml_mat, site) for site in test) js_ll = mean(score_seq(js_mat, site) for site in test) ml_lls.append(ml_ll) js_lls.append(js_ll) avg_ml_ll, avg_js_ll = mean(ml_lls), mean(js_lls) all_mls.append(avg_ml_ll) all_js.append(avg_js_ll) print avg_ml_ll, avg_js_ll, avg_ml_ll < avg_js_ll return all_mls, all_js
def rejection_sample_site((matrix, mu, Ne)): psfm = psfm_from_matrix(matrix) log_psfm = [[log(p) for p in row] for row in psfm] log_psfm_prob = lambda site:score_seq(log_psfm, site) log_M = -sum(map(max,psfm)) sites = [sample_from_psfm(psfm) for _ in xrange(trials)] log_fs = [log_fhat((matrix, mu, Ne), [site]) for site in sites] log_qs = [log_psfm_prob(site) for site in sites] ars = [exp(log_f - (log_q + log_M)) for log_f, log_q in zip(log_fs, log_qs)]
def select_sites_by_occupancy(matrix, mu, n):
    """Keep random sites with probability 1 / (1 + exp(score - mu)) until n are held.

    Candidates come from `random_site(len(matrix))`; the running count is
    printed after every accepted site.  Returns the list of accepted sites.
    """
    width = len(matrix)
    chosen = []
    while len(chosen) < n:
        candidate = random_site(width)
        draw = random.random()
        occupancy = 1 / (1 + exp(score_seq(matrix, candidate) - mu))
        if draw < occupancy:
            chosen.append(candidate)
            print(len(chosen))
    return chosen
def log_ZS_analytic((matrix, mu, Ne)): """compute log_Z analytically""" acc = 0 nu = Ne - 1 L = len(matrix) for kmer in kmers(L): ep = score_seq(matrix, "".join(kmer)) acc += (1 / (1 + exp(ep - mu)))**(Ne - 1) return log(acc)
def log_ZS_analytic((matrix, mu, Ne)): """compute log_Z analytically""" acc = 0 nu = Ne - 1 L = len(matrix) for kmer in kmers(L): ep = score_seq(matrix, "".join(kmer)) acc += (1/(1+exp(ep-mu)))**(Ne-1) return log(acc)
def posterior_chain2(motif, iterations=50000, theta0=None, sigma=1, num_spoof_sites="N", verbose=False, integration='hack'): """do MH, estimating ratio of partition functions empirically""" L = len(motif[0]) N = len(motif) if num_spoof_sites == "N": num_spoof_sites = N # should this be N or 1? if theta0 is None: matrix0 = [[random.gauss(0, 1) for _ in range(4)] for i in range(L)] mu0 = -10 Ne0 = 2 theta = (matrix0, mu0, Ne0) else: theta = theta0 log_f_theta = log_fhat(theta, motif) #log_Z = log_ZM_gaussian(theta, N, integration=integration) log_Z = log_ZM_sophisticated(theta, N) chain = [] acceptances = 0 def log_prior((matrix, mu, Ne)): log_matrix_prior = sum( [log(dnorm(ep, 0, 1)) for row in matrix for ep in row]) log_mu_prior = log(dnorm(mu, 0, 10)) log_Ne_prior = log(exp(-Ne)) return log_matrix_prior + log_mu_prior + log_Ne_prior for it in trange(iterations): #print "Ne:", theta[2] theta_p = prop2(theta, sigma) log_f_theta_p = log_fhat(theta_p, motif) matrix_p, mu_p, Ne_p = theta_p #log_Z = log_ZM_gaussian(theta, N, trials=100, integration='quad') #log_Z_p = log_ZM_gaussian(theta_p, N, trials=100, integration='hack') #log_Z = log_ZM_gaussian(theta, N, trials=100, integration='quad') log_Z_p = log_ZM_sophisticated(theta_p, N) #log_Z_p = log_ZM_importance(theta_p, N, trials=100) log_ar = log_f_theta_p - log_f_theta + ( log_Z - log_Z_p) + log_prior(theta_p) - log_prior(theta) if log(random.random()) < log_ar: theta = theta_p log_f_theta = log_f_theta_p log_Z = log_Z_p acceptances += 1 chain.append(theta) if verbose: print "log(f), log_Z:", log_f_theta, log_Z print "mean_ep:", mean(score_seq(theta[0], site) for site in motif) print "mean_occ:", mean(occs(theta, motif)) print "mu, Ne:", theta[1], theta[2] print "acceptances:", acceptances / float(it + 1) return chain
def log_fitness(matrix, motif, G):
    """multiplicative fitness of occupancy over all sites

    Log-scale form: returns -sum(eps) - n*log(Z), where eps are the sites'
    `score_seq` values, n = len(motif) and Z is the sum of exp(-ep) over the
    sites plus `Zb_from_matrix(matrix, G)`.
    """
    energies = [score_seq(matrix, site) for site in motif]
    foreground = sum([exp(-ep) for ep in energies])
    background = Zb_from_matrix(matrix, G)
    partition = foreground + background
    return -sum(energies) - len(motif) * log(partition)
def rejection_sample_site((matrix, mu, Ne)): psfm = psfm_from_matrix(matrix) log_psfm = [[log(p) for p in row] for row in psfm] log_psfm_prob = lambda site: score_seq(log_psfm, site) log_M = -sum(map(max, psfm)) sites = [sample_from_psfm(psfm) for _ in xrange(trials)] log_fs = [log_fhat((matrix, mu, Ne), [site]) for site in sites] log_qs = [log_psfm_prob(site) for site in sites] ars = [ exp(log_f - (log_q + log_M)) for log_f, log_q in zip(log_fs, log_qs) ]
def ror_experiment(): L = 10 n = 100 sigmas = np.linspace(0.1,10,10) alphas = np.linspace(0,1,10) for sigma in sigmas: for alpha in alphas: theta = - alpha * sigma * L matrix = sample_matrix(L,sigma) sampler = lambda : sample_motif_neglect_fg(matrix,1,Ne=2)[0] motif = sample_until(lambda site:score_seq(matrix,site) < theta,sampler,n) print sigma, alpha, total_motif_mi(motif)
def predict_ic_from_theta(theta, L):
    """Set up importance-sampling quantities for a random matrix drawn under theta.

    theta is unpacked as (sigma, mu, Ne).  A matrix is drawn with
    `sample_matrix(L, sigma)`; a tilt `lamb` is found by bisection on
    [-20, 20] so that the expected site score under
    `psfm_from_matrix(matrix, lamb)` equals the desired score; 100 sites are
    then sampled from that PSFM.  For those sites the log weights
    -nu*log(1 + exp(ep - mu)) (`log_ps`) and the log PSFM probabilities
    (`log_qs`) are computed.

    NOTE(review): nothing is returned; the block looks unfinished -- confirm
    what should be derived from log_ps / log_qs.
    """
    sigma, mu, Ne = theta
    nu = Ne - 1
    ep_star = mu - log(Ne - 1)
    matrix = sample_matrix(L, sigma)
    ep_min = sum(map(min, matrix))
    # Never aim below (sum of the row minima) + 1.
    des_ep = max(ep_star, ep_min + 1)

    def f(lamb):
        # Expected site score under the lamb-tilted PSFM, minus the target.
        psfm = psfm_from_matrix(matrix, lamb)
        return sum([sum(ep * p for ep, p in zip(eps, ps))
                    for eps, ps in zip(matrix, psfm)]) - des_ep

    lamb = bisect_interval(f, -20, 20)
    # `psfm` inside f() is local to f(); build it here at the fitted tilt so
    # the lines below do not hit a NameError.
    psfm = psfm_from_matrix(matrix, lamb)
    log_psfm = [[log(p) for p in ps] for ps in psfm]
    sites = [sample_from_psfm(psfm) for i in range(100)]
    log_ps = [-nu * log(1 + exp(score_seq(matrix, site) - mu))
              for site in sites]
    log_qs = [score_seq(log_psfm, site) for site in sites]
def sample_site_imh(matrix, mu, Ne, lamb, iterations=None):
    """Return the last state of an `mh` chain whose proposals come from a tilted PSFM.

    The log target is -nu*log(1 + exp(score - mu)) with nu = Ne - 1.
    Proposals are drawn from `psfm_from_matrix(matrix, lamb=lamb)` regardless
    of the current state, and `log_dprop` reports their log probability under
    that PSFM.

    iterations: chain length handed to `mh`; defaults to 10 * len(matrix).
    """
    nu = Ne - 1
    L = len(matrix)
    if iterations is None:
        iterations = 10 * L
    log_phat = lambda site: -nu * log(1 + exp(score_seq(matrix, site) - mu))
    tilted_psfm = psfm_from_matrix(matrix, lamb=lamb)
    log_tilted_psfm = [map(log, row) for row in tilted_psfm]

    def prop(_):
        # The current state is ignored: proposals are independent draws.
        return sample_from_psfm(tilted_psfm)

    def log_dprop(xp, _):
        return score_seq(log_tilted_psfm, xp)

    # Forward `iterations`; it was computed above but never reached mh(), so
    # the chain silently ran for mh's own default length.
    return mh(log_phat, proposal=prop, dprop=log_dprop, x0=prop(None),
              use_log=True, iterations=iterations)[-1]
def log_fitness_approx3(matrix,motif,G): n = len(motif) eps = [score_seq(matrix,site) for site in motif] fgs = [exp(-ep) for ep in eps] Zf = sum(fgs) Zb = Zb_from_matrix(matrix,G) Z = Zf + Zb print Zf,Zb,Zf/Zb good_approximation = -sum(eps) - n*(log(Zf)) Zf_hat = mean(fgs) Zf_resids = [fg - Zf_hat for fg in fgs] worse_approximation = -sum(eps) - n*(log(n) + log(Zf_hat)) print good_approximation, worse_approximation return good_approximation
def site_mh(matrix, mu, Ne, iterations=50000):
    """Run `mh` over sites with log target `log_Pe` and single-site mutation proposals.

    The chain starts at `random_site(len(matrix))`, proposes with
    `mutate_site`, and is run in log mode for `iterations` steps; whatever
    `mh` returns is returned unchanged.
    """
    site_mu = site_mu_from_matrix(matrix)
    site_sigma = site_sigma_from_matrix(matrix)
    width = len(matrix)

    def log_target(site):
        return log_Pe(score_seq(matrix, site), site_mu, site_sigma, mu, Ne)

    def propose(site):
        return mutate_site(site)

    return mh(log_target, propose, x0=random_site(width), use_log=True,
              iterations=iterations)
def posterior_chain2(motif, iterations=50000, theta0=None, sigma=1, num_spoof_sites="N", verbose=False, integration='hack'): """do MH, estimating ratio of partition functions empirically""" L = len(motif[0]) N = len(motif) if num_spoof_sites == "N": num_spoof_sites = N # should this be N or 1? if theta0 is None: matrix0 = [[random.gauss(0,1) for _ in range(4)] for i in range(L)] mu0 = -10 Ne0 = 2 theta = (matrix0, mu0, Ne0) else: theta = theta0 log_f_theta = log_fhat(theta, motif) #log_Z = log_ZM_gaussian(theta, N, integration=integration) log_Z = log_ZM_sophisticated(theta, N) chain = [] acceptances = 0 def log_prior((matrix, mu, Ne)): log_matrix_prior = sum([log(dnorm(ep,0,1)) for row in matrix for ep in row]) log_mu_prior = log(dnorm(mu,0,10)) log_Ne_prior = log(exp(-Ne)) return log_matrix_prior + log_mu_prior + log_Ne_prior for it in trange(iterations): #print "Ne:", theta[2] theta_p = prop2(theta, sigma) log_f_theta_p = log_fhat(theta_p, motif) matrix_p, mu_p, Ne_p = theta_p #log_Z = log_ZM_gaussian(theta, N, trials=100, integration='quad') #log_Z_p = log_ZM_gaussian(theta_p, N, trials=100, integration='hack') #log_Z = log_ZM_gaussian(theta, N, trials=100, integration='quad') log_Z_p = log_ZM_sophisticated(theta_p, N) #log_Z_p = log_ZM_importance(theta_p, N, trials=100) log_ar = log_f_theta_p - log_f_theta + (log_Z - log_Z_p) + log_prior(theta_p) - log_prior(theta) if log(random.random()) < log_ar: theta = theta_p log_f_theta = log_f_theta_p log_Z = log_Z_p acceptances += 1 chain.append(theta) if verbose: print "log(f), log_Z:", log_f_theta, log_Z print "mean_ep:", mean(score_seq(theta[0],site) for site in motif) print "mean_occ:", mean(occs(theta, motif)) print "mu, Ne:", theta[1], theta[2] print "acceptances:", acceptances/float(it+1) return chain
def log_fitness_approx(matrix,motif,G,terms=2): n = len(motif) eps = [score_seq(matrix,site) for site in motif] fgs = [exp(-ep) for ep in eps] Zf = sum(fgs) Zb = Zb_from_matrix(matrix,G) Z = Zf + Zb zeroth_term = log(n+Zb) * (terms >= 0) first_term = (-1/(n+Zb)*sum(eps)) * (terms >= 1) second_term = 1/2.0*1/(n+Zb)**2*((n + Zb - 1)*sum(ep**2 for ep in eps) - sum(epi*epj for epi,epj in choose2(eps))) * (terms >= 2) print zeroth_term,first_term,second_term # first_order = -sum(eps) - n*(log(n+Zb) + (-1/(n+Zb)*sum(eps))) # second_order = -sum(eps) - n*(log(n+Zb) + (-1/(n+Zb)*sum(eps)) + 1/2.0*1/(n+Zb)**2*((n))) return -sum(eps) - n*(zeroth_term + first_term + second_term)
def interpret_chain(chain, motif, filename=None):
    """Plot logmod-transformed traces of a chain of (matrix, mu, Ne) states.

    Seven series are drawn against iteration: mean site score on the motif,
    mu, Ne, `log_fhat`, `log_ZM_hack`, their difference, and mean `occs`.
    The figure is then handed to `maybesave(filename)`.
    """
    N = len(motif)
    log_fhats = [log_fhat(theta, motif) for theta in chain]
    log_Zs = [log_ZM_hack(theta, N) for theta in chain]
    log_ps = [lf - log_Z for (lf, log_Z) in zip(log_fhats, log_Zs)]
    mean_eps = [mean(score_seq(x[0], site) for site in motif) for x in chain]
    mus = [x[1] for x in chain]
    Nes = [x[2] for x in chain]
    mean_occs = [mean(occs(x, motif)) for x in chain]
    traces = [
        (mean_eps, "Mean Site Energy (kBT)"),
        (mus, "$\mu$ (kBT)"),
        (Nes, "$Ne$"),
        (log_fhats, "log fhat"),
        (log_Zs, "log_ZM"),
        (log_ps, "log p"),
        (mean_occs, "Mean Occupancy"),
    ]
    for values, caption in traces:
        plt.plot(map(logmod, values), label=caption)
    plt.legend(loc='right', fontsize='large')
    plt.xlabel("Iteration", fontsize='large')
    maybesave(filename)
def spoof_motifs_occ(motif, num_motifs=10, trials=1, sigma=None,Ne_tol=10**-4,double_sigma=True): N = len(motif) L = len(motif[0]) copies = 10*N if sigma is None: sigma = sigma_from_matrix(pssm_from_motif(motif,pc=1)) epsilon = (1+double_sigma)*sigma # 15 Jan 2016 print "sigma:", sigma #bio_ic = motif_ic(motif) mat = matrix_from_motif(motif) eps = [score_seq(mat, site) for site in motif] mu = gle_approx_mu(mat, copies) bio_occ = mean([1/(1+exp(ep-mu)) for ep in eps]) def f(Ne): return expected_occupancy(epsilon, Ne, L, copies) - bio_occ Ne = log_regress_spec2(f,[1,10],tol=10**-3) return [sample_motif(sigma, Ne, L, copies, N) for _ in range(num_motifs)]
def sample_uniform_energy(matrix):
    """Rejection-sample one random site, weighting against a normal fit of the scores.

    A candidate from `random_site` with score ep is accepted with probability
    1 / (M * norm.pdf(ep, mu, sigma)), where mu and sigma are the sum of the
    row means and the root of the summed row variances of `matrix`, and M is
    the larger of 1/pdf evaluated at the sum of row minima and at the sum of
    row maxima.  The attempt count is printed every 10000 tries.

    Returns the first accepted site.
    """
    # Site length comes from the matrix itself; `L` was otherwise unbound here.
    L = len(matrix)
    mu = sum(map(mean, matrix))
    sigma = sqrt(sum(map(lambda x: variance(x, correct=False), matrix)))
    ep_min = sum(map(min, matrix))
    ep_max = sum(map(max, matrix))
    M_min = 1 / norm.pdf(ep_min, mu, sigma)
    M_max = 1 / norm.pdf(ep_max, mu, sigma)
    M = max(M_min, M_max)
    trials = 0
    while True:
        trials += 1
        if trials % 10000 == 0:
            print(trials)
        site = random_site(L)
        ep = score_seq(matrix, site)
        ar = 1 / (M * norm.pdf(ep, mu, sigma))
        if random.random() < ar:
            return site
def sample_uniform_energy(matrix):
    """Rejection-sample one random site, weighting against a normal fit of the scores.

    A candidate from `random_site` with score ep is accepted with probability
    1 / (M * norm.pdf(ep, mu, sigma)), where mu and sigma are the sum of the
    row means and the root of the summed row variances of `matrix`, and M is
    the larger of 1/pdf evaluated at the sum of row minima and at the sum of
    row maxima.  The attempt count is printed every 10000 tries.

    Returns the first accepted site.
    """
    # Site length comes from the matrix itself; `L` was otherwise unbound here.
    L = len(matrix)
    mu = sum(map(mean, matrix))
    sigma = sqrt(sum(map(lambda x: variance(x, correct=False), matrix)))
    ep_min = sum(map(min, matrix))
    ep_max = sum(map(max, matrix))
    M_min = 1 / norm.pdf(ep_min, mu, sigma)
    M_max = 1 / norm.pdf(ep_max, mu, sigma)
    M = max(M_min, M_max)
    trials = 0
    while True:
        trials += 1
        if trials % 10000 == 0:
            print(trials)
        site = random_site(L)
        ep = score_seq(matrix, site)
        ar = 1 / (M * norm.pdf(ep, mu, sigma))
        if random.random() < ar:
            return site
def posterior_chain(motif, iterations=50000, theta0=None, sigma=1, num_spoof_sites='N', verbose=False): """do MH with doubly intractable MCMC one-point estimator""" L = len(motif[0]) N = len(motif) if num_spoof_sites == 'N': num_spoof_sites = N # should this be N or 1? if theta0 is None: matrix0 = [[0, 0, 0, 0] for i in range(L)] mu0 = -10 Ne0 = 3 theta = (matrix0, mu0, Ne0) else: theta = theta0 log_f_theta = log_fhat(theta, motif) chain = [] acceptances = 0 for it in trange(iterations): theta_p = prop2(theta, sigma) log_f_theta_p = log_fhat(theta_p, motif) matrix_p, mu_p, Ne_p = theta_p xp = sample_motif_cftp(matrix_p, mu_p, Ne_p, num_spoof_sites) log_Z = log_fhat(theta, xp) log_Z_p = log_fhat(theta_p, xp) log_ar = log_f_theta_p - log_f_theta + N / num_spoof_sites * (log_Z - log_Z_p) if log(random.random()) < log_ar: theta = theta_p log_f_theta = log_f_theta_p log_Z = log_Z_p acceptances += 1 chain.append(theta) if verbose: print "log(f), log_Z:", log_f_theta, log_Z print "mean_ep:", mean(score_seq(theta[0], site) for site in motif) print "mean_occ:", mean(occs(theta, motif)) print "mu, Ne:", theta[1], theta[2] print "acceptances:", acceptances / float(it + 1) return chain
def interpret_chain(chain, motif, filename=None):
    """Plot logmod-transformed traces of a chain of (matrix, mu, Ne) states.

    Seven series are drawn against iteration: mean site score on the motif,
    mu, Ne, `log_fhat`, `log_ZM_hack`, their difference, and mean `occs`.
    The figure is then handed to `maybesave(filename)`.
    """
    N = len(motif)
    log_fhats = [log_fhat(theta, motif) for theta in chain]
    log_Zs = [log_ZM_hack(theta, N) for theta in chain]
    log_ps = [lf - log_Z for (lf, log_Z) in zip(log_fhats, log_Zs)]
    mean_eps = [mean(score_seq(x[0], site) for site in motif) for x in chain]
    mus = [x[1] for x in chain]
    Nes = [x[2] for x in chain]
    mean_occs = [mean(occs(x, motif)) for x in chain]
    traces = [
        (mean_eps, "Mean Site Energy (kBT)"),
        (mus, "$\mu$ (kBT)"),
        (Nes, "$Ne$"),
        (log_fhats, "log fhat"),
        (log_Zs, "log_ZM"),
        (log_ps, "log p"),
        (mean_occs, "Mean Occupancy"),
    ]
    for values, caption in traces:
        plt.plot(map(logmod, values), label=caption)
    plt.legend(loc='right', fontsize='large')
    plt.xlabel("Iteration", fontsize='large')
    maybesave(filename)
def sample_site_imh(matrix, mu, Ne, lamb, iterations=None):
    """Return the last state of an `mh` chain whose proposals come from a tilted PSFM.

    The log target is -nu*log(1 + exp(score - mu)) with nu = Ne - 1.
    Proposals are drawn from `psfm_from_matrix(matrix, lamb=lamb)` regardless
    of the current state, and `log_dprop` reports their log probability under
    that PSFM.

    iterations: chain length handed to `mh`; defaults to 10 * len(matrix).
    """
    nu = Ne - 1
    L = len(matrix)
    if iterations is None:
        iterations = 10 * L
    log_phat = lambda site: -nu * log(1 + exp(score_seq(matrix, site) - mu))
    tilted_psfm = psfm_from_matrix(matrix, lamb=lamb)
    log_tilted_psfm = [map(log, row) for row in tilted_psfm]

    def prop(_):
        # The current state is ignored: proposals are independent draws.
        return sample_from_psfm(tilted_psfm)

    def log_dprop(xp, _):
        return score_seq(log_tilted_psfm, xp)

    # Forward `iterations`; it was computed above but never reached mh(), so
    # the chain silently ran for mh's own default length.
    return mh(log_phat, proposal=prop, dprop=log_dprop, x0=prop(None),
              use_log=True, iterations=iterations)[-1]
def make_clusters_with_k(motif, k): print "k:", k L = len(motif[0]) N = float(len(motif)) clusters = [[] for i in range(k)] print "len clusters:", len(clusters) for site in motif: i = random.randrange(k) clusters[i].append(site) print "finished initializing" pssms = [ mmap(log, psfm_from_motif_(cluster, L, pc=1)) for cluster in clusters ] alphas = [len(cluster) / N for cluster in clusters] def log_likelihood(): return sum( log( sum(alpha * exp(score_seq(pssm, site)) for alpha, pssm in zip(alphas, pssms))) for site in motif) last_ll = 0 done_yet = False #for i in range(iterations): while not done_yet: cur_ll = log_likelihood() print "log likelihood:", cur_ll if last_ll == cur_ll: done_yet = True break else: last_ll = cur_ll clusters = [[] for i in range(k)] for site in motif: i = argmax([score_seq(pssm, site) for pssm in pssms]) clusters[i].append(site) pssms = [ mmap(log, psfm_from_motif_(cluster, L, pc=1)) for cluster in clusters ] return clusters, log_likelihood()
def posterior_chain(motif, iterations=50000, theta0=None, sigma=1, num_spoof_sites='N', verbose=False): """do MH with doubly intractable MCMC one-point estimator""" L = len(motif[0]) N = len(motif) if num_spoof_sites == 'N': num_spoof_sites = N # should this be N or 1? if theta0 is None: matrix0 = [[0,0,0,0] for i in range(L)] mu0 = -10 Ne0 = 3 theta = (matrix0, mu0, Ne0) else: theta = theta0 log_f_theta = log_fhat(theta, motif) chain = [] acceptances = 0 for it in trange(iterations): theta_p = prop2(theta, sigma) log_f_theta_p = log_fhat(theta_p, motif) matrix_p, mu_p, Ne_p = theta_p xp = sample_motif_cftp(matrix_p, mu_p, Ne_p, num_spoof_sites) log_Z = log_fhat(theta, xp) log_Z_p = log_fhat(theta_p, xp) log_ar = log_f_theta_p - log_f_theta + N/num_spoof_sites * (log_Z - log_Z_p) if log(random.random()) < log_ar: theta = theta_p log_f_theta = log_f_theta_p log_Z = log_Z_p acceptances += 1 chain.append(theta) if verbose: print "log(f), log_Z:", log_f_theta, log_Z print "mean_ep:", mean(score_seq(theta[0],site) for site in motif) print "mean_occ:", mean(occs(theta, motif)) print "mu, Ne:", theta[1], theta[2] print "acceptances:", acceptances/float(it+1) return chain
def spoof_motifs_occ(motif, num_motifs=10, trials=1, sigma=None, Ne_tol=10**-4, double_sigma=True): N = len(motif) L = len(motif[0]) copies = 10 * N if sigma is None: sigma = sigma_from_matrix(pssm_from_motif(motif, pc=1)) epsilon = (1 + double_sigma) * sigma # 15 Jan 2016 print "sigma:", sigma #bio_ic = motif_ic(motif) mat = matrix_from_motif(motif) eps = [score_seq(mat, site) for site in motif] mu = gle_approx_mu(mat, copies) bio_occ = mean([1 / (1 + exp(ep - mu)) for ep in eps]) def f(Ne): return expected_occupancy(epsilon, Ne, L, copies) - bio_occ Ne = log_regress_spec2(f, [1, 10], tol=10**-3) return [sample_motif(sigma, Ne, L, copies, N) for _ in range(num_motifs)]
from pwm_utils import sigma_from_matrix from math import log, exp, sqrt import random from evo_sampling import sample_motif_cftp from tqdm import * from matplotlib import pyplot as plt from scipy.stats import norm import numpy as np from scipy import integrate from formosa import spoof_maxent_motifs from adjacent_pairwise_model import code_from_motif, sample_site def log_fhat((matrix, mu, Ne), motif): assert type(motif) is list nu = Ne - 1 eps = [score_seq(matrix, site) for site in motif] return -sum(nu*log(1+exp(ep-mu)) for ep in eps) def log_ZS_analytic((matrix, mu, Ne)): """compute log_Z analytically""" acc = 0 nu = Ne - 1 L = len(matrix) for kmer in kmers(L): ep = score_seq(matrix, "".join(kmer)) acc += (1/(1+exp(ep-mu)))**(Ne-1) return log(acc) def log_ZM_analytic((matrix, mu, Ne), N): log_ZS = log_ZS_analytic((matrix, mu, Ne)) return N * log_ZS
def linear_occ(pssm, site):
    """Return 1 / (1 + exp(score - mu)) for `site` scored against `pssm`.

    `mu` is a free name resolved outside this function.
    """
    return 1 / (1 + exp(score_seq(pssm, site) - mu))
def fitness_additive(matrix, motif, G):
    """Return fg / (fg + Zb), with fg the summed exp(-score) over the motif's sites.

    Zb comes from `Zb_from_matrix(matrix, G)`.
    """
    energies = [score_seq(matrix, site) for site in motif]
    foreground = sum(exp(-ep) for ep in energies)
    background = Zb_from_matrix(matrix, G)
    return foreground / (foreground + background)
def eps_from_theta(theta, L, N=100):
    """Return the scores of N sites sampled for a random matrix drawn under theta.

    theta is unpacked as (sigma, mu, Ne), the same layout that
    predict_ic_from_theta uses.  A matrix is drawn with
    `sample_matrix(L, sigma)`, N sites are sampled with
    `sample_motif_cftp(matrix, mu, Ne, N)`, and each site's `score_seq` value
    against that matrix is returned in a list.
    """
    # theta was accepted but never unpacked, leaving sigma/mu/Ne unbound.
    sigma, mu, Ne = theta
    matrix = sample_matrix(L, sigma)
    motif = sample_motif_cftp(matrix, mu, Ne, N)
    return [score_seq(matrix, site) for site in motif]
def log_dprop(xp, _):
    """Score the proposed state `xp` against `log_tilted_psfm`; the second argument is ignored.

    `log_tilted_psfm` is a free name resolved outside this function.
    """
    proposal_score = score_seq(log_tilted_psfm, xp)
    return proposal_score
def Zb_from_matrix_ref(matrix, G):
    """Sum exp(-score) over G sites drawn with `random_site(len(matrix))`.

    The G draws are iterated under a `trange` progress bar.
    """
    width = len(matrix)
    scores = np.array([score_seq(matrix, random_site(width))
                       for i in trange(G)])
    boltzmann = np.exp(-scores)
    return np.sum(boltzmann)
def phat(site):
    """Return 1 / (1 + exp(score - mu))**(Ne - 1) for `site`.

    `matrix`, `mu` and `Ne` are free names resolved outside this function.
    """
    # The scored name was misspelled `matix`, which raises NameError.
    ep = score_seq(matrix, site)
    return 1 / (1 + exp(ep - mu))**(Ne - 1)
def random_genotype(n, L, linear_sigma, pairwise_sigma, copies): motif = random_motif(L, n) pwm = sample_matrix(L, linear_sigma) pairwise_weights = [[[random.gauss(0, pairwise_sigma) for i in range(4)] for j in range(4)] for k in range(L - 1)] return motif, copies, (pwm, pairwise_weights) def btoi(b): return "ACGT".index(b) def energy_score((pwm, pairwise_weights), seq): linear_score = score_seq(pwm, seq) pairwise_score = sum(weight[btoi(b1)][btoi(b2)] for weight, (b1, b2) in zip(pairwise_weights, pairs(seq))) return linear_score + pairwise_score def compute_Zb(G, (linear_weights, pairwise_weights)): pure_pairwise_weights = [[ [pw[i][j] + lwi[i] + lwj[j] for j in range(4)] for i in range(4) ] for pw, (lwi, lwj) in zip(pairwise_weights, pairs(linear_weights))] Ws = [ np.matrix([[exp(w[btoi(b1)][btoi(b2)]) for b2 in "ACGT"] for b1 in "ACGT"]) for w in pure_pairwise_weights ] return np.array([1, 1, 1, 1]).dot(reduce(lambda x, y: x.dot(y),
def phat(s):
    """Return (1 + exp(score - mu))**(-nu) for a site `s` of length L.

    `L`, `matrix`, `mu` and `nu` are free names resolved outside this
    function; the length of `s` is asserted to equal L.
    """
    assert len(s) == L
    return (1 + exp(score_seq(matrix, s) - mu))**(-nu)
def linear_phat(site):
    """Return 1 / (1 + exp(score - mu))**(Ne - 1) for `site` scored against `pssm`.

    `pssm`, `mu` and `Ne` are free names resolved outside this function.
    """
    denominator = (1 + exp(score_seq(pssm, site) - mu))**(Ne - 1)
    return 1 / denominator
def log_phat(s):
    """Return -(Ne - 1) * log(1 + exp(score - mu)) for site `s`.

    `matrix`, `mu` and `Ne` are free names resolved outside this function.
    """
    nu = Ne - 1
    return -nu * log(1 + exp(score_seq(matrix, s) - mu))
def log_fit(site):
    """Return -nu * log(1 + exp(score - mu)) for `site`.

    `matrix`, `mu` and `nu` are free names resolved outside this function.
    """
    ep = score_seq(matrix, site)
    return -nu * log(1 + exp(ep - mu))
def f(site):
    """Apply `phat` to the `score_seq` value of `site`.

    `matrix` and `phat` are free names resolved outside this function.
    """
    return phat(score_seq(matrix, site))
def linear_phat(pssm, site):
    """Return 1 / (1 + exp(score - mu))**(Ne - 1) for `site` scored against `pssm`.

    `mu` and `Ne` are free names resolved outside this function.
    """
    denominator = (1 + exp(score_seq(pssm, site) - mu))**(Ne - 1)
    return 1 / denominator
def fitness(site, matrix, mu, Ne):
    """Return (1 / (1 + exp(score - mu)))**(Ne - 1) for `site` scored against `matrix`."""
    occupancy = 1 / (1 + exp(score_seq(matrix, site) - mu))
    return occupancy**(Ne - 1)
def log_fitness_approx2(matrix, motif, G):
    """approximate log fitness keeping only the background term Zb in the partition function

    Returns -sum(eps) - n*log(Zb), where eps are the sites' `score_seq`
    values, n = len(motif) and Zb = `Zb_from_matrix(matrix, G)`.  The sites'
    own exp(-ep) contributions are left out of the partition function.
    """
    # n was referenced without being bound; it is the number of sites, as in
    # log_fitness.
    n = len(motif)
    eps = [score_seq(matrix, site) for site in motif]
    Zb = Zb_from_matrix(matrix, G)
    return -sum(eps) - n * log(Zb)
def sample_Zb_terms(L, sigma, trials=10000):
    """Score `trials` random sites against one matrix drawn with `sample_matrix(L, sigma)`.

    Returns the list of `score_seq` values, one per `random_site(L)` draw.
    """
    matrix = sample_matrix(L, sigma)
    scores = []
    for _ in xrange(trials):
        scores.append(score_seq(matrix, random_site(L)))
    return scores
def phat(s):
    """Return (1 + exp(score - mu))**(-nu) for site `s`.

    `matrix`, `mu` and `nu` are free names resolved outside this function.
    """
    return (1 + exp(score_seq(matrix, s) - mu))**(-nu)
def log_f(site):
    """Return -nu * log(1 + exp(score - mu)) for `site`.

    `matrix`, `mu` and `nu` are free names resolved outside this function.
    """
    return -nu * log(1 + exp(score_seq(matrix, site) - mu))