def experiment3(trials=10):
    """Compare pairwise (apw) vs linear energy models at the site level.

    For `trials` random codes/PSSMs, measure the sd of site energies over a
    random-site sample, run an MH chain capturing occupancy, and scatter the
    geometric-mean occupancy against the energy sd for each model.
    """
    mu = -10
    Ne = 5
    L = 10
    sigma = 1
    codes = [sample_code(L, sigma) for i in range(trials)]
    pssms = [sample_matrix(L, sigma) for i in range(trials)]
    sites = [random_site(L) for i in xrange(10000)]
    apw_site_sigmas = [sd([score(code, site) for site in sites])
                       for code in codes]
    linear_site_sigmas = [sd([score_seq(pssm, site) for site in sites])
                          for pssm in pssms]

    def apw_phat(code, site):
        ep = score(code, site)
        return 1 / (1 + exp(ep - mu))**(Ne - 1)

    def apw_occ(code, site):
        ep = score(code, site)
        return 1 / (1 + exp(ep - mu))

    def linear_phat(pssm, site):
        ep = score_seq(pssm, site)
        return 1 / (1 + exp(ep - mu))**(Ne - 1)

    def linear_occ(pssm, site):
        ep = score_seq(pssm, site)
        return 1 / (1 + exp(ep - mu))

    # BUG FIX: the geometric mean was computed as exp(mean(log10(xs))),
    # mixing base e with base 10; the correct inverse is 10**mean(log10(xs)).
    apw_mean_fits = [10**mean(map(log10,
                                  mh(lambda s: apw_phat(code, s),
                                     proposal=mutate_site,
                                     x0=random_site(L),
                                     capture_state=lambda s: apw_occ(code, s))[1:]))
                     for code in tqdm(codes)]
    linear_mean_fits = [10**mean(map(log10,
                                     mh(lambda s: linear_phat(pssm, s),
                                        proposal=mutate_site,
                                        x0=random_site(L),
                                        capture_state=lambda s: linear_occ(pssm, s))[1:]))
                        for pssm in tqdm(pssms)]
    plt.scatter(apw_site_sigmas, apw_mean_fits, label='apw')
    plt.scatter(linear_site_sigmas, linear_mean_fits, color='g', label='linear')
    plt.semilogy()
    plt.legend(loc='lower right')
def experiment2_():
    """Run apw vs linear site chains, with the linear model's per-column
    sigma chosen to reproduce the empirical apw site-level sigma."""
    L, sigma = 10, 1
    code = sample_code(L, 1)
    mu, Ne = -10, 2
    sites = [random_site(L) for _ in xrange(10000)]
    apw_eps = [score(code, s) for s in sites]
    site_sigma = sd(apw_eps)
    # per-column sigma so that L columns yield total variance site_sigma**2
    pssm = sample_matrix(L, sqrt(site_sigma**2 / L))

    def apw_phat(site):
        return 1 / (1 + exp(score(code, site) - mu))**(Ne - 1)

    def linear_phat(site):
        return 1 / (1 + exp(score_seq(pssm, site) - mu))**(Ne - 1)

    def sample_apw_site():
        return mh(apw_phat, proposal=mutate_site, x0=random_site(L))

    apw_chain = mh(apw_phat, proposal=mutate_site, x0=random_site(L))
    linear_chain = mh(linear_phat, proposal=mutate_site, x0=random_site(L))
    return map(apw_phat, apw_chain), map(linear_phat, linear_chain)
def experiment3(trials=10):
    """Compare pairwise (apw) vs linear energy models at the site level.

    For `trials` random codes/PSSMs, measure the sd of site energies over a
    random-site sample, run an MH chain capturing occupancy, and scatter the
    geometric-mean occupancy against the energy sd for each model.
    """
    mu = -10
    Ne = 5
    L = 10
    sigma = 1
    codes = [sample_code(L, sigma) for i in range(trials)]
    pssms = [sample_matrix(L, sigma) for i in range(trials)]
    sites = [random_site(L) for i in xrange(10000)]
    apw_site_sigmas = [sd([score(code, site) for site in sites])
                       for code in codes]
    linear_site_sigmas = [sd([score_seq(pssm, site) for site in sites])
                          for pssm in pssms]

    def apw_phat(code, site):
        ep = score(code, site)
        return 1 / (1 + exp(ep - mu))**(Ne - 1)

    def apw_occ(code, site):
        ep = score(code, site)
        return 1 / (1 + exp(ep - mu))

    def linear_phat(pssm, site):
        ep = score_seq(pssm, site)
        return 1 / (1 + exp(ep - mu))**(Ne - 1)

    def linear_occ(pssm, site):
        ep = score_seq(pssm, site)
        return 1 / (1 + exp(ep - mu))

    # BUG FIX: the geometric mean was computed as exp(mean(log10(xs))),
    # mixing base e with base 10; the correct inverse is 10**mean(log10(xs)).
    apw_mean_fits = [10**mean(map(log10,
                                  mh(lambda s: apw_phat(code, s),
                                     proposal=mutate_site,
                                     x0=random_site(L),
                                     capture_state=lambda s: apw_occ(code, s))[1:]))
                     for code in tqdm(codes)]
    linear_mean_fits = [10**mean(map(log10,
                                     mh(lambda s: linear_phat(pssm, s),
                                        proposal=mutate_site,
                                        x0=random_site(L),
                                        capture_state=lambda s: linear_occ(pssm, s))[1:]))
                        for pssm in tqdm(pssms)]
    plt.scatter(apw_site_sigmas, apw_mean_fits, label='apw')
    plt.scatter(linear_site_sigmas, linear_mean_fits, color='g', label='linear')
    plt.semilogy()
    plt.legend(loc='lower right')
def uniform_motif_with_ic_imh_ref(n, L, desired_ic, epsilon=0.1,
                                  iterations=None, verbose=False,
                                  num_chains=8):
    """Sample motifs (n sites, length L) uniformly from the set whose IC lies
    within epsilon of desired_ic, by independence Metropolis-Hastings.

    If `iterations` is given, run one chain of that length; otherwise run
    `num_chains` chains, doubling their length until the Gelman-Rubin
    statistic on the IC traces satisfies R_hat < 1.1.
    """
    # finite-sample IC bias correction per column; beta targets corrected IC
    correction_per_col = 3 / (2 * log(2) * n)
    desired_ic_for_beta = desired_ic + L * correction_per_col
    beta = find_beta_for_mean_motif_ic(n, L, desired_ic_for_beta)
    ps = count_ps_from_beta(n, beta)
    count_sampler = inverse_cdf_sampler(enumerate_counts(n), ps)

    def Q(motif):
        # independence proposal: each column drawn from the tilted count dist;
        # the `motif` argument is ignored
        counts = [count_sampler() for i in range(L)]
        cols = [sample_col_from_count(count) for count in counts]
        motif_p = map(lambda site: "".join(site), transpose(cols))
        return motif_p

    def log_dQ(motif_p, motif):
        # proposal log-density, up to an additive constant
        return (beta * motif_ic(motif_p))

    def log_f(motif):
        # flat target on the epsilon-band around desired_ic, ~ -inf elsewhere
        in_range = abs(motif_ic(motif) - desired_ic) < epsilon
        return 0 if in_range else -10.0**100

    if iterations:
        x0 = sample_until(lambda x: log_f(x) > -1, lambda: Q(None), 1)[0]
        chain = mh(log_f, proposal=Q, dprop=log_dQ, x0=x0,
                   iterations=iterations, use_log=True, verbose=False)
        return chain
    else:
        #use gelman rubin criterion
        x0s = sample_until(lambda x: log_f(x) > -1, lambda: Q(None), num_chains)
        iterations = 100
        # NOTE(review): `converged` is never set True; the loop only exits
        # via the return below -- confirm this is intended.
        converged = False
        chains = [[] for _ in range(num_chains)]
        while not converged:
            for chain, x0 in zip(chains, x0s):
                chain.extend(
                    mh(log_f, proposal=Q, dprop=log_dQ, x0=x0,
                       iterations=iterations, use_log=True, verbose=False))
            ic_chains = mmap(motif_ic, chains)
            R_hat, neff = gelman_rubin(ic_chains)
            if R_hat < 1.1:
                return chains
            else:
                # restart from each chain's last state, doubling the budget
                x0s = [chain[-1] for chain in chains]
                iterations *= 2
def mean_field_test(M=10, K=2, sigma=1, plotting=True): Vs = [[None for j in range(M)] for jp in range(M)] for j in range(M): for jp in range(j + 1, M): d = {(xj, xjp): random.gauss(0, sigma) for xj in range(K) for xjp in range(K)} Vjjp = lambda xj, xjp: d[(xj, xjp)] Vs[j][jp] = Vjjp states = list(itertools.product(*[range(K) for j in range(M)])) def Hp(xs): return sum(Vs[j][jp](xj, xjp) for ((j, xj), (jp, xjp)) in itertools.combinations(enumerate(xs), 2)) mf_hs = mean_field_hs(Vs, K) print "computing Zp" Zp = sum(exp(-beta * Hp(xs)) for xs in states) def P(xs): return exp(-beta * Hp(xs)) / Zp def Hq(xs): return sum(mf_hs[j][xj] for j, xj in enumerate(xs)) print "computing Zq" Zq = sum(exp(-beta * Hq(xs)) for xs in states) def Q(xs): return exp(-beta * Hq(xs)) / Zq # for state in states: # print state,P(state),Q(state) ps = [P(state) for state in states] qs = [Q(state) for state in states] print pearsonr(ps, qs) print "Sp (bits):", sum(-p * log2(p) for p in ps) print "Sq (bits):", sum(-q * log2(q) for q in qs) print "Dkl(P||Q) (bits):", sum(p * log2(p / q) for p, q in zip(ps, qs)) def rQ(xs): """MFA proposal""" return [inverse_cdf_sample(range(K), boltzmann(mf_h)) for mf_h in mf_hs] def rR(xs): """Uniform proposal""" return [random.choice(range(K)) for j in range(M)] mh(f=P, proposal=rQ, dprop=Q, x0=[0] * M) mh(f=P, proposal=rR, x0=[0] * M) if plotting: plt.scatter(ps, qs) plt.xlabel("Ps") plt.ylabel("Qs") plt.loglog() minp, maxp = min(ps), max(ps) minq, maxq = min(qs), max(qs) plt.plot([minp, maxp], [minq, maxq]) plt.xlim(minp, maxp) plt.ylim(minq, maxq) plt.show()
def uniform_motif_with_ic_imh(n, L, desired_ic, epsilon=0.1, iterations=None,
                              verbose=False, beta=None, num_chains=8):
    """Sample motifs with IC within epsilon of desired_ic by independence MH.

    First calibrates the chain length from the observed acceptance rate, then
    runs a chain ~10x the expected waiting time per acceptance.
    NOTE(review): `num_chains` is accepted but unused in this variant, and
    the incoming `iterations` value is overwritten below -- confirm intended.
    """
    if beta is None:
        # finite-sample IC bias correction per column
        correction_per_col = 3 / (2 * log(2) * n)
        desired_ic_for_beta = desired_ic + L * correction_per_col
        beta = find_beta_for_mean_motif_ic(n, L, desired_ic_for_beta)
    ps = count_ps_from_beta(n, beta)
    count_sampler = inverse_cdf_sampler(enumerate_counts(n), ps)

    def Q(motif):
        # independence proposal: columns drawn from the beta-tilted count
        # distribution; the `motif` argument is ignored
        counts = [count_sampler() for i in range(L)]
        cols = [sample_col_from_count(count) for count in counts]
        motif_p = map(lambda site: "".join(site), transpose(cols))
        return motif_p

    def log_dQ(motif_p, motif):
        # proposal log-density, up to an additive constant
        return (beta * motif_ic(motif_p))

    def log_f(motif):
        # flat target on the epsilon-band around desired_ic, ~ -inf elsewhere
        in_range = abs(motif_ic(motif) - desired_ic) < epsilon
        return 0 if in_range else -10.0**100

    x0 = sample_until(lambda x: log_f(x) > -1, lambda: Q(None), 1)[0]
    # first, determine probability of landing in range
    ar = 0
    iterations = 100
    while ar == 0:
        # double the trial length until at least one acceptance is observed
        ar = mh(log_f, proposal=Q, dprop=log_dQ, x0=x0, iterations=iterations,
                use_log=True, verbose=False, return_ar=True)
        iterations *= 2
    # run long enough for ~10 acceptances at the measured rate
    iterations = int(1.0 / ar * 10)
    chain = mh(log_f, proposal=Q, dprop=log_dQ, x0=x0, iterations=iterations,
               use_log=True, verbose=False)
    return chain
def mh_simulate(iterations=50000, verbose=False, method="direct_sampling"):
    """MH simulation of TF occupancy configurations along a chromosome.

    `method` selects the proposal kernel: "direct_sampling", "rsa", or the
    default local-move proposal.
    NOTE(review): reads module-level globals `ks`, `eps`, `beta`,
    `config_len`, plus helpers (`hamiltonian`, `positions`, ...).
    """
    copy_number = 5  # number of TF copies available

    def logf(config):
        return -hamiltonian(config)

    def prop(config):
        # local move: possibly detach a bound TF, then place one at a random
        # position (a position == config_len means "off chromosome")
        new_config = config[:]
        attached_tfs = sum(config)  # number currently bound to chromosome
        r = random.random()
        if r < attached_tfs / float(copy_number):
            # choose a tf on the chromosome
            pos = random.choice(positions(config))
            new_config[pos] = 0
        # else: choose a tf off the chromosome
        new_pos = random.choice(range(config_len + 1))
        if new_pos < config_len:
            new_config[new_pos] = 1
        # else tf goes off chromosome
        return new_config

    Z = float(sum(ks))
    ps = [k / Z for k in ks]
    sampler = inverse_cdf_sampler(range(len(ks)), ps)

    def prop_direct(config):
        # independence proposal: direct sampling of bound positions
        sample = direct_sampling(ks, copy_number, sampler=sampler)
        return from_positions(sample)

    def log_dprop_direct(config, old_config):
        # log proposal density: copy-number combinatorics x Boltzmann weights
        occupancy = sum(config)
        return log(falling_fac(copy_number, occupancy) *
                   product(exp(-beta * eps[i] * config[i])
                           for i in range(config_len)))

    def prop_rsa(config):
        # independence proposal: random sequential adsorption
        sample = rsa(ks, copy_number)
        return from_positions(sample)

    def log_dprop_rsa(config, old_config):
        # sequential probability of drawing exactly this configuration,
        # removing each chosen site's weight as it is filled
        _ks = ks[:]
        prob = 1
        for i, x in enumerate(config):
            if x > 0:
                prob *= _ks[i] / sum(_ks)
                _ks[i] = 0
        return log(prob)

    x0 = [0] * config_len
    if method == "direct_sampling":
        return mh(logf, prop_direct, x0, dprop=log_dprop_direct,
                  verbose=verbose, use_log=True, iterations=iterations)
    elif method == "rsa":
        return mh(logf, prop_rsa, x0, dprop=log_dprop_rsa, verbose=verbose,
                  use_log=True, iterations=iterations)
    else:
        return mh(logf, prop, x0, dprop=None, verbose=verbose, use_log=True,
                  iterations=iterations)
def explore_coupling_const(iterations=1000000):
    """Given 3 state system, explore spin probabilities as function of
    coupling strength"""
    N = 10
    x0 = [0] * N
    # strong positive fields: spins prefer 0 unless the coupling pays for 1
    hs = [log(1000000)] * N

    def hamil(xs, J):
        # field term plus coupling on x0 and all adjacent pairs
        return dot(xs, hs) + J * (xs[0] + sum([xi * xj
                                               for (xi, xj) in pairs(xs)]))

    Js = interpolate(-16, -8 + 1, 20)  # coupling strengths to scan

    def proposal(xs):
        # independent uniform resampling of every spin
        return [int(random.random() < 0.5) for i in range(N)]

    results = []
    for J in Js:
        chain = mh(f=lambda xs: -hamil(xs, J), proposal=proposal, x0=x0,
                   use_log=True, iterations=iterations)
        ps = map(mean, transpose(chain))  # per-spin marginal probabilities
        results.append((J, ps))
    Js, pss = transpose(results)
    pss = transpose(pss)  # now indexed [spin][J]
    colors = "bgrcmyk"
    for i, ps in enumerate(pss):
        color = colors[i % len(colors)]
        plt.plot(Js, ps, marker="o", linestyle="", color=color)
        # binomial error bars around the predicted power-law p**(i+1)
        errs = [p + 1.96 * sqrt(p * (1 - p) / iterations) ** (i + 1) + p ** (i + 1)
                for p in pss[0]]
        print i, errs
        plt.plot(Js, [p ** (i + 1) for p in pss[0]])
        # plt.errorbar(Js,[p**(i+1) for p in pss[0]],yerr=errs,
        #              marker='',linestyle='--',color=color)
    # detection floor: one occurrence in `iterations` samples
    plt.plot(Js, [1.0 / iterations for J in Js])
    # plt.semilogy()
    return results
def sella_hirsch_imh(matrix, n, Ne, iterations=50000):
    """Independence MH sampling of motifs under Sella-Hirsch log-fitness.

    Proposes motifs from the foreground-neglecting sampler and corrects with
    its density.  Returns (matrix, chain).
    NOTE(review): `G` is read from module scope.
    """
    f = lambda motif: log_fitness(matrix, motif, G)
    nu = Ne - 1
    # per-column Boltzmann proposal weights, tilted by nu
    pss = [normalize([exp(-nu * ep) for ep in col]) for col in matrix]
    rq = lambda motif: sample_motif_neglect_fg(matrix, n, Ne, pss=pss)
    dq = lambda motif_prime, motif: dsample_motif_neglect_fg(matrix, motif_prime,
                                                            Ne, pss=pss)
    # BUG FIX: `iterations` was accepted but never forwarded to mh, so the
    # chain always ran with mh's default length.
    return matrix, mh(f, rq, rq(None), dprop=dq, use_log=True,
                      iterations=iterations)
def infer_synthetic_energy_model(num_reads=100000):
    """the whole show: infer the energy model from true reads

    Builds a synthetic ground truth (matrix, mu), simulates reads from its
    Fermi-Dirac occupancies, then runs MH over (matrix, mu) states against
    the read-density likelihood.  NOTE(review): `genome`, `log_dprop` and
    `capture_state` come from module scope.
    """
    G = len(genome)
    w = 10  # motif width
    true_matrix = [[-2, 0, 0, 0] for _ in range(w)]
    true_mu = -20
    true_eps = score_genome_np(true_matrix, genome)
    true_ps = fd_solve_np(true_eps, true_mu)  # Fermi-Dirac occupancies
    MFL = 250  #mean frag length = 250bp
    lamb = 1 / 250.0
    true_reads = reads_from_ps(true_ps, MFL, min_seq_len=75,
                               num_reads=num_reads)
    true_rdm = density_from_reads(true_reads, G)
    init_matrix = random_energy_matrix(w)
    init_mu = -20
    init_scores = score_genome_np(init_matrix, genome)
    # MH state is ((matrix, mu), cached genome scores)
    init_state = ((init_matrix, init_mu), init_scores)
    logf = lambda state: timestamp(complete_log_likelihood(state, true_rdm,
                                                           lamb,
                                                           num_reads=num_reads))
    rprop = lambda state: complete_rprop(state, genome)
    verbose = True
    iterations = 50000
    print "true_ll:", logf(((true_matrix, true_mu), true_eps))
    matrix_chain = mh(logf, proposal=rprop, x0=init_state, dprop=log_dprop,
                      capture_state=capture_state, verbose=verbose,
                      use_log=True, iterations=iterations, modulus=100)
    return matrix_chain
def main(G=5000000, iterations=50000, init_matrix=None, init_mu=None,
         verbose=True):
    """Test case for FD-inference

    Generates a random genome, scores it with the true energy matrix,
    calibrates the chemical potential to match target occupancy q, simulates
    a ChIP dataset, then runs MH over (matrix, mu) states.
    NOTE(review): `q`, `w`, `TRUE_ENERGY_MATRIX`, `MEAN_FRAGMENT_LENGTH`,
    `NUM_CELLS_ORIGINAL`, `log_dprop`, `capture_state` come from module scope.
    """
    print "generating genome"
    genome = random_site(G)
    print "generating eps"
    eps = score_genome_np(TRUE_ENERGY_MATRIX, genome)
    # calibrate mu so total Fermi-Dirac occupancy equals q
    min_mu, max_mu = -40, 0
    mu = bisect_interval(lambda mu: np.sum(fd_solve_np(eps, mu)) - q,
                         min_mu, max_mu, verbose=True, tolerance=1e-1)
    print "computing ps"
    true_ps = fd_solve_np(eps, mu)
    print "true q:", np.sum(true_ps)
    print "generating chip dataset"
    mapped_reads = np.array(map_reads_np(chip_ps_np(true_ps,
                                                    MEAN_FRAGMENT_LENGTH,
                                                    NUM_CELLS_ORIGINAL), G))
    print "finished chip dataset"
    if init_matrix is None:
        init_matrix = random_energy_matrix(w)
    if init_mu is None:
        init_mu = -20  #random.random()*40 - 20
    init_scores = score_genome_np(init_matrix, genome)
    # MH state is ((matrix, mu), cached genome scores)
    init_state = ((init_matrix, init_mu), init_scores)
    logf = lambda state: complete_log_likelihood(state, mapped_reads)
    print "true mu:", mu
    print "true log_likelihood:", logf(((TRUE_ENERGY_MATRIX, mu), eps))
    rprop = lambda state: complete_rprop(state, genome)
    print "hitting mh loop"
    matrix_chain = mh(logf, proposal=rprop, x0=init_state, dprop=log_dprop,
                      capture_state=capture_state, verbose=verbose,
                      use_log=True, iterations=iterations, modulus=100)
    return matrix_chain, genome, mapped_reads
def evo_ic_sample_motif(N, L, des_ic, beta=1, theta=None, iterations=10000,
                        verbose=False):
    """Do MH over evo param space with likelihood function proportional to IC mismatch"""
    # default starting point: random Gaussian matrix, mu = -10, Ne = 2
    matrix0 = [[random.gauss(0, 1) for _ in range(4)] for i in range(L)]
    mu0 = -10
    Ne0 = 2
    if theta is None:
        theta = (matrix0, mu0, Ne0)

    def f(theta):
        # Gaussian-shaped likelihood in the IC mismatch of a CFTP-sampled motif
        matrix, mu, Ne = theta
        mismatch = motif_ic(sample_motif_cftp(matrix, mu, Ne, N)) - des_ic
        return exp(-beta * mismatch**2)

    return mh(f, prop2, theta, iterations=iterations, verbose=verbose,
              cache=False)
def evo_ic_sample_motif2(N, L, des_ic, beta=1, theta=None, iterations=10000,
                         prop_sigma=1, trials=1, verbose=False):
    """Do MH over evo param space with likelihood function proportional to IC mismatch"""
    if theta is None:
        # default starting point in (sigma, mu, Ne) space
        sigma0 = 1
        mu0 = -10
        Ne0 = 2
        theta = (sigma0, mu0, Ne0)

    def f(theta):
        # likelihood: Gaussian in the mean-IC mismatch over `trials` matrices
        sigma, mu, Ne = theta
        matrices = [sample_matrix(L, sigma) for i in xrange(trials)]
        motifs = [sample_motif_cftp(matrix, mu, Ne, N) for matrix in matrices]
        ics = map(motif_ic, motifs)
        ic = mean(ics)
        print "sigma, mu, Ne:", sigma, mu, Ne
        print "mean IC:", ic
        return exp(-beta * (ic - des_ic)**2)

    def prop(theta):
        # Gaussian random walk, with sigma floored at 0.01 and Ne at 1
        thetap = (max(0.01, theta[0] + random.gauss(0, prop_sigma)),
                  theta[1] + random.gauss(0, prop_sigma),
                  max(1, theta[2] + random.gauss(0, prop_sigma)))
        return thetap

    chain = mh(f, prop, theta, iterations=iterations, verbose=verbose,
               cache=False)
    return chain
def estremo_gibbs(iterations=50000, verbose=False, every=1000, sigma=1, mu=-10, Ne=5): nu = Ne - 1 L = 10 N = 20 code, motif = (sample_code(L=10, sigma=1), random_motif(length=L, num_sites=N)) def log_f((code, motif)): eps = map(lambda x: -log(x), pw_prob_sites(motif, code)) return sum(nu * log(1 / (1 + exp(ep - mu))) for ep in eps) chain = [(code, motif[:])] print log_f((code, motif)) for iteration in trange(iterations): for i in range(N): site = motif[i] for j in range(L): b = site[j] log_ps = [] bps = [bp for bp in "ACGT" if not bp == b] for bp in bps: site_p = subst(site, bp, j) log_ps.append(log_f((code, [site_p]))) log_ps = [p - min(log_ps) for p in log_ps] bp = inverse_cdf_sample(bps, map(exp, log_ps), normalized=False) motif[i] = subst(site, bp, j) for k in range(L - 1): for b1 in "ACGT": for b2 in "ACGT": dws = [random.gauss(0, 0.1) for _ in range(10)] code_ps = [[d.copy() for d in code] for _ in range(10)] for code_p, dw in zip(code_ps, dws): code_p[k][b1, b2] += dw log_ps = [log_f((code_p, motif)) for code_p in code_ps] log_ps = [p - min(log_ps) for p in log_ps] code_p = inverse_cdf_sample(code_ps, map(exp, log_ps), normalized=False) code = code_p print log_f((code, motif)) chain.append((code, motif[:])) return chain x0 = (sample_code(L=10, sigma=1), random_motif(length=10, num_sites=20)) chain = mh(log_f, prop, x0, use_log=True, iterations=iterations, verbose=verbose, every=every) return chain
def linear_fit(sigma, mu, Ne):
    """Mean equilibrium occupancy under a random linear (PSSM) model.

    Runs an MH chain over sites, capturing occupancy, and averages the
    post-burn-in half.  `L` and `linear_occ` come from module scope.
    """
    pssm = sample_matrix(L, sigma)

    def linear_phat(site):
        return 1 / (1 + exp(score_seq(pssm, site) - mu))**(Ne - 1)

    occs = mh(lambda s: linear_phat(s),
              proposal=mutate_site,
              x0=random_site(L),
              capture_state=lambda s: linear_occ(pssm, mu, s))
    return mean(occs[25000:])  # discard burn-in
def apw_fit(sigma, mu, Ne):
    """Mean equilibrium occupancy under a random pairwise (apw) code.

    Runs an MH chain over sites, capturing occupancy, and averages the
    post-burn-in half.  `L` and `apw_occ` come from module scope.
    """
    code = sample_code(L, sigma)

    def apw_phat(site):
        return 1 / (1 + exp(score(code, site) - mu))**(Ne - 1)

    occs = mh(lambda s: apw_phat(s),
              proposal=mutate_site,
              x0=random_site(L),
              capture_state=lambda s: apw_occ(code, mu, s))
    return mean(occs[25000:])  # discard burn-in
def bsh_chain(N=1000,iterations=50000): L = 20 n = 10 log_f = lambda(tf,motif):N*log(fitness((tf,motif))) prop = mutate x0 = ringer(n,L)#([random.choice([0,1]) for i in range(L)],random_motif(L,n)) chain = mh(log_f,prop,x0,use_log=True,iterations=iterations) return chain
def sample_model(model, iterations=50000, x0=None):
    """MH chain over sites scored by `model`.

    The model has k = len(model) entries; L is recovered from k via the
    inverse-triangular-number formula below.  `x0` optionally seeds the
    chain; otherwise a random site is used.
    """
    k = len(model)
    L = int(1 + sqrt(1 + 8 * k) / 2)
    if x0 is None:
        x0 = random_site(L)
    # BUG FIX: x0 was computed (or passed in) but ignored -- the chain
    # always restarted from a fresh random site.
    chain = mh(lambda s: score(model, s),
               proposal=mutate_site,
               x0=x0,
               use_log=True,
               iterations=iterations)
    return chain
def mr_system_mh(alphas,G=100000.0,n=16,L=10): scale = 10000 #lower means less stringent matrix = [[0,0,0,0] for i in range(L)] motif = [random_site(L) for i in range(n)] scaled_sse = lambda matrix,motif:(sse(matrix,motif,alphas,G,n))*scale return mh(lambda (matrix,motif):exp(-scaled_sse(matrix,motif)), lambda (matrix,motif):propose(matrix,motif), (matrix,motif), iterations=100000, every=1000,verbose=True)
def sample_model(model, iterations=50000, x0=None):
    """MH chain over sites scored by `model`.

    The model has k = len(model) entries; L is recovered from k via the
    inverse-triangular-number formula below.  `x0` optionally seeds the
    chain; otherwise a random site is used.
    """
    k = len(model)
    L = int(1 + sqrt(1 + 8 * k) / 2)
    if x0 is None:
        x0 = random_site(L)
    # BUG FIX: x0 was computed (or passed in) but ignored -- the chain
    # always restarted from a fresh random site.
    chain = mh(lambda s: score(model, s),
               proposal=mutate_site,
               x0=x0,
               use_log=True,
               iterations=iterations)
    return chain
def experiment1_():
    """Run apw vs linearized-apw site chains and return their fitness traces."""
    L = 10
    sigma = 1
    code = sample_code(L, 1)
    mu, Ne = -10, 2
    pssm = linearize(code)  # linear approximation of the pairwise code

    def apw_phat(site):
        return 1 / (1 + exp(score(code, site) - mu))**(Ne - 1)

    def linear_phat(site):
        return 1 / (1 + exp(score_seq(pssm, site) - mu))**(Ne - 1)

    def sample_apw_site():
        return mh(apw_phat, proposal=mutate_site, x0=random_site(L))

    apw_chain = mh(apw_phat, proposal=mutate_site, x0=random_site(L))
    linear_chain = mh(linear_phat, proposal=mutate_site, x0=random_site(L))
    return map(apw_phat, apw_chain), map(linear_phat, linear_chain)
def sample_pw_motif_mh(code, N, Ne, iterations=50000):
    """MH sampling of N-site motifs under a pairwise energy code."""
    nu = Ne - 1
    L = len(code) + 1  # a pairwise code of length L-1 scores sites of length L

    def log_f(motif):
        # site energies are negative log-probabilities under the pw model
        eps = [-log(p) for p in pw_prob_sites(motif, code)]
        return sum(log(1 / (1 + exp(ep - mu))**nu) for ep in eps)

    x0 = random_motif(L, N)
    return mh(log_f, mutate_motif, x0, cache=True, use_log=True,
              iterations=iterations)
def apw_fit(sigma, mu, Ne):
    """Mean equilibrium occupancy under a random pairwise (apw) code.

    Runs an MH chain over sites, capturing occupancy, and averages the
    post-burn-in half.  `L` and `apw_occ` come from module scope.
    """
    code = sample_code(L, sigma)

    def apw_phat(site):
        return 1 / (1 + exp(score(code, site) - mu))**(Ne - 1)

    occs = mh(lambda s: apw_phat(s),
              proposal=mutate_site,
              x0=random_site(L),
              capture_state=lambda s: apw_occ(code, mu, s))
    return mean(occs[25000:])  # discard burn-in
def linear_fit(sigma, mu, Ne):
    """Mean equilibrium occupancy under a random linear (PSSM) model.

    Runs an MH chain over sites, capturing occupancy, and averages the
    post-burn-in half.  `L` and `linear_occ` come from module scope.
    """
    pssm = sample_matrix(L, sigma)

    def linear_phat(site):
        return 1 / (1 + exp(score_seq(pssm, site) - mu))**(Ne - 1)

    occs = mh(lambda s: linear_phat(s),
              proposal=mutate_site,
              x0=random_site(L),
              capture_state=lambda s: linear_occ(pssm, mu, s))
    return mean(occs[25000:])  # discard burn-in
def evo_ic_sample_motif(N, L, des_ic, beta=1, theta=None, iterations=10000,
                        verbose=False):
    """Do MH over evo param space with likelihood function proportional to IC mismatch"""
    # default starting point: random Gaussian matrix, mu = -10, Ne = 2
    matrix0 = [[random.gauss(0, 1) for _ in range(4)] for i in range(L)]
    mu0 = -10
    Ne0 = 2
    if theta is None:
        theta = (matrix0, mu0, Ne0)

    def f(theta):
        # Gaussian-shaped likelihood in the IC mismatch of a CFTP-sampled motif
        matrix, mu, Ne = theta
        mismatch = motif_ic(sample_motif_cftp(matrix, mu, Ne, N)) - des_ic
        return exp(-beta * mismatch**2)

    return mh(f, prop2, theta, iterations=iterations, verbose=verbose,
              cache=False)
def sample_site_imh(matrix, mu, Ne, lamb, iterations=None):
    """Draw one site by independence MH under a linear energy model.

    Proposals come from an exponentially tilted PSFM; returns the final
    state of the chain.  Defaults to 10*L iterations.
    """
    nu = Ne - 1
    L = len(matrix)
    if iterations is None:
        iterations = 10 * L
    # nu-tilted Fermi-Dirac log-fitness of the site's energy
    log_phat = lambda site: -nu * log(1 + exp(score_seq(matrix, site) - mu))
    tilted_psfm = psfm_from_matrix(matrix, lamb=lamb)
    log_tilted_psfm = [map(log, row) for row in tilted_psfm]

    def prop(_):
        # independence proposal: current state is ignored
        return sample_from_psfm(tilted_psfm)

    def log_dprop(xp, _):
        return score_seq(log_tilted_psfm, xp)

    # BUG FIX: `iterations` was computed/accepted but never passed to mh,
    # so the chain always ran with mh's default length.
    return mh(log_phat, proposal=prop, dprop=log_dprop, x0=prop(None),
              use_log=True, iterations=iterations)[-1]
def site_mh(matrix, mu, Ne, iterations=50000):
    """MH chain over sites, scored by log_Pe under `matrix`."""
    site_mu = site_mu_from_matrix(matrix)
    site_sigma = site_sigma_from_matrix(matrix)
    L = len(matrix)
    nu = Ne - 1

    def log_f(site):
        return log_Pe(score_seq(matrix, site), site_mu, site_sigma, mu, Ne)

    return mh(log_f, lambda site: mutate_site(site), x0=random_site(L),
              use_log=True, iterations=iterations)
def sella_hirsch_mh_sampling(n=16, L=16, G=1000, N=100, sigma=1,
                             iterations=50000):
    """Sample motifs under Sella-Hirsch fitness (foreground vs background Zb)."""
    Zb = compute_Zb(n, L, sigma, G)
    nu = N - 1

    def fitness(motif):
        # mismatch energies relative to the all-"A" consensus
        # NOTE(review): sigma multiplies both eps and the exponent (effective
        # scale sigma**2) -- confirm this is intended.
        eps = [sigma * sum(b != "A" for b in site) for site in motif]
        fg = sum(exp(-sigma * ep) for ep in eps)
        return fg / (fg + Zb)

    def log_p(motif):
        return (nu * log(fitness(motif)))

    def proposal(motif):
        # expect ~4 point mutations per proposed motif
        p = 4.0 / (n * L)
        return mutate_motif_p(motif, p)

    return mh(log_p, proposal, random_motif(n, L), use_log=True,
              iterations=iterations)
def experiment2_():
    """Run apw vs linear site chains, with the linear model's per-column
    sigma chosen to reproduce the empirical apw site-level sigma."""
    L, sigma = 10, 1
    code = sample_code(L, 1)
    mu, Ne = -10, 2
    sites = [random_site(L) for _ in xrange(10000)]
    apw_eps = [score(code, s) for s in sites]
    site_sigma = sd(apw_eps)
    # per-column sigma so that L columns yield total variance site_sigma**2
    pssm = sample_matrix(L, sqrt(site_sigma**2 / L))

    def apw_phat(site):
        return 1 / (1 + exp(score(code, site) - mu))**(Ne - 1)

    def linear_phat(site):
        return 1 / (1 + exp(score_seq(pssm, site) - mu))**(Ne - 1)

    def sample_apw_site():
        return mh(apw_phat, proposal=mutate_site, x0=random_site(L))

    apw_chain = mh(apw_phat, proposal=mutate_site, x0=random_site(L))
    linear_chain = mh(linear_phat, proposal=mutate_site, x0=random_site(L))
    return map(apw_phat, apw_chain), map(linear_phat, linear_chain)
def experiment1_():
    """Run apw vs linearized-apw site chains and return their fitness traces."""
    L = 10
    sigma = 1
    code = sample_code(L, 1)
    mu, Ne = -10, 2
    pssm = linearize(code)  # linear approximation of the pairwise code

    def apw_phat(site):
        return 1 / (1 + exp(score(code, site) - mu))**(Ne - 1)

    def linear_phat(site):
        return 1 / (1 + exp(score_seq(pssm, site) - mu))**(Ne - 1)

    def sample_apw_site():
        return mh(apw_phat, proposal=mutate_site, x0=random_site(L))

    apw_chain = mh(apw_phat, proposal=mutate_site, x0=random_site(L))
    linear_chain = mh(linear_phat, proposal=mutate_site, x0=random_site(L))
    return map(apw_phat, apw_chain), map(linear_phat, linear_chain)
def mh_ising(hs, J, iterations=50000, verbose=False):
    """MH sampling of a periodic 1D Ising chain with fields `hs`, coupling J.

    Returns the per-site mean up-spin occupancy over the chain.
    """
    x0 = [random.choice([-1, 1]) for _ in hs]
    N = len(hs)
    iterations *= N  # interpret `iterations` as sweeps per spin

    def hamil(ss):
        field = sum([s * h for (s, h) in zip(ss, hs)])
        coupling = J * (sum(ss[i] * ss[(i + 1) % N] for i in range(N)))
        return field + coupling

    def f(ss):
        return -hamil(ss)

    def flip_one(ss):
        i = random.randrange(N)
        out = ss[:]
        out[i] *= -1
        return out

    chain = mh(f, flip_one, x0, iterations=iterations, verbose=verbose,
               use_log=True)
    # map spins {-1,+1} -> {0,1} (exact under integer division) and average
    return map(lambda spin: mean([(s + 1) / 2 for s in spin]),
               transpose(chain))
def metropolis_pb(ks, q, verbose=False, mu_offset=0, iterations=50000):
    """Metropolis-Hastings sampling for ks, given product-bernoulli proposal function"""
    G = len(ks)
    eps = [-log(k) for k in ks]  # site energies from weights
    # chemical potential matching expected occupancy q (plus optional offset)
    f = lambda mu: sum(fd(ep, mu) for ep in eps) - q
    mu = bisect_interval(f, -50, 50) + mu_offset

    def weight(ss):
        # unnormalized target: copy-number combinatorics x site weights
        return (falling_fac(q, sum(ss)) * product(k**s for k, s in zip(ks, ss)))

    def proposal(ss):
        # independent product-Bernoulli state at chemical potential mu
        #state = [int(random.random() < p) for _ in range(len(ss))]
        state = rstate(eps, mu)
        #print "proposed state with occ:",sum(state)
        return state

    def dprop(ss):
        # NOTE(review): mh's dprop is called with two arguments at other
        # sites in this file; confirm mh also supports this one-arg form.
        prop = dstate(ss, eps, mu)
        #print "prop:",prop
        return prop

    x0 = proposal([0] * len(ks))
    return mh(weight, proposal, x0, dprop=dprop, verbose=verbose,
              iterations=iterations)
def infer_arca_energy_model(num_reads=1000000):
    """the whole show: infer the energy model from true reads

    Loads ArcA reads, builds a read-density map, and runs MH over
    (matrix, mu) states against the read-density likelihood.
    NOTE(review): `genome`, `log_dprop` and `capture_state` come from
    module scope.
    """
    true_reads = get_arca_reads(num_reads)
    G = len(genome)
    lamb = 1 / 250.0  # fragment-length rate (mean 250bp)
    true_rdm = density_from_reads(true_reads, G)
    w = 10  # motif width
    init_matrix = random_energy_matrix(w)
    init_mu = -20
    init_scores = score_genome_np(init_matrix, genome)
    # MH state is ((matrix, mu), cached genome scores)
    init_state = ((init_matrix, init_mu), init_scores)
    logf = lambda state: timestamp(complete_log_likelihood(state, true_rdm,
                                                           lamb, num_reads))
    rprop = lambda state: complete_rprop(state, genome)
    verbose = True
    iterations = 50000
    matrix_chain = mh(logf, proposal=rprop, x0=init_state, dprop=log_dprop,
                      capture_state=capture_state, verbose=verbose,
                      use_log=True, iterations=iterations, modulus=100)
    return matrix_chain
def uniform_motif_imh_tv(n, L, desired_ic, beta=None, epsilon=None, tv=0.01): """run uniform imh to within total variation bound tv""" correction_per_col = 3 / (2 * log(2) * n) desired_ic_for_beta = desired_ic + L * correction_per_col if beta == None: beta = find_beta_for_mean_motif_ic(n, L, desired_ic_for_beta) if epsilon == None: epsilon = 1.0 / (2 * beta) print "maximally efficient epsilon:", epsilon ps = count_ps_from_beta(n, beta) count_sampler = inverse_cdf_sampler(enumerate_counts(n), ps) def Qp(motif): counts = [count_sampler() for i in range(L)] cols = [sample_col_from_count(count) for count in counts] motif_p = map(lambda site: "".join(site), transpose(cols)) return motif_p def Q(motif): return sample_until(lambda m: abs(motif_ic(m) - desired_ic) < epsilon, lambda: Qp(None), 1)[0] def log_dQ(motif_p, motif): return (beta * motif_ic(motif_p)) def log_f(motif): in_range = abs(motif_ic(motif) - desired_ic) < epsilon return 0 if in_range else -10.0**100 alpha = exp(-2 * beta * epsilon) iterations = int(ceil(log(tv) / log(1 - alpha))) print "iterations:", iterations x0 = sample_until(lambda x: log_f(x) > -1, lambda: Q(None), 1)[0] # first, determine probability of landing in range chain = mh(log_f, proposal=Q, dprop=log_dQ, x0=x0, iterations=iterations, use_log=True, verbose=False) return chain
def sample_site_imh(matrix, mu, Ne, lamb, iterations=None):
    """Draw one site by independence MH under a linear energy model.

    Proposals come from an exponentially tilted PSFM; returns the final
    state of the chain.  Defaults to 10*L iterations.
    """
    nu = Ne - 1
    L = len(matrix)
    if iterations is None:
        iterations = 10 * L
    # nu-tilted Fermi-Dirac log-fitness of the site's energy
    log_phat = lambda site: -nu * log(1 + exp(score_seq(matrix, site) - mu))
    tilted_psfm = psfm_from_matrix(matrix, lamb=lamb)
    log_tilted_psfm = [map(log, row) for row in tilted_psfm]

    def prop(_):
        # independence proposal: current state is ignored
        return sample_from_psfm(tilted_psfm)

    def log_dprop(xp, _):
        return score_seq(log_tilted_psfm, xp)

    # BUG FIX: `iterations` was computed/accepted but never passed to mh,
    # so the chain always ran with mh's default length.
    return mh(log_phat, proposal=prop, dprop=log_dprop, x0=prop(None),
              use_log=True, iterations=iterations)[-1]
def evo_ic_sample_motif2(N, L, des_ic, beta=1, theta=None, iterations=10000,
                         prop_sigma=1, trials=1, verbose=False):
    """Do MH over evo param space with likelihood function proportional to IC mismatch"""
    if theta is None:
        # default starting point in (sigma, mu, Ne) space
        sigma0 = 1
        mu0 = -10
        Ne0 = 2
        theta = (sigma0, mu0, Ne0)

    def f(theta):
        # likelihood: Gaussian in the mean-IC mismatch over `trials` matrices
        sigma, mu, Ne = theta
        matrices = [sample_matrix(L, sigma) for i in xrange(trials)]
        motifs = [sample_motif_cftp(matrix, mu, Ne, N) for matrix in matrices]
        ics = map(motif_ic, motifs)
        ic = mean(ics)
        print "sigma, mu, Ne:", sigma, mu, Ne
        print "mean IC:", ic
        return exp(-beta * (ic - des_ic)**2)

    def prop(theta):
        # Gaussian random walk, with sigma floored at 0.01 and Ne at 1
        #print "propping:", theta
        thetap = (max(0.01, theta[0] + random.gauss(0, prop_sigma)),
                  theta[1] + random.gauss(0, prop_sigma),
                  max(1, theta[2] + random.gauss(0, prop_sigma)))
        #print "thetap:", thetap
        return thetap

    chain = mh(f, prop, theta, iterations=iterations, verbose=verbose,
               cache=False)
    return chain
def metropolis_uniform(ks, q, verbose=False, mu_offset=0, iterations=50000):
    """Metropolis-Hastings sampling for ks, given uniform proposal function"""
    G = len(ks)
    eps = [-log(k) for k in ks]  # site energies from weights
    # chemical potential matching expected occupancy q (plus optional offset)
    f = lambda mu: sum(fd(ep, mu) for ep in eps) - q
    mu = bisect_interval(f, -50, 50) + mu_offset

    def weight(ss):
        # unnormalized target: copy-number combinatorics x site weights
        return (falling_fac(q, sum(ss)) * product(k**s for k, s in zip(ks, ss)))

    def proposal(ss):
        # move one TF: possibly detach a bound one, then place one uniformly
        # among the free sites (new_pos == -1 means "off chromosome")
        on_chr_prob = sum(ss) / float(q)
        on_chr = random.random() < on_chr_prob
        ss_new = ss[:]
        if on_chr:
            pos = random.choice([i for (i, s) in enumerate(ss) if s])
            ss_new[pos] = 0
        new_pos = random.choice([-1] + [i for (i, s) in enumerate(ss) if not s])
        if new_pos >= 0:
            ss_new[new_pos] = 1
        return ss_new

    x0 = proposal([0] * len(ks))
    return mh(weight, proposal, x0, verbose=verbose, iterations=iterations)
def main():
    """Compare lifted MH sampling against direct sampling for the Gq
    occupancy model and plot mean occupancy along the chromosome."""
    sigma = 8
    ks = [1] + [exp(random.gauss(0, sigma)) for i in range(100)]  #k0 is off-state
    G = len(ks) - 1
    q = 5  # TF copy number

    def Pstar(xs):
        """Compute probability of config under Gq model, up to Z"""
        weight = falling_fac(q, len([x for x in xs if x > 0]))
        return weight * product([ks[x] for x in xs])

    def rQ(xs):
        """given current configuration, sample one independently using rsa"""
        return smart_rsa(ks, q)

    def dQ(xs, xs_last):
        """Return probability of configuration under rsa"""
        # sequential draw probability; occupied sites are removed as placed
        _ks = ks[:]
        prob = 1
        for x in xs:
            k = _ks[x]
            prob *= k / sum(_ks)
            if x > 0:
                _ks[x] = 0
        return prob

    tic = time.time()
    chain = mh(Pstar, rQ, [0, 0, 0, 0, 0], dQ)
    toc = time.time()
    print "ran chain in:", toc - tic
    print "starting direct sampling"
    tic = time.time()
    test_xs = [direct_sampling(ks, q)
               for i in verbose_gen(xrange(50001), modulus=1)]
    toc = time.time()
    print "direct sampling in:", toc - tic
    # convert position-lists to per-coordinate occupancy vectors
    ss = [ss_from_xs(xs, G) for xs in chain]
    test_ss = [ss_from_xs(xs, G) for xs in test_xs]
    plt.plot(map(mean, transpose(ss)), label="Lifting")
    plt.plot(map(mean, transpose(test_ss)), label="Direct Sampling")
    plt.xlabel("Chromosomal coordinate")
    plt.ylabel("Occupancy")
    plt.legend()
    plt.show()
def recovery():
    """Synthetic ChIP recovery test: simulate fragments around one binding
    site, then MH over binary per-position binding hypotheses.

    NOTE(review): `chain` is computed but never returned -- confirm whether
    a return was intended.
    """
    G = 10000
    config = [G / 2]  # one true binding site at the genome midpoint
    mfl = 250  # mean fragment length
    lamb = 1 / float(mfl)
    num_frags = 1000
    frags = concat([chip(G, config, mfl) for i in xrange(num_frags)])
    min_seq_length = 75
    # keep only fragments long enough to sequence
    sequenced_frags = filter(lambda (start, stop): stop - start > min_seq_length,
                             frags)
    # split randomly into forward- and backward-strand reads
    fd_frags, bk_frags = separate(lambda x: random.random() < 0.5,
                                  sequenced_frags)
    fd_reads = [('+', start, start + 75) for (start, stop) in fd_frags]
    bk_reads = [('-', stop - 75, stop) for (start, stop) in bk_frags]
    reads = fd_reads + bk_reads
    hyp0 = [int(random.random() < 0.5) for i in range(G)]

    def f(hyp):
        return log_likelihood(reads, hyp, lamb, G)

    def prop(hyp):
        # flip one position of the binary hypothesis
        i = random.randrange(G)
        hyp_copy = hyp[:]
        hyp_copy[i] = 1 - hyp_copy[i]
        return hyp_copy

    chain = mh(f, prop, hyp0, use_log=True, verbose=True)
def chip_ps_ising(ps, mean_frag_length, cells=10000, iterations=50000,
                  x0=None, verbose=False):
    """MH sampling of fragment-occupancy configurations under an Ising-style
    Hamiltonian derived from per-position probabilities `ps` (numpy array).

    NOTE(review): `cells` is accepted but unused; only the single-flip
    `propose` kernel is wired into mh -- propose2/propose3 and log_dprop2
    are unused alternatives.
    """
    eps = -np.log(ps / (1 - ps))  # logit energies
    lamb = 1.0 / mean_frag_length
    coupling = -mean(eps) + log(lamb / (1 - lamb))
    #coupling = -log(mean_frag_length)
    G = len(eps)
    if x0 is None:
        x0 = np.zeros(G)

    def hamiltonian(xs):
        field_contrib = np.dot(xs, eps)
        # bonus for adjacent equal-and-occupied positions [...,1,1,...]
        coupling_contrib = coupling * np.dot(np.diff(xs) == 0, xs[:-1])
        # if random.random() < 0.001:
        #     print "field contrib:",field_contrib,"coupling contrib:",coupling_contrib
        return field_contrib + coupling_contrib
        #return coupling/2.0*sum(np.diff(xs)) # penalty for differences

    def propose(xs):
        # single-position flip, with occasional diagnostic print
        if random.random() < 0.001:
            print "occupation number:", np.sum(xs)
        ys = np.array(xs)
        i = random.randrange(G)
        ys[i] = 1 - ys[i]
        return ys

    def propose2(xs):
        # independent Bernoulli(ps) configuration
        return np.random.random(G) < ps

    def propose3(xs):
        # run-flipping proposal: toggle a running flip state at rate
        # 1/mean_frag_length along the genome
        flip_p = 1.0 / mean_frag_length
        flip = 0
        ys = xs[:]
        for i in range(G):
            if random.random() < flip_p:
                flip = 1 - flip
            ys[i] = ys[i] - flip
        return ys

    def log_dprop2(xs, ys):
        # log-density of propose2's product-Bernoulli draw
        return np.dot(np.log(ps), xs) + np.dot(np.log(1 - ps), 1 - xs)

    chain = mh(f=lambda xs: -hamiltonian(xs), proposal=propose,
               iterations=iterations, x0=x0, dprop=None, use_log=True,
               verbose=verbose)
    return chain
def sella_hirsch_mh(Ne=5, n=16, L=16, sigma=1, mu=0, init="random", matrix=None, x0=None, iterations=50000, p=None): print "p:", p if matrix is None: matrix = sample_matrix(L, sigma) else: L = len(matrix) if x0 is None: if init == "random": x0 = random_motif(L, n) elif init == "ringer": x0 = ringer_motif(matrix, n) elif init == "anti_ringer": x0 = anti_ringer_motif(matrix, n) else: x0 = init if p is None: p = 1.0 / (n * L) nu = Ne - 1 def log_f(motif): return nu * log_fitness(matrix, motif, mu) def prop(motif): motif_p = mutate_motif_p(motif, p) # probability of mutation per basepair return motif_p chain = mh(log_f, prop, x0, use_log=True, iterations=iterations) return matrix, chain
def sella_hirsch_mh_penalize_mu(Ne=5, n=16, L=16, G=5 * 10**6, sigma=1,
                                alpha=0.01, init="random", matrix=None,
                                x0=None, iterations=50000, p=None):
    """Sella-Hirsch MH over (motif, mu) pairs with mu penalized at rate
    alpha; returns (matrix, chain).

    `init` selects the starting state ("random", "ringer", "anti_ringer",
    or a literal (motif, mu) state).
    """
    print "p:", p
    if matrix is None:
        matrix = sample_matrix(L, sigma)
    if x0 is None:
        # named initializers pair the motif with a random starting mu
        if init == "random":
            x0 = (random_motif(L, n), random.gauss(0, 1))
        elif init == "ringer":
            x0 = (ringer_motif(matrix, n), random.gauss(0, 1))
        elif init == "anti_ringer":
            x0 = (anti_ringer_motif(matrix, n), random.gauss(0, 1))
        else:
            x0 = init
    if p is None:
        p = 1.0 / (n * L)  # expect ~1 point mutation per proposal
    nu = Ne - 1

    def log_f((motif, mu)):
        return nu * log_fitness_penalize_mu(matrix, motif, mu, alpha)

    def prop((motif, mu)):
        motif_p = mutate_motif_p(motif, p)  # probability of mutation per basepair
        mu_p = mu + random.gauss(0, 0.1)  # Gaussian random walk on mu
        return motif_p, mu_p

    chain = mh(log_f, prop, x0, use_log=True, iterations=iterations)
    return matrix, chain
def mr_system(alphas,init_system=None,G=100000.0,n=16,L=10,
              sse_epsilon=0.00000001,use_annealing=True,scale=1000,
              iterations=10000,motif_prob=0.5,verbose=False):
    """Fit a (matrix, motif) system to target expression levels `alphas`.

    Minimizes sse(matrix, motif, alphas, G, n), either by simulated annealing
    (use_annealing=True, the default) or by MH sampling of exp(-scale*sse).

    init_system : optional (matrix, motif) starting point; otherwise a zero
        matrix and random motif are used.
    sse_epsilon : annealing stopping criterion (scaled by `scale`).
    motif_prob : probability that a proposal mutates the motif (vs the matrix),
        forwarded to the module-level `propose`.
    Returns whatever anneal/mh returns for the chosen branch.
    """
    proposal = lambda matrix,motif:propose(matrix,motif,motif_prob=motif_prob)
    if init_system is None:
        # Flat (all-zero) matrix and random n x L motif as the default start.
        matrix = [[0,0,0,0] for i in range(L)]
        motif = [random_site(L) for i in range(n)]
    else:
        matrix,motif = init_system
    if use_annealing:
        # Annealing minimizes the scaled SSE directly.
        scaled_sse = lambda(matrix,motif):((sse(matrix,motif,alphas,G,n))*scale)
        return anneal(scaled_sse,
                      lambda(matrix,motif):proposal(matrix,motif),
                      (matrix,motif),
                      iterations=iterations,
                      stopping_crit = sse_epsilon*scale,verbose=verbose)
    else:
        # MH samples proportionally to exp(-scale * sse).
        # NOTE(review): this branch hardcodes verbose=True instead of passing
        # the `verbose` parameter — confirm whether that is intentional.
        scaled_sse = lambda(matrix,motif):exp((sse(matrix,motif,alphas,G,n))*-scale)
        return mh(scaled_sse,
                  lambda(matrix,motif):proposal(matrix,motif),
                  (matrix,motif),
                  iterations=iterations,
                  every=100,verbose=True)
def sella_hirsch_mh_sampling(n=16, L=16, G=1000, N=100, sigma=1, iterations=50000):
    """MH-sample motifs (n sites of length L) under Sella-Hirsch fitness.

    Fitness is foreground binding weight over foreground plus a precomputed
    background partition function Zb; the log target is (N-1)*log(fitness).

    G : genome size passed to compute_Zb.
    N : population size (selection strength nu = N - 1).
    Returns the MH chain of motifs.
    """
    Zb = compute_Zb(n, L, sigma, G)
    nu = N - 1
    def fitness(motif):
        # Mismatch energy model: each non-"A" base costs sigma.
        eps = [sigma * sum(b != "A" for b in site) for site in motif]
        # NOTE(review): exp(-sigma*ep) with ep already scaled by sigma applies
        # sigma twice (exp(-sigma**2 * mismatches)) — confirm against
        # compute_Zb's convention.
        fg = sum(exp(-sigma * ep) for ep in eps)
        return fg / (fg + Zb)
    def log_p(motif):
        return (nu * log(fitness(motif)))
    def proposal(motif):
        p = 4.0 / (n * L)
        return mutate_motif_p(motif, p)
    # Bug fix: random_motif takes (length, num_sites) — see sella_hirsch_mh
    # and other callers. The arguments were previously swapped as (n, L),
    # which only worked by accident while n == L == 16.
    x0 = random_motif(L, n)
    chain = mh(log_p, proposal, x0, use_log=True, iterations=iterations)
    return chain
def estremo(iterations=50000, verbose=False, every=1, sigma=1, mu=-10, Ne=5):
    """Jointly MH-sample a (code, motif) pair under Fermi-Dirac site fitness.

    The chain state couples a pairwise energy code with a motif; fitness of a
    site with energy ep is (1/(1+exp(ep-mu)))**nu summed in log space, where
    nu = Ne - 1.

    sigma : scale of Gaussian perturbations to code entries.
    Returns the MH chain of (code, motif) states.
    """
    nu = Ne - 1

    def log_f(state):
        code, motif = state
        # Site energies from the pairwise-probability model.
        eps = [-log(pr) for pr in pw_prob_sites(motif, code)]
        return sum(nu * log(1 / (1 + exp(ep - mu))) for ep in eps)

    def prop(state):
        code, motif = state
        # Perturb one random (b1, b2) entry of one code position...
        perturbed = [d.copy() for d in code]
        pos = random.randrange(len(perturbed))
        pair = (random.choice("ACGT"), random.choice("ACGT"))
        perturbed[pos][pair] += random.gauss(0, sigma)
        # ...and mutate the motif.
        return (perturbed, mutate_motif(motif))

    x0 = (sample_code(L=10, sigma=1), random_motif(length=10, num_sites=20))
    return mh(log_f, prop, x0, use_log=True, iterations=iterations,
              verbose=verbose, every=every)
def sample_site():
    """Return the last state of a short MH run (10*L steps) seeded at best_site.

    Uses the module-level fitness f, mutate_site proposal, best_site, and L.
    """
    steps = 10 * L
    chain = mh(f, mutate_site, best_site, iterations=steps, verbose=0)
    return chain[-1]
def sample_apw_site():
    """Run an MH chain over sites under the apw fitness proxy apw_phat.

    Starts from a fresh random site of length L (module-level) and returns
    the full chain produced by mh.
    """
    start = random_site(L)
    return mh(apw_phat, proposal=mutate_site, x0=start)
def sample_site_mh(matrix, mu, Ne, ringer_site, iterations=1000):
    """MH-sample sites scored by `matrix` under Fermi-Dirac fitness.

    The target is phat(s) = (1 + exp(score_seq(matrix, s) - mu))**-(Ne-1),
    started from ringer_site with single-site mutation proposals.
    Returns the MH chain.
    """
    nu = Ne - 1

    def phat(site):
        energy = score_seq(matrix, site)
        return (1 + exp(energy - mu)) ** (-nu)

    return mh(f=phat, proposal=mutate_site, x0=ringer_site,
              iterations=iterations)
def mh_ringer(code):
    """MH-evolve a species under fitness(code, .) from a random start.

    Proposal mutates with per-step rates (0.001, 0.001); the target is used
    in log space.
    Returns the MH chain.

    Bug fix: the chain was previously computed and then discarded (implicit
    None return), making the function useless to callers.
    """
    f = lambda x: fitness(code, x)
    prop = lambda x: mutate(x, 0.001, 0.001)
    x0 = sample_species()
    chain = mh(f, prop, x0, use_log=True)
    return chain
def evolve_trajectory(ic):
    # MH chain over (matrix, motif) pairs weighted by exp(-sse), mutating only
    # the matrix at each step.
    # NOTE(review): `ic` is never used; `matrix`, `motif`, `alphas`, `G`, and
    # `n` are read from enclosing/global scope (the lambda parameters shadow
    # them inside the lambdas only). Likely intended `matrix, motif = ic` —
    # confirm against callers before relying on this.
    return mh(lambda (matrix,motif):exp(-sse(matrix,motif,alphas,G,n)),
              lambda (matrix,motif):(mutate_matrix(matrix),motif),
              (matrix,motif),
              iterations=10000)
def mh_motif(n,w,desired_ic,epsilon,scale=10,iterations=10000):
    """Find a motif satisfying desired_ic +/- epsilon by mh sampling.

    Samples n sites of width w, weighting motifs by
    exp(-scale * |desired_ic - motif_ic(m)|).
    (`epsilon` is accepted but not enforced by the sampler itself.)
    Returns the MH chain of motifs.
    """
    start = [random_site(w) for _ in range(n)]

    def weight(m):
        return exp(-abs(desired_ic - motif_ic(m)) * scale)

    return mh(weight, mutate_motif, start, iterations=iterations)
def main():
    # Run two MH chains against the module-level target P: one with an
    # explicit proposal density (Q1 / dQ1), one relying on a symmetric
    # proposal (Q2), both started at x0 = 1.
    # NOTE(review): both chains are bound but never returned or inspected —
    # presumably scratch/comparison code; confirm intent.
    chain1 = mh(P, Q1, 1, dprop=dQ1)
    chain2 = mh(P, Q2, 1)
def sample_site():
    """Draw a site via a short MH run (10 * L steps) started from best_site.

    NOTE(review): an identical sample_site is defined earlier in this module;
    this later definition shadows it — consider removing one.
    """
    n_steps = 10 * L
    states = mh(f, mutate_site, best_site, iterations=n_steps, verbose=0)
    return states[-1]
def uniform_motif_with_ic_rw(n, L, desired_ic, epsilon=0.1, p=None, iterations=None,
                             num_chains=8, x0=None, beta=None):
    """Random-walk MH sampling of motifs with IC within epsilon of desired_ic.

    The target f is a boolean indicator (|motif_ic(m) - desired_ic| < epsilon),
    so accepted states are uniform over the epsilon-ball of motifs; proposals
    mutate each basepair with probability p (default 2/(n*L)).

    iterations controls the stopping rule:
      * int        -- run one chain for exactly that many steps; returns the chain.
      * "harmonic" -- chain length derived from the coupon-collector bound
                      n*L*H(n*L) divided by an assumed acceptance rate of 1/5;
                      returns the chain.
      * otherwise  -- run num_chains chains, doubling the per-round length
                      until the Gelman-Rubin R_hat on their IC traces drops
                      below 1.1; returns the LIST of chains (note the
                      different return type from the other branches).

    NOTE(review): `beta` is forwarded to the imh initializer only in the int
    branch, not in the harmonic or Gelman-Rubin branches — confirm whether
    that asymmetry is intentional.
    """
    if p is None:
        p = 2.0 / (n * L)
    def Q(motif):
        return mutate_motif_p(motif, p)
    def f(motif):
        # Indicator target: 1 inside the epsilon-ball, 0 outside.
        return abs(motif_ic(motif) - desired_ic) < epsilon
    if type(iterations) is int:
        if x0 is None:
            # Seed from one draw of the independence sampler.
            x0 = uniform_motif_with_ic_imh(n, L, desired_ic, epsilon=epsilon,
                                           iterations=1, beta=beta)[0]
        chain = mh(f, proposal=Q, x0=x0, iterations=iterations)
        return chain
    elif iterations == "harmonic":
        ar = 1.0 / 5  # assumed acceptance rate
        iterations = int(n * L * harmonic(n * L) / ar)
        print "iterations:", iterations
        if x0 is None:
            x0 = uniform_motif_with_ic_imh(n, L, desired_ic, epsilon=epsilon,
                                           iterations=1)[0]
        chain = mh(f, proposal=Q, x0=x0, iterations=iterations)
        return chain
    else:  #use gelman rubin criterion
        x0s = [
            uniform_motif_with_ic_imh(n, L, desired_ic, epsilon=epsilon, iterations=1)[0]
            for i in range(num_chains)
        ]
        iterations = 100
        converged = False
        chains = [[] for _ in range(num_chains)]
        while not converged:
            # Extend each chain from its current seed for this round.
            for chain, x0 in zip(chains, x0s):
                chain.extend(
                    mh(f, proposal=Q, x0=x0, iterations=iterations, verbose=False))
            # Diagnose convergence on the scalar IC traces.
            ic_chains = mmap(motif_ic, chains)
            R_hat, neff = gelman_rubin(ic_chains)
            if R_hat < 1.1:
                return chains
            else:
                # Restart each chain from its last state with double the budget.
                x0s = [chain[-1] for chain in chains]
                iterations *= 2