def experiment2_():
    """Run one pairwise-model chain and one linear-model chain; return their weights.

    A pairwise model is drawn with ``sample_code``; the standard deviation of
    its score over 10000 random sites is measured and a linear matrix is drawn
    with ``sample_matrix(L, sqrt(site_sigma**2 / L))`` (presumably so that both
    models have a comparable spread of site energies -- confirm against
    ``sample_matrix``).  Each model then drives its own ``mh`` run under the
    weight ``1 / (1 + exp(ep - mu))**(Ne - 1)``.

    Returns:
        (apw_fits, linear_fits): the weight evaluated on every element of the
        corresponding ``mh`` output.
    """
    L = 10
    sigma = 1  # never read below; sample_code receives a literal 1
    mu = -10
    Ne = 2
    exponent = Ne - 1

    code = sample_code(L, 1)
    background = [random_site(L) for i in xrange(10000)]
    site_sigma = sd([score(code, site) for site in background])
    pssm = sample_matrix(L, sqrt(site_sigma**2 / L))

    def weight(ep):
        return 1 / (1 + exp(ep - mu))**exponent

    def apw_phat(site):
        return weight(score(code, site))

    def linear_phat(site):
        return weight(score_seq(pssm, site))

    def sample_apw_site():
        # Unused helper: nothing in this function calls it.
        return mh(apw_phat, proposal=mutate_site, x0=random_site(L))

    apw_chain = mh(apw_phat, proposal=mutate_site, x0=random_site(L))
    linear_chain = mh(linear_phat, proposal=mutate_site, x0=random_site(L))
    apw_fits = [apw_phat(site) for site in apw_chain]
    linear_fits = [linear_phat(site) for site in linear_chain]
    return apw_fits, linear_fits
def experiment3(trials=10):
    """Scatter geometric-mean occupancy against site-energy spread for two model kinds.

    ``trials`` pairwise codes and ``trials`` linear matrices are sampled.  For
    each model the standard deviation of its score over 10000 shared random
    sites is computed, and one ``mh`` chain is run under the weight
    ``1 / (1 + exp(ep - mu))**(Ne - 1)`` with ``capture_state`` set to the
    occupancy ``1 / (1 + exp(ep - mu))`` (presumably the value ``mh`` stores
    per step -- confirm against ``mh``).  The geometric mean of the captured
    values, first entry dropped, is plotted against the spread on a
    log-scaled y axis.

    Args:
        trials: number of models of each kind to sample.

    Returns:
        None; the scatter plot is drawn through ``plt``.
    """
    mu = -10
    Ne = 5
    L = 10
    sigma = 1
    nu = Ne - 1

    codes = [sample_code(L, sigma) for i in range(trials)]
    pssms = [sample_matrix(L, sigma) for i in range(trials)]
    sites = [random_site(L) for i in xrange(10000)]
    apw_site_sigmas = [
        sd([score(code, site) for site in sites]) for code in codes
    ]
    linear_site_sigmas = [
        sd([score_seq(pssm, site) for site in sites]) for pssm in pssms
    ]

    def geometric_mean_occupancy(energy):
        """Run one chain for the energy function and average its occupancies."""
        captured = mh(lambda s: 1 / (1 + exp(energy(s) - mu))**nu,
                      proposal=mutate_site,
                      x0=random_site(L),
                      capture_state=lambda s: 1 / (1 + exp(energy(s) - mu)))
        # The logs are base 10, so the inverse must be a power of 10; the
        # previous exp(mean(log10(x))) was not a geometric mean.
        return 10 ** mean(map(log10, captured[1:]))

    # code=code / pssm=pssm bind the current model to each energy function.
    apw_mean_fits = [
        geometric_mean_occupancy(lambda s, code=code: score(code, s))
        for code in tqdm(codes)
    ]
    linear_mean_fits = [
        geometric_mean_occupancy(lambda s, pssm=pssm: score_seq(pssm, s))
        for pssm in tqdm(pssms)
    ]

    plt.scatter(apw_site_sigmas, apw_mean_fits, label='apw')
    plt.scatter(linear_site_sigmas, linear_mean_fits, color='g', label='linear')
    plt.semilogy()
    plt.legend(loc='lower right')
def sample_model(model, iterations=50000, x0=None):
    """Run an ``mh`` chain over sites using ``score(model, site)`` as the log-weight.

    Args:
        model: sequence scored by ``score``; its length is used to infer the
            site length (assumes one entry per pair of columns, i.e.
            len(model) == L*(L-1)/2 -- confirm against the model builder).
        iterations: forwarded to ``mh``.
        x0: starting site; a random site of the inferred length when None.

    Returns:
        Whatever ``mh`` returns for this run.
    """
    k = len(model)
    # For k = L*(L-1)/2 this expression equals L + 0.5, so int() yields L.
    L = int(1 + sqrt(1 + 8 * k) / 2)
    if x0 is None:
        x0 = random_site(L)
    # Start from x0: previously a fresh random site was always passed, so a
    # caller-supplied starting point was silently ignored.
    return mh(lambda s: score(model, s),
              proposal=mutate_site,
              x0=x0,
              use_log=True,
              iterations=iterations)
def sample_model(model, iterations=50000, x0=None):
    """Run an ``mh`` chain over sites using ``score(model, site)`` as the log-weight.

    Args:
        model: sequence scored by ``score``; its length is used to infer the
            site length (assumes one entry per pair of columns, i.e.
            len(model) == L*(L-1)/2 -- confirm against the model builder).
        iterations: forwarded to ``mh``.
        x0: starting site; a random site of the inferred length when None.

    Returns:
        Whatever ``mh`` returns for this run.
    """
    k = len(model)
    # For k = L*(L-1)/2 this expression equals L + 0.5, so int() yields L.
    L = int(1 + sqrt(1 + 8 * k) / 2)
    if x0 is None:
        x0 = random_site(L)
    # Start from x0: previously a fresh random site was always passed, so a
    # caller-supplied starting point was silently ignored.
    return mh(lambda s: score(model, s),
              proposal=mutate_site,
              x0=x0,
              use_log=True,
              iterations=iterations)
def sample_site_cftp_dep(matrix, mu, Ne):
    """Set up state for sampling a site from ``matrix`` at chemical potential mu.

    NOTE(review): the visible body only builds helpers and initial
    trajectories and then falls off the end (returns None); the sampling loop
    itself is not present here -- the definition looks truncated.  The
    "cftp" in the name suggests coupling-from-the-past, but nothing below
    confirms that.
    """
    L = len(matrix)
    # Log-weight of a site: -(Ne - 1) * log(1 + exp(ep - mu)).
    def log_phat(s):
        ep = score_seq(matrix,s)
        nu = Ne - 1
        return -nu*log(1 + exp(ep - mu))
    first_site = "A"*L
    last_site = "T"*L
    # Per-position argmin / argmax of the matrix rows, spelled as bases.
    best_site = "".join(["ACGT"[argmin(row)] for row in matrix])
    worst_site = "".join(["ACGT"[argmax(row)] for row in matrix])
    # Five single-element trajectories: best site, three random sites, worst site.
    trajs = [[best_site],[random_site(L)],[random_site(L)],[random_site(L)], [worst_site]]
    # Local mutate_site taking an explicit (position, base index) pair;
    # shadows any module-level mutate_site inside this function.
    def mutate_site(site,(ri,rb)):
        return subst(site,"ACGT"[rb],ri)
def sample_motif_ar_tilted(matrix, mu, Ne, N):
    """Collect N sites by accept/reject under the weight (1 + exp(ep - mu))**-(Ne - 1).

    Uniformly random sites are proposed and accepted with probability
    ``phat(ep) / phat(ep_min)``.

    Args:
        matrix: per-position energy rows, scored with ``score_seq``.
        mu: chemical potential in the weight.
        Ne: effective population size; the exponent is ``Ne - 1``.
        N: number of sites to return.

    Returns:
        List of N accepted sites.

    NOTE(review): the mode / tilting quantities below are still computed but
    the sampling loop does not use them; proposals are uniform.
    """
    nu = Ne - 1
    L = len(matrix)
    ep_min = sum(map(min, matrix))
    ep_max = sum(map(max, matrix))
    site_sigma = site_sigma_from_matrix(matrix)

    def phat(ep):
        return (1 / (1 + exp(ep - mu)))**(Ne - 1)

    def density(ep):
        # Weight times a zero-mean normal density, restricted to attainable energies.
        return phat(ep) * dnorm(ep, 0, site_sigma) * (ep_min <= ep <= ep_max)

    def d_density(ep):
        return ep / site_sigma**2 + nu / (1 + exp(mu - ep))

    mode = bisect_interval(d_density, -100, 100)
    if mode < ep_min:
        # don't want mode right on the nose of ep_min for sampling purposes,
        # so offset it a bit
        mode = ep_min + 1
    dmode = density(mode)

    def mean_ep(lamb):
        psfm = psfm_from_matrix(matrix, lamb=lamb)
        return sum([ep * p
                    for (mat_row, psfm_row) in zip(matrix, psfm)
                    for (ep, p) in zip(mat_row, psfm_row)])

    lamb = bisect_interval(lambda l: mean_ep(l) - mode, -20, 20)
    tilted_psfm = psfm_from_matrix(matrix, lamb=lamb)
    log_tilted_psfm = [map(log, row) for row in tilted_psfm]

    # The loop used to divide by an undefined name `pmode` (NameError on the
    # first proposal).  phat decreases with ep, so its value at ep_min bounds
    # it over all sites and keeps the acceptance ratio within [0, 1].
    phat_max = phat(ep_min)
    motif = []
    while len(motif) < N:
        site = random_site(L)
        ep = score_seq(matrix, site)
        if random.random() < phat(ep) / phat_max:
            motif.append(site)
    return motif
def experiment3(trials=10):
    """Scatter geometric-mean occupancy against site-energy spread for two model kinds.

    ``trials`` pairwise codes and ``trials`` linear matrices are sampled.  For
    each model the standard deviation of its score over 10000 shared random
    sites is computed, and one ``mh`` chain is run under the weight
    ``1 / (1 + exp(ep - mu))**(Ne - 1)`` with ``capture_state`` set to the
    occupancy ``1 / (1 + exp(ep - mu))`` (presumably the value ``mh`` stores
    per step -- confirm against ``mh``).  The geometric mean of the captured
    values, first entry dropped, is plotted against the spread on a
    log-scaled y axis.

    Args:
        trials: number of models of each kind to sample.

    Returns:
        None; the scatter plot is drawn through ``plt``.
    """
    mu = -10
    Ne = 5
    L = 10
    sigma = 1
    nu = Ne - 1

    codes = [sample_code(L, sigma) for i in range(trials)]
    pssms = [sample_matrix(L, sigma) for i in range(trials)]
    sites = [random_site(L) for i in xrange(10000)]
    apw_site_sigmas = [sd([score(code, site) for site in sites])
                       for code in codes]
    linear_site_sigmas = [sd([score_seq(pssm, site) for site in sites])
                          for pssm in pssms]

    def geometric_mean_occupancy(energy):
        """Run one chain for the energy function and average its occupancies."""
        captured = mh(lambda s: 1 / (1 + exp(energy(s) - mu))**nu,
                      proposal=mutate_site,
                      x0=random_site(L),
                      capture_state=lambda s: 1 / (1 + exp(energy(s) - mu)))
        # The logs are base 10, so the inverse must be a power of 10; the
        # previous exp(mean(log10(x))) was not a geometric mean.
        return 10 ** mean(map(log10, captured[1:]))

    # code=code / pssm=pssm bind the current model to each energy function.
    apw_mean_fits = [geometric_mean_occupancy(lambda s, code=code: score(code, s))
                     for code in tqdm(codes)]
    linear_mean_fits = [geometric_mean_occupancy(lambda s, pssm=pssm: score_seq(pssm, s))
                        for pssm in tqdm(pssms)]

    plt.scatter(apw_site_sigmas, apw_mean_fits, label='apw')
    plt.scatter(linear_site_sigmas, linear_mean_fits, color='g', label='linear')
    plt.semilogy()
    plt.legend(loc='lower right')
def sample_motif_ar_tilted(matrix, mu, Ne, N):
    """Collect N sites by accept/reject under the weight (1 + exp(ep - mu))**-(Ne - 1).

    Uniformly random sites are proposed and accepted with probability
    ``phat(ep) / phat(ep_min)``.

    Args:
        matrix: per-position energy rows, scored with ``score_seq``.
        mu: chemical potential in the weight.
        Ne: effective population size; the exponent is ``Ne - 1``.
        N: number of sites to return.

    Returns:
        List of N accepted sites.

    NOTE(review): the mode / tilting quantities below are still computed but
    the sampling loop does not use them; proposals are uniform.
    """
    nu = Ne - 1
    L = len(matrix)
    ep_min = sum(map(min, matrix))
    ep_max = sum(map(max, matrix))
    site_sigma = site_sigma_from_matrix(matrix)

    def phat(ep):
        return (1 / (1 + exp(ep - mu)))**(Ne - 1)

    def density(ep):
        # Weight times a zero-mean normal density, restricted to attainable energies.
        return phat(ep) * dnorm(ep, 0, site_sigma) * (ep_min <= ep <= ep_max)

    def d_density(ep):
        return ep / site_sigma**2 + nu / (1 + exp(mu - ep))

    mode = bisect_interval(d_density, -100, 100)
    if mode < ep_min:
        # don't want mode right on the nose of ep_min for sampling purposes,
        # so offset it a bit
        mode = ep_min + 1
    dmode = density(mode)

    def mean_ep(lamb):
        psfm = psfm_from_matrix(matrix, lamb=lamb)
        return sum([
            ep * p
            for (mat_row, psfm_row) in zip(matrix, psfm)
            for (ep, p) in zip(mat_row, psfm_row)
        ])

    lamb = bisect_interval(lambda l: mean_ep(l) - mode, -20, 20)
    tilted_psfm = psfm_from_matrix(matrix, lamb=lamb)
    log_tilted_psfm = [map(log, row) for row in tilted_psfm]

    # The loop used to divide by an undefined name `pmode` (NameError on the
    # first proposal).  phat decreases with ep, so its value at ep_min bounds
    # it over all sites and keeps the acceptance ratio within [0, 1].
    phat_max = phat(ep_min)
    motif = []
    while len(motif) < N:
        site = random_site(L)
        ep = score_seq(matrix, site)
        if random.random() < phat(ep) / phat_max:
            motif.append(site)
    return motif
def apw_fit(sigma, mu, Ne):
    """Mean captured ``apw_occ`` value of one ``mh`` run on a freshly sampled code.

    The site length ``L`` and ``apw_occ`` come from the enclosing module.
    The chain is weighted by ``1 / (1 + exp(ep - mu))**(Ne - 1)`` and the
    first 25000 entries of the ``mh`` output are skipped (presumably burn-in).
    """
    code = sample_code(L, sigma)
    exponent = Ne - 1

    def weight(site):
        return 1 / (1 + exp(score(code, site) - mu))**exponent

    def occupancy(site):
        return apw_occ(code, mu, site)

    captured = mh(weight,
                  proposal=mutate_site,
                  x0=random_site(L),
                  capture_state=occupancy)
    return mean(captured[25000:])
def linear_fit(sigma, mu, Ne):
    """Mean captured ``linear_occ`` value of one ``mh`` run on a freshly sampled matrix.

    The site length ``L`` and ``linear_occ`` come from the enclosing module.
    The chain is weighted by ``1 / (1 + exp(ep - mu))**(Ne - 1)`` and the
    first 25000 entries of the ``mh`` output are skipped (presumably burn-in).
    """
    pssm = sample_matrix(L, sigma)
    exponent = Ne - 1

    def weight(site):
        return 1 / (1 + exp(score_seq(pssm, site) - mu))**exponent

    def occupancy(site):
        return linear_occ(pssm, mu, site)

    captured = mh(weight,
                  proposal=mutate_site,
                  x0=random_site(L),
                  capture_state=occupancy)
    return mean(captured[25000:])
def select_sites_by_occupancy(matrix, mu, n):
    """Collect n random sites, each kept with probability 1 / (1 + exp(ep - mu)).

    ``ep`` is ``score_seq(matrix, site)``.  The running count of accepted
    sites is printed after every acceptance.

    Returns:
        List of the n accepted sites, in acceptance order.
    """
    L = len(matrix)
    accepted = []
    while len(accepted) < n:
        candidate = random_site(L)
        u = random.random()
        occupancy = 1 / (1 + exp(score_seq(matrix, candidate) - mu))
        if u >= occupancy:
            continue
        accepted.append(candidate)
        print(len(accepted))
    return accepted
def alignment_simulation(ell=100, L=10, N=50, trials=100000):
    """Compute ``motif_ic`` for many random alignments drawn from fixed random sequences.

    N random sequences of length ``ell`` are generated once.  Each trial
    picks an independent random start in every sequence and takes the
    length-L window there, and ``motif_ic`` is evaluated on the N windows.

    Previously the sizes were hard-coded and the computed list was discarded;
    the defaults reproduce the old constants and the list is now returned.

    Args:
        ell: length of each random sequence.
        L: window width.
        N: number of sequences (and of windows per alignment).
        trials: number of random alignments to score.

    Returns:
        List of ``trials`` values returned by ``motif_ic``.
    """
    seqs = [random_site(ell) for i in range(N)]
    n_starts = ell - L + 1  # number of valid window start positions

    def random_alignment():
        starts = [random.randrange(n_starts) for _ in range(N)]
        return [seq[r:r + L] for seq, r in zip(seqs, starts)]

    ics = [motif_ic(random_alignment()) for _ in trange(trials)]
    return ics
def mr_system_mh(alphas, G=100000.0, n=16, L=10):
    """Run ``mh`` over (matrix, motif) pairs weighted by exp(-scale * sse).

    The chain starts from an all-zero L x 4 matrix and n random sites, uses
    ``propose`` for moves, and runs 100000 iterations with ``every=1000`` and
    ``verbose=True``.

    Returns:
        Whatever ``mh`` returns for this run.
    """
    scale = 10000  # lower means less stringent
    start_matrix = [[0, 0, 0, 0] for i in range(L)]
    start_motif = [random_site(L) for i in range(n)]

    def weight(state):
        matrix, motif = state
        return exp(-(sse(matrix, motif, alphas, G, n) * scale))

    def step(state):
        matrix, motif = state
        return propose(matrix, motif)

    return mh(weight,
              step,
              (start_matrix, start_motif),
              iterations=100000,
              every=1000,
              verbose=True)
def experiment1_():
    """Run ``mh`` under a pairwise code and under its linearization; return both weight traces.

    A code is drawn with ``sample_code`` and turned into a matrix with
    ``linearize``.  Each model drives its own chain under the weight
    ``1 / (1 + exp(ep - mu))**(Ne - 1)``.

    Returns:
        (apw_fits, linear_fits): the weight evaluated on every element of the
        corresponding ``mh`` output.
    """
    L = 10
    sigma = 1  # never read below; sample_code receives a literal 1
    mu = -10
    Ne = 2
    exponent = Ne - 1

    code = sample_code(L, 1)
    pssm = linearize(code)

    def weight(ep):
        return 1 / (1 + exp(ep - mu))**exponent

    def apw_phat(site):
        return weight(score(code, site))

    def linear_phat(site):
        return weight(score_seq(pssm, site))

    def sample_apw_site():
        # Unused helper: nothing in this function calls it.
        return mh(apw_phat, proposal=mutate_site, x0=random_site(L))

    apw_chain = mh(apw_phat, proposal=mutate_site, x0=random_site(L))
    linear_chain = mh(linear_phat, proposal=mutate_site, x0=random_site(L))
    apw_fits = [apw_phat(site) for site in apw_chain]
    linear_fits = [linear_phat(site) for site in linear_chain]
    return apw_fits, linear_fits
def sample_site_ar(matrix, mu, Ne, lamb=None, modulus=10**6, return_ar=False):
    """Sample one site by accept/reject with proposals from ``sample_from_matrix``.

    The target log-weight of a site is -(Ne - 1) * log(1 + exp(ep - mu)) with
    ep = score_seq(matrix, site).  Proposals come from
    ``sample_from_matrix(matrix, lamb)`` and are scored with the unnormalized
    log-density -lamb * ep.

    Args:
        matrix: per-position energy rows indexed in "ACGT" order.
        mu: chemical potential.
        Ne: effective population size; nu = Ne - 1 is the exponent.
        lamb: proposal tilt; defaults to nu / 2.0.
        modulus: a progress line is printed every ``modulus`` trials.
        return_ar: when true, return the number of trials needed instead of
            the accepted site.

    Returns:
        The accepted site, or the trial count when ``return_ar`` is true.
        NOTE(review): when Ne == 1 a random site is returned immediately,
        even if ``return_ar`` is true.

    Raises:
        AssertionError: if a proposal's log-fitness or log acceptance ratio
            is not strictly negative.
    """
    nu = Ne - 1
    # With nu == 0 the weight is constant, so any random site will do.
    if nu == 0:
        return random_site(len(matrix))
    if lamb is None:
        lamb = nu/2.0
    L = len(matrix)
    # Proposal sampler.
    def rQ():
        return sample_from_matrix(matrix, lamb)
    #log_Z = sum(log(sum(exp(-lamb*ep) for ep in col)) for col in matrix)
    # Unnormalized proposal log-density (the log_Z term is left out).
    def log_dQ(site):
        log_numer = -lamb*sum(row["ACGT".index(b)] for b,row in zip(site,matrix))
        return log_numer# - log_Z
    # Target log-weight.
    def log_fit(site):
        return -nu*log(1+exp(score_seq(matrix,site)-mu))
    ep_max = sum(max(row) for row in matrix)
    ep_min = sum(min(row) for row in matrix)
    alpha = lamb/float(nu)
    # log of target/proposal as a function of energy; log_M below is its
    # value at the chosen energy ep_crit and serves as the envelope constant.
    def find_logM(ep):
        return nu*log((exp(alpha*ep)/(1+exp(ep-mu))))
    # Derivative of exp(alpha*ep)/(1+exp(ep-mu)) with respect to ep.
    def log_M_p(ep):
        term1 = alpha*exp(alpha*ep)/(exp(ep-mu)+1)
        term2 = -exp(alpha*ep+ep-mu)/((exp(ep-mu)+1)**2)
        return term1 + term2
    # Setting the derivative of alpha*ep - log(1+exp(ep-mu)) to zero gives
    # ep = mu + log(alpha/(1-alpha)), which exists only for 0 < alpha < 1.
    # NOTE(review): this critical point is not clamped to [ep_min, ep_max].
    if alpha != 1 and alpha/(1-alpha) > 0:
        ep_crit = log(alpha/(1-alpha)) + mu
    else:
        # Otherwise pick an endpoint according to the slope's sign at ep = 0.
        deriv = log_M_p(0)
        if deriv < 0:
            ep_crit = ep_min
        else:
            ep_crit = ep_max
    log_M = find_logM(ep_crit)
    # print "choosing from:",find_logM(ep_min), find_logM(ep_max)
    # print "log_M:",log_M
    trials = 0
    while True:
        trials += 1
        s = rQ()
        log_f = log_fit(s)
        log_prop = log_dQ(s) + log_M
        log_ar = log_f - log_prop
        log_r = log(random.random())
        accept = log_r < log_ar
        #print trials, s, "*" if accept else " ",log_r, log_ar, log_f, log_prop
        # Sanity checks: both quantities must be strictly negative.
        assert log_f < 0
        assert log_ar < 0
        if trials % modulus == 0:
            print trials, s, "*" if accept else " ",log_r, log_ar, log_f, log_prop
        if accept:
            if return_ar:
                return trials
            else:
                return s
def roc_experiment(motif, trials=10**5):
    """Compare pairwise and linear models of ``motif`` by the last value ``roc_curve`` returns.

    Both models are fitted to ``motif``; the motif's own sites are the
    positives and ``trials`` random sites of the same width are the
    negatives.  ``roc_curve`` is called once per model and its fourth return
    value is kept.

    Returns:
        (li_auc, pw_auc): linear value first, pairwise second.
    """
    pw_model = pairwise_model_from_motif(motif)
    li_model = linear_model_from_motif(motif)
    width = len(motif[0])
    negatives = [random_site(width) for i in trange(trials)]

    def pw_scores(sites):
        return [pw_prob_site(site, pw_model) for site in sites]

    def li_scores(sites):
        return [linear_prob_site(site, li_model) for site in sites]

    pw_pos = pw_scores(motif)
    pw_neg = pw_scores(tqdm(negatives))
    li_pos = li_scores(motif)
    li_neg = li_scores(tqdm(negatives))

    _, _, _, pw_auc = roc_curve(pw_pos, pw_neg)
    _, _, _, li_auc = roc_curve(li_pos, li_neg, color='g')
    return li_auc, pw_auc
def alignment_simulation(ell=100, L=10, N=50, trials=100000):
    """Compute ``motif_ic`` for many random alignments drawn from fixed random sequences.

    N random sequences of length ``ell`` are generated once.  Each trial
    picks an independent random start in every sequence and takes the
    length-L window there, and ``motif_ic`` is evaluated on the N windows.

    Previously the sizes were hard-coded and the computed list was discarded;
    the defaults reproduce the old constants and the list is now returned.

    Args:
        ell: length of each random sequence.
        L: window width.
        N: number of sequences (and of windows per alignment).
        trials: number of random alignments to score.

    Returns:
        List of ``trials`` values returned by ``motif_ic``.
    """
    seqs = [random_site(ell) for i in range(N)]
    n_starts = ell - L + 1  # number of valid window start positions

    def random_alignment():
        starts = [random.randrange(n_starts) for _ in range(N)]
        return [seq[r:r + L] for seq, r in zip(seqs, starts)]

    ics = [motif_ic(random_alignment()) for _ in trange(trials)]
    return ics
def apw_fit(sigma, mu, Ne):
    """Mean captured ``apw_occ`` value of one ``mh`` run on a freshly sampled code.

    The site length ``L`` and ``apw_occ`` come from the enclosing module.
    The chain is weighted by ``1 / (1 + exp(ep - mu))**(Ne - 1)`` and the
    first 25000 entries of the ``mh`` output are skipped (presumably burn-in).
    """
    code = sample_code(L, sigma)
    exponent = Ne - 1

    def weight(site):
        return 1 / (1 + exp(score(code, site) - mu))**exponent

    def occupancy(site):
        return apw_occ(code, mu, site)

    captured = mh(weight,
                  proposal=mutate_site,
                  x0=random_site(L),
                  capture_state=occupancy)
    return mean(captured[25000:])
def linear_fit(sigma, mu, Ne):
    """Mean captured ``linear_occ`` value of one ``mh`` run on a freshly sampled matrix.

    The site length ``L`` and ``linear_occ`` come from the enclosing module.
    The chain is weighted by ``1 / (1 + exp(ep - mu))**(Ne - 1)`` and the
    first 25000 entries of the ``mh`` output are skipped (presumably burn-in).
    """
    pssm = sample_matrix(L, sigma)
    exponent = Ne - 1

    def weight(site):
        return 1 / (1 + exp(score_seq(pssm, site) - mu))**exponent

    def occupancy(site):
        return linear_occ(pssm, mu, site)

    captured = mh(weight,
                  proposal=mutate_site,
                  x0=random_site(L),
                  capture_state=occupancy)
    return mean(captured[25000:])
def train_pairwise_model(motif,
                         pc=1 / 16.0,
                         decay_timescale=10000,
                         take_stock=1000,
                         eta=0.01,
                         stop_crit=0.01):
    """Fit pairwise weights so that sampled sites reproduce the motif's pairwise frequencies.

    A Metropolis chain over sites is run with log-weight ``score(ws, site)``.
    Whenever the iteration count is a multiple of ``stock_counter``, N sites
    are drawn from the most recent ``stock_counter`` chain states, their
    pairwise frequencies are compared with the motif's, and every weight is
    moved by ``eta`` times the frequency difference.  Training stops once the
    squared error per column pair drops below ``stop_crit``.

    Args:
        motif: list of equal-length sites.
        pc: forwarded to ``get_pairwise_freqs`` for the motif frequencies
            (the sampled frequencies use that function's default).
        decay_timescale: only appears in the progress printout; the decay
            factor on the update is commented out.
        take_stock: initial number of iterations between weight updates.
        eta: step size of the weight update.
        stop_crit: threshold on squared error per column pair.

    Returns:
        ws: list of dicts keyed by dinucleotide pairs, one per column pair.

    NOTE(review): block nesting was reconstructed from whitespace-collapsed
    source -- confirm against the original.  ``chain`` keeps every state and
    grows for as long as the loop runs.
    """
    L = len(motif[0])
    N = len(motif)
    fs = get_pairwise_freqs(motif, pc=pc)
    # One zero-initialised weight table per pair of columns.
    ws = [{(b1, b2): 0
           for (b1, b2) in dinucs}
          for _ in range(int(choose(L, 2)))]
    x = random_site(L)
    log_y = score(ws, x)
    chain = []
    # sses = [0.0] * (int(iterations/take_stock) + 1)
    #chain = []
    #for iteration in xrange(iterations):
    iteration = 0
    stock_counter = take_stock
    while True:
        # Metropolis step on the current site.
        xp = mutate_site(x)
        log_yp = score(ws, xp)
        if log(random.random()) < log_yp - log_y:
            x = xp
            log_y = log_yp
        chain.append(x)
        if iteration > 0 and iteration % stock_counter == 0:
            # Frequencies of N sites drawn without replacement from the last
            # stock_counter states (the state just appended is excluded).
            current_fs = get_pairwise_freqs(
                sample(N,
                       chain[iteration - stock_counter:iteration],
                       replace=False))
            sse = 0
            for w, f, cur_f in zip(ws, fs, current_fs):
                for b1, b2 in dinucs:
                    delta = f[b1, b2] - cur_f[b1, b2]
                    sse += delta**2
                    w[b1, b2] += eta * (
                        delta)  #* exp(-iteration/float(decay_timescale))
            #sses[iteration/take_stock] = sse
            sse_per_col_pair = sse / choose(L, 2)
            print iteration, stock_counter, sse_per_col_pair, exp(
                -iteration / float(decay_timescale)), ws[0]['A', 'A']
            # Randomly lengthen the interval by 0 or 1.
            stock_counter += random.randrange(2)
            #print "motif_ic:", motif_ic(chain[iteration-stock_counter : iteration])
            if iteration > 0 and sse_per_col_pair < stop_crit:
                print "breaking:", sse, sse_per_col_pair
                break
            log_y = score(ws, x)  # recalculate this because weights change
            #stock_counter += take_stock * (iteration > take_stock)
        iteration += 1
    return ws
def site_mh(matrix, mu, Ne, iterations=50000):
    """Run ``mh`` over sites with log-weight ``log_Pe`` of the site's energy.

    The log-weight is ``log_Pe(score_seq(matrix, site), site_mu, site_sigma,
    mu, Ne)`` where ``site_mu`` and ``site_sigma`` are derived from
    ``matrix``.  Moves are proposed with ``mutate_site`` and the chain starts
    from a random site.

    Returns:
        Whatever ``mh`` returns for this run.
    """
    site_mu = site_mu_from_matrix(matrix)
    site_sigma = site_sigma_from_matrix(matrix)
    width = len(matrix)

    def log_weight(site):
        return log_Pe(score_seq(matrix, site), site_mu, site_sigma, mu, Ne)

    def propose_neighbor(site):
        return mutate_site(site)

    return mh(log_weight,
              propose_neighbor,
              x0=random_site(width),
              use_log=True,
              iterations=iterations)
def mr_system_sa(alphas, init_system=None, G=100000.0, n=16, L=10,
                 sse_epsilon=0.0001, proposal=propose, scale=1000,
                 iterations=10000, return_trajectory=False):
    """Minimise ``scale * sse`` over (matrix, motif) pairs with ``anneal``.

    Args:
        alphas, G, n: forwarded to ``sse``.
        init_system: optional (matrix, motif) starting pair; when None the
            start is an all-zero L x 4 matrix with n random sites.
        L: site width used for the default start.
        sse_epsilon: stopping threshold on the unscaled error.
        proposal: move function called as ``proposal(matrix, motif)``.
        scale: multiplier applied to the error and to the threshold.
        iterations, return_trajectory: forwarded to ``anneal``.

    Returns:
        Whatever ``anneal`` returns.
    """
    if init_system is not None:
        matrix, motif = init_system
    else:
        matrix = [[0, 0, 0, 0] for i in range(L)]
        motif = [random_site(L) for i in range(n)]

    def scaled_sse(state):
        mat, mot = state
        return sse(mat, mot, alphas, G, n) * scale

    def step(state):
        mat, mot = state
        return proposal(mat, mot)

    return anneal(scaled_sse,
                  step,
                  (matrix, motif),
                  iterations=iterations,
                  stopping_crit=sse_epsilon * scale,
                  return_trajectory=return_trajectory)
def experiment2_():
    """Run one pairwise-model chain and one linear-model chain; return their weights.

    A pairwise model is drawn with ``sample_code``; the standard deviation of
    its score over 10000 random sites is measured and a linear matrix is drawn
    with ``sample_matrix(L, sqrt(site_sigma**2 / L))`` (presumably so that both
    models have a comparable spread of site energies -- confirm against
    ``sample_matrix``).  Each model then drives its own ``mh`` run under the
    weight ``1 / (1 + exp(ep - mu))**(Ne - 1)``.

    Returns:
        (apw_fits, linear_fits): the weight evaluated on every element of the
        corresponding ``mh`` output.
    """
    L = 10
    sigma = 1  # never read below; sample_code receives a literal 1
    mu = -10
    Ne = 2
    exponent = Ne - 1

    code = sample_code(L, 1)
    background = [random_site(L) for i in xrange(10000)]
    site_sigma = sd([score(code, site) for site in background])
    pssm = sample_matrix(L, sqrt(site_sigma**2 / L))

    def weight(ep):
        return 1 / (1 + exp(ep - mu))**exponent

    def apw_phat(site):
        return weight(score(code, site))

    def linear_phat(site):
        return weight(score_seq(pssm, site))

    def sample_apw_site():
        # Unused helper: nothing in this function calls it.
        return mh(apw_phat, proposal=mutate_site, x0=random_site(L))

    apw_chain = mh(apw_phat, proposal=mutate_site, x0=random_site(L))
    linear_chain = mh(linear_phat, proposal=mutate_site, x0=random_site(L))
    apw_fits = [apw_phat(site) for site in apw_chain]
    linear_fits = [linear_phat(site) for site in linear_chain]
    return apw_fits, linear_fits
def experiment1_():
    """Run ``mh`` under a pairwise code and under its linearization; return both weight traces.

    A code is drawn with ``sample_code`` and turned into a matrix with
    ``linearize``.  Each model drives its own chain under the weight
    ``1 / (1 + exp(ep - mu))**(Ne - 1)``.

    Returns:
        (apw_fits, linear_fits): the weight evaluated on every element of the
        corresponding ``mh`` output.
    """
    L = 10
    sigma = 1  # never read below; sample_code receives a literal 1
    mu = -10
    Ne = 2
    exponent = Ne - 1

    code = sample_code(L, 1)
    pssm = linearize(code)

    def weight(ep):
        return 1 / (1 + exp(ep - mu))**exponent

    def apw_phat(site):
        return weight(score(code, site))

    def linear_phat(site):
        return weight(score_seq(pssm, site))

    def sample_apw_site():
        # Unused helper: nothing in this function calls it.
        return mh(apw_phat, proposal=mutate_site, x0=random_site(L))

    apw_chain = mh(apw_phat, proposal=mutate_site, x0=random_site(L))
    linear_chain = mh(linear_phat, proposal=mutate_site, x0=random_site(L))
    apw_fits = [apw_phat(site) for site in apw_chain]
    linear_fits = [linear_phat(site) for site in linear_chain]
    return apw_fits, linear_fits
def sample_uniform_energy(matrix):
    """Rejection-sample a random site with acceptance inversely proportional to a normal density of its energy.

    The site energy is approximated as normal with mean ``sum(mean(row))``
    and variance ``sum(variance(row))`` over the matrix rows.  A uniformly
    random site is accepted with probability ``1 / (M * pdf(ep))``, where M
    is the larger of the reciprocal densities at the minimum and maximum
    attainable energies.  Under that normal approximation the accepted
    energies are roughly uniform over the attainable range.

    Args:
        matrix: per-position energy rows, scored with ``score_seq``.

    Returns:
        The first accepted site.  The trial count is printed every 10000
        trials.
    """
    # Site width comes from the matrix itself; the loop previously read an
    # `L` that this function never defined.
    L = len(matrix)
    mu = sum(map(mean, matrix))
    sigma = sqrt(sum(map(lambda x: variance(x, correct=False), matrix)))
    ep_min = sum(map(min, matrix))
    ep_max = sum(map(max, matrix))
    # The normal density is smallest at one of the two extremes, so its
    # reciprocal there bounds 1/pdf over the whole attainable range.
    M_min = 1 / norm.pdf(ep_min, mu, sigma)
    M_max = 1 / norm.pdf(ep_max, mu, sigma)
    M = max(M_min, M_max)
    trials = 0
    while True:
        trials += 1
        if trials % 10000 == 0:
            print(trials)
        site = random_site(L)
        ep = score_seq(matrix, site)
        ar = 1 / (M * norm.pdf(ep, mu, sigma))
        if random.random() < ar:
            return site
def sample_uniform_energy(matrix):
    """Rejection-sample a random site with acceptance inversely proportional to a normal density of its energy.

    The site energy is approximated as normal with mean ``sum(mean(row))``
    and variance ``sum(variance(row))`` over the matrix rows.  A uniformly
    random site is accepted with probability ``1 / (M * pdf(ep))``, where M
    is the larger of the reciprocal densities at the minimum and maximum
    attainable energies.  Under that normal approximation the accepted
    energies are roughly uniform over the attainable range.

    Args:
        matrix: per-position energy rows, scored with ``score_seq``.

    Returns:
        The first accepted site.  The trial count is printed every 10000
        trials.
    """
    # Site width comes from the matrix itself; the loop previously read an
    # `L` that this function never defined.
    L = len(matrix)
    mu = sum(map(mean, matrix))
    sigma = sqrt(sum(map(lambda x: variance(x, correct=False), matrix)))
    ep_min = sum(map(min, matrix))
    ep_max = sum(map(max, matrix))
    # The normal density is smallest at one of the two extremes, so its
    # reciprocal there bounds 1/pdf over the whole attainable range.
    M_min = 1 / norm.pdf(ep_min, mu, sigma)
    M_max = 1 / norm.pdf(ep_max, mu, sigma)
    M = max(M_min, M_max)
    trials = 0
    while True:
        trials += 1
        if trials % 10000 == 0:
            print(trials)
        site = random_site(L)
        ep = score_seq(matrix, site)
        ar = 1 / (M * norm.pdf(ep, mu, sigma))
        if random.random() < ar:
            return site
def train_pairwise_model(motif, pc=1/16.0, decay_timescale=10000, take_stock=1000, eta=0.01, stop_crit=0.01):
    """Fit pairwise weights so that sampled sites reproduce the motif's pairwise frequencies.

    A Metropolis chain over sites is run with log-weight ``score(ws, site)``.
    Whenever the iteration count is a multiple of ``stock_counter``, N sites
    are drawn from the most recent ``stock_counter`` chain states, their
    pairwise frequencies are compared with the motif's, and every weight is
    moved by ``eta`` times the frequency difference.  Training stops once the
    squared error per column pair drops below ``stop_crit``.

    Args:
        motif: list of equal-length sites.
        pc: forwarded to ``get_pairwise_freqs`` for the motif frequencies
            (the sampled frequencies use that function's default).
        decay_timescale: only appears in the progress printout; the decay
            factor on the update is commented out.
        take_stock: initial number of iterations between weight updates.
        eta: step size of the weight update.
        stop_crit: threshold on squared error per column pair.

    Returns:
        ws: list of dicts keyed by dinucleotide pairs, one per column pair.

    NOTE(review): block nesting was reconstructed from whitespace-collapsed
    source -- confirm against the original.  ``chain`` keeps every state and
    grows for as long as the loop runs.
    """
    L = len(motif[0])
    N = len(motif)
    fs = get_pairwise_freqs(motif, pc=pc)
    # One zero-initialised weight table per pair of columns.
    ws = [{(b1, b2):0 for (b1,b2) in dinucs} for _ in range(int(choose(L,2)))]
    x = random_site(L)
    log_y = score(ws, x)
    chain = []
    # sses = [0.0] * (int(iterations/take_stock) + 1)
    #chain = []
    #for iteration in xrange(iterations):
    iteration = 0
    stock_counter = take_stock
    while True:
        # Metropolis step on the current site.
        xp = mutate_site(x)
        log_yp = score(ws, xp)
        if log(random.random()) < log_yp - log_y:
            x = xp
            log_y = log_yp
        chain.append(x)
        if iteration > 0 and iteration % stock_counter == 0:
            # Frequencies of N sites drawn without replacement from the last
            # stock_counter states (the state just appended is excluded).
            current_fs = get_pairwise_freqs(sample(N,chain[iteration-stock_counter : iteration], replace=False))
            sse = 0
            for w, f, cur_f in zip(ws, fs, current_fs):
                for b1, b2 in dinucs:
                    delta = f[b1, b2] - cur_f[b1,b2]
                    sse += delta**2
                    w[b1, b2] += eta*(delta) #* exp(-iteration/float(decay_timescale))
            #sses[iteration/take_stock] = sse
            sse_per_col_pair = sse/choose(L,2)
            print iteration, stock_counter, sse_per_col_pair, exp(-iteration/float(decay_timescale)), ws[0]['A','A']
            # Randomly lengthen the interval by 0 or 1.
            stock_counter += random.randrange(2)
            #print "motif_ic:", motif_ic(chain[iteration-stock_counter : iteration])
            if iteration > 0 and sse_per_col_pair < stop_crit:
                print "breaking:", sse, sse_per_col_pair
                break
            log_y = score(ws, x) # recalculate this because weights change
            #stock_counter += take_stock * (iteration > take_stock)
        iteration += 1
    return ws
def sample_site_bf(matrix, mu, Ne, ringer_site=None, verbose=False):
    """Brute-force rejection sampler for one site under the weight (1 + exp(ep - mu))**-(Ne - 1).

    Uniformly random sites of width ``len(matrix)`` are proposed and accepted
    with probability ``phat(site) / phat(ringer_site)``.

    Args:
        matrix: per-position energy rows, scored with ``score_seq``.
        mu: chemical potential.
        Ne: effective population size.
        ringer_site: site whose weight normalises the acceptance ratio;
            defaults to ``ringer_motif(matrix, 1)[0]``.
        verbose: when true, print the trial count and acceptance ratio of the
            accepted site.

    Returns:
        The accepted site.
    """
    nu = Ne - 1
    width = len(matrix)
    if ringer_site is None:
        ringer_site = ringer_motif(matrix, 1)[0]

    def phat(s):
        return (1 + exp(score_seq(matrix, s) - mu))**(-nu)

    phat_max = phat(ringer_site)
    trials = 0
    while True:
        trials += 1
        candidate = random_site(width)
        ar = phat(candidate) / phat_max
        if random.random() >= ar:
            continue
        if verbose:
            print("%s %s" % (trials, ar))
        return candidate
def mr_system(alphas, init_system=None, G=100000.0, n=16, L=10,
              sse_epsilon=0.00000001, use_annealing=True, scale=1000,
              iterations=10000, motif_prob=0.5, verbose=False):
    """Search (matrix, motif) pairs for low ``sse`` by annealing or by ``mh``.

    Args:
        alphas, G, n: forwarded to ``sse``.
        init_system: optional (matrix, motif) start; when None the start is
            an all-zero L x 4 matrix with n random sites.
        L: site width used for the default start.
        sse_epsilon: annealing stops at ``sse_epsilon * scale``.
        use_annealing: choose ``anneal`` (true) or ``mh`` (false).
        scale: multiplier on the error (negated inside exp for ``mh``).
        iterations: forwarded to the chosen routine.
        motif_prob: forwarded to ``propose``.
        verbose: forwarded to ``anneal`` only.

    Returns:
        Whatever ``anneal`` or ``mh`` returns.
    """
    if init_system is not None:
        matrix, motif = init_system
    else:
        matrix = [[0, 0, 0, 0] for i in range(L)]
        motif = [random_site(L) for i in range(n)]
    start = (matrix, motif)

    def step(state):
        mat, mot = state
        return propose(mat, mot, motif_prob=motif_prob)

    if use_annealing:
        def scaled_sse(state):
            mat, mot = state
            return sse(mat, mot, alphas, G, n) * scale

        return anneal(scaled_sse,
                      step,
                      start,
                      iterations=iterations,
                      stopping_crit=sse_epsilon * scale,
                      verbose=verbose)

    def weight(state):
        mat, mot = state
        return exp(sse(mat, mot, alphas, G, n) * -scale)

    # NOTE(review): this branch always passes verbose=True, regardless of the
    # `verbose` argument.
    return mh(weight,
              step,
              start,
              iterations=iterations,
              every=100,
              verbose=True)
def sample_species2():
    """Return a random binding domain and n references to one random site.

    ``aas``, ``L`` and ``n`` are read from the enclosing module.

    Returns:
        (bd, sites): ``bd`` is a list of L random choices from ``aas``;
        ``sites`` repeats a single random site n times.
    """
    bd = []
    for _ in range(L):
        bd.append(random.choice(aas))
    shared_site = random_site(L)
    return (bd, [shared_site] * n)
def sample_apw_site():
    """Run ``mh`` under ``apw_phat`` from a random start and return its result.

    ``apw_phat`` and ``L`` are read from the enclosing scope.
    """
    start = random_site(L)
    return mh(apw_phat, proposal=mutate_site, x0=start)
def log_Z_analytic(params, N):
    """Compute N * log(Z) exactly by summing the weight over every k-mer.

    Args:
        params: (matrix, mu, Ne) tuple, passed positionally as before.
        N: multiplier applied to the single-site log partition function.

    Returns:
        N * log(sum over all sites of (1 / (1 + exp(ep - mu)))**(Ne - 1)).
    """
    matrix, mu, Ne = params
    nu = Ne - 1
    L = len(matrix)
    acc = 0
    for kmer in kmers(L):
        ep = score_seq(matrix, "".join(kmer))
        acc += (1 / (1 + exp(ep - mu)))**nu
    return N * log(acc)


def log_ZS_naive(params, trials=1000):
    """Monte Carlo estimate of the single-site log partition function.

    Averages the weight over ``trials`` random sites and scales by the
    4**L possible sites.

    Args:
        params: (matrix, mu, Ne) tuple.
        trials: number of random sites to average over.
    """
    matrix, mu, Ne = params
    nu = Ne - 1
    L = len(matrix)
    acc = 0
    for i in xrange(trials):
        ep = score_seq(matrix, random_site(L))
        acc += (1 / (1 + exp(ep - mu)))**nu
    mean_Zs = acc / trials
    return L * log(4) + log(mean_Zs)


def log_ZM_naive(params, N, trials=1000):
    """Return N times the naive single-site estimate.

    Args:
        params: (matrix, mu, Ne) tuple.
        N: multiplier applied to the single-site estimate.
        trials: number of random sites used by ``log_ZS_naive``.
    """
    # Forward the caller's `trials`; it was previously replaced by a
    # hard-coded 1000, so the argument had no effect.
    return N * log_ZS_naive(params, trials=trials)


def log_ZS_hack(params, N):
    """Approximate the single-site log partition function with a normal CDF.

    Treats the site energy as normal (mean and variance summed over matrix
    rows) and takes the log-probability of falling below
    ``mu - log(Ne - 1)``, plus L * log(4).

    Args:
        params: (matrix, mu, Ne) tuple; Ne must exceed 1 for the log.
        N: accepted for signature parity; not used.
    """
    matrix, mu, Ne = params
    L = len(matrix)
    mat_mu = sum(map(mean, matrix))
    mat_sigma = sqrt(sum(map(lambda xs: variance(xs, correct=False), matrix)))
    log_perc_below_threshold = norm.logcdf(mu - log((Ne - 1)), mat_mu, mat_sigma)
    log_Zs = L * log(4) + log_perc_below_threshold
    return log_Zs
from sample import direct_sampling,rsa from project_utils import score_seq,sample_average,inverse_cdf_sampler,falling_fac from utils import random_site,pairs,mh,maybesave,transpose,product from matplotlib import pyplot as plt from math import log,exp import random random.seed(1) genome = "ACGTTGCA" * 5 + random_site(80) + "ACGTTGCA" * 5 + random_site(10) G = len(genome) beta = 1 energy_matrix = [[-2,0,0,0], [-0,-2,0,0], [-0,0,-2,0], [-0,0,0,-2], [-0,0,0,-2], [-0,0,-2,0], [-0,-2,0,0], [-2,0,0,0]] w = len(energy_matrix) config_len = G-w interaction_energy = -8 # TFs in contact get -2 added to configuration energy exclusion_energy = 1000000 eps =[score_seq(energy_matrix,genome[i:i+w]) for i in range(G-w+1)] ks = [exp(-ep) for ep in eps] def positions(config): return [i for i,x in enumerate(config) if x > 0] def from_positions(poses):
def sample_Zb_terms(L, sigma, trials=10000):
    """Score ``trials`` random sites against one freshly sampled matrix.

    Args:
        L: site width, also passed to ``sample_matrix``.
        sigma: second argument of ``sample_matrix``.
        trials: number of random sites to score.

    Returns:
        List of ``score_seq`` values, one per random site.
    """
    matrix = sample_matrix(L, sigma)
    energies = []
    for _ in xrange(trials):
        energies.append(score_seq(matrix, random_site(L)))
    return energies
"""compute log_Z analytically""" acc = 0 nu = Ne - 1 L = len(matrix) for kmer in kmers(L): ep = score_seq(matrix, "".join(kmer)) acc += (1 / (1 + exp(ep - mu)))**(Ne - 1) return N * log(acc) def log_ZS_naive((matrix, mu, Ne), trials=1000): acc = 0 nu = Ne - 1 L = len(matrix) for i in xrange(trials): ep = score_seq(matrix, random_site(L)) acc += (1 / (1 + exp(ep - mu)))**(Ne - 1) mean_Zs = acc / trials return L * log(4) + log(mean_Zs) def log_ZM_naive((matrix, mu, Ne), N, trials=1000): return N * log_ZS_naive((matrix, mu, Ne), trials=1000) def log_ZS_hack((matrix, mu, Ne), N): L = len(matrix) mat_mu = sum(map(mean, matrix)) mat_sigma = sqrt(sum(map(lambda xs: variance(xs, correct=False), matrix))) log_perc_below_threshold = norm.logcdf(mu - log((Ne - 1)), mat_mu, mat_sigma)
def mh_motif(n, w, desired_ic, epsilon, scale=10, iterations=10000):
    """Run ``mh`` over motifs, favouring those whose ``motif_ic`` is near ``desired_ic``.

    The weight of a motif is ``exp(-abs(desired_ic - motif_ic(m)) * scale)``
    and moves come from ``mutate_motif``.  The chain starts from n random
    sites of width w.

    Args:
        epsilon: accepted but not used by this function.

    Returns:
        Whatever ``mh`` returns for this run.
    """
    start = [random_site(w) for i in range(n)]

    def weight(m):
        return exp(-abs(desired_ic - motif_ic(m)) * scale)

    return mh(weight, mutate_motif, start, iterations=iterations)
def sample_sites(n=n,L=L):
    """Return a list of n random sites of width L.

    Both defaults are taken from the module-level ``n`` and ``L`` at
    definition time.
    """
    sites = []
    for _ in range(n):
        sites.append(random_site(L))
    return sites
def Zb_from_matrix_ref(matrix, G):
    """Sum exp(-energy) over G random sites scored against ``matrix``.

    Args:
        matrix: per-position energy rows, scored with ``score_seq``.
        G: number of random sites to draw.

    Returns:
        ``np.sum`` of the exponentiated negative energies.
    """
    width = len(matrix)
    energies = []
    for _ in trange(G):
        energies.append(score_seq(matrix, random_site(width)))
    boltzmann_weights = np.exp(-np.array(energies))
    return np.sum(boltzmann_weights)
""" This file contains data for the genome (GENOME) and DNA binding domain (TRUE_ENERGY_MATRIX) """ from utils import random_site W = 10 # width of DNA binding domain TRUE_ENERGY_MATRIX = ([[-2,0,0,0] for i in range(W)]) with open('genome.fa') as f: lines = f.readlines() #GENOME = lines[1] GENOME = random_site(5000000) L = len(GENOME) MEAN_FRAG_LENGTH = 250 # Mon May 12 19:20:38 EDT 2014 # toy genome of 10k bases, MEAN_FRAG_LENGTH = 50 works perfectly