def excess_mi_experiment(filename=None):
    """Do artificial motifs with linear BEMs show the same patterns of
    excess MI as biological motifs? (Yes)

    Solves for the population size at which the expected motif IC under
    ``sella_hirsch_predictions`` reaches the target, samples
    match/mismatch motifs at that population size, samples MaxEnt motifs
    at the same mean IC, and hands both ensembles to
    ``all_boxplot_comparisons``.

    filename -- forwarded unchanged to ``all_boxplot_comparisons``.
    """
    # Fixed experiment parameters; the figure title below repeats them.
    n = 10
    L = 10
    G = 1000
    desired_ic = 10
    replicates = 1000

    # One IC value per mismatch configuration returned by enumerate_eps.
    eps_ics = np.array(
        [mean_ic_from_eps(eps, n, L) for eps in enumerate_eps(n, L)])

    def ic_gap(N):
        # Expected IC at population size N, minus the target IC.
        return eps_ics.dot(sella_hirsch_predictions(n, L, G, N)) - desired_ic

    Ne = secant_interval(ic_gap, 0, 2000, tolerance=0.1,
                         verbose=True)  # ~= 1525

    eps_probs = sella_hirsch_predictions(n, L, G, Ne)
    mismatch_sampler = inverse_cdf_sampler(list(enumerate_eps(n, L)),
                                           eps_probs)

    sh_motifs = []
    for _ in trange(replicates):
        sh_motifs.append(sample_motif_from_mismatches(mismatch_sampler(), L))

    # may undershoot desired due to approximation
    sh_mean_ic = mean([motif_ic(motif) for motif in sh_motifs])

    maxent_motifs = maxent_sample_motifs_with_ic(n, L, sh_mean_ic, replicates)

    plt.suptitle(
        "Motif Statistics for Match/Mismatch Model vs. MaxEnt Ensembles (n=10,L=10,G=1000)"
    )
    all_boxplot_comparisons([sh_motifs, maxent_motifs],
                            labels=["M/MM", "MaxEnt"],
                            plot_titles="IC Gini MI".split(),
                            filename=filename)
def uniform_motifs_accept_reject(n,
                                 L,
                                 desired_ic,
                                 num_motifs,
                                 epsilon=0.1,
                                 beta=None,
                                 verbose=False):
    """Return a list of ``num_motifs`` motifs, each drawn by
    ``uniform_motif_accept_reject``.

    ``beta``, the count probabilities and the count sampler are built
    once here and passed to every per-motif call so they are not
    recomputed for each motif.

    n, L       -- motif dimensions as passed to the helpers
    desired_ic -- target information content
    epsilon    -- tolerance forwarded to the per-motif sampler
    beta       -- optional precomputed beta; solved for when None
    verbose    -- forwarded to the beta solver and per-motif sampler
    """
    if beta is None:
        # Raise the IC target by a per-column correction before solving
        # for beta.
        per_col_correction = 3 / (2 * log(2) * n)
        beta_target_ic = desired_ic + L * per_col_correction
        beta = find_beta_for_mean_motif_ic(n,
                                           L,
                                           beta_target_ic,
                                           verbose=verbose)
    ps = count_ps_from_beta(n, beta)
    count_sampler = inverse_cdf_sampler(enumerate_counts(n), ps)

    motifs = []
    for _ in trange(num_motifs):
        motifs.append(
            uniform_motif_accept_reject(n,
                                        L,
                                        desired_ic,
                                        epsilon=epsilon,
                                        beta=beta,
                                        ps=ps,
                                        count_sampler=count_sampler,
                                        verbose=verbose))
    return motifs
def uniform_motif_accept_reject(n,
                                L,
                                desired_ic,
                                epsilon=0.1,
                                beta=None,
                                ps=None,
                                count_sampler=None,
                                verbose=False):
    """Draw one motif with IC within ``epsilon`` of ``desired_ic`` by
    accept/reject sampling.

    Proposals are built column by column from ``count_sampler`` and
    redrawn until ``inrange(motif, desired_ic, epsilon)`` holds.  A
    proposal is then accepted with probability
    ``1 / exp(beta * (motif_ic(motif) - Imin))`` where
    ``Imin = desired_ic - epsilon``.

    n, L          -- motif dimensions as passed to the helpers
    desired_ic    -- target information content
    epsilon       -- half-width of the accepted IC window
    beta          -- optional precomputed beta; solved for when None
    ps            -- optional count probabilities; derived from beta
                     when None
    count_sampler -- optional zero-argument callable returning one
                     column count; built from ``ps`` when None
    verbose       -- when true, print progress diagnostics

    Returns the motif as a list of strings.
    """
    # Diagnostics are emitted only on request: this function is called
    # once per motif by the batch sampler, so an unconditional trace
    # floods stdout.
    if verbose:
        print("uniform motif accept reject: %s %s %s %s" %
              (n, L, desired_ic, beta))

    correction_per_col = 3 / (2 * log(2) * n)
    desired_ic_for_beta = desired_ic + L * correction_per_col
    if desired_ic_for_beta == 2 * L:
        # if we reach the upper limit, things break down
        cols = [sample_col_from_count((0, 0, 0, n)) for _ in range(L)]
        return ["".join(site) for site in transpose(cols)]

    if beta is None:
        beta = find_beta_for_mean_motif_ic(n, L, desired_ic_for_beta)
    if verbose:
        print("beta: %s" % (beta,))
    if ps is None:
        ps = count_ps_from_beta(n, beta)
    if count_sampler is None:
        count_sampler = inverse_cdf_sampler(enumerate_counts(n), ps)

    def rQ_raw():
        # Unconstrained proposal: draw all column counts first, then
        # realise each count as a column.
        counts = [count_sampler() for _ in range(L)]
        cols = [sample_col_from_count(count) for count in counts]
        return ["".join(site) for site in transpose(cols)]

    def rQ():
        # Proposal restricted to the target IC window.
        return sample_until(lambda M: inrange(M, desired_ic, epsilon),
                            rQ_raw,
                            1,
                            progress_bar=False)[0]

    Imin = desired_ic - epsilon
    Imax = desired_ic + epsilon
    log_M = -beta * Imin
    if verbose:
        print("Imin, Imax, log_M: %s %s %s" % (Imin, Imax, log_M))

    def AR(motif):
        # Acceptance probability for a proposed motif.
        return 1.0 / exp(beta * motif_ic(motif) + log_M)

    trials = 0
    while True:
        trials += 1
        motif = rQ()
        r = random.random()
        accept_prob = AR(motif)
        if r < accept_prob:
            return motif
        if verbose and trials % 100 == 0:
            print("%s %s" % (trials, accept_prob))
def maxent_motifs_with_ic(n,
                          L,
                          desired_ic,
                          num_motifs,
                          tolerance=10**-10,
                          beta=None,
                          verbose=False):
    """Return ``num_motifs`` motifs sampled column by column from the
    count distribution given by ``count_ps_from_beta(n, beta)``.

    When ``beta`` is None it is solved for with
    ``find_beta_for_mean_motif_ic`` after raising ``desired_ic`` by a
    per-column correction; a supplied ``beta`` is used as-is.

    tolerance -- forwarded to the beta solver
    verbose   -- forwarded to the beta solver; also prints beta

    Each motif is a list of strings.
    """
    if beta is None:
        per_col_correction = 3 / (2 * log(2) * n)
        desired_ic = desired_ic + L * per_col_correction
        beta = find_beta_for_mean_motif_ic(n,
                                           L,
                                           desired_ic,
                                           tolerance=tolerance,
                                           verbose=verbose)
    if verbose:
        print("beta: %s" % (beta,))

    ps = count_ps_from_beta(n, beta)
    count_sampler = inverse_cdf_sampler(enumerate_counts(n), ps)

    def draw_motif():
        # Draw every column count before realising any column, so the
        # sequence of sampler calls is counts first, then columns.
        column_counts = [count_sampler() for _ in range(L)]
        columns = [sample_col_from_count(c) for c in column_counts]
        return ["".join(site) for site in transpose(columns)]

    motifs = []
    for _ in trange(num_motifs):
        motifs.append(draw_motif())
    return motifs
def uniform_motif_with_ic_imh_ref(n,
                                  L,
                                  desired_ic,
                                  epsilon=0.1,
                                  iterations=None,
                                  verbose=False,
                                  num_chains=8):
    """Run ``mh`` with a state-independent proposal targeting motifs whose
    IC lies within ``epsilon`` of ``desired_ic``.

    With a truthy ``iterations`` a single chain of that length is run and
    returned.  Otherwise ``num_chains`` chains are run and repeatedly
    extended (100 iterations first, doubling each round) until the
    ``R_hat`` reported by ``gelman_rubin`` on the chains' IC values falls
    below 1.1; the list of chains is then returned.

    ``verbose`` is accepted but not used by this function.
    """
    per_col_correction = 3 / (2 * log(2) * n)
    beta_target_ic = desired_ic + L * per_col_correction
    beta = find_beta_for_mean_motif_ic(n, L, beta_target_ic)
    ps = count_ps_from_beta(n, beta)
    count_sampler = inverse_cdf_sampler(enumerate_counts(n), ps)

    def Q(motif):
        # The proposal ignores the current state.
        column_counts = [count_sampler() for _ in range(L)]
        columns = [sample_col_from_count(c) for c in column_counts]
        return ["".join(site) for site in transpose(columns)]

    def log_dQ(motif_p, motif):
        # Log proposal weight; depends on the proposed motif only.
        return (beta * motif_ic(motif_p))

    def log_f(motif):
        # Log target: 0 inside the IC window, a huge negative number
        # standing in for log(0) outside it.
        if abs(motif_ic(motif) - desired_ic) < epsilon:
            return 0
        return -10.0**100

    def is_in_window(x):
        return log_f(x) > -1

    def fresh_proposal():
        return Q(None)

    def run_chain(start, length):
        return mh(log_f,
                  proposal=Q,
                  dprop=log_dQ,
                  x0=start,
                  iterations=length,
                  use_log=True,
                  verbose=False)

    if iterations:
        x0 = sample_until(is_in_window, fresh_proposal, 1)[0]
        return run_chain(x0, iterations)

    # use gelman rubin criterion
    x0s = sample_until(is_in_window, fresh_proposal, num_chains)
    iterations = 100
    chains = [[] for _ in range(num_chains)]
    while True:
        for chain, x0 in zip(chains, x0s):
            chain.extend(run_chain(x0, iterations))
        ic_chains = mmap(motif_ic, chains)
        R_hat, neff = gelman_rubin(ic_chains)
        if R_hat < 1.1:
            return chains
        # Not converged: continue every chain from its last state with
        # twice as many iterations.
        x0s = [chain[-1] for chain in chains]
        iterations *= 2
def uniform_motif_with_ic_imh(n,
                              L,
                              desired_ic,
                              epsilon=0.1,
                              iterations=None,
                              verbose=False,
                              beta=None,
                              num_chains=8):
    """Run one ``mh`` chain with a state-independent proposal targeting
    motifs whose IC lies within ``epsilon`` of ``desired_ic``.

    n, L       -- motif dimensions as passed to the helpers
    desired_ic -- centre of the target IC window
    epsilon    -- half-width of the target IC window
    iterations -- chain length.  When falsy (the default) it is chosen
                  automatically: pilot runs of 100, 200, 400, ...
                  iterations are made until ``mh(..., return_ar=True)``
                  reports a non-zero value ``ar``, and the chain length
                  is set to ``int(1.0 / ar * 10)``.
    beta       -- optional precomputed beta; solved for when None
    verbose, num_chains -- accepted but not used by this function

    Returns whatever ``mh`` returns for the final run.
    """
    if beta is None:
        correction_per_col = 3 / (2 * log(2) * n)
        desired_ic_for_beta = desired_ic + L * correction_per_col
        beta = find_beta_for_mean_motif_ic(n, L, desired_ic_for_beta)
    ps = count_ps_from_beta(n, beta)
    count_sampler = inverse_cdf_sampler(enumerate_counts(n), ps)

    def Q(motif):
        # The proposal ignores the current state.
        counts = [count_sampler() for _ in range(L)]
        cols = [sample_col_from_count(count) for count in counts]
        return ["".join(site) for site in transpose(cols)]

    def log_dQ(motif_p, motif):
        # Log proposal weight; depends on the proposed motif only.
        return (beta * motif_ic(motif_p))

    def log_f(motif):
        # Log target: 0 inside the IC window, a huge negative number
        # standing in for log(0) outside it.
        in_range = abs(motif_ic(motif) - desired_ic) < epsilon
        return 0 if in_range else -10.0**100

    x0 = sample_until(lambda x: log_f(x) > -1, lambda: Q(None), 1)[0]

    if not iterations:
        # No chain length requested: estimate the acceptance rate with
        # pilot runs of doubling length, then size the real run from it.
        ar = 0
        pilot_iterations = 100
        while ar == 0:
            ar = mh(log_f,
                    proposal=Q,
                    dprop=log_dQ,
                    x0=x0,
                    iterations=pilot_iterations,
                    use_log=True,
                    verbose=False,
                    return_ar=True)
            pilot_iterations *= 2
        iterations = int(1.0 / ar * 10)

    chain = mh(log_f,
               proposal=Q,
               dprop=log_dQ,
               x0=x0,
               iterations=iterations,
               use_log=True,
               verbose=False)
    return chain
def uniform_motif_imh_tv(n, L, desired_ic, beta=None, epsilon=None, tv=0.01):
    """run uniform imh to within total variation bound tv

    n, L       -- motif dimensions as passed to the helpers
    desired_ic -- centre of the target IC window
    beta       -- optional precomputed beta; solved for when None
    epsilon    -- half-width of the IC window; defaults to
                  ``1.0 / (2 * beta)``
    tv         -- bound used to size the chain: the number of iterations
                  is ``ceil(log(tv) / log(1 - alpha))`` with
                  ``alpha = exp(-2 * beta * epsilon)``

    Returns whatever ``mh`` returns for the run.  The chosen epsilon
    (when defaulted) and the iteration count are printed.
    """
    if beta is None:
        correction_per_col = 3 / (2 * log(2) * n)
        desired_ic_for_beta = desired_ic + L * correction_per_col
        beta = find_beta_for_mean_motif_ic(n, L, desired_ic_for_beta)
    if epsilon is None:
        epsilon = 1.0 / (2 * beta)
        print("maximally efficient epsilon: %s" % (epsilon,))

    ps = count_ps_from_beta(n, beta)
    count_sampler = inverse_cdf_sampler(enumerate_counts(n), ps)

    def Qp(motif):
        # Unconstrained proposal; ignores the current state.
        counts = [count_sampler() for _ in range(L)]
        cols = [sample_col_from_count(count) for count in counts]
        return ["".join(site) for site in transpose(cols)]

    def Q(motif):
        # Proposal restricted to the target IC window.
        return sample_until(lambda m: abs(motif_ic(m) - desired_ic) < epsilon,
                            lambda: Qp(None), 1)[0]

    def log_dQ(motif_p, motif):
        # Log proposal weight; depends on the proposed motif only.
        return (beta * motif_ic(motif_p))

    def log_f(motif):
        # Log target: 0 inside the IC window, a huge negative number
        # standing in for log(0) outside it.
        in_range = abs(motif_ic(motif) - desired_ic) < epsilon
        return 0 if in_range else -10.0**100

    alpha = exp(-2 * beta * epsilon)
    iterations = int(ceil(log(tv) / log(1 - alpha)))
    print("iterations: %s" % (iterations,))

    # Starting state drawn from the windowed proposal.
    x0 = sample_until(lambda x: log_f(x) > -1, lambda: Q(None), 1)[0]
    chain = mh(log_f,
               proposal=Q,
               dprop=log_dQ,
               x0=x0,
               iterations=iterations,
               use_log=True,
               verbose=False)
    return chain
def maxent_motif_with_ic(n,
                         L,
                         desired_ic,
                         tolerance=10**-10,
                         beta=None,
                         verbose=False):
    """sample motif from max ent distribution with mean desired_ic

    tolerance -- forwarded to the beta solver
    beta      -- optional precomputed beta; solved for when None
    verbose   -- print a notice before solving and forward to the solver

    Returns the motif as a list of strings.
    """
    if beta is None:
        if verbose:
            print("finding beta")
        # Adjust the desired ic upwards so that when motif_ic is called
        # with 1st order correction, we get the desired ic.
        per_col_correction = 3 / (2 * log(2) * n)
        desired_ic = desired_ic + L * per_col_correction
        beta = find_beta_for_mean_motif_ic(n,
                                           L,
                                           desired_ic,
                                           tolerance=tolerance,
                                           verbose=verbose)

    ps = count_ps_from_beta(n, beta)
    count_sampler = inverse_cdf_sampler(enumerate_counts(n), ps)

    # Draw every column count before realising any column.
    column_counts = [count_sampler() for _ in range(L)]
    columns = [sample_col_from_count(c) for c in column_counts]
    return ["".join(site) for site in transpose(columns)]