def sample_motif_cftp_param_study():
    """Examine dependence of IC on sigma, Ne.

    Builds a grid_points x grid_points grid of (sigma, Ne) values, and for
    each grid cell averages ``motif_ic`` over ``trials`` motifs sampled with
    ``sample_motif_cftp`` from a freshly sampled matrix.  The grid is drawn
    as a filled contour plot.  The grid is then interpolated and, for each
    biological motif, ``bisect_interval`` is run on [0, 20] over the
    difference between the interpolated IC (at the motif's sigma) and the
    motif's own IC; the resulting (sigma, Ne) pairs are scattered on the
    same axes.

    NOTE(review): ``bio_motifs`` is not defined in this function (its local
    definition is commented out below); it is presumably expected to exist
    at module level -- confirm before calling.

    Returns:
        None; the function only plots.
    """
    grid_points = 10
    sigmas = np.linspace(0.5, 10, grid_points)
    Nes = np.linspace(1, 10, grid_points)
    trials = 3
    n = 20
    L = 10

    def f(sigma, Ne):
        # IC of a single motif sampled from a fresh matrix at (sigma, Ne).
        matrix = sample_matrix(L, sigma)
        mu = approx_mu(matrix, 10 * n)
        return motif_ic(sample_motif_cftp(matrix, mu, Ne, n))

    # Outer index is Ne (rows), inner index is sigma (columns).  A list is
    # passed to mean() so it does not depend on mean() accepting generators.
    ics = [[mean([f(sigma, Ne) for _ in range(trials)]) for sigma in sigmas]
           for Ne in tqdm(Nes, desc="ic grid")]
    plt.contourf(sigmas, Nes, ics)
    plt.colorbar()
    #bio_motifs = [getattr(Escherichia_coli,tf) for tf in Escherichia_coli.tfs]
    bio_sigmas = [sigma_from_matrix(pssm_from_motif(motif, pc=1))
                  for motif in bio_motifs]
    bio_ics = [motif_ic(motif) for motif in bio_motifs]
    #griddata((sigmas,Nes),ics)
    interp = interp2d(sigmas, Nes, ics)
    # The lambda is consumed immediately by bisect_interval, so capturing the
    # loop variables bio_sigma / bio_ic is safe here.
    bio_Nes = [bisect_interval(lambda Ne: interp(show(bio_sigma), Ne) - bio_ic,
                               0, 20)
               for bio_sigma, bio_ic in zip(bio_sigmas, bio_ics)]
    # The original call referenced an undefined name (`sigm`); plot the
    # per-motif (sigma, Ne) estimates computed above.
    plt.scatter(bio_sigmas, bio_Nes)
def spoof_motifs(motif, num_motifs=10, trials=1, sigma=None, Ne_tol=10**-4, double_sigma=True): N = len(motif) L = len(motif[0]) copies = 10 * N if sigma is None: sigma = sigma_from_matrix(pssm_from_motif(motif, pc=1)) epsilon = (1 + double_sigma) * sigma # 15 Jan 2016 print "sigma:", sigma bio_ic = motif_ic(motif) def f(Ne): ps = ps_from_copies(sigma, Ne, L, copies) motifs = [ sample_motif(epsilon, Ne, L, copies, n, ps=ps) for i in range(trials) ] return mean(map(motif_ic, motifs)) - bio_ic Ne = log_regress_spec2(f, [1, 10], tol=10**-3) return [sample_motif(sigma, Ne, L, copies, n) for _ in range(num_motifs)]
def spoof_motif_cftp(motif, num_motifs=10, trials=1, sigma=None,Ne_tol=10**-2,verbose=False): n = len(motif) L = len(motif[0]) copies = 10*n if sigma is None: sigma = sigma_from_matrix(pssm_from_motif(motif,pc=1)) print "sigma:", sigma bio_ic = motif_ic(motif) matrix = sample_matrix(L, sigma) mu = approx_mu(matrix, copies=10*n, G=5*10**6) print "mu:", mu def f(Ne): motifs = [sample_motif_cftp(matrix, mu, Ne, n, verbose=verbose) for i in trange(trials)] return mean(map(motif_ic,motifs)) - bio_ic # lb = 1 # ub = 10 # while f(ub) < 0: # ub *= 2 # print ub x0s = [2,10]#(lb + ub)/2.0 # print "choosing starting seed for Ne" # fs = map(lambda x:abs(f(x)),x0s) # print "starting values:",x0s,fs # x0 = x0s[argmin(fs)] # print "chose:",x0 # Ne = bisect_interval_noisy_ref(f,x0,lb=1,verbose=True) Ne = log_regress_spec2(f,x0s,tol=Ne_tol) print "Ne:",Ne return [sample_motif_cftp(matrix, mu, Ne, n) for _ in trange(num_motifs)]
def spoof_motif_ref(motif, num_motifs=10, trials=10, sigma=None, Ne_tol=10**-4): n = len(motif) L = len(motif[0]) copies = 10 * n if sigma is None: sigma = sigma_from_matrix(pssm_from_motif(motif, pc=1)) print "sigma:", sigma bio_ic = motif_ic(motif) def f(Ne): ps = ps_from_copies(sigma, Ne, L, copies) motifs = [ sample_motif(sigma, Ne, L, copies, n, ps=ps) for i in range(trials) ] return mean(map(motif_ic, motifs)) - bio_ic lb = 1 ub = 2 while f(ub) < 0: ub *= 2 ub *= 2 # once more for good measure x0 = (lb + ub) / 2.0 print "Ne guess:", x0 Nes = [ bisect_interval_noisy(f, x0=x0, tolerance=Ne_tol, lb=1) for i in range(3) ] Ne = mean(Nes) print "Nes:", Nes, Ne return [sample_motif(sigma, Ne, L, copies, n) for _ in range(num_motifs)]
def resample_from_post_chain(chain, N):
    """Resample one motif per entry of a posterior chain.

    ``chain`` has the form [(mat, mu, Ne)].  Each entry is reduced as
    mat -> sigma -> mat' -> motif': sigma is read off the matrix, a new
    matrix is sampled from that sigma, and a motif of size ``N`` is sampled
    by CFTP using the new matrix with the entry's own mu and Ne.

    Conclusion: heavily underestimates IC.
    """
    width = len(chain[0][0])

    sigmas = []
    for mat, mu, Ne in chain:
        sigmas.append(sigma_from_matrix(mat))

    # All replacement matrices are drawn before any motif is sampled.
    matrices = []
    for sigma in sigmas:
        matrices.append(sample_matrix(width, sigma))

    motifs = []
    for matrix, (_, mu, Ne) in tqdm(zip(matrices, chain)):
        motifs.append(sample_motif_cftp(matrix, mu, Ne, N))
    return motifs
def resample_from_post_chain(chain, N):
    """Resample one motif per entry of a posterior chain.

    ``chain`` has the form [(mat, mu, Ne)].  Each entry is reduced as
    mat -> sigma -> mat' -> motif': sigma is read off the matrix, a new
    matrix is sampled from that sigma, and a motif of size ``N`` is sampled
    by CFTP using the new matrix with the entry's own mu and Ne.

    Conclusion: heavily underestimates IC.
    """
    width = len(chain[0][0])

    sigmas = []
    for mat, mu, Ne in chain:
        sigmas.append(sigma_from_matrix(mat))

    # All replacement matrices are drawn before any motif is sampled.
    matrices = []
    for sigma in sigmas:
        matrices.append(sample_matrix(width, sigma))

    motifs = []
    for matrix, (_, mu, Ne) in tqdm(zip(matrices, chain)):
        motifs.append(sample_motif_cftp(matrix, mu, Ne, N))
    return motifs
def spoof_motifs(motif, num_motifs=10, trials=1, sigma=None,Ne_tol=10**-4,double_sigma=True): N = len(motif) L = len(motif[0]) copies = 10*N if sigma is None: sigma = sigma_from_matrix(pssm_from_motif(motif,pc=1)) epsilon = (1+double_sigma)*sigma # 15 Jan 2016 print "sigma:", sigma bio_ic = motif_ic(motif) def f(Ne): ps = ps_from_copies(sigma, Ne, L, copies) motifs = [sample_motif(epsilon, Ne, L, copies, n,ps=ps) for i in range(trials)] return mean(map(motif_ic,motifs)) - bio_ic Ne = log_regress_spec2(f,[1,10],tol=10**-3) return [sample_motif(sigma, Ne, L, copies, n) for _ in range(num_motifs)]
def spoof_motifs_occ(motif, num_motifs=10, trials=1, sigma=None,Ne_tol=10**-4,double_sigma=True): N = len(motif) L = len(motif[0]) copies = 10*N if sigma is None: sigma = sigma_from_matrix(pssm_from_motif(motif,pc=1)) epsilon = (1+double_sigma)*sigma # 15 Jan 2016 print "sigma:", sigma #bio_ic = motif_ic(motif) mat = matrix_from_motif(motif) eps = [score_seq(mat, site) for site in motif] mu = gle_approx_mu(mat, copies) bio_occ = mean([1/(1+exp(ep-mu)) for ep in eps]) def f(Ne): return expected_occupancy(epsilon, Ne, L, copies) - bio_occ Ne = log_regress_spec2(f,[1,10],tol=10**-3) return [sample_motif(sigma, Ne, L, copies, N) for _ in range(num_motifs)]
def analyze_bio_motifs(Nes,trials=20): results = {} for tf_idx,tf in enumerate(Escherichia_coli.tfs): Ne = Nes[tf] bio_motif = getattr(Escherichia_coli,tf) n,L = len(bio_motif),len(bio_motif[0]) bio_matrix = matrix_from_motif(bio_motif) sigma = sigma_from_matrix(bio_matrix) matrix_chains = [sella_hirsch_mh(n=n,L=L,sigma=sigma,Ne=Ne,init='ringer') for i in range(trials)] ics = [mean(map(motif_ic,chain[-1000:])) for (matrix,chain) in matrix_chains] ginis = [mean(map(motif_gini,chain[-1000:])) for (matrix,chain) in matrix_chains] mis = [mean(map(total_motif_mi,chain[-1000:])) for (matrix,chain) in matrix_chains] print "results for:",tf,tf_idx print motif_ic(bio_motif),mean(ics),sd(ics) print motif_gini(bio_motif),mean(ginis),sd(ginis) print total_motif_mi(bio_motif),mean(mis),sd(mis) results[tf] = (mean(ics),sd(ics),mean(ginis),sd(ginis),mean(mis),sd(mis)) return results
def spoof_motif_ar(motif, num_motifs=10, trials=1, sigma=None,Ne_tol=10**-4): n = len(motif) L = len(motif[0]) copies = 10*n if sigma is None: sigma = sigma_from_matrix(pssm_from_motif(motif,pc=1)) print "sigma:", sigma bio_ic = motif_ic(motif) matrix = sample_matrix(L, sigma) mu = approx_mu(matrix, copies=10*n, G=5*10**6) print "mu:", mu def f(Ne): motifs = [sample_motif_ar(matrix, mu, Ne, n) for i in trange(trials)] return mean(map(motif_ic,motifs)) - bio_ic x0 = 2 print "Ne guess:", x0 Ne = bisect_interval_noisy(f,x0=x0,iterations=100,lb=1, verbose=False,w=0.5) print "Ne:",Ne return [sample_motif_ar(matrix, mu, Ne, n) for _ in trange(num_motifs)]
def main_experiment(samples=30, iterations=10000, delta_ic=0.1): results_dict = {} for tf_idx, tf in enumerate(tfdf.tfs): print "starting on:", tf motif = getattr(tfdf, tf) if motif_ic(motif) < 5: print "excluding", tf, "for low IC" continue bio_ic = motif_ic(motif) n = len(motif) L = len(motif[0]) matrix = matrix_from_motif(motif) sigma = sigma_from_matrix(matrix) mu = approximate_mu(matrix, n, G) Ne = estimate_Ne(matrix, mu, n, bio_ic) spoofs = [] ar = 0 spoof_trials = 0.0 while len(spoofs) < samples: spoof_trials += 1 matrix, chain = sella_hirsch_mh(Ne=Ne, mu=mu, n=1, matrix=sample_matrix(L, sigma), init='ringer', iterations=iterations) spoof_motif = concat( [random.choice(chain[iterations / 2:]) for i in range(n)]) if abs(motif_ic(spoof_motif) - bio_ic) < delta_ic: spoofs.append(spoof_motif) ar += 1 print "spoof acceptance rate:", ar / spoof_trials, len( spoofs), samples, spoof_trials #spoofs = [chain[-1] for (spoof_matrix,chain,Ne) in [spoof_motif(motif,Ne) for i in range(samples)]] results_dict[tf] = { fname: map(eval(fname), spoofs) for fname in "motif_ic motif_gini total_motif_mi".split() } print "finished:", tf, "(%s/%s)" % (tf_idx, len(tfdf.tfs)) print bio_ic, mean_ci(results_dict[tf]['motif_ic']) return results_dict
def spoof_motif(motif, Ne=None, iterations=10000): matrix = matrix_from_motif(motif) L = len(motif[0]) n = len(motif) sigma = sigma_from_matrix(matrix) spoof_matrix = sample_matrix(L, sigma) bio_ic = motif_ic(motif) # this method of reading site_mu, site_sigma off of motif is slightly suspect... site_mu = site_mu_from_matrix(matrix_from_motif(motif)) site_sigma = site_sigma_from_matrix(matrix_from_motif(motif)) # now need to find mu, nu n = len(motif) assumed_copies = 10 * n mu = approximate_mu(matrix, assumed_copies, G) spoof_mu = approximate_mu(spoof_matrix, assumed_copies, G) if Ne is None: Ne = estimate_Ne(spoof_matrix, spoof_mu, n, bio_ic) print "chose Ne:", Ne spoof_matrix, chain = sella_hirsch_mh(Ne=Ne, matrix=spoof_matrix, mu=mu, n=n) return spoof_matrix, chain, Ne
def spoof_motif_ref(motif, num_motifs=10, trials=10, sigma=None,Ne_tol=10**-4): n = len(motif) L = len(motif[0]) copies = 10*n if sigma is None: sigma = sigma_from_matrix(pssm_from_motif(motif,pc=1)) print "sigma:", sigma bio_ic = motif_ic(motif) def f(Ne): ps = ps_from_copies(sigma, Ne, L, copies) motifs = [sample_motif(sigma, Ne, L, copies, n,ps=ps) for i in range(trials)] return mean(map(motif_ic,motifs)) - bio_ic lb = 1 ub = 2 while f(ub) < 0: ub *= 2 ub *= 2 # once more for good measure x0 = (lb + ub)/2.0 print "Ne guess:", x0 Nes = [bisect_interval_noisy(f,x0=x0,tolerance=Ne_tol,lb=1) for i in range(3)] Ne = mean(Nes) print "Nes:",Nes,Ne return [sample_motif(sigma, Ne, L, copies, n) for _ in range(num_motifs)]
def spoof_motifs_occ(motif, num_motifs=10, trials=1, sigma=None, Ne_tol=10**-4, double_sigma=True): N = len(motif) L = len(motif[0]) copies = 10 * N if sigma is None: sigma = sigma_from_matrix(pssm_from_motif(motif, pc=1)) epsilon = (1 + double_sigma) * sigma # 15 Jan 2016 print "sigma:", sigma #bio_ic = motif_ic(motif) mat = matrix_from_motif(motif) eps = [score_seq(mat, site) for site in motif] mu = gle_approx_mu(mat, copies) bio_occ = mean([1 / (1 + exp(ep - mu)) for ep in eps]) def f(Ne): return expected_occupancy(epsilon, Ne, L, copies) - bio_occ Ne = log_regress_spec2(f, [1, 10], tol=10**-3) return [sample_motif(sigma, Ne, L, copies, N) for _ in range(num_motifs)]