def validate_sample_motif_neglect_fg2(iterations=50000):
    """Compare fg_neglect sampling to MCMC.

    Draws ``iterations`` motifs with ``sample_motif_neglect_fg`` and runs a
    ``sella_hirsch_mh`` chain (``init="ringer"``) for the same number of
    iterations, both against the negated ``make_pssm`` matrix of
    ``Escherichia_coli.LexA`` at ``Ne = 2.375``.

    Two panels are drawn on the current matplotlib figure:
      1. log fitness vs. Hamming distance from the ringer motif ("rho")
      2. motif IC vs. rho
    Independent samples use the default colour; chain states are green.

    Args:
        iterations: number of independent samples and of chain iterations.

    Returns:
        None.  The function only plots.

    Note:
        ``G`` is a free name here (not defined in this function); it is
        passed straight to ``log_fitness``.
    """
    bio_motif = Escherichia_coli.LexA
    n = len(bio_motif)
    matrix = [[-ep for ep in row] for row in make_pssm(bio_motif)]
    ringer = ringer_motif(matrix, n)
    Ne = 2.375

    def summarize(motifs):
        """Return (rhos, log fitnesses, ICs) for ``motifs`` as plain lists."""
        rhos = [motif_hamming_distance(ringer, motif)
                for motif in tqdm(motifs)]
        log_fs = [log_fitness(matrix, motif, G) for motif in tqdm(motifs)]
        # Built as a list (not a bare ``map``) so scatter always receives a
        # real sequence, including on interpreters where ``map`` is lazy.
        ics = [motif_ic(motif) for motif in motifs]
        return rhos, log_fs, ics

    # Independent samples first, then the chain: same order as before so
    # the random stream is consumed identically.
    random_motifs = [sample_motif_neglect_fg(matrix, n, Ne)
                     for _ in trange(iterations)]
    random_rhos, random_log_fs, random_ics = summarize(random_motifs)

    _, chain = sella_hirsch_mh(matrix=matrix, init="ringer", Ne=Ne, n=n,
                               iterations=iterations)
    chain_rhos, chain_log_fs, chain_ics = summarize(chain)

    plt.subplot(1, 2, 1)
    plt.scatter(random_rhos, random_log_fs)
    plt.scatter(chain_rhos, chain_log_fs, color='g')
    plt.xlabel("rho")
    plt.ylabel("log fitness")

    plt.subplot(1, 2, 2)
    plt.scatter(random_rhos, random_ics)
    plt.scatter(chain_rhos, chain_ics, color='g')
    plt.xlabel("rho")
    plt.ylabel("IC")
def validate_sample_motif_neglect_fg():
    """Compare fg_neglect sampling to random mutation.

    Original author's observation: fg_neglect sampling "indeed shows better
    fitness at given rho".

    Two families of motifs are built against the negated ``make_pssm``
    matrix of ``Escherichia_coli.LexA``:
      * the ringer motif mutated ``k`` times, for ``k`` in ``range(n * L)``
      * ``sample_motif_neglect_fg`` draws, one per ``Ne`` value in
        ``np.linspace(1, 10, n * L)``
    For each family, log fitness is plotted against Hamming distance from
    the ringer motif on the current matplotlib axes.

    Returns:
        None.  The function only plots.

    Note:
        ``G`` is a free name here (not defined in this function); it is
        passed straight to ``log_fitness``.
    """
    bio_motif = Escherichia_coli.LexA
    n = len(bio_motif)
    L = len(bio_motif[0])
    matrix = [[-ep for ep in row] for row in make_pssm(bio_motif)]
    ringer = ringer_motif(matrix, n)

    random_motifs = [mutate_motif_k_times(ringer, k) for k in range(n * L)]
    random_motifs2 = [sample_motif_neglect_fg(matrix, n, Ne)
                      for Ne in np.linspace(1, 10, n * L)]

    random_rhos = [motif_hamming_distance(ringer, motif)
                   for motif in random_motifs]
    random_log_fs = [log_fitness(matrix, motif, G)
                     for motif in random_motifs]
    random_rhos2 = [motif_hamming_distance(ringer, motif)
                    for motif in random_motifs2]
    random_log_fs2 = [log_fitness(matrix, motif, G)
                      for motif in random_motifs2]

    plt.plot(random_rhos, random_log_fs)
    plt.plot(random_rhos2, random_log_fs2)
    # A third series (random_rhos3 / random_log_fs3) used to be plotted here
    # but was never computed, so the call always raised NameError.
def entropy_drift_analysis(sigma=2, color='b', color_p='g'):
    """Explore selection/mutation balance around the ringer motif.

    Motivating question: why is convergence so difficult to obtain for,
    say, sigma = 2?

    A matrix is drawn with ``sample_matrix(16, sigma)`` and its 16-site
    ringer motif is built.  For every ``i`` in ``0..255``, ten mutants are
    produced with ``iterate(mutate_motif, ringer, i)``.  Each mutant then
    gets 100 single ``mutate_motif`` proposals, each accepted when
    ``log(u) < fp - f`` (``u`` uniform); the mean log fitness after that
    accept/reject step is recorded.

    Three stacked panels are drawn on the current matplotlib figure:
      1. log fitness (``color``) and mean post-step log fitness
         (``color_p``) vs. Hamming distance from the ringer
      2. relative change ``(f - fp) / f`` vs. distance, with a dashed
         zero line
      3. ``fp - f`` vs. ``f`` with a degree-1 polynomial fit and a dashed
         zero line

    Args:
        sigma: second argument passed to ``sample_matrix``.
        color: marker colour for the per-mutant values.
        color_p: marker colour for the post-step means in panel 1.

    Returns:
        None.  The function only plots.

    Note:
        ``G`` is a free name here (not defined in this function); it is
        passed straight to ``log_fitness``.
    """
    n = 16
    L = 16
    mutation_depths = 256
    replicates = 10
    trials = 100

    matrix = sample_matrix(L, sigma)
    ringer = ringer_motif(matrix, n)
    mutants = [
        iterate(mutate_motif, ringer, i)
        for i in trange(mutation_depths)
        for _ in range(replicates)
    ]
    dists = [
        motif_hamming_distance(ringer, mutant) for mutant in tqdm(mutants)
    ]
    fs = [log_fitness(matrix, mutant, G) for mutant in tqdm(mutants)]

    fps = []
    for mutant in tqdm(mutants):
        nexts = []
        f = log_fitness(matrix, mutant, G)
        for _ in range(trials):
            mutant_p = mutate_motif(mutant)
            fp = log_fitness(matrix, mutant_p, G)
            # Accept/reject in log space: accepted proposals contribute
            # their own fitness, rejected ones keep the current fitness.
            if log(random.random()) < fp - f:
                nexts.append(fp)
            else:
                nexts.append(f)
        fps.append(mean(nexts))

    plt.subplot(3, 1, 1)
    plt.scatter(dists, fs, color=color, marker='.')
    plt.scatter(dists, fps, color=color_p, marker='.')

    plt.subplot(3, 1, 2)
    plt.scatter(dists, [(f - fp) / f for (f, fp) in zip(fs, fps)],
                color=color,
                marker='.')
    # The x-axis here is Hamming distance, so the zero line spans the
    # observed distances (it used to run to len(fs), the sample count).
    plt.plot([min(dists), max(dists)], [0, 0], linestyle='--', color='black')

    plt.subplot(3, 1, 3)
    diffs = [fp - f for f, fp in zip(fs, fps)]
    plt.scatter(fs, diffs, marker='.', color=color)
    interpolant = poly1d(polyfit(fs, diffs, 1))
    plt.plot(*pl(interpolant, [min(fs), max(fs)]))
    plt.plot([min(fs), max(fs)], [0, 0], linestyle='--', color='black')
def plot_matrix_chain_ringer_distance(matrix_chain):
    """Plot each chain state's Hamming distance from the ringer motif.

    Args:
        matrix_chain: a ``(matrix, chain)`` pair.  ``chain`` is an indexable
            sequence of motifs; the ringer motif is built from ``matrix``
            with as many sites as ``chain[0]`` has.

    Returns:
        None.  One line is drawn on the current matplotlib axes.
    """
    # Unpacked in the body rather than in the signature: tuple parameters
    # are Python-2-only syntax.  Callers still pass a single pair.
    matrix, chain = matrix_chain
    n = len(chain[0])
    ringer = ringer_motif(matrix, n)
    plt.plot([motif_hamming_distance(motif, ringer) for motif in tqdm(chain)])
def plot_matrix_chain_distance_from_first(matrix_chain):
    """Plot each chain state's Hamming distance from the chain's first state.

    Args:
        matrix_chain: a ``(matrix, chain)`` pair.  Only ``chain`` is used;
            the matrix is accepted so the argument has the same shape as
            for the other chain-plotting helpers.

    Returns:
        None.  One line is drawn on the current matplotlib axes.
    """
    # Unpacked in the body rather than in the signature: tuple parameters
    # are Python-2-only syntax.  Callers still pass a single pair.
    _matrix, chain = matrix_chain
    init = chain[0]
    plt.plot([motif_hamming_distance(m, init) for m in tqdm(chain)])
def rho_fitness_plot(matrix_chain):
    """Scatter a chain against two reference ensembles of motifs.

    Three sets of motifs are summarised by log10 fitness, Hamming distance
    from the ringer motif ("rho") and motif IC:
      * the chain states themselves (blue)
      * the ringer mutated ``random.randrange(n * L)`` times, one motif per
        chain state (green)
      * 10000 draws of ``sample_log_odds(matrix, n, 3)`` (red)

    Three panels are drawn: log f vs. rho, IC vs. rho, and fs vs. IC.
    Progress labels are printed to stdout before each computation stage.

    Args:
        matrix_chain: a ``(matrix, chain)`` pair; ``chain`` is an indexable
            sequence of motifs, each an indexable sequence of sites.

    Returns:
        None.  The function only prints and plots.

    Note:
        ``G`` is a free name here (not defined in this function); it is
        passed straight to ``fitness``.
    """
    matrix, chain = matrix_chain
    n = len(chain[0])
    L = len(chain[0][0])
    N = n * L
    ringer = ringer_motif(matrix, n)

    def summarize(motifs):
        """Return (log10 fitnesses, rhos, ICs) for ``motifs``."""
        print("fs")
        log_fs = [log10(fitness(matrix, motif, G)) for motif in tqdm(motifs)]
        print("rhos")
        dists = [
            motif_hamming_distance(ringer, motif) for motif in tqdm(motifs)
        ]
        print("ics")
        infos = [motif_ic(motif) for motif in tqdm(motifs)]
        return log_fs, dists, infos

    fs, rhos, ics = summarize(chain)

    # NOTE(review): these labels are set before any subplot is selected, so
    # they do not land on any of the three panels below -- confirm whether
    # they can be dropped.
    plt.xlabel("Distance from ringer")
    plt.ylabel("fitness")

    print("perturbations")
    perturbations = [
        mutate_motif_k_times(ringer, random.randrange(N)) for _ in tqdm(chain)
    ]
    perturb_fs, perturb_rhos, perturb_ics = summarize(perturbations)

    print("log odds")
    # NOTE(review): the linspace values are never used; every draw is taken
    # with the constant 3.  Presumably the swept value was meant to be the
    # third argument -- confirm against sample_log_odds before changing.
    log_odds = [
        sample_log_odds(matrix, n, 3)
        for _ in tqdm(np.linspace(0.5, 5, 10000))
    ]
    log_odds_fs, log_odds_rhos, log_odds_ics = summarize(log_odds)

    plt.subplot(1, 3, 1)
    plt.scatter(perturb_rhos, perturb_fs, color='g')
    plt.scatter(log_odds_rhos, log_odds_fs, color='r')
    plt.scatter(rhos, fs, color='b')
    plt.xlabel("rho")
    plt.ylabel("log f")

    plt.subplot(1, 3, 2)
    plt.scatter(perturb_rhos, perturb_ics, color='g')
    plt.scatter(log_odds_rhos, log_odds_ics, color='r')
    plt.scatter(rhos, ics, color='b')
    plt.xlabel("rho")
    plt.ylabel("IC")

    plt.subplot(1, 3, 3)
    plt.scatter(perturb_ics, perturb_fs, color='g')
    plt.scatter(log_odds_ics, log_odds_fs, color='r')
    plt.scatter(ics, fs, color='b')
    plt.xlabel("IC")
    plt.ylabel("fs")
def log_dprop(motif, _):
    """Log proposal density of ``motif`` under the ringer-perturbation scheme.

    With ``k`` the Hamming distance between ``motif`` and ``ringer`` (the
    number of mutations from the ringer), the value returned is
    ``log(ps[k]) - log_choose(N, k) + k * log(1/3)``.

    ``ringer``, ``ps`` and ``N`` are free names resolved from the enclosing
    scope.  The second positional argument is ignored.
    """
    n_mutations = motif_hamming_distance(ringer, motif)
    log_p_k = log(ps[n_mutations])
    log_placements = log_choose(N, n_mutations)
    log_base_choices = n_mutations * log(1 / 3.0)
    return log_p_k - log_placements + log_base_choices
def dprop(motif, _):
    """Proposal density of ``motif`` under the ringer-perturbation scheme.

    With ``k`` the Hamming distance between ``motif`` and ``ringer`` (the
    number of mutations from the ringer), the density is
    ``ps[k] * (1 / choose(N, k)) * (1/3)**k``: the probability of drawing
    ``k``, spread over the ``choose(N, k)`` position sets and the three
    alternative bases at each mutated position.  This is the linear-space
    counterpart of ``log(ps[k]) - log_choose(N, k) + k * log(1/3)``.

    ``ringer``, ``ps`` and ``N`` are free names resolved from the enclosing
    scope.  The second positional argument is ignored.
    """
    k = motif_hamming_distance(ringer, motif)  # number of mutations from ringer
    # Divide (not multiply) by the number of position sets; float() guards
    # against integer floor division if both operands happen to be ints.
    return ps[k] / float(choose(N, k)) * (1 / 3.0) ** k
def weight(motif):
    """Return 4 raised to the Hamming distance between ``motif`` and ``ringer``.

    ``ringer`` is a free name resolved from the enclosing scope.
    """
    mutations_from_ringer = motif_hamming_distance(ringer, motif)
    return pow(4, mutations_from_ringer)
def log_dprop(motif, _):
    """Log proposal density of ``motif`` relative to the ringer motif.

    In linear space the density is
    ``p(k) * (1 / choose(N, k)) * (1/3)**k`` where ``k`` is the number of
    mutations separating ``motif`` from ``ringer``; this function returns
    its logarithm.  An earlier form without the ``choose`` term was
    abandoned.

    ``ringer``, ``ps`` and ``N`` are free names resolved from the enclosing
    scope.  The second positional argument is ignored.
    """
    mutations = motif_hamming_distance(ringer, motif)
    total = log(ps[mutations])           # log p(k)
    total -= log_choose(N, mutations)    # which k positions were hit
    total += mutations * log(1.0 / 3)    # which base each one became
    return total
def plot_log_fs_vs_log_dprops(Ne=5,
                              n=16,
                              L=16,
                              G=5 * 10**6,
                              sigma=1,
                              matrix=None,
                              x0=None,
                              iterations=50000,
                              lamb=1):
    """Compare log fitness with the log density of a matrix-based proposal.

    ``iterations`` motifs are drawn by sampling every site independently
    from per-row base probabilities ``normalize(exp(-ep))`` of ``matrix``
    (bases indexed in the order "ACGT").  For each motif the log fitness,
    the log proposal density and the Hamming distance from the ringer
    motif are computed.  Self-normalised importance weights
    ``exp(log_f - log_dprop)`` are formed, and the weighted mean log
    fitness and mean distance are printed.

    Panels drawn on the current matplotlib figure (2 x 3 grid):
      1. log proposal density vs. log fitness, with a dashed y = x line
      2. log10 weight vs. ringer distance
      3. log10 weight vs. log fitness
      5. log fitness vs. ringer distance
      6. log10 weight vs. log fitness (same data as panel 3)

    Args:
        Ne: unused; kept so existing callers keep working.
        n: number of sites per motif.
        L: first argument to ``sample_matrix`` when ``matrix`` is None;
            also used to size the ringer-perturbation proposal.
        G: passed to ``log_fitness`` as its third argument.
        sigma: second argument to ``sample_matrix`` when ``matrix`` is None.
        matrix: energy matrix to use; sampled when None.
        x0: unused; kept so existing callers keep working.
        iterations: number of motifs to draw.
        lamb: decay rate of the distribution over mutation counts used by
            the (currently unplotted) ringer-perturbation proposal.

    Returns:
        None.  The function only prints and plots.
    """
    if matrix is None:
        matrix = sample_matrix(L, sigma)
    N = n * L
    ringer = ringer_motif(matrix, n)

    def log_f(motif):
        return log_fitness(matrix, motif, G)

    # ------------------------------------------------------------------
    # Ringer-perturbation proposal.  Not used by the plots below; kept as
    # the alternative proposal for experimentation.
    # ------------------------------------------------------------------
    ps = normalize([exp(-lamb * i) for i in range(N)])

    def prop(motif):
        k = inverse_cdf_sample(range(N), ps)
        return mutate_motif_k_times(ringer, k)

    def log_dprop(motif, _):
        # p(k) * (1 / choose(N, k)) * (1/3)**k, in log space
        k = motif_hamming_distance(ringer, motif)  # mutations from ringer
        return log(ps[k]) - log_choose(N, k) + k * log(1 / 3.0)

    def dprop(motif, _):
        # Linear-space counterpart of log_dprop: divide by choose(N, k).
        k = motif_hamming_distance(ringer, motif)  # mutations from ringer
        return ps[k] / float(choose(N, k)) * (1 / 3.0) ** k

    def weight(motif):
        k = motif_hamming_distance(ringer, motif)  # mutations from ringer
        return 4 ** k

    # ------------------------------------------------------------------
    # Matrix-based ("fanciful") proposal: the one actually plotted.
    # ------------------------------------------------------------------
    matrix_probs = [normalize([exp(-ep) for ep in row]) for row in matrix]

    def prop_fanciful(motif):
        return [
            "".join(
                [inverse_cdf_sample("ACGT", probs) for probs in matrix_probs])
            for _ in range(n)
        ]

    def log_dprop_fanciful(motif, _):
        return sum(
            log(matrix_probs[i]["ACGT".index(b)]) for site in motif
            for i, b in enumerate(site))

    motifs = [prop_fanciful(None) for _ in trange(iterations)]
    log_fs = np.array([log_f(motif) for motif in tqdm(motifs)])
    log_dprops = np.array(
        [log_dprop_fanciful(motif, None) for motif in tqdm(motifs)])

    # Subtract the largest log weight before exponentiating: the shift
    # cancels in the normalisation but keeps exp() from overflowing.
    log_ws = log_fs - log_dprops
    ws = np.exp(log_ws - np.max(log_ws))
    ws = ws / np.sum(ws)

    rhos = np.array(
        [motif_hamming_distance(motif, ringer) for motif in motifs])
    print("mean log_f: %s" % ws.dot(log_fs))
    print("mean rho: %s" % ws.dot(rhos))

    plt.subplot(2, 3, 1)
    plt.title("log fitness vs proposal")
    plt.scatter(log_fs, log_dprops)
    # Endpoints of the y = x guide must cover both arrays.  ``+`` on numpy
    # arrays adds elementwise, so take the extremes of each separately.
    minval = min(log_fs.min(), log_dprops.min())
    maxval = max(log_fs.max(), log_dprops.max())
    plt.plot([minval, maxval], [minval, maxval], linestyle='--')

    plt.subplot(2, 3, 2)
    plt.title("ringer dstance vs weight")
    plt.scatter(rhos, np.log10(ws))

    plt.subplot(2, 3, 3)
    plt.title("log fitness vs wieght")
    plt.scatter(log_fs, np.log10(ws))

    plt.subplot(2, 3, 5)
    plt.scatter(rhos, log_fs)

    plt.subplot(2, 3, 6)
    plt.scatter(log_fs, np.log10(ws))