def validate_sample_motif_neglect_fg2(iterations=50000):
    """compare fg_neglect sampling to MCMC"""
    bio_motif = Escherichia_coli.LexA
    n = len(bio_motif)
    L = len(bio_motif[0])
    matrix = [[-ep for ep in row] for row in make_pssm(bio_motif)]
    ringer = ringer_motif(matrix,n)
    Ne = 2.375 
    random_motifs = [sample_motif_neglect_fg(matrix,n,Ne) for i in trange(iterations)]
    random_rhos = [motif_hamming_distance(ringer,motif) for motif in tqdm(random_motifs)]
    random_log_fs = [log_fitness(matrix,motif,G) for motif in tqdm(random_motifs)]
    random_ics = [motif_ic(motif) for motif in random_motifs]
    _, chain = sella_hirsch_mh(matrix=matrix,init="ringer",Ne=Ne,n=n,iterations=iterations)
    chain_rhos = [motif_hamming_distance(ringer,motif) for motif in tqdm(chain)]
    chain_log_fs = [log_fitness(matrix,motif,G) for motif in tqdm(chain)]
    chain_ics = [motif_ic(motif) for motif in chain]
    plt.subplot(1,2,1)
    plt.scatter(random_rhos,random_log_fs)
    plt.scatter(chain_rhos,chain_log_fs,color='g')
    plt.xlabel("rho")
    plt.ylabel("log fitness")
    plt.subplot(1,2,2)
    plt.scatter(random_rhos,random_ics)
    plt.scatter(chain_rhos,chain_ics,color='g')
    plt.xlabel("rho")
    plt.ylabel("IC")
def validate_sample_motif_neglect_fg():
    """compare fg_neglect sampling to random mutation: indeed shows better fitness at given rho"""
    bio_motif = Escherichia_coli.LexA
    n = len(bio_motif)
    L = len(bio_motif[0])
    matrix = [[-ep for ep in row] for row in make_pssm(bio_motif)]
    ringer = ringer_motif(matrix,n)
    random_motifs = [mutate_motif_k_times(ringer,k) for k in range(n*L)]
    random_motifs2 = [sample_motif_neglect_fg(matrix,n,Ne) for Ne in np.linspace(1,10,n*L)]
    random_rhos = [motif_hamming_distance(ringer,motif) for motif in random_motifs]
    random_log_fs = [log_fitness(matrix,motif,G) for motif in random_motifs]
    random_rhos2 = [motif_hamming_distance(ringer,motif) for motif in random_motifs2]
    random_log_fs2 = [log_fitness(matrix,motif,G) for motif in random_motifs2]
    plt.plot(random_rhos,random_log_fs)
    plt.plot(random_rhos2,random_log_fs2)
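# validate_sample_motif_neglect_fg (and rho_fitness_plot below) assume that
# mutate_motif_k_times(motif, k) returns a copy of the motif with k positions
# changed to different bases.  A minimal sketch of that contract, assuming k
# distinct positions and sites stored as strings over "ACGT"; the project's
# helper may pick positions differently.  Relies on the module's random import.
def mutate_motif_k_times_sketch(motif, k):
    sites = [list(site) for site in motif]
    L = len(sites[0])
    for pos in random.sample(range(len(sites) * L), k):  # k distinct positions
        i, j = divmod(pos, L)
        sites[i][j] = random.choice([b for b in "ACGT" if b != sites[i][j]])
    return ["".join(site) for site in sites]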
def entropy_drift_analysis(sigma=2, color='b', color_p='g'):
    """why is convergence so difficult to obtain for, say, sigma = 2?  Explore selection/mutation balance."""
    n = 16
    L = 16
    matrix = sample_matrix(L, sigma)
    ringer = ringer_motif(matrix, n)
    mutants = [
        iterate(mutate_motif, ringer, i) for i in trange(256)
        for j in range(10)
    ]
    dists = [
        motif_hamming_distance(ringer, mutant) for mutant in tqdm(mutants)
    ]
    fs = [log_fitness(matrix, mutant, G) for mutant in tqdm(mutants)]
    fps = []
    trials = 100
    for mutant in tqdm(mutants):
        nexts = []
        f = log_fitness(matrix, mutant, G)
        for i in range(trials):
            mutant_p = mutate_motif(mutant)
            fp = log_fitness(matrix, mutant_p, G)
            if log(random.random()) < fp - f:
                nexts.append(fp)
            else:
                nexts.append(f)
        fps.append(mean(nexts))
    plt.subplot(3, 1, 1)
    plt.scatter(dists, fs, color=color, marker='.')
    plt.scatter(dists, fps, color=color_p, marker='.')
    #plt.semilogy()
    plt.subplot(3, 1, 2)
    plt.scatter(dists, [(f - fp) / f for (f, fp) in zip(fs, fps)],
                color=color,
                marker='.')
    plt.plot([0, max(dists)], [0, 0], linestyle='--', color='black')
    plt.subplot(3, 1, 3)
    diffs = [fp - f for f, fp in zip(fs, fps)]
    plt.scatter(fs, diffs, marker='.', color=color)
    interpolant = poly1d(polyfit(fs, diffs, 1))
    plt.plot(*pl(interpolant, [min(fs), max(fs)]))
    plt.plot([min(fs), max(fs)], [0, 0], linestyle='--', color='black')
    minx, maxx = min(fs + fps), max(fs + fps)  # overall log-fitness range (currently unused)
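# The inner loop of entropy_drift_analysis estimates the expected log fitness
# after one mutation/selection step by sampling: a proposal is kept whenever
# log(u) < log f' - log f, i.e. with probability min(1, f'/f).  The sketch
# below computes the same quantity in closed form from a pool of proposed
# log-fitness values, averaging over acceptance instead of sampling it
# (illustrative only; not a helper this file otherwise defines).
def expected_next_log_f_sketch(log_f, proposed_log_fs):
    total = 0.0
    for log_fp in proposed_log_fs:
        p_accept = min(1.0, exp(log_fp - log_f))
        total += p_accept * log_fp + (1 - p_accept) * log_f
    return total / len(proposed_log_fs)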
def plot_matrix_chain_ringer_distance(matrix_chain):
    matrix, chain = matrix_chain
    n = len(chain[0])
    ringer = ringer_motif(matrix, n)
    plt.plot([motif_hamming_distance(motif, ringer) for motif in tqdm(chain)])
def plot_matrix_chain_distance_from_first(matrix_chain):
    matrix, chain = matrix_chain
    init = chain[0]
    plt.plot([motif_hamming_distance(m, init) for m in tqdm(chain)])
def rho_fitness_plot(matrix_chain):
    matrix, chain = matrix_chain
    n = len(chain[0])
    L = len(chain[0][0])
    N = n * L
    ringer = ringer_motif(matrix, n)
    print "fs"
    fs = [log10(fitness(matrix, motif, G)) for motif in tqdm(chain)]
    print "rhos"
    rhos = [motif_hamming_distance(ringer, motif) for motif in tqdm(chain)]
    print "ics"
    ics = [motif_ic(motif) for motif in tqdm(chain)]
    plt.xlabel("Distance from ringer")
    plt.ylabel("fitness")
    print "perturbations"
    perturbations = [
        mutate_motif_k_times(ringer, random.randrange(N)) for _ in tqdm(chain)
    ]
    print "fs"
    perturb_fs = [
        log10(fitness(matrix, motif, G)) for motif in tqdm(perturbations)
    ]
    print "rhos"
    perturb_rhos = [
        motif_hamming_distance(ringer, motif) for motif in tqdm(perturbations)
    ]
    print "ics"
    perturb_ics = [motif_ic(motif) for motif in tqdm(perturbations)]
    print "log odds"
    log_odds = [
        sample_log_odds(matrix, n, lamb)
        for lamb in tqdm(np.linspace(0.5, 5, 10000))
    ]
    print "fs"
    log_odds_fs = [
        log10(fitness(matrix, motif, G)) for motif in tqdm(log_odds)
    ]
    print "rhos"
    log_odds_rhos = [
        motif_hamming_distance(ringer, motif) for motif in tqdm(log_odds)
    ]
    print "ics"
    log_odds_ics = [motif_ic(motif) for motif in tqdm(log_odds)]
    plt.subplot(1, 3, 1)
    plt.scatter(perturb_rhos, perturb_fs, color='g')
    plt.scatter(log_odds_rhos, log_odds_fs, color='r')
    plt.scatter(rhos, fs, color='b')
    plt.xlabel("rho")
    plt.ylabel("log f")
    plt.subplot(1, 3, 2)
    plt.scatter(perturb_rhos, perturb_ics, color='g')
    plt.scatter(log_odds_rhos, log_odds_ics, color='r')
    plt.scatter(rhos, ics, color='b')
    plt.xlabel("rho")
    plt.ylabel("IC")
    plt.subplot(1, 3, 3)
    plt.scatter(perturb_ics, perturb_fs, color='g')
    plt.scatter(log_odds_ics, log_odds_fs, color='r')
    plt.scatter(ics, fs, color='b')
    plt.xlabel("IC")
    plt.ylabel("fs")
def plot_log_fs_vs_log_dprops(Ne=5,
                              n=16,
                              L=16,
                              G=5 * 10**6,
                              sigma=1,
                              matrix=None,
                              x0=None,
                              iterations=50000,
                              lamb=1):
    if matrix is None:
        matrix = sample_matrix(L, sigma)
    nu = Ne - 1
    N = n * L
    ringer = ringer_motif(matrix, n)
    ps = normalize([exp(-lamb * i) for i in range(N)])

    #ps = [1.0/N]*N
    def log_f(motif):
        return log_fitness(matrix, motif, G)

    def prop(motif):
        k = inverse_cdf_sample(range(N), ps)
        motif_p = mutate_motif_k_times(ringer, k)
        return motif_p

    def log_dprop(motif, _):
        k = motif_hamming_distance(ringer,
                                   motif)  # number of mutations from ringer
        # earlier version, log(ps[k]) + k * log(1/3.0), omitted the 1/choose(N, k) factor
        # p(k)*(1/choose(N,k)) * (1/3.0)**k
        return log(ps[k]) - log_choose(N, k) + k * log(1.0 / 3)

    def weight(motif):
        k = motif_hamming_distance(ringer,
                                   motif)  # number of mutations from ringer
        return 4**k

    matrix_probs = [normalize([exp(-ep) for ep in row]) for row in matrix]

    def prop_fanciful(motif):
        return [
            "".join(
                [inverse_cdf_sample("ACGT", probs) for probs in matrix_probs])
            for i in xrange(n)
        ]

    def dprop(motif, _):
        k = motif_hamming_distance(ringer,
                                   motif)  # number of mutations from ringer
        #return choose_reference(N,k) * p**k * (1-p)**(N-k) * (1/3.0)**k
        #return 1.0/N * (1/3.0)**k
        return ps[k] / choose(N, k) * (1 / 3.0)**k  # consistent with log_dprop above: ps[k]/(choose(N,k)*3**k)

    def log_dprop_fanciful(motif, _):
        return sum(
            log(matrix_probs[i]["ACGT".index(b)]) for site in motif
            for i, b in enumerate(site))

    motifs = [prop_fanciful(None) for i in trange(iterations)]
    log_fs = np.array([log_f(motif) for motif in tqdm(motifs)])
    log_dprops = np.array(
        [log_dprop_fanciful(motif, None) for motif in tqdm(motifs)])
    ws = np.exp(log_fs - log_dprops)
    ws = ws / np.sum(ws)
    rhos = np.array(
        [motif_hamming_distance(motif, ringer) for motif in motifs])
    print "mean log_f:", ws.dot(log_fs)
    print "mean rho:", ws.dot(rhos)
    plt.subplot(2, 3, 1)
    plt.title("log fitness vs proposal")
    plt.scatter(log_fs, log_dprops)
    minval, maxval = min(log_fs.min(), log_dprops.min()), max(log_fs.max(), log_dprops.max())
    plt.plot([minval, maxval], [minval, maxval], linestyle='--')
    plt.subplot(2, 3, 2)
    #plt.scatter(exp,log_fs,rhos)
    plt.title("ringer dstance vs weight")
    plt.scatter(rhos, np.log10(ws))
    plt.subplot(2, 3, 3)
    #plt.scatter(exp,log_fs,rhos)
    plt.title("log fitness vs wieght")
    plt.scatter(log_fs, np.log10(ws))
    plt.subplot(2, 3, 5)
    #plt.scatter(exp,log_fs,rhos)
    plt.scatter(rhos, log_fs)
    plt.subplot(2, 3, 6)
    #plt.scatter(exp,log_fs,rhos)
    plt.scatter(log_fs, np.log10(ws))
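# Consistency sketch for the two proposal-density implementations in
# plot_log_fs_vs_log_dprops: dprop should equal exp(log_dprop), since both
# encode ps[k] / (choose(N, k) * 3**k) for a motif at Hamming distance k from
# the ringer.  A small numerical check of that identity over k, assuming
# choose, log_choose and normalize are the same helpers this file already uses.
def check_dprop_consistency_sketch(N=16, lamb=1.0):
    ps = normalize([exp(-lamb * k) for k in range(N)])
    for k in range(N):
        direct = ps[k] / choose(N, k) * (1 / 3.0)**k
        via_log = exp(log(ps[k]) - log_choose(N, k) + k * log(1 / 3.0))
        if abs(direct - via_log) > 1e-9 * direct:
            return False
    return True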