def sample_motif_cftp_param_study():
    """Examine dependence of IC on sigma, Ne.

    Evaluates the mean motif IC (over `trials` CFTP-sampled motifs of n = 20
    sites and width L = 10) on a 10 x 10 grid of sigma in [0.5, 10] and Ne in
    [1, 10], and draws it as a filled contour plot with a colorbar.  For each
    biological motif, an Ne is then looked up with bisect_interval on [0, 20]
    against the interpolated IC surface, and the (sigma, Ne) pairs are
    scattered on top of the contour plot.

    Draws on the current matplotlib figure and returns None.
    """
    grid_points = 10
    sigmas = np.linspace(0.5, 10, grid_points)
    Nes = np.linspace(1, 10, grid_points)
    trials = 3
    n = 20
    L = 10

    def f(sigma, Ne):
        # sample_matrix is called anew on every evaluation, so each trial
        # at a grid point uses its own matrix (and its own mu).
        matrix = sample_matrix(L, sigma)
        mu = approx_mu(matrix, 10 * n)
        return motif_ic(sample_motif_cftp(matrix, mu, Ne, n))

    # Rows are indexed by Ne and columns by sigma, which is the layout
    # contourf(x, y, Z) expects for x = sigmas, y = Nes.  The trials are
    # collected in a list so that mean() receives a sized sequence rather
    # than a one-shot generator.
    ics = [[mean([f(sigma, Ne) for _ in range(trials)])
            for sigma in sigmas] for Ne in tqdm(Nes, desc="ic grid")]
    plt.contourf(sigmas, Nes, ics)
    plt.colorbar()
    # NOTE(review): bio_motifs is not bound inside this function; it is
    # presumably a module-level list of biological motifs (the disabled
    # line below shows one way to build it) -- confirm before running.
    #bio_motifs = [getattr(Escherichia_coli,tf) for tf in Escherichia_coli.tfs]
    bio_sigmas = [sigma_from_matrix(pssm_from_motif(motif, pc=1))
                  for motif in bio_motifs]
    bio_ics = [motif_ic(motif) for motif in bio_motifs]
    interp = interp2d(sigmas, Nes, ics)
    # The lambda is consumed by bisect_interval within the same iteration,
    # so closing over the loop variables is safe here.
    bio_Nes = [bisect_interval(lambda Ne: interp(show(bio_sigma), Ne) - bio_ic, 0, 20)
               for bio_sigma, bio_ic in zip(bio_sigmas, bio_ics)]
    # Overlay the biological motifs in the same (sigma, Ne) coordinates as
    # the contour plot (the original call referenced an undefined name).
    plt.scatter(bio_sigmas, bio_Nes)
def spoof_motifs(motif,
                 num_motifs=10,
                 trials=1,
                 sigma=None,
                 Ne_tol=10**-4,
                 double_sigma=True):
    N = len(motif)
    L = len(motif[0])
    copies = 10 * N
    if sigma is None:
        sigma = sigma_from_matrix(pssm_from_motif(motif, pc=1))
    epsilon = (1 + double_sigma) * sigma  # 15 Jan 2016
    print "sigma:", sigma
    bio_ic = motif_ic(motif)

    def f(Ne):
        ps = ps_from_copies(sigma, Ne, L, copies)
        motifs = [
            sample_motif(epsilon, Ne, L, copies, n, ps=ps)
            for i in range(trials)
        ]
        return mean(map(motif_ic, motifs)) - bio_ic

    Ne = log_regress_spec2(f, [1, 10], tol=10**-3)
    return [sample_motif(sigma, Ne, L, copies, n) for _ in range(num_motifs)]
def spoof_motif_cftp(motif, num_motifs=10, trials=1, sigma=None,Ne_tol=10**-2,verbose=False):
    n = len(motif)
    L = len(motif[0])
    copies = 10*n
    if sigma is None: sigma = sigma_from_matrix(pssm_from_motif(motif,pc=1))
    print "sigma:", sigma
    bio_ic = motif_ic(motif)
    matrix = sample_matrix(L, sigma)
    mu = approx_mu(matrix, copies=10*n, G=5*10**6)
    print "mu:", mu
    def f(Ne):
        motifs = [sample_motif_cftp(matrix, mu, Ne, n, verbose=verbose)
                  for i in trange(trials)]
        return mean(map(motif_ic,motifs)) - bio_ic
    # lb = 1
    # ub = 10
    # while f(ub) < 0:
    #     ub *= 2
    #     print ub
    x0s = [2,10]#(lb + ub)/2.0
    # print "choosing starting seed for Ne"
    # fs = map(lambda x:abs(f(x)),x0s)
    # print "starting values:",x0s,fs
    # x0 = x0s[argmin(fs)]
    # print "chose:",x0
    # Ne = bisect_interval_noisy_ref(f,x0,lb=1,verbose=True)
    Ne = log_regress_spec2(f,x0s,tol=Ne_tol)
    print "Ne:",Ne
    return [sample_motif_cftp(matrix, mu, Ne, n) for _ in trange(num_motifs)]
def spoof_motif_ref(motif,
                    num_motifs=10,
                    trials=10,
                    sigma=None,
                    Ne_tol=10**-4):
    n = len(motif)
    L = len(motif[0])
    copies = 10 * n
    if sigma is None:
        sigma = sigma_from_matrix(pssm_from_motif(motif, pc=1))
    print "sigma:", sigma
    bio_ic = motif_ic(motif)

    def f(Ne):
        ps = ps_from_copies(sigma, Ne, L, copies)
        motifs = [
            sample_motif(sigma, Ne, L, copies, n, ps=ps) for i in range(trials)
        ]
        return mean(map(motif_ic, motifs)) - bio_ic

    lb = 1
    ub = 2
    while f(ub) < 0:
        ub *= 2
    ub *= 2  # once more for good measure
    x0 = (lb + ub) / 2.0
    print "Ne guess:", x0
    Nes = [
        bisect_interval_noisy(f, x0=x0, tolerance=Ne_tol, lb=1)
        for i in range(3)
    ]
    Ne = mean(Nes)
    print "Nes:", Nes, Ne
    return [sample_motif(sigma, Ne, L, copies, n) for _ in range(num_motifs)]
def resample_from_post_chain(chain, N):
    """Resample motifs from a posterior chain of (mat, mu, Ne) triples.

    Each matrix is reduced to its sigma, a new matrix of the same length
    is sampled from that sigma, and a motif with N sites is sampled from
    the new matrix using the triple's own mu and Ne:
    mat -> sigma -> mat' -> motif'

    Conclusion: heavily underestimates IC.
    """
    width = len(chain[0][0])
    # Stage 1: one sigma per chain entry.
    sigmas = []
    for mat, _mu, _Ne in chain:
        sigmas.append(sigma_from_matrix(mat))
    # Stage 2: all replacement matrices are drawn before any motif is.
    resampled = []
    for sigma in sigmas:
        resampled.append(sample_matrix(width, sigma))
    # Stage 3: one motif per (replacement matrix, original mu, original Ne).
    motifs = []
    for new_mat, (_, mu, Ne) in tqdm(zip(resampled, chain)):
        motifs.append(sample_motif_cftp(new_mat, mu, Ne, N))
    return motifs
# Example #6
# 0
def resample_from_post_chain(chain, N):
    """Draw one fresh motif per entry of a chain of (mat, mu, Ne) triples.

    Pipeline per entry: mat -> sigma -> mat' -> motif', where mat' is a
    newly sampled matrix with the sigma of mat and motif' has N sites
    sampled from mat' with the entry's mu and Ne.

    Conclusion: heavily underestimates IC.
    """
    first_mat = chain[0][0]
    L = len(first_mat)
    sigmas = [sigma_from_matrix(mat) for (mat, mu, Ne) in chain]
    # Every replacement matrix is sampled before the first motif is drawn.
    new_matrices = [sample_matrix(L, s) for s in sigmas]
    paired = zip(new_matrices, chain)
    return [
        sample_motif_cftp(new_mat, mu, Ne, N)
        for new_mat, (_, mu, Ne) in tqdm(paired)
    ]
def spoof_motifs(motif, num_motifs=10, trials=1, sigma=None,Ne_tol=10**-4,double_sigma=True):
    N = len(motif)
    L = len(motif[0])
    copies = 10*N
    if sigma is None:
        sigma = sigma_from_matrix(pssm_from_motif(motif,pc=1))
    epsilon = (1+double_sigma)*sigma # 15 Jan 2016
    print "sigma:", sigma
    bio_ic = motif_ic(motif)
    def f(Ne):
        ps = ps_from_copies(sigma, Ne, L, copies)
        motifs = [sample_motif(epsilon, Ne, L, copies, n,ps=ps)
                  for i in range(trials)]
        return mean(map(motif_ic,motifs)) - bio_ic
    Ne = log_regress_spec2(f,[1,10],tol=10**-3)
    return [sample_motif(sigma, Ne, L, copies, n) for _ in range(num_motifs)]
def spoof_motifs_occ(motif, num_motifs=10, trials=1, sigma=None,Ne_tol=10**-4,double_sigma=True):
    N = len(motif)
    L = len(motif[0])
    copies = 10*N
    if sigma is None:
        sigma = sigma_from_matrix(pssm_from_motif(motif,pc=1))
    epsilon = (1+double_sigma)*sigma # 15 Jan 2016
    print "sigma:", sigma
    #bio_ic = motif_ic(motif)
    mat = matrix_from_motif(motif)
    eps = [score_seq(mat, site) for site in motif]
    mu = gle_approx_mu(mat, copies)
    bio_occ = mean([1/(1+exp(ep-mu)) for ep in eps])
    def f(Ne):
        return expected_occupancy(epsilon, Ne, L, copies) - bio_occ
    Ne = log_regress_spec2(f,[1,10],tol=10**-3)
    return [sample_motif(sigma, Ne, L, copies, N) for _ in range(num_motifs)]
def analyze_bio_motifs(Nes,trials=20):
    results = {}
    for tf_idx,tf in enumerate(Escherichia_coli.tfs):
        Ne = Nes[tf]
        bio_motif = getattr(Escherichia_coli,tf)
        n,L = len(bio_motif),len(bio_motif[0])
        bio_matrix = matrix_from_motif(bio_motif)
        sigma = sigma_from_matrix(bio_matrix)
        matrix_chains = [sella_hirsch_mh(n=n,L=L,sigma=sigma,Ne=Ne,init='ringer') for i in range(trials)]
        ics = [mean(map(motif_ic,chain[-1000:])) for (matrix,chain) in matrix_chains]
        ginis = [mean(map(motif_gini,chain[-1000:])) for (matrix,chain) in matrix_chains]
        mis = [mean(map(total_motif_mi,chain[-1000:])) for (matrix,chain) in matrix_chains]
        print "results for:",tf,tf_idx
        print motif_ic(bio_motif),mean(ics),sd(ics)
        print motif_gini(bio_motif),mean(ginis),sd(ginis)
        print total_motif_mi(bio_motif),mean(mis),sd(mis)
        results[tf] = (mean(ics),sd(ics),mean(ginis),sd(ginis),mean(mis),sd(mis))
    return results
def spoof_motif_ar(motif, num_motifs=10, trials=1, sigma=None,Ne_tol=10**-4):
    n = len(motif)
    L = len(motif[0])
    copies = 10*n
    if sigma is None:
        sigma = sigma_from_matrix(pssm_from_motif(motif,pc=1))
    print "sigma:", sigma
    bio_ic = motif_ic(motif)
    matrix = sample_matrix(L, sigma)
    mu = approx_mu(matrix, copies=10*n, G=5*10**6)
    print "mu:", mu
    def f(Ne):
        motifs = [sample_motif_ar(matrix, mu, Ne, n)
                  for i in trange(trials)]
        return mean(map(motif_ic,motifs)) - bio_ic
    x0 = 2
    print "Ne guess:", x0
    Ne = bisect_interval_noisy(f,x0=x0,iterations=100,lb=1, verbose=False,w=0.5)
    print "Ne:",Ne
    return [sample_motif_ar(matrix, mu, Ne, n) for _ in trange(num_motifs)]
# Example #11
# 0
def main_experiment(samples=30, iterations=10000, delta_ic=0.1):
    results_dict = {}
    for tf_idx, tf in enumerate(tfdf.tfs):
        print "starting on:", tf
        motif = getattr(tfdf, tf)
        if motif_ic(motif) < 5:
            print "excluding", tf, "for low IC"
            continue
        bio_ic = motif_ic(motif)
        n = len(motif)
        L = len(motif[0])
        matrix = matrix_from_motif(motif)
        sigma = sigma_from_matrix(matrix)
        mu = approximate_mu(matrix, n, G)
        Ne = estimate_Ne(matrix, mu, n, bio_ic)
        spoofs = []
        ar = 0
        spoof_trials = 0.0
        while len(spoofs) < samples:
            spoof_trials += 1
            matrix, chain = sella_hirsch_mh(Ne=Ne,
                                            mu=mu,
                                            n=1,
                                            matrix=sample_matrix(L, sigma),
                                            init='ringer',
                                            iterations=iterations)
            spoof_motif = concat(
                [random.choice(chain[iterations / 2:]) for i in range(n)])
            if abs(motif_ic(spoof_motif) - bio_ic) < delta_ic:
                spoofs.append(spoof_motif)
                ar += 1
            print "spoof acceptance rate:", ar / spoof_trials, len(
                spoofs), samples, spoof_trials
        #spoofs = [chain[-1] for (spoof_matrix,chain,Ne) in [spoof_motif(motif,Ne) for i in range(samples)]]
        results_dict[tf] = {
            fname: map(eval(fname), spoofs)
            for fname in "motif_ic motif_gini total_motif_mi".split()
        }
        print "finished:", tf, "(%s/%s)" % (tf_idx, len(tfdf.tfs))
        print bio_ic, mean_ci(results_dict[tf]['motif_ic'])
    return results_dict
# Example #12
# 0
def spoof_motif(motif, Ne=None, iterations=10000):
    matrix = matrix_from_motif(motif)
    L = len(motif[0])
    n = len(motif)
    sigma = sigma_from_matrix(matrix)
    spoof_matrix = sample_matrix(L, sigma)
    bio_ic = motif_ic(motif)
    # this method of reading site_mu, site_sigma off of motif is slightly suspect...
    site_mu = site_mu_from_matrix(matrix_from_motif(motif))
    site_sigma = site_sigma_from_matrix(matrix_from_motif(motif))
    # now need to find mu, nu
    n = len(motif)
    assumed_copies = 10 * n
    mu = approximate_mu(matrix, assumed_copies, G)
    spoof_mu = approximate_mu(spoof_matrix, assumed_copies, G)
    if Ne is None:
        Ne = estimate_Ne(spoof_matrix, spoof_mu, n, bio_ic)
        print "chose Ne:", Ne
    spoof_matrix, chain = sella_hirsch_mh(Ne=Ne,
                                          matrix=spoof_matrix,
                                          mu=mu,
                                          n=n)
    return spoof_matrix, chain, Ne
def spoof_motif_ref(motif, num_motifs=10, trials=10, sigma=None,Ne_tol=10**-4):
    n = len(motif)
    L = len(motif[0])
    copies = 10*n
    if sigma is None:
        sigma = sigma_from_matrix(pssm_from_motif(motif,pc=1))
    print "sigma:", sigma
    bio_ic = motif_ic(motif)
    def f(Ne):
        ps = ps_from_copies(sigma, Ne, L, copies)
        motifs = [sample_motif(sigma, Ne, L, copies, n,ps=ps)
                  for i in range(trials)]
        return mean(map(motif_ic,motifs)) - bio_ic
    lb = 1
    ub = 2
    while f(ub) < 0:
        ub *= 2
    ub *= 2 # once more for good measure
    x0 = (lb + ub)/2.0
    print "Ne guess:", x0
    Nes = [bisect_interval_noisy(f,x0=x0,tolerance=Ne_tol,lb=1) for i in range(3)]
    Ne = mean(Nes)
    print "Nes:",Nes,Ne
    return [sample_motif(sigma, Ne, L, copies, n) for _ in range(num_motifs)]
def spoof_motifs_occ(motif,
                     num_motifs=10,
                     trials=1,
                     sigma=None,
                     Ne_tol=10**-4,
                     double_sigma=True):
    N = len(motif)
    L = len(motif[0])
    copies = 10 * N
    if sigma is None:
        sigma = sigma_from_matrix(pssm_from_motif(motif, pc=1))
    epsilon = (1 + double_sigma) * sigma  # 15 Jan 2016
    print "sigma:", sigma
    #bio_ic = motif_ic(motif)
    mat = matrix_from_motif(motif)
    eps = [score_seq(mat, site) for site in motif]
    mu = gle_approx_mu(mat, copies)
    bio_occ = mean([1 / (1 + exp(ep - mu)) for ep in eps])

    def f(Ne):
        return expected_occupancy(epsilon, Ne, L, copies) - bio_occ

    Ne = log_regress_spec2(f, [1, 10], tol=10**-3)
    return [sample_motif(sigma, Ne, L, copies, N) for _ in range(num_motifs)]