def sigma_Ne_contour_plot(filename=None):
    sigmas = np.linspace(0,5,20)
    Nes = np.linspace(1,20,20)
    L = 10
    n = 50
    copies = 10*n
    trials = 100
    motifss = [[[(sample_motif(sigma, Ne, L, copies, n))
               for i in range(trials)]
          for sigma in sigmas] for Ne in tqdm(Nes)]
    occ_M = [[expected_occupancy(sigma, Ne, L, copies)
          for sigma in sigmas] for Ne in tqdm(Nes)]
    print "ic_M"
    ic_M = mmap(lambda ms:mean(map(motif_ic,ms)),motifss)
    print "gini_M"
    gini_M = mmap(lambda ms:mean(map(motif_gini,ms)),motifss)
    print "mi_M"
    mi_M = mmap(lambda ms:mean(map(total_motif_mi,ms)),tqdm(motifss))
    plt.subplot(2,2,1)
    plt.contourf(sigmas,Nes,occ_M,cmap='jet')
    plt.colorbar()
    plt.subplot(2,2,2)
    plt.contourf(sigmas,Nes,ic_M,cmap='jet')
    plt.colorbar()
    plt.subplot(2,2,3)
    plt.contourf(sigmas,Nes,gini_M,cmap='jet')
    plt.colorbar()
    plt.subplot(2,2,4)
    plt.contourf(sigmas,Nes,mi_M,cmap='jet')
    plt.colorbar()
    maybesave(filename)
def sigma_Ne_contour_plot(filename=None):
    sigmas = np.linspace(0, 5, 20)
    Nes = np.linspace(1, 20, 20)
    L = 10
    n = 50
    copies = 10 * n
    trials = 100
    motifss = [[[(sample_motif(sigma, Ne, L, copies, n))
                 for i in range(trials)] for sigma in sigmas]
               for Ne in tqdm(Nes)]
    occ_M = [[expected_occupancy(sigma, Ne, L, copies) for sigma in sigmas]
             for Ne in tqdm(Nes)]
    print "ic_M"
    ic_M = mmap(lambda ms: mean(map(motif_ic, ms)), motifss)
    print "gini_M"
    gini_M = mmap(lambda ms: mean(map(motif_gini, ms)), motifss)
    print "mi_M"
    mi_M = mmap(lambda ms: mean(map(total_motif_mi, ms)), tqdm(motifss))
    plt.subplot(2, 2, 1)
    plt.contourf(sigmas, Nes, occ_M, cmap='jet')
    plt.colorbar()
    plt.subplot(2, 2, 2)
    plt.contourf(sigmas, Nes, ic_M, cmap='jet')
    plt.colorbar()
    plt.subplot(2, 2, 3)
    plt.contourf(sigmas, Nes, gini_M, cmap='jet')
    plt.colorbar()
    plt.subplot(2, 2, 4)
    plt.contourf(sigmas, Nes, mi_M, cmap='jet')
    plt.colorbar()
    maybesave(filename)
Ejemplo n.º 3
0
def on_off_experiment2(num_motifs=100,
                       filename="gini-vs-mi-correlation-in-on-off-spoofs.pdf"):
    """Compare MI vs. Gini on on-off spoofs of the biological motifs.

    For every motif listed in ``tfdf.tfs``, ``spoof_on_off_motif`` is called
    with ``num_motifs`` and ``trials=1``.  ``motif_gini`` and
    ``total_motif_mi`` are evaluated on each resulting spoof, and the two
    statistics are correlated (Pearson) within each biological motif's set
    of spoofs.  The (correlation, p-value) pairs are scatter-plotted with a
    log-scaled y axis, together with a dashed horizontal line at
    ``fdr(ps)``.

    Args:
        num_motifs: forwarded to ``spoof_on_off_motif``.
        filename: forwarded to ``maybesave`` once the figure is complete.
    """
    bio_motifs = [getattr(tfdf, tf) for tf in tfdf.tfs]
    spoofses = [spoof_on_off_motif(motif, num_motifs=num_motifs, trials=1)
                for motif in bio_motifs]
    spoof_ginises = mmap(motif_gini, tqdm(spoofses))
    spoof_mises = mmap(total_motif_mi, tqdm(spoofses))

    cors, ps = [], []
    # Correlate the per-spoof statistics computed above; the original loop
    # zipped two names that were never defined.
    for ginis, mis in zip(spoof_ginises, spoof_mises):
        cor, p = pearsonr(ginis, mis)
        cors.append(cor)
        ps.append(p)
    q = fdr(ps)

    # scatter() has no `filename` argument; saving is left to maybesave().
    plt.scatter(cors, ps)
    plt.plot([-1, 1], [q, q], linestyle='--',
             label="FDR-Adjusted Significance Level")
    plt.semilogy()
    plt.legend()
    plt.xlabel("Pearson Correlation Coefficient")
    plt.ylabel("P value")
    plt.xlim([-1, 1])
    plt.ylim([10**-4, 2])
    plt.title("Gini-MI Correlation Coefficient vs. P-value for On-Off Simulations from Prokaryotic Motifs")
    maybesave(filename)
def bio_detector_experiment(filename=None):
    """Use high Gini to detect biological motifs.

    Positives: each motif in ``bio_motifs`` (read from the enclosing scope)
    has its ``motif_gini`` ranked with ``percentile`` against the Ginis of
    100 spoofs produced by ``spoof_motifs_maxent``.

    Negative controls: the first spoof of every biological motif is put
    through the same procedure against a fresh set of 100 spoofs of its own.

    Both lists of percentiles are passed to ``roc_curve``; the axes are
    labelled FPR/TPR and the figure is handed to ``maybesave(filename)``.
    """
    spoofs_per_motif = 100

    def spoof_each(motifs):
        # One batch of spoofs per input motif, with a progress bar.
        return [spoof_motifs_maxent(motif, num_motifs=spoofs_per_motif)
                for motif in tqdm(motifs)]

    bio_ginis = map(motif_gini, bio_motifs)
    maxent_spoofs = spoof_each(bio_motifs)
    maxent_ginis = mmap(motif_gini, maxent_spoofs)
    ps = zipWith(percentile, bio_ginis, maxent_ginis)

    neg_controls = map(first, maxent_spoofs)
    neg_control_spoofs = spoof_each(neg_controls)
    control_ginis = map(motif_gini, neg_controls)
    control_spoof_ginis = mmap(motif_gini, neg_control_spoofs)
    nc_ps = zipWith(percentile, control_ginis, control_spoof_ginis)

    roc_curve(ps, nc_ps)
    for set_label, text in ((plt.xlabel, "FPR"), (plt.ylabel, "TPR")):
        set_label(text, fontsize='large')
    maybesave(filename)
def visualize_stationary_sum(matrix,n,Ne,T,samples_per_bin=100):
    L = len(matrix)
    nu = Ne - 1
    ringer = ringer_motif(matrix,n)
    motifss = [[mutate_motif_k_times(ringer,k) for i in range(samples_per_bin)] for k in trange(n*L)]
    log_fss = mmap(lambda motif:log_fitness(matrix,motif,G),tqdm(motifss))
    Tss = mmap(T,tqdm(motifss))
    log_ws = [log_rho_weight(rho,n,L) for rho in range(n*L)]
    terms = [mean(exp(nu*log_f + log_w)*T for log_f,T in zip(log_fs,Ts))
             for log_w,log_fs,Ts in zip(log_ws,log_fss,Tss)]
    Z = sum([mean(exp(nu*log_f + log_w) for log_f,T in zip(log_fs,Ts))
             for log_w,log_fs,Ts in zip(log_ws,log_fss,Tss)])
    print sum(terms)/Z
    plt.plot(range(n*L),terms)
Ejemplo n.º 6
0
def uniform_motif_with_ic_imh_ref(n,
                                  L,
                                  desired_ic,
                                  epsilon=0.1,
                                  iterations=None,
                                  verbose=False,
                                  num_chains=8):
    """Run Metropolis-Hastings for motifs whose IC is within epsilon of desired_ic.

    The proposal ``Q`` ignores the current state: it draws L column counts
    from ``count_sampler`` (built from ``beta``, which is tuned with
    ``find_beta_for_mean_motif_ic`` for ``desired_ic`` plus a per-column
    correction of ``3 / (2*log(2)*n)``), turns each count into a column and
    transposes the columns into site strings.  The log target is 0 when
    ``|motif_ic(motif) - desired_ic| < epsilon`` and ``-10.0**100`` otherwise.

    Args:
        n, L: forwarded to the beta/count helpers; L is also the number of
            columns drawn per proposal.
        desired_ic: centre of the accepted IC window.
        epsilon: half-width of the accepted IC window.
        iterations: if truthy, run a single chain of this length and return
            whatever ``mh`` returns.  Otherwise run ``num_chains`` chains in
            rounds of 100, 200, 400, ... iterations until ``gelman_rubin``
            on the chains' ICs reports ``R_hat < 1.1``, then return the list
            of chains.
        verbose: accepted but not forwarded; every ``mh`` call below passes
            ``verbose=False``.
        num_chains: number of chains in the Gelman-Rubin mode.
    """
    correction_per_col = 3 / (2 * log(2) * n)
    desired_ic_for_beta = desired_ic + L * correction_per_col
    beta = find_beta_for_mean_motif_ic(n, L, desired_ic_for_beta)
    ps = count_ps_from_beta(n, beta)
    count_sampler = inverse_cdf_sampler(enumerate_counts(n), ps)

    def Q(motif):
        # Draw every count before building any column, so the sequence of
        # sampler calls is: all counts first, then all columns.
        counts = [count_sampler() for _ in range(L)]
        cols = [sample_col_from_count(count) for count in counts]
        return ["".join(site) for site in transpose(cols)]

    def log_dQ(motif_p, motif):
        return beta * motif_ic(motif_p)

    def log_f(motif):
        if abs(motif_ic(motif) - desired_ic) < epsilon:
            return 0
        return -10.0**100

    def draw_starts(how_many):
        # Propose from Q until `how_many` motifs fall inside the IC window.
        return sample_until(lambda x: log_f(x) > -1, lambda: Q(None),
                            how_many)

    def run_chain(start, length):
        return mh(log_f,
                  proposal=Q,
                  dprop=log_dQ,
                  x0=start,
                  iterations=length,
                  use_log=True,
                  verbose=False)

    if iterations:
        return run_chain(draw_starts(1)[0], iterations)

    # Gelman-Rubin mode: extend all chains, doubling the round length each
    # time the convergence check fails.
    x0s = draw_starts(num_chains)
    round_length = 100
    chains = [[] for _ in range(num_chains)]
    while True:
        for chain, start in zip(chains, x0s):
            chain.extend(run_chain(start, round_length))
        R_hat, neff = gelman_rubin(mmap(motif_ic, chains))
        if R_hat < 1.1:
            return chains
        x0s = [chain[-1] for chain in chains]
        round_length *= 2
def cv_experiment(motifs, target='uniform'):
    """see if js_psfm outperforms ml_psfm in 10x cv"""
    all_mls, all_js = [], []
    for motif in motifs:
        ml_lls = []
        js_lls = []
        for train, test in cv(motif):
            ml_mat = mmap(log, psfm_from_motif(train))
            js_mat = mmap(log, js_psfm(train, target=target))
            ml_ll = mean(score_seq(ml_mat, site) for site in test)
            js_ll = mean(score_seq(js_mat, site) for site in test)
            ml_lls.append(ml_ll)
            js_lls.append(js_ll)
        avg_ml_ll, avg_js_ll = mean(ml_lls), mean(js_lls)
        all_mls.append(avg_ml_ll)
        all_js.append(avg_js_ll)
        print avg_ml_ll, avg_js_ll, avg_ml_ll < avg_js_ll
    return all_mls, all_js
Ejemplo n.º 8
0
def on_off_experiment2(num_motifs=100,
                       filename="gini-vs-mi-correlation-in-on-off-spoofs.pdf"):
    """Compare MI vs Gini on on-off spoofs of the biological motifs.

    Each motif named in ``tfdf.tfs`` is passed to ``spoof_on_off_motif``
    (with ``num_motifs`` and ``trials=1``).  ``motif_gini`` and
    ``total_motif_mi`` are computed for every spoof, and a Pearson
    correlation between the two is taken per biological motif.  The
    correlations are plotted against their p-values (log y axis), with a
    dashed line at the level returned by ``fdr(ps)``.

    Args:
        num_motifs: forwarded to ``spoof_on_off_motif``.
        filename: forwarded to ``maybesave`` after plotting.
    """
    bio_motifs = [getattr(tfdf, tf) for tf in tfdf.tfs]
    spoofses = [
        spoof_on_off_motif(motif, num_motifs=num_motifs, trials=1)
        for motif in bio_motifs
    ]
    spoof_ginises = mmap(motif_gini, tqdm(spoofses))
    spoof_mises = mmap(total_motif_mi, tqdm(spoofses))

    cors, ps = [], []
    # Iterate over the statistics computed just above; `ginises` / `mises`
    # (used previously) are not defined anywhere in this function.
    for ginis, mis in zip(spoof_ginises, spoof_mises):
        cor, p = pearsonr(ginis, mis)
        cors.append(cor)
        ps.append(p)
    q = fdr(ps)

    # `filename` is not a scatter() argument; the figure is saved by
    # maybesave() at the end instead.
    plt.scatter(cors, ps)
    plt.plot([-1, 1], [q, q],
             linestyle='--',
             label="FDR-Adjusted Significance Level")
    plt.semilogy()
    plt.legend()
    plt.xlabel("Pearson Correlation Coefficient")
    plt.ylabel("P value")
    plt.xlim([-1, 1])
    plt.ylim([10**-4, 2])
    plt.title(
        "Gini-MI Correlation Coefficient vs. P-value for On-Off Simulations from Prokaryotic Motifs"
    )
    maybesave(filename)
def make_clusters_with_k(motif, k):
    print "k:", k
    L = len(motif[0])
    N = float(len(motif))
    clusters = [[] for i in range(k)]
    print "len clusters:", len(clusters)
    for site in motif:
        i = random.randrange(k)
        clusters[i].append(site)
    print "finished initializing"
    pssms = [
        mmap(log, psfm_from_motif_(cluster, L, pc=1)) for cluster in clusters
    ]
    alphas = [len(cluster) / N for cluster in clusters]

    def log_likelihood():
        return sum(
            log(
                sum(alpha * exp(score_seq(pssm, site))
                    for alpha, pssm in zip(alphas, pssms))) for site in motif)

    last_ll = 0
    done_yet = False
    #for i in range(iterations):
    while not done_yet:
        cur_ll = log_likelihood()
        print "log likelihood:", cur_ll
        if last_ll == cur_ll:
            done_yet = True
            break
        else:
            last_ll = cur_ll
        clusters = [[] for i in range(k)]
        for site in motif:
            i = argmax([score_seq(pssm, site) for pssm in pssms])
            clusters[i].append(site)
        pssms = [
            mmap(log, psfm_from_motif_(cluster, L, pc=1))
            for cluster in clusters
        ]
    return clusters, log_likelihood()
def estimate_stationary_statistic(matrix,n,Ne,T,samples_per_bin=10):
    """Given matrix, Ne and statistic T, estimate <T> under the stationary
    distribution.

    Bin rho (rho = 0 .. n*L - 1, with L = len(matrix)) holds
    ``samples_per_bin`` motifs from ``mutate_motif_k_times(ringer, rho)``,
    where ``ringer = ringer_motif(matrix, n)``.  Each sample carries the log
    weight ``nu * log_fitness(matrix, motif, G)`` with ``nu = Ne - 1``, and
    each bin carries the log weight ``log_rho_weight(rho, n, L)``.  The
    estimate is the ratio of the weighted sum of T to the sum of weights,
    with both sums accumulated in log space through ``logmean``/``logsum``.

    ``G`` is read from the enclosing (module) scope.  ``log(t)`` is taken of
    every value of T, so T is expected to return strictly positive numbers.
    """
    L = len(matrix)
    N = n*L
    nu = Ne - 1
    ringer = ringer_motif(matrix,n)
    all_sampless = [[mutate_motif_k_times(ringer,k) for i in range(samples_per_bin)]
                    for k in trange(N)]
    Tss = mmap(T,all_sampless)
    log_fss = mmap(lambda motif:log_fitness(matrix,motif,G),all_sampless)
    # log of f**nu, kept as a log because f**nu itself has numeric issues
    log_bz_weightss = [[nu*log_f for log_f in log_fs] for log_fs in log_fss]
    log_rho_weights = [log_rho_weight(rho,n,L) for rho in range(len(log_bz_weightss))]
    log_Z = logsum([logmean(log_bz_weights) + log_rho_w
                    for log_rho_w,log_bz_weights in zip(log_rho_weights,log_bz_weightss)])
    # log(t*w) == log(t) + log(w).  Adding the logs avoids exponentiating the
    # weight, which underflows to 0 (making log fail) or overflows for
    # large |nu*log_f|.
    log_numerator = logsum([logmean([log(t) + log_bz_w
                                     for t,log_bz_w in zip(ts,log_bz_weights)])
                            + log_rho_w
                            for log_rho_w,ts,log_bz_weights
                            in zip(log_rho_weights,Tss,log_bz_weightss)])
    return exp(log_numerator - log_Z)
def estimate_stationary_statistic(matrix, n, Ne, T, samples_per_bin=10):
    """Estimate <T> under the stationary distribution for matrix and Ne,
    by importance sampling perturbations from the ringer.

    Bin rho (rho = 0 .. n*L - 1, with L = len(matrix)) holds
    ``samples_per_bin`` motifs from ``mutate_motif_k_times(ringer, rho)``.
    A sample with fitness f is weighted by ``f**nu`` (``nu = Ne - 1``) and
    bin rho by ``4**rho``.  The result is the sum over bins of
    ``4**rho * mean(T * weight / Z)``, where Z is the sum over bins of
    ``mean(weight) * 4**rho``.  ``G`` is read from the enclosing (module)
    scope.
    """
    L = len(matrix)
    N = n * L
    nu = Ne - 1
    ringer = ringer_motif(matrix, n)

    all_sampless = []
    for k in trange(N):
        all_sampless.append(
            [mutate_motif_k_times(ringer, k) for _ in range(samples_per_bin)])
    Tss = mmap(T, all_sampless)
    fss = mmap(lambda motif: fitness(matrix, motif, G), all_sampless)
    # better expressed as exp(nu*log(f)), but numeric issues
    bz_weightss = [[f**nu for f in fs] for fs in fss]

    Z = 0
    for rho, bz_weights in enumerate(bz_weightss):
        Z += mean(bz_weights) * 4**rho

    estimate = 0
    for rho, (ts, bz_weights) in enumerate(zip(Tss, bz_weightss)):
        estimate += 4**rho * mean(t * bz_weight / Z
                                  for t, bz_weight in zip(ts, bz_weights))
    return estimate
def site_sampling_methods_study(n=50, num_motifs=10, plot=True):
    """validate that the three proposed sampling methods:

    brute force
    rejection sampling
    metropolis hastings

    do in fact sample from the same distribution
    """

    L = 10
    sigma = 1
    matrix = sample_matrix(L, sigma)
    Ne = 5
    mu = -10
    print "bf"
    t0 = time.time()
    bf_motifs = [sample_motif_bf(matrix, mu, Ne, n,verbose=True)
                 for i in trange(num_motifs)]
    bf_time = time.time() - t0
    print "ar"
    t0 = time.time()
    ar_motifs = [sample_motif_ar(matrix, mu, Ne, n)
                 for i in range(num_motifs)]
    ar_time = time.time() - t0
    print "mh"
    t0 = time.time()
    mh_motifs = [sample_motif_mh(matrix, mu, Ne, n)
                 for i in range(num_motifs)]
    mh_time = time.time() - t0
    icss = mmap(motif_ic,[bf_motifs, ar_motifs, mh_motifs])
    print "ics:", map(mean_ci, icss)
    print "time per motif:", [t/num_motifs
                              for t in [bf_time, ar_time, mh_time]]
    if plot:
        plt.boxplot(icss)
    for xs, ys in choose2(icss):
        print mannwhitneyu(xs,ys)
Ejemplo n.º 13
0
def uniform_motif_with_ic_rw(n,
                             L,
                             desired_ic,
                             epsilon=0.1,
                             p=None,
                             iterations=None,
                             num_chains=8,
                             x0=None,
                             beta=None):
    if p is None:
        p = 2.0 / (n * L)

    def Q(motif):
        return mutate_motif_p(motif, p)

    def f(motif):
        return abs(motif_ic(motif) - desired_ic) < epsilon

    if type(iterations) is int:
        if x0 is None:
            x0 = uniform_motif_with_ic_imh(n,
                                           L,
                                           desired_ic,
                                           epsilon=epsilon,
                                           iterations=1,
                                           beta=beta)[0]
        chain = mh(f, proposal=Q, x0=x0, iterations=iterations)
        return chain
    elif iterations == "harmonic":
        ar = 1.0 / 5
        iterations = int(n * L * harmonic(n * L) / ar)
        print "iterations:", iterations
        if x0 is None:
            x0 = uniform_motif_with_ic_imh(n,
                                           L,
                                           desired_ic,
                                           epsilon=epsilon,
                                           iterations=1)[0]
        chain = mh(f, proposal=Q, x0=x0, iterations=iterations)
        return chain
    else:  #use gelman rubin criterion
        x0s = [
            uniform_motif_with_ic_imh(n,
                                      L,
                                      desired_ic,
                                      epsilon=epsilon,
                                      iterations=1)[0]
            for i in range(num_chains)
        ]
        iterations = 100
        converged = False
        chains = [[] for _ in range(num_chains)]
        while not converged:
            for chain, x0 in zip(chains, x0s):
                chain.extend(
                    mh(f,
                       proposal=Q,
                       x0=x0,
                       iterations=iterations,
                       verbose=False))
            ic_chains = mmap(motif_ic, chains)
            R_hat, neff = gelman_rubin(ic_chains)
            if R_hat < 1.1:
                return chains
            else:
                x0s = [chain[-1] for chain in chains]
                iterations *= 2
Ejemplo n.º 14
0
def normalize_matrix(xss):
    """Divide every entry of ``xss`` by the grand total of all its entries.

    The total is the sum of the row sums, converted to float so the division
    is true division.  The per-entry division is applied through ``mmap``
    and its result is returned as-is.
    """
    grand_total = float(sum(sum(row) for row in xss))

    def scale(x):
        return x / grand_total

    return mmap(scale, xss)