Example #1
def cumsum_test():
    arca_reads = get_arca_reads(1000000)
    true_rdm = density_from_reads(arca_reads, G)
    pssm = make_pssm(Escherichia_coli.ArcA)
    comb_rdm = true_rdm[0] + true_rdm[1]
    print "fwd_scores"
    fwd_scores = score_genome_np(pssm, genome)
    print "rev_scores"
    rev_scores = score_genome_np(pssm, wc(genome))
    scores = np.log(np.exp(fwd_scores) + np.exp(rev_scores))
    probs = np.exp(scores)/np.sum(np.exp(scores))
    print "sorting scores"
    score_js = sorted_indices(scores)[::-1] # order scores from greatest to least
    print "sorting probs"
    prob_js = sorted_indices(probs)[::-1] # ditto
    plt.plot(cumsum(rslice(comb_rdm, score_js)), label="scores")
    plt.plot(cumsum(rslice(comb_rdm, prob_js)), label="boltzmann probs")
    comb_rdm_copy = list(comb_rdm)
    controls = 5
    for i in range(controls):
        print i
        random.shuffle(comb_rdm_copy)
        plt.plot(cumsum(comb_rdm_copy), color='r')
    plt.legend(loc=0)
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.show()
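The snippet above leans on a couple of helpers that are not shown in these excerpts. A minimal sketch of what they presumably do, assuming `sorted_indices` has argsort semantics and `rslice` simply reorders a list by an index sequence:

import numpy as np

def sorted_indices(xs):
    # Assumed helper: indices that would sort xs in ascending order.
    return list(np.argsort(xs))

def rslice(xs, js):
    # Assumed helper: reorder xs according to the index sequence js.
    return [xs[j] for j in js]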
Example #2
def L_vs_sigma_plot(filename=None, with_bio=False):
    if with_bio:
        tfdf = extract_motif_object_from_tfdf()
        motifs = [getattr(tfdf, tf) for tf in tfdf.tfs]
        Ls = [len(motif[0]) for motif in motifs]
        cs = [len(motif) for motif in motifs]
        ics = [motif_ic(motif) for motif in motifs]
        ic_density = [ic / L for ic, L in zip(ics, Ls)]
        sigmas = [mean(map(sd, make_pssm(motif))) for motif in motifs]
        ginis = [motif_gini(motif, correct=False) for motif in motifs]
        mi_density = [
            total_motif_mi(motif) / choose(L, 2)
            for motif, L in zip(motifs, Ls)
        ]
    min_sigma = 0.1
    max_sigma = 10
    plt.xlim(0, max_sigma)
    plt.ylim(0, 60)
    plt.plot(*pl(crit_L, np.linspace(min_sigma, max_sigma, 1000)),
             label="Binding Transition")
    plt.plot([min_sigma, max_sigma],
             [log(G, 2) / 2, log(G, 2) / 2],
             linestyle='--',
             label="Info Theory Threshold")
    # plt.plot(*pl(lambda sigma:log(G)/sigma,np.linspace(min_sigma,max_sigma,1000)),
    #          linestyle='--',label="Zero Discrimination Asymptote")
    if with_bio:
        plt.scatter(sigmas, Ls, label="Biological Motifs")
    plt.xlabel("sigma")
    plt.ylabel("L")
    plt.legend()
    maybesave(filename)
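The plotting calls above star-unpack `pl(f, xs)` straight into `plt.plot`, which suggests it pairs the sample points with the function values. A minimal sketch under that assumption:

def pl(f, xs):
    # Assumed helper: return (xs, f over xs) so that plt.plot(*pl(f, xs)) plots f.
    return xs, [f(x) for x in xs]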
def validate_sample_motif_neglect_fg2(iterations=50000):
    """compare fg_neglect sampling to MCMC"""
    bio_motif = Escherichia_coli.LexA
    n = len(bio_motif)
    L = len(bio_motif[0])
    matrix = [[-ep for ep in row] for row in make_pssm(bio_motif)]
    ringer = ringer_motif(matrix,n)
    Ne = 2.375 
    random_motifs = [sample_motif_neglect_fg(matrix,n,Ne) for i in trange(iterations)]
    random_rhos = [motif_hamming_distance(ringer,motif) for motif in tqdm(random_motifs)]
    random_log_fs = [log_fitness(matrix,motif,G) for motif in tqdm(random_motifs)]
    random_ics = map(motif_ic,random_motifs)
    _, chain = sella_hirsch_mh(matrix=matrix,init="ringer",Ne=Ne,n=n,iterations=iterations)
    chain_rhos = [motif_hamming_distance(ringer,motif) for motif in tqdm(chain)]
    chain_log_fs = [log_fitness(matrix,motif,G) for motif in tqdm(chain)]
    chain_ics = map(motif_ic,chain)
    plt.subplot(1,2,1)
    plt.scatter(random_rhos,random_log_fs)
    plt.scatter(chain_rhos,chain_log_fs,color='g')
    plt.xlabel("rho")
    plt.ylabel("log fitness")
    plt.subplot(1,2,2)
    plt.scatter(random_rhos,random_ics)
    plt.scatter(chain_rhos,chain_ics,color='g')
    plt.xlabel("rho")
    plt.ylabel("IC")
def make_ecoli_df():
    Ls = []
    Ls_adj = []
    ns = []
    sigmas = []
    labels = []
    motif_ics = []
    motif_ics_per_base = []
    for tf in Escherichia_coli.tfs:
        sites = getattr(Escherichia_coli, tf)
        L = len(sites[0])
        n = len(sites)
        ns.append(n)
        L_adj = len(sites[0]) + log2(n)
        sigma = mean(map(sd, make_pssm(sites)))
        Ls.append(L)
        Ls_adj.append(L_adj)
        motif_ics.append(motif_ic(sites))
        motif_ics_per_base.append(motif_ic(sites) / float(L))
        sigmas.append(sigma)
    df = pd.DataFrame(
        {
            "L": Ls,
            "n": ns,
            "sigma": sigmas,
            "motif_ic": motif_ics,
            "info_density": motif_ics_per_base
        },
        index=Escherichia_coli.tfs)
    return df
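A possible way to inspect the resulting table, assuming a pandas version that provides sort_values:

df = make_ecoli_df()
# Rank E. coli TFs by information density (bits per base), highest first.
print(df.sort_values("info_density", ascending=False).head())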
def test_estimate_stationary_statistic_ref_framework():
    matrix = make_pssm(Escherichia_coli.LexA)
    n = len(Escherichia_coli.LexA)
    Nes = np.linspace(1,5,10)
    pred,obs = transpose([test_estimate_stationary_statistic_ref(matrix,n,Ne,T=motif_ic) for Ne in Nes])
    plt.plot(Nes,pred)
    plt.plot(Nes,obs)
    return pred,obs
Example #6
def spoof_motif(motif, T):
    n = len(motif)
    L = len(motif[0])
    bio_ic = motif_ic(motif)
    sigma = 2 * mean(map(sd, make_pssm(motif)))  # XXX REVISIT THIS ISSUE
    ic_from_Ne = lambda Ne: predict_stat(n,
                                         L,
                                         sigma,
                                         Ne,
                                         G=5 * 10**6,
                                         T=lambda rho: mean_ic_from_rho(
                                             rho, n, L))
    Ne = bisect_interval(lambda Ne: ic_from_Ne(Ne) - bio_ic, 0.01, 5)
    return predict_stat(n, L, sigma, Ne, T)
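`spoof_motif` solves for Ne by root-finding on an interval, but `bisect_interval` is not defined in these excerpts. A minimal sketch of a standard bisection root-finder, assuming f changes sign on [lo, hi]:

def bisect_interval(f, lo, hi, tol=1e-6, max_iter=100):
    # Assumed helper: bisection root-finder; requires f(lo) and f(hi) to have
    # opposite signs.
    f_lo = f(lo)
    for _ in range(max_iter):
        mid = (lo + hi) / 2.0
        f_mid = f(mid)
        if abs(f_mid) < tol or (hi - lo) / 2.0 < tol:
            return mid
        if (f_lo < 0) == (f_mid < 0):
            lo, f_lo = mid, f_mid  # root lies in the upper half
        else:
            hi = mid               # root lies in the lower half
    return (lo + hi) / 2.0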
Example #7
def arca_motif_comparison():
    arca_reads = get_arca_reads()
    true_rdm = density_from_reads(arca_reads, G)
    pssm = make_pssm(Escherichia_coli.ArcA)
    plt.plot(true_rdm[0])
    plt.plot(true_rdm[1])
    fwd_scores, rev_scores = score_genome_np(pssm, genome)
    scores = np.log(np.exp(fwd_scores) + np.exp(rev_scores))
    sites = concat([(site, wc(site)) for site in Escherichia_coli.ArcA])
    site_locations = [m.start(0) for site in sites
                      for m in re.finditer(site, genome)]
    site_locations_np = np.zeros(G)
    for site_loc in site_locations:
        site_locations_np[site_loc] = 1
    plt.plot(site_locations_np)
    plt.plot(scores)
def validate_sample_motif_neglect_fg():
    """compare fg_neglect sampling to random mutation: indeed shows better fitness at given rho"""
    bio_motif = Escherichia_coli.LexA
    n = len(bio_motif)
    L = len(bio_motif[0])
    matrix = [[-ep for ep in row] for row in make_pssm(bio_motif)]
    ringer = ringer_motif(matrix,n)
    random_motifs = [mutate_motif_k_times(ringer,k) for k in range(n*L)]
    random_motifs2 = [sample_motif_neglect_fg(matrix,n,Ne) for Ne in np.linspace(1,10,n*L)]
    random_rhos = [motif_hamming_distance(ringer,motif) for motif in random_motifs]
    random_log_fs = [log_fitness(matrix,motif,G) for motif in random_motifs]
    random_rhos2 = [motif_hamming_distance(ringer,motif) for motif in random_motifs2]
    random_log_fs2 = [log_fitness(matrix,motif,G) for motif in random_motifs2]
    plt.plot(random_rhos,random_log_fs)
    plt.plot(random_rhos2,random_log_fs2)
def Ne_from_motif(bio_motif,interp_rounds,iterations=50000):
    """Given a motif, return Ne that matches mean IC"""
    bio_ic = motif_ic(bio_motif)
    n = len(bio_motif)
    L = len(bio_motif[0])
    matrix = [[-ep for ep in row] for row in  make_pssm(bio_motif)]
    print len(matrix)
    def f(Ne,iterations=iterations):
        print "Ne",Ne
        _,chain = sella_hirsch_mh(matrix=matrix,n=n,Ne=Ne,iterations=iterations,init='ringer')
        return mean(map(motif_ic,chain[iterations/2:])) - bio_ic
    # lo,hi = 1,5
    # data = []
    # for _ in xrange(interp_rounds):
    #     guess = (lo + hi)/2.0
    #     y = f(guess)
    #     print lo,hi,guess,y
    #     data.append((guess,y))
    #     if y > 0:
    #         hi = guess
    #     else:
    #         lo = guess
    # return data
    Ne_min = 1
    Ne_max = 5
    while f(Ne_max) < 0:
        print "increasing Ne max"
        Ne_max *= 2
    xs, ys= transpose([(Ne,f(Ne)) for Ne in np.linspace(Ne_min,Ne_max,interp_rounds)])
    # now find an interpolant: we want the smallest sigma for the gaussian
    # interpolant such that the interpolated curve has exactly one inflection point
    interp_sigmas = np.linspace(0.01,1,100)
    interps = [gaussian_interp(xs,ys,sigma=s) for s in interp_sigmas]
    for i,(sigma, interp) in enumerate(zip(interp_sigmas,interps)):
        print i,sigma
        if num_inflection_points(map(interp,np.linspace(Ne_min,Ne_max,100))) == 1:
            "found 1 inflection point"
            break
    print sigma
    Ne = bisect_interval(interp,Ne_min,Ne_max)
    return Ne
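`gaussian_interp` and `num_inflection_points` are also undefined here. One plausible reading, offered only as a sketch: a Gaussian-kernel (Nadaraya-Watson) smoother over the sampled (Ne, delta-IC) pairs, and an inflection count based on sign changes of the discrete second difference:

import numpy as np

def gaussian_interp(xs, ys, sigma):
    # Assumed helper: Gaussian-kernel weighted average of the sampled ys.
    xs = np.asarray(xs, dtype=float)
    ys = np.asarray(ys, dtype=float)
    def interp(x):
        weights = np.exp(-(x - xs) ** 2 / (2.0 * sigma ** 2))
        return float(np.dot(weights, ys) / np.sum(weights))
    return interp

def num_inflection_points(ys):
    # Assumed helper: count sign changes in the second difference of the curve.
    d2 = np.diff(ys, n=2)
    signs = np.sign(d2)
    signs = signs[signs != 0]
    return int(np.sum(signs[1:] != signs[:-1]))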
Example #10
def power_law_exploration():
    """Are the read densities for a given bin power-law distributed?"""
    print "getting arca reads"
    arca_reads = get_arca_reads(1000000)
    print "computing read density map"
    true_rdm = density_from_reads(arca_reads, G)
    comb_rdm = true_rdm[0] + true_rdm[1]
    pssm = make_pssm(Escherichia_coli.ArcA)
    print "scoring"
    fwd_scores, rev_scores = score_genome_np(pssm, genome)
    scores = np.log(np.exp(fwd_scores) + np.exp(rev_scores))
    d = defaultdict(list)
    print "tabulating"
    for i in xrange(G):
        score = int(scores[i])
        d[score].append(comb_rdm[i])
    print "plotting"
    for key in sorted(d.keys()):
        counts = Counter(d[key])
        Z = float(sum(counts.values()))
        plt.plot(sorted(counts.keys()),
                 [counts[k]/Z for k in sorted(counts.keys())], label=key)
    plt.loglog()
    plt.show()
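To push the docstring's question a bit further, a follow-up sketch (not part of the original example) that estimates a tail exponent per score bin with the standard continuous maximum-likelihood estimator; the function name and the xmin cutoff are illustrative choices:

import numpy as np

def power_law_alpha_mle(xs, xmin=1.0):
    # Continuous power-law MLE (Clauset et al. 2009): alpha = 1 + n / sum(ln(x / xmin)),
    # fitted to the values at or above the cutoff xmin.
    tail = np.asarray([x for x in xs if x >= xmin], dtype=float)
    return 1.0 + len(tail) / np.sum(np.log(tail / xmin))

# e.g., inside the tabulation loop above: alpha = power_law_alpha_mle(d[key])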
def jackknife_distribution(motif):
    scores = []
    for i in range(len(motif)):
        motif_p = [site for j, site in enumerate(motif) if j != i]
        scores.append(score_seq(make_pssm(motif_p), motif[i]))
    return scores
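One illustrative way to use the leave-one-out scores (not from the original source): compare their distributions across motifs to spot sites that fit their own motif poorly.

import matplotlib.pyplot as plt

lexa_scores = jackknife_distribution(Escherichia_coli.LexA)
arca_scores = jackknife_distribution(Escherichia_coli.ArcA)
plt.hist([lexa_scores, arca_scores], label=["LexA", "ArcA"])
plt.legend()
plt.show()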
def make_ecoli_sigma_L_plot():
    Ls = []
    Ls_adj = []
    ns = []
    sigmas = []
    labels = []
    motif_ics = []
    motif_ics_per_base = []
    for tf in Escherichia_coli.tfs:
        sites = getattr(Escherichia_coli, tf)
        L = len(sites[0])
        n = len(sites)
        ns.append(n)
        L_adj = len(sites[0]) + log2(n)
        sigma = mean(map(sd, make_pssm(sites)))
        Ls.append(L)
        Ls_adj.append(L_adj)
        motif_ics.append(motif_ic(sites))
        motif_ics_per_base.append(motif_ic(sites) / float(L))
        sigmas.append(sigma)
        labels.append(tf)
    sigma_space = np.linspace(0.1, 3, 10)
    crit_lambs_actual = map(
        lambda sigma: critical_lamb_actual(sigma, G=4.5 * 10**6, trials=100),
        tqdm(sigma_space))
    plt.subplot(1, 6, 1)
    plt.scatter(sigmas, Ls)
    for L, sigma, label in zip(Ls, sigmas, labels):
        plt.annotate(label, xy=(sigma, L))
    plt.plot(*pl(lambda sigma: critical_lamb(sigma, G=5 * 10**6), sigma_space))
    plt.plot(
        *pl(lambda sigma: critical_lamb(sigma, G=4.5 * 10**6), sigma_space))
    plt.plot(sigma_space, crit_lambs_actual)
    plt.subplot(1, 6, 2)
    plt.scatter(sigmas, Ls_adj)
    for L_adj, sigma, label in zip(Ls_adj, sigmas, labels):
        plt.annotate(label, xy=(sigma, L_adj))
    plt.plot(*pl(lambda sigma: critical_lamb(sigma, G=5 * 10**6), sigma_space))
    plt.plot(
        *pl(lambda sigma: critical_lamb(sigma, G=4.5 * 10**6), sigma_space))
    plt.plot(sigma_space, crit_lambs_actual)
    preds = [critical_lamb(sigma, G=4.5 * 10**6) for sigma in tqdm(sigmas)]
    preds_actual = [
        critical_lamb_actual(sigma, G=4.5 * 10**6, trials=100)
        for sigma in tqdm(sigmas)
    ]
    plt.subplot(1, 6, 3)
    plt.scatter(preds, Ls)
    plt.xlabel("Predicted Length")
    plt.ylabel("Observed Length")
    plt.title("Preds vs Ls")
    print "Preds vs Ls", pearsonr(preds, Ls)
    plt.plot([0, 30], [0, 30])
    plt.subplot(1, 6, 4)
    plt.scatter(preds, Ls_adj)
    plt.xlabel("Predicted Length")
    plt.ylabel("Observed Length")
    plt.plot([0, 30], [0, 30])
    plt.title("Preds vs Ls_adj")
    print "Preds vs Ls_adj", pearsonr(preds, Ls_adj)
    plt.subplot(1, 6, 5)
    plt.scatter(preds_actual, Ls)
    plt.xlabel("Predicted Length")
    plt.ylabel("Observed Length")
    plt.plot([0, 30], [0, 30])
    plt.title("Preds_actual vs Ls")
    print "Preds_actual vs Ls", pearsonr(preds_actual, Ls)
    plt.subplot(1, 6, 6)
    plt.scatter(preds_actual, Ls_adj)
    plt.xlabel("Predicted Length")
    plt.ylabel("Observed Length")
    plt.plot([0, 30], [0, 30])
    plt.title("Preds_actual vs Ls_adj")
    print "Preds_actual vs Ls_adj", pearsonr(preds_actual, Ls_adj)
    return Ls, sigmas