Exemple #1
0
def rolling_window_correlation_exp():
    """What is the radius of the rolling average window that maximizes
    the correlation between TRAP probabilities, ChIP-seq read intensities?

    Builds a PSSM from ``arac_motif``, slides its TRAP score over the
    E. coli genome and turns the energies into Boltzmann probabilities
    (exp(-beta * energy) / z).  For every radius k in range(250, 750, 25)
    the first 10**5 probabilities are Spearman-correlated with
    ``circular_rolling_average(chip_seq_data[:10**5], k)``.  The resulting
    (k, rho) curve is plotted and saved to
    "correlation_vs_rolling_average_radius.png".

    Takes no arguments and returns None; the plot file is the output.
    """
    from textwrap import fill

    pssm = PSSM(arac_motif)
    genome = get_ecoli_genome()
    traps = pssm.slide_trap(genome)
    beta = 1.61  # approximately = 1/(kBT) in kcal/mol @ room temp

    # Evaluate each Boltzmann weight once; the partition function z still
    # needs every position, but only the compared prefix is normalized.
    weights = [exp(-beta * ep) for ep in traps]
    z = sum(weights)
    n_bp = 10**5  # number of leading genome positions that are compared
    probs = [w / z for w in weights[:n_bp]]

    # The ChIP-seq slice is rebuilt per radius on purpose, so the helper
    # always receives a fresh list exactly as before.
    corrs = [(k, spearmanr(probs,
                           circular_rolling_average(chip_seq_data[:n_bp],
                                                    k))[0])
             for k in verbose_gen(range(250, 750, 25))]

    plt.plot(*transpose(corrs))
    plt.xlabel("Rolling Window Radius (bp)")
    plt.ylabel(r"Spearman $\rho$")
    plt.subplots_adjust(top=0.8)  # leave headroom for the wrapped title
    # textwrap.fill(text) is shorthand for "\n".join(textwrap.wrap(text)).
    plt.title(
        fill(
            "Spearman Correlation vs. Radius of Rolling Average for TRAP probabilities, ChIP-seq reads (AraC a) on bp 1-10**5 of E. coli genome"
        ))
    plt.savefig("correlation_vs_rolling_average_radius.png", dpi=400)
    # Conclusion: correlation is maximized for window radius 400-500
    # (the remainder of this note is cut off in the source).
Exemple #2
0
def unflip_motif(motif):
    """Given a collection of possibly reverse complemented sites, unflip them.

    Each site is scored, on one strand only, against a PSSM built from all
    the *other* sites (leave-one-out), once as given and once after wc().
    If the wc() form scores strictly higher it takes the site's place in
    the returned copy; the input motif itself is left untouched.  The site
    and both of its scores are printed as a progress trace.
    """
    from sufficache import PSSM
    oriented = motif[:]
    for idx, site in enumerate(motif):
        # Leave-one-out model: every site of the original motif except this one.
        others = [other for (pos, other) in enumerate(motif) if pos != idx]
        loo_model = PSSM(others)
        forward = loo_model.score(site, both_strands=False)
        backward = loo_model.score(wc(site), both_strands=False)
        print(site)
        print("%s %s" % (forward, backward))
        if backward > forward:
            oriented[idx] = wc(site)
    return oriented
def hill_coefficient_exp(tf_name, approx=False):
    """What is the effective hill coefficient of a binding site?

    For every site in the motif ``Escherichia_coli.<tf_name>`` this evaluates
    fermi_dirac(site_energy, mu) over the module-level ``mus``, plots those
    values against the copy numbers from ``copy_dict[tf_name]`` (entry 0
    when ``approx`` is false, entry 1 when it is true) and fits a Hill
    function to the curve.  One line per site is printed with the site, its
    operon and the two fitted parameters.

    Returns a pair ``(ns, x_ks)`` of lists, one entry per site, holding the
    two values returned by fit_hill_function (presumably the Hill
    coefficient n and the half-saturation point x_k -- confirm against
    fit_hill_function).  The figure is drawn on but neither shown nor saved.
    """
    motif = getattr(Escherichia_coli, tf_name)
    pssm = PSSM(motif)
    copy_entry = copy_dict[tf_name]
    exact_counts, approx_counts = copy_entry[0], copy_entry[1]
    copies = approx_counts if approx else exact_counts

    ns, x_ks = [], []
    for site in motif:
        site_energy = pssm.trap(site)
        occupancies = [fermi_dirac(site_energy, mu) for mu in mus]
        plt.plot(copies, occupancies)
        x_k, n = fit_hill_function(copies, occupancies)
        print("%s %s %s %s" % (site, site.operon, n, x_k))
        x_ks.append(x_k)
        ns.append(n)
    return ns, x_ks
def rolling_window_correlation_exp():
    """What is the radius of the rolling average window that maximizes
    the correlation between TRAP probabilities, ChIP-seq read intensities?

    Builds a PSSM from ``arac_motif``, slides its TRAP score over the
    E. coli genome and turns the energies into Boltzmann probabilities
    (exp(-beta * energy) / z).  For every radius k in range(250, 750, 25)
    the first 10**5 probabilities are Spearman-correlated with
    ``circular_rolling_average(chip_seq_data[:10**5], k)``.  The resulting
    (k, rho) curve is plotted and saved to
    "correlation_vs_rolling_average_radius.png".

    Takes no arguments and returns None; the plot file is the output.
    """
    from textwrap import fill

    pssm = PSSM(arac_motif)
    genome = get_ecoli_genome()
    traps = pssm.slide_trap(genome)
    beta = 1.61  # approximately = 1/(kBT) in kcal/mol @ room temp

    # Evaluate each Boltzmann weight once; the partition function z still
    # needs every position, but only the compared prefix is normalized.
    weights = [exp(-beta * ep) for ep in traps]
    z = sum(weights)
    n_bp = 10**5  # number of leading genome positions that are compared
    probs = [w / z for w in weights[:n_bp]]

    # The ChIP-seq slice is rebuilt per radius on purpose, so the helper
    # always receives a fresh list exactly as before.
    corrs = [(k, spearmanr(probs,
                           circular_rolling_average(chip_seq_data[:n_bp], k))[0])
             for k in verbose_gen(range(250, 750, 25))]

    plt.plot(*transpose(corrs))
    plt.xlabel("Rolling Window Radius (bp)")
    plt.ylabel(r"Spearman $\rho$")
    plt.subplots_adjust(top=0.8)  # leave headroom for the wrapped title
    # textwrap.fill(text) is shorthand for "\n".join(textwrap.wrap(text)).
    plt.title(fill("Spearman Correlation vs. Radius of Rolling Average for TRAP probabilities, ChIP-seq reads (AraC a) on bp 1-10**5 of E. coli genome"))
    plt.savefig("correlation_vs_rolling_average_radius.png", dpi=400)
    # Conclusion: correlation is maximized for window radius 400-500
    # (the remainder of this note is cut off in the source).
        return "".join([line.strip() for line in f.readlines()[1:]])
        
genome = get_mycobacterium_genome()

# Site 1 in smollett_data does not appear in the Myco genome.  (We use
# strain H37Rv; they use strain 1424, which is derived from the former.)
# For this reason, we need to revise the genome in order to stitch the
# site in.

start_coordinate = (
    3811492  # start position of region listed in Table 1
    + 158    # position of strongest site, relative to start position
)

site1 = smollett_sites[1]
revised_genome = subst(genome, site1, start_coordinate)

# The model is trained on all Smollett sites and slid over the revised genome.
model = PSSM(smollett_sites.values())

traps = model.slide_trap(revised_genome)
# Copy number at each chemical potential mu: summed fd_probs occupancies
# versus the approximation based on the partition function z.
exact_copies = [sum(fd_probs(traps, mu, beta)) for mu in verbose_gen(mus)]
z = sum(exp(-beta * ep) for ep in traps)
approx_copies = [
    approximate_copy_number_from_mu(traps, mu, z)
    for mu in verbose_gen(mus)
]

absolute_ns_energy = -8  # kBT = -5 kcal/mol
width = len(site1)
ep_ns = 2 * width + absolute_ns_energy  # Assume binding energy is -2 kBT/match


def offset(ep):
    """Combine energy ep with the non-specific energy ep_ns.

    Returns -log(exp(-beta*ep) + exp(-beta*ep_ns)) / beta, reading the
    module-level beta and ep_ns at call time.
    """
    return log(exp(-beta * ep) + exp(-beta * ep_ns)) / -beta


ns_traps = list(map(offset, traps))

# Entries 1..25 of smollett_data; field 0 is a (start, end) coordinate pair
# and field 1 is the matching score.
coordinates = [smollett_data[i][0] for i in range(1, 26)]
scores = [smollett_data[i][1] for i in range(1, 26)]
# NOTE(review): regions are cut from `genome`, not `revised_genome`, and are
# extended 18 positions past end_pos (presumably the site width) -- confirm.
regions = [
    genome[start_pos:end_pos + 18]
    for (start_pos, end_pos) in coordinates
]
"""This script generates a binding landscape for each TF"""

import sys
sys.path.append("src/sufficache")
from sufficache import PSSM
sys.path.append("data/motifs")
from motifs import *
from chem_pot_utils import get_ecoli_genome
from array import array

if __name__ == "__main__":
    # Command line: <script> tf_name [control]
    #   tf_name  attribute of Escherichia_coli holding the TF's motif
    #   control  optional literal "control": score a random_site() sequence
    #            of genome length instead of the real genome
    # Writes the TRAP energy at every position as a raw array('f') file
    # under results/binding_landscapes/.
    if len(sys.argv) < 2:
        # Exit with the usage text instead of an IndexError traceback.
        sys.exit("usage: %s tf_name [control]" % sys.argv[0])
    tf_name = sys.argv[1]
    control = len(sys.argv) == 3 and sys.argv[2] == "control"
    genome = get_ecoli_genome()
    if control:
        genome = random_site(len(genome))
    # "Control" * control is "Control" for a control run and "" otherwise.
    print("Generating %s landscape for %s " % ("Control" * control, tf_name))
    tf = getattr(Escherichia_coli, tf_name)
    pssm = PSSM(tf)
    binding_energies = pssm.slide_trap(genome)
    arr = array('f')
    arr.extend(binding_energies)
    if not control:
        fname = "results/binding_landscapes/%s_genome_binding_landscape.dat" % tf_name
    else:
        fname = "results/binding_landscapes/%s_control_binding_landscape.dat" % tf_name
    # array.tofile writes raw machine values, so the file must be binary:
    # text mode rewrites newline bytes on Windows and is rejected on Python 3.
    with open(fname, 'wb') as f:
        arr.tofile(f)