Exemple #1
0
def rolling_window_correlation_exp():
    """Plot Spearman correlation of TRAP probabilities vs. smoothed ChIP-seq.

    Question being asked: what is the radius of the rolling average window
    that maximizes the correlation between TRAP probabilities and ChIP-seq
    read intensities?

    Steps:
      1. Build a ``PSSM`` from ``arac_motif`` and slide it over the E. coli
         genome to obtain one TRAP value per position.
      2. Turn every TRAP value ``ep`` into ``exp(-beta * ep) / z``, where
         ``z`` is the sum of ``exp(-beta * ep)`` over all positions.
      3. For each radius ``k`` in ``range(250, 750, 25)``, take element 0 of
         ``spearmanr(...)`` between the first 10**5 probabilities and the
         circular rolling average (radius ``k``) of the first 10**5 entries
         of ``chip_seq_data``.
      4. Plot correlation against radius and save the figure to
         ``correlation_vs_rolling_average_radius.png`` at 400 dpi.

    Relies on names defined outside this function: ``PSSM``, ``arac_motif``,
    ``get_ecoli_genome``, ``chip_seq_data``, ``spearmanr``,
    ``circular_rolling_average``, ``verbose_gen``, ``transpose``, ``exp``
    and ``plt``.

    Returns:
        None.  The result is the saved figure.
    """
    from textwrap import fill

    n_positions = 10**5  # only this prefix of the genome is compared
    pssm = PSSM(arac_motif)
    genome = get_ecoli_genome()
    traps = pssm.slide_trap(genome)
    beta = 1.61  # approximately = 1/(kBT) in kcal/mol @ room temp
    # The normalizer runs over every position, not just the compared prefix.
    z = sum(exp(-beta * ep) for ep in traps)
    probs = [exp(-beta * ep) / z for ep in traps]
    # Both windows are the same for every radius, so slice them only once.
    probs_window = probs[:n_positions]
    reads_window = chip_seq_data[:n_positions]
    corrs = [(k,
              spearmanr(probs_window,
                        circular_rolling_average(reads_window, k))[0])
             for k in verbose_gen(range(250, 750, 25))]
    plt.plot(*transpose(corrs))
    plt.xlabel("Rolling Window Radius (bp)")
    plt.ylabel(r"Spearman $\rho$")
    # Leave head-room above the axes for the multi-line title.
    plt.subplots_adjust(top=0.8)
    # textwrap.fill(text) is shorthand for "\n".join(textwrap.wrap(text)).
    plt.title(
        fill("Spearman Correlation vs. Radius of Rolling Average for TRAP "
             "probabilities, ChIP-seq reads (AraC a) on bp 1-10**5 of "
             "E. coli genome"))
    plt.savefig("correlation_vs_rolling_average_radius.png", dpi=400)
    # Conclusion: correlation is maximized for window radius 400-500
    # NOTE(review): the conclusion above was an unterminated string literal
    # in the original; whatever followed "400-500" is missing.
def rolling_window_correlation_exp():
    """Find the rolling-average radius that best matches TRAP to ChIP-seq.

    What is the radius of the rolling average window that maximizes the
    correlation between TRAP probabilities and ChIP-seq read intensities?

    A ``PSSM`` built from ``arac_motif`` is slid over the E. coli genome;
    each resulting TRAP value ``ep`` becomes ``exp(-beta*ep)/z`` with ``z``
    the sum of ``exp(-beta*ep)`` over all positions.  For every radius in
    ``range(250,750,25)`` the first 10**5 probabilities are compared, via
    element 0 of ``spearmanr``, with the circular rolling average of the
    first 10**5 entries of ``chip_seq_data``.  The (radius, correlation)
    pairs are plotted and the figure is saved as
    ``correlation_vs_rolling_average_radius.png`` (dpi=400).

    Uses these names from the enclosing module: ``PSSM``, ``arac_motif``,
    ``get_ecoli_genome``, ``chip_seq_data``, ``spearmanr``,
    ``circular_rolling_average``, ``verbose_gen``, ``transpose``, ``exp``
    and ``plt``.

    Returns:
        None.
    """
    from textwrap import fill

    n_positions = 10**5  # length of the genome prefix that is compared
    pssm = PSSM(arac_motif)
    genome = get_ecoli_genome()
    traps = pssm.slide_trap(genome)
    beta = 1.61  # approximately = 1/(kBT) in kcal/mol @ room temp
    # z sums over the whole landscape, so it must be computed before slicing.
    z = sum(exp(-beta*ep) for ep in traps)
    probs = [exp(-beta*ep)/z for ep in traps]
    # Loop-invariant inputs: slice once instead of once per radius.
    probs_window = probs[:n_positions]
    reads_window = chip_seq_data[:n_positions]
    corrs = [(k, spearmanr(probs_window,
                           circular_rolling_average(reads_window, k))[0])
             for k in verbose_gen(range(250, 750, 25))]
    plt.plot(*transpose(corrs))
    plt.xlabel("Rolling Window Radius (bp)")
    plt.ylabel(r"Spearman $\rho$")
    # Shrink the axes so the wrapped, multi-line title fits above them.
    plt.subplots_adjust(top=0.8)
    # textwrap.fill(text) is shorthand for "\n".join(textwrap.wrap(text)).
    plt.title(fill("Spearman Correlation vs. Radius of Rolling Average for TRAP probabilities, ChIP-seq reads (AraC a) on bp 1-10**5 of E. coli genome"))
    plt.savefig("correlation_vs_rolling_average_radius.png", dpi=400)
    # Conclusion: correlation is maximized for window radius 400-500
    # NOTE(review): the conclusion above was an unterminated string literal
    # in the original; whatever followed "400-500" is missing.
# Build TRAP landscapes for the Smollett sites on a revised Mycobacterium
# genome, then collect per-region scores and landscape slices.
# NOTE(review): `beta` and `mus` are not defined in this section; they must
# already be in scope when it runs.
genome = get_mycobacterium_genome()

# Site1 in smollett_data does not appear in the Myco genome.  (We use
# strain H37Rv; they use strain 1424, which is derived from the former.)
# For this reason, we need to revise the genome in order to stitch the site in.

start_coordinate = (3811492  # start position of region listed in Table 1
                    + 158)   # position of strongest site, relative to
                             # start position

site1 = smollett_sites[1]
revised_genome = subst(genome, site1, start_coordinate)

model = PSSM(smollett_sites.values())

traps = model.slide_trap(revised_genome)
# One copy-number estimate per chemical potential in `mus`: the first sums
# fd_probs over the landscape, the second uses the precomputed sum `z`.
exact_copies = [sum(fd_probs(traps, mu, beta)) for mu in verbose_gen(mus)]
z = sum(exp(-beta * ep) for ep in traps)
approx_copies = [approximate_copy_number_from_mu(traps, mu, z)
                 for mu in verbose_gen(mus)]
absolute_ns_energy = -8  # -8 kBT = -5 kcal/mol
width = len(site1)
# NOTE(review): the comment below speaks of -2 kBT per match while the code
# adds +2*width; confirm the intended sign convention.
ep_ns = 2 * width + absolute_ns_energy  # Assume binding energy is -2kbt/match


def offset(ep):
    """Combine the specific energy *ep* with the non-specific energy ep_ns.

    Returns ``log(exp(-beta*ep) + exp(-beta*ep_ns)) / -beta``; reads the
    module-level ``beta`` and ``ep_ns`` at call time.
    """
    return log(exp(-beta * ep) + exp(-beta * ep_ns)) / -beta


# A real list (not a lazy `map` object), so it can be indexed, sliced and
# iterated more than once on Python 2 and Python 3 alike.
ns_traps = [offset(ep) for ep in traps]

# smollett_data is indexed 1..25; each entry starts with a
# (start_pos, end_pos) pair followed by a score.
coordinates = [smollett_data[i][0] for i in range(1, 25 + 1)]
scores = [smollett_data[i][1] for i in range(1, 25 + 1)]
# NOTE(review): regions are cut from `genome`, whereas `traps` was computed
# on `revised_genome`; the +18 is hard-coded rather than derived from
# `width`.  Confirm both are intentional.
regions = [genome[start_pos:end_pos + 18]
           for (start_pos, end_pos) in coordinates]
# float() forces true division, so integer scores are not truncated by
# Python 2's floor division.
normalized_scores = [float(score) / len(region)
                     for score, region in zip(scores, regions)]
select_traps = [traps[start_pos:end_pos]
                for (start_pos, end_pos) in coordinates]
"""This script generates a binding landscape for each TF"""

import sys
sys.path.append("src/sufficache")
from sufficache import PSSM
sys.path.append("data/motifs")
from motifs import *
from chem_pot_utils import get_ecoli_genome
from array import array

def landscape_filename(tf_name, control):
    """Return the output path of the binding landscape for *tf_name*.

    Args:
        tf_name: name of the transcription factor; inserted into the path.
        control: truthy for the control landscape, falsy for the genomic
            one.

    Returns:
        ``results/binding_landscapes/<tf_name>_control_binding_landscape.dat``
        when *control* is truthy, otherwise
        ``results/binding_landscapes/<tf_name>_genome_binding_landscape.dat``.
    """
    kind = "control" if control else "genome"
    return "results/binding_landscapes/%s_%s_binding_landscape.dat" % (
        tf_name, kind)


def write_landscape(binding_energies, fname):
    """Write *binding_energies* to *fname* as raw ``array('f')`` items.

    Args:
        binding_energies: iterable of numbers, stored as C floats.
        fname: path of the file to create or overwrite.
    """
    arr = array('f')
    arr.extend(binding_energies)
    # array.tofile emits raw machine values, so the file must be opened in
    # binary mode: text mode fails on Python 3 and is subject to newline
    # translation on Windows.
    with open(fname, 'wb') as f:
        arr.tofile(f)


if __name__ == "__main__":
    # Command line: <script> tf_name [control]
    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit("usage: %s tf_name [control]" % sys.argv[0])
    tf_name = sys.argv[1]
    control = len(sys.argv) == 3 and sys.argv[2] == "control"
    # The control run scans a random_site() sequence of the genome's length.
    genome = get_ecoli_genome() if not control else random_site(len(get_ecoli_genome()))
    # "Control" * control is "Control" for a control run and "" otherwise.
    # A single parenthesized argument prints the same on Python 2 and 3.
    print("Generating %s landscape for %s " % ("Control" * control, tf_name))
    tf = getattr(Escherichia_coli, tf_name)
    pssm = PSSM(tf)
    binding_energies = pssm.slide_trap(genome)
    fname = landscape_filename(tf_name, control)
    write_landscape(binding_energies, fname)