Esempio n. 1
0
def find_flanking_regions(start,stop,strand,site,fna_filename):
    """Return the regions flanking genome[start:stop] as (upstream, downstream).

    The genome is loaded from ``fna_filename`` via ``genome_from_fna``.  Up to
    100 positions are taken on each side of the [start, stop) interval.

    When ``strand == -1`` the two flanks are swapped and each is passed
    through ``wc`` (presumably the reverse complement -- confirm against its
    definition), so the returned pair is expressed relative to the site's
    own strand.

    Raises AssertionError if the reassembled upstream + site + downstream
    sequence cannot be found in the genome.
    """
    genome = genome_from_fna(fna_filename)
    offset = 100
    # Clamp at 0: a negative slice start counts from the END of the sequence,
    # so for start < offset the unclamped slice silently yields a wrong
    # (typically empty) upstream region instead of the available prefix.
    upstream_begin = max(0, start - offset)
    ufr = genome[upstream_begin:start]
    dfr = genome[stop:stop + offset]
    if strand == -1:
        # Re-express both flanks on the opposite strand; what was downstream
        # on the forward strand becomes upstream of the site.
        ufr, dfr = wc(dfr), wc(ufr)
        assert wc(ufr + site + dfr) in genome
    else:
        assert ufr + site + dfr in genome
    return ufr, dfr
Esempio n. 2
0
def cumsum_test():
    """Plot cumulative read density with positions ranked by ArcA PSSM score.

    Loads ArcA reads, builds a two-component read density (the two components
    are summed), scores ``genome`` and ``wc(genome)`` with the ArcA PSSM and
    combines them as log(exp(fwd) + exp(rev)).  Two curves are drawn: the
    cumulative sum of the density reordered by descending score, and by
    descending normalised exp(score).  Five shuffled copies of the density
    are plotted in red as controls.  Progress messages go to stdout.

    Relies on module-level ``G``, ``genome``, ``plt`` and several helpers.
    """
    reads = get_arca_reads(1000000)
    strand_densities = density_from_reads(reads, G)
    arca_pssm = make_pssm(Escherichia_coli.ArcA)
    combined_density = strand_densities[0] + strand_densities[1]

    print("fwd_scores")
    fwd = score_genome_np(arca_pssm, genome)
    print("rev_scores")
    # NOTE(review): if wc() reverses the sequence, index k of this array may
    # not refer to the same genome position as index k of `fwd` -- confirm.
    rev = score_genome_np(arca_pssm, wc(genome))

    log_scores = np.log(np.exp(fwd) + np.exp(rev))
    weights = np.exp(log_scores)
    boltzmann = weights / np.sum(weights)

    print("sorting scores")
    by_score = sorted_indices(log_scores)[::-1]  # greatest score first
    print("sorting probs")
    # boltzmann is a monotone transform of log_scores, so this ordering is
    # expected to match by_score except where ties/rounding intervene.
    by_prob = sorted_indices(boltzmann)[::-1]

    plt.plot(cumsum(rslice(combined_density, by_score)), label="scores")
    plt.plot(cumsum(rslice(combined_density, by_prob)), label="boltzmann probs")

    # Controls: cumulative sums of the same density in random order.
    shuffled = list(combined_density)
    n_controls = 5
    for trial in range(n_controls):
        print(trial)
        random.shuffle(shuffled)
        plt.plot(cumsum(shuffled), color='r')

    plt.legend(loc=0)
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.show()
Esempio n. 3
0
def update_scores_np(fwd_scores,rev_scores,fwd_i,fwd_j,dw,w,genome):
    """Return (fwd_scores, rev_scores) updated for a change of one weight.

    ``fwd_j`` is looked up in the inverse of the module-level ``base_index``
    mapping to obtain the affected base; the matching base for the other
    strand is ``wc`` of it.  Every entry whose genome character at circular
    offset ``fwd_i`` equals that base gets ``dw`` added to its forward score;
    the reverse scores are treated likewise using offset ``w - fwd_i - 1``
    and the ``wc`` base.  Offsets wrap around the end of the genome because
    ``np.roll`` is circular.

    The input score arrays are not modified; new arrays are returned.
    Raises KeyError if ``fwd_j`` is not a value of ``base_index``.
    """
    # Invert base_index so an index value maps back to its base.
    index_to_base = {v: k for (k, v) in base_index.items()}
    rel_fwd_base = index_to_base[fwd_j]
    rel_rev_base = wc(rel_fwd_base)
    # Mirror position of column fwd_i within a window of width w.
    rev_i = w - fwd_i - 1
    # Build the per-character array once; both strands reuse it.
    bases = np.array(list(genome))
    fwd_dscores = (np.roll(bases, -fwd_i) == rel_fwd_base) * dw
    rev_dscores = (np.roll(bases, -rev_i) == rel_rev_base) * dw
    return fwd_scores + fwd_dscores, rev_scores + rev_dscores
Esempio n. 4
0
def arca_motif_comparison():
    """Overlay ArcA read densities, known-site positions and PSSM scores.

    Plots, in order: the two components of the read density, a 0/1 track of
    length ``G`` marking where any ArcA site (or its ``wc`` counterpart)
    starts in ``genome``, and the combined score log(exp(fwd) + exp(rev)).

    Relies on module-level ``G``, ``genome``, ``plt`` and several helpers;
    nothing is returned and ``plt.show()`` is not called here.
    """
    reads = get_arca_reads()
    strand_densities = density_from_reads(reads, G)
    arca_pssm = make_pssm(Escherichia_coli.ArcA)
    plt.plot(strand_densities[0])
    plt.plot(strand_densities[1])

    fwd, rev = score_genome_np(arca_pssm, genome)
    combined_scores = np.log(np.exp(fwd) + np.exp(rev))

    # Each known site is searched for together with its wc() counterpart.
    patterns = concat([(s, wc(s)) for s in Escherichia_coli.ArcA])
    hit_track = np.zeros(G)
    for pattern in patterns:
        for hit in re.finditer(pattern, genome):
            hit_track[hit.start(0)] = 1

    plt.plot(hit_track)
    plt.plot(combined_scores)
Esempio n. 5
0
def find_site(site,fna_filename,return_all=False):
    """Locate ``site`` (and ``wc(site)``) in the genome read from a file.

    Both patterns are compiled as regular expressions and searched in the
    genome returned by ``genome_from_fna``.  Each hit is recorded as
    ``(start, stop, strand)`` with strand +1 for ``site`` and -1 for
    ``wc(site)``; forward hits precede reverse hits.

    A status line is printed in every case.  Returns:
      * exactly one hit: ``head(matches)``, or the one-element list if
        ``return_all`` is true;
      * several hits: ``(None, None, None)``, or the full list if
        ``return_all`` is true;
      * no hit: ``(None, None, None)``, or ``[]`` if ``return_all`` is true.
    """
    genome = genome_from_fna(fna_filename)
    fwd_regexp = re.compile(site)
    rev_regexp = re.compile(wc(site))
    # span() is (start, stop); append the strand marker to form the triple.
    hits = [m.span() + (+1,) for m in fwd_regexp.finditer(genome)]
    hits += [m.span() + (-1,) for m in rev_regexp.finditer(genome)]

    nothing = (None, None, None)
    n_hits = len(hits)
    if n_hits == 0:
        print("couldn't find' match for %s in %s" % (site,fna_filename))
        return [] if return_all else nothing
    if n_hits > 1:
        print("found multiple matches for %s in %s" % (site,fna_filename))
        return hits if return_all else nothing
    print("found unique match for %s in %s" % (site,fna_filename))
    return hits if return_all else head(hits)
Esempio n. 6
0
 def model_f(dinuc):
     """Look up the twist/tilt/roll/rise parameters for ``dinuc``.

     ``model`` is read from the enclosing scope.  ``model["oligo"]`` is a
     whitespace-separated list of oligos; the per-oligo properties are
     indexed by position in that list.  If ``dinuc`` is not listed, the
     entry for ``wc(dinuc)`` is used instead and the sign of its tilt is
     flipped.  ``rise`` is taken from ``model["rise"]`` unindexed.

     Prints several trace lines to stdout.  Raises ValueError if neither
     ``dinuc`` nor ``wc(dinuc)`` appears in the oligo list.
     """
     print(dinuc)
     known = model["oligo"].split()
     if dinuc not in known:
         print("elsing")
         cunid = wc(dinuc)
         print(cunid)
         pos = known.index(cunid)
         params = dict((prop, model[prop][pos]) for prop in ("twist", "roll"))
         print("d")
         # Tilt changes sign when the entry comes from the wc() counterpart.
         params["tilt"] = -model["tilt"][pos]
     else:
         pos = known.index(dinuc)
         params = dict((prop, model[prop][pos])
                       for prop in ("twist", "tilt", "roll"))
     params["rise"] = model["rise"]
     print("returning d")
     return params
Esempio n. 7
0
def find_site_ref(site,fna_filename):
    """Locate ``site`` on either strand of the genome read from a file.

    ``site`` is compiled as a regular expression and searched in the genome
    and in ``wc(genome)``.  Each hit is a ``(start, stop, strand)`` triple
    with strand +1 for the genome and -1 for ``wc(genome)``.

    Reverse-strand hits used to be reported in the coordinates of
    ``wc(genome)``; they are now mapped back onto genome coordinates.
    NOTE(review): the mapping assumes ``wc`` returns the reverse complement
    (same length, reversed order) -- confirm against its definition.

    The match list and a status line are printed.  Returns ``head(matches)``
    when there is at least one hit (forward hits come first; reverse hits
    keep the order in which they were found on ``wc(genome)``), otherwise
    ``(None, None, None)``.
    """
    genome = genome_from_fna(fna_filename)
    regexp = re.compile(site)
    fwd_matches = [m.span() + (+1,) for m in regexp.finditer(genome)]
    rev_strand = wc(genome)
    strand_length = len(rev_strand)
    # A hit spanning [s, e) on the reversed strand covers
    # [length - e, length - s) on the forward strand.
    rev_matches = [(strand_length - m.end(), strand_length - m.start(), -1)
                   for m in regexp.finditer(rev_strand)]
    matches = fwd_matches + rev_matches
    print(matches)
    if len(matches) == 1:
        print("found unique match for %s in %s" % (site,fna_filename))
        return head(matches)
    elif len(matches) > 1:
        print("found multiple matches for %s in %s" % (site,fna_filename))
        return head(matches)
    else:
        print("couldn't find' match for %s in %s" % (site,fna_filename))
        return (None,None,None)