Esempio n. 1
0
def main():
    """Print pseudocounted n-mer frequencies (n = 1..6) for a FASTA file.

    The FASTA path is taken from ``sys.argv[1]``.  For every word length
    ``w`` each word produced by ``permute(w)`` starts with a pseudocount of
    1; every ``(nmer, count)`` pair reported by ``MotifTools.top_nmers`` then
    adds ``count`` to both the word and its reverse complement.  A header
    line followed by one ``word frequency`` line per word (in sorted order)
    is printed to stdout.
    """
    seqsD = Fasta.load(sys.argv[1])
    seqs = seqsD.values()
    for w in range(1, 7):
        allnmers = permute(w)
        nmersT = MotifTools.top_nmers(w, seqs, 'with counts', 'purge Ns')

        # Pseudo count: every candidate word starts at 1.
        nmersD = {}
        total = 0
        for nmer in allnmers:
            nmersD[nmer] = 1
            total += 1

        for nmer, count in nmersT:
            try:
                rc = MotifTools.revcomplement(nmer)
                # Sequential updates on purpose: a word that equals its own
                # reverse complement receives the count twice.
                nmersD[nmer] += count
                nmersD[rc] += count
            except KeyError:
                # A lookup failed (e.g. the word was not seeded from
                # permute(w)); leave it out of the statistics.
                continue
            total += 2 * count

        print("# freq in %s (total %d with pseudocounts)" % (sys.argv[1],
                                                             total))
        # sorted() accepts both Python 2 key lists and Python 3 dict views.
        for nmer in sorted(nmersD):
            print("%-7s %20.17f" % (nmer, float(nmersD[nmer]) / total))
        sys.stdout.flush()
Esempio n. 2
0
def main():
    """Report n-mer frequencies with pseudocounts for the FASTA file in argv[1].

    Word lengths 1 through 6 are processed in turn.  Each word returned by
    ``permute(w)`` is seeded with a count of 1, the counts returned by
    ``MotifTools.top_nmers`` are credited to the word and to its reverse
    complement, and the resulting frequencies (count / total) are printed
    in sorted word order after a ``# freq in ...`` header line.
    """
    fasta_path = sys.argv[1]
    seqsD = Fasta.load(fasta_path)
    seqs = seqsD.values()
    for w in range(1, 7):
        allnmers = permute(w)
        nmersT = MotifTools.top_nmers(w, seqs, 'with counts', 'purge Ns')

        nmersD = {}
        total = 0
        for nmer in allnmers:
            nmersD[nmer] = 1  # Pseudo count
            total += 1

        for nmer, count in nmersT:
            try:
                rc = MotifTools.revcomplement(nmer)
                # Two separate increments: when nmer == rc the word is
                # credited twice, matching the 2*count added to the total.
                nmersD[nmer] += count
                nmersD[rc] += count
                total += 2 * count
            except KeyError:
                # Words without an entry in the seeded table are ignored.
                pass

        print("# freq in %s (total %d with pseudocounts)" % (fasta_path, total))
        # sorted() replaces keys()+sort(), which only works on Python 2 lists.
        for nmer in sorted(nmersD):
            print("%-7s %20.17f" % (nmer, float(nmersD[nmer]) / total))
        sys.stdout.flush()
Esempio n. 3
0
def main(fastafile, outDirectory):
    """Write pseudocounted n-mer frequencies (n = 1..6) to a ``.freq`` file.

    Args:
        fastafile: Path of the FASTA file to analyse.  Only the part after
            the last ``/`` is used in the header lines and output file name.
        outDirectory: Directory that receives ``<basename>.freq``.

    For each word length every word from ``permute(w)`` starts with a
    pseudocount of 1; counts from ``MotifTools.top_nmers`` are added to the
    word and to its reverse complement.  The report is written once, after
    all word lengths have been processed.
    """
    # History (AD, 1/2/09): 'fastafile' became a parameter so that this
    # function can be called from a script.
    seqsD = Fasta.load(fastafile)
    seqs = seqsD.values()
    basename = fastafile.split('/')[-1]

    output = []
    for w in range(1, 7):
        allnmers = permute(w)
        nmersT = MotifTools.top_nmers(w, seqs, 'with counts', 'purge Ns')
        nmersD = {}
        total = 0
        for nmer in allnmers:
            nmersD[nmer] = 1  # Pseudo count
            total += 1
        for nmer, count in nmersT:
            try:
                rc = MotifTools.revcomplement(nmer)
                # Sequential increments: a word equal to its own reverse
                # complement is credited twice.
                nmersD[nmer] += count
                nmersD[rc] += count
                total += 2 * count
            except KeyError:
                # Words without an entry in the seeded table are ignored.
                pass
        # Each entry carries its own '\n' (added by AD, 02-27-09) because the
        # lines are written verbatim to the file.
        output.append("# freq in %s (total %d with pseudocounts)\n" % (basename, total))
        for nmer in sorted(nmersD):
            output.append("%-7s %20.17f\n" % (nmer, float(nmersD[nmer]) / total))

    # Write the complete report once and close the file deterministically,
    # instead of re-opening and rewriting it after every word length.
    outPath = '%s/%s.freq' % (outDirectory, basename)
    with open(outPath, 'w') as outFile:
        outFile.writelines(output)
Esempio n. 4
0
 def study_seqs(self, seqs):
     """Fill ``self.D`` with log2 frequencies of the 1- to 5-mers in ``seqs``.

     For each word length the ``(nmer, count)`` pairs returned by
     ``MotifTools.top_nmers(depth, seqs, "TUPLES")`` are normalised by the
     sum of their counts.  A word that differs from its reverse complement
     stores ``log2(0.5 * count / total)`` under both the word and its
     reverse complement; a word equal to its reverse complement stores
     ``log2(count / total)``.  ``self.highestorder`` is set to 5.

     Raises:
         ValueError: from ``math.log`` if a reported count is 0.
     """
     log2 = math.log(2)
     for depth in range(1, 6):
         nmersT = MotifTools.top_nmers(depth, seqs, "TUPLES")
         total = sum(count for _nmer, count in nmersT)
         for nmer, count in nmersT:
             rc = MotifTools.revcomplement(nmer)
             if rc != nmer:
                 # Half of the frequency is assigned to each of the two
                 # strand orientations.
                 f_2 = math.log(0.5 * float(count) / total) / log2
                 self.D[nmer] = f_2
                 self.D[rc] = f_2
             else:
                 self.D[nmer] = math.log(float(count) / total) / log2
     self.highestorder = 5
Esempio n. 5
0
 def freq_from_seqs(self, seqs):
     """Populate ``self.F`` with pseudocounted frequencies of 1- to 6-mers.

     Sets ``self.highestorder`` to 6 and stores a copy of ``permute(w)`` in
     ``self.nmers_by_size[w]`` for each word length.  Every word starts
     with a count of 1; each ``(nmer, count)`` pair returned by
     ``MotifTools.top_nmers`` adds ``count`` to the word and to its reverse
     complement.  The frequency ``count / total`` is then stored in
     ``self.F`` under the word and under its reverse complement.
     """
     revcomp = MotifTools.revcomplement
     self.highestorder = 6
     for width in range(1, 7):
         words = permute(width)
         observed = MotifTools.top_nmers(width, seqs, 'with counts', 'purge Ns')
         self.nmers_by_size[width] = words[:]

         # Pseudo count: one per candidate word; the total is kept as a
         # float so the division below is a true division.
         tally = {}
         total = 0.0
         for word in words:
             tally[word] = 1
             total += 1

         for word, count in observed:
             try:
                 partner = revcomp(word)
                 tally[word] = tally[word] + count
                 tally[partner] = tally[partner] + count
             except KeyError:
                 # Words without an entry in the seeded table are ignored.
                 continue
             total += 2 * count

         for word in tally.keys():
             partner = revcomp(word)
             freq = tally[word] / total
             self.F[word] = freq
             self.F[partner] = freq
Esempio n. 6
0
 def freq_from_seqs_old(self, seqs):
     """Populate ``self.F`` with frequencies of the 1- to 5-mers in ``seqs``.

     For each word length the ``(nmer, count)`` pairs returned by
     ``MotifTools.top_nmers(depth, seqs, "TUPLES")`` are normalised by the
     sum of their counts.  The list of reported words is stored in
     ``self.nmers_by_size[depth]``.  A word equal to its reverse complement
     keeps ``count / total``; any other word stores ``count / total / 2``
     under both the word and its reverse complement.
     """
     # NOTE(review): highestorder is 4 although words up to length 5 are
     # tabulated below -- confirm this is intended.
     self.highestorder = 4
     for depth in range(1, 6):
         nmersT = MotifTools.top_nmers(depth, seqs, "TUPLES")
         # A real list (not a lazy map object) so it can be reused and
         # indexed on Python 3 as well.
         self.nmers_by_size[depth] = [item[0] for item in nmersT]
         total = sum(count for _nmer, count in nmersT)
         for nmer, count in nmersT:
             rc = MotifTools.revcomplement(nmer)
             if nmer == rc:
                 # Original author's note: "correct top_nmers palindrome
                 # count" -- such words keep the full frequency.
                 f = float(count) / total
             else:
                 f = float(count) / total / 2
             self.F[nmer] = f
             self.F[rc] = f
Esempio n. 7
0
    def all_Wmers(self, N, seq):
        """Return background log-scores for every width-``N`` window of ``seq``.

        Args:
            N: Window width.  ``self.mask`` is indexed at positions
                ``0 .. N-1``.
            seq: Sequence to scan; it is sliced and passed to
                ``theMarkovBackground.logbackground``.

        Returns:
            A list with ``2 * (len(seq) - N + 1)`` entries: the per-window
            scores in window order, followed by the same scores again.  The
            second copy fills the reverse-strand slots; per the original
            author's note the background of the complementary strand is
            taken as equal to that of the primary strand.
        """
        Mlh = theMarkovBackground.highestorder
        Mlb = theMarkovBackground.logbackground
        MCP = theMarkovBackground.CP
        Fbg = Mlb(seq)
        # List (not a lazy map object) because it is indexed below.
        nmask = [1 - x for x in self.mask]

        # ?? QUESTION (original author): Is it sensible to compute the
        # background probabilities this way?
        #
        # 1) BG of complementary strand is taken as equal to primary strand.
        # 2) Letters inside the motif window are not used for conditional
        #    probabilities.  As a result, the calculation essentially breaks
        #    down to the log probability the background emits the sequence
        #    to the left of the window plus the log probability the
        #    background emits the sequence to the right.
        # 3) An efficient way to compute this:
        #    a) Compute the background probability for the entire
        #       probe/sequence
        #    b) (Quick) Compute logQdiff below
        #    c) Subtract

        scores = []
        for i in range(len(seq) - N + 1):
            subseq = seq[i:i + N]
            left = seq[0:i]
            right = seq[i + N:]
            lctx = left[-Mlh:]
            rctx = right[0:Mlh]

            # The fast way: score only the window plus up to Mlh flanking
            # letters on each side, and subtract the flanks on their own.
            logQdiff = Mlb(lctx + subseq + rctx) - Mlb(lctx) - Mlb(rctx)
            logQtot = Fbg - logQdiff

            # Add a bit back for intervening bases in the "gap": window
            # positions weighted by 1 - self.mask[p].
            gapbg = sum(MCP[subseq[p]] * nmask[p] for p in range(N))
            scores.append(logQtot + gapbg)

        # Forward-strand scores followed by the identical reverse-strand ones.
        return scores + scores
Esempio n. 8
0
 def has_wmer(self, wmer):
     """Return 1 if ``wmer`` or its reverse complement is in ``self.wmers``, else 0."""
     revcomp = MotifTools.revcomplement(wmer)
     found = wmer in self.wmers or revcomp in self.wmers
     return int(found)
Esempio n. 9
0
def probOvlp(A,B,thresh=0.7,verbose=None):
    """Hypergeometric score for the overlap between two motifs' best sequences.

    The motif with the larger ``width`` is the "wide" one (``A`` on ties).
    The wide motif is re-sliced as ``Wide[-1, Wide.width+1]`` and the
    sequences returned by ``bestseqs(thresh * maxscore)`` are collected for
    both motifs, together with their reverse complements.  The function then
    counts how many wide sequences contain some narrow sequence on either
    strand and passes the counts to ``Arith.hypgeomsummore``.

    Args:
        A, B: Motif objects providing ``width``, ``maxscore``,
            ``bestseqs(score)`` and slicing.
        thresh: Fraction of ``maxscore`` handed to ``bestseqs``.
        verbose: If true, print the intermediate counts.

    Returns:
        The value returned by ``Arith.hypgeomsummore``.

    Side effects:
        Caches the sequence lists on the motif objects as ``bestWide`` and
        ``bestNarrow``.
    """
    if A.width >= B.width:
        Wide, Narrow = A, B
    else:
        Wide, Narrow = B, A

    RC = MotifTools.revcomplement

    # NOTE(review): the cached lists are reused whenever the attribute
    # exists, whatever `thresh` is; a later call with a different thresh
    # gets the list computed by the first call.
    newWide = Wide[-1,Wide.width+1]
    if 'bestWide' in Wide.__dict__:
        bestWide = Wide.bestWide
    else:
        bestWideD = {}
        for x in newWide.bestseqs(thresh*newWide.maxscore):
            bestWideD[x] = 1
        # Iterate over a snapshot: the dict grows while complements are added.
        for x in list(bestWideD):
            bestWideD[RC(x)] = 1
        bestWide = list(bestWideD)
        Wide.bestWide = bestWide
    Wide = newWide

    if 'bestNarrow' in Narrow.__dict__:
        bestNarrow = Narrow.bestNarrow
    else:
        bestNarrowD = {}
        for x in Narrow.bestseqs(thresh*Narrow.maxscore):
            bestNarrowD[x] = 1
        for x in list(bestNarrowD):
            bestNarrowD[RC(x)] = 1
        bestNarrow = list(bestNarrowD)
        Narrow.bestNarrow = bestNarrow

    countNarrow = len(bestNarrow)
    countWide   = len(bestWide)

    numtotal    = math.pow(4,Wide.width)
    fudgefactor = math.pow(4,Wide.width - Narrow.width)

    # Each wide sequence is counted at most once: after it matches a narrow
    # sequence (on either strand) it is dropped from the remaining pool.
    remaining = [(x,RC(x)) for x in bestWide]
    countBoth = 0
    for m_narrow in bestNarrow:
        unmatched = []
        for fwd, rev in remaining:
            if (fwd.find(m_narrow) >= 0) or (rev.find(m_narrow) >= 0):
                countBoth += 1
            else:
                unmatched.append((fwd, rev))
        remaining = unmatched

    if verbose:
        # The trailing comma keeps the Python 2 print statement from
        # emitting a newline; on Python 3 the line is still valid code.
        print('%10d %10d %10d %10d | %10d  %5d '%(
            countWide, numtotal, countNarrow *fudgefactor , countBoth , countNarrow, Wide.width - Narrow.width)),

    p = Arith.hypgeomsummore(countWide,                 #Num Interesting
                             numtotal,                  #All k-mers
                             countNarrow * fudgefactor, #Number picked
                             countBoth                ) #Number found
    return p
Esempio n. 10
0
def probOvlp(A, B, thresh=0.7, verbose=None):
    """Hypergeometric score for the overlap between two motifs' best sequences.

    The motif with the larger ``width`` is treated as "wide" (``A`` on
    ties) and is re-sliced as ``Wide[-1, Wide.width + 1]``.  For both motifs
    the sequences returned by ``bestseqs(thresh * maxscore)`` are collected
    together with their reverse complements.  The number of wide sequences
    that contain some narrow sequence on either strand is then passed, with
    the other counts, to ``Arith.hypgeomsummore``.

    Args:
        A, B: Motif objects providing ``width``, ``maxscore``,
            ``bestseqs(score)`` and slicing.
        thresh: Fraction of ``maxscore`` handed to ``bestseqs``.
        verbose: If true, print the intermediate counts.

    Returns:
        The value returned by ``Arith.hypgeomsummore``.

    Side effects:
        Caches the sequence lists on the motif objects as ``bestWide`` and
        ``bestNarrow``.
    """
    RC = MotifTools.revcomplement

    def _both_strands(motif):
        # Sequences above the threshold plus their reverse complements,
        # without duplicates.
        seen = {}
        for x in motif.bestseqs(thresh * motif.maxscore):
            seen[x] = 1
        # Snapshot the keys: the dict grows while complements are added.
        for x in list(seen):
            seen[RC(x)] = 1
        return list(seen)

    if A.width >= B.width:
        Wide, Narrow = A, B
    else:
        Wide, Narrow = B, A

    # NOTE(review): the cached lists are reused whenever the attribute
    # exists, whatever `thresh` is; a later call with a different thresh
    # gets the list computed by the first call.
    newWide = Wide[-1, Wide.width + 1]
    if 'bestWide' in Wide.__dict__:
        bestWide = Wide.bestWide
    else:
        bestWide = _both_strands(newWide)
        Wide.bestWide = bestWide
    Wide = newWide

    if 'bestNarrow' in Narrow.__dict__:
        bestNarrow = Narrow.bestNarrow
    else:
        bestNarrow = _both_strands(Narrow)
        Narrow.bestNarrow = bestNarrow

    countNarrow = len(bestNarrow)
    countWide = len(bestWide)

    numtotal = math.pow(4, Wide.width)
    fudgefactor = math.pow(4, Wide.width - Narrow.width)

    # Each wide sequence is counted at most once: after it matches a narrow
    # sequence (on either strand) it is dropped from the remaining pool.
    remaining = [(x, RC(x)) for x in bestWide]
    countBoth = 0
    for m_narrow in bestNarrow:
        unmatched = []
        for fwd, rev in remaining:
            if fwd.find(m_narrow) >= 0 or rev.find(m_narrow) >= 0:
                countBoth += 1
            else:
                unmatched.append((fwd, rev))
        remaining = unmatched

    if verbose:
        # The trailing comma keeps the Python 2 print statement from
        # emitting a newline; on Python 3 the line is still valid code.
        print('%10d %10d %10d %10d | %10d  %5d ' % (
            countWide, numtotal, countNarrow * fudgefactor, countBoth,
            countNarrow, Wide.width - Narrow.width)),

    p = Arith.hypgeomsummore(
        countWide,  #Num Interesting
        numtotal,  #All k-mers
        countNarrow * fudgefactor,  #Number picked
        countBoth)  #Number found
    return p