def main():
    """Print pseudocounted n-mer frequencies (widths 1-6) for a FASTA file.

    The FASTA path is read from ``sys.argv[1]`` and loaded with
    ``Fasta.load``.  For each width ``w``:

    * every n-mer returned by ``permute(w)`` starts with a pseudocount of 1;
    * every ``(nmer, count)`` pair from ``MotifTools.top_nmers`` adds
      ``count`` to both the n-mer and its reverse complement;
    * a header line and one ``nmer frequency`` line per n-mer (sorted) are
      printed to stdout.

    Pairs that raise ``KeyError`` while being tallied are silently skipped.
    """
    seqs = Fasta.load(sys.argv[1]).values()
    for width in range(1, 7):
        kmers = permute(width)
        observed = MotifTools.top_nmers(width, seqs, 'with counts', 'purge Ns')

        tally = {}
        total = 0
        for kmer in kmers:
            tally[kmer] = 1  # Pseudo count
            total += 1

        for kmer, count in observed:
            try:
                rc = MotifTools.revcomplement(kmer)
                tally[kmer] += count
                tally[rc] += count
            except KeyError:
                continue
            # Both strands were credited, so the grand total grows by 2*count.
            total += 2 * count

        print("# freq in %s (total %d with pseudocounts)" % (sys.argv[1], total))
        for kmer in sorted(tally):
            print("%-7s %20.17f" % (kmer, float(tally[kmer]) / total))
        # Push this width's table out before starting on the next one.
        sys.stdout.flush()
def main():
    """Print pseudocounted n-mer frequencies (widths 1-6) for a FASTA file.

    The FASTA path is read from ``sys.argv[1]`` and loaded with
    ``Fasta.load``.  For each width ``w``:

    * every n-mer returned by ``permute(w)`` starts with a pseudocount of 1;
    * every ``(nmer, count)`` pair from ``MotifTools.top_nmers`` adds
      ``count`` to both the n-mer and its reverse complement;
    * a header line and one ``nmer frequency`` line per n-mer (sorted) are
      printed to stdout.

    Pairs that raise ``KeyError`` while being tallied are silently skipped.
    """
    seqs = Fasta.load(sys.argv[1]).values()
    for width in range(1, 7):
        kmers = permute(width)
        observed = MotifTools.top_nmers(width, seqs, 'with counts', 'purge Ns')

        tally = {}
        total = 0
        for kmer in kmers:
            tally[kmer] = 1  # Pseudo count
            total += 1

        for kmer, count in observed:
            try:
                rc = MotifTools.revcomplement(kmer)
                tally[kmer] += count
                tally[rc] += count
            except KeyError:
                continue
            # Both strands were credited, so the grand total grows by 2*count.
            total += 2 * count

        print("# freq in %s (total %d with pseudocounts)" % (sys.argv[1], total))
        for kmer in sorted(tally):
            print("%-7s %20.17f" % (kmer, float(tally[kmer]) / total))
        # Push this width's table out before starting on the next one.
        sys.stdout.flush()
def main(fastafile, outDirectory):
    """Write pseudocounted n-mer frequencies (widths 1-6) to a ``.freq`` file.

    Sequences are loaded from *fastafile* with ``Fasta.load``.  For each width
    ``w`` every n-mer returned by ``permute(w)`` starts with a pseudocount of
    1, and every ``(nmer, count)`` pair from ``MotifTools.top_nmers`` adds
    ``count`` to both the n-mer and its reverse complement.  Pairs that raise
    ``KeyError`` while being tallied are silently skipped.

    Args:
        fastafile: path of the FASTA file; only the part after the last '/'
            is used in the header lines and in the output file name.
        outDirectory: directory that receives ``<basename>.freq``.

    History: 1/2/09 AD added the 'fastafile' argument so this can be called
    from a script; 02-27-09 AD added the trailing newlines to each line.
    """
    seqs = Fasta.load(fastafile).values()
    basename = fastafile.split('/')[-1]

    output = []
    for w in range(1, 7):
        allnmers = permute(w)
        nmersT = MotifTools.top_nmers(w, seqs, 'with counts', 'purge Ns')

        nmersD = {}
        total = 0
        for nmer in allnmers:
            nmersD[nmer] = 1  # Pseudo count
            total += 1

        for nmer, count in nmersT:
            try:
                rc = MotifTools.revcomplement(nmer)
                nmersD[nmer] += count
                nmersD[rc] += count
            except KeyError:
                continue
            # Both strands were credited, so the grand total grows by 2*count.
            total += 2 * count

        output.append("# freq in %s (total %d with pseudocounts)\n" % (basename, total))
        for nmer in sorted(nmersD):
            output.append("%-7s %20.17f\n" % (nmer, float(nmersD[nmer]) / total))

    # Write everything in one go; the context manager guarantees the file is
    # flushed and closed (the handle used to be left open).
    outPath = '%s/%s.freq' % (outDirectory, basename)
    with open(outPath, 'w') as outFile:
        outFile.writelines(output)
def study_seqs(self, seqs):
    """Fill ``self.D`` with log2 frequencies of the 1- to 5-mers in *seqs*.

    For each depth 1..5 the ``(nmer, count)`` pairs come from
    ``MotifTools.top_nmers(depth, seqs, "TUPLES")``.  An n-mer that equals its
    own reverse complement is stored as ``log2(count / total)``; otherwise
    the n-mer and its reverse complement are both stored as
    ``log2(0.5 * count / total)``.  ``self.highestorder`` is set to 5.

    Args:
        seqs: sequences handed straight to ``MotifTools.top_nmers``.
    """
    log_of_2 = math.log(2)  # loop invariant: converts natural log to log2
    for depth in range(1, 6):
        nmersT = MotifTools.top_nmers(depth, seqs, "TUPLES")

        total = 0
        for nmer, count in nmersT:
            total += count

        for nmer, count in nmersT:
            rc = MotifTools.revcomplement(nmer)
            if rc != nmer:
                # Halve the frequency and record it under both strands.
                f_2 = math.log(0.5 * float(count) / total) / log_of_2
                self.D[nmer] = f_2
                self.D[rc] = f_2
            else:
                self.D[nmer] = math.log(float(count) / total) / log_of_2

    # Disabled debugging aid: range(0) is empty, so this never runs.  Widen
    # the range to dump the stored probabilities per depth.
    for depth in range(0):
        total = 0
        for k in self.D.keys():
            if len(k) == depth:
                total = total + pow(2, self.D[k])
                print("%s %s" % (k, pow(2, self.D[k])))
        print("%s %s" % (depth, total))

    # NOTE(review): kept at function level; inside the disabled loop above it
    # would never be assigned.
    self.highestorder = 5
def freq_from_seqs(self, seqs):
    """Populate ``self.F`` with pseudocounted n-mer frequencies (widths 1-6).

    Sets ``self.highestorder`` to 6.  For each width ``w``:

    * ``self.nmers_by_size[w]`` receives a copy of ``permute(w)``;
    * every n-mer from ``permute(w)`` starts with a pseudocount of 1;
    * every ``(nmer, count)`` pair from ``MotifTools.top_nmers`` adds
      ``count`` to the n-mer and to its reverse complement (pairs raising
      ``KeyError`` are skipped);
    * ``self.F`` gets ``tally / total`` for each n-mer and the same value
      under its reverse complement.

    Args:
        seqs: sequences handed straight to ``MotifTools.top_nmers``.
    """
    self.highestorder = 6
    for width in range(1, 7):
        kmers = permute(width)
        observed = MotifTools.top_nmers(width, seqs, 'with counts', 'purge Ns')
        self.nmers_by_size[width] = kmers[:]

        tally = {}
        total = 0.0  # float so the division below is never integer division
        for kmer in kmers:
            tally[kmer] = 1  # Pseudo count
            total += 1

        for kmer, count in observed:
            try:
                rc = MotifTools.revcomplement(kmer)
                tally[kmer] += count
                tally[rc] += count
            except KeyError:
                continue
            total += 2 * count

        for kmer, hits in tally.items():
            rc = MotifTools.revcomplement(kmer)
            freq = hits / total
            self.F[kmer] = freq
            self.F[rc] = freq
def freq_from_seqs_old(self, seqs):
    """Populate ``self.F`` with raw n-mer frequencies for depths 1-5.

    Sets ``self.highestorder`` to 4.  For each depth the ``(nmer, count)``
    pairs come from ``MotifTools.top_nmers(depth, seqs, "TUPLES")``;
    ``self.nmers_by_size[depth]`` records the n-mers in that order.  An n-mer
    equal to its own reverse complement is stored as ``count / total``; any
    other n-mer and its reverse complement are both stored as
    ``count / total / 2``.

    Args:
        seqs: sequences handed straight to ``MotifTools.top_nmers``.
    """
    self.highestorder = 4
    for depth in range(1, 6):
        pairs = MotifTools.top_nmers(depth, seqs, "TUPLES")
        self.nmers_by_size[depth] = [pair[0] for pair in pairs]

        total = 0
        for _nmer, count in pairs:
            total += count

        for nmer, count in pairs:
            rc = MotifTools.revcomplement(nmer)
            freq = float(count) / total  # palindrome count (corrects top_nmers)
            if nmer != rc:
                freq = freq / 2
            self.F[nmer] = freq
            self.F[rc] = freq

    # For debugging: range(0) is empty, so this block is switched off.
    for depth in range(0):
        total = 0
        for k in self.F.keys():
            if len(k) == depth:
                total = total + self.F[k]
                print("%s %s" % (k, self.F[k]))
        print("%s %s" % (depth, total))
def all_Wmers(self, N, seq):
    """Return background log-probabilities for every width-N window of *seq*.

    One value is computed per window start ``i`` in
    ``range(len(seq) - N + 1)``.  The returned list holds those values twice
    in a row: once for the forward windows and once, unchanged, for their
    reverse complements.

    Args:
        N: window width.
        seq: sequence to scan.

    Returns:
        list of ``2 * (len(seq) - N + 1)`` values.
    """
    # ?? QUESTION: Is it sensible to compute the background probabilities
    # this way?
    #   1) BG of complementary strand is taken as equal to primary strand.
    #   2) Letters inside the motif window are not used for conditional
    #      probabilities.  As a result, the calculation essentially breaks
    #      down to the log probability the background emits the sequence to
    #      the left of the window plus the log probability the background
    #      emits the sequence to the right.
    #   3) I've worked out an efficient way to compute this by
    #      a) Compute the background probability for the entire
    #         probe/sequence
    #      b) (Quick) Compute logQdiff below
    #      c) Subtract
    seqrc = MotifTools.revcomplement(seq)
    Mlh = theMarkovBackground.highestorder
    Mlb = theMarkovBackground.logbackground
    MCP = theMarkovBackground.CP
    Fbg = Mlb(seq)
    Rbg = Mlb(seqrc)  # computed but not used below (see point 1 above)
    nmask = [1 - bit for bit in self.mask]

    scores = []
    for start in range(len(seq) - N + 1):
        stop = start + N
        subseq = seq[start:stop]
        # Only the Mlh bases flanking the window on each side are needed.
        lflank = seq[0:start][-Mlh:]
        rflank = seq[stop:][0:Mlh]

        # This is the fast way: subtract the window's contribution from the
        # whole-sequence background.
        logQdiff = Mlb(lflank + subseq + rflank) - Mlb(lflank) - Mlb(rflank)

        # Add a bit back for intervening bases in the "gap": positions
        # where nmask is non-zero contribute MCP[base].
        gapbg = 0
        for p in range(N):
            gapbg += MCP[subseq[p]] * nmask[p]

        scores.append(Fbg - logQdiff + gapbg)

    # Forward values followed by the (identical) reverse-complement values.
    return scores + scores
def has_wmer(self, wmer):
    """Return 1 if *wmer* or its reverse complement is in ``self.wmers``, else 0."""
    rc = MotifTools.revcomplement(wmer)
    for candidate in (wmer, rc):
        if candidate in self.wmers:
            return 1
    return 0
def probOvlp(A,B,thresh=0.7,verbose=None): if A.width >= B.width: Wide, Narrow = A, B else: Wide, Narrow = B, A RC = MotifTools.revcomplement if 1: newWide = Wide[-1,Wide.width+1] if Wide.__dict__.has_key('bestWide'): bestWide = Wide.bestWide else: bestWideD = {} for x in newWide.bestseqs(thresh*newWide.maxscore): bestWideD[x] = 1 for x in bestWideD.keys(): bestWideD[RC(x)] = 1 Wide.bestWide = bestWideD.keys() bestWide = Wide.bestWide Wide = newWide if Narrow.__dict__.has_key('bestNarrow'): bestNarrow = Narrow.bestNarrow else: bestNarrowD = {} for x in Narrow.bestseqs(thresh*Narrow.maxscore): bestNarrowD[x] = 1 for x in bestNarrowD.keys(): bestNarrowD[RC(x)] = 1 bestNarrow = bestNarrowD.keys() Narrow.bestNarrow = bestNarrow #bestWide = [x[1] for x in Wide.bestseqs (thresh*Wide.maxscore) ] #bestNarrow = [x[1] for x in Narrow.bestseqs(thresh*Narrow.maxscore)] countNarrow = len(bestNarrow) countWide = len(bestWide) numtotal = math.pow(4,Wide.width) fudgefactor = math.pow(4,Wide.width - Narrow.width) bestWideTups = [(x,MotifTools.revcomplement(x)) for x in bestWide] countBoth = 0 for i in range(len(bestNarrow)): m_narrow = bestNarrow[i] delj = [] for j in range(len(bestWideTups)): if (bestWideTups[j][0].find(m_narrow) >= 0) or (bestWideTups[j][1].find(m_narrow) >= 0): countBoth += 1 delj.append(j) delj.reverse() #Chew in from the back for j in delj: del(bestWideTups[j]) if verbose: print '%10d %10d %10d %10d | %10d %5d '%( countWide, numtotal, countNarrow *fudgefactor , countBoth , countNarrow, Wide.width - Narrow.width), p = Arith.hypgeomsummore(countWide, #Num Interesting numtotal, #All k-mers countNarrow * fudgefactor, #Number picked countBoth ) #Number found return p
def probOvlp(A, B, thresh=0.7, verbose=None): if A.width >= B.width: Wide, Narrow = A, B else: Wide, Narrow = B, A RC = MotifTools.revcomplement if 1: newWide = Wide[-1, Wide.width + 1] if Wide.__dict__.has_key('bestWide'): bestWide = Wide.bestWide else: bestWideD = {} for x in newWide.bestseqs(thresh * newWide.maxscore): bestWideD[x] = 1 for x in bestWideD.keys(): bestWideD[RC(x)] = 1 Wide.bestWide = bestWideD.keys() bestWide = Wide.bestWide Wide = newWide if Narrow.__dict__.has_key('bestNarrow'): bestNarrow = Narrow.bestNarrow else: bestNarrowD = {} for x in Narrow.bestseqs(thresh * Narrow.maxscore): bestNarrowD[x] = 1 for x in bestNarrowD.keys(): bestNarrowD[RC(x)] = 1 bestNarrow = bestNarrowD.keys() Narrow.bestNarrow = bestNarrow #bestWide = [x[1] for x in Wide.bestseqs (thresh*Wide.maxscore) ] #bestNarrow = [x[1] for x in Narrow.bestseqs(thresh*Narrow.maxscore)] countNarrow = len(bestNarrow) countWide = len(bestWide) numtotal = math.pow(4, Wide.width) fudgefactor = math.pow(4, Wide.width - Narrow.width) bestWideTups = [(x, MotifTools.revcomplement(x)) for x in bestWide] countBoth = 0 for i in range(len(bestNarrow)): m_narrow = bestNarrow[i] delj = [] for j in range(len(bestWideTups)): if (bestWideTups[j][0].find(m_narrow) >= 0) or (bestWideTups[j][1].find(m_narrow) >= 0): countBoth += 1 delj.append(j) delj.reverse() #Chew in from the back for j in delj: del (bestWideTups[j]) if verbose: print '%10d %10d %10d %10d | %10d %5d ' % ( countWide, numtotal, countNarrow * fudgefactor, countBoth, countNarrow, Wide.width - Narrow.width), p = Arith.hypgeomsummore( countWide, #Num Interesting numtotal, #All k-mers countNarrow * fudgefactor, #Number picked countBoth) #Number found return p