def orforflistmatch(s_orf, orflist):
    """
    Which annotations associated with s_orf are significantly
    overrepresented in orflist?

    Every annotation description of s_orf is treated as a category.  For
    each category the ORFs of orflist that belong to it are collected and
    scored with Arith.hypgeomsummore(nall, totorfs, norfs, nsub)
    (presumably a hypergeometric tail probability -- confirm in Arith).
    Unlike orflist2categories_long, no multiple-testing correction and no
    threshold are applied here.

    Args:
        s_orf:   ORF whose annotations define the categories to test.
        orflist: Sequence of ORFs to test for enrichment.

    Returns:
        Sorted list of tuples,
        [(significance, category, fracall, fracsub, sub_by_cat), ...]
        where fracall is the fraction of all annotated ORFs in the category,
        fracsub is the fraction of orflist in the category (0 when orflist
        is empty), and sub_by_cat lists the members of orflist found in the
        category, in orflist order.
    """
    # Lazily load the annotation tables on first use.
    if not _orfs_by_go:
        load_GO()

    norfs = len(orflist)
    totorfs = len(_gos_by_orf.keys())

    categories = [anno.desc for anno in annotations(s_orf)]

    sigs = []
    for category in categories:
        all_by_cat = annotation2orfs(category)
        nall = len(all_by_cat)
        fracall = float(nall) / float(totorfs)

        # Build the membership set once per category so that filtering
        # orflist does not rescan all_by_cat for every element.
        members = set(all_by_cat)
        sub_by_cat = [x for x in orflist if x in members]
        nsub = len(sub_by_cat)

        if norfs == 0:
            fracsub = 0
        else:
            fracsub = float(nsub) / float(norfs)

        sig = Arith.hypgeomsummore(nall, totorfs, norfs, nsub)
        sigs.append((sig, category, fracall, fracsub, sub_by_cat))

    # Tuples sort on their first element, i.e. most significant first.
    sigs.sort()
    return sigs
def orflist2categories_long(orflist, thresh=0.05):
    """
    Which categories are overrepresented among orflist.  Thresh is applied
    after Bonferroni correction.

    The candidate categories are the annotation descriptions of every ORF
    in orflist.  Each category's score from Arith.hypgeomsummore is
    multiplied by the number of candidate categories (the Bonferroni
    correction) before being compared with thresh.

    Args:
        orflist: Sequence of ORFs to test for enrichment.
        thresh:  Corrected scores greater than this value are dropped.

    Returns:
        Sorted list of tuples,
        [(significance, category, fracall, fracsub, orflist), ...]
        where the trailing element holds the members of the input orflist
        that belong to the category.  An empty orflist yields [].
    """
    # Lazily load the annotation tables on first use.
    if not _orfs_by_go:
        load_GO()

    norfs = len(orflist)
    totorfs = len(_gos_by_orf.keys())

    # Determine which categories are described by the set.  The list keeps
    # first-seen order; the set makes the "already seen?" check constant
    # time instead of a scan over the growing list.
    categories = []
    seen = set()
    for orf in orflist:
        for anno in annotations(orf):
            desc = anno.desc
            if desc not in seen:
                seen.add(desc)
                categories.append(desc)
    totcats = float(len(categories))

    sigs = []
    for category in categories:
        all_by_cat = annotation2orfs(category)
        nall = len(all_by_cat)
        fracall = float(nall) / float(totorfs)

        # Build the membership set once per category so that filtering
        # orflist does not rescan all_by_cat for every element.
        members = set(all_by_cat)
        sub_by_cat = [x for x in orflist if x in members]
        nsub = len(sub_by_cat)
        # norfs cannot be 0 here: an empty orflist produces no categories.
        fracsub = float(nsub) / float(norfs)

        sig = Arith.hypgeomsummore(nall, totorfs, norfs, nsub) * totcats
        sigs.append((sig, category, fracall, fracsub, sub_by_cat))

    sigs.sort()
    # Keep every entry whose corrected score does not exceed thresh.
    return [sigdata for sigdata in sigs if not sigdata[0] > thresh]
def _cached_bestseqs(owner, scored, attr, thresh):
    """Return the high-scoring sequences of `scored` on both strands.

    The list is cached on `owner` under the attribute `attr`, together with
    the threshold it was computed for (attribute `attr + '_thresh'`), so a
    call with a different threshold recomputes instead of reusing a stale
    list.  A cached list without a recorded threshold is trusted as-is.
    """
    stamp = attr + '_thresh'
    cache = owner.__dict__
    if attr in cache and cache.get(stamp, thresh) == thresh:
        return cache[attr]

    seen = {}
    for seq in scored.bestseqs(thresh * scored.maxscore):
        seen[seq] = 1
    # Snapshot the keys first: the dict grows while reverse complements
    # are being added.
    for seq in list(seen.keys()):
        seen[MotifTools.revcomplement(seq)] = 1

    seqs = list(seen.keys())
    setattr(owner, attr, seqs)
    setattr(owner, stamp, thresh)
    return seqs


def probOvlp(A, B, thresh=0.7, verbose=None):
    """
    Score the overlap between the high-scoring sequences of two motifs.

    The wider of A and B is re-sliced as Wide[-1, Wide.width + 1]
    (presumably padding it by one position on each side -- confirm against
    the motif class's __getitem__).  For both motifs, the sequences returned
    by bestseqs(thresh * maxscore) plus their reverse complements are
    collected, and the wide sequences that contain at least one narrow
    sequence on either strand are counted.  The counts are passed to
    Arith.hypgeomsummore.

    NOTE(review): probOvlp is defined a second time later in this file, and
    that later definition is the one in effect after import.

    Args:
        A, B:    Motif objects providing width, maxscore, bestseqs() and
                 tuple slicing.  The sequence lists are cached on them as
                 the attributes bestWide / bestNarrow.
        thresh:  Fraction of a motif's maxscore used as the bestseqs cutoff.
        verbose: When true, write the intermediate counts to stdout
                 without a trailing newline.

    Returns:
        The value of Arith.hypgeomsummore(countWide, numtotal,
        countNarrow * fudgefactor, countBoth).
    """
    if A.width >= B.width:
        Wide, Narrow = A, B
    else:
        Wide, Narrow = B, A

    # The wide list is scored on the re-sliced motif but cached on the
    # original object, so later calls with the same motif can reuse it.
    newWide = Wide[-1, Wide.width + 1]
    bestWide = _cached_bestseqs(Wide, newWide, 'bestWide', thresh)
    Wide = newWide
    bestNarrow = _cached_bestseqs(Narrow, Narrow, 'bestNarrow', thresh)

    countNarrow = len(bestNarrow)
    countWide = len(bestWide)
    numtotal = math.pow(4, Wide.width)
    # Number of wide-length sequences that embed one narrow-length sequence.
    fudgefactor = math.pow(4, Wide.width - Narrow.width)

    # Count each wide sequence once if any narrow sequence occurs in it or
    # in its reverse complement.
    countBoth = 0
    for wide_seq in bestWide:
        wide_rc = MotifTools.revcomplement(wide_seq)
        for m_narrow in bestNarrow:
            if wide_seq.find(m_narrow) >= 0 or wide_rc.find(m_narrow) >= 0:
                countBoth += 1
                break

    if verbose:
        import sys
        # The extra ' ' stands in for the separator that a trailing-comma
        # print statement would emit before the next printed item.
        sys.stdout.write('%10d %10d %10d %10d | %10d %5d ' % (
            countWide, numtotal, countNarrow * fudgefactor, countBoth,
            countNarrow, Wide.width - Narrow.width) + ' ')

    p = Arith.hypgeomsummore(countWide,                  # Num Interesting
                             numtotal,                   # All k-mers
                             countNarrow * fudgefactor,  # Number picked
                             countBoth)                  # Number found
    return p
def _cached_bestseqs(owner, scored, attr, thresh):
    """Return the high-scoring sequences of `scored` on both strands.

    The list is cached on `owner` under the attribute `attr`, together with
    the threshold it was computed for (attribute `attr + '_thresh'`), so a
    call with a different threshold recomputes instead of reusing a stale
    list.  A cached list without a recorded threshold is trusted as-is.
    """
    stamp = attr + '_thresh'
    cache = owner.__dict__
    if attr in cache and cache.get(stamp, thresh) == thresh:
        return cache[attr]

    seen = {}
    for seq in scored.bestseqs(thresh * scored.maxscore):
        seen[seq] = 1
    # Snapshot the keys first: the dict grows while reverse complements
    # are being added.
    for seq in list(seen.keys()):
        seen[MotifTools.revcomplement(seq)] = 1

    seqs = list(seen.keys())
    setattr(owner, attr, seqs)
    setattr(owner, stamp, thresh)
    return seqs


def probOvlp(A, B, thresh=0.7, verbose=None):
    """
    Score the overlap between the high-scoring sequences of two motifs.

    The wider of A and B is re-sliced as Wide[-1, Wide.width + 1]
    (presumably padding it by one position on each side -- confirm against
    the motif class's __getitem__).  For both motifs, the sequences returned
    by bestseqs(thresh * maxscore) plus their reverse complements are
    collected, and the wide sequences that contain at least one narrow
    sequence on either strand are counted.  The counts are passed to
    Arith.hypgeomsummore.

    NOTE(review): this redefines a probOvlp with the same logic that appears
    immediately before it in this file; this later definition is the one in
    effect after import.

    Args:
        A, B:    Motif objects providing width, maxscore, bestseqs() and
                 tuple slicing.  The sequence lists are cached on them as
                 the attributes bestWide / bestNarrow.
        thresh:  Fraction of a motif's maxscore used as the bestseqs cutoff.
        verbose: When true, write the intermediate counts to stdout
                 without a trailing newline.

    Returns:
        The value of Arith.hypgeomsummore(countWide, numtotal,
        countNarrow * fudgefactor, countBoth).
    """
    if A.width >= B.width:
        Wide, Narrow = A, B
    else:
        Wide, Narrow = B, A

    # The wide list is scored on the re-sliced motif but cached on the
    # original object, so later calls with the same motif can reuse it.
    newWide = Wide[-1, Wide.width + 1]
    bestWide = _cached_bestseqs(Wide, newWide, 'bestWide', thresh)
    Wide = newWide
    bestNarrow = _cached_bestseqs(Narrow, Narrow, 'bestNarrow', thresh)

    countNarrow = len(bestNarrow)
    countWide = len(bestWide)
    numtotal = math.pow(4, Wide.width)
    # Number of wide-length sequences that embed one narrow-length sequence.
    fudgefactor = math.pow(4, Wide.width - Narrow.width)

    # Count each wide sequence once if any narrow sequence occurs in it or
    # in its reverse complement.
    countBoth = 0
    for wide_seq in bestWide:
        wide_rc = MotifTools.revcomplement(wide_seq)
        for m_narrow in bestNarrow:
            if wide_seq.find(m_narrow) >= 0 or wide_rc.find(m_narrow) >= 0:
                countBoth += 1
                break

    if verbose:
        import sys
        # The extra ' ' stands in for the separator that a trailing-comma
        # print statement would emit before the next printed item.
        sys.stdout.write('%10d %10d %10d %10d | %10d %5d ' % (
            countWide, numtotal, countNarrow * fudgefactor, countBoth,
            countNarrow, Wide.width - Narrow.width) + ' ')

    p = Arith.hypgeomsummore(countWide,                  # Num Interesting
                             numtotal,                   # All k-mers
                             countNarrow * fudgefactor,  # Number picked
                             countBoth)                  # Number found
    return p