def getSeqFreqs(seqDict):
    """Return the (AT_bias, GC_bias) tuple used for motility's keyword args.

    GC_bias is half of ``seqStats(seqDict)['percentGC']`` and AT_bias is
    ``0.5 - GC_bias``; both are derived from the sequences in seqDict.
    """
    # The halved values look odd, but that is the form motility expects;
    # this was confirmed with Titus Brown (the motility author).
    # NOTE(review): presumably 'percentGC' is a fraction in [0, 1] -- confirm
    # against seqStats.
    stats = seqStats(seqDict)
    halfGC = stats['percentGC']/2
    halfAT = 0.5 - halfGC
    return halfAT, halfGC
def getSeqFreqs(seqDict):
    """Return a dict of nucleotide frequencies for TAMO, computed from seqDict.

    The result maps 'A', 'C', 'G' and 'T' to the matching count reported by
    ``seqStats(seqDict)`` ('aCnt', 'cCnt', 'gCnt', 'tCnt') divided by its
    'nonNs' entry.
    """
    stats = seqStats(seqDict)
    freqs = {}
    # (frequency key, seqStats count key) pairs, in the order they are read.
    for nuc, cntKey in (('A', 'aCnt'), ('C', 'cCnt'), ('G', 'gCnt'), ('T', 'tCnt')):
        # float() on the numerator forces true division for integer counts.
        freqs[nuc] = float(stats[cntKey])/stats['nonNs']
    return freqs
def toTAMOmotifs(motifList,seqData=None):
    """Return a list of TAMO motif objects built from native motif objects.

    Args:
        motifList: iterable of native motif objects.  Each one must support
            ``len()``, expose ``pwm`` indexable as ``pwm[base][position]``
            for the bases 'A', 'C', 'G', 'T', and carry an ``id``.  A
            ``sigvalue`` attribute is copied over when present.
        seqData: source of the background nucleotide frequencies.
            * ``None`` (or any empty value): uniform background of 0.25.
            * dict: nucleotide frequencies keyed by 'A', 'C', 'G', 'T'
              (e.g. ``seqData['A'] == .26``).
            * str: path to a fasta file holding the sequence population the
              motifs will be used on; frequencies are computed from it with
              ParseFastA and seqStats.

    Returns:
        A list of TAMO motifs (one per entry of motifList, same order).

    Raises:
        InvalidOptionError: if a dict seqData lacks one of the four base
            keys, or if seqData is neither a dict nor a string.
    """
    from TAMO.MotifTools import Motif_from_counts

    bases = ('A', 'C', 'G', 'T')

    # The original guard was inverted (``if not seqData``), so a supplied
    # dict/path was never actually consulted.  Only fall back to the uniform
    # background when nothing was supplied.
    if not seqData:
        bg = dict((b, 0.25) for b in bases)
    elif isinstance(seqData, dict):
        try:
            bg = dict((b, seqData[b]) for b in bases)
        except KeyError:
            raise InvalidOptionError("toTAMOmotifs: Unrecognized key in seqData as dict.")
    elif isinstance(seqData, str):
        # Treat the string as a fasta path and derive frequencies from it.
        stats = seqStats(ParseFastA(seqData).toDict())
        tot = float(stats['nonNs'])  # float so the divisions below are true division
        bg = {'A': stats['aCnt']/tot,
              'C': stats['cCnt']/tot,
              'G': stats['gCnt']/tot,
              'T': stats['tCnt']/tot}
    else:
        raise InvalidOptionError("toTAMOmotifs: seqData must be a dict of nucleotide frequencies or a path to a fasta file.")

    tList = []
    for m in motifList:
        # One {'A':..,'C':..,'G':..,'T':..} dict per motif position.
        counts = [dict((b, m.pwm[b][i]) for b in bases) for i in range(len(m))]
        # Hand each motif its own copy of the background dict, as the
        # original built a fresh dict per call.
        t = Motif_from_counts(counts,beta=0.01,bg=dict(bg))
        t.id = m.id
        try:
            t.sigvalue = m.sigvalue
        except AttributeError:
            # sigvalue is optional on the native motif objects.
            pass
        tList.append(t)
    return tList
def main():
    """Command-line entry point for the motif enrichment run.

    Parses and validates the command-line options, builds the promoter
    sequence dict, obtains the per-motif hit dict (either by searching with
    getEvalHitDict or by parsing a possumSearch output file given through
    --from-possum), computes enrichment with motifHyprGeoEnrichment for the
    real foreground set and for --plot-fdrs random control sets, then writes
    the data table and one histogram per motif.

    Raises:
        InvalidOptionError: if a required option is missing.
    """
    #+++++++++++ File Parsing Etc +++++++++++
    desc = """Calls the folowing funcs: 'add this'"""
    usage = """python %prog args"""
    parser = optparse.OptionParser(usage=usage, description=desc)

    parser.add_option('--motifs', type='str',default=None,
                      help="""Path to motif file (default=%default).""")
    parser.add_option('--motif-type', type='str',default='scope',
                      help="""Format of motif file (default=%default).""")
    parser.add_option('--thresh', type='float',default=0.005,
                      help="""P-value threshold for motif score cut-off (default=%default).""")
    parser.add_option('--promoters', type='str',default=None,
                      help="""Path to fasta file with promoter population (default=%default).""")
    parser.add_option('--plen', type='int',default=1000,
                      help="""Max promoter length to use -- starting from 3'-end!! (default=%default).""")
    parser.add_option('--genes', type='string',default=None,
                      help="""Unbroken string of gene/Tx names representing the true forground set, sep=','. Exp: 'gene,gene,gene' (default=%default).""")
    parser.add_option('--job', type='string',default='int(time())',
                      help="""String to identify this run (default=%default).""")
    parser.add_option('--out', type='string',default=None,
                      help="""Path to results dir -- must use full path (default=%default).""")
    parser.add_option('--plot-fdrs', dest="fdrs", type='int',default=0,
                      help="""How many random sets of genes equal in length to --genes to run for FDR estimation. (default=%default).""")
    parser.add_option('--from-possum',default=False,
                      help="""Path to possumSearch outFile, skips motif finding step. (default=%default)""")
    parser.add_option('--verbose',action='store_true',default=False,
                      help="""Include if stdOut/stdErr is desired. (default=%default).""")
    parser.add_option('--check-seqs',action='store_true',default=False,
                      help="""Print info about promoter sequences and exit. (default=%default).""")
    parser.add_option('--expect',action='store_true',default=False,
                      help="""Use median seqLen to set success threshold at greater than the estimated expected number of occurences in each promoter [pValThresh*searchesPerSeq]. (default=%default).""")

    (opts, args) = parser.parse_args()

    # +++++ Argument Validations +++++
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)
    if not opts.out:
        raise InvalidOptionError("--out argument is required.")
    if not opts.genes:
        raise InvalidOptionError('--genes argument is required.')
    # The literal default string is a placeholder meaning "use a timestamp".
    if opts.job == 'int(time())':
        opts.job = int(time())
    if not opts.from_possum:
        # Every one of these must be set.  The original test
        # ``not opts.motifs or opts.motif_type or ...`` only negated the first
        # operand, so the truthy --motif-type default made it always raise.
        if not (opts.motifs and opts.motif_type and opts.thresh
                and opts.promoters and opts.plen):
            raise InvalidOptionError("Unless --from-possum, the following arguments are ALL required: --motifs --motif-type --thresh --promoters --plen")
    else:
        if not opts.motifs:
            raise InvalidOptionError("When using --from-possum, --motifs should be the PSSM file used to generate this particular hit set.")

    def _say(msg):
        # Progress messages go to stdout only when --verbose is given.
        if opts.verbose:
            sys.stdout.write(msg)

    # +++++ Lets Begin +++++
    _say('\n%s\n\n' % (' '.join(sys.argv)))

    _say('preparing out directory...\n')
    outBaseStr = getOutBaseStr(opts.out,opts.job)
    mkdirp(opts.out)

    _say('building seqDict...\n')
    seqDict = getSeqs(opts.promoters,opts.plen)
    if opts.check_seqs:
        seqStats(seqDict,show=True)
        sys.exit(0)
    else:
        seqData = seqStats(seqDict,show=False)

    # opts.expect is reused: the boolean flag is replaced by the numeric
    # threshold that is handed to motifHyprGeoEnrichment.
    if opts.expect:
        opts.expect = seqData['medLen']*opts.thresh*2
    else:
        opts.expect = 0

    # --- am i doing the searching myself? ---
    if not opts.from_possum:
        # -- yes --
        _say('building motifList...\n')
        motifList = getMotifs(opts.motifs,opts.motif_type)

        _say('getting nucFreqs...\n')
        halfAT,halfGC = getSeqFreqs(seqDict)

        _say('building hitDict...\n')
        motifHits = getEvalHitDict(motifList,seqDict,pThresh=opts.thresh,halfAT=halfAT,halfGC=halfGC)
    else:
        # -- no: the hits only need to be parsed from the possumSearch output --
        _say('skipping to building hitDict step...\n')
        pACs = getPossumProfileACs(opts.motifs)
        possumTable = getPossumHitTable(opts.from_possum,headers=possumHeaders)
        motifHits = getPossumHitDict(possumTable,seqDict.keys(),pACs)
        motifList = makeMotifListFromPossum(pACs) # create list of DummyPlug MotifObjs for compatibility

    _say('getting forgroundSeqs...\n')
    foregroundSeqs = getForeground(opts.genes)

    outData = {}

    _say('calculating real data p-values...\n')
    realData = motifHyprGeoEnrichment(motifList,motifHits,foregroundSeqs,opts.expect)
    appendData(realData,outData)

    # One control run per requested random foreground set.
    for i in range(opts.fdrs):
        _say('ctrl_%s:\n' % (i))
        _say('\tgetting random forground...\n')
        rForground = getRandomForground(foregroundSeqs,seqDict)
        _say('\tcalculating p-values...\n')
        ctrlData = motifHyprGeoEnrichment(motifList,motifHits,rForground,opts.expect)
        appendData(ctrlData,outData)

    _say('writting outData...\n')
    writeDataTable(outBaseStr,outData,motifList)

    _say('plotting histograms...\n')
    for m in motifList:
        plotHist(outBaseStr,outData,m.id)
def main():
    """Command-line entry point for the TAMO-based motif enrichment run.

    Parses and validates the command-line options, loads the promoter
    probeset (optionally trimming each sequence to its last --plen bases),
    builds and filters the motif list, computes enrichment with
    motifHyprGeoEnrichmentTAMO for the real foreground set and for
    --plot-fdrs random control sets, then writes the data table and one
    histogram per motif.

    Raises:
        InvalidOptionError: if a required option is missing, --motif-filter
            does not start with 'lambda x:', or --plen is negative.
    """
    #+++++++++++ File Parsing Etc +++++++++++
    desc = """Calls the folowing funcs: 'add this'"""
    usage = """python %prog args"""
    parser = optparse.OptionParser(usage=usage, description=desc)

    parser.add_option('--motifs', type='str',default=None,
                      help="""Path to motif file (default=%default).""")
    parser.add_option('--motif-type', type='str',default='scope',
                      help="""Format of motif file (default=%default).""")
    parser.add_option('--motif-filter', type='str',default='lambda x: x',
                      help="""Filter fuction to allow filtering of motifs in motif file. Its a lambda function. The default returns all motifs. If you don't understand this, please leave it alone. (default=%default).""")
    parser.add_option('--thresh', type='float',default=0.75,
                      help="""Fractional score threshold for motif score cut-off (default=%default).""")
    parser.add_option('--promoters', type='str',default=None,
                      help="""Path to fasta file with promoter population (default=%default).""")
    parser.add_option('--plen', type='int',default=None,
                      help="""Max promoter length to use -- starting from 3'-end!! (default=%default).""")
    parser.add_option('--genes', type='string',default=None,
                      help="""Unbroken string of gene/Tx names representing the true forground set, sep=','. Exp: 'gene,gene,gene' (default=%default).""")
    parser.add_option('--job', type='string',default='int(time())',
                      help="""String to identify this run (default=%default).""")
    parser.add_option('--out', type='string',default=None,
                      help="""Path to results dir -- must use full path (default=%default).""")
    parser.add_option('--plot-fdrs', dest="fdrs", type='int',default=0,
                      help="""How many random sets of genes equal in length to --genes to run for FDR estimation. (default=%default).""")
    parser.add_option('--verbose',action='store_true',default=False,
                      help="""Include if stdOut/stdErr is desired. (default=%default).""")
    parser.add_option('--check-seqs',action='store_true',default=False,
                      help="""Print info about promoter sequences and exit. (default=%default).""")

    (opts, args) = parser.parse_args()

    # +++++ Argument Validations +++++
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)
    if not opts.out:
        raise InvalidOptionError("--out argument is required.")
    if not opts.genes:
        raise InvalidOptionError('--genes argument is required.')
    # The literal default string is a placeholder meaning "use a timestamp".
    if opts.job == 'int(time())':
        opts.job = int(time())
    # Every one of these must be set.  The original ``not (a or b or c or d)``
    # only fired when all four were missing, which the truthy defaults of
    # --motif-type and --thresh made impossible.
    if not (opts.motifs and opts.motif_type and opts.thresh and opts.promoters):
        raise InvalidOptionError("The following arguments are ALL required: --motifs --motif-type --thresh --promoters")
    if not opts.motif_filter.startswith('lambda x:'):
        raise InvalidOptionError("**ERROR: the --motif-filter option must begin with 'lambda x:'**")
    else:
        # SECURITY NOTE: eval() executes arbitrary code taken from the command
        # line; the 'lambda x:' prefix check is not a sandbox.  Only run this
        # script with trusted arguments.
        opts.motif_filter = eval(opts.motif_filter)
    # optparse already guarantees --plen is an int (or None); what it does not
    # catch is a negative value, which would make the [-plen:] slice below
    # drop bases from the 5' end instead of keeping the 3' end.
    if opts.plen is not None and opts.plen < 0:
        raise InvalidOptionError("**ERROR: the --plen option must not be negative**")

    def _say(msg):
        # Progress messages go to stdout only when --verbose is given.
        if opts.verbose:
            sys.stdout.write(msg)

    # +++++ Lets Begin +++++
    _say('\n%s\n\n' % (' '.join(sys.argv)))

    _say('preparing out directory...\n')
    outBaseStr = getOutBaseStr(opts.out,opts.job)
    mkdirp(opts.out)

    _say('building seqDict...\n')
    probeset = getProbSet(opts.promoters,opts.thresh)

    if opts.plen:
        _say("adjusting promoter lengths to no more than %s and conserving the 3' ends...\n" % (opts.plen))
        # Iterate over a snapshot so the mapping can be reassigned safely.
        for name, seq in list(probeset.probes.items()):
            probeset.probes[name] = seq[-opts.plen:]

    seqDict = getSeqs(probeset)
    if opts.check_seqs:
        seqStats(seqDict,show=True)
        sys.exit(0)

    _say('getting nucFreqs...\n')
    nucFreqs = getSeqFreqs(seqDict)

    _say('building filtered motifList...\n')
    motifList = getMotifs(opts.motifs,nucFreqs,opts.motif_type)
    preFilt = len(motifList)
    motifList = filterMotifs(motifList,key=opts.motif_filter)
    postFilt = len(motifList)
    _say('using %s of %s motifs...\n' % (postFilt,preFilt))

    _say('getting forgroundSeqs...\n')
    foregroundSeqs = getForeground(opts.genes)

    outData = {}

    _say('calculating real data p-values...\n')
    realData = motifHyprGeoEnrichmentTAMO(motifList,probeset,foregroundSeqs,factor=opts.thresh,bestFactor=False)
    appendData(realData,outData)

    # One control run per requested random foreground set.
    for i in range(opts.fdrs):
        _say('ctrl_%s:\n' % (i))
        _say('\tgetting random forground...\n')
        rForground = getRandomForground(foregroundSeqs,seqDict)
        _say('\tcalculating p-values...\n')
        ctrlData = motifHyprGeoEnrichmentTAMO(motifList,probeset,rForground,factor=opts.thresh,bestFactor=False)
        appendData(ctrlData,outData)

    _say('writting outData...\n')
    writeDataTable(outBaseStr,outData,motifList)

    _say('plotting histograms...\n')
    for m in motifList:
        plotHist(outBaseStr,outData,m.id)