Python seqStats Examples, rSeq.utils.stats.seqStats Python Examples

Example #1

0

Show file

File: calculateMotifHyprGeoEnrichments.py Project: asntech/rSeqPipeline

def getSeqFreqs(seqDict):
    """Return the (AT_bias, GC_bias) tuple that motility takes as keywords.

    GC_bias is the 'percentGC' entry of seqStats(seqDict) divided by two;
    AT_bias is 0.5 minus GC_bias.
    """
    # The halved values look odd, but that is the form motility expects;
    # this was confirmed with Titus Brown (motility author).
    stats = seqStats(seqDict)
    halfGC = stats['percentGC']/2
    halfAT = 0.5 - halfGC
    return halfAT, halfGC

Example #2

0

Show file

File: calculateMotifHyprGeoEnrichments_TAMO.py Project: asntech/rSeqPipeline

def getSeqFreqs(seqDict):
    """Return a dict of nucleotide frequencies for TAMO.

    Keys are 'A', 'C', 'G' and 'T'; each value is the matching count from
    seqStats(seqDict) ('aCnt', 'cCnt', 'gCnt', 'tCnt') divided by the
    'nonNs' entry of the same stats dict.
    """
    stats = seqStats(seqDict)
    # (nucleotide, name of its count in the stats dict)
    countKeys = (('A','aCnt'), ('C','cCnt'), ('G','gCnt'), ('T','tCnt'))
    return dict((nuc, float(stats[cntKey])/stats['nonNs'])
                for nuc, cntKey in countKeys)

Example #3

0

Show file

File: motifs.py Project: asntech/rSeqPipeline

def toTAMOmotifs(motifList,seqData=None):
    """Returns a list of TAMO motif objects when given a list of the
    native motif objects.

    motifList -- iterable of native motifs.  Each must support len() and
                 expose .id and .pwm, where .pwm maps 'A','C','G','T' to
                 per-position values; .sigvalue is copied over when present.
    seqData   -- source of the background nucleotide frequencies:
                 * None (default): uniform background, 0.25 per base.
                 * dict: taken to be nucFreqs (seqData['A']==.26) and must
                   contain the keys 'A','C','G','T'.
                 * str: taken to be the path to a fasta file holding the
                   seqPopulation the motifs will be used on; frequencies
                   are calculated from it with seqStats.

    Raises InvalidOptionError if a seqData dict lacks one of the four
    nucleotide keys, or if seqData is neither None, a dict nor a str.
    """

    from TAMO.MotifTools import Motif_from_counts

    # NOTE: the original guard was inverted ('if not seqData:'), so the default
    # call hit a NameError and real seqData was silently replaced by 0.25s.
    if seqData is None:
        bg = {'A':0.25,'C':0.25,'G':0.25,'T':0.25}
    elif isinstance(seqData, dict):
        try:
            bg = {'A':seqData['A'],'C':seqData['C'],'G':seqData['G'],'T':seqData['T']}
        except KeyError:
            raise InvalidOptionError("toTAMOmotifs: Unrecognized key in seqData as dict.")
    elif isinstance(seqData, str):
        # Only the summary stats are kept; the parsed sequences are not
        # bound to a name so they can be freed right away.
        stats = seqStats(ParseFastA(seqData).toDict())
        tot = float(stats['nonNs'])
        bg = {'A':stats['aCnt']/tot,
              'C':stats['cCnt']/tot,
              'G':stats['gCnt']/tot,
              'T':stats['tCnt']/tot}
    else:
        raise InvalidOptionError("toTAMOmotifs: seqData must be None, a dict of nucFreqs, or a path string.")

    tList = []
    for m in motifList:
        # One {'A':..,'C':..,'G':..,'T':..} dict per motif position.
        counts = [{'A':m.pwm['A'][i],'C':m.pwm['C'][i],'G':m.pwm['G'][i],'T':m.pwm['T'][i]}
                  for i in range(len(m))]
        t = Motif_from_counts(counts,beta=0.01,bg=bg)
        t.id = m.id
        try:
            t.sigvalue = m.sigvalue
        except AttributeError:
            # Not every native motif carries a significance value.
            pass
        tList.append(t)
    return tList

Example #4

0

Show file

File: calculateMotifHyprGeoEnrichments.py Project: asntech/rSeqPipeline

def main():
    """Command-line entry point for the motif enrichment run.

    Parses and validates the command line, builds the promoter seqDict,
    obtains the motif hits (searched here, or parsed from a possumSearch
    outFile when --from-possum is given), calculates enrichment data for
    the --genes foreground and for --plot-fdrs random control sets, then
    writes the data table and plots one histogram per motif.

    Raises InvalidOptionError when a required option is missing.
    Exits with status 0 after printing help (no arguments) or after
    printing sequence stats (--check-seqs).
    """
    #+++++++++++ File Parseing Etc +++++++++++
    desc = """Calls the folowing funcs: 'add this'"""
    
    usage = """python %prog args"""
    parser = optparse.OptionParser(usage=usage, description=desc)
    
    parser.add_option('--motifs', type='str',default=None,
                      help="""Path to motif file (default=%default).""")
    parser.add_option('--motif-type', type='str',default='scope',
                      help="""Format of motif file (default=%default).""")
    parser.add_option('--thresh', type='float',default=0.005,
                      help="""P-value threshold for motif score cut-off (default=%default).""")
    parser.add_option('--promoters', type='str',default=None,
                      help="""Path to fasta file with promoter population (default=%default).""")
    parser.add_option('--plen', type='int',default=1000,
                      help="""Max promoter length to use -- starting from 3'-end!! (default=%default).""")
    parser.add_option('--genes', type='string',default=None,
                      help="""Unbroken string of gene/Tx names representing the true forground set, sep=','. Exp: 'gene,gene,gene' (default=%default).""")
    parser.add_option('--job', type='string',default='int(time())',
                      help="""String to identify this run (default=%default).""")
    parser.add_option('--out', type='string',default=None,
                      help="""Path to results dir -- must use full path (default=%default).""")
    parser.add_option('--plot-fdrs', dest="fdrs", type='int',default=0,
                      help="""How many random sets of genes equal in length to --genes to run for FDR estimation. (default=%default).""")
    parser.add_option('--from-possum',default=False,
                      help="""Path to possumSearch outFile, skips motif finding step. (default=%default)""")    
    parser.add_option('--verbose',action='store_true',default=False,
                      help="""Include if stdOut/stdErr is desired. (default=%default).""")
    parser.add_option('--check-seqs',action='store_true',default=False,
                      help="""Print info about promoter sequences and exit. (default=%default).""")
    parser.add_option('--expect',action='store_true',default=False,
                      help="""Use median seqLen to set success threshold at greater than the estimated expected number of occurences in each promoter [pValThresh*searchesPerSeq]. (default=%default).""")
    
    (opts, args) = parser.parse_args()
    
    def say(msg):
        # Progress messages go to stdout only when --verbose was given.
        if opts.verbose:
            sys.stdout.write(msg)
    
    # +++++ Argument Validations +++++
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)
    if not opts.out:
        raise InvalidOptionError("--out argument is required.")
    if not opts.genes:
        raise InvalidOptionError('--genes argument is required.')
    if opts.job == 'int(time())':
        # The default is a placeholder string; swap in the real timestamp.
        opts.job = int(time())
    if not opts.from_possum:
        # Every one of these must be set.  The previous test
        # ('not a or b or c ...') was always true because --motif-type
        # has a truthy default, so this branch raised on every run.
        required = (opts.motifs, opts.motif_type, opts.thresh, opts.promoters, opts.plen)
        if not all(required):
            raise InvalidOptionError("""Unless --from-possum, the following arguments are ALL required:
            --motifs
            --motif-type
            --thresh
            --promoters
            --plen""")
    else:
        if not opts.motifs:
            raise InvalidOptionError("When using --from-possum, --motifs should be the PSSM file used to generate this particular hit set.")
    
    # +++++ Lets Begin +++++
    say('\n%s\n\n' % (' '.join(sys.argv)))
    
    say('preparing out directory...\n')
    outBaseStr = getOutBaseStr(opts.out,opts.job)
    mkdirp(opts.out)
    
    say('building seqDict...\n')
    seqDict = getSeqs(opts.promoters,opts.plen)
    if opts.check_seqs:
        seqStats(seqDict,show=True)
        sys.exit(0)
    seqData = seqStats(seqDict,show=False)
    
    # --expect turns the boolean flag into a numeric success threshold
    # (median length * p-value threshold * 2); otherwise the threshold is 0.
    expect = seqData['medLen']*opts.thresh*2 if opts.expect else 0
        
    # --- am i doing the searching myself? ---
    if not opts.from_possum:
        # -- yes --
        say('building motifList...\n')
        motifList      = getMotifs(opts.motifs,opts.motif_type)
    
        say('getting nucFreqs...\n')
        halfAT,halfGC  = getSeqFreqs(seqDict)
    
        say('building hitDict...\n')
        motifHits      = getEvalHitDict(motifList,seqDict,pThresh=opts.thresh,halfAT=halfAT,halfGC=halfGC)
    else:
        # -- no: the hits already exist, they only need parsing --
        say('skipping to building hitDict step...\n')
        pACs        = getPossumProfileACs(opts.motifs)
        possumTable = getPossumHitTable(opts.from_possum,headers=possumHeaders)
        motifHits   = getPossumHitDict(possumTable,seqDict.keys(),pACs)
        motifList   = makeMotifListFromPossum(pACs) # create list of DummyPlug MotifObjs for compatibility
    
    say('getting forgroundSeqs...\n')
    foregroundSeqs = getForeground(opts.genes)
    outData        = {}
    
    say('calculating real data p-values...\n')
    realData = motifHyprGeoEnrichment(motifList,motifHits,foregroundSeqs,expect)
    appendData(realData,outData)
    
    # One control run per requested FDR set, each on a random foreground.
    for i in range(opts.fdrs):
        say('ctrl_%s:\n' % (i))
        say('\tgetting random forground...\n')
        rForground = getRandomForground(foregroundSeqs,seqDict)
        
        say('\tcalculating p-values...\n')
        ctrlData   = motifHyprGeoEnrichment(motifList,motifHits,rForground,expect)
        appendData(ctrlData,outData)
    
    say('writting outData...\n')
    writeDataTable(outBaseStr,outData,motifList)
    
    say('plotting histograms...\n')
    for m in motifList:
        plotHist(outBaseStr,outData,m.id)

Example #5

0

Show file

File: calculateMotifHyprGeoEnrichments_TAMO.py Project: asntech/rSeqPipeline

def main():
    """Command-line entry point for the TAMO-based motif enrichment run.

    Parses and validates the command line, builds the probeset from the
    promoter fasta (optionally trimming each promoter to its last --plen
    characters), loads and filters the motifs, calculates enrichment data
    for the --genes foreground and for --plot-fdrs random control sets,
    then writes the data table and plots one histogram per motif.

    Raises InvalidOptionError when a required option is missing or when
    --motif-filter does not start with 'lambda x:'.
    Exits with status 0 after printing help (no arguments) or after
    printing sequence stats (--check-seqs).
    """
    #+++++++++++ File Parseing Etc +++++++++++
    desc = """Calls the folowing funcs: 'add this'"""
    
    usage = """python %prog args"""
    parser = optparse.OptionParser(usage=usage, description=desc)
    
    parser.add_option('--motifs', type='str',default=None,
                      help="""Path to motif file (default=%default).""")
    parser.add_option('--motif-type', type='str',default='scope',
                      help="""Format of motif file (default=%default).""")
    parser.add_option('--motif-filter', type='str',default='lambda x: x',
                      help="""Filter fuction to allow filtering of motifs in motif file.  Its a lambda function.  The default returns all motifs. If you don't understand this, please leave it alone. (default=%default).""")
    parser.add_option('--thresh', type='float',default=0.75,
                      help="""Fractional score threshold for motif score cut-off (default=%default).""")
    parser.add_option('--promoters', type='str',default=None,
                      help="""Path to fasta file with promoter population (default=%default).""")
    parser.add_option('--plen', type='int',default=None,
                      help="""Max promoter length to use -- starting from 3'-end!! (default=%default).""")
    parser.add_option('--genes', type='string',default=None,
                      help="""Unbroken string of gene/Tx names representing the true forground set, sep=','. Exp: 'gene,gene,gene' (default=%default).""")
    parser.add_option('--job', type='string',default='int(time())',
                      help="""String to identify this run (default=%default).""")
    parser.add_option('--out', type='string',default=None,
                      help="""Path to results dir -- must use full path (default=%default).""")
    parser.add_option('--plot-fdrs', dest="fdrs", type='int',default=0,
                      help="""How many random sets of genes equal in length to --genes to run for FDR estimation. (default=%default).""")
    ##parser.add_option('--from-possum',default=False,
                      ##help="""Path to possumSearch outFile, skips motif finding step. (default=%default)""")    
    parser.add_option('--verbose',action='store_true',default=False,
                      help="""Include if stdOut/stdErr is desired. (default=%default).""")
    parser.add_option('--check-seqs',action='store_true',default=False,
                      help="""Print info about promoter sequences and exit. (default=%default).""")
    ##parser.add_option('--expect',action='store_true',default=False,
                      ##help="""Use median seqLen to set success threshold at greater than the estimated expected number of occurences in each promoter [pValThresh*searchesPerSeq]. (default=%default).""")
    
    (opts, args) = parser.parse_args()
    
    def say(msg):
        # Progress messages go to stdout only when --verbose was given.
        if opts.verbose:
            sys.stdout.write(msg)
    
    # +++++ Argument Validations +++++
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)
    if not opts.out:
        raise InvalidOptionError("--out argument is required.")
    if not opts.genes:
        raise InvalidOptionError('--genes argument is required.')
    if opts.job == 'int(time())':
        # The default is a placeholder string; swap in the real timestamp.
        opts.job = int(time())
    # Every one of these must be set.  The previous test
    # ('not (a or b or c or d)') could never fire because --motif-type and
    # --thresh have truthy defaults, so missing --motifs/--promoters slipped by.
    if not (opts.motifs and opts.motif_type and opts.thresh and opts.promoters):
        raise InvalidOptionError("""The following arguments are ALL required:
        --motifs
        --motif-type
        --thresh
        --promoters""")
    if not opts.motif_filter.startswith('lambda x:'):
        raise InvalidOptionError("**ERROR: the --motif-filter option must begin with 'lambda x:'**")
    # SECURITY NOTE(review): eval() executes arbitrary code taken from the
    # command line; the prefix check above does not make it safe.  Only run
    # this script with trusted arguments.
    opts.motif_filter = eval(opts.motif_filter)
    # --plen needs no further conversion: optparse (type='int') has already
    # turned it into an int or rejected the command line.
    
    # +++++ Lets Begin +++++
    say('\n%s\n\n' % (' '.join(sys.argv)))
    
    say('preparing out directory...\n')
    outBaseStr = getOutBaseStr(opts.out,opts.job)
    mkdirp(opts.out)
    
    say('building seqDict...\n')
    probeset = getProbSet(opts.promoters,opts.thresh)
    if opts.plen:
        say("adjusting promoter lengths to no more than %s and conserving the 3' ends...\n" % (opts.plen))
        # Iterate over a snapshot so entries can be reassigned while looping;
        # items() also works on both Python 2 and 3, unlike iteritems().
        # NOTE(review): assumes probeset.probes is a plain dict -- confirm.
        for name, seq in list(probeset.probes.items()):
            probeset.probes[name] = seq[-opts.plen:]
            
    seqDict  = getSeqs(probeset)
    if opts.check_seqs:
        seqStats(seqDict,show=True)
        sys.exit(0)
    
    say('getting nucFreqs...\n')
    nucFreqs  = getSeqFreqs(seqDict)    

    say('building filtered motifList...\n')
    motifList = getMotifs(opts.motifs,nucFreqs,opts.motif_type)
    preFilt   = len(motifList)
    motifList = filterMotifs(motifList,key=opts.motif_filter)
    postFilt  = len(motifList)
    say('using %s of %s motifs...\n' % (postFilt,preFilt))
    
    say('getting forgroundSeqs...\n')
    foregroundSeqs = getForeground(opts.genes)
    outData        = {}
    
    say('calculating real data p-values...\n')
    realData = motifHyprGeoEnrichmentTAMO(motifList,probeset,foregroundSeqs,factor=opts.thresh,bestFactor=False)
    appendData(realData,outData)
    
    # One control run per requested FDR set, each on a random foreground.
    for i in range(opts.fdrs):
        say('ctrl_%s:\n' % (i))
        say('\tgetting random forground...\n')
        rForground = getRandomForground(foregroundSeqs,seqDict)
        
        say('\tcalculating p-values...\n')
        ctrlData   = motifHyprGeoEnrichmentTAMO(motifList,probeset,rForground,factor=opts.thresh,bestFactor=False)
        appendData(ctrlData,outData)
    
    say('writting outData...\n')
    writeDataTable(outBaseStr,outData,motifList)
    
    say('plotting histograms...\n')
    for m in motifList:
        plotHist(outBaseStr,outData,m.id)