Example #1
0
def tamo2tamo(file, outname):
    global probefile, PROBESET, fsafile

    motifs = MotifTools.load(file)
    if fsafile:
        fsaname = fsafile
    else:
        fsaname = find_fsa(file)

    print '# FSA ', fsaname
    fsaD = MotifMetrics.fasta2seqs(fsaname, 'want_dict')
    probes = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')
        #PROBESET= pick_genome(fsaname)
    #for key,seq in fsaD.items():
    #    PROBESET.probes[key] = seq

    print "# %d motifs" % len(motifs)
    for motif in motifs:
        #motif.pvalue, motif.church = 1,1  #Comment this!
        if motif.pvalue == 1:
            motif.pvalue = PROBESET.p_value(motif, probes, 'v')
        if motif.church == 1:
            motif.church = PROBESET.church(motif, probes, 'v')
        #if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        #if motif.E_seq  == None: motif.E_seq  = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc == None:
            motif.ROC_auc = PROBESET.ROC_AUC(motif, probes, 'v')
        #if motif.MNCP   == None: motif.MNCP   = PROBESET.MNCP(motif,probes,'v')
        if motif.frac == None:
            motif.frac = PROBESET.frac(motif, probes, 'v', 0.7)
        if motif.numbound == 0:
            matching = PROBESET.matching_ids(motif, [], factor=0.7)
            matchbound = [x for x in matching if x in probes]
            motif.numbound = len(probes)
            motif.nummotif = len(matching)
            motif.numboundmotif = len(matchbound)
        if 0 and motif.CRA == None:
            try:
                pass
                CRA, Cfrac = PROBESET.cons_ROC_AUC(motif,
                                                   probes,
                                                   'v',
                                                   tuple='YES')
                motif.CRA = CRA
                motif.Cfrac = Cfrac
            except:
                pass

    MotifTools.save_motifs(motifs, outname)
Example #2
0
def _pop_option(flag):
    """Remove `flag` and the token following it from sys.argv.

    Returns the token that followed the flag, or None when the flag is not
    present.  When the flag is the last token (no value follows), the flag
    alone is removed and None is returned.
    """
    try:
        idx = sys.argv.index(flag)
    except ValueError:
        return None
    del sys.argv[idx]
    try:
        value = sys.argv[idx]
    except IndexError:
        return None
    del sys.argv[idx]
    return value


def parse():
    """Consume the -genome, -letter and -f options from sys.argv.

    Each recognised flag and its value are deleted from sys.argv and stored
    in a module global:
        -genome X  sets `probefile` to X and `PROBESET` to
                   MotifMetrics.ProbeSet(X) with factor 0.7
        -letter X  sets `letter`
        -f X       sets `fsafile`
    Globals for absent flags are left untouched.

    Errors raised while building the probe set are no longer swallowed:
    silently ignoring them left `probefile` set without a matching PROBESET.
    """
    global probefile, PROBESET, letter, fsafile

    value = _pop_option('-genome')
    if value is not None:
        probefile = value
        PROBESET = MotifMetrics.ProbeSet(probefile)
        PROBESET.factor = 0.7

    value = _pop_option('-letter')
    if value is not None:
        letter = value

    value = _pop_option('-f')
    if value is not None:
        fsafile = value
Example #3
0
def parse():
    """Consume a `-genome <file>` option from sys.argv, if present.

    The flag and its value are deleted from sys.argv; the value is stored in
    the global `probefile`, and the global PROBESET becomes
    MotifMetrics.ProbeSet(probefile) with its `factor` set to 0.65.  If the
    flag is absent nothing changes; if it is the last token it is removed and
    nothing else happens.

    Only the "flag absent" and "value missing" cases are tolerated; errors
    raised while building the probe set propagate instead of being silently
    discarded.
    """
    global probefile, PROBESET
    try:
        idx = sys.argv.index('-genome')
    except ValueError:
        return  # option not given
    del sys.argv[idx]
    try:
        probefile = sys.argv[idx]
    except IndexError:
        return  # flag was the last token; no value to read
    del sys.argv[idx]
    PROBESET = MotifMetrics.ProbeSet(probefile)
    PROBESET.factor = 0.65
Example #4
0
def ace2tamo(filename, tamoname):
    global probefile, PROBESET
    if   re.search('\.ace$',filename):
        mdobject = AlignAce.AlignAce(filename)
    elif re.search('\.meme$',filename):
        mdobject = Meme.Meme(filename)

    fsaname = find_fsa(mdobject.fastafile)
    fsaD    = MotifMetrics.fasta2seqs(fsaname,'want_dict')
    probes  = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('HUMAN_250')
        #PROBESET= pick_genome(fsaname)
    for key,seq in fsaD.items():
        PROBESET.probes[key] = seq

    for motif in mdobject.motifs:
        if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v')
        if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v')
        if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        if motif.E_seq  == None: motif.E_seq  = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc== None: motif.ROC_auc= PROBESET.ROC_AUC(motif,probes,'v')
        if motif.MNCP   == None: motif.MNCP   = PROBESET.MNCP(motif,probes,'v')
        if re.search('\.meme$',filename):
            motif.MAP = -math.log(motif.evalue)/math.log(10)
        sys.stdout.flush()

    i = 0
    for motif in mdobject.motifs:
        motif.seednum = i ; i=i+1
        kmers = motif.bogus_kmers(100)
        motif.maxscore = -100
        scores = [motif.scan(kmer)[2][0] for kmer in kmers]
        print Arith.avestd(scores)

    if re.search('\.meme$',filename):
        mdobject.motifs.sort(lambda x,y: cmp(x.pvalue, y.pvalue))
    else:
        mdobject.motifs.sort(lambda x,y: cmp(x.church, y.church))

    MotifTools.save_motifs(mdobject.motifs,tamoname)
Example #5
0
def motifs2tamo(motifs, outname):
    global probefile, PROBESET
    
    fsaname = find_fsa(outname)
    fsaD    = MotifMetrics.fasta2seqs(fsaname,'want_dict')
    probes  = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')
        #PROBESET= pick_genome(fsaname)
    #for key,seq in fsaD.items():
    #    PROBESET.probes[key] = seq

    print "# %d motifs"%len(motifs)
    for motif in motifs:
        if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v')
        if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v')
        if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        if motif.E_seq  == None: motif.E_seq  = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc== None: motif.ROC_auc= PROBESET.ROC_AUC(motif,probes,'v')
        if motif.MNCP   == None: motif.MNCP   = PROBESET.MNCP(motif,probes,'v')
    MotifTools.save_motifs(motifs,outname)
Example #6
0
def tamo2tamo(file, outname):
    global probefile, PROBESET, fsafile
    
    motifs  = MotifTools.load(file)
    if fsafile:
        fsaname = fsafile
    else:
        fsaname = find_fsa(file)

    print '# FSA ',fsaname
    fsaD    = MotifMetrics.fasta2seqs(fsaname,'want_dict')
    probes  = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')
        #PROBESET= pick_genome(fsaname)
    #for key,seq in fsaD.items():
    #    PROBESET.probes[key] = seq

    print "# %d motifs"%len(motifs)
    for motif in motifs:
        #motif.pvalue, motif.church = 1,1  #Comment this!
        if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v')
        if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v')
        #if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        #if motif.E_seq  == None: motif.E_seq  = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc== None: motif.ROC_auc= PROBESET.ROC_AUC(motif,probes,'v')
        #if motif.MNCP   == None: motif.MNCP   = PROBESET.MNCP(motif,probes,'v')
        if motif.frac   == None: motif.frac   = PROBESET.frac(motif,probes,'v',0.7)
        if motif.numbound == 0:
            matching            = PROBESET.matching_ids(motif,[],factor=0.7)
            matchbound          = [x for x in matching if x in probes]
            motif.numbound      = len(probes)
            motif.nummotif      = len(matching)
            motif.numboundmotif = len(matchbound)
        if 0 and motif.CRA    == None:
            try:
                pass
                CRA, Cfrac = PROBESET.cons_ROC_AUC(motif,probes,'v',tuple='YES')
                motif.CRA = CRA
                motif.Cfrac = Cfrac
            except: pass
        
    MotifTools.save_motifs(motifs,outname)
Example #7
0
def main():
    short_opts = 'f:'
    long_opts = ['genome=', 'range=', 'top=', 'pcnt=', 'bgfile=']
    try:
        opts, args = getopt.getopt(sys.argv[1:], short_opts, long_opts)
    except getopt.GetoptError:
        print getopt.GetoptError.__dict__
        usage()
    if not opts: usage()

    fastafile = ''
    top_count = 10
    top_pcnt = None
    genome = 'YEAST'
    w_start = 8
    w_stop = 15
    bgfile = MDSCAN_DIR + 'yeast_int.bg'
    for opt, value in opts:
        if opt == '-f': fastafile = value
        if opt == '--genome': genome = value
        if opt == '--top': top_count = int(value)
        if opt == '--pcnt': top_pcnt = float(value)
        if opt == '--range':
            w_start, w_stop = [int(x) for x in value.split(',')]

    print "#" + ' '.join(sys.argv)
    probeids = Fasta.keys(fastafile)
    Genome = MotifMetrics.ProbeSet(genome)

    probeids = Genome.filter(probeids)

    if top_pcnt:
        top_count = max(top_count, int(top_pcnt / 100.0 * len(probeids)))

    theMeta = metaMDscan(fastafile, w_start, w_stop, top_count)

    for m in theMeta.motifs:
        m.pvalue = Genome.p_value(m, probeids, 'v')
        m.church = Genome.church(m, probeids, 'v')
        sys.stdout.flush()

    theMeta.motifs.sort(lambda x, y: cmp(x.pvalue, y.pvalue))
    print_motifs(theMeta.motifs)
Example #8
0
def ace2tamo(filename, tamoname):
    global probefile, PROBESET
    if   re.search('\.ace$',filename):
        mdobject = AlignAce.AlignAce(filename)
    elif re.search('\.meme$',filename):
        mdobject = Meme.Meme(filename)

    fsaname = find_fsa(mdobject.fastafile)
    fsaD    = MotifMetrics.fasta2seqs(fsaname,'want_dict')
    probes  = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('HUMAN_250')
        #PROBESET= pick_genome(fsaname)
    for key,seq in fsaD.items():
        PROBESET.probes[key] = seq

    for motif in mdobject.motifs:
        if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v')
        if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v')
        if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        if motif.E_seq  == None: motif.E_seq  = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc== None: motif.ROC_auc= PROBESET.ROC_AUC(motif,probes,'v')
        if motif.MNCP   == None: motif.MNCP   = PROBESET.MNCP(motif,probes,'v')
        if re.search('\.meme$',filename):
            motif.MAP = -math.log(motif.evalue)/math.log(10)
        sys.stdout.flush()

    i = 0
    for motif in mdobject.motifs:
        motif.seednum = i ; i=i+1
        kmers = motif.bogus_kmers(100)
        motif.maxscore = -100
        scores = [motif.scan(kmer)[2][0] for kmer in kmers]
        print Arith.avestd(scores)

    if re.search('\.meme$',filename):
        mdobject.motifs.sort(lambda x,y: cmp(x.pvalue, y.pvalue))
    else:
        mdobject.motifs.sort(lambda x,y: cmp(x.church, y.church))

    MotifTools.save_motifs(mdobject.motifs,tamoname)
Example #9
0
def main():
    if len(sys.argv) < 2:
        print "Usage: %s <fasta_file>" % (re.sub('^.*/', '', sys.argv[0]))
        print "     [-genome genomefile.fsa]   Genome file (for computing Enrichment, etc..."
        print "     [-bfile  file          ]   File for Markov Background Model"
        print '     [-bigdata              ]   Adds "-maxsize 2000000" for large datasets'
        sys.exit(1)

    fastafile = sys.argv[1]
    width = 0
    valid_tfs = []
    iter = 10
    genome = 'YEAST'
    xtra = ''
    bfile = None

    for tok, i in zip(sys.argv, range(len(sys.argv))):
        if tok == '-w': width = int(sys.argv[i + 1])
        elif tok == '-human': genome = 'HUMAN'
        elif tok == '-H250': genome = 'HUMAN_250'
        elif tok == '-Ch22': genome = 'Ch22'
        elif tok == '-genome': genome = sys.argv[i + 1]
        elif tok == '-bigdata': xtra = '-maxsize 2000000'
        elif tok == '-bfile': bfile = sys.argv[i + 1]

    theMeme = Meme(fastafile, width, xtra, genome, bfile)
    Genome = MotifMetrics.ProbeSet(genome)
    ids = theMeme.probes
    #ids     = Genome.ids_from_file(fastafile)

    motifs = theMeme.motifs
    for motif in motifs:
        motif.pvalue = Genome.p_value(motif, ids, 'v')
        for valid_tf in valid_tfs:
            motif.valid = Validate.validate(motif, valid_tf, '', 'Want Tuple')

    print_motifs(motifs)

    print '#' * 80
    for line in theMeme.lines:
        print line,
Example #10
0
def random_seqs(numseq=50,genome='YEAST',want_dict=None):
    """Return `numseq` randomly chosen, distinct probe sequences.

    Args:
        numseq:    Number of distinct ids to draw.  Zero or negative yields
                   an empty result.
        genome:    Probe set name; the ProbeSet is built once per name and
                   cached in the global PROBESETS.
        want_dict: If true, return {id: sequence}; otherwise a list of
                   sequences in the order drawn.

    Raises:
        ValueError: if `numseq` exceeds the number of distinct candidate ids
            (the previous rejection loop never terminated in that case).

    While the global BADPROBES is empty, it is (re)loaded from the files in
    BADPROBEFILES, and ALL_IDS is rebuilt from the current probe set minus
    those ids, passed through GenerateFastas.SimilarFilter(50).
    NOTE(review): once BADPROBES is non-empty, ALL_IDS is reused for every
    `genome` -- confirm callers never mix genomes.
    """
    global PROBESETS, BADPROBES, BADPROBEFILES, ALL_IDS
    if genome in PROBESETS:
        probeset = PROBESETS[genome]
    else:
        probeset = MotifMetrics.ProbeSet(genome)
        PROBESETS[genome] = probeset
    if not BADPROBES:
        bad = {}
        for path in BADPROBEFILES:
            F = open(path)
            try:
                for line in F.readlines():
                    bad[line.strip()] = 1
            finally:
                F.close()
        BADPROBES = list(bad)
        simfilter = GenerateFastas.SimilarFilter(50)
        # Test against the dict, not the list, for O(1) membership.
        all_ids = [x for x in probeset.probes.keys() if x not in bad]
        ALL_IDS = simfilter.filter(all_ids)

    ids = ALL_IDS
    randomids = []
    if numseq > 0:
        if numseq > len(set(ids)):
            raise ValueError('random_seqs: requested %s sequences but only '
                             '%d distinct ids are available'
                             % (numseq, len(set(ids))))
        chosen = set()
        numids = len(ids)
        # Rejection sampling: redraw until enough distinct ids are collected.
        while len(randomids) < numseq:
            randomid = ids[int(random.random() * numids)]
            if randomid not in chosen:
                chosen.add(randomid)
                randomids.append(randomid)

    if not want_dict:
        seqs = [probeset.probes[randomid] for randomid in randomids]
    else:
        seqs = {}
        for randomid in randomids:
            seqs[randomid] = probeset.probes[randomid]
    return(seqs)
Example #11
0
    def go(self):
        """Execution function: coordinates options used then uses TAMO.MotifMetrics to
        find kmers with good enrichment in listOfLinkedSeqs. Catches the output in 
        self.output for access from MDAP."""
        

        
        # set metric thresholds here
        pVal_thresh     = 0.01
        church_thresh   = 0.01
        binomial_thresh = 0.01

        # # # # # # # # # # # # #
        # ::THIN THE HEARD PHASE::
        # Are we using a range or a single size? Then make a list of all kmers in range
        # that are present in at least 10% of linkedSeqs (top_nmers_seqs()) to reduce
        # needless kmer testing in the metrics phase.
        
        theShortList = []
        
        if self.kmerRange:
            for k in range(self.kmerRange[0],self.kmerRange[1]):
                kmers = MotifMetrics.top_nmers_seqs(k, self.linkedSeqs_seqs)
                print '%s %smers found.' % (len(kmers), k)
                theShortList.extend(kmers)
        else:
            theShortList = MotifMetrics.top_nmers_seqs(self.kmerSize, self.linkedSeqs_seqs)
            print '%s %smers found.' % (len(theShortList), self.kmerSize)
            
        # Convert theShortList into list of motif objs not just strings
        # REASON: church routine asks the motif for its width.
        for i in range(0,len(theShortList)):
            theShortList[i] = MotifTools.Motif_from_text(theShortList[i])
            
        # # # # # # # # # # # #
        # ::METRICS PHASE::
        # Using theShortList, calculate the:
        #       --------METRICS----------   --METHOD CALL--
        #     - HyperGeometric Enrichment      (p_value)
        #     - Group Specificity Score        (church)
        #     - Over-representation            (binomial)
        #
        # Retain those kmers that recieve the cut-off score or better in at least one
        # of the above metrics.
        
        # list with indexes as follows [kmer, p_value, church, binomial]
        keepers = []  
        
        t1 = time()
        count = 1
        shortList_Len = len(theShortList)
        for kmer in theShortList:
            p_value  = self.allSeqs.p_value(kmer, self.linkedSeqs_ids, factor=0.75)
            church   = 'NA' #self.allSeqs.church(kmer, self.linkedSeqs_ids)
            binomial = 'NA' #self.allSeqs.binomial(kmer, self.linkedSeqs_ids)
            
            if p_value <= pVal_thresh or church <= church_thresh or binomial <= binomial_thresh:
                keepers.append([kmer, p_value, church, binomial])
                print '%s\t%s\t--\t%s of %s' % (kmer, p_value, count, shortList_Len)
            count+=1
        t2 = time()
        self.output = keepers
        print 'Calculating the metrics took %.3f min.' % ((t2-t1)/60) 
        
        # Create a formated string to be printed to a file in MDAP class.
        toFile = ['#kmer\tp_value\tchurch\tbinomial\n']
        for i in keepers:
            toFile.append('%s\t%s\t%s\t%s\n' % (i[0].oneletter,i[1],i[2],i[3]))   # AD added ".oneletter" to i[0] to remove the " (1)" from output
            
        self.toFile = toFile
 
# Change log since last commit:
# 02-26-09 -- added MemeWrap._getMaxSize()
# 02-26-09 -- added MemeWrap._getWidthOption()
# 02-26-09 -- added MemeWrap._get_bFile()
# 02-27-09 -- added MemeWrap._getExtraArgs()
        
Example #12
0
def memefiles2tamo(files, tamoname):
    global probefile, PROBESET

    motifs = []
    for filename in files:
        print ">>>SDFSD>F ", filename
        if re.search('\.ace$', filename):
            mdobject = AlignAce.AlignAce(filename)
            if not mdobject.fastafile:
                mdobject.fastafile = filename.replace('.ace', '.fsa')
        elif re.search('\.meme.*$', filename):
            mdobject = Meme.Meme(filename)
            if not mdobject.fastafile:
                mdobject.fastafile = re.sub('\..\.meme', '.meme',
                                            filename).replace('.meme', '.fsa')
        motifs.extend(mdobject.motifs)

    #fsaname = find_fsa(mdobject.fastafile)
    print mdobject.fastafile
    fsaname = Fasta.find(mdobject.fastafile)
    fsaD = Fasta.load(fsaname)
    probes = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')
        #PROBESET= pick_genome(fsaname)
    for key, seq in fsaD.items():
        PROBESET.probes[key] = seq

    for motif in motifs:
        if motif.pvalue == 1:
            motif.pvalue = PROBESET.p_value(motif, probes, 'v')
        if motif.church == 1:
            motif.church = PROBESET.church(motif, probes, 'v')
        if motif.E_site == None:
            motif.E_site = PROBESET.E_sitef(motif, probes, 3, 'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        #if motif.E_seq  == None: motif.E_seq  = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc == None:
            motif.ROC_auc = PROBESET.ROC_AUC(motif, probes, 'v')
        if motif.MNCP == None: motif.MNCP = PROBESET.MNCP(motif, probes, 'v')
        if motif.frac == None:
            motif.frac = PROBESET.frac(motif, probes, 'v', 0.7)
        if re.search('\.meme$', filename):
            motif.MAP = -math.log(motif.evalue) / math.log(10)
        if 1 and (motif.CRA == None):
            try:
                pass
                CRA, Cfrac = PROBESET.cons_ROC_AUC(motif,
                                                   probes,
                                                   'v',
                                                   tuple='YES')
                motif.CRA = CRA
                motif.Cfrac = Cfrac
            except:
                pass

    if re.search('\.meme$', filename):
        mdobject.motifs.sort(lambda x, y: cmp(x.pvalue, y.pvalue))
    else:
        mdobject.motifs.sort(lambda x, y: cmp(x.church, y.church))

    MotifTools.save_motifs(motifs, tamoname)
Example #13
0
def main():
    print "#" + ' '.join([x.replace(' ', '\ ') for x in sys.argv])
    parse_opts()
    ARGS = getarg('args')

    GLOBALS['GENOME'] = MotifMetrics.ProbeSet(getarg('genome'))
    print '# Loaded %s' % getarg('genome')

    badprobes = []
    for f in BADPROBES:
        b = [x.strip() for x in open(f).readlines()]
        badprobes.extend(b)
    d = getarg('DATA')
    p = getarg('GENOME')
    S = SimilarFilter(50)
    experiments = getarg('expts')
    top = getarg('top')
    THRESH = getarg('pvalue')
    NO_FILTER = getarg('nofilter')
    ratioabove = getarg('ratioabove')

    if not experiments: experiments = d.experiments

    for expt in experiments:
        e = expt

        if top:
            _tups = d.scores(e)
            _tups.sort(lambda x, y: cmp(x[0], y[0]))
            unfiltered = [x[1] for x in _tups[0:top]]
        elif ratioabove:
            unfiltered = d.ratioabove(e, ratioabove)
        else:
            unfiltered = d.bound(e, THRESH)

        badfiltered = [x for x in unfiltered if not (x in badprobes)]
        #badfiltered  = unfiltered # Turn back on for real data
        if len(unfiltered) - len(badfiltered) > 2: unfiltered = badfiltered
        #else:   continue    # Necessary when only wanting to regenerate problemed data

        bound_ids = p.filter(unfiltered)
        filtered_ids = bound_ids

        print '### Removed ', (len(bound_ids) -
                               len(S.filter(bound_ids))), 'from ', expt

        if not NO_FILTER:
            filtered_ids = p.filter(S.filter(bound_ids))

        #filtered_ids = bound_ids # Turn back on for real data
        if NO_FILTER:
            print '#%-15s   %3d    ' % (expt, len(bound_ids))
        else:
            print '#%-15s   Before %3d    After %3d ' % (expt, len(bound_ids),
                                                         len(filtered_ids))
        if len(unfiltered) - len(bound_ids) > 2:
            diff = [x for x in unfiltered
                    if (not x in bound_ids)]  #l_andnot(unfiltered,bound_ids)
            print '%-15s  %3d probes (out of %3d) without predicted sequences ' % (
                expt, len(diff), len(unfiltered))
            for _p in diff:
                print '# Absent in (%s) %s' % (expt, _p)
        #continue #Comment this

        #sort
        final_ids, final_scores = [], []
        _tups = d.scores(e)  #Sometimes redundant, but who cares?
        _tups.sort(lambda x, y: cmp(x[0], y[0]))
        for score, id in _tups:
            #if (score <= THRESH) and (id in filtered_ids):
            if (id in filtered_ids):  #Does this break everything?
                final_ids.append(id)
                final_scores.append('%8.4e' % score)
        if final_scores:
            print "#%% %-15s %s" % (expt, final_scores[-1])
        else:
            print "#%% %-15s None" % (expt)
        s = p.fsa_string_from_ids(final_ids, final_scores)
        if len(s) == 0: continue
        f = expt + '.fsa'
        f = re.sub(' ', '_', f)
        FID = open(f, 'w')
        FID.write(s)
        FID.close()
        sys.stdout.flush()
Example #14
0
from TAMO.MD.Meme import Meme 
from TAMO import Clustering
#from TAMO.DataSources import GO
from time import time

TC8_path = '/Users/biggus/Documents/James/Data/ClusterDefs/TC-Fastas/TC-8.fas'
TC8_ids  = Fasta.ids(TC8_path)
TC8_seqs = Fasta.seqs(TC8_path)
allSeqs  = MotifMetrics.ProbeSet('/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Anopheles/2KBupTSS_goodAffyAGAPsFastasOUT.masked.nr.fas')

outFile  = '/Users/biggus/Documents/James/Data/ClusterDefs/TC-8_MotifMetrics.5-12.txt'

roughBestKmers = []

for i in range(6,10):
    imers = MotifMetrics.top_nmers_seqs(i,TC8_seqs)
    roughBestKmers.extend(imers)
    print '%s %smers found.' % (len(imers), i)
    
kmerMetrics = ['Kmer\thGeoPval\tBinomOverRep\n']
    
for kmer in roughBestKmers:
    hGeoPval = allSeqs.Enrichment(kmer, TC8_ids)
    binom   = allSeqs.overrep(kmer,TC8_ids)
    kmerMetrics.append('%s\t%s\t%s\n' % (kmer,hGeoPval,binom))
    
    
outFile = open(outFile,'w')
outFile.writelines(kmerMetrics)

print "Done."