Example #1
0
    def go(self):
        """Find kmers enriched in the linked sequences and record the results.

        Works in two phases:

        1. Thinning: collect candidate kmers from ``self.linkedSeqs_seqs`` with
           ``MotifMetrics.top_nmers_seqs()``.  When ``self.kmerRange`` is set,
           every size in ``range(self.kmerRange[0], self.kmerRange[1])`` is
           used (the upper bound itself is excluded); otherwise only
           ``self.kmerSize`` is used.  Each candidate string is converted to a
           motif object with ``MotifTools.Motif_from_text()``.
        2. Metrics: score every candidate with ``self.allSeqs.p_value()``
           against ``self.linkedSeqs_ids`` and keep those whose score is at or
           below the cut-off.  The church and binomial metrics are currently
           switched off and are reported as ``'NA'``.

        Side effects:
            self.output -- list of ``[motif, p_value, church, binomial]`` rows
                           for the retained kmers (read by MDAP).
            self.toFile -- list of tab-separated text lines, header first,
                           ready to be written to a file by the MDAP class.

        Progress and timing messages are printed to stdout.  Returns None.
        """
        # Metric cut-offs: a kmer is kept when at least one *computed* metric
        # is at or below its threshold.
        pVal_thresh     = 0.01
        church_thresh   = 0.01
        binomial_thresh = 0.01

        # Placeholder stored (and written out) for metrics that are disabled.
        notComputed = 'NA'

        # # # # # # # # # # # # #
        # ::THIN THE HERD PHASE::
        # Build the candidate list up front so the (slow) metrics phase only
        # looks at kmers that top_nmers_seqs() considers worth testing.
        # (Original author's note: these are kmers present in at least 10% of
        # linkedSeqs -- that cut-off lives inside top_nmers_seqs().)
        if self.kmerRange:
            candidates = []
            for k in range(self.kmerRange[0], self.kmerRange[1]):
                kmers = MotifMetrics.top_nmers_seqs(k, self.linkedSeqs_seqs)
                print('%s %smers found.' % (len(kmers), k))
                candidates.extend(kmers)
        else:
            candidates = MotifMetrics.top_nmers_seqs(self.kmerSize, self.linkedSeqs_seqs)
            print('%s %smers found.' % (len(candidates), self.kmerSize))

        # Convert the kmer strings into motif objects.
        # REASON: church routine asks the motif for its width.
        theShortList = [MotifTools.Motif_from_text(kmer) for kmer in candidates]

        # # # # # # # # # # # #
        # ::METRICS PHASE::
        # Using theShortList, calculate the:
        #       --------METRICS----------   --METHOD CALL--
        #     - HyperGeometric Enrichment      (p_value)
        #     - Group Specificity Score        (church)    [disabled]
        #     - Over-representation            (binomial)  [disabled]
        #
        # Retain those kmers that receive the cut-off score or better in at
        # least one of the above metrics.

        # Each row is [kmer, p_value, church, binomial].
        keepers = []
        shortList_Len = len(theShortList)

        t1 = time()
        for index, kmer in enumerate(theShortList):
            p_value  = self.allSeqs.p_value(kmer, self.linkedSeqs_ids, factor=0.75)
            church   = notComputed  # disabled: self.allSeqs.church(kmer, self.linkedSeqs_ids)
            binomial = notComputed  # disabled: self.allSeqs.binomial(kmer, self.linkedSeqs_ids)

            # Only compare metrics that were actually computed.  Comparing the
            # 'NA' placeholder with a float relied on Python 2's arbitrary
            # str/number ordering and raises TypeError on Python 3.
            scored = (
                (p_value, pVal_thresh),
                (church, church_thresh),
                (binomial, binomial_thresh),
            )
            if any(score != notComputed and score <= cutoff for score, cutoff in scored):
                keepers.append([kmer, p_value, church, binomial])
                print('%s\t%s\t--\t%s of %s' % (kmer, p_value, index + 1, shortList_Len))
        t2 = time()

        self.output = keepers
        print('Calculating the metrics took %.3f min.' % ((t2 - t1) / 60))

        # Create a formatted list of lines to be printed to a file in MDAP class.
        # ".oneletter" is used for the kmer column so the motif's " (1)" suffix
        # does not end up in the output (per the original author's note).
        toFile = ['#kmer\tp_value\tchurch\tbinomial\n']
        toFile.extend(
            '%s\t%s\t%s\t%s\n' % (row[0].oneletter, row[1], row[2], row[3])
            for row in keepers
        )

        self.toFile = toFile
 
# Change log since last commit:
# 02-26-09 -- added MemeWrap._getMaxSize()
# 02-26-09 -- added MemeWrap._getWidthOption()
# 02-26-09 -- added MemeWrap._get_bFile()
# 02-27-09 -- added MemeWrap._getExtraArgs()
        
Example #2
0
from TAMO.MD.Meme import Meme 
from TAMO import Clustering
#from TAMO.DataSources import GO
from time import time

# Score candidate kmers from the TC-8 cluster against the full probe set and
# write one tab-separated line per kmer to outFile.

# Input: FASTA file for the TC-8 cluster, read once for ids and once for sequences.
TC8_path = '/Users/biggus/Documents/James/Data/ClusterDefs/TC-Fastas/TC-8.fas'
TC8_ids  = Fasta.ids(TC8_path)
TC8_seqs = Fasta.seqs(TC8_path)

# Background: probe set built from the full upstream-sequence FASTA file.
allSeqs  = MotifMetrics.ProbeSet('/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Anopheles/2KBupTSS_goodAffyAGAPsFastasOUT.masked.nr.fas')

outFile  = '/Users/biggus/Documents/James/Data/ClusterDefs/TC-8_MotifMetrics.5-12.txt'

# Collect candidate kmers for sizes 6 through 9 (range() excludes 10).
roughBestKmers = []

for i in range(6, 10):
    imers = MotifMetrics.top_nmers_seqs(i, TC8_seqs)
    roughBestKmers.extend(imers)
    print('%s %smers found.' % (len(imers), i))

# Output lines, header first; one row per candidate kmer.
kmerMetrics = ['Kmer\thGeoPval\tBinomOverRep\n']

for kmer in roughBestKmers:
    hGeoPval = allSeqs.Enrichment(kmer, TC8_ids)
    binom    = allSeqs.overrep(kmer, TC8_ids)
    kmerMetrics.append('%s\t%s\t%s\n' % (kmer, hGeoPval, binom))

# Use a separate name for the handle so outFile keeps holding the path, and
# close it explicitly so the results are flushed to disk even if writing fails
# part-way.  try/finally is used instead of "with" so the script runs
# unchanged on older Python 2 interpreters.
outHandle = open(outFile, 'w')
try:
    outHandle.writelines(kmerMetrics)
finally:
    outHandle.close()

print("Done.")