Ejemplo n.º 1
0
def info2seeds(N,infofile,probefile,species='YEAST'):
    """Build seed motifs from the best-scoring n-mers of a Fasta file.

    N         -- n-mer width handed to MotifTools.top_nmers; when false
                 (0 / None) the sequences of `infofile` are scored as-is.
    infofile  -- Fasta file whose sequences supply the candidate n-mers.
    probefile -- file passed to ProbeSet.ids_from_file to obtain the ids
                 against which each n-mer is scored.
    species   -- argument used to construct the ProbeSet.

    Returns a list of at most 20 MotifTools.Motif objects, one for each of
    the n-mers with the smallest ProbeSet.p_value.  The 40 best
    (n-mer, p-value) pairs are printed as a side effect.
    """
    G    = ProbeSet(species)
    IDs  = G.ids_from_file(probefile)
    Q    = EM.theMarkovBackground.zeroth()

    seqs = Fasta.seqs(infofile)

    if not N:
        nmers = seqs
    else:
        nmers = MotifTools.top_nmers(N,seqs)
        # Cap the number of candidates that get scored.
        if len(nmers) > 1000: nmers = nmers[0:1000]

    # The original wrote len(nmers,infofile), which raises TypeError; the
    # two values belong in the format tuple.
    print("Scoring enrichment of %d nmers from %s"%(len(nmers),infofile))
    sys.stdout.flush()

    # Only purely alphabetic n-mers are scored; the empty third argument
    # is where the original noted 'verbose' could be passed instead.
    nmers_scoresT = [(nmer, G.p_value(nmer,IDs,''))
                     for nmer in nmers if nmer.isalpha()]
    nmers_scoresT.sort(key=lambda pair: pair[1])   # smallest p-value first

    models = []
    for seq, _pvalue in nmers_scoresT[0:20]:
        m = MotifTools.Motif('',Q)
        m.compute_from_text(seq,0.1)
        models.append(m)
    for tup in nmers_scoresT[0:40]:
        print(tup)
    return(models)
Ejemplo n.º 2
0
def estimate_frequency(motif,k,samples=100000,thresh=0.7):
    """Estimate how often `motif` matches a random sequence of length `k`.

    Each round builds `samples` random k-long sequences (equal numbers of
    A, C, G and T, shuffled), loads them into a ProbeSet and asks
    count_matching_probes how many match `motif` at `thresh`.  The
    estimate is the running ratio (matches so far) / (sequences so far).

    At most 40 rounds are run.  Sampling stops early when the relative
    change of the estimate between two rounds falls below 1e-4, or from
    the fourth round on once more than 100 matches have been seen.

    Returns the final estimate as a float.
    """
    estimate   = -30          # sentinel: no estimate yet
    total      = 0.0
    totalcount = 0.0
    # Round up so the shuffled string always holds samples*k letters; the
    # original truncated, leaving the last sequence short whenever
    # samples*k was not a multiple of 4.
    repeats = (samples*k + 3)//4
    for iteration in range(40):
        letters = list('ACGT'*repeats)
        random.shuffle(letters)
        long_string = ''.join(letters)
        seqD = {}
        # Use a separate index name: the original reused `i` here, which
        # clobbered the round counter tested at the bottom of the loop.
        for idx in range(samples):
            offset = k*idx
            seqD[idx] = long_string[offset:offset+k]
        P = ProbeSet(genome=seqD)
        count = P.count_matching_probes(motif,thresh=thresh)
        total      += float(samples)
        totalcount += float(count)
        f = totalcount/total
        # Divide by the magnitude of the previous estimate.  With the
        # signed -30 sentinel the original ratio was negative on the first
        # round, so the "converged" test below always fired immediately.
        d = math.fabs(f-estimate)/(math.fabs(estimate)+0.00000001)
        estimate = f
        if d < 1e-4: break
        if iteration > 2 and totalcount > 100: break
        #print '%10d %10d %12.3e  %12.3e'%(totalcount, total, f, d)
    return estimate
Ejemplo n.º 3
0
def estimate_frequency(motif, k, samples=100000, thresh=0.7):
    """Estimate how often `motif` matches a random sequence of length `k`.

    Each round builds `samples` random k-long sequences (equal numbers of
    A, C, G and T, shuffled), loads them into a ProbeSet and asks
    count_matching_probes how many match `motif` at `thresh`.  The
    estimate is the running ratio (matches so far) / (sequences so far).

    At most 40 rounds are run.  Sampling stops early when the relative
    change of the estimate between two rounds falls below 1e-4, or from
    the fourth round on once more than 100 matches have been seen.

    Returns the final estimate as a float.
    """
    estimate = -30  # sentinel: no estimate yet
    total = 0.0
    totalcount = 0.0
    # Round up so the shuffled string always holds samples*k letters; the
    # original truncated, leaving the last sequence short whenever
    # samples*k was not a multiple of 4.
    repeats = (samples * k + 3) // 4
    for round_no in range(40):
        letters = list('ACGT' * repeats)
        random.shuffle(letters)
        long_string = ''.join(letters)
        seqD = {}
        # A dedicated index name keeps the round counter intact; the
        # original reused `i` for both loops.
        for seq_no in range(samples):
            offset = k * seq_no
            seqD[seq_no] = long_string[offset:offset + k]
        P = ProbeSet(genome=seqD)
        count = P.count_matching_probes(motif, thresh=thresh)
        total += float(samples)
        totalcount += float(count)
        f = totalcount / total
        # Use the magnitude of the previous estimate as the denominator.
        # With the signed -30 sentinel the original ratio was negative on
        # the first round, so the convergence test always fired at once.
        d = math.fabs(f - estimate) / (math.fabs(estimate) + 0.00000001)
        estimate = f
        if d < 1e-4:
            break
        if round_no > 2 and totalcount > 100:
            break
        #print '%10d %10d %12.3e  %12.3e'%(totalcount, total, f, d)
    return estimate
Ejemplo n.º 4
0
def probOvlpBinomial(A,B,thresh=0.7,verbose=None):
    """Binomial-tail score for the overlap between motifs A and B.

    The wider motif (A on ties) supplies a set of best-scoring sequences,
    together with their reverse complements; the narrower motif is counted
    against them with ProbeSet.count_matching_probes.  The result is
    Arith.binomialsumtail(background frequency, number of sequences,
    number of matches), or 1.0 when nothing matches.

    Two values are cached as attributes on the motif objects:
    `bestWide` on the wider motif and the dict `probNarrow` (keyed by
    width) on the narrower one.  `verbose` is accepted but not consulted;
    a summary line is always printed when a p-value is computed.

    NOTE(review): neither cache takes `thresh` into account, so calls with
    different thresholds on the same motif objects share cached values --
    confirm that this is intended.
    """
    if B.width > A.width:
        Wide, Narrow = B, A
    else:
        Wide, Narrow = A, B

    RC = MotifTools.revcomplement
    padded = Wide[-1,Wide.width+1]
    if 'bestWide' not in Wide.__dict__:
        seen = {}
        for hit in padded.bestseqs(thresh*padded.maxscore):
            seen[hit[1]] = 1
        # Add the reverse complement of every sequence collected so far.
        for text in list(seen.keys()):
            seen[RC(text)] = 1
        Wide.bestWide = list(seen.keys())
    bestWide = Wide.bestWide
    # From here on, width lookups use the sliced motif.
    Wide = padded

    # Present the sequences to ProbeSet as a {index: sequence} "genome".
    P = ProbeSet(genome=dict(enumerate(bestWide)))
    matchNarrow = P.count_matching_probes(Narrow,thresh=thresh)
    if matchNarrow == 0:
        return 1.0

    if 'probNarrow' not in Narrow.__dict__:
        Narrow.probNarrow = {}
    if Wide.width in Narrow.probNarrow:
        probNarrow = Narrow.probNarrow[Wide.width]
    else:
        probNarrow = estimate_frequency(Narrow,Wide.width,thresh=thresh)
        Narrow.probNarrow[Wide.width] = probNarrow

    numWide = len(bestWide)
    p = Arith.binomialsumtail(probNarrow,numWide,matchNarrow)
    print('\nD= %7.3f %9.4e %8d %7d %-14s %-20s %-14s %-20s'%(
        Arith.nlog10(p),probNarrow,numWide,matchNarrow,
        A.family,A,B.family,B))
    return p
Ejemplo n.º 5
0
def probOvlpBinomial(A, B, thresh=0.7, verbose=None):
    """Score the overlap of two motifs with a binomial tail probability.

    The motif with the larger width (A when equal) is the "wide" one.  Its
    best-scoring sequences and their reverse complements are loaded into a
    ProbeSet, and the "narrow" motif is counted against them.  Returns 1.0
    if the narrow motif matches none of them, otherwise the value of
    Arith.binomialsumtail for the estimated background frequency, the
    number of wide sequences and the number of matches.

    Side effects: caches `bestWide` on the wide motif and a per-width
    `probNarrow` dict on the narrow motif, and prints one summary line.
    The `verbose` argument is not used.

    NOTE(review): the cached values are not keyed by `thresh`; verify that
    callers never mix thresholds for the same motif objects.
    """
    a_is_wide = A.width >= B.width
    Wide = A if a_is_wide else B
    Narrow = B if a_is_wide else A

    RC = MotifTools.revcomplement
    newWide = Wide[-1, Wide.width + 1]
    if 'bestWide' in Wide.__dict__:
        bestWide = Wide.bestWide
    else:
        unique = {}
        cutoff = thresh * newWide.maxscore
        for entry in newWide.bestseqs(cutoff):
            unique[entry[1]] = 1
        # Snapshot the keys first; the dict grows while complements are added.
        forward = list(unique.keys())
        for seq in forward:
            unique[RC(seq)] = 1
        Wide.bestWide = list(unique.keys())
        bestWide = Wide.bestWide
    # Later width lookups refer to the sliced motif.
    Wide = newWide

    # ProbeSet takes the sequences as an {index: sequence} mapping.
    D = {}
    for idx, seq in enumerate(bestWide):
        D[idx] = seq
    matchNarrow = ProbeSet(genome=D).count_matching_probes(Narrow,
                                                           thresh=thresh)
    if matchNarrow == 0:
        return 1.0

    if 'probNarrow' not in Narrow.__dict__:
        Narrow.probNarrow = {}
    wide_width = Wide.width
    if wide_width in Narrow.probNarrow:
        probNarrow = Narrow.probNarrow[wide_width]
    else:
        probNarrow = estimate_frequency(Narrow, wide_width, thresh=thresh)
        Narrow.probNarrow[wide_width] = probNarrow

    p = Arith.binomialsumtail(probNarrow, len(bestWide), matchNarrow)
    report = (Arith.nlog10(p), probNarrow, len(bestWide), matchNarrow,
              A.family, A, B.family, B)
    print('\nD= %7.3f %9.4e %8d %7d %-14s %-20s %-14s %-20s' % report)

    return p
Ejemplo n.º 6
0
from TAMO import MotifTools 
from TAMO.seq import Fasta 
from TAMO.MotifMetrics import ProbeSet 
from TAMO.MD.AlignAce import AlignAce 
from TAMO.MD.MDscan import MDscan 
from TAMO.MD.Meme import Meme 
#from TAMO.DataSources import GO
from time import time

# Script fragment: run MDscan and then MEME on one cluster Fasta file and
# record the wall-clock time each one takes.

# Input Fasta file; its ids and a ProbeSet are built from the same path.
fastaPath    = '/Users/biggus/Documents/James/Data/ClusterDefs/TC-Fastas/TC-96.oneLine.fas'
clusterIDS   = Fasta.ids(fastaPath)
totalSeqs    = ProbeSet(fastaPath)  # NOTE (original author): this is wrong, should probably be goodAffys

# Background file for MDscan; only referenced by the commented-out bgfile
# argument below.
MDbg         = '/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Anopheles/2KBupTSS_goodAffyAGAPsFastasOUT.masked.nr.MD.bg'

# Output path; not used in the visible part of the script.
outFile      = '/Users/biggus/Documents/James/Data/ClusterDefs/testTAMOmetrics.txt'

#theAce = AlignAce(fastaPath,width=10)

# Time the MDscan call and report seconds, minutes and the motif count.
print 'running MDscan...'
tMD_1 = time()
MDmotifs   = MDscan(fastaPath) #,bgfile=MDbg)
tMD_2 = time()
MD_time = tMD_2-tMD_1
print 'MDscan took %.5f sec == %.3f min.\nMDscan found %s motifs.' % (MD_time,MD_time/60.0, len(MDmotifs.motifs))

# Time the MEME call the same way.
print 'running MEME...'
tMeme_1 = time()
memeMotifs = Meme(fastaPath)
tMeme_2 = time()
Meme_time = tMeme_2-tMeme_1
Ejemplo n.º 7
0
# Load every pickle file, keyed by its base name.
# NOTE(review): pickle.load can execute arbitrary code; only use this on
# pickle files from a trusted source.
for f in pklFiles:
    pklHandle = open(f,'r')
    try:
        unPikls[f.split('/')[-1]] = pickle.load(pklHandle)
    finally:
        # Close the file explicitly; the original never released it.
        pklHandle.close()

# Flatten the loaded objects into [fasta source name, results] pairs.
motifs = []
for pklName in unPikls:
    for run in unPikls[pklName]:
        # The source name is the base name of the third whitespace-separated
        # field on the second entry of run.lines.
        fastaSourceName = run.lines[1].split()[2].split('/')[-1]
        motifs.append([fastaSourceName,run.results])
    
# Adjust each motif to the species being looked at
# NOTE(review): the original loop contained an unfinished `for i in` line
# (a syntax error).  Reconstructed from the surviving header as one
# new_bg() call per entry -- confirm this matches the intent.
for m in motifs:
    m[1].new_bg(speciesBK)


# ProbeSet built from the masked, non-redundant 2 kb upstream Fasta file;
# the report below is computed against it.
probSet = ProbeSet('/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Anopheles/2KBupTSS_goodAffyAGAPsFastasOUT.masked.nr.fas')

# get and print motif and pVal(s)
# Tab-separated header line; dfltFactor is substituted into three of the
# column titles.
out = '#outFile\tMotif\tHyperGeoPval (%s)\tfrac (%s)\tBestHG (pval)\tfrac (bestHG_ScoreThresh)\tBestHG (scoreThresh)\tbinoPval (%s)\tbinoPval (bestHG_scoreThresh)' \
      % (dfltFactor,dfltFactor,dfltFactor)
print out
# Keep the header, now newline-terminated, in `out`.
out = out+'\n'

for m in motifs:
    for i in m[1]:
        bestE = probSet.best_p_value(m[1],coRegSeqs)
        temp ='%s\t%s\t%.3e\t%.3f\t%.3e\t%.3f\t%.3f\t%.3e\t%.3e' % (m[0],
                                                      m[1].oneletter, 
                                                      probSet.Enrichment(m[1],coRegSeqs,factor=dfltFactor), 
                                                      probSet.frac(m[1],coRegSeqs,factor=dfltFactor),
                                                      bestE[0],
Ejemplo n.º 8
0
from gusPyCode.MDAP_proj.MDAP_defs import transfacLike2tamoMotif
from TAMO.MotifMetrics import ProbeSet
from TAMO import MotifTools
import os


def _stripped_lines(path):
    """Return every line of the file at `path` with surrounding whitespace removed."""
    return [line.strip() for line in open(path, 'rU').readlines()]


# Inputs: the motif list for this group, the real and shuffled promoter
# ProbeSets, and the ids of the co-regulated genes.
listPath = '/Users/biggus/Documents/James/Data/ReClustering/PrelimData_Grant_Feb09/kMerPWMs/Clus2_kmerSearch.6-8mers.top10pcnt/Group5.txt'
probSetReal = ProbeSet('/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Anopheles/2KBupTSS_goodAffyAGAPsFastasOUT.masked.nr.fas')
probSetShuf = ProbeSet('/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Anopheles/2KBupTSS_goodAffyAGAPsFastasOUT.masked.nr.seqsShuffled.fas')
coRegSeqs = _stripped_lines('/Users/biggus/Documents/James/Data/ReClustering/kmedsPear33Clus50x_2/Clus2_247genes.genes.txt')

# set real or shuffled genome
#probSet=probSetReal

# One motif per line of the list file, converted from its text form.
motifs = [MotifTools.Motif_from_text(text) for text in _stripped_lines(listPath)]

group = 'Group5'

# calc HyperGeo for all motifs

print('#threshold = 75% of max score\n#Group\tone_letter\tpVal\tfraction_of_gene_set\tpVal (shuffled)\tfraction_of_gene_set (shuffled)')
for m in motifs:
Ejemplo n.º 9
0
#! /usr/bin/env python2.4

# Author: Alexandre S. Cristino
# email:  [email protected]
# Date:   16 june 2008
# Script: Scoring motifs in upstream regions of specific databases


import os,sys,string
from   TAMO              import MotifTools
from   TAMO.seq          import Fasta
from   TAMO.MotifMetrics import ProbeSet

# Promoter sequences to score against (first command-line argument).
promoters = ProbeSet(sys.argv[1])

# Gene-set ids, one per line (second command-line argument).  splitlines()
# keeps the final id even when the file has no trailing newline -- the
# original split('\n')[:-1] silently dropped it -- and blank lines are
# skipped.  try/finally keeps this usable on the Python 2.4 named in the
# shebang while still closing the file.
geneset_file = open(sys.argv[2])
try:
    geneset_ids = [line for line in geneset_file.read().splitlines() if line]
finally:
    geneset_file.close()

# Keep only the ids that have a promoter sequence.  Membership is tested
# against a set so the filter is linear instead of quadratic.
prom_ids = promoters.probes.keys()
prom_id_set = set(prom_ids)
match_ids = [gene_id for gene_id in geneset_ids if gene_id in prom_id_set]

# Motifs to score (third command-line argument).
motifs = MotifTools.load(sys.argv[3])
# Presumably cut-offs for the three metrics; they are not used in the
# visible part of the script -- TODO confirm against the rest of the file.
church = 0.05
rocauc = 0.1
pvalue = 0.05

# Tab-separated header for the report.
print "Name\tMotif\tChurch\tRoc-auc\tP-value"
# Attach the church and p_value scores, computed on the matching promoter
# ids, to each motif object (the ROC AUC computation is disabled).
for m in motifs:
  m.church   = promoters.church  (m, match_ids)
#  m.ROC_auc  = promoters.ROC_AUC (m, match_ids)
  m.pvalue   = promoters.p_value (m, match_ids)
Ejemplo n.º 10
0
def main():
    """Command-line driver: seed an EM motif search and report the results.

    sys.argv[1] names the Fasta file to analyse; an optional numeric
    sys.argv[2] (or -w / -width) gives the motif width.  The remaining
    options select EM parameters, seed sources and the background genome.

    The function loads the background model and the sequences, collects
    seeds (-prior, -top, -kmerseeds, -gap, -TF), runs EM.EM, scores every
    candidate against a ProbeSet for the chosen genome, prints the
    candidates ordered by p-value with MotifTools.print_motif, and exits.

    Always terminates the interpreter through sys.exit: status 1 after
    printing the usage text when no Fasta file is given, status 0 otherwise.
    Stores the EM object in the module-level name `theEM`.
    """
    if len(sys.argv) < 2:
        print("Usage: %s <fasta_file> [width = None ] [options]"%(re.sub('^.*/','',sys.argv[0])))
        print("Options include:")
        print("")
        print(" EM Parameters:")
        print("                  -beta    [0.01]   Beta for pseudocounts")
        print("                  -seedbeta[0.02]   Beta for pseudocounts for seeds from text")
        print("                  -gamma   [0.2]    Gamma (fraction of sequences)")
        print("                  -delta   [0.001]  Convergence criteria")
        print(" ")
        print(" Seeds (not actually proper priors)")
        print("                  -prior            Seqences or motifs for seeds (may be repeated)")
        print("                  -top N   [0]      Include w-mers in top N probes")
        print("                  -gap    string    sample gapped motifs")
#       print "                  -TF               Seed with (all) TRANSFAC PSSMs (buggy)"
        print("                  -kmerseeds        Use kmers with best enrichment score as seeds for EM")
        print("                  -pad              add NN..NN to seed")
        print(" ")
        print(" Genome / Background model ")
        print("                  -human (250,1000) Use Human Background model")
        print("                  -g genome.fsa     Use specicied Fasta file as background (searches first for matching frequency file)")
#       print "                  -Y2K, -Y5C        Use Yeast Upstream Intergenic regions (2000, 500)"
#       print "                  -B                Use Bacterial Orfs"
        print(" ")
        print("Examples:")
        print(" %s t.fsa 5 -prior GGGTA -prior AAAAAC "%(sys.argv[0].split('/')[-1]))
        print("   will start an EM with 3 seeds: GGGTA, AAAAA, and AAAAC")
        print("")
        print(" %s t.fsa 5 -info CUP9.info -gamma 0.5 "%(sys.argv[0].split('/')[-1]))
        print("   will start an EM with Enriched seeds in CUP9.info, with")
        print("   Gamma expectation of 50% of all probes")
        print("")
        print(" %s t.fsa -prior MCM1_5.tamo:0 "%(sys.argv[0].split('/')[-1]))
        print("   will start an EM with 0th motif of the file MCM1_5.tamo")
        print("   as a seed")
        print("")
        sys.exit(1)
    fastafile = sys.argv[1]

    #Echo the command line (spaces inside an argument are backslash-escaped)
    print("#" + ' '.join([arg.replace(' ', '\\ ') for arg in sys.argv]))

    # The width argument is optional, so sys.argv[2] may not exist.
    if len(sys.argv) > 2 and sys.argv[2].isdigit():
        width = sys.argv[2]
    else:
        width = None

    algorithm = ''
    beta      = ''
    seedbeta  = ''
    deltamin  = ''
    gamma     = 0.2
    infofile  = ''
    seedmodels= []
    species   = 'YEAST'
    valid_tfs = [] #NOT USED
    gapped_syl= None
    gapflank  = 0
    gapweight = 0.2
    enrichfact= 0.7
    pmax      = 0  #False
    TFSEEDS   = 0
    TFMids    = []
    pad       = None
    bgfile    = None

    seed_count = 0   #Default: Take the top 0
    seed_s     = []  #Initialize seq array

    # Parse command-line arguments
    for i, tok in enumerate(sys.argv):
        if   tok == '-top'   :   seed_count = int(sys.argv[i+1])
        elif tok == '-greedy':   algorithm  = "GREEDY"
        elif tok == '-prior' :   seed_s.append(sys.argv[i+1])
        elif tok == '-beta'  :   beta       = float(sys.argv[i+1])
        elif tok == '-seedbeta': seedbeta   = float(sys.argv[i+1])
        elif tok == '-gamma' :   gamma      = float(sys.argv[i+1])
        elif tok == '-delta' :   deltamin   = float(sys.argv[i+1])
        elif tok == '-kmerseeds' :   infofile   = 1
        elif tok == '-valid' :   valid_tfs.append(sys.argv[i+1]) #NOT USED
        elif tok == '-w'     :   width      = sys.argv[i+1]
        elif tok == '-width' :   width      = sys.argv[i+1]
        elif tok == '-gap'   :   gapped_syl = sys.argv[i+1]
        elif tok == '-gapflank' :gapflank   = int(sys.argv[i+1])
        elif tok == '-gapweight':gapweight  = float(sys.argv[i+1])
        elif tok == '-enrichfact':enrichfact= float(sys.argv[i+1])
        elif tok == '-pmax'  :   pmax       = 1
        elif tok == '-Y2K'   :   species    = "YEAST_2000_UP"
        elif tok == '-Y5C'   :   species    = "YEAST_500_UP"
        elif tok == '-B'     :   species    = "BAC_ORF"
        elif tok == '-Ch22'  :   species    = "Ch22"
        elif tok == '-genome':   species    = sys.argv[i+1]
        elif tok == '-pad'   :   pad        = "TRUE"
        elif tok == '-bgfile':   bgfile     = sys.argv[i+1]
        elif tok == '-TF'    :  #NOT USED (TRANSFAC NOT SUPPLIED WITH DISTRIBUTION)
            TFSEEDS = 1
            # Collect the run of matrix ids (M0...) that follows -TF.
            for j in range(i+1,len(sys.argv)):
                if re.match('M0',sys.argv[j]):
                    TFMids.append(sys.argv[j])
                else:
                    break
        elif tok == '-human' :
            # An optional numeric suffix may follow; -human can also be
            # the last token on the command line.
            _s = ''
            if i+1 < len(sys.argv) and sys.argv[i+1].isdigit():
                _s = '_'+sys.argv[i+1]
            species    = 'HUMAN'+_s

    # -kmerseeds only sets a flag; the k-mers come from the Fasta file itself.
    if infofile: infofile = fastafile

    if bgfile:
        EM.loadMarkovBackground(bgfile)
    elif not ('-random_background' in sys.argv or '-nomarkov' in sys.argv):
        EM.loadMarkovBackground(species)
    else:
        EM.theMarkovBackground = EM.Zeroth()

    fsaD     = Fasta.load(fastafile)
    Fasta.delN(fsaD)
    seqs     = fsaD.values()
    probes   = fsaD.keys()
    all_seqs = seqs
    seed_s.extend(seqs[0:min(seed_count,len(seqs))])

    if infofile and width=='info':
        width = info2width(infofile)
    elif width is not None:
        width = int(width)

    #Alternate source of seeds
    if infofile:
        seedmodels.extend(info2seeds(width,infofile,fastafile,species))

    #Any -prior pointers to motifs in other files?
    (seed_s, motifs) = parse_priors(seed_s)
    seedmodels.extend(motifs)

    #Should we get seeds from TRANSFAC?
    if TFSEEDS: #NOT USED
        tf = []
        D  = tfmats()
        if not TFMids:
            keys = D.keys()
        else:
            keys = []
            for TFMid in TFMids:
                # First key whose 6-character prefix equals the requested id.
                for key in D.keys():
                    if key[0:6] == TFMid:
                        keys.append(key)
                        break
        for key in keys:
            m = D[key]
            m.seednum = int(re.sub('M0*','',key.split()[0]))
            m.seedtxt = '%-24s %s'%(m,key)
            tf.append(m)
        tf.sort(key=lambda motif: motif.seednum)
        seedmodels.extend(tf)
        #seedmodels.append(tf[33])

    if gapped_syl:
        # Flank every gapped motif with one N on each side.
        seed_s.extend(['N'+x+'N' for x in gapped_motifs(gapped_syl)])

    if pad:
        print('# Padding models with NN-m-NN')
        newmodels = []
        left  = MotifTools.Motif_from_text('@')
        right = MotifTools.Motif_from_text('N')
        for m in seedmodels:
            padded = left + m + right
            newmodels.append(padded)
            print(padded)
        seedmodels = newmodels

    # Set everything up and GO!!
    global theEM
    theEM = EM.EM(seed_s,[],width,"VERBOSE")
    # Empty-string defaults mean "keep the EM object's own value".
    if beta:     theEM.beta     = beta
    if deltamin: theEM.deltamin = deltamin
    if seedbeta: theEM.seedbeta = seedbeta
    theEM.param['gamma']        = gamma
    theEM.seqs.extend(all_seqs)
    theEM.models    = seedmodels
    theEM.gapflank  = gapflank
    theEM.gapweight = gapweight
    theEM.report()
    theEM.EM_Cstart()    #GO!!

    #print "#Sorting candidates"
    #sys.stdout.flush()
    #EM.candidates.sort(lambda x,y: cmp(y.MAP,x.MAP))

    # Compute some metrics
    print("#Loading Genome %s"%species)
    sys.stdout.flush()
    Genome = ProbeSet(species,enrichfact)
    ids    = Genome.ids_from_file(fastafile)
    probe_set = set(probes)   # constant-time membership tests in the loop

    for C in theEM.candidates:
        if not pmax:
            C.pssm.pvalue = Genome.p_value(C.pssm,ids,'verbose')
            C.pssm.church = Genome.church(C.pssm,ids)
            C.pssm.frac   = Genome.frac(C.pssm,probes,None,0.7)
        else:
            (p,frac) = Genome.best_p_value(C.pssm,ids)
            C.pssm.pvalue    = p
            C.pssm.threshold = frac * C.pssm.maxscore
            print("Bests: %s %s"%(p,frac))

        matching             = Genome.matching_ids(C.pssm,[],factor=0.7)
        matchbound           = [x for x in matching if x in probe_set]
        C.pssm.numbound      = len(probes)
        C.pssm.nummotif      = len(matching)
        C.pssm.numboundmotif = len(matchbound)
        sys.stdout.flush()

    # Print out all motifs (sorted by Enrichment) in an AlignACE-like form.
    # The older per-motif report that used to follow print_motif sat after
    # an unconditional `continue`, so it never ran; it is left out here.
    theEM.candidates.sort(key=lambda cand: cand.pssm.pvalue)
    for i, C in enumerate(theEM.candidates):
        C.pssm.maxscore = -100  #May have side effects.  Recompute when done
        C.pssm._maxscore()      #Recomputed

        MotifTools.print_motif(C.pssm,20,i)
        sys.stdout.flush()
    sys.stdout.flush()
    sys.exit(0) #Avoid ridiculous python cleanup times