def info2seeds(N,infofile,probefile,species='YEAST'):
    """Turn the best-scoring n-mers of a FASTA file into seed motifs.

    Candidate n-mers are taken from the sequences in ``infofile``; every
    purely alphabetic candidate is scored with ``ProbeSet.p_value`` against
    the ids returned by ``ids_from_file(probefile)``.  The candidates with the
    smallest p-values are converted into motifs.

    Args:
        N: width handed to ``MotifTools.top_nmers``.  A false value (``None``
            or ``0``) skips that step and uses the sequences themselves as
            candidates.
        infofile: FASTA file the candidate sequences are read from.
        probefile: file passed to ``ProbeSet.ids_from_file`` to obtain the ids
            the p-values are computed against.
        species: key passed to the ``ProbeSet`` constructor.

    Returns:
        A list of at most 20 ``MotifTools.Motif`` objects, ordered by
        ascending p-value of the n-mer each one was built from.
    """
    G    = ProbeSet(species)
    IDs  = G.ids_from_file(probefile)
    Q    = EM.theMarkovBackground.zeroth()

    seqs = Fasta.seqs(infofile)
    if not N:
        nmers = seqs
    else:
        nmers = MotifTools.top_nmers(N,seqs)
        # Only the first 1000 n-mers returned by top_nmers are scored.
        if len(nmers) > 1000:
            nmers = nmers[0:1000]

    # The two values must be passed as a tuple; the original passed both to
    # len(), which raised TypeError before any scoring happened.
    print("Scoring enrichment of %d nmers from %s"%(len(nmers),infofile))
    sys.stdout.flush()

    nmers_scoresT = []
    for nmer in nmers:
        if nmer.isalpha():
            p = G.p_value(nmer,IDs,'') #'verbose'
            nmers_scoresT.append((nmer,p))

    # Stable ascending sort on the p-value (second tuple element).
    nmers_scoresT.sort(key=lambda pair: pair[1])

    models = []
    for seq, _pvalue in nmers_scoresT[:20]:
        m = MotifTools.Motif('',Q)
        m.compute_from_text(seq,0.1)
        models.append(m)

    # Echo the 40 best (nmer, p-value) pairs.
    for tup in nmers_scoresT[0:40]:
        print(tup)
    return models
def estimate_frequency(motif,k,samples=100000,thresh=0.7):
    """Estimate the fraction of shuffled k-mers that match ``motif``.

    Each round shuffles a string made of equal numbers of A, C, G and T, cuts
    it into ``samples`` consecutive pieces of length ``k``, loads the pieces
    into a ``ProbeSet`` and counts those matching ``motif`` at ``thresh``.
    The running estimate is the cumulative match count divided by the
    cumulative number of pieces.

    Sampling stops as soon as one of these holds:
      * the estimate moved by less than 1e-4 (relative) since the last round,
      * the fourth or a later round has finished and more than 100 matches
        have been seen in total,
      * 40 rounds have been run.

    Args:
        motif: motif handed to ``ProbeSet.count_matching_probes``.
        k: length of each sampled piece.
        samples: number of pieces generated per round.
        thresh: threshold forwarded to ``count_matching_probes``.

    Returns:
        The final estimate (matches / pieces) as a float.
    """
    estimate   = -30   # sentinel meaning "no estimate yet"
    total      = 0
    totalcount = 0
    for round_no in range(40):
        #Build sequences
        letters = list('ACGT'*(int(float(samples)*k/4)))
        random.shuffle(letters)
        random.shuffle(letters)
        random.shuffle(letters)
        long_string = ''.join(letters)

        # The inner index must not reuse the round counter: the original
        # reused ``i`` here, which made the ``> 2`` test below always true.
        seqD = {}
        for j in range(samples):
            offset  = k*j
            seqD[j] = long_string[offset:offset+k]

        P     = ProbeSet(genome=seqD)
        count = P.count_matching_probes(motif,thresh=thresh)
        total      += float(samples)
        totalcount += float(count)

        f = totalcount/total
        # Relative change against the magnitude of the previous estimate.
        # Without fabs() the negative sentinel produced a negative ratio, so
        # the convergence test passed on the very first round.
        d = math.fabs(f-estimate)/(math.fabs(estimate)+0.00000001)
        estimate = f
        if d < 1e-4:
            break
        if round_no > 2 and totalcount > 100:
            break
        #print '%10d %10d %12.3e %12.3e'%(totalcount, total, f, d)
    return estimate
def estimate_frequency(motif, k, samples=100000, thresh=0.7):
    """Estimate the fraction of shuffled k-mers that match ``motif``.

    Every round builds a string with equal numbers of A, C, G and T, shuffles
    it, slices it into ``samples`` consecutive pieces of length ``k`` and
    counts, through a ``ProbeSet``, how many pieces match ``motif`` at
    ``thresh``.  The estimate is the cumulative number of matches divided by
    the cumulative number of pieces.

    The loop ends when the estimate changes by less than 1e-4 (relative)
    between two rounds, when at least four rounds are done and more than 100
    matches have accumulated, or after 40 rounds.

    Args:
        motif: motif handed to ``ProbeSet.count_matching_probes``.
        k: length of each sampled piece.
        samples: number of pieces generated per round.
        thresh: threshold forwarded to ``count_matching_probes``.

    Returns:
        The final estimate (matches / pieces) as a float.
    """
    estimate = -30  # sentinel meaning "no estimate yet"
    total = 0
    totalcount = 0
    for round_no in range(40):
        #Build sequences
        letters = list('ACGT' * (int(float(samples) * k / 4)))
        random.shuffle(letters)
        random.shuffle(letters)
        random.shuffle(letters)
        long_string = ''.join(letters)

        # Use a separate index for the pieces; sharing ``i`` with the round
        # loop (as the original did) defeated the ``> 2`` check below.
        seqD = {}
        for j in range(samples):
            offset = k * j
            seqD[j] = long_string[offset:offset + k]

        P = ProbeSet(genome=seqD)
        count = P.count_matching_probes(motif, thresh=thresh)
        total += float(samples)
        totalcount += float(count)

        f = totalcount / total
        # Divide by the magnitude of the previous estimate: the negative
        # sentinel otherwise yields a negative ratio that "converges" at once.
        d = math.fabs(f - estimate) / (math.fabs(estimate) + 0.00000001)
        estimate = f
        if d < 1e-4:
            break
        if round_no > 2 and totalcount > 100:
            break
        #print '%10d %10d %12.3e %12.3e'%(totalcount, total, f, d)
    return estimate
def probOvlpBinomial(A,B,thresh=0.7,verbose=None):
    """Binomial-tail p-value for the overlap between motifs ``A`` and ``B``.

    The wider motif (``A`` on ties) supplies a set of high-scoring sequences:
    those returned by ``bestseqs`` on the slice ``Wide[-1, Wide.width+1]`` at
    ``thresh`` times its maximum score, plus their reverse complements.  The
    narrower motif is matched against that set, and the number of matches is
    compared with its background match frequency from ``estimate_frequency``.

    Two results are cached on the motif objects: ``bestWide`` on the wide
    motif and ``probNarrow`` (a dict keyed by width) on the narrow one.

    Args:
        A, B: the two motifs to compare.
        thresh: score fraction used for both the sequence set and matching.
        verbose: accepted but not consulted by this function.

    Returns:
        1.0 when the narrow motif matches none of the sequences, otherwise the
        value of ``Arith.binomialsumtail``.  A diagnostic line is printed in
        the latter case.
    """
    if A.width < B.width:
        Wide, Narrow = B, A
    else:
        Wide, Narrow = A, B

    revcomp = MotifTools.revcomplement
    padded  = Wide[-1,Wide.width+1]

    if 'bestWide' in Wide.__dict__:
        bestWide = Wide.bestWide
    else:
        seen = {}
        for hit in padded.bestseqs(thresh*padded.maxscore):
            seen[hit[1]] = 1
        # Iterate over a snapshot of the keys: the dict grows in the loop.
        for seq in list(seen.keys()):
            seen[revcomp(seq)] = 1
        Wide.bestWide = list(seen.keys())
        bestWide = Wide.bestWide

    # Index the sequences so they can be loaded as a ProbeSet "genome".
    indexed = dict(enumerate(bestWide))
    matchNarrow = ProbeSet(genome=indexed).count_matching_probes(Narrow,thresh=thresh)
    if matchNarrow == 0:
        return 1.0

    if 'probNarrow' not in Narrow.__dict__:
        Narrow.probNarrow = {}
    # From here on the width in use is that of the sliced (padded) motif.
    width = padded.width
    if width in Narrow.probNarrow:
        probNarrow = Narrow.probNarrow[width]
    else:
        probNarrow = estimate_frequency(Narrow,width,thresh=thresh)
        Narrow.probNarrow[width] = probNarrow

    p = Arith.binomialsumtail(probNarrow,len(bestWide),matchNarrow)
    report = (Arith.nlog10(p),probNarrow,len(bestWide),matchNarrow,
              A.family,A,B.family,B)
    print('\nD= %7.3f %9.4e %8d %7d %-14s %-20s %-14s %-20s'%report)
    return p
def probOvlpBinomial(A, B, thresh=0.7, verbose=None):
    """Binomial-tail p-value for the overlap between motifs ``A`` and ``B``.

    The wider of the two motifs (``A`` when widths are equal) is sliced as
    ``Wide[-1, Wide.width + 1]``; the sequences ``bestseqs`` returns for that
    slice at ``thresh`` times its maximum score, together with their reverse
    complements, form the test set.  The narrower motif is matched against
    the set and the match count is evaluated against the motif's background
    frequency obtained from ``estimate_frequency``.

    The test set is cached as ``bestWide`` on the wide motif, and the
    background frequency as ``probNarrow[width]`` on the narrow motif.

    Args:
        A, B: the two motifs to compare.
        thresh: score fraction used for both the sequence set and matching.
        verbose: accepted but not consulted by this function.

    Returns:
        1.0 if nothing in the test set matches the narrow motif; otherwise
        the result of ``Arith.binomialsumtail`` (a diagnostic line is printed
        before returning it).
    """
    if A.width < B.width:
        Wide, Narrow = B, A
    else:
        Wide, Narrow = A, B

    revcomp = MotifTools.revcomplement
    sliced = Wide[-1, Wide.width + 1]

    if 'bestWide' in Wide.__dict__:
        bestWide = Wide.bestWide
    else:
        collected = {}
        for entry in sliced.bestseqs(thresh * sliced.maxscore):
            collected[entry[1]] = 1
        # Snapshot the keys first because reverse complements are added
        # to the same dict while looping.
        for forward in list(collected.keys()):
            collected[revcomp(forward)] = 1
        Wide.bestWide = list(collected.keys())
        bestWide = Wide.bestWide

    # ProbeSet takes a dict "genome"; key the sequences by their position.
    genome = dict(enumerate(bestWide))
    matcher = ProbeSet(genome=genome)
    matchNarrow = matcher.count_matching_probes(Narrow, thresh=thresh)
    if matchNarrow == 0:
        return 1.0

    if 'probNarrow' not in Narrow.__dict__:
        Narrow.probNarrow = {}
    # The cache key is the width of the sliced motif, not of the original.
    key = sliced.width
    if key in Narrow.probNarrow:
        probNarrow = Narrow.probNarrow[key]
    else:
        probNarrow = estimate_frequency(Narrow, key, thresh=thresh)
        Narrow.probNarrow[key] = probNarrow

    p = Arith.binomialsumtail(probNarrow, len(bestWide), matchNarrow)
    fields = (Arith.nlog10(p), probNarrow, len(bestWide), matchNarrow,
              A.family, A, B.family, B)
    print('\nD= %7.3f %9.4e %8d %7d %-14s %-20s %-14s %-20s' % fields)
    return p
from TAMO import MotifTools
from TAMO.seq import Fasta
from TAMO.MotifMetrics import ProbeSet
from TAMO.MD.AlignAce import AlignAce
from TAMO.MD.MDscan import MDscan
from TAMO.MD.Meme import Meme
#from TAMO.DataSources import GO
from time import time

# ---- Input / output locations ----------------------------------------------
fastaPath = '/Users/biggus/Documents/James/Data/ClusterDefs/TC-Fastas/TC-96.oneLine.fas'
MDbg = '/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Anopheles/2KBupTSS_goodAffyAGAPsFastasOUT.masked.nr.MD.bg'
outFile = '/Users/biggus/Documents/James/Data/ClusterDefs/testTAMOmetrics.txt'

# ---- Sequence ids and probe set for the cluster FASTA -----------------------
clusterIDS = Fasta.ids(fastaPath)
# Author's note: !! this is wrong should proly be goodAffys
totalSeqs = ProbeSet(fastaPath)

#theAce = AlignAce(fastaPath,width=10)

# ---- MDscan, timed with wall-clock time -------------------------------------
print('running MDscan...')
tMD_1 = time()
MDmotifs = MDscan(fastaPath)  # background file deliberately off: ,bgfile=MDbg)
tMD_2 = time()
MD_time = tMD_2 - tMD_1
print('MDscan took %.5f sec == %.3f min.\nMDscan found %s motifs.'
      % (MD_time, MD_time / 60.0, len(MDmotifs.motifs)))

# ---- MEME, timed the same way -----------------------------------------------
print('running MEME...')
tMeme_1 = time()
memeMotifs = Meme(fastaPath)
tMeme_2 = time()
Meme_time = tMeme_2 - tMeme_1
for f in pklFiles: unPikls[f.split('/')[-1]] = pickle.load(open(f,'r')) motifs = [] for i in unPikls: for j in range(len(unPikls[i])): fastaSourceName = unPikls[i][j].lines[1].split()[2].split('/')[-1] motifs.append([fastaSourceName,unPikls[i][j].results]) # Adjust each motif to the species being looked at for m in motifs: for i in motifs[i][1].new_bg(speciesBK) probSet = ProbeSet('/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Anopheles/2KBupTSS_goodAffyAGAPsFastasOUT.masked.nr.fas') # get and print motif and pVal(s) out = '#outFile\tMotif\tHyperGeoPval (%s)\tfrac (%s)\tBestHG (pval)\tfrac (bestHG_ScoreThresh)\tBestHG (scoreThresh)\tbinoPval (%s)\tbinoPval (bestHG_scoreThresh)' \ % (dfltFactor,dfltFactor,dfltFactor) print out out = out+'\n' for m in motifs: for i in m[1]: bestE = probSet.best_p_value(m[1],coRegSeqs) temp ='%s\t%s\t%.3e\t%.3f\t%.3e\t%.3f\t%.3f\t%.3e\t%.3e' % (m[0], m[1].oneletter, probSet.Enrichment(m[1],coRegSeqs,factor=dfltFactor), probSet.frac(m[1],coRegSeqs,factor=dfltFactor), bestE[0],
from gusPyCode.MDAP_proj.MDAP_defs import transfacLike2tamoMotif from TAMO.MotifMetrics import ProbeSet from TAMO import MotifTools import os listPath = '/Users/biggus/Documents/James/Data/ReClustering/PrelimData_Grant_Feb09/kMerPWMs/Clus2_kmerSearch.6-8mers.top10pcnt/Group5.txt' probSetReal = ProbeSet('/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Anopheles/2KBupTSS_goodAffyAGAPsFastasOUT.masked.nr.fas') probSetShuf = ProbeSet('/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Anopheles/2KBupTSS_goodAffyAGAPsFastasOUT.masked.nr.seqsShuffled.fas') coRegSeqs = '/Users/biggus/Documents/James/Data/ReClustering/kmedsPear33Clus50x_2/Clus2_247genes.genes.txt' coRegSeqs = map(lambda l: l.strip(), open(coRegSeqs, 'rU').readlines()) # set real or shuffled genome #probSet=probSetReal motifs = map(lambda l: l.strip(), open(listPath, 'rU').readlines()) for m in range(len(motifs)): motifs[m] = MotifTools.Motif_from_text(motifs[m]) group = 'Group5' # calc HyperGeo for all motifs print '#threshold = 75% of max score\n#Group\tone_letter\tpVal\tfraction_of_gene_set\tpVal (shuffled)\tfraction_of_gene_set (shuffled)' for m in motifs:
#! /usr/bin/env python2.4
# Author: Alexandre S. Cristino
# email: [email protected]
# Date: 16 june 2008
# Script: Scoring motifs in upstream regions of specific databases
#
# Command line:
#   argv[1]  file loaded into the promoter ProbeSet
#   argv[2]  text file with one gene id per line
#   argv[3]  motif file read with MotifTools.load
import os,sys,string
from TAMO import MotifTools
from TAMO.seq import Fasta
from TAMO.MotifMetrics import ProbeSet

promoters = ProbeSet(sys.argv[1])

# Read the gene ids.  splitlines() keeps the last id even when the file has
# no trailing newline (the previous split('\n')[:-1] dropped it) and copes
# with CRLF line ends.  try/finally closes the handle; the shebang targets
# Python 2.4, which has no "with" statement.
geneset_file = open(sys.argv[2])
try:
    geneset_ids = geneset_file.read().splitlines()
finally:
    geneset_file.close()

# Keep only the gene ids that have a promoter, preserving input order.
# Membership is tested against a set so each lookup is constant time.
prom_ids = promoters.probes.keys()
prom_id_set = set(prom_ids)
match_ids = [gene_id for gene_id in geneset_ids if gene_id in prom_id_set]

motifs = MotifTools.load(sys.argv[3])

# Cut-off constants; nothing in the visible part of this script reads them.
church = 0.05
rocauc = 0.1
pvalue = 0.05

print("Name\tMotif\tChurch\tRoc-auc\tP-value")
for m in motifs:
    # Attach the scores for the matched gene set to each motif.
    m.church = promoters.church (m, match_ids)
    # m.ROC_auc = promoters.ROC_AUC (m, match_ids)
    m.pvalue = promoters.p_value (m, match_ids)
def main():
    """Run the EM motif finder from the command line and report its motifs.

    Steps, in order:
      1. With no arguments, print the usage text and exit with status 1.
      2. Read the FASTA file named by ``sys.argv[1]``; an all-digit
         ``sys.argv[2]`` is taken as the motif width (``-w``/``-width``
         override it).
      3. Parse the remaining options and load the background model.
      4. Collect seeds: the first ``-top`` sequences, ``-prior`` entries,
         k-mer enrichment seeds (``-kmerseeds``), TRANSFAC matrices (``-TF``)
         and gapped patterns (``-gap``); optionally pad seed models (``-pad``).
      5. Run ``EM.EM`` and store the object in the module-level ``theEM``.
      6. Score each candidate against ``ProbeSet(species, enrichfact)`` and
         print the candidates by ascending p-value via
         ``MotifTools.print_motif``.
      7. Exit with status 0.

    ``-g`` is accepted as an alias of ``-genome`` because the usage text
    advertises it.  ``-greedy`` and ``-valid`` are parsed but their values
    are not used afterwards.
    """
    global theEM

    if len(sys.argv) < 2:
        prog = re.sub('^.*/','',sys.argv[0])
        base = sys.argv[0].split('/')[-1]
        # -TF, -Y2K, -Y5C and -B are parsed below but intentionally not
        # advertised here.
        usage = [
            "Usage: %s <fasta_file> [width = None ] [options]"%prog,
            "Options include:",
            "",
            " EM Parameters:",
            " -beta [0.01] Beta for pseudocounts",
            " -seedbeta[0.02] Beta for pseudocounts for seeds from text",
            " -gamma [0.2] Gamma (fraction of sequences)",
            " -delta [0.001] Convergence criteria",
            " ",
            " Seeds (not actually proper priors)",
            " -prior Seqences or motifs for seeds (may be repeated)",
            " -top N [0] Include w-mers in top N probes",
            " -gap string sample gapped motifs",
            " -kmerseeds Use kmers with best enrichment score as seeds for EM",
            " -pad add NN..NN to seed",
            " ",
            " Genome / Background model ",
            " -human (250,1000) Use Human Background model",
            " -g genome.fsa Use specicied Fasta file as background (searches first for matching frequency file)",
            " ",
            "Examples:",
            " %s t.fsa 5 -prior GGGTA -prior AAAAAC "%base,
            " will start an EM with 3 seeds: GGGTA, AAAAA, and AAAAC",
            "",
            " %s t.fsa 5 -info CUP9.info -gamma 0.5 "%base,
            " will start an EM with Enriched seeds in CUP9.info, with",
            " Gamma expectation of 50% of all probes",
            "",
            " %s t.fsa -prior MCM1_5.tamo:0 "%base,
            " will start an EM with 0th motif of the file MCM1_5.tamo",
            " as a seed",
            "",
        ]
        print('\n'.join(usage))
        sys.exit(1)

    fastafile = sys.argv[1]

    #Echo the command line (spaces inside an argument are backslash-escaped)
    print("#" + ' '.join([re.sub(' ','\\ ',arg) for arg in sys.argv]))

    # The width is optional, so argv[2] may not exist; indexing it blindly
    # raised IndexError when only the FASTA file was given.
    if len(sys.argv) > 2 and sys.argv[2].isdigit():
        width = sys.argv[2]
    else:
        width = None

    algorithm = ''            # set by -greedy, not used afterwards
    beta      = ''
    seedbeta  = ''
    deltamin  = ''
    gamma     = 0.2
    infofile  = ''
    seedmodels= []
    species   = 'YEAST'
    valid_tfs = []            #NOT USED
    gapped_syl= None
    gapflank  = 0
    gapweight = 0.2
    enrichfact= 0.7
    pmax      = 0             #False
    TFSEEDS   = 0
    TFMids    = []
    pad       = None
    bgfile    = None

    seed_count = 0            #Default: Take the top 0
    seed_s     = []           #Initialize seq array

    # Parse command-line arguments.  Options that take a value read the
    # token that follows them.
    for i, tok in enumerate(sys.argv):
        if   tok == '-top'       : seed_count = int(sys.argv[i+1])
        elif tok == '-greedy'    : algorithm  = "GREEDY"
        elif tok == '-prior'     : seed_s.append(sys.argv[i+1])
        elif tok == '-beta'      : beta       = float(sys.argv[i+1])
        elif tok == '-seedbeta'  : seedbeta   = float(sys.argv[i+1])
        elif tok == '-gamma'     : gamma      = float(sys.argv[i+1])
        elif tok == '-delta'     : deltamin   = float(sys.argv[i+1])
        elif tok == '-kmerseeds' : infofile   = 1
        elif tok == '-valid'     : valid_tfs.append(sys.argv[i+1]) #NOT USED
        elif tok == '-w'         : width      = sys.argv[i+1]
        elif tok == '-width'     : width      = sys.argv[i+1]
        elif tok == '-gap'       : gapped_syl = sys.argv[i+1]
        elif tok == '-gapflank'  : gapflank   = int(sys.argv[i+1])
        elif tok == '-gapweight' : gapweight  = float(sys.argv[i+1])
        elif tok == '-enrichfact': enrichfact = float(sys.argv[i+1])
        elif tok == '-pmax'      : pmax       = 1
        elif tok == '-Y2K'       : species    = "YEAST_2000_UP"
        elif tok == '-Y5C'       : species    = "YEAST_500_UP"
        elif tok == '-B'         : species    = "BAC_ORF"
        elif tok == '-Ch22'      : species    = "Ch22"
        elif tok in ('-genome', '-g'):
            # '-g' is the spelling shown in the usage text.
            species = sys.argv[i+1]
        elif tok == '-pad'       : pad        = "TRUE"
        elif tok == '-bgfile'    : bgfile     = sys.argv[i+1]
        elif tok == '-TF':
            #NOT USED (TRANSFAC NOT SUPPLIED WITH DISTRIBUTION)
            TFSEEDS = 1
            # Collect the run of matrix ids (tokens starting with "M0")
            # that directly follows -TF.
            for j in range(i+1,len(sys.argv)):
                if re.match('M0',sys.argv[j]):
                    TFMids.append(sys.argv[j])
                else:
                    break
        elif tok == '-human':
            # An optional all-digit token after -human selects a variant,
            # e.g. HUMAN_250; -human may also be the last token.
            _s = ''
            if i+1 < len(sys.argv) and sys.argv[i+1].isdigit():
                _s = '_'+sys.argv[i+1]
            species = 'HUMAN'+_s

    # -kmerseeds takes its candidate sequences from the input FASTA itself.
    if infofile:
        infofile = fastafile

    # Background model: explicit file, else the species model, else zeroth
    # order when -random_background / -nomarkov is present.
    if bgfile:
        EM.loadMarkovBackground(bgfile)
    elif not ('-random_background' in sys.argv or '-nomarkov' in sys.argv):
        EM.loadMarkovBackground(species)
    else:
        EM.theMarkovBackground = EM.Zeroth()

    fsaD = Fasta.load(fastafile)
    Fasta.delN(fsaD)
    seqs   = list(fsaD.values())
    probes = list(fsaD.keys())
    seed_s.extend(seqs[:seed_count])

    if infofile and width == 'info':
        width = info2width(infofile)
    elif width is not None:
        width = int(width)

    #Alternate source of seeds
    if infofile:
        seedmodels.extend(info2seeds(width,infofile,fastafile,species))

    #Any -prior pointers to motifs in other files?
    (seed_s, motifs) = parse_priors(seed_s)
    seedmodels.extend(motifs)

    #Should we get seeds from TRANSFAC?
    if TFSEEDS: #NOT USED
        tf = []
        D  = tfmats()
        if not TFMids:
            keys = D.keys()
        else:
            # For each requested id take the first matrix whose key starts
            # with it (first six characters).
            keys = []
            for TFMid in TFMids:
                for key in D.keys():
                    if key[0:6] == TFMid:
                        keys.append(key)
                        break
        for key in keys:
            m = D[key]
            m.seednum = int(re.sub('M0*','',key.split()[0]))
            m.seedtxt = '%-24s %s'%(m,key)
            tf.append(m)
        tf.sort(key=lambda mat: mat.seednum)
        seedmodels.extend(tf)
        #seedmodels.append(tf[33])

    if gapped_syl:
        gapped_priors = gapped_motifs(gapped_syl)
        seed_s.extend(['N'+g+'N' for g in gapped_priors])

    if pad:
        print('# Padding models with NN-m-NN')
        left  = MotifTools.Motif_from_text('@')
        right = MotifTools.Motif_from_text('N')
        newmodels = []
        for m in seedmodels:
            padded = left + m + right
            newmodels.append(padded)
            print(padded)
        seedmodels = newmodels

    # Set everything up and GO!!
    theEM = EM.EM(seed_s,[],width,"VERBOSE")
    if beta:     theEM.beta     = beta
    if deltamin: theEM.deltamin = deltamin
    if seedbeta: theEM.seedbeta = seedbeta
    theEM.param['gamma'] = gamma
    theEM.seqs.extend(seqs)
    theEM.models    = seedmodels
    theEM.gapflank  = gapflank
    theEM.gapweight = gapweight
    theEM.report()
    theEM.EM_Cstart()         #GO!!

    # Compute some metrics
    print("#Loading Genome %s"%species)
    sys.stdout.flush()
    Genome = ProbeSet(species,enrichfact)
    ids    = Genome.ids_from_file(fastafile)
    bound  = set(probes)      # constant-time membership for the overlap count
    for C in theEM.candidates:
        if not pmax:
            C.pssm.pvalue = Genome.p_value(C.pssm,ids,'verbose')
            C.pssm.church = Genome.church(C.pssm,ids)
            C.pssm.frac   = Genome.frac(C.pssm,probes,None,0.7)
        else:
            (p,frac) = Genome.best_p_value(C.pssm,ids)
            C.pssm.pvalue    = p
            C.pssm.threshold = frac * C.pssm.maxscore
            print("Bests: %s %s"%(p,frac))

        matching   = Genome.matching_ids(C.pssm,[],factor=0.7)
        matchbound = [x for x in matching if x in bound]
        C.pssm.numbound      = len(probes)
        C.pssm.nummotif      = len(matching)
        C.pssm.numboundmotif = len(matchbound)
        sys.stdout.flush()

    # Print out all motifs (sorted by Enrichment) in an AlignACE-like form
    theEM.candidates.sort(key=lambda cand: cand.pssm.pvalue)
    for i, C in enumerate(theEM.candidates):
        C.pssm.maxscore = -100  #May have side effects.  Recompute when done
        C.pssm._maxscore()      #Recomputed
        MotifTools.print_motif(C.pssm,20,i)
        sys.stdout.flush()

    sys.stdout.flush()
    sys.exit(0) # Original note: avoid ridiculous python cleanup times