def showdiffXvert(motif, seq, OVLP_FCN=None, DIFF_FCN=None):
    """Print the best alignment between *motif* and the text motif *seq*.

    *seq* is turned into a Motif (sharing ``motif.background``), the
    distance D is computed with ``minshortestoverhangdiff``, and the
    alignment that produced D is printed as two lines: *motif* on top
    and *seq* (reverse-complemented when the alignment asks for it)
    underneath, shifted by the reported offset.

    Arguments:
      motif     -- reference Motif; must expose ``background``, ``width``
                   and ``oneletter``.
      seq       -- text form of the motif to compare against.
      OVLP_FCN  -- optional callable ``(A, B) -> overlap``.  When omitted
                   the overlap is ``min(min(A.width, B.width) - 1, 7)``.
      DIFF_FCN  -- forwarded to ``minshortestoverhangdiff`` as ``DFUNC``.

    Returns the distance D.
    """
    if OVLP_FCN:
        overlap_of = OVLP_FCN
    else:
        def overlap_of(A, B):
            return min(min(A.width, B.width) - 1, 7)

    other = MotifTools.Motif_from_text(seq, bg=motif.background)
    ovlp = overlap_of(motif, other)

    # First call yields the distance, second one the alignment details.
    diff = minshortestoverhangdiff(motif, other, ovlp, DFUNC=DIFF_FCN)
    offset, rcflag = minshortestoverhangdiff(motif, other, ovlp,
                                             'want_offset', DFUNC=DIFF_FCN)

    # A truthy rcflag means the reverse complement is the aligned strand.
    aligned = other.revcomp() if rcflag else other

    top_pad = ' ' * (30 - motif.width)
    print('MSOdiff: %8.4f %s%s%s' % (diff, ' ' * 15, motif.oneletter, top_pad))

    bottom_pad = ' ' * (30 - offset - other.width)
    print(' %8s %s%s%s' % (' ', ' ' * (15 + offset), aligned.oneletter, bottom_pad))

    return diff
def main():
    """Split a CSV motif listing into one TAMO motif file per experiment.

    The file named by ``sys.argv[1]`` is read as comma-separated rows of
    four fields: experiment, motif text, score, source.  Rows whose
    experiment field is ``Category`` (header) or whose motif is ``x``
    are skipped; blank lines are ignored.  Every remaining motif is
    converted with ``MotifTools.Motif_from_text``, tagged with
    ``kellis`` (the score as a float) and ``source``, and grouped by
    experiment.  Each group is written with ``motifs2tamo`` to
    ``<root>.tcons``, where ``<root>`` is the experiment name (with the
    ``Rnd_<n>`` names rewritten, see below).

    Raises:
      ValueError -- a non-blank row does not have exactly four fields,
                    or a score is not a valid float.
    """
    # NOTE(review): fsa_fcn is not referenced anywhere in this function;
    # confirm nothing relies on it before removing.
    fsa_fcn = up_and_no_N
    parse()

    # 'with' guarantees the handle is closed even if reading fails.
    with open(sys.argv[1]) as handle:
        rows = [line.strip().split(',') for line in handle]

    by_expt = {}
    for lineno, fields in enumerate(rows):
        if fields == ['']:
            # Blank line: nothing to unpack.
            continue
        if len(fields) != 4:
            raise ValueError('%s line %d: expected 4 comma-separated fields, got %d'
                             % (sys.argv[1], lineno + 1, len(fields)))
        expt, motif, score, source = fields
        print('%s %s' % (expt, motif))
        if expt == 'Category':
            continue
        if motif == 'x':
            continue
        motif = MotifTools.Motif_from_text(motif)
        motif.kellis = float(score)
        motif.source = source
        by_expt.setdefault(expt, []).append(motif)

    ext = 'cons'
    for expt, motifs in by_expt.items():
        root = expt
        if root[0:3] == 'Rnd':
            # Zero-pad the number after the underscore and expand the
            # 'Rnd' prefix to 'random_'.
            num = re.sub('.*_', '', root)
            if len(num) == 1:
                root = re.sub('_', '_00', root)
            else:
                root = re.sub('_', '_0', root)
            root = re.sub('Rnd', 'random_', root)
        outname = '%s.t%s' % (root, ext)
        print('%-18s --> %s' % (root, outname))
        sys.stdout.flush()
        motifs2tamo(motifs, outname)
def _print_em_usage():
    """Print the command-line help for the EM driver."""
    prog = sys.argv[0].split('/')[-1]
    print("Usage: %s <fasta_file> [width = None ] [options]" % (re.sub('^.*/', '', sys.argv[0])))
    print("Options include:")
    print("")
    print(" EM Parameters:")
    print(" -beta [0.01] Beta for pseudocounts")
    print(" -seedbeta[0.02] Beta for pseudocounts for seeds from text")
    print(" -gamma [0.2] Gamma (fraction of sequences)")
    print(" -delta [0.001] Convergence criteria")
    print(" ")
    print(" Seeds (not actually proper priors)")
    print(" -prior Seqences or motifs for seeds (may be repeated)")
    print(" -top N [0] Include w-mers in top N probes")
    print(" -gap string sample gapped motifs")
    # -TF is still parsed below but deliberately not advertised (buggy).
    print(" -kmerseeds Use kmers with best enrichment score as seeds for EM")
    print(" -pad add NN..NN to seed")
    print(" ")
    print(" Genome / Background model ")
    print(" -human (250,1000) Use Human Background model")
    print(" -g genome.fsa Use specicied Fasta file as background (searches first for matching frequency file)")
    # -Y2K, -Y5C and -B are still parsed below but not advertised.
    print(" ")
    print("Examples:")
    print(" %s t.fsa 5 -prior GGGTA -prior AAAAAC " % prog)
    print(" will start an EM with 3 seeds: GGGTA, AAAAA, and AAAAC")
    print("")
    print(" %s t.fsa 5 -info CUP9.info -gamma 0.5 " % prog)
    print(" will start an EM with Enriched seeds in CUP9.info, with")
    print(" Gamma expectation of 50% of all probes")
    print("")
    print(" %s t.fsa -prior MCM1_5.tamo:0 " % prog)
    print(" will start an EM with 0th motif of the file MCM1_5.tamo")
    print(" as a seed")
    print("")


def main():
    """Command-line driver: seed and run an EM motif search on a FASTA file.

    Invocation: ``<script> <fasta_file> [width] [options]``; with no
    arguments the help text is printed and the process exits with
    status 1.

    Steps performed:
      1. Parse ``sys.argv`` (options may appear anywhere on the line).
      2. Install the background model: ``-bgfile`` wins, otherwise the
         selected species is loaded through ``EM.loadMarkovBackground``,
         unless ``-random_background`` / ``-nomarkov`` requests
         ``EM.Zeroth()``.
      3. Load the sequences with ``Fasta.load`` and gather seeds from
         ``-top``, ``-prior``, ``-kmerseeds``, ``-TF`` and ``-gap``.
      4. Build the module-global ``theEM`` and run ``EM_Cstart``.
      5. Score every candidate against ``ProbeSet(species, enrichfact)``
         and print the candidates sorted by ascending p-value.

    The function never returns normally: it ends with ``sys.exit(0)``.
    An option that requires a value but is given as the last token
    still raises IndexError.
    """
    global theEM

    if len(sys.argv) < 2:
        _print_em_usage()
        sys.exit(1)

    fastafile = sys.argv[1]

    # Echo the command line (spaces inside an argument are escaped).
    print("#" + ' '.join([x.replace(' ', '\\ ') for x in sys.argv]))

    # The width is optional, so there may be no third token at all.
    if len(sys.argv) > 2 and sys.argv[2].isdigit():
        width = sys.argv[2]
    else:
        width = None

    algorithm = ''        # set by -greedy; not used further in this function
    beta = ''
    seedbeta = ''
    deltamin = ''
    gamma = 0.2
    infofile = ''
    seedmodels = []
    species = 'YEAST'
    valid_tfs = []        # NOT USED
    gapped_syl = None
    gapflank = 0
    gapweight = 0.2
    enrichfact = 0.7
    pmax = 0              # False
    TFSEEDS = 0
    TFMids = []
    pad = None
    bgfile = None

    seed_count = 0        # Default: take the top 0
    seed_s = []           # Seed sequences / -prior arguments

    # Parse command-line arguments
    for i, tok in enumerate(sys.argv):
        if tok == '-top':
            seed_count = int(sys.argv[i+1])
        elif tok == '-greedy':
            algorithm = "GREEDY"
        elif tok == '-prior':
            seed_s.append(sys.argv[i+1])
        elif tok == '-beta':
            beta = float(sys.argv[i+1])
        elif tok == '-seedbeta':
            seedbeta = float(sys.argv[i+1])
        elif tok == '-gamma':
            gamma = float(sys.argv[i+1])
        elif tok == '-delta':
            deltamin = float(sys.argv[i+1])
        elif tok == '-kmerseeds':
            infofile = 1
        elif tok == '-valid':
            valid_tfs.append(sys.argv[i+1])   # NOT USED
        elif tok == '-w':
            width = sys.argv[i+1]
        elif tok == '-width':
            width = sys.argv[i+1]
        elif tok == '-gap':
            gapped_syl = sys.argv[i+1]
        elif tok == '-gapflank':
            gapflank = int(sys.argv[i+1])
        elif tok == '-gapweight':
            gapweight = float(sys.argv[i+1])
        elif tok == '-enrichfact':
            enrichfact = float(sys.argv[i+1])
        elif tok == '-pmax':
            pmax = 1
        elif tok == '-Y2K':
            species = "YEAST_2000_UP"
        elif tok == '-Y5C':
            species = "YEAST_500_UP"
        elif tok == '-B':
            species = "BAC_ORF"
        elif tok == '-Ch22':
            species = "Ch22"
        elif tok in ('-genome', '-g'):
            # '-g' is the spelling shown in the usage text.
            species = sys.argv[i+1]
        elif tok == '-pad':
            pad = "TRUE"
        elif tok == '-bgfile':
            bgfile = sys.argv[i+1]
        elif tok == '-TF':
            # NOT USED (TRANSFAC NOT SUPPLIED WITH DISTRIBUTION)
            TFSEEDS = 1
            for j in range(i+1, len(sys.argv)):
                if re.match('M0', sys.argv[j]):
                    TFMids.append(sys.argv[j])
                else:
                    break
        elif tok == '-human':
            # The size suffix is optional, so '-human' may be the last token.
            if i + 1 < len(sys.argv) and sys.argv[i+1].isdigit():
                species = 'HUMAN' + '_' + sys.argv[i+1]
            else:
                species = 'HUMAN'

    # -kmerseeds: the enrichment "info" source is the FASTA file itself.
    if infofile:
        infofile = fastafile

    if bgfile:
        EM.loadMarkovBackground(bgfile)
    elif not ('-random_background' in sys.argv or '-nomarkov' in sys.argv):
        EM.loadMarkovBackground(species)
    else:
        EM.theMarkovBackground = EM.Zeroth()

    fsaD = Fasta.load(fastafile)
    Fasta.delN(fsaD)
    seqs = list(fsaD.values())
    probes = list(fsaD.keys())
    all_seqs = seqs
    seed_s.extend(seqs[0:min(seed_count, len(seqs))])

    if infofile and width == 'info':
        width = info2width(infofile)
    elif width is not None:
        width = int(width)

    # Alternate source of seeds (width may legitimately be None here).
    if infofile:
        seedmodels.extend(info2seeds(width, infofile, fastafile, species))

    # Any -prior pointers to motifs in other files?
    (seed_s, motifs) = parse_priors(seed_s)
    seedmodels.extend(motifs)

    # Should we get seeds from TRANSFAC?
    if TFSEEDS:   # NOT USED
        tf = []
        D = tfmats()
        if not TFMids:
            keys = D.keys()
        else:
            # Keep the first matrix whose key starts with each requested id.
            keys = []
            for TFMid in TFMids:
                for key in D.keys():
                    if key[0:6] == TFMid:
                        keys.append(key)
                        break
        for key in keys:
            m = D[key]
            m.seednum = int(re.sub('M0*', '', key.split()[0]))
            m.seedtxt = '%-24s %s' % (m, key)
            tf.append(m)
        tf.sort(key=lambda m: m.seednum)
        seedmodels.extend(tf)

    if gapped_syl:
        gapped_priors = gapped_motifs(gapped_syl)
        seed_s.extend(['N' + x + 'N' for x in gapped_priors])

    if pad:
        print('# Padding models with NN-m-NN')
        newmodels = []
        # NOTE(review): the message says NN-m-NN but the left pad is built
        # from '@' while the right pad uses 'N' -- confirm '@' is intended.
        left = MotifTools.Motif_from_text('@')
        right = MotifTools.Motif_from_text('N')
        for m in seedmodels:
            newmodels.append(left + m + right)
            print(left + m + right)
        seedmodels = newmodels

    # Set everything up and GO!!
    theEM = EM.EM(seed_s, [], width, "VERBOSE")
    if beta:
        theEM.beta = beta
    if deltamin:
        theEM.deltamin = deltamin
    if seedbeta:
        theEM.seedbeta = seedbeta
    theEM.param['gamma'] = gamma
    theEM.seqs.extend(all_seqs)
    theEM.models = seedmodels
    theEM.gapflank = gapflank
    theEM.gapweight = gapweight
    theEM.report()
    theEM.EM_Cstart()     # GO!!

    # Compute some metrics
    print("#Loading Genome %s" % species)
    sys.stdout.flush()
    Genome = ProbeSet(species, enrichfact)
    ids = Genome.ids_from_file(fastafile)
    probe_set = set(probes)   # O(1) membership for the bound-probe count
    for C in theEM.candidates:
        if not pmax:
            C.pssm.pvalue = Genome.p_value(C.pssm, ids, 'verbose')
            C.pssm.church = Genome.church(C.pssm, ids)
            C.pssm.frac = Genome.frac(C.pssm, probes, None, 0.7)
        else:
            (p, frac) = Genome.best_p_value(C.pssm, ids)
            C.pssm.pvalue = p
            C.pssm.threshold = frac * C.pssm.maxscore
            print("Bests: %s %s" % (p, frac))

        matching = Genome.matching_ids(C.pssm, [], factor=0.7)
        matchbound = [x for x in matching if x in probe_set]
        C.pssm.numbound = len(probes)
        C.pssm.nummotif = len(matching)
        C.pssm.numboundmotif = len(matchbound)
        sys.stdout.flush()

    # Print out all motifs (sorted by Enrichment) in an AlignACE-like form
    theEM.candidates.sort(key=lambda cand: cand.pssm.pvalue)
    for i, C in enumerate(theEM.candidates):
        C.pssm.maxscore = -100    # May have side effects.  Recompute when done
        C.pssm._maxscore()        # Recomputed
        MotifTools.print_motif(C.pssm, 20, i)
        sys.stdout.flush()

    sys.stdout.flush()
    sys.exit(0)   # Avoid ridiculous python cleanup times
# Generate PWMs (TAMO motifs) for the motifs listed in a text file and
# save them next to the input as <input>.tamo.
# module load TAMO (on hpc)
import sys

from TAMO import MotifTools

# Input file: one motif per line.  Lines starting with '#' are comments;
# blank lines are ignored so they are not turned into empty motifs.
motif = []
with open(sys.argv[1], 'r') as motif_file:
    for line in motif_file:
        if line.startswith("#"):
            continue
        line = line.strip()
        if line:
            motif.append(line)
print(motif)

# Convert every motif text into a TAMO Motif and save the whole list.
pw = [MotifTools.Motif_from_text(text) for text in motif]
MotifTools.save_motifs(pw, sys.argv[1] + '.tamo')
def averagetxts(txts, ovlp=2, VERBOSE=1):
    """Average motifs given as text.

    Each entry of *txts* is stripped of surrounding whitespace and
    converted with ``MotifTools.Motif_from_text``; the resulting list is
    handed to ``averagemotifs`` together with *ovlp* and *VERBOSE*, and
    that function's result is returned unchanged.
    """
    converted = []
    for text in txts:
        converted.append(MotifTools.Motif_from_text(text.strip()))
    averaged = averagemotifs(converted, ovlp, VERBOSE=VERBOSE)
    return averaged
def testdiff(t1, t2, OVLP_FCN=None, DIFF_FCN=None):
    """Show the alignment between two motifs given as text.

    *t1* is converted to a Motif using ``MotifTools.YEAST_BG`` as the
    background; *t2* is passed on as text to ``showdiffXvert`` along
    with the optional overlap and difference functions.  Nothing is
    returned.
    """
    yeast_bg = MotifTools.YEAST_BG
    reference = MotifTools.Motif_from_text(t1, bg=yeast_bg)
    showdiffXvert(reference, t2, OVLP_FCN, DIFF_FCN)
def main():
    """Draw a text "site map" of motif matches along each input probe.

    Options (parsed with getopt):
      -f FILE   FASTA file of probes (required).
      -m TEXT   motifs, parsed by ``MotifTools.txt2motifs``.
      -n LIST   comma-separated indices of the motifs to use
                (default: all of them).
      -L CHARS  one label character per selected motif.
      -t FLOAT  threshold as a fraction of each motif's maxscore [0.7].
      -a LIST   comma-separated ambiguity-code motifs.
      -S FLOAT  horizontal scale, columns per base [50/1000].
      --help    print usage and exit.

    For every probe that contains none of the characters B, D, H or U,
    each selected motif is scanned at ``thresh * maxscore`` of that same
    motif.  Probes with at least one hit are printed as a row of '-'
    characters with the motif label (or its position in the selection)
    written near the scaled start of each hit, followed by a textual
    list of the hits.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], "f:m:n:L:t:a:S:", ["help", "output="])
    except getopt.GetoptError:
        usage()
        sys.exit(1)
    if not opts:
        usage()
        sys.exit(1)

    print("#" + ' '.join(sys.argv))
    fastafile = None
    # NOTE(review): no option assigns motiffile, so the tamofile branch
    # below never runs as written -- confirm whether an option is missing.
    motiffile = None
    motifnums = []
    labels = None
    thresh = 0.7
    ambigs = []
    scale = 50.0 / 1000.0
    motifs = []

    for opt, value in opts:
        if opt == '-f':
            fastafile = value
        elif opt == '-m':
            motifs.extend(MotifTools.txt2motifs(value))
        elif opt == '-n':
            motifnums = [int(x) for x in value.split(',')]
        elif opt == '-L':
            labels = list(value)
        elif opt == '-t':
            thresh = float(value)
        elif opt == '-a':
            ambigs.extend(value.split(','))
        elif opt == '-S':
            scale = float(value)
        elif opt == '--help':
            usage()
            sys.exit(0)

    if not fastafile:
        # Without -f there is nothing to scan.
        usage()
        sys.exit(1)

    probes = Fasta.load(fastafile)

    if motiffile:
        motifs.extend(TAMO.tamofile2motifs(motiffile))
    for ambig in ambigs:
        motifs.append(MotifTools.Motif_from_text(ambig, 0.1))
    if not motifnums:
        motifnums = list(range(len(motifs)))
    print('# %d: %s' % (len(motifs), motifnums))

    # Legend: label, motif, absolute threshold, fractional threshold.
    for i, num in enumerate(motifnums):
        motif = motifs[num]
        if labels and i < len(labels):
            txt = labels[i]
        else:
            txt = '%d' % i
        print('%-3s : %s %5.2f (%4.2f)' % (txt, motif, thresh * motif.maxscore, thresh))

    probehits = {}
    for key in probes.keys():
        seq = probes[key]
        if re.search('[BDHU]', seq):
            continue
        hits_by_motif = []
        save_flag = 0
        for num in motifnums:
            # Threshold against the motif being scanned, not against
            # whichever motif the legend loop happened to end on.
            candidate = motifs[num]
            result = candidate.scan(seq, thresh * candidate.maxscore)
            if result[0]:
                hits_by_motif.append(result)
                save_flag = 1
            else:
                hits_by_motif.append(None)
        if save_flag:
            probehits[key] = hits_by_motif

    maxw = 40
    for key in probehits.keys():
        # One '-' per scaled base plus 10 spare cells so a label written
        # near the end of the probe always has room.
        track = ['-'] * int(scale * len(probes[key])) + [' '] * 10
        desc = []
        for i, match in enumerate(probehits[key]):
            if not match:
                continue
            if labels and i < len(labels):
                ID = labels[i]
            else:
                ID = '%d' % i
            subseqs, endpoints, scores = match
            for subseq, (start, stop), score in zip(subseqs, endpoints, scores):
                desc.append('%s %s %d-%d %4.2f ' % (ID, subseq, start, stop, score))
                cell = int(start * scale)
                # Use the first free cell within 10 columns of the start.
                for offset in range(10):
                    if track[cell + offset] == '-':
                        track[cell + offset] = ID
                        break
        print('%-14s %s %s %s' % (key,
                                  ''.join(track),
                                  ' ' * max(0, maxw - len(track)),
                                  '| '.join(['%-27s' % x for x in desc])))
    print("")
    print("Found matches in %d of %d input probes" % (len(probehits), len(probes)))