def SGDData():
    """Fetch the SGD yeast data files and merge the chromosomes into one FASTA.

    Downloads the feature tables, protein FASTA files, the Huh et al. GFP
    table and the per-chromosome sequence files into ``TAMO.paths.SGDdir``
    via ``downloadfiles``, then writes ``NCBI_yeast_genome.fsa`` containing
    one record per chromosome, keyed ``'chr<N> <original id>'``.
    """
    sgd_dir = TAMO.paths.SGDdir
    base_url = 'ftp://genome-ftp.stanford.edu/pub/yeast/data_download/'
    chromosomes = '01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 mt'.split()

    wanted = [
        'chromosomal_feature/SGD_features.tab',
        'chromosomal_feature/dbxref.tab',
        'chromosomal_feature/chromosome_length.tab',
        'sequence/GenBank/yeast_nrpep.fasta.gz',
        'sequence/genomic_sequence/orf_protein/orf_trans_all.fasta.gz',
        # (url, local name) pair: fetched from a different host.
        ('http://yeastgfp.ucsf.edu/allOrfData.txt', 'Huh_Nature_2003.tab'),
    ]
    for label in chromosomes:
        wanted.append('sequence/NCBI_genome_source/chr%s.fsa' % label)
    downloadfiles(sgd_dir, base_url, wanted)

    from TAMO.seq import Fasta
    print("Assembling yeast genome sequence files into a single file (NCBI_yeast_genome.fsa)")
    genome = {}
    for label in chromosomes:
        records = Fasta.load('%s/chr%s.fsa' % (TAMO.paths.SGDdir, label))
        # Only the first record of each chromosome file is used.
        header, sequence = list(records.items())[0]
        # '01' -> '1'; '10' and 'mt' are kept as they are.
        if label[0] == '0':
            short_label = label[1]
        else:
            short_label = label
        genome['chr%s %s' % (short_label, header)] = sequence
    Fasta.write(genome, TAMO.paths.SGDdir + 'NCBI_yeast_genome.fsa')
def genomebg(infile, outfile):
    """Run the MDscan ``genomebg.linux`` tool on a FASTA file.

    The sequences in ``infile`` are rewritten to a temporary FASTA file with a
    very large ``linelen`` and passed to the external program with ``-i``;
    its result is written to ``outfile`` (``-o``).  Everything the tool prints
    on stdout/stderr is echoed, followed by "Exited" when the pipe reports a
    non-zero status.

    The temporary file is created with ``mkstemp`` (no name race, unlike
    ``mktemp``) and is always removed, even if writing or running the tool
    raises.
    """
    EXE = MDSCAN_DIR + 'genomebg.linux'
    fsaD = Fasta.load(infile)
    handle, tmpfsa = tempfile.mkstemp()
    os.close(handle)  # Fasta.write reopens the file by name
    try:
        Fasta.write(fsaD, tmpfsa, linelen=1000000000)
        CMD = '%s -i %s -o %s' % (EXE, tmpfsa, outfile)
        # Subshell + 2>&1 so stderr of the tool is captured as well.
        FID = os.popen('( %s ;) 2>&1' % CMD, 'r')
        for line in FID.readlines():
            print(line)
        if FID.close():
            print("Exited")
    finally:
        os.unlink(tmpfsa)
def genomebg(infile, outfile):
    """Run the MDscan ``genomebg.linux`` tool on a FASTA file.

    The sequences in ``infile`` are rewritten to a temporary FASTA file with a
    very large ``linelen`` and passed to the external program with ``-i``;
    its result is written to ``outfile`` (``-o``).  Everything the tool prints
    on stdout/stderr is echoed, followed by "Exited" when the pipe reports a
    non-zero status.

    The temporary file is created with ``mkstemp`` (no name race, unlike
    ``mktemp``) and is always removed, even if writing or running the tool
    raises.
    """
    EXE = MDSCAN_DIR + 'genomebg.linux'
    fsaD = Fasta.load(infile)
    handle, tmpfsa = tempfile.mkstemp()
    os.close(handle)  # Fasta.write reopens the file by name
    try:
        Fasta.write(fsaD, tmpfsa, linelen=1000000000)
        CMD = '%s -i %s -o %s' % (EXE, tmpfsa, outfile)
        # Subshell + 2>&1 so stderr of the tool is captured as well.
        FID = os.popen('( %s ;) 2>&1' % CMD, 'r')
        for line in FID.readlines():
            print(line)
        if FID.close():
            print("Exited")
    finally:
        os.unlink(tmpfsa)
def memefiles2tamo(files, tamoname):
    """Collect motifs from AlignACE/MEME result files, score them, save them.

    files    -- result file names; '.ace' files are parsed with
                AlignAce.AlignAce, names matching '.meme' with Meme.Meme.
    tamoname -- output name handed to MotifTools.save_motifs.

    Reads the module globals ``probefile`` and ``fsafile`` and (re)binds
    ``PROBESET``.

    NOTE(review): the source of this function was flattened onto one line, so
    the indentation below is a reconstruction.  In particular it is assumed
    that only the parsing happens inside the ``for filename`` loop and that
    the scoring uses the *last* ``filename`` / ``mdobject`` -- confirm against
    the original file.
    """
    global probefile, PROBESET, fsafile
    motifs = []
    for filename in files:
        print ">>>SDFSD>F ",filename
        if re.search('\.ace$',filename):
            mdobject = AlignAce.AlignAce(filename)
            # Fall back to the sibling .fsa file when the parser found no FASTA name.
            if not mdobject.fastafile: mdobject.fastafile=filename.replace('.ace','.fsa')
        elif re.search('\.meme.*$',filename):
            mdobject = Meme.Meme(filename)
            # Strip a one-character infix ('.X.meme' -> '.meme') before swapping the suffix.
            if not mdobject.fastafile: mdobject.fastafile=re.sub('\..\.meme','.meme',filename).replace('.meme','.fsa')
        # NOTE(review): a file matching neither pattern reuses the previous
        # mdobject (or raises NameError on the first file).
        motifs.extend(mdobject.motifs)
    #fsaname = find_fsa(mdobject.fastafile)
    print mdobject.fastafile
    # An explicitly configured FASTA file wins over the one named by the results.
    if fsafile: fsaname = fsafile
    else: fsaname = Fasta.find(mdobject.fastafile)
    fsaD = Fasta.load(fsaname)
    probes = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')
        #PROBESET= pick_genome(fsaname)
    # Make sure every input sequence is present in the probe set.
    for key,seq in fsaD.items():
        PROBESET.probes[key] = seq
    for motif in motifs:
        # A value of 1 / None is treated as "not computed yet".
        if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v')
        if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v')
        #if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        #if motif.E_seq == None: motif.E_seq = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc== None: motif.ROC_auc= PROBESET.ROC_AUC(motif,probes,'v')
        #if motif.MNCP == None: motif.MNCP = PROBESET.MNCP(motif,probes,'v')
        if motif.frac == None: motif.frac = PROBESET.frac(motif,probes,'v',0.7)
        # For MEME results MAP is set to -log10 of the reported E-value.
        if re.search('\.meme$',filename):
            motif.MAP = -math.log(motif.evalue)/math.log(10)
        # Disabled ('if 0'): conservation-based ROC AUC.
        if 0 and (motif.CRA == None):
            try:
                pass
                CRA, Cfrac = PROBESET.cons_ROC_AUC(motif,probes,'v',tuple='YES')
                motif.CRA = CRA
                motif.Cfrac = Cfrac
            except: pass
    # NOTE(review): this sorts mdobject.motifs (last file only), not the
    # combined ``motifs`` list that is saved below.
    if re.search('\.meme$',filename):
        mdobject.motifs.sort(lambda x,y: cmp(x.pvalue, y.pvalue))
    else:
        mdobject.motifs.sort(lambda x,y: cmp(x.church, y.church))
    MotifTools.save_motifs(motifs,tamoname)
def main():
    """Print 1- to 6-mer frequencies of the FASTA file named in ``sys.argv[1]``.

    For each width every possible word (from ``permute``) starts with a
    pseudocount of 1; each observed word then adds its count both to itself
    and to its reverse complement.  One header line and one
    ``word frequency`` line per word are printed for every width.
    """
    sequences = Fasta.load(sys.argv[1]).values()
    for width in range(1, 7):
        every_word = permute(width)
        observed = MotifTools.top_nmers(width, sequences, 'with counts', 'purge Ns')

        tallies = {}
        grand_total = 0
        for word in every_word:
            tallies[word] = 1  # pseudocount
            grand_total += 1

        for word, hits in observed:
            try:
                partner = MotifTools.revcomplement(word)
                tallies[word] += hits
                tallies[partner] += hits
                grand_total += 2 * hits
            except KeyError:
                # Words that are not in the enumerated alphabet are ignored.
                pass

        print("# freq in %s (total %d with pseudocounts)" % (sys.argv[1], grand_total))
        for word in sorted(tallies):
            print("%-7s %20.17f" % (word, float(tallies[word]) / grand_total))
        sys.stdout.flush()
def loadMiRNAs(miRNA_Path):
    """Load a FASTA file of mature miRNAs and return what ``Fasta.load`` gives.

    The docstring of the original states the result is a dict.
    """
    mature_mirnas = Fasta.load(miRNA_Path)
    return mature_mirnas
def info2seeds(N, infofile, probefile, species='YEAST'):
    """Build seed motifs from the most enriched words of an .info file.

    N         -- word width; when false the sequences of ``infofile`` are
                 used as the candidate words directly.
    infofile  -- file read with ``Fasta.seqs``.
    probefile -- file whose ids define the bound set for the enrichment test.
    species   -- name passed to ``ProbeSet``.

    Returns a list of at most 20 ``MotifTools.Motif`` objects, built from the
    candidate words with the smallest p-values.  The 40 best ``(word, p)``
    pairs are printed.

    Fix: the progress message used ``% len(nmers, infofile)``, which raised
    TypeError before any scoring happened.
    """
    G = ProbeSet(species)
    IDs = G.ids_from_file(probefile)
    Q = EM.theMarkovBackground.zeroth()
    seqs = Fasta.seqs(infofile)
    if not N:
        nmers = seqs
    else:
        nmers = MotifTools.top_nmers(N, seqs)
        # NOTE(review): nesting reconstructed from flattened source; the
        # 1000-word cap is assumed to apply to top_nmers output only.
        if len(nmers) > 1000:
            nmers = nmers[0:1000]
    print("Scoring enrichment of %d nmers from %s" % (len(nmers), infofile))
    sys.stdout.flush()

    nmers_scoresT = []
    for nmer in nmers:
        if nmer.isalpha():
            p = G.p_value(nmer, IDs, '')  # 'verbose'
            nmers_scoresT.append((nmer, p))
    # Stable sort, smallest p-value first.
    nmers_scoresT.sort(key=lambda pair: pair[1])

    models = []
    for seq, _p in nmers_scoresT[0:20]:
        m = MotifTools.Motif('', Q)
        m.compute_from_text(seq, 0.1)
        models.append(m)
    for tup in nmers_scoresT[0:40]:
        print(tup)
    return models
def main(fastafile, outDirectory):
    """Write 1- to 6-mer frequencies of ``fastafile`` to ``<outDirectory>/<name>.freq``.

    For each width every possible word (from ``permute``) starts with a
    pseudocount of 1; each observed word then adds its count to itself and to
    its reverse complement.  The output holds, per width, one header line and
    one ``word frequency`` line per word.

    Fix: the output file is now closed (and therefore flushed) explicitly; the
    original left the handle open.
    """
    # !! 1/2/09 AD added 'fastafile' var and changed 'if __name__' as way to call this from script.
    seqsD = Fasta.load(fastafile)
    seqs = seqsD.values()
    baseName = fastafile.split('/')[-1]
    output = []
    for w in range(1, 7):
        allnmers = permute(w)
        nmersT = MotifTools.top_nmers(w, seqs, 'with counts', 'purge Ns')
        nmersD = {}
        total = 0
        for nmer in allnmers:
            nmersD[nmer] = 1  # Pseudo count
            total = total + 1
        for nmer, count in nmersT:
            try:
                rc = MotifTools.revcomplement(nmer)
                nmersD[nmer] = nmersD[nmer] + count
                nmersD[rc] = nmersD[rc] + count
                total = total + 2 * count
            except KeyError:
                # Words outside the enumerated alphabet are ignored.
                pass
        # AD 02-27-09 added a '\n' to make file look right
        output.append("# freq in %s (total %d with pseudocounts)\n" % (baseName, total))
        for nmer in sorted(nmersD):
            output.append("%-7s %20.17f\n" % (nmer, float(nmersD[nmer]) / total))

    # open output file and write out results
    outFile = open('%s/%s.freq' % (outDirectory, baseName), 'w')
    try:
        outFile.writelines(output)
    finally:
        outFile.close()
def main():
    """Print 1- to 6-mer frequencies of the FASTA file named in ``sys.argv[1]``.

    Every possible word of each width (from ``permute``) gets a pseudocount
    of 1.  Observed words add their count to themselves and to their reverse
    complement.  Output per width: a header line, then ``word frequency``.
    """
    fasta_records = Fasta.load(sys.argv[1])
    sequences = fasta_records.values()
    for size in range(1, 7):
        vocabulary = permute(size)
        counted = MotifTools.top_nmers(size, sequences, 'with counts', 'purge Ns')

        freq = {}
        denominator = 0
        for token in vocabulary:
            freq[token] = 1  # pseudocount
            denominator = denominator + 1

        for token, seen in counted:
            try:
                mirror = MotifTools.revcomplement(token)
                freq[token] = freq[token] + seen
                freq[mirror] = freq[mirror] + seen
                denominator = denominator + 2 * seen
            except KeyError:
                # token (or its reverse complement) is not an enumerated word
                pass

        ordered = sorted(freq.keys())
        print("# freq in %s (total %d with pseudocounts)" % (sys.argv[1], denominator))
        for token in ordered:
            print("%-7s %20.17f" % (token, float(freq[token]) / denominator))
        sys.stdout.flush()
def calcStats(fastaPath):
    """Return nucleotide composition statistics for a FASTA file.

    All sequences are concatenated and upper-cased before counting.  The
    result is a dict with the keys:

    seqLen    -- number of sequences in the file (not a length in bases)
    totNucs   -- total number of characters
    aCnt, cCnt, gCnt, tCnt, nCnt -- counts of A, C, G, T and N
    nonNs     -- aCnt + cCnt + gCnt + tCnt
    n2tot     -- nCnt / totNucs
    n2nonN    -- nCnt / nonNs
    percentGC -- (gCnt + cCnt) / nonNs, a fraction rather than a percentage

    Raises ZeroDivisionError when the file holds no characters, or no
    A/C/G/T characters.

    Change: sequences are joined in one pass instead of repeated ``+=``,
    which was quadratic in the total length for large genomes.
    """
    seqFile = Fasta.load(fastaPath)
    combinedSeq = ''.join([seqFile[name] for name in seqFile]).upper()

    seqs = len(seqFile)
    totNucs = len(combinedSeq)
    aCnt = combinedSeq.count('A')
    cCnt = combinedSeq.count('C')
    gCnt = combinedSeq.count('G')
    tCnt = combinedSeq.count('T')
    nCnt = combinedSeq.count('N')
    nonNs = aCnt + cCnt + gCnt + tCnt

    n2tot = float(nCnt) / totNucs
    n2nonN = float(nCnt) / nonNs
    percentGC = (float(gCnt) + cCnt) / nonNs

    return {'seqLen': seqs,
            'totNucs': totNucs,
            'aCnt': aCnt,
            'cCnt': cCnt,
            'gCnt': gCnt,
            'tCnt': tCnt,
            'nCnt': nCnt,
            'nonNs': nonNs,
            'n2tot': n2tot,
            'n2nonN': n2nonN,
            'percentGC': percentGC}
def orf2pseq(orf):
    """Return the protein sequence stored for ``orf``, or '' when unknown.

    The sequences are loaded from ``_ORFPSEQS`` on first use and cached in the
    module global ``_orfpseqs``; a trailing '*' is removed from each sequence
    at load time.

    Fix: ``pseq[-1]`` raised IndexError for an empty sequence record;
    ``endswith`` handles that case.
    """
    global _orfpseqs
    if not _orfpseqs:
        from TAMO.seq import Fasta
        _orfpseqs = Fasta.load(_ORFPSEQS)
        for _orf, pseq in list(_orfpseqs.items()):
            if pseq.endswith('*'):
                _orfpseqs[_orf] = pseq[:-1]
    return _orfpseqs.get(orf, '')
def orf2pseq(orf):
    """Return the protein sequence stored for ``orf``, or '' when unknown.

    The sequences are loaded from ``_ORFPSEQS`` on first use and cached in the
    module global ``_orfpseqs``; a trailing '*' is removed from each sequence
    at load time.

    Fix: ``pseq[-1]`` raised IndexError for an empty sequence record;
    ``endswith`` handles that case.
    """
    global _orfpseqs
    if not _orfpseqs:
        from TAMO.seq import Fasta
        _orfpseqs = Fasta.load(_ORFPSEQS)
        for _orf, pseq in list(_orfpseqs.items()):
            if pseq.endswith('*'):
                _orfpseqs[_orf] = pseq[:-1]
    return _orfpseqs.get(orf, '')
def train_final(self, model, fg, bg, N, beta):
    """Train the final motif and SVM classifier on the full data set.

    model -- key into ``self.models`` (also passed to ``train_model``)
    fg    -- foreground probe names
    bg    -- background probe names
    N     -- passed twice to ``self.SMOTE`` for over-sampling the positives
    beta  -- passed to ``self.train_model`` when ``self.refine`` is set

    Returns ``(final_motif, best_classifier, fn)`` where ``fn`` is the third
    value reported by ``SVM_test`` on the un-oversampled positives.  When
    ``self.dump`` is set, the foreground probes are also written to
    ``<stem>.pos.fsa`` / ``<stem>.neg.fsa`` according to the classifier.
    """
    # Foreground sequences, upper-cased, with ';' and 'N' characters removed.
    cleaned = []
    for probe_id in fg:
        text = self.all_probes[probe_id].upper()
        text = text.replace(";", "").replace("N", "")
        if text:
            cleaned.append(text)

    if self.refine:
        final_motif = self.train_model(model, cleaned, beta)
    else:
        final_motif = self.models[model]

    positives = self.get_LLRs(final_motif, fg)
    negatives = self.get_LLRs(final_motif, bg)
    oversampled = self.SMOTE([positives], N, N)[0]

    # Train SVM to classify our training set: keep the first C value that
    # gives the strictly lowest training error.
    best_classifier = None
    lowest_error = 1.0
    for cost in (1.0e-10, 1.0e-4, 1.0e-3, 1.0e-2, 0.05, 0.1, 1.0, 10.0, 100.0):
        candidate = self.SVM_train(oversampled, negatives, cost)
        error = self.SVM_test(candidate, oversampled, negatives)
        if error < lowest_error:
            lowest_error = error
            best_classifier = candidate

    _err, _fp, fn = self.SVM_test(best_classifier, positives, [], 1)

    if self.dump:
        with_motif = {}
        without_motif = {}
        for name, val in zip(fg, positives):
            if self.SVM_test(best_classifier, [val], []):
                without_motif[name] = self.all_probes[name]
            else:
                with_motif[name] = self.all_probes[name]
        stem = self.motif_file.split('.')[0]
        Fasta.write(with_motif, stem + '.pos.fsa')
        Fasta.write(without_motif, stem + '.neg.fsa')

    return (final_motif, best_classifier, fn)
def loadSeqs(fastaPathList):
    """Merge the sequences of several FASTA files into one dict.

    Takes a list of paths.  Records from later files overwrite records with
    the same name from earlier ones.  The merged dict is passed through
    ``bioDefs.softMaskDict2HardMask`` (soft masking -> hard masking) before
    it is returned.
    """
    merged = {}
    for fasta_path in fastaPathList:
        records = Fasta.load(fasta_path)
        merged.update(records)
    bioDefs.softMaskDict2HardMask(merged)
    return merged
def get_seq(chr, start=None, stop=None):
    """Return a stretch of yeast genomic sequence.

    chr   -- chromosome as 'chr4', a bare id (4, 'X') or a full range string
             such as 'chr4:454-465'
    start -- 1-based first position (inclusive); defaults to 1
    stop  -- 1-based last position (inclusive); defaults to the chromosome end

    The genome is loaded from ``SGDdir + 'NCBI_yeast_genome.fsa'`` on first
    use and cached in the module global ``ChrD``.

    Fix: ``stop`` was ignored and the slice used a variable that only existed
    after parsing a 'chrN:a-b' string, so ``get_seq('chr4', 454, 465)``
    raised an unbound-variable error.
    """
    global ChrD
    if not ChrD:
        from TAMO.seq import Fasta
        ChrD = Fasta.load(SGDdir + 'NCBI_yeast_genome.fsa')
    if (type(chr) != type('')) or (chr.find('chr') != 0):  # 1 -> chr1, 'X' -> chrX
        chr = 'chr%s' % chr
    end = stop
    if (start is None) and chr.find(':') > 0:  # chr4:454-465 -> chr4, 454, 465
        chr, _range = chr.split(':')
        start, end = _range.split('-')
        start, end = int(start), int(end)
    if start is None:
        start = 1
    # end may be None, in which case the slice runs to the chromosome end.
    return ChrD[chr][start - 1:end]
def get_seq(chr, start=None, stop=None):
    """Return a stretch of yeast genomic sequence.

    chr   -- chromosome as 'chr4', a bare id (4, 'X') or a full range string
             such as 'chr4:454-465'
    start -- 1-based first position (inclusive); defaults to 1
    stop  -- 1-based last position (inclusive); defaults to the chromosome end

    The genome is loaded from ``SGDdir + 'NCBI_yeast_genome.fsa'`` on first
    use and cached in the module global ``ChrD``.

    Fix: ``stop`` was ignored and the slice used a variable that only existed
    after parsing a 'chrN:a-b' string, so ``get_seq('chr4', 454, 465)``
    raised an unbound-variable error.
    """
    global ChrD
    if not ChrD:
        from TAMO.seq import Fasta
        ChrD = Fasta.load(SGDdir + 'NCBI_yeast_genome.fsa')
    if (type(chr) != type('')) or (chr.find('chr') != 0):  # 1 -> chr1, 'X' -> chrX
        chr = 'chr%s' % chr
    end = stop
    if (start is None) and chr.find(':') > 0:  # chr4:454-465 -> chr4, 454, 465
        chr, _range = chr.split(':')
        start, end = _range.split('-')
        start, end = int(start), int(end)
    if start is None:
        start = 1
    # end may be None, in which case the slice runs to the chromosome end.
    return ChrD[chr][start - 1:end]
def swp2swp(swp):
    """Convert, when possible, between P014543-style and ADR1_YEAST-style names.

    Only works for yeast right now.  The two-way table is filled from the
    header lines of ``_SWPFASTA`` on first use and kept in the module global
    ``_swp2swp``.  Returns None when ``swp`` is not in the table.
    """
    global _swp2swp
    if not _swp2swp:
        def _whole_header(header):
            return header

        for header in Fasta.keys(_SWPFASTA, key_func=_whole_header):
            fields = header.split()
            text_name, numeric_name = fields[1], fields[2]
            # Only 'SW...' entries paired with a 'P...' accession are indexed.
            if text_name[0:2] != 'SW' or numeric_name[0] != 'P':
                continue
            short_name = text_name[3:]
            _swp2swp[short_name] = numeric_name
            _swp2swp[numeric_name] = short_name
    if swp in _swp2swp:
        return _swp2swp[swp]
    return None
def swp2swp(swp):
    """Translate a SwissProt name in either direction (P014543 <-> ADR1_YEAST).

    Only works for yeast right now.  On the first call the header lines of
    ``_SWPFASTA`` are read and both directions are stored in the module
    global ``_swp2swp``.  Unknown names give None.
    """
    global _swp2swp
    if not _swp2swp:
        headers = Fasta.keys(_SWPFASTA, key_func=lambda raw: raw)
        for header in headers:
            pieces = header.split()
            label = pieces[1]
            accession = pieces[2]
            is_swissprot = label[0:2] == 'SW'
            if is_swissprot and accession[0] == 'P':
                # label looks like 'SW:NAME' style text; drop the 3-char prefix
                _swp2swp[label[3:]] = accession
                _swp2swp[accession] = label[3:]
    if swp not in _swp2swp:
        return None
    return _swp2swp[swp]
def go(self):
    """Execution function: runs TAMO.MD.Meme.Meme and catches the output in
    self.output for access from MDAP.

    The co-regulated sequences (``self.coRegSeqs[0]``) are written to a
    temporary FASTA file in the current directory, which is handed to
    ``Meme`` together with ``self.extra_args`` and ``self.bfile``.

    Fixes: the temporary file is closed (and thereby flushed to disk) before
    MEME reads it, and it is removed even when MEME raises.
    """
    import time
    # write a temp fasta file of coregulated seqs to use as input to Meme(file=TempFasta)
    ctimeStr = time.ctime().replace(' ', '_')
    fileName = 'tempFastaOfCoRegSeqs.MDAP.%s.fas' % (ctimeStr)
    try:
        tFasta = open(fileName, 'w')
        try:
            tFasta.write(Fasta.text(self.coRegSeqs[0]))
        finally:
            tFasta.close()
        # Call TAMO to do its thing:
        self.output = Meme(file=fileName, width='', extra_args=self.extra_args, bfile=self.bfile)
    finally:
        # delete temp file (it may not exist if open() itself failed)
        if os.path.exists(fileName):
            os.remove(fileName)
def LoadDNA(verbose=False):
    """Read the DNA sub-sequence described by the ``DNA_section`` parameters.

    Reads FILE, CHR, START and END (or LENGTH, added to START, when END is
    not set) from ``params``, loads the FASTA file and returns
    ``sequence[START:END]`` of the requested chromosome.  Returns '' when no
    FILE is configured or the chromosome is not in the file.

    verbose -- print progress information and the extracted DNA.

    Fix: the "Cannot find" message referenced an undefined name
    (``filename``) and raised NameError instead of reporting the problem.
    """
    dna = ""
    fastafile = params.GetString(DNA_section, "FILE")
    if fastafile:
        chromo = params.GetString(DNA_section, "CHR")
        chr_start = params.GetInt(DNA_section, "START")
        chr_end = params.GetInt(DNA_section, "END")
        if not chr_end:
            # No explicit END: derive it from START + LENGTH.
            chr_end = params.GetInt(DNA_section, "LENGTH")
            chr_end += chr_start
        if verbose:
            print("Loading fasta: [%s]\n" % fastafile)
        seqs = Fasta.load(fastafile)
        seqkeys = sorted(seqs.keys())
        n = 0
        for name in seqkeys:
            n += len(seqs[name])
        if verbose:
            print("Genome length = %d, # chromosomes = %d\n" % (n, len(seqkeys)))
        if chromo in seqs:
            seq = seqs[chromo]
            if verbose:
                print("Chr[%s] = %d nt\n" % (chromo, len(seq)))
            dna = seq[chr_start:chr_end]
            if verbose:
                print("DNA[%d:%d] = %d nt\n" % (chr_start, chr_end, len(dna)))
        else:
            if verbose:
                print("Cannot find [%s] chromosome in %s\n" % (chromo, fastafile))
    if verbose:
        print("DNA:[%s]\n" % dna)
    return dna
def spawnOrthoGroups(promoterFileList, nWayOrthoList):
    """Takes promoterFileList<listOfPaths> and nWayOrthoList<listOfLists> and
    spawns the orthoGroup objects in a dictionary with keys =
    'geneName1:geneName2:etc' that will be used to run the combined
    hypergeometric analysis.

    Groups with a member that has no (or an empty) promoter sequence are
    left out.  Each inner list of ``nWayOrthoList`` that is kept is sorted in
    place, as before.

    Raises AssertionError for a non-list argument, a non-string first path,
    or a gene name that occurs in more than one promoter file.

    Fixes:
    * a gene missing from the promoter files raised KeyError instead of
      causing its group to be skipped;
    * an incomplete group used ``break``, silently dropping every group
      after it; it is now skipped with ``continue``;
    * the duplicate check rebuilt the key list for every gene (quadratic).
    """
    # validation
    assert type(promoterFileList) == type([]), \
        '''promoterFileList must be a list of file paths. You provided type: "%s"'''\
        % (type(promoterFileList))
    assert not promoterFileList or type(promoterFileList[0]) == type(''), \
        '''promoterFileList must be a list of file paths. promoterFileList[0] != type(''): "%s"'''\
        % (type(promoterFileList[0]))

    # load promoters
    allPromoters = {}
    for path in promoterFileList:
        oneGenome = Fasta.file2dict(path)
        for gene in oneGenome:
            assert gene not in allPromoters, \
                '''Detected duplicate gene name in promoterFileList! "%s"'''\
                % (gene)
            allPromoters[gene] = oneGenome[gene]

    # Build Groups
    orthoGroups = {}
    for group in nWayOrthoList:
        groupDict = {}
        for gene in group:
            promoter = allPromoters.get(gene)
            if not promoter:
                break  # we do not want orthoGroups that are missing members
            groupDict[gene] = promoter
        if len(groupDict) != len(group):
            continue  # skip only this group; later groups are still built
        group.sort()
        orthoGroups[':'.join(group)] = OrthoGroup(groupDict)
    return orthoGroups
def main():
    """Command-line driver: run metaMDscan on a FASTA file and print the motifs.

    Options: ``-f <fasta>``, ``--genome <name>``, ``--top <n>``,
    ``--pcnt <percent>``, ``--range <start,stop>``.  ``--bgfile`` is accepted
    by the parser but its value is not used here.  Each motif found is given
    an enrichment p-value and a "church" score from the genome probe set,
    and the motifs are printed sorted by p-value.
    """
    try:
        opts, args = getopt.getopt(
            sys.argv[1:], 'f:',
            ['genome=', 'range=', 'top=', 'pcnt=', 'bgfile='])
    except getopt.GetoptError:
        print(getopt.GetoptError.__dict__)
        usage()
    if not opts:
        usage()

    # Defaults
    fastafile = ''
    top_count = 10
    top_pcnt = None
    genome = 'YEAST'
    w_start = 8
    w_stop = 15
    bgfile = MDSCAN_DIR + 'yeast_int.bg'

    for flag, text in opts:
        if flag == '-f':
            fastafile = text
        elif flag == '--genome':
            genome = text
        elif flag == '--top':
            top_count = int(text)
        elif flag == '--pcnt':
            top_pcnt = float(text)
        elif flag == '--range':
            w_start, w_stop = [int(part) for part in text.split(',')]

    print("#" + ' '.join(sys.argv))

    probeids = Fasta.keys(fastafile)
    Genome = MotifMetrics.ProbeSet(genome)
    probeids = Genome.filter(probeids)

    if top_pcnt:
        # A percentage can only raise the number of top sequences, never lower it.
        from_percent = int(top_pcnt / 100.0 * len(probeids))
        top_count = max(top_count, from_percent)

    theMeta = metaMDscan(fastafile, w_start, w_stop, top_count)
    for found in theMeta.motifs:
        found.pvalue = Genome.p_value(found, probeids, 'v')
        found.church = Genome.church(found, probeids, 'v')
        sys.stdout.flush()

    theMeta.motifs.sort(key=lambda found: found.pvalue)
    print_motifs(theMeta.motifs)
def LoadDNA():
    """Read the DNA sub-sequence described by the ``DNA_section`` parameters.

    Reads FILE, CHR, START and END (or LENGTH, added to START, when END is
    not set) from ``params``, loads the FASTA file, prints progress
    information and returns ``sequence[START:END]`` of the requested
    chromosome.  Returns '' when no FILE is configured or the chromosome is
    not in the file.

    Fix: the "Cannot find" message referenced an undefined name
    (``filename``) and raised NameError instead of reporting the problem.
    """
    dna = ""
    fastafile = params.GetString(DNA_section, "FILE")
    if fastafile:
        chromo = params.GetString(DNA_section, "CHR")
        chr_start = params.GetInt(DNA_section, "START")
        chr_end = params.GetInt(DNA_section, "END")
        if not chr_end:
            # No explicit END: derive it from START + LENGTH.
            chr_end = params.GetInt(DNA_section, "LENGTH")
            chr_end += chr_start
        print("Loading fasta: [%s]\n" % fastafile)
        seqs = Fasta.load(fastafile)
        seqkeys = sorted(seqs.keys())
        n = 0
        for name in seqkeys:
            n += len(seqs[name])
        print("Genome length = %d, # chromosomes = %d\n" % (n, len(seqkeys)))
        if chromo in seqs:
            seq = seqs[chromo]
            print("Chr[%s] = %d nt\n" % (chromo, len(seq)))
            dna = seq[chr_start:chr_end]
            print("DNA[%d:%d] = %d nt\n" % (chr_start, chr_end, len(dna)))
        else:
            print("Cannot find [%s] chromosome in %s\n" % (chromo, fastafile))
    print("DNA:[%s]\n" % dna)
    return dna
def swp_find_and_format(swp):
    """Find one SwissProt record by name and return its sequence as text.

    ``swp`` is searched for in the first 60 characters of every header of
    ``_SWPFASTA`` (loaded on first use into the module global
    ``_swp_seqs``).  With exactly one match the sequence is returned wrapped
    at 70 characters per line, each line ending in a newline.  With no match,
    or with several (which are listed on stdout), None is returned.
    """
    global _swp_seqs
    if not _swp_seqs:
        _swp_seqs = Fasta.load(_SWPFASTA, key_func=lambda header: header)

    matches = [header for header in _swp_seqs.keys() if swp in header[0:60]]
    if not matches:
        return None
    if len(matches) > 1:
        print("# Multiple matches found for %s:" % swp)
        for header in matches:
            print('# %s' % (header,))
        return None

    residues = _swp_seqs[matches[0]]
    wrapped = [residues[pos:pos + 70] + '\n' for pos in range(0, len(residues), 70)]
    return ''.join(wrapped)
def go(self):
    """Execution function: coordinates options used and background GC
    calculation, then runs TAMO.MD.AlignAce.MetaAce and catches the output in
    self.output for access from MDAP.  Output is TAMO.AligAce result object.

    When ``self.mdapOptions['background'] == 1`` the GC fraction of
    ``self.mdapArgs[0]`` is computed and stored in ``self.gcback``;
    otherwise ``self.gcback`` must already be set.

    Fix: the temporary FASTA file is closed (and thereby flushed to disk)
    before MetaAce reads it.  The file is left in place afterwards, as
    before.
    """
    import time
    # Calc GC background of genomic sequences representing the
    # entire data set if requested.
    if self.mdapOptions['background'] == 1:
        self.dataStats = seqStats.calcStats(self.mdapArgs[0])
        self.gcback = self.dataStats['percentGC']

    # write a temp fasta file of coregulated seqs to use as input to MetaAce
    ctimeStr = time.ctime().replace(' ', '_')
    fileName = 'tempFastaOfCoRegSeqs.MDAP.%s.fas' % (ctimeStr)
    tFasta = open(fileName, 'w')
    try:
        tFasta.write(Fasta.text(self.coRegSeqs[0]))
    finally:
        tFasta.close()

    # call TAMO to do its thing
    self.output = MetaAce(fileName, self.width, self.iterations, self.gcback)
def swp_find_and_format(swp):
    """Look up a single SwissProt entry and format its sequence in 70-column lines.

    The records of ``_SWPFASTA`` are loaded once into the module global
    ``_swp_seqs`` with the full header as key.  A record matches when ``swp``
    occurs within the first 60 characters of its header.  Returns the wrapped
    sequence text for a unique match; returns None for no match or for an
    ambiguous one (the candidates are printed).
    """
    global _swp_seqs
    if not _swp_seqs:
        def _keep_header(header):
            return header
        _swp_seqs = Fasta.load(_SWPFASTA, key_func=_keep_header)

    candidates = []
    for header in _swp_seqs.keys():
        if header[0:60].find(swp) < 0:
            continue
        candidates.append(header)

    if len(candidates) == 0:
        return None
    if len(candidates) > 1:
        print("# Multiple matches found for %s:" % swp)
        for header in candidates:
            print('# %s' % (header,))
        return None

    sequence = _swp_seqs[candidates[0]]
    pieces = []
    offset = 0
    while offset < len(sequence):
        pieces.append(sequence[offset:offset + 70] + '\n')
        offset += 70
    return ''.join(pieces)
def __init__(self, fastaSeqs, motifDict, thresh=0.5, window=200):
    """Build one ``SeqMap`` per sequence and store them in ``self.seqMaps``.

    fastaSeqs -- path of a FASTA file, or a dict of name -> sequence
    motifDict -- passed through to every ``SeqMap``
    thresh, window -- passed through to every ``SeqMap``

    Asserts that ``fastaSeqs`` is a str or a dict and that no more than 250
    sequences are processed.  The time taken and the running sequence number
    are printed for every sequence.
    """
    self.seqMaps = {}

    # Get seqs from fasta
    given = type(fastaSeqs)
    assert given == type('string') or given == type({}),\
        'MapLib arg(fastaSeqs) must be string pointing to file or a seqDict.'
    if given == type('string'):
        seqs = Fasta.load(fastaSeqs)
    elif given == type({}):
        seqs = fastaSeqs

    # Instantiate a SeqMap obj for each seq in seqs
    for position, name in enumerate(seqs):
        ordinal = position + 1
        assert ordinal <= 250
        started = time()
        self.seqMaps[name] = SeqMap(name, seqs[name], motifDict,
                                    thresh=thresh, window=window)
        finished = time()
        print('%.4f\t%s' % (finished - started, ordinal))
def main():
    """Command-line driver: run metaMDscan on a FASTA file and print the motifs.

    Recognised options are ``-f``, ``--genome``, ``--top``, ``--pcnt`` and
    ``--range start,stop``; ``--bgfile`` is parsed but its value is not
    consulted.  Every motif is scored (p-value and "church") against the
    filtered probe ids and the list is printed in order of p-value.
    """
    short_opts = 'f:'
    long_opts = ['genome=', 'range=', 'top=', 'pcnt=', 'bgfile=']
    try:
        opts, args = getopt.getopt(sys.argv[1:], short_opts, long_opts)
    except getopt.GetoptError:
        print(getopt.GetoptError.__dict__)
        usage()
    if not opts:
        usage()

    settings = {
        'fasta': '',
        'top': 10,
        'pcnt': None,
        'genome': 'YEAST',
        'widths': (8, 15),
    }
    bgfile = MDSCAN_DIR + 'yeast_int.bg'
    for name, raw in opts:
        if name == '-f':
            settings['fasta'] = raw
        elif name == '--genome':
            settings['genome'] = raw
        elif name == '--top':
            settings['top'] = int(raw)
        elif name == '--pcnt':
            settings['pcnt'] = float(raw)
        elif name == '--range':
            low, high = [int(piece) for piece in raw.split(',')]
            settings['widths'] = (low, high)

    print("#" + ' '.join(sys.argv))

    fastafile = settings['fasta']
    Genome_ids = Fasta.keys(fastafile)
    Genome = MotifMetrics.ProbeSet(settings['genome'])
    probeids = Genome.filter(Genome_ids)

    top_count = settings['top']
    if settings['pcnt']:
        # The percentage may only increase the number of top sequences.
        top_count = max(top_count, int(settings['pcnt'] / 100.0 * len(probeids)))

    w_start, w_stop = settings['widths']
    theMeta = metaMDscan(fastafile, w_start, w_stop, top_count)
    for candidate in theMeta.motifs:
        candidate.pvalue = Genome.p_value(candidate, probeids, 'v')
        candidate.church = Genome.church(candidate, probeids, 'v')
        sys.stdout.flush()
    theMeta.motifs.sort(key=lambda candidate: candidate.pvalue)
    print_motifs(theMeta.motifs)
def geneList2FastaDict(geneList, sourceFastaPath, hardMasked=True):
    """Returns a Dict of requested fasta recs in form SeqName:Sequence.

    Only genes that are present in ``sourceFastaPath`` with a non-empty
    sequence are returned, so the result may be shorter than ``geneList``;
    the numbers requested and found are printed.  Defaults to hard-masked
    return sequences (``softMaskDict2HardMask`` is applied to the result).

    Fix: a gene missing from the source file raised KeyError, although the
    function is documented to skip such genes.
    """
    sourceDict = Fasta.load(sourceFastaPath)

    # make new dict of all genes both in geneList AND sourceDict
    # new dict may be shorter than geneList!!!!!!
    newDict = {}
    for gene in geneList:
        sequence = sourceDict.get(gene)
        if sequence:
            newDict[gene] = sequence

    print("%s genes names given, %s found." % (len(geneList), len(newDict)))

    if hardMasked:
        softMaskDict2HardMask(newDict)
    return newDict
def __init__(self, species='YEAST', seqs=''):
    """Set up a Markov background model.

    species -- 'YEAST...' / 'HUMAN...' select the bundled Whitehead
               frequency files; a name containing '.6MBG' is used as the
               frequency file itself; a .fsa/.fasta path uses a sibling
               '.6MBG' file when one exists, otherwise an existing '.fsa'
               file is read directly.  Anything else falls back to yeast.
    seqs    -- optional sequences; when given, frequencies are computed from
               them and ``species`` only serves as a label.

    Fix: the "Unknown species" message printed a literal '%s' because the
    format argument was missing.
    """
    if seqs:
        self.sourcefile = 'Runtime (%d sequences)' % len(seqs)
    elif species.find('.6MBG') >= 0:
        self.sourcefile = species
    elif species[0:5] == 'YEAST':
        self.sourcefile = TAMO.paths.Whiteheaddir + 'Yeast6kArray/yeast.intergenic.6.freq'
        TAMO.paths.CHECK(self.sourcefile, 'Whitehead')
    elif species[0:5] == 'HUMAN':
        self.sourcefile = TAMO.paths.Whiteheaddir + 'Human13kArray/human_elongated_probbesQC250.6MBG'
        TAMO.paths.CHECK(self.sourcefile, 'Whitehead')
    elif os.path.exists(re.sub('.fsa|.fasta', '.6MBG', species)):
        self.sourcefile = re.sub('.fsa|.fasta', '.6MBG', species)
    elif os.path.exists(species) and (species.find('.fsa') >= 0):
        self.sourcefile = species
        print("EM.MarkovBackground: Computing background from %s" % species)
        sys.stdout.flush()
        # NOTE(review): this calls 'freqs_from_seqs' (with an 's') before the
        # tables below exist, while the later branch calls 'freq_from_seqs';
        # the class is not visible here, so the call is left unchanged.
        self.freqs_from_seqs(Fasta.seqs(species))
    #elif os.path.exists(species):
    #    self.sourcefile = species
    else:
        print('EM.MarkovBackground: Unknown species %s, using Yeast' % species)
        self.sourcefile = TAMO.paths.Whiteheaddir + 'Yeast6kArray/yeast.intergenic.6.freq'
        TAMO.paths.CHECK(self.sourcefile, 'Whitehead')

    self.species = species
    self.D = {}
    self.F = {}             # Frequencies
    self.CP = {}            # log2(Conditional Probabilities)  CP['ACTG'] = p( G | ACT )
    self.nmers_by_size = [[] for _size in range(0, 10)]
    self.highestorder = 0
    if seqs:
        print("EM.MarkovBackground: Computing background from %d sequences" % len(seqs))
        self.freq_from_seqs(seqs)
    else:
        self.freq_from_file()
    self.compute_conditional()
    self.totD = {}
def __init__(self, fg_file, bg_file, cv_level, markov_file):
    """Load the Markov background and the foreground/background sequences.

    fg_file     -- FASTA file of foreground sequences
    bg_file     -- FASTA file of background sequences (``self.all_probes``)
    cv_level    -- stored as ``self.cv_level``
    markov_file -- passed to ``EM.loadMarkovBackground``

    After loading, background entries that also occur in the foreground, are
    empty, or contain any of the characters S, W, M, K, R, Y are removed, and
    the foreground is randomly thinned to at most 2000 sequences.
    """
    MAX_FG = 2000

    # Fixed defaults
    self.cv_level = cv_level
    self.randomize = 0
    self.beta = 0.0
    self.delta = 0.001
    self.refine = 1
    self.motif_file = 'dummy.out'
    self.dump = 0
    self.family = ''
    self.datafiles = (fg_file, bg_file)

    # LOAD MARKOV BACKGROUND
    print("Loading Markov background file from %s" % markov_file)
    EM.loadMarkovBackground(markov_file)

    # Divide input sequences into groups according to the desired
    # cross-validation level
    print("Processing input sequences....")
    self.fg_seqs = Fasta.load(fg_file)  # load foreground sequences
    for name in list(self.fg_seqs.keys()):
        # keep only the first whitespace-separated token of each record
        self.fg_seqs[name] = self.fg_seqs[name].split()[0]
    self.all_probes = Fasta.load(bg_file)  # load background sequences
    Fasta.delN(self.fg_seqs)
    Fasta.delN(self.all_probes)

    # first delete any sequences from background that are present in foreground
    for name in list(self.fg_seqs.keys()):
        if name in self.all_probes:
            del self.all_probes[name]

    for name in list(self.all_probes.keys()):
        text = self.all_probes[name]
        if len(text) == 0 or re.search('[SWMKRY]', text):
            del self.all_probes[name]
            print("deleting %s" % name)

    # Randomly drop foreground sequences until at most MAX_FG remain.
    while len(self.fg_seqs.keys()) > MAX_FG:
        names = list(self.fg_seqs.keys())
        victim = names[random.randint(0, (len(self.fg_seqs.keys()) - 1))]
        del self.fg_seqs[victim]
from TAMO.seq import Fasta
from gusPyCode.defs import bioDefs

# Script: derive 7-mer seed oligos (positions 1-7 and 2-8) from mature miRNAs.
#
# Fix: the original check `assert oligoType == 'match' or 'control'` was always
# true (a non-empty string is truthy), so a mistyped oligoType silently
# produced control seeds.

miRNAFile = '/Users/biggus/Documents/James/Data/Tu_miRNA/miRNAs/miRBase/mature.aga.fa'
seedFile = '/Users/biggus/Documents/James/Data/Tu_miRNA/miRNAs/miRBase/mature.aga.seeds.ctrl.fa'
oligoType = 'control'  # 'match' or 'control'
assert oligoType in ('match', 'control'), 'oligoType MUST be only "match" or "control".'

# Load miRNA fastas into dict.
miRNAs = Fasta.file2dict(miRNAFile)

# Create new dict for seeds.
seeds = {}

# 1) Cycle through miRNA dict taking 7mers starting at pos 1
#    and then pos2.  Adapt key to reflect which.
# 2) Convert to all uppers and convert U's to T's
# 3) If oligoType == 'match', rvcmp each 7mer and adapt key
#    to reflect which.
for miRNA in miRNAs:
    pos1_seed = miRNAs[miRNA][:7].upper().replace('U', 'T')
    pos2_seed = miRNAs[miRNA][1:8].upper().replace('U', 'T')
    if oligoType == 'match':
        seeds[miRNA + '_match_pos1'] = bioDefs.revComp(pos1_seed)
        seeds[miRNA + '_match_pos2'] = bioDefs.revComp(pos2_seed)
    else:
        seeds[miRNA + '_ctrl_pos1'] = pos1_seed
        seeds[miRNA + '_ctrl_pos2'] = pos2_seed
def freq_from_fasta(self, fastafile):
    """Compute the frequencies from the sequences in ``fastafile``.

    Loads the file with ``Fasta.load`` and hands the sequences to
    ``self.freq_from_seqs``.

    Fix: the ``fastafile`` argument was ignored and ``sys.argv[1]`` was
    loaded instead.
    """
    seqsD = Fasta.load(fastafile)
    seqs = seqsD.values()
    self.freq_from_seqs(seqs)
from TAMO.MotifTools import top_nmers,Motif
from TAMO import MotifTools
from TAMO.seq import Fasta
from gusPyCode.defs.bioDefs import ifKmerInAll

# Script: collect the k-mers (k = kmerSize) that ifKmerInAll accepts for the
# full set of sequences at the given score threshold.
#
# NOTE(review): the source of this script was flattened onto one line; the
# nesting below is a reconstruction.  In particular `print count` may
# originally have been inside the loop (as a progress counter) -- confirm.
seqFile = '/Users/biggus/Documents/James/Collaborations/Campbell/data/mainTwoGenes.fas'
outFile = '/Users/biggus/Documents/James/Collaborations/Campbell/data/mainTwoGenes.8mersInAll.txt'
kmerSize = 8
scoreThresh = 0.999999
seqs = Fasta.file2dict(seqFile)
# create new dict to store the seqs' kmers
seqsKmers = {}
for i in seqs:
    seqsKmers[i] = top_nmers(kmerSize,[seqs[i]], purge_Ns = 1) # for some reason top_nmers fails silently if given str instead of list
# k-mers accepted by ifKmerInAll, each recorded once, in order of discovery
inAllSeqs = []
count = 0
for seq in seqsKmers:
    for kmer in seqsKmers[seq]:
        if ifKmerInAll(kmer,seqs,scoreThresh):
            if kmer not in inAllSeqs:
                inAllSeqs.append(kmer)
                count+=1
print count
# The path variable is rebound to the open file handle; nothing is written to
# it within the visible part of the script.
outFile = open(outFile, 'w')
def main():
    """Command-line driver: draw a text map of motif hits on each probe.

    Options:
      -f <fasta>    sequences to scan (required)
      -m <file>     motifs in text form (may be repeated)
      -n <a,b,..>   indices of the motifs to use (default: all)
      -L <chars>    one label character per selected motif
      -t <float>    score threshold as a fraction of each motif's maximum
      -a <a,b,..>   ambiguity-code strings to add as motifs
      -S <float>    map scale, characters per base

    For every probe with at least one hit a line is printed holding the
    probe name, the map and a description of each hit.

    Fixes:
    * while scanning, the threshold was computed from ``motif.maxscore``,
      i.e. from whichever motif the preceding print loop ended on, rather
      than from the motif actually being scanned;
    * a missing ``-f`` now shows the usage text instead of failing inside
      the FASTA loader.

    NOTE(review): the source was flattened; indentation is reconstructed.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], "f:m:n:L:t:a:S:", ["help", "output="])
    except getopt.GetoptError:
        usage()
        sys.exit(1)
    if not opts:
        usage()
        sys.exit(1)

    print("#" + ' '.join(sys.argv))
    fastafile, motiffile, motifnums, labels, thresh = (None, None, [], None, 0.7)
    ambigs = []
    scale = 50.0 / 1000.0
    motifs = []
    for opt, value in opts:
        if opt == '-f':
            fastafile = value
        elif opt == '-m':
            motifs.extend(MotifTools.txt2motifs(value))
        elif opt == '-n':
            motifnums = [int(x) for x in value.split(',')]
        elif opt == '-L':
            labels = list(value)
        elif opt == '-t':
            thresh = float(value)
        elif opt == '-a':
            ambigs.extend(value.split(','))
        elif opt == '-S':
            scale = float(value)

    if not fastafile:
        usage()
        sys.exit(1)

    probes = Fasta.load(fastafile)
    # motiffile is never set by the option loop above; kept for compatibility.
    if motiffile:
        motifs.extend(TAMO.tamofile2motifs(motiffile))
    for ambig in ambigs:
        motifs.append(MotifTools.Motif_from_text(ambig, 0.1))
    if not motifnums:
        motifnums = list(range(len(motifs)))
    print('# %d: %s' % (len(motifs), motifnums))

    # Legend: label, motif, absolute and relative threshold.
    for i in range(len(motifnums)):
        motif = motifs[motifnums[i]]
        if labels and i < len(labels):
            txt = labels[i]
        else:
            txt = '%d' % i
        print('%-3s : %s %5.2f (%4.2f)' % (txt, motif, thresh * motif.maxscore, thresh))

    # Scan every probe with every selected motif.
    probehits = {}
    for key in probes.keys():
        hits_by_motif = []
        save_flag = 0
        if re.search('[BDHU]', probes[key]):
            continue
        for num in motifnums:
            scanned = motifs[num]
            result = scanned.scan(probes[key], thresh * scanned.maxscore)
            if result[0]:
                hits_by_motif.append(result)
                save_flag = 1
            else:
                hits_by_motif.append(None)
        if save_flag:
            probehits[key] = hits_by_motif

    maxw = 40
    for key in probehits.keys():
        l = len(probes[key])
        # One '-' per scaled base plus 10 blanks of slack for labels at the end.
        a = list('-' * int(scale * l))
        a.extend(list(' ' * 10))
        desc = []
        matches = probehits[key]
        for i in range(len(matches)):
            if not matches[i]:
                continue
            subseqs, endpoints, scores = matches[i]
            if labels and (i < len(labels)):
                ID = labels[i]
            else:
                ID = '%d' % i
            for idx in range(len(subseqs)):
                start, stop = endpoints[idx]
                desc.append('%s %s %d-%d %4.2f ' % (ID, subseqs[idx], start, stop, scores[idx]))
                start = int(start * scale)
                # Put the label in the first free map cell at or after the hit.
                for offset in range(10):
                    if a[start + offset] == '-':
                        a[start + offset] = ID
                        break
        # Same text the original produced with two chained print statements.
        print('%s %s %s' % ('%-14s %s' % (key, ''.join(a)),
                            ' ' * max(0, maxw - len(a)),
                            '| '.join(['%-27s' % x for x in desc])))
    print('')
    print("Found matches in %d of %d input probes" % (len(probehits), len(probes)))
def main():
    """Command-line driver for the EM motif finder.

    Usage: ``<prog> <fasta_file> [width] [options]`` (run without arguments
    for the option list).  Seeds are gathered from ``-prior``, ``-info``,
    ``-TF`` and ``-gap``, the EM is run, every candidate is scored for
    enrichment against the chosen genome, and the motifs are printed sorted
    by p-value.  The function ends with ``sys.exit(0)``.

    Fix: ``-seedbeta`` was never parsed because its branch tested for
    ``'-beta'`` a second time and was unreachable; ``theEM.seedbeta`` could
    therefore never be set from the command line.

    NOTE(review): the source was flattened; indentation is reconstructed.
    """
    global theEM

    if len(sys.argv) < 3:
        print("Usage: %s <fasta_file> [width = None ] [options]" % (re.sub(
            '^.*/', '', sys.argv[0])))
        print("Options include:")
        print(" -valid <tf_name> Check answers against Transfac")
        print(" EM Parameters:")
        print(" -beta [0.01] Beta for pseudocounts")
        print(" -seedbeta[0.02] Beta for pseudocounts for seeds from text")
        print(" -gamma [0.2] Gamma (fraction of sequences)")
        print(" -delta [0.001] Convergence criteria")
        print(" ")
        print(" Seeds (not actually proper priors)")
        print(" -prior Seqences or motifs for seeds (may be repeated)")
        print(" -top N [0] Include w-mers in top N probes")
        print(" -gap string sample gapped motifs")
        print(" -TF Seed with (all) TRANSFAC PSSMs (buggy)")
        print(" -info <file.info> for structural priors")
        print(" -pad add NN..NN to seed")
        print(" ")
        print(" Genome / Background model ")
        print(" -human (250,1000) Use Human Background model")
        print(" -Y2K, -Y5C Use Yeast Upstream Intergenic regions (2000, 500)")
        print(" -B Use Bacterial Orfs")
        print(" ")
        print("Examples:")
        print(" %s t.fsa 5 -prior GGGTA -prior AAAAAC " % (
            sys.argv[0].split('/')[-1]))
        print(" will start an EM with 3 seeds: GGGTA, AAAAA, and AAAAC")
        print('')
        print(" %s t.fsa 5 -info CUP9.info -gamma 0.5 " % (
            sys.argv[0].split('/')[-1]))
        print(" will start an EM with Enriched seeds in CUP9.info, with")
        print(" Gamma expectation of 50% of all probes")
        print('')
        print(" %s t.fsa -prior MCM1_5.tamo:0 " % (sys.argv[0].split('/')[-1]))
        print(" will start an EM with 0th motif of the file MCM1_5.tamo")
        print(" as a seed")
        print('')
        sys.exit(1)

    fastafile = sys.argv[1]

    # Echo the command line (spaces inside arguments are backslash-escaped)
    print("#" + ' '.join([re.sub(' ', '\\ ', arg) for arg in sys.argv]))

    if sys.argv[2].isdigit():
        width = sys.argv[2]
    else:
        width = None

    algorithm = ''
    beta = ''
    seedbeta = ''
    cbeta = ''
    deltamin = ''
    gamma = 0.2
    infofile = ''
    seedmodels = []
    species = 'YEAST'
    valid_tfs = []
    gapped_syl = None
    gapflank = 0
    gapweight = 0.2
    enrichfact = 0.7
    pmax = 0  # False
    TFSEEDS = 0
    TFMids = []
    pad = None
    padlen = 0
    thetas = []
    seed_count = 0  # Default: Take the top 0
    seed_s = []     # Initialize seq array
    sp_seed = 0

    # Parse command-line arguments
    for i, tok in enumerate(sys.argv):
        if tok == '-top':
            seed_count = int(sys.argv[i + 1])
        elif tok == '-greedy':
            algorithm = "GREEDY"
        elif tok == '-prior':
            seed_s.append(sys.argv[i + 1])
        elif tok == 'sp':
            # NOTE(review): no leading dash in the original; left unchanged.
            sp_seed = 1
        elif tok == '-beta':
            beta = float(sys.argv[i + 1])
        elif tok == '-seedbeta':
            seedbeta = float(sys.argv[i + 1])
        elif tok == '-cbeta':
            cbeta = float(sys.argv[i + 1])
        elif tok == '-thetas':
            # '-thetas N t1 ... tN'
            for j in range(int(sys.argv[i + 1])):
                thetas.append(float(sys.argv[i + j + 2]))
        elif tok == '-gamma':
            gamma = float(sys.argv[i + 1])
        elif tok == '-delta':
            deltamin = float(sys.argv[i + 1])
        elif tok == '-info':
            infofile = sys.argv[i + 1]
        elif tok == '-valid':
            valid_tfs.append(sys.argv[i + 1])
        elif tok == '-w':
            width = sys.argv[i + 1]
        elif tok == '-width':
            width = sys.argv[i + 1]
        elif tok == '-gap':
            gapped_syl = sys.argv[i + 1]
        elif tok == '-gapflank':
            gapflank = int(sys.argv[i + 1])
        elif tok == '-gapweight':
            gapweight = float(sys.argv[i + 1])
        elif tok == '-enrichfact':
            enrichfact = float(sys.argv[i + 1])
        elif tok == '-pmax':
            pmax = 1
        elif tok == '-Y2K':
            species = "YEAST_2000_UP"
        elif tok == '-Y5C':
            species = "YEAST_500_UP"
        elif tok == '-B':
            species = "BAC_ORF"
        elif tok == '-Ch22':
            species = "Ch22"
        elif tok == '-genome':
            species = sys.argv[i + 1]
        elif tok == '-pad':
            pad = sys.argv[i + 1]
            padlen = sys.argv[i + 2]
        elif tok == '-TF':
            TFSEEDS = 1
            # Collect the TRANSFAC ids ('M0...') that directly follow -TF.
            for j in range(i + 1, len(sys.argv)):
                if re.match('M0', sys.argv[j]):
                    TFMids.append(sys.argv[j])
                else:
                    break
        elif tok == '-human':
            if sys.argv[i + 1].isdigit():
                _s = '_' + sys.argv[i + 1]
            else:
                _s = ''
            species = 'HUMAN' + _s

    seqs = []
    fsaD = Fasta.load(fastafile)
    probes = list(fsaD.keys())
    # Disabled in the original (was inside a string literal):
    #   for probeid in fsaD.keys():
    #       seqs.append (fsaD [probeid])
    numprobes = len(probes)
    #print "numprobes: %i"%numprobes
    if not ('-random_background' in sys.argv or '-nomarkov' in sys.argv):
        EM.loadMarkovBackground(fastafile, numprobes, species)

    #seqs = EM.fasta2seqs(fastafile)
    all_seqs = seqs
    seed_s.extend(seqs[0:min(seed_count, len(seqs))])
    #not necessary --- seed_c.extend(c_seqs[0:min(seed_count,len(seqs))])

    if infofile and width == 'info':
        width = info2width(infofile)
    elif width != None:
        width = int(width)

    # Alternate source of seeds
    if infofile:
        if 1 or width:
            seedmodels.extend(info2seeds(width, infofile, fastafile, species))
        else:
            print('Error: need to specify motif width w/ .info file')

    # Any -prior pointers to motifs in other files?
    (seed_s, motifs) = parse_priors(seed_s)
    seedmodels.extend(motifs)

    # Should we get seeds from TRANSFAC?
    if TFSEEDS:
        tf = []
        D = tfmats()
        if not TFMids:
            keys = list(D.keys())
        else:
            keys = []
            for TFMid in TFMids:
                # first matrix whose key starts with the requested id
                for key in D.keys():
                    if key[0:6] == TFMid:
                        keys.append(key)
                        break
        for key in keys:
            m = D[key]
            m.seednum = int(re.sub('M0*', '', key.split()[0]))
            m.seedtxt = '%-24s %s' % (m, key)
            tf.append(m)
        tf.sort(key=lambda mat: mat.seednum)
        seedmodels.extend(tf)
        #seedmodels.append(tf[33])

    if gapped_syl:
        gapped_priors = gapped_motifs(gapped_syl)
        gapped_priors = ['N' + x + 'N' for x in gapped_priors]
        seed_s.extend(gapped_priors)

    if pad:
        print('# Padding models with NN-m-NN')
        newmodels = []
        for m in seedmodels:
            newmodels.append(m[-2, m.width + 2])
        seedmodels = newmodels

    # Set everything up and GO!!
    theEM = EM.EM(seed_s, [], [], width, "VERBOSE")
    if beta:
        theEM.beta = beta
    if cbeta:
        theEM.cbeta = cbeta
    if deltamin:
        theEM.deltamin = deltamin
    if seedbeta:
        theEM.seedbeta = seedbeta
    if thetas:
        theEM.thetas = thetas
    theEM.param['gamma'] = gamma
    theEM.probeids.extend(probes)
    theEM.seqs.extend(all_seqs)
    #theEM.cons_seqs.extend(c_seqs)
    theEM.models = seedmodels
    theEM.gapflank = gapflank
    theEM.gapweight = gapweight
    theEM.report()
    theEM.EM_Cstart()  # GO!!

    #print "#Sorting candidates"
    #sys.stdout.flush()
    #EM.candidates.sort(lambda x,y: cmp(y.MAP,x.MAP))
    #sys.exit(0)

    # Compute some metrics
    print("#Loading Genome %s" % species)
    sys.stdout.flush()
    if species == 'human':
        Genome = ProbeSet('HUMAN', enrichfact)
    else:
        Genome = ProbeSet(species, enrichfact)
    ids = Genome.ids_from_file(fastafile)
    #fsaDict = Fasta.load(fastafile)
    #probes = fsaDict.keys()
    #cons_pickle = fastafile.split('.')[0] + '.cpickle'

    for C in theEM.candidates:
        #p_cons = conservation_pvalue(C.pssm,probes,fsaDict,ConsDict,4)
        #print p_cons
        if not pmax:
            # Copy this candidate's thetas into the genome's weight table.
            w_dict = Genome.w_dict
            for key, i in zip(w_dict.keys(), range(len(C.pssm.thetas))):
                w_dict[key] = C.pssm.thetas[i]
            Genome.w_dict = w_dict
            C.pssm.pvalue = Genome.p_value(C.pssm, ids, 'verbose')
            #print "P-VAL: %f"%(Genome.p_value(C.pssm,ids,'verbose')*p_cons)
            C.pssm.church = Genome.church(C.pssm, ids)
        else:
            (p, frac) = Genome.best_p_value(C.pssm, ids)
            C.pssm.pvalue = p
            C.pssm.threshold = frac * C.pssm.maxscore
            print("Bests: %s %s" % (p, frac))
        for valid_tf in valid_tfs:
            C.pssm.valid = Validate.validate(C.pssm, valid_tf, 'Verbose',
                                             "Want Tuple")

    # Print out all motifs (sorted by Enrichment) in an AlignACE-like form
    theEM.candidates.sort(key=lambda cand: cand.pssm.pvalue)
    for i, C in enumerate(theEM.candidates):
        C.pssm.maxscore = -100  # May have side effects. Recompute when done
        if C.pssm.valid:
            _t = C.pssm.valid
            if not _t[0]:
                vstring = "(--- %8.4f %8.4f %s)" % (_t[1], _t[2], _t[3])
            else:
                vstring = "(HIT %8.4f %8.4f %s)" % (_t[1], _t[2], _t[3])
        else:
            vstring = ''
        C.pssm._maxscore()  # Recomputed

        print("Log-odds matrix for Motif %3d %s" % (i, C))
        C.pssm._print_ll()
        print("Sequence Logo")
        C.pssm._print_bits()
        flush()
        #print '# %3d matching sequences at 90%%'%len(C.pssm.bestseqs(C.pssm.maxscore * 0.9))
        flush()

        m = C.pssm
        if 'gamma' not in m.__dict__:
            m.gamma = None  # Kludge to deal w/ old shelves
        if m.seedtxt:
            print("Seed: %3d %s" % (i, m.seedtxt))
        if m.source:
            print("Source:  %s" % (m.source,))
        if m.gamma:
            print("Gamma: %7.5f" % m.gamma)
        if m.threshold:
            print("Threshold: %5.2f" % m.threshold)
        if m.thetas != []:
            tstr = "thetas:"
            for theta in m.thetas:
                tstr = tstr + " " + str(theta)
            print(tstr)
        #if C.pssm.seedtxt:
        #    print 'Seed %3d %-25s'%(i,C.pssm.seedtxt)

        if C.pssm.church != None:
            vstring = 'ch: %5.2f %s' % (math.fabs(
                math.log(C.pssm.church) / math.log(10)), vstring)
        print("Motif %3d %-25s nlog(p): %6.3f %s" % (
            i, C, -math.log(C.pssm.pvalue) / math.log(10), vstring))
        if C.pssm.threshold:
            print("Threshold: %6.3f %4.1f%%" % (
                C.pssm.threshold, 100.0 * C.pssm.threshold / C.pssm.maxscore))

        C.pssm.maxscore = -1e100  # May have side effects. Recompute when done
        for seq in C.wmers:
            print("%s %s %s" % (seq, i, C.pssm.scan(seq)[2][0]))
        C.pssm._maxscore()  # Recomputed
        print('*' * len(seq))
        print("MAP Score: %f" % C.MAP)
        sys.stdout.flush()

    sys.stdout.flush()
    sys.exit(0)  # Avoid ridiculous python cleanup times
def info2seeds(N, infofile, probefile, species='YEAST'): if species == 'human': species = 'HUMAN' G = ProbeSet(species) IDs = G.ids_from_file(probefile) Q = EM.theMarkovBackground.zeroth() seqs = [] if re.search('.info$', infofile): #I = infoana.Infofile(infofile,'DONT REMOVE QUERY') I = infoana.Infofile(infofile) print "# Loading infofile: %s" % infofile print I seqs = map(lambda x: 'NNNN%sNNNN' % x, I.bsites2seqs(50.0)) elif re.search('.fsa$', infofile): fsaDict = Fasta.load(infofile) probes = fsaDict.keys() #sequence_repository = KenzieSequences() cons_pickle = infofile.split('.')[0] + '.cpickle' try: CFH = open(cons_pickle, 'r') ConsDict = pickle.load(CFH) CFH.close() except: ConsDict = {} for probe in probes: seqs = [] cons = [] try: seq_list = G.alignments[probe] except: continue if (seq_list != []): cer_seq = seq_list[0][1] else: cer_seq = '' cer_seq = cer_seq.upper() numg = len(seq_list) - 1 for i in range(1, 4): try: seqs.append(seq_list[i][1].upper()) except: seqs.append('') cons.append([]) for position in range(len(cer_seq)): ref = cer_seq[position] for i in range(3): if (seqs[i] == ''): continue if (seqs[i][position] != ref): cons[i].append(1) else: cons[i].append(0) ConsDict[probe] = cons CFH = open(cons_pickle, 'w') pickle.dump(ConsDict, CFH) CFH.close() for probe in probes: superseq = '' try: seq_list = G.alignments[probe] except: continue for seq in seq_list: subseq = seq[1].replace('-', '') subseq = subseq.replace('.', '') seqs.append(subseq) if not N: nmers = seqs else: if (N < 11): nmers = ConvergeMotifTools.top_nmers(N, seqs) else: gaplen = N - 2 * (N / 3) gr = '' for i in range(gaplen): gr = gr + 'N' nmers = ConvergeMotifTools.top_nmers(N, seqs, 0, '', 1) gnmers = [] for nmer in nmers: gnmers.append(nmer[0:(N / 3)] + gr + nmer[(N / 3):2 * (N / 3)]) nmers = gnmers if len(nmers) > 201: nmers = nmers[0:200] print "Scoring enrichment of %d nmers from .info file" % len(nmers) nmers_scoresT = [] for nmer in nmers: if nmer[0:(N / 3)].isalpha(): p = 
G.p_value(nmer, IDs, 'verbose') #if (species=='Ciona'): ng = 2 #else: ng = 4 #p_cons = conservation_pvalue(nmer,IDs,fsaDict,ConsDict,ng) #if (p_cons<0.1): nmers_scoresT.append((nmer, p)) nmers_scoresT.sort(lambda x, y: cmp(x[1], y[1])) #for tup in nmers_scoresT: # print tup last = min(20, len(nmers_scoresT)) models = [] for i in range(last): seq = nmers_scoresT[i][0] m = ConvergeMotifTools.Motif('', Q) m.compute_from_text(seq, 0.1) models.append(m) return (models)
from TAMO.seq import Fasta fasFile = '/Users/biggus/Documents/James/Writings_Talks/Grants/09_Feb/PrelimData_Grant_Feb09/Clus2_247genes.fas' oFile1= '/Users/biggus/Documents/James/Writings_Talks/Grants/09_Feb/PrelimData_Grant_Feb09/Clus2_247genes.sample2.fas' oFile2= '/Users/biggus/Documents/James/Writings_Talks/Grants/09_Feb/PrelimData_Grant_Feb09/Clus2_247genes.test2.fas' firstDic, secDic = Fasta.random_split(fasFile,0.25) Fasta.write(firstDic,oFile1) Fasta.write(secDic,oFile2) print 'done'
# Benchmark script: run MDscan and MEME on one cluster's FASTA file and record
# the wall-clock time each one takes.
from TAMO import MotifTools
from TAMO.seq import Fasta
from TAMO.MotifMetrics import ProbeSet
from TAMO.MD.AlignAce import AlignAce
from TAMO.MD.MDscan import MDscan
from TAMO.MD.Meme import Meme
#from TAMO.DataSources import GO
from time import time

# Input sequences for the cluster under test.
fastaPath = '/Users/biggus/Documents/James/Data/ClusterDefs/TC-Fastas/TC-96.oneLine.fas'
clusterIDS = Fasta.ids(fastaPath)
# FIXME (author's note): the ProbeSet is built from the cluster file itself;
# it should probably be built from the "goodAffys" sequence set instead.
totalSeqs = ProbeSet(fastaPath) # !! this is wrong should proly be goodAffys
# MDscan background file; currently unused because the bgfile argument below
# is commented out.
MDbg = '/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Anopheles/2KBupTSS_goodAffyAGAPsFastasOUT.masked.nr.MD.bg'
outFile = '/Users/biggus/Documents/James/Data/ClusterDefs/testTAMOmetrics.txt'
#theAce = AlignAce(fastaPath,width=10)

# Time MDscan; the timestamps bracket only the MDscan(...) call.
print 'running MDscan...'
tMD_1 = time()
MDmotifs = MDscan(fastaPath) #,bgfile=MDbg)
tMD_2 = time()
MD_time = tMD_2-tMD_1
print 'MDscan took %.5f sec == %.3f min.\nMDscan found %s motifs.' % (MD_time,MD_time/60.0, len(MDmotifs.motifs))

# Time MEME the same way.  NOTE(review): Meme_time is computed but not
# reported within this part of the script.
print 'running MEME...'
tMeme_1 = time()
memeMotifs = Meme(fastaPath)
tMeme_2 = time()
Meme_time = tMeme_2-tMeme_1
def motif_matrix(fsa, motif, outfile, genome='mm9'):
    """Write a motifs-by-sequences matrix of normalised best motif scores.

    Arguments:
      fsa     -- Fasta file of sequences to score.
      motif   -- motif file read with MotifTools.load.
      outfile -- destination for the matrix, written with np.savetxt using
                 the '%1.3f' format.
      genome  -- 'hg18' selects the human promoter Markov file; any other
                 value selects the mouse Markov file.

    Each motif's log-probabilities are adjusted in place by subtracting
    log2 of the zeroth-order background frequency of each letter.  The
    adjusted motif's best score on each upper-cased sequence is then rescaled
    to (score - minscore) / (maxscore - minscore).  Rows follow the motif
    order of the loaded file; columns follow the order of F.values().
    """
    if genome == 'hg18':
        markov = "/nfs/genomes/human_gp_mar_06/hg18_promoters_3000_1000.markov"
    else:
        markov = "/nfs/data/cwng/chipseq/hypotheses/Mouse.markov"

    # The background depends only on `markov`, so it is loaded once here
    # rather than re-read from disk for every motif.
    m = MotifTools.load(motif)
    EM.loadMarkovBackground(markov)
    bg = EM.theMarkovBackground.zeroth()
    log2 = math.log(2.0)

    F = Fasta.load(fsa, key_func=lambda x: x)
    # Upper-case each sequence once instead of once per motif.
    seqs = [seq.upper() for seq in F.values()]
    n_seqs = len(seqs)
    n_motifs = len(m)
    SCORES = np.zeros((n_motifs, n_seqs), dtype='float')

    for i, M in enumerate(m):
        ll = M.logP
        # Background-adjust the motif's log-probability matrix (mutates M.logP).
        for pos in ll:
            for letter in pos.keys():
                pos[letter] = pos[letter] - math.log(bg[letter]) / log2
        AM = MotifTools.Motif_from_ll(ll)
        mi, ma = AM.minscore, AM.maxscore
        for j, seq_fwd in enumerate(seqs):
            # Only the given strand is scored here.
            max_score = AM.bestscore(seq_fwd)
            SCORES[i, j] = (max_score - mi) / (ma - mi)

    np.savetxt(outfile, SCORES, fmt='%1.3f')
from TAMO.seq import Fasta from gusPyCode.defs.bioDefs import geneList2FastaDict from gusPyCode.defs.mosqData import promoterSeqPaths geneList = map(lambda l: l.strip(), \ open('/Users/biggus/Documents/James/Collaborations/Campbell/data/CCupAt4Days.gte2x.genes.txt', 'rU')) sourceFasta = promoterSeqPaths.Aa_2000bpUp_hardMasked_shuf1 oFile = '/Users/biggus/Documents/James/Collaborations/Campbell/data/CCupAt4Days.gte2x.masked.shuffled.1.fas' newFasta = geneList2FastaDict(geneList, sourceFasta, hardMasked=True) newFasta = Fasta.text(newFasta) oFile = open(oFile, 'w') oFile.write(newFasta) print 'Done'
def memefiles2tamo(files, tamoname): global probefile, PROBESET motifs = [] for filename in files: print ">>>SDFSD>F ", filename if re.search('\.ace$', filename): mdobject = AlignAce.AlignAce(filename) if not mdobject.fastafile: mdobject.fastafile = filename.replace('.ace', '.fsa') elif re.search('\.meme.*$', filename): mdobject = Meme.Meme(filename) if not mdobject.fastafile: mdobject.fastafile = re.sub('\..\.meme', '.meme', filename).replace('.meme', '.fsa') motifs.extend(mdobject.motifs) #fsaname = find_fsa(mdobject.fastafile) print mdobject.fastafile fsaname = Fasta.find(mdobject.fastafile) fsaD = Fasta.load(fsaname) probes = fsaD.keys() if not probefile: PROBESET = MotifMetrics.ProbeSet('YEAST') #PROBESET= pick_genome(fsaname) for key, seq in fsaD.items(): PROBESET.probes[key] = seq for motif in motifs: if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif, probes, 'v') if motif.church == 1: motif.church = PROBESET.church(motif, probes, 'v') if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif, probes, 3, 'v') #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v') #if motif.E_seq == None: motif.E_seq = PROBESET.E_seq(motif,probes,'v') if motif.ROC_auc == None: motif.ROC_auc = PROBESET.ROC_AUC(motif, probes, 'v') if motif.MNCP == None: motif.MNCP = PROBESET.MNCP(motif, probes, 'v') if motif.frac == None: motif.frac = PROBESET.frac(motif, probes, 'v', 0.7) if re.search('\.meme$', filename): motif.MAP = -math.log(motif.evalue) / math.log(10) if 1 and (motif.CRA == None): try: pass CRA, Cfrac = PROBESET.cons_ROC_AUC(motif, probes, 'v', tuple='YES') motif.CRA = CRA motif.Cfrac = Cfrac except: pass if re.search('\.meme$', filename): mdobject.motifs.sort(lambda x, y: cmp(x.pvalue, y.pvalue)) else: mdobject.motifs.sort(lambda x, y: cmp(x.church, y.church)) MotifTools.save_motifs(motifs, tamoname)
def main(): try: opts, args = getopt.getopt(sys.argv[1:], "f:m:n:L:t:a:S:i:", ["help", "output="]) # AD added 'i' except getopt.GetoptError: usage() sys.exit(1) if not opts: usage() sys.exit(1) print "#" + ' '.join(sys.argv) fastafile, motiffile, motifnums, labels, thresh = (None, None, [], None, 0.75) # AD changed thresh val to 0.75 from 0.7 ambigs = [] scale = 50.0 / 1000.0 motifs = [] for opt, value in opts: #print opt, value if opt == '-f': fastafile = value elif opt == '-m': motifs.extend(MotifTools.txt2motifs(value)) elif opt == '-n': motifnums = [int(x) for x in value.split(',')] elif opt == '-L': labels = list(value) elif opt == '-t': thresh = float(value) elif opt == '-a': ambigs.extend(value.split(',')) elif opt == '-S': scale = float(value) elif opt == '-i': motiffile = value # AD added this option to ACTUALLY supply the tamo motif file at the command-line. The code to deal with motiffiles already existed. There was just no code for User to supply one. probes = Fasta.load(fastafile) if motiffile: for f in motiffile.split(','): # AD added this to allow supplying multiple tamo files at the prompt like you can supply multiple motifs motifs.extend(MotifTools.load(f)) if ambigs: for ambig in ambigs: motifs.append( MotifTools.Motif_from_text(ambig,0.1) ) if not motifnums: motifnums = range(len(motifs)) print '# %d: %s'%(len(motifs),motifnums) for i in range(len(motifnums)): motif = motifs[motifnums[i]] if labels and i < len(labels): txt = labels[i] else: txt = '%d'%i print '%-3s : %s %5.2f (%4.2f)'%(txt,motif,thresh*motif.maxscore,thresh) probehits = {} for key in probes.keys(): hits_by_motif = [] save_flag = 0 if re.search('[BDHU]',probes[key]): continue for num in motifnums: result = motifs[num].scan(probes[key],thresh*motif.maxscore) if result[0]: hits_by_motif.append(result) save_flag = 1 else: hits_by_motif.append(None) if save_flag: probehits[key]=hits_by_motif #scale = .1 maxw = 40 for key in probehits.keys(): l = len(probes[key]) a = list('-'* 
int(scale*l) ) a.extend( list(' '*10 ) ) desc = [] matches = probehits[key] for i in range(len(matches)): if matches[i]: subseqs,endpoints,scores = matches[i] for idx in range(len(subseqs)): start,stop = endpoints[idx] subseq = subseqs[idx] score = scores[idx] if labels and (i<len(labels)): ID = labels[i] else : ID = '%d'%i desc.append('%s %s %d-%d %4.2f '%(ID,subseq,start,stop,score)) start = int(start*scale) for offset in range(10): if a[start+offset] == '-': if labels and (i < len(labels)): a[start+offset] = labels[i] else: a[start+offset] = '%d'%i break print '%-14s %s'%(key,''.join(a)), print ' '*max(0,maxw-len(a)), '| '.join(['%-27s'%x for x in desc]) print print "Found matches in %d of %d input probes"%(len(probehits),len(probes))
def main(): if len(sys.argv) < 2: print "Usage: %s <fasta_file> [width = None ] [options]"%(re.sub('^.*/','',sys.argv[0])) print "Options include:" print "" print " EM Parameters:" print " -beta [0.01] Beta for pseudocounts" print " -seedbeta[0.02] Beta for pseudocounts for seeds from text" print " -gamma [0.2] Gamma (fraction of sequences)" print " -delta [0.001] Convergence criteria" print " " print " Seeds (not actually proper priors)" print " -prior Seqences or motifs for seeds (may be repeated)" print " -top N [0] Include w-mers in top N probes" print " -gap string sample gapped motifs" # print " -TF Seed with (all) TRANSFAC PSSMs (buggy)" print " -kmerseeds Use kmers with best enrichment score as seeds for EM" print " -pad add NN..NN to seed" print " " print " Genome / Background model " print " -human (250,1000) Use Human Background model" print " -g genome.fsa Use specicied Fasta file as background (searches first for matching frequency file)" # print " -Y2K, -Y5C Use Yeast Upstream Intergenic regions (2000, 500)" # print " -B Use Bacterial Orfs" print " " print "Examples:" print " %s t.fsa 5 -prior GGGTA -prior AAAAAC "%(sys.argv[0].split('/')[-1]) print " will start an EM with 3 seeds: GGGTA, AAAAA, and AAAAC" print print " %s t.fsa 5 -info CUP9.info -gamma 0.5 "%(sys.argv[0].split('/')[-1]) print " will start an EM with Enriched seeds in CUP9.info, with" print " Gamma expectation of 50% of all probes" print print " %s t.fsa -prior MCM1_5.tamo:0 "%(sys.argv[0].split('/')[-1]) print " will start an EM with 0th motif of the file MCM1_5.tamo" print " as a seed" print sys.exit(1) fastafile = sys.argv[1] #Echo the command line print "#" + ' '.join(map(lambda x: re.sub(' ','\ ',x), sys.argv)) if sys.argv[2].isdigit(): width = sys.argv[2] else: width = None algorithm = '' beta = '' seedbeta = '' deltamin = '' gamma = 0.2 infofile = '' seedmodels= [] species = 'YEAST' valid_tfs = [] #NOT USED gapped_syl= None gapflank = 0 gapweight = 0.2 enrichfact= 0.7 pmax = 0 
#False TFSEEDS = 0 TFMids = [] pad = None bgfile = None seed_count = 0 #Default: Take the top 0 seed_s = [] #Initialize seq array '''Parse command-line arguments''' for tok,i in zip(sys.argv,xrange(len(sys.argv))): if tok == '-top' : seed_count = int(sys.argv[i+1]) elif tok == '-greedy': algorithm = "GREEDY" elif tok == '-prior' : seed_s.append(sys.argv[i+1]) elif tok == '-beta' : beta = float(sys.argv[i+1]) elif tok == '-seedbeta': seedbeta = float(sys.argv[i+1]) elif tok == '-gamma' : gamma = float(sys.argv[i+1]) elif tok == '-delta' : deltamin = float(sys.argv[i+1]) elif tok == '-kmerseeds' : infofile = 1 elif tok == '-valid' : valid_tfs.append(sys.argv[i+1]) #NOT USED elif tok == '-w' : width = sys.argv[i+1] elif tok == '-width' : width = sys.argv[i+1] elif tok == '-gap' : gapped_syl = sys.argv[i+1] elif tok == '-gapflank' :gapflank = int(sys.argv[i+1]) elif tok == '-gapweight':gapweight = float(sys.argv[i+1]) elif tok == '-enrichfact':enrichfact= float(sys.argv[i+1]) elif tok == '-pmax' : pmax = 1 elif tok == '-Y2K' : species = "YEAST_2000_UP" elif tok == '-Y5C' : species = "YEAST_500_UP" elif tok == '-B' : species = "BAC_ORF" elif tok == '-Ch22' : species = "Ch22" elif tok == '-genome': species = sys.argv[i+1] elif tok == '-pad' : pad = "TRUE" elif tok == '-bgfile': bgfile = sys.argv[i+1] elif tok == '-TF' : #NOT USED (TRANSFAC NOT SUPPLIED WITH DISTRIBUTION) TFSEEDS = 1 for j in range(i+1,len(sys.argv)): if re.match('M0',sys.argv[j]): TFMids.append(sys.argv[j]) else: break elif tok == '-human' : _s = '' if sys.argv[i+1].isdigit(): _s = '_'+sys.argv[i+1] else: _s = '' species = 'HUMAN'+_s if infofile: infofile = fastafile if bgfile: EM.loadMarkovBackground(bgfile) elif not ('-random_background' in sys.argv or '-nomarkov' in sys.argv): EM.loadMarkovBackground(species) else: EM.theMarkovBackground = EM.Zeroth() fsaD = Fasta.load(fastafile) Fasta.delN(fsaD) seqs = fsaD.values() probes = fsaD.keys() all_seqs = seqs seed_s.extend(seqs[0:min(seed_count,len(seqs))]) 
if infofile and width=='info': width = info2width(infofile) elif width != None: width = int(width) #Alternate source of seeds if infofile: if 1 or width: seedmodels.extend(info2seeds(width,infofile,fastafile,species)) else: print 'Error: need to specify motif width w/ .info file' #Any -prior pointers to motifs in other files? (seed_s, motifs) = parse_priors(seed_s) seedmodels.extend(motifs) #Should we get seeds from TRANSFAC? if TFSEEDS: #NOT USED tf = [] D = tfmats() if not TFMids: keys = D.keys() else: keys = [] for TFMid in TFMids: for key in D.keys(): if key[0:6] == TFMid: keys.append(key) break for key in keys: m = D[key] m.seednum = int(re.sub('M0*','',key.split()[0])) m.seedtxt = '%-24s %s'%(m,key) tf.append(m) tf.sort(lambda x,y: cmp(x.seednum,y.seednum)) seedmodels.extend(tf) #seedmodels.append(tf[33]) if gapped_syl: gapped_priors = gapped_motifs(gapped_syl) gapped_priors = map(lambda x:'N'+x+'N', gapped_priors) seed_s.extend(gapped_priors) if pad: print '# Padding models with NN-m-NN' newmodels = [] left = MotifTools.Motif_from_text('@') right = MotifTools.Motif_from_text('N') for m in seedmodels: newmodels.append(left + m + right) print left + m + right seedmodels = newmodels ''' Set everything up and GO!! ''' global theEM theEM = EM.EM(seed_s,[],width,"VERBOSE") if beta: theEM.beta = beta if deltamin: theEM.deltamin = deltamin if seedbeta: theEM.seedbeta = seedbeta theEM.param['gamma'] = gamma theEM.seqs.extend(all_seqs) theEM.models = seedmodels theEM.gapflank = gapflank theEM.gapweight = gapweight theEM.report() theEM.EM_Cstart() #GO!! 
#print "#Sorting candidates" #sys.stdout.flush() #EM.candidates.sort(lambda x,y: cmp(y.MAP,x.MAP)) ''' Compute some metrics ''' print "#Loading Genome %s"%species ; sys.stdout.flush() Genome = ProbeSet(species,enrichfact) ids = Genome.ids_from_file(fastafile) for C in theEM.candidates: if not pmax: C.pssm.pvalue = Genome.p_value(C.pssm,ids,'verbose') C.pssm.church = Genome.church(C.pssm,ids) C.pssm.frac = Genome.frac(C.pssm,probes,None,0.7) else: (p,frac) = Genome.best_p_value(C.pssm,ids) C.pssm.pvalue = p C.pssm.threshold = frac * C.pssm.maxscore print "Bests:",p,frac matching = Genome.matching_ids(C.pssm,[],factor=0.7) matchbound = [x for x in matching if x in probes] C.pssm.numbound = len(probes) C.pssm.nummotif = len(matching) C.pssm.numboundmotif = len(matchbound) sys.stdout.flush() ''' Print out all motifs (sorted by Enrichment) in an AlignACE-like form ''' theEM.candidates.sort(lambda x,y: cmp(x.pssm.pvalue,y.pssm.pvalue)) for C,i in zip(theEM.candidates,range(len(theEM.candidates))): C.pssm.maxscore = -100 #May have side effects. Recompute when done if C.pssm.valid: #NOT USED _t = C.pssm.valid if not _t[0]: vstring = "(--- %8.4f %8.4f %s)"%(_t[1],_t[2],_t[3]) else: vstring = "(HIT %8.4f %8.4f %s)"%(_t[1],_t[2],_t[3]) else: vstring = '' C.pssm._maxscore() #Recomputed MotifTools.print_motif(C.pssm,20,i) sys.stdout.flush() continue #Antiquated stuff -- Remove !! 
print "Log-odds matrix for Motif %3d %s"%(i,C) C.pssm._print_ll() print "Sequence Logo" C.pssm._print_bits() flush() #print '# %3d matching sequences at 90%%'%len(C.pssm.bestseqs(C.pssm.maxscore * 0.9)) flush() m = C.pssm if not m.__dict__.has_key('gamma'): m.gamma = None #Kludge to deal w/ old shelves if m.seedtxt: print "Seed: %3d %s"%(i,m.seedtxt) if m.source: print "Source: ",m.source if m.gamma: print "Gamma: %7.5f"%m.gamma if m.threshold: print "Threshold: %5.2f"%m.threshold #if C.pssm.seedtxt: # print 'Seed %3d %-25s'%(i,C.pssm.seedtxt) if C.pssm.church != None: vstring = 'ch: %5.2f %s'%( math.fabs(math.log(C.pssm.church)/math.log(10)), vstring) print "Motif %3d %-25s nlog(p): %6.3f %s"%(i,C,-math.log(C.pssm.pvalue)/math.log(10),vstring) if C.pssm.threshold: print "Threshold: %6.3f %4.1f%%"%( C.pssm.threshold, 100.0*C.pssm.threshold/C.pssm.maxscore) C.pssm.maxscore = -1e100 #May have side effects. Recompute when done for seq in C.wmers: print seq,i,C.pssm.scan(seq)[2][0] C.pssm._maxscore() #Recomputed print '*'*len(seq) print "MAP Score: %f"%C.MAP sys.stdout.flush() sys.stdout.flush() sys.exit(0) #Avoid ridiculous python cleanup times