def SGDData():
    """Download the SGD yeast data files and assemble the genome FASTA.

    The feature tables, protein sequence files and one sequence file per
    chromosome are fetched into ``TAMO.paths.SGDdir`` through
    ``downloadfiles``.  The per-chromosome files are then merged into a
    single ``NCBI_yeast_genome.fsa`` keyed as ``'chr<N> <original id>'``.
    """
    root = TAMO.paths.SGDdir
    urlroot = 'ftp://genome-ftp.stanford.edu/pub/yeast/data_download/'

    # Chromosome labels '01' .. '16' plus the mitochondrial genome.
    chrs = ['%02d' % number for number in range(1, 17)] + ['mt']

    files = [
        'chromosomal_feature/SGD_features.tab',
        'chromosomal_feature/dbxref.tab',
        'chromosomal_feature/chromosome_length.tab',
        'sequence/GenBank/yeast_nrpep.fasta.gz',
        'sequence/genomic_sequence/orf_protein/orf_trans_all.fasta.gz',
        # (url, local name) pair for a file hosted outside of SGD
        ('http://yeastgfp.ucsf.edu/allOrfData.txt', 'Huh_Nature_2003.tab'),
    ]
    for label in chrs:
        files.append('sequence/NCBI_genome_source/chr%s.fsa' % label)

    downloadfiles(root, urlroot, files)

    from TAMO.seq import Fasta
    print("Assembling yeast genome sequence files into a single file (NCBI_yeast_genome.fsa)")

    sgd_dir = TAMO.paths.SGDdir
    genome = {}
    for label in chrs:
        records = Fasta.load('%s/chr%s.fsa' % (sgd_dir, label))
        # Only the first record of each chromosome file is used.
        seq_id, sequence = list(records.items())[0]
        if label.startswith('0'):
            # '01' -> '1', so the key reads 'chr1' rather than 'chr01'
            label = label[1]
        genome['chr%s %s' % (label, seq_id)] = sequence
    Fasta.write(genome, sgd_dir + 'NCBI_yeast_genome.fsa')
def main(fastafile, outDirectory):
    """Write pseudocounted 1- to 6-mer frequencies of a FASTA file to disk.

    For every width from 1 to 6 each possible n-mer starts with a
    pseudocount of one; every observed n-mer count is then added to both the
    n-mer and its reverse complement.  The result is written to
    ``<outDirectory>/<basename of fastafile>.freq``.

    fastafile    -- path of the FASTA file to analyse
    outDirectory -- directory that receives the ``.freq`` file
    """
    # !! 1/2/09 AD added 'fastafile' var and changed 'if __name__' as way to
    # call this from script.
    seqsD = Fasta.load(fastafile)
    seqs = seqsD.values()
    basename = fastafile.split('/')[-1]
    output = []
    for w in range(1, 7):
        allnmers = permute(w)
        nmersT = MotifTools.top_nmers(w, seqs, 'with counts', 'purge Ns')
        nmersD = {}
        total = 0
        for nmer in allnmers:
            nmersD[nmer] = 1  # Pseudo count
            total = total + 1
        for nmer, count in nmersT:
            try:
                rc = MotifTools.revcomplement(nmer)
                nmersD[nmer] = nmersD[nmer] + count
                nmersD[rc] = nmersD[rc] + count
                total = total + 2 * count
            except KeyError:
                # n-mers that are not in the permuted alphabet are ignored
                pass
        # AD 02-27-09 added a '\n' to make file look right
        output.append("# freq in %s (total %d with pseudocounts)\n" % (basename, total))
        for nmer in sorted(nmersD.keys()):
            output.append("%-7s %20.17f\n" % (nmer, float(nmersD[nmer]) / total))

    # Write the results; the handle is closed even if a write fails, so the
    # data is flushed to disk before the function returns.
    outFile = open('%s/%s.freq' % (outDirectory, basename), 'w')
    try:
        outFile.writelines(output)
    finally:
        outFile.close()
def main():
    """Print pseudocounted 1- to 6-mer frequencies for the FASTA in sys.argv[1].

    Each possible n-mer starts at a pseudocount of one; observed counts are
    credited to the n-mer and to its reverse complement.  One header line and
    one line per n-mer (sorted) is printed for every width.
    """
    fasta_path = sys.argv[1]
    sequences = Fasta.load(fasta_path).values()
    for width in range(1, 7):
        every_nmer = permute(width)
        observed = MotifTools.top_nmers(width, sequences, 'with counts', 'purge Ns')

        tallies = {}
        grand_total = 0
        for word in every_nmer:
            tallies[word] = 1  # pseudocount
            grand_total += 1

        for word, seen in observed:
            try:
                partner = MotifTools.revcomplement(word)
                tallies[word] = tallies[word] + seen
                tallies[partner] = tallies[partner] + seen
                grand_total = grand_total + 2 * seen
            except KeyError:
                # words outside the permuted alphabet are skipped
                pass

        print("# freq in %s (total %d with pseudocounts)" % (fasta_path, grand_total))
        for word in sorted(tallies.keys()):
            print("%-7s %20.17f" % (word, float(tallies[word]) / grand_total))
        sys.stdout.flush()
def loadMiRNAs(miRNA_Path):
    """
    Load a fasta file of mature miRNAs.

    Returns the dict produced by Fasta.load for miRNA_Path.
    """
    mature_miRNAs = Fasta.load(miRNA_Path)
    return mature_miRNAs
def main():
    """Print pseudocounted 1- to 6-mer frequencies for the FASTA in sys.argv[1].

    Every possible n-mer is seeded with a pseudocount of one, then each
    observed count is added to the n-mer and its reverse complement.  For
    each width a header and the sorted n-mer frequencies are printed.
    """
    source = sys.argv[1]
    seq_list = Fasta.load(source).values()
    for size in range(1, 7):
        candidates = permute(size)
        counted = MotifTools.top_nmers(size, seq_list, 'with counts', 'purge Ns')

        freq = {}
        denominator = 0
        for kmer in candidates:
            freq[kmer] = 1  # pseudocount
            denominator += 1

        for kmer, hits in counted:
            try:
                mirror = MotifTools.revcomplement(kmer)
                freq[kmer] = freq[kmer] + hits
                freq[mirror] = freq[mirror] + hits
                denominator = denominator + 2 * hits
            except KeyError:
                # k-mers missing from the permuted set are ignored
                pass

        print("# freq in %s (total %d with pseudocounts)" % (source, denominator))
        for kmer in sorted(freq.keys()):
            print("%-7s %20.17f" % (kmer, float(freq[kmer]) / denominator))
        sys.stdout.flush()
def calcStats(fastaPath):
    """Return nucleotide composition statistics for a FASTA file.

    All sequences are concatenated and upper-cased before counting.

    Returns a dict with the keys:
      seqLen    -- number of sequences in the file (historical key name)
      totNucs   -- total number of characters over all sequences
      aCnt, cCnt, gCnt, tCnt, nCnt -- per-letter counts
      nonNs     -- aCnt + cCnt + gCnt + tCnt
      n2tot     -- nCnt / totNucs
      n2nonN    -- nCnt / nonNs
      percentGC -- (gCnt + cCnt) / nonNs (a fraction, not a percentage)

    Ratios whose denominator is zero (empty file, or only N characters) are
    reported as 0.0 instead of raising ZeroDivisionError.
    """
    seqFile = Fasta.load(fastaPath)
    # join() avoids the quadratic cost of repeated string concatenation
    combinedSeq = ''.join([seqFile[each] for each in seqFile]).upper()

    seqs = len(seqFile)
    totNucs = len(combinedSeq)
    aCnt = combinedSeq.count('A')
    cCnt = combinedSeq.count('C')
    gCnt = combinedSeq.count('G')
    tCnt = combinedSeq.count('T')
    nCnt = combinedSeq.count('N')
    nonNs = aCnt + cCnt + gCnt + tCnt

    if totNucs:
        n2tot = float(nCnt) / totNucs
    else:
        n2tot = 0.0
    if nonNs:
        n2nonN = float(nCnt) / nonNs
        percentGC = (float(gCnt) + cCnt) / nonNs
    else:
        n2nonN = 0.0
        percentGC = 0.0

    return {'seqLen': seqs,
            'totNucs': totNucs,
            'aCnt': aCnt,
            'cCnt': cCnt,
            'gCnt': gCnt,
            'tCnt': tCnt,
            'nCnt': nCnt,
            'nonNs': nonNs,
            'n2tot': n2tot,
            'n2nonN': n2nonN,
            'percentGC': percentGC}
def orf2pseq(orf):
    """Return the protein sequence for ``orf``, or '' when it is unknown.

    The ORF -> protein table is loaded from ``_ORFPSEQS`` on first use and
    cached in the module-level ``_orfpseqs``; a trailing '*' (stop codon
    marker) is stripped from each sequence at load time.
    """
    global _orfpseqs
    if not _orfpseqs:
        from TAMO.seq import Fasta
        _orfpseqs = Fasta.load(_ORFPSEQS)
        for _orf, pseq in list(_orfpseqs.items()):
            # endswith() is safe for empty records, where pseq[-1] would
            # raise IndexError
            if pseq.endswith('*'):
                _orfpseqs[_orf] = pseq[:-1]
    return _orfpseqs.get(orf, '')
def orf2pseq(orf):
    """Return the protein sequence for ``orf``, or '' when it is unknown.

    The ORF -> protein table is read from ``_ORFPSEQS`` the first time it is
    needed and kept in the module-level ``_orfpseqs``.  A trailing '*' (stop
    codon marker) is removed from each sequence when the table is loaded.
    """
    global _orfpseqs
    if not _orfpseqs:
        from TAMO.seq import Fasta
        _orfpseqs = Fasta.load(_ORFPSEQS)
        for _orf, pseq in list(_orfpseqs.items()):
            # pseq[-1] would raise IndexError on an empty record;
            # endswith() handles that case
            if pseq.endswith('*'):
                _orfpseqs[_orf] = pseq[:-1]
    return _orfpseqs.get(orf, '')
def genomebg(infile, outfile):
    """Run the MDscan ``genomebg.linux`` tool on a FASTA file.

    The input is re-written to a temporary FASTA file with one very long
    line per sequence (linelen=1000000000), the external program is run on
    it, and everything the program prints (stdout and stderr) is echoed.

    infile  -- path of the FASTA file to process
    outfile -- path handed to the tool's ``-o`` option
    """
    EXE = MDSCAN_DIR + 'genomebg.linux'
    fsaD = Fasta.load(infile)
    # mkstemp() creates the file atomically; mktemp() only returns a name
    # and is open to a race with other processes.
    fd, tmpfsa = tempfile.mkstemp()
    os.close(fd)
    try:
        Fasta.write(fsaD, tmpfsa, linelen=1000000000)
        # NOTE: the command is passed to a shell; infile/outfile containing
        # shell metacharacters are not escaped here.
        CMD = '%s -i %s -o %s' % (EXE, tmpfsa, outfile)
        FID = os.popen('( %s ;) 2>&1' % CMD, 'r')
        for line in FID.readlines():
            print(line)
        if FID.close():
            # close() returns a non-zero status when the command failed
            print("Exited")
    finally:
        # remove the temporary file even when writing or running fails
        os.unlink(tmpfsa)
def genomebg(infile, outfile):
    """Run the MDscan ``genomebg.linux`` tool on a FASTA file.

    The sequences are copied to a temporary FASTA file written with one very
    long line per sequence (linelen=1000000000); the external program is
    then run on that copy and its combined stdout/stderr is echoed.

    infile  -- path of the FASTA file to process
    outfile -- path handed to the tool's ``-o`` option
    """
    EXE = MDSCAN_DIR + 'genomebg.linux'
    fsaD = Fasta.load(infile)
    # mkstemp() creates the file securely; mktemp() merely picks a name that
    # another process could claim first.
    fd, tmpfsa = tempfile.mkstemp()
    os.close(fd)
    try:
        Fasta.write(fsaD, tmpfsa, linelen=1000000000)
        # NOTE: this string is run by a shell; paths with shell
        # metacharacters are not escaped here.
        CMD = '%s -i %s -o %s' % (EXE, tmpfsa, outfile)
        FID = os.popen('( %s ;) 2>&1' % CMD, 'r')
        for line in FID.readlines():
            print(line)
        if FID.close():
            # a non-zero close() status means the command failed
            print("Exited")
    finally:
        # always clean up the temporary file, even after an error
        os.unlink(tmpfsa)
def __init__(self, fg_file, bg_file, cv_level, markov_file):
    """Set default parameters and load foreground/background sequences.

    fg_file     -- FASTA file of foreground sequences
    bg_file     -- FASTA file of background sequences
    cv_level    -- stored as self.cv_level (not otherwise used in the
                   visible part of this method)
    markov_file -- file passed to EM.loadMarkovBackground

    NOTE(review): the visible code ends after the foreground set is capped
    at MAX_FG; the cross-validation grouping announced by the banner comment
    is not visible here -- the method may continue beyond this view.
    """
    self.cv_level = cv_level
    self.randomize = 0
    self.beta = 0.0
    self.delta = 0.001
    self.refine = 1
    self.motif_file = 'dummy.out'
    self.dump = 0
    self.family = ''
    self.datafiles = (fg_file,bg_file)
    MAX_FG = 2000  # upper bound on the number of foreground sequences kept
    #LOAD MARKOV BACKGROUND#
    print "Loading Markov background file from %s"%markov_file
    EM.loadMarkovBackground(markov_file)
    ##################################################################################
    #divide input sequences into groups according to the desired cross-validation level
    ###################################################################################
    print "Processing input sequences...."
    self.fg_seqs = Fasta.load(fg_file) #load foreground sequences
    for key in self.fg_seqs.keys():
        # keep only the first whitespace-delimited token of each sequence
        # NOTE(review): split()[0] raises IndexError for an empty sequence
        fseq = self.fg_seqs[key]
        self.fg_seqs[key] = fseq.split()[0]
    self.all_probes = Fasta.load(bg_file) #load background sequences
    Fasta.delN(self.fg_seqs)
    Fasta.delN(self.all_probes)
    #first delete any sequences from background that are present in foreground
    for key in self.fg_seqs.keys():
        if (self.all_probes.has_key(key)): del self.all_probes[key]
    # drop background probes that are empty or contain any of the letters
    # S, W, M, K, R, Y
    for key in self.all_probes.keys():
        if ((len(self.all_probes[key])==0) or (re.search('[SWMKRY]', self.all_probes[key]))):
            del self.all_probes[key]
            print "deleting %s"%key
    # randomly discard foreground sequences until at most MAX_FG remain
    while (len(self.fg_seqs.keys())>MAX_FG):
        del self.fg_seqs[self.fg_seqs.keys()[random.randint(0,(len(self.fg_seqs.keys())-1))]]
def loadSeqs(fastaPathList):
    """
    Load every fasta file named in fastaPathList into one dict.

    Records from later files overwrite earlier records with the same name.
    The merged dict is passed through bioDefs.softMaskDict2HardMask before
    it is returned.
    """
    merged = {}
    for fasta_path in fastaPathList:
        records = Fasta.load(fasta_path)
        merged.update(records)
    bioDefs.softMaskDict2HardMask(merged)
    return merged
def memefiles2tamo(files, tamoname):
    """Collect motifs from AlignACE/MEME output files and save them.

    files    -- output file names; names ending in '.ace' are read with
                AlignAce.AlignAce, names containing '.meme' with Meme.Meme
    tamoname -- file name passed to MotifTools.save_motifs

    Uses the module globals probefile, PROBESET and fsafile.  Missing motif
    metrics are filled in from PROBESET before the motifs are saved.

    NOTE(review): after the first loop, `mdobject` and `filename` refer to
    the LAST input file only, so the FASTA lookup, the '.meme' test and the
    final sort apply to that file -- confirm this is intended when several
    files are given.
    """
    global probefile, PROBESET, fsafile
    motifs = []
    for filename in files:
        print ">>>SDFSD>F ",filename
        if re.search('\.ace$',filename):
            mdobject = AlignAce.AlignAce(filename)
            # fall back to the matching .fsa name when none was recorded
            if not mdobject.fastafile: mdobject.fastafile=filename.replace('.ace','.fsa')
        elif re.search('\.meme.*$',filename):
            mdobject = Meme.Meme(filename)
            if not mdobject.fastafile:
                mdobject.fastafile=re.sub('\..\.meme','.meme',filename).replace('.meme','.fsa')
        # NOTE(review): a file matching neither pattern reuses the previous
        # mdobject (or raises NameError on the first iteration)
        motifs.extend(mdobject.motifs)
    #fsaname = find_fsa(mdobject.fastafile)
    print mdobject.fastafile
    # an explicit global fsafile overrides the name stored in the results
    if fsafile:
        fsaname = fsafile
    else:
        fsaname = Fasta.find(mdobject.fastafile)
    fsaD = Fasta.load(fsaname)
    probes = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')
        #PROBESET= pick_genome(fsaname)
    # make sure every input sequence is present in the probe set
    for key,seq in fsaD.items():
        PROBESET.probes[key] = seq
    for motif in motifs:
        # a value of 1 (pvalue, church) or None is treated as "not computed"
        if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v')
        if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v')
        #if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        #if motif.E_seq == None: motif.E_seq = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc== None: motif.ROC_auc= PROBESET.ROC_AUC(motif,probes,'v')
        #if motif.MNCP == None: motif.MNCP = PROBESET.MNCP(motif,probes,'v')
        if motif.frac == None: motif.frac = PROBESET.frac(motif,probes,'v',0.7)
        # for MEME results MAP is set to -log10 of the motif's evalue
        if re.search('\.meme$',filename):
            motif.MAP = -math.log(motif.evalue)/math.log(10)
        # disabled branch: `if 0` never executes
        if 0 and (motif.CRA == None):
            try:
                pass
                CRA, Cfrac = PROBESET.cons_ROC_AUC(motif,probes,'v',tuple='YES')
                motif.CRA = CRA
                motif.Cfrac = Cfrac
            except: pass
    # sort the last-loaded result object's motifs (ascending)
    if re.search('\.meme$',filename):
        mdobject.motifs.sort(lambda x,y: cmp(x.pvalue, y.pvalue))
    else:
        mdobject.motifs.sort(lambda x,y: cmp(x.church, y.church))
    MotifTools.save_motifs(motifs,tamoname)
def get_seq(chr, start=None, stop=None):
    """Return a stretch of yeast chromosome sequence.

    chr   -- chromosome as 'chr4', a bare number/name (4, 'X'), or a range
             string such as 'chr4:454-465'
    start -- 1-based first position (inclusive); None means the beginning
    stop  -- 1-based last position (inclusive); None means the end

    When ``start`` is None and ``chr`` contains a ':' range, the coordinates
    are taken from the range string.  The genome is loaded from
    ``SGDdir + 'NCBI_yeast_genome.fsa'`` on first use and cached in the
    module-level ``ChrD``.
    """
    global ChrD
    if not ChrD:
        from TAMO.seq import Fasta
        ChrD = Fasta.load(SGDdir + 'NCBI_yeast_genome.fsa')
    if (type(chr) != type('')) or (chr.find('chr') != 0):
        # 1 -> chr1, 'X' -> chrX
        chr = 'chr%s' % chr
    if (start is None) and chr.find(':') > 0:
        # chr4:454-465 -> chr4, 454, 465
        chr, _range = chr.split(':')
        _first, _last = _range.split('-')
        start, stop = int(_first), int(_last)
    # The slice previously used an undefined name ('end') whenever start and
    # stop were passed explicitly; it now always uses the 'stop' parameter.
    if start is None:
        first = 0
    else:
        first = start - 1
    return ChrD[chr][first:stop]
def get_seq(chr, start=None, stop=None):
    """Return a stretch of yeast chromosome sequence.

    chr   -- chromosome as 'chr4', a bare number/name (4, 'X'), or a range
             string such as 'chr4:454-465'
    start -- 1-based first position (inclusive); None means the beginning
    stop  -- 1-based last position (inclusive); None means the end

    If ``start`` is None and ``chr`` carries a ':' range, the coordinates
    come from the range string.  The genome is read from
    ``SGDdir + 'NCBI_yeast_genome.fsa'`` on first use and cached in the
    module-level ``ChrD``.
    """
    global ChrD
    if not ChrD:
        from TAMO.seq import Fasta
        ChrD = Fasta.load(SGDdir + 'NCBI_yeast_genome.fsa')
    if (type(chr) != type('')) or (chr.find('chr') != 0):
        # 1 -> chr1, 'X' -> chrX
        chr = 'chr%s' % chr
    if (start is None) and chr.find(':') > 0:
        # chr4:454-465 -> chr4, 454, 465
        chr, _range = chr.split(':')
        _first, _last = _range.split('-')
        start, stop = int(_first), int(_last)
    # Explicit start/stop calls used to hit an undefined name ('end'); the
    # slice is now built from the 'stop' parameter in every case.
    if start is None:
        first = 0
    else:
        first = start - 1
    return ChrD[chr][first:stop]
def LoadDNA(verbose=False):
    """Read the DNA sub-sequence described by the DNA section of `params`.

    Reads FILE, CHR, START and END (or LENGTH when END is unset) from
    ``params``; loads the FASTA file and returns the slice
    ``[START:END]`` of the requested chromosome.

    verbose -- when true, progress information is printed

    Returns the selected sequence, or "" when no FILE is configured or the
    chromosome is not present in the file.
    """
    dna = ""
    fastafile = params.GetString(DNA_section, "FILE")
    if fastafile:
        chromo = params.GetString(DNA_section, "CHR")
        chr_start = params.GetInt(DNA_section, "START")
        chr_end = params.GetInt(DNA_section, "END")
        if not chr_end:
            # no explicit END: derive it from START + LENGTH
            chr_end = params.GetInt(DNA_section, "LENGTH")
            chr_end += chr_start
        if verbose:
            print("Loading fasta: [%s]\n" % fastafile)
        seqs = Fasta.load(fastafile)
        if verbose:
            n = sum([len(seqs[name]) for name in seqs])
            print("Genome length = %d, # chromosomes = %d\n" % (n, len(seqs)))
        if chromo in seqs:
            seq = seqs[chromo]
            if verbose:
                print("Chr[%s] = %d nt\n" % (chromo, len(seq)))
            dna = seq[chr_start:chr_end]
            if verbose:
                print("DNA[%d:%d] = %d nt\n" % (chr_start, chr_end, len(dna)))
        else:
            if verbose:
                # previously referenced the undefined name 'filename',
                # which raised NameError instead of reporting the problem
                print("Cannot find [%s] chromosome in %s\n" % (chromo, fastafile))
    if verbose:
        print("DNA:[%s]\n" % dna)
    return dna
def LoadDNA():
    """Read the DNA sub-sequence described by the DNA section of `params`.

    Reads FILE, CHR, START and END (or LENGTH when END is unset) from
    ``params``; loads the FASTA file, prints progress information, and
    returns the slice ``[START:END]`` of the requested chromosome.

    Returns the selected sequence, or "" when no FILE is configured or the
    chromosome is not present in the file.
    """
    dna = ""
    fastafile = params.GetString(DNA_section, "FILE")
    if fastafile:
        chromo = params.GetString(DNA_section, "CHR")
        chr_start = params.GetInt(DNA_section, "START")
        chr_end = params.GetInt(DNA_section, "END")
        if not chr_end:
            # no explicit END: derive it from START + LENGTH
            chr_end = params.GetInt(DNA_section, "LENGTH")
            chr_end += chr_start
        print("Loading fasta: [%s]\n" % fastafile)
        seqs = Fasta.load(fastafile)
        n = sum([len(seqs[name]) for name in seqs])
        print("Genome length = %d, # chromosomes = %d\n" % (n, len(seqs)))
        if chromo in seqs:
            seq = seqs[chromo]
            print("Chr[%s] = %d nt\n" % (chromo, len(seq)))
            dna = seq[chr_start:chr_end]
            print("DNA[%d:%d] = %d nt\n" % (chr_start, chr_end, len(dna)))
        else:
            # previously referenced the undefined name 'filename', which
            # raised NameError instead of reporting the problem
            print("Cannot find [%s] chromosome in %s\n" % (chromo, fastafile))
    print("DNA:[%s]\n" % dna)
    return dna
def swp_find_and_format(swp):
    """Look up one SwissProt record and return its sequence in 70-column lines.

    ``swp`` is searched for within the first 60 characters of every record
    header.  Returns None when nothing matches, or when several headers
    match (the matches are then listed on stdout).  The record table is
    loaded from ``_SWPFASTA`` on first use and cached in ``_swp_seqs``.
    """
    global _swp_seqs
    if not _swp_seqs:
        _swp_seqs = Fasta.load(_SWPFASTA, key_func=lambda x: x)

    matching = [header for header in _swp_seqs.keys() if swp in header[0:60]]
    if not matching:
        return None
    if len(matching) > 1:
        print("# Multiple matches found for %s:" % swp)
        for header in matching:
            print('# %s' % (header,))
        return None

    sequence = _swp_seqs[matching[0]]
    # one newline-terminated chunk per 70 residues
    chunks = [sequence[pos:pos + 70] + '\n' for pos in range(0, len(sequence), 70)]
    return ''.join(chunks)
def swp_find_and_format(swp):
    """Return the sequence of the single SwissProt record matching ``swp``.

    A record matches when ``swp`` occurs in the first 60 characters of its
    header.  The sequence is returned wrapped at 70 characters per line.
    None is returned for no match, and also for multiple matches (which are
    printed).  Records come from ``_SWPFASTA`` and are cached in
    ``_swp_seqs`` after the first call.
    """
    global _swp_seqs
    if not _swp_seqs:
        _swp_seqs = Fasta.load(_SWPFASTA, key_func=lambda x: x)

    found = []
    for title in _swp_seqs.keys():
        if swp in title[0:60]:
            found.append(title)

    if len(found) == 0:
        return None
    if len(found) > 1:
        print("# Multiple matches found for %s:" % swp)
        for title in found:
            print('# %s' % (title,))
        return None

    residues = _swp_seqs[found[0]]
    wrapped = []
    for offset in range(0, len(residues), 70):
        wrapped.append(residues[offset:offset + 70])
        wrapped.append('\n')
    return ''.join(wrapped)
def __init__(self, fastaSeqs, motifDict, thresh=0.5, window=200):
    """Build one SeqMap per input sequence.

    fastaSeqs -- path of a FASTA file (string) or a dict of name -> sequence
    motifDict -- passed unchanged to every SeqMap
    thresh    -- passed to SeqMap as ``thresh``
    window    -- passed to SeqMap as ``window``

    The time taken to build each SeqMap and the running sequence count are
    printed as the maps are created.  Raises AssertionError for an
    unsupported ``fastaSeqs`` type or for more than 250 sequences.
    """
    self.seqMaps = {}
    # Get seqs from fasta.  isinstance() also accepts str/dict subclasses,
    # which the former exact type() comparison rejected.
    assert isinstance(fastaSeqs, (str, dict)),\
           'MapLib arg(fastaSeqs) must be string pointing to file or a seqDict.'
    if isinstance(fastaSeqs, str):
        seqs = Fasta.load(fastaSeqs)
    else:
        seqs = fastaSeqs
    # Instantiate a SeqMap obj for each seq in seqs
    c = 0
    for k in seqs:
        c += 1
        # NOTE(review): hard limit of 250 sequences kept from the original;
        # the reason for it is not visible here.
        assert c <= 250
        realT1 = time()
        self.seqMaps[k] = SeqMap(k, seqs[k], motifDict, thresh=thresh, window=window)
        realT2 = time()
        print('%.4f\t%s' % (realT2 - realT1, c))
def geneList2FastaDict(geneList, sourceFastaPath, hardMasked=True):
    """
    Returns a Dict of requested fasta recs in form SeqName:Sequence.
    Defaults to HardMasked return sequences.

    geneList        -- names of the records wanted
    sourceFastaPath -- fasta file the records are taken from
    hardMasked      -- when true, the result is passed through
                       softMaskDict2HardMask before being returned

    Names that are absent from the source file (or whose sequence is empty)
    are skipped, so the result may be shorter than geneList; a summary of
    given vs. found names is printed.
    """
    sourceDict = Fasta.load(sourceFastaPath)

    # make new dict of all genes both in geneList AND sourceDict
    # new dict may be shorter than geneList!!!!!!
    newDict = {}
    for i in geneList:
        # .get() instead of sourceDict[i]: a missing gene used to raise
        # KeyError rather than being skipped
        seq = sourceDict.get(i)
        if seq:
            newDict[i] = seq

    print("%s genes names given, %s found." % (len(geneList), len(newDict)))

    if hardMasked:
        softMaskDict2HardMask(newDict)

    return newDict
def motif_matrix(fsa, motif, outfile, genome='mm9'):
    """Score every sequence against every motif and save the score matrix.

    fsa     -- FASTA file of sequences (full header lines are used as keys)
    motif   -- motif file read with MotifTools.load
    outfile -- destination for the matrix (np.savetxt, format '%1.3f')
    genome  -- 'hg18' selects the human Markov background file; any other
               value selects the mouse one

    Each motif's log-probability matrix is adjusted by the zeroth-order
    background (log base 2) and each sequence's best score is rescaled to
    (best - minscore) / (maxscore - minscore).  Rows of the matrix are
    motifs, columns are sequences.
    """
    if genome == 'hg18':
        markov = "/nfs/genomes/human_gp_mar_06/hg18_promoters_3000_1000.markov"
    else:
        markov = "/nfs/data/cwng/chipseq/hypotheses/Mouse.markov"

    # Load motifs, background and sequences
    models = MotifTools.load(motif)
    EM.loadMarkovBackground(markov)
    bg = EM.theMarkovBackground.zeroth()
    records = Fasta.load(fsa, key_func=lambda x: x)
    seqs = records.values()
    SCORES = np.zeros((len(models), len(seqs)), dtype='float')

    for row, model in enumerate(models):
        ll = model.logP
        EM.loadMarkovBackground(markov)
        bg = EM.theMarkovBackground.zeroth()
        # subtract log2(background frequency) from every matrix entry
        # (the motif's logP columns are updated in place)
        for column in ll:
            for letter in column.keys():
                column[letter] = column[letter] - math.log(bg[letter]) / math.log(2.0)
        adjusted = MotifTools.Motif_from_ll(ll)
        lowest, highest = adjusted.minscore, adjusted.maxscore

        for col, seq in enumerate(seqs):
            best = adjusted.bestscore(seq.upper())
            SCORES[row, col] = (best - lowest) / (highest - lowest)

    np.savetxt(outfile, SCORES, fmt='%1.3f')
# Extract the fasta records named in a gene-list file from a source fasta.
originalFastaDict = '/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Aedes/aedes2KBupStreamTSS.softMasked.geneStrand.fas'

# Read the list of wanted record names; close the handle once it is read.
desiredFastaFile = open('/Users/biggus/Documents/James/Collaborations/Campbell/data/CCupAt4Days.genes.txt', 'rU')
try:
    desiredFastaList = desiredFastaFile.readlines()
finally:
    desiredFastaFile.close()

outFile = '/Users/biggus/Documents/James/Collaborations/Campbell/data/CCupAt4Days.UNmasked.fas'
hardMask = None
#==========================================================================

# Strip newlines from fasta ID list
desiredFastaList = [line.strip() for line in desiredFastaList]

# Instantiate the fasta rec lists (the name is re-bound from path to dict)
originalFastaDict = Fasta.load(originalFastaDict)

# New dict to catch copied seqObjs
desiredFastaDict = {}
for rec in desiredFastaList:
    if rec in originalFastaDict:
        desiredFastaDict[rec] = originalFastaDict[rec]
    else:
        print(rec + ' not found in source fasta list!')

# Hard Mask if requested: every soft-masked (lowercase) base becomes 'N'.
# The previous code replaced only 'a', leaving c/g/t soft-masked.
if hardMask:
    for x in desiredFastaDict:
        desiredFastaDict[x] = ''.join(
            [base.islower() and 'N' or base for base in desiredFastaDict[x]])
def info2seeds(N, infofile, probefile, species='YEAST'):
    """Build seed motif models from an .info or .fsa file.

    N         -- n-mer width; a false value means the collected sequences
                 themselves are used as candidates
    infofile  -- file whose name ends in 'info' or 'fsa' (see patterns below)
    probefile -- file from which the probe ids are read (G.ids_from_file)
    species   -- name passed to ProbeSet ('human' is mapped to 'HUMAN')

    Candidates are scored with G.p_value and the (up to) 20 best-scoring
    ones are returned as ConvergeMotifTools.Motif objects.

    NOTE(review): the nesting below was reconstructed from a flattened
    source; in particular the extent of the `except:` branch that rebuilds
    the conservation cache should be confirmed against the original file.
    """
    if species == 'human':
        species = 'HUMAN'
    G = ProbeSet(species)
    IDs = G.ids_from_file(probefile)
    Q = EM.theMarkovBackground.zeroth()
    seqs = []
    # NOTE(review): the '.' in these patterns is unescaped, so it matches
    # any character, not only a literal dot
    if re.search('.info$', infofile):
        #I = infoana.Infofile(infofile,'DONT REMOVE QUERY')
        I = infoana.Infofile(infofile)
        print "# Loading infofile: %s" % infofile
        print I
        # pad each binding-site sequence with NNNN on both sides
        seqs = map(lambda x: 'NNNN%sNNNN' % x, I.bsites2seqs(50.0))
    elif re.search('.fsa$', infofile):
        fsaDict = Fasta.load(infofile)
        probes = fsaDict.keys()
        #sequence_repository = KenzieSequences()
        # cache file name: text before the first '.' plus '.cpickle'
        cons_pickle = infofile.split('.')[0] + '.cpickle'
        try:
            CFH = open(cons_pickle, 'r')
            ConsDict = pickle.load(CFH)
            CFH.close()
        except:
            # cache missing or unreadable: rebuild it from G.alignments
            ConsDict = {}
            for probe in probes:
                seqs = []
                cons = []
                try:
                    seq_list = G.alignments[probe]
                except:
                    continue
                # first alignment entry is used as the reference sequence
                if (seq_list != []):
                    cer_seq = seq_list[0][1]
                else:
                    cer_seq = ''
                cer_seq = cer_seq.upper()
                numg = len(seq_list) - 1
                # entries 1..3 of the alignment; '' when an entry is absent
                for i in range(1, 4):
                    try:
                        seqs.append(seq_list[i][1].upper())
                    except:
                        seqs.append('')
                    cons.append([])
                # per position: 1 where the entry differs from the
                # reference, 0 where it agrees
                for position in range(len(cer_seq)):
                    ref = cer_seq[position]
                    for i in range(3):
                        if (seqs[i] == ''): continue
                        if (seqs[i][position] != ref):
                            cons[i].append(1)
                        else:
                            cons[i].append(0)
                ConsDict[probe] = cons
            CFH = open(cons_pickle, 'w')
            pickle.dump(ConsDict, CFH)
            CFH.close()
        # collect every aligned sequence with gap characters removed
        # NOTE(review): `seqs` may still hold entries from the cache-rebuild
        # loop above when this loop starts -- confirm that is intended
        for probe in probes:
            superseq = ''
            try:
                seq_list = G.alignments[probe]
            except:
                continue
            for seq in seq_list:
                subseq = seq[1].replace('-', '')
                subseq = subseq.replace('.', '')
                seqs.append(subseq)
    if not N:
        nmers = seqs
    else:
        if (N < 11):
            nmers = ConvergeMotifTools.top_nmers(N, seqs)
        else:
            # wide motifs: two flanks of N/3 letters around a run of 'N's
            gaplen = N - 2 * (N / 3)
            gr = ''
            for i in range(gaplen):
                gr = gr + 'N'
            nmers = ConvergeMotifTools.top_nmers(N, seqs, 0, '', 1)
            gnmers = []
            for nmer in nmers:
                gnmers.append(nmer[0:(N / 3)] + gr + nmer[(N / 3):2 * (N / 3)])
            nmers = gnmers
    # NOTE(review): lists of 201 or fewer are kept whole, longer ones are
    # cut to 200
    if len(nmers) > 201:
        nmers = nmers[0:200]
    print "Scoring enrichment of %d nmers from .info file" % len(nmers)
    nmers_scoresT = []
    for nmer in nmers:
        if nmer[0:(N / 3)].isalpha():
            p = G.p_value(nmer, IDs, 'verbose')
            #if (species=='Ciona'): ng = 2
            #else: ng = 4
            #p_cons = conservation_pvalue(nmer,IDs,fsaDict,ConsDict,ng)
            #if (p_cons<0.1):
            nmers_scoresT.append((nmer, p))
    # ascending by p-value
    nmers_scoresT.sort(lambda x, y: cmp(x[1], y[1]))
    #for tup in nmers_scoresT:
    #    print tup
    last = min(20, len(nmers_scoresT))
    models = []
    for i in range(last):
        seq = nmers_scoresT[i][0]
        m = ConvergeMotifTools.Motif('', Q)
        m.compute_from_text(seq, 0.1)
        models.append(m)
    return (models)
def freq_from_fasta(self, fastafile):
    """Compute frequencies from the sequences in the FASTA file ``fastafile``.

    The sequences are loaded with Fasta.load and handed to
    ``self.freq_from_seqs``.
    """
    # Load the file named by the argument; the previous code ignored
    # 'fastafile' and always read sys.argv[1].
    seqsD = Fasta.load(fastafile)
    seqs = seqsD.values()
    self.freq_from_seqs(seqs)
def main():
    """Scan FASTA probes for motif matches and print a text map of the hits.

    Command-line options (see ``usage``):
      -f file   FASTA file of probes (required)
      -m file   motifs in text form (MotifTools.txt2motifs); repeatable
      -n list   comma-separated indices of the motifs to use
      -L chars  one label character per selected motif
      -t frac   score threshold as a fraction of each motif's maxscore
      -a list   comma-separated ambiguity-code strings to add as motifs
      -S scale  map columns per base (default 50/1000)

    For each probe with at least one hit a scaled line of '-' characters is
    printed with the motif label placed at each hit, followed by a
    description of every hit.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], "f:m:n:L:t:a:S:", ["help", "output="])
    except getopt.GetoptError:
        usage()
        sys.exit(1)
    if not opts:
        usage()
        sys.exit(1)

    print("#" + ' '.join(sys.argv))
    fastafile, motiffile, motifnums, labels, thresh = (None, None, [], None, 0.7)
    ambigs = []
    scale = 50.0 / 1000.0
    motifs = []
    for opt, value in opts:
        if opt == '-f':
            fastafile = value
        elif opt == '-m':
            motifs.extend(MotifTools.txt2motifs(value))
        elif opt == '-n':
            motifnums = [int(x) for x in value.split(',')]
        elif opt == '-L':
            labels = list(value)
        elif opt == '-t':
            thresh = float(value)
        elif opt == '-a':
            ambigs.extend(value.split(','))
        elif opt == '-S':
            scale = float(value)

    if fastafile is None:
        # -f is required; report usage instead of failing inside Fasta.load
        usage()
        sys.exit(1)
    probes = Fasta.load(fastafile)

    # NOTE: no option sets motiffile, so this branch is currently inactive.
    if motiffile:
        motifs.extend(TAMO.tamofile2motifs(motiffile))
    for ambig in ambigs:
        motifs.append(MotifTools.Motif_from_text(ambig, 0.1))
    if not motifnums:
        motifnums = range(len(motifs))

    print('# %d: %s' % (len(motifs), motifnums))
    for i in range(len(motifnums)):
        motif = motifs[motifnums[i]]
        if labels and i < len(labels):
            txt = labels[i]
        else:
            txt = '%d' % i
        print('%-3s : %s %5.2f (%4.2f)' % (txt, motif, thresh * motif.maxscore, thresh))

    probehits = {}
    for key in probes.keys():
        # skip probes containing the letters B, D, H or U
        if re.search('[BDHU]', probes[key]):
            continue
        hits_by_motif = []
        save_flag = 0
        for num in motifnums:
            scanned = motifs[num]
            # Threshold relative to the motif being scanned.  The previous
            # code used 'motif', the loop variable left over from the
            # listing above, i.e. the last listed motif's maxscore.
            result = scanned.scan(probes[key], thresh * scanned.maxscore)
            if result[0]:
                hits_by_motif.append(result)
                save_flag = 1
            else:
                hits_by_motif.append(None)
        if save_flag:
            probehits[key] = hits_by_motif

    maxw = 40
    for key in probehits.keys():
        l = len(probes[key])
        # one '-' per scaled position, plus 10 spaces of slack so a label
        # can be shifted right when its column is already taken
        a = list('-' * int(scale * l))
        a.extend(list(' ' * 10))
        desc = []
        matches = probehits[key]
        for i in range(len(matches)):
            if not matches[i]:
                continue
            if labels and (i < len(labels)):
                ID = labels[i]
            else:
                ID = '%d' % i
            subseqs, endpoints, scores = matches[i]
            for idx in range(len(subseqs)):
                start, stop = endpoints[idx]
                subseq = subseqs[idx]
                score = scores[idx]
                desc.append('%s %s %d-%d %4.2f ' % (ID, subseq, start, stop, score))
                start = int(start * scale)
                # place the label in the first free column at/after start
                for offset in range(10):
                    if a[start + offset] == '-':
                        a[start + offset] = ID
                        break
        print('%-14s %s %s %s' % (key, ''.join(a),
                                  ' ' * max(0, maxw - len(a)),
                                  '| '.join(['%-27s' % x for x in desc])))
    print('')
    print("Found matches in %d of %d input probes" % (len(probehits), len(probes)))
def main():
    """Command-line driver: seed, run and report an EM motif search.

    Reads the FASTA file named in sys.argv[1], an optional numeric width in
    sys.argv[2] and the options listed in the usage text; builds seed
    models, runs EM.EM, scores the resulting candidates against a ProbeSet
    and prints each motif.  Always ends with sys.exit().

    NOTE(review): nesting was reconstructed from a flattened source; the
    review notes below mark code that looks unintended but is left as-is.
    """
    if len(sys.argv) < 3:
        print "Usage: %s <fasta_file> [width = None ] [options]"%(re.sub('^.*/','',sys.argv[0]))
        print "Options include:"
        print " -valid <tf_name> Check answers against Transfac"
        print " EM Parameters:"
        print " -beta [0.01] Beta for pseudocounts"
        print " -seedbeta[0.02] Beta for pseudocounts for seeds from text"
        print " -gamma [0.2] Gamma (fraction of sequences)"
        print " -delta [0.001] Convergence criteria"
        print " "
        print " Seeds (not actually proper priors)"
        print " -prior Seqences or motifs for seeds (may be repeated)"
        print " -top N [0] Include w-mers in top N probes"
        print " -gap string sample gapped motifs"
        print " -TF Seed with (all) TRANSFAC PSSMs (buggy)"
        print " -info <file.info> for structural priors"
        print " -pad add NN..NN to seed"
        print " "
        print " Genome / Background model "
        print " -human (250,1000) Use Human Background model"
        print " -Y2K, -Y5C Use Yeast Upstream Intergenic regions (2000, 500)"
        print " -B Use Bacterial Orfs"
        print " "
        print "Examples:"
        print " %s t.fsa 5 -prior GGGTA -prior AAAAAC "%(sys.argv[0].split('/')[-1])
        print " will start an EM with 3 seeds: GGGTA, AAAAA, and AAAAC"
        print
        print " %s t.fsa 5 -info CUP9.info -gamma 0.5 "%(sys.argv[0].split('/')[-1])
        print " will start an EM with Enriched seeds in CUP9.info, with"
        print " Gamma expectation of 50% of all probes"
        print
        print " %s t.fsa -prior MCM1_5.tamo:0 "%(sys.argv[0].split('/')[-1])
        print " will start an EM with 0th motif of the file MCM1_5.tamo"
        print " as a seed"
        print
        sys.exit(1)
    fastafile = sys.argv[1]
    #Echo the command line
    print "#" + ' '.join(map(lambda x: re.sub(' ','\ ',x), sys.argv))
    # a purely numeric second argument is taken as the motif width
    if sys.argv[2].isdigit(): width = sys.argv[2]
    else: width = None
    # option defaults; '' means "not given" for the float-valued options
    algorithm = ''
    beta = ''
    seedbeta = ''
    cbeta = ''
    deltamin = ''
    gamma = 0.2
    infofile = ''
    seedmodels= []
    species = 'YEAST'
    valid_tfs = []
    gapped_syl= None
    gapflank = 0
    gapweight = 0.2
    enrichfact= 0.7
    pmax = 0 #False
    TFSEEDS = 0
    TFMids = []
    pad = None
    padlen = 0
    thetas = []
    seed_count = 0 #Default: Take the top 0
    seed_s = [] #Initialize seq array
    sp_seed = 0
    '''Parse command-line arguments'''
    # every argv token is inspected; option values are read from the
    # following position(s)
    for tok,i in zip(sys.argv,xrange(len(sys.argv))):
        if tok == '-top' : seed_count = int(sys.argv[i+1])
        elif tok == '-greedy': algorithm = "GREEDY"
        elif tok == '-prior' : seed_s.append(sys.argv[i+1])
        # NOTE(review): 'sp' has no leading '-', unlike every other option
        elif tok == 'sp' : sp_seed = 1
        elif tok == '-beta' : beta = float(sys.argv[i+1])
        # NOTE(review): duplicate '-beta' test makes this branch unreachable;
        # presumably meant '-seedbeta' (documented in the usage text) -- confirm
        elif tok == '-beta' : seedbeta = float(sys.argv[i+1])
        elif tok == '-cbeta' : cbeta = float(sys.argv[i+1])
        elif tok == '-thetas':
            # '-thetas K v1 ... vK': a count followed by that many floats
            for j in range(int(sys.argv[i+1])):
                thetas.append(float(sys.argv[i+j+2]))
        elif tok == '-gamma' : gamma = float(sys.argv[i+1])
        elif tok == '-delta' : deltamin = float(sys.argv[i+1])
        elif tok == '-info' : infofile = sys.argv[i+1]
        elif tok == '-valid' : valid_tfs.append(sys.argv[i+1])
        elif tok == '-w' : width = sys.argv[i+1]
        elif tok == '-width' : width = sys.argv[i+1]
        elif tok == '-gap' : gapped_syl = sys.argv[i+1]
        elif tok == '-gapflank' :gapflank = int(sys.argv[i+1])
        elif tok == '-gapweight':gapweight = float(sys.argv[i+1])
        elif tok == '-enrichfact':enrichfact= float(sys.argv[i+1])
        elif tok == '-pmax' : pmax = 1
        elif tok == '-Y2K' : species = "YEAST_2000_UP"
        elif tok == '-Y5C' : species = "YEAST_500_UP"
        elif tok == '-B' : species = "BAC_ORF"
        elif tok == '-Ch22' : species = "Ch22"
        elif tok == '-genome': species = sys.argv[i+1]
        elif tok == '-pad' :
            # NOTE(review): reads two following arguments although the usage
            # text shows -pad without any; IndexError if they are absent
            pad = sys.argv[i+1]
            padlen = sys.argv[i+2]
        elif tok == '-TF' :
            TFSEEDS = 1
            # collect the run of following arguments that start with 'M0'
            for j in range(i+1,len(sys.argv)):
                if re.match('M0',sys.argv[j]):
                    TFMids.append(sys.argv[j])
                else:
                    break
        elif tok == '-human' :
            # an optional numeric argument is appended as a '_<n>' suffix
            _s = ''
            if sys.argv[i+1].isdigit(): _s = '_'+sys.argv[i+1]
            else: _s = ''
            species = 'HUMAN'+_s
    seqs = []
    fsaD = Fasta.load(fastafile)
    probes = fsaD.keys()
    ''' for probeid in fsaD.keys(): seqs.append (fsaD [probeid]) '''
    numprobes = len(probes)
    #print "numprobes: %i"%numprobes
    if not ('-random_background' in sys.argv or '-nomarkov' in sys.argv):
        EM.loadMarkovBackground(fastafile,numprobes,species)
    #seqs = EM.fasta2seqs(fastafile)
    # NOTE(review): with the line above commented out, seqs is still the
    # empty list here, so all_seqs is empty and '-top' adds no seeds
    all_seqs = seqs
    seed_s.extend(seqs[0:min(seed_count,len(seqs))])
    #not necessary --- seed_c.extend(c_seqs[0:min(seed_count,len(seqs))])
    if infofile and width=='info': width = info2width(infofile)
    elif width != None: width = int(width)
    #Alternate source of seeds
    if infofile:
        # `1 or width` is always true, so the else branch never runs
        if 1 or width:
            seedmodels.extend(info2seeds(width,infofile,fastafile,species))
        else:
            print 'Error: need to specify motif width w/ .info file'
    #Any -prior pointers to motifs in other files?
    (seed_s, motifs) = parse_priors(seed_s)
    seedmodels.extend(motifs)
    #Should we get seeds from TRANSFAC?
    if TFSEEDS:
        tf = []
        D = tfmats()
        if not TFMids:
            keys = D.keys()
        else:
            # first key whose leading 6 characters equal each requested id
            keys = []
            for TFMid in TFMids:
                for key in D.keys():
                    if key[0:6] == TFMid:
                        keys.append(key)
                        break
        for key in keys:
            m = D[key]
            m.seednum = int(re.sub('M0*','',key.split()[0]))
            m.seedtxt = '%-24s %s'%(m,key)
            tf.append(m)
        tf.sort(lambda x,y: cmp(x.seednum,y.seednum))
        seedmodels.extend(tf)
        #seedmodels.append(tf[33])
    if gapped_syl:
        gapped_priors = gapped_motifs(gapped_syl)
        gapped_priors = map(lambda x:'N'+x+'N', gapped_priors)
        seed_s.extend(gapped_priors)
    if pad:
        print '# Padding models with NN-m-NN'
        newmodels = []
        for m in seedmodels:
            # NOTE(review): this indexes with the tuple (-2, m.width+2);
            # a slice m[-2:m.width+2] may have been intended -- confirm
            newmodels.append(m[-2,m.width+2])
        seedmodels = newmodels
    ''' Set everything up and GO!! '''
    global theEM
    theEM = EM.EM(seed_s,[],[],width,"VERBOSE")
    # only override EM attributes for options that were actually given
    if beta: theEM.beta = beta
    if cbeta: theEM.cbeta = cbeta
    if deltamin: theEM.deltamin = deltamin
    if seedbeta: theEM.seedbeta = seedbeta
    if thetas: theEM.thetas = thetas
    theEM.param['gamma'] = gamma
    theEM.probeids.extend(probes)
    theEM.seqs.extend(all_seqs)
    #theEM.cons_seqs.extend(c_seqs)
    theEM.models = seedmodels
    theEM.gapflank = gapflank
    theEM.gapweight = gapweight
    theEM.report()
    theEM.EM_Cstart() #GO!!
    #print "#Sorting candidates"
    #sys.stdout.flush()
    #EM.candidates.sort(lambda x,y: cmp(y.MAP,x.MAP))
    #sys.exit(0)
    ''' Compute some metrics '''
    print "#Loading Genome %s"%species ; sys.stdout.flush()
    if species == 'human': Genome = ProbeSet('HUMAN',enrichfact)
    else: Genome = ProbeSet(species,enrichfact)
    ids = Genome.ids_from_file(fastafile)
    #fsaDict = Fasta.load(fastafile)
    #probes = fsaDict.keys()
    #cons_pickle = fastafile.split('.')[0] + '.cpickle'
    for C in theEM.candidates:
        #p_cons = conservation_pvalue(C.pssm,probes,fsaDict,ConsDict,4)
        #print p_cons
        if not pmax:
            # copy the candidate's thetas into the genome's w_dict, pairing
            # them with w_dict's keys in iteration order
            w_dict = Genome.w_dict
            for key,i in zip(w_dict.keys(),range(len(C.pssm.thetas))):
                w_dict[key] = C.pssm.thetas[i]
            Genome.w_dict = w_dict
            C.pssm.pvalue = Genome.p_value(C.pssm,ids,'verbose')
            #print "P-VAL: %f"%(Genome.p_value(C.pssm,ids,'verbose')*p_cons)
            C.pssm.church = Genome.church(C.pssm,ids)
        else:
            (p,frac) = Genome.best_p_value(C.pssm,ids)
            C.pssm.pvalue = p
            C.pssm.threshold = frac * C.pssm.maxscore
            print "Bests:",p,frac
        # each iteration overwrites C.pssm.valid, so only the result for the
        # last -valid name is kept
        for valid_tf in valid_tfs:
            C.pssm.valid = Validate.validate(C.pssm,valid_tf,'Verbose',"Want Tuple")
    ''' Print out all motifs (sorted by Enrichment) in an AlignACE-like form '''
    theEM.candidates.sort(lambda x,y: cmp(x.pssm.pvalue,y.pssm.pvalue))
    for C,i in zip(theEM.candidates,range(len(theEM.candidates))):
        C.pssm.maxscore = -100 #May have side effects. Recompute when done
        if C.pssm.valid:
            _t = C.pssm.valid
            if not _t[0]:
                vstring = "(--- %8.4f %8.4f %s)"%(_t[1],_t[2],_t[3])
            else:
                vstring = "(HIT %8.4f %8.4f %s)"%(_t[1],_t[2],_t[3])
        else:
            vstring = ''
        C.pssm._maxscore() #Recomputed
        print "Log-odds matrix for Motif %3d %s"%(i,C)
        C.pssm._print_ll()
        print "Sequence Logo"
        C.pssm._print_bits()
        # NOTE(review): bare flush() is not defined in the visible code;
        # presumably a module-level helper -- confirm
        flush()
        #print '# %3d matching sequences at 90%%'%len(C.pssm.bestseqs(C.pssm.maxscore * 0.9))
        flush()
        m = C.pssm
        if not m.__dict__.has_key('gamma'): m.gamma = None #Kludge to deal w/ old shelves
        if m.seedtxt: print "Seed: %3d %s"%(i,m.seedtxt)
        if m.source: print "Source: ",m.source
        if m.gamma: print "Gamma: %7.5f"%m.gamma
        if m.threshold: print "Threshold: %5.2f"%m.threshold
        if m.thetas != []:
            tstr = "thetas:"
            for theta in m.thetas:
                tstr = tstr + " " + str(theta)
            print tstr
        #if C.pssm.seedtxt:
        #    print 'Seed %3d %-25s'%(i,C.pssm.seedtxt)
        if C.pssm.church != None:
            # prefix the validation string with |log10(church)|
            vstring = 'ch: %5.2f %s'%(
                math.fabs(math.log(C.pssm.church)/math.log(10)), vstring)
        print "Motif %3d %-25s nlog(p): %6.3f %s"%(i,C,-math.log(C.pssm.pvalue)/math.log(10),vstring)
        if C.pssm.threshold:
            print "Threshold: %6.3f %4.1f%%"%(
                C.pssm.threshold, 100.0*C.pssm.threshold/C.pssm.maxscore)
        C.pssm.maxscore = -1e100 #May have side effects. Recompute when done
        for seq in C.wmers:
            print seq,i,C.pssm.scan(seq)[2][0]
        C.pssm._maxscore() #Recomputed
        # NOTE(review): `seq` is the last item of C.wmers; NameError if
        # C.wmers is empty for the first candidate
        print '*'*len(seq)
        print "MAP Score: %f"%C.MAP
        sys.stdout.flush()
    sys.stdout.flush()
    sys.exit(0) #Avoid ridiculous python cleanup times
from TAMO.seq import Fasta

# Re-key an Aedes aegypti peptide fasta: each header becomes
# '|<first 13 chars>_Ens|<rest of header>' and the records are written out
# sorted by the new header, one sequence per line.
fastaFile = '/Users/biggus/Documents/James/AedesPeptides/Aedes_aegypti.AaegL1.50.pep.all.fa'
outFile = '/Users/biggus/Documents/James/AedesPeptides/Aedes_aegypti.AaegL1.50.pep.all.reformatted.fa'

# key_func=lambda x: x keeps the full header line as the record name
fDict = Fasta.load(fastaFile, lambda x: x)

newDict = {}
for n, s in fDict.items():
    newDict['|%s_Ens|%s' % (n[:13], n[13:])] = s

newDict_keys = sorted(newDict.keys())

# The handle is closed explicitly so the output is flushed to disk before
# 'Done.' is reported (it was previously left open).
outFile = open(outFile, 'w')
try:
    for key in newDict_keys:
        outFile.write('>%s\n%s\n' % (key, newDict[key]))
finally:
    outFile.close()

print('Done.')
def motif_matrix(fsa, motif, outfile, genome='mm9'):
    """Write a motifs-by-sequences matrix of normalised best motif scores.

    fsa     -- FASTA file of sequences (full header lines are used as keys)
    motif   -- motif file read with MotifTools.load
    outfile -- destination for the matrix (np.savetxt, format '%1.3f')
    genome  -- 'hg18' selects the human Markov background file; any other
               value selects the mouse one

    For each motif the log-probability matrix is corrected by the
    zeroth-order background (log base 2); each upper-cased sequence's best
    score is then rescaled to (best - minscore) / (maxscore - minscore).
    """
    markov_by_genome = {
        'hg18': "/nfs/genomes/human_gp_mar_06/hg18_promoters_3000_1000.markov",
    }
    markov = markov_by_genome.get(
        genome, "/nfs/data/cwng/chipseq/hypotheses/Mouse.markov")

    # Load motifs, background and sequences
    motif_list = MotifTools.load(motif)
    EM.loadMarkovBackground(markov)
    bg = EM.theMarkovBackground.zeroth()
    fasta = Fasta.load(fsa, key_func=lambda x: x)
    seqs = fasta.values()
    n_motifs = len(motif_list)
    n_seqs = len(seqs)
    SCORES = np.zeros((n_motifs, n_seqs), dtype='float')

    for i, current in enumerate(motif_list):
        ll = current.logP
        EM.loadMarkovBackground(markov)
        bg = EM.theMarkovBackground.zeroth()
        # background-correct every entry (the logP columns are modified
        # in place)
        for pos in ll:
            for base in pos.keys():
                correction = math.log(bg[base]) / math.log(2.0)
                pos[base] = pos[base] - correction
        scorer = MotifTools.Motif_from_ll(ll)
        floor, ceiling = scorer.minscore, scorer.maxscore

        for j, raw_seq in enumerate(seqs):
            top = scorer.bestscore(raw_seq.upper())
            SCORES[i, j] = (top - floor) / (ceiling - floor)

    np.savetxt(outfile, SCORES, fmt='%1.3f')
def main(): if len(sys.argv) < 2: print "Usage: %s <fasta_file> [width = None ] [options]"%(re.sub('^.*/','',sys.argv[0])) print "Options include:" print "" print " EM Parameters:" print " -beta [0.01] Beta for pseudocounts" print " -seedbeta[0.02] Beta for pseudocounts for seeds from text" print " -gamma [0.2] Gamma (fraction of sequences)" print " -delta [0.001] Convergence criteria" print " " print " Seeds (not actually proper priors)" print " -prior Seqences or motifs for seeds (may be repeated)" print " -top N [0] Include w-mers in top N probes" print " -gap string sample gapped motifs" # print " -TF Seed with (all) TRANSFAC PSSMs (buggy)" print " -kmerseeds Use kmers with best enrichment score as seeds for EM" print " -pad add NN..NN to seed" print " " print " Genome / Background model " print " -human (250,1000) Use Human Background model" print " -g genome.fsa Use specicied Fasta file as background (searches first for matching frequency file)" # print " -Y2K, -Y5C Use Yeast Upstream Intergenic regions (2000, 500)" # print " -B Use Bacterial Orfs" print " " print "Examples:" print " %s t.fsa 5 -prior GGGTA -prior AAAAAC "%(sys.argv[0].split('/')[-1]) print " will start an EM with 3 seeds: GGGTA, AAAAA, and AAAAC" print print " %s t.fsa 5 -info CUP9.info -gamma 0.5 "%(sys.argv[0].split('/')[-1]) print " will start an EM with Enriched seeds in CUP9.info, with" print " Gamma expectation of 50% of all probes" print print " %s t.fsa -prior MCM1_5.tamo:0 "%(sys.argv[0].split('/')[-1]) print " will start an EM with 0th motif of the file MCM1_5.tamo" print " as a seed" print sys.exit(1) fastafile = sys.argv[1] #Echo the command line print "#" + ' '.join(map(lambda x: re.sub(' ','\ ',x), sys.argv)) if sys.argv[2].isdigit(): width = sys.argv[2] else: width = None algorithm = '' beta = '' seedbeta = '' deltamin = '' gamma = 0.2 infofile = '' seedmodels= [] species = 'YEAST' valid_tfs = [] #NOT USED gapped_syl= None gapflank = 0 gapweight = 0.2 enrichfact= 0.7 pmax = 0 
#False TFSEEDS = 0 TFMids = [] pad = None bgfile = None seed_count = 0 #Default: Take the top 0 seed_s = [] #Initialize seq array '''Parse command-line arguments''' for tok,i in zip(sys.argv,xrange(len(sys.argv))): if tok == '-top' : seed_count = int(sys.argv[i+1]) elif tok == '-greedy': algorithm = "GREEDY" elif tok == '-prior' : seed_s.append(sys.argv[i+1]) elif tok == '-beta' : beta = float(sys.argv[i+1]) elif tok == '-seedbeta': seedbeta = float(sys.argv[i+1]) elif tok == '-gamma' : gamma = float(sys.argv[i+1]) elif tok == '-delta' : deltamin = float(sys.argv[i+1]) elif tok == '-kmerseeds' : infofile = 1 elif tok == '-valid' : valid_tfs.append(sys.argv[i+1]) #NOT USED elif tok == '-w' : width = sys.argv[i+1] elif tok == '-width' : width = sys.argv[i+1] elif tok == '-gap' : gapped_syl = sys.argv[i+1] elif tok == '-gapflank' :gapflank = int(sys.argv[i+1]) elif tok == '-gapweight':gapweight = float(sys.argv[i+1]) elif tok == '-enrichfact':enrichfact= float(sys.argv[i+1]) elif tok == '-pmax' : pmax = 1 elif tok == '-Y2K' : species = "YEAST_2000_UP" elif tok == '-Y5C' : species = "YEAST_500_UP" elif tok == '-B' : species = "BAC_ORF" elif tok == '-Ch22' : species = "Ch22" elif tok == '-genome': species = sys.argv[i+1] elif tok == '-pad' : pad = "TRUE" elif tok == '-bgfile': bgfile = sys.argv[i+1] elif tok == '-TF' : #NOT USED (TRANSFAC NOT SUPPLIED WITH DISTRIBUTION) TFSEEDS = 1 for j in range(i+1,len(sys.argv)): if re.match('M0',sys.argv[j]): TFMids.append(sys.argv[j]) else: break elif tok == '-human' : _s = '' if sys.argv[i+1].isdigit(): _s = '_'+sys.argv[i+1] else: _s = '' species = 'HUMAN'+_s if infofile: infofile = fastafile if bgfile: EM.loadMarkovBackground(bgfile) elif not ('-random_background' in sys.argv or '-nomarkov' in sys.argv): EM.loadMarkovBackground(species) else: EM.theMarkovBackground = EM.Zeroth() fsaD = Fasta.load(fastafile) Fasta.delN(fsaD) seqs = fsaD.values() probes = fsaD.keys() all_seqs = seqs seed_s.extend(seqs[0:min(seed_count,len(seqs))]) 
if infofile and width=='info': width = info2width(infofile) elif width != None: width = int(width) #Alternate source of seeds if infofile: if 1 or width: seedmodels.extend(info2seeds(width,infofile,fastafile,species)) else: print 'Error: need to specify motif width w/ .info file' #Any -prior pointers to motifs in other files? (seed_s, motifs) = parse_priors(seed_s) seedmodels.extend(motifs) #Should we get seeds from TRANSFAC? if TFSEEDS: #NOT USED tf = [] D = tfmats() if not TFMids: keys = D.keys() else: keys = [] for TFMid in TFMids: for key in D.keys(): if key[0:6] == TFMid: keys.append(key) break for key in keys: m = D[key] m.seednum = int(re.sub('M0*','',key.split()[0])) m.seedtxt = '%-24s %s'%(m,key) tf.append(m) tf.sort(lambda x,y: cmp(x.seednum,y.seednum)) seedmodels.extend(tf) #seedmodels.append(tf[33]) if gapped_syl: gapped_priors = gapped_motifs(gapped_syl) gapped_priors = map(lambda x:'N'+x+'N', gapped_priors) seed_s.extend(gapped_priors) if pad: print '# Padding models with NN-m-NN' newmodels = [] left = MotifTools.Motif_from_text('@') right = MotifTools.Motif_from_text('N') for m in seedmodels: newmodels.append(left + m + right) print left + m + right seedmodels = newmodels ''' Set everything up and GO!! ''' global theEM theEM = EM.EM(seed_s,[],width,"VERBOSE") if beta: theEM.beta = beta if deltamin: theEM.deltamin = deltamin if seedbeta: theEM.seedbeta = seedbeta theEM.param['gamma'] = gamma theEM.seqs.extend(all_seqs) theEM.models = seedmodels theEM.gapflank = gapflank theEM.gapweight = gapweight theEM.report() theEM.EM_Cstart() #GO!! 
#print "#Sorting candidates" #sys.stdout.flush() #EM.candidates.sort(lambda x,y: cmp(y.MAP,x.MAP)) ''' Compute some metrics ''' print "#Loading Genome %s"%species ; sys.stdout.flush() Genome = ProbeSet(species,enrichfact) ids = Genome.ids_from_file(fastafile) for C in theEM.candidates: if not pmax: C.pssm.pvalue = Genome.p_value(C.pssm,ids,'verbose') C.pssm.church = Genome.church(C.pssm,ids) C.pssm.frac = Genome.frac(C.pssm,probes,None,0.7) else: (p,frac) = Genome.best_p_value(C.pssm,ids) C.pssm.pvalue = p C.pssm.threshold = frac * C.pssm.maxscore print "Bests:",p,frac matching = Genome.matching_ids(C.pssm,[],factor=0.7) matchbound = [x for x in matching if x in probes] C.pssm.numbound = len(probes) C.pssm.nummotif = len(matching) C.pssm.numboundmotif = len(matchbound) sys.stdout.flush() ''' Print out all motifs (sorted by Enrichment) in an AlignACE-like form ''' theEM.candidates.sort(lambda x,y: cmp(x.pssm.pvalue,y.pssm.pvalue)) for C,i in zip(theEM.candidates,range(len(theEM.candidates))): C.pssm.maxscore = -100 #May have side effects. Recompute when done if C.pssm.valid: #NOT USED _t = C.pssm.valid if not _t[0]: vstring = "(--- %8.4f %8.4f %s)"%(_t[1],_t[2],_t[3]) else: vstring = "(HIT %8.4f %8.4f %s)"%(_t[1],_t[2],_t[3]) else: vstring = '' C.pssm._maxscore() #Recomputed MotifTools.print_motif(C.pssm,20,i) sys.stdout.flush() continue #Antiquated stuff -- Remove !! 
print "Log-odds matrix for Motif %3d %s"%(i,C) C.pssm._print_ll() print "Sequence Logo" C.pssm._print_bits() flush() #print '# %3d matching sequences at 90%%'%len(C.pssm.bestseqs(C.pssm.maxscore * 0.9)) flush() m = C.pssm if not m.__dict__.has_key('gamma'): m.gamma = None #Kludge to deal w/ old shelves if m.seedtxt: print "Seed: %3d %s"%(i,m.seedtxt) if m.source: print "Source: ",m.source if m.gamma: print "Gamma: %7.5f"%m.gamma if m.threshold: print "Threshold: %5.2f"%m.threshold #if C.pssm.seedtxt: # print 'Seed %3d %-25s'%(i,C.pssm.seedtxt) if C.pssm.church != None: vstring = 'ch: %5.2f %s'%( math.fabs(math.log(C.pssm.church)/math.log(10)), vstring) print "Motif %3d %-25s nlog(p): %6.3f %s"%(i,C,-math.log(C.pssm.pvalue)/math.log(10),vstring) if C.pssm.threshold: print "Threshold: %6.3f %4.1f%%"%( C.pssm.threshold, 100.0*C.pssm.threshold/C.pssm.maxscore) C.pssm.maxscore = -1e100 #May have side effects. Recompute when done for seq in C.wmers: print seq,i,C.pssm.scan(seq)[2][0] C.pssm._maxscore() #Recomputed print '*'*len(seq) print "MAP Score: %f"%C.MAP sys.stdout.flush() sys.stdout.flush() sys.exit(0) #Avoid ridiculous python cleanup times
def memefiles2tamo(files, tamoname):
    """Collect motifs from AlignACE/MEME output files, score them, and save them.

    Args:
        files: output file names; names ending in ``.ace`` are parsed with
            ``AlignAce.AlignAce`` and names containing ``.meme`` with
            ``Meme.Meme``.
        tamoname: destination passed to ``MotifTools.save_motifs``.

    Reads the module globals ``probefile`` and ``PROBESET``; ``PROBESET`` is
    (re)created as a 'YEAST' ProbeSet when ``probefile`` is false.

    NOTE(review): the block nesting was reconstructed from a
    whitespace-collapsed source.  Here only the parsing happens per file and
    the scoring uses the *last* file's FASTA -- confirm against the original
    layout.
    """
    global probefile, PROBESET
    motifs = []
    for filename in files:
        print ">>>SDFSD>F ", filename
        if re.search('\.ace$', filename):
            mdobject = AlignAce.AlignAce(filename)
            # Fall back to the sibling .fsa when the parser recorded no FASTA.
            if not mdobject.fastafile:
                mdobject.fastafile = filename.replace('.ace', '.fsa')
        elif re.search('\.meme.*$', filename):
            mdobject = Meme.Meme(filename)
            # e.g. 'x.3.meme' -> 'x.meme' -> 'x.fsa'
            if not mdobject.fastafile:
                mdobject.fastafile = re.sub('\..\.meme', '.meme', filename).replace('.meme', '.fsa')
        # NOTE(review): a name matching neither pattern leaves mdobject
        # unbound (first file) or stale (later files).
        motifs.extend(mdobject.motifs)

    #fsaname = find_fsa(mdobject.fastafile)
    print mdobject.fastafile
    fsaname = Fasta.find(mdobject.fastafile)
    fsaD = Fasta.load(fsaname)
    probes = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')
        #PROBESET= pick_genome(fsaname)
    # Make sure every input sequence is present in the probe set.
    for key, seq in fsaD.items():
        PROBESET.probes[key] = seq

    # Fill in only the metrics that still hold their "unset" value
    # (1 for pvalue/church, None for the others).
    for motif in motifs:
        if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif, probes, 'v')
        if motif.church == 1: motif.church = PROBESET.church(motif, probes, 'v')
        if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif, probes, 3, 'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        #if motif.E_seq == None: motif.E_seq = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc == None: motif.ROC_auc = PROBESET.ROC_AUC(motif, probes, 'v')
        if motif.MNCP == None: motif.MNCP = PROBESET.MNCP(motif, probes, 'v')
        if motif.frac == None: motif.frac = PROBESET.frac(motif, probes, 'v', 0.7)
        # For MEME results the MAP score is set to -log10(E-value).
        if re.search('\.meme$', filename):
            motif.MAP = -math.log(motif.evalue) / math.log(10)
        if 1 and (motif.CRA == None):
            # Best effort: any failure in the conservation ROC is ignored
            # and CRA/Cfrac are simply left unset.
            try:
                pass
                CRA, Cfrac = PROBESET.cons_ROC_AUC(motif, probes, 'v', tuple='YES')
                motif.CRA = CRA
                motif.Cfrac = Cfrac
            except:
                pass

    # NOTE(review): this sorts the last parser object's own list, not the
    # `motifs` list that is saved below.
    if re.search('\.meme$', filename):
        mdobject.motifs.sort(lambda x, y: cmp(x.pvalue, y.pvalue))
    else:
        mdobject.motifs.sort(lambda x, y: cmp(x.church, y.church))

    MotifTools.save_motifs(motifs, tamoname)
for header, seq in fg.items() : num_peak_bases += len(seq) if __name__ == '__main__' : opts, args = parser.parse_args(sys.argv[1:]) if len(args) < 3 : parser.error('Must provide three non-option arguments') sample_type, organism, fg_fn = args[:3] settings_dict = get_org_settings(organism) fg = Fasta.load(fg_fn) bg = rejection_sampling(fg,settings_dict) ############################################################### # start Chris' code from rej_samp_bg_rand2.py the_genes={} #list of distances to nearest TSS # for each peak find the chromosome, distance to nearest # gene, size of peaks in bases, and GC content the_chrs,dists,sizes,gcs=[],[],[],[] # number of bases in the fg sequences size=0 for key in pos_seqs.keys():
from TAMO.seq import Fasta


def groupMiRsBym2m8(miRNAs):
    """Group miRNA names by the seed formed by bases 2-8 of each sequence.

    miRNAs   = dict(k='miRname', v='miRseq')
    Returns seedDict = dict(k='m2m8Seq', v=[miRnames])

    As a side effect, one comma-separated line of names is printed per seed.
    """
    seedDict = {}
    for name in miRNAs:
        # Slice [1:8] is positions 2 through 8 in 1-based numbering.
        seed = miRNAs[name][1:8]
        seedDict.setdefault(seed, []).append(name)

    for members in seedDict.values():
        print(', '.join(members))

    return seedDict


if __name__ == '__main__':
    maturePath = '/Users/biggus/Documents/James/Data/Tu_miRNA/miRNAs/miRBase/mature.aga.fa'
    miRNAs = Fasta.load(maturePath)
    sD = groupMiRsBym2m8(miRNAs)
from TAMO.seq import Fasta

fastaFile = "/Users/biggus/Documents/James/AedesPeptides/aaegypti.PEPTIDES-AaegL1.1.reformatted.fa"

# Every name produced by the key function, duplicates included.  The
# dictionary returned by Fasta.load cannot be used for this: its keys are
# unique by construction, so scanning them (as the original did) always
# reported 0 repeated names.
seenNames = []


def _recordName(header):
    """Return the second '|'-delimited header field, remembering each one seen."""
    name = header.split("|")[1]
    seenNames.append(name)
    return name


# NOTE(review): assumes Fasta.load calls the key function once per record --
# confirm against TAMO.seq.Fasta.
fDict = Fasta.load(fastaFile, _recordName)

nameCounts = {}
nrNames = []    # distinct names, in first-seen order
for each in seenNames:
    if each in nameCounts:
        nameCounts[each] += 1
    else:
        nameCounts[each] = 1
        nrNames.append(each)

# Names that occurred more than once.
rNames = [each for each in nrNames if nameCounts[each] > 1]

print("%s names were repeated at least once." % (len(rNames)))
def info2seeds(N,infofile,probefile,species='YEAST'):
    """Build up to 20 seed motifs from the most enriched n-mers of a seed file.

    Args:
        N: n-mer width; a false value means the collected sequences are used
            as n-mers directly.
        infofile: seed source; handled when its name ends in 'info' or 'fsa'.
        probefile: file whose ids (``ProbeSet.ids_from_file``) form the
            foreground for the enrichment p-value.
        species: ProbeSet name; 'human' is normalised to 'HUMAN'.

    Returns:
        List of ``ConvergeMotifTools.Motif`` objects built from the (at most)
        20 n-mers with the smallest p-values.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source; in particular confirm (a) that the conservation table is rebuilt
    only when the pickle cannot be read and (b) that the 200-n-mer cap
    belongs to the ``else`` branch.
    """
    if species == 'human':
        species = 'HUMAN'
    G = ProbeSet(species)
    IDs = G.ids_from_file(probefile)
    Q = EM.theMarkovBackground.zeroth()
    seqs = []
    # NOTE(review): the '.' in both patterns is an unescaped regex wildcard.
    if re.search('.info$',infofile):
        #I = infoana.Infofile(infofile,'DONT REMOVE QUERY')
        I = infoana.Infofile(infofile)
        print "# Loading infofile: %s"%infofile
        print I
        # Flank every binding-site sequence with four N's on each side.
        seqs = map(lambda x: 'NNNN%sNNNN'%x, I.bsites2seqs(50.0))
    elif re.search('.fsa$',infofile):
        fsaDict = Fasta.load(infofile)
        probes = fsaDict.keys()
        #sequence_repository = KenzieSequences()
        # Cache file: text before the first '.' of infofile + '.cpickle'.
        cons_pickle = infofile.split('.')[0] + '.cpickle'
        try:
            # NOTE(review): unpickling executes arbitrary code if the cache
            # file is not trusted.
            CFH = open(cons_pickle, 'r')
            ConsDict = pickle.load(CFH)
            CFH.close()
        except:
            # Cache missing or unreadable: rebuild the per-probe mismatch
            # table (1 = differs from the reference sequence, 0 = same)
            # for up to three aligned sequences, then write the cache.
            ConsDict = {}
            for probe in probes:
                seqs = []
                cons = []
                try:
                    seq_list = G.alignments[probe]
                except:
                    continue
                # First alignment entry is the reference sequence.
                if (seq_list!=[]):
                    cer_seq = seq_list[0][1]
                else:
                    cer_seq = ''
                cer_seq = cer_seq.upper()
                numg = len(seq_list) - 1
                # Entries 1..3 are the compared sequences; missing ones
                # become '' and are skipped below.
                for i in range(1,4):
                    try:
                        seqs.append(seq_list[i][1].upper())
                    except:
                        seqs.append('')
                    cons.append([])
                for position in range(len(cer_seq)):
                    ref = cer_seq[position]
                    for i in range(3):
                        if (seqs[i]==''):
                            continue
                        if (seqs[i][position]!=ref):
                            cons[i].append(1)
                        else:
                            cons[i].append(0)
                ConsDict[probe] = cons
            CFH = open(cons_pickle, 'w')
            pickle.dump(ConsDict, CFH)
            CFH.close()
        # Collect every aligned sequence of every probe with the gap
        # characters '-' and '.' removed.
        # NOTE(review): when the cache was just rebuilt, `seqs` still holds
        # the last probe's three upper-cased sequences at this point.
        for probe in probes:
            superseq = ''
            try:
                seq_list = G.alignments[probe]
            except:
                continue
            for seq in seq_list:
                subseq = seq[1].replace('-','')
                subseq = subseq.replace('.','')
                seqs.append(subseq)
    if not N:
        nmers = seqs
    else:
        if (N<11):
            nmers= ConvergeMotifTools.top_nmers(N,seqs)
        else:
            # Wide motifs become gapped seeds: first third + run of N's +
            # second third of each top n-mer (Python 2 integer division).
            gaplen = N - 2*(N/3)
            gr = ''
            for i in range(gaplen):
                gr = gr + 'N'
            nmers = ConvergeMotifTools.top_nmers(N,seqs,0,'',1)
            gnmers = []
            for nmer in nmers:
                gnmers.append(nmer[0:(N/3)]+gr+nmer[(N/3):2*(N/3)])
            nmers = gnmers
        # Lists longer than 201 are cut to the first 200 entries.
        if len(nmers) > 201:
            nmers = nmers[0:200]
    print "Scoring enrichment of %d nmers from .info file"%len(nmers)
    nmers_scoresT = []
    for nmer in nmers:
        # NOTE(review): N/3 raises TypeError when N is None, i.e. whenever
        # the "not N" branch above supplied a non-empty list.
        if nmer[0:(N/3)].isalpha():
            p = G.p_value(nmer,IDs,'verbose')
            #if (species=='Ciona'): ng = 2
            #else: ng = 4
            #p_cons = conservation_pvalue(nmer,IDs,fsaDict,ConsDict,ng)
            #if (p_cons<0.1):
            nmers_scoresT.append((nmer,p))
    # Ascending p-value: most enriched first.
    nmers_scoresT.sort(lambda x,y: cmp(x[1],y[1]))
    #for tup in nmers_scoresT:
    #    print tup
    last = min(20,len(nmers_scoresT))
    models = []
    for i in range(last):
        seq = nmers_scoresT[i][0]
        m = ConvergeMotifTools.Motif('',Q)
        m.compute_from_text(seq,0.1)
        models.append(m)
    return(models)
from gusPyCode.MDAP_proj.MDAP_defs import shuffleSeqDict
from TAMO.seq import Fasta
from gusPyCode.defs.bioDefs import softMaskDict2HardMask
from time import time
from gusPyCode.defs.mosqData import promoterSeqPaths

# User Variables:
inFile = promoterSeqPaths.Aa_2000bpUp_softMasked
outFile = '/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Aedes/aedes2KBupStreamTSS.UnMasked.geneStrand.shuffledSeqs.1.fas'
hardMask = None

d = Fasta.load(inFile)
#d = {1:'AACTGCANACTGACNNNACTGATGNNN'}

# With hardMask left false, every sequence is upper-cased before shuffling.
if not hardMask:
    for seqName in d:
        d[seqName] = d[seqName].upper()

# Time only the shuffle itself, not the load or the write.
t1 = time()
sD = shuffleSeqDict(d)
t2 = time()

Fasta.write(sD, outFile)

elapsedMin = (float(t2) - t1) / 60
print('Shuffling took %.2f min.' % elapsedMin)
tOut = "/Users/biggus/Documents/James/Data/ReClustering/Python_CRM/tamoTimeIt.6memeMotifs.35seqs.30runs.txt" mOut = "/Users/biggus/Documents/James/Data/ReClustering/Python_CRM/motilityTimeIt.6memeMotifs.35seqs.30runs.txt" genes = 35 runs = 30 tmoFiles = [ "/Users/biggus/Documents/James/Data/ReClustering/PrelimData_Grant_Feb09/RandSplitFastas/MemeResults/Clus2_247gene_0.8_Apr16_14-46-36.meme.txt.tmo", "/Users/biggus/Documents/James/Data/ReClustering/PrelimData_Grant_Feb09/RandSplitFastas/MemeResults/Clus2_247gene_0.8_Apr16_14-46-33.meme.txt.tmo", ] fastaPath = ( "/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Anopheles/2KBupTSS_goodAffyAGAPsFastasOUT.masked.nr.fas" ) seqs = Fasta.load(fastaPath) targetGenes = "/Users/biggus/Documents/James/Data/ReClustering/kmedsPear33Clus50x_2/Clus2_247genes.genes.txt" targetGenes = map(lambda l: l.strip(), open(targetGenes, "rU")) targetGenes = targetGenes[:genes] for i in range(len(targetGenes)): targetGenes[i] = seqs[targetGenes[i]] motifs = [] tMotifs = [] mMotifs = [] for t in tmoFiles: Ms = loadTMOs(t) motifs.extend(Ms) for i in range(len(motifs)):
def main(): try: opts, args = getopt.getopt(sys.argv[1:], "f:m:n:L:t:a:S:i:", ["help", "output="]) # AD added 'i' except getopt.GetoptError: usage() sys.exit(1) if not opts: usage() sys.exit(1) print "#" + ' '.join(sys.argv) fastafile, motiffile, motifnums, labels, thresh = (None, None, [], None, 0.75) # AD changed thresh val to 0.75 from 0.7 ambigs = [] scale = 50.0 / 1000.0 motifs = [] for opt, value in opts: #print opt, value if opt == '-f': fastafile = value elif opt == '-m': motifs.extend(MotifTools.txt2motifs(value)) elif opt == '-n': motifnums = [int(x) for x in value.split(',')] elif opt == '-L': labels = list(value) elif opt == '-t': thresh = float(value) elif opt == '-a': ambigs.extend(value.split(',')) elif opt == '-S': scale = float(value) elif opt == '-i': motiffile = value # AD added this option to ACTUALLY supply the tamo motif file at the command-line. The code to deal with motiffiles already existed. There was just no code for User to supply one. probes = Fasta.load(fastafile) if motiffile: for f in motiffile.split(','): # AD added this to allow supplying multiple tamo files at the prompt like you can supply multiple motifs motifs.extend(MotifTools.load(f)) if ambigs: for ambig in ambigs: motifs.append( MotifTools.Motif_from_text(ambig,0.1) ) if not motifnums: motifnums = range(len(motifs)) print '# %d: %s'%(len(motifs),motifnums) for i in range(len(motifnums)): motif = motifs[motifnums[i]] if labels and i < len(labels): txt = labels[i] else: txt = '%d'%i print '%-3s : %s %5.2f (%4.2f)'%(txt,motif,thresh*motif.maxscore,thresh) probehits = {} for key in probes.keys(): hits_by_motif = [] save_flag = 0 if re.search('[BDHU]',probes[key]): continue for num in motifnums: result = motifs[num].scan(probes[key],thresh*motif.maxscore) if result[0]: hits_by_motif.append(result) save_flag = 1 else: hits_by_motif.append(None) if save_flag: probehits[key]=hits_by_motif #scale = .1 maxw = 40 for key in probehits.keys(): l = len(probes[key]) a = list('-'* 
int(scale*l) ) a.extend( list(' '*10 ) ) desc = [] matches = probehits[key] for i in range(len(matches)): if matches[i]: subseqs,endpoints,scores = matches[i] for idx in range(len(subseqs)): start,stop = endpoints[idx] subseq = subseqs[idx] score = scores[idx] if labels and (i<len(labels)): ID = labels[i] else : ID = '%d'%i desc.append('%s %s %d-%d %4.2f '%(ID,subseq,start,stop,score)) start = int(start*scale) for offset in range(10): if a[start+offset] == '-': if labels and (i < len(labels)): a[start+offset] = labels[i] else: a[start+offset] = '%d'%i break print '%-14s %s'%(key,''.join(a)), print ' '*max(0,maxw-len(a)), '| '.join(['%-27s'%x for x in desc]) print print "Found matches in %d of %d input probes"%(len(probehits),len(probes))
def main(): if len(sys.argv) < 3: print "Usage: %s <fasta_file> [width = None ] [options]" % (re.sub( '^.*/', '', sys.argv[0])) print "Options include:" print " -valid <tf_name> Check answers against Transfac" print " EM Parameters:" print " -beta [0.01] Beta for pseudocounts" print " -seedbeta[0.02] Beta for pseudocounts for seeds from text" print " -gamma [0.2] Gamma (fraction of sequences)" print " -delta [0.001] Convergence criteria" print " " print " Seeds (not actually proper priors)" print " -prior Seqences or motifs for seeds (may be repeated)" print " -top N [0] Include w-mers in top N probes" print " -gap string sample gapped motifs" print " -TF Seed with (all) TRANSFAC PSSMs (buggy)" print " -info <file.info> for structural priors" print " -pad add NN..NN to seed" print " " print " Genome / Background model " print " -human (250,1000) Use Human Background model" print " -Y2K, -Y5C Use Yeast Upstream Intergenic regions (2000, 500)" print " -B Use Bacterial Orfs" print " " print "Examples:" print " %s t.fsa 5 -prior GGGTA -prior AAAAAC " % ( sys.argv[0].split('/')[-1]) print " will start an EM with 3 seeds: GGGTA, AAAAA, and AAAAC" print print " %s t.fsa 5 -info CUP9.info -gamma 0.5 " % ( sys.argv[0].split('/')[-1]) print " will start an EM with Enriched seeds in CUP9.info, with" print " Gamma expectation of 50% of all probes" print print " %s t.fsa -prior MCM1_5.tamo:0 " % (sys.argv[0].split('/')[-1]) print " will start an EM with 0th motif of the file MCM1_5.tamo" print " as a seed" print sys.exit(1) fastafile = sys.argv[1] #Echo the command line print "#" + ' '.join(map(lambda x: re.sub(' ', '\ ', x), sys.argv)) if sys.argv[2].isdigit(): width = sys.argv[2] else: width = None algorithm = '' beta = '' seedbeta = '' cbeta = '' deltamin = '' gamma = 0.2 infofile = '' seedmodels = [] species = 'YEAST' valid_tfs = [] gapped_syl = None gapflank = 0 gapweight = 0.2 enrichfact = 0.7 pmax = 0 #False TFSEEDS = 0 TFMids = [] pad = None padlen = 0 thetas = [] 
seed_count = 0 #Default: Take the top 0 seed_s = [] #Initialize seq array sp_seed = 0 '''Parse command-line arguments''' for tok, i in zip(sys.argv, xrange(len(sys.argv))): if tok == '-top': seed_count = int(sys.argv[i + 1]) elif tok == '-greedy': algorithm = "GREEDY" elif tok == '-prior': seed_s.append(sys.argv[i + 1]) elif tok == 'sp': sp_seed = 1 elif tok == '-beta': beta = float(sys.argv[i + 1]) elif tok == '-beta': seedbeta = float(sys.argv[i + 1]) elif tok == '-cbeta': cbeta = float(sys.argv[i + 1]) elif tok == '-thetas': for j in range(int(sys.argv[i + 1])): thetas.append(float(sys.argv[i + j + 2])) elif tok == '-gamma': gamma = float(sys.argv[i + 1]) elif tok == '-delta': deltamin = float(sys.argv[i + 1]) elif tok == '-info': infofile = sys.argv[i + 1] elif tok == '-valid': valid_tfs.append(sys.argv[i + 1]) elif tok == '-w': width = sys.argv[i + 1] elif tok == '-width': width = sys.argv[i + 1] elif tok == '-gap': gapped_syl = sys.argv[i + 1] elif tok == '-gapflank': gapflank = int(sys.argv[i + 1]) elif tok == '-gapweight': gapweight = float(sys.argv[i + 1]) elif tok == '-enrichfact': enrichfact = float(sys.argv[i + 1]) elif tok == '-pmax': pmax = 1 elif tok == '-Y2K': species = "YEAST_2000_UP" elif tok == '-Y5C': species = "YEAST_500_UP" elif tok == '-B': species = "BAC_ORF" elif tok == '-Ch22': species = "Ch22" elif tok == '-genome': species = sys.argv[i + 1] elif tok == '-pad': pad = sys.argv[i + 1] padlen = sys.argv[i + 2] elif tok == '-TF': TFSEEDS = 1 for j in range(i + 1, len(sys.argv)): if re.match('M0', sys.argv[j]): TFMids.append(sys.argv[j]) else: break elif tok == '-human': _s = '' if sys.argv[i + 1].isdigit(): _s = '_' + sys.argv[i + 1] else: _s = '' species = 'HUMAN' + _s seqs = [] fsaD = Fasta.load(fastafile) probes = fsaD.keys() ''' for probeid in fsaD.keys(): seqs.append (fsaD [probeid]) ''' numprobes = len(probes) #print "numprobes: %i"%numprobes if not ('-random_background' in sys.argv or '-nomarkov' in sys.argv): 
EM.loadMarkovBackground(fastafile, numprobes, species) #seqs = EM.fasta2seqs(fastafile) all_seqs = seqs seed_s.extend(seqs[0:min(seed_count, len(seqs))]) #not necessary --- seed_c.extend(c_seqs[0:min(seed_count,len(seqs))]) if infofile and width == 'info': width = info2width(infofile) elif width != None: width = int(width) #Alternate source of seeds if infofile: if 1 or width: seedmodels.extend(info2seeds(width, infofile, fastafile, species)) else: print 'Error: need to specify motif width w/ .info file' #Any -prior pointers to motifs in other files? (seed_s, motifs) = parse_priors(seed_s) seedmodels.extend(motifs) #Should we get seeds from TRANSFAC? if TFSEEDS: tf = [] D = tfmats() if not TFMids: keys = D.keys() else: keys = [] for TFMid in TFMids: for key in D.keys(): if key[0:6] == TFMid: keys.append(key) break for key in keys: m = D[key] m.seednum = int(re.sub('M0*', '', key.split()[0])) m.seedtxt = '%-24s %s' % (m, key) tf.append(m) tf.sort(lambda x, y: cmp(x.seednum, y.seednum)) seedmodels.extend(tf) #seedmodels.append(tf[33]) if gapped_syl: gapped_priors = gapped_motifs(gapped_syl) gapped_priors = map(lambda x: 'N' + x + 'N', gapped_priors) seed_s.extend(gapped_priors) if pad: print '# Padding models with NN-m-NN' newmodels = [] for m in seedmodels: newmodels.append(m[-2, m.width + 2]) seedmodels = newmodels ''' Set everything up and GO!! ''' global theEM theEM = EM.EM(seed_s, [], [], width, "VERBOSE") if beta: theEM.beta = beta if cbeta: theEM.cbeta = cbeta if deltamin: theEM.deltamin = deltamin if seedbeta: theEM.seedbeta = seedbeta if thetas: theEM.thetas = thetas theEM.param['gamma'] = gamma theEM.probeids.extend(probes) theEM.seqs.extend(all_seqs) #theEM.cons_seqs.extend(c_seqs) theEM.models = seedmodels theEM.gapflank = gapflank theEM.gapweight = gapweight theEM.report() theEM.EM_Cstart() #GO!! 
#print "#Sorting candidates" #sys.stdout.flush() #EM.candidates.sort(lambda x,y: cmp(y.MAP,x.MAP)) #sys.exit(0) ''' Compute some metrics ''' print "#Loading Genome %s" % species sys.stdout.flush() if species == 'human': Genome = ProbeSet('HUMAN', enrichfact) else: Genome = ProbeSet(species, enrichfact) ids = Genome.ids_from_file(fastafile) #fsaDict = Fasta.load(fastafile) #probes = fsaDict.keys() #cons_pickle = fastafile.split('.')[0] + '.cpickle' for C in theEM.candidates: #p_cons = conservation_pvalue(C.pssm,probes,fsaDict,ConsDict,4) #print p_cons if not pmax: w_dict = Genome.w_dict for key, i in zip(w_dict.keys(), range(len(C.pssm.thetas))): w_dict[key] = C.pssm.thetas[i] Genome.w_dict = w_dict C.pssm.pvalue = Genome.p_value(C.pssm, ids, 'verbose') #print "P-VAL: %f"%(Genome.p_value(C.pssm,ids,'verbose')*p_cons) C.pssm.church = Genome.church(C.pssm, ids) else: (p, frac) = Genome.best_p_value(C.pssm, ids) C.pssm.pvalue = p C.pssm.threshold = frac * C.pssm.maxscore print "Bests:", p, frac for valid_tf in valid_tfs: C.pssm.valid = Validate.validate(C.pssm, valid_tf, 'Verbose', "Want Tuple") ''' Print out all motifs (sorted by Enrichment) in an AlignACE-like form ''' theEM.candidates.sort(lambda x, y: cmp(x.pssm.pvalue, y.pssm.pvalue)) for C, i in zip(theEM.candidates, range(len(theEM.candidates))): C.pssm.maxscore = -100 #May have side effects. 
Recompute when done if C.pssm.valid: _t = C.pssm.valid if not _t[0]: vstring = "(--- %8.4f %8.4f %s)" % (_t[1], _t[2], _t[3]) else: vstring = "(HIT %8.4f %8.4f %s)" % (_t[1], _t[2], _t[3]) else: vstring = '' C.pssm._maxscore() #Recomputed print "Log-odds matrix for Motif %3d %s" % (i, C) C.pssm._print_ll() print "Sequence Logo" C.pssm._print_bits() flush() #print '# %3d matching sequences at 90%%'%len(C.pssm.bestseqs(C.pssm.maxscore * 0.9)) flush() m = C.pssm if not m.__dict__.has_key('gamma'): m.gamma = None #Kludge to deal w/ old shelves if m.seedtxt: print "Seed: %3d %s" % (i, m.seedtxt) if m.source: print "Source: ", m.source if m.gamma: print "Gamma: %7.5f" % m.gamma if m.threshold: print "Threshold: %5.2f" % m.threshold if m.thetas != []: tstr = "thetas:" for theta in m.thetas: tstr = tstr + " " + str(theta) print tstr #if C.pssm.seedtxt: # print 'Seed %3d %-25s'%(i,C.pssm.seedtxt) if C.pssm.church != None: vstring = 'ch: %5.2f %s' % (math.fabs( math.log(C.pssm.church) / math.log(10)), vstring) print "Motif %3d %-25s nlog(p): %6.3f %s" % ( i, C, -math.log(C.pssm.pvalue) / math.log(10), vstring) if C.pssm.threshold: print "Threshold: %6.3f %4.1f%%" % ( C.pssm.threshold, 100.0 * C.pssm.threshold / C.pssm.maxscore) C.pssm.maxscore = -1e100 #May have side effects. Recompute when done for seq in C.wmers: print seq, i, C.pssm.scan(seq)[2][0] C.pssm._maxscore() #Recomputed print '*' * len(seq) print "MAP Score: %f" % C.MAP sys.stdout.flush() sys.stdout.flush() sys.exit(0) #Avoid ridiculous python cleanup times