def blast2fas(self):    ### Executes BLAST2FAS and copies results files
    '''
    Executes BLAST2FAS and copies results files.

    Each query's HAQESAC input file is <HaqDir><AccNum>.fas. BLAST2Fas is run if Force is set or if, for any
    query/database pair, rje.isYounger(input file, database) does not return the input file (presumably
    meaning the input file is missing or older than the database - confirm against rje.isYounger).
    Accession numbers listed in <basefile>.blast2fas_null.txt are skipped during that check.
    After the run, each <HAQBLASTDir><AccNum>.blast.fas is renamed to the HAQESAC input file and any matching
    .pickle/.pickle.gz is deleted; queries with no such output are appended to the null file.
    << returns True if BLAST2Fas was executed, False if it was skipped.
    Any exception is logged with self.errorLog() and then re-raised.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        need2blast = self.opt['Force']
        null_file = '%s.blast2fas_null.txt' % self.baseFile()
        nx = 0                  # Number of queries listed in the null file
        null_accs = set()       # Only used for membership tests
        if os.path.exists(null_file):
            with open(null_file,'r') as NULLFILE: null_accs = set(NULLFILE.read().split('\n'))
        self.debug(null_file)
        for seq in self.seqs():
            acc = seq.info['AccNum']
            if acc in null_accs: nx += 1; continue
            hfile = rje.makePath('%s%s.fas' % (self.info['HaqDir'],acc),wholepath=True)
            for db in self.obj['SeqList'].list['Blast2Fas']:
                younger = rje.isYounger(hfile,db)   # Evaluate once; reused for debugging and the decision
                self.debug(younger)
                self.debug(younger == hfile)
                need2blast = need2blast or not younger == hfile
        if not need2blast:
            self.printLog('#BLAST','All HAQESAC input files found (%s w/o BLAST hits) - no BLAST2Fas (force=F)' % nx)
            return False
        ### ~ [2] Execute ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        rje.backup(self,null_file)
        nx = 0                  # Now counts queries without BLAST2Fas output from this run
        seqlist = self.obj['SeqList']
        hitcut = self.getInt('MultiCut') or self.getInt('BlastCut')     # MultiCut takes precedence if non-zero
        if hitcut: seqlist.cmd_list += ['blastb=%d' % hitcut,'blastv=%d' % hitcut]
        forks = self.getInt('Forks')
        if forks: seqlist.cmd_list += ['blasta=%d' % forks]
        rje_seq.Blast2Fas(seqlist,self.getStr('HAQBLASTDir'))
        for seq in self.seqs():
            acc = seq.info['AccNum']
            sbfile = '%s%s.blast.fas' % (self.getStr('HAQBLASTDir'),acc)
            if os.path.exists(sbfile):
                hfile = rje.makePath('%s%s.fas' % (self.info['HaqDir'],acc),wholepath=True)
                os.rename(sbfile,hfile)
                # Remove pickles that share the input file's base name now that the input has been replaced
                for pickle_file in ['%s.pickle' % rje.baseFile(hfile),'%s.pickle.gz' % rje.baseFile(hfile)]:
                    if os.path.exists(pickle_file): os.unlink(pickle_file)
            else:
                # Append (rather than write at the end) so the null file only exists if something is missing
                with open(null_file,'a') as NULLFILE: NULLFILE.write('%s\n' % acc)
                nx += 1
        if nx: self.printLog('#BLAST','%s Accession Numbers without BLAST2Fas hits output to %s' % (nx,null_file))
        self.printLog('#BLAST','%s HAQESAC input files made using BLAST2Fas' % (self.seqNum()-nx))
        return True
    except: self.errorLog('Major problem with MultiHAQ.blast2fas'); raise
def hmmSearch(self,hmm,dbase=None,outfile=None,wait=True):    ### Performs HMMer Search using object attributes
    '''
    Performs HMMer Search using object attributes.
    >> hmm:str = Name of HMM file
    >> dbase:str = Name of DBase file [self.info['SearchDB']]
    >> outfile:str = Name of Output file. If empty or 'none', '<hmm base>.<dbase base>.hmmer' is used.
    >> wait:boolean = whether to wait for HMMer. If False, the command is launched in the background. [True]
    << returns outfile or None if fails
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [1a] ~ Input files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if not rje.checkForFile(hmm): self.printLog('#ERR','HMM file %s is missing!' % hmm); return None
        if not dbase: dbase = self.info['SearchDB']
        if not rje.checkForFile(dbase): self.printLog('#ERR','Database file "%s" is missing!' % dbase); return None
        ## ~ [1b] ~ Output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # NOTE(review): the source was flattened; the up-to-date check is reconstructed as part of the
        # default-outfile branch, which is consistent with the later os.path.exists(outfile) test - confirm.
        if not outfile or outfile.lower() in ['','none']:       # Make an outfile per search
            outfile = '%s.%s.hmmer' % (rje.baseFile(hmm,True),rje.baseFile(dbase,True))
            resfile = outfile
            gzfile = '%s.gz' % outfile
            if not os.path.exists(outfile) and self.opt['GZip'] and os.path.exists(gzfile) and not self.opt['Force']:
                resfile = gzfile    # Only the gzipped results exist: test their age instead
            if not self.opt['Force'] and rje.isYounger(resfile,hmm) == resfile and rje.isYounger(resfile,dbase) == resfile:
                self.printLog('#HMM','HMM results file "%s" exists.' % resfile)
                return outfile      # Already exists
            else: rje.backup(self,outfile,unlink=True)
        ### ~ [2] ~ HMM Search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        hmmopt = ' '.join(self.list['HMMOptions'])
        if self.opt['HMMPFam']: _command = 'hmmpfam --cut_ga %s %s %s > %s' % (hmmopt,hmm,dbase,outfile)
        else: _command = 'hmmsearch %s %s %s > %s' % (hmmopt,hmm,dbase,outfile)
        self.log.printLog('#HMM',_command)
        if not wait: os.system(self.info['HMMerPath'] + _command + ' &')
        elif not os.path.exists(outfile) or self.opt['Force']:
            # The command redirects stdout to outfile itself; anything still captured on the pipe is appended.
            PIPE = os.popen(self.info['HMMerPath'] + _command)
            try: captured = PIPE.read()
            finally: PIPE.close()
            with open(outfile,'a') as OUTFILE: OUTFILE.write(captured)
        self.printLog('#HMM','Outfile produced for %s: %s.' % (hmm,outfile))
        if self.opt['GZip']:
            if wait:
                rje.backup(self,'%s.gz' % outfile,unlink=True)
                os.system('gzip %s' % outfile)
                self.printLog('#GZIP','%s gzipped to save space' % outfile)
            else:
                # A backgrounded search is still writing outfile: gzipping now would compress partial results.
                self.printLog('#GZIP','%s not gzipped: HMMer search running in background (wait=F)' % outfile)
        return outfile
    except:
        self.log.errorLog('Fatal Error during hmmSearch(%s)' % hmm)
        return None
def splitMascot(self):  ### Reads the MASCOT file and splits into header, hits and unmatched files.
    '''
    Reads the MASCOT file and splits into header, hits and unmatched files.

    Lines before the first one containing 'Header' are ignored. Lines from there up to the results header
    (the line containing 'prot_hit_num') are written to <basefile>.header.txt. Subsequent non-blank lines
    are parsed as comma-delimited entries into the 'mascot' table until a line containing 'Peptide matches'
    is met, after which entries go into a 'nohits' table (with fields starting 'prot' dropped). If the iTRAQ
    option is set, 'Quantitation summary for protein' data are added to an 'itraq' table.
    << returns True on success, False if an error was caught and logged. If <basefile>.mascot.csv is judged
       current by rje.isYounger() and force is off, the value of self.printLog() is returned instead.
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        db = self.db()
        infile = self.getStr('MASCOT')
        if self.basefile().lower() in ['','none']: self.basefile(rje.baseFile(infile))
        headfile = '%s.header.txt' % self.basefile()
        hitsfile = '%s.mascot.csv' % self.basefile()
        if rje.isYounger(infile,hitsfile) == hitsfile and not self.force():
            return self.printLog('#FILE','%s file found (force=F)' % hitsfile)
        ### ~ [1] Split MASCOT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        headlines = []      # File header lines to output to headfile
        csvhead = []        # Field names for results tables
        mdb = None          # Current results table (mascot, then nohits)
        idb = None          # iTRAQ table; only created if the iTRAQ option is set
        mx = 0
        itraq = []          # iTRAQ ratio headers (fields containing '/')
        prot_data = {}      # {prot_hit_num:{'prot_acc':acc,'prot_desc':desc}} for entries with a prot_acc
        # Read the file once: the same list serves the main loop and the iTRAQ look-ahead
        with open(infile,'r') as MASCOT: mlines = MASCOT.readlines()
        for mline in mlines:
            mx += 1     # Index of next line in case needed for iTRAQ reading!
            ## ~ [1a] Skip down until Header found ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if not headlines and mline.find('Header') < 0: continue
            ## ~ [1b] Add Header lines to headlines until results headers found ~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if not csvhead and mline.find('prot_hit_num') < 0: headlines.append(mline); continue
            ## ~ [1c] Sort out MASCOT results headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if mline.find('prot_hit_num') >= 0:
                ## ~ Read Headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                with open(headfile,'w') as HEADFILE: HEADFILE.writelines(headlines)
                # Collapse runs of whitespace to single spaces before splitting on commas
                csvhead = rje.readDelimit(' '.join(rje.chomp(mline).split()),',')
                while '' in csvhead: csvhead.remove('')
                ## ~ Sort out iTRAQ headers (missing) ~~~~~~~~~ ##
                if self.getBool('iTRAQ'):
                    iline = mlines[mx]      # First data line following the results header
                    for isplit in rje.readDelimit(iline,',')[len(csvhead):]:    # Should be start of iTRAQ data
                        if '/' in isplit: itraq.append(isplit)
                    self.printLog('#ITRAQ',' '.join(itraq))
                    csvhead += itraq
                    idb = db.addEmptyTable('itraq',['prot_hit_num','prot_acc','prot_desc','itraq','ratio','n','geomean','summary'],keys=['prot_hit_num','itraq'])
                    idb.info['Delimit'] = ','
                ## ~ Add emPAI header (also missing) ~~~~~~~~~~ ##
                if self.getBool('emPAI'): csvhead.append('empai')
                ## ~ Set up Database Table ~~~~~~~~~~~~~~~~~~~~ ##
                self.printLog('#HEAD','; '.join(csvhead))
                mdb = db.addEmptyTable('mascot',csvhead,keys=['prot_hit_num','pep_query'])
                mdb.info['Delimit'] = ','
            elif mline.find('Peptide matches') >= 0:
                # Start of the unmatched peptide section: save the hits and switch to a new table
                mdb.saveToFile()
                if self.getBool('emPAI'): csvhead.remove('empai')
                mdb = db.addEmptyTable('nohits',csvhead,keys=['pep_query'])
                for field in mdb.fields():
                    if field[:4] == 'prot': mdb.dropField(field)
                mdb.info['Delimit'] = ','
                continue
            elif rje.chomp(mline):
                data = rje.readDelimit(mline,',')
                entry = {}; pretraq = True      # pretraq: still within the regular columns (no emPAI/iTRAQ tag met)
                for d in range(len(csvhead)+len(itraq)):
                    if d >= len(data): break
                    if data[d] in itraq: dhead = data[d]; pretraq = False
                    elif data[d] == 'emPAI': entry['empai'] = data[d+1]; pretraq = False
                    elif pretraq and d < len(csvhead): dhead = csvhead[d]
                    elif pretraq: continue      # Unmatched peptides will not have emPAI or iTRAQ data
                    if d and data[d-1] == 'emPAI': continue         # emPAI value: already stored above
                    elif data[d] in itraq + ['emPAI']: continue     # Tag itself is not a value
                    elif dhead not in entry: entry[dhead] = data[d]
                if entry['prot_acc']: prot_data[entry['prot_hit_num']] = {'prot_acc':entry['prot_acc'],'prot_desc':entry['prot_desc']}
                if self.getBool('iTRAQ') and 'Quantitation summary for protein' in data:
                    d = data.index('Quantitation summary for protein') + 1
                    if entry['prot_hit_num'] in prot_data:
                        pacc = prot_data[entry['prot_hit_num']]['prot_acc']
                        pdesc = prot_data[entry['prot_hit_num']]['prot_desc']
                    else:
                        pacc = entry['prot_acc']
                        pdesc = entry['prot_desc']
                    while d < len(data):
                        if data[d] in itraq:
                            idb.addEntry({'prot_hit_num':entry['prot_hit_num'],'prot_acc':pacc,'prot_desc':pdesc,
                                          'itraq':data[d],'ratio':data[d+1],'n':data[d+2],'geomean':data[d+3],'summary':data[d+4]})
                        d += 1
                if entry['prot_hit_num'] or entry['pep_query']: mdb.addEntry(entry)
        mdb.saveToFile()
        if self.getBool('iTRAQ'): idb.saveToFile()
        self.deBug('')
        return True
    except: self.errorLog('Error reading MASCOT file'); return False
def farmHAQ(self):  ### Uses SLiMFarmer to farm out the HAQESAC runs
    '''
    Uses SLiMFarmer to farm out the HAQESAC runs.

    Requires <HaqDir>haqesac.bat to exist. A SLiMFarmer batch run over that file is executed from within
    HaqDir - twice ('First round', 'Second round') if the MultiHAQ option is set, otherwise once
    ('Complete run') - returning to RunPath after each run.
    << returns True once all farming rounds complete. If an exception is caught, the working directory is
       reset to RunPath, the error is logged with quitchoice=True and nothing (None) is returned.
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        batfile = os.path.abspath(rje.makePath('%shaqesac.bat' % self.info['HaqDir'],wholepath=True))
        self.printLog('#FARM',batfile)
        if not rje.exists(batfile): raise IOError('Cannot find %s' % batfile)
        farmcmd = ['subjobs=%s' % batfile,'farm=batch','qsub=F','i=-1','runpath=%s' % os.path.abspath(self.info['HaqDir'])]
        if self.opt['MultiHAQ']: haqfarm = ['First round','Second round']
        else: haqfarm = ['Complete run']
        ### ~ [1] Perform HAQESAC runs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for farmrun in haqfarm:
            self.printLog('#CHDIR','Changing directory for %s farming: %s' % (farmrun,self.info['HaqDir']))
            os.chdir(self.info['HaqDir'])
            farmer = slimfarmer.SLiMFarmer(self.log,self.cmd_list+farmcmd)
            farmer.slimFarm()
            os.chdir(self.info['RunPath'])
            self.printLog('#CHDIR','Changed directory post-farming: %s' % self.info['RunPath'])
            self.printLog('#FARM','HAQESAC %s farming complete.' % farmrun)
        #!# Add identifying and skipping of partial runs.
        # (An unreachable per-sequence loop copied from multiHAQ used to follow the return below. It relied on
        #  names that are never defined in this method - finalrun, processed, secondrun - and has been removed.)
        return True
    except:
        os.chdir(self.info['RunPath'])      # Make sure a failure does not leave the process inside HaqDir
        self.errorLog('Major problem with MultiHAQ.farmHAQ',quitchoice=True)
def multiHAQ(self,secondrun=False):     ### Executes main HAQESAC runs
    '''
    Executes main HAQESAC runs.
    >> secondrun:bool [False] = Whether this is the second round. The round is treated as the final one
       (finalrun) when secondrun equals self.opt['MultiHAQ'].
    For each query with an input file <HaqDir><AccNum>.fas containing at least two sequences, HAQESAC is run
    from within HaqDir unless a <AccNum>.png or <AccNum>.anc.fas file exists there and force is off. In
    non-final rounds, queries whose pickle is judged current by rje.isYounger() are skipped unless Force is
    set. In the final round with ScreenQry set, other queries found in the input file are recorded as
    processed so that they can be skipped later. Unless this is the final round, the method then calls itself
    with secondrun=True.
    << returns None. Errors are logged with self.errorLog(quitchoice=True).
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        finalrun = secondrun == self.opt['MultiHAQ']    # Whether this is the manual HAQESAC phase
        qryacc = self.obj['SeqList'].accList()          # Full list of Query accession numbers
        processed = []                                  # List of processed sequence accession numbers
        ### ~ [1] Perform HAQESAC runs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for seq in self.seqs():
            ## ~ [1a] Check AutoSkip ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            acc = seq.info['AccNum']
            if finalrun and acc in processed and (self.opt['AutoSkip'] or (self.i() >=0 and rje.yesNo('%s already covered by previous HAQESAC. Skip?' % seq.shortName()))):
                self.printLog('#SKIP','%s already covered by previous HAQESAC: Skipped' % seq.shortName()); continue
            ## ~ [1b] Check Whether to run (re-runs and low sequence number) ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            logfile = rje.makePath('%s%s.log' % (self.info['HaqDir'],acc),wholepath=True)   # Only used by the disabled check in [1d]
            infile = rje.makePath('%s%s.fas' % (self.info['HaqDir'],acc),wholepath=True)
            pkfile = rje.makePath('%s%s.pickle' % (self.info['HaqDir'],acc),wholepath=True)
            pkzfile = rje.makePath('%s%s.pickle.gz' % (self.info['HaqDir'],acc),wholepath=True)
            if not os.path.exists(infile): self.printLog('#SKIP','%s input file %s not found: Skipped' % (seq.shortName(),infile)); continue
            if not finalrun and not self.opt['Force'] and rje.isYounger(pkzfile,infile) == pkzfile: self.printLog('#SKIP','%s run detected: Skipped' % seq.shortName()); continue
            if not finalrun and not self.opt['Force'] and rje.isYounger(pkfile,infile) == pkfile: self.printLog('#SKIP','%s run detected: Skipped' % seq.shortName()); continue
            inseqx = rje_seq.SeqCount(self,infile)
            if inseqx < 2: self.printLog('#SKIP','Only one sequence found in %s: Skipped' % (infile)); continue
            ## ~ [1c] Pause if running in Chaser Mode and no Pickle ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            pickled = os.path.exists(pkfile) or os.path.exists('%s.gz' % pkfile); tm = 0
            while secondrun and self.opt['Chaser'] and not pickled:
                self.progLog('#WAIT','No %s pickle. Sleeping for %d min.' % (acc,tm))
                time.sleep(60*tm); tm += 1      # Wait one minute longer each time round (no wait first time)
                pickled = os.path.exists(pkfile) or os.path.exists('%s.gz' % pkfile)
                if not pickled:
                    try: rje.choice('Press <ENTER> to try again, or <CTRL+C> to Quit')
                    except:
                        # Fixed: message has a single %s, so only acc is supplied (passing (acc,tm) raised TypeError)
                        self.printLog('#PICKLE','No %s pickle.' % acc)
                        self.printLog('\r#MULTI','Exiting multiHAQ "Chaser" run.'); return
            ## ~ [1d] Run HAQESAC ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            runhaqesac = True
            pngfile = rje.makePath('%s%s.png' % (self.info['HaqDir'],acc),wholepath=True)
            if not self.force() and rje.exists(pngfile):
                self.printLog('#SKIP','Found evidence of completed run: %s (force=F). Skipping.' % pngfile)
                runhaqesac = False
            ancfile = rje.makePath('%s%s.anc.fas' % (self.info['HaqDir'],acc),wholepath=True)
            if not self.force() and rje.exists(ancfile):
                self.printLog('#SKIP','Found evidence of completed run: %s (force=F). Skipping.' % ancfile)
                runhaqesac = False
            #if not finalrun or self.opt['Force'] or rje.isYounger(logfile,nsfile) != logfile:
            if runhaqesac:
                haqcmd = ['ini=haqesac.ini','seqin=%s.fas' % acc, 'query=%s' % acc, 'basefile=%s' % acc, 'newlog=F']
                self.printLog('#HAQ','Running HAQESAC for %s - will have own log etc.' % seq.shortName(),log=False)
                os.chdir(self.info['HaqDir'])
                info = haqesac.makeInfo()
                haqcmd = rje.getCmdList(haqcmd,info=info)
                out = rje.Out(cmd_list=haqcmd)          # Sets up Out object for controlling output to screen
                out.printIntro(info)                    # Prints intro text using details from Info object
                haqlog = rje.setLog(info,out,haqcmd)    # Sets up Log object for controlling log file output
                try: haqesac.HAQESAC(log=haqlog, cmd_list=haqcmd).run(setobjects=True)
                except:
                    os.chdir(self.info['RunPath'])
                    if self.i() >= 0 and rje.yesNo('Problem with %s HAQESAC run. Abort?' % seq.shortName()): raise KeyboardInterrupt
                # NOTE(review): source was flattened; these lines are reconstructed as part of the run branch - confirm.
                os.chdir(self.info['RunPath'])
                if finalrun: self.printLog('#HAQ','HAQESAC final round run for %s' % seq.shortName())
                else: self.printLog('#HAQ','HAQESAC first round run for %s' % seq.shortName())
            ## ~ [1e] Update ScreenQry ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if not self.opt['ScreenQry'] or not finalrun: continue
            qacclist = []   # Other queries found in this query's input file
            for qacc in rje_seq.SeqList(self.log,['seqin=%s' % infile,'autoload=T','autofilter=F']).accList():
                if qacc in qryacc and qacc != acc: qacclist.append(qacc)
                if qacc in qryacc and qacc not in processed: processed.append(qacc)
            self.printLog('#QRY','%d other queries found in %s: [%s]' % (len(qacclist),infile,'; '.join(qacclist)))
            self.printLog('#QRY','%d of %d queries processed' % (len(processed),self.seqNum()))
        ### ~ [2] MultiHAQ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not finalrun: self.printLog('#MULTI','Executing second round of multiHAQ'); self.multiHAQ(True)
    except: self.errorLog('Major problem with MultiHAQ.multiHAQ',quitchoice=True)
def farmHAQ(self):  ### Uses SLiMFarmer to farm out the HAQESAC runs
    '''
    Uses SLiMFarmer to farm out the HAQESAC runs.

    Requires <HaqDir>haqesac.bat to exist. A SLiMFarmer batch run over that file is executed from within
    HaqDir - twice ('First round', 'Second round') if the MultiHAQ option is set, otherwise once
    ('Complete run') - returning to RunPath after each run.
    << returns True once all farming rounds complete. If an exception is caught, the working directory is
       reset to RunPath, the error is logged with quitchoice=True and nothing (None) is returned.
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        batfile = os.path.abspath(rje.makePath('%shaqesac.bat' % self.info['HaqDir'],wholepath=True))
        self.printLog('#FARM',batfile)
        if not rje.exists(batfile): raise IOError('Cannot find %s' % batfile)
        farmcmd = ['subjobs=%s' % batfile,'farm=batch','qsub=F','i=-1','runpath=%s' % os.path.abspath(self.info['HaqDir'])]
        if self.opt['MultiHAQ']: haqfarm = ['First round','Second round']
        else: haqfarm = ['Complete run']
        ### ~ [1] Perform HAQESAC runs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for farmrun in haqfarm:
            self.printLog('#CHDIR','Changing directory for %s farming: %s' % (farmrun,self.info['HaqDir']))
            os.chdir(self.info['HaqDir'])
            farmer = slimfarmer.SLiMFarmer(self.log,self.cmd_list+farmcmd)
            farmer.slimFarm()
            os.chdir(self.info['RunPath'])
            self.printLog('#CHDIR','Changed directory post-farming: %s' % self.info['RunPath'])
            self.printLog('#FARM','HAQESAC %s farming complete.' % farmrun)
        #!# Add identifying and skipping of partial runs.
        # (An unreachable per-sequence loop copied from multiHAQ used to follow the return below. It relied on
        #  names that are never defined in this method - finalrun, processed, secondrun - and has been removed.)
        return True
    except:
        os.chdir(self.info['RunPath'])      # Make sure a failure does not leave the process inside HaqDir
        self.errorLog('Major problem with MultiHAQ.farmHAQ',quitchoice=True)
def multiHAQ(self, secondrun=False):    ### Executes main HAQESAC runs
    '''
    Executes main HAQESAC runs.
    >> secondrun:bool [False] = Whether this is the second round. The round is treated as the final one
       (finalrun) when secondrun equals self.opt['MultiHAQ'].
    For each query with an input file <HaqDir><AccNum>.fas containing at least two sequences, HAQESAC is run
    from within HaqDir unless a <AccNum>.png or <AccNum>.anc.fas file exists there and force is off. In
    non-final rounds, queries whose pickle is judged current by rje.isYounger() are skipped unless Force is
    set. In the final round with ScreenQry set, other queries found in the input file are recorded as
    processed so that they can be skipped later. Unless this is the final round, the method then calls itself
    with secondrun=True.
    << returns None. Errors are logged with self.errorLog(quitchoice=True).
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        finalrun = secondrun == self.opt['MultiHAQ']    # Whether this is the manual HAQESAC phase
        qryacc = self.obj['SeqList'].accList()          # Full list of Query accession numbers
        processed = []                                  # List of processed sequence accession numbers
        ### ~ [1] Perform HAQESAC runs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for seq in self.seqs():
            ## ~ [1a] Check AutoSkip ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            acc = seq.info['AccNum']
            if finalrun and acc in processed and (self.opt['AutoSkip'] or (self.i() >= 0 and rje.yesNo('%s already covered by previous HAQESAC. Skip?' % seq.shortName()))):
                self.printLog('#SKIP', '%s already covered by previous HAQESAC: Skipped' % seq.shortName())
                continue
            ## ~ [1b] Check Whether to run (re-runs and low sequence number) ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            logfile = rje.makePath('%s%s.log' % (self.info['HaqDir'], acc), wholepath=True)     # Only used by the disabled check in [1d]
            infile = rje.makePath('%s%s.fas' % (self.info['HaqDir'], acc), wholepath=True)
            pkfile = rje.makePath('%s%s.pickle' % (self.info['HaqDir'], acc), wholepath=True)
            pkzfile = rje.makePath('%s%s.pickle.gz' % (self.info['HaqDir'], acc), wholepath=True)
            if not os.path.exists(infile):
                self.printLog('#SKIP', '%s input file %s not found: Skipped' % (seq.shortName(), infile))
                continue
            if not finalrun and not self.opt['Force'] and rje.isYounger(pkzfile, infile) == pkzfile:
                self.printLog('#SKIP', '%s run detected: Skipped' % seq.shortName())
                continue
            if not finalrun and not self.opt['Force'] and rje.isYounger(pkfile, infile) == pkfile:
                self.printLog('#SKIP', '%s run detected: Skipped' % seq.shortName())
                continue
            inseqx = rje_seq.SeqCount(self, infile)
            if inseqx < 2:
                self.printLog('#SKIP', 'Only one sequence found in %s: Skipped' % (infile))
                continue
            ## ~ [1c] Pause if running in Chaser Mode and no Pickle ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            pickled = os.path.exists(pkfile) or os.path.exists('%s.gz' % pkfile)
            tm = 0
            while secondrun and self.opt['Chaser'] and not pickled:
                self.progLog('#WAIT', 'No %s pickle. Sleeping for %d min.' % (acc, tm))
                time.sleep(60 * tm)     # Wait one minute longer each time round (no wait first time)
                tm += 1
                pickled = os.path.exists(pkfile) or os.path.exists('%s.gz' % pkfile)
                if not pickled:
                    try:
                        rje.choice('Press <ENTER> to try again, or <CTRL+C> to Quit')
                    except:
                        # Fixed: message has a single %s, so only acc is supplied (passing (acc, tm) raised TypeError)
                        self.printLog('#PICKLE', 'No %s pickle.' % acc)
                        self.printLog('\r#MULTI', 'Exiting multiHAQ "Chaser" run.')
                        return
            ## ~ [1d] Run HAQESAC ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            runhaqesac = True
            pngfile = rje.makePath('%s%s.png' % (self.info['HaqDir'], acc), wholepath=True)
            if not self.force() and rje.exists(pngfile):
                self.printLog('#SKIP', 'Found evidence of completed run: %s (force=F). Skipping.' % pngfile)
                runhaqesac = False
            ancfile = rje.makePath('%s%s.anc.fas' % (self.info['HaqDir'], acc), wholepath=True)
            if not self.force() and rje.exists(ancfile):
                self.printLog('#SKIP', 'Found evidence of completed run: %s (force=F). Skipping.' % ancfile)
                runhaqesac = False
            #if not finalrun or self.opt['Force'] or rje.isYounger(logfile,nsfile) != logfile:
            if runhaqesac:
                haqcmd = ['ini=haqesac.ini', 'seqin=%s.fas' % acc, 'query=%s' % acc, 'basefile=%s' % acc, 'newlog=F']
                self.printLog('#HAQ', 'Running HAQESAC for %s - will have own log etc.' % seq.shortName(), log=False)
                os.chdir(self.info['HaqDir'])
                info = haqesac.makeInfo()
                haqcmd = rje.getCmdList(haqcmd, info=info)
                out = rje.Out(cmd_list=haqcmd)          # Sets up Out object for controlling output to screen
                out.printIntro(info)                    # Prints intro text using details from Info object
                haqlog = rje.setLog(info, out, haqcmd)  # Sets up Log object for controlling log file output
                try:
                    haqesac.HAQESAC(log=haqlog, cmd_list=haqcmd).run(setobjects=True)
                except:
                    os.chdir(self.info['RunPath'])
                    if self.i() >= 0 and rje.yesNo('Problem with %s HAQESAC run. Abort?' % seq.shortName()):
                        raise KeyboardInterrupt
                # NOTE(review): source was flattened; these lines are reconstructed as part of the run branch - confirm.
                os.chdir(self.info['RunPath'])
                if finalrun:
                    self.printLog('#HAQ', 'HAQESAC final round run for %s' % seq.shortName())
                else:
                    self.printLog('#HAQ', 'HAQESAC first round run for %s' % seq.shortName())
            ## ~ [1e] Update ScreenQry ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if not self.opt['ScreenQry'] or not finalrun:
                continue
            qacclist = []   # Other queries found in this query's input file
            for qacc in rje_seq.SeqList(self.log, ['seqin=%s' % infile, 'autoload=T', 'autofilter=F']).accList():
                if qacc in qryacc and qacc != acc:
                    qacclist.append(qacc)
                if qacc in qryacc and qacc not in processed:
                    processed.append(qacc)
            self.printLog('#QRY', '%d other queries found in %s: [%s]' % (len(qacclist), infile, '; '.join(qacclist)))
            self.printLog('#QRY', '%d of %d queries processed' % (len(processed), self.seqNum()))
        ### ~ [2] MultiHAQ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not finalrun:
            self.printLog('#MULTI', 'Executing second round of multiHAQ')
            self.multiHAQ(True)
    except:
        self.errorLog('Major problem with MultiHAQ.multiHAQ', quitchoice=True)
def blast2fas(self):    ### Executes BLAST2FAS and copies results files
    '''
    Executes BLAST2FAS and copies results files.

    Each query's HAQESAC input file is <HaqDir><AccNum>.fas. BLAST2Fas is run if Force is set or if, for any
    query/database pair, rje.isYounger(input file, database) does not return the input file (presumably
    meaning the input file is missing or older than the database - confirm against rje.isYounger).
    Accession numbers listed in <basefile>.blast2fas_null.txt are skipped during that check.
    After the run, each <HAQBLASTDir><AccNum>.blast.fas is renamed to the HAQESAC input file and any matching
    .pickle/.pickle.gz is deleted; queries with no such output are appended to the null file.
    << returns True if BLAST2Fas was executed, False if it was skipped.
    Any exception is logged with self.errorLog() and then re-raised.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        need2blast = self.opt['Force']
        null_file = '%s.blast2fas_null.txt' % self.baseFile()
        nx = 0                  # Number of queries listed in the null file
        null_accs = set()       # Only used for membership tests
        if os.path.exists(null_file):
            with open(null_file, 'r') as NULLFILE:
                null_accs = set(NULLFILE.read().split('\n'))
        self.debug(null_file)
        for seq in self.seqs():
            acc = seq.info['AccNum']
            if acc in null_accs:
                nx += 1
                continue
            hfile = rje.makePath('%s%s.fas' % (self.info['HaqDir'], acc), wholepath=True)
            for db in self.obj['SeqList'].list['Blast2Fas']:
                younger = rje.isYounger(hfile, db)  # Evaluate once; reused for debugging and the decision
                self.debug(younger)
                self.debug(younger == hfile)
                need2blast = need2blast or not younger == hfile
        if not need2blast:
            self.printLog('#BLAST', 'All HAQESAC input files found (%s w/o BLAST hits) - no BLAST2Fas (force=F)' % nx)
            return False
        ### ~ [2] Execute ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        rje.backup(self, null_file)
        nx = 0                  # Now counts queries without BLAST2Fas output from this run
        seqlist = self.obj['SeqList']
        hitcut = self.getInt('MultiCut') or self.getInt('BlastCut')     # MultiCut takes precedence if non-zero
        if hitcut:
            seqlist.cmd_list += ['blastb=%d' % hitcut, 'blastv=%d' % hitcut]
        forks = self.getInt('Forks')
        if forks:
            seqlist.cmd_list += ['blasta=%d' % forks]
        rje_seq.Blast2Fas(seqlist, self.getStr('HAQBLASTDir'))
        for seq in self.seqs():
            acc = seq.info['AccNum']
            sbfile = '%s%s.blast.fas' % (self.getStr('HAQBLASTDir'), acc)
            if os.path.exists(sbfile):
                hfile = rje.makePath('%s%s.fas' % (self.info['HaqDir'], acc), wholepath=True)
                os.rename(sbfile, hfile)
                # Remove pickles that share the input file's base name now that the input has been replaced
                for pickle_file in ['%s.pickle' % rje.baseFile(hfile), '%s.pickle.gz' % rje.baseFile(hfile)]:
                    if os.path.exists(pickle_file):
                        os.unlink(pickle_file)
            else:
                # Append (rather than write at the end) so the null file only exists if something is missing
                with open(null_file, 'a') as NULLFILE:
                    NULLFILE.write('%s\n' % acc)
                nx += 1
        if nx:
            self.printLog('#BLAST', '%s Accession Numbers without BLAST2Fas hits output to %s' % (nx, null_file))
        self.printLog('#BLAST', '%s HAQESAC input files made using BLAST2Fas' % (self.seqNum() - nx))
        return True
    except:
        self.errorLog('Major problem with MultiHAQ.blast2fas')
        raise
def hmmSearch(self, hmm, dbase=None, outfile=None, wait=True):  ### Performs HMMer Search using object attributes
    '''
    Performs HMMer Search using object attributes.
    >> hmm:str = Name of HMM file
    >> dbase:str = Name of DBase file [self.info['SearchDB']]
    >> outfile:str = Name of Output file. If empty or 'none', '<hmm base>.<dbase base>.hmmer' is used.
    >> wait:boolean = whether to wait for HMMer. If False, the command is launched in the background. [True]
    << returns outfile or None if fails
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [1a] ~ Input files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if not rje.checkForFile(hmm):
            self.printLog('#ERR', 'HMM file %s is missing!' % hmm)
            return None
        if not dbase:
            dbase = self.info['SearchDB']
        if not rje.checkForFile(dbase):
            self.printLog('#ERR', 'Database file "%s" is missing!' % dbase)
            return None
        ## ~ [1b] ~ Output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # NOTE(review): the source was flattened; the up-to-date check is reconstructed as part of the
        # default-outfile branch, which is consistent with the later os.path.exists(outfile) test - confirm.
        if not outfile or outfile.lower() in ['', 'none']:     # Make an outfile per search
            outfile = '%s.%s.hmmer' % (rje.baseFile(hmm, True), rje.baseFile(dbase, True))
            resfile = outfile
            gzfile = '%s.gz' % outfile
            if not os.path.exists(outfile) and self.opt['GZip'] and os.path.exists(gzfile) and not self.opt['Force']:
                resfile = gzfile    # Only the gzipped results exist: test their age instead
            if not self.opt['Force'] and rje.isYounger(resfile, hmm) == resfile and rje.isYounger(resfile, dbase) == resfile:
                self.printLog('#HMM', 'HMM results file "%s" exists.' % resfile)
                return outfile      # Already exists
            else:
                rje.backup(self, outfile, unlink=True)
        ### ~ [2] ~ HMM Search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        hmmopt = ' '.join(self.list['HMMOptions'])
        if self.opt['HMMPFam']:
            _command = 'hmmpfam --cut_ga %s %s %s > %s' % (hmmopt, hmm, dbase, outfile)
        else:
            _command = 'hmmsearch %s %s %s > %s' % (hmmopt, hmm, dbase, outfile)
        self.log.printLog('#HMM', _command)
        if not wait:
            os.system(self.info['HMMerPath'] + _command + ' &')
        elif not os.path.exists(outfile) or self.opt['Force']:
            # The command redirects stdout to outfile itself; anything still captured on the pipe is appended.
            PIPE = os.popen(self.info['HMMerPath'] + _command)
            try:
                captured = PIPE.read()
            finally:
                PIPE.close()
            with open(outfile, 'a') as OUTFILE:
                OUTFILE.write(captured)
        self.printLog('#HMM', 'Outfile produced for %s: %s.' % (hmm, outfile))
        if self.opt['GZip']:
            if wait:
                rje.backup(self, '%s.gz' % outfile, unlink=True)
                os.system('gzip %s' % outfile)
                self.printLog('#GZIP', '%s gzipped to save space' % outfile)
            else:
                # A backgrounded search is still writing outfile: gzipping now would compress partial results.
                self.printLog('#GZIP', '%s not gzipped: HMMer search running in background (wait=F)' % outfile)
        return outfile
    except:
        self.log.errorLog('Fatal Error during hmmSearch(%s)' % hmm)
        return None
def run(self):    ### Main Run method
    '''
    Main Run method. Runs the first applicable mode:
    1. self.opt['SLiMDisc']: hands over to self.slimDisc() and returns its result.
    2. self.opt['Teiresias']: (re)runs TEIRESIAS if needed, reads its output into self.list['Pattern'],
       scores each pattern and writes delimited results. With self.opt['MySQL'] two tables
       (*.patterns.* and *.occ.*) are written, otherwise a single file named self.info['Name'].
    3. self.info['Info'] != 'None': loads motifs from that file and writes a '<name>.info.<ext>' table
       of information content scores.
    << returns the slimDisc() result in mode 1, False if the Info input file is missing, else None.
    Any exception is logged via self.log.errorLog() and re-raised.
    '''
    try:
        ### ~ [1] ~ SLiMDisc Run ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.opt['SLiMDisc']:
            return self.slimDisc()

        ### ~ [2] ~ TEIRESIAS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.opt['Teiresias']:
            ## ~ [2a] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            seqlist = rje_seq.SeqList(self.log, self.cmd_list)
            seqbase = rje.baseFile(seqlist.info['Name'], True)
            infile = '%s.teiresias.fas' % seqbase
            outfile = '%s.teiresias.out' % seqbase
            run_teiresias = True
            if rje.isYounger(outfile, infile) == outfile:
                # Existing output is kept unless the user, in interactive mode, asks to regenerate it.
                if self.stat['Interactive'] < 1 or not rje.yesNo('%s and %s exist already. Regenerate?' % (infile, outfile), 'N'):
                    run_teiresias = False
            ## ~ [2b] ~ Run TEIRESIAS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if run_teiresias:
                seqlist.saveFasta(seqfile=infile, name='Teiresias')    ### Saves sequences in fasta format
                command = rje.makePath(self.info['TeiresiasPath'], True)
                command += ' -i%s -o%s %s' % (infile, outfile, self.info['TeiresiasOpt'])
                self.log.printLog('#CMD', command)
                os.system(command)
            ## ~ [2c] ~ Read Results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.verbose(0, 2, 'Reading TEIRESIAS output from %s...' % outfile, 1)
            self.list['Pattern'] = []
            pattern_regex = r'^(\d+)\s+(\d+)\s+(\S+)\s+(\d.+\d)$'
            RESULTS = open(outfile, 'r')
            try:
                for line in RESULTS:
                    matched = rje.matchExp(pattern_regex, line)    # Match once and reuse the result
                    if matched:    # New pattern
                        self.addTeiresiasPattern(matched)
                    elif len(line) > 3 and line[0] != '#':
                        self.log.errorLog('Did not recognise line: %s' % line, False, False)
            finally:
                RESULTS.close()
            patx = len(self.list['Pattern'])
            self.log.printLog('#PAT', '%s TEIRESIAS patterns read from %s.' % (rje.integerString(patx), outfile))
            ## ~ [2d] ~ Calculate Information Content ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            aafreq = seqlist.aaFreq()
            self.verbose(0, 3, 'Calculating Information Content & Length stats...', 0)
            occx = 0
            for pattern in self.list['Pattern']:
                pattern.stat['Info'] = self.calculateScore(pattern.info['Pattern'], aafreq)
                pattern._makeLength()
                occx += 1
                rje.progressPrint(self, occx, patx // 100, patx // 10)    # Explicit integer division
            self.verbose(0, 1, '...Done!', 2)
            ## ~ [2e] ~ Prepare & Save Results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            delimit = rje.getDelimit(self.cmd_list)
            if self.info['Name'] == 'None':
                self.info['Name'] = '%s.teiresias.%s' % (seqbase, rje.delimitExt(delimit))
            PATFILE = OCCFILE = RESFILE = None
            occx = 0
            try:
                if self.opt['MySQL']:    # Two tables
                    (namebase, nameext) = os.path.splitext(self.info['Name'])
                    occfile = '%s.occ%s' % (namebase, nameext)
                    patfile = '%s.patterns%s' % (namebase, nameext)
                    if self.opt['Append']:
                        PATFILE = open(patfile, 'a')
                        OCCFILE = open(occfile, 'a')
                    else:
                        PATFILE = open(patfile, 'w')
                        rje.writeDelimit(PATFILE, ['pattern', 'tot_occ', 'seq_occ', 'info', 'len', 'fix', 'wild'], delimit)
                        # Fresh output gets a header, so the file must be overwritten like PATFILE
                        # (the original appended, leaving a header in the middle of any old file).
                        OCCFILE = open(occfile, 'w')
                        rje.writeDelimit(OCCFILE, ['seq_id', 'pos', 'pattern', 'pat_match'], delimit)
                elif self.opt['Append']:
                    RESFILE = open(self.info['Name'], 'a')
                else:
                    # The single-table output is self.info['Name'] (the original referenced 'patfile',
                    # which is only defined in the MySQL branch and raised a NameError here).
                    RESFILE = open(self.info['Name'], 'w')
                    rje.writeDelimit(RESFILE, ['Sequence Name', 'Position', 'Pattern', 'Match', 'Total Occurrences', 'Num Sequences', 'Information Content', 'Length', 'Fixed', 'Wildcard'], delimit)
                for pattern in self.list['Pattern']:
                    patstats = ['%d' % pattern.stat[stat] for stat in ['OccCount', 'SeqCount', 'Info', 'Length', 'Fixed', 'Wildcards']]
                    patstats[2] = '%.3f' % pattern.stat['Info']    # Info is reported to 3 d.p., not as an integer
                    if self.opt['MySQL']:    # Two tables
                        rje.writeDelimit(PATFILE, [pattern.info['Pattern']] + patstats, delimit)
                    for occ in rje.sortKeys(pattern.occ):
                        seq = seqlist.seq[occ]
                        for pos in pattern.occ[occ]:
                            match = seq.info['Sequence'][pos:(pos + pattern.stat['Length'])]
                            outlist = [seq.shortName(), '%d' % pos, pattern.info['Pattern'], match]
                            if self.opt['MySQL']:    # Two tables
                                rje.writeDelimit(OCCFILE, outlist, delimit)
                            else:
                                rje.writeDelimit(RESFILE, outlist + patstats, delimit)
                            occx += 1
            finally:
                # Close whichever handles were opened, even if writing failed part-way.
                for handle in (PATFILE, OCCFILE, RESFILE):
                    if handle is not None:
                        handle.close()
            if self.opt['MySQL']:    # Two tables
                self.log.printLog('#OUT', '%s patterns output to %s.' % (rje.integerString(patx), patfile))
                self.log.printLog('#OUT', '%s pattern occurrences output to %s.' % (rje.integerString(occx), occfile))
            else:
                self.log.printLog('#OUT', '%s occurrences of %s patterns output to %s.' % (rje.integerString(occx), rje.integerString(patx), self.info['Name']))

        ### ~ [3] ~ InfoContent ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        elif self.info['Info'] != 'None':
            ## ~ [3a] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            alphabet = rje_seq.alph_protx
            if not os.path.exists(self.info['Info']):
                self.log.errorLog('Input file %s missing!' % self.info['Info'], False, False)
                return False
            mypresto = presto.Presto(self.log, self.cmd_list)
            mypresto.loadMotifs(file=self.info['Info'], clear=True)
            seqlist = rje_seq.SeqList(self.log, self.cmd_list + ['autoload=T'])
            if seqlist.seqNum() > 0:
                aafreq = seqlist.aaFreq(alphabet=None, fromfile=None, loadfile=None, total=False)    ### Returns dictionary of AA (& gap etc.) frequencies
            else:
                # No sequences loaded: fall back to equal frequencies over the default alphabet.
                aafreq = {}
                for aa in alphabet:
                    aafreq[aa] = 1.0 / len(alphabet)
            # Sum of p*log2(p) over all frequencies; zero frequencies contribute nothing and are skipped
            # because math.log(0) raises ValueError.
            maxinfo = 0
            for aa in aafreq:
                if aafreq[aa] > 0:
                    maxinfo += (aafreq[aa] * math.log(aafreq[aa], 2))
            ## ~ [3b] ~ Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            delimit = rje.getDelimit(self.cmd_list)
            ext = rje.delimitExt(delimit)
            outfile = '%s.info.%s' % (rje.baseFile(self.info['Info'], True, ['.txt', '.%s' % ext]), ext)
            if self.opt['Append']:
                OUTFILE = open(outfile, 'a')
            else:
                OUTFILE = open(outfile, 'w')
            try:
                if not self.opt['Append']:
                    rje.writeDelimit(OUTFILE, ['motif', 'pattern', 'info'], delimit)
                ## ~ [3c] ~ Calculate Information Scores ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                for motif in mypresto.motif:
                    self.verbose(2, 4, motif.info['Sequence'], 0)
                    elements = motif.info['Sequence'].replace('X', '.').split('-')
                    compressed = []
                    for el in elements:
                        if el.startswith('.{'):    # Ambiguous spacer length - compress
                            compressed.append('.')
                        else:
                            compressed.append(el)
                    pattern = ''.join(compressed)
                    self.verbose(2, 2, '=> %s' % pattern, 1)
                    motif.stat['Info'] = self.calculateInformationContent(pattern, aafreq, maxinfo, self.stat['InfoGapPen'])
                    self.verbose(0, 3, '%s (%s) = %.2f' % (motif.info['Name'], pattern, motif.stat['Info']), 1)
                    rje.writeDelimit(OUTFILE, [motif.info['Name'], pattern, '%.2f' % motif.stat['Info']], delimit)
            finally:
                ## ~ [3d] ~ Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                OUTFILE.close()
    except:
        self.log.errorLog('Error in run().', printerror=True, quitchoice=False)
        raise   # Delete this if method error not terrible