def _setAttributes(self):   ### Sets Attributes of Object
    '''
    Sets Attributes of Object:
    - Info:str ['Program','QPath','Job','PyPath','Email','HPC','DependHPC']
    - Opt:boolean ['ModPurge','RjePy','Report','MailStart']
    - Stats:float ['Nodes','Walltime','PPN','Pause','VMem']
    - List:list ['Depend','PreCall','Modules']
    - Dict:dictionary []
    - Obj:RJE_Object []
    '''
    ### Basics ###
    self.infolist = ['Program','QPath','Job','PyPath','Email','HPC','DependHPC']
    self.optlist = ['ModPurge','RjePy','Report','MailStart']
    self.statlist = ['Nodes','Walltime','PPN','Pause']
    self.listlist = ['Depend','PreCall','Modules']
    self.dictlist = []
    self.objlist = []
    ### Defaults ###
    self._setDefaults(info='None',opt=True,stat=0.0,obj=None,setlist=True,setdict=True)
    ### Other Attributes ###
    self.setInfo({'QPath':os.path.abspath(os.curdir),'Job':'rje_%s' % rje.randomString(4),
                  'PyPath':'/home/re1u06/Serpentry/','Email':'',
                  'HPC':'IRIDIS4','DependHPC':'blue30.iridis.soton.ac.uk'})
    self.setStat({'Walltime':60,'Nodes':1,'PPN':12,'Pause':5,'VMem':48})
    self.setOpt({'Report':False,'MailStart':False,'ModPurge':True})
def nextSeqJob(self,host_id):   ### Sets a new job running on host with given index    #V1.0
    '''Sets a new job running on host with given index.'''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        jdict = self.dict['Running'][host_id] = {}          # Setup empty dictionary to fill, if jobs available
        if self.list['Seq']: seq = self.list['Seq'].pop(0)  # UniFake this sequence
        else: return                                        # Out of sequences: stop
        if seq.info['AccNum'] in self.list['Pickup']: return self.nextSeqJob(host_id)   # Skip this sequence
        jran = 'i_%s' % rje.randomString(6)
        jdict['Log'] = '%s%s.log' % (self.getStr('RunPath'),jran)
        jdict['Qry'] = '%s%s.qry' % (self.getStr('RunPath'),jran)
        for out in self.list['OutList']: jdict[out] = '%s%s.%s' % (self.getStr('RunPath'),jran,out)
        open(jdict['Qry'],'w').write('>%s\n%s\n' % (seq.info['Name'],seq.info['Sequence']))
        job = 'python %s%s.py' % (self.getStr('PyPath'),self.getStr('Farm'))
        if self.getStr('JobINI'): job = '%s ini=%s' % (job,self.getStr('JobINI'))
        job = '%s seqin=%s i=-1 v=-1 basefile=%s' % (job,jdict['Qry'],jran)
        initial_cmds = 'cd ' + self.getStr('RunPath') + ' ; echo %s on `hostname` as %s ; ' % (seq.shortName(),jran)
        if self.rsh():
            job = '%s %s ; echo Finishing on `hostname`' % (initial_cmds,job)
            job = "rsh %s '%s'" % (self.list['Hosts'][host_id],job)
        ### ~ [2] ~ Add Job ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        try: cpid = os.fork()   # Fork child process
        except OSError:
            self.errorLog('JobFarmer.nextJob error. Will sleep for 10 mins then continue.')
            time.sleep(600)
            self.printLog('#YAWN','Re-awakening JobFarmer nextJob. Fingers crossed.')
            jdict['Error'] = 'JobFarmer.nextJob OSError.'   #!# Make sure this node is retried.
            return
        if cpid:    # Parent process records PID of child process
            jdict['PID'] = cpid
            self.printLog('#SEQ','Running %s as %s [%d::%s]: %d remain' % (seq.shortName(),cpid,host_id,self.list['Hosts'][host_id],len(self.list['Seq'])))
        else:       # Child process runs the job, then exits
            os.system(job)
            os._exit(0)
    except SystemExit: raise    # Child exit
    except: self.errorLog('JobFarmer.nextSeqJob error')
def startFork(self,fdict):  ### Sets a new fork going using the data in fdict.
    '''Sets a new fork going using the data in fdict.'''
    try:### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seqin = self.seqinObj()
        fdict['seq'] = self.list['ToFork'].pop(0)
        (seqname,sequence) = seqin.getSeq(fdict['seq'])
        fdict['ID'] = 'Fork -%d' % (len(self.list['ToFork']) + 1)
        fdict['FID'] = 'f_%s' % rje.randomString(6)
        fdict['Log'] = '%s%s.log' % (self.getStr('RunPath'),fdict['FID'])
        fdict['ResFile'] = ['depthcharge.tdt']
        try: open(fdict['Log'],'w')
        except: self.errorLog('Log problem. Aborting fork.'); return self.endJob(fdict)
        ### ~ [2] ~ Add Fork ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.setNum({'KillTime':time.time()})
        cpid = os.fork()        # Fork child process
        if cpid:                # Parent process records PID of child process
            fdict['PID'] = cpid
            self.printLog('\r#FORK','Forking seq as %s: %d remain; %.1f%% mem free' % (cpid,len(self.list['ToFork']),fdict['Mem']),log=self.getBool('LogFork'),screen=self.getBool('LogFork') or self.v() > 1)
            self.printLog('#FORK','%s seq: %s' % (cpid,fdict['seq']),log=self.getBool('LogFork'),screen=self.getBool('LogFork') or self.v() > 1)
        else:                   # Child process runs the depthCharge analysis, then exits
            self.baseFile(fdict['FID'])
            self.setInt({'Interactive':-1})
            self.log.info['ErrorLog'] = ''
            self.log.info['LogFile'] = fdict['Log']
            self.log.opt['Quiet'] = True    # When True, will not write to screen or log apart from errors.
            self.list['ResFile'] = fdict['ResFile']
            self.depthCharge(seqname,sequence)
            os._exit(0)
    except SystemExit: raise    # Child exit
    except: self.errorLog('Forker.startFork error')
def startFork(self,fdict):  ### Sets a new fork going using the data in fdict.
    '''Sets a new fork going using the data in fdict.'''
    try:### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        fdict['cmd'] = self.list['ToFork'].pop(0)
        fdict['ID'] = 'Fork %d' % self.list['Forked'].index(fdict)
        fdict['FID'] = 'f_%s' % rje.randomString(6)
        if self.getBool('RjePy'):
            fdict['Log'] = '%s%s.log' % (self.getStr('RunPath'),fdict['FID'])
            fdict['cmd'] += ' basefile=%s' % (fdict['Log'])
            fdict['ResFile'] = self.list['ResFile'][0:]
            try: open(fdict['Log'],'w')
            except: self.errorLog('Log problem. Aborting fork.'); return self.endJob(fdict)
        ### ~ [2] ~ Add Fork ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.setNum({'KillTime':time.time()})
        cpid = os.fork()        # Fork child process
        if cpid:                # Parent process records PID of child process
            fdict['PID'] = cpid
            self.printLog('#FORK','Forking cmd as %s: %d remain; %.1f%% mem free' % (cpid,len(self.list['ToFork']),fdict['Mem']))
            self.printLog('#FORK','%s cmd: %s' % (cpid,fdict['cmd']))
        else:                   # Child process runs the command, then exits
            os.system(fdict['cmd'])
            os._exit(0)
    except SystemExit: raise    # Child exit
    except: self.errorLog('Forker.startFork error')
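# The job-farming methods in this collection all rest on the same low-level os.fork() idiom: the parent
# records the child PID for later monitoring, while the child runs its command and leaves via os._exit()
# so it never re-enters the parent's control flow. The sketch below is only an illustration of that idiom
# under simplified assumptions (plain shell commands, a fixed fork limit); fork_commands() is a
# hypothetical helper and not part of this module's API.
import os, time

def fork_commands(commands, max_forks=4):
    '''Run each shell command in a forked child, keeping at most max_forks children alive.'''
    active = []                                 # PIDs of live children
    while commands or active:
        while commands and len(active) < max_forks:
            cmd = commands.pop(0)
            pid = os.fork()
            if pid: active.append(pid)          # Parent: remember the child PID
            else:                               # Child: run the job and exit hard
                os.system(cmd)
                os._exit(0)
        for pid in active[:]:                   # Reap any children that have finished
            (done, status) = os.waitpid(pid, os.WNOHANG)
            if done: active.remove(pid)
        time.sleep(1)                           # Avoid busy-waiting between checks
    return True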
def loadOrthAln(callobj,seq,gopher=True):   ### Identifies file, loads and checks alignment.
    '''
    Identifies file, loads and checks alignment. If the identified file is not actually aligned, then RJE_SEQ will try to
    align the proteins using MUSCLE or ClustalW.
    >> callobj:Object containing settings for stats generation (MotifList, generally).
    >> seq:Sequence being analysed.
    >> gopher:bool [True] = whether to try to generate alignment with GOPHER if callobj.opt['Gopher']
    << aln = SeqList object containing alignment with queryseq
    '''
    try:
        ### Setup Attributes ###
        v = callobj.stat['Verbose']
        alndir = rje.makePath(callobj.info['AlnDir'])
        alnext = callobj.info['AlnExt']

        ### Identify File ###
        if alnext[0] != '.': alnext = '.%s' % alnext
        alnstart = [seq.info['AccNum'],seq.info['ID'],seq.shortName(),None]
        if v > 2: callobj.log.printLog('#PRESTO','%s' % callobj.opt)    #!# Old debugging? #!#
        if callobj.opt['Gopher'] and callobj.opt['FullForce']:
            if v > 0: callobj.log.printLog('#ALN','FullForce=T. Will call Gopher for %s regardless of existing files' % seq.shortName())
            alnstart = [None]
        for file in alnstart:
            if file:
                file = '%s%s%s' % (alndir,file,alnext)
                if rje.checkForFile(file): break    # File found
            else:
                #!# Sort out logging and see if Gopher can be used directly rather than just run() #!#
                ### Run GOPHER ###
                if gopher and callobj.opt['Gopher']:    #!# Add working version for PRESTO and SlimPickings #!#
                    callobj.deBug('Run GOPHER in %s' % callobj.info['GopherDir'])
                    mydir = os.getcwd()
                    os.chdir(callobj.info['GopherDir'])
                    callobj.log.printLog('\n#GOPHER','Running GOPHER on %s' % seq.shortName())
                    try:    #!# Add log.silent() method? #!#
                        gcmd = ['orthtree'] + callobj.cmd_list + ['gnspacc=T','i=-1']
                        solo_gopher = gopher_V2.GopherFork(log=callobj.log,cmd_list=gcmd)
                        solo_gopher.info['Name'] = seq.shortName()
                        solo_gopher.obj['Sequence'] = seq
                        solo_gopher.obj['BLAST'] = gopher_V2.Gopher(callobj.log,gcmd).setupBlast()  #!# Contemplate setting up Gopher in callobj #!#
                        solo_gopher.obj['BLAST'].log = callobj.log
                        solo_gopher.run('orthalign')    #X#gopher_V2.Gopher(callobj.log,gcmd).setMode())
                    except:
                        os.chdir(mydir)
                        callobj.log.errorLog('Problem with Gopher run!')
                        return None
                    if not 'old_school':    # Note: 'old_school' is truthy, so this legacy Gopher call is deliberately never executed.
                        inputseq = 'tmp%s.fas' % rje.randomString(8)
                        TMP = open(inputseq,'w')
                        TMP.write('>%s\n%s\n' % (seq.info['Name'],seq.info['Sequence']))
                        TMP.close()
                        gcmd = ['orthtree'] + callobj.cmd_list + ['gopher=%s' % inputseq,'gnspacc=T','i=-1']
                        try:
                            mygopher = gopher_V2.Gopher(log=callobj.log,cmd_list=gcmd)
                            mygopher.run()
                        except:
                            os.chdir(mydir)
                            callobj.log.errorLog('Problem with Gopher run!',printerror=False)
                            return None
                        rje_blast.cleanupDB(callobj,dbfile=inputseq,deletesource=True)
                    os.chdir(mydir)
                if callobj.opt['Gopher']:
                    file = '%s%s%s' % (alndir,seq.info['AccNum'],alnext)
                    if not os.path.exists(file): file = None
                if not file:
                    callobj.log.printLog('#ALN','No alignment file found for %s in %s.' % (seq.shortName(),alndir),screen=False)
                    return None

        ### Load Alignment ###
        callobj.log.stat['Verbose'] = v - 1
        alncmd = ['seqin=None','query=%s' % seq.shortName(),'accnr=F','seqnr=F','autofilter=F','align=T','gnspacc=F']
        aln = rje_seq.SeqList(log=callobj.log,cmd_list=callobj.cmd_list+alncmd)
        #X#print file
        aln.loadSeqs(seqfile=file,seqtype='Protein',aln=True,nodup=None)
        callobj.log.stat['Verbose'] = v
        ## Check Query ##
        qry = aln.obj['QuerySeq']
        if not qry:
            if aln.querySeq(query=seq.info['AccNum']): qry = aln.obj['QuerySeq']
            else:
                callobj.log.printLog('#ALN','Problem finding %s in %s.' % (seq.shortName(),file),screen=False)
                return None

        ### Check Alignment ###
        if aln.seqNum() < 2:
            callobj.log.printLog('#ALN','Not enough sequences for %s in %s.' % (seq.shortName(),file),screen=False)
            return None
        if aln._checkAln(aln=True,realign=True): return aln
        else:
            callobj.log.printLog('#ERR','%s not aligned!!!' % (file))
            return None
    except:
        callobj.log.errorLog('Something bad has happened in rje_motif_stats.loadOrthAln()')
        callobj.log.stat['Verbose'] = v
        return None
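# loadOrthAln() delegates the real alignment check to rje_seq's _checkAln(), which can also trigger
# realignment with MUSCLE/ClustalW. Conceptually, the basic test is just that every loaded (gapped)
# sequence has the same length and that the query is present. A minimal sketch of that idea, using
# plain (name, gapped_sequence) tuples rather than the rje_seq.SeqList API (is_aligned() is a
# hypothetical helper, not part of this module):
def is_aligned(seqs, query_name):
    '''Return True if all gapped sequences share one length and query_name is among them.'''
    if len(seqs) < 2: return False
    lengths = set(len(gapped) for (name, gapped) in seqs)
    names = [name for (name, gapped) in seqs]
    return len(lengths) == 1 and query_name in names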
def ANCHOR(self,retry=2):   ### Runs ANCHOR disorder prediction
    '''Runs ANCHOR disorder prediction.'''
    try:### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [0a] ~ Setup sequence and temp file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        sequence = self.info['Sequence'].upper()
        name = self.info['Name'][:4] + rje.randomString(8)
        tmp = name + '.tmp'
        ## ~ [0b] ~ Setup ANCHOR ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        apath = self.info['ANCHOR']
        if os.path.basename(apath) == 'anchor': apath = os.path.dirname(apath)
        anchor = rje.makePath(apath) + 'anchor'
        if not os.path.exists(anchor):
            self.errorLog('Path "%s" not found!' % anchor,printerror=False)
            retry = 0; raise IOError
        ### ~ [1] Run ANCHOR Disorder prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        open(tmp,'w').write('>%s\n%s\n' % (name,sequence))
        acmd = '%s %s -d %s' % (anchor,tmp,apath)
        dlines = os.popen(acmd).readlines()
        try: os.unlink(tmp)
        except: self.errorLog('Cannot delete %s!' % tmp)
        ### ~ [2] Read in results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.info['Name'] not in ['','None']: name = self.info['Name']
        self.list['ResidueDisorder'] = []
        for d in dlines:
            if d[:1] == '#': continue
            if rje.matchExp('^(\d+)\s+(\S)\s+(\S+)',d):
                dm = rje.matchExp('^(\d+)\s+(\S)\s+(\S+)',d)
                pos = string.atoi(dm[0])
                aa = dm[1]
                score = string.atof(dm[2])
                i = len(self.list['ResidueDisorder'])
                if sequence[i] != aa:
                    self.log.errorLog('%s: Position %d is %s in sequence but %s in ANCHOR output!' % (name,pos,sequence[i],aa),printerror=False)
                    raise ValueError
                if pos != (i + 1):
                    self.log.errorLog('%s: Position %d reached in ANCHOR output but previous results missing!' % (name,pos),printerror=False)
                    raise ValueError
                self.list['ResidueDisorder'].append(score)
        if len(self.list['ResidueDisorder']) != len(sequence):
            self.log.errorLog('%s: Sequence = %d aa but ANCHOR results stop at %s!' % (name,len(sequence),len(self.list['ResidueDisorder'])),printerror=False)
            raise ValueError
        ### ~ [3] ~ Make Regions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.list['RegionDisorder'] = []
        self.list['RegionFold'] = []
        start = 0
        fstart = 0
        i = 0
        dx = 0
        while i < len(sequence):
            score = self.list['ResidueDisorder'][i]
            i += 1
            if not start and score > self.stat['IUCut']: start = i      ### Start new disorder ###
            elif start and score <= self.stat['IUCut']:                 ### End! ###
                self.list['RegionDisorder'].append((start,i-1))
                dx += i - start
                start = 0
            if not fstart and score <= self.stat['IUCut']: fstart = i   ### Start new fold ###
            elif fstart and score > self.stat['IUCut']:                 ### End! ###
                self.list['RegionFold'].append((fstart,i-1))
                fstart = 0
        if start:
            self.list['RegionDisorder'].append((start,len(sequence)))
            dx += len(sequence) + 1 - start
        if fstart: self.list['RegionFold'].append((fstart,len(sequence)))
        self.minRegion()
        if self.opt['PrintLog']: self.log.printLog('\r#DIS','ANCHOR Disorder prediction complete: %d disorder regions, %d disordered aa' % (len(self.list['RegionDisorder']),dx))
        return True
    except:
        if retry:
            self.printLog('#RETRY','Trying %s again...' % name)
            return self.ANCHOR(retry-1)
        self.log.errorLog('Error in Disorder.ANCHOR(%s). Disorder prediction failed.' % name)
        self.list['RegionDisorder'] = []
        self.list['RegionFold'] = []
        return False
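# ANCHOR() above and iuPred() below share the same region-building step: walk the per-residue scores
# once and emit 1-based (start,end) tuples for stretches above the cutoff (disordered) or at/below it
# (ordered/"fold"). A self-contained sketch of that step, assuming nothing beyond a list of float
# scores and a cutoff (score_regions() is illustrative, not the module's own method):
def score_regions(scores, cutoff):
    '''Return (disordered, ordered) lists of 1-based (start,end) regions.'''
    disordered = []; ordered = []
    dstart = fstart = 0                         # 0 = no open region of that type
    for (i, score) in enumerate(scores):
        pos = i + 1                             # Convert to 1-based residue position
        if score > cutoff:
            if not dstart: dstart = pos         # Open a disorder region
            if fstart: ordered.append((fstart, pos - 1)); fstart = 0
        else:
            if not fstart: fstart = pos         # Open an ordered ("fold") region
            if dstart: disordered.append((dstart, pos - 1)); dstart = 0
    if dstart: disordered.append((dstart, len(scores)))
    if fstart: ordered.append((fstart, len(scores)))
    return (disordered, ordered)
# e.g. score_regions([0.1,0.9,0.8,0.2], 0.5) returns ([(2,3)], [(1,1),(4,4)])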
def iuPred(self,retry=2):   ### Runs IUPred disorder prediction
    '''Runs IUPred disorder prediction.'''
    mydir = os.path.abspath(os.curdir)
    try:
        ### Setup sequence and temp file ###
        sequence = self.info['Sequence'].upper()
        name = self.info['Name'][:4] + rje.randomString(8)
        tmp = name + '.tmp'
        ### Run Disorder ###
        iupath = string.join(string.split(self.info['IUPath'],os.sep)[:-1],os.sep)
        iupred = string.split(self.info['IUPath'],os.sep)[-1]
        if self.opt['IUChDir']: os.chdir(string.join(string.split(self.info['IUPath'],os.sep)[:-1],os.sep))
        open(tmp,'w').write('>%s\n%s\n' % (name,sequence))
        if self.opt['IUChDir'] and self.opt['Win32']: iucmd = '%s %s %s' % (iupred,tmp,self.info['IUMethod'].lower())
        elif self.opt['IUChDir']: iucmd = './%s %s %s' % (iupred,tmp,self.info['IUMethod'].lower())
        else: iucmd = '%s %s %s' % (self.info['IUPath'],tmp,self.info['IUMethod'].lower())
        dlines = os.popen(iucmd).readlines()
        try: os.unlink(tmp)
        except: self.errorLog('Cannot delete %s!' % tmp)
        if self.opt['IUChDir']: os.chdir(mydir)
        if self.info['Name'] not in ['','None']: name = self.info['Name']
        self.list['ResidueDisorder'] = []
        for d in dlines:
            if rje.matchExp('^\s*(\d+)\s+(\S)\s+(\S+)',d):
                dm = rje.matchExp('^\s*(\d+)\s+(\S)\s+(\S+)',d)
                pos = string.atoi(dm[0])
                aa = dm[1]
                score = string.atof(dm[2])
                i = len(self.list['ResidueDisorder'])
                if sequence[i] != aa:
                    self.log.errorLog('%s: Position %d is %s in sequence but %s in IUPred output!' % (name,pos,sequence[i],aa),printerror=False)
                    raise ValueError
                if pos != (i + 1):
                    self.log.errorLog('%s: Position %d reached in IUPred output but previous results missing!' % (name,pos),printerror=False)
                    raise ValueError
                self.list['ResidueDisorder'].append(score)
        if len(self.list['ResidueDisorder']) != len(sequence):
            self.log.errorLog('%s: Sequence = %d aa but IUPred results stop at %s!' % (name,len(sequence),len(self.list['ResidueDisorder'])),printerror=False)
            raise ValueError
        ### Make Regions ###
        self.list['RegionDisorder'] = []
        self.list['RegionFold'] = []
        start = 0
        fstart = 0
        i = 0
        dx = 0
        while i < len(sequence):
            score = self.list['ResidueDisorder'][i]
            i += 1
            if not start and score > self.stat['IUCut']: start = i      ### Start new disorder ###
            elif start and score <= self.stat['IUCut']:                 ### End! ###
                self.list['RegionDisorder'].append((start,i-1))
                dx += i - start
                start = 0
            if not fstart and score <= self.stat['IUCut']: fstart = i   ### Start new fold ###
            elif fstart and score > self.stat['IUCut']:                 ### End! ###
                self.list['RegionFold'].append((fstart,i-1))
                fstart = 0
        if start:
            self.list['RegionDisorder'].append((start,len(sequence)))
            dx += len(sequence) + 1 - start
        if fstart: self.list['RegionFold'].append((fstart,len(sequence)))
        self.minRegion()
        if self.opt['PrintLog']: self.log.printLog('\r#DIS','IUPred (%s) Disorder prediction complete: %d disorder regions, %d disordered aa' % (self.info['IUMethod'].lower(),len(self.list['RegionDisorder']),dx))
        return True
    except:
        if self.opt['IUChDir']: os.chdir(mydir)
        if retry:
            self.printLog('#RETRY','Trying %s again...' % name)
            return self.iuPred(retry-1)
        self.log.errorLog('Error in Disorder.iuPred(%s). Disorder prediction failed. Check (setenv?) IUPred_PATH environment variable.' % name)
        self.list['RegionDisorder'] = []
        self.list['RegionFold'] = []
        #try: os.system('rm %s*tmp' % (rje.makePath(os.path.split(self.info['IUPath'])[0])))
        #except: pass
        return False
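# Both predictors above capture output with os.popen(cmd).readlines() and regex-parse "pos aa score"
# lines. If this code were ever moved off legacy Python, the same capture-and-parse step could be
# written with subprocess instead; a minimal sketch under that assumption (run_predictor() and its
# arguments are hypothetical, not this module's API):
import re, subprocess

def run_predictor(exe, fasfile, method='short'):
    '''Run a disorder predictor on a fasta file and return per-residue float scores.'''
    out = subprocess.check_output([exe, fasfile, method], universal_newlines=True)
    scores = []
    for line in out.splitlines():
        m = re.match(r'^\s*(\d+)\s+(\S)\s+(\S+)', line)     # position, residue, score
        if m: scores.append(float(m.group(3)))
    return scores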
def forker(self):   ### Generic forking method
    '''
    Generic method for forking (without threads). Add description here (and arguments.)
    '''
    try:
        ### <0> ### Setup
        _stage = '<0> Fork Setup'
        forkx = int(self.stat['Forks'])             # Number of forks to have running at one time
        if self.opt['Win32'] or forkx < 1: self.opt['NoForks'] = True
        forks = []                                  # List of active fork PIDs
        killforks = int(self.stat['KillForks'])     # Time in seconds to wait after main thread has apparently finished forking
        forking_condition = True                    # Condition to keep forking

        ### Sequence List setup ###
        _stage = '<1> Forking'
        seqx = 0            # Sequence Counter
        subx = 0            # Subset sequence counter
        outfile = None      # Output file name
        randlist = []       # List of random strings for split sequence files
        filedict = {}       # Dictionary of input files for each random string
        seqlist = rje_seq.SeqList(log=self.log,cmd_list=['autoload=F']+self.cmd_list)
        seqlist.makeBaseFile()
        SEQFILE = open(seqlist.info['Name'],'r')
        (seq,lastline) = seqlist.nextFasSeq(SEQFILE,'Starting')
        while seq:
            seqlist.seq = [seq]
            if self.info['StartFrom'] != 'None':    # Not yet reached wanted sequence
                if self.info['StartFrom'] in [seq.info['Name'],seq.info['ID'],seq.info['AccNum'],seq.shortName()]:
                    self.info['StartFrom'] = 'None'
            if self.info['StartFrom'] == 'None':    # Wanted sequence
                if outfile: SEQOUT = open(outfile,'a')  # Append to current split file
                else:                                   # Create new split file
                    rs = rje.randomString(6)
                    while rs in randlist: rs = rje.randomString(6)
                    outfile = '%s.%s.fas' % (seqlist.info['Basefile'],rs)
                    SEQOUT = open(outfile,'w')
                    randlist.append(rs)
                    filedict[rs] = outfile
                SEQOUT.write('>%s\n%s\n' % (seq.info['Name'],seq.info['Sequence']))
                SEQOUT.close()
                seqx += 1
                subx += 1
                if subx == self.stat['Split']:      # Finished split
                    self.log.printLog('#SEQ','%s sequences output to %s.' % (rje.integerString(subx),outfile))
                    outfile = None
                    subx = 0
            (seq,lastline) = seqlist.nextFasSeq(SEQFILE,lastline)
        if subx > 0: self.log.printLog('#SEQ','%s sequences output to %s.' % (rje.integerString(subx),outfile))
        self.log.printLog('#SEQ','%s sequences output in total to %d files.' % (rje.integerString(seqx),len(randlist)))
        # Now have the list of random strings in randlist (in order) and filenames in filedict

        ### <1> ### Forking
        killtime = time.time()
        dealt_with = 0      # Split files dealt with
        while dealt_with < len(randlist) or len(forks):
            ## <a> ## New forks
            _stage = '<1a> New Forks'
            while dealt_with < len(randlist) and (len(forks) < forkx or self.opt['NoForks']):   # Add more forks
                _stage = '<1a-i> Fork: Get stuff for fork'
                killtime = time.time()              # Reset killtime - still doing stuff
                # Add new fork
                _stage = '<1a-ii> Fork: New Fork'
                new_fork_id = randlist[dealt_with]
                dealt_with += 1
                outcmd = string.split(self.info['OutCmd'],'.')
                if len(outcmd) > 1: outcmd = outcmd[:-1] + [new_fork_id] + outcmd[-1:]
                else: outcmd = outcmd + [new_fork_id] + ['resfile']
                outcmd = string.join(outcmd,'.')
                forkcmd = '%s %s%s %s %s log=%s.log newlog=T i=-1' % (self.info['ForkProg'],self.info['SeqInCmd'],filedict[new_fork_id],outcmd,self.info['ForkCmd'],new_fork_id)
                if self.opt['NoForks']: os.system(forkcmd)
                else:   # Forks
                    newpid = os.fork()
                    if newpid == 0:     # Child
                        os.system(forkcmd)
                        sys.exit()      # Exit process
                    elif newpid == -1: self.log.errorLog('Problem forking %s.' % new_fork_id)   # Error
                    else: forks.append(newpid)  # Add fork to list
            ## <b> ## Monitor and remove finished forks
            _stage = '<1b> Finished Forks'
            forklist = self._activeForks(forks)
            if len(forklist) != len(forks):
                self.verbose(0,2,' => %d of %d forks finished!' % (len(forks) - len(forklist),len(forks)),1)
                forks = forklist[0:]
            self.verbose(3,3,'End of a Cycle.',2)
            ## <c> ## Look for eternal hanging of forks
            _stage = '<1c> Hanging'
            if time.time() - killtime > killforks:
                self.verbose(0,1,'\n%d seconds of main program inactivity. %d forks still active!' % (killforks,len(forks)),1)
                for fork in forks: self.verbose(0,2,' => Fork PID %d still Active!' % (fork),1)
                if rje.yesNo('Kill?'): break    #!# killing options
                else: killtime = time.time()

        ### <3> ### Finish
        _stage = '<3> Finish'
        if len(forks) > 0: self.log.errorLog('%d Forks still active after %d seconds of main program inactivity' % (len(forks),killforks),True)
        else: self.verbose(0,1,'Forks have finished.',2)

        ### <4> ### Recompile results
        for randstr in randlist:
            os.unlink(filedict[randstr])
            rje.fileTransfer(fromfile='%s.log' % randstr,tofile=self.log.info['Name'],deletefrom=True)
            outfiles = glob.glob('*.%s.*' % randstr)
            for outfile in outfiles:
                compfile = outfile.split('.')
                compfile.remove(randstr)
                compfile = string.join(compfile,'.')
                if randstr == randlist[0] and os.path.exists(compfile) and not self.opt['Append']: os.unlink(compfile)
                rje.fileTransfer(fromfile=outfile,tofile=compfile,deletefrom=True)
                self.verbose(1,2,'Copying results data from %s to %s...' % (outfile,compfile),0)
            self.verbose(0,1,'%d results files copied for Split %d.' % (len(outfiles),(randlist.index(randstr)+1)),1)
        self.log.printLog('#OUT','Results for %d splits compiled.' % len(randlist))
    except SystemExit: sys.exit()   # Don't want forks raising an Exception upon exiting
    except:
        self.log.errorLog('Error in forker(%s):' % _stage,printerror=True,quitchoice=False)
        raise   # Delete this if method error not terrible
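# forker() above splits the input fasta into fixed-size chunks, each tagged with a random string,
# before forking one process per chunk and recompiling the per-chunk results. A stripped-down sketch
# of just the split step, using only the standard library (split_fasta() and its numeric tags are
# illustrative; the real method uses rje_seq and rje.randomString()):
import random

def split_fasta(infile, chunk_size, basefile):
    '''Write records from infile into <basefile>.<tag>.fas chunks; return the chunk file names.'''
    chunks = []; record_count = 0; out = None
    for line in open(infile):
        if line.startswith('>'):
            if record_count % chunk_size == 0:              # Start a new chunk file
                if out: out.close()
                tag = '%06d' % random.randint(0, 999999)
                chunks.append('%s.%s.fas' % (basefile, tag))
                out = open(chunks[-1], 'w')
            record_count += 1
        if out: out.write(line)
    if out: out.close()
    return chunks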
def nextJob(self,host_id):  ### Sets a new job running on host with given index    #V1.0
    '''Sets a new job running on host with given index.'''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        node = self.list['Hosts'][host_id]
        freemem = self.freeMem(node)
        if self.getBool('SeqBySeq'): return self.nextSeqJob(host_id)
        jdict = self.dict['Running'][host_id] = {}      # Setup empty dictionary to fill, if jobs available
        if self.getNum('MemFree') > freemem:
            jdict['PID'] = 'WAIT - %.1f%% %s mem' % (freemem*100.0,node)
            return
        if self.list['SubJobs']: job = self.list['SubJobs'].pop(0)
        else: return
        if self.getBool('RjePy'): jdict['Log'] = '%si_%s.log' % (self.getStr('RunPath'),rje.randomString(6))
        if self.getStr('JobINI'): job = '%s ini=%s' % (job,self.getStr('JobINI'))
        if 'Log' in jdict:
            job = '%s log=%s' % (job,jdict['Log'])
            try: open(jdict['Log'],'w')
            except: self.errorLog('Log problem. Aborting %s job.' % host_id); return self.endJob(host_id)
        ### ~ [2] ~ Add Job ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        rsh = "rsh %s '%s'" % (self.list['Hosts'][host_id],job)
        cpid = os.fork()    # Fork child process
        if cpid:            # Parent process records PID of child rsh process
            jdict['PID'] = cpid
            if self.rsh(): self.printLog('#SUB',rsh)
            else: self.printLog('#SUB',job)
            self.printLog('#JOB','Running job as %s: %d remain; %.1f%% mem free' % (cpid,len(self.list['SubJobs']),freemem*100.0))
        else:               # Child process runs the job, then exits
            if self.rsh(): os.system(rsh)
            else: os.system(job)
            os._exit(0)
    except SystemExit: raise    # Child exit
    except: self.errorLog('JobFarmer.nextJob error')
def uniFake(self,seqs=[],store=False):  ### Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs.
    '''Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs given.'''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        unifake = string.split(string.join(self.list['UniFake']).lower())
        seqlist = self.obj['SeqList']
        if seqs: seqlist.seq = seqs
        else: seqs = seqlist.seq
        (sx,seqnum) = (0,seqlist.seqNum())
        ## ~ [1b] Setup UniProt object and output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        uniprot = rje_uniprot.UniProt(self.log,self.cmd_list)   # UniProt object for saving data
        if self.info['DatOut'].lower() in ['','none']: self.info['DatOut'] = rje.baseFile(seqlist.info['Name']) + '.dat'
        datfile = self.info['DatOut']
        if os.path.exists(datfile): rje.backup(self,datfile)
        if store: seqlist.obj['UniProt'] = uniprot
        ## ~ [1c] Setup RJE_HMM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if 'pfam' in unifake:
            hmm = rje_hmm.HMMRun(self.log,self.cmd_list+['force=T'])
            hmmfile = '%s.pfam.tdt' % rje.baseFile(datfile)
            if os.path.exists(hmmfile): rje.backup(self,hmmfile)
            hmm.list['HMM'] = [self.info['PFam']]
            hmm.opt['HMMPFam'] = True
        else: hmm = None
        ## ~ [1d] Setup RJE_TM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if 'signalp' in unifake: tm = rje_tm.TM(self.log,self.cmd_list)
        else: tm = None
        ### ~ [2] ~ Perform UniFake processing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for seq in seqs:
            sx += 1
            name = seq.shortName()
            self.printLog('#SEQ','Processing %s (%s aa) %s...' % (seq.shortName(),rje.integerString(seq.aaLen()),seq.info['Description'][:50]))
            try:
                ## ~ [2a] ~ Basic data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                utmp = 'tmp%s.%s' % (rje.randomString(5),seq.info['AccNum'])
                open('%s.fas' % utmp,'w').write('>%s\n%s\n' % (seq.shortName(),seq.info['Sequence']))
                udata = {'CC':['-!- Features generated using unifake.py'],'AC':[]}
                if seq.info['SpecCode'] in ['Unknown','UNK']: seq.info['SpecCode'] = self.info['SPCode']
                #x#elif seq.info['Species'] != 'None': udata['OS'] = [seq.info['Species']]  #!# Check how well this works. Add spectable? #!#
                ## ~ [2b] ~ Aliases ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if self.opt['EnsDat'] and rje.matchExp('\[acc:(\S+) pep:(\S+) gene:(\S+)\]',seq.info['Name']):
                    details = rje.matchExp('\[acc:(\S+) pep:(\S+) gene:(\S+)\]',seq.info['Name'])
                    self.addAlias(seq.info['AccNum'],details[0])
                    self.addAlias(seq.info['AccNum'],details[1])
                    self.addAlias(seq.info['AccNum'],details[2])
                    udata['GN'] = [details[2]]
                for id in [seq.shortName(),seq.info['AccNum']]:
                    if id in self.dict['Aliases']: udata['AC'].append('%s;' % string.join(self.dict['Aliases'][id],'; '))
                ## ~ [2c] ~ Features ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                ft = []     # List of features for sequence
                for id in [seq.shortName(),seq.info['AccNum'],seq.info['ID']]:
                    if id in self.dict['Features']: ft += self.dict['Features'][id]
                ## ~ [2d] IUPRED disorder prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if 'disorder' in self.list['UniFake']:
                    try:
                        seq.disorder()
                        dis = seq.obj['Disorder']
                        for disorder in seq.obj['Disorder'].list['RegionDisorder']:
                            ft.append({'Type':'DISORDER','Desc':'Predicted disorder: %s' % seq.obj['Disorder'].info['Disorder'],'Start':disorder[0],'End':disorder[1]})
                            if dis.info['Disorder'].lower() == 'iupred': ft[-1]['Desc'] = '%s > %.2f' % (ft[-1]['Desc'],dis.stat['IUCut'])
                        for fold in seq.obj['Disorder'].list['RegionFold']:
                            ft.append({'Type':'ORDER','Desc':'Predicted order: %s' % seq.obj['Disorder'].info['Disorder'],'Start':fold[0],'End':fold[1]})
                            if dis.info['Disorder'].lower() == 'iupred': ft[-1]['Desc'] = '%s <= %.2f' % (ft[-1]['Desc'],dis.stat['IUCut'])
                    except: self.log.errorLog('UniFake disorder problem for %s.' % name)
                ## ~ [2e] PFam HMM domain prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if hmm:
                    try:
                        hmm.setInfo({'SearchDB':'%s.fas' % utmp,'HMMOut':'%s.hmm.out' % utmp})  # This will be made for each sequence
                        hmm.search = []
                        hmm.list['HMMRes'] = [hmm.hmmSearch(self.info['PFam'],outfile=hmm.info['HMMOut'])]  # Used in hmmTable
                        hmm.hmmTable(outfile=hmmfile,append=True)
                        if 'disorder' in self.list['UniFake']: disorder = seq.obj['Disorder'].list['ResidueDisorder']   # individual (IUPRed) residue results
                        else: disorder = []
                        if hmm.search: udata['CC'].append('PFam: HMMer PFam search vs %s (Modified %s)' % (self.info['PFam'],time.ctime(os.path.getmtime(self.info['PFam']))))
                        else:
                            udata['CC'].append('-!- ERROR: PFam HMMer Search failure!')
                            out = {'Type':'!ERROR!','Name':name}
                            rje.delimitedFileOutput(self,hmmfile,['Type','Name','Start','End','Eval','Score'],datadict=out)
                        for search in hmm.search:
                            for hit in search.hit:
                                for aln in hit.aln:
                                    pfamft = {'Start':aln.stat['SbjStart'],'End':aln.stat['SbjEnd'],'Type':'PFAM',
                                              'Desc':'%s PFam HMM Eval: %.2e; Score: %.1f' % (search.info['Name'],aln.stat['Expect'],aln.stat['BitScore'])}
                                    if disorder:
                                        region = disorder[aln.stat['SbjStart']-1:aln.stat['SbjEnd']]
                                        hmmdisorder = float(sum(region)) / len(region)
                                        pfamft['Desc'] = '%s; IUPRed: %.2f' % (pfamft['Desc'],hmmdisorder)
                                        if hmmdisorder < self.stat['DisDom']: pfamft['Type'] = 'DOMAIN'
                                    ft.append(pfamft)
                    except: self.log.errorLog('UniFake PFam HMM problem for %s.' % name)
                ## ~ [2f] TMHMM transmembrane topology prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if 'tmhmm' in unifake:
                    try:
                        tmdat = os.popen('%s %s.fas -short' % (self.info['TMHMM'],utmp)).readlines()
                        domlist = rje_tm.domainList(rje_tm.parseTMHMM(tmdat[0]))
                        for tmdom in domlist:
                            ft.append(tmdom)
                            ft[-1]['Desc'] = 'TMHMM topology prediction'
                            ft[-1]['Start'] = string.atoi(ft[-1]['Start'])
                            ft[-1]['End'] = string.atoi(ft[-1]['End'])
                        if len(domlist) > 1: udata['CC'].append('TMHMM: %d TM domains; N-Term %s' % ((len(domlist)-1)/2,domlist[0]['Type']))
                        else: udata['CC'].append('TMHMM: 0 TM domains')
                    except: self.log.errorLog('UniFake TMHMM problem for %s.' % name)
                ## ~ [2g] SIGNALP signal peptide prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if 'signalp' in unifake:
                    try:
                        os.system('%s -f short -t euk %s.fas > %s.signalp' % (self.info['SignalP'],utmp,utmp))
                        tm.signalp = {}
                        tm.parseSignalP('%s.signalp' % utmp)
                        sigp = tm.signalp.pop(seq.shortName())
                        cpos = 0
                        if sigp['nn_ymax?'] == 'Y':
                            cpos = string.atoi(sigp['nn_ymaxpos'])
                            desc = 'SignalP NN prediction'
                        if sigp['hmm_cmax?'] == 'Y':
                            hmm_c = string.atoi(sigp['hmm_cmaxpos'])
                            if cpos == 0:
                                cpos = hmm_c
                                desc = 'SignalP HMM prediction'
                            else:
                                if hmm_c < cpos:
                                    cpos = hmm_c
                                    desc = 'SignalP HMM prediction (NN also Y)'
                                else: desc += ' (HMM also Y)'
                        if cpos > 0: ft.append({'Type':'SIGNALP','Desc':desc,'Start':1,'End':cpos})
                    except: self.log.errorLog('UniFake SignalP problem for %s.' % name)
                ## ~ [2h] Convert to UniProt and save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                self.addRealUniProt(seq,udata,ft)
                self.deBug(ft)
                if not store: uniprot.list['Entry'] = []
                if uniprot.addFromSeq(seq,data=udata,ft=ft):    ### Converts into UniProtEntry object
                    if not store: uniprot.saveUniProt(datfile,append=True)
                    #x#open(self.info['DatPickup'],'a').write('%s\n' % seq.shortName())
            except: self.log.errorLog('Problem during UniFake(%s)' % name)
            ## ~ [2i] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            for tmp in glob.glob('%s*' % utmp): os.unlink(tmp)
            self.printLog('#UNIFAKE','|---------- %s run <<<|>>> %s to go -----------|' % (rje.integerString(sx),rje.integerString(seqnum-sx)),log=False)
        if store: uniprot.saveUniProt(datfile,append=False)
        if self.opt['CleanUp']:
            for tmp in glob.glob('TMHMM*'):
                if os.path.isdir(tmp): os.rmdir(tmp)
    except: self.errorLog('Oh, the shame of it! Trouble during UniFake.uniFake()')