def interactiveUpdate(self):    ### Interactive method for updating failed genes
    '''
    Interactive method for updating failed genes.

    Gathers every gene in self.list['Genes'] whose GeneCard 'Symbol' is '!FAILED!', asks the user for a
    replacement symbol for each one, runs self.processGenes() on the replacements and then points each failed
    gene at its replacement's GeneCard entry. The method finishes by calling itself, so the user is asked again
    while failures remain. Nothing happens if self.stat['Interactive'] < 0.
    << None. Exceptions are reported through self.log.errorLog().
    '''
    try:
        ### ~ Setup failed list and check interactivity ~~~ ###
        if self.stat['Interactive'] < 0:
            return
        failed = [gene for gene in self.list['Genes']
                  if self.dict['GeneCard'][gene]['Symbol'] == '!FAILED!']
        if not failed:
            return
        if not rje.yesNo('Try manual mapping of %d failures?' % len(failed)):
            return
        ### ~ Manually map failures onto new gene list and try extracting ~~~ ###
        remap = {}          # failed gene -> replacement symbol
        replacements = []   # unique replacement symbols, in the order they were entered
        try:
            for gene in failed:
                symbol = rje.choice('New gene symbol for > %s <?' % gene)
                if symbol:
                    remap[gene] = symbol
                    if symbol not in replacements:
                        replacements.append(symbol)
        except KeyboardInterrupt:
            # CTRL+C stops the prompting; mappings entered so far are still processed unless the user quits.
            if rje.yesNo('Quit GeneCards?', default='N'):
                raise
        self.processGenes(replacements)
        for gene in remap:
            self.dict['GeneCard'][gene] = self.dict['GeneCard'][remap[gene]]
        return self.interactiveUpdate()
    except:
        self.log.errorLog('Problem during rje_GeneCards.interactiveUpdate()')
def cmdHelp(info=None, out=None, cmd_list=None):   ### Prints *.__doc__ and asks for more sys.argv commands
    '''
    Prints *.__doc__ and asks for more sys.argv commands.
    >> info = object providing program, version and start_time attributes [makeInfo() if None]
    >> out = object providing verbose() and stat['Interactive'] [rje.Out() if None]
    >> cmd_list = list of commands to scan for help requests. A supplied list is extended in place. [new list]
    << cmd_list, with any interactively entered commands appended. None if an unexpected error was caught.
    '''
    try:
        if info is None:
            info = makeInfo()
        if out is None:
            out = rje.Out()
        # A fresh list per call: the previous shared `[]` default was extended in place by `+=` below and
        # handed back to the caller, so commands leaked from one default-argument call into the next.
        if cmd_list is None:
            cmd_list = []
        helpx = cmd_list.count('help') + cmd_list.count('-help') + cmd_list.count('-h')
        if helpx > 0:
            # One parenthesised argument prints identically as a Python 2 statement or a Python 3 call.
            print('\n\nHelp for %s %s: %s\n' % (info.program, info.version,
                                                 time.asctime(time.localtime(info.start_time))))
            out.verbose(-1, 4, text=__doc__)
            if rje.yesNo('Show general commandline options?'):
                out.verbose(-1, 4, text=rje.__doc__)
            if rje.yesNo('Quit?'):
                sys.exit()
            cmd_list += rje.inputCmds(out, cmd_list)
        elif out.stat['Interactive'] > 1:
            cmd_list += rje.inputCmds(out, cmd_list)    # Ask for more commands
        return cmd_list
    except SystemExit:
        sys.exit()
    except KeyboardInterrupt:
        sys.exit()
    except:
        print('Major Problem with cmdHelp()')
def cmdHelp(info=None, out=None, cmd_list=None):  ### Prints *.__doc__ and asks for more sys.argv commands
    """
    Prints *.__doc__ and asks for more sys.argv commands.
    >> info = object providing program, version and start_time attributes [makeInfo() if not given]
    >> out = object providing verbose() and stat["Interactive"] [rje.Out() if not given]
    >> cmd_list = list of commands to scan for help requests. A supplied list is extended in place. [new list]
    << cmd_list, with any interactively entered commands appended. None if an unexpected error was caught.
    """
    try:
        ### ~ [1] ~ Setup ~~~ ###
        if not info:
            info = makeInfo()
        if not out:
            out = rje.Out()
        # A fresh list per call: the previous shared `[]` default was extended in place by `+=` below and
        # handed back to the caller, so commands leaked from one default-argument call into the next.
        if cmd_list is None:
            cmd_list = []
        ### ~ [2] ~ Look for help commands and print options if found ~~~ ###
        # Named helpx rather than help so the builtin help() is not shadowed.
        helpx = cmd_list.count("help") + cmd_list.count("-help") + cmd_list.count("-h")
        if helpx > 0:
            # One parenthesised argument prints identically as a Python 2 statement or a Python 3 call.
            print("\n\nHelp for %s %s: %s\n" % (
                info.program,
                info.version,
                time.asctime(time.localtime(info.start_time)),
            ))
            out.verbose(-1, 4, text=__doc__)
            if rje.yesNo("Show general commandline options?"):
                out.verbose(-1, 4, text=rje.__doc__)
            if rje.yesNo("Show disorder commandline options?"):
                out.verbose(-1, 4, text=rje_disorder.__doc__)
            if rje.yesNo("Quit?"):
                sys.exit()  # Option to quit after help
            cmd_list += rje.inputCmds(out, cmd_list)  # Add extra commands interactively.
        elif out.stat["Interactive"] > 1:
            cmd_list += rje.inputCmds(out, cmd_list)  # Ask for more commands
        ### ~ [3] ~ Return commands ~~~ ###
        return cmd_list
    except SystemExit:
        sys.exit()
    except KeyboardInterrupt:
        sys.exit()
    except:
        print("Major Problem with cmdHelp()")
def cmdHelp(info=None, out=None, cmd_list=None):  ### Prints *.__doc__ and asks for more sys.argv commands
    '''
    Prints *.__doc__ and asks for more sys.argv commands.
    >> info = object providing program, version and start_time attributes [makeInfo() if not given]
    >> out = object providing verbose() and stat['Interactive'] [rje.Out() if not given]
    >> cmd_list = list of commands to scan for help requests. A supplied list is extended in place. [new list]
    << cmd_list, with any interactively entered commands appended. None if an unexpected error was caught.
    '''
    try:
        ### ~ [1] ~ Setup ~~~ ###
        if not info:
            info = makeInfo()
        if not out:
            out = rje.Out()
        # A fresh list per call: the previous shared `[]` default was extended in place by `+=` below and
        # handed back to the caller, so commands leaked from one default-argument call into the next.
        if cmd_list is None:
            cmd_list = []
        ### ~ [2] ~ Look for help commands and print options if found ~~~ ###
        # Named helpx rather than help so the builtin help() is not shadowed.
        helpx = cmd_list.count('help') + cmd_list.count('-help') + cmd_list.count('-h')
        if helpx > 0:
            rje.printf('\n\nHelp for {0} {1}: {2}\n'.format(
                info.program, info.version, time.asctime(time.localtime(info.start_time))))
            out.verbose(-1, 4, text=__doc__)
            if rje.yesNo('Show general commandline options?'):
                out.verbose(-1, 4, text=rje.__doc__)
            if rje.yesNo('Quit?'):
                sys.exit()  # Option to quit after help
            cmd_list += rje.inputCmds(out, cmd_list)  # Add extra commands interactively.
        elif out.stat['Interactive'] > 1:
            cmd_list += rje.inputCmds(out, cmd_list)  # Ask for more commands
        ### ~ [3] ~ Return commands ~~~ ###
        return cmd_list
    except SystemExit:
        sys.exit()
    except KeyboardInterrupt:
        sys.exit()
    except:
        rje.printf('Major Problem with cmdHelp()')
def cmdHelp(info=None, out=None, cmd_list=None):  ### Prints *.__doc__ and asks for more sys.argv commands
    '''
    Prints *.__doc__ and asks for more sys.argv commands.
    >> info = object providing program, version and start_time attributes [makeInfo() if not given]
    >> out = object providing verbose() and stat['Interactive'] [rje.Out() if not given]
    >> cmd_list = list of commands to scan for help requests. A supplied list is extended in place. [new list]
    << cmd_list, with any interactively entered commands appended. None if an unexpected error was caught.
    '''
    try:
        if not info:
            info = makeInfo()
        if not out:
            out = rje.Out()
        # A fresh list per call: the previous shared `[]` default was extended in place by `+=` below and
        # handed back to the caller, so commands leaked from one default-argument call into the next.
        if cmd_list is None:
            cmd_list = []
        # Named helpx rather than help so the builtin help() is not shadowed.
        helpx = cmd_list.count('help') + cmd_list.count('-help') + cmd_list.count('-h')
        if helpx > 0:
            # One parenthesised argument prints identically as a Python 2 statement or a Python 3 call.
            print('\n\nHelp for %s %s: %s\n' % (
                info.program, info.version, time.asctime(time.localtime(info.start_time))))
            out.verbose(-1, 4, text=__doc__)
            if rje.yesNo('Show general commandline options?'):
                out.verbose(-1, 4, text=rje.__doc__)
            if rje.yesNo('Quit?'):
                sys.exit()
            cmd_list += rje.inputCmds(out, cmd_list)
        elif out.stat['Interactive'] > 1:
            cmd_list += rje.inputCmds(out, cmd_list)  # Ask for more commands
        return cmd_list
    except SystemExit:
        sys.exit()
    except KeyboardInterrupt:
        sys.exit()
    except:
        print('Major Problem with cmdHelp()')
def cmdHelp(info=None, out=None, cmd_list=None):  ### Prints *.__doc__ and asks for more sys.argv commands
    """
    Prints *.__doc__ and asks for more sys.argv commands.
    >> info = object providing program, version and start_time attributes [makeInfo() if None]
    >> out = object providing verbose() and stat["Interactive"] [rje.Out() if None]
    >> cmd_list = list of commands to scan for help requests. A supplied list is extended in place. [new list]
    << cmd_list, with any interactively entered commands appended. None if an unexpected error was caught.
    """
    try:
        if info is None:
            info = makeInfo()
        if out is None:
            out = rje.Out()
        # A fresh list per call: the previous shared `[]` default was extended in place by `+=` below and
        # handed back to the caller, so commands leaked from one default-argument call into the next.
        if cmd_list is None:
            cmd_list = []
        # Named helpx rather than help so the builtin help() is not shadowed.
        helpx = cmd_list.count("help") + cmd_list.count("-help") + cmd_list.count("-h")
        if helpx > 0:
            # One parenthesised argument prints identically as a Python 2 statement or a Python 3 call.
            print("\n\nHelp for %s %s: %s\n" % (
                info.program,
                info.version,
                time.asctime(time.localtime(info.start_time)),
            ))
            out.verbose(-1, 4, text=__doc__)
            if rje.yesNo("Show general commandline options?"):
                out.verbose(-1, 4, text=rje.__doc__)
            if rje.yesNo("Quit?"):
                sys.exit()
            cmd_list += rje.inputCmds(out, cmd_list)
        elif out.stat["Interactive"] > 1:
            cmd_list += rje.inputCmds(out, cmd_list)  # Ask for more commands
        return cmd_list
    except SystemExit:
        sys.exit()
    except KeyboardInterrupt:
        sys.exit()
    except:
        print("Major Problem with cmdHelp()")
def forking(self):  ### Keeps forking out and processing jobs until no more jobs in self.list['Forked'].
    '''
    Keeps forking out and processing jobs until no more jobs in self.list['Forked'].

    Each pass polls every fork dictionary in self.list['Forked']; finished forks (and 'WAIT' placeholders) are
    removed and handed to self.endFork(). If the time since 'KillTime' exceeds 'KillForks' seconds, the user
    can kill the main thread (ValueError), kill the remaining forks, or reset the timer. The pass ends with a
    'ForkSleep' second pause.
    '''
    ### ~ [1] ~ Optional *.pid tracking file, named after the log file ~~~ ###
    pidfile = False
    if self.getBool('PIDCheck') or self.dev():
        pidfile = '%s.pid' % rje.baseFile(self.log.info['LogFile'])
    ### ~ [2] ~ Monitor jobs and set next one running as they finish ~~~ ###
    while self.list['Forked']:
        if pidfile:
            PIDOUT = open(pidfile, 'w')
        for job in list(self.list['Forked']):   # Iterate over a copy: the live list shrinks as forks finish
            try:
                pid = job['PID']
                if pidfile:
                    PIDOUT.write('%s: %s\n' % (self.list['Forked'].index(job), pid))
                if ('%s' % pid).split()[0] == 'WAIT':
                    finished = 1
                else:
                    # First element of waitpid() is 0 while the child is still running under WNOHANG
                    finished = os.waitpid(pid, os.WNOHANG)[0]
            except:
                self.errorLog('!')
                finished = 1
            if finished > 0:
                self.list['Forked'].remove(job)
                self.endFork(job)   # Fork has finished: can replace with processing
        if pidfile:
            PIDOUT.close()
        ## ~ [2a] Look for eternal hanging of threads ~~~ ##
        if time.time() - self.getNum('KillTime') > self.getNum('KillForks'):
            hangtxt = '%d seconds of main thread inactivity. %d forks still active!' % (
                self.getNum('KillForks'), len(self.list['Forked']))
            self.verbose(0, 1, '\n%s' % hangtxt, 1)
            for job in self.list['Forked']:
                self.verbose(0, 2, ' => Fork %s, PID %d still Active!' % (job['ID'], job['PID']), 1)
            if self.i() < 0 or rje.yesNo('Kill Main Thread?'):
                raise ValueError(hangtxt)
            if rje.yesNo('Kill hanging forks?'):
                for job in self.list['Forked']:
                    self.printLog('#KILL', 'Killing Fork %s, PID %d.' % (job['ID'], job['PID']))
                    os.system('kill %d' % job['PID'])
            else:
                self.setNum({'KillTime': time.time()})
        ## ~ [2b] Sleep ~~~ ##
        time.sleep(self.getNum('ForkSleep'))
def gasp(self):      ### Main GASP Method, copied from GASP v1.4
    '''
    Main GASP Method, copied from GASP v1.4.

    Runs rje_ancseq.Gasp on self.obj['Tree'] unless 'IndelTree' is set and the last tree node already has a
    sequence, recording the anc.fas/anc.nsf/anc.txt output names in self.dict['Output']. If 'IndelTree' is
    set, the indel tree is then written to <basefile>.indel.txt. Errors are reported via self.errorLog().
    '''
    try:
        ### ~ [1] Setup ~~~ ###
        tree = self.obj['Tree']
        want_indels = self.getBool('IndelTree')
        if self.baseFile() == 'infile':
            self.baseFile('gasp')
        ### ~ [2] GASP ~~~ ###
        run_gasp = not want_indels or not tree.node[-1].obj['Sequence']
        if run_gasp:
            self.printLog('#SEQ', 'GASP: Gapped Ancestral Sequence Prediction')
            predictor = rje_ancseq.Gasp(tree=tree, ancfile='%s' % self.baseFile(),
                                        cmd_list=self.cmd_list, log=self.log)
            self.verbose(0, 2, '%s' % predictor.details(), 1)
            if self.i() > 0 and not rje.yesNo('Use these parameters?'):
                predictor.edit()
            predictor.gasp()
            self.printLog('#GASP', "GASP run completed OK!")
            for outkey in ('anc.fas', 'anc.nsf', 'anc.txt'):
                self.dict['Output'][outkey] = '%s.%s' % (self.baseFile(), outkey)
        ### ~ [3] InDel Tree ~~~ ###
        if want_indels:
            self.dict['Output']['indeltree'] = '%s.indel.txt' % self.baseFile()
            tree.indelTree(filename='%s.indel.txt' % self.baseFile())
    except:
        self.errorLog('%s.gasp error' % self.prog())
def convert(self,filelist=[],outfile=None):      ### Converts scansite output files in FileList to Outfile
    '''
    Converts scansite output files in FileList to Outfile.
    >> filelist = list of scansite result files to parse [self.list['FileList'] if empty]
    >> outfile = delimited output file name [self.info['Name'] if not given]
    << True once conversion has run; False if there are no files to convert.
    Any exception is logged with the current stage and then re-raised.
    '''
    # NOTE: the `[]` default is only ever rebound, never mutated, so it is safe to keep for compatibility.
    _stage = 'Setup'
    try:
        ### Setup ###
        if not filelist:
            filelist = self.list['FileList']
        if not outfile:
            outfile = self.info['Name']
        if not filelist:
            self.log.errorLog('No scansite files to convert! %s unchanged/not made.' % outfile,printerror=False)
            return False
        delimit = rje.getDelimit(self.cmd_list)
        ext = rje.delimitExt(delimit)
        if ext != outfile[-3:]:     # Offer to make the file extension match the delimiter
            newfile = outfile[:-3] + ext
            if rje.yesNo('Change file name from %s to %s?' % (outfile, newfile)):
                outfile = newfile
        self.log.printLog('#OUT','Converting %d file(s), output to %s.' % (len(filelist),outfile))

        ### Output File ###
        _stage = 'Output File'
        makehead = not self.opt['Append'] or not os.path.exists(outfile)    # New file: needs a header row
        if makehead:
            OUTFILE = open(outfile,'w')
        else:
            OUTFILE = open(outfile,'a')
        try:    # finally: guarantees the output handle is closed even if parsing fails part-way
            if makehead:
                headers = ['seq_id','enzyme','enz_group','aa','pos','score','percentile','matchseq','sa']
                rje.writeDelimit(OUTFILE,headers,delimit)

            ### Conversion ###
            _stage = 'Conversion'
            sx = 0      # Total results written
            for infile in filelist:
                if not os.path.exists(infile):
                    self.log.errorLog('Input file %s does not exist! :o(' % infile,False,False)
                    continue
                fx = 0  # Results written from this file
                INFILE = open(infile,'r')
                try:
                    inline = rje.nextLine(INFILE)
                    while inline is not None:
                        scanlist = rje.matchExp(re_scansite,inline)    # Match once and reuse the result
                        if scanlist:
                            rje.writeDelimit(OUTFILE,scanlist,delimit)
                            sx += 1
                            fx += 1
                            rje.progressPrint(self,sx)
                        inline = rje.nextLine(INFILE)
                finally:
                    INFILE.close()
                self.log.printLog('#OUT','%s scansite results from %s. (%s Total.)' % (rje.integerString(fx),infile,rje.integerString(sx)))

            ### End ###
            _stage = 'End'
        finally:
            OUTFILE.close()
        self.log.printLog('#OUT','%s scansite results output to %s.' % (rje.integerString(sx),outfile))
        return True
    except:
        self.log.errorLog('Error in convert(%s)' % _stage,printerror=True,quitchoice=False)
        raise
def save(self):  ### Saves parsed REST output to files
    '''
    Saves parsed REST output to files.

    The 'Rest' setting selects what is written:
    - a key of self.dict['Output']: only that output is saved to its self.dict['Outfile'] path;
    - 'full' or 'text': self.restFullOutput() is saved to <RestOutDir><RestBase>.rest;
    - any other non-empty value: logged as unrecognised, then (interactive confirmation only) the full
      output is saved to the *.rest file;
    - empty: every parsed output that has an entry in self.dict['Outfile'] is saved.
    << True after writing the *.rest file, False if an unrecognised format is not confirmed, and None after
       saving individual outputs.
    '''
    rbase = '%s%s' % (self.getStr('RestOutDir'),
                      rje.baseFile(self.getStr('RestBase'), strip_path=True, keepext=True))
    rje.mkDir(self, self.getStr('RestOutDir'))
    outputs = rje.sortKeys(self.dict['Output'])
    if self.getStrLC('Rest') in outputs:
        outputs = [self.getStrLC('Rest')]
    elif self.getStrLC('Rest') in ['full', 'text']:
        outfile = '%s.rest' % rbase
        with open(outfile, 'w') as REST:    # with: closes the handle promptly instead of relying on GC
            REST.write(self.restFullOutput())
        self.printLog('#OUT', '%s: %s' % (self.getStrLC('Rest'), outfile))
        return True
    elif self.getStrLC('Rest'):
        self.printLog('#OUTFMT', 'REST output format "%s" not recognised.' % self.getStrLC('Rest'))
        if self.i() < 0 or not rje.yesNo('Output all parsed outputs?'):
            return False
        outfile = '%s.rest' % rbase
        with open(outfile, 'w') as REST:
            REST.write(self.restFullOutput())
        self.printLog('#OUT', 'full: %s' % (outfile))
        return True
    for rkey in outputs:
        if rkey in self.dict['Outfile']:
            rje.backup(self, self.dict['Outfile'][rkey])
            with open(self.dict['Outfile'][rkey], 'w') as OUT:
                OUT.write(self.dict['Output'][rkey])
            self.printLog('#OUT', '%s: %s' % (rkey, self.dict['Outfile'][rkey]))
        elif rkey not in ['intro']:     # 'intro' has no file of its own, so its absence is not warned about
            self.warnLog('No outfile parsed/generated for %s output' % rkey)
def cmdHelp(info=None,out=None,cmd_list=None):   ### Prints *.__doc__ and asks for more sys.argv commands
    '''
    Prints *.__doc__ and asks for more sys.argv commands.
    >> info = object providing program, version and start_time attributes [makeInfo() if not given]
    >> out = object providing verbose() and stat['Interactive'] [rje.Out() if not given]
    >> cmd_list = list of commands to scan for help requests. A supplied list is extended in place. [new list]
    << cmd_list, with any interactively entered commands appended. None if an unexpected error was caught.
    '''
    try:
        ### ~ [1] ~ Setup ~~~ ###
        if not info:
            info = makeInfo()
        if not out:
            out = rje.Out()
        # A fresh list per call: the previous shared `[]` default was extended in place by `+=` below and
        # handed back to the caller, so commands leaked from one default-argument call into the next.
        if cmd_list is None:
            cmd_list = []
        ### ~ [2] ~ Look for help commands and print options if found ~~~ ###
        cmd_help = cmd_list.count('help') + cmd_list.count('-help') + cmd_list.count('-h')
        if cmd_help > 0:
            # One parenthesised argument prints identically as a Python 2 statement or a Python 3 call.
            print('\n\nHelp for %s %s: %s\n' % (info.program, info.version,
                                                 time.asctime(time.localtime(info.start_time))))
            out.verbose(-1,4,text=__doc__)
            if rje.yesNo('Show general commandline options?'):
                out.verbose(-1,4,text=rje.__doc__)
            if rje.yesNo('Quit?'):
                sys.exit()           # Option to quit after help
            cmd_list += rje.inputCmds(out,cmd_list)     # Add extra commands interactively.
        elif out.stat['Interactive'] > 1:
            cmd_list += rje.inputCmds(out,cmd_list)     # Ask for more commands
        ### ~ [3] ~ Return commands ~~~ ###
        return cmd_list
    except SystemExit:
        sys.exit()
    except KeyboardInterrupt:
        sys.exit()
    except:
        print('Major Problem with cmdHelp()')
def setup(self):    ### Main class setup method.
    '''
    Main class setup method.

    Looks up the lower-case 'Name' setting in the module-level `mod` mapping, builds that program's command
    list (its cmdHelp() is used if 'Help' is set) and instantiates the matching program object into
    self.obj['Prog']. If no program is made, the user may (when interactive) view help and/or enter another
    program name, and setup is retried.
    << self.obj['Prog'] on success; False if setup failed, was declined or was interrupted with CTRL+C.
    '''
    try:
        ### ~ [1] ~ Setup Program ~~~ ###
        self.obj['Prog'] = None
        prog = self.getStrLC('Name')
        if prog in mod:
            proginfo = mod[prog].makeInfo()
            self.obj['ProgInfo'] = proginfo
            self.printLog('#PROG','%s V%s: %s' % (proginfo.program,proginfo.version,proginfo.description))
            progcmd = rje.getCmdList([],info=proginfo) + self.cmd_list + ['newlog=F']
            out = rje.Out(cmd_list=progcmd)
            out.printIntro(proginfo)
            if self.getBool('Help'):
                progcmd = mod[prog].cmdHelp(proginfo,out,['help']+progcmd)
            tidycmd = rje.tidyArgs(progcmd,nopath=self.getStrLC('Rest') and not self.dev(),purgelist=purgelist)
            self.printLog('#CMD','Full %s CmdList: %s' % (proginfo.program,rje.argString(tidycmd)),screen=False)
            ## ~ [1a] ~ Make self.obj['Prog'] ~~~ ##
            # Ordered (names, factory) pairs; factories are lambdas so only the selected class is looked up.
            makers = (
                (['seqlist','rje_seqlist'], lambda: rje_seqlist.SeqList(self.log,progcmd)),
                (['uniprot','rje_uniprot'], lambda: rje_uniprot.UniProt(self.log,progcmd)),
                (['taxonomy','rje_taxonomy'], lambda: rje_taxonomy.Taxonomy(self.log,progcmd)),
                (['tree','rje_tree'], lambda: rje_tree.Tree(self.log,progcmd)),
                (['xref','rje_xref'], lambda: rje_xref.XRef(self.log,progcmd)),
                (['seq','rje_seq'], lambda: rje_seq.SeqList(self.log,progcmd)),
                (['mitab','rje_mitab'], lambda: rje_mitab.MITAB(self.log,progcmd)),
                (['dbase','database'], lambda: rje_dbase.DatabaseController(self.log,progcmd)),
                (['pydocs'], lambda: rje_pydocs.PyDoc(self.log,progcmd)),
                (['ensembl','rje_ensembl'], lambda: rje_ensembl.EnsEMBL(self.log,progcmd)),
                (['genbank','rje_genbank'], lambda: rje_genbank.GenBank(self.log,progcmd)),
                (['extatic'], lambda: extatic.ExTATIC(self.log,progcmd)),
                (['revert'], lambda: revert.REVERT(self.log,progcmd)),
                (['fiesta'], lambda: fiesta.FIESTA(self.log,progcmd)),
                (['gablam'], lambda: gablam.GABLAM(self.log,progcmd)),
                (['gopher'], lambda: gopher.Gopher(self.log,progcmd)),
                (['haqesac'], lambda: haqesac.HAQESAC(self.log,progcmd)),
                (['multihaq'], lambda: multihaq.MultiHAQ(self.log,progcmd)),
                (['pingu'], lambda: pingu.PINGU(self.log,progcmd)),
                (['pacbio'], lambda: rje_pacbio.PacBio(self.log,progcmd)),
                (['rje_zen','zen'], lambda: rje_zen.Zen(self.log,progcmd)),
            )
            for (names,make) in makers:
                if prog in names:
                    self.obj['Prog'] = make()
                    break
        ### ~ [2] ~ Failure to recognise program ~~~ ###
        if self.obj['Prog']:
            return self.obj['Prog']     # Setup successful
        self.printLog('#ERR','Program "%s" not recognised.' % self.getStr('Name'))
        if self.i() < 0:
            return False
        if rje.yesNo('Show SeqSuite help with program options?'):
            extracmd = cmdHelp(cmd_list=['help'])[1:]   # Drop the leading 'help' command
            if extracmd:
                self.cmd_list += extracmd
                self._cmdList()
                if prog != self.getStrLC('Name'):   # New commands changed the program name: try again
                    return self.setup()
        self.setStr({'Name':rje.choice('Give program name (Blank or CTRL+C to quit)')})
        if self.getStrLC('Name'):
            return self.setup()
        return False
    except KeyboardInterrupt:
        return False
    except SystemExit:
        raise
    except:
        self.errorLog('Problem during %s setup.' % self.prog())
        return False  # Setup failed
def findOccPos(callobj,Occ,qry,fudge=0):    ### Finds Motif Occurence in alignment
    '''
    Finds Motif Occurence in alignment.
    >> callobj = calling MotifList object
    >> Occ = MotifOcc object
    >> qry = query Sequence object from alignment file
    >> fudge = amount to try shifting match to find occurrence is non-matching sequence
    << (start,end) = start and end position in aligment to allow sequence[start:end]
       (-1,-1) is returned if the occurrence cannot be located or an error is caught.
    '''
    try:
        ### Find Hit in Alignment ###
        alnseq = qry.info['Sequence']   # Looked up once, not on every loop pass
        alnlen = qry.seqLen()
        (start,end) = (-1,alnlen)       # Start and end positions of match *in alignment*
        qpos = Occ.stat['Pos'] + fudge  # Starting position of hit (from 1->L)
        qmatch = Occ.getData('Match')
        qend = qpos + len(qmatch) - 1   # Ending position of hit (1->L)
        (r,a) = (0,0)                   # Counters for aln residues (r) and amino acid positions (a)
        while r < alnlen:
            if alnseq[r] != '-':        # Not a gap: increment a by 1
                a += 1
            if a == qpos and start < 0: # Start of match (not yet r+=1 because pos is 1->L)
                start = r
            r += 1                      # Move on through aligned sequences
            if a == qend:               # End of match
                end = r
                break

        ### Assess whether hit is right! ###
        # str.replace() replaces the deprecated string.replace(), which no longer exists in Python 3.
        amatch = alnseq[start:end].replace('-','')
        if amatch == qmatch:            # Everything is OK!
            return (start,end)
        ## Check whether already fudging! ##
        if fudge != 0:                  # A fudged retry that still fails is an error (reported below)
            raise ValueError

        ### Something is wrong! Try to find real match! ###
        etxt = 'Alignment sequence (%s) does not match occurence (%s)' % (amatch,qmatch)
        ## Try to find match by moving start (using fudge) ##
        if callobj.stat['Interactive'] < 1 or rje.yesNo('%s. Try to find closest correct match?' % etxt):
            fudge = findFudge(alnseq.replace('-',''),qmatch,Occ.stat['Pos']-1)
            if fudge:
                if callobj.stat['Interactive'] > 0:
                    callobj.log.errorLog('%s in alignment differs from input: Fudged %s by %d aa!' % (qry.shortName(),qmatch,fudge),printerror=False)
                else:
                    callobj.log.printLog('#ERR','%s in alignment differs from input: Fudged %s by %d aa!' % (qry.shortName(),qmatch,fudge),screen=False)
                return findOccPos(callobj,Occ,qry,fudge)
            callobj.log.errorLog('%s in alignment differs from input: Cannot find %s anywhere' % (qry.shortName(),qmatch),printerror=False)
            return (-1,-1)
        callobj.log.errorLog(etxt,printerror=False)
        return (-1,-1)
    except:
        callobj.log.errorLog('Something bad has happened in rje_motif_stats.findOccPos()')
        return (-1,-1)
def setup(self,rest=False):    ### Main class setup method.
    '''
    Main class setup method.

    Looks up the lower-case 'Name' setting in the module-level `mod` mapping and instantiates the matching
    SLiMSuite program into self.obj['Prog'], storing the module docstring as its 'help' output. Names found
    only in seqsuite.mod are set up through a seqsuite.SeqSuite object instead. If no program is made, the
    user may (when interactive) view help and/or enter another program name, and setup is retried.
    >> rest = if True, an unrecognised (non-'help') program returns False without an error log or prompts
    << self.obj['Prog'] on success; False if setup failed, was declined or was interrupted with CTRL+C.
    '''
    try:
        ### ~ [1] ~ Setup Program ~~~ ###
        self.obj['Prog'] = None
        prog = self.getStrLC('Name')
        if prog in mod:
            proginfo = mod[prog].makeInfo()
            self.obj['ProgInfo'] = proginfo
            self.printLog('#PROG','%s V%s: %s' % (proginfo.program,proginfo.version,proginfo.description))
            progcmd = rje.getCmdList([],info=proginfo) + self.cmd_list + ['newlog=F']
            out = rje.Out(cmd_list=progcmd)
            out.printIntro(proginfo)
            if self.getBool('Help'):
                progcmd = mod[prog].cmdHelp(proginfo,out,['help']+progcmd)
            purgelist = seqsuite.purgelist
            tidycmd = rje.tidyArgs(progcmd,nopath=self.getStrLC('Rest') and not self.dev(),purgelist=purgelist)
            self.printLog('#CMD','Full %s CmdList: %s' % (proginfo.program,rje.argString(tidycmd)),screen=False)
            ## ~ [1a] ~ Make self.obj['Prog'] ~~~ ##
            # Ordered (names, factory) pairs; factories are lambdas so only the selected class is looked up.
            makers = (
                (['slimcore','rje_slimcore','core'], lambda: rje_slimcore.SLiMCore(self.log,progcmd)),
                (['rlc','iuscore'], lambda: rje_slimcore.SLiMCore(self.log,progcmd+['prog=%s' % prog])),
                (['slimlist','rje_slimlist'], lambda: rje_slimlist.SLiMList(self.log,progcmd)),
                (['slimfinder'], lambda: slimfinder.SLiMFinder(self.log,progcmd)),
                (['qslimfinder'], lambda: qslimfinder.QSLiMFinder(self.log,progcmd)),
                (['slimprob'], lambda: slimprob.SLiMProb(self.log,progcmd)),
                (['slimmaker'], lambda: slimmaker.SLiMMaker(self.log,progcmd)),
                (['slimfarmer','farm'], lambda: slimfarmer.SLiMFarmer(self.log,progcmd)),
                (['slimbench'], lambda: slimbench.SLiMBench(self.log,progcmd)),
                (['comparimotif'], lambda: comparimotif.CompariMotif(self.log,progcmd)),
                (['peptcluster'], lambda: peptcluster.PeptCluster(self.log,progcmd)),
                (['peptalign'], lambda: peptcluster.PeptCluster(self.log,['peptalign=T']+progcmd+['peptdis=None'])),
            )
            for (names,make) in makers:
                if prog in names:
                    self.obj['Prog'] = make()
                    break
            self.obj['Prog'].dict['Output']['help'] = mod[prog].__doc__
        elif prog in seqsuite.mod:
            suite = seqsuite.SeqSuite(self.log,self.cmd_list)
            self.obj['Prog'] = suite.setup()
            self.obj['ProgInfo'] = suite.obj['ProgInfo']
            self.obj['Prog'].dict['Output']['help'] = seqsuite.mod[prog].__doc__
        ### ~ [2] ~ Failure to recognise program ~~~ ###
        if self.obj['Prog']:
            return self.obj['Prog']     # Setup successful
        wanthelp = self.getStrLC('Name') == 'help'
        if not wanthelp:
            if not rest:
                self.printLog('#ERR','Program "%s" not recognised.' % self.getStr('Name'))
            if self.i() < 0 or rest:
                return False
        #!# Try SeqSuite? #!#
        if wanthelp or rje.yesNo('Show SLiMSuite help with program options?'):
            extracmd = cmdHelp(cmd_list=['help'])[1:]   # Drop the leading 'help' command
            if extracmd:
                self.cmd_list += extracmd
                self._cmdList()
                if prog != self.getStrLC('Name'):   # New commands changed the program name: try again
                    return self.setup()
        self.setStr({'Name':rje.choice('Give program name (Blank or CTRL+C to quit)')})
        if self.getStrLC('Name'):
            return self.setup()
        return False
    except KeyboardInterrupt:
        return False
    except SystemExit:
        raise
    except:
        self.errorLog('Problem during %s setup.' % self.prog())
        return False  # Setup failed
def setup(self):    ### Main class setup method. Makes sumfile if necessary.
    '''
    Main class setup method. Makes sumfile if necessary.

    Unless an existing summary file is accepted by the user, each file in self.list['ResFiles'] is read with
    budapest.Budapest.readMascot(), its hit accessions are written to <basefile>.<mfile base>.protacc, and
    one row per peptide is written to the summary file. Accessions matching gi|<digits> are renamed from
    self.dict['Acc2Seq'] where possible ('gi_UNK__<gi>' otherwise).
    << False if an error is caught. NOTE(review): the success paths return the printLog() result or None
       rather than True - confirm what callers expect.
    '''
    try:
        ### ~ [1] Setup ~~~ ###
        self.debug(self.getStrLC('SumFile')); self.debug(self.getStr('SumFile'))
        if self.getStrLC('Basefile') in ['','none']:
            self.baseFile(rje.baseFile(self.info['SumFile']))
        if self.getStrLC('SumFile') in ['','none']:
            self.info['SumFile'] = '%s.tdt' % self.basefile()
        self.printLog('#SUM','Summary file: %s' % self.getStr('SumFile'))
        if os.path.exists(self.info['SumFile']) and not self.opt['Force']:
            if rje.yesNo('%s found. Use these results?' % self.info['SumFile']):
                return self.printLog('#SUM','Summary results file found. No MASCOT processing.')
        gi_regex = r'gi\|(\d+)'     # Raw string: same pattern as before without relying on unknown escapes
        mapgi = False               # Only needed by the disabled gi mapping step [2a]
        ### ~ [2] Process MASCOT ~~~ ###
        for mfile in self.list['ResFiles']:
            bud = budapest.Budapest(self.log,self.cmd_list+['mascot=%s' % mfile])
            bud.info['Name'] = mfile
            bud.readMascot()
            self.dict['Searches'][mfile] = bud.dict['Hits']
            protacclist = rje.sortKeys(bud.dict['Hits'])
            for protacc in protacclist:
                if rje.matchExp(gi_regex,protacc):
                    mapgi = True
            accfile = '%s.%s.protacc' % (self.baseFile(),rje.baseFile(mfile))
            self.debug(accfile)
            with open(accfile,'w') as ACCFILE:      # with: the handle is closed before the next file is made
                ACCFILE.write('\n'.join(protacclist))
            self.printLog('#MFILE','%s: %s proteins.' % (mfile,rje.iLen(protacclist)))
        ## ~ [2a] gi Mapping ~~~ ##
        #if mapgi:
        #    mapgi = self.dict['MapGI'] = seqlist.seqNameDic('NCBI')
        #    open('mapgi.tmp','w').write(string.join(rje.sortKeys(mapgi),'\n'))
        ### ~ [3] Setup seqlist ~~~ ###
        seqlist = rje_seq.SeqList(self.log,['gnspacc=T']+self.cmd_list)
        self.dict['Acc2Seq'] = seqlist.seqNameDic('Max')
        ### ~ [4] Generate Summary File ~~~ ###
        sumhead = ['search','prot_hit_num','prot_acc','prot_desc','pep_seq']
        rje.delimitedFileOutput(self,self.info['SumFile'],sumhead,rje_backup=True)
        for mfile in rje.sortKeys(self.dict['Searches']):
            hits = self.dict['Searches'][mfile]
            for protacc in rje.sortKeys(hits):
                protname = hits[protacc]['prot_acc']
                protdesc = hits[protacc]['prot_desc']
                gimatch = rje.matchExp(gi_regex,protacc)    # Match once and reuse the result
                if gimatch:
                    gi = gimatch[0]
                    try:
                        protname = self.dict['Acc2Seq'][gi].shortName()
                        protdesc = self.dict['Acc2Seq'][gi].info['Description']
                    except:     # Best effort: any failure to map the gi falls back to a placeholder name
                        protname = 'gi_UNK__%s' % gi
                for pep in hits[protacc]['Peptides']:
                    data = {'search':rje.baseFile(mfile,True),'prot_desc':protdesc,'prot_acc':protname,
                            'pep_seq':pep,'prot_hit_num':hits[protacc]['prot_hit_num']}
                    rje.delimitedFileOutput(self,self.info['SumFile'],sumhead,datadict=data)
    except:
        self.errorLog('Problem during %s setup.' % self)
        return False  # Setup failed
def forking(self):  ### Keeps forking out and processing jobs until no more jobs in self.list['Forked'].
    '''
    Keeps forking out and processing jobs until no more jobs in self.list['Forked'].

    Repeatedly polls each fork dictionary in self.list['Forked'] with a non-blocking os.waitpid(). Finished
    forks, 'WAIT' placeholders and forks whose polling raised an error are removed and passed to
    self.endFork(). When 'PIDCheck' or dev mode is on, the current index:PID list is rewritten to
    <log basefile>.pid on every pass. A main thread idle for longer than 'KillForks' seconds triggers the
    kill options; each pass ends by sleeping for 'ForkSleep' seconds.
    '''
    ### ~ [1] ~ Start first set of jobs ~~~ ###
    trackpid = self.getBool('PIDCheck') or self.dev()
    if trackpid:
        pidname = '%s.pid' % rje.baseFile(self.log.info['LogFile'])    # Set *.pid object to match log
    else:
        pidname = False
    ### ~ [2] ~ Monitor jobs and set next one running as they finish ~~~ ###
    while self.list['Forked']:
        if pidname:
            PIDLOG = open(pidname,'w')
        running = self.list['Forked'][:]    # Snapshot, so forks can be removed from the live list below
        for fork in running:
            done = 1    # Default covers both 'WAIT' placeholders and polling errors
            try:
                forkpid = fork['PID']
                if pidname:
                    PIDLOG.write('%s: %s\n' % (self.list['Forked'].index(fork),forkpid))
                if ('%s' % forkpid).split()[0] != 'WAIT':
                    (done,exit_stat) = os.waitpid(forkpid,os.WNOHANG)
            except:
                self.errorLog('!')
                done = 1
            if done > 0:
                self.list['Forked'].remove(fork)
                self.endFork(fork)      # Fork has finished: can replace with processing
        if pidname:
            PIDLOG.close()
        ## ~ [2a] Look for eternal hanging of threads ~~~ ##
        idle = time.time() - self.getNum('KillTime')
        if idle > self.getNum('KillForks'):
            self.verbose(0,1,'\n%d seconds of main thread inactivity. %d forks still active!' % (self.getNum('KillForks'),len(self.list['Forked'])),1)
            for fork in self.list['Forked']:
                self.verbose(0,2,' => Fork %s, PID %d still Active!' % (fork['ID'],fork['PID']),1)
            if self.i() < 0 or rje.yesNo('Kill Main Thread?'):
                raise ValueError('%d seconds of main thread inactivity. %d forks still active!' % (self.getNum('KillForks'),len(self.list['Forked'])))
            elif rje.yesNo('Kill hanging forks?'):
                for fork in self.list['Forked']:
                    self.printLog('#KILL','Killing Fork %s, PID %d.' % (fork['ID'],fork['PID']))
                    os.system('kill %d' % fork['PID'])
            else:
                self.setNum({'KillTime':time.time()})
        ## ~ [2b] Sleep ~~~ ##
        time.sleep(self.getNum('ForkSleep'))
def cmdHelp(info=None, out=None, cmd_list=None):  ### Prints *.__doc__ and asks for more sys.argv commands
    '''
    Prints *.__doc__ and asks for more sys.argv commands.

    If the only commandline argument is version/-version/--version or details/-details/--details, the
    version (or "program vVersion") is printed and the program exits with status 0.
    >> info = object providing program, version and start_time attributes [makeInfo() if not given]
    >> out = object providing verbose() and stat['Interactive'] [rje.Out() if not given]
    >> cmd_list = list of commands to scan for help requests. A supplied list is extended in place. [new list]
    << cmd_list, with any interactively entered commands appended. None if an unexpected error was caught.
    '''
    try:
        if not info:
            info = makeInfo()
        if len(sys.argv) == 2:
            if sys.argv[1] in ['version', '-version', '--version']:
                rje.printf(info.version)
                sys.exit(0)
            if sys.argv[1] in ['details', '-details', '--details']:
                rje.printf('{0} v{1}'.format(info.program, info.version))
                sys.exit(0)
        if not out:
            out = rje.Out()
        # A fresh list per call: the previous shared `[]` default was extended in place by `+=` below and
        # handed back to the caller, so commands leaked from one default-argument call into the next.
        if cmd_list is None:
            cmd_list = []
        # Named helpx rather than help so the builtin help() is not shadowed.
        helpx = cmd_list.count('help') + cmd_list.count('-help') + cmd_list.count('-h')
        if helpx > 0:
            rje.printf('\n\nHelp for {0} {1}: {2}\n'.format(
                info.program, info.version, time.asctime(time.localtime(info.start_time))))
            out.verbose(-1, 4, text=__doc__)
            if rje.yesNo('Show general commandline options?'):
                out.verbose(-1, 4, text=rje.__doc__)
            if rje.yesNo('Quit?'):
                sys.exit()
            cmd_list += rje.inputCmds(out, cmd_list)
        elif out.stat['Interactive'] > 1:
            cmd_list += rje.inputCmds(out, cmd_list)  # Ask for more commands
        return cmd_list
    except SystemExit:
        sys.exit()
    except KeyboardInterrupt:
        sys.exit()
    except:
        rje.printf('Major Problem with cmdHelp()')
def maskCleave(self):   ### Outputs masked cleavage sequences to file
    '''
    Outputs masked cleavage sequences to file.

    For every sequence with an entry in self.signalp, the predicted signal peptide (everything before the
    cleavage position) is replaced with X. The position is nn_ymaxpos when nn_ymax? is 'Y', overridden by
    hmm_cmaxpos when hmm_cmax? is 'Y' and it is smaller (or no nn position was set). Used entries are removed
    from self.signalp. Masked sequences are saved to <Basefile>.cleaved.fas and TMHMM can optionally be run.
    << None. Errors are reported via self.log.errorLog() with the stage reached.
    '''
    _stage = '<a> Setup'
    try:
        ### <a> ### Setup
        seqlist = self.obj['SeqList']
        seqlist.loadSeqs()
        seqlist.degapSeq()
        outfile = '%s.cleaved.fas' % seqlist.info['Basefile']

        ### <b> ### MaskSeqs
        _stage = '<b> Mask'
        self.verbose(0,3,'Masking cleaved signalp petides from %s into %s...' % (seqlist.info['Name'],outfile),0)
        cx = 0      # Number of sequences masked
        for seq in seqlist.seq:
            acc = seq.info['AccNum']
            if acc not in self.signalp:
                continue
            sigp = self.signalp.pop(acc)
            cpos = 0    # Cleavage position; 0 = no cleavage predicted
            if sigp['nn_ymax?'] == 'Y':
                cpos = int(sigp['nn_ymaxpos'])      # int() replaces the deprecated string.atoi()
            if sigp['hmm_cmax?'] == 'Y':
                hmm_c = int(sigp['hmm_cmaxpos'])
                if cpos == 0 or (cpos > 0 and hmm_c < cpos):
                    cpos = hmm_c
            if cpos > 0:
                cx += 1
                seq.info['Sequence'] = 'X' * cpos + seq.info['Sequence'][cpos:]
                # Progress dot every 100 masked sequences. Modulo replaces `cx/100 == cx/100.0`, which
                # depended on Python 2 integer division.
                if cx % 100 == 0:
                    self.verbose(0,4,'.',0)
        self.verbose(0,1,'Done! %d sequences masked.' % cx,1)

        ### <c> ### Save
        _stage = '<c> Save'
        seqlist.saveFasta(seqfile=outfile)

        ### <d> ### Run TMHMM
        _stage = '<d> Run TMHMM'
        if rje.yesNo('Run TMHMM on this now?'):
            # NOTE(review): Basefile is interpolated into a shell command unquoted - paths containing spaces
            # or shell metacharacters will break (or alter) the command.
            os.system('tmhmm %s.cleaved.fas -short > %s.cleaved.tmhmm' % (seqlist.info['Basefile'],seqlist.info['Basefile']))
        return
    except:
        self.log.errorLog('Problem with maskCleave() %s.' % _stage)
def setup(self):    ### Main class setup method.
    '''
    Main class setup method.
    Creates the database object, sets self.list['Accuracy'], resolves the SMRTUnits setting to one of
    reads/gb/mb (prompting if interactive), converts a Gb/Mb SMRTReads total into a read count using AvRead,
    and replaces self.list['XnList'] with a sorted list of integers greater than 1.
    << True if setup was successful, False if an error was trapped and logged.
    '''
    try:### ~ [1] Setup ~ ###
        self.obj['DB'] = rje_db.Database(self.log,self.cmd_list)
        self.db().basefile(self.basefile())
        self.list['Accuracy'] = [0,1.0 - self.getNum('ErrPerBase')]
        ## ~ [1a] SMRTReads ~ ##
        while self.getStrLC('SMRTUnits') not in ['reads','gb','mb']:
            # Fill in the offending value: the message used to be emitted with a literal "%s".
            txt = 'SMRTUnits "%s" not recognised' % self.getStr('SMRTUnits')
            # Suggest units from the magnitude of SMRTReads
            if self.getNum('SMRTReads') < 10: smrtunits = 'Gb'
            elif self.getNum('SMRTReads') > 10000: smrtunits = 'reads'
            else: smrtunits = 'Mb'
            if self.i() < 0 or rje.yesNo('%s: switch to (%s) %s?' % (txt,self.getNum('SMRTReads'),smrtunits)): self.setStr({'SMRTUnits':smrtunits})
            elif self.i() >0: self.setStr({'SMRTUnits':rje.choice('SMRTUnits (reads/Gb/Mb)?')})
            self.printLog('#UNITS','%s => %s' % (txt,self.getStr('SMRTUnits')))
        if self.getStrLC('SMRTUnits') in ['gb','mb']:
            # Convert a total base count into a number of reads of average length AvRead
            smrttotal = self.getNum('SMRTReads') * {'gb':1e9,'mb':1e6}[self.getStrLC('SMRTUnits')]
            txt = '%s %s @ %.3f kb/read' % (self.getNum('SMRTReads'),self.getStr('SMRTUnits'),self.getNum('AvRead')/1000.0)
            self.setNum({'SMRTReads':smrttotal/self.getNum('AvRead')})
            txt += ' => %s reads' % rje.iStr(int(self.getNum('SMRTReads')))
            self.printLog('#READS',txt)
        ## ~ [1b] XnList ~ ##
        xnlist = []
        for xn in self.list['XnList']:
            if xn == '': continue
            try:
                ixn = int(xn)
                if xn not in [ixn,'%d' % ixn]: self.printLog('#XN','"%s" -> %dX' % (xn,ixn))
                if ixn == 0: self.printLog('#XN','No point in 0X output: use 1-%Coverage.')
                elif ixn == 1: self.printLog('#XN','No point in 1X output: use %Coverage.')
                else: xnlist.append(ixn)
            except: self.errorLog('Could not process %s as part of XnList. (Integers only.)' % xn)
        xnlist.sort()
        # Join the values directly: slicing the list repr gave double spaces and needed Python 2's string.join().
        if xnlist: self.printLog('#XN','XnList: %sX.' % 'X, '.join(['%d' % ixn for ixn in xnlist]))
        self.list['XnList'] = xnlist
        return True     # Setup successful
    except: self.errorLog('Problem during %s setup.' % self.prog()); return False  # Setup failed
def save(self):  ### Saves parsed REST output to files
    '''
    Saves parsed REST output to files.
    The 'Rest' setting selects what is written:
    - a key of self.dict['Output']: only that output is saved to its file in self.dict['Outfile'];
    - 'full' or 'text': the full REST output is written to <RestOutDir><RestBase>.rest;
    - any other non-empty value: the format is reported as unrecognised and, if the user agrees, the full
      output is written to the .rest file;
    - empty: every parsed output with a known outfile is saved.
    << True after writing a .rest file, False if an unrecognised format is not saved. No value is returned
       after saving the individual outputs.
    '''
    rbase = '%s%s' % (self.getStr('RestOutDir'),rje.baseFile(self.getStr('RestBase'),strip_path=True,keepext=True))
    rje.mkDir(self,self.getStr('RestOutDir'))
    outputs = rje.sortKeys(self.dict['Output'])
    restfmt = self.getStrLC('Rest')
    if restfmt in outputs: outputs = [restfmt]
    elif restfmt in ['full','text']:
        outfile = '%s.rest' % rbase
        # with-blocks make sure each handle is flushed and closed, even if the write fails
        with open(outfile,'w') as restout: restout.write(self.restFullOutput())
        self.printLog('#OUT','%s: %s' % (restfmt,outfile))
        return True
    elif restfmt:
        self.printLog('#OUTFMT','REST output format "%s" not recognised.' % restfmt)
        if self.i() < 0 or not rje.yesNo('Output all parsed outputs?'): return False
        outfile = '%s.rest' % rbase
        with open(outfile,'w') as restout: restout.write(self.restFullOutput())
        self.printLog('#OUT','full: %s' % (outfile))
        return True
    for rkey in outputs:
        if rkey in self.dict['Outfile']:
            rje.backup(self,self.dict['Outfile'][rkey])
            with open(self.dict['Outfile'][rkey],'w') as keyout: keyout.write(self.dict['Output'][rkey])
            self.printLog('#OUT','%s: %s' % (rkey,self.dict['Outfile'][rkey]))
        elif rkey not in ['intro']: self.warnLog('No outfile parsed/generated for %s output' % rkey)
def runMain():
    '''
    Main run routine: loads an alignment and tree, then runs GASP and/or makes an indel tree.
    Stages: program setup; sequence loading (seqin=FILE, default infile.fas); tree loading (nsfin=FILE,
    default <basefile>.nsf, prompting for a file name if missing and interactive); GASP ancestral sequence
    prediction; optional indel tree (indeltree=FILE). Each stage logs its own failure and re-raises to the
    outer handler, which reports the error and then writes the final log line.
    '''
    try:
        ### <0> ### Basic Setup of Program
        [info,out,mainlog,cmd_list] = setupProgram()

        ### <1> ### Load Data
        ## <a> ## Read in Sequences
        try:
            out.verbose(1,3,'Loading sequences...',0)
            seqfile = 'infile.fas'
            nsfin = None
            # Pick the file names straight out of the raw command list
            for cmd in cmd_list:
                if cmd.find('seqin=') == 0:
                    seqfile=cmd[len('seqin='):]
                if cmd.find('nsfin=') == 0:
                    nsfin = cmd[len('nsfin='):]
            basefile = seqfile
            extension = seqfile[-4:]
            if (extension == '.fas') or (extension == '.phy') or (extension == '.aln'):
                basefile = seqfile[:-4]
            seqs = rje_seq.SeqList(log=mainlog,cmd_list=['i=0']+cmd_list+['autofilter=F','autoload=F','seqin=None'])
            out.verbose(1,3,"from %s" % seqfile,1)
            if not seqs.loadSeqs(seqfile=seqfile,seqtype='protein',aln=True):
                # NOTE(review): bare raise with no active exception; it still lands in the except below.
                raise
            seqfile = seqs.info['Name']
            basefile = rje.baseFile(seqfile)
            mainlog.printLog('#SEQ',"%s protein sequences read from %s\n" % (str(seqs.seqNum()),seqfile),1)
            mainlog.printLog('#SEQ',"Alignment = %s. (%d aa)\n" % (seqs.opt['Aligned'],seqs.seq[0].seqLen()),1)
        except:
            mainlog.errorLog("Fatal run Exception during Sequence Input\n")
            raise
        ## <b> ## Read in Tree
        try:
            if not nsfin:
                nsfin = basefile + '.nsf'
            while not os.path.exists(nsfin):
                if out.stat['Interactive'] >= 0:
                    nsfin = rje.choice(text='Input tree file "%s" not found. Input filename? (Blank to exit.)' % nsfin)
                    if nsfin == '':
                        raise KeyboardInterrupt     # Blank entry is treated as a user quit
                else:
                    # NOTE(review): uses mainlog.log.errorLog where the rest of this function calls mainlog.errorLog - confirm.
                    mainlog.log.errorLog('File %s not found. Cannot load tree!' % nsfin,printerror=False,quitchoice=True)
                    raise
            cmd_list.append('nsfin=' + nsfin)
            out.verbose(1,3,'Loading tree from %s...' % nsfin,1)
            mytree = rje_tree.Tree(log=mainlog,cmd_list=['root=yes']+cmd_list)
            mytree.mapSeq(seqlist=seqs)
            mytree.textTree()
            if mytree.opt['ReRooted']:
                mytree.saveTree(filename='%s.nsf' % basefile)
        except KeyboardInterrupt:
            mainlog.errorLog("User terminated.\n")
            raise
        except:
            mainlog.errorLog("Fatal run Exception during Tree Input\n")
            raise

        ### <2> ### GASP
        try:
            ## <a> ## InDel Tree Setup
            indeltree = None
            for cmd in cmd_list:
                if cmd.find('indeltree=') == 0:
                    indeltree=cmd[len('indeltree='):]

            ## <b> ## GASP
            # GASP runs when no indel tree was requested, or when the last tree node has no sequence yet
            if indeltree == None or mytree.node[-1].obj['Sequence'] == None:  # Perform GASP
                out.verbose(0,2,'',3)
                mainlog.printLog('#SEQ','GASP: Gapped Ancestral Sequence Prediction',1)
                if basefile == 'infile':
                    basefile = 'gasp'
                mygasp = rje_ancseq.Gasp(tree=mytree,ancfile='%s' % basefile,cmd_list=cmd_list,log=mainlog)
                out.verbose(0,2,'%s' % mygasp.details(),1)
                if out.stat['Interactive'] > 0:
                    if rje.yesNo('Use these parameters?') == False:
                        mygasp.edit()
                mygasp.gasp()
                out.verbose(0,1,"\n\nGASP run completed OK!",2)

            ## <c> ## InDel Tree
            if indeltree:
                mytree.indelTree(filename=indeltree)

        except KeyboardInterrupt:
            mainlog.errorLog("User terminated.\n")
            raise
        except:
            mainlog.errorLog("Fatal run Exception during GASP\n")
            raise

        ### <X> ### End
    # NOTE(review): mainlog and info are only bound if setupProgram() succeeded; the handlers and the
    # final log line below would fail with NameError otherwise.
    except KeyboardInterrupt:
        mainlog.errorLog("User terminated.\n")
    except:
        print "Unexpected error:", sys.exc_info()[0]
    mainlog.printLog('#LOG', "%s V:%s End: %s\n" % (info.program, info.version, time.asctime(time.localtime(time.time()))), 1)
def forker(self):  ### Generic forking method
    '''
    Generic method for forking (without threads).
    Splits the input fasta file into temporary files of self.stat['Split'] sequences, each tagged with a
    random six-character string, runs self.info['ForkProg'] on each split (through os.fork(), or one at a
    time with os.system() if self.opt['NoForks']), and finally merges each split's log and result files
    back into the combined files, deleting the temporary files.
    Errors are logged with the current stage and re-raised.
    '''
    try:
        ### <0> ### Setup
        _stage = '<0> Fork Setup'
        forkx = int(self.stat['Forks'])  # Number of forks to have running at one time
        if self.opt['Win32'] or forkx < 1: self.opt['NoForks'] = True
        forks = []  # List of active fork PIDs
        killforks = int(self.stat['KillForks'])  # Time in seconds to wait after main thread has apparently finished
        forking_condition = True  # Condition to keep forking (not read again within this method)

        ### Sequence List setup ###
        _stage = '<1> Forking'
        seqx = 0  # Sequence Counter
        subx = 0  # Subset sequence counter
        outfile = None  # Output file name
        randlist = []  # List of random strings for split sequence files
        filedict = {}  # Dictionary of input files for each random string
        seqlist = rje_seq.SeqList(log=self.log,cmd_list=['autoload=F']+self.cmd_list)
        seqlist.makeBaseFile()
        # NOTE(review): SEQFILE is never closed in this method.
        SEQFILE = open(seqlist.info['Name'], 'r')
        (seq,lastline) = seqlist.nextFasSeq(SEQFILE,'Starting')
        while seq:
            seqlist.seq = [seq]
            if self.info['StartFrom'] != 'None':  # Not yet reached wanted sequence
                if self.info['StartFrom'] in [seq.info['Name'], seq.info['ID'], seq.info['AccNum'], seq.shortName()]:
                    self.info['StartFrom'] = 'None'
            if self.info['StartFrom'] == 'None':  # Wanted sequence
                if outfile:  # Current split file already exists: append to it
                    SEQOUT = open(outfile,'a')
                else:  # Start a new split file with an unused random string
                    rs = rje.randomString(6)
                    while rs in randlist:
                        rs = rje.randomString(6)
                    outfile = '%s.%s.fas' % (seqlist.info['Basefile'],rs)
                    SEQOUT = open(outfile,'w')
                    randlist.append(rs)
                    filedict[rs] = outfile
                SEQOUT.write('>%s\n%s\n' % (seq.info['Name'],seq.info['Sequence']))
                SEQOUT.close()
                seqx += 1
                subx += 1
                if subx == self.stat['Split']:  # Finished split
                    self.log.printLog('#SEQ','%s sequences output to %s.' % (rje.integerString(subx),outfile))
                    outfile = None
                    subx = 0
            (seq,lastline) = seqlist.nextFasSeq(SEQFILE,lastline)
        if subx > 0:  # Report the final, partly filled split
            self.log.printLog('#SEQ','%s sequences output to %s.' % (rje.integerString(subx),outfile))
        self.log.printLog('#SEQ','%s sequences output in total to %d files.' % (rje.integerString(seqx),len(randlist)))
        # Now have the list of random strings in randlist (in order) and filenames in filedict

        ### <1> ### Forking
        killtime = time.time()
        dealt_with = 0  # Split files dealt with
        while dealt_with < len(randlist) or len(forks):
            ## <a> ## forks
            _stage = '<1a> New Forks'
            while dealt_with < len(randlist) and (len(forks) < forkx or self.opt['NoForks']):  # Add more forks
                _stage = '<1a-i> Fork: Get stuff for fork'
                killtime = time.time()  # Reset killtime - still doing stuff
                # Add new fork
                _stage = '<1a-ii> Fork: New Fork'
                new_fork_id = randlist[dealt_with]
                dealt_with += 1
                # Insert the split ID before the last dot-separated element of OutCmd (or add ID.resfile)
                outcmd = string.split(self.info['OutCmd'],'.')
                if len(outcmd) > 1:
                    outcmd = outcmd[:-1] + [new_fork_id] + outcmd[-1:]
                else:
                    outcmd = outcmd + [new_fork_id] + ['resfile']
                outcmd = string.join(outcmd,'.')
                forkcmd = '%s %s%s %s %s log=%s.log newlog=T i=-1' % (self.info['ForkProg'],self.info['SeqInCmd'],filedict[new_fork_id],outcmd,self.info['ForkCmd'],new_fork_id)
                if self.opt['NoForks']:
                    os.system(forkcmd)
                else:  # Forks
                    newpid = os.fork()
                    if newpid == 0:  # child
                        os.system(forkcmd)
                        sys.exit()  # Exit process
                    elif newpid == -1:  # error
                        self.log.errorLog('Problem forking %s.' % new_fork_id)
                    else:
                        forks.append(newpid)  # Add fork to list

            ## <b> ## Monitor and remove finished forks
            _stage = '<1b> Finished Forks'
            forklist = self._activeForks(forks)
            if len(forklist) != len(forks):
                self.verbose(0,2,' => %d of %d forks finished!' % (len(forks) - len(forklist),len(forks)),1)
                forks = forklist[0:]

            self.verbose(3,3,'End of a Cycle.',2)

            ## <c> ## Look for eternal hanging of forks
            _stage = '<1c> Hanging'
            if time.time() - killtime > killforks:
                self.verbose(0,1,'\n%d seconds of main program inactivity. %d forks still active!' % (killforks,len(forks)),1)
                for fork in forks:
                    self.verbose(0,2,' => Fork PID %d still Active!' % (fork),1)
                if rje.yesNo('Kill?'):
                    break  #!# killing options (loop is abandoned; no process is actually killed here)
                else:
                    killtime = time.time()

        ### <3> ### Finish
        _stage = '<3> Finish'
        if len(forks) > 0:
            self.log.errorLog('%d Forks still active after %d seconds of main program inactivity' % (len(forks),killforks),True)
        else:
            self.verbose(0,1,'Forks have finished.',2)

        ### <4> ### Recompile results
        for randstr in randlist:
            os.unlink(filedict[randstr])  # Remove the temporary split input
            rje.fileTransfer(fromfile='%s.log' % randstr,tofile=self.log.info['Name'],deletefrom=True)
            outfiles = glob.glob('*.%s.*' % randstr)
            for outfile in outfiles:
                # Combined file name = split file name with the random string element removed
                compfile = outfile.split('.')
                compfile.remove(randstr)
                compfile = string.join(compfile,'.')
                # First split replaces any existing combined file unless appending
                if randstr == randlist[0] and os.path.exists(compfile) and not self.opt['Append']:
                    os.unlink(compfile)
                rje.fileTransfer(fromfile=outfile,tofile=compfile,deletefrom=True)
                self.verbose(1,2,'Copying results data from %s to %s...' % (outfile,compfile),0)
            self.verbose(0,1,'%d results files copied for Split %d.' % (len(outfiles),(randlist.index(randstr)+1)),1)
        self.log.printLog('#OUT','Results for %d splits compiled.' % len(randlist))

    except SystemExit:  # Don't want forks raising an Exception upon exiting
        sys.exit()
    except:
        self.log.errorLog('Error in forker(%s):' % _stage,printerror=True,quitchoice=False)
        raise  # Delete this if method error not terrible
def setup(self):  ### Main class setup method.
    '''
    Main class setup method.
    Reads the program name from the 'Name' setting. If it is a key of mod, logs the program details, builds
    the program command list (adding help output if 'Help' is set) and creates the matching program object
    in self.obj['Prog']. If no program object is made, non-interactive runs fail; interactive runs can show
    help, take extra commands and/or a new program name, and then call setup() again.
    << self.obj['Prog'] if setup was successful; False if it failed or the user quit.
    '''
    try:  ### ~ [1] ~ Setup Program ~ ###
        self.obj['Prog'] = None
        prog = self.getStrLC('Name')
        if prog in mod:
            i = self.obj['ProgInfo'] = mod[prog].makeInfo()
            self.printLog(
                '#PROG',
                '%s V%s: %s' % (i.program, i.version, i.description))
            progcmd = rje.getCmdList(
                [], info=i) + self.cmd_list + ['newlog=F']
            out = rje.Out(cmd_list=progcmd)
            out.printIntro(i)
            #self.debug(prog); self.debug(progcmd)
            if self.getBool('Help'):
                progcmd = mod[prog].cmdHelp(i, out, ['help'] + progcmd)
            # purgelist is not assigned in this method; presumably a module-level name - TODO confirm
            self.printLog('#CMD',
                          'Full %s CmdList: %s' %
                          (i.program,
                           rje.argString(
                               rje.tidyArgs(progcmd,
                                            nopath=self.getStrLC('Rest')
                                            and not self.dev(),
                                            purgelist=purgelist))),
                          screen=False)
            #self.debug(prog); self.debug(progcmd)
            ## ~ [1a] ~ Make self.obj['Prog'] ~ ##
            # One branch per recognised program name/alias; progcmd is passed to each constructor
            if prog in ['seqlist', 'rje_seqlist']:
                self.obj['Prog'] = rje_seqlist.SeqList(self.log, progcmd)
            elif prog in ['uniprot', 'rje_uniprot']:
                self.obj['Prog'] = rje_uniprot.UniProt(self.log, progcmd)
            elif prog in ['taxonomy', 'rje_taxonomy']:
                self.obj['Prog'] = rje_taxonomy.Taxonomy(self.log, progcmd)
            elif prog in ['tree', 'rje_tree']:
                self.obj['Prog'] = rje_tree.Tree(self.log, progcmd)
            elif prog in ['xref', 'rje_xref']:
                self.obj['Prog'] = rje_xref.XRef(self.log, progcmd)
            elif prog in ['seq', 'rje_seq']:
                self.obj['Prog'] = rje_seq.SeqList(self.log, progcmd)
            elif prog in ['mitab', 'rje_mitab']:
                self.obj['Prog'] = rje_mitab.MITAB(self.log, progcmd)
            elif prog in ['dbase', 'database']:
                self.obj['Prog'] = rje_dbase.DatabaseController(
                    self.log, progcmd)
            elif prog in ['pydocs']:
                self.obj['Prog'] = rje_pydocs.PyDoc(self.log, progcmd)
            elif prog in ['ensembl', 'rje_ensembl']:
                self.obj['Prog'] = rje_ensembl.EnsEMBL(self.log, progcmd)
            elif prog in ['genbank', 'rje_genbank']:
                self.obj['Prog'] = rje_genbank.GenBank(self.log, progcmd)
            elif prog in ['extatic']:
                self.obj['Prog'] = extatic.ExTATIC(self.log, progcmd)
            elif prog in ['revert']:
                self.obj['Prog'] = revert.REVERT(self.log, progcmd)
            elif prog in ['fiesta']:
                self.obj['Prog'] = fiesta.FIESTA(self.log, progcmd)
            elif prog in ['gablam']:
                self.obj['Prog'] = gablam.GABLAM(self.log, progcmd)
            elif prog in ['gopher']:
                self.obj['Prog'] = gopher.Gopher(self.log, progcmd)
            elif prog in ['haqesac']:
                self.obj['Prog'] = haqesac.HAQESAC(self.log, progcmd)
            elif prog in ['multihaq']:
                self.obj['Prog'] = multihaq.MultiHAQ(self.log, progcmd)
            elif prog in ['pingu']:
                self.obj['Prog'] = pingu.PINGU(self.log, progcmd)
            elif prog in ['pacbio']:
                self.obj['Prog'] = rje_pacbio.PacBio(self.log, progcmd)
            elif prog in ['rje_zen', 'zen']:
                self.obj['Prog'] = rje_zen.Zen(self.log, progcmd)
        ### ~ [2] ~ Failure to recognise program ~ ###
        if not self.obj['Prog']:
            self.printLog(
                '#ERR',
                'Program "%s" not recognised.' % self.getStr('Name'))
            if self.i() < 0: return False  # Non-interactive: give up
            if rje.yesNo('Show SeqSuite help with program options?'):
                # Drop the leading 'help' so only commands entered by the user remain
                extracmd = cmdHelp(cmd_list=['help'])[1:]
                if extracmd:
                    self.cmd_list += extracmd
                    self._cmdList()
                    # Retry straight away if the new commands changed the program name
                    if prog != self.getStrLC('Name'): return self.setup()
            self.setStr({
                'Name':
                rje.choice('Give program name (Blank or CTRL+C to quit)')
            })
            if self.getStrLC('Name'): return self.setup()
            else: return False
        return self.obj['Prog']  # Setup successful
    except KeyboardInterrupt:
        return False
    except SystemExit:
        raise
    except:
        self.errorLog('Problem during %s setup.' % self.prog())
        return False  # Setup failed
def setup(self):  ### Main class setup method.
    '''
    Main class setup method.
    Creates the database object, sets self.list['Accuracy'], resolves the SMRTUnits setting to one of
    reads/gb/mb (prompting if interactive), converts a Gb/Mb SMRTReads total into a read count using AvRead,
    and replaces self.list['XnList'] with a sorted list of integers greater than 1.
    << True if setup was successful, False if an error was trapped and logged.
    '''
    try:  ### ~ [1] Setup ~ ###
        self.obj['DB'] = rje_db.Database(self.log, self.cmd_list)
        self.db().basefile(self.basefile())
        self.list['Accuracy'] = [0, 1.0 - self.getNum('ErrPerBase')]
        ## ~ [1a] SMRTReads ~ ##
        while self.getStrLC('SMRTUnits') not in ['reads', 'gb', 'mb']:
            # Fill in the offending value: the message used to be emitted with a literal "%s".
            txt = 'SMRTUnits "%s" not recognised' % self.getStr('SMRTUnits')
            # Suggest units from the magnitude of SMRTReads
            if self.getNum('SMRTReads') < 10: smrtunits = 'Gb'
            elif self.getNum('SMRTReads') > 10000: smrtunits = 'reads'
            else: smrtunits = 'Mb'
            if self.i() < 0 or rje.yesNo(
                    '%s: switch to (%s) %s?' %
                (txt, self.getNum('SMRTReads'), smrtunits)):
                self.setStr({'SMRTUnits': smrtunits})
            elif self.i() > 0:
                self.setStr(
                    {'SMRTUnits': rje.choice('SMRTUnits (reads/Gb/Mb)?')})
            self.printLog('#UNITS',
                          '%s => %s' % (txt, self.getStr('SMRTUnits')))
        if self.getStrLC('SMRTUnits') in ['gb', 'mb']:
            # Convert a total base count into a number of reads of average length AvRead
            smrttotal = self.getNum('SMRTReads') * {
                'gb': 1e9,
                'mb': 1e6
            }[self.getStrLC('SMRTUnits')]
            txt = '%s %s @ %.3f kb/read' % (self.getNum('SMRTReads'),
                                            self.getStr('SMRTUnits'),
                                            self.getNum('AvRead') / 1000.0)
            self.setNum({'SMRTReads': smrttotal / self.getNum('AvRead')})
            txt += ' => %s reads' % rje.iStr(int(self.getNum('SMRTReads')))
            self.printLog('#READS', txt)
        ## ~ [1b] XnList ~ ##
        xnlist = []
        for xn in self.list['XnList']:
            if xn == '': continue
            try:
                ixn = int(xn)
                if xn not in [ixn, '%d' % ixn]:
                    self.printLog('#XN', '"%s" -> %dX' % (xn, ixn))
                if ixn == 0:
                    self.printLog(
                        '#XN', 'No point in 0X output: use 1-%Coverage.')
                elif ixn == 1:
                    self.printLog('#XN',
                                  'No point in 1X output: use %Coverage.')
                else:
                    xnlist.append(ixn)
            except:
                self.errorLog(
                    'Could not process %s as part of XnList. (Integers only.)'
                    % xn)
        xnlist.sort()
        if xnlist:
            # Join the values directly: slicing the list repr gave double spaces and needed Python 2's string.join().
            self.printLog(
                '#XN',
                'XnList: %sX.' % 'X, '.join(['%d' % ixn for ixn in xnlist]))
        self.list['XnList'] = xnlist
        return True  # Setup successful
    except:
        self.errorLog('Problem during %s setup.' % self.prog())
        return False  # Setup failed
def convert(self, filelist=None, outfile=None):  ### Converts scansite output files in FileList to Outfile
    '''
    Converts scansite output files in FileList to Outfile.
    >> filelist:list = scansite output files to read. self.list['FileList'] is used if empty or not given.
    >> outfile:str = delimited output file. self.info['Name'] is used if not given.
    << True if conversion completed; False if there were no files to convert.
    Every input line matching re_scansite is written to outfile as one delimited row. Missing input files
    are logged and skipped. Other errors are logged with the current stage and re-raised.
    '''
    _stage = 'Setup'
    try:
        ### Setup ###
        if not filelist: filelist = self.list['FileList']
        if not outfile: outfile = self.info['Name']
        if not filelist:
            self.log.errorLog(
                'No scansite files to convert! %s unchanged/not made.' % outfile,
                printerror=False)
            return False
        delimit = rje.getDelimit(self.cmd_list)
        ext = rje.delimitExt(delimit)
        if ext != outfile[-3:]:  # Offer to match the file extension to the delimiter
            newfile = outfile[:-3] + ext
            if rje.yesNo('Change file name from %s to %s?' % (outfile, newfile)):
                outfile = newfile
        self.log.printLog(
            '#OUT',
            'Converting %d file(s), output to %s.' % (len(filelist), outfile))

        ### Output File ###
        _stage = 'Output File'
        makenew = not self.opt['Append'] or not os.path.exists(outfile)
        if makenew: outmode = 'w'
        else: outmode = 'a'
        sx = 0  # Total results written
        # with-blocks close the output and input handles even if conversion fails part-way
        with open(outfile, outmode) as OUTFILE:
            if makenew:  # Create with header
                headers = [
                    'seq_id', 'enzyme', 'enz_group', 'aa', 'pos', 'score',
                    'percentile', 'matchseq', 'sa'
                ]
                rje.writeDelimit(OUTFILE, headers, delimit)

            ### Conversion ###
            _stage = 'Conversion'
            for infile in filelist:
                if not os.path.exists(infile):
                    self.log.errorLog(
                        'Input file %s does not exist! :o(' % infile, False,
                        False)
                    continue
                fx = 0  # Results written from this file
                with open(infile, 'r') as INFILE:
                    inline = rje.nextLine(INFILE)
                    while inline != None:
                        scanlist = rje.matchExp(re_scansite, inline)  # Matched once per line and reused
                        if scanlist:
                            rje.writeDelimit(OUTFILE, scanlist, delimit)
                            sx += 1
                            fx += 1
                            rje.progressPrint(self, sx)
                        inline = rje.nextLine(INFILE)
                self.log.printLog(
                    '#OUT', '%s scansite results from %s. (%s Total.)' %
                    (rje.integerString(fx), infile, rje.integerString(sx)))

        ### End ###
        _stage = 'End'
        self.log.printLog(
            '#OUT', '%s scansite results output to %s.' %
            (rje.integerString(sx), outfile))
        return True
    except:
        self.log.errorLog('Error in convert(%s)' % _stage,
                          printerror=True,
                          quitchoice=False)
        raise
def mapHit(self,seq,hits,hitdict,method): ### Tries to map seq onto hitseq and returns hit if successful
    '''
    Tries to map seq onto hitseq and returns hit if successful.
    >> seq:Query Sequence Object. Unpacked and indexed here as a (name,sequence) pair.
    >> hits:List of hits in rough order of goodness. Each hit is read as a dictionary ('Hit', 'Length').
    >> hitdict:Dictionary of {hitname:stats}. A 'Data' entry is added to each hit's stats by this method.
    >> method:Mapping method to use
    << hit (an element of hits) if a mapping was made, else None. Errors are logged and None is returned.
    '''
    try:### ~ [0] Setup ~ ###
        (name,sequence) = seq
        data = rje_sequence.extractNameDetails(name,self)
        data['Sequence'] = seq[1]
        data['ShortName'] = string.split(seq[0])[0]     # First whitespace-delimited word of the name
        for hit in hitdict:
            hitdict[hit]['Data'] = rje_sequence.extractNameDetails(hitdict[hit]['Seq'][0],self)
            hitdict[hit]['Data']['Sequence'] = hitdict[hit]['Seq'][1]
            hitdict[hit]['Data']['ShortName'] = string.split(hitdict[hit]['Seq'][0])[0]
        ### SkipGene ###
        if method == 'id' and rje.matchExp('^(\S+)_\S+',data['ID']):
            gene = rje.matchExp('^(\S+)_\S+',data['ID'])
            # NOTE(review): gene is the whole matchExp() result, not its first group. If matchExp returns a
            # sequence of groups this membership test can never match a gene name - confirm.
            if gene in self.list['SkipGene']: return None
        ### Name, AccNum, Sequence and ID ###
        # method_info maps the method name onto the data field that must be identical in query and hit
        if method_info[method] in ['Name', 'AccNum', 'Sequence', 'ID']:
            for hit in hits:
                hitdata = hitdict[hit['Hit']]['Data']
                if hitdata[method_info[method]] == data[method_info[method]]:
                    if self.i() < 2 or rje.yesNo('Map %s to %s?' % (data['ShortName'],hitdata['ShortName'])): return hit
        ### DescAcc ###
        # Query accession number found as a separate word within the hit name
        if method == 'descacc':
            for hit in hits:
                hitdata = hitdict[hit['Hit']]['Data']
                if rje.matchExp('\W(%s)\W' % data['AccNum'],hitdata['Name']):
                    if self.i() < 2 or rje.yesNo('Map %s to %s?' % (data['ShortName'],hitdata['ShortName'])): return hit
        ### GABLAM ###
        if method != 'gablam': return None
        focus = self.str['MapFocus'][:1].upper() + self.str['MapFocus'][1:].lower()
        gstat = gstat_type[self.str['MapStat'].lower()]
        possibles = []  # List of Hits that meet MinMap criterion
        for hit in hits:
            hitname = hit['Hit']
            hitdata = hitdict[hit['Hit']]['Data']
            if self.getNum('AutoMap') > 0.0 and hitdict[hitname]['%s_%s' % (focus,gstat)] >= self.getNum('AutoMap'):
                if self.i() < 2 or rje.yesNo('Map %s to %s?' % (data['ShortName'],hitdata['ShortName'])): return hit
            elif hitdict[hitname]['%s_%s' % (focus,gstat)] >= self.getNum('MinMap'): possibles.append(hit)
        ### Manual GABLAM Choice ###
        if self.i() < 0 or not possibles: return None
        # Reversed so that the first (best) hit is printed last, as option <1>
        possibles.reverse()
        print '\nMapping options for %s:\n' % data['ShortName']
        for p in range(len(possibles)):
            hit = possibles[p]
            hitname = hit['Hit']
            hitdata = hitdict[hit['Hit']]['Data']
            print '<%d> %s (%d aa) =\t' % (len(possibles)-p,hitdata['Name'],hit['Length']),
            print '%.1f%% Qry Len,' % (100.0 * hit['Length'] / len(seq[1])),
            print '%.1f%% ID (%.1f%% Sim, %.1f%% Cov.)' % (hitdict[hitname]['Hit_ID'],hitdict[hitname]['Hit_Sim'],hitdict[hitname]['Hit_Len']),
            print '(Qry: %.1f%% ID (%.1f%% Sim, %.1f%% Cov.)' % (hitdict[hitname]['Query_ID'],hitdict[hitname]['Query_Sim'],hitdict[hitname]['Query_Len'])
        choice = -1
        print '<0> No mapping.\n'
        ## Choice ##
        # Loops until a selection is made (and confirmed when self.i() >= 2)
        while 1:
            choice = rje.getInt('Select sequence to replace %s?' % data['ShortName'],default=1,confirm=True)
            i = len(possibles) - choice     # Convert displayed option number back to list index
            if choice == 0: # No mapping
                if self.i() < 2 or rje.yesNo('No GABLAM mapping for %s?' % (data['ShortName'])): return None
            elif choice > 0 and choice <= len(possibles):
                hit = possibles[i]
                hitdata = hitdict[hit['Hit']]['Data']
                if self.i() < 2 or rje.yesNo('Map %s to %s?' % (data['ShortName'],hitdata['ShortName'])): return hit
    except:
        self.errorLog('Problem during SeqMapper.mapHit(%s)' % method,quitchoice=True)
        return None
def setup(self):  ### Main class setup method.
    '''
    Main class setup method.
    Reads the program name from the 'Name' setting. Names found in mod are set up here: the program details
    are logged, the program command list is built (adding help output if 'Help' is set) and the matching
    program object is created in self.obj['Prog']. Names found in seqsuite.mod are set up through a
    seqsuite.SeqSuite object instead. If no program object is made, non-interactive runs fail; interactive
    runs can show help, take extra commands and/or a new program name, and then call setup() again.
    << self.obj['Prog'] if setup was successful; False if it failed or the user quit.
    '''
    try:  ### ~ [1] ~ Setup Program ~ ###
        self.obj['Prog'] = None
        prog = self.getStrLC('Name')
        if prog in mod:
            i = self.obj['ProgInfo'] = mod[prog].makeInfo()
            self.printLog(
                '#PROG',
                '%s V%s: %s' % (i.program, i.version, i.description))
            progcmd = rje.getCmdList(
                [], info=i) + self.cmd_list + ['newlog=F']
            out = rje.Out(cmd_list=progcmd)
            out.printIntro(i)
            if self.getBool('Help'):
                progcmd = mod[prog].cmdHelp(i, out, ['help'] + progcmd)
            purgelist = seqsuite.purgelist
            self.printLog('#CMD',
                          'Full %s CmdList: %s' %
                          (i.program,
                           rje.argString(
                               rje.tidyArgs(progcmd,
                                            nopath=self.getStrLC('Rest')
                                            and not self.dev(),
                                            purgelist=purgelist))),
                          screen=False)
            #self.debug(prog)
            ## ~ [1a] ~ Make self.obj['Prog'] ~ ##
            # One branch per recognised program name/alias; progcmd is passed to each constructor
            if prog in ['slimcore', 'rje_slimcore']:
                self.obj['Prog'] = rje_slimcore.SLiMCore(self.log, progcmd)
            elif prog in ['rlc', 'disorder']:
                # Same class as slimcore, with the chosen name passed on as prog=X
                self.obj['Prog'] = rje_slimcore.SLiMCore(
                    self.log, progcmd + ['prog=%s' % prog])
            elif prog in ['slimlist', 'rje_slimlist']:
                self.obj['Prog'] = rje_slimlist.SLiMList(self.log, progcmd)
            elif prog in ['slimfinder']:
                self.obj['Prog'] = slimfinder.SLiMFinder(self.log, progcmd)
            elif prog in ['qslimfinder']:
                self.obj['Prog'] = qslimfinder.QSLiMFinder(
                    self.log, progcmd)
            elif prog in ['slimprob']:
                self.obj['Prog'] = slimprob.SLiMProb(self.log, progcmd)
            elif prog in ['slimmaker']:
                self.obj['Prog'] = slimmaker.SLiMMaker(self.log, progcmd)
            elif prog in ['slimfarmer', 'farm']:
                self.obj['Prog'] = slimfarmer.SLiMFarmer(self.log, progcmd)
            elif prog in ['slimbench']:
                self.obj['Prog'] = slimbench.SLiMBench(self.log, progcmd)
            elif prog in ['comparimotif']:
                self.obj['Prog'] = comparimotif.CompariMotif(
                    self.log, progcmd)
            elif prog in ['peptcluster']:
                self.obj['Prog'] = peptcluster.PeptCluster(
                    self.log, progcmd)
            elif prog in ['peptalign']:
                self.obj['Prog'] = peptcluster.PeptCluster(
                    self.log, ['peptalign=T'] + progcmd)
            # Store the module docstring as the 'help' output of the program object
            # NOTE(review): if prog is in mod but no branch above matched, self.obj['Prog'] is still None
            # here and this line raises; the generic handler below then logs it and returns False.
            self.obj['Prog'].dict['Output']['help'] = mod[prog].__doc__
        elif prog in seqsuite.mod:
            # Let SeqSuite set up its own programs, then adopt its program and info objects
            seqsuiteobj = seqsuite.SeqSuite(self.log, self.cmd_list)
            self.obj['Prog'] = seqsuiteobj.setup()
            self.obj['ProgInfo'] = seqsuiteobj.obj['ProgInfo']
            self.obj['Prog'].dict['Output']['help'] = seqsuite.mod[
                prog].__doc__
        ### ~ [2] ~ Failure to recognise program ~ ###
        if not self.obj['Prog']:
            self.printLog(
                '#ERR',
                'Program "%s" not recognised.' % self.getStr('Name'))
            if self.i() < 0: return False  # Non-interactive: give up
            #!# Try SeqSuite? #!#
            if rje.yesNo('Show SLiMSuite help with program options?'):
                # Drop the leading 'help' so only commands entered by the user remain
                extracmd = cmdHelp(cmd_list=['help'])[1:]
                if extracmd:
                    self.cmd_list += extracmd
                    self._cmdList()
                    # Retry straight away if the new commands changed the program name
                    if prog != self.getStrLC('Name'): return self.setup()
            self.setStr({
                'Name':
                rje.choice('Give program name (Blank or CTRL+C to quit)')
            })
            if self.getStrLC('Name'): return self.setup()
            else: return False
        return self.obj['Prog']  # Setup successful
    except KeyboardInterrupt:
        return False
    except SystemExit:
        raise
    except:
        self.errorLog('Problem during %s setup.' % self.prog())
        return False  # Setup failed
def difference(self, table1, table2):  ### Generates differences as new table
    '''
    Generates differences as new table.
    >> table1:Table = iTunes database table to compare
    >> table2:Table = iTunes database table to compare
    The integer after the last '.' of each table name decides which table is old and which is new. A copy
    of the newer table is made, each entry is given a Status (New, Changed, Unchanged or Deleted) and the
    table is saved to <name>.tdt. Errors are logged through self.errorLog(); nothing is returned.
    '''
    try:  ### ~ [1] Setup ~ ###
        dfields = [
            'Name', 'Artist', 'Composer', 'Album', 'Album_Artist', 'Genre',
            'Time', 'Disc Number', 'Disc Count', 'Track Number',
            'Track Count', 'Year', 'Date Added', 'Plays', 'Last Played',
            'Skips', 'Last Skipped', 'My Rating', 'Location', 'Tracks',
            'Score'
        ]
        db = self.db()
        # Compound index used to pair tracks when the file location does not match
        tabindex = '#Artist#|#Album#|#Track Number#|#Name#'
        try:
            age1 = string.atoi(string.split(table1.name(), '.')[-1])
            age2 = string.atoi(string.split(table2.name(), '.')[-1])
            table1.index(tabindex, make=True)
            table2.index(tabindex, make=True)
            if age1 < age2:
                oldtable = table1
                newtable = table2
                newdate = age2
            else:
                newtable = table1
                oldtable = table2
                newdate = age1
            # Difference table starts as a copy of the newer table, named <old name>-<new number>
            diftable = db.copyTable(
                newtable, '%s-%s' %
                (oldtable.name(), string.split(newtable.name(), '.')[-1]))
            diftable.keepFields(dfields + [tabindex])
            diftable.addField('Status')
        except:
            # NOTE(review): logged without re-raising, so the code below then fails on the unbound
            # newtable/oldtable/diftable names and is reported by the outer handler as well.
            self.errorLog('Cannot generate differences for %s and %s' %
                          (table1, table2))
        ### ~ [2] Process ~ ###
        self.printLog(
            '#NEW',
            '%s tracks in new iTunes export.' % rje.iStr(newtable.entryNum()))
        self.printLog(
            '#OLD',
            '%s tracks in old iTunes export.' % rje.iStr(oldtable.entryNum()))
        oldfiles = oldtable.datakeys()[0:]  # Copy of old table keys; matched ones are removed below
        for entry in diftable.entries():
            ## ~ [2a] Find pair of entries ~ ##
            if entry['Location'] in oldfiles:
                oldentry = oldtable.data(entry['Location'])
            elif entry[tabindex] in oldtable.index(tabindex):
                oldentry = oldtable.indexEntries(tabindex, entry[tabindex])[0]
                if len(oldtable.indexEntries(tabindex, entry[tabindex])) == 1:
                    pass
                else:
                    self.printLog(
                        '#DUP', 'Duplicate entries for %s' % entry[tabindex])
                    # Prefer a duplicate that has not yet been paired with a new entry
                    for ientry in oldtable.indexEntries(
                            tabindex, entry[tabindex]):
                        if ientry['Location'] in oldfiles:
                            oldentry = ientry
                            break
            # NOTE(review): this else is read as belonging to the Location/index test above - confirm.
            else:
                oldentry = None
            #self.deBug(entry)
            #self.deBug(oldentry)
            ## ~ [2b] Generate Differences ~ ##
            if not oldentry:
                entry['Status'] = 'New'
                continue
            #self.deBug(oldentry['Location'] in oldfiles)
            if oldentry['Location'] in oldfiles:
                oldfiles.remove(oldentry['Location'])
            #self.deBug(len(oldfiles))
            changed = False
            for field in ['Plays', 'Skips', 'My Rating']:
                if entry[field] != oldentry[field]: changed = True
                # NOTE(review): the subtraction is read as applying to every field, not only changed
                # ones - confirm against the intended nesting.
                try:
                    entry[field] -= oldentry[field]
                except:
                    pass  # Keep new value - probably empty in old entry
            if changed: entry['Status'] = 'Changed'
            else: entry['Status'] = 'Unchanged'
        ### ~ [3] Add missing old entries ~ ###
        reportdel = rje.yesNo('Report deleted %s tracks?' % diftable.name())
        # Whatever is left in oldfiles was never paired with an entry of the new table
        for old in oldfiles:
            entry = diftable.addEntry(oldtable.data(old))
            entry['Status'] = 'Deleted'
            if reportdel:
                self.printLog(
                    '#DEL', '%s: %s [%s]' %
                    (entry['Artist'], entry['Name'], entry['Album']))
        ### ~ [4] Finish ~ ###
        for status in rje.sortKeys(diftable.index('Status')):
            self.printLog(
                '#STAT', '%s: %d tracks' %
                (status.upper(), len(diftable.index('Status')[status])))
        self.printLog('#TRACK',
                      '%s tracks in total' % rje.iStr(diftable.entryNum()))
        self.deBug('?')
        # Remove the temporary compound index field from all three tables before saving
        for table in [table1, table2, diftable]:
            table.dropField(tabindex)
        diftable.saveToFile('%s.tdt' % diftable.name())
    except:
        self.errorLog('%s.difference() error' % self)
def pairwiseAQ(self, seqlist=None, query=None, focus=[0, 0]):  ### Performs PAQ on seqlist, adding seq.info['PAQ']
    '''
    Performs PAQ on seqlist, adding seq.info['PAQ'].
    >> seqlist:rje_seq.SeqList Object
    - NB. This object will itself have sequences removed from it, so beware!
    - A new info key will be added: PAQ = PAQ sequences with alignment Xs
    >> query:Sequence object to use as the query. If not given, seqlist.obj['QuerySeq'] is used; if
       self.opt['NoQuery'] is set (or there is still no query) a random sequence is picked instead.
    >> focus:list of range positions [X:Y] to look at. If Y=0 then [X:].
    - focus is only forwarded to seqlist.mapX() here; this method itself never modifies it.
    << Nothing is returned. On exit seq.info['Sequence'] holds the unmasked sequence again and
       seq.info['PAQ'] holds the X-masked version. Any exception is caught and reported through
       self.log.errorLog() together with the stage label reached.
    '''
    # NOTE(review): statement nesting was reconstructed from a whitespace-collapsed copy of this
    # method - please confirm against the canonical rje_haq.py before merging.
    ### <PAQ0> ### Setup
    try:
        _stage = '<0> Setup'
        haqlist = seqlist  # SeqList Object to store individually Xd sequences
        if not query:
            query = haqlist.obj['QuerySeq']
        if self.opt['NoQuery'] or not query:
            query = haqlist.seq[random.randint(0, haqlist.seqNum() - 1)]
            self.log.printLog('#QRY', 'Temp (random) query %s assigned for PAQ' % query.shortName())
        #!# paqx = [False] * seqlist.seq[0].seqLen()  # List of whether a column of the alignment is bad (has an X) [True] or not [False]
        #!# - make this a method?!
        pwaq = {}  # Dictionary of lists of pairwise alignments, keyed by (seq, otherseq)
        block_align = {}  # Dictionary of whether residue in block of sequence that is well-aligned or not
        for seq in haqlist.seq:
            block_align[seq] = [False] * seq.seqLen()
            seq.info['PAQ'] = seq.info['Sequence'][0:]  # Back-up: swapped back in at <PAQ5>
            if 'SAQX' in seq.info and len(seq.info['SAQX']) == seq.seqLen():
                #!# Should no longer be issues due to length changes following realignment
                seq.info['Sequence'] = seq.info['SAQX'][0:]
            elif 'SAQX' in seq.info:
                self.log.errorLog('Cannot use SAQX for %s in PAQ as wrong length.' % seq.shortName(),
                                  printerror=False)
            for otherseq in haqlist.seq:
                pwaq[(seq, otherseq)] = [False] * seq.seqLen()

        ### <PAQ1> ### Directional Pairwise Comparisons of sequences
        _stage = '<1> Pairwise Comparisons'
        infotxt = 'PAQ%d: Pairwise Comparisons ...' % self.stat['PAQCyc']
        seqx = haqlist.seqNum()
        for (si, seq) in enumerate(haqlist.seq):
            for (oi, otherseq) in enumerate(haqlist.seq):
                myinfo = '%s %.1f%% %.1f%% ' % (infotxt, (100.0 * si / seqx), (100.0 * oi / seqx))
                self.log.printLog('\r#PAQ', myinfo, log=False, newline=False)
                for r in range(seq.seqLen()):
                    ar = seq.info['Sequence'][r]
                    ## <i> ## Look for PW aligned block
                    _stage = '<1-i> Pairwise Comparisons'
                    if ar not in ['-', 'X']:  # Start of test block
                        blen = 0  # Block length (PAQBlock) = AAs
                        win = 0  # Window length = all sequence
                        matchx = 0  # Score for residues in window
                        # This time we allow overshoots in both directions
                        while blen < self.stat['PAQBlock'] and (r + win) < seq.seqLen():
                            ar = seq.info['Sequence'][r + win]
                            at = otherseq.info['Sequence'][r + win]
                            if 'X' in [ar, at]:  # Hit Bad Region: Abort
                                break
                            if ar != '-':  # Gaps extend the window but not the block
                                blen += 1
                                matchx += self._saqCon(ar, at)
                            win += 1
                        ## <ii> ## Update pwaq if block good
                        _stage = '<1-ii> Pairwise Comparisons'
                        if matchx >= self.stat['PAQMatch']:
                            for w in range(win):
                                pwaq[(seq, otherseq)][r + w] = seq.info['Sequence'][r + w] not in ['-', 'X']
        # Literal percent signs must be doubled: the original '100.0% 100.0%.' was parsed as a
        # conversion specifier and garbled the final progress message.
        self.log.printLog('\r#PAQ', '%s 100.0%% 100.0%%. ' % infotxt, log=False)

        ### <PAQ2> ### Link back to Query
        _stage = '<2> Linking to Query'
        ### <PAQ2a> ### Network of Pairwise Quality alignments
        _stage = '<2a> Linking to Query'
        infotxt = 'PAQ%d: Linking Residues to Query (%s) ...' % (self.stat['PAQCyc'], query.shortName())
        for r in range(query.seqLen()):
            _stage = '<2a> Linking to Query'
            self.log.printLog('\r#PAQ', '%s %.1f%%' % (infotxt, (100.0 * r / query.seqLen())),
                              log=False, newline=False)
            qok = {}  # Dictionary of whether residue in seq OK, i.e. linked to query
            for seq in haqlist.seq:
                qok[seq] = False
            qok[query] = True
            sok = [0, 1]  # Running counts of OK sequences for this column; loop until stable
            while sok[-2] != sok[-1]:
                ## <i> ## Match pairs, starting with query
                _stage = '<2a-i> Linking to Query'
                for seq in haqlist.seq:
                    if qok[seq]:
                        for otherseq in haqlist.seq:
                            if pwaq[(seq, otherseq)][r] or pwaq[(otherseq, seq)][r]:
                                qok[otherseq] = True
                ## <ii> ## Update sok
                _stage = '<2a-ii> Linking to Query'
                sok.append(0)
                for seq in haqlist.seq:
                    if qok[seq]:
                        sok[-1] += 1
                        block_align[seq][r] = True
            _stage = '<2a-iii> Linking to Query'
            if sok[-1] == 1:  # Only query OK!
                block_align[query][r] = False
        self.log.printLog('\r#PAQ', '%s 100.0%%' % infotxt, log=False)

        ### <PAQ2b> ### Allow for divergence (Conserved Anchors)
        _stage = '<2b> Anchors'
        if self.opt['Anchors']:
            infotxt = 'PAQ%d: Accounting for divergence within aligned regions ...' % self.stat['PAQCyc']
            ## <i> ## Setup gapped list
            gapped = [False] * query.seqLen()  # Whether column of alignment is gapped
            for (si, seq) in enumerate(haqlist.seq):
                self.log.printLog('\r#PAQ', '%s %.1f%% ' % (infotxt, (50.0 * si / seqx)),
                                  log=False, newline=False)
                (start, end) = (0, seq.seqLen())
                # Skip terminal gaps. The start < end guards stop an all-gap sequence from
                # running off the end of the string (IndexError in the original).
                while start < end and seq.info['Sequence'][start] == '-':
                    start += 1
                while end > start and seq.info['Sequence'][end - 1] == '-':
                    end -= 1
                for r in range(start, end):
                    if seq.info['Sequence'][r] == '-':
                        gapped[r] = True
            ## <ii> ## Correction
            for (si, seq) in enumerate(haqlist.seq):
                self.log.printLog('\r#PAQ', '%s %.1f%% ' % (infotxt, (50 + (50.0 * si / seqx))),
                                  log=False, newline=False)
                for r in range(seq.seqLen()):
                    if block_align[seq][r] or gapped[r]:  # No need for correction
                        continue
                    # Move in both directions: if good residues (or sequence end) reached before gaps then reinstate
                    # NOTE(review): fok/bok start False and are never set True anywhere in this
                    # loop, so the "Reinstate" branch below can never run. Behaviour is kept as
                    # found; confirm the intended rule before enabling it.
                    winf = 0
                    fwd = True
                    fok = False
                    winb = 0
                    bwd = True
                    bok = False
                    while fwd or bwd:
                        # End of seqs
                        if (r + winf) >= seq.seqLen():
                            fwd = False
                        if (r - winb) < 0:
                            bwd = False
                        # Gaps/OK
                        if fwd:
                            if gapped[r + winf]:
                                fok = False
                                fwd = False
                            elif block_align[seq][r + winf]:
                                fwd = False
                            else:
                                winf += 1
                        if bwd:
                            if gapped[r - winb]:
                                bok = False
                                bwd = False
                            elif block_align[seq][r - winb]:
                                bwd = False
                            else:
                                winb += 1
                    if fok and bok:  # Reinstate
                        for w in range(r - winb, r + winf + 1):
                            block_align[seq][w] = True
            self.log.printLog('\r#PAQ', '%s 100.0%% ' % infotxt, log=False)

        ### <PAQ3> ### X out badly-aligned blocks
        _stage = '<3> Making bad sequence blocks'
        for seq in haqlist.seq:
            oldseq = seq.info['Sequence']
            masked = []
            for r in range(seq.seqLen()):
                if block_align[seq][r] or oldseq[r] == '-':
                    masked.append(oldseq[r])
                else:  # Bad residue
                    masked.append('X')
            seq.info['Sequence'] = ''.join(masked)
        #!# Add saving of data in 'datafull' option

        ### <PAQ4> ### Remove sequences and/or badly-aligned regions
        _stage = '<4> Removing sequences/regions'
        self.verbose(0, 4, 'PAQ%d: Removing bad sequences and/or dodgy regions...' % self.stat['PAQCyc'], 0)
        ## <PAQ4a> ## Process Query first - only interested in good regions within query
        if self.opt['NoQuery']:  # No preprocessing of Query
            self.verbose(0, 4, 'no Master Query processing...', 0)
        else:
            haqlist.mapX(query, qtrim=True, focus=focus)  # Replaces other sequence ends and query X columns with Xs
            self.verbose(0, 4, 'Query (%s) processed...' % query.shortName(), 0)
        self.verbose(0, 3, '', 1)
        if self.opt['ManPAQ']:
            haqlist.saveFasta(seqfile='%s.manpaq.fas' % haqlist.info['Basefile'])

        ## <PAQ4b> ## Cycle through other sequences (worst first) until no more good residues are lost
        goodres = [0, self._getGood(haqlist.seq)]  # List of number of 'good' residues
        goodseq = [0, haqlist.seqNum()]
        while goodres[-1] != goodres[-2] or goodseq[-1] != goodseq[-2]:
            colgood = [0] * haqlist.seq[0].seqLen()  # Good residues per column
            for r in range(haqlist.seq[0].seqLen()):
                for seq in haqlist.seq:
                    if seq.info['Sequence'][r] not in ['-', 'X']:
                        colgood[r] += 1
            ## <i> ## Compare relative loss of masking and losing each sequence
            keepx = {}  # Dictionary of seq:number of lost residues if seq kept
            losex = {}  # Dictionary of seq:number of lost residues if seq lost
            badkx = -1  # Biggest loss if kept
            badlx = -1  # Biggest loss if lost
            bads = None  # Worst sequence
            for seq in haqlist.seq:
                if seq == query and self.opt['NoQuery'] == False:
                    continue  # The query itself is never a candidate for removal
                # Calculate keepx and losex
                keepx[seq] = 0
                for r in range(seq.seqLen()):
                    if seq.info['Sequence'][r] == 'X':
                        keepx[seq] += colgood[r]
                #?# In Perl HAQESAC there was an option to ignore Orphans in this calculation. Reinstate?
                losex[seq] = self._getGood([seq])
                # Update bads if worse
                if keepx[seq] > badkx:
                    badkx = keepx[seq]
                    badlx = losex[seq]
                    bads = seq
                elif keepx[seq] == badkx and losex[seq] < badlx:
                    badlx = losex[seq]
                    bads = seq
            ## <ii> ## Remove bad sequences and/or regions
            if badkx > 0:
                lose_seq = badkx * self.stat['PAQKeepLen'] > badlx * self.stat['PAQKeepSeq']
                kept_txt = rje.integerString(badkx)
                lost_txt = rje.integerString(badlx)
                worst_txt = '%s worst: -%s aa if kept vs -%s aa if lost.' % (bads.shortName(), kept_txt, lost_txt)
                if self.opt['ManPAQ']:  # User decides; automatic verdict is only the default answer
                    default = 'N'
                    if lose_seq:
                        default = 'Y'
                    remove = rje.yesNo('%s Remove?' % worst_txt, default)
                    reason = 'PAQ%d: -%s aa if kept vs -%s aa if lost. (Manual decision.)' % (
                        self.stat['PAQCyc'], kept_txt, lost_txt)
                else:
                    self.verbose(1, 3, worst_txt, 1)
                    #!# Add option for upweighting certain sequence type? (e.g. vs fragment or hypothetical?)
                    remove = lose_seq
                    reason = 'PAQ%d: -%s aa if kept vs -%s aa if lost.' % (self.stat['PAQCyc'], kept_txt, lost_txt)
                if remove:  # Lose sequence! (haqlist is the same object as seqlist)
                    haqlist.removeSeq(text=reason, seq=bads)
                else:  # X out
                    haqlist.mapX(bads)
            ### <iii> ### Recalculate goodres
            goodres.append(self._getGood(haqlist.seq))
            goodseq.append(haqlist.seqNum())
            self.verbose(1, 3, '%d -> %d "good" aa' % (goodres[-2], goodres[-1]), 1)

        ### <PAQ5> ### Reinstate UnX'd sequence:
        _stage = '<5> Replacing sequences'
        for seq in haqlist.seq:
            (seq.info['PAQ'], seq.info['Sequence']) = (seq.info['Sequence'], seq.info['PAQ'])
        if self.opt['ManPAQ'] and rje.checkForFile('%s.manpaq.fas' % haqlist.info['Basefile']):
            os.unlink('%s.manpaq.fas' % haqlist.info['Basefile'])

    except:
        self.log.errorLog('rje_haq.py ~ Problem with pairwiseAQ %s.' % _stage, True)
def multiHAQ(self, secondrun=False):  ### Executes main HAQESAC runs
    '''
    Executes main HAQESAC runs.
    >> secondrun:bool [False] = Whether this is the second pass. The pass counts as the final
       (manual) phase when secondrun equals self.opt['MultiHAQ'].
    << Nothing is returned. After a non-final pass the method calls itself with secondrun=True.
       Any exception (including the KeyboardInterrupt raised on a user abort) is caught and
       reported through self.errorLog(..., quitchoice=True).
    '''
    # NOTE(review): statement nesting was reconstructed from a whitespace-collapsed copy of this
    # method - please confirm against the canonical source before merging.
    try:
        ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        finalrun = secondrun == self.opt['MultiHAQ']  # Whether this is the manual HAQESAC phase
        qryacc = self.obj['SeqList'].accList()  # Full list of Query accession numbers
        processed = []  # List of processed sequence accession numbers
        ### ~ [1] Perform HAQESAC runs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for seq in self.seqs():
            ## ~ [1a] Check AutoSkip ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            acc = seq.info['AccNum']
            if finalrun and acc in processed and (
                    self.opt['AutoSkip'] or
                    (self.i() >= 0 and rje.yesNo('%s already covered by previous HAQESAC. Skip?' % seq.shortName()))):
                self.printLog('#SKIP', '%s already covered by previous HAQESAC: Skipped' % seq.shortName())
                continue
            ## ~ [1b] Check Whether to run (re-runs and low sequence number) ~~~~~~~~~~~~~~~~~~ ##
            # logfile is only referenced by the commented-out re-run check further down.
            logfile = rje.makePath('%s%s.log' % (self.info['HaqDir'], acc), wholepath=True)
            infile = rje.makePath('%s%s.fas' % (self.info['HaqDir'], acc), wholepath=True)
            pkfile = rje.makePath('%s%s.pickle' % (self.info['HaqDir'], acc), wholepath=True)
            pkzfile = rje.makePath('%s%s.pickle.gz' % (self.info['HaqDir'], acc), wholepath=True)
            if not os.path.exists(infile):
                self.printLog('#SKIP', '%s input file %s not found: Skipped' % (seq.shortName(), infile))
                continue
            # First pass only: skip when a (gzipped) pickle is reported younger than the input.
            if not finalrun and not self.opt['Force'] and rje.isYounger(pkzfile, infile) == pkzfile:
                self.printLog('#SKIP', '%s run detected: Skipped' % seq.shortName())
                continue
            if not finalrun and not self.opt['Force'] and rje.isYounger(pkfile, infile) == pkfile:
                self.printLog('#SKIP', '%s run detected: Skipped' % seq.shortName())
                continue
            inseqx = rje_seq.SeqCount(self, infile)
            if inseqx < 2:
                self.printLog('#SKIP', 'Only one sequence found in %s: Skipped' % (infile))
                continue
            ## ~ [1c] Pause if running in Chaser Mode and no Pickle ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            pickled = os.path.exists(pkfile) or os.path.exists('%s.gz' % pkfile)
            tm = 0  # Minutes to sleep; grows by one each time round
            while secondrun and self.opt['Chaser'] and not pickled:
                self.progLog('#WAIT', 'No %s pickle. Sleeping for %d min.' % (acc, tm))
                time.sleep(60 * tm)
                tm += 1
                pickled = os.path.exists(pkfile) or os.path.exists('%s.gz' % pkfile)
                if not pickled:
                    try:
                        rje.choice('Press <ENTER> to try again, or <CTRL+C> to Quit')
                    except:  # Deliberately bare: <CTRL+C> (KeyboardInterrupt) is the documented way out
                        # Was "% (acc, tm)": two values for one %s raised TypeError here and
                        # masked the clean exit with a "Major problem" error.
                        self.printLog('#PICKLE', 'No %s pickle.' % acc)
                        self.printLog('\r#MULTI', 'Exiting multiHAQ "Chaser" run.')
                        return
            ## ~ [1d] Run HAQESAC ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            runhaqesac = True
            pngfile = rje.makePath('%s%s.png' % (self.info['HaqDir'], acc), wholepath=True)
            if not self.force() and rje.exists(pngfile):
                self.printLog('#SKIP', 'Found evidence of completed run: %s (force=F). Skipping.' % pngfile)
                runhaqesac = False
            ancfile = rje.makePath('%s%s.anc.fas' % (self.info['HaqDir'], acc), wholepath=True)
            if not self.force() and rje.exists(ancfile):
                self.printLog('#SKIP', 'Found evidence of completed run: %s (force=F). Skipping.' % ancfile)
                runhaqesac = False
            #if not finalrun or self.opt['Force'] or rje.isYounger(logfile,nsfile) != logfile:
            if runhaqesac:
                haqcmd = ['ini=haqesac.ini', 'seqin=%s.fas' % acc, 'query=%s' % acc, 'basefile=%s' % acc,
                          'newlog=F']
                self.printLog('#HAQ', 'Running HAQESAC for %s - will have own log etc.' % seq.shortName(),
                              log=False)
                # File names in haqcmd are relative, so run from HaqDir. The finally clause makes
                # sure we return to RunPath even if the HAQESAC setup itself raises.
                os.chdir(self.info['HaqDir'])
                try:
                    info = haqesac.makeInfo()
                    haqcmd = rje.getCmdList(haqcmd, info=info)
                    out = rje.Out(cmd_list=haqcmd)  # Sets up Out object for controlling output to screen
                    out.printIntro(info)  # Prints intro text using details from Info object
                    haqlog = rje.setLog(info, out, haqcmd)  # Sets up Log object for controlling log file output
                    try:
                        haqesac.HAQESAC(log=haqlog, cmd_list=haqcmd).run(setobjects=True)
                    except:
                        os.chdir(self.info['RunPath'])  # Back in RunPath before prompting, as before
                        if self.i() >= 0 and rje.yesNo('Problem with %s HAQESAC run. Abort?' % seq.shortName()):
                            raise KeyboardInterrupt
                finally:
                    os.chdir(self.info['RunPath'])
                if finalrun:
                    self.printLog('#HAQ', 'HAQESAC final round run for %s' % seq.shortName())
                else:
                    self.printLog('#HAQ', 'HAQESAC first round run for %s' % seq.shortName())
            ## ~ [1e] Update ScreenQry ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if not self.opt['ScreenQry'] or not finalrun:
                continue
            qacclist = []  # Other query accession numbers found in this input file
            for qacc in rje_seq.SeqList(self.log, ['seqin=%s' % infile, 'autoload=T', 'autofilter=F']).accList():
                if qacc not in qryacc:
                    continue
                if qacc != acc:
                    qacclist.append(qacc)
                if qacc not in processed:
                    processed.append(qacc)
            self.printLog('#QRY', '%d other queries found in %s: [%s]' % (len(qacclist), infile, '; '.join(qacclist)))
            self.printLog('#QRY', '%d of %d queries processed' % (len(processed), self.seqNum()))
        ### ~ [2] MultiHAQ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not finalrun:
            self.printLog('#MULTI', 'Executing second round of multiHAQ')
            self.multiHAQ(True)
    except:
        self.errorLog('Major problem with MultiHAQ.multiHAQ', quitchoice=True)
def singleSeqAQ(self, seqlist, focus=[0, -1]):  ### Performs SAQ on seqlist, adding seq.info['SAQ']
    '''
    Performs SAQ on seqlist, adding seq.info['SAQ'].
    >> seqlist:rje_seq.SeqList Object
    - NB. This object will itself have sequences removed from it, so beware!
    - A new info key will be added: SAQX = SAQ sequences with individual Xs
    - A new info key will be added: SAQ = SAQ sequences with alignment Xs
    >> focus:list of range positions [X:Y] to look at. If Y=0 then [X:].
    - focus is only forwarded to seqlist.mapX() here; this method itself never modifies it.
    << Nothing is returned. On exit seq.info['Sequence'] holds the unmasked sequence again and
       seq.info['SAQ'] holds the masked version. Any exception is caught and reported through
       self.log.errorLog() together with the stage label reached.
    '''
    # NOTE(review): statement nesting was reconstructed from a whitespace-collapsed copy of this
    # method - please confirm against the canonical rje_haq.py before merging.
    ### <SAQ1> ### Setup
    try:
        _stage = '<1> Setup'
        haqlist = seqlist  # SeqList Object to store individually Xd sequences
        query = haqlist.obj['QuerySeq']
        if self.opt['NoQuery']:
            query = None
        badres = [-1, 0]  # List of how many bad residues in total dataset
        block_align = {}  # Dictionary of whether residue in block of sequence that is well-aligned or not
        res_align = {}  # Dictionary of whether residue of sequence is well-aligned or not
        res_gap = {}  # Dictionary of whether residue of sequence is a gap or not
        gap_align = {}  # Dictionary of whether residue of sequence is a gap in a well-aligned block or not
        for seq in haqlist.seq:
            seq.info['SAQ'] = seq.info['Sequence'][0:]  # Note! Sequence is modified and SAQ not, then they are swapped at end!
            block_align[seq] = [False] * seq.seqLen()
            res_align[seq] = [False] * seq.seqLen()
            res_gap[seq] = [False] * seq.seqLen()
            gap_align[seq] = [False] * seq.seqLen()

        ### <SAQ2> ### Repeated cycles of defining well- and badly-aligned blocks
        #X#self.deBug(self.stat)
        _stage = '<2> BlockID'
        while badres[-1] != badres[-2]:  # Change in number of bad residues
            total_res = 0
            badres.append(0)  # badres[-1] is the current number of bad residues
            firstcycle = len(badres) == 3  # First cycle considers every residue, not just good blocks
            infotxt = 'SAQ%d-%d: Calculating "bad" residues ...' % (self.stat['SAQCyc'], len(badres) - 2)
            seqx = haqlist.seqNum()
            for (si, seq) in enumerate(haqlist.seq):
                myinfo = '%s %.1f%%' % (infotxt, (100.0 * si / seqx))
                self.log.printLog('\r#SAQ', myinfo, log=False, newline=False)
                ## <SAQ2a> ## For each sequence, mark residues as aligned or gapped
                _stage = '<2a> Mark Residues'
                for r in range(seq.seqLen()):
                    gap_align[seq][r] = False
                    res_align[seq][r] = False
                    # After first cycle, look only at well-aligned blocks
                    # (well-aligned for sequence not whole alignment)
                    if not (block_align[seq][r] or firstcycle):
                        continue
                    a = seq.info['Sequence'][r]
                    res_gap[seq][r] = False
                    if a == '-':
                        res_gap[seq][r] = True
                        gap_align[seq][r] = True
                        continue
                    # 'X' handled by self._saqCon
                    conx = 0  # Matches with good regions of other sequences (self is skipped)
                    for otherseq in haqlist.seq:
                        if otherseq == seq:  # > so self not counted!
                            continue
                        if len(otherseq.info['Sequence']) != len(seq.info['Sequence']):
                            self.log.errorLog('Sequence lengths do not match - should be aligned!',
                                              printerror=False)
                            raise ValueError
                        if block_align[otherseq][r] or firstcycle:
                            conx += self._saqCon(a, otherseq.info['Sequence'][r])
                    if conx >= self.stat['SAQCon']:
                        res_align[seq][r] = True
                ## <SAQ2b> ## Marked regions of well-aligned residues for each sequence
                _stage = '<2b> Mark Regions'
                ## <i> ## Clear first
                _stage = '<2b-i> Mark Regions'
                for r in range(seq.seqLen()):
                    block_align[seq][r] = False
                ## <ii> ## Recalculate
                _stage = '<2b-ii> Mark Regions'
                for r in range(seq.seqLen()):
                    _stage = '<2b-ii> Blocks'
                    if not res_align[seq][r]:
                        continue
                    # Start of potential block
                    blen = 0  # Block length (SAQBlock) = AAs
                    win = 0  # Window length = all sequence
                    matchx = 1  # Good residues in window (first residue must be good!) (SAQMatch)
                    while blen < self.stat['SAQBlock'] and matchx < self.stat['SAQMatch']:
                        win += 1
                        if (r + win) >= seq.seqLen() or seq.info['Sequence'][r + win] == 'X':
                            break  # Hit Bad Region: Abort
                        if gap_align[seq][r + win]:  # Decent gap: widens window only
                            continue
                        blen += 1  # Increase Block
                        if res_align[seq][r + win]:  # Good residue
                            matchx += 1
                    if matchx >= self.stat['SAQMatch']:
                        for w in range(win + 1):
                            block_align[seq][r + w] = True
                ## <iii> ## Update bad residue count
                for r in range(seq.seqLen()):
                    _stage = '<2b-iii> Mark Regions'
                    if not block_align[seq][r] and not res_gap[seq][r]:  # Bad residue
                        badres[-1] += 1
                    if not res_gap[seq][r]:
                        total_res += 1
            myinfo = '%s 100.0%%' % infotxt
            myinfo += ' => %s bad of %s total residues' % (rje.integerString(badres[-1]),
                                                          rje.integerString(total_res))
            self.log.printLog('\r#SAQ', myinfo)
            if badres[-1] == total_res:
                self.log.errorLog('All residues marked as bad in SAQ!', printerror=False, quitchoice=True)
        # Now have all residues in all sequences marked as good (block_align=True) or bad (block_align=False)

        ### <SAQ3> ### X out badly-aligned blocks
        _stage = '<3> X-Out'
        self.log.printLog('#SAQ', 'SAQ%d-%d: Masking "bad" residues ...' % (self.stat['SAQCyc'], len(badres) - 2),
                          log=False, newline=False)
        for seq in haqlist.seq:
            oldseq = seq.info['Sequence']
            masked = []
            for r in range(seq.seqLen()):
                if block_align[seq][r] or oldseq[r] == '-':  #!# Was backwards? res_gap[seq][r] == False:
                    masked.append(oldseq[r])
                else:  # Bad residue
                    masked.append('X')
            newseq = ''.join(masked)
            seq.info['Sequence'] = newseq[0:]
            seq.info['SAQX'] = newseq[0:]  # Stores Xd sequences for individuals for use in PAQ
        #!# Add saving of data in 'datafull' option

        ### <SAQ4> ### Remove sequences and/or badly-aligned regions
        _stage = '<4> Removal'
        self.log.printLog('\r#SAQ', 'SAQ%d-%d: Removing bad sequences and/or dodgy regions...' %
                          (self.stat['SAQCyc'], len(badres) - 2), log=False, newline=False)
        ## <SAQ4a> ## Process Query first - only interested in good regions within query
        _stage = '<4a> Query Removal'
        if self.opt['NoQuery'] or query is None:  # No preprocessing of Query
            self.verbose(0, 4, 'no Master Query processing...', 0)
        else:
            haqlist.mapX(query, qtrim=True, focus=focus)  # Replaces other sequence ends and query X columns with Xs
            self.verbose(0, 4, 'Query (%s) processed...' % query.shortName(), 0)
        self.verbose(0, 3, '', 1)
        if self.opt['ManSAQ']:
            haqlist.saveFasta(seqfile='%s.mansaq.fas' % haqlist.info['Basefile'])

        ## <SAQ4b> ## Cycle through other sequences (worst first) until no more good residues or sequences are lost
        _stage = '<4b> Seq Removal'
        goodres = [0, self._getGood(haqlist.seq)]  # List of number of 'good' residues
        goodseq = [0, haqlist.seqNum()]
        while goodres[-1] != goodres[-2] or goodseq[-1] != goodseq[-2]:
            colgood = [0] * haqlist.seq[0].seqLen()  # Good residues per column
            for r in range(haqlist.seq[0].seqLen()):
                for seq in haqlist.seq:
                    if seq.info['Sequence'][r] not in ['-', 'X']:
                        colgood[r] += 1
            ## <i> ## Compare relative loss of masking and losing each sequence
            keepx = {}  # Dictionary of seq:number of lost residues if seq kept
            losex = {}  # Dictionary of seq:number of lost residues if seq lost
            badkx = -1  # Biggest loss if kept
            badlx = -1  # Biggest loss if lost
            bads = None  # Worst sequence
            for seq in haqlist.seq:
                if seq == query and self.opt['NoQuery'] == False:
                    continue  # The query itself is never a candidate for removal
                # Calculate keepx and losex
                keepx[seq] = 0
                for r in range(seq.seqLen()):
                    if seq.info['Sequence'][r] == 'X':
                        keepx[seq] += colgood[r]
                losex[seq] = self._getGood([seq])
                # Update bads if worse
                if keepx[seq] > badkx:
                    badkx = keepx[seq]
                    badlx = losex[seq]
                    bads = seq
                elif keepx[seq] == badkx and losex[seq] < badlx:
                    badlx = losex[seq]
                    bads = seq
            ## <ii> ## Remove bad sequences and/or regions
            if badkx > 0:
                lose_seq = badkx * self.stat['SAQKeepLen'] > badlx * self.stat['SAQKeepSeq']
                kept_txt = rje.integerString(badkx)
                lost_txt = rje.integerString(badlx)
                worst_txt = '%s worst: -%s aa if kept vs -%s aa if lost.' % (bads.shortName(), kept_txt, lost_txt)
                if self.opt['ManSAQ']:  # User decides; automatic verdict is only the default answer
                    default = 'N'
                    if lose_seq:
                        default = 'Y'
                    remove = rje.yesNo('%s Remove?' % worst_txt, default)
                    reason = 'SAQ%d: -%s aa if kept vs -%s aa if lost. (Manual decision.)' % (
                        self.stat['SAQCyc'], kept_txt, lost_txt)
                else:
                    self.verbose(1, 3, worst_txt, 1)
                    #!# Add option for upweighting certain sequence type? (e.g. vs fragment or hypothetical?)
                    remove = lose_seq
                    reason = 'SAQ%d: -%s aa if kept vs -%s aa if lost.' % (self.stat['SAQCyc'], kept_txt, lost_txt)
                if remove:  # Lose sequence! (haqlist is the same object as seqlist)
                    haqlist.removeSeq(text=reason, seq=bads)
                else:  # X out
                    haqlist.mapX(bads)
            ### <iii> ### Recalculate goodres
            goodres.append(self._getGood(haqlist.seq))
            goodseq.append(haqlist.seqNum())
            #X#self.verbose(1,3,'%d -> %d "good" aa' % (goodres[-2],goodres[-1]),1)

        ### <SAQ5> ### Reinstate UnX'd sequence:
        # Stage label was left as '<4b> Seq Removal', so errors here were reported against the
        # wrong stage.
        _stage = '<5> Replacing sequences'
        for seq in haqlist.seq:
            (seq.info['SAQ'], seq.info['Sequence']) = (seq.info['Sequence'], seq.info['SAQ'])
        if self.opt['ManSAQ'] and rje.checkForFile('%s.mansaq.fas' % haqlist.info['Basefile']):
            os.unlink('%s.mansaq.fas' % haqlist.info['Basefile'])

    except:
        self.log.errorLog('Problem with singleSeqAQ() %s.' % _stage, quitchoice=True)
def singleSeqAQ(self,seqlist,focus=[0,-1]):     ### Performs SAQ on seqlist, adding seq.info['SAQ']
    '''
    Performs SAQ on seqlist, adding seq.info['SAQ'].
    >> seqlist:rje_seq.SeqList Object
    - NB. This object will itself have sequences removed from it, so beware!
    - A new info key will be added: SAQX = SAQ sequences with individual Xs
    - A new info key will be added: SAQ = SAQ sequences with alignment Xs
    >> focus:list of range positions [X:Y] to look at. If Y=0 then [X:].
    - focus is only forwarded to seqlist.mapX() here; this method itself never modifies it.
    << Nothing is returned. On exit seq.info['Sequence'] holds the unmasked sequence again and
       seq.info['SAQ'] holds the masked version. Any exception is caught and reported through
       self.log.errorLog() together with the stage label reached.
    '''
    # NOTE(review): statement nesting was reconstructed from a whitespace-collapsed copy of this
    # method - please confirm against the canonical rje_haq.py before merging.
    ### <SAQ1> ### Setup
    try:
        _stage = '<1> Setup'
        haqlist = seqlist   # SeqList Object to store individually Xd sequences
        query = haqlist.obj['QuerySeq']
        if self.opt['NoQuery']: query = None
        badres = [-1,0]     # List of how many bad residues in total dataset
        block_align = {}    # Dictionary of whether residue in block of sequence that is well-aligned or not
        res_align = {}      # Dictionary of whether residue of sequence is well-aligned or not
        res_gap = {}        # Dictionary of whether residue of sequence is a gap or not
        gap_align = {}      # Dictionary of whether residue of sequence is a gap in a well-aligned block or not
        for seq in haqlist.seq:
            seq.info['SAQ'] = seq.info['Sequence'][0:]  # Note! Sequence is modified and SAQ not, then they are swapped at end!
            block_align[seq] = [False] * seq.seqLen()
            res_align[seq] = [False] * seq.seqLen()
            res_gap[seq] = [False] * seq.seqLen()
            gap_align[seq] = [False] * seq.seqLen()

        ### <SAQ2> ### Repeated cycles of defining well- and badly-aligned blocks
        #X#self.deBug(self.stat)
        _stage = '<2> BlockID'
        while badres[-1] != badres[-2]:     # Change in number of bad residues
            total_res = 0
            badres.append(0)    # badres[-1] is the current number of bad residues
            cycle1 = len(badres) == 3   # First cycle considers every residue, not just good blocks
            infotxt = 'SAQ%d-%d: Calculating "bad" residues ...' % (self.stat['SAQCyc'],len(badres)-2)
            seqtot = haqlist.seqNum()
            for (sx,seq) in enumerate(haqlist.seq):
                myinfo = '%s %.1f%%' % (infotxt,(100.0 * sx / seqtot))
                self.log.printLog('\r#SAQ',myinfo,log=False,newline=False)
                ## <SAQ2a> ## For each sequence, mark residues as aligned or gapped
                _stage = '<2a> Mark Residues'
                for r in range(seq.seqLen()):
                    gap_align[seq][r] = False
                    res_align[seq][r] = False
                    # After first cycle, look only at well-aligned blocks (well-aligned for sequence not whole alignment)
                    if block_align[seq][r] or cycle1:
                        a = seq.info['Sequence'][r]
                        res_gap[seq][r] = False
                        if a == '-':
                            res_gap[seq][r] = True
                            gap_align[seq][r] = True
                        else:   # 'X' handled by self._saqCon
                            conx = 0    # Matches with good regions of other sequences (self is skipped)
                            for otherseq in haqlist.seq:
                                if otherseq == seq: continue    # > so self not counted!
                                if len(otherseq.info['Sequence']) != len(seq.info['Sequence']):
                                    self.log.errorLog('Sequence lengths do not match - should be aligned!',printerror=False)
                                    raise ValueError
                                if block_align[otherseq][r] or cycle1:
                                    conx += self._saqCon(a, otherseq.info['Sequence'][r])
                            if conx >= self.stat['SAQCon']: res_align[seq][r] = True
                ## <SAQ2b> ## Marked regions of well-aligned residues for each sequence
                _stage = '<2b> Mark Regions'
                ## <i> ## Clear first
                _stage = '<2b-i> Mark Regions'
                for r in range(seq.seqLen()): block_align[seq][r] = False
                ## <ii> ## Recalculate
                _stage = '<2b-ii> Mark Regions'
                for r in range(seq.seqLen()):
                    _stage = '<2b-ii> Blocks'
                    if res_align[seq][r]:   # Start of potential block
                        blen = 0    # Block length (SAQBlock) = AAs
                        win = 0     # Window length = all sequence
                        matchx = 1  # Good residues in window (first residue must be good!) (SAQMatch)
                        while blen < self.stat['SAQBlock'] and matchx < self.stat['SAQMatch']:
                            win += 1
                            if (r + win) >= seq.seqLen() or seq.info['Sequence'][r+win] == 'X':     # Hit Bad Region: Abort
                                break
                            if gap_align[seq][r+win]: continue  # Decent gap: widens window only
                            blen += 1   # Increase Block
                            if res_align[seq][r+win]: matchx += 1   # Good residue
                        if matchx >= self.stat['SAQMatch']:
                            for w in range(win+1): block_align[seq][r+w] = True
                ## <iii> ## Update bad residue count
                for r in range(seq.seqLen()):
                    _stage = '<2b-iii> Mark Regions'
                    if not block_align[seq][r] and not res_gap[seq][r]: badres[-1] += 1     # Bad residue
                    if not res_gap[seq][r]: total_res += 1
            myinfo = '%s 100.0%%' % infotxt
            myinfo += ' => %s bad of %s total residues' % (rje.integerString(badres[-1]),rje.integerString(total_res))
            self.log.printLog('\r#SAQ',myinfo)
            if badres[-1] == total_res:
                self.log.errorLog('All residues marked as bad in SAQ!',printerror=False,quitchoice=True)
        # Now have all residues in all sequences marked as good (block_align=True) or bad (block_align=False)

        ### <SAQ3> ### X out badly-aligned blocks
        _stage = '<3> X-Out'
        self.log.printLog('#SAQ','SAQ%d-%d: Masking "bad" residues ...' % (self.stat['SAQCyc'],len(badres)-2),log=False,newline=False)
        for seq in haqlist.seq:
            fullseq = seq.info['Sequence']
            xseq = []
            for r in range(seq.seqLen()):
                if block_align[seq][r] or fullseq[r] == '-': xseq.append(fullseq[r])    #!# Was backwards? res_gap[seq][r] == False:
                else: xseq.append('X')  # Bad residue
            newseq = ''.join(xseq)
            seq.info['Sequence'] = newseq[0:]
            seq.info['SAQX'] = newseq[0:]   # Stores Xd sequences for individuals for use in PAQ
        #!# Add saving of data in 'datafull' option

        ### <SAQ4> ### Remove sequences and/or badly-aligned regions
        _stage = '<4> Removal'
        self.log.printLog('\r#SAQ','SAQ%d-%d: Removing bad sequences and/or dodgy regions...' % (self.stat['SAQCyc'],len(badres)-2),log=False,newline=False)
        ## <SAQ4a> ## Process Query first - only interested in good regions within query
        _stage = '<4a> Query Removal'
        if self.opt['NoQuery'] or query is None:    # No preprocessing of Query
            self.verbose(0,4,'no Master Query processing...',0)
        else:
            haqlist.mapX(query, qtrim=True, focus=focus)    # Replaces other sequence ends and query X columns with Xs
            self.verbose(0,4,'Query (%s) processed...' % query.shortName(),0)
        self.verbose(0,3,'',1)
        if self.opt['ManSAQ']:
            haqlist.saveFasta(seqfile='%s.mansaq.fas' % haqlist.info['Basefile'])

        ## <SAQ4b> ## Cycle through other sequences (worst first) until no more good residues or sequences are lost
        _stage = '<4b> Seq Removal'
        goodres = [0, self._getGood(haqlist.seq)]   # List of number of 'good' residues
        goodseq = [0, haqlist.seqNum()]
        while goodres[-1] != goodres[-2] or goodseq[-1] != goodseq[-2]:
            colgood = [0] * haqlist.seq[0].seqLen()     # Good residues per column
            for r in range(haqlist.seq[0].seqLen()):
                for seq in haqlist.seq:
                    if seq.info['Sequence'][r] not in ['-','X']: colgood[r] += 1
            ## <i> ## Compare relative loss of masking and losing each sequence
            keepx = {}  # Dictionary of seq:number of lost residues if seq kept
            losex = {}  # Dictionary of seq:number of lost residues if seq lost
            badkx = -1  # Biggest loss if kept
            badlx = -1  # Biggest loss if lost
            bads = None # Worst sequence
            for seq in haqlist.seq:
                if seq == query and self.opt['NoQuery'] == False: continue  # The query is never a candidate for removal
                # Calculate keepx and losex
                keepx[seq] = 0
                for r in range(seq.seqLen()):
                    if seq.info['Sequence'][r] == 'X': keepx[seq] += colgood[r]
                losex[seq] = self._getGood([seq])
                # Update bads if worse
                if keepx[seq] > badkx:
                    badkx = keepx[seq]
                    badlx = losex[seq]
                    bads = seq
                elif keepx[seq] == badkx and losex[seq] < badlx:
                    badlx = losex[seq]
                    bads = seq
            ## <ii> ## Remove bad sequences and/or regions
            if badkx > 0:
                dropseq = badkx * self.stat['SAQKeepLen'] > badlx * self.stat['SAQKeepSeq']
                ktxt = rje.integerString(badkx)
                ltxt = rje.integerString(badlx)
                worst = '%s worst: -%s aa if kept vs -%s aa if lost.' % (bads.shortName(),ktxt,ltxt)
                if self.opt['ManSAQ']:  # User decides; automatic verdict is only the default answer
                    default = 'N'
                    if dropseq: default = 'Y'
                    remove = rje.yesNo('%s Remove?' % worst,default)
                    why = 'SAQ%d: -%s aa if kept vs -%s aa if lost. (Manual decision.)' % (self.stat['SAQCyc'],ktxt,ltxt)
                else:
                    self.verbose(1,3,worst,1)
                    #!# Add option for upweighting certain sequence type? (e.g. vs fragment or hypothetical?)
                    remove = dropseq
                    why = 'SAQ%d: -%s aa if kept vs -%s aa if lost.' % (self.stat['SAQCyc'],ktxt,ltxt)
                if remove: haqlist.removeSeq(text=why,seq=bads)     # Lose sequence! (haqlist is the same object as seqlist)
                else: haqlist.mapX(bads)    # X out
            ### <iii> ### Recalculate goodres
            goodres.append(self._getGood(haqlist.seq))
            goodseq.append(haqlist.seqNum())
            #X#self.verbose(1,3,'%d -> %d "good" aa' % (goodres[-2],goodres[-1]),1)

        ### <SAQ5> ### Reinstate UnX'd sequence:
        # Stage label was left as '<4b> Seq Removal', so errors here were reported against the wrong stage.
        _stage = '<5> Replacing sequences'
        for seq in haqlist.seq:
            (seq.info['SAQ'],seq.info['Sequence']) = (seq.info['Sequence'],seq.info['SAQ'])
        if self.opt['ManSAQ'] and rje.checkForFile('%s.mansaq.fas' % haqlist.info['Basefile']):
            os.unlink('%s.mansaq.fas' % haqlist.info['Basefile'])

    except:
        self.log.errorLog('Problem with singleSeqAQ() %s.' % _stage, quitchoice=True)
def difference(self,table1,table2):     ### Generates differences as new table
    '''
    Generates the differences between two iTunes export tables as a new table and saves it to "<name>.tdt".
    >> table1:Table = iTunes database table to compare
    >> table2:Table = iTunes database table to compare
    << returns None. Problems are reported via self.errorLog() rather than raised.
    The text after the last "." of each table name is read as an integer; the table with the larger value is
    treated as the newer export. Each entry of the new table is given a 'Status' of New, Changed or Unchanged and
    entries only found in the old table are added with Status 'Deleted'. For matched entries, 'Plays', 'Skips'
    and 'My Rating' are replaced by (new - old) where that subtraction is possible.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        dfields = ['Name','Artist','Composer','Album','Album_Artist','Genre','Time','Disc Number','Disc Count','Track Number','Track Count','Year','Date Added','Plays','Last Played','Skips','Last Skipped','My Rating','Location','Tracks','Score']
        db = self.db()
        tabindex = '#Artist#|#Album#|#Track Number#|#Name#'
        try:
            age1 = int(table1.name().split('.')[-1])
            age2 = int(table2.name().split('.')[-1])
            table1.index(tabindex,make=True)
            table2.index(tabindex,make=True)
            if age1 < age2: oldtable = table1; newtable = table2
            else: newtable = table1; oldtable = table2
            diftable = db.copyTable(newtable,'%s-%s' % (oldtable.name(),newtable.name().split('.')[-1]))
            diftable.keepFields(dfields+[tabindex])
            diftable.addField('Status')
        except:
            self.errorLog('Cannot generate differences for %s and %s' % (table1,table2))
            return  # Nothing below can work without oldtable/newtable/diftable
        ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.printLog('#NEW','%s tracks in new iTunes export.' % rje.iStr(newtable.entryNum()))
        self.printLog('#OLD','%s tracks in old iTunes export.' % rje.iStr(oldtable.entryNum()))
        oldfiles = oldtable.datakeys()[0:]  # Copy: matched keys are removed, leaving the deleted tracks
        for entry in diftable.entries():
            ## ~ [2a] Find pair of entries ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if entry['Location'] in oldfiles: oldentry = oldtable.data(entry['Location'])
            elif entry[tabindex] in oldtable.index(tabindex):
                matches = oldtable.indexEntries(tabindex,entry[tabindex])
                oldentry = matches[0]
                if len(matches) != 1:
                    self.printLog('#DUP','Duplicate entries for %s' % entry[tabindex])
                    for ientry in matches:  # Prefer a duplicate that has not been paired up yet
                        if ientry['Location'] in oldfiles: oldentry = ientry; break
            else: oldentry = None
            ## ~ [2b] Generate Differences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if not oldentry:
                entry['Status'] = 'New'
                continue
            if oldentry['Location'] in oldfiles: oldfiles.remove(oldentry['Location'])
            changed = False
            for field in ['Plays','Skips','My Rating']:
                if entry[field] != oldentry[field]: changed = True
                try: entry[field] -= oldentry[field]
                except: pass    # Keep new value - probably empty in old entry
            if changed: entry['Status'] = 'Changed'
            else: entry['Status'] = 'Unchanged'
        ### ~ [3] Add missing old entries ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        reportdel = rje.yesNo('Report deleted %s tracks?' % diftable.name())
        for old in oldfiles:
            entry = diftable.addEntry(oldtable.data(old))
            entry['Status'] = 'Deleted'
            if reportdel: self.printLog('#DEL','%s: %s [%s]' % (entry['Artist'],entry['Name'],entry['Album']))
        ### ~ [4] Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for status in rje.sortKeys(diftable.index('Status')):
            self.printLog('#STAT','%s: %d tracks' % (status.upper(),len(diftable.index('Status')[status])))
        self.printLog('#TRACK','%s tracks in total' % rje.iStr(diftable.entryNum()))
        self.deBug('?')
        for table in [table1,table2,diftable]: table.dropField(tabindex)   # Remove the temporary index field
        diftable.saveToFile('%s.tdt' % diftable.name())
    except: self.errorLog('%s.difference() error' % self)
def badasp(out,mainlog,cmd_list,tree=None):     ### Main BADASP Method
    '''
    Main BADASP Method. Automated run if interactive < 1
    <1> Load Sequences and Tree
    <2> Define Subfamilies
    <3> GASP Ancestral Sequence Prediction
    <4> Peform Functional Specificity and Sequence Conservation Calculations
    <5> Output Results
    <6> Optional (interactive only) filtered "partial" results output
    >> out:Object with stat['Interactive'] and verbose(), used for user-facing messages
    >> mainlog:Log object used for errorLog() and handed to the objects created here
    >> cmd_list:list of commandline arguments. NB. 'group=', 'nsfin=' and 'funcspec=' may be appended to this list.
    >> tree:Tree object [None] = created from cmd_list if not given
    << returns None. Each stage reports its own errors through mainlog.errorLog(); only SystemExit is re-raised.
    '''
    try:
        ### <0> ### Setup
        _seqfile = None
        _treefile = None
        append_file = None
        basefile = None
        for cmd in cmd_list:
            if cmd.find('seqin=') == 0:
                _seqfile = cmd[len('seqin='):]
                if _seqfile[-4:-3] == '.':     # Slice (not index) so that short names cannot raise IndexError
                    _seqfile = _seqfile[:-4]
            if cmd.find('useanc=') == 0:
                _seqfile = cmd[len('useanc='):]
                if _seqfile[-8:] == '.anc.fas':
                    _seqfile = _seqfile[:-8]
            if cmd.find('nsfin=') == 0:
                _treefile = cmd[len('nsfin='):]
            if cmd.find('append=') == 0:
                append_file = cmd[len('append='):]
            if cmd.find('basefile=') == 0:
                basefile = cmd[len('basefile='):]
        if _seqfile and os.path.exists('%s.grp' % _seqfile):
            cmd_list.append('group=%s.grp' % _seqfile)
        if _seqfile and _treefile == None:
            if rje.checkForFile('%s.nwk' % _seqfile):
                _treefile = '%s.nwk' % _seqfile
            else:
                _treefile = '%s.nsf' % _seqfile
            out.verbose(0,2,'Looking for treefile %s.' % _treefile,1)
            if rje.checkForFile(_treefile):
                cmd_list.append('nsfin=%s' % _treefile)
        if tree == None:
            mainlog.verbose(0,1,'Tree: %s' % cmd_list,2)
            tree = rje_tree.Tree(log=mainlog,cmd_list=cmd_list)
            #tree._setupFromCmd()
        if tree.stat['MinFamNum'] < 2:
            tree.stat['MinFamNum'] = 2

        ### <1> ### Load Sequences and Tree
        while out.stat['Interactive'] > 0 or tree.obj['SeqList'] == None:
            tree = rje_tree.treeMenu(out,mainlog,['root=yes']+cmd_list,tree)
            if tree.obj['SeqList'] and tree.opt['Rooted']:
                break
            else:
                print('\n ** Must have loaded sequences and a rooted tree. ** \n')
                if out.stat['Interactive'] < 0 or rje.yesNo('Quit BADASP?',default='N'):
                    sys.exit()
        basename = tree.obj['SeqList'].info['Name']
        if basename[-4:] == '.fas':
            basename = basename[:-4]
        if basename[-4:] == '.anc':
            basename = basename[:-4]
        if basefile:
            basename = basefile
    except SystemExit:
        raise
    except:
        mainlog.errorLog('Major Error in badasp loading sequences and tree',True)

    try:
        ### <2> ### Define Subfamilies
        while out.stat['Interactive'] > 0 or tree.groupNum() < 2:
            tree.treeGroup(callmenu=True)
            if tree.groupNum() >= 2:
                break
            else:
                mainlog.errorLog('Must have at least two subfamilies for specificity analyses.',printerror=False)
                if out.stat['Interactive'] < 0 or rje.yesNo('Continue without specificity analyses?'):
                    # NOTE(review): 'funcspec=' is later parsed with split(',') which yields [''] rather than an
                    # empty method list - confirm that statObj()/calcScore() cope with an empty method name.
                    cmd_list.append('funcspec=')
                    break
                elif rje.yesNo('Abort BADASP?'):
                    sys.exit()
    except SystemExit:
        raise
    except:
        mainlog.errorLog('Major Error in BADASP subfamilies',True)

    try:
        ### <3> ### GASP Ancestral Sequence Prediction
        if tree.node[-1].obj['Sequence'] == None:  # No ancseq loaded
            while out.stat['Interactive'] > 0 and rje.yesNo('Use %s for output filenames?' % basename) == False:
                basename = rje.choice('FILEname (FILE.anc.fas, FILE.anc.nsf, FILE.txt)?: ', default=basename)
            mygasp = rje_ancseq.Gasp(tree=tree,ancfile=basename,cmd_list=cmd_list,log=mainlog)
            out.verbose(0,2,'%s' % mygasp.details(),1)
            if out.stat['Interactive'] > 0:
                if rje.yesNo('Use these parameters?') == False:
                    mygasp.edit()
            mygasp.gasp()
    except:
        mainlog.errorLog('Major Error in BADASP GASP',True)

    try:
        ### <4> ### Peform Functional Specificity and Sequence Conservation Calculations
        _stage = '<4> Specificity/Conservation Analyses'
        aaprop = rje_aaprop.AAPropMatrix(log=mainlog,cmd_list=cmd_list)
        query = tree.obj['SeqList'].obj['QuerySeq']
        ## <a> ## Chosen Methods
        _stage = '<4a> Specificity/Conservation Analyses - Chosen Methods'
        # Work on copies: methods may be removed below and the module-level lists must not be altered
        funcspec = rje_specificity.methodlist[0:]   # ['BAD','BADN','BADX']
        seqcon = rje_conseq.methodlist[0:]          # ['info']
        for cmd in cmd_list:
            if cmd.find('funcspec=') == 0:
                funcspec = cmd[9:].split(',')
            if cmd.find('seqcon=') == 0:
                seqcon = cmd[len('seqcon='):].split(',')
        if 'all' in funcspec:
            funcspec = rje_specificity.methodlist[0:]
        if 'all' in seqcon:
            seqcon = rje_conseq.methodlist[0:]
        for method in ['BADX','BADN','QPCon_Mean','QPCon_Abs','QPCon_Mean_All']:
            while method in funcspec and query == None:
                if rje.yesNo('Method %s needs query but none given. Drop %s from specificity methods?' % (method,method)):
                    funcspec.remove(method)
                    break
                for seq in tree.obj['SeqList'].seq:
                    if rje.yesNo('Method %s needs query but none given. Use sequence 1 (%s)?' % (method,seq.shortName()),default='N'):
                        query = seq
                        # NOTE(review): stored under 'Query' but read above from 'QuerySeq' - confirm intended key
                        tree.obj['SeqList'].obj['Query'] = seq
                        break
            while method in seqcon and query == None:
                if rje.yesNo('Method %s needs query but none given. Drop %s from conservation methods?' % (method,method)):
                    seqcon.remove(method)
                    break
                for seq in tree.obj['SeqList'].seq:
                    if rje.yesNo('Method %s needs query but none given. Use sequence 1 (%s)?' % (method,seq.shortName()),default='N'):
                        query = seq
                        tree.obj['SeqList'].obj['Query'] = seq
                        break
        qname = query
        if query:
            qname = query.info['Name']
        out.verbose(0,3,'\nQuery = %s' % qname,2)

        ## <b> ## Spec Calculations
        _stage = '<4b> Specificity Calculations'
        specmatrix = rje_specificity.FuncSpec(log=mainlog,cmd_list=cmd_list,tree=tree,aaprop=aaprop)
        specmatrix.calcScore(query=query,methods=funcspec)

        ## <c> ## Conservation Calculations
        _stage = '<4c> Specificity/Conservation Analyses - Conservation Calculations'
        conseq = rje_conseq.SeqStat(log=mainlog,cmd_list=cmd_list,tree=tree,aaprop=aaprop)
        conseq.calcScore(query=query,methods=seqcon)    ### Sends appropriate seqlist to self.calcScore()

        ## <d> ## Special Case: QPCon vs All seqs
        _stage = '<4d> Specificity/Conservation Analyses - QPCon vs All'
        qpconall = []
        #if 'QPCon_Abs_All' in seqcon and query:
        #    qpconall.append('QPCon_Abs')
        if 'QPCon_Mean_All' in seqcon and query:
            qpconall.append('QPCon_Mean')
        for qp in qpconall:     # Store the vs-all results under *_All before the family-only recalculation
            conseq.score['%s_All' % qp] = conseq.score[qp]
            if qp in conseq.alnwin:
                conseq.alnwin['%s_All' % qp] = conseq.alnwin[qp]
            if qp in conseq.qrywin:
                conseq.qrywin['%s_All' % qp] = conseq.qrywin[qp]
            if qp in conseq.rank:
                conseq.rank['%s_All' % qp] = conseq.rank[qp]
            if qp in conseq.alnrankwin:
                conseq.alnrankwin['%s_All' % qp] = conseq.alnrankwin[qp]
            if qp in conseq.qryrankwin:
                conseq.qryrankwin['%s_All' % qp] = conseq.qryrankwin[qp]

        _stage = '<4d> Specificity/Conservation Analyses - FamQP'
        famqp = []
        if 'QPCon_Mean' in seqcon:
            famqp.append('QPCon_Mean')
        if 'QPCon_Abs' in seqcon:
            famqp.append('QPCon_Abs')
        if len(famqp) > 0 and query:    #!# And subfam option?
            qseq = []
            for fam in tree.subfam:
                for node in tree._nodeClade(fam):
                    if query == node.obj['Sequence']:
                        for qnode in tree._nodeClade(fam):
                            qseq.append(qnode.obj['Sequence'])
            conseq.calcScore(query=query,seqlist=qseq,methods=famqp)    ### Sends appropriate seqlist to self.calcScore()
    except:
        mainlog.errorLog('Major Error in BADASP Specificity Analysis (%s):' % _stage,True)

    BADASP = None   # Output handle: defined up front so the error handler can always test it
    line = []       # Output row being built
    try:
        ### <5> ### Full Output Results
        _stage = '<5> Full Output'
        # This output is in a tab- or comma-delimited file for easy manipulation or viewing with other programs.
        # (1) statistics for a given residue;
        # (2) statistics for a given window size across
        #   - (a) the whole alignment, (node=None)
        #   - (b) the Query protein of interest (if given) and (node=QueryNode)
        #   - (c) the ancestral sequence of each subfamily; (node=ancnode)
        # (3) Predicted ancestral sequences at
        #   - (a) the root and
        #   - (b) the ancestor of each subfamily.
        delimit = rje.getDelimit(cmd_list)

        ## <a> ## Setup
        _stage = '<5a> Output - Setup'
        rankout = specmatrix.opt['Rank']
        #tree._regenerateSeqList(tree.obj['SeqList'],tree.node)
        root = tree.node[-1].obj['Sequence']
        #!# At some point, make sure this is the most ancient duplication!
        out.verbose(0,3,'\nBADASP Results Output (%s.badasp) ...' % basename,0)

        ## <b> ## Header
        _stage = '<5b> Output - Header'
        _header = True
        if append_file:
            if rje.checkForFile(append_file):
                _header = False
            BADASP = open(append_file, 'a')
        else:
            BADASP = open('%s.badasp' % basename, 'w')
            BADASP.write("BADASP Output: %s\n" % (time.asctime(time.localtime(time.time()))))
            BADASP.write('%s\n\n' % cmd_list)
        header = ['aln_pos','anc_aa']   # Aln Pos and AA
        alnlen = 0
        statlist = funcspec + seqcon
        _stage = '<5b-i> Output - Header Query'
        if query:
            header += ['qry_pos','qry_aa']  # Qry Pos and AA
        _stage = '<5b-ii> Output - Header Subfam'
        for f in range(len(tree.subfam)):
            header += ['fam%d_pos' % (f+1),'fam%d_aa' % (f+1)]  # Subfam Pos and AA
        for func in statlist:
            _stage = '<5b-iii> Output - Header %s' % func
            statobj = statObj(method=func,objlist=[specmatrix,conseq])
            fs = func.lower()
            alnlen = len(statobj.score[func])
            header.append(fs)   # Score
            if rankout:
                header.append('%s_rank' % fs)   # Rank
            if statobj.stat['WinSize'] > 1:
                header.append('%s_alnwin' % fs) # Full align window
                if rankout:
                    header.append('%s_alnrankwin' % fs) # Rank
                if query:
                    header.append('%s_qrywin' % fs) # Qry window
                    if rankout:
                        header.append('%s_qryrankwin' % fs) # Rank
                if func in funcspec:
                    for f in range(len(tree.subfam)):
                        header.append('%s_fam%d_win' % (fs,f+1))    # Subfam windows
                        if rankout:
                            header.append('%s_fam%d_rankwin' % (fs,f+1))    # Subfam windows
        # NOTE(review): the "if _header:" test is commented out in the original, so the header is taken to be
        # written even when appending to an existing file - confirm that this is intended.
        #if _header:
        BADASP.write('%s\n' % delimit.join(header))
        out.verbose(1,3,'%s...' % delimit.join(header),0)

        ## <c> ## Stats
        _stage = '<5c> Stats'
        qr = 0      # Qry pos
        fr = [0] * len(tree.subfam)     # List of subfam positions
        aa = ''     # Root aa
        qa = ''     # Qry aa
        fa = [''] * len(tree.subfam)    # List of subfam aas
        for r in range(alnlen):
            # <i> # Positions and aas
            _stage = '<5c-i> Output - Stats, positions & aas'
            aa = root.info['Sequence'][r]
            if query:
                qa = query.info['Sequence'][r]
                if qa != '-':
                    qr += 1
            for f in range(len(tree.subfam)):
                fa[f] = tree.subfam[f].obj['Sequence'].info['Sequence'][r]
                if fa[f] != '-':
                    fr[f] += 1
            # <ii> # Positions and AAs ii
            _stage = '<5c-ii> Output - Pos & AA ii'
            line = ['%d' % (r+1), aa]   # Aln Pos and AA
            if query:
                if qa == '-':
                    line += ['-',qa]    # Qry Pos and AA
                else:
                    line += ['%d' % qr,qa]  # Qry Pos and AA
            for f in range(len(tree.subfam)):
                if fa[f] == '-':
                    line += ['-',fa[f]] # Subfam Pos and AA
                else:
                    line += ['%d' % fr[f],fa[f]]    # Subfam Pos and AA
            # <iii> # Stats
            _stage = '<5c-iii> Output - Stats'
            for func in statlist:
                statobj = statObj(method=func,objlist=[specmatrix,conseq])
                fs = func.lower()
                line.append(str(statobj.score[func][r]))    # Score
                if rankout:
                    line.append(str(statobj.rank[func][r])) # Rank
                if statobj.stat['WinSize'] > 1:     # Same test as the header so that columns always line up
                    line.append(str(statobj.alnwin[func][r]))   # Full align window
                    if rankout:
                        line.append(str(statobj.alnrankwin[func][r]))   # Rank
                    if query:
                        line.append(str(statobj.qrywin[func][r]))   # Qry window
                        if rankout:
                            line.append(str(statobj.qryrankwin[func][r]))   # Rank
                    if func in funcspec:
                        for f in range(len(tree.subfam)):
                            line.append(str(statobj.famwin[func][tree.subfam[f]][r]))   # Subfam windows
                            if rankout:
                                line.append(str(statobj.famrankwin[func][tree.subfam[f]][r]))   # Subfam windows
            # <iv> # Writing
            _stage = '<5c-iv> Output - Writing'
            BADASP.write('%s\n' % delimit.join(line))
            line = []   # Row is on disk: nothing left for the error handler to flush
        BADASP.close()
        out.verbose(0,2,'Done!',2)
    except:
        mainlog.errorLog('Fatal Error in BADASP Full output (%s):' % _stage,True)
        # Flush any part-built row and close the file, but only if the file was actually opened
        if BADASP is not None and not BADASP.closed:
            if line:
                BADASP.write('%s\n' % delimit.join(line))
            BADASP.close()

    BADASP = None
    line = []
    try:
        ### <6> ### Partial Results Output
        _stage = '<6> Partial Output'
        ## <a> ## Setup
        _stage = '<6a> Output - Setup'
        # statlist & alnlen from above
        _part_append = False
        if out.stat['Interactive'] > 0 and rje.yesNo('Output additional, filtered results?',default='N'):
            partfile = rje.choice('Name for partial results file?:','%s.partial.badasp' % basename,confirm=True)
            if rje.checkForFile(partfile) and rje.yesNo('File %s exists. Append file without headers?' % partfile):
                _part_append = True
        else:
            return
        if rje.yesNo('Filter output columns?',default='N'):
            if rje.yesNo('Output query details (pos,aa & win)?') == False:
                query = None
            f = 1
            for fam in tree.subfam[0:]:
                if rje.yesNo('Output subfam %d (%s) details (pos,aa & win)?' % (f,fam.info['CladeName'])) == False:
                    tree.subfam.remove(fam)
                f += 1
            for func in statlist[0:]:
                if rje.yesNo('Output %s results?' % func) == False:
                    statlist.remove(func)
        alnout = [True] * alnlen    # Whether to output each alignment position
        if rje.yesNo('Filter Rows by Results VALUES?'):
            out.verbose(0,0,'Initial Defaults are minmum values. Accept intital default for no filtering of given Stat.',1)
            for stat in statlist:   ### Filter by value? ###
                statobj = statObj(method=stat,objlist=[specmatrix,conseq])
                scores = statobj.score[stat][0:]
                scores.sort()
                cutoff = rje.getFloat('Min. value for %s?:' % stat,default='%f' % scores[0],confirm=True)
                for r in range(alnlen):
                    if statobj.score[stat][r] < cutoff:
                        alnout[r] = False
        if rankout and rje.yesNo('Filter Rows by Results RANKS?'):
            out.verbose(0,0,'Ranks range from 0 (low) to 1 (high).',1)
            for stat in statlist:   ### Filter by Rank? ###
                statobj = statObj(method=stat,objlist=[specmatrix,conseq])
                cutoff = rje.getFloat('Min. rank for %s?:' % stat,default='0.0',confirm=True)
                for r in range(alnlen):
                    if statobj.rank[stat][r] < cutoff:
                        alnout[r] = False
        out.verbose(0,3,'\nBADASP Partial Results Output (%s) ...' % partfile,0)

        ## <b> ## Header
        _stage = '<6b> Partial Output - Header'
        if _part_append:
            BADASP = open(partfile, 'a')
        else:
            BADASP = open(partfile, 'w')
            BADASP.write("Partial BADASP Output: %s\n" % (time.asctime(time.localtime(time.time()))))
            BADASP.write('%s\n\n' % cmd_list)
        header = ['aln_pos','anc_aa']   # Aln Pos and AA
        _stage = '<6b-i> Partial Output - Header Query'
        if query:
            header += ['qry_pos','qry_aa']  # Qry Pos and AA
        _stage = '<6b-ii> Partial Output - Header Subfam'
        for f in range(len(tree.subfam)):
            header += ['fam%d_pos' % (f+1),'fam%d_aa' % (f+1)]  # Subfam Pos and AA
        for func in statlist:
            _stage = '<6b-iii> Partial Output - Header %s' % func
            statobj = statObj(method=func,objlist=[specmatrix,conseq])
            fs = func.lower()
            header.append(fs)   # Score
            if rankout:
                header.append('%s_rank' % fs)   # Rank
            if statobj.stat['WinSize'] > 1:
                header.append('%s_alnwin' % fs) # Full align window
                if rankout:
                    header.append('%s_alnrankwin' % fs) # Rank
                if query:
                    header.append('%s_qrywin' % fs) # Qry window
                    if rankout:
                        header.append('%s_qryrankwin' % fs) # Rank
                if func in funcspec:
                    for f in range(len(tree.subfam)):
                        header.append('%s_fam%d_win' % (fs,f+1))    # Subfam windows
                        if rankout:
                            header.append('%s_fam%d_rankwin' % (fs,f+1))    # Subfam windows
        # NOTE(review): as for the full output, the "if not _part_append:" test is commented out in the original.
        #if not _part_append:
        BADASP.write('%s\n' % delimit.join(header))
        out.verbose(1,3,'%s...' % delimit.join(header),0)

        ## <c> ## Stats
        _stage = '<6c> Stats'
        qr = 0      # Qry pos
        fr = [0] * len(tree.subfam)     # List of subfam positions
        aa = ''     # Root aa
        qa = ''     # Qry aa
        fa = [''] * len(tree.subfam)    # List of subfam aas
        for r in range(alnlen):
            # <i> # Positions and aas
            _stage = '<6c-i> Partial Output - Stats, positions & aas'
            aa = root.info['Sequence'][r]
            if query:
                qa = query.info['Sequence'][r]
                if qa != '-':
                    qr += 1
            for f in range(len(tree.subfam)):
                fa[f] = tree.subfam[f].obj['Sequence'].info['Sequence'][r]
                if fa[f] != '-':
                    fr[f] += 1
            # Only skip filtered rows AFTER the counters above have advanced, else later positions are wrong
            if alnout[r] == False:
                continue
            # <ii> # Positions and AAs ii
            _stage = '<6c-ii> Partial Output - Pos & AA ii'
            line = ['%d' % (r+1), aa]   # Aln Pos and AA
            if query:
                if qa == '-':
                    line += ['-',qa]    # Qry Pos and AA
                else:
                    line += ['%d' % qr,qa]  # Qry Pos and AA
            for f in range(len(tree.subfam)):
                if fa[f] == '-':
                    line += ['-',fa[f]] # Subfam Pos and AA
                else:
                    line += ['%d' % fr[f],fa[f]]    # Subfam Pos and AA
            # <iii> # Stats
            _stage = '<6c-iii> Partial Output - Stats'
            for func in statlist:
                statobj = statObj(method=func,objlist=[specmatrix,conseq])
                fs = func.lower()
                line.append(str(statobj.score[func][r]))    # Score
                if rankout:
                    line.append(str(statobj.rank[func][r])) # Rank
                if statobj.stat['WinSize'] > 1:     # Same test as the header so that columns always line up
                    line.append(str(statobj.alnwin[func][r]))   # Full align window
                    if rankout:
                        line.append(str(statobj.alnrankwin[func][r]))   # Rank
                    if query:
                        line.append(str(statobj.qrywin[func][r]))   # Qry window
                        if rankout:
                            line.append(str(statobj.qryrankwin[func][r]))   # Rank
                    if func in funcspec:
                        for f in range(len(tree.subfam)):
                            line.append(str(statobj.famwin[func][tree.subfam[f]][r]))   # Subfam windows
                            if rankout:
                                line.append(str(statobj.famrankwin[func][tree.subfam[f]][r]))   # Subfam windows
            # <iv> # Writing
            _stage = '<6c-iv> Partial Output - Writing'
            BADASP.write('%s\n' % delimit.join(line))
            line = []   # Row is on disk: nothing left for the error handler to flush
        BADASP.close()
        out.verbose(0,2,'Done!',2)
    except:
        mainlog.errorLog('Fatal Error in BADASP Partial output (%s):' % _stage,True)
        # Flush any part-built row and close the file, but only if the file was actually opened
        if BADASP is not None and not BADASP.closed:
            if line:
                BADASP.write('%s\n' % delimit.join(line))
            BADASP.close()
def farmHAQ(self):  ### Uses SLiMFarmer to farm out the HAQESAC runs
    '''
    Uses SLiMFarmer to farm out the HAQESAC runs.
    The batch file "<HaqDir>haqesac.bat" is handed to a SLiMFarmer object, which is run from within HaqDir. Two
    rounds of farming are performed if self.opt['MultiHAQ'] is set, else a single complete run.
    << returns True once all farming rounds have completed. On error, the working directory is reset to
       self.info['RunPath'], the problem is reported via self.errorLog() and None is returned.
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        batfile = os.path.abspath(rje.makePath('%shaqesac.bat' % self.info['HaqDir'],wholepath=True))
        self.printLog('#FARM',batfile)
        if not rje.exists(batfile): raise IOError('Cannot find %s' % batfile)
        farmcmd = ['subjobs=%s' % batfile,'farm=batch','qsub=F','i=-1','runpath=%s' % os.path.abspath(self.info['HaqDir'])]
        if self.opt['MultiHAQ']: haqfarm = ['First round','Second round']
        else: haqfarm = ['Complete run']
        ### ~ [1] Peform HAQESAC runs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for farmrun in haqfarm:
            self.printLog('#CHDIR','Changing directory for %s farming: %s' % (farmrun,self.info['HaqDir']))
            os.chdir(self.info['HaqDir'])
            farmer = slimfarmer.SLiMFarmer(self.log,self.cmd_list+farmcmd)
            farmer.slimFarm()
            os.chdir(self.info['RunPath'])     # Always return to the run directory between rounds
            self.printLog('#CHDIR','Changed directory post-farming: %s' % self.info['RunPath'])
            self.printLog('#FARM','HAQESAC %s farming complete.' % farmrun)
        return True
        #!# Add identifying and skipping of partial runs.
        # TODO: the per-sequence checks that used to follow the return above (AutoSkip of processed sequences,
        # skipping of missing/younger/one-sequence input files, waiting on pickles in Chaser mode and skipping
        # when *.png or *.anc.fas already exist) could never execute and used names not defined in this
        # method (finalrun, processed, secondrun). They have been removed and need re-implementing if wanted.
    except:
        os.chdir(self.info['RunPath'])
        self.errorLog('Major problem with MultiHAQ.farmHAQ',quitchoice=True)
def slimJimMapping(self):  ### Generate SLiMJIM PNGs for all sequences
    """
    Generate SLiMJIM PNGs for all mapped alignments.
    Sequences are loaded into a single SeqList and split into one alignment per protein: a new alignment starts
    at each sequence whose name has "Motifs" as its second word. For each alignment, *.aln.tdt and *.rel.tdt
    files are saved and R is called ("sfmap2png") to make the PNG files.
    Within each alignment, sequence [0] is treated as the motif sequence, [1] as the masked sequence and [2] as
    the query. Errors for an individual alignment are logged and counted without stopping the others.
    << returns None.
    """
    try:
        ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        mapseq = {}  # Dictionary of {dataset:[seqs]}
        scmd = ["autoload=T", "seqnr=F", "accnr=F", "replacechar=F"]
        mseq = rje_seq.SeqList(self.log, self.cmd_list + scmd)  #!# Removed ['minregion=3']+ #!#
        while mseq.seq:
            ## ~ [1a] ~ Read in all sequences for one spoke ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            pseq = [mseq.seq.pop(0)]  # Pseq = list of sequences for this protein
            while mseq.seq:
                nextname = mseq.seq[0].info["Name"]
                # The [1:2] slice means a one-word name cannot raise an IndexError
                if nextname.find("Motifs") > 0 and nextname.split()[1:2] == ["Motifs"]:
                    break  # Next protein
                pseq.append(mseq.seq.pop(0))
            ## ~ [1b] ~ Update relevant sequence dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            mapseq[pseq[0].shortName()] = pseq[0:]
        self.printLog("#ALN", "%d distinct alignments identified" % len(mapseq))
        ### ~ [2] ~ Make SLiMJIM visualisations for each protein ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ex = 0  # Number of errors
        for mapping in rje.sortKeys(mapseq):
            try:
                ## ~ [3a] ~ Rename sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                # Fetch this alignment's sequences BEFORE reading any names from them
                pseq = mapseq[mapping][0:]
                basefile = pseq[0].shortName()
                if self.interactive() > 0 and not rje.yesNo(basefile):
                    continue
                qryname = pseq[2].shortName()
                pseq[0].info["R"] = pseq[0].shortName()[len(qryname) + 1 :]
                pseq[1].info["R"] = "Masked"
                for seq in pseq[2:]:
                    seq.info["R"] = seq.info["ID"]
                ## ~ [3b] ~ Setup new SeqList, strip Query gaps, calculate RelCons ~~~~~~~~~~~~~~~~~~~~~~~ ##
                seqfile = "%s.aln.tdt" % basefile
                if os.path.exists(seqfile):
                    os.unlink(seqfile)
                rseq = rje_seq.SeqList(self.log, self.cmd_list + scmd + ["autoload=F"])
                rseq.seq = pseq
                rseq.obj["QuerySeq"] = pseq[2]
                rseq.tidyQueryGaps()
                rseq.saveR(rseq.seq, seqfile, name="R")
                rseq.seq = pseq[2:]
                relfile = "%s.rel.tdt" % basefile
                if os.path.exists(relfile):
                    os.unlink(relfile)
                rseq.relCons(relfile)
                self.deBug(rseq.obj["QuerySeq"].cmd_list)
                ## ~ [3c] ~ Call R to generate graphics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                # NOTE(review): basefile comes from a sequence name and is placed in a shell command line
                rcmd = '%s --no-restore --no-save --args "sfmap2png" "%s"' % (self.info["RPath"], basefile)
                rslimjim = "%srje.r" % self.info["Path"]
                rcmd += ' < "%s" > "%s.r.tmp.txt" 2>&1' % (rslimjim, basefile)
                self.printLog("#RSLIM", rcmd)
                problems = os.popen(rcmd).read()
                if problems:
                    self.errorLog(problems, printerror=False)
                pngx = len(glob.glob("%s*png" % basefile))
                self.printLog("#PNG", "%d PNG files made for %s" % (pngx, basefile))
                if pngx and os.path.exists("%s.r.tmp.txt" % basefile):
                    os.unlink("%s.r.tmp.txt" % basefile)  # Only keep the R output if no PNG files were made
            except:
                self.errorLog('SLiMJIM visualisation error for "%s"' % mapping)
                ex += 1
        self.printLog("#SLIMJIM", "Generation of SLiMJIMs complete. %d Problems." % ex)
    except:
        self.errorLog(rje_zen.Zen().wisdom())
def runMain():
    """
    Main run method: loads an alignment and tree, then performs GASP ancestral sequence prediction.
    <0> Basic setup of program via setupProgram()
    <1> Load sequences (seqin=FILE, default 'infile.fas') and tree (nsfin=FILE, default <basefile>.nsf)
    <2> GASP, unless indeltree=FILE is given and the root node already has a sequence; optional indel tree
    << returns None. All exceptions are reported here and not passed on to the caller.
    """
    info = None
    mainlog = None     # Defined up front so the final handlers are safe if setupProgram() itself fails
    try:
        ### <0> ### Basic Setup of Program
        [info, out, mainlog, cmd_list] = setupProgram()

        ### <1> ### Load Data
        ## <a> ## Read in Sequences
        try:
            out.verbose(1, 3, 'Loading sequences...', 0)
            seqfile = 'infile.fas'
            nsfin = None
            for cmd in cmd_list:
                if cmd.find('seqin=') == 0:
                    seqfile = cmd[len('seqin='):]
                if cmd.find('nsfin=') == 0:
                    nsfin = cmd[len('nsfin='):]
            seqs = rje_seq.SeqList(
                log=mainlog,
                cmd_list=['i=0'] + cmd_list + ['autofilter=F', 'autoload=F', 'seqin=None'])
            out.verbose(1, 3, "from %s" % seqfile, 1)
            if not seqs.loadSeqs(seqfile=seqfile, seqtype='protein', aln=True):
                # An explicit exception: a bare raise here has no active exception to re-raise
                raise IOError('Failed to load sequences from %s' % seqfile)
            seqfile = seqs.info['Name']
            basefile = rje.baseFile(seqfile)
            mainlog.printLog(
                '#SEQ', "%s protein sequences read from %s\n" % (str(seqs.seqNum()), seqfile), 1)
            mainlog.printLog(
                '#SEQ', "Alignment = %s. (%d aa)\n" % (seqs.opt['Aligned'], seqs.seq[0].seqLen()), 1)
        except:
            mainlog.errorLog("Fatal run Exception during Sequence Input\n")
            raise

        ## <b> ## Read in Tree
        try:
            if not nsfin:
                nsfin = basefile + '.nsf'
            while not os.path.exists(nsfin):
                if out.stat['Interactive'] >= 0:
                    nsfin = rje.choice(
                        text='Input tree file "%s" not found. Input filename? (Blank to exit.)' % nsfin)
                    if nsfin == '':
                        raise KeyboardInterrupt
                else:
                    mainlog.log.errorLog(
                        'File %s not found. Cannot load tree!' % nsfin,
                        printerror=False, quitchoice=True)
                    raise IOError('File %s not found. Cannot load tree!' % nsfin)
            cmd_list.append('nsfin=' + nsfin)
            out.verbose(1, 3, 'Loading tree from %s...' % nsfin, 1)
            mytree = rje_tree.Tree(log=mainlog, cmd_list=['root=yes'] + cmd_list)
            mytree.mapSeq(seqlist=seqs)
            mytree.textTree()
            if mytree.opt['ReRooted']:
                mytree.saveTree(filename='%s.nsf' % basefile)
        except KeyboardInterrupt:
            mainlog.errorLog("User terminated.\n")
            raise
        except:
            mainlog.errorLog("Fatal run Exception during Tree Input\n")
            raise

        ### <2> ### GASP
        try:
            ## <a> ## InDel Tree Setup
            indeltree = None
            for cmd in cmd_list:
                if cmd.find('indeltree=') == 0:
                    indeltree = cmd[len('indeltree='):]

            ## <b> ## GASP
            if indeltree == None or mytree.node[-1].obj['Sequence'] == None:  # Perform GASP
                out.verbose(0, 2, '', 3)
                mainlog.printLog('#SEQ', 'GASP: Gapped Ancestral Sequence Prediction', 1)
                if basefile == 'infile':
                    basefile = 'gasp'
                mygasp = rje_ancseq.Gasp(tree=mytree, ancfile='%s' % basefile, cmd_list=cmd_list, log=mainlog)
                out.verbose(0, 2, '%s' % mygasp.details(), 1)
                if out.stat['Interactive'] > 0:
                    if rje.yesNo('Use these parameters?') == False:
                        mygasp.edit()
                mygasp.gasp()
                out.verbose(0, 1, "\n\nGASP run completed OK!", 2)

            ## <c> ## InDel Tree
            if indeltree:
                mytree.indelTree(filename=indeltree)

        except KeyboardInterrupt:
            mainlog.errorLog("User terminated.\n")
            raise
        except:
            mainlog.errorLog("Fatal run Exception during GASP\n")
            raise

    ### <X> ### End
    except KeyboardInterrupt:
        if mainlog is not None:
            mainlog.errorLog("User terminated.\n")
    except:
        print("Unexpected error: %s" % (sys.exc_info()[0],))
    if mainlog is not None and info is not None:
        mainlog.printLog(
            '#LOG', "%s V:%s End: %s\n" % (info.program, info.version, time.asctime(time.localtime(time.time()))), 1)
def menu(callobj, headtext='', menulist=[], choicetext='Please select:', changecase=True, default='', jointxt=' = ', confirm=False):  ### Main Menu method
    '''
    Main Menu method.
    >> callobj:Object for which attributes are to be read and altered. Also controls interactivity and log.
    >> headtext:str [''] = Introductory text for menu system.
    >> menulist:list [] = List of menu item tuples (edit code,description,optiontype,optionkey)
        - e.g. ('0','Sequence file','info','Name') would edit callobj.info['Name'])
        - If optiontype == 'return' then menu will return the value given in optionkey
        - If optiontype == '' then description will be printed as a breaker
        - If optiontype == 'infile' then callobj.info['Name'] would be changed using rje.getFileName(mustexist=True)
        - If optiontype == 'outfile' then callobj.info['Name'] would be changed using rje.getFileName(confirm=True)
        - If optiontype == 'showtext' then optionkey should contain text to be printed with verbose
        - If optiontype == 'addcmd' then commands can be added.
        - Two-item tuples (code,description) are completed as ('return',code); three-item tuples get code as key.
    >> choicetext:str ['Please select:'] = Text to display for choice option
    >> changecase:boolean [True] = change all choices and codes to upper text
    >> default:str [''] = What to return if nothing selected.
    >> jointxt:str [' = '] = What to join code and description with when listing options.
    >> confirm:bool [False] = Whether to confirm selection.
    << returns optionkey if appropriate, else True. (True is returned straight away if menulist is empty.)
    KeyboardInterrupt and SystemExit are passed on. Other errors are logged via callobj.errorLog() if callobj is
    given, else re-raised.
    '''
    try:
        ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [0a] Fill out partial (return) tuples ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        newlist = []
        for mtuple in menulist:
            if len(mtuple) == 2:
                newlist.append(mtuple + ('return', mtuple[0]))
            elif len(mtuple) == 3:
                newlist.append(mtuple + (mtuple[0], ))
            else:
                newlist.append(mtuple)
        menulist = newlist  # Rebinds the local name only: the caller's (or default) list is never modified
        ## ~ [0b] Choice Dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        choicedict = {}
        for (code, desc, vtype, key) in menulist:
            if not vtype:
                continue    # Breaker lines cannot be selected
            if changecase:
                choicedict[code.upper()] = (vtype, key)
            else:
                choicedict[code] = (vtype, key)
        ## ~ [0c] Setup Header Text ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        maxlen = 0
        for line in headtext.split('\n'):
            if len(line) > maxlen:
                maxlen = len(line)
        headlist = ['#' * (maxlen + 10)]
        for line in headtext.split('\n'):
            while len(line) < maxlen:
                line += ' '
            headlist.append('# #> %s <# #' % line)
        headlist.append(headlist[0])
        ### ~ [1] Main Menu Loop ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        while menulist:
            ## ~ [1a] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            mtxt = '\n%s' % '\n'.join(headlist)
            while mtxt[-2:] != '\n\n':
                mtxt += '\n'
            for (code, desc, vtype, key) in menulist:
                if vtype and (code or desc):
                    if code and desc:
                        mtxt += '%s%s%s' % (code, jointxt, desc)
                    elif code:
                        mtxt += code
                    elif desc:
                        mtxt += desc
                    if vtype in ['info', 'list', 'opt', 'stat', 'int', 'str', 'bool', 'num']:
                        mtxt += ': %s' % callobj.getAttribute(vtype, key, default='#!#ERROR#!#')
                    elif vtype in ['infile', 'outfile']:
                        mtxt += ': %s' % callobj.getAttribute('info', key, default='#!#ERROR#!#')
                else:
                    mtxt += desc    # Breaker
                mtxt += '\n'
            ## ~ [1b] Give Choices ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            print(mtxt)
            while mtxt:     # Repeat until a recognised choice is made, then break to redisplay the menu
                try:
                    ## ~ Input user choice ~~~ ##
                    choice = rje.choice(choicetext, default=default, confirm=confirm)
                    if changecase:
                        choice = choice.upper()
                    ## ~ Process user choice ~ ##
                    if choice in choicedict:
                        (vtype, key) = choicedict[choice]
                        if vtype in ['str', 'info']:
                            callobj.setInfo({key: callobj._editChoice(key, callobj.getStr(key))})
                        if vtype in ['num', 'stat']:
                            callobj.setStat({key: callobj._editChoice(key, callobj.getNum(key), numeric=True)})
                        if vtype == 'int':
                            callobj.setStat({key: int(callobj._editChoice(key, callobj.getInt(key), numeric=True))})
                        if vtype in ['bool', 'opt']:
                            callobj.setOpt({key: not callobj.getBool(key)})
                        if vtype == 'list':
                            callobj.list[key] = callobj._editChoice(key, callobj.list[key]).split()
                        if vtype == 'infile':
                            callobj.setInfo({key: rje.getFileName('%s File Name?' % key, callobj.getStr(key))})
                        if vtype == 'outfile':
                            callobj.setInfo({
                                key: rje.getFileName('%s File Name?' % key, callobj.getStr(key),
                                                     mustexist=False, confirm=True)
                            })
                        if vtype == 'showtext':
                            callobj.verbose(-1, -1, key)
                            break
                        if vtype == 'addcmd':
                            prevcmd = callobj.cmd_list
                            # callobj is used for output: there is no "out" object defined in this function.
                            # NOTE(review): the second argument is the menu choice, as in the original - confirm
                            # whether rje.inputCmds() should be given the existing command list instead.
                            callobj.cmd_list = rje.inputCmds(callobj, choice)
                            callobj.printLog('#CMD', 'User Added commands: %s' % callobj.cmd_list)
                            callobj._cmdList()     # Process the new commands only, then restore the full list
                            callobj.cmd_list = prevcmd + callobj.cmd_list
                            break
                        if vtype in ['info', 'list', 'opt', 'stat', 'infile', 'outfile', 'str', 'bool', 'int', 'num']:
                            callobj.printLog('#%s' % vtype.upper(), 'User edited %s parameter' % key)
                            break
                        elif vtype == 'return':
                            return key
                    print('Choice "%s" not recognised!\n' % choice)
                except KeyboardInterrupt:
                    if rje.yesNo('Terminate program?'):
                        raise SystemExit
                    if rje.yesNo('Exit menu and proceed?'):
                        return default
                except:
                    raise
        ### End ###
        return True
    except KeyboardInterrupt:
        raise
    except SystemExit:
        raise
    except:
        if callobj:
            callobj.errorLog('Major disaster in rje_menu.menu()', quitchoice=True)
        else:
            raise
def readHMMPFamSearch(self,resfile=None,readaln=False):     ### Reads HMM PFam Search Results into objects
    '''
    Reads hmmpfam search results into objects. Results arrive per query sequence, so one Search object is made per
    PFam domain (via self._addSearch()) and each domain line adds a Hit (named after the query) with one Aln holding
    SbjStart, SbjEnd, Expect and BitScore. If self.opt['CleanRes'] is set, a reduced copy containing only the file
    header and the queries that had domain hits is written to "<resfile>.partial" and then replaces resfile.
    >> resfile:str = Results file to parse. Must exist, else an error is logged and False is returned.
    >> readaln:boolean = whether to bother reading Alignments into objects [False] !!! Currently ignored !!!
    << True if the file was parsed, False if it was missing, not an hmmpfam file, or an error occurred.
    '''
    try:
        ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not resfile or not os.path.exists(resfile):
            self.log.errorLog('Results file "%s" missing!' % resfile,printerror=False)
            return False
        ## ~ [0a] Patterns (built once rather than for every line read) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        re_query = r'^Query sequence:\s+(\S+)'
        #Model           Domain  seq-f seq-t    hmm-f hmm-t      score  E-value
        #Lep_receptor_Ig   1/1      24   114 ..     1   103 []   158.4  1.7e-44
        re_domain = r'\s+'.join([r'^(\S+)',r'\S+',r'(\d+)',r'(\d+)\D.+',r'(\S+)',r'(\S+)\s*$'])
        # Footer written by a previous CleanRes pass. (Previously '(%d)', which never matched a real footer.)
        re_footer = r'End of rje_hmm reduced results file: (\d+) sequences in original'
        ## ~ [0b] Search dictionary as results come back per sequence, not per HMM! ~~~~~~~~~~~~~~~~~~~~~~~~ ##
        pfam = {}       # Dictionary of {PFam name:search}
        hitx = 0        # Total number of hits
        hx = 0          # Number of query sequences read from file
        seqx = 0        # Number of sequences in original (unreduced) results, if recorded in file footer
        hitname = None  # Name of the query sequence currently being read
        ## ~ [0c] Check file type ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        with open(resfile,'r') as CHECK: firstline = CHECK.readline()
        if firstline.find('hmmpfam') != 0:
            self.errorLog('File "%s" does not appear to be an hmmpfam results file' % resfile,printerror=False)
            if rje.yesNo('Delete incorrect results file? (Check that hmmpfam=T is right!)',default='N'):
                os.unlink(resfile)
                self.printLog('#DEL','Dodgy results file "%s" deleted.' % resfile)
            return False
        ## ~ [0d] Reduced results file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        newresfile = '%s.partial' % resfile
        if os.path.exists(newresfile): os.unlink(newresfile)

        def appendPartial(lines):   # Appends lines to the reduced results file, closing the handle each time
            with open(newresfile,'a') as PARTIAL: PARTIAL.write('\n'.join(lines))

        ### ~ [1] Read in Search results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        with open(resfile,'r') as RESFILE:  # Closed before resfile is deleted/replaced below
            line = RESFILE.readline()
            newres = [rje.chomp(line)]      # Lines of the block currently being read
            newresout = True                # Whether current block should be output to reduced file
            while line:
                self.progLog('\r#RES','Reading %s: %s Seqs; %s Domains; %s Hits' % (resfile,rje.integerString(hx),rje.integerString(len(pfam)),rje.integerString(hitx)))
                line = rje.chomp(line)
                newquery = rje.matchExp(re_query,line)
                ## ~ [1a] New Sequence ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if newquery:
                    if newres and newresout and self.opt['CleanRes']: appendPartial(newres)
                    newres = ['',line]; newresout = False   # Only output if a domain hit is found
                    hitname = newquery[0]; hx += 1
                ## ~ [1b] One Line Data for hits ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                elif line.find('Parsed for domains:') == 0:
                    # Skip the two table header lines, then read domain lines until one fails to match
                    newres += [line,rje.chomp(RESFILE.readline()),rje.chomp(RESFILE.readline())]
                    line = rje.chomp(RESFILE.readline()); newres.append(line)
                    domhit = rje.matchExp(re_domain,line)
                    while domhit:
                        newresout = True
                        (dom,start,end,score,evalue) = domhit
                        if dom not in pfam:
                            pfam[dom] = self._addSearch()
                            pfam[dom].info['Name'] = dom
                        hit = pfam[dom]._addHit()
                        hit.info['Name'] = hitname
                        aln = hit._addAln()
                        aln.setStat({'SbjStart':int(start),'SbjEnd':int(end),'Expect':float(evalue),'BitScore':float(score)})
                        hitx += 1
                        self.progLog('\r#RES','Reading %s: %s Seqs; %s Domains; %s Hits' % (resfile,rje.integerString(hx),rje.integerString(len(pfam)),rje.integerString(hitx)))
                        line = rje.chomp(RESFILE.readline()); newres.append(line)
                        domhit = rje.matchExp(re_domain,line)
                ## ~ [1c] End of Protein ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                elif line[:2] == '//': hitname = None; newres.append(line)
                ## ~ [1d] Footer of previously reduced file: recover original sequence count ~~~~~~~~~~~~~~ ##
                elif rje.matchExp(re_footer,line): seqx = int(rje.matchExp(re_footer,line)[0])
                elif newres: newres.append(line)
                line = RESFILE.readline()
        ### ~ [2] Finish and (optionally) replace results file with reduced version ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if newres and newresout and self.opt['CleanRes']: appendPartial(newres)
        if not seqx: seqx = hx
        if self.opt['CleanRes']:
            appendPartial(['','End of rje_hmm reduced results file: %d sequences in original' % seqx])
            os.unlink(resfile)
            os.rename(newresfile,resfile)
            self.printLog('\r#RED','Results file %s replaced with reduced version (%s Hits only)' % (resfile,rje.integerString(hitx)))
        self.printLog('\r#RES','Reading %s complete: %s Seqs; %s Domains; %s Hits' % (resfile,rje.integerString(seqx),rje.integerString(len(pfam)),rje.integerString(hitx)))
        return True
    except:
        self.log.errorLog('Calamity during readHMMSearch(%s)' % (resfile))
        return False
def readPELM(self):     ### Reads phosphoELM into classes. Extracts UniProt data if available for Species etc.
    '''
    Reads the phosphoELM table (self.info['PELM']) into self.dict['PhosphoSites'] as {acc:{pos:{'aa':code}}}, logging
    a warning for any inconsistent sequence or residue. One sequence per accession is then added to a new SeqList,
    named from the matching UniProt entry where one is found, optionally filtered and saved to self.info['PELMFas'].
    The SeqList and UniProt objects are stored in self.obj['SeqList'] and self.obj['UniProt'].
    '''
    try:
        ### ~ [1] Read phosphoELM file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        pelm = rje.dataDict(self,self.info['PELM'],mainkeys=['acc','position'])
        accseq = {}     # {acc:sequence string}, later {acc:sequence object}
        ### ~ [2] Populate PhosphoSites dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        sites = self.dict['PhosphoSites']
        for sitekey in pelm:
            ## ~ [2a] Accession and position come from the two-part data key ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            (acc,pos) = sitekey.split()
            pos = int(pos)
            if acc not in sites: sites[acc] = {}
            if pos not in sites[acc]: sites[acc][pos] = {}
            site = sites[acc][pos]
            entry = pelm[sitekey]
            ## ~ [2b] Sequence must agree between all entries for an accession ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            sequence = entry['sequence']
            if acc in accseq:
                if accseq[acc] != sequence:
                    self.log.printLog('#ERR','Warning. Sequence mismatch for %s' % acc)
            else: accseq[acc] = sequence
            ## ~ [2c] Residue must agree with earlier entries and with the sequence itself ~~~~~~~~~~~~~~~~ ##
            code = entry['code']
            if 'aa' in site:
                if site['aa'] != code:
                    self.log.printLog('#ERR','Warning. PhosphoSite mismatch for %s at pos %d: %s not %s' % (acc,pos,code,site['aa']))
            else: site['aa'] = code
            seqaa = accseq[acc][(pos-1):pos]
            if code != seqaa:
                self.log.printLog('#ERR','Warning. PhosphoSeq mismatch for %s at pos %d: %s not %s' % (acc,pos,code,seqaa))

        ### ~ [3] Make sequence objects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [3a] Setup objects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        acclist = rje.sortKeys(accseq)
        pelmuni = rje_uniprot.UniProt(self.log,self.cmd_list)
        unidict = pelmuni.accDict(acclist)      # {acc:UniProtEntry}
        pelmseq = rje_seq.SeqList(self.log,self.cmd_list+['seqin=None'])
        ## ~ [3b] One sequence per accession; accseq values become sequence objects ~~~~~~~~~~~~~~~~~~~~~~~ ##
        #!# Look out for splice variants! (There are some!) - Copy UniProt and change sequence & AccNum #!#
        for acc in acclist:
            sequence = accseq[acc]
            try:
                # Splice variant accessions (acc-N) are looked up under the parent accession
                uniseq = unidict[acc.split('-')[0]].obj['Sequence']
                desc = uniseq.info['Description']
                name = '%s__%s %s' % (uniseq.info['ID'],acc,desc)
                if sequence != uniseq.info['Sequence']:
                    self.log.printLog('#WARNING','Sequence mismatch for UniProt entry %s' % acc)
            except:
                # Any failure (e.g. no UniProt entry) is logged and a placeholder name used instead
                self.log.errorLog('Problem with %s' % acc)
                name = '%s_UNK__%s' % (acc,acc)
            accseq[acc] = pelmseq._addSeq(name,sequence)
        ## ~ [3c] Drop accessions whose sequence was filtered out ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if self.opt['FilterSeq']:
            pelmseq.autoFilter()
            for acc in acclist:
                if accseq[acc] not in pelmseq.seq: accseq.pop(acc)
            acclist = rje.sortKeys(accseq)
        ## ~ [3d] Save sequences for BLASTing (asking before overwriting if interactive) ~~~~~~~~~~~~~~~~~~ ##
        fasfile = self.info['PELMFas']
        if not os.path.exists(fasfile) or self.stat['Interactive'] < 0 or rje.yesNo('%s exists: overwrite?' % fasfile):
            pelmseq.saveFasta(seqfile=fasfile)
        self.obj['SeqList'] = pelmseq
        self.obj['UniProt'] = pelmuni
    except: self.log.errorLog('Problem during PhosphoSeq.readPELM')
def pairwiseAQ(self,seqlist=None,query=None,focus=[0,0]):     ### Performs PAQ on seqlist, adding seq.info['PAQ']
    '''
    Performs Pairwise Alignment Quality (PAQ) assessment on seqlist, adding seq.info['PAQ'].
    Residues are first linked into well-aligned pairwise blocks, then chained back to the query. Residues that cannot
    be linked are replaced by X, and the worst sequences are then either removed or masked until the number of good
    residues and sequences stops changing. Finally the X-masked sequence is stored in seq.info['PAQ'] and the input
    sequence restored to seq.info['Sequence'].
    >> seqlist:rje_seq.SeqList Object
    - NB. This object will itself have sequences removed from it, so beware!
    - A new info key will be added: PAQ = PAQ sequences with alignment Xs
    >> query:Sequence Object = query to link residues to. Defaults to seqlist.obj['QuerySeq']. A random sequence is
      used if there is no query or self.opt['NoQuery'] is set.
    >> focus:list of range positions [X:Y] to look at. If Y=0 then [X:]. (Passed on to seqlist.mapX(); not altered here.)
    '''
    _stage = '<0> Setup'    # Set before try so that the error handler can always report a stage
    try:
        ### <PAQ0> ### Setup
        haqlist = seqlist   # SeqList Object to store individually Xd sequences
        if not query: query = haqlist.obj['QuerySeq']
        if self.opt['NoQuery'] or not query:
            query = haqlist.seq[random.randint(0,haqlist.seqNum()-1)]
            self.log.printLog('#QRY','Temp (random) query %s assigned for PAQ' % query.shortName())
        pwaq = {}           # Dictionary of {(seq,otherseq):[whether each residue of seq is in a good pairwise block]}
        block_align = {}    # Dictionary of {seq:[whether each residue is in a block that is well-aligned (to query)]}
        for seq in haqlist.seq:
            block_align[seq] = [False] * seq.seqLen()
            seq.info['PAQ'] = seq.info['Sequence'][0:]  # Back-up of input sequence: swapped back in <PAQ5>
            if 'SAQX' in seq.info and len(seq.info['SAQX']) == seq.seqLen():
                seq.info['Sequence'] = seq.info['SAQX'][0:]
            elif 'SAQX' in seq.info:
                self.log.errorLog('Cannot use SAQX for %s in PAQ as wrong length.' % seq.shortName(),printerror=False)
            for otherseq in haqlist.seq:
                pwaq[(seq,otherseq)] = [False] * seq.seqLen()

        ### <PAQ1> ### Directional Pairwise Comparisons of sequences
        _stage = '<1> Pairwise Comparisons'
        infotxt = 'PAQ%d: Pairwise Comparisons ...' % self.stat['PAQCyc']
        seqnum = haqlist.seqNum()   # No sequences are removed during this stage
        for (sx,seq) in enumerate(haqlist.seq):
            for (ox,otherseq) in enumerate(haqlist.seq):
                myinfo = '%s %.1f%% %.1f%% ' % (infotxt,(100.0 * sx / seqnum),(100.0 * ox / seqnum))
                self.log.printLog('\r#PAQ',myinfo,log=False,newline=False)
                for r in range(seq.seqLen()):
                    ar = seq.info['Sequence'][r]
                    ## <i> ## Look for PW aligned block
                    _stage = '<1-i> Pairwise Comparisons'
                    if ar not in ['-','X']:     # Start of test block
                        blen = 0    # Block length (PAQBlock) = AAs
                        win = 0     # Window length = all sequence
                        matchx = 0  # Score for residues in window
                        while blen < self.stat['PAQBlock'] and (r+win) < seq.seqLen():
                            ar = seq.info['Sequence'][r+win]
                            at = otherseq.info['Sequence'][r+win]
                            if 'X' in [ar,at]: break    # Hit Bad Region: Abort
                            # NOTE(review): nesting rebuilt from whitespace-collapsed source; confirm that gapped
                            # columns are meant to add nothing to matchx.
                            if ar != '-':
                                blen += 1   # Increase Block
                                matchx += self._saqCon(ar,at)
                            win += 1
                        ## <ii> ## Update pwaq if block good
                        _stage = '<1-ii> Pairwise Comparisons'
                        if matchx >= self.stat['PAQMatch']:
                            for w in range(win):
                                pwaq[(seq,otherseq)][r+w] = seq.info['Sequence'][r+w] not in ['-','X']
        # '%' must be doubled: the old unescaped '100.0% 100.0%.' was parsed as a width-100 conversion
        self.log.printLog('\r#PAQ','%s 100.0%% 100.0%%. ' % infotxt,log=False)

        ### <PAQ2> ### Link back to Query
        ### <PAQ2a> ### Network of Pairwise Quality alignments
        _stage = '<2a> Linking to Query'
        infotxt = 'PAQ%d: Linking Residues to Query (%s) ...' % (self.stat['PAQCyc'],query.shortName())
        for r in range(query.seqLen()):
            _stage = '<2a> Linking to Query'
            self.log.printLog('\r#PAQ','%s %.1f%%' % (infotxt,(100.0 * r / query.seqLen())),log=False,newline=False)
            qok = {}    # Dictionary of whether residue in seq OK, i.e. linked to query
            for seq in haqlist.seq: qok[seq] = False
            qok[query] = True
            sok = [0,1]     # Number of OK sequences for residue after each round: stop when it no longer grows
            while sok[-2] != sok[-1]:
                ## <i> ## Match pairs, starting with query
                _stage = '<2a-i> Linking to Query'
                for seq in haqlist.seq:
                    if qok[seq]:
                        for otherseq in haqlist.seq:
                            if pwaq[(seq,otherseq)][r] or pwaq[(otherseq,seq)][r]: qok[otherseq] = True
                ## <ii> ## Update sok
                _stage = '<2a-ii> Linking to Query'
                sok.append(0)
                for seq in haqlist.seq:
                    if qok[seq]:
                        sok[-1] += 1
                        block_align[seq][r] = True
            _stage = '<2a-iii> Linking to Query'
            if sok[-1] == 1: block_align[query][r] = False  # Only query OK!
        self.log.printLog('\r#PAQ','%s 100.0%%' % infotxt,log=False)

        ### <PAQ2b> ### Allow for divergence (Conserved Anchors)
        _stage = '<2b> Anchors'
        if self.opt['Anchors']:
            infotxt = 'PAQ%d: Accounting for divergence within aligned regions ...' % self.stat['PAQCyc']
            seqnum = haqlist.seqNum()
            ## <i> ## Setup gapped list
            gapped = [False] * query.seqLen()   # Whether column of alignment is gapped (internal gaps only)
            for (sx,seq) in enumerate(haqlist.seq):
                self.log.printLog('\r#PAQ','%s %.1f%% ' % (infotxt,(50.0 * sx / seqnum)),log=False,newline=False)
                (start,end) = (0,seq.seqLen())
                # Trim terminal gaps. Bounds checks stop an all-gap sequence running off the end of the string.
                while start < end and seq.info['Sequence'][start] == '-': start += 1
                while end > start and seq.info['Sequence'][end-1] == '-': end -= 1
                for r in range(start,end):
                    if seq.info['Sequence'][r] == '-': gapped[r] = True
            ## <ii> ## Correction
            for (sx,seq) in enumerate(haqlist.seq):
                self.log.printLog('\r#PAQ','%s %.1f%% ' % (infotxt,(50 + (50.0 * sx / seqnum))),log=False,newline=False)
                for r in range(seq.seqLen()):
                    if block_align[seq][r] or gapped[r]: continue   # No need for correction
                    # Move in both directions: if good residues (or sequence end) reached before gaps then reinstate
                    # NOTE(review): fok and bok are never set to True anywhere in this loop, so the reinstatement
                    # below cannot currently happen. Left unchanged as the intended rule needs confirming.
                    winf = 0
                    fwd = True
                    fok = False
                    winb = 0
                    bwd = True
                    bok = False
                    while fwd or bwd:
                        # End of seqs
                        if (r + winf) >= seq.seqLen(): fwd = False
                        if (r - winb) < 0: bwd = False
                        # Gaps/OK
                        if fwd:
                            if gapped[r+winf]:
                                fok = False
                                fwd = False
                            elif block_align[seq][r+winf]: fwd = False
                            else: winf += 1
                        if bwd:
                            if gapped[r-winb]:
                                bok = False
                                bwd = False
                            elif block_align[seq][r-winb]: bwd = False
                            else: winb += 1
                    if fok and bok:     # Reinstate
                        for w in range(r-winb,r+winf+1): block_align[seq][w] = True
            self.log.printLog('\r#PAQ','%s 100.0%% ' % infotxt,log=False)

        ### <PAQ3> ### X out badly-aligned blocks
        _stage = '<3> Making bad sequence blocks'
        for seq in haqlist.seq:
            newseq = ''
            for r in range(seq.seqLen()):
                if block_align[seq][r] or seq.info['Sequence'][r] == '-': newseq += seq.info['Sequence'][r]
                else: newseq += 'X'     # Bad residue
            seq.info['Sequence'] = newseq[0:]
        #!# Add saving of data in 'datafull' option

        ### <PAQ4> ### Remove sequences and/or badly-aligned regions
        _stage = '<4> Removing sequences/regions'
        self.verbose(0,4,'PAQ%d: Removing bad sequences and/or dodgy regions...' % self.stat['PAQCyc'],0)
        ## <PAQ4a> ## Process Query first - only interested in good regions within query
        if self.opt['NoQuery']:     # No preprocessing of Query
            self.verbose(0,4,'no Master Query processing...',0)
        else:
            haqlist.mapX(query, qtrim=True, focus=focus)    # Replaces other sequence ends and query X columns with Xs
            self.verbose(0,4,'Query (%s) processed...' % query.shortName(),0)
        self.verbose(0,3,'',1)
        if self.opt['ManPAQ']: haqlist.saveFasta(seqfile='%s.manpaq.fas' % haqlist.info['Basefile'])

        ## <PAQ4b> ## Cycle through other sequences (worst first) until no more good residues are lost
        goodres = [0, self._getGood(haqlist.seq)]   # List of number of 'good' residues
        goodseq = [0, haqlist.seqNum()]
        while goodres[-1] != goodres[-2] or goodseq[-1] != goodseq[-2]:
            colgood = [0] * haqlist.seq[0].seqLen()     # Good residues per column
            for r in range(haqlist.seq[0].seqLen()):
                for seq in haqlist.seq:
                    if seq.info['Sequence'][r] != '-' and seq.info['Sequence'][r] != 'X': colgood[r] += 1
            ## <i> ## Compare relative loss of masking and losing each sequence
            keepx = {}  # Dictionary of seq:number of lost residues if seq kept
            losex = {}  # Dictionary of seq:number of lost residues if seq lost
            badkx = -1  # Biggest loss if kept
            badlx = -1  # Biggest loss if lost
            bads = None # Worst sequence
            for seq in haqlist.seq:
                if seq == query and self.opt['NoQuery'] == False: continue  # Query is never removed
                # Calculate keepx and losex
                keepx[seq] = 0
                for r in range(seq.seqLen()):
                    if seq.info['Sequence'][r] == 'X': keepx[seq] += colgood[r]
                #?# In Perl HAQESAC there was an option to ignore Orphans in this calculation. Reinstate?
                losex[seq] = self._getGood([seq])
                # Update bads if worse
                if keepx[seq] > badkx:
                    badkx = keepx[seq]
                    badlx = losex[seq]
                    bads = seq
                elif keepx[seq] == badkx and losex[seq] < badlx:
                    badlx = losex[seq]
                    bads = seq
            ## <ii> ## Remove bad sequences and/or regions
            if badkx > 0:
                losebest = badkx * self.stat['PAQKeepLen'] > badlx * self.stat['PAQKeepSeq']    # Lose sequence!
                if self.opt['ManPAQ']:
                    default = 'N'
                    if losebest: default = 'Y'
                    if rje.yesNo('%s worst: -%s aa if kept vs -%s aa if lost. Remove?' % (bads.shortName(),rje.integerString(badkx),rje.integerString(badlx)),default):
                        seqlist.removeSeq(text='PAQ%d: -%s aa if kept vs -%s aa if lost. (Manual decision.)' % (self.stat['PAQCyc'],rje.integerString(badkx),rje.integerString(badlx)),seq=bads)
                    else: haqlist.mapX(bads)    # X out
                else:
                    self.verbose(1,3,'%s worst: -%s aa if kept vs -%s aa if lost.' % (bads.shortName(),rje.integerString(badkx),rje.integerString(badlx)),1)
                    #!# Add option for upweighting certain sequence type? (e.g. vs fragment or hypothetical?)
                    if losebest:
                        seqlist.removeSeq(text='PAQ%d: -%s aa if kept vs -%s aa if lost.' % (self.stat['PAQCyc'],rje.integerString(badkx),rje.integerString(badlx)),seq=bads)
                    else: haqlist.mapX(bads)    # X out
            ### <iii> ### Recalculate goodres
            goodres.append(self._getGood(haqlist.seq))
            goodseq.append(haqlist.seqNum())
            self.verbose(1,3,'%d -> %d "good" aa' % (goodres[-2],goodres[-1]),1)

        ### <PAQ5> ### Reinstate UnX'd sequence:
        _stage = '<5> Replacing sequences'
        for seq in haqlist.seq:
            [seq.info['PAQ'],seq.info['Sequence']] = [seq.info['Sequence'],seq.info['PAQ']]
        if self.opt['ManPAQ'] and rje.checkForFile('%s.manpaq.fas' % haqlist.info['Basefile']):
            os.unlink('%s.manpaq.fas' % haqlist.info['Basefile'])
    except:
        self.log.errorLog('rje_haq.py ~ Problem with pairwiseAQ %s.' % _stage, True)
def farmHAQ(self):      ### Uses SLiMFarmer to farm out the HAQESAC runs
    '''
    Uses SLiMFarmer to farm out the HAQESAC runs. The batch file "haqesac.bat" in self.info['HaqDir'] is given to
    SLiMFarmer as its subjobs, once if self.opt['MultiHAQ'] is not set and twice (first and second round) if it is.
    Each round is run from HaqDir, returning to self.info['RunPath'] afterwards.
    << True if all farming rounds completed. Errors are logged (with option to quit) and return None.
    '''
    try:
        ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        batfile = os.path.abspath(rje.makePath('%shaqesac.bat' % self.info['HaqDir'], wholepath=True))
        self.printLog('#FARM', batfile)
        if not rje.exists(batfile): raise IOError('Cannot find %s' % batfile)
        farmcmd = ['subjobs=%s' % batfile, 'farm=batch', 'qsub=F', 'i=-1',
                   'runpath=%s' % os.path.abspath(self.info['HaqDir'])]
        if self.opt['MultiHAQ']: haqfarm = ['First round', 'Second round']
        else: haqfarm = ['Complete run']
        ### ~ [1] Peform HAQESAC runs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for farmrun in haqfarm:
            self.printLog('#CHDIR', 'Changing directory for %s farming: %s' % (farmrun, self.info['HaqDir']))
            os.chdir(self.info['HaqDir'])
            farmer = slimfarmer.SLiMFarmer(self.log, self.cmd_list + farmcmd)
            farmer.slimFarm()
            os.chdir(self.info['RunPath'])
            self.printLog('#CHDIR', 'Changed directory post-farming: %s' % self.info['RunPath'])
            self.printLog('#FARM', 'HAQESAC %s farming complete.' % farmrun)
        #!# Add identifying and skipping of partial runs.
        # (An unreachable per-sequence skip/run loop used to follow the return. It used names that are not defined
        #  in this method, so it has been removed rather than revived.)
        return True
    except:
        os.chdir(self.info['RunPath'])  # Make sure a failed round does not leave the run stranded in HaqDir
        self.errorLog('Major problem with MultiHAQ.farmHAQ', quitchoice=True)
def readPELM(self):  ### Reads phosphoELM into classes. Extracts UniProt data if available for Species etc.
    '''
    Reads phosphoELM data (self.info['PELM']) and fills self.dict['PhosphoSites'] with {acc:{pos:{'aa':code}}},
    warning in the log about any sequence or residue disagreements. A SeqList with one sequence per accession is
    then built (named from UniProt where the entry can be found), optionally filtered, and saved as fasta to
    self.info['PELMFas']. The SeqList and UniProt objects end up in self.obj['SeqList'] and self.obj['UniProt'].
    '''
    try:
        ### ~ [1] Setup & Read File into Data Dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        table = rje.dataDict(self, self.info['PELM'], mainkeys=['acc', 'position'])
        seqs = {}  # Dictionary of Acc:Sequence (string at first, sequence object after [3b])
        ### ~ [2] Generate PhosphoSites dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        phos = self.dict['PhosphoSites']
        for key in table:
            ## ~ [2a] Basic acc, seq and pos ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            (acc, pos) = key.split()
            pos = int(pos)
            if acc not in phos:
                phos[acc] = {}
            if pos not in phos[acc]:
                phos[acc][pos] = {}
            row = table[key]
            ## ~ [2b] PhosphoELM data with checks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            rowseq = row['sequence']
            if acc not in seqs:
                seqs[acc] = rowseq
            elif seqs[acc] != rowseq:
                self.log.printLog('#ERR', 'Warning. Sequence mismatch for %s' % acc)
            aa = row['code']
            if 'aa' not in phos[acc][pos]:
                phos[acc][pos]['aa'] = aa
            elif phos[acc][pos]['aa'] != aa:
                mismatch = (acc, pos, aa, phos[acc][pos]['aa'])
                self.log.printLog('#ERR', 'Warning. PhosphoSite mismatch for %s at pos %d: %s not %s' % mismatch)
            # The annotated residue should be the one found at that (1-based) position of the sequence
            if aa != seqs[acc][(pos - 1):pos]:
                mismatch = (acc, pos, aa, seqs[acc][pos - 1:pos])
                self.log.printLog('#ERR', 'Warning. PhosphoSeq mismatch for %s at pos %d: %s not %s' % mismatch)

        ### ~ [3] Make sequence objects and update PhosphoSites keys ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [3a] Setup objects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        acclist = rje.sortKeys(seqs)
        pelmuni = rje_uniprot.UniProt(self.log, self.cmd_list)  # UniProt object
        unidict = pelmuni.accDict(acclist)  # Dictionary of {acc:UniProtEntry}
        pelmseq = rje_seq.SeqList(self.log, self.cmd_list + ['seqin=None'])  # SeqList object
        ## ~ [3b] Add one sequence for each AccNum and update seqs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        #!# Look out for splice variants! (There are some!) - Copy UniProt and change sequence & AccNum #!#
        for acc in acclist:
            sequence = seqs[acc]
            try:
                uni = unidict[acc.split('-')[0]]  # Isoform suffix is dropped for the UniProt lookup
                uinfo = uni.obj['Sequence'].info
                desc = uinfo['Description']
                name = '%s__%s %s' % (uinfo['ID'], acc, desc)
                if sequence != uinfo['Sequence']:
                    self.log.printLog('#WARNING', 'Sequence mismatch for UniProt entry %s' % acc)
            except:
                self.log.errorLog('Problem with %s' % acc)
                name = '%s_UNK__%s' % (acc, acc)  #!# Add sequences where UniProt missing #!#
            seqs[acc] = pelmseq._addSeq(name, sequence)
        ## ~ [3c] Filtering of sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if self.opt['FilterSeq']:
            pelmseq.autoFilter()
            for acc in acclist:
                if seqs[acc] not in pelmseq.seq:
                    seqs.pop(acc)
            acclist = rje.sortKeys(seqs)
        ## ~ [3d] Save sequences for BLASTing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # Save if the file is new or the run is non-interactive; otherwise ask before overwriting
        savefas = not os.path.exists(self.info['PELMFas']) or self.stat['Interactive'] < 0
        if savefas or rje.yesNo('%s exists: overwrite?' % self.info['PELMFas']):
            pelmseq.saveFasta(seqfile=self.info['PELMFas'])
        self.obj['SeqList'] = pelmseq
        self.obj['UniProt'] = pelmuni
    except:
        self.log.errorLog('Problem during PhosphoSeq.readPELM')
def multiHAQ(self,secondrun=False):     ### Executes main HAQESAC runs
    '''
    Executes main HAQESAC runs, one per sequence returned by self.seqs(), using the <acc>.fas files in
    self.info['HaqDir']. Sequences are skipped if the input file is missing or has fewer than two sequences, if a
    pickle younger than the input exists (non-final rounds without Force), or if a previous run left <acc>.png or
    <acc>.anc.fas (without force). If this was not the final round, the method calls itself with secondrun=True.
    >> secondrun:bool [False] = Whether this is the second round. The round is final when this equals
      self.opt['MultiHAQ'].
    '''
    try:
        ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        finalrun = secondrun == self.opt['MultiHAQ']    # Whether this is the manual HAQESAC phase
        qryacc = self.obj['SeqList'].accList()          # Full list of Query accession numbers
        processed = []                                  # List of processed sequence accession numbers
        ### ~ [1] Peform HAQESAC runs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for seq in self.seqs():
            ## ~ [1a] Check AutoSkip ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            acc = seq.info['AccNum']
            if finalrun and acc in processed and (self.opt['AutoSkip'] or (self.i() >=0 and rje.yesNo('%s already covered by previous HAQESAC. Skip?' % seq.shortName()))):
                self.printLog('#SKIP','%s already covered by previous HAQESAC: Skipped' % seq.shortName()); continue
            ## ~ [1b] Check Whether to run (re-runs and low sequence number) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            infile = rje.makePath('%s%s.fas' % (self.info['HaqDir'],acc),wholepath=True)
            pkfile = rje.makePath('%s%s.pickle' % (self.info['HaqDir'],acc),wholepath=True)
            pkzfile = rje.makePath('%s%s.pickle.gz' % (self.info['HaqDir'],acc),wholepath=True)
            if not os.path.exists(infile):
                self.printLog('#SKIP','%s input file %s not found: Skipped' % (seq.shortName(),infile)); continue
            if not finalrun and not self.opt['Force'] and rje.isYounger(pkzfile,infile) == pkzfile:
                self.printLog('#SKIP','%s run detected: Skipped' % seq.shortName()); continue
            if not finalrun and not self.opt['Force'] and rje.isYounger(pkfile,infile) == pkfile:
                self.printLog('#SKIP','%s run detected: Skipped' % seq.shortName()); continue
            inseqx = rje_seq.SeqCount(self,infile)
            if inseqx < 2:
                self.printLog('#SKIP','Only one sequence found in %s: Skipped' % (infile)); continue
            ## ~ [1c] Pause if running in Chaser Mode and no Pickle ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            pickled = os.path.exists(pkfile) or os.path.exists('%s.gz' % pkfile); tm = 0
            while secondrun and self.opt['Chaser'] and not pickled:
                self.progLog('#WAIT','No %s pickle. Sleeping for %d min.' % (acc,tm))
                time.sleep(60*tm); tm += 1      # Wait gets a minute longer each time; first check is immediate
                pickled = os.path.exists(pkfile) or os.path.exists('%s.gz' % pkfile)
                if not pickled:
                    try: rje.choice('Press <ENTER> to try again, or <CTRL+C> to Quit')
                    except:     # CTRL+C (or any input failure) abandons the Chaser run
                        # Only one %s here: passing (acc,tm) raised a TypeError before the run could exit cleanly
                        self.printLog('#PICKLE','No %s pickle.' % acc)
                        self.printLog('\r#MULTI','Exiting multiHAQ "Chaser" run.'); return
            ## ~ [1d] Run HAQESAC ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            runhaqesac = True
            pngfile = rje.makePath('%s%s.png' % (self.info['HaqDir'],acc),wholepath=True)
            if not self.force() and rje.exists(pngfile):
                self.printLog('#SKIP','Found evidence of completed run: %s (force=F). Skipping.' % pngfile)
                runhaqesac = False
            ancfile = rje.makePath('%s%s.anc.fas' % (self.info['HaqDir'],acc),wholepath=True)
            if not self.force() and rje.exists(ancfile):
                self.printLog('#SKIP','Found evidence of completed run: %s (force=F). Skipping.' % ancfile)
                runhaqesac = False
            if runhaqesac:
                haqcmd = ['ini=haqesac.ini','seqin=%s.fas' % acc, 'query=%s' % acc, 'basefile=%s' % acc, 'newlog=F']
                self.printLog('#HAQ','Running HAQESAC for %s - will have own log etc.' % seq.shortName(),log=False)
                os.chdir(self.info['HaqDir'])
                info = haqesac.makeInfo()
                haqcmd = rje.getCmdList(haqcmd,info=info)
                out = rje.Out(cmd_list=haqcmd)          # Sets up Out object for controlling output to screen
                out.printIntro(info)                    # Prints intro text using details from Info object
                haqlog = rje.setLog(info,out,haqcmd)    # Sets up Log object for controlling log file output
                try: haqesac.HAQESAC(log=haqlog, cmd_list=haqcmd).run(setobjects=True)
                except:
                    os.chdir(self.info['RunPath'])
                    # Raised to the outer handler below, which logs the error with the option to quit
                    if self.i() >= 0 and rje.yesNo('Problem with %s HAQESAC run. Abort?' % seq.shortName()): raise KeyboardInterrupt
                os.chdir(self.info['RunPath'])
                if finalrun: self.printLog('#HAQ','HAQESAC final round run for %s' % seq.shortName())
                else: self.printLog('#HAQ','HAQESAC first round run for %s' % seq.shortName())
            ## ~ [1e] Update ScreenQry ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if not self.opt['ScreenQry'] or not finalrun: continue
            qacclist = []   # Other queries found in this input file
            for qacc in rje_seq.SeqList(self.log,['seqin=%s' % infile,'autoload=T','autofilter=F']).accList():
                if qacc in qryacc and qacc != acc: qacclist.append(qacc)
                if qacc in qryacc and qacc not in processed: processed.append(qacc)
            self.printLog('#QRY','%d other queries found in %s: [%s]' % (len(qacclist),infile,'; '.join(qacclist)))
            self.printLog('#QRY','%d of %d queries processed' % (len(processed),self.seqNum()))
        ### ~ [2] MultiHAQ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not finalrun: self.printLog('#MULTI','Executing second round of multiHAQ'); self.multiHAQ(True)
    except: self.errorLog('Major problem with MultiHAQ.multiHAQ',quitchoice=True)
def slimJimMapping(self):  ### Generate SLiMJIM PNGs for all sequences
    '''
    Generate SLiMJIM PNGs for every alignment found in the input sequences.

    The input is loaded with rje_seq.SeqList and split into alignments: a new alignment starts at each sequence
    whose name contains 'Motifs' as its second whitespace-separated word. For each alignment (in sorted key order):
    - sequences are labelled through info['R'] (first = name remainder after the query name, second = 'Masked',
      rest = their info['ID']);
    - <basefile>.aln.tdt and <basefile>.rel.tdt are (re)written;
    - R is run on the rje.r script found in self.info['Path'] to make the PNG files.

    Problems with a single alignment are logged and counted without stopping the others. Nothing is returned.
    '''
    try:
        ### ~ [1] ~ Setup: split loaded sequences into one list per alignment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        mapseq = {}     # Dictionary of {dataset:[seqs]}
        scmd = ['autoload=T', 'seqnr=F', 'accnr=F', 'replacechar=F']
        mseq = rje_seq.SeqList(self.log, self.cmd_list + scmd)
        while mseq.seq:
            ## ~ [1a] ~ Read in all sequences for one alignment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            pseq = [mseq.seq.pop(0)]    # List of sequences for this protein
            while mseq.seq:
                name = mseq.seq[0].info['Name']
                words = name.split()
                # Length check stops a one-word name that contains 'Motifs' from raising IndexError
                if name.find('Motifs') > 0 and len(words) > 1 and words[1] == 'Motifs':
                    break   # Start of the next protein
                pseq.append(mseq.seq.pop(0))
            ## ~ [1b] ~ Update relevant sequence dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            mapseq[pseq[0].shortName()] = pseq[0:]
        self.printLog('#ALN', '%d distinct alignments identified' % len(mapseq))

        ### ~ [2] ~ Make SLiMJIM visualisations for each protein ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ex = 0  # Number of errors
        for mapping in rje.sortKeys(mapseq):
            try:
                ## ~ [2a] ~ Rename sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                # Select this alignment BEFORE deriving any names from it; otherwise basefile/qryname would be
                # taken from whichever alignment was handled last.
                pseq = mapseq[mapping][0:]
                basefile = pseq[0].shortName()
                if self.interactive() > 0 and not rje.yesNo(basefile): continue
                qryname = pseq[2].shortName()
                pseq[0].info['R'] = pseq[0].shortName()[len(qryname) + 1:]
                pseq[1].info['R'] = 'Masked'
                for seq in pseq[2:]: seq.info['R'] = seq.info['ID']
                ## ~ [2b] ~ Setup new SeqList, strip Query gaps, calculate RelCons ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                seqfile = '%s.aln.tdt' % basefile
                if os.path.exists(seqfile): os.unlink(seqfile)
                rseq = rje_seq.SeqList(self.log, self.cmd_list + scmd + ['autoload=F'])
                rseq.seq = pseq
                rseq.obj['QuerySeq'] = pseq[2]
                rseq.tidyQueryGaps()
                rseq.saveR(rseq.seq, seqfile, name='R')
                rseq.seq = pseq[2:]     # Relative conservation uses the query onwards only
                relfile = '%s.rel.tdt' % basefile
                if os.path.exists(relfile): os.unlink(relfile)
                rseq.relCons(relfile)
                self.deBug(rseq.obj['QuerySeq'].cmd_list)
                ## ~ [2c] ~ Call R to generate graphics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                rcmd = '%s --no-restore --no-save --args "sfmap2png" "%s"' % (self.info['RPath'], basefile)
                rslimjim = '%srje.r' % self.info['Path']
                rcmd += ' < "%s" > "%s.r.tmp.txt" 2>&1' % (rslimjim, basefile)
                self.printLog('#RSLIM', rcmd)
                rpipe = os.popen(rcmd)
                try: problems = rpipe.read()
                finally: rpipe.close()
                if problems: self.errorLog(problems, printerror=False)
                pngx = len(glob.glob('%s*png' % basefile))
                self.printLog('#PNG', '%d PNG files made for %s' % (pngx, basefile))
                # The R output file is only removed when PNGs were made, so it survives for failed runs
                if pngx and os.path.exists('%s.r.tmp.txt' % basefile): os.unlink('%s.r.tmp.txt' % basefile)
            except:
                self.errorLog('SLiMJIM visualisation error for "%s"' % mapping)
                ex += 1
        self.printLog('#SLIMJIM', 'Generation of SLiMJIMs complete. %d Problems.' % ex)
    except: self.errorLog(rje_zen.Zen().wisdom())
def run(self):  ### Main Run method
    '''
    Main Run method. Executes one of three modes, checked in this order:
    1. self.opt['SLiMDisc']: hands over to self.slimDisc() and returns its result.
    2. self.opt['Teiresias']: (re)runs TEIRESIAS if needed, reads its patterns, scores them and writes either one
       combined results table or, if self.opt['MySQL'], separate pattern and occurrence tables.
    3. self.info['Info'] != 'None': loads motifs from that file and writes their information content scores.
    << returns False if the Info input file is missing; otherwise nothing (or the slimDisc() result).
    Any error is logged through self.log.errorLog() and then re-raised.
    '''
    try:
        ### ~ [1] SLiMDisc Run ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.opt['SLiMDisc']: return self.slimDisc()

        ### ~ [2] TEIRESIAS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.opt['Teiresias']:
            ## ~ [2a] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            seqlist = rje_seq.SeqList(self.log, self.cmd_list)
            infile = '%s.teiresias.fas' % rje.baseFile(seqlist.info['Name'], True)
            outfile = '%s.teiresias.out' % rje.baseFile(seqlist.info['Name'], True)
            run_teiresias = True
            if rje.isYounger(outfile, infile) == outfile:
                # Existing output is reused unless the user (interactive runs only) asks to regenerate it
                if self.stat['Interactive'] < 1 or not rje.yesNo('%s and %s exist already. Regenerate?' % (infile, outfile), 'N'):
                    run_teiresias = False
            ## ~ [2b] Run TEIRESIAS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if run_teiresias:
                seqlist.saveFasta(seqfile=infile, name='Teiresias')     # Saves sequences in fasta format
                command = rje.makePath(self.info['TeiresiasPath'], True)
                command += ' -i%s -o%s %s' % (infile, outfile, self.info['TeiresiasOpt'])
                self.log.printLog('#CMD', command)
                os.system(command)
            ## ~ [2c] Read Results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.verbose(0, 2, 'Reading TEIRESIAS output from %s...' % outfile, 1)
            self.list['Pattern'] = []
            re_pattern = r'^(\d+)\s+(\d+)\s+(\S+)\s+(\d.+\d)$'
            RESULTS = open(outfile, 'r')
            try:
                line = RESULTS.readline()
                while line:
                    patdata = rje.matchExp(re_pattern, line)
                    if patdata: self.addTeiresiasPattern(patdata)   # New pattern
                    elif len(line) > 3 and line[0] != '#':
                        self.log.errorLog('Did not recognise line: %s' % line, False, False)
                    line = RESULTS.readline()
            finally: RESULTS.close()
            patx = len(self.list['Pattern'])
            self.log.printLog('#PAT', '%s TEIRESIAS patterns read from %s.' % (rje.integerString(patx), outfile))
            ## ~ [2d] Calculate Information Content ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            aafreq = seqlist.aaFreq()
            self.verbose(0, 3, 'Calculating Information Content & Length stats...', 0)
            occx = 0
            for pattern in self.list['Pattern']:
                pattern.stat['Info'] = self.calculateScore(pattern.info['Pattern'], aafreq)
                pattern._makeLength()
                occx += 1
                rje.progressPrint(self, occx, patx/100, patx/10)
            self.verbose(0, 1, '...Done!', 2)
            ## ~ [2e] Prepare Results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            delimit = rje.getDelimit(self.cmd_list)
            if self.info['Name'] == 'None':
                self.info['Name'] = '%s.teiresias.%s' % (rje.baseFile(seqlist.info['Name'], True), rje.delimitExt(delimit))
            if self.opt['MySQL']:   # Two tables
                (namebase, nameext) = os.path.splitext(self.info['Name'])
                occfile = '%s.occ%s' % (namebase, nameext)
                patfile = '%s.patterns%s' % (namebase, nameext)
                if self.opt['Append']:
                    PATFILE = open(patfile, 'a')
                    OCCFILE = open(occfile, 'a')
                else:
                    PATFILE = open(patfile, 'w')
                    rje.writeDelimit(PATFILE, ['pattern','tot_occ','seq_occ','info','len','fix','wild'], delimit)
                    # Not appending: start a fresh occurrence table too, rather than adding headers to old data
                    OCCFILE = open(occfile, 'w')
                    rje.writeDelimit(OCCFILE, ['seq_id','pos','pattern','pat_match'], delimit)
            else:
                if self.opt['Append']: RESFILE = open(self.info['Name'], 'a')
                else:
                    # Single table goes to self.info['Name'] (patfile only exists in the MySQL branch)
                    RESFILE = open(self.info['Name'], 'w')
                    rje.writeDelimit(RESFILE, ['Sequence Name','Position','Pattern','Match','Total Occurrences','Num Sequences','Information Content','Length','Fixed','Wildcard'], delimit)
            ## ~ [2f] Save Results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            occx = 0
            for pattern in self.list['Pattern']:
                patstats = ['%d' % pattern.stat[stat] for stat in ['OccCount','SeqCount','Info','Length','Fixed','Wildcards']]
                patstats[2] = '%.3f' % pattern.stat['Info']     # Info is the only non-integer statistic
                if self.opt['MySQL']:   # Two tables
                    rje.writeDelimit(PATFILE, [pattern.info['Pattern']] + patstats, delimit)
                for occ in rje.sortKeys(pattern.occ):
                    seq = seqlist.seq[occ]
                    for pos in pattern.occ[occ]:
                        match = seq.info['Sequence'][pos:(pos+pattern.stat['Length'])]
                        outlist = [seq.shortName(), '%d' % pos, pattern.info['Pattern'], match]
                        if self.opt['MySQL']: rje.writeDelimit(OCCFILE, outlist, delimit)
                        else: rje.writeDelimit(RESFILE, outlist + patstats, delimit)
                        occx += 1
            if self.opt['MySQL']:   # Two tables
                PATFILE.close()
                OCCFILE.close()
                self.log.printLog('#OUT', '%s patterns output to %s.' % (rje.integerString(patx), patfile))
                self.log.printLog('#OUT', '%s pattern occurrences output to %s.' % (rje.integerString(occx), occfile))
            else:
                RESFILE.close()
                self.log.printLog('#OUT', '%s occurrences of %s patterns output to %s.' % (rje.integerString(occx), rje.integerString(patx), self.info['Name']))

        ### ~ [3] InfoContent ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        elif self.info['Info'] != 'None':
            ## ~ [3a] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            alphabet = rje_seq.alph_protx
            if not os.path.exists(self.info['Info']):
                self.log.errorLog('Input file %s missing!' % self.info['Info'], False, False)
                return False
            mypresto = presto.Presto(self.log, self.cmd_list)
            mypresto.loadMotifs(file=self.info['Info'], clear=True)
            seqlist = rje_seq.SeqList(self.log, self.cmd_list + ['autoload=T'])
            if seqlist.seqNum() > 0:    # Frequencies from the loaded sequences
                aafreq = seqlist.aaFreq(alphabet=None, fromfile=None, loadfile=None, total=False)
            else:                       # No sequences: equal frequency for every letter of the alphabet
                aafreq = {}
                for aa in alphabet: aafreq[aa] = 1.0 / len(alphabet)
            maxinfo = 0
            for aa in aafreq:
                # 0 * log(0) is taken as 0 (usual convention); math.log(0) itself would raise ValueError
                if aafreq[aa] > 0: maxinfo += (aafreq[aa] * math.log(aafreq[aa], 2))
            ## ~ [3b] Output setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            delimit = rje.getDelimit(self.cmd_list)
            ext = rje.delimitExt(delimit)
            outfile = '%s.info.%s' % (rje.baseFile(self.info['Info'], True, ['.txt', '.%s' % ext]), ext)
            if self.opt['Append']: OUTFILE = open(outfile, 'a')
            else:
                OUTFILE = open(outfile, 'w')
                rje.writeDelimit(OUTFILE, ['motif','pattern','info'], delimit)
            ## ~ [3c] Calculate and output Information Scores ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            try:
                for motif in mypresto.motif:
                    self.verbose(2, 4, motif.info['Sequence'], 0)
                    pattern = ''
                    for el in motif.info['Sequence'].replace('X', '.').split('-'):
                        if el.find('.{') == 0: pattern += '.'   # Ambiguous spacer length - compress
                        else: pattern += el
                    self.verbose(2, 2, '=> %s' % pattern, 1)
                    motif.stat['Info'] = self.calculateInformationContent(pattern, aafreq, maxinfo, self.stat['InfoGapPen'])
                    self.verbose(0, 3, '%s (%s) = %.2f' % (motif.info['Name'], pattern, motif.stat['Info']), 1)
                    rje.writeDelimit(OUTFILE, [motif.info['Name'], pattern, '%.2f' % motif.stat['Info']], delimit)
            finally: OUTFILE.close()
    except:
        self.log.errorLog('Error in run().', printerror=True, quitchoice=False)
        raise   # Delete this if method error not terrible
def readHMMPFamSearch(self, resfile=None, readaln=False):  ### Reads HMM PFam Search Results into objects
    '''
    Reads hmmpfam search results into search/hit/alignment objects.
    >> resfile:str = Results file to read. Must exist and its first line must start with 'hmmpfam'.
    >> readaln:boolean = whether to bother reading Alignments into objects [False] !!! Currently not used !!!
    << returns True if the file was read through, else False (missing/wrong file, or any error - which is logged).

    One search is added (self._addSearch()) per domain name, with one hit + alignment per domain line read from the
    'Parsed for domains:' tables. If self.opt['CleanRes'] is set, resfile is replaced with a reduced copy holding
    the file header plus only those query sequences that had domain hits, followed by a footer line recording the
    number of sequences in the original file.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not resfile or not os.path.exists(resfile):
            self.log.errorLog('Results file "%s" missing!' % resfile, printerror=False)
            return False
        re_query = r'^Query sequence:\s+(\S+)'
        re_domain = r'\s+'.join([r'^(\S+)', r'\S+', r'(\d+)', r'(\d+)\D.+', r'(\S+)', r'(\S+)\s*$'])
        re_footer = r'End of rje_hmm reduced results file: (\d+) sequences in original'
        pfam = {}   # Dictionary of {PFam name:search} - results come back per sequence, not per HMM!
        hitx = 0    # Total number of hits
        ## ~ [1a] Check file type ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        CHECK = open(resfile, 'r')
        try: firstline = CHECK.readline()
        finally: CHECK.close()
        if firstline.find('hmmpfam') != 0:
            self.errorLog('File "%s" does not appear to be an hmmpfam results file' % resfile, printerror=False)
            if rje.yesNo('Delete incorrect results file? (Check that hmmpfam=T is right!)', default='N'):
                os.unlink(resfile)
                self.printLog('#DEL', 'Dodgy results file "%s" deleted.' % resfile)
            return False
        ## ~ [1b] Reduced results file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        newresfile = '%s.partial' % resfile
        if os.path.exists(newresfile): os.unlink(newresfile)

        def appendPartial(lines):   # Appends lines to the reduced results file and closes it straight away
            PARTIAL = open(newresfile, 'a')
            try: PARTIAL.write('\n'.join(lines))
            finally: PARTIAL.close()

        ### ~ [2] Read in Search results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        hitname = None  # Name of the query sequence currently being read
        hx = 0          # Number of query sequences read
        seqx = 0        # Number of sequences in the original file (from the footer of a reduced file)
        RESFILE = open(resfile, 'r')
        try:
            line = RESFILE.readline()
            # newres = lines of the current chunk (header, then one chunk per query); newresout = worth keeping.
            # NOTE(review): the first line seeds newres and is then processed by the loop as well, so it gets
            # appended a second time - confirm whether the duplicated header line is intended.
            newres = [rje.chomp(line)]
            newresout = True
            while line:
                self.progLog('\r#RES', 'Reading %s: %s Seqs; %s Domains; %s Hits' % (resfile, rje.integerString(hx), rje.integerString(len(pfam)), rje.integerString(hitx)))
                line = rje.chomp(line)
                query = rje.matchExp(re_query, line)
                ## ~ [2a] New Sequence ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if query:
                    if newres and newresout and self.opt['CleanRes']: appendPartial(newres)
                    newres = ['', line]
                    newresout = False   # Chunk is only kept once a domain hit is read for it
                    hitname = query[0]
                    hx += 1
                ## ~ [2b] One Line Data for hits ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                elif line.find('Parsed for domains:') == 0:
                    # The next two lines (table header and ruler) are kept for the reduced file but not parsed:
                    #Model           Domain  seq-f seq-t    hmm-f hmm-t      score  E-value
                    #--------        ------- ----- -----    ----- -----      -----  -------
                    #Lep_receptor_Ig   1/1      24   114 ..     1   103 []   158.4  1.7e-44
                    # ... else ...
                    #         [no hits above thresholds]
                    newres += [line, rje.chomp(RESFILE.readline()), rje.chomp(RESFILE.readline())]
                    line = rje.chomp(RESFILE.readline())
                    newres.append(line)
                    domdata = rje.matchExp(re_domain, line)
                    while domdata:
                        newresout = True
                        (dom, start, end, score, evalue) = domdata
                        if dom not in pfam:
                            pfam[dom] = self._addSearch()
                            pfam[dom].info['Name'] = dom
                        hit = pfam[dom]._addHit()
                        hit.info['Name'] = hitname
                        aln = hit._addAln()
                        aln.setStat({'SbjStart': int(start), 'SbjEnd': int(end), 'Expect': float(evalue), 'BitScore': float(score)})
                        hitx += 1
                        self.progLog('\r#RES', 'Reading %s: %s Seqs; %s Domains; %s Hits' % (resfile, rje.integerString(hx), rje.integerString(len(pfam)), rje.integerString(hitx)))
                        line = rje.chomp(RESFILE.readline())
                        newres.append(line)
                        domdata = rje.matchExp(re_domain, line)
                ## ~ [2c] End of Protein ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                elif line[:2] == '//':
                    hitname = None
                    newres.append(line)
                ## ~ [2d] Footer of previously reduced file: recover original sequence count ~~~~~~~~~~~~~~~~ ##
                elif rje.matchExp(re_footer, line):
                    seqx = int(rje.matchExp(re_footer, line)[0])
                elif newres: newres.append(line)
                line = RESFILE.readline()
        finally: RESFILE.close()    # Must be closed before the file is deleted/replaced below

        ### ~ [3] Finish: save reduced file and report ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if newres and newresout and self.opt['CleanRes']: appendPartial(newres)
        if not seqx: seqx = hx
        if self.opt['CleanRes']:
            appendPartial(['', 'End of rje_hmm reduced results file: %d sequences in original' % seqx])
            os.unlink(resfile)
            os.rename(newresfile, resfile)
            self.printLog('\r#RED', 'Results file %s replaced with reduced version (%s Hits only)' % (resfile, rje.integerString(hitx)))
        self.printLog('\r#RES', 'Reading %s complete: %s Seqs; %s Domains; %s Hits' % (resfile, rje.integerString(seqx), rje.integerString(len(pfam)), rje.integerString(hitx)))
        return True
    except:
        self.log.errorLog('Calamity during readHMMSearch(%s)' % (resfile))
        return False
def menu(callobj,headtext='',menulist=[],choicetext='Please select:',changecase=True,default=''):   ### Main Menu method
    '''
    Main Menu method.
    >> callobj:Object for which attributes are to be read and altered. Also controls interactivity and log.
    >> headtext:str [''] = Introductory text for menu system.
    >> menulist:list [] = List of menu item tuples (edit code,description,optiontype,optionkey)
        - e.g. ('0','Sequence file','info','Name') would edit callobj.info['Name'])
        - If optiontype == 'return' then menu will return the value given in optionkey
        - If optiontype == '' then description will be printed as a breaker
        - If optiontype == 'infile' then callobj.info['Name'] would be changed using rje.getFileName(mustexist=True)
        - If optiontype == 'outfile' then callobj.info['Name'] would be changed using rje.getFileName(confirm=True)
        - If optiontype == 'showtext' then optionkey should contain text to be printed with verbose
        - If optiontype == 'addcmd' then commands can be added.
        - (menulist is only read, never modified. An empty list returns True without showing a menu.)
    >> choicetext:str ['Please select:'] = Text to display for choice option
    >> changecase:boolean [True] = change all choices and codes to upper text
    >> default:str [''] = Default choice; also returned if the user interrupts and chooses to exit the menu
    << returns optionkey if appropriate, else True
    KeyboardInterrupt is re-raised if the user chooses to terminate. Other errors are logged through
    callobj.errorLog() (or re-raised if there is no callobj).
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        attrtypes = ['info','list','opt','stat','int','str','bool','num']  # Types shown via getAttribute(type,key)
        filetypes = ['infile','outfile']                                    # File names stored in callobj.info
        ## ~ [0a] Choice Dictionary: {code:(optiontype,optionkey)} ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        choicedict = {}
        for (code,desc,otype,key) in menulist:
            if not otype: continue      # Breaker lines cannot be selected
            if changecase: choicedict[code.upper()] = (otype,key)
            else: choicedict[code] = (otype,key)
        ## ~ [0b] Setup Header Text: boxed in '#' and padded to the longest line ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        headlines = headtext.split('\n')
        maxlen = max([len(line) for line in headlines])
        headlist = ['#' * (maxlen + 10)]
        for line in headlines: headlist.append('# #> %s <# #' % line.ljust(maxlen))
        headlist.append(headlist[0])
        ### ~ [1] Main Menu Loop ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        while menulist:
            ## ~ [1a] Setup menu text with current values (rebuilt after every edit) ~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            mtxt = '\n%s' % '\n'.join(headlist)
            while mtxt[-2:] != '\n\n': mtxt += '\n'
            for (code,desc,otype,key) in menulist:
                if otype and (code or desc):
                    mtxt += '<%s> %s' % (code,desc)
                    if otype in attrtypes: mtxt += ': %s' % callobj.getAttribute(otype,key,default='#!#ERROR#!#')
                    elif otype in filetypes: mtxt += ': %s' % callobj.getAttribute('info',key,default='#!#ERROR#!#')
                else: mtxt += desc
                mtxt += '\n'
            ## ~ [1b] Give Choices: ask until a recognised choice breaks back to the menu or returns ~~~~~~~~~ ##
            print(mtxt)
            while mtxt:
                try:## ~ Input user choice ~~~ ##
                    choice = rje.choice(choicetext,default=default)
                    if changecase: choice = choice.upper()
                    ## ~ Process user choice ~ ##
                    if choice in choicedict:
                        (otype,key) = choicedict[choice]
                        if otype in ['str','info']: callobj.setInfo({key:callobj._editChoice(key,callobj.getStr(key))})
                        elif otype in ['num','stat']: callobj.setStat({key:callobj._editChoice(key,callobj.getNum(key),numeric=True)})
                        elif otype == 'int': callobj.setStat({key:int(callobj._editChoice(key,callobj.getInt(key),numeric=True))})
                        elif otype in ['bool','opt']: callobj.setOpt({key: not callobj.getBool(key)})
                        elif otype == 'list': callobj.list[key] = callobj._editChoice(key,callobj.list[key]).split()
                        elif otype == 'infile': callobj.setInfo({key: rje.getFileName('%s File Name?' % key,callobj.getStr(key))})
                        elif otype == 'outfile': callobj.setInfo({key: rje.getFileName('%s File Name?' % key,callobj.getStr(key),mustexist=False,confirm=True)})
                        elif otype == 'showtext': callobj.verbose(-1,-1,key); break
                        elif otype == 'addcmd':
                            prevcmd = callobj.cmd_list
                            # callobj is passed as the output-controlling object: no separate 'out' object exists
                            # in this function. NOTE(review): assumes rje.inputCmds() accepts callobj here - confirm.
                            callobj.cmd_list = rje.inputCmds(callobj,prevcmd)
                            callobj.printLog('#CMD','User Added commands: %s' % callobj.cmd_list)
                            callobj._cmdList()      # Called while cmd_list holds just the new commands
                            callobj.cmd_list = prevcmd + callobj.cmd_list
                            break
                        elif otype == 'return': return key
                        if otype in attrtypes + filetypes:
                            callobj.printLog('#%s' % otype.upper(),'User edited %s parameter' % key); break
                    print('Choice "%s" not recognised!\n' % choice)
                except KeyboardInterrupt:
                    if rje.yesNo('Terminate program?'): raise
                    if rje.yesNo('Exit menu and proceed?'):
                        if default: return default
                        else: return True
        ### End ###
        return True
    except KeyboardInterrupt: raise
    except:
        if callobj: callobj.errorLog('Major disaster in rje_menu.menu()',quitchoice=True)
        else: raise