Example #1
0
 def _setupOutput(self): ### Names the MapFas/MissFas/MapRes output files and starts the MapRes table
     '''
     Sets up output files self.str['MapFas','MissFas','MapRes'].
     - A StartFrom value other than ''/'none' switches self.bool['Append'] on (and is logged).
     - Each output name is ResFile plus a fixed suffix (mapping.fas, missing.fas, mapping.EXT), where EXT is
       whatever rje.delimitExt() returns for the delimiter in use. MissFas is dropped if Combine is set.
     - If ResFile is ''/'none' it is first built from the base names of SeqIn and MapDB.
     - self.list['Headers'] is populated and handed to rje.delimitedFileOutput() for the MapRes file.
     '''
     ### ~ [0] Delimiter and StartFrom ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     delimit = rje.getDelimit(self.cmd_list)
     startfrom = self.str['StartFrom']
     if startfrom.lower() not in ['','none']:
         self.bool['Append'] = True
         self.printLog('#CMD','StartFrom = "%s" so Append=T' % startfrom)
     else: self.str['StartFrom'] = ''
     ### ~ [1] Output file names ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     suffixes = {'MapFas':'mapping.fas','MissFas':'missing.fas','MapRes':'mapping.%s' % rje.delimitExt(delimit)}
     if self.getBool('Combine'): suffixes.pop('MissFas')
     if self.str['ResFile'].lower() in ['','none']:
         seqbase = rje.baseFile(self.str['SeqIn'])
         dbbase = rje.baseFile(self.str['MapDB'],strip_path=True)
         self.str['ResFile'] = '%s.%s' % (seqbase,dbbase)
     for fkey in suffixes:
         resname = self.getStr('ResFile') + '.' + suffixes[fkey]
         self.setStr({fkey: resname})
         rje.backup(self,self.getStr(fkey))
     ### ~ [2] Headers for MapRes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     #!# Consider replacing with rje_db object? #!#
     headers = ['Query','Hit','Method','MapRank','BlastRank','EVal','Score']
     self.list['Headers'] = headers      # Same list object, so the additions below are visible via self.list
     for qh in ['Query','Hit']:
         headers.append('%s_Species' % qh)
         if not self.bool['GablamOut']: continue
         for st in ['Len','Sim','ID']: headers.append('%s_%s' % (qh,st))
     rje.delimitedFileOutput(self,self.str['MapRes'],self.list['Headers'],delimit)
Example #2
0
 def save(self):  ### Saves parsed REST output to files
     '''
     Saves parsed REST output to files.
     - If the Rest setting names one of the parsed outputs, only that output is saved.
     - If Rest is 'full' or 'text', self.restFullOutput() is written to a single *.rest file.
     - Any other non-empty Rest value is reported as unrecognised; the full output is then only written if
       self.i() >= 0 and the user agrees to it.
     - With no Rest setting, every parsed output that has an entry in self.dict['Outfile'] is saved.
     << True if a single *.rest file was written; False if an unrecognised format was not replaced by full output;
        None after saving the individual output files.
     '''
     restdir = self.getStr('RestOutDir')
     rbase = '%s%s' % (restdir,rje.baseFile(self.getStr('RestBase'),strip_path=True,keepext=True))
     rje.mkDir(self,restdir)
     outputs = rje.sortKeys(self.dict['Output'])
     restfmt = self.getStrLC('Rest')
     if restfmt in outputs: outputs = [restfmt]
     elif restfmt:
         ## ~ Single combined *.rest file: requested directly, or as the fallback for an unknown format ~ ##
         label = restfmt
         if restfmt not in ['full','text']:
             self.printLog('#OUTFMT','REST output format "%s" not recognised.' % restfmt)
             if self.i() < 0 or not rje.yesNo('Output all parsed outputs?'): return False
             label = 'full'
         outfile = '%s.rest' % rbase
         with open(outfile,'w') as OUT: OUT.write(self.restFullOutput())     # Handle is closed even on error
         self.printLog('#OUT','%s: %s' % (label,outfile))
         return True
     for rkey in outputs:
         if rkey in self.dict['Outfile']:
             outfile = self.dict['Outfile'][rkey]
             rje.backup(self,outfile)
             with open(outfile,'w') as OUT: OUT.write(self.dict['Output'][rkey])
             self.printLog('#OUT','%s: %s' % (rkey,outfile))
         elif rkey not in ['intro']:     # 'intro' output is allowed to have no file of its own
             self.warnLog('No outfile parsed/generated for %s output' % rkey)
Example #3
0
 def release(self):  ### Generate the release information tables.
     '''
     Generate the release information tables.
     - Copies any existing PrevBase.* release files (release.tdt, history.tdt, readme.txt, updates.html) to
       BackBase.*, loading the previous history.tdt as the 'history' table.
     - Builds the 'release' table from a copy of the 'Module' table and saves it if 'release' is in
       self.list['Output'].
     - Adds an empty 'history' table if none was loaded, calls self.makeHistory() and, if 'readme' is in
       self.list['Output'], saves BaseFile.readme.txt.
     << True on completion; False if any error was raised (it is logged via self.errorLog()).
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         db = self.db()
         hdb = None      # History table: dir, module, version, update, release
         basefile = self.basefile()
         prevbase = self.getStr('PrevBase')
         backbase = self.getStr('BackBase')
         # The backup must never overwrite the files being read (PrevBase) or about to be written (BaseFile)
         if backbase == prevbase: raise ValueError('BackBase cannot match PrevBase ("%s")' % prevbase)
         if backbase == basefile: raise ValueError('BackBase cannot match BaseFile ("%s")' % basefile)
         ## ~ [1a] Load & Backup previous release ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         for sfile in ['release.tdt','history.tdt','readme.txt','updates.html']:
             pfile = '%s.%s' % (prevbase,sfile)
             bfile = '%s.%s' % (backbase,sfile)
             if not os.path.exists(pfile): continue
             if os.path.exists(bfile): rje.backup(self,bfile)
             # Read the source completely before opening the target, so a failed read cannot leave an
             # emptied backup file behind; both handles are closed as soon as they are finished with.
             with open(pfile,'r') as PREV: prevtext = PREV.read()
             with open(bfile,'w') as BACK: BACK.write(prevtext)
             self.printLog('#BACK','%s => %s' % (pfile,bfile))
             if sfile == 'history.tdt': hdb = db.addTable(filename=pfile,mainkeys=['Dir','Module','Version'],name='history',expect=True)
         ### ~ [2] Generate slimsuite.release.tdt based on *.Module.tdt ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         rdb = db.copyTable(self.db('Module'),'release')
         rdb.renameField('SourceDir','Dir')
         rdb.newKey(['Dir','Module'])
         rdb.dropFields(['File','Classes','Methods'])
         if 'release' in self.list['Output']: rdb.saveToFile(backup=False)
         ### ~ [3] Generate slimsuite.history.tdt, parsed from docstrings, based on pydoc.distribute() ~~~~~~~~~~~ ###
         if not hdb:
             hdb = db.addEmptyTable('history',['Dir','Module','Version','Update','Release'],['Dir','Module','Version'])
         self.makeHistory()
         # Generate slimsuite.readme.txt based on pydoc.saveDocs()
         if 'readme' in self.list['Output']: self.saveReadMe('%s.readme.txt' % basefile)
         return True
     except: self.errorLog('%s.release error' % self.prog()); return False
Example #4
0
 def readResults(self,clear=True,readaln=False):  ### Parses each file in self.list['HMMRes'] into search objects
     '''
     Reads results from self.list['HMMRes'] into objects.
     >> clear:boolean = whether to clear self.search before reading [True]
     >> readaln:boolean = whether to bother reading Alignments into objects [False]
     << None when all files have been processed; False if an error was raised (and logged).
     '''
     try:
         if clear: self.search = []
         for resfile in rje.sortUnique(self.list['HMMRes'],xreplace=False):
             ## ~ Restore a previously gzipped results file if the plain file is absent ~ ##
             if not os.path.exists(resfile) and self.opt['GZip'] and os.path.exists('%s.gz' % resfile):
                 os.system('gunzip %s.gz' % resfile)
                 self.printLog('#GUNZIP','Gunzipped %s.gz' % resfile)
             ## ~ Parse with the reader matching the search type ~ ##
             reader = self.readHMMPFamSearch if self.opt['HMMPFam'] else self.readHMMSearch
             reader(resfile,readaln)
             ## ~ Compress the results file again (any existing *.gz is backed up first) ~ ##
             if not self.opt['GZip']: continue
             if not os.path.exists(resfile): continue
             rje.backup(self,'%s.gz' % resfile,unlink=True)
             os.system('gzip %s' % resfile)
             self.printLog('#GZIP','%s gzipped to save space' % resfile)
     except:
         self.log.errorLog('Hmm indeed. rje_hmm.readResults() gone awry!',quitchoice=True)
         return False
Example #5
0
    def _run(self):  ### Build HMMs, run the searches, then tabulate the results
        '''
        Controls main Class functions:
        * 1. Use hmmbuild to construct HMMs from input sequence files (self.list['MakeHMM'])
        * 2. Search a sequence database with HMMs files (self.list['HMM'])
        * 3. Convert HMMer output into a delimited text file of results.
        << True on completion; False if an error was raised (and logged).
        '''
        try:
            ### 1. Build: every successfully built HMM file joins the list of HMMs to search with ###
            for seqfile in self.list['MakeHMM']:
                hmmfile = self.buildHMM(seqfile)
                if not hmmfile: continue
                self.list['HMM'].append(hmmfile)

            ### 2. Search: back up a named output file once, before any search is run ###
            self.deBug(self.list['HMM'])
            if self.list['HMM'] and os.path.exists(self.info['SearchDB']):
                if self.info['HMMOut'].lower() not in ['','none']:
                    rje.backup(self,self.info['HMMOut'],unlink=True)
            for hmm in self.list['HMM']: self.list['HMMRes'].append(self.hmmSearch(hmm,outfile=self.info['HMMOut']))

            ### 3. Tabulate ###
            self.hmmTable(outfile=self.info['HMMTab'],append=self.opt['Append'])
            return True
        except:
            self.log.errorLog('Fatal Error during rje_hmm._run()',quitchoice=True)
            return False
Example #6
0
    def _run(self):  ### Build HMMs, run the searches, then tabulate the results
        '''
        Controls main Class functions:
        * 1. Use hmmbuild to construct HMMs from input sequence files (self.list['MakeHMM'])
        * 2. Search a sequence database with HMMs files (self.list['HMM'])
        * 3. Convert HMMer output into a delimited text file of results.
        << True on completion; False if an error was raised (and logged).
        '''
        try:
            ### 1. Build: every successfully built HMM file joins the list of HMMs to search with ###
            for seqfile in self.list['MakeHMM']:
                hmmfile = self.buildHMM(seqfile)
                if not hmmfile:
                    continue
                self.list['HMM'].append(hmmfile)

            ### 2. Search: back up a named output file once, before any search is run ###
            self.deBug(self.list['HMM'])
            if self.list['HMM'] and os.path.exists(self.info['SearchDB']):
                if self.info['HMMOut'].lower() not in ['', 'none']:
                    rje.backup(self, self.info['HMMOut'], unlink=True)
            for hmm in self.list['HMM']:
                self.list['HMMRes'].append(
                    self.hmmSearch(hmm, outfile=self.info['HMMOut']))

            ### 3. Tabulate ###
            self.hmmTable(outfile=self.info['HMMTab'],
                          append=self.opt['Append'])
            return True
        except:
            self.log.errorLog('Fatal Error during rje_hmm._run()',
                              quitchoice=True)
            return False
Example #7
0
 def _setupOutput(self): ### Names the MapFas/MissFas/MapRes output files and starts the MapRes table
     '''
     Sets up output files self.str['MapFas','MissFas','MapRes'].
     - A StartFrom value other than ''/'none' switches self.bool['Append'] on (and is logged).
     - Each output name is ResFile plus a fixed suffix (mapping.fas, missing.fas, mapping.EXT), where EXT is
       whatever rje.delimitExt() returns for the delimiter in use. MissFas is dropped if Combine is set.
     - If ResFile is ''/'none' it is first built from the base names of SeqIn and MapDB.
     - self.list['Headers'] is populated and handed to rje.delimitedFileOutput() for the MapRes file.
     '''
     ### ~ [0] Delimiter and StartFrom ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     delimit = rje.getDelimit(self.cmd_list)
     startfrom = self.str['StartFrom']
     if startfrom.lower() not in ['','none']:
         self.bool['Append'] = True
         self.printLog('#CMD','StartFrom = "%s" so Append=T' % startfrom)
     else: self.str['StartFrom'] = ''
     ### ~ [1] Output file names ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     suffixes = {'MapFas':'mapping.fas','MissFas':'missing.fas','MapRes':'mapping.%s' % rje.delimitExt(delimit)}
     if self.getBool('Combine'): suffixes.pop('MissFas')
     if self.str['ResFile'].lower() in ['','none']:
         seqbase = rje.baseFile(self.str['SeqIn'])
         dbbase = rje.baseFile(self.str['MapDB'],strip_path=True)
         self.str['ResFile'] = '%s.%s' % (seqbase,dbbase)
     for fkey in suffixes:
         resname = self.getStr('ResFile') + '.' + suffixes[fkey]
         self.setStr({fkey: resname})
         rje.backup(self,self.getStr(fkey))
     ### ~ [2] Headers for MapRes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     #!# Consider replacing with rje_db object? #!#
     headers = ['Query','Hit','Method','MapRank','BlastRank','EVal','Score']
     self.list['Headers'] = headers      # Same list object, so the additions below are visible via self.list
     for qh in ['Query','Hit']:
         headers.append('%s_Species' % qh)
         if not self.bool['GablamOut']: continue
         for st in ['Len','Sim','ID']: headers.append('%s_%s' % (qh,st))
     rje.delimitedFileOutput(self,self.str['MapRes'],self.list['Headers'],delimit)
Example #8
0
 def run(self, gtext=''):  ### Main run method
     '''
     Main run method. Wraps the glossary HTML from self.glossaryHTML() in the page head and tail produced by
     self.obj['HTML'], after appending a "Generated by" note to that object's Copyright text.
     >> gtext:str [''] = Passed straight to self.setup(). When it is empty the page is also written to the
        OutFile setting (after backing up any existing file); otherwise the page is only returned.
     << html:str = Full HTML page text. Errors are logged and re-raised.
     '''
     try:  ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.setup(gtext)
         ### ~ [2] ~ Build the page ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         html = self.glossaryHTML()
         hobj = self.obj['HTML']
         hobj.info['Copyright'] += '. Generated by rje_glossary.py'
         title = '%s' % self.getStr('Name')
         tabber = self.getStr('HTMLStyle').lower() == 'tab'
         frontpage = True
         html = '%s\n\n%s\n\n%s' % (hobj.htmlHead(title, tabber, frontpage), html, hobj.htmlTail(tabber))
         ### ~ [3] ~ Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not gtext:  # Replace with CGI option
             outfile = self.getStr('OutFile')
             rje.backup(self, outfile, appendable=False)
             with open(outfile, 'w') as OUT:     # Handle is closed even if the write fails
                 OUT.write(html)
             self.printLog('#HTML', '%s HTML output to %s' % (title, outfile))
         return html
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise  # Delete this if method error not terrible
Example #9
0
 def run(self):  ### Main run method
     '''
     Main run method.
     - Calls self.difference() once for every pair of tables (each table with every later table in the list).
     - Calls self.average() on every table present after the differences have been calculated.
     - If the TopHTML setting is on, writes BaseFile.html made of an HTML head, the TopHTML text and an HTML tail.
     << None. Returns early if self.setup() fails. Errors are logged and re-raised.
     '''
     try:  ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not self.setup(): return
         ### ~ [2] ~ Add main run code here ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         tables = self.db().tables()[0:]     # Copy: the table list may change while differences are added
         ## ~ [2a] ~ Calculate Differences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         # Position comes from enumerate() rather than tables.index(), which is a repeated linear search and
         # would restart from the first match if two tables ever compared equal.
         for tx, table1 in enumerate(tables):
             for table2 in tables[tx + 1:]:
                 self.difference(table1, table2)
         ## ~ [2b] ~ Calculate Averages & Generate HTML ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         for table in self.db().tables()[0:]:
             self.average(table)
         ## ~ [2c] ~ Output HTML ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if self.getBool('TopHTML'):
             html = rje_html.HTML(self.log, self.cmd_list)
             hfile = '%s.html' % self.basefile()
             rje.backup(self, hfile)
             # Build the page before opening the file so a failure cannot leave an empty file behind
             page = html.htmlHead(title=self.basefile(), tabber=False) + self.getStr('TopHTML') + html.htmlTail(False)
             with open(hfile, 'w') as HTML:
                 HTML.write(page)
         return
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise  # Delete this if method error not terrible
Example #10
0
 def blast2fas(self):    ### Executes BLAST2FAS and copies results files
     '''
     Executes BLAST2FAS and copies results files.
     - Decides whether a BLAST2Fas run is needed: always if self.opt['Force'], otherwise based on comparing each
       sequence's HAQESAC input file (HaqDir + AccNum + .fas) against the Blast2Fas databases with rje.isYounger().
     - Accession numbers listed in BaseFile.blast2fas_null.txt are skipped during that check.
     - After the run, each AccNum.blast.fas found in HAQBLASTDir is moved to HaqDir and any matching pickle files
       are deleted; accession numbers with no such file are appended to the null file.
     << True if BLAST2Fas was run; False if no run was needed. Errors are logged and re-raised.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         need2blast = self.opt['Force']
         # nx counts sequences skipped because they are in the null list
         null_file = '%s.blast2fas_null.txt' % self.baseFile(); nx = 0; null_list = []
         # NOTE(review): string.split() is the Python 2 string-module function and the file handle is never
         # explicitly closed.
         if os.path.exists(null_file): null_list = string.split(open(null_file,'r').read(),'\n')
         self.debug(null_file)
         for seq in self.seqs():
             if seq.info['AccNum'] in null_list: nx += 1; continue
             hfile = rje.makePath('%s%s.fas' % (self.info['HaqDir'],seq.info['AccNum']),wholepath=True)
             for db in self.obj['SeqList'].list['Blast2Fas']:
                 self.debug(rje.isYounger(hfile,db))
                 self.debug(rje.isYounger(hfile,db) == hfile)
                 # Presumably rje.isYounger() returns whichever file is newer, so anything other than hfile
                 # means the input file is missing or out of date - TODO confirm against rje.isYounger().
                 # Once need2blast is True it stays True for the rest of the loop.
                 need2blast = need2blast or not rje.isYounger(hfile,db) == hfile
         if not need2blast:
             self.printLog('#BLAST','All HAQESAC input files found (%s w/o BLAST hits) - no BLAST2Fas (force=F)' % nx)
             return False
         ### ~ [2] Execute ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # The old null file is backed up and the counter reset: the null list is rebuilt by this run
         rje.backup(self,null_file); nx = 0
         # MultiCut takes precedence over BlastCut for the blastb/blastv commands added to the SeqList object
         if self.getInt('MultiCut'): self.obj['SeqList'].cmd_list += ['blastb=%d' % self.getInt('MultiCut'),'blastv=%d' % self.getInt('MultiCut')]
         elif self.getInt('BlastCut'): self.obj['SeqList'].cmd_list += ['blastb=%d' % self.getInt('BlastCut'),'blastv=%d' % self.getInt('BlastCut')]
         if self.getInt('Forks'): self.obj['SeqList'].cmd_list += ['blasta=%d' % self.getInt('Forks')]
         rje_seq.Blast2Fas(self.obj['SeqList'],self.getStr('HAQBLASTDir'))
         for seq in self.seqs():
             sbfile = '%s%s.blast.fas' % (self.getStr('HAQBLASTDir'),seq.info['AccNum'])
             if os.path.exists(sbfile):
                 hfile = rje.makePath('%s%s.fas' % (self.info['HaqDir'],seq.info['AccNum']),wholepath=True)
                 os.rename(sbfile,hfile)
                 # Pickle files sharing the input file's base name are removed along with the old input
                 if os.path.exists('%s.pickle' % rje.baseFile(hfile)): os.unlink('%s.pickle' % rje.baseFile(hfile))
                 if os.path.exists('%s.pickle.gz' % rje.baseFile(hfile)): os.unlink('%s.pickle.gz' % rje.baseFile(hfile))
             # No BLAST2Fas output for this sequence: record its accession number in the null file
             else: open(null_file,'a').write('%s\n' % seq.info['AccNum']); nx += 1
         if nx: self.printLog('#BLAST','%s Accession Numbers without BLAST2Fas hits output to %s' % (nx,null_file))
         self.printLog('#BLAST','%s HAQESAC input files made using BLAST2Fas' % (self.seqNum()-nx))
         return True
     except: self.errorLog('Major problem with MultiHAQ.blast2fas'); raise
Example #11
0
 def release(self):  ### Generate the release information tables.
     '''
     Generate the release information tables.
     - Copies any existing PrevBase.* release files (release.tdt, history.tdt, readme.txt, updates.html) to
       BackBase.*, loading the previous history.tdt as the 'history' table.
     - Builds the 'release' table from a copy of the 'Module' table and saves it if 'release' is in
       self.list['Output'].
     - Adds an empty 'history' table if none was loaded, calls self.makeHistory() and, if 'readme' is in
       self.list['Output'], saves BaseFile.readme.txt.
     << True on completion; False if any error was raised (it is logged via self.errorLog()).
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         db = self.db()
         hdb = None  # History table: dir, module, version, update, release
         basefile = self.basefile()
         prevbase = self.getStr('PrevBase')
         backbase = self.getStr('BackBase')
         # The backup must never overwrite the files being read (PrevBase) or about to be written (BaseFile)
         if backbase == prevbase:
             raise ValueError('BackBase cannot match PrevBase ("%s")' %
                              prevbase)
         if backbase == basefile:
             raise ValueError('BackBase cannot match BaseFile ("%s")' %
                              basefile)
         ## ~ [1a] Load & Backup previous release ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         for sfile in [
                 'release.tdt', 'history.tdt', 'readme.txt', 'updates.html'
         ]:
             pfile = '%s.%s' % (prevbase, sfile)
             bfile = '%s.%s' % (backbase, sfile)
             if not os.path.exists(pfile):
                 continue
             if os.path.exists(bfile): rje.backup(self, bfile)
             # Read the source completely before opening the target, so a failed read cannot leave an
             # emptied backup file behind; both handles are closed as soon as they are finished with.
             with open(pfile, 'r') as PREV:
                 prevtext = PREV.read()
             with open(bfile, 'w') as BACK:
                 BACK.write(prevtext)
             self.printLog('#BACK', '%s => %s' % (pfile, bfile))
             if sfile == 'history.tdt':
                 hdb = db.addTable(
                     filename=pfile,
                     mainkeys=['Dir', 'Module', 'Version'],
                     name='history',
                     expect=True)
         ### ~ [2] Generate slimsuite.release.tdt based on *.Module.tdt ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         rdb = db.copyTable(self.db('Module'), 'release')
         rdb.renameField('SourceDir', 'Dir')
         rdb.newKey(['Dir', 'Module'])
         rdb.dropFields(['File', 'Classes', 'Methods'])
         if 'release' in self.list['Output']: rdb.saveToFile(backup=False)
         ### ~ [3] Generate slimsuite.history.tdt, parsed from docstrings, based on pydoc.distribute() ~~~~~~~~~~~ ###
         if not hdb:
             hdb = db.addEmptyTable(
                 'history',
                 ['Dir', 'Module', 'Version', 'Update', 'Release'],
                 ['Dir', 'Module', 'Version'])
         self.makeHistory()
         # Generate slimsuite.readme.txt based on pydoc.saveDocs()
         if 'readme' in self.list['Output']:
             self.saveReadMe('%s.readme.txt' % basefile)
         return True
     except:
         self.errorLog('%s.release error' % self.prog())
         return False
Example #12
0
 def mapTaxa(self,taxin,taxout=['spcode'],nodeonly=False,rankonly=False,savetaxout=True):    ### Takes a list of Taxa and returns mapped Taxa data
     '''
     Takes a list of Taxa and returns mapped Taxa data.
     >> taxin:str or list of taxon identifiers to map from.
     >> taxout:str or list of taxa output formats (taxid/spcode/name/common, matched case-insensitively).
        A list is processed in sorted order and is never modified.
     >> nodeonly:bool = whether to limit TaxID mapping to the precise matching nodes (else include children)
     >> rankonly:bool = whether to limit TaxID to those matching self.list['RankTypes'] taxon types.
     >> savetaxout:bool [True] = Whether to save the TaxOut list to a text file
     << taxoutlist:list of mapped taxa if taxout is a string ([] if the format is not recognised), OR
     << taxoutdict:dict of {lower case format: list of mapped taxa} if taxout is a list
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # taxout may be the shared default list or a list owned by the caller, so work on a sorted copy
         tlist = isinstance(taxout,(list,tuple))
         if tlist:
             if not taxout: return {}
             outformats = sorted(taxout)
         else:
             if not taxout: return []
             outformats = [taxout]       # Single format given as a string
         ### ~ [2] ~ Map to TaxID ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         taxid = self.mapToTaxID(taxin,nodeonly,rankonly)
         if self.list['RestrictID']:
             tx = len(taxid)
             taxid = rje.listIntersect(taxid,self.list['RestrictID'])
             self.printLog('#TAXID','%s of %s TaxID in %s Restricted IDs.' % (rje.iLen(taxid),rje.iStr(tx),rje.iLen(self.list['RestrictID'])))
         ### ~ [3] ~ Map TaxID and output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         taxdict = {}; taxoutdict = {}
         for outfmt in outformats:
             outfmt = outfmt.lower()
             if outfmt == 'taxid':
                 taxoutlist = taxid
             elif outfmt in ['spcode','name','common']:
                 if not taxdict: taxdict = self.taxDict(taxid)      # Only generated once, when first needed
                 taxoutlist = []
                 for t in taxid:
                     try: taxoutlist.append(taxdict[t][outfmt])
                     except: self.warnLog('No "%s" data for TaxID %s' % (outfmt, t),'Missing_%s' % outfmt,suppress=True)
                 taxoutlist.sort()
             else: self.errorLog('TaxOut format "%s" not recognised' % outfmt,printerror=False); continue
             taxoutdict[outfmt] = taxoutlist
             if savetaxout:
                 if not taxoutlist: self.printLog('#OUT','No %s IDs to output' % outfmt); continue
                 tfile = '%s.%s.txt' % (self.baseFile(),outfmt)
                 rje.backup(self,tfile)
                 with open(tfile,'w') as TFILE: TFILE.write('\n'.join(taxoutlist))
                 self.printLog('#OUT','%s %s IDs output to %s.' % (rje.iLen(taxoutlist), outfmt, tfile))
         if tlist: return taxoutdict
         # Single string format: return its list, or an empty list if the format was not recognised
         return taxoutdict.get(outformats[0].lower(),[])
     except: self.errorLog('Problem during %s mapTaxa.' % self); raise
Example #13
0
    def outputSchema(self,format='txt',filename='schema.txt'):  ### Formats and outputs schema
        '''
        Formats and outputs schema: backs up any existing output file, then passes self.dict['Schema'] to
        self.schedict() starting at level 1. On any error the problem is logged and the schema (None if it could
        not even be retrieved) is printed to aid debugging.
        >> format:str [txt] = Type of output format (not used by the code in this method)
        >> filename:str [schema.txt] = Name for output file
        '''
        schema = None   # Bound before the try block so that the error handler can always print it
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            schema = self.dict['Schema']
            rje.backup(self,filename)
            level = 1
            ### ~ [2] Process Schema ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.schedict(schema,filename,level)

        except:
            self.log.errorLog('Problem outputting schema')
            print(schema)   # Single-argument call form gives the same output under Python 2 and Python 3
Example #14
0
 def depthChargeForker(self):  ### Main DepthCharge forking method
     '''
     Work through each sequence and fork it out for DepthCharge analysis.
     - With force on, any existing BaseFile.depthcharge.tdt is backed up and all sequences are processed.
     - Otherwise an existing results file is loaded and sequences that already have a type "all" entry are
       removed from the list to fork; if none remain, the loaded table is returned without forking.
     << ddb:results table loaded from BaseFile.depthcharge.tdt, or None if an error was raised (and logged).
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqin = self.seqinObj()
         self.list['ToFork'] = seqin.list['Seq'][0:]
         resfile = '{0}.depthcharge.tdt'.format(self.baseFile())
         # rje.backup() takes the calling object first, as in every other rje.backup() call
         if self.force(): rje.backup(self, resfile, appendable=False)
         elif rje.exists(resfile):
             ddb = self.db().addTable(resfile,
                                      ['seqname', 'start', 'end', 'type'])
             ddb.dataFormat({'start': 'int', 'end': 'int'})
             complete = ddb.indexDataList('type', 'all', 'seqname')
             if complete:
                 cx = 0
                 for seq in self.list['ToFork'][0:]:     # Iterate over a copy: items are removed in the loop
                     if seqin.shortName(seq) in complete:
                         self.list['ToFork'].remove(seq)
                         cx += 1
                 if cx:
                     self.printLog(
                         '#SKIP',
                         'Skipping {0} previously processed sequences (force=F)'
                         .format(rje.iStr(cx)))
             if not self.list['ToFork']:
                 self.printLog(
                     '#CHARGE',
                     'All sequences previously processed (force=F)')
                 return ddb
         # Start the initial batch of forks, up to the Forks limit
         while len(self.list['Forked']) < self.getNum(
                 'Forks') and self.list['ToFork']:
             self.nextFork()
         ### ~ [2] ~ Work through each sequence and fork out ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.forking()
         self.printLog('#FORK',
                       'Forking of %s jobs completed.' %
                       (rje.iStr(seqin.seqNum())),
                       log=self.getBool('LogFork'))
         # Reload the results file, replacing any table loaded before forking
         ddb = self.db().addTable(resfile,
                                  ['seqname', 'start', 'end', 'type'],
                                  replace=True)
         ddb.dataFormat({'start': 'int', 'end': 'int'})
         return ddb
     except:
         self.errorLog('%s.depthChargeForker error' % self.prog())
         return None
Example #15
0
    def outputSchema(self,
                     format='txt',
                     filename='schema.txt'):  ### Formats and outputs schema
        '''
        Formats and outputs schema: backs up any existing output file, then passes self.dict['Schema'] to
        self.schedict() starting at level 1. On any error the problem is logged and the schema (None if it could
        not even be retrieved) is printed to aid debugging.
        >> format:str [txt] = Type of output format (not used by the code in this method)
        >> filename:str [schema.txt] = Name for output file
        '''
        schema = None  # Bound before the try block so that the error handler can always print it
        try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            schema = self.dict['Schema']
            rje.backup(self, filename)
            level = 1
            ### ~ [2] Process Schema ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.schedict(schema, filename, level)

        except:
            self.log.errorLog('Problem outputting schema')
            print(schema)  # Single-argument call form gives the same output under Python 2 and Python 3
Example #16
0
 def peptCluster(self):  ### Performs actual peptide clustering and stores results in self.obj['Tree']
     '''
     Performs actual peptide clustering and stores results in self.obj['Tree'].
     - PeptCluster 'neighbor': saves the distance matrix (phylip format) and a peptide fasta file where needed,
       then lets rje_tree.Tree build the tree from those files.
     - PeptCluster 'upgma'/'wpgma': builds the tree text from self.obj['PeptDis'] and loads it into a Tree object.
     - Any other PeptCluster value is reported and UPGMA is used instead.
     Tree files are then saved and their names stored in self.dict['Output'], keyed by tree format.
     Errors are logged, not raised.
     '''
     try:### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         base = rje.baseFile(self.getStr('SaveDis'))
         pretree = ['treeformats=nwk,text','basefile=%s' % base]
         method = self.getStr('PeptCluster')
         ### ~ [1] ~ Phylip Neighbor method ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if method == 'neighbor':
             disfile = '%s.phy' % base
             fasfile = '%s.fas' % base
             treecmd = ['autoload=T','maketree=neighbor','disin=%s' % disfile,'seqin=%s' % fasfile]
             pretree += ['root=mid']
             if disfile != self.getStr('SaveDis'):
                 rje.backup(self,disfile)
                 self.obj['PeptDis'].saveMatrix(filename=disfile,format='phylip')   ### Saves matrix
             if 'peptides=%s' % fasfile not in self.cmd_list:
                 rje.backup(self,fasfile)
                 with open(fasfile,'w') as FAS:      # Each peptide is used as both name and sequence
                     for pep in self.list['Peptides']: FAS.write('>%s\n%s\n' % (pep,pep))
             tree = self.obj['Tree'] = rje_tree.Tree(self.log,pretree+self.cmd_list+treecmd)
         ### ~ [2] ~ UPGMA method ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         else:
             if method not in ['wpgma','upgma']:
                 self.errorLog('PeptCluster method "%s" not recognised. Will use UPGMA' % method,printerror=False)
                 # An empty method name must not be substituted: replacing '' would insert text everywhere
                 if method: base = base.replace(method,'upgma')
                 pretree += ['basefile=%s' % base]
                 method = 'upgma'    # Honour the logged fallback so that a tree is always generated
             if method == 'upgma': nsftree = self.obj['PeptDis'].upgma()
             else: nsftree = self.obj['PeptDis'].wpgma()
             treecmd = ['autoload=F']
             tree = self.obj['Tree'] = rje_tree.Tree(self.log,pretree+self.cmd_list+treecmd)
             tree.buildTree(nsftree)
         ### ~ [3] ~ Outputs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for node in tree.node:
             # Nodes named after a peptide get a 1-based ID from that peptide's position in the peptide list
             if node.info['Name'] in self.list['Peptides']: node.stat['ID'] = self.list['Peptides'].index(node.info['Name']) + 1
         tree.saveTrees()
         for outfmt in tree.list['TreeFormats']:
             treefile = '%s.%s' % (tree.info['Basefile'],rje_tree.formatext[outfmt])
             self.dict['Output'][outfmt] = treefile
     except: self.errorLog('%s.peptCluster error' % self)
Example #17
0
 def saveTimePoints(self,filename='',format='tdt',entries=[]):   ### Saves TimePoints to a file
     '''
     Saves TimePoints to a file from main TimePoints table.
     >> filename:str [''] = Output filename. Will use basefile if none given.
     >> format:str ['tdt'] = Output file format. 'csv'/'tdt' give a delimited file, 'db' gives one bracketed
        list of field values per entry, and anything else gives one line of text per entry. If ''/'none', the
        extension of filename is used.
     >> entries:list [] = Entries from main table to output. (All if none given). Not modified.
     << None once the output has been written; False if an error was raised (and logged).
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         db = self.db('TimePoints')
         if format.lower() in ['','none']: format = filename.lower().split('.')[-1]
         if not filename: filename = '%s.%s' % (self.basefile(),format)
         if not entries: entries = db.entries()
         ### ~ [2] Save to file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [2a] Simple delimited file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if format in ['csv','tdt']:
             self.blanksToEmpty()
             rje.delimitedFileOutput(self,filename,db.fields(),rje_backup=True)      # Header row
             for entry in entries: rje.delimitedFileOutput(self,filename,db.fields(),datadict=entry)
         ## ~ [2b] Text file output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         else:
             self.emptyToBlank()
             rje.backup(self,filename)
             with open(filename,'a') as OUT:     # Closed (and flushed) once all entries have been written
                 for entry in entries:
                     if format == 'db':
                         outlist = [entry[field] for field in db.fields()]
                         out_txt = '%s' % outlist
                         OUT.write('(%s);\n' % out_txt[1:-1])    # Strip the list brackets
                     else:
                         # American Independence. (TimePoint) 1776 AD, 4 July. The US declared independence from the British Empire. Source: <http://en.wikipedia.org/wiki/United_States_Declaration_of_Independence>[Wikipedia]. (Keywords: history)
                         out_text = '%s. (TimePoint) ' % entry['TimePoint Name']
                         if entry['month'] in ['','blank']: out_text += '%s %s.' % (entry['Year'],entry['yearUnit'])
                         else: out_text += '%s %s, %s %s.' % (entry['Year'],entry['yearUnit'],entry['month'],entry['day'])
                         out_text = '%s %s Source: <%s>[%s].' % (out_text,entry['TimePoint Description'],entry['Source URL'],entry['Source URL'])
                         klist = []
                         for i in range(1,6):
                             if entry['keyword%d' % i] not in ['','blank']: klist.append(entry['keyword%d' % i])
                         out_text = '%s (Keywords: %s)' % (out_text,', '.join(klist))
                         OUT.write('%s\n' % out_text)
         self.printLog('#OUT','%d entries output to %s' % (len(entries),filename))
     except: self.errorLog('%s.saveTimePoints(%s) error' % (self,filename)); return False
Example #18
0
 def readResults(self,clear=True,readaln=False):  ### Reads results from self.list['HMMRes'] into objects
     '''
     Parses each results file listed in self.list['HMMRes'] (sorted, duplicates removed) into search objects.
     >> clear:boolean = whether to empty self.search before reading [True]
     >> readaln:boolean = handed on to the parser: whether to bother reading Alignments into objects [False]
     << returns False if an error was trapped and logged; otherwise falls off the end (None)
     '''
     try:
         if clear:
             self.search = []
         for hmmres in rje.sortUnique(self.list['HMMRes'],xreplace=False):
             ## ~ Unpack first if only a gzipped copy of the results is on disk ~ ##
             if not os.path.exists(hmmres) and self.opt['GZip'] and os.path.exists('%s.gz' % hmmres):
                 os.system('gunzip %s.gz' % hmmres)
                 self.printLog('#GUNZIP','Gunzipped %s.gz' % hmmres)
             ## ~ self.opt['HMMPFam'] selects readHMMPFamSearch() over readHMMSearch() ~ ##
             if self.opt['HMMPFam']:
                 parser = self.readHMMPFamSearch
             else:
                 parser = self.readHMMSearch
             parser(hmmres,readaln)
             ## ~ (Re)compress the parsed file, replacing any previous .gz ~ ##
             if self.opt['GZip'] and os.path.exists(hmmres):
                 rje.backup(self,'%s.gz' % hmmres,unlink=True)
                 os.system('gzip %s' % hmmres)
                 self.printLog('#GZIP','%s gzipped to save space' % hmmres)
     except:
         self.log.errorLog('Hmm indeed. rje_hmm.readResults() gone awry!',quitchoice=True)
         return False
Example #19
0
 def hmmSearch(self,hmm,dbase=None,outfile=None,wait=True):    ### Performs HMMer Search using object attributes
     '''
     Performs HMMer Search using object attributes.
     >> hmm:str = Name of HMM file
     >> dbase:str = Name of DBase file [self.info['SearchDB']]
     >> outfile:str = Name of Output file. If not given (or 'none'), "<hmm base>.<dbase base>.hmmer" is used and
         an existing results file (or its .gz when GZip=T) is reused if rje.isYounger() picks it over both the HMM
         and database files, unless Force=T.
     >> wait:boolean  = whether to wait for HMMer. [True]
         If False, HMMer is launched in the background and the (unfinished) output is NOT gzipped here.
     << returns outfile or None if fails
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [1a] ~ Input files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if not rje.checkForFile(hmm): self.printLog('#ERR','HMM file %s is missing!' % hmm); return None
         if not dbase: dbase = self.info['SearchDB']
         if not rje.checkForFile(dbase): self.printLog('#ERR','Database file "%s" is missing!' % dbase); return None
         ## ~ [1b] ~ Output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if not outfile or outfile.lower() in ['','none']:       # Make an outfile per search
             outfile = '%s.%s.hmmer' % (rje.baseFile(hmm,True),rje.baseFile(dbase,True))
             resfile = outfile
             # Only a gzipped copy exists: that is the file whose age should be checked
             if not os.path.exists(outfile) and self.opt['GZip'] and os.path.exists('%s.gz' % outfile) and not self.opt['Force']:
                 resfile = '%s.gz' % outfile
             # NOTE(review): rje.isYounger() presumably returns the more recently modified of the two files - confirm
             if not self.opt['Force'] and rje.isYounger(resfile,hmm) == resfile and rje.isYounger(resfile,dbase) == resfile:
                 self.printLog('#HMM','HMM results file "%s" exists.' % resfile)
                 return outfile      # Already exists
             else: rje.backup(self,outfile,unlink=True)
         ### ~ [2] ~ HMM Search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         hmmopt = ' '.join(self.list['HMMOptions'])
         if self.opt['HMMPFam']:
             _command = 'hmmpfam --cut_ga %s %s %s > %s' % (hmmopt,hmm,dbase,outfile)
         else: _command = 'hmmsearch %s %s %s > %s' % (hmmopt,hmm,dbase,outfile)
         self.log.printLog('#HMM',_command)
         if not wait: os.system(self.info['HMMerPath'] + _command + ' &')
         elif not os.path.exists(outfile) or self.opt['Force']:
             # The command itself redirects HMMer's stdout into outfile, so read() mainly blocks until HMMer has
             # finished. Whatever is returned is still appended to outfile, as before.
             HMMER = os.popen(self.info['HMMerPath'] + _command)
             try: hmmout = HMMER.read()
             finally: HMMER.close()
             with open(outfile,'a') as OUT: OUT.write(hmmout)
         self.printLog('#HMM','Outfile produced for %s: %s.' % (hmm,outfile))
         if self.opt['GZip']:
             if wait:
                 rje.backup(self,'%s.gz' % outfile,unlink=True)
                 os.system('gzip %s' % outfile)
                 self.printLog('#GZIP','%s gzipped to save space' % outfile)
             else:
                 # A background search is still writing outfile: compressing it now would race with HMMer
                 self.printLog('#GZIP','%s not gzipped: HMMer running in background (wait=False)' % outfile)
         return outfile
     except:
         self.log.errorLog('Fatal Error during hmmSearch(%s)' % hmm)
         return None
Example #20
0
 def run(self,gtext=''):  ### Main run method
     '''
     Main run method. Builds the full glossary HTML page and, unless gtext is given, saves it to file.
     >> gtext:str [''] = handed to self.setup(). If non-empty, the page is only returned and no file is written.
     << returns the complete HTML text (head + glossary + tail).
     Any error is logged and then re-raised.
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.setup(gtext)
         ### ~ [2] ~ Generate HTML ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         html = self.glossaryHTML()
         hobj = self.obj['HTML']
         # NB. This appends to the stored copyright text each time run() is called on the same HTML object
         hobj.info['Copyright'] += '. Generated by rje_glossary.py'
         title = '%s' % self.getStr('Name')
         tabber = self.getStr('HTMLStyle').lower() == 'tab'
         frontpage = True
         html = '%s\n\n%s\n\n%s' % (hobj.htmlHead(title,tabber,frontpage),html,hobj.htmlTail(tabber))
         ### ~ [3] ~ Save to file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not gtext:   # Replace with CGI option
             outfile = self.getStr('OutFile')
             rje.backup(self,outfile,appendable=False)
             with open(outfile,'w') as OUT: OUT.write(html)     # Closed (and flushed) before being reported
             self.printLog('#HTML','%s HTML output to %s' % (title,outfile))
         return html
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise   # Delete this if method error not terrible
Example #21
0
 def run(self):  ### Main run method
     '''
     Main run method.
     1. self.difference() is called once for each pair of database tables (each table vs every later table).
     2. self.average() is called for every table in the database at that point. The table list is read again
        after step 1, so any tables added by self.difference() are included.
     3. If TopHTML is set, an HTML page (head + TopHTML text + tail) is saved as <basefile>.html.
     << returns None. Returns early if self.setup() fails. Any error is logged and then re-raised.
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not self.setup(): return
         ### ~ [2] ~ Compare tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         tables = self.db().tables()[0:]     # Copy: the pairing must not be affected by changes to the database
         ## ~ [2a] ~ Calculate Differences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         # Pair by position rather than tables.index(): no repeated list searches, and tables that happen to
         # compare equal are still paired correctly.
         for (tx,table1) in enumerate(tables):
             for table2 in tables[tx+1:]:
                 self.difference(table1,table2)
         ## ~ [2b] ~ Calculate Averages & Generate HTML ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         for table in self.db().tables()[0:]: self.average(table)
         ## ~ [2c] ~ Output HTML ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if self.getBool('TopHTML'):
             html = rje_html.HTML(self.log,self.cmd_list)
             hfile = '%s.html' % self.basefile()
             rje.backup(self,hfile)
             page = html.htmlHead(title=self.basefile(),tabber=False)+self.getStr('TopHTML')+html.htmlTail(False)
             with open(hfile,'w') as HFILE: HFILE.write(page)
         return
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise   # Delete this if method error not terrible
Example #22
0
 def haqBatch(
         self,
         force=False):  ### Generates Batch and INI files for HAQESAC runs
     '''
     Generates Batch and INI files for HAQESAC runs.
     The INI file holds this object's commands (minus any "ini=" command); the batch file gets one
     "python <Path>haqesac.py seqin=X.fas query=X basefile=X" line per sequence, where X is the accession number.
     >> force:bool [False] = whether to regenerate the files even if both exist (self.force() does the same).
     << returns the value of self.printLog() if existing files are kept; otherwise None. Errors are logged.
     '''
     try:  ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         batfile = rje.makePath('%shaqesac.bat' % self.info['HaqDir'],
                                wholepath=True)
         inifile = rje.makePath('%shaqesac.ini' % self.info['HaqDir'],
                                wholepath=True)
         if force or self.force(
         ) or not rje.exists(batfile) or not rje.exists(inifile):
             rje.backup(self, batfile)
             rje.backup(self, inifile)
         else:
             return self.printLog('#HAQBAT', 'HAQESAC Batch files found.')
         ### ~ [1] Make INI File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         haqcmd = [cmd for cmd in self.cmd_list if cmd[:4].lower() != 'ini=']
         if self.opt['MultiHAQ']: haqcmd += ['multihaq=T', 'force=F']
         with open(inifile, 'w') as INI:
             INI.write('\n'.join(haqcmd))
         ### ~ [2] Make Batch file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         batlines = []
         for seq in self.seqs():
             acc = seq.info['AccNum']
             haqcmd = [
                 'seqin=%s.fas' % acc,
                 'query=%s' % acc,
                 'basefile=%s' % acc
             ]
             batlines.append('python %shaqesac.py %s\n' %
                             (self.info['Path'], ' '.join(haqcmd)))
         # One open/close for the whole batch rather than a new, never-closed handle per sequence. Still append
         # mode, and (as before) no file is created when there are no sequences.
         if batlines:
             with open(batfile, 'a') as BAT:
                 BAT.writelines(batlines)
         self.printLog('#HAQBAT',
                       'HAQESAC Batch file output to %s' % batfile)
     except:
         self.errorLog('Major problem with MultiHAQ.haqBatch',
                       quitchoice=True)
Example #23
0
 def save(self):     ### Saves parsed REST output to files
     '''
     Saves parsed REST output to files.
     - Rest setting matches a parsed output key: only that output is saved to its file in self.dict['Outfile'].
     - Rest setting is 'full' or 'text': self.restFullOutput() is saved to <RestOutDir><RestBase>.rest.
     - Any other non-empty Rest setting: reported as not recognised; the full output is saved only if the user
       agrees (never when self.i() < 0).
     - No Rest setting: every parsed output with an entry in self.dict['Outfile'] is saved.
     << returns True after saving the full output, False if full output was declined; otherwise None.
     '''
     stem = '%s%s' % (self.getStr('RestOutDir'),
                      rje.baseFile(self.getStr('RestBase'),strip_path=True,keepext=True))
     rje.mkDir(self,self.getStr('RestOutDir'))
     wanted = rje.sortKeys(self.dict['Output'])
     if self.getStrLC('Rest') in wanted:
         wanted = [self.getStrLC('Rest')]
     else:
         ## ~ Full REST output requested ~ ##
         if self.getStrLC('Rest') in ['full','text']:
             fullfile = '%s.rest' % stem
             open(fullfile,'w').write(self.restFullOutput())
             self.printLog('#OUT','%s: %s' % (self.getStrLC('Rest'),fullfile))
             return True
         ## ~ Something else requested: offer full output instead ~ ##
         if self.getStrLC('Rest'):
             self.printLog('#OUTFMT','REST output format "%s" not recognised.' % self.getStrLC('Rest'))
             if self.i() < 0 or not rje.yesNo('Output all parsed outputs?'):
                 return False
             fullfile = '%s.rest' % stem
             open(fullfile,'w').write(self.restFullOutput())
             self.printLog('#OUT','full: %s' % (fullfile))
             return True
     ## ~ Individual outputs ~ ##
     for key in wanted:
         if key not in self.dict['Outfile']:
             # 'intro' is the one output allowed to have no file without a warning
             if key != 'intro': self.warnLog('No outfile parsed/generated for %s output' % key)
             continue
         target = self.dict['Outfile'][key]
         rje.backup(self,target)
         open(target,'w').write(self.dict['Output'][key])
         self.printLog('#OUT','%s: %s' % (key,target))
Example #24
0
 def uniFake(self,seqs=[],store=False):  ### Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs.
     '''
     Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs given.
     For each sequence a temporary fasta file is made, the analyses named in self.list['UniFake'] (disorder, pfam,
     tmhmm, signalp - case-insensitive) are run to generate features, and a UniProt entry is built from these.
     >> seqs:list [] = Sequence objects to process. If given, these replace self.obj['SeqList'].seq. (The default
         list is never modified by this method.)
     >> store:bool [False] = If True, entries accumulate in the UniProt object (also stored in
         self.obj['SeqList'].obj['UniProt']) and are saved together at the end. If False, each entry is appended
         to the DAT file as soon as it is made.
     Nothing is returned: all errors are logged rather than raised.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         unifake = ' '.join(self.list['UniFake']).lower().split()    # Lower case list of analyses to perform
         do_disorder = 'disorder' in unifake     # Same case-insensitive test as for the other analyses
         seqlist = self.obj['SeqList']
         if seqs: seqlist.seq = seqs
         else: seqs = seqlist.seq
         (sx,seqnum) = (0,seqlist.seqNum())
         ## ~ [1b] Setup UniProt object and output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         uniprot = rje_uniprot.UniProt(self.log,self.cmd_list)   # UniProt object for saving data
         if self.info['DatOut'].lower() in ['','none']: self.info['DatOut'] = rje.baseFile(seqlist.info['Name']) + '.dat'
         datfile = self.info['DatOut']
         if os.path.exists(datfile): rje.backup(self,datfile)
         if store: seqlist.obj['UniProt'] = uniprot
         ## ~ [1c] Setup RJE_HMM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if 'pfam' in unifake:
             hmm = rje_hmm.HMMRun(self.log,self.cmd_list+['force=T'])
             hmmfile = '%s.pfam.tdt' % rje.baseFile(datfile)
             if os.path.exists(hmmfile): rje.backup(self,hmmfile)
             hmm.list['HMM'] = [self.info['PFam']]
             hmm.opt['HMMPFam'] = True
         else: hmm = None
         ## ~ [1d] Setup RJE_TM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if 'signalp' in unifake: tm = rje_tm.TM(self.log,self.cmd_list)
         else: tm = None
         ### ~ [2] ~ Perform UniFake processing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for seq in seqs:
             sx += 1
             name = seq.shortName()
             utmp = None     # Temp file prefix. Reset per sequence so that clean-up never uses an unset/stale value
             self.printLog('#SEQ','Processing %s (%s aa) %s...' % (seq.shortName(),rje.integerString(seq.aaLen()),seq.info['Description'][:50]))
             try:
                 ## ~ [2a] ~ Basic data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 utmp = 'tmp%s.%s' % (rje.randomString(5),seq.info['AccNum'])
                 # Explicitly closed, so the file is complete on disk before any external program reads it
                 with open('%s.fas' % utmp,'w') as TMPFAS: TMPFAS.write('>%s\n%s\n' % (seq.shortName(),seq.info['Sequence']))
                 udata = {'CC':['-!- Features generated using unifake.py'],'AC':[]}
                 if seq.info['SpecCode'] in ['Unknown','UNK']: seq.info['SpecCode'] = self.info['SPCode']
                 ## ~ [2b] ~ Aliases ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 details = None
                 if self.opt['EnsDat']: details = rje.matchExp(r'\[acc:(\S+) pep:(\S+) gene:(\S+)\]',seq.info['Name'])
                 if details:
                     self.addAlias(seq.info['AccNum'],details[0])
                     self.addAlias(seq.info['AccNum'],details[1])
                     self.addAlias(seq.info['AccNum'],details[2])
                     udata['GN'] = [details[2]]
                 for seqid in [seq.shortName(),seq.info['AccNum']]:
                     if seqid in self.dict['Aliases']: udata['AC'].append('%s;' % '; '.join(self.dict['Aliases'][seqid]))
                 ## ~ [2c] ~ Features ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 ft = []     # List of features for sequence
                 for seqid in [seq.shortName(),seq.info['AccNum'],seq.info['ID']]:
                     if seqid in self.dict['Features']: ft += self.dict['Features'][seqid]
                 ## ~ [2d] IUPRED disorder prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if do_disorder:
                     try:
                         seq.disorder()
                         dis = seq.obj['Disorder']
                         for disreg in dis.list['RegionDisorder']:
                             ft.append({'Type':'DISORDER','Desc':'Predicted disorder: %s' % dis.info['Disorder'],'Start':disreg[0],'End':disreg[1]})
                             if dis.info['Disorder'].lower() == 'iupred': ft[-1]['Desc'] = '%s > %.2f' % (ft[-1]['Desc'],dis.stat['IUCut'])
                         for fold in dis.list['RegionFold']:
                             ft.append({'Type':'ORDER','Desc':'Predicted order: %s' % dis.info['Disorder'],'Start':fold[0],'End':fold[1]})
                             if dis.info['Disorder'].lower() == 'iupred': ft[-1]['Desc'] = '%s <= %.2f' % (ft[-1]['Desc'],dis.stat['IUCut'])
                     except: self.log.errorLog('UniFake disorder problem for %s.' % name)
                 ## ~ [2e] PFam HMM domain prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if hmm:
                     try:
                         hmm.setInfo({'SearchDB':'%s.fas' % utmp,'HMMOut':'%s.hmm.out' % utmp})      # This will be made for each sequence
                         hmm.search = []
                         hmm.list['HMMRes'] = [hmm.hmmSearch(self.info['PFam'],outfile=hmm.info['HMMOut'])]   # Used in hmmTable
                         hmm.hmmTable(outfile=hmmfile,append=True)
                         if do_disorder: disorder = seq.obj['Disorder'].list['ResidueDisorder']          # individual (IUPRed) residue results
                         else: disorder = []
                         if hmm.search: udata['CC'].append('PFam: HMMer PFam search vs %s (Modified %s)' % (self.info['PFam'],time.ctime(os.path.getmtime(self.info['PFam']))))
                         else:
                             udata['CC'].append('-!- ERROR: PFam HMMer Search failure!')
                             out = {'Type':'!ERROR!','Name':name}
                             rje.delimitedFileOutput(self,hmmfile,['Type','Name','Start','End','Eval','Score'],datadict=out)
                         for search in hmm.search:
                             for hit in search.hit:
                                 for aln in hit.aln:
                                     pfamft = {'Start':aln.stat['SbjStart'],'End':aln.stat['SbjEnd'],'Type':'PFAM',
                                                'Desc':'%s PFam HMM Eval: %.2e; Score: %.1f' % (search.info['Name'],aln.stat['Expect'],aln.stat['BitScore'])}
                                     if disorder:
                                         # Mean per-residue disorder score across the aligned region
                                         region = disorder[aln.stat['SbjStart']-1:aln.stat['SbjEnd']]
                                         hmmdisorder = float(sum(region)) / len(region)
                                         pfamft['Desc'] = '%s; IUPRed: %.2f' % (pfamft['Desc'],hmmdisorder)
                                         if hmmdisorder < self.stat['DisDom']: pfamft['Type'] = 'DOMAIN'
                                     ft.append(pfamft)
                     except: self.log.errorLog('UniFake PFam HMM problem for %s.' % name)
                 ## ~ [2f] TMHMM transmembrane topology prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if 'tmhmm' in unifake:
                     try:
                         tmdat = os.popen('%s %s.fas -short' % (self.info['TMHMM'],utmp)).readlines()
                         domlist = rje_tm.domainList(rje_tm.parseTMHMM(tmdat[0]))
                         for tmdom in domlist:
                             ft.append(tmdom)
                             ft[-1]['Desc'] = 'TMHMM topology prediction'
                             ft[-1]['Start'] = int(ft[-1]['Start'])
                             ft[-1]['End'] = int(ft[-1]['End'])
                         if len(domlist) > 1: udata['CC'].append('TMHMM: %d TM domains; N-Term %s' % ((len(domlist)-1)//2,domlist[0]['Type']))
                         else: udata['CC'].append('TMHMM: 0 TM domains')
                     except: self.log.errorLog('UniFake TMHMM problem for %s.' % name)
                 ## ~ [2g] SIGNALP signal peptide prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if 'signalp' in unifake:
                     try:
                         os.system('%s -f short -t euk %s.fas > %s.signalp' % (self.info['SignalP'],utmp,utmp))
                         tm.signalp = {}
                         tm.parseSignalP('%s.signalp' % utmp)
                         sigp = tm.signalp.pop(seq.shortName())
                         cpos = 0    # Signal peptide end position: the smaller of the NN and HMM positions if both are 'Y'
                         if sigp['nn_ymax?'] == 'Y':
                             cpos = int(sigp['nn_ymaxpos'])
                             desc = 'SignalP NN prediction'
                         if sigp['hmm_cmax?'] == 'Y':
                             hmm_c = int(sigp['hmm_cmaxpos'])
                             if cpos == 0:
                                 cpos = hmm_c
                                 desc = 'SignalP HMM prediction'
                             else:
                                 if hmm_c < cpos:
                                     cpos = hmm_c
                                     desc = 'SignalP HMM prediction (NN also Y)'
                                 else: desc += ' (HMM also Y)'
                         if cpos > 0: ft.append({'Type':'SIGNALP','Desc':desc,'Start':1,'End':cpos})
                     except: self.log.errorLog('UniFake SignalP problem for %s.' % name)
                 ## ~ [2h] Convert to UniProt and save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 self.addRealUniProt(seq,udata,ft)
                 self.deBug(ft)
                 if not store: uniprot.list['Entry'] = []
                 if uniprot.addFromSeq(seq,data=udata,ft=ft):    ### Converts into UniProtEntry object
                     if not store: uniprot.saveUniProt(datfile,append=True)
             except: self.log.errorLog('Problem during UniFake(%s)' % name)
             ## ~ [2i] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             if utmp:
                 for tmp in glob.glob('%s*' % utmp): os.unlink(tmp)
             self.printLog('#UNIFAKE','|---------- %s run <<<|>>> %s to go -----------|' % (rje.integerString(sx),rje.integerString(seqnum-sx)),log=False)
         if store: uniprot.saveUniProt(datfile,append=False)
         if self.opt['CleanUp']:
             for tmp in glob.glob('TMHMM*'):
                 if os.path.isdir(tmp): os.rmdir(tmp)
     except: self.errorLog('Oh, the shame of it! Trouble during UniFake.uniFake()')
Example #25
0
	def slimDisc(self):	### Runs SLiMDisc on batch of files
		'''
		Runs SLiMDisc on batch of files.
		- Input is self.info['SeqIn'] if given, else the files matching self.list['SlimFiles'].
		- If self.stat['RemHub'] > 0, the hub protein named by the file name (the part before "_PPI", if present)
		  is removed along with its homologues: the reduced dataset is saved as *-remhub.fas, the original is
		  renamed *-kepthub.fas and added to the end of the file list. If the hub is not in the dataset, the file
		  is renamed *-nohub.fas. Files already named *-remhub/-kepthub/-nohub.fas are not re-checked.
		- Datasets with sequence numbers outside MinSup-MaxSup are rejected.
		- Remaining datasets are processed in order of total aa content (largest first if BigFirst).
		- If self.info['BatchOut'] is set, the SLiMDisc commands are written to that file instead of being run.
		NOTE(review): the direct run uses a hard-coded /home/richard/Python_Modules/ script path.
		<< returns False if the "seqin" file is missing; otherwise None. Other errors are logged.
		'''
		_stage = 'Setup'	# Set before the try so that the error handler can always report a stage
		try:
			### Setup ###
			if self.stat['MinSup'] > self.stat['SlimSupport'] and self.stat['SlimSupport'] > 1:
				self.stat['MinSup'] = self.stat['SlimSupport']
			if self.stat['MaxSup'] > 0  and self.stat['MaxSup'] < self.stat['SlimSupport'] and self.stat['SlimSupport'] > 1:
				self.stat['MaxSup'] = self.stat['SlimSupport']
			### Make File List ##
			_stage = 'Make File List'
			if self.info['SeqIn'].lower() not in ['','none']:
				if os.path.exists(self.info['SeqIn']):
					gfiles = [self.info['SeqIn']]
				else:
					self.log.errorLog('"seqin" file "%s" not found! No SLiMDisc analysis.' % self.info['SeqIn'],printerror=False)
					return False
			else:
				gfiles = rje.getFileList(callobj=self,filelist=self.list['SlimFiles'],subfolders=False,summary=False)
			self.log.printLog('#FILES','%s files identified for SLiMDisc analysis.' % rje.integerString(len(gfiles)))
			## Sort by size and filter by MinSup and MaxSup ###
			datasize = {}   # Dictionary for crude sorting of files by total AA content
			seqnum = {}		# Number of sequences in each file
			qry = {}		# Query sequence name (if any) for file
			tmpseq = rje_seq.SeqList(self.log,self.cmd_list+['seqin=None','autofilter=F'])
			gx = 0
			while gx < len(gfiles):		# NB. gfiles can grow during the loop (kepthub files are appended)
				seqfilename = gfiles[gx]
				gx += 1
				seqfile = seqfilename[0:]
				tmpseq.seq = []
				tmpseq.loadSeqs(seqfile)
				## *** Special RemHub process *** ##
				checkhub = True
				for hubtype in ['rem','kept','no']:
					if seqfile.find('-%shub.fas' % hubtype) > 0:
						checkhub = False
				if self.stat['RemHub'] > 0.0 and checkhub:
					# Hub accession comes from the file name alone: directory names must not affect the match
					hubfile = rje.baseFile(seqfile,strip_path=True)
					ppimatch = rje.matchExp(r'(\S+)_PPI',hubfile)
					if ppimatch:
						hub_acc = ppimatch[0]
					else:
						hub_acc = hubfile
					basefile = seqfile
					while rje.baseFile(basefile) != basefile:	# Strip all file extensions
						basefile = rje.baseFile(basefile)
					if tmpseq.querySeq(query=hub_acc):     ### Sets Hub as Query Sequence
						self.log.printLog('#HUB','Removing hub protein %s and >=%.1f%% ID from PPI dataset %s.' % (hub_acc,self.stat['RemHub'],seqfile))
						tmpseq.makeNR(text='Hub protein homologues',nrid=self.stat['RemHub'],blast=tmpseq.seqNum(),nrsim=0,nr_qry=tmpseq.obj['QuerySeq'])
						tmpseq.removeSeq(text='PPI Hub Protein (self-interactor)',seq=tmpseq.obj['QuerySeq'])
						tmpseq.obj['QuerySeq'] = None
						seqfile = '%s-remhub.fas' % basefile
						tmpseq.saveFasta(seqfile=seqfile)	### Saves sequences in fasta format
						keptfile = '%s-kepthub.fas' % basefile
						os.rename(seqfilename,keptfile)
						gfiles.append(keptfile)
					else:
						seqfile = '%s-nohub.fas' % basefile
						os.rename(seqfilename,seqfile)
						self.log.printLog('#HUB','Hub protein %s not in PPI dataset %s => %s.' % (hub_acc,seqfilename,seqfile))
				## Support Range ###
				if tmpseq.seqNum() < self.stat['MinSup'] or (self.stat['MaxSup'] > 0 and tmpseq.seqNum() > self.stat['MaxSup']):
					self.log.printLog('#REJ','%s rejected: %s sequences = outside acceptable range of %d-%d.' % (seqfile,rje.integerString(tmpseq.seqNum()),self.stat['MinSup'],self.stat['MaxSup']))
					continue
				aasize = tmpseq.aaCount()
				self.log.printLog('#AA','%s = %s aa.' % (seqfile,rje.integerString(aasize)))
				while aasize in datasize:	# Sizes are dictionary keys: bump until unique so no dataset is overwritten
					aasize += 1
				datasize[aasize] = seqfile
				seqnum[seqfile] = tmpseq.seqNum()
				## Query ##
				qry[seqfile] = None
				if self.opt['SlimQuery']:
					qrymatch = rje.matchExp(r'qry_(\S+)\.',seqfilename)
					if qrymatch and tmpseq.querySeq(query=qrymatch[0]):     ### Sets Query Sequence if appropriate
						qry[seqfile] = tmpseq.obj['QuerySeq'].shortName()
			self.log.printLog('#INF','%s Datasets to process.' % rje.integerString(len(seqnum)))

			### Batch Output Mode ###
			batchout = None
			if self.info['BatchOut'].lower() not in ['','none']:
				batchout = self.info['BatchOut']
				if not self.opt['Append'] and os.path.exists(batchout):
					rje.backup(self,batchout)

			### Work through Files ###
			_stage = 'Work through files'
			for key in rje.sortKeys(datasize,revsort=self.opt['BigFirst']):
				seqfile = datasize[key]
				basefile = seqfile
				while rje.baseFile(basefile) != basefile:
					basefile = rje.baseFile(basefile)
				base = rje.baseFile(basefile,True)
				self.log.printLog('#DAT',seqfile,timeout=False)
				if not self.opt['UseRes']:
					slim_cmd = '-BT -TT'
				else:
					## Detect old files ##
					_stage = 'Detect old files'
					old_rank = '%s/%s.rank' % (basefile,base)
					self.log.printLog('#RES','Existing SLiMDisc Output?: %s' % (os.path.exists(old_rank)))
					old_b_list = glob.glob('%s/results/*.blastp' % basefile)
					old_t_file = '%s/%s.fasta.out' % (basefile,base)
					self.log.printLog('#RES','Existng TEIRESIAS Output?: %s' % (os.path.exists(old_t_file)))
					self.log.printLog('#RES','%s of %s BLAST files detected.' % (rje.integerString(len(old_b_list)),rje.integerString(seqnum[seqfile])))
					## TEIRESIAS ##
					if (os.path.exists(old_rank) or len(old_b_list) > 0) and os.path.exists(old_t_file):  # BLAST started: TEIRESIAS finished!
						slim_cmd = '-TF'
					else:
						slim_cmd = '-TT'
					## BLAST ##
					if len(old_b_list) != seqnum[seqfile]:	# Need BLAST
						slim_cmd += ' -BT'
					else:
						slim_cmd += ' -BF'
				## Query ##
				if self.opt['SlimQuery'] and qry[seqfile]:
					slim_cmd += ' -q %s' % qry[seqfile]
				## Ranks ##
				slim_cmd += ' -n %d' % self.stat['SlimRanks']
				## Support ##
				if self.stat['SlimSupport'] > 0 and self.stat['SlimSupport'] < 1:
					slim_cmd += ' -S %.1f' % self.stat['SlimSupport']
				elif self.stat['SlimSupport'] > 0:
					slim_cmd += ' -S %d' % self.stat['SlimSupport']
				## WallTime ##
				slim_cmd += ' -W %d' % self.stat['SlimWall']
				## MemSaver ##
				if self.opt['MemSaver']:
					slim_cmd += ' -X T'
				else:
					slim_cmd += ' -X F'
				## SlimOpt ##
				if self.info['SlimOpt']:
					slim_cmd += ' %s' % self.info['SlimOpt']
				## Perform SLiMDisc Run ##
				_stage = 'Peform SLiMDisc Run (%s)' % (seqfile)
				if batchout:
					BATCH = open(batchout,'a')
					BATCH.write('%s -i %s -Q0 %s\n' % (self.info['SlimCall'],seqfile,slim_cmd))
					BATCH.close()
				else:
					if self.stat['Verbose'] > 0:
						syscmd = 'python /home/richard/Python_Modules/slimdisc_V%s.py -i %s -Q2 %s' % (self.info['SlimVersion'],seqfile,slim_cmd)
					else:
						syscmd = 'python /home/richard/Python_Modules/slimdisc_V%s.py -i %s -Q0 %s' % (self.info['SlimVersion'],seqfile,slim_cmd)
					self.log.printLog('#SYS',syscmd)
					os.system(syscmd)
				if not batchout:
					new_rank = '%s/%s.rank' % (basefile,base)
					self.log.printLog('#RES','New rank result %s produced?: %s' % (new_rank,os.path.exists(new_rank)))

		except:
			self.log.errorLog('rje_pattern_discovery banjaxed in slimDisc() %s' % _stage,quitchoice=True)
Example #26
0
    def gasp(self):  ### Performs GASP: Gapped Ancestral Sequence Prediction
        """Performs GASP: Gapped Ancestral Sequence Prediction."""
        try:
            ### <a> ### Preparation
            self.obj["Tree"].cmd_list.append("unkspec=T")
            self.obj["Tree"].obj["SeqList"].opt["UnkSpec"] = True
            ## <i> ## Screen Output
            self.verbose(0, 3, "\nMaking Ancestral Sequences", 0)
            if self.stat["FixPam"] > 0:
                self.verbose(0, 3, "- Fixed PAM%d" % self.stat["FixPam"], 1)
            else:
                self.verbose(0, 3, "- Variable PAM Weighting", 1)
            ## <ii> ## PAM Matrix Setup
            try:
                if self.obj["Tree"].obj["PAM"] == None:
                    self.obj["Tree"].obj["PAM"] = rje_pam.PamCtrl(log=self.log, cmd_list=self.cmd_list)
                if self.stat["FixPam"] <= 0:
                    maxblen = 0
                    for b in self.obj["Tree"].branch:
                        if b.stat["Length"] > maxblen:
                            maxblen = b.stat["Length"]
                    self.verbose(1, 3, "Max Branch Length = %f: " % maxblen, 0)
                    maxblen = int(maxblen * 100) + 1
                else:
                    maxblen = self.stat["FixPam"]
                self.verbose(1, 3, "Max PAM = %d" % maxblen, 1)
                # print tree.pam.getPamMax(), maxblen
                if self.obj["Tree"].obj["PAM"].stat["PamMax"] < maxblen:
                    # print 'Upping PAM!'
                    self.obj["Tree"].obj["PAM"].stat["PamMax"] = maxblen
                    self.obj["Tree"].obj["PAM"].pamUp()
            except:
                self.log.errorLog("Fatal run Exception during PAM Matrix Setup\n")
                raise
            ##<iii> ## AA Freqs
            aalist = self.obj["Tree"].obj["PAM"].alphabet
            self.verbose(1, 3, aalist, 1)
            if aalist.count("-") == 0:
                aalist.append("-")
            if aalist.count("X") == 0:
                aalist.append("X")
            self.aafreq = self.obj["Tree"].obj["SeqList"].aaFreq(alphabet=aalist)
            self.aafreq["-"] = 0.0
            self.aafreq["X"] = 0.0
            # tree.deBug(aafreq)

            ### <b> ### Terminal sequences - probabilities etc. are known (sequences are known!)
            self.gaspnode = {}  # Array of GaspNode objects
            for node in self.obj["Tree"].node:
                ## <i> ## Check Sequence Exists
                if node.stat["ID"] > self.obj["Tree"].stat["SeqNum"]:
                    if node.obj["Sequence"] == None:
                        self.obj["Tree"].obj["SeqList"]._addSeq(
                            node.info["Name"], "X" * self.obj["Tree"].obj["SeqList"].seq[0].seqLen()
                        )
                        node.obj["Sequence"] = self.obj["Tree"].obj["SeqList"].seq[-1]
                ## <ii> ## Create GaspNode object
                self.gaspnode[node] = GaspNode(node, aalist, self.log)
                ## <iii> ## Termini
                if node.stat["ID"] <= self.obj["Tree"].stat["SeqNum"]:
                    self.gaspnode[node].probFromSeq()
                    # print s, len(gaspnode[s].sequence), gaspnode[s].ancfix
                    self.gaspnode[node].ancfix = [True] * len(node.obj["Sequence"].info["Sequence"])

            ### <c> ### GASP 1: Gap Status
            self._gapStatus()

            ##  <d>  ## From tips to root
            # X#self.verbose(0,4,"GASP",0)
            aalist.remove("-")
            if aalist.count("X") > 0:
                aalist.remove("X")
            self._gaspProbs(
                aalist=aalist,
                useanc=False,
                dir="down",
                aaprobs=True,
                aasub=self.opt["FixDown"],
                aafix=self.opt["FixDown"],
            )
            if self.opt["FixDown"]:
                self.obj["Tree"].ancSeqOut(file="%s.anc.fas" % self.info["Name"], ordered=self.opt["Ordered"])
                return
            # Should now have matrix of aa probabilities right back to root...
            ##  <b>  ## Fix Root
            self._gaspProbs(aalist=aalist, useanc=False, dir="root", aaprobs=False, aasub=True, aafix=self.opt["FixUp"])
            ##  <c>  ## Back up tree using all 3 branches
            self._gaspProbs(aalist=aalist, useanc=True, dir="up", aaprobs=True, aasub=True, aafix=self.opt["FixUp"])

            ##  <d>  ## Back down tree with all 3 branches to soften 'outgroup sweep' near root
            for x in range(self.stat["XPass"]):
                # X#self.verbose(0,4,":%d:" % (x+1),0)
                self._gaspProbs(
                    aalist=aalist, useanc=True, dir="down", aaprobs=True, aasub=False, aafix=False, gpass=(x + 1)
                )
                self._gaspProbs(
                    aalist=aalist, useanc=True, dir="down", aaprobs=True, aasub=True, aafix=True, gpass=(x + 1)
                )

            ### <4> ### Finished => Save
            for node in self.obj["Tree"].node:
                node.obj["Sequence"].info["Sequence"] = self.gaspnode[node].sequence
            # X#self.verbose(0,2,"Done!",1)
            self.log.printLog("\r#GASP", "Gapped Ancestral Sequence Prediction Complete.")
            self.obj["Tree"].ancSeqOut(file="%s.anc.fas" % self.info["Name"], ordered=self.opt["Ordered"])

            ### <5> ### PAM Distances & PAM Tree
            if self.opt["PamTree"]:
                try:
                    self.obj["Tree"].branchPam()
                    self.obj["Tree"].saveTree(
                        filename="%s.anc.nsf" % self.info["Name"],
                        type="nsf",
                        seqnum=1,
                        seqname="short",
                        maxnamelen=127,
                        blen="pam",
                        bootstraps="node",
                        multiline=1,
                    )
                    self.obj["Tree"].textTree(
                        seqnum=1,
                        seqname="short",
                        maxnamelen=30,
                        nodename="short",
                        showboot=1,
                        showlen="branch",
                        blen="pam",
                        scale=4,
                        spacer=1,
                        compress=False,
                    )
                    self.obj["Tree"].textTree(
                        filename="%s.anc.txt" % self.info["Name"],
                        seqnum=1,
                        seqname="short",
                        maxnamelen=30,
                        nodename="short",
                        showboot=1,
                        showlen="branch",
                        blen="pam",
                        scale=4,
                        spacer=1,
                        compress=False,
                    )
                except:
                    self.log.errorLog("Major Problem with PAM Tree.")
                    raise

            ### <6> ### RST Output
            if self.opt["RST"]:
                rstfile = "%s.rst" % self.info["Name"]
                rje.backup(self, rstfile)
                RST = open(rstfile, "a")
                RST.write("Supplemental results for GASP - main output %s.anc.fas\n\n" % self.info["Name"])
                for node in self.obj["Tree"].node[self.obj["Tree"].stat["SeqNum"] :]:
                    gn = self.gaspnode[node]
                    RST.write("%s\n\n" % string.join(gn.rst, "\n"))
                RST.close()
                self.log.printLog("RST output %s.rst complete." % self.info["Name"])
        except:
            self.log.errorLog("Fatal Error during GASP.")
            raise
Example #27
0
    def inSilicoHybrid(self):  ### Filter and combine subreads from parent and output to fasta file.
        '''
        Filter and combine subreads from parent and output to fasta file.

        This module generates balanced "in silico diploid" PacBio subread data from two sequenced haploid parents. Each
        parent must first be run through SMRTSCAPE to generate subread summary data. (This will be performed if missing.
        Each parent needs a `*.fofn` file of subread file names, `*.unique.tdt` unique subreads table and `*.smrt.tdt`
        SMRT cell identifier table.)

        A new set of subreads is then generated from the combined set of parent subreads. This is done by first ranking
        the unique subreads from each parent by length. First, the longest subread from each parent are compared and the
        shortest selected to be the first subread of the diploid. (The shortest is taken to minimise length differences
        between the two parents.) Next, the longest subread from the next parent that is no longer than the previous
        subread is added. This cycles, picking a read from the parent with fewest cumulative bases each cycle. The
        longest subread that is no longer than the previous subread is selected. This continues until one parent runs out
        of subreads. Additional subreads will be added from the other parent if they reduce the difference in cumulative
        output for each parent.

        Final output will be a `BASEFILE.L<minlen>RQ<minrq>.fasta` file in which each parent has a similar total sequence
        content and for which the subread length distributions should also be similar. This is to overcome biases in
        resulting diploid assemblies, where one parent has higher quality data than the other.

        NOTE: If performing downstream filtering by Read Quality (RQ), this might reintroduce a bias if one parent has
        much higher RQ values than the other. The `rqfilter=X` setting can therefore be used to restrict output to reads
        with a minimum RQ value. By default this is 0.84. If you do not get enough sequence output, this setting may need
        to be relaxed.

        << returns True if the filtered subreads were saved and summarised, or False if any error was raised (the error
           is logged with self.errorLog()).
        '''
        try:  ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [0a] Parent 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.printLog(
                '#~~#',
                '# ~~~~~~~~~~~~~~~~~~~~ SETUP PARENT 1 ~~~~~~~~~~~~~~~~~~~~ #')
            self.printLog('#FOFN', 'Parent1: %s' % self.getStr('Parent1'))
            base1 = rje.baseFile(self.getStr('Parent1'))
            parent1 = smrtscape.SMRTSCAPE(
                self.log, ['genomesize=13.1e6'] + self.cmd_list +
                ['batch=%s' % self.getStr('Parent1'),
                 'basefile=%s' % base1])
            parent1.setup()
            udb1 = parent1.udb()
            cdb = parent1.db('smrt', add=True, mainkeys=['Name'])
            cdb.dataFormat({'SMRT': 'int'})
            cx = cdb.entryNum()  # Number of Parent1 SMRT entries: used to offset the Parent2 SMRT IDs
            ## ~ [0b] Parent 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.printLog(
                '#~~#',
                '# ~~~~~~~~~~~~~~~~~~~~ SETUP PARENT 2 ~~~~~~~~~~~~~~~~~~~~ #')
            self.printLog('#FOFN', 'Parent2: %s' % self.getStr('Parent2'))
            base2 = rje.baseFile(self.getStr('Parent2'))
            parent2 = smrtscape.SMRTSCAPE(
                self.log, ['genomesize=13.1e6'] + self.cmd_list +
                ['batch=%s' % self.getStr('Parent2'),
                 'basefile=%s' % base2])
            parent2.setup()
            udb2 = parent2.udb()
            cdb2 = parent2.db('smrt', add=True, mainkeys=['Name'])
            cdb2.dataFormat({'SMRT': 'int'})
            # Shift all of the Parent2 SMRT IDs to avoid conflict with Parent1
            for entry in cdb2.entries() + udb2.entries():
                entry['SMRT'] = entry['SMRT'] + cx
            cdb = parent1.db().mergeTables(cdb, cdb2)
            ## ~ [0c] Output Sequence File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.printLog(
                '#~~#',
                '# ~~~~~~~~~~~~~~~~~~~~ DIPLOIDOCUS SUBREADS ~~~~~~~~~~~~~~~~~~~~ #'
            )
            minlen = self.getInt('LenFilter')
            minrq = self.getNum('RQFilter')
            rqstr = '%s' % minrq
            # rqstr[2:] drops the first two characters of the RQ string (the "0." of a value such as 0.84)
            filtfile = '%s.L%sRQ%s.fasta' % (self.baseFile(), minlen, rqstr[2:])
            ## ~ [0d] Input Sequence Files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            seqbatch = []  # List of SeqList objects
            batchfiles = parent1.list['Batch'] + parent2.list['Batch']
            self.printLog('#BATCH', '%s sequence files to process.' % rje.iLen(batchfiles))
            for seqfile in batchfiles:
                seqcmd = self.cmd_list + [
                    'seqmode=file', 'autoload=T', 'summarise=F',
                    'seqin=%s' % seqfile, 'autofilter=F'
                ]
                seqbatch.append(rje_seqlist.SeqList(self.log, seqcmd))
            self.printLog('#BATCH', '%s sequence files to summarise.' % rje.iLen(seqbatch))
            if not seqbatch:
                raise IOError(
                    'No batch input fasta files found! Make sure parentN=FILE settings given *.fofn.'
                )
            ## ~ [0e] Setup subread lists ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Unique subreads for each parent, longest first
            elists = [
                udb1.sortedEntries('Len', reverse=True),
                udb2.sortedEntries('Len', reverse=True)
            ]
            plen = [0, 0]  # Summed lengths for each parent
            pseq = [0, 0]  # Total sequence number for each parent
            prq = [0, 0]  # Total sequence RQ for each parent (convert to mean)
            if not elists[0] or not elists[1]:
                raise ValueError('No Unique ZMW subreads for one or both parents!')
            lastlen = max(elists[0][0]['Len'], elists[1][0]['Len'])  # Length of last selected read
            # Strip leading low quality reads so that the starting parent is chosen using reads that can be output
            for elist in elists:
                while elist and elist[0]['RQ'] < minrq:
                    elist.pop(0)
            if not elists[0] or not elists[1]:
                raise ValueError('No Unique ZMW subreads for one or both parents!')
            nextp = 0  # Index of next parent to use
            if elists[0][0]['Len'] < elists[1][0]['Len']: nextp = 1

            ### ~ [1] Filter and Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [1a] Filter Unique Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            zmwlist = []  # List of (smrt,zmw,pos) meeting filtering criteria
            ux = 0.0  # Progress counter: 100.0 added per subread processed
            utot = len(elists[0]) + len(elists[1])
            while lastlen:
                self.progLog('\r#DIP', 'Diploidising subreads: %.2f%%' % (ux / utot))
                elist = elists[nextp]
                while elist and elist[0]['RQ'] < minrq:  # Skip low quality reads
                    elist.pop(0)
                    ux += 100.0
                if elist and elist[0]['Len'] < minlen:  # Sorted by length, so all remaining reads are too short
                    ux += 100.0 * len(elist)
                    elist = []
                if not elist:  # This parent has run out of reads: switch to the other parent for final processing
                    nextp = 1 - nextp
                    break
                entry = elist.pop(0)
                ux += 100.0
                zmwlist.append((entry['SMRT'], entry['ZMW'], entry['Pos']))
                plen[nextp] += entry['Len']
                prq[nextp] += entry['RQ']
                pseq[nextp] += 1
                # Keep picking from whichever parent has the fewest cumulative bases
                if plen[1 - nextp] <= plen[nextp]: nextp = 1 - nextp
                lastlen = entry['Len']
            ## ~ [1b] Final processing of last reads ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Each remaining read of the other parent is examined exactly once, so this loop always terminates.
            # (Looping on the list being non-empty never ended if the next read passed the RQ filter but was
            # shorter than minlen, because nothing was then popped.)
            elist = elists[nextp]
            while elist:
                self.progLog('\r#DIP', 'Diploidising subreads: %.2f%%' % (ux / utot))
                entry = elist.pop(0)
                ux += 100.0
                if entry['RQ'] < minrq: continue  # Low quality reads are never output
                if entry['Len'] < minlen: break  # Sorted by length, so all remaining reads are too short
                pdiff = rje.modulus(plen[0] - plen[1])
                ediff = rje.modulus(plen[nextp] + entry['Len'] - plen[1 - nextp])
                if ediff >= pdiff: break  # Adding this read would not reduce the difference between parents: finish!
                zmwlist.append((entry['SMRT'], entry['ZMW'], entry['Pos']))
                plen[nextp] += entry['Len']
                prq[nextp] += entry['RQ']
                pseq[nextp] += 1
            self.printLog(
                '\r#DIP',
                'Diploidising subreads complete: %s subreads to output.' % rje.iLen(zmwlist))
            if not pseq[0] or not pseq[1]:  # Mean RQ would otherwise fail with an uninformative ZeroDivisionError
                raise ValueError('No subreads selected for one or both parents: check length and RQ filter settings.')
            for (pi, pkey) in enumerate(['Parent1', 'Parent2']):
                self.printLog(
                    '\r#DIP', '%s: %s seq; %s bp (%.1fX); %.3f mean RQ.' %
                    (self.getStr(pkey), rje.iStr(pseq[pi]), rje.iStr(plen[pi]),
                     1.0 * plen[pi] / self.getInt('GenomeSize'), prq[pi] / pseq[pi]))
            ## ~ [1c] Extract Filtered Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            zmwset = set(zmwlist)  # Constant-time membership test for each input subread
            rje.backup(self, filtfile)
            sx = 0.0
            sn = len(seqbatch)
            fx = 0  # Number of subreads saved
            SEQOUT = open(filtfile, 'w')
            try:
                for seqlist in seqbatch:
                    #>m150625_001530_42272_c100792502550000001823157609091582_s1_p0/9/0_3967 RQ=0.784
                    seqnum = seqlist.seqNum()
                    if not seqnum:  # Empty input file: nothing to extract
                        sx += 100.0
                        continue
                    si = 100.0 / seqnum
                    for seq in seqlist.seqs():
                        self.progLog('\r#OUT', 'Extracting subreads: %.2f%%' % (sx / sn))
                        sx += si
                        (name, sequence) = seqlist.getSeq(seq)
                        # Name fields are smrt/zmw/pos with an optional trailing RQ field, which is not needed here
                        fields = name.replace('/', ' ').split()
                        if len(fields) not in (3, 4):
                            raise ValueError('Cannot parse SMRT/ZMW/Pos from subread name "%s"' % name)
                        (smrt, zmw, pos) = fields[:3]
                        if (cdb.data(smrt)['SMRT'], int(zmw), pos) not in zmwset:
                            continue
                        SEQOUT.write('>%s\n%s\n' % (name, sequence))
                        fx += 1
            finally:
                SEQOUT.close()  # Must be flushed and closed before the file is read back in for the summary
            self.printLog(
                '\r#OUT',
                'Saved %s filtered subreads to %s.' % (rje.iStr(fx), filtfile))

            ### ~ [2] Summarise Filtered File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            seqcmd = self.cmd_list + [
                'seqmode=file', 'autoload=T', 'summarise=T',
                'seqin=%s' % filtfile, 'autofilter=F'
            ]
            rje_seqlist.SeqList(self.log, seqcmd)

            return True
        except:
            self.errorLog('%s.run error' % self.prog())
            return False
Example #28
0
 def uniFake(
     self,
     seqs=[],
     store=False
 ):  ### Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs.
     '''
     Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs given.
     >> seqs:list [] = Sequence objects to process. If given, these replace self.obj['SeqList'].seq. (The mutable
        default is safe here: an empty list is never stored or modified.)
     >> store:bool [False] = If True, the UniProt object is stored in self.obj['SeqList'].obj['UniProt'], entries are
        kept in memory and saved to the DatOut file in one go at the end. If False, each entry is appended to the
        DatOut file as soon as it is made.
     Errors for an individual sequence or prediction method are logged and processing continues.
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         unifake = ' '.join(self.list['UniFake']).lower().split()  # Lower case list of predictions to make
         seqlist = self.obj['SeqList']
         if seqs: seqlist.seq = seqs
         else: seqs = seqlist.seq
         (sx, seqnum) = (0, seqlist.seqNum())
         ## ~ [1b] Setup UniProt object and output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         uniprot = rje_uniprot.UniProt(
             self.log, self.cmd_list)  # UniProt object for saving data
         if self.info['DatOut'].lower() in ['', 'none']:
             self.info['DatOut'] = rje.baseFile(
                 seqlist.info['Name']) + '.dat'
         datfile = self.info['DatOut']
         if os.path.exists(datfile): rje.backup(self, datfile)
         if store: seqlist.obj['UniProt'] = uniprot
         ## ~ [1c] Setup RJE_HMM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if 'pfam' in unifake:
             hmm = rje_hmm.HMMRun(self.log, self.cmd_list + ['force=T'])
             hmmfile = '%s.pfam.tdt' % rje.baseFile(datfile)
             if os.path.exists(hmmfile): rje.backup(self, hmmfile)
             hmm.list['HMM'] = [self.info['PFam']]
             hmm.opt['HMMPFam'] = True
         else:
             hmm = None
         ## ~ [1d] Setup RJE_TM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if 'signalp' in unifake: tm = rje_tm.TM(self.log, self.cmd_list)
         else: tm = None
         ### ~ [2] ~ Perform UniFake processing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for seq in seqs:
             sx += 1
             name = seq.shortName()
             self.printLog(
                 '#SEQ', 'Processing %s (%s aa) %s...' %
                 (seq.shortName(), rje.integerString(
                     seq.aaLen()), seq.info['Description'][:50]))
             # Temporary file prefix. Reset for every sequence so that the cleanup below never uses an unbound or
             # stale name if processing fails before the prefix is made.
             utmp = None
             try:
                 ## ~ [2a] ~ Basic data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 utmp = 'tmp%s.%s' % (rje.randomString(5),
                                      seq.info['AccNum'])
                 # Temporary single-sequence fasta file: closed explicitly as external programs read it below
                 FAS = open('%s.fas' % utmp, 'w')
                 try:
                     FAS.write('>%s\n%s\n' % (seq.shortName(), seq.info['Sequence']))
                 finally:
                     FAS.close()
                 udata = {
                     'CC': ['-!- Features generated using unifake.py'],
                     'AC': []
                 }
                 if seq.info['SpecCode'] in ['Unknown', 'UNK']:
                     seq.info['SpecCode'] = self.info['SPCode']
                 #x#elif seq.info['Species'] != 'None': udata['OS'] = [seq.info['Species']]     #!# Check how well this works. Add spectable? #!#
                 ## ~ [2b] ~ Aliases ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if self.opt['EnsDat']:
                     details = rje.matchExp(
                         r'\[acc:(\S+) pep:(\S+) gene:(\S+)\]',
                         seq.info['Name'])
                     if details:  # (acc, pep, gene) are all added as aliases; gene is also the gene name
                         for alias in details[:3]:
                             self.addAlias(seq.info['AccNum'], alias)
                         udata['GN'] = [details[2]]
                 for seqid in [seq.shortName(), seq.info['AccNum']]:
                     if seqid in self.dict['Aliases']:
                         udata['AC'].append(
                             '%s;' % '; '.join(self.dict['Aliases'][seqid]))
                 ## ~ [2c] ~ Features ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 ft = []  # List of features for sequence
                 for seqid in [
                         seq.shortName(), seq.info['AccNum'], seq.info['ID']
                 ]:
                     if seqid in self.dict['Features']:
                         ft += self.dict['Features'][seqid]
                 ## ~ [2d] IUPRED disorder prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 # The lower case unifake list is used, as for pfam/tmhmm/signalp, so the setting is not case-sensitive
                 if 'disorder' in unifake:
                     try:
                         seq.disorder()
                         dis = seq.obj['Disorder']
                         iupred = dis.info['Disorder'].lower() == 'iupred'
                         for disorder in dis.list['RegionDisorder']:
                             desc = 'Predicted disorder: %s' % dis.info['Disorder']
                             if iupred:
                                 desc = '%s > %.2f' % (desc, dis.stat['IUCut'])
                             ft.append({
                                 'Type': 'DISORDER',
                                 'Desc': desc,
                                 'Start': disorder[0],
                                 'End': disorder[1]
                             })
                         for fold in dis.list['RegionFold']:
                             desc = 'Predicted order: %s' % dis.info['Disorder']
                             if iupred:
                                 desc = '%s <= %.2f' % (desc, dis.stat['IUCut'])
                             ft.append({
                                 'Type': 'ORDER',
                                 'Desc': desc,
                                 'Start': fold[0],
                                 'End': fold[1]
                             })
                     except:
                         self.log.errorLog(
                             'UniFake disorder problem for %s.' % name)
                 ## ~ [2e] PFam HMM domain prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if hmm:
                     try:
                         hmm.setInfo({
                             'SearchDB': '%s.fas' % utmp,
                             'HMMOut': '%s.hmm.out' % utmp
                         })  # This will be made for each sequence
                         hmm.search = []
                         hmm.list['HMMRes'] = [
                             hmm.hmmSearch(self.info['PFam'],
                                           outfile=hmm.info['HMMOut'])
                         ]  # Used in hmmTable
                         hmm.hmmTable(outfile=hmmfile, append=True)
                         if 'disorder' in unifake:
                             disorder = seq.obj['Disorder'].list[
                                 'ResidueDisorder']  # individual (IUPRed) residue results
                         else:
                             disorder = []
                         if hmm.search:
                             udata['CC'].append(
                                 'PFam: HMMer PFam search vs %s (Modified %s)'
                                 %
                                 (self.info['PFam'],
                                  time.ctime(
                                      os.path.getmtime(self.info['PFam']))))
                         else:
                             udata['CC'].append(
                                 '-!- ERROR: PFam HMMer Search failure!')
                             out = {'Type': '!ERROR!', 'Name': name}
                             rje.delimitedFileOutput(
                                 self,
                                 hmmfile, [
                                     'Type', 'Name', 'Start', 'End', 'Eval',
                                     'Score'
                                 ],
                                 datadict=out)
                         for search in hmm.search:
                             for hit in search.hit:
                                 for aln in hit.aln:
                                     pfamft = {
                                         'Start': aln.stat['SbjStart'],
                                         'End': aln.stat['SbjEnd'],
                                         'Type': 'PFAM',
                                         'Desc': '%s PFam HMM Eval: %.2e; Score: %.1f'
                                         % (search.info['Name'],
                                            aln.stat['Expect'],
                                            aln.stat['BitScore'])
                                     }
                                     if disorder:
                                         # Mean per-residue disorder score over the aligned region
                                         region = disorder[
                                             aln.stat['SbjStart'] -
                                             1:aln.stat['SbjEnd']]
                                         hmmdisorder = float(
                                             sum(region)) / len(region)
                                         pfamft['Desc'] = '%s; IUPRed: %.2f' % (
                                             pfamft['Desc'], hmmdisorder)
                                         if hmmdisorder < self.stat['DisDom']:
                                             pfamft['Type'] = 'DOMAIN'
                                     ft.append(pfamft)
                     except:
                         self.log.errorLog(
                             'UniFake PFam HMM problem for %s.' % name)
                 ## ~ [2f] TMHMM transmembrane topology prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if 'tmhmm' in unifake:
                     try:
                         tmdat = os.popen(
                             '%s %s.fas -short' %
                             (self.info['TMHMM'], utmp)).readlines()
                         domlist = rje_tm.domainList(
                             rje_tm.parseTMHMM(tmdat[0]))
                         for tmdom in domlist:
                             tmdom['Desc'] = 'TMHMM topology prediction'
                             tmdom['Start'] = int(tmdom['Start'])
                             tmdom['End'] = int(tmdom['End'])
                             ft.append(tmdom)
                         if len(domlist) > 1:
                             udata['CC'].append(
                                 'TMHMM: %d TM domains; N-Term %s' %
                                 ((len(domlist) - 1) // 2,
                                  domlist[0]['Type']))
                         else:
                             udata['CC'].append('TMHMM: 0 TM domains')
                     except:
                         self.log.errorLog('UniFake TMHMM problem for %s.' %
                                           name)
                 ## ~ [2g] SIGNALP signal peptide prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if 'signalp' in unifake:
                     try:
                         os.system(
                             '%s -f short -t euk %s.fas > %s.signalp' %
                             (self.info['SignalP'], utmp, utmp))
                         tm.signalp = {}
                         tm.parseSignalP('%s.signalp' % utmp)
                         sigp = tm.signalp.pop(seq.shortName())
                         cpos = 0  # End position of signal peptide feature (0 = no prediction)
                         if sigp['nn_ymax?'] == 'Y':
                             cpos = int(sigp['nn_ymaxpos'])
                             desc = 'SignalP NN prediction'
                         if sigp['hmm_cmax?'] == 'Y':
                             hmm_c = int(sigp['hmm_cmaxpos'])
                             if cpos == 0:
                                 cpos = hmm_c
                                 desc = 'SignalP HMM prediction'
                             elif hmm_c < cpos:  # Both predict: use the smaller position
                                 cpos = hmm_c
                                 desc = 'SignalP HMM prediction (NN also Y)'
                             else:
                                 desc += ' (HMM also Y)'
                         if cpos > 0:
                             ft.append({
                                 'Type': 'SIGNALP',
                                 'Desc': desc,
                                 'Start': 1,
                                 'End': cpos
                             })
                     except:
                         self.log.errorLog(
                             'UniFake SignalP problem for %s.' % name)
                 ## ~ [2h] Convert to UniProt and save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 self.addRealUniProt(seq, udata, ft)
                 self.deBug(ft)
                 if not store: uniprot.list['Entry'] = []
                 if uniprot.addFromSeq(
                         seq, data=udata,
                         ft=ft):  ### Converts into UniProtEntry object
                     if not store: uniprot.saveUniProt(datfile, append=True)
                     #x#open(self.info['DatPickup'],'a').write('%s\n' % seq.shortName())
             except:
                 self.log.errorLog('Problem during UniFake(%s)' % name)
             ## ~ [2i] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             if utmp:  # Delete all temporary files made for this sequence
                 for tmp in glob.glob('%s*' % utmp):
                     os.unlink(tmp)
             self.printLog(
                 '#UNIFAKE',
                 '|---------- %s run <<<|>>> %s to go -----------|' %
                 (rje.integerString(sx), rje.integerString(seqnum - sx)),
                 log=False)
         if store: uniprot.saveUniProt(datfile, append=False)
         if self.opt['CleanUp']:
             for tmp in glob.glob('TMHMM*'):
                 if os.path.isdir(tmp): os.rmdir(tmp)
     except:
         self.errorLog(
             'Oh, the shame of it! Trouble during UniFake.uniFake()')
Example #29
0
 def hmmSearch(self, hmm, dbase=None, outfile=None, wait=True):  ### Performs HMMer Search using object attributes
     '''
     Runs hmmpfam (if self.opt['HMMPFam']) or hmmsearch for an HMM file against a sequence database.
     >> hmm:str = Name of HMM file
     >> dbase:str = Name of database file to search [self.info['SearchDB']]
     >> outfile:str = Name of output file. If missing or 'none', a per-search "HMMBASE.DBASE.hmmer" name is used and
        an existing results file that is younger than both input files is reused unless self.opt['Force'].
     >> wait:boolean = Whether to wait for HMMer. If False, the command is run in the background. [True]
     << returns outfile, or None if an input file is missing or an error is raised
     '''
     try:  ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [1a] ~ Check input files exist ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if not rje.checkForFile(hmm):
             self.printLog('#ERR', 'HMM file %s is missing!' % hmm)
             return None
         dbase = dbase or self.info['SearchDB']
         if not rje.checkForFile(dbase):
             self.printLog('#ERR', 'Database file "%s" is missing!' % dbase)
             return None
         ## ~ [1b] ~ Default output file and reuse of existing results ~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if not outfile or outfile.lower() in ('', 'none'):  # Make an outfile per search
             outfile = '%s.%s.hmmer' % (rje.baseFile(hmm, True), rje.baseFile(dbase, True))
             gzfile = '%s.gz' % outfile
             existing = outfile
             # Fall back on a gzipped copy of the results if the plain file is missing
             if not os.path.exists(outfile) and self.opt['GZip'] and os.path.exists(gzfile) and not self.opt['Force']:
                 existing = gzfile
             uptodate = (not self.opt['Force']
                         and rje.isYounger(existing, hmm) == existing
                         and rje.isYounger(existing, dbase) == existing)
             if uptodate:
                 self.printLog('#HMM', 'HMM results file "%s" exists.' % existing)
                 return outfile  # Already exists
             rje.backup(self, outfile, unlink=True)
         ### ~ [2] ~ HMM Search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self.opt['HMMPFam']:
             template = 'hmmpfam --cut_ga %s %s %s > %s'
         else:
             template = 'hmmsearch %s %s %s > %s'
         command = template % (' '.join(self.list['HMMOptions']), hmm, dbase, outfile)
         self.log.printLog('#HMM', command)
         if wait:
             # Only run if there are no results yet (or forced). The command redirects its own output to outfile.
             if not os.path.exists(outfile) or self.opt['Force']:
                 open(outfile, 'a').write(os.popen(self.info['HMMerPath'] + command).read())
         else:
             os.system(self.info['HMMerPath'] + command + ' &')
         self.printLog('#HMM', 'Outfile produced for %s: %s.' % (hmm, outfile))
         if self.opt['GZip']:
             rje.backup(self, '%s.gz' % outfile, unlink=True)
             os.system('gzip %s' % outfile)
             self.printLog('#GZIP', '%s gzipped to save space' % outfile)
         return outfile
     except:
         self.log.errorLog('Fatal Error during hmmSearch(%s)' % hmm)
         return None
Example #30
0
    def saveXGMML(self,filename=None,format='Cytoscape'):       ### Saves object data to file in XGMML format
        '''
        Saves object data to file in XGMML format.
        >> filename:str [None] = Output file. Will use name.xgmml if None (or the string 'none').
        >> format:str [Cytoscape] = Target for output file. Only used in the log messages of this method.
        Nodes are read from self.dict['Node'] and edges from self.dict['Edge'][etype][(source,target)]. Only attributes
        listed in self.dict['NodeAtt'] / self.dict['EdgeAtt'] (attribute name -> XGMML type) are output.
        Errors are logged rather than raised: a failed node/edge is logged and output continues with the next one.
        '''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if not filename or filename.lower() == 'none': filename = '%s.xgmml' % self.info['Name']
            self.log.printLog('#XGMML','Output of XGMML file %s for %s...' % (filename,format),log=False,newline=False)
            rje.backup(self,filename)
            date = rje.dateTime()
            OUT = open(filename,'w')
            try:    # try/finally guarantees the file handle is closed even if output fails part way through
                ### ~ [2] Output headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
                OUT.write('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n')
                OUT.write('<graph label="%s" id="%s" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns="http://www.cs.rpi.edu/XGMML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">\n' % (self.info['Name'],self.info['Name']))
                OUT.write('    <att name="documentVersion" value="1.0"/>\n')
                ## ~ [2a] Cytoscape format ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                OUT.write('    <att name="networkMetadata">\n')
                OUT.write('        <rdf:RDF>\n')
                OUT.write('            <rdf:Description rdf:about="http://www.cytoscape.org/">\n')
                OUT.write('                <dc:source>RJE_XGMML</dc:source>\n')
                OUT.write('                <dc:format>Cytoscape-XGMML</dc:format>\n')
                OUT.write('                <dc:description>%s</dc:description>\n' % self.info['Description'])
                OUT.write('                <dc:date>%s</dc:date>\n' % date)
                OUT.write('                <dc:type>%s</dc:type>\n' % self.info['Type'])
                OUT.write('                <dc:identifier>N/A</dc:identifier>\n')
                OUT.write('                <dc:title>%s</dc:title>\n' % self.info['Name'])
                OUT.write('            </rdf:Description>\n')
                OUT.write('        </rdf:RDF>\n')
                OUT.write('    </att>\n\n')

                ### ~ [3] Output Nodes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
                size = 35.0
                nodelist = rje.sortKeys(self.dict['Node'])
                # Node -> numeric id lookup: replaces repeated nodelist.index() scans for nodes and edges
                nodeid = dict((node,i) for (i,node) in enumerate(nodelist))
                # Default layout is a grid roughly sqrt(N) wide; (x,y) is the next free grid cell
                (n,x,y) = (int(math.sqrt(len(nodelist))),0,0)
                for node in nodelist:
                    try:
                        ## ~ [3a] Basic node attributes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                        OUT.write('    <node label="%s" id="%d">\n' % (node,nodeid[node]))
                    except: self.errorLog('!'); continue
                    try:
                        for att in rje.sortKeys(self.dict['Node'][node]):
                            if att not in self.dict['NodeAtt']: continue
                            atype = self.dict['NodeAtt'][att]
                            # '&' is not valid inside an XML attribute value, so it is spelled out
                            value = ('%s' % self.dict['Node'][node][att]).replace('&','and')
                            OUT.write('        <att type="%s" name="%s" label="%s" value="%s"/>\n' % (atype,att,att,value))
                        ## ~ [3b] Graphics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                        #!# Add control for these at some point! #!#
                        if node in self.dict['NodePos']:
                            (nx,ny) = self.dict['NodePos'][node]
                            # Check that stored positions are numeric; fall back to the grid position if not
                            try: nx * size
                            except: self.errorLog('%s nodepos X = %s' % (node,nx)); nx = x
                            try: ny * size
                            except: self.errorLog('%s nodepos Y = %s' % (node,ny)); ny = y
                        else: [nx,ny] = [x,y]
                        if self.getBool('XGMMLAtt'):
                            OUT.write('        <graphics w="%.1f" h="%.1f" width="1" type="ellipse" outline="#000000" fill="#ff9999" y="%.1f" x="%.1f">\n' % (size,size,ny*2*size,nx*2*size))
                            OUT.write('            <att name="cytoscapeNodeGraphicsAttributes">\n')
                            OUT.write('                <att name="nodeTransparency" value="1.0"/>\n')
                            #OUT.write('                <att name="nodeLabelFont" value="Default-0-12"/>\n')
                            OUT.write('                <att name="borderLineType" value="solid"/>\n')
                            OUT.write('            </att>\n')
                        else:
                            OUT.write('        <graphics y="%.1f" x="%.1f">\n' % (ny*2*size,nx*2*size))
                        OUT.write('        </graphics>\n')
                        x += 1
                        if x > n: (x,y) = (0,y+1)
                        ## ~ [3c] Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    except: self.errorLog('!')
                    OUT.write('    </node>\n')     # Always close the node element, even if its attributes failed

                ### ~ [4] Output Edges ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
                for etype in rje.sortKeys(self.dict['Edge']):
                    for edge in rje.sortKeys(self.dict['Edge'][etype]):
                        try:
                            ## ~ [4a] Basic edge attributes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                            edgeid = '%s (%s) %s' % (edge[0],etype,edge[1])
                            # An edge whose source/target is not a known node raises here and is skipped
                            OUT.write('    <edge label="%s" id="%s" target="%d" source="%d">\n' % (edgeid,edgeid,nodeid[edge[1]],nodeid[edge[0]]))
                        except: self.errorLog('!'); continue
                        try:
                            OUT.write('        <att type="string" name="canonicalName" label="canonicalName" value="%s"/>\n' % edgeid)
                            OUT.write('        <att type="string" name="TYPE" label="TYPE" value="%s"/>\n' % etype)
                            if 'interaction' not in self.dict['EdgeAtt']: OUT.write('        <att type="string" name="interaction" label="interaction" value="%s"/>\n' % etype)
                            OUT.write('        <att type="string" name="EDGE_TYPE" label="EDGE_TYPE" value="DefaultEdge"/>\n')
                            for att in self.dict['Edge'][etype][edge]:
                                if att.lower() == 'type': continue      # TYPE is already output from etype
                                if att not in self.dict['EdgeAtt']: continue
                                atype = self.dict['EdgeAtt'][att]
                                value = ('%s' % self.dict['Edge'][etype][edge][att]).replace('&','and')
                                OUT.write('        <att type="%s" name="%s" label="%s" value="%s"/>\n' % (atype,att,att,value))
                            ## ~ [4b] Graphics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                            #!# Update these at some point! #!#
                            #OUT.write('        <graphics width="1" fill="#0000ff">\n')
                            #OUT.write('            <att name="cytoscapeEdgeGraphicsAttributes">\n')
                            #OUT.write('                <att name="sourceArrow" value="0"/>\n')
                            #OUT.write('                <att name="targetArrow" value="0"/>\n')
                            #OUT.write('                <att name="edgeLabelFont" value="Default-0-10"/>\n')
                            #OUT.write('                <att name="edgeLineType" value="SOLID"/>\n')
                            #OUT.write('                <att name="sourceArrowColor" value="#000000"/>\n')
                            #OUT.write('                <att name="targetArrowColor" value="#000000"/>\n')
                            #OUT.write('                <att name="curved" value="STRAIGHT_LINES"/>\n')
                            #OUT.write('            </att>\n')
                            #OUT.write('        </graphics>\n')
                            ## ~ [4c] Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                        except: self.errorLog('!')
                        OUT.write('    </edge>\n')

                ### ~ [5] Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
                OUT.write('</graph>\n')
            finally: OUT.close()
            self.log.printLog('\r#XGMML','Output of XGMML file %s for %s complete.' % (filename,format))
        except: self.log.errorLog(rje_zen.Zen().wisdom())
Example #31
0
 def haqBatch(self,force=False): ### Generates Batch and INI files for HAQESAC runs
     '''
     Generates Batch and INI files for HAQESAC runs.
     >> force:bool [False] = Whether to regenerate the files even if both already exist.
     << returns the printLog() result if existing files are kept, else None. Errors are logged via errorLog().
     '''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         batfile = rje.makePath('%shaqesac.bat' % self.info['HaqDir'],wholepath=True)
         inifile = rje.makePath('%shaqesac.ini' % self.info['HaqDir'],wholepath=True)
         if force or self.force() or not rje.exists(batfile) or not rje.exists(inifile): rje.backup(self,batfile); rje.backup(self,inifile)
         else: return self.printLog('#HAQBAT','HAQESAC Batch files found.')
         ### ~ [1] Make INI File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # Pass on all commands except ini=X settings
         haqcmd = [cmd for cmd in self.cmd_list if cmd[:4].lower() != 'ini=']
         if self.opt['MultiHAQ']: haqcmd += ['multihaq=T','force=F']
         with open(inifile,'w') as INI: INI.write('\n'.join(haqcmd))
         ### ~ [2] Make Batch file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # One HAQESAC call per sequence, built first so the batch file is opened (and closed) only once
         batlines = []
         for seq in self.seqs():
             acc = seq.info['AccNum']
             seqcmd = ['seqin=%s.fas' % acc, 'query=%s' % acc, 'basefile=%s' % acc]
             batlines.append('python %shaqesac.py %s\n' % (self.info['Path'],' '.join(seqcmd)))
         if batlines:    # As before, no batch file is created if there are no sequences
             with open(batfile,'a') as BAT: BAT.writelines(batlines)
         self.printLog('#HAQBAT','HAQESAC Batch file output to %s' % batfile)
     except: self.errorLog('Major problem with MultiHAQ.haqBatch',quitchoice=True)
Example #32
0
 def blast2fas(self):  ### Executes BLAST2FAS and copies results files
     '''
     Executes BLAST2FAS and copies results files.
     Accession numbers without a BLAST2Fas results file are recorded in *.blast2fas_null.txt and are skipped when
     checking whether input files need to be (re)made on a later run.
     << returns True if BLAST2Fas was run, or False if all input files were found (and force=F).
     Errors are logged and then re-raised.
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         need2blast = self.opt['Force']
         null_file = '%s.blast2fas_null.txt' % self.baseFile()
         nx = 0
         null_accs = set()  # Accession numbers previously recorded as having no BLAST2Fas output
         if os.path.exists(null_file):
             with open(null_file, 'r') as NULL:
                 null_accs = set(NULL.read().split('\n'))
         self.debug(null_file)
         for seq in self.seqs():
             if seq.info['AccNum'] in null_accs:
                 nx += 1
                 continue
             hfile = rje.makePath('%s%s.fas' %
                                  (self.info['HaqDir'], seq.info['AccNum']),
                                  wholepath=True)
             for db in self.obj['SeqList'].list['Blast2Fas']:
                 # BLAST is needed unless the input file is reported as younger than every database
                 younger = rje.isYounger(hfile, db)
                 self.debug(younger)
                 self.debug(younger == hfile)
                 need2blast = need2blast or not younger == hfile
         if not need2blast:
             self.printLog(
                 '#BLAST',
                 'All HAQESAC input files found (%s w/o BLAST hits) - no BLAST2Fas (force=F)'
                 % nx)
             return False
         ### ~ [2] Execute ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         rje.backup(self, null_file)
         nx = 0
         # MultiCut takes precedence over BlastCut when both are set
         blastcut = self.getInt('MultiCut') or self.getInt('BlastCut')
         if blastcut:
             self.obj['SeqList'].cmd_list += [
                 'blastb=%d' % blastcut,
                 'blastv=%d' % blastcut
             ]
         if self.getInt('Forks'):
             self.obj['SeqList'].cmd_list += [
                 'blasta=%d' % self.getInt('Forks')
             ]
         rje_seq.Blast2Fas(self.obj['SeqList'], self.getStr('HAQBLASTDir'))
         for seq in self.seqs():
             acc = seq.info['AccNum']
             sbfile = '%s%s.blast.fas' % (self.getStr('HAQBLASTDir'), acc)
             if os.path.exists(sbfile):
                 hfile = rje.makePath('%s%s.fas' % (self.info['HaqDir'], acc),
                                      wholepath=True)
                 os.rename(sbfile, hfile)
                 # Remove any pickle files left over for the replaced input file
                 for pickle in ['%s.pickle' % rje.baseFile(hfile),
                                '%s.pickle.gz' % rje.baseFile(hfile)]:
                     if os.path.exists(pickle):
                         os.unlink(pickle)
             else:
                 # No BLAST2Fas output: record accession number so it is skipped next time
                 with open(null_file, 'a') as NULL:
                     NULL.write('%s\n' % acc)
                 nx += 1
         if nx:
             self.printLog(
                 '#BLAST',
                 '%s Accession Numbers without BLAST2Fas hits output to %s'
                 % (nx, null_file))
         self.printLog(
             '#BLAST', '%s HAQESAC input files made using BLAST2Fas' %
             (self.seqNum() - nx))
         return True
     except:
         self.errorLog('Major problem with MultiHAQ.blast2fas')
         raise
Example #33
0
    def run(self):  ### Main run method
        '''
        # DepthCharge: genome assembly quality control and misassembly repair.

        DepthCharge is an assembly quality control and misassembly repair program. It uses mapped long read depth of
        coverage to charge through a genome assembly and identify coverage "cliffs" that may indicate a misassembly.
        If appropriate, it will then blast the assembly into fragments at those misassemblies.

        DepthCharge uses a genome assembly and PAF file of mapped reads as input. If no file is provided, minimap2 will
        be used to generate one.

        For each sequence, DepthCharge starts at the beginning of the sequence and scans through the PAF file for
        coverage to drop below the `mindepth=INT` threshold (default = 1 read). These positions are marked as "bad" and
        compressed into regions of adjacent bad positions. Regions at the start or end of a sequence are labelled "end".
        Regions overlapping gaps are labelled "gap". Otherwise, regions are labelled "bad". All regions are output to
        `*.depthcharge.tdt` along with the length of each sequence (region type "all").

        Future versions will either fragment the assembly at "bad" regions (and "gap" regions if `breakgaps=T`). If
        `breakmode=gap` then DepthCharge will replace bad regions with a gap (`NNNN...`) of length `gapsize=INT`. If
        `breakmode=report` then no additional processing of the assembly will be performed. Otherwise, the processed
        assembly will be saved as `*.depthcharge.fasta`.

        ---

        # Running DepthCharge

        DepthCharge is written in Python 2.x and can be run directly from the commandline:

            python $CODEPATH/depthcharge.py [OPTIONS]

        If running as part of [SLiMSuite](http://slimsuite.blogspot.com/), `$CODEPATH` will be the SLiMSuite `tools/`
        directory. If running from the standalone [DepthCharge git repo](https://github.com/slimsuite/depthcharge), `$CODEPATH`
        will be the path to the `code/` directory. Please see details in the [DepthCharge git repo](https://github.com/slimsuite/depthcharge)
        for running on example data.

        ## Dependencies

        DepthCharge uses `grep` and `awk`. To generate documentation with `dochtml`, R will need to be installed and a
        pandoc environment variable must be set, e.g.

            export RSTUDIO_PANDOC=/Applications/RStudio.app/Contents/MacOS/pandoc

        If a PAF file is not provided, [minimap2](https://github.com/lh3/minimap2) must be installed and either added to
        the environment `$PATH` or given with the `minimap2=PROG` setting.

        For full documentation of the DepthCharge workflow, run with `dochtml=T` and read the `*.docs.html` file generated.


        ## Commandline options


        ```
        ### ~ Main DepthCharge run options ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seqin=FILE      : Input sequence assembly [None]
        basefile=FILE   : Root of output file names [$SEQIN basefile]
        paf=FILE        : PAF file of long reads mapped onto assembly [$BASEFILE.paf]
        breakmode=X     : How to treat misassemblies (report/gap/fragment) [fragment]
        breakgaps=T/F   : Whether to break at gaps where coverage drops if breakmode=fragment [False]
        gapsize=INT     : Size of gaps to insert when breakmode=gap [100]
        mindepth=INT    : Minimum depth to class as OK [1]
        ### ~ PAF file generation options ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        reads=FILELIST  : List of fasta/fastq files containing reads. Wildcard allowed. Can be gzipped. []
        readtype=LIST   : List of ont/pb/hifi file types matching reads for minimap2 mapping [ont]
        minimap2=PROG   : Full path to run minimap2 [minimap2]
        mapopt=CDICT    : Dictionary of minimap2 options [N:100,p:0.0001,x:asm5]
        ### ~ Additional options ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        dochtml=T/F     : Generate HTML DepthCharge documentation (*.docs.html) instead of main run [False]
        logfork=T/F     : Whether to log forking in main log [False]
        tmpdir=PATH     : Path for temporary output files during forking (not all modes) [./tmpdir/]
        ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ```

        '''
        #i# Returns: the rje_rmd.docHTML() result if dochtml=T; False if setup fails or an error is logged; True
        #i# once reporting (and any gap insertion/fragmentation) completes. Errors are logged, not raised.
        try:  ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if self.getBool('DocHTML'): return rje_rmd.docHTML(self)
            if not self.setup(): return False
            ### ~ [2] ~ DepthCharge ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            #i# Fork out processing of the PAF file for each input sequence.
            resfile = '{0}.depthcharge.tdt'.format(self.baseFile())
            ddb = self.depthChargeForker()  # depthcharge table - ['seqname','start','end','type']
            if not ddb: raise IOError('Generation of DepthCharge table failed')
            ddb.indexReport('type')
            breakgaps = self.getBool('BreakGaps')
            breakmode = self.getStrLC('BreakMode')
            regtypes = ddb.index('type')
            breakup = 'bad' in regtypes or (breakgaps and 'gap' in regtypes)
            if breakup:
                ddb.printLog(
                    '#RESULT',
                    'Regions of bad coverage output to {0}'.format(resfile))
            elif 'gap' in regtypes:
                ddb.printLog(
                    '#RESULT',
                    'Gaps of bad coverage output to {0}'.format(resfile))
            else:
                ddb.printLog('#RESULT',
                             'No regions of bad coverage to output!')
            if breakmode == 'report' or not breakup:
                return True
            ### ~ [3] ~ Fragment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            #i# Fragment, insert gaps or just report regions
            fasfile = '{0}.depthcharge.fasta'.format(self.basefile())
            rje.backup(self, fasfile)
            seqx = 0
            FRAGFAS = open(fasfile, 'w')
            try:  # try/finally makes sure the fasta output is flushed and closed, even on error
                seqin = self.seqinObj()
                for seq in seqin.seqs():
                    (seqname, sequence) = seqin.getSeq(seq)
                    sname = seqname.split()[0]
                    seqlen = len(sequence)
                    regions = [(entry['start'], entry['end'])
                               for entry in ddb.indexEntries('seqname', sname)
                               if entry['type'] == 'bad' or
                               (breakgaps and entry['type'] == 'gap')]
                    if not regions:  # Nothing to break: output sequence unchanged
                        FRAGFAS.write('>{0}\n{1}\n'.format(seqname, sequence))
                        seqx += 1
                        continue
                    #i# Good sequence between consecutive regions is sequence[previous end:next start - 1], i.e.
                    #i# the slicing treats start/end as 1-based inclusive positions. The sentinel "regions" before
                    #i# and after the sequence are therefore 0 and seqlen+1. (A final sentinel of seqlen would
                    #i# clip the last base from the final fragment.)
                    regions += [(0, 0), (seqlen + 1, seqlen + 1)]
                    regions.sort()
                    fragx = 0
                    newseq = ''
                    for (upstream, downstream) in zip(regions, regions[1:]):
                        fragx += 1
                        if breakmode == 'gap':
                            if newseq: newseq += 'N' * self.getInt('GapSize')
                            newseq += sequence[upstream[1]:downstream[0] - 1]
                        elif breakmode == 'fragment':
                            newname = '{0}.{1} {2}'.format(sname, fragx, seqname)
                            newseq = sequence[upstream[1]:downstream[0] - 1]
                            FRAGFAS.write('>{0}\n{1}\n'.format(newname, newseq))
                            seqx += 1
                    if breakmode == 'gap':
                        #i# fragx chunks of good sequence are joined by fragx-1 gaps
                        newname = '{0}+{1}gaps {2}'.format(sname, fragx - 1,
                                                           seqname)
                        FRAGFAS.write('>{0}\n{1}\n'.format(newname, newseq))
                        seqx += 1
                        self.printLog(
                            '#ADDGAP',
                            '{0} gaps added to {1}'.format(fragx - 1, sname))
                    else:
                        self.printLog(
                            '#FRAG', '{0} fragments of {1} output to {2}'.format(
                                fragx, sname, fasfile))
            finally:
                FRAGFAS.close()
            self.printLog('#FASOUT',
                          '{0} sequences output to {1}'.format(seqx, fasfile))
            return True  # Successful run (previously returned False, left over from "not yet implemented")
        except:
            self.errorLog(self.zen())
            return False  # Failure is reported to the caller, consistent with the failed setup() return above
Example #34
0
    def saveXGMML(
            self,
            filename=None,
            format='Cytoscape'):  ### Saves object data to file in XGMML format
        '''
        Saves object data to file in XGMML format.
        >> filename:str [None] = Output file. Will use name.xgmml if None (or the string 'none').
        >> format:str [Cytoscape] = Target for output file. Only used in the log messages of this method.
        Nodes are read from self.dict['Node'] and edges from self.dict['Edge'][etype][(source,target)]. Only attributes
        listed in self.dict['NodeAtt'] / self.dict['EdgeAtt'] (attribute name -> XGMML type) are output.
        Errors are logged rather than raised: a failed node/edge is logged and output continues with the next one.
        '''
        try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if not filename or filename.lower() == 'none':
                filename = '%s.xgmml' % self.info['Name']
            self.log.printLog('#XGMML',
                              'Output of XGMML file %s for %s...' %
                              (filename, format),
                              log=False,
                              newline=False)
            rje.backup(self, filename)
            date = rje.dateTime()
            OUT = open(filename, 'w')
            try:  # try/finally guarantees the file handle is closed even if output fails part way through
                ### ~ [2] Output headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
                OUT.write(
                    '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n')
                OUT.write(
                    '<graph label="%s" id="%s" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns="http://www.cs.rpi.edu/XGMML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">\n'
                    % (self.info['Name'], self.info['Name']))
                OUT.write('    <att name="documentVersion" value="1.0"/>\n')
                ## ~ [2a] Cytoscape format ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                OUT.write('    <att name="networkMetadata">\n')
                OUT.write('        <rdf:RDF>\n')
                OUT.write(
                    '            <rdf:Description rdf:about="http://www.cytoscape.org/">\n'
                )
                OUT.write('                <dc:source>RJE_XGMML</dc:source>\n')
                OUT.write(
                    '                <dc:format>Cytoscape-XGMML</dc:format>\n')
                OUT.write(
                    '                <dc:description>%s</dc:description>\n' %
                    self.info['Description'])
                OUT.write('                <dc:date>%s</dc:date>\n' % date)
                OUT.write('                <dc:type>%s</dc:type>\n' %
                          self.info['Type'])
                OUT.write(
                    '                <dc:identifier>N/A</dc:identifier>\n')
                OUT.write('                <dc:title>%s</dc:title>\n' %
                          self.info['Name'])
                OUT.write('            </rdf:Description>\n')
                OUT.write('        </rdf:RDF>\n')
                OUT.write('    </att>\n\n')

                ### ~ [3] Output Nodes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
                size = 35.0
                nodelist = rje.sortKeys(self.dict['Node'])
                # Node -> numeric id lookup: replaces repeated nodelist.index() scans for nodes and edges
                nodeid = dict((node, i) for (i, node) in enumerate(nodelist))
                # Default layout is a grid roughly sqrt(N) wide; (x,y) is the next free grid cell
                (n, x, y) = (int(math.sqrt(len(nodelist))), 0, 0)
                for node in nodelist:
                    try:
                        ## ~ [3a] Basic node attributes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                        OUT.write('    <node label="%s" id="%d">\n' %
                                  (node, nodeid[node]))
                    except:
                        self.errorLog('!')
                        continue
                    try:
                        for att in rje.sortKeys(self.dict['Node'][node]):
                            if att not in self.dict['NodeAtt']: continue
                            atype = self.dict['NodeAtt'][att]
                            # '&' is not valid inside an XML attribute value, so it is spelled out
                            value = ('%s' % self.dict['Node'][node][att]).replace(
                                '&', 'and')
                            OUT.write(
                                '        <att type="%s" name="%s" label="%s" value="%s"/>\n'
                                % (atype, att, att, value))
                        ## ~ [3b] Graphics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                        #!# Add control for these at some point! #!#
                        if node in self.dict['NodePos']:
                            (nx, ny) = self.dict['NodePos'][node]
                            # Check that stored positions are numeric; fall back to the grid position if not
                            try:
                                nx * size
                            except:
                                self.errorLog('%s nodepos X = %s' % (node, nx))
                                nx = x
                            try:
                                ny * size
                            except:
                                self.errorLog('%s nodepos Y = %s' % (node, ny))
                                ny = y
                        else:
                            [nx, ny] = [x, y]
                        if self.getBool('XGMMLAtt'):
                            OUT.write(
                                '        <graphics w="%.1f" h="%.1f" width="1" type="ellipse" outline="#000000" fill="#ff9999" y="%.1f" x="%.1f">\n'
                                % (size, size, ny * 2 * size, nx * 2 * size))
                            OUT.write(
                                '            <att name="cytoscapeNodeGraphicsAttributes">\n'
                            )
                            OUT.write(
                                '                <att name="nodeTransparency" value="1.0"/>\n'
                            )
                            #OUT.write('                <att name="nodeLabelFont" value="Default-0-12"/>\n')
                            OUT.write(
                                '                <att name="borderLineType" value="solid"/>\n'
                            )
                            OUT.write('            </att>\n')
                        else:
                            OUT.write(
                                '        <graphics y="%.1f" x="%.1f">\n' %
                                (ny * 2 * size, nx * 2 * size))
                        OUT.write('        </graphics>\n')
                        x += 1
                        if x > n: (x, y) = (0, y + 1)
                        ## ~ [3c] Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    except:
                        self.errorLog('!')
                    # Always close the node element, even if its attributes failed
                    OUT.write('    </node>\n')

                ### ~ [4] Output Edges ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
                for etype in rje.sortKeys(self.dict['Edge']):
                    for edge in rje.sortKeys(self.dict['Edge'][etype]):
                        try:
                            ## ~ [4a] Basic edge attributes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                            edgeid = '%s (%s) %s' % (edge[0], etype, edge[1])
                            # An edge whose source/target is not a known node raises here and is skipped
                            OUT.write(
                                '    <edge label="%s" id="%s" target="%d" source="%d">\n'
                                % (edgeid, edgeid, nodeid[edge[1]],
                                   nodeid[edge[0]]))
                        except:
                            self.errorLog('!')
                            continue
                        try:
                            OUT.write(
                                '        <att type="string" name="canonicalName" label="canonicalName" value="%s"/>\n'
                                % edgeid)
                            OUT.write(
                                '        <att type="string" name="TYPE" label="TYPE" value="%s"/>\n'
                                % etype)
                            if 'interaction' not in self.dict['EdgeAtt']:
                                OUT.write(
                                    '        <att type="string" name="interaction" label="interaction" value="%s"/>\n'
                                    % etype)
                            OUT.write(
                                '        <att type="string" name="EDGE_TYPE" label="EDGE_TYPE" value="DefaultEdge"/>\n'
                            )
                            for att in self.dict['Edge'][etype][edge]:
                                # TYPE is already output from etype
                                if att.lower() == 'type': continue
                                if att not in self.dict['EdgeAtt']: continue
                                atype = self.dict['EdgeAtt'][att]
                                value = (
                                    '%s' % self.dict['Edge'][etype][edge][att]
                                ).replace('&', 'and')
                                OUT.write(
                                    '        <att type="%s" name="%s" label="%s" value="%s"/>\n'
                                    % (atype, att, att, value))
                            ## ~ [4b] Graphics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                            #!# Update these at some point! #!#
                            #OUT.write('        <graphics width="1" fill="#0000ff">\n')
                            #OUT.write('            <att name="cytoscapeEdgeGraphicsAttributes">\n')
                            #OUT.write('                <att name="sourceArrow" value="0"/>\n')
                            #OUT.write('                <att name="targetArrow" value="0"/>\n')
                            #OUT.write('                <att name="edgeLabelFont" value="Default-0-10"/>\n')
                            #OUT.write('                <att name="edgeLineType" value="SOLID"/>\n')
                            #OUT.write('                <att name="sourceArrowColor" value="#000000"/>\n')
                            #OUT.write('                <att name="targetArrowColor" value="#000000"/>\n')
                            #OUT.write('                <att name="curved" value="STRAIGHT_LINES"/>\n')
                            #OUT.write('            </att>\n')
                            #OUT.write('        </graphics>\n')
                            ## ~ [4c] Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                        except:
                            self.errorLog('!')
                        OUT.write('    </edge>\n')

                ### ~ [5] Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
                OUT.write('</graph>\n')
            finally:
                OUT.close()
            self.log.printLog(
                '\r#XGMML', 'Output of XGMML file %s for %s complete.' %
                (filename, format))
        except:
            self.log.errorLog(rje_zen.Zen().wisdom())
Example #35
0
 def saveTimePoints(self,
                    filename='',
                    format='tdt',
                    entries=[]):  ### Saves TimePoints to a file
     '''
     Saves TimePoints to a file from main TimePoints table.
     >> filename:str [''] = Output filename. Will use basefile if none given.
     >> format:str ['tdt'] = Output file format. 'csv' or 'tdt' gives delimited output via
         rje.delimitedFileOutput(); 'db' gives one "(value, value, ...);" line per entry; any other
         value gives the one-line TimePoint text format. If blank or 'none', the lower-cased text after
         the last '.' of filename is used instead.
     >> entries:list [] = Entries from main table to output. (All if none given).
     << Returns False if an error was caught and logged. Returns None otherwise.
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         db = self.db('TimePoints')
         if format.lower() in ['', 'none']:
             format = filename.lower().split('.')[-1]
         if not filename: filename = '%s.%s' % (self.basefile(), format)
         # NB. entries is only ever rebound (never mutated) so the shared [] default is harmless here.
         if not entries: entries = db.entries()
         ### ~ [2] Save to file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [2a] Simple delimited file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if format in ['csv', 'tdt']:
             self.blanksToEmpty()
             fields = db.fields()
             # First call (no datadict) is made with rje_backup=True; subsequent calls add one entry each.
             rje.delimitedFileOutput(self, filename, fields, rje_backup=True)
             for entry in entries:
                 rje.delimitedFileOutput(self, filename, fields, datadict=entry)
         ## ~ [2b] Text file output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         else:
             self.emptyToBlank()
             rje.backup(self, filename)
             fields = []
             if format == 'db': fields = db.fields()
             # The with-block guarantees the handle is closed, even if an entry lacks an expected key.
             with open(filename, 'a') as OUT:
                 for entry in entries:
                     if format == 'db':
                         # str() of a list minus its enclosing [] gives "value, value, ..." using repr() of each value.
                         out_txt = '%s' % [entry[field] for field in fields]
                         OUT.write('(%s);\n' % out_txt[1:-1])
                     else:
                         # American Independence. (TimePoint) 1776 AD, 4 July. The US declared independence from the British Empire. Source: <http://en.wikipedia.org/wiki/United_States_Declaration_of_Independence>[Wikipedia]. (Keywords: history)
                         out_text = '%s. (TimePoint) ' % entry['TimePoint Name']
                         if entry['month'] in ['', 'blank']:
                             out_text += '%s %s.' % (entry['Year'], entry['yearUnit'])
                         else:
                             out_text += '%s %s, %s %s.' % (entry['Year'], entry['yearUnit'],
                                                            entry['month'], entry['day'])
                         # NOTE(review): 'Source URL' fills both the <url> and [label] slots, whereas the
                         # example above shows a source name as the label - confirm which field is intended.
                         out_text = '%s %s Source: <%s>[%s].' % (
                             out_text, entry['TimePoint Description'],
                             entry['Source URL'], entry['Source URL'])
                         klist = [entry['keyword%d' % i] for i in range(1, 6)
                                  if entry['keyword%d' % i] not in ['', 'blank']]
                         out_text = '%s (Keywords: %s)' % (out_text, ', '.join(klist))
                         OUT.write('%s\n' % out_text)
         self.printLog('#OUT',
                       '%d entries output to %s' % (len(entries), filename))
     except:
         self.errorLog('%s.saveTimePoints(%s) error' % (self, filename))
         return False
Example #36
0
    def gasp(self):  ### Performs GASP: Gapped Ancestral Sequence Prediction
        '''
        Performs GASP: Gapped Ancestral Sequence Prediction.

        Works on the tree held in self.obj['Tree'] and, in order:
        1. Switches on 'UnkSpec' for the tree and its SeqList, makes sure the tree has a PAM object and raises
           its 'PamMax' (calling pamUp()) if it is below the maximum PAM needed: self.stat['FixPam'] if positive,
           else int(longest branch length * 100) + 1.
        2. Stores amino acid frequencies in self.aafreq, with '-' and 'X' forced to 0.0.
        3. Builds self.gaspnode = {node: GaspNode}, adding an all-'X' sequence for any node with
           ID > SeqNum that has no Sequence object.
        4. Calls self._gapStatus() and then the series of self._gaspProbs() passes (down, root, up and
           self.stat['XPass'] pairs of extra down passes).
        5. Copies each GaspNode sequence back onto its node and outputs '<Name>.anc.fas'.
        6. Optionally outputs the PAM tree files (self.opt['PamTree']) and '<Name>.rst' (self.opt['RST']).

        If self.opt['FixDown'] is set, '<Name>.anc.fas' is output straight after the first down pass and the
        method returns without performing the later passes or optional outputs.
        Any exception is logged via self.log.errorLog() and re-raised.
        '''
        try:
            ### <1> ### Preparation
            tree = self.obj['Tree']
            tree.cmd_list.append('unkspec=T')
            tree.obj['SeqList'].opt['UnkSpec'] = True
            ## <1a> ## Screen Output
            self.verbose(0, 3, "\nMaking Ancestral Sequences", 0)
            if self.stat['FixPam'] > 0:
                self.verbose(0, 3, "- Fixed PAM%d" % self.stat['FixPam'], 1)
            else:
                self.verbose(0, 3, "- Variable PAM Weighting", 1)
            ## <1b> ## PAM Matrix Setup
            try:
                if tree.obj['PAM'] is None:
                    tree.obj['PAM'] = rje_pam.PamCtrl(log=self.log,
                                                      cmd_list=self.cmd_list)
                if self.stat['FixPam'] <= 0:
                    # Variable PAM: the largest PAM needed is set by the longest branch
                    maxblen = 0
                    for b in tree.branch:
                        if b.stat['Length'] > maxblen:
                            maxblen = b.stat['Length']
                    self.verbose(1, 3, 'Max Branch Length = %f: ' % maxblen, 0)
                    maxblen = int(maxblen * 100) + 1
                else:
                    maxblen = self.stat['FixPam']
                self.verbose(1, 3, 'Max PAM = %d' % maxblen, 1)
                if tree.obj['PAM'].stat['PamMax'] < maxblen:
                    tree.obj['PAM'].stat['PamMax'] = maxblen
                    tree.obj['PAM'].pamUp()
            except:
                self.log.errorLog(
                    "Fatal run Exception during PAM Matrix Setup\n")
                raise
            ## <1c> ## AA Freqs
            # NB. aalist is whatever object PAM.alphabet returns - no copy is taken here. If that is the PAM
            # object's own list, the append() calls below and the remove() calls in <4> edit it in place.
            aalist = tree.obj['PAM'].alphabet
            self.verbose(1, 3, aalist, 1)
            if '-' not in aalist:
                aalist.append('-')
            if 'X' not in aalist:
                aalist.append('X')
            self.aafreq = tree.obj['SeqList'].aaFreq(alphabet=aalist)
            self.aafreq['-'] = 0.0
            self.aafreq['X'] = 0.0

            ### <2> ### Terminal sequences - probabilities etc. are known (sequences are known!)
            self.gaspnode = {}  # Dictionary of {node: GaspNode object}
            for node in tree.node:
                ## <2a> ## Check Sequence Exists
                if node.stat['ID'] > tree.stat['SeqNum']:
                    if node.obj['Sequence'] is None:
                        # Placeholder of Xs, same length as the first sequence in the SeqList
                        tree.obj['SeqList']._addSeq(
                            node.info['Name'],
                            'X' * tree.obj['SeqList'].seq[0].seqLen())
                        node.obj['Sequence'] = tree.obj['SeqList'].seq[-1]
                ## <2b> ## Create GaspNode object
                self.gaspnode[node] = GaspNode(node, aalist, self.log)
                ## <2c> ## Termini
                if node.stat['ID'] <= tree.stat['SeqNum']:
                    self.gaspnode[node].probFromSeq()
                    self.gaspnode[node].ancfix = [True] * len(
                        node.obj['Sequence'].info['Sequence'])

            ### <3> ### GASP 1: Gap Status
            self._gapStatus()

            ### <4> ### GASP 2: Amino acid probabilities
            ## <4a> ## From tips to root
            aalist.remove('-')  # Always present: added in <1c> if it was missing
            if 'X' in aalist:
                aalist.remove('X')
            self._gaspProbs(aalist=aalist,
                            useanc=False,
                            dir='down',
                            aaprobs=True,
                            aasub=self.opt['FixDown'],
                            aafix=self.opt['FixDown'])
            if self.opt['FixDown']:
                tree.ancSeqOut(file='%s.anc.fas' % self.info['Name'],
                               ordered=self.opt['Ordered'])
                return
            # Should now have matrix of aa probabilities right back to root...
            ## <4b> ## Fix Root
            self._gaspProbs(aalist=aalist,
                            useanc=False,
                            dir='root',
                            aaprobs=False,
                            aasub=True,
                            aafix=self.opt['FixUp'])
            ## <4c> ## Back up tree using all 3 branches
            self._gaspProbs(aalist=aalist,
                            useanc=True,
                            dir='up',
                            aaprobs=True,
                            aasub=True,
                            aafix=self.opt['FixUp'])
            ## <4d> ## Back down tree with all 3 branches to soften 'outgroup sweep' near root
            for x in range(self.stat['XPass']):
                self._gaspProbs(aalist=aalist,
                                useanc=True,
                                dir='down',
                                aaprobs=True,
                                aasub=False,
                                aafix=False,
                                gpass=(x + 1))
                self._gaspProbs(aalist=aalist,
                                useanc=True,
                                dir='down',
                                aaprobs=True,
                                aasub=True,
                                aafix=True,
                                gpass=(x + 1))

            ### <5> ### Finished => Save
            for node in tree.node:
                node.obj['Sequence'].info['Sequence'] = self.gaspnode[
                    node].sequence
            self.log.printLog(
                '\r#GASP', 'Gapped Ancestral Sequence Prediction Complete.')
            tree.ancSeqOut(file='%s.anc.fas' % self.info['Name'],
                           ordered=self.opt['Ordered'])

            ### <6> ### PAM Distances & PAM Tree
            if self.opt['PamTree']:
                try:
                    tree.branchPam()
                    tree.saveTree(filename='%s.anc.nsf' % self.info['Name'],
                                  type='nsf',
                                  seqnum=1,
                                  seqname='short',
                                  maxnamelen=127,
                                  blen='pam',
                                  bootstraps='node',
                                  multiline=1)
                    # Same text tree twice: first without a filename, then saved to *.anc.txt
                    tree.textTree(seqnum=1,
                                  seqname='short',
                                  maxnamelen=30,
                                  nodename='short',
                                  showboot=1,
                                  showlen='branch',
                                  blen='pam',
                                  scale=4,
                                  spacer=1,
                                  compress=False)
                    tree.textTree(filename='%s.anc.txt' % self.info['Name'],
                                  seqnum=1,
                                  seqname='short',
                                  maxnamelen=30,
                                  nodename='short',
                                  showboot=1,
                                  showlen='branch',
                                  blen='pam',
                                  scale=4,
                                  spacer=1,
                                  compress=False)
                except:
                    self.log.errorLog("Major Problem with PAM Tree.")
                    raise

            ### <7> ### RST Output
            if self.opt['RST']:
                rstfile = '%s.rst' % self.info['Name']
                rje.backup(self, rstfile)
                # with-block closes the file even if a node has no GaspNode/rst data
                with open(rstfile, 'a') as RST:
                    RST.write(
                        'Supplemental results for GASP - main output %s.anc.fas\n\n'
                        % self.info['Name'])
                    # Nodes from index SeqNum onwards only, i.e. skipping the first SeqNum nodes
                    for node in tree.node[tree.stat['SeqNum']:]:
                        RST.write('%s\n\n' % '\n'.join(self.gaspnode[node].rst))
                # Log type given first, as for every other printLog() call
                self.log.printLog(
                    '#RST', 'RST output %s.rst complete.' % self.info['Name'])
        except:
            self.log.errorLog('Fatal Error during GASP.')
            raise
Example #37
0
 def exonerate(self,qryfas, genome, model,exonerate='exonerate',bestn=0):
     '''
     Runs exonerate and parses output into lists for processing.
     >> qryfas:str = Query file, given to exonerate as its first positional argument.
     >> genome:str = Target file, given to exonerate as its second positional argument.
     >> model:str = Exonerate model. Added to the command as "--model MODEL" if given. Also used as the
         extension of the memsaver output file, BASEFILE.MODEL.
     >> exonerate:str ['exonerate'] = Command (or path) used to call exonerate.
     >> bestn:int [0] = Added to the command as "--bestn N" if non-zero.
     << query_dic:dict = { query: {'gff':[outputlines], 'cigar':[outputlines], 'alignment':[outputlines],
                                   'vulgar':[[headerlist], {header:value}, {header:value}, ...]} }

     Any extra options in the 'ExOpt' setting are split on whitespace and appended to the command.
     With MemSaver=T, output is directed to BASEFILE.MODEL and read back line by line. An existing file is
     reused unless self.force() is True, and afterwards the file is deleted (Cleanup=T) or else passed to
     self.gZip() (GZip=T). Otherwise all exonerate output is read into memory.
     Raises IOError if the memsaver exonerate call returns a non-zero exit code. All exceptions are logged
     with self.errorLog() and re-raised.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         EXFILE = None
         exfile = '%s.%s' % (self.baseFile(),model)  # Used in memsaver mode
         query_dic = {}
         header_list = ['query_id', 'query_start', 'query_end', 'query_strand', 'target_id', 'target_start', 'target_end', 'target_strand', 'score', '<label, query_length, target_length> triplets']
         excmd = [exonerate, qryfas, genome, '--showtargetgff', '--showcigar']
         if model: excmd += ['--model', model]
         if bestn: excmd += ['--bestn', '%d' % bestn]
         if self.getStrLC('ExOpt'): excmd += self.getStr('ExOpt').split()
         self.printLog('#RUN',' '.join(excmd))
         ### ~ [2] Run exonerate ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self.getBool('MemSaver'):
             gzfile = '%s.gz' % exfile
             if rje.exists(gzfile): self.gUnzip(gzfile)
             if rje.exists(exfile) and not self.force():
                 self.printLog('#EXFILE','Found %s (force=F). Assuming complete.' % exfile)
             else:
                 rje.backup(self,exfile)
                 self.printLog('#SAVER','memsaver=T: Exonerate output directed to %s.' % exfile)
                 # with-block closes the output handle whether or not the call succeeds
                 with open(exfile,'w') as EXOUT:
                     exfail = subprocess.call(excmd, stdout=EXOUT)
                 # NOTE(review): a failed run leaves its partial file behind, which a later force=F run
                 # will find and assume to be complete - consider removing it here.
                 if exfail: raise IOError('Exonerate call did not complete!')
                 self.printLog('#EXFILE','%s generated.' % exfile)
             EXFILE = open(exfile,'r')
             exlines = EXFILE    # Read lazily, one line at a time
         else:
             proc = Popen(excmd, stdout=PIPE)
             exlines = proc.stdout.readlines()
             proc.stdout.close()
             proc.wait()     # Reap the finished process. (Exit code is not checked in this mode.)
         ### ~ [3] Parse output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         output_format = ''
         try:
             for line in exlines:
                 line = rje.chomp(line)
                 if line:
                     # Query lines set the current query AND fall through to be stored by the checks below
                     if line.startswith('         Query:'):
                         query = line.split(':', 1)[1].split(' ')[1]
                     if line == 'C4 Alignment:':
                         output_format = 'alignment'
                     elif line == '# --- START OF GFF DUMP ---':
                         output_format = 'gff'
                     elif line.startswith('vulgar:'):
                         output_format = 'vulgar'
                         fields = line.split(' ', 10)[1:]
                         # First list element is the shared header list; then one {header:value} dict per line
                         if output_format in query_dic[query]:
                             query_dic[query][output_format].append({})
                         else:
                             query_dic[query][output_format] = [header_list, {}]
                         for header, field in zip(header_list, fields):
                             query_dic[query][output_format][-1][header] = field
                     elif line.startswith('cigar:'):
                         output_format = 'cigar'
                         query_dic[query].setdefault(output_format, []).append(line.replace('cigar: ', ''))
                     elif line in ('------------', '# --- END OF GFF DUMP ---', '#') or line.startswith(('Command line:', 'Hostname:', '-- completed exonerate analysis')):
                         pass
                     elif output_format:
                         query_dic.setdefault(query, {}).setdefault(output_format, []).append(line)
                 elif output_format == 'alignment':
                     # Blank lines are kept within alignments. Best effort only: skipped if no query has
                     # been read yet (NameError) or nothing is stored for it so far (KeyError).
                     try: query_dic[query][output_format].append(line)
                     except (KeyError, NameError): pass
                 self.vPrint(line,v=1)
         finally:
             if EXFILE is not None: EXFILE.close()
         ### ~ [4] Tidy and return ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if EXFILE is not None:
             if self.getBool('Cleanup'):
                 os.unlink(exfile)
                 self.printLog('#CLEAN','%s deleted.' % exfile)
             elif self.getBool('GZip'): self.gZip(exfile)
         return query_dic
     except: self.errorLog('%s.exonerate error' % self.prog()); raise