def _setupOutput(self):     ### Sets up output files self.str['MapFas','MissFas','MapRes']
    '''
    Sets up output files self.str['MapFas','MissFas','MapRes'].
    Side effects: may set self.bool['Append'], sets self.str entries for each output file,
    backs up existing output files and writes the MapRes header row.
    '''
    ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    delimit = rje.getDelimit(self.cmd_list)
    # Normalise StartFrom: empty/none means no restart point; otherwise switch to append mode
    # so that a resumed run does not clobber earlier results.
    if self.str['StartFrom'].lower() in ['','none']: self.str['StartFrom'] = ''
    else:
        self.bool['Append'] = True
        self.printLog('#CMD','StartFrom = "%s" so Append=T' % self.str['StartFrom'])
    ### ~ [1] General ResFile ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    # Map of self.str keys to output file suffixes; MissFas dropped when Combine=True.
    files = {'MapFas':'mapping.fas','MissFas':'missing.fas','MapRes':'mapping.%s' % rje.delimitExt(delimit)}
    if self.getBool('Combine'): files.pop('MissFas')
    # Default ResFile basename combines the input sequence file and mapping database names.
    if self.str['ResFile'].lower() in ['','none']:
        self.str['ResFile'] = '%s.%s' % (rje.baseFile(self.str['SeqIn']),rje.baseFile(self.str['MapDB'],strip_path=True))
    for file in files.keys():
        self.setStr({file: self.getStr('ResFile') + '.' + files[file]})
        rje.backup(self,self.getStr(file))  # Backup any pre-existing output file
    ### ~ [2] Headers for MapRes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    #!# Consider replacing with rje_db object? #!#
    self.list['Headers'] = ['Query','Hit','Method','MapRank','BlastRank','EVal','Score']
    for qh in ['Query','Hit']:
        self.list['Headers'] += ['%s_Species' % qh]
        # GABLAM statistics columns are added per Query/Hit when GablamOut=True.
        if self.bool['GablamOut']:
            for st in ['Len','Sim','ID']: self.list['Headers'] += ['%s_%s' % (qh,st)]
    rje.delimitedFileOutput(self,self.str['MapRes'],self.list['Headers'],delimit)
def save(self):     ### Saves parsed REST output to files
    '''
    Saves parsed REST output to files.
    Output location is RestOutDir + RestBase. Behaviour depends on the rest=X setting:
    a recognised single format restricts output to that key; 'full'/'text' writes one
    combined *.rest file; an unrecognised format warns and (interactively) offers full output.
    '''
    rbase = '%s%s' % (self.getStr('RestOutDir'), rje.baseFile(self.getStr('RestBase'), strip_path=True, keepext=True))
    rje.mkDir(self, self.getStr('RestOutDir'))
    outputs = rje.sortKeys(self.dict['Output'])
    if self.getStrLC('Rest') in outputs: outputs = [self.getStrLC('Rest')]   # Restrict to requested output only
    elif self.getStrLC('Rest') in ['full', 'text']:
        # Single combined text dump of all REST output.
        outfile = '%s.rest' % rbase
        open(outfile, 'w').write(self.restFullOutput())
        self.printLog('#OUT', '%s: %s' % (self.getStrLC('Rest'), outfile))
        return True
    elif self.getStrLC('Rest'):
        # Unrecognised format: warn, then optionally fall back to the full dump (interactive only).
        self.printLog('#OUTFMT', 'REST output format "%s" not recognised.' % self.getStrLC('Rest'))
        if self.i() < 0 or not rje.yesNo('Output all parsed outputs?'): return False
        outfile = '%s.rest' % rbase
        open(outfile, 'w').write(self.restFullOutput())
        self.printLog('#OUT', 'full: %s' % (outfile))
        return True
    # Write each parsed output to its own pre-assigned file; 'intro' has no file by design.
    for rkey in outputs:
        if rkey in self.dict['Outfile']:
            rje.backup(self, self.dict['Outfile'][rkey])
            open(self.dict['Outfile'][rkey], 'w').write(self.dict['Output'][rkey])
            self.printLog('#OUT', '%s: %s' % (rkey, self.dict['Outfile'][rkey]))
        elif rkey not in ['intro']: self.warnLog('No outfile parsed/generated for %s output' % rkey)
def release(self):  ### Generate the release information tables.
    '''
    Generate the release information tables.
    Backs up the previous release files (PrevBase.*) to BackBase.*, builds the release
    table from the Module table, and (re)generates the history table and readme.
    << Returns True on success; False (after logging) on any error.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        db = self.db()
        hdb = None  # History table: dir, module, version, update, release
        basefile = self.basefile()
        prevbase = self.getStr('PrevBase')
        backbase = self.getStr('BackBase')
        # Guard against clobbering: the backup base must differ from both source bases.
        if backbase == prevbase: raise ValueError('BackBase cannot match PrevBase ("%s")' % prevbase)
        if backbase == basefile: raise ValueError('BackBase cannot match BaseFile ("%s")' % basefile)
        ## ~ [1a] Load & Backup previous release ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        for sfile in ['release.tdt','history.tdt','readme.txt','updates.html']:
            pfile = '%s.%s' % (prevbase,sfile)
            bfile = '%s.%s' % (backbase,sfile)
            if os.path.exists(pfile):
                if os.path.exists(bfile): rje.backup(self,bfile)   # Backup existing backup first
                open(bfile,'w').write(open(pfile,'r').read())
                self.printLog('#BACK','%s => %s' % (pfile,bfile))
                if sfile == 'history.tdt':
                    hdb = db.addTable(filename=pfile,mainkeys=['Dir','Module','Version'],name='history',expect=True)
        ### ~ [2] Generate slimsuite.release.tdt based on *.Module.tdt ~~~~~~~~~~~~~~~~~~~ ###
        rdb = db.copyTable(self.db('Module'),'release')
        rdb.renameField('SourceDir','Dir')
        rdb.newKey(['Dir','Module'])
        rdb.dropFields(['File','Classes','Methods'])
        if 'release' in self.list['Output']: rdb.saveToFile(backup=False)
        ### ~ [3] Generate slimsuite.history.tdt, parsed from docstrings, based on pydoc.distribute() ~ ###
        if not hdb: hdb = db.addEmptyTable('history',['Dir','Module','Version','Update','Release'],['Dir','Module','Version'])
        self.makeHistory()
        # Generate slimsuite.readme.txt based on pydoc.saveDocs()
        if 'readme' in self.list['Output']: self.saveReadMe('%s.readme.txt' % basefile)
        return True
    except: self.errorLog('%s.release error' % self.prog()); return False
def readResults(self, clear=True, readaln=False):   ### Reads results from self.list['HMMRes'] into objects
    '''
    Reads results from self.list['HMMRes'] into objects.
    >> clear:boolean = whether to clear self.search before reading [True]
    >> readaln:boolean = whether to bother reading Alignments into objects [False]
    << Returns False (after logging) on error; otherwise None.
    '''
    try:
        if clear: self.search = []
        for resfile in rje.sortUnique(self.list['HMMRes'], xreplace=False):
            # Transparently gunzip results that were compressed by a previous run (GZip=T).
            if not os.path.exists(resfile) and self.opt['GZip'] and os.path.exists('%s.gz' % resfile):
                os.system('gunzip %s.gz' % resfile)
                self.printLog('#GUNZIP', 'Gunzipped %s.gz' % resfile)
            # Parser choice depends on whether output came from hmmpfam or hmmsearch.
            if self.opt['HMMPFam']: self.readHMMPFamSearch(resfile, readaln)
            else: self.readHMMSearch(resfile, readaln)
            # Re-compress after reading to save space; backup removes any stale .gz first.
            if self.opt['GZip'] and os.path.exists(resfile):
                rje.backup(self, '%s.gz' % resfile, unlink=True)
                os.system('gzip %s' % resfile)
                self.printLog('#GZIP', '%s gzipped to save space' % resfile)
    except:
        self.log.errorLog('Hmm indeed. rje_hmm.readResults() gone awry!', quitchoice=True)
        return False
def _run(self):     ### Controls main Class functions
    '''
    Controls main Class functions:
    * 1. Use hmmbuild to construct HMMs from input sequence files
    * 2. Search a sequence database with HMMs files
    * 3. Convert HMMer output into a delimited text file of results.
    << Returns True on success; False (after logging) on error.
    '''
    try:
        ### 1. Build ###
        for seqfile in self.list['MakeHMM']:
            hmmfile = self.buildHMM(seqfile)
            if hmmfile: self.list['HMM'].append(hmmfile)    # Only keep successful builds
        ### 2. Search ###
        self.deBug(self.list['HMM'])
        # Search only when HMMs exist, the database file is present, and an output file is named.
        if self.list['HMM'] and os.path.exists(self.info['SearchDB']) and self.info['HMMOut'].lower() not in ['', 'none']:
            rje.backup(self, self.info['HMMOut'], unlink=True)   # Remove/backup previous combined output
            for hmm in self.list['HMM']:
                self.list['HMMRes'].append(self.hmmSearch(hmm, outfile=self.info['HMMOut']))
        ### 3. Tabulate ###
        self.hmmTable(outfile=self.info['HMMTab'], append=self.opt['Append'])
        return True
    except:
        self.log.errorLog('Fatal Error during rje_hmm._run()', quitchoice=True)
        return False
def _run(self):     ### Controls main Class functions
    '''
    Controls main Class functions:
    * 1. Use hmmbuild to construct HMMs from input sequence files
    * 2. Search a sequence database with HMMs files
    * 3. Convert HMMer output into a delimited text file of results.
    << Returns True on success; False (after logging) on error.
    '''
    try:
        ### 1. Build ###
        for seqfile in self.list['MakeHMM']:
            hmmfile = self.buildHMM(seqfile)
            if hmmfile: self.list['HMM'].append(hmmfile)    # Only keep successful builds
        ### 2. Search ###
        self.deBug(self.list['HMM'])
        # Search only when HMMs exist, the database file is present, and an output file is named.
        if self.list['HMM'] and os.path.exists(self.info['SearchDB']) and self.info['HMMOut'].lower() not in ['','none']:
            rje.backup(self,self.info['HMMOut'],unlink=True)    # Remove/backup previous combined output
            for hmm in self.list['HMM']:
                self.list['HMMRes'].append(self.hmmSearch(hmm,outfile=self.info['HMMOut']))
        ### 3. Tabulate ###
        self.hmmTable(outfile=self.info['HMMTab'],append=self.opt['Append'])
        return True
    except:
        self.log.errorLog('Fatal Error during rje_hmm._run()',quitchoice=True)
        return False
def run(self, gtext=''):    ### Main run method
    '''
    Main run method.
    >> gtext:str [''] = Pre-supplied glossary text; when given, HTML is returned without file output.
    << Returns the generated HTML string.
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.setup(gtext)
        ### ~ [2] ~ Add main run code here ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        html = self.glossaryHTML()
        hobj = self.obj['HTML']
        # Build a "DD Mon YYYY" date string from time.asctime() fields.
        date = string.split(time.asctime(time.localtime(time.time())))
        date = '%s %s %s' % (date[2], date[1], date[-1])
        hobj.info['Copyright'] += '. Generated by rje_glossary.py'
        title = '%s' % self.getStr('Name')
        tabber = self.getStr('HTMLStyle').lower() == 'tab'
        frontpage = True
        # Wrap the glossary body with the standard HTML head/tail.
        html = '%s\n\n%s\n\n%s' % (hobj.htmlHead(title, tabber, frontpage), html, hobj.htmlTail(tabber))
        if not gtext:   # Replace with CGI option
            rje.backup(self, self.getStr('OutFile'), appendable=False)
            open(self.getStr('OutFile'), 'w').write(html)
            self.printLog('#HTML', '%s HTML output to %s' % (title, self.getStr('OutFile')))
        return html
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        raise   # Delete this if method error not terrible
def run(self):  ### Main run method
    '''
    Main run method. Computes pairwise table differences, per-table averages,
    and (optionally) a top-level HTML summary page.
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not self.setup(): return
        ### ~ [2] ~ Add main run code here ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        tables = self.db().tables()[0:]     # Copy so later table additions don't affect iteration
        ## ~ [2a] ~ Calculate Differences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # Each unordered pair of tables is compared exactly once.
        for table1 in tables:
            for table2 in tables[tables.index(table1) + 1:]: self.difference(table1, table2)
        ## ~ [2b] ~ Calculate Averages & Generate HTML ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        for table in self.db().tables()[0:]: self.average(table)
        ## ~ [2c] ~ Output HTML ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if self.getBool('TopHTML'):
            html = rje_html.HTML(self.log, self.cmd_list)
            hfile = '%s.html' % self.basefile()
            rje.backup(self, hfile)
            open(hfile, 'w').write(html.htmlHead(title=self.basefile(), tabber=False) + self.getStr('TopHTML') + html.htmlTail(False))
        return
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        raise   # Delete this if method error not terrible
def blast2fas(self):    ### Executes BLAST2FAS and copies results files
    '''
    Executes BLAST2FAS and copies results files.
    Sequences previously found to have no BLAST hits are tracked in a "null file"
    and skipped. << Returns True if BLAST2Fas was run, False if all inputs were current.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        need2blast = self.opt['Force']
        null_file = '%s.blast2fas_null.txt' % self.baseFile(); nx = 0; null_list = []
        if os.path.exists(null_file): null_list = string.split(open(null_file,'r').read(),'\n')
        self.debug(null_file)
        # Decide whether any HAQESAC input file is older than any BLAST database.
        for seq in self.seqs():
            if seq.info['AccNum'] in null_list: nx += 1; continue   # Known no-hit sequence
            hfile = rje.makePath('%s%s.fas' % (self.info['HaqDir'],seq.info['AccNum']),wholepath=True)
            for db in self.obj['SeqList'].list['Blast2Fas']:
                self.debug(rje.isYounger(hfile,db))
                self.debug(rje.isYounger(hfile,db) == hfile)
                # isYounger returns the younger file; hfile must be younger than every db.
                need2blast = need2blast or not rje.isYounger(hfile,db) == hfile
        if not need2blast:
            self.printLog('#BLAST','All HAQESAC input files found (%s w/o BLAST hits) - no BLAST2Fas (force=F)' % nx)
            return False
        ### ~ [2] Execute ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        rje.backup(self,null_file); nx = 0  # Start a fresh null file for this run
        # MultiCut takes precedence over BlastCut for limiting BLAST hit numbers.
        if self.getInt('MultiCut'): self.obj['SeqList'].cmd_list += ['blastb=%d' % self.getInt('MultiCut'),'blastv=%d' % self.getInt('MultiCut')]
        elif self.getInt('BlastCut'): self.obj['SeqList'].cmd_list += ['blastb=%d' % self.getInt('BlastCut'),'blastv=%d' % self.getInt('BlastCut')]
        if self.getInt('Forks'): self.obj['SeqList'].cmd_list += ['blasta=%d' % self.getInt('Forks')]
        rje_seq.Blast2Fas(self.obj['SeqList'],self.getStr('HAQBLASTDir'))
        # Move each per-sequence result into HaqDir and clear stale pickles; record misses.
        for seq in self.seqs():
            sbfile = '%s%s.blast.fas' % (self.getStr('HAQBLASTDir'),seq.info['AccNum'])
            if os.path.exists(sbfile):
                hfile = rje.makePath('%s%s.fas' % (self.info['HaqDir'],seq.info['AccNum']),wholepath=True)
                os.rename(sbfile,hfile)
                if os.path.exists('%s.pickle' % rje.baseFile(hfile)): os.unlink('%s.pickle' % rje.baseFile(hfile))
                if os.path.exists('%s.pickle.gz' % rje.baseFile(hfile)): os.unlink('%s.pickle.gz' % rje.baseFile(hfile))
            else:
                open(null_file,'a').write('%s\n' % seq.info['AccNum']); nx += 1
        if nx: self.printLog('#BLAST','%s Accession Numbers without BLAST2Fas hits output to %s' % (nx,null_file))
        self.printLog('#BLAST','%s HAQESAC input files made using BLAST2Fas' % (self.seqNum()-nx))
        return True
    except: self.errorLog('Major problem with MultiHAQ.blast2fas'); raise
def release(self):  ### Generate the release information tables.
    '''
    Generate the release information tables.
    Backs up the previous release files (PrevBase.*) to BackBase.*, builds the release
    table from the Module table, and (re)generates the history table and readme.
    << Returns True on success; False (after logging) on any error.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        db = self.db()
        hdb = None  # History table: dir, module, version, update, release
        basefile = self.basefile()
        prevbase = self.getStr('PrevBase')
        backbase = self.getStr('BackBase')
        # Guard against clobbering: the backup base must differ from both source bases.
        if backbase == prevbase: raise ValueError('BackBase cannot match PrevBase ("%s")' % prevbase)
        if backbase == basefile: raise ValueError('BackBase cannot match BaseFile ("%s")' % basefile)
        ## ~ [1a] Load & Backup previous release ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        for sfile in ['release.tdt', 'history.tdt', 'readme.txt', 'updates.html']:
            pfile = '%s.%s' % (prevbase, sfile)
            bfile = '%s.%s' % (backbase, sfile)
            if os.path.exists(pfile):
                if os.path.exists(bfile): rje.backup(self, bfile)   # Backup existing backup first
                open(bfile, 'w').write(open(pfile, 'r').read())
                self.printLog('#BACK', '%s => %s' % (pfile, bfile))
                if sfile == 'history.tdt':
                    hdb = db.addTable(filename=pfile, mainkeys=['Dir', 'Module', 'Version'], name='history', expect=True)
        ### ~ [2] Generate slimsuite.release.tdt based on *.Module.tdt ~~~~~~~~~~~~~~~~~~~ ###
        rdb = db.copyTable(self.db('Module'), 'release')
        rdb.renameField('SourceDir', 'Dir')
        rdb.newKey(['Dir', 'Module'])
        rdb.dropFields(['File', 'Classes', 'Methods'])
        if 'release' in self.list['Output']: rdb.saveToFile(backup=False)
        ### ~ [3] Generate slimsuite.history.tdt, parsed from docstrings, based on pydoc.distribute() ~ ###
        if not hdb:
            hdb = db.addEmptyTable('history', ['Dir', 'Module', 'Version', 'Update', 'Release'], ['Dir', 'Module', 'Version'])
        self.makeHistory()
        # Generate slimsuite.readme.txt based on pydoc.saveDocs()
        if 'readme' in self.list['Output']: self.saveReadMe('%s.readme.txt' % basefile)
        return True
    except:
        self.errorLog('%s.release error' % self.prog())
        return False
def mapTaxa(self,taxin,taxout=['spcode'],nodeonly=False,rankonly=False,savetaxout=True):    ### Takes a list of Taxa and returns mapped Taxa data
    '''
    Takes a list of Taxa and returns mapped Taxa data.
    >> taxin:str or list of taxon identifiers to map from.
    >> taxout:str or list of taxa output formats
    >> nodeonly:bool = whether to limit TaxID mapping to the precise matching nodes (else include children)
    >> rankonly:bool = whether to limit TaxID to those matching self.list['RankTypes'] taxon types.
    >> savetaxout:bool [True] = Whether to save the TaxOut list to a text file
    << taxoutlist:list of mapped taxa if taxout is a string, OR
    << taxoutdict:dict of mapped taxa if taxout is a list
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Duck-type test: only a list has sort(); a string taxout is wrapped in a list.
        tlist = True
        try: taxout.sort()
        except: tlist = False
        if tlist:
            if not taxout: return {}
        else:
            taxout = [taxout]
            # NOTE(review): the original flat source places this guard ambiguously;
            # an empty string taxout returns an empty list here.
            if not taxout: return []
        ### ~ [2] ~ Map to TaxID ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        taxid = self.mapToTaxID(self.list['TaxIn'],nodeonly,rankonly)
        if self.list['RestrictID']:
            tx = len(taxid)
            taxid = rje.listIntersect(taxid,self.list['RestrictID'])    # Keep only whitelisted TaxIDs
            self.printLog('#TAXID','%s of %s TaxID in %s Restricted IDs.' % (rje.iLen(taxid),rje.iStr(tx),rje.iLen(self.list['RestrictID'])))
        ### ~ [3] ~ Map TaxID and output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        taxdict = {}; taxoutdict = {}
        for taxout in self.list['TaxOut']:
            taxout = taxout.lower()
            if taxout == 'taxid': taxoutlist = taxid
            elif taxout in ['spcode','name','common']:
                if not taxdict: taxdict = self.taxDict(taxid)   # Lazy-load the taxonomy dictionary once
                taxoutlist = []
                for t in taxid:
                    try: taxoutlist.append(taxdict[t][taxout])
                    except: self.warnLog('No "%s" data for TaxID %s' % (taxout, t),'Missing_%s' % taxout,suppress=True)
                taxoutlist.sort()
            else: self.errorLog('TaxOut format "%s" not recognised' % taxout,printerror=False); continue
            taxoutdict[taxout] = taxoutlist
            if savetaxout:
                if not taxoutlist: self.printLog('#OUT','No %s IDs to output' % taxout); continue
                tfile = '%s.%s.txt' % (self.baseFile(),taxout)
                rje.backup(self,tfile)
                open(tfile,'w').write(string.join(taxoutlist,'\n'))
                self.printLog('#OUT','%s %s IDs output to %s.' % (rje.iLen(taxoutlist), taxout, tfile))
        if tlist: return taxoutdict
        return taxoutlist
    except: self.errorLog('Problem during %s mapTaxa.' % self); raise
def outputSchema(self,format='txt',filename='schema.txt'):  ### Formats and outputs schema
    '''
    Formats and outputs schema.
    >> format:str [txt] = Type of output format (currently unused by this method)
    >> filename:str [schema.txt] = Name for output file
    '''
    # Initialise before the try so the except handler cannot hit a NameError when
    # the exception occurs before self.dict['Schema'] is successfully read.
    schema = None
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        schema = self.dict['Schema']
        rje.backup(self,filename)   # Backup any existing output file
        level = 1                   # Start at the top level of schema nesting
        ### ~ [2] Process Schema ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.schedict(schema,filename,level)
    except:
        self.log.errorLog('Problem outputting schema')
        # Debug dump of the schema, but only if it was actually retrieved.
        if schema is not None: print(schema)
def depthChargeForker(self):    ### Main DepthCharge forking method
    '''
    Work through each sequence and fork it out for DepthCharge analysis.
    Previously processed sequences (recorded in *.depthcharge.tdt with type 'all')
    are skipped unless force=T. << Returns the results database table, or None on error.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seqin = self.seqinObj()
        self.list['ToFork'] = seqin.list['Seq'][0:]     # Copy: sequences may be removed below
        resfile = '{0}.depthcharge.tdt'.format(self.baseFile())
        if self.force():
            # BUGFIX: rje.backup() takes the calling object as its first argument
            # (cf. every other rje.backup(self, file, ...) call); previously the
            # filename was passed in the object slot and the backup was broken.
            rje.backup(self, resfile, appendable=False)
        elif rje.exists(resfile):
            # Resume mode: load previous results and drop completed sequences.
            ddb = self.db().addTable(resfile, ['seqname', 'start', 'end', 'type'])
            ddb.dataFormat({'start': 'int', 'end': 'int'})
            complete = ddb.indexDataList('type', 'all', 'seqname')
            if complete:
                cx = 0
                for seq in self.list['ToFork'][0:]:
                    if seqin.shortName(seq) in complete:
                        self.list['ToFork'].remove(seq)
                        cx += 1
                if cx: self.printLog('#SKIP', 'Skipping {0} previously processed sequences (force=F)'.format(rje.iStr(cx)))
            if not self.list['ToFork']:
                self.printLog('#CHARGE', 'All sequences previously processed (force=F)')
                return ddb
        # Prime the fork pool up to the Forks limit before entering the main loop.
        while len(self.list['Forked']) < self.getNum('Forks') and self.list['ToFork']: self.nextFork()
        ### ~ [2] ~ Work through each sequence and fork out ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.forking()
        self.printLog('#FORK', 'Forking of %s jobs completed.' % (rje.iStr(seqin.seqNum())), log=self.getBool('LogFork'))
        # Reload the (now complete) results table.
        ddb = self.db().addTable(resfile, ['seqname', 'start', 'end', 'type'], replace=True)
        ddb.dataFormat({'start': 'int', 'end': 'int'})
        return ddb
    except: self.errorLog('%s.depthChargeForker error' % self.prog())
def outputSchema(self, format='txt', filename='schema.txt'):    ### Formats and outputs schema
    '''
    Formats and outputs schema.
    >> format:str [txt] = Type of output format (currently unused by this method)
    >> filename:str [schema.txt] = Name for output file
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        schema = self.dict['Schema']
        rje.backup(self, filename)  # Backup any existing output file
        level = 1                   # Start at the top level of schema nesting
        ### ~ [2] Process Schema ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.schedict(schema, filename, level)
    except:
        self.log.errorLog('Problem outputting schema')
        # NOTE(review): if the exception fired before `schema` was assigned, this
        # debug dump itself raises NameError — confirm and guard if that matters.
        print schema
def peptCluster(self):  ### Performs actual peptide clustering and stores results in self.obj['Tree']
    '''
    Performs actual peptide clustering and stores results in self.obj['Tree'].
    Method is chosen via self.getStr('PeptCluster'): 'neighbor' uses Phylip neighbor-joining;
    'upgma'/'wpgma' (or anything unrecognised, which falls back to UPGMA) use the PeptDis matrix.
    '''
    try:### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        base = rje.baseFile(self.getStr('SaveDis'))
        pretree = ['treeformats=nwk,text','basefile=%s' % base]
        ### ~ [1] ~ Phylip Neighbor method ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.getStr('PeptCluster') == 'neighbor':
            disfile = '%s.phy' % base
            fasfile = '%s.fas' % base
            treecmd = ['autoload=T','maketree=neighbor','disin=%s' % disfile,'seqin=%s' % fasfile]
            pretree += ['root=mid']
            if disfile != self.getStr('SaveDis'):
                rje.backup(self,disfile)
                self.obj['PeptDis'].saveMatrix(filename=disfile,format='phylip')    ### Saves matrix
            # Write each peptide as its own sequence (name == sequence) for the tree input.
            if 'peptides=%s' % fasfile not in self.cmd_list:
                rje.backup(self,fasfile)
                FAS = open(fasfile,'w')
                for pep in self.list['Peptides']: FAS.write('>%s\n%s\n' % (pep,pep))
                FAS.close()
            tree = self.obj['Tree'] = rje_tree.Tree(self.log,pretree+self.cmd_list+treecmd)
        ### ~ [2] ~ UPGMA method ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        else:
            # Unrecognised methods fall back to UPGMA (basefile renamed accordingly).
            if self.getStr('PeptCluster') not in ['wpgma','upgma']:
                self.errorLog('PeptCluster method "%s" not recognised. Will use UPGMA' % self.getStr('PeptCluster'),printerror=False)
                base = string.replace(base,self.getStr('PeptCluster'),'upgma')
                pretree += ['basefile=%s' % base]
            if self.getStr('PeptCluster') == 'upgma': nsftree = self.obj['PeptDis'].upgma()
            elif self.getStr('PeptCluster') == 'wpgma': nsftree = self.obj['PeptDis'].wpgma()
            #nwkfile = '%s.nwk' % base
            #treecmd += ['nsfin=%s' % nwkfile]
            #rje.backup(self,nwkfile)
            #open(nwkfile,'w').write(nsftree)
            treecmd = ['autoload=F']
            tree = self.obj['Tree'] = rje_tree.Tree(self.log,pretree+self.cmd_list+treecmd)
            tree.buildTree(nsftree)
        ### ~ [3] ~ Outputs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Number tree leaves by their position in the peptide list (1-based).
        for node in tree.node:
            if node.info['Name'] in self.list['Peptides']:
                node.stat['ID'] = self.list['Peptides'].index(node.info['Name']) + 1
        tree.saveTrees()
        for outfmt in tree.list['TreeFormats']:
            treefile = '%s.%s' % (tree.info['Basefile'],rje_tree.formatext[outfmt])
            self.dict['Output'][outfmt] = treefile
    except: self.errorLog('%s.peptDis error' % self);
def saveTimePoints(self,filename='',format='tdt',entries=[]):   ### Saves TimePoints to a file
    '''
    Saves TimePoints to a file from main TimePoints table.
    >> filename:str [''] = Output filename. Will use basefile if none given.
    >> format:str ['tdt'] = Output file format (csv/tsv/txt/db)
    >> entries:list [] = Entries from main table to output. (All if none given).
    << Returns False (after logging) on error; otherwise None.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        db = self.db('TimePoints')
        # Derive format from the filename extension if not given explicitly.
        if format.lower() in ['','none']: format = string.split(filename.lower(),'.')[-1]
        if not filename: filename = '%s.%s' % (self.basefile(),format)
        if not entries: entries = db.entries()
        ### ~ [2] Save to file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [2a] Simple delimited file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if format in ['csv','tdt']:
            self.blanksToEmpty()
            rje.delimitedFileOutput(self,filename,db.fields(),rje_backup=True)  # Header row (with backup)
            for entry in entries: rje.delimitedFileOutput(self,filename,db.fields(),datadict=entry)
        ## ~ [2b] Text file output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        else:
            self.emptyToBlank()
            rje.backup(self,filename)
            OUT = open(filename,'a')
            for entry in entries:
                if format == 'db':
                    # SQL-style value tuple: "(v1, v2, ...);"
                    outlist = []
                    for field in db.fields(): outlist.append(entry[field])
                    out_txt = '%s' % outlist
                    OUT.write('(%s);\n' % out_txt[1:-1])
                else:
                    # Example: American Independence. (TimePoint) 1776 AD, 4 July. The US declared independence from the British Empire. Source: <http://en.wikipedia.org/wiki/United_States_Declaration_of_Independence>[Wikipedia]. (Keywords: history)
                    out_text = '%s. (TimePoint) ' % entry['TimePoint Name']
                    if entry['month'] in ['','blank']: out_text += '%s %s.' % (entry['Year'],entry['yearUnit'])
                    else: out_text += '%s %s, %s %s.' % (entry['Year'],entry['yearUnit'],entry['month'],entry['day'])
                    out_text = '%s %s Source: <%s>[%s].' % (out_text,entry['TimePoint Description'],entry['Source URL'],entry['Source URL'])
                    # Append up to five non-blank keywords.
                    klist = []
                    for i in range(1,6):
                        if entry['keyword%d' % i] not in ['','blank']: klist.append(entry['keyword%d' % i])
                    out_text = '%s (Keywords: %s)' % (out_text,string.join(klist,', '))
                    OUT.write('%s\n' % out_text)
        self.printLog('#OUT','%d entries output to %s' % (len(entries),filename))
    except: self.errorLog('%s.saveTimePoints(%s) error' % (self,filename)); return False
def readResults(self,clear=True,readaln=False):     ### Reads results from self.list['HMMRes'] into objects
    '''
    Reads results from self.list['HMMRes'] into objects.
    >> clear:boolean = whether to clear self.search before reading [True]
    >> readaln:boolean = whether to bother reading Alignments into objects [False]
    << Returns False (after logging) on error; otherwise None.
    '''
    try:
        if clear: self.search = []
        for resfile in rje.sortUnique(self.list['HMMRes'],xreplace=False):
            # Transparently gunzip results that were compressed by a previous run (GZip=T).
            if not os.path.exists(resfile) and self.opt['GZip'] and os.path.exists('%s.gz' % resfile):
                os.system('gunzip %s.gz' % resfile)
                self.printLog('#GUNZIP','Gunzipped %s.gz' % resfile)
            # Parser choice depends on whether output came from hmmpfam or hmmsearch.
            if self.opt['HMMPFam']: self.readHMMPFamSearch(resfile,readaln)
            else: self.readHMMSearch(resfile,readaln)
            # Re-compress after reading to save space; backup removes any stale .gz first.
            if self.opt['GZip'] and os.path.exists(resfile):
                rje.backup(self,'%s.gz' % resfile,unlink=True)
                os.system('gzip %s' % resfile)
                self.printLog('#GZIP','%s gzipped to save space' % resfile)
    except:
        self.log.errorLog('Hmm indeed. rje_hmm.readResults() gone awry!',quitchoice=True)
        return False
def hmmSearch(self,hmm,dbase=None,outfile=None,wait=True):  ### Performs HMMer Search using object attributes
    '''
    Performs HMMer Search using object attributes.
    >> hmm:str = Name of HMM file
    >> dbase:str = Name of DBase file [self.info['SearchDB']]
    >> outfile:str = Name of Output file file [self.info['HMMOut']]
    >> wait:boolean = whether to wait for HMMer. [True]
    << returns outfile or None if fails
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [1a] ~ Input files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if not rje.checkForFile(hmm): self.printLog('#ERR','HMM file %s is missing!' % hmm); return None
        if not dbase: dbase = self.info['SearchDB']
        if not rje.checkForFile(dbase): self.printLog('#ERR','Database file "%s" is missing!' % dbase); return None
        ## ~ [1b] ~ Output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if not outfile or outfile.lower() in ['','none']:   # Make an outfile per search
            outfile = '%s.%s.hmmer' % (rje.baseFile(hmm,True),rje.baseFile(dbase,True))
        resfile = outfile
        # A gzipped copy of the results counts as existing output unless Force=T.
        if not os.path.exists(outfile) and self.opt['GZip'] and os.path.exists('%s.gz' % outfile) and not self.opt['Force']:
            resfile = '%s.gz' % outfile
        # Reuse existing results only when they are newer than both the HMM and the database.
        if not self.opt['Force'] and rje.isYounger(resfile,hmm) == resfile and rje.isYounger(resfile,dbase) == resfile:
            self.printLog('#HMM','HMM results file "%s" exists.' % resfile)
            return outfile      # Already exists
        else: rje.backup(self,outfile,unlink=True)
        ### ~ [2] ~ HMM Search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.opt['HMMPFam']:
            _command = 'hmmpfam --cut_ga %s %s %s > %s' % (string.join(self.list['HMMOptions']),hmm,dbase,outfile)
        else: _command = 'hmmsearch %s %s %s > %s' % (string.join(self.list['HMMOptions']),hmm,dbase,outfile)
        self.log.printLog('#HMM',_command)
        # wait=False launches HMMer in the background and returns immediately.
        if not wait: os.system(self.info['HMMerPath'] + _command + ' &')
        elif not os.path.exists(outfile) or self.opt['Force']:
            open(outfile,'a').write(os.popen(self.info['HMMerPath'] + _command).read())
            self.printLog('#HMM','Outfile produced for %s: %s.' % (hmm,outfile))
            if self.opt['GZip']:
                rje.backup(self,'%s.gz' % outfile,unlink=True)  # Remove stale gz before compressing
                os.system('gzip %s' % outfile)
                self.printLog('#GZIP','%s gzipped to save space' % outfile)
        return outfile
    except:
        self.log.errorLog('Fatal Error during hmmSearch(%s)' % hmm)
        return None
def run(self,gtext=''):     ### Main run method
    '''
    Main run method.
    >> gtext:str [''] = Pre-supplied glossary text; when given, HTML is returned without file output.
    << Returns the generated HTML string.
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.setup(gtext)
        ### ~ [2] ~ Add main run code here ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        html = self.glossaryHTML()
        hobj = self.obj['HTML']
        # Build a "DD Mon YYYY" date string from time.asctime() fields.
        date = string.split(time.asctime(time.localtime(time.time())))
        date = '%s %s %s' % (date[2],date[1],date[-1])
        hobj.info['Copyright'] += '. Generated by rje_glossary.py'
        title = '%s' % self.getStr('Name')
        tabber = self.getStr('HTMLStyle').lower() == 'tab'
        frontpage = True
        # Wrap the glossary body with the standard HTML head/tail.
        html = '%s\n\n%s\n\n%s' % (hobj.htmlHead(title,tabber,frontpage),html,hobj.htmlTail(tabber))
        if not gtext:   # Replace with CGI option
            rje.backup(self,self.getStr('OutFile'),appendable=False)
            open(self.getStr('OutFile'),'w').write(html)
            self.printLog('#HTML','%s HTML output to %s' % (title,self.getStr('OutFile')))
        return html
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        raise   # Delete this if method error not terrible
def run(self):  ### Main run method
    '''
    Main run method. Computes pairwise table differences, per-table averages,
    and (optionally) a top-level HTML summary page.
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not self.setup(): return
        ### ~ [2] ~ Add main run code here ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        tables = self.db().tables()[0:]     # Copy so later table additions don't affect iteration
        ## ~ [2a] ~ Calculate Differences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # Each unordered pair of tables is compared exactly once.
        for table1 in tables:
            for table2 in tables[tables.index(table1)+1:]: self.difference(table1,table2)
        ## ~ [2b] ~ Calculate Averages & Generate HTML ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        for table in self.db().tables()[0:]: self.average(table)
        ## ~ [2c] ~ Output HTML ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if self.getBool('TopHTML'):
            html = rje_html.HTML(self.log,self.cmd_list)
            hfile = '%s.html' % self.basefile()
            rje.backup(self,hfile)
            open(hfile,'w').write(html.htmlHead(title=self.basefile(),tabber=False)+self.getStr('TopHTML')+html.htmlTail(False))
        return
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        raise   # Delete this if method error not terrible
def haqBatch(self, force=False):    ### Generates Batch and INI files for HAQESAC runs
    '''
    Generates Batch and INI files for HAQESAC runs.
    >> force:bool [False] = regenerate files even if they already exist (also triggered by self.force()).
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        batfile = rje.makePath('%shaqesac.bat' % self.info['HaqDir'], wholepath=True)
        inifile = rje.makePath('%shaqesac.ini' % self.info['HaqDir'], wholepath=True)
        # Regenerate only when forced or either file is missing; otherwise keep existing files.
        if force or self.force() or not rje.exists(batfile) or not rje.exists(inifile):
            rje.backup(self, batfile)
            rje.backup(self, inifile)
        else:
            return self.printLog('#HAQBAT', 'HAQESAC Batch files found.')
        ### ~ [1] Make INI File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Pass through the current commandline minus any ini=X setting.
        haqcmd = []
        for cmd in self.cmd_list:
            if cmd[:4].lower() != 'ini=': haqcmd.append(cmd)
        if self.opt['MultiHAQ']: haqcmd += ['multihaq=T', 'force=F']
        open(inifile, 'w').write(string.join(haqcmd, '\n'))
        ### ~ [2] Make Batch file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # One haqesac.py call per sequence, keyed by accession number.
        for seq in self.seqs():
            acc = seq.info['AccNum']
            haqcmd = ['seqin=%s.fas' % acc, 'query=%s' % acc, 'basefile=%s' % acc]
            open(batfile, 'a').write('python %shaqesac.py %s\n' % (self.info['Path'], string.join(haqcmd)))
        self.printLog('#HAQBAT', 'HAQESAC Batch file output to %s' % batfile)
    except: self.errorLog('Major problem with MultiHAQ.haqBatch', quitchoice=True)
def save(self):     ### Saves parsed REST output to files
    '''
    Saves parsed REST output to files.
    Output location is RestOutDir + RestBase. Behaviour depends on the rest=X setting:
    a recognised single format restricts output to that key; 'full'/'text' writes one
    combined *.rest file; an unrecognised format warns and (interactively) offers full output.
    '''
    rbase = '%s%s' % (self.getStr('RestOutDir'),rje.baseFile(self.getStr('RestBase'),strip_path=True,keepext=True))
    rje.mkDir(self,self.getStr('RestOutDir'))
    outputs = rje.sortKeys(self.dict['Output'])
    if self.getStrLC('Rest') in outputs: outputs = [self.getStrLC('Rest')]  # Restrict to requested output only
    elif self.getStrLC('Rest') in ['full','text']:
        # Single combined text dump of all REST output.
        outfile = '%s.rest' % rbase
        open(outfile,'w').write(self.restFullOutput())
        self.printLog('#OUT','%s: %s' % (self.getStrLC('Rest'),outfile))
        return True
    elif self.getStrLC('Rest'):
        # Unrecognised format: warn, then optionally fall back to the full dump (interactive only).
        self.printLog('#OUTFMT','REST output format "%s" not recognised.' % self.getStrLC('Rest'))
        if self.i() < 0 or not rje.yesNo('Output all parsed outputs?'): return False
        outfile = '%s.rest' % rbase
        open(outfile,'w').write(self.restFullOutput())
        self.printLog('#OUT','full: %s' % (outfile))
        return True
    # Write each parsed output to its own pre-assigned file; 'intro' has no file by design.
    for rkey in outputs:
        if rkey in self.dict['Outfile']:
            rje.backup(self,self.dict['Outfile'][rkey])
            open(self.dict['Outfile'][rkey],'w').write(self.dict['Output'][rkey])
            self.printLog('#OUT','%s: %s' % (rkey,self.dict['Outfile'][rkey]))
        elif rkey not in ['intro']: self.warnLog('No outfile parsed/generated for %s output' % rkey)
def uniFake(self,seqs=[],store=False):  ### Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs.
    '''
    Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs given.
    >> seqs:list [] = optional list of sequence objects to process instead of self.obj['SeqList'].seq.
    >> store:bool [False] = if True, keep all entries in the UniProt object and save in one go at the end;
       if False, save each entry to the DAT file as it is processed (append mode).
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # List of lower-case analysis keywords ('pfam', 'tmhmm', 'signalp', 'disorder', ...) to run.
        unifake = string.split(string.join(self.list['UniFake']).lower())
        seqlist = self.obj['SeqList']
        if seqs: seqlist.seq = seqs
        else: seqs = seqlist.seq
        (sx,seqnum) = (0,seqlist.seqNum())
        ## ~ [1b] Setup UniProt object and output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        uniprot = rje_uniprot.UniProt(self.log,self.cmd_list)   # UniProt object for saving data
        if self.info['DatOut'].lower() in ['','none']: self.info['DatOut'] = rje.baseFile(seqlist.info['Name']) + '.dat'
        datfile = self.info['DatOut']
        if os.path.exists(datfile): rje.backup(self,datfile)
        if store: seqlist.obj['UniProt'] = uniprot
        ## ~ [1c] Setup RJE_HMM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if 'pfam' in unifake:
            hmm = rje_hmm.HMMRun(self.log,self.cmd_list+['force=T'])
            hmmfile = '%s.pfam.tdt' % rje.baseFile(datfile)     # Delimited table of PFam hits
            if os.path.exists(hmmfile): rje.backup(self,hmmfile)
            hmm.list['HMM'] = [self.info['PFam']]
            hmm.opt['HMMPFam'] = True
        else: hmm = None
        ## ~ [1d] Setup RJE_TM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if 'signalp' in unifake: tm = rje_tm.TM(self.log,self.cmd_list)
        else: tm = None
        ### ~ [2] ~ Perform UniFake processing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for seq in seqs:
            sx += 1
            name = seq.shortName()
            self.printLog('#SEQ','Processing %s (%s aa) %s...' % (seq.shortName(),rje.integerString(seq.aaLen()),seq.info['Description'][:50]))
            try:
                ## ~ [2a] ~ Basic data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                # Temporary per-sequence fasta file used as input for the external prediction tools.
                utmp = 'tmp%s.%s' % (rje.randomString(5),seq.info['AccNum'])
                open('%s.fas' % utmp,'w').write('>%s\n%s\n' % (seq.shortName(),seq.info['Sequence']))
                udata = {'CC':['-!- Features generated using unifake.py'],'AC':[]}  # UniProt-style data lines
                if seq.info['SpecCode'] in ['Unknown','UNK']: seq.info['SpecCode'] = self.info['SPCode']
                #x#elif seq.info['Species'] != 'None': udata['OS'] = [seq.info['Species']]    #!# Check how well this works. Add spectable? #!#
                ## ~ [2b] ~ Aliases ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                # EnsDat names carry "[acc:X pep:Y gene:Z]"; register each as an alias of the AccNum.
                if self.opt['EnsDat'] and rje.matchExp('\[acc:(\S+) pep:(\S+) gene:(\S+)\]',seq.info['Name']):
                    details = rje.matchExp('\[acc:(\S+) pep:(\S+) gene:(\S+)\]',seq.info['Name'])
                    self.addAlias(seq.info['AccNum'],details[0])
                    self.addAlias(seq.info['AccNum'],details[1])
                    self.addAlias(seq.info['AccNum'],details[2])
                    udata['GN'] = [details[2]]      # Gene name from the EnsDat description
                for id in [seq.shortName(),seq.info['AccNum']]:
                    if id in self.dict['Aliases']: udata['AC'].append('%s;' % string.join(self.dict['Aliases'][id],'; '))
                ## ~ [2c] ~ Features ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                ft = []     # List of feature dictionaries {'Type','Desc','Start','End'} for sequence
                for id in [seq.shortName(),seq.info['AccNum'],seq.info['ID']]:
                    if id in self.dict['Features']: ft += self.dict['Features'][id]
                ## ~ [2d] IUPRED disorder prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if 'disorder' in self.list['UniFake']:
                    try:
                        seq.disorder()
                        dis = seq.obj['Disorder']
                        for disorder in seq.obj['Disorder'].list['RegionDisorder']:
                            ft.append({'Type':'DISORDER','Desc':'Predicted disorder: %s' % seq.obj['Disorder'].info['Disorder'],'Start':disorder[0],'End':disorder[1]})
                            if dis.info['Disorder'].lower() == 'iupred': ft[-1]['Desc'] = '%s > %.2f' % (ft[-1]['Desc'],dis.stat['IUCut'])
                        for fold in seq.obj['Disorder'].list['RegionFold']:
                            ft.append({'Type':'ORDER','Desc':'Predicted order: %s' % seq.obj['Disorder'].info['Disorder'],'Start':fold[0],'End':fold[1]})
                            if dis.info['Disorder'].lower() == 'iupred': ft[-1]['Desc'] = '%s <= %.2f' % (ft[-1]['Desc'],dis.stat['IUCut'])
                    except: self.log.errorLog('UniFake disorder problem for %s.' % name)
                ## ~ [2e] PFam HMM domain prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if hmm:
                    try:
                        hmm.setInfo({'SearchDB':'%s.fas' % utmp,'HMMOut':'%s.hmm.out' % utmp})  # This will be made for each sequence
                        hmm.search = []
                        hmm.list['HMMRes'] = [hmm.hmmSearch(self.info['PFam'],outfile=hmm.info['HMMOut'])]   # Used in hmmTable
                        hmm.hmmTable(outfile=hmmfile,append=True)
                        if 'disorder' in self.list['UniFake']: disorder = seq.obj['Disorder'].list['ResidueDisorder']    # individual (IUPRed) residue results
                        else: disorder = []
                        if hmm.search: udata['CC'].append('PFam: HMMer PFam search vs %s (Modified %s)' % (self.info['PFam'],time.ctime(os.path.getmtime(self.info['PFam']))))
                        else:
                            udata['CC'].append('-!- ERROR: PFam HMMer Search failure!')
                            out = {'Type':'!ERROR!','Name':name}
                            rje.delimitedFileOutput(self,hmmfile,['Type','Name','Start','End','Eval','Score'],datadict=out)
                        for search in hmm.search:
                            for hit in search.hit:
                                for aln in hit.aln:
                                    pfamft = {'Start':aln.stat['SbjStart'],'End':aln.stat['SbjEnd'],'Type':'PFAM',
                                              'Desc':'%s PFam HMM Eval: %.2e; Score: %.1f' % (search.info['Name'],aln.stat['Expect'],aln.stat['BitScore'])}
                                    if disorder:
                                        # Mean residue disorder over the hit region; ordered hits become DOMAIN features.
                                        region = disorder[aln.stat['SbjStart']-1:aln.stat['SbjEnd']]
                                        hmmdisorder = float(sum(region)) / len(region)
                                        pfamft['Desc'] = '%s; IUPRed: %.2f' % (pfamft['Desc'],hmmdisorder)
                                        if hmmdisorder < self.stat['DisDom']: pfamft['Type'] = 'DOMAIN'
                                    ft.append(pfamft)
                    except: self.log.errorLog('UniFake PFam HMM problem for %s.' % name)
                ## ~ [2f] TMHMM transmembrane topology prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if 'tmhmm' in unifake:
                    try:
                        tmdat = os.popen('%s %s.fas -short' % (self.info['TMHMM'],utmp)).readlines()
                        domlist = rje_tm.domainList(rje_tm.parseTMHMM(tmdat[0]))
                        for tmdom in domlist:
                            ft.append(tmdom)
                            ft[-1]['Desc'] = 'TMHMM topology prediction'
                            ft[-1]['Start'] = string.atoi(ft[-1]['Start'])
                            ft[-1]['End'] = string.atoi(ft[-1]['End'])
                        # Domain list alternates inside/TM/outside; (len-1)/2 gives the TM helix count.
                        if len(domlist) > 1: udata['CC'].append('TMHMM: %d TM domains; N-Term %s' % ((len(domlist)-1)/2,domlist[0]['Type']))
                        else: udata['CC'].append('TMHMM: 0 TM domains')
                    except: self.log.errorLog('UniFake TMHMM problem for %s.' % name)
                ## ~ [2g] SIGNALP signal peptide prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if 'signalp' in unifake:
                    try:
                        os.system('%s -f short -t euk %s.fas > %s.signalp' % (self.info['SignalP'],utmp,utmp))
                        tm.signalp = {}
                        tm.parseSignalP('%s.signalp' % utmp)
                        sigp = tm.signalp.pop(seq.shortName())
                        cpos = 0    # Predicted cleavage position (0 = no signal peptide predicted)
                        if sigp['nn_ymax?'] == 'Y':
                            cpos = string.atoi(sigp['nn_ymaxpos'])
                            desc = 'SignalP NN prediction'
                        if sigp['hmm_cmax?'] == 'Y':
                            hmm_c = string.atoi(sigp['hmm_cmaxpos'])
                            if cpos == 0:
                                cpos = hmm_c
                                desc = 'SignalP HMM prediction'
                            else:
                                # Both NN and HMM positive: keep the smaller (earlier) cleavage site.
                                if hmm_c < cpos:
                                    cpos = hmm_c
                                    desc = 'SignalP HMM prediction (NN also Y)'
                                else: desc += ' (HMM also Y)'
                        if cpos > 0: ft.append({'Type':'SIGNALP','Desc':desc,'Start':1,'End':cpos})
                    except: self.log.errorLog('UniFake SignalP problem for %s.' % name)
                ## ~ [2h] Convert to UniProt and save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                self.addRealUniProt(seq,udata,ft)
                self.deBug(ft)
                if not store: uniprot.list['Entry'] = []
                if uniprot.addFromSeq(seq,data=udata,ft=ft):    ### Converts into UniProtEntry object
                    if not store: uniprot.saveUniProt(datfile,append=True)
                    #x#open(self.info['DatPickup'],'a').write('%s\n' % seq.shortName())
            ## ~ [2f] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            except: self.log.errorLog('Problem during UniFake(%s)' % name)
            for tmp in glob.glob('%s*' % utmp): os.unlink(tmp)      # Remove per-sequence temp files
            self.printLog('#UNIFAKE','|---------- %s run <<<|>>> %s to go -----------|' % (rje.integerString(sx),rje.integerString(seqnum-sx)),log=False)
        if store: uniprot.saveUniProt(datfile,append=False)
        if self.opt['CleanUp']:
            # Remove empty working directories left behind by TMHMM runs.
            for tmp in glob.glob('TMHMM*'):
                if os.path.isdir(tmp): os.rmdir(tmp)
    except: self.errorLog('Oh, the shame of it! Trouble during UniFake.uniFake()')
def slimDisc(self):     ### Runs SLiMDisc on batch of files
    '''
    Runs SLiMDisc on batch of files.
    Builds the input file list (seqin=FILE or slimfiles=LIST glob), optionally strips "hub" proteins from
    PPI datasets, filters datasets by sequence-count support range, then runs (or batches) SLiMDisc on each
    dataset ordered by total amino acid size.
    << returns False if the seqin file is missing; otherwise None.
    '''
    try:
        ### Setup ###
        # Tighten the Min/Max support range around an explicit SlimSupport value (>1 = absolute count).
        if self.stat['MinSup'] > self.stat['SlimSupport'] and self.stat['SlimSupport'] > 1: self.stat['MinSup'] = self.stat['SlimSupport']
        if self.stat['MaxSup'] > 0 and self.stat['MaxSup'] < self.stat['SlimSupport'] and self.stat['SlimSupport'] > 1: self.stat['MaxSup'] = self.stat['SlimSupport']
        ### Make File List ##
        _stage = 'Make File List'   # Tracks current stage for the exception log message
        if self.info['SeqIn'].lower() not in ['','none']:
            if os.path.exists(self.info['SeqIn']): gfiles = [self.info['SeqIn']]
            else:
                self.log.errorLog('"seqin" file "%s" not found! No SLiMDisc analysis.' % self.info['SeqIn'],printerror=False)
                return False
        else: gfiles = rje.getFileList(callobj=self,filelist=self.list['SlimFiles'],subfolders=False,summary=False)
        self.log.printLog('#FILES','%s files identified for SLiMDisc analysis.' % rje.integerString(len(gfiles)))
        ## Sort by size and filter by MinSup and MaxSup ###
        datasize = {}   # Dictionary for crude sorting of files by total AA content
        seqnum = {}     # Number of sequences in each file
        qry = {}        # Query sequence name (if any) for file
        tmpseq = rje_seq.SeqList(self.log,self.cmd_list+['seqin=None','autofilter=F'])
        gx = 0
        # while (not for) because gfiles may grow during iteration (kepthub files are appended below).
        while gx < len(gfiles):
            seqfilename = gfiles[gx]
            gx += 1
            seqfile = seqfilename[0:]   # Working copy of the file name (may be renamed below)
            tmpseq.seq = []
            tmpseq.loadSeqs(seqfile)
            ## *** Special RemHub process *** ##
            # Skip hub-removal for files already produced by this process (-remhub/-kepthub/-nohub).
            checkhub = True
            for hubtype in ['rem','kept','no']:
                if seqfile.find('-%shub.fas' % hubtype) > 0: checkhub = False
            if self.stat['RemHub'] > 0.0 and checkhub:
                # Hub accession is taken from a "X_PPI" file name, else from the base file name.
                if rje.matchExp('(\S+)_PPI',seqfile): hub_acc = rje.matchExp('(\S+)_PPI',rje.baseFile(seqfile,strip_path=True))[0]
                else: hub_acc = rje.baseFile(seqfile,strip_path=True)
                hub_base = rje.matchExp('(\S+)%s' % hub_acc,seqfilename)[0]     # NOTE(review): hub_base is never used below
                basefile = seqfile
                while rje.baseFile(basefile) != basefile: basefile = rje.baseFile(basefile)     # Strip ALL extensions
                if tmpseq.querySeq(query=hub_acc):  ### Sets Hub as Query Sequence
                    self.log.printLog('#HUB','Removing hub protein %s and >=%.1f%% ID from PPI dataset %s.' % (hub_acc,self.stat['RemHub'],seqfile))
                    tmpseq.makeNR(text='Hub protein homologues',nrid=self.stat['RemHub'],blast=tmpseq.seqNum(),nrsim=0,nr_qry=tmpseq.obj['QuerySeq'])
                    tmpseq.removeSeq(text='PPI Hub Protein (self-interactor)',seq=tmpseq.obj['QuerySeq'])
                    tmpseq.obj['QuerySeq'] = None
                    seqfile = '%s-remhub.fas' % basefile
                    tmpseq.saveFasta(seqfile=seqfile)   ### Saves sequences in fasta format
                    keptfile = '%s-kepthub.fas' % basefile
                    os.rename(seqfilename,keptfile)
                    gfiles.append(keptfile)     # Re-queue the untouched copy for analysis too
                else:
                    seqfile = '%s-nohub.fas' % basefile
                    os.rename(seqfilename,seqfile)
                    self.log.printLog('#HUB','Hub protein %s not in PPI dataset %s => %s.' % (hub_acc,seqfilename,seqfile))
                #X#print tmpseq.obj['QuerySeq']
            ## Support Range ###
            if tmpseq.seqNum() < self.stat['MinSup'] or (self.stat['MaxSup'] > 0 and tmpseq.seqNum() > self.stat['MaxSup']):
                self.log.printLog('#REJ','%s rejected: %s sequences = outside acceptable range of %d-%d.' % (seqfile,rje.integerString(tmpseq.seqNum()),self.stat['MinSup'],self.stat['MaxSup']))
                continue
            aasize = tmpseq.aaCount()
            self.log.printLog('#AA','%s = %s aa.' % (seqfile,rje.integerString(aasize)))
            while datasize.has_key(aasize): aasize += 1     # Bump size key until unique so no file is lost
            datasize[aasize] = seqfile
            seqnum[seqfile] = tmpseq.seqNum()
            ## Query ##
            qry[seqfile] = None
            if self.opt['SlimQuery']:
                # "qry_X." in the file name identifies the query accession for SLiMDisc -q.
                if rje.matchExp('qry_(\S+)\.',seqfilename):
                    if tmpseq.querySeq(query=rje.matchExp('qry_(\S+)\.',seqfilename)[0]):   ### Sets Query Sequence if appropriate
                        qry[seqfile] = tmpseq.obj['QuerySeq'].shortName()
        self.log.printLog('#INF','%s Datasets to process.' % rje.integerString(len(seqnum)))
        ### Batch Output Mode ###
        # If batchout=FILE, commands are written to this file instead of being executed.
        batchout = None
        if self.info['BatchOut'].lower() not in ['','none']:
            batchout = self.info['BatchOut']
            if not self.opt['Append'] and os.path.exists(batchout): rje.backup(self,batchout)
        ### Work through Files ###
        _stage = 'Work through files'
        for key in rje.sortKeys(datasize,revsort=self.opt['BigFirst']):
            seqfile = datasize[key]
            basefile = seqfile
            while rje.baseFile(basefile) != basefile: basefile = rje.baseFile(basefile)     # Strip ALL extensions
            base = rje.baseFile(basefile,True)
            self.log.printLog('#DAT',seqfile,timeout=False)
            if not self.opt['UseRes']: slim_cmd = '-BT -TT'     # Always (re)run BLAST and TEIRESIAS
            else:
                ## Detect old files ##
                _stage = 'Detect old files'
                old_rank = '%s/%s.rank' % (basefile,base)
                self.log.printLog('#RES','Existing SLiMDisc Output?: %s' % (os.path.exists(old_rank)))
                old_b_list = glob.glob('%s/results/*.blastp' % basefile)
                old_t_file = '%s/%s.fasta.out' % (basefile,base)
                self.log.printLog('#RES','Existng TEIRESIAS Output?: %s' % (os.path.exists(old_t_file)))
                self.log.printLog('#RES','%s of %s BLAST files detected.' % (rje.integerString(len(old_b_list)),rje.integerString(seqnum[seqfile])))
                ## TEIRESIAS ##
                # BLAST started (rank or blastp files exist) plus TEIRESIAS output => TEIRESIAS finished.
                if (os.path.exists(old_rank) or len(old_b_list) > 0) and os.path.exists(old_t_file): slim_cmd = '-TF'
                else: slim_cmd = '-TT'
                ## BLAST ##
                if len(old_b_list) != seqnum[seqfile]: slim_cmd += ' -BT'   # Need BLAST (one file per sequence expected)
                else: slim_cmd += ' -BF'
            ## Query ##
            if self.opt['SlimQuery'] and qry[seqfile]: slim_cmd += ' -q %s' % qry[seqfile]
            ## Ranks ##
            slim_cmd += ' -n %d' % self.stat['SlimRanks']
            ## Support ##
            # Fractional support (0-1) is passed as a float; absolute support as an integer.
            if self.stat['SlimSupport'] > 0 and self.stat['SlimSupport'] < 1: slim_cmd += ' -S %.1f' % self.stat['SlimSupport']
            elif self.stat['SlimSupport'] > 0: slim_cmd += ' -S %d' % self.stat['SlimSupport']
            ## WallTime ##
            slim_cmd += ' -W %d' % self.stat['SlimWall']
            ## MemSaver ##
            if self.opt['MemSaver']: slim_cmd += ' -X T'
            else: slim_cmd += ' -X F'
            ## SlimOpt ##
            if self.info['SlimOpt']: slim_cmd += ' %s' % self.info['SlimOpt']
            ## Perform SLiMDisc Run ##
            _stage = 'Peform SLiMDisc Run (%s)' % (seqfile)
            if batchout:
                BATCH = open(batchout,'a')
                BATCH.write('%s -i %s -Q0 %s\n' % (self.info['SlimCall'],seqfile,slim_cmd))
                BATCH.close()
            else:
                # -Q2 gives verbose SLiMDisc output; -Q0 is quiet. NOTE(review): hard-coded user path.
                if self.stat['Verbose'] > 0: syscmd = 'python /home/richard/Python_Modules/slimdisc_V%s.py -i %s -Q2 %s' % (self.info['SlimVersion'],seqfile,slim_cmd)
                else: syscmd = 'python /home/richard/Python_Modules/slimdisc_V%s.py -i %s -Q0 %s' % (self.info['SlimVersion'],seqfile,slim_cmd)
                self.log.printLog('#SYS',syscmd)
                os.system(syscmd)
            if not batchout:
                new_rank = '%s/%s.rank' % (basefile,base)
                self.log.printLog('#RES','New rank result %s produced?: %s' % (new_rank,os.path.exists(new_rank)))
    except: self.log.errorLog('rje_pattern_discovery banjaxed in slimDisc() %s' % _stage,quitchoice=True)
def gasp(self):  ### Performs GASP: Gapped Ancestral Sequence Prediction
    """
    Performs GASP: Gapped Ancestral Sequence Prediction.

    Predicts ancestral sequences for all internal nodes of self.obj["Tree"], writing the results to
    "<Name>.anc.fas" (plus optional PAM tree and RST supplemental output). Raises on fatal errors after
    logging them.
    """
    try:
        ### <a> ### Preparation
        self.obj["Tree"].cmd_list.append("unkspec=T")
        self.obj["Tree"].obj["SeqList"].opt["UnkSpec"] = True
        ## <i> ## Screen Output
        self.verbose(0, 3, "\nMaking Ancestral Sequences", 0)
        if self.stat["FixPam"] > 0:
            self.verbose(0, 3, "- Fixed PAM%d" % self.stat["FixPam"], 1)
        else:
            self.verbose(0, 3, "- Variable PAM Weighting", 1)
        ## <ii> ## PAM Matrix Setup
        try:
            if self.obj["Tree"].obj["PAM"] == None:
                self.obj["Tree"].obj["PAM"] = rje_pam.PamCtrl(log=self.log, cmd_list=self.cmd_list)
            if self.stat["FixPam"] <= 0:
                # Variable weighting: the PAM matrix must extend to the longest branch (x100, rounded up).
                maxblen = 0
                for b in self.obj["Tree"].branch:
                    if b.stat["Length"] > maxblen:
                        maxblen = b.stat["Length"]
                self.verbose(1, 3, "Max Branch Length = %f: " % maxblen, 0)
                maxblen = int(maxblen * 100) + 1
            else:
                maxblen = self.stat["FixPam"]
            self.verbose(1, 3, "Max PAM = %d" % maxblen, 1)
            # print tree.pam.getPamMax(), maxblen
            if self.obj["Tree"].obj["PAM"].stat["PamMax"] < maxblen:
                # print 'Upping PAM!'
                self.obj["Tree"].obj["PAM"].stat["PamMax"] = maxblen
                self.obj["Tree"].obj["PAM"].pamUp()
        except:
            self.log.errorLog("Fatal run Exception during PAM Matrix Setup\n")
            raise
        ##<iii> ## AA Freqs
        # Alphabet must include gap and unknown characters, both with zero frequency.
        aalist = self.obj["Tree"].obj["PAM"].alphabet
        self.verbose(1, 3, aalist, 1)
        if aalist.count("-") == 0:
            aalist.append("-")
        if aalist.count("X") == 0:
            aalist.append("X")
        self.aafreq = self.obj["Tree"].obj["SeqList"].aaFreq(alphabet=aalist)
        self.aafreq["-"] = 0.0
        self.aafreq["X"] = 0.0
        # tree.deBug(aafreq)
        ### <b> ### Terminal sequences - probabilities etc. are known (sequences are known!)
        self.gaspnode = {}  # Array of GaspNode objects
        for node in self.obj["Tree"].node:
            ## <i> ## Check Sequence Exists
            # Internal nodes (ID > SeqNum) without a sequence get an all-X placeholder.
            if node.stat["ID"] > self.obj["Tree"].stat["SeqNum"]:
                if node.obj["Sequence"] == None:
                    self.obj["Tree"].obj["SeqList"]._addSeq(
                        node.info["Name"], "X" * self.obj["Tree"].obj["SeqList"].seq[0].seqLen()
                    )
                    node.obj["Sequence"] = self.obj["Tree"].obj["SeqList"].seq[-1]
            ## <ii> ## Create GaspNode object
            self.gaspnode[node] = GaspNode(node, aalist, self.log)
            ## <iii> ## Termini
            # Terminal nodes have known sequences: probabilities come straight from the sequence,
            # and every ancestral position is fixed.
            if node.stat["ID"] <= self.obj["Tree"].stat["SeqNum"]:
                self.gaspnode[node].probFromSeq()
                # print s, len(gaspnode[s].sequence), gaspnode[s].ancfix
                self.gaspnode[node].ancfix = [True] * len(node.obj["Sequence"].info["Sequence"])
        ### <c> ### GASP 1: Gap Status
        self._gapStatus()
        ## <d> ## From tips to root
        # X#self.verbose(0,4,"GASP",0)
        # Gap and unknown characters are excluded from the amino acid probability passes.
        aalist.remove("-")
        if aalist.count("X") > 0:
            aalist.remove("X")
        self._gaspProbs(
            aalist=aalist,
            useanc=False,
            dir="down",
            aaprobs=True,
            aasub=self.opt["FixDown"],
            aafix=self.opt["FixDown"],
        )
        if self.opt["FixDown"]:
            # FixDown: ancestors fixed on the tips-to-root pass alone; save and finish early.
            self.obj["Tree"].ancSeqOut(file="%s.anc.fas" % self.info["Name"], ordered=self.opt["Ordered"])
            return
        # Should now have matrix of aa probabilities right back to root...
        ## <b> ## Fix Root
        self._gaspProbs(aalist=aalist, useanc=False, dir="root", aaprobs=False, aasub=True, aafix=self.opt["FixUp"])
        ## <c> ## Back up tree using all 3 branches
        self._gaspProbs(aalist=aalist, useanc=True, dir="up", aaprobs=True, aasub=True, aafix=self.opt["FixUp"])
        ## <d> ## Back down tree with all 3 branches to soften 'outgroup sweep' near root
        for x in range(self.stat["XPass"]):
            # X#self.verbose(0,4,":%d:" % (x+1),0)
            self._gaspProbs(
                aalist=aalist, useanc=True, dir="down", aaprobs=True, aasub=False, aafix=False, gpass=(x + 1)
            )
            self._gaspProbs(
                aalist=aalist, useanc=True, dir="down", aaprobs=True, aasub=True, aafix=True, gpass=(x + 1)
            )
        ### <4> ### Finished => Save
        for node in self.obj["Tree"].node:
            node.obj["Sequence"].info["Sequence"] = self.gaspnode[node].sequence
        # X#self.verbose(0,2,"Done!",1)
        self.log.printLog("\r#GASP", "Gapped Ancestral Sequence Prediction Complete.")
        self.obj["Tree"].ancSeqOut(file="%s.anc.fas" % self.info["Name"], ordered=self.opt["Ordered"])
        ### <5> ### PAM Distances & PAM Tree
        if self.opt["PamTree"]:
            try:
                self.obj["Tree"].branchPam()
                self.obj["Tree"].saveTree(
                    filename="%s.anc.nsf" % self.info["Name"],
                    type="nsf",
                    seqnum=1,
                    seqname="short",
                    maxnamelen=127,
                    blen="pam",
                    bootstraps="node",
                    multiline=1,
                )
                self.obj["Tree"].textTree(
                    seqnum=1,
                    seqname="short",
                    maxnamelen=30,
                    nodename="short",
                    showboot=1,
                    showlen="branch",
                    blen="pam",
                    scale=4,
                    spacer=1,
                    compress=False,
                )
                self.obj["Tree"].textTree(
                    filename="%s.anc.txt" % self.info["Name"],
                    seqnum=1,
                    seqname="short",
                    maxnamelen=30,
                    nodename="short",
                    showboot=1,
                    showlen="branch",
                    blen="pam",
                    scale=4,
                    spacer=1,
                    compress=False,
                )
            except:
                self.log.errorLog("Major Problem with PAM Tree.")
                raise
        ### <6> ### RST Output
        if self.opt["RST"]:
            rstfile = "%s.rst" % self.info["Name"]
            rje.backup(self, rstfile)
            RST = open(rstfile, "a")
            RST.write("Supplemental results for GASP - main output %s.anc.fas\n\n" % self.info["Name"])
            # Only internal (ancestral) nodes carry RST details: skip the first SeqNum terminal nodes.
            for node in self.obj["Tree"].node[self.obj["Tree"].stat["SeqNum"] :]:
                gn = self.gaspnode[node]
                RST.write("%s\n\n" % string.join(gn.rst, "\n"))
            RST.close()
            self.log.printLog("RST output %s.rst complete." % self.info["Name"])
    except:
        self.log.errorLog("Fatal Error during GASP.")
        raise
def inSilicoHybrid(self):  ### Filter and combine subreads from parent and output to fasta file.
    '''
    Filter and combine subreads from parent and output to fasta file.

    This module generates balanced "in silico diploid" PacBio subread data from two sequenced haploid parents. Each
    parent must first be run through SMRTSCAPE to generate subread summary data. (This will be performed if missing. Each
    parent needs a `*.fofn` file of subread file names, `*.unique.tdt` unique subreads table and `*.smrt.tdt` SMRT cell
    identifier table.)

    A new set of subreads is then generated from the combined set of parent subreads. This is done by first ranking the
    unique subreads from each parent by length. First, the longest subread from each parent are compared and the shortest
    selected to be the first subread of the diploid. (The shortest is taken to minimise length differences between the
    two parents.) Next, the longest subread from the next parent that is no longer than the previous subread is added.
    This cycles, picking a read from the the parent with fewest cumulative bases each cycle. The longest subread that is
    no longer than the previous subread is selected. This continues until one parent runs out of subreads. Additional
    subreads will be added from the other parent if they reduce the difference in cumulative output for each parent.

    Final output will be a `*.subreads.fasta` file in which each parent has a similar total sequence content and for
    which the subread length distributions should also be similar. This is to overcome biases in resulting diploid
    assemblies, where one parent has higher quality data than the other.

    NOTE: If performing downstream filtering by Read Quality (RQ), this might reintroduce a bias if one parent has much
    higher RQ values than the other. The `rqfilter=X` setting can therefore be used to restrict output to  reads with a
    minimum RQ value. By default this is 0.84. If you do not get enough sequence output, this setting may need to be
    relaxed.
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [0a] Parent 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.printLog('#~~#','# ~~~~~~~~~~~~~~~~~~~~ SETUP PARENT 1 ~~~~~~~~~~~~~~~~~~~~ #')
        self.printLog('#FOFN','Parent1: %s' % self.getStr('Parent1'))
        base1 = rje.baseFile(self.getStr('Parent1'))
        parent1 = smrtscape.SMRTSCAPE(self.log,['genomesize=13.1e6']+self.cmd_list+['batch=%s' % self.getStr('Parent1'),'basefile=%s' % base1])
        parent1.setup()
        udb1 = parent1.udb()    # Unique subreads table for Parent 1
        cdb = parent1.db('smrt',add=True,mainkeys=['Name'])     # SMRT cell identifier table
        cdb.dataFormat({'SMRT':'int'})
        cx = cdb.entryNum()     # Number of Parent1 SMRT cells: offset for Parent2 SMRT IDs below
        ## ~ [0a] Parent 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.printLog('#~~#','# ~~~~~~~~~~~~~~~~~~~~ SETUP PARENT 2 ~~~~~~~~~~~~~~~~~~~~ #')
        self.printLog('#FOFN','Parent2: %s' % self.getStr('Parent2'))
        base2 = rje.baseFile(self.getStr('Parent2'))
        parent2 = smrtscape.SMRTSCAPE(self.log,['genomesize=13.1e6']+self.cmd_list+['batch=%s' % self.getStr('Parent2'),'basefile=%s' % base2])
        parent2.setup()
        udb2 = parent2.udb()    # Unique subreads table for Parent 2
        cdb2 = parent2.db('smrt',add=True,mainkeys=['Name'])
        cdb2.dataFormat({'SMRT':'int'})
        # Shift all of the Parent2 SMRT IDs to avoid conflict with Parent1
        for entry in cdb2.entries() + udb2.entries(): entry['SMRT'] = entry['SMRT'] + cx
        cdb = parent1.db().mergeTables(cdb,cdb2)
        ## ~ [0c] Output Sequence File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.printLog('#~~#','# ~~~~~~~~~~~~~~~~~~~~ DIPLOIDOCUS SUBREADS ~~~~~~~~~~~~~~~~~~~~ #')
        minlen = self.getInt('LenFilter')
        minrq = self.getNum('RQFilter')
        rqstr = '%s' % minrq
        # e.g. basefile.L500RQ84.fasta (rqstr[2:] drops the "0." prefix of the RQ value).
        filtfile = '%s.L%sRQ%s.fasta' % (self.baseFile(),minlen,rqstr[2:])
        ## ~ [0d] Input Sequence Files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        seqbatch = []   # List of SeqList objects
        self.printLog('#BATCH','%s sequence files to process.' % rje.iLen(parent1.list['Batch']+parent2.list['Batch']))
        for seqfile in parent1.list['Batch'] + parent2.list['Batch']:
            seqcmd = self.cmd_list + ['seqmode=file','autoload=T','summarise=F','seqin=%s' % seqfile,'autofilter=F']
            seqbatch.append(rje_seqlist.SeqList(self.log,seqcmd))
        self.printLog('#BATCH','%s sequence files to summarise.' % rje.iLen(seqbatch))
        if not seqbatch: raise IOError('No batch input fasta files found! Make sure parentN=FILE settings given *.fofn.')
        ## ~ [0e] Setup subread lists ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # Unique subreads per parent, longest first; entries are popped off the front as used.
        elists = [udb1.sortedEntries('Len',reverse=True),udb2.sortedEntries('Len',reverse=True)]
        plen = [0,0]    # Summed lengths for each parent
        pseq = [0,0]    # Total sequence number for each parent
        prq = [0,0]     # Total sequence RQ for each parent (convert to mean)
        if not elists[0] or not elists[1]: raise ValueError('No Unique ZMW subreads for one or both parents!')
        lastlen = max(elists[0][0]['Len'],elists[1][0]['Len'])    # Length of last selected read
        # Drop leading entries failing the RQ filter before choosing the starting parent.
        for elist in elists:
            while elist and elist[0]['RQ'] < minrq: elist.pop(0)
        if not elists[0] or not elists[1]: raise ValueError('No Unique ZMW subreads for one or both parents!')
        nextp = 0   # Index of next parent to use
        if elists[0][0]['Len'] < elists[1][0]['Len']: nextp = 1     # Start with the shorter longest read
        ### ~ [1] Filter and Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [1a] Filter Unique Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        zmwlist = []    # List of (smrt,zmw) meeting filtering criteria
        ux = 0.0        # Progress counter (x100 per entry consumed)
        utot = len(elists[0]) + len(elists[1])
        # Alternate between parents (favouring the one with fewer cumulative bases) until one runs out.
        while lastlen:
            self.progLog('\r#DIP','Diploidising subreads: %.2f%%' % (ux/utot))
            elist = elists[nextp]
            while elist and elist[0]['RQ'] < minrq:
                elist.pop(0)
                ux += 100.0
            if elist and elist[0]['Len'] < minlen:
                # Remaining entries are all shorter than minlen (list sorted by Len): discard the rest.
                ux += 100.0 * len(elist)
                elist = []
            if not elist:
                nextp = 1 - nextp
                break   # Finish
            entry = elist.pop(0)
            ux += 100.0
            zmwlist.append((entry['SMRT'],entry['ZMW'],entry['Pos']))
            plen[nextp] += entry['Len']
            prq[nextp] += entry['RQ']
            pseq[nextp] += 1
            if plen[1-nextp] <= plen[nextp]: nextp = 1 - nextp      # Switch to the parent that is behind
            lastlen = entry['Len']
        ## ~ [1b] Final processing of last reads ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # Top up from the remaining parent while doing so reduces the cumulative length difference.
        while elists[nextp]:
            elist = elists[nextp]
            while elist and elist[0]['RQ'] < minrq:
                self.progLog('\r#DIP','Diploidising subreads: %.2f%%' % (ux/utot))
                elist.pop(0)
                ux += 100.0
            while elist and elist[0]['Len'] >= minlen:
                self.progLog('\r#DIP','Diploidising subreads: %.2f%%' % (ux/utot))
                entry = elist.pop(0)
                ux += 100.0
                pdiff = rje.modulus(plen[0]-plen[1])                        # Current parent imbalance
                ediff = rje.modulus(plen[nextp]+entry['Len']-plen[1-nextp]) # Imbalance if entry added
                if ediff >= pdiff:
                    elists[nextp] = []
                    break   #Finish!
                zmwlist.append((entry['SMRT'],entry['ZMW'],entry['Pos']))
                plen[nextp] += entry['Len']
                prq[nextp] += entry['RQ']
                pseq[nextp] += 1
        self.printLog('\r#DIP','Diploidising subreads complete: %s subreads to output.' % rje.iLen(zmwlist))
        self.printLog('\r#DIP','%s: %s seq; %s bp (%.1fX); %.3f mean RQ.' % (self.getStr('Parent1'),rje.iStr(pseq[0]),rje.iStr(plen[0]),1.0*plen[0]/self.getInt('GenomeSize'),prq[0]/pseq[0]))
        self.printLog('\r#DIP','%s: %s seq; %s bp (%.1fX); %.3f mean RQ.' % (self.getStr('Parent2'),rje.iStr(pseq[1]),rje.iStr(plen[1]),1.0*plen[1]/self.getInt('GenomeSize'),prq[1]/pseq[1]))
        ## ~ [1b] Extract Filtered Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        rje.backup(self,filtfile)
        SEQOUT = open(filtfile,'w')     # NOTE(review): never explicitly closed; relies on interpreter cleanup
        sx = 0.0; stot = 0; sn = len(seqbatch); fx = 0
        for seqlist in seqbatch:
            #>m150625_001530_42272_c100792502550000001823157609091582_s1_p0/9/0_3967 RQ=0.784
            si = 100.0/seqlist.seqNum(); stot += seqlist.seqNum()
            for seq in seqlist.seqs():
                self.progLog('\r#OUT','Extracting subreads: %.2f%%' % (sx/sn)); sx += si
                (name,sequence) = seqlist.getSeq(seq)
                # Name format "smrt/zmw/pos RQ=x" (see example above); older data may lack the RQ field.
                try: [smrt,zmw,pos,rq] = string.split(string.replace(name,'/',' '))
                except:
                    [smrt,zmw,pos] = string.split(string.replace(name,'/',' '))
                    rq = minrq
                if (cdb.data(smrt)['SMRT'],int(zmw),pos) not in zmwlist: continue
                SEQOUT.write('>%s\n%s\n' % (name,sequence)); fx += 1
        self.printLog('\r#OUT','Saved %s filtered subreads to %s.' % (rje.iStr(fx),filtfile))
        ### ~ [2] Summarise Filtered File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seqcmd = self.cmd_list + ['seqmode=file','autoload=T','summarise=T','seqin=%s' % filtfile,'autofilter=F']
        rje_seqlist.SeqList(self.log,seqcmd)    # Construction triggers the summary of the filtered file
        return True
    except:
        self.errorLog('%s.run error' % self.prog())
        return False
def uniFake(self, seqs=[], store=False):  ### Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs.
    '''
    Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs given.
    >> seqs:list [] = optional list of sequence objects to process instead of self.obj['SeqList'].seq.
    >> store:bool [False] = if True, keep all entries in the UniProt object and save in one go at the end;
       if False, save each entry to the DAT file as it is processed (append mode).
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # List of lower-case analysis keywords ('pfam', 'tmhmm', 'signalp', 'disorder', ...) to run.
        unifake = string.split(string.join(self.list['UniFake']).lower())
        seqlist = self.obj['SeqList']
        if seqs: seqlist.seq = seqs
        else: seqs = seqlist.seq
        (sx, seqnum) = (0, seqlist.seqNum())
        ## ~ [1b] Setup UniProt object and output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        uniprot = rje_uniprot.UniProt(self.log, self.cmd_list)  # UniProt object for saving data
        if self.info['DatOut'].lower() in ['', 'none']: self.info['DatOut'] = rje.baseFile(seqlist.info['Name']) + '.dat'
        datfile = self.info['DatOut']
        if os.path.exists(datfile): rje.backup(self, datfile)
        if store: seqlist.obj['UniProt'] = uniprot
        ## ~ [1c] Setup RJE_HMM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if 'pfam' in unifake:
            hmm = rje_hmm.HMMRun(self.log, self.cmd_list + ['force=T'])
            hmmfile = '%s.pfam.tdt' % rje.baseFile(datfile)     # Delimited table of PFam hits
            if os.path.exists(hmmfile): rje.backup(self, hmmfile)
            hmm.list['HMM'] = [self.info['PFam']]
            hmm.opt['HMMPFam'] = True
        else: hmm = None
        ## ~ [1d] Setup RJE_TM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if 'signalp' in unifake: tm = rje_tm.TM(self.log, self.cmd_list)
        else: tm = None
        ### ~ [2] ~ Perform UniFake processing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for seq in seqs:
            sx += 1
            name = seq.shortName()
            self.printLog('#SEQ', 'Processing %s (%s aa) %s...' % (seq.shortName(), rje.integerString(seq.aaLen()), seq.info['Description'][:50]))
            try:
                ## ~ [2a] ~ Basic data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                # Temporary per-sequence fasta file used as input for the external prediction tools.
                utmp = 'tmp%s.%s' % (rje.randomString(5), seq.info['AccNum'])
                open('%s.fas' % utmp, 'w').write('>%s\n%s\n' % (seq.shortName(), seq.info['Sequence']))
                udata = {'CC': ['-!- Features generated using unifake.py'], 'AC': []}   # UniProt-style data lines
                if seq.info['SpecCode'] in ['Unknown', 'UNK']: seq.info['SpecCode'] = self.info['SPCode']
                #x#elif seq.info['Species'] != 'None': udata['OS'] = [seq.info['Species']]    #!# Check how well this works. Add spectable? #!#
                ## ~ [2b] ~ Aliases ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                # EnsDat names carry "[acc:X pep:Y gene:Z]"; register each as an alias of the AccNum.
                if self.opt['EnsDat'] and rje.matchExp('\[acc:(\S+) pep:(\S+) gene:(\S+)\]', seq.info['Name']):
                    details = rje.matchExp('\[acc:(\S+) pep:(\S+) gene:(\S+)\]', seq.info['Name'])
                    self.addAlias(seq.info['AccNum'], details[0])
                    self.addAlias(seq.info['AccNum'], details[1])
                    self.addAlias(seq.info['AccNum'], details[2])
                    udata['GN'] = [details[2]]      # Gene name from the EnsDat description
                for id in [seq.shortName(), seq.info['AccNum']]:
                    if id in self.dict['Aliases']: udata['AC'].append('%s;' % string.join(self.dict['Aliases'][id], '; '))
                ## ~ [2c] ~ Features ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                ft = []     # List of feature dictionaries {'Type','Desc','Start','End'} for sequence
                for id in [seq.shortName(), seq.info['AccNum'], seq.info['ID']]:
                    if id in self.dict['Features']: ft += self.dict['Features'][id]
                ## ~ [2d] IUPRED disorder prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if 'disorder' in self.list['UniFake']:
                    try:
                        seq.disorder()
                        dis = seq.obj['Disorder']
                        for disorder in seq.obj['Disorder'].list['RegionDisorder']:
                            ft.append({'Type': 'DISORDER', 'Desc': 'Predicted disorder: %s' % seq.obj['Disorder'].info['Disorder'], 'Start': disorder[0], 'End': disorder[1]})
                            if dis.info['Disorder'].lower() == 'iupred': ft[-1]['Desc'] = '%s > %.2f' % (ft[-1]['Desc'], dis.stat['IUCut'])
                        for fold in seq.obj['Disorder'].list['RegionFold']:
                            ft.append({'Type': 'ORDER', 'Desc': 'Predicted order: %s' % seq.obj['Disorder'].info['Disorder'], 'Start': fold[0], 'End': fold[1]})
                            if dis.info['Disorder'].lower() == 'iupred': ft[-1]['Desc'] = '%s <= %.2f' % (ft[-1]['Desc'], dis.stat['IUCut'])
                    except: self.log.errorLog('UniFake disorder problem for %s.' % name)
                ## ~ [2e] PFam HMM domain prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if hmm:
                    try:
                        hmm.setInfo({'SearchDB': '%s.fas' % utmp, 'HMMOut': '%s.hmm.out' % utmp})   # This will be made for each sequence
                        hmm.search = []
                        hmm.list['HMMRes'] = [hmm.hmmSearch(self.info['PFam'], outfile=hmm.info['HMMOut'])]     # Used in hmmTable
                        hmm.hmmTable(outfile=hmmfile, append=True)
                        if 'disorder' in self.list['UniFake']: disorder = seq.obj['Disorder'].list['ResidueDisorder']   # individual (IUPRed) residue results
                        else: disorder = []
                        if hmm.search: udata['CC'].append('PFam: HMMer PFam search vs %s (Modified %s)' % (self.info['PFam'], time.ctime(os.path.getmtime(self.info['PFam']))))
                        else:
                            udata['CC'].append('-!- ERROR: PFam HMMer Search failure!')
                            out = {'Type': '!ERROR!', 'Name': name}
                            rje.delimitedFileOutput(self, hmmfile, ['Type', 'Name', 'Start', 'End', 'Eval', 'Score'], datadict=out)
                        for search in hmm.search:
                            for hit in search.hit:
                                for aln in hit.aln:
                                    pfamft = {'Start': aln.stat['SbjStart'], 'End': aln.stat['SbjEnd'], 'Type': 'PFAM',
                                              'Desc': '%s PFam HMM Eval: %.2e; Score: %.1f' % (search.info['Name'], aln.stat['Expect'], aln.stat['BitScore'])}
                                    if disorder:
                                        # Mean residue disorder over the hit region; ordered hits become DOMAIN features.
                                        region = disorder[aln.stat['SbjStart'] - 1:aln.stat['SbjEnd']]
                                        hmmdisorder = float(sum(region)) / len(region)
                                        pfamft['Desc'] = '%s; IUPRed: %.2f' % (pfamft['Desc'], hmmdisorder)
                                        if hmmdisorder < self.stat['DisDom']: pfamft['Type'] = 'DOMAIN'
                                    ft.append(pfamft)
                    except: self.log.errorLog('UniFake PFam HMM problem for %s.' % name)
                ## ~ [2f] TMHMM transmembrane topology prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if 'tmhmm' in unifake:
                    try:
                        tmdat = os.popen('%s %s.fas -short' % (self.info['TMHMM'], utmp)).readlines()
                        domlist = rje_tm.domainList(rje_tm.parseTMHMM(tmdat[0]))
                        for tmdom in domlist:
                            ft.append(tmdom)
                            ft[-1]['Desc'] = 'TMHMM topology prediction'
                            ft[-1]['Start'] = string.atoi(ft[-1]['Start'])
                            ft[-1]['End'] = string.atoi(ft[-1]['End'])
                        # Domain list alternates inside/TM/outside; (len-1)/2 gives the TM helix count.
                        if len(domlist) > 1: udata['CC'].append('TMHMM: %d TM domains; N-Term %s' % ((len(domlist) - 1) / 2, domlist[0]['Type']))
                        else: udata['CC'].append('TMHMM: 0 TM domains')
                    except: self.log.errorLog('UniFake TMHMM problem for %s.' % name)
                ## ~ [2g] SIGNALP signal peptide prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if 'signalp' in unifake:
                    try:
                        os.system('%s -f short -t euk %s.fas > %s.signalp' % (self.info['SignalP'], utmp, utmp))
                        tm.signalp = {}
                        tm.parseSignalP('%s.signalp' % utmp)
                        sigp = tm.signalp.pop(seq.shortName())
                        cpos = 0    # Predicted cleavage position (0 = no signal peptide predicted)
                        if sigp['nn_ymax?'] == 'Y':
                            cpos = string.atoi(sigp['nn_ymaxpos'])
                            desc = 'SignalP NN prediction'
                        if sigp['hmm_cmax?'] == 'Y':
                            hmm_c = string.atoi(sigp['hmm_cmaxpos'])
                            if cpos == 0:
                                cpos = hmm_c
                                desc = 'SignalP HMM prediction'
                            else:
                                # Both NN and HMM positive: keep the smaller (earlier) cleavage site.
                                if hmm_c < cpos:
                                    cpos = hmm_c
                                    desc = 'SignalP HMM prediction (NN also Y)'
                                else: desc += ' (HMM also Y)'
                        if cpos > 0: ft.append({'Type': 'SIGNALP', 'Desc': desc, 'Start': 1, 'End': cpos})
                    except: self.log.errorLog('UniFake SignalP problem for %s.' % name)
                ## ~ [2h] Convert to UniProt and save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                self.addRealUniProt(seq, udata, ft)
                self.deBug(ft)
                if not store: uniprot.list['Entry'] = []
                if uniprot.addFromSeq(seq, data=udata, ft=ft):  ### Converts into UniProtEntry object
                    if not store: uniprot.saveUniProt(datfile, append=True)
                    #x#open(self.info['DatPickup'],'a').write('%s\n' % seq.shortName())
            ## ~ [2f] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            except: self.log.errorLog('Problem during UniFake(%s)' % name)
            for tmp in glob.glob('%s*' % utmp): os.unlink(tmp)      # Remove per-sequence temp files
            self.printLog('#UNIFAKE', '|---------- %s run <<<|>>> %s to go -----------|' % (rje.integerString(sx), rje.integerString(seqnum - sx)), log=False)
        if store: uniprot.saveUniProt(datfile, append=False)
        if self.opt['CleanUp']:
            # Remove empty working directories left behind by TMHMM runs.
            for tmp in glob.glob('TMHMM*'):
                if os.path.isdir(tmp): os.rmdir(tmp)
    except: self.errorLog('Oh, the shame of it! Trouble during UniFake.uniFake()')
def hmmSearch(self, hmm, dbase=None, outfile=None, wait=True):  ### Performs HMMer Search using object attributes
    '''
    Performs HMMer Search using object attributes.
    >> hmm:str = Name of HMM file
    >> dbase:str = Name of DBase file [self.info['SearchDB']]
    >> outfile:str = Name of Output file file [self.info['HMMOut']]
    >> wait:boolean = whether to wait for HMMer. [True]
    << returns outfile or None if fails
    '''
    try:
        ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [1a] ~ Input files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # Both the HMM file and the search database must exist before anything is run.
        if not rje.checkForFile(hmm):
            self.printLog('#ERR', 'HMM file %s is missing!' % hmm)
            return None
        if not dbase:
            dbase = self.info['SearchDB']  # Fall back on object default database
        if not rje.checkForFile(dbase):
            self.printLog('#ERR', 'Database file "%s" is missing!' % dbase)
            return None
        ## ~ [1b] ~ Output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if not outfile or outfile.lower() in ['', 'none']:  # Make an outfile per search
            outfile = '%s.%s.hmmer' % (rje.baseFile(hmm, True), rje.baseFile(dbase, True))
        # resfile is the file checked for reuse: may be a gzipped copy of outfile when gzip=T.
        resfile = outfile
        if not os.path.exists(outfile) and self.opt['GZip'] and os.path.exists('%s.gz' % outfile) and not self.opt['Force']:
            resfile = '%s.gz' % outfile
        # Reuse existing results only if they are newer than both the HMM and the database.
        if not self.opt['Force'] and rje.isYounger(resfile, hmm) == resfile and rje.isYounger(resfile, dbase) == resfile:
            self.printLog('#HMM', 'HMM results file "%s" exists.' % resfile)
            return outfile  # Already exists
        else:
            rje.backup(self, outfile, unlink=True)  # Clear any stale results file
        ### ~ [2] ~ HMM Search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # hmmpfam (with Pfam gathering thresholds) vs plain hmmsearch, per HMMPFam option.
        if self.opt['HMMPFam']:
            _command = 'hmmpfam --cut_ga %s %s %s > %s' % (string.join(self.list['HMMOptions']), hmm, dbase, outfile)
        else:
            _command = 'hmmsearch %s %s %s > %s' % (string.join(self.list['HMMOptions']), hmm, dbase, outfile)
        self.log.printLog('#HMM', _command)
        if not wait:
            os.system(self.info['HMMerPath'] + _command + ' &')  # Fire and forget in background
        elif not os.path.exists(outfile) or self.opt['Force']:
            # NOTE(review): command already redirects to outfile via '>'; the popen read/append
            # looks redundant but is kept as-is — confirm before changing.
            open(outfile, 'a').write(os.popen(self.info['HMMerPath'] + _command).read())
        self.printLog('#HMM', 'Outfile produced for %s: %s.' % (hmm, outfile))
        if self.opt['GZip']:
            rje.backup(self, '%s.gz' % outfile, unlink=True)  # Remove old gzip before compressing
            os.system('gzip %s' % outfile)
            self.printLog('#GZIP', '%s gzipped to save space' % outfile)
        return outfile
    except:
        self.log.errorLog('Fatal Error during hmmSearch(%s)' % hmm)
        return None
def saveXGMML(self,filename=None,format='Cytoscape'):  ### Saves object data to file in XGMML format
    '''
    Saves object data to file in XGMML format.
    >> filename:str [None] = Output file. Will use name.xgmml if None.
    >> format:str [Cytoscape] = Target for output file
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not filename or filename.lower() == 'none': filename = '%s.xgmml' % self.info['Name']
        self.log.printLog('#XGMML','Output of XGMML file %s for %s...' % (filename,format),log=False,newline=False)
        rje.backup(self,filename)
        date = rje.dateTime()
        OUT = open(filename,'w')
        ### ~ [2] Output headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        OUT.write('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n')
        OUT.write('<graph label="%s" id="%s" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns="http://www.cs.rpi.edu/XGMML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">\n' % (self.info['Name'],self.info['Name']))
        OUT.write(' <att name="documentVersion" value="1.0"/>\n')
        ## ~ [2a] Cytoscape format ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # Cytoscape-specific RDF metadata block (source/format/description/date/type/title).
        OUT.write(' <att name="networkMetadata">\n')
        OUT.write(' <rdf:RDF>\n')
        OUT.write(' <rdf:Description rdf:about="http://www.cytoscape.org/">\n')
        OUT.write(' <dc:source>RJE_XGMML</dc:source>\n')
        OUT.write(' <dc:format>Cytoscape-XGMML</dc:format>\n')
        OUT.write(' <dc:description>%s</dc:description>\n' % self.info['Description'])
        OUT.write(' <dc:date>%s</dc:date>\n' % date)
        OUT.write(' <dc:type>%s</dc:type>\n' % self.info['Type'])
        OUT.write(' <dc:identifier>N/A</dc:identifier>\n')
        OUT.write(' <dc:title>%s</dc:title>\n' % self.info['Name'])
        OUT.write(' </rdf:Description>\n')
        OUT.write(' </rdf:RDF>\n')
        OUT.write(' </att>\n\n')
        ### ~ [3] Output Nodes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Nodes without explicit positions are laid out on a sqrt(N) x sqrt(N) grid (x,y).
        size = 35.0
        nodelist = rje.sortKeys(self.dict['Node'])
        (n,x,y) = (int(math.sqrt(len(nodelist))),0,0)
        for node in nodelist:
            try:
                ## ~ [3a] Basic node attributes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                OUT.write(' <node label="%s" id="%d">\n' % (node,nodelist.index(node)))
            except: self.errorLog('!'); continue
            try:
                for att in rje.sortKeys(self.dict['Node'][node]):
                    if att not in self.dict['NodeAtt']: continue  # Only declared attributes output
                    type = self.dict['NodeAtt'][att]  # NOTE: shadows builtin type() within loop
                    value = string.replace('%s' % self.dict['Node'][node][att],'&','and')  # '&' breaks XML
                    OUT.write(' <att type="%s" name="%s" label="%s" value="%s"/>\n' % (type,att,att,value))
                ### ~ [3b] Graphics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                #!# Add control for these at some point! #!#
                if node in self.dict['NodePos']:
                    (nx,ny) = self.dict['NodePos'][node]
                    # Multiplication probes that coordinates are numeric; fall back to grid on failure.
                    try: nx * size
                    except: self.errorLog('%s nodepos X = %s' % (node,nx)); nx = x
                    try: ny * size
                    except: self.errorLog('%s nodepos Y = %s' % (node,ny)); ny = y
                else: [nx,ny] = [x,y]
                if self.getBool('XGMMLAtt'):
                    OUT.write(' <graphics w="%.1f" h="%.1f" width="1" type="ellipse" outline="#000000" fill="#ff9999" y="%.1f" x="%.1f">\n' % (size,size,ny*2*size,nx*2*size))
                    OUT.write(' <att name="cytoscapeNodeGraphicsAttributes">\n')
                    OUT.write(' <att name="nodeTransparency" value="1.0"/>\n')
                    #OUT.write(' <att name="nodeLabelFont" value="Default-0-12"/>\n')
                    OUT.write(' <att name="borderLineType" value="solid"/>\n')
                    OUT.write(' </att>\n')
                else: OUT.write(' <graphics y="%.1f" x="%.1f">\n' % (ny*2*size,nx*2*size))
                OUT.write(' </graphics>\n')
                x += 1
                if x > n: (x,y) = (0,y+1)  # Wrap to next grid row
                ### ~ [3c] Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            except: self.errorLog('!')
            OUT.write(' </node>\n')
        ### ~ [4] Output Edges ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Edges are stored per edge type; source/target reference node ids via nodelist index.
        for etype in rje.sortKeys(self.dict['Edge']):
            for edge in rje.sortKeys(self.dict['Edge'][etype]):
                try:
                    ## ~ [3a] Basic edge attributes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    id = '%s (%s) %s' % (edge[0],etype,edge[1])  # NOTE: shadows builtin id()
                    OUT.write(' <edge label="%s" id="%s" target="%d" source="%d">\n' % (id,id,nodelist.index(edge[1]),nodelist.index(edge[0])))
                except: self.errorLog('!'); continue
                try:
                    OUT.write(' <att type="string" name="canonicalName" label="canonicalName" value="%s"/>\n' % id)
                    OUT.write(' <att type="string" name="TYPE" label="TYPE" value="%s"/>\n' % etype)
                    if 'interaction' not in self.dict['EdgeAtt']:
                        OUT.write(' <att type="string" name="interaction" label="interaction" value="%s"/>\n' % etype)
                    OUT.write(' <att type="string" name="EDGE_TYPE" label="EDGE_TYPE" value="DefaultEdge"/>\n')
                    for att in self.dict['Edge'][etype][edge]:
                        if att.lower() == 'type': continue  # Already output as TYPE above
                        if att not in self.dict['EdgeAtt']: continue
                        type = self.dict['EdgeAtt'][att]
                        value = string.replace('%s' % self.dict['Edge'][etype][edge][att],'&','and')
                        OUT.write(' <att type="%s" name="%s" label="%s" value="%s"/>\n' % (type,att,att,value))
                    ### ~ [3b] Graphics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    #!# Update these at some point! #!#
                    #OUT.write(' <graphics width="1" fill="#0000ff">\n')
                    #OUT.write(' <att name="cytoscapeEdgeGraphicsAttributes">\n')
                    #OUT.write(' <att name="sourceArrow" value="0"/>\n')
                    #OUT.write(' <att name="targetArrow" value="0"/>\n')
                    #OUT.write(' <att name="edgeLabelFont" value="Default-0-10"/>\n')
                    #OUT.write(' <att name="edgeLineType" value="SOLID"/>\n')
                    #OUT.write(' <att name="sourceArrowColor" value="#000000"/>\n')
                    #OUT.write(' <att name="targetArrowColor" value="#000000"/>\n')
                    #OUT.write(' <att name="curved" value="STRAIGHT_LINES"/>\n')
                    #OUT.write(' </att>\n')
                    #OUT.write(' </graphics>\n')
                    ### ~ [3c] Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                except: self.errorLog('!')
                OUT.write(' </edge>\n')
        ### ~ [5] Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        OUT.write('</graph>\n')
        OUT.close()
        self.log.printLog('\r#XGMML','Output of XGMML file %s for %s complete.' % (filename,format))
    except: self.log.errorLog(rje_zen.Zen().wisdom())
def haqBatch(self, force=False):  ### Generates Batch and INI files for HAQESAC runs
    '''
    Generates the haqesac.bat batch file and haqesac.ini settings file in the
    HAQESAC run directory, unless both already exist and regeneration is not forced.
    >> force:bool [False] = regenerate the files even if they are present.
    '''
    try:
        ### ~ [0] Setup paths and decide whether to (re)build ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        haqdir = self.info['HaqDir']
        batch_path = rje.makePath('%shaqesac.bat' % haqdir, wholepath=True)
        ini_path = rje.makePath('%shaqesac.ini' % haqdir, wholepath=True)
        rebuild = force or self.force() or not rje.exists(batch_path) or not rje.exists(ini_path)
        if not rebuild:
            return self.printLog('#HAQBAT', 'HAQESAC Batch files found.')
        rje.backup(self, batch_path)
        rje.backup(self, ini_path)
        ### ~ [1] INI file: pass through commandline minus any ini= settings ~~~~~~~~~~~~~~~~~~~ ###
        ini_cmds = [cmd for cmd in self.cmd_list if cmd[:4].lower() != 'ini=']
        if self.opt['MultiHAQ']:
            ini_cmds += ['multihaq=T', 'force=F']
        open(ini_path, 'w').write(string.join(ini_cmds, '\n'))
        ### ~ [2] Batch file: one haqesac.py call per query sequence ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for seq in self.seqs():
            acc = seq.info['AccNum']
            seq_cmds = ['seqin=%s.fas' % acc, 'query=%s' % acc, 'basefile=%s' % acc]
            open(batch_path, 'a').write('python %shaqesac.py %s\n' % (self.info['Path'], string.join(seq_cmds)))
        self.printLog('#HAQBAT', 'HAQESAC Batch file output to %s' % batch_path)
    except:
        self.errorLog('Major problem with MultiHAQ.haqBatch', quitchoice=True)
def blast2fas(self):  ### Executes BLAST2FAS and copies results files
    '''
    Executes BLAST2FAS and copies results files.
    Checks per-query HAQESAC input files against the BLAST databases and only re-runs
    BLAST2Fas when a file is missing/stale (or force=T). Queries with no BLAST hits are
    recorded in a "null" file so they are skipped on subsequent runs.
    << returns True if BLAST2Fas was executed, False if all files were already present.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        need2blast = self.opt['Force']
        null_file = '%s.blast2fas_null.txt' % self.baseFile()  # Accessions with no previous hits
        nx = 0
        null_list = []
        if os.path.exists(null_file):
            null_list = string.split(open(null_file, 'r').read(), '\n')
        self.debug(null_file)
        for seq in self.seqs():
            if seq.info['AccNum'] in null_list:
                nx += 1
                continue  # Known no-hit query: do not trigger a re-run
            hfile = rje.makePath('%s%s.fas' % (self.info['HaqDir'], seq.info['AccNum']), wholepath=True)
            for db in self.obj['SeqList'].list['Blast2Fas']:
                self.debug(rje.isYounger(hfile, db))
                self.debug(rje.isYounger(hfile, db) == hfile)
                # Re-run if the HAQESAC input file is not younger than every database.
                need2blast = need2blast or not rje.isYounger(hfile, db) == hfile
        if not need2blast:
            self.printLog('#BLAST', 'All HAQESAC input files found (%s w/o BLAST hits) - no BLAST2Fas (force=F)' % nx)
            return False
        ### ~ [2] Execute ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        rje.backup(self, null_file)  # Null list rebuilt from scratch on a fresh run
        nx = 0
        # MultiCut takes precedence over BlastCut for the BLAST hit/description limits.
        if self.getInt('MultiCut'):
            self.obj['SeqList'].cmd_list += ['blastb=%d' % self.getInt('MultiCut'), 'blastv=%d' % self.getInt('MultiCut')]
        elif self.getInt('BlastCut'):
            self.obj['SeqList'].cmd_list += ['blastb=%d' % self.getInt('BlastCut'), 'blastv=%d' % self.getInt('BlastCut')]
        if self.getInt('Forks'):
            self.obj['SeqList'].cmd_list += ['blasta=%d' % self.getInt('Forks')]
        rje_seq.Blast2Fas(self.obj['SeqList'], self.getStr('HAQBLASTDir'))
        for seq in self.seqs():
            sbfile = '%s%s.blast.fas' % (self.getStr('HAQBLASTDir'), seq.info['AccNum'])
            if os.path.exists(sbfile):
                # Move BLAST2Fas output into HAQESAC directory and clear stale pickles.
                hfile = rje.makePath('%s%s.fas' % (self.info['HaqDir'], seq.info['AccNum']), wholepath=True)
                os.rename(sbfile, hfile)
                if os.path.exists('%s.pickle' % rje.baseFile(hfile)):
                    os.unlink('%s.pickle' % rje.baseFile(hfile))
                if os.path.exists('%s.pickle.gz' % rje.baseFile(hfile)):
                    os.unlink('%s.pickle.gz' % rje.baseFile(hfile))
            else:
                # No output for this query => record as a no-hit accession.
                open(null_file, 'a').write('%s\n' % seq.info['AccNum'])
                nx += 1
        if nx:
            self.printLog('#BLAST', '%s Accession Numbers without BLAST2Fas hits output to %s' % (nx, null_file))
        self.printLog('#BLAST', '%s HAQESAC input files made using BLAST2Fas' % (self.seqNum() - nx))
        return True
    except:
        self.errorLog('Major problem with MultiHAQ.blast2fas')
        raise
def run(self):  ### Main run method
    '''
    # DepthCharge: genome assembly quality control and misassembly repair.

    DepthCharge is an assembly quality control and misassembly repair program. It uses mapped long read depth of
    coverage to charge through a genome assembly and identify coverage "cliffs" that may indicate a misassembly.
    If appropriate, it will then blast the assembly into fragment at those misassemblies.

    DepthCharge uses a genome assembly and PAF file of mapped reads as input. If no file is provided, minimap2 will
    be used to generate one. For each sequence, DepthCharge starts at the beginning of the sequence and scans through
    the PAF file for coverage to drop below the `mindepth=INT` threshold (default = 1 read). These positions are
    marked as "bad" and compressed into regions of adjacent bad positions. Regions at the start or end of a sequnece
    are labelled "end". Regions overlapping gaps are labelled "gap". Otherwise, regions are labelled "bad". All
    regions are output to `*.depthcharge.tdt` along with the length of each sequence (region type "all").

    Future versions will either fragment the assembly at "bad" regions (and "gap" regions if `breakgaps=T`. If
    `breakmode=gap` then DepthCharge will replace bad regions with a gap (`NNNN...`) of length `gapsize=INT`. If
    `breakmode=report` then no additional processing of the assembly will be performed. Otherwise, the processed
    assembly will be saved as `*.depthcharge.fasta`.

    ---

    # Running DepthCharge

    DepthCharge is written in Python 2.x and can be run directly from the commandline:

        python $CODEPATH/depthcharge.py [OPTIONS]

    If running as part of [SLiMSuite](http://slimsuite.blogspot.com/), `$CODEPATH` will be the SLiMSuite `tools/`
    directory. If running from the standalone [DepthCharge git repo](https://github.com/slimsuite/depthcharge),
    `$CODEPATH` will be the path the to `code/` directory. Please see details in the
    [DepthCharge git repo](https://github.com/slimsuite/depthcharge) for running on example data.

    ## Dependencies

    DepthCharge uses `grep` and `awk`. To generate documentation with `dochtml`, R will need to be installed and a
    pandoc environment variable must be set, e.g.

        export RSTUDIO_PANDOC=/Applications/RStudio.app/Contents/MacOS/pandoc

    If a PAF file is not provided, [minimap2](https://github.com/lh3/minimap2) must be installed and either added to
    the environment `$PATH` or given with the `minimap2=PROG` setting.

    For full documentation of the DepthCharge workflow, run with `dochtml=T` and read the `*.docs.html` file generated.

    ## Commandline options

    ```
    ### ~ Main DepthCharge run options ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    seqin=FILE      : Input sequence assembly [None]
    basefile=FILE   : Root of output file names [$SEQIN basefile]
    paf=FILE        : PAF file of long reads mapped onto assembly [$BASEFILE.paf]
    breakmode=X     : How to treat misassemblies (report/gap/fragment) [fragment]
    breakgaps=T/F   : Whether to break at gaps where coverage drops if breakmode=fragment [False]
    gapsize=INT     : Size of gaps to insert when breakmode=gap [100]
    mindepth=INT    : Minimum depth to class as OK [1]
    ### ~ PAF file generation options ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    reads=FILELIST  : List of fasta/fastq files containing reads. Wildcard allowed. Can be gzipped. []
    readtype=LIST   : List of ont/pb/hifi file types matching reads for minimap2 mapping [ont]
    minimap2=PROG   : Full path to run minimap2 [minimap2]
    mapopt=CDICT    : Dictionary of minimap2 options [N:100,p:0.0001,x:asm5]
    ### ~ Additional options ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    dochtml=T/F     : Generate HTML Diploidocus documentation (*.docs.html) instead of main run [False]
    logfork=T/F     : Whether to log forking in main log [False]
    tmpdir=PATH     : Path for temporary output files during forking (not all modes) [./tmpdir/]
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    ```
    '''
    try:
        ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.getBool('DocHTML'): return rje_rmd.docHTML(self)
        if not self.setup(): return False
        ### ~ [2] ~ DepthCharge ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        #i# Fork out processing of the PAF file for each input sequence.
        resfile = '{0}.depthcharge.tdt'.format(self.baseFile())
        ddb = self.depthChargeForker()  # depthcharge table - ['seqname','start','end','type']
        if not ddb: raise IOError('Generation of DepthCharge table failed')
        ddb.indexReport('type')
        # breakup => there is at least one region that will trigger assembly editing.
        breakup = 'bad' in ddb.index('type') or (self.getBool('BreakGaps') and 'gap' in ddb.index('type'))
        if breakup:
            ddb.printLog('#RESULT', 'Regions of bad coverage output to {0}'.format(resfile))
        elif 'gap' in ddb.index('type'):
            ddb.printLog('#RESULT', 'Gaps of bad coverage output to {0}'.format(resfile))
        else:
            ddb.printLog('#RESULT', 'No regions of bad coverage to output!')
        if self.getStrLC('BreakMode') == 'report' or not breakup: return True
        ### ~ [3] ~ Fragment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        #i# Fragment, insert gaps or just report regions
        fasfile = '{0}.depthcharge.fasta'.format(self.basefile())
        rje.backup(self, fasfile)
        # NOTE(review): FRAGFAS is never explicitly closed - relies on interpreter cleanup.
        FRAGFAS = open(fasfile, 'w')
        seqin = self.seqinObj()
        seqx = 0
        for seq in seqin.seqs():
            (seqname, sequence) = seqin.getSeq(seq)
            sname = string.split(seqname)[0]
            seqlen = len(sequence)
            # Collect (start,end) break regions for this sequence.
            regions = []
            for entry in ddb.indexEntries('seqname', sname):
                if entry['type'] == 'bad' or (self.getBool('BreakGaps') and entry['type'] == 'gap'):
                    regions.append((entry['start'], entry['end']))
            if not regions:
                FRAGFAS.write('>{0}\n{1}\n'.format(seqname, sequence))
                seqx += 1
                continue
            # Sentinel regions bracket the sequence so each adjacent pair defines a fragment.
            regions += [(0, 0), (seqlen, seqlen)]
            regions.sort()
            fragx = 0
            newseq = ''
            while len(regions) > 1:
                fragx += 1
                if self.getStrLC('BreakMode') == 'gap':
                    if newseq: newseq += 'N' * self.getInt('GapSize')
                    # assumes entry['start']/'end' are 1-based inclusive coordinates - TODO confirm
                    newseq += sequence[regions[0][1]:regions[1][0] - 1]
                elif self.getStrLC('BreakMode') == 'fragment':
                    newname = '{0}.{1} {2}'.format(sname, fragx, seqname)
                    newseq = sequence[regions[0][1]:regions[1][0] - 1]
                    FRAGFAS.write('>{0}\n{1}\n'.format(newname, newseq))
                    seqx += 1
                regions.pop(0)
            if self.getStrLC('BreakMode') == 'gap':
                newname = '{0}+{1}gaps {2}'.format(sname, fragx - 1, seqname)
                FRAGFAS.write('>{0}\n{1}\n'.format(newname, newseq))
                seqx += 1
                self.printLog('#ADDGAP', '{0} gaps added to {1}'.format(fragx - 1, sname))
            else:
                self.printLog('#FRAG', '{0} fragments of {1} output to {2}'.format(fragx, sname, fasfile))
        self.printLog('#FASOUT', '{0} sequences output to {1}'.format(seqx, fasfile))
        # self.warnLog('BreakMode "{0}" not yet implemented!'.format(self.getStrLC('BreakMode')))
        # NOTE(review): success path returns False while the except handler returns True
        # (see trailing comment below) - looks inverted; confirm intended semantics with callers.
        return False
    except:
        self.errorLog(self.zen())
        return True  # Delete this if method error not terrible
def saveXGMML(self, filename=None, format='Cytoscape'):  ### Saves object data to file in XGMML format
    '''
    Saves object data to file in XGMML format.
    NOTE(review): this appears to be a duplicate of an identical saveXGMML method elsewhere
    in this file - consider consolidating into a shared implementation.
    >> filename:str [None] = Output file. Will use name.xgmml if None.
    >> format:str [Cytoscape] = Target for output file
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not filename or filename.lower() == 'none':
            filename = '%s.xgmml' % self.info['Name']
        self.log.printLog('#XGMML', 'Output of XGMML file %s for %s...' % (filename, format), log=False, newline=False)
        rje.backup(self, filename)
        date = rje.dateTime()
        OUT = open(filename, 'w')
        ### ~ [2] Output headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        OUT.write('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n')
        OUT.write('<graph label="%s" id="%s" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns="http://www.cs.rpi.edu/XGMML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">\n' % (self.info['Name'], self.info['Name']))
        OUT.write(' <att name="documentVersion" value="1.0"/>\n')
        ## ~ [2a] Cytoscape format ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # Cytoscape-specific RDF metadata block (source/format/description/date/type/title).
        OUT.write(' <att name="networkMetadata">\n')
        OUT.write(' <rdf:RDF>\n')
        OUT.write(' <rdf:Description rdf:about="http://www.cytoscape.org/">\n')
        OUT.write(' <dc:source>RJE_XGMML</dc:source>\n')
        OUT.write(' <dc:format>Cytoscape-XGMML</dc:format>\n')
        OUT.write(' <dc:description>%s</dc:description>\n' % self.info['Description'])
        OUT.write(' <dc:date>%s</dc:date>\n' % date)
        OUT.write(' <dc:type>%s</dc:type>\n' % self.info['Type'])
        OUT.write(' <dc:identifier>N/A</dc:identifier>\n')
        OUT.write(' <dc:title>%s</dc:title>\n' % self.info['Name'])
        OUT.write(' </rdf:Description>\n')
        OUT.write(' </rdf:RDF>\n')
        OUT.write(' </att>\n\n')
        ### ~ [3] Output Nodes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Nodes without explicit positions are laid out on a sqrt(N) x sqrt(N) grid (x,y).
        size = 35.0
        nodelist = rje.sortKeys(self.dict['Node'])
        (n, x, y) = (int(math.sqrt(len(nodelist))), 0, 0)
        for node in nodelist:
            try:
                ## ~ [3a] Basic node attributes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                OUT.write(' <node label="%s" id="%d">\n' % (node, nodelist.index(node)))
            except:
                self.errorLog('!')
                continue
            try:
                for att in rje.sortKeys(self.dict['Node'][node]):
                    if att not in self.dict['NodeAtt']: continue  # Only declared attributes output
                    type = self.dict['NodeAtt'][att]  # NOTE: shadows builtin type() within loop
                    value = string.replace('%s' % self.dict['Node'][node][att], '&', 'and')  # '&' breaks XML
                    OUT.write(' <att type="%s" name="%s" label="%s" value="%s"/>\n' % (type, att, att, value))
                ### ~ [3b] Graphics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                #!# Add control for these at some point! #!#
                if node in self.dict['NodePos']:
                    (nx, ny) = self.dict['NodePos'][node]
                    # Multiplication probes that coordinates are numeric; fall back to grid on failure.
                    try:
                        nx * size
                    except:
                        self.errorLog('%s nodepos X = %s' % (node, nx))
                        nx = x
                    try:
                        ny * size
                    except:
                        self.errorLog('%s nodepos Y = %s' % (node, ny))
                        ny = y
                else:
                    [nx, ny] = [x, y]
                if self.getBool('XGMMLAtt'):
                    OUT.write(' <graphics w="%.1f" h="%.1f" width="1" type="ellipse" outline="#000000" fill="#ff9999" y="%.1f" x="%.1f">\n' % (size, size, ny * 2 * size, nx * 2 * size))
                    OUT.write(' <att name="cytoscapeNodeGraphicsAttributes">\n')
                    OUT.write(' <att name="nodeTransparency" value="1.0"/>\n')
                    #OUT.write(' <att name="nodeLabelFont" value="Default-0-12"/>\n')
                    OUT.write(' <att name="borderLineType" value="solid"/>\n')
                    OUT.write(' </att>\n')
                else:
                    OUT.write(' <graphics y="%.1f" x="%.1f">\n' % (ny * 2 * size, nx * 2 * size))
                OUT.write(' </graphics>\n')
                x += 1
                if x > n: (x, y) = (0, y + 1)  # Wrap to next grid row
                ### ~ [3c] Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            except:
                self.errorLog('!')
            OUT.write(' </node>\n')
        ### ~ [4] Output Edges ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Edges are stored per edge type; source/target reference node ids via nodelist index.
        for etype in rje.sortKeys(self.dict['Edge']):
            for edge in rje.sortKeys(self.dict['Edge'][etype]):
                try:
                    ## ~ [3a] Basic edge attributes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    id = '%s (%s) %s' % (edge[0], etype, edge[1])  # NOTE: shadows builtin id()
                    OUT.write(' <edge label="%s" id="%s" target="%d" source="%d">\n' % (id, id, nodelist.index(edge[1]), nodelist.index(edge[0])))
                except:
                    self.errorLog('!')
                    continue
                try:
                    OUT.write(' <att type="string" name="canonicalName" label="canonicalName" value="%s"/>\n' % id)
                    OUT.write(' <att type="string" name="TYPE" label="TYPE" value="%s"/>\n' % etype)
                    if 'interaction' not in self.dict['EdgeAtt']:
                        OUT.write(' <att type="string" name="interaction" label="interaction" value="%s"/>\n' % etype)
                    OUT.write(' <att type="string" name="EDGE_TYPE" label="EDGE_TYPE" value="DefaultEdge"/>\n')
                    for att in self.dict['Edge'][etype][edge]:
                        if att.lower() == 'type': continue  # Already output as TYPE above
                        if att not in self.dict['EdgeAtt']: continue
                        type = self.dict['EdgeAtt'][att]
                        value = string.replace('%s' % self.dict['Edge'][etype][edge][att], '&', 'and')
                        OUT.write(' <att type="%s" name="%s" label="%s" value="%s"/>\n' % (type, att, att, value))
                    ### ~ [3b] Graphics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    #!# Update these at some point! #!#
                    #OUT.write(' <graphics width="1" fill="#0000ff">\n')
                    #OUT.write(' <att name="cytoscapeEdgeGraphicsAttributes">\n')
                    #OUT.write(' <att name="sourceArrow" value="0"/>\n')
                    #OUT.write(' <att name="targetArrow" value="0"/>\n')
                    #OUT.write(' <att name="edgeLabelFont" value="Default-0-10"/>\n')
                    #OUT.write(' <att name="edgeLineType" value="SOLID"/>\n')
                    #OUT.write(' <att name="sourceArrowColor" value="#000000"/>\n')
                    #OUT.write(' <att name="targetArrowColor" value="#000000"/>\n')
                    #OUT.write(' <att name="curved" value="STRAIGHT_LINES"/>\n')
                    #OUT.write(' </att>\n')
                    #OUT.write(' </graphics>\n')
                    ### ~ [3c] Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                except:
                    self.errorLog('!')
                OUT.write(' </edge>\n')
        ### ~ [5] Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        OUT.write('</graph>\n')
        OUT.close()
        self.log.printLog('\r#XGMML', 'Output of XGMML file %s for %s complete.' % (filename, format))
    except:
        self.log.errorLog(rje_zen.Zen().wisdom())
def saveTimePoints(self, filename='', format='tdt', entries=None):  ### Saves TimePoints to a file
    '''
    Saves TimePoints to a file from main TimePoints table.
    Fixes applied: mutable default argument (entries=[] -> None, backward-compatible since
    empty/None both fall back to all table entries) and the text-format output handle is
    now explicitly closed instead of leaking.
    >> filename:str [''] = Output filename. Will use basefile if none given.
    >> format:str ['tdt'] = Output file format (csv/tsv/txt/db)
    >> entries:list [None] = Entries from main table to output. (All if none given).
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        db = self.db('TimePoints')
        if format.lower() in ['', 'none']:
            format = string.split(filename.lower(), '.')[-1]  # Infer format from file extension
        if not filename:
            filename = '%s.%s' % (self.basefile(), format)
        if not entries:
            entries = db.entries()  # Default (None or empty) => output whole table
        ### ~ [2] Save to file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [2a] Simple delimited file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if format in ['csv', 'tdt']:
            self.blanksToEmpty()
            rje.delimitedFileOutput(self, filename, db.fields(), rje_backup=True)  # Header (with backup)
            for entry in entries:
                rje.delimitedFileOutput(self, filename, db.fields(), datadict=entry)
        ## ~ [2b] Text file output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        else:
            self.emptyToBlank()
            rje.backup(self, filename)
            OUT = open(filename, 'a')
            for entry in entries:
                if format == 'db':
                    # SQL-ish value tuple per entry: "(v1, v2, ...);"
                    outlist = []
                    for field in db.fields():
                        outlist.append(entry[field])
                    out_txt = '%s' % outlist
                    OUT.write('(%s);\n' % out_txt[1:-1])
                else:
                    # American Independence. (TimePoint) 1776 AD, 4 July. The US declared independence from the British Empire. Source: <http://en.wikipedia.org/wiki/United_States_Declaration_of_Independence>[Wikipedia]. (Keywords: history)
                    out_text = '%s. (TimePoint) ' % entry['TimePoint Name']
                    if entry['month'] in ['', 'blank']:
                        out_text += '%s %s.' % (entry['Year'], entry['yearUnit'])
                    else:
                        out_text += '%s %s, %s %s.' % (entry['Year'], entry['yearUnit'], entry['month'], entry['day'])
                    # NOTE(review): 'Source URL' is used for both the <url> and the [label] slot;
                    # the example above suggests the label should be a source *name* field - confirm.
                    out_text = '%s %s Source: <%s>[%s].' % (out_text, entry['TimePoint Description'], entry['Source URL'], entry['Source URL'])
                    klist = []
                    for i in range(1, 6):
                        if entry['keyword%d' % i] not in ['', 'blank']:
                            klist.append(entry['keyword%d' % i])
                    out_text = '%s (Keywords: %s)' % (out_text, string.join(klist, ', '))
                    OUT.write('%s\n' % out_text)
            OUT.close()  # Fix: previously leaked the file handle
        self.printLog('#OUT', '%d entries output to %s' % (len(entries), filename))
    except:
        self.errorLog('%s.saveTimePoints(%s) error' % (self, filename))
        return False
def gasp(self):  ### Performs GASP: Gapped Ancestral Sequence Prediction
    '''
    Performs GASP: Gapped Ancestral Sequence Prediction.
    Workflow: set up PAM matrices and amino acid frequencies; fix terminal (leaf) node
    probabilities from known sequences; assign gap status; then run probability passes
    down to the root, fix the root, pass back up, and optionally iterate extra down-passes
    (XPass) before saving ancestral sequences and optional PAM tree / RST output.
    '''
    try:
        ### <a> ### Preparation
        self.obj['Tree'].cmd_list.append('unkspec=T')
        self.obj['Tree'].obj['SeqList'].opt['UnkSpec'] = True
        ## <i> ## Screen Output
        self.verbose(0, 3, "\nMaking Ancestral Sequences", 0)
        if self.stat['FixPam'] > 0:
            self.verbose(0, 3, "- Fixed PAM%d" % self.stat['FixPam'], 1)
        else:
            self.verbose(0, 3, "- Variable PAM Weighting", 1)
        ## <ii> ## PAM Matrix Setup
        try:
            if self.obj['Tree'].obj['PAM'] == None:
                self.obj['Tree'].obj['PAM'] = rje_pam.PamCtrl(log=self.log, cmd_list=self.cmd_list)
            if self.stat['FixPam'] <= 0:
                # Variable weighting: PAM range must cover the longest branch (x100, +1).
                maxblen = 0
                for b in self.obj['Tree'].branch:
                    if b.stat['Length'] > maxblen:
                        maxblen = b.stat['Length']
                self.verbose(1, 3, 'Max Branch Length = %f: ' % maxblen, 0)
                maxblen = int(maxblen * 100) + 1
            else:
                maxblen = self.stat['FixPam']
            self.verbose(1, 3, 'Max PAM = %d' % maxblen, 1)
            #print tree.pam.getPamMax(), maxblen
            if self.obj['Tree'].obj['PAM'].stat['PamMax'] < maxblen:
                # Extend the pre-computed PAM matrix series to the required maximum.
                #print 'Upping PAM!'
                self.obj['Tree'].obj['PAM'].stat['PamMax'] = maxblen
                self.obj['Tree'].obj['PAM'].pamUp()
        except:
            self.log.errorLog("Fatal run Exception during PAM Matrix Setup\n")
            raise
        ##<iii> ## AA Freqs
        # Alphabet is extended with gap ('-') and unknown ('X'), both given zero frequency.
        aalist = self.obj['Tree'].obj['PAM'].alphabet
        self.verbose(1, 3, aalist, 1)
        if aalist.count('-') == 0: aalist.append('-')
        if aalist.count('X') == 0: aalist.append('X')
        self.aafreq = self.obj['Tree'].obj['SeqList'].aaFreq(alphabet=aalist)
        self.aafreq['-'] = 0.0
        self.aafreq['X'] = 0.0
        #tree.deBug(aafreq)
        ### <b> ### Terminal sequences - probabilities etc. are known (sequences are known!)
        self.gaspnode = {}  # Dictionary of tree node -> GaspNode object
        for node in self.obj['Tree'].node:
            ## <i> ## Check Sequence Exists
            # Internal nodes (ID > SeqNum) with no sequence get an all-'X' placeholder.
            if node.stat['ID'] > self.obj['Tree'].stat['SeqNum']:
                if node.obj['Sequence'] == None:
                    self.obj['Tree'].obj['SeqList']._addSeq(node.info['Name'], 'X' * self.obj['Tree'].obj['SeqList'].seq[0].seqLen())
                    node.obj['Sequence'] = self.obj['Tree'].obj['SeqList'].seq[-1]
            ## <ii> ## Create GaspNode object
            self.gaspnode[node] = GaspNode(node, aalist, self.log)
            ## <iii> ## Termini
            # Leaf nodes: probabilities come straight from the known sequence and are fixed.
            if node.stat['ID'] <= self.obj['Tree'].stat['SeqNum']:
                self.gaspnode[node].probFromSeq()
                #print s, len(gaspnode[s].sequence), gaspnode[s].ancfix
                self.gaspnode[node].ancfix = [True] * len(node.obj['Sequence'].info['Sequence'])
        ### <c> ### GASP 1: Gap Status
        self._gapStatus()
        ## <d> ## From tips to root
        #X#self.verbose(0,4,"GASP",0)
        # Gap and unknown symbols are excluded from the substitution alphabet for the passes.
        aalist.remove('-')
        if aalist.count('X') > 0: aalist.remove('X')
        self._gaspProbs(aalist=aalist, useanc=False, dir='down', aaprobs=True, aasub=self.opt['FixDown'], aafix=self.opt['FixDown'])
        if self.opt['FixDown']:
            # FixDown: ancestors fixed on the downward pass; save and stop here.
            self.obj['Tree'].ancSeqOut(file='%s.anc.fas' % self.info['Name'], ordered=self.opt['Ordered'])
            return
        # Should now have matrix of aa probabilities right back to root...
        ## <b> ## Fix Root
        self._gaspProbs(aalist=aalist, useanc=False, dir='root', aaprobs=False, aasub=True, aafix=self.opt['FixUp'])
        ## <c> ## Back up tree using all 3 branches
        self._gaspProbs(aalist=aalist, useanc=True, dir='up', aaprobs=True, aasub=True, aafix=self.opt['FixUp'])
        ## <d> ## Back down tree with all 3 branches to soften 'outgroup sweep' near root
        for x in range(self.stat['XPass']):
            #X#self.verbose(0,4,":%d:" % (x+1),0)
            self._gaspProbs(aalist=aalist, useanc=True, dir='down', aaprobs=True, aasub=False, aafix=False, gpass=(x + 1))
            self._gaspProbs(aalist=aalist, useanc=True, dir='down', aaprobs=True, aasub=True, aafix=True, gpass=(x + 1))
        ### <4> ### Finished => Save
        # Copy predicted sequences from GaspNode objects back onto the tree's Sequence objects.
        for node in self.obj['Tree'].node:
            node.obj['Sequence'].info['Sequence'] = self.gaspnode[node].sequence
        #X#self.verbose(0,2,"Done!",1)
        self.log.printLog('\r#GASP', 'Gapped Ancestral Sequence Prediction Complete.')
        self.obj['Tree'].ancSeqOut(file='%s.anc.fas' % self.info['Name'], ordered=self.opt['Ordered'])
        ### <5> ### PAM Distances & PAM Tree
        if self.opt['PamTree']:
            try:
                self.obj['Tree'].branchPam()
                self.obj['Tree'].saveTree(filename='%s.anc.nsf' % self.info['Name'], type='nsf', seqnum=1, seqname='short', maxnamelen=127, blen='pam', bootstraps='node', multiline=1)
                self.obj['Tree'].textTree(seqnum=1, seqname='short', maxnamelen=30, nodename='short', showboot=1, showlen='branch', blen='pam', scale=4, spacer=1, compress=False)
                self.obj['Tree'].textTree(filename='%s.anc.txt' % self.info['Name'], seqnum=1, seqname='short', maxnamelen=30, nodename='short', showboot=1, showlen='branch', blen='pam', scale=4, spacer=1, compress=False)
            except:
                self.log.errorLog("Major Problem with PAM Tree.")
                raise
        ### <6> ### RST Output
        if self.opt['RST']:
            # RST supplemental output for internal (ancestral) nodes only.
            rstfile = '%s.rst' % self.info['Name']
            rje.backup(self, rstfile)
            RST = open(rstfile, 'a')
            RST.write('Supplemental results for GASP - main output %s.anc.fas\n\n' % self.info['Name'])
            for node in self.obj['Tree'].node[self.obj['Tree'].stat['SeqNum']:]:
                gn = self.gaspnode[node]
                RST.write('%s\n\n' % string.join(gn.rst, '\n'))
            RST.close()
            self.log.printLog('RST output %s.rst complete.' % self.info['Name'])
    except:
        self.log.errorLog('Fatal Error during GASP.')
        raise
def exonerate(self,qryfas, genome, model,exonerate='exonerate',bestn=0):
    '''
    Runs exonerate and parses its output into a dictionary for processing.

    >> qryfas:str = Query fasta file to search with.
    >> genome:str = Target genome file to search against.
    >> model:str = Exonerate model (passed to --model); also used to name the memsaver output file.
    >> exonerate:str ['exonerate'] = Name/path of the exonerate binary to call.
    >> bestn:int [0] = If >0, passed to exonerate --bestn to limit reported hits per query.

    << Returns dictionary:
       { query: {'gff':[outputlines], 'cigar':[outputlines], 'alignment':[outputlines],
                 'vulgar':[[headerlist], {header:value}, {header:value}, ...] } }
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        EXFILE = None
        exfile = '%s.%s' % (self.baseFile(),model)  # Used in memsaver mode
        query_dic = {}
        query = None    # Set when a ' Query:' alignment line is parsed; guards against NameError on odd output.
        header_list = ['query_id', 'query_start', 'query_end', 'query_strand',
                       'target_id', 'target_start', 'target_end', 'target_strand',
                       'score', '<label, query_length, target_length> triplets']
        ## Build the exonerate command line ##
        excmd = [exonerate, qryfas, genome, '--showtargetgff', '--showcigar']
        if model: excmd += ['--model', model]
        if bestn: excmd += ['--bestn', '%d' % bestn]
        # str.split()/str.join() replace the Python2-only string.split()/string.join() functions.
        if self.getStrLC('ExOpt'): excmd += self.getStr('ExOpt').split()
        self.printLog('#RUN',' '.join(excmd))
        ### ~ [2] Run exonerate ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        extext = []
        if self.getBool('MemSaver'):
            # memsaver: direct output to file, then read it back line by line.
            gzfile = '%s.gz' % exfile
            if rje.exists(gzfile): self.gUnzip(gzfile)
            if rje.exists(exfile) and not self.force():
                self.printLog('#EXFILE','Found %s (force=F). Assuming complete.' % exfile)
            else:
                rje.backup(self,exfile)
                self.printLog('#SAVER','memsaver=T: Exonerate output directed to %s.' % exfile)
                EXFILE = open(exfile,'w')
                try:
                    if subprocess.call(excmd, stdout=EXFILE): raise IOError('Exonerate call did not complete!')
                finally:
                    EXFILE.close()  # Close even on failure so the handle is not leaked.
                self.printLog('#EXFILE','%s generated.' % exfile)
            EXFILE = open(exfile,'r')
        else:
            # Read all exonerate output into memory in one go.
            extext = Popen(excmd, stdout=PIPE).stdout.readlines()
        ### ~ [3] Parse output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        output_format = ''
        while extext or EXFILE:
            if EXFILE:
                line = EXFILE.readline()
                if not line: break
                line = rje.chomp(line)
            else:
                line = rje.chomp(extext.pop(0))
            if line:
                if line.startswith(' Query:'):
                    # Note: no elif - the Query line also falls through to the output_format capture below.
                    query = line.split(':', 1)[1].split(' ')[1]
                if line == 'C4 Alignment:':
                    output_format = 'alignment'
                elif line == '# --- START OF GFF DUMP ---':
                    output_format = 'gff'
                elif line.startswith('vulgar:'):
                    output_format = 'vulgar'
                    fields = line.split(' ', 10)[1:]
                    # setdefault guards against a KeyError if a vulgar line precedes the query's alignment block.
                    qdata = query_dic.setdefault(query, {})
                    if output_format in qdata:
                        qdata[output_format].append({})
                    else:
                        qdata[output_format] = [header_list, {}]
                    for header, field in zip(header_list, fields):
                        qdata[output_format][-1][header] = field
                elif line.startswith('cigar:'):
                    output_format = 'cigar'
                    qdata = query_dic.setdefault(query, {})
                    if output_format in qdata:
                        qdata[output_format].append(line.replace('cigar: ', ''))
                    else:
                        qdata[output_format] = [line.replace('cigar: ', '')]
                elif line == '------------' or line.startswith('Command line:') or line.startswith('Hostname:') \
                        or line == '# --- END OF GFF DUMP ---' or line == '#' \
                        or line.startswith('-- completed exonerate analysis'):
                    pass    # Skip exonerate boilerplate and delimiter lines.
                elif output_format:
                    # Any other non-blank line belongs to the current output section for the current query.
                    if query in query_dic:
                        if output_format in query_dic[query]:
                            query_dic[query][output_format].append(line)
                        else:
                            query_dic[query][output_format] = [line]
                    else:
                        query_dic[query] = {output_format:[line]}
            elif output_format == 'alignment':
                # Blank lines are retained inside alignment blocks; ignore if no entry exists yet.
                try: query_dic[query][output_format].append(line)
                except: pass
            self.vPrint(line,v=1)   # NOTE(review): placement reconstructed from mangled source - echoes each line at v=1.
        ### ~ [4] Tidy up ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if EXFILE:
            EXFILE.close()
            if self.getBool('Cleanup'):
                os.unlink(exfile)
                self.printLog('#CLEAN','%s deleted.' % exfile)
            elif self.getBool('GZip'):
                self.gZip(exfile)
        return query_dic
    except:
        self.errorLog('%s.exonerate error' % self.prog()); raise