def blast2fas(self):    ### Executes BLAST2FAS and copies results files
    '''Executes BLAST2FAS and copies results files.'''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        need2blast = self.opt['Force']
        null_file = '%s.blast2fas_null.txt' % self.baseFile(); nx = 0; null_list = []
        if os.path.exists(null_file): null_list = string.split(open(null_file,'r').read(),'\n')
        self.debug(null_file)
        for seq in self.seqs():
            if seq.info['AccNum'] in null_list: nx += 1; continue
            hfile = rje.makePath('%s%s.fas' % (self.info['HaqDir'],seq.info['AccNum']),wholepath=True)
            for db in self.obj['SeqList'].list['Blast2Fas']:
                self.debug(rje.isYounger(hfile,db))
                self.debug(rje.isYounger(hfile,db) == hfile)
                need2blast = need2blast or not rje.isYounger(hfile,db) == hfile
        if not need2blast:
            self.printLog('#BLAST','All HAQESAC input files found (%s w/o BLAST hits) - no BLAST2Fas (force=F)' % nx)
            return False
        ### ~ [2] Execute ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        rje.backup(self,null_file); nx = 0
        if self.getInt('MultiCut'): self.obj['SeqList'].cmd_list += ['blastb=%d' % self.getInt('MultiCut'),'blastv=%d' % self.getInt('MultiCut')]
        elif self.getInt('BlastCut'): self.obj['SeqList'].cmd_list += ['blastb=%d' % self.getInt('BlastCut'),'blastv=%d' % self.getInt('BlastCut')]
        if self.getInt('Forks'): self.obj['SeqList'].cmd_list += ['blasta=%d' % self.getInt('Forks')]
        rje_seq.Blast2Fas(self.obj['SeqList'],self.getStr('HAQBLASTDir'))
        for seq in self.seqs():
            sbfile = '%s%s.blast.fas' % (self.getStr('HAQBLASTDir'),seq.info['AccNum'])
            if os.path.exists(sbfile):
                hfile = rje.makePath('%s%s.fas' % (self.info['HaqDir'],seq.info['AccNum']),wholepath=True)
                os.rename(sbfile,hfile)
                if os.path.exists('%s.pickle' % rje.baseFile(hfile)): os.unlink('%s.pickle' % rje.baseFile(hfile))
                if os.path.exists('%s.pickle.gz' % rje.baseFile(hfile)): os.unlink('%s.pickle.gz' % rje.baseFile(hfile))
            else: open(null_file,'a').write('%s\n' % seq.info['AccNum']); nx += 1
        if nx: self.printLog('#BLAST','%s Accession Numbers without BLAST2Fas hits output to %s' % (nx,null_file))
        self.printLog('#BLAST','%s HAQESAC input files made using BLAST2Fas' % (self.seqNum()-nx))
        return True
    except: self.errorLog('Major problem with MultiHAQ.blast2fas'); raise
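# The skip logic above relies on rje.isYounger(hfile,db) returning the name of the more
# recently modified file (inferred from the "== hfile" comparison; not verified here).
# A minimal standalone sketch of the same "rebuild only if stale" check, using plain
# os.path.getmtime instead of the rje helper:
import os

def needs_rebuild(target, sources, force=False):
    '''Return True if target is missing, force is set, or any source file is newer than target.'''
    if force or not os.path.exists(target): return True
    ttime = os.path.getmtime(target)
    return any(os.path.getmtime(src) > ttime for src in sources if os.path.exists(src))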
def setup(self):    ### Main class setup method. #V1.0
    '''Main class setup method.'''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [1a] ~ Set job directory to RunPath if given, else directory from which job was submitted ~ ##
        try: jobdir = rje.makePath(os.environ['PBS_O_WORKDIR'])
        except: jobdir = None
        if self.getStr('RunPath') == rje.makePath(os.path.abspath(os.curdir)) and jobdir: self.setStr({'RunPath':jobdir})
        os.chdir(self.getStr('RunPath'))
        ## ~ [1b] ~ Read list of node names in file $PBS_NODEFILE ~~~~~~~~~~~~~~~~~~~~~ ##
        self.setHosts()
        return True     # Setup successful
    except: self.errorLog('Problem during %s setup.' % self); return False     # Setup failed
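# Step [1a] above only switches to $PBS_O_WORKDIR when RunPath was left at its default
# (the current directory). A standalone sketch of that decision, assuming the same
# "default means not explicitly set" convention; function name is illustrative only.
import os

def resolve_run_path(run_path):
    '''Prefer the PBS submission directory when run_path is just the current directory.'''
    jobdir = os.environ.get('PBS_O_WORKDIR')    # Set by PBS for batch jobs; None otherwise
    if jobdir and os.path.abspath(run_path) == os.path.abspath(os.curdir): return jobdir
    return run_path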
def _setAttributes(self):   ### Sets Attributes of Object
    '''Sets Attributes of Object.'''
    ### ~ Basics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    self.strlist = ['Password','RestIn','Rest','RestBase','RestOutDir','RestURL']
    self.boollist = ['PureAPI','RestOut']
    self.intlist = ['MaxRefresh','Refresh']
    self.numlist = []
    self.filelist = []
    self.listlist = ['RestKeys']
    self.dictlist = ['Output','Outfile']
    self.objlist = []
    ### ~ Defaults ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    self._setDefaults(str='None',bool=False,int=0,num=0.0,obj=None,setlist=True,setdict=True,setfile=True)
    self.setStr({'RestOutDir':rje.makePath('./'),'RestURL':'http://rest.slimsuite.unsw.edu.au/'})
    self.setBool({'PureAPI':False,'RestOut':False})
    self.setInt({'MaxRefresh':600,'Refresh':5})
    self.setNum({})
    ### ~ Other Attributes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    self._setForkAttributes()   # Delete if no forking
def _setAttributes(self):   ### Sets Attributes of Object
    '''Sets Attributes of Object.'''
    ### Basics ###
    self.infolist = ['SearchDB','HMMOut','HMMTab','HMMerPath']
    self.optlist = ['HMMCalibrate','HMMPFam','GZip','CleanRes']
    self.statlist = []
    self.listlist = ['MakeHMM','HMMRes','HMM','HMMOptions']
    self.dictlist = []
    self.objlist = []
    ### Defaults ###
    self._setDefaults(info='None',opt=False,stat=0.0,obj=None,setlist=True,setdict=True)
    self.setInfo({'HMMerPath':rje.makePath('/home/richard/Bioware/hmmer-2.3.2/src/'),'HMMOut':'','HMMTab':''})
    self.setOpt({'HMMCalibrate':True,'GZip':True,'CleanRes':True})
    self._cmdRead(cmd='hmm=*.hmm',type='glist',att='HMM')
    ### Other Attributes ###
    self.search = []
def _setAttributes(self):   ### Sets Attributes of Object
    '''Sets Attributes of Object.'''
    ### Basics ###
    self.infolist = ['Sequence','Disorder','IUPath','IUMethod','ANCHOR']
    self.optlist = ['Flat','PrintLog','IUChDir']
    self.statlist = ['IUCut','FILoop','FISleep','MinRegion']
    self.listlist = ['ResidueDisorder','RegionDisorder','RegionFold']
    self.dictlist = []
    self.objlist = []
    ### Defaults ###
    self._setDefaults(info='',opt=False,stat=0.2,obj=None,setlist=True,setdict=True)
    self.setInfo({'IUPath':rje.makePath('c:/bioware/iupred/iupred.exe',wholepath=True),'IUMethod':'short','Disorder':'iupred'})
    self.setStat({'FILoop':10,'FISleep':2,'MinRegion':0,'IUCut':0.2})
def setup(self):    ### Main class setup method.
    '''Main class setup method.'''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [1a] Check and modify URL if required ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if self.getStr('RestIn').startswith('http:'):
            #!# Check for rest URL and add if missing #!#
            # Split on &
            restcmd = string.split(self.getStr('RestIn'),'&')
            for i in range(len(restcmd)):
                if '=' not in restcmd[i]: continue
                (opt,value) = string.split(restcmd[i],'=',1)
                if value.startswith('file:'):   # Conversion of cmd=file:FILE into cmd=CONTENT
                    rfile = string.split(value,':',1)[1]
                    #!# Consider adding max size constraint. Probably a URL size limit.
                    if rje.exists(rfile):
                        restcmd[i] = '%s=%s' % (opt,rje.chomp(string.join(open(rfile,'r').readlines(),'\\n')))
                        if '&' in restcmd[i]:
                            self.warnLog('%s "&" => "+" conversions for %s.' % (rje.iStr(restcmd[i].count('&')),rfile))
                            restcmd[i] = string.replace(restcmd[i],'&','+')
                    else: self.warnLog('File "%s" not found.' % rfile,quitchoice=True)
            self.setStr({'RestIn':string.join(restcmd,'&')})
        ## ~ [1b] Direct Parsing of output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        else:   # Convert to file
            self.setStr({'RestIn':rje.makePath(self.getStr('RestIn'),True)})
        return True     # Setup successful
    except: self.errorLog('Problem during %s setup.' % self); return False     # Setup failed
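# Standalone sketch of the cmd=file:FILE expansion performed in [1a] above, written
# without the rje/string helpers; the function name is illustrative, and the '&' to '+'
# conversion is applied to the substituted content as in the warning branch above.
def expand_file_args(rest_in):
    '''Replace any "opt=file:PATH" element of a REST query string with the file content.'''
    parts = rest_in.split('&')
    for i, part in enumerate(parts):
        if '=' not in part: continue
        opt, value = part.split('=', 1)
        if value.startswith('file:'):
            path = value.split(':', 1)[1]
            with open(path) as fh:
                content = fh.read().rstrip('\n').replace('\n', '\\n').replace('&', '+')
            parts[i] = '%s=%s' % (opt, content)
    return '&'.join(parts)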
def complexFasta(self):     ### Outputs parsed complex datasets in Fasta format
    '''Outputs parsed complex datasets in Fasta format.'''
    try:
        ### Setup ###
        datpath = self.info['OutDir'] + rje.makePath('HPRD_Complexes/')
        rje.mkDir(self,datpath)
        ### Output PPI Datasets ###
        for complex in rje.sortKeys(self.dict['Complex']):
            mylist = []
            for p2 in self.dict['Complex'][complex]:
                if self.opt['AllIso']: mylist += self.dict['HPRD'][p2]['Seq']
                else: mylist.append(self.dict['HPRD'][p2]['Seq'])
            sfile = '%s%s_hprd.fas' % (datpath,complex)
            if mylist: self.obj['SeqList'].saveFasta(seqs=mylist,seqfile=sfile)
        self.log.printLog('#FAS','HPRD complex fasta output complete.')
    except:
        self.log.errorLog('Error in HPRD.complexFasta()',printerror=True,quitchoice=False)
        raise
def saveFasta(self):    ### Outputs parsed PPI datasets in Fasta format
    '''Outputs parsed PPI datasets in Fasta format.'''
    try:
        ### Setup ###
        datpath = self.info['OutDir'] + rje.makePath('HPRD_Datasets/')
        rje.mkDir(self,datpath)
        ## Check Seqs ##
        for p1 in rje.sortKeys(self.dict['PPI']):
            if 'Seq' not in self.dict['HPRD'][p1]:  #!# KeyError #!#
                print p1, self.dict['HPRD'][p1]
                self.deBug('No Seq for %s' % p1)
        ### All sequences ###
        self.obj['SeqList'].saveFasta()
        ### Output PPI Datasets ###
        for p1 in rje.sortKeys(self.dict['PPI']):
            mylist = []
            for p2 in self.dict['PPI'][p1]:
                if self.opt['AllIso']: mylist += self.dict['HPRD'][p2]['Seq']
                else: mylist.append(self.dict['HPRD'][p2]['Seq'])
            sfile = '%s%s_hprd.fas' % (datpath,self.dict['HPRD'][p1]['gene'])
            if mylist: self.obj['SeqList'].saveFasta(seqs=mylist,seqfile=sfile)
        self.log.printLog('#FAS','HPRD PPI fasta output complete.')
    except: self.log.errorLog('Error in HPRD.saveFasta()',printerror=True,quitchoice=False)
def makePPIDatasets(self):  ### Generate PPI datasets from pairwise data
    '''Generate PPI datasets from pairwise data.'''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        rje.mkDir(self,'YeastPPI/')
        seqdict = self.dict['SeqDict']
        ### ~ [2] Parse data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (hx,htot,fx) = (0.0,len(self.dict['PPI']),0)
        for hub in rje.sortKeys(self.dict['PPI']):
            self.progLog('\r#FAS','Generating %s PPI fasta files: %.2f' % (rje.integerString(fx),hx/htot)); hx += 100.0
            if len(self.dict['PPI'][hub]) < 3: continue
            seqs = []
            for spoke in self.dict['PPI'][hub]:
                if spoke not in seqdict: continue
                seqs.append(seqdict[spoke])
            if len(seqs) < 3: continue
            self.obj['SeqList'].saveFasta(seqs,rje.makePath('YeastPPI/%s.fas' % hub,wholepath=True),log=False); fx += 1
        self.printLog('\r#FAS','Generation of %s PPI fasta files from %s hubs complete.' % (rje.integerString(fx),rje.integerString(htot)))
    except: self.errorLog(rje_zen.Zen().wisdom()); raise    # Delete this if method error not terrible
def haqBatch(self,force=False):     ### Generates Batch and INI files for HAQESAC runs
    '''Generates Batch and INI files for HAQESAC runs.'''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        batfile = rje.makePath('%shaqesac.bat' % self.info['HaqDir'],wholepath=True)
        inifile = rje.makePath('%shaqesac.ini' % self.info['HaqDir'],wholepath=True)
        if force or self.force() or not rje.exists(batfile) or not rje.exists(inifile): rje.backup(self,batfile); rje.backup(self,inifile)
        else: return self.printLog('#HAQBAT','HAQESAC Batch files found.')
        ### ~ [1] Make INI File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        haqcmd = []
        for cmd in self.cmd_list:
            if cmd[:4].lower() != 'ini=': haqcmd.append(cmd)
        if self.opt['MultiHAQ']: haqcmd += ['multihaq=T','force=F']
        open(inifile,'w').write(string.join(haqcmd,'\n'))
        ### ~ [2] Make Batch file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for seq in self.seqs():
            acc = seq.info['AccNum']
            haqcmd = ['seqin=%s.fas' % acc,'query=%s' % acc,'basefile=%s' % acc]
            open(batfile,'a').write('python %shaqesac.py %s\n' % (self.info['Path'],string.join(haqcmd)))
        self.printLog('#HAQBAT','HAQESAC Batch file output to %s' % batfile)
    except: self.errorLog('Major problem with MultiHAQ.haqBatch',quitchoice=True)
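# For reference, each line appended to haqesac.bat above has the form
# "python <Path>haqesac.py seqin=<acc>.fas query=<acc> basefile=<acc>".
# The path and accession below are purely illustrative.
acc = 'P12345'
print 'python /path/to/haqesac.py ' + string.join(['seqin=%s.fas' % acc,'query=%s' % acc,'basefile=%s' % acc])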
def setup(self):    ### Sets up headers and reads in existing data if present
    '''Sets up headers and reads in existing data if present.'''
    try:
        ### ~ Setup Basic Headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        #X#headers = ['Alias','Species','Symbol','HGNC','Entrez','UniProt','EnsEMBL','HPRD','OMIM','EnsLoci','Desc']
        headers = ['Alias','Species'] + gc_headers      # All other headers added from altsource list
        ### ~ Read in data from existing files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.readHGNC()
        if self.opt['Update'] and os.path.exists(self.info['CardOut']): self.list['AltSource'].append(self.info['CardOut'])
        for altsource in self.list['AltSource']:
            sourcefile = rje.makePath(altsource,True)
            if not os.path.exists(sourcefile):
                self.log.errorLog('Alternative source "%s" missing!' % sourcefile,printerror=False,quitchoice=True)
                continue
            update = rje.dataDict(self,sourcefile,getheaders=True,ignore=['#'])
            for h in update.pop('Headers'):
                if h not in headers: headers.append(h)
            self.log.printLog('#DATA','Read GeneCards data for %d genes.' % (len(update)))
            for gene in rje.sortKeys(update):   # Each source will overwrite data from the file before
                ## ~ Convert to Upper Case for consistency ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if gene != gene.upper() and gene.upper() in update: continue    # Only use upper case one!
                elif gene != gene.upper():
                    update[gene.upper()] = update.pop(gene)
                    gene = gene.upper()
                if gene == '!FAILED!': continue
                ## ~ Update main dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if self.opt['Update'] and altsource == self.info['CardOut'] and gene not in self.list['Genes']: self.list['Genes'].append(gene)
                if gene in self.dict['GeneCard']: rje.combineDict(self.dict['GeneCard'][gene],update[gene])
                else: self.dict['GeneCard'][gene] = update[gene]
                ## ~ Temp Debugging ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if gene in self.list['TestGenes']:
                    print gene
                    print update[gene]
                    self.deBug(self.dict['GeneCard'][gene])
                ## ~ Check Aliases etc. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if 'Symbol' in self.dict['GeneCard'][gene]: self.dict['GeneCard'][gene]['Symbol'] = self.dict['GeneCard'][gene]['Symbol'].upper()
                if 'Symbol' in update[gene] and update[gene]['Symbol'] != '!FAILED!':
                    symbol = update[gene]['Symbol']
                    if symbol in self.dict['GeneCard']: rje.combineDict(self.dict['GeneCard'][symbol],update[gene],overwrite=False,replaceblanks=True)
                    else: self.dict['GeneCard'][symbol] = update[gene]
                self.log.printLog('\r#CARD','Extracted GeneCards data for %d genes.' % (len(self.dict['GeneCard'])),newline=False,log=False)
                if len(string.split(gene)) > 1: print '!!!', gene, '!!!'
        ### ~ Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.log.printLog('\r#CARD','Extracted GeneCards data for %d genes.' % (len(self.dict['GeneCard'])))
        self.list['Headers'] = headers[0:]
        if self.opt['Update']: self.opt['Append'] = False
        #x#if 'TASP1' in self.dict['GeneCard']: self.deBug(self.dict['GeneCard']['TASP1'])
        #x#else: self.deBug(rje.sortKeys(self.dict['GeneCard']))
    except:
        self.log.errorLog('Problem during GeneCards.setup()')
        raise
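# Hedged sketch of the merge implied by rje.combineDict(...,overwrite=False,replaceblanks=True)
# used above: existing values win, but blanks may be filled by the update. The exact rje
# semantics are assumed from the call site, not verified; the helper name is illustrative.
def combine_no_overwrite(target, update):
    for key, value in update.items():
        if key not in target or target[key] in ('', None): target[key] = value
    return target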
def run(self):  ### Performs main run method, including both setup and UniFake
    '''Performs main run method, including both setup and UniFake.'''
    ### ~ [1] ~ Setup aliases and features dictionaries ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    self.setup()
    ### ~ [2] ~ Perform main UniFake file generation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    self.uniFake()
    ### ~ [3] ~ Index files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    if self.opt['MakeIndex']:
        i = self.stat['Interactive']
        self.stat['Interactive'] = -1
        self.info['UniPath'] = rje.makePath(os.path.split(self.info['DatOut'])[0])
        rje_uniprot.processUniProt(self,makeindex=True,makespec=False,makefas=False)
        self.stat['Interactive'] = i
def _setAttributes(self):   ### Sets Attributes of Object
    '''Sets Attributes of Object.'''
    ### Basics ###
    self.infolist = ['CardOut','EnsLoci','HGNCData','Species']
    self.optlist = ['FullEns','FullHGNC','Update','Purify','Restrict','UseWeb']
    self.statlist = []
    self.listlist = ['AltSource','Genes','SkipList','TestGenes']
    self.dictlist = ['CardMap','EnsDesc','EnsLoci','GeneCard']
    self.objlist = []
    ### Defaults ###
    self._setDefaults(info='None',opt=True,stat=0.0,obj=None,setlist=True,setdict=True)
    self.setInfo({'CardOut':'genecards.tdt','Species':'Human',
                  'EnsLoci':rje.makePath('/home/richard/Databases/EnsEMBL/ens_HUMAN.loci.fas',True)})
    self.setOpt({'FullEns':False,'Purify':False,'Restrict':False})
def saveReadMe(self,filename='pydocs.txt',append=False):   ### Prints docs for modules to file
    '''
    Prints docs for modules to file.
    >> filename:str = output file name
    >> append:boolean
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        pydoc = self.obj['PyDoc']
        if append:
            self.printLog('#DOC','Appending docstrings to %s' % filename)
            PYDOC = open(filename,'a')
        else:
            rje.mkDir(self,filename)
            self.printLog('#DOC','Writing docstrings to %s' % filename)
            PYDOC = open(filename,'w')
            PYDOC.write(self.readMeHeader())
        db = self.db('Module')
        dx = 0
        ### ~ [2] Output Docstrings ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for sourcedir in pydoc.list['SourceDir']:
            PYDOC.write('-%s:\n\n' % sourcedir)
            for pyfile in db.dataKeys():
                entry = db.data(pyfile)
                module = entry['Module']
                if not pyfile.find(sourcedir) >= 0 or not os.path.exists('%s%s%s.py' % (pydoc.getStr('PyPath'),rje.makePath(sourcedir),module)): continue
                ## ~ [2a] ~ Module docstring ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                mtxt = '### ~~~ Module %s ~ [%s] ~~~ ###' % (module,pyfile)
                while len(mtxt) < 122: mtxt = mtxt[:5] + '~' + mtxt[5:-5] + '~' + mtxt[-5:]
                try:
                    PYDOC.write('%s\n\n%s\n' % (mtxt,entry['DocString']))
                    dx += 1
                except:
                    self.errorLog('Cannot write DocString for %s' % module,printerror=False)
                    PYDOC.write('%s\n\nDocString Error!\n' % (mtxt))
                    dx += 1
            PYDOC.write('\n\n\n')
        PYDOC.close()
        self.printLog('#DOC','Output to %s complete: %s modules.' % (filename,rje.iStr(dx)))
    except: self.errorLog('Error in %s.saveReadMe()' % self.prog())
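# Illustration of the banner padding loop in [2a] above: tildes are inserted symmetrically
# just inside the '### ' and ' ###' caps until the line reaches 122 characters, keeping the
# module title centred. Module and path names here are illustrative only.
mtxt = '### ~~~ Module example_module ~ [tools/example_module.py] ~~~ ###'
while len(mtxt) < 122: mtxt = mtxt[:5] + '~' + mtxt[5:-5] + '~' + mtxt[-5:]
print len(mtxt)     # >= 122; each pass adds one tilde to each side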
def _setAttributes(self):   ### Sets Attributes of Object
    '''Sets Attributes of Object.'''
    ### ~ Basics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    self.infolist = ['HaqDir','HAQBLASTDir']
    self.optlist = ['AddQueries','AutoSkip','Chaser','HAQESAC','MultiHAQ','ScreenQry']
    self.statlist = ['BlastCut','MultiCut']
    self.listlist = []
    self.dictlist = []
    self.objlist = ['SeqList']
    ### ~ Defaults ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    self._setDefaults(info='None',opt=True,stat=0.0,obj=None,setlist=True,setdict=True)
    self.basefile('MultiHAQ')
    self.setOpt({'Chaser':False,'AutoSkip':False})
    self.setStr({'HAQBLASTDir':rje.makePath('./HAQBLAST/')})
    ### ~ Other Attributes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    self._setForkAttributes()   # Delete if no forking
def domainFasta(self):      ### Outputs parsed domain and domain PPI datasets in Fasta format
    '''Outputs parsed domain and domain PPI datasets in Fasta format.'''
    try:
        ### ~ Tab delimited domain-HPRD pairs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        headers = ['Domain','HPRD','Gene']
        dfile = self.info['OutDir'] + 'HPRD.domains.tdt'
        rje.delimitedFileOutput(self,dfile,headers,'\t')
        sfile = self.info['OutDir'] + 'HPRD.domsource.tdt'
        shead = ['Domain','Source']
        rje.delimitedFileOutput(self,sfile,shead,'\t')
        dx = 0.0
        for domain in rje.sortKeys(self.dict['Domains']):
            self.log.printLog('\r#DOM','HPRD Domain output (%s): %.1f%%' % (dfile,dx/len(self.dict['Domains'])),newline=False,log=False)
            dx += 100.0
            for hid in self.dict['Domains'][domain]:
                datadict = {'Domain':domain,'HPRD':hid,'Gene':self.dict['HPRD'][hid]['gene']}
                rje.delimitedFileOutput(self,dfile,headers,'\t',datadict)
            for source in self.dict['DomainSource'][domain]:
                datadict = {'Domain':domain,'Source':source}
                rje.delimitedFileOutput(self,sfile,shead,'\t',datadict)
        self.log.printLog('\r#DOM','HPRD Domain output (%s): %s domains.' % (dfile,rje.integerString(len(self.dict['Domains']))))
        ### ~ Domain PPI Dataset Outputs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        datpath = self.info['OutDir'] + rje.makePath('HPRD_Domain_Datasets/')
        rje.mkDir(self,datpath)
        for domain in rje.sortKeys(self.dict['Domains']):
            ## Generate a list of all interactors with domain-containing proteins ##
            plist = []
            for p1 in self.dict['Domains'][domain]:
                if p1 not in self.dict['PPI']: continue
                for p2 in self.dict['PPI'][p1]:
                    if p2 not in plist: plist.append(p2)
            plist.sort()
            ## Generate Sequence list and output ##
            mylist = []
            for p in plist:
                if self.opt['AllIso']: mylist += self.dict['HPRD'][p]['Seq']
                else: mylist.append(self.dict['HPRD'][p]['Seq'])
            sfile = '%s%s_hprd.fas' % (datpath,domain)
            if mylist: self.obj['SeqList'].saveFasta(seqs=mylist,seqfile=sfile)
            else: self.log.printLog('#DOM','No PPI partners for domain "%s"' % domain)
        self.log.printLog('\r#DOM','HPRD Domain fasta output complete.')
    except:
        self.log.errorLog('Error in HPRD.domainFasta()',printerror=True,quitchoice=False)
        raise
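# The interactor-collection loop above is a union over the PPI partners of all
# domain-containing proteins. An equivalent standalone sketch using sets (helper name
# is illustrative only):
def domain_partners(domain_members, ppi):
    '''Return a sorted list of everything that interacts with any domain-containing protein.'''
    partners = set()
    for p1 in domain_members:
        partners.update(ppi.get(p1, []))
    return sorted(partners)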
def gopher(self):   ### Sets up data for GOPHER run
    '''Sets up data for GOPHER run.'''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        rje.mkDir(self,'BLAST/')
        rje_blast.BLASTRun(self.log,self.cmd_list).formatDB(fasfile='%s.ygob.fas' % self.info['Basefile'],protein=True,force=False)
        rje_blast.BLASTRun(self.log,self.cmd_list).formatDB(fasfile='%s.yeast.fas' % self.info['Basefile'],protein=True,force=False)
        seqdict = self.obj['SeqList'].seqNameDic('AccNum')
        ymap = self.dict['PillarMap'] = {}
        ### ~ [2] Convert Pillars to BLAST IDs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (px,ptot) = (0.0,len(self.list['Pillars'])); ox = 0
        for pillar in self.list['Pillars']:
            self.progLog('\r#YGOB','Converting YGOB Pillars for GOPHER: %.2f%%' % (px/ptot)); px += 100
            newpillar = []
            for yid in pillar:
                seq = rje_sequence.Sequence(self.log,self.cmd_list)
                seq.opt['Yeast'] = True
                #self.deBug(yid)
                seq.info['Name'] = yid
                seq.extractDetails(gnspacc=True)
                #self.deBug(seq.info)
                ygob = seq.info['AccNum']
                if ygob in self.dict['Rename']: acc = self.dict['Rename'][ygob]
                else: acc = ygob
                ymap[yid] = acc
                if acc not in seqdict: self.printLog('\r#GENE','Non-coding gene %s (%s)? Cannot find in fasta file' % (acc,yid)); continue
                try: newpillar.append(seqdict[acc].shortName())
                except:
                    print yid, ygob, acc
                    self.errorLog(rje_zen.Zen().wisdom())
            if not newpillar: continue
            for ygob in pillar:
                acc = ymap[ygob]
                if acc not in seqdict: continue
                if acc in self.list['YeastSeq'] or (not self.list['YeastSeq'] and seqdict[acc].info['SpecCode'] == 'YEAST'):
                    open(rje.makePath('BLAST/%s.blast.id' % acc,wholepath=True),'w').write(string.join(newpillar,'\n')); ox += 1
        self.progLog('\r#YGOB','Converted YGOB Pillars for GOPHER: %s BLAST ID files.' % rje.iStr(ox))
    except: self.errorLog(rje_zen.Zen().wisdom()); raise    # Delete this if method error not terrible
def farmHAQ(self):  ### Uses SLiMFarmer to farm out the HAQESAC runs
    '''Uses SLiMFarmer to farm out the HAQESAC runs.'''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        batfile = os.path.abspath(rje.makePath('%shaqesac.bat' % self.info['HaqDir'],wholepath=True))
        self.printLog('#FARM',batfile)
        if not rje.exists(batfile): raise IOError('Cannot find %s' % batfile)
        farmcmd = ['subjobs=%s' % batfile,'farm=batch','qsub=F','i=-1','runpath=%s' % os.path.abspath(self.info['HaqDir'])]
        if self.opt['MultiHAQ']: haqfarm = ['First round','Second round']
        else: haqfarm = ['Complete run']
        ### ~ [1] Perform HAQESAC runs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for farmrun in haqfarm:
            self.printLog('#CHDIR','Changing directory for %s farming: %s' % (farmrun,self.info['HaqDir']))
            os.chdir(self.info['HaqDir'])
            farmer = slimfarmer.SLiMFarmer(self.log,self.cmd_list+farmcmd)
            farmer.slimFarm()
            os.chdir(self.info['RunPath'])
            self.printLog('#CHDIR','Changed directory post-farming: %s' % self.info['RunPath'])
            self.printLog('#FARM','HAQESAC %s farming complete.' % farmrun)
        return True
        #!# Add identifying and skipping of partial runs. NB. Everything below the return above is
        #!# currently unreachable and references variables (finalrun, processed, secondrun) defined in multiHAQ.
        for seq in self.seqs():
            ## ~ [1a] Check AutoSkip ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            acc = seq.info['AccNum']
            if finalrun and acc in processed and (self.opt['AutoSkip'] or (self.i() >= 0 and rje.yesNo('%s already covered by previous HAQESAC. Skip?' % seq.shortName()))):
                self.printLog('#SKIP','%s already covered by previous HAQESAC: Skipped' % seq.shortName()); continue
            ## ~ [1b] Check Whether to run (re-runs and low sequence number) ~~~~~~~~~~ ##
            logfile = rje.makePath('%s%s.log' % (self.info['HaqDir'],acc),wholepath=True)
            infile = rje.makePath('%s%s.fas' % (self.info['HaqDir'],acc),wholepath=True)
            pkfile = rje.makePath('%s%s.pickle' % (self.info['HaqDir'],acc),wholepath=True)
            pkzfile = rje.makePath('%s%s.pickle.gz' % (self.info['HaqDir'],acc),wholepath=True)
            if not os.path.exists(infile):
                self.printLog('#SKIP','%s input file %s not found: Skipped' % (seq.shortName(),infile)); continue
            if not finalrun and not self.opt['Force'] and rje.isYounger(pkzfile,infile) == pkzfile:
                self.printLog('#SKIP','%s run detected: Skipped' % seq.shortName()); continue
            if not finalrun and not self.opt['Force'] and rje.isYounger(pkfile,infile) == pkfile:
                self.printLog('#SKIP','%s run detected: Skipped' % seq.shortName()); continue
            inseqx = rje_seq.SeqCount(self,infile)
            if inseqx < 2:
                self.printLog('#SKIP','Only one sequence found in %s: Skipped' % (infile)); continue
            ## ~ [1c] Pause if running in Chaser Mode and no Pickle ~~~~~~~~~~~~~~~~~~~ ##
            pickled = os.path.exists(pkfile) or os.path.exists('%s.gz' % pkfile); tm = 0
            while secondrun and self.opt['Chaser'] and not pickled:
                self.progLog('#WAIT','No %s pickle. Sleeping for %d min.' % (acc,tm))
                time.sleep(60*tm); tm += 1
                pickled = os.path.exists(pkfile) or os.path.exists('%s.gz' % pkfile)
                if not pickled:
                    try: rje.choice('Press <ENTER> to try again, or <CTRL+C> to Quit')
                    except:
                        self.printLog('#PICKLE','No %s pickle.' % acc)
                        self.printLog('\r#MULTI','Exiting multiHAQ "Chaser" run.'); return
            ## ~ [1d] Run HAQESAC ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            runhaqesac = True
            pngfile = rje.makePath('%s%s.png' % (self.info['HaqDir'],acc),wholepath=True)
            if not self.force() and rje.exists(pngfile):
                self.printLog('#SKIP','Found evidence of completed run: %s (force=F). Skipping.' % pngfile)
                runhaqesac = False
            ancfile = rje.makePath('%s%s.anc.fas' % (self.info['HaqDir'],acc),wholepath=True)
            if not self.force() and rje.exists(ancfile):
                self.printLog('#SKIP','Found evidence of completed run: %s (force=F). Skipping.' % ancfile)
                runhaqesac = False
    except:
        os.chdir(self.info['RunPath'])
        self.errorLog('Major problem with MultiHAQ.farmHAQ',quitchoice=True)
def multiHAQ(self,secondrun=False):     ### Executes main HAQESAC runs
    '''Executes main HAQESAC runs.'''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        finalrun = secondrun == self.opt['MultiHAQ']    # Whether this is the manual HAQESAC phase
        qryacc = self.obj['SeqList'].accList()          # Full list of Query accession numbers
        processed = []                                  # List of processed sequence accession numbers
        ### ~ [1] Perform HAQESAC runs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for seq in self.seqs():
            ## ~ [1a] Check AutoSkip ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            acc = seq.info['AccNum']
            if finalrun and acc in processed and (self.opt['AutoSkip'] or (self.i() >= 0 and rje.yesNo('%s already covered by previous HAQESAC. Skip?' % seq.shortName()))):
                self.printLog('#SKIP','%s already covered by previous HAQESAC: Skipped' % seq.shortName()); continue
            ## ~ [1b] Check Whether to run (re-runs and low sequence number) ~~~~~~~~~~ ##
            logfile = rje.makePath('%s%s.log' % (self.info['HaqDir'],acc),wholepath=True)
            infile = rje.makePath('%s%s.fas' % (self.info['HaqDir'],acc),wholepath=True)
            pkfile = rje.makePath('%s%s.pickle' % (self.info['HaqDir'],acc),wholepath=True)
            pkzfile = rje.makePath('%s%s.pickle.gz' % (self.info['HaqDir'],acc),wholepath=True)
            if not os.path.exists(infile):
                self.printLog('#SKIP','%s input file %s not found: Skipped' % (seq.shortName(),infile)); continue
            if not finalrun and not self.opt['Force'] and rje.isYounger(pkzfile,infile) == pkzfile:
                self.printLog('#SKIP','%s run detected: Skipped' % seq.shortName()); continue
            if not finalrun and not self.opt['Force'] and rje.isYounger(pkfile,infile) == pkfile:
                self.printLog('#SKIP','%s run detected: Skipped' % seq.shortName()); continue
            inseqx = rje_seq.SeqCount(self,infile)
            if inseqx < 2:
                self.printLog('#SKIP','Only one sequence found in %s: Skipped' % (infile)); continue
            ## ~ [1c] Pause if running in Chaser Mode and no Pickle ~~~~~~~~~~~~~~~~~~~ ##
            pickled = os.path.exists(pkfile) or os.path.exists('%s.gz' % pkfile); tm = 0
            while secondrun and self.opt['Chaser'] and not pickled:
                self.progLog('#WAIT','No %s pickle. Sleeping for %d min.' % (acc,tm))
                time.sleep(60*tm); tm += 1
                pickled = os.path.exists(pkfile) or os.path.exists('%s.gz' % pkfile)
                if not pickled:
                    try: rje.choice('Press <ENTER> to try again, or <CTRL+C> to Quit')
                    except:
                        self.printLog('#PICKLE','No %s pickle.' % acc)
                        self.printLog('\r#MULTI','Exiting multiHAQ "Chaser" run.'); return
            ## ~ [1d] Run HAQESAC ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            runhaqesac = True
            pngfile = rje.makePath('%s%s.png' % (self.info['HaqDir'],acc),wholepath=True)
            if not self.force() and rje.exists(pngfile):
                self.printLog('#SKIP','Found evidence of completed run: %s (force=F). Skipping.' % pngfile)
                runhaqesac = False
            ancfile = rje.makePath('%s%s.anc.fas' % (self.info['HaqDir'],acc),wholepath=True)
            if not self.force() and rje.exists(ancfile):
                self.printLog('#SKIP','Found evidence of completed run: %s (force=F). Skipping.' % ancfile)
                runhaqesac = False
            #if not finalrun or self.opt['Force'] or rje.isYounger(logfile,nsfile) != logfile:
            if runhaqesac:
                haqcmd = ['ini=haqesac.ini','seqin=%s.fas' % acc,'query=%s' % acc,'basefile=%s' % acc,'newlog=F']
                self.printLog('#HAQ','Running HAQESAC for %s - will have own log etc.' % seq.shortName(),log=False)
                os.chdir(self.info['HaqDir'])
                info = haqesac.makeInfo()
                haqcmd = rje.getCmdList(haqcmd,info=info)
                out = rje.Out(cmd_list=haqcmd)          # Sets up Out object for controlling output to screen
                out.printIntro(info)                    # Prints intro text using details from Info object
                haqlog = rje.setLog(info,out,haqcmd)    # Sets up Log object for controlling log file output
                try: haqesac.HAQESAC(log=haqlog,cmd_list=haqcmd).run(setobjects=True)
                except:
                    os.chdir(self.info['RunPath'])
                    if self.i() >= 0 and rje.yesNo('Problem with %s HAQESAC run. Abort?' % seq.shortName()): raise KeyboardInterrupt
                os.chdir(self.info['RunPath'])
            if finalrun: self.printLog('#HAQ','HAQESAC final round run for %s' % seq.shortName())
            else: self.printLog('#HAQ','HAQESAC first round run for %s' % seq.shortName())
            ## ~ [1e] Update ScreenQry ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if not self.opt['ScreenQry'] or not finalrun: continue
            qacclist = []
            for qacc in rje_seq.SeqList(self.log,['seqin=%s' % infile,'autoload=T','autofilter=F']).accList():
                if qacc in qryacc and qacc != acc: qacclist.append(qacc)
                if qacc in qryacc and qacc not in processed: processed.append(qacc)
            self.printLog('#QRY','%d other queries found in %s: [%s]' % (len(qacclist),infile,string.join(qacclist,'; ')))
            self.printLog('#QRY','%d of %d queries processed' % (len(processed),self.seqNum()))
        ### ~ [2] MultiHAQ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not finalrun:
            self.printLog('#MULTI','Executing second round of multiHAQ')
            self.multiHAQ(True)
    except: self.errorLog('Major problem with MultiHAQ.multiHAQ',quitchoice=True)
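# Generic sketch of the "Chaser" wait in [1c] above: poll for a file with a linearly
# growing sleep and let the user decide whether to keep waiting. Function name and the
# interactive prompt are illustrative, not part of the MultiHAQ API.
import os, time

def wait_for_file(path, interactive=True):
    minutes = 0
    while not os.path.exists(path):
        time.sleep(60 * minutes); minutes += 1
        if not os.path.exists(path) and interactive:
            try: raw_input('Press <ENTER> to try again, or <CTRL+C> to Quit')
            except KeyboardInterrupt: return False
    return True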
def makeHTML(self):     ### Generates HTML pages for interactive navigation.
    '''Generates HTML pages for interactive navigation.'''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        basefile = self.basefile()
        scmd = self.cmd_list + ['seqin=%s' % self.getStr('Candidates'),'autoload=T','autofilter=F','seqmode=file']
        candseq = rje_seqlist.SeqList(self.log,scmd)
        # All files and directories are named after basefile:
        # *.fas = original target PROTEIN sequences (with original descriptions)
        scmd = self.cmd_list + ['seqin=%s' % self.getStr('SeqIn'),'autoload=T','autofilter=F','seqmode=file']
        seqlist = rje_seqlist.SeqList(self.log,scmd)
        # *.gablam.tdt = GABLAM results with match details. (Might have *.hmmer.tdt instead.)
        # - Contains candidate proteins as Queries and Target proteins as hits
        gdb = self.db().addTable('%s.gablam.tdt' % basefile,mainkeys=['Qry','Hit'],name='gablam',expect=False)
        # *.HAQESAC/ = directory containing individual HAQESAC runs, named after Hit accnum
        haqdir = rje.makePath('./%s.HAQESAC/' % basefile)
        ### ~ [2] Generate front page ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        hfile = '%s.html' % basefile
        hobj = self.obj['HTML']
        hobj.list['StyleSheets'] = ['http://www.slimsuite.unsw.edu.au/stylesheets/rje_tabber.css',
                                    'http://www.slimsuite.unsw.edu.au/stylesheets/slimhtml.css']
        html = hobj.htmlHead(basefile)
        html += '<h1>%s</h1>\n\n' % basefile
        htabs = []  # (tab_id, tab_html_text[, tab_title])
        # Target protein list (with links to HAQ HTML)
        ctext = '%s\n' % string.join(['Name','Description','Length'],'\t')
        seqdict = seqlist.makeSeqNameDic('short')
        if gdb: hitlist = gdb.indexKeys('Hit')
        else: hitlist = rje.sortKeys(seqdict)
        for name in hitlist:
            seq = seqdict[name]
            cseq = [name,seqlist.seqDesc(seq),'%s aa' % seqlist.seqLen(seq)]
            acc = seqlist.seqAcc(seq)
            if os.path.exists('%s%s.log' % (haqdir,acc)): cseq[0] = '<a href="%s%s.html">%s</a>' % (haqdir,acc,cseq[0])
            ctext += '%s\n' % string.join(cseq,'\t')
        htabs.append(('Hits',rje_html.tableToHTML(ctext,'\t',tabid='parse'),'Target sequences hit by candidates.'))
        # GABLAM/HMM table (with above links)
        if gdb:
            ctext = '%s\n' % string.join(gdb.fields(),'\t')
            for gline in open('%s.gablam.tdt' % basefile,'r').readlines()[1:]:
                gdata = string.split(gline,'\t')
                acc = string.split(gdata[0],'__')[-1]
                gdata[0] = '<a href="http://www.uniprot.org/uniprot/%s" target="_blank">%s</a>' % (acc,gdata[0])
                acc = string.split(gdata[1],'__')[-1]
                gdata[1] = '<a href="%s%s.html">%s</a>' % (haqdir,acc,gdata[1])
                ctext += '%s\n' % string.join(gdata,'\t')
            htabs.append(('GABLAM',rje_html.tableToHTML(ctext,'\t',tabid='parse'),'GABLAM hit table.'))
        # Candidate list (with DB links)
        if candseq.seqNum():
            ctext = '%s\n' % string.join(['AccNum','ID','Description','Length'],'\t')
            accdict = candseq.makeSeqNameDic('accnum')
            for acc in rje.sortKeys(accdict):
                seq = accdict[acc]
                cseq = [acc,candseq.seqID(seq),candseq.seqDesc(seq),'%s aa' % candseq.seqLen(seq)]
                cseq[0] = '<a href="http://www.uniprot.org/uniprot/%s" target="_blank">%s</a>' % (acc,acc)
                ctext += '%s\n' % string.join(cseq,'\t')
            htabs.append(('Candidates',rje_html.tableToHTML(ctext,'\t',tabid='parse'),'Candidate sequences to search.'))
        html += hobj.tabberHTML('GABLAM',htabs)
        html += hobj.htmlTail()
        open(hfile,'w').write(html)
        ### ~ [3] Generate sequence-specific pages ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        #?# Move this to HAQESAC or MultiHAQ
        for i in range(len(hitlist)):
            hit = string.split(hitlist[i],'__')[-1]
            logfile = '%s%s.log' % (haqdir,hit)
            seqbase = logfile[:-4]
            hfile = '%s.html' % seqbase
            html = hobj.htmlHead(seqbase)
            html += '<h1>%s</h1>\n\n' % seqbase
            html += '<p>Click <a href="../%s.html">here</a> to return to results summary. \n' % basefile
            if i: html += 'Previous: <a href="./%s.html"><code>%s</code></a>. \n' % (string.split(hitlist[i-1],'__')[-1],hitlist[i-1])
            if i < len(hitlist) - 1: html += 'Next: <a href="./%s.html"><code>%s</code></a>. \n' % (string.split(hitlist[i+1],'__')[-1],hitlist[i+1])
            html += '</p>\n'
            htabs = []  # (tab_id, tab_html_text[, tab_title])
            for ftype in ['png','tree.txt','fas','nwk','log']:
                seqfile = '%s.%s' % (seqbase,ftype)
                if not os.path.exists(seqfile): continue
                tabtext = '<p><a href="./%s">./%s</a></p>\n' % (os.path.basename(seqfile),os.path.basename(seqfile))
                if ftype == 'png':
                    tabtext += '<a href="./%s"><img src="%s" width="100%%"></a>\n' % (os.path.basename(seqfile),os.path.basename(seqfile))
                    tabdesc = 'PNG of %s tree.' % seqbase
                else:
                    tabtext += '<pre>%s</pre>\n' % open(seqfile,'r').read()
                    if ftype == 'tree.txt':
                        for xref in hitlist:
                            reptext = '<a href="./%s.html">%s</a>' % (string.split(xref,'__')[-1],xref)
                            tabtext = string.replace(tabtext,': %s ' % xref,': %s ' % reptext)
                        while rje.matchExp('(: \S+_(\S+)__(\S+) )',tabtext):
                            (oldtext,sid,spec,spacc) = rje.matchExp('(: (\S+)_(\S+)__(\S+) )',tabtext)
                            newtext = ': %s_<a href="http://www.uniprot.org/taxonomy/?query=%s&sort=score" target="_blank">%s</a>__<a href="http://www.uniprot.org/uniprot/%s" target="_blank">%s</a> ' % (sid,spec,spec,spacc,spacc)
                            tabtext = string.replace(tabtext,oldtext,newtext)
                    tabdesc = '%s output' % seqfile
                htabs.append((ftype,tabtext,tabdesc))
            if htabs: html += hobj.tabberHTML(os.path.basename(seqbase),htabs)
            else: html += '<p><i>No output found for <code>%s</code>!</i></p>\n' % hit
            html += hobj.htmlTail()
            open(hfile,'w').write(html)
    except: self.errorLog('Problem with %s.makeHTML()' % self.prog())
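# Generic sketch of the tab-delimited-text-to-HTML-table conversion that
# rje_html.tableToHTML() is used for above (the real helper's options, such as tabid,
# are not reproduced; helper name below is illustrative).
def table_to_html(ctext, delimit='\t'):
    '''Convert header + data rows of delimited text into a simple HTML table.'''
    rows = [line.split(delimit) for line in ctext.strip().split('\n')]
    html = ['<table>']
    html.append('<tr>' + ''.join('<th>%s</th>' % h for h in rows[0]) + '</tr>')
    for row in rows[1:]:
        html.append('<tr>' + ''.join('<td>%s</td>' % v for v in row) + '</tr>')
    html.append('</table>')
    return '\n'.join(html)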
def mapPhosByBLAST(self,fasfile): ### BLAST sequences against phosphoDB, align hits & mark sites (ID & Homology) '''BLAST sequences against phosphoDB, align hits and mark phosphosites (ID & Homology).''' try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### ## ~ [1a] Setup fasfile ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## scmd = self.cmd_list + ['seqin=%s' % fasfile,'autoload=T','autofilter=F'] qseqlist = rje_seq.SeqList(self.log,scmd) qdict = qseqlist.seqNameDic() ## ~ [1b] Setup results files/directories ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## basefile = rje.baseFile(fasfile) if self.info['PhosRes'].lower() in ['','none']: self.info['PhosRes'] = '%s.phosres.tdt' % basefile headers = ['Name','Pos','AA','PELM','PELMPos','Evidence'] delimit = rje.getDelimit(self.cmd_list,rje.delimitFromExt(filename=self.info['PhosRes'])) rje.delimitedFileOutput(self,self.info['PhosRes'],headers,delimit,rje_backup=True) ppath = rje.makePath('PhosALN') rje.mkDir(self,ppath) ## ~ [1c] Setup BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## pblast = rje_blast.BLASTRun(self.log,self.cmd_list+['formatdb=F']) pblast.setInfo({'Name':'%s.p.blast' % rje.baseFile(fasfile),'DBase':self.info['PELMFas'],'InFile':fasfile}) pblast.setStat({'HitAln':pblast.stat['OneLine']}) pblast.opt['Complexity Filter'] = False pblast.formatDB(force=False) ## ~ [1d] Setup GABLAM Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## gkey = 'GABLAMO ID' #x# % self.info['GABLAMO Key'] for g in ['ID','Hom']: if self.stat['%sSim' % g] < 1.0: self.stat['%sSim' % g] *= 100.0 self.stat['%sSim' % g] = max(0.0,self.stat['%sSim' % g]) ### ~ [2] PhosphoBLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### pblast.blast(use_existing=True,log=True) # BLAST pblast.readBLAST(gablam=True) # Read in while pblast.search: ## ~ [2a] Align relevant hits from each BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## search = pblast.search.pop(0) qseq = qdict[search.info['Name']] idlist = [] qlen = qseq.aaLen() hitdict = search.hitSeq(self.obj['SeqList']) aln = rje_seq.SeqList(self.log,self.cmd_list+['autoload=F','autofilter=F']) aln.seq = [qseq] pdict = {} # Dictionary of {hseq:[poslist]} rdict = {qseq:0} # Dictionary of {hseq:res} for hit in search.hit[0:]: hseq = hitdict[hit] pdict[hseq] = [] for pos in rje.sortKeys(self.dict['PhosphoSites'][hseq.info['AccNum']]): pdict[hseq].append(pos) if hit.info['Name'] == search.info['Name']: if qseq.getSequence(case=False,gaps=False) != hseq.getSequence(case=False,gaps=False): self.log.errorLog('Major problem: Search/Hit sequence mismatch for same sequence "%s"' % hit.info['Name']) idlist.append(qseq) pdict[qseq] = pdict.pop(hseq) continue gdict = hit.globalFromLocal(qlen) qvh = float(100 * gdict['Query'][gkey]) / float(qlen) if qvh < self.stat['HomSim']: pdict.pop(hseq) continue aln.seq.append(hseq) if (qseq.sameSpec(hseq) or not self.opt['UseSpec']) and qvh >= self.stat['IDSim']: idlist.append(hseq) rdict[hseq] = 0 aln.muscleAln() #x#outfile='%s%s.phosaln.fas' % (ppath,qseq.info['AccNum'])) aln._addSeq('PhosAln','-' * qseq.seqLen()) aln.info['Name'] = '%s%s.phosaln.fas' % (ppath,qseq.info['AccNum']) ## ~ [2b] Map phosphorylations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## print '>>>\n', aln.seq, pdict.keys(), rdict.keys() for a in range(qseq.seqLen()): if qseq.info['Sequence'][a] != '-': rdict[qseq] += 1 for hseq in pdict: if 
hseq.info['Sequence'][a] == '-': continue if hseq != qseq: rdict[hseq] += 1 if rdict[hseq] in pdict[hseq] and qseq.info['Sequence'][a] == hseq.info['Sequence'][a]: # Phosphosite pdata = {'Name':search.info['Name'],'Pos':rdict[qseq],'AA':qseq.info['Sequence'][a], 'PELM':hseq.shortName(),'PELMPos':rdict[hseq],'Evidence':'Hom'} if hseq == qseq: pdata['Evidence'] = 'Self' elif hseq in idlist: pdata['Evidence'] = 'ID' rje.delimitedFileOutput(self,self.info['PhosRes'],headers,delimit,pdata) self.addPhos(aln.seq[-1],a,pdata['Evidence']) ## ~ [2c] Add Scansite/NetPhos if made? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## ## ~ [2d] Save alignment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## aln.saveFasta() # Align hits for each > X %ID # Map phosphosites onto alignment and output # return except: self.log.errorLog('Problem during PhosphoSeq.mapPhosByBLAST')
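# Minimal standalone sketch (Python 3) of the core mapping step in mapPhosByBLAST() above: walk an
# aligned query/hit pair in parallel, keep ungapped residue counters for each, and transfer a
# phosphosite when the hit counter lands on a known site and the aligned residues match.
# Illustration only, not the rje_seq/PhosphoSeq API.
def map_phosphosites(qry_aln, hit_aln, hit_sites):
    '''qry_aln/hit_aln: aligned (gapped) sequences of equal length.
    hit_sites: set of 1-based ungapped positions phosphorylated in the hit.
    Returns a list of (query_pos, aa, hit_pos) tuples for transferred sites.'''
    assert len(qry_aln) == len(hit_aln)
    qpos = hpos = 0
    mapped = []
    for qa, ha in zip(qry_aln, hit_aln):
        if qa != '-': qpos += 1
        if ha != '-': hpos += 1
        if qa == '-' or ha == '-': continue
        if hpos in hit_sites and qa == ha:  # same residue aligned over a known site
            mapped.append((qpos, qa, hpos))
    return mapped

# The serine at hit position 3 maps onto query position 3 despite the hit insertion:
print(map_phosphosites('MKS-TE', 'MKSYTE', {3}))   # [(3, 'S', 3)]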
def buildPam(self): ### Builds PAM Matrix in memory '''Builds PAM matrix in memory.''' try: ### Check for Alternative PAM Matrix ### if self.info['AltPam'].lower() not in ['','none']: self.altPAM() self.verbose(0,3,"Reading PAM1 matrix from %s" % self.info['Name'],2) ### <a> ### Open file & Read Lines pamfiles = [self.info['Name'],rje.makePath(self.info['Path']) + self.info['Name'],rje.makePath(self.info['Path']) + rje.makePath('../data/') + self.info['Name']] self.info['Name'] = None for pfile in pamfiles: if rje.checkForFile(pfile): file_lines = open(pfile, 'r').readlines() self.info['Name'] = pfile break if not self.info['Name']: for pfile in pamfiles: self.printLog('#ERR','File "%s" not found' % pfile) self.printLog('#ERR','No PAM file found!') raise ValueError ### <b> ### Read in alphabet self.verbose(0,3,file_lines[0],1) if file_lines[0].upper().find('X') >= 0: self.opt['X-Value'] = False if file_lines[0].find('-') >= 0: self.opt['GapValue'] = False self.alphabet = file_lines[0].split() ### <c> ### Make PAM0 ## <i> ## Clear dics zeropamp = {} for r in self.alphabet: for c in self.alphabet: zeropamp[r + c] = 0 zeropamp[r + r] = 1 if self.opt['X-Value']: zeropamp['X' + r] = 1 zeropamp[r + 'X'] = 1 if self.opt['GapValue']: zeropamp['-' + r] = 1 zeropamp[r + '-'] = 1 if self.opt['X-Value']: zeropamp['XX'] = 1 if self.opt['GapValue']: zeropamp['--'] = 1 if self.opt['X-Value'] and self.opt['GapValue']: zeropamp['-X'] = 1 zeropamp['X-'] = 1 ## <ii> ## New Matrix newmatrix = PAM(pam=0,rawpamp=zeropamp,alpha=self.alphabet) self.matrix.append(newmatrix) ## <d> ## Read in PAM1 rawpamp = {} line = 1 for r in self.alphabet: pamline = file_lines[line].split() if len(pamline) != (len(self.alphabet)+1): self.log.errorLog("%s has wrong format! Does not match %s" % (pamline, self.alphabet),printerror=False,quitchoice=True) raise for c in range(int(len(self.alphabet))): prob = float(pamline[c+1]) rawpamp[r + self.alphabet[c]] = prob if self.opt['X-Value']: rawpamp['X' + r] = 1 rawpamp[r + 'X'] = 1 if self.opt['GapValue']: rawpamp['-' + r] = 1 rawpamp[r + '-'] = 1 line += 1 if self.opt['X-Value']: rawpamp['XX'] = 1 if self.opt['GapValue']: rawpamp['--'] = 1 if self.opt['X-Value'] and self.opt['GapValue']: rawpamp['-X'] = 1 rawpamp['X-'] = 1 newmatrix = PAM(pam=1,rawpamp=rawpamp,alpha=self.alphabet) self.matrix.append(newmatrix) ## <e> ## Raise to pammax self.log.printLog('\r#PAM','Building PAM Matrices <= %d: ' % self.stat['PamMax'],log=False,newline=False) self.pamUp() self.log.printLog('\r#PAM','Building PAM Matrices <= %d: Complete.' % self.stat['PamMax']) except: self.log.errorLog('Fatal Error in PamCtrl.buildPam().') raise
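# The '### <e> ### Raise to pammax' step above builds higher PAM matrices by repeatedly
# multiplying the PAM1 transition probabilities. A minimal standalone sketch using the same
# {row+col: prob} keying as rawpamp, on a toy two-letter alphabet (values are illustrative,
# not a real PAM1).
def pam_multiply(pam_a, pam_b, alphabet):
    '''Matrix product: probability of r -> c over the combined interval.'''
    out = {}
    for r in alphabet:
        for c in alphabet:
            out[r + c] = sum(pam_a[r + x] * pam_b[x + c] for x in alphabet)
    return out

alphabet = ['A', 'B']
pam1 = {'AA': 0.99, 'AB': 0.01, 'BA': 0.02, 'BB': 0.98}     # each row sums to 1.0
pam = [dict((r + c, float(r == c)) for r in alphabet for c in alphabet)]  # PAM0 = identity
for n in range(1, 5):                                        # build PAM1..PAM4
    pam.append(pam_multiply(pam[-1], pam1, alphabet))
print(pam[2]['AB'])   # two-step probability of A -> B (0.99*0.01 + 0.01*0.98 = 0.0197)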
def codons(self): ### Main codons analysis method '''Main codons analysis method.''' try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### flybase = rje.makePath('/scratch/Databases/NewDB/FlyBase/Fasta/') scmd = ['accnr=F','seqnr=F','gnspacc=F'] cds = rje_seq.SeqList(self.log, self.cmd_list+['seqin=%sdmel-all-CDS-r5.5.fasta' % flybase]+scmd) gcode = rje_sequence.genetic_code ### ~ [1] ~ Make codon frequency tables (a) Observed, (b) Based on NTFreq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### nts = ['A','C','G','T'] ntfreq = cds.aaFreq(alphabet=nts) codons = [] # List of codons obs_cfreq = {} # Observed codon frequencies nts_cfreq = {} # Codon frequencies from NT frequencies obs_tfreq = {} # Observed triplet frequencies nts_tfreq = {} # Predicted triplet frequencies from NT frequencies ocd_tfreq = {} # Predicted triplet frequencies from observed codon frequencies ncd_tfreq = {} # Predicted triplet frequencies from nt-predicted codon frequencies ## ~ [1a] ~ Setup dictionaries using nt freqs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## for n1 in nts: for n2 in nts: for n3 in nts: cod = '%s%s%s' % (n1,n2,n3) codons.append(cod) aa = gcode[string.replace(cod,'T','U')] if aa not in obs_cfreq: obs_cfreq[aa] = {} if aa not in nts_cfreq: nts_cfreq[aa] = {} obs_cfreq[aa][cod] = 0.0 nts_cfreq[aa][cod] = ntfreq[n1] * ntfreq[n2] * ntfreq[n3] obs_tfreq[cod] = 0.0 nts_tfreq[cod] = ntfreq[n1] * ntfreq[n2] * ntfreq[n3] ocd_tfreq[cod] = 0.0 ncd_tfreq[cod] = 0.0 nts_tfreq = rje.dictFreq(nts_tfreq,total=False) # Normalise triplet freq. for aa in nts_cfreq: nts_cfreq[aa] = rje.dictFreq(nts_cfreq[aa],total=False) # Normalise codon freq. self.log.printLog('#FREQ','Frequency dictionaries set up.') ## ~ [1b] ~ Observed codon freq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## (sx,stot) = (0.0,cds.seqNum()) for seq in cds.seq[0:]: self.log.printLog('\r#OBS','Calculating observed codon frequencies: %.1f%%' % (sx/stot),newline=False,log=False) sx += 100.0 try: (id,scaffold,pos,name,glen,parent) = rje.matchExp('^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);.+parent=(\S+),\S+;',seq.info['Name']) except: self.log.errorLog(seq.info['Name']) raise try: exons = rje.matchExp('^complement\((\d+\..*\.\d+)\)',pos)[0] except: try: exons = rje.matchExp('^join\((\d+\..*\.\d+)\)',pos)[0] except: exons = rje.matchExp('^(\d+\.\.\d+)',pos)[0] self.deBug(exons) exons = string.split(exons,',') elen = [] try: for exon in exons: (start,end) = string.split(exon,'..') elen.append(string.atoi(end) - string.atoi(start) + 1) except: self.log.errorLog(id) cds.seq.remove(seq) continue if pos[:4] == 'comp': elen.reverse() seq.list['ExonLen'] = elen self.deBug(elen) if sum(elen) != seq.aaLen(): self.log.errorLog('%s exon length error' % id,printerror=False) if seq.aaLen()/3 != seq.aaLen()/3.0: self.log.errorLog('%s not a multiple of 3nt long!' % id,printerror=False) cds.seq.remove(seq) continue #!# Add use exon option - single full-length exon if false (mature mRNA) #!# sequence = seq.info['Sequence'][0:] if string.count(sequence,'N') > 0: self.log.errorLog('%s has 1+ Ns!' % id,printerror=False) cds.seq.remove(seq) continue while sequence: cod = sequence[:3] sequence = sequence[3:] aa = gcode[string.replace(cod,'T','U')] obs_cfreq[aa][cod] += 1 for aa in obs_cfreq: obs_cfreq[aa] = rje.dictFreq(obs_cfreq[aa],total=False) # Normalise codon freq. self.log.printLog('\r#OBS','Calculating observed codon frequencies complete.') ### ~ [2] ~ Generate Triplet freq. 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### (sx,stot) = (0.0,cds.seqNum()) for seq in cds.seq: self.log.printLog('\r#TRIP','Calculating triplet frequencies: %.1f%%' % (sx/stot),newline=False,log=False) sx += 100.0 elen = seq.list['ExonLen'] sequence = seq.info['Sequence'][0:] aa = '' cod = '' ax = 0 # Measure sequence length processed for exon boundary checks while sequence: prevcod = cod cod = sequence[:3] prevaa = aa sequence = sequence[3:] aa = gcode[string.replace(cod,'T','U')] ## ~ [2a] ~ Predicted Triplet Freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## for cod2 in obs_cfreq[aa]: if elen[0] > ax + 3: # Exon boundary beyond this codon ocd_tfreq[cod2] += obs_cfreq[aa][cod2] ncd_tfreq[cod2] += nts_cfreq[aa][cod2] if prevaa: # Look at overlap with previous codon for cod1 in obs_cfreq[prevaa]: for i in range(1,3): if elen[0] > ax + i: # Exon boundary beyond overlap acod = cod1[i:] + cod2[:i] ocd_tfreq[acod] += (obs_cfreq[prevaa][cod1] * obs_cfreq[aa][cod2]) ncd_tfreq[acod] += (nts_cfreq[prevaa][cod1] * nts_cfreq[aa][cod2]) ## ~ [2b] ~ Observed Triplet Freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## if elen[0] > ax + 3: # Exon boundary beyond this codon obs_tfreq[cod] += 1 if prevcod: # Look at overlap with previous codon for i in range(1,3): if elen[0] > ax + i: # Exon boundary beyond overlap acod = prevcod[i:] + cod[:i] obs_tfreq[acod] += 1 # Check exons # ax += 3 if ax >= elen[0]: ax -= elen.pop(0) obs_tfreq = rje.dictFreq(obs_tfreq,total=False) ocd_tfreq = rje.dictFreq(ocd_tfreq,total=False) ncd_tfreq = rje.dictFreq(ncd_tfreq,total=False) self.log.printLog('\r#TRIP','Calculating triplet frequencies complete.') ### ~ [3] ~ Output results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### headers = ['Triplet','AA','Degen','Obs_Codon','NT_Codon','Obs_Trip','NT_Trip','ObCod_Trip','NTCod_Trip'] tfile = 'quad_triplet.tdt' rje.delimitedFileOutput(self,tfile,headers,rje_backup=True) for cod in codons: aa = gcode[string.replace(cod,'T','U')] datadict = {'Triplet':cod,'AA':aa,'Degen':len(obs_cfreq[aa]),'Obs_Codon':obs_cfreq[aa][cod], 'NT_Codon':nts_cfreq[aa][cod],'Obs_Trip':obs_tfreq[cod],'NT_Trip':nts_tfreq[cod], 'ObCod_Trip':ocd_tfreq[cod],'NTCod_Trip':ncd_tfreq[cod]} rje.delimitedFileOutput(self,tfile,headers,datadict=datadict) self.log.printLog('#OUT','Triplet & codon data output to %s' % tfile) except: self.log.errorLog(rje_zen.Zen().wisdom())
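# Standalone sketch (Python 3) of the codon-frequency bookkeeping in codons(): observed
# per-amino-acid codon usage tallied straight from a CDS, and the null expectation built from
# single-nucleotide frequencies. gcode maps codon -> amino acid; only a toy subset is given
# here, not the full rje_sequence genetic code.
from collections import Counter, defaultdict
from itertools import product

def codon_usage(cds, gcode):
    '''Return {aa: {codon: freq}} with frequencies normalised within each amino acid.'''
    counts = defaultdict(Counter)
    for i in range(0, len(cds) - len(cds) % 3, 3):
        cod = cds[i:i+3]
        counts[gcode[cod]][cod] += 1
    return {aa: {c: n / sum(cnt.values()) for c, n in cnt.items()} for aa, cnt in counts.items()}

def nt_expected_usage(ntfreq, gcode):
    '''Per-aa codon frequencies expected if codons were drawn from nt frequencies alone.'''
    raw = defaultdict(dict)
    for n1, n2, n3 in product('ACGT', repeat=3):
        cod = n1 + n2 + n3
        if cod in gcode: raw[gcode[cod]][cod] = ntfreq[n1] * ntfreq[n2] * ntfreq[n3]
    return {aa: {c: p / sum(d.values()) for c, p in d.items()} for aa, d in raw.items()}

gcode = {'AAA': 'K', 'AAG': 'K', 'GAT': 'D', 'GAC': 'D'}          # toy subset
print(codon_usage('AAAAAGAAAGAT', gcode)['K'])                     # {'AAA': 0.667, 'AAG': 0.333}
print(nt_expected_usage({'A': 0.4, 'C': 0.1, 'G': 0.2, 'T': 0.3}, gcode)['D'])  # GAT 0.75, GAC 0.25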
def makeFlySeq(self): ### Main run method '''Main run method.''' try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### flybase = rje.makePath('/scratch/Databases/NewDB/FlyBase/Fasta/') scmd = ['accnr=F','seqnr=F','gnspacc=F'] genes = rje_seq.SeqList(self.log, self.cmd_list+['seqin=%sdmel-all-gene-r5.5.fasta' % flybase]+scmd) cds = rje_seq.SeqList(self.log, self.cmd_list+['seqin=%sdmel-all-CDS-r5.5.fasta' % flybase]+scmd) exons = rje_seq.SeqList(self.log, self.cmd_list+['seqin=%sdmel-all-exon-r5.5.fasta' % flybase]+scmd) ### ~ [1] ~ Read in full-length gene and note start and end positions in parent scaffold ~~~~~~~~~~~~~~~~ ### genedict = {} # Dictionary of {ID:Sequence object} (gx,gtot) = (0.0,genes.seqNum()) for gene in genes.seq: self.log.printLog('\r#GENE','Processing Gene Annotation: %.1f%%' % (gx/gtot),newline=False,log=False) gx += 100 (id,scaffold,pos,name,glen) = rje.matchExp('^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);',gene.info['Name']) if string.atoi(glen) != gene.aaLen(): self.log.errorLog('%s Length mismatch!' % id, printerror=False) genedict[id] = gene gene.setInfo({'Scaffold':scaffold,'Gene':name}) try: (end,start) = rje.matchExp('^complement\((\d+)\.\.(\d+)\)',pos) except: (start,end) = rje.matchExp('^(\d+)\.\.(\d+)',pos) (start,end) = (string.atoi(start),string.atoi(end)) gene.opt['Complement'] = start > end # Sequence on "lagging" strand gene.setStat({'Start':start,'End':end}) gene.list['CDS'] = [] # Will add CDS sequences here gene.list['Exon'] = [] # Will add exon sequences here self.log.printLog('\r#GENE','Processing Gene Annotation complete!') ### ~ [2] ~ Read in associated CDS sequences and note start and end positions ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### (cx,ctot) = (0.0,cds.seqNum()) for seq in cds.seq: self.log.printLog('\r#CDS','Processing CDS Annotation: %.1f%%' % (cx/ctot),newline=False,log=False) cx += 100 try: (id,scaffold,pos,name,glen,parent) = rje.matchExp('^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);.+parent=(\S+),\S+;',seq.info['Name']) except: self.log.errorLog(seq.info['Name']) raise if string.atoi(glen) != seq.aaLen(): self.log.errorLog('%s Length mismatch!' 
% id, printerror=False) seq.obj['Parent'] = gene = genedict[parent] try: (end,start) = rje.matchExp('^complement\((\d+)\..*\.(\d+)\)',pos) except: try: (start,end) = rje.matchExp('^join\((\d+)\..*\.(\d+)\)',pos) except: (start,end) = rje.matchExp('^(\d+)\.\.(\d+)',pos) (start,end) = (string.atoi(start),string.atoi(end)) seq.opt['Complement'] = start > end # Sequence on "lagging" strand seq.setStat({'Start':start,'End':end}) gene.list['CDS'].append(seq) self.log.printLog('\r#CDS','Processing CDS Annotation complete!') ### ~ [3] ~ Read in associated exons and note start and end positions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### (ex,etot) = (0.0,exons.seqNum()) for seq in exons.seq: self.log.printLog('\r#EXON','Processing Exon Annotation: %.1f%%' % (ex/etot),newline=False,log=False) ex += 100 try: (id,scaffold,pos,name,parent) = rje.matchExp('^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+parent=(\S+);',seq.info['Name']) except: self.log.errorLog(seq.info['Name']) raise seq.obj['Parent'] = gene = genedict[string.split(parent,',')[0]] try: (end,start) = rje.matchExp('^complement\((\d+)\..*\.(\d+)\)',pos) except: try: (start,end) = rje.matchExp('^join\((\d+)\..*\.(\d+)\)',pos) except: (start,end) = rje.matchExp('^(\d+)\.\.(\d+)',pos) (start,end) = (string.atoi(start),string.atoi(end)) seq.opt['Complement'] = start > end # Sequence on "lagging" strand seq.setStat({'Start':start,'End':end}) gene.list['Exon'].append(seq) self.log.printLog('\r#EXON','Processing Exon Annotation complete!') ### ~ [4] ~ Regenerate output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### ## ~ [4a] ~ Convert to relative positions and store ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## (gx,gtot) = (0.0,genes.seqNum()) for gene in genes.seq: glen = gene.aaLen() self.log.printLog('\r#GENE','Generating new Gene Annotation: %.1f%%' % (gx/gtot),newline=False,log=False) gx += 100 clist = [] for seq in gene.list['CDS']: if gene.opt['Complement']: # Must substract from "wrong" end and reverse start = gene.stat['Start'] - seq.stat['Start'] end = gene.stat['Start'] - seq.stat['End'] else: start = seq.stat['Start'] - gene.stat['Start'] end = seq.stat['End'] - gene.stat['Start'] pos = '%s-%s' % (rje.preZero(start,glen),rje.preZero(end,glen)) clist.append(pos) clist = rje.sortUnique(clist,xreplace=False) elist = [] for seq in gene.list['Exon']: if gene.opt['Complement']: # Must substract from "wrong" end and reverse start = gene.stat['Start'] - seq.stat['Start'] end = gene.stat['Start'] - seq.stat['End'] else: start = seq.stat['Start'] - gene.stat['Start'] end = seq.stat['End'] - gene.stat['Start'] pos = '%s-%s' % (rje.preZero(start,glen),rje.preZero(end,glen)) elist.append(pos) elist = rje.sortUnique(elist,xreplace=False) gene.info['Name'] = '%s_%s__%s Length=%d; CDS=%s; Exons=%s;' % (gene.info['Gene'],gene.info['SpecCode'],gene.info['AccNum'],gene.aaLen(),string.join(clist,','),string.join(elist,',')) self.log.printLog('\r#GENE','Generating new Gene Annotation complete!') ## ~ [4b] ~ Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## genes.saveFasta(seqfile='flybase_DROME.genes.fas') except: self.log.errorLog(rje_zen.Zen().wisdom())
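# Sketch of the coordinate conversion in step [4a] above: CDS/exon positions are given on the
# scaffold and are re-expressed relative to the parent gene's own start, with complement-strand
# genes measured from the "high" end (their Start is the larger scaffold coordinate, as parsed
# from the complement(..) loc string). Standalone illustration only.
def relative_span(gene_start, gene_complement, feat_start, feat_end):
    '''Return (start, end) of a feature relative to its parent gene.'''
    if gene_complement:
        return (gene_start - feat_start, gene_start - feat_end)
    return (feat_start - gene_start, feat_end - gene_start)

# Forward-strand gene starting at scaffold position 1000, exon at 1100..1250:
print(relative_span(1000, False, 1100, 1250))   # (100, 250)
# Complement-strand gene with Start=5000; feature parsed with Start=4800, End=4650:
print(relative_span(5000, True, 4800, 4650))    # (200, 350)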
def loadOrthAln(callobj,seq,gopher=True): ### Identifies file, loads and checks alignment. ''' Identifies file, loads and checks alignment. If the identified file is not actually aligned, then RJE_SEQ will try to align the proteins using MUSCLE or ClustalW. >> callobj:Object containing settings for stats generation (MotifList, generally). >> seq:Sequence being analysed. >> gopher:bool [True] = whether to try to generate alignment with GOPHER if callobj.opt['Gopher'] << aln = SeqList object containing alignment with queryseq ''' try: ### Setup Attributes ### v = callobj.stat['Verbose'] alndir = rje.makePath(callobj.info['AlnDir']) alnext = callobj.info['AlnExt'] ### Identify File ### if alnext[0] != '.': alnext = '.%s' % alnext alnstart = [seq.info['AccNum'],seq.info['ID'],seq.shortName(),None] if v > 2: callobj.log.printLog('#PRESTO','%s' % callobj.opt) #!# Old debugging? #!# if callobj.opt['Gopher'] and callobj.opt['FullForce']: if v > 0: callobj.log.printLog('#ALN','FullForce=T. Will call Gopher for %s regardless of existing files' % seq.shortName()) alnstart = [None] for file in alnstart: if file: file = '%s%s%s' % (alndir,file,alnext) if rje.checkForFile(file): break # File found else: #!# Sort out logging and see if Gopher can be used directly rather than just run() #!# ### Run GOPHER ### if gopher and callobj.opt['Gopher']: #!# Add working version for PRESTO and SlimPickings #!# callobj.deBug('Run GOPHER in %s' % callobj.info['GopherDir']) mydir = os.getcwd() os.chdir(callobj.info['GopherDir']) callobj.log.printLog('\n#GOPHER','Running GOPHER on %s' % seq.shortName()) try: #!# Add log.silent() method? #!# gcmd = ['orthtree'] + callobj.cmd_list + ['gnspacc=T','i=-1'] solo_gopher = gopher_V2.GopherFork(log=callobj.log,cmd_list=gcmd) solo_gopher.info['Name'] = seq.shortName() solo_gopher.obj['Sequence'] = seq solo_gopher.obj['BLAST'] = gopher_V2.Gopher(callobj.log,gcmd).setupBlast() #!# Contemplate setting up Gopher in callobj #!# solo_gopher.obj['BLAST'].log = callobj.log solo_gopher.run('orthalign') #X#gopher_V2.Gopher(callobj.log,gcmd).setMode()) except: os.chdir(mydir) callobj.log.errorLog('Problem with Gopher run!') return None if not 'old_school': inputseq = 'tmp%s.fas' % rje.randomString(8) TMP = open(inputseq,'w') TMP.write('>%s\n%s\n' % (seq.info['Name'],seq.info['Sequence'])) TMP.close() gcmd = ['orthtree'] + callobj.cmd_list + ['gopher=%s' % inputseq, 'gnspacc=T','i=-1'] try: mygopher = gopher_V2.Gopher(log=callobj.log,cmd_list=gcmd) mygopher.run() except: os.chdir(mydir) callobj.log.errorLog('Problem with Gopher run!',printerror=False) return None rje_blast.cleanupDB(callobj,dbfile=inputseq,deletesource=True) os.chdir(mydir) if callobj.opt['Gopher']: file = '%s%s%s' % (alndir,seq.info['AccNum'],alnext) if not os.path.exists(file): file = None if not file: callobj.log.printLog('#ALN','No alignment file found for %s in %s.' % (seq.shortName(),alndir),screen=False) return None ### Load Alignment ### callobj.log.stat['Verbose'] = v - 1 alncmd = ['seqin=None','query=%s' % seq.shortName(),'accnr=F','seqnr=F','autofilter=F','align=T','gnspacc=F'] aln = rje_seq.SeqList(log=callobj.log,cmd_list=callobj.cmd_list+alncmd) #X#print file aln.loadSeqs(seqfile=file,seqtype='Protein',aln=True,nodup=None) callobj.log.stat['Verbose'] = v ## Check Query ## qry = aln.obj['QuerySeq'] if not qry: if aln.querySeq(query=seq.info['AccNum']): qry = aln.obj['QuerySeq'] else: callobj.log.printLog('#ALN','Problem finding %s in %s.' 
% (seq.shortName(),file),screen=False) return None ### Check Alignment ### if aln.seqNum() < 2: callobj.log.printLog('#ALN','Not enough sequences for %s in %s.' % (seq.shortName(),file),screen=False) return None if aln._checkAln(aln=True,realign=True): return aln else: callobj.log.printLog('#ERR','%s not aligned!!!' % (file)) return None except: callobj.log.errorLog('Something bad has happened in rje_motif_stats.loadOrthAln()') callobj.log.stat['Verbose'] = v return None
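# Minimal sketch of the alignment-file lookup at the top of loadOrthAln(): several candidate
# name stems (accession number, ID, short name) are tried against the alignment directory and
# the first existing file wins; None triggers the GOPHER fallback. Paths and extensions below
# are illustrative only.
import os

def find_alignment(alndir, alnext, stems):
    '''stems: candidate file-name stems, tried in order of preference.'''
    if not alnext.startswith('.'): alnext = '.' + alnext
    for stem in stems:
        if not stem: continue
        path = os.path.join(alndir, stem + alnext)
        if os.path.exists(path): return path
    return None

# e.g. find_alignment('ALN', 'orthaln.fas', ['P12345', 'MYPROT_HUMAN', 'MYPROT_HUMAN__P12345'])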
def saveReadMe(self,filename='pydocs.txt',append=False): ### Prints docs for modules to file ''' Prints docs for modules to file. >> filename:str = output file name >> append:boolean ''' try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### pydoc = self.obj['PyDoc'] if append: self.printLog('#DOC','Appending docstrings to %s' % filename) PYDOC = open(filename,'a') else: rje.mkDir(self,filename) self.printLog('#DOC','Writing docstrings to %s' % filename) PYDOC = open(filename,'w') PYDOC.write(self.readMeHeader()) db = self.db('Module') dx = 0 ### ~ [2] Output Docstrings ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### for sourcedir in pydoc.list['SourceDir']: PYDOC.write('-%s:\n\n' % sourcedir) for pyfile in db.dataKeys(): entry = db.data(pyfile) module = entry['Module'] if not pyfile.find(sourcedir) >= 0 or not os.path.exists('%s%s%s.py' % (pydoc.getStr('PyPath'),rje.makePath(sourcedir),module)): continue ## ~ [2a] ~ Module docstring ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## mtxt = '### ~~~ Module %s ~ [%s] ~~~ ###' % (module,pyfile) while len(mtxt) < 122: mtxt = mtxt[:5] + '~' + mtxt[5:-5] + '~' + mtxt[-5:] try: PYDOC.write('%s\n\n%s\n' % (mtxt,entry['DocString'])); dx += 1 except: self.errorLog('Cannot write DocString for %s' % module,printerror=False) PYDOC.write('%s\n\nDocString Error!\n' % (mtxt)); dx += 1 PYDOC.write('\n\n\n') PYDOC.close() self.printLog('#DOC','Output to %s complete: %s modules.' % (filename,rje.iStr(dx))) except: self.errorLog('Error in %s.saveReadMe()' % self.prog())
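# The module banner written by saveReadMe() is padded to a fixed width by inserting '~'
# characters symmetrically just inside the '### ~ ... ~ ###' frame. A standalone sketch of
# that padding step (width 122, as in the loop above); the module/path in the example is made up.
def banner(text, width=122):
    line = '### ~~~ %s ~~~ ###' % text
    while len(line) < width:
        line = line[:5] + '~' + line[5:-5] + '~' + line[-5:]
    return line

print(banner('Module rje_example ~ [tools/rje_example.py]'))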
def run(self): ### Main Run method ''' Main Run method. ''' try: ### SLiMDisc Run ### if self.opt['SLiMDisc']: return self.slimDisc() ### TEIRESIAS ### if self.opt['Teiresias']: ## Setup ## seqlist = rje_seq.SeqList(self.log,self.cmd_list) infile = '%s.teiresias.fas' % rje.baseFile(seqlist.info['Name'],True) outfile = '%s.teiresias.out' % rje.baseFile(seqlist.info['Name'],True) run_teiresias = True if rje.isYounger(outfile,infile) == outfile: if self.stat['Interactive'] < 1 or not rje.yesNo('%s and %s exist already. Regenerate?' % (infile,outfile),'N'): run_teiresias = False ## Run TEIRESIAS ## if run_teiresias: seqlist.saveFasta(seqfile=infile,name='Teiresias') ### Saves sequences in fasta format command = rje.makePath(self.info['TeiresiasPath'],True) command += ' -i%s -o%s %s' % (infile,outfile,self.info['TeiresiasOpt']) self.log.printLog('#CMD',command) os.system(command) ## Read Results ## self.verbose(0,2,'Reading TEIRESIAS output from %s...' % outfile,1) self.list['Pattern'] = [] RESULTS = open(outfile,'r') line = RESULTS.readline() while line: if rje.matchExp('^(\d+)\s+(\d+)\s+(\S+)\s+(\d.+\d)$',line): # New pattern self.addTeiresiasPattern(rje.matchExp('^(\d+)\s+(\d+)\s+(\S+)\s+(\d.+\d)$',line)) elif len(line) > 3 and line[0] != '#': self.log.errorLog('Did not recognise line: %s' % line,False,False) line = RESULTS.readline() RESULTS.close() patx = len(self.list['Pattern']) self.log.printLog('#PAT','%s TEIRESIAS patterns read from %s.' % (rje.integerString(patx),outfile)) ## Calculate Information Content ## aafreq = seqlist.aaFreq() self.verbose(0,3,'Calculating Information Content & Length stats...',0) occx = 0 for pattern in self.list['Pattern']: pattern.stat['Info'] = self.calculateScore(pattern.info['Pattern'],aafreq) pattern._makeLength() occx += 1 rje.progressPrint(self,occx,patx/100,patx/10) self.verbose(0,1,'...Done!',2) ## Prepare Results ## delimit = rje.getDelimit(self.cmd_list) if self.info['Name'] == 'None': self.info['Name'] = '%s.teiresias.%s' % (rje.baseFile(seqlist.info['Name'],True),rje.delimitExt(delimit)) if self.opt['MySQL']: # Two tables patfile = os.path.splitext(self.info['Name']) occfile = '%s.occ%s' % (patfile[0],patfile[1]) patfile = '%s.patterns%s' % (patfile[0],patfile[1]) if self.opt['Append']: PATFILE = open(patfile,'a') OCCFILE = open(occfile,'a') else: PATFILE = open(patfile,'w') rje.writeDelimit(PATFILE,['pattern','tot_occ','seq_occ','info','len','fix','wild'],delimit) OCCFILE = open(occfile,'a') rje.writeDelimit(OCCFILE,['seq_id','pos','pattern','pat_match'],delimit) else: if self.opt['Append']: RESFILE = open(self.info['Name'],'a') else: RESFILE = open(patfile,'w') rje.writeDelimit(RESFILE,['Sequence Name','Position','Pattern','Match','Total Occurrences','Num Sequences','Information Content','Length','Fixed','Wildcard'],delimit) ## Save Results ## occx = 0 for pattern in self.list['Pattern']: patstats = [] for stat in ['OccCount','SeqCount','Info','Length','Fixed','Wildcards']: patstats.append('%d' % pattern.stat[stat]) patstats[2] = '%.3f' % pattern.stat['Info'] if self.opt['MySQL']: # Two tables rje.writeDelimit(PATFILE,[pattern.info['Pattern']] + patstats,delimit) for occ in rje.sortKeys(pattern.occ): seq = seqlist.seq[occ] for pos in pattern.occ[occ]: match = seq.info['Sequence'][pos:(pos+pattern.stat['Length'])] outlist = [seq.shortName(),'%d' % pos,pattern.info['Pattern'],match] if self.opt['MySQL']: # Two tables rje.writeDelimit(OCCFILE,outlist,delimit) else: rje.writeDelimit(RESFILE,outlist+patstats,delimit) occx += 1 if self.opt['MySQL']: 
# Two tables PATFILE.close() OCCFILE.close() self.log.printLog('#OUT','%s patterns output to %s.' % (rje.integerString(patx),patfile)) self.log.printLog('#OUT','%s pattern occurrences output to %s.' % (rje.integerString(occx),occfile)) else: RESFILE.close() self.log.printLog('#OUT','%s occurrences of %s patterns output to %s.' % (rje.integerString(occx),rje.integerString(patx),self.info['Name'])) ### InfoContent ### elif self.info['Info'] != 'None': ## Setup ## alphabet = rje_seq.alph_protx if not os.path.exists(self.info['Info']): self.log.errorLog('Input file %s missing!' % self.info['Info'],False,False) return False else: mypresto = presto.Presto(self.log,self.cmd_list) mypresto.loadMotifs(file=self.info['Info'],clear=True) seqlist = rje_seq.SeqList(self.log,self.cmd_list+['autoload=T']) if seqlist.seqNum() > 0: aafreq = seqlist.aaFreq(alphabet=None,fromfile=None,loadfile=None,total=False) ### Returns dictionary of AA (& gap etc.) frequencies else: aafreq = {} for aa in alphabet: aafreq[aa] = 1.0 / len(alphabet) alphabet = aafreq.keys() maxinfo = 0 for aa in alphabet: maxinfo += (aafreq[aa] * math.log(aafreq[aa],2)) ## Output ## delimit = rje.getDelimit(self.cmd_list) ext = rje.delimitExt(delimit) outfile = '%s.info.%s' % (rje.baseFile(self.info['Info'],True,['.txt','.%s' % ext]),ext) if self.opt['Append']: OUTFILE = open(outfile,'a') else: OUTFILE = open(outfile,'w') rje.writeDelimit(OUTFILE,['motif','pattern','info'],delimit) ## Calculate Information Scores ## for motif in mypresto.motif: self.verbose(2,4,motif.info['Sequence'],0) pattern = string.replace(motif.info['Sequence'],'X','.') elements = string.split(pattern,'-') pattern = '' for el in elements: if el.find('.{') == 0: # Ambiguous spacer length - compress pattern += '.' else: pattern += el self.verbose(2,2,'=> %s' % pattern,1) motif.stat['Info'] = self.calculateInformationContent(pattern,aafreq,maxinfo,self.stat['InfoGapPen']) self.verbose(0,3,'%s (%s) = %.2f' % (motif.info['Name'],pattern,motif.stat['Info']),1) ## Output ## rje.writeDelimit(OUTFILE,[motif.info['Name'],pattern,'%.2f' % motif.stat['Info']],delimit) ## Finish ## OUTFILE.close() except: self.log.errorLog('Error in run().',printerror=True,quitchoice=False) raise # Delete this if method error not terrible
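# The ### InfoContent ### branch above scores motifs against background amino-acid frequencies.
# calculateInformationContent() itself is not shown in this section, so the sketch below is a
# generic formulation only: each defined position contributes -log2 of the summed background
# probability of its allowed residues, and plain wildcards ('.') contribute nothing.
# Variable-length wildcards and gap penalties (InfoGapPen) are ignored here.
import math

def motif_information(pattern, aafreq):
    '''pattern: simple motif such as 'R.LP[ST]P' (fixed residues, '.', and [..] classes).'''
    info, i = 0.0, 0
    while i < len(pattern):
        if pattern[i] == '.':
            i += 1
            continue
        if pattern[i] == '[':                      # ambiguity class, e.g. [ST]
            j = pattern.index(']', i)
            allowed, i = pattern[i+1:j], j + 1
        else:
            allowed, i = pattern[i], i + 1
        p = sum(aafreq.get(aa, 0.0) for aa in allowed)
        if p > 0: info += -math.log(p, 2)
    return info

uniform = dict((aa, 0.05) for aa in 'ACDEFGHIKLMNPQRSTVWY')
print(round(motif_information('R.LP[ST]P', uniform), 2))   # 4*4.32 + 3.32 = 20.61 bits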
def ANCHOR(self, retry=2): ### Runs ANCHOR disorder prediction '''Runs ANCHOR disorder prediction.''' try: ### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### ## ~ [0a] ~ Setup sequence and temp file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## sequence = self.info['Sequence'].upper() name = self.info['Name'][:4] + rje.randomString(8) tmp = name + '.tmp' ## ~ [0b] ~ Setup ANCHOR ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## apath = self.info['ANCHOR'] if os.path.basename(apath) == 'anchor': apath = os.path.dirname(apath) anchor = rje.makePath(apath) + 'anchor' if not os.path.exists(anchor): self.errorLog('Path "%s" not found!' % anchor, printerror=False) retry = 0 raise IOError ### ~ [1] Run ANCHOR Disorder prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### open(tmp, 'w').write('>%s\n%s\n' % (name, sequence)) acmd = '%s %s -d %s' % (anchor, tmp, apath) dlines = os.popen(acmd).readlines() try: os.unlink(tmp) except: self.errorLog('Cannot delete %s!' % tmp) ### ~ [2] Read in results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### if self.info['Name'] not in ['', 'None']: name = self.info['Name'] self.list['ResidueDisorder'] = [] for d in dlines: if d[:1] == '#': continue if rje.matchExp('^(\d+)\s+(\S)\s+(\S+)', d): dm = rje.matchExp('^(\d+)\s+(\S)\s+(\S+)', d) pos = string.atoi(dm[0]) aa = dm[1] score = string.atof(dm[2]) i = len(self.list['ResidueDisorder']) if sequence[i] != aa: self.log.errorLog( '%s: Position %d is %s in sequence but %s in ANCHOR output!' % (name, pos, sequence[i], aa), printerror=False) raise ValueError if pos != (i + 1): self.log.errorLog( '%s: Position %d reached in ANCHOR output but previous results missing!' % (name, pos), printerror=False) raise ValueError self.list['ResidueDisorder'].append(score) if len(self.list['ResidueDisorder']) != len(sequence): self.log.errorLog( '%s: Sequence = %d aa but ANCHOR results stop at %s!' % (name, len(sequence), len(self.list['ResidueDisorder'])), printerror=False) raise ValueError ### ~ [3] ~ Make Regions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### self.list['RegionDisorder'] = [] self.list['RegionFold'] = [] start = 0 fstart = 0 i = 0 dx = 0 while i < len(sequence): score = self.list['ResidueDisorder'][i] i += 1 if not start and score > self.stat[ 'IUCut']: ### Start new disorder ### start = i elif start and score <= self.stat['IUCut']: ### End! self.list['RegionDisorder'].append((start, i - 1)) dx += i - start start = 0 if not fstart and score <= self.stat[ 'IUCut']: ### Start new fold ### fstart = i elif fstart and score > self.stat['IUCut']: ### End! self.list['RegionFold'].append((fstart, i - 1)) fstart = 0 if start: self.list['RegionDisorder'].append((start, len(sequence))) dx += len(sequence) + 1 - start if fstart: self.list['RegionFold'].append((fstart, len(sequence))) self.minRegion() if self.opt['PrintLog']: self.log.printLog( '\r#DIS', 'ANCHOR Disorder prediction complete: %d disorder regions, %d disordered aa' % (len(self.list['RegionDisorder']), dx)) return True except: if retry: self.printLog('#RETRY', 'Trying %s again...' % name) return self.ANCHOR(retry - 1) self.log.errorLog( 'Error in Disorder.ANCHOR(%s). Disorder prediction failed.' % name) self.list['RegionDisorder'] = [] self.list['RegionFold'] = [] return False
def multiHAQ(self,secondrun=False): ### Executes main HAQESAC runs '''Executes main HAQESAC runs.''' try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### finalrun = secondrun == self.opt['MultiHAQ'] # Whether this is the manual HAQESAC phase qryacc = self.obj['SeqList'].accList() # Full list of Query accession numbers processed = [] # List of processed sequence accession numbers ### ~ [1] Peform HAQESAC runs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### for seq in self.seqs(): ## ~ [1a] Check AutoSkip ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## acc = seq.info['AccNum'] if finalrun and acc in processed and (self.opt['AutoSkip'] or (self.i() >=0 and rje.yesNo('%s already covered by previous HAQESAC. Skip?' % seq.shortName()))): self.printLog('#SKIP','%s already covered by previous HAQESAC: Skipped' % seq.shortName()); continue ## ~ [1b] Check Whether to run (re-runs and low sequence number) ~~~~~~~~~~~~~~~~~~ ## logfile = rje.makePath('%s%s.log' % (self.info['HaqDir'],acc),wholepath=True) infile = rje.makePath('%s%s.fas' % (self.info['HaqDir'],acc),wholepath=True) pkfile = rje.makePath('%s%s.pickle' % (self.info['HaqDir'],acc),wholepath=True) pkzfile = rje.makePath('%s%s.pickle.gz' % (self.info['HaqDir'],acc),wholepath=True) if not os.path.exists(infile): self.printLog('#SKIP','%s input file %s not found: Skipped' % (seq.shortName(),infile)); continue if not finalrun and not self.opt['Force'] and rje.isYounger(pkzfile,infile) == pkzfile: self.printLog('#SKIP','%s run detected: Skipped' % seq.shortName()); continue if not finalrun and not self.opt['Force'] and rje.isYounger(pkfile,infile) == pkfile: self.printLog('#SKIP','%s run detected: Skipped' % seq.shortName()); continue inseqx = rje_seq.SeqCount(self,infile) if inseqx < 2: self.printLog('#SKIP','Only one sequence found in %s: Skipped' % (infile)); continue ## ~ [1c] Pause if running in Chaser Mode and no Pickle ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## pickled = os.path.exists(pkfile) or os.path.exists('%s.gz' % pkfile); tm = 0 while secondrun and self.opt['Chaser'] and not pickled: self.progLog('#WAIT','No %s pickle. Sleeping for %d min.' % (acc,tm)) time.sleep(60*tm); tm += 1 pickled = os.path.exists(pkfile) or os.path.exists('%s.gz' % pkfile) if not pickled: try: rje.choice('Press <ENTER> to try again, or <CTRL+C> to Quit') except: self.printLog('#PICKLE','No %s pickle.' % (acc,tm)) self.printLog('\r#MULTI','Exiting multiHAQ "Chaser" run.'); return ## ~ [1d] Run HAQESAC ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## runhaqesac = True pngfile = rje.makePath('%s%s.png' % (self.info['HaqDir'],acc),wholepath=True) if not self.force() and rje.exists(pngfile): self.printLog('#SKIP','Found evidence of completed run: %s (force=F). Skipping.' % pngfile) runhaqesac = False ancfile = rje.makePath('%s%s.anc.fas' % (self.info['HaqDir'],acc),wholepath=True) if not self.force() and rje.exists(ancfile): self.printLog('#SKIP','Found evidence of completed run: %s (force=F). Skipping.' % ancfile) runhaqesac = False #if not finalrun or self.opt['Force'] or rje.isYounger(logfile,nsfile) != logfile: if runhaqesac: haqcmd = ['ini=haqesac.ini','seqin=%s.fas' % acc, 'query=%s' % acc, 'basefile=%s' % acc, 'newlog=F'] self.printLog('#HAQ','Running HAQESAC for %s - will have own log etc.' 
% seq.shortName(),log=False) os.chdir(self.info['HaqDir']) info = haqesac.makeInfo() haqcmd = rje.getCmdList(haqcmd,info=info) out = rje.Out(cmd_list=haqcmd) # Sets up Out object for controlling output to screen out.printIntro(info) # Prints intro text using details from Info object haqlog = rje.setLog(info,out,haqcmd) # Sets up Log object for controlling log file output try: haqesac.HAQESAC(log=haqlog, cmd_list=haqcmd).run(setobjects=True) except: os.chdir(self.info['RunPath']) if self.i() >= 0 and rje.yesNo('Problem with %s HAQESAC run. Abort?' % seq.shortName()): raise KeyboardInterrupt os.chdir(self.info['RunPath']) if finalrun: self.printLog('#HAQ','HAQESAC final round run for %s' % seq.shortName()) else: self.printLog('#HAQ','HAQESAC first round run for %s' % seq.shortName()) ## ~ [1e] Update ScreenQry ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## if not self.opt['ScreenQry'] or not finalrun: continue qacclist = [] for qacc in rje_seq.SeqList(self.log,['seqin=%s' % infile,'autoload=T','autofilter=F']).accList(): if qacc in qryacc and qacc != acc: qacclist.append(qacc) if qacc in qryacc and qacc not in processed: processed.append(qacc) self.printLog('#QRY','%d other queries found in %s: [%s]' % (len(qacclist),infile,string.join(qacclist,'; '))) self.printLog('#QRY','%d of %d queries processed' % (len(processed),self.seqNum())) ### ~ [2] MultiHAQ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### if not finalrun: self.printLog('#MULTI','Executing second round of multiHAQ'); self.multiHAQ(True) except: self.errorLog('Major problem with MultiHAQ.multiHAQ',quitchoice=True)
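# Sketch of the 'Chaser' wait in step [1c] above: poll for a partner run's pickle, sleeping
# 0, 1, 2, ... minutes between checks. Unlike the interactive prompt in the method, this sketch
# simply gives up after a fixed number of rounds. Illustrative only; the pickle path in the
# usage comment is an assumption.
import os, time

def wait_for_pickle(pkfile, max_rounds=10):
    '''Return True once pkfile (or its .gz) exists, else False after max_rounds checks.'''
    for minutes in range(max_rounds):
        if os.path.exists(pkfile) or os.path.exists(pkfile + '.gz'): return True
        time.sleep(60 * minutes)     # no wait on the first pass, then 1 min, 2 min, ...
    return os.path.exists(pkfile) or os.path.exists(pkfile + '.gz')

# e.g. wait_for_pickle('P12345.HAQESAC/P12345.pickle')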
def makeHTML(self):     ### Generates HTML pages for interactive navigation.
    '''Generates HTML pages for interactive navigation.'''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        basefile = self.basefile()
        scmd = self.cmd_list + ['seqin=%s' % self.getStr('Candidates'),'autoload=T','autofilter=F','seqmode=file']
        candseq = rje_seqlist.SeqList(self.log,scmd)
        # All files and directories are named after basefile:
        # *.fas = original target PROTEIN sequences (with original descriptions)
        scmd = self.cmd_list + ['seqin=%s' % self.getStr('SeqIn'),'autoload=T','autofilter=F','seqmode=file']
        seqlist = rje_seqlist.SeqList(self.log,scmd)
        # *.gablam.tdt = GABLAM results with match details. (Might have *.hmmer.tdt instead.)
        gdb = self.db().addTable('%s.gablam.tdt' % basefile,mainkeys=['Qry','Hit'],name='gablam',expect=False)
        # - Contains candidate proteins as Queries and Target proteins as hits
        # *.HAQESAC/ = directory containing individual HAQESAC runs, named after Hit accnum
        haqdir = rje.makePath('./%s.HAQESAC/' % basefile)
        ### ~ [2] Generate front page ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        hfile = '%s.html' % basefile
        hobj = self.obj['HTML']
        hobj.list['StyleSheets'] = ['http://www.slimsuite.unsw.edu.au/stylesheets/rje_tabber.css',
                                    'http://www.slimsuite.unsw.edu.au/stylesheets/slimhtml.css']
        html = hobj.htmlHead(basefile)
        # Front page should have:
        html += '<h1>%s</h1>\n\n' % basefile
        htabs = []  # (tab_id, tab_html_text[, tab_title])
        # Target protein list (with links to HAQ HTML)
        ctext = '%s\n' % string.join(['Name','Description','Length'],'\t')
        seqdict = seqlist.makeSeqNameDic('short')
        if gdb: hitlist = gdb.indexKeys('Hit')
        else: hitlist = rje.sortKeys(seqdict)
        for name in hitlist:
            seq = seqdict[name]
            cseq = [name,seqlist.seqDesc(seq),'%s aa' % seqlist.seqLen(seq)]
            acc = seqlist.seqAcc(seq)
            if os.path.exists('%s%s.log' % (haqdir,acc)): cseq[0] = '<a href="%s%s.html">%s</a>' % (haqdir,acc,cseq[0])
            ctext += '%s\n' % string.join(cseq,'\t')
        htabs.append(('Hits',rje_html.tableToHTML(ctext,'\t',tabid='parse'),'Target sequences hit by candidates.'))
        # GABLAM/HMM table (with above links)
        if gdb:
            ctext = '%s\n' % string.join(gdb.fields(),'\t')
            for gline in open('%s.gablam.tdt' % basefile,'r').readlines()[1:]:
                gdata = string.split(gline,'\t')
                acc = string.split(gdata[0],'__')[-1]
                gdata[0] = '<a href="http://www.uniprot.org/uniprot/%s" target="_blank">%s</a>' % (acc,gdata[0])
                acc = string.split(gdata[1],'__')[-1]
                gdata[1] = '<a href="%s%s.html">%s</a>' % (haqdir,acc,gdata[1])
                ctext += '%s\n' % string.join(gdata,'\t')
            htabs.append(('GABLAM',rje_html.tableToHTML(ctext,'\t',tabid='parse'),'GABLAM hit table.'))
        # Candidate list (with DB links)
        if candseq.seqNum():
            ctext = '%s\n' % string.join(['AccNum','ID','Description','Length'],'\t')
            accdict = candseq.makeSeqNameDic('accnum')
            for acc in rje.sortKeys(accdict):
                seq = accdict[acc]
                cseq = [acc,candseq.seqID(seq),candseq.seqDesc(seq),'%s aa' % candseq.seqLen(seq)]
                cseq[0] = '<a href="http://www.uniprot.org/uniprot/%s" target="_blank">%s</a>' % (acc,acc)
                ctext += '%s\n' % string.join(cseq,'\t')
            htabs.append(('Candidates',rje_html.tableToHTML(ctext,'\t',tabid='parse'),'Candidate sequences to search.'))
        html += hobj.tabberHTML('GABLAM',htabs)
        html += hobj.htmlTail()
        open(hfile,'w').write(html)
        ### ~ [3] Generate sequence-specific pages ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        #?# Move this to HAQESAC or MultiHAQ
        for i in range(len(hitlist)):
            hit = string.split(hitlist[i],'__')[-1]
            logfile = '%s%s.log' % (haqdir,hit)
            seqbase = logfile[:-4]
            hfile = '%s.html' % seqbase
            html = hobj.htmlHead(seqbase)
            # Front page should have:
            html += '<h1>%s</h1>\n\n' % seqbase
            html += '<p>Click <a href="../%s.html">here</a> to return to results summary. \n' % basefile
            if i: html += 'Previous: <a href="./%s.html"><code>%s</code></a>. \n' % (string.split(hitlist[i-1],'__')[-1],hitlist[i-1])
            if i < len(hitlist)-1: html += 'Next: <a href="./%s.html"><code>%s</code></a>. \n' % (string.split(hitlist[i+1],'__')[-1],hitlist[i+1])
            html += '</p>\n'
            htabs = []  # (tab_id, tab_html_text[, tab_title])
            for ftype in ['png','tree.txt','fas','nwk','log']:
                seqfile = '%s.%s' % (seqbase,ftype)
                if not os.path.exists(seqfile): continue
                tabtext = '<p><a href="./%s">./%s</a></p>\n' % (os.path.basename(seqfile),os.path.basename(seqfile))
                if ftype == 'png':
                    tabtext += '<a href="./%s"><img src="%s" width="100%%"></a>\n' % (os.path.basename(seqfile),os.path.basename(seqfile))
                    tabdesc = 'PNG of %s tree.' % seqbase
                else:
                    tabtext += '<pre>%s</pre>\n' % open(seqfile,'r').read()
                    if ftype == 'tree.txt':
                        for xref in hitlist:
                            reptext = '<a href="./%s.html">%s</a>' % (string.split(xref,'__')[-1],xref)
                            tabtext = string.replace(tabtext,': %s ' % xref,': %s ' % reptext)
                        while rje.matchExp('(: \S+_(\S+)__(\S+) )',tabtext):
                            (oldtext,sid,spec,spacc) = rje.matchExp('(: (\S+)_(\S+)__(\S+) )',tabtext)
                            newtext = ': %s_<a href="http://www.uniprot.org/taxonomy/?query=%s&sort=score" target="_blank">%s</a>__<a href="http://www.uniprot.org/uniprot/%s" target="_blank">%s</a> ' % (sid,spec,spec,spacc,spacc)
                            tabtext = string.replace(tabtext,oldtext,newtext)
                    tabdesc = '%s output' % seqfile
                htabs.append((ftype,tabtext,tabdesc))
            if htabs: html += hobj.tabberHTML(os.path.basename(seqbase),htabs)
            else: html += '<p><i>No output found for <code>%s</code>!</i></p>\n' % hit
            html += hobj.htmlTail()
            open(hfile,'w').write(html)
    except: self.errorLog('Problem with %s.makeHTML()' % self.prog())
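# Illustrative sketch (not part of the original module): makeHTML() builds each tab as plain
# tab-delimited text (header row first, one row per sequence) and passes it to
# rje_html.tableToHTML() for conversion. The helper below shows the kind of transformation
# assumed to take place; the function name and the '<table border="1">' markup are hypothetical
# stand-ins, not the real rje_html implementation. Cells are deliberately not HTML-escaped,
# because makeHTML() embeds <a href> links inside the cell text before conversion.
def tab_text_to_html_table(tabtext, delimit='\t'):
    '''Convert delimited text (first line = header) into a simple HTML table string.'''
    rows = [line.split(delimit) for line in tabtext.strip().split('\n') if line]
    html = ['<table border="1">']
    if rows:
        # Header row uses <th>; remaining rows use <td>.
        html.append('<tr>' + ''.join(['<th>%s</th>' % cell for cell in rows[0]]) + '</tr>')
        for row in rows[1:]:
            html.append('<tr>' + ''.join(['<td>%s</td>' % cell for cell in row]) + '</tr>')
    html.append('</table>')
    return '\n'.join(html)

# Example usage (hypothetical):
# tab_text_to_html_table('Name\tLength\nSeq1\t120 aa\n')
# -> '<table border="1">\n<tr><th>Name</th><th>Length</th></tr>\n<tr><td>Seq1</td><td>120 aa</td></tr>\n</table>'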
def ANCHOR(self,retry=2):   ### Runs ANCHOR disorder prediction
    '''Runs ANCHOR disorder prediction.'''
    try:### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [0a] ~ Setup sequence and temp file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        sequence = self.info['Sequence'].upper()
        name = self.info['Name'][:4] + rje.randomString(8)
        tmp = name + '.tmp'
        ## ~ [0b] ~ Setup ANCHOR ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        apath = self.info['ANCHOR']
        if os.path.basename(apath) == 'anchor': apath = os.path.dirname(apath)
        anchor = rje.makePath(apath) + 'anchor'
        if not os.path.exists(anchor):
            self.errorLog('Path "%s" not found!' % anchor,printerror=False)
            retry = 0; raise IOError
        ### ~ [1] Run ANCHOR Disorder prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        open(tmp,'w').write('>%s\n%s\n' % (name,sequence))
        acmd = '%s %s -d %s' % (anchor,tmp,apath)
        dlines = os.popen(acmd).readlines()
        try: os.unlink(tmp)
        except: self.errorLog('Cannot delete %s!' % tmp)
        ### ~ [2] Read in results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.info['Name'] not in ['','None']: name = self.info['Name']
        self.list['ResidueDisorder'] = []
        for d in dlines:
            if d[:1] == '#': continue
            if rje.matchExp('^(\d+)\s+(\S)\s+(\S+)',d):
                dm = rje.matchExp('^(\d+)\s+(\S)\s+(\S+)',d)
                pos = string.atoi(dm[0])
                aa = dm[1]
                score = string.atof(dm[2])
                i = len(self.list['ResidueDisorder'])
                if sequence[i] != aa:
                    self.log.errorLog('%s: Position %d is %s in sequence but %s in ANCHOR output!' % (name,pos,sequence[i],aa),printerror=False)
                    raise ValueError
                if pos != (i + 1):
                    self.log.errorLog('%s: Position %d reached in ANCHOR output but previous results missing!' % (name,pos),printerror=False)
                    raise ValueError
                self.list['ResidueDisorder'].append(score)
        if len(self.list['ResidueDisorder']) != len(sequence):
            self.log.errorLog('%s: Sequence = %d aa but ANCHOR results stop at %s!' % (name,len(sequence),len(self.list['ResidueDisorder'])),printerror=False)
            raise ValueError
        ### ~ [3] ~ Make Regions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.list['RegionDisorder'] = []
        self.list['RegionFold'] = []
        start = 0
        fstart = 0
        i = 0
        dx = 0
        while i < len(sequence):
            score = self.list['ResidueDisorder'][i]
            i += 1
            if not start and score > self.stat['IUCut']:    ### Start new disorder ###
                start = i
            elif start and score <= self.stat['IUCut']:     ### End!
                self.list['RegionDisorder'].append((start,i-1))
                dx += i - start
                start = 0
            if not fstart and score <= self.stat['IUCut']:  ### Start new fold ###
                fstart = i
            elif fstart and score > self.stat['IUCut']:     ### End!
                self.list['RegionFold'].append((fstart,i-1))
                fstart = 0
        if start:
            self.list['RegionDisorder'].append((start,len(sequence)))
            dx += len(sequence) + 1 - start
        if fstart: self.list['RegionFold'].append((fstart,len(sequence)))
        self.minRegion()
        if self.opt['PrintLog']: self.log.printLog('\r#DIS','ANCHOR Disorder prediction complete: %d disorder regions, %d disordered aa' % (len(self.list['RegionDisorder']),dx))
        return True
    except:
        if retry:
            self.printLog('#RETRY','Trying %s again...' % name)
            return self.ANCHOR(retry-1)
        self.log.errorLog('Error in Disorder.ANCHOR(%s). Disorder prediction failed.' % name)
        self.list['RegionDisorder'] = []
        self.list['RegionFold'] = []
        return False
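# Illustrative sketch (not part of the original module): section [3] of ANCHOR() above turns the
# per-residue score list into 1-based (start,end) tuples for disordered regions (score above the
# cutoff) and ordered "fold" regions (score at or below it). The standalone helper below isolates
# that segmentation logic so it can be tested on its own; the function and variable names are
# hypothetical and it omits the minRegion() post-processing applied by the class.
def score_regions(scores, cutoff):
    '''Return (disorder_regions, fold_regions) as lists of 1-based (start,end) tuples.'''
    disorder = []; fold = []
    dstart = 0; fstart = 0                      # 0 = no region currently open
    i = 0
    while i < len(scores):
        score = scores[i]
        i += 1                                  # i is now the 1-based residue position
        if not dstart and score > cutoff: dstart = i                        # open disorder region
        elif dstart and score <= cutoff: disorder.append((dstart,i-1)); dstart = 0   # close it
        if not fstart and score <= cutoff: fstart = i                       # open fold region
        elif fstart and score > cutoff: fold.append((fstart,i-1)); fstart = 0        # close it
    if dstart: disorder.append((dstart,len(scores)))    # close any region still open at the end
    if fstart: fold.append((fstart,len(scores)))
    return disorder, fold

# Example: six residues, cutoff 0.5 ->
# score_regions([0.1,0.8,0.9,0.2,0.6,0.7],0.5) == ([(2,3),(5,6)], [(1,1),(4,4)])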