Esempio n. 1
0
 def blast2fas(self):    ### Executes BLAST2FAS and copies results files
     '''
     Executes BLAST2FAS and copies results files.

     A run is needed if the 'Force' option is set, or if rje.isYounger(hfile,db) does not return hfile for any
     combination of expected HAQESAC input file (hfile) and Blast2Fas database (db). Accession numbers listed in the
     *.blast2fas_null.txt file are not checked. If a run is needed, rje_seq.Blast2Fas() is called and each
     <AccNum>.blast.fas file found in HAQBLASTDir is renamed to <HaqDir><AccNum>.fas. Accession numbers without such
     a file are appended to the null file.
     << True if BLAST2Fas was run, False if not needed. Errors are logged and re-raised.
     '''
     try:### ~ [1] Setup: decide whether BLAST2Fas must be run ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         need2blast = self.opt['Force']
         null_file = '%s.blast2fas_null.txt' % self.baseFile(); nx = 0; null_accs = set()
         if os.path.exists(null_file):
             # Read with a context manager so that the handle is closed; a set gives fast membership tests
             with open(null_file,'r') as NULL: null_accs = set(NULL.read().split('\n'))
         self.debug(null_file)
         for seq in self.seqs():
             if seq.info['AccNum'] in null_accs: nx += 1; continue
             hfile = rje.makePath('%s%s.fas' % (self.info['HaqDir'],seq.info['AccNum']),wholepath=True)
             for db in self.obj['SeqList'].list['Blast2Fas']:
                 younger = rje.isYounger(hfile,db)   # Evaluated once per file/database pair
                 self.debug(younger)
                 self.debug(younger == hfile)
                 need2blast = need2blast or not younger == hfile
         if not need2blast:
             self.printLog('#BLAST','All HAQESAC input files found (%s w/o BLAST hits) - no BLAST2Fas (force=F)' % nx)
             return False
         ### ~ [2] Execute ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         rje.backup(self,null_file); nx = 0
         # MultiCut takes precedence over BlastCut for the blastb/blastv commands added to the SeqList object
         if self.getInt('MultiCut'): self.obj['SeqList'].cmd_list += ['blastb=%d' % self.getInt('MultiCut'),'blastv=%d' % self.getInt('MultiCut')]
         elif self.getInt('BlastCut'): self.obj['SeqList'].cmd_list += ['blastb=%d' % self.getInt('BlastCut'),'blastv=%d' % self.getInt('BlastCut')]
         if self.getInt('Forks'): self.obj['SeqList'].cmd_list += ['blasta=%d' % self.getInt('Forks')]
         rje_seq.Blast2Fas(self.obj['SeqList'],self.getStr('HAQBLASTDir'))
         for seq in self.seqs():
             acc = seq.info['AccNum']
             sbfile = '%s%s.blast.fas' % (self.getStr('HAQBLASTDir'),acc)
             if os.path.exists(sbfile):
                 hfile = rje.makePath('%s%s.fas' % (self.info['HaqDir'],acc),wholepath=True)
                 os.rename(sbfile,hfile)
                 # Remove any pickle files sharing the base name of the new input file
                 for pfile in ['%s.pickle' % rje.baseFile(hfile),'%s.pickle.gz' % rje.baseFile(hfile)]:
                     if os.path.exists(pfile): os.unlink(pfile)
             else:
                 # No BLAST2Fas output for this sequence: record it (and close the handle straight away)
                 with open(null_file,'a') as NULL: NULL.write('%s\n' % acc)
                 nx += 1
         if nx: self.printLog('#BLAST','%s Accession Numbers without BLAST2Fas hits output to %s' % (nx,null_file))
         self.printLog('#BLAST','%s HAQESAC input files made using BLAST2Fas' % (self.seqNum()-nx))
         return True
     except: self.errorLog('Major problem with MultiHAQ.blast2fas'); raise
Esempio n. 2
0
 def setup(self):    ### Main class setup method.                                                                #V1.0
     '''
     Main class setup method. Selects the run directory, changes into it and calls self.setHosts().
     << True if setup was successful, False (after logging the error) if not.
     '''
     try:
         ### ~ [1] Run directory ~ ###
         # $PBS_O_WORKDIR is used for the job directory; any failure to read it leaves jobdir as None
         jobdir = None
         try:
             jobdir = rje.makePath(os.environ['PBS_O_WORKDIR'])
         except:
             pass
         runpath = self.getStr('RunPath')
         herepath = rje.makePath(os.path.abspath(os.curdir))
         # RunPath is only replaced when it still matches the current directory and a job directory was found
         if runpath == herepath and jobdir:
             self.setStr({'RunPath':jobdir})
         os.chdir(self.getStr('RunPath'))
         ### ~ [2] Hosts ~ ###
         self.setHosts()
     except:
         self.errorLog('Problem during %s setup.' % self)
         return False  # Setup failed
     return True     # Setup successful
Esempio n. 3
0
 def _setAttributes(self):  ### Sets Attributes of Object
     '''
     Sets Attributes of Object: declares the attribute names of each type, applies the generic defaults via
     self._setDefaults() and then sets the attribute-specific default values.
     '''
     ### ~ Attribute names by type ~ ###
     self.strlist = ['Password','RestIn','Rest','RestBase','RestOutDir','RestURL']
     self.boollist = ['PureAPI','RestOut']
     self.intlist = ['MaxRefresh','Refresh']
     self.numlist = []
     self.filelist = []
     self.listlist = ['RestKeys']
     self.dictlist = ['Output','Outfile']
     self.objlist = []
     ### ~ Generic defaults followed by specific values ~ ###
     self._setDefaults(str='None',bool=False,int=0,num=0.0,obj=None,setlist=True,setdict=True,setfile=True)
     strdefaults = {'RestOutDir':rje.makePath('./'),'RestURL':'http://rest.slimsuite.unsw.edu.au/'}
     self.setStr(strdefaults)
     booldefaults = {'PureAPI':False,'RestOut':False}
     self.setBool(booldefaults)
     intdefaults = {'MaxRefresh':600,'Refresh':5}
     self.setInt(intdefaults)
     self.setNum({})
     ### ~ Other Attributes ~ ###
     self._setForkAttributes()  # Delete if no forking
Esempio n. 4
0
 def _setAttributes(self):  ### Sets Attributes of Object
     '''
     Sets Attributes of Object: attribute names by type, generic defaults, specific defaults, a default 'hmm=*.hmm'
     command for the HMM list and an empty self.search list.
     '''
     ### Attribute names ###
     self.infolist = ['SearchDB','HMMOut','HMMTab','HMMerPath']
     self.optlist = ['HMMCalibrate','HMMPFam','GZip','CleanRes']
     self.statlist = []
     self.listlist = ['MakeHMM','HMMRes','HMM','HMMOptions']
     self.dictlist = []
     self.objlist = []
     ### Default values ###
     self._setDefaults(info='None',opt=False,stat=0.0,obj=None,setlist=True,setdict=True)
     infodefaults = {'HMMerPath':rje.makePath('/home/richard/Bioware/hmmer-2.3.2/src/'),'HMMOut':'','HMMTab':''}
     self.setInfo(infodefaults)
     optdefaults = {'HMMCalibrate':True,'GZip':True,'CleanRes':True}
     self.setOpt(optdefaults)
     self._cmdRead(cmd='hmm=*.hmm',type='glist',att='HMM')
     ### Other Attributes ###
     self.search = []
Esempio n. 5
0
 def _setAttributes(self):  ### Sets Attributes of Object
     '''
     Sets Attributes of Object: attribute names by type, generic defaults via self._setDefaults(), then the specific
     info and stat default values.
     '''
     ### Attribute names ###
     self.infolist = ['Sequence','Disorder','IUPath','IUMethod','ANCHOR']
     self.optlist = ['Flat','PrintLog','IUChDir']
     self.statlist = ['IUCut','FILoop','FISleep','MinRegion']
     self.listlist = ['ResidueDisorder','RegionDisorder','RegionFold']
     self.dictlist = []
     self.objlist = []
     ### Default values ###
     self._setDefaults(info='',opt=False,stat=0.2,obj=None,setlist=True,setdict=True)
     iupath = rje.makePath('c:/bioware/iupred/iupred.exe',wholepath=True)
     self.setInfo({'IUPath':iupath,'IUMethod':'short','Disorder':'iupred'})
     self.setStat({'FILoop':10,'FISleep':2,'MinRegion':0,'IUCut':0.2})
Esempio n. 6
0
 def setup(self):    ### Main class setup method.
     '''
     Main class setup method. Prepares the 'RestIn' setting.
     - If RestIn starts with 'http:' it is split on '&'. For each opt=file:FILE element where FILE exists, the
       element becomes opt=CONTENT, with any '&' in the new element converted to '+' (with a warning). Missing files
       trigger a warning with quitchoice=True. The elements are then re-joined with '&' and stored as RestIn.
     - Otherwise RestIn is passed through rje.makePath(RestIn,True) and stored.
     << True if setup was successful, False (after logging the error) if not.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         restin = self.getStr('RestIn')
         ## ~ [1a] Check and modify URL if required ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         # NOTE(review): only 'http:' is recognised here; an 'https:' RestIn is treated as a file path - confirm
         if restin.startswith('http:'):
             #!# Check for rest URL and add if missing
             restcmd = restin.split('&')
             for (i,cmd) in enumerate(restcmd):
                 if '=' not in cmd: continue
                 (opt,value) = cmd.split('=',1)
                 if not value.startswith('file:'): continue
                 # Conversion of cmd=file:FILE into cmd=CONTENT
                 rfile = value.split(':',1)[1]
                 #!# Consider adding max size constraint. Probably a URL size limit.
                 if not rje.exists(rfile):
                     self.warnLog('File "%s" not found.' % rfile,quitchoice=True)
                     continue
                 with open(rfile,'r') as RFILE: rlines = RFILE.readlines()   # Handle closed once read
                 # Lines are joined with a literal backslash-n (two characters), not a newline character
                 restcmd[i] = '%s=%s' % (opt,rje.chomp('\\n'.join(rlines)))
                 if '&' in restcmd[i]:
                     # '&' separates the commands, so it cannot be left inside a single element
                     self.warnLog('%s "&" => "+" conversions for %s.' % (rje.iStr(restcmd[i].count('&')),rfile))
                     restcmd[i] = restcmd[i].replace('&','+')
             self.setStr({'RestIn':'&'.join(restcmd)})
         ## ~ [1b] Direct Parsing of output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         else:   # Convert to file
             self.setStr({'RestIn':rje.makePath(restin,True)})
         return True     # Setup successful
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
Esempio n. 7
0
    def complexFasta(self):  ### Outputs parsed complex datasets in Fasta format
        '''
        Outputs parsed complex datasets in Fasta format: one *_hprd.fas file per complex with sequences, saved in
        the HPRD_Complexes/ subdirectory of OutDir. Errors are logged and re-raised.
        '''
        try:
            ### Output directory ###
            datpath = self.info['OutDir'] + rje.makePath('HPRD_Complexes/')
            rje.mkDir(self,datpath)
            ### One fasta file per complex ###
            for cid in rje.sortKeys(self.dict['Complex']):
                members = []
                for hid in self.dict['Complex'][cid]:
                    # With AllIso the 'Seq' entry is added element by element, otherwise as a single item
                    if self.opt['AllIso']: members.extend(self.dict['HPRD'][hid]['Seq'])
                    else: members.append(self.dict['HPRD'][hid]['Seq'])
                fasfile = '%s%s_hprd.fas' % (datpath,cid)
                if not members: continue
                self.obj['SeqList'].saveFasta(seqs=members,seqfile=fasfile)
            self.log.printLog('#FAS','HPRD complex fasta output complete.')
        except:
            self.log.errorLog('Error in HPRD.complexFasta()',printerror=True,quitchoice=False)
            raise
Esempio n. 8
0
    def saveFasta(self):  ### Outputs parsed PPI datasets in Fasta format
        '''Outputs parsed PPI datasets in Fasta format.'''
        try:
            ### Setup ###
            datpath = self.info['OutDir'] + rje.makePath('HPRD_Datasets/')
            rje.mkDir(self, datpath)
            ## Check Seqs ##
            for p1 in rje.sortKeys(self.dict['PPI']):
                if 'Seq' not in self.dict['HPRD'][p1]:  #!# KeyError #!#
                    print p1, self.dict['HPRD'][p1]
                    self.deBug('No Seq for %s' % p1)

            ### All sequences ###
            self.obj['SeqList'].saveFasta()
            ### Output PPI Datasets ###
            for p1 in rje.sortKeys(self.dict['PPI']):
                mylist = []
                for p2 in self.dict['PPI'][p1]:
                    if self.opt['AllIso']:
                        mylist += self.dict['HPRD'][p2]['Seq']
                    else:
                        mylist.append(self.dict['HPRD'][p2]['Seq'])
                sfile = '%s%s_hprd.fas' % (datpath,
                                           self.dict['HPRD'][p1]['gene'])
                if mylist:
                    self.obj['SeqList'].saveFasta(seqs=mylist, seqfile=sfile)
            self.log.printLog('#FAS', 'HPRD PPI fasta output complete.')
        except:
            self.log.errorLog('Error in HPRD.saveFasta()',
                              printerror=True,
                              quitchoice=False)
Esempio n. 9
0
 def makePPIDatasets(self):  ### Generate PPI datasets from pairwise data
     '''
     Generate PPI datasets from pairwise data. Saves YeastPPI/<hub>.fas for each hub that has at least three spokes
     and at least three spokes with an entry in the SeqDict dictionary. Errors are logged and re-raised.
     '''
     try:
         ### ~ [1] Setup ~ ###
         rje.mkDir(self,'YeastPPI/')
         seqdict = self.dict['SeqDict']
         ppi = self.dict['PPI']
         htot = len(ppi); fx = 0
         ### ~ [2] One fasta file per qualifying hub ~ ###
         for (hi,hub) in enumerate(rje.sortKeys(ppi)):
             # Progress value is 100 x hubs already processed / total hubs
             self.progLog('\r#FAS','Generating %s PPI fasta files: %.2f' % (rje.integerString(fx),100.0 * hi / htot))
             spokes = ppi[hub]
             if len(spokes) < 3: continue
             seqs = [seqdict[spoke] for spoke in spokes if spoke in seqdict]
             if len(seqs) < 3: continue
             hubfile = rje.makePath('YeastPPI/%s.fas' % hub,wholepath=True)
             self.obj['SeqList'].saveFasta(seqs,hubfile,log=False)
             fx += 1
         self.printLog('\r#FAS','Generation of %s PPI fasta files from %s hubs complete.' % (rje.integerString(fx),rje.integerString(htot)))
     except: self.errorLog(rje_zen.Zen().wisdom()); raise  # Delete this if method error not terrible
Esempio n. 10
0
 def setup(self):  ### Main class setup method.                                                                #V1.0
     '''
     Main class setup method. Picks the run directory, changes into it and then calls self.setHosts().
     << True if setup was successful, False (after logging the error) if not.
     '''
     try:
         ## ~ [1a] ~ Job directory from $PBS_O_WORKDIR, or None if that cannot be read ~ ##
         jobdir = None
         try: jobdir = rje.makePath(os.environ['PBS_O_WORKDIR'])
         except: pass
         runpath = self.getStr('RunPath'); herepath = rje.makePath(os.path.abspath(os.curdir))
         # RunPath is switched to the job directory only if it still equals the current directory
         if runpath == herepath and jobdir: self.setStr({'RunPath':jobdir})
         os.chdir(self.getStr('RunPath'))
         ## ~ [1b] ~ Set up hosts ~ ##
         self.setHosts()
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
     return True  # Setup successful
Esempio n. 11
0
 def haqBatch(self,force=False): ### Generates Batch and INI files for HAQESAC runs
     '''
     Generates Batch and INI files for HAQESAC runs.
     >> force:bool [False] = regenerate the files even if both haqesac.bat and haqesac.ini already exist
     << the return value of self.printLog() if existing files are kept, otherwise None.
     Errors are logged via self.errorLog(quitchoice=True) and not re-raised.
     '''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         batfile = rje.makePath('%shaqesac.bat' % self.info['HaqDir'],wholepath=True)
         inifile = rje.makePath('%shaqesac.ini' % self.info['HaqDir'],wholepath=True)
         if force or self.force() or not rje.exists(batfile) or not rje.exists(inifile): rje.backup(self,batfile); rje.backup(self,inifile)
         else: return self.printLog('#HAQBAT','HAQESAC Batch files found.')
         ### ~ [1] Make INI File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # All commands except ini=X are passed on to the INI file
         haqcmd = [cmd for cmd in self.cmd_list if cmd[:4].lower() != 'ini=']
         if self.opt['MultiHAQ']: haqcmd += ['multihaq=T','force=F']
         with open(inifile,'w') as INI: INI.write('\n'.join(haqcmd))
         ### ~ [2] Make Batch file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         batlines = []
         for seq in self.seqs():
             acc = seq.info['AccNum']
             seqcmd = ['seqin=%s.fas' % acc, 'query=%s' % acc, 'basefile=%s' % acc]
             batlines.append('python %shaqesac.py %s\n' % (self.info['Path'],' '.join(seqcmd)))
         # Append all lines through a single handle that is closed afterwards; no file is made without sequences
         if batlines:
             with open(batfile,'a') as BAT: BAT.writelines(batlines)
         self.printLog('#HAQBAT','HAQESAC Batch file output to %s' % batfile)
     except: self.errorLog('Major problem with MultiHAQ.haqBatch',quitchoice=True)
Esempio n. 12
0
 def setup(self):    ### Sets up headers and reads in existing data if present
     '''
     Sets up headers and reads in existing data if present.

     Calls self.readHGNC(), then reads each file in self.list['AltSource'] with rje.dataDict() and merges its
     entries into self.dict['GeneCard'], keyed by upper case gene and additionally by 'Symbol' where given.
     If the 'Update' option is set and the 'CardOut' file exists, it is added to the end of the AltSource list and
     its genes are added to self.list['Genes']. Any new headers found are added to self.list['Headers'].
     Errors are logged and re-raised.
     '''
     try:
         ### ~ Setup Basic Headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         #X#headers = ['Alias','Species','Symbol','HGNC','Entrez','UniProt','EnsEMBL','HPRD','OMIM','EnsLoci','Desc']
         # gc_headers is not defined in this method; presumably a module-level list of headers - TODO confirm
         headers = ['Alias','Species'] + gc_headers  # All other headers added from altsource list
         ### ~ Read in data from existing files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.readHGNC()
         # An existing output file is appended to the source list when updating, so it is read after the others
         if self.opt['Update'] and os.path.exists(self.info['CardOut']): self.list['AltSource'].append(self.info['CardOut'])
         for altsource in self.list['AltSource']:
             sourcefile = rje.makePath(altsource,True)
             if not os.path.exists(sourcefile):
                 self.log.errorLog('Alternative source "%s" missing!' % sourcefile,printerror=False,quitchoice=True)
                 continue
             update = rje.dataDict(self,sourcefile,getheaders=True,ignore=['#'])
             # The 'Headers' entry is removed from the data and used to extend the output headers
             for h in update.pop('Headers'):
                 if h not in headers:
                     headers.append(h)
             self.log.printLog('#DATA','Read GeneCards data for %d genes.' % (len(update)))
             for gene in rje.sortKeys(update):     # Each source will overwrite data from the file before
                 ## ~ Convert to Upper Case for consistency ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 # NOTE(review): update is modified inside this loop, which relies on rje.sortKeys() returning a
                 # separate list of keys rather than a live view - confirm
                 if gene != gene.upper() and gene.upper() in update: continue    # Only use upper case one!
                 elif gene != gene.upper():
                     update[gene.upper()] = update.pop(gene)
                     gene = gene.upper()
                 if gene == '!FAILED!': continue
                 ## ~ Update main dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 # Genes read from the existing output file are added to the gene list when updating
                 if self.opt['Update'] and altsource == self.info['CardOut'] and gene not in self.list['Genes']: self.list['Genes'].append(gene)
                 if gene in self.dict['GeneCard']: rje.combineDict(self.dict['GeneCard'][gene],update[gene])
                 else: self.dict['GeneCard'][gene] = update[gene]
                 ## ~ Temp Debugging ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if gene in self.list['TestGenes']:
                     print gene
                     print update[gene]
                     self.deBug(self.dict['GeneCard'][gene])
                 ## ~ Check Aliases etc. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if 'Symbol' in self.dict['GeneCard'][gene]: self.dict['GeneCard'][gene]['Symbol'] = self.dict['GeneCard'][gene]['Symbol'].upper()
                 # The same data is also stored (or combined without overwriting) under the gene symbol
                 if 'Symbol' in update[gene] and update[gene]['Symbol'] != '!FAILED!':
                     symbol = update[gene]['Symbol']
                     if symbol in self.dict['GeneCard']: rje.combineDict(self.dict['GeneCard'][symbol],update[gene],overwrite=False,replaceblanks=True)
                     else: self.dict['GeneCard'][symbol] = update[gene]
                 self.log.printLog('\r#CARD','Extracted GeneCards data for %d genes.' % (len(self.dict['GeneCard'])),newline=False,log=False)
                 # Flag gene identifiers that contain whitespace
                 if len(string.split(gene)) > 1: print '!!!', gene, '!!!'
         ### ~ Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.log.printLog('\r#CARD','Extracted GeneCards data for %d genes.' % (len(self.dict['GeneCard'])))
         self.list['Headers'] = headers[0:]     # Stored as a copy of the local list
         if self.opt['Update']: self.opt['Append'] = False
         #x#if 'TASP1' in self.dict['GeneCard']: self.deBug(self.dict['GeneCard']['TASP1'])
         #x#else: self.deBug(rje.sortKeys(self.dict['GeneCard']))
     except:
         self.log.errorLog('Problem during GeneCards.setup()')
         raise
Esempio n. 13
0
 def run(self):  ### Performs main run method, including both setup and UniFake
     '''
     Performs main run method, including both setup and UniFake.
     If the 'MakeIndex' option is set, 'UniPath' is set to the directory part of 'DatOut' and
     rje_uniprot.processUniProt() is called with makeindex=True. The 'Interactive' stat is set to -1 for that call
     and is always restored afterwards, even if indexing raises an error.
     '''
     ### ~ [1] ~ Setup aliases and features dictionaries ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     self.setup()
     ### ~ [2] ~ Perform main UniFake file generation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     self.uniFake()
     ### ~ [3] ~ Index files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     if self.opt['MakeIndex']:
         interactive = self.stat['Interactive']
         self.stat['Interactive'] = -1
         try:
             datdir = os.path.split(self.info['DatOut'])[0]
             self.info['UniPath'] = rje.makePath(datdir)
             rje_uniprot.processUniProt(self,makeindex=True,makespec=False,makefas=False)
         finally:
             # Restore the caller's setting whether or not indexing succeeded
             self.stat['Interactive'] = interactive
Esempio n. 14
0
 def _setAttributes(self):   ### Sets Attributes of Object
     '''
     Sets Attributes of Object: attribute names by type, generic defaults via self._setDefaults(), then the
     specific info and opt default values.
     '''
     ### Attribute names ###
     self.infolist = [
         'CardOut','EnsLoci','HGNCData','Species'
     ]
     self.optlist = [
         'FullEns','FullHGNC','Update','Purify','Restrict','UseWeb'
     ]
     self.statlist = []
     self.listlist = [
         'AltSource','Genes','SkipList','TestGenes'
     ]
     self.dictlist = [
         'CardMap','EnsDesc','EnsLoci','GeneCard'
     ]
     self.objlist = []
     ### Default values ###
     # Options default to True; those listed in optdefaults below are then switched off
     self._setDefaults(info='None',opt=True,stat=0.0,obj=None,setlist=True,setdict=True)
     infodefaults = {
         'CardOut':'genecards.tdt',
         'Species':'Human',
         'EnsLoci':rje.makePath('/home/richard/Databases/EnsEMBL/ens_HUMAN.loci.fas',True)
     }
     self.setInfo(infodefaults)
     optdefaults = {'FullEns':False,'Purify':False,'Restrict':False}
     self.setOpt(optdefaults)
Esempio n. 15
0
 def _setAttributes(self):   ### Sets Attributes of Object
     '''
     Sets Attributes of Object: attribute names by type, generic defaults via self._setDefaults(), then the
     specific info and stat default values.
     '''
     ### Attribute names ###
     self.infolist = [
         'Sequence','Disorder','IUPath','IUMethod','ANCHOR'
     ]
     self.optlist = [
         'Flat','PrintLog','IUChDir'
     ]
     self.statlist = [
         'IUCut','FILoop','FISleep','MinRegion'
     ]
     self.listlist = [
         'ResidueDisorder','RegionDisorder','RegionFold'
     ]
     self.dictlist = []
     self.objlist = []
     ### Default values ###
     self._setDefaults(info='',opt=False,stat=0.2,obj=None,setlist=True,setdict=True)
     infodefaults = {
         'IUPath':rje.makePath('c:/bioware/iupred/iupred.exe',wholepath=True),
         'IUMethod':'short',
         'Disorder':'iupred'
     }
     self.setInfo(infodefaults)
     statdefaults = {
         'FILoop':10,
         'FISleep':2,
         'MinRegion':0,
         'IUCut':0.2
     }
     self.setStat(statdefaults)
Esempio n. 16
0
 def saveReadMe(self,
                filename='pydocs.txt',
                append=False):  ### Prints docs for modules to file
     '''
     Prints docs for modules to file.
     >> filename:str = output file name
     >> append:boolean = append to filename rather than starting a new file with self.readMeHeader()
     For each source directory in the PyDoc object's 'SourceDir' list, writes the docstring of each 'Module' table
     entry whose key contains that directory name and whose module *.py file exists.
     Errors are logged via self.errorLog() and not re-raised. The output file is closed even if an error occurs.
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         pydoc = self.obj['PyDoc']
         if append:
             self.printLog('#DOC', 'Appending docstrings to %s' % filename)
             PYDOC = open(filename, 'a')
         else:
             rje.mkDir(self, filename)
             self.printLog('#DOC', 'Writing docstrings to %s' % filename)
             PYDOC = open(filename, 'w')
         dx = 0
         try:
             if not append:
                 PYDOC.write(self.readMeHeader())
             db = self.db('Module')
             ### ~ [2] Output Docstrings ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             for sourcedir in pydoc.list['SourceDir']:
                 PYDOC.write('-%s:\n\n' % sourcedir)
                 for pyfile in db.dataKeys():
                     entry = db.data(pyfile)
                     module = entry['Module']
                     # Only modules listed under this source directory whose file exists are output
                     if sourcedir not in pyfile:
                         continue
                     if not os.path.exists('%s%s%s.py' % (pydoc.getStr('PyPath'),
                                                         rje.makePath(sourcedir), module)):
                         continue
                     ## ~ [2a] ~ Module docstring ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                     mtxt = '### ~~~ Module %s ~ [%s] ~~~ ###' % (module,
                                                                  pyfile)
                     # Pad the banner with '~' on both sides until it is at least 122 characters long
                     while len(mtxt) < 122:
                         mtxt = mtxt[:5] + '~' + mtxt[5:-5] + '~' + mtxt[-5:]
                     try:
                         PYDOC.write('%s\n\n%s\n' % (mtxt, entry['DocString']))
                         dx += 1
                     except:
                         # Best effort: log the problem and write a placeholder for this module instead
                         self.errorLog('Cannot write DocString for %s' % module,
                                       printerror=False)
                         PYDOC.write('%s\n\nDocString Error!\n' % (mtxt))
                         dx += 1
                 PYDOC.write('\n\n\n')
         finally:
             PYDOC.close()   # Never leave the handle open, even when an error is being handled
         self.printLog(
             '#DOC', 'Output to %s complete: %s modules.' %
             (filename, rje.iStr(dx)))
     except:
         self.errorLog('Error in %s.saveReadMe()' % self.prog())
Esempio n. 17
0
 def haqBatch(
         self,
         force=False):  ### Generates Batch and INI files for HAQESAC runs
     '''
     Generates Batch and INI files for HAQESAC runs.
     >> force:bool [False] = regenerate the files even if both haqesac.bat and haqesac.ini already exist
     << the return value of self.printLog() if existing files are kept, otherwise None.
     Errors are logged via self.errorLog(quitchoice=True) and not re-raised.
     '''
     try:  ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         batfile = rje.makePath('%shaqesac.bat' % self.info['HaqDir'],
                                wholepath=True)
         inifile = rje.makePath('%shaqesac.ini' % self.info['HaqDir'],
                                wholepath=True)
         if force or self.force(
         ) or not rje.exists(batfile) or not rje.exists(inifile):
             rje.backup(self, batfile)
             rje.backup(self, inifile)
         else:
             return self.printLog('#HAQBAT', 'HAQESAC Batch files found.')
         ### ~ [1] Make INI File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # All commands except ini=X are passed on to the INI file
         haqcmd = [cmd for cmd in self.cmd_list if cmd[:4].lower() != 'ini=']
         if self.opt['MultiHAQ']: haqcmd += ['multihaq=T', 'force=F']
         with open(inifile, 'w') as INI:
             INI.write('\n'.join(haqcmd))
         ### ~ [2] Make Batch file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         batlines = []
         for seq in self.seqs():
             acc = seq.info['AccNum']
             seqcmd = [
                 'seqin=%s.fas' % acc,
                 'query=%s' % acc,
                 'basefile=%s' % acc
             ]
             batlines.append('python %shaqesac.py %s\n' %
                             (self.info['Path'], ' '.join(seqcmd)))
         # Append all lines through a single handle that is closed afterwards; no file is made without sequences
         if batlines:
             with open(batfile, 'a') as BAT:
                 BAT.writelines(batlines)
         self.printLog('#HAQBAT',
                       'HAQESAC Batch file output to %s' % batfile)
     except:
         self.errorLog('Major problem with MultiHAQ.haqBatch',
                       quitchoice=True)
Esempio n. 18
0
 def _setAttributes(self):   ### Sets Attributes of Object
     '''
     Sets Attributes of Object: attribute names by type, generic defaults via self._setDefaults(), the 'MultiHAQ'
     basefile, specific opt/str default values and finally the forking attributes.
     '''
     ### ~ Attribute names ~ ###
     self.infolist = [
         'HaqDir','HAQBLASTDir'
     ]
     self.optlist = [
         'AddQueries','AutoSkip','Chaser','HAQESAC','MultiHAQ','ScreenQry'
     ]
     self.statlist = [
         'BlastCut','MultiCut'
     ]
     self.listlist = []
     self.dictlist = []
     self.objlist = [
         'SeqList'
     ]
     ### ~ Default values ~ ###
     # Options default to True; those listed in optdefaults below are then switched off
     self._setDefaults(info='None',opt=True,stat=0.0,obj=None,setlist=True,setdict=True)
     self.basefile('MultiHAQ')
     optdefaults = {'Chaser':False,'AutoSkip':False}
     self.setOpt(optdefaults)
     strdefaults = {'HAQBLASTDir':rje.makePath('./HAQBLAST/')}
     self.setStr(strdefaults)
     ### ~ Other Attributes ~ ###
     self._setForkAttributes()   # Delete if no forking
Esempio n. 19
0
 def _setAttributes(self):   ### Sets Attributes of Object
     '''
     Sets Attributes of Object: attribute names by type, generic defaults, specific defaults, a default 'hmm=*.hmm'
     command for the HMM list and an empty self.search list.
     '''
     ### Attribute names ###
     self.infolist = [
         'SearchDB','HMMOut','HMMTab','HMMerPath'
     ]
     self.optlist = [
         'HMMCalibrate','HMMPFam','GZip','CleanRes'
     ]
     self.statlist = []
     self.listlist = [
         'MakeHMM','HMMRes','HMM','HMMOptions'
     ]
     self.dictlist = []
     self.objlist = []
     ### Default values ###
     self._setDefaults(info='None',opt=False,stat=0.0,obj=None,setlist=True,setdict=True)
     infodefaults = {
         'HMMerPath':rje.makePath('/home/richard/Bioware/hmmer-2.3.2/src/'),
         'HMMOut':'',
         'HMMTab':''
     }
     self.setInfo(infodefaults)
     optdefaults = {
         'HMMCalibrate':True,
         'GZip':True,
         'CleanRes':True
     }
     self.setOpt(optdefaults)
     self._cmdRead(cmd='hmm=*.hmm',type='glist',att='HMM')
     ### Other Attributes ###
     self.search = []
Esempio n. 20
0
 def domainFasta(self):    ### Outputs parsed domain and domain PPI datasets in Fasta format
     '''
     Outputs parsed domain tables and domain PPI datasets in Fasta format.
     - HPRD.domains.tdt = tab delimited Domain, HPRD and Gene for each domain-protein pair.
     - HPRD.domsource.tdt = tab delimited Domain and Source for each domain-source pair.
     - HPRD_Domain_Datasets/<domain>_hprd.fas = sequences of all PPI partners of proteins containing the domain.
     All output goes to OutDir. Errors are logged and re-raised.
     '''
     try:
         ### ~ Tab delimited domain-HPRD pairs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         headers = ['Domain','HPRD','Gene']
         dfile = self.info['OutDir'] + 'HPRD.domains.tdt'
         rje.delimitedFileOutput(self,dfile,headers,'\t')
         srcfile = self.info['OutDir'] + 'HPRD.domsource.tdt'
         shead = ['Domain','Source']
         rje.delimitedFileOutput(self,srcfile,shead,'\t')
         domtot = len(self.dict['Domains'])
         dx = 0.0
         for domain in rje.sortKeys(self.dict['Domains']):
             self.log.printLog('\r#DOM','HPRD Domain output (%s): %.1f%%' % (dfile,dx/domtot),newline=False,log=False)
             dx += 100.0
             for hid in self.dict['Domains'][domain]:
                 datadict = {'Domain':domain,'HPRD':hid,'Gene':self.dict['HPRD'][hid]['gene']}
                 rje.delimitedFileOutput(self,dfile,headers,'\t',datadict)
             for source in self.dict['DomainSource'][domain]:
                 datadict = {'Domain':domain,'Source':source}
                 rje.delimitedFileOutput(self,srcfile,shead,'\t',datadict)
         self.log.printLog('\r#DOM','HPRD Domain output (%s): %s domains.' % (dfile,rje.integerString(domtot)))

         ### ~ Domain PPI Dataset Outputs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         datpath = self.info['OutDir'] + rje.makePath('HPRD_Domain_Datasets/')
         rje.mkDir(self,datpath)
         for domain in rje.sortKeys(self.dict['Domains']):
             ## Collect the unique interactors of all domain-containing proteins ##
             partners = set()    # A set removes duplicates without repeated list scans
             for p1 in self.dict['Domains'][domain]:
                 if p1 in self.dict['PPI']: partners.update(self.dict['PPI'][p1])
             ## Generate Sequence list and output ##
             mylist = []
             for p in sorted(partners):
                 # With AllIso the 'Seq' entry is added element by element, otherwise as a single item
                 if self.opt['AllIso']: mylist += self.dict['HPRD'][p]['Seq']
                 else: mylist.append(self.dict['HPRD'][p]['Seq'])
             fasfile = '%s%s_hprd.fas' % (datpath,domain)
             if mylist: self.obj['SeqList'].saveFasta(seqs=mylist,seqfile=fasfile)
             else: self.log.printLog('#DOM','No PPI partners for domain "%s"' % domain)
         self.log.printLog('\r#DOM','HPRD Domain fasta output complete.')
     except:
         self.log.errorLog('Error in HPRD.domainFasta()',printerror=True,quitchoice=False)
         raise
Esempio n. 21
0
 def makePPIDatasets(self):  ### Generate PPI datasets from pairwise data
     '''
     Generate PPI datasets from pairwise data. Saves YeastPPI/<hub>.fas for each hub that has at least three spokes
     and at least three spokes with an entry in the SeqDict dictionary. Errors are logged and re-raised.
     '''
     try:
         ### ~ [1] Setup ~ ###
         rje.mkDir(self,'YeastPPI/')
         seqdict = self.dict['SeqDict']
         ppi = self.dict['PPI']
         htot = len(ppi)
         fx = 0
         ### ~ [2] One fasta file per qualifying hub ~ ###
         for (hi,hub) in enumerate(rje.sortKeys(ppi)):
             # Progress value is 100 x hubs already processed / total hubs
             progress = 100.0 * hi / htot
             self.progLog('\r#FAS','Generating %s PPI fasta files: %.2f' % (rje.integerString(fx),progress))
             spokes = ppi[hub]
             if len(spokes) < 3:
                 continue
             seqs = [seqdict[spoke] for spoke in spokes if spoke in seqdict]
             if len(seqs) < 3:
                 continue
             hubfile = rje.makePath('YeastPPI/%s.fas' % hub,wholepath=True)
             self.obj['SeqList'].saveFasta(seqs,hubfile,log=False)
             fx += 1
         self.printLog('\r#FAS','Generation of %s PPI fasta files from %s hubs complete.' % (rje.integerString(fx),rje.integerString(htot)))
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise   # Delete this if method error not terrible
Esempio n. 22
0
 def run(self):  ### Performs main run method, including both setup and UniFake
     '''
     Performs main run method, including both setup and UniFake.
     Calls self.setup() and then self.uniFake(). If self.opt['MakeIndex'] is set, self.info['UniPath'] is set to
     the directory part of self.info['DatOut'] and rje_uniprot.processUniProt() is called with makeindex=True
     (makespec and makefas off). self.stat['Interactive'] is set to -1 for the duration of that call and is
     restored afterwards, even if indexing raises. Nothing is returned.
     '''
     ### ~ [1] ~ Setup aliases and features dictionaries ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     self.setup()
     ### ~ [2] ~ Perform main UniFake file generation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     self.uniFake()
     ### ~ [3] ~ Index files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     if not self.opt['MakeIndex']: return
     interactive = self.stat['Interactive']
     self.stat['Interactive'] = -1
     try:
         self.info['UniPath'] = rje.makePath(os.path.split(self.info['DatOut'])[0])
         rje_uniprot.processUniProt(self,makeindex=True,makespec=False,makefas=False)
     finally:
         # Always put the caller's interactivity level back; without this an indexing error left it at -1
         self.stat['Interactive'] = interactive
Esempio n. 23
0
    def complexFasta(self):     ### Outputs parsed complex datasets in Fasta format
        '''
        Outputs parsed complex datasets in Fasta format.
        For every key of self.dict['Complex'] that yields at least one sequence, the member sequences are saved to
        <OutDir>HPRD_Complexes/<complex>_hprd.fas via the SeqList object. Errors are logged and then re-raised.
        '''
        try:### ~ Setup output directory ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            datpath = self.info['OutDir'] + rje.makePath('HPRD_Complexes/')
            rje.mkDir(self,datpath)
            ### ~ Output one dataset per complex ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for cid in rje.sortKeys(self.dict['Complex']):
                cseqs = []
                for member in self.dict['Complex'][cid]:
                    if not self.opt['AllIso']: cseqs.append(self.dict['HPRD'][member]['Seq'])
                    else: cseqs += self.dict['HPRD'][member]['Seq']    # AllIso: 'Seq' is added element by element
                sfile = '%s%s_hprd.fas' % (datpath,cid)
                if not cseqs: continue      # Nothing to save for this complex
                self.obj['SeqList'].saveFasta(seqs=cseqs,seqfile=sfile)
            self.log.printLog('#FAS','HPRD complex fasta output complete.')
        except:
            self.log.errorLog('Error in HPRD.complexFasta()',printerror=True,quitchoice=False)
            raise
Esempio n. 24
0
 def _setAttributes(self):   ### Sets Attributes of Object
     '''
     Sets Attributes of Object.
     Declares the attribute names of each type, applies the generic defaults via self._setDefaults() and then
     overrides the defaults for the REST output directory/URL, the boolean switches and the refresh settings.
     '''
     ### ~ Attribute names by type ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     self.strlist = [
         'Password','RestIn','Rest','RestBase','RestOutDir','RestURL'
     ]
     self.boollist = ['PureAPI','RestOut']
     self.intlist = ['MaxRefresh','Refresh']
     self.numlist = []
     self.filelist = []
     self.listlist = ['RestKeys']
     self.dictlist = ['Output','Outfile']
     self.objlist = []
     ### ~ Generic defaults, then specific overrides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     self._setDefaults(
         str='None',bool=False,int=0,num=0.0,obj=None,
         setlist=True,setdict=True,setfile=True
     )
     self.setStr({
         'RestOutDir':rje.makePath('./'),
         'RestURL':'http://rest.slimsuite.unsw.edu.au/'
     })
     self.setBool({'PureAPI':False,'RestOut':False})
     self.setInt({'MaxRefresh':600,'Refresh':5})
     self.setNum({})
     ### ~ Forking attributes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     self._setForkAttributes()   # Delete if no forking
Esempio n. 25
0
 def gopher(self):  ### Sets up data for GOPHER run
     '''Sets up data for GOPHER run.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         rje.mkDir(self,'BLAST/')
         rje_blast.BLASTRun(self.log,self.cmd_list).formatDB(fasfile='%s.ygob.fas' % self.info['Basefile'],protein=True,force=False)
         rje_blast.BLASTRun(self.log,self.cmd_list).formatDB(fasfile='%s.yeast.fas' % self.info['Basefile'],protein=True,force=False)
         seqdict = self.obj['SeqList'].seqNameDic('AccNum')
         ymap = self.dict['PillarMap'] = {}
         ### ~ [2] Convert Pillars to BLAST IDs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         (px,ptot) = (0.0,len(self.list['Pillars'])); ox = 0
         for pillar in self.list['Pillars']:
             self.progLog('\r#YGOB','Converting YGOB Pillars for GOPHER: %.2f%%' % (px/ptot)); px += 100
             newpillar = []
             for yid in pillar:
                 seq = rje_sequence.Sequence(self.log,self.cmd_list)
                 seq.opt['Yeast'] = True
                 #self.deBug(yid)
                 seq.info['Name'] = yid
                 seq.extractDetails(gnspacc=True)
                 #self.deBug(seq.info)
                 ygob = seq.info['AccNum']
                 if ygob in self.dict['Rename']: acc = self.dict['Rename'][ygob]
                 else: acc = ygob
                 ymap[yid] = acc
                 if acc not in seqdict: self.printLog('\r#GENE','Non-coding gene %s (%s)? Cannot find in fasta file' % (acc,yid)); continue
                 try:
                     newpillar.append(seqdict[acc].shortName())
                 except:
                     print yid, ygob, acc
                     self.errorLog(rje_zen.Zen().wisdom())
             if not newpillar: continue
             for ygob in pillar:
                 acc = ymap[ygob]
                 if acc not in seqdict: continue
                 if acc in self.list['YeastSeq'] or (not self.list['YeastSeq'] and seqdict[acc].info['SpecCode'] == 'YEAST'):
                     open(rje.makePath('BLAST/%s.blast.id' % acc,wholepath=True),'w').write(string.join(newpillar,'\n'))
                     ox += 1
         self.progLog('\r#YGOB','Converted YGOB Pillars for GOPHER: %s BLAST ID files.' % rje.iStr(ox))
     except: self.errorLog(rje_zen.Zen().wisdom()); raise   # Delete this if method error not terrible
Esempio n. 26
0
    def saveFasta(self):    ### Outputs parsed PPI datasets in Fasta format
        '''Outputs parsed PPI datasets in Fasta format.'''
        try:
            ### Setup ###
            datpath = self.info['OutDir'] + rje.makePath('HPRD_Datasets/')
            rje.mkDir(self,datpath)
            ## Check Seqs ##
            for p1 in rje.sortKeys(self.dict['PPI']):
                if 'Seq' not in self.dict['HPRD'][p1]:      #!# KeyError #!#
                    print p1, self.dict['HPRD'][p1]
                    self.deBug('No Seq for %s' % p1)

            ### All sequences ###
            self.obj['SeqList'].saveFasta()
            ### Output PPI Datasets ###
            for p1 in rje.sortKeys(self.dict['PPI']):
                mylist = []
                for p2 in self.dict['PPI'][p1]:
                    if self.opt['AllIso']: mylist += self.dict['HPRD'][p2]['Seq']
                    else: mylist.append(self.dict['HPRD'][p2]['Seq'])
                sfile = '%s%s_hprd.fas' % (datpath,self.dict['HPRD'][p1]['gene'])
                if mylist: self.obj['SeqList'].saveFasta(seqs=mylist,seqfile=sfile)
            self.log.printLog('#FAS','HPRD PPI fasta output complete.')
        except: self.log.errorLog('Error in HPRD.saveFasta()',printerror=True,quitchoice=False)
Esempio n. 27
0
 def _setAttributes(self):  ### Sets Attributes of Object
     '''
     Sets Attributes of Object.
     Declares the info/opt/stat/obj attribute names, applies the generic defaults via self._setDefaults(), sets
     the basefile to 'MultiHAQ' and then overrides the Chaser/AutoSkip options and the HAQBLAST directory.
     '''
     ### ~ Attribute names by type ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     self.infolist = ['HaqDir','HAQBLASTDir']
     self.optlist = ['AddQueries','AutoSkip','Chaser','HAQESAC','MultiHAQ','ScreenQry']
     self.statlist = ['BlastCut','MultiCut']
     self.listlist = []
     self.dictlist = []
     self.objlist = ['SeqList']
     ### ~ Generic defaults, then specific overrides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     self._setDefaults(info='None',opt=True,stat=0.0,obj=None,setlist=True,setdict=True)
     self.basefile('MultiHAQ')
     # Options default to True above, so the two that must start switched off are reset here
     optdefaults = {'Chaser':False,'AutoSkip':False}
     self.setOpt(optdefaults)
     strdefaults = {'HAQBLASTDir':rje.makePath('./HAQBLAST/')}
     self.setStr(strdefaults)
     ### ~ Forking attributes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     self._setForkAttributes()  # Delete if no forking
Esempio n. 28
0
    def farmHAQ(self):  ### Uses SLiMFarmer to farm out the HAQESAC runs
        '''
        Uses SLiMFarmer to farm out the HAQESAC runs.
        Runs the batch file <HaqDir>haqesac.bat through slimfarmer.SLiMFarmer from inside the HaqDir directory:
        two rounds when self.opt['MultiHAQ'] is set, otherwise a single complete run. The working directory is
        changed back to self.info['RunPath'] after each round.
        << Returns True when all rounds have completed. Any exception (including the IOError raised for a missing
           batch file) is passed to self.errorLog() after changing back to RunPath and is not re-raised here.
        '''
        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            batfile = os.path.abspath(rje.makePath('%shaqesac.bat' % self.info['HaqDir'],wholepath=True))
            self.printLog('#FARM',batfile)
            if not rje.exists(batfile): raise IOError('Cannot find %s' % batfile)
            # Absolute paths are used because the farming itself runs after a change of directory
            farmcmd = ['subjobs=%s' % batfile,'farm=batch','qsub=F','i=-1',
                       'runpath=%s' % os.path.abspath(self.info['HaqDir'])]
            if self.opt['MultiHAQ']: haqfarm = ['First round','Second round']
            else: haqfarm = ['Complete run']

            ### ~ [1] Perform HAQESAC runs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for farmrun in haqfarm:
                self.printLog('#CHDIR','Changing directory for %s farming: %s' % (farmrun,self.info['HaqDir']))
                os.chdir(self.info['HaqDir'])
                farmer = slimfarmer.SLiMFarmer(self.log,self.cmd_list+farmcmd)
                farmer.slimFarm()
                os.chdir(self.info['RunPath'])
                self.printLog('#CHDIR','Changed directory post-farming: %s' % self.info['RunPath'])
                self.printLog('#FARM','HAQESAC %s farming complete.' % farmrun)
            #!# Add identifying and skipping of partial runs.
            # NOTE: an unreachable per-sequence loop used to follow the return below; it referenced names that are
            # never defined in this method (finalrun, processed, secondrun) and has been removed.
            return True
        except:
            # Make sure the process is back in the run directory before reporting the problem
            os.chdir(self.info['RunPath'])
            self.errorLog('Major problem with MultiHAQ.farmHAQ',quitchoice=True)
Esempio n. 29
0
 def multiHAQ(self, secondrun=False):  ### Executes main HAQESAC runs
     '''
     Executes main HAQESAC runs.
     >> secondrun:bool [False] = Whether this call is the second pass over the queries. A pass is treated as the
        final (manual) HAQESAC phase when secondrun equals self.opt['MultiHAQ']; a non-final pass ends by calling
        self.multiHAQ(True).
     For each sequence from self.seqs() the method decides from the files in self.info['HaqDir'] whether to skip
     it, optionally waits for a pickle in "Chaser" mode, runs HAQESAC from inside HaqDir and, on the final pass
     with ScreenQry set, records which other queries were covered by the input file.
     << Nothing is returned. Exceptions are passed to self.errorLog() and are not re-raised here.
     '''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         finalrun = secondrun == self.opt['MultiHAQ']    # Whether this is the manual HAQESAC phase
         qryacc = self.obj['SeqList'].accList()          # Full list of Query accession numbers
         processed = []                                  # List of processed sequence accession numbers
         ### ~ [1] Perform HAQESAC runs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for seq in self.seqs():
             ## ~ [1a] Check AutoSkip ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             acc = seq.info['AccNum']
             if finalrun and acc in processed and (self.opt['AutoSkip'] or (self.i() >= 0 and rje.yesNo('%s already covered by previous HAQESAC. Skip?' % seq.shortName()))):
                 self.printLog('#SKIP','%s already covered by previous HAQESAC: Skipped' % seq.shortName())
                 continue
             ## ~ [1b] Check Whether to run (re-runs and low sequence number) ~~~~~~~~~~~~~~~~~~ ##
             logfile = rje.makePath('%s%s.log' % (self.info['HaqDir'],acc),wholepath=True)   # Only used by the commented-out check in [1d]
             infile = rje.makePath('%s%s.fas' % (self.info['HaqDir'],acc),wholepath=True)
             pkfile = rje.makePath('%s%s.pickle' % (self.info['HaqDir'],acc),wholepath=True)
             pkzfile = rje.makePath('%s%s.pickle.gz' % (self.info['HaqDir'],acc),wholepath=True)
             if not os.path.exists(infile):
                 self.printLog('#SKIP','%s input file %s not found: Skipped' % (seq.shortName(),infile))
                 continue
             # On a non-final pass, a (gzipped) pickle reported by rje.isYounger() counts as an existing run
             if not finalrun and not self.opt['Force'] and rje.isYounger(pkzfile,infile) == pkzfile:
                 self.printLog('#SKIP','%s run detected: Skipped' % seq.shortName())
                 continue
             if not finalrun and not self.opt['Force'] and rje.isYounger(pkfile,infile) == pkfile:
                 self.printLog('#SKIP','%s run detected: Skipped' % seq.shortName())
                 continue
             inseqx = rje_seq.SeqCount(self,infile)
             if inseqx < 2:
                 self.printLog('#SKIP','Only one sequence found in %s: Skipped' % (infile))
                 continue
             ## ~ [1c] Pause if running in Chaser Mode and no Pickle ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             pickled = os.path.exists(pkfile) or os.path.exists('%s.gz' % pkfile)
             tm = 0
             while secondrun and self.opt['Chaser'] and not pickled:
                 self.progLog('#WAIT','No %s pickle. Sleeping for %d min.' % (acc,tm))
                 time.sleep(60*tm)   # Wait grows by one minute per attempt (no wait on the first attempt)
                 tm += 1
                 pickled = os.path.exists(pkfile) or os.path.exists('%s.gz' % pkfile)
                 if not pickled:
                     try: rje.choice('Press <ENTER> to try again, or <CTRL+C> to Quit')
                     except:
                         # Only acc is formatted here: passing (acc,tm) to the single %s raised a TypeError,
                         # which turned this clean exit into a "Major problem" error.
                         self.printLog('#PICKLE','No %s pickle.' % acc)
                         self.printLog('\r#MULTI','Exiting multiHAQ "Chaser" run.')
                         return
             ## ~ [1d] Run HAQESAC ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             runhaqesac = True
             pngfile = rje.makePath('%s%s.png' % (self.info['HaqDir'],acc),wholepath=True)
             if not self.force() and rje.exists(pngfile):
                 self.printLog('#SKIP','Found evidence of completed run: %s (force=F). Skipping.' % pngfile)
                 runhaqesac = False
             ancfile = rje.makePath('%s%s.anc.fas' % (self.info['HaqDir'],acc),wholepath=True)
             if not self.force() and rje.exists(ancfile):
                 self.printLog('#SKIP','Found evidence of completed run: %s (force=F). Skipping.' % ancfile)
                 runhaqesac = False
             #if not finalrun or self.opt['Force'] or rje.isYounger(logfile,nsfile) != logfile:
             if runhaqesac:
                 haqcmd = ['ini=haqesac.ini','seqin=%s.fas' % acc,'query=%s' % acc,'basefile=%s' % acc,'newlog=F']
                 self.printLog('#HAQ','Running HAQESAC for %s - will have own log etc.' % seq.shortName(),log=False)
                 os.chdir(self.info['HaqDir'])   # HAQESAC is run from inside HaqDir (file names in haqcmd are relative)
                 info = haqesac.makeInfo()
                 haqcmd = rje.getCmdList(haqcmd,info=info)
                 out = rje.Out(cmd_list=haqcmd)          # Sets up Out object for controlling output to screen
                 out.printIntro(info)                    # Prints intro text using details from Info object
                 haqlog = rje.setLog(info,out,haqcmd)    # Sets up Log object for controlling log file output
                 try: haqesac.HAQESAC(log=haqlog,cmd_list=haqcmd).run(setobjects=True)
                 except:
                     # A failed run only aborts the whole method if the user is asked and agrees
                     os.chdir(self.info['RunPath'])
                     if self.i() >= 0 and rje.yesNo('Problem with %s HAQESAC run. Abort?' % seq.shortName()):
                         raise KeyboardInterrupt
                 os.chdir(self.info['RunPath'])
                 if finalrun: self.printLog('#HAQ','HAQESAC final round run for %s' % seq.shortName())
                 else: self.printLog('#HAQ','HAQESAC first round run for %s' % seq.shortName())
             ## ~ [1e] Update ScreenQry ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             if not self.opt['ScreenQry'] or not finalrun: continue
             qacclist = []
             for qacc in rje_seq.SeqList(self.log,['seqin=%s' % infile,'autoload=T','autofilter=F']).accList():
                 if qacc in qryacc and qacc != acc: qacclist.append(qacc)
                 if qacc in qryacc and qacc not in processed: processed.append(qacc)
             self.printLog('#QRY','%d other queries found in %s: [%s]' % (len(qacclist),infile,string.join(qacclist,'; ')))
             self.printLog('#QRY','%d of %d queries processed' % (len(processed),self.seqNum()))
         ### ~ [2] MultiHAQ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not finalrun:
             self.printLog('#MULTI','Executing second round of multiHAQ')
             self.multiHAQ(True)
     except:
         self.errorLog('Major problem with MultiHAQ.multiHAQ',quitchoice=True)
Esempio n. 30
0
    def domainFasta(self):  ### Outputs parsed domain and domain PPI datasets in Fasta format
        '''
        Outputs parsed domain data as delimited text and domain PPI datasets in Fasta format.
        1. Passes one Domain/HPRD/Gene row per domain-protein pair to rje.delimitedFileOutput() for
           <OutDir>HPRD.domains.tdt and one Domain/Source row per domain source for <OutDir>HPRD.domsource.tdt.
        2. For each domain, collects the PPI partners of all domain-containing proteins and saves their sequences
           to <OutDir>HPRD_Domain_Datasets/<domain>_hprd.fas. Domains without partners are reported in the log.
        Errors are logged and then re-raised.
        '''
        try:
            ### ~ Tab delimited domain-HPRD pairs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            headers = ['Domain','HPRD','Gene']
            dfile = self.info['OutDir'] + 'HPRD.domains.tdt'
            rje.delimitedFileOutput(self,dfile,headers,'\t')    # No data given: presumably starts the file with its header row
            sfile = self.info['OutDir'] + 'HPRD.domsource.tdt'
            shead = ['Domain','Source']
            rje.delimitedFileOutput(self,sfile,shead,'\t')
            dx = 0.0
            dtot = len(self.dict['Domains'])
            for domain in rje.sortKeys(self.dict['Domains']):
                self.log.printLog('\r#DOM','HPRD Domain output (%s): %.1f%%' % (dfile,dx/dtot),newline=False,log=False)
                dx += 100.0
                for hid in self.dict['Domains'][domain]:
                    datadict = {'Domain':domain,'HPRD':hid,'Gene':self.dict['HPRD'][hid]['gene']}
                    rje.delimitedFileOutput(self,dfile,headers,'\t',datadict)
                for source in self.dict['DomainSource'][domain]:
                    datadict = {'Domain':domain,'Source':source}
                    rje.delimitedFileOutput(self,sfile,shead,'\t',datadict)
            self.log.printLog('\r#DOM','HPRD Domain output (%s): %s domains.' % (dfile,rje.integerString(dtot)))

            ### ~ Domain PPI Dataset Outputs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            datpath = self.info['OutDir'] + rje.makePath('HPRD_Domain_Datasets/')
            rje.mkDir(self,datpath)
            for domain in rje.sortKeys(self.dict['Domains']):
                ## Generate a list of all interactors with domain-containing proteins ##
                partners = set()    # Set membership replaces the quadratic "not in list" de-duplication
                for p1 in self.dict['Domains'][domain]:
                    if p1 in self.dict['PPI']: partners.update(self.dict['PPI'][p1])
                plist = sorted(partners)
                ## Generate Sequence list and output ##
                mylist = []
                for p in plist:
                    if self.opt['AllIso']: mylist += self.dict['HPRD'][p]['Seq']   # AllIso: 'Seq' is added element by element
                    else: mylist.append(self.dict['HPRD'][p]['Seq'])
                sfile = '%s%s_hprd.fas' % (datpath,domain)
                if mylist: self.obj['SeqList'].saveFasta(seqs=mylist,seqfile=sfile)
                else: self.log.printLog('#DOM','No PPI partners for domain "%s"' % domain)
            self.log.printLog('\r#DOM','HPRD Domain fasta output complete.')
        except:
            # Message names this method (it previously reported HPRD.saveFasta(), which was misleading)
            self.log.errorLog('Error in HPRD.domainFasta()',printerror=True,quitchoice=False)
            raise
Esempio n. 31
0
 def gopher(self):  ### Sets up data for GOPHER run
     '''Sets up data for GOPHER run.'''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         rje.mkDir(self, 'BLAST/')
         rje_blast.BLASTRun(self.log, self.cmd_list).formatDB(
             fasfile='%s.ygob.fas' % self.info['Basefile'],
             protein=True,
             force=False)
         rje_blast.BLASTRun(self.log, self.cmd_list).formatDB(
             fasfile='%s.yeast.fas' % self.info['Basefile'],
             protein=True,
             force=False)
         seqdict = self.obj['SeqList'].seqNameDic('AccNum')
         ymap = self.dict['PillarMap'] = {}
         ### ~ [2] Convert Pillars to BLAST IDs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         (px, ptot) = (0.0, len(self.list['Pillars']))
         ox = 0
         for pillar in self.list['Pillars']:
             self.progLog(
                 '\r#YGOB',
                 'Converting YGOB Pillars for GOPHER: %.2f%%' % (px / ptot))
             px += 100
             newpillar = []
             for yid in pillar:
                 seq = rje_sequence.Sequence(self.log, self.cmd_list)
                 seq.opt['Yeast'] = True
                 #self.deBug(yid)
                 seq.info['Name'] = yid
                 seq.extractDetails(gnspacc=True)
                 #self.deBug(seq.info)
                 ygob = seq.info['AccNum']
                 if ygob in self.dict['Rename']:
                     acc = self.dict['Rename'][ygob]
                 else:
                     acc = ygob
                 ymap[yid] = acc
                 if acc not in seqdict:
                     self.printLog(
                         '\r#GENE',
                         'Non-coding gene %s (%s)? Cannot find in fasta file'
                         % (acc, yid))
                     continue
                 try:
                     newpillar.append(seqdict[acc].shortName())
                 except:
                     print yid, ygob, acc
                     self.errorLog(rje_zen.Zen().wisdom())
             if not newpillar: continue
             for ygob in pillar:
                 acc = ymap[ygob]
                 if acc not in seqdict: continue
                 if acc in self.list['YeastSeq'] or (
                         not self.list['YeastSeq']
                         and seqdict[acc].info['SpecCode'] == 'YEAST'):
                     open(
                         rje.makePath('BLAST/%s.blast.id' % acc,
                                      wholepath=True),
                         'w').write(string.join(newpillar, '\n'))
                     ox += 1
         self.progLog(
             '\r#YGOB',
             'Converted YGOB Pillars for GOPHER: %s BLAST ID files.' %
             rje.iStr(ox))
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise  # Delete this if method error not terrible
Esempio n. 32
0
    def farmHAQ(self):  ### Uses SLiMFarmer to farm out the HAQESAC runs
        '''Uses SLiMFarmer to farm out the HAQESAC runs.'''
        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            batfile = os.path.abspath(rje.makePath('%shaqesac.bat' % self.info['HaqDir'],wholepath=True))
            self.printLog('#FARM',batfile)
            if not rje.exists(batfile): raise IOError('Cannot find %s' % batfile)
            farmcmd = ['subjobs=%s' % batfile,'farm=batch','qsub=F','i=-1','runpath=%s' % os.path.abspath(self.info['HaqDir'])]
            if self.opt['MultiHAQ']:
                haqfarm = ['First round','Second round']
            else: haqfarm = ['Complete run']

            ### ~ [1] Peform HAQESAC runs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for farmrun in haqfarm:
                self.printLog('#CHDIR','Changing directory for %s farming: %s' % (farmrun,self.info['HaqDir']))
                os.chdir(self.info['HaqDir'])
                farmer = slimfarmer.SLiMFarmer(self.log,self.cmd_list+farmcmd)
                farmer.slimFarm()
                os.chdir(self.info['RunPath'])
                self.printLog('#CHDIR','Changed directory post-farming: %s' % self.info['RunPath'])
                self.printLog('#FARM','HAQESAC %s farming complete.' % farmrun)
            return True

            #!# Add identifying and skipping of partial runs.

            for seq in self.seqs():
                ## ~ [1a] Check AutoSkip ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                acc = seq.info['AccNum']
                if finalrun and acc in processed and (self.opt['AutoSkip'] or (self.i() >=0 and rje.yesNo('%s already covered by previous HAQESAC. Skip?' % seq.shortName()))):
                    self.printLog('#SKIP','%s already covered by previous HAQESAC: Skipped' % seq.shortName()); continue
                ## ~ [1b] Check Whether to run (re-runs and low sequence number) ~~~~~~~~~~~~~~~~~~ ##
                logfile = rje.makePath('%s%s.log' % (self.info['HaqDir'],acc),wholepath=True)
                infile = rje.makePath('%s%s.fas' % (self.info['HaqDir'],acc),wholepath=True)
                pkfile = rje.makePath('%s%s.pickle' % (self.info['HaqDir'],acc),wholepath=True)
                pkzfile = rje.makePath('%s%s.pickle.gz' % (self.info['HaqDir'],acc),wholepath=True)
                if not os.path.exists(infile): self.printLog('#SKIP','%s input file %s not found: Skipped' % (seq.shortName(),infile)); continue
                if not finalrun and not self.opt['Force'] and rje.isYounger(pkzfile,infile) == pkzfile:
                    self.printLog('#SKIP','%s run detected: Skipped' % seq.shortName()); continue
                if not finalrun and not self.opt['Force'] and rje.isYounger(pkfile,infile) == pkfile:
                    self.printLog('#SKIP','%s run detected: Skipped' % seq.shortName()); continue
                inseqx = rje_seq.SeqCount(self,infile)
                if inseqx < 2: self.printLog('#SKIP','Only one sequence found in %s: Skipped' % (infile)); continue
                ## ~ [1c] Pause if running in Chaser Mode and no Pickle ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                pickled = os.path.exists(pkfile) or os.path.exists('%s.gz' % pkfile); tm = 0
                while secondrun and self.opt['Chaser'] and not pickled:
                    self.progLog('#WAIT','No %s pickle. Sleeping for %d min.' % (acc,tm))
                    time.sleep(60*tm); tm += 1
                    pickled = os.path.exists(pkfile) or os.path.exists('%s.gz' % pkfile)
                    if not pickled:
                        try: rje.choice('Press <ENTER> to try again, or <CTRL+C> to Quit')
                        except:
                            self.printLog('#PICKLE','No %s pickle.' % (acc,tm))
                            self.printLog('\r#MULTI','Exiting multiHAQ "Chaser" run.'); return
                ## ~ [1d] Run HAQESAC ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                runhaqesac = True
                pngfile = rje.makePath('%s%s.png' % (self.info['HaqDir'],acc),wholepath=True)
                if not self.force() and rje.exists(pngfile):
                    self.printLog('#SKIP','Found evidence of completed run: %s (force=F). Skipping.' % pngfile)
                    runhaqesac = False
                ancfile = rje.makePath('%s%s.anc.fas' % (self.info['HaqDir'],acc),wholepath=True)
                if not self.force() and rje.exists(ancfile):
                    self.printLog('#SKIP','Found evidence of completed run: %s (force=F). Skipping.' % ancfile)
                    runhaqesac = False

        except:
            os.chdir(self.info['RunPath'])
            self.errorLog('Major problem with MultiHAQ.farmHAQ',quitchoice=True)
Esempio n. 33
0
    def mapPhosByBLAST(
        self, fasfile
    ):  ### BLAST sequences against phosphoDB, align hits & mark sites (ID & Homology)
        '''
        BLAST sequences against phosphoDB, align hits and mark phosphosites (ID & Homology).

        fasfile = path of the fasta file of query sequences. It is loaded into a SeqList (for name lookup) and is
        also given to the BLAST run as its 'InFile'.

        NOTE(review): this copy of the method stops inside the per-hit loop. The opening `try` has no matching
        `except` here and nothing after the hit filtering is present, so only the setup and hit selection
        stages are documented.
        '''
        try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [1a] Setup fasfile ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Load the query sequences without filtering
            scmd = self.cmd_list + [
                'seqin=%s' % fasfile, 'autoload=T', 'autofilter=F'
            ]
            qseqlist = rje_seq.SeqList(self.log, scmd)
            # Name -> sequence lookup; indexed below with each BLAST search name
            qdict = qseqlist.seqNameDic()
            ## ~ [1b] Setup results files/directories ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            basefile = rje.baseFile(fasfile)
            # Default the results table name to <basefile>.phosres.tdt when none was set
            if self.info['PhosRes'].lower() in ['', 'none']:
                self.info['PhosRes'] = '%s.phosres.tdt' % basefile
            headers = ['Name', 'Pos', 'AA', 'PELM', 'PELMPos', 'Evidence']
            # Delimiter: taken from the commandline, else derived from the results file extension
            delimit = rje.getDelimit(
                self.cmd_list,
                rje.delimitFromExt(filename=self.info['PhosRes']))
            # Called with headers only (no data): presumably (re)starts the file with its header row - confirm in rje
            rje.delimitedFileOutput(self,
                                    self.info['PhosRes'],
                                    headers,
                                    delimit,
                                    rje_backup=True)
            # Directory 'PhosALN' (relative path) is created here; ppath is not used again in this snippet
            ppath = rje.makePath('PhosALN')
            rje.mkDir(self, ppath)
            ## ~ [1c] Setup BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            pblast = rje_blast.BLASTRun(self.log,
                                        self.cmd_list + ['formatdb=F'])
            # Queries (fasfile) are searched against the database named in self.info['PELMFas']
            pblast.setInfo({
                'Name': '%s.p.blast' % rje.baseFile(fasfile),
                'DBase': self.info['PELMFas'],
                'InFile': fasfile
            })
            # HitAln is set to the same value as the existing OneLine stat
            pblast.setStat({'HitAln': pblast.stat['OneLine']})
            pblast.opt['Complexity Filter'] = False
            pblast.formatDB(force=False)
            ## ~ [1d] Setup GABLAM Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            gkey = 'GABLAMO ID'  #x# % self.info['GABLAMO Key']
            # IDSim/HomSim below 1.0 are multiplied by 100 (presumably given as fractions; they are compared with
            # the percentage qvh below). Negative values are clamped to 0.0.
            for g in ['ID', 'Hom']:
                if self.stat['%sSim' % g] < 1.0:
                    self.stat['%sSim' % g] *= 100.0
                self.stat['%sSim' % g] = max(0.0, self.stat['%sSim' % g])

            ### ~ [2] PhosphoBLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            pblast.blast(use_existing=True, log=True)  # BLAST
            pblast.readBLAST(gablam=True)  # Read in
            # Searches are popped off the front of the list as they are processed, so the list shrinks to empty
            while pblast.search:
                ## ~ [2a] Align relevant hits from each BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                search = pblast.search.pop(0)
                qseq = qdict[search.info['Name']]
                idlist = []  # Sequences accepted at the IDSim level (plus the query itself on a self-hit)
                qlen = qseq.aaLen()
                hitdict = search.hitSeq(self.obj['SeqList'])  # Maps each hit object to its sequence object
                # Empty SeqList to hold the query plus every hit that passes the HomSim cutoff
                aln = rje_seq.SeqList(
                    self.log, self.cmd_list + ['autoload=F', 'autofilter=F'])
                aln.seq = [qseq]
                pdict = {}  # Dictionary of {hseq:[poslist]}
                rdict = {qseq: 0}  # Dictionary of {hseq:res}
                for hit in search.hit[0:]:
                    hseq = hitdict[hit]
                    # Copy the sorted keys of this hit's PhosphoSites entry (looked up by AccNum) into pdict
                    pdict[hseq] = []
                    for pos in rje.sortKeys(
                            self.dict['PhosphoSites'][hseq.info['AccNum']]):
                        pdict[hseq].append(pos)
                    # Self-hit (hit name == query name): log any sequence mismatch, then move the site list onto
                    # qseq itself; the hit sequence is not added to aln.seq
                    if hit.info['Name'] == search.info['Name']:
                        if qseq.getSequence(case=False,
                                            gaps=False) != hseq.getSequence(
                                                case=False, gaps=False):
                            self.log.errorLog(
                                'Major problem: Search/Hit sequence mismatch for same sequence "%s"'
                                % hit.info['Name'])
                        idlist.append(qseq)
                        pdict[qseq] = pdict.pop(hseq)
                        continue
                    # qvh = 100 * (query-side gkey statistic) / query length
                    gdict = hit.globalFromLocal(qlen)
                    qvh = float(100 * gdict['Query'][gkey]) / float(qlen)
                    # Below the HomSim cutoff: drop the hit and its site list
                    if qvh < self.stat['HomSim']:
                        pdict.pop(hseq)
                        continue
                    aln.seq.append(hseq)
                    # IDSim level also needs the same species as the query, unless UseSpec is off
                    if (qseq.sameSpec(hseq) or not self.opt['UseSpec']
                        ) and qvh >= self.stat['IDSim']:
                        idlist.append(hseq)
                    rdict[hseq] = 0
Esempio n. 34
0
    def makeHTML(self):  ### Generates HTML pages for interactive navigation.
        '''
        Generates HTML pages for interactive navigation.

        Front page (<basefile>.html) gets up to three tabs:
        - 'Hits': target sequences (GABLAM 'Hit' keys if the GABLAM table loaded, else every sequence of SeqIn),
          linked to the per-hit page when <basefile>.HAQESAC/<acc>.log exists.
        - 'GABLAM': the rows of <basefile>.gablam.tdt with query (UniProt) and hit (per-hit page) links.
        - 'Candidates': the candidate sequences with UniProt links, if any were loaded.
        One page per hit is then written to <basefile>.HAQESAC/<acc>.html, with one tab for each of the png,
        tree.txt, fas, nwk and log outputs that exists for that hit.

        Returns None. Any exception is reported through self.errorLog() and not re-raised.
        '''
        try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            basefile = self.basefile()
            # All files and directories are named after basefile:
            seqopt = ['autoload=T', 'autofilter=F', 'seqmode=file']
            candseq = rje_seqlist.SeqList(self.log, self.cmd_list + ['seqin=%s' % self.getStr('Candidates')] + seqopt)
            # *.fas = original target PROTEIN sequences (with original descriptions)
            seqlist = rje_seqlist.SeqList(self.log, self.cmd_list + ['seqin=%s' % self.getStr('SeqIn')] + seqopt)
            # *.gablam.tdt = GABLAM results with match details. (Might have *.hmmer.tdt instead.)
            # - Contains candidate proteins as Queries and Target proteins as hits
            gablamfile = '%s.gablam.tdt' % basefile
            gdb = self.db().addTable(gablamfile, mainkeys=['Qry', 'Hit'], name='gablam', expect=False)
            # *.HAQESAC/ = directory containing individual HAQESAC runs, named after Hit accnum
            haqdir = rje.makePath('./%s.HAQESAC/' % basefile)
            uniprot = '<a href="http://www.uniprot.org/uniprot/%s" target="_blank">%s</a>'

            ### ~ [2] Generate front page ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            hfile = '%s.html' % basefile
            hobj = self.obj['HTML']
            hobj.list['StyleSheets'] = [
                'http://www.slimsuite.unsw.edu.au/stylesheets/rje_tabber.css',
                'http://www.slimsuite.unsw.edu.au/stylesheets/slimhtml.css'
            ]
            html = hobj.htmlHead(basefile)
            html += '<h1>%s</h1>\n\n' % basefile
            htabs = []  # (tab_id, tab_html_text[, tab_title])
            ## ~ [2a] Target protein list (with links to HAQ HTML) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # NOTE: the 'Descripton' column heading is existing output text and is left unchanged here.
            rows = ['\t'.join(['Name', 'Descripton', 'Length'])]
            seqdict = seqlist.makeSeqNameDic('short')
            if gdb: hitlist = gdb.indexKeys('Hit')
            else: hitlist = rje.sortKeys(seqdict)
            for name in hitlist:
                seq = seqdict[name]
                cseq = [name, seqlist.seqDesc(seq), '%s aa' % seqlist.seqLen(seq)]
                acc = seqlist.seqAcc(seq)
                if os.path.exists('%s%s.log' % (haqdir, acc)):  # Only link hits that have a HAQESAC run
                    cseq[0] = '<a href="%s%s.html">%s</a>' % (haqdir, acc, cseq[0])
                rows.append('\t'.join(cseq))
            ctext = '\n'.join(rows) + '\n'
            htabs.append(('Hits', rje_html.tableToHTML(ctext, '\t', tabid='parse'),
                          'Target sequences hit by candidates.'))
            ## ~ [2b] GABLAM/HMM table (with above links) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if gdb:
                rows = ['\t'.join(gdb.fields())]
                with open(gablamfile, 'r') as gablam:
                    glines = gablam.readlines()[1:]  # Skip header line
                for gline in glines:
                    # Strip the line ending first: otherwise it stays attached to the last field and every
                    # table row is followed by an empty line once the row is re-joined below.
                    gdata = gline.rstrip('\r\n').split('\t')
                    if len(gdata) < 2: continue  # Blank/short line: no Qry and Hit fields to link
                    qacc = gdata[0].split('__')[-1]
                    gdata[0] = uniprot % (qacc, gdata[0])
                    hacc = gdata[1].split('__')[-1]
                    gdata[1] = '<a href="%s%s.html">%s</a>' % (haqdir, hacc, gdata[1])
                    rows.append('\t'.join(gdata))
                ctext = '\n'.join(rows) + '\n'
                htabs.append(('GABLAM', rje_html.tableToHTML(ctext, '\t', tabid='parse'),
                              'GABLAM hit table.'))
            ## ~ [2c] Candidate list (with DB links) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if candseq.seqNum():
                rows = ['\t'.join(['AccNum', 'ID', 'Descripton', 'Length'])]
                accdict = candseq.makeSeqNameDic('accnum')
                for acc in rje.sortKeys(accdict):
                    seq = accdict[acc]
                    cseq = [uniprot % (acc, acc), candseq.seqID(seq), candseq.seqDesc(seq),
                            '%s aa' % candseq.seqLen(seq)]
                    rows.append('\t'.join(cseq))
                ctext = '\n'.join(rows) + '\n'
                htabs.append(('Candidates', rje_html.tableToHTML(ctext, '\t', tabid='parse'),
                              'Candidate sequences to search.'))
            html += hobj.tabberHTML('GABLAM', htabs)
            html += hobj.htmlTail()
            with open(hfile, 'w') as hout:
                hout.write(html)

            ### ~ [3] Generate sequence-specific pages ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            #?# Move this to HAQESAC or MultiHAQ
            treelink = r'(: (\S+)_(\S+)__(\S+) )'  # ': <id>_<species>__<accnum> ' labels in the tree text
            for i, hitname in enumerate(hitlist):
                hit = hitname.split('__')[-1]
                seqbase = '%s%s' % (haqdir, hit)  # Path stem shared by all output files of this hit
                hfile = '%s.html' % seqbase
                html = hobj.htmlHead(seqbase)
                html += '<h1>%s</h1>\n\n' % seqbase
                html += '<p>Click <a href="../%s.html">here</a> to return to results summary. \n' % basefile
                if i:
                    html += 'Previous: <a href="./%s.html"><code>%s</code></a>. \n' % (
                        hitlist[i - 1].split('__')[-1], hitlist[i - 1])
                if i < len(hitlist) - 1:
                    html += 'Next: <a href="./%s.html"><code>%s</code></a>. \n' % (
                        hitlist[i + 1].split('__')[-1], hitlist[i + 1])
                html += '</p>\n'
                htabs = []  # (tab_id, tab_html_text[, tab_title])
                for ftype in ['png', 'tree.txt', 'fas', 'nwk', 'log']:
                    seqfile = '%s.%s' % (seqbase, ftype)
                    if not os.path.exists(seqfile): continue
                    fname = os.path.basename(seqfile)
                    tabtext = '<p><a href="./%s">./%s</a></p>\n' % (fname, fname)
                    if ftype == 'png':
                        tabtext += '<a href="./%s"><img src="%s" width="100%%"></a>\n' % (fname, fname)
                        tabdesc = 'PNG of %s tree.' % seqbase
                    else:
                        with open(seqfile, 'r') as seqin:
                            tabtext += '<pre>%s</pre>\n' % seqin.read()
                        if ftype == 'tree.txt':
                            # First link the sequences that have their own page...
                            for xref in hitlist:
                                reptext = '<a href="./%s.html">%s</a>' % (xref.split('__')[-1], xref)
                                tabtext = tabtext.replace(': %s ' % xref, ': %s ' % reptext)
                            # ...then give every remaining label taxonomy and UniProt links. A replaced label
                            # no longer matches the pattern, so the loop ends when none are left.
                            found = rje.matchExp(treelink, tabtext)
                            while found:
                                (oldtext, sid, spec, spacc) = found
                                newtext = ': %s_<a href="http://www.uniprot.org/taxonomy/?query=%s&sort=score" target="_blank">%s</a>__<a href="http://www.uniprot.org/uniprot/%s" target="_blank">%s</a> ' % (
                                    sid, spec, spec, spacc, spacc)
                                tabtext = tabtext.replace(oldtext, newtext)
                                found = rje.matchExp(treelink, tabtext)
                        tabdesc = '%s output' % seqfile
                    htabs.append((ftype, tabtext, tabdesc))
                if htabs:
                    html += hobj.tabberHTML(os.path.basename(seqbase), htabs)
                else:
                    html += '<p><i>No output found for <code>%s</code>!</i></p>\n' % hit
                html += hobj.htmlTail()
                with open(hfile, 'w') as hout:
                    hout.write(html)
        except:
            self.errorLog('Problem with %s.makeHTML()' % self.prog())
Esempio n. 35
0
    def mapPhosByBLAST(self,fasfile):   ### BLAST sequences against phosphoDB, align hits & mark sites (ID & Homology)
        '''
        BLAST sequences against phosphoDB, align hits and mark phosphosites (ID & Homology).

        fasfile = path of the fasta file of query sequences (loaded for name lookup and used as the BLAST input).

        For each BLAST search, the query and every hit passing the HomSim cutoff are aligned. Each known site of
        a hit (or of the query itself, on a self-hit) that sits on an identical residue in the query is written
        to the self.info['PhosRes'] table and marked on an extra 'PhosAln' sequence via self.addPhos().
        Evidence is 'Self', 'ID' or 'Hom'. The alignment is then saved as PhosALN/<AccNum>.phosaln.fas.

        Returns None. Any exception is reported through self.log.errorLog() and not re-raised.
        '''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [1a] Setup fasfile ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            scmd = self.cmd_list + ['seqin=%s' % fasfile,'autoload=T','autofilter=F']
            qseqlist = rje_seq.SeqList(self.log,scmd)
            qdict = qseqlist.seqNameDic()   # Query lookup by BLAST search name
            ## ~ [1b] Setup results files/directories ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            basefile = rje.baseFile(fasfile)
            if self.info['PhosRes'].lower() in ['','none']: self.info['PhosRes'] = '%s.phosres.tdt' % basefile
            headers = ['Name','Pos','AA','PELM','PELMPos','Evidence']
            delimit = rje.getDelimit(self.cmd_list,rje.delimitFromExt(filename=self.info['PhosRes']))
            rje.delimitedFileOutput(self,self.info['PhosRes'],headers,delimit,rje_backup=True)
            ppath = rje.makePath('PhosALN')
            rje.mkDir(self,ppath)
            ## ~ [1c] Setup BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            pblast = rje_blast.BLASTRun(self.log,self.cmd_list+['formatdb=F'])
            pblast.setInfo({'Name':'%s.p.blast' % rje.baseFile(fasfile),'DBase':self.info['PELMFas'],'InFile':fasfile})
            pblast.setStat({'HitAln':pblast.stat['OneLine']})
            pblast.opt['Complexity Filter'] = False
            pblast.formatDB(force=False)
            ## ~ [1d] Setup GABLAM Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            gkey = 'GABLAMO ID' #x# % self.info['GABLAMO Key']
            # Cutoffs below 1.0 are scaled to match the percentage scale of qvh below; negatives become 0.0
            for g in ['ID','Hom']:
                if self.stat['%sSim' % g] < 1.0: self.stat['%sSim' % g] *= 100.0
                self.stat['%sSim' % g] = max(0.0,self.stat['%sSim' % g])

            ### ~ [2] PhosphoBLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            pblast.blast(use_existing=True,log=True)    # BLAST
            pblast.readBLAST(gablam=True)               # Read in
            while pblast.search:
                ## ~ [2a] Align relevant hits from each BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                search = pblast.search.pop(0)   # Searches are consumed as they are processed
                qseq = qdict[search.info['Name']]
                idlist = []     # Sequences giving 'ID' level evidence (and the query itself on a self-hit)
                qlen = qseq.aaLen()
                hitdict = search.hitSeq(self.obj['SeqList'])
                aln = rje_seq.SeqList(self.log,self.cmd_list+['autoload=F','autofilter=F'])
                aln.seq = [qseq]
                pdict = {}      # Dictionary of {hseq:[poslist]}
                rdict = {qseq:0}      # Dictionary of {hseq:res}
                for hit in search.hit[0:]:
                    hseq = hitdict[hit]
                    pdict[hseq] = list(rje.sortKeys(self.dict['PhosphoSites'][hseq.info['AccNum']]))
                    if hit.info['Name'] == search.info['Name']:     # Query is itself in the database
                        if qseq.getSequence(case=False,gaps=False) != hseq.getSequence(case=False,gaps=False):
                            # No exception is active here, so there is no error detail to print
                            self.log.errorLog('Major problem: Search/Hit sequence mismatch for same sequence "%s"' % hit.info['Name'],printerror=False)
                        idlist.append(qseq)
                        pdict[qseq] = pdict.pop(hseq)   # Known sites are mapped straight onto the query
                        continue
                    gdict = hit.globalFromLocal(qlen)
                    qvh = float(100 * gdict['Query'][gkey]) / float(qlen)
                    if qvh < self.stat['HomSim']:       # Too dissimilar: not aligned, sites not used
                        pdict.pop(hseq)
                        continue
                    aln.seq.append(hseq)
                    if (qseq.sameSpec(hseq) or not self.opt['UseSpec']) and qvh >= self.stat['IDSim']: idlist.append(hseq)
                    rdict[hseq] = 0
                aln.muscleAln()   #x#outfile='%s%s.phosaln.fas' % (ppath,qseq.info['AccNum']))
                aln._addSeq('PhosAln','-' * qseq.seqLen())      # Extra sequence on which sites are marked
                aln.info['Name'] = '%s%s.phosaln.fas' % (ppath,qseq.info['AccNum'])
                ## ~ [2b] Map phosphorylations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                # rdict counts the ungapped residues passed so far in each sequence, i.e. the position that is
                # compared with the site lists in pdict.
                for a in range(qseq.seqLen()):
                    qres = qseq.info['Sequence'][a]
                    if qres != '-': rdict[qseq] += 1
                    for hseq in pdict:
                        hres = hseq.info['Sequence'][a]
                        if hres == '-': continue
                        if hseq != qseq: rdict[hseq] += 1   # Query counter was already advanced above
                        if rdict[hseq] in pdict[hseq] and qres == hres:  # Phosphosite
                            pdata = {'Name':search.info['Name'],'Pos':rdict[qseq],'AA':qres,
                                     'PELM':hseq.shortName(),'PELMPos':rdict[hseq],'Evidence':'Hom'}
                            if hseq == qseq: pdata['Evidence'] = 'Self'
                            elif hseq in idlist: pdata['Evidence'] = 'ID'
                            rje.delimitedFileOutput(self,self.info['PhosRes'],headers,delimit,pdata)
                            self.addPhos(aln.seq[-1],a,pdata['Evidence'])
                ## ~ [2c] Add Scansite/NetPhos if made? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                ## ~ [2d] Save alignment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                aln.saveFasta()

            return
        except: self.log.errorLog('Problem during PhosphoSeq.mapPhosByBLAST')
Esempio n. 36
0
    def buildPam(self):     ### Builds PAM Matrix in memory
        '''
        Builds PAM matrix in memory.

        The PAM1 file is looked for as self.info['Name'], then inside self.info['Path'], then inside
        <Path>/../data/. Its first line is the alphabet, followed by one line per alphabet letter holding a row
        label and one probability per letter. A PAM0 (identity) matrix and the PAM1 matrix are appended to
        self.matrix and self.pamUp() is then called.

        Side effects: self.info['Name'] becomes the path actually read (or None if no file was found);
        self.alphabet is set; self.opt['X-Value'] / self.opt['GapValue'] are switched off when the alphabet line
        itself contains X / '-'.

        Raises ValueError if no PAM file is found, the file is empty, or its rows do not match the alphabet.
        Every exception is logged through self.log.errorLog() and then re-raised.
        '''
        def addWildcards(pamp):     ### Adds the X and/or gap pairings (all with value 1) to a raw PAM dictionary
            wild = []
            if self.opt['X-Value']: wild.append('X')
            if self.opt['GapValue']: wild.append('-')
            for w in wild:
                for r in self.alphabet:
                    pamp[w + r] = 1
                    pamp[r + w] = 1
                for w2 in wild: pamp[w + w2] = 1    # XX, --, and X-/-X when both are in use

        try:
            ### Check for Alternative PAM Matrix ###
            if self.info['AltPam'].lower() not in ['','none']:
                self.altPAM()

            self.verbose(0,3,"Reading PAM1 matrix from %s" % self.info['Name'],2)
            ### <a> ### Open file & Read Lines
            pamfiles = [self.info['Name'],rje.makePath(self.info['Path']) + self.info['Name'],rje.makePath(self.info['Path']) + rje.makePath('../data/') + self.info['Name']]
            self.info['Name'] = None
            file_lines = []
            for pfile in pamfiles:
                if rje.checkForFile(pfile):
                    with open(pfile, 'r') as pamin:
                        file_lines = pamin.readlines()
                    self.info['Name'] = pfile
                    break
            if not self.info['Name']:
                for pfile in pamfiles: self.printLog('#ERR','File "%s" not found' % pfile)
                self.printLog('#ERR','No PAM file found!')
                raise ValueError('No PAM file found!')
            if not file_lines:
                raise ValueError('PAM file "%s" is empty' % self.info['Name'])

            ### <b> ### Read in alphabet
            self.verbose(0,3,file_lines[0],1)
            # X or gap already in the matrix alphabet: do not add it again as a wildcard
            if file_lines[0].upper().find('X') >= 0:
                self.opt['X-Value'] = False
            if file_lines[0].find('-') >= 0:
                self.opt['GapValue'] = False
            self.alphabet = file_lines[0].split()

            ### <c> ### Make PAM0
            zeropamp = {}
            for r in self.alphabet:
                for c in self.alphabet:
                    zeropamp[r + c] = 0
                zeropamp[r + r] = 1
            addWildcards(zeropamp)
            self.matrix.append(PAM(pam=0,rawpamp=zeropamp,alpha=self.alphabet))

            ## <d> ## Read in PAM1
            if len(file_lines) < len(self.alphabet) + 1:
                raise ValueError('PAM file "%s" has %d matrix lines but %d alphabet letters' % (self.info['Name'],len(file_lines) - 1,len(self.alphabet)))
            rawpamp = {}
            for (r,pamrow) in zip(self.alphabet,file_lines[1:]):
                pamline = pamrow.split()    # Row label followed by one probability per alphabet letter
                if len(pamline) != (len(self.alphabet)+1):
                    self.log.errorLog("%s has wrong format! Does not match %s" % (pamline, self.alphabet),printerror=False,quitchoice=True)
                    raise ValueError('PAM file "%s" has wrong format' % self.info['Name'])
                for (c,prob) in zip(self.alphabet,pamline[1:]):
                    rawpamp[r + c] = float(prob)
            addWildcards(rawpamp)
            self.matrix.append(PAM(pam=1,rawpamp=rawpamp,alpha=self.alphabet))

            ## <e> ## Raise to pammax
            self.log.printLog('\r#PAM','Building PAM Matrices <= %d: ' % self.stat['PamMax'],log=False,newline=False)
            self.pamUp()
            self.log.printLog('\r#PAM','Building PAM Matrices <= %d: Complete.' % self.stat['PamMax'])
        except:
            self.log.errorLog('Fatal Error in PamCtrl.buildPam().')
            raise
Esempio n. 37
0
    def codons(self):  ### Main codons analysis method
        '''Main codons analysis method.'''
        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            flybase = rje.makePath('/scratch/Databases/NewDB/FlyBase/Fasta/')
            scmd = ['accnr=F','seqnr=F','gnspacc=F']
            cds = rje_seq.SeqList(self.log, self.cmd_list+['seqin=%sdmel-all-CDS-r5.5.fasta' % flybase]+scmd)
            gcode = rje_sequence.genetic_code

            ### ~ [1] ~ Make codon frequency tables (a) Observed, (b) Based on NTFreq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            nts = ['A','C','G','T']
            ntfreq = cds.aaFreq(alphabet=nts)
            codons = []     # List of codons
            obs_cfreq = {}  # Observed codon frequencies
            nts_cfreq = {}  # Codon frequencies from NT frequencies
            obs_tfreq = {}  # Observed triplet frequencies
            nts_tfreq = {}  # Predicted triplet frequencies from NT frequencies
            ocd_tfreq = {}  # Predicted triplet frequencies from observed codon frequencies
            ncd_tfreq = {}  # Predicted triplet frequencies from nt-predicted codon frequencies
            ## ~ [1a] ~ Setup dictionaries using nt freqs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            for n1 in nts:
                for n2 in nts:
                    for n3 in nts:
                        cod = '%s%s%s' % (n1,n2,n3)
                        codons.append(cod)
                        aa = gcode[string.replace(cod,'T','U')]
                        if aa not in obs_cfreq: obs_cfreq[aa] = {}
                        if aa not in nts_cfreq: nts_cfreq[aa] = {}
                        obs_cfreq[aa][cod] = 0.0
                        nts_cfreq[aa][cod] = ntfreq[n1] * ntfreq[n2] * ntfreq[n3]
                        obs_tfreq[cod] = 0.0
                        nts_tfreq[cod] = ntfreq[n1] * ntfreq[n2] * ntfreq[n3]
                        ocd_tfreq[cod] = 0.0
                        ncd_tfreq[cod] = 0.0
            nts_tfreq = rje.dictFreq(nts_tfreq,total=False)                                 # Normalise triplet freq.
            for aa in nts_cfreq: nts_cfreq[aa] = rje.dictFreq(nts_cfreq[aa],total=False)    # Normalise codon freq.
            self.log.printLog('#FREQ','Frequency dictionaries set up.')
            ## ~ [1b] ~ Observed codon freq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            (sx,stot) = (0.0,cds.seqNum())
            for seq in cds.seq[0:]:
                self.log.printLog('\r#OBS','Calculating observed codon frequencies: %.1f%%' % (sx/stot),newline=False,log=False)
                sx += 100.0
                try: (id,scaffold,pos,name,glen,parent) = rje.matchExp('^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);.+parent=(\S+),\S+;',seq.info['Name'])
                except:
                    self.log.errorLog(seq.info['Name'])
                    raise
                try: exons = rje.matchExp('^complement\((\d+\..*\.\d+)\)',pos)[0]
                except:
                    try: exons = rje.matchExp('^join\((\d+\..*\.\d+)\)',pos)[0]
                    except: exons = rje.matchExp('^(\d+\.\.\d+)',pos)[0]
                self.deBug(exons)
                exons = string.split(exons,',')
                elen = []
                try:
                    for exon in exons:
                        (start,end) = string.split(exon,'..')
                        elen.append(string.atoi(end) - string.atoi(start) + 1)
                except:
                    self.log.errorLog(id)
                    cds.seq.remove(seq)
                    continue
                        
                if pos[:4] == 'comp': elen.reverse()
                seq.list['ExonLen'] = elen
                self.deBug(elen)
                if sum(elen) != seq.aaLen(): self.log.errorLog('%s exon length error' % id,printerror=False)
                if seq.aaLen()/3 != seq.aaLen()/3.0:
                    self.log.errorLog('%s not a multiple of 3nt long!' % id,printerror=False)
                    cds.seq.remove(seq)
                    continue
                #!# Add use exon option - single full-length exon if false (mature mRNA) #!#
                sequence = seq.info['Sequence'][0:]
                if string.count(sequence,'N') > 0:
                    self.log.errorLog('%s has 1+ Ns!' % id,printerror=False)
                    cds.seq.remove(seq)
                    continue
                while sequence:
                    cod = sequence[:3]
                    sequence = sequence[3:]
                    aa = gcode[string.replace(cod,'T','U')]
                    obs_cfreq[aa][cod] += 1
            for aa in obs_cfreq: obs_cfreq[aa] = rje.dictFreq(obs_cfreq[aa],total=False)    # Normalise codon freq.
            self.log.printLog('\r#OBS','Calculating observed codon frequencies complete.')

            ### ~ [2] ~ Generate Triplet freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (sx,stot) = (0.0,cds.seqNum())
            for seq in cds.seq:
                self.log.printLog('\r#TRIP','Calculating triplet frequencies: %.1f%%' % (sx/stot),newline=False,log=False)
                sx += 100.0
                elen = seq.list['ExonLen'] 
                sequence = seq.info['Sequence'][0:]
                aa = ''
                cod = ''
                ax = 0      # Measure sequence length processed for exon boundary checks
                while sequence:
                    prevcod = cod
                    cod = sequence[:3]
                    prevaa = aa
                    sequence = sequence[3:]
                    aa = gcode[string.replace(cod,'T','U')]
                    ## ~ [2a] ~ Predicted Triplet Freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    for cod2 in obs_cfreq[aa]:
                        if elen[0] > ax + 3:    # Exon boundary beyond this codon
                            ocd_tfreq[cod2] += obs_cfreq[aa][cod2]
                            ncd_tfreq[cod2] += nts_cfreq[aa][cod2]
                        if prevaa:              # Look at overlap with previous codon
                            for cod1 in obs_cfreq[prevaa]:
                                for i in range(1,3):
                                    if elen[0] > ax + i:    # Exon boundary beyond overlap
                                        acod = cod1[i:] + cod2[:i]
                                        ocd_tfreq[acod] += (obs_cfreq[prevaa][cod1] * obs_cfreq[aa][cod2])
                                        ncd_tfreq[acod] += (nts_cfreq[prevaa][cod1] * nts_cfreq[aa][cod2])
                    ## ~ [2b] ~ Observed Triplet Freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    if elen[0] > ax + 3:    # Exon boundary beyond this codon
                        obs_tfreq[cod] += 1
                    if prevcod:              # Look at overlap with previous codon
                        for i in range(1,3):
                            if elen[0] > ax + i:    # Exon boundary beyond overlap
                                acod = prevcod[i:] + cod[:i]
                                obs_tfreq[acod] += 1
                    # Check exons #
                    ax += 3
                    if ax >= elen[0]: ax -= elen.pop(0)
            obs_tfreq = rje.dictFreq(obs_tfreq,total=False)
            ocd_tfreq = rje.dictFreq(ocd_tfreq,total=False)
            ncd_tfreq = rje.dictFreq(ncd_tfreq,total=False)    
            self.log.printLog('\r#TRIP','Calculating triplet frequencies complete.')

            ### ~ [3] ~ Output results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            headers = ['Triplet','AA','Degen','Obs_Codon','NT_Codon','Obs_Trip','NT_Trip','ObCod_Trip','NTCod_Trip']
            tfile = 'quad_triplet.tdt'
            rje.delimitedFileOutput(self,tfile,headers,rje_backup=True)
            for cod in codons:
                aa = gcode[string.replace(cod,'T','U')]
                datadict = {'Triplet':cod,'AA':aa,'Degen':len(obs_cfreq[aa]),'Obs_Codon':obs_cfreq[aa][cod],
                            'NT_Codon':nts_cfreq[aa][cod],'Obs_Trip':obs_tfreq[cod],'NT_Trip':nts_tfreq[cod],
                            'ObCod_Trip':ocd_tfreq[cod],'NTCod_Trip':ncd_tfreq[cod]}
                rje.delimitedFileOutput(self,tfile,headers,datadict=datadict)
            self.log.printLog('#OUT','Triplet & codon data output to %s' % tfile)
        except: self.log.errorLog(rje_zen.Zen().wisdom())
Esempio n. 38
0
    def makeFlySeq(self):  ### Builds gene fasta file annotated with gene-relative CDS and exon positions
        '''
        Main run method. Loads the FlyBase gene, CDS and exon fasta files, parses the location details held in each
        sequence name, converts CDS/exon positions into coordinates relative to the parent gene start and saves the
        renamed genes to flybase_DROME.genes.fas. Any exception is reported through self.log.errorLog().
        '''
        try:### ~ [0] Setup: load sequences and define local helpers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            fastadir = rje.makePath('/scratch/Databases/NewDB/FlyBase/Fasta/')
            seqcmd = ['accnr=F','seqnr=F','gnspacc=F']
            genes = rje_seq.SeqList(self.log, self.cmd_list+['seqin=%sdmel-all-gene-r5.5.fasta' % fastadir]+seqcmd)
            cds = rje_seq.SeqList(self.log, self.cmd_list+['seqin=%sdmel-all-CDS-r5.5.fasta' % fastadir]+seqcmd)
            exons = rje_seq.SeqList(self.log, self.cmd_list+['seqin=%sdmel-all-exon-r5.5.fasta' % fastadir]+seqcmd)

            def featureSpan(location):  ### Returns (start,end) integers parsed from a CDS/exon location string
                '''Tries complement(...), then join(...), then plain x..y. Complement numbers are swapped (start > end).'''
                try: (stop,begin) = rje.matchExp(r'^complement\((\d+)\..*\.(\d+)\)',location)
                except:
                    try: (begin,stop) = rje.matchExp(r'^join\((\d+)\..*\.(\d+)\)',location)
                    except: (begin,stop) = rje.matchExp(r'^(\d+)\.\.(\d+)',location)
                return (int(begin),int(stop))

            def relativeSpans(parent,features,width):   ### Returns sorted unique 'start-end' strings relative to parent start
                '''Positions are measured from parent.stat['Start'] and padded with rje.preZero() using width.'''
                spans = []
                for feature in features:
                    if parent.opt['Complement']:    # Must subtract from the "wrong" end and reverse
                        offsets = (parent.stat['Start'] - feature.stat['Start'],
                                   parent.stat['Start'] - feature.stat['End'])
                    else:
                        offsets = (feature.stat['Start'] - parent.stat['Start'],
                                   feature.stat['End'] - parent.stat['Start'])
                    spans.append('%s-%s' % (rje.preZero(offsets[0],width),rje.preZero(offsets[1],width)))
                return rje.sortUnique(spans,xreplace=False)

            ### ~ [1] ~ Read in full-length genes and note start and end positions in parent scaffold ~~~~~~~~~~~~~~~ ###
            genedict = {}   # Dictionary of {ID:Sequence object}
            gtot = genes.seqNum()
            for (gx,gene) in enumerate(genes.seq):
                self.log.printLog('\r#GENE','Processing Gene Annotation: %.1f%%' % (100.0*gx/gtot),newline=False,log=False)
                (acc,scaffold,locus,name,seqlen) = rje.matchExp(r'^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);',gene.info['Name'])
                if int(seqlen) != gene.aaLen(): self.log.errorLog('%s Length mismatch!' % acc, printerror=False)
                genedict[acc] = gene
                gene.setInfo({'Scaffold':scaffold,'Gene':name})
                # Genes only have simple x..y locations, optionally wrapped in complement()
                try: (stop,begin) = rje.matchExp(r'^complement\((\d+)\.\.(\d+)\)',locus)
                except: (begin,stop) = rje.matchExp(r'^(\d+)\.\.(\d+)',locus)
                (begin,stop) = (int(begin),int(stop))
                gene.opt['Complement'] = begin > stop       # Sequence on "lagging" strand
                gene.setStat({'Start':begin,'End':stop})
                gene.list['CDS'] = []       # Will add CDS sequences here
                gene.list['Exon'] = []      # Will add exon sequences here
            self.log.printLog('\r#GENE','Processing Gene Annotation complete!')

            ### ~ [2] ~ Read in associated CDS sequences and note start and end positions ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ctot = cds.seqNum()
            for (cx,seq) in enumerate(cds.seq):
                self.log.printLog('\r#CDS','Processing CDS Annotation: %.1f%%' % (100.0*cx/ctot),newline=False,log=False)
                try: (acc,scaffold,locus,name,seqlen,parent) = rje.matchExp(r'^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);.+parent=(\S+),\S+;',seq.info['Name'])
                except:
                    self.log.errorLog(seq.info['Name'])     # Report the unparsable name before aborting
                    raise
                if int(seqlen) != seq.aaLen(): self.log.errorLog('%s Length mismatch!' % acc, printerror=False)
                gene = genedict[parent]
                seq.obj['Parent'] = gene
                (begin,stop) = featureSpan(locus)
                seq.opt['Complement'] = begin > stop        # Sequence on "lagging" strand
                seq.setStat({'Start':begin,'End':stop})
                gene.list['CDS'].append(seq)
            self.log.printLog('\r#CDS','Processing CDS Annotation complete!')

            ### ~ [3] ~ Read in associated exons and note start and end positions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            etot = exons.seqNum()
            for (ex,seq) in enumerate(exons.seq):
                self.log.printLog('\r#EXON','Processing Exon Annotation: %.1f%%' % (100.0*ex/etot),newline=False,log=False)
                try: (acc,scaffold,locus,name,parent) = rje.matchExp(r'^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+parent=(\S+);',seq.info['Name'])
                except:
                    self.log.errorLog(seq.info['Name'])     # Report the unparsable name before aborting
                    raise
                gene = genedict[parent.split(',')[0]]       # Only the first listed parent is used
                seq.obj['Parent'] = gene
                (begin,stop) = featureSpan(locus)
                seq.opt['Complement'] = begin > stop        # Sequence on "lagging" strand
                seq.setStat({'Start':begin,'End':stop})
                gene.list['Exon'].append(seq)
            self.log.printLog('\r#EXON','Processing Exon Annotation complete!')

            ### ~ [4] ~ Regenerate output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [4a] ~ Convert to relative positions and store ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            gtot = genes.seqNum()
            for (gx,gene) in enumerate(genes.seq):
                width = gene.aaLen()
                self.log.printLog('\r#GENE','Generating new Gene Annotation: %.1f%%' % (100.0*gx/gtot),newline=False,log=False)
                cdspos = relativeSpans(gene,gene.list['CDS'],width)
                exonpos = relativeSpans(gene,gene.list['Exon'],width)
                newname = '%s_%s__%s Length=%d; CDS=%s; Exons=%s;' % (gene.info['Gene'],gene.info['SpecCode'],gene.info['AccNum'],gene.aaLen(),','.join(cdspos),','.join(exonpos))
                gene.info['Name'] = newname
            self.log.printLog('\r#GENE','Generating new Gene Annotation complete!')
            ## ~ [4b] ~ Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            genes.saveFasta(seqfile='flybase_DROME.genes.fas')

        except: self.log.errorLog(rje_zen.Zen().wisdom())
Esempio n. 39
0
    def makeFlySeq(self):  ### Builds gene fasta file annotated with gene-relative CDS and exon positions
        '''
        Main run method.

        Loads the FlyBase gene, CDS and exon fasta files, parses the location
        details held in each sequence name, converts CDS/exon positions into
        coordinates relative to the parent gene start and saves the renamed
        genes to flybase_DROME.genes.fas. Any exception is reported through
        self.log.errorLog().
        '''
        try:  ### ~ [0] Setup: load sequences and define local helpers ~~~ ###
            fasta_dir = rje.makePath('/scratch/Databases/NewDB/FlyBase/Fasta/')
            seq_cmd = ['accnr=F', 'seqnr=F', 'gnspacc=F']
            gene_cmd = ['seqin=%sdmel-all-gene-r5.5.fasta' % fasta_dir]
            genes = rje_seq.SeqList(self.log, self.cmd_list + gene_cmd + seq_cmd)
            cds_cmd = ['seqin=%sdmel-all-CDS-r5.5.fasta' % fasta_dir]
            cds = rje_seq.SeqList(self.log, self.cmd_list + cds_cmd + seq_cmd)
            exon_cmd = ['seqin=%sdmel-all-exon-r5.5.fasta' % fasta_dir]
            exons = rje_seq.SeqList(self.log, self.cmd_list + exon_cmd + seq_cmd)

            def feature_span(location):
                '''
                Returns (start, end) integers for a CDS/exon location string.
                Tries complement(...), then join(...), then plain x..y; the
                complement numbers are swapped so that start > end.
                '''
                try:
                    (last, first) = rje.matchExp(
                        r'^complement\((\d+)\..*\.(\d+)\)', location)
                except:
                    try:
                        (first, last) = rje.matchExp(
                            r'^join\((\d+)\..*\.(\d+)\)', location)
                    except:
                        (first, last) = rje.matchExp(r'^(\d+)\.\.(\d+)',
                                                     location)
                return (int(first), int(last))

            def relative_spans(parent, features, width):
                '''
                Returns sorted unique 'start-end' strings for features, with
                positions measured from parent.stat['Start'] and padded by
                rje.preZero() using width.
                '''
                spans = []
                for feature in features:
                    if parent.opt['Complement']:
                        # Must subtract from the "wrong" end and reverse
                        first = parent.stat['Start'] - feature.stat['Start']
                        last = parent.stat['Start'] - feature.stat['End']
                    else:
                        first = feature.stat['Start'] - parent.stat['Start']
                        last = feature.stat['End'] - parent.stat['Start']
                    spans.append('%s-%s' % (rje.preZero(first, width),
                                            rje.preZero(last, width)))
                return rje.sortUnique(spans, xreplace=False)

            ### ~ [1] ~ Read in full-length genes and note start and end positions in parent scaffold ~~~ ###
            genedict = {}  # Dictionary of {ID:Sequence object}
            gene_total = genes.seqNum()
            for (done, gene) in enumerate(genes.seq):
                percent = 100.0 * done / gene_total
                self.log.printLog('\r#GENE',
                                  'Processing Gene Annotation: %.1f%%' % percent,
                                  newline=False,
                                  log=False)
                (acc, scaffold, locus, name, seqlen) = rje.matchExp(
                    r'^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);',
                    gene.info['Name'])
                if int(seqlen) != gene.aaLen():
                    self.log.errorLog('%s Length mismatch!' % acc,
                                      printerror=False)
                genedict[acc] = gene
                gene.setInfo({'Scaffold': scaffold, 'Gene': name})
                # Genes only have simple x..y locations, optionally wrapped in complement()
                try:
                    (last, first) = rje.matchExp(
                        r'^complement\((\d+)\.\.(\d+)\)', locus)
                except:
                    (first, last) = rje.matchExp(r'^(\d+)\.\.(\d+)', locus)
                (first, last) = (int(first), int(last))
                gene.opt['Complement'] = first > last  # Sequence on "lagging" strand
                gene.setStat({'Start': first, 'End': last})
                gene.list['CDS'] = []  # Will add CDS sequences here
                gene.list['Exon'] = []  # Will add exon sequences here
            self.log.printLog('\r#GENE',
                              'Processing Gene Annotation complete!')

            ### ~ [2] ~ Read in associated CDS sequences and note start and end positions ~~~ ###
            cds_total = cds.seqNum()
            for (done, seq) in enumerate(cds.seq):
                percent = 100.0 * done / cds_total
                self.log.printLog('\r#CDS',
                                  'Processing CDS Annotation: %.1f%%' % percent,
                                  newline=False,
                                  log=False)
                try:
                    (acc, scaffold, locus, name, seqlen, parent) = rje.matchExp(
                        r'^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);.+parent=(\S+),\S+;',
                        seq.info['Name'])
                except:
                    # Report the unparsable name before aborting
                    self.log.errorLog(seq.info['Name'])
                    raise
                if int(seqlen) != seq.aaLen():
                    self.log.errorLog('%s Length mismatch!' % acc,
                                      printerror=False)
                gene = genedict[parent]
                seq.obj['Parent'] = gene
                (first, last) = feature_span(locus)
                seq.opt['Complement'] = first > last  # Sequence on "lagging" strand
                seq.setStat({'Start': first, 'End': last})
                gene.list['CDS'].append(seq)
            self.log.printLog('\r#CDS', 'Processing CDS Annotation complete!')

            ### ~ [3] ~ Read in associated exons and note start and end positions ~~~ ###
            exon_total = exons.seqNum()
            for (done, seq) in enumerate(exons.seq):
                percent = 100.0 * done / exon_total
                self.log.printLog('\r#EXON',
                                  'Processing Exon Annotation: %.1f%%' % percent,
                                  newline=False,
                                  log=False)
                try:
                    (acc, scaffold, locus, name, parent) = rje.matchExp(
                        r'^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+parent=(\S+);',
                        seq.info['Name'])
                except:
                    # Report the unparsable name before aborting
                    self.log.errorLog(seq.info['Name'])
                    raise
                # Only the first listed parent is used
                gene = genedict[parent.split(',')[0]]
                seq.obj['Parent'] = gene
                (first, last) = feature_span(locus)
                seq.opt['Complement'] = first > last  # Sequence on "lagging" strand
                seq.setStat({'Start': first, 'End': last})
                gene.list['Exon'].append(seq)
            self.log.printLog('\r#EXON',
                              'Processing Exon Annotation complete!')

            ### ~ [4] ~ Regenerate output ~~~ ###
            ## ~ [4a] ~ Convert to relative positions and store ~~~ ##
            gene_total = genes.seqNum()
            for (done, gene) in enumerate(genes.seq):
                width = gene.aaLen()
                percent = 100.0 * done / gene_total
                self.log.printLog(
                    '\r#GENE',
                    'Generating new Gene Annotation: %.1f%%' % percent,
                    newline=False,
                    log=False)
                cds_spans = relative_spans(gene, gene.list['CDS'], width)
                exon_spans = relative_spans(gene, gene.list['Exon'], width)
                name_fields = (gene.info['Gene'], gene.info['SpecCode'],
                               gene.info['AccNum'], gene.aaLen(),
                               ','.join(cds_spans), ','.join(exon_spans))
                gene.info['Name'] = (
                    '%s_%s__%s Length=%d; CDS=%s; Exons=%s;' % name_fields)
            self.log.printLog('\r#GENE',
                              'Generating new Gene Annotation complete!')
            ## ~ [4b] ~ Save ~~~ ##
            genes.saveFasta(seqfile='flybase_DROME.genes.fas')

        except:
            self.log.errorLog(rje_zen.Zen().wisdom())
Esempio n. 40
0
    def codons(self):  ### Main codons analysis method
        '''
        Main codons analysis method.

        Loads the FlyBase CDS fasta file (dmel-all-CDS-r5.5.fasta) and builds:
        - per-amino-acid codon frequencies: observed (obs_cfreq) and derived from nucleotide frequencies (nts_cfreq);
        - triplet frequencies over all three reading frames: observed (obs_tfreq), derived from nucleotide
          frequencies (nts_tfreq), predicted from observed codon usage (ocd_tfreq) and predicted from nt-derived
          codon usage (ncd_tfreq). Exon lengths parsed from each sequence name are used to gate which triplets
          are counted.
        Results are written, one row per triplet, to quad_triplet.tdt.

        Sequences with unparsable exon coordinates, a length that is not a multiple of three, or containing 'N'
        are removed from cds.seq. Any exception is caught and reported through self.log.errorLog(); nothing is
        returned.
        '''
        try:  ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            flybase = rje.makePath('/scratch/Databases/NewDB/FlyBase/Fasta/')
            scmd = ['accnr=F', 'seqnr=F', 'gnspacc=F']
            cds = rje_seq.SeqList(
                self.log, self.cmd_list +
                ['seqin=%sdmel-all-CDS-r5.5.fasta' % flybase] + scmd)
            # Codon -> amino acid lookup; keys are looked up below with T replaced by U
            gcode = rje_sequence.genetic_code

            ### ~ [1] ~ Make codon frequency tables (a) Observed, (b) Based on NTFreq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            nts = ['A', 'C', 'G', 'T']
            # presumably a {nucleotide: frequency} dictionary over the loaded CDS - TODO confirm aaFreq() return
            ntfreq = cds.aaFreq(alphabet=nts)
            codons = []  # List of codons
            obs_cfreq = {}  # Observed codon frequencies
            nts_cfreq = {}  # Codon frequencies from NT frequencies
            obs_tfreq = {}  # Observed triplet frequencies
            nts_tfreq = {}  # Predicted triplet frequencies from NT frequencies
            ocd_tfreq = {
            }  # Predicted triplet frequencies from observed codon frequencies
            ncd_tfreq = {
            }  # Predicted triplet frequencies from nt-predicted codon frequencies
            ## ~ [1a] ~ Setup dictionaries using nt freqs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Enumerate all 64 triplets. Codon tables are nested {aa: {codon: value}}; triplet tables are flat.
            for n1 in nts:
                for n2 in nts:
                    for n3 in nts:
                        cod = '%s%s%s' % (n1, n2, n3)
                        codons.append(cod)
                        aa = gcode[string.replace(cod, 'T', 'U')]
                        if aa not in obs_cfreq: obs_cfreq[aa] = {}
                        if aa not in nts_cfreq: nts_cfreq[aa] = {}
                        obs_cfreq[aa][cod] = 0.0
                        # Expected value is the product of the three nucleotide frequencies
                        nts_cfreq[aa][
                            cod] = ntfreq[n1] * ntfreq[n2] * ntfreq[n3]
                        obs_tfreq[cod] = 0.0
                        nts_tfreq[cod] = ntfreq[n1] * ntfreq[n2] * ntfreq[n3]
                        ocd_tfreq[cod] = 0.0
                        ncd_tfreq[cod] = 0.0
            nts_tfreq = rje.dictFreq(nts_tfreq,
                                     total=False)  # Normalise triplet freq.
            for aa in nts_cfreq:
                nts_cfreq[aa] = rje.dictFreq(
                    nts_cfreq[aa], total=False)  # Normalise codon freq.
            self.log.printLog('#FREQ', 'Frequency dictionaries set up.')
            ## ~ [1b] ~ Observed codon freq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # sx is incremented by 100 per sequence so that sx / stot is a percentage
            (sx, stot) = (0.0, cds.seqNum())
            # Iterate over a copy because rejected sequences are removed from cds.seq inside the loop
            for seq in cds.seq[0:]:
                self.log.printLog(
                    '\r#OBS',
                    'Calculating observed codon frequencies: %.1f%%' %
                    (sx / stot),
                    newline=False,
                    log=False)
                sx += 100.0
                # Only id and pos are used below; a failed parse is logged with the full name and re-raised
                try:
                    (id, scaffold, pos, name, glen, parent) = rje.matchExp(
                        '^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);.+parent=(\S+),\S+;',
                        seq.info['Name'])
                except:
                    self.log.errorLog(seq.info['Name'])
                    raise
                # Extract the comma-separated exon ranges: complement(...), then join(...), then a single x..y
                try:
                    exons = rje.matchExp('^complement\((\d+\..*\.\d+)\)',
                                         pos)[0]
                except:
                    try:
                        exons = rje.matchExp('^join\((\d+\..*\.\d+)\)', pos)[0]
                    except:
                        exons = rje.matchExp('^(\d+\.\.\d+)', pos)[0]
                self.deBug(exons)
                exons = string.split(exons, ',')
                elen = []  # Exon lengths in nucleotides, in the order listed in the location string
                try:
                    for exon in exons:
                        (start, end) = string.split(exon, '..')
                        elen.append(string.atoi(end) - string.atoi(start) + 1)
                except:
                    # Any exon that is not a plain 'start..end' pair rejects the whole sequence
                    self.log.errorLog(id)
                    cds.seq.remove(seq)
                    continue

                # Locations starting 'comp' (complement) have their exon order reversed
                if pos[:4] == 'comp': elen.reverse()
                seq.list['ExonLen'] = elen
                self.deBug(elen)
                # NOTE(review): an exon length mismatch is only logged; the sequence is kept. If the exon lengths
                # sum to less than the sequence length, elen[0] in section [2] would presumably raise IndexError
                # once the list is exhausted and abort the whole analysis via the outer except - confirm.
                if sum(elen) != seq.aaLen():
                    self.log.errorLog('%s exon length error' % id,
                                      printerror=False)
                # NOTE(review): this multiple-of-3 test relies on Python 2 integer division (assuming aaLen()
                # returns an int); under true division the two sides would always be equal - confirm.
                if seq.aaLen() / 3 != seq.aaLen() / 3.0:
                    self.log.errorLog('%s not a multiple of 3nt long!' % id,
                                      printerror=False)
                    cds.seq.remove(seq)
                    continue
                #!# Add use exon option - single full-length exon if false (mature mRNA) #!#
                sequence = seq.info['Sequence'][0:]
                if string.count(sequence, 'N') > 0:
                    self.log.errorLog('%s has 1+ Ns!' % id, printerror=False)
                    cds.seq.remove(seq)
                    continue
                # Consume the sequence three nucleotides at a time, counting each in-frame codon
                while sequence:
                    cod = sequence[:3]
                    sequence = sequence[3:]
                    aa = gcode[string.replace(cod, 'T', 'U')]
                    obs_cfreq[aa][cod] += 1
            for aa in obs_cfreq:
                obs_cfreq[aa] = rje.dictFreq(
                    obs_cfreq[aa], total=False)  # Normalise codon freq.
            self.log.printLog(
                '\r#OBS', 'Calculating observed codon frequencies complete.')

            ### ~ [2] ~ Generate Triplet freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            # stot is re-read here, after the rejected sequences were removed in [1b]
            (sx, stot) = (0.0, cds.seqNum())
            for seq in cds.seq:
                self.log.printLog('\r#TRIP',
                                  'Calculating triplet frequencies: %.1f%%' %
                                  (sx / stot),
                                  newline=False,
                                  log=False)
                sx += 100.0
                # Same list object as seq.list['ExonLen']: the pop(0) calls below consume the stored list
                elen = seq.list['ExonLen']
                sequence = seq.info['Sequence'][0:]
                aa = ''
                cod = ''
                ax = 0  # Measure sequence length processed for exon boundary checks
                while sequence:
                    # Remember the previous codon/amino acid (empty strings on the first codon) for overlaps
                    prevcod = cod
                    cod = sequence[:3]
                    prevaa = aa
                    sequence = sequence[3:]
                    aa = gcode[string.replace(cod, 'T', 'U')]
                    ## ~ [2a] ~ Predicted Triplet Freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    # Spread this position over every codon (cod2) of the same amino acid, weighted by codon
                    # frequency, instead of counting only the codon actually observed
                    for cod2 in obs_cfreq[aa]:
                        # NOTE(review): strict '>' skips a codon that ends exactly on the exon end - confirm intended
                        if elen[0] > ax + 3:  # Exon boundary beyond this codon
                            ocd_tfreq[cod2] += obs_cfreq[aa][cod2]
                            ncd_tfreq[cod2] += nts_cfreq[aa][cod2]
                        if prevaa:  # Look at overlap with previous codon
                            for cod1 in obs_cfreq[prevaa]:
                                # i=1,2: out-of-frame triplets made of the last 3-i nt of cod1 plus the first i nt of cod2
                                for i in range(1, 3):
                                    if elen[0] > ax + i:  # Exon boundary beyond overlap
                                        acod = cod1[i:] + cod2[:i]
                                        ocd_tfreq[acod] += (
                                            obs_cfreq[prevaa][cod1] *
                                            obs_cfreq[aa][cod2])
                                        ncd_tfreq[acod] += (
                                            nts_cfreq[prevaa][cod1] *
                                            nts_cfreq[aa][cod2])
                    ## ~ [2b] ~ Observed Triplet Freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    # Same gating as [2a] but counting the triplets actually present in the sequence
                    if elen[0] > ax + 3:  # Exon boundary beyond this codon
                        obs_tfreq[cod] += 1
                    if prevcod:  # Look at overlap with previous codon
                        for i in range(1, 3):
                            if elen[0] > ax + i:  # Exon boundary beyond overlap
                                acod = prevcod[i:] + cod[:i]
                                obs_tfreq[acod] += 1
                    # Check exons #
                    # Advance by one codon; once the current exon is used up, drop it and carry any overshoot
                    # into the next exon. NOTE(review): a single 'if' pops at most one exon per codon - confirm
                    # that exons shorter than the carried-over remainder cannot occur.
                    ax += 3
                    if ax >= elen[0]: ax -= elen.pop(0)
            obs_tfreq = rje.dictFreq(obs_tfreq, total=False)
            ocd_tfreq = rje.dictFreq(ocd_tfreq, total=False)
            ncd_tfreq = rje.dictFreq(ncd_tfreq, total=False)
            self.log.printLog('\r#TRIP',
                              'Calculating triplet frequencies complete.')

            ### ~ [3] ~ Output results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            headers = [
                'Triplet', 'AA', 'Degen', 'Obs_Codon', 'NT_Codon', 'Obs_Trip',
                'NT_Trip', 'ObCod_Trip', 'NTCod_Trip'
            ]
            tfile = 'quad_triplet.tdt'
            # First call has no datadict: presumably writes the header row, backing up any existing file - TODO confirm
            rje.delimitedFileOutput(self, tfile, headers, rje_backup=True)
            # One output row per triplet, in the order the triplets were generated in [1a]
            for cod in codons:
                aa = gcode[string.replace(cod, 'T', 'U')]
                datadict = {
                    'Triplet': cod,
                    'AA': aa,
                    'Degen': len(obs_cfreq[aa]),
                    'Obs_Codon': obs_cfreq[aa][cod],
                    'NT_Codon': nts_cfreq[aa][cod],
                    'Obs_Trip': obs_tfreq[cod],
                    'NT_Trip': nts_tfreq[cod],
                    'ObCod_Trip': ocd_tfreq[cod],
                    'NTCod_Trip': ncd_tfreq[cod]
                }
                rje.delimitedFileOutput(self,
                                        tfile,
                                        headers,
                                        datadict=datadict)
            self.log.printLog('#OUT',
                              'Triplet & codon data output to %s' % tfile)
        except:
            # Catch-all: any failure above is reported through the log and not propagated
            self.log.errorLog(rje_zen.Zen().wisdom())
Esempio n. 41
0
def loadOrthAln(callobj,seq,gopher=True):    ### Identifies file, loads and checks alignment.
    '''
    Identifies file, loads and checks alignment. If the identified file is not actually aligned, then RJE_SEQ will try to
    align the proteins using MUSCLE or ClustalW.
    >> callobj:Object containing settings for stats generation (MotifList, generally).
    >> seq:Sequence being analysed.
    >> gopher:bool [True] = whether to try to generate alignment with GOPHER if callobj.opt['Gopher']
    << aln = SeqList object containing alignment with queryseq. None is returned if no alignment file is found, the
       GOPHER run fails, the query is not in the alignment, fewer than two sequences load, the alignment check fails
       or any other error occurs (errors are logged, not raised).
    '''
    v = None    # Bound before the try block so the error handler never touches an unassigned name
    try:
        ### Setup Attributes ###
        v = callobj.stat['Verbose']
        alndir = rje.makePath(callobj.info['AlnDir'])
        alnext = callobj.info['AlnExt']

        ### Identify File ###
        if alnext[0] != '.': alnext = '.%s' % alnext
        # File stems are tried in order. The trailing None triggers the "nothing found" branch (GOPHER or give up).
        alnstart = [seq.info['AccNum'],seq.info['ID'],seq.shortName(),None]
        if v > 2: callobj.log.printLog('#PRESTO','%s' % callobj.opt)  #!# Old debugging? #!#
        if callobj.opt['Gopher'] and callobj.opt['FullForce']:
            if v > 0: callobj.log.printLog('#ALN','FullForce=T. Will call Gopher for %s regardless of existing files' % seq.shortName())
            alnstart = [None]
        for alnfile in alnstart:
            if alnfile:
                alnfile = '%s%s%s' % (alndir,alnfile,alnext)
                if rje.checkForFile(alnfile): break  # File found
            else:
                #!# Sort out logging and see if Gopher can be used directly rather than just run() #!#
                ### Run GOPHER ###
                if gopher and callobj.opt['Gopher']:  #!# Add working version for PRESTO and SlimPickings #!#
                    callobj.deBug('Run GOPHER in %s' % callobj.info['GopherDir'])
                    mydir = os.getcwd()
                    os.chdir(callobj.info['GopherDir'])
                    callobj.log.printLog('\n#GOPHER','Running GOPHER on %s' % seq.shortName())
                    try:    #!# Add log.silent() method? #!#
                        gcmd = ['orthtree'] + callobj.cmd_list + ['gnspacc=T','i=-1']
                        solo_gopher = gopher_V2.GopherFork(log=callobj.log,cmd_list=gcmd)
                        solo_gopher.info['Name'] = seq.shortName()
                        solo_gopher.obj['Sequence'] = seq
                        solo_gopher.obj['BLAST'] = gopher_V2.Gopher(callobj.log,gcmd).setupBlast()  #!# Contemplate setting up Gopher in callobj #!#
                        solo_gopher.obj['BLAST'].log = callobj.log
                        solo_gopher.run('orthalign')
                    except:
                        os.chdir(mydir)     # Return to the original directory before logging the failure
                        callobj.log.errorLog('Problem with Gopher run!')
                        return None
                    os.chdir(mydir)
                if callobj.opt['Gopher']:
                    # With Gopher enabled, the alignment is looked for under the accession number only
                    alnfile = '%s%s%s' % (alndir,seq.info['AccNum'],alnext)
                    if not os.path.exists(alnfile):
                        alnfile = None
                if not alnfile:
                    callobj.log.printLog('#ALN','No alignment file found for %s in %s.' % (seq.shortName(),alndir),screen=False)
                    return None

        ### Load Alignment ###
        callobj.log.stat['Verbose'] = v - 1     # Verbosity is lowered by one while the alignment loads
        alncmd = ['seqin=None','query=%s' % seq.shortName(),'accnr=F','seqnr=F','autofilter=F','align=T','gnspacc=F']
        aln = rje_seq.SeqList(log=callobj.log,cmd_list=callobj.cmd_list+alncmd)
        aln.loadSeqs(seqfile=alnfile,seqtype='Protein',aln=True,nodup=None)
        callobj.log.stat['Verbose'] = v
        ## Check Query ##
        qry = aln.obj['QuerySeq']
        if not qry:
            # Query not set from the short name: try again using the accession number
            if aln.querySeq(query=seq.info['AccNum']):
                qry = aln.obj['QuerySeq']
            else:
                callobj.log.printLog('#ALN','Problem finding %s in %s.' % (seq.shortName(),alnfile),screen=False)
                return None

        ### Check Alignment ###
        if aln.seqNum() < 2:
            callobj.log.printLog('#ALN','Not enough sequences for %s in %s.' % (seq.shortName(),alnfile),screen=False)
            return None
        if aln._checkAln(aln=True,realign=True):
            return aln
        else:
            callobj.log.printLog('#ERR','%s not aligned!!!' % (alnfile))
            return None
    except:
        callobj.log.errorLog('Something bad has happened in rje_motif_stats.loadOrthAln()')
        # Only restore verbosity if it was read; otherwise there is nothing to put back
        if v is not None: callobj.log.stat['Verbose'] = v
        return None
Esempio n. 42
0
    def buildPam(self):  ### Builds PAM Matrix in memory
        '''
        Builds PAM matrix in memory.
        Reads the PAM1 file named by self.info['Name'] (tried as given, then under self.info['Path'], then under
        ../data/ relative to that path), builds PAM0 and PAM1 matrices, appends both to self.matrix and then calls
        self.pamUp() (logged as building the matrices up to self.stat['PamMax']).
        The first line of the file is the whitespace-separated alphabet. Each following line is one matrix row: a
        leading column (ignored here) followed by one probability per alphabet entry.
        Side effects: self.info['Name'] becomes the path actually read (None if nothing was found), self.alphabet is
        replaced, and self.opt['X-Value'] / self.opt['GapValue'] are set to False when the alphabet line already
        contains X / '-'.
        Raises ValueError if no PAM file is found or a row has the wrong number of columns. Any error is logged
        and re-raised.
        '''
        try:
            ### Check for Alternative PAM Matrix ###
            if self.info['AltPam'].lower() not in ['', 'none']:
                self.altPAM()

            self.verbose(0, 3,
                         "Reading PAM1 matrix from %s" % self.info['Name'], 2)
            ### <a> ### Open file & Read Lines
            pamfiles = [
                self.info['Name'],
                rje.makePath(self.info['Path']) + self.info['Name'],
                rje.makePath(self.info['Path']) + rje.makePath('../data/') +
                self.info['Name']
            ]
            self.info['Name'] = None
            for pfile in pamfiles:
                if rje.checkForFile(pfile):
                    with open(pfile, 'r') as PAMFILE:  # Closed as soon as the lines are read
                        file_lines = PAMFILE.readlines()
                    self.info['Name'] = pfile
                    break
            if not self.info['Name']:
                for pfile in pamfiles:
                    self.printLog('#ERR', 'File "%s" not found' % pfile)
                self.printLog('#ERR', 'No PAM file found!')
                raise ValueError

            ### <b> ### Read in alphabet
            self.verbose(0, 3, file_lines[0], 1)
            # If the file supplies its own X or gap column, the automatic values below are not added
            if file_lines[0].upper().find('X') >= 0:
                self.opt['X-Value'] = False
            if file_lines[0].find('-') >= 0:
                self.opt['GapValue'] = False
            self.alphabet = file_lines[0].split()
            addx = self.opt['X-Value']
            addgap = self.opt['GapValue']

            ### <c> ### Make PAM0
            ## <i> ## Clear dics
            zeropamp = {}
            for r in self.alphabet:
                for c in self.alphabet:
                    zeropamp[r + c] = 0
                zeropamp[r + r] = 1  # PAM0 = identity
                if addx:
                    zeropamp['X' + r] = 1
                    zeropamp[r + 'X'] = 1
                if addgap:
                    zeropamp['-' + r] = 1
                    zeropamp[r + '-'] = 1
            if addx:
                zeropamp['XX'] = 1
            if addgap:
                zeropamp['--'] = 1
            if addx and addgap:
                zeropamp['-X'] = 1
                zeropamp['X-'] = 1
            ## <ii> ## New Matrix
            self.matrix.append(PAM(pam=0, rawpamp=zeropamp, alpha=self.alphabet))

            ## <d> ## Read in PAM1
            rawpamp = {}
            for rx, r in enumerate(self.alphabet):
                pamline = file_lines[rx + 1].split()  # Row rx of the matrix is on line rx+1 of the file
                if len(pamline) != (len(self.alphabet) + 1):
                    self.log.errorLog(
                        "%s has wrong format! Does not match %s" %
                        (pamline, self.alphabet),
                        printerror=False,
                        quitchoice=True)
                    # Explicit exception: a bare raise has no active exception to re-raise at this point
                    raise ValueError('PAM matrix row does not match alphabet')
                for c, prob in zip(self.alphabet, pamline[1:]):
                    rawpamp[r + c] = float(prob)
                if addx:
                    rawpamp['X' + r] = 1
                    rawpamp[r + 'X'] = 1
                if addgap:
                    rawpamp['-' + r] = 1
                    rawpamp[r + '-'] = 1
            if addx:
                rawpamp['XX'] = 1
            if addgap:
                rawpamp['--'] = 1
            if addx and addgap:
                rawpamp['-X'] = 1
                rawpamp['X-'] = 1
            self.matrix.append(PAM(pam=1, rawpamp=rawpamp, alpha=self.alphabet))

            ## <e> ## Raise to pammax
            self.log.printLog('\r#PAM',
                              'Building PAM Matrices <= %d: ' %
                              self.stat['PamMax'],
                              log=False,
                              newline=False)
            self.pamUp()
            self.log.printLog(
                '\r#PAM',
                'Building PAM Matrices <= %d: Complete.' % self.stat['PamMax'])
        except:
            self.log.errorLog('Fatal Error in PamCtrl.buildPam().')
            raise
Esempio n. 43
0
 def saveReadMe(self,filename='pydocs.txt',append=False):      ### Prints docs for modules to file
     '''
     Prints docs for modules to file.
     For each source directory in the PyDoc object's list['SourceDir'], writes a "-<dir>:" heading followed by a
     banner and the stored DocString of every entry in the 'Module' table whose key contains that directory name
     and whose .py file exists under the PyDoc 'PyPath'.
     >> filename:str = output file name
     >> append:boolean = whether to append to filename rather than start a new file with self.readMeHeader()
     Errors are logged with self.errorLog() rather than raised.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         pydoc = self.obj['PyDoc']
         if append:
             self.printLog('#DOC','Appending docstrings to %s' % filename)
             filemode = 'a'
         else:
             rje.mkDir(self,filename)
             self.printLog('#DOC','Writing docstrings to %s' % filename)
             filemode = 'w'
         dx = 0      # Number of module entries written (including those that could only get an error note)
         # The with-block guarantees the output file is closed even if writing fails part-way through
         with open(filename,filemode) as PYDOC:
             if not append: PYDOC.write(self.readMeHeader())
             db = self.db('Module')
             ### ~ [2] Output Docstrings ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             for sourcedir in pydoc.list['SourceDir']:
                 PYDOC.write('-%s:\n\n' % sourcedir)
                 for pyfile in db.dataKeys():
                     entry = db.data(pyfile)
                     module = entry['Module']
                     if sourcedir not in pyfile: continue
                     if not os.path.exists('%s%s%s.py' % (pydoc.getStr('PyPath'),rje.makePath(sourcedir),module)): continue
                     ## ~ [2a] ~ Module docstring ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                     mtxt = '### ~~~ Module %s ~ [%s] ~~~ ###' % (module,pyfile)
                     # Pad the banner with ~ on both sides of the title until it is at least 122 characters wide
                     while len(mtxt) < 122: mtxt = mtxt[:5] + '~' + mtxt[5:-5] + '~' + mtxt[-5:]
                     try: PYDOC.write('%s\n\n%s\n' % (mtxt,entry['DocString']))
                     except:
                         self.errorLog('Cannot write DocString for %s' % module,printerror=False)
                         PYDOC.write('%s\n\nDocString Error!\n' % (mtxt))
                     dx += 1
                 PYDOC.write('\n\n\n')
         self.printLog('#DOC','Output to %s complete: %s modules.' % (filename,rje.iStr(dx)))
     except: self.errorLog('Error in %s.saveReadMe()' % self.prog())
Esempio n. 44
0
	def run(self):		### Main Run method
		'''
		Main Run method. Performs the first applicable mode:
		1. SLiMDisc (opt['SLiMDisc']): hands over to self.slimDisc() and returns its result.
		2. TEIRESIAS (opt['Teiresias']): runs the TEIRESIAS program on the input sequences (skipped when the output is
		   newer than the input and regeneration is not requested), reads the patterns from its output, scores them
		   and writes delimited results to info['Name'] (or to *.patterns / *.occ tables if opt['MySQL']).
		3. InfoContent (info['Info'] != 'None'): loads motifs from info['Info'] and writes an information score for each.
		<< Result of self.slimDisc() in SLiMDisc mode; False if the InfoContent input file is missing; otherwise None.
		Errors are logged and re-raised.
		'''
		try:
			### SLiMDisc Run ###
			if self.opt['SLiMDisc']:
				return self.slimDisc()
			
			### TEIRESIAS ###
			if self.opt['Teiresias']:
				## Setup ##
				seqlist = rje_seq.SeqList(self.log,self.cmd_list)
				seqbase = rje.baseFile(seqlist.info['Name'],True)
				infile = '%s.teiresias.fas' % seqbase
				outfile = '%s.teiresias.out' % seqbase
				run_teiresias = True
				if rje.isYounger(outfile,infile) == outfile:
					if self.stat['Interactive'] < 1 or not rje.yesNo('%s and %s exist already. Regenerate?' % (infile,outfile),'N'):
						run_teiresias = False
				## Run TEIRESIAS ##
				if run_teiresias:
					seqlist.saveFasta(seqfile=infile,name='Teiresias')	### Saves sequences in fasta format
					command = rje.makePath(self.info['TeiresiasPath'],True)
					command += ' -i%s -o%s %s' % (infile,outfile,self.info['TeiresiasOpt'])
					self.log.printLog('#CMD',command)
					os.system(command)
				## Read Results ##
				self.verbose(0,2,'Reading TEIRESIAS output from %s...' % outfile,1)
				self.list['Pattern'] = []
				patregex = r'^(\d+)\s+(\d+)\s+(\S+)\s+(\d.+\d)$'	# Two counts, the pattern, then the occurrence data
				with open(outfile,'r') as RESULTS:
					for line in RESULTS:
						patmatch = rje.matchExp(patregex,line)
						if patmatch: # New pattern
							self.addTeiresiasPattern(patmatch)
						elif len(line) > 3 and line[0] != '#':
							self.log.errorLog('Did not recognise line: %s' % line,False,False)
				patx = len(self.list['Pattern'])
				self.log.printLog('#PAT','%s TEIRESIAS patterns read from %s.' % (rje.integerString(patx),outfile))
				## Calculate Information Content ##
				aafreq = seqlist.aaFreq()
				self.verbose(0,3,'Calculating Information Content & Length stats...',0)
				occx = 0
				for pattern in self.list['Pattern']:
					pattern.stat['Info'] = self.calculateScore(pattern.info['Pattern'],aafreq)
					pattern._makeLength()
					occx += 1
					rje.progressPrint(self,occx,patx/100,patx/10)
				self.verbose(0,1,'...Done!',2)
				## Prepare Results ##
				delimit = rje.getDelimit(self.cmd_list)
				if self.info['Name'] == 'None':
					self.info['Name'] = '%s.teiresias.%s' % (rje.baseFile(seqlist.info['Name'],True),rje.delimitExt(delimit))
				mysql = self.opt['MySQL']
				PATFILE = OCCFILE = RESFILE = None
				try:
					if mysql:	# Two tables
						patfile = os.path.splitext(self.info['Name'])
						occfile = '%s.occ%s' % (patfile[0],patfile[1])
						patfile = '%s.patterns%s' % (patfile[0],patfile[1])
						if self.opt['Append']:
							PATFILE = open(patfile,'a')
							OCCFILE = open(occfile,'a')
						else:
							PATFILE = open(patfile,'w')
							rje.writeDelimit(PATFILE,['pattern','tot_occ','seq_occ','info','len','fix','wild'],delimit)
							# Fresh output: start a new occurrence table rather than appending a header to an old one
							OCCFILE = open(occfile,'w')
							rje.writeDelimit(OCCFILE,['seq_id','pos','pattern','pat_match'],delimit)
					elif self.opt['Append']:
						RESFILE = open(self.info['Name'],'a')
					else:
						# Single table goes to info['Name'] (patfile only exists in the two-table branch)
						RESFILE = open(self.info['Name'],'w')
						rje.writeDelimit(RESFILE,['Sequence Name','Position','Pattern','Match','Total Occurrences','Num Sequences','Information Content','Length','Fixed','Wildcard'],delimit)
					## Save Results ##
					occx = 0
					for pattern in self.list['Pattern']:
						patstats = ['%d' % pattern.stat[stat] for stat in ['OccCount','SeqCount','Info','Length','Fixed','Wildcards']]
						patstats[2] = '%.3f' % pattern.stat['Info']	# Info is the one non-integer statistic
						if mysql:	# Two tables
							rje.writeDelimit(PATFILE,[pattern.info['Pattern']] + patstats,delimit)
						for occ in rje.sortKeys(pattern.occ):
							seq = seqlist.seq[occ]
							for pos in pattern.occ[occ]:
								match = seq.info['Sequence'][pos:(pos+pattern.stat['Length'])]
								outlist = [seq.shortName(),'%d' % pos,pattern.info['Pattern'],match]
								if mysql:	# Two tables
									rje.writeDelimit(OCCFILE,outlist,delimit)
								else:
									rje.writeDelimit(RESFILE,outlist+patstats,delimit)
								occx += 1
				finally:
					# Close whatever was opened, even if writing failed part-way through
					for HANDLE in [PATFILE,OCCFILE,RESFILE]:
						if HANDLE is not None: HANDLE.close()
				if mysql:	# Two tables
					self.log.printLog('#OUT','%s patterns output to %s.' % (rje.integerString(patx),patfile))
					self.log.printLog('#OUT','%s pattern occurrences output to %s.' % (rje.integerString(occx),occfile))
				else:
					self.log.printLog('#OUT','%s occurrences of %s patterns output to %s.' % (rje.integerString(occx),rje.integerString(patx),self.info['Name']))

			### InfoContent ###
			elif self.info['Info'] != 'None':
				## Setup ##
				alphabet = rje_seq.alph_protx 
				if not os.path.exists(self.info['Info']):
					self.log.errorLog('Input file %s missing!' % self.info['Info'],False,False)
					return False
				mypresto = presto.Presto(self.log,self.cmd_list)
				mypresto.loadMotifs(file=self.info['Info'],clear=True)
				seqlist = rje_seq.SeqList(self.log,self.cmd_list+['autoload=T'])
				if seqlist.seqNum() > 0:
					aafreq = seqlist.aaFreq(alphabet=None,fromfile=None,loadfile=None,total=False)  ### Returns dictionary of AA (& gap etc.) frequencies
				else:
					# No sequences loaded: fall back on equal frequencies over the default alphabet
					aafreq = {}
					for aa in alphabet:
						aafreq[aa] = 1.0 / len(alphabet)
				maxinfo = 0 
				for aa in aafreq:
					maxinfo +=  (aafreq[aa] * math.log(aafreq[aa],2))
				## Output ##
				delimit = rje.getDelimit(self.cmd_list)
				ext = rje.delimitExt(delimit)
				outfile = '%s.info.%s' % (rje.baseFile(self.info['Info'],True,['.txt','.%s' % ext]),ext)
				if self.opt['Append']: outmode = 'a'
				else: outmode = 'w'
				with open(outfile,outmode) as OUTFILE:
					if not self.opt['Append']:
						rje.writeDelimit(OUTFILE,['motif','pattern','info'],delimit)
					## Calculate Information Scores ##
					for motif in mypresto.motif:
						self.verbose(2,4,motif.info['Sequence'],0)
						pattern = ''
						for el in motif.info['Sequence'].replace('X','.').split('-'):
							if el.find('.{') == 0:	# Ambiguous spacer length - compress
								pattern += '.'
							else:
								pattern += el
						self.verbose(2,2,'=> %s' % pattern,1)
						motif.stat['Info'] = self.calculateInformationContent(pattern,aafreq,maxinfo,self.stat['InfoGapPen'])
						self.verbose(0,3,'%s (%s) = %.2f' % (motif.info['Name'],pattern,motif.stat['Info']),1)
						## Output ##
						rje.writeDelimit(OUTFILE,[motif.info['Name'],pattern,'%.2f' % motif.stat['Info']],delimit)
		except:
			self.log.errorLog('Error in run().',printerror=True,quitchoice=False)
			raise	# Delete this if method error not terrible
Esempio n. 45
0
 def ANCHOR(self, retry=2):  ### Runs ANCHOR disorder prediction
     '''
     Runs ANCHOR disorder prediction.
     Writes self.info['Sequence'] to a temporary fasta file, runs the "anchor" program located via
     self.info['ANCHOR'] and parses the per-residue scores into self.list['ResidueDisorder']. Residues scoring above
     self.stat['IUCut'] are grouped into self.list['RegionDisorder'] and the rest into self.list['RegionFold'], both
     as (start, end) tuples of 1-based inclusive positions, before self.minRegion() is called.
     >> retry:int [2] = number of further attempts to make if the run fails (no retry if the program is not found)
     << True if prediction succeeded; False (with both region lists emptied) once all attempts have failed.
     '''
     name = 'None'  # Fallback label so the retry/error messages work even if setup fails before a name is made
     try:  ### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [0a] ~ Setup sequence and temp file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         sequence = self.info['Sequence'].upper()
         name = self.info['Name'][:4] + rje.randomString(8)  # Random suffix for the temp file name
         tmp = name + '.tmp'
         ## ~ [0b] ~ Setup ANCHOR ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         apath = self.info['ANCHOR']
         if os.path.basename(apath) == 'anchor':  # Accept either the program itself or its directory
             apath = os.path.dirname(apath)
         anchor = rje.makePath(apath) + 'anchor'
         if not os.path.exists(anchor):
             self.errorLog('Path "%s" not found!' % anchor,
                           printerror=False)
             retry = 0  # Pointless to retry if the program is missing
             raise IOError
         ### ~ [1] Run ANCHOR Disorder prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # Close the temp file before the external program reads it
         with open(tmp, 'w') as TMP:
             TMP.write('>%s\n%s\n' % (name, sequence))
         acmd = '%s %s -d %s' % (anchor, tmp, apath)
         try:
             APIPE = os.popen(acmd)
             try:
                 dlines = APIPE.readlines()
             finally:
                 APIPE.close()
         finally:
             # The temp file is removed whether or not the run worked
             try:
                 os.unlink(tmp)
             except:
                 self.errorLog('Cannot delete %s!' % tmp)
         ### ~ [2] Read in results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self.info['Name'] not in ['', 'None']: name = self.info['Name']
         scores = []
         self.list['ResidueDisorder'] = scores
         for d in dlines:
             if d[:1] == '#': continue
             dm = rje.matchExp(r'^(\d+)\s+(\S)\s+(\S+)', d)  # Position, residue, score
             if not dm: continue
             pos = int(dm[0])
             aa = dm[1]
             score = float(dm[2])
             i = len(scores)  # Index of the residue this line should describe
             if sequence[i] != aa:
                 self.log.errorLog(
                     '%s: Position %d is %s in sequence but %s in ANCHOR output!'
                     % (name, pos, sequence[i], aa),
                     printerror=False)
                 raise ValueError
             if pos != (i + 1):
                 self.log.errorLog(
                     '%s: Position %d reached in ANCHOR output but previous results missing!'
                     % (name, pos),
                     printerror=False)
                 raise ValueError
             scores.append(score)
         if len(scores) != len(sequence):
             self.log.errorLog(
                 '%s: Sequence = %d aa but ANCHOR results stop at %s!' %
                 (name, len(sequence), len(scores)),
                 printerror=False)
             raise ValueError
         ### ~ [3] ~ Make Regions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.list['RegionDisorder'] = []
         self.list['RegionFold'] = []
         cutoff = self.stat['IUCut']
         start = 0   # 1-based start of the open disordered region (0 = none open)
         fstart = 0  # 1-based start of the open folded region (0 = none open)
         dx = 0      # Total number of disordered residues
         for i, score in enumerate(scores):
             pos = i + 1
             if not start and score > cutoff:  ### Start new disorder ###
                 start = pos
             elif start and score <= cutoff:  ### End!
                 self.list['RegionDisorder'].append((start, pos - 1))
                 dx += pos - start
                 start = 0
             if not fstart and score <= cutoff:  ### Start new fold ###
                 fstart = pos
             elif fstart and score > cutoff:  ### End!
                 self.list['RegionFold'].append((fstart, pos - 1))
                 fstart = 0
         # Close any region still open at the end of the sequence
         if start:
             self.list['RegionDisorder'].append((start, len(sequence)))
             dx += len(sequence) + 1 - start
         if fstart: self.list['RegionFold'].append((fstart, len(sequence)))
         self.minRegion()
         if self.opt['PrintLog']:
             self.log.printLog(
                 '\r#DIS',
                 'ANCHOR Disorder prediction complete: %d disorder regions, %d disordered aa'
                 % (len(self.list['RegionDisorder']), dx))
         return True
     except:
         if retry:
             self.printLog('#RETRY', 'Trying %s again...' % name)
             return self.ANCHOR(retry - 1)
         self.log.errorLog(
             'Error in Disorder.ANCHOR(%s). Disorder prediction failed.' %
             name)
         self.list['RegionDisorder'] = []
         self.list['RegionFold'] = []
         return False
Esempio n. 46
0
 def multiHAQ(self,secondrun=False):     ### Executes main HAQESAC runs
     '''
     Executes main HAQESAC runs.
     Loops over self.seqs() and, for each sequence with a usable input file in info['HaqDir'], runs HAQESAC from that
     directory (unless an existing *.png or *.anc.fas marks a completed run and force is off). If opt['MultiHAQ'] is
     set, the first call finishes by calling itself once more with secondrun=True.
     >> secondrun:bool [False] = whether this is the second round of runs
     Errors are logged with self.errorLog(quitchoice=True) rather than raised.
     '''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         finalrun = secondrun == self.opt['MultiHAQ']    # Whether this is the manual HAQESAC phase
         qryacc = set(self.obj['SeqList'].accList())     # Full set of Query accession numbers (membership tests only)
         processed = []                                  # List of processed sequence accession numbers
         ### ~ [1] Peform HAQESAC runs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for seq in self.seqs():
             ## ~ [1a] Check AutoSkip ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             acc = seq.info['AccNum']
             if finalrun and acc in processed and (self.opt['AutoSkip'] or (self.i() >=0 and rje.yesNo('%s already covered by previous HAQESAC. Skip?' % seq.shortName()))):
                 self.printLog('#SKIP','%s already covered by previous HAQESAC: Skipped' % seq.shortName()); continue
             ## ~ [1b] Check Whether to run (re-runs and low sequence number) ~~~~~~~~~~~~~~~~~~ ##
             logfile = rje.makePath('%s%s.log' % (self.info['HaqDir'],acc),wholepath=True)
             infile = rje.makePath('%s%s.fas' % (self.info['HaqDir'],acc),wholepath=True)
             pkfile = rje.makePath('%s%s.pickle' % (self.info['HaqDir'],acc),wholepath=True)
             pkzfile = rje.makePath('%s%s.pickle.gz' % (self.info['HaqDir'],acc),wholepath=True)
             if not os.path.exists(infile): self.printLog('#SKIP','%s input file %s not found: Skipped' % (seq.shortName(),infile)); continue
             # First round only: a pickle (gzipped or not) newer than the input marks an existing run
             if not finalrun and not self.opt['Force'] and rje.isYounger(pkzfile,infile) == pkzfile:
                 self.printLog('#SKIP','%s run detected: Skipped' % seq.shortName()); continue
             if not finalrun and not self.opt['Force'] and rje.isYounger(pkfile,infile) == pkfile:
                 self.printLog('#SKIP','%s run detected: Skipped' % seq.shortName()); continue
             inseqx = rje_seq.SeqCount(self,infile)
             if inseqx < 2: self.printLog('#SKIP','Only one sequence found in %s: Skipped' % (infile)); continue
             ## ~ [1c] Pause if running in Chaser Mode and no Pickle ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             pickled = os.path.exists(pkfile) or os.path.exists('%s.gz' % pkfile); tm = 0
             while secondrun and self.opt['Chaser'] and not pickled:
                 self.progLog('#WAIT','No %s pickle. Sleeping for %d min.' % (acc,tm))
                 time.sleep(60*tm); tm += 1      # Wait one minute longer on each pass
                 pickled = os.path.exists(pkfile) or os.path.exists('%s.gz' % pkfile)
                 if not pickled:
                     try: rje.choice('Press <ENTER> to try again, or <CTRL+C> to Quit')
                     except:
                         # One placeholder, one argument: the old (acc,tm) tuple raised TypeError here
                         self.printLog('#PICKLE','No %s pickle.' % acc)
                         self.printLog('\r#MULTI','Exiting multiHAQ "Chaser" run.'); return
             ## ~ [1d] Run HAQESAC ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             runhaqesac = True
             pngfile = rje.makePath('%s%s.png' % (self.info['HaqDir'],acc),wholepath=True)
             if not self.force() and rje.exists(pngfile):
                 self.printLog('#SKIP','Found evidence of completed run: %s (force=F). Skipping.' % pngfile)
                 runhaqesac = False
             ancfile = rje.makePath('%s%s.anc.fas' % (self.info['HaqDir'],acc),wholepath=True)
             if not self.force() and rje.exists(ancfile):
                 self.printLog('#SKIP','Found evidence of completed run: %s (force=F). Skipping.' % ancfile)
                 runhaqesac = False
             if runhaqesac:
                 haqcmd = ['ini=haqesac.ini','seqin=%s.fas' % acc, 'query=%s' % acc, 'basefile=%s' % acc, 'newlog=F']
                 self.printLog('#HAQ','Running HAQESAC for %s - will have own log etc.' % seq.shortName(),log=False)
                 os.chdir(self.info['HaqDir'])
                 try:
                     info = haqesac.makeInfo()
                     haqcmd = rje.getCmdList(haqcmd,info=info)
                     out = rje.Out(cmd_list=haqcmd)    # Sets up Out object for controlling output to screen
                     out.printIntro(info)                                # Prints intro text using details from Info object
                     haqlog = rje.setLog(info,out,haqcmd)                 # Sets up Log object for controlling log file output
                     try: haqesac.HAQESAC(log=haqlog, cmd_list=haqcmd).run(setobjects=True)
                     except:
                         os.chdir(self.info['RunPath'])  # Return to the run directory before asking the user
                         if self.i() >= 0 and rje.yesNo('Problem with %s HAQESAC run. Abort?' % seq.shortName()): raise KeyboardInterrupt
                 finally:
                     # Always return to the run directory, including when HAQESAC setup itself fails
                     os.chdir(self.info['RunPath'])
                 if finalrun: self.printLog('#HAQ','HAQESAC final round run for %s' % seq.shortName())
                 else: self.printLog('#HAQ','HAQESAC first round run for %s' % seq.shortName())
             ## ~ [1e] Update ScreenQry ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             if not self.opt['ScreenQry'] or not finalrun: continue
             qacclist = []   # Other queries found in this sequence's HAQESAC input file
             for qacc in rje_seq.SeqList(self.log,['seqin=%s' % infile,'autoload=T','autofilter=F']).accList():
                 if qacc not in qryacc: continue
                 if qacc != acc: qacclist.append(qacc)
                 if qacc not in processed: processed.append(qacc)
             self.printLog('#QRY','%d other queries found in %s: [%s]' % (len(qacclist),infile,'; '.join(qacclist)))
             self.printLog('#QRY','%d of %d queries processed' % (len(processed),self.seqNum()))
         ### ~ [2] MultiHAQ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not finalrun:
             self.printLog('#MULTI','Executing second round of multiHAQ')
             self.multiHAQ(True)
     except: self.errorLog('Major problem with MultiHAQ.multiHAQ',quitchoice=True)
Esempio n. 47
0
 def blast2fas(self):  ### Executes BLAST2FAS and copies results files
     '''
     Executes BLAST2FAS and copies results files.
     BLAST2Fas is only run if opt['Force'] is set or, for some sequence not listed in the <basefile>.blast2fas_null.txt
     file, the HAQESAC input file in info['HaqDir'] is not reported by rje.isYounger() as the younger file compared
     with one of the Blast2Fas databases. After a run, each <AccNum>.blast.fas result is moved to
     <HaqDir><AccNum>.fas (deleting any matching pickle files), and accession numbers without a result file are
     appended to the null file.
     << True if BLAST2Fas was run, False if it was not needed. Errors are logged and re-raised.
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         need2blast = self.opt['Force']
         null_file = '%s.blast2fas_null.txt' % self.baseFile()
         nx = 0  # Number of sequences without BLAST hits
         null_list = set()  # Accession numbers recorded as having no BLAST2Fas hits
         if os.path.exists(null_file):
             with open(null_file, 'r') as NULL:
                 null_list = set(NULL.read().split('\n'))
         self.debug(null_file)
         for seq in self.seqs():
             if seq.info['AccNum'] in null_list:
                 nx += 1
                 continue
             hfile = rje.makePath('%s%s.fas' %
                                  (self.info['HaqDir'], seq.info['AccNum']),
                                  wholepath=True)
             for db in self.obj['SeqList'].list['Blast2Fas']:
                 younger = rje.isYounger(hfile, db)
                 self.debug(younger)
                 self.debug(younger == hfile)
                 # BLAST is needed unless the existing input file is the younger of the pair
                 need2blast = need2blast or not younger == hfile
         if not need2blast:
             self.printLog(
                 '#BLAST',
                 'All HAQESAC input files found (%s w/o BLAST hits) - no BLAST2Fas (force=F)'
                 % nx)
             return False
         ### ~ [2] Execute ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         rje.backup(self, null_file)
         nx = 0
         # MultiCut takes precedence over BlastCut for the blastb/blastv settings
         blastcut = self.getInt('MultiCut') or self.getInt('BlastCut')
         if blastcut:
             self.obj['SeqList'].cmd_list += [
                 'blastb=%d' % blastcut,
                 'blastv=%d' % blastcut
             ]
         if self.getInt('Forks'):
             self.obj['SeqList'].cmd_list += [
                 'blasta=%d' % self.getInt('Forks')
             ]
         rje_seq.Blast2Fas(self.obj['SeqList'], self.getStr('HAQBLASTDir'))
         for seq in self.seqs():
             sbfile = '%s%s.blast.fas' % (self.getStr('HAQBLASTDir'),
                                          seq.info['AccNum'])
             if os.path.exists(sbfile):
                 hfile = rje.makePath(
                     '%s%s.fas' % (self.info['HaqDir'], seq.info['AccNum']),
                     wholepath=True)
                 os.rename(sbfile, hfile)
                 # Pickles from an earlier run no longer match the new input file
                 for pickle in ['%s.pickle' % rje.baseFile(hfile),
                                '%s.pickle.gz' % rje.baseFile(hfile)]:
                     if os.path.exists(pickle):
                         os.unlink(pickle)
             else:
                 with open(null_file, 'a') as NULL:
                     NULL.write('%s\n' % seq.info['AccNum'])
                 nx += 1
         if nx:
             self.printLog(
                 '#BLAST',
                 '%s Accession Numbers without BLAST2Fas hits output to %s'
                 % (nx, null_file))
         self.printLog(
             '#BLAST', '%s HAQESAC input files made using BLAST2Fas' %
             (self.seqNum() - nx))
         return True
     except:
         self.errorLog('Major problem with MultiHAQ.blast2fas')
         raise
Esempio n. 48
0
    def makeHTML(self): ### Generates HTML pages for interactive navigation.
        '''
        Generates HTML pages for interactive navigation.

        Writes a front page, <basefile>.html, with up to three tabs: the target sequences ("Hits"), the GABLAM hit
        table ("GABLAM", only if <basefile>.gablam.tdt was loaded) and the candidate sequences ("Candidates", only if
        any were loaded). It then writes one <haqdir><acc>.html page per hit, embedding whichever of the
        png/tree.txt/fas/nwk/log files exist for that hit.

        Returns nothing. Any exception is reported through self.errorLog() and is not re-raised.
        '''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            basefile = self.basefile()
            scmd = self.cmd_list + ['seqin=%s' % self.getStr('Candidates'),'autoload=T','autofilter=F','seqmode=file']
            candseq = rje_seqlist.SeqList(self.log,scmd)
            # All files and directories are named after basefile:
            # *.fas = original target PROTEIN sequences (with original descriptions)
            scmd = self.cmd_list + ['seqin=%s' % self.getStr('SeqIn'),'autoload=T','autofilter=F','seqmode=file']
            seqlist = rje_seqlist.SeqList(self.log,scmd)
            # *.gablam.tdt = GABLAM results with match details. (Might have *.hmmer.tdt instead.)
            gablamfile = '%s.gablam.tdt' % basefile
            gdb = self.db().addTable(gablamfile,mainkeys=['Qry','Hit'],name='gablam',expect=False)
            # - Contains candidate proteins as Queries and Target proteins as hits
            # *.HAQESAC/ = directory containing individual HAQESAC runs, named after Hit accnum
            haqdir = rje.makePath('./%s.HAQESAC/' % basefile)

            ### ~ [2] Generate front page ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            hfile = '%s.html' % basefile
            hobj = self.obj['HTML']
            hobj.list['StyleSheets'] = ['http://www.slimsuite.unsw.edu.au/stylesheets/rje_tabber.css',
                                        'http://www.slimsuite.unsw.edu.au/stylesheets/slimhtml.css']
            html = hobj.htmlHead(basefile)
            # Front page should have:
            html += '<h1>%s</h1>\n\n' % basefile
            htabs = []      # (tab_id, tab_html_text[, tab_title])
            # Target protein list (with links to HAQ HTML)
            ctext = '%s\n' % '\t'.join(['Name','Descripton','Length'])
            seqdict = seqlist.makeSeqNameDic('short')
            if gdb: hitlist = gdb.indexKeys('Hit')
            else: hitlist = rje.sortKeys(seqdict)
            for name in hitlist:
                seq = seqdict[name]
                cseq = [name,seqlist.seqDesc(seq),'%s aa' % seqlist.seqLen(seq)]
                acc = seqlist.seqAcc(seq)
                # Only link hits that have a HAQESAC log: their HTML page is generated in section [3] below
                if os.path.exists('%s%s.log' % (haqdir,acc)):
                    cseq[0] = '<a href="%s%s.html">%s</a>' % (haqdir,acc,cseq[0])
                ctext += '%s\n' % '\t'.join(cseq)
            htabs.append(('Hits',rje_html.tableToHTML(ctext,'\t',tabid='parse'),'Target sequences hit by candidates.'))
            # GABLAM/HMM table (with above links)
            if gdb:
                ctext = '%s\n' % '\t'.join(gdb.fields())
                with open(gablamfile,'r') as gfile: glines = gfile.readlines()[1:]   # Skip header row
                for gline in glines:
                    # Strip the line ending so the last field does not carry it into the table text (which would
                    # otherwise put a blank line after every row), and skip empty lines rather than fail on them.
                    gline = gline.rstrip('\r\n')
                    if not gline: continue
                    gdata = gline.split('\t')
                    acc = gdata[0].split('__')[-1]
                    gdata[0] = '<a href="http://www.uniprot.org/uniprot/%s" target="_blank">%s</a>' % (acc,gdata[0])
                    acc = gdata[1].split('__')[-1]
                    gdata[1] = '<a href="%s%s.html">%s</a>' % (haqdir,acc,gdata[1])
                    ctext += '%s\n' % '\t'.join(gdata)
                htabs.append(('GABLAM',rje_html.tableToHTML(ctext,'\t',tabid='parse'),'GABLAM hit table.'))
            # Candidate list (with DB links)
            if candseq.seqNum():
                ctext = '%s\n' % '\t'.join(['AccNum','ID','Descripton','Length'])
                accdict = candseq.makeSeqNameDic('accnum')
                for acc in rje.sortKeys(accdict):
                    seq = accdict[acc]
                    cseq = [acc,candseq.seqID(seq),candseq.seqDesc(seq),'%s aa' % candseq.seqLen(seq)]
                    cseq[0] = '<a href="http://www.uniprot.org/uniprot/%s" target="_blank">%s</a>' % (acc,acc)
                    ctext += '%s\n' % '\t'.join(cseq)
                htabs.append(('Candidates',rje_html.tableToHTML(ctext,'\t',tabid='parse'),'Candidate sequences to search.'))
            html += hobj.tabberHTML('GABLAM',htabs)
            html += hobj.htmlTail()
            with open(hfile,'w') as hout: hout.write(html)

            ### ~ [3] Generate sequence-specific pages ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            #?# Move this to HAQESAC or MultiHAQ
            # Matches ": <id>_<species>__<accession> " labels in the text tree that have not yet been hyperlinked
            treeregex = r'(: (\S+)_(\S+)__(\S+) )'
            for i in range(len(hitlist)):
                hit = hitlist[i].split('__')[-1]
                logfile = '%s%s.log' % (haqdir,hit)
                seqbase = logfile[:-4]
                hfile = '%s.html' % seqbase
                html = hobj.htmlHead(seqbase)
                # Front page should have:
                html += '<h1>%s</h1>\n\n' % seqbase
                html += '<p>Click <a href="../%s.html">here</a> to return to results summary. \n' % basefile
                if i: html += 'Previous: <a href="./%s.html"><code>%s</code></a>. \n' % (hitlist[i-1].split('__')[-1],hitlist[i-1])
                if i < len(hitlist)-1: html += 'Next: <a href="./%s.html"><code>%s</code></a>. \n' % (hitlist[i+1].split('__')[-1],hitlist[i+1])
                html += '</p>\n'
                htabs = []      # (tab_id, tab_html_text[, tab_title])
                for ftype in ['png','tree.txt','fas','nwk','log']:
                    seqfile = '%s.%s' % (seqbase,ftype)
                    if not os.path.exists(seqfile): continue
                    tabtext = '<p><a href="./%s">./%s</a></p>\n' % (os.path.basename(seqfile),os.path.basename(seqfile))
                    if ftype == 'png':
                        tabtext += '<a href="./%s"><img src="%s" width="100%%"></a>\n' % (os.path.basename(seqfile),os.path.basename(seqfile))
                        tabdesc = 'PNG of %s tree.' % seqbase
                    else:
                        with open(seqfile,'r') as sfile: tabtext += '<pre>%s</pre>\n' % sfile.read()
                        if ftype == 'tree.txt':
                            # First link the other hits to their own pages...
                            for xref in hitlist:
                                reptext = '<a href="./%s.html">%s</a>' % (xref.split('__')[-1],xref)
                                tabtext = tabtext.replace(': %s ' % xref,': %s ' % reptext)
                            # ...then link every remaining sequence label to UniProt (species and accession).
                            treematch = rje.matchExp(treeregex,tabtext)
                            while treematch:
                                (oldtext,sid,spec,spacc) = treematch
                                newtext = ': %s_<a href="http://www.uniprot.org/taxonomy/?query=%s&sort=score" target="_blank">%s</a>__<a href="http://www.uniprot.org/uniprot/%s" target="_blank">%s</a> ' % (sid,spec,spec,spacc,spacc)
                                tabtext = tabtext.replace(oldtext,newtext)
                                treematch = rje.matchExp(treeregex,tabtext)
                        tabdesc = '%s output' % seqfile
                    htabs.append((ftype,tabtext,tabdesc))
                if htabs: html += hobj.tabberHTML(os.path.basename(seqbase),htabs)
                else: html += '<p><i>No output found for <code>%s</code>!</i></p>\n' % hit
                html += hobj.htmlTail()
                with open(hfile,'w') as hout: hout.write(html)
        except: self.errorLog('Problem with %s.makeHTML()' % self.prog())
Esempio n. 49
0
 def ANCHOR(self,retry=2):     ### Runs ANCHOR disorder prediction
     '''
     Runs ANCHOR disorder prediction.

     Writes self.info['Sequence'] to a temporary file, runs the external `anchor` executable located via
     self.info['ANCHOR'] on it and parses the per-residue scores into self.list['ResidueDisorder']. Residues scoring
     above self.stat['IUCut'] are grouped into self.list['RegionDisorder'] and the rest into self.list['RegionFold'],
     both as (start,end) tuples with 1-based inclusive positions.

     retry = number of further attempts to make if a run fails (not used when the executable is missing).
     Returns True on success. Returns False, with both region lists emptied, once all attempts have failed.
     '''
     name = 'unknown'    # Fallback so the except block can always report a name, even if setup itself fails
     try:### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [0a] ~ Setup sequence and temp file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         sequence = self.info['Sequence'].upper()
         name = self.info['Name'][:4] + rje.randomString(8)     # Random suffix gives each attempt its own temp file
         tmp = name + '.tmp'
         ## ~ [0b] ~ Setup ANCHOR ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         apath = self.info['ANCHOR']
         if os.path.basename(apath) == 'anchor': apath = os.path.dirname(apath)
         anchor = rje.makePath(apath) + 'anchor'
         if not os.path.exists(anchor):
             self.errorLog('Path "%s" not found!' % anchor,printerror=False)
             retry = 0; raise IOError    # No point retrying if the executable is missing
         ### ~ [1] Run ANCHOR Disorder prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # Close the temp file explicitly so its contents are on disk before the external program reads it.
         with open(tmp,'w') as tmpfile: tmpfile.write('>%s\n%s\n' % (name,sequence))
         # NOTE(review): os.popen runs this string through a shell, so a path or name containing spaces or shell
         # metacharacters would break the command - confirm inputs are always shell-safe.
         acmd = '%s %s -d %s' % (anchor,tmp,apath)
         try: dlines = os.popen(acmd).readlines()
         finally:    # Remove the temp file whether or not the run succeeded
             try: os.unlink(tmp)
             except: self.errorLog('Cannot delete %s!' % tmp)
         ### ~ [2] Read in results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self.info['Name'] not in ['','None']: name = self.info['Name']
         self.list['ResidueDisorder'] = []
         for d in dlines:
             if d[:1] == '#': continue
             dm = rje.matchExp(r'^(\d+)\s+(\S)\s+(\S+)',d)    # (position, residue, score)
             if not dm: continue
             pos = int(dm[0])
             aa = dm[1]
             score = float(dm[2])
             i = len(self.list['ResidueDisorder'])   # 0-based index of the residue this line should describe
             if i >= len(sequence):
                 self.log.errorLog('%s: Sequence = %d aa but ANCHOR output continues to position %d!' % (name,len(sequence),pos),printerror=False)
                 raise ValueError
             if sequence[i] != aa:
                 self.log.errorLog('%s: Position %d is %s in sequence but %s in ANCHOR output!' % (name,pos,sequence[i],aa),printerror=False)
                 raise ValueError
             if pos != (i + 1):
                 self.log.errorLog('%s: Position %d reached in ANCHOR output but previous results missing!' % (name,pos),printerror=False)
                 raise ValueError
             self.list['ResidueDisorder'].append(score)
         if len(self.list['ResidueDisorder']) != len(sequence):
             self.log.errorLog('%s: Sequence = %d aa but ANCHOR results stop at %s!' % (name,len(sequence),len(self.list['ResidueDisorder'])),printerror=False)
             raise ValueError
         ### ~ [3] ~ Make Regions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.list['RegionDisorder'] = []
         self.list['RegionFold'] = []
         start = 0       # 1-based start of the open disordered region (0 = none open)
         fstart = 0      # 1-based start of the open folded region (0 = none open)
         i = 0           # 1-based position of the residue being processed
         dx = 0          # Running total of disordered residues
         for score in self.list['ResidueDisorder']:
             i += 1
             if not start and score > self.stat['IUCut']:    ### Start new disorder ###
                 start = i
             elif start and score <= self.stat['IUCut']:     ### End!
                 self.list['RegionDisorder'].append((start,i-1))
                 dx += i - start
                 start = 0
             if not fstart and score <= self.stat['IUCut']:    ### Start new fold ###
                 fstart = i
             elif fstart and score > self.stat['IUCut']:     ### End!
                 self.list['RegionFold'].append((fstart,i-1))
                 fstart = 0
         # Close whichever region is still open at the end of the sequence
         if start:
             self.list['RegionDisorder'].append((start,len(sequence)))
             dx += len(sequence) + 1 - start
         if fstart: self.list['RegionFold'].append((fstart,len(sequence)))
         self.minRegion()
         if self.opt['PrintLog']: self.log.printLog('\r#DIS','ANCHOR Disorder prediction complete: %d disorder regions, %d disordered aa' % (len(self.list['RegionDisorder']),dx))
         return True
     except:
         if retry:
             self.printLog('#RETRY','Trying %s again...' % name)
             return self.ANCHOR(retry-1)
         self.log.errorLog('Error in Disorder.ANCHOR(%s). Disorder prediction failed.' % name)
         self.list['RegionDisorder'] = []
         self.list['RegionFold'] = []
         return False