Example #1
0
 def setup(self):    ### Main class setup method.
     '''
     Main class setup method.

     Creates the database object with its 'ProDigIS' and 'Source' tables, loads the sequences from every file in
     self.list['SeqFiles'] (recording the file and seq.MWt() per accession number in 'Source') and, when
     self._peptideProbabilities() returns a true value, adds the expectation fields to 'ProDigIS'.

     Returns None via the bare return at the end of section [3]; returns False if an exception is caught.
     '''
     try:### ~ [1] Setup Database ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.obj['DB'] = rje_db.Database(self.log,self.cmd_list)
         # Results table: fields AccNum/Protease/PepCount, keyed on AccNum+Protease.
         db = self.db().addEmptyTable('ProDigIS',['AccNum','Protease','PepCount'],['AccNum','Protease'])
         if self.getInt('MinPepLen') > 0: db.addField('MinPepLen')
         if self.getBool('NRPep'): db.addField('NRPep')
         # 'Source' table: loaded from the Source file when it exists (File and ProtMWt fields added), else empty.
         if rje.exists(self.getStr('Source')):
             fdb = self.db().addTable(self.getStr('Source'),mainkeys=['AccNum'],name='Source')
             fdb.addField('File')
             fdb.addField('ProtMWt')
         else: fdb = self.db().addEmptyTable('Source',['AccNum','File','ProtMWt'],['AccNum'])
         # One field per integer 1..MaxPepLen (the field names are ints, not strings).
         for i in range(1,self.getInt('MaxPepLen')+1): db.addField(i)
         if self.getBool('PepMWt'):
             # Float-named fields 100.0, 200.0, ... - presumably molecular weight bins; TODO confirm.
             for i in range(1,self.getInt('MaxPepLen')+1): db.addField(i*100.0)
         ### ~ [2] Load Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.obj['SeqList'] = rje_seq.SeqList(self.log,self.cmd_list+['seqin=None','autoload=F'])
         # fullseq and SeqList.seq are the same list object, so the in-place += below extends both.
         self.obj['SeqList'].seq = fullseq = []
         for seqfile in self.list['SeqFiles']:
             file = rje.baseFile(seqfile,True)   # NOTE: local name shadows the Python 2 builtin `file`
             seqlist = rje_seq.SeqList(self.log,['autofilter=T','gnspacc=T','seqnr=F']+self.cmd_list+['seqin=%s' % seqfile,'autoload=T'])
             fullseq += seqlist.seqs()
             for seq in seqlist.seqs():
                 accnum = seq.getStr('AccNum')
                 try:
                     # Update an existing 'Source' entry; log (without traceback) if it already names a file.
                     entry = fdb.data()[accnum]
                     if 'File' in entry and entry['File']: self.errorLog('%s found in %s AND %s!' % (accnum,entry['File'],file),printerror=False)
                     entry['File'] = file
                     entry['ProtMWt'] = seq.MWt()
                 except:
                     # Bare except: ANY failure above (e.g. accnum missing from the table) adds a fresh entry instead.
                     entry = {'AccNum':accnum,'File':file,'ProtMWt':seq.MWt()}
                     fdb.addEntry(entry)
                 self.deBug(fdb.dict['Data'][seq.getStr('AccNum')])
         self.printLog('#SEQ','%s sequences to analyse in total' % rje.iLen(fullseq))
         fdb.fillBlanks()
         ### ~ [3] Setup Peptide Probabilities ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self._peptideProbabilities():
             # The second addField() argument names an existing field; presumably the new field is inserted
             # after it, which would make the call order below significant - TODO confirm against rje_db.
             db.addField('LenExp','PepCount');
             if self.getBool('PepMWt'): db.addField('MWtExp','LenExp'); db.addField('Len7Exp','MWtExp')
             else: db.addField('Len7Exp','LenExp')
             db.addField('Len37','Len7Exp')
             if self.getBool('PepMWt'):
                 db.addField('Len5','MWtExp'); db.addField('MWt5','Len5')
                 db.addField('Len3','MWtExp'); db.addField('MWt3','Len3')
             else: db.addField('Len5','LenExp'); db.addField('Len3','LenExp')
         # NOTE(review): this bare return yields None (not True) and makes section [4] below unreachable.
         return
         ### ~ [4] Temp GABLAM Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # Dead code: hard-coded GABLAM hit summary files are merged, joined to 'Source' and saved.
         gdb = self.db().addTable('Chlam_Pos.vs.embl_bacteria.hitsum.tdt',['Qry'],name='GABLAM')
         ndb = self.db().addTable('Chlam_Neg.vs.embl_bacteria.hitsum.tdt',['Qry'],name='GNeg')
         self.db().mergeTables(gdb,ndb,overwrite=True,matchfields=True)
         gdb.renameField('Qry','AccNum')
         tmp = self.db().joinTables(name='blast',join=[('Source','AccNum'),('GABLAM','AccNum')],newkey=['AccNum','File'],keeptable=False)
         tmp.saveToFile()
         tmp.compress(['File'],default='mean')
         tmp.dropFields(['AccNum'])
         tmp.info['Name'] = 'blastsum'
         tmp.saveToFile()
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
Example #2
0
 def _setAttributes(self):  ### Sets Attributes of Object
     '''
     Sets Attributes of Object.

     Declares the attribute name lists, applies the defaults, then builds the sequence list (with
     'seqin=Proteins.fsa' placed ahead of self.cmd_list), its sequence name dictionary and the database object.
     '''
     ### Attribute names ###
     self.infolist = ['Pillars', 'PPIFile', 'XRef']
     self.optlist = ['SGD2SP', 'Gopher']
     self.statlist = []
     self.listlist = ['Pillars', 'YeastSeq']
     self.dictlist = ['PPI', 'Rename']
     self.objlist = ['SeqList']
     ### Default values ###
     self._setDefaults(info='None', opt=False, stat=0.0, obj=None, setlist=True, setdict=True)
     filedefaults = {}
     filedefaults['Pillars'] = 'Pillars.tab'
     filedefaults['PPIFile'] = 'Y2H_union.txt'
     filedefaults['XRef'] = 'yeast_xref.20101222.tdt'
     self.setInfo(filedefaults)
     ### Objects built at construction time ###
     seqcmd = ['accnr=F', 'seqnr=F', 'autoload=T', 'seqin=Proteins.fsa'] + self.cmd_list
     seqlist = rje_seq.SeqList(self.log, seqcmd)
     self.obj['SeqList'] = seqlist
     # Progress logging for the name dictionary is only switched on when Verbose > 0
     showprogress = self.stat['Verbose'] > 0
     self.dict['SeqDict'] = seqlist.seqNameDic(proglog=showprogress)
     self.obj['DB'] = rje_db.Database(self.log, self.cmd_list)
Example #3
0
 def seqBySeq(self):  ### Runs in SeqBySeq Mode                                                               #V1.0
     '''
     In SeqBySeq mode, the program assumes that seqin=FILE and basefile=X are given and farm states the program to be run.
     Seqin will then be worked through in turn and each sequence farmed out to the farm program. Outputs given by OutList
     are then compiled, as is the Log, into the correct basefile=X given. In the case of *.csv and *.tdt files, the header
     row is copied for the first file and then excluded for all subsequent files. For all other files extensions, the
     whole output is copied.

     This method itself strips a trailing '.py' from the Farm setting, loads the query sequences, discards sequences
     that precede StartFrom (if set), populates the pickup list and calls self.runJobs().
     << Returns True on success and False if an error was logged. SystemExit is re-raised untouched.
     '''
     try:  ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self.getStr('Farm')[-3:] == '.py':
             self.str['Farm'] = self.str['Farm'][:-3]
         seqcmd = self.cmd_list + ['autoload=T', 'accnr=F', 'seqnr=F']
         self.list['Seq'] = rje_seq.SeqList(self.log, seqcmd).seq[0:]
         ## ~ [1a] ~ Skip to StartFrom ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         # Single scan for the first sequence whose short name equals StartFrom, then one slice. (The previous
         # loop re-sliced the list for every skipped sequence.) As before: StartFrom is blanked only when a match
         # is found, and the list ends up empty when nothing matches.
         if self.getStrLC('StartFrom'):
             startfrom = self.getStr('StartFrom')
             queries = self.list['Seq']
             skipx = 0
             while skipx < len(queries) and queries[skipx].shortName() != startfrom:
                 skipx += 1
             if skipx < len(queries):
                 self.str['StartFrom'] = ''
             self.list['Seq'] = queries[skipx:]
         self.printLog(
             '#SEQ', '%s query sequences to farm out' %
             rje.integerString(len(self.list['Seq'])))
         self.list['Pickup'] = self.pickupList()
         ### ~ [2] ~ Run ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.runJobs()
         return True
     except SystemExit:
         raise  # Child
     except:
         self.errorLog('JobFarmer.seqBySeq error')
     return False
Example #4
0
 def _setAttributes(self):  ### Sets Attributes of Object
     '''
     Sets Attributes of Object.

     Declares the attribute name lists, applies the defaults, switches CleanUp on, fills the UniFake and UniReal
     lists and builds the sequence list object.
     '''
     ### Attribute names ###
     self.infolist = ['Aliases', 'DatOut', 'PFam', 'SPCode', 'SignalP', 'TMHMM']
     self.optlist = ['CleanUp', 'EnsDat', 'MakeIndex', 'FudgeFT']
     self.statlist = ['DisDom']
     self.listlist = ['Features', 'UniFake']
     self.dictlist = ['Aliases', 'Features', 'UniReal']
     self.objlist = ['SeqList']
     ### Default values ###
     self._setDefaults(info='None', opt=False, stat=0.0, obj=None, setlist=True, setdict=True)
     ### Non-default settings ###
     cleanup = {'CleanUp': True}
     self.setOpt(cleanup)
     self.list['UniFake'] = ['tmhmm', 'signalp', 'disorder', 'pfam', 'uniprot']
     # NOTE(review): 'UniReal' is declared in dictlist above but is populated here as a list - confirm intended.
     self.list['UniReal'] = ['AC', 'GN', 'RC', 'RX', 'CC', 'DR', 'PE', 'KW']
     seqcmd = self.cmd_list + ['autoload=T', 'gnspacc=T', 'datout=F']
     self.obj['SeqList'] = rje_seq.SeqList(self.log, seqcmd)
Example #5
0
 def run(self,batch=False):  ### Main run method
     '''
     Main run method.
     >> batch:bool [False] = Whether this is a recursive call for a single dataset/query. When True, results setup
         and the batch file search are skipped.

     With no sequences loaded (and batch=False), each file matching self.list['Batch'] is loaded and run in turn.
     With Special='allbyall', each pair of sequences (i,j>i) is run with sequence i as the query. Otherwise a
     single analysis is performed and one results row is written to self.info['ResFile'].
     Any exception is logged through self.errorLog() and then re-raised.
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [1a] ~ Results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if not batch: self.setupResults()
         ## ~ [1b] ~ Batch run ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if not batch and not self.obj['SeqList'].seqs():    ### Look for batch files and run for each
             batchfiles = rje.getFileList(self,filelist=self.list['Batch'],subfolders=False,summary=True,filecount=0)
             self.printLog('\r#FILES','Getting files: %5s files for batch run' % rje.integerString(len(batchfiles)))
             if not batchfiles: self.errorLog('No input files found!',printerror=False)
             else:
                 bx = 0
                 for infile in batchfiles:
                     bx += 1
                     self.printLog('#BATCH','Batch running %s' % infile)
                     bcmd = ['query=1']+self.cmd_list+['autoload=T','seqin=%s' % infile]
                     self.obj['SeqList'] = rje_seq.SeqList(self.log,bcmd)
                     self.run(batch=True)
                     self.opt['Append'] = True   # Later runs must add to the results rather than replace them
                     self.printLog('#BATCH','|---------- %s run <<<|>>> %s to go -----------|' % (rje.integerString(bx),rje.integerString(len(batchfiles)-bx)),log=False)
             if self.opt['Win32'] and len(sys.argv) < 2: self.verbose(0,0,'Finished!',1) # Optional pause for win32
             return
         ## ~ [1c] ~ Special run options ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if self.info['Special'].lower() == 'allbyall':
             self.printLog('#RUN','Performing special "all-by-all" pairwise run')
             self.info['Special'] = ''   # Blanked so that the recursive run() calls below take the normal path
             for i in range(len(self.seqs())-1):
                 self.obj['SeqList'].obj['QuerySeq'] = self.seqs()[i]
                 for j in range(i+1,len(self.seqs())):
                     # Fitness and Phenotype are both set to the 1-based position of sequence j
                     self.info['Fitness'] = self.info['Phenotype'] = '%d' % (j + 1)
                     self.run(batch=True)
                     self.opt['Append'] = True
             self.info['Special'] = 'allbyall'; return
         ## ~ [1d] ~ General setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         self.setup()
         ### ~ [2] ~ Price calculations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.fitness()
         self.phenotype()
         self.grouping()
         for vector in ['Fitness','Phenotype','SeqGroup']:
             veclen = len(self.list[vector])
             qrylen = self.qry().seqLen()
             if veclen != qrylen:
                 # The message has four placeholders: vector name, vector length, query name, query length.
                 # (The query name was previously missing, so formatting raised TypeError instead of logging.)
                 self.errorLog('%s vector length (%s) does not match %s sequence length (%s)' % (vector,veclen,self.qry().shortName(),qrylen),printerror=False)
                 raise ValueError
         results = self.price()
         ### ~ [3] ~ Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         results['Dataset'] = rje.baseFile(self.obj['SeqList'].info['Name'],True)
         results['Query'] = self.qry().shortName()
         results['Fitness'] = self.info['Fmethod']
         results['Phenotype'] = self.info['Pmethod']
         results['SeqGroup'] = self.info['SeqGroup']
         rje.delimitedFileOutput(self,self.info['ResFile'],self.list['Headers'],datadict=results)
         self.printLog('#OUT','Results output to %s' % self.info['ResFile'])
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise   # Delete this if method error not terrible
Example #6
0
    def setup(self):    ### Main class setup method.
        '''
        Main class setup method.

        Loads the sequence list, sets up the Markov object on the Xmer background file (defaulting to the input
        sequence file), derives Basefile if not given, selects the SCAP background sequence list and then loads
        (unpickles) or builds the Markov chains.
        << Returns True if setup completed, or False if an error was logged.
        '''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [1a] ~ Sequence file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            seqlist = self.obj['SeqList'] = rje_seq.SeqList(self.log,['accnr=F','seqnr=F']+self.cmd_list)   #!# Add code for memsaver/autoload=F #!#
            self.printLog('#SCAP','%s sequences loaded for SCAP analysis' % rje.integerString(seqlist.seqNum()))
            ## ~ [1b] ~ Xmer background file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            mseqfile = self.info['XmerBack']
            if mseqfile.lower() in ['','none']: mseqfile = self.info['XmerBack'] = seqlist.info['Name']
            markov = self.obj['Markov'] = rje_markov.Markov(self.log,['autoload=T','accnr=F','seqnr=F']+self.cmd_list+['seqin=%s' % mseqfile,'direction=both','markov=F','scap=T'])
            markov.setup()
            maxx = markov.stat['MaxXmer']   # Requested MaxXmer, kept for comparison with any unpickled object
            if self.info['Basefile'].lower() in ['','none']:
                self.info['Basefile'] = '%s.scap' % rje.baseFile(seqlist.info['Name'],True)
                if markov.opt['Sorted']: self.info['Basefile'] = '%s.sorted' % self.info['Basefile']
            basefile = self.info['Basefile']
            self.printLog('#MARKOV','Markov setup complete')
            ## ~ [1c] ~ SCAP Background file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Reuse an already-loaded sequence list where the background file matches one; else load it.
            scapfile = self.info['ScapBack']
            if scapfile.lower() in ['','none',seqlist.info['Name'].lower()]: self.obj['ScapBack'] = self.obj['SeqList']
            elif scapfile == mseqfile: self.obj['ScapBack'] = markov.obj['SeqList']
            else: self.obj['ScapBack'] = rje_seq.SeqList(self.log,['accnr=F','seqnr=F']+self.cmd_list+['seqin=%s' % scapfile])
            # Report the size of the background list itself (this previously reported the input sequence count)
            self.printLog('#SCAP','%s sequences for SCAP Background' % rje.integerString(self.obj['ScapBack'].seqNum()))

            ### ~ [2] Markov Chains ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if mseqfile == seqlist.info['Name']: markov.obj['SeqList'] = seqlist
            elif mseqfile == self.obj['ScapBack'].info['Name']: markov.obj['SeqList'] = self.obj['ScapBack']
            mpickle = markov.unpickleMe()
            if mpickle: markov = self.obj['Markov'] = mpickle
            # (Re)build the chains if either tree is missing or the loaded object has a smaller MaxXmer than requested
            if not markov.suftree() or not markov.pretree() or maxx > markov.stat['MaxXmer']:
                markov.run()
                markov.pickleMe()
            markov.opt['DeBug'] = self.opt['DeBug']
            self.deBug(markov.opt)
            self.deBug(markov.stat)
            #self.deBug(markov.suftree())
            #self.deBug(markov.pretree())
            return True     # Setup successful
        except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
Example #7
0
 def setup(self):    ### Main class setup method. Makes sumfile if necessary.
     '''
     Main class setup method. Makes sumfile if necessary.

     Sets Basefile/SumFile defaults and, unless an existing summary file is accepted by the user, reads each
     MASCOT file in self.list['ResFiles'], writes its protein accession list to a *.protacc file, and generates
     the summary file with one row per search/protein/peptide.
     << Returns False if an error was logged; the result of self.printLog() if an existing summary file is reused;
        otherwise None.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.debug(self.getStrLC('SumFile')); self.debug(self.getStr('SumFile'))
         if self.getStrLC('Basefile') in ['','none']: self.baseFile(rje.baseFile(self.info['SumFile']))
         if self.getStrLC('SumFile') in ['','none']: self.info['SumFile'] = '%s.tdt' % self.basefile()
         self.printLog('#SUM','Summary file: %s' % self.getStr('SumFile'))
         if os.path.exists(self.info['SumFile']) and not self.opt['Force']:
             if rje.yesNo('%s found. Use these results?' % self.info['SumFile']):
                 return self.printLog('#SUM','Summary results file found. No MASCOT processing.')
         mapgi = False   # Set if any accession is a gi number; only used by the disabled section [2a]
         ### ~ [2] Process MASCOT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for mfile in self.list['ResFiles']:
             bud = budapest.Budapest(self.log,self.cmd_list+['mascot=%s' % mfile])
             bud.info['Name'] = mfile
             bud.readMascot()
             self.dict['Searches'][mfile] = bud.dict['Hits']
             protacclist = rje.sortKeys(bud.dict['Hits'])
             for protacc in protacclist:
                 if rje.matchExp(r'gi\|(\d+)',protacc): mapgi = True
             accfile = '%s.%s.protacc' % (self.baseFile(),rje.baseFile(mfile))
             self.debug(accfile)
             # Close the handle explicitly rather than relying on garbage collection to flush it
             ACC = open(accfile,'w')
             try: ACC.write(string.join(protacclist,'\n'))
             finally: ACC.close()
             self.printLog('#MFILE','%s: %s proteins.' % (mfile,rje.iLen(protacclist)))
         ## ~ [2a] gi Mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         #if mapgi:
         #    mapgi = self.dict['MapGI'] = seqlist.seqNameDic('NCBI')
         #    open('mapgi.tmp','w').write(string.join(rje.sortKeys(mapgi),'\n'))
         ### ~ [3] Setup seqlist ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqlist = rje_seq.SeqList(self.log,['gnspacc=T']+self.cmd_list)
         self.dict['Acc2Seq'] = seqlist.seqNameDic('Max')
         ### ~ [4] Generate Summary File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         sumhead = string.split('search,prot_hit_num,prot_acc,prot_desc,pep_seq',',')
         rje.delimitedFileOutput(self,self.info['SumFile'],sumhead,rje_backup=True)
         for mfile in rje.sortKeys(self.dict['Searches']):
             hits = self.dict['Searches'][mfile]
             for protacc in rje.sortKeys(hits):
                 protname = hits[protacc]['prot_acc']
                 protdesc = hits[protacc]['prot_desc']
                 gimatch = rje.matchExp(r'gi\|(\d+)',protacc)
                 if gimatch:
                     # gi accessions are renamed from the loaded sequences where possible
                     gi = gimatch[0]
                     try:
                         protname = self.dict['Acc2Seq'][gi].shortName()
                         protdesc = self.dict['Acc2Seq'][gi].info['Description']
                     except: protname = 'gi_UNK__%s' % gi   # Best effort: unmapped gi keeps the MASCOT description
                 for pep in hits[protacc]['Peptides']:
                     data = {'search':rje.baseFile(mfile,True),'prot_desc':protdesc,'prot_acc':protname,
                             'pep_seq':pep,'prot_hit_num':hits[protacc]['prot_hit_num']}
                     rje.delimitedFileOutput(self,self.info['SumFile'],sumhead,datadict=data)
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
Example #8
0
 def makePPI(self):  ### Generates files for Human-HIV PPI analysis
     '''
     Generates files for Human-HIV PPI analysis.

     For each HIV accession indexed in the 'HHPIDMap' table, makes a directory named after the HIV gene and, for
     every table entry with that accession, writes a fasta file holding the HIV sequence followed by the contents
     of the matching human PPI file (PPIDir + Symbol + '.ppi.fas'). Missing PPI files are logged and skipped.
     << Returns False if no HIV sequences were loaded or an error was logged; otherwise None.
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqlist = rje_seq.SeqList(
             self.log, self.cmd_list +
             ['seqin=%s' % self.getStr('HIVSeq'), 'autoload=T'])
         if not seqlist.seqs(): return False
         seqmap = seqlist.seqNameDic('Max')
         mdb = self.db('HHPIDMap')
         ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for hivacc in mdb.index('AccHIV'):
             # map HIV accession numbers on to sequences seqNameDic (anything after the first '.' is dropped)
             accnum = string.split(hivacc, '.')[0]
             hivseq = seqmap[accnum]
             # extract short HIV name from sequence ID
             hivgene = string.split(hivseq.shortName(), '_')[0].upper()
             # create directory named after HIV gene
             rje.mkDir(self, '%s/' % hivgene, log=True)
             # copy human PPI files into directories, adding HIV gene
             etot = len(mdb.index('AccHIV')[hivacc])
             for entry in mdb.indexEntries('AccHIV', hivacc):
                 self.progLog(
                     '\r#PPI',
                     'Generating human-HIV PPI fasta files for %s %s PPI' %
                     (rje.iStr(etot), hivgene))
                 pfile = self.getStr(
                     'PPIDir') + entry['Symbol'] + '.ppi.fas'
                 if rje.exists(pfile):
                     # Both handles are closed even if a read or write fails part-way through
                     FAS = open(
                         '%s/%s.%s.ppi.fas' %
                         (hivgene, hivgene.lower(), entry['Symbol']), 'w')
                     try:
                         FAS.write('>%s\n%s\n' %
                                   (hivseq.info['Name'], hivseq.getSequence()))
                         PPI = open(pfile, 'r')
                         try:
                             FAS.write(PPI.read())
                         finally:
                             PPI.close()
                     finally:
                         FAS.close()
                 else:
                     self.errorLog(
                         'Cannot find human PPI file for %s interactor "%s"'
                         % (entry['HIV'], entry['Symbol']),
                         printerror=False)
             # NOTE: `entry` is the last entry of the loop above; assumes every indexed accession has >=1 entry
             self.printLog(
                 '\r#PPI',
                 'Generated human-HIV PPI fasta files for %s %s (%s) PPI.' %
                 (rje.iStr(etot), entry['HIV'], hivgene))
     except:
         self.errorLog('%s.makePPI error' % self)
         return False
Example #9
0
 def _setAttributes(self):   ### Sets Attributes of Object
     '''
     Sets Attributes of Object.

     Declares the attribute name lists, applies the defaults (including the batch file patterns) and builds the
     sequence list object with 'query=1' ahead of, and 'autoload=T' after, self.cmd_list.
     '''
     ### ~ Attribute names ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     self.infolist = ['Fitness','Phenotype','ResFile','SeqGroup','Special']
     self.optlist = ['QryGaps','NormFit','Weighted']
     self.statlist = []
     self.listlist = ['Batch','Fitness','Phenotype','SeqGroup']
     self.dictlist = []
     self.objlist = ['SeqList']
     ### ~ Default values ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     self._setDefaults(info='None',opt=False,stat=0.0,obj=None,setlist=True,setdict=True)
     infodefaults = {'Fitness':'cons','Phenotype':'cons','SeqGroup':'triplets','ResFile':'price.tdt'}
     self.setInfo(infodefaults)
     optdefaults = {'Append':True,'Weighted':True}
     self.setOpt(optdefaults)
     self.list['Batch'] = ['*.fas','*.fasta']
     ### ~ Sequence list object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     seqcmd = ['query=1'] + self.cmd_list + ['autoload=T']
     self.obj['SeqList'] = rje_seq.SeqList(self.log,seqcmd)
Example #10
0
 def _addSeq(self, seqname, fasdb):  ### Fishes seqname from fasdb (using fastacmd) and sets as self.obj['Seq']
     '''
     Fishes seqname from fasdb (using fastacmd) and sets as self.obj['Seq'].
     >> seqname = name of the sequence to retrieve (passed straight to seqFromFastaCmd)
     >> fasdb = database to retrieve it from (passed straight to seqFromFastaCmd)
     A failed retrieval (false result) is logged without a traceback; exceptions are logged with quitchoice=True.
     '''
     try:
         ### Retrieve sequence via an empty, non-loading SeqList ###
         seqcmd = self.cmd_list + ['seqin=None', 'autoload=F']
         fisher = rje_seq.SeqList(self.log, seqcmd)
         fished = fisher.seqFromFastaCmd(seqname, fasdb)
         self.obj['Seq'] = fished
         if not fished:
             failtxt = 'Fastacmd failure (%s from %s). Check win32=T/F!' % (seqname, fasdb)
             self.log.errorLog(failtxt, printerror=False)
     except:
         self.log.errorLog('Error in MotifOcc._addSeq()', printerror=True, quitchoice=True)
Example #11
0
 def _setAttributes(self):  ### Sets Attributes of Object
     '''
     Sets Attributes of Object.

     No named attributes are declared: every name list is empty. Defaults are applied, a sequence list object is
     built from self.cmd_list and the forking attributes are set.
     '''
     ### ~ Attribute names (none declared) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     # Each list is a separate object on purpose: they must not alias one another.
     self.infolist = []
     self.optlist = []
     self.statlist = []
     self.listlist = []
     self.dictlist = []
     self.objlist = []
     ### ~ Default values ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     self._setDefaults(info='None', opt=False, stat=0.0, obj=None, setlist=True, setdict=True)
     seqlist = rje_seq.SeqList(self.log, self.cmd_list)
     self.obj['SeqList'] = seqlist
     ### ~ Forking ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     self._setForkAttributes()  # Delete if no forking
Example #12
0
 def _setAttributes(self):   ### Sets Attributes of Object
     '''
     Sets Attributes of Object:
     - Info:str ['Name','TMHMM','SignalP','Source']
     - Stats:float []
     - Opt:boolean ['MySQL','MaskCleave']
     - Obj:RJE_Object ['SeqList']
     Also creates the empty self.tmhmm and self.signalp dictionaries.
     '''
     ### <a> ### Attribute names
     self.infolist = ['Name','TMHMM','SignalP','Source']
     self.statlist = []
     self.optlist = ['MySQL','MaskCleave']
     self.objlist = ['SeqList']
     ### <b> ### Default values
     self._setDefaults(info='None',opt=False,stat=0.0,obj=None)
     # Sequence list is created empty: loading and filtering are both switched off after self.cmd_list
     seqcmd = self.cmd_list + ['autoload=F','autofilter=F']
     self.obj['SeqList'] = rje_seq.SeqList(log=self.log,cmd_list=seqcmd)
     self.info['Source'] = 'tmhmm'
     ### <c> ### Result dictionaries
     self.tmhmm = {}
     self.signalp = {}
Example #13
0
 def _setAttributes(self):   ### Sets Attributes of Object
     '''
     Sets Attributes of Object.

     No named attributes are declared: every name list is empty. Defaults are applied, the forking attributes are
     set, and the database and sequence list objects are built ('autoload=T','dna=T' ahead of self.cmd_list).
     '''
     ### ~ Attribute names (none declared) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     # Each list is a separate object on purpose: they must not alias one another.
     self.strlist = []
     self.boollist = []
     self.intlist = []
     self.numlist = []
     self.listlist = []
     self.dictlist = []
     self.objlist = []
     ### ~ Default values ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     self._setDefaults(str='None',bool=False,int=0,num=0.0,obj=None,setlist=True,setdict=True)
     ### ~ Forking and objects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     self._setForkAttributes()   # Delete if no forking
     database = rje_db.Database(self.log,self.cmd_list)
     self.obj['DB'] = database
     seqcmd = ['autoload=T','dna=T'] + self.cmd_list
     self.obj['SeqList'] = rje_seq.SeqList(self.log,seqcmd)
Example #14
0
 def _setupMapped(self):     ### Sets up list of Previously Mapped Sequences
     '''
     Sets up list of Previously Mapped Sequences.

     self.list['Mapped'] is always reset to an empty list. When Append is on and the MapFas file exists, that file
     is read one fasta sequence at a time and the short name of each sequence is appended to the list.
     '''
     ### Setup ###
     self.list['Mapped'] = []    # List of mapped sequence names
     if not self.bool['Append'] or not os.path.exists(self.str['MapFas']): return
     ### Previous Sequences ###
     mapfas = self.str['MapFas']
     seqlist = rje_seq.SeqList(None,['i=-1','v=-1','autoload=F','seqin=%s' % mapfas])
     # Open the MapFas file itself (this previously used an undefined name and raised NameError)
     SEQFILE = open(mapfas,'r')
     lastline = ''
     ### Read names ###
     try:
         while True:
             (nextseq,lastline) = seqlist.nextFasSeq(SEQFILE,lastline)
             seqlist.seq = []    # Only names are wanted: stop the sequence objects accumulating in seqlist
             if not nextseq: break
             self.list['Mapped'].append(nextseq.shortName())
     finally: SEQFILE.close()
     self.printLog('#MAP','Read names of %s previously mapped sequences for redundancy checking' % rje.integerString(len(self.list['Mapped'])))
Example #15
0
    def setup(self):    ### Main class setup method.
        '''
        Main class setup method.

        Loads the protein alignment named by SeqIn, then locates the tree file (NSFIn, else <basefile>.nsf, else
        a name prompted from the user when interactive), loads the tree, maps the sequences on to it and saves
        the tree as <basefile>.nsf if it was re-rooted.
        << Returns True if setup completed; False if the user cancelled or an error was logged.
        '''
        try:### ~ [1] Read in Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            seqfile = self.getStr('SeqIn')
            seqcmd = ['i=0'] + self.cmd_list + ['autofilter=F','autoload=F','seqin=None']
            seqs = rje_seq.SeqList(log=self.log,cmd_list=seqcmd)
            self.printLog('#SEQS','Loading sequences from %s' % seqfile)
            loaded = seqs.loadSeqs(seqfile=seqfile,seqtype='protein',aln=True)
            if not loaded: raise IOError('Cannot load from %s' % seqfile)
            # From here on, use the file name as recorded by the sequence list
            seqfile = seqs.info['Name']
            basefile = rje.baseFile(seqfile)
            if not self.getStrLC('Basefile'): self.baseFile(basefile)
            self.printLog('#SEQ',"%s protein sequences read from %s\n" % (str(seqs.seqNum()),seqfile),1)
            #?# Add option to generate alignment?
            firstseq = seqs.seq[0]
            self.printLog('#SEQ',"Alignment = %s. (%d aa)\n" % (seqs.opt['Aligned'],firstseq.seqLen()),1)
            self.dict['Output']['seqin'] = seqfile

            ### ~ [2] Read in Tree ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            nsfin = self.getStr('NSFIn') if self.getStrLC('NSFIn') else basefile + '.nsf'
            while not os.path.exists(nsfin):
                # Non-interactive runs cannot prompt for a replacement file
                if self.i() < 0: raise IOError('File %s not found. Cannot load tree!' % nsfin)
                nsfin = rje.choice(text='Input tree file "%s" not found. Input filename? (Blank to exit.)' % nsfin)
                if nsfin == '': raise KeyboardInterrupt
            self.dict['Output']['nsfin'] = nsfin
            self.cmd_list.append('nsfin=' + nsfin)      # The tree object below picks the file up from cmd_list
            self.printLog('#TREE','Loading tree from %s' % nsfin)
            mytree = rje_tree.Tree(log=self.log,cmd_list=['root=yes']+self.cmd_list)
            self.obj['Tree'] = mytree
            mytree.mapSeq(seqlist=seqs)
            mytree.textTree()
            if mytree.opt['ReRooted']: mytree.saveTree(filename='%s.nsf' % basefile)
            return True     # Setup successful
        except KeyboardInterrupt:
            self.printLog('#CANCEL','User terminated.')
            return False
        except:
            self.errorLog('Problem during %s setup.' % self.prog())
            return False  # Setup failed
Example #16
0
 def setup(self):  ### Main class setup method.
     '''
     Main class setup method.

     Builds the sequence list, checks it for duplicates, optionally adds this object's name to the sequence list's
     Blast2Fas list and makes sure the results directory (HaqDir) exists.
     << Returns True if setup completed; False if no sequences were loaded or an error was logged.
     '''
     try:  ### ~ [1] Setup SeqList ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqcmd = ['keepblast=T'] + self.cmd_list + ['autofilter=F', 'align=F', 'haqbat=None']
         seqlist = rje_seq.SeqList(self.log, seqcmd)
         self.obj['SeqList'] = seqlist
         seqlist._checkForDup(True)
         if not self.seqNum():
             self.errorLog('No sequences loaded!', printerror=False)
             return False
         if self.opt['AddQueries']:
             blast2fas = seqlist.list['Blast2Fas']
             if self.name() not in blast2fas:
                 blast2fas.append(self.name())
         ### ~ [2] Setup Results Directory ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         haqdir = self.info['HaqDir']
         if haqdir.lower() in ('', 'none'):
             # Default directory is named after the path-stripped base of this object's name
             haqbase = rje.baseFile(self.name(), strip_path=True)
             haqdir = self.info['HaqDir'] = '%s_HAQESAC/' % haqbase
         rje.mkDir(self, haqdir)
         return True  # Setup successful
     except:
         self.errorLog('Problem during %s setup.' % self)
         return False  # Setup failed
Example #17
0
 def _positiveAndNegativePeptides(self): ### Populates PosPep and NegPep Lists
     '''Populates PosPep and NegPep Lists.'''
     try:### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         pfile = '%s.peptides.tdt' % self.basefile()
         #if rje.exists(pfile) and not self.getBool('Force'):
         #    try:
         #        pdb = self.db().addTable(pfile,['Peptide'],name='Peptides')
         #        pdb.dataFormat(reformat={'Len':'int','MWt':'num','Cys':'int','Ser':'int','Hyd':'num'})
         #        self.list['Peptides'] = self.list['PosPep'] = pdb.index('Pos')['Y']
         #        self.list['NegPep'] = pdb.index('Positive')['Neg']
         #        return pdb
         #    except: pass
         if not rje.exists(self.getStr('Peptides')) or not rje.exists(self.getStr('Positives')): return False
         self.list['Peptides'] = peplist = self.loadFromFile(self.getStr('Peptides'),chomplines=True)
         seqlist = rje_seq.SeqList(self.log,['autofilter=T','gnspacc=T','seqnr=F']+self.cmd_list+['seqin=%s' % self.getStr('Positives'),'autoload=T'])
         pdb = self.db().addEmptyTable('Peptides',['Peptide','NR','Pos','Len','MWt','C','HPW','DENQ','M','Hyd'],['Peptide'])
         ### ~ [1] ~ Digest Positives ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         protease = self.getStr('PepCut')
         self.list['PosPep'] = poslist = []; self.list['NegPep'] = neglist = []; sx = 0.0; stot = seqlist.seqNum()
         for seq in seqlist.seqs():
             self.progLog('\r#PEP','Processing positive proteins (%s): %.2f%%' % (protease,sx/stot)); sx += 100.0
             sequence = seq.getSequence()
             for cut in proteases[protease]: sequence = string.join(string.split(sequence,string.replace(cut,':','')),cut)
             frag = string.split(sequence,':')
             while '' in frag: frag.remove('')
             if not self.getBool('NTerm'): frag = frag[1:]
             for pep in frag[0:]:
                 if pep not in poslist: poslist.append(pep)
         self.printLog('\r#PEP','Processed positive proteins (%s): %s peptides' % (protease,rje.iLen(poslist)))
         ## ~ [1b] ~ Peptide Redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         allpep = []; self.list['Redundant'] = redundant = []
         sx = 0.0; stot = self.obj['SeqList'].seqNum() 
         for seq in self.obj['SeqList'].seqs():
             self.progLog('\r#DIG','%s Digesting sequences: %.2f%%' % (protease,sx/stot)); sx += 100.0
             sequence = seq.getSequence()
             for cut in proteases[protease]:
                 sequence = string.join(string.split(sequence,string.replace(cut,':','')),cut)
             for frag in string.split(sequence,':'):
                 if frag in allpep: redundant.append(frag)
                 else: allpep.append(frag)
         self.printLog('\r#DIG','%s Digesting %s sequences complete.' % (protease,rje.iStr(stot)))   
         ## ~ [1c] ~ Process fragments ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         px = 0.0; ptot = len(poslist)
         for pep in poslist[0:]:
             self.progLog('\r#PEP','Processing positive peptides (%s): %.2f%%' % (protease,px/ptot)); px += 100.0
             entry = {'Peptide':pep,'MWt':rje_sequence.MWt(pep),'Hyd':rje_sequence.eisenbergHydropathy(pep,returnlist=False),
                      'Len':len(pep),'NR':'Y','Pos':'Y'}
             if pep not in peplist: poslist.remove(pep); neglist.append(pep); entry['Pos'] = 'N'
             if pep in redundant: entry['NR'] = 'N'
             for aacomb in ['C','HPW','DENQ','M']:
                 x = 0
                 for a in aacomb: x += pep.count(a)
                 entry[aacomb] = x
             pdb.addEntry(entry)
         self.printLog('\r#PEP','Processing positive peptides (%s) complete: %s Pos; %s Neg.' % (protease,rje.iLen(poslist),rje.iLen(neglist)))
         ### ~ [2] ~ Save Files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         pdb.saveToFile(pfile)
         POS = open('%s.positives.fas' % self.basefile(),'w'); NEG = open('%s.negatives.fas' % self.basefile(),'w')
         for pep in poslist: POS.write('>%s\n%s\n' % (pep,pep))
         for pep in neglist: NEG.write('>%s\n%s\n' % (pep,pep))
         POS.close(); self.printLog('#FAS','%s peptides output to %s.positives.fas' % (rje.iLen(poslist),self.basefile()))
         NEG.close(); self.printLog('#FAS','%s peptides output to %s.negatives.fas' % (rje.iLen(neglist),self.basefile()))
         return pdb
     except: self.errorLog('Problem during %s._positiveAndNegativePeptides().' % self); return None  # Setup failed
Example #18
0
	def slimDisc(self):	### Runs SLiMDisc on batch of files
		'''
		Runs SLiMDisc on batch of files.

		Input is the single file self.info['SeqIn'] or, when that is blank/'none', the files returned by
		rje.getFileList() for self.list['SlimFiles']. Each file is loaded, optionally has its PPI hub protein
		removed (self.stat['RemHub'] > 0), is filtered on sequence number (MinSup/MaxSup) and the survivors are
		processed in order of total amino acid count. For each dataset the SLiMDisc command line is either
		appended to the self.info['BatchOut'] file or executed directly with os.system().
		<< False if the "seqin" file does not exist, else None (also None after a logged error).
		'''
		_stage = 'Setup'	# Bound before anything can fail: the except clause below reports the current stage
		try:
			### Setup ###
			if self.stat['MinSup'] > self.stat['SlimSupport'] and self.stat['SlimSupport'] > 1:
				self.stat['MinSup'] = self.stat['SlimSupport']
			if self.stat['MaxSup'] > 0  and self.stat['MaxSup'] < self.stat['SlimSupport'] and self.stat['SlimSupport'] > 1:
				self.stat['MaxSup'] = self.stat['SlimSupport']
			### Make File List ##
			_stage = 'Make File List'
			if self.info['SeqIn'].lower() not in ['','none']:
				if os.path.exists(self.info['SeqIn']):
					gfiles = [self.info['SeqIn']]
				else:
					self.log.errorLog('"seqin" file "%s" not found! No SLiMDisc analysis.' % self.info['SeqIn'],printerror=False)
					return False
			else:
				gfiles = rje.getFileList(callobj=self,filelist=self.list['SlimFiles'],subfolders=False,summary=False)
			self.log.printLog('#FILES','%s files identified for SLiMDisc analysis.' % rje.integerString(len(gfiles)))
			## Sort by size and filter by MinSup and MaxSup ###
			datasize = {}   # Dictionary for crude sorting of files by total AA content
			seqnum = {}		# Number of sequences in each file
			qry = {}		# Query sequence name (if any) for file
			tmpseq = rje_seq.SeqList(self.log,self.cmd_list+['seqin=None','autofilter=F'])
			gx = 0
			while gx < len(gfiles):		# Index loop because gfiles grows when "-kepthub" files are appended below
				seqfilename = gfiles[gx]
				gx += 1
				seqfile = seqfilename
				tmpseq.seq = []
				tmpseq.loadSeqs(seqfile)
				## *** Special RemHub process *** ##
				checkhub = True
				for hubtype in ['rem','kept','no']:		# Files already produced by a previous hub check are not re-checked
					if seqfile.find('-%shub.fas' % hubtype) > 0:
						checkhub = False
				if self.stat['RemHub'] > 0.0 and checkhub:
					# Hub accession is the text preceding "_PPI" in the file's base name, else the whole base name.
					# The match is made on the same string it is extracted from, so a "_PPI" that only occurs in
					# the directory part of the path can no longer cause a failed lookup.
					hub_name = rje.baseFile(seqfile,strip_path=True)
					ppi_match = rje.matchExp(r'(\S+)_PPI',hub_name)
					if ppi_match:
						hub_acc = ppi_match[0]
					else:
						hub_acc = hub_name
					basefile = seqfile
					while rje.baseFile(basefile) != basefile:	# Strip extensions until nothing more is removed
						basefile = rje.baseFile(basefile)
					if tmpseq.querySeq(query=hub_acc):     ### Sets Hub as Query Sequence
						self.log.printLog('#HUB','Removing hub protein %s and >=%.1f%% ID from PPI dataset %s.' % (hub_acc,self.stat['RemHub'],seqfile))
						tmpseq.makeNR(text='Hub protein homologues',nrid=self.stat['RemHub'],blast=tmpseq.seqNum(),nrsim=0,nr_qry=tmpseq.obj['QuerySeq'])
						tmpseq.removeSeq(text='PPI Hub Protein (self-interactor)',seq=tmpseq.obj['QuerySeq'])
						tmpseq.obj['QuerySeq'] = None
						seqfile = '%s-remhub.fas' % basefile
						tmpseq.saveFasta(seqfile=seqfile)	### Saves sequences in fasta format
						keptfile = '%s-kepthub.fas' % basefile
						os.rename(seqfilename,keptfile)
						gfiles.append(keptfile)		# The untouched dataset is analysed too, under its new name
					else:
						seqfile = '%s-nohub.fas' % basefile
						os.rename(seqfilename,seqfile)
						self.log.printLog('#HUB','Hub protein %s not in PPI dataset %s => %s.' % (hub_acc,seqfilename,seqfile))
				## Support Range ###
				if tmpseq.seqNum() < self.stat['MinSup'] or (self.stat['MaxSup'] > 0 and tmpseq.seqNum() > self.stat['MaxSup']):
					self.log.printLog('#REJ','%s rejected: %s sequences = outside acceptable range of %d-%d.' % (seqfile,rje.integerString(tmpseq.seqNum()),self.stat['MinSup'],self.stat['MaxSup']))
					continue
				aasize = tmpseq.aaCount()
				self.log.printLog('#AA','%s = %s aa.' % (seqfile,rje.integerString(aasize)))
				while aasize in datasize:	# Nudge the size upwards until it is a unique dictionary key
					aasize += 1
				datasize[aasize] = seqfile
				seqnum[seqfile] = tmpseq.seqNum()
				## Query ##
				qry[seqfile] = None
				if self.opt['SlimQuery']:
					qry_match = rje.matchExp(r'qry_(\S+)\.',seqfilename)
					if qry_match:
						if tmpseq.querySeq(query=qry_match[0]):     ### Sets Query Sequence if appropriate
							qry[seqfile] = tmpseq.obj['QuerySeq'].shortName()
			self.log.printLog('#INF','%s Datasets to process.' % rje.integerString(len(seqnum)))

			### Batch Output Mode ###
			batchout = None
			if self.info['BatchOut'].lower() not in ['','none']:
				batchout = self.info['BatchOut']
				if not self.opt['Append'] and os.path.exists(batchout):
					rje.backup(self,batchout)

			### Work through Files ###
			_stage = 'Work through files'
			for key in rje.sortKeys(datasize,revsort=self.opt['BigFirst']):
				seqfile = datasize[key]
				basefile = seqfile
				while rje.baseFile(basefile) != basefile:
					basefile = rje.baseFile(basefile)
				base = rje.baseFile(basefile,True)
				self.log.printLog('#DAT',seqfile,timeout=False)
				if not self.opt['UseRes']:
					slim_cmd = '-BT -TT'
				else:
					## Detect old files ##
					_stage = 'Detect old files'
					old_rank = '%s/%s.rank' % (basefile,base)
					self.log.printLog('#RES','Existing SLiMDisc Output?: %s' % (os.path.exists(old_rank)))
					old_b_list = glob.glob('%s/results/*.blastp' % basefile)
					old_t_file = '%s/%s.fasta.out' % (basefile,base)
					self.log.printLog('#RES','Existng TEIRESIAS Output?: %s' % (os.path.exists(old_t_file)))
					self.log.printLog('#RES','%s of %s BLAST files detected.' % (rje.integerString(len(old_b_list)),rje.integerString(seqnum[seqfile])))
					## TEIRESIAS ##
					if (os.path.exists(old_rank) or len(old_b_list) > 0) and os.path.exists(old_t_file):  # BLAST started: TEIRESIAS finished!
						slim_cmd = '-TF'
					else:
						slim_cmd = '-TT'
					## BLAST ##
					if len(old_b_list) != seqnum[seqfile]:	# Need BLAST
						slim_cmd += ' -BT'
					else:
						slim_cmd += ' -BF'
				## Query ##
				if self.opt['SlimQuery'] and qry[seqfile]:
					slim_cmd += ' -q %s' % qry[seqfile]
				## Ranks ##
				slim_cmd += ' -n %d' % self.stat['SlimRanks']
				## Support ##
				if self.stat['SlimSupport'] > 0 and self.stat['SlimSupport'] < 1:
					slim_cmd += ' -S %.1f' % self.stat['SlimSupport']
				elif self.stat['SlimSupport'] > 0:
					slim_cmd += ' -S %d' % self.stat['SlimSupport']
				## WallTime ##
				slim_cmd += ' -W %d' % self.stat['SlimWall']
				## MemSaver ##
				if self.opt['MemSaver']:
					slim_cmd += ' -X T'
				else:
					slim_cmd += ' -X F'
				## SlimOpt ##
				if self.info['SlimOpt']:
					slim_cmd += ' %s' % self.info['SlimOpt']
				## Perform SLiMDisc Run ##
				_stage = 'Peform SLiMDisc Run (%s)' % (seqfile)
				if batchout:
					BATCH = open(batchout,'a')
					try:
						BATCH.write('%s -i %s -Q0 %s\n' % (self.info['SlimCall'],seqfile,slim_cmd))
					finally:
						BATCH.close()	# Closed even if the write fails
				else:
					# NOTE(review): the direct call uses a hard-coded script path whereas batch mode uses
					# self.info['SlimCall']; left unchanged as callers may depend on it.
					qlevel = 0
					if self.stat['Verbose'] > 0:
						qlevel = 2
					syscmd = 'python /home/richard/Python_Modules/slimdisc_V%s.py -i %s -Q%d %s' % (self.info['SlimVersion'],seqfile,qlevel,slim_cmd)
					self.log.printLog('#SYS',syscmd)
					os.system(syscmd)
					new_rank = '%s/%s.rank' % (basefile,base)
					self.log.printLog('#RES','New rank result %s produced?: %s' % (new_rank,os.path.exists(new_rank)))

		except:
			self.log.errorLog('rje_pattern_discovery banjaxed in slimDisc() %s' % _stage,quitchoice=True)
Example #19
0
 def run(self):  ### Main run method
     '''
     Main run method.

     Loads the sequences (and, if self.info['OccFile'] is set, the occurrence data, which is also used to
     filter the sequences), calculates the statistics named in self.list['PlotStat'] for each sequence, then
     calls self.output() once per sequence or once per sequence/dataset combination.
     << returns None; any exception is passed to self.errorLog().
     '''
     try:  ### ~ [1] Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [1a] Basefile becomes either blank or a prefix ending in "." ~ ##
         basefile = self.info['Basefile']
         if basefile.lower() in ['', 'none']:
             self.info['Basefile'] = ''
         elif not basefile.endswith('.'):
             self.info['Basefile'] += '.'
         ## ~ [1b] Sequences and feature list ~ ##
         self.obj['SeqList'] = rje_seq.SeqList(self.log,
                                               self.cmd_list + ['autoload=T'])
         self.list['PlotFT'] = ' '.join(self.list['PlotFT']).upper().split()
         ## ~ [1c] Occurrence data, grouped as {seq:{dataset:[occurrence dictionaries]}} ~ ##
         occfile = self.info['OccFile']
         if occfile.lower() not in ['', 'none']:
             self.info['Delimit'] = rje.delimitFromExt(filename=occfile)
             self.dict['OccData'] = occbyseq = {}
             occfields = ['Seq', 'Dataset', 'Pattern', 'Start_Pos', 'End_Pos']
             occdata = rje.dataDict(self, occfile, occfields, occfields[:])
             for key in rje.sortKeys(occdata):
                 occ = occdata[key]
                 seqname = occ.pop('Seq')
                 seqocc = occbyseq.setdefault(seqname, {})
                 dataset = occ.pop('Dataset')
                 seqocc.setdefault(dataset, []).append(occ)
             occx = rje.integerString(len(occdata))
             seqx = rje.integerString(len(self.dict['OccData']))
             self.printLog(
                 '#OCC',
                 'Loaded data for %s occurrences in %s sequences' % (occx, seqx))
             goodseq = ','.join(rje.sortKeys(self.dict['OccData']))
             self.obj['SeqList'].autoFilter(['GoodSeq=%s' % goodseq])
         ### ~ [2] Calculate Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.list['PlotStat'] = plotstats = ' '.join(
             self.list['PlotStat']).lower().split()
         calc_cons = 'cons' in plotstats or 'rel' in plotstats
         if calc_cons:
             slimcalc = rje_slimcalc.SLiMCalc(self.log, self.cmd_list)
         seqdict = self.obj['SeqList'].seqNameDic()
         for name in rje.sortKeys(seqdict):
             if self.opt['OccOnly'] and name not in self.dict['OccData']:
                 continue
             seq = seqdict[name]
             sequence = seq.getSequence(gaps=False)
             seq.dict['PlotStat'] = seqstat = {}
             ## ~ [2a] Raw per-residue lists ~ ##
             if 'sa' in plotstats:
                 seqstat['SA'] = rje_seq.surfaceAccessibility(sequence,
                                                              returnlist=True)
             if 'hyd' in plotstats:
                 seqstat['Hydropathy'] = rje_seq.eisenbergHydropathy(
                     sequence, returnlist=True)
             if 'dis' in plotstats:
                 seqstat['Disorder'] = seq.disorder(returnlist=True)
             if calc_cons:
                 slimcalc.relConListFromSeq(seq,
                                            slimcalc.stat['RelConWin'],
                                            store=True)
                 try:  # Conservation lists are moved out of seq.list (if stored there)
                     seqstat['Cons_Abs'] = seq.list.pop('Cons')
                     seqstat['Cons_Rel'] = seq.list.pop('RelCons')
                 except:
                     self.printLog('#CONS',
                                   'No conservation stats for %s' % name)
             self.printLog('#STAT', 'PlotStats calculated for %s' % name)
             ## ~ [2b] Window (all but Cons_Rel, when PlotWin >= 0) and convert ~ ##
             for stat in seqstat:
                 values = seqstat[stat]
                 if stat != 'Cons_Rel' and self.stat['PlotWin'] >= 0:
                     values = self.plotWin(values)
                 seqstat[stat] = self.convertStat(values)
             self.printLog('#STAT', 'PlotStats converted for %s' % name)
             ### ~ [3] Output Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             if name not in self.dict['OccData']:
                 plotfile = '%s%s.plot.txt' % (self.info['Basefile'],
                                               seq.info['AccNum'])
                 self.output(seq, plotfile)
                 continue
             for dataset in self.dict['OccData'][name]:
                 ofile = '%s%s.%s.plot.txt' % (self.info['Basefile'], dataset,
                                               seq.info['AccNum'])
                 self.output(seq, ofile, self.dict['OccData'][name][dataset])
         return None
     except:
         self.errorLog(rje_zen.Zen().wisdom())
Example #20
0
def loadOrthAln(callobj,seq,gopher=True):    ### Identifies file, loads and checks alignment.
    '''
    Identifies file, loads and checks alignment. If the identified file is not actually aligned, then RJE_SEQ will try to
    align the proteins using MUSCLE or ClustalW.
    >> callobj:Object containing settings for stats generation (MotifList, generally).
    >> seq:Sequence being analysed.
    >> gopher:bool [True] = whether to try to generate alignment with GOPHER if callobj.opt['Gopher']
    << aln = SeqList object containing alignment with queryseq. None is returned if no alignment file is found, the
       GOPHER run fails, the query is not in the file, there are fewer than 2 sequences, the alignment check fails
       or any other error occurs (errors are logged via callobj.log, not raised).
    '''
    v = None    # Original verbosity. Stays None until read so the error handler knows whether there is anything to restore
    try:
        ### Setup Attributes ###
        v = callobj.stat['Verbose']
        alndir = rje.makePath(callobj.info['AlnDir'])
        alnext = callobj.info['AlnExt']

        ### Identify File ###
        if alnext[0] != '.': alnext = '.%s' % alnext
        # Candidate file name stems, tried in order. The final None triggers the "nothing found" branch below.
        alnstart = [seq.info['AccNum'],seq.info['ID'],seq.shortName(),None]
        if v > 2: callobj.log.printLog('#PRESTO','%s' % callobj.opt)  #!# Old debugging? #!#
        if callobj.opt['Gopher'] and callobj.opt['FullForce']:
            if v > 0: callobj.log.printLog('#ALN','FullForce=T. Will call Gopher for %s regardless of existing files' % seq.shortName())
            alnstart = [None]   # Skip the search for existing files entirely
        alnfile = None
        for alnfile in alnstart:
            if alnfile:
                alnfile = '%s%s%s' % (alndir,alnfile,alnext)
                if rje.checkForFile(alnfile): break  # File found
            else:
                #!# Sort out logging and see if Gopher can be used directly rather than just run() #!#
                ### Run GOPHER ###
                if gopher and callobj.opt['Gopher']:  #!# Add working version for PRESTO and SlimPickings #!#
                    callobj.deBug('Run GOPHER in %s' % callobj.info['GopherDir'])
                    mydir = os.getcwd()
                    os.chdir(callobj.info['GopherDir'])
                    callobj.log.printLog('\n#GOPHER','Running GOPHER on %s' % seq.shortName())
                    try:    #!# Add log.silent() method? #!#
                        gcmd = ['orthtree'] + callobj.cmd_list + ['gnspacc=T','i=-1']
                        solo_gopher = gopher_V2.GopherFork(log=callobj.log,cmd_list=gcmd)
                        solo_gopher.info['Name'] = seq.shortName()
                        solo_gopher.obj['Sequence'] = seq
                        solo_gopher.obj['BLAST'] = gopher_V2.Gopher(callobj.log,gcmd).setupBlast()  #!# Contemplate setting up Gopher in callobj #!#
                        solo_gopher.obj['BLAST'].log = callobj.log
                        solo_gopher.run('orthalign')
                    except:
                        os.chdir(mydir)     # Return to the run directory before logging the failure
                        callobj.log.errorLog('Problem with Gopher run!')
                        return None
                    os.chdir(mydir)
                if callobj.opt['Gopher']:
                    # GOPHER output (new or pre-existing) is looked for under the accession number only
                    alnfile = '%s%s%s' % (alndir,seq.info['AccNum'],alnext)
                    if not os.path.exists(alnfile):
                        alnfile = None
                if not alnfile:
                    callobj.log.printLog('#ALN','No alignment file found for %s in %s.' % (seq.shortName(),alndir),screen=False)
                    return None

        ### Load Alignment ###
        callobj.log.stat['Verbose'] = v - 1     # Verbosity is lowered by one while loading, then restored
        alncmd = ['seqin=None','query=%s' % seq.shortName(),'accnr=F','seqnr=F','autofilter=F','align=T','gnspacc=F']
        aln = rje_seq.SeqList(log=callobj.log,cmd_list=callobj.cmd_list+alncmd)
        aln.loadSeqs(seqfile=alnfile,seqtype='Protein',aln=True,nodup=None)
        callobj.log.stat['Verbose'] = v
        ## Check Query ##
        qry = aln.obj['QuerySeq']
        if not qry:
            if aln.querySeq(query=seq.info['AccNum']):
                qry = aln.obj['QuerySeq']
            else:
                callobj.log.printLog('#ALN','Problem finding %s in %s.' % (seq.shortName(),alnfile),screen=False)
                return None

        ### Check Alignment ###
        if aln.seqNum() < 2:
            callobj.log.printLog('#ALN','Not enough sequences for %s in %s.' % (seq.shortName(),alnfile),screen=False)
            return None
        if aln._checkAln(aln=True,realign=True):
            return aln
        else:
            callobj.log.printLog('#ERR','%s not aligned!!!' % (alnfile))
            return None
    except:
        callobj.log.errorLog('Something bad has happened in rje_motif_stats.loadOrthAln()')
        if v is not None: callobj.log.stat['Verbose'] = v   # Only restore a verbosity that was actually read
        return None
Example #21
0
 def multiHAQ(self, secondrun=False):  ### Executes main HAQESAC runs
     '''
     Executes main HAQESAC runs.
     >> secondrun:bool [False] = whether this is the second pass. It is compared with self.opt['MultiHAQ'] to
         decide whether this pass is the final (manual) HAQESAC phase. When it is not, the method finishes by
         calling itself with secondrun=True.
     << returns None. Failures are reported through self.errorLog(quitchoice=True) rather than raised.
     '''
     try:  ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         finalrun = secondrun == self.opt['MultiHAQ']  # Whether this is the manual HAQESAC phase
         qryacc = self.obj['SeqList'].accList()  # Full list of Query accession numbers
         processed = []  # List of processed sequence accession numbers
         ### ~ [1] Peform HAQESAC runs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for seq in self.seqs():
             ## ~ [1a] Check AutoSkip ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             acc = seq.info['AccNum']
             if finalrun and acc in processed and (
                     self.opt['AutoSkip'] or (self.i() >= 0 and rje.yesNo(
                         '%s already covered by previous HAQESAC. Skip?' %
                         seq.shortName()))):
                 self.printLog(
                     '#SKIP',
                     '%s already covered by previous HAQESAC: Skipped' %
                     seq.shortName())
                 continue
             ## ~ [1b] Check Whether to run (re-runs and low sequence number) ~~~~~~~~~~~~~~~~~~ ##
             haqdir = self.info['HaqDir']
             infile = rje.makePath('%s%s.fas' % (haqdir, acc), wholepath=True)
             pkfile = rje.makePath('%s%s.pickle' % (haqdir, acc),
                                   wholepath=True)
             pkzfile = rje.makePath('%s%s.pickle.gz' % (haqdir, acc),
                                    wholepath=True)
             if not os.path.exists(infile):
                 self.printLog(
                     '#SKIP', '%s input file %s not found: Skipped' %
                     (seq.shortName(), infile))
                 continue
             # First-round runs are skipped when rje.isYounger() returns the pickle (gzipped checked first)
             if not finalrun and not self.opt['Force']:
                 if (rje.isYounger(pkzfile, infile) == pkzfile
                         or rje.isYounger(pkfile, infile) == pkfile):
                     self.printLog(
                         '#SKIP',
                         '%s run detected: Skipped' % seq.shortName())
                     continue
             inseqx = rje_seq.SeqCount(self, infile)
             if inseqx < 2:
                 self.printLog(
                     '#SKIP',
                     'Only one sequence found in %s: Skipped' % (infile))
                 continue
             ## ~ [1c] Pause if running in Chaser Mode and no Pickle ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             pickled = os.path.exists(pkfile) or os.path.exists(
                 '%s.gz' % pkfile)
             tm = 0  # Minutes to sleep; grows by one each time round (the first check does not sleep)
             while secondrun and self.opt['Chaser'] and not pickled:
                 self.progLog(
                     '#WAIT',
                     'No %s pickle. Sleeping for %d min.' % (acc, tm))
                 time.sleep(60 * tm)
                 tm += 1
                 pickled = os.path.exists(pkfile) or os.path.exists(
                     '%s.gz' % pkfile)
                 if not pickled:
                     try:
                         rje.choice(
                             'Press <ENTER> to try again, or <CTRL+C> to Quit'
                         )
                     except:  # Deliberately bare: <CTRL+C> (KeyboardInterrupt) is the documented way out
                         # Fixed: the message has one placeholder, so only acc is supplied. Passing
                         # (acc, tm) raised TypeError and turned a clean exit into a "Major problem".
                         self.printLog('#PICKLE', 'No %s pickle.' % acc)
                         self.printLog('\r#MULTI',
                                       'Exiting multiHAQ "Chaser" run.')
                         return
             ## ~ [1d] Run HAQESAC ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             runhaqesac = True
             pngfile = rje.makePath('%s%s.png' % (self.info['HaqDir'], acc),
                                    wholepath=True)
             if not self.force() and rje.exists(pngfile):
                 self.printLog(
                     '#SKIP',
                     'Found evidence of completed run: %s (force=F). Skipping.'
                     % pngfile)
                 runhaqesac = False
             ancfile = rje.makePath('%s%s.anc.fas' %
                                    (self.info['HaqDir'], acc),
                                    wholepath=True)
             if not self.force() and rje.exists(ancfile):
                 self.printLog(
                     '#SKIP',
                     'Found evidence of completed run: %s (force=F). Skipping.'
                     % ancfile)
                 runhaqesac = False
             if runhaqesac:
                 haqcmd = [
                     'ini=haqesac.ini',
                     'seqin=%s.fas' % acc,
                     'query=%s' % acc,
                     'basefile=%s' % acc, 'newlog=F'
                 ]
                 self.printLog(
                     '#HAQ',
                     'Running HAQESAC for %s - will have own log etc.' %
                     seq.shortName(),
                     log=False)
                 os.chdir(self.info['HaqDir'])  # HAQESAC is run from inside its own directory
                 info = haqesac.makeInfo()
                 haqcmd = rje.getCmdList(haqcmd, info=info)
                 out = rje.Out(cmd_list=haqcmd)  # Sets up Out object for controlling output to screen
                 out.printIntro(info)  # Prints intro text using details from Info object
                 haqlog = rje.setLog(info, out, haqcmd)  # Sets up Log object for controlling log file output
                 try:
                     haqesac.HAQESAC(log=haqlog,
                                     cmd_list=haqcmd).run(setobjects=True)
                 except:
                     # A failed run only aborts the batch if the user asks for it when prompted
                     os.chdir(self.info['RunPath'])
                     if self.i() >= 0 and rje.yesNo(
                             'Problem with %s HAQESAC run. Abort?' %
                             seq.shortName()):
                         raise KeyboardInterrupt
                 os.chdir(self.info['RunPath'])
                 if finalrun:
                     self.printLog(
                         '#HAQ',
                         'HAQESAC final round run for %s' % seq.shortName())
                 else:
                     self.printLog(
                         '#HAQ',
                         'HAQESAC first round run for %s' % seq.shortName())
             ## ~ [1e] Update ScreenQry ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             if not self.opt['ScreenQry'] or not finalrun: continue
             qacclist = []  # Other queries found in this query's input file
             for qacc in rje_seq.SeqList(
                     self.log,
                 ['seqin=%s' % infile, 'autoload=T', 'autofilter=F'
                  ]).accList():
                 if qacc in qryacc and qacc != acc: qacclist.append(qacc)
                 if qacc in qryacc and qacc not in processed:
                     processed.append(qacc)
             self.printLog(
                 '#QRY', '%d other queries found in %s: [%s]' %
                 (len(qacclist), infile, string.join(qacclist, '; ')))
             self.printLog(
                 '#QRY', '%d of %d queries processed' %
                 (len(processed), self.seqNum()))
         ### ~ [2] MultiHAQ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not finalrun:
             self.printLog('#MULTI', 'Executing second round of multiHAQ')
             self.multiHAQ(True)
     except:
         self.errorLog('Major problem with MultiHAQ.multiHAQ',
                       quitchoice=True)
Example #22
0
    def _pepStats(self):  ### Peptide Distance
        '''
        Peptide Distance.

        Loads the input sequences and writes one delimited row per peptide to hrb.pepstats.<ext> (extension from
        rje.delimitExt()): the peptide name, the summed value of each property in the AAPropMatrix, the net
        charge, and the charge and hydrophobic "balance" scores.
        << returns None. Any error is logged through self.log.errorLog() and then re-raised.
        '''
        try:
            ### Setup ###
            seqlist = rje_seq.SeqList(self.log, self.cmd_list + ['autoload=T'])
            aaprop = rje_aaprop.AAPropMatrix(self.log, self.cmd_list)
            aaprop.makePropDif()
            delimit = rje.getDelimit(self.cmd_list)

            ### Output File Setup ###
            OUTFILE = open('hrb.pepstats.%s' % rje.delimitExt(delimit), 'w')
            try:  # The finally clause closes the file even when a calculation below fails
                headlist = ['peptide']
                proplist = rje.sortKeys(aaprop.prop)  # Fixed property order shared by header and data rows
                ## 10 Dimensional Peptide Property Output ##
                for prop in proplist:
                    headlist.append(prop.lower())
                    # Property values are converted in place to integers (gap and X entries are left alone)
                    for aa in aaprop.prop[prop].keys():
                        try:
                            if aa not in ['-', 'X']:
                                aaprop.prop[prop][aa] = string.atoi(
                                    aaprop.prop[prop][aa])
                        except:
                            # Dump the offending value before re-raising
                            print('%s %s %s %s' % (aaprop.prop, prop, aa,
                                                   aaprop.prop[prop][aa]))
                            raise
                ## Additional Stats ##
                headlist.append('net_charge')
                headlist.append('charge_balance')
                headlist.append('hydrophobic_balance')
                ## Output
                rje.writeDelimit(OUTFILE, headlist, delimit)

                ### Calculate stats ###
                for pep in seqlist.seq:
                    pepname = pep.shortName()
                    namematch = rje.matchExp(r'^(\S+_\d[CQ])', pepname)
                    if namematch:
                        pepname = namematch[0]
                    outlist = [pepname]
                    pepseq = pep.info['Sequence']
                    ## 10 Dimensional Peptide Property Output ##
                    for prop in proplist:
                        px = 0
                        for aa in pepseq:
                            px += aaprop.prop[prop][aa]
                        outlist.append('%d' % px)
                    ## Additional Stats ##
                    net_charge = 0
                    for aa in pepseq:
                        net_charge += (aaprop.prop['Positive'][aa] -
                                       aaprop.prop['Negative'][aa])
                    outlist.append('%d' % net_charge)
                    # Balance = residues weighted by 1/(distance from first position) minus the same residues
                    # weighted by 1/(10 - r).
                    # NOTE(review): the constant 10 looks like an assumed peptide length - a sequence longer than
                    # 10 residues reaches 10 - r == 0 and raises ZeroDivisionError. Confirm before generalising.
                    charge_balance = 0
                    hydrophobic_balance = 0
                    for r, aa in enumerate(pepseq):
                        charged = aaprop.prop['Charged'][aa]
                        hydrophobic = aaprop.prop['Hydrophobic'][aa]
                        charge_balance += charged * (1.0 / (r + 1))
                        charge_balance -= charged * (1.0 / (10 - r))
                        hydrophobic_balance += hydrophobic * (1.0 / (r + 1))
                        hydrophobic_balance -= hydrophobic * (1.0 / (10 - r))
                    outlist.append('%.3f' % charge_balance)
                    outlist.append('%.3f' % hydrophobic_balance)
                    rje.writeDelimit(OUTFILE, outlist, delimit)
            finally:
                ### Finish ###
                OUTFILE.close()

        except:
            self.log.errorLog('Error in _pepStats',
                              printerror=True,
                              quitchoice=False)
            raise  # Delete this if method error not terrible
Example #23
0
    def parse(
        self,
        parsedom=True,
        parseseq=True,
        parsecomplex=True
    ):  ### HPRD Parsing method. Generates Mappings, HPRD data dictionary, Domain dictionary & Sequences
        '''
        HPRD Parsing method. Generates Mappings, HPRD data dictionary, Domain dictionary & Sequences.

        Reads the HPRD flat files found under self.info['HPRDPath'] and (re)builds:
          self.dict['HPRD']         {HPRD ID: {'gene','gb','entrez','omim','sp','desc' [,'Seq']}}
          self.dict['Mapping']      {upper-case gene/gb/sp identifier: HPRD ID}
          self.dict['Domains']      {domain: [HPRD IDs]}
          self.dict['DomainSource'] {domain: [sources]}
          self.obj['SeqList']       sequences from PROTEIN_SEQUENCES.txt, renamed to <Gene>_HUMAN__<AccNum>
          self.dict['PPI']          {HPRD ID: [interacting HPRD IDs]}
          self.dict['Evidence']     {HPRD ID: {interacting HPRD ID: [evidence types]}}
          self.dict['Complex']      {complex ID: [HPRD IDs]}
        Interactor IDs that are absent from HPRD_ID_MAPPINGS are written to HPRD.missing.txt.

        >> parsedom:bool [True] = whether to parse PROTEIN_Architecture.txt
        >> parseseq:bool [True] = whether to load and reformat PROTEIN_SEQUENCES.txt
        >> parsecomplex:bool [True] = whether to parse PROTEIN_COMPLEXES.txt and add complexes to the PPI data
        Any exception is logged via self.log.errorLog() and then re-raised.
        '''
        try:
            ### ~ Parse HPRD Mappings onto other database IDs from HPRD_ID_MAPPINGS.txt ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.dict['HPRD'] = {}
            self.dict['Mapping'] = {}
            hprdpath = self.info['HPRDPath']
            hprddb = self.dict['HPRD']
            lines = self.loadFromFile('%sHPRD_ID_MAPPINGS.txt' % hprdpath,
                                      v=1,
                                      checkpath=True,
                                      chomplines=True)
            hx = float(len(lines))
            for (lx, entry) in enumerate(lines):
                px = 100.0 * (lx + 1) / hx
                self.log.printLog('\r#HPRD',
                                  'Parsing HPRD_ID_MAPPINGS: %.1f%%' % px,
                                  newline=False,
                                  log=False)
                data = entry.split()
                ## Check ##
                if len(data) < 7: continue
                hid = data[0].upper()  # Keys are stored upper-case, so test the upper-case form too
                if hid in hprddb:
                    self.log.errorLog('HPRD ID %s duplicated! Aaargh!' %
                                      data[0],
                                      printerror=False)
                ## Update ##
                hprddb[hid] = {
                    'gene': data[1].upper(),
                    'gb': data[3],
                    'entrez': data[4],
                    'omim': data[5],
                    'sp': data[6].upper(),
                    'desc': ' '.join(data[7:])
                }
                for i in [1, 3, 6]:  # gene, gb and sp identifiers all map back to the HPRD ID
                    self.dict['Mapping'][data[i].upper()] = hid
            self.log.printLog('\r#HPRD', 'Parsing HPRD_ID_MAPPINGS complete!')

            ### ~ Parse HPRD Domain Mappings from PROTEIN_Architecture.txt ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.dict['Domains'] = {}
            self.dict['DomainSource'] = {}
            if parsedom:
                lines = self.loadFromFile('%sPROTEIN_Architecture.txt' %
                                          hprdpath,
                                          v=1,
                                          checkpath=True,
                                          chomplines=True)
                hx = float(len(lines))
                for (lx, entry) in enumerate(lines):
                    px = 100.0 * (lx + 1) / hx
                    self.log.printLog('\r#HPRD',
                                      'Parsing PROTEIN_Architecture: %.1f%%' %
                                      px,
                                      newline=False,
                                      log=False)
                    data = entry.split()
                    ## Check ##
                    if len(data) < 9: continue
                    (hid, domain, ftype, source) = (data[0], data[4], data[5],
                                                    data[8])
                    if ftype != 'Domain': continue
                    ## Update ##
                    for (dkey, value) in (('Domains', hid), ('DomainSource',
                                                             source)):
                        dlist = self.dict[dkey].setdefault(domain, [])
                        if value not in dlist: dlist.append(value)
                self.log.printLog('\r#HPRD',
                                  'Parsing PROTEIN_Architecture complete!')

            ### ~ Make SeqList from PROTEIN_SEQUENCES.txt ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if parseseq:
                scmd = self.cmd_list + [
                    'autoload=T', 'gnspacc=F',
                    'seqin=%sPROTEIN_SEQUENCES.txt' % hprdpath,
                    'autofilter=F', 'accnr=F', 'seqnr=F'
                ]
                seqlist = self.obj['SeqList'] = rje_seq.SeqList(self.log, scmd)
                seqlist.info['Name'] = self.info['OutDir'] + 'hprd.fas'
                sx = 0.0
                for seq in seqlist.seq[0:]:  # Loop over a copy: sequences may be removed. seq.info['ID'] should be the HPRD ID #
                    ## Initial processing of sequence. Only keep if AllIso or isoform 1 ##
                    self.log.printLog('\r#SEQ',
                                      'Processing HPRD Sequences: %.1f%%' %
                                      (sx / seqlist.seqNum()),
                                      newline=False,
                                      log=False)
                    iso = 'X'  # Isoform stays 'X' if it cannot be read from the name
                    h = seq.info['ID']
                    try:
                        iso = rje.matchExp(r'^\d+\|\d+_(\d+)\|',
                                           seq.info['Name'])[0]
                    except:
                        self.deBug(seq.info['Name'])
                    try:
                        if h not in hprddb:
                            self.printLog(
                                '\r#ERR',
                                'Missing from HPRD_ID_MAPPINGS?: %s' %
                                seq.info['Name'])
                            data = seq.info['Name'].split('|')
                            hprddb[h] = {
                                'gene': '-',
                                'gb': data[2],
                                'entrez': '',
                                'omim': '',
                                'sp': '',
                                'desc': '|'.join(data[3:])
                            }
                        hentry = hprddb[h]
                        if not self.opt['AllIso'] and 'Seq' in hentry and iso != '1':
                            seqlist.seq.remove(seq)
                            continue
                        sx += 100.0
                        seq.setInfo({
                            'Gene':
                            hentry['gene'],
                            'Description':
                            hentry['desc'] +
                            ' [Gene:%s HPRD:%s; gb:%s; sp:%s]' %
                            (hentry['gene'], h, hentry['gb'], hentry['sp']),
                            'AccNum':
                            hentry['sp']
                        })
                        ## AllIso options ##
                        if self.opt['AllIso']:
                            if 'Seq' not in hentry:
                                hentry['Seq'] = [seq]
                            else:
                                hentry['Seq'].append(seq)
                            seq.setInfo({'AccNum': '%s-%s' % (h, iso)})
                        else:
                            hentry['Seq'] = seq
                        ## Finish formatting ##
                        if seq.info['Gene'] == '-':
                            hentry['gene'] = seq.info['Gene'] = 'HPRD' + h
                        if seq.info['AccNum'] == '-':
                            seq.info['AccNum'] = hentry['gb']
                        seq.info['ID'] = '%s_HUMAN' % seq.info['Gene']
                        seq.info['Name'] = '%s__%s %s' % (
                            seq.info['ID'], seq.info['AccNum'],
                            seq.info['Description'])
                    except:
                        self.errorLog('Protein Parse Error (%s)' %
                                      seq.info['Name'])
                self.log.printLog('\r#SEQ',
                                  'Processing HPRD Sequences complete!')

            ### ~ Make PPI Data from BINARY_PROTEIN_PROTEIN_INTERACTIONS.txt ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            missing = []  # Interactor IDs absent from HPRD_ID_MAPPINGS (each reported once)
            self.dict['PPI'] = {}
            ppidb = self.dict['PPI']
            if 'Evidence' not in self.dict:  # Evidence is added to (not reset) here, so just make sure it exists
                self.dict['Evidence'] = {}
            evidence = self.dict['Evidence']
            lines = self.loadFromFile(
                '%sBINARY_PROTEIN_PROTEIN_INTERACTIONS.txt' % hprdpath,
                v=1,
                checkpath=True,
                chomplines=True)
            hx = float(len(lines))
            ix = 0  # Number of interaction lines passing the type filters
            for (lx, entry) in enumerate(lines):
                px = 100.0 * (lx + 1) / hx
                self.log.printLog(
                    '\r#PPI',
                    'Parsing BINARY_PROTEIN_PROTEIN_INTERACTIONS: %.1f%%' % px,
                    newline=False,
                    log=False)
                data = entry.split('\t')
                ## Check ##
                if len(data) < 7: continue
                # split() never returns an empty list, so blank entries must be dropped for 'unknown' to apply
                types = [ptype for ptype in data[6].split(';') if ptype]
                if not types: types = ['unknown']
                types = [
                    ptype for ptype in types
                    if ptype not in self.list['BadType'] and (
                        not self.list['PPIType']
                        or ptype in self.list['PPIType'])
                ]
                if not types: continue
                ix += 1
                ## Update ##
                (p1, p2) = (data[1].upper(), data[4].upper())
                unmapped = False
                for pid in (p1, p2):  # Stop at the first unmapped interactor, reporting its own ID
                    if pid in hprddb: continue
                    if pid not in missing:
                        missing.append(pid)
                        self.log.printLog(
                            '#ERR',
                            'HPRD ID "%s" missing from HPRD_ID_MAPPINGS!' %
                            pid,
                            screen=False)
                    unmapped = True
                    break
                if unmapped: continue
                if p1 not in ppidb: ppidb[p1] = []
                if p2 not in ppidb[p1]:
                    ppidb[p1].append(p2)
                if p2 not in ppidb: ppidb[p2] = []
                if p1 not in ppidb[p2]:
                    ppidb[p2].append(p1)
                if p1 not in evidence:
                    evidence[p1] = {}
                if p2 not in evidence[p1]:
                    evidence[p1][p2] = []
                for ptype in types:
                    if ptype not in evidence[p1][p2]:
                        evidence[p1][p2].append(ptype)
            self.log.printLog(
                '\r#PPI',
                'Parsing BINARY_PROTEIN_PROTEIN_INTERACTIONS complete!')

            ### ~ Parse protein Complex data from PROTEIN_COMPLEXES.txt ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.dict['Complex'] = {}
            if parsecomplex:
                lines = self.loadFromFile('%sPROTEIN_COMPLEXES.txt' %
                                          hprdpath,
                                          v=1,
                                          checkpath=True,
                                          chomplines=True)
                hx = float(len(lines))
                for (lx, entry) in enumerate(lines):
                    px = 100.0 * (lx + 1) / hx
                    self.log.printLog('\r#PPI',
                                      'Parsing PROTEIN_COMPLEXES: %.1f%%' % px,
                                      newline=False,
                                      log=False)
                    data = entry.split()
                    ## Check ##
                    if len(data) < 5: continue
                    ## Update ##
                    (cid, hid) = (data[0], data[1])
                    if hid == 'None': continue
                    members = self.dict['Complex'].setdefault(cid, [])
                    if hid not in members:
                        members.append(hid)
                self.log.printLog('\r#PPI',
                                  'Parsing PROTEIN_COMPLEXES complete!')

            ### ~ Update PPI from protein Complex data if appropriate ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ctype = 'complex'
            if parsecomplex and ctype not in self.list['BadType'] and (
                    not self.list['PPIType'] or ctype in self.list['PPIType']):
                cx = 0.0
                for cid in self.dict['Complex']:
                    self.log.printLog(
                        '\r#PPI',
                        'Adding protein complex data to PPI: %.1f%%' %
                        (cx / len(self.dict['Complex'])),
                        newline=False,
                        log=False)
                    cx += 100.0
                    members = self.dict['Complex'][cid]
                    for p1 in members:  # All-by-all within the complex (includes self pairs)
                        for p2 in members:
                            if p1 not in ppidb:
                                ppidb[p1] = []
                            if p2 not in ppidb[p1]:
                                ppidb[p1].append(p2)
                            if p1 not in evidence:
                                evidence[p1] = {}
                            if p2 not in evidence[p1]:
                                evidence[p1][p2] = []
                            if ctype not in evidence[p1][p2]:
                                evidence[p1][p2].append(ctype)
                self.log.printLog(
                    '\r#PPI',
                    'Added protein complex data to PPI for %s complexes' %
                    rje.integerString(len(self.dict['Complex'])))
            ptxt = '%s proteins; %s interactions' % (rje.integerString(
                len(ppidb)), rje.integerString(ix))
            self.log.printLog('\r#PPI',
                              'Parsing interactions complete: %s.' % ptxt)
            if missing:
                MISSING = open('HPRD.missing.txt', 'w')
                try:
                    MISSING.write('\n'.join(missing))
                finally:
                    MISSING.close()  # Always release the file handle
        except:
            self.log.errorLog('Error in HPRD.parse()',
                              printerror=True,
                              quitchoice=False)
            raise
Example #24
0
    def makeFlySeq(self):  ### Main run method
        '''
        Main run method. Loads the FlyBase r5.5 gene, CDS and exon fasta files, records the scaffold positions of each
        gene and of its CDS/exon features, converts the feature positions to gene-relative coordinates, rewrites each
        gene name as "<Gene>_<SpecCode>__<AccNum> Length=..; CDS=..; Exons=..;" and saves flybase_DROME.genes.fas.
        Any error is reported through self.log.errorLog() and is not re-raised.
        '''
        try:  ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [0a] ~ Local helper functions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            def locSpan(loc, attempts, lastregex):
                '''
                Returns (start, end) integers parsed from a FlyBase location string.
                >> loc:str = location text to parse
                >> attempts:list = (regex, flip) pairs tried in order; flip=True means the match is (end, start)
                >> lastregex:str = final regex, applied unguarded so that a failure propagates to the caller
                '''
                for (regex, flip) in attempts:
                    try:
                        (first, second) = rje.matchExp(regex, loc)
                    except:
                        continue  # No match for this pattern: move on to the next one
                    if flip: (first, second) = (second, first)
                    return (int(first), int(second))
                (first, second) = rje.matchExp(lastregex, loc)
                return (int(first), int(second))

            def relativePos(parent, features, plen):
                '''
                Returns sorted unique list of "start-end" strings for features, relative to the parent gene start.
                >> parent:Sequence object = gene providing stat['Start'] and opt['Complement']
                >> features:list = CDS or exon Sequence objects with stat['Start'] and stat['End']
                >> plen:int = gene length, passed to rje.preZero() when formatting positions
                '''
                anchor = parent.stat['Start']
                poslist = []
                for feature in features:
                    if parent.opt['Complement']:  # Must subtract from "wrong" end and reverse
                        offsets = (anchor - feature.stat['Start'],
                                   anchor - feature.stat['End'])
                    else:
                        offsets = (feature.stat['Start'] - anchor,
                                   feature.stat['End'] - anchor)
                    poslist.append('%s-%s' % (rje.preZero(offsets[0], plen),
                                              rje.preZero(offsets[1], plen)))
                return rje.sortUnique(poslist, xreplace=False)

            plainloc = r'^(\d+)\.\.(\d+)'
            genelocs = [(r'^complement\((\d+)\.\.(\d+)\)', True)]
            partlocs = [(r'^complement\((\d+)\..*\.(\d+)\)', True),
                        (r'^join\((\d+)\..*\.(\d+)\)', False)]
            ## ~ [0b] ~ Load sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            flybase = rje.makePath('/scratch/Databases/NewDB/FlyBase/Fasta/')
            scmd = ['accnr=F', 'seqnr=F', 'gnspacc=F']
            genes = rje_seq.SeqList(
                self.log,
                self.cmd_list + ['seqin=%sdmel-all-gene-r5.5.fasta' % flybase] + scmd)
            cds = rje_seq.SeqList(
                self.log,
                self.cmd_list + ['seqin=%sdmel-all-CDS-r5.5.fasta' % flybase] + scmd)
            exons = rje_seq.SeqList(
                self.log,
                self.cmd_list + ['seqin=%sdmel-all-exon-r5.5.fasta' % flybase] + scmd)

            ### ~ [1] ~ Read in full-length gene and note start and end positions in parent scaffold ~~~~~~~~~~~~~~~~ ###
            genedict = {}  # Dictionary of {ID:Sequence object}
            gtot = genes.seqNum()
            gx = 0.0
            for gene in genes.seq:
                self.log.printLog('\r#GENE',
                                  'Processing Gene Annotation: %.1f%%' % (gx / gtot),
                                  newline=False,
                                  log=False)
                gx += 100
                (gid, scaffold, loc, gname, glen) = rje.matchExp(
                    r'^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);',
                    gene.info['Name'])
                if int(glen) != gene.aaLen():
                    self.log.errorLog('%s Length mismatch!' % gid,
                                      printerror=False)
                genedict[gid] = gene
                gene.setInfo({'Scaffold': scaffold, 'Gene': gname})
                (start, end) = locSpan(loc, genelocs, plainloc)
                gene.opt['Complement'] = start > end  # Sequence on "lagging" strand
                gene.setStat({'Start': start, 'End': end})
                gene.list['CDS'] = []  # CDS sequences get added in [2]
                gene.list['Exon'] = []  # Exon sequences get added in [3]
            self.log.printLog('\r#GENE',
                              'Processing Gene Annotation complete!')

            ### ~ [2] ~ Read in associated CDS sequences and note start and end positions ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ctot = cds.seqNum()
            cx = 0.0
            for seq in cds.seq:
                self.log.printLog('\r#CDS',
                                  'Processing CDS Annotation: %.1f%%' % (cx / ctot),
                                  newline=False,
                                  log=False)
                cx += 100
                try:
                    (sid, scaffold, loc, sname, slen, parent) = rje.matchExp(
                        r'^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);.+parent=(\S+),\S+;',
                        seq.info['Name'])
                except:
                    self.log.errorLog(seq.info['Name'])
                    raise
                if int(slen) != seq.aaLen():
                    self.log.errorLog('%s Length mismatch!' % sid,
                                      printerror=False)
                gene = genedict[parent]
                seq.obj['Parent'] = gene
                (start, end) = locSpan(loc, partlocs, plainloc)
                seq.opt['Complement'] = start > end  # Sequence on "lagging" strand
                seq.setStat({'Start': start, 'End': end})
                gene.list['CDS'].append(seq)
            self.log.printLog('\r#CDS', 'Processing CDS Annotation complete!')

            ### ~ [3] ~ Read in associated exons and note start and end positions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            etot = exons.seqNum()
            ex = 0.0
            for seq in exons.seq:
                self.log.printLog('\r#EXON',
                                  'Processing Exon Annotation: %.1f%%' % (ex / etot),
                                  newline=False,
                                  log=False)
                ex += 100
                try:
                    (sid, scaffold, loc, sname, parent) = rje.matchExp(
                        r'^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+parent=(\S+);',
                        seq.info['Name'])
                except:
                    self.log.errorLog(seq.info['Name'])
                    raise
                gene = genedict[parent.split(',')[0]]  # First listed parent only
                seq.obj['Parent'] = gene
                (start, end) = locSpan(loc, partlocs, plainloc)
                seq.opt['Complement'] = start > end  # Sequence on "lagging" strand
                seq.setStat({'Start': start, 'End': end})
                gene.list['Exon'].append(seq)
            self.log.printLog('\r#EXON',
                              'Processing Exon Annotation complete!')

            ### ~ [4] ~ Regenerate output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [4a] ~ Convert to relative positions and store ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            gtot = genes.seqNum()
            gx = 0.0
            for gene in genes.seq:
                glen = gene.aaLen()
                self.log.printLog('\r#GENE',
                                  'Generating new Gene Annotation: %.1f%%' % (gx / gtot),
                                  newline=False,
                                  log=False)
                gx += 100
                clist = relativePos(gene, gene.list['CDS'], glen)
                elist = relativePos(gene, gene.list['Exon'], glen)
                newname = '%s_%s__%s Length=%d; CDS=%s; Exons=%s;' % (
                    gene.info['Gene'], gene.info['SpecCode'],
                    gene.info['AccNum'], gene.aaLen(),
                    ','.join(clist), ','.join(elist))
                gene.info['Name'] = newname
            self.log.printLog('\r#GENE',
                              'Generating new Gene Annotation complete!')
            ## ~ [4b] ~ Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            genes.saveFasta(seqfile='flybase_DROME.genes.fas')

        except:
            self.log.errorLog(rje_zen.Zen().wisdom())
Example #25
0
    def codons(self):  ### Main codons analysis method
        '''
        Main codons analysis method. Loads the FlyBase r5.5 CDS fasta file and outputs, for each of the 64 nucleotide
        triplets, to quad_triplet.tdt:
          Obs_Codon / NT_Codon = codon frequency within its amino acid (observed / predicted from nt frequencies)
          Obs_Trip / NT_Trip = triplet frequency (observed in all frames / predicted from nt frequencies)
          ObCod_Trip / NTCod_Trip = triplet frequency predicted from the observed / nt-predicted codon frequencies
        CDS entries with unparseable exon coordinates, a length that is not a multiple of 3, or containing N are
        removed from the sequence list before the triplet calculations. Triplet counts use the exon lengths parsed
        from the loc= field when checking exon boundaries.
        Any error is reported through self.log.errorLog() and is not re-raised.
        '''
        try:  ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            flybase = rje.makePath('/scratch/Databases/NewDB/FlyBase/Fasta/')
            scmd = ['accnr=F', 'seqnr=F', 'gnspacc=F']
            cds = rje_seq.SeqList(
                self.log, self.cmd_list +
                ['seqin=%sdmel-all-CDS-r5.5.fasta' % flybase] + scmd)
            gcode = rje_sequence.genetic_code  # Looked up with RNA codons, hence the T->U replacements below

            ### ~ [1] ~ Make codon frequency tables (a) Observed, (b) Based on NTFreq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            nts = ['A', 'C', 'G', 'T']
            ntfreq = cds.aaFreq(alphabet=nts)
            codons = []  # List of codons
            obs_cfreq = {}  # Observed codon frequencies {aa:{codon:freq}}
            nts_cfreq = {}  # Codon frequencies from NT frequencies {aa:{codon:freq}}
            obs_tfreq = {}  # Observed triplet frequencies
            nts_tfreq = {}  # Predicted triplet frequencies from NT frequencies
            ocd_tfreq = {}  # Predicted triplet frequencies from observed codon frequencies
            ncd_tfreq = {}  # Predicted triplet frequencies from nt-predicted codon frequencies
            ## ~ [1a] ~ Setup dictionaries using nt freqs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            for n1 in nts:
                for n2 in nts:
                    for n3 in nts:
                        cod = n1 + n2 + n3
                        codons.append(cod)
                        aa = gcode[cod.replace('T', 'U')]
                        if aa not in obs_cfreq: obs_cfreq[aa] = {}
                        if aa not in nts_cfreq: nts_cfreq[aa] = {}
                        expected = ntfreq[n1] * ntfreq[n2] * ntfreq[n3]
                        obs_cfreq[aa][cod] = 0.0
                        nts_cfreq[aa][cod] = expected
                        obs_tfreq[cod] = 0.0
                        nts_tfreq[cod] = expected
                        ocd_tfreq[cod] = 0.0
                        ncd_tfreq[cod] = 0.0
            nts_tfreq = rje.dictFreq(nts_tfreq,
                                     total=False)  # Normalise triplet freq.
            for aa in nts_cfreq:
                nts_cfreq[aa] = rje.dictFreq(
                    nts_cfreq[aa], total=False)  # Normalise codon freq.
            self.log.printLog('#FREQ', 'Frequency dictionaries set up.')
            ## ~ [1b] ~ Observed codon freq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            (sx, stot) = (0.0, cds.seqNum())
            for seq in cds.seq[0:]:  # Loop over a copy: rejected sequences are removed from cds.seq
                self.log.printLog(
                    '\r#OBS',
                    'Calculating observed codon frequencies: %.1f%%' %
                    (sx / stot),
                    newline=False,
                    log=False)
                sx += 100.0
                try:
                    (sid, scaffold, pos, name, glen, parent) = rje.matchExp(
                        r'^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);.+parent=(\S+),\S+;',
                        seq.info['Name'])
                except:
                    self.log.errorLog(seq.info['Name'])
                    raise
                # Extract the comma-separated exon ranges: complement(...), then join(...), then a plain range
                try:
                    exons = rje.matchExp(r'^complement\((\d+\..*\.\d+)\)',
                                         pos)[0]
                except:
                    try:
                        exons = rje.matchExp(r'^join\((\d+\..*\.\d+)\)',
                                             pos)[0]
                    except:
                        exons = rje.matchExp(r'^(\d+\.\.\d+)', pos)[0]
                self.deBug(exons)
                exons = exons.split(',')
                elen = []  # Exon lengths in order along the coding sequence
                try:
                    for exon in exons:
                        (start, end) = exon.split('..')
                        elen.append(int(end) - int(start) + 1)
                except:
                    self.log.errorLog(sid)
                    cds.seq.remove(seq)
                    continue

                if pos[:4] == 'comp': elen.reverse()  # Complement strand: exons listed in reverse order
                seq.list['ExonLen'] = elen
                self.deBug(elen)
                if sum(elen) != seq.aaLen():  # Reported only: sequence is still used
                    self.log.errorLog('%s exon length error' % sid,
                                      printerror=False)
                if seq.aaLen() % 3 != 0:  # Modulo test is independent of the division semantics in force
                    self.log.errorLog('%s not a multiple of 3nt long!' % sid,
                                      printerror=False)
                    cds.seq.remove(seq)
                    continue
                #!# Add use exon option - single full-length exon if false (mature mRNA) #!#
                sequence = seq.info['Sequence']
                if sequence.count('N') > 0:
                    self.log.errorLog('%s has 1+ Ns!' % sid, printerror=False)
                    cds.seq.remove(seq)
                    continue
                for cx in range(0, len(sequence), 3):  # Step through codons without re-slicing the whole sequence
                    cod = sequence[cx:cx + 3]
                    aa = gcode[cod.replace('T', 'U')]
                    obs_cfreq[aa][cod] += 1
            for aa in obs_cfreq:
                obs_cfreq[aa] = rje.dictFreq(
                    obs_cfreq[aa], total=False)  # Normalise codon freq.
            self.log.printLog(
                '\r#OBS', 'Calculating observed codon frequencies complete.')

            ### ~ [2] ~ Generate Triplet freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (sx, stot) = (0.0, cds.seqNum())
            for seq in cds.seq:
                self.log.printLog('\r#TRIP',
                                  'Calculating triplet frequencies: %.1f%%' %
                                  (sx / stot),
                                  newline=False,
                                  log=False)
                sx += 100.0
                elen = seq.list['ExonLen'][:]  # Work on a copy: exons are popped off as they are passed
                sequence = seq.info['Sequence']
                aa = ''
                cod = ''
                ax = 0  # Measure sequence length processed for exon boundary checks
                for cx in range(0, len(sequence), 3):
                    (prevcod, prevaa) = (cod, aa)
                    cod = sequence[cx:cx + 3]
                    aa = gcode[cod.replace('T', 'U')]
                    ## ~ [2a] ~ Predicted Triplet Freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    for cod2 in obs_cfreq[aa]:
                        if elen[0] > ax + 3:  # Exon boundary beyond this codon
                            ocd_tfreq[cod2] += obs_cfreq[aa][cod2]
                            ncd_tfreq[cod2] += nts_cfreq[aa][cod2]
                        if prevaa:  # Look at overlap with previous codon
                            for cod1 in obs_cfreq[prevaa]:
                                for i in range(1, 3):
                                    if elen[0] > ax + i:  # Exon boundary beyond overlap
                                        acod = cod1[i:] + cod2[:i]
                                        ocd_tfreq[acod] += (
                                            obs_cfreq[prevaa][cod1] *
                                            obs_cfreq[aa][cod2])
                                        ncd_tfreq[acod] += (
                                            nts_cfreq[prevaa][cod1] *
                                            nts_cfreq[aa][cod2])
                    ## ~ [2b] ~ Observed Triplet Freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    if elen[0] > ax + 3:  # Exon boundary beyond this codon
                        obs_tfreq[cod] += 1
                    if prevcod:  # Look at overlap with previous codon
                        for i in range(1, 3):
                            if elen[0] > ax + i:  # Exon boundary beyond overlap
                                acod = prevcod[i:] + cod[:i]
                                obs_tfreq[acod] += 1
                    # Check exons #
                    ax += 3
                    if ax >= elen[0]: ax -= elen.pop(0)  # Passed the end of the current exon: move to the next
            obs_tfreq = rje.dictFreq(obs_tfreq, total=False)
            ocd_tfreq = rje.dictFreq(ocd_tfreq, total=False)
            ncd_tfreq = rje.dictFreq(ncd_tfreq, total=False)
            self.log.printLog('\r#TRIP',
                              'Calculating triplet frequencies complete.')

            ### ~ [3] ~ Output results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            headers = [
                'Triplet', 'AA', 'Degen', 'Obs_Codon', 'NT_Codon', 'Obs_Trip',
                'NT_Trip', 'ObCod_Trip', 'NTCod_Trip'
            ]
            tfile = 'quad_triplet.tdt'
            rje.delimitedFileOutput(self, tfile, headers, rje_backup=True)
            for cod in codons:
                aa = gcode[cod.replace('T', 'U')]
                datadict = {
                    'Triplet': cod,
                    'AA': aa,
                    'Degen': len(obs_cfreq[aa]),  # Number of codons for this amino acid
                    'Obs_Codon': obs_cfreq[aa][cod],
                    'NT_Codon': nts_cfreq[aa][cod],
                    'Obs_Trip': obs_tfreq[cod],
                    'NT_Trip': nts_tfreq[cod],
                    'ObCod_Trip': ocd_tfreq[cod],
                    'NTCod_Trip': ncd_tfreq[cod]
                }
                rje.delimitedFileOutput(self,
                                        tfile,
                                        headers,
                                        datadict=datadict)
            self.log.printLog('#OUT',
                              'Triplet & codon data output to %s' % tfile)
        except:
            self.log.errorLog(rje_zen.Zen().wisdom())
Example #26
0
 def setup(self):  ### Main class setup method.
     '''
     Main class setup method.

     [1] Loads the pairwise PPI table and builds fullppi = {hub:{spoke:evidence}} plus gene<->sequence lookups.
     [2] Filters the spokes of each hub into self.dict['PPI'] = {hub:[spokes]}. A spoke is kept if its evidence
         matches one of the types read from the "good" PPI file, or else if no third protein is listed as an
         interactor of both the hub and the spoke.
     [3] Loads the sequence file into self.obj['SeqList'] and stores self.dict['SeqObj'], self.dict['Gene2Seq']
         and self.dict['Seq2Gene'].
     << True if setup completed; False if any exception was raised (the error is logged).
     '''
     try:  ### ~ [1] Pairwise PPI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ppipairwise = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/Pingu/pingu.pairwise.tdt'
         self.progLog('\r#PPI', 'Loading pairwise data...')
         pairwise = rje.dataDict(self, ppipairwise, ['Hub', 'Spoke'],
                                 ['Spoke', 'SpokeSeq', 'Evidence'])
         gene2seq = {}  # {spoke: SpokeSeq value of the first pair seen for that spoke}
         seq2gene = {}  # {SpokeSeq text before the first '__': spoke}
         fullppi = {}  # {hub: {spoke: Evidence}}
         px = 0.0
         ptot = len(pairwise)
         ppix = 0
         for pair in rje.sortKeys(pairwise):  # Keys are split on tab into hub and spoke
             self.progLog(
                 '\r#PPI',
                 'Processing full pairwise PPI: %.2f%%' % (px / ptot))
             px += 100.0
             [hub, spoke] = string.split(pair, '\t')
             if spoke not in gene2seq:
                 sseq = pairwise[pair]['SpokeSeq']
                 gene2seq[spoke] = sseq
                 seq2gene[string.split(sseq, '__')[0]] = spoke
             if hub not in fullppi: fullppi[hub] = {}
             if spoke not in fullppi[hub]:
                 # pop() releases the loaded entry once its evidence has been stored
                 fullppi[hub][spoke] = pairwise.pop(pair)['Evidence']
                 ppix += 1
         self.printLog(
             '\r#PPI', 'Processed full pairwise PPI: %s genes; %s ppi.' %
             (rje.integerString(len(fullppi)), rje.integerString(ppix / 2)))
         ### ~ [2] Filter complexes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         goodppifile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/Pingu/hybrid.txt'
         goodppi = self.loadFromFile(goodppifile, chomplines=True)
         self.dict['PPI'] = {}
         px = 0.0
         ptot = len(fullppi)
         fppix = ppix
         ppix = 0
         for hub in fullppi:
             self.progLog(
                 '\r#PPI', 'Filtering complexes: %.2f%% (%s hubs; %s ppi)' %
                 (px / ptot, rje.integerString(len(
                     self.dict['PPI'])), rje.integerString(ppix)))
             px += 100.0
             self.dict['PPI'][hub] = []
             for spoke in fullppi[hub]:
                 ## ~ [2a] Keep any spoke supported by a "good" evidence type ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 goodspoke = False
                 for ptype in goodppi:
                     # Raw string: identical pattern text, without relying on '\|' being an unknown escape
                     if rje.matchExp(r':(%s)($|\|)' % ptype,
                                     fullppi[hub][spoke]):
                         goodspoke = True
                         break
                 ## ~ [2b] Otherwise reject the spoke if hub and spoke share a third interactor ~~~~~~~~~~~~~~~~ ##
                 if not goodspoke:
                     # A spoke that never occurs as a hub has no interactors of its own; previously this
                     # lookup raised KeyError and aborted the whole setup.
                     spokeppi = fullppi.get(spoke, {})
                     goodspoke = True
                     for spoke2 in fullppi[hub]:
                         if spoke2 in [hub, spoke]: continue
                         if spoke2 in spokeppi:
                             goodspoke = False
                             break
                 if goodspoke: self.dict['PPI'][hub].append(spoke)
             ppix += len(self.dict['PPI'][hub])
             if not self.dict['PPI'][hub]: self.dict['PPI'].pop(hub)  # Drop hubs left without spokes
         self.printLog(
             '\r#PPI', 'Filtered complexes: (%s -> %s hubs; %s -> %s ppi)' %
             (rje.integerString(
                 len(fullppi)), rje.integerString(len(self.dict['PPI'])),
              rje.integerString(fppix / 2), rje.integerString(ppix / 2)))
         ### ~ [3] SeqList ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqfile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/EnsEMBL/ens_HUMAN.loci.fas'
         scmd = ['accnr=F', 'seqnr=F',
                 'seqin=%s' % seqfile] + self.cmd_list + ['autoload=T']
         seqlist = self.obj['SeqList'] = rje_seq.SeqList(self.log, scmd)
         self.dict['SeqObj'] = seqlist.seqNameDic('Max')
         self.dict['Gene2Seq'] = gene2seq
         self.dict['Seq2Gene'] = seq2gene
         return True  # Setup successful
     except:
         self.errorLog('Problem during %s setup.' % self)
         return False  # Setup failed
Example #27
0
    def setup(self):  ### Main class setup method.
        '''
        Main class setup method.

        Looks up the program named by the 'Name' setting, logs its details and full command list, and
        instantiates the matching program object as self.obj['Prog'].
        If no program object results, the user may (when self.i() >= 0) view the help and/or enter another
        program name, in which case setup() is called again.
        << self.obj['Prog'] if a program object was made; False otherwise (including on error or CTRL+C).
        '''
        try:  ### ~ [1] ~ Setup Program ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.obj['Prog'] = None
            prog = self.getStrLC('Name')
            if prog in mod:
                proginfo = mod[prog].makeInfo()
                self.obj['ProgInfo'] = proginfo
                self.printLog(
                    '#PROG', '%s V%s: %s' %
                    (proginfo.program, proginfo.version, proginfo.description))
                progcmd = rje.getCmdList([], info=proginfo) + self.cmd_list + ['newlog=F']
                out = rje.Out(cmd_list=progcmd)
                out.printIntro(proginfo)
                if self.getBool('Help'):
                    progcmd = mod[prog].cmdHelp(proginfo, out, ['help'] + progcmd)
                nopath = self.getStrLC('Rest') and not self.dev()
                tidycmd = rje.tidyArgs(progcmd, nopath=nopath, purgelist=purgelist)
                self.printLog('#CMD',
                              'Full %s CmdList: %s' % (proginfo.program, rje.argString(tidycmd)),
                              screen=False)
                ## ~ [1a] ~ Make self.obj['Prog'] ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                # (accepted names, class getter): getters are lambdas so that only the selected
                # module attribute is ever looked up, exactly as in an if/elif chain.
                progclasses = [
                    (['seqlist', 'rje_seqlist'], lambda: rje_seqlist.SeqList),
                    (['uniprot', 'rje_uniprot'], lambda: rje_uniprot.UniProt),
                    (['taxonomy', 'rje_taxonomy'], lambda: rje_taxonomy.Taxonomy),
                    (['tree', 'rje_tree'], lambda: rje_tree.Tree),
                    (['xref', 'rje_xref'], lambda: rje_xref.XRef),
                    (['seq', 'rje_seq'], lambda: rje_seq.SeqList),
                    (['mitab', 'rje_mitab'], lambda: rje_mitab.MITAB),
                    (['dbase', 'database'], lambda: rje_dbase.DatabaseController),
                    (['pydocs'], lambda: rje_pydocs.PyDoc),
                    (['ensembl', 'rje_ensembl'], lambda: rje_ensembl.EnsEMBL),
                    (['genbank', 'rje_genbank'], lambda: rje_genbank.GenBank),
                    (['extatic'], lambda: extatic.ExTATIC),
                    (['revert'], lambda: revert.REVERT),
                    (['fiesta'], lambda: fiesta.FIESTA),
                    (['gablam'], lambda: gablam.GABLAM),
                    (['gopher'], lambda: gopher.Gopher),
                    (['haqesac'], lambda: haqesac.HAQESAC),
                    (['multihaq'], lambda: multihaq.MultiHAQ),
                    (['pingu'], lambda: pingu.PINGU),
                    (['pacbio'], lambda: rje_pacbio.PacBio),
                    (['rje_zen', 'zen'], lambda: rje_zen.Zen),
                ]
                for (prognames, getclass) in progclasses:
                    if prog in prognames:
                        self.obj['Prog'] = getclass()(self.log, progcmd)
                        break

            ### ~ [2] ~ Failure to recognise program ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if self.obj['Prog']:
                return self.obj['Prog']  # Setup successful
            self.printLog(
                '#ERR', 'Program "%s" not recognised.' % self.getStr('Name'))
            if self.i() < 0:
                return False  # No user prompts at this interactivity level
            if rje.yesNo('Show SeqSuite help with program options?'):
                extracmd = cmdHelp(cmd_list=['help'])[1:]
                if extracmd:
                    self.cmd_list += extracmd
                    self._cmdList()
                    # Retry straight away if the extra commands changed the program name
                    if prog != self.getStrLC('Name'):
                        return self.setup()
            newname = rje.choice('Give program name (Blank or CTRL+C to quit)')
            self.setStr({'Name': newname})
            if not self.getStrLC('Name'):
                return False
            return self.setup()
        except KeyboardInterrupt:
            return False
        except SystemExit:
            raise
        except:
            self.errorLog('Problem during %s setup.' % self.prog())
            return False  # Setup failed
Example #28
0
 def _peptideProbabilities(self):    ### Read in peptides and positives and calculate probability of return
     '''
     Read in peptides and positives and calculate probability of return.

     The positive proteins are digested with the 'PepCut' protease and each distinct peptide is counted as
     Positive if it is listed in the 'Peptides' file, else Negative. Counts are binned by peptide length
     (capped at MaxPepLen) and, if PepMWt is set, also by molecular weight in units of 100 (stored as
     PepSize = bin*100.0, capped at MaxPepLen*100.0). Prob = Positive / (Positive + Negative), or 0.0 for
     empty bins. Results are saved to BASEFILE.pep_prob.tdt, which is re-used on later runs unless Force.
     << PepProb table on success; False if the Peptides or Positives file is missing; None on error.
        If CysWeight is set, returns the result of self._cysteinePeptideProbabilities() instead.
     '''
     try:### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self.getBool('CysWeight'): return self._cysteinePeptideProbabilities()
         self._positiveAndNegativePeptides()
         pfile = '%s.pep_prob.tdt' % self.basefile()
         if rje.exists(pfile) and not self.getBool('Force'):
             try:
                 pdb = self.db().addTable(pfile,['PepSize'],name='PepProb')
                 pdb.dataFormat(reformat={'PepSize':'num','Positive':'int','Negative':'int','Prob':'num'})
                 for entry in pdb.entries():
                     # Sizes below 100 are length bins (ints); larger values are the MWt bins (floats)
                     if entry['PepSize'] < 100: entry['PepSize'] = int(entry['PepSize'])
                 return pdb
             except: pass    # Best effort: any problem with the saved table falls through to recalculation
         pdb = self.db().addEmptyTable('PepProb',['PepSize','Positive','Negative','Prob'],['PepSize'])
         if not rje.exists(self.getStr('Peptides')) or not rje.exists(self.getStr('Positives')): return False
         ## ~ [0a] ~ Load Peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         pepset = set(self.loadFromFile(self.getStr('Peptides'),chomplines=True))    # Set for fast membership tests
         ## ~ [0b] ~ Load Positives ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         seqlist = rje_seq.SeqList(self.log,['autofilter=T','gnspacc=T','seqnr=F']+self.cmd_list+['seqin=%s' % self.getStr('Positives'),'autoload=T'])
         ### ~ [1] ~ Digest Positives and Update PepProb Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         protease = self.getStr('PepCut')
         maxlen = self.getInt('MaxPepLen'); pepmwt = self.getBool('PepMWt')
         nterm = self.getBool('NTerm'); nrpep = self.getBool('NRPep')
         ## ~ [1a] ~ Create new database entry to fill with data ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         edict = {}  # {PepSize bin: table entry}
         for i in range(1,maxlen+1):
             edict[i] = pdb.addEntry({'PepSize':i,'Positive':0,'Negative':0,'Prob':1.0})
             if pepmwt: edict[i*100.0] = pdb.addEntry({'PepSize':i*100.0,'Positive':0,'Negative':0,'Prob':1.0})
         ## ~ [1b] ~ For each recognition site of each protease, mark cuts with ":" ~~~~~~~~ ##
         poslist = []; seen = set(); sx = 0.0; stot = seqlist.seqNum()
         for seq in seqlist.seqs():
             self.progLog('\r#PEP','Processing positive proteins (%s): %.2f%%' % (protease,sx/stot)); sx += 100.0
             sequence = seq.getSequence()
             for cut in proteases[protease]: sequence = string.join(string.split(sequence,string.replace(cut,':','')),cut)
             frag = [pep for pep in string.split(sequence,':') if pep]   # Drop empty fragments
             if not nterm: frag = frag[1:]
             for pep in frag:
                 if nrpep and pep in self.list['Redundant']: continue
                 if pep in seen: continue    # Each distinct peptide is only counted once
                 seen.add(pep); poslist.append(pep)
         self.printLog('\r#PEP','Processed positive proteins (%s): %s peptides' % (protease,rje.iLen(poslist)))
         ## ~ [1c] ~ Process fragments ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         px = 0.0; ptot = len(poslist)
         for pep in poslist:
             self.progLog('\r#PEP','Processing positive peptides (%s): %.2f%%' % (protease,px/ptot)); px += 100.0
             if pep in pepset: field = 'Positive'
             else: field = 'Negative'
             edict[min(len(pep),maxlen)][field] += 1
             if pepmwt:
                 pwt = 100.0 * min(int((rje_sequence.MWt(pep)+99)/100.0),maxlen)    # Round weight up to next 100
                 edict[pwt][field] += 1
         self.printLog('\r#PEP','Processing positive peptides (%s) complete.' % protease)
         ## ~ [1d] # Calculate peptide probabilities for protease combo ~~~~~~~~~~~~~~~~~~~~ ##
         for entry in edict.values():
             total = entry['Positive'] + entry['Negative']
             if total: entry['Prob'] = float(entry['Positive']) / float(total)
             else: entry['Prob'] = 0.0   # Empty bin
         ### ~ [2] ~ Save File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         pdb.saveToFile(pfile)
         return pdb
     except: self.errorLog('Problem during %s._peptideProbabilities().' % self); return None  # Setup failed
Example #29
0
    def _pepDis(self):  ### Peptide Distance
        '''
        Peptide Distance.

        Loads the input sequences, calculates a distance for every pair (each pair once, including each
        sequence against itself) using self.info['Method'] and saves the matrix to
        SEQFILE_BASE.METHOD.EXT, as phylip or delimited text depending on self.info['OutMatrix'].
        Methods handled:
        - ds_prop / ds_id = position-pair comparisons weighted by (length - separation), minus the mean of
          the two within-sequence totals. (Indexes both sequences by the same positions, so presumably
          expects equal-length peptides.)
        - tot_prop = summed absolute difference in per-property totals.
        - pam = pam.pamML() of seq1 as ancestor and seq2 as descendant.
        - best_prop = minimum summed property difference over all circular shifts of seq2.
        Any other method leaves every distance at 0. Errors are logged and then re-raised.
        '''
        try:
            ### <0> ### Setup
            seqlist = rje_seq.SeqList(self.log, self.cmd_list + ['autoload=T'])
            dismatrix = rje_dismatrix.DisMatrix(self.log, self.cmd_list)
            method = self.info['Method']
            dismatrix.info['Name'] = method
            dismatrix.opt['Symmetric'] = True
            if method in ['ds_prop', 'tot_prop', 'best_prop']:
                aaprop = rje_aaprop.AAPropMatrix(self.log, self.cmd_list)
                #aaprop.readAAProp()
                aaprop.makePropDif()
            elif method == 'pam':
                pam = rje_pam.PamCtrl(log=self.log, cmd_list=self.cmd_list)
            ### <1> ### Make DisMatrix
            # Only the upper triangle (plus diagonal) is calculated - symmetrical! Positions come from
            # enumerate() rather than repeated list.index() searches for every pair.
            for (x1, seq1) in enumerate(seqlist.seq):
                for seq2 in seqlist.seq[x1:]:
                    dis = 0
                    if seq1 == seq2 and self.info['OutMatrix'] == 'phylip':
                        dis = 0  # Self-distance is fixed at zero for phylip output
                    elif method in ['ds_prop', 'ds_id']:
                        (self_dis1, self_dis2) = (0, 0)
                        for r1 in range(seq1.seqLen()):
                            for r2 in range(r1, seq2.seqLen()):
                                (a1, a2) = (seq1.info['Sequence'][r1],
                                            seq2.info['Sequence'][r2])
                                (s1, s2) = (seq1.info['Sequence'][r2],
                                            seq2.info['Sequence'][r1])
                                # Residue pairs that are closer together get a bigger weight
                                weight = seq1.seqLen() - (r2 - r1)
                                if method == 'ds_prop':
                                    dis += aaprop.pdif['%s%s' % (a1, a2)] * weight
                                    self_dis1 += aaprop.pdif['%s%s' % (a1, s1)] * weight
                                    self_dis2 += aaprop.pdif['%s%s' % (a2, s2)] * weight
                                else:  # ds_id: count mismatches only
                                    if a1 != a2: dis += weight
                                    if a1 != s1: self_dis1 += weight
                                    if a2 != s2: self_dis2 += weight
                        dis -= (self_dis1 + self_dis2) / 2.0
                    elif method == 'tot_prop':
                        proptot = {}
                        for prop in aaprop.prop.keys():
                            proptot[prop] = {seq1: 0.0, seq2: 0.0}
                        for seq in [seq1, seq2]:
                            for r in range(seq.seqLen()):
                                aa = seq.info['Sequence'][r]
                                for prop in aaprop.prop.keys():
                                    proptot[prop][seq] += float(aaprop.prop[prop][aa])
                        for prop in aaprop.prop.keys():
                            dis += abs(proptot[prop][seq1] - proptot[prop][seq2])
                    elif method == 'pam':
                        dis = pam.pamML(ancseq=seq1.info['Sequence'],
                                        descseq=seq2.info['Sequence'])
                    elif method == 'best_prop':
                        min_dis = seq1.seqLen() * len(aaprop.prop)  # Starting value for the minimum
                        pepseq1 = seq1.info['Sequence']
                        for c in range(seq1.seqLen()):  # Circular start
                            dis = 0
                            pepseq2 = seq2.info['Sequence'][c:] + seq2.info['Sequence'][:c]
                            for r in range(seq1.seqLen()):
                                (a1, a2) = (pepseq1[r], pepseq2[r])
                                dis += aaprop.pdif['%s%s' % (a1, a2)]
                            if dis < min_dis:
                                min_dis = dis
                        dis = min_dis
                    dismatrix.addDis(seq1, seq2, dis)
            ### <2> ### Output
            if self.info['OutMatrix'] == 'phylip':
                delimit = ' '
                outformat = 'phylip'
            else:
                delimit = rje.getDelimit(self.cmd_list, ',')
                outformat = 'None'
            outfile = '%s.%s.%s' % (rje.baseFile(seqlist.info['Name'], True),
                                    method, rje.delimitExt(delimit))
            dismatrix.saveMatrix(seqlist.seq, outfile, delimit, format=outformat)

        except:
            self.log.errorLog('Error in _pepDis',
                              printerror=True,
                              quitchoice=False)
            raise  # Delete this if method error not terrible
Example #30
0
    def slimJimMapping(self):  ### Generate SLiMJIM PNGs for all sequences
        '''
        Generate SpokeAln PNGs for all spokes.

        [1] Reads the input sequences and splits them into groups. A new group starts at each sequence
            whose name has 'Motifs' as its second whitespace-separated word. Groups are stored in a
            dictionary keyed by the short name of their first sequence.
        [2] For each group (sorted by key): labels the sequences, saves BASEFILE.aln.tdt and
            BASEFILE.rel.tdt, then runs R ("sfmap2png") and counts the PNG files made.
            Within a group, sequence 0 is labelled from its short name, sequence 1 is labelled 'Masked'
            and sequence 2 is used as the query.
        Errors for individual groups are logged and counted without stopping the run.
        '''
        try:  ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            mapseq = {}  # Dictionary of {dataset:[seqs]}
            scmd = ['autoload=T', 'seqnr=F', 'accnr=F', 'replacechar=F']
            mseq = rje_seq.SeqList(self.log, self.cmd_list +
                                   scmd)  #!# Removed ['minregion=3']+ #!#
            while mseq.seq:
                ## ~ [1a] ~ Read in all sequences for one spoke ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                pseq = [mseq.seq.pop(0)]  # Pseq = list of sequences for this protein
                while mseq.seq:
                    nextname = mseq.seq[0].info['Name']
                    if nextname.find('Motifs') > 0 and string.split(nextname)[1] == 'Motifs':
                        break  # Next protein
                    pseq.append(mseq.seq.pop(0))
                ## ~ [1b] ~ Update relevant sequence dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                mapseq[pseq[0].shortName()] = pseq[0:]
            self.printLog('#ALN',
                          '%d distinct alignments identified' % len(mapseq))
            ### ~ [2] ~ Make SLiMJIM visualisations for each protein  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ex = 0  # Number of errors
            for mapping in rje.sortKeys(mapseq):
                try:
                    ## ~ [2a] ~ Rename sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    # Select this mapping's sequences BEFORE reading any names from them. Previously
                    # basefile and qryname were taken from the group left over from the last iteration.
                    pseq = mapseq[mapping][0:]
                    basefile = pseq[0].shortName()
                    if self.interactive() > 0 and not rje.yesNo(basefile):
                        continue
                    qryname = pseq[2].shortName()
                    pseq[0].info['R'] = pseq[0].shortName()[len(qryname) + 1:]
                    pseq[1].info['R'] = 'Masked'
                    for seq in pseq[2:]:
                        seq.info['R'] = seq.info['ID']
                    ## ~ [2b] ~ Setup new SeqList, strip Query gaps, calculate RelCons ~~~~~~~~~~~~~~~~ ##
                    seqfile = '%s.aln.tdt' % basefile
                    if os.path.exists(seqfile): os.unlink(seqfile)
                    rseq = rje_seq.SeqList(
                        self.log, self.cmd_list + scmd + ['autoload=F'])
                    rseq.seq = pseq
                    rseq.obj['QuerySeq'] = pseq[2]
                    rseq.tidyQueryGaps()
                    rseq.saveR(rseq.seq, seqfile, name='R')
                    rseq.seq = pseq[2:]  # RelCons uses the query onwards only
                    relfile = '%s.rel.tdt' % basefile
                    if os.path.exists(relfile): os.unlink(relfile)
                    rseq.relCons(relfile)
                    self.deBug(rseq.obj['QuerySeq'].cmd_list)
                    ## ~ [2c] ~ Call R to generate graphics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    # NOTE(review): the command is run through the shell and basefile comes from a
                    # sequence name - confirm that names cannot contain quotes or shell metacharacters.
                    rcmd = '%s --no-restore --no-save --args "sfmap2png" "%s"' % (
                        self.info['RPath'], basefile)
                    rslimjim = '%srje.r' % self.info['Path']
                    rcmd += ' < "%s" > "%s.r.tmp.txt" 2>&1' % (rslimjim,
                                                               basefile)
                    self.printLog('#RSLIM', rcmd)
                    problems = os.popen(rcmd).read()
                    if problems: self.errorLog(problems, printerror=False)
                    pngx = len(glob.glob('%s*png' % basefile))
                    self.printLog(
                        '#PNG', '%d PNG files made for %s' % (pngx, basefile))
                    # The R output is kept for inspection if no PNG files were made
                    if pngx and os.path.exists('%s.r.tmp.txt' % basefile):
                        os.unlink('%s.r.tmp.txt' % basefile)
                except:
                    self.errorLog('SLiMJIM visualisation error for "%s"' %
                                  mapping)
                    ex += 1
            self.printLog('#SLIMJIM',
                          'Generation of SLiMJIMs complete. %d Problems.' % ex)

        except:
            self.errorLog(rje_zen.Zen().wisdom())