Example 1
 def _setupOutput(self): ### Sets up output files self.str['MapFas','MissFas','MapRes']
     '''Sets up output files self.str['MapFas','MissFas','MapRes'].'''
     ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     delimit = rje.getDelimit(self.cmd_list)
     if self.str['StartFrom'].lower() in ['','none']: self.str['StartFrom'] = ''
     else:
         self.bool['Append'] = True
         self.printLog('#CMD','StartFrom = "%s" so Append=T' % self.str['StartFrom'])
     ### ~ [1] General ResFile ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     files = {'MapFas':'mapping.fas','MissFas':'missing.fas','MapRes':'mapping.%s' % rje.delimitExt(delimit)}
     if self.getBool('Combine'): files.pop('MissFas')
     if self.str['ResFile'].lower() in ['','none']:
         self.str['ResFile'] = '%s.%s' % (rje.baseFile(self.str['SeqIn']),rje.baseFile(self.str['MapDB'],strip_path=True))
     for file in files.keys():
         self.setStr({file: self.getStr('ResFile') + '.' + files[file]})
         rje.backup(self,self.getStr(file))
     ### ~ [2] Headers for MapRes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     #!# Consider replacing with rje_db object? #!#
     self.list['Headers'] = ['Query','Hit','Method','MapRank','BlastRank','EVal','Score']
     for qh in ['Query','Hit']:
         self.list['Headers'] += ['%s_Species' % qh]
         if self.bool['GablamOut']:
             for st in ['Len','Sim','ID']:
                 self.list['Headers'] += ['%s_%s' % (qh,st)]
     rje.delimitedFileOutput(self,self.str['MapRes'],self.list['Headers'],delimit)
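The pattern above derives a results basename from the two input files, backs up any existing output, and writes a delimited header row. A minimal standalone sketch of the same steps, with invented file names and a plain string join standing in for rje.delimitedFileOutput():

    import os, shutil

    def setup_results(seqin, mapdb, delimit='\t'):
        # Basename combines the query file with the path-stripped mapping database name.
        base = '%s.%s' % (os.path.splitext(seqin)[0],
                          os.path.splitext(os.path.basename(mapdb))[0])
        resfile = base + '.mapping.tdt'
        if os.path.exists(resfile):            # crude stand-in for rje.backup()
            shutil.move(resfile, resfile + '.bak')
        headers = ['Query', 'Hit', 'Method', 'MapRank', 'BlastRank', 'EVal', 'Score']
        with open(resfile, 'w') as handle:
            handle.write(delimit.join(headers) + '\n')
        return resfile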
Example 2
 def blast2fas(self):    ### Executes BLAST2FAS and copies results files
     '''Executes BLAST2FAS and copies results files.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         need2blast = self.opt['Force']
         null_file = '%s.blast2fas_null.txt' % self.baseFile(); nx = 0; null_list = []
         if os.path.exists(null_file): null_list = string.split(open(null_file,'r').read(),'\n')
         self.debug(null_file)
         for seq in self.seqs():
             if seq.info['AccNum'] in null_list: nx += 1; continue
             hfile = rje.makePath('%s%s.fas' % (self.info['HaqDir'],seq.info['AccNum']),wholepath=True)
             for db in self.obj['SeqList'].list['Blast2Fas']:
                 self.debug(rje.isYounger(hfile,db))
                 self.debug(rje.isYounger(hfile,db) == hfile)
                 need2blast = need2blast or not rje.isYounger(hfile,db) == hfile
         if not need2blast:
             self.printLog('#BLAST','All HAQESAC input files found (%s w/o BLAST hits) - no BLAST2Fas (force=F)' % nx)
             return False
         ### ~ [2] Execute ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         rje.backup(self,null_file); nx = 0
         if self.getInt('MultiCut'): self.obj['SeqList'].cmd_list += ['blastb=%d' % self.getInt('MultiCut'),'blastv=%d' % self.getInt('MultiCut')]
         elif self.getInt('BlastCut'): self.obj['SeqList'].cmd_list += ['blastb=%d' % self.getInt('BlastCut'),'blastv=%d' % self.getInt('BlastCut')]
         if self.getInt('Forks'): self.obj['SeqList'].cmd_list += ['blasta=%d' % self.getInt('Forks')]
         rje_seq.Blast2Fas(self.obj['SeqList'],self.getStr('HAQBLASTDir'))
         for seq in self.seqs():
             sbfile = '%s%s.blast.fas' % (self.getStr('HAQBLASTDir'),seq.info['AccNum'])
             if os.path.exists(sbfile):
                 hfile = rje.makePath('%s%s.fas' % (self.info['HaqDir'],seq.info['AccNum']),wholepath=True)
                 os.rename(sbfile,hfile)
                 if os.path.exists('%s.pickle' % rje.baseFile(hfile)): os.unlink('%s.pickle' % rje.baseFile(hfile))
                 if os.path.exists('%s.pickle.gz' % rje.baseFile(hfile)): os.unlink('%s.pickle.gz' % rje.baseFile(hfile))
             else: open(null_file,'a').write('%s\n' % seq.info['AccNum']); nx += 1
         if nx: self.printLog('#BLAST','%s Accession Numbers without BLAST2Fas hits output to %s' % (nx,null_file))
         self.printLog('#BLAST','%s HAQESAC input files made using BLAST2Fas' % (self.seqNum()-nx))
         return True
     except: self.errorLog('Major problem with MultiHAQ.blast2fas'); raise
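The need2blast test relies on rje.isYounger() returning the younger of two files, i.e. the run is skipped only when every per-query output is newer than every database. A rough standalone equivalent using file modification times (function and argument names are illustrative):

    import os

    def needs_rerun(outfile, inputs, force=False):
        # Rerun if the output is missing, forced, or older than any of its inputs.
        if force or not os.path.exists(outfile):
            return True
        out_time = os.path.getmtime(outfile)
        return any(os.path.getmtime(src) > out_time
                   for src in inputs if os.path.exists(src))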
Example 3
 def setup(self):    ### Main class setup method. Makes sumfile if necessary.
     '''Main class setup method. Makes sumfile if necessary.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.debug(self.getStrLC('SumFile')); self.debug(self.getStr('SumFile'))
         if self.getStrLC('Basefile') in ['','none']: self.baseFile(rje.baseFile(self.info['SumFile']))
         if self.getStrLC('SumFile') in ['','none']: self.info['SumFile'] = '%s.tdt' % self.basefile()
         self.printLog('#SUM','Summary file: %s' % self.getStr('SumFile'))
         if os.path.exists(self.info['SumFile']) and not self.opt['Force']:
             if rje.yesNo('%s found. Use these results?' % self.info['SumFile']):
                 return self.printLog('#SUM','Summary results file found. No MASCOT processing.')
         mapgi = False
         ### ~ [2] Process MASCOT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for mfile in self.list['ResFiles']:
             bud = budapest.Budapest(self.log,self.cmd_list+['mascot=%s' % mfile])
             bud.info['Name'] = mfile
             bud.readMascot()
             self.dict['Searches'][mfile] = bud.dict['Hits']
             protacclist = rje.sortKeys(bud.dict['Hits'])
             for protacc in protacclist:
                 if rje.matchExp('gi\|(\d+)',protacc): mapgi = True
             accfile = '%s.%s.protacc' % (self.baseFile(),rje.baseFile(mfile))
             self.debug(accfile)
             open(accfile,'w').write(string.join(protacclist,'\n'))
             self.printLog('#MFILE','%s: %s proteins.' % (mfile,rje.iLen(protacclist)))
         ## ~ [2a] gi Mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         #if mapgi:
         #    mapgi = self.dict['MapGI'] = seqlist.seqNameDic('NCBI')
         #    open('mapgi.tmp','w').write(string.join(rje.sortKeys(mapgi),'\n'))
         ### ~ [3] Setup seqlist ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqlist = rje_seq.SeqList(self.log,['gnspacc=T']+self.cmd_list)
         self.dict['Acc2Seq'] = seqlist.seqNameDic('Max')
         ### ~ [4] Generate Summary File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         sumhead = string.split('search,prot_hit_num,prot_acc,prot_desc,pep_seq',',')
         rje.delimitedFileOutput(self,self.info['SumFile'],sumhead,rje_backup=True)
         for mfile in rje.sortKeys(self.dict['Searches']):
             bud = self.dict['Searches'][mfile]
             for protacc in rje.sortKeys(bud)[0:]:
                 protname = bud[protacc]['prot_acc']
                 protdesc = bud[protacc]['prot_desc']
                 if rje.matchExp('gi\|(\d+)',protacc):
                     gi = rje.matchExp('gi\|(\d+)',protacc)[0]
                     try:
                         protname = self.dict['Acc2Seq'][gi].shortName()
                         protdesc = self.dict['Acc2Seq'][gi].info['Description']
                     except: protname = 'gi_UNK__%s' % gi
                 #x#print protname, protdesc, bud[protacc]
                 for pep in bud[protacc]['Peptides']:
                     data = {'search':rje.baseFile(mfile,True),'prot_desc':protdesc,'prot_acc':protname,
                             'pep_seq':pep,'prot_hit_num':bud[protacc]['prot_hit_num']}
                     rje.delimitedFileOutput(self,self.info['SumFile'],sumhead,datadict=data)
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
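The gi mapping above hinges on pulling the numeric identifier out of NCBI-style accessions with the pattern gi\|(\d+). A tiny sketch with the standard re module (the example accession is made up):

    import re

    GI_RE = re.compile(r'gi\|(\d+)')

    def gi_number(protacc):
        # Return the GI number from an NCBI-style accession, or None if absent.
        match = GI_RE.search(protacc)
        return match.group(1) if match else None

    print(gi_number('gi|71834|ref|NP_000001.1|'))   # -> '71834'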
Example 4
 def run(self):  ### Main run method
     '''Main run method.'''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.setup()
         if self.getBool('TaxTable'): self.setBool({'BatchMode':True})
         ### ~ [2] ~ Single Mode ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not self.getBool('BatchMode'): return self.mapTaxa(self.list['TaxIn'],self.list['TaxOut'],self.getBool('NodeOnly'),self.getBool('RankOnly'))
         ### ~ [3] ~ Batch Mode ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self.getBool('TaxTable'):
             tdb = self.db().addEmptyTable('taxtable',['TaxIn']+self.list['TaxOut'],['TaxIn'])
         basefile = self.baseFile()
         for taxa in self.list['TaxIn'][0:]:
             self._cmdReadList('taxin=%s' % taxa,'list',['TaxIn'])  # List of strings (split on commas or file lines)
             self.setBaseFile('%s.%s' % (basefile,rje.baseFile(taxa,strip_path=True)))
             taxdict = self.mapTaxa(self.list['TaxIn'],self.list['TaxOut'],self.getBool('NodeOnly'),self.getBool('RankOnly'),savetaxout=not self.getBool('TaxTable'))
             if self.getBool('TaxTable'):
                 tentry =  {'TaxIn':taxa}
                 for tfield in taxdict: tentry[tfield] = string.join(taxdict[tfield],'|')
                 tdb.addEntry(tentry)
         self.baseFile(basefile)
         if self.getBool('TaxTable'): tdb.saveToFile()
         return True
     except:
         self.errorLog(self.zen())
         raise   # Delete this if method error not terrible
Example 5
 def setup(self):    ### Main class setup method.
     '''Main class setup method.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.db().addTable(filename=self.getStr('TDTFile'),mainkeys=self.list['TDTFields'],name='input',expect=True)
         if not self.baseFile(return_none=None): self.baseFile(rje.baseFile(self.getStr('TDTFile')))
         return True     # Setup successful
     except: self.errorLog('Problem during %s setup.' % self.prog()); return False  # Setup failed
Example 6
 def legacySetup(self):    ### Main class setup method.
     '''Main class setup method.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         db = self.db()
         ## Set Basefile
         if not self.basefile(return_none=None): self.basefile(rje.baseFile(self.getStr('OccFile')))
         tabkeys = {'OccFile':['dataset','runid','motif','seq','start_pos','end_pos','variant'],
                    'DomFile':['domain','uniprot'],
                    'DMIFile':['motif','domain'],
                    'PPIFile':['hub','spoke']}
         ## Load Tables
         for dfile in ['DomFile','DMIFile','OccFile','PPIFile']:
             dbtable = db.addTable(self.getStr(dfile),mainkeys=tabkeys[dfile],name=dfile,expect=True,replace=False,uselower=True)
             self.tidyMotifNames(dbtable)
             if dfile == 'OccFile':
                 #dbtable.addField('uniprot')
                 dbtable.addField('gene')
                 for entry in dbtable.entries():
                     #entry['uniprot'] = string.split(entry['seq'],'_')[-1]  # Don't want this: uniprot is spoke!
                     entry['gene'] = string.split(entry['seq'],'_')[0]
             elif dfile == 'DomFile':
                 dbtable.compress(['domain','uniprot'],default='str')
                 dbtable.keepFields(['domain','uniprot'])
             elif dfile == 'DMIFile':
                 dbtable.compress(['motif','domain'],default='str')
                 dbtable.keepFields(['motif','domain'])
         return True     # Setup successful
     except: self.errorLog('Problem during %s setup.' % self.prog()); return False  # Setup failed
Example 7
 def setup(self):    ### Main class setup method.
     '''Main class setup method.'''
     try:### ~ [0] Setup File names etc. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self.getStr('SaveDis').lower() in ['','none']:
             base = 'peptides'
             if rje.checkForFile(self.getStr('Peptides')): base = rje.baseFile(self.getStr('Peptides'))
             if self.baseFile().lower() not in ['','none']: base = self.baseFile()
             self.baseFile(base)
             self.setStr({'SaveDis':'%s.%s.%s' % (base,self.getStr('PeptDis'),self.getStr('PeptCluster'))})
         if self.getStr('OutMatrix') in ['tdt','csv','png','phylip']: self.str['SaveDis'] += '.%s' % self.getStr('OutMatrix')[:3]
         else: self.str['SaveDis'] += '.txt'
         self.dict['Output']['peptides'] = string.join(self.list['Peptides'],'\n')
         ### ~ [1] Setup Distance Matrix ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.obj['AADis'] = rje_dismatrix.DisMatrix(self.log,['nsf2nwk=T']+self.cmd_list)
         self.obj['AADis'].info['Name'] = 'Pairwise AA distances'
         self.obj['PeptDis'] = rje_dismatrix.DisMatrix(self.log,['nsf2nwk=T']+self.cmd_list)
         self.obj['PeptDis'].info['Name'] = 'Pairwise peptide distances'
         ### ~ [2] Optional loading of AA Distance Matrix ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self.getStr('AADis').lower() not in ['','none']: self.obj['AADis'].loadMatrix(self.getStr('AADis'))
         else:
             self.obj['AAProp'] = aaprop = rje_aaprop.AAPropMatrix(self.log,self.cmd_list)
             #aaprop.readAAProp()    # Does this on loading!
             for aa in aaprop.pdif: self.obj['AADis'].addDis(aa[0],aa[1],aaprop.pdif[aa])
         return True
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
Example 8
 def run(self,imenu=False,outputmap=True,returndict=False):      ### Main controlling run Method
     '''
     Main controlling run Method.
     >> imenu:boolean = Whether to initiate interactive menu if appropriate [False].
     >> outputmap:boolean = Whether to output mapping into a file [True]
     >> returndict:boolean = Whether to return a dictionary of {searchname:mappedname} (no previous mapping) [False]
     '''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not self.setup(imenu): raise ValueError
         seqlist = rje_seqlist.SeqList(self.log,self.cmd_list+['autoload=T','seqmode=file'])
         if not seqlist.seqNum(): self.warnLog('No sequences loaded for mapping.'); return {}
         ## ~ [0a] Setup BLAST Search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         blast = rje_blast.BLASTRun(self.log,['blaste=1e-4','blastv=20','blastf=F']+self.cmd_list+['v=-1'])
         blast.setStr({'DBase':self.getStr('MapDB'),'Type':'blastp','InFile':self.getStr('SeqIn'),
                      'Name':'%s-%s.blast' % (rje.baseFile(self.str['SeqIn'],True),rje.baseFile(self.str['MapDB'],True))})  
         blast.setStat({'HitAln':blast.getStat('OneLine')})
         blast.list['ResTab'] = ['Search','Hit','GABLAM']
         if seqlist.nt(): blast.str['Type'] = 'blastx'
         ## ~ [0b] Setup Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if outputmap: self._setupOutput()                           ## Output Files ##
         if returndict: mapdict = {}
         else: self._setupMapped()                                   ## Previously Mapped Sequences ##
         seqx = seqlist.seqNum()             ## Number of sequences ##
         ### ~ [1] BLAST Search Mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.printLog('#BLAST','BLASTing %s vs %s.\n *** This could take some time if files are large. Please be patient! ***' % (self.str['SeqIn'],self.str['MapDB']),log=False)
         ## ~ [1a] Perform BLAST Unless it exists ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         blast.run(format=True)
         self.obj['DB'] = blast.obj['DB']
         ## ~ [1b] Mapping from searches ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         self.debug(self.getStr('MapDB'))
         self.obj['MapDB'] = rje_seqlist.SeqList(self.log,self.cmd_list+['autoload=F','seqmode=file','seqin=%s' % self.str['MapDB']])
         self.obj['MapDB'].loadSeq(self.getStr('MapDB'))
         self.debug('%s' % self.obj['MapDB'].list['Seq'])
         sx = 0
         while seqlist.nextSeq() != None:
             search = seqlist.getSeq(format='short')
             sx += 1
             ## Check StartFrom ##
             if self.str['StartFrom']:
                 if self.str['StartFrom'] != search:
                     self.progLog('\r#SKIP','Looking for %s: skipping %d seqs' % (self.str['StartFrom'],sx))
                     continue
                 self.str['StartFrom'] = ''
                 self.printLog('\r#SKIP','Starting from %s: skipped %d seqs' % (search,sx))
             ## Check if in Mapped ##
             if search in self.list['Mapped']:
                 resdict = {'Query':search,'Hit':search,'Method':'Already Mapped!'}
                 self.printLog('#FAS','%s already in output - not duplicating in %s' % (search,self.str['MapFas']))
                 rje.delimitedFileOutput(self,self.str['MapRes'],self.list['Headers'],rje.getDelimit(self.cmd_list),resdict)
                 continue
             ### Map Sequence ###
             self.printLog('#MAP','Mapping %s seqs: %s of %s' % (self.str['SeqIn'],rje.integerString(sx),rje.integerString(seqx)))
             mapname = self.mapSeq(seqlist,blast,search)
             if returndict: mapdict[search] = mapname
         ### ~ [2] Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.printLog('#MAP','Mapping of %s (%s seqs) complete.' % (self.str['SeqIn'],rje.integerString(seqx)))           
         if os.path.exists(blast.str['Name']) and not (self.getBool('DeBug') or self.test()): os.unlink(blast.str['Name'])     #!# Add option to keep BLAST! #!#
         if returndict: return mapdict
     except: self.errorLog('Error in SeqMapper.run()',printerror=True,quitchoice=True); raise   
Example 9
    def combineSNPs(self):  ### Calculates statistics of genetic differences from parsed PileUp Tables
        '''Calculates statistics of genetic differences from parsed PileUp Tables.'''
        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if not self.list['SNPTables']: self.printLog('\r#SNP','No SNP tables to add.'); return False
            fdb = self.db().addTable(name='fdr',expect=True,mainkeys=['Locus','Pos'])
            fdb.remakeKeys()   #!# Delete once tuple thing OK
            fdbkeys = fdb.dataKeys()
            self.debug(fdbkeys[:100])
            snps = []
            snppos = []
            for snptable in self.list['SNPTables']:
                snps.append(self.db().addTable(snptable,name=rje.baseFile(snptable,True),expect=True,mainkeys=['Locus','Pos']))
                snps[-1].addField('SNP',evalue="YES")
                self.debug(snps[-1].dataKeys()[:100])
                snps[-1].remakeKeys()   #!# Delete once tuple thing OK
                self.debug(snps[-1].dataKeys()[:100])
                px = 0; ptot = snps[-1].entryNum(); sx = 0
                for pos in snps[-1].dataKeys(): # This should be a (Locus,Pos) tuple
                    self.progLog('\r#SNP','Scanning %s for extra SNP positions: %.2f%%' % (snps[-1].name(),px/ptot)); px += 100.0
                    if pos not in snppos + fdbkeys: snppos.append(pos); sx += 1
                self.printLog('\r#SNP','Scanned %s for extra SNP positions: %s to add.' % (snps[-1].name(),rje.iStr(sx)))
            ## ~ [0a] Add missing data from other tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if snppos:
                SAMSIG = open('%s.pdiff.tdt' % self.baseFile(),'r'); px = 0; ptot = len(snppos); ix = 0
                fline = SAMSIG.readline(); headers = rje.readDelimit(fline)
                fline = SAMSIG.readline()
                self.progLog('\r#SNP','%s/%s SNP positions added from %s PDiff filelines.' % (rje.iStr(px),rje.iStr(ptot),rje.iStr(ix)))
                while fline:
                    data = rje.readDelimit(fline); ix += 1
                    if (data[0],data[1]) in snppos:
                        entry = {'p.FDR':'-'}
                        for i in range(len(data)): entry[headers[i]] = data[i]
                        fdb.addEntry(entry); px += 1
                        snppos.remove((data[0],data[1]))
                        self.progLog('\r#SNP','%s/%s SNP positions added from %s PDiff filelines.' % (rje.iStr(px),rje.iStr(ptot),rje.iStr(ix)))
                    else: self.progLog('\r#SNP','%s/%s SNP positions added from %s PDiff filelines.' % (rje.iStr(px),rje.iStr(ptot),rje.iStr(ix)))
                    if not snppos: break
                    fline = SAMSIG.readline()
                SAMSIG.close()
                self.printLog('\r#SNP','%s/%s SNP positions added from PDiff file.' % (rje.iStr(px),rje.iStr(ptot)))
            else: self.printLog('\r#SNP','No SNP positions to add.'); return False

            ### ~ [1] Join Tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            temp = fdb
            temp.makeField('#Locus#|#Pos#')
            for snptable in snps:
                snptable.makeField('#Locus#|#Pos#')
                newtemp = self.db().joinTables(name='newtemp',join=[(temp,'#Locus#|#Pos#'),(snptable,'#Locus#|#Pos#',['SNP'])],newkey=['Locus','Pos'],keeptable=True)
                self.printLog('#SNP','Added SNPs from %s' % snptable.name())
                self.db().deleteTable(temp)
                temp = newtemp
                temp.renameField('SNP',snptable.name())
                temp.setStr({'Name':'temp'})
            temp.dropField('#Locus#|#Pos#')
            self.db().list['Tables'].append(temp)
            temp.setStr({'Name':'SNPs'})
            temp.saveToFile()
            return temp
        except: self.errorLog('%s.combineSNPs() error' % (self)); return None
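The joins above amount to adding one 'YES' flag column per SNP table to entries keyed on (Locus, Pos). A dictionary-based sketch of that merge, assuming the tables have already been loaded into plain dicts (names and structures are illustrative, not the rje_db API):

    def flag_snps(fdr, snp_tables):
        # fdr: {(locus, pos): entry dict}; snp_tables: {table name: set of (locus, pos)}.
        # Adds one 'YES'/'' column per SNP table to each FDR entry.
        for name, positions in snp_tables.items():
            for key, entry in fdr.items():
                entry[name] = 'YES' if key in positions else ''
        return fdr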
Example 10
 def batchRun(self,returnobj=False):     ### Execute batch mode runs
     '''Execute batch mode runs.'''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         barg = self.getStrLC('BatchArg')
         if not barg: raise ValueError('Cannot use batchrun=FILELIST if batcharg=None.')
         batchfiles = self.list['BatchRun'][0:]
         self.list['BatchRun'] = []  # Avoid recursive running!
         blog = self.getStr('BatchLog')
         if not blog.startswith('.'): blog = '.%s' % blog
         if not blog.endswith('.log'): blog = '%s.log' % blog
         rawcmd = self.cmd_list[0:]
         rawlog = self.log
         batchobj = []
         ### ~ [1] Batch Run ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         bx = 0
         for bfile in batchfiles:
             bx += 1
             self.printLog('#BATCH','Batch running %s of %s: %s=%s' % (rje.iStr(bx),rje.iLen(batchfiles),barg,bfile))
             ## Setup parameters
             bbase = rje.baseFile(bfile,strip_path=True)
             bcmd = ['%s=%s' % (barg,bfile)]
             if self.getBool('BatchBase'):
                 if blog == '.log': bcmd += ['basefile=%s' % bbase]
                 else: bcmd += ['basefile=%s%s' % (bbase,rje.baseFile(blog))]
             elif self.getStrLC('BatchLog'): bcmd += ['log=%s%s' % (bbase,blog)]
             else: bcmd += ['newlog=F']
             #self.debug(bcmd)
             ## Setup Seqsuite object
             self.cmd_list = rawcmd + bcmd
             self.log = rje.setLog(self.log.obj['Info'],self,self.cmd_list)                 # Sets up Log object for controlling log file output
             ## Run
             batchobj.append(self.run())
             ## Finish and Tidy
             self.log = rawlog
             runobj =  batchobj[-1]
             if runobj:
                 if not returnobj: batchobj[-1] = True
                 info = runobj.log.obj['Info']
                 self.printLog('#RUN','%s V%s run finished.' % (info.program,info.version))
             else: self.warnLog('Batch run failed (%s=%s).' % (barg,bfile))
         ### ~ [2] Finish and Return ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         failx = batchobj.count(False)
         self.printLog('#BATCH','%s batch runs complete: %s failed.' % (rje.iLen(batchfiles),rje.iStr(failx)))
         self.list['BatchRun'] = batchfiles
         return batchobj
     except: self.errorLog('%s.batchRun error' % self); return False
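Each batch iteration boils down to deriving a per-file basename and appending file-specific command strings before re-running. A small sketch of just that naming step, independent of the rje command system (argument names are illustrative):

    import os

    def batch_commands(batchfiles, batcharg='seqin', batchlog='.batch.log'):
        # Yield (file, extra_commands) pairs mirroring the per-file setup above.
        if not batchlog.startswith('.'):
            batchlog = '.' + batchlog
        if not batchlog.endswith('.log'):
            batchlog += '.log'
        for bfile in batchfiles:
            bbase = os.path.splitext(os.path.basename(bfile))[0]
            yield bfile, ['%s=%s' % (batcharg, bfile), 'log=%s%s' % (bbase, batchlog)]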
Example 11
 def setup(self):    ### Main class setup method.
     '''Main class setup method.'''
     try:### ~ [1] Setup Database ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.obj['DB'] = rje_db.Database(self.log,self.cmd_list)
         db = self.db().addEmptyTable('ProDigIS',['AccNum','Protease','PepCount'],['AccNum','Protease'])
         if self.getInt('MinPepLen') > 0: db.addField('MinPepLen')
         if self.getBool('NRPep'): db.addField('NRPep')
         if rje.exists(self.getStr('Source')):
             fdb = self.db().addTable(self.getStr('Source'),mainkeys=['AccNum'],name='Source')
             fdb.addField('File')
             fdb.addField('ProtMWt')
         else: fdb = self.db().addEmptyTable('Source',['AccNum','File','ProtMWt'],['AccNum'])
         for i in range(1,self.getInt('MaxPepLen')+1): db.addField(i)
         if self.getBool('PepMWt'):
             for i in range(1,self.getInt('MaxPepLen')+1): db.addField(i*100.0)
         ### ~ [2] Load Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.obj['SeqList'] = rje_seq.SeqList(self.log,self.cmd_list+['seqin=None','autoload=F'])
         self.obj['SeqList'].seq = fullseq = []
         for seqfile in self.list['SeqFiles']:
             file = rje.baseFile(seqfile,True)
             seqlist = rje_seq.SeqList(self.log,['autofilter=T','gnspacc=T','seqnr=F']+self.cmd_list+['seqin=%s' % seqfile,'autoload=T'])
             fullseq += seqlist.seqs()
             for seq in seqlist.seqs():
                 accnum = seq.getStr('AccNum')
                 try:
                     entry = fdb.data()[accnum]
                     if 'File' in entry and entry['File']: self.errorLog('%s found in %s AND %s!' % (accnum,entry['File'],file),printerror=False)
                     entry['File'] = file
                     entry['ProtMWt'] = seq.MWt()
                 except:
                     entry = {'AccNum':accnum,'File':file,'ProtMWt':seq.MWt()}
                     fdb.addEntry(entry)
                 self.deBug(fdb.dict['Data'][seq.getStr('AccNum')])
         self.printLog('#SEQ','%s sequences to analyse in total' % rje.iLen(fullseq))
         fdb.fillBlanks()
         ### ~ [3] Setup Peptide Probabilities ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self._peptideProbabilities():
             db.addField('LenExp','PepCount');
             if self.getBool('PepMWt'): db.addField('MWtExp','LenExp'); db.addField('Len7Exp','MWtExp')
             else: db.addField('Len7Exp','LenExp')
             db.addField('Len37','Len7Exp')
             if self.getBool('PepMWt'):
                 db.addField('Len5','MWtExp'); db.addField('MWt5','Len5')
                 db.addField('Len3','MWtExp'); db.addField('MWt3','Len3')
             else: db.addField('Len5','LenExp'); db.addField('Len3','LenExp')
         return
         ### ~ [4] Temp GABLAM Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         gdb = self.db().addTable('Chlam_Pos.vs.embl_bacteria.hitsum.tdt',['Qry'],name='GABLAM')
         ndb = self.db().addTable('Chlam_Neg.vs.embl_bacteria.hitsum.tdt',['Qry'],name='GNeg')
         self.db().mergeTables(gdb,ndb,overwrite=True,matchfields=True)
         gdb.renameField('Qry','AccNum')
         tmp = self.db().joinTables(name='blast',join=[('Source','AccNum'),('GABLAM','AccNum')],newkey=['AccNum','File'],keeptable=False)
         tmp.saveToFile()
         tmp.compress(['File'],default='mean')
         tmp.dropFields(['AccNum'])
         tmp.info['Name'] = 'blastsum'
         tmp.saveToFile()
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
Example 12
 def run(self,batch=False):  ### Main run method
     '''Main run method.'''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [1a] ~ Results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if not batch: self.setupResults()
         ## ~ [1b] ~ Batch run ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if not batch and not self.obj['SeqList'].seqs():    ### Look for batch files and run for each
             batchfiles = rje.getFileList(self,filelist=self.list['Batch'],subfolders=False,summary=True,filecount=0)
             self.printLog('\r#FILES','Getting files: %5s files for batch run' % rje.integerString(len(batchfiles)))
             if not batchfiles: self.errorLog('No input files found!',printerror=False)
             else:
                 bx = 0
                 for infile in batchfiles:
                     bx += 1
                     self.printLog('#BATCH','Batch running %s' % infile)
                     bcmd = ['query=1']+self.cmd_list+['autoload=T','seqin=%s' % infile]
                     self.obj['SeqList'] = rje_seq.SeqList(self.log,bcmd)
                     self.run(batch=True)
                     self.opt['Append'] = True
                     self.printLog('#BATCH','|---------- %s run <<<|>>> %s to go -----------|' % (rje.integerString(bx),rje.integerString(len(batchfiles)-bx)),log=False)
             if self.opt['Win32'] and len(sys.argv) < 2: self.verbose(0,0,'Finished!',1) # Optional pause for win32
             return
         ## ~ [1c] ~ Special run options ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if self.info['Special'].lower() == 'allbyall':
             self.printLog('#RUN','Performing special "all-by-all" pairwise run')
             self.info['Special'] = ''
             for i in range(len(self.seqs())-1):
                 self.obj['SeqList'].obj['QuerySeq'] = self.seqs()[i]
                 for j in range(i+1,len(self.seqs())):
                     self.info['Fitness'] = self.info['Phenotype'] = '%d' % (j + 1)
                     self.run(batch=True)
                     self.opt['Append'] = True
             self.info['Special'] = 'allbyall'; return                
         ## ~ [1d] ~ General setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         self.setup()
         ### ~ [2] ~ Price calculations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.fitness()
         self.phenotype()
         self.grouping()
         for vector in ['Fitness','Phenotype','SeqGroup']:
             if len(self.list[vector]) != self.qry().seqLen():
                 self.errorLog('%s vector length (%s) does not match %s sequence length (%s)' % (vector,len(self.list[vector]),self.qry().shortName(),self.qry().seqLen()),printerror=False)
                 raise ValueError
         results = self.price()
         ### ~ [3] ~ Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         results['Dataset'] = rje.baseFile(self.obj['SeqList'].info['Name'],True)
         results['Query'] = self.qry().shortName()
         results['Fitness'] = self.info['Fmethod']
         results['Phenotype'] = self.info['Pmethod']
         results['SeqGroup'] = self.info['SeqGroup']
         rje.delimitedFileOutput(self,self.info['ResFile'],self.list['Headers'],datadict=results)
         self.printLog('#OUT','Results output to %s' % self.info['ResFile'])
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise   # Delete this if method error not terrible
Example 13
 def parse(self):    ### Parse REST file into dictionaries
     '''Parse REST file into dictionaries.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.list['RestKeys'] = []
         rbase = '%s%s' % (self.getStr('RestOutDir'),rje.baseFile(self.getStr('RestBase'),strip_path=True,keepext=True))
         if rje.exists(self.getStr('RestIn')): restin = open(self.getStr('RestIn'),'r').read()
         elif rje.matchExp('^(\d+)$',self.getStr('RestIn')):
             url = '%sretrieve&jobid=%s&password=%s' % (self.getStr('RestURL'),self.getStr('RestIn'),self.getStr('Password'))
             if self.getBool('PureAPI') and self.getStrLC('Rest'): url += '&rest=%s' % (self.getStr('Rest'))
             else: url += '&rest=full'
             restin = urllib2.urlopen(url).read()
             if self.getBool('PureAPI'): return restin
         else: raise IOError('%s not found!' % self.getStr('RestIn'))
         jobid = None
         ### ~ [2] Parse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for restdata in string.split(restin,'###~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~###\n'):
             if not jobid:
                 self.dict['Output']['intro'] = restdata
                 prog = rje.matchExp('Output for (\S+)',restdata)[0]
                 self.dict['Output']['prog'] = prog
                 jobid = rje.matchExp('JobID: (\d+)',restdata)[0]
                 self.dict['Output']['jobid'] = jobid
                 if not self.getStrLC('RestBase'): rbase = '%s%s' % (self.getStr('RestOutDir'),jobid)
                 self.dict['Outfile']['jobid'] =  '%s.jobid' % (rbase)
                 continue
             restlines = string.split(restdata,'\n')
             rparse = string.split(restlines.pop(0))
             if rparse[0] != '#': self.errorLog('REST output format error: %s' % string.join(rparse),printerror=False); continue
             if rparse[1][-1] != ':': self.errorLog('REST output format error: %s' % string.join(rparse),printerror=False); continue
             rkey = rparse[1][:-1]
             try:
                 rfile = '%s.%s' % (rbase,rje.baseFile(rparse[2],strip_path=True,keepext=True))
             except: rfile = ''
             if not rfile: rfile = '%s.%s' % (rbase,rkey)
             rfile = string.replace(rfile,'%s.%s.' % (jobid,jobid),'%s.' % jobid)
             self.dict['Output'][rkey] = string.join(restlines,'\n')
             self.dict['Outfile'][rkey] = rfile
             self.list['RestKeys'].append(rkey)
         self.printLog('#PARSE','Parsed %s: %d REST outputs.' % (self.getStr('RestIn'),len(self.dict['Output'])))
         return True
     except: self.errorLog('%s.parse error' % self); return False
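The parser splits one concatenated REST dump on a fixed delimiter line and reads a '# key: filename' header from each chunk. A compact standalone sketch of that splitting step (delimiter and header format taken from the code above; any sample text would be invented):

    DELIMIT = '###~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~###\n'

    def split_rest(text):
        # Return {key: block} for each '# key: filename' section after the intro block.
        chunks = text.split(DELIMIT)
        output = {'intro': chunks[0]}
        for chunk in chunks[1:]:
            lines = chunk.split('\n')
            header = lines.pop(0).split()          # e.g. ['#', 'seqin:', 'input.fas']
            if len(header) < 2 or header[0] != '#' or not header[1].endswith(':'):
                continue                            # malformed section header
            output[header[1][:-1]] = '\n'.join(lines)
        return output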
Example 14
 def hmmSearch(self,hmm,dbase=None,outfile=None,wait=True):    ### Performs HMMer Search using object attributes
     '''
     Performs HMMer Search using object attributes.
     >> hmm:str = Name of HMM file 
     >> dbase:str = Name of DBase file [self.info['SearchDB']]
     >> outfile:str = Name of Output file file [self.info['HMMOut']]
     >> wait:boolean  = whether to wait for HMMer. [True]
     << returns outfile or None if fails
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [1a] ~ Input files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if not rje.checkForFile(hmm): self.printLog('#ERR','HMM file %s is missing!' % hmm); return None
         if not dbase: dbase = self.info['SearchDB']
         if not rje.checkForFile(dbase): self.printLog('#ERR','Database file "%s" is missing!' % dbase); return None
         ## ~ [1b] ~ Output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if not outfile or outfile.lower() in ['','none']:       # Make an outfile per search
             outfile = '%s.%s.hmmer' % (rje.baseFile(hmm,True),rje.baseFile(dbase,True))
             resfile = outfile
             if not os.path.exists(outfile) and self.opt['GZip'] and os.path.exists('%s.gz' % outfile) and not self.opt['Force']:
                 resfile = '%s.gz' % outfile
             if not self.opt['Force'] and rje.isYounger(resfile,hmm) == resfile and rje.isYounger(resfile,dbase) == resfile:
                 self.printLog('#HMM','HMM results file "%s" exists.' % resfile)
                 return outfile      # Already exists
             else: rje.backup(self,outfile,unlink=True)
         ### ~ [2] ~ HMM Search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self.opt['HMMPFam']:
             _command = 'hmmpfam --cut_ga %s %s %s > %s' % (string.join(self.list['HMMOptions']),hmm,dbase,outfile)
         else: _command = 'hmmsearch %s %s %s > %s' % (string.join(self.list['HMMOptions']),hmm,dbase,outfile)
         self.log.printLog('#HMM',_command)
         if not wait: os.system(self.info['HMMerPath'] + _command + ' &')
         elif not os.path.exists(outfile) or self.opt['Force']: open(outfile,'a').write(os.popen(self.info['HMMerPath'] + _command).read())
         self.printLog('#HMM','Outfile produced for %s: %s.' % (hmm,outfile))
         if self.opt['GZip']:
             rje.backup(self,'%s.gz' % outfile,unlink=True)
             os.system('gzip %s' % outfile)
             self.printLog('#GZIP','%s gzipped to save space' % outfile)
         return outfile
     except:
         self.log.errorLog('Fatal Error during hmmSearch(%s)' % hmm)
         return None
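The search itself is a single shell call to hmmsearch (or hmmpfam --cut_ga) with stdout redirected to the results file. A hedged sketch of the same call via subprocess rather than os.popen; it assumes the HMMER binaries are installed, and the file names are placeholders:

    import subprocess

    def hmm_search(hmm, dbase, outfile, hmmer_path='', pfam=False, options=()):
        # Build the same command line as above and capture stdout to outfile.
        if pfam:
            command = '%shmmpfam --cut_ga %s %s %s' % (hmmer_path, ' '.join(options), hmm, dbase)
        else:
            command = '%shmmsearch %s %s %s' % (hmmer_path, ' '.join(options), hmm, dbase)
        with open(outfile, 'w') as out:
            subprocess.call(command.split(), stdout=out)
        return outfile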
Example 15
 def setup(self,gtext=''):    ### Main class setup method. gtext will over-ride input file.
     '''Main class setup method. gtext will over-ride input file.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.obj['HTML'] = rje_html.HTML(self.log,self.cmd_list)
         ## ~ [1a] File names etc. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if self.basefile().lower() in ['','none']: self.basefile(rje.baseFile(self.getStr('InFile')))
         if self.getStr('OutFile').lower() in ['','none']: self.str['OutFile'] = '%s.html' % self.basefile()
         ## ~ [1b] Read in Glossary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         interms = []
         if gtext:
             delimit = self.getStr('TermSplit')
             if delimit.lower() == 'tab': delimit = '\t'
             if delimit.lower() == 'space': delimit = ' '
             if delimit.lower() == 'comma': delimit = ','
             if delimit.lower() == 'period (.)': delimit = '.'
             if delimit.lower() == 'colon': delimit = ':'
             glossary = {}
             for line in string.split(gtext,'\n'):
                 splitline = string.split(line,delimit)
                 if delimit == '.' and (splitline[-1] in ['',' ']): splitline = splitline[:-1]
                 if not splitline: continue
                 (term,definition) = (splitline[0],string.join(splitline[1:],delimit))
                 if term == 'Term' and not glossary: continue
                 if term:
                     glossary[term] = {'Definition':definition}
                     interms.append(term)
         else: 
             try:
                 if not self.getBool('KeepOrder') and open(self.getStr('InFile'),'r').readline()[:4] == 'Term': 
                     glossary = rje.dataDict(self,self.getStr('InFile'),mainkeys=['Term'],datakeys=['Term','Definition'])
                 else: return self.setup(open(self.getStr('InFile'),'r').read())
             except: 
                 self.errorLog('Problem reading input as dataDict(). Will try as text.')
                 return self.setup(open(self.getStr('InFile'),'r').read())
         if self.list['Terms']:
             for term in list(glossary.keys()):
                 if term not in self.list['Terms']: glossary.pop(term)
         elif self.getBool('KeepOrder'): self.list['Terms'] = interms
         else: self.list['Terms'] = rje.sortKeys(glossary)
         for term in glossary: glossary[term] = glossary[term]['Definition']
         ### ~ [2] Create Full Glossary Dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         nested = {}
         for term in glossary:
             tdict = nested
             for word in string.split(term.lower()):
                 if word not in tdict: tdict[word] = {}
                 tdict = tdict[word]
             tdict['='] = glossary[term]
         self.dict['Glossary'] = nested
         return True     # Setup successful
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
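The glossary is stored as a dictionary nested word by word, with the key '=' holding the definition at the end of each term. A tiny standalone sketch of building and querying that structure (the sample term is invented):

    def build_nested(glossary):
        # Build {word: {word: ... {'=': definition}}} from {term: definition}.
        nested = {}
        for term, definition in glossary.items():
            tdict = nested
            for word in term.lower().split():
                tdict = tdict.setdefault(word, {})
            tdict['='] = definition
        return nested

    nested = build_nested({'open reading frame': 'A stretch of codons without a stop.'})
    print(nested['open']['reading']['frame']['='])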
Example 16
 def setup(self):    ### Main class setup method.
     '''Main class setup method.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqlist = self.obj['SeqList']
         if self.getStr('Basefile').lower() in ['','none']:
             self.str['Basefile'] = rje.baseFile(seqlist.getStr('Name'))
             self.obj['DB'].setInfo({'Basefile':self.str['Basefile']})
         ## ~ [1a] Genetic Code ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         cdb = self.db().addEmptyTable('Code',['Codon','AA'],['Codon'])
         for codon in rje_sequence.genetic_code: cdb.addEntry({'Codon':codon,'AA':rje_sequence.genetic_code[codon]})
         cdb.index('AA')
         ### ~ [2] Calculate Codon Tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         codons = rje.sortKeys(rje_sequence.genetic_code)
         db = self.db().addEmptyTable('Codons',['Seq','Len']+codons,['Seq'])
         sx = 0.0; seqx = seqlist.seqNum()
         for seq in seqlist.seqs():
             self.progLog('\r#COD','Calculating codon usage: %.2f%%' % (sx/seqx)); sx += 100.0
             entry = rje_sequence.codons(seq.getSequence(),{})
             #self.deBug(entry); self.deBug(entry.values())
             entry['Len'] = sum(entry.values())
             entry['Seq'] = seq.getStr('AccNum')
             db.addEntry(entry)
         self.printLog('\r#COD','Codon usage calculated for %s sequences' % rje.iStr(seqx))
         db.fillBlanks(blank=0,fillempty=True)
         db.saveToFile()
         ### ~ [3] Calculate NT Count Tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         nt = ['C','A','G','U']
         for i in [1,2,3]:
             for n in ['C','A','G','U']: nt.append('%s|%d' % (n,i))
         ndb = self.db().addEmptyTable('NT',['Seq','Len']+nt,['Seq'])
         sx = 0.0; seqx = seqlist.seqNum()
         for seq in seqlist.seqs():
             self.progLog('\r#NT','Calculating NT Counts: %.2f%%' % (sx/seqx)); sx += 100.0
             entry = rje_sequence.aaFreq(string.replace(seq.getSequence(),'T','U'),{'C':0,'A':0,'G':0,'U':0},False)
             entry['Len'] = sum(entry.values())
             entry['Seq'] = seq.getStr('AccNum')
             centry = db.data(entry['Seq'])
             for i in [1,2,3]:
                 for n in ['C','A','G','U']: entry['%s|%d' % (n,i)] = 0
             for codon in codons:
                 for i in [1,2,3]:
                     n = codon[i-1]
                     entry['%s|%d' % (n,i)] += centry[codon]
             ndb.addEntry(entry)
         self.printLog('\r#NT','NT Counts calculated for %s sequences' % rje.iStr(seqx))
         ndb.saveToFile()
         return True
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
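The per-sequence entry is a straight count of non-overlapping codons over the coding sequence. A minimal standalone version of that counting step (rje_sequence.codons() is assumed to behave similarly; the test sequence is invented):

    def codon_counts(sequence):
        # Count non-overlapping codons in a coding sequence; any incomplete tail is ignored.
        counts = {}
        sequence = sequence.upper().replace('U', 'T')
        for i in range(0, len(sequence) - 2, 3):
            codon = sequence[i:i + 3]
            counts[codon] = counts.get(codon, 0) + 1
        return counts

    print(codon_counts('ATGGCTGCTTAA'))   # {'ATG': 1, 'GCT': 2, 'TAA': 1}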
Example 17
 def runJobs(self):  ### Runs all the jobs in self.list['SubJobs']                                               #V1.0
     '''Runs all the jobs in self.list['SubJobs'].'''
     ### ~ [1] ~ Start first set of jobs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     for j in range(self.getInt('KeepFree'),self.nprocs()): self.nextJob(j)    # Skip first node(s)
     pidcheck = '%s.pid' % rje.baseFile(self.log.info['LogFile'])
     ### ~ [2] ~ Monitor jobs and set next one running as they finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     while self.dict['Running']:
         PIDCHECK = open(pidcheck,'w')
         for j in rje.sortKeys(self.dict['Running']):
             if not self.dict['Running'][j]: self.dict['Running'].pop(j); continue   # No more jobs
             try:
                 pid = self.dict['Running'][j]['PID']
                 PIDCHECK.write('%s: %s\n' % (j,pid))
                 if string.split('%s' % pid)[0] == 'WAIT': status = 1
                 else: (status,exit_stat) = os.waitpid(pid,os.WNOHANG)
             except: status = 1
             if status > 0: self.endJob(j)       # subjob on processor j has finished: can replace with processing
         PIDCHECK.close()
         time.sleep(self.getInt('SubSleep'))
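The monitor loop polls each child with os.waitpid(pid, os.WNOHANG), which returns (0, 0) while the child is still running. A minimal POSIX-only sketch of the same fork-and-poll pattern (the commands are invented):

    import os, time

    def run_jobs(commands, sleep=5):
        # Fork one child per command, then poll non-blockingly until all have finished.
        running = {}
        for cmd in commands:
            pid = os.fork()
            if pid == 0:                        # child: run the command and exit
                os._exit(os.system(cmd) >> 8)
            running[pid] = cmd
        while running:
            for pid in list(running):
                done, status = os.waitpid(pid, os.WNOHANG)
                if done:                        # (0, 0) means the child is still running
                    running.pop(pid)
            time.sleep(sleep)

    run_jobs(['sleep 2', 'sleep 3'])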
Example 18
 def peptCluster(self):  ### Performs actual peptide clustering and stores results in self.obj['Tree']
     '''Performs actual peptide clustering and stores results in self.obj['Tree'].'''
     try:### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         base = rje.baseFile(self.getStr('SaveDis'))
         pretree = ['treeformats=nwk,text','basefile=%s' % base]
         ### ~ [1] ~ Phylip Neighbor method ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self.getStr('PeptCluster') == 'neighbor':
             disfile = '%s.phy' % base
             fasfile = '%s.fas' % base
             treecmd = ['autoload=T','maketree=neighbor','disin=%s' % disfile,'seqin=%s' % fasfile]
             pretree += ['root=mid']
             if disfile != self.getStr('SaveDis'):
                 rje.backup(self,disfile)
                 self.obj['PeptDis'].saveMatrix(filename=disfile,format='phylip')   ### Saves matrix
             if 'peptides=%s' % fasfile not in self.cmd_list:
                 rje.backup(self,fasfile)
                 FAS = open(fasfile,'w')
                 for pep in self.list['Peptides']: FAS.write('>%s\n%s\n' % (pep,pep))
                 FAS.close()
             tree = self.obj['Tree'] = rje_tree.Tree(self.log,pretree+self.cmd_list+treecmd)
         ### ~ [2] ~ UPGMA method ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         else:
             if self.getStr('PeptCluster') not in ['wpgma','upgma']:
                 self.errorLog('PeptCluster method "%s" not recognised. Will use UPGMA' % self.getStr('PeptCluster'),printerror=False)
                 base = string.replace(base,self.getStr('PeptCluster'),'upgma')
                 pretree += ['basefile=%s' % base]
             if self.getStr('PeptCluster') == 'upgma': nsftree = self.obj['PeptDis'].upgma()
             elif self.getStr('PeptCluster') == 'wpgma': nsftree = self.obj['PeptDis'].wpgma()
             #nwkfile = '%s.nwk' % base
             #treecmd += ['nsfin=%s' % nwkfile]
             #rje.backup(self,nwkfile)
             #open(nwkfile,'w').write(nsftree)
             treecmd = ['autoload=F']
             tree = self.obj['Tree'] = rje_tree.Tree(self.log,pretree+self.cmd_list+treecmd)
             tree.buildTree(nsftree)
         ### ~ [3] ~ Outputs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for node in tree.node:
             if node.info['Name'] in self.list['Peptides']: node.stat['ID'] = self.list['Peptides'].index(node.info['Name']) + 1
         tree.saveTrees()
         for outfmt in tree.list['TreeFormats']:
             treefile = '%s.%s' % (tree.info['Basefile'],rje_tree.formatext[outfmt])
             self.dict['Output'][outfmt] = treefile
     except: self.errorLog('%s.peptCluster error' % self)
Example 19
 def legacySetup(self):  ### Main class setup method.
     '''Main class setup method.'''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         db = self.db()
         ## Set Basefile
         if not self.basefile(return_none=None):
             self.basefile(rje.baseFile(self.getStr('OccFile')))
         tabkeys = {
             'OccFile': [
                 'dataset', 'runid', 'motif', 'seq', 'start_pos', 'end_pos',
                 'variant'
             ],
             'DomFile': ['domain', 'uniprot'],
             'DMIFile': ['motif', 'domain'],
             'PPIFile': ['hub', 'spoke']
         }
         ## Load Tables
         for dfile in ['DomFile', 'DMIFile', 'OccFile', 'PPIFile']:
             dbtable = db.addTable(self.getStr(dfile),
                                   mainkeys=tabkeys[dfile],
                                   name=dfile,
                                   expect=True,
                                   replace=False,
                                   uselower=True)
             self.tidyMotifNames(dbtable)
             if dfile == 'OccFile':
                 #dbtable.addField('uniprot')
                 dbtable.addField('gene')
                 for entry in dbtable.entries():
                     #entry['uniprot'] = string.split(entry['seq'],'_')[-1]  # Don't want this: uniprot is spoke!
                     entry['gene'] = string.split(entry['seq'], '_')[0]
             elif dfile == 'DomFile':
                 dbtable.compress(['domain', 'uniprot'], default='str')
                 dbtable.keepFields(['domain', 'uniprot'])
             elif dfile == 'DMIFile':
                 dbtable.compress(['motif', 'domain'], default='str')
                 dbtable.keepFields(['motif', 'domain'])
         return True  # Setup successful
     except:
         self.errorLog('Problem during %s setup.' % self.prog())
         return False  # Setup failed
Example 20
 def setup(self):  ### Main class setup method.
     '''
     Main class setup method. This will load sequences into a SeqList object, gaps into a 'gaps' database table, and
     check or generate a PAF file from the mapped long reads.
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.obj['DB'] = rje_db.Database(self.log, self.cmd_list)
         if not self.getStrLC('SeqIn'):
             raise ValueError('seqin=FILE must be set')
         if not rje.exists(self.getStr('SeqIn')):
             raise IOError('Unable to read seqin=FILE: "{0}"'.format(
                 self.getStr('SeqIn')))
         seqbase = rje.baseFile(self.getStr('SeqIn'), strip_path=True)
         if not self.getStrLC('Basefile'): self.baseFile(seqbase)
         if rje.checkForFiles(filelist=['.gaps.tdt'],
                              basename=seqbase,
                              log=self.log) and not self.force():
             self.cmd_list.append('gapstats=F')
         else:
             self.cmd_list.append('gapstats=T')
         seqin = self.seqinObj()
         gapdb = self.db().addTable('%s.gaps.tdt' % seqbase,
                                    mainkeys=['seqname', 'start', 'end'],
                                    name='gaps',
                                    ignore=[],
                                    expect=True)
         gapdb.dataFormat({'start': 'int', 'end': 'int'})
         ### ~ [2] PAF File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not self.getStrLC('PAF'):
             self.setStr({'PAF': self.baseFile() + '.paf'})
         pfile = self.getStr('PAF')
         if self.force() or not rje.exists(pfile):
             paf = rje_paf.PAF(self.log, self.cmd_list)
             paf.longreadMinimapPAF(pfile)
         if not rje.exists(self.getStr('PAF')):
             raise IOError(
                 'Unable to read or create PAF file: {0}'.format(pfile))
         return True
     except:
         self.errorLog('Problem during %s setup.' % self.prog())
         return False  # Setup failed
Example 21
    def buildHMM(self,seqfile,hmmfile=None):    ### Makes an HMM from a sequence alignment file
        '''
        Makes an HMM from a sequence alignment file.
        >> seqfile:str = Name of sequence file
        >> hmmfile:str = Name of HMM file [*.hmm]
        << hmmfile if made, None if failed.
        '''
        try:
            ### Setup ###
            _hmmpath = self.info['HMMerPath']
            if not hmmfile:
                hmmfile = '%s.hmm' % rje.baseFile(seqfile)

            ### Build HMM ##
            os.system('%shmmbuild %s %s' % (_hmmpath,hmmfile,seqfile))
            if self.opt['HMMCalibrate']:
                os.system('%shmmcalibrate %s' % (_hmmpath,hmmfile))
            return hmmfile      #!# Add error catching during build/calibrate (How?!) #!#
        except:
            self.log.errorLog('Oh my, what a calamity during buildHMM(%s)!' % seqfile)
            return None
Example 22
    def setup(self):    ### Main class setup method.
        '''Main class setup method.'''
        try:### ~ [1] Read in Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            seqfile = self.getStr('SeqIn')
            seqs = rje_seq.SeqList(log=self.log,cmd_list=['i=0']+self.cmd_list+['autofilter=F','autoload=F','seqin=None'])
            self.printLog('#SEQS','Loading sequences from %s' % seqfile)
            if not seqs.loadSeqs(seqfile=seqfile,seqtype='protein',aln=True):
                raise IOError('Cannot load from %s' % seqfile)
            seqfile = seqs.info['Name']
            basefile = rje.baseFile(seqfile)
            if not self.getStrLC('Basefile'): self.baseFile(basefile)
            self.printLog('#SEQ',"%s protein sequences read from %s\n" % (str(seqs.seqNum()),seqfile),1)
            #?# Add option to generate alignment?
            self.printLog('#SEQ',"Alignment = %s. (%d aa)\n" % (seqs.opt['Aligned'],seqs.seq[0].seqLen()),1)
            self.dict['Output']['seqin'] = seqfile

            ### ~ [1] Read in Tree ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if self.getStrLC('NSFIn'):
                nsfin = self.getStr('NSFIn')
            else:
                nsfin = basefile + '.nsf'
            while not os.path.exists(nsfin):
                if self.i() >= 0:
                    nsfin = rje.choice(text='Input tree file "%s" not found. Input filename? (Blank to exit.)' % nsfin)
                    if nsfin == '':
                        raise KeyboardInterrupt
                else:
                    raise IOError('File %s not found. Cannot load tree!' % nsfin)
            self.dict['Output']['nsfin'] = nsfin
            self.cmd_list.append('nsfin=' + nsfin)
            self.printLog('#TREE','Loading tree from %s' % nsfin)
            self.obj['Tree'] = mytree = rje_tree.Tree(log=self.log,cmd_list=['root=yes']+self.cmd_list)
            mytree.mapSeq(seqlist=seqs)
            mytree.textTree()
            if mytree.opt['ReRooted']:
                mytree.saveTree(filename='%s.nsf' % basefile)
            return True     # Setup successful
        except KeyboardInterrupt: self.printLog('#CANCEL','User terminated.'); return False
        except: self.errorLog('Problem during %s setup.' % self.prog()); return False  # Setup failed
Example 23
 def classify(self): ### Generate summary tables for each protein class
     '''Generate summary tables for each protein class.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         db = self.db()
         rankdb = self.db('taxamap')
         for cfile in self.list['Classify']:
             pclass = rje.baseFile(cfile,strip_path=True)
             clist = []
             for fline in open(cfile,'r').readlines():
                 prot = string.split(rje.chomp(fline),maxsplit=1)[0]
                 if prot: clist.append(prot)
             self.printLog('#CLASS','%s "%s" class proteins read from %s' % (rje.iLen(clist),pclass,cfile))
             if not clist:
                 self.warnLog('No proteins read from %s' % (cfile))
                 continue
             classdb = db.copyTable(rankdb,pclass)
             classdb.dropEntriesDirect('protein',clist,inverse=True)
             if not classdb.entries():
                 self.warnLog('No "%s" proteins found in TaxaMap table' % (pclass))
                 continue
             self.summaryScores(classdb,pclass,'MinClass')
     except: self.errorLog('%s.classify() error' % self.prog())
Ejemplo n.º 24
0
    def setup(self):    ### Main class setup method.
        '''Main class setup method.'''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [1a] ~ Sequence file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            seqlist = self.obj['SeqList'] = rje_seq.SeqList(self.log,['accnr=F','seqnr=F']+self.cmd_list)   #!# Add code for memsaver/autoload=F #!#
            self.printLog('#SCAP','%s sequences loaded for SCAP analysis' % rje.integerString(seqlist.seqNum()))
            ## ~ [1b] ~ Xmer background file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            mseqfile = self.info['XmerBack']
            if mseqfile.lower() in ['','none']: mseqfile = self.info['XmerBack'] = seqlist.info['Name']
            markov = self.obj['Markov'] = rje_markov.Markov(self.log,['autoload=T','accnr=F','seqnr=F']+self.cmd_list+['seqin=%s' % mseqfile,'direction=both','markov=F','scap=T'])
            markov.setup()
            maxx = markov.stat['MaxXmer']
            if self.info['Basefile'].lower() in ['','none']:
                self.info['Basefile'] = '%s.scap' % rje.baseFile(seqlist.info['Name'],True)
                if markov.opt['Sorted']: self.info['Basefile'] = '%s.sorted' % self.info['Basefile']
            basefile = self.info['Basefile']
            self.printLog('#MARKOV','Markov setup complete')
            ## ~ [1c] ~ SCAP Background file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            scapfile = self.info['ScapBack']
            if scapfile.lower() in ['','none',seqlist.info['Name'].lower()]: self.obj['ScapBack'] = self.obj['SeqList']
            elif scapfile == mseqfile: self.obj['ScapBack'] = markov.obj['SeqList'] 
            else: self.obj['ScapBack'] = rje_seq.SeqList(self.log,['accnr=F','seqnr=F']+self.cmd_list+['seqin=%s' % scapfile])
            self.printLog('#SCAP','%s sequences for SCAP Background' % rje.integerString(seqlist.seqNum()))

            ### ~ [2] Markov Chains ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if mseqfile == seqlist.info['Name']: markov.obj['SeqList'] = seqlist
            elif mseqfile == self.obj['ScapBack'].info['Name']: markov.obj['SeqList'] = self.obj['ScapBack']
            mpickle = markov.unpickleMe()
            if mpickle: markov = self.obj['Markov'] = mpickle
            if not markov.suftree() or not markov.pretree() or maxx > markov.stat['MaxXmer']:
                markov.run()
                markov.pickleMe()
            markov.opt['DeBug'] = self.opt['DeBug']
            self.deBug(markov.opt)
            self.deBug(markov.stat)
            #self.deBug(markov.suftree())
            #self.deBug(markov.pretree())
            return True     # Setup successful
        except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
Ejemplo n.º 25
0
 def rmdKnit(self, rmdfile, document='html', stdout=False):  ### Knit Rmd to HTML/PDF file
     '''
     Knit Rmd to HTML/PDF file.
     >> rmdfile:str = R markdown file to knit
     >> document:str ['html'] = type of document to knit into
     << success:bool = whether output is generated
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         outfile = '%s.%s' % (rje.baseFile(rmdfile), document)
         rcmd = 'Rscript -e \'library(rmarkdown); rmarkdown::render("%s", "%s_document")\'' % (
             rmdfile, document)
         self.printLog('#RCMD', rcmd)
         rcmd += ' 2>&1'
         if self.v() < 2 and not stdout: os.popen(rcmd).read()
         else:
             self.progLog('#RCMD', 'Knitting %s...' % (rmdfile))
             os.system(rcmd)
         success = rje.exists(outfile)
         if success:
             self.printLog('#RCMD',
                           '%s generated from %s' % (outfile, rmdfile))
         else:
             self.printLog(
                 '#SYS',
                 'If pandoc error, try setting global variable: export RSTUDIO_PANDOC=/Applications/RStudio.app/Contents/MacOS/pandoc'
             )
             self.printLog(
                 '#SYS',
                 'If no pandoc error, check that required libraries in %s are installed'
                 % rmdfile)
             raise IOError('%s not created' % outfile)
         return True
     except:
         self.errorLog('%s.rmdKnit error: check R installation' %
                       self.prog())
         return False
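A minimal stand-alone sketch of the same Rscript/rmarkdown render call used by rmdKnit() above, stripped of the rje logging framework (assumes Rscript and the rmarkdown R package are installed; 'report.Rmd' is a hypothetical input):

# Sketch only: passing the R expression as a single argv element avoids shell quoting issues.
import os, subprocess

def knit_rmd(rmdfile, document='html'):
    '''Render an R markdown file and return the expected output path, or None on failure.'''
    outfile = '%s.%s' % (os.path.splitext(rmdfile)[0], document)
    rexpr = 'library(rmarkdown); rmarkdown::render("%s", "%s_document")' % (rmdfile, document)
    subprocess.call(['Rscript', '-e', rexpr])    # blocks until R finishes
    return outfile if os.path.exists(outfile) else None

#print(knit_rmd('report.Rmd'))    # expects report.html next to report.Rmd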
Ejemplo n.º 26
0
 def setup(self):  ### Main class setup method.
     '''Main class setup method.'''
     try:  ### ~ [1] Setup SeqList ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.obj['SeqList'] = rje_seq.SeqList(
             self.log, ['keepblast=T'] + self.cmd_list +
             ['autofilter=F', 'align=F', 'haqbat=None'])
         self.obj['SeqList']._checkForDup(True)
         if not self.seqNum():
             self.errorLog('No sequences loaded!', printerror=False)
             return False
          if self.opt['AddQueries'] and self.name() not in self.obj['SeqList'].list['Blast2Fas']:
              self.obj['SeqList'].list['Blast2Fas'].append(self.name())
         ### ~ [2] Setup Results Directory ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self.info['HaqDir'].lower() in ['', 'none']:
             self.info['HaqDir'] = '%s_HAQESAC/' % rje.baseFile(
                 self.name(), strip_path=True)
         rje.mkDir(self, self.info['HaqDir'])
         return True  # Setup successful
     except:
         self.errorLog('Problem during %s setup.' % self)
         return False  # Setup failed
Ejemplo n.º 27
0
 def forking(self):  ### Keeps forking out and processing jobs until no more jobs in self.list['Forked'].
     '''Keeps forking out and processing jobs until no more jobs in self.list['Forked'].'''
     ### ~ [1] ~ Start first set of jobs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     if self.getBool('PIDCheck') or self.dev(): pidcheck = '%s.pid' % rje.baseFile(self.log.info['LogFile'])    # Set *.pid object to match log
     else: pidcheck = False
     #self.deBug(pidcheck)
     ### ~ [2] ~ Monitor jobs and set next one running as they finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     while self.list['Forked']:
         if pidcheck: PIDCHECK = open(pidcheck,'w')
         for fdict in self.list['Forked'][0:]:
             try:
                 pid = fdict['PID']
                 if pidcheck: PIDCHECK.write('%s: %s\n' % (self.list['Forked'].index(fdict),pid))
                 if string.split('%s' % pid)[0] == 'WAIT': status = 1
                 else: (status,exit_stat) = os.waitpid(pid,os.WNOHANG)
             except:
                 self.errorLog('!')
                 status = 1
             if status > 0:
                 self.list['Forked'].remove(fdict)
                 self.endFork(fdict)   # Fork has finished: can replace with processing
         if pidcheck:
             PIDCHECK.close()
             #self.deBug(open(pidcheck,'r').read())
         ## ~ [2a] Look for eternal hanging of threads ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if time.time() - self.getNum('KillTime') > self.getNum('KillForks'):
             self.verbose(0,1,'\n%d seconds of main thread inactivity. %d forks still active!' % (self.getNum('KillForks'),len(self.list['Forked'])),1)
             for fdict in self.list['Forked']:
                 self.verbose(0,2,' => Fork %s, PID %d still Active!' % (fdict['ID'],fdict['PID']),1)
             if self.i() < 0 or rje.yesNo('Kill Main Thread?'):
                 raise ValueError('%d seconds of main thread inactivity. %d forks still active!' % (self.getNum('KillForks'),len(self.list['Forked'])))
             elif rje.yesNo('Kill hanging forks?'):
                 for fdict in self.list['Forked']:
                     self.printLog('#KILL','Killing Fork %s, PID %d.' % (fdict['ID'],fdict['PID']))
                     os.system('kill %d' % fdict['PID'])
             else: self.setNum({'KillTime':time.time()})
         ## ~ [2b] Sleep ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         time.sleep(self.getNum('ForkSleep'))
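The heart of the loop above is a non-blocking poll of child PIDs with os.waitpid(pid, os.WNOHANG). A stripped-down sketch of that pattern outside the rje framework (POSIX only; the doubler() payload is invented):

import os, time

def doubler(x): print(2 * x)

forked = []
for i in range(4):
    pid = os.fork()
    if pid: forked.append(pid)            # parent: track the child PID
    else: doubler(i); os._exit(0)         # child: do the work, then exit without cleanup

while forked:
    for pid in forked[:]:
        status = os.waitpid(pid, os.WNOHANG)      # (0, 0) while the child is still running
        if status[0] == pid: forked.remove(pid)   # finished: stop tracking it
    time.sleep(0.5)                       # equivalent of ForkSleep above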
Ejemplo n.º 28
0
 def setup(self):    ### Main class setup method.
     '''Main class setup method.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         fixfields = ['Location','Name','Artist','Composer','Album']
         db = self.obj['DB'] = rje_db.Database(self.log,self.cmd_list)
         #self.deBug(self.list['iTunes'])
         for ifile in self.list['iTunes']:
             #self.deBug(string.split(open(ifile,'r').readline(),'\t'))
             idb = db.addTable(ifile,mainkeys=['Location'],name=rje.baseFile(ifile,True))
             for field in iformat:
                 if iformat[field] == 'del' and field in idb.fields(): idb.dropField(field)
             idb.dataFormat(iformat)
             idb.addField('Album_Artist','Album')
             idb.addField('Tracks',evalue=1)
             if self.getBool('AddScore'): idb.addField('Score',evalue=0)
             for entry in idb.entries():
                 for field in fixfields:
                     newval = ''
                     for x in entry[field]:
                         if x.isalnum() or x in '\\/: -_()[].~$': newval += x
                     entry[field] = newval
                 entry['Album_Artist'] = entry['Artist']
                 try:
                     for divider in ['\\\\','\\',':','/']:
                         if len(string.split(entry['Location'],divider)) > 2: 
                             entry['Album_Artist'] = string.split(entry['Location'],divider)[-3]
                             break
                 except:
                     self.errorLog('!')
                     self.deBug(entry['Location'])
                 if not entry['Plays']: entry['Plays'] = 0
                 if not entry['Skips']: entry['Skips'] = 0
                 if self.getBool('AddScore'): 
                     if entry['My Rating']: entry['Score'] = (entry['My Rating'] - 60) / 20.0
             idb.remakeKeys()
         return True     # Setup successful
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
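For reference, a toy sketch of the two cleanup steps applied to each entry above: stripping disallowed characters and inferring Album_Artist as the third-from-last path component (the location string is invented):

allowed_extra = '\\/: -_()[].~$'
location = 'D:\\Music\\Some Artist\\Some Album\\01 Track.mp3'

cleaned = ''.join(x for x in location if x.isalnum() or x in allowed_extra)
album_artist = None
for divider in ['\\\\', '\\', ':', '/']:
    parts = cleaned.split(divider)
    if len(parts) > 2:
        album_artist = parts[-3]     # third-from-last path component
        break
print(album_artist)                  # 'Some Artist'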
Ejemplo n.º 29
0
    def hmmTable(self,outfile='',append=False,delimit=None):    ### Outputs results table
        '''
        Outputs results table.
        >> outfile:str = Name of output file
        >> append:boolean = whether to append file
        >> delimit:str = Delimiter to use [\t]
        '''
        try:
            ### Setup ###
            if not outfile: outfile = self.info['HMMTab']
            if outfile.lower() == 'none':
                self.log.printLog('#TAB','HMMTab = "None": No table output')
                return False
            if not delimit: delimit = rje.getDelimit(self.cmd_list,'\t')
            if not outfile: outfile = '%s.hmmer.%s' % (rje.baseFile(self.info['SearchDB'],True),rje.delimitExt(delimit))
            self.readResults()
            self.log.printLog('#TAB','Tabulating results for %s searches into %s' % (len(self.search),outfile),log=False)

            ### Setup Resfile ###
            if self.opt['MySQL']: headers = ['HMM','Hit','Hit_Start','Hit_End','Eval','Score']
            else: headers = ['Type','Name','Start','End','Eval','Score']
            if not append or not os.path.exists(outfile): rje.delimitedFileOutput(self,outfile,headers,delimit,rje_backup=True)
            
            ### Output Search details ###
            for search in self.search:
                for hit in search.hit:
                    for aln in hit.aln:
                        out = {'HMM':search.info['Name'],'Type':search.info['Name'],
                               'Name':hit.info['Name'],'Hit':hit.info['Name'],
                               'Start':'%d' % aln.stat['SbjStart'], 'End':'%d' % aln.stat['SbjEnd'],
                               'Hit_Start':'%d' % aln.stat['SbjStart'], 'Hit_End':'%d' % aln.stat['SbjEnd'],
                               'Eval':'%.2e' % aln.stat['Expect'],'Score':'%.1f' % aln.stat['BitScore']}
                        rje.delimitedFileOutput(self,outfile,headers,delimit,out)
            self.log.printLog('#OUT','Results for %s searches output to %s.' % (len(self.search),outfile))
        except:
            self.log.errorLog('Fatal Error during hmmTable(%s).' % outfile)
            raise
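hmmTable() flattens a search > hit > alignment hierarchy into one row per local alignment. A plain-csv sketch of the same flattening, with the csv module standing in for rje.delimitedFileOutput and invented records:

import csv

searches = [{'Name': 'PF00069', 'hits': [
    {'Name': 'SeqA', 'aln': [{'SbjStart': 10, 'SbjEnd': 250, 'Expect': 1e-30, 'BitScore': 210.5}]}]}]

headers = ['Type', 'Name', 'Start', 'End', 'Eval', 'Score']
with open('hmm_results.tsv', 'w') as OUT:                 # hypothetical output file
    writer = csv.writer(OUT, delimiter='\t')
    writer.writerow(headers)
    for search in searches:
        for hit in search['hits']:
            for aln in hit['aln']:
                writer.writerow([search['Name'], hit['Name'], aln['SbjStart'], aln['SbjEnd'],
                                 '%.2e' % aln['Expect'], '%.1f' % aln['BitScore']])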
Ejemplo n.º 30
0
 def save(self):     ### Saves parsed REST output to files
     '''Saves parsed REST output to files.'''
     rbase = '%s%s' % (self.getStr('RestOutDir'),rje.baseFile(self.getStr('RestBase'),strip_path=True,keepext=True))
     rje.mkDir(self,self.getStr('RestOutDir'))
     outputs = rje.sortKeys(self.dict['Output'])
     if self.getStrLC('Rest') in outputs: outputs = [self.getStrLC('Rest')]
     elif self.getStrLC('Rest') in ['full','text']:
         outfile = '%s.rest' % rbase
         open(outfile,'w').write(self.restFullOutput())
         self.printLog('#OUT','%s: %s' % (self.getStrLC('Rest'),outfile))
         return True
     elif self.getStrLC('Rest'):
         self.printLog('#OUTFMT','REST output format "%s" not recognised.' % self.getStrLC('Rest'))
         if self.i() < 0 or not rje.yesNo('Output all parsed outputs?'): return False
         outfile = '%s.rest' % rbase
         open(outfile,'w').write(self.restFullOutput())
         self.printLog('#OUT','full: %s' % (outfile))
         return True
     for rkey in outputs:
         if rkey in self.dict['Outfile']:
             rje.backup(self,self.dict['Outfile'][rkey])
             open(self.dict['Outfile'][rkey],'w').write(self.dict['Output'][rkey])
             self.printLog('#OUT','%s: %s' % (rkey,self.dict['Outfile'][rkey]))
         elif rkey not in ['intro']: self.warnLog('No outfile parsed/generated for %s output' % rkey)
Ejemplo n.º 31
0
 def katKmers(self, assembly=None, kmerfiles=None, basefile=None, force=None, trim10x=True):  ### Performs read kmer kat sect analysis
     '''
     Performs read kmer kat sect analysis. Generates:
     - '{0}.kat-stats.tsv'.format(basefile) = kmer summary per sequence
      - '{0}.kat-counts.cvg'.format(basefile) = kmer counts per position (CVG format)
     >> assembly:str [None] = Assembly file. Will use self.getStr('SeqIn') if None
     >> kmerfiles:list [None] = files for setting kmers to count (self.list['KmerReads'] if None)
      >> basefile:str [None] = output file prefix (self.baseFile() if None)
     >> force:bool [None] = whether to overwrite existing files (self.force() if None)
     >> trim10x:bool [True] = Whether to check 10xtrim setting.
     << katfile or None if failed
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not self.checkForKat(report=True): return None
         if not assembly: assembly = self.getStr('SeqIn')
         seqin = assembly
         if kmerfiles:
             if type(kmerfiles) == type('str'):
                 kmerfiles = [kmerfiles]
         else:
             if not self.list['KmerReads']:
                 self.printLog(
                     '#KAT',
                     'Cannot use KAT kmer analysis without KmerReads data')
                 return None
             kmerfiles = self.list['KmerReads']
         rje.checkForFiles(filelist=[seqin] + kmerfiles,
                           basename='',
                           log=self.log,
                           cutshort=False,
                           ioerror=True,
                           missingtext='Not found: aborting KAT run.')
         if not basefile: basefile = self.baseFile(return_none=None)
          if not basefile:
              basefile = rje.baseFile(assembly, strip_path=True)   # keep the local basefile in sync
              self.baseFile(basefile)
         if force == None: force = self.force()
         katfile = '{}.kat-stats.tsv'.format(basefile)
         # seq_name        median  mean    gc%     seq_length      kmers_in_seq    invalid_kmers   %_invalid       non_zero_kmers  %_non_zero      %_non_zero_corrected
         katcvg = '{}.kat-counts.cvg'.format(basefile)
         #i# Check for files
         if not force and rje.checkForFiles(
                 filelist=[katfile, katcvg],
                 basename='',
                 log=self.log,
                 cutshort=False,
                 ioerror=False,
                 missingtext='Not found: will generate.'):
             return katfile
         self.backup(katfile, appendable=False)
         self.backup(katcvg, appendable=False)
         ### ~ [2] Run KAT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         katcall = 'kat sect -t {} -o {}.kat {} {}'.format(
             self.threads(), basefile, seqin, ' '.join(kmerfiles))
         if trim10x and self.getBool('10xTrim'):
             trim5 = ['16'] + ['0'] * (len(self.list['KmerReads']) - 1)
             trim5 = ','.join(trim5)
             katcall = 'kat sect -t {} --5ptrim {} -o {}.kat {} {}'.format(
                 self.threads(), trim5, basefile, seqin,
                 ' '.join(kmerfiles))
         self.printLog('#SYS', katcall)
         #i# Catching completion in case KAT hangs after running
         KAT = os.popen(katcall)
          line = KAT.readline()
          while line and not line.startswith('Total runtime'):   # stop at EOF so a failed run cannot hang
              line = KAT.readline()
         KAT.close()
         ### ~ [3] Check for outputs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if rje.checkForFiles(filelist=[katfile, katcvg],
                              basename='',
                              log=self.log,
                              cutshort=False,
                              ioerror=True,
                              missingtext='Not found: KAT failed?'):
             return katfile
     except:
         self.errorLog('%s.katKmers error' % self.prog())
         return None
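A hedged sketch of the equivalent 'kat sect' invocation, using subprocess to block until KAT exits rather than polling stdout for the 'Total runtime' line (assumes kat is on the PATH; file names and thread count are placeholders):

import subprocess

assembly = 'assembly.fasta'                      # hypothetical input assembly
reads = ['reads1.fastq', 'reads2.fastq']         # hypothetical kmer read files
prefix = 'assembly'                              # -> assembly.kat-stats.tsv / assembly.kat-counts.cvg
katcall = 'kat sect -t 4 -o {}.kat {} {}'.format(prefix, assembly, ' '.join(reads))
subprocess.call(katcall, shell=True)             # returns only when kat has exited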
Ejemplo n.º 32
0
    def inSilicoHybrid(self):  ### Filter and combine subreads from parent and output to fasta file.
        '''
        Filter and combine subreads from parent and output to fasta file.

        This module generates balanced "in silico diploid" PacBio subread data from two sequenced haploid parents. Each
        parent must first be run through SMRTSCAPE to generate subread summary data. (This will be performed if missing. Each
        parent needs a `*.fofn` file of subread file names, `*.unique.tdt` unique subreads table and `*.smrt.tdt` SMRT cell
        identifier table.)

        A new set of subreads is then generated from the combined set of parent subreads. This is done by first ranking the
        unique subreads from each parent by length. First, the longest subread from each parent are compared and the shortest
        selected to be the first subread of the diploid. (The shortest is taken to minimise length differences between the
        two parents.) Next, the longest subread from the next parent that is no longer than the previous subread is added.
        This cycles, picking a read from the the parent with fewest cumulative bases each cycle. The longest subread that is
        no longer than the previous subread is selected. This continues until one parent runs out of subreads. Additional
        subreads will be added from the other parent if they reduce the difference in cumulative output for each parent.

        Final output will be a `*.subreads.fasta` file in which each parent has a similar total sequence content and for
        which the subread length distributions should also be similar. This is to overcome biases in resulting diploid
        assemblies, where one parent has higher quality data than the other.

        NOTE: If performing downstream filtering by Read Quality (RQ), this might reintroduce a bias if one parent has much
        higher RQ values than the other. The `rqfilter=X` setting can therefore be used to restrict output to  reads with a
        minimum RQ value. By default this is 0.84. If you do not get enough sequence output, this setting may need to be
        relaxed.
        '''
        try:  ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [0a] Parent 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.printLog(
                '#~~#',
                '# ~~~~~~~~~~~~~~~~~~~~ SETUP PARENT 1 ~~~~~~~~~~~~~~~~~~~~ #')
            self.printLog('#FOFN', 'Parent1: %s' % self.getStr('Parent1'))
            base1 = rje.baseFile(self.getStr('Parent1'))
            parent1 = smrtscape.SMRTSCAPE(
                self.log, ['genomesize=13.1e6'] + self.cmd_list +
                ['batch=%s' % self.getStr('Parent1'),
                 'basefile=%s' % base1])
            parent1.setup()
            udb1 = parent1.udb()
            cdb = parent1.db('smrt', add=True, mainkeys=['Name'])
            cdb.dataFormat({'SMRT': 'int'})
            cx = cdb.entryNum()
            ## ~ [0b] Parent 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.printLog(
                '#~~#',
                '# ~~~~~~~~~~~~~~~~~~~~ SETUP PARENT 2 ~~~~~~~~~~~~~~~~~~~~ #')
            self.printLog('#FOFN', 'Parent2: %s' % self.getStr('Parent2'))
            base2 = rje.baseFile(self.getStr('Parent2'))
            parent2 = smrtscape.SMRTSCAPE(
                self.log, ['genomesize=13.1e6'] + self.cmd_list +
                ['batch=%s' % self.getStr('Parent2'),
                 'basefile=%s' % base2])
            parent2.setup()
            udb2 = parent2.udb()
            cdb2 = parent2.db('smrt', add=True, mainkeys=['Name'])
            cdb2.dataFormat({'SMRT': 'int'})
            # Shift all of the Parent2 SMRT IDs to avoid conflict with Parent1
            for entry in cdb2.entries() + udb2.entries():
                entry['SMRT'] = entry['SMRT'] + cx
            cdb = parent1.db().mergeTables(cdb, cdb2)
            ## ~ [0c] Output Sequence File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.printLog(
                '#~~#',
                '# ~~~~~~~~~~~~~~~~~~~~ DIPLOIDOCUS SUBREADS ~~~~~~~~~~~~~~~~~~~~ #'
            )
            minlen = self.getInt('LenFilter')
            minrq = self.getNum('RQFilter')
            rqstr = '%s' % minrq
            filtfile = '%s.L%sRQ%s.fasta' % (self.baseFile(), minlen,
                                             rqstr[2:])
            ## ~ [0d] Input Sequence Files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            seqbatch = []  # List of SeqList objects
            self.printLog(
                '#BATCH', '%s sequence files to process.' %
                rje.iLen(parent1.list['Batch'] + parent2.list['Batch']))
            for seqfile in parent1.list['Batch'] + parent2.list['Batch']:
                seqcmd = self.cmd_list + [
                    'seqmode=file', 'autoload=T', 'summarise=F',
                    'seqin=%s' % seqfile, 'autofilter=F'
                ]
                seqbatch.append(rje_seqlist.SeqList(self.log, seqcmd))
            self.printLog(
                '#BATCH',
                '%s sequence files to summarise.' % rje.iLen(seqbatch))
            if not seqbatch:
                raise IOError(
                    'No batch input fasta files found! Make sure parentN=FILE settings given *.fofn.'
                )
            ## ~ [0e] Setup subread lists ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            elists = [
                udb1.sortedEntries('Len', reverse=True),
                udb2.sortedEntries('Len', reverse=True)
            ]
            plen = [0, 0]  # Summed lengths for each parent
            pseq = [0, 0]  # Total sequence number for each parent
            prq = [0, 0]  # Total sequence RQ for each parent (convert to mean)
            if not elists[0] or not elists[1]:
                raise ValueError(
                    'No Unique ZMW subreads for one or both parents!')
            lastlen = max(elists[0][0]['Len'],
                          elists[1][0]['Len'])  # Length of last selected read
            for elist in elists:
                while elist and elist[0]['RQ'] < minrq:
                    elist.pop(0)
            if not elists[0] or not elists[1]:
                raise ValueError(
                    'No Unique ZMW subreads for one or both parents!')
            nextp = 0  # Index of next parent to use
            if elists[0][0]['Len'] < elists[1][0]['Len']: nextp = 1

            ### ~ [1] Filter and Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [1a] Filter Unique Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            zmwlist = []  # List of (smrt,zmw) meeting filtering criteria
            ux = 0.0
            utot = len(elists[0]) + len(elists[1])
            while lastlen:
                self.progLog('\r#DIP',
                             'Diploidising subreads: %.2f%%' % (ux / utot))
                elist = elists[nextp]
                while elist and elist[0]['RQ'] < minrq:
                    elist.pop(0)
                    ux += 100.0
                if elist and elist[0]['Len'] < minlen:
                    ux += 100.0 * len(elist)
                    elist = []
                if not elist:
                    nextp = 1 - nextp
                    break  # Finish
                entry = elist.pop(0)
                ux += 100.0
                zmwlist.append((entry['SMRT'], entry['ZMW'], entry['Pos']))
                plen[nextp] += entry['Len']
                prq[nextp] += entry['RQ']
                pseq[nextp] += 1
                if plen[1 - nextp] <= plen[nextp]: nextp = 1 - nextp
                lastlen = entry['Len']
            ## ~ [1b] Final processing of last reads ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            while elists[nextp]:
                elist = elists[nextp]
                while elist and elist[0]['RQ'] < minrq:
                    self.progLog('\r#DIP',
                                 'Diploidising subreads: %.2f%%' % (ux / utot))
                    elist.pop(0)
                    ux += 100.0
                while elist and elist[0]['Len'] >= minlen:
                    self.progLog('\r#DIP',
                                 'Diploidising subreads: %.2f%%' % (ux / utot))
                    entry = elist.pop(0)
                    ux += 100.0
                    pdiff = rje.modulus(plen[0] - plen[1])
                    ediff = rje.modulus(plen[nextp] + entry['Len'] -
                                        plen[1 - nextp])
                    if ediff >= pdiff:
                        elists[nextp] = []
                        break  #Finish!
                    zmwlist.append((entry['SMRT'], entry['ZMW'], entry['Pos']))
                    plen[nextp] += entry['Len']
                    prq[nextp] += entry['RQ']
                    pseq[nextp] += 1
            self.printLog(
                '\r#DIP',
                'Diploidising subreads complete: %s subreads to output.' %
                rje.iLen(zmwlist))
            self.printLog(
                '\r#DIP', '%s: %s seq; %s bp (%.1fX); %.3f mean RQ.' %
                (self.getStr('Parent1'), rje.iStr(pseq[0]), rje.iStr(plen[0]),
                 1.0 * plen[0] / self.getInt('GenomeSize'), prq[0] / pseq[0]))
            self.printLog(
                '\r#DIP', '%s: %s seq; %s bp (%.1fX); %.3f mean RQ.' %
                (self.getStr('Parent2'), rje.iStr(pseq[1]), rje.iStr(plen[1]),
                 1.0 * plen[1] / self.getInt('GenomeSize'), prq[1] / pseq[1]))
            ## ~ [1c] Extract Filtered Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            rje.backup(self, filtfile)
            SEQOUT = open(filtfile, 'w')
            sx = 0.0
            stot = 0
            sn = len(seqbatch)
            fx = 0
            for seqlist in seqbatch:
                #>m150625_001530_42272_c100792502550000001823157609091582_s1_p0/9/0_3967 RQ=0.784
                si = 100.0 / seqlist.seqNum()
                stot += seqlist.seqNum()
                for seq in seqlist.seqs():
                    self.progLog('\r#OUT',
                                 'Extracting subreads: %.2f%%' % (sx / sn))
                    sx += si
                    (name, sequence) = seqlist.getSeq(seq)
                    try:
                        [smrt, zmw, pos,
                         rq] = string.split(string.replace(name, '/', ' '))
                    except:
                        [smrt, zmw,
                         pos] = string.split(string.replace(name, '/', ' '))
                        rq = minrq
                    if (cdb.data(smrt)['SMRT'], int(zmw), pos) not in zmwlist:
                        continue
                    SEQOUT.write('>%s\n%s\n' % (name, sequence))
                    fx += 1
            self.printLog(
                '\r#OUT',
                'Saved %s filtered subreads to %s.' % (rje.iStr(fx), filtfile))

            ### ~ [2] Summarise Filtered File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            seqcmd = self.cmd_list + [
                'seqmode=file', 'autoload=T', 'summarise=T',
                'seqin=%s' % filtfile, 'autofilter=F'
            ]
            rje_seqlist.SeqList(self.log, seqcmd)

            return True
        except:
            self.errorLog('%s.run error' % self.prog())
            return False
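A toy sketch of the balanced selection loop described in the inSilicoHybrid() docstring above, working on plain (length, RQ) tuples rather than SMRTSCAPE tables and omitting the final top-up step (all numbers are invented):

def diploidise(parent1, parent2, minlen=500, minrq=0.84):
    '''Alternate picks between parents, always drawing from the one with fewer cumulative bases.'''
    elists = [sorted(parent1, reverse=True), sorted(parent2, reverse=True)]   # longest first
    plen = [0, 0]                                           # cumulative bases per parent
    picked = [[], []]
    nextp = 0 if elists[0][0][0] <= elists[1][0][0] else 1  # start with the shorter of the two longest reads
    while True:
        elist = elists[nextp]
        while elist and elist[0][1] < minrq: elist.pop(0)   # drop low-quality reads
        if not elist or elist[0][0] < minlen: break         # this parent is exhausted
        length, rq = elist.pop(0)
        picked[nextp].append((length, rq))
        plen[nextp] += length
        if plen[1 - nextp] <= plen[nextp]: nextp = 1 - nextp   # switch to the parent lagging behind
    return picked, plen

p1 = [(9000, 0.90), (7000, 0.85), (3000, 0.80)]
p2 = [(8000, 0.88), (6500, 0.86), (6000, 0.90)]
print(diploidise(p1, p2)[1])     # cumulative bases per parent: [16000, 20500]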
Ejemplo n.º 33
0
    def mapPhosByBLAST(self,fasfile):   ### BLAST sequences against phosphoDB, align hits & mark sites (ID & Homology)
        '''BLAST sequences against phosphoDB, align hits and mark phosphosites (ID & Homology).'''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [1a] Setup fasfile ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            scmd = self.cmd_list + ['seqin=%s' % fasfile,'autoload=T','autofilter=F']
            qseqlist = rje_seq.SeqList(self.log,scmd)
            qdict = qseqlist.seqNameDic()
            ## ~ [1b] Setup results files/directories ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            basefile = rje.baseFile(fasfile)
            if self.info['PhosRes'].lower() in ['','none']: self.info['PhosRes'] = '%s.phosres.tdt' % basefile
            headers = ['Name','Pos','AA','PELM','PELMPos','Evidence']
            delimit = rje.getDelimit(self.cmd_list,rje.delimitFromExt(filename=self.info['PhosRes']))
            rje.delimitedFileOutput(self,self.info['PhosRes'],headers,delimit,rje_backup=True)
            ppath = rje.makePath('PhosALN')
            rje.mkDir(self,ppath)
            ## ~ [1c] Setup BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            pblast = rje_blast.BLASTRun(self.log,self.cmd_list+['formatdb=F'])
            pblast.setInfo({'Name':'%s.p.blast' % rje.baseFile(fasfile),'DBase':self.info['PELMFas'],'InFile':fasfile})
            pblast.setStat({'HitAln':pblast.stat['OneLine']})
            pblast.opt['Complexity Filter'] = False
            pblast.formatDB(force=False)
            ## ~ [1d] Setup GABLAM Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            gkey = 'GABLAMO ID' #x# % self.info['GABLAMO Key']
            for g in ['ID','Hom']:
                if self.stat['%sSim' % g] < 1.0: self.stat['%sSim' % g] *= 100.0
                self.stat['%sSim' % g] = max(0.0,self.stat['%sSim' % g])

            ### ~ [2] PhosphoBLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            pblast.blast(use_existing=True,log=True)    # BLAST
            pblast.readBLAST(gablam=True)               # Read in
            while pblast.search:
                ## ~ [2a] Align relevant hits from each BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                search = pblast.search.pop(0)
                qseq = qdict[search.info['Name']]
                idlist = []
                qlen = qseq.aaLen()
                hitdict = search.hitSeq(self.obj['SeqList'])
                aln = rje_seq.SeqList(self.log,self.cmd_list+['autoload=F','autofilter=F'])
                aln.seq = [qseq]
                pdict = {}      # Dictionary of {hseq:[poslist]}
                rdict = {qseq:0}      # Dictionary of {hseq:res}
                for hit in search.hit[0:]:
                    hseq = hitdict[hit]
                    pdict[hseq] = []
                    for pos in rje.sortKeys(self.dict['PhosphoSites'][hseq.info['AccNum']]): pdict[hseq].append(pos)
                    if hit.info['Name'] == search.info['Name']:
                        if qseq.getSequence(case=False,gaps=False) != hseq.getSequence(case=False,gaps=False):
                            self.log.errorLog('Major problem: Search/Hit sequence mismatch for same sequence "%s"' % hit.info['Name'])
                        idlist.append(qseq)
                        pdict[qseq] = pdict.pop(hseq)
                        continue
                    gdict = hit.globalFromLocal(qlen)
                    qvh = float(100 * gdict['Query'][gkey]) / float(qlen)
                    if qvh < self.stat['HomSim']:
                        pdict.pop(hseq)
                        continue
                    aln.seq.append(hseq)
                    if (qseq.sameSpec(hseq) or not self.opt['UseSpec']) and qvh >= self.stat['IDSim']: idlist.append(hseq)
                    rdict[hseq] = 0
                aln.muscleAln()   #x#outfile='%s%s.phosaln.fas' % (ppath,qseq.info['AccNum']))
                aln._addSeq('PhosAln','-' * qseq.seqLen())
                aln.info['Name'] = '%s%s.phosaln.fas' % (ppath,qseq.info['AccNum'])
                ## ~ [2b] Map phosphorylations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                print '>>>\n', aln.seq, pdict.keys(), rdict.keys()
                for a in range(qseq.seqLen()):
                    if qseq.info['Sequence'][a] != '-': rdict[qseq] += 1
                    for hseq in pdict:
                        if hseq.info['Sequence'][a] == '-': continue
                        if hseq != qseq: rdict[hseq] += 1
                        if rdict[hseq] in pdict[hseq] and qseq.info['Sequence'][a] == hseq.info['Sequence'][a]:  # Phosphosite
                            pdata = {'Name':search.info['Name'],'Pos':rdict[qseq],'AA':qseq.info['Sequence'][a],
                                     'PELM':hseq.shortName(),'PELMPos':rdict[hseq],'Evidence':'Hom'}
                            if hseq == qseq: pdata['Evidence'] = 'Self'
                            elif hseq in idlist: pdata['Evidence'] = 'ID'
                            rje.delimitedFileOutput(self,self.info['PhosRes'],headers,delimit,pdata)
                            self.addPhos(aln.seq[-1],a,pdata['Evidence'])
                ## ~ [2c] Add Scansite/NetPhos if made? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                ## ~ [2d] Save alignment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                aln.saveFasta()


            # Align hits for each > X %ID
            # Map phosphosites onto alignment and output #
            
            return
        except: self.log.errorLog('Problem during PhosphoSeq.mapPhosByBLAST')
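The phosphosite transfer above relies on walking alignment columns while keeping a per-sequence residue counter. A self-contained sketch of that bookkeeping on a toy alignment (sequences and the phosphosite position are invented):

query = 'MKT-SPR'        # aligned query
hit   = 'MKTQS-R'        # aligned hit
hit_sites = [5]          # phosphosite positions in the ungapped hit (1-based, invented)

qres = hres = 0          # residue counters (the role of rdict above)
mapped = []              # query positions receiving a transferred site
for a in range(len(query)):
    if query[a] != '-': qres += 1
    if hit[a] == '-': continue
    hres += 1
    if hres in hit_sites and query[a] == hit[a]:   # same residue in both -> transfer the site
        mapped.append(qres)
print(mapped)            # [4]: hit residue 5 (S) maps onto query residue 4 (S)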
Ejemplo n.º 34
0
 def batchRun(self, returnobj=False):  ### Execute batch mode runs
     '''Execute batch mode runs.'''
     try:  ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         barg = self.getStrLC('BatchArg')
         if not barg:
             raise ValueError(
                 'Cannot use batchrun=FILELIST if batcharg=None.')
         batchfiles = self.list['BatchRun'][0:]
         self.list['BatchRun'] = []  # Avoid recursive running!
         blog = self.getStr('BatchLog')
         if not blog.startswith('.'): blog = '.%s' % blog
         if not blog.endswith('.log'): blog = '%s.log' % blog
         rawcmd = self.cmd_list[0:]
         rawlog = self.log
         batchobj = []
         ### ~ [1] Batch Run ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         bx = 0
         for bfile in batchfiles:
             bx += 1
             self.printLog(
                 '#BATCH', 'Batch running %s of %s: %s=%s' %
                 (rje.iStr(bx), rje.iLen(batchfiles), barg, bfile))
             ## Setup parameters
             bbase = rje.baseFile(bfile, strip_path=True)
             bcmd = ['%s=%s' % (barg, bfile)]
             if self.getBool('BatchBase'):
                 if blog == '.log': bcmd += ['basefile=%s' % bbase]
                 else:
                     bcmd += ['basefile=%s%s' % (bbase, rje.baseFile(blog))]
             elif self.getStrLC('BatchLog'):
                 bcmd += ['log=%s%s' % (bbase, blog)]
             else:
                 bcmd += ['newlog=F']
             #self.debug(bcmd)
             ## Setup Seqsuite object
             self.cmd_list = rawcmd + bcmd
             self.log = rje.setLog(
                 self.log.obj['Info'], self, self.cmd_list
             )  # Sets up Log object for controlling log file output
             ## Run
             batchobj.append(self.run())
             ## Finish and Tidy
             self.log = rawlog
             runobj = batchobj[-1]
             if runobj:
                 if not returnobj: batchobj[-1] = True
                 info = runobj.log.obj['Info']
                 self.printLog(
                     '#RUN',
                     '%s V%s run finished.' % (info.program, info.version))
             else:
                 self.warnLog('Batch run failed (%s=%s).' % (barg, bfile))
         ### ~ [2] Finish and Return ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         failx = batchobj.count(False)
         self.printLog(
             '#BATCH', '%s batch runs complete: %s failed.' %
             (rje.iLen(batchfiles), rje.iStr(failx)))
         self.list['BatchRun'] = batchfiles
         return batchobj
     except:
         self.errorLog('%s.batchRun error' % self)
         return False
Ejemplo n.º 35
0
 def setup(self, gtext=''):  ### Main class setup method. gtext will over-ride input file.
     '''Main class setup method. gtext will over-ride input file.'''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.obj['HTML'] = rje_html.HTML(self.log, self.cmd_list)
         ## ~ [1a] File names etc. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if self.basefile().lower() in ['', 'none']:
             self.basefile(rje.baseFile(self.getStr('InFile')))
         if self.getStr('OutFile').lower() in ['', 'none']:
             self.str['OutFile'] = '%s.html' % self.basefile()
         ## ~ [1b] Read in Glossary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         interms = []
         if gtext:
             delimit = self.getStr('TermSplit')
             if delimit.lower() == 'tab': delimit = '\t'
             if delimit.lower() == 'space': delimit = ' '
             if delimit.lower() == 'comma': delimit = ','
             if delimit.lower() == 'period (.)': delimit = '.'
             if delimit.lower() == 'colon': delimit = ':'
             glossary = {}
             for line in string.split(gtext, '\n'):
                 splitline = string.split(line, delimit)
                 if delimit == '.' and (splitline[-1] in ['', ' ']):
                     splitline = splitline[:-1]
                 if not splitline: continue
                 (term, definition) = (splitline[0],
                                       string.join(splitline[1:], delimit))
                 if term == 'Term' and not glossary: continue
                 if term:
                     glossary[term] = {'Definition': definition}
                     interms.append(term)
         else:
             try:
                 if not self.getBool('KeepOrder') and open(
                         self.getStr('InFile'),
                         'r').readline()[:4] == 'Term':
                     glossary = rje.dataDict(
                         self,
                         self.getStr('InFile'),
                         mainkeys=['Term'],
                         datakeys=['Term', 'Definition'])
                 else:
                     return self.setup(
                         open(self.getStr('InFile'), 'r').read())
             except:
                 self.errorLog(
                     'Problem reading input as dataDict(). Will try as text.'
                 )
                 return self.setup(open(self.getStr('InFile'), 'r').read())
         if self.list['Terms']:
              for term in list(glossary.keys()):   # iterate over a copy: popping while iterating the dict raises RuntimeError
                  if term not in self.list['Terms']: glossary.pop(term)
         elif self.getBool('KeepOrder'): self.list['Terms'] = interms
         else: self.list['Terms'] = rje.sortKeys(glossary)
         for term in glossary:
             glossary[term] = glossary[term]['Definition']
         ### ~ [2] Create Full Glossary Dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         nested = {}
         for term in glossary:
             tdict = nested
             for word in string.split(term.lower()):
                 if word not in tdict: tdict[word] = {}
                 tdict = tdict[word]
             tdict['='] = glossary[term]
         self.dict['Glossary'] = nested
         return True  # Setup successful
     except:
         self.errorLog('Problem during %s setup.' % self)
         return False  # Setup failed
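The nested dictionary built at the end of setup() is a word-level trie with '=' marking a complete term. A small sketch of building and querying such a structure with toy terms:

glossary = {'open reading frame': 'A stretch of codons uninterrupted by a stop codon.',
            'reading frame': 'One of six ways to translate a nucleotide sequence.'}

nested = {}
for term, definition in glossary.items():
    tdict = nested
    for word in term.lower().split():
        tdict = tdict.setdefault(word, {})      # descend one word at a time
    tdict['='] = definition                     # '=' leaf holds the definition

def lookup(nested, term):
    tdict = nested
    for word in term.lower().split():
        if word not in tdict: return None
        tdict = tdict[word]
    return tdict.get('=')

print(lookup(nested, 'reading frame'))          # the second definition above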
Ejemplo n.º 36
0
 def setup(self):    ### Main class setup method.
     '''Main class setup method.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.obj['DB'] = rje_db.Database(self.log,self.cmd_list+['tuplekeys=T'])
         if self.baseFile().lower() in ['','none']: self.baseFile('%s.vs.%s.Q%d' % (rje.baseFile(self.getStr('MutPileup'),True),rje.baseFile(self.getStr('WTPileup'),True),self.getInt('QCut')))
         if not self.force() and os.path.exists('%s.fdr.tdt' % self.baseFile()): return
         ### ~ [2] Look for/process WT Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self.force() or not os.path.exists('%s.WT.tdt' % self.baseFile()): self.parsePileup('WT',self.getStr('WTPileup'))
         ### ~ [3] Generate Reference sequences and Major Alleles (by locus) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         refseq = {}; rx = 0
         majors = {}
         locus = None
         WTDATA = open('%s.WT.tdt' % self.baseFile(),'r'); wx = 0
         for line in WTDATA:
             self.progLog('\r#WT','Reading WT data: Reference seq length = %s nt' % (rje.iStr(rx)),rand=0.01)
             data = rje.readDelimit(line); wx += 1
             if data[0] == 'Locus': continue
             else:
                 if data[0] != locus: locus = data[0]; refseq[locus] = ''; majors[locus] = []
                 pos = int(data[1])
                 while (pos - 1) > len(refseq[locus]): refseq[locus] += '?'; rx += 1
                 while (pos - 1) > len(majors[locus]): majors[locus].append('-')
                 refseq[locus] += data[2]; majors[locus].append(data[5]); rx += len(data[2])
         WTDATA.close()
         self.printLog('\r#WT','%s lines read from WT data: Reference seq length = %s nt' % (rje.iStr(wx),rje.iStr(rx)))
         for locus in rje.sortKeys(majors):
             if len(majors[locus]) != len(refseq[locus]): self.errorLog('%s WTMajor versus RefSeq length mismatch!' % locus,printerror=False); raise ValueError
         self.dict['WTMajor'] = majors
         self.dict['RefSeq'] = refseq
          ### ~ [4] Look for/process Mutant Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self.force() or not os.path.exists('%s.Mut.tdt' % self.baseFile()): self.parsePileup('Mut',self.getStr('MutPileup'),True)
         return True     # Setup successful
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
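The WT parsing above rebuilds each locus reference by padding unseen positions with '?' (and the major-allele list with '-') before appending the observed base. A toy sketch of that padding on invented (locus, pos, ref, major) rows:

rows = [('chr1', 1, 'A', 'A'), ('chr1', 2, 'C', 'T'), ('chr1', 5, 'G', 'G')]   # positions are 1-based
refseq, majors = {}, {}
for locus, pos, ref, major in rows:
    refseq.setdefault(locus, '')
    majors.setdefault(locus, [])
    while (pos - 1) > len(refseq[locus]): refseq[locus] += '?'       # pad uncovered reference positions
    while (pos - 1) > len(majors[locus]): majors[locus].append('-')  # pad uncovered major alleles
    refseq[locus] += ref
    majors[locus].append(major)
print(refseq['chr1'])    # 'AC??G'
print(majors['chr1'])    # ['A', 'T', '-', '-', 'G']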
Ejemplo n.º 37
0
 def treeListSPCode(self):  ### Main taxa mapping from list of tree files
     '''Main taxa mapping from list of tree files.'''
     try:  ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         db = self.db()
         specdb = self.db('spcode',
                          add=True,
                          forcecheck=True,
                          mainkeys=['protein'])
         if not specdb and self.getStrLC('TaxBase') and not self.force():
             spfile = '%s.spcode.tdt' % self.getStr('TaxBase')
             specdb = db.addTable(spfile,
                                  mainkeys=['protein'],
                                  name='spcode',
                                  expect=False)
         if specdb:
             specdb.dataFormat({'boot': 'num'})
             return True
         specdb = db.addEmptyTable(
             'spcode',
             ['protein', 'boot', 'spcode', 'inpara', 'paralogues'],
             ['protein'])
         #dupdb = db.addEmptyTable('para',['protein','paralogues'],['protein'])
         self.dict['Duplicates'] = {}  # {prot1:[dups]}
         ### ~ [2] ~ Add main run code here ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for nwkfile in self.list['NwkList']:
             tree = rje_tree.Tree(self.log, self.cmd_list)
             tree.loadTree(nwkfile, seqlist=None, postprocess=False)
             seqacc = rje.baseFile(nwkfile, strip_path=True)
             # Identify node corresponding to query sequence
             seqnode = None
             for node in tree.nodes():
                 try:
                     if string.split(node.shortName(), '__')[1] == seqacc:
                         seqnode = node
                 except:
                     pass  # Internal node or bad sequence format
             if not seqnode:
                 self.warnLog('Could not find %s in %s nodes!' %
                              (seqacc, nwkfile))
                 continue
             # Get species code for query sequence
             seqspec = tree.cladeSpec(seqnode)
             if len(seqspec) != 1:
                 self.warnLog('Could not find species in %s node!' %
                              (seqacc))
                 continue
             seqspec = seqspec.keys()[0]
             if seqspec != string.split(seqnode.shortName(), '_')[1]:
                 raise ValueError('Species mismatch for %s & %s' %
                                  (seqacc, seqnode.shortName()))
             # Find ancestor with closest orthologue outgroup
             rootnode = tree._getRootNode()
             if not rootnode:
                 self.warnLog('Could not find root node in %s!' % (nwkfile))
                 continue
             ancnode = seqnode.ancNode()
             try:
                 bootx = float(ancnode.ancBranch().stat['Bootstrap']
                               ) / tree.stat['Bootstraps']
             except:
                 bootx = 1.0
             inparanode = None  # Node to define in-paralogues
             ancspec = tree.cladeSpec(ancnode)
             while len(ancspec) < 2 or bootx < self.getNum('MinBoot'):
                 inparanode = ancnode  # All same species
                 if ancnode == rootnode: break
                 ancnode = ancnode.ancNode()
                 ancspec = tree.cladeSpec(ancnode)
                 try:
                     bootx = float(ancnode.ancBranch().stat['Bootstrap']
                                   ) / tree.stat['Bootstraps']
                 except:
                     bootx = 1.0
             ancspec.pop(
                 seqspec)  # Now only have counts of closest other species
             # Update table, replacing species codes with genera?
             sentry = {
                 'protein': seqacc,
                 'spcode': rje.sortUnique(ancspec.keys())
             }
             sentry['boot'] = bootx
             if not ancspec:
                 sentry['spcode'] = ['None']
                 sentry['boot'] = self.getNum('NoneBoot')
             sentry['spcode'] = string.join(sentry['spcode'], '|')
             # Establish list of duplicate proteins
             inpara = []  # List of in-paralogue nodes
             inparacc = []  # List of in-paralogue accnum
             if inparanode:
                 inpara = tree._nodeClade(inparanode, internal=False)
             self.dict['Duplicates'][seqacc] = []
             for node in tree._nodeClade(rootnode, internal=False):
                 if node == seqnode: continue
                 if len(string.split(node.shortName(), '_')) < 2: continue
                 if string.split(node.shortName(), '_')[1] == seqspec:
                     paracc = string.split(node.shortName(), '__')[1]
                     if node in inpara: inparacc.append(paracc)
                     else: self.dict['Duplicates'][seqacc].append(paracc)
             sentry['inpara'] = string.join(inparacc, '|')
             sentry['paralogues'] = string.join(
                 self.dict['Duplicates'][seqacc], '|')
             specdb.addEntry(sentry)
         ## Update specdb and save
         specdb.saveToFile()
         #dupdb.saveToFile()
         return True
     except:
         self.errorLog(self.zen())
         return False
Ejemplo n.º 38
0
    def _pepDis(self):  ### Peptide Distance
        '''
        Peptide Distance.
        '''
        try:
            ### <0> ### Setup
            seqlist = rje_seq.SeqList(self.log, self.cmd_list + ['autoload=T'])
            dismatrix = rje_dismatrix.DisMatrix(self.log, self.cmd_list)
            dismatrix.info['Name'] = self.info['Method']
            dismatrix.opt['Symmetric'] = True
            if self.info['Method'] in ['ds_prop', 'tot_prop', 'best_prop']:
                aaprop = rje_aaprop.AAPropMatrix(self.log, self.cmd_list)
                #aaprop.readAAProp()
                aaprop.makePropDif()
            elif self.info['Method'] == 'pam':
                pam = rje_pam.PamCtrl(log=self.log, cmd_list=self.cmd_list)
            ### <1> ### Make DisMatrix
            for seq1 in seqlist.seq:
                for seq2 in seqlist.seq:
                    if seqlist.seq.index(seq1) > seqlist.seq.index(seq2):  # No need to calculate - symmetrical!
                        continue
                    dis = 0
                    if seq1 == seq2 and self.info['OutMatrix'] == 'phylip':
                        dis = 0
                    elif self.info['Method'] in ['ds_prop', 'ds_id']:
                        (self_dis1, self_dis2) = (0, 0)
                        for r1 in range(seq1.seqLen()):
                            for r2 in range(r1, seq2.seqLen()):
                                (a1, a2) = (seq1.info['Sequence'][r1],
                                            seq2.info['Sequence'][r2])
                                (s1, s2) = (seq1.info['Sequence'][r2],
                                            seq2.info['Sequence'][r1])
                                phys_dis = r2 - r1
                                if self.info['Method'] == 'ds_prop':
                                    dis += (aaprop.pdif['%s%s' % (a1, a2)] *
                                            (seq1.seqLen() - phys_dis))
                                    self_dis1 += (aaprop.pdif['%s%s' %
                                                              (a1, s1)] *
                                                  (seq1.seqLen() - phys_dis))
                                    self_dis2 += (aaprop.pdif['%s%s' %
                                                              (a2, s2)] *
                                                  (seq1.seqLen() - phys_dis))
                                elif self.info['Method'] == 'ds_id' and a1 != a2:
                                    dis += (seq1.seqLen() - phys_dis)
                                if self.info['Method'] == 'ds_id' and a1 != s1:
                                    self_dis1 += (seq1.seqLen() - phys_dis)
                                if self.info['Method'] == 'ds_id' and a2 != s2:
                                    self_dis2 += (seq1.seqLen() - phys_dis)
                        dis -= (self_dis1 + self_dis2) / 2.0
                    elif self.info['Method'] == 'tot_prop':
                        proptot = {}
                        for property in aaprop.prop.keys():
                            proptot[property] = {seq1: 0.0, seq2: 0.0}
                        for seq in [seq1, seq2]:
                            for r in range(seq.seqLen()):
                                aa = seq.info['Sequence'][r]
                                for property in aaprop.prop.keys():
                                    proptot[property][seq] += string.atof(
                                        aaprop.prop[property][aa])
                        for property in aaprop.prop.keys():
                            if proptot[property][seq1] > proptot[property][
                                    seq2]:
                                dis += (proptot[property][seq1] -
                                        proptot[property][seq2])
                            else:
                                dis += (proptot[property][seq2] -
                                        proptot[property][seq1])
                    elif self.info['Method'] == 'pam':
                        dis = pam.pamML(ancseq=seq1.info['Sequence'],
                                        descseq=seq2.info['Sequence'])
                    elif self.info['Method'] == 'best_prop':
                        min_dis = seq1.seqLen() * len(aaprop.prop)
                        pepseq1 = seq1.info['Sequence']
                        for c in range(seq1.seqLen()):  # Circular start
                            dis = 0
                            pepseq2 = seq2.info['Sequence'][c:] + seq2.info['Sequence'][:c]
                            for r in range(seq1.seqLen()):
                                (a1, a2) = (pepseq1[r], pepseq2[r])
                                dis += aaprop.pdif['%s%s' % (a1, a2)]
                            if dis < min_dis:
                                min_dis = dis
                        dis = min_dis
                    dismatrix.addDis(seq1, seq2, dis)
            ### <2> ### Output
            if self.info['OutMatrix'] == 'phylip':
                delimit = ' '
                format = 'phylip'
            else:
                delimit = rje.getDelimit(self.cmd_list, ',')
                format = 'None'
            outfile = '%s.%s.%s' % (rje.baseFile(
                seqlist.info['Name'],
                True), self.info['Method'], rje.delimitExt(delimit))
            dismatrix.saveMatrix(seqlist.seq, outfile, delimit, format=format)

        except:
            self.log.errorLog('Error in _pepDis',
                              printerror=True,
                              quitchoice=False)
            raise  # Delete this if method error not terrible
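The 'best_prop' branch above scores every circular rotation of one peptide against the other and keeps the minimum. A toy version for equal-length peptides, with pdif as a hypothetical stand-in for the AA property difference table used above:

pdif = {}    # toy symmetric residue-pair differences: 0 if identical, 1 otherwise
for a1 in 'ACDEFGHIKLMNPQRSTVWY':
    for a2 in 'ACDEFGHIKLMNPQRSTVWY':
        pdif[a1 + a2] = 0 if a1 == a2 else 1

def best_prop(pep1, pep2):
    '''Minimum summed pair difference over all circular rotations of pep2.'''
    best = None
    for c in range(len(pep1)):                   # circular start, as in the method above
        rotated = pep2[c:] + pep2[:c]
        dis = sum(pdif[a1 + a2] for a1, a2 in zip(pep1, rotated))
        if best is None or dis < best: best = dis
    return best

print(best_prop('ACDE', 'CDEA'))                 # 0: the second peptide is a rotation of the first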
Ejemplo n.º 39
0
 def forking(self):  ### Keeps forking out and processing jobs until no more jobs in self.list['Forked'].
     '''Keeps forking out and processing jobs until no more jobs in self.list['Forked'].'''
     ### ~ [1] ~ Start first set of jobs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     if self.getBool('PIDCheck') or self.dev():
         pidcheck = '%s.pid' % rje.baseFile(
             self.log.info['LogFile'])  # Set *.pid object to match log
     else:
         pidcheck = None
     #self.deBug(pidcheck)
     ### ~ [2] ~ Monitor jobs and set next one running as they finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     while self.list['Forked']:
         if not self.getBool('LogFork'):
             self.progLog(
                 '\r#FORK', 'Forking jobs: {0} running; {1} remain.'.format(
                     len(self.list['Forked']),
                     rje.iLen(self.list['ToFork'])))
         if pidcheck: PIDCHECK = open(pidcheck, 'w')
         for fdict in self.list['Forked'][0:]:
             try:
                 pid = fdict['PID']
                 if pidcheck:
                     PIDCHECK.write('%s: %s\n' %
                                    (self.list['Forked'].index(fdict), pid))
                 if string.split('%s' % pid)[0] == 'WAIT': status = 1
                 else: (status, exit_stat) = os.waitpid(pid, os.WNOHANG)
             except:
                 self.errorLog('!')
                 status = 1
             if status > 0:
                 self.list['Forked'].remove(fdict)
                 self.endFork(
                     fdict
                 )  # Fork has finished: can replace with processing
         if pidcheck:
             PIDCHECK.close()
             #self.deBug(open(pidcheck,'r').read())
         ## ~ [2a] Look for eternal hanging of threads ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if time.time() - self.getNum('KillTime') > self.getNum(
                 'KillForks'):
             self.verbose(
                 0, 1,
                 '\n%d seconds of main thread inactivity. %d forks still active!'
                 % (self.getNum('KillForks'), len(self.list['Forked'])), 1)
             for fdict in self.list['Forked']:
                 self.verbose(
                     0, 2, ' => Fork %s, PID %d still Active!' %
                     (fdict['ID'], fdict['PID']), 1)
             if (self.i() < 0 and self.getBool('KillMain')) or rje.yesNo(
                     'Kill Main Thread?',
                     default={
                         True: 'N',
                         False: 'Y'
                     }[self.getBool('KillMain')]):
                 raise ValueError(
                     '%d seconds of main thread inactivity. %d forks still active!'
                     % (self.getNum('KillForks'), len(self.list['Forked'])))
             elif self.i() < 0 or rje.yesNo('Kill hanging forks?'):
                 self.printLog(
                     '#KILL', 'KillForks=%d seconds walltime reached.' %
                     (self.getNum('KillForks')))
                 for fdict in self.list['Forked']:
                     self.printLog(
                         '#KILL', 'Killing Fork %s, PID %d.' %
                         (fdict['ID'], fdict['PID']))
                     os.system('kill %d' % fdict['PID'])
             else:
                 self.setNum({'KillTime': time.time()})
         ## ~ [2b] Sleep ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         time.sleep(self.getNum('ForkSleep'))
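
A minimal, self-contained sketch of the polling pattern forking() relies on: fork children, then check each PID with os.waitpid(pid, os.WNOHANG) so the parent never blocks, sleeping briefly between sweeps. POSIX-only, and the "job" (a one-second sleep) is purely illustrative, not part of the RJE forker.

import os, time

def poll_children(pids, sleep=0.5):
    '''Poll forked PIDs without blocking; return once every child has exited.'''
    pids = list(pids)
    while pids:
        for pid in pids[:]:
            finished, exit_stat = os.waitpid(pid, os.WNOHANG)   # (0, 0) while the child is still running
            if finished:                                        # first element is the PID once it has exited
                pids.remove(pid)
        time.sleep(sleep)                                       # the ForkSleep-style pause between polls

if __name__ == '__main__':
    kids = []
    for i in range(3):
        pid = os.fork()
        if pid == 0:            # child: pretend to do a job, then exit cleanly
            time.sleep(1)
            os._exit(0)
        kids.append(pid)        # parent: track the child PID
    poll_children(kids)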
Ejemplo n.º 40
0
    def mapPhosByBLAST(
        self, fasfile
    ):  ### BLAST sequences against phosphoDB, align hits & mark sites (ID & Homology)
        '''BLAST sequences against phosphoDB, align hits and mark phosphosites (ID & Homology).'''
        try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [1a] Setup fasfile ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            scmd = self.cmd_list + [
                'seqin=%s' % fasfile, 'autoload=T', 'autofilter=F'
            ]
            qseqlist = rje_seq.SeqList(self.log, scmd)
            qdict = qseqlist.seqNameDic()
            ## ~ [1b] Setup results files/directories ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            basefile = rje.baseFile(fasfile)
            if self.info['PhosRes'].lower() in ['', 'none']:
                self.info['PhosRes'] = '%s.phosres.tdt' % basefile
            headers = ['Name', 'Pos', 'AA', 'PELM', 'PELMPos', 'Evidence']
            delimit = rje.getDelimit(
                self.cmd_list,
                rje.delimitFromExt(filename=self.info['PhosRes']))
            rje.delimitedFileOutput(self,
                                    self.info['PhosRes'],
                                    headers,
                                    delimit,
                                    rje_backup=True)
            ppath = rje.makePath('PhosALN')
            rje.mkDir(self, ppath)
            ## ~ [1c] Setup BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            pblast = rje_blast.BLASTRun(self.log,
                                        self.cmd_list + ['formatdb=F'])
            pblast.setInfo({
                'Name': '%s.p.blast' % rje.baseFile(fasfile),
                'DBase': self.info['PELMFas'],
                'InFile': fasfile
            })
            pblast.setStat({'HitAln': pblast.stat['OneLine']})
            pblast.opt['Complexity Filter'] = False
            pblast.formatDB(force=False)
            ## ~ [1d] Setup GABLAM Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            gkey = 'GABLAMO ID'  #x# % self.info['GABLAMO Key']
            for g in ['ID', 'Hom']:
                if self.stat['%sSim' % g] < 1.0:
                    self.stat['%sSim' % g] *= 100.0
                self.stat['%sSim' % g] = max(0.0, self.stat['%sSim' % g])

            ### ~ [2] PhosphoBLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            pblast.blast(use_existing=True, log=True)  # BLAST
            pblast.readBLAST(gablam=True)  # Read in
            while pblast.search:
                ## ~ [2a] Align relevant hits from each BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                search = pblast.search.pop(0)
                qseq = qdict[search.info['Name']]
                idlist = []
                qlen = qseq.aaLen()
                hitdict = search.hitSeq(self.obj['SeqList'])
                aln = rje_seq.SeqList(
                    self.log, self.cmd_list + ['autoload=F', 'autofilter=F'])
                aln.seq = [qseq]
                pdict = {}  # Dictionary of {hseq:[poslist]}
                rdict = {qseq: 0}  # Dictionary of {hseq:res}
                for hit in search.hit[0:]:
                    hseq = hitdict[hit]
                    pdict[hseq] = []
                    for pos in rje.sortKeys(
                            self.dict['PhosphoSites'][hseq.info['AccNum']]):
                        pdict[hseq].append(pos)
                    if hit.info['Name'] == search.info['Name']:
                        if qseq.getSequence(case=False,
                                            gaps=False) != hseq.getSequence(
                                                case=False, gaps=False):
                            self.log.errorLog(
                                'Major problem: Search/Hit sequence mismatch for same sequence "%s"'
                                % hit.info['Name'])
                        idlist.append(qseq)
                        pdict[qseq] = pdict.pop(hseq)
                        continue
                    gdict = hit.globalFromLocal(qlen)
                    qvh = float(100 * gdict['Query'][gkey]) / float(qlen)
                    if qvh < self.stat['HomSim']:
                        pdict.pop(hseq)
                        continue
                    aln.seq.append(hseq)
                    if (qseq.sameSpec(hseq) or not self.opt['UseSpec']
                        ) and qvh >= self.stat['IDSim']:
                        idlist.append(hseq)
                    rdict[hseq] = 0
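
The hit filtering above reduces to one calculation: convert the GABLAM identity count into a percentage of the query length (qvh) and compare it against the homology and identity cut-offs. A stripped-down sketch of that decision; the cut-off values and counts here are invented (the real ones come from the HomSim/IDSim settings and the BLAST/GABLAM parsing).

def classify_hit(gablam_id, qlen, homsim=40.0, idsim=60.0, same_species=True, usespec=True):
    '''Return 'ID', 'Hom' or None for a hit with gablam_id identical positions over qlen query residues.'''
    qvh = 100.0 * gablam_id / qlen                      # identities as a percentage of query length
    if qvh < homsim:
        return None                                     # too dissimilar: dropped from the alignment
    if (same_species or not usespec) and qvh >= idsim:
        return 'ID'                                     # treated as an identity-level mapping
    return 'Hom'                                        # kept for homology-based site transfer

print(classify_hit(450, 500))   # => 'ID'  (90% of query)
print(classify_hit(210, 500))   # => 'Hom' (42%)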
Ejemplo n.º 41
0
    def picsi(self):    ### Cleans up cross-species search results
        '''Cleans up cross-species search results.'''
        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            datafile = self.info['SumFile']
            delimit = rje.delimitFromExt(filename=self.info['SumFile'])
            data = {}       # search:{hit:{???}}
            pep2prot = {}   # search:{peptide:[hits]}
            id2prot = {}    # search:{id:hit}
            prot2desc = {}
            fullpeplist = {}    
            pepcon = {}     # Convert pep:longer pep
            speclist = []   # List of species codes
            ### ~ [1] Read Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            indata = rje.dataDict(self,datafile,['search','prot_hit_num'],'All',lists=True)
            for ikey in rje.sortKeys(indata):
                (search,id) = string.split(ikey,delimit)
                prot = indata[ikey]['prot_acc'][0]
                desc = string.replace(indata[ikey]['prot_desc'][0],'Full=','')
                if desc[3:7] == 'Name': desc = desc[9:]
                prot2desc[prot] = desc; self.printLog('#DESC','%s = %s' % (prot,desc))
                indata[ikey]['pep_seq'] = string.join(indata[ikey]['pep_seq'],'|')
                pepconv = string.replace(indata[ikey]['pep_seq'],'I','L')
                pepconv = string.replace(pepconv,'Q','K')
                peplist = rje.sortUnique(string.split(pepconv,'|'))
                indata[ikey]['pep_seq'] = string.join(rje.sortUnique(string.split(indata[ikey]['pep_seq'],'|')),'|')
                if search not in data:
                    data[search] = {}
                    pep2prot[search] = {}
                    id2prot[search] = {}
                    fullpeplist[search] = []
                    pepcon[search] = {}
                fullpeplist[search] += peplist
                id2prot[search][id] = prot
                spec = string.split(prot,'_')[1]
                if spec not in speclist: speclist.append(spec)
                data[search][prot] = {'search':search,'pepcount':len(peplist),'hit':id,'desc':desc,'spec':spec,
                                      'pep_uniq':0,'peplist':indata[ikey]['pep_seq'],'conpep':peplist[0:],
                                      'pep_rem':0}
                try: data[search][prot]['accnum'] = self.dict['Acc2Seq'][prot].info['AccNum']
                except: data[search][prot]['accnum'] = string.split(prot,'__')[-1]
                for pep in peplist:
                    if pep not in pep2prot[search]:
                        pep2prot[search][pep] = []
                    pep2prot[search][pep].append(prot)
            ## ~ [1a] Convert peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            for search in fullpeplist:
                fullpeplist[search] = rje.sortUnique(fullpeplist[search])
                for pep in fullpeplist[search][0:]:
                    for pep2 in fullpeplist[search]:
                        if pep != pep2 and pep in pep2:
                            pepcon[search][pep] = pep2
                            fullpeplist[search].remove(pep)
                            break
                for pep in pepcon[search]:
                    while pepcon[search][pep] in pepcon[search]: pepcon[search][pep] = pepcon[search][pepcon[search][pep]]
                self.printLog('#PEP','%s %s peptide conversions' % (len(pepcon[search]),search))
                #self.deBug(pepcon[search])
                #self.deBug(rje.sortKeys(pep2prot[search]))
                pp = 0; pm = 0
                for prot in data[search]:
                    for pep in data[search][prot]['conpep'][0:]:
                        if pep in pepcon[search]:
                            newpep = pepcon[search][pep]
                            if newpep not in data[search][prot]['conpep']: data[search][prot]['conpep'].append(newpep); pp += 1
                            data[search][prot]['conpep'].remove(pep); pm += 1
                            if prot not in pep2prot[search][newpep]: pep2prot[search][newpep].append(prot)
                            if pep in pep2prot[search]: pep2prot[search].pop(pep)
                    data[search][prot]['pep_con'] = len(data[search][prot]['conpep'])
                self.printLog('#PEP','%s %s converted peptides added; %s removed' % (pp,search,pm))
            ### ~ [2] Calculate Unique/Redundancy status ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for search in pep2prot:
            ## ~ [2a] Species Redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                remx = 0
                for prot in data[search]:
                    if data[search][prot]['spec'] != self.info['QrySpec']: continue
                    for pep in data[search][prot]['conpep']:
                        for prot2 in pep2prot[search][pep][0:]:
                            if data[search][prot2]['spec'] == self.info['QrySpec']: continue
                            pep2prot[search][pep].remove(prot2)
                            data[search][prot2]['conpep'].remove(pep)
                            data[search][prot2]['pep_rem'] += 1; remx += 1
                self.printLog('#REM','%s %s peptides removed from non-%s hits' % (rje.integerString(remx),search,self.info['QrySpec']))
            ## ~ [2b] One-hit wonders ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                for prot in data[search]:
                    if len(data[search][prot]['conpep']) < 2:
                        for pep in data[search][prot]['conpep']:
                            #if pep in pep2prot[search] and prot in pep2prot[search][pep]:
                            pep2prot[search][pep].remove(prot)
            ## ~ [2c] Unique peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                ux = 0
                for pep in pep2prot[search]:
                    #self.deBug(pep)
                    if len(pep2prot[search][pep]) == 1: data[search][pep2prot[search][pep][0]]['pep_uniq'] += 1; ux += 1
                self.printLog('#UNIQ','%s unique %s peptides' % (rje.integerString(ux),search))
            ## ~ [2d] Total Redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                summary = {'HITS':len(data[search]),'REJECT':0,'UNIQUE':0,'NR':0,'REDUNDANT':0}
                rx = 0
                for prot in data[search]:
                    #if data[search][prot]['unique']: data[search][prot]['red'] = False; continue
                    data[search][prot]['pep_red'] = 0   # Redundant peptides found in proteins with unique peptides
                    data[search][prot]['pep_nr'] = 0    # Redundant peptides found only in proteins without unique peptides
                    for pep in data[search][prot]['conpep']:
                        if pep2prot[search][pep] == [prot]: continue
                        upep = False
                        for prot2 in pep2prot[search][pep]:
                            if data[search][prot2]['pep_uniq']: upep = True; break
                        if upep: data[search][prot]['pep_red'] += 1     # Redundant peptide found in unique protein
                        else: data[search][prot]['pep_nr'] += 1         # Redundant peptide NOT found in unique protein
                    if len(data[search][prot]['conpep']) < 2: data[search][prot]['class'] = 'REJECT'; rx += 1
                    elif data[search][prot]['pep_uniq']: data[search][prot]['class'] = 'UNIQUE'
                    elif data[search][prot]['pep_nr']: data[search][prot]['class'] = 'NR'
                    else: data[search][prot]['class'] = 'REDUNDANT'; rx += 1
                    summary[data[search][prot]['class']] += 1
                self.printLog('#REJ','%s rejected %s hits' % (rje.integerString(rx),search))
                for x in rje.sortKeys(summary): self.printLog('#%s' % search,'%s %s' % (summary[x],x))

            ### ~ [3] Species ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            speclist.sort()
            species = {}
            for spec in speclist:
                try:
                    grep = os.popen('grep %s %s' % (spec,self.info['SpecTDT'])).read()
                    species[spec] = string.split(grep,':')[-4]
                    self.printLog('#SPEC','%s = %s' % (spec,species[spec]))
                except: species[spec] = '?'

            ### ~ [END] Output data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            outfile = '%s.clean.tdt' % rje.baseFile(self.info['SumFile'])
            headers = ['search','hit','class','accnum','spec','species','desc','pepcount','pep_con','pep_rem','pep_uniq','pep_nr','pep_red','peplist','conpep']
            if self.dict['Acc2Seq']: headers.insert(3,'cluster')
            rje.delimitedFileOutput(self,outfile,headers,datadict={},rje_backup=True)
            for search in rje.sortKeys(data):
                if self.dict['Acc2Seq']: self.clusterGoodSeq(search,data[search])
                for prot in rje.sortKeys(data[search]):
                    if rje.matchExp('^gi:(\d+).+\[(\S.+\S)\]$',data[search][prot]['desc']):
                        data[search][prot]['species'] = rje.matchExp('^gi:(\d+).+\[(\S.+\S)\]$',data[search][prot]['desc'])[1]
                    else: data[search][prot]['species'] = species[data[search][prot]['spec']]                                                                               
                    rje.delimitedFileOutput(self,outfile,headers,datadict=data[search][prot])
                                
        except: self.errorLog('Errg')
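
The peptide "conversion" in [1a] treats I/L and Q/K as indistinguishable and collapses any peptide that is a substring of a longer one, following chains so every peptide maps to its final representative. A self-contained sketch of that collapse, with made-up peptides:

def collapse_peptides(peps):
    '''Map each peptide to the longest equivalent peptide containing it (I=L, Q=K).'''
    norm = lambda p: p.replace('I', 'L').replace('Q', 'K')
    peps = sorted(set(norm(p) for p in peps), key=len)
    conv = {}
    for i, pep in enumerate(peps):
        for longer in peps[i + 1:]:
            if pep in longer:
                conv[pep] = longer          # pep is subsumed by a longer peptide
                break
    for pep in conv:                        # follow chains to the final representative
        while conv[pep] in conv:
            conv[pep] = conv[conv[pep]]
    return conv

print(collapse_peptides(['PEPTIDE', 'EPT', 'PEPTLDEK']))
# => {'EPT': 'PEPTLDEK', 'PEPTLDE': 'PEPTLDEK'}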
Ejemplo n.º 42
0
 def blast2fas(self):  ### Executes BLAST2FAS and copies results files
     '''Executes BLAST2FAS and copies results files.'''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         need2blast = self.opt['Force']
         null_file = '%s.blast2fas_null.txt' % self.baseFile()
         nx = 0
         null_list = []
         if os.path.exists(null_file):
             null_list = string.split(open(null_file, 'r').read(), '\n')
         self.debug(null_file)
         for seq in self.seqs():
             if seq.info['AccNum'] in null_list:
                 nx += 1
                 continue
             hfile = rje.makePath('%s%s.fas' %
                                  (self.info['HaqDir'], seq.info['AccNum']),
                                  wholepath=True)
             for db in self.obj['SeqList'].list['Blast2Fas']:
                 self.debug(rje.isYounger(hfile, db))
                 self.debug(rje.isYounger(hfile, db) == hfile)
                 need2blast = need2blast or not rje.isYounger(hfile,
                                                              db) == hfile
         if not need2blast:
             self.printLog(
                 '#BLAST',
                 'All HAQESAC input files found (%s w/o BLAST hits) - no BLAST2Fas (force=F)'
                 % nx)
             return False
         ### ~ [2] Execute ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         rje.backup(self, null_file)
         nx = 0
         if self.getInt('MultiCut'):
             self.obj['SeqList'].cmd_list += [
                 'blastb=%d' % self.getInt('MultiCut'),
                 'blastv=%d' % self.getInt('MultiCut')
             ]
         elif self.getInt('BlastCut'):
             self.obj['SeqList'].cmd_list += [
                 'blastb=%d' % self.getInt('BlastCut'),
                 'blastv=%d' % self.getInt('BlastCut')
             ]
         if self.getInt('Forks'):
             self.obj['SeqList'].cmd_list += [
                 'blasta=%d' % self.getInt('Forks')
             ]
         rje_seq.Blast2Fas(self.obj['SeqList'], self.getStr('HAQBLASTDir'))
         for seq in self.seqs():
             sbfile = '%s%s.blast.fas' % (self.getStr('HAQBLASTDir'),
                                          seq.info['AccNum'])
             if os.path.exists(sbfile):
                 hfile = rje.makePath(
                     '%s%s.fas' % (self.info['HaqDir'], seq.info['AccNum']),
                     wholepath=True)
                 os.rename(sbfile, hfile)
                 if os.path.exists('%s.pickle' % rje.baseFile(hfile)):
                     os.unlink('%s.pickle' % rje.baseFile(hfile))
                 if os.path.exists('%s.pickle.gz' % rje.baseFile(hfile)):
                     os.unlink('%s.pickle.gz' % rje.baseFile(hfile))
             else:
                 open(null_file, 'a').write('%s\n' % seq.info['AccNum'])
                 nx += 1
         if nx:
             self.printLog(
                 '#BLAST',
                 '%s Accession Numbers without BLAST2Fas hits output to %s'
                 % (nx, null_file))
         self.printLog(
             '#BLAST', '%s HAQESAC input files made using BLAST2Fas' %
             (self.seqNum() - nx))
         return True
     except:
         self.errorLog('Major problem with MultiHAQ.blast2fas')
         raise
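
The need2blast test is a freshness check: BLAST2Fas is skipped only if every HAQESAC input file already exists and is newer than all of the search databases (rje.isYounger performs that comparison inside the library). A rough equivalent with plain os.path calls; the file names in the comment are placeholders.

import os

def results_are_fresh(result_file, input_files):
    '''True if result_file exists and is newer than all of its inputs.'''
    if not os.path.exists(result_file):
        return False
    rtime = os.path.getmtime(result_file)
    return all(os.path.getmtime(f) < rtime for f in input_files)

# need2blast = force or not results_are_fresh('P12345.fas', ['db1.fas', 'db2.fas'])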
Ejemplo n.º 43
0
    def hmmTable(self,
                 outfile='',
                 append=False,
                 delimit=None):  ### Outputs results table
        '''
        Outputs results table.
        >> outfile:str = Name of output file
        >> append:boolean = whether to append file
        >> delimit:str = Delimiter to use [\t]
        '''
        try:
            ### Setup ###
            if not outfile: outfile = self.info['HMMTab']
            if outfile.lower() == 'none':
                self.log.printLog('#TAB', 'HMMTab = "None": No table output')
                return False
            if not delimit: delimit = rje.getDelimit(self.cmd_list, '\t')
            if not outfile:
                outfile = '%s.hmmer.%s' % (rje.baseFile(
                    self.info['SearchDB'], True), rje.delimitExt(delimit))
            self.readResults()
            self.log.printLog('#TAB',
                              'Tabulating results for %s searches into %s' %
                              (len(self.search), outfile),
                              log=False)

            ### Setup Resfile ###
            if self.opt['MySQL']:
                headers = [
                    'HMM', 'Hit', 'Hit_Start', 'Hit_End', 'Eval', 'Score'
                ]
            else:
                headers = ['Type', 'Name', 'Start', 'End', 'Eval', 'Score']
            if not append or not os.path.exists(outfile):
                rje.delimitedFileOutput(self,
                                        outfile,
                                        headers,
                                        delimit,
                                        rje_backup=True)

            ### Output Search details ###
            for search in self.search:
                for hit in search.hit:
                    for aln in hit.aln:
                        out = {
                            'HMM': search.info['Name'],
                            'Type': search.info['Name'],
                            'Name': hit.info['Name'],
                            'Hit': hit.info['Name'],
                            'Start': '%d' % aln.stat['SbjStart'],
                            'End': '%d' % aln.stat['SbjEnd'],
                            'Hit_Start': '%d' % aln.stat['SbjStart'],
                            'Hit_End': '%d' % aln.stat['SbjEnd'],
                            'Eval': '%.2e' % aln.stat['Expect'],
                            'Score': '%.1f' % aln.stat['BitScore']
                        }
                        rje.delimitedFileOutput(self, outfile, headers,
                                                delimit, out)
            self.log.printLog(
                '#OUT', 'Results for %s searches output to %s.' %
                (len(self.search), outfile))
        except:
            self.log.errorLog('Fatal Error during hmmTable(%s).' % outfile)
            raise
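
rje.delimitedFileOutput is doing ordinary delimited-table writing here: one header row, then one dictionary per HMM-hit alignment with unused keys ignored and missing keys left blank. A rough stand-alone equivalent using only the csv module; the output name and row values are invented.

import csv

headers = ['Type', 'Name', 'Start', 'End', 'Eval', 'Score']
with open('example.hmmer.tdt', 'w') as out:             # illustrative file name
    writer = csv.DictWriter(out, fieldnames=headers, delimiter='\t',
                            restval='', extrasaction='ignore')
    writer.writeheader()
    writer.writerow({'Type': 'PF00069', 'Name': 'MYSEQ_HUMAN',
                     'Start': '12', 'End': '290',
                     'Eval': '1.20e-45', 'Score': '153.2'})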
Ejemplo n.º 44
0
 def hmmSearch(
         self,
         hmm,
         dbase=None,
         outfile=None,
         wait=True):  ### Performs HMMer Search using object attributes
     '''
     Performs HMMer Search using object attributes.
     >> hmm:str = Name of HMM file 
     >> dbase:str = Name of DBase file [self.info['SearchDB']]
     >> outfile:str = Name of Output file file [self.info['HMMOut']]
     >> wait:boolean  = whether to wait for HMMer. [True]
     << returns outfile or None if fails
     '''
     try:  ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [1a] ~ Input files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if not rje.checkForFile(hmm):
             self.printLog('#ERR', 'HMM file %s is missing!' % hmm)
             return None
         if not dbase: dbase = self.info['SearchDB']
         if not rje.checkForFile(dbase):
             self.printLog('#ERR', 'Database file "%s" is missing!' % dbase)
             return None
         ## ~ [1b] ~ Output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if not outfile or outfile.lower() in [
                 '', 'none'
         ]:  # Make an outfile per search
             outfile = '%s.%s.hmmer' % (rje.baseFile(
                 hmm, True), rje.baseFile(dbase, True))
             resfile = outfile
             if not os.path.exists(
                     outfile) and self.opt['GZip'] and os.path.exists(
                         '%s.gz' % outfile) and not self.opt['Force']:
                 resfile = '%s.gz' % outfile
             if not self.opt['Force'] and rje.isYounger(
                     resfile, hmm) == resfile and rje.isYounger(
                         resfile, dbase) == resfile:
                 self.printLog('#HMM',
                               'HMM results file "%s" exists.' % resfile)
                 return outfile  # Already exists
             else:
                 rje.backup(self, outfile, unlink=True)
         ### ~ [2] ~ HMM Search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self.opt['HMMPFam']:
             _command = 'hmmpfam --cut_ga %s %s %s > %s' % (string.join(
                 self.list['HMMOptions']), hmm, dbase, outfile)
         else:
             _command = 'hmmsearch %s %s %s > %s' % (string.join(
                 self.list['HMMOptions']), hmm, dbase, outfile)
         self.log.printLog('#HMM', _command)
         if not wait: os.system(self.info['HMMerPath'] + _command + ' &')
         elif not os.path.exists(outfile) or self.opt['Force']:
             open(outfile, 'a').write(
                 os.popen(self.info['HMMerPath'] + _command).read())
         self.printLog('#HMM',
                       'Outfile produced for %s: %s.' % (hmm, outfile))
         if self.opt['GZip']:
             rje.backup(self, '%s.gz' % outfile, unlink=True)
             os.system('gzip %s' % outfile)
             self.printLog('#GZIP', '%s gzipped to save space' % outfile)
         return outfile
     except:
         self.log.errorLog('Fatal Error during hmmSearch(%s)' % hmm)
         return None
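
The search itself is an external HMMER call with any extra options spliced into the command line and stdout redirected to the results file. A minimal sketch with subprocess, assuming hmmsearch is on the path; the file names in the comment are placeholders.

import subprocess

def run_hmmsearch(hmm, dbase, outfile, options=()):
    '''Run hmmsearch and write its stdout to outfile; raise on a non-zero exit.'''
    cmd = ['hmmsearch'] + list(options) + [hmm, dbase]
    with open(outfile, 'w') as out:
        subprocess.check_call(cmd, stdout=out)
    return outfile

# run_hmmsearch('pfam.hmm', 'proteome.fas', 'proteome.hmmer', ['--cut_ga'])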
Ejemplo n.º 45
0
 def uniFake(
     self,
     seqs=[],
     store=False
 ):  ### Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs.
     '''Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs given.'''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         unifake = string.split(string.join(self.list['UniFake']).lower())
         seqlist = self.obj['SeqList']
         if seqs: seqlist.seq = seqs
         else: seqs = seqlist.seq
         (sx, seqnum) = (0, seqlist.seqNum())
         ## ~ [1b] Setup UniProt object and output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         uniprot = rje_uniprot.UniProt(
             self.log, self.cmd_list)  # UniProt object for saving data
         if self.info['DatOut'].lower() in ['', 'none']:
             self.info['DatOut'] = rje.baseFile(
                 seqlist.info['Name']) + '.dat'
         datfile = self.info['DatOut']
         if os.path.exists(datfile): rje.backup(self, datfile)
         if store: seqlist.obj['UniProt'] = uniprot
         ## ~ [1c] Setup RJE_HMM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if 'pfam' in unifake:
             hmm = rje_hmm.HMMRun(self.log, self.cmd_list + ['force=T'])
             hmmfile = '%s.pfam.tdt' % rje.baseFile(datfile)
             if os.path.exists(hmmfile): rje.backup(self, hmmfile)
             hmm.list['HMM'] = [self.info['PFam']]
             hmm.opt['HMMPFam'] = True
         else:
             hmm = None
         ## ~ [1d] Setup RJE_TM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if 'signalp' in unifake: tm = rje_tm.TM(self.log, self.cmd_list)
         else: tm = None
         ### ~ [2] ~ Perform UniFake processing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for seq in seqs:
             sx += 1
             name = seq.shortName()
             self.printLog(
                 '#SEQ', 'Processing %s (%s aa) %s...' %
                 (seq.shortName(), rje.integerString(
                     seq.aaLen()), seq.info['Description'][:50]))
             try:
                 ## ~ [2a] ~ Basic data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 utmp = 'tmp%s.%s' % (rje.randomString(5),
                                      seq.info['AccNum'])
                 open('%s.fas' % utmp, 'w').write(
                     '>%s\n%s\n' % (seq.shortName(), seq.info['Sequence']))
                 udata = {
                     'CC': ['-!- Features generated using unifake.py'],
                     'AC': []
                 }
                 if seq.info['SpecCode'] in ['Unknown', 'UNK']:
                     seq.info['SpecCode'] = self.info['SPCode']
                 #x#elif seq.info['Species'] != 'None': udata['OS'] = [seq.info['Species']]     #!# Check how well this works. Add spectable? #!#
                 ## ~ [2b] ~ Aliases ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if self.opt['EnsDat'] and rje.matchExp(
                         '\[acc:(\S+) pep:(\S+) gene:(\S+)\]',
                         seq.info['Name']):
                     details = rje.matchExp(
                         '\[acc:(\S+) pep:(\S+) gene:(\S+)\]',
                         seq.info['Name'])
                     self.addAlias(seq.info['AccNum'], details[0])
                     self.addAlias(seq.info['AccNum'], details[1])
                     self.addAlias(seq.info['AccNum'], details[2])
                     udata['GN'] = [details[2]]
                 for id in [seq.shortName(), seq.info['AccNum']]:
                     if id in self.dict['Aliases']:
                         udata['AC'].append(
                             '%s;' %
                             string.join(self.dict['Aliases'][id], '; '))
                 ## ~ [2c] ~ Features ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 ft = []  # List of features for sequence
                 for id in [
                         seq.shortName(), seq.info['AccNum'], seq.info['ID']
                 ]:
                     if id in self.dict['Features']:
                         ft += self.dict['Features'][id]
                 ## ~ [2d] IUPRED disorder prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if 'disorder' in self.list['UniFake']:
                     try:
                         seq.disorder()
                         dis = seq.obj['Disorder']
                         for disorder in seq.obj['Disorder'].list['RegionDisorder']:
                             ft.append({'Type': 'DISORDER',
                                        'Desc': 'Predicted disorder: %s' % seq.obj['Disorder'].info['Disorder'],
                                        'Start': disorder[0], 'End': disorder[1]})
                             if dis.info['Disorder'].lower() == 'iupred':
                                 ft[-1]['Desc'] = '%s > %.2f' % (ft[-1]['Desc'], dis.stat['IUCut'])
                         for fold in seq.obj['Disorder'].list['RegionFold']:
                             ft.append({'Type': 'ORDER',
                                        'Desc': 'Predicted order: %s' % seq.obj['Disorder'].info['Disorder'],
                                        'Start': fold[0], 'End': fold[1]})
                             if dis.info['Disorder'].lower() == 'iupred':
                                 ft[-1]['Desc'] = '%s <= %.2f' % (ft[-1]['Desc'], dis.stat['IUCut'])
                     except:
                         self.log.errorLog(
                             'UniFake disorder problem for %s.' % name)
                 ## ~ [2e] PFam HMM domain prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if hmm:
                     try:
                         hmm.setInfo({
                             'SearchDB': '%s.fas' % utmp,
                             'HMMOut': '%s.hmm.out' % utmp
                         })  # This will be made for each sequence
                         hmm.search = []
                         hmm.list['HMMRes'] = [
                             hmm.hmmSearch(self.info['PFam'],
                                           outfile=hmm.info['HMMOut'])
                         ]  # Used in hmmTable
                         hmm.hmmTable(outfile=hmmfile, append=True)
                         if 'disorder' in self.list['UniFake']:
                             disorder = seq.obj['Disorder'].list[
                                 'ResidueDisorder']  # individual (IUPRed) residue results
                         else:
                             disorder = []
                         if hmm.search:
                             udata['CC'].append(
                                 'PFam: HMMer PFam search vs %s (Modified %s)'
                                 %
                                 (self.info['PFam'],
                                  time.ctime(
                                      os.path.getmtime(self.info['PFam']))))
                         else:
                             udata['CC'].append(
                                 '-!- ERROR: PFam HMMer Search failure!')
                             out = {'Type': '!ERROR!', 'Name': name}
                             rje.delimitedFileOutput(
                                 self,
                                 hmmfile, [
                                     'Type', 'Name', 'Start', 'End', 'Eval',
                                     'Score'
                                 ],
                                 datadict=out)
                         for search in hmm.search:
                             for hit in search.hit:
                                 for aln in hit.aln:
                                     pfamft = {'Type': 'PFAM',
                                               'Start': aln.stat['SbjStart'], 'End': aln.stat['SbjEnd'],
                                               'Desc': '%s PFam HMM Eval: %.2e; Score: %.1f' % (search.info['Name'], aln.stat['Expect'], aln.stat['BitScore'])}
                                     if disorder:
                                         region = disorder[aln.stat['SbjStart'] - 1:aln.stat['SbjEnd']]
                                         hmmdisorder = float(sum(region)) / len(region)
                                         pfamft['Desc'] = '%s; IUPRed: %.2f' % (pfamft['Desc'], hmmdisorder)
                                         if hmmdisorder < self.stat['DisDom']:
                                             pfamft['Type'] = 'DOMAIN'
                                     ft.append(pfamft)
                     except:
                         self.log.errorLog(
                             'UniFake PFam HMM problem for %s.' % name)
                 ## ~ [2f] TMHMM transmembrane topology prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if 'tmhmm' in unifake:
                     try:
                         tmdat = os.popen(
                             '%s %s.fas -short' %
                             (self.info['TMHMM'], utmp)).readlines()
                         domlist = rje_tm.domainList(
                             rje_tm.parseTMHMM(tmdat[0]))
                         for tmdom in domlist:
                             ft.append(tmdom)
                             ft[-1]['Desc'] = 'TMHMM topology prediction'
                             ft[-1]['Start'] = string.atoi(ft[-1]['Start'])
                             ft[-1]['End'] = string.atoi(ft[-1]['End'])
                         if len(domlist) > 1:
                             udata['CC'].append(
                                 'TMHMM: %d TM domains; N-Term %s' %
                                 ((len(domlist) - 1) / 2,
                                  domlist[0]['Type']))
                         else:
                             udata['CC'].append('TMHMM: 0 TM domains')
                     except:
                         self.log.errorLog('UniFake TMHMM problem for %s.' %
                                           name)
                 ## ~ [2g] SIGNALP signal peptide prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if 'signalp' in unifake:
                     try:
                         os.system(
                             '%s -f short -t euk %s.fas > %s.signalp' %
                             (self.info['SignalP'], utmp, utmp))
                         tm.signalp = {}
                         tm.parseSignalP('%s.signalp' % utmp)
                         sigp = tm.signalp.pop(seq.shortName())
                         cpos = 0
                         if sigp['nn_ymax?'] == 'Y':
                             cpos = string.atoi(sigp['nn_ymaxpos'])
                             desc = 'SignalP NN prediction'
                         if sigp['hmm_cmax?'] == 'Y':
                             hmm_c = string.atoi(sigp['hmm_cmaxpos'])
                             if cpos == 0:
                                 cpos = hmm_c
                                 desc = 'SignalP HMM prediction'
                             else:
                                 if hmm_c < cpos:
                                     cpos = hmm_c
                                     desc = 'SignalP HMM prediction (NN also Y)'
                                 else:
                                     desc += ' (HMM also Y)'
                         if cpos > 0:
                             ft.append({
                                 'Type': 'SIGNALP',
                                 'Desc': desc,
                                 'Start': 1,
                                 'End': cpos
                             })
                     except:
                         self.log.errorLog(
                             'UniFake SignalP problem for %s.' % name)
                 ## ~ [2h] Convert to UniProt and save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 self.addRealUniProt(seq, udata, ft)
                 self.deBug(ft)
                 if not store: uniprot.list['Entry'] = []
                 if uniprot.addFromSeq(
                         seq, data=udata,
                         ft=ft):  ### Converts into UniProtEntry object
                     if not store: uniprot.saveUniProt(datfile, append=True)
                     #x#open(self.info['DatPickup'],'a').write('%s\n' % seq.shortName())
             ## ~ [2f] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             except:
                 self.log.errorLog('Problem during UniFake(%s)' % name)
             for tmp in glob.glob('%s*' % utmp):
                 os.unlink(tmp)
             self.printLog(
                 '#UNIFAKE',
                 '|---------- %s run <<<|>>> %s to go -----------|' %
                 (rje.integerString(sx), rje.integerString(seqnum - sx)),
                 log=False)
         if store: uniprot.saveUniProt(datfile, append=False)
         if self.opt['CleanUp']:
             for tmp in glob.glob('TMHMM*'):
                 if os.path.isdir(tmp): os.rmdir(tmp)
     except:
         self.errorLog(
             'Oh, the shame of it! Trouble during UniFake.uniFake()')
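
One detail of the PFam step worth isolating: a hit is demoted from PFAM to DOMAIN when the mean per-residue disorder across the hit region falls below the disdom cut-off. A sketch of just that calculation, with invented scores and cut-off:

def domain_type(residue_disorder, start, end, disdom=0.5):
    '''Return 'PFAM' or 'DOMAIN' for a hit spanning start..end (1-based, inclusive).'''
    region = residue_disorder[start - 1:end]
    mean_disorder = float(sum(region)) / len(region)
    return 'DOMAIN' if mean_disorder < disdom else 'PFAM'

scores = [0.9, 0.8, 0.2, 0.1, 0.1, 0.3]     # per-residue IUPred-style scores (made up)
print(domain_type(scores, 3, 6))            # mean 0.175 => 'DOMAIN'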
Ejemplo n.º 46
0
 def rfAtt(self):  ### Reading frame attribute counts
     '''Counts amino acid and dipeptide frequencies for each of the six reading frames and outputs observed versus expected values to a *.rf.tdt table.'''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         rfhead = [
             'Att', 'RF1', 'RF2', 'RF3', 'RF-1', 'RF-2', 'RF-3', 'ObsRF1',
             'ObsRF2', 'ObsRF3', 'ObsRF-1', 'ObsRF-2', 'ObsRF-3', 'ExpRF1',
             'ExpRF2', 'ExpRF3', 'ExpRF-1', 'ExpRF-2', 'ExpRF-3'
         ]
         rfdata = {}
         rfobs = {}
         rfexp = {}
         ntfreq = {}
         for rf in ['RF1', 'RF2', 'RF3', 'RF-1', 'RF-2', 'RF-3']:
             rfdata[rf] = {}
             rfobs[rf] = {}
             rfexp[rf] = {}
             for x in rje_seq.alph_protx[:-1] + ['*']:
                 rfdata[rf][x] = 0
                 rfobs[rf][x] = 0
                 rfexp[rf][x] = 0
             for a1 in rje_seq.alph_protx[:-1] + ['*']:
                 for a2 in rje_seq.alph_protx[:-1] + ['*']:
                     rfdata[rf]['%s%s' % (a1, a2)] = 0
                     rfobs[rf]['%s%s' % (a1, a2)] = 0
                     rfexp[rf]['%s%s' % (a1, a2)] = 0
         for x in rje_seq.alph_dna[:-1]:
             ntfreq[x] = 0
         seqlist = self.obj['SeqList']
         ### ~ [2] Count sequence attributes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         (sx, stot) = (0.0, seqlist.seqNum())
         for seq in seqlist.seq:
             self.progLog(
                 '\r#ATT',
                 'Counting sequence attributes: %.2f%%' % (sx / stot))
             sx += 100.0
             for x in seq.info['Sequence']:
                 if x in ntfreq: ntfreq[x] += 1
             rf6 = rje_sequence.sixFrameTranslation(seq.info['Sequence'])
             for r in rf6:
                 rseq = rf6[r]
                 rf = 'RF%d' % r
                 for i in range(len(rseq)):
                     a = rseq[i]
                     dia = rseq[i:i + 2]
                     if a in rfdata[rf]: rfdata[rf][a] += 1
                     if dia in rfdata[rf]: rfdata[rf][dia] += 1
         self.printLog('\r#ATT', 'Counting sequence attributes complete.')
         ### ~ [3] Calculate Observed & Expected ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ntobs = rje.dictFreq(ntfreq, total=True, newdict=True)
         ntcomp = {'Total': ntobs['Total']}
         for xy in ['AT', 'GC']:
             ntcomp[xy[0]] = ntobs[xy[1]]
             ntcomp[xy[1]] = ntobs[xy[0]]
         for rf in ['RF1', 'RF2', 'RF3', 'RF-1', 'RF-2', 'RF-3']:
             aafreq = {}
             for a in rje_seq.alph_protx[:-1] + ['*']:
                 aafreq[a] = rfdata[rf][a]
             aafreq = rje.dictFreq(aafreq, total=True, newdict=True)
             for a in rje_seq.alph_protx[:-1] + ['*']:
                 rfobs[rf][a] = rfdata[rf][a]
                 rfexp[rf][a] = 0
             for n1 in 'GATC':
                 for n2 in 'GATC':
                     for n3 in 'GATC':
                         codon = '%s%s%s' % (n1, n2, n3)
                         aa = rje_sequence.dna2prot(codon)
                         if rf[-2] == '-':
                             rfexp[rf][aa] += (int(ntobs['Total'] / 3.0) *
                                               ntcomp[n1] * ntcomp[n2] *
                                               ntcomp[n3])
                         else:
                             rfexp[rf][aa] += (int(ntobs['Total'] / 3.0) *
                                               ntobs[n1] * ntobs[n2] *
                                               ntobs[n3])
                         #self.deBug('%s: %s x %s x %s x %s' % (aa,(ntobs['Total'] - 2), rfobs[rf][n1], rfobs[rf][n2], rfobs[rf][n3]))
                         #self.deBug('%s: %s' % (aa,rfexp[rf][aa]))
             for a1 in rje_seq.alph_protx[:-1] + ['*']:
                 for a2 in rje_seq.alph_protx[:-1] + ['*']:
                     rfexp[rf]['%s%s' %
                               (a1, a2)] = (aafreq['Total'] -
                                            1) * aafreq[a1] * aafreq[a2]
                     rfobs[rf]['%s%s' % (a1, a2)] = rfdata[rf]['%s%s' %
                                                               (a1, a2)]
         ### ~ [4] Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         rfile = rje.baseFile(seqlist.info['Name']) + '.rf.tdt'
         rje.delimitedFileOutput(self, rfile, rfhead, rje_backup=True)
         for a in rje_seq.alph_protx[:-1] + ['*']:
             data = {'Att': a}
             for rf in ['RF1', 'RF2', 'RF3', 'RF-1', 'RF-2', 'RF-3']:
                 data['Obs%s' % rf] = rfobs[rf][a]
                 data['Exp%s' % rf] = '%.2f' % rfexp[rf][a]
                 data[rf] = rje.expectString(rfobs[rf][a] / rfexp[rf][a])
             rje.delimitedFileOutput(self, rfile, rfhead, datadict=data)
         for a1 in rje_seq.alph_protx[:-1] + ['*']:
             for a2 in rje_seq.alph_protx[:-1] + ['*']:
                 a = '%s%s' % (a1, a2)
                 data = {'Att': a}
                 for rf in ['RF1', 'RF2', 'RF3', 'RF-1', 'RF-2', 'RF-3']:
                     data['Obs%s' % rf] = rfobs[rf][a]
                     data['Exp%s' % rf] = '%.2f' % rfexp[rf][a]
                     data[rf] = rje.expectString(rfobs[rf][a] /
                                                 rfexp[rf][a])
                 rje.delimitedFileOutput(self, rfile, rfhead, datadict=data)
         self.printLog('#TDT', 'TDT output complete.')
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise  # Delete this if method error not terrible
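
The dipeptide expectation used above is simply (N - 1) * f(a1) * f(a2), where N is the residue total for that reading frame and f() the single-residue frequencies. A small sketch with made-up counts:

def expected_dipeptides(aa_counts):
    '''Expected dipeptide counts from single amino-acid counts: (N-1) * f(a1) * f(a2).'''
    total = float(sum(aa_counts.values()))
    freq = dict((a, n / total) for a, n in aa_counts.items())
    exp = {}
    for a1 in aa_counts:
        for a2 in aa_counts:
            exp[a1 + a2] = (total - 1) * freq[a1] * freq[a2]
    return exp

print(expected_dipeptides({'A': 60, 'L': 30, 'K': 10})['AL'])   # (100-1) * 0.6 * 0.3 = 17.82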
Ejemplo n.º 47
0
 def setup(self):    ### Main class setup method.
     '''Main class setup method.'''
     try:### ~ [1] Setup SeqList ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.obj['SeqList'] = rje_seq.SeqList(self.log,['keepblast=T']+self.cmd_list+['autofilter=F','align=F','haqbat=None'])
         self.obj['SeqList']._checkForDup(True)
         if not self.seqNum(): self.errorLog('No sequences loaded!',printerror=False); return False
         if self.opt['AddQueries'] and self.name() not in self.obj['SeqList'].list['Blast2Fas']: self.obj['SeqList'].list['Blast2Fas'].append(self.name())
         ### ~ [2] Setup Results Directory ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self.info['HaqDir'].lower() in ['','none']: self.info['HaqDir'] = '%s_HAQESAC/' % rje.baseFile(self.name(), strip_path=True)
         rje.mkDir(self,self.info['HaqDir'])
         return True     # Setup successful
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
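
The HaqDir default is just the input name with path and extension stripped plus a '_HAQESAC/' suffix, after which the directory is created. An equivalent of that name-mangling with plain os.path calls; the input path is hypothetical.

import os

def default_haq_dir(seqfile):
    '''Derive a results directory name from the input sequence file.'''
    base = os.path.splitext(os.path.basename(seqfile))[0]
    return '%s_HAQESAC/' % base

d = default_haq_dir('data/myqueries.fas')   # => 'myqueries_HAQESAC/'
if not os.path.exists(d):
    os.makedirs(d)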
Ejemplo n.º 48
0
	def slimDisc(self):	### Runs SLiMDisc on batch of files
		'''Runs SLiMDisc on batch of files.'''
		try:
			### Setup ###
			if self.stat['MinSup'] > self.stat['SlimSupport'] and self.stat['SlimSupport'] > 1:
				self.stat['MinSup'] = self.stat['SlimSupport']
			if self.stat['MaxSup'] > 0  and self.stat['MaxSup'] < self.stat['SlimSupport'] and self.stat['SlimSupport'] > 1:
				self.stat['MaxSup'] = self.stat['SlimSupport']
			### Make File List ##
			_stage = 'Make File List'
			if self.info['SeqIn'].lower() not in ['','none']:
				if os.path.exists(self.info['SeqIn']):
					gfiles = [self.info['SeqIn']]
				else:
					self.log.errorLog('"seqin" file "%s" not found! No SLiMDisc analysis.' % self.info['SeqIn'],printerror=False)
					return False
			else:
				gfiles = rje.getFileList(callobj=self,filelist=self.list['SlimFiles'],subfolders=False,summary=False)
			self.log.printLog('#FILES','%s files identified for SLiMDisc analysis.' % rje.integerString(len(gfiles)))
			## Sort by size and filter by MinSup and MaxSup ###
			datasize = {}   # Dictionary for crude sorting of files by total AA content
			seqnum = {}		# Number of sequences in each file
			qry = {}		# Query sequence name (if any) for file
			tmpseq = rje_seq.SeqList(self.log,self.cmd_list+['seqin=None','autofilter=F'])
			gx = 0
			while gx < len(gfiles):
				seqfilename = gfiles[gx]
				gx += 1
				seqfile = seqfilename[0:]
				tmpseq.seq = []
				tmpseq.loadSeqs(seqfile)
				## *** Special RemHub process *** ##
				checkhub = True
				for hubtype in ['rem','kept','no']:
					if seqfile.find('-%shub.fas' % hubtype) > 0:
						checkhub = False
				if self.stat['RemHub'] > 0.0 and checkhub:
					if rje.matchExp('(\S+)_PPI',seqfile):
						hub_acc = rje.matchExp('(\S+)_PPI',rje.baseFile(seqfile,strip_path=True))[0]
					else:
						hub_acc = rje.baseFile(seqfile,strip_path=True)
					hub_base = rje.matchExp('(\S+)%s' % hub_acc,seqfilename)[0]
					basefile = seqfile
					while rje.baseFile(basefile) != basefile:
						basefile = rje.baseFile(basefile)
					if tmpseq.querySeq(query=hub_acc):     ### Sets Hub as Query Sequence
						self.log.printLog('#HUB','Removing hub protein %s and >=%.1f%% ID from PPI dataset %s.' % (hub_acc,self.stat['RemHub'],seqfile))
						tmpseq.makeNR(text='Hub protein homologues',nrid=self.stat['RemHub'],blast=tmpseq.seqNum(),nrsim=0,nr_qry=tmpseq.obj['QuerySeq'])
						tmpseq.removeSeq(text='PPI Hub Protein (self-interactor)',seq=tmpseq.obj['QuerySeq'])
						tmpseq.obj['QuerySeq'] = None
						seqfile = '%s-remhub.fas' % basefile
						tmpseq.saveFasta(seqfile=seqfile)	### Saves sequences in fasta format
						keptfile = '%s-kepthub.fas' % basefile
						os.rename(seqfilename,keptfile)
						gfiles.append(keptfile)
					else:
						seqfile = '%s-nohub.fas' % basefile
						os.rename(seqfilename,seqfile)
						self.log.printLog('#HUB','Hub protein %s not in PPI dataset %s => %s.' % (hub_acc,seqfilename,seqfile))
						#X#print tmpseq.obj['QuerySeq']
				## Support Range ###				
				if tmpseq.seqNum() < self.stat['MinSup'] or (self.stat['MaxSup'] > 0 and tmpseq.seqNum() > self.stat['MaxSup']):
					self.log.printLog('#REJ','%s rejected: %s sequences = outside acceptable range of %d-%d.' % (seqfile,rje.integerString(tmpseq.seqNum()),self.stat['MinSup'],self.stat['MaxSup']))
					continue
				aasize = tmpseq.aaCount()
				self.log.printLog('#AA','%s = %s aa.' % (seqfile,rje.integerString(aasize)))
				while datasize.has_key(aasize):
					aasize += 1
				datasize[aasize] = seqfile
				seqnum[seqfile] = tmpseq.seqNum()
				## Query ##
				qry[seqfile] = None
				if self.opt['SlimQuery']:
					if rje.matchExp('qry_(\S+)\.',seqfilename):
						if tmpseq.querySeq(query=rje.matchExp('qry_(\S+)\.',seqfilename)[0]):     ### Sets Query Sequence if appropriate
							qry[seqfile] = tmpseq.obj['QuerySeq'].shortName()
			self.log.printLog('#INF','%s Datasets to process.' % rje.integerString(len(seqnum)))

			### Batch Output Mode ###
			batchout = None
			if self.info['BatchOut'].lower() not in ['','none']:
				batchout = self.info['BatchOut']
				if not self.opt['Append'] and os.path.exists(batchout):
					rje.backup(self,batchout)

			### Work through Files ###
			_stage = 'Work through files'
			for key in rje.sortKeys(datasize,revsort=self.opt['BigFirst']):
				seqfile = datasize[key]
				basefile = seqfile
				while rje.baseFile(basefile) != basefile:
					basefile = rje.baseFile(basefile)
				base = rje.baseFile(basefile,True)
				self.log.printLog('#DAT',seqfile,timeout=False)
				if not self.opt['UseRes']:
					slim_cmd = '-BT -TT'
				else:
					## Detect old files ##
					_stage = 'Detect old files'
					old_rank = '%s/%s.rank' % (basefile,base)
					self.log.printLog('#RES','Existing SLiMDisc Output?: %s' % (os.path.exists(old_rank)))
					old_b_list = glob.glob('%s/results/*.blastp' % basefile)
					old_t_file = '%s/%s.fasta.out' % (basefile,base)
					self.log.printLog('#RES','Existing TEIRESIAS Output?: %s' % (os.path.exists(old_t_file)))
					self.log.printLog('#RES','%s of %s BLAST files detected.' % (rje.integerString(len(old_b_list)),rje.integerString(seqnum[seqfile])))
					## TEIRESIAS ##
					if (os.path.exists(old_rank) or len(old_b_list) > 0) and os.path.exists(old_t_file):  # BLAST started: TEIRESIAS finished!
						slim_cmd = '-TF'
					else:
						slim_cmd = '-TT'
					## BLAST ##
					if len(old_b_list) != seqnum[seqfile]:	# Need BLAST
						slim_cmd += ' -BT'
					else:
						slim_cmd += ' -BF'
				## Query ##
				if self.opt['SlimQuery'] and qry[seqfile]:
					slim_cmd += ' -q %s' % qry[seqfile]
				## Ranks ##
				slim_cmd += ' -n %d' % self.stat['SlimRanks']
				## Support ##
				if self.stat['SlimSupport'] > 0 and self.stat['SlimSupport'] < 1:
					slim_cmd += ' -S %.1f' % self.stat['SlimSupport']
				elif self.stat['SlimSupport'] > 0:
					slim_cmd += ' -S %d' % self.stat['SlimSupport']
				## WallTime ##
				slim_cmd += ' -W %d' % self.stat['SlimWall']
				## MemSaver ##
				if self.opt['MemSaver']:
					slim_cmd += ' -X T'
				else:
					slim_cmd += ' -X F'
				## SlimOpt ##
				if self.info['SlimOpt']:
					slim_cmd += ' %s' % self.info['SlimOpt']
				## Perform SLiMDisc Run ##
				_stage = 'Perform SLiMDisc Run (%s)' % (seqfile)
				if batchout:
					BATCH = open(batchout,'a')
					BATCH.write('%s -i %s -Q0 %s\n' % (self.info['SlimCall'],seqfile,slim_cmd))
					BATCH.close()
				else:
					if self.stat['Verbose'] > 0:
						syscmd = 'python /home/richard/Python_Modules/slimdisc_V%s.py -i %s -Q2 %s' % (self.info['SlimVersion'],seqfile,slim_cmd)
					else:
						syscmd = 'python /home/richard/Python_Modules/slimdisc_V%s.py -i %s -Q0 %s' % (self.info['SlimVersion'],seqfile,slim_cmd)
					self.log.printLog('#SYS',syscmd)
					os.system(syscmd)
				if not batchout:
					new_rank = '%s/%s.rank' % (basefile,base)
					self.log.printLog('#RES','New rank result %s produced?: %s' % (new_rank,os.path.exists(new_rank)))

		except:
			self.log.errorLog('rje_pattern_discovery banjaxed in slimDisc() %s' % _stage,quitchoice=True)
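
Most of the main loop is spent assembling slim_cmd flag by flag before shelling out. A condensed sketch of that pattern, collapsing the old-results detection into a single use_res switch; the flag letters mirror those used above, but the values passed in the example call are invented.

def build_slim_cmd(use_res=False, query=None, ranks=100, support=0, walltime=60,
                   memsaver=False, extra=''):
    '''Assemble a SLiMDisc option string flag by flag, as in the loop above.'''
    cmd = ['-TF -BF'] if use_res else ['-BT -TT']   # reuse or regenerate TEIRESIAS/BLAST results
    if query:
        cmd.append('-q %s' % query)                 # query sequence
    cmd.append('-n %d' % ranks)                     # number of ranked motifs
    if 0 < support < 1:
        cmd.append('-S %.1f' % support)             # proportional support
    elif support > 0:
        cmd.append('-S %d' % support)               # absolute support
    cmd.append('-W %d' % walltime)                  # walltime limit
    cmd.append('-X %s' % ('T' if memsaver else 'F'))
    if extra:
        cmd.append(extra)                           # any additional SlimOpt string
    return ' '.join(cmd)

print(build_slim_cmd(query='MYPROT_HUMAN', support=0.8, memsaver=True))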
Ejemplo n.º 49
0
 def setup(self):    ### Main class setup method.
     '''Main class setup method.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not self.getStrLC('Basefile'): self.basefile(rje.baseFile(self.getStr('SeqIn')))
         return True     # Setup successful
     except: self.errorLog('Problem during %s setup.' % self.prog()); return False  # Setup failed
Ejemplo n.º 50
0
	def run(self):		### Main Run method
		'''
		Main Run method.
		'''
		try:
			### SLiMDisc Run ###
			if self.opt['SLiMDisc']:
				return self.slimDisc()
			
			### TEIRESIAS ###
			if self.opt['Teiresias']:
				## Setup ##
				seqlist = rje_seq.SeqList(self.log,self.cmd_list)
				infile = '%s.teiresias.fas' % rje.baseFile(seqlist.info['Name'],True)
				outfile = '%s.teiresias.out' % rje.baseFile(seqlist.info['Name'],True)
				run_teiresias = True
				if rje.isYounger(outfile,infile) == outfile:
					if self.stat['Interactive'] < 1 or not rje.yesNo('%s and %s exist already. Regenerate?' % (infile,outfile),'N'):
						run_teiresias = False
				## Run TEIRESIAS ##
				if run_teiresias:
					seqlist.saveFasta(seqfile=infile,name='Teiresias')	### Saves sequences in fasta format
					command = rje.makePath(self.info['TeiresiasPath'],True)
					command += ' -i%s -o%s %s' % (infile,outfile,self.info['TeiresiasOpt'])
					self.log.printLog('#CMD',command)
					os.system(command)
				## Read Results ##
				self.verbose(0,2,'Reading TEIRESIAS output from %s...' % outfile,1)
				self.list['Pattern'] = []
				RESULTS = open(outfile,'r')
				line = RESULTS.readline()
				while line:
					if rje.matchExp('^(\d+)\s+(\d+)\s+(\S+)\s+(\d.+\d)$',line): # New pattern
						self.addTeiresiasPattern(rje.matchExp('^(\d+)\s+(\d+)\s+(\S+)\s+(\d.+\d)$',line))
					elif len(line) > 3 and line[0] != '#':
						self.log.errorLog('Did not recognise line: %s' % line,False,False)
					line = RESULTS.readline()
				RESULTS.close()
				patx = len(self.list['Pattern'])
				self.log.printLog('#PAT','%s TEIRESIAS patterns read from %s.' % (rje.integerString(patx),outfile))
				## Calculate Information Content ##
				aafreq = seqlist.aaFreq()
				self.verbose(0,3,'Calculating Information Content & Length stats...',0)
				occx = 0
				for pattern in self.list['Pattern']:
					pattern.stat['Info'] = self.calculateScore(pattern.info['Pattern'],aafreq)
					pattern._makeLength()
					occx += 1
					rje.progressPrint(self,occx,patx/100,patx/10)
				self.verbose(0,1,'...Done!',2)
				## Prepare Results ##
				delimit = rje.getDelimit(self.cmd_list)
				if self.info['Name'] == 'None':
					self.info['Name'] = '%s.teiresias.%s' % (rje.baseFile(seqlist.info['Name'],True),rje.delimitExt(delimit))
				if self.opt['MySQL']:	# Two tables
					patfile = os.path.splitext(self.info['Name'])
					occfile = '%s.occ%s' % (patfile[0],patfile[1])
					patfile = '%s.patterns%s' % (patfile[0],patfile[1])
					if self.opt['Append']:
						PATFILE = open(patfile,'a')
						OCCFILE = open(occfile,'a')
					else:
						PATFILE = open(patfile,'w')
						rje.writeDelimit(PATFILE,['pattern','tot_occ','seq_occ','info','len','fix','wild'],delimit)
						OCCFILE = open(occfile,'w')
						rje.writeDelimit(OCCFILE,['seq_id','pos','pattern','pat_match'],delimit)
				else:
					if self.opt['Append']:
						RESFILE = open(self.info['Name'],'a')
					else:
						RESFILE = open(self.info['Name'],'w')
						rje.writeDelimit(RESFILE,['Sequence Name','Position','Pattern','Match','Total Occurrences','Num Sequences','Information Content','Length','Fixed','Wildcard'],delimit)
				## Save Results ##
				occx = 0
				for pattern in self.list['Pattern']:
					patstats = []
					for stat in ['OccCount','SeqCount','Info','Length','Fixed','Wildcards']:
						patstats.append('%d' % pattern.stat[stat])
					patstats[2] = '%.3f' % pattern.stat['Info']
					if self.opt['MySQL']:	# Two tables
						rje.writeDelimit(PATFILE,[pattern.info['Pattern']] + patstats,delimit)
					for occ in rje.sortKeys(pattern.occ):
						seq = seqlist.seq[occ]
						for pos in pattern.occ[occ]:
							match = seq.info['Sequence'][pos:(pos+pattern.stat['Length'])]
							outlist = [seq.shortName(),'%d' % pos,pattern.info['Pattern'],match]
							if self.opt['MySQL']:	# Two tables
								rje.writeDelimit(OCCFILE,outlist,delimit)
							else:
								rje.writeDelimit(RESFILE,outlist+patstats,delimit)
							occx += 1
				if self.opt['MySQL']:	# Two tables
					PATFILE.close()
					OCCFILE.close()
					self.log.printLog('#OUT','%s patterns output to %s.' % (rje.integerString(patx),patfile))
					self.log.printLog('#OUT','%s pattern occurrences output to %s.' % (rje.integerString(occx),occfile))
				else:
					RESFILE.close()
					self.log.printLog('#OUT','%s occurrences of %s patterns output to %s.' %
									  (rje.integerString(occx),rje.integerString(patx),self.info['Name']))

			### InfoContent ###
			elif self.info['Info'] != 'None':
				## Setup ##
				alphabet = rje_seq.alph_protx 
				if not os.path.exists(self.info['Info']):
					self.log.errorLog('Input file %s missing!' % self.info['Info'],False,False)
					return False
				else:
					mypresto = presto.Presto(self.log,self.cmd_list)
					mypresto.loadMotifs(file=self.info['Info'],clear=True)
				seqlist = rje_seq.SeqList(self.log,self.cmd_list+['autoload=T'])
				if seqlist.seqNum() > 0:
					aafreq = seqlist.aaFreq(alphabet=None,fromfile=None,loadfile=None,total=False)  ### Returns dictionary of AA (& gap etc.) frequencies
				else:
					aafreq = {}
					for aa in alphabet:
						aafreq[aa] = 1.0 / len(alphabet)
				alphabet = aafreq.keys()
				maxinfo = 0 
				for aa in alphabet:
					maxinfo +=  (aafreq[aa] * math.log(aafreq[aa],2))
				## Output ##
				delimit = rje.getDelimit(self.cmd_list)
				ext = rje.delimitExt(delimit)
				outfile = '%s.info.%s' % (rje.baseFile(self.info['Info'],True,['.txt','.%s' % ext]),ext)
				if self.opt['Append']:
					OUTFILE = open(outfile,'a')
				else:
					OUTFILE = open(outfile,'w')
					rje.writeDelimit(OUTFILE,['motif','pattern','info'],delimit)
				
				## Calculate Information Scores ##
				for motif in mypresto.motif:
					self.verbose(2,4,motif.info['Sequence'],0)
					pattern = string.replace(motif.info['Sequence'],'X','.')
					elements = string.split(pattern,'-')
					pattern = ''
					for el in elements:
						if el.find('.{') == 0:	# Ambiguous spacer length - compress
							pattern += '.'
						else:
							pattern += el
					self.verbose(2,2,'=> %s' % pattern,1)
					motif.stat['Info'] = self.calculateInformationContent(pattern,aafreq,maxinfo,self.stat['InfoGapPen'])
					self.verbose(0,3,'%s (%s) = %.2f' % (motif.info['Name'],pattern,motif.stat['Info']),1)
					## Output ##
					rje.writeDelimit(OUTFILE,[motif.info['Name'],pattern,'%.2f' % motif.stat['Info']],delimit)
				
				## Finish ##
				OUTFILE.close()
		except:
			self.log.errorLog('Error in run().',printerror=True,quitchoice=False)
			raise	# Delete this if method error not terrible
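
The TEIRESIAS branch above reads result lines of the form "<total_occ> <seq_occ> <pattern> <offsets>". A minimal sketch of that parsing using re directly instead of rje.matchExp; parse_teiresias and the demo line are illustrative only.

import re

TEIRESIAS_LINE = re.compile(r'^(\d+)\s+(\d+)\s+(\S+)\s+(\d.+\d)$')

def parse_teiresias(lines):
    '''Yield (total_occ, seq_occ, pattern, offsets) tuples from TEIRESIAS output lines.'''
    for line in lines:
        if line.startswith('#') or len(line.strip()) <= 3:
            continue                          # skip comment/header and short lines
        match = TEIRESIAS_LINE.match(line)
        if match:
            tot, seqs, pattern, offsets = match.groups()
            yield int(tot), int(seqs), pattern, offsets

for hit in parse_teiresias(['# TEIRESIAS header', '4 2 L.D.E 0 12 1 33']):
    print(hit)   # (4, 2, 'L.D.E', '0 12 1 33')
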
Ejemplo n.º 51
0
 def run(self):  ### Main run method
     '''Main run method.'''
     try:  ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         infile = self.getStr('InFile')
         while not rje.exists(infile):
             infile = rje.choice(
                 'File "%s" not found. Input file name? (Blank to quit):' %
                 infile)
             if not infile:
                 return self.printLog('#QUIT', 'Execution terminated!')
         db = rje_db.Database(self.log, self.cmd_list)
         db.basefile(rje.baseFile(infile))
         sdb = db.addTable(infile,
                           mainkeys='#',
                           delimit='\t',
                           name='SPF.Mod')
         levels = {
             'Level_1': 'k',
             'Level_2': 'p',
             'Level_3': 'c',
             'Level_4': 'o',
             'Level_5': 'f',
             'Level_6': 'g',
             'Level_7': 's'
         }
         # k__Bacteria	p__Proteobacteria	c__Alphaproteobacteria	o__Rhodospirillales	f__Rhodospirillaceae	g__	s__	denovo44
         # Unassigned	unclassified	unclassified	unclassified	unclassified	unclassified	unclassified	denovo49
         ### ~ [1] Modify Text ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         dupnames = []
         parents = {}  # Parent for each term
         renamed = []
         ex = 0.0
         etot = sdb.entryNum()
         for entry in sdb.entries():
             self.progLog('\r#SPF',
                          'Modifying SPF content: %.1f%%' % (ex / etot))
             ex += 100.0
             taxon = ''
             parent = ''
             #self.debug(entry)
             for lvl in [
                     'Level_1', 'Level_2', 'Level_3', 'Level_4', 'Level_5',
                     'Level_6', 'Level_7'
             ]:
                 entry[lvl] = string.replace(entry[lvl], 'unidentified',
                                             'unclassified')
                 #entry[lvl] = string.replace(entry[lvl],'Incertae_sedis','Incertae_sedis-%s' % levels[lvl])
                 null = '%s__' % levels[lvl]
                 #self.bugPrint(null)
                 #self.bugPrint(entry[lvl])
                 if entry[lvl] in [
                         null, 'Unassigned', 'unclassified',
                         '%sunclassified' % null,
                         '%sunidentified' % null,
                         '%sunculturedfungus' % null,
                         '%sIncertae_sedis' % null,
                         '%sunclassified_sp.' % null
                 ]:
                     if not taxon or taxon.endswith('unclassified'):
                         entry[lvl] = '%sunclassified' % null
                         #elif taxon.endswith('unassigned)'): entry[lvl] = '%s%s' % (null,taxon[3:])
                         #elif taxon.endswith('unassigned)'): entry[lvl] = '%s(%s;%s-unassigned)' % (null,string.split(taxon,'(')[1][:-1],levels[lvl])
                     elif taxon.endswith('unassigned)'):
                         entry[lvl] = '%s%s;%s-unassigned)' % (
                             null, taxon[3:][:-1], levels[lvl])
                     else:
                         entry[lvl] = '%s%s(%s-unassigned)' % (
                             null, taxon[3:], levels[lvl])
                 if entry[lvl] in parents:
                     #self.debug(parents[entry[lvl]])
                     if parent in parents[entry[lvl]]:
                         entry[lvl] = parents[entry[lvl]][parent]
                     else:
                         self.bugPrint(entry[lvl])
                         self.bugPrint(parents[entry[lvl]])
                         renamed.append(entry[lvl])
                         newtax = '%s%d' % (entry[lvl],
                                            renamed.count(entry[lvl]))
                         self.warnLog(
                             '%s had multiple parents (%s & %s) -> %s' %
                             (entry[lvl],
                              string.join(parents[entry[lvl]],
                                          '|'), parent, newtax))
                         parents[newtax] = {parent: newtax}
                         parents[entry[lvl]][parent] = newtax
                         entry[lvl] = newtax
                         self.deBug(parents[entry[lvl]])
                 elif parent:
                     parents[entry[lvl]] = {parent: entry[lvl]}
                 parent = entry[lvl]
                 if entry[lvl][3:] == taxon[3:]:
                     if (entry[lvl], taxon) not in dupnames:
                         dupnames.append((entry[lvl], taxon))
                 #self.bugPrint(entry[lvl])
                 taxon = entry[lvl]
             #self.debug(entry)
             #self.debug(parents)
         self.printLog('\r#SPF', 'Modifying SPF content complete.')
         dupnames.sort()
         for (dupA, dupB) in dupnames:
             self.warnLog('Duplicate taxa names: %s & %s' % (dupA, dupB))
         ### ~ [2] Save to file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         sdb.saveToFile(savefields=sdb.list['Fields'][1:])
         ### ~ [3] Compress to different taxonomic levels ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         compress = [
             'Level_1', 'Level_2', 'Level_3', 'Level_4', 'Level_5',
             'Level_6', 'Level_7', '#'
         ]
         dump = compress.pop(-1)
         rules = {'Observation Ids': 'list', dump: 'str'}
         sdb.dropField('Observation Ids')
         while compress:
             sdb.compress(compress,
                          rules=rules,
                          default='sum',
                          best=[],
                          joinchar='|')
             #if dump == '#':
             sdb.dropField(dump)
             sdb.saveToFile(
                 '%s.SPF.%s.%s.spf' %
                 (rje.baseFile(infile), compress[-1], levels[compress[-1]]))
             dump = compress.pop(-1)
             rules[dump] = 'list'
         return
     except:
         self.errorLog(self.zen())
         raise  # Delete this if method error not terrible
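
The level-by-level loop above back-fills unassigned taxonomy ranks from the last informative parent taxon. A simplified standalone sketch of that idea (it omits the "unassigned)" chaining and the duplicate-parent renaming handled above); relabel and LEVELS are illustrative names.

# Simplified sketch: rewrite empty/unassigned ranks as "<prefix>__<parent>(<rank>-unassigned)".
LEVELS = [('Level_%d' % i, prefix) for i, prefix in enumerate('kpcofgs', start=1)]

def relabel(row):
    '''row maps Level_1..Level_7 to strings such as 'k__Bacteria' or 'g__'.'''
    taxon = ''
    for level, prefix in LEVELS:
        null = '%s__' % prefix
        value = row[level]
        if value in (null, 'Unassigned', 'unclassified'):
            if not taxon or taxon.endswith('unclassified'):
                value = '%sunclassified' % null            # no informative parent yet
            else:
                value = '%s%s(%s-unassigned)' % (null, taxon[3:], prefix)
        row[level] = value
        taxon = value                                      # remember last assigned taxon
    return row

row = {'Level_1': 'k__Bacteria', 'Level_2': 'p__Proteobacteria', 'Level_3': 'c__',
       'Level_4': 'o__', 'Level_5': 'f__', 'Level_6': 'g__', 'Level_7': 's__'}
print(relabel(row)['Level_3'])   # c__Proteobacteria(c-unassigned)
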
Ejemplo n.º 52
0
def runMain():
    try:
        ### <0>  ### Basic Setup of Program
        [info, out, mainlog, cmd_list] = setupProgram()

        ### <1> ### Load Data
        ##  <a>  ## Read in Sequences
        try:
            out.verbose(1, 3, 'Loading sequences...', 0)
            seqfile = 'infile.fas'
            nsfin = None
            for cmd in cmd_list:
                if cmd.find('seqin=') == 0:
                    seqfile = cmd[len('seqin='):]
                if cmd.find('nsfin=') == 0:
                    nsfin = cmd[len('nsfin='):]
            basefile = seqfile
            extension = seqfile[-4:]
            if (extension == '.fas') or (extension == '.phy') or (extension
                                                                  == '.aln'):
                basefile = seqfile[:-4]
            seqs = rje_seq.SeqList(
                log=mainlog,
                cmd_list=['i=0'] + cmd_list +
                ['autofilter=F', 'autoload=F', 'seqin=None'])
            out.verbose(1, 3, "from %s" % seqfile, 1)
            if not seqs.loadSeqs(seqfile=seqfile, seqtype='protein', aln=True):
                raise
            seqfile = seqs.info['Name']
            basefile = rje.baseFile(seqfile)
            mainlog.printLog(
                '#SEQ', "%s protein sequences read from %s\n" %
                (str(seqs.seqNum()), seqfile), 1)
            mainlog.printLog(
                '#SEQ', "Alignment = %s. (%d aa)\n" %
                (seqs.opt['Aligned'], seqs.seq[0].seqLen()), 1)
        except:
            mainlog.errorLog("Fatal run Exception during Sequence Input\n")
            raise
        ##  <b>  ## Read in Tree
        try:
            if not nsfin:
                nsfin = basefile + '.nsf'
            while not os.path.exists(nsfin):
                if out.stat['Interactive'] >= 0:
                    nsfin = rje.choice(
                        text=
                        'Input tree file "%s" not found. Input filename? (Blank to exit.)'
                        % nsfin)
                    if nsfin == '':
                        raise KeyboardInterrupt
                else:
                    mainlog.errorLog(
                        'File %s not found. Cannot load tree!' % nsfin,
                        printerror=False,
                        quitchoice=True)
                    raise
            cmd_list.append('nsfin=' + nsfin)
            out.verbose(1, 3, 'Loading tree from %s...' % nsfin, 1)
            mytree = rje_tree.Tree(log=mainlog,
                                   cmd_list=['root=yes'] + cmd_list)
            mytree.mapSeq(seqlist=seqs)
            mytree.textTree()
            if mytree.opt['ReRooted']:
                mytree.saveTree(filename='%s.nsf' % basefile)
        except KeyboardInterrupt:
            mainlog.errorLog("User terminated.\n")
            raise
        except:
            mainlog.errorLog("Fatal run Exception during Tree Input\n")
            raise

        ### <2> ### GASP
        try:
            ## <a> ## InDel Tree Setup
            indeltree = None
            for cmd in cmd_list:
                if cmd.find('indeltree=') == 0:
                    indeltree = cmd[len('indeltree='):]

            ## <b> ## GASP
            if indeltree == None or mytree.node[-1].obj[
                    'Sequence'] == None:  # Perform GASP
                out.verbose(0, 2, '', 3)
                mainlog.printLog('#SEQ',
                                 'GASP: Gapped Ancestral Sequence Prediction',
                                 1)
                if basefile == 'infile':
                    basefile = 'gasp'
                mygasp = rje_ancseq.Gasp(tree=mytree,
                                         ancfile='%s' % basefile,
                                         cmd_list=cmd_list,
                                         log=mainlog)
                out.verbose(0, 2, '%s' % mygasp.details(), 1)
                if out.stat['Interactive'] > 0:
                    if rje.yesNo('Use these parameters?') == False:
                        mygasp.edit()
                mygasp.gasp()
                out.verbose(0, 1, "\n\nGASP run completed OK!", 2)

            ## <c> ## InDel Tree
            if indeltree:
                mytree.indelTree(filename=indeltree)

        except KeyboardInterrupt:
            mainlog.errorLog("User terminated.\n")
            raise
        except:
            mainlog.errorLog("Fatal run Exception during GASP\n")
            raise

        ### <X> ### End
    except KeyboardInterrupt:
        mainlog.errorLog("User terminated.\n")
    except:
        print "Unexpected error:", sys.exc_info()[0]
    mainlog.printLog(
        '#LOG',
        "%s V:%s End: %s\n" % (info.program, info.version,
                               time.asctime(time.localtime(time.time()))), 1)
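
runMain() pulls seqin=, nsfin= and indeltree= values out of the raw command list by prefix matching. A minimal standalone sketch of that lookup; get_option is an illustrative helper, not an rje function.

def get_option(cmd_list, key, default=None):
    '''Return the value of the last "key=value" entry in cmd_list, or default.'''
    value = default
    for cmd in cmd_list:
        if cmd.startswith(key + '='):
            value = cmd[len(key) + 1:]       # later entries override earlier ones
    return value

cmds = ['i=0', 'seqin=myseqs.fas', 'nsfin=mytree.nsf']
print(get_option(cmds, 'seqin', 'infile.fas'))   # myseqs.fas
print(get_option(cmds, 'indeltree'))             # None
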
Ejemplo n.º 53
0
 def splitMascot(self):  ### Reads the MASCOT file and splits into header, hits and unmatched files.
     '''Reads the MASCOT file and splits into header, hits and unmatched files.'''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         db = self.db()
         infile = self.getStr('MASCOT')
         if self.basefile().lower() in ['','none']: self.basefile(rje.baseFile(self.getStr('MASCOT')))
         #x#self.deBug(self.basefile())
         headfile = '%s.header.txt' % self.basefile()
         hitsfile = '%s.mascot.csv' % self.basefile()
         peptfile = '%s.nohits.csv' % self.basefile()
         if rje.isYounger(self.getStr('MASCOT'),hitsfile) == hitsfile and not self.force():
             return self.printLog('#FILE','%s file found (force=F)' % hitsfile)
         ### ~ [1] Split MASCOT~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         headlines = []
         csvhead = []
         mdb = None
         mx = 0
         itraq = []
         prot_data = {}
         for mline in open(self.getStr('MASCOT'),'r').readlines():
             mx += 1     # Index of next line in case needed for iTRAQ reading!
             ## ~ [1a] Skip down until Header found ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             if not headlines and mline.find('Header') < 0: continue
             ## ~ [1b] Add Header lines to headlines until results headers found ~~~~~~~~~~~~~~~ ##
             if not csvhead and mline.find('prot_hit_num') < 0: headlines.append(mline); continue
             ## ~ [1c] Sort out MASCOT results headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             if mline.find('prot_hit_num') >= 0:
                 ## ~ Read Headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 open(headfile,'w').writelines(headlines)
                 csvhead = rje.readDelimit(string.join(string.split(rje.chomp(mline))),',')
                 while '' in csvhead: csvhead.remove('')
                 ## ~ Sort out iTRAQ headers (missing) ~~~~~~~~~ ##
                 if self.getBool('iTRAQ'):
                     iline = open(self.getStr('MASCOT'),'r').readlines()[mx]
                     for isplit in rje.readDelimit(iline,',')[len(csvhead):]:  # Should be start of iTRAQ data
                         if '/' in isplit: itraq.append(isplit)
                     self.printLog('#ITRAQ',string.join(itraq))
                     csvhead += itraq
                     idb = db.addEmptyTable('itraq',['prot_hit_num','prot_acc','prot_desc','itraq','ratio','n','geomean','summary'],keys=['prot_hit_num','itraq'])
                     idb.info['Delimit'] = ','
                 ## ~ Add emPAI header (also missing) ~~~~~~~~~~ ##
                 if self.getBool('emPAI'): csvhead.append('empai')
                 ## ~ Set up Database Table ~~~~~~~~~~~~~~~~~~~~ ##
                 self.printLog('#HEAD',string.join(csvhead,'; '))
                 mdb = db.addEmptyTable('mascot',csvhead,keys=['prot_hit_num','pep_query'])
                 mdb.info['Delimit'] = ','
             elif mline.find('Peptide matches') >= 0:
                 mdb.saveToFile()
                 if self.getBool('emPAI'): csvhead.remove('empai')
                 mdb = db.addEmptyTable('nohits',csvhead,keys=['pep_query'])
                 for field in mdb.fields():
                     if field[:4] == 'prot': mdb.dropField(field)
                 mdb.info['Delimit'] = ','
                 continue
             elif rje.chomp(mline):
                 #self.deBug('%s ... %s' % (mline[:20],mline.find('Peptide matches')))
                 data = rje.readDelimit(mline,',')
                 entry = {}; pretraq = True
                 #self.deBug(csvhead); self.deBug(itraq);
                 for d in range(len(csvhead)+len(itraq)):
                     if d >= len(data): break
                     if data[d] in itraq: dhead = data[d]; pretraq = False
                     elif data[d] == 'emPAI': entry['empai'] = data[d+1]; pretraq = False
                     elif pretraq and d < len(csvhead): dhead = csvhead[d]
                     elif pretraq: continue      # Unmatched peptides will not have emPAI or iTRAQ data
                     #self.deBug('%s > %s' % (data[d],dhead))
                     if d and data[d-1] == 'emPAI': continue
                     elif data[d] in itraq + ['emPAI']: continue
                     elif dhead not in entry: entry[dhead] = data[d]
                     #self.deBug('%s = %s' % (dhead,entry[dhead]))
                 if entry['prot_acc']: prot_data[entry['prot_hit_num']] = {'prot_acc':entry['prot_acc'],'prot_desc':entry['prot_desc']}
                 if self.getBool('iTRAQ') and 'Quantitation summary for protein' in data:
                     d = data.index('Quantitation summary for protein') + 1
                     if entry['prot_hit_num'] in prot_data:
                         pacc = prot_data[entry['prot_hit_num']]['prot_acc']
                         pdesc = prot_data[entry['prot_hit_num']]['prot_desc']
                     else:
                         pacc = entry['prot_acc']
                         pdesc = entry['prot_desc']
                     while d < len(data):
                         if data[d] in itraq:
                             idb.addEntry({'prot_hit_num':entry['prot_hit_num'],'prot_acc':pacc,'prot_desc':pdesc,
                                           'itraq':data[d],'ratio':data[d+1],'n':data[d+2],'geomean':data[d+3],'summary':data[d+4]})
                         d += 1
                 #self.deBug(entry)
                 if entry['prot_hit_num'] or entry['pep_query']: mdb.addEntry(entry)
         mdb.saveToFile()
         if self.getBool('iTRAQ'): idb.saveToFile()
         self.deBug('')
         return True
     except: self.errorLog('Error reading MASCOT file'); return False
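
splitMascot() walks the MASCOT CSV export once: everything before the prot_hit_num column header is file header, rows after the "Peptide matches" marker are unmatched peptides, and the rows in between form the hits table. A rough standalone sketch of that three-way split (it drops the iTRAQ/emPAI handling shown above); split_mascot is an illustrative name.

def split_mascot(lines):
    '''Return (header_lines, hit_rows, nohit_rows) from MASCOT CSV export lines.'''
    header, hits, nohits = [], [], []
    section = 'header'
    for line in lines:
        if section == 'header':
            if 'prot_hit_num' in line:
                section = 'hits'              # column header row starts the hits table
            else:
                header.append(line)
        elif 'Peptide matches' in line:
            section = 'nohits'                # marker row starts the unmatched peptides
        elif line.strip():
            (hits if section == 'hits' else nohits).append(line)
    return header, hits, nohits

demo = ['Header', 'Search title: test', 'prot_hit_num,prot_acc,pep_query',
        '1,P12345,1', 'Peptide matches not assigned to protein hits', '2,,5']
print([len(part) for part in split_mascot(demo)])   # [2, 1, 1]
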
Ejemplo n.º 54
0
 def parse(self):  ### Parse REST file into dictionaries
     '''Parse REST file into dictionaries.'''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.list['RestKeys'] = []
         rbase = '%s%s' % (self.getStr('RestOutDir'),
                           rje.baseFile(self.getStr('RestBase'),
                                        strip_path=True,
                                        keepext=True))
         if rje.exists(self.getStr('RestIn')):
             restin = open(self.getStr('RestIn'), 'r').read()
         elif rje.matchExp('^(\d+)$', self.getStr('RestIn')):
             url = '%sretrieve&jobid=%s&password=%s' % (self.getStr(
                 'RestURL'), self.getStr('RestIn'), self.getStr('Password'))
             if self.getBool('PureAPI') and self.getStrLC('Rest'):
                 url += '&rest=%s' % (self.getStr('Rest'))
             else:
                 url += '&rest=full'
             restin = urllib2.urlopen(url).read()
             if self.getBool('PureAPI'): return restin
         else: raise IOError('%s not found!' % self.getStr('RestIn'))
         jobid = None
         ### ~ [2] Parse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for restdata in string.split(
                 restin,
                 '###~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~###\n'
         ):
             if not jobid:
                 self.dict['Output']['intro'] = restdata
                 jobid = rje.matchExp('JobID: (\d+)', restdata)[0]
                 self.dict['Output']['jobid'] = jobid
                 if not self.getStrLC('RestBase'):
                     rbase = '%s%s' % (self.getStr('RestOutDir'), jobid)
                 self.dict['Outfile']['jobid'] = '%s.jobid' % (rbase)
                 continue
             restlines = string.split(restdata, '\n')
             rparse = string.split(restlines.pop(0))
             if rparse[0] != '#':
                 self.errorLog('REST output format error: %s' %
                               string.join(rparse),
                               printerror=False)
                 continue
             if rparse[1][-1] != ':':
                 self.errorLog('REST output format error: %s' %
                               string.join(rparse),
                               printerror=False)
                 continue
             rkey = rparse[1][:-1]
             try:
                 rfile = '%s.%s' % (
                     rbase,
                     rje.baseFile(rparse[2], strip_path=True, keepext=True))
             except:
                 rfile = ''
             if not rfile: rfile = '%s.%s' % (rbase, rkey)
             self.dict['Output'][rkey] = string.join(restlines, '\n')
             self.dict['Outfile'][rkey] = rfile
             self.list['RestKeys'].append(rkey)
         self.printLog(
             '#PARSE', 'Parsed %s: %d REST outputs.' %
             (self.getStr('RestIn'), len(self.dict['Output'])))
         return True
     except:
         self.errorLog('%s.parse error' % self)
         return False
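
parse() splits the full REST job text on the long "###~...~###" divider, takes the JobID from the first chunk and keys each later chunk by the "# key: filename" line that opens it. A minimal standalone sketch of that parsing; parse_rest is an illustrative helper and the divider length is assumed to match the module's constant.

import re

DIVIDER = '###' + '~' * 74 + '###\n'   # assumed to mirror the divider used above

def parse_rest(text):
    '''Return (jobid, {key: content}) parsed from a full REST job output.'''
    chunks = text.split(DIVIDER)
    found = re.search(r'JobID: (\d+)', chunks[0])
    jobid = found.group(1) if found else None
    outputs = {}
    for chunk in chunks[1:]:
        lines = chunk.split('\n')
        head = lines.pop(0).split()         # e.g. ['#', 'status:', 'job.status']
        if len(head) < 2 or head[0] != '#' or not head[1].endswith(':'):
            continue                        # skip malformed section headers
        outputs[head[1][:-1]] = '\n'.join(lines)
    return jobid, outputs

demo = 'HomePage\nJobID: 42\n' + DIVIDER + '# status: job.status\nFinished.\n'
print(parse_rest(demo))   # ('42', {'status': 'Finished.\n'})
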