Example #1
 def tabulatePPIRegion(self):    ### Tabulates regions of known PPI from DAT file
     '''Tabulates regions of known PPI from DAT file.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         tabfile = 'ppi_region.tdt'
         unifile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/UniFake/Human/ens_HUMAN.unifake.dat'
         if os.path.exists(tabfile) and not self.opt['Force']: return self.printLog('#REGTAB','%s found. (Force=F)' % tabfile)
         headers = ['Protein','Start','End','Interactor']
         rje.delimitedFileOutput(self,tabfile,headers,rje_backup=True)
         ### ~ [2] Extract and tabulate data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
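         # grep pipeline: '-P' enables Perl-style patterns, '-i' makes the HUMAN/interact filter case-insensitive,
         # and '-B 1' keeps the line before each REGION line (normally the protein's 'ID   ' line) for mapping below.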
         gcmd = "grep -P '(ID   |REGION)' %s | grep -P '(HUMAN|interact)' -i | grep REGION -B 1" % unifile
         self.printLog('#GREP',gcmd)
         prot = None; rx = 0; plist = []; ilist = []
         for gline in os.popen(gcmd).readlines():
             if rje.matchExp('ID   (\S+)',gline): prot = rje.matchExp('ID   (\S+)',gline)[0]
             if rje.matchExp('FT   REGION\s+(\d+)\s+(\d+).+nteract\S+ with (\S.+)',gline):
                 (rstart,rend,rint) = rje.matchExp('FT   REGION\s+(\d+)\s+(\d+).+nteract\S+ with (\S.+)',gline)
                 for ppi in string.split(rint):
                     if rje.matchExp('^([A-Z0-9][A-Z0-9]+)',ppi):
                         datadict = {'Protein':prot,'Start':rstart,'End':rend,'Interactor':rje.matchExp('^([A-Z0-9][A-Z0-9]+)',ppi)[0]}
                         rje.delimitedFileOutput(self,tabfile,headers,datadict=datadict); rx += 1
                         if prot not in plist: plist.append(prot)
                         if datadict['Interactor'] not in ilist: ilist.append(datadict['Interactor'])
                         self.progLog('\r#REGTAB','Tabulating regions: %s proteins; %s interactors; %s regions' % (rje.integerString(len(plist)),rje.integerString(len(ilist)), rje.integerString(rx)))
         self.printLog('\r#REGTAB','Tabulated regions (%s proteins; %s interactors; %s regions) => %s' % (rje.integerString(len(plist)),rje.integerString(len(ilist)),rje.integerString(rx),tabfile))
         return True
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise   # Delete this if method error not terrible
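Every example in this collection drives its table output through the same two-step rje.delimitedFileOutput pattern: one call writes the header row (rje_backup=True moves any existing file aside first), then one call per record appends a row built from a datadict. A minimal stand-alone sketch of that pattern, assuming rje is importable and that the first argument is simply the calling object used for logging, as in the methods shown here (save_rows and the column names are illustrative only):

import rje

def save_rows(callobj, rows, outfile='example.tdt', delimit='\t'):
    '''Write a list of row dictionaries to a delimited file, one row per call.'''
    headers = ['Protein','Start','End','Interactor']    # fixed column order for the output
    # First call writes the header line; rje_backup=True backs up any existing outfile.
    rje.delimitedFileOutput(callobj, outfile, headers, delimit, rje_backup=True)
    for row in rows:
        # Each subsequent call appends one delimited row taken from datadict.
        rje.delimitedFileOutput(callobj, outfile, headers, delimit, datadict=row)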
Example #2
 def makeGOFile(self):   ### Maps GO to sequences and outputs table for R analysis
     '''Maps GO to sequences and outputs table for R analysis.'''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         outfile = '%s.goer.tdt' % self.info['ResFile']
         headers = ['GOID','Motif','Type','Gene','Cons','HomNum','GlobID','LocID','Hyd','SA']
         rje.delimitedFileOutput(self,outfile,headers,rje_backup=True)
         ### ~ [2] ~ Work through dictionary and output data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         (mx,mtot) = (-100.0,len(self.dict['Occ']))
         for motif in rje.sortKeys(self.dict['Occ']):
             mx += 100.0; self.progLog('\r#OUT','Generating %s output: %.1f%% (%s|CheckSeq)         ' % (outfile,(mx/mtot),motif))
             ## ~ [2a] ~ Check MinOcc in terms of sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             for type in rje.sortKeys(self.dict['Occ'][motif]):
                 if len(self.dict['Occ'][motif][type]) < self.stat['MinOcc']: self.dict['Occ'][motif].pop(type)
             if 'ELM' not in self.dict['Occ'][motif] or len(self.dict['Occ'][motif]) < 2: continue
             for type in self.dict['Occ'][motif]:
                 ## ~ [2b] ~ Map GO terms and check MinOcc ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 self.progLog('\r#OUT','Generating %s output: %.1f%% (%s|Check%s) ' % (outfile,(mx/mtot),motif,type)); 
                 godict = {}     # Temp dictionary of {GOID:[Seqs]}
                 for gene in self.dict['Occ'][motif][type]:
                     for go in self.ensGO(gene):
                         if go not in godict: godict[go] = [gene]
                         else: godict[go].append(gene)
                 self.progLog('\r#OUT','Generating %s output: %.1f%% (%s|OccGO%s) ' % (outfile,(mx/mtot),motif,type)); 
                 for go in rje.sortKeys(godict):
                     if len(godict[go]) < self.stat['MinOcc']: godict.pop(go)
                 ## ~ [2c] ~ Output remaining GO terms occurrences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 self.progLog('\r#OUT','Generating %s output: %.1f%% (%s|Output%s)' % (outfile,(mx/mtot),motif,type)); 
                 for go in rje.sortKeys(godict):
                     for gene in godict[go]:
                         for occdict in self.dict['Occ'][motif][type][gene]:
                             datadict = rje.combineDict({'GOID':'GO:%s' % go,'Motif':motif,'Type':type,'Gene':gene},occdict)
                             rje.delimitedFileOutput(self,outfile,headers,datadict=datadict)
             self.printLog('#OUT','Output for %s %s complete.' % (motif,rje.sortKeys(self.dict['Occ'][motif])),screen=False)
         self.printLog('\r#OUT','Generating %s output complete!         ' % (outfile))
     except: self.log.errorLog(rje_zen.Zen().wisdom())
Example #3
 def _setupOutput(self): ### Sets up output files self.str['MapFas','MissFas','MapRes']
     '''Sets up output files self.str['MapFas','MissFas','MapRes'].'''
     ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     delimit = rje.getDelimit(self.cmd_list)
     if self.str['StartFrom'].lower() in ['','none']: self.str['StartFrom'] = ''
     else:
         self.bool['Append'] = True
         self.printLog('#CMD','StartFrom = "%s" so Append=T' % self.str['StartFrom'])
     ### ~ [1] General ResFile ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     files = {'MapFas':'mapping.fas','MissFas':'missing.fas','MapRes':'mapping.%s' % rje.delimitExt(delimit)}
     if self.getBool('Combine'): files.pop('MissFas')
     if self.str['ResFile'].lower() in ['','none']:
         self.str['ResFile'] = '%s.%s' % (rje.baseFile(self.str['SeqIn']),rje.baseFile(self.str['MapDB'],strip_path=True))
     for file in files.keys():
         self.setStr({file: self.getStr('ResFile') + '.' + files[file]})
         rje.backup(self,self.getStr(file))
     ### ~ [2] Headers for MapRes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     #!# Consider replacing with rje_db object? #!#
     self.list['Headers'] = ['Query','Hit','Method','MapRank','BlastRank','EVal','Score']
     for qh in ['Query','Hit']:
         self.list['Headers'] += ['%s_Species' % qh]
         if self.bool['GablamOut']:
             for st in ['Len','Sim','ID']:
                 self.list['Headers'] += ['%s_%s' % (qh,st)]
     rje.delimitedFileOutput(self,self.str['MapRes'],self.list['Headers'],delimit)
Example #4
    def run(self,setup=True):  ### Main Run Method
        '''
        Main Run Method
        >> setup:bool [True] = Sets up headers and reads in existing data if present.
        '''
        try:
            ### ~ Setup & Read existing data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if setup: self.setup()
            headers = self.list['Headers']
            delimit = rje.delimitFromExt(filename=self.info['CardOut'])
            if os.path.exists(self.info['EnsLoci']):
                for h in ['EnsLoci','EnsDesc']:
                    if h not in headers: headers.append(h)
            rje.delimitedFileOutput(self,self.info['CardOut'],headers,delimit,rje_backup=True)

            ### ~ Read EnsLoci for incorporation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.ensLoci()
                        
            ### ~ Parse data from GeneCards website and/or previously read aliases ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.processGenes(self.list['Genes'])
            self.interactiveUpdate()
        
            ### ~ Add EnsEMBL EnsLoci data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.addEnsLoci()

            ### ~ Output GeneCards data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.outputCards()
            
        except:
            self.log.errorLog('Apocalyptic error with GeneCards.run()')
            raise
Example #5
 def outputCards(self):  ### Outputs cards to delimited file
     '''Outputs cards to delimited file.'''
     ### ~ Setup for output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     genelist = self.list['Genes']
     if self.opt['Purify'] and self.opt['Restrict']:
         for gene in genelist[0:]:
             if self.dict['GeneCard'][gene]['Symbol'] not in [gene,'!FAILED!']:  # Replace with symbol
                 genelist.remove(gene)
                 if self.dict['GeneCard'][gene]['Symbol'] not in genelist: genelist.append(self.dict['GeneCard'][gene]['Symbol'])
     delimit = rje.delimitFromExt(filename=self.info['CardOut'])
     CARDOUT = open(self.info['CardOut'],'a')
     ### ~ Generate output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     (noens,noloci,ox) = (0,0,0)
     for gene in rje.sortKeys(self.dict['GeneCard']):
         if self.opt['Restrict'] and gene not in genelist: continue
         elif self.opt['Purify'] and self.dict['GeneCard'][gene]['Symbol'] not in [gene,'!FAILED!']: continue
         self.progLog('\r#OUT','Output for %s parsed genes' % rje.iStr(ox)); ox += 1
         self.dict['GeneCard'][gene]['Alias'] = gene
         self.dict['GeneCard'][gene]['Species'] = self.info['Species']
         rje.delimitedFileOutput(self,CARDOUT,self.list['Headers'],delimit,self.dict['GeneCard'][gene])
         if self.dict['GeneCard'][gene]['Symbol'] == gene:   # Not an alias
             if 'EnsEMBL' not in self.dict['GeneCard'][gene] or not self.dict['GeneCard'][gene]['EnsEMBL']: noens += 1
             if 'EnsLoci' not in self.dict['GeneCard'][gene] or not self.dict['GeneCard'][gene]['EnsLoci']: noloci += 1
     CARDOUT.close()
     self.printLog('\r#OUT','Parsed info for %d genes output to %s' % (len(self.list['Genes']),self.info['CardOut']))
     self.printLog('#ENS','%s without EnsGene; %s without EnsLoci' % (rje.integerString(noens),rje.integerString(noloci)))
Example #6
 def run(self,imenu=False,outputmap=True,returndict=False):      ### Main controlling run Method
     '''
     Main controlling run Method.
     >> imenu:boolean = Whether to initiate interactive menu if appropriate [False].
     >> outputmap:boolean = Whether to output mapping into a file [True]
     >> returndict:boolean = Whether to return a dictionary of {searchname:mappedname} (no previous mapping) [False]
     '''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not self.setup(imenu): raise ValueError
         seqlist = rje_seqlist.SeqList(self.log,self.cmd_list+['autoload=T','seqmode=file'])
         if not seqlist.seqNum(): self.warnLog('No sequences loaded for mapping.'); return {}
         ## ~ [0a] Setup BLAST Search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         blast = rje_blast.BLASTRun(self.log,['blaste=1e-4','blastv=20','blastf=F']+self.cmd_list+['v=-1'])
         blast.setStr({'DBase':self.getStr('MapDB'),'Type':'blastp','InFile':self.getStr('SeqIn'),
                      'Name':'%s-%s.blast' % (rje.baseFile(self.str['SeqIn'],True),rje.baseFile(self.str['MapDB'],True))})  
         blast.setStat({'HitAln':blast.getStat('OneLine')})
         blast.list['ResTab'] = ['Search','Hit','GABLAM']
         if seqlist.nt(): blast.str['Type'] = 'blastx'
         ## ~ [0b] Setup Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if outputmap: self._setupOutput()                           ## Output Files ##
         if returndict: mapdict = {}
         else: self._setupMapped()                                   ## Previously Mapped Sequences ##
         seqx = seqlist.seqNum()             ## Number of sequences ##
         ### ~ [1] BLAST Search Mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.printLog('#BLAST','BLASTing %s vs %s.\n *** This could take some time if files are large. Please be patient! ***' % (self.str['SeqIn'],self.str['MapDB']),log=False)
         ## ~ [1a] Perform BLAST Unless it exists ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         blast.run(format=True)
         self.obj['DB'] = blast.obj['DB']
         ## ~ [1b] Mapping from searches ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         self.debug(self.getStr('MapDB'))
         self.obj['MapDB'] = rje_seqlist.SeqList(self.log,self.cmd_list+['autoload=F','seqmode=file','seqin=%s' % self.str['MapDB']])
         self.obj['MapDB'].loadSeq(self.getStr('MapDB'))
         self.debug('%s' % self.obj['MapDB'].list['Seq'])
         sx = 0
         while seqlist.nextSeq() != None:
             search = seqlist.getSeq(format='short')
             sx += 1
             ## Check StartFrom ##
             if self.str['StartFrom']:
                 if self.str['StartFrom'] != search:
                     self.progLog('\r#SKIP','Looking for %s: skipping %d seqs' % (self.str['StartFrom'],sx))
                     continue
                 self.printLog('\r#SKIP','Starting from %s: skipped %d seqs' % (self.str['StartFrom'],sx))
                 self.str['StartFrom'] = ''
             ## Check if in Mapped ##
             if search in self.list['Mapped']:
                 resdict = {'Query':search,'Hit':search,'Method':'Already Mapped!'}
                 self.printLog('#FAS','%s already in output - not duplicating in %s' % (search,self.str['MapFas']))
                 rje.delimitedFileOutput(self,self.str['MapRes'],self.list['Headers'],rje.getDelimit(self.cmd_list),resdict)
                 continue
             ### Map Sequence ###
             self.printLog('#MAP','Mapping %s seqs: %s of %s' % (self.str['SeqIn'],rje.integerString(sx),rje.integerString(seqx)))
             mapname = self.mapSeq(seqlist,blast,search)
             if returndict: mapdict[search] = mapname
         ### ~ [2] Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.printLog('#MAP','Mapping of %s (%s seqs) complete.' % (self.str['SeqIn'],rje.integerString(seqx)))           
         if os.path.exists(blast.str['Name']) and not (self.getBool('DeBug') or self.test()): os.unlink(blast.str['Name'])     #!# Add option to keep BLAST! #!#
         if returndict: return mapdict
     except: self.errorLog('Error in SeqMapper.run()',printerror=True,quitchoice=True); raise   
Example #7
 def run(self,batch=False):  ### Main run method
     '''Main run method.'''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [1a] ~ Results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if not batch: self.setupResults()
         ## ~ [1b] ~ Batch run ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if not batch and not self.obj['SeqList'].seqs():    ### Look for batch files and run for each
             batchfiles = rje.getFileList(self,filelist=self.list['Batch'],subfolders=False,summary=True,filecount=0)
             self.printLog('\r#FILES','Getting files: %5s files for batch run' % rje.integerString(len(batchfiles)))
             if not batchfiles: self.errorLog('No input files found!',printerror=False)
             else:
                 bx = 0
                 for infile in batchfiles:
                     bx += 1
                     self.printLog('#BATCH','Batch running %s' % infile)
                     bcmd = ['query=1']+self.cmd_list+['autoload=T','seqin=%s' % infile]
                     self.obj['SeqList'] = rje_seq.SeqList(self.log,bcmd)
                     self.run(batch=True)
                     self.opt['Append'] = True
                     self.printLog('#BATCH','|---------- %s run <<<|>>> %s to go -----------|' % (rje.integerString(bx),rje.integerString(len(batchfiles)-bx)),log=False)
             if self.opt['Win32'] and len(sys.argv) < 2: self.verbose(0,0,'Finished!',1) # Optional pause for win32
             return
         ## ~ [1c] ~ Special run options ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if self.info['Special'].lower() == 'allbyall':
             self.printLog('#RUN','Performing special "all-by-all" pairwise run')
             self.info['Special'] = ''
             for i in range(len(self.seqs())-1):
                 self.obj['SeqList'].obj['QuerySeq'] = self.seqs()[i]
                 for j in range(i+1,len(self.seqs())):
                     self.info['Fitness'] = self.info['Phenotype'] = '%d' % (j + 1)
                     self.run(batch=True)
                     self.opt['Append'] = True
             self.info['Special'] = 'allbyall'; return                
         ## ~ [1d] ~ General setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         self.setup()
         ### ~ [2] ~ Price calculations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.fitness()
         self.phenotype()
         self.grouping()
         for vector in ['Fitness','Phenotype','SeqGroup']:
             if len(self.list[vector]) != self.qry().seqLen():
                 self.errorLog('%s vector length (%s) does not match query sequence length (%s)' % (vector,len(self.list[vector]),self.qry().seqLen()),printerror=False)
                 raise ValueError
         results = self.price()
         ### ~ [3] ~ Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         results['Dataset'] = rje.baseFile(self.obj['SeqList'].info['Name'],True)
         results['Query'] = self.qry().shortName()
         results['Fitness'] = self.info['Fmethod']
         results['Phenotype'] = self.info['Pmethod']
         results['SeqGroup'] = self.info['SeqGroup']
         rje.delimitedFileOutput(self,self.info['ResFile'],self.list['Headers'],datadict=results)
         self.printLog('#OUT','Results output to %s' % self.info['ResFile'])
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise   # Delete this if method error not terrible
Example #8
 def setup(self):    ### Main class setup method. Makes sumfile if necessary.
     '''Main class setup method. Makes sumfile if necessary.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.debug(self.getStrLC('SumFile')); self.debug(self.getStr('SumFile'))
         if self.getStrLC('Basefile') in ['','none']: self.baseFile(rje.baseFile(self.info['SumFile']))
         if self.getStrLC('SumFile') in ['','none']: self.info['SumFile'] = '%s.tdt' % self.basefile()
         self.printLog('#SUM','Summary file: %s' % self.getStr('SumFile'))
         if os.path.exists(self.info['SumFile']) and not self.opt['Force']:
             if rje.yesNo('%s found. Use these results?' % self.info['SumFile']):
                 return self.printLog('#SUM','Summary results file found. No MASCOT processing.')
         mapgi = False
         ### ~ [2] Process MASCOT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for mfile in self.list['ResFiles']:
             bud = budapest.Budapest(self.log,self.cmd_list+['mascot=%s' % mfile])
             bud.info['Name'] = mfile
             bud.readMascot()
             self.dict['Searches'][mfile] = bud.dict['Hits']
             protacclist = rje.sortKeys(bud.dict['Hits'])
             for protacc in protacclist:
                 if rje.matchExp('gi\|(\d+)',protacc): mapgi = True
             accfile = '%s.%s.protacc' % (self.baseFile(),rje.baseFile(mfile))
             self.debug(accfile)
             open(accfile,'w').write(string.join(protacclist,'\n'))
             self.printLog('#MFILE','%s: %s proteins.' % (mfile,rje.iLen(protacclist)))
         ## ~ [2a] gi Mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         #if mapgi:
         #    mapgi = self.dict['MapGI'] = seqlist.seqNameDic('NCBI')
         #    open('mapgi.tmp','w').write(string.join(rje.sortKeys(mapgi),'\n'))
         ### ~ [3] Setup seqlist ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqlist = rje_seq.SeqList(self.log,['gnspacc=T']+self.cmd_list)
         self.dict['Acc2Seq'] = seqlist.seqNameDic('Max')
         ### ~ [4] Generate Summary File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         sumhead = string.split('search,prot_hit_num,prot_acc,prot_desc,pep_seq',',')
         rje.delimitedFileOutput(self,self.info['SumFile'],sumhead,rje_backup=True)
         for mfile in rje.sortKeys(self.dict['Searches']):
             bud = self.dict['Searches'][mfile]
             for protacc in rje.sortKeys(bud)[0:]:
                 protname = bud[protacc]['prot_acc']
                 protdesc = bud[protacc]['prot_desc']
                 if rje.matchExp('gi\|(\d+)',protacc):
                     gi = rje.matchExp('gi\|(\d+)',protacc)[0]
                     try:
                         protname = self.dict['Acc2Seq'][gi].shortName()
                         protdesc = self.dict['Acc2Seq'][gi].info['Description']
                     except: protname = 'gi_UNK__%s' % gi
                 #x#print protname, protdesc, bud[protacc]
                 for pep in bud[protacc]['Peptides']:
                     data = {'search':rje.baseFile(mfile,True),'prot_desc':protdesc,'prot_acc':protname,
                             'pep_seq':pep,'prot_hit_num':bud[protacc]['prot_hit_num']}
                     rje.delimitedFileOutput(self,self.info['SumFile'],sumhead,datadict=data)
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
Example #9
 def run(self):  ### Main run method
     '''Main run method.'''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         counter = ['>>']    # List containing count times
         menulist = [('F','Change output file name','outfile','OutFile'),('X','Exit','return',''),('R','Run','return','')]
         mchoice = rje_menu.menu(self,'WormPump Menu',menulist,choicetext='Please select:',changecase=True,default='R')
         if mchoice == 'X': return
         self.printLog('#OUT','Output will be to %s' % self.info['OutFile'])
         self.printLog('#START','Initialising counter...')
         ### ~ [2] ~ Perform counts ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         wormid = None
         while counter[-1] != 'X':
             if wormid: counter.append(rje.choice('ID <ENTER> for new worm | X <ENTER> to exit | <ENTER> for "%s" pump count' % wormid,default='').upper())
             else: counter.append(rje.choice('ID <ENTER> for new worm | X <ENTER> to exit',default='').upper())
             if counter[-1]:
                 wormid = counter[-1]
                 if wormid == 'X': break
                 self.printLog('#WORM','Worm "%s"' % wormid)
             counter.append(time.time())
             self.deBug(counter)
         ### ~ [3] ~ Output results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         head = ['Worm','Count','WormTime','AbsTime']
         rje.delimitedFileOutput(self,self.info['OutFile'],headers=head,rje_backup=True)
         wormstart = 0.0
         wormid = None
         wtot = 0
         while counter:
             x = counter.pop(0)
             if x in ['>>','X']: continue
             if x:
                 wormid = x
                 wormstart = counter[0]
                 wx = 0
                 wtot += 1
             else:
                 if not wormid: continue
                 wx += 1
             t = counter.pop(0)
             tt = time.localtime(t)
             wdata = {'Worm':wormid,'Count':wx,'WormTime':t-wormstart,
                      #'AbsTime':'%s/%s/%s %s:%s:%s' % (tt[2],tt[1],tt[0],rje.preZero(tt[3],24),rje.preZero(tt[4],60),rje.preZero(tt[5],60))}
                      'AbsTime':'%s:%s:%s' % (rje.preZero(tt[3],24),rje.preZero(tt[4],60),rje.preZero(tt[5],60))}
             rje.delimitedFileOutput(self,self.info['OutFile'],headers=head,datadict=wdata)
         self.printLog('#OUT','Counts for %d worms output to %s' % (wtot,self.info['OutFile']))
         rje.choice('<ENTER> to exit')
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise   # Delete this if method error not terrible
Example #10
    def saveMutations(self):    ### Outputs parsed mutations into a delimited file
        '''Outputs parsed mutations into a delimited file.'''
        try:### ~ [1] Setup output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            headers = ['OMIM_ID','SubID','Gene','Pos','WildAA','MutAA','Disease']
            outfile = 'omim_mutations.tdt'
            rje.delimitedFileOutput(self,outfile,headers,'\t',rje_backup=True)

            ### ~ [2] Output mutations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for gene in rje.sortKeys(self.dict['Mutations']):
                for subid in rje.sortKeys(self.dict['Mutations'][gene]):
                    (disease,mutation) = self.dict['Mutations'][gene][subid]
                    (wild,pos,mut) = rje.matchExp('(\D\D\D)(\d+)(\D\D\D)',mutation)
                    datadict = {'OMIM_ID':string.join(self.dict['Records'][gene],'; '),'SubID':subid,'Gene':gene,
                                'Pos':pos,'WildAA':wild,'MutAA':mut,'Disease':disease}
                    rje.delimitedFileOutput(self,outfile,headers,'\t',datadict)
            self.log.printLog('#OUT','OMIM Mutation output to %s complete' % outfile)
        except: self.log.errorLog(rje_zen.Zen().wisdom())
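The (\D\D\D)(\d+)(\D\D\D) pattern used above splits an OMIM substitution string such as 'ARG123GLY' into wild-type residue, position and mutant residue. A minimal re-based sketch of the same parse, on the assumption that rje.matchExp behaves like a single re.search returning the matched groups (parse_mutation and the example string are illustrative only):

import re

def parse_mutation(mutation):
    '''Split a substitution such as 'ARG123GLY' into (wild, position, mutant).'''
    match = re.search(r'(\D\D\D)(\d+)(\D\D\D)', mutation)
    if not match: return None
    (wild, pos, mut) = match.groups()
    return (wild, int(pos), mut)

# parse_mutation('ARG123GLY') -> ('ARG', 123, 'GLY')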
Example #11
 def saveTimePoints(self,filename='',format='tdt',entries=[]):   ### Saves TimePoints to a file
     '''
     Saves TimePoints to a file from main TimePoints table.
     >> filename:str [''] = Output filename. Will use basefile if none given.
     >> format:str ['tdt'] = Output file format (csv/tdt/txt/db)
     >> entries:list [] = Entries from main table to output. (All if none given).
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         db = self.db('TimePoints')
         if format.lower() in ['','none']: format = string.split(filename.lower(),'.')[-1]
         if not filename: filename = '%s.%s' % (self.basefile(),format)
         if not entries: entries = db.entries()
         ### ~ [2] Save to file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [2a] Simple delimited file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if format in ['csv','tdt']: 
             self.blanksToEmpty()
             rje.delimitedFileOutput(self,filename,db.fields(),rje_backup=True)
             for entry in entries: rje.delimitedFileOutput(self,filename,db.fields(),datadict=entry)
         ## ~ [2b] Text file output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         else:
             self.emptyToBlank()
             rje.backup(self,filename)
             OUT = open(filename,'a')
             for entry in entries:
                 if format == 'db':
                     outlist = []
                     for field in db.fields(): outlist.append(entry[field])
                     out_txt = '%s' % outlist
                     OUT.write('(%s);\n' % out_txt[1:-1])
                 else:
                     # American Independence. (TimePoint) 1776 AD, 4 July. The US declared independence from the British Empire. Source: <http://en.wikipedia.org/wiki/United_States_Declaration_of_Independence>[Wikipedia]. (Keywords: history)
                     out_text = '%s. (TimePoint) ' % entry['TimePoint Name']
                     if entry['month'] in ['','blank']: out_text += '%s %s.' % (entry['Year'],entry['yearUnit'])
                     else: out_text += '%s %s, %s %s.' % (entry['Year'],entry['yearUnit'],entry['month'],entry['day'])
                     out_text = '%s %s Source: <%s>[%s].' % (out_text,entry['TimePoint Description'],entry['Source URL'],entry['Source URL'])
                     klist = []
                     for i in range(1,6):
                         if entry['keyword%d' % i] not in ['','blank']: klist.append(entry['keyword%d' % i])
                     out_text = '%s (Keywords: %s)' % (out_text,string.join(klist,', '))
                     OUT.write('%s\n' % out_text)
         self.printLog('#OUT','%d entries output to %s' % (len(entries),filename))
     except: self.errorLog('%s.saveTimePoints(%s) error' % (self,filename)); return False
Example #12
    def hmmTable(self,outfile='',append=False,delimit=None):    ### Outputs results table
        '''
        Outputs results table.
        >> outfile:str = Name of output file
        >> append:boolean = whether to append file
        >> delimit:str = Delimiter to use [\t]
        '''
        try:
            ### Setup ###
            if not outfile: outfile = self.info['HMMTab']
            if outfile.lower() == 'none':
                self.log.printLog('#TAB','HMMTab = "None": No table output')
                return False
            if not delimit: delimit = rje.getDelimit(self.cmd_list,'\t')
            if not outfile: outfile = '%s.hmmer.%s' % (rje.baseFile(self.info['SearchDB'],True),rje.delimitExt(delimit))
            self.readResults()
            self.log.printLog('#TAB','Tabulating results for %s searches into %s' % (len(self.search),outfile),log=False)

            ### Setup Resfile ###
            if self.opt['MySQL']: headers = ['HMM','Hit','Hit_Start','Hit_End','Eval','Score']
            else: headers = ['Type','Name','Start','End','Eval','Score']
            if not append or not os.path.exists(outfile): rje.delimitedFileOutput(self,outfile,headers,delimit,rje_backup=True)
            
            ### Output Search details ###
            for search in self.search:
                for hit in search.hit:
                    for aln in hit.aln:
                        out = {'HMM':search.info['Name'],'Type':search.info['Name'],
                               'Name':hit.info['Name'],'Hit':hit.info['Name'],
                               'Start':'%d' % aln.stat['SbjStart'], 'End':'%d' % aln.stat['SbjEnd'],
                               'Hit_Start':'%d' % aln.stat['SbjStart'], 'Hit_End':'%d' % aln.stat['SbjEnd'],
                               'Eval':'%.2e' % aln.stat['Expect'],'Score':'%.1f' % aln.stat['BitScore']}
                        rje.delimitedFileOutput(self,outfile,headers,delimit,out)
            self.log.printLog('#OUT','Results for %s searches output to %s.' % (len(self.search),outfile))
        except:
            self.log.errorLog('Fatal Error during hmmTable(%s).' % outfile)
            raise
Example #13
 def domainFasta(self):    ### Outputs parsed domain and domain PPI datasets in Fasta format
     '''Outputs parsed PPI datasets in Fasta format.'''
     try:
         ### ~ Tab delimited domain-HPRD pairs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         headers = ['Domain','HPRD','Gene']
         dfile = self.info['OutDir'] + 'HPRD.domains.tdt'
         rje.delimitedFileOutput(self,dfile,headers,'\t')
         sfile = self.info['OutDir'] + 'HPRD.domsource.tdt'
         shead = ['Domain','Source']
         rje.delimitedFileOutput(self,sfile,shead,'\t')
         dx = 0.0
         for domain in rje.sortKeys(self.dict['Domains']):
             self.log.printLog('\r#DOM','HPRD Domain output (%s): %.1f%%' % (dfile,dx/len(self.dict['Domains'])),newline=False,log=False)
             dx += 100.0
             for hid in self.dict['Domains'][domain]:
                 datadict = {'Domain':domain,'HPRD':hid,'Gene':self.dict['HPRD'][hid]['gene']}
                 rje.delimitedFileOutput(self,dfile,headers,'\t',datadict)
             for source in self.dict['DomainSource'][domain]:
                 datadict = {'Domain':domain,'Source':source}
                 rje.delimitedFileOutput(self,sfile,shead,'\t',datadict)
         self.log.printLog('\r#DOM','HPRD Domain output (%s): %s domains.' % (dfile,rje.integerString(len(self.dict['Domains']))))
                    
         ### ~ Domain PPI Dataset Outputs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         datpath = self.info['OutDir'] + rje.makePath('HPRD_Domain_Datasets/')
         rje.mkDir(self,datpath)
         for domain in rje.sortKeys(self.dict['Domains']):
             ## Generate a list of all interactors with domain-containing proteins ##
             plist = []
             for p1 in self.dict['Domains'][domain]:
                 if p1 not in self.dict['PPI']: continue
                 for p2 in self.dict['PPI'][p1]:
                     if p2 not in plist: plist.append(p2)
             plist.sort()
             ## Generate Sequence list and output ##
             mylist = []
             for p in plist:
                 if self.opt['AllIso']: mylist += self.dict['HPRD'][p]['Seq']
                 else: mylist.append(self.dict['HPRD'][p]['Seq'])
             sfile = '%s%s_hprd.fas' % (datpath,domain)
             if mylist: self.obj['SeqList'].saveFasta(seqs=mylist,seqfile=sfile)
             else: self.log.printLog('#DOM','No PPI partners for domain "%s"' % domain)
         self.log.printLog('\r#DOM','HPRD Domain fasta output complete.')
     except:
         self.log.errorLog('Error in HPRD.domainFasta()',printerror=True,quitchoice=False)
         raise
Example #14
 def scap(self):     ### Full SCAP method
     '''Full SCAP method.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         markov = self.obj['Markov']
         minx = markov.stat['MinXmer']
         maxx = markov.stat['MaxXmer']
         headers = ['seq','type','sorted']
         for x in range(minx,maxx+1): headers.append('X%d' % x)
         delimit = rje.getDelimit(self.cmd_list,'\t')
         scapfile = '%s.%s' % (self.info['Basefile'],rje.delimitExt(delimit))
         rje.delimitedFileOutput(self,scapfile,headers,delimit,rje_backup=True)
         ### ~ [2] SCAP ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [2a] Query ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         (sx,stot) = (0.0,self.obj['SeqList'].seqNum())
         for seq in self.obj['SeqList'].seq:
             self.progLog('\r#SCAP','SCAP processing Query to %s: %.2f%%' % (scapfile,(sx/stot))); sx += 100.0
             datadict = {'seq':seq.shortName(),'type':'qry','sorted':markov.opt['Sorted']}
             for x in range(minx,maxx+1): 
                 datadict['X%d' % x] = self.scapSeq(seq.info['Sequence'],x)
                 if datadict['X%d' % x] > 0.001: datadict['X%d' % x] = '%.4f' % datadict['X%d' % x]
                 else: datadict['X%d' % x] = '%.3e' % datadict['X%d' % x]
             rje.delimitedFileOutput(self,scapfile,headers,delimit,datadict)
         self.printLog('\r#SCAP','SCAP processed Query to %s for %s sequences.' % (scapfile,rje.integerString(stot)))
         ## ~ [2b] Background ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if self.obj['ScapBack'] != self.obj['SeqList']:
             (sx,stot) = (0.0,self.obj['ScapBack'].seqNum())
             for seq in self.obj['ScapBack'].seq:
                 self.progLog('\r#SCAP','SCAP processing Background to %s: %.2f%%' % (scapfile,(sx/stot))); sx += 100.0
                 datadict = {'seq':seq.shortName(),'type':'bg','sorted':markov.opt['Sorted']}
                 for x in range(minx,maxx+1):
                     datadict['X%d' % x] = self.scapSeq(seq.info['Sequence'],x)
                     if datadict['X%d' % x] > 0.001: datadict['X%d' % x] = '%.4f' % datadict['X%d' % x]
                     else: datadict['X%d' % x] = '%.3e' % datadict['X%d' % x]
                 rje.delimitedFileOutput(self,scapfile,headers,delimit,datadict)
             self.printLog('\r#SCAP','SCAP processed Background to %s for %s sequences.' % (scapfile,rje.integerString(stot)))
         if markov.opt['Sorted']: self.printLog('#SCAP','Sorted SCAP run complete')
         else: self.printLog('#SCAP','UnSorted SCAP run complete')
     except: self.errorLog(rje_zen.Zen().wisdom())
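Each X-mer score above is written as fixed-point when it is large enough to show four meaningful decimal places and as scientific notation otherwise. The same rule as a tiny stand-alone helper (format_score is hypothetical, not part of rje):

def format_score(value, threshold=0.001):
    '''Format a probability-like score: fixed point above threshold, scientific below.'''
    if value > threshold: return '%.4f' % value
    return '%.3e' % value

# format_score(0.0234) -> '0.0234'; format_score(0.00004) -> '4.000e-05'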
Example #15
    def mapSeq(self,seqlist,blast,search,outputmap=True): ### Performs actual mapping of sequence
        '''
        Performs actual mapping of sequence.
        >> seq:SeqList object containing Sequence Object to be mapped
        >> blast:BLAST_Run object to perform BLAST and GABLAM
        >> search:Current BLAST search object for mapping
        >> outputmap:boolean = Whether to output mapping into a file [True]
        << returns shortName() of mapped sequence (or None if none)
        '''
        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            seq = seqlist.getSeq(format='tuple')
            mapseq = self.obj['MapDB']
            hits = blast.db('Hit').indexEntries('Query',search)
            self.printLog('#HITS','%s vs %s = %d hits' % (search,blast.str['DBase'],len(hits)))
            hitseq = {}; hitdata = {}
            for entry in hits:
                hitseq[entry['Hit']] = mapseq.getDictSeq(entry['Hit'],format='tuple')
                hitdata[entry['Hit']] = entry
            resdict = {'Query':search,'Hit':None,'Method':'Failed','Query_Species':rje_sequence.specCodeFromName(seq[0])}
            ### ~ [1] Order Hits and Check Species ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (hits,hitdict) = self.orderHits(seq,hits,hitseq)
            self.debug(hits)
            self.debug(hitdict)
            ### ~ [2] Attempt mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for method in self.list['Mapping']:
                resdict['Hit'] = self.mapHit(seq,hits,hitdict,method.lower())
                if resdict['Hit']:
                    resdict['Method'] = method[:1].upper() + method[1:].lower()
                    break
                elif method == 'gablam' and (len(hits) > 0):
                    resdict['Method'] = 'Rejected'
            self.debug(resdict)
            ### ~[3] Output! ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if resdict['Hit']:  #hitdict[hit]['Data']['ShortName']
                hit = resdict['Hit']['Hit']     # resdict['Hit'] is the BLAST table entry for Hit
                shortname = hitdict[hit]['Data']['ShortName']   # This is just hit!
                self.printLog('#MAP','%s mapped to %s (by %s)' % (string.split(seq[0])[0],shortname,resdict['Method']))
                ## Update Stats ##
                self.debug('')
                resdict['BlastRank'] = hitdata[hit]['Rank']
                for key in hitdict[hit]: resdict[key] = hitdict[hit][key]
                ## Fasta and Redundancy ##
                if shortname in self.list['Mapped']: self.printLog('#MAP','%s already mapped before - not duplicating in %s' % (shortname,self.str['MapFas']))
                else:
                    self.list['Mapped'].append(shortname)
                    if outputmap:
                        open(self.str['MapFas'],'a').write('>%s\n%s\n' % (hitseq[hit][0],hitseq[hit][1]))
                resdict['Hit_Species'] = hitdict[hit]['Data']['SpecCode']
                resdict['Hit'] = shortname
            else:
                ## ~ [3a] GREP-based search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if 'grep' in self.list['Mapping']:
                    greplist = []; hitseq = ''
                    self.printLog('#GREP','grep %s %s -B 1' % (seq[1],blast.str['DBase']),log=False)
                    for line in os.popen('grep %s %s -B 1' % (seq[1],blast.str['DBase'])).readlines():
                        if line[:1] == '>': greplist.append(string.split(line[1:])[0])
                        elif not hitseq: hitseq = rje.chomp(line)
                    if greplist:
                        shortname = greplist.pop(0)
                        resdict['Hit'] = shortname
                        resdict['Method'] = 'Grep'
                        resdict['Qry_ID'] = '100.0'
                        resdict['Qry_Len'] = len(seq[1])
                        resdict['Hit_Len'] = len(hitseq)
                        resdict['Hit_ID'] = 100.0 * len(hitseq) / len(seq[1])
                        try: resdict['Hit_Species'] = string.split(shortname,'_')[1]
                        except: pass
                        if shortname in self.list['Mapped']:
                            self.printLog('#MAP','%s already mapped before - not duplicating in %s' % (shortname,self.str['MapFas']))
                        else:
                            self.list['Mapped'].append(shortname)
                            if outputmap: open(self.str['MapFas'],'a').write('>%s\n%s\n' % (shortname,hitseq))
                    for extra in greplist: self.printLog('#GREP','Warning! Query "%s" also hit "%s" with grep!' % (string.split(seq[0])[0],extra))
                if not resdict['Hit'] and self.bool['Combine']:
                    ## Fasta and Redundancy ##
                    shortname = string.split(seq[0])[0]
                    if shortname in self.list['Mapped']:
                        self.printLog('#FAS','%s already in output - not duplicating in %s' % (shortname,self.str['MapFas']))
                    else:
                        self.list['Mapped'].append(shortname)
                        if outputmap:
                            open(self.str['MapFas'],'a').write('>%s\n%s\n' % (seq[0],seq[1]))
                elif outputmap:
                    open(self.str['MissFas'],'a').write('>%s\n%s\n' % (seq[0],seq[1]))
                self.printLog('#MISS','%s mapping %s' % (resdict['Query'],resdict['Method']))
            if outputmap:
                rje.delimitedFileOutput(self,self.str['MapRes'],self.list['Headers'],rje.getDelimit(self.cmd_list),resdict)
            return resdict['Hit']

        except:
            self.errorLog('Fudgesticks! SeqMapper.mapSeq(%s) has died!' % seq[0],quitchoice=True)
            return False
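The grep fallback in mapSeq() looks for the exact query sequence in the mapping fasta file and recovers the header line printed immediately above it by '-B 1'. A minimal sketch of that lookup, assuming a fasta file with each sequence on a single line (grep_map is a hypothetical helper):

import os

def grep_map(sequence, fasfile):
    '''Return the short name of the first fasta entry whose sequence line contains sequence.'''
    names = []
    for line in os.popen('grep %s %s -B 1' % (sequence, fasfile)).readlines():
        if line[:1] == '>': names.append(line[1:].split()[0])   # header line preceding a matching sequence
    if names: return names[0]
    return None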
Example #16
 def saveTimePoints(self,
                    filename='',
                    format='tdt',
                    entries=[]):  ### Saves TimePoints to a file
     '''
     Saves TimePoints to a file from main TimePoints table.
     >> filename:str [''] = Output filename. Will use basefile if none given.
     >> format:str ['tdt'] = Output file format (csv/tdt/txt/db)
     >> entries:list [] = Entries from main table to output. (All if none given).
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         db = self.db('TimePoints')
         if format.lower() in ['', 'none']:
             format = string.split(filename.lower(), '.')[-1]
         if not filename: filename = '%s.%s' % (self.basefile(), format)
         if not entries: entries = db.entries()
         ### ~ [2] Save to file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [2a] Simple delimited file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if format in ['csv', 'tdt']:
             self.blanksToEmpty()
             rje.delimitedFileOutput(self,
                                     filename,
                                     db.fields(),
                                     rje_backup=True)
             for entry in entries:
                 rje.delimitedFileOutput(self,
                                         filename,
                                         db.fields(),
                                         datadict=entry)
         ## ~ [2b] Text file output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         else:
             self.emptyToBlank()
             rje.backup(self, filename)
             OUT = open(filename, 'a')
             for entry in entries:
                 if format == 'db':
                     outlist = []
                     for field in db.fields():
                         outlist.append(entry[field])
                     out_txt = '%s' % outlist
                     OUT.write('(%s);\n' % out_txt[1:-1])
                 else:
                     # American Independence. (TimePoint) 1776 AD, 4 July. The US declared independence from the British Empire. Source: <http://en.wikipedia.org/wiki/United_States_Declaration_of_Independence>[Wikipedia]. (Keywords: history)
                     out_text = '%s. (TimePoint) ' % entry['TimePoint Name']
                     if entry['month'] in ['', 'blank']:
                         out_text += '%s %s.' % (entry['Year'],
                                                 entry['yearUnit'])
                     else:
                         out_text += '%s %s, %s %s.' % (
                             entry['Year'], entry['yearUnit'],
                             entry['month'], entry['day'])
                     out_text = '%s %s Source: <%s>[%s].' % (
                         out_text, entry['TimePoint Description'],
                         entry['Source URL'], entry['Source URL'])
                     klist = []
                     for i in range(1, 6):
                         if entry['keyword%d' % i] not in ['', 'blank']:
                             klist.append(entry['keyword%d' % i])
                     out_text = '%s (Keywords: %s)' % (
                         out_text, string.join(klist, ', '))
                     OUT.write('%s\n' % out_text)
         self.printLog('#OUT',
                       '%d entries output to %s' % (len(entries), filename))
     except:
         self.errorLog('%s.saveTimePoints(%s) error' % (self, filename))
         return False
Example #17
    def clusterGoodSeq(self,searchset,data):   ### Clusters good sequences returned by search and updates data dictionary
        '''Clusters good sequences returned by search and updates data dictionary.'''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [1a] Extract Non-rejected sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            seqlist = rje_seq.SeqList(self.log,['gnspacc=T']+self.cmd_list+['autoload=F'])
            #self.deBug(rje.sortKeys(self.dict['Acc2Seq']))
            for prot in rje.sortKeys(data):
                if data[prot]['class'] != 'REJECT': seqlist.seq.append(self.dict['Acc2Seq'][data[prot]['accnum']])
            if not seqlist.seqNum():
                return self.printLog('#NULL','No %s sequences remain for clustering' % searchset)
            seqfile = '%s.%s.tmpdb' % (self.info['Basefile'],searchset)
            seqlist.saveFasta(seqfile=seqfile)
            seqdict = seqlist.seqNameDic()

            ### ~ [2] Perform BLAST and generate hit matrix ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            try:
                blast = rje_blast.blastObj(self.log,['blastf=T','blaste=1e-4']+self.cmd_list+['dna=F'],type='New')
                clusters = blast.blastClusters(seqfile,seqdict=seqdict,keepblast=False) 
            except:
                self.errorLog('Problem with new BLAST clustering')
                blast = rje_blast.blastObj(self.log,['blastf=T','blaste=1e-4']+self.cmd_list+['dna=F'],type='Old')
                blast.setInfo({'InFile':seqfile,'DBase':seqfile,'Name':'%s.tmp.blast' % self.info['Basefile'],'Type':'blastp'})
                blast.setStat({'OneLine':seqlist.seqNum(),'HitAln':0})
                blast.formatDB(fasfile=seqfile,force=True,protein=True)
                blast.blast(cleandb=False,use_existing=False,log=True)
                blast.readBLAST(gablam=False,unlink=True,log=True)
                rje_blast.cleanupDB(self,seqfile,deletesource=True)
                ## ~ [2a] Cluster by BLAST hits ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                cluster = {}    # Dictionary of {seq:hit seqs} for clustering
                for search in blast.search:
                    seq = seqdict[search.info['Name']]
                    cluster[seq] = []
                    for hit in search.hit: cluster[seq].append(seqdict[hit.info['Name']])
                #self.deBug(cluster)
                ## ~ [2b] Combine clusters ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                clusters = []   # List of [seqs] in clusters
                for seq in seqlist.seqs():
                    if seq not in cluster: continue
                    newcluster = [seq]
                    hits = cluster.pop(seq)
                    while hits:
                        hit = hits.pop(0)
                        if hit not in newcluster: newcluster.append(hit)
                        if hit in cluster: hits += cluster.pop(hit)
                    clusters.append(newcluster)
            self.printLog('#CLUSTER','%d clusters of %s protein hits' % (len(clusters),searchset))
            #self.deBug(clusters)

            ### ~ [3] Assign peptides to consensi as "Common", "Cluster" or "Unique" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [3a] Match peptides to sequence lists ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            pepcons = {}
            for seq in seqlist.seqs():
                prot = seq.shortName()  #.info['AccNum']
                for pep in data[prot]['conpep']:
                    if pep not in pepcons: pepcons[pep] = []
                    pepcons[pep].append(seq)
            self.dict['PepSeq'] = pepcons
            ## ~ [3b] Classify peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.dict['PepTypes'] = {'Common':[],'Cluster':[],'Unique':[]}
            for pep in pepcons:
                if len(pepcons[pep]) == 1: self.dict['PepTypes']['Unique'].append(pep); continue
                pepclus = []
                for seq in pepcons[pep]:
                    for cluster in clusters:
                        if seq in cluster and cluster not in pepclus: pepclus.append(cluster)
                if len(pepclus) == 1: self.dict['PepTypes']['Cluster'].append(pep)
                else: self.dict['PepTypes']['Common'].append(pep)
            ## ~ [3c] Summarise Peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.printLog('#PEP','%d different %s Peptide sequences' % (len(pepcons),searchset))
            for ptype in ['Common','Cluster','Unique']: self.dict['PepTypes'][ptype].sort()
            self.printLog('#UNIQ','%d Unique to one consensus' % (len(self.dict['PepTypes']['Unique'])))
            self.printLog('#CLUS','%d Restricted to one cluster' % (len(self.dict['PepTypes']['Cluster'])))
            self.printLog('#COMM','%d Common to multiple clusters' % (len(self.dict['PepTypes']['Common'])))

            ### ~ [4] Update dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            cx = 0
            for cluster in clusters:
                cx += 1
                for seq in cluster:
                    prot = seq.shortName()  #info['AccNum']
                    data[prot]['cluster'] = cx

            ### ~ [5] Peptide Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            peptdt = '%s.%s.peptides.tdt' % (self.info['Basefile'],searchset)
            pephead = ['Peptide','Classification','Hits']
            rje.delimitedFileOutput(self,peptdt,pephead,rje_backup=True)
            for ptype in ['Common','Cluster','Unique']:
                for pep in self.dict['PepTypes'][ptype]:
                    data = {'Peptide':pep,'Classification':ptype,'Hits':seqlist.accList(self.dict['PepSeq'][pep])}
                    data['Hits'].sort()
                    data['Hits'] = string.join(data['Hits'],'|')
                    rje.delimitedFileOutput(self,peptdt,pephead,datadict=data)
            self.printLog('#PEP','Peptide details output to %s' % peptdt)
        except: self.errorLog(rje_zen.Zen().wisdom())
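The fallback clustering in steps [2a]-[2b] above is single-linkage: each sequence seeds a cluster containing its BLAST hits, and clusters are merged whenever they share a member. The same merge loop as a compact stand-alone function over a {item: [hits]} dictionary (single_linkage is a hypothetical helper):

def single_linkage(hitmap):
    '''Group items into clusters, linking any two items that hit each other.'''
    hitmap = dict((item, list(hits)) for (item, hits) in hitmap.items())   # work on a copy
    clusters = []
    for item in list(hitmap.keys()):
        if item not in hitmap: continue           # already absorbed into an earlier cluster
        newcluster = [item]
        hits = hitmap.pop(item)
        while hits:
            hit = hits.pop(0)
            if hit not in newcluster: newcluster.append(hit)
            if hit in hitmap: hits += hitmap.pop(hit)   # pull in that hit's own hits too
        clusters.append(newcluster)
    return clusters

# single_linkage({'A':['B'],'B':['A','C'],'C':[],'D':[]}) groups A, B and C together and leaves D alone.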
Example #18
    def run(self):  ### Main Run Method
        '''Main Run Method.'''
        try:### ~ [1] Parse/Read Mutation data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if self.opt['Force'] or not self.loadMutations(): self.parseOMIM()

            ### ~ [2] Additional Pingu incorporation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            #!# Load PPI data using Pingu, map genes to sequences and check mutation residues #!#
            ## ~ [2a] Setup Pingu ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            import pingu
            pcmd = self.cmd_list + ['fulloutput=F']
            ping = self.obj['Pingu'] = pingu.PINGU(self.log,pcmd)
            ping.run()
            ## ~ [2b] Read in EnsLoci sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if not ping.obj['GeneCards']: return self.log.errorLog('Cannot map EnsLoci without GeneCards.', printerror=False)
            genecards = ping.obj['GeneCards'].dict['GeneCard']      # GeneCards dictionary
            ensloci = ping.getEnsLoci()     # EnsLoci SeqList object (ping.obj['EnsLoci'])
            seqdict = ensloci.seqNameDic()  
            if not seqdict: return self.log.errorLog('Failed to read in EnsLoci sequences.', printerror=False)
            ## ~ [2c] Calculate fudge factor for each gene ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.dict['Fudge'] = {}
            ensback = {}    # Dictionary of {EnsLoci name:OMIM gene}
            mutations = {}  # Reorganised dictionary of {gene:{pos:Mutation}}
            for gene in rje.sortKeys(self.dict['Mutations']):
                try: seq = seqdict[genecards[gene]['EnsLoci']]
                except:
                    self.log.printLog('#MAP','No EnsLoci protein mapped for %s' % gene)
                    continue
                mutations[gene] = {}
                ensback[genecards[gene]['EnsLoci']] = gene
                mutpos = {}     # Dictionary of {pos:AA} to map onto sequence
                for subid in rje.sortKeys(self.dict['Mutations'][gene]):                    
                    (disease,mutation) = self.dict['Mutations'][gene][subid]
                    (wild,pos,mut) = rje.matchExp('(\D\D\D)(\d+)(\D\D\D)',mutation)
                    mutpos[int(pos)] = rje_sequence.aa_3to1[wild.upper()]
                    mutations[gene][int(pos)] = self.dict['Mutations'][gene][subid]
                self.dict['Fudge'][seq] = seq.fudgeFactor(mutpos)
            self.deBug(self.dict['Fudge'])

            ### ~ [3] Cross-reference to SLiMFinder ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            allslims = {}   # Full dictionary of SLiMFinder results matching OMIM genes
            slimomim = []   # List of (gene,pos) overlapping with SLiMs
            outfile = 'rje_omim.slimfinder.tdt'
            dataheaders = string.split('Dataset,Rank,Pattern,Hit,Pos,EndPos,SeqLen,Variant,Match,AbsChg,NetChg,PepSeq,PepDesign',',')
            headers = ['Gene','OMIM','SubID','Mutation','Disease'] + dataheaders
            rje.delimitedFileOutput(self,outfile,headers,delimit='\t',rje_backup=True)
            for file in glob.glob(self.info['SlimDir'] + '*.occ.csv'):      # Potential SLiM
                slimdata = rje.dataDict(self,file,['Pattern','Hit','Pos','Match'],dataheaders,delimit=',')
                for occ in slimdata:
                    if slimdata[occ]['Hit'] in ensback:     # OMIM gene - possible overlap
                        gene = ensback[slimdata[occ]['Hit']]
                        (start,end) = (int(slimdata[occ]['Pos']),int(slimdata[occ]['EndPos']))
                        if gene not in allslims: allslims[gene] = {}
                        allslims[gene][occ] = slimdata[occ]
                        for mpos in mutations[gene]:
                            if start <= (mpos + self.dict['Fudge'][seqdict[genecards[gene]['EnsLoci']]]) <= end:
                                self.log.printLog('#OMIMSLIM','%s %s %s (%d-%d) = %s' % (slimdata[occ]['Dataset'],slimdata[occ]['Hit'],slimdata[occ]['Pattern'],start,end,mutations[gene][mpos]))
                                slimdata[occ]['Gene'] = gene
                                slimdata[occ]['OMIM'] = string.join(self.dict['Records'][gene])
                                slimdata[occ]['Mutation'] = mutations[gene][mpos][1]
                                slimdata[occ]['Disease'] = mutations[gene][mpos][0]
                                rje.delimitedFileOutput(self,outfile,headers,'\t',slimdata[occ])
                                if (gene,mpos) not in slimomim: slimomim.append((gene,mpos))
            
            ### ~ [4] Calculate coverage of SLiMs for "significance" assessment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (inslim,resx,mutx) = (0,0,0)  # No. of residues in SLiMs, total residue count + no. mutations that may overlap
            for gene in mutations:      # These are just the genes that mapped to sequences
                mutx += len(mutations[gene])
                resx += seqdict[genecards[gene]['EnsLoci']].aaLen()
                if gene in allslims:    # Partially covered by SLiMs
                    res = [0] * seqdict[genecards[gene]['EnsLoci']].aaLen()
                    for occ in allslims[gene]:
                        (start,end) = (int(allslims[gene][occ]['Pos'])-1,int(allslims[gene][occ]['EndPos']))
                        res = res[:start] + [1] * (end-start) + res[end:]   # Mark residues Pos..EndPos (1-based) as SLiM-covered
                    self.deBug('%s %d (%d)' % (gene,sum(res),seqdict[genecards[gene]['EnsLoci']].aaLen()))
                    inslim += sum(res)
            self.log.printLog('#COV','SLiMs have %.1f%% coverage of OMIM gene sequences' % (100.0*inslim/resx))
            self.log.printLog('#MUT','%d mutations that could potentially occur in SLiMs' % mutx)
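            # Crude "significance" sketch: each of the mutx mutations is treated as an independent trial with
            # success probability inslim/resx (the fraction of OMIM protein residues covered by SLiMs);
            # rje.binomial is assumed to return the probability of observing at least len(slimomim) successes.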
            self.log.printLog('#PROB','Probability of observed %d mutation overlap = %.4f' % (len(slimomim),rje.binomial(len(slimomim),mutx,float(inslim)/resx,callobj=self)))
        except: self.log.errorLog(rje_zen.Zen().wisdom())
Example #19
0
    def run(self):  ### Main Run Method
        '''Main Run Method.'''
        try:  ### ~ [1] Parse/Read Mutation data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if self.opt['Force'] or not self.loadMutations(): self.parseOMIM()

            ### ~ [2] Additional Pingu incorporation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            #!# Load PPI data using Pingu, map genes to sequences and check mutation residues #!#
            ## ~ [2a] Setup Pingu ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            import pingu
            pcmd = self.cmd_list + ['fulloutput=F']
            ping = self.obj['Pingu'] = pingu.PINGU(self.log, pcmd)
            ping.run()
            ## ~ [2b] Read in EnsLoci sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if not ping.obj['GeneCards']:
                return self.log.errorLog(
                    'Cannot map EnsLoci without GeneCards.', printerror=False)
            genecards = ping.obj['GeneCards'].dict[
                'GeneCard']  # GeneCards dictionary
            ensloci = ping.getEnsLoci(
            )  # EnsLoci SeqList object (ping.obj['EnsLoci'])
            seqdict = ensloci.seqNameDic()
            if not seqdict:
                return self.log.errorLog(
                    'Failed to read in EnsLoci sequences.', printerror=False)
            ## ~ [2c] Calculate fudge factor for each gene ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.dict['Fudge'] = {}
            ensback = {}  # Dictionary of {EnsLoci name:OMIM gene}
            mutations = {}  # Reorganised dictionary of {gene:{pos:Mutation}}
            for gene in rje.sortKeys(self.dict['Mutations']):
                try:
                    seq = seqdict[genecards[gene]['EnsLoci']]
                except:
                    self.log.printLog(
                        '#MAP', 'No EnsLoci protein mapped for %s' % gene)
                    continue
                mutations[gene] = {}
                ensback[genecards[gene]['EnsLoci']] = gene
                mutpos = {}  # Dictionary of {pos:AA} to map onto sequence
                for subid in rje.sortKeys(self.dict['Mutations'][gene]):
                    (disease, mutation) = self.dict['Mutations'][gene][subid]
                    (wild, pos, mut) = rje.matchExp('(\D\D\D)(\d+)(\D\D\D)',
                                                    mutation)
                    mutpos[int(pos)] = rje_sequence.aa_3to1[wild.upper()]
                    mutations[gene][int(
                        pos)] = self.dict['Mutations'][gene][subid]
                self.dict['Fudge'][seq] = seq.fudgeFactor(mutpos)
            self.deBug(self.dict['Fudge'])

            ### ~ [3] Cross-reference to SLiMFinder ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            allslims = {
            }  # Full dictionary of SLiMFinder results matching OMIM genes
            slimomim = []  # List of (gene,pos) overlapping with SLiMs
            outfile = 'rje_omim.slimfinder.tdt'
            dataheaders = string.split(
                'Dataset,Rank,Pattern,Hit,Pos,EndPos,SeqLen,Variant,Match,AbsChg,NetChg,PepSeq,PepDesign',
                ',')
            headers = ['Gene', 'OMIM', 'SubID', 'Mutation', 'Disease'
                       ] + dataheaders
            rje.delimitedFileOutput(self,
                                    outfile,
                                    headers,
                                    delimit='\t',
                                    rje_backup=True)
            for file in glob.glob(self.info['SlimDir'] +
                                  '*.occ.csv'):  # Potential SLiM
                slimdata = rje.dataDict(self,
                                        file,
                                        ['Pattern', 'Hit', 'Pos', 'Match'],
                                        dataheaders,
                                        delimit=',')
                for occ in slimdata:
                    if slimdata[occ][
                            'Hit'] in ensback:  # OMIM gene - possible overlap
                        gene = ensback[slimdata[occ]['Hit']]
                        (start, end) = (int(slimdata[occ]['Pos']),
                                        int(slimdata[occ]['EndPos']))
                        if gene not in allslims: allslims[gene] = {}
                        allslims[gene][occ] = slimdata[occ]
                        for mpos in mutations[gene]:
                            if start <= (mpos + self.dict['Fudge'][seqdict[
                                    genecards[gene]['EnsLoci']]]) <= end:
                                self.log.printLog(
                                    '#OMIMSLIM', '%s %s %s (%d-%d) = %s' %
                                    (slimdata[occ]['Dataset'],
                                     slimdata[occ]['Hit'],
                                     slimdata[occ]['Pattern'], start, end,
                                     mutations[gene][mpos]))
                                slimdata[occ]['Gene'] = gene
                                slimdata[occ]['OMIM'] = string.join(
                                    self.dict['Records'][gene])
                                slimdata[occ]['Mutation'] = mutations[gene][
                                    mpos][1]
                                slimdata[occ]['Disease'] = mutations[gene][
                                    mpos][0]
                                rje.delimitedFileOutput(
                                    self, outfile, headers, '\t',
                                    slimdata[occ])
                                if (gene, mpos) not in slimomim:
                                    slimomim.append((gene, mpos))

            ### ~ [4] Calculate coverage of SLiMs for "significance" assessment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (inslim, resx, mutx) = (
                0, 0, 0
            )  # No. of residues in SLiMs, total residue count + no. mutations that may overlap
            for gene in mutations:  # These are just the genes that mapped to sequences
                mutx += len(mutations[gene])
                resx += seqdict[genecards[gene]['EnsLoci']].aaLen()
                if gene in allslims:  # Partially covered by SLiMs
                    res = [0] * seqdict[genecards[gene]['EnsLoci']].aaLen()
                    for occ in allslims[gene]:
                        (start, end) = (int(allslims[gene][occ]['Pos']) - 1,
                                        int(allslims[gene][occ]['EndPos']))
                        res = res[:start] + [1] * (end - start) + res[end:]
                    self.deBug('%s %d (%d)' %
                               (gene, sum(res),
                                seqdict[genecards[gene]['EnsLoci']].aaLen()))
                    inslim += sum(res)
            self.log.printLog(
                '#COV', 'SLiMs have %.1f%% coverage of OMIM gene sequences' %
                (100.0 * inslim / resx))
            self.log.printLog(
                '#MUT',
                '%d mutations that could potentially occur in SLiMs' % mutx)
            self.log.printLog(
                '#PROB', 'Probability of observed %d mutation overlap = %.4f' %
                (len(slimomim),
                 rje.binomial(
                     len(slimomim), mutx, float(inslim) / resx, callobj=self)))
        except:
            self.log.errorLog(rje_zen.Zen().wisdom())
Example #20
0
    def domainFasta(
        self
    ):  ### Outputs parsed domain and domain PPI datasets in Fasta format
        '''Outputs parsed domain data and domain PPI datasets in Fasta format.'''
        try:
            ### ~ Tab delimited domain-HPRD pairs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            headers = ['Domain', 'HPRD', 'Gene']
            dfile = self.info['OutDir'] + 'HPRD.domains.tdt'
            rje.delimitedFileOutput(self, dfile, headers, '\t')
            sfile = self.info['OutDir'] + 'HPRD.domsource.tdt'
            shead = ['Domain', 'Source']
            rje.delimitedFileOutput(self, sfile, shead, '\t')
            dx = 0.0
            for domain in rje.sortKeys(self.dict['Domains']):
                self.log.printLog('\r#DOM',
                                  'HPRD Domain output (%s): %.1f%%' %
                                  (dfile, dx / len(self.dict['Domains'])),
                                  newline=False,
                                  log=False)
                dx += 100.0
                for hid in self.dict['Domains'][domain]:
                    datadict = {
                        'Domain': domain,
                        'HPRD': hid,
                        'Gene': self.dict['HPRD'][hid]['gene']
                    }
                    rje.delimitedFileOutput(self, dfile, headers, '\t',
                                            datadict)
                for source in self.dict['DomainSource'][domain]:
                    datadict = {'Domain': domain, 'Source': source}
                    rje.delimitedFileOutput(self, sfile, shead, '\t', datadict)
            self.log.printLog(
                '\r#DOM', 'HPRD Domain output (%s): %s domains.' %
                (dfile, rje.integerString(len(self.dict['Domains']))))

            ### ~ Domain PPI Dataset Outputs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
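            # For each domain, a fasta dataset is built containing every HPRD protein that interacts with at
            # least one protein annotated with that domain (one *_hprd.fas file per domain in HPRD_Domain_Datasets/).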
            datpath = self.info['OutDir'] + rje.makePath(
                'HPRD_Domain_Datasets/')
            rje.mkDir(self, datpath)
            for domain in rje.sortKeys(self.dict['Domains']):
                ## Generate a list of all interactors with domain-containing proteins ##
                plist = []
                for p1 in self.dict['Domains'][domain]:
                    if p1 not in self.dict['PPI']: continue
                    for p2 in self.dict['PPI'][p1]:
                        if p2 not in plist: plist.append(p2)
                plist.sort()
                ## Generate Sequence list and output ##
                mylist = []
                for p in plist:
                    if self.opt['AllIso']:
                        mylist += self.dict['HPRD'][p]['Seq']
                    else:
                        mylist.append(self.dict['HPRD'][p]['Seq'])
                sfile = '%s%s_hprd.fas' % (datpath, domain)
                if mylist:
                    self.obj['SeqList'].saveFasta(seqs=mylist, seqfile=sfile)
                else:
                    self.log.printLog(
                        '#DOM', 'No PPI partners for domain "%s"' % domain)
            self.log.printLog('\r#DOM', 'HPRD Domain fasta output complete.')
        except:
            self.log.errorLog('Error in HPRD.saveFasta()',
                              printerror=True,
                              quitchoice=False)
            raise
Example #21
0
 def rfAtt(self):  ### Tabulates observed vs expected amino acid & dipeptide frequencies per reading frame
     '''
     Tabulates observed and expected amino acid and dipeptide frequencies for each of the six reading-frame
     translations of the loaded DNA sequences, then outputs observed/expected ratios to a *.rf.tdt table.
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         rfhead = [
             'Att', 'RF1', 'RF2', 'RF3', 'RF-1', 'RF-2', 'RF-3', 'ObsRF1',
             'ObsRF2', 'ObsRF3', 'ObsRF-1', 'ObsRF-2', 'ObsRF-3', 'ExpRF1',
             'ExpRF2', 'ExpRF3', 'ExpRF-1', 'ExpRF-2', 'ExpRF-3'
         ]
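         # rfdata/rfobs/rfexp hold, for each of the six reading frames, a count for every single amino acid
         # (plus '*' stop) and every dipeptide; ntfreq holds raw nucleotide counts for the expected-value model.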
         rfdata = {}
         rfobs = {}
         rfexp = {}
         ntfreq = {}
         for rf in ['RF1', 'RF2', 'RF3', 'RF-1', 'RF-2', 'RF-3']:
             rfdata[rf] = {}
             rfobs[rf] = {}
             rfexp[rf] = {}
             for x in rje_seq.alph_protx[:-1] + ['*']:
                 rfdata[rf][x] = 0
                 rfobs[rf][x] = 0
                 rfexp[rf][x] = 0
             for a1 in rje_seq.alph_protx[:-1] + ['*']:
                 for a2 in rje_seq.alph_protx[:-1] + ['*']:
                     rfdata[rf]['%s%s' % (a1, a2)] = 0
                     rfobs[rf]['%s%s' % (a1, a2)] = 0
                     rfexp[rf]['%s%s' % (a1, a2)] = 0
         for x in rje_seq.alph_dna[:-1]:
             ntfreq[x] = 0
         seqlist = self.obj['SeqList']
         ### ~ [2] Count sequence attributes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         (sx, stot) = (0.0, seqlist.seqNum())
         for seq in seqlist.seq:
             self.progLog(
                 '\r#ATT',
                 'Counting sequence attributes: %.2f%%' % (sx / stot))
             sx += 100.0
             for x in seq.info['Sequence']:
                 if x in ntfreq: ntfreq[x] += 1
             rf6 = rje_sequence.sixFrameTranslation(seq.info['Sequence'])
             for r in rf6:
                 rseq = rf6[r]
                 rf = 'RF%d' % r
                 for i in range(len(rseq)):
                     a = rseq[i]
                     dia = rseq[i:i + 2]
                     if a in rfdata[rf]: rfdata[rf][a] += 1
                     if len(dia) == 2 and dia in rfdata[rf]: rfdata[rf][dia] += 1   # Avoid recounting the final single residue as a dipeptide
         self.printLog('\r#ATT', 'Counting sequence attributes complete.')
         ### ~ [3] Calculate Observed & Expected ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ntobs = rje.dictFreq(ntfreq, total=True, newdict=True)
         ntcomp = {'Total': ntobs['Total']}
         for xy in ['AT', 'GC']:
             ntcomp[xy[0]] = ntobs[xy[1]]
             ntcomp[xy[1]] = ntobs[xy[0]]
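         # Expected amino acid counts are estimated from codon probabilities under the observed nucleotide
         # frequencies (complemented via ntcomp for the reverse-strand frames); expected dipeptide counts are
         # then estimated from the resulting amino acid frequencies, assuming adjacent residues are independent.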
         for rf in ['RF1', 'RF2', 'RF3', 'RF-1', 'RF-2', 'RF-3']:
             aafreq = {}
             for a in rje_seq.alph_protx[:-1] + ['*']:
                 aafreq[a] = rfdata[rf][a]
             aafreq = rje.dictFreq(aafreq, total=True, newdict=True)
             for a in rje_seq.alph_protx[:-1] + ['*']:
                 rfobs[rf][a] = rfdata[rf][a]
                 rfexp[rf][a] = 0
             for n1 in 'GATC':
                 for n2 in 'GATC':
                     for n3 in 'GATC':
                         codon = '%s%s%s' % (n1, n2, n3)
                         aa = rje_sequence.dna2prot(codon)
                         if rf[-2] == '-':
                             rfexp[rf][aa] += (int(ntobs['Total'] / 3.0) *
                                               ntcomp[n1] * ntcomp[n2] *
                                               ntcomp[n3])
                         else:
                             rfexp[rf][aa] += (int(ntobs['Total'] / 3.0) *
                                               ntobs[n1] * ntobs[n2] *
                                               ntobs[n3])
                         #self.deBug('%s: %s x %s x %s x %s' % (aa,(ntobs['Total'] - 2), rfobs[rf][n1], rfobs[rf][n2], rfobs[rf][n3]))
                         #self.deBug('%s: %s' % (aa,rfexp[rf][aa]))
             for a1 in rje_seq.alph_protx[:-1] + ['*']:
                 for a2 in rje_seq.alph_protx[:-1] + ['*']:
                     rfexp[rf]['%s%s' %
                               (a1, a2)] = (aafreq['Total'] -
                                            1) * aafreq[a1] * aafreq[a2]
                     rfobs[rf]['%s%s' % (a1, a2)] = rfdata[rf]['%s%s' %
                                                               (a1, a2)]
         ### ~ [4] Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         rfile = rje.baseFile(seqlist.info['Name']) + '.rf.tdt'
         rje.delimitedFileOutput(self, rfile, rfhead, rje_backup=True)
         for a in rje_seq.alph_protx[:-1] + ['*']:
             data = {'Att': a}
             for rf in ['RF1', 'RF2', 'RF3', 'RF-1', 'RF-2', 'RF-3']:
                 data['Obs%s' % rf] = rfobs[rf][a]
                 data['Exp%s' % rf] = '%.2f' % rfexp[rf][a]
                 data[rf] = rje.expectString(rfobs[rf][a] / rfexp[rf][a])
             rje.delimitedFileOutput(self, rfile, rfhead, datadict=data)
         for a1 in rje_seq.alph_protx[:-1] + ['*']:
             for a2 in rje_seq.alph_protx[:-1] + ['*']:
                 a = '%s%s' % (a1, a2)
                 data = {'Att': a}
                 for rf in ['RF1', 'RF2', 'RF3', 'RF-1', 'RF-2', 'RF-3']:
                     data['Obs%s' % rf] = rfobs[rf][a]
                     data['Exp%s' % rf] = '%.2f' % rfexp[rf][a]
                     data[rf] = rje.expectString(rfobs[rf][a] /
                                                 rfexp[rf][a])
                 rje.delimitedFileOutput(self, rfile, rfhead, datadict=data)
         self.printLog('#TDT', 'TDT output complete.')
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise  # Delete this if method error not terrible
Example #22
0
 def run(self):  ### Main run method
     '''Main run method.'''
     try:  ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         counter = ['>>']  # List containing count times
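         # Apart from the initial '>>' sentinel and the final 'X', counter entries alternate between a worm ID
         # ('' repeats the current worm) and the time.time() stamp of that count; section [3] unpacks these pairs.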
         menulist = [('F', 'Change output file name', 'outfile', 'OutFile'),
                     ('X', 'Exit', 'return', ''),
                     ('R', 'Run', 'return', '')]
         mchoice = rje_menu.menu(self,
                                 'WormPump Menu',
                                 menulist,
                                 choicetext='Please select:',
                                 changecase=True,
                                 default='R')
         if mchoice == 'X': return
         self.printLog('#OUT',
                       'Output will be to %s' % self.info['OutFile'])
         self.printLog('#START', 'Initialising counter...')
         ### ~ [2] ~ Perform counts ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         wormid = None
         while counter[-1] != 'X':
             if wormid:
                 counter.append(
                     rje.choice(
                         'ID <ENTER> for new worm | X <ENTER> to exit | <ENTER> for "%s" pump count'
                         % wormid,
                         default='').upper())
             else:
                 counter.append(
                     rje.choice(
                         'ID <ENTER> for new worm | X <ENTER> to exit',
                         default='').upper())
             if counter[-1]:
                 wormid = counter[-1]
                 if wormid == 'X': break
                 self.printLog('#WORM', 'Worm "%s"' % wormid)
             counter.append(time.time())
             self.deBug(counter)
         ### ~ [3] ~ Output results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         head = ['Worm', 'Count', 'WormTime', 'AbsTime']
         rje.delimitedFileOutput(self,
                                 self.info['OutFile'],
                                 headers=head,
                                 rje_backup=True)
         wormstart = 0.0
         wormid = None
         wtot = 0
         while counter:
             x = counter.pop(0)
             if x in ['>>', 'X']: continue
             if x:
                 wormid = x
                 wormstart = counter[0]
                 wx = 0
                 wtot += 1
             else:
                 if not wormid: continue
                 wx += 1
             t = counter.pop(0)
             tt = time.localtime(t)
             wdata = {
                 'Worm':
                 wormid,
                 'Count':
                 wx,
                 'WormTime':
                 t - wormstart,
                 #'AbsTime':'%s/%s/%s %s:%s:%s' % (tt[2],tt[1],tt[0],rje.preZero(tt[3],24),rje.preZero(tt[4],60),rje.preZero(tt[5],60))}
                 'AbsTime':
                 '%s:%s:%s' % (rje.preZero(tt[3], 24), rje.preZero(
                     tt[4], 60), rje.preZero(tt[5], 60))
             }
             rje.delimitedFileOutput(self,
                                     self.info['OutFile'],
                                     headers=head,
                                     datadict=wdata)
         self.printLog(
             '#OUT', 'Counts for %d worms output to %s' %
             (wtot, self.info['OutFile']))
         rje.choice('<ENTER> to exit')
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise  # Delete this if method error not terrible
Example #23
0
    def run(self):  ### Main run method
        '''Main run method.'''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            mygo = rje_go.GO(self.log,self.cmd_list)
            mygo.readGO()
            gomap = rje.dataDict(self,self.info['GOMap'],mainkeys=['Ensembl Gene ID'],datakeys=['GO ID'],lists=True)
            self.deBug(rje.sortKeys(gomap)[:100])
            #!# Replace 'Ensembl Gene ID' with commandline parameter at some point #!#
            self.printLog('#GOMAP','Loaded GO mappings for %s sequence IDs' % (rje.integerString(len(gomap))))
            slimocc = rje.dataDict(self,self.info['OccData'],mainkeys=['Motif','Seq','Start_Pos','End_Pos'],datakeys=['Motif','Seq','Start_Pos','End_Pos','Cons','HomNum'])
            self.printLog('#OCC','Loaded Data for %s motif occurrences.' % (rje.integerString(len(slimocc))))
            ## ~ [1a] ~ Sequence mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            seqlist = rje_seq.SeqList(self.log,['accnr=F','seqnr=F']+self.cmd_list)
            seqmap = {}
            (sx,stot) = (0.0,seqlist.seqNum())
            for seq in seqlist.seq:
                self.progLog('#SEQMAP','Mapping sequence IDs: %.1f%%' % (sx/stot)); sx += 100.0
                if rje.matchExp('gene:(\S+)\]',seq.info['Name']): seqmap[seq.shortName()] = rje.matchExp('gene:(\S+)\]',seq.info['Name'])[0]
            self.printLog('\r#SEQMAP','Mapping of %s sequence IDs complete: %s mapped' % (rje.integerString(stot),rje.integerString(len(seqmap))))
            self.deBug(rje.sortKeys(seqmap)[:100])

            ### ~ [2] ~ Output new data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            goocc = {}
            outfile = string.join(string.split(self.info['OccData'],'.')[:-1] + ['slimfungo','tdt'],'.')
            headers = ['GO','Motif','Type','Seq','Start_Pos','End_Pos','Cons','HomNum']
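            # goocc is rebuilt as {GO ID:{Motif:{'fwd'/'rev'/'scram':[occurrence dicts]}}}; a motif is only
            # output for a GO term if its fwd occurrences and at least one rev/scram set both pass MinOcc.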
            for okey in slimocc.keys():
                self.progLog('#NEW','Making new GO occurrences: %s    ' % (rje.integerString(len(slimocc))))
                data = slimocc.pop(okey)
                gene = seq = data['Seq']
                type = 'fwd'
                if string.split(data['Motif'],'_')[-1] in ['rev','scram']:
                    type = string.split(data['Motif'],'_')[-1]
                    data['Motif'] = string.join(string.split(data['Motif'],'_')[:-1],'_')
                motif = data['Motif']      # Motif name with any rev/scram suffix stripped
                if gene not in gomap and gene in seqmap: gene = seqmap[gene]
                golist = []
                if gene in gomap:
                    for id in gomap[gene]: golist += mygo.parents(id)
                else: golist = ['NoGo']
                self.deBug('%s:%s::%s' % (seq,gene,golist))
                for id in rje.sortUnique(golist,False,False):
                    if id not in goocc: goocc[id] = {}
                    if motif not in goocc[id]: goocc[id][motif] = {'fwd':[],'rev':[],'scram':[]}
                    goocc[id][motif][type].append(rje.combineDict({'GO':id,'Type':type},data))
            self.printLog('\r#NEW','Making new GO occurrences complete.    ')

            rje.delimitedFileOutput(self,outfile,headers,rje_backup=True)
            (mx,ox,ix,itot) = (0,0,0.0,len(goocc))
            for id in rje.sortKeys(goocc):
                for motif in rje.sortKeys(goocc[id]):
                    for type in rje.sortKeys(goocc[id][motif]):
                        if len(goocc[id][motif][type]) < self.stat['MinOcc']: goocc[id][motif].pop(type)
                    if len(goocc[id][motif]) < 2 or 'fwd' not in goocc[id][motif]: continue
                    mx += 1
                    for type in goocc[id][motif]:
                        for occ in goocc[id][motif][type]: rje.delimitedFileOutput(self,outfile,headers,datadict=occ); ox += 1
                ix += 100.0; self.progLog('#OUT','Output to %s: %.2f%% :: %s motifs; %s occ.' % (outfile,ix/itot,rje.integerString(mx),rje.integerString(ox)))
            self.printLog('\r#OUT','Output of occurrences to %s is now complete: %s motifs; %s occ.' % (outfile,rje.integerString(mx),rje.integerString(ox)))

        except:
            self.log.errorLog(rje_zen.Zen().wisdom())
            raise   # Delete this if method error not terrible
Example #24
0
class PhosphoSeq(rje.RJE_Object):
    '''
    PhosphoSeq Class. Author: Rich Edwards (2007).

    Info:str
    - PELM = Filename for phosphoELM download [None]
    - PELMFas = Filename for fasta file output of pELM sequences [pelm.fas]
    - PhosBlast = Fasta file of sequences to perform phosBLAST method against pELM [None]
    - PhosRes = Delimited text file containing input sequence, position and evidence [*.phosres.tdt]

    Opt:boolean
    - FilterSeq = Apply rje_seq sequence filters to phosphoELM data [False]
    - UseSpec = Use species codes for determining same species for ID matches [True]
    - PhosDat = Whether to produce a modified UniProt-format file with potential phosphoSites as features [False]

    Stat:numeric
    - IDSim = Percentage identity (GABLAM; phosblast qry) for marking as identity [95.0]
    - HomSim = Percentage identity (GABLAM; phosblast qry) for marking as homologue [40.0]

    List:list

    Dict:dictionary
    - PhosphoSites = Dictionary of {Seq:{Pos:details}}

    Obj:RJE_Objects
    - SeqList = rje_seq.SeqList() object for storing sequences
    - UniProt = rje_uniprot.UniProt() object for storing UniProt data
    '''

    #########################################################################################################################
    ### <1> ### Class Initiation etc.: sets attributes                                                                  #
    #########################################################################################################################
    def _setAttributes(self):  ### Sets Attributes of Object
        '''Sets Attributes of Object.'''
        ### Basics ###
        self.infolist = ['PELM', 'PELMFas', 'PhosBlast', 'PhosRes']
        self.optlist = ['FilterSeq', 'UseSpec', 'PhosDat']
        self.statlist = ['IDSim', 'HomSim']
        self.listlist = []
        self.dictlist = ['PhosphoSites']
        self.objlist = ['SeqList', 'UniProt']
        ### Defaults ###
        self._setDefaults(info='None',
                          opt=False,
                          stat=0.0,
                          obj=None,
                          setlist=True,
                          setdict=True)
        self.setInfo({'PELMFas': 'pelm.fas'})
        self.setStat({'IDSim': 95.0, 'HomSim': 40.0})
        self.setOpt({'UseSpec': True})
#########################################################################################################################

    def _cmdList(self):  ### Sets Attributes from commandline
        '''
        Sets attributes according to commandline parameters:
        - see .__doc__ or run with 'help' option
        '''
        for cmd in self.cmd_list:
            try:
                self._generalCmd(cmd)  ### General Options ###
                ### Class Options ###
                self._cmdReadList(cmd, 'file',
                                  ['PELM', 'PELMFas', 'PhosBlast', 'PhosRes'])
                self._cmdReadList(cmd, 'opt',
                                  ['FilterSeq', 'UseSpec', 'PhosDat'])
                self._cmdReadList(cmd, 'stat', ['IDSim', 'HomSim'])
            except:
                self.log.errorLog('Problem with cmd:%s' % cmd)
#########################################################################################################################
### <3> ### Main Run Methods                                                                                        #
#########################################################################################################################

    def run(self):  ### Main method for standalone functionality
        '''Main method for standalone functionality.'''
        self.readPELM()
        if self.info['PhosBlast'].lower() not in ['', 'none']:
            self.mapPhosByBLAST(self.info['PhosBlast'])
#########################################################################################################################

    def readPELM(
        self
    ):  ### Reads phosphoELM into classes. Extracts UniProt data if available for Species etc.
        '''Reads phosphoELM into classes. Extracts UniProt data if available for Species etc.'''
        try:  ### ~ [1] Setup & Read File into Data Dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            data = rje.dataDict(self,
                                self.info['PELM'],
                                mainkeys=['acc', 'position'])
            seqdict = {}  # Dictionary of Acc:Sequence

            ### ~ [2] Generate PhosphoSites dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            pdict = self.dict['PhosphoSites']
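            # pdict is keyed as {acc:{position:{'aa':code}}}; seqdict caches the full sequence reported for
            # each accession so that mismatches between entries (and between site code and sequence) can be flagged.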
            for dkey in data:
                ## ~ [2a] Basic acc, seq and pos ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                (acc, pos) = string.split(dkey)
                pos = string.atoi(pos)
                if acc not in pdict: pdict[acc] = {}
                if pos not in pdict[acc]: pdict[acc][pos] = {}
                ## ~ [2b] PhosphoELM data with checks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if acc not in seqdict: seqdict[acc] = data[dkey]['sequence']
                elif seqdict[acc] != data[dkey]['sequence']:
                    self.log.printLog(
                        '#ERR', 'Warning. Sequence mismatch for %s' % acc)
                if 'aa' not in pdict[acc][pos]:
                    pdict[acc][pos]['aa'] = data[dkey]['code']
                elif pdict[acc][pos]['aa'] != data[dkey]['code']:
                    self.log.printLog(
                        '#ERR',
                        'Warning. PhosphoSite mismatch for %s at pos %d: %s not %s'
                        %
                        (acc, pos, data[dkey]['code'], pdict[acc][pos]['aa']))
                if data[dkey]['code'] != seqdict[acc][(pos - 1):pos]:
                    self.log.printLog(
                        '#ERR',
                        'Warning. PhosphoSeq mismatch for %s at pos %d: %s not %s'
                        % (acc, pos, data[dkey]['code'],
                           seqdict[acc][pos - 1:pos]))

            ### ~ [3] Make sequence objects and update PhosphoSites keys ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [3a] Setup objects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            acclist = rje.sortKeys(seqdict)
            pelmuni = rje_uniprot.UniProt(self.log,
                                          self.cmd_list)  # UniProt entry
            unidict = pelmuni.accDict(
                acclist)  # Dictionary of {acc:UniProtEntry}
            pelmseq = rje_seq.SeqList(self.log, self.cmd_list +
                                      ['seqin=None'])  # SeqList object
            ## ~ [3b] Add one sequence for each AccNum and update seqdict  ~~~~~~~~~~~~~~~~~~~~~~~~ ##
            #!# Look out for splice variants! (There are some!) - Copy UniProt and change sequence & AccNum #!#
            for acc in acclist:  #!# Make accdict of {acc:Seq} using unidict and seqlist #!#
                sequence = seqdict[acc]
                try:
                    uni = unidict[string.split(acc, '-')[0]]
                    desc = uni.obj['Sequence'].info['Description']
                    name = '%s__%s %s' % (uni.obj['Sequence'].info['ID'], acc,
                                          desc)
                    if sequence != uni.obj['Sequence'].info['Sequence']:
                        self.log.printLog(
                            '#WARNING',
                            'Sequence mismatch for UniProt entry %s' % acc)
                except:
                    self.log.errorLog('Problem with %s' % acc)
                    name = '%s_UNK__%s' % (
                        acc, acc)  #!# Add sequences where UniProt missing #!#
                seqdict[acc] = pelmseq._addSeq(name, sequence)
            ## ~ [3c] Filtering of sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if self.opt['FilterSeq']:
                pelmseq.autoFilter()
                for acc in acclist:
                    if seqdict[acc] not in pelmseq.seq: seqdict.pop(acc)
                acclist = rje.sortKeys(seqdict)
            ## ~ [3d] Save sequences for BLASTing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if not os.path.exists(
                    self.info['PELMFas']
            ) or self.stat['Interactive'] < 0 or rje.yesNo(
                    '%s exists: overwrite?' % self.info['PELMFas']):
                pelmseq.saveFasta(seqfile=self.info['PELMFas'])
            self.obj['SeqList'] = pelmseq
            self.obj['UniProt'] = pelmuni
        except:
            self.log.errorLog('Problem during PhosphoSeq.readPELM')
#########################################################################################################################

    def mapPhosByBLAST(
        self, fasfile
    ):  ### BLAST sequences against phosphoDB, align hits & mark sites (ID & Homology)
        '''BLAST sequences against phosphoDB, align hits and mark phosphosites (ID & Homology).'''
        try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [1a] Setup fasfile ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            scmd = self.cmd_list + [
                'seqin=%s' % fasfile, 'autoload=T', 'autofilter=F'
            ]
            qseqlist = rje_seq.SeqList(self.log, scmd)
            qdict = qseqlist.seqNameDic()
            ## ~ [1b] Setup results files/directories ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            basefile = rje.baseFile(fasfile)
            if self.info['PhosRes'].lower() in ['', 'none']:
                self.info['PhosRes'] = '%s.phosres.tdt' % basefile
            headers = ['Name', 'Pos', 'AA', 'PELM', 'PELMPos', 'Evidence']
            delimit = rje.getDelimit(
                self.cmd_list,
                rje.delimitFromExt(filename=self.info['PhosRes']))
            rje.delimitedFileOutput(self,
                                    self.info['PhosRes'],
                                    headers,
                                    delimit,
                                    rje_backup=True)
            ppath = rje.makePath('PhosALN')
            rje.mkDir(self, ppath)
            ## ~ [1c] Setup BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            pblast = rje_blast.BLASTRun(self.log,
                                        self.cmd_list + ['formatdb=F'])
            pblast.setInfo({
                'Name': '%s.p.blast' % rje.baseFile(fasfile),
                'DBase': self.info['PELMFas'],
                'InFile': fasfile
            })
            pblast.setStat({'HitAln': pblast.stat['OneLine']})
            pblast.opt['Complexity Filter'] = False
            pblast.formatDB(force=False)
            ## ~ [1d] Setup GABLAM Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            gkey = 'GABLAMO ID'  #x# % self.info['GABLAMO Key']
            for g in ['ID', 'Hom']:
                if self.stat['%sSim' % g] < 1.0:
                    self.stat['%sSim' % g] *= 100.0
                self.stat['%sSim' % g] = max(0.0, self.stat['%sSim' % g])

            ### ~ [2] PhosphoBLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            pblast.blast(use_existing=True, log=True)  # BLAST
            pblast.readBLAST(gablam=True)  # Read in
            while pblast.search:
                ## ~ [2a] Align relevant hits from each BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                search = pblast.search.pop(0)
                qseq = qdict[search.info['Name']]
                idlist = []
                qlen = qseq.aaLen()
                hitdict = search.hitSeq(self.obj['SeqList'])
                aln = rje_seq.SeqList(
                    self.log, self.cmd_list + ['autoload=F', 'autofilter=F'])
                aln.seq = [qseq]
                pdict = {}  # Dictionary of {hseq:[poslist]}
                rdict = {qseq: 0}  # Dictionary of {hseq:res}
                for hit in search.hit[0:]:
                    hseq = hitdict[hit]
                    pdict[hseq] = []
                    for pos in rje.sortKeys(
                            self.dict['PhosphoSites'][hseq.info['AccNum']]):
                        pdict[hseq].append(pos)
                    if hit.info['Name'] == search.info['Name']:
                        if qseq.getSequence(case=False,
                                            gaps=False) != hseq.getSequence(
                                                case=False, gaps=False):
                            self.log.errorLog(
                                'Major problem: Search/Hit sequence mismatch for same sequence "%s"'
                                % hit.info['Name'])
                        idlist.append(qseq)
                        pdict[qseq] = pdict.pop(hseq)
                        continue
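                    # GABLAM(O) percentage identity of the query vs this hit decides how the hit is used:
                    # below HomSim it is dropped, at or above IDSim (same species unless usespec=F) its sites
                    # count as identical-protein evidence ('ID'), otherwise as homologue evidence ('Hom').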
                    gdict = hit.globalFromLocal(qlen)
                    qvh = float(100 * gdict['Query'][gkey]) / float(qlen)
                    if qvh < self.stat['HomSim']:
                        pdict.pop(hseq)
                        continue
                    aln.seq.append(hseq)
                    if (qseq.sameSpec(hseq) or not self.opt['UseSpec']
                        ) and qvh >= self.stat['IDSim']:
                        idlist.append(hseq)
                    rdict[hseq] = 0
                aln.muscleAln(
                )  #x#outfile='%s%s.phosaln.fas' % (ppath,qseq.info['AccNum']))
                aln._addSeq('PhosAln', '-' * qseq.seqLen())
                aln.info['Name'] = '%s%s.phosaln.fas' % (ppath,
                                                         qseq.info['AccNum'])
                ## ~ [2b] Map phosphorylations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                print '>>>\n', aln.seq, pdict.keys(), rdict.keys()
                for a in range(qseq.seqLen()):
                    if qseq.info['Sequence'][a] != '-': rdict[qseq] += 1
                    for hseq in pdict:
                        if hseq.info['Sequence'][a] == '-': continue
                        if hseq != qseq: rdict[hseq] += 1
                        if rdict[hseq] in pdict[hseq] and qseq.info['Sequence'][
                                a] == hseq.info['Sequence'][a]:  # Phosphosite
                            pdata = {
                                'Name': search.info['Name'],
                                'Pos': rdict[qseq],
                                'AA': qseq.info['Sequence'][a],
                                'PELM': hseq.shortName(),
                                'PELMPos': rdict[hseq],
                                'Evidence': 'Hom'
                            }
                            if hseq == qseq: pdata['Evidence'] = 'Self'
                            elif hseq in idlist: pdata['Evidence'] = 'ID'
                            rje.delimitedFileOutput(self, self.info['PhosRes'],
                                                    headers, delimit, pdata)
                            self.addPhos(aln.seq[-1], a, pdata['Evidence'])
                ## ~ [2c] Add Scansite/NetPhos if made? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                ## ~ [2d] Save alignment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                aln.saveFasta()
Example #25
0
    def mapPhosByBLAST(
        self, fasfile
    ):  ### BLAST sequences against phosphoDB, align hits & mark sites (ID & Homology)
        '''BLAST sequences against phosphoDB, align hits and mark phosphosites (ID & Homology).'''
        try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [1a] Setup fasfile ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            scmd = self.cmd_list + [
                'seqin=%s' % fasfile, 'autoload=T', 'autofilter=F'
            ]
            qseqlist = rje_seq.SeqList(self.log, scmd)
            qdict = qseqlist.seqNameDic()
            ## ~ [1b] Setup results files/directories ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            basefile = rje.baseFile(fasfile)
            if self.info['PhosRes'].lower() in ['', 'none']:
                self.info['PhosRes'] = '%s.phosres.tdt' % basefile
            headers = ['Name', 'Pos', 'AA', 'PELM', 'PELMPos', 'Evidence']
            delimit = rje.getDelimit(
                self.cmd_list,
                rje.delimitFromExt(filename=self.info['PhosRes']))
            rje.delimitedFileOutput(self,
                                    self.info['PhosRes'],
                                    headers,
                                    delimit,
                                    rje_backup=True)
            ppath = rje.makePath('PhosALN')
            rje.mkDir(self, ppath)
            ## ~ [1c] Setup BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            pblast = rje_blast.BLASTRun(self.log,
                                        self.cmd_list + ['formatdb=F'])
            pblast.setInfo({
                'Name': '%s.p.blast' % rje.baseFile(fasfile),
                'DBase': self.info['PELMFas'],
                'InFile': fasfile
            })
            pblast.setStat({'HitAln': pblast.stat['OneLine']})
            pblast.opt['Complexity Filter'] = False
            pblast.formatDB(force=False)
            ## ~ [1d] Setup GABLAM Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            gkey = 'GABLAMO ID'  #x# % self.info['GABLAMO Key']
            for g in ['ID', 'Hom']:
                if self.stat['%sSim' % g] < 1.0:
                    self.stat['%sSim' % g] *= 100.0
                self.stat['%sSim' % g] = max(0.0, self.stat['%sSim' % g])

            ### ~ [2] PhosphoBLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            pblast.blast(use_existing=True, log=True)  # BLAST
            pblast.readBLAST(gablam=True)  # Read in
            while pblast.search:
                ## ~ [2a] Align relevant hits from each BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                search = pblast.search.pop(0)
                qseq = qdict[search.info['Name']]
                idlist = []
                qlen = qseq.aaLen()
                hitdict = search.hitSeq(self.obj['SeqList'])
                aln = rje_seq.SeqList(
                    self.log, self.cmd_list + ['autoload=F', 'autofilter=F'])
                aln.seq = [qseq]
                pdict = {}  # Dictionary of {hseq:[poslist]}
                rdict = {qseq: 0}  # Dictionary of {hseq:res}
                for hit in search.hit[0:]:
                    hseq = hitdict[hit]
                    pdict[hseq] = []
                    for pos in rje.sortKeys(
                            self.dict['PhosphoSites'][hseq.info['AccNum']]):
                        pdict[hseq].append(pos)
                    if hit.info['Name'] == search.info['Name']:
                        if qseq.getSequence(case=False,
                                            gaps=False) != hseq.getSequence(
                                                case=False, gaps=False):
                            self.log.errorLog(
                                'Major problem: Search/Hit sequence mismatch for same sequence "%s"'
                                % hit.info['Name'])
                        idlist.append(qseq)
                        pdict[qseq] = pdict.pop(hseq)
                        continue
                    gdict = hit.globalFromLocal(qlen)
                    qvh = float(100 * gdict['Query'][gkey]) / float(qlen)
                    if qvh < self.stat['HomSim']:
                        pdict.pop(hseq)
                        continue
                    aln.seq.append(hseq)
                    if (qseq.sameSpec(hseq) or not self.opt['UseSpec']
                        ) and qvh >= self.stat['IDSim']:
                        idlist.append(hseq)
                    rdict[hseq] = 0
Example #26
0
 def uniFake(
     self,
     seqs=[],
     store=False
 ):  ### Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs.
     '''Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs given.'''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         unifake = string.split(string.join(self.list['UniFake']).lower())
         seqlist = self.obj['SeqList']
         if seqs: seqlist.seq = seqs
         else: seqs = seqlist.seq
         (sx, seqnum) = (0, seqlist.seqNum())
         ## ~ [1b] Setup UniProt object and output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         uniprot = rje_uniprot.UniProt(
             self.log, self.cmd_list)  # UniProt object for saving data
         if self.info['DatOut'].lower() in ['', 'none']:
             self.info['DatOut'] = rje.baseFile(
                 seqlist.info['Name']) + '.dat'
         datfile = self.info['DatOut']
         if os.path.exists(datfile): rje.backup(self, datfile)
         if store: seqlist.obj['UniProt'] = uniprot
         ## ~ [1c] Setup RJE_HMM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if 'pfam' in unifake:
             hmm = rje_hmm.HMMRun(self.log, self.cmd_list + ['force=T'])
             hmmfile = '%s.pfam.tdt' % rje.baseFile(datfile)
             if os.path.exists(hmmfile): rje.backup(self, hmmfile)
             hmm.list['HMM'] = [self.info['PFam']]
             hmm.opt['HMMPFam'] = True
         else:
             hmm = None
         ## ~ [1d] Setup RJE_TM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if 'signalp' in unifake: tm = rje_tm.TM(self.log, self.cmd_list)
         else: tm = None
         ### ~ [2] ~ Perform UniFake processing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
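         # For each sequence: write a temporary fasta file, collect aliases and features, then run whichever
         # of the disorder (IUPred), PFam HMM, TMHMM and SignalP annotations are requested in self.list['UniFake'],
         # adding the predictions as UniProt-style features (FT) and comments (CC) for the output DAT entry.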
         for seq in seqs:
             sx += 1
             name = seq.shortName()
             self.printLog(
                 '#SEQ', 'Processing %s (%s aa) %s...' %
                 (seq.shortName(), rje.integerString(
                     seq.aaLen()), seq.info['Description'][:50]))
             try:
                 ## ~ [2a] ~ Basic data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 utmp = 'tmp%s.%s' % (rje.randomString(5),
                                      seq.info['AccNum'])
                 open('%s.fas' % utmp, 'w').write(
                     '>%s\n%s\n' % (seq.shortName(), seq.info['Sequence']))
                 udata = {
                     'CC': ['-!- Features generated using unifake.py'],
                     'AC': []
                 }
                 if seq.info['SpecCode'] in ['Unknown', 'UNK']:
                     seq.info['SpecCode'] = self.info['SPCode']
                 #x#elif seq.info['Species'] != 'None': udata['OS'] = [seq.info['Species']]     #!# Check how well this works. Add spectable? #!#
                 ## ~ [2b] ~ Aliases ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if self.opt['EnsDat'] and rje.matchExp(
                         '\[acc:(\S+) pep:(\S+) gene:(\S+)\]',
                         seq.info['Name']):
                     details = rje.matchExp(
                         '\[acc:(\S+) pep:(\S+) gene:(\S+)\]',
                         seq.info['Name'])
                     self.addAlias(seq.info['AccNum'], details[0])
                     self.addAlias(seq.info['AccNum'], details[1])
                     self.addAlias(seq.info['AccNum'], details[2])
                     udata['GN'] = [details[2]]
                 for id in [seq.shortName(), seq.info['AccNum']]:
                     if id in self.dict['Aliases']:
                         udata['AC'].append(
                             '%s;' %
                             string.join(self.dict['Aliases'][id], '; '))
                 ## ~ [2c] ~ Features ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 ft = []  # List of features for sequence
                 for id in [
                         seq.shortName(), seq.info['AccNum'], seq.info['ID']
                 ]:
                     if id in self.dict['Features']:
                         ft += self.dict['Features'][id]
                 ## ~ [2d] IUPRED disorder prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if 'disorder' in self.list['UniFake']:
                     try:
                         seq.disorder()
                         dis = seq.obj['Disorder']
                         for disorder in seq.obj['Disorder'].list[
                                 'RegionDisorder']:
                             ft.append({
                                 'Type':
                                 'DISORDER',
                                 'Desc':
                                 'Predicted disorder: %s' %
                                 seq.obj['Disorder'].info['Disorder'],
                                 'Start':
                                 disorder[0],
                                 'End':
                                 disorder[1]
                             })
                             if dis.info['Disorder'].lower() == 'iupred':
                                 ft[-1]['Desc'] = '%s > %.2f' % (
                                     ft[-1]['Desc'], dis.stat['IUCut'])
                         for fold in seq.obj['Disorder'].list['RegionFold']:
                             ft.append({
                                 'Type':
                                 'ORDER',
                                 'Desc':
                                 'Predicted order: %s' %
                                 seq.obj['Disorder'].info['Disorder'],
                                 'Start':
                                 fold[0],
                                 'End':
                                 fold[1]
                             })
                             if dis.info['Disorder'].lower() == 'iupred':
                                 ft[-1]['Desc'] = '%s <= %.2f' % (
                                     ft[-1]['Desc'], dis.stat['IUCut'])
                     except:
                         self.log.errorLog(
                             'UniFake disorder problem for %s.' % name)
                 ## ~ [2e] PFam HMM domain prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if hmm:
                     try:
                         hmm.setInfo({
                             'SearchDB': '%s.fas' % utmp,
                             'HMMOut': '%s.hmm.out' % utmp
                         })  # This will be made for each sequence
                         hmm.search = []
                         hmm.list['HMMRes'] = [
                             hmm.hmmSearch(self.info['PFam'],
                                           outfile=hmm.info['HMMOut'])
                         ]  # Used in hmmTable
                         hmm.hmmTable(outfile=hmmfile, append=True)
                         if 'disorder' in self.list['UniFake']:
                             disorder = seq.obj['Disorder'].list[
                                 'ResidueDisorder']  # individual (IUPRed) residue results
                         else:
                             disorder = []
                         if hmm.search:
                             udata['CC'].append(
                                 'PFam: HMMer PFam search vs %s (Modified %s)'
                                 %
                                 (self.info['PFam'],
                                  time.ctime(
                                      os.path.getmtime(self.info['PFam']))))
                         else:
                             udata['CC'].append(
                                 '-!- ERROR: PFam HMMer Search failure!')
                             out = {'Type': '!ERROR!', 'Name': name}
                             rje.delimitedFileOutput(
                                 self,
                                 hmmfile, [
                                     'Type', 'Name', 'Start', 'End', 'Eval',
                                     'Score'
                                 ],
                                 datadict=out)
                         for search in hmm.search:
                             for hit in search.hit:
                                 for aln in hit.aln:
                                     pfamft = {'Start': aln.stat['SbjStart'], 'End': aln.stat['SbjEnd'], 'Type': 'PFAM',
                                               'Desc': '%s PFam HMM Eval: %.2e; Score: %.1f' % (search.info['Name'], aln.stat['Expect'], aln.stat['BitScore'])}
                                     if disorder:
                                         region = disorder[aln.stat['SbjStart'] - 1:aln.stat['SbjEnd']]
                                         hmmdisorder = float(sum(region)) / len(region)
                                         pfamft['Desc'] = '%s; IUPRed: %.2f' % (pfamft['Desc'], hmmdisorder)
                                         if hmmdisorder < self.stat['DisDom']:
                                             pfamft['Type'] = 'DOMAIN'
                                     ft.append(pfamft)
                     except:
                         self.log.errorLog(
                             'UniFake PFam HMM problem for %s.' % name)
                 ## ~ [2f] TMHMM transmembrane topology prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if 'tmhmm' in unifake:
                     try:
                         tmdat = os.popen(
                             '%s %s.fas -short' %
                             (self.info['TMHMM'], utmp)).readlines()
                         domlist = rje_tm.domainList(
                             rje_tm.parseTMHMM(tmdat[0]))
                         for tmdom in domlist:
                             ft.append(tmdom)
                             ft[-1]['Desc'] = 'TMHMM topology prediction'
                             ft[-1]['Start'] = string.atoi(ft[-1]['Start'])
                             ft[-1]['End'] = string.atoi(ft[-1]['End'])
                         if len(domlist) > 1:
                             udata['CC'].append(
                                 'TMHMM: %d TM domains; N-Term %s' %
                                 ((len(domlist) - 1) / 2,
                                  domlist[0]['Type']))
                         else:
                             udata['CC'].append('TMHMM: 0 TM domains')
                     except:
                         self.log.errorLog('UniFake TMHMM problem for %s.' %
                                           name)
                 ## ~ [2g] SIGNALP signal peptide prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if 'signalp' in unifake:
                     try:
                         os.system(
                             '%s -f short -t euk %s.fas > %s.signalp' %
                             (self.info['SignalP'], utmp, utmp))
                         tm.signalp = {}
                         tm.parseSignalP('%s.signalp' % utmp)
                         sigp = tm.signalp.pop(seq.shortName())
                         cpos = 0
                         if sigp['nn_ymax?'] == 'Y':
                             cpos = string.atoi(sigp['nn_ymaxpos'])
                             desc = 'SignalP NN prediction'
                         if sigp['hmm_cmax?'] == 'Y':
                             hmm_c = string.atoi(sigp['hmm_cmaxpos'])
                             if cpos == 0:
                                 cpos = hmm_c
                                 desc = 'SignalP HMM prediction'
                             else:
                                 if hmm_c < cpos:
                                     cpos = hmm_c
                                     desc = 'SignalP HMM prediction (NN also Y)'
                                 else:
                                     desc += ' (HMM also Y)'
                         if cpos > 0:
                             ft.append({
                                 'Type': 'SIGNALP',
                                 'Desc': desc,
                                 'Start': 1,
                                 'End': cpos
                             })
                     except:
                         self.log.errorLog(
                             'UniFake SignalP problem for %s.' % name)
                 ## ~ [2h] Convert to UniProt and save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 self.addRealUniProt(seq, udata, ft)
                 self.deBug(ft)
                 if not store: uniprot.list['Entry'] = []
                 if uniprot.addFromSeq(
                         seq, data=udata,
                         ft=ft):  ### Converts into UniProtEntry object
                     if not store: uniprot.saveUniProt(datfile, append=True)
                     #x#open(self.info['DatPickup'],'a').write('%s\n' % seq.shortName())
             ## ~ [2i] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             except:
                 self.log.errorLog('Problem during UniFake(%s)' % name)
             for tmp in glob.glob('%s*' % utmp):
                 os.unlink(tmp)
             self.printLog(
                 '#UNIFAKE',
                 '|---------- %s run <<<|>>> %s to go -----------|' %
                 (rje.integerString(sx), rje.integerString(seqnum - sx)),
                 log=False)
         if store: uniprot.saveUniProt(datfile, append=False)
         if self.opt['CleanUp']:
             for tmp in glob.glob('TMHMM*'):
                 if os.path.isdir(tmp): os.rmdir(tmp)
     except:
         self.errorLog(
             'Oh, the shame of it! Trouble during UniFake.uniFake()')
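A minimal standalone sketch of the cleavage-site choice made in the SignalP step above: prefer the NN call, fall back to the HMM call when NN is negative, and keep the smaller cleavage position when both are positive. The sigp dictionary of short-format fields is a hypothetical stand-in for the parsed output, not the rje_tm parser.

def pick_signal_peptide(sigp):
    '''Return (cleavage_position, description), or (0, None) if no signal peptide is called.'''
    cpos = 0; desc = None
    if sigp.get('nn_ymax?') == 'Y':
        cpos = int(sigp['nn_ymaxpos']); desc = 'SignalP NN prediction'
    if sigp.get('hmm_cmax?') == 'Y':
        hmm_c = int(sigp['hmm_cmaxpos'])
        if cpos == 0: cpos = hmm_c; desc = 'SignalP HMM prediction'
        elif hmm_c < cpos: cpos = hmm_c; desc = 'SignalP HMM prediction (NN also Y)'
        else: desc += ' (HMM also Y)'
    return (cpos, desc)

print(pick_signal_peptide({'nn_ymax?': 'Y', 'nn_ymaxpos': '22', 'hmm_cmax?': 'Y', 'hmm_cmaxpos': '20'}))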
Example #27
0
    def picsi(self):    ### Cleans up cross-species search results
        '''Cleans up cross-species search results.'''
        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            datafile = self.info['SumFile']
            delimit = rje.delimitFromExt(filename=self.info['SumFile'])
            data = {}       # search:{hit:{???}}
            pep2prot = {}   # search:{peptide:[hits]}
            id2prot = {}    # search:{id:hit}
            prot2desc = {}
            fullpeplist = {}    
            pepcon = {}     # Convert pep:longer pep
            speclist = []   # List of species codes
            ### ~ [1] Read Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            indata = rje.dataDict(self,datafile,['search','prot_hit_num'],'All',lists=True)
            for ikey in rje.sortKeys(indata):
                (search,id) = string.split(ikey,delimit)
                prot = indata[ikey]['prot_acc'][0]
                desc = string.replace(indata[ikey]['prot_desc'][0],'Full=','')
                if desc[3:7] == 'Name': desc = desc[9:]
                prot2desc[prot] = desc; self.printLog('#DESC','%s = %s' % (prot,desc))
                indata[ikey]['pep_seq'] = string.join(indata[ikey]['pep_seq'],'|')
                pepconv = string.replace(indata[ikey]['pep_seq'],'I','L')
                pepconv = string.replace(pepconv,'Q','K')
                peplist = rje.sortUnique(string.split(pepconv,'|'))
                indata[ikey]['pep_seq'] = string.join(rje.sortUnique(string.split(indata[ikey]['pep_seq'],'|')),'|')
                if search not in data:
                    data[search] = {}
                    pep2prot[search] = {}
                    id2prot[search] = {}
                    fullpeplist[search] = []
                    pepcon[search] = {}
                fullpeplist[search] += peplist
                id2prot[search][id] = prot
                spec = string.split(prot,'_')[1]
                if spec not in speclist: speclist.append(spec)
                data[search][prot] = {'search':search,'pepcount':len(peplist),'hit':id,'desc':desc,'spec':spec,
                                      'pep_uniq':0,'peplist':indata[ikey]['pep_seq'],'conpep':peplist[0:],
                                      'pep_rem':0}
                try: data[search][prot]['accnum'] = self.dict['Acc2Seq'][prot].info['AccNum']
                except: data[search][prot]['accnum'] = string.split(prot,'__')[-1]
                for pep in peplist:
                    if pep not in pep2prot[search]:
                        pep2prot[search][pep] = []
                    pep2prot[search][pep].append(prot)
            ## ~ [1a] Convert peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            for search in fullpeplist:
                fullpeplist[search] = rje.sortUnique(fullpeplist[search])
                for pep in fullpeplist[search][0:]:
                    for pep2 in fullpeplist[search]:
                        if pep != pep2 and pep in pep2:
                            pepcon[search][pep] = pep2
                            fullpeplist[search].remove(pep)
                            break
                for pep in pepcon[search]:
                    while pepcon[search][pep] in pepcon[search]: pepcon[search][pep] = pepcon[search][pepcon[search][pep]]
                self.printLog('#PEP','%s %s peptide conversions' % (len(pepcon[search]),search))
                #self.deBug(pepcon[search])
                #self.deBug(rje.sortKeys(pep2prot[search]))
                pp = 0; pm = 0
                for prot in data[search]:
                    for pep in data[search][prot]['conpep'][0:]:
                        if pep in pepcon[search]:
                            newpep = pepcon[search][pep]
                            if newpep not in data[search][prot]['conpep']: data[search][prot]['conpep'].append(newpep); pp += 1
                            data[search][prot]['conpep'].remove(pep); pm += 1
                            if prot not in pep2prot[search][newpep]: pep2prot[search][newpep].append(prot)
                            if pep in pep2prot[search]: pep2prot[search].pop(pep)
                    data[search][prot]['pep_con'] = len(data[search][prot]['conpep'])
                self.printLog('#PEP','%s %s converted peptides added; %s removed' % (pp,search,pm))
            ### ~ [2] Calculate Unique/Redundancy status ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for search in pep2prot:
            ## ~ [2a] Species Redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                remx = 0
                for prot in data[search]:
                    if data[search][prot]['spec'] != self.info['QrySpec']: continue
                    for pep in data[search][prot]['conpep']:
                        for prot2 in pep2prot[search][pep][0:]:
                            if data[search][prot2]['spec'] == self.info['QrySpec']: continue
                            pep2prot[search][pep].remove(prot2)
                            data[search][prot2]['conpep'].remove(pep)
                            data[search][prot2]['pep_rem'] += 1; remx += 1
                self.printLog('#REM','%s %s peptides removed from non-%s hits' % (rje.integerString(remx),search,self.info['QrySpec']))
            ## ~ [2b] One-hit wonders ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                for prot in data[search]:
                    if len(data[search][prot]['conpep']) < 2:
                        for pep in data[search][prot]['conpep']:
                            #if pep in pep2prot[search] and prot in pep2prot[search][pep]:
                            pep2prot[search][pep].remove(prot)
            ## ~ [2c] Unique peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                ux = 0
                for pep in pep2prot[search]:
                    #self.deBug(pep)
                    if len(pep2prot[search][pep]) == 1: data[search][pep2prot[search][pep][0]]['pep_uniq'] += 1; ux += 1
                self.printLog('#UNIQ','%s unique %s peptides' % (rje.integerString(ux),search))
            ## ~ [2d] Total Redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                summary = {'HITS':len(data[search]),'REJECT':0,'UNIQUE':0,'NR':0,'REDUNDANT':0}
                rx = 0
                for prot in data[search]:
                    #if data[search][prot]['unique']: data[search][prot]['red'] = False; continue
                    data[search][prot]['pep_red'] = 0   # Redundant peptides found in proteins with unique peptides
                    data[search][prot]['pep_nr'] = 0    # Redundant peptides found only in proteins without unique peptides
                    for pep in data[search][prot]['conpep']:
                        if pep2prot[search][pep] == [prot]: continue
                        upep = False
                        for prot2 in pep2prot[search][pep]:
                            if data[search][prot2]['pep_uniq']: upep = True; break
                        if upep: data[search][prot]['pep_red'] += 1     # Redundant peptide found in unique protein
                        else: data[search][prot]['pep_nr'] += 1         # Redundant peptide NOT found in unique protein
                    if len(data[search][prot]['conpep']) < 2: data[search][prot]['class'] = 'REJECT'; rx += 1
                    elif data[search][prot]['pep_uniq']: data[search][prot]['class'] = 'UNIQUE'
                    elif data[search][prot]['pep_nr']: data[search][prot]['class'] = 'NR'
                    else: data[search][prot]['class'] = 'REDUNDANT'; rx += 1
                    summary[data[search][prot]['class']] += 1
                self.printLog('#REJ','%s rejected %s hits' % (rje.integerString(rx),search))
                for x in rje.sortKeys(summary): self.printLog('#%s' % search,'%s %s' % (summary[x],x))

            ### ~ [3] Species ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            speclist.sort()
            species = {}
            for spec in speclist:
                try:
                    grep = os.popen('grep %s %s' % (spec,self.info['SpecTDT'])).read()
                    species[spec] = string.split(grep,':')[-4]
                    self.printLog('#SPEC','%s = %s' % (spec,species[spec]))
                except: species[spec] = '?'

            ### ~ [END] Output data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            outfile = '%s.clean.tdt' % rje.baseFile(self.info['SumFile'])
            headers = ['search','hit','class','accnum','spec','species','desc','pepcount','pep_con','pep_rem','pep_uniq','pep_nr','pep_red','peplist','conpep']
            if self.dict['Acc2Seq']: headers.insert(3,'cluster')
            rje.delimitedFileOutput(self,outfile,headers,datadict={},rje_backup=True)
            for search in rje.sortKeys(data):
                if self.dict['Acc2Seq']: self.clusterGoodSeq(search,data[search])
                for prot in rje.sortKeys(data[search]):
                    if rje.matchExp('^gi:(\d+).+\[(\S.+\S)\]$',data[search][prot]['desc']):
                        data[search][prot]['species'] = rje.matchExp('^gi:(\d+).+\[(\S.+\S)\]$',data[search][prot]['desc'])[1]
                    else: data[search][prot]['species'] = species[data[search][prot]['spec']]                                                                               
                    rje.delimitedFileOutput(self,outfile,headers,datadict=data[search][prot])
                                
        except: self.errorLog('Errg')
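A minimal standalone sketch of the peptide-collapsing step in picsi() above: peptides are first made ambiguity-tolerant (I treated as L, Q treated as K) and any peptide contained within a longer one is mapped onto that longer peptide, following chains to the end. Plain Python only; the function name and test peptides are illustrative, not the rje helpers.

def collapse_peptides(peptides):
    '''Map each converted peptide onto the longest peptide that contains it.'''
    conv = sorted(set(p.replace('I', 'L').replace('Q', 'K') for p in peptides), key=len)
    mapping = {}
    for i, pep in enumerate(conv):
        for longer in conv[i + 1:]:
            if pep in longer:
                mapping[pep] = longer
                break
    for pep in mapping:  # Follow chains so every peptide maps to one that is not itself subsumed
        while mapping[pep] in mapping:
            mapping[pep] = mapping[mapping[pep]]
    return mapping

print(collapse_peptides(['PEPTIDE', 'PEPT', 'TIDEK', 'EPTIDEK']))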
Example #28
0
 def rfAtt(self):      ### Tabulates reading frame amino acid and dipeptide frequencies
     '''
     Counts amino acid and dipeptide frequencies in all six reading frames of the loaded DNA sequences and outputs
     observed and expected counts, plus their ratio, to a *.rf.tdt table.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         rfhead = ['Att','RF1','RF2','RF3','RF-1','RF-2','RF-3','ObsRF1','ObsRF2','ObsRF3','ObsRF-1','ObsRF-2','ObsRF-3','ExpRF1','ExpRF2','ExpRF3','ExpRF-1','ExpRF-2','ExpRF-3']
         rfdata = {}; rfobs = {}; rfexp = {}; ntfreq = {}
         for rf in ['RF1','RF2','RF3','RF-1','RF-2','RF-3']:
             rfdata[rf] = {}; rfobs[rf] = {}; rfexp[rf] = {}
             for x in rje_seq.alph_protx[:-1] + ['*']: rfdata[rf][x] = 0; rfobs[rf][x] = 0; rfexp[rf][x] = 0
             for a1 in rje_seq.alph_protx[:-1] + ['*']:
                 for a2 in rje_seq.alph_protx[:-1] + ['*']: rfdata[rf]['%s%s' % (a1,a2)] = 0; rfobs[rf]['%s%s' % (a1,a2)] = 0; rfexp[rf]['%s%s' % (a1,a2)] = 0
         for x in rje_seq.alph_dna[:-1]: ntfreq[x] = 0
         seqlist = self.obj['SeqList'] 
         ### ~ [2] Count sequence attributes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         (sx,stot) = (0.0,seqlist.seqNum())
         for seq in seqlist.seq:
             self.progLog('\r#ATT','Counting sequence attributes: %.2f%%' % (sx/stot)); sx += 100.0
             for x in seq.info['Sequence']:
                 if x in ntfreq: ntfreq[x] += 1
             rf6 = rje_sequence.sixFrameTranslation(seq.info['Sequence'])
             for r in rf6:
                 rseq = rf6[r]
                 rf = 'RF%d' % r
                 for i in range(len(rseq)):
                     a = rseq[i]; dia = rseq[i:i+2]
                     if a in rfdata[rf]: rfdata[rf][a] += 1
                     if dia in rfdata[rf]: rfdata[rf][dia] += 1
         self.printLog('\r#ATT','Counting sequence attributes complete.')
         ### ~ [3] Calculate Observed & Expected ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ntobs = rje.dictFreq(ntfreq,total=True,newdict=True)
         ntcomp = {'Total':ntobs['Total']}
         for xy in ['AT','GC']: ntcomp[xy[0]] = ntobs[xy[1]]; ntcomp[xy[1]] = ntobs[xy[0]]
         for rf in ['RF1','RF2','RF3','RF-1','RF-2','RF-3']:
             aafreq = {}
             for a in rje_seq.alph_protx[:-1] + ['*']: aafreq[a] = rfdata[rf][a]
             aafreq = rje.dictFreq(aafreq,total=True,newdict=True)
             for a in rje_seq.alph_protx[:-1] + ['*']: rfobs[rf][a] = rfdata[rf][a]; rfexp[rf][a] = 0
             for n1 in 'GATC':
                 for n2 in 'GATC':
                     for n3 in 'GATC':
                         codon = '%s%s%s' % (n1, n2, n3)
                         aa = rje_sequence.dna2prot(codon)
                         if rf[-2] == '-': rfexp[rf][aa] += (int(ntobs['Total']/3.0) * ntcomp[n1] * ntcomp[n2] * ntcomp[n3])
                         else: rfexp[rf][aa] += (int(ntobs['Total']/3.0) * ntobs[n1] * ntobs[n2] * ntobs[n3])
                         #self.deBug('%s: %s x %s x %s x %s' % (aa,(ntobs['Total'] - 2), rfobs[rf][n1], rfobs[rf][n2], rfobs[rf][n3]))
                         #self.deBug('%s: %s' % (aa,rfexp[rf][aa]))
             for a1 in rje_seq.alph_protx[:-1] + ['*']:
                 for a2 in rje_seq.alph_protx[:-1] + ['*']:
                     rfexp[rf]['%s%s' % (a1,a2)] = (aafreq['Total'] - 1) * aafreq[a1] * aafreq[a2]
                     rfobs[rf]['%s%s' % (a1,a2)] = rfdata[rf]['%s%s' % (a1,a2)] 
         ### ~ [4] Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         rfile = rje.baseFile(seqlist.info['Name']) + '.rf.tdt'
         rje.delimitedFileOutput(self,rfile,rfhead,rje_backup=True)
         for a in rje_seq.alph_protx[:-1] + ['*']:
             data = {'Att':a}
             for rf in ['RF1','RF2','RF3','RF-1','RF-2','RF-3']:
                 data['Obs%s' % rf] = rfobs[rf][a]
                 data['Exp%s' % rf] = '%.2f' % rfexp[rf][a]
                 data[rf] = rje.expectString(rfobs[rf][a] / rfexp[rf][a])
             rje.delimitedFileOutput(self,rfile,rfhead,datadict=data)
         for a1 in rje_seq.alph_protx[:-1] + ['*']:
             for a2 in rje_seq.alph_protx[:-1] + ['*']:
                 a = '%s%s' % (a1,a2)
                 data = {'Att':a}
                 for rf in ['RF1','RF2','RF3','RF-1','RF-2','RF-3']:
                     data['Obs%s' % rf] = rfobs[rf][a]
                     data['Exp%s' % rf] = '%.2f' % rfexp[rf][a]
                     data[rf] = rje.expectString(rfobs[rf][a] / rfexp[rf][a])
                 rje.delimitedFileOutput(self,rfile,rfhead,datadict=data)
         self.printLog('#TDT','TDT output complete.')
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise   # Delete this if method error not terrible
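A minimal standalone sketch of the dipeptide expectation used in step [3] above: given single-residue frequencies, the expected count of dipeptide a1a2 in a sequence of N residues is (N - 1) * freq(a1) * freq(a2), which can then be compared with the observed dipeptide count. Plain Python with an invented test sequence; rje.dictFreq is replaced by explicit arithmetic.

from collections import Counter

def dipeptide_obs_exp(protein):
    '''Return {dipeptide: (observed, expected)} for a single protein sequence.'''
    n = len(protein)
    aafreq = dict((a, float(c) / n) for a, c in Counter(protein).items())
    obs = Counter(protein[i:i + 2] for i in range(n - 1))
    return dict((a1 + a2, (obs.get(a1 + a2, 0), (n - 1) * aafreq[a1] * aafreq[a2]))
                for a1 in aafreq for a2 in aafreq)

for dia, (o, e) in sorted(dipeptide_obs_exp('MKKLLPKMKK').items()):
    if o: print('%s obs=%d exp=%.2f' % (dia, o, e))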
Example #29
0
 def tabulatePPIRegion(
         self):  ### Tabulates regions of known PPI from DAT file
     '''Tabulates regions of known PPI from DAT file.'''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         tabfile = 'ppi_region.tdt'
         unifile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/UniFake/Human/ens_HUMAN.unifake.dat'
         if os.path.exists(tabfile) and not self.opt['Force']:
             return self.printLog('#REGTAB',
                                  '%s found. (Force=F)' % tabfile)
         headers = ['Protein', 'Start', 'End', 'Interactor']
         rje.delimitedFileOutput(self, tabfile, headers, rje_backup=True)
         ### ~ [2] Extract and tabulate data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         gcmd = "grep -P '(ID   |REGION)' %s | grep -P '(HUMAN|interact)' -i | grep REGION -B 1" % unifile
         self.printLog('#GREP', gcmd)
         prot = None
         rx = 0
         plist = []
         ilist = []
         for gline in os.popen(gcmd).readlines():
             if rje.matchExp('ID   (\S+)', gline):
                 prot = rje.matchExp('ID   (\S+)', gline)[0]
             if rje.matchExp(
                     'FT   REGION\s+(\d+)\s+(\d+).+nteract\S+ with (\S.+)',
                     gline):
                 (rstart, rend, rint) = rje.matchExp(
                     'FT   REGION\s+(\d+)\s+(\d+).+nteract\S+ with (\S.+)',
                     gline)
                 for ppi in string.split(rint):
                     if rje.matchExp('^([A-Z0-9][A-Z0-9]+)', ppi):
                         datadict = {
                             'Protein': prot,
                             'Start': rstart,
                             'End': rend,
                             'Interactor': rje.matchExp('^([A-Z0-9][A-Z0-9]+)', ppi)[0]
                         }
                         rje.delimitedFileOutput(self,
                                                 tabfile,
                                                 headers,
                                                 datadict=datadict)
                         rx += 1
                         if prot not in plist: plist.append(prot)
                         if datadict['Interactor'] not in ilist:
                             ilist.append(datadict['Interactor'])
                         self.progLog(
                             '\r#REGTAB',
                             'Tabulating regions: %s proteins; %s interactors; %s regions'
                             % (rje.integerString(
                                 len(plist)), rje.integerString(
                                     len(ilist)), rje.integerString(rx)))
         self.printLog(
             '\r#REGTAB',
             'Tabulated regions (%s proteins; %s interactors; %s regions) => %s'
             % (rje.integerString(len(plist)), rje.integerString(
                 len(ilist)), rje.integerString(rx), tabfile))
         return True
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise  # Delete this if method error not terrible
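A minimal standalone sketch of the region extraction above: remember the current ID line and pull start, end and interactor out of FT REGION lines with the same regular expressions, using re.search in place of rje.matchExp. The two example DAT lines are invented for illustration rather than grepped from the real ens_HUMAN.unifake.dat.

import re

def parse_ppi_regions(lines):
    '''Yield (protein, start, end, interactor) tuples from UniProt-style DAT lines.'''
    prot = None
    for line in lines:
        m = re.match(r'ID   (\S+)', line)
        if m: prot = m.group(1)
        m = re.search(r'FT   REGION\s+(\d+)\s+(\d+).+nteract\S+ with (\S.+)', line)
        if m:
            start, end, partners = m.groups()
            for ppi in partners.split():
                hit = re.match(r'^([A-Z0-9][A-Z0-9]+)', ppi)
                if hit: yield (prot, int(start), int(end), hit.group(1))

example = ['ID   TEST_HUMAN              Reviewed;         100 AA.',
           'FT   REGION        10     40       Interaction with BRCA1.']
print(list(parse_ppi_regions(example)))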
Example #30
0
 def run(self):  ### Main run method
     '''Main run method.'''
     try:### ~ [1] Reformat Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for fasta in glob.glob('*.fasta'):
             fas = fasta[:-2]
             if os.path.exists(fas): continue
             sx = 0
             for line in open(fasta,'r').readlines():
                 if line[:1] == '>':
                     try: (name,desc) = rje.matchExp('^>(\S+) (\S.+)$',line)
                     except: name = rje.matchExp('^>(\S+)',line)[0]
                     if len(string.split(name,'|')) == 3:
                         name = '6rf_NEIME__%s' % string.split(name,'|')[2]
                         open(fas,'a').write('>%s\n' % name)
                     elif len(string.split(name,'|')) == 5:
                         name = 'ref_NEIME__%s' % string.split(name,'|')[3]
                         open(fas,'a').write('>%s %s\n' % (name,desc))
                     else: print string.split(name,'|'); raise ValueError
                     self.progLog('\r#FAS','Processing %s: %s seqs' % (fas, rje.integerString(sx))); sx += 1
                 else: open(fas,'a').write(line)
             self.printLog('\r#FAS','Processed %s: %s seqs from %s' % (fas, rje.integerString(sx), fasta))
             rje_blast.BLASTRun(self.log,self.cmd_list).formatDB(fas,protein=True,force=True)
         ### ~ [2] Read in CSV Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         rfhits = {}     # Dictionary of {hit:['File:hit_num']}
         acc = 'MC58_6RF_Hits.acc'; open(acc,'w')
         gfile = 'MC58_6RF_Hits.vs.MC58_1.hitsum.tdt'
         cx = 0
         for csv in glob.glob('MC58_6RF_CSV/*.CSV'):
             cx += 1
             file = os.path.basename(csv)[:-4]
             hits = False
             for line in open(csv,'r').readlines():
                 if line.find('prot_hit_num,prot_acc') == 0: hits = True
                 elif hits:
                     data = rje.readDelimit(line,',')
                     if len(data) < 2: continue
                     [num,name] = data[:2]
                     try: name = string.split(name,'|')[2]
                     except: continue
                     if name not in rfhits:
                         open(acc,'a').write('6rf_NEIME__%s\n' % name)
                         rfhits[name] = []
                     id = '%s:%s' % (file,num)
                     if id not in rfhits[name]: rfhits[name].append(id)
                     self.progLog('\r#CSV','Reading %d CSV files: %s 6RF Hits' % (cx,rje.integerString(len(rfhits))))
         self.printLog('\r#CSV','Read %d CSV files: %s 6RF Hits output to %s' % (cx,rje.integerString(len(rfhits)),acc))
         ### ~ [3] Extract sequences and perform GABLAM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not os.path.exists(gfile):
             seqlist = rje_seq.SeqList(self.log,self.cmd_list+['seqin=%s' % acc,'fasdb=MC58_6RF.fas','seqout=MC58_6RF_Hits.fas','autoload=T','accnr=F','seqnr=F'])
             seqlist.info['Name'] = 'MC58_6RF_Hits.fas'
             seqlist.saveFasta()
             gablam.GABLAM(self.log,self.cmd_list+['seqin=MC58_6RF_Hits.fas','searchdb=MC58_1.fas','qryacc=F']).gablam()
         ### ~ [4] Read in GABLAM and ID Hits without genomic homology ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         gdata = rje.dataDict(self,gfile,['Qry'],['HitNum'])
         zeros = []
         for hit in gdata:
             if string.atoi(gdata[hit]['HitNum']) == 0: zeros.append(hit)
         zeros = rje.sortUnique(zeros,False)
         open('6rf_zeros.acc','w').write(string.join(zeros,'\n'))
         self.printLog('#ZERO','%d 6RF hits with 0 BLAST hits to MC58_1' % len(zeros))
         ufile = 'MC58_6RF_Zeros.vs.embl_bacteria.hitsum.tdt'
         if not os.path.exists(ufile):
             seqlist = rje_seq.SeqList(self.log,self.cmd_list+['seqin=6rf_zeros.acc','fasdb=MC58_6RF.fas','seqout=MC58_6RF_Zeros.fas','autoload=T','accnr=F','seqnr=F'])
             seqlist.info['Name'] = 'MC58_6RF_Zeros.fas'
             seqlist.saveFasta()
             gablam.GABLAM(self.log,self.cmd_list+['seqin=MC58_6RF_Zeros.fas','searchdb=/scratch/Databases/NewDB/TaxaDB/embl_bacteria.fas','qryacc=F']).gablam()
         gdata = rje.dataDict(self,ufile,['Qry'],getheaders=True)
         fdata = rje.dataDict(self,string.replace(ufile,'hitsum','gablam'),['Qry'],['Hit'],lists=True)
         headers = gdata.pop('Headers')
         headers.insert(1,'Sample')
         headers.append('BestHit')
         rje.delimitedFileOutput(self,'MC58_6RF_Zeros.tdt',headers,rje_backup=True)
         for rf in rje.sortKeys(gdata):
             rfcut = string.split(rf,'__')[1]
             gdata[rf]['Sample'] = string.join(rfhits[rfcut],'; ')
             gdata[rf]['Qry'] = rfcut
             try: gdata[rf]['BestHit'] = fdata[rf]['Hit'][0]
             except: gdata[rf]['BestHit']  = '-'
             rje.delimitedFileOutput(self,'MC58_6RF_Zeros.tdt',headers,datadict=gdata[rf])
         
     except: self.errorLog(rje_zen.Zen().wisdom())
     self.printLog('#ZEN',rje_zen.Zen().wisdom())
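A minimal standalone sketch of the header rewriting in step [1] above: pipe-delimited FASTA names are renamed to a 6rf_NEIME__ or ref_NEIME__ identifier depending on whether they contain three or five fields, and anything else raises ValueError. The two example headers are invented; the real run reads *.fasta files from disk.

def rename_fasta_header(name, desc=''):
    '''Return a rewritten FASTA header line for 3- or 5-field pipe-delimited names.'''
    fields = name.split('|')
    if len(fields) == 3: return '>6rf_NEIME__%s\n' % fields[2]
    if len(fields) == 5: return '>ref_NEIME__%s %s\n' % (fields[3], desc)
    raise ValueError('Unexpected name format: %s' % name)

print(rename_fasta_header('lcl|MC58|ORF00123'))
print(rename_fasta_header('gi|12345|ref|NP_000001.1|', 'hypothetical protein'))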
Example #31
0
 def setupResults(self):    ### Main results setup method.
     '''Main results setup method.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.list['Headers'] = ['Dataset','Query','Fitness','Phenotype','SeqGroup','CovP','CovB','CovW','Price','Ratio']
         rje.delimitedFileOutput(self,self.info['ResFile'],self.list['Headers'],rje_backup=True)
     except: self.errorLog('Problem during %s setupResults().' % self); raise
Example #32
0
    def mapSeq(self,seqlist,blast,search,outputmap=True): ### Performs actual mapping of sequence
        '''
        Performs actual mapping of sequence.
        >> seqlist:SeqList object containing the Sequence object to be mapped
        >> blast:BLAST_Run object to perform BLAST and GABLAM
        >> search:Current BLAST search object for mapping
        >> outputmap:boolean = Whether to output mapping into a file [True]
        << returns shortName() of mapped sequence (or None if none)
        '''
        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            seq = seqlist.getSeq(format='tuple')
            mapseq = self.obj['MapDB']
            hits = blast.db('Hit').indexEntries('Query',search)
            self.printLog('#HITS','%s vs %s = %d hits' % (search,blast.str['DBase'],len(hits)))
            hitseq = {}; hitdata = {}
            for entry in hits:
                hitseq[entry['Hit']] = mapseq.getDictSeq(entry['Hit'],format='tuple')
                hitdata[entry['Hit']] = entry
            resdict = {'Query':search,'Hit':None,'Method':'Failed','Query_Species':rje_sequence.specCodeFromName(seq[0])}
            ### ~ [1] Order Hits and Check Species ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (hits,hitdict) = self.orderHits(seq,hits,hitseq)
            self.debug(hits)
            self.debug(hitdict)
            ### ~ [2] Attempt mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for method in self.list['Mapping']:
                resdict['Hit'] = self.mapHit(seq,hits,hitdict,method.lower())
                if resdict['Hit']:
                    resdict['Method'] = method[:1].upper() + method[1:].lower()
                    break
                elif method == 'gablam' and (len(hits) > 0):
                    resdict['Method'] = 'Rejected'
            self.debug(resdict)
            ### ~[3] Output! ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if resdict['Hit']:  #hitdict[hit]['Data']['ShortName']
                hit = resdict['Hit']['Hit']     # resdict['Hit'] is the BLAST table entry for Hit
                shortname = hitdict[hit]['Data']['ShortName']   # This is just hit!
                self.printLog('#MAP','%s mapped to %s (by %s)' % (string.split(seq[0])[0],shortname,resdict['Method']))
                ## Update Stats ##
                self.debug('')
                resdict['BlastRank'] = hitdata[hit]['Rank']
                for key in hitdict[hit]: resdict[key] = hitdict[hit][key]
                ## Fasta and Redundancy ##
                if shortname in self.list['Mapped']: self.printLog('#MAP','%s already mapped before - not duplicating in %s' % (shortname,self.str['MapFas']))
                else:
                    self.list['Mapped'].append(shortname)
                    if outputmap:
                        open(self.str['MapFas'],'a').write('>%s\n%s\n' % (hitseq[hit][0],hitseq[hit][1]))
                resdict['Hit_Species'] = hitdict[hit]['Data']['SpecCode']
                resdict['Hit'] = shortname
            else:
                ### ~ [2] GREP-based search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
                if 'grep' in self.list['Mapping']:
                    greplist = []; hitseq = ''
                    self.printLog('#GREP','grep %s %s -B 1' % (seq[1],blast.str['DBase']),log=False)
                    for line in os.popen('grep %s %s -B 1' % (seq[1],blast.str['DBase'])).readlines():
                        if line[:1] == '>': greplist.append(string.split(line[1:])[0])
                        elif not hitseq: hitseq = rje.chomp(line)
                    if greplist:
                        shortname = greplist.pop(0)
                        resdict['Hit'] = shortname
                        resdict['Method'] = 'Grep'
                        resdict['Qry_ID'] = '100.0'
                        resdict['Qry_Len'] = len(seq[1])
                        resdict['Hit_Len'] = len(hitseq)
                        resdict['Hit_ID'] = 100.0 * len(hitseq) / len(seq[1])
                        try: resdict['Hit_Species'] = string.split(shortname,'_')[1]
                        except: pass
                        if shortname in self.list['Mapped']:
                            self.printLog('#MAP','%s already mapped before - not duplicating in %s' % (shortname,self.str['MapFas']))
                        else:
                            self.list['Mapped'].append(shortname)
                            if outputmap: open(self.str['MapFas'],'a').write('>%s\n%s\n' % (shortname,hitseq))
                    for extra in greplist: self.printLog('#GREP','Warning! Query "%s" also hit "%s" with grep!' % (string.split(seq[0])[0],extra))
                if not resdict['Hit'] and self.bool['Combine']:
                    ## Fasta and Redundancy ##
                    shortname = string.split(seq[0])[0]
                    if shortname in self.list['Mapped']:
                        self.printLog('#FAS','%s already in output - not duplicating in %s' % (shortname,self.str['MapFas']))
                    else:
                        self.list['Mapped'].append(shortname)
                        if outputmap:
                            open(self.str['MapFas'],'a').write('>%s\n%s\n' % (seq[0],seq[1]))
                elif outputmap:
                    open(self.str['MissFas'],'a').write('>%s\n%s\n' % (seq[0],seq[1]))
                self.printLog('#MISS','%s mapping %s' % (resdict['Query'],resdict['Method']))
            if outputmap:
                rje.delimitedFileOutput(self,self.str['MapRes'],self.list['Headers'],rje.getDelimit(self.cmd_list),resdict)
            return resdict['Hit']

        except:
            self.errorLog('Fudgesticks! SeqMapper.mapSeq(%s) has died!' % seq[0],quitchoice=True)
            return False
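A minimal standalone sketch of the grep-style fallback in mapSeq() above: when BLAST-based mapping fails, the query sequence is searched for verbatim in the database and the first containing record is taken as the mapping, with any further matches only reported as warnings. Pure Python over an in-memory dictionary instead of os.popen('grep ...'); the sequences shown are invented.

def grep_map(query_seq, fasta_dict):
    '''Return names of database sequences that contain query_seq verbatim.'''
    return [name for name, dbseq in fasta_dict.items() if query_seq in dbseq]

db = {'P1_HUMAN': 'MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ',
      'P2_HUMAN': 'MSTNPKPQRKTKRNTNRRPQDVK'}
print(grep_map('AKQRQISFVK', db))  # ['P1_HUMAN']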
Example #33
0
 def uniFake(self,seqs=[],store=False):  ### Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs.
     '''Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs given.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         unifake = string.split(string.join(self.list['UniFake']).lower())
         seqlist = self.obj['SeqList']
         if seqs: seqlist.seq = seqs
         else: seqs = seqlist.seq
         (sx,seqnum) = (0,seqlist.seqNum())
         ## ~ [1b] Setup UniProt object and output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         uniprot = rje_uniprot.UniProt(self.log,self.cmd_list)   # UniProt object for saving data
         if self.info['DatOut'].lower() in ['','none']: self.info['DatOut'] = rje.baseFile(seqlist.info['Name']) + '.dat'
         datfile = self.info['DatOut']
         if os.path.exists(datfile): rje.backup(self,datfile)
         if store: seqlist.obj['UniProt'] = uniprot
         ## ~ [1c] Setup RJE_HMM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if 'pfam' in unifake:
             hmm = rje_hmm.HMMRun(self.log,self.cmd_list+['force=T'])
             hmmfile = '%s.pfam.tdt' % rje.baseFile(datfile)
             if os.path.exists(hmmfile): rje.backup(self,hmmfile)
             hmm.list['HMM'] = [self.info['PFam']]
             hmm.opt['HMMPFam'] = True
         else: hmm = None
         ## ~ [1d] Setup RJE_TM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if 'signalp' in unifake: tm = rje_tm.TM(self.log,self.cmd_list)
         else: tm = None
         ### ~ [2] ~ Perform UniFake processing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for seq in seqs:
             sx += 1
             name = seq.shortName()                    
             self.printLog('#SEQ','Processing %s (%s aa) %s...' % (seq.shortName(),rje.integerString(seq.aaLen()),seq.info['Description'][:50]))
             try:
                 ## ~ [2a] ~ Basic data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 utmp = 'tmp%s.%s' % (rje.randomString(5),seq.info['AccNum'])
                 open('%s.fas' % utmp,'w').write('>%s\n%s\n' % (seq.shortName(),seq.info['Sequence']))
                 udata = {'CC':['-!- Features generated using unifake.py'],'AC':[]}
                 if seq.info['SpecCode'] in ['Unknown','UNK']: seq.info['SpecCode'] = self.info['SPCode']
                 #x#elif seq.info['Species'] != 'None': udata['OS'] = [seq.info['Species']]     #!# Check how well this works. Add spectable? #!#
                 ## ~ [2b] ~ Aliases ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if self.opt['EnsDat'] and rje.matchExp('\[acc:(\S+) pep:(\S+) gene:(\S+)\]',seq.info['Name']):
                     details = rje.matchExp('\[acc:(\S+) pep:(\S+) gene:(\S+)\]',seq.info['Name'])
                     self.addAlias(seq.info['AccNum'],details[0])
                     self.addAlias(seq.info['AccNum'],details[1])
                     self.addAlias(seq.info['AccNum'],details[2])
                     udata['GN'] = [details[2]]
                 for id in [seq.shortName(),seq.info['AccNum']]:
                     if id in self.dict['Aliases']: udata['AC'].append('%s;' % string.join(self.dict['Aliases'][id],'; '))
                 ## ~ [2c] ~ Features ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 ft = []     # List of features for sequence
                 for id in [seq.shortName(),seq.info['AccNum'],seq.info['ID']]:
                     if id in self.dict['Features']: ft += self.dict['Features'][id]                        
                 ## ~ [2d] IUPRED disorder prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if 'disorder' in self.list['UniFake']:
                     try:
                         seq.disorder()
                         dis = seq.obj['Disorder']
                         for disorder in seq.obj['Disorder'].list['RegionDisorder']:
                             ft.append({'Type':'DISORDER','Desc':'Predicted disorder: %s' % seq.obj['Disorder'].info['Disorder'],'Start':disorder[0],'End':disorder[1]})
                             if dis.info['Disorder'].lower() == 'iupred': ft[-1]['Desc'] = '%s > %.2f' % (ft[-1]['Desc'],dis.stat['IUCut'])
                         for fold in seq.obj['Disorder'].list['RegionFold']:
                             ft.append({'Type':'ORDER','Desc':'Predicted order: %s' % seq.obj['Disorder'].info['Disorder'],'Start':fold[0],'End':fold[1]})
                             if dis.info['Disorder'].lower() == 'iupred': ft[-1]['Desc'] = '%s <= %.2f' % (ft[-1]['Desc'],dis.stat['IUCut'])
                     except: self.log.errorLog('UniFake disorder problem for %s.' % name)
                 ## ~ [2e] PFam HMM domain prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if hmm:
                     try:
                         hmm.setInfo({'SearchDB':'%s.fas' % utmp,'HMMOut':'%s.hmm.out' % utmp})      # This will be made for each sequence                    
                         hmm.search = []
                         hmm.list['HMMRes'] = [hmm.hmmSearch(self.info['PFam'],outfile=hmm.info['HMMOut'])]   # Used in hmmTable
                         hmm.hmmTable(outfile=hmmfile,append=True)
                         if 'disorder' in self.list['UniFake']: disorder = seq.obj['Disorder'].list['ResidueDisorder']          # individual (IUPRed) residue results
                         else: disorder = []
                         if hmm.search: udata['CC'].append('PFam: HMMer PFam search vs %s (Modified %s)' % (self.info['PFam'],time.ctime(os.path.getmtime(self.info['PFam']))))
                         else:
                             udata['CC'].append('-!- ERROR: PFam HMMer Search failure!')
                             out = {'Type':'!ERROR!','Name':name}
                             rje.delimitedFileOutput(self,hmmfile,['Type','Name','Start','End','Eval','Score'],datadict=out)
                         for search in hmm.search:
                             for hit in search.hit:
                                 for aln in hit.aln:
                                     pfamft = {'Start':aln.stat['SbjStart'],'End':aln.stat['SbjEnd'],'Type':'PFAM',
                                                'Desc':'%s PFam HMM Eval: %.2e; Score: %.1f' % (search.info['Name'],aln.stat['Expect'],aln.stat['BitScore'])}
                                     if disorder:
                                         region = disorder[aln.stat['SbjStart']-1:aln.stat['SbjEnd']]
                                         hmmdisorder = float(sum(region)) / len(region)
                                         pfamft['Desc'] = '%s; IUPRed: %.2f' % (pfamft['Desc'],hmmdisorder)
                                         if hmmdisorder < self.stat['DisDom']: pfamft['Type'] = 'DOMAIN'
                                     ft.append(pfamft)
                     except: self.log.errorLog('UniFake PFam HMM problem for %s.' % name)                  
                 ## ~ [2f] TMHMM transmembrane topology prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if 'tmhmm' in unifake:
                     try:
                         tmdat = os.popen('%s %s.fas -short' % (self.info['TMHMM'],utmp)).readlines()
                         domlist = rje_tm.domainList(rje_tm.parseTMHMM(tmdat[0]))
                         for tmdom in domlist:
                             ft.append(tmdom)
                             ft[-1]['Desc'] = 'TMHMM topology prediction'
                             ft[-1]['Start'] = string.atoi(ft[-1]['Start'])
                             ft[-1]['End'] = string.atoi(ft[-1]['End'])
                         if len(domlist) > 1: udata['CC'].append('TMHMM: %d TM domains; N-Term %s' % ((len(domlist)-1)/2,domlist[0]['Type']))
                         else: udata['CC'].append('TMHMM: 0 TM domains')
                     except: self.log.errorLog('UniFake TMHMM problem for %s.' % name)
                 ## ~ [2g] SIGNALP signal peptide prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if 'signalp' in unifake:
                     try:
                         os.system('%s -f short -t euk %s.fas > %s.signalp' % (self.info['SignalP'],utmp,utmp))
                         tm.signalp = {}
                         tm.parseSignalP('%s.signalp' % utmp)
                         sigp = tm.signalp.pop(seq.shortName())
                         cpos = 0
                         if sigp['nn_ymax?'] == 'Y':
                             cpos = string.atoi(sigp['nn_ymaxpos'])
                             desc = 'SignalP NN prediction'
                         if sigp['hmm_cmax?'] == 'Y':
                             hmm_c = string.atoi(sigp['hmm_cmaxpos'])
                             if cpos == 0:
                                 cpos = hmm_c
                                 desc = 'SignalP HMM prediction'
                             else:
                                 if hmm_c < cpos:
                                     cpos = hmm_c
                                     desc = 'SignalP HMM prediction (NN also Y)'
                                 else: desc += ' (HMM also Y)'
                         if cpos > 0: ft.append({'Type':'SIGNALP','Desc':desc,'Start':1,'End':cpos})
                     except: self.log.errorLog('UniFake SignalP problem for %s.' % name)
                 ## ~ [2h] Convert to UniProt and save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 self.addRealUniProt(seq,udata,ft)
                 self.deBug(ft)
                 if not store: uniprot.list['Entry'] = []
                 if uniprot.addFromSeq(seq,data=udata,ft=ft):    ### Converts into UniProtEntry object 
                     if not store: uniprot.saveUniProt(datfile,append=True)
                     #x#open(self.info['DatPickup'],'a').write('%s\n' % seq.shortName())
             ## ~ [2i] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             except: self.log.errorLog('Problem during UniFake(%s)' % name)
             for tmp in glob.glob('%s*' % utmp): os.unlink(tmp)
             self.printLog('#UNIFAKE','|---------- %s run <<<|>>> %s to go -----------|' % (rje.integerString(sx),rje.integerString(seqnum-sx)),log=False)
         if store: uniprot.saveUniProt(datfile,append=False)
         if self.opt['CleanUp']:
             for tmp in glob.glob('TMHMM*'):
                 if os.path.isdir(tmp): os.rmdir(tmp)            
     except: self.errorLog('Oh, the shame of it! Trouble during UniFake.uniFake()')
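A minimal standalone sketch of the disorder handling in step [2d] above: per-residue scores are thresholded into contiguous DISORDER/ORDER features with 1-based positions, as done for the IUPred results. The score list and the 0.5 cutoff are illustrative; this is not the rje_disorder interface.

def disorder_features(scores, cutoff=0.5):
    '''Return feature dicts for runs of residues above (DISORDER) or at/below (ORDER) the cutoff.'''
    ft = []
    start = 0
    for i in range(1, len(scores) + 1):
        if i == len(scores) or (scores[i] > cutoff) != (scores[start] > cutoff):
            ftype = 'DISORDER' if scores[start] > cutoff else 'ORDER'
            ft.append({'Type': ftype, 'Start': start + 1, 'End': i,
                       'Desc': 'Predicted %s (cutoff %.2f)' % (ftype.lower(), cutoff)})
            start = i
    return ft

print(disorder_features([0.2, 0.3, 0.8, 0.9, 0.7, 0.1]))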
Example #34
0
    def codons(self):  ### Main codons analysis method
        '''Main codons analysis method.'''
        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            flybase = rje.makePath('/scratch/Databases/NewDB/FlyBase/Fasta/')
            scmd = ['accnr=F','seqnr=F','gnspacc=F']
            cds = rje_seq.SeqList(self.log, self.cmd_list+['seqin=%sdmel-all-CDS-r5.5.fasta' % flybase]+scmd)
            gcode = rje_sequence.genetic_code

            ### ~ [1] ~ Make codon frequency tables (a) Observed, (b) Based on NTFreq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            nts = ['A','C','G','T']
            ntfreq = cds.aaFreq(alphabet=nts)
            codons = []     # List of codons
            obs_cfreq = {}  # Observed codon frequencies
            nts_cfreq = {}  # Codon frequencies from NT frequencies
            obs_tfreq = {}  # Observed triplet frequencies
            nts_tfreq = {}  # Predicted triplet frequencies from NT frequencies
            ocd_tfreq = {}  # Predicted triplet frequencies from observed codon frequencies
            ncd_tfreq = {}  # Predicted triplet frequencies from nt-predicted codon frequencies
            ## ~ [1a] ~ Setup dictionaries using nt freqs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            for n1 in nts:
                for n2 in nts:
                    for n3 in nts:
                        cod = '%s%s%s' % (n1,n2,n3)
                        codons.append(cod)
                        aa = gcode[string.replace(cod,'T','U')]
                        if aa not in obs_cfreq: obs_cfreq[aa] = {}
                        if aa not in nts_cfreq: nts_cfreq[aa] = {}
                        obs_cfreq[aa][cod] = 0.0
                        nts_cfreq[aa][cod] = ntfreq[n1] * ntfreq[n2] * ntfreq[n3]
                        obs_tfreq[cod] = 0.0
                        nts_tfreq[cod] = ntfreq[n1] * ntfreq[n2] * ntfreq[n3]
                        ocd_tfreq[cod] = 0.0
                        ncd_tfreq[cod] = 0.0
            nts_tfreq = rje.dictFreq(nts_tfreq,total=False)                                 # Normalise triplet freq.
            for aa in nts_cfreq: nts_cfreq[aa] = rje.dictFreq(nts_cfreq[aa],total=False)    # Normalise codon freq.
            self.log.printLog('#FREQ','Frequency dictionaries set up.')
            ## ~ [1b] ~ Observed codon freq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            (sx,stot) = (0.0,cds.seqNum())
            for seq in cds.seq[0:]:
                self.log.printLog('\r#OBS','Calculating observed codon frequencies: %.1f%%' % (sx/stot),newline=False,log=False)
                sx += 100.0
                try: (id,scaffold,pos,name,glen,parent) = rje.matchExp('^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);.+parent=(\S+),\S+;',seq.info['Name'])
                except:
                    self.log.errorLog(seq.info['Name'])
                    raise
                try: exons = rje.matchExp('^complement\((\d+\..*\.\d+)\)',pos)[0]
                except:
                    try: exons = rje.matchExp('^join\((\d+\..*\.\d+)\)',pos)[0]
                    except: exons = rje.matchExp('^(\d+\.\.\d+)',pos)[0]
                self.deBug(exons)
                exons = string.split(exons,',')
                elen = []
                try:
                    for exon in exons:
                        (start,end) = string.split(exon,'..')
                        elen.append(string.atoi(end) - string.atoi(start) + 1)
                except:
                    self.log.errorLog(id)
                    cds.seq.remove(seq)
                    continue
                        
                if pos[:4] == 'comp': elen.reverse()
                seq.list['ExonLen'] = elen
                self.deBug(elen)
                if sum(elen) != seq.aaLen(): self.log.errorLog('%s exon length error' % id,printerror=False)
                if seq.aaLen()/3 != seq.aaLen()/3.0:
                    self.log.errorLog('%s not a multiple of 3nt long!' % id,printerror=False)
                    cds.seq.remove(seq)
                    continue
                #!# Add use exon option - single full-length exon if false (mature mRNA) #!#
                sequence = seq.info['Sequence'][0:]
                if string.count(sequence,'N') > 0:
                    self.log.errorLog('%s has 1+ Ns!' % id,printerror=False)
                    cds.seq.remove(seq)
                    continue
                while sequence:
                    cod = sequence[:3]
                    sequence = sequence[3:]
                    aa = gcode[string.replace(cod,'T','U')]
                    obs_cfreq[aa][cod] += 1
            for aa in obs_cfreq: obs_cfreq[aa] = rje.dictFreq(obs_cfreq[aa],total=False)    # Normalise codon freq.
            self.log.printLog('\r#OBS','Calculating observed codon frequencies complete.')

            ### ~ [2] ~ Generate Triplet freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (sx,stot) = (0.0,cds.seqNum())
            for seq in cds.seq:
                self.log.printLog('\r#TRIP','Calculating triplet frequencies: %.1f%%' % (sx/stot),newline=False,log=False)
                sx += 100.0
                elen = seq.list['ExonLen'] 
                sequence = seq.info['Sequence'][0:]
                aa = ''
                cod = ''
                ax = 0      # Measure sequence length processed for exon boundary checks
                while sequence:
                    prevcod = cod
                    cod = sequence[:3]
                    prevaa = aa
                    sequence = sequence[3:]
                    aa = gcode[string.replace(cod,'T','U')]
                    ## ~ [2a] ~ Predicted Triplet Freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    for cod2 in obs_cfreq[aa]:
                        if elen[0] > ax + 3:    # Exon boundary beyond this codon
                            ocd_tfreq[cod2] += obs_cfreq[aa][cod2]
                            ncd_tfreq[cod2] += nts_cfreq[aa][cod2]
                        if prevaa:              # Look at overlap with previous codon
                            for cod1 in obs_cfreq[prevaa]:
                                for i in range(1,3):
                                    if elen[0] > ax + i:    # Exon boundary beyond overlap
                                        acod = cod1[i:] + cod2[:i]
                                        ocd_tfreq[acod] += (obs_cfreq[prevaa][cod1] * obs_cfreq[aa][cod2])
                                        ncd_tfreq[acod] += (nts_cfreq[prevaa][cod1] * nts_cfreq[aa][cod2])
                    ## ~ [2b] ~ Observed Triplet Freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    if elen[0] > ax + 3:    # Exon boundary beyond this codon
                        obs_tfreq[cod] += 1
                    if prevcod:              # Look at overlap with previous codon
                        for i in range(1,3):
                            if elen[0] > ax + i:    # Exon boundary beyond overlap
                                acod = prevcod[i:] + cod[:i]
                                obs_tfreq[acod] += 1
                    # Check exons #
                    ax += 3
                    if ax >= elen[0]: ax -= elen.pop(0)
            obs_tfreq = rje.dictFreq(obs_tfreq,total=False)
            ocd_tfreq = rje.dictFreq(ocd_tfreq,total=False)
            ncd_tfreq = rje.dictFreq(ncd_tfreq,total=False)    
            self.log.printLog('\r#TRIP','Calculating triplet frequencies complete.')

            ### ~ [3] ~ Output results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            headers = ['Triplet','AA','Degen','Obs_Codon','NT_Codon','Obs_Trip','NT_Trip','ObCod_Trip','NTCod_Trip']
            tfile = 'quad_triplet.tdt'
            rje.delimitedFileOutput(self,tfile,headers,rje_backup=True)
            for cod in codons:
                aa = gcode[string.replace(cod,'T','U')]
                datadict = {'Triplet':cod,'AA':aa,'Degen':len(obs_cfreq[aa]),'Obs_Codon':obs_cfreq[aa][cod],
                            'NT_Codon':nts_cfreq[aa][cod],'Obs_Trip':obs_tfreq[cod],'NT_Trip':nts_tfreq[cod],
                            'ObCod_Trip':ocd_tfreq[cod],'NTCod_Trip':ncd_tfreq[cod]}
                rje.delimitedFileOutput(self,tfile,headers,datadict=datadict)
            self.log.printLog('#OUT','Triplet & codon data output to %s' % tfile)
        except: self.log.errorLog(rje_zen.Zen().wisdom())
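A minimal standalone sketch of the observed codon counting in step [1b] above: a CDS is read in non-overlapping triplets and the counts are normalised to frequencies. Grouping by amino acid, as obs_cfreq does, would additionally require a genetic code table, which is omitted here; the test CDS is invented.

from collections import Counter

def codon_freq(cds):
    '''Return {codon: frequency} over non-overlapping triplets of a CDS.'''
    codons = [cds[i:i + 3] for i in range(0, len(cds) - len(cds) % 3, 3)]
    total = float(len(codons))
    return dict((cod, n / total) for cod, n in Counter(codons).items())

print(codon_freq('ATGGCTGCAGCTTAA'))  # e.g. {'ATG': 0.2, 'GCT': 0.4, 'GCA': 0.2, 'TAA': 0.2}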
Example #35
0
    def codons(self):  ### Main codons analysis method
        '''Main codons analysis method.'''
        try:  ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            flybase = rje.makePath('/scratch/Databases/NewDB/FlyBase/Fasta/')
            scmd = ['accnr=F', 'seqnr=F', 'gnspacc=F']
            cds = rje_seq.SeqList(
                self.log, self.cmd_list +
                ['seqin=%sdmel-all-CDS-r5.5.fasta' % flybase] + scmd)
            gcode = rje_sequence.genetic_code

            ### ~ [1] ~ Make codon frequency tables (a) Observed, (b) Based on NTFreq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            nts = ['A', 'C', 'G', 'T']
            ntfreq = cds.aaFreq(alphabet=nts)
            codons = []  # List of codons
            obs_cfreq = {}  # Observed codon frequencies
            nts_cfreq = {}  # Codon frequencies from NT frequencies
            obs_tfreq = {}  # Observed triplet frequencies
            nts_tfreq = {}  # Predicted triplet frequencies from NT frequencies
            ocd_tfreq = {}  # Predicted triplet frequencies from observed codon frequencies
            ncd_tfreq = {}  # Predicted triplet frequencies from nt-predicted codon frequencies
            ## ~ [1a] ~ Setup dictionaries using nt freqs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            for n1 in nts:
                for n2 in nts:
                    for n3 in nts:
                        cod = '%s%s%s' % (n1, n2, n3)
                        codons.append(cod)
                        aa = gcode[string.replace(cod, 'T', 'U')]
                        if aa not in obs_cfreq: obs_cfreq[aa] = {}
                        if aa not in nts_cfreq: nts_cfreq[aa] = {}
                        obs_cfreq[aa][cod] = 0.0
                        nts_cfreq[aa][
                            cod] = ntfreq[n1] * ntfreq[n2] * ntfreq[n3]
                        obs_tfreq[cod] = 0.0
                        nts_tfreq[cod] = ntfreq[n1] * ntfreq[n2] * ntfreq[n3]
                        ocd_tfreq[cod] = 0.0
                        ncd_tfreq[cod] = 0.0
            nts_tfreq = rje.dictFreq(nts_tfreq,
                                     total=False)  # Normalise triplet freq.
            for aa in nts_cfreq:
                nts_cfreq[aa] = rje.dictFreq(
                    nts_cfreq[aa], total=False)  # Normalise codon freq.
            self.log.printLog('#FREQ', 'Frequency dictionaries set up.')
            ## ~ [1b] ~ Observed codon freq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            (sx, stot) = (0.0, cds.seqNum())
            for seq in cds.seq[0:]:
                self.log.printLog('\r#OBS', 'Calculating observed codon frequencies: %.1f%%' % (sx / stot), newline=False, log=False)
                sx += 100.0
                try:
                    (id, scaffold, pos, name, glen, parent) = rje.matchExp(
                        '^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);.+parent=(\S+),\S+;',
                        seq.info['Name'])
                except:
                    self.log.errorLog(seq.info['Name'])
                    raise
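                # The FlyBase loc= field may be a complement(...), a join(...) of exons, or a simple start..end range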
                try:
                    exons = rje.matchExp('^complement\((\d+\..*\.\d+)\)',
                                         pos)[0]
                except:
                    try:
                        exons = rje.matchExp('^join\((\d+\..*\.\d+)\)', pos)[0]
                    except:
                        exons = rje.matchExp('^(\d+\.\.\d+)', pos)[0]
                self.deBug(exons)
                exons = string.split(exons, ',')
                elen = []
                try:
                    for exon in exons:
                        (start, end) = string.split(exon, '..')
                        elen.append(string.atoi(end) - string.atoi(start) + 1)
                except:
                    self.log.errorLog(id)
                    cds.seq.remove(seq)
                    continue

                if pos[:4] == 'comp': elen.reverse()
                seq.list['ExonLen'] = elen
                self.deBug(elen)
                if sum(elen) != seq.aaLen():
                    self.log.errorLog('%s exon length error' % id, printerror=False)
                if seq.aaLen() % 3 != 0:    # CDS length must be a multiple of 3
                    self.log.errorLog('%s not a multiple of 3nt long!' % id, printerror=False)
                    cds.seq.remove(seq)
                    continue
                #!# Add use exon option - single full-length exon if false (mature mRNA) #!#
                sequence = seq.info['Sequence'][0:]
                if string.count(sequence, 'N') > 0:
                    self.log.errorLog('%s has 1+ Ns!' % id, printerror=False)
                    cds.seq.remove(seq)
                    continue
                while sequence:
                    cod = sequence[:3]
                    sequence = sequence[3:]
                    aa = gcode[string.replace(cod, 'T', 'U')]
                    obs_cfreq[aa][cod] += 1
            for aa in obs_cfreq:
                obs_cfreq[aa] = rje.dictFreq(obs_cfreq[aa], total=False)  # Normalise codon freq.
            self.log.printLog('\r#OBS', 'Calculating observed codon frequencies complete.')

            ### ~ [2] ~ Generate Triplet freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (sx, stot) = (0.0, cds.seqNum())
            for seq in cds.seq:
                self.log.printLog('\r#TRIP', 'Calculating triplet frequencies: %.1f%%' % (sx / stot), newline=False, log=False)
                sx += 100.0
                elen = seq.list['ExonLen']
                sequence = seq.info['Sequence'][0:]
                aa = ''
                cod = ''
                ax = 0  # Measure sequence length processed for exon boundary checks
                while sequence:
                    prevcod = cod
                    cod = sequence[:3]
                    prevaa = aa
                    sequence = sequence[3:]
                    aa = gcode[string.replace(cod, 'T', 'U')]
                    ## ~ [2a] ~ Predicted Triplet Freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
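                    # Add expected frequencies for every synonymous codon here, plus the two frame-shifted
                    # triplets overlapping the previous codon, but only where the current exon extends beyond them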
                    for cod2 in obs_cfreq[aa]:
                        if elen[0] > ax + 3:  # Exon boundary beyond this codon
                            ocd_tfreq[cod2] += obs_cfreq[aa][cod2]
                            ncd_tfreq[cod2] += nts_cfreq[aa][cod2]
                        if prevaa:  # Look at overlap with previous codon
                            for cod1 in obs_cfreq[prevaa]:
                                for i in range(1, 3):
                                    if elen[0] > ax + i:  # Exon boundary beyond overlap
                                        acod = cod1[i:] + cod2[:i]
                                        ocd_tfreq[acod] += (
                                            obs_cfreq[prevaa][cod1] *
                                            obs_cfreq[aa][cod2])
                                        ncd_tfreq[acod] += (
                                            nts_cfreq[prevaa][cod1] *
                                            nts_cfreq[aa][cod2])
                    ## ~ [2b] ~ Observed Triplet Freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
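                    # Count the observed in-frame codon and the two triplets overlapping the previous codon,
                    # again only where the current exon extends beyond them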
                    if elen[0] > ax + 3:  # Exon boundary beyond this codon
                        obs_tfreq[cod] += 1
                    if prevcod:  # Look at overlap with previous codon
                        for i in range(1, 3):
                            if elen[0] > ax + i:  # Exon boundary beyond overlap
                                acod = prevcod[i:] + cod[:i]
                                obs_tfreq[acod] += 1
                    # Check exons #
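                    # Advance the processed length; once the current exon is fully consumed, drop it and
                    # carry any remainder into the next exon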
                    ax += 3
                    if ax >= elen[0]: ax -= elen.pop(0)
            obs_tfreq = rje.dictFreq(obs_tfreq, total=False)
            ocd_tfreq = rje.dictFreq(ocd_tfreq, total=False)
            ncd_tfreq = rje.dictFreq(ncd_tfreq, total=False)
            self.log.printLog('\r#TRIP',
                              'Calculating triplet frequencies complete.')

            ### ~ [3] ~ Output results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            headers = [
                'Triplet', 'AA', 'Degen', 'Obs_Codon', 'NT_Codon', 'Obs_Trip',
                'NT_Trip', 'ObCod_Trip', 'NTCod_Trip'
            ]
            tfile = 'quad_triplet.tdt'
            rje.delimitedFileOutput(self, tfile, headers, rje_backup=True)
            for cod in codons:
                aa = gcode[string.replace(cod, 'T', 'U')]
                datadict = {
                    'Triplet': cod,
                    'AA': aa,
                    'Degen': len(obs_cfreq[aa]),
                    'Obs_Codon': obs_cfreq[aa][cod],
                    'NT_Codon': nts_cfreq[aa][cod],
                    'Obs_Trip': obs_tfreq[cod],
                    'NT_Trip': nts_tfreq[cod],
                    'ObCod_Trip': ocd_tfreq[cod],
                    'NTCod_Trip': ncd_tfreq[cod]
                }
                rje.delimitedFileOutput(self, tfile, headers, datadict=datadict)
            self.log.printLog('#OUT', 'Triplet & codon data output to %s' % tfile)
        except:
            self.log.errorLog(rje_zen.Zen().wisdom())
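
Note (illustrative aside, not part of the original module): the NT-based expectations in section [1] above come from multiplying the three mononucleotide frequencies of each codon and then renormalising within each amino acid's synonymous codon family (what rje.dictFreq(..., total=False) does per amino acid). A minimal standalone sketch of that calculation, using made-up nt_freq values and a truncated CODON_TABLE as stand-ins:

# Minimal sketch of the zero-order expected codon usage calculation (illustrative only).
# The nt_freq values and truncated CODON_TABLE below are stand-ins, not real data.
nt_freq = {'A': 0.30, 'C': 0.20, 'G': 0.22, 'T': 0.28}
CODON_TABLE = {'TTT': 'F', 'TTC': 'F',
               'AAA': 'K', 'AAG': 'K',
               'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'}

def expected_codon_freq(nt_freq, codon_table):
    '''Expected codon usage per amino acid under a zero-order (independent nucleotide) model.'''
    expected = {}                                       # {aa: {codon: freq}}
    for codon, aa in codon_table.items():
        raw = nt_freq[codon[0]] * nt_freq[codon[1]] * nt_freq[codon[2]]
        expected.setdefault(aa, {})[codon] = raw
    for codons in expected.values():                    # Renormalise within each synonymous family
        total = sum(codons.values())
        for codon in codons: codons[codon] /= total
    return expected

# expected_codon_freq(nt_freq, CODON_TABLE)['K'] -> {'AAA': ~0.577, 'AAG': ~0.423}
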
Example #36
    def mapPhosByBLAST(self,fasfile):   ### BLAST sequences against phosphoDB, align hits & mark sites (ID & Homology)
        '''BLAST sequences against phosphoDB, align hits and mark phosphosites (ID & Homology).'''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [1a] Setup fasfile ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            scmd = self.cmd_list + ['seqin=%s' % fasfile,'autoload=T','autofilter=F']
            qseqlist = rje_seq.SeqList(self.log,scmd)
            qdict = qseqlist.seqNameDic()
            ## ~ [1b] Setup results files/directories ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            basefile = rje.baseFile(fasfile)
            if self.info['PhosRes'].lower() in ['','none']: self.info['PhosRes'] = '%s.phosres.tdt' % basefile
            headers = ['Name','Pos','AA','PELM','PELMPos','Evidence']
            delimit = rje.getDelimit(self.cmd_list,rje.delimitFromExt(filename=self.info['PhosRes']))
            rje.delimitedFileOutput(self,self.info['PhosRes'],headers,delimit,rje_backup=True)
            ppath = rje.makePath('PhosALN')
            rje.mkDir(self,ppath)
            ## ~ [1c] Setup BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            pblast = rje_blast.BLASTRun(self.log,self.cmd_list+['formatdb=F'])
            pblast.setInfo({'Name':'%s.p.blast' % rje.baseFile(fasfile),'DBase':self.info['PELMFas'],'InFile':fasfile})
            pblast.setStat({'HitAln':pblast.stat['OneLine']})
            pblast.opt['Complexity Filter'] = False
            pblast.formatDB(force=False)
            ## ~ [1d] Setup GABLAM Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            gkey = 'GABLAMO ID' #x# % self.info['GABLAMO Key']
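            # Convert fractional identity/homology cutoffs to percentages and floor them at zero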
            for g in ['ID','Hom']:
                if self.stat['%sSim' % g] < 1.0: self.stat['%sSim' % g] *= 100.0
                self.stat['%sSim' % g] = max(0.0,self.stat['%sSim' % g])

            ### ~ [2] PhosphoBLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            pblast.blast(use_existing=True,log=True)    # BLAST
            pblast.readBLAST(gablam=True)               # Read in
            while pblast.search:
                ## ~ [2a] Align relevant hits from each BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                search = pblast.search.pop(0)
                qseq = qdict[search.info['Name']]
                idlist = []
                qlen = qseq.aaLen()
                hitdict = search.hitSeq(self.obj['SeqList'])
                aln = rje_seq.SeqList(self.log,self.cmd_list+['autoload=F','autofilter=F'])
                aln.seq = [qseq]
                pdict = {}      # Dictionary of {hseq:[poslist]}
                rdict = {qseq:0}      # Dictionary of {hseq:res}
                for hit in search.hit[0:]:
                    hseq = hitdict[hit]
                    pdict[hseq] = []
                    for pos in rje.sortKeys(self.dict['PhosphoSites'][hseq.info['AccNum']]): pdict[hseq].append(pos)
                    if hit.info['Name'] == search.info['Name']:
                        if qseq.getSequence(case=False,gaps=False) != hseq.getSequence(case=False,gaps=False):
                            self.log.errorLog('Major problem: Search/Hit sequence mismatch for same sequence "%s"' % hit.info['Name'])
                        idlist.append(qseq)
                        pdict[qseq] = pdict.pop(hseq)
                        continue
                    gdict = hit.globalFromLocal(qlen)
                    qvh = float(100 * gdict['Query'][gkey]) / float(qlen)
                    if qvh < self.stat['HomSim']:
                        pdict.pop(hseq)
                        continue
                    aln.seq.append(hseq)
                    if (qseq.sameSpec(hseq) or not self.opt['UseSpec']) and qvh >= self.stat['IDSim']: idlist.append(hseq)
                    rdict[hseq] = 0
                aln.muscleAln()   #x#outfile='%s%s.phosaln.fas' % (ppath,qseq.info['AccNum']))
                aln._addSeq('PhosAln','-' * qseq.seqLen())
                aln.info['Name'] = '%s%s.phosaln.fas' % (ppath,qseq.info['AccNum'])
                ## ~ [2b] Map phosphorylations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                print '>>>\n', aln.seq, pdict.keys(), rdict.keys()
                for a in range(qseq.seqLen()):
                    if qseq.info['Sequence'][a] != '-': rdict[qseq] += 1
                    for hseq in pdict:
                        if hseq.info['Sequence'][a] == '-': continue
                        if hseq != qseq: rdict[hseq] += 1
                        if rdict[hseq] in pdict[hseq] and qseq.info['Sequence'][a] == hseq.info['Sequence'][a]:  # Phosphosite
                            pdata = {'Name':search.info['Name'],'Pos':rdict[qseq],'AA':qseq.info['Sequence'][a],
                                     'PELM':hseq.shortName(),'PELMPos':rdict[hseq],'Evidence':'Hom'}
                            if hseq == qseq: pdata['Evidence'] = 'Self'
                            elif hseq in idlist: pdata['Evidence'] = 'ID'
                            rje.delimitedFileOutput(self,self.info['PhosRes'],headers,delimit,pdata)
                            self.addPhos(aln.seq[-1],a,pdata['Evidence'])
                ## ~ [2c] Add Scansite/NetPhos if made? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                ## ~ [2d] Save alignment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                aln.saveFasta()


            # Align hits for each > X %ID
            # Map phosphosites onto alignment and output #
            
            return
        except: self.log.errorLog('Problem during PhosphoSeq.mapPhosByBLAST')
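
Note (illustrative aside, not part of the original class): the core of step [2b] above is a column walk over the alignment that keeps a running count of ungapped residues for each sequence and transfers known hit phosphosite positions onto the query wherever the aligned residues match. A minimal standalone sketch of that mapping, with made-up inputs in the usage comment:

def map_phosphosites(query_aln, hit_aln, hit_sites):
    '''Map phosphosites from an aligned hit onto an aligned query (sketch, assumed inputs).
    query_aln/hit_aln = aligned sequences of equal length ('-' = gap);
    hit_sites = set of 1-based ungapped hit positions known to be phosphorylated.
    Returns [(query_pos, residue)] where the aligned residues are identical.'''
    mapped = []
    qpos = hpos = 0                         # Running ungapped residue counts (cf. rdict above)
    for qres, hres in zip(query_aln, hit_aln):
        if qres != '-': qpos += 1
        if hres != '-': hpos += 1
        if qres != '-' and hres != '-' and hpos in hit_sites and qres == hres:
            mapped.append((qpos, qres))     # Conserved phosphosite position on the query
    return mapped

# map_phosphosites('MKT-SPR', 'MKTASPR', {5}) -> [(4, 'S')]
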
Example #37
    def run(self):  ### Main run method
        '''Main run method.'''
        try:  ### ~ [1] Reformat Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for fasta in glob.glob('*.fasta'):
                fas = fasta[:-2]    # X.fasta -> X.fas
                if os.path.exists(fas): continue
                sx = 0
                for line in open(fasta, 'r').readlines():
                    if line[:1] == '>':
                        try:
                            (name, desc) = rje.matchExp('^>(\S+) (\S.+)$', line)
                        except:
                            name = rje.matchExp('^>(\S+)', line)[0]; desc = ''   # No description in header
                        if len(string.split(name, '|')) == 3:
                            name = '6rf_NEIME__%s' % string.split(name, '|')[2]
                            open(fas, 'a').write('>%s\n' % name)
                        elif len(string.split(name, '|')) == 5:
                            name = 'ref_NEIME__%s' % string.split(name, '|')[3]
                            open(fas, 'a').write('>%s %s\n' % (name, desc))
                        else:
                            print string.split(name, '|')
                            raise ValueError
                        self.progLog('\r#FAS', 'Processing %s: %s seqs' % (fas, rje.integerString(sx)))
                        sx += 1
                    else:
                        open(fas, 'a').write(line)
                self.printLog('\r#FAS', 'Processed %s: %s seqs from %s' % (fas, rje.integerString(sx), fasta))
                rje_blast.BLASTRun(self.log, self.cmd_list).formatDB(fas, protein=True, force=True)
            ### ~ [2] Read in CSV Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            rfhits = {}  # Dictionary of {hit:['File:hit_num']}
            acc = 'MC58_6RF_Hits.acc'
            open(acc, 'w').close()    # Create/reset the accession output file before appending below
            gfile = 'MC58_6RF_Hits.vs.MC58_1.hitsum.tdt'
            cx = 0
            for csv in glob.glob('MC58_6RF_CSV/*.CSV'):
                cx += 1
                file = os.path.basename(csv)[:-4]
                hits = False
                for line in open(csv, 'r').readlines():
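                    # Hit rows follow the 'prot_hit_num,prot_acc' header line in each CSV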
                    if line.find('prot_hit_num,prot_acc') == 0: hits = True
                    elif hits:
                        data = rje.readDelimit(line, ',')
                        if len(data) < 2: continue
                        [num, name] = data[:2]
                        try:
                            name = string.split(name, '|')[2]
                        except:
                            continue
                        if name not in rfhits:
                            open(acc, 'a').write('6rf_NEIME__%s\n' % name)
                            rfhits[name] = []
                        id = '%s:%s' % (file, num)
                        if id not in rfhits[name]: rfhits[name].append(id)
                        self.progLog('\r#CSV', 'Reading %d CSV files: %s 6RF Hits' % (cx, rje.integerString(len(rfhits))))
            self.printLog('\r#CSV', 'Read %d CSV files: %s 6RF Hits output to %s' % (cx, rje.integerString(len(rfhits)), acc))
            ### ~ [3] Extract sequences and perform GABLAM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if not os.path.exists(gfile):
                seqlist = rje_seq.SeqList(
                    self.log, self.cmd_list + [
                        'seqin=%s' % acc, 'fasdb=MC58_6RF.fas',
                        'seqout=MC58_6RF_Hits.fas', 'autoload=T', 'accnr=F',
                        'seqnr=F'
                    ])
                seqlist.info['Name'] = 'MC58_6RF_Hits.fas'
                seqlist.saveFasta()
                gablam.GABLAM(
                    self.log, self.cmd_list + [
                        'seqin=MC58_6RF_Hits.fas', 'searchdb=MC58_1.fas',
                        'qryacc=F'
                    ]).gablam()
            ### ~ [4] Read in GABLAM and ID Hits without genomic homology ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            gdata = rje.dataDict(self, gfile, ['Qry'], ['HitNum'])
            zeros = []
            for hit in gdata:
                if string.atoi(gdata[hit]['HitNum']) == 0: zeros.append(hit)
            zeros = rje.sortUnique(zeros, False)
            open('6rf_zeros.acc', 'w').write(string.join(zeros, '\n'))
            self.printLog('#ZERO', '%d 6RF hits with 0 BLAST hits to MC58_1' % len(zeros))
            ufile = 'MC58_6RF_Zeros.vs.embl_bacteria.hitsum.tdt'
            if not os.path.exists(ufile):
                seqlist = rje_seq.SeqList(
                    self.log, self.cmd_list + [
                        'seqin=6rf_zeros.acc', 'fasdb=MC58_6RF.fas',
                        'seqout=MC58_6RF_Zeros.fas', 'autoload=T', 'accnr=F',
                        'seqnr=F'
                    ])
                seqlist.info['Name'] = 'MC58_6RF_Zeros.fas'
                seqlist.saveFasta()
                gablam.GABLAM(
                    self.log, self.cmd_list + [
                        'seqin=MC58_6RF_Zeros.fas',
                        'searchdb=/scratch/Databases/NewDB/TaxaDB/embl_bacteria.fas',
                        'qryacc=F'
                    ]).gablam()
            gdata = rje.dataDict(self, ufile, ['Qry'], getheaders=True)
            fdata = rje.dataDict(self,
                                 string.replace(ufile, 'hitsum', 'gablam'),
                                 ['Qry'], ['Hit'],
                                 lists=True)
            headers = gdata.pop('Headers')
            headers.insert(1, 'Sample')
            headers.append('BestHit')
            rje.delimitedFileOutput(self, 'MC58_6RF_Zeros.tdt', headers, rje_backup=True)
            for rf in rje.sortKeys(gdata):
                rfcut = string.split(rf, '__')[1]
                gdata[rf]['Sample'] = string.join(rfhits[rfcut], '; ')
                gdata[rf]['Qry'] = rfcut
                try:
                    gdata[rf]['BestHit'] = fdata[rf]['Hit'][0]
                except:
                    gdata[rf]['BestHit'] = '-'
                rje.delimitedFileOutput(self, 'MC58_6RF_Zeros.tdt', headers, datadict=gdata[rf])

        except:
            self.errorLog(rje_zen.Zen().wisdom())
        self.printLog('#ZEN', rje_zen.Zen().wisdom())
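
Note (illustrative aside, not part of the original run() method): the header rewriting in section [1] keys off the number of pipe-delimited fields in the FASTA identifier; 3-field names become 6-frame ORF entries and drop the description, while 5-field names become reference entries and keep it. A minimal sketch of that rule, with a made-up identifier in the usage comment:

def rename_header(name, desc=''):
    '''Rewrite a pipe-delimited FASTA identifier as in section [1] above (sketch).'''
    fields = name.split('|')
    if len(fields) == 3:                    # Six-frame translation entry: keep accession only
        return '>6rf_NEIME__%s' % fields[2]
    elif len(fields) == 5:                  # Reference protein entry: keep description
        return '>ref_NEIME__%s %s' % (fields[3], desc)
    raise ValueError('Unexpected identifier format: %s' % name)

# rename_header('lcl|contig_1|ORF00042') -> '>6rf_NEIME__ORF00042'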