Example #1
 def tabulatePPIRegion(self):    ### Tabulates regions of known PPI from DAT file
     '''Tabulates regions of known PPI from DAT file.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         tabfile = 'ppi_region.tdt'
         unifile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/UniFake/Human/ens_HUMAN.unifake.dat'
         if os.path.exists(tabfile) and not self.opt['Force']: return self.printLog('#REGTAB','%s found. (Force=F)' % tabfile)
         headers = ['Protein','Start','End','Interactor']
         rje.delimitedFileOutput(self,tabfile,headers,rje_backup=True)
         ### ~ [2] Extract and tabulate data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
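         # grep pipeline: '-P' enables Perl-style patterns, '-i' makes the HUMAN/interact filter case-insensitive,
         # and '-B 1' keeps the line before each REGION line (normally the protein's 'ID   ' line) for mapping below.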
         gcmd = "grep -P '(ID   |REGION)' %s | grep -P '(HUMAN|interact)' -i | grep REGION -B 1" % unifile
         self.printLog('#GREP',gcmd)
         prot = None; rx = 0; plist = []; ilist = []
         for gline in os.popen(gcmd).readlines():
             if rje.matchExp('ID   (\S+)',gline): prot = rje.matchExp('ID   (\S+)',gline)[0]
             if rje.matchExp('FT   REGION\s+(\d+)\s+(\d+).+nteract\S+ with (\S.+)',gline):
                 (rstart,rend,rint) = rje.matchExp('FT   REGION\s+(\d+)\s+(\d+).+nteract\S+ with (\S.+)',gline)
                 for ppi in string.split(rint):
                     if rje.matchExp('^([A-Z0-9][A-Z0-9]+)',ppi):
                         datadict = {'Protein':prot,'Start':rstart,'End':rend,'Interactor':rje.matchExp('^([A-Z0-9][A-Z0-9]+)',ppi)[0]}
                         rje.delimitedFileOutput(self,tabfile,headers,datadict=datadict); rx += 1
                         if prot not in plist: plist.append(prot)
                         if datadict['Interactor'] not in ilist: ilist.append(datadict['Interactor'])
                         self.progLog('\r#REGTAB','Tabulating regions: %s proteins; %s interactors; %s regions' % (rje.integerString(len(plist)),rje.integerString(len(ilist)), rje.integerString(rx)))
         self.printLog('\r#REGTAB','Tabulated regions (%s proteins; %s interactors; %s regions) => %s' % (rje.integerString(len(plist)),rje.integerString(len(ilist)),rje.integerString(rx),tabfile))
         return True
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise   # Delete this if method error not terrible
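Every example in this collection drives its table output through the same two-step rje.delimitedFileOutput pattern: one call writes the header row (rje_backup=True moves any existing file aside first), then one call per record appends a row built from a datadict. A minimal stand-alone sketch of that pattern, assuming rje is importable and that the first argument is simply the calling object used for logging, as in the methods shown here (save_rows and the column names are illustrative only):

import rje

def save_rows(callobj, rows, outfile='example.tdt', delimit='\t'):
    '''Write a list of row dictionaries to a delimited file, one row per call.'''
    headers = ['Protein','Start','End','Interactor']    # fixed column order for the output
    # First call writes the header line; rje_backup=True backs up any existing outfile.
    rje.delimitedFileOutput(callobj, outfile, headers, delimit, rje_backup=True)
    for row in rows:
        # Each subsequent call appends one delimited row taken from datadict.
        rje.delimitedFileOutput(callobj, outfile, headers, delimit, datadict=row)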
Example #2
 def makeGOFile(self):   ### Maps GO to sequences and outputs table for R analysis
     '''Maps GO to sequences and outputs table for R analysis.'''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         outfile = '%s.goer.tdt' % self.info['ResFile']
         headers = ['GOID','Motif','Type','Gene','Cons','HomNum','GlobID','LocID','Hyd','SA']
         rje.delimitedFileOutput(self,outfile,headers,rje_backup=True)
         ### ~ [2] ~ Work through dictionary and output data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         (mx,mtot) = (-100.0,len(self.dict['Occ']))
         for motif in rje.sortKeys(self.dict['Occ']):
             mx += 100.0; self.progLog('\r#OUT','Generating %s output: %.1f%% (%s|CheckSeq)         ' % (outfile,(mx/mtot),motif))
             ## ~ [2a] ~ Check MinOcc in terms of sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             for type in rje.sortKeys(self.dict['Occ'][motif]):
                 if len(self.dict['Occ'][motif][type]) < self.stat['MinOcc']: self.dict['Occ'][motif].pop(type)
             if 'ELM' not in self.dict['Occ'][motif] or len(self.dict['Occ'][motif]) < 2: continue
             for type in self.dict['Occ'][motif]:
                 ## ~ [2b] ~ Map GO terms and check MinOcc ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 self.progLog('\r#OUT','Generating %s output: %.1f%% (%s|Check%s) ' % (outfile,(mx/mtot),motif,type)); 
                 godict = {}     # Temp dictionary of {GOID:[Seqs]}
                 for gene in self.dict['Occ'][motif][type]:
                     for go in self.ensGO(gene):
                         if go not in godict: godict[go] = [gene]
                         else: godict[go].append(gene)
                 self.progLog('\r#OUT','Generating %s output: %.1f%% (%s|OccGO%s) ' % (outfile,(mx/mtot),motif,type)); 
                 for go in rje.sortKeys(godict):
                     if len(godict[go]) < self.stat['MinOcc']: godict.pop(go)
                 ## ~ [2c] ~ Output remaining GO terms occurrences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 self.progLog('\r#OUT','Generating %s output: %.1f%% (%s|Output%s)' % (outfile,(mx/mtot),motif,type)); 
                 for go in rje.sortKeys(godict):
                     for gene in godict[go]:
                         for occdict in self.dict['Occ'][motif][type][gene]:
                             datadict = rje.combineDict({'GOID':'GO:%s' % go,'Motif':motif,'Type':type,'Gene':gene},occdict)
                             rje.delimitedFileOutput(self,outfile,headers,datadict=datadict)
             self.printLog('#OUT','Output for %s %s complete.' % (motif,rje.sortKeys(self.dict['Occ'][motif])),screen=False)
         self.printLog('\r#OUT','Generating %s output complete!         ' % (outfile))
     except: self.log.errorLog(rje_zen.Zen().wisdom())
Example #3
 def _setupOutput(self): ### Sets up output files self.str['MapFas','MissFas','MapRes']
     '''Sets up output files self.str['MapFas','MissFas','MapRes'].'''
     ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     delimit = rje.getDelimit(self.cmd_list)
     if self.str['StartFrom'].lower() in ['','none']: self.str['StartFrom'] = ''
     else:
         self.bool['Append'] = True
         self.printLog('#CMD','StartFrom = "%s" so Append=T' % self.str['StartFrom'])
     ### ~ [1] General ResFile ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     files = {'MapFas':'mapping.fas','MissFas':'missing.fas','MapRes':'mapping.%s' % rje.delimitExt(delimit)}
     if self.getBool('Combine'): files.pop('MissFas')
     if self.str['ResFile'].lower() in ['','none']:
         self.str['ResFile'] = '%s.%s' % (rje.baseFile(self.str['SeqIn']),rje.baseFile(self.str['MapDB'],strip_path=True))
     for file in files.keys():
         self.setStr({file: self.getStr('ResFile') + '.' + files[file]})
         rje.backup(self,self.getStr(file))
     ### ~ [2] Headers for MapRes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     #!# Consider replacing with rje_db object? #!#
     self.list['Headers'] = ['Query','Hit','Method','MapRank','BlastRank','EVal','Score']
     for qh in ['Query','Hit']:
         self.list['Headers'] += ['%s_Species' % qh]
         if self.bool['GablamOut']:
             for st in ['Len','Sim','ID']:
                 self.list['Headers'] += ['%s_%s' % (qh,st)]
     rje.delimitedFileOutput(self,self.str['MapRes'],self.list['Headers'],delimit)
Example #4
    def run(self,setup=True):  ### Main Run Method
        '''
        Main Run Method
        >> setup:bool [True] = Sets up headers and reads in existing data if present.
        '''
        try:
            ### ~ Setup & Read existing data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if setup: self.setup()
            headers = self.list['Headers']
            delimit = rje.delimitFromExt(filename=self.info['CardOut'])
            if os.path.exists(self.info['EnsLoci']):
                for h in ['EnsLoci','EnsDesc']:
                    if h not in headers: headers.append(h)
            rje.delimitedFileOutput(self,self.info['CardOut'],headers,delimit,rje_backup=True)

            ### ~ Read EnsLoci for incorporation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.ensLoci()
                        
            ### ~ Parse data from GeneCards website and/or previously read aliases ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.processGenes(self.list['Genes'])
            self.interactiveUpdate()
        
            ### ~ Add EnsEMBL EnsLoci data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.addEnsLoci()

            ### ~ Output GeneCards data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.outputCards()
            
        except:
            self.log.errorLog('Apocalyptic error with GeneCards.run()')
            raise
Example #5
 def outputCards(self):  ### Outputs cards to delimited file
     '''Outputs cards to delimited file.'''
     ### ~ Setup for output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     genelist = self.list['Genes']
     if self.opt['Purify'] and self.opt['Restrict']:
         for gene in genelist[0:]:
             if self.dict['GeneCard'][gene]['Symbol'] not in [gene,'!FAILED!']:  # Replace with symbol
                 genelist.remove(gene)
                 if self.dict['GeneCard'][gene]['Symbol'] not in genelist: genelist.append(self.dict['GeneCard'][gene]['Symbol'])
     delimit = rje.delimitFromExt(filename=self.info['CardOut'])
     CARDOUT = open(self.info['CardOut'],'a')
     ### ~ Generate output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     (noens,noloci,ox) = (0,0,0)
     for gene in rje.sortKeys(self.dict['GeneCard']):
         if self.opt['Restrict'] and gene not in genelist: continue
         elif self.opt['Purify'] and self.dict['GeneCard'][gene]['Symbol'] not in [gene,'!FAILED!']: continue
         self.progLog('\r#OUT','Output for %s parsed genes' % rje.iStr(ox)); ox += 1
         self.dict['GeneCard'][gene]['Alias'] = gene
         self.dict['GeneCard'][gene]['Species'] = self.info['Species']
         rje.delimitedFileOutput(self,CARDOUT,self.list['Headers'],delimit,self.dict['GeneCard'][gene])
         if self.dict['GeneCard'][gene]['Symbol'] == gene:   # Not an alias
             if 'EnsEMBL' not in self.dict['GeneCard'][gene] or not self.dict['GeneCard'][gene]['EnsEMBL']: noens += 1
             if 'EnsLoci' not in self.dict['GeneCard'][gene] or not self.dict['GeneCard'][gene]['EnsLoci']: noloci += 1
     CARDOUT.close()
     self.printLog('\r#OUT','Parsed info for %d genes output to %s' % (len(self.list['Genes']),self.info['CardOut']))
     self.printLog('#ENS','%s without EnsGene; %s without EnsLoci' % (rje.integerString(noens),rje.integerString(noloci)))
Example #6
 def run(self,imenu=False,outputmap=True,returndict=False):      ### Main controlling run Method
     '''
     Main controlling run Method.
     >> imenu:boolean = Whether to initiate interactive menu if appropriate [False].
     >> outputmap:boolean = Whether to output mapping into a file [True]
     >> returndict:boolean = Whether to return a dictionary of {searchname:mappedname} (no previous mapping) [False]
     '''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not self.setup(imenu): raise ValueError
         seqlist = rje_seqlist.SeqList(self.log,self.cmd_list+['autoload=T','seqmode=file'])
         if not seqlist.seqNum(): self.warnLog('No sequences loaded for mapping.'); return {}
         ## ~ [0a] Setup BLAST Search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         blast = rje_blast.BLASTRun(self.log,['blaste=1e-4','blastv=20','blastf=F']+self.cmd_list+['v=-1'])
         blast.setStr({'DBase':self.getStr('MapDB'),'Type':'blastp','InFile':self.getStr('SeqIn'),
                      'Name':'%s-%s.blast' % (rje.baseFile(self.str['SeqIn'],True),rje.baseFile(self.str['MapDB'],True))})  
         blast.setStat({'HitAln':blast.getStat('OneLine')})
         blast.list['ResTab'] = ['Search','Hit','GABLAM']
         if seqlist.nt(): blast.str['Type'] = 'blastx'
         ## ~ [0b] Setup Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if outputmap: self._setupOutput()                           ## Output Files ##
         if returndict: mapdict = {}
         else: self._setupMapped()                                   ## Previously Mapped Sequences ##
         seqx = seqlist.seqNum()             ## Number of sequences ##
         ### ~ [1] BLAST Search Mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.printLog('#BLAST','BLASTing %s vs %s.\n *** This could take some time if files are large. Please be patient! ***' % (self.str['SeqIn'],self.str['MapDB']),log=False)
         ## ~ [1a] Perform BLAST Unless it exists ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         blast.run(format=True)
         self.obj['DB'] = blast.obj['DB']
         ## ~ [1b] Mapping from searches ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         self.debug(self.getStr('MapDB'))
         self.obj['MapDB'] = rje_seqlist.SeqList(self.log,self.cmd_list+['autoload=F','seqmode=file','seqin=%s' % self.str['MapDB']])
         self.obj['MapDB'].loadSeq(self.getStr('MapDB'))
         self.debug('%s' % self.obj['MapDB'].list['Seq'])
         sx = 0
         while seqlist.nextSeq() != None:
             search = seqlist.getSeq(format='short')
             sx += 1
             ## Check StartFrom ##
             if self.str['StartFrom']:
                 if self.str['StartFrom'] != search:
                     self.progLog('\r#SKIP','Looking for %s: skipping %d seqs' % (self.str['StartFrom'],sx))
                     continue
                 self.printLog('\r#SKIP','Starting from %s: skipped %d seqs' % (self.str['StartFrom'],sx))
                 self.str['StartFrom'] = ''
             ## Check if in Mapped ##
             if search in self.list['Mapped']:
                 resdict = {'Query':search,'Hit':search,'Method':'Already Mapped!'}
                 self.printLog('#FAS','%s already in output - not duplicating in %s' % (search,self.str['MapFas']))
                 rje.delimitedFileOutput(self,self.str['MapRes'],self.list['Headers'],rje.getDelimit(self.cmd_list),resdict)
                 continue
             ### Map Sequence ###
             self.printLog('#MAP','Mapping %s seqs: %s of %s' % (self.str['SeqIn'],rje.integerString(sx),rje.integerString(seqx)))
             mapname = self.mapSeq(seqlist,blast,search)
             if returndict: mapdict[search] = mapname
         ### ~ [2] Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.printLog('#MAP','Mapping of %s (%s seqs) complete.' % (self.str['SeqIn'],rje.integerString(seqx)))           
         if os.path.exists(blast.str['Name']) and not (self.getBool('DeBug') or self.test()): os.unlink(blast.str['Name'])     #!# Add option to keep BLAST! #!#
         if returndict: return mapdict
     except: self.errorLog('Error in SeqMapper.run()',printerror=True,quitchoice=True); raise   
Example #7
 def run(self,batch=False):  ### Main run method
     '''Main run method.'''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [1a] ~ Results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if not batch: self.setupResults()
         ## ~ [1b] ~ Batch run ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if not batch and not self.obj['SeqList'].seqs():    ### Look for batch files and run for each
             batchfiles = rje.getFileList(self,filelist=self.list['Batch'],subfolders=False,summary=True,filecount=0)
             self.printLog('\r#FILES','Getting files: %5s files for batch run' % rje.integerString(len(batchfiles)))
             if not batchfiles: self.errorLog('No input files found!',printerror=False)
             else:
                 bx = 0
                 for infile in batchfiles:
                     bx += 1
                     self.printLog('#BATCH','Batch running %s' % infile)
                     bcmd = ['query=1']+self.cmd_list+['autoload=T','seqin=%s' % infile]
                     self.obj['SeqList'] = rje_seq.SeqList(self.log,bcmd)
                     self.run(batch=True)
                     self.opt['Append'] = True
                     self.printLog('#BATCH','|---------- %s run <<<|>>> %s to go -----------|' % (rje.integerString(bx),rje.integerString(len(batchfiles)-bx)),log=False)
             if self.opt['Win32'] and len(sys.argv) < 2: self.verbose(0,0,'Finished!',1) # Optional pause for win32
             return
         ## ~ [1c] ~ Special run options ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if self.info['Special'].lower() == 'allbyall':
             self.printLog('#RUN','Performing special "all-by-all" pairwise run')
             self.info['Special'] = ''
             for i in range(len(self.seqs())-1):
                 self.obj['SeqList'].obj['QuerySeq'] = self.seqs()[i]
                 for j in range(i+1,len(self.seqs())):
                     self.info['Fitness'] = self.info['Phenotype'] = '%d' % (j + 1)
                     self.run(batch=True)
                     self.opt['Append'] = True
             self.info['Special'] = 'allbyall'; return                
         ## ~ [1d] ~ General setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         self.setup()
         ### ~ [2] ~ Price calculations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.fitness()
         self.phenotype()
         self.grouping()
         for vector in ['Fitness','Phenotype','SeqGroup']:
             if len(self.list[vector]) != self.qry().seqLen():
                 self.errorLog('%s vector length (%s) does not match query sequence length (%s)' % (vector,len(self.list[vector]),self.qry().seqLen()),printerror=False)
                 raise ValueError
         results = self.price()
         ### ~ [3] ~ Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         results['Dataset'] = rje.baseFile(self.obj['SeqList'].info['Name'],True)
         results['Query'] = self.qry().shortName()
         results['Fitness'] = self.info['Fmethod']
         results['Phenotype'] = self.info['Pmethod']
         results['SeqGroup'] = self.info['SeqGroup']
         rje.delimitedFileOutput(self,self.info['ResFile'],self.list['Headers'],datadict=results)
         self.printLog('#OUT','Results output to %s' % self.info['ResFile'])
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise   # Delete this if method error not terrible
Example #8
 def setup(self):    ### Main class setup method. Makes sumfile if necessary.
     '''Main class setup method. Makes sumfile if necessary.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.debug(self.getStrLC('SumFile')); self.debug(self.getStr('SumFile'))
         if self.getStrLC('Basefile') in ['','none']: self.baseFile(rje.baseFile(self.info['SumFile']))
         if self.getStrLC('SumFile') in ['','none']: self.info['SumFile'] = '%s.tdt' % self.basefile()
         self.printLog('#SUM','Summary file: %s' % self.getStr('SumFile'))
         if os.path.exists(self.info['SumFile']) and not self.opt['Force']:
             if rje.yesNo('%s found. Use these results?' % self.info['SumFile']):
                 return self.printLog('#SUM','Summary results file found. No MASCOT processing.')
         mapgi = False
         ### ~ [2] Process MASCOT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for mfile in self.list['ResFiles']:
             bud = budapest.Budapest(self.log,self.cmd_list+['mascot=%s' % mfile])
             bud.info['Name'] = mfile
             bud.readMascot()
             self.dict['Searches'][mfile] = bud.dict['Hits']
             protacclist = rje.sortKeys(bud.dict['Hits'])
             for protacc in protacclist:
                 if rje.matchExp('gi\|(\d+)',protacc): mapgi = True
             accfile = '%s.%s.protacc' % (self.baseFile(),rje.baseFile(mfile))
             self.debug(accfile)
             open(accfile,'w').write(string.join(protacclist,'\n'))
             self.printLog('#MFILE','%s: %s proteins.' % (mfile,rje.iLen(protacclist)))
         ## ~ [2a] gi Mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         #if mapgi:
         #    mapgi = self.dict['MapGI'] = seqlist.seqNameDic('NCBI')
         #    open('mapgi.tmp','w').write(string.join(rje.sortKeys(mapgi),'\n'))
         ### ~ [3] Setup seqlist ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqlist = rje_seq.SeqList(self.log,['gnspacc=T']+self.cmd_list)
         self.dict['Acc2Seq'] = seqlist.seqNameDic('Max')
         ### ~ [4] Generate Summary File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         sumhead = string.split('search,prot_hit_num,prot_acc,prot_desc,pep_seq',',')
         rje.delimitedFileOutput(self,self.info['SumFile'],sumhead,rje_backup=True)
         for mfile in rje.sortKeys(self.dict['Searches']):
             bud = self.dict['Searches'][mfile]
             for protacc in rje.sortKeys(bud)[0:]:
                 protname = bud[protacc]['prot_acc']
                 protdesc = bud[protacc]['prot_desc']
                 if rje.matchExp('gi\|(\d+)',protacc):
                     gi = rje.matchExp('gi\|(\d+)',protacc)[0]
                     try:
                         protname = self.dict['Acc2Seq'][gi].shortName()
                         protdesc = self.dict['Acc2Seq'][gi].info['Description']
                     except: protname = 'gi_UNK__%s' % gi
                 #x#print protname, protdesc, bud[protacc]
                 for pep in bud[protacc]['Peptides']:
                     data = {'search':rje.baseFile(mfile,True),'prot_desc':protdesc,'prot_acc':protname,
                             'pep_seq':pep,'prot_hit_num':bud[protacc]['prot_hit_num']}
                     rje.delimitedFileOutput(self,self.info['SumFile'],sumhead,datadict=data)
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
Example #9
 def run(self):  ### Main run method
     '''Main run method.'''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         counter = ['>>']    # List containing count times
         menulist = [('F','Change output file name','outfile','OutFile'),('X','Exit','return',''),('R','Run','return','')]
         mchoice = rje_menu.menu(self,'WormPump Menu',menulist,choicetext='Please select:',changecase=True,default='R')
         if mchoice == 'X': return
         self.printLog('#OUT','Output will be to %s' % self.info['OutFile'])
         self.printLog('#START','Initialising counter...')
         ### ~ [2] ~ Perform counts ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         wormid = None
         while counter[-1] != 'X':
             if wormid: counter.append(rje.choice('ID <ENTER> for new worm | X <ENTER> to exit | <ENTER> for "%s" pump count' % wormid,default='').upper())
             else: counter.append(rje.choice('ID <ENTER> for new worm | X <ENTER> to exit',default='').upper())
             if counter[-1]:
                 wormid = counter[-1]
                 if wormid == 'X': break
                 self.printLog('#WORM','Worm "%s"' % wormid)
             counter.append(time.time())
             self.deBug(counter)
         ### ~ [3] ~ Output results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         head = ['Worm','Count','WormTime','AbsTime']
         rje.delimitedFileOutput(self,self.info['OutFile'],headers=head,rje_backup=True)
         wormstart = 0.0
         wormid = None
         wtot = 0
         while counter:
             x = counter.pop(0)
             if x in ['>>','X']: continue
             if x:
                 wormid = x
                 wormstart = counter[0]
                 wx = 0
                 wtot += 1
             else:
                 if not wormid: continue
                 wx += 1
             t = counter.pop(0)
             tt = time.localtime(t)
             wdata = {'Worm':wormid,'Count':wx,'WormTime':t-wormstart,
                      #'AbsTime':'%s/%s/%s %s:%s:%s' % (tt[2],tt[1],tt[0],rje.preZero(tt[3],24),rje.preZero(tt[4],60),rje.preZero(tt[5],60))}
                      'AbsTime':'%s:%s:%s' % (rje.preZero(tt[3],24),rje.preZero(tt[4],60),rje.preZero(tt[5],60))}
             rje.delimitedFileOutput(self,self.info['OutFile'],headers=head,datadict=wdata)
         self.printLog('#OUT','Counts for %d worms output to %s' % (wtot,self.info['OutFile']))
         rje.choice('<ENTER> to exit')
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise   # Delete this if method error not terrible
Example #10
    def saveMutations(self):    ### Outputs parsed mutations into a delimited file
        '''Outputs parsed mutations into a delimited file.'''
        try:### ~ [1] Setup output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            headers = ['OMIM_ID','SubID','Gene','Pos','WildAA','MutAA','Disease']
            outfile = 'omim_mutations.tdt'
            rje.delimitedFileOutput(self,outfile,headers,'\t',rje_backup=True)

            ### ~ [2] Output mutations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for gene in rje.sortKeys(self.dict['Mutations']):
                for subid in rje.sortKeys(self.dict['Mutations'][gene]):
                    (disease,mutation) = self.dict['Mutations'][gene][subid]
                    (wild,pos,mut) = rje.matchExp('(\D\D\D)(\d+)(\D\D\D)',mutation)
                    datadict = {'OMIM_ID':string.join(self.dict['Records'][gene],'; '),'SubID':subid,'Gene':gene,
                                'Pos':pos,'WildAA':wild,'MutAA':mut,'Disease':disease}
                    rje.delimitedFileOutput(self,outfile,headers,'\t',datadict)
            self.log.printLog('#OUT','OMIM Mutation output to %s complete' % outfile)
        except: self.log.errorLog(rje_zen.Zen().wisdom())
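The (\D\D\D)(\d+)(\D\D\D) pattern used above splits an OMIM substitution string such as 'ARG123GLY' into wild-type residue, position and mutant residue. A minimal re-based sketch of the same parse, on the assumption that rje.matchExp behaves like a single re.search returning the matched groups (parse_mutation and the example string are illustrative only):

import re

def parse_mutation(mutation):
    '''Split a substitution such as 'ARG123GLY' into (wild, position, mutant).'''
    match = re.search(r'(\D\D\D)(\d+)(\D\D\D)', mutation)
    if not match: return None
    (wild, pos, mut) = match.groups()
    return (wild, int(pos), mut)

# parse_mutation('ARG123GLY') -> ('ARG', 123, 'GLY')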
Example #11
 def saveTimePoints(self,filename='',format='tdt',entries=[]):   ### Saves TimePoints to a file
     '''
     Saves TimePoints to a file from main TimePoints table.
     >> filename:str [''] = Output filename. Will use basefile if none given.
     >> format:str ['tdt'] = Output file format (csv/tdt/txt/db)
     >> entries:list [] = Entries from main table to output. (All if none given).
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         db = self.db('TimePoints')
         if format.lower() in ['','none']: format = string.split(filename.lower(),'.')[-1]
         if not filename: filename = '%s.%s' % (self.basefile(),format)
         if not entries: entries = db.entries()
         ### ~ [2] Save to file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [2a] Simple delimited file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if format in ['csv','tdt']: 
             self.blanksToEmpty()
             rje.delimitedFileOutput(self,filename,db.fields(),rje_backup=True)
             for entry in entries: rje.delimitedFileOutput(self,filename,db.fields(),datadict=entry)
         ## ~ [2b] Text file output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         else:
             self.emptyToBlank()
             rje.backup(self,filename)
             OUT = open(filename,'a')
             for entry in entries:
                 if format == 'db':
                     outlist = []
                     for field in db.fields(): outlist.append(entry[field])
                     out_txt = '%s' % outlist
                     OUT.write('(%s);\n' % out_txt[1:-1])
                 else:
                     # American Independence. (TimePoint) 1776 AD, 4 July. The US declared independence from the British Empire. Source: <http://en.wikipedia.org/wiki/United_States_Declaration_of_Independence>[Wikipedia]. (Keywords: history)
                     out_text = '%s. (TimePoint) ' % entry['TimePoint Name']
                     if entry['month'] in ['','blank']: out_text += '%s %s.' % (entry['Year'],entry['yearUnit'])
                     else: out_text += '%s %s, %s %s.' % (entry['Year'],entry['yearUnit'],entry['month'],entry['day'])
                     out_text = '%s %s Source: <%s>[%s].' % (out_text,entry['TimePoint Description'],entry['Source URL'],entry['Source URL'])
                     klist = []
                     for i in range(1,6):
                         if entry['keyword%d' % i] not in ['','blank']: klist.append(entry['keyword%d' % i])
                     out_text = '%s (Keywords: %s)' % (out_text,string.join(klist,', '))
                     OUT.write('%s\n' % out_text)
         self.printLog('#OUT','%d entries output to %s' % (len(entries),filename))
     except: self.errorLog('%s.saveTimePoints(%s) error' % (self,filename)); return False
Example #12
    def hmmTable(self,outfile='',append=False,delimit=None):    ### Outputs results table
        '''
        Outputs results table.
        >> outfile:str = Name of output file
        >> append:boolean = whether to append file
        >> delimit:str = Delimiter to use [\t]
        '''
        try:
            ### Setup ###
            if not outfile: outfile = self.info['HMMTab']
            if outfile.lower() == 'none':
                self.log.printLog('#TAB','HMMTab = "None": No table output')
                return False
            if not delimit: delimit = rje.getDelimit(self.cmd_list,'\t')
            if not outfile: outfile = '%s.hmmer.%s' % (rje.baseFile(self.info['SearchDB'],True),rje.delimitExt(delimit))
            self.readResults()
            self.log.printLog('#TAB','Tabulating results for %s searches into %s' % (len(self.search),outfile),log=False)

            ### Setup Resfile ###
            if self.opt['MySQL']: headers = ['HMM','Hit','Hit_Start','Hit_End','Eval','Score']
            else: headers = ['Type','Name','Start','End','Eval','Score']
            if not append or not os.path.exists(outfile): rje.delimitedFileOutput(self,outfile,headers,delimit,rje_backup=True)
            
            ### Output Search details ###
            for search in self.search:
                for hit in search.hit:
                    for aln in hit.aln:
                        out = {'HMM':search.info['Name'],'Type':search.info['Name'],
                               'Name':hit.info['Name'],'Hit':hit.info['Name'],
                               'Start':'%d' % aln.stat['SbjStart'], 'End':'%d' % aln.stat['SbjEnd'],
                               'Hit_Start':'%d' % aln.stat['SbjStart'], 'Hit_End':'%d' % aln.stat['SbjEnd'],
                               'Eval':'%.2e' % aln.stat['Expect'],'Score':'%.1f' % aln.stat['BitScore']}
                        rje.delimitedFileOutput(self,outfile,headers,delimit,out)
            self.log.printLog('#OUT','Results for %s searches output to %s.' % (len(self.search),outfile))
        except:
            self.log.errorLog('Fatal Error during hmmTable(%s).' % outfile)
            raise
Example #13
 def domainFasta(self):    ### Outputs parsed domain and domain PPI datasets in Fasta format
     '''Outputs parsed PPI datasets in Fasta format.'''
     try:
         ### ~ Tab delimited domain-HPRD pairs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         headers = ['Domain','HPRD','Gene']
         dfile = self.info['OutDir'] + 'HPRD.domains.tdt'
         rje.delimitedFileOutput(self,dfile,headers,'\t')
         sfile = self.info['OutDir'] + 'HPRD.domsource.tdt'
         shead = ['Domain','Source']
         rje.delimitedFileOutput(self,sfile,shead,'\t')
         dx = 0.0
         for domain in rje.sortKeys(self.dict['Domains']):
             self.log.printLog('\r#DOM','HPRD Domain output (%s): %.1f%%' % (dfile,dx/len(self.dict['Domains'])),newline=False,log=False)
             dx += 100.0
             for hid in self.dict['Domains'][domain]:
                 datadict = {'Domain':domain,'HPRD':hid,'Gene':self.dict['HPRD'][hid]['gene']}
                 rje.delimitedFileOutput(self,dfile,headers,'\t',datadict)
             for source in self.dict['DomainSource'][domain]:
                 datadict = {'Domain':domain,'Source':source}
                 rje.delimitedFileOutput(self,sfile,shead,'\t',datadict)
         self.log.printLog('\r#DOM','HPRD Domain output (%s): %s domains.' % (dfile,rje.integerString(len(self.dict['Domains']))))
                    
         ### ~ Domain PPI Dataset Outputs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         datpath = self.info['OutDir'] + rje.makePath('HPRD_Domain_Datasets/')
         rje.mkDir(self,datpath)
         for domain in rje.sortKeys(self.dict['Domains']):
             ## Generate a list of all interactors with domain-containing proteins ##
             plist = []
             for p1 in self.dict['Domains'][domain]:
                 if p1 not in self.dict['PPI']: continue
                 for p2 in self.dict['PPI'][p1]:
                     if p2 not in plist: plist.append(p2)
             plist.sort()
             ## Generate Sequence list and output ##
             mylist = []
             for p in plist:
                 if self.opt['AllIso']: mylist += self.dict['HPRD'][p]['Seq']
                 else: mylist.append(self.dict['HPRD'][p]['Seq'])
             sfile = '%s%s_hprd.fas' % (datpath,domain)
             if mylist: self.obj['SeqList'].saveFasta(seqs=mylist,seqfile=sfile)
             else: self.log.printLog('#DOM','No PPI partners for domain "%s"' % domain)
         self.log.printLog('\r#DOM','HPRD Domain fasta output complete.')
     except:
         self.log.errorLog('Error in HPRD.domainFasta()',printerror=True,quitchoice=False)
         raise
Example #14
 def scap(self):     ### Full SCAP method
     '''Full SCAP method.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         markov = self.obj['Markov']
         minx = markov.stat['MinXmer']
         maxx = markov.stat['MaxXmer']
         headers = ['seq','type','sorted']
         for x in range(minx,maxx+1): headers.append('X%d' % x)
         delimit = rje.getDelimit(self.cmd_list,'\t')
         scapfile = '%s.%s' % (self.info['Basefile'],rje.delimitExt(delimit))
         rje.delimitedFileOutput(self,scapfile,headers,delimit,rje_backup=True)
         ### ~ [2] SCAP ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [2a] Query ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         (sx,stot) = (0.0,self.obj['SeqList'].seqNum())
         for seq in self.obj['SeqList'].seq:
             self.progLog('\r#SCAP','SCAP processing Query to %s: %.2f%%' % (scapfile,(sx/stot))); sx += 100.0
             datadict = {'seq':seq.shortName(),'type':'qry','sorted':markov.opt['Sorted']}
             for x in range(minx,maxx+1): 
                 datadict['X%d' % x] = self.scapSeq(seq.info['Sequence'],x)
                 if datadict['X%d' % x] > 0.001: datadict['X%d' % x] = '%.4f' % datadict['X%d' % x]
                 else: datadict['X%d' % x] = '%.3e' % datadict['X%d' % x]
             rje.delimitedFileOutput(self,scapfile,headers,delimit,datadict)
         self.printLog('\r#SCAP','SCAP processed Query to %s for %s sequences.' % (scapfile,rje.integerString(stot)))
         ## ~ [2b] Background ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if self.obj['ScapBack'] != self.obj['SeqList']:
             (sx,stot) = (0.0,self.obj['ScapBack'].seqNum())
             for seq in self.obj['ScapBack'].seq:
                 self.progLog('\r#SCAP','SCAP processing Background to %s: %.2f%%' % (scapfile,(sx/stot))); sx += 100.0
                 datadict = {'seq':seq.shortName(),'type':'bg','sorted':markov.opt['Sorted']}
                 for x in range(minx,maxx+1):
                     datadict['X%d' % x] = self.scapSeq(seq.info['Sequence'],x)
                     if datadict['X%d' % x] > 0.001: datadict['X%d' % x] = '%.4f' % datadict['X%d' % x]
                     else: datadict['X%d' % x] = '%.3e' % datadict['X%d' % x]
                 rje.delimitedFileOutput(self,scapfile,headers,delimit,datadict)
             self.printLog('\r#SCAP','SCAP processed Background to %s for %s sequences.' % (scapfile,rje.integerString(stot)))
         if markov.opt['Sorted']: self.printLog('#SCAP','Sorted SCAP run complete')
         else: self.printLog('#SCAP','UnSorted SCAP run complete')
     except: self.errorLog(rje_zen.Zen().wisdom())
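Each X-mer score above is written as fixed-point when it is large enough to show four meaningful decimal places and as scientific notation otherwise. The same rule as a tiny stand-alone helper (format_score is hypothetical, not part of rje):

def format_score(value, threshold=0.001):
    '''Format a probability-like score: fixed point above threshold, scientific below.'''
    if value > threshold: return '%.4f' % value
    return '%.3e' % value

# format_score(0.0234) -> '0.0234'; format_score(0.00004) -> '4.000e-05'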
Example #15
    def mapSeq(self,seqlist,blast,search,outputmap=True): ### Performs actual mapping of sequence
        '''
        Performs actual mapping of sequence.
        >> seq:SeqList object containing Sequence Object to be mapped
        >> blast:BLAST_Run object to perform BLAST and GABLAM
        >> search:Current BLAST search object for mapping
        >> outputmap:boolean = Whether to output mapping into a file [True]
        << returns shortName() of mapped sequence (or None if none)
        '''
        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            seq = seqlist.getSeq(format='tuple')
            mapseq = self.obj['MapDB']
            hits = blast.db('Hit').indexEntries('Query',search)
            self.printLog('#HITS','%s vs %s = %d hits' % (search,blast.str['DBase'],len(hits)))
            hitseq = {}; hitdata = {}
            for entry in hits:
                hitseq[entry['Hit']] = mapseq.getDictSeq(entry['Hit'],format='tuple')
                hitdata[entry['Hit']] = entry
            resdict = {'Query':search,'Hit':None,'Method':'Failed','Query_Species':rje_sequence.specCodeFromName(seq[0])}
            ### ~ [1] Order Hits and Check Species ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (hits,hitdict) = self.orderHits(seq,hits,hitseq)
            self.debug(hits)
            self.debug(hitdict)
            ### ~ [2] Attempt mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for method in self.list['Mapping']:
                resdict['Hit'] = self.mapHit(seq,hits,hitdict,method.lower())
                if resdict['Hit']:
                    resdict['Method'] = method[:1].upper() + method[1:].lower()
                    break
                elif method == 'gablam' and (len(hits) > 0):
                    resdict['Method'] = 'Rejected'
            self.debug(resdict)
            ### ~[3] Output! ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if resdict['Hit']:  #hitdict[hit]['Data']['ShortName']
                hit = resdict['Hit']['Hit']     # resdict['Hit'] is the BLAST table entry for Hit
                shortname = hitdict[hit]['Data']['ShortName']   # This is just hit!
                self.printLog('#MAP','%s mapped to %s (by %s)' % (string.split(seq[0])[0],shortname,resdict['Method']))
                ## Update Stats ##
                self.debug('')
                resdict['BlastRank'] = hitdata[hit]['Rank']
                for key in hitdict[hit]: resdict[key] = hitdict[hit][key]
                ## Fasta and Redundancy ##
                if shortname in self.list['Mapped']: self.printLog('#MAP','%s already mapped before - not duplicating in %s' % (shortname,self.str['MapFas']))
                else:
                    self.list['Mapped'].append(shortname)
                    if outputmap:
                        open(self.str['MapFas'],'a').write('>%s\n%s\n' % (hitseq[hit][0],hitseq[hit][1]))
                resdict['Hit_Species'] = hitdict[hit]['Data']['SpecCode']
                resdict['Hit'] = shortname
            else:
                ## ~ [3a] GREP-based search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if 'grep' in self.list['Mapping']:
                    greplist = []; hitseq = ''
                    self.printLog('#GREP','grep %s %s -B 1' % (seq[1],blast.str['DBase']),log=False)
                    for line in os.popen('grep %s %s -B 1' % (seq[1],blast.str['DBase'])).readlines():
                        if line[:1] == '>': greplist.append(string.split(line[1:])[0])
                        elif not hitseq: hitseq = rje.chomp(line)
                    if greplist:
                        shortname = greplist.pop(0)
                        resdict['Hit'] = shortname
                        resdict['Method'] = 'Grep'
                        resdict['Qry_ID'] = '100.0'
                        resdict['Qry_Len'] = len(seq[1])
                        resdict['Hit_Len'] = len(hitseq)
                        resdict['Hit_ID'] = 100.0 * len(hitseq) / len(seq[1])
                        try: resdict['Hit_Species'] = string.split(shortname,'_')[1]
                        except: pass
                        if shortname in self.list['Mapped']:
                            self.printLog('#MAP','%s already mapped before - not duplicating in %s' % (shortname,self.str['MapFas']))
                        else:
                            self.list['Mapped'].append(shortname)
                            if outputmap: open(self.str['MapFas'],'a').write('>%s\n%s\n' % (shortname,hitseq))
                    for extra in greplist: self.printLog('#GREP','Warning! Query "%s" also hit "%s" with grep!' % (string.split(seq[0])[0],extra))
                if not resdict['Hit'] and self.bool['Combine']:
                    ## Fasta and Redundancy ##
                    shortname = string.split(seq[0])[0]
                    if shortname in self.list['Mapped']:
                        self.printLog('#FAS','%s already in output - not duplicating in %s' % (shortname,self.str['MapFas']))
                    else:
                        self.list['Mapped'].append(shortname)
                        if outputmap:
                            open(self.str['MapFas'],'a').write('>%s\n%s\n' % (seq[0],seq[1]))
                elif outputmap:
                    open(self.str['MissFas'],'a').write('>%s\n%s\n' % (seq[0],seq[1]))
                self.printLog('#MISS','%s mapping %s' % (resdict['Query'],resdict['Method']))
            if outputmap:
                rje.delimitedFileOutput(self,self.str['MapRes'],self.list['Headers'],rje.getDelimit(self.cmd_list),resdict)
            return resdict['Hit']

        except:
            self.errorLog('Fudgesticks! SeqMapper.mapSeq(%s) has died!' % seq[0],quitchoice=True)
            return False
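The grep fallback in mapSeq() looks for the exact query sequence in the mapping fasta file and recovers the header line printed immediately above it by '-B 1'. A minimal sketch of that lookup, assuming a fasta file with each sequence on a single line (grep_map is a hypothetical helper):

import os

def grep_map(sequence, fasfile):
    '''Return the short name of the first fasta entry whose sequence line contains sequence.'''
    names = []
    for line in os.popen('grep %s %s -B 1' % (sequence, fasfile)).readlines():
        if line[:1] == '>': names.append(line[1:].split()[0])   # header line preceding a matching sequence
    if names: return names[0]
    return None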
Example #16
 def saveTimePoints(self,
                    filename='',
                    format='tdt',
                    entries=[]):  ### Saves TimePoints to a file
     '''
     Saves TimePoints to a file from main TimePoints table.
     >> filename:str [''] = Output filename. Will use basefile if none given.
     >> format:str ['tdt'] = Output file format (csv/tdt/txt/db)
     >> entries:list [] = Entries from main table to output. (All if none given).
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         db = self.db('TimePoints')
         if format.lower() in ['', 'none']:
             format = string.split(filename.lower(), '.')[-1]
         if not filename: filename = '%s.%s' % (self.basefile(), format)
         if not entries: entries = db.entries()
         ### ~ [2] Save to file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [2a] Simple delimited file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if format in ['csv', 'tdt']:
             self.blanksToEmpty()
             rje.delimitedFileOutput(self,
                                     filename,
                                     db.fields(),
                                     rje_backup=True)
             for entry in entries:
                 rje.delimitedFileOutput(self,
                                         filename,
                                         db.fields(),
                                         datadict=entry)
         ## ~ [2b] Text file output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         else:
             self.emptyToBlank()
             rje.backup(self, filename)
             OUT = open(filename, 'a')
             for entry in entries:
                 if format == 'db':
                     outlist = []
                     for field in db.fields():
                         outlist.append(entry[field])
                     out_txt = '%s' % outlist
                     OUT.write('(%s);\n' % out_txt[1:-1])
                 else:
                     # American Independence. (TimePoint) 1776 AD, 4 July. The US declared independence from the British Empire. Source: <http://en.wikipedia.org/wiki/United_States_Declaration_of_Independence>[Wikipedia]. (Keywords: history)
                     out_text = '%s. (TimePoint) ' % entry['TimePoint Name']
                     if entry['month'] in ['', 'blank']:
                         out_text += '%s %s.' % (entry['Year'],
                                                 entry['yearUnit'])
                     else:
                         out_text += '%s %s, %s %s.' % (
                             entry['Year'], entry['yearUnit'],
                             entry['month'], entry['day'])
                     out_text = '%s %s Source: <%s>[%s].' % (
                         out_text, entry['TimePoint Description'],
                         entry['Source URL'], entry['Source URL'])
                     klist = []
                     for i in range(1, 6):
                         if entry['keyword%d' % i] not in ['', 'blank']:
                             klist.append(entry['keyword%d' % i])
                     out_text = '%s (Keywords: %s)' % (
                         out_text, string.join(klist, ', '))
                     OUT.write('%s\n' % out_text)
         self.printLog('#OUT',
                       '%d entries output to %s' % (len(entries), filename))
     except:
         self.errorLog('%s.saveTimePoints(%s) error' % (self, filename))
         return False
Example #17
    def clusterGoodSeq(self,searchset,data):   ### Clusters good sequences returned by search and updates data dictionary
        '''Clusters good sequences returned by search and updates data dictionary.'''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [1a] Extract Non-rejected sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            seqlist = rje_seq.SeqList(self.log,['gnspacc=T']+self.cmd_list+['autoload=F'])
            #self.deBug(rje.sortKeys(self.dict['Acc2Seq']))
            for prot in rje.sortKeys(data):
                if data[prot]['class'] != 'REJECT': seqlist.seq.append(self.dict['Acc2Seq'][data[prot]['accnum']])
            if not seqlist.seqNum():
                return self.printLog('#NULL','No %s sequences remain for clustering' % searchset)
            seqfile = '%s.%s.tmpdb' % (self.info['Basefile'],searchset)
            seqlist.saveFasta(seqfile=seqfile)
            seqdict = seqlist.seqNameDic()

            ### ~ [2] Perform BLAST and generate hit matrix ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            try:
                blast = rje_blast.blastObj(self.log,['blastf=T','blaste=1e-4']+self.cmd_list+['dna=F'],type='New')
                clusters = blast.blastClusters(seqfile,seqdict=seqdict,keepblast=False) 
            except:
                self.errorLog('Problem with new BLAST clustering')
                blast = rje_blast.blastObj(self.log,['blastf=T','blaste=1e-4']+self.cmd_list+['dna=F'],type='Old')
                blast.setInfo({'InFile':seqfile,'DBase':seqfile,'Name':'%s.tmp.blast' % self.info['Basefile'],'Type':'blastp'})
                blast.setStat({'OneLine':seqlist.seqNum(),'HitAln':0})
                blast.formatDB(fasfile=seqfile,force=True,protein=True)
                blast.blast(cleandb=False,use_existing=False,log=True)
                blast.readBLAST(gablam=False,unlink=True,log=True)
                rje_blast.cleanupDB(self,seqfile,deletesource=True)
                ## ~ [2a] Cluster by BLAST hits ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                cluster = {}    # Dictionary of {seq:hit seqs} for clustering
                for search in blast.search:
                    seq = seqdict[search.info['Name']]
                    cluster[seq] = []
                    for hit in search.hit: cluster[seq].append(seqdict[hit.info['Name']])
                #self.deBug(cluster)
                ## ~ [2b] Combine clusters ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                clusters = []   # List of [seqs] in clusters
                for seq in seqlist.seqs():
                    if seq not in cluster: continue
                    newcluster = [seq]
                    hits = cluster.pop(seq)
                    while hits:
                        hit = hits.pop(0)
                        if hit not in newcluster: newcluster.append(hit)
                        if hit in cluster: hits += cluster.pop(hit)
                    clusters.append(newcluster)
            self.printLog('#CLUSTER','%d clusters of %s protein hits' % (len(clusters),searchset))
            #self.deBug(clusters)

            ### ~ [3] Assign peptides to consensi as "Common", "Cluster" or "Unique" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [3a] Match peptides to sequence lists ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            pepcons = {}
            for seq in seqlist.seqs():
                prot = seq.shortName()  #.info['AccNum']
                for pep in data[prot]['conpep']:
                    if pep not in pepcons: pepcons[pep] = []
                    pepcons[pep].append(seq)
            self.dict['PepSeq'] = pepcons
            ## ~ [3b] Classify peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.dict['PepTypes'] = {'Common':[],'Cluster':[],'Unique':[]}
            for pep in pepcons:
                if len(pepcons[pep]) == 1: self.dict['PepTypes']['Unique'].append(pep); continue
                pepclus = []
                for seq in pepcons[pep]:
                    for cluster in clusters:
                        if seq in cluster and cluster not in pepclus: pepclus.append(cluster)
                if len(pepclus) == 1: self.dict['PepTypes']['Cluster'].append(pep)
                else: self.dict['PepTypes']['Common'].append(pep)
            ## ~ [3c] Summarise Peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.printLog('#PEP','%d different %s Peptide sequences' % (len(pepcons),searchset))
            for ptype in ['Common','Cluster','Unique']: self.dict['PepTypes'][ptype].sort()
            self.printLog('#UNIQ','%d Unique to one consensus' % (len(self.dict['PepTypes']['Unique'])))
            self.printLog('#CLUS','%d Restricted to one cluster' % (len(self.dict['PepTypes']['Cluster'])))
            self.printLog('#COMM','%d Common to multiple clusters' % (len(self.dict['PepTypes']['Common'])))

            ### ~ [4] Update dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            cx = 0
            for cluster in clusters:
                cx += 1
                for seq in cluster:
                    prot = seq.shortName()  #info['AccNum']
                    data[prot]['cluster'] = cx

            ### ~ [5] Peptide Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            peptdt = '%s.%s.peptides.tdt' % (self.info['Basefile'],searchset)
            pephead = ['Peptide','Classification','Hits']
            rje.delimitedFileOutput(self,peptdt,pephead,rje_backup=True)
            for ptype in ['Common','Cluster','Unique']:
                for pep in self.dict['PepTypes'][ptype]:
                    data = {'Peptide':pep,'Classification':ptype,'Hits':seqlist.accList(self.dict['PepSeq'][pep])}
                    data['Hits'].sort()
                    data['Hits'] = string.join(data['Hits'],'|')
                    rje.delimitedFileOutput(self,peptdt,pephead,datadict=data)
            self.printLog('#PEP','Peptide details output to %s' % peptdt)
        except: self.errorLog(rje_zen.Zen().wisdom())
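The fallback clustering in steps [2a]-[2b] above is single-linkage: each sequence seeds a cluster containing its BLAST hits, and clusters are merged whenever they share a member. The same merge loop as a compact stand-alone function over a {item: [hits]} dictionary (single_linkage is a hypothetical helper):

def single_linkage(hitmap):
    '''Group items into clusters, linking any two items that hit each other.'''
    hitmap = dict((item, list(hits)) for (item, hits) in hitmap.items())   # work on a copy
    clusters = []
    for item in list(hitmap.keys()):
        if item not in hitmap: continue           # already absorbed into an earlier cluster
        newcluster = [item]
        hits = hitmap.pop(item)
        while hits:
            hit = hits.pop(0)
            if hit not in newcluster: newcluster.append(hit)
            if hit in hitmap: hits += hitmap.pop(hit)   # pull in that hit's own hits too
        clusters.append(newcluster)
    return clusters

# single_linkage({'A':['B'],'B':['A','C'],'C':[],'D':[]}) groups A, B and C together and leaves D alone.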
Example #18
    def run(self):  ### Main Run Method
        '''Main Run Method.'''
        try:### ~ [1] Parse/Read Mutation data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if self.opt['Force'] or not self.loadMutations(): self.parseOMIM()

            ### ~ [2] Additional Pingu incorporation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            #!# Load PPI data using Pingu, map genes to sequences and check mutation residues #!#
            ## ~ [2a] Setup Pingu ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            import pingu
            pcmd = self.cmd_list + ['fulloutput=F']
            ping = self.obj['Pingu'] = pingu.PINGU(self.log,pcmd)
            ping.run()
            ## ~ [2b] Read in EnsLoci sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if not ping.obj['GeneCards']: return self.log.errorLog('Cannot map EnsLoci without GeneCards.', printerror=False)
            genecards = ping.obj['GeneCards'].dict['GeneCard']      # GeneCards dictionary
            ensloci = ping.getEnsLoci()     # EnsLoci SeqList object (ping.obj['EnsLoci'])
            seqdict = ensloci.seqNameDic()  
            if not seqdict: return self.log.errorLog('Failed to read in EnsLoci sequences.', printerror=False)
            ## ~ [2c] Calculate fudge factor for each gene ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.dict['Fudge'] = {}
            ensback = {}    # Dictionary of {EnsLoci name:OMIM gene}
            mutations = {}  # Reorganised dictionary of {gene:{pos:Mutation}}
            for gene in rje.sortKeys(self.dict['Mutations']):
                try: seq = seqdict[genecards[gene]['EnsLoci']]
                except:
                    self.log.printLog('#MAP','No EnsLoci protein mapped for %s' % gene)
                    continue
                mutations[gene] = {}
                ensback[genecards[gene]['EnsLoci']] = gene
                mutpos = {}     # Dictionary of {pos:AA} to map onto sequence
                for subid in rje.sortKeys(self.dict['Mutations'][gene]):                    
                    (disease,mutation) = self.dict['Mutations'][gene][subid]
                    (wild,pos,mut) = rje.matchExp('(\D\D\D)(\d+)(\D\D\D)',mutation)
                    mutpos[int(pos)] = rje_sequence.aa_3to1[wild.upper()]
                    mutations[gene][int(pos)] = self.dict['Mutations'][gene][subid]
                self.dict['Fudge'][seq] = seq.fudgeFactor(mutpos)
            self.deBug(self.dict['Fudge'])

            ### ~ [3] Cross-reference to SLiMFinder ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            allslims = {}   # Full dictionary of SLiMFinder results matching OMIM genes
            slimomim = []   # List of (gene,pos) overlapping with SLiMs
            outfile = 'rje_omim.slimfinder.tdt'
            dataheaders = string.split('Dataset,Rank,Pattern,Hit,Pos,EndPos,SeqLen,Variant,Match,AbsChg,NetChg,PepSeq,PepDesign',',')
            headers = ['Gene','OMIM','SubID','Mutation','Disease'] + dataheaders
            rje.delimitedFileOutput(self,outfile,headers,delimit='\t',rje_backup=True)
            for file in glob.glob(self.info['SlimDir'] + '*.occ.csv'):      # Potential SLiM
                slimdata = rje.dataDict(self,file,['Pattern','Hit','Pos','Match'],dataheaders,delimit=',')
                for occ in slimdata:
                    if slimdata[occ]['Hit'] in ensback:     # OMIM gene - possible overlap
                        gene = ensback[slimdata[occ]['Hit']]
                        (start,end) = (int(slimdata[occ]['Pos']),int(slimdata[occ]['EndPos']))
                        if gene not in allslims: allslims[gene] = {}
                        allslims[gene][occ] = slimdata[occ]
                        for mpos in mutations[gene]:
                            if start <= (mpos + self.dict['Fudge'][seqdict[genecards[gene]['EnsLoci']]]) <= end:
                                self.log.printLog('#OMIMSLIM','%s %s %s (%d-%d) = %s' % (slimdata[occ]['Dataset'],slimdata[occ]['Hit'],slimdata[occ]['Pattern'],start,end,mutations[gene][mpos]))
                                slimdata[occ]['Gene'] = gene
                                slimdata[occ]['OMIM'] = string.join(self.dict['Records'][gene])
                                slimdata[occ]['Mutation'] = mutations[gene][mpos][1]
                                slimdata[occ]['Disease'] = mutations[gene][mpos][0]
                                rje.delimitedFileOutput(self,outfile,headers,'\t',slimdata[occ])
                                if (gene,mpos) not in slimomim: slimomim.append((gene,mpos))
            
            ### ~ [4] Calculate coverage of SLiMs for "significance" assessment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (inslim,resx,mutx) = (0,0,0)  # No. of residues in SLiMs, total residue count + no. mutations that may overlap
            for gene in mutations:      # These are just the genes that mapped to sequences
                mutx += len(mutations[gene])
                resx += seqdict[genecards[gene]['EnsLoci']].aaLen()
                if gene in allslims:    # Partially covered by SLiMs
                    res = [0] * seqdict[genecards[gene]['EnsLoci']].aaLen()
                    for occ in allslims[gene]:
                        (start,end) = (int(allslims[gene][occ]['Pos'])-1,int(allslims[gene][occ]['EndPos']))
                        res = res[:start] + [1] * (end-start) + res[end:]   # Mark residues Pos..EndPos (1-based) as SLiM-covered
                    self.deBug('%s %d (%d)' % (gene,sum(res),seqdict[genecards[gene]['EnsLoci']].aaLen()))
                    inslim += sum(res)
            self.log.printLog('#COV','SLiMs have %.1f%% coverage of OMIM gene sequences' % (100.0*inslim/resx))
            self.log.printLog('#MUT','%d mutations that could potentially occur in SLiMs' % mutx)
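            # Crude "significance" sketch: each of the mutx mutations is treated as an independent trial with
            # success probability inslim/resx (the fraction of OMIM protein residues covered by SLiMs);
            # rje.binomial is assumed to return the probability of observing at least len(slimomim) successes.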
            self.log.printLog('#PROB','Probability of observed %d mutation overlap = %.4f' % (len(slimomim),rje.binomial(len(slimomim),mutx,float(inslim)/resx,callobj=self)))
        except: self.log.errorLog(rje_zen.Zen().wisdom())
Example #19
0
    def run(self):  ### Main Run Method
        '''Main Run Method.'''
        try:  ### ~ [1] Parse/Read Mutation data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if self.opt['Force'] or not self.loadMutations(): self.parseOMIM()

            ### ~ [2] Additional Pingu incorporation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            #!# Load PPI data using Pingu, map genes to sequences and check mutation residues #!#
            ## ~ [2a] Setup Pingu ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            import pingu
            pcmd = self.cmd_list + ['fulloutput=F']
            ping = self.obj['Pingu'] = pingu.PINGU(self.log, pcmd)
            ping.run()
            ## ~ [2b] Read in EnsLoci sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if not ping.obj['GeneCards']:
                return self.log.errorLog(
                    'Cannot map EnsLoci without GeneCards.', printerror=False)
            genecards = ping.obj['GeneCards'].dict[
                'GeneCard']  # GeneCards dictionary
            ensloci = ping.getEnsLoci(
            )  # EnsLoci SeqList object (ping.obj['EnsLoci'])
            seqdict = ensloci.seqNameDic()
            if not seqdict:
                return self.log.errorLog(
                    'Failed to read in EnsLoci sequences.', printerror=False)
            ## ~ [2c] Calculate fudge factor for each gene ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.dict['Fudge'] = {}
            ensback = {}  # Dictionary of {EnsLoci name:OMIM gene}
            mutations = {}  # Reorganised dictionary of {gene:{pos:Mutation}}
            for gene in rje.sortKeys(self.dict['Mutations']):
                try:
                    seq = seqdict[genecards[gene]['EnsLoci']]
                except:
                    self.log.printLog(
                        '#MAP', 'No EnsLoci protein mapped for %s' % gene)
                    continue
                mutations[gene] = {}
                ensback[genecards[gene]['EnsLoci']] = gene
                mutpos = {}  # Dictionary of {pos:AA} to map onto sequence
                for subid in rje.sortKeys(self.dict['Mutations'][gene]):
                    (disease, mutation) = self.dict['Mutations'][gene][subid]
                    (wild, pos, mut) = rje.matchExp('(\D\D\D)(\d+)(\D\D\D)',
                                                    mutation)
                    mutpos[int(pos)] = rje_sequence.aa_3to1[wild.upper()]
                    mutations[gene][int(
                        pos)] = self.dict['Mutations'][gene][subid]
                self.dict['Fudge'][seq] = seq.fudgeFactor(mutpos)
            self.deBug(self.dict['Fudge'])

            ### ~ [3] Cross-reference to SLiMFinder ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            allslims = {
            }  # Full dictionary of SLiMFinder results matching OMIM genes
            slimomim = []  # List of (gene,pos) overlapping with SLiMs
            outfile = 'rje_omim.slimfinder.tdt'
            dataheaders = string.split(
                'Dataset,Rank,Pattern,Hit,Pos,EndPos,SeqLen,Variant,Match,AbsChg,NetChg,PepSeq,PepDesign',
                ',')
            headers = ['Gene', 'OMIM', 'SubID', 'Mutation', 'Disease'
                       ] + dataheaders
            rje.delimitedFileOutput(self,
                                    outfile,
                                    headers,
                                    delimit='\t',
                                    rje_backup=True)
            for file in glob.glob(self.info['SlimDir'] +
                                  '*.occ.csv'):  # Potential SLiM
                slimdata = rje.dataDict(self,
                                        file,
                                        ['Pattern', 'Hit', 'Pos', 'Match'],
                                        dataheaders,
                                        delimit=',')
                for occ in slimdata:
                    if slimdata[occ][
                            'Hit'] in ensback:  # OMIM gene - possible overlap
                        gene = ensback[slimdata[occ]['Hit']]
                        (start, end) = (int(slimdata[occ]['Pos']),
                                        int(slimdata[occ]['EndPos']))
                        if gene not in allslims: allslims[gene] = {}
                        allslims[gene][occ] = slimdata[occ]
                        for mpos in mutations[gene]:
                            if start <= (mpos + self.dict['Fudge'][seqdict[
                                    genecards[gene]['EnsLoci']]]) <= end:
                                self.log.printLog(
                                    '#OMIMSLIM', '%s %s %s (%d-%d) = %s' %
                                    (slimdata[occ]['Dataset'],
                                     slimdata[occ]['Hit'],
                                     slimdata[occ]['Pattern'], start, end,
                                     mutations[gene][mpos]))
                                slimdata[occ]['Gene'] = gene
                                slimdata[occ]['OMIM'] = string.join(
                                    self.dict['Records'][gene])
                                slimdata[occ]['Mutation'] = mutations[gene][
                                    mpos][1]
                                slimdata[occ]['Disease'] = mutations[gene][
                                    mpos][0]
                                rje.delimitedFileOutput(
                                    self, outfile, headers, '\t',
                                    slimdata[occ])
                                if (gene, mpos) not in slimomim:
                                    slimomim.append((gene, mpos))

            ### ~ [4] Calculate coverage of SLiMs for "significance" assessment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (inslim, resx, mutx) = (
                0, 0, 0
            )  # No. of residues in SLiMs, total residue count + no. mutations that may overlap
            for gene in mutations:  # These are just the genes that mapped to sequences
                mutx += len(mutations[gene])
                resx += seqdict[genecards[gene]['EnsLoci']].aaLen()
                if gene in allslims:  # Partially covered by SLiMs
                    res = [0] * seqdict[genecards[gene]['EnsLoci']].aaLen()
                    for occ in allslims[gene]:
                        (start, end) = (int(allslims[gene][occ]['Pos']) - 1,
                                        int(allslims[gene][occ]['EndPos']))
                        res = res[:start] + [1] * (end - start) + res[end:]
                    self.deBug('%s %d (%d)' %
                               (gene, sum(res),
                                seqdict[genecards[gene]['EnsLoci']].aaLen()))
                    inslim += sum(res)
            self.log.printLog(
                '#COV', 'SLiMs have %.1f%% coverage of OMIM gene sequences' %
                (100.0 * inslim / resx))
            self.log.printLog(
                '#MUT',
                '%d mutations that could potentially occur in SLiMs' % mutx)
            self.log.printLog(
                '#PROB', 'Probability of observed %d mutation overlap = %.4f' %
                (len(slimomim),
                 rje.binomial(
                     len(slimomim), mutx, float(inslim) / resx, callobj=self)))
        except:
            self.log.errorLog(rje_zen.Zen().wisdom())
Example #20
0
    def domainFasta(
        self
    ):  ### Outputs parsed domain and domain PPI datasets in Fasta format
        '''Outputs parsed domain data and domain PPI datasets in Fasta format.'''
        try:
            ### ~ Tab delimited domain-HPRD pairs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            headers = ['Domain', 'HPRD', 'Gene']
            dfile = self.info['OutDir'] + 'HPRD.domains.tdt'
            rje.delimitedFileOutput(self, dfile, headers, '\t')
            sfile = self.info['OutDir'] + 'HPRD.domsource.tdt'
            shead = ['Domain', 'Source']
            rje.delimitedFileOutput(self, sfile, shead, '\t')
            dx = 0.0
            for domain in rje.sortKeys(self.dict['Domains']):
                self.log.printLog('\r#DOM',
                                  'HPRD Domain output (%s): %.1f%%' %
                                  (dfile, dx / len(self.dict['Domains'])),
                                  newline=False,
                                  log=False)
                dx += 100.0
                for hid in self.dict['Domains'][domain]:
                    datadict = {
                        'Domain': domain,
                        'HPRD': hid,
                        'Gene': self.dict['HPRD'][hid]['gene']
                    }
                    rje.delimitedFileOutput(self, dfile, headers, '\t',
                                            datadict)
                for source in self.dict['DomainSource'][domain]:
                    datadict = {'Domain': domain, 'Source': source}
                    rje.delimitedFileOutput(self, sfile, shead, '\t', datadict)
            self.log.printLog(
                '\r#DOM', 'HPRD Domain output (%s): %s domains.' %
                (dfile, rje.integerString(len(self.dict['Domains']))))

            ### ~ Domain PPI Dataset Outputs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
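            # For each domain, a fasta dataset is built containing every HPRD protein that interacts with at
            # least one protein annotated with that domain (one *_hprd.fas file per domain in HPRD_Domain_Datasets/).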
            datpath = self.info['OutDir'] + rje.makePath(
                'HPRD_Domain_Datasets/')
            rje.mkDir(self, datpath)
            for domain in rje.sortKeys(self.dict['Domains']):
                ## Generate a list of all interactors with domain-containing proteins ##
                plist = []
                for p1 in self.dict['Domains'][domain]:
                    if p1 not in self.dict['PPI']: continue
                    for p2 in self.dict['PPI'][p1]:
                        if p2 not in plist: plist.append(p2)
                plist.sort()
                ## Generate Sequence list and output ##
                mylist = []
                for p in plist:
                    if self.opt['AllIso']:
                        mylist += self.dict['HPRD'][p]['Seq']
                    else:
                        mylist.append(self.dict['HPRD'][p]['Seq'])
                sfile = '%s%s_hprd.fas' % (datpath, domain)
                if mylist:
                    self.obj['SeqList'].saveFasta(seqs=mylist, seqfile=sfile)
                else:
                    self.log.printLog(
                        '#DOM', 'No PPI partners for domain "%s"' % domain)
            self.log.printLog('\r#DOM', 'HPRD Domain fasta output complete.')
        except:
            self.log.errorLog('Error in HPRD.saveFasta()',
                              printerror=True,
                              quitchoice=False)
            raise
Example #21
0
 def rfAtt(self):  ### Tabulates observed vs expected amino acid & dipeptide frequencies per reading frame
     '''
     Tabulates observed and expected amino acid and dipeptide frequencies for each of the six reading-frame
     translations of the loaded DNA sequences, then outputs observed/expected ratios to a *.rf.tdt table.
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         rfhead = [
             'Att', 'RF1', 'RF2', 'RF3', 'RF-1', 'RF-2', 'RF-3', 'ObsRF1',
             'ObsRF2', 'ObsRF3', 'ObsRF-1', 'ObsRF-2', 'ObsRF-3', 'ExpRF1',
             'ExpRF2', 'ExpRF3', 'ExpRF-1', 'ExpRF-2', 'ExpRF-3'
         ]
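         # rfdata/rfobs/rfexp hold, for each of the six reading frames, a count for every single amino acid
         # (plus '*' stop) and every dipeptide; ntfreq holds raw nucleotide counts for the expected-value model.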
         rfdata = {}
         rfobs = {}
         rfexp = {}
         ntfreq = {}
         for rf in ['RF1', 'RF2', 'RF3', 'RF-1', 'RF-2', 'RF-3']:
             rfdata[rf] = {}
             rfobs[rf] = {}
             rfexp[rf] = {}
             for x in rje_seq.alph_protx[:-1] + ['*']:
                 rfdata[rf][x] = 0
                 rfobs[rf][x] = 0
                 rfexp[rf][x] = 0
             for a1 in rje_seq.alph_protx[:-1] + ['*']:
                 for a2 in rje_seq.alph_protx[:-1] + ['*']:
                     rfdata[rf]['%s%s' % (a1, a2)] = 0
                     rfobs[rf]['%s%s' % (a1, a2)] = 0
                     rfexp[rf]['%s%s' % (a1, a2)] = 0
         for x in rje_seq.alph_dna[:-1]:
             ntfreq[x] = 0
         seqlist = self.obj['SeqList']
         ### ~ [2] Count sequence attributes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         (sx, stot) = (0.0, seqlist.seqNum())
         for seq in seqlist.seq:
             self.progLog(
                 '\r#ATT',
                 'Counting sequence attributes: %.2f%%' % (sx / stot))
             sx += 100.0
             for x in seq.info['Sequence']:
                 if x in ntfreq: ntfreq[x] += 1
             rf6 = rje_sequence.sixFrameTranslation(seq.info['Sequence'])
             for r in rf6:
                 rseq = rf6[r]
                 rf = 'RF%d' % r
                 for i in range(len(rseq)):
                     a = rseq[i]
                     dia = rseq[i:i + 2]
                     if a in rfdata[rf]: rfdata[rf][a] += 1
                     if len(dia) == 2 and dia in rfdata[rf]: rfdata[rf][dia] += 1   # Avoid recounting the final single residue as a dipeptide
         self.printLog('\r#ATT', 'Counting sequence attributes complete.')
         ### ~ [3] Calculate Observed & Expected ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ntobs = rje.dictFreq(ntfreq, total=True, newdict=True)
         ntcomp = {'Total': ntobs['Total']}
         for xy in ['AT', 'GC']:
             ntcomp[xy[0]] = ntobs[xy[1]]
             ntcomp[xy[1]] = ntobs[xy[0]]
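         # Expected amino acid counts are estimated from codon probabilities under the observed nucleotide
         # frequencies (complemented via ntcomp for the reverse-strand frames); expected dipeptide counts are
         # then estimated from the resulting amino acid frequencies, assuming adjacent residues are independent.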
         for rf in ['RF1', 'RF2', 'RF3', 'RF-1', 'RF-2', 'RF-3']:
             aafreq = {}
             for a in rje_seq.alph_protx[:-1] + ['*']:
                 aafreq[a] = rfdata[rf][a]
             aafreq = rje.dictFreq(aafreq, total=True, newdict=True)
             for a in rje_seq.alph_protx[:-1] + ['*']:
                 rfobs[rf][a] = rfdata[rf][a]
                 rfexp[rf][a] = 0
             for n1 in 'GATC':
                 for n2 in 'GATC':
                     for n3 in 'GATC':
                         codon = '%s%s%s' % (n1, n2, n3)
                         aa = rje_sequence.dna2prot(codon)
                         if rf[-2] == '-':
                             rfexp[rf][aa] += (int(ntobs['Total'] / 3.0) *
                                               ntcomp[n1] * ntcomp[n2] *
                                               ntcomp[n3])
                         else:
                             rfexp[rf][aa] += (int(ntobs['Total'] / 3.0) *
                                               ntobs[n1] * ntobs[n2] *
                                               ntobs[n3])
                         #self.deBug('%s: %s x %s x %s x %s' % (aa,(ntobs['Total'] - 2), rfobs[rf][n1], rfobs[rf][n2], rfobs[rf][n3]))
                         #self.deBug('%s: %s' % (aa,rfexp[rf][aa]))
             for a1 in rje_seq.alph_protx[:-1] + ['*']:
                 for a2 in rje_seq.alph_protx[:-1] + ['*']:
                     rfexp[rf]['%s%s' %
                               (a1, a2)] = (aafreq['Total'] -
                                            1) * aafreq[a1] * aafreq[a2]
                     rfobs[rf]['%s%s' % (a1, a2)] = rfdata[rf]['%s%s' %
                                                               (a1, a2)]
         ### ~ [4] Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         rfile = rje.baseFile(seqlist.info['Name']) + '.rf.tdt'
         rje.delimitedFileOutput(self, rfile, rfhead, rje_backup=True)
         for a in rje_seq.alph_protx[:-1] + ['*']:
             data = {'Att': a}
             for rf in ['RF1', 'RF2', 'RF3', 'RF-1', 'RF-2', 'RF-3']:
                 data['Obs%s' % rf] = rfobs[rf][a]
                 data['Exp%s' % rf] = '%.2f' % rfexp[rf][a]
                 data[rf] = rje.expectString(rfobs[rf][a] / rfexp[rf][a])
             rje.delimitedFileOutput(self, rfile, rfhead, datadict=data)
         for a1 in rje_seq.alph_protx[:-1] + ['*']:
             for a2 in rje_seq.alph_protx[:-1] + ['*']:
                 a = '%s%s' % (a1, a2)
                 data = {'Att': a}
                 for rf in ['RF1', 'RF2', 'RF3', 'RF-1', 'RF-2', 'RF-3']:
                     data['Obs%s' % rf] = rfobs[rf][a]
                     data['Exp%s' % rf] = '%.2f' % rfexp[rf][a]
                     data[rf] = rje.expectString(rfobs[rf][a] /
                                                 rfexp[rf][a])
                 rje.delimitedFileOutput(self, rfile, rfhead, datadict=data)
         self.printLog('#TDT', 'TDT output complete.')
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise  # Delete this if method error not terrible
Example #22
0
 def run(self):  ### Main run method
     '''Main run method.'''
     try:  ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         counter = ['>>']  # List containing count times
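         # Apart from the initial '>>' sentinel and the final 'X', counter entries alternate between a worm ID
         # ('' repeats the current worm) and the time.time() stamp of that count; section [3] unpacks these pairs.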
         menulist = [('F', 'Change output file name', 'outfile', 'OutFile'),
                     ('X', 'Exit', 'return', ''),
                     ('R', 'Run', 'return', '')]
         mchoice = rje_menu.menu(self,
                                 'WormPump Menu',
                                 menulist,
                                 choicetext='Please select:',
                                 changecase=True,
                                 default='R')
         if mchoice == 'X': return
         self.printLog('#OUT',
                       'Output will be to %s' % self.info['OutFile'])
         self.printLog('#START', 'Initialising counter...')
         ### ~ [2] ~ Perform counts ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         wormid = None
         while counter[-1] != 'X':
             if wormid:
                 counter.append(
                     rje.choice(
                         'ID <ENTER> for new worm | X <ENTER> to exit | <ENTER> for "%s" pump count'
                         % wormid,
                         default='').upper())
             else:
                 counter.append(
                     rje.choice(
                         'ID <ENTER> for new worm | X <ENTER> to exit',
                         default='').upper())
             if counter[-1]:
                 wormid = counter[-1]
                 if wormid == 'X': break
                 self.printLog('#WORM', 'Worm "%s"' % wormid)
             counter.append(time.time())
             self.deBug(counter)
         ### ~ [3] ~ Output results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         head = ['Worm', 'Count', 'WormTime', 'AbsTime']
         rje.delimitedFileOutput(self,
                                 self.info['OutFile'],
                                 headers=head,
                                 rje_backup=True)
         wormstart = 0.0
         wormid = None
         wtot = 0
         while counter:
             x = counter.pop(0)
             if x in ['>>', 'X']: continue
             if x:
                 wormid = x
                 wormstart = counter[0]
                 wx = 0
                 wtot += 1
             else:
                 if not wormid: continue
                 wx += 1
             t = counter.pop(0)
             tt = time.localtime(t)
             wdata = {
                 'Worm':
                 wormid,
                 'Count':
                 wx,
                 'WormTime':
                 t - wormstart,
                 #'AbsTime':'%s/%s/%s %s:%s:%s' % (tt[2],tt[1],tt[0],rje.preZero(tt[3],24),rje.preZero(tt[4],60),rje.preZero(tt[5],60))}
                 'AbsTime':
                 '%s:%s:%s' % (rje.preZero(tt[3], 24), rje.preZero(
                     tt[4], 60), rje.preZero(tt[5], 60))
             }
             rje.delimitedFileOutput(self,
                                     self.info['OutFile'],
                                     headers=head,
                                     datadict=wdata)
         self.printLog(
             '#OUT', 'Counts for %d worms output to %s' %
             (wtot, self.info['OutFile']))
         rje.choice('<ENTER> to exit')
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise  # Delete this if method error not terrible
Example #23
0
    def run(self):  ### Main run method
        '''Main run method.'''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            mygo = rje_go.GO(self.log,self.cmd_list)
            mygo.readGO()
            gomap = rje.dataDict(self,self.info['GOMap'],mainkeys=['Ensembl Gene ID'],datakeys=['GO ID'],lists=True)
            self.deBug(rje.sortKeys(gomap)[:100])
            #!# Replace 'Ensembl Gene ID' with commandline parameter at some point #!#
            self.printLog('#GOMAP','Loaded GO mappings for %s sequence IDs' % (rje.integerString(len(gomap))))
            slimocc = rje.dataDict(self,self.info['OccData'],mainkeys=['Motif','Seq','Start_Pos','End_Pos'],datakeys=['Motif','Seq','Start_Pos','End_Pos','Cons','HomNum'])
            self.printLog('#OCC','Loaded Data for %s motif occurrences.' % (rje.integerString(len(slimocc))))
            ## ~ [1a] ~ Sequence mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            seqlist = rje_seq.SeqList(self.log,['accnr=F','seqnr=F']+self.cmd_list)
            seqmap = {}
            (sx,stot) = (0.0,seqlist.seqNum())
            for seq in seqlist.seq:
                self.progLog('#SEQMAP','Mapping sequence IDs: %.1f%%' % (sx/stot)); sx += 100.0
                if rje.matchExp('gene:(\S+)\]',seq.info['Name']): seqmap[seq.shortName()] = rje.matchExp('gene:(\S+)\]',seq.info['Name'])[0]
            self.printLog('\r#SEQMAP','Mapping of %s sequence IDs complete: %s mapped' % (rje.integerString(stot),rje.integerString(len(seqmap))))
            self.deBug(rje.sortKeys(seqmap)[:100])

            ### ~ [2] ~ Output new data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            goocc = {}
            outfile = string.join(string.split(self.info['OccData'],'.')[:-1] + ['slimfungo','tdt'],'.')
            headers = ['GO','Motif','Type','Seq','Start_Pos','End_Pos','Cons','HomNum']
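            # goocc is rebuilt as {GO ID:{Motif:{'fwd'/'rev'/'scram':[occurrence dicts]}}}; a motif is only
            # output for a GO term if its fwd occurrences and at least one rev/scram set both pass MinOcc.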
            for okey in slimocc.keys():
                self.progLog('#NEW','Making new GO occurrences: %s    ' % (rje.integerString(len(slimocc))))
                data = slimocc.pop(okey)
                gene = seq = data['Seq']
                type = 'fwd'
                if string.split(data['Motif'],'_')[-1] in ['rev','scram']:
                    type = string.split(data['Motif'],'_')[-1]
                    data['Motif'] = string.join(string.split(data['Motif'],'_')[:-1],'_')
                motif = data['Motif']      # Motif name with any rev/scram suffix stripped
                if gene not in gomap and gene in seqmap: gene = seqmap[gene]
                golist = []
                if gene in gomap:
                    for id in gomap[gene]: golist += mygo.parents(id)
                else: golist = ['NoGo']
                self.deBug('%s:%s::%s' % (seq,gene,golist))
                for id in rje.sortUnique(golist,False,False):
                    if id not in goocc: goocc[id] = {}
                    if motif not in goocc[id]: goocc[id][motif] = {'fwd':[],'rev':[],'scram':[]}
                    goocc[id][motif][type].append(rje.combineDict({'GO':id,'Type':type},data))
            self.printLog('\r#NEW','Making new GO occurrences complete.    ')

            rje.delimitedFileOutput(self,outfile,headers,rje_backup=True)
            (mx,ox,ix,itot) = (0,0,0.0,len(goocc))
            for id in rje.sortKeys(goocc):
                for motif in rje.sortKeys(goocc[id]):
                    for type in rje.sortKeys(goocc[id][motif]):
                        if len(goocc[id][motif][type]) < self.stat['MinOcc']: goocc[id][motif].pop(type)
                    if len(goocc[id][motif]) < 2 or 'fwd' not in goocc[id][motif]: continue
                    mx += 1
                    for type in goocc[id][motif]:
                        for occ in goocc[id][motif][type]: rje.delimitedFileOutput(self,outfile,headers,datadict=occ); ox += 1
                ix += 100.0; self.progLog('#OUT','Output to %s: %.2f%% :: %s motifs; %s occ.' % (outfile,ix/itot,rje.integerString(mx),rje.integerString(ox)))
            self.printLog('\r#OUT','Output of occurrences to %s is now complete: %s motifs; %s occ.' % (outfile,rje.integerString(mx),rje.integerString(ox)))

        except:
            self.log.errorLog(rje_zen.Zen().wisdom())
            raise   # Delete this if method error not terrible
Example #24
0
class PhosphoSeq(rje.RJE_Object):
    '''
    PhosphoSeq Class. Author: Rich Edwards (2007).

    Info:str
    - PELM = Filename for phosphoELM download [None]
    - PELMFas = Filename for fasta file output of pELM sequences [pelm.fas]
    - PhosBlast = Fasta file of sequences to perform phosBLAST method against pELM [None]
    - PhosRes = Delimited text file containing input sequence, position and evidence [*.phosres.tdt]

    Opt:boolean
    - FilterSeq = Apply rje_seq sequence filters to phosphoELM data [False]
    - UseSpec = Use species codes for determining same species for ID matches [True]
    - PhosDat = Whether to produce a modified UniProt-format file with potential phosphoSites as features [False]

    Stat:numeric
    - IDSim = Percentage identity (GABLAM; phosblast qry) for marking as identity [95.0]
    - HomSim = Percentage identity (GABLAM; phosblast qry) for marking as homologue [40.0]

    List:list

    Dict:dictionary
    - PhosphoSites = Dictionary of {Seq:{Pos:details}}

    Obj:RJE_Objects
    - SeqList = rje_seq.SeqList() object for storing sequences
    - UniProt = rje_uniprot.UniProt() object for storing UniProt data
    '''

    #########################################################################################################################
    ### <1> ### Class Initiation etc.: sets attributes                                                                  #
    #########################################################################################################################
    def _setAttributes(self):  ### Sets Attributes of Object
        '''Sets Attributes of Object.'''
        ### Basics ###
        self.infolist = ['PELM', 'PELMFas', 'PhosBlast', 'PhosRes']
        self.optlist = ['FilterSeq', 'UseSpec', 'PhosDat']
        self.statlist = ['IDSim', 'HomSim']
        self.listlist = []
        self.dictlist = ['PhosphoSites']
        self.objlist = ['SeqList', 'UniProt']
        ### Defaults ###
        self._setDefaults(info='None',
                          opt=False,
                          stat=0.0,
                          obj=None,
                          setlist=True,
                          setdict=True)
        self.setInfo({'PELMFas': 'pelm.fas'})
        self.setStat({'IDSim': 95.0, 'HomSim': 40.0})
        self.setOpt({'UseSpec': True})
#########################################################################################################################

    def _cmdList(self):  ### Sets Attributes from commandline
        '''
        Sets attributes according to commandline parameters:
        - see .__doc__ or run with 'help' option
        '''
        for cmd in self.cmd_list:
            try:
                self._generalCmd(cmd)  ### General Options ###
                ### Class Options ###
                self._cmdReadList(cmd, 'file',
                                  ['PELM', 'PELMFas', 'PhosBlast', 'PhosRes'])
                self._cmdReadList(cmd, 'opt',
                                  ['FilterSeq', 'UseSpec', 'PhosDat'])
                self._cmdReadList(cmd, 'stat', ['IDSim', 'HomSim'])
            except:
                self.log.errorLog('Problem with cmd:%s' % cmd)
#########################################################################################################################
### <3> ### Main Run Methods                                                                                        #
#########################################################################################################################

    def run(self):  ### Main method for standalone functionality
        '''Main method for standalone functionality.'''
        self.readPELM()
        if self.info['PhosBlast'].lower() not in ['', 'none']:
            self.mapPhosByBLAST(self.info['PhosBlast'])
#########################################################################################################################

    def readPELM(
        self
    ):  ### Reads phosphoELM into classes. Extracts UniProt data if available for Species etc.
        '''Reads phosphoELM into classes. Extracts UniProt data if available for Species etc.'''
        try:  ### ~ [1] Setup & Read File into Data Dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            data = rje.dataDict(self,
                                self.info['PELM'],
                                mainkeys=['acc', 'position'])
            seqdict = {}  # Dictionary of Acc:Sequence

            ### ~ [2] Generate PhosphoSites dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            pdict = self.dict['PhosphoSites']
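            # pdict is keyed as {acc:{position:{'aa':code}}}; seqdict caches the full sequence reported for
            # each accession so that mismatches between entries (and between site code and sequence) can be flagged.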
            for dkey in data:
                ## ~ [2a] Basic acc, seq and pos ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                (acc, pos) = string.split(dkey)
                pos = string.atoi(pos)
                if acc not in pdict: pdict[acc] = {}
                if pos not in pdict[acc]: pdict[acc][pos] = {}
                ## ~ [2b] PhosphoELM data with checks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if acc not in seqdict: seqdict[acc] = data[dkey]['sequence']
                elif seqdict[acc] != data[dkey]['sequence']:
                    self.log.printLog(
                        '#ERR', 'Warning. Sequence mismatch for %s' % acc)
                if 'aa' not in pdict[acc][pos]:
                    pdict[acc][pos]['aa'] = data[dkey]['code']
                elif pdict[acc][pos]['aa'] != data[dkey]['code']:
                    self.log.printLog(
                        '#ERR',
                        'Warning. PhosphoSite mismatch for %s at pos %d: %s not %s'
                        %
                        (acc, pos, data[dkey]['code'], pdict[acc][pos]['aa']))
                if data[dkey]['code'] != seqdict[acc][(pos - 1):pos]:
                    self.log.printLog(
                        '#ERR',
                        'Warning. PhosphoSeq mismatch for %s at pos %d: %s not %s'
                        % (acc, pos, data[dkey]['code'],
                           seqdict[acc][pos - 1:pos]))

            ### ~ [3] Make sequence objects and update PhosphoSites keys ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [3a] Setup objects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            acclist = rje.sortKeys(seqdict)
            pelmuni = rje_uniprot.UniProt(self.log,
                                          self.cmd_list)  # UniProt entry
            unidict = pelmuni.accDict(
                acclist)  # Dictionary of {acc:UniProtEntry}
            pelmseq = rje_seq.SeqList(self.log, self.cmd_list +
                                      ['seqin=None'])  # SeqList object
            ## ~ [3b] Add one sequence for each AccNum and update seqdict  ~~~~~~~~~~~~~~~~~~~~~~~~ ##
            #!# Look out for splice variants! (There are some!) - Copy UniProt and change sequence & AccNum #!#
            for acc in acclist:  #!# Make accdict of {acc:Seq} using unidict and seqlist #!#
                sequence = seqdict[acc]
                try:
                    uni = unidict[string.split(acc, '-')[0]]
                    desc = uni.obj['Sequence'].info['Description']
                    name = '%s__%s %s' % (uni.obj['Sequence'].info['ID'], acc,
                                          desc)
                    if sequence != uni.obj['Sequence'].info['Sequence']:
                        self.log.printLog(
                            '#WARNING',
                            'Sequence mismatch for UniProt entry %s' % acc)
                except:
                    self.log.errorLog('Problem with %s' % acc)
                    name = '%s_UNK__%s' % (
                        acc, acc)  #!# Add sequences where UniProt missing #!#
                seqdict[acc] = pelmseq._addSeq(name, sequence)
            ## ~ [3c] Filtering of sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if self.opt['FilterSeq']:
                pelmseq.autoFilter()
                for acc in acclist:
                    if seqdict[acc] not in pelmseq.seq: seqdict.pop(acc)
                acclist = rje.sortKeys(seqdict)
            ## ~ [3d] Save sequences for BLASTing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if not os.path.exists(
                    self.info['PELMFas']
            ) or self.stat['Interactive'] < 0 or rje.yesNo(
                    '%s exists: overwrite?' % self.info['PELMFas']):
                pelmseq.saveFasta(seqfile=self.info['PELMFas'])
            self.obj['SeqList'] = pelmseq
            self.obj['UniProt'] = pelmuni
        except:
            self.log.errorLog('Problem during PhosphoSeq.readPELM')
#########################################################################################################################

    def mapPhosByBLAST(
        self, fasfile
    ):  ### BLAST sequences against phosphoDB, align hits & mark sites (ID & Homology)
        '''BLAST sequences against phosphoDB, align hits and mark phosphosites (ID & Homology).'''
        try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [1a] Setup fasfile ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            scmd = self.cmd_list + [
                'seqin=%s' % fasfile, 'autoload=T', 'autofilter=F'
            ]
            qseqlist = rje_seq.SeqList(self.log, scmd)
            qdict = qseqlist.seqNameDic()
            ## ~ [1b] Setup results files/directories ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            basefile = rje.baseFile(fasfile)
            if self.info['PhosRes'].lower() in ['', 'none']:
                self.info['PhosRes'] = '%s.phosres.tdt' % basefile
            headers = ['Name', 'Pos', 'AA', 'PELM', 'PELMPos', 'Evidence']
            delimit = rje.getDelimit(
                self.cmd_list,
                rje.delimitFromExt(filename=self.info['PhosRes']))
            rje.delimitedFileOutput(self,
                                    self.info['PhosRes'],
                                    headers,
                                    delimit,
                                    rje_backup=True)
            ppath = rje.makePath('PhosALN')
            rje.mkDir(self, ppath)
            ## ~ [1c] Setup BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            pblast = rje_blast.BLASTRun(self.log,
                                        self.cmd_list + ['formatdb=F'])
            pblast.setInfo({
                'Name': '%s.p.blast' % rje.baseFile(fasfile),
                'DBase': self.info['PELMFas'],
                'InFile': fasfile
            })
            pblast.setStat({'HitAln': pblast.stat['OneLine']})
            pblast.opt['Complexity Filter'] = False
            pblast.formatDB(force=False)
            ## ~ [1d] Setup GABLAM Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            gkey = 'GABLAMO ID'  #x# % self.info['GABLAMO Key']
            for g in ['ID', 'Hom']:
                if self.stat['%sSim' % g] < 1.0:
                    self.stat['%sSim' % g] *= 100.0
                self.stat['%sSim' % g] = max(0.0, self.stat['%sSim' % g])

            ### ~ [2] PhosphoBLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            pblast.blast(use_existing=True, log=True)  # BLAST
            pblast.readBLAST(gablam=True)  # Read in
            while pblast.search:
                ## ~ [2a] Align relevant hits from each BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                search = pblast.search.pop(0)
                qseq = qdict[search.info['Name']]
                idlist = []
                qlen = qseq.aaLen()
                hitdict = search.hitSeq(self.obj['SeqList'])
                aln = rje_seq.SeqList(
                    self.log, self.cmd_list + ['autoload=F', 'autofilter=F'])
                aln.seq = [qseq]
                pdict = {}  # Dictionary of {hseq:[poslist]}
                rdict = {qseq: 0}  # Dictionary of {hseq:res}
                for hit in search.hit[0:]:
                    hseq = hitdict[hit]
                    pdict[hseq] = []
                    for pos in rje.sortKeys(
                            self.dict['PhosphoSites'][hseq.info['AccNum']]):
                        pdict[hseq].append(pos)
                    if hit.info['Name'] == search.info['Name']:
                        if qseq.getSequence(case=False,
                                            gaps=False) != hseq.getSequence(
                                                case=False, gaps=False):
                            self.log.errorLog(
                                'Major problem: Search/Hit sequence mismatch for same sequence "%s"'
                                % hit.info['Name'])
                        idlist.append(qseq)
                        pdict[qseq] = pdict.pop(hseq)
                        continue
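                    # GABLAM(O) percentage identity of the query vs this hit decides how the hit is used:
                    # below HomSim it is dropped, at or above IDSim (same species unless usespec=F) its sites
                    # count as identical-protein evidence ('ID'), otherwise as homologue evidence ('Hom').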
                    gdict = hit.globalFromLocal(qlen)
                    qvh = float(100 * gdict['Query'][gkey]) / float(qlen)
                    if qvh < self.stat['HomSim']:
                        pdict.pop(hseq)
                        continue
                    aln.seq.append(hseq)
                    if (qseq.sameSpec(hseq) or not self.opt['UseSpec']
                        ) and qvh >= self.stat['IDSim']:
                        idlist.append(hseq)
                    rdict[hseq] = 0
                aln.muscleAln(
                )  #x#outfile='%s%s.phosaln.fas' % (ppath,qseq.info['AccNum']))
                aln._addSeq('PhosAln', '-' * qseq.seqLen())
                aln.info['Name'] = '%s%s.phosaln.fas' % (ppath,
                                                         qseq.info['AccNum'])
                ## ~ [2b] Map phosphorylations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                print '>>>\n', aln.seq, pdict.keys(), rdict.keys()
                for a in range(qseq.seqLen()):
                    if qseq.info['Sequence'][a] != '-': rdict[qseq] += 1
                    for hseq in pdict:
                        if hseq.info['Sequence'][a] == '-': continue
                        if hseq != qseq: rdict[hseq] += 1
                        if rdict[hseq] in pdict[hseq] and qseq.info['Sequence'][
                                a] == hseq.info['Sequence'][a]:  # Phosphosite
                            pdata = {
                                'Name': search.info['Name'],
                                'Pos': rdict[qseq],
                                'AA': qseq.info['Sequence'][a],
                                'PELM': hseq.shortName(),
                                'PELMPos': rdict[hseq],
                                'Evidence': 'Hom'
                            }
                            if hseq == qseq: pdata['Evidence'] = 'Self'
                            elif hseq in idlist: pdata['Evidence'] = 'ID'
                            rje.delimitedFileOutput(self, self.info['PhosRes'],
                                                    headers, delimit, pdata)
                            self.addPhos(aln.seq[-1], a, pdata['Evidence'])
                ## ~ [2c] Add Scansite/NetPhos if made? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                ## ~ [2d] Save alignment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                aln.saveFasta()
Example #25
0
    def mapPhosByBLAST(
        self, fasfile
    ):  ### BLAST sequences against phosphoDB, align hits & mark sites (ID & Homology)
        '''BLAST sequences against phosphoDB, align hits and mark phosphosites (ID & Homology).'''
        try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [1a] Setup fasfile ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            scmd = self.cmd_list + [
                'seqin=%s' % fasfile, 'autoload=T', 'autofilter=F'
            ]
            qseqlist = rje_seq.SeqList(self.log, scmd)
            qdict = qseqlist.seqNameDic()
            ## ~ [1b] Setup results files/directories ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            basefile = rje.baseFile(fasfile)
            if self.info['PhosRes'].lower() in ['', 'none']:
                self.info['PhosRes'] = '%s.phosres.tdt' % basefile
            headers = ['Name', 'Pos', 'AA', 'PELM', 'PELMPos', 'Evidence']
            delimit = rje.getDelimit(
                self.cmd_list,
                rje.delimitFromExt(filename=self.info['PhosRes']))
            rje.delimitedFileOutput(self,
                                    self.info['PhosRes'],
                                    headers,
                                    delimit,
                                    rje_backup=True)
            ppath = rje.makePath('PhosALN')
            rje.mkDir(self, ppath)
            ## ~ [1c] Setup BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            pblast = rje_blast.BLASTRun(self.log,
                                        self.cmd_list + ['formatdb=F'])
            pblast.setInfo({
                'Name': '%s.p.blast' % rje.baseFile(fasfile),
                'DBase': self.info['PELMFas'],
                'InFile': fasfile
            })
            pblast.setStat({'HitAln': pblast.stat['OneLine']})
            pblast.opt['Complexity Filter'] = False
            pblast.formatDB(force=False)
            ## ~ [1d] Setup GABLAM Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            gkey = 'GABLAMO ID'  #x# % self.info['GABLAMO Key']
            for g in ['ID', 'Hom']:
                if self.stat['%sSim' % g] < 1.0:
                    self.stat['%sSim' % g] *= 100.0
                self.stat['%sSim' % g] = max(0.0, self.stat['%sSim' % g])

            ### ~ [2] PhosphoBLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            pblast.blast(use_existing=True, log=True)  # BLAST
            pblast.readBLAST(gablam=True)  # Read in
            while pblast.search:
                ## ~ [2a] Align relevant hits from each BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                search = pblast.search.pop(0)
                qseq = qdict[search.info['Name']]
                idlist = []
                qlen = qseq.aaLen()
                hitdict = search.hitSeq(self.obj['SeqList'])
                aln = rje_seq.SeqList(
                    self.log, self.cmd_list + ['autoload=F', 'autofilter=F'])
                aln.seq = [qseq]
                pdict = {}  # Dictionary of {hseq:[poslist]}
                rdict = {qseq: 0}  # Dictionary of {hseq:res}
                for hit in search.hit[0:]:
                    hseq = hitdict[hit]
                    pdict[hseq] = []
                    for pos in rje.sortKeys(
                            self.dict['PhosphoSites'][hseq.info['AccNum']]):
                        pdict[hseq].append(pos)
                    if hit.info['Name'] == search.info['Name']:
                        if qseq.getSequence(case=False,
                                            gaps=False) != hseq.getSequence(
                                                case=False, gaps=False):
                            self.log.errorLog(
                                'Major problem: Search/Hit sequence mismatch for same sequence "%s"'
                                % hit.info['Name'])
                        idlist.append(qseq)
                        pdict[qseq] = pdict.pop(hseq)
                        continue
                    gdict = hit.globalFromLocal(qlen)
                    qvh = float(100 * gdict['Query'][gkey]) / float(qlen)
                    if qvh < self.stat['HomSim']:
                        pdict.pop(hseq)
                        continue
                    aln.seq.append(hseq)
                    if (qseq.sameSpec(hseq) or not self.opt['UseSpec']
                        ) and qvh >= self.stat['IDSim']:
                        idlist.append(hseq)
                    rdict[hseq] = 0
Example #26
0
 def uniFake(
     self,
     seqs=[],
     store=False
 ):  ### Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs.
     '''Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs given.'''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         unifake = string.split(string.join(self.list['UniFake']).lower())
         seqlist = self.obj['SeqList']
         if seqs: seqlist.seq = seqs
         else: seqs = seqlist.seq
         (sx, seqnum) = (0, seqlist.seqNum())
         ## ~ [1b] Setup UniProt object and output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         uniprot = rje_uniprot.UniProt(
             self.log, self.cmd_list)  # UniProt object for saving data
         if self.info['DatOut'].lower() in ['', 'none']:
             self.info['DatOut'] = rje.baseFile(
                 seqlist.info['Name']) + '.dat'
         datfile = self.info['DatOut']
         if os.path.exists(datfile): rje.backup(self, datfile)
         if store: seqlist.obj['UniProt'] = uniprot
         ## ~ [1c] Setup RJE_HMM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if 'pfam' in unifake:
             hmm = rje_hmm.HMMRun(self.log, self.cmd_list + ['force=T'])
             hmmfile = '%s.pfam.tdt' % rje.baseFile(datfile)
             if os.path.exists(hmmfile): rje.backup(self, hmmfile)
             hmm.list['HMM'] = [self.info['PFam']]
             hmm.opt['HMMPFam'] = True
         else:
             hmm = None
         ## ~ [1d] Setup RJE_TM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if 'signalp' in unifake: tm = rje_tm.TM(self.log, self.cmd_list)
         else: tm = None
         ### ~ [2] ~ Perform UniFake processing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
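         # For each sequence: write a temporary fasta file, collect aliases and features, then run whichever
         # of the disorder (IUPred), PFam HMM, TMHMM and SignalP annotations are requested in self.list['UniFake'],
         # adding the predictions as UniProt-style features (FT) and comments (CC) for the output DAT entry.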
         for seq in seqs:
             sx += 1
             name = seq.shortName()
             self.printLog(
                 '#SEQ', 'Processing %s (%s aa) %s...' %
                 (seq.shortName(), rje.integerString(
                     seq.aaLen()), seq.info['Description'][:50]))
             try:
                 ## ~ [2a] ~ Basic data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 utmp = 'tmp%s.%s' % (rje.randomString(5),
                                      seq.info['AccNum'])
                 open('%s.fas' % utmp, 'w').write(
                     '>%s\n%s\n' % (seq.shortName(), seq.info['Sequence']))
                 udata = {
                     'CC': ['-!- Features generated using unifake.py'],
                     'AC': []
                 }
                 if seq.info['SpecCode'] in ['Unknown', 'UNK']:
                     seq.info['SpecCode'] = self.info['SPCode']
                 #x#elif seq.info['Species'] != 'None': udata['OS'] = [seq.info['Species']]     #!# Check how well this works. Add spectable? #!#
                 ## ~ [2b] ~ Aliases ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if self.opt['EnsDat'] and rje.matchExp(
                         '\[acc:(\S+) pep:(\S+) gene:(\S+)\]',
                         seq.info['Name']):
                     details = rje.matchExp(
                         '\[acc:(\S+) pep:(\S+) gene:(\S+)\]',
                         seq.info['Name'])
                     self.addAlias(seq.info['AccNum'], details[0])
                     self.addAlias(seq.info['AccNum'], details[1])
                     self.addAlias(seq.info['AccNum'], details[2])
                     udata['GN'] = [details[2]]
                 for id in [seq.shortName(), seq.info['AccNum']]:
                     if id in self.dict['Aliases']:
                         udata['AC'].append(
                             '%s;' %
                             string.join(self.dict['Aliases'][id], '; '))
                 ## ~ [2c] ~ Features ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 ft = []  # List of features for sequence
                 for id in [
                         seq.shortName(), seq.info['AccNum'], seq.info['ID']
                 ]:
                     if id in self.dict['Features']:
                         ft += self.dict['Features'][id]
                 ## ~ [2d] IUPRED disorder prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if 'disorder' in self.list['UniFake']:
                     try:
                         seq.disorder()
                         dis = seq.obj['Disorder']
                         for disorder in seq.obj['Disorder'].list[
                                 'RegionDisorder']:
                             ft.append({
                                 'Type':
                                 'DISORDER',
                                 'Desc':
                                 'Predicted disorder: %s' %
                                 seq.obj['Disorder'].info['Disorder'],
                                 'Start':
                                 disorder[0],
                                 'End':
                                 disorder[1]
                             })
                             if dis.info['Disorder'].lower() == 'iupred':
                                 ft[-1]['Desc'] = '%s > %.2f' % (
                                     ft[-1]['Desc'], dis.stat['IUCut'])
                         for fold in seq.obj['Disorder'].list['RegionFold']:
                             ft.append({
                                 'Type':
                                 'ORDER',
                                 'Desc':
                                 'Predicted order: %s' %
                                 seq.obj['Disorder'].info['Disorder'],
                                 'Start':
                                 fold[0],
                                 'End':
                                 fold[1]
                             })
                             if dis.info['Disorder'].lower() == 'iupred':
                                 ft[-1]['Desc'] = '%s <= %.2f' % (
                                     ft[-1]['Desc'], dis.stat['IUCut'])
                     except:
                         self.log.errorLog(
                             'UniFake disorder problem for %s.' % name)
                 ## ~ [2e] PFam HMM domain prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if hmm:
                     try:
                         hmm.setInfo({
                             'SearchDB': '%s.fas' % utmp,
                             'HMMOut': '%s.hmm.out' % utmp
                         })  # This will be made for each sequence
                         hmm.search = []
                         hmm.list['HMMRes'] = [
                             hmm.hmmSearch(self.info['PFam'],
                                           outfile=hmm.info['HMMOut'])
                         ]  # Used in hmmTable
                         hmm.hmmTable(outfile=hmmfile, append=True)
                         if 'disorder' in self.list['UniFake']:
                             disorder = seq.obj['Disorder'].list[
                                 'ResidueDisorder']  # individual (IUPRed) residue results
                         else:
                             disorder = []
                         if hmm.search:
                             udata['CC'].append(
                                 'PFam: HMMer PFam search vs %s (Modified %s)'
                                 %
                                 (self.info['PFam'],
                                  time.ctime(
                                      os.path.getmtime(self.info['PFam']))))
                         else:
                             udata['CC'].append(
                                 '-!- ERROR: PFam HMMer Search failure!')
                             out = {'Type': '!ERROR!', 'Name': name}
                             rje.delimitedFileOutput(
                                 self,
                                 hmmfile, [
                                     'Type', 'Name', 'Start', 'End', 'Eval',
                                     'Score'
                                 ],
                                 datadict=out)
                         for search in hmm.search:
                             for hit in search.hit:
                                 for aln in hit.aln:
                                     pfamft = {'Start': aln.stat['SbjStart'], 'End': aln.stat['SbjEnd'], 'Type': 'PFAM',
                                               'Desc': '%s PFam HMM Eval: %.2e; Score: %.1f' % (search.info['Name'], aln.stat['Expect'], aln.stat['BitScore'])}
                                     if disorder:
                                         region = disorder[aln.stat['SbjStart'] - 1:aln.stat['SbjEnd']]
                                         hmmdisorder = float(sum(region)) / len(region)
                                         pfamft['Desc'] = '%s; IUPRed: %.2f' % (pfamft['Desc'], hmmdisorder)
                                         if hmmdisorder < self.stat['DisDom']:
                                             pfamft['Type'] = 'DOMAIN'
                                     ft.append(pfamft)
                     except:
                         self.log.errorLog(
                             'UniFake PFam HMM problem for %s.' % name)
                 ## ~ [2f] TMHMM transmembrane topology prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if 'tmhmm' in unifake:
                     try:
                         tmdat = os.popen(
                             '%s %s.fas -short' %
                             (self.info['TMHMM'], utmp)).readlines()
                         domlist = rje_tm.domainList(
                             rje_tm.parseTMHMM(tmdat[0]))
                         for tmdom in domlist:
                             ft.append(tmdom)
                             ft[-1]['Desc'] = 'TMHMM topology prediction'
                             ft[-1]['Start'] = string.atoi(ft[-1]['Start'])
                             ft[-1]['End'] = string.atoi(ft[-1]['End'])
                         if len(domlist) > 1:
                             udata['CC'].append(
                                 'TMHMM: %d TM domains; N-Term %s' %
                                 ((len(domlist) - 1) / 2,
                                  domlist[0]['Type']))
                         else:
                             udata['CC'].append('TMHMM: 0 TM domains')
                     except:
                         self.log.errorLog('UniFake TMHMM problem for %s.' %
                                           name)
                 ## ~ [2g] SIGNALP signal peptide prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if 'signalp' in unifake:
                     try:
                         os.system(
                             '%s -f short -t euk %s.fas > %s.signalp' %
                             (self.info['SignalP'], utmp, utmp))
                         tm.signalp = {}
                         tm.parseSignalP('%s.signalp' % utmp)
                         sigp = tm.signalp.pop(seq.shortName())
                         cpos = 0
                         if sigp['nn_ymax?'] == 'Y':
                             cpos = string.atoi(sigp['nn_ymaxpos'])
                             desc = 'SignalP NN prediction'
                         if sigp['hmm_cmax?'] == 'Y':
                             hmm_c = string.atoi(sigp['hmm_cmaxpos'])
                             if cpos == 0:
                                 cpos = hmm_c
                                 desc = 'SignalP HMM prediction'
                             else:
                                 if hmm_c < cpos:
                                     cpos = hmm_c
                                     desc = 'SignalP HMM prediction (NN also Y)'
                                 else:
                                     desc += ' (HMM also Y)'
                         if cpos > 0:
                             ft.append({
                                 'Type': 'SIGNALP',
                                 'Desc': desc,
                                 'Start': 1,
                                 'End': cpos
                             })
                     except:
                         self.log.errorLog(
                             'UniFake SignalP problem for %s.' % name)
                 ## ~ [2h] Convert to UniProt and save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 self.addRealUniProt(seq, udata, ft)
                 self.deBug(ft)
                 if not store: uniprot.list['Entry'] = []
                 if uniprot.addFromSeq(
                         seq, data=udata,
                         ft=ft):  ### Converts into UniProtEntry object
                     if not store: uniprot.saveUniProt(datfile, append=True)
                     #x#open(self.info['DatPickup'],'a').write('%s\n' % seq.shortName())
             ## ~ [2i] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             except:
                 self.log.errorLog('Problem during UniFake(%s)' % name)
             for tmp in glob.glob('%s*' % utmp):
                 os.unlink(tmp)
             self.printLog(
                 '#UNIFAKE',
                 '|---------- %s run <<<|>>> %s to go -----------|' %
                 (rje.integerString(sx), rje.integerString(seqnum - sx)),
                 log=False)
         if store: uniprot.saveUniProt(datfile, append=False)
         if self.opt['CleanUp']:
             for tmp in glob.glob('TMHMM*'):
                 if os.path.isdir(tmp): os.rmdir(tmp)
     except:
         self.errorLog(
             'Oh, the shame of it! Trouble during UniFake.uniFake()')
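A minimal standalone sketch of the cleavage-site choice made in the SignalP step above: prefer the NN call, fall back to the HMM call when NN is negative, and keep the smaller cleavage position when both are positive. The sigp dictionary of short-format fields is a hypothetical stand-in for the parsed output, not the rje_tm parser.

def pick_signal_peptide(sigp):
    '''Return (cleavage_position, description), or (0, None) if no signal peptide is called.'''
    cpos = 0; desc = None
    if sigp.get('nn_ymax?') == 'Y':
        cpos = int(sigp['nn_ymaxpos']); desc = 'SignalP NN prediction'
    if sigp.get('hmm_cmax?') == 'Y':
        hmm_c = int(sigp['hmm_cmaxpos'])
        if cpos == 0: cpos = hmm_c; desc = 'SignalP HMM prediction'
        elif hmm_c < cpos: cpos = hmm_c; desc = 'SignalP HMM prediction (NN also Y)'
        else: desc += ' (HMM also Y)'
    return (cpos, desc)

print(pick_signal_peptide({'nn_ymax?': 'Y', 'nn_ymaxpos': '22', 'hmm_cmax?': 'Y', 'hmm_cmaxpos': '20'}))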
Example #27
0
    def picsi(self):    ### Cleans up cross-species search results
        '''Cleans up cross-species search results.'''
        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            datafile = self.info['SumFile']
            delimit = rje.delimitFromExt(filename=self.info['SumFile'])
            data = {}       # search:{hit:{???}}
            pep2prot = {}   # search:{peptide:[hits]}
            id2prot = {}    # search:{id:hit}
            prot2desc = {}
            fullpeplist = {}    
            pepcon = {}     # Convert pep:longer pep
            speclist = []   # List of species codes
            ### ~ [1] Read Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            indata = rje.dataDict(self,datafile,['search','prot_hit_num'],'All',lists=True)
            for ikey in rje.sortKeys(indata):
                (search,id) = string.split(ikey,delimit)
                prot = indata[ikey]['prot_acc'][0]
                desc = string.replace(indata[ikey]['prot_desc'][0],'Full=','')
                if desc[3:7] == 'Name': desc = desc[9:]
                prot2desc[prot] = desc; self.printLog('#DESC','%s = %s' % (prot,desc))
                indata[ikey]['pep_seq'] = string.join(indata[ikey]['pep_seq'],'|')
                pepconv = string.replace(indata[ikey]['pep_seq'],'I','L')
                pepconv = string.replace(pepconv,'Q','K')
                peplist = rje.sortUnique(string.split(pepconv,'|'))
                indata[ikey]['pep_seq'] = string.join(rje.sortUnique(string.split(indata[ikey]['pep_seq'],'|')),'|')
                if search not in data:
                    data[search] = {}
                    pep2prot[search] = {}
                    id2prot[search] = {}
                    fullpeplist[search] = []
                    pepcon[search] = {}
                fullpeplist[search] += peplist
                id2prot[search][id] = prot
                spec = string.split(prot,'_')[1]
                if spec not in speclist: speclist.append(spec)
                data[search][prot] = {'search':search,'pepcount':len(peplist),'hit':id,'desc':desc,'spec':spec,
                                      'pep_uniq':0,'peplist':indata[ikey]['pep_seq'],'conpep':peplist[0:],
                                      'pep_rem':0}
                try: data[search][prot]['accnum'] = self.dict['Acc2Seq'][prot].info['AccNum']
                except: data[search][prot]['accnum'] = string.split(prot,'__')[-1]
                for pep in peplist:
                    if pep not in pep2prot[search]:
                        pep2prot[search][pep] = []
                    pep2prot[search][pep].append(prot)
            ## ~ [1a] Convert peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            for search in fullpeplist:
                fullpeplist[search] = rje.sortUnique(fullpeplist[search])
                for pep in fullpeplist[search][0:]:
                    for pep2 in fullpeplist[search]:
                        if pep != pep2 and pep in pep2:
                            pepcon[search][pep] = pep2
                            fullpeplist[search].remove(pep)
                            break
                for pep in pepcon[search]:
                    while pepcon[search][pep] in pepcon[search]: pepcon[search][pep] = pepcon[search][pepcon[search][pep]]
                self.printLog('#PEP','%s %s peptide conversions' % (len(pepcon[search]),search))
                #self.deBug(pepcon[search])
                #self.deBug(rje.sortKeys(pep2prot[search]))
                pp = 0; pm = 0
                for prot in data[search]:
                    for pep in data[search][prot]['conpep'][0:]:
                        if pep in pepcon[search]:
                            newpep = pepcon[search][pep]
                            if newpep not in data[search][prot]['conpep']: data[search][prot]['conpep'].append(newpep); pp += 1
                            data[search][prot]['conpep'].remove(pep); pm += 1
                            if prot not in pep2prot[search][newpep]: pep2prot[search][newpep].append(prot)
                            if pep in pep2prot[search]: pep2prot[search].pop(pep)
                    data[search][prot]['pep_con'] = len(data[search][prot]['conpep'])
                self.printLog('#PEP','%s %s converted peptides added; %s removed' % (pp,search,pm))
            ### ~ [2] Calculate Unique/Redundancy status ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for search in pep2prot:
            ## ~ [2a] Species Redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                remx = 0
                for prot in data[search]:
                    if data[search][prot]['spec'] != self.info['QrySpec']: continue
                    for pep in data[search][prot]['conpep']:
                        for prot2 in pep2prot[search][pep][0:]:
                            if data[search][prot2]['spec'] == self.info['QrySpec']: continue
                            pep2prot[search][pep].remove(prot2)
                            data[search][prot2]['conpep'].remove(pep)
                            data[search][prot2]['pep_rem'] += 1; remx += 1
                self.printLog('#REM','%s %s peptides removed from non-%s hits' % (rje.integerString(remx),search,self.info['QrySpec']))
            ## ~ [2b] One-hit wonders ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                for prot in data[search]:
                    if len(data[search][prot]['conpep']) < 2:
                        for pep in data[search][prot]['conpep']:
                            #if pep in pep2prot[search] and prot in pep2prot[search][pep]:
                            pep2prot[search][pep].remove(prot)
            ## ~ [2c] Unique peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                ux = 0
                for pep in pep2prot[search]:
                    #self.deBug(pep)
                    if len(pep2prot[search][pep]) == 1: data[search][pep2prot[search][pep][0]]['pep_uniq'] += 1; ux += 1
                self.printLog('#UNIQ','%s unique %s peptides' % (rje.integerString(ux),search))
            ## ~ [2d] Total Redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                summary = {'HITS':len(data[search]),'REJECT':0,'UNIQUE':0,'NR':0,'REDUNDANT':0}
                rx = 0
                for prot in data[search]:
                    #if data[search][prot]['unique']: data[search][prot]['red'] = False; continue
                    data[search][prot]['pep_red'] = 0   # Redundant peptides found in proteins with unique peptides
                    data[search][prot]['pep_nr'] = 0    # Redundant peptides found only in proteins without unique peptides
                    for pep in data[search][prot]['conpep']:
                        if pep2prot[search][pep] == [prot]: continue
                        upep = False
                        for prot2 in pep2prot[search][pep]:
                            if data[search][prot2]['pep_uniq']: upep = True; break
                        if upep: data[search][prot]['pep_red'] += 1     # Redundant peptide found in unique protein
                        else: data[search][prot]['pep_nr'] += 1         # Redundant peptide NOT found in unique protein
                    if len(data[search][prot]['conpep']) < 2: data[search][prot]['class'] = 'REJECT'; rx += 1
                    elif data[search][prot]['pep_uniq']: data[search][prot]['class'] = 'UNIQUE'
                    elif data[search][prot]['pep_nr']: data[search][prot]['class'] = 'NR'
                    else: data[search][prot]['class'] = 'REDUNDANT'; rx += 1
                    summary[data[search][prot]['class']] += 1
                self.printLog('#REJ','%s rejected %s hits' % (rje.integerString(rx),search))
                for x in rje.sortKeys(summary): self.printLog('#%s' % search,'%s %s' % (summary[x],x))

            ### ~ [3] Species ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            speclist.sort()
            species = {}
            for spec in speclist:
                try:
                    grep = os.popen('grep %s %s' % (spec,self.info['SpecTDT'])).read()
                    species[spec] = string.split(grep,':')[-4]
                    self.printLog('#SPEC','%s = %s' % (spec,species[spec]))
                except: species[spec] = '?'

            ### ~ [END] Output data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            outfile = '%s.clean.tdt' % rje.baseFile(self.info['SumFile'])
            headers = ['search','hit','class','accnum','spec','species','desc','pepcount','pep_con','pep_rem','pep_uniq','pep_nr','pep_red','peplist','conpep']
            if self.dict['Acc2Seq']: headers.insert(3,'cluster')
            rje.delimitedFileOutput(self,outfile,headers,datadict={},rje_backup=True)
            for search in rje.sortKeys(data):
                if self.dict['Acc2Seq']: self.clusterGoodSeq(search,data[search])
                for prot in rje.sortKeys(data[search]):
                    if rje.matchExp('^gi:(\d+).+\[(\S.+\S)\]$',data[search][prot]['desc']):
                        data[search][prot]['species'] = rje.matchExp('^gi:(\d+).+\[(\S.+\S)\]$',data[search][prot]['desc'])[1]
                    else: data[search][prot]['species'] = species[data[search][prot]['spec']]                                                                               
                    rje.delimitedFileOutput(self,outfile,headers,datadict=data[search][prot])
                                
        except: self.errorLog('Errg')
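A minimal standalone sketch of the peptide-collapsing step in picsi() above: peptides are first made ambiguity-tolerant (I treated as L, Q treated as K) and any peptide contained within a longer one is mapped onto that longer peptide, following chains to the end. Plain Python only; the function name and test peptides are illustrative, not the rje helpers.

def collapse_peptides(peptides):
    '''Map each converted peptide onto the longest peptide that contains it.'''
    conv = sorted(set(p.replace('I', 'L').replace('Q', 'K') for p in peptides), key=len)
    mapping = {}
    for i, pep in enumerate(conv):
        for longer in conv[i + 1:]:
            if pep in longer:
                mapping[pep] = longer
                break
    for pep in mapping:  # Follow chains so every peptide maps to one that is not itself subsumed
        while mapping[pep] in mapping:
            mapping[pep] = mapping[mapping[pep]]
    return mapping

print(collapse_peptides(['PEPTIDE', 'PEPT', 'TIDEK', 'EPTIDEK']))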
Example #28
0
 def rfAtt(self):      ### Tabulates reading frame amino acid and dipeptide frequencies
     '''
     Counts amino acid and dipeptide frequencies in all six reading frames of the loaded DNA sequences and outputs
     observed and expected counts, plus their ratio, to a *.rf.tdt table.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         rfhead = ['Att','RF1','RF2','RF3','RF-1','RF-2','RF-3','ObsRF1','ObsRF2','ObsRF3','ObsRF-1','ObsRF-2','ObsRF-3','ExpRF1','ExpRF2','ExpRF3','ExpRF-1','ExpRF-2','ExpRF-3']
         rfdata = {}; rfobs = {}; rfexp = {}; ntfreq = {}
         for rf in ['RF1','RF2','RF3','RF-1','RF-2','RF-3']:
             rfdata[rf] = {}; rfobs[rf] = {}; rfexp[rf] = {}
             for x in rje_seq.alph_protx[:-1] + ['*']: rfdata[rf][x] = 0; rfobs[rf][x] = 0; rfexp[rf][x] = 0
             for a1 in rje_seq.alph_protx[:-1] + ['*']:
                 for a2 in rje_seq.alph_protx[:-1] + ['*']: rfdata[rf]['%s%s' % (a1,a2)] = 0; rfobs[rf]['%s%s' % (a1,a2)] = 0; rfexp[rf]['%s%s' % (a1,a2)] = 0
         for x in rje_seq.alph_dna[:-1]: ntfreq[x] = 0
         seqlist = self.obj['SeqList'] 
         ### ~ [2] Count sequence attributes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         (sx,stot) = (0.0,seqlist.seqNum())
         for seq in seqlist.seq:
             self.progLog('\r#ATT','Counting sequence attributes: %.2f%%' % (sx/stot)); sx += 100.0
             for x in seq.info['Sequence']:
                 if x in ntfreq: ntfreq[x] += 1
             rf6 = rje_sequence.sixFrameTranslation(seq.info['Sequence'])
             for r in rf6:
                 rseq = rf6[r]
                 rf = 'RF%d' % r
                 for i in range(len(rseq)):
                     a = rseq[i]; dia = rseq[i:i+2]
                     if a in rfdata[rf]: rfdata[rf][a] += 1
                     if dia in rfdata[rf]: rfdata[rf][dia] += 1
         self.printLog('\r#ATT','Counting sequence attributes complete.')
         ### ~ [3] Calculate Observed & Expected ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ntobs = rje.dictFreq(ntfreq,total=True,newdict=True)
         ntcomp = {'Total':ntobs['Total']}
         for xy in ['AT','GC']: ntcomp[xy[0]] = ntobs[xy[1]]; ntcomp[xy[1]] = ntobs[xy[0]]
         for rf in ['RF1','RF2','RF3','RF-1','RF-2','RF-3']:
             aafreq = {}
             for a in rje_seq.alph_protx[:-1] + ['*']: aafreq[a] = rfdata[rf][a]
             aafreq = rje.dictFreq(aafreq,total=True,newdict=True)
             for a in rje_seq.alph_protx[:-1] + ['*']: rfobs[rf][a] = rfdata[rf][a]; rfexp[rf][a] = 0
             for n1 in 'GATC':
                 for n2 in 'GATC':
                     for n3 in 'GATC':
                         codon = '%s%s%s' % (n1, n2, n3)
                         aa = rje_sequence.dna2prot(codon)
                         if rf[-2] == '-': rfexp[rf][aa] += (int(ntobs['Total']/3.0) * ntcomp[n1] * ntcomp[n2] * ntcomp[n3])
                         else: rfexp[rf][aa] += (int(ntobs['Total']/3.0) * ntobs[n1] * ntobs[n2] * ntobs[n3])
                         #self.deBug('%s: %s x %s x %s x %s' % (aa,(ntobs['Total'] - 2), rfobs[rf][n1], rfobs[rf][n2], rfobs[rf][n3]))
                         #self.deBug('%s: %s' % (aa,rfexp[rf][aa]))
             for a1 in rje_seq.alph_protx[:-1] + ['*']:
                 for a2 in rje_seq.alph_protx[:-1] + ['*']:
                     rfexp[rf]['%s%s' % (a1,a2)] = (aafreq['Total'] - 1) * aafreq[a1] * aafreq[a2]
                     rfobs[rf]['%s%s' % (a1,a2)] = rfdata[rf]['%s%s' % (a1,a2)] 
         ### ~ [4] Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         rfile = rje.baseFile(seqlist.info['Name']) + '.rf.tdt'
         rje.delimitedFileOutput(self,rfile,rfhead,rje_backup=True)
         for a in rje_seq.alph_protx[:-1] + ['*']:
             data = {'Att':a}
             for rf in ['RF1','RF2','RF3','RF-1','RF-2','RF-3']:
                 data['Obs%s' % rf] = rfobs[rf][a]
                 data['Exp%s' % rf] = '%.2f' % rfexp[rf][a]
                 data[rf] = rje.expectString(rfobs[rf][a] / rfexp[rf][a])
             rje.delimitedFileOutput(self,rfile,rfhead,datadict=data)
         for a1 in rje_seq.alph_protx[:-1] + ['*']:
             for a2 in rje_seq.alph_protx[:-1] + ['*']:
                 a = '%s%s' % (a1,a2)
                 data = {'Att':a}
                 for rf in ['RF1','RF2','RF3','RF-1','RF-2','RF-3']:
                     data['Obs%s' % rf] = rfobs[rf][a]
                     data['Exp%s' % rf] = '%.2f' % rfexp[rf][a]
                     data[rf] = rje.expectString(rfobs[rf][a] / rfexp[rf][a])
                 rje.delimitedFileOutput(self,rfile,rfhead,datadict=data)
         self.printLog('#TDT','TDT output complete.')
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise   # Delete this if method error not terrible
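A minimal standalone sketch of the dipeptide expectation used in step [3] above: given single-residue frequencies, the expected count of dipeptide a1a2 in a sequence of N residues is (N - 1) * freq(a1) * freq(a2), which can then be compared with the observed dipeptide count. Plain Python with an invented test sequence; rje.dictFreq is replaced by explicit arithmetic.

from collections import Counter

def dipeptide_obs_exp(protein):
    '''Return {dipeptide: (observed, expected)} for a single protein sequence.'''
    n = len(protein)
    aafreq = dict((a, float(c) / n) for a, c in Counter(protein).items())
    obs = Counter(protein[i:i + 2] for i in range(n - 1))
    return dict((a1 + a2, (obs.get(a1 + a2, 0), (n - 1) * aafreq[a1] * aafreq[a2]))
                for a1 in aafreq for a2 in aafreq)

for dia, (o, e) in sorted(dipeptide_obs_exp('MKKLLPKMKK').items()):
    if o: print('%s obs=%d exp=%.2f' % (dia, o, e))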
Example #29
0
 def tabulatePPIRegion(
         self):  ### Tabulates regions of known PPI from DAT file
     '''Tabulates regions of known PPI from DAT file.'''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         tabfile = 'ppi_region.tdt'
         unifile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/UniFake/Human/ens_HUMAN.unifake.dat'
         if os.path.exists(tabfile) and not self.opt['Force']:
             return self.printLog('#REGTAB',
                                  '%s found. (Force=F)' % tabfile)
         headers = ['Protein', 'Start', 'End', 'Interactor']
         rje.delimitedFileOutput(self, tabfile, headers, rje_backup=True)
         ### ~ [2] Extract and tabulate data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         gcmd = "grep -P '(ID   |REGION)' %s | grep -P '(HUMAN|interact)' -i | grep REGION -B 1" % unifile
         self.printLog('#GREP', gcmd)
         prot = None
         rx = 0
         plist = []
         ilist = []
         for gline in os.popen(gcmd).readlines():
             if rje.matchExp('ID   (\S+)', gline):
                 prot = rje.matchExp('ID   (\S+)', gline)[0]
             if rje.matchExp(
                     'FT   REGION\s+(\d+)\s+(\d+).+nteract\S+ with (\S.+)',
                     gline):
                 (rstart, rend, rint) = rje.matchExp(
                     'FT   REGION\s+(\d+)\s+(\d+).+nteract\S+ with (\S.+)',
                     gline)
                 for ppi in string.split(rint):
                     if rje.matchExp('^([A-Z0-9][A-Z0-9]+)', ppi):
                         datadict = {
                             'Protein': prot,
                             'Start': rstart,
                             'End': rend,
                             'Interactor': rje.matchExp('^([A-Z0-9][A-Z0-9]+)', ppi)[0]
                         }
                         rje.delimitedFileOutput(self,
                                                 tabfile,
                                                 headers,
                                                 datadict=datadict)
                         rx += 1
                         if prot not in plist: plist.append(prot)
                         if datadict['Interactor'] not in ilist:
                             ilist.append(datadict['Interactor'])
                         self.progLog(
                             '\r#REGTAB',
                             'Tabulating regions: %s proteins; %s interactors; %s regions'
                             % (rje.integerString(
                                 len(plist)), rje.integerString(
                                     len(ilist)), rje.integerString(rx)))
         self.printLog(
             '\r#REGTAB',
             'Tabulated regions (%s proteins; %s interactors; %s regions) => %s'
             % (rje.integerString(len(plist)), rje.integerString(
                 len(ilist)), rje.integerString(rx), tabfile))
         return True
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise  # Delete this if method error not terrible
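A minimal standalone sketch of the region extraction above: remember the current ID line and pull start, end and interactor out of FT REGION lines with the same regular expressions, using re.search in place of rje.matchExp. The two example DAT lines are invented for illustration rather than grepped from the real ens_HUMAN.unifake.dat.

import re

def parse_ppi_regions(lines):
    '''Yield (protein, start, end, interactor) tuples from UniProt-style DAT lines.'''
    prot = None
    for line in lines:
        m = re.match(r'ID   (\S+)', line)
        if m: prot = m.group(1)
        m = re.search(r'FT   REGION\s+(\d+)\s+(\d+).+nteract\S+ with (\S.+)', line)
        if m:
            start, end, partners = m.groups()
            for ppi in partners.split():
                hit = re.match(r'^([A-Z0-9][A-Z0-9]+)', ppi)
                if hit: yield (prot, int(start), int(end), hit.group(1))

example = ['ID   TEST_HUMAN              Reviewed;         100 AA.',
           'FT   REGION        10     40       Interaction with BRCA1.']
print(list(parse_ppi_regions(example)))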
Example #30
0
 def run(self):  ### Main run method
     '''Main run method.'''
     try:### ~ [1] Reformat Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for fasta in glob.glob('*.fasta'):
             fas = fasta[:-2]
             if os.path.exists(fas): continue
             sx = 0
             for line in open(fasta,'r').readlines():
                 if line[:1] == '>':
                     try: (name,desc) = rje.matchExp('^>(\S+) (\S.+)$',line)
                     except: name = rje.matchExp('^>(\S+)',line)[0]
                     if len(string.split(name,'|')) == 3:
                         name = '6rf_NEIME__%s' % string.split(name,'|')[2]
                         open(fas,'a').write('>%s\n' % name)
                     elif len(string.split(name,'|')) == 5:
                         name = 'ref_NEIME__%s' % string.split(name,'|')[3]
                         open(fas,'a').write('>%s %s\n' % (name,desc))
                     else: print string.split(name,'|'); raise ValueError
                     self.progLog('\r#FAS','Processing %s: %s seqs' % (fas, rje.integerString(sx))); sx += 1
                 else: open(fas,'a').write(line)
             self.printLog('\r#FAS','Processed %s: %s seqs from %s' % (fas, rje.integerString(sx), fasta))
             rje_blast.BLASTRun(self.log,self.cmd_list).formatDB(fas,protein=True,force=True)
         ### ~ [2] Read in CSV Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         rfhits = {}     # Dictionary of {hit:['File:hit_num']}
         acc = 'MC58_6RF_Hits.acc'; open(acc,'w')
         gfile = 'MC58_6RF_Hits.vs.MC58_1.hitsum.tdt'
         cx = 0
         for csv in glob.glob('MC58_6RF_CSV/*.CSV'):
             cx += 1
             file = os.path.basename(csv)[:-4]
             hits = False
             for line in open(csv,'r').readlines():
                 if line.find('prot_hit_num,prot_acc') == 0: hits = True
                 elif hits:
                     data = rje.readDelimit(line,',')
                     if len(data) < 2: continue
                     [num,name] = data[:2]
                     try: name = string.split(name,'|')[2]
                     except: continue
                     if name not in rfhits:
                         open(acc,'a').write('6rf_NEIME__%s\n' % name)
                         rfhits[name] = []
                     id = '%s:%s' % (file,num)
                     if id not in rfhits[name]: rfhits[name].append(id)
                     self.progLog('\r#CSV','Reading %d CSV files: %s 6RF Hits' % (cx,rje.integerString(len(rfhits))))
         self.printLog('\r#CSV','Read %d CSV files: %s 6RF Hits output to %s' % (cx,rje.integerString(len(rfhits)),acc))
         ### ~ [3] Extract sequences and perform GABLAM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not os.path.exists(gfile):
             seqlist = rje_seq.SeqList(self.log,self.cmd_list+['seqin=%s' % acc,'fasdb=MC58_6RF.fas','seqout=MC58_6RF_Hits.fas','autoload=T','accnr=F','seqnr=F'])
             seqlist.info['Name'] = 'MC58_6RF_Hits.fas'
             seqlist.saveFasta()
             gablam.GABLAM(self.log,self.cmd_list+['seqin=MC58_6RF_Hits.fas','searchdb=MC58_1.fas','qryacc=F']).gablam()
         ### ~ [4] Read in GABLAM and ID Hits without genomic homology ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         gdata = rje.dataDict(self,gfile,['Qry'],['HitNum'])
         zeros = []
         for hit in gdata:
             if string.atoi(gdata[hit]['HitNum']) == 0: zeros.append(hit)
         zeros = rje.sortUnique(zeros,False)
         open('6rf_zeros.acc','w').write(string.join(zeros,'\n'))
         self.printLog('#ZERO','%d 6RF hits with 0 BLAST hits to MC58_1' % len(zeros))
         ufile = 'MC58_6RF_Zeros.vs.embl_bacteria.hitsum.tdt'
         if not os.path.exists(ufile):
             seqlist = rje_seq.SeqList(self.log,self.cmd_list+['seqin=6rf_zeros.acc','fasdb=MC58_6RF.fas','seqout=MC58_6RF_Zeros.fas','autoload=T','accnr=F','seqnr=F'])
             seqlist.info['Name'] = 'MC58_6RF_Zeros.fas'
             seqlist.saveFasta()
             gablam.GABLAM(self.log,self.cmd_list+['seqin=MC58_6RF_Zeros.fas','searchdb=/scratch/Databases/NewDB/TaxaDB/embl_bacteria.fas','qryacc=F']).gablam()
         gdata = rje.dataDict(self,ufile,['Qry'],getheaders=True)
         fdata = rje.dataDict(self,string.replace(ufile,'hitsum','gablam'),['Qry'],['Hit'],lists=True)
         headers = gdata.pop('Headers')
         headers.insert(1,'Sample')
         headers.append('BestHit')
         rje.delimitedFileOutput(self,'MC58_6RF_Zeros.tdt',headers,rje_backup=True)
         for rf in rje.sortKeys(gdata):
             rfcut = string.split(rf,'__')[1]
             gdata[rf]['Sample'] = string.join(rfhits[rfcut],'; ')
             gdata[rf]['Qry'] = rfcut
             try: gdata[rf]['BestHit'] = fdata[rf]['Hit'][0]
             except: gdata[rf]['BestHit']  = '-'
             rje.delimitedFileOutput(self,'MC58_6RF_Zeros.tdt',headers,datadict=gdata[rf])
         
     except: self.errorLog(rje_zen.Zen().wisdom())
     self.printLog('#ZEN',rje_zen.Zen().wisdom())
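A minimal standalone sketch of the header rewriting in step [1] above: pipe-delimited FASTA names are renamed to a 6rf_NEIME__ or ref_NEIME__ identifier depending on whether they contain three or five fields, and anything else raises ValueError. The two example headers are invented; the real run reads *.fasta files from disk.

def rename_fasta_header(name, desc=''):
    '''Return a rewritten FASTA header line for 3- or 5-field pipe-delimited names.'''
    fields = name.split('|')
    if len(fields) == 3: return '>6rf_NEIME__%s\n' % fields[2]
    if len(fields) == 5: return '>ref_NEIME__%s %s\n' % (fields[3], desc)
    raise ValueError('Unexpected name format: %s' % name)

print(rename_fasta_header('lcl|MC58|ORF00123'))
print(rename_fasta_header('gi|12345|ref|NP_000001.1|', 'hypothetical protein'))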
Example #31
0
 def setupResults(self):    ### Main results setup method.
     '''Main results setup method.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.list['Headers'] = ['Dataset','Query','Fitness','Phenotype','SeqGroup','CovP','CovB','CovW','Price','Ratio']
         rje.delimitedFileOutput(self,self.info['ResFile'],self.list['Headers'],rje_backup=True)
     except: self.errorLog('Problem during %s setupResults().' % self); raise
Example #32
0
    def mapSeq(self,seqlist,blast,search,outputmap=True): ### Performs actual mapping of sequence
        '''
        Performs actual mapping of sequence.
        >> seqlist:SeqList object containing the Sequence object to be mapped
        >> blast:BLAST_Run object to perform BLAST and GABLAM
        >> search:Current BLAST search object for mapping
        >> outputmap:boolean = Whether to output mapping into a file [True]
        << returns shortName() of mapped sequence (or None if none)
        '''
        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            seq = seqlist.getSeq(format='tuple')
            mapseq = self.obj['MapDB']
            hits = blast.db('Hit').indexEntries('Query',search)
            self.printLog('#HITS','%s vs %s = %d hits' % (search,blast.str['DBase'],len(hits)))
            hitseq = {}; hitdata = {}
            for entry in hits:
                hitseq[entry['Hit']] = mapseq.getDictSeq(entry['Hit'],format='tuple')
                hitdata[entry['Hit']] = entry
            resdict = {'Query':search,'Hit':None,'Method':'Failed','Query_Species':rje_sequence.specCodeFromName(seq[0])}
            ### ~ [1] Order Hits and Check Species ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (hits,hitdict) = self.orderHits(seq,hits,hitseq)
            self.debug(hits)
            self.debug(hitdict)
            ### ~ [2] Attempt mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for method in self.list['Mapping']:
                resdict['Hit'] = self.mapHit(seq,hits,hitdict,method.lower())
                if resdict['Hit']:
                    resdict['Method'] = method[:1].upper() + method[1:].lower()
                    break
                elif method == 'gablam' and (len(hits) > 0):
                    resdict['Method'] = 'Rejected'
            self.debug(resdict)
            ### ~[3] Output! ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if resdict['Hit']:  #hitdict[hit]['Data']['ShortName']
                hit = resdict['Hit']['Hit']     # resdict['Hit'] is the BLAST table entry for Hit
                shortname = hitdict[hit]['Data']['ShortName']   # This is just hit!
                self.printLog('#MAP','%s mapped to %s (by %s)' % (string.split(seq[0])[0],shortname,resdict['Method']))
                ## Update Stats ##
                self.debug('')
                resdict['BlastRank'] = hitdata[hit]['Rank']
                for key in hitdict[hit]: resdict[key] = hitdict[hit][key]
                ## Fasta and Redundancy ##
                if shortname in self.list['Mapped']: self.printLog('#MAP','%s already mapped before - not duplicating in %s' % (shortname,self.str['MapFas']))
                else:
                    self.list['Mapped'].append(shortname)
                    if outputmap:
                        open(self.str['MapFas'],'a').write('>%s\n%s\n' % (hitseq[hit][0],hitseq[hit][1]))
                resdict['Hit_Species'] = hitdict[hit]['Data']['SpecCode']
                resdict['Hit'] = shortname
            else:
                ### ~ [2] GREP-based search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
                if 'grep' in self.list['Mapping']:
                    greplist = []; hitseq = ''
                    self.printLog('#GREP','grep %s %s -B 1' % (seq[1],blast.str['DBase']),log=False)
                    for line in os.popen('grep %s %s -B 1' % (seq[1],blast.str['DBase'])).readlines():
                        if line[:1] == '>': greplist.append(string.split(line[1:])[0])
                        elif not hitseq: hitseq = rje.chomp(line)
                    if greplist:
                        shortname = greplist.pop(0)
                        resdict['Hit'] = shortname
                        resdict['Method'] = 'Grep'
                        resdict['Qry_ID'] = '100.0'
                        resdict['Qry_Len'] = len(seq[1])
                        resdict['Hit_Len'] = len(hitseq)
                        resdict['Hit_ID'] = 100.0 * len(hitseq) / len(seq[1])
                        try: resdict['Hit_Species'] = string.split(shortname,'_')[1]
                        except: pass
                        if shortname in self.list['Mapped']:
                            self.printLog('#MAP','%s already mapped before - not duplicating in %s' % (shortname,self.str['MapFas']))
                        else:
                            self.list['Mapped'].append(shortname)
                            if outputmap: open(self.str['MapFas'],'a').write('>%s\n%s\n' % (shortname,hitseq))
                    for extra in greplist: self.printLog('#GREP','Warning! Query "%s" also hit "%s" with grep!' % (string.split(seq[0])[0],extra))
                if not resdict['Hit'] and self.bool['Combine']:
                    ## Fasta and Redundancy ##
                    shortname = string.split(seq[0])[0]
                    if shortname in self.list['Mapped']:
                        self.printLog('#FAS','%s already in output - not duplicating in %s' % (shortname,self.str['MapFas']))
                    else:
                        self.list['Mapped'].append(shortname)
                        if outputmap:
                            open(self.str['MapFas'],'a').write('>%s\n%s\n' % (seq[0],seq[1]))
                elif outputmap:
                    open(self.str['MissFas'],'a').write('>%s\n%s\n' % (seq[0],seq[1]))
                self.printLog('#MISS','%s mapping %s' % (resdict['Query'],resdict['Method']))
            if outputmap:
                rje.delimitedFileOutput(self,self.str['MapRes'],self.list['Headers'],rje.getDelimit(self.cmd_list),resdict)
            return resdict['Hit']

        except:
            self.errorLog('Fudgesticks! SeqMapper.mapSeq(%s) has died!' % seq[0],quitchoice=True)
            return False
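A minimal standalone sketch of the grep-style fallback in mapSeq() above: when BLAST-based mapping fails, the query sequence is searched for verbatim in the database and the first containing record is taken as the mapping, with any further matches only reported as warnings. Pure Python over an in-memory dictionary instead of os.popen('grep ...'); the sequences shown are invented.

def grep_map(query_seq, fasta_dict):
    '''Return names of database sequences that contain query_seq verbatim.'''
    return [name for name, dbseq in fasta_dict.items() if query_seq in dbseq]

db = {'P1_HUMAN': 'MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ',
      'P2_HUMAN': 'MSTNPKPQRKTKRNTNRRPQDVK'}
print(grep_map('AKQRQISFVK', db))  # ['P1_HUMAN']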
Example #33
0
 def uniFake(self,seqs=[],store=False):  ### Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs.
     '''Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs given.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         unifake = string.split(string.join(self.list['UniFake']).lower())
         seqlist = self.obj['SeqList']
         if seqs: seqlist.seq = seqs
         else: seqs = seqlist.seq
         (sx,seqnum) = (0,seqlist.seqNum())
         ## ~ [1b] Setup UniProt object and output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         uniprot = rje_uniprot.UniProt(self.log,self.cmd_list)   # UniProt object for saving data
         if self.info['DatOut'].lower() in ['','none']: self.info['DatOut'] = rje.baseFile(seqlist.info['Name']) + '.dat'
         datfile = self.info['DatOut']
         if os.path.exists(datfile): rje.backup(self,datfile)
         if store: seqlist.obj['UniProt'] = uniprot
         ## ~ [1c] Setup RJE_HMM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if 'pfam' in unifake:
             hmm = rje_hmm.HMMRun(self.log,self.cmd_list+['force=T'])
             hmmfile = '%s.pfam.tdt' % rje.baseFile(datfile)
             if os.path.exists(hmmfile): rje.backup(self,hmmfile)
             hmm.list['HMM'] = [self.info['PFam']]
             hmm.opt['HMMPFam'] = True
         else: hmm = None
         ## ~ [1d] Setup RJE_TM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if 'signalp' in unifake: tm = rje_tm.TM(self.log,self.cmd_list)
         else: tm = None
         ### ~ [2] ~ Perform UniFake processing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for seq in seqs:
             sx += 1
             name = seq.shortName()                    
             self.printLog('#SEQ','Processing %s (%s aa) %s...' % (seq.shortName(),rje.integerString(seq.aaLen()),seq.info['Description'][:50]))
             try:
                 ## ~ [2a] ~ Basic data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 utmp = 'tmp%s.%s' % (rje.randomString(5),seq.info['AccNum'])
                 open('%s.fas' % utmp,'w').write('>%s\n%s\n' % (seq.shortName(),seq.info['Sequence']))
                 udata = {'CC':['-!- Features generated using unifake.py'],'AC':[]}
                 if seq.info['SpecCode'] in ['Unknown','UNK']: seq.info['SpecCode'] = self.info['SPCode']
                 #x#elif seq.info['Species'] != 'None': udata['OS'] = [seq.info['Species']]     #!# Check how well this works. Add spectable? #!#
                 ## ~ [2b] ~ Aliases ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if self.opt['EnsDat'] and rje.matchExp('\[acc:(\S+) pep:(\S+) gene:(\S+)\]',seq.info['Name']):
                     details = rje.matchExp('\[acc:(\S+) pep:(\S+) gene:(\S+)\]',seq.info['Name'])
                     self.addAlias(seq.info['AccNum'],details[0])
                     self.addAlias(seq.info['AccNum'],details[1])
                     self.addAlias(seq.info['AccNum'],details[2])
                     udata['GN'] = [details[2]]
                 for id in [seq.shortName(),seq.info['AccNum']]:
                     if id in self.dict['Aliases']: udata['AC'].append('%s;' % string.join(self.dict['Aliases'][id],'; '))
                 ## ~ [2c] ~ Features ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 ft = []     # List of features for sequence
                 for id in [seq.shortName(),seq.info['AccNum'],seq.info['ID']]:
                     if id in self.dict['Features']: ft += self.dict['Features'][id]                        
                 ## ~ [2d] IUPRED disorder prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if 'disorder' in self.list['UniFake']:
                     try:
                         seq.disorder()
                         dis = seq.obj['Disorder']
                         for disorder in seq.obj['Disorder'].list['RegionDisorder']:
                             ft.append({'Type':'DISORDER','Desc':'Predicted disorder: %s' % seq.obj['Disorder'].info['Disorder'],'Start':disorder[0],'End':disorder[1]})
                             if dis.info['Disorder'].lower() == 'iupred': ft[-1]['Desc'] = '%s > %.2f' % (ft[-1]['Desc'],dis.stat['IUCut'])
                         for fold in seq.obj['Disorder'].list['RegionFold']:
                             ft.append({'Type':'ORDER','Desc':'Predicted order: %s' % seq.obj['Disorder'].info['Disorder'],'Start':fold[0],'End':fold[1]})
                             if dis.info['Disorder'].lower() == 'iupred': ft[-1]['Desc'] = '%s <= %.2f' % (ft[-1]['Desc'],dis.stat['IUCut'])
                     except: self.log.errorLog('UniFake disorder problem for %s.' % name)
                 ## ~ [2e] PFam HMM domain prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if hmm:
                     try:
                         hmm.setInfo({'SearchDB':'%s.fas' % utmp,'HMMOut':'%s.hmm.out' % utmp})      # This will be made for each sequence                    
                         hmm.search = []
                         hmm.list['HMMRes'] = [hmm.hmmSearch(self.info['PFam'],outfile=hmm.info['HMMOut'])]   # Used in hmmTable
                         hmm.hmmTable(outfile=hmmfile,append=True)
                         if 'disorder' in self.list['UniFake']: disorder = seq.obj['Disorder'].list['ResidueDisorder']          # individual (IUPRed) residue results
                         else: disorder = []
                         if hmm.search: udata['CC'].append('PFam: HMMer PFam search vs %s (Modified %s)' % (self.info['PFam'],time.ctime(os.path.getmtime(self.info['PFam']))))
                         else:
                             udata['CC'].append('-!- ERROR: PFam HMMer Search failure!')
                             out = {'Type':'!ERROR!','Name':name}
                             rje.delimitedFileOutput(self,hmmfile,['Type','Name','Start','End','Eval','Score'],datadict=out)
                         for search in hmm.search:
                             for hit in search.hit:
                                 for aln in hit.aln:
                                     pfamft = {'Start':aln.stat['SbjStart'],'End':aln.stat['SbjEnd'],'Type':'PFAM',
                                                'Desc':'%s PFam HMM Eval: %.2e; Score: %.1f' % (search.info['Name'],aln.stat['Expect'],aln.stat['BitScore'])}
                                     if disorder:
                                         region = disorder[aln.stat['SbjStart']-1:aln.stat['SbjEnd']]
                                         hmmdisorder = float(sum(region)) / len(region)
                                         pfamft['Desc'] = '%s; IUPRed: %.2f' % (pfamft['Desc'],hmmdisorder)
                                         if hmmdisorder < self.stat['DisDom']: pfamft['Type'] = 'DOMAIN'
                                     ft.append(pfamft)
                     except: self.log.errorLog('UniFake PFam HMM problem for %s.' % name)                  
                 ## ~ [2f] TMHMM transmembrane topology prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if 'tmhmm' in unifake:
                     try:
                         tmdat = os.popen('%s %s.fas -short' % (self.info['TMHMM'],utmp)).readlines()
                         domlist = rje_tm.domainList(rje_tm.parseTMHMM(tmdat[0]))
                         for tmdom in domlist:
                             ft.append(tmdom)
                             ft[-1]['Desc'] = 'TMHMM topology prediction'
                             ft[-1]['Start'] = string.atoi(ft[-1]['Start'])
                             ft[-1]['End'] = string.atoi(ft[-1]['End'])
                         if len(domlist) > 1: udata['CC'].append('TMHMM: %d TM domains; N-Term %s' % ((len(domlist)-1)/2,domlist[0]['Type']))
                         else: udata['CC'].append('TMHMM: 0 TM domains')
                     except: self.log.errorLog('UniFake TMHMM problem for %s.' % name)
                 ## ~ [2g] SIGNALP signal peptide prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if 'signalp' in unifake:
                     try:
                         os.system('%s -f short -t euk %s.fas > %s.signalp' % (self.info['SignalP'],utmp,utmp))
                         tm.signalp = {}
                         tm.parseSignalP('%s.signalp' % utmp)
                         sigp = tm.signalp.pop(seq.shortName())
                         cpos = 0
                         if sigp['nn_ymax?'] == 'Y':
                             cpos = string.atoi(sigp['nn_ymaxpos'])
                             desc = 'SignalP NN prediction'
                         if sigp['hmm_cmax?'] == 'Y':
                             hmm_c = string.atoi(sigp['hmm_cmaxpos'])
                             if cpos == 0:
                                 cpos = hmm_c
                                 desc = 'SignalP HMM prediction'
                             else:
                                 if hmm_c < cpos:
                                     cpos = hmm_c
                                     desc = 'SignalP HMM prediction (NN also Y)'
                                 else: desc += ' (HMM also Y)'
                         if cpos > 0: ft.append({'Type':'SIGNALP','Desc':desc,'Start':1,'End':cpos})
                     except: self.log.errorLog('UniFake SignalP problem for %s.' % name)
                 ## ~ [2h] Convert to UniProt and save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 self.addRealUniProt(seq,udata,ft)
                 self.deBug(ft)
                 if not store: uniprot.list['Entry'] = []
                 if uniprot.addFromSeq(seq,data=udata,ft=ft):    ### Converts into UniProtEntry object 
                     if not store: uniprot.saveUniProt(datfile,append=True)
                     #x#open(self.info['DatPickup'],'a').write('%s\n' % seq.shortName())
             ## ~ [2i] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             except: self.log.errorLog('Problem during UniFake(%s)' % name)
             for tmp in glob.glob('%s*' % utmp): os.unlink(tmp)
             self.printLog('#UNIFAKE','|---------- %s run <<<|>>> %s to go -----------|' % (rje.integerString(sx),rje.integerString(seqnum-sx)),log=False)
         if store: uniprot.saveUniProt(datfile,append=False)
         if self.opt['CleanUp']:
             for tmp in glob.glob('TMHMM*'):
                 if os.path.isdir(tmp): os.rmdir(tmp)            
     except: self.errorLog('Oh, the shame of it! Trouble during UniFake.uniFake()')
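A minimal standalone sketch of the disorder handling in step [2d] above: per-residue scores are thresholded into contiguous DISORDER/ORDER features with 1-based positions, as done for the IUPred results. The score list and the 0.5 cutoff are illustrative; this is not the rje_disorder interface.

def disorder_features(scores, cutoff=0.5):
    '''Return feature dicts for runs of residues above (DISORDER) or at/below (ORDER) the cutoff.'''
    ft = []
    start = 0
    for i in range(1, len(scores) + 1):
        if i == len(scores) or (scores[i] > cutoff) != (scores[start] > cutoff):
            ftype = 'DISORDER' if scores[start] > cutoff else 'ORDER'
            ft.append({'Type': ftype, 'Start': start + 1, 'End': i,
                       'Desc': 'Predicted %s (cutoff %.2f)' % (ftype.lower(), cutoff)})
            start = i
    return ft

print(disorder_features([0.2, 0.3, 0.8, 0.9, 0.7, 0.1]))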
Example #34
0
    def codons(self):  ### Main codons analysis method
        '''Main codons analysis method.'''
        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            flybase = rje.makePath('/scratch/Databases/NewDB/FlyBase/Fasta/')
            scmd = ['accnr=F','seqnr=F','gnspacc=F']
            cds = rje_seq.SeqList(self.log, self.cmd_list+['seqin=%sdmel-all-CDS-r5.5.fasta' % flybase]+scmd)
            gcode = rje_sequence.genetic_code

            ### ~ [1] ~ Make codon frequency tables (a) Observed, (b) Based on NTFreq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            nts = ['A','C','G','T']
            ntfreq = cds.aaFreq(alphabet=nts)
            codons = []     # List of codons
            obs_cfreq = {}  # Observed codon frequencies
            nts_cfreq = {}  # Codon frequencies from NT frequencies
            obs_tfreq = {}  # Observed triplet frequencies
            nts_tfreq = {}  # Predicted triplet frequencies from NT frequencies
            ocd_tfreq = {}  # Predicted triplet frequencies from observed codon frequencies
            ncd_tfreq = {}  # Predicted triplet frequencies from nt-predicted codon frequencies
            ## ~ [1a] ~ Setup dictionaries using nt freqs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            for n1 in nts:
                for n2 in nts:
                    for n3 in nts:
                        cod = '%s%s%s' % (n1,n2,n3)
                        codons.append(cod)
                        aa = gcode[string.replace(cod,'T','U')]
                        if aa not in obs_cfreq: obs_cfreq[aa] = {}
                        if aa not in nts_cfreq: nts_cfreq[aa] = {}
                        obs_cfreq[aa][cod] = 0.0
                        nts_cfreq[aa][cod] = ntfreq[n1] * ntfreq[n2] * ntfreq[n3]
                        obs_tfreq[cod] = 0.0
                        nts_tfreq[cod] = ntfreq[n1] * ntfreq[n2] * ntfreq[n3]
                        ocd_tfreq[cod] = 0.0
                        ncd_tfreq[cod] = 0.0
            nts_tfreq = rje.dictFreq(nts_tfreq,total=False)                                 # Normalise triplet freq.
            for aa in nts_cfreq: nts_cfreq[aa] = rje.dictFreq(nts_cfreq[aa],total=False)    # Normalise codon freq.
            self.log.printLog('#FREQ','Frequency dictionaries set up.')
            ## ~ [1b] ~ Observed codon freq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            (sx,stot) = (0.0,cds.seqNum())
            for seq in cds.seq[0:]:
                self.log.printLog('\r#OBS','Calculating observed codon frequencies: %.1f%%' % (sx/stot),newline=False,log=False)
                sx += 100.0
                try: (id,scaffold,pos,name,glen,parent) = rje.matchExp('^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);.+parent=(\S+),\S+;',seq.info['Name'])
                except:
                    self.log.errorLog(seq.info['Name'])
                    raise
                try: exons = rje.matchExp('^complement\((\d+\..*\.\d+)\)',pos)[0]
                except:
                    try: exons = rje.matchExp('^join\((\d+\..*\.\d+)\)',pos)[0]
                    except: exons = rje.matchExp('^(\d+\.\.\d+)',pos)[0]
                self.deBug(exons)
                exons = string.split(exons,',')
                elen = []
                try:
                    for exon in exons:
                        (start,end) = string.split(exon,'..')
                        elen.append(string.atoi(end) - string.atoi(start) + 1)
                except:
                    self.log.errorLog(id)
                    cds.seq.remove(seq)
                    continue
                        
                if pos[:4] == 'comp': elen.reverse()
                seq.list['ExonLen'] = elen
                self.deBug(elen)
                if sum(elen) != seq.aaLen(): self.log.errorLog('%s exon length error' % id,printerror=False)
                if seq.aaLen()/3 != seq.aaLen()/3.0:
                    self.log.errorLog('%s not a multiple of 3nt long!' % id,printerror=False)
                    cds.seq.remove(seq)
                    continue
                #!# Add use exon option - single full-length exon if false (mature mRNA) #!#
                sequence = seq.info['Sequence'][0:]
                if string.count(sequence,'N') > 0:
                    self.log.errorLog('%s has 1+ Ns!' % id,printerror=False)
                    cds.seq.remove(seq)
                    continue
                while sequence:
                    cod = sequence[:3]
                    sequence = sequence[3:]
                    aa = gcode[string.replace(cod,'T','U')]
                    obs_cfreq[aa][cod] += 1
            for aa in obs_cfreq: obs_cfreq[aa] = rje.dictFreq(obs_cfreq[aa],total=False)    # Normalise codon freq.
            self.log.printLog('\r#OBS','Calculating observed codon frequencies complete.')

            ### ~ [2] ~ Generate Triplet freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (sx,stot) = (0.0,cds.seqNum())
            for seq in cds.seq:
                self.log.printLog('\r#TRIP','Calculating triplet frequencies: %.1f%%' % (sx/stot),newline=False,log=False)
                sx += 100.0
                elen = seq.list['ExonLen'] 
                sequence = seq.info['Sequence'][0:]
                aa = ''
                cod = ''
                ax = 0      # Measure sequence length processed for exon boundary checks
                while sequence:
                    prevcod = cod
                    cod = sequence[:3]
                    prevaa = aa
                    sequence = sequence[3:]
                    aa = gcode[string.replace(cod,'T','U')]
                    ## ~ [2a] ~ Predicted Triplet Freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    for cod2 in obs_cfreq[aa]:
                        if elen[0] > ax + 3:    # Exon boundary beyond this codon
                            ocd_tfreq[cod2] += obs_cfreq[aa][cod2]
                            ncd_tfreq[cod2] += nts_cfreq[aa][cod2]
                        if prevaa:              # Look at overlap with previous codon
                            for cod1 in obs_cfreq[prevaa]:
                                for i in range(1,3):
                                    if elen[0] > ax + i:    # Exon boundary beyond overlap
                                        acod = cod1[i:] + cod2[:i]
                                        ocd_tfreq[acod] += (obs_cfreq[prevaa][cod1] * obs_cfreq[aa][cod2])
                                        ncd_tfreq[acod] += (nts_cfreq[prevaa][cod1] * nts_cfreq[aa][cod2])
                    ## ~ [2b] ~ Observed Triplet Freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    if elen[0] > ax + 3:    # Exon boundary beyond this codon
                        obs_tfreq[cod] += 1
                    if prevcod:              # Look at overlap with previous codon
                        for i in range(1,3):
                            if elen[0] > ax + i:    # Exon boundary beyond overlap
                                acod = prevcod[i:] + cod[:i]
                                obs_tfreq[acod] += 1
                    # Check exons #
                    ax += 3
                    if ax >= elen[0]: ax -= elen.pop(0)
            obs_tfreq = rje.dictFreq(obs_tfreq,total=False)
            ocd_tfreq = rje.dictFreq(ocd_tfreq,total=False)
            ncd_tfreq = rje.dictFreq(ncd_tfreq,total=False)    
            self.log.printLog('\r#TRIP','Calculating triplet frequencies complete.')

            ### ~ [3] ~ Output results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            headers = ['Triplet','AA','Degen','Obs_Codon','NT_Codon','Obs_Trip','NT_Trip','ObCod_Trip','NTCod_Trip']
            tfile = 'quad_triplet.tdt'
            rje.delimitedFileOutput(self,tfile,headers,rje_backup=True)
            for cod in codons:
                aa = gcode[string.replace(cod,'T','U')]
                datadict = {'Triplet':cod,'AA':aa,'Degen':len(obs_cfreq[aa]),'Obs_Codon':obs_cfreq[aa][cod],
                            'NT_Codon':nts_cfreq[aa][cod],'Obs_Trip':obs_tfreq[cod],'NT_Trip':nts_tfreq[cod],
                            'ObCod_Trip':ocd_tfreq[cod],'NTCod_Trip':ncd_tfreq[cod]}
                rje.delimitedFileOutput(self,tfile,headers,datadict=datadict)
            self.log.printLog('#OUT','Triplet & codon data output to %s' % tfile)
        except: self.log.errorLog(rje_zen.Zen().wisdom())
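A minimal standalone sketch of the observed codon counting in step [1b] above: a CDS is read in non-overlapping triplets and the counts are normalised to frequencies. Grouping by amino acid, as obs_cfreq does, would additionally require a genetic code table, which is omitted here; the test CDS is invented.

from collections import Counter

def codon_freq(cds):
    '''Return {codon: frequency} over non-overlapping triplets of a CDS.'''
    codons = [cds[i:i + 3] for i in range(0, len(cds) - len(cds) % 3, 3)]
    total = float(len(codons))
    return dict((cod, n / total) for cod, n in Counter(codons).items())

print(codon_freq('ATGGCTGCAGCTTAA'))  # e.g. {'ATG': 0.2, 'GCT': 0.4, 'GCA': 0.2, 'TAA': 0.2}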
Example #35
0
    def codons(self):  ### Main codons analysis method
        '''Main codons analysis method.'''
        try:  ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            flybase = rje.makePath('/scratch/Databases/NewDB/FlyBase/Fasta/')
            scmd = ['accnr=F', 'seqnr=F', 'gnspacc=F']
            cds = rje_seq.SeqList(
                self.log, self.cmd_list +
                ['seqin=%sdmel-all-CDS-r5.5.fasta' % flybase] + scmd)
            gcode = rje_sequence.genetic_code

            ### ~ [1] ~ Make codon frequency tables (a) Observed, (b) Based on NTFreq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            nts = ['A', 'C', 'G', 'T']
            ntfreq = cds.aaFreq(alphabet=nts)
            codons = []  # List of codons
            obs_cfreq = {}  # Observed codon frequencies
            nts_cfreq = {}  # Codon frequencies from NT frequencies
            obs_tfreq = {}  # Observed triplet frequencies
            nts_tfreq = {}  # Predicted triplet frequencies from NT frequencies
            ocd_tfreq = {}  # Predicted triplet frequencies from observed codon frequencies
            ncd_tfreq = {}  # Predicted triplet frequencies from nt-predicted codon frequencies
            ## ~ [1a] ~ Setup dictionaries using nt freqs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            for n1 in nts:
                for n2 in nts:
                    for n3 in nts:
                        cod = '%s%s%s' % (n1, n2, n3)
                        codons.append(cod)
                        aa = gcode[string.replace(cod, 'T', 'U')]
                        if aa not in obs_cfreq: obs_cfreq[aa] = {}
                        if aa not in nts_cfreq: nts_cfreq[aa] = {}
                        obs_cfreq[aa][cod] = 0.0
                        nts_cfreq[aa][
                            cod] = ntfreq[n1] * ntfreq[n2] * ntfreq[n3]
                        obs_tfreq[cod] = 0.0
                        nts_tfreq[cod] = ntfreq[n1] * ntfreq[n2] * ntfreq[n3]
                        ocd_tfreq[cod] = 0.0
                        ncd_tfreq[cod] = 0.0
            nts_tfreq = rje.dictFreq(nts_tfreq,
                                     total=False)  # Normalise triplet freq.
            for aa in nts_cfreq:
                nts_cfreq[aa] = rje.dictFreq(
                    nts_cfreq[aa], total=False)  # Normalise codon freq.
            self.log.printLog('#FREQ', 'Frequency dictionaries set up.')
            ## ~ [1b] ~ Observed codon freq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            (sx, stot) = (0.0, cds.seqNum())
            for seq in cds.seq[0:]:
                self.log.printLog('\r#OBS', 'Calculating observed codon frequencies: %.1f%%' % (sx / stot), newline=False, log=False)
                sx += 100.0
                try:
                    (id, scaffold, pos, name, glen, parent) = rje.matchExp(
                        '^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);.+parent=(\S+),\S+;',
                        seq.info['Name'])
                except:
                    self.log.errorLog(seq.info['Name'])
                    raise
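                # The FlyBase loc= field may be a complement(...), a join(...) of exons, or a simple start..end range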
                try:
                    exons = rje.matchExp('^complement\((\d+\..*\.\d+)\)',
                                         pos)[0]
                except:
                    try:
                        exons = rje.matchExp('^join\((\d+\..*\.\d+)\)', pos)[0]
                    except:
                        exons = rje.matchExp('^(\d+\.\.\d+)', pos)[0]
                self.deBug(exons)
                exons = string.split(exons, ',')
                elen = []
                try:
                    for exon in exons:
                        (start, end) = string.split(exon, '..')
                        elen.append(string.atoi(end) - string.atoi(start) + 1)
                except:
                    self.log.errorLog(id)
                    cds.seq.remove(seq)
                    continue

                if pos[:4] == 'comp': elen.reverse()
                seq.list['ExonLen'] = elen
                self.deBug(elen)
                if sum(elen) != seq.aaLen():
                    self.log.errorLog('%s exon length error' % id, printerror=False)
                if seq.aaLen() % 3 != 0:    # CDS length must be a multiple of 3
                    self.log.errorLog('%s not a multiple of 3nt long!' % id, printerror=False)
                    cds.seq.remove(seq)
                    continue
                #!# Add use exon option - single full-length exon if false (mature mRNA) #!#
                sequence = seq.info['Sequence'][0:]
                if string.count(sequence, 'N') > 0:
                    self.log.errorLog('%s has 1+ Ns!' % id, printerror=False)
                    cds.seq.remove(seq)
                    continue
                while sequence:
                    cod = sequence[:3]
                    sequence = sequence[3:]
                    aa = gcode[string.replace(cod, 'T', 'U')]
                    obs_cfreq[aa][cod] += 1
            for aa in obs_cfreq:
                obs_cfreq[aa] = rje.dictFreq(obs_cfreq[aa], total=False)  # Normalise codon freq.
            self.log.printLog('\r#OBS', 'Calculating observed codon frequencies complete.')

            ### ~ [2] ~ Generate Triplet freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (sx, stot) = (0.0, cds.seqNum())
            for seq in cds.seq:
                self.log.printLog('\r#TRIP', 'Calculating triplet frequencies: %.1f%%' % (sx / stot), newline=False, log=False)
                sx += 100.0
                elen = seq.list['ExonLen']
                sequence = seq.info['Sequence'][0:]
                aa = ''
                cod = ''
                ax = 0  # Measure sequence length processed for exon boundary checks
                while sequence:
                    prevcod = cod
                    cod = sequence[:3]
                    prevaa = aa
                    sequence = sequence[3:]
                    aa = gcode[string.replace(cod, 'T', 'U')]
                    ## ~ [2a] ~ Predicted Triplet Freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
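                    # Add expected frequencies for every synonymous codon here, plus the two frame-shifted
                    # triplets overlapping the previous codon, but only where the current exon extends beyond them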
                    for cod2 in obs_cfreq[aa]:
                        if elen[0] > ax + 3:  # Exon boundary beyond this codon
                            ocd_tfreq[cod2] += obs_cfreq[aa][cod2]
                            ncd_tfreq[cod2] += nts_cfreq[aa][cod2]
                        if prevaa:  # Look at overlap with previous codon
                            for cod1 in obs_cfreq[prevaa]:
                                for i in range(1, 3):
                                    if elen[0] > ax + i:  # Exon boundary beyond overlap
                                        acod = cod1[i:] + cod2[:i]
                                        ocd_tfreq[acod] += (
                                            obs_cfreq[prevaa][cod1] *
                                            obs_cfreq[aa][cod2])
                                        ncd_tfreq[acod] += (
                                            nts_cfreq[prevaa][cod1] *
                                            nts_cfreq[aa][cod2])
                    ## ~ [2b] ~ Observed Triplet Freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
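                    # Count the observed in-frame codon and the two triplets overlapping the previous codon,
                    # again only where the current exon extends beyond them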
                    if elen[0] > ax + 3:  # Exon boundary beyond this codon
                        obs_tfreq[cod] += 1
                    if prevcod:  # Look at overlap with previous codon
                        for i in range(1, 3):
                            if elen[0] > ax + i:  # Exon boundary beyond overlap
                                acod = prevcod[i:] + cod[:i]
                                obs_tfreq[acod] += 1
                    # Check exons #
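                    # Advance the processed length; once the current exon is fully consumed, drop it and
                    # carry any remainder into the next exon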
                    ax += 3
                    if ax >= elen[0]: ax -= elen.pop(0)
            obs_tfreq = rje.dictFreq(obs_tfreq, total=False)
            ocd_tfreq = rje.dictFreq(ocd_tfreq, total=False)
            ncd_tfreq = rje.dictFreq(ncd_tfreq, total=False)
            self.log.printLog('\r#TRIP',
                              'Calculating triplet frequencies complete.')

            ### ~ [3] ~ Output results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            headers = [
                'Triplet', 'AA', 'Degen', 'Obs_Codon', 'NT_Codon', 'Obs_Trip',
                'NT_Trip', 'ObCod_Trip', 'NTCod_Trip'
            ]
            tfile = 'quad_triplet.tdt'
            rje.delimitedFileOutput(self, tfile, headers, rje_backup=True)
            for cod in codons:
                aa = gcode[string.replace(cod, 'T', 'U')]
                datadict = {
                    'Triplet': cod,
                    'AA': aa,
                    'Degen': len(obs_cfreq[aa]),
                    'Obs_Codon': obs_cfreq[aa][cod],
                    'NT_Codon': nts_cfreq[aa][cod],
                    'Obs_Trip': obs_tfreq[cod],
                    'NT_Trip': nts_tfreq[cod],
                    'ObCod_Trip': ocd_tfreq[cod],
                    'NTCod_Trip': ncd_tfreq[cod]
                }
                rje.delimitedFileOutput(self, tfile, headers, datadict=datadict)
            self.log.printLog('#OUT', 'Triplet & codon data output to %s' % tfile)
        except:
            self.log.errorLog(rje_zen.Zen().wisdom())
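
Note (illustrative aside, not part of the original module): the NT-based expectations in section [1] above come from multiplying the three mononucleotide frequencies of each codon and then renormalising within each amino acid's synonymous codon family (what rje.dictFreq(..., total=False) does per amino acid). A minimal standalone sketch of that calculation, using made-up nt_freq values and a truncated CODON_TABLE as stand-ins:

# Minimal sketch of the zero-order expected codon usage calculation (illustrative only).
# The nt_freq values and truncated CODON_TABLE below are stand-ins, not real data.
nt_freq = {'A': 0.30, 'C': 0.20, 'G': 0.22, 'T': 0.28}
CODON_TABLE = {'TTT': 'F', 'TTC': 'F',
               'AAA': 'K', 'AAG': 'K',
               'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'}

def expected_codon_freq(nt_freq, codon_table):
    '''Expected codon usage per amino acid under a zero-order (independent nucleotide) model.'''
    expected = {}                                       # {aa: {codon: freq}}
    for codon, aa in codon_table.items():
        raw = nt_freq[codon[0]] * nt_freq[codon[1]] * nt_freq[codon[2]]
        expected.setdefault(aa, {})[codon] = raw
    for codons in expected.values():                    # Renormalise within each synonymous family
        total = sum(codons.values())
        for codon in codons: codons[codon] /= total
    return expected

# expected_codon_freq(nt_freq, CODON_TABLE)['K'] -> {'AAA': ~0.577, 'AAG': ~0.423}
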
Example #36
    def mapPhosByBLAST(self,fasfile):   ### BLAST sequences against phosphoDB, align hits & mark sites (ID & Homology)
        '''BLAST sequences against phosphoDB, align hits and mark phosphosites (ID & Homology).'''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [1a] Setup fasfile ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            scmd = self.cmd_list + ['seqin=%s' % fasfile,'autoload=T','autofilter=F']
            qseqlist = rje_seq.SeqList(self.log,scmd)
            qdict = qseqlist.seqNameDic()
            ## ~ [1b] Setup results files/directories ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            basefile = rje.baseFile(fasfile)
            if self.info['PhosRes'].lower() in ['','none']: self.info['PhosRes'] = '%s.phosres.tdt' % basefile
            headers = ['Name','Pos','AA','PELM','PELMPos','Evidence']
            delimit = rje.getDelimit(self.cmd_list,rje.delimitFromExt(filename=self.info['PhosRes']))
            rje.delimitedFileOutput(self,self.info['PhosRes'],headers,delimit,rje_backup=True)
            ppath = rje.makePath('PhosALN')
            rje.mkDir(self,ppath)
            ## ~ [1c] Setup BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            pblast = rje_blast.BLASTRun(self.log,self.cmd_list+['formatdb=F'])
            pblast.setInfo({'Name':'%s.p.blast' % rje.baseFile(fasfile),'DBase':self.info['PELMFas'],'InFile':fasfile})
            pblast.setStat({'HitAln':pblast.stat['OneLine']})
            pblast.opt['Complexity Filter'] = False
            pblast.formatDB(force=False)
            ## ~ [1d] Setup GABLAM Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            gkey = 'GABLAMO ID' #x# % self.info['GABLAMO Key']
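            # Convert fractional identity/homology cutoffs to percentages and floor them at zero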
            for g in ['ID','Hom']:
                if self.stat['%sSim' % g] < 1.0: self.stat['%sSim' % g] *= 100.0
                self.stat['%sSim' % g] = max(0.0,self.stat['%sSim' % g])

            ### ~ [2] PhosphoBLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            pblast.blast(use_existing=True,log=True)    # BLAST
            pblast.readBLAST(gablam=True)               # Read in
            while pblast.search:
                ## ~ [2a] Align relevant hits from each BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                search = pblast.search.pop(0)
                qseq = qdict[search.info['Name']]
                idlist = []
                qlen = qseq.aaLen()
                hitdict = search.hitSeq(self.obj['SeqList'])
                aln = rje_seq.SeqList(self.log,self.cmd_list+['autoload=F','autofilter=F'])
                aln.seq = [qseq]
                pdict = {}      # Dictionary of {hseq:[poslist]}
                rdict = {qseq:0}      # Dictionary of {hseq:res}
                for hit in search.hit[0:]:
                    hseq = hitdict[hit]
                    pdict[hseq] = []
                    for pos in rje.sortKeys(self.dict['PhosphoSites'][hseq.info['AccNum']]): pdict[hseq].append(pos)
                    if hit.info['Name'] == search.info['Name']:
                        if qseq.getSequence(case=False,gaps=False) != hseq.getSequence(case=False,gaps=False):
                            self.log.errorLog('Major problem: Search/Hit sequence mismatch for same sequence "%s"' % hit.info['Name'])
                        idlist.append(qseq)
                        pdict[qseq] = pdict.pop(hseq)
                        continue
                    gdict = hit.globalFromLocal(qlen)
                    qvh = float(100 * gdict['Query'][gkey]) / float(qlen)
                    if qvh < self.stat['HomSim']:
                        pdict.pop(hseq)
                        continue
                    aln.seq.append(hseq)
                    if (qseq.sameSpec(hseq) or not self.opt['UseSpec']) and qvh >= self.stat['IDSim']: idlist.append(hseq)
                    rdict[hseq] = 0
                aln.muscleAln()   #x#outfile='%s%s.phosaln.fas' % (ppath,qseq.info['AccNum']))
                aln._addSeq('PhosAln','-' * qseq.seqLen())
                aln.info['Name'] = '%s%s.phosaln.fas' % (ppath,qseq.info['AccNum'])
                ## ~ [2b] Map phosphorylations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                print '>>>\n', aln.seq, pdict.keys(), rdict.keys()
                for a in range(qseq.seqLen()):
                    if qseq.info['Sequence'][a] != '-': rdict[qseq] += 1
                    for hseq in pdict:
                        if hseq.info['Sequence'][a] == '-': continue
                        if hseq != qseq: rdict[hseq] += 1
                        if rdict[hseq] in pdict[hseq] and qseq.info['Sequence'][a] == hseq.info['Sequence'][a]:  # Phosphosite
                            pdata = {'Name':search.info['Name'],'Pos':rdict[qseq],'AA':qseq.info['Sequence'][a],
                                     'PELM':hseq.shortName(),'PELMPos':rdict[hseq],'Evidence':'Hom'}
                            if hseq == qseq: pdata['Evidence'] = 'Self'
                            elif hseq in idlist: pdata['Evidence'] = 'ID'
                            rje.delimitedFileOutput(self,self.info['PhosRes'],headers,delimit,pdata)
                            self.addPhos(aln.seq[-1],a,pdata['Evidence'])
                ## ~ [2c] Add Scansite/NetPhos if made? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                ## ~ [2d] Save alignment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                aln.saveFasta()


            # Align hits for each > X %ID
            # Map phosphosites onto alignment and output #
            
            return
        except: self.log.errorLog('Problem during PhosphoSeq.mapPhosByBLAST')
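
Note (illustrative aside, not part of the original class): the core of step [2b] above is a column walk over the alignment that keeps a running count of ungapped residues for each sequence and transfers known hit phosphosite positions onto the query wherever the aligned residues match. A minimal standalone sketch of that mapping, with made-up inputs in the usage comment:

def map_phosphosites(query_aln, hit_aln, hit_sites):
    '''Map phosphosites from an aligned hit onto an aligned query (sketch, assumed inputs).
    query_aln/hit_aln = aligned sequences of equal length ('-' = gap);
    hit_sites = set of 1-based ungapped hit positions known to be phosphorylated.
    Returns [(query_pos, residue)] where the aligned residues are identical.'''
    mapped = []
    qpos = hpos = 0                         # Running ungapped residue counts (cf. rdict above)
    for qres, hres in zip(query_aln, hit_aln):
        if qres != '-': qpos += 1
        if hres != '-': hpos += 1
        if qres != '-' and hres != '-' and hpos in hit_sites and qres == hres:
            mapped.append((qpos, qres))     # Conserved phosphosite position on the query
    return mapped

# map_phosphosites('MKT-SPR', 'MKTASPR', {5}) -> [(4, 'S')]
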
Example #37
    def run(self):  ### Main run method
        '''Main run method.'''
        try:  ### ~ [1] Reformat Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for fasta in glob.glob('*.fasta'):
                fas = fasta[:-2]    # X.fasta -> X.fas
                if os.path.exists(fas): continue
                sx = 0
                for line in open(fasta, 'r').readlines():
                    if line[:1] == '>':
                        try:
                            (name, desc) = rje.matchExp('^>(\S+) (\S.+)$', line)
                        except:
                            name = rje.matchExp('^>(\S+)', line)[0]; desc = ''   # No description in header
                        if len(string.split(name, '|')) == 3:
                            name = '6rf_NEIME__%s' % string.split(name, '|')[2]
                            open(fas, 'a').write('>%s\n' % name)
                        elif len(string.split(name, '|')) == 5:
                            name = 'ref_NEIME__%s' % string.split(name, '|')[3]
                            open(fas, 'a').write('>%s %s\n' % (name, desc))
                        else:
                            print string.split(name, '|')
                            raise ValueError
                        self.progLog('\r#FAS', 'Processing %s: %s seqs' % (fas, rje.integerString(sx)))
                        sx += 1
                    else:
                        open(fas, 'a').write(line)
                self.printLog('\r#FAS', 'Processed %s: %s seqs from %s' % (fas, rje.integerString(sx), fasta))
                rje_blast.BLASTRun(self.log, self.cmd_list).formatDB(fas, protein=True, force=True)
            ### ~ [2] Read in CSV Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            rfhits = {}  # Dictionary of {hit:['File:hit_num']}
            acc = 'MC58_6RF_Hits.acc'
            open(acc, 'w').close()    # Create/reset the accession output file before appending below
            gfile = 'MC58_6RF_Hits.vs.MC58_1.hitsum.tdt'
            cx = 0
            for csv in glob.glob('MC58_6RF_CSV/*.CSV'):
                cx += 1
                file = os.path.basename(csv)[:-4]
                hits = False
                for line in open(csv, 'r').readlines():
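                    # Hit rows follow the 'prot_hit_num,prot_acc' header line in each CSV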
                    if line.find('prot_hit_num,prot_acc') == 0: hits = True
                    elif hits:
                        data = rje.readDelimit(line, ',')
                        if len(data) < 2: continue
                        [num, name] = data[:2]
                        try:
                            name = string.split(name, '|')[2]
                        except:
                            continue
                        if name not in rfhits:
                            open(acc, 'a').write('6rf_NEIME__%s\n' % name)
                            rfhits[name] = []
                        id = '%s:%s' % (file, num)
                        if id not in rfhits[name]: rfhits[name].append(id)
                        self.progLog('\r#CSV', 'Reading %d CSV files: %s 6RF Hits' % (cx, rje.integerString(len(rfhits))))
            self.printLog('\r#CSV', 'Read %d CSV files: %s 6RF Hits output to %s' % (cx, rje.integerString(len(rfhits)), acc))
            ### ~ [3] Extract sequences and perform GABLAM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if not os.path.exists(gfile):
                seqlist = rje_seq.SeqList(
                    self.log, self.cmd_list + [
                        'seqin=%s' % acc, 'fasdb=MC58_6RF.fas',
                        'seqout=MC58_6RF_Hits.fas', 'autoload=T', 'accnr=F',
                        'seqnr=F'
                    ])
                seqlist.info['Name'] = 'MC58_6RF_Hits.fas'
                seqlist.saveFasta()
                gablam.GABLAM(
                    self.log, self.cmd_list + [
                        'seqin=MC58_6RF_Hits.fas', 'searchdb=MC58_1.fas',
                        'qryacc=F'
                    ]).gablam()
            ### ~ [4] Read in GABLAM and ID Hits without genomic homology ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            gdata = rje.dataDict(self, gfile, ['Qry'], ['HitNum'])
            zeros = []
            for hit in gdata:
                if string.atoi(gdata[hit]['HitNum']) == 0: zeros.append(hit)
            zeros = rje.sortUnique(zeros, False)
            open('6rf_zeros.acc', 'w').write(string.join(zeros, '\n'))
            self.printLog('#ZERO', '%d 6RF hits with 0 BLAST hits to MC58_1' % len(zeros))
            ufile = 'MC58_6RF_Zeros.vs.embl_bacteria.hitsum.tdt'
            if not os.path.exists(ufile):
                seqlist = rje_seq.SeqList(
                    self.log, self.cmd_list + [
                        'seqin=6rf_zeros.acc', 'fasdb=MC58_6RF.fas',
                        'seqout=MC58_6RF_Zeros.fas', 'autoload=T', 'accnr=F',
                        'seqnr=F'
                    ])
                seqlist.info['Name'] = 'MC58_6RF_Zeros.fas'
                seqlist.saveFasta()
                gablam.GABLAM(
                    self.log, self.cmd_list + [
                        'seqin=MC58_6RF_Zeros.fas',
                        'searchdb=/scratch/Databases/NewDB/TaxaDB/embl_bacteria.fas',
                        'qryacc=F'
                    ]).gablam()
            gdata = rje.dataDict(self, ufile, ['Qry'], getheaders=True)
            fdata = rje.dataDict(self,
                                 string.replace(ufile, 'hitsum', 'gablam'),
                                 ['Qry'], ['Hit'],
                                 lists=True)
            headers = gdata.pop('Headers')
            headers.insert(1, 'Sample')
            headers.append('BestHit')
            rje.delimitedFileOutput(self, 'MC58_6RF_Zeros.tdt', headers, rje_backup=True)
            for rf in rje.sortKeys(gdata):
                rfcut = string.split(rf, '__')[1]
                gdata[rf]['Sample'] = string.join(rfhits[rfcut], '; ')
                gdata[rf]['Qry'] = rfcut
                try:
                    gdata[rf]['BestHit'] = fdata[rf]['Hit'][0]
                except:
                    gdata[rf]['BestHit'] = '-'
                rje.delimitedFileOutput(self, 'MC58_6RF_Zeros.tdt', headers, datadict=gdata[rf])

        except:
            self.errorLog(rje_zen.Zen().wisdom())
        self.printLog('#ZEN', rje_zen.Zen().wisdom())
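
Note (illustrative aside, not part of the original run() method): the header rewriting in section [1] keys off the number of pipe-delimited fields in the FASTA identifier; 3-field names become 6-frame ORF entries and drop the description, while 5-field names become reference entries and keep it. A minimal sketch of that rule, with a made-up identifier in the usage comment:

def rename_header(name, desc=''):
    '''Rewrite a pipe-delimited FASTA identifier as in section [1] above (sketch).'''
    fields = name.split('|')
    if len(fields) == 3:                    # Six-frame translation entry: keep accession only
        return '>6rf_NEIME__%s' % fields[2]
    elif len(fields) == 5:                  # Reference protein entry: keep description
        return '>ref_NEIME__%s %s' % (fields[3], desc)
    raise ValueError('Unexpected identifier format: %s' % name)

# rename_header('lcl|contig_1|ORF00042') -> '>6rf_NEIME__ORF00042'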