Example #1
 def readResults(self,
                 clear=True,
                 readaln=False
                 ):  ### Reads results from self.list['HMMRes'] into objects
     '''
     Reads results from self.list['HMMRes'] into objects.
     >> clear:boolean = whether to clear self.search before reading [True]
     >> readaln:boolean = whether to bother reading Alignments into objects [False]
     '''
     try:
         if clear: self.search = []
         for resfile in rje.sortUnique(self.list['HMMRes'], xreplace=False):
             if not os.path.exists(
                     resfile) and self.opt['GZip'] and os.path.exists(
                         '%s.gz' % resfile):
                 os.system('gunzip %s.gz' % resfile)
                 self.printLog('#GUNZIP', 'Gunzipped %s.gz' % resfile)
             if self.opt['HMMPFam']:
                 self.readHMMPFamSearch(resfile, readaln)
             else:
                 self.readHMMSearch(resfile, readaln)
             if self.opt['GZip'] and os.path.exists(resfile):
                 rje.backup(self, '%s.gz' % resfile, unlink=True)
                 os.system('gzip %s' % resfile)
                 self.printLog('#GZIP',
                               '%s gzipped to save space' % resfile)
     except:
         self.log.errorLog('Hmm indeed. rje_hmm.readResults() gone awry!',
                           quitchoice=True)
         return False
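
A note on the gzip handling above: the gunzip-before-read / re-gzip-after pattern can also be done in-process with the standard library instead of os.system calls. A minimal, self-contained sketch, not part of rje_hmm (the helper name and reader callback are hypothetical):

import gzip, os, shutil

def with_gunzipped(resfile, reader):
    """Gunzip resfile.gz to resfile if needed, call reader(resfile), then re-gzip to save space."""
    gzfile = '%s.gz' % resfile
    if not os.path.exists(resfile) and os.path.exists(gzfile):
        with gzip.open(gzfile, 'rb') as zin, open(resfile, 'wb') as out:
            shutil.copyfileobj(zin, out)       # decompress in-process rather than os.system('gunzip ...')
        os.unlink(gzfile)
    try:
        return reader(resfile)
    finally:
        if os.path.exists(resfile):
            with open(resfile, 'rb') as src, gzip.open(gzfile, 'wb') as zout:
                shutil.copyfileobj(src, zout)  # recompress rather than os.system('gzip ...')
            os.unlink(resfile)
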
Example #2
 def dpi(self):  ### Domain-protein interactions
     '''Domain-protein interactions.'''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not self.dict['Domain']: return
         outdir = 'SLiMPID_DPI'
         rje.mkDir(self, outdir)
         dpi = {}  # Dictionary of {domain:[interactors]}
         badname = []
         ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for dom in rje.sortKeys(self.dict['Domain']):
             dpi[dom] = []
             for hub in self.dict['Domain'][dom]:
                 if hub in self.dict['PPI']:
                     dpi[dom] += self.dict['PPI'][
                         hub]  # Add with redundancy
             for spoke in dpi[dom][0:]:
                 if dpi[dom].count(spoke) == 1:
                     dpi[dom].remove(
                         spoke)  # Must have 2+ domain interactions
             for hub in self.dict['Domain'][dom]:
                 if hub not in self.dict['PPI']: continue
                 for spoke in self.dict['PPI'][hub][0:]:
                     if spoke in dpi[dom]:
                         self.dict['PPI'][hub].remove(spoke)
                         if spoke in self.dict['PPI'] and hub in self.dict[
                                 'PPI'][spoke]:
                             self.dict['PPI'][spoke].remove(hub)
             dpi[dom] = rje.sortUnique(dpi[dom], False, False)
             acc = []
             for name in dpi[dom]:
                 if not name: continue
                 if name in self.dict['Seq']:
                     acc.append(self.dict['Seq'][name].info['AccNum'])
                 elif name not in badname:
                     badname.append(name)
             open('%s/%s.dpi.acc' % (outdir, dom),
                  'w').write(string.join(acc, '\n'))
             self.printLog('#DPI',
                           '%s domain => %d interactors' % (dom, len(acc)))
         if badname:
             badname.sort()
             self.printLog(
                 '#BAD', '%d "bad" protein names: %s' %
                 (len(badname), string.join(badname, '; ')))
         ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         hx = len(self.dict['PPI'])
         for hub in rje.sortKeys(self.dict['PPI']):
             if hub and self.dict['PPI'][hub]: continue
             self.dict['PPI'].pop(hub)
             self.printLog('#DPI',
                           'No %s PPI left after DPI removed' % hub,
                           screen=False)
         self.printLog(
             '#PPX', '%s of %s PPI hubs remain after DPI removed' %
             (rje.integerString(len(
                 self.dict['PPI'])), rje.integerString(hx)))
     except:
         self.errorLog('Problem with SLiMPID.dpi()', quitchoice=True)
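
The heart of the loop above is a support filter: a spoke is kept for a domain only if the domain's hubs contribute it at least twice ("Add with redundancy" followed by removal of spokes with count == 1). An illustrative standalone sketch of that filter on plain dictionaries, not SLiMPID code:

from collections import Counter

def shared_interactors(ppi, hubs, minsupport=2):
    """Return the sorted spokes appearing at least minsupport times across the hubs' PPI lists."""
    counts = Counter()
    for hub in hubs:
        counts.update(ppi.get(hub, []))
    return sorted(spoke for spoke, n in counts.items() if spoke and n >= minsupport)

# e.g. shared_interactors({'h1': ['a', 'b'], 'h2': ['b', 'c']}, ['h1', 'h2']) -> ['b']
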
Example #3
 def powerGO(self,numbers,sig=0.01,samples='all',total='Total',countkey='counts',ignore=[]):  ### Special GO power calculation for GO slim set
     '''
     Special GO power calculation for GO slim set.
     >> numbers:dictionary of {Sample:Count}
     >> sig:float [0.01] = Desired significance level to achieve. Currently uncorrected. Add Bonf/FDR with time.
     >> samples:str ['all'] = Whether sig must be achievable for 'any' or 'all' samples.
     >> total:str ['Total'] = Sample containing Total counts to compare against
     >> countkey:str ['counts'] = Key identifying count dictionary for each GO term and 'total' count sample
     - self.go(id)[countkey] = {Sample:count}
     >> ignore:list of Samples to ignore from calculation
     << returns a list of GO IDs that meet criteria
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         N = numbers[total]        # Total count for calculating expectations/probabilities
         nlist = []                  # List of counts for subsamples to be assessed
         for sample in numbers:
             if sample not in ignore + [total]: nlist.append(numbers[sample])
         nlist = rje.sortUnique(nlist,xreplace=False,num=True)
         ### ~ [2] ~ Generate Power Range ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         plist = []                  # List of acceptable Total counts for subset
         nx = 0.0
         for i in range(1,N+1):      # Look at all possible levels of occurrence
             self.progLog('#POW','Calculating GO term power: %.1f%%' % (nx/N))
             nx += 100.0
             ok = 0
             p = float(i) / N        # Probability of each gene having this term
             for n in nlist:         # Look at each subset
                 k1 = min(i,n)       # Want to look at largest possible count for sample-term pairing
                 k2 = max(0,n-(N-i)) # Also want to look at the likelihood of under-representation
                 if rje.binomial(k1,n,p,callobj=self) <= sig: ok += 1
                 elif (1 - rje.binomial(k2+1,n,p,callobj=self)) <= sig: ok += 1
                 #!# Add under-representation too! #!#
                 if ok and samples == 'any': break
             if (ok and samples == 'any') or ok == len(nlist): plist.append(i)
         self.printLog('\r#POW','Calculation of GO term power complete.',log=False)
         self.deBug(nlist)
         ### ~ [3] ~ Generate GO Slim ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         terms = []
         (ix,itot) = (0.0,len(self.go()))
         for id in rje.sortKeys(self.go()):
             self.progLog('#POW','Assessing terms for power: %.1f%% (%s terms)' % (ix/itot,rje.iLen(terms)))
             ix += 100.0
             if self.go(id)[countkey][total] in plist: terms.append(id)
         self.printLog('\r#POW','Assessed terms for statistical power, p <= %s: %s GO terms' % (sig,rje.iLen(terms)))
         #!# Add correction terms #!#
         self.deBug(terms)
         return terms
     except: self.errorLog('Major problem with GO.powerGO()')
     return []
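
For reference, the per-term test inside the powerGO() loop can be written as a standalone function. The sketch below assumes rje.binomial(k,n,p) returns the upper-tail probability P(X >= k) for X ~ Binomial(n,p); it illustrates the samples='all' logic only and is not the rje implementation (Python 3.8+ for math.comb):

from math import comb   # Python 3.8+

def binom_upper_tail(k, n, p):
    """Exact P(X >= k) for X ~ Binomial(n, p)."""
    return sum(comb(n, i) * p**i * (1.0 - p)**(n - i) for i in range(k, n + 1))

def term_has_power(i, N, nlist, sig=0.01):
    """Could a term occurring i times out of N reach significance (over- or under-represented)
    for every subsample size n in nlist? Mirrors the inner loop of powerGO()."""
    p = float(i) / N
    for n in nlist:
        k1 = min(i, n)               # largest possible count for the sample-term pairing
        k2 = max(0, n - (N - i))     # smallest possible count (under-representation)
        over = binom_upper_tail(k1, n, p) <= sig
        under = (1.0 - binom_upper_tail(k2 + 1, n, p)) <= sig
        if not (over or under):
            return False
    return True
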
Example #4
 def fpi(self):  ### Family-protein interactions
     '''Family-protein interactions.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not self.dict['Domain']: return
         outdir = 'SLiMPID_FPI'
         rje.mkDir(self,outdir)
         fpi = {}            # Dictionary of {family:[interactors]}
         badname = []
         ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for qry in rje.sortKeys(self.dict['PPI']):
             try:
                 fam = self.dict['Fam'][qry]
                 if len(fam) < 2: continue
             except: self.errorLog('Problem with "%s" protein family' % qry); continue
             fpi[qry] = []
             for hub in fam:
                 if hub not in self.dict['PPI']: continue
                 fpi[qry] += self.dict['PPI'][hub]      # Add with redundancy
             for spoke in fpi[qry][0:]:
                 if fpi[qry].count(spoke) == 1: fpi[qry].remove(spoke)   # Must have 2+ family interactions
             for hub in fam:
                 if hub not in self.dict['PPI']: continue
                 for spoke in self.dict['PPI'][hub][0:]:
                     if spoke in fpi[qry]:
                         self.dict['PPI'][hub].remove(spoke)
                         if spoke in self.dict['PPI'] and hub in self.dict['PPI'][spoke]: self.dict['PPI'][spoke].remove(hub)
             fpi[qry] = rje.sortUnique(fpi[qry],False,False)
             acc = []
             gene = self.dict['Gene'][qry]
             for name in fpi[qry]:
                 if not name: continue
                 if name in self.dict['Seq']: acc.append(self.dict['Seq'][name].info['AccNum'])
                 elif name not in badname: badname.append(name)                     
             open('%s/%s.fpi.acc' % (outdir,gene),'w').write(string.join(acc,'\n'))
             self.printLog('#FPI','%s family => %d interactors' % (gene,len(acc)))
         if badname:
             badname.sort()
             self.printLog('#BAD','%d "bad" protein names: %s' % (len(badname),string.join(badname,'; ')))
         ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         hx = len(self.dict['PPI'])
         for hub in rje.sortKeys(self.dict['PPI']):
             if hub and self.dict['PPI'][hub]: continue
             self.dict['PPI'].pop(hub)
             self.printLog('#FPI','No %s PPI left after FPI removed' % hub)
         self.printLog('#PPX','%s of %s PPI hubs remain after FPI removed' % (rje.integerString(len(self.dict['PPI'])),rje.integerString(hx)))
     except: self.errorLog('Problem with SLiMPID.fpi()',quitchoice=True)
Example #5
 def readResults(self,clear=True,readaln=False):  ### Reads results from self.list['HMMRes'] into objects
     '''
     Reads results from self.list['HMMRes'] into objects.
     >> clear:boolean = whether to clear self.search before reading [True]
     >> readaln:boolean = whether to bother reading Alignments into objects [False]
     '''
     try:
         if clear: self.search = []
         for resfile in rje.sortUnique(self.list['HMMRes'],xreplace=False):
             if not os.path.exists(resfile) and self.opt['GZip'] and os.path.exists('%s.gz' % resfile):
                 os.system('gunzip %s.gz' % resfile)
                 self.printLog('#GUNZIP','Gunzipped %s.gz' % resfile)
             if self.opt['HMMPFam']: self.readHMMPFamSearch(resfile,readaln)
             else: self.readHMMSearch(resfile,readaln)
             if self.opt['GZip'] and os.path.exists(resfile):
                 rje.backup(self,'%s.gz' % resfile,unlink=True)
                 os.system('gzip %s' % resfile)
                 self.printLog('#GZIP','%s gzipped to save space' % resfile)
     except:
         self.log.errorLog('Hmm indeed. rje_hmm.readResults() gone awry!',quitchoice=True)
         return False
Example #6
 def dpi(self):  ### Domain-protein interactions
     '''Domain-protein interactions.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not self.dict['Domain']: return
         outdir = 'SLiMPID_DPI'
         rje.mkDir(self,outdir)
         dpi = {}            # Dictionary of {domain:[interactors]}
         badname = []
         ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for dom in rje.sortKeys(self.dict['Domain']):
             dpi[dom] = []
             for hub in self.dict['Domain'][dom]:
                 if hub in self.dict['PPI']: dpi[dom] += self.dict['PPI'][hub]      # Add with redundancy
             for spoke in dpi[dom][0:]:
                 if dpi[dom].count(spoke) == 1: dpi[dom].remove(spoke)   # Must have 2+ domain interactions
             for hub in self.dict['Domain'][dom]:
                 if hub not in self.dict['PPI']: continue
                 for spoke in self.dict['PPI'][hub][0:]:
                     if spoke in dpi[dom]:
                         self.dict['PPI'][hub].remove(spoke)
                         if spoke in self.dict['PPI'] and hub in self.dict['PPI'][spoke]: self.dict['PPI'][spoke].remove(hub)
             dpi[dom] = rje.sortUnique(dpi[dom],False,False)
             acc = []
             for name in dpi[dom]:
                 if not name: continue
                 if name in self.dict['Seq']: acc.append(self.dict['Seq'][name].info['AccNum'])
                 elif name not in badname: badname.append(name) 
             open('%s/%s.dpi.acc' % (outdir,dom),'w').write(string.join(acc,'\n'))
             self.printLog('#DPI','%s domain => %d interactors' % (dom,len(acc)))
         if badname:
             badname.sort()
             self.printLog('#BAD','%d "bad" protein names: %s' % (len(badname),string.join(badname,'; ')))
         ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         hx = len(self.dict['PPI'])
         for hub in rje.sortKeys(self.dict['PPI']):
             if hub and self.dict['PPI'][hub]: continue
             self.dict['PPI'].pop(hub)
             self.printLog('#DPI','No %s PPI left after DPI removed' % hub,screen=False)
         self.printLog('#PPX','%s of %s PPI hubs remain after DPI removed' % (rje.integerString(len(self.dict['PPI'])),rje.integerString(hx)))
     except: self.errorLog('Problem with SLiMPID.dpi()',quitchoice=True)
Example #7
 def treeListSPCode(self):  ### Main taxa mapping from list of tree files
     '''Main taxa mapping from list of tree files.'''
     try:  ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         db = self.db()
         specdb = self.db('spcode',
                          add=True,
                          forcecheck=True,
                          mainkeys=['protein'])
         if not specdb and self.getStrLC('TaxBase') and not self.force():
             spfile = '%s.spcode.tdt' % self.getStr('TaxBase')
             specdb = db.addTable(spfile,
                                  mainkeys=['protein'],
                                  name='spcode',
                                  expect=False)
         if specdb:
             specdb.dataFormat({'boot': 'num'})
             return True
         specdb = db.addEmptyTable(
             'spcode',
             ['protein', 'boot', 'spcode', 'inpara', 'paralogues'],
             ['protein'])
         #dupdb = db.addEmptyTable('para',['protein','paralogues'],['protein'])
         self.dict['Duplicates'] = {}  # {prot1:[dups]}
         ### ~ [2] ~ Add main run code here ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for nwkfile in self.list['NwkList']:
             tree = rje_tree.Tree(self.log, self.cmd_list)
             tree.loadTree(nwkfile, seqlist=None, postprocess=False)
             seqacc = rje.baseFile(nwkfile, strip_path=True)
             # Identify node corresponding to query sequence
             seqnode = None
             for node in tree.nodes():
                 try:
                     if string.split(node.shortName(), '__')[1] == seqacc:
                         seqnode = node
                 except:
                     pass  # Internal node or bad sequence format
             if not seqnode:
                 self.warnLog('Could not find %s in %s nodes!' %
                              (seqacc, nwkfile))
                 continue
             # Get species code for query sequence
             seqspec = tree.cladeSpec(seqnode)
             if len(seqspec) != 1:
                 self.warnLog('Could not find species in %s node!' %
                              (seqacc))
                 continue
             seqspec = seqspec.keys()[0]
             if seqspec != string.split(seqnode.shortName(), '_')[1]:
                 raise ValueError('Species mismatch for %s & %s' %
                                  (seqacc, seqnode.shortName()))
             # Find ancestor with closest orthologue outgroup
             rootnode = tree._getRootNode()
             if not rootnode:
                 self.warnLog('Could not find root node in %s!' % (nwkfile))
                 continue
             ancnode = seqnode.ancNode()
             try:
                 bootx = float(ancnode.ancBranch().stat['Bootstrap']
                               ) / tree.stat['Bootstraps']
             except:
                 bootx = 1.0
             inparanode = None  # Node to define in-paralogues
             ancspec = tree.cladeSpec(ancnode)
             while len(ancspec) < 2 or bootx < self.getNum('MinBoot'):
                 inparanode = ancnode  # All same species
                 if ancnode == rootnode: break
                 ancnode = ancnode.ancNode()
                 ancspec = tree.cladeSpec(ancnode)
                 try:
                     bootx = float(ancnode.ancBranch().stat['Bootstrap']
                                   ) / tree.stat['Bootstraps']
                 except:
                     bootx = 1.0
             ancspec.pop(
                 seqspec)  # Now only have counts of closest other species
             # Update table, replacing species codes with genera?
             sentry = {
                 'protein': seqacc,
                 'spcode': rje.sortUnique(ancspec.keys())
             }
             sentry['boot'] = bootx
             if not ancspec:
                 sentry['spcode'] = ['None']
                 sentry['boot'] = self.getNum('NoneBoot')
             sentry['spcode'] = string.join(sentry['spcode'], '|')
             # Establish list of duplicate proteins
             inpara = []  # List of in-paralogue nodes
             inparacc = []  # List of in-paralogue accnum
             if inparanode:
                 inpara = tree._nodeClade(inparanode, internal=False)
             self.dict['Duplicates'][seqacc] = []
             for node in tree._nodeClade(rootnode, internal=False):
                 if node == seqnode: continue
                 if len(string.split(node.shortName(), '_')) < 2: continue
                 if string.split(node.shortName(), '_')[1] == seqspec:
                     paracc = string.split(node.shortName(), '__')[1]
                     if node in inpara: inparacc.append(paracc)
                     else: self.dict['Duplicates'][seqacc].append(paracc)
             sentry['inpara'] = string.join(inparacc, '|')
             sentry['paralogues'] = string.join(
                 self.dict['Duplicates'][seqacc], '|')
             specdb.addEntry(sentry)
         ## Update specdb and save
         specdb.saveToFile()
         #dupdb.saveToFile()
         return True
     except:
         self.errorLog(self.zen())
         return False
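
Node names above are assumed to follow the rje 'GENE_SPECIES__AccNum' convention, which is why the accession is taken after '__' and the species code after the first '_'. A tiny illustrative parser for that convention (not part of rje_tree):

def parse_short_name(short_name):
    """Split an rje-style 'GENE_SPECIES__AccNum' name into (gene, spcode, accnum).
    Returns None for internal nodes or names that do not follow the convention."""
    try:
        prefix, accnum = short_name.split('__', 1)
        gene, spcode = prefix.split('_', 1)
        return (gene, spcode, accnum)
    except ValueError:
        return None

# e.g. parse_short_name('CDC28_YEAST__P00546') -> ('CDC28', 'YEAST', 'P00546')
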
Example #8
    def setup(self):    ### Main class setup method.
        '''Main class setup method.'''
        try:### ~ [1] Setup Objects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if not self.getStrLC('DBSource'): self.setStr({'DBSource':string.split(rje.stripPath(self.getStr('MITAB')),'.')[0]})
            if not self.obj['DB']: self.obj['DB'] = rje_db.Database(self.log,self.cmd_list)
            pdb = self.db('pairwise',add=False)
            pfields = ['#','Hub','Spoke','HubUni','SpokeUni','HubTaxID','SpokeTaxID','Evidence','IType']
            if not pdb: self.db().addEmptyTable('pairwise',pfields,['#'],log=True)
            if not self.obj['XRef']:
                xcmd = ['mapfields=Gene,%s,Secondary,Ensembl,Aliases,Accessions,RefSeq,Previous Symbols,Synonyms' % self.getStr('UniField')]
                self.obj['XRef'] = rje_xref.XRef(self.log,xcmd+self.cmd_list)
                self.obj['XRef'].setup()
            skip_comments = True
            for field in self.list['IDField']:
                if field[:1] == '#': skip_comments = False
            if self.list['MapDB'] and 'uniprotkb' not in self.list['MapDB']:
                self.list['MapDB'].append('uniprotkb')
                self.printLog('#MAP','uniprotkb added to MapDB list.')
            elif not self.list['MapDB']: self.printLog('#MAP','No MapDB list: will attempt to match all IDs to xref KeyID "%s".' % self.obj['XRef'].getStr('KeyID'))
            ### ~ [2] Setup MITAB File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.open('MITAB')
            if not self.file['MITAB']: raise IOError
            self.printLog('#MITAB','Parse PPI from %s.' % self.getStr('MITAB'))
            ## ~ [2a] MITAB file headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            headers = []
            while not headers:
                self.list['Headers'] = headers = self.readDelimit('MITAB')
                if not headers: break
                if headers[0][:1] == '#' and skip_comments: headers = []; continue
            #self.debug(headers)
            ## ~ [2b] IDField headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            idfields = []
            for hfield in headers:
                #self.bugPrint(hfield.upper())
                for idfield in rje.sortUnique(self.list['IDField'])[0:]:
                    idfield = string.replace(idfield.upper(),'(','\(')
                    idfield = string.replace(idfield,')','\)')
                    idmatch = rje.matchExp('^(%s\s?[AB])$' % idfield.upper(),hfield.upper())
                    if not idmatch: idmatch = rje.matchExp('^(%s\s?[AB]) \(\S+\)$' % idfield.upper(),hfield.upper())
                    if idmatch and hfield not in idfields:
                        idfields.append(hfield)
                        self.printLog('#ID','IDField: %s' % hfield)
                        #self.bugPrint(idfields)
                        break
            #self.debug(idfields)
            self.list['IDField'] = idfields
            if not self.list['IDField']: raise ValueError('No IDField found in MITAB headers.')
            ## ~ [2c] TaxaField headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            taxafields = []
            for tfield in self.list['TaxaField'][0:]:
                for hfield in headers:
                    tmatch = rje.matchExp('^(%s\s?[AB])$' % tfield.upper(),hfield.upper())
                    if not tmatch: tmatch = rje.matchExp('^(%s\s?[AB]) \(\S+\)$' % tfield.upper(),hfield.upper())
                    if tmatch and hfield not in taxafields:
                        taxafields.append(hfield)
                        self.printLog('#TAX','TaxaField: %s' % hfield)
            self.list['TaxaField'] = taxafields
            if not self.list['TaxaField']: self.warnLog('No TaxaField found in MITAB headers.',quitchoice=True)
            ## ~ [2d] MethodField headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            methfields = []
            lctypes = rje.listLower(self.list['MethodField'])
            for hfield in headers:
                if hfield.lower() in lctypes:
                    methfields.append(hfield)
                    self.printLog('#METH','MethodField: %s' % hfield)
            self.list['MethodField'] = methfields
            if not self.list['MethodField']: self.warnLog('No MethodField found in MITAB headers.',quitchoice=True)
            ## ~ [2e] TypeField headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            typefields = []
            lctypes = rje.listLower(self.list['TypeField'])
            for hfield in headers:
                if hfield.lower() in lctypes:
                    typefields.append(hfield)
                    self.printLog('#TYPE','TypeField: %s' % hfield)
            self.list['TypeField'] = typefields
            if not self.list['TypeField']: self.warnLog('No TypeField found in MITAB headers.',quitchoice=True)

            return True     # Setup successful
        except: self.errorLog('Problem during %s setup.' % self.prog()); return False  # Setup failed
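
The header matching in [2b] and [2c] pairs each configured field name with MITAB columns named '<field> A', '<field> B' or '<field> A (source)'. A compact, illustrative version of that test using the re module directly (field names in the example are hypothetical):

import re

def is_ab_column(fieldname, header):
    """True if header is fieldname plus A/B, optionally followed by a '(...)' qualifier."""
    pattern = r'^%s\s?[AB](?: \(\S+\))?$' % re.escape(fieldname.upper())
    return re.match(pattern, header.upper()) is not None

# e.g. is_ab_column('Taxid interactor', 'Taxid interactor A') -> True
#      is_ab_column('ID', 'ID A (uniprotkb)') -> True
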
Example #9
    def parseMITAB(self):   ### Parse MITAB file into pairwise PPI table.
        '''Parse MITAB file into pairwise PPI table.'''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            xref = self.obj['XRef']
            pdb = self.db('pairwise')
            pfields = ['Hub','Spoke','HubUni','SpokeUni','HubTaxID','SpokeTaxID','Evidence','IType']
            headers = {}
            for h in range(len(self.list['Headers'])): headers[self.list['Headers'][h]] = h
            dbsource = self.getStr('DBSource')
            ### ~ [2] Read through MITAB ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            mx = 0; ex = 0; fax = 0; ftx = 0; fx = 0; uhx = 0; usx = 0
            epos = self.endPos('MITAB')
            complexidlist = []
            badtaxa = ['-']
            baduni = []
            while 1:
                self.progLog('\r#MITAB','Parsing %s MITAB %s: %s lines; %s ppi; %s taxa-filtered; %s ambiguous; %s failed; %s complexes.' % (dbsource,self.fileProg('MITAB',epos),rje.iStr(mx),rje.iStr(ex),rje.iStr(ftx),rje.iStr(fax),rje.iStr(fx),rje.iLen(complexidlist)))
                mline = self.readDelimit('MITAB'); mx += 1
                if not mline: break
                entry = {'#':pdb.entryNum()}
                for field in pfields: entry[field] = ''
                ## ~ [2a] Add iRefIndex complexes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                complexid = {}     # This will take the first complex ID
                if 'irigid' in self.list['Headers'] and 'numParticipants' in self.list['Headers']:
                    if int(mline[headers['numParticipants']]) > 2:
                        complexid['A'] = complexid['B'] = 'rigid:%s' % mline[headers['irigid']]
                        #self.bugPrint(mline)
                        #self.debug(complexid)
                ## ~ [2b] Parse and check taxa ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                taxa = {'A':'','B':''}
                for tfield in self.list['TaxaField']:
                    ab = tfield[-1:].upper()
                    if ab == ')': ab = rje.matchExp('([AB]) \(\S+\)$',tfield.upper())[0]
                    try:
                        taxon = rje.matchExp('^taxid:(\d+)',mline[headers[tfield]].lower())[0]
                        if self.list['TaxID'] and taxon not in self.list['TaxID']: continue
                        taxa[ab] = taxon
                    except:
                        taxon = mline[headers[tfield]]
                        if taxon not in badtaxa:
                            badtaxa.append(taxon)
                            self.warnLog('No TaxID read from %s: "%s"' % (tfield,taxon),'no_tax',suppress=True)
                        if not self.list['TaxID']: taxa[ab] = '-'
                if not taxa['A'] and complexid: taxa['A'] = taxa['B']
                if not taxa['B'] and complexid: taxa['B'] = taxa['A']
                if not (taxa['A'] and taxa['B']): ftx += 1; continue
                ## ~ [2c] Parse protein IDs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                ids = {'A':[],'B':[]}
                uni = {'A':'','B':''}
                for ifield in self.list['IDField']:
                    ab = ifield[-1:].upper()
                    if ab == ')': ab = rje.matchExp('([AB]) \(\S+\)$',ifield.upper())[0]
                    # Split IDs on | then db:id vs self.list['MapDB']
                    for pid in string.split(mline[headers[ifield]],'|'):
                        try: (db,dbid) = string.split(pid,':',1)
                        except: continue
                        if db.lower() in ['uniprotkb'] and '(' in dbid: continue    # Only map uniprotkb accnum
                        dbid = string.split(dbid,'(')[0]
                        dbid = string.split(dbid,';')[0]
                        if db.lower() in ['uniprotkb']:
                            svid = dbid
                            dbid = string.split(svid,'-')[0]
                        if ab not in complexid:     # First identifier for A/B
                            if db.lower() in self.list['Complex']: complexid[ab] = pid; ids[ab].append(pid)
                            else: complexid[ab] = ''
                        if not self.list['MapDB'] or db.lower() in self.list['MapDB']: ids[ab].append(dbid)
                        # Parse uniprot directly if possible
                        if db.lower() in ['uniprotkb'] and not uni[ab]:
                            if self.getBool('SpliceVar'): uni[ab] = svid
                            else: uni[ab] = dbid
                #self.bugPrint(ids)
                ## ~ [2d] Map parsed IDs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                amb = {'A':False,'B':False}
                if not ids['A'] or not ids['B']:
                    #self.bugPrint('%s\n=> ID Failure' % mline)
                    #self.bugPrint(ids['A']); self.bugPrint(ids['B'])
                    #self.bugPrint(entry)
                    fx += 1; continue
                for ida in ids['A']:
                    #self.debug('%s => %s (or %s)' % (ida,xref.xref(ida,unique=True),xref.xref(ida,unique=False)))
                    if not entry['Hub']: entry['Hub'] = xref.xref(ida,unique=True,usedict=True)
                    if entry['Hub'] == False: amb['A'] = True
                    #if not entry['HubUni']: entry['HubUni'] = xref.xref(ida,self.getStr('UniField'),unique=True,usedict=True)
                    if not entry['HubUni']: entry['HubUni'] = self.getUniXRef(ida)
                if self.getBool('AddUni') and not entry['HubUni']:
                    entry['HubUni'] = uni['A']
                    if uni['A'] and uni['A'] not in baduni: baduni.append(uni['A'])
                if not entry['Hub'] and entry['HubUni']:
                    entry['Hub'] = entry['HubUni']
                    #self.warnLog('UniprotKB "%s" used for Hub' % entry['HubUni'],'unihub',suppress=True)
                    uhx += 1
                if not entry['Hub'] and complexid['A']:
                    entry['Hub'] = complexid['A']
                else: complexid['A'] = ''
                if self.getBool('UniOnly') and not complexid['A'] and not entry['HubUni']: entry['Hub'] = ''
                for idb in ids['B']:
                    if not entry['Spoke']: entry['Spoke'] = xref.xref(idb,unique=True,usedict=True)
                    if entry['Spoke'] == False: amb['B'] = True
                    #if not entry['SpokeUni']: entry['SpokeUni'] = xref.xref(idb,self.getStr('UniField'),unique=True,usedict=True)
                    if not entry['SpokeUni']: entry['SpokeUni'] = self.getUniXRef(idb)
                if self.getBool('AddUni') and not entry['SpokeUni']: entry['SpokeUni'] = uni['B']
                if not entry['Spoke'] and entry['SpokeUni']:
                    entry['Spoke'] = entry['SpokeUni']
                    #self.warnLog('UniprotKB "%s" used for Spoke' % entry['SpokeUni'],'unihub',suppress=True)
                    usx += 1
                if not entry['Spoke'] and complexid['B']:
                    entry['Spoke'] = complexid['B']
                else: complexid['B'] = ''
                if self.getBool('UniOnly') and not complexid['B'] and not entry['SpokeUni']:
                    entry['Spoke'] = ''
                    if uni['B'] and uni['B'] not in baduni: baduni.append(uni['B'])
                if complexid['A'] and complexid['B']:
                    if not (complexid['A'].startswith('rigid:') and complexid['B'].startswith('rigid:')):
                        self.printLog('\r#MITAB','',log=False)
                        self.warnLog('Cannot parse complex:complex PPI (%s & %s)' % (complexid['A'],complexid['B']),'complex-complex',suppress=True)
                    entry['Hub'] = entry['Spoke'] = ''
                #self.bugPrint(entry)
                #self.debug(complexid)
                if not (entry['Hub'] and entry['Spoke']):
                    if (entry['Hub'] or amb['A']) and (entry['Spoke'] or amb['B']):
                        fax += 1; continue
                    #self.bugPrint(mline); self.debug(entry)
                    fx += 1; continue
                #if self.dev() and 'PCNA' not in [entry['Hub'],entry['Spoke']]: continue
                entry['HubTaxID'] = taxa['A']
                entry['SpokeTaxID'] = taxa['B']
                if complexid['A'] and complexid['A'] not in complexidlist: complexidlist.append(complexid['A'])
                if complexid['B'] and complexid['B'] not in complexidlist: complexidlist.append(complexid['B'])
                #if complexid['A'] or complexid['B']: self.debug(entry)
                ## ~ [2e] Parse evidence ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                #self.bugPrint(mline)
                evidence = []
                for tfield in self.list['MethodField']:
                    #self.bugPrint(string.split(mline[headers[tfield]],'|'))
                    for etype in string.split(mline[headers[tfield]],'|'):
                        ematch = rje.matchExp('MI:\d+"?\((.+)\)',etype)
                        if ematch: evidence.append('%s:%s' % (dbsource,ematch[0]))
                if not evidence: evidence.append('%s:unknown' % (self.getStr('DBSource')))
                evidence = rje.sortUnique(evidence)
                #self.debug(evidence)
                entry['Evidence'] = string.join(evidence,'|')
                ## ~ [2f] Parse interaction types ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                itypes = []
                for tfield in self.list['TypeField']:
                    #self.bugPrint(string.split(mline[headers[tfield]],'|'))
                    for etype in string.split(mline[headers[tfield]],'|'):
                        ematch = rje.matchExp('MI:\d+"?\((.+)\)',etype)
                        if ematch: itypes.append(ematch[0])
                if not itypes: itypes.append('unknown')
                itypes = rje.sortUnique(itypes)
                #self.debug(itypes)
                entry['IType'] = string.join(itypes,'|')
                pdb.addEntry(entry); ex += 1
                if self.dev() and entry['Hub'] in ['KLF3']:#,'WDR5']:
                    self.printLog('#DEV',string.join(mline,'\t'))
                    #self.bugPrint(uni); self.debug(entry)
                if self.getBool('Symmetry') and not complexid['A'] and not complexid['B']:
                    pdb.addEntry({'#':pdb.entryNum(),'Hub':entry['Spoke'],'Spoke':entry['Hub'],
                                  'HubUni':entry['SpokeUni'],'SpokeUni':entry['HubUni'],
                                  'HubTaxID':entry['SpokeTaxID'],'SpokeTaxID':entry['HubTaxID'],
                                  'Evidence':entry['Evidence'],'IType':entry['IType']})
            self.printLog('\r#MITAB','Parsing %s MITAB complete: %s lines; %s ppi; %s taxa-filtered; %s ambiguous; %s failed; %s complexes.' % (dbsource,rje.iStr(mx),rje.iStr(ex),rje.iStr(ftx),rje.iStr(fax),rje.iStr(fx),rje.iLen(complexidlist)))
            self.close('MITAB')
            if (uhx+usx): self.warnLog('UniprotKB IDs used for %s Hub and %s Spoke IDs.' % (rje.iStr(uhx),rje.iStr(usx)))
            if baduni:
                baduni.sort()
                accout = '%s.%s.unmapped.uniacc' % (self.baseFile(),dbsource)
                self.warnLog('%s unmapped UniprotKB IDs used: output to %s.' % (rje.iLen(baduni),accout))
                open(accout,'w').write(string.join(baduni,'\n'))

            ### ~ [3] Convert complexes to pairwise PPIs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if not complexidlist: return pdb
            self.printLog('#CPLEX','%s complex IDs parsed to convert to pairwise PPI.' % rje.iLen(complexidlist))
            ## ~ [3a] Assemble complex memberships ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            complexes = {}; chentries = []; csentries = []
            cevidence = {}  # List of Evidence for each complex
            citypes = {}    # List of ITypes for each complex
            ctaxa = {}
            ex = 0.0; etot = pdb.entryNum()
            for entry in pdb.entries():
                self.progLog('\r#CPLEX','Assembling complexes: %.1f%%' % (ex/etot)); ex += 100.0
                if entry['Hub'] in complexidlist:
                    cid = entry['Hub']
                    if cid not in complexes: complexes[cid] = []; cevidence[cid] = []; citypes[cid] = []
                    complexes[cid].append(entry['Spoke'])
                    ctaxa[entry['Spoke']] = entry['SpokeTaxID']
                    cevidence[cid].append(entry['Evidence'])
                    citypes[cid].append(entry['IType'])
                    chentries.append(entry)
                elif entry['Spoke'] in complexidlist:
                    cid = entry['Spoke']
                    if cid not in complexes: complexes[cid] = []; cevidence[cid] = []; citypes[cid] = []
                    complexes[cid].append(entry['Hub'])
                    ctaxa[entry['Hub']] = entry['HubTaxID']
                    cevidence[cid].append(entry['Evidence'])
                    citypes[cid].append(entry['IType'])
                    csentries.append(entry)
            self.printLog('\r#CPLEX','Assembled %s of %s complexes.' % (rje.iLen(complexes),rje.iLen(complexidlist)))
            #self.debug(complexes)
            ## ~ [3b] Update complexes dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            cppi = {}
            ex = 0.0; etot = len(complexes); rx = 0; px = 0; cmax = 0
            for cid in rje.sortKeys(complexes):
                self.progLog('\r#CPLEX','Reducing complexes: %.1f%%' % (ex/etot)); ex += 100.0
                if self.dev(): self.printLog('#DEV','Complex %s: %s' % (cid,complexes[cid]))
                if len(complexes[cid]) < 2:
                    complexes.pop(cid)
                    cevidence.pop(cid)
                    citypes.pop(cid)
                    rx += 1; continue
                complexes[cid].sort()
                #cevidence[cid] = string.join(rje.sortUnique(cevidence[cid]),'|')
                #citypes[cid] = string.join(rje.sortUnique(citypes[cid]),'|')
                cmax = max(cmax,len(complexes[cid]))
                #px += (len(complexes[cid]) * (len(complexes[cid])-1))
                members = complexes[cid][0:]
                while members:
                    hub = members.pop(0)
                    if self.dev() and hub == 'KLF3': self.debug(cid)
                    if hub not in cppi: cppi[hub] = {}
                    for spoke in members:
                        if spoke not in cppi[hub]:
                            cppi[hub][spoke] = []; px += 1
                        cppi[hub][spoke].append(cid)   # record every complex id supporting this pair
            self.printLog('\r#CPLEX','Reduced %s complexes to %s > 1 member: %s ppi to add.' % (rje.iStr(etot),rje.iLen(complexes),rje.iStr(px)))
            ## ~ [3c] Update pairwise PPI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            cix = pdb.entryNum()
            for centry in chentries + csentries: pdb.dropEntry(centry)
            ex = 0.0; etot = len(cppi)
            for hub in rje.sortKeys(cppi):
                self.progLog('\r#CPLEX','Expanding complexes: %.1f%%' % (ex/etot)); ex += 100.0
                #hentry = {'Hub':hub,'HubUni':xref.xref(hub,self.getStr('UniField'),unique=True,usedict=True),'HubTaxID':ctaxa[hub]}
                hentry = {'Hub':hub,'HubUni':self.getUniXRef(hub),'HubTaxID':ctaxa[hub]}
                for spoke in rje.sortKeys(cppi[hub]):
                    evidence = []
                    itypes = []
                    ctypes = []
                    for cid in cppi[hub][spoke]:
                        evidence += cevidence[cid]
                        itypes += citypes[cid]
                        ctypes.append(string.split(cid,':')[0])   # e.g. 'rigid:1234' -> 'rigid'
                    ctype = string.join(rje.sortUnique(ctypes),'|')
                    evidence = string.join(rje.sortUnique(evidence),'|')
                    if not evidence: evidence = '%s:%s' % (dbsource,ctype)
                    itypes = string.join(rje.sortUnique(itypes),'|')
                    if not itypes: itypes = ctype
                    #newentry = {'#':cix,'Spoke':spoke,'SpokeUni':xref.xref(spoke,self.getStr('UniField'),unique=True,usedict=True),'SpokeTaxID':ctaxa[spoke]}
                    newentry = {'#':cix,'Spoke':spoke,'SpokeUni':self.getUniXRef(spoke),'SpokeTaxID':ctaxa[spoke]}
                    newentry['Evidence'] = evidence
                    newentry['IType'] = itypes
                    entry = pdb.addEntry(rje.combineDict(newentry,hentry,overwrite=False)); cix += 1
                    if self.dev() and entry['Hub'] in ['KLF3','WDR5']: self.debug('Complex: %s' % entry)
                    if self.getBool('Symmetry'):
                        pdb.addEntry({'#':cix,'Hub':entry['Spoke'],'Spoke':entry['Hub'],
                                      'HubUni':entry['SpokeUni'],'SpokeUni':entry['HubUni'],
                                      'HubTaxID':entry['SpokeTaxID'],'SpokeTaxID':entry['HubTaxID'],
                                      'Evidence':entry['Evidence'],'IType':entry['IType']})
                        cix += 1
            self.printLog('#CPLEX','%s complex IDs expanded to pairwise PPI => %s ppi (symmetry=%s).' % (rje.iLen(complexidlist),rje.iStr(pdb.entryNum()),self.getBool('Symmetry')))
            return pdb
        except: self.errorLog('%s.parseMITAB error' % self.prog())
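
Section [3] above expands each multi-member complex into pairwise interactions (the matrix model: every unordered pair of members). A minimal illustrative sketch of that expansion, independent of the pairwise table logic:

from itertools import combinations

def complex_to_pairs(members):
    """All unordered (hub, spoke) pairs from a complex membership list."""
    return list(combinations(sorted(set(members)), 2))

# e.g. complex_to_pairs(['A', 'B', 'C', 'B']) -> [('A', 'B'), ('A', 'C'), ('B', 'C')]
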
Example #10
 def seqSubset2(self):    ### Extracts sequence subset from MOUSE cDNA and Peptide libraries
     '''Extracts sequence subset from MOUSE cDNA and Peptide libraries.'''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if os.path.exists('%s.map.tdt' % self.baseFile()):
             mdb = self.db().addTable('%s.map.tdt' % self.baseFile(),
                                      mainkeys=['Ingolia'],
                                      name='map')
         else:
             ### ~ [2] Load Mouse Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             xfile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.data.tdt'
             xref = self.db().addTable(xfile, mainkeys=['Gene'], name='xref')
             afile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.alias.tdt'
             self.obj['Map'] = rje_genemap.GeneMap(self.log, self.cmd_list)
             #self.obj['Map'].loadPickle('../../../../../Databases/DBase_120225/MGI/mousemap.120324.pickle')
             self.obj['Map'].loadData(
                 ['sourcedata=%s' % xfile,
                  'aliases=%s' % afile])
             ing_genes = string.split(
                 string.join(
                     self.db('starts').index('Gene').keys()).upper())
             map = self.obj['Map']
             ing_map = {}
             for gene in ing_genes:
                 ing_map[gene] = map.bestMap(gene)
             ing_mgi = rje.sortUnique(ing_map.values())
             self.printLog(
                 '#MUSG', '%s Ingolia genes mapped onto %s MGI genes' %
                 (rje.iLen(ing_genes), rje.iLen(ing_mgi)))
             xdb = self.db('xref')
             bad_genes = []
             for gene in ing_mgi[0:]:
                 if gene not in xdb.data():
                     self.printLog(
                         '#MAP',
                         'Cannot map gene "%s" from Ingolia data!' % gene)
                     bad_genes.append(gene)
                     ing_mgi.remove(gene)
             self.printLog(
                 '#BAD', 'Failed to map %s genes from Ignolia' %
                 rje.iLen(bad_genes))
             open('ingolia.bad.txt', 'w').write(string.join(bad_genes))
             ### ~ [2] EnsEMBL subset ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             ing_musg = xdb.dataList(xdb.entryList(ing_mgi),
                                     'EnsEMBL',
                                     sortunique=True)
             if '' in ing_musg: ing_musg.remove('')
             self.printLog(
                 '#MUSG', '%s Ingolia genes mapped onto %s EnsEMBL genes' %
                 (rje.iLen(ing_genes), rje.iLen(ing_musg)))
             if not ing_musg: raise ValueError
             self.deBug(ing_musg[:10])
             for stype in ['cdna', 'pep']:
                 seqfile = '../MOUSE/Mus_musculus.NCBIM37.66.%s.all.fa' % stype
                 if self.getBool('Force') or not os.path.exists(seqfile):
                     seqout = 'Ingolia.%s.all.fa' % stype
                     seqcmd = self.cmd_list + [
                         'seqin=%s' % seqfile,
                         'seqout=%s' % seqout, 'autofilter=T', 'autoload=T',
                         'seqmode=file',
                         'gooddesc=%s' % string.join(ing_musg, ',')
                     ]
                     rje_seqlist.SeqList(self.log, seqcmd)
             mdb = self.db().addEmptyTable('map',
                                           ['Ingolia', 'Gene', 'EnsEMBL'],
                                           ['Ingolia'])
             for gene in ing_map:
                 entry = {'Ingolia': gene, 'Gene': ing_map[gene]}
                 if entry['Gene'] in bad_genes: entry['EnsEMBL'] = ''
                 else:
                     entry['EnsEMBL'] = xdb.data()[ing_map[gene]]['EnsEMBL']
                 mdb.addEntry(entry)
         seqfile = 'Ingolia.cdna.all.fa'
         seqcmd = self.cmd_list + [
             'seqin=%s' % seqfile, 'autofilter=F', 'autoload=T',
             'seqmode=file'
         ]
         iseq = rje_seqlist.SeqList(self.log, seqcmd)
         if 'ENST' not in mdb.fields():
             mdb.addField('ENST', evalue='')
             while iseq.nextSeq():
                 (iname, icdna) = iseq.getSeq()
                 musg = rje.matchExp('gene:(\S+)', iname)[0]
                 for entry in mdb.indexEntries('EnsEMBL', musg):
                     if entry['ENST']:
                         entry['ENST'] += ',%s' % string.split(iname)[0]
                     else:
                         entry['ENST'] = string.split(iname)[0]
             mdb.saveToFile()
         ### ~ [3] Generate new start sites from Ignolia Harrington data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         sdb = self.db('starts')
         sdb.dataFormat({'Init Codon [nt]': 'int'})
         icod = 'Init Codon [nt]'
         icon = 'Init Context [-3 to +4]'
         sdb.info['Name'] = 'mapped_start'
         sdb.addField('ENST')
         sdb.addField('ENSP')
         sdb.addField('ENSI')
         ENST = open('IngExact.cdna.all.fa', 'w')
         ENSP = open('IngExact.pep.all.fa', 'w')
         ex = 0.0
         etot = sdb.entryNum()
         sx = 0
         fx = 0
         minpep = 20
         for entry in sdb.entries():
             self.progLog(
                 '\r#ING',
                 'Mapping Ignolia Harrington Starts: %.2f%%' % (ex / etot))
             ex += 100.0
             #self.deBug(entry)
             entry[icon] = entry[icon].upper()
             gene = entry['Gene'].upper()
             mentry = mdb.data(gene)
             entry['ENST'] = entry['ENSI'] = ''
             cdnaseq = peptseq = ''
             if not mentry or not mentry['ENST']:
                 fx += 1
                 continue
             #self.deBug(mentry)
             mtype = 'fail'
             for trans in string.split(mentry['ENST'], ','):
                 (tname, tseq) = iseq.getDictSeq(trans, format='tuple')
                 self.deBug('%s vs %s' %
                            (tseq[entry[icod] - 3:][:7], entry[icon]))
                 if tseq[entry[icod] - 3:][:7] == entry[icon]:
                     ipept = string.split(
                         rje_sequence.dna2prot(tseq[entry[icod]:]), '*')[0]
                     self.deBug(ipept)
                     if len(ipept) > len(peptseq):
                         entry['ENST'] = trans
                         cdnaseq = tseq
                         peptseq = ipept
                         mtype = 'exact'
             if not entry['ENST']:
                 self.printLog(
                     '\r#ING',
                     'Unable to find Harrington start for %s %s (%s)' %
                     (gene, entry[icod], entry[icon]),
                     screen=False)
                 fx += 1
                 continue
             elif len(peptseq) < minpep:
                 self.printLog(
                     '\r#ING',
                     'Peptide from mapped Harrington start for %s %s (%s) too short!'
                     % (gene, entry[icod], entry[icon]),
                     screen=False)
                 fx += 1
                 continue
             id = rje.preZero(int(ex / 100), etot)
             entry['ENSI'] = 'ENSINGT%s' % id
             entry['ENSP'] = 'ENSINGP%s' % id
             ENST.write(
                 '>ENSINGT%s mtype:%s enst:%s gene:%s ingolia:%s mgi:%s\n%s\n'
                 % (id, mtype, entry['ENST'], mentry['EnsEMBL'],
                    entry['Gene'], mentry['Gene'], cdnaseq))
             ENSP.write(
                 '>ENSINGP%s mtype:%s enst:%s gene:%s transcript:ENSINGT%s ingolia:%s mgi:%s\n%s\n'
                 % (id, mtype, entry['ENST'], mentry['EnsEMBL'], id,
                    entry['Gene'], mentry['Gene'], peptseq))
             sx += 1
         sdb.saveToFile('%s.mapped_exact.tdt' % self.baseFile())
         ENST.close()
         ENSP.close()
         self.printLog(
             '\r#ING',
             'Output %s Ingolia peptides and transcripts. %s failed.' %
             (rje.iStr(sx), rje.iStr(fx)))
         return
     except:
         self.errorLog('%s.method error' % self)
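
The exact-start test in section [3] compares the seven cDNA bases around the reported initiation codon with the Ingolia 'Init Context [-3 to +4]' string. A standalone illustrative check (not rje_sequence code):

def matches_init_context(cdna, init_nt, context):
    """True if the 7 bases from init_nt-3 to init_nt+3 (0-based, as sliced above) equal the
    reported initiation context."""
    if init_nt < 3:
        return False                 # avoid a negative slice start
    return cdna.upper()[init_nt - 3:init_nt + 4] == context.upper()

# e.g. matches_init_context('AAAGCCATGGTT', 6, 'GCCATGG') -> True
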
Example #11
 def seqSubset2(self):    ### Extracts sequence subset from MOUSE cDNA and Peptide libraries
     '''Extracts sequence subset from MOUSE cDNA and Peptide libraries.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if os.path.exists('%s.map.tdt' % self.baseFile()):
             mdb = self.db().addTable('%s.map.tdt' % self.baseFile(),mainkeys=['Ingolia'],name='map')
         else:
             ### ~ [2] Load Mouse Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             xfile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.data.tdt'
             xref = self.db().addTable(xfile,mainkeys=['Gene'],name='xref')
             afile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.alias.tdt'
             self.obj['Map'] = rje_genemap.GeneMap(self.log,self.cmd_list)
             #self.obj['Map'].loadPickle('../../../../../Databases/DBase_120225/MGI/mousemap.120324.pickle')
             self.obj['Map'].loadData(['sourcedata=%s' % xfile,'aliases=%s' % afile])
             ing_genes = string.split(string.join(self.db('starts').index('Gene').keys()).upper())
             map = self.obj['Map']
             ing_map = {}
             for gene in ing_genes: ing_map[gene] = map.bestMap(gene)
             ing_mgi = rje.sortUnique(ing_map.values())
             self.printLog('#MUSG','%s Ingolia genes mapped onto %s MGI genes' % (rje.iLen(ing_genes),rje.iLen(ing_mgi)))
             xdb = self.db('xref')
             bad_genes = []
             for gene in ing_mgi[0:]:
                 if gene not in xdb.data():
                     self.printLog('#MAP','Cannot map gene "%s" from Ingolia data!' % gene)
                     bad_genes.append(gene); ing_mgi.remove(gene)
             self.printLog('#BAD','Failed to map %s genes from Ignolia' % rje.iLen(bad_genes))
             open('ingolia.bad.txt','w').write(string.join(bad_genes))
             ### ~ [2] EnsEMBL subset ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             ing_musg = xdb.dataList(xdb.entryList(ing_mgi),'EnsEMBL',sortunique=True)
             if '' in ing_musg: ing_musg.remove('')
             self.printLog('#MUSG','%s Ingolia genes mapped onto %s EnsEMBL genes' % (rje.iLen(ing_genes),rje.iLen(ing_musg)))
             if not ing_musg: raise ValueError
             self.deBug(ing_musg[:10])
             for stype in ['cdna','pep']:
                 seqfile = '../MOUSE/Mus_musculus.NCBIM37.66.%s.all.fa' % stype
                 if self.getBool('Force') or not os.path.exists(seqfile):
                     seqout = 'Ingolia.%s.all.fa' % stype
                     seqcmd = self.cmd_list + ['seqin=%s' % seqfile,'seqout=%s' % seqout,'autofilter=T','autoload=T','seqmode=file','gooddesc=%s' % string.join(ing_musg,',')]
                     rje_seqlist.SeqList(self.log,seqcmd)
             mdb = self.db().addEmptyTable('map',['Ingolia','Gene','EnsEMBL'],['Ingolia'])
             for gene in ing_map:
                 entry = {'Ingolia':gene,'Gene':ing_map[gene]}
                 if entry['Gene'] in bad_genes: entry['EnsEMBL'] = ''
                 else: entry['EnsEMBL'] = xdb.data()[ing_map[gene]]['EnsEMBL']
                 mdb.addEntry(entry)
         seqfile = 'Ingolia.cdna.all.fa'
         seqcmd = self.cmd_list + ['seqin=%s' % seqfile,'autofilter=F','autoload=T','seqmode=file']
         iseq = rje_seqlist.SeqList(self.log,seqcmd)
         if 'ENST' not in mdb.fields():
             mdb.addField('ENST',evalue='')
             while iseq.nextSeq():
                 (iname,icdna) = iseq.getSeq()
                 musg = rje.matchExp('gene:(\S+)',iname)[0]
                 for entry in mdb.indexEntries('EnsEMBL',musg):
                     if entry['ENST']: entry['ENST'] += ',%s' % string.split(iname)[0]
                     else: entry['ENST'] = string.split(iname)[0]
             mdb.saveToFile()
         ### ~ [3] Generate new start sites from Ignolia Harrington data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         sdb = self.db('starts')
         sdb.dataFormat({'Init Codon [nt]':'int'})
         icod = 'Init Codon [nt]'
         icon = 'Init Context [-3 to +4]'
         sdb.info['Name'] = 'mapped_start'
         sdb.addField('ENST'); sdb.addField('ENSP'); sdb.addField('ENSI');
         ENST = open('IngExact.cdna.all.fa','w')
         ENSP = open('IngExact.pep.all.fa','w')
         ex = 0.0; etot = sdb.entryNum(); sx = 0; fx = 0
         minpep = 20
         for entry in sdb.entries():
             self.progLog('\r#ING','Mapping Ignolia Harrington Starts: %.2f%%' % (ex/etot)); ex += 100.0
             #self.deBug(entry)
             entry[icon] = entry[icon].upper()
             gene = entry['Gene'].upper()
             mentry = mdb.data(gene)
             entry['ENST'] = entry['ENSI'] = ''
             cdnaseq = peptseq = ''
             if not mentry or not mentry['ENST']: fx += 1; continue
             #self.deBug(mentry)
             mtype = 'fail'
             for trans in string.split(mentry['ENST'],','):
                 (tname,tseq) = iseq.getDictSeq(trans,format='tuple')
                 self.deBug('%s vs %s' % (tseq[entry[icod]-3:][:7],entry[icon]))
                 if tseq[entry[icod]-3:][:7] == entry[icon]:
                     ipept = string.split(rje_sequence.dna2prot(tseq[entry[icod]:]),'*')[0]
                     self.deBug(ipept)
                     if len(ipept) > len(peptseq):
                         entry['ENST'] = trans
                         cdnaseq = tseq
                         peptseq = ipept
                         mtype = 'exact'
             if not entry['ENST']:
                 self.printLog('\r#ING','Unable to find Harrington start for %s %s (%s)' % (gene,entry[icod],entry[icon]),screen=False)
                 fx += 1; continue
             elif len(peptseq) < minpep:
                 self.printLog('\r#ING','Peptide from mapped Harrington start for %s %s (%s) too short!' % (gene,entry[icod],entry[icon]),screen=False)
                 fx += 1; continue
             id = rje.preZero(int(ex/100),etot)
             entry['ENSI'] = 'ENSINGT%s' % id
             entry['ENSP'] = 'ENSINGP%s' % id
             ENST.write('>ENSINGT%s mtype:%s enst:%s gene:%s ingolia:%s mgi:%s\n%s\n' % (id,mtype,entry['ENST'],mentry['EnsEMBL'],entry['Gene'],mentry['Gene'],cdnaseq))
             ENSP.write('>ENSINGP%s mtype:%s enst:%s gene:%s transcript:ENSINGT%s ingolia:%s mgi:%s\n%s\n' % (id,mtype,entry['ENST'],mentry['EnsEMBL'],id,entry['Gene'],mentry['Gene'],peptseq))
             sx += 1
         sdb.saveToFile('%s.mapped_exact.tdt' % self.baseFile())
         ENST.close(); ENSP.close()
         self.printLog('\r#ING','Output %s Ingolia peptides and transcripts. %s failed.' % (rje.iStr(sx),rje.iStr(fx)))
         return
     except: self.errorLog('%s.method error' % self)
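
The mapping loop above accepts a transcript only if the 7 nt context (-3 to +4) around the reported init codon matches, then keeps the transcript that yields the longest open reading frame. Below is a minimal standalone sketch of that check in plain Python (no rje/rje_sequence helpers); coordinates are treated as 0-based and the sequence is invented, so this is an illustration of the idea rather than the original pipeline.

def longest_orf_from_start(cdna, init_nt, context):
    """Return the in-frame ORF starting at init_nt (0-based) if the
    -3..+4 context matches, else None. context is the 7 nt string."""
    cdna = cdna.upper()
    if cdna[init_nt - 3:init_nt + 4] != context.upper():
        return None
    orf = cdna[init_nt:]
    for i in range(0, len(orf) - 2, 3):
        if orf[i:i + 3] in ('TAA', 'TAG', 'TGA'):   # first in-frame stop codon
            return orf[:i]
    return orf

# Invented example: ATG at position 10 with context GCCATGG.
cdna = 'GGGGGGGGCCATGGAAACCTAAGG'
print(longest_orf_from_start(cdna, 10, 'GCCATGG'))   # -> 'ATGGAAACC'
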
Example #12
0
 def run(self):  ### Main run method
     '''Main run method.'''
     try:### ~ [1] Reformat Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for fasta in glob.glob('*.fasta'):
             fas = fasta[:-2]
             if os.path.exists(fas): continue
             sx = 0
             for line in open(fasta,'r').readlines():
                 if line[:1] == '>':
                     try: (name,desc) = rje.matchExp('^>(\S+) (\S.+)$',line)
                     except: name = rje.matchExp('^>(\S+)',line)[0]
                     if len(string.split(name,'|')) == 3:
                         name = '6rf_NEIME__%s' % string.split(name,'|')[2]
                         open(fas,'a').write('>%s\n' % name)
                     elif len(string.split(name,'|')) == 5:
                         name = 'ref_NEIME__%s' % string.split(name,'|')[3]
                         open(fas,'a').write('>%s %s\n' % (name,desc))
                     else: print string.split(name,'|'); raise ValueError
                     self.progLog('\r#FAS','Processing %s: %s seqs' % (fas, rje.integerString(sx))); sx += 1
                 else: open(fas,'a').write(line)
             self.printLog('\r#FAS','Processed %s: %s seqs from %s' % (fas, rje.integerString(sx), fasta))
             rje_blast.BLASTRun(self.log,self.cmd_list).formatDB(fas,protein=True,force=True)
         ### ~ [2] Read in CSV Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         rfhits = {}     # Dictionary of {hit:['File:hit_num']}
         acc = 'MC58_6RF_Hits.acc'; open(acc,'w')
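         # Opening in 'w' mode simply truncates any existing accession file; hits are appended to it below.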
         gfile = 'MC58_6RF_Hits.vs.MC58_1.hitsum.tdt'
         cx = 0
         for csv in glob.glob('MC58_6RF_CSV/*.CSV'):
             cx += 1
             file = os.path.basename(csv)[:-4]
             hits = False
             for line in open(csv,'r').readlines():
                 if line.find('prot_hit_num,prot_acc') == 0: hits = True
                 elif hits:
                     data = rje.readDelimit(line,',')
                     if len(data) < 2: continue
                     [num,name] = data[:2]
                     try: name = string.split(name,'|')[2]
                     except: continue
                     if name not in rfhits:
                         open(acc,'a').write('6rf_NEIME__%s\n' % name)
                         rfhits[name] = []
                     id = '%s:%s' % (file,num)
                     if id not in rfhits[name]: rfhits[name].append(id)
                     self.progLog('\r#CSV','Reading %d CSV files: %s 6RF Hits' % (cx,rje.integerString(len(rfhits))))
         self.printLog('\r#CSV','Read %d CSV files: %s 6RF Hits output to %s' % (cx,rje.integerString(len(rfhits)),acc))
         ### ~ [3] Extract sequences and perform GABLAM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not os.path.exists(gfile):
             seqlist = rje_seq.SeqList(self.log,self.cmd_list+['seqin=%s' % acc,'fasdb=MC58_6RF.fas','seqout=MC58_6RF_Hits.fas','autoload=T','accnr=F','seqnr=F'])
             seqlist.info['Name'] = 'MC58_6RF_Hits.fas'
             seqlist.saveFasta()
             gablam.GABLAM(self.log,self.cmd_list+['seqin=MC58_6RF_Hits.fas','searchdb=MC58_1.fas','qryacc=F']).gablam()
         ### ~ [4] Read in GABLAM and ID Hits without genomic homology ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         gdata = rje.dataDict(self,gfile,['Qry'],['HitNum'])
         zeros = []
         for hit in gdata:
             if string.atoi(gdata[hit]['HitNum']) == 0: zeros.append(hit)
         zeros = rje.sortUnique(zeros,False)
         open('6rf_zeros.acc','w').write(string.join(zeros,'\n'))
         self.printLog('#ZERO','%d 6RF hits with 0 BLAST hits to MC58_1' % len(zeros))
         ufile = 'MC58_6RF_Zeros.vs.embl_bacteria.hitsum.tdt'
         if not os.path.exists(ufile):
             seqlist = rje_seq.SeqList(self.log,self.cmd_list+['seqin=6rf_zeros.acc','fasdb=MC58_6RF.fas','seqout=MC58_6RF_Zeros.fas','autoload=T','accnr=F','seqnr=F'])
             seqlist.info['Name'] = 'MC58_6RF_Zeros.fas'
             seqlist.saveFasta()
             gablam.GABLAM(self.log,self.cmd_list+['seqin=MC58_6RF_Zeros.fas','searchdb=/scratch/Databases/NewDB/TaxaDB/embl_bacteria.fas','qryacc=F']).gablam()
         gdata = rje.dataDict(self,ufile,['Qry'],getheaders=True)
         fdata = rje.dataDict(self,string.replace(ufile,'hitsum','gablam'),['Qry'],['Hit'],lists=True)
         headers = gdata.pop('Headers')
         headers.insert(1,'Sample')
         headers.append('BestHit')
         rje.delimitedFileOutput(self,'MC58_6RF_Zeros.tdt',headers,rje_backup=True)
         for rf in rje.sortKeys(gdata):
             rfcut = string.split(rf,'__')[1]
             gdata[rf]['Sample'] = string.join(rfhits[rfcut],'; ')
             gdata[rf]['Qry'] = rfcut
             try: gdata[rf]['BestHit'] = fdata[rf]['Hit'][0]
             except: gdata[rf]['BestHit']  = '-'
             rje.delimitedFileOutput(self,'MC58_6RF_Zeros.tdt',headers,datadict=gdata[rf])
         
     except: self.errorLog(rje_zen.Zen().wisdom())
     self.printLog('#ZEN',rje_zen.Zen().wisdom())
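
Step [2] of the run method switches into "hits" mode once the 'prot_hit_num,prot_acc' header row is seen and then records a 'file:hit_num' identifier for each protein accession. A rough equivalent using the standard csv module is sketched below; the glob pattern and column layout follow the code above, but the function name and return shape are just for illustration.

import csv, glob, os

def read_hits(pattern='MC58_6RF_CSV/*.CSV'):
    """Return {accession: ['<file>:<hit_num>', ...]} from Mascot-style CSV exports."""
    rfhits = {}
    for path in glob.glob(pattern):
        base = os.path.basename(path)[:-4]
        in_hits = False
        with open(path) as handle:
            for row in csv.reader(handle):
                if row[:2] == ['prot_hit_num', 'prot_acc']:
                    in_hits = True                 # data rows follow this header
                elif in_hits and len(row) >= 2:
                    num, name = row[:2]
                    parts = name.split('|')
                    if len(parts) < 3:
                        continue                   # skip accessions without a gi|...|acc structure
                    acc = parts[2]
                    rfhits.setdefault(acc, [])
                    hit_id = '%s:%s' % (base, num)
                    if hit_id not in rfhits[acc]:
                        rfhits[acc].append(hit_id)
    return rfhits
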
Example #13
0
 def mapToTaxID(self,taxa,nodeonly=False,rankonly=False,log=True,warn=True):  ### Maps taxa onto TaxID. If taxa is a list, will process each element.
     '''Maps taxa onto TaxID. If taxa is a list, will process each element. Returns a list.'''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not taxa: return []
         taxid = []
         ### ~ [1] Taxa List ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         tlist = True
         try: taxa.sort()
         except: tlist = False
         if tlist:
             tx = 0.0; ttot = len(taxa)
             if ttot > 1:
                 for t in taxa:
                     if log: self.progLog('\r#TAXID','Mapping to TaxID: %.1f%%' % (tx/ttot)); tx += 100.0
                     taxid += self.mapToTaxID(t,nodeonly,rankonly,log=False)
                 taxid = rje.sortUnique(taxid)
                 if log:
                     if ttot > 1: self.printLog('\r#TAXID','Mapped %s taxa to %s TaxID' % (rje.iStr(ttot),rje.iLen(taxid)))
             else:
                 t = taxa[0]
                 if log: self.progLog('\r#TAXID','Mapping %s to TaxID...' % t)
                 taxid = rje.sortUnique(self.mapToTaxID(t,nodeonly,rankonly,log=False))
                 if log: self.printLog('\r#TAXID','Mapped %s to %s TaxID' % (t,rje.iLen(taxid)))
             return taxid
         ### ~ [2] Individual taxa ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         taxmap = self.dict['TaxMap']; rankid = self.list['RankID']
         taxa = '%s' % taxa
         ## ~ [2a] Taxa ID ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if rje.matchExp('^(\d+)$', taxa):
             #if taxa not in taxmap: self.taxaChildren(taxa)
             #if taxa in rankid: return [taxa]
             if nodeonly:
                 if taxa in rankid or not rankonly: return [taxa]
                 else: return []
             if taxa not in taxmap:
                 if warn: self.warnLog('Cannot find TaxID %s!' % taxa,'Missing_TaxID',suppress=True)
                 return []
             parents = [taxa]
             while parents:
                 taxa = parents.pop(0)
                 #if taxa not in taxmap: self.taxaChildren(taxa)
                 if not rankonly or taxa in rankid: taxid.append(taxa)
                 parents += taxmap[taxa]
             return taxid
         ## ~ [2b] Species Code ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if taxa == string.replace(taxa.upper(),' ',''):
             greplines = os.popen('grep "%s" %s' % (taxa, self.getStr('SpecFile'))).readlines()
             for entry in greplines:
                 try: taxid.append(rje.matchExp('^%s\s+\S+\s+(\d+):' % taxa,entry)[0])
                 except: pass
             if not taxid and warn: self.warnLog('Cannot find Species Code "%s"!' % taxa,'Missing_SpCode',suppress=True)
             if len(taxid) > 1: self.warnLog('Species Code "%s" hits %d Taxa ID (%s)' % (taxa, len(taxid), string.join(taxid,'|')))
             return self.mapToTaxID(taxid,nodeonly,rankonly,log=False) #taxid
         ### ~ [3] Species name etc. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         taxa = taxa.replace('_',' ')
         ## ~ [3a] Grep from Uniprot ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         greplines = os.popen('grep -B 2 -i "%s" %s' % (taxa, self.getStr('SpecFile'))).readlines()
         gtaxid = None; comid = []; synid = []
         for entry in greplines:
             try: gtaxid = rje.matchExp('^\S+\s+\S+\s+(\d+):',entry)[0]
             except: pass
             if rje.matchExp('s=(%s)\s*$' % taxa.lower(),entry.lower()): synid.append(gtaxid)
             elif rje.matchExp('c=(%s)\s*$' % taxa.lower(),entry.lower()): comid.append(gtaxid)
             elif rje.matchExp('=(%s)\s*$' % taxa.lower(),entry.lower()): taxid.append(gtaxid)
         if not taxid: taxid = comid
         if not taxid: taxid = synid
         if not taxid and warn: self.warnLog('Cannot find Taxon name "%s" in Uniprot!' % taxa,'Missing Taxon',suppress=True)
         if len(taxid) > 1:
             #self.bugPrint(string.join(greplines))
             #self.debug('%s %s %s' % (taxid,comid,synid))
              if warn: self.warnLog('Taxon name "%s" hits %d Taxa ID (%s)' % (taxa, len(taxid), string.join(taxid,'|')))
         if taxid: return self.mapToTaxID(taxid,nodeonly,rankonly,log=False) #taxid
         #self.debug(taxid)
         ## ~ [3b] Grep from NCBI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         greplines = os.popen('grep -i -e "\t%s\t" %s' % (taxa, self.getStr('NameMap'))).readlines()
         for entry in greplines:
             try:
                 #gtaxid = rje.matchExp('^(\d+)\s+\S\s+(\S.+)$',entry)
                 gtaxid = string.split(entry,'\t|\t')
                 if gtaxid[1].lower() == taxa.lower(): taxid.append(gtaxid[0])
                 elif gtaxid[2] and gtaxid[2].lower() == taxa.lower(): taxid.append(gtaxid[0])
             except: pass
          if len(taxid) > 1 and warn: self.warnLog('Taxon name "%s" hits %d Taxa ID (%s)' % (taxa, len(taxid), string.join(taxid,'|')))
         return self.mapToTaxID(taxid,nodeonly,rankonly,log=False) #taxid
     except: self.errorLog('%s.mapToTaxID() error' % (self)); raise
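
For a purely numeric TaxID, the branch at [2a] walks self.dict['TaxMap'] outwards from the given ID, collecting every ID it can reach (optionally only those listed in RankID). A small self-contained version of that traversal over a plain dictionary is sketched below; it assumes the map goes from a TaxID to its child TaxIDs, and the IDs shown are only illustrative.

def expand_taxid(taxid, taxmap, rankid=None, rankonly=False):
    """Collect taxid plus everything reachable through taxmap
    (assumed to map each TaxID to a list of child TaxIDs)."""
    found = []
    queue = [taxid]
    seen = set()
    while queue:
        tax = queue.pop(0)
        if tax in seen:
            continue                      # guard against duplicates/cycles
        seen.add(tax)
        if not rankonly or (rankid and tax in rankid):
            found.append(tax)
        queue += taxmap.get(tax, [])
    return found

# Illustrative map only: parent -> children
taxmap = {'9605': ['9606'], '9606': ['63221'], '63221': []}
print(expand_taxid('9605', taxmap))       # ['9605', '9606', '63221']
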
Example #14
0
 def treeListSPCode(self):  ### Main taxa mapping from list of tree files
     '''Main taxa mapping from list of tree files.'''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         db = self.db()
         specdb = self.db('spcode',add=True,forcecheck=True,mainkeys=['protein'])
         if not specdb and self.getStrLC('TaxBase') and not self.force():
             spfile = '%s.spcode.tdt' % self.getStr('TaxBase')
             specdb = db.addTable(spfile,mainkeys=['protein'],name='spcode',expect=False)
         if specdb: specdb.dataFormat({'boot':'num'}); return True
         specdb = db.addEmptyTable('spcode',['protein','boot','spcode','inpara','paralogues'],['protein'])
         #dupdb = db.addEmptyTable('para',['protein','paralogues'],['protein'])
         self.dict['Duplicates'] = {}    # {prot1:[dups]}
         ### ~ [2] ~ Add main run code here ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for nwkfile in self.list['NwkList']:
             tree = rje_tree.Tree(self.log,self.cmd_list)
             tree.loadTree(nwkfile,seqlist=None,postprocess=False)
             seqacc = rje.baseFile(nwkfile,strip_path=True)
             # Identify node corresponding to query sequence
             seqnode = None
             for node in tree.nodes():
                 try:
                     if string.split(node.shortName(),'__')[1] == seqacc: seqnode = node
                 except: pass    # Internal node or bad sequence format
             if not seqnode:
                 self.warnLog('Could not find %s in %s nodes!' % (seqacc,nwkfile))
                 continue
             # Get species code for query sequence
             seqspec = tree.cladeSpec(seqnode)
             if len(seqspec) != 1: self.warnLog('Could not find species in %s node!' % (seqacc)); continue
             seqspec = seqspec.keys()[0]
             if seqspec != string.split(seqnode.shortName(),'_')[1]: raise ValueError('Species mismatch for %s & %s' % (seqacc,seqnode.shortName()))
             # Find ancestor with closest orthologue outgroup
             rootnode = tree._getRootNode()
             if not rootnode: self.warnLog('Could not find root node in %s!' % (nwkfile)); continue
             ancnode = seqnode.ancNode()
             try: bootx = float(ancnode.ancBranch().stat['Bootstrap'])/tree.stat['Bootstraps']
             except: bootx = 1.0
             inparanode = None    # Node to define in-paralogues
             ancspec = tree.cladeSpec(ancnode)
             while len(ancspec) < 2 or bootx < self.getNum('MinBoot'):
                 inparanode = ancnode    # All same species
                 if ancnode == rootnode: break
                 ancnode = ancnode.ancNode(); ancspec = tree.cladeSpec(ancnode)
                 try: bootx = float(ancnode.ancBranch().stat['Bootstrap'])/tree.stat['Bootstraps']
                 except: bootx = 1.0
             ancspec.pop(seqspec)    # Now only have counts of closest other species
             # Update table, replacing species codes with genera?
             sentry = {'protein':seqacc,'spcode':rje.sortUnique(ancspec.keys())}
             sentry['boot'] = bootx
             if not ancspec: sentry['spcode'] = ['None']; sentry['boot'] = self.getNum('NoneBoot')
             sentry['spcode'] = string.join(sentry['spcode'],'|')
             # Establish list of duplicate proteins
             inpara = []     # List of in-paralogue nodes
             inparacc = []   # List of in-paralogue accnum
             if inparanode: inpara = tree._nodeClade(inparanode,internal=False)
             self.dict['Duplicates'][seqacc] = []
             for node in tree._nodeClade(rootnode,internal=False):
                 if node == seqnode: continue
                 if len(string.split(node.shortName(),'_')) < 2: continue
                 if string.split(node.shortName(),'_')[1] == seqspec:
                     paracc = string.split(node.shortName(),'__')[1]
                     if node in inpara: inparacc.append(paracc)
                     else: self.dict['Duplicates'][seqacc].append(paracc)
             sentry['inpara'] = string.join(inparacc,'|')
             sentry['paralogues'] = string.join(self.dict['Duplicates'][seqacc],'|')
             specdb.addEntry(sentry)
         ## Update specdb and save
         specdb.saveToFile()
         #dupdb.saveToFile()
         return True
     except:
         self.errorLog(self.zen())
         return False
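
treeListSPCode() climbs from the query leaf towards the root until the clade contains a second species with sufficient bootstrap support; that ancestor provides the closest orthologue outgroup, and the last single-species (or poorly supported) ancestor marks the in-paralogue clade. The toy sketch below reproduces the idea with a hand-rolled Node class rather than the rje_tree API, so all names, thresholds and numbers are assumptions for illustration.

class Node(object):
    def __init__(self, name, spec=None, parent=None, boot=1.0):
        self.name, self.spec, self.parent, self.boot = name, spec, parent, boot
        self.children = []
        if parent: parent.children.append(self)

    def clade_species(self):
        """Set of species codes in the clade rooted at this node."""
        if not self.children:
            return {self.spec}
        specs = set()
        for child in self.children:
            specs |= child.clade_species()
        return specs

def closest_outgroup(leaf, minboot=0.5):
    """Climb ancestors until the clade holds 2+ species with enough support."""
    anc = leaf.parent
    inpara = None
    while anc is not None:
        if len(anc.clade_species()) >= 2 and anc.boot >= minboot:
            return anc, inpara
        inpara = anc        # clade is still single-species or poorly supported
        anc = anc.parent
    return None, inpara

# Toy tree: ((query_HUMAN, dup_HUMAN), homolog_MOUSE)
root = Node('root')
human_clade = Node('anc1', parent=root, boot=0.9)
query = Node('query', 'HUMAN', human_clade)
dup = Node('dup', 'HUMAN', human_clade)
mouse = Node('homolog', 'MOUSE', root)
anc, inpara = closest_outgroup(query)
print(anc.clade_species())   # {'HUMAN', 'MOUSE'} -> closest outgroup clade
print(inpara.name)           # 'anc1' -> the in-paralogue clade
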
Example #15
0
    def contamination(self):    ### Compares peptides from Chlamydia and human and outputs summaries
        '''Compares peptides from Chlamydia and human and outputs summaries.'''
        try:### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            db = self.obj['DB'] = rje_db.Database(self.log,self.cmd_list)
            mods = ['none']
            
            ### >>>> Shortcut reanalysis without modifications >>>> ###
            pepfile = '%s.chlam_peptides.tdt' % self.basefile()
            if not self.force() and os.path.exists(pepfile):
                pepdb = db.addTable(pepfile,mainkeys=['key','seqmod'],name='chlam_nomod')
                pepdb.dropFields(['pass','modification'])
                pepdb.compress(['key','seq'],default='max')
                pepdb.dropFields(['seqmod'])
                for entry in pepdb.entries():
                    for field in pepdb.fields():
                        if 'len' not in field: continue
                        try:
                            if entry[field] and int(entry[field]): entry[field] = len(entry['seq'])
                            else: entry[field] = ''
                        except: self.errorLog('%s >> %s' % (entry,field),quitchoice=True)
                tdb = pepdb
                comprules = {'key':'str','pi':'str','mass':'str'}
                shapefields = []
                for field in pepdb.fields():
                    if 'len' in field: comprules[field] = 'mean'
                    if len(string.split(field,'|')) > 1 and string.split(field,'|')[0] not in shapefields: shapefields.append(string.split(field,'|')[0])
                print shapefields
                tdb.compress(['protein'],rules=comprules,default='sum')
                tdb.dropFields(['seq'])
                tdb.saveToFile()

                tdb.info['Name'] = 'chlam_nomod_summary'
                tdb.addField('temp',evalue=1)
                tdb.compress(['temp'],rules=comprules,default='sum')
                tdb.reshapeLong('exp',shapefields)
                tdb.newKey(['exp'])
                tdb.dropFields(['exp']+shapefields,inverse=True)
                tdb.saveToFile()

                return
            ### <<<< End Shortcut reanalysis without modifications <<<< ###

            ## ~ [0a] ~ Load EB and RB human peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.printLog('#~~#','## ~ [0a] ~ Load EB and RB human peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##',log=False)
            #protein.key	protein.Entry	protein.Accession	protein.Description	protein.dataBaseType	protein.score	protein.falsePositiveRate	protein.avgMass	protein.MatchedProducts	protein.matchedPeptides	protein.digestPeps	protein.seqCover(%)	protein.MatchedPeptideIntenSum	protein.top3MatchedPeptideIntenSum	protein.MatchedProductIntenSum	protein.fmolOnColumn	protein.ngramOnColumn	protein.AutoCurate	protein.Key_ForHomologs	protein.SumForTotalProteins	peptide.Rank	peptide.Pass	peptide.matchType	peptide.modification	peptide.mhp	peptide.seq	peptide.OriginatingSeq	peptide.seqStart	peptide.seqLength	peptide.pI	peptide.componentID	peptide.MatchedProducts	peptide.UniqueProducts	peptide.ConsectiveMatchedProducts	peptide.ComplementaryMatchedProducts	peptide.rawScore	peptide.score	peptide.(X)-P Bond	peptide.MatchedProductsSumInten	peptide.MatchedProductsTheoretical	peptide.MatchedProductsString	peptide.ModelRT	peptide.Volume	peptide.CSA	peptide.ModelDrift	peptide.RelIntensity	peptide.AutoCurate	precursor.leID	precursor.mhp	precursor.mhpCal	precursor.retT	precursor.inten	precursor.calcInten	precursor.charge	precursor.z	precursor.mz	precursor.fraction	precursor.numFrac	precursor.fwhm	precursor.liftOffRT	precursor.infUpRT	precursor.infDownRT	precursor.touchDownRT	prec.rmsFWHMDelta	peptidePrecursor.deltaMhpPPM
            humedb = db.addTable('EB_IA_final_peptide.csv',mainkeys=['protein.Accession','peptide.Rank','peptide.seq','peptide.modification'],datakeys=['protein.key','protein.Accession','protein.Entry','peptide.Rank','peptide.Pass','peptide.seq','peptide.modification','peptide.OriginatingSeq'],name='humaneb')
            humrdb = db.addTable('RB_IA_final_peptide.csv',mainkeys=['protein.Accession','peptide.Rank','peptide.seq','peptide.modification'],datakeys=['protein.key','protein.Accession','protein.Entry','peptide.Rank','peptide.Pass','peptide.seq','peptide.modification','peptide.OriginatingSeq'],name='humanrb')
            for humdb in [humedb,humrdb]:
                humdb.info['Delimit'] = '\t'
                humdb.addField('exp',evalue=humdb.info['Name'][-2:])
                humdb.renameField('protein.Accession','Protein')
                humdb.renameField('protein.Entry','Species')
                for entry in humdb.entries(): entry['Species'] = string.split(entry['Species'],'_')[-1]
                humdb.dropEntriesDirect('Species',['HUMAN'],inverse=True)
                for field in ['Rank','Pass','seq','OriginatingSeq','modification']: humdb.renameField('peptide.%s' % field,field)
                humdb.dataFormat({'Rank':'int'})
                for mod in humdb.index('modification'):
                    if mod.lower() and mod.lower() not in mods: mods.append(mod.lower())
                humdb.addField('seqmod')
                for entry in humdb.entries():
                    if entry['modification'] and mods.index(entry['modification'].lower()): entry['seqmod'] = '%s-%d' % (entry['seq'],mods.index(entry['modification'].lower()))
                    else: entry['seqmod'] = entry['seq']
            humtdb = db.copyTable(humedb,'humantot')
            humtdb.newKey(['Protein','Rank','seq','modification','exp'])
            db.mergeTables(humtdb,db.copyTable(humrdb,'temp',add=False))
            humtdb.compress(['Protein','seq','Pass'],rules={'Rank':'max'})
            ## ~ [0b] ~ Load Proteomes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.printLog('#~~#','## ~ [0b] ~ Load Proteomes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##',log=False)
            # Load human proteome
            hseqfile = '/home/re1u06/researchfiles/SBSBINF/Databases/DBase_120225/EnsEMBL/ens_HUMAN.loci.fas'
            hseq = rje_seqlist.SeqList(self.log,self.cmd_list+['seqin=%s' % hseqfile])
            # Load Chlamydia proteome
            cseqfile = '../2011-07-18-Genome/NC_010287.proteome.fas'
            cseq = rje_seqlist.SeqList(self.log,self.cmd_list+['seqin=%s' % cseqfile])
            # Load matched protein list
            rbpep = rje.listFromCommand('../2011-05-ProDigIS/soton_rb_peptides.txt')
            ebpep = rje.listFromCommand('../2011-05-ProDigIS/soton_rb_peptides.txt')   # NB: re-uses the RB peptide file; an EB-specific list may have been intended
            ## ~ [0c] ~ Load EB and RB Chlamydia peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.printLog('#~~#','## ~ [0c] ~ Load EB and RB Chlamydia peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##',log=False)
            chlamdb = {'EB':[],'RB':[]}
            for pfile in glob.glob('./Soton*/*peptide.csv'):
                (er,uniq) = rje.matchExp('\./Soton(\S\S)\S+_(\d+)/',pfile)
                chlamdb[er].append(db.addTable(pfile,mainkeys=['protein.key','protein.name','peptide.Rank','peptide.seq','peptide.modification'],datakeys=['protein.name','peptide.Rank','peptide.Pass','peptide.seq','peptide.modification','peptide.OriginatingSeq'],name=uniq))
            edb = chlamdb['EB'].pop(0); edb.info['Name'] = 'chlam_eb'
            while chlamdb['EB']: db.mergeTables(edb,chlamdb['EB'].pop(0))
            rdb = chlamdb['RB'].pop(0); rdb.info['Name'] = 'chlam_rb'
            while chlamdb['RB']: db.mergeTables(rdb,chlamdb['RB'].pop(0))
            # Load EB and RB matching peptide file
            #edb = db.addTable('../2011-05-ProDigIS/SotonEB_peptide_pjss.csv',mainkeys=['protein.name','peptide.Rank','peptide.seq','peptide.modification'],datakeys=['protein.name','peptide.Rank','peptide.Pass','peptide.seq','peptide.OriginatingSeq'],name='chlam_eb')
            #rdb = db.addTable('../2011-05-ProDigIS/SotonRB_peptide_pjss.csv',mainkeys=['protein.name','peptide.Rank','peptide.seq','peptide.modification'],datakeys=['protein.name','peptide.Rank','peptide.Pass','peptide.seq','peptide.OriginatingSeq'],name='chlam_rb')
            for chlamdb in [edb,rdb]:
                chlamdb.info['Delimit'] = '\t'
                chlamdb.addField('exp',evalue=chlamdb.info['Name'][-2:])
                chlamdb.renameField('protein.name','Protein'); chlamdb.renameField('protein.key','key')
                for field in ['Rank','Pass','seq','OriginatingSeq','modification']: chlamdb.renameField('peptide.%s' % field,field)
                chlamdb.dataFormat({'Rank':'int'})
                for mod in chlamdb.index('modification'):
                    if mod.lower() and mod.lower() not in mods: mods.append(mod.lower())
                chlamdb.addField('seqmod')
                chlamdb.addField('Species',evalue='UNKNOWN')
                for entry in chlamdb.entries():
                    if 'Chlamydia trachomatis' in entry['Protein'] or '_CHLT2' in entry['Protein']: entry['Species'] = 'CHLT2'
                    if entry['modification'] and mods.index(entry['modification'].lower()): entry['seqmod'] = '%s-%d' % (entry['seq'],mods.index(entry['modification'].lower()))
                    else: entry['seqmod'] = entry['seq']
                    if not entry['OriginatingSeq']: entry['OriginatingSeq'] = entry['seq']
                chlamdb.dropEntriesDirect('Species',['CHLT2'],inverse=True)
                chlamdb.remakeKeys()
            ## ~ Load Protein Key Mapping ~ ##
            kdb = db.addTable('NC_010287.proteinkey.tdt',mainkeys=['key'],name='keys')
            xdb = db.addTable('NC_010287.dbxref.tdt',mainkeys=['tag'],name='xref')
            tdb = db.copyTable(edb,'chlam_temp')
            self.deBug(tdb.entries()[0])
            tdb.newKey(['Protein','Rank','Pass','seq','modification','exp'])
            db.mergeTables(tdb,db.copyTable(rdb,'temp',add=False))
            kdb = db.joinTables(name='full_xref',join=[(kdb,'tag'),(xdb,'tag')],newkey=kdb.keys(),keeptable=True)
            tdb = db.joinTables(name='chlam_tot',join=[(tdb,'key'),(kdb,'key')],newkey=tdb.keys(),keeptable=True)
            self.deBug(tdb.keys())
            self.deBug(tdb.entries()[0])

            ### ~ [1] ~ Add Human Data to combined Chlamydia Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.printLog('#~~#','### ~ [1] ~ Add Human Data to combined Chlamydia Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###',log=False)
            tdb.renameField('Pass','pass'); tdb.renameField('Protein','protein');
            for entry in tdb.entries():
                entry['pass'] = string.atoi(entry['pass'][-1])
            keep = ['key'] + xdb.fields() + ['description','protein','exp','pass','seq','seqmod','modification']
            tdb.newKey(['tag','exp','pass','seqmod','Rank'])
            tdb.compress(['tag','exp','pass','seqmod'],rules={'pass':'******'})
            tdb.dropFields(keep,inverse=True)
            self.deBug(tdb.keys())
            self.deBug(tdb.entries()[0])

            ## ~ [1a] ~ Map ID'd peptides onto Chlamydia, human and EB/RB hits ~~~~~~~~~~~~~~~~~~~~ ##
            bothpep = {'eb':[],'rb':[]}     # Peptides found in both species
            chlampep = {'eb':[],'rb':[]}    # Peptides found only in Chlamydia
            uniqpep = {'eb':[],'rb':[]}     # Peptides found only in a single protein in Chlamydia
            fx = len(tdb.fields())
            tdb.addField('pep',evalue=1)
            for field in ['pass1','pass2','hsap1','hsap2','uniq1','uniq2']: tdb.addField(field,evalue=0)
            comprules = {'pass':'******','key':'min'}
            for field in ['pep','pass1','pass2','hsap','uniq']:
                tdb.addField('%s_len' % field)
                comprules[tdb.fields()[-1]] = 'mean'
            shapefields = tdb.fields()[fx:]

            for entry in tdb.entries():
                epass = 'Pass%d' % entry['pass']    # builds 'Pass1'/'Pass2'; lower() maps it onto the pass1/pass2 fields
                entry[epass.lower()] = 1
                plen = entry['pep_len'] = len(entry['seq'])
                plen = entry['pass%d_len' % entry['pass']] = len(entry['seq'])
                hsap = False
                if entry['exp'] == 'eb':
                    if 'Pass1' in humedb.indexDataList('seqmod',entry['seqmod'],'Pass'): entry['hsap1'] += 1; hsap = True
                    if 'Pass2' in humedb.indexDataList('seqmod',entry['seqmod'],'Pass'): entry['hsap2'] += 1; hsap = True
                    if entry['seqmod'] not in humedb.index('seqmod'):
                        if entry['seq'] in humedb.index('seq'): self.errorLog('EB mod peptide %s not found in Human EB but unmod *is* found in Human EB!' % entry['seqmod'],printerror=False)
                        if entry['seqmod'] in humrdb.index('seqmod'): self.errorLog('EB peptide %s not found in Human EB but found in Human RB!' % entry['seqmod'],printerror=False)
                else:
                    if 'Pass1' in humrdb.indexDataList('seqmod',entry['seqmod'],'Pass'): entry['hsap1'] += 1; hsap = True
                    if 'Pass2' in humrdb.indexDataList('seqmod',entry['seqmod'],'Pass'): entry['hsap2'] += 1; hsap = True
                    if entry['seqmod'] not in humrdb.index('seqmod'):
                        if entry['seq'] in humrdb.index('seq'): self.errorLog('RB mod peptide %s not found in Human RB but unmod *is* found in Human RB!' % entry['seqmod'],printerror=False)
                        if entry['seqmod'] in humedb.index('seqmod'): self.errorLog('RB peptide %s not found in Human RB but found in Human EB!' % entry['seqmod'],printerror=False)
                if hsap: entry['hsap_len'] = plen; bothpep[entry['exp']].append(entry['seq']); continue
                chlampep[entry['exp']].append(entry['seq'])
                entry['uniq1'] = entry['pass1']
                entry['uniq2'] = entry['pass2']
                entry['uniq_len'] = plen
                for altentry in tdb.indexEntries('seqmod',entry['seqmod']):
                    if altentry['tag'] == entry['tag']: continue
                    entry['uniq1'] = entry['uniq2'] = entry['uniq_len'] = 0
                if entry['uniq1'] or entry['uniq2']: uniqpep[entry['exp']].append(entry['seq'])

            tdb.reshapeWide('exp',shapefields)
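            # reshapeWide('exp',...) splits each shape field into per-experiment columns (e.g. 'pass1|eb', 'pass1|rb'), matching the '%s|eb'/'%s|rb' fields used below.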
            fillfields = tdb.fields()[13:]
            for field in fillfields[0:]:
                if 'len' in field: fillfields.remove(field)
            tdb.fillBlanks(0,fillfields,fillempty=True)
            for entry in tdb.entries():
                if entry['modification'] == 0: entry['modification'] = ''

            for field in shapefields:
                tdb.addField('%s|tot' % field)
                if field[-3:] == 'len':
                    comprules['%s|eb' % field] = 'mean'
                    comprules['%s|rb' % field] = 'mean'
                    comprules['%s|tot' % field] = 'mean'
            for entry in tdb.entries():
                for field in shapefields:
                    if entry['%s|eb' % field] and not entry['%s|rb' % field]: entry['%s|tot' % field] = entry['%s|eb' % field]
                    elif entry['%s|rb' % field] and not entry['%s|eb' % field]: entry['%s|tot' % field] = entry['%s|rb' % field]
                    else: entry['%s|tot' % field] = max(entry['%s|eb' % field],entry['%s|rb' % field])

            tdb.info['Name'] = 'chlam_peptides'
            tdb.saveToFile()

            tdb.info['Name'] = 'chlam_proteins'
            tdb.compress(['protein'],rules=comprules,default='sum')
            tdb.dropFields(['pass','seq','modification','seqmod'])
            tdb.saveToFile()

            tdb.info['Name'] = 'chlam_summary'
            tdb.addField('temp',evalue=1)
            tdb.compress(['temp'],rules=comprules,default='sum')
            tdb.reshapeLong('exp',shapefields)
            tdb.newKey(['exp'])
            tdb.dropFields(['exp']+shapefields,inverse=True)
            tdb.saveToFile()

            bothpep['tot'] = bothpep['eb'] + bothpep['rb']     # Peptides found in both species
            chlampep['tot'] = chlampep['eb'] + chlampep['rb']# Peptides found only in Chlamydia
            uniqpep['tot'] = uniqpep['eb'] + uniqpep['rb']
            for er in bothpep:
                open('%s.%s.bothpep.txt' % (self.basefile(),er),'w').write(string.join(rje.sortUnique(bothpep[er]),'\n'))
                open('%s.%s.chlampep.txt' % (self.basefile(),er),'w').write(string.join(rje.sortUnique(chlampep[er]),'\n'))
                open('%s.%s.uniqpep.txt' % (self.basefile(),er),'w').write(string.join(rje.sortUnique(uniqpep[er]),'\n'))
            



            return



            #Peptide numbers for C. trachomatis/human
            #1.	Number of  chlamydial peptides assigned for each protein from RBs
            #2.	Number of  chlamydial peptides assigned for each protein from EBs
            #3.	Number of  chlamydial peptides assigned from both EB and RB combined, with redundancy removed
            #4.	Number of  unique chlamydial peptides assigned for each protein from RBs
            #5.	Number of unique chlamydial peptides assigned for each protein from EBs
            #6.	Number of unique chlamydial peptides assigned for EBs and RBs combined with redundancy removed
            #7.	Total number of human peptides identified in EB (Length would be useful)
            #8.	Total number of human peptides identified in RB (Length would be useful)
            #9.	Total number of human peptides identified in EB and RB
            #10.	Human peptides matching pass 1 chlamydia peptides for RB (sequence would be useful)
            #11.	Human peptides matching pass 2 chlamydia peptides for EB (sequence would be useful)

            #An accession number and protein description would be useful where possible, i.e.,  the number of chlamydial peptides for each protein.





            tdb.compress(['Protein','seq'],rules={'Rank':'max','Pass':'******'})
            for entry in tdb.entries(): entry['exp'] = 'tot'

            
            ### ~ [1] ~ Map ID'd peptides onto Chlamydia, human and EB/RB hits ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            mapkey = 'seqmod'
            self.deBug(rje.sortKeys(humdb.index(mapkey)))
            self.printLog('#~~#','## ~ [1] ~ Map ID\'d peptides onto Chlamydia, human and EB/RB hits ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###',log=False)
            for chlamdb in [edb,rdb,tdb]:
                bothpep = []
                chlampep = []
                uniqpep = []
                if chlamdb == edb: humdb = humedb
                elif chlamdb == rdb: humdb = humrdb
                else: humdb = humtdb
                chlamdb.addField('Pep',evalue=1)
                chlamdb.addField('Pass1',evalue=0)
                chlamdb.addField('Pass2',evalue=0)
                chlamdb.addField('Hsap1',evalue=0)
                chlamdb.addField('Hsap2',evalue=0)
                chlamdb.addField('Uniq1',evalue=0)
                chlamdb.addField('Uniq2',evalue=0)
                comprules = {'Rank':'max'}
                for field in ['Pep','Pass1','Pass2','Hsap','Uniq']:
                    chlamdb.addField('%s_len' % field)
                    comprules[chlamdb.fields()[-1]] = 'mean'
                for entry in chlamdb.entries():
                    if 'Pass1' in entry['Pass']: entry['Pass'] = 'Pass1'
                    else: entry['Pass'] = 'Pass2'
                    entry[entry['Pass']] += 1
                    entry['Pep_len'] = plen = len(entry['seq'])
                    entry['%s_len' % entry['Pass']] = plen
                    hsap = False
                    self.deBug(entry[mapkey])
                    self.deBug(entry[mapkey] in humdb.index(mapkey))
                    if 'Pass1' in humdb.indexDataList(mapkey,entry[mapkey],'Pass'): entry['Hsap1'] += 1; bothpep.append(entry[mapkey]); hsap = True
                    if 'Pass2' in humdb.indexDataList(mapkey,entry[mapkey],'Pass'): entry['Hsap2'] += 1; bothpep.append(entry[mapkey]); hsap = True
                    if hsap: entry['Hsap_len'] = plen; continue
                    chlampep.append(entry[mapkey])
                    entry['Uniq1'] = entry['Pass1']
                    entry['Uniq2'] = entry['Pass2']
                    entry['Uniq_len'] = plen
                    for altentry in chlamdb.indexEntries(mapkey,entry[mapkey]):
                        if altentry['Protein'] == entry['Protein']: continue
                        entry['Uniq1'] = entry['Uniq2'] = 0
                    if entry['Uniq1'] or entry['Uniq2']: uniqpep.append(entry[mapkey])
                chlamdb.dropFields(['Pass','Rank','seq','OriginatingSeq','modification'])
                chlamdb.compress(['Protein'],rules=comprules,default='sum')
                #chlamdb.dropField('Rank')
                chlamdb.saveToFile()
                open('%s.%s.bothpep.txt' % (self.basefile(),chlamdb.info['Name']),'w').write(string.join(rje.sortUnique(bothpep),'\n'))
                open('%s.%s.chlampep.txt' % (self.basefile(),chlamdb.info['Name']),'w').write(string.join(rje.sortUnique(chlampep),'\n'))
                open('%s.%s.uniqpep.txt' % (self.basefile(),chlamdb.info['Name']),'w').write(string.join(rje.sortUnique(uniqpep),'\n'))
                chlamdb.newKey(['Protein','exp'])
            db.mergeTables(edb,rdb)
            db.mergeTables(edb,tdb)
            cdb = db.copyTable(edb,'chlam_summary')
            edb.info['Name'] = 'chlam_pep'
            edb.reshapeWide('exp',edb.fields()[-7:])
            edb.saveToFile()
            cdb.compress(['exp'],rules=comprules,default='sum')
            cdb.dropField('Protein')
            cdb.saveToFile()
            


            # - twice maybe, once using EnsEMBL sequences directly, once using EB/RB search
            # - Numbers of unique Pass1/2 human peptides, and numbers matching Chlam
            # - Numbers of matched peptides per Chlam gene: total, eb, rb, human (e/r), unique (e/r), ens (e/r)
            ## Do complete digest of Chlam and search against Human
        except: self.errorLog('%s.contamination error' % self)
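
Stripped of the table plumbing, the contamination test asks two questions per Chlamydia peptide: was the same (modified) sequence also identified in the human EB/RB runs, and does it map to a single Chlamydia protein? A set-based sketch of that classification is given below with invented peptides; it simplifies away the pass1/pass2 bookkeeping above, so treat it as an outline rather than a drop-in replacement.

def classify_peptides(chlam_hits, human_peptides):
    """chlam_hits: list of (protein, peptide) identified in Chlamydia runs.
    human_peptides: set of peptides identified in the human runs.
    Returns (shared, chlam_only, chlam_unique) peptide sets."""
    shared, chlam_only, unique = set(), set(), set()
    pep2prot = {}
    for prot, pep in chlam_hits:
        pep2prot.setdefault(pep, set()).add(prot)
    for prot, pep in chlam_hits:
        if pep in human_peptides:
            shared.add(pep)              # also seen in human: possible contamination
        else:
            chlam_only.add(pep)
            if len(pep2prot[pep]) == 1:
                unique.add(pep)          # maps to a single Chlamydia protein
    return shared, chlam_only, unique

# Invented example data
hits = [('CT_001', 'LSSPATLNSR'), ('CT_001', 'AGFAGDDAPR'), ('CT_002', 'AGFAGDDAPR')]
human = {'AGFAGDDAPR'}
print(classify_peptides(hits, human))
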
Example #16
0
 def fpi(self):  ### Family-protein interactions
     '''Family-protein interactions.'''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not self.dict['Domain']: return
         outdir = 'SLiMPID_FPI'
         rje.mkDir(self, outdir)
         fpi = {}  # Dictionary of {family:[interactors]}
         badname = []
         ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for qry in rje.sortKeys(self.dict['PPI']):
             try:
                 fam = self.dict['Fam'][qry]
                 if len(fam) < 2: continue
             except:
                 self.errorLog('Problem with "%s" protein family' % qry)
                 continue
             fpi[qry] = []
             for hub in fam:
                 if hub not in self.dict['PPI']: continue
                 fpi[qry] += self.dict['PPI'][hub]  # Add with redundancy
             for spoke in fpi[qry][0:]:
                 if fpi[qry].count(spoke) == 1:
                     fpi[qry].remove(
                         spoke)  # Must have 2+ family interactions
             for hub in fam:
                 if hub not in self.dict['PPI']: continue
                 for spoke in self.dict['PPI'][hub][0:]:
                     if spoke in fpi[qry]:
                         self.dict['PPI'][hub].remove(spoke)
                         if spoke in self.dict['PPI'] and hub in self.dict[
                                 'PPI'][spoke]:
                             self.dict['PPI'][spoke].remove(hub)
             fpi[qry] = rje.sortUnique(fpi[qry], False, False)
             acc = []
             gene = self.dict['Gene'][qry]
             for name in fpi[qry]:
                 if not name: continue
                 if name in self.dict['Seq']:
                     acc.append(self.dict['Seq'][name].info['AccNum'])
                 elif name not in badname:
                     badname.append(name)
             open('%s/%s.fpi.acc' % (outdir, gene),
                  'w').write(string.join(acc, '\n'))
             self.printLog('#FPI',
                           '%s family => %d interactors' % (gene, len(acc)))
         if badname:
             badname.sort()
             self.printLog(
                 '#BAD', '%d "bad" protein names: %s' %
                 (len(badname), string.join(badname, '; ')))
         ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         hx = len(self.dict['PPI'])
         for hub in rje.sortKeys(self.dict['PPI']):
             if hub and self.dict['PPI'][hub]: continue
             self.dict['PPI'].pop(hub)
             self.printLog('#FPI', 'No %s PPI left after FPI removed' % hub)
         self.printLog(
             '#PPX', '%s of %s PPI hubs remain after FPI removed' %
             (rje.integerString(len(
                 self.dict['PPI'])), rje.integerString(hx)))
     except:
         self.errorLog('Problem with SLiMPID.fpi()', quitchoice=True)
Example #17
0
    def picsi(self):    ### Cleans up cross-species search results
        '''Cleans up cross-species search results.'''
        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            datafile = self.info['SumFile']
            delimit = rje.delimitFromExt(filename=self.info['SumFile'])
            data = {}       # search:{hit:{???}}
            pep2prot = {}   # search:{peptide:[hits]}
            id2prot = {}    # search:{id:hit}
            prot2desc = {}
            fullpeplist = {}    
            pepcon = {}     # Convert pep:longer pep
            speclist = []   # List of species codes
            ### ~ [1] Read Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            indata = rje.dataDict(self,datafile,['search','prot_hit_num'],'All',lists=True)
            for ikey in rje.sortKeys(indata):
                (search,id) = string.split(ikey,delimit)
                prot = indata[ikey]['prot_acc'][0]
                desc = string.replace(indata[ikey]['prot_desc'][0],'Full=','')
                if desc[3:7] == 'Name': desc = desc[9:]
                prot2desc[prot] = desc; self.printLog('#DESC','%s = %s' % (prot,desc))
                indata[ikey]['pep_seq'] = string.join(indata[ikey]['pep_seq'],'|')
                pepconv = string.replace(indata[ikey]['pep_seq'],'I','L')
                pepconv = string.replace(pepconv,'Q','K')
                peplist = rje.sortUnique(string.split(pepconv,'|'))
                indata[ikey]['pep_seq'] = string.join(rje.sortUnique(string.split(indata[ikey]['pep_seq'],'|')),'|')
                if search not in data:
                    data[search] = {}
                    pep2prot[search] = {}
                    id2prot[search] = {}
                    fullpeplist[search] = []
                    pepcon[search] = {}
                fullpeplist[search] += peplist
                id2prot[search][id] = prot
                spec = string.split(prot,'_')[1]
                if spec not in speclist: speclist.append(spec)
                data[search][prot] = {'search':search,'pepcount':len(peplist),'hit':id,'desc':desc,'spec':spec,
                                      'pep_uniq':0,'peplist':indata[ikey]['pep_seq'],'conpep':peplist[0:],
                                      'pep_rem':0}
                try: data[search][prot]['accnum'] = self.dict['Acc2Seq'][prot].info['AccNum']
                except: data[search][prot]['accnum'] = string.split(prot,'__')[-1]
                for pep in peplist:
                    if pep not in pep2prot[search]:
                        pep2prot[search][pep] = []
                    pep2prot[search][pep].append(prot)
            ## ~ [1a] Convert peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            for search in fullpeplist:
                fullpeplist[search] = rje.sortUnique(fullpeplist[search])
                for pep in fullpeplist[search][0:]:
                    for pep2 in fullpeplist[search]:
                        if pep != pep2 and pep in pep2:
                            pepcon[search][pep] = pep2
                            fullpeplist[search].remove(pep)
                            break
                for pep in pepcon[search]:
                    while pepcon[search][pep] in pepcon[search]: pepcon[search][pep] = pepcon[search][pepcon[search][pep]]
                self.printLog('#PEP','%s %s peptide conversions' % (len(pepcon[search]),search))
                #self.deBug(pepcon[search])
                #self.deBug(rje.sortKeys(pep2prot[search]))
                pp = 0; pm = 0
                for prot in data[search]:
                    for pep in data[search][prot]['conpep'][0:]:
                        if pep in pepcon[search]:
                            newpep = pepcon[search][pep]
                            if newpep not in data[search][prot]['conpep']: data[search][prot]['conpep'].append(newpep); pp += 1
                            data[search][prot]['conpep'].remove(pep); pm += 1
                            if prot not in pep2prot[search][newpep]: pep2prot[search][newpep].append(prot)
                            if pep in pep2prot[search]: pep2prot[search].pop(pep)
                    data[search][prot]['pep_con'] = len(data[search][prot]['conpep'])
                self.printLog('#PEP','%s %s converted peptides added; %s removed' % (pp,search,pm))
            ### ~ [2] Calculate Unique/Redundancy status ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for search in pep2prot:
            ## ~ [2a] Species Redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                remx = 0
                for prot in data[search]:
                    if data[search][prot]['spec'] != self.info['QrySpec']: continue
                    for pep in data[search][prot]['conpep']:
                        for prot2 in pep2prot[search][pep][0:]:
                            if data[search][prot2]['spec'] == self.info['QrySpec']: continue
                            pep2prot[search][pep].remove(prot2)
                            data[search][prot2]['conpep'].remove(pep)
                            data[search][prot2]['pep_rem'] += 1; remx += 1
                self.printLog('#REM','%s %s peptides removed from non-%s hits' % (rje.integerString(remx),search,self.info['QrySpec']))
            ## ~ [2b] One-hit wonders ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                for prot in data[search]:
                    if len(data[search][prot]['conpep']) < 2:
                        for pep in data[search][prot]['conpep']:
                            #if pep in pep2prot[search] and prot in pep2prot[search][pep]:
                            pep2prot[search][pep].remove(prot)
            ## ~ [2c] Unique peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                ux = 0
                for pep in pep2prot[search]:
                    #self.deBug(pep)
                    if len(pep2prot[search][pep]) == 1: data[search][pep2prot[search][pep][0]]['pep_uniq'] += 1; ux += 1
                self.printLog('#UNIQ','%s unique %s peptides' % (rje.integerString(ux),search))
            ## ~ [2d] Total Redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                summary = {'HITS':len(data[search]),'REJECT':0,'UNIQUE':0,'NR':0,'REDUNDANT':0}
                rx = 0
                for prot in data[search]:
                    #if data[search][prot]['unique']: data[search][prot]['red'] = False; continue
                    data[search][prot]['pep_red'] = 0   # Redundant peptides found in proteins with unique peptides
                    data[search][prot]['pep_nr'] = 0    # Redundant peptides found only in proteins without unique peptides
                    for pep in data[search][prot]['conpep']:
                        if pep2prot[search][pep] == [prot]: continue
                        upep = False
                        for prot2 in pep2prot[search][pep]:
                            if data[search][prot2]['pep_uniq']: upep = True; break
                        if upep: data[search][prot]['pep_red'] += 1     # Redundant peptide found in unique protein
                        else: data[search][prot]['pep_nr'] += 1         # Redundant peptide NOT found in unique protein
                    if len(data[search][prot]['conpep']) < 2: data[search][prot]['class'] = 'REJECT'; rx += 1
                    elif data[search][prot]['pep_uniq']: data[search][prot]['class'] = 'UNIQUE'
                    elif data[search][prot]['pep_nr']: data[search][prot]['class'] = 'NR'
                    else: data[search][prot]['class'] = 'REDUNDANT'; rx += 1
                    summary[data[search][prot]['class']] += 1
                self.printLog('#REJ','%s rejected %s hits' % (rje.integerString(rx),search))
                for x in rje.sortKeys(summary): self.printLog('#%s' % search,'%s %s' % (summary[x],x))

            ### ~ [3] Species ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            speclist.sort()
            species = {}
            for spec in speclist:
                try:
                    grep = os.popen('grep %s %s' % (spec,self.info['SpecTDT'])).read()
                    species[spec] = string.split(grep,':')[-4]
                    self.printLog('#SPEC','%s = %s' % (spec,species[spec]))
                except: species[spec] = '?'

            ### ~ [END] Output data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            outfile = '%s.clean.tdt' % rje.baseFile(self.info['SumFile'])
            headers = ['search','hit','class','accnum','spec','species','desc','pepcount','pep_con','pep_rem','pep_uniq','pep_nr','pep_red','peplist','conpep']
            if self.dict['Acc2Seq']: headers.insert(3,'cluster')
            rje.delimitedFileOutput(self,outfile,headers,datadict={},rje_backup=True)
            for search in rje.sortKeys(data):
                if self.dict['Acc2Seq']: self.clusterGoodSeq(search,data[search])
                for prot in rje.sortKeys(data[search]):
                    if rje.matchExp('^gi:(\d+).+\[(\S.+\S)\]$',data[search][prot]['desc']):
                        data[search][prot]['species'] = rje.matchExp('^gi:(\d+).+\[(\S.+\S)\]$',data[search][prot]['desc'])[1]
                    else: data[search][prot]['species'] = species[data[search][prot]['spec']]                                                                               
                    rje.delimitedFileOutput(self,outfile,headers,datadict=data[search][prot])
                                
        except: self.errorLog('Errg')
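
The peptide "conversion" step in picsi() treats I/L and Q/K as indistinguishable and collapses any peptide contained within a longer one onto that longer form, following chains of substitutions to their end. A standalone sketch of the same collapse, with invented peptide strings and a simplified return value:

def collapse_peptides(peptides):
    """Map each peptide that is contained in a longer one onto the longest
    container, after normalising I->L and Q->K."""
    norm = sorted({pep.upper().replace('I', 'L').replace('Q', 'K') for pep in peptides})
    convert = {}
    survivors = norm[:]
    for pep in norm:
        for other in norm:
            if pep != other and pep in other:
                convert[pep] = other
                if pep in survivors:
                    survivors.remove(pep)
                break
    # Resolve chains: a -> b and b -> c becomes a -> c.
    for pep in convert:
        while convert[pep] in convert:
            convert[pep] = convert[convert[pep]]
    return survivors, convert

peps = ['PEPTIDEK', 'PEPTIDE', 'TIDE', 'QWERK']
print(collapse_peptides(peps))
# (['KWERK', 'PEPTLDEK'], {'PEPTLDE': 'PEPTLDEK', 'TLDE': 'PEPTLDEK'})
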
Example #18
0
    def run(self):  ### Main run method
        '''Main run method.'''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            mygo = rje_go.GO(self.log,self.cmd_list)
            mygo.readGO()
            gomap = rje.dataDict(self,self.info['GOMap'],mainkeys=['Ensembl Gene ID'],datakeys=['GO ID'],lists=True)
            self.deBug(rje.sortKeys(gomap)[:100])
            #!# Replace 'Ensembl Gene ID' with commandline parameter at some point #!#
            self.printLog('#GOMAP','Loaded GO mappings for %s sequence IDs' % (rje.integerString(len(gomap))))
            slimocc = rje.dataDict(self,self.info['OccData'],mainkeys=['Motif','Seq','Start_Pos','End_Pos'],datakeys=['Motif','Seq','Start_Pos','End_Pos','Cons','HomNum'])
            self.printLog('#OCC','Loaded Data for %s motif occurrences.' % (rje.integerString(len(slimocc))))
            ## ~ [1a] ~ Sequence mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            seqlist = rje_seq.SeqList(self.log,['accnr=F','seqnr=F']+self.cmd_list)
            seqmap = {}
            (sx,stot) = (0.0,seqlist.seqNum())
            for seq in seqlist.seq:
                self.progLog('#SEQMAP','Mappings sequence IDs: %.1f%%' % (sx/stot)); sx += 100.0
                if rje.matchExp('gene:(\S+)\]',seq.info['Name']): seqmap[seq.shortName()] = rje.matchExp('gene:(\S+)\]',seq.info['Name'])[0]
            self.printLog('\r#SEQMAP','Mappings %s sequence IDs complete: %s mapped' % (rje.integerString(stot),rje.integerString(len(seqmap))))
            self.deBug(rje.sortKeys(seqmap)[:100])

            ### ~ [2] ~ Output new data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            goocc = {}
            outfile = string.join(string.split(self.info['OccData'],'.')[:-1] + ['slimfungo','tdt'],'.')
            headers = ['GO','Motif','Type','Seq','Start_Pos','End_Pos','Cons','HomNum']
            for okey in slimocc.keys():
                self.progLog('#NEW','Making new GO occurrences: %s    ' % (rje.integerString(len(slimocc))))
                data = slimocc.pop(okey)
                gene = seq = data['Seq']
                type = 'fwd'
                if string.split(data['Motif'],'_')[-1] in ['rev','scram']:
                    type = string.split(data['Motif'],'_')[-1]
                    data['Motif'] = string.join(string.split(data['Motif'],'_')[:-1],'_')
                motif = data['Motif']   # motif name shared by fwd/rev/scram occurrences (used for goocc below)
                if gene not in gomap and gene in seqmap: gene = seqmap[gene]
                golist = []
                if gene in gomap:
                    for id in gomap[gene]: golist += mygo.parents(id)
                else: golist = ['NoGo']
                self.deBug('%s:%s::%s' % (seq,gene,golist))
                for id in rje.sortUnique(golist,False,False):
                    if id not in goocc: goocc[id] = {}
                    if motif not in goocc[id]: goocc[id][motif] = {'fwd':[],'rev':[],'scram':[]}
                    goocc[id][motif][type].append(rje.combineDict({'GO':id,'Type':type},data))
            self.printLog('\r#NEW','Making new GO occurrences complete.    ')

            rje.delimitedFileOutput(self,outfile,headers,rje_backup=True)
            (mx,ox,ix,itot) = (0,0,0.0,len(goocc))
            for id in rje.sortKeys(goocc):
                for motif in rje.sortKeys(goocc[id]):
                    for type in rje.sortKeys(goocc[id][motif]):
                        if len(goocc[id][motif][type]) < self.stat['MinOcc']: goocc[id][motif].pop(type)
                    if len(goocc[id][motif]) < 2 or 'fwd' not in goocc[id][motif]: continue
                    mx += 1
                    for type in goocc[id][motif]:
                        for occ in goocc[id][motif][type]: rje.delimitedFileOutput(self,outfile,headers,datadict=occ); ox += 1
                self.progLog('#OUT','Output to %s: %.2f%% :: %s motifs; %s occ.' % (outfile,ix/itot,rje.integerString(mx),rje.integerString(ox)))
            self.printLog('\r#OUT','Output of occurrences to %s is now complete: %s motifs; %s occ.' % (outfile,rje.integerString(mx),rje.integerString(ox)))

        except:
            self.log.errorLog(rje_zen.Zen().wisdom())
            raise   # Delete this if method error not terrible
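
The filter near the end keeps, for each GO term and motif, only the occurrence classes (fwd/rev/scram) with at least MinOcc members, and then only motifs that retain 'fwd' plus at least one shuffled control class. The sketch below applies the same filter to nested dictionaries; the GO term, motif name, threshold and counts are made up for illustration.

def filter_go_occurrences(goocc, minocc=3):
    """goocc: {go_id: {motif: {'fwd': [...], 'rev': [...], 'scram': [...]}}}.
    Drop under-populated classes, then motifs lacking 'fwd' or a comparison class."""
    kept = {}
    for go_id, motifs in goocc.items():
        for motif, classes in motifs.items():
            classes = {t: occs for t, occs in classes.items() if len(occs) >= minocc}
            if 'fwd' in classes and len(classes) >= 2:
                kept.setdefault(go_id, {})[motif] = classes
    return kept

goocc = {'GO:0005515': {'LIG_X': {'fwd': [1, 2, 3], 'rev': [4, 5, 6], 'scram': [7]}}}
print(filter_go_occurrences(goocc))
# {'GO:0005515': {'LIG_X': {'fwd': [1, 2, 3], 'rev': [4, 5, 6]}}}
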
Example #19
0
    def makeFlySeq(self):  ### Main run method
        '''Main run method.'''
        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            flybase = rje.makePath('/scratch/Databases/NewDB/FlyBase/Fasta/')
            scmd = ['accnr=F','seqnr=F','gnspacc=F']
            genes = rje_seq.SeqList(self.log, self.cmd_list+['seqin=%sdmel-all-gene-r5.5.fasta' % flybase]+scmd)
            cds = rje_seq.SeqList(self.log, self.cmd_list+['seqin=%sdmel-all-CDS-r5.5.fasta' % flybase]+scmd)
            exons = rje_seq.SeqList(self.log, self.cmd_list+['seqin=%sdmel-all-exon-r5.5.fasta' % flybase]+scmd)

            ### ~ [1] ~	Read in full-length gene and note start and end positions in parent scaffold ~~~~~~~~~~~~~~~~ ###
            genedict = {}   # Dictionary of {ID:Sequence object}
            (gx,gtot) = (0.0,genes.seqNum())
            for gene in genes.seq:
                self.log.printLog('\r#GENE','Processing Gene Annotation: %.1f%%' % (gx/gtot),newline=False,log=False)
                gx += 100
                (id,scaffold,pos,name,glen) = rje.matchExp('^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);',gene.info['Name'])
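                # Illustrative header shape expected by the regex above: 'FBgn0000001 ... loc=2L:complement(100..500); ... name=net; ... length=401;'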
                if string.atoi(glen) != gene.aaLen(): self.log.errorLog('%s Length mismatch!' % id, printerror=False)
                genedict[id] = gene
                gene.setInfo({'Scaffold':scaffold,'Gene':name})
                try: (end,start) = rje.matchExp('^complement\((\d+)\.\.(\d+)\)',pos)
                except: (start,end) = rje.matchExp('^(\d+)\.\.(\d+)',pos)
                (start,end) = (string.atoi(start),string.atoi(end))
                gene.opt['Complement'] = start > end        # Sequence on "lagging" strand
                gene.setStat({'Start':start,'End':end})
                gene.list['CDS'] = []       # Will add CDS sequences here
                gene.list['Exon'] = []      # Will add exon sequences here
            self.log.printLog('\r#GENE','Processing Gene Annotation complete!')
                           
            ### ~ [2] ~ Read in associated CDS sequences and note start and end positions ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (cx,ctot) = (0.0,cds.seqNum())
            for seq in cds.seq:
                self.log.printLog('\r#CDS','Processing CDS Annotation: %.1f%%' % (cx/ctot),newline=False,log=False)
                cx += 100
                try: (id,scaffold,pos,name,glen,parent) = rje.matchExp('^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);.+parent=(\S+),\S+;',seq.info['Name'])
                except:
                    self.log.errorLog(seq.info['Name'])
                    raise
                if string.atoi(glen) != seq.aaLen(): self.log.errorLog('%s Length mismatch!' % id, printerror=False)
                seq.obj['Parent'] = gene = genedict[parent]
                try: (end,start) = rje.matchExp('^complement\((\d+)\..*\.(\d+)\)',pos)
                except:
                    try: (start,end) = rje.matchExp('^join\((\d+)\..*\.(\d+)\)',pos)
                    except: (start,end) = rje.matchExp('^(\d+)\.\.(\d+)',pos)
                (start,end) = (string.atoi(start),string.atoi(end))
                seq.opt['Complement'] = start > end        # Sequence on "lagging" strand
                seq.setStat({'Start':start,'End':end})
                gene.list['CDS'].append(seq)
            self.log.printLog('\r#CDS','Processing CDS Annotation complete!')
                
            ### ~ [3] ~ Read in associated exons and note start and end positions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (ex,etot) = (0.0,exons.seqNum())
            for seq in exons.seq:
                self.log.printLog('\r#EXON','Processing Exon Annotation: %.1f%%' % (ex/etot),newline=False,log=False)
                ex += 100
                try: (id,scaffold,pos,name,parent) = rje.matchExp('^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+parent=(\S+);',seq.info['Name'])
                except:
                    self.log.errorLog(seq.info['Name'])
                    raise
                seq.obj['Parent'] = gene = genedict[string.split(parent,',')[0]]
                try: (end,start) = rje.matchExp('^complement\((\d+)\..*\.(\d+)\)',pos)
                except:
                    try: (start,end) = rje.matchExp('^join\((\d+)\..*\.(\d+)\)',pos)
                    except: (start,end) = rje.matchExp('^(\d+)\.\.(\d+)',pos)
                (start,end) = (string.atoi(start),string.atoi(end))
                seq.opt['Complement'] = start > end        # Sequence on "lagging" strand
                seq.setStat({'Start':start,'End':end})
                gene.list['Exon'].append(seq)
            self.log.printLog('\r#EXON','Processing Exon Annotation complete!')
                
            ### ~ [4] ~ Regenerate output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [4a] ~ Convert to relative positions and store ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            (gx,gtot) = (0.0,genes.seqNum())
            for gene in genes.seq:
                glen = gene.aaLen()
                self.log.printLog('\r#GENE','Generating new Gene Annotation: %.1f%%' % (gx/gtot),newline=False,log=False)
                gx += 100
                clist = []
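                # Convert CDS (and exon) genomic coordinates to positions relative to the gene; reverse-strand genes count back from the gene Start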
                for seq in gene.list['CDS']:
                    if gene.opt['Complement']:  # Must subtract from "wrong" end and reverse
                        start = gene.stat['Start'] - seq.stat['Start']
                        end = gene.stat['Start'] - seq.stat['End']
                    else:
                        start = seq.stat['Start'] - gene.stat['Start']
                        end = seq.stat['End'] - gene.stat['Start']
                    pos = '%s-%s' % (rje.preZero(start,glen),rje.preZero(end,glen))
                    clist.append(pos)
                clist = rje.sortUnique(clist,xreplace=False)
                elist = []
                for seq in gene.list['Exon']:
                    if gene.opt['Complement']:  # Must subtract from "wrong" end and reverse
                        start = gene.stat['Start'] - seq.stat['Start']
                        end = gene.stat['Start'] - seq.stat['End']
                    else:
                        start = seq.stat['Start'] - gene.stat['Start']
                        end = seq.stat['End'] - gene.stat['Start']
                    pos = '%s-%s' % (rje.preZero(start,glen),rje.preZero(end,glen))
                    elist.append(pos)
                elist = rje.sortUnique(elist,xreplace=False)
                gene.info['Name'] = '%s_%s__%s Length=%d; CDS=%s; Exons=%s;' % (gene.info['Gene'],gene.info['SpecCode'],gene.info['AccNum'],gene.aaLen(),string.join(clist,','),string.join(elist,','))
            self.log.printLog('\r#GENE','Generating new Gene Annotation complete!')
            ## ~ [4b] ~ Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            genes.saveFasta(seqfile='flybase_DROME.genes.fas')

        except: self.log.errorLog(rje_zen.Zen().wisdom())
Example #20
0
    def taxaMap(self):  ### Maps species codes onto different taxonomic ranks.
        '''Maps species codes onto different taxonomic ranks.'''
        try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            db = self.db()
            tax = self.obj['Taxonomy']
            ### ~ [2] ~ Add main run code here ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            specdb = self.db('spcode')
            #descdb = self.db('protdesc')
            ranks = ['genus', 'family', 'order', 'class', 'phylum']
            rankmap = {}  # SPCODE to Taxon dictionary
            rankfields = ['protein'] + ranks + specdb.fields()[1:]
            #if descdb: rankfields.append('desc')
            if self.getStrLC('ProtDesc'):
                rankfields.append('desc')
                px = 0
                for prot in self.dict['ProtDesc']:
                    if prot.lower() in ['', 'protein', 'gene']: continue
                    pentry = {
                        'protein': prot,
                        'spcode': 'None',
                        'boot': self.getNum('NoneBoot')
                    }
                    pkey = specdb.makeKey(pentry)
                    if pkey not in specdb.dataKeys():
                        specdb.addEntry(pentry)
                        px += 1
                self.printLog(
                    '#PROT', 'Added %s proteins from %s without trees.' %
                    (rje.iStr(px), self.getStr('ProtDesc')))
            rankdb = db.addEmptyTable('taxamap', rankfields, ['protein'])
            for rank in ranks:
                rankmap[rank] = {
                    'None': 'None',
                    'Unmapped': 'Unmapped',
                    'Uncertain': 'Uncertain'
                }
            taxdb = db.addEmptyTable('taxa',
                                     ['spcode', 'taxid', 'name'] + ranks,
                                     ['spcode'])

            sx = 0.0
            stot = specdb.entryNum()
            for entry in specdb.entries():
                self.progLog('\r#SPEC',
                             'Processing species: %.2f%%' % (sx / stot))
                sx += 100.0
                #if descdb:
                #try: entry['desc'] = descdb.data(descdb.makeKey(entry))['description']
                try:
                    entry['desc'] = self.dict['ProtDesc'][entry['protein']]
                except:
                    entry['desc'] = ''
                for spcode in string.split(entry['spcode'], '|'):
                    if spcode in rankmap['genus']: continue
                    tentry = {'spcode': spcode}
                    try:
                        taxid = tax.mapToTaxID(spcode,
                                               nodeonly=True,
                                               warn=False)[0]
                        rank = tax.dict['Rank'][taxid]
                        tentry['taxid'] = taxid
                        tentry['name'] = tax.getSpecies(taxid)
                    except:
                        self.warnLog(
                            'Unable to map species code "%s" to TaxID -> "Unmapped"'
                            % spcode)
                        taxid = 'Unmapped'
                        rank = 'genus'
                    # Loop through different ranks
                    for ri in range(len(ranks)):
                        nextrank = ranks[ri]
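                        # Climb the taxonomy through Parent links until the current rank is at or above nextrank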
                        while rank not in ranks[ri:] and taxid in tax.dict[
                                'Parent']:
                            taxid = tax.dict['Parent'][taxid]
                            rank = tax.dict['Rank'][taxid]
                            #self.debug('%s: %s' % (tax.dict['Rank'][taxid],tax.getSpecies(taxid)))
                        if taxid in tax.dict['Parent']:
                            taxon = tax.getSpecies(taxid)
                        else:
                            taxon = 'Unmapped'
                        if rank != nextrank:
                            if self.getBool('Monophyly'): taxon = 'Uncertain'
                            else: taxon = '%s %s.' % (taxon, nextrank[:3])
                        rankmap[nextrank][spcode] = taxon
                        tentry[nextrank] = taxon
                    taxdb.addEntry(tentry)
                rentry = {}
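                # Collapse the per-species-code taxa into a single value per rank, preferring real taxa over 'None'/'Unmapped'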
                for nextrank in ranks:
                    taxa = []
                    unmapped = ''
                    for spcode in string.split(entry['spcode'], '|'):
                        ranktax = rankmap[nextrank][spcode]
                        if 'unmapped' in ranktax.lower(
                        ) and ranktax not in taxa:
                            if unmapped:
                                self.warnLog('Two Unmapped %s taxa: %s & %s' %
                                             (nextrank, unmapped, ranktax))
                            unmapped = ranktax  #i# Should only be one
                        if ranktax not in taxa: taxa.append(ranktax)
                    if len(taxa) > 1 and 'None' in taxa:
                        self.warnLog('None in: %s' %
                                     string.join(rje.sortUnique(taxa), '|'))
                        taxa.remove('None')
                    if len(taxa) > 1 and unmapped: taxa.remove(unmapped)
                    if len(taxa) > 1 and self.getBool('Monophyly'):
                        rentry[nextrank] = 'Uncertain'
                    else:
                        rentry[nextrank] = string.join(rje.sortUnique(taxa),
                                                       '|')
                rankdb.addEntry(rje.combineDict(rentry, entry))
            self.printLog(
                '\r#SPEC',
                '%s proteins with species codes processed.' % rje.iStr(stot))
            rankdb.saveToFile()
            taxdb.saveToFile()
        except:
            self.errorLog('%s.taxaMap error' % self.prog())
Example #21
0
    def taxaMap(self):      ### Maps species codes onto different taxonomic ranks.
        '''Maps species codes onto different taxonomic ranks.'''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            db = self.db()
            tax = self.obj['Taxonomy']
            ### ~ [2] ~ Add main run code here ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            specdb = self.db('spcode')
            #descdb = self.db('protdesc')
            ranks = ['genus','family','order','class','phylum']
            rankmap = {}    # SPCODE to Taxon dictionary
            rankfields = ['protein']+ranks+specdb.fields()[1:]
            #if descdb: rankfields.append('desc')
            if self.getStrLC('ProtDesc'):
                rankfields.append('desc'); px = 0
                for prot in self.dict['ProtDesc']:
                    if prot.lower() in ['','protein','gene']: continue
                    pentry = {'protein':prot,'spcode':'None','boot':self.getNum('NoneBoot')}
                    pkey = specdb.makeKey(pentry)
                    if pkey not in specdb.dataKeys(): specdb.addEntry(pentry); px += 1
                self.printLog('#PROT','Added %s proteins from %s without trees.' % (rje.iStr(px),self.getStr('ProtDesc')))
            rankdb = db.addEmptyTable('taxamap',rankfields,['protein'])
            for rank in ranks: rankmap[rank] = {'None':'None','Unmapped':'Unmapped','Uncertain':'Uncertain'}
            taxdb = db.addEmptyTable('taxa',['spcode','taxid','name']+ranks,['spcode'])

            sx = 0.0; stot = specdb.entryNum()
            for entry in specdb.entries():
                self.progLog('\r#SPEC','Processing species: %.2f%%' % (sx/stot)); sx += 100.0
                #if descdb:
                    #try: entry['desc'] = descdb.data(descdb.makeKey(entry))['description']
                try: entry['desc'] = self.dict['ProtDesc'][entry['protein']]
                except: entry['desc'] = ''
                for spcode in string.split(entry['spcode'],'|'):
                    if spcode in rankmap['genus']: continue
                    tentry = {'spcode':spcode}
                    try:
                        taxid = tax.mapToTaxID(spcode,nodeonly=True,warn=False)[0]
                        rank = tax.dict['Rank'][taxid]
                        tentry['taxid'] = taxid
                        tentry['name'] = tax.getSpecies(taxid)
                    except:
                        self.warnLog('Unable to map species code "%s" to TaxID -> "Unmapped"' % spcode)
                        taxid = 'Unmapped'
                        rank = 'genus'
                    # Loop through different ranks
                    for ri in range(len(ranks)):
                        nextrank = ranks[ri]
                        while rank not in ranks[ri:] and taxid in tax.dict['Parent']:
                            taxid = tax.dict['Parent'][taxid]
                            rank = tax.dict['Rank'][taxid]
                            #self.debug('%s: %s' % (tax.dict['Rank'][taxid],tax.getSpecies(taxid)))
                        if taxid in tax.dict['Parent']: taxon = tax.getSpecies(taxid)
                        else: taxon = 'Unmapped'
                        if rank != nextrank:
                            if self.getBool('Monophyly'): taxon = 'Uncertain'
                            else: taxon = '%s %s.' % (taxon,nextrank[:3])
                        rankmap[nextrank][spcode] = taxon
                        tentry[nextrank] = taxon
                    taxdb.addEntry(tentry)
                rentry = {}
                for nextrank in ranks:
                    taxa = []
                    unmapped = ''
                    for spcode in string.split(entry['spcode'],'|'):
                        ranktax = rankmap[nextrank][spcode]
                        if 'unmapped' in ranktax.lower() and ranktax not in taxa:
                            if unmapped: self.warnLog('Two Unmapped %s taxa: %s & %s' % (nextrank,unmapped,ranktax))
                            unmapped = ranktax   #i# Should only be one
                        if ranktax not in taxa: taxa.append(ranktax)
                    if len(taxa) > 1 and 'None' in taxa:
                        self.warnLog('None in: %s' % string.join(rje.sortUnique(taxa),'|'))
                        taxa.remove('None')
                    if len(taxa) > 1 and unmapped: taxa.remove(unmapped)
                    if len(taxa) > 1 and self.getBool('Monophyly'): rentry[nextrank] = 'Uncertain'
                    else: rentry[nextrank] = string.join(rje.sortUnique(taxa),'|')
                rankdb.addEntry(rje.combineDict(rentry,entry))
            self.printLog('\r#SPEC','%s proteins with species codes processed.' % rje.iStr(stot))
            rankdb.saveToFile()
            taxdb.saveToFile()
        except: self.errorLog('%s.taxaMap error' %  self.prog())
Example #22
0
    def makeFlySeq(self):  ### Main run method
        '''Main run method.'''
        try:  ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            flybase = rje.makePath('/scratch/Databases/NewDB/FlyBase/Fasta/')
            scmd = ['accnr=F', 'seqnr=F', 'gnspacc=F']
            genes = rje_seq.SeqList(
                self.log, self.cmd_list +
                ['seqin=%sdmel-all-gene-r5.5.fasta' % flybase] + scmd)
            cds = rje_seq.SeqList(
                self.log, self.cmd_list +
                ['seqin=%sdmel-all-CDS-r5.5.fasta' % flybase] + scmd)
            exons = rje_seq.SeqList(
                self.log, self.cmd_list +
                ['seqin=%sdmel-all-exon-r5.5.fasta' % flybase] + scmd)

            ### ~ [1] ~	Read in full-length gene and note start and end positions in parent scaffold ~~~~~~~~~~~~~~~~ ###
            genedict = {}  # Dictionary of {ID:Sequence object}
            (gx, gtot) = (0.0, genes.seqNum())
            for gene in genes.seq:
                self.log.printLog('\r#GENE',
                                  'Processing Gene Annotation: %.1f%%' %
                                  (gx / gtot),
                                  newline=False,
                                  log=False)
                gx += 100
                (id, scaffold, pos, name, glen) = rje.matchExp(
                    '^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);',
                    gene.info['Name'])
                if string.atoi(glen) != gene.aaLen():
                    self.log.errorLog('%s Length mismatch!' % id,
                                      printerror=False)
                genedict[id] = gene
                gene.setInfo({'Scaffold': scaffold, 'Gene': name})
                try:
                    (end,
                     start) = rje.matchExp('^complement\((\d+)\.\.(\d+)\)',
                                           pos)
                except:
                    (start, end) = rje.matchExp('^(\d+)\.\.(\d+)', pos)
                (start, end) = (string.atoi(start), string.atoi(end))
                gene.opt[
                    'Complement'] = start > end  # Sequence on "lagging" strand
                gene.setStat({'Start': start, 'End': end})
                gene.list['CDS'] = []  # Will add CDS sequences here
                gene.list['Exon'] = []  # Will add exon sequences here
            self.log.printLog('\r#GENE',
                              'Processing Gene Annotation complete!')

            ### ~ [2] ~ Read in associated CDS sequences and note start and end positions ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (cx, ctot) = (0.0, cds.seqNum())
            for seq in cds.seq:
                self.log.printLog('\r#CDS',
                                  'Processing CDS Annotation: %.1f%%' %
                                  (cx / ctot),
                                  newline=False,
                                  log=False)
                cx += 100
                try:
                    (id, scaffold, pos, name, glen, parent) = rje.matchExp(
                        '^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);.+parent=(\S+),\S+;',
                        seq.info['Name'])
                except:
                    self.log.errorLog(seq.info['Name'])
                    raise
                if string.atoi(glen) != seq.aaLen():
                    self.log.errorLog('%s Length mismatch!' % id,
                                      printerror=False)
                seq.obj['Parent'] = gene = genedict[parent]
                try:
                    (end,
                     start) = rje.matchExp('^complement\((\d+)\..*\.(\d+)\)',
                                           pos)
                except:
                    try:
                        (start,
                         end) = rje.matchExp('^join\((\d+)\..*\.(\d+)\)', pos)
                    except:
                        (start, end) = rje.matchExp('^(\d+)\.\.(\d+)', pos)
                (start, end) = (string.atoi(start), string.atoi(end))
                seq.opt[
                    'Complement'] = start > end  # Sequence on "lagging" strand
                seq.setStat({'Start': start, 'End': end})
                gene.list['CDS'].append(seq)
            self.log.printLog('\r#CDS', 'Processing CDS Annotation complete!')

            ### ~ [3] ~ Read in associated exons and note start and end positions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (ex, etot) = (0.0, exons.seqNum())
            for seq in exons.seq:
                self.log.printLog('\r#EXON',
                                  'Processing Exon Annotation: %.1f%%' %
                                  (ex / etot),
                                  newline=False,
                                  log=False)
                ex += 100
                try:
                    (id, scaffold, pos, name, parent) = rje.matchExp(
                        '^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+parent=(\S+);',
                        seq.info['Name'])
                except:
                    self.log.errorLog(seq.info['Name'])
                    raise
                seq.obj['Parent'] = gene = genedict[string.split(parent,
                                                                 ',')[0]]
                try:
                    (end,
                     start) = rje.matchExp('^complement\((\d+)\..*\.(\d+)\)',
                                           pos)
                except:
                    try:
                        (start,
                         end) = rje.matchExp('^join\((\d+)\..*\.(\d+)\)', pos)
                    except:
                        (start, end) = rje.matchExp('^(\d+)\.\.(\d+)', pos)
                (start, end) = (string.atoi(start), string.atoi(end))
                seq.opt[
                    'Complement'] = start > end  # Sequence on "lagging" strand
                seq.setStat({'Start': start, 'End': end})
                gene.list['Exon'].append(seq)
            self.log.printLog('\r#EXON',
                              'Processing Exon Annotation complete!')

            ### ~ [4] ~ Regenerate output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [4a] ~ Convert to relative positions and store ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            (gx, gtot) = (0.0, genes.seqNum())
            for gene in genes.seq:
                glen = gene.aaLen()
                self.log.printLog('\r#GENE',
                                  'Generating new Gene Annotation: %.1f%%' %
                                  (gx / gtot),
                                  newline=False,
                                  log=False)
                gx += 100
                clist = []
                for seq in gene.list['CDS']:
                    if gene.opt[
                            'Complement']:  # Must subtract from "wrong" end and reverse
                        start = gene.stat['Start'] - seq.stat['Start']
                        end = gene.stat['Start'] - seq.stat['End']
                    else:
                        start = seq.stat['Start'] - gene.stat['Start']
                        end = seq.stat['End'] - gene.stat['Start']
                    pos = '%s-%s' % (rje.preZero(start,
                                                 glen), rje.preZero(end, glen))
                    clist.append(pos)
                clist = rje.sortUnique(clist, xreplace=False)
                elist = []
                for seq in gene.list['Exon']:
                    if gene.opt[
                            'Complement']:  # Must subtract from "wrong" end and reverse
                        start = gene.stat['Start'] - seq.stat['Start']
                        end = gene.stat['Start'] - seq.stat['End']
                    else:
                        start = seq.stat['Start'] - gene.stat['Start']
                        end = seq.stat['End'] - gene.stat['Start']
                    pos = '%s-%s' % (rje.preZero(start,
                                                 glen), rje.preZero(end, glen))
                    elist.append(pos)
                elist = rje.sortUnique(elist, xreplace=False)
                gene.info[
                    'Name'] = '%s_%s__%s Length=%d; CDS=%s; Exons=%s;' % (
                        gene.info['Gene'], gene.info['SpecCode'],
                        gene.info['AccNum'], gene.aaLen(),
                        string.join(clist, ','), string.join(elist, ','))
            self.log.printLog('\r#GENE',
                              'Generating new Gene Annotation complete!')
            ## ~ [4b] ~ Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            genes.saveFasta(seqfile='flybase_DROME.genes.fas')

        except:
            self.log.errorLog(rje_zen.Zen().wisdom())
Example #23
0
 def iTRAQSamples(self): ### Uses self.dict['Samples'] and self.db('itraq') to summarise hit data
     '''Uses self.dict['Samples'] and self.db('itraq') to summarise hit data.'''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         db = self.db(); idb = self.db('itraq')
         mdb = db.copyTable(idb,'itraq_summary')
         gdb = db.copyTable(idb,'itraq_geomean')
         ### ~ [1] Reformat Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         mdb.dropField('geomean'); gdb.dropField('ratio'); gdb.renameField('geomean','ratio')
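          # mdb keeps the per-peptide ratio; gdb uses the precomputed geomean as its 'ratio' so both tables run through the same processing below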
         for sdb in [mdb,gdb]:
             sdb.dropField('summary');
             sdb.dropEntriesDirect('ratio','---')
             sdb.dropEntriesDirect('ratio','NN')
             sdb.dataFormat({'ratio':'num','n':'int'})
             ## ~ [1a] Drop tags with Samples ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             (ex,etot) = (0.0,sdb.entryNum())
             for entry in sdb.entries():
                 self.progLog('\r#ITRAQ','Drop isotags without Sample info: %.2f%%' % (ex/etot)); ex += 100.0
                 tags = string.split(entry['itraq'],'/')
                 if tags[0] not in self.dict['Samples'] or tags[1] not in self.dict['Samples']: sdb.dropEntry(entry)
             self.printLog('\r#ITRAQ','Dropped all isotags without Sample info: %s of %s entries remain' % (rje.iStr(sdb.entryNum()),rje.iStr(etot)))
             ## ~ [1b] Reshape, rename, invert and remove redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             sdb.reshapeWide('itraq',['ratio','n'])
             samples = rje.sortUnique(self.dict['Samples'].values())
             ratios = []
             self.printLog('#SAMP',string.join(samples,', '))
             for s1 in samples:
                 for s2 in samples[samples.index(s1):]:
                     newfield = '%s/%s' % (s1,s2)
                     sdb.addField(newfield)
                     sdb.addField('%s_Min' % newfield)
                     sdb.addField('%s_Max' % newfield)
                     sdb.addField('%s_Dirn' % newfield)
                     ratios.append(newfield)
                     for entry in sdb.entries(): entry[newfield] = []
             for field in sdb.fields():
                 if '|' in field:
                     (score,tags) = string.split(field,'|')
                     tag = string.split(tags,'/')
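                      # Standardise orientation so the lower isotag always comes first, inverting ratio values where necessary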
                     if int(tag[0]) > int(tag[1]):   ### Invert
                         newfield = '%s|%s/%s' % (score,tag[1],tag[0])
                         if newfield in sdb.fields(): sdb.dropField(newfield); continue
                         sdb.renameField(field,newfield)
                         if score == 'ratio':
                             for entry in sdb.entries():
                                 if entry[newfield]: entry[newfield] = 1.0 / entry[newfield]
                         tag = (tag[1],tag[0])
                         field = newfield
                     s1 = self.dict['Samples'][tag[0]]
                     s2 = self.dict['Samples'][tag[1]]
                     newname = '%s|%s%s/%s%s' % (score,s1,tag[0],s2,tag[1])
                     sdb.renameField(field,newname)
                     if score == 'n': continue
                     newfield = '%s/%s' % (s1,s2)
                     invfield = '%s/%s' % (s2,s1)
                     for entry in sdb.entries():
                         if entry[newname] and newfield in sdb.fields(): entry[newfield].append(entry[newname])
                         elif entry[newname]: entry[invfield].append(1.0/entry[newname])
             ## ~ [1c] Calculate Geometric mean ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             (ex,etot) = (0.0,sdb.entryNum())
             for entry in sdb.entries():
                 self.progLog('\r#GEO','Calculating Geometric means: %.2f%%' % (ex/etot)); ex += 100.0
                 for ratio in ratios:
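                      # Flag UP/DOWN only when all combined ratios agree (min > 1 for UP, max < 1 for DOWN)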
                     if entry[ratio]:
                         entry['%s_Min' % ratio] = min(entry[ratio])
                         entry['%s_Max' % ratio] = max(entry[ratio])
                         try: entry[ratio] = rje.geoMean(entry[ratio])
                         except: self.deBug(entry)
                         if entry[ratio] > 1 and entry['%s_Min' % ratio] > 1: entry['%s_Dirn' % ratio] = 'UP'
                         elif entry[ratio] < 1 and entry['%s_Max' % ratio] < 1: entry['%s_Dirn' % ratio] = 'DOWN'
                     else: entry['%s_Dirn' % ratio] = entry['%s_Min' % ratio] = entry['%s_Max' % ratio] = entry[ratio] = ''
             self.printLog('\r#GEO','Geometric mean calculations complete')
             sdb.saveToFile()
     except: self.errorLog('iTRAQSamples error')
Example #24
0
    def run(self):  ### Main run method
        '''Main run method.'''
        try:  ### ~ [1] Reformat Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for fasta in glob.glob('*.fasta'):
                fas = fasta[:-2]
                if os.path.exists(fas): continue
                sx = 0
                for line in open(fasta, 'r').readlines():
                    if line[:1] == '>':
                        try:
                            (name,
                             desc) = rje.matchExp('^>(\S+) (\S.+)$', line)
                        except:
                            name = rje.matchExp('^>(\S+)', line)[0]
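                        # 3-field identifiers are six-frame translations (6rf_NEIME); 5-field identifiers are reference proteins (ref_NEIME)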
                        if len(string.split(name, '|')) == 3:
                            name = '6rf_NEIME__%s' % string.split(name, '|')[2]
                            open(fas, 'a').write('>%s\n' % name)
                        elif len(string.split(name, '|')) == 5:
                            name = 'ref_NEIME__%s' % string.split(name, '|')[3]
                            open(fas, 'a').write('>%s %s\n' % (name, desc))
                        else:
                            print string.split(name, '|')
                            raise ValueError
                        self.progLog(
                            '\r#FAS', 'Processing %s: %s seqs' %
                            (fas, rje.integerString(sx)))
                        sx += 1
                    else:
                        open(fas, 'a').write(line)
                self.printLog(
                    '\r#FAS', 'Processed %s: %s seqs from %s' %
                    (fas, rje.integerString(sx), fasta))
                rje_blast.BLASTRun(self.log,
                                   self.cmd_list).formatDB(fas,
                                                           protein=True,
                                                           force=True)
            ### ~ [2] Read in CSV Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            rfhits = {}  # Dictionary of {hit:['File:hit_num']}
            acc = 'MC58_6RF_Hits.acc'
            open(acc, 'w')
            gfile = 'MC58_6RF_Hits.vs.MC58_1.hitsum.tdt'
            cx = 0
            for csv in glob.glob('MC58_6RF_CSV/*.CSV'):
                cx += 1
                file = os.path.basename(csv)[:-4]
                hits = False
                for line in open(csv, 'r').readlines():
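                    # Hit rows follow the 'prot_hit_num,prot_acc' header line in each CSV export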
                    if line.find('prot_hit_num,prot_acc') == 0: hits = True
                    elif hits:
                        data = rje.readDelimit(line, ',')
                        if len(data) < 2: continue
                        [num, name] = data[:2]
                        try:
                            name = string.split(name, '|')[2]
                        except:
                            continue
                        if name not in rfhits:
                            open(acc, 'a').write('6rf_NEIME__%s\n' % name)
                            rfhits[name] = []
                        id = '%s:%s' % (file, num)
                        if id not in rfhits[name]: rfhits[name].append(id)
                        self.progLog(
                            '\r#CSV', 'Reading %d CSV files: %s 6RF Hits' %
                            (cx, rje.integerString(len(rfhits))))
            self.printLog(
                '\r#CSV', 'Read %d CSV files: %s 6RF Hits output to %s' %
                (cx, rje.integerString(len(rfhits)), acc))
            ### ~ [3] Extract sequences and perform GABLAM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if not os.path.exists(gfile):
                seqlist = rje_seq.SeqList(
                    self.log, self.cmd_list + [
                        'seqin=%s' % acc, 'fasdb=MC58_6RF.fas',
                        'seqout=MC58_6RF_Hits.fas', 'autoload=T', 'accnr=F',
                        'seqnr=F'
                    ])
                seqlist.info['Name'] = 'MC58_6RF_Hits.fas'
                seqlist.saveFasta()
                gablam.GABLAM(
                    self.log, self.cmd_list + [
                        'seqin=MC58_6RF_Hits.fas', 'searchdb=MC58_1.fas',
                        'qryacc=F'
                    ]).gablam()
            ### ~ [4] Read in GABLAM and ID Hits without genomic homology ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            gdata = rje.dataDict(self, gfile, ['Qry'], ['HitNum'])
            zeros = []
            for hit in gdata:
                if string.atoi(gdata[hit]['HitNum']) == 0: zeros.append(hit)
            zeros = rje.sortUnique(zeros, False)
            open('6rf_zeros.acc', 'w').write(string.join(zeros, '\n'))
            self.printLog(
                '#ZERO',
                '%d 6RF hits with 0 BLAST hits to MC58_1' % len(zeros))
            ufile = 'MC58_6RF_Zeros.vs.embl_bacteria.hitsum.tdt'
            if not os.path.exists(ufile):
                seqlist = rje_seq.SeqList(
                    self.log, self.cmd_list + [
                        'seqin=6rf_zeros.acc', 'fasdb=MC58_6RF.fas',
                        'seqout=MC58_6RF_Zeros.fas', 'autoload=T', 'accnr=F',
                        'seqnr=F'
                    ])
                seqlist.info['Name'] = 'MC58_6RF_Zeros.fas'
                seqlist.saveFasta()
                gablam.GABLAM(
                    self.log, self.cmd_list + [
                        'seqin=MC58_6RF_Zeros.fas',
                        'searchdb=/scratch/Databases/NewDB/TaxaDB/embl_bacteria.fas',
                        'qryacc=F'
                    ]).gablam()
            gdata = rje.dataDict(self, ufile, ['Qry'], getheaders=True)
            fdata = rje.dataDict(self,
                                 string.replace(ufile, 'hitsum', 'gablam'),
                                 ['Qry'], ['Hit'],
                                 lists=True)
            headers = gdata.pop('Headers')
            headers.insert(1, 'Sample')
            headers.append('BestHit')
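            # Annotate each zero-hit query with its source sample file(s) and best bacterial hit (or '-' if none)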
            rje.delimitedFileOutput(self,
                                    'MC58_6RF_Zeros.tdt',
                                    headers,
                                    rje_backup=True)
            for rf in rje.sortKeys(gdata):
                rfcut = string.split(rf, '__')[1]
                gdata[rf]['Sample'] = string.join(rfhits[rfcut], '; ')
                gdata[rf]['Qry'] = rfcut
                try:
                    gdata[rf]['BestHit'] = fdata[rf]['Hit'][0]
                except:
                    gdata[rf]['BestHit'] = '-'
                rje.delimitedFileOutput(self,
                                        'MC58_6RF_Zeros.tdt',
                                        headers,
                                        datadict=gdata[rf])

        except:
            self.errorLog(rje_zen.Zen().wisdom())
        self.printLog('#ZEN', rje_zen.Zen().wisdom())