Example #1
 def powerGO(self,numbers,sig=0.01,samples='all',total='Total',countkey='counts',ignore=[]):  ### Special GO power calculation for GO slim set
     '''
     Special GO power calculation for GO slim set.
     >> numbers:dictionary of {Sample:Count}
     >> sig:float [0.01] = Desired significance level to achieve. Currently uncorrected. Add Bonf/FDR with time.
     >> samples:str ['all'] = Whether sig must be achievable for 'any' or 'all' samples.
     >> total:str ['Total'] = Sample containing Total counts to compare against
     >> countkey:str ['counts'] = Key identifying count dictionary for each GO term and 'total' count sample
     - self.go(id)[countkey] = {Sample:count}
     >> ignore:list of Samples to ignore from calculation
     << returns a list of GO IDs that meet criteria
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         N = numbers[total]        # Total count for calculating expectations/probabilities
         nlist = []                  # List of counts for subsamples to be assessed
         for sample in numbers:
             if sample not in ignore + [total]: nlist.append(numbers[sample])
         nlist = rje.sortUnique(nlist,xreplace=False,num=True)
         ### ~ [2] ~ Generate Power Range ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         plist = []                  # List of acceptable Total counts for subset
         nx = 0.0
         for i in range(1,N+1):      # Look at all possible levels of occurrence
             self.progLog('#POW','Calculating GO term power: %.1f%%' % (nx/N))
             nx += 100.0
             ok = 0
             p = float(i) / N        # Probability of each gene having this term
             for n in nlist:         # Look at each subset
                 k1 = min(i,n)       # Want to look at largest possible count for sample-term pairing
                 k2 = max(0,n-(N-i)) # Also want to look at the likelihood of under-representation
                 if rje.binomial(k1,n,p,callobj=self) <= sig: ok += 1
                 elif (1 - rje.binomial(k2+1,n,p,callobj=self)) <= sig: ok += 1
                 #!# Add under-representation too! #!#
                 if ok and samples == 'any': break
             if (ok and samples == 'any') or ok == len(nlist): plist.append(i)
         self.printLog('\r#POW','Calculation of GO term power complete.',log=False)
         self.deBug(nlist)
         ### ~ [3] ~ Generate GO Slim ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         terms = []
         (ix,itot) = (0.0,len(self.go()))
         for id in rje.sortKeys(self.go()):
             self.progLog('#POW','Assessing terms for power: %.1f%% (%s terms)' % (ix/itot,rje.iLen(terms)))
             ix += 100.0
             if self.go(id)[countkey][total] in plist: terms.append(id)
         self.printLog('\r#POW','Assessed terms for statistical power, p <= %s: %s GO terms' % (sig,rje.iLen(terms)))
         #!# Add correction terms #!#
         self.deBug(terms)
         return terms
     except: self.errorLog('Major problem with GO.powerGO()')
     return []
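The loop above asks, for every possible total count i of a GO term (out of N genes), whether a subsample of size n could in principle reach the chosen significance for over- or under-representation. A minimal standalone sketch of the same calculation, assuming rje.binomial(k,n,p) is the upper-tail probability P(X >= k) and substituting scipy's binomial tails; function and variable names are illustrative, not part of the GO class:

from scipy.stats import binom

def powered_totals(N, sample_sizes, sig=0.01, samples='all'):
    '''Return the total counts i (of N genes) at which every (or any) subsample
    size in sample_sizes could reach significance sig for a GO term.'''
    powered = []
    for i in range(1, N + 1):
        p = float(i) / N                   # chance of one gene carrying the term
        ok = 0
        for n in sample_sizes:
            k1 = min(i, n)                 # most extreme over-representation possible
            k2 = max(0, n - (N - i))       # most extreme under-representation possible
            over = binom.sf(k1 - 1, n, p)  # P(X >= k1)
            under = binom.cdf(k2, n, p)    # P(X <= k2)
            if over <= sig or under <= sig:
                ok += 1
                if samples == 'any': break
        if (ok and samples == 'any') or ok == len(sample_sizes): powered.append(i)
    return powered

# powered_totals(5000, [150, 300]) -> term totals worth keeping in the GO slim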
Example #2
 def fpi(self):  ### Family-protein interactions
     '''Family-protein interactions.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not self.dict['Domain']: return
         outdir = 'SLiMPID_FPI'
         rje.mkDir(self,outdir)
         fpi = {}            # Dictionary of {family:[interactors]}
         badname = []
         ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for qry in rje.sortKeys(self.dict['PPI']):
             try:
                 fam = self.dict['Fam'][qry]
                 if len(fam) < 2: continue
             except: self.errorLog('Problem with "%s" protein family' % qry); continue
             fpi[qry] = []
             for hub in fam:
                 if hub not in self.dict['PPI']: continue
                 fpi[qry] += self.dict['PPI'][hub]      # Add with redundancy
             for spoke in fpi[qry][0:]:
                 if fpi[qry].count(spoke) == 1: fpi[qry].remove(spoke)   # Must have 2+ family interactions
             for hub in fam:
                 if hub not in self.dict['PPI']: continue
                 for spoke in self.dict['PPI'][hub][0:]:
                     if spoke in fpi[qry]:
                         self.dict['PPI'][hub].remove(spoke)
                         if spoke in self.dict['PPI'] and hub in self.dict['PPI'][spoke]: self.dict['PPI'][spoke].remove(hub)
             fpi[qry] = rje.sortUnique(fpi[qry],False,False)
             acc = []
             gene = self.dict['Gene'][qry]
             for name in fpi[qry]:
                 if not name: continue
                 if name in self.dict['Seq']: acc.append(self.dict['Seq'][name].info['AccNum'])
                 elif name not in badname: badname.append(name)                     
             open('%s/%s.fpi.acc' % (outdir,gene),'w').write(string.join(acc,'\n'))
             self.printLog('#FPI','%s family => %d interactors' % (gene,len(acc)))
         if badname:
             badname.sort()
             self.printLog('#BAD','%d "bad" protein names: %s' % (len(badname),string.join(badname,'; ')))
         ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         hx = len(self.dict['PPI'])
         for hub in rje.sortKeys(self.dict['PPI']):
             if hub and self.dict['PPI'][hub]: continue
             self.dict['PPI'].pop(hub)
             self.printLog('#FPI','No %s PPI left after FPI removed' % hub)
         self.printLog('#PPX','%s of %s PPI hubs remain after FPI removed' % (rje.integerString(len(self.dict['PPI'])),rje.integerString(hx)))
     except: self.errorLog('Problem with SLiMPID.fpi()',quitchoice=True)
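fpi() pools the PPI partners of every member of the query's family and keeps only spokes supported by at least two members before writing the accession list. A standalone sketch of that pooling-and-thresholding step on a plain dict-of-lists PPI map (illustrative names, not the SLiMPID API):

from collections import Counter

def family_interactors(fam, ppi):
    '''Pool PPIs across a protein family; keep spokes backed by 2+ members.'''
    counts = Counter()
    for hub in fam:
        counts.update(ppi.get(hub, []))             # add with redundancy, as in fpi()
    return sorted(spoke for spoke, n in counts.items() if n >= 2)

ppi = {'A1': ['X', 'Y'], 'A2': ['X', 'Z'], 'A3': ['Y']}
print(family_interactors(['A1', 'A2', 'A3'], ppi))  # ['X', 'Y']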
Example #3
 def readResults(self,clear=True,readaln=False):  ### Reads results from self.list['HMMRes'] into objects
     '''
     Reads results from self.list['HMMRes'] into objects.
     >> clear:boolean = whether to clear self.search before reading [True]
     >> readaln:boolean = whether to bother reading Alignments into objects [False]
     '''
     try:
         if clear: self.search = []
         for resfile in rje.sortUnique(self.list['HMMRes'],xreplace=False):
             if not os.path.exists(resfile) and self.opt['GZip'] and os.path.exists('%s.gz' % resfile):
                 os.system('gunzip %s.gz' % resfile)
                 self.printLog('#GUNZIP','Gunzipped %s.gz' % resfile)
             if self.opt['HMMPFam']: self.readHMMPFamSearch(resfile,readaln)
             else: self.readHMMSearch(resfile,readaln)
             if self.opt['GZip'] and os.path.exists(resfile):
                 rje.backup(self,'%s.gz' % resfile,unlink=True)
                 os.system('gzip %s' % resfile)
                 self.printLog('#GZIP','%s gzipped to save space' % resfile)
     except:
         self.log.errorLog('Hmm indeed. rje_hmm.readResults() gone awry!',quitchoice=True)
         return False
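readResults() transparently gunzips a missing results file when only a .gz copy exists, parses it, then re-gzips it to save space. A standalone sketch of the same wrap-around pattern using Python's gzip/shutil instead of shelling out; parse_results() is a hypothetical stand-in for readHMMSearch()/readHMMPFamSearch():

import gzip, os, shutil
from contextlib import contextmanager

@contextmanager
def maybe_gunzipped(resfile):
    '''Yield a readable path, expanding resfile.gz if the plain file is absent,
    and re-compress the file on the way out.'''
    gzfile = resfile + '.gz'
    if not os.path.exists(resfile) and os.path.exists(gzfile):
        with gzip.open(gzfile, 'rb') as zin, open(resfile, 'wb') as out:
            shutil.copyfileobj(zin, out)
        os.remove(gzfile)
    try:
        yield resfile
    finally:
        if os.path.exists(resfile):
            with open(resfile, 'rb') as src, gzip.open(gzfile, 'wb') as zout:
                shutil.copyfileobj(src, zout)
            os.remove(resfile)

# with maybe_gunzipped('search1.hmmer.out') as path:
#     parse_results(path)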
Example #4
 def dpi(self):  ### Domain-protein interactions
     '''Domain-protein interactions.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not self.dict['Domain']: return
         outdir = 'SLiMPID_DPI'
         rje.mkDir(self,outdir)
         dpi = {}            # Dictionary of {domain:[interactors]}
         badname = []
         ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for dom in rje.sortKeys(self.dict['Domain']):
             dpi[dom] = []
             for hub in self.dict['Domain'][dom]:
                 if hub in self.dict['PPI']: dpi[dom] += self.dict['PPI'][hub]      # Add with redundancy
             for spoke in dpi[dom][0:]:
                 if dpi[dom].count(spoke) == 1: dpi[dom].remove(spoke)   # Must have 2+ domain interactions
             for hub in self.dict['Domain'][dom]:
                 if hub not in self.dict['PPI']: continue
                 for spoke in self.dict['PPI'][hub][0:]:
                     if spoke in dpi[dom]:
                         self.dict['PPI'][hub].remove(spoke)
                         if spoke in self.dict['PPI'] and hub in self.dict['PPI'][spoke]: self.dict['PPI'][spoke].remove(hub)
             dpi[dom] = rje.sortUnique(dpi[dom],False,False)
             acc = []
             for name in dpi[dom]:
                 if not name: continue
                 if name in self.dict['Seq']: acc.append(self.dict['Seq'][name].info['AccNum'])
                 elif name not in badname: badname.append(name) 
             open('%s/%s.dpi.acc' % (outdir,dom),'w').write(string.join(acc,'\n'))
             self.printLog('#DPI','%s domain => %d interactors' % (dom,len(acc)))
         if badname:
             badname.sort()
             self.printLog('#BAD','%d "bad" protein names: %s' % (len(badname),string.join(badname,'; ')))
         ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         hx = len(self.dict['PPI'])
         for hub in rje.sortKeys(self.dict['PPI']):
             if hub and self.dict['PPI'][hub]: continue
             self.dict['PPI'].pop(hub)
             self.printLog('#DPI','No %s PPI left after DPI removed' % hub,screen=False)
         self.printLog('#PPX','%s of %s PPI hubs remain after DPI removed' % (rje.integerString(len(self.dict['PPI'])),rje.integerString(hx)))
     except: self.errorLog('Problem with SLiMPID.dpi()',quitchoice=True)
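dpi() mirrors fpi() at the domain level; the step worth isolating is the reciprocal clean-up, where an interaction absorbed into the domain set is removed from both the hub's and the spoke's PPI lists. A standalone sketch on a plain {node: [partners]} map (illustrative, not the SLiMPID API):

def drop_edge(ppi, hub, spoke):
    '''Remove the hub-spoke interaction from both directions of the map.'''
    if hub in ppi and spoke in ppi[hub]:
        ppi[hub].remove(spoke)
    if spoke in ppi and hub in ppi[spoke]:
        ppi[spoke].remove(hub)

ppi = {'P1': ['P2', 'P3'], 'P2': ['P1'], 'P3': ['P1']}
drop_edge(ppi, 'P1', 'P2')
print(ppi)   # {'P1': ['P3'], 'P2': [], 'P3': ['P1']}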
Example #5
 def treeListSPCode(self):  ### Main taxa mapping from list of tree files
     '''Main taxa mapping from list of tree files.'''
     try:  ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         db = self.db()
         specdb = self.db('spcode',
                          add=True,
                          forcecheck=True,
                          mainkeys=['protein'])
         if not specdb and self.getStrLC('TaxBase') and not self.force():
             spfile = '%s.spcode.tdt' % self.getStr('TaxBase')
             specdb = db.addTable(spfile,
                                  mainkeys=['protein'],
                                  name='spcode',
                                  expect=False)
         if specdb:
             specdb.dataFormat({'boot': 'num'})
             return True
         specdb = db.addEmptyTable(
             'spcode',
             ['protein', 'boot', 'spcode', 'inpara', 'paralogues'],
             ['protein'])
         #dupdb = db.addEmptyTable('para',['protein','paralogues'],['protein'])
         self.dict['Duplicates'] = {}  # {prot1:[dups]}
         ### ~ [2] ~ Add main run code here ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for nwkfile in self.list['NwkList']:
             tree = rje_tree.Tree(self.log, self.cmd_list)
             tree.loadTree(nwkfile, seqlist=None, postprocess=False)
             seqacc = rje.baseFile(nwkfile, strip_path=True)
             # Identify node corresponding to query sequence
             seqnode = None
             for node in tree.nodes():
                 try:
                     if string.split(node.shortName(), '__')[1] == seqacc:
                         seqnode = node
                 except:
                     pass  # Internal node or bad sequence format
             if not seqnode:
                 self.warnLog('Could not find %s in %s nodes!' %
                              (seqacc, nwkfile))
                 continue
             # Get species code for query sequence
             seqspec = tree.cladeSpec(seqnode)
             if len(seqspec) != 1:
                 self.warnLog('Could not find species in %s node!' %
                              (seqacc))
                 continue
             seqspec = seqspec.keys()[0]
             if seqspec != string.split(seqnode.shortName(), '_')[1]:
                 raise ValueError('Species mismatch for %s & %s' %
                                  (seqacc, seqnode.shortName()))
             # Find ancestor with closest orthologue outgroup
             rootnode = tree._getRootNode()
             if not rootnode:
                 self.warnLog('Could not find root node in %s!' % (nwkfile))
                 continue
             ancnode = seqnode.ancNode()
             try:
                 bootx = float(ancnode.ancBranch().stat['Bootstrap']
                               ) / tree.stat['Bootstraps']
             except:
                 bootx = 1.0
             inparanode = None  # Node to define in-paralogues
             ancspec = tree.cladeSpec(ancnode)
             while len(ancspec) < 2 or bootx < self.getNum('MinBoot'):
                 inparanode = ancnode  # All same species
                 if ancnode == rootnode: break
                 ancnode = ancnode.ancNode()
                 ancspec = tree.cladeSpec(ancnode)
                 try:
                     bootx = float(ancnode.ancBranch().stat['Bootstrap']
                                   ) / tree.stat['Bootstraps']
                 except:
                     bootx = 1.0
             ancspec.pop(
                 seqspec)  # Now only have counts of closest other species
             # Update table, replacing species codes with genera?
             sentry = {
                 'protein': seqacc,
                 'spcode': rje.sortUnique(ancspec.keys())
             }
             sentry['boot'] = bootx
             if not ancspec:
                 sentry['spcode'] = ['None']
                 sentry['boot'] = self.getNum('NoneBoot')
             sentry['spcode'] = string.join(sentry['spcode'], '|')
             # Establish list of duplicate proteins
             inpara = []  # List of in-paralogue nodes
             inparacc = []  # List of in-paralogue accnum
             if inparanode:
                 inpara = tree._nodeClade(inparanode, internal=False)
             self.dict['Duplicates'][seqacc] = []
             for node in tree._nodeClade(rootnode, internal=False):
                 if node == seqnode: continue
                 if len(string.split(node.shortName(), '_')) < 2: continue
                 if string.split(node.shortName(), '_')[1] == seqspec:
                     paracc = string.split(node.shortName(), '__')[1]
                     if node in inpara: inparacc.append(paracc)
                     else: self.dict['Duplicates'][seqacc].append(paracc)
             sentry['inpara'] = string.join(inparacc, '|')
             sentry['paralogues'] = string.join(
                 self.dict['Duplicates'][seqacc], '|')
             specdb.addEntry(sentry)
         ## Update specdb and save
         specdb.saveToFile()
         #dupdb.saveToFile()
         return True
     except:
         self.errorLog(self.zen())
         return False
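The heart of treeListSPCode() is the climb from the query leaf towards the root until the clade spans a second species with bootstrap support of at least MinBoot: same-species leaves below that point are in-paralogues, and the other species in the clade are the closest orthologue outgroup. A minimal sketch of that climb with a toy node class standing in for rje_tree (illustrative only; bootstraps are stored per node as fractions here):

class Node:
    def __init__(self, name=None, species=None, parent=None, boot=1.0):
        self.name, self.species, self.parent, self.boot = name, species, parent, boot
        self.children = []
        if parent: parent.children.append(self)
    def leaves(self):
        return [self] if not self.children else [lf for c in self.children for lf in c.leaves()]

def closest_outgroup(leaf, minboot=0.5):
    '''Return (closest outgroup species, in-paralogue leaves) for a query leaf.'''
    inpara_root, anc = None, leaf.parent
    while anc:
        species = {lf.species for lf in anc.leaves()}
        if len(species) > 1 and anc.boot >= minboot:
            inpara = [] if inpara_root is None else [lf for lf in inpara_root.leaves() if lf is not leaf]
            return species - {leaf.species}, inpara
        inpara_root, anc = anc, anc.parent      # clade still single-species or weakly supported
    return set(), []

# Toy tree: ((query_HUMAN, dup_HUMAN), orth_MOUSE)
root = Node(boot=1.0); clade = Node(parent=root, boot=0.9)
query = Node('query', 'HUMAN', clade); dup = Node('dup', 'HUMAN', clade); orth = Node('orth', 'MOUSE', root)
spec, inpara = closest_outgroup(query)
print(spec, [n.name for n in inpara])   # {'MOUSE'} ['dup']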
Example #6
    def setup(self):    ### Main class setup method.
        '''Main class setup method.'''
        try:### ~ [1] Setup Objects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if not self.getStrLC('DBSource'): self.setStr({'DBSource':string.split(rje.stripPath(self.getStr('MITAB')),'.')[0]})
            if not self.obj['DB']: self.obj['DB'] = rje_db.Database(self.log,self.cmd_list)
            pdb = self.db('pairwise',add=False)
            pfields = ['#','Hub','Spoke','HubUni','SpokeUni','HubTaxID','SpokeTaxID','Evidence','IType']
            if not pdb: self.db().addEmptyTable('pairwise',pfields,['#'],log=True)
            if not self.obj['XRef']:
                xcmd = ['mapfields=Gene,%s,Secondary,Ensembl,Aliases,Accessions,RefSeq,Previous Symbols,Synonyms' % self.getStr('UniField')]
                self.obj['XRef'] = rje_xref.XRef(self.log,xcmd+self.cmd_list)
                self.obj['XRef'].setup()
            skip_comments = True
            for field in self.list['IDField']:
                if field[:1] == '#': skip_comments = False
            if self.list['MapDB'] and 'uniprotkb' not in self.list['MapDB']:
                self.list['MapDB'].append('uniprotkb')
                self.printLog('#MAP','uniprotkb added to MapDB list.')
            elif not self.list['MapDB']: self.printLog('#MAP','No MapDB list: will attempt to match all IDs to xref KeyID "%s".' % self.obj['XRef'].getStr('KeyID'))
            ### ~ [2] Setup MITAB File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.open('MITAB')
            if not self.file['MITAB']: raise IOError
            self.printLog('#MITAB','Parse PPI from %s.' % self.getStr('MITAB'))
            ## ~ [2a] MITAB file headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            headers = []
            while not headers:
                self.list['Headers'] = headers = self.readDelimit('MITAB')
                if not headers: break
                if headers[0][:1] == '#' and skip_comments: headers = []; continue
            #self.debug(headers)
            ## ~ [2b] IDField headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            idfields = []
            for hfield in headers:
                #self.bugPrint(hfield.upper())
                for idfield in rje.sortUnique(self.list['IDField'])[0:]:
                    idfield = string.replace(idfield.upper(),'(','\(')
                    idfield = string.replace(idfield,')','\)')
                    idmatch = rje.matchExp('^(%s\s?[AB])$' % idfield.upper(),hfield.upper())
                    if not idmatch: idmatch = rje.matchExp('^(%s\s?[AB]) \(\S+\)$' % idfield.upper(),hfield.upper())
                    if idmatch and hfield not in idfields:
                        idfields.append(hfield)
                        self.printLog('#ID','IDField: %s' % hfield)
                        #self.bugPrint(idfields)
                        break
            #self.debug(idfields)
            self.list['IDField'] = idfields
            if not self.list['IDField']: raise ValueError('No IDField found in MITAB headers.')
            ## ~ [2c] TaxaField headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            taxafields = []
            for tfield in self.list['TaxaField'][0:]:
                for hfield in headers:
                    tmatch = rje.matchExp('^(%s\s?[AB])$' % tfield.upper(),hfield.upper())
                    if not tmatch: tmatch = rje.matchExp('^(%s\s?[AB]) \(\S+\)$' % tfield.upper(),hfield.upper())
                    if tmatch and hfield not in taxafields:
                        taxafields.append(hfield)
                        self.printLog('#TAX','TaxaField: %s' % hfield)
            self.list['TaxaField'] = taxafields
            if not self.list['TaxaField']: self.warnLog('No TaxaField found in MITAB headers.',quitchoice=True)
            ## ~ [2d] MethodField headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            methfields = []
            lctypes = rje.listLower(self.list['MethodField'])
            for hfield in headers:
                if hfield.lower() in lctypes:
                    methfields.append(hfield)
                    self.printLog('#METH','MethodField: %s' % hfield)
            self.list['MethodField'] = methfields
            if not self.list['MethodField']: self.warnLog('No MethodField found in MITAB headers.',quitchoice=True)
            ## ~ [2e] TypeField headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            typefields = []
            lctypes = rje.listLower(self.list['TypeField'])
            for hfield in headers:
                if hfield.lower() in lctypes:
                    typefields.append(hfield)
                    self.printLog('#TYPE','TypeField: %s' % hfield)
            self.list['TypeField'] = typefields
            if not self.list['TypeField']: self.warnLog('No TypeField found in MITAB headers.',quitchoice=True)

            return True     # Setup successful
        except: self.errorLog('Problem during %s setup.' % self.prog()); return False  # Setup failed
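The header detection above matches each configured field name against MITAB column headers of the form '<field> A' / '<field> B', optionally followed by a bracketed qualifier. A standalone sketch of that match with re in place of rje.matchExp (header strings are illustrative):

import re

def match_ab_field(field, header):
    '''True if header is field + A/B, optionally with a bracketed qualifier.'''
    pat = re.escape(field.upper())
    return bool(re.match(r'^%s\s?[AB](?: \(\S+\))?$' % pat, header.upper()))

headers = ['#ID(s) interactor A', 'ID(s) interactor B', 'Taxid interactor A', 'Interaction type(s)']
print([h for h in headers if match_ab_field('ID(s) interactor', h.lstrip('#'))])
# ['#ID(s) interactor A', 'ID(s) interactor B']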
Example #7
    def parseMITAB(self):   ### Parse MITAB file into pairwise PPI table.
        '''Parse MITAB file into pairwise PPI table.'''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            xref = self.obj['XRef']
            pdb = self.db('pairwise')
            pfields = ['Hub','Spoke','HubUni','SpokeUni','HubTaxID','SpokeTaxID','Evidence','IType']
            headers = {}
            for h in range(len(self.list['Headers'])): headers[self.list['Headers'][h]] = h
            dbsource = self.getStr('DBSource')
            ### ~ [2] Read through MITAB ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            mx = 0; ex = 0; fax = 0; ftx = 0; fx = 0; uhx = 0; usx = 0
            epos = self.endPos('MITAB')
            complexidlist = []
            badtaxa = ['-']
            baduni = []
            while 1:
                self.progLog('\r#MITAB','Parsing %s MITAB %s: %s lines; %s ppi; %s taxa-filtered; %s ambiguous; %s failed; %s complexes.' % (dbsource,self.fileProg('MITAB',epos),rje.iStr(mx),rje.iStr(ex),rje.iStr(ftx),rje.iStr(fax),rje.iStr(fx),rje.iLen(complexidlist)))
                mline = self.readDelimit('MITAB'); mx += 1
                if not mline: break
                entry = {'#':pdb.entryNum()}
                for field in pfields: entry[field] = ''
                ## ~ [2a] Add iRefIndex complexes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                complexid = {}     # This will take the first complex ID
                if 'irigid' in self.list['Headers'] and 'numParticipants' in self.list['Headers']:
                    if int(mline[headers['numParticipants']]) > 2:
                        complexid['A'] = complexid['B'] = 'rigid:%s' % mline[headers['irigid']]
                        #self.bugPrint(mline)
                        #self.debug(complexid)
                ## ~ [2b] Parse and check taxa ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                taxa = {'A':'','B':''}
                for tfield in self.list['TaxaField']:
                    ab = tfield[-1:].upper()
                    if ab == ')': ab = rje.matchExp('([AB]) \(\S+\)$',tfield.upper())[0]
                    try:
                        taxon = rje.matchExp('^taxid:(\d+)',mline[headers[tfield]].lower())[0]
                        if self.list['TaxID'] and taxon not in self.list['TaxID']: continue
                        taxa[ab] = taxon
                    except:
                        taxon = mline[headers[tfield]]
                        if taxon not in badtaxa:
                            badtaxa.append(taxon)
                            self.warnLog('No TaxID read from %s: "%s"' % (tfield,taxon),'no_tax',suppress=True)
                        if not self.list['TaxID']: taxa[ab] = '-'
                if not taxa['A'] and complexid: taxa['A'] = taxa['B']
                if not taxa['B'] and complexid: taxa['B'] = taxa['A']
                if not (taxa['A'] and taxa['B']): ftx += 1; continue
                ## ~ [2c] Parse protein IDs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                ids = {'A':[],'B':[]}
                uni = {'A':'','B':''}
                for ifield in self.list['IDField']:
                    ab = ifield[-1:].upper()
                    if ab == ')': ab = rje.matchExp('([AB]) \(\S+\)$',ifield.upper())[0]
                    # Split IDs on | then db:id vs self.list['MapDB']
                    for pid in string.split(mline[headers[ifield]],'|'):
                        try: (db,dbid) = string.split(pid,':',1)
                        except: continue
                        if db.lower() in ['uniprotkb'] and '(' in dbid: continue    # Only map uniprotkb accnum
                        dbid = string.split(dbid,'(')[0]
                        dbid = string.split(dbid,';')[0]
                        if db.lower() in ['uniprotkb']:
                            svid = dbid
                            dbid = string.split(svid,'-')[0]
                        if ab not in complexid:     # First identifier for A/B
                            if db.lower() in self.list['Complex']: complexid[ab] = pid; ids[ab].append(pid)
                            else: complexid[ab] = ''
                        if not self.list['MapDB'] or db.lower() in self.list['MapDB']: ids[ab].append(dbid)
                        # Parse uniprot directly if possible
                        if db.lower() in ['uniprotkb'] and not uni[ab]:
                            if self.getBool('SpliceVar'): uni[ab] = svid
                            else: uni[ab] = dbid
                #self.bugPrint(ids)
                ## ~ [2d] Map parsed IDs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                amb = {'A':False,'B':False}
                if not ids['A'] or not ids['B']:
                    #self.bugPrint('%s\n=> ID Failure' % mline)
                    #self.bugPrint(ids['A']); self.bugPrint(ids['B'])
                    #self.bugPrint(entry)
                    fx += 1; continue
                for ida in ids['A']:
                    #self.debug('%s => %s (or %s)' % (ida,xref.xref(ida,unique=True),xref.xref(ida,unique=False)))
                    if not entry['Hub']: entry['Hub'] = xref.xref(ida,unique=True,usedict=True)
                    if entry['Hub'] == False: amb['A'] = True
                    #if not entry['HubUni']: entry['HubUni'] = xref.xref(ida,self.getStr('UniField'),unique=True,usedict=True)
                    if not entry['HubUni']: entry['HubUni'] = self.getUniXRef(ida)
                if self.getBool('AddUni') and not entry['HubUni']:
                    entry['HubUni'] = uni['A']
                    if uni['A'] and uni['A'] not in baduni: baduni.append(uni['A'])
                if not entry['Hub'] and entry['HubUni']:
                    entry['Hub'] = entry['HubUni']
                    #self.warnLog('UniprotKB "%s" used for Hub' % entry['HubUni'],'unihub',suppress=True)
                    uhx += 1
                if not entry['Hub'] and complexid['A']:
                    entry['Hub'] = complexid['A']
                else: complexid['A'] = ''
                if self.getBool('UniOnly') and not complexid['A'] and not entry['HubUni']: entry['Hub'] = ''
                for idb in ids['B']:
                    if not entry['Spoke']: entry['Spoke'] = xref.xref(idb,unique=True,usedict=True)
                    if entry['Spoke'] == False: amb['B'] = True
                    #if not entry['SpokeUni']: entry['SpokeUni'] = xref.xref(idb,self.getStr('UniField'),unique=True,usedict=True)
                    if not entry['SpokeUni']: entry['SpokeUni'] = self.getUniXRef(idb)
                if self.getBool('AddUni') and not entry['SpokeUni']: entry['SpokeUni'] = uni['B']
                if not entry['Spoke'] and entry['SpokeUni']:
                    entry['Spoke'] = entry['SpokeUni']
                    #self.warnLog('UniprotKB "%s" used for Spoke' % entry['SpokeUni'],'unihub',suppress=True)
                    usx += 1
                if not entry['Spoke'] and complexid['B']:
                    entry['Spoke'] = complexid['B']
                else: complexid['B'] = ''
                if self.getBool('UniOnly') and not complexid['B'] and not entry['SpokeUni']:
                    entry['Spoke'] = ''
                    if uni['B'] and uni['B'] not in baduni: baduni.append(uni['B'])
                if complexid['A'] and complexid['B']:
                    if not (complexid['A'].startswith('rigid:') and complexid['B'].startswith('rigid:')):
                        self.printLog('\r#MITAB','',log=False)
                        self.warnLog('Cannot parse complex:complex PPI (%s & %s)' % (complexid['A'],complexid['B']),'complex-complex',suppress=True)
                    entry['Hub'] = entry['Spoke'] = ''
                #self.bugPrint(entry)
                #self.debug(complexid)
                if not (entry['Hub'] and entry['Spoke']):
                    if (entry['Hub'] or amb['A']) and (entry['Spoke'] or amb['B']):
                        fax += 1; continue
                    #self.bugPrint(mline); self.debug(entry)
                    fx += 1; continue
                #if self.dev() and 'PCNA' not in [entry['Hub'],entry['Spoke']]: continue
                entry['HubTaxID'] = taxa['A']
                entry['SpokeTaxID'] = taxa['B']
                if complexid['A'] and complexid['A'] not in complexidlist: complexidlist.append(complexid['A'])
                if complexid['B'] and complexid['B'] not in complexidlist: complexidlist.append(complexid['B'])
                #if complexid['A'] or complexid['B']: self.debug(entry)
                ## ~ [2c] Parse evidence ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                #self.bugPrint(mline)
                evidence = []
                for tfield in self.list['MethodField']:
                    #self.bugPrint(string.split(mline[headers[tfield]],'|'))
                    for etype in string.split(mline[headers[tfield]],'|'):
                        ematch = rje.matchExp('MI:\d+"?\((.+)\)',etype)
                        if ematch: evidence.append('%s:%s' % (dbsource,ematch[0]))
                if not evidence: evidence.append('%s:unknown' % (self.getStr('DBSource')))
                evidence = rje.sortUnique(evidence)
                #self.debug(evidence)
                entry['Evidence'] = string.join(evidence,'|')
                ## ~ [2d] Parse interaction types ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                itypes = []
                for tfield in self.list['TypeField']:
                    #self.bugPrint(string.split(mline[headers[tfield]],'|'))
                    for etype in string.split(mline[headers[tfield]],'|'):
                        ematch = rje.matchExp('MI:\d+"?\((.+)\)',etype)
                        if ematch: itypes.append(ematch[0])
                if not itypes: itypes.append('unknown')
                itypes = rje.sortUnique(itypes)
                #self.debug(itypes)
                entry['IType'] = string.join(itypes,'|')
                pdb.addEntry(entry); ex += 1
                if self.dev() and entry['Hub'] in ['KLF3']:#,'WDR5']:
                    self.printLog('#DEV',string.join(mline,'\t'))
                    #self.bugPrint(uni); self.debug(entry)
                if self.getBool('Symmetry') and not complexid['A'] and not complexid['B']:
                    pdb.addEntry({'#':pdb.entryNum(),'Hub':entry['Spoke'],'Spoke':entry['Hub'],
                                  'HubUni':entry['SpokeUni'],'SpokeUni':entry['HubUni'],
                                  'HubTaxID':entry['SpokeTaxID'],'SpokeTaxID':entry['HubTaxID'],
                                  'Evidence':entry['Evidence'],'IType':entry['IType']})
            self.printLog('\r#MITAB','Parsing %s MITAB complete: %s lines; %s ppi; %s taxa-filtered; %s ambiguous; %s failed; %s complexes.' % (dbsource,rje.iStr(mx),rje.iStr(ex),rje.iStr(ftx),rje.iStr(fax),rje.iStr(fx),rje.iLen(complexidlist)))
            self.close('MITAB')
            if (uhx+usx): self.warnLog('UniprotKB IDs used for %s Hub and %s Spoke IDs.' % (rje.iStr(uhx),rje.iStr(usx)))
            if baduni:
                baduni.sort()
                accout = '%s.%s.unmapped.uniacc' % (self.baseFile(),dbsource)
                self.warnLog('%s unmapped UniprotKB IDs used: output to %s.' % (rje.iLen(baduni),accout))
                open(accout,'w').write(string.join(baduni,'\n'))

            ### ~ [3] Convert complexes to pairwise PPIs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if not complexidlist: return pdb
            self.printLog('#CPLEX','%s complex IDs parsed to convert to pairwise PPI.' % rje.iLen(complexidlist))
            ## ~ [3a] Assemble complex memberships ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            complexes = {}; chentries = []; csentries = []
            cevidence = {}  # List of Evidence for each complex
            citypes = {}    # List of ITypes for each complex
            ctaxa = {}
            ex = 0.0; etot = pdb.entryNum()
            for entry in pdb.entries():
                self.progLog('\r#CPLEX','Assembling complexes: %.1f%%' % (ex/etot)); ex += 100.0
                if entry['Hub'] in complexidlist:
                    cid = entry['Hub']
                    if cid not in complexes: complexes[cid] = []; cevidence[cid] = []; citypes[cid] = []
                    complexes[cid].append(entry['Spoke'])
                    ctaxa[entry['Spoke']] = entry['SpokeTaxID']
                    cevidence[cid].append(entry['Evidence'])
                    citypes[cid].append(entry['IType'])
                    chentries.append(entry)
                elif entry['Spoke'] in complexidlist:
                    cid = entry['Spoke']
                    if cid not in complexes: complexes[cid] = []; cevidence[cid] = []; citypes[cid] = []
                    complexes[cid].append(entry['Hub'])
                    ctaxa[entry['Hub']] = entry['HubTaxID']
                    cevidence[cid].append(entry['Evidence'])
                    citypes[cid].append(entry['IType'])
                    csentries.append(entry)
            self.printLog('\r#CPLEX','Assembled %s of %s complexes.' % (rje.iLen(complexes),rje.iLen(complexidlist)))
            #self.debug(complexes)
            ## ~ [3b] Update complexes dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            cppi = {}
            ex = 0.0; etot = len(complexes); rx = 0; px = 0; cmax = 0
            for cid in rje.sortKeys(complexes):
                self.progLog('\r#CPLEX','Reducing complexes: %.1f%%' % (ex/etot)); ex += 100.0
                if self.dev(): self.printLog('#DEV','Complex %s: %s' % (cid,complexes[cid]))
                if len(complexes[cid]) < 2:
                    complexes.pop(cid)
                    cevidence.pop(cid)
                    citypes.pop(cid)
                    rx += 1; continue
                complexes[cid].sort()
                #cevidence[cid] = string.join(rje.sortUnique(cevidence[cid]),'|')
                #citypes[cid] = string.join(rje.sortUnique(citypes[cid]),'|')
                cmax = max(cmax,len(complexes[cid]))
                #px += (len(complexes[cid]) * (len(complexes[cid])-1))
                members = complexes[cid][0:]
                while members:
                    hub = members.pop(0)
                    if self.dev() and hub == 'KLF3': self.debug(cid)
                    if hub not in cppi: cppi[hub] = {}
                    for spoke in members:
                        if spoke not in cppi[hub]:
                            cppi[hub][spoke] = []; px += 1
                            cppi[hub][spoke].append(cid)
            self.printLog('\r#CPLEX','Reduced %s complexes to %s > 1 member: %s ppi to add.' % (rje.iStr(etot),rje.iLen(complexes),rje.iStr(px)))
            ## ~ [3c] Update pairwise PPI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            cix = pdb.entryNum()
            for centry in chentries + csentries: pdb.dropEntry(centry)
            ex = 0.0; etot = len(cppi)
            for hub in rje.sortKeys(cppi):
                self.progLog('\r#CPLEX','Expanding complexes: %.1f%%' % (ex/etot)); ex += 100.0
                #hentry = {'Hub':hub,'HubUni':xref.xref(hub,self.getStr('UniField'),unique=True,usedict=True),'HubTaxID':ctaxa[hub]}
                hentry = {'Hub':hub,'HubUni':self.getUniXRef(hub),'HubTaxID':ctaxa[hub]}
                for spoke in rje.sortKeys(cppi[hub]):
                    evidence = []
                    itypes = []
                    ctypes = []
                    for cid in cppi[hub][spoke]:
                        evidence += cevidence[cid]
                        itypes += citypes[cid]
                        ctypes.append(string.split(cid,':')[0])     # db prefix of the complex ID, e.g. 'rigid'
                    ctype = string.join(rje.sortUnique(ctypes),'|')
                    evidence = string.join(rje.sortUnique(evidence),'|')
                    if not evidence: evidence = '%s:%s' % (dbsource,ctype)
                    itypes = string.join(rje.sortUnique(itypes),'|')
                    if not itypes: itypes = ctype
                    #newentry = {'#':cix,'Spoke':spoke,'SpokeUni':xref.xref(spoke,self.getStr('UniField'),unique=True,usedict=True),'SpokeTaxID':ctaxa[spoke]}
                    newentry = {'#':cix,'Spoke':spoke,'SpokeUni':self.getUniXRef(spoke),'SpokeTaxID':ctaxa[spoke]}
                    newentry['Evidence'] = evidence
                    newentry['IType'] = itypes
                    entry = pdb.addEntry(rje.combineDict(newentry,hentry,overwrite=False)); cix += 1
                    if self.dev() and entry['Hub'] in ['KLF3','WDR5']: self.debug('Complex: %s' % entry)
                    if self.getBool('Symmetry'):
                        pdb.addEntry({'#':cix,'Hub':entry['Spoke'],'Spoke':entry['Hub'],
                                      'HubUni':entry['SpokeUni'],'SpokeUni':entry['HubUni'],
                                      'HubTaxID':entry['SpokeTaxID'],'SpokeTaxID':entry['HubTaxID'],
                                      'Evidence':entry['Evidence'],'IType':entry['IType']})
                        cix += 1
            self.printLog('#CPLEX','%s complex IDs expanded to pairwise PPI => %s ppi (symmetry=%s).' % (rje.iLen(complexidlist),rje.iStr(pdb.entryNum()),self.getBool('Symmetry')))
            return pdb
        except: self.errorLog('%s.parseMITAB error' % self.prog())
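Two small pieces of parseMITAB() carry most of the parsing work: identifier cells are '|'-separated db:id pairs (with bracketed qualifiers and uniprotkb splice-variant suffixes stripped), and method/type cells carry PSI-MI annotations of the form MI:nnnn(term). A standalone sketch of both steps, assuming typical MITAB cell contents (illustrative only, not the class API):

import re

def parse_ids(cell, mapdb=('uniprotkb',)):
    '''Return [(db, id), ...] for the databases of interest in one MITAB ID cell.'''
    out = []
    for pid in cell.split('|'):
        if ':' not in pid: continue
        db, dbid = pid.split(':', 1)
        dbid = dbid.split('(')[0].split(';')[0]     # strip bracketed/trailing qualifiers
        if db.lower() == 'uniprotkb':
            dbid = dbid.split('-')[0]               # drop splice-variant suffix
        if not mapdb or db.lower() in mapdb:
            out.append((db.lower(), dbid))
    return out

def parse_mi_terms(cell):
    '''Pull readable names out of MI:nnnn(...) annotations.'''
    return re.findall(r'MI:\d+"?\((.+?)\)', cell)

print(parse_ids('uniprotkb:P04637-2|refseq:NP_000537'))              # [('uniprotkb', 'P04637')]
print(parse_mi_terms('psi-mi:"MI:0004"(affinity chromatography)'))   # ['affinity chromatography']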
Example #8
 def seqSubset2(self):    ### Extracts sequence subset from MOUSE cDNA and Peptide libraries
     '''Extracts sequence subset from MOUSE cDNA and Peptide libraries.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if os.path.exists('%s.map.tdt' % self.baseFile()):
             mdb = self.db().addTable('%s.map.tdt' % self.baseFile(),mainkeys=['Ingolia'],name='map')
         else:
             ### ~ [2] Load Mouse Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             xfile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.data.tdt'
             xref = self.db().addTable(xfile,mainkeys=['Gene'],name='xref')
             afile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.alias.tdt'
             self.obj['Map'] = rje_genemap.GeneMap(self.log,self.cmd_list)
             #self.obj['Map'].loadPickle('../../../../../Databases/DBase_120225/MGI/mousemap.120324.pickle')
             self.obj['Map'].loadData(['sourcedata=%s' % xfile,'aliases=%s' % afile])
             ing_genes = string.split(string.join(self.db('starts').index('Gene').keys()).upper())
             map = self.obj['Map']
             ing_map = {}
             for gene in ing_genes: ing_map[gene] = map.bestMap(gene)
             ing_mgi = rje.sortUnique(ing_map.values())
             self.printLog('#MUSG','%s Ingolia genes mapped onto %s MGI genes' % (rje.iLen(ing_genes),rje.iLen(ing_mgi)))
             xdb = self.db('xref')
             bad_genes = []
             for gene in ing_mgi[0:]:
                 if gene not in xdb.data():
                     self.printLog('#MAP','Cannot map gene "%s" from Ingolia data!' % gene)
                     bad_genes.append(gene); ing_mgi.remove(gene)
             self.printLog('#BAD','Failed to map %s genes from Ingolia' % rje.iLen(bad_genes))
             open('ingolia.bad.txt','w').write(string.join(bad_genes))
             ### ~ [2] EnsEMBL subset ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             ing_musg = xdb.dataList(xdb.entryList(ing_mgi),'EnsEMBL',sortunique=True)
             if '' in ing_musg: ing_musg.remove('')
             self.printLog('#MUSG','%s Ingolia genes mapped onto %s EnsEMBL genes' % (rje.iLen(ing_genes),rje.iLen(ing_musg)))
             if not ing_musg: raise ValueError
             self.deBug(ing_musg[:10])
             for stype in ['cdna','pep']:
                 seqfile = '../MOUSE/Mus_musculus.NCBIM37.66.%s.all.fa' % stype
                 if self.getBool('Force') or not os.path.exists(seqfile):
                     seqout = 'Ingolia.%s.all.fa' % stype
                     seqcmd = self.cmd_list + ['seqin=%s' % seqfile,'seqout=%s' % seqout,'autofilter=T','autload=T','seqmode=file','gooddesc=%s' % string.join(ing_musg,',')]
                     rje_seqlist.SeqList(self.log,seqcmd)
             mdb = self.db().addEmptyTable('map',['Ingolia','Gene','EnsEMBL'],['Ingolia'])
             for gene in ing_map:
                 entry = {'Ingolia':gene,'Gene':ing_map[gene]}
                 if entry['Gene'] in bad_genes: entry['EnsEMBL'] = ''
                 else: entry['EnsEMBL'] = xdb.data()[ing_map[gene]]['EnsEMBL']
                 mdb.addEntry(entry)
         seqfile = 'Ingolia.cdna.all.fa'
         seqcmd = self.cmd_list + ['seqin=%s' % seqfile,'autofilter=F','autload=T','seqmode=file']
         iseq = rje_seqlist.SeqList(self.log,seqcmd)
         if 'ENST' not in mdb.fields():
             mdb.addField('ENST',evalue='')
             while iseq.nextSeq():
                 (iname,icdna) = iseq.getSeq()
                 musg = rje.matchExp('gene:(\S+)',iname)[0]
                 for entry in mdb.indexEntries('EnsEMBL',musg):
                     if entry['ENST']: entry['ENST'] += ',%s' % string.split(iname)[0]
                     else: entry['ENST'] = string.split(iname)[0]
             mdb.saveToFile()
         ### ~ [3] Generate new start sites from Ignolia Harrington data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         sdb = self.db('starts')
         sdb.dataFormat({'Init Codon [nt]':'int'})
         icod = 'Init Codon [nt]'
         icon = 'Init Context [-3 to +4]'
         sdb.info['Name'] = 'mapped_start'
         sdb.addField('ENST'); sdb.addField('ENSP'); sdb.addField('ENSI');
         ENST = open('IngExact.cdna.all.fa','w')
         ENSP = open('IngExact.pep.all.fa','w')
         ex = 0.0; etot = sdb.entryNum(); sx = 0; fx = 0
         minpep = 20
         for entry in sdb.entries():
             self.progLog('\r#ING','Mapping Ingolia Harrington Starts: %.2f%%' % (ex/etot)); ex += 100.0
             #self.deBug(entry)
             entry[icon] = entry[icon].upper()
             gene = entry['Gene'].upper()
             mentry = mdb.data(gene)
             entry['ENST'] = entry['ENSI'] = ''
             cdnaseq = peptseq = ''
             if not mentry or not mentry['ENST']: fx += 1; continue
             #self.deBug(mentry)
             mtype = 'fail'
             for trans in string.split(mentry['ENST'],','):
                 (tname,tseq) = iseq.getDictSeq(trans,format='tuple')
                 self.deBug('%s vs %s' % (tseq[entry[icod]-3:][:7],entry[icon]))
                 if tseq[entry[icod]-3:][:7] == entry[icon]:
                     ipept = string.split(rje_sequence.dna2prot(tseq[entry[icod]:]),'*')[0]
                     self.deBug(ipept)
                     if len(ipept) > len(peptseq):
                         entry['ENST'] = trans
                         cdnaseq = tseq
                         peptseq = ipept
                         mtype = 'exact'
             if not entry['ENST']:
                 self.printLog('\r#ING','Unable to find Harrington start for %s %s (%s)' % (gene,entry[icod],entry[icon]),screen=False)
                 fx += 1; continue
             elif len(peptseq) < minpep:
                 self.printLog('\r#ING','Peptide from mapped Harrington start for %s %s (%s) too short!' % (gene,entry[icod],entry[icon]),screen=False)
                 fx += 1; continue
             id = rje.preZero(int(ex/100),etot)
             entry['ENSI'] = 'ENSINGT%s' % id
             entry['ENSP'] = 'ENSINGP%s' % id
             ENST.write('>ENSINGT%s mtype:%s enst:%s gene:%s ingolia:%s mgi:%s\n%s\n' % (id,mtype,entry['ENST'],mentry['EnsEMBL'],entry['Gene'],mentry['Gene'],cdnaseq))
             ENSP.write('>ENSINGP%s mtype:%s enst:%s gene:%s transcript:ENSINGT%s ingolia:%s mgi:%s\n%s\n' % (id,mtype,entry['ENST'],mentry['EnsEMBL'],id,entry['Gene'],mentry['Gene'],peptseq))
             sx += 1
         sdb.saveToFile('%s.mapped_exact.tdt' % self.baseFile())
         ENST.close(); ENSP.close()
         self.printLog('\r#ING','Output %s Ingolia peptides and transcripts. %s failed.' % (rje.iStr(sx),rje.iStr(fx)))
         return
     except: self.errorLog('%s.method error' % self)
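The core of the start mapping above is a two-part check: the seven bases spanning the -3 to +4 context around the reported initiation codon must match the Harrington context string, and the peptide translated from that codon to the first stop must be long enough to keep. A minimal standalone sketch of that check, assuming a toy codon table and a plain translate() helper in place of rje_sequence.dna2prot:

# Toy codon table for illustration only (the real method uses rje_sequence.dna2prot).
CODONS = {'ATG':'M','AAA':'K','TGG':'W','TAA':'*','TAG':'*','TGA':'*'}

def translate(cdna):
    """Translate cDNA to protein, stopping at the first stop codon ('X' for codons missing from the toy table)."""
    pep = []
    for i in range(0, len(cdna) - 2, 3):
        aa = CODONS.get(cdna[i:i+3].upper(), 'X')
        if aa == '*': break
        pep.append(aa)
    return ''.join(pep)

def map_start(tseq, init_nt, context):
    """Return the peptide read from init_nt (0-based) if the -3..+4 context matches, else None."""
    if init_nt < 3 or tseq[init_nt-3:init_nt+4].upper() != context.upper(): return None
    return translate(tseq[init_nt:])

With this toy table, map_start('NNNATGAAATGA', 3, 'NNNATGA') returns 'MK'; the method above additionally keeps only the longest such peptide across the candidate transcripts and rejects peptides shorter than minpep.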
Exemple #12
0
 def run(self):  ### Main run method
     '''Main run method.'''
     try:### ~ [1] Reformat Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for fasta in glob.glob('*.fasta'):
             fas = fasta[:-2]
             if os.path.exists(fas): continue
             sx = 0
             for line in open(fasta,'r').readlines():
                 if line[:1] == '>':
                     try: (name,desc) = rje.matchExp('^>(\S+) (\S.+)$',line)
                     except: name = rje.matchExp('^>(\S+)',line)[0]
                     if len(string.split(name,'|')) == 3:
                         name = '6rf_NEIME__%s' % string.split(name,'|')[2]
                         open(fas,'a').write('>%s\n' % name)
                     elif len(string.split(name,'|')) == 5:
                         name = 'ref_NEIME__%s' % string.split(name,'|')[3]
                         open(fas,'a').write('>%s %s\n' % (name,desc))
                     else: print string.split(name,'|'); raise ValueError
                     self.progLog('\r#FAS','Processing %s: %s seqs' % (fas, rje.integerString(sx))); sx += 1
                 else: open(fas,'a').write(line)
             self.printLog('\r#FAS','Processed %s: %s seqs from %s' % (fas, rje.integerString(sx), fasta))
             rje_blast.BLASTRun(self.log,self.cmd_list).formatDB(fas,protein=True,force=True)
         ### ~ [2] Read in CSV Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         rfhits = {}     # Dictionary of {hit:['File:hit_num']}
         acc = 'MC58_6RF_Hits.acc'; open(acc,'w')
         gfile = 'MC58_6RF_Hits.vs.MC58_1.hitsum.tdt'
         cx = 0
         for csv in glob.glob('MC58_6RF_CSV/*.CSV'):
             cx += 1
             file = os.path.basename(csv)[:-4]
             hits = False
             for line in open(csv,'r').readlines():
                 if line.find('prot_hit_num,prot_acc') == 0: hits = True
                 elif hits:
                     data = rje.readDelimit(line,',')
                     if len(data) < 2: continue
                     [num,name] = data[:2]
                     try: name = string.split(name,'|')[2]
                     except: continue
                     if name not in rfhits:
                         open(acc,'a').write('6rf_NEIME__%s\n' % name)
                         rfhits[name] = []
                     id = '%s:%s' % (file,num)
                     if id not in rfhits[name]: rfhits[name].append(id)
                     self.progLog('\r#CSV','Reading %d CSV files: %s 6RF Hits' % (cx,rje.integerString(len(rfhits))))
         self.printLog('\r#CSV','Read %d CSV files: %s 6RF Hits output to %s' % (cx,rje.integerString(len(rfhits)),acc))
         ### ~ [3] Extract sequences and perform GABLAM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not os.path.exists(gfile):
             seqlist = rje_seq.SeqList(self.log,self.cmd_list+['seqin=%s' % acc,'fasdb=MC58_6RF.fas','seqout=MC58_6RF_Hits.fas','autoload=T','accnr=F','seqnr=F'])
             seqlist.info['Name'] = 'MC58_6RF_Hits.fas'
             seqlist.saveFasta()
             gablam.GABLAM(self.log,self.cmd_list+['seqin=MC58_6RF_Hits.fas','searchdb=MC58_1.fas','qryacc=F']).gablam()
         ### ~ [4] Read in GABLAM and ID Hits without genomic homology ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         gdata = rje.dataDict(self,gfile,['Qry'],['HitNum'])
         zeros = []
         for hit in gdata:
             if string.atoi(gdata[hit]['HitNum']) == 0: zeros.append(hit)
         zeros = rje.sortUnique(zeros,False)
         open('6rf_zeros.acc','w').write(string.join(zeros,'\n'))
         self.printLog('#ZERO','%d 6RF hits with 0 BLAST hits to MC58_1' % len(zeros))
         ufile = 'MC58_6RF_Zeros.vs.embl_bacteria.hitsum.tdt'
         if not os.path.exists(ufile):
             seqlist = rje_seq.SeqList(self.log,self.cmd_list+['seqin=6rf_zeros.acc','fasdb=MC58_6RF.fas','seqout=MC58_6RF_Zeros.fas','autoload=T','accnr=F','seqnr=F'])
             seqlist.info['Name'] = 'MC58_6RF_Zeros.fas'
             seqlist.saveFasta()
             gablam.GABLAM(self.log,self.cmd_list+['seqin=MC58_6RF_Zeros.fas','searchdb=/scratch/Databases/NewDB/TaxaDB/embl_bacteria.fas','qryacc=F']).gablam()
         gdata = rje.dataDict(self,ufile,['Qry'],getheaders=True)
         fdata = rje.dataDict(self,string.replace(ufile,'hitsum','gablam'),['Qry'],['Hit'],lists=True)
         headers = gdata.pop('Headers')
         headers.insert(1,'Sample')
         headers.append('BestHit')
         rje.delimitedFileOutput(self,'MC58_6RF_Zeros.tdt',headers,rje_backup=True)
         for rf in rje.sortKeys(gdata):
             rfcut = string.split(rf,'__')[1]
             gdata[rf]['Sample'] = string.join(rfhits[rfcut],'; ')
             gdata[rf]['Qry'] = rfcut
             try: gdata[rf]['BestHit'] = fdata[rf]['Hit'][0]
             except: gdata[rf]['BestHit']  = '-'
             rje.delimitedFileOutput(self,'MC58_6RF_Zeros.tdt',headers,datadict=gdata[rf])
         
     except: self.errorLog(rje_zen.Zen().wisdom())
     self.printLog('#ZEN',rje_zen.Zen().wisdom())
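The reformatting step above keys entirely off the number of '|'-delimited fields in the FASTA name: three fields are treated as six-reading-frame (6RF) identifiers and five fields as reference protein identifiers. A small sketch of that renaming rule as a standalone helper (the function name is illustrative):

def rename_header(name, desc=''):
    """Map a '|'-delimited identifier onto the 6rf_NEIME__/ref_NEIME__ naming used above."""
    parts = name.split('|')
    if len(parts) == 3:      # six-reading-frame ORF identifier
        return '>6rf_NEIME__%s' % parts[2]
    if len(parts) == 5:      # reference protein identifier, keep the description
        return '>ref_NEIME__%s %s' % (parts[3], desc)
    raise ValueError('Unexpected identifier format: %s' % name)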
Exemple #13
0
 def mapToTaxID(self,taxa,nodeonly=False,rankonly=False,log=True,warn=True):  ### Maps taxa onto TaxID. If taxa is a list, will process each element.
     '''Maps taxa onto TaxID. If taxa is a list, will process each element. Returns a list.'''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not taxa: return []
         taxid = []
         ### ~ [1] Taxa List ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         tlist = True
         try: taxa.sort()
         except: tlist = False
         if tlist:
             tx = 0.0; ttot = len(taxa)
             if ttot > 1:
                 for t in taxa:
                     if log: self.progLog('\r#TAXID','Mapping to TaxID: %.1f%%' % (tx/ttot)); tx += 100.0
                     taxid += self.mapToTaxID(t,nodeonly,rankonly,log=False)
                 taxid = rje.sortUnique(taxid)
                 if log:
                     if ttot > 1: self.printLog('\r#TAXID','Mapped %s taxa to %s TaxID' % (rje.iStr(ttot),rje.iLen(taxid)))
             else:
                 t = taxa[0]
                 if log: self.progLog('\r#TAXID','Mapping %s to TaxID...' % t)
                 taxid = rje.sortUnique(self.mapToTaxID(t,nodeonly,rankonly,log=False))
                 if log: self.printLog('\r#TAXID','Mapped %s to %s TaxID' % (t,rje.iLen(taxid)))
             return taxid
         ### ~ [2] Individual taxa ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         taxmap = self.dict['TaxMap']; rankid = self.list['RankID']
         taxa = '%s' % taxa
         ## ~ [2a] Taxa ID ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if rje.matchExp('^(\d+)$', taxa):
             #if taxa not in taxmap: self.taxaChildren(taxa)
             #if taxa in rankid: return [taxa]
             if nodeonly:
                 if taxa in rankid or not rankonly: return [taxa]
                 else: return []
             if taxa not in taxmap:
                 if warn: self.warnLog('Cannot find TaxID %s!' % taxa,'Missing_TaxID',suppress=True)
                 return []
             parents = [taxa]
             while parents:
                 taxa = parents.pop(0)
                 #if taxa not in taxmap: self.taxaChildren(taxa)
                 if not rankonly or taxa in rankid: taxid.append(taxa)
                 parents += taxmap[taxa]
             return taxid
         ## ~ [2b] Species Code ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if taxa == string.replace(taxa.upper(),' ',''):
             greplines = os.popen('grep "%s" %s' % (taxa, self.getStr('SpecFile'))).readlines()
             for entry in greplines:
                 try: taxid.append(rje.matchExp('^%s\s+\S+\s+(\d+):' % taxa,entry)[0])
                 except: pass
             if not taxid and warn: self.warnLog('Cannot find Species Code "%s"!' % taxa,'Missing_SpCode',suppress=True)
             if len(taxid) > 1: self.warnLog('Species Code "%s" hits %d Taxa ID (%s)' % (taxa, len(taxid), string.join(taxid,'|')))
             return self.mapToTaxID(taxid,nodeonly,rankonly,log=False) #taxid
         ### ~ [3] Species name etc. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         taxa = taxa.replace('_',' ')
         ## ~ [3a] Grep from Uniprot ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         greplines = os.popen('grep -B 2 -i "%s" %s' % (taxa, self.getStr('SpecFile'))).readlines()
         gtaxid = None; comid = []; synid = []
         for entry in greplines:
             try: gtaxid = rje.matchExp('^\S+\s+\S+\s+(\d+):',entry)[0]
             except: pass
             if rje.matchExp('s=(%s)\s*$' % taxa.lower(),entry.lower()): synid.append(gtaxid)
             elif rje.matchExp('c=(%s)\s*$' % taxa.lower(),entry.lower()): comid.append(gtaxid)
             elif rje.matchExp('=(%s)\s*$' % taxa.lower(),entry.lower()): taxid.append(gtaxid)
         if not taxid: taxid = comid
         if not taxid: taxid = synid
         if not taxid and warn: self.warnLog('Cannot find Taxon name "%s" in Uniprot!' % taxa,'Missing Taxon',suppress=True)
         if len(taxid) > 1:
             #self.bugPrint(string.join(greplines))
             #self.debug('%s %s %s' % (taxid,comid,synid))
             if warn: self.warnLog('Species Code "%s" hits %d Taxa ID (%s)' % (taxa, len(taxid), string.join(taxid,'|')))
         if taxid: return self.mapToTaxID(taxid,nodeonly,rankonly,log=False) #taxid
         #self.debug(taxid)
         ## ~ [3b] Grep from NCBI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         greplines = os.popen('grep -i -e "\t%s\t" %s' % (taxa, self.getStr('NameMap'))).readlines()
         for entry in greplines:
             try:
                 #gtaxid = rje.matchExp('^(\d+)\s+\S\s+(\S.+)$',entry)
                 gtaxid = string.split(entry,'\t|\t')
                 if gtaxid[1].lower() == taxa.lower(): taxid.append(gtaxid[0])
                 elif gtaxid[2] and gtaxid[2].lower() == taxa.lower(): taxid.append(gtaxid[0])
             except: pass
         if len(taxid) > 1 and warn: self.warnLog('Species Code "%s" hits %d Taxa ID (%s)' % (taxa, len(taxid), string.join(taxid,'|')))
         return self.mapToTaxID(taxid,nodeonly,rankonly,log=False) #taxid
     except: self.errorLog('%s.mapToTaxID() error' % (self)); raise
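For a numeric TaxID, the method above expands the ID into itself plus everything beneath it by repeatedly popping a node and appending its entries from the TaxMap dictionary, optionally keeping only IDs at ranked taxonomic levels. A minimal sketch of that expansion, assuming taxmap maps each TaxID to a list of child TaxIDs (as the loop implies) and rankid is a set of ranked IDs:

def expand_taxid(taxa, taxmap, rankid, rankonly=False):
    """Return taxa plus all TaxIDs beneath it, breadth-first over taxmap."""
    found = []
    queue = [taxa]
    while queue:
        tax = queue.pop(0)
        if not rankonly or tax in rankid: found.append(tax)
        queue += taxmap.get(tax, [])
    return found

# e.g. expand_taxid('9604', {'9604':['9605'], '9605':['9606']}, set()) -> ['9604', '9605', '9606']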
Exemple #14
0
 def treeListSPCode(self):  ### Main taxa mapping from list of tree files
     '''Main taxa mapping from list of tree files.'''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         db = self.db()
         specdb = self.db('spcode',add=True,forcecheck=True,mainkeys=['protein'])
         if not specdb and self.getStrLC('TaxBase') and not self.force():
             spfile = '%s.spcode.tdt' % self.getStr('TaxBase')
             specdb = db.addTable(spfile,mainkeys=['protein'],name='spcode',expect=False)
         if specdb: specdb.dataFormat({'boot':'num'}); return True
         specdb = db.addEmptyTable('spcode',['protein','boot','spcode','inpara','paralogues'],['protein'])
         #dupdb = db.addEmptyTable('para',['protein','paralogues'],['protein'])
         self.dict['Duplicates'] = {}    # {prot1:[dups]}
         ### ~ [2] ~ Add main run code here ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for nwkfile in self.list['NwkList']:
             tree = rje_tree.Tree(self.log,self.cmd_list)
             tree.loadTree(nwkfile,seqlist=None,postprocess=False)
             seqacc = rje.baseFile(nwkfile,strip_path=True)
             # Identify node corresponding to query sequence
             seqnode = None
             for node in tree.nodes():
                 try:
                     if string.split(node.shortName(),'__')[1] == seqacc: seqnode = node
                 except: pass    # Internal node or bad sequence format
             if not seqnode:
                 self.warnLog('Could not find %s in %s nodes!' % (seqacc,nwkfile))
                 continue
             # Get species code for query sequence
             seqspec = tree.cladeSpec(seqnode)
             if len(seqspec) != 1: self.warnLog('Could not find species in %s node!' % (seqacc)); continue
             seqspec = seqspec.keys()[0]
             if seqspec != string.split(seqnode.shortName(),'_')[1]: raise ValueError('Species mismatch for %s & %s' % (seqacc,seqnode.shortName()))
             # Find ancestor with closest orthologue outgroup
             rootnode = tree._getRootNode()
             if not rootnode: self.warnLog('Could not find root node in %s!' % (nwkfile)); continue
             ancnode = seqnode.ancNode()
             try: bootx = float(ancnode.ancBranch().stat['Bootstrap'])/tree.stat['Bootstraps']
             except: bootx = 1.0
             inparanode = None    # Node to define in-paralogues
             ancspec = tree.cladeSpec(ancnode)
             while len(ancspec) < 2 or bootx < self.getNum('MinBoot'):
                 inparanode = ancnode    # All same species
                 if ancnode == rootnode: break
                 ancnode = ancnode.ancNode(); ancspec = tree.cladeSpec(ancnode)
                 try: bootx = float(ancnode.ancBranch().stat['Bootstrap'])/tree.stat['Bootstraps']
                 except: bootx = 1.0
             ancspec.pop(seqspec)    # Now only have counts of closest other species
             # Update table, replacing species codes with genera?
             sentry = {'protein':seqacc,'spcode':rje.sortUnique(ancspec.keys())}
             sentry['boot'] = bootx
             if not ancspec: sentry['spcode'] = ['None']; sentry['boot'] = self.getNum('NoneBoot')
             sentry['spcode'] = string.join(sentry['spcode'],'|')
             # Establish list of duplicate proteins
             inpara = []     # List of in-paralogue nodes
             inparacc = []   # List of in-paralogue accnum
             if inparanode: inpara = tree._nodeClade(inparanode,internal=False)
             self.dict['Duplicates'][seqacc] = []
             for node in tree._nodeClade(rootnode,internal=False):
                 if node == seqnode: continue
                 if len(string.split(node.shortName(),'_')) < 2: continue
                 if string.split(node.shortName(),'_')[1] == seqspec:
                     paracc = string.split(node.shortName(),'__')[1]
                     if node in inpara: inparacc.append(paracc)
                     else: self.dict['Duplicates'][seqacc].append(paracc)
             sentry['inpara'] = string.join(inparacc,'|')
             sentry['paralogues'] = string.join(self.dict['Duplicates'][seqacc],'|')
             specdb.addEntry(sentry)
         ## Update specdb and save
         specdb.saveToFile()
         #dupdb.saveToFile()
         return True
     except:
         self.errorLog(self.zen())
         return False
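The tree walk above climbs from the query node until the clade spans a second species with sufficient bootstrap support, then splits same-species leaves into in-paralogues (those inside the last single-species clade) and older paralogues. A sketch of that final split, assuming leaf names follow the GENE_SPEC__ACC convention used in these trees and that the in-paralogue clade has already been identified:

def split_paralogues(query, leaves, inpara_clade):
    """Split same-species leaves into (in-paralogue accnums, other paralogue accnums).
    query and leaves are full leaf names; inpara_clade is a set of leaf names."""
    qspec = query.split('_')[1]
    inpara, para = [], []
    for leaf in leaves:
        bits = leaf.split('_')
        if leaf == query or len(bits) < 2 or bits[1] != qspec: continue
        acc = leaf.split('__')[1]
        if leaf in inpara_clade: inpara.append(acc)
        else: para.append(acc)
    return inpara, para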
Exemple #15
0
    def contamination(self):    ### Compares peptides from Chlamydia and human and outputs summaries
        '''Compares peptides from Chlamydia and human and outputs summaries.'''
        try:### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            db = self.obj['DB'] = rje_db.Database(self.log,self.cmd_list)
            mods = ['none']
            
            ### >>>> Shortcut reanalysis without modifications >>>> ###
            pepfile = '%s.chlam_peptides.tdt' % self.basefile()
            if not self.force() and os.path.exists(pepfile):
                pepdb = db.addTable(pepfile,mainkeys=['key','seqmod'],name='chlam_nomod')
                pepdb.dropFields(['pass','modification'])
                pepdb.compress(['key','seq'],default='max')
                pepdb.dropFields(['seqmod'])
                for entry in pepdb.entries():
                    for field in pepdb.fields():
                        if 'len' not in field: continue
                        try:
                            if entry[field] and int(entry[field]): entry[field] = len(entry['seq'])
                            else: entry[field] = ''
                        except: self.errorLog('%s >> %s' % (entry,field),quitchoice=True)
                tdb = pepdb
                comprules = {'key':'str','pi':'str','mass':'str'}
                shapefields = []
                for field in pepdb.fields():
                    if 'len' in field: comprules[field] = 'mean'
                    if len(string.split(field,'|')) > 1 and string.split(field,'|')[0] not in shapefields: shapefields.append(string.split(field,'|')[0])
                print shapefields
                tdb.compress(['protein'],rules=comprules,default='sum')
                tdb.dropFields(['seq'])
                tdb.saveToFile()

                tdb.info['Name'] = 'chlam_nomod_summary'
                tdb.addField('temp',evalue=1)
                tdb.compress(['temp'],rules=comprules,default='sum')
                tdb.reshapeLong('exp',shapefields)
                tdb.newKey(['exp'])
                tdb.dropFields(['exp']+shapefields,inverse=True)
                tdb.saveToFile()

                return
            ### <<<< End Shortcut reanalysis without modifications <<<< ###

            ## ~ [0a] ~ Load EB and RB human peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.printLog('#~~#','## ~ [0a] ~ Load EB and RB human peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##',log=False)
            #protein.key	protein.Entry	protein.Accession	protein.Description	protein.dataBaseType	protein.score	protein.falsePositiveRate	protein.avgMass	protein.MatchedProducts	protein.matchedPeptides	protein.digestPeps	protein.seqCover(%)	protein.MatchedPeptideIntenSum	protein.top3MatchedPeptideIntenSum	protein.MatchedProductIntenSum	protein.fmolOnColumn	protein.ngramOnColumn	protein.AutoCurate	protein.Key_ForHomologs	protein.SumForTotalProteins	peptide.Rank	peptide.Pass	peptide.matchType	peptide.modification	peptide.mhp	peptide.seq	peptide.OriginatingSeq	peptide.seqStart	peptide.seqLength	peptide.pI	peptide.componentID	peptide.MatchedProducts	peptide.UniqueProducts	peptide.ConsectiveMatchedProducts	peptide.ComplementaryMatchedProducts	peptide.rawScore	peptide.score	peptide.(X)-P Bond	peptide.MatchedProductsSumInten	peptide.MatchedProductsTheoretical	peptide.MatchedProductsString	peptide.ModelRT	peptide.Volume	peptide.CSA	peptide.ModelDrift	peptide.RelIntensity	peptide.AutoCurate	precursor.leID	precursor.mhp	precursor.mhpCal	precursor.retT	precursor.inten	precursor.calcInten	precursor.charge	precursor.z	precursor.mz	precursor.fraction	precursor.numFrac	precursor.fwhm	precursor.liftOffRT	precursor.infUpRT	precursor.infDownRT	precursor.touchDownRT	prec.rmsFWHMDelta	peptidePrecursor.deltaMhpPPM
            humedb = db.addTable('EB_IA_final_peptide.csv',mainkeys=['protein.Accession','peptide.Rank','peptide.seq','peptide.modification'],datakeys=['protein.key','protein.Accession','protein.Entry','peptide.Rank','peptide.Pass','peptide.seq','peptide.modification','peptide.OriginatingSeq'],name='humaneb')
            humrdb = db.addTable('RB_IA_final_peptide.csv',mainkeys=['protein.Accession','peptide.Rank','peptide.seq','peptide.modification'],datakeys=['protein.key','protein.Accession','protein.Entry','peptide.Rank','peptide.Pass','peptide.seq','peptide.modification','peptide.OriginatingSeq'],name='humanrb')
            for humdb in [humedb,humrdb]:
                humdb.info['Delimit'] = '\t'
                humdb.addField('exp',evalue=humdb.info['Name'][-2:])
                humdb.renameField('protein.Accession','Protein')
                humdb.renameField('protein.Entry','Species')
                for entry in humdb.entries(): entry['Species'] = string.split(entry['Species'],'_')[-1]
                humdb.dropEntriesDirect('Species',['HUMAN'],inverse=True)
                for field in ['Rank','Pass','seq','OriginatingSeq','modification']: humdb.renameField('peptide.%s' % field,field)
                humdb.dataFormat({'Rank':'int'})
                for mod in humdb.index('modification'):
                    if mod.lower() and mod.lower() not in mods: mods.append(mod.lower())
                humdb.addField('seqmod')
                for entry in humdb.entries():
                    if entry['modification'] and mods.index(entry['modification'].lower()): entry['seqmod'] = '%s-%d' % (entry['seq'],mods.index(entry['modification'].lower()))
                    else: entry['seqmod'] = entry['seq']
            humtdb = db.copyTable(humedb,'humantot')
            humtdb.newKey(['Protein','Rank','seq','modification','exp'])
            db.mergeTables(humtdb,db.copyTable(humrdb,'temp',add=False))
            humtdb.compress(['Protein','seq','Pass'],rules={'Rank':'max'})
            ## ~ [0b] ~ Load Proteomes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.printLog('#~~#','## ~ [0b] ~ Load Proteomes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##',log=False)
            # Load human proteome
            hseqfile = '/home/re1u06/researchfiles/SBSBINF/Databases/DBase_120225/EnsEMBL/ens_HUMAN.loci.fas'
            hseq = rje_seqlist.SeqList(self.log,self.cmd_list+['seqin=%s' % hseqfile])
            # Load Chlamydia proteome
            cseqfile = '../2011-07-18-Genome/NC_010287.proteome.fas'
            cseq = rje_seqlist.SeqList(self.log,self.cmd_list+['seqin=%s' % cseqfile])
            # Load matched protein list
            rbpep = rje.listFromCommand('../2011-05-ProDigIS/soton_rb_peptides.txt')
            ebpep = rje.listFromCommand('../2011-05-ProDigIS/soton_rb_peptides.txt')
            ## ~ [0c] ~ Load EB and RB Chlamydia peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.printLog('#~~#','## ~ [0c] ~ Load EB and RB Chlamydia peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##',log=False)
            chlamdb = {'EB':[],'RB':[]}
            for pfile in glob.glob('./Soton*/*peptide.csv'):
                (er,uniq) = rje.matchExp('\./Soton(\S\S)\S+_(\d+)/',pfile)
                chlamdb[er].append(db.addTable(pfile,mainkeys=['protein.key','protein.name','peptide.Rank','peptide.seq','peptide.modification'],datakeys=['protein.name','peptide.Rank','peptide.Pass','peptide.seq','peptide.modification','peptide.OriginatingSeq'],name=uniq))
            edb = chlamdb['EB'].pop(0); edb.info['Name'] = 'chlam_eb'
            while chlamdb['EB']: db.mergeTables(edb,chlamdb['EB'].pop(0))
            rdb = chlamdb['RB'].pop(0); rdb.info['Name'] = 'chlam_rb'
            while chlamdb['RB']: db.mergeTables(rdb,chlamdb['RB'].pop(0))
            # Load EB and RB matching peptide file
            #edb = db.addTable('../2011-05-ProDigIS/SotonEB_peptide_pjss.csv',mainkeys=['protein.name','peptide.Rank','peptide.seq','peptide.modification'],datakeys=['protein.name','peptide.Rank','peptide.Pass','peptide.seq','peptide.OriginatingSeq'],name='chlam_eb')
            #rdb = db.addTable('../2011-05-ProDigIS/SotonRB_peptide_pjss.csv',mainkeys=['protein.name','peptide.Rank','peptide.seq','peptide.modification'],datakeys=['protein.name','peptide.Rank','peptide.Pass','peptide.seq','peptide.OriginatingSeq'],name='chlam_rb')
            for chlamdb in [edb,rdb]:
                chlamdb.info['Delimit'] = '\t'
                chlamdb.addField('exp',evalue=chlamdb.info['Name'][-2:])
                chlamdb.renameField('protein.name','Protein'); chlamdb.renameField('protein.key','key')
                for field in ['Rank','Pass','seq','OriginatingSeq','modification']: chlamdb.renameField('peptide.%s' % field,field)
                chlamdb.dataFormat({'Rank':'int'})
                for mod in chlamdb.index('modification'):
                    if mod.lower() and mod.lower() not in mods: mods.append(mod.lower())
                chlamdb.addField('seqmod')
                chlamdb.addField('Species',evalue='UNKNOWN')
                for entry in chlamdb.entries():
                    if 'Chlamydia trachomatis' in entry['Protein'] or '_CHLT2' in entry['Protein']: entry['Species'] = 'CHLT2'
                    if entry['modification'] and mods.index(entry['modification'].lower()): entry['seqmod'] = '%s-%d' % (entry['seq'],mods.index(entry['modification'].lower()))
                    else: entry['seqmod'] = entry['seq']
                    if not entry['OriginatingSeq']: entry['OriginatingSeq'] = entry['seq']
                chlamdb.dropEntriesDirect('Species',['CHLT2'],inverse=True)
                chlamdb.remakeKeys()
            ## ~ Load Protein Key Mapping ~ ##
            kdb = db.addTable('NC_010287.proteinkey.tdt',mainkeys=['key'],name='keys')
            xdb = db.addTable('NC_010287.dbxref.tdt',mainkeys=['tag'],name='xref')
            tdb = db.copyTable(edb,'chlam_temp')
            self.deBug(tdb.entries()[0])
            tdb.newKey(['Protein','Rank','Pass','seq','modification','exp'])
            db.mergeTables(tdb,db.copyTable(rdb,'temp',add=False))
            kdb = db.joinTables(name='full_xref',join=[(kdb,'tag'),(xdb,'tag')],newkey=kdb.keys(),keeptable=True)
            tdb = db.joinTables(name='chlam_tot',join=[(tdb,'key'),(kdb,'key')],newkey=tdb.keys(),keeptable=True)
            self.deBug(tdb.keys())
            self.deBug(tdb.entries()[0])

            ### ~ [1] ~ Add Human Data to combined Chlamydia Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.printLog('#~~#','### ~ [1] ~ Add Human Data to combined Chlamydia Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###',log=False)
            tdb.renameField('Pass','pass'); tdb.renameField('Protein','protein');
            for entry in tdb.entries():
                entry['pass'] = string.atoi(entry['pass'][-1])
            keep = ['key'] + xdb.fields() + ['description','protein','exp','pass','seq','seqmod','modification']
            tdb.newKey(['tag','exp','pass','seqmod','Rank'])
            tdb.compress(['tag','exp','pass','seqmod'],rules={'pass':'max'})   # NB: 'max' rule for pass assumed
            tdb.dropFields(keep,inverse=True)
            self.deBug(tdb.keys())
            self.deBug(tdb.entries()[0])

            ## ~ [1a] ~ Map ID'd peptides onto Chlamydia, human and EB/RB hits ~~~~~~~~~~~~~~~~~~~~ ##
            bothpep = {'eb':[],'rb':[]}     # Peptides found in both species
            chlampep = {'eb':[],'rb':[]}    # Peptides found only in Chlamydia
            uniqpep = {'eb':[],'rb':[]}     # Peptides found only in a single protein in Chlamydia
            fx = len(tdb.fields())
            tdb.addField('pep',evalue=1)
            for field in ['pass1','pass2','hsap1','hsap2','uniq1','uniq2']: tdb.addField(field,evalue=0)
            comprules = {'pass':'max','key':'min'}   # NB: 'max' rule for pass assumed
            for field in ['pep','pass1','pass2','hsap','uniq']:
                tdb.addField('%s_len' % field)
                comprules[tdb.fields()[-1]] = 'mean'
            shapefields = tdb.fields()[fx:]

            for entry in tdb.entries():
                epass = 'Pass%d' % entry['pass']   # reconstructed field-name literal (pass1/pass2)
                entry[epass.lower()] = 1
                plen = entry['pep_len'] = len(entry['seq'])
                plen = entry['pass%d_len' % entry['pass']] = len(entry['seq'])
                hsap = False
                if entry['exp'] == 'eb':
                    if 'Pass1' in humedb.indexDataList('seqmod',entry['seqmod'],'Pass'): entry['hsap1'] += 1; hsap = True
                    if 'Pass2' in humedb.indexDataList('seqmod',entry['seqmod'],'Pass'): entry['hsap2'] += 1; hsap = True
                    if entry['seqmod'] not in humedb.index('seqmod'):
                        if entry['seq'] in humedb.index('seq'): self.errorLog('EB mod peptide %s not found in Human EB but unmod *is* found in Human EB!' % entry['seqmod'],printerror=False)
                        if entry['seqmod'] in humrdb.index('seqmod'): self.errorLog('EB peptide %s not found in Human EB but found in Human RB!' % entry['seqmod'],printerror=False)
                else:
                    if 'Pass1' in humrdb.indexDataList('seqmod',entry['seqmod'],'Pass'): entry['hsap1'] += 1; hsap = True
                    if 'Pass2' in humrdb.indexDataList('seqmod',entry['seqmod'],'Pass'): entry['hsap2'] += 1; hsap = True
                    if entry['seqmod'] not in humrdb.index('seqmod'):
                        if entry['seq'] in humrdb.index('seq'): self.errorLog('RB mod peptide %s not found in Human RB but unmod *is* found in Human RB!' % entry['seqmod'],printerror=False)
                        if entry['seqmod'] in humedb.index('seqmod'): self.errorLog('RB peptide %s not found in Human RB but found in Human EB!' % entry['seqmod'],printerror=False)
                if hsap: entry['hsap_len'] = plen; bothpep[entry['exp']].append(entry['seq']); continue
                chlampep[entry['exp']].append(entry['seq'])
                entry['uniq1'] = entry['pass1']
                entry['uniq2'] = entry['pass2']
                entry['uniq_len'] = plen
                for altentry in tdb.indexEntries('seqmod',entry['seqmod']):
                    if altentry['tag'] == entry['tag']: continue
                    entry['uniq1'] = entry['uniq2'] = entry['uniq_len'] = 0
                if entry['uniq1'] or entry['uniq2']: uniqpep[entry['exp']].append(entry['seq'])

            tdb.reshapeWide('exp',shapefields)
            fillfields = tdb.fields()[13:]
            for field in fillfields[0:]:
                if 'len' in field: fillfields.remove(field)
            tdb.fillBlanks(0,fillfields,fillempty=True)
            for entry in tdb.entries():
                if entry['modification'] == 0: entry['modification'] = ''

            for field in shapefields:
                tdb.addField('%s|tot' % field)
                if field[-3:] == 'len':
                    comprules['%s|eb' % field] = 'mean'
                    comprules['%s|rb' % field] = 'mean'
                    comprules['%s|tot' % field] = 'mean'
            for entry in tdb.entries():
                for field in shapefields:
                    if entry['%s|eb' % field] and not entry['%s|rb' % field]: entry['%s|tot' % field] = entry['%s|eb' % field]
                    elif entry['%s|rb' % field] and not entry['%s|eb' % field]: entry['%s|tot' % field] = entry['%s|rb' % field]
                    else: entry['%s|tot' % field] = max(entry['%s|eb' % field],entry['%s|rb' % field])

            tdb.info['Name'] = 'chlam_peptides'
            tdb.saveToFile()

            tdb.info['Name'] = 'chlam_proteins'
            tdb.compress(['protein'],rules=comprules,default='sum')
            tdb.dropFields(['pass','seq','modification','seqmod'])
            tdb.saveToFile()

            tdb.info['Name'] = 'chlam_summary'
            tdb.addField('temp',evalue=1)
            tdb.compress(['temp'],rules=comprules,default='sum')
            tdb.reshapeLong('exp',shapefields)
            tdb.newKey(['exp'])
            tdb.dropFields(['exp']+shapefields,inverse=True)
            tdb.saveToFile()

            bothpep['tot'] = bothpep['eb'] + bothpep['rb']     # Peptides found in both species
            chlampep['tot'] = chlampep['eb'] + chlampep['rb']# Peptides found only in Chlamydia
            uniqpep['tot'] = uniqpep['eb'] + uniqpep['rb']
            for er in bothpep:
                open('%s.%s.bothpep.txt' % (self.basefile(),er),'w').write(string.join(rje.sortUnique(bothpep[er]),'\n'))
                open('%s.%s.chlampep.txt' % (self.basefile(),er),'w').write(string.join(rje.sortUnique(chlampep[er]),'\n'))
                open('%s.%s.uniqpep.txt' % (self.basefile(),er),'w').write(string.join(rje.sortUnique(uniqpep[er]),'\n'))
            



            return



            #Peptide numbers for C. trachomatis/human
            #1.	Number of  chlamydial peptides assigned for each protein from RBs
            #2.	Number of  chlamydial peptides assigned for each protein from EBs
            #3.	Number of  chlamydial peptides assigned from both EB and RB combined, with redundancy removed
            #4.	Number of  unique chlamydial peptides assigned for each protein from RBs
            #5.	Number of unique chlamydial peptides assigned for each protein from EBs
            #6.	Number of unique chlamydial peptides assigned for EBs and RBs combined with redundancy removed
            #7.	Total number of human peptides identified in EB (Length would be useful)
            #8.	Total number of human peptides identified in RB (Length would be useful)
            #9.	Total number of human peptides identified in EB and RB
            #10.	Human peptides matching pass 1 chlamydia peptides for RB (sequence would be useful)
            #11.	Human peptides matching pass 2 chlamydia peptides for EB (sequence would be useful)

            #An accession number and protein description would be useful where possible, i.e.,  the number of chlamydial peptides for each protein.





            tdb.compress(['Protein','seq'],rules={'Rank':'max','Pass':'min'})   # NB: 'min' rule for Pass assumed
            for entry in tdb.entries(): entry['exp'] = 'tot'

            
            ### ~ [1] ~ Map ID'd peptides onto Chlamydia, human and EB/RB hits ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            mapkey = 'seqmod'
            self.deBug(rje.sortKeys(humdb.index(mapkey)))
            self.printLog('#~~#','## ~ [1] ~ Map ID\'d peptides onto Chlamydia, human and EB/RB hits ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###',log=False)
            for chlamdb in [edb,rdb,tdb]:
                bothpep = []
                chlampep = []
                uniqpep = []
                if chlamdb == edb: humdb = humedb
                elif chlamdb == rdb: humdb = humrdb
                else: humdb = humtdb
                chlamdb.addField('Pep',evalue=1)
                chlamdb.addField('Pass1',evalue=0)
                chlamdb.addField('Pass2',evalue=0)
                chlamdb.addField('Hsap1',evalue=0)
                chlamdb.addField('Hsap2',evalue=0)
                chlamdb.addField('Uniq1',evalue=0)
                chlamdb.addField('Uniq2',evalue=0)
                comprules = {'Rank':'max'}
                for field in ['Pep','Pass1','Pass2','Hsap','Uniq']:
                    chlamdb.addField('%s_len' % field)
                    comprules[chlamdb.fields()[-1]] = 'mean'
                for entry in chlamdb.entries():
                    if 'Pass1' in entry['Pass']: entry['Pass'] = 'Pass1'
                    else: entry['Pass'] = 'Pass2'
                    entry[entry['Pass']] += 1
                    entry['Pep_len'] = plen = len(entry['seq'])
                    entry['%s_len' % entry['Pass']] = plen
                    hsap = False
                    self.deBug(entry[mapkey])
                    self.deBug(entry[mapkey] in humdb.index(mapkey))
                    if 'Pass1' in humdb.indexDataList(mapkey,entry[mapkey],'Pass'): entry['Hsap1'] += 1; bothpep.append(entry[mapkey]); hsap = True
                    if 'Pass2' in humdb.indexDataList(mapkey,entry[mapkey],'Pass'): entry['Hsap2'] += 1; bothpep.append(entry[mapkey]); hsap = True
                    if hsap: entry['Hsap_len'] = plen; continue
                    chlampep.append(entry[mapkey])
                    entry['Uniq1'] = entry['Pass1']
                    entry['Uniq2'] = entry['Pass2']
                    entry['Uniq_len'] = plen
                    for altentry in chlamdb.indexEntries(mapkey,entry[mapkey]):
                        if altentry['Protein'] == entry['Protein']: continue
                        entry['Uniq1'] = entry['Uniq2'] = 0
                    if entry['Uniq1'] or entry['Uniq2']: uniqpep.append(entry[mapkey])
                chlamdb.dropFields(['Pass','Rank','seq','OriginatingSeq','modification'])
                chlamdb.compress(['Protein'],rules=comprules,default='sum')
                #chlamdb.dropField('Rank')
                chlamdb.saveToFile()
                open('%s.%s.bothpep.txt' % (self.basefile(),chlamdb.info['Name']),'w').write(string.join(rje.sortUnique(bothpep),'\n'))
                open('%s.%s.chlampep.txt' % (self.basefile(),chlamdb.info['Name']),'w').write(string.join(rje.sortUnique(chlampep),'\n'))
                open('%s.%s.uniqpep.txt' % (self.basefile(),chlamdb.info['Name']),'w').write(string.join(rje.sortUnique(uniqpep),'\n'))
                chlamdb.newKey(['Protein','exp'])
            db.mergeTables(edb,rdb)
            db.mergeTables(edb,tdb)
            cdb = db.copyTable(edb,'chlam_summary')
            edb.info['Name'] = 'chlam_pep'
            edb.reshapeWide('exp',edb.fields()[-7:])
            edb.saveToFile()
            cdb.compress(['exp'],rules=comprules,default='sum')
            cdb.dropField('Protein')
            cdb.saveToFile()
            


            # - twice maybe, once using EnsEMBL sequences directly, once using EB/RB search
            # - Numbers of unique Pass1/2 human peptides, and numbers matching Chlam
            # - Numbers of matched peptides per Chlam gene: total, eb, rb, human (e/r), unique (e/r), ens (e/r)
            ## Do complete digest of Chlam and search against Human
        except: self.errorLog('%s.contamination error' % self)
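Stripped of the table handling, the peptide bookkeeping above reduces to three sets per experiment: peptides also seen in the human EB/RB searches, peptides seen only in Chlamydia, and Chlamydia-only peptides confined to a single protein. A sketch of that classification over plain dict/set inputs (a simplification for illustration, not the rje_db API):

def classify_peptides(chlam_peps, human_peps):
    """chlam_peps maps protein -> set of peptide sequences; human_peps is a set of human peptides.
    Returns (both, chlam_only, unique)."""
    counts = {}
    for prot, peps in chlam_peps.items():
        for pep in peps:
            counts[pep] = counts.get(pep, 0) + 1
    both, chlam_only, unique = set(), set(), set()
    for pep, n in counts.items():
        if pep in human_peps: both.add(pep)       # also found in the human searches
        else:
            chlam_only.add(pep)                   # Chlamydia-only peptide
            if n == 1: unique.add(pep)            # and confined to a single protein
    return both, chlam_only, unique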
Exemple #16
0
 def fpi(self):  ### Family-protein interactions
     '''Family-protein interactions.'''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not self.dict['Domain']: return
         outdir = 'SLiMPID_FPI'
         rje.mkDir(self, outdir)
         fpi = {}  # Dictionary of {family:[interactors]}
         badname = []
         ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for qry in rje.sortKeys(self.dict['PPI']):
             try:
                 fam = self.dict['Fam'][qry]
                 if len(fam) < 2: continue
             except:
                 self.errorLog('Problem with "%s" protein family' % qry)
                 continue
             fpi[qry] = []
             for hub in fam:
                 if hub not in self.dict['PPI']: continue
                 fpi[qry] += self.dict['PPI'][hub]  # Add with redundancy
             for spoke in fpi[qry][0:]:
                 if fpi[qry].count(spoke) == 1:
                     fpi[qry].remove(
                         spoke)  # Must have 2+ family interactions
             for hub in fam:
                 if hub not in self.dict['PPI']: continue
                 for spoke in self.dict['PPI'][hub][0:]:
                     if spoke in fpi[qry]:
                         self.dict['PPI'][hub].remove(spoke)
                         if spoke in self.dict['PPI'] and hub in self.dict[
                                 'PPI'][spoke]:
                             self.dict['PPI'][spoke].remove(hub)
             fpi[qry] = rje.sortUnique(fpi[qry], False, False)
             acc = []
             gene = self.dict['Gene'][qry]
             for name in fpi[qry]:
                 if not name: continue
                 if name in self.dict['Seq']:
                     acc.append(self.dict['Seq'][name].info['AccNum'])
                 elif name not in badname:
                     badname.append(name)
             open('%s/%s.fpi.acc' % (outdir, gene),
                  'w').write(string.join(acc, '\n'))
             self.printLog('#FPI',
                           '%s family => %d interactors' % (gene, len(acc)))
         if badname:
             badname.sort()
             self.printLog(
                 '#BAD', '%d "bad" protein names: %s' %
                 (len(badname), string.join(badname, '; ')))
         ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         hx = len(self.dict['PPI'])
         for hub in rje.sortKeys(self.dict['PPI']):
             if hub and self.dict['PPI'][hub]: continue
             self.dict['PPI'].pop(hub)
             self.printLog('#FPI', 'No %s PPI left after FPI removed' % hub)
         self.printLog(
             '#PPX', '%s of %s PPI hubs remain after FPI removed' %
             (rje.integerString(len(
                 self.dict['PPI'])), rje.integerString(hx)))
     except:
         self.errorLog('Problem with SLiMPID.fpi()', quitchoice=True)
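The family rule above only keeps a spoke as a family interactor if it is seen against at least two members of the family; those interactions are then stripped from the individual hubs. A minimal sketch of the counting step, assuming ppi maps each hub name to a list of spoke names:

def family_interactors(family, ppi):
    """Return spokes that interact with at least two members of family."""
    counts = {}
    for hub in family:
        for spoke in set(ppi.get(hub, [])):       # count each hub at most once per spoke
            counts[spoke] = counts.get(spoke, 0) + 1
    return sorted(spoke for spoke, n in counts.items() if n > 1)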
Exemple #17
0
    def picsi(self):    ### Cleans up cross-species search results
        '''Cleans up cross-species search results.'''
        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            datafile = self.info['SumFile']
            delimit = rje.delimitFromExt(filename=self.info['SumFile'])
            data = {}       # search:{hit:{???}}
            pep2prot = {}   # search:{peptide:[hits]}
            id2prot = {}    # search:{id:hit}
            prot2desc = {}
            fullpeplist = {}    
            pepcon = {}     # Convert pep:longer pep
            speclist = []   # List of species codes
            ### ~ [1] Read Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            indata = rje.dataDict(self,datafile,['search','prot_hit_num'],'All',lists=True)
            for ikey in rje.sortKeys(indata):
                (search,id) = string.split(ikey,delimit)
                prot = indata[ikey]['prot_acc'][0]
                desc = string.replace(indata[ikey]['prot_desc'][0],'Full=','')
                if desc[3:7] == 'Name': desc = desc[9:]
                prot2desc[prot] = desc; self.printLog('#DESC','%s = %s' % (prot,desc))
                indata[ikey]['pep_seq'] = string.join(indata[ikey]['pep_seq'],'|')
                pepconv = string.replace(indata[ikey]['pep_seq'],'I','L')
                pepconv = string.replace(pepconv,'Q','K')
                peplist = rje.sortUnique(string.split(pepconv,'|'))
                indata[ikey]['pep_seq'] = string.join(rje.sortUnique(string.split(indata[ikey]['pep_seq'],'|')),'|')
                if search not in data:
                    data[search] = {}
                    pep2prot[search] = {}
                    id2prot[search] = {}
                    fullpeplist[search] = []
                    pepcon[search] = {}
                fullpeplist[search] += peplist
                id2prot[search][id] = prot
                spec = string.split(prot,'_')[1]
                if spec not in speclist: speclist.append(spec)
                data[search][prot] = {'search':search,'pepcount':len(peplist),'hit':id,'desc':desc,'spec':spec,
                                      'pep_uniq':0,'peplist':indata[ikey]['pep_seq'],'conpep':peplist[0:],
                                      'pep_rem':0}
                try: data[search][prot]['accnum'] = self.dict['Acc2Seq'][prot].info['AccNum']
                except: data[search][prot]['accnum'] = string.split(prot,'__')[-1]
                for pep in peplist:
                    if pep not in pep2prot[search]:
                        pep2prot[search][pep] = []
                    pep2prot[search][pep].append(prot)
            ## ~ [1a] Convert peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            for search in fullpeplist:
                fullpeplist[search] = rje.sortUnique(fullpeplist[search])
                for pep in fullpeplist[search][0:]:
                    for pep2 in fullpeplist[search]:
                        if pep != pep2 and pep in pep2:
                            pepcon[search][pep] = pep2
                            fullpeplist[search].remove(pep)
                            break
                for pep in pepcon[search]:
                    while pepcon[search][pep] in pepcon[search]: pepcon[search][pep] = pepcon[search][pepcon[search][pep]]
                self.printLog('#PEP','%s %s peptide conversions' % (len(pepcon[search]),search))
                #self.deBug(pepcon[search])
                #self.deBug(rje.sortKeys(pep2prot[search]))
                pp = 0; pm = 0
                for prot in data[search]:
                    for pep in data[search][prot]['conpep'][0:]:
                        if pep in pepcon[search]:
                            newpep = pepcon[search][pep]
                            if newpep not in data[search][prot]['conpep']: data[search][prot]['conpep'].append(newpep); pp += 1
                            data[search][prot]['conpep'].remove(pep); pm += 1
                            if prot not in pep2prot[search][newpep]: pep2prot[search][newpep].append(prot)
                            if pep in pep2prot[search]: pep2prot[search].pop(pep)
                    data[search][prot]['pep_con'] = len(data[search][prot]['conpep'])
                self.printLog('#PEP','%s %s converted peptides added; %s removed' % (pp,search,pm))
            ### ~ [2] Calculate Unique/Redundancy status ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for search in pep2prot:
            ## ~ [2a] Species Redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                remx = 0
                for prot in data[search]:
                    if data[search][prot]['spec'] != self.info['QrySpec']: continue
                    for pep in data[search][prot]['conpep']:
                        for prot2 in pep2prot[search][pep][0:]:
                            if data[search][prot2]['spec'] == self.info['QrySpec']: continue
                            pep2prot[search][pep].remove(prot2)
                            data[search][prot2]['conpep'].remove(pep)
                            data[search][prot2]['pep_rem'] += 1; remx += 1
                self.printLog('#REM','%s %s peptides removed from non-%s hits' % (rje.integerString(remx),search,self.info['QrySpec']))
            ## ~ [2b] One-hit wonders ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                for prot in data[search]:
                    if len(data[search][prot]['conpep']) < 2:
                        for pep in data[search][prot]['conpep']:
                            #if pep in pep2prot[search] and prot in pep2prot[search][pep]:
                            pep2prot[search][pep].remove(prot)
            ## ~ [2c] Unique peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                ux = 0
                for pep in pep2prot[search]:
                    #self.deBug(pep)
                    if len(pep2prot[search][pep]) == 1: data[search][pep2prot[search][pep][0]]['pep_uniq'] += 1; ux += 1
                self.printLog('#UNIQ','%s unique %s peptides' % (rje.integerString(ux),search))
            ## ~ [2d] Total Redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                summary = {'HITS':len(data[search]),'REJECT':0,'UNIQUE':0,'NR':0,'REDUNDANT':0}
                rx = 0
                for prot in data[search]:
                    #if data[search][prot]['unique']: data[search][prot]['red'] = False; continue
                    data[search][prot]['pep_red'] = 0   # Redundant peptides found in proteins with unique peptides
                    data[search][prot]['pep_nr'] = 0    # Redundant peptides found only in proteins without unique peptides
                    for pep in data[search][prot]['conpep']:
                        if pep2prot[search][pep] == [prot]: continue
                        upep = False
                        for prot2 in pep2prot[search][pep]:
                            if data[search][prot2]['pep_uniq']: upep = True; break
                        if upep: data[search][prot]['pep_red'] += 1     # Redundant peptide found in unique protein
                        else: data[search][prot]['pep_nr'] += 1         # Redundant peptide NOT found in unique protein
                    if len(data[search][prot]['conpep']) < 2: data[search][prot]['class'] = 'REJECT'; rx += 1
                    elif data[search][prot]['pep_uniq']: data[search][prot]['class'] = 'UNIQUE'
                    elif data[search][prot]['pep_nr']: data[search][prot]['class'] = 'NR'
                    else: data[search][prot]['class'] = 'REDUNDANT'; rx += 1
                    summary[data[search][prot]['class']] += 1
                self.printLog('#REJ','%s rejected %s hits' % (rje.integerString(rx),search))
                for x in rje.sortKeys(summary): self.printLog('#%s' % search,'%s %s' % (summary[x],x))

            ### ~ [3] Species ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            speclist.sort()
            species = {}
            for spec in speclist:
                try:
                    grep = os.popen('grep %s %s' % (spec,self.info['SpecTDT'])).read()
                    species[spec] = string.split(grep,':')[-4]
                    self.printLog('#SPEC','%s = %s' % (spec,species[spec]))
                except: species[spec] = '?'

            ### ~ [END] Output data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            outfile = '%s.clean.tdt' % rje.baseFile(self.info['SumFile'])
            headers = ['search','hit','class','accnum','spec','species','desc','pepcount','pep_con','pep_rem','pep_uniq','pep_nr','pep_red','peplist','conpep']
            if self.dict['Acc2Seq']: headers.insert(3,'cluster')
            rje.delimitedFileOutput(self,outfile,headers,datadict={},rje_backup=True)
            for search in rje.sortKeys(data):
                if self.dict['Acc2Seq']: self.clusterGoodSeq(search,data[search])
                for prot in rje.sortKeys(data[search]):
                    if rje.matchExp('^gi:(\d+).+\[(\S.+\S)\]$',data[search][prot]['desc']):
                        data[search][prot]['species'] = rje.matchExp('^gi:(\d+).+\[(\S.+\S)\]$',data[search][prot]['desc'])[1]
                    else: data[search][prot]['species'] = species[data[search][prot]['spec']]                                                                               
                    rje.delimitedFileOutput(self,outfile,headers,datadict=data[search][prot])
                                
        except: self.errorLog('Errg')
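The clean-up above reduces each hit to one of four classes: REJECT for one-hit wonders, UNIQUE for hits with at least one peptide of their own, NR for hits whose shared peptides never appear in a hit with unique peptides, and REDUNDANT otherwise. A compact sketch of those rules, assuming hits maps each protein to its converted peptide set (a simplification, not the data structures used above):

def classify_hits(hits):
    """hits maps protein -> set of (converted) peptides; returns protein -> class."""
    pep2prot = {}
    for prot, peps in hits.items():
        for pep in peps:
            pep2prot.setdefault(pep, set()).add(prot)
    has_uniq = dict((prot, any(pep2prot[p] == {prot} for p in peps)) for prot, peps in hits.items())
    classes = {}
    for prot, peps in hits.items():
        if len(peps) < 2: classes[prot] = 'REJECT'        # one-hit wonder
        elif has_uniq[prot]: classes[prot] = 'UNIQUE'     # has a peptide of its own
        elif any(not any(has_uniq[o] for o in pep2prot[p] if o != prot) for p in peps):
            classes[prot] = 'NR'                          # shared only with non-unique hits
        else: classes[prot] = 'REDUNDANT'
    return classes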
Exemple #18
0
    def run(self):  ### Main run method
        '''Main run method.'''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            mygo = rje_go.GO(self.log,self.cmd_list)
            mygo.readGO()
            gomap = rje.dataDict(self,self.info['GOMap'],mainkeys=['Ensembl Gene ID'],datakeys=['GO ID'],lists=True)
            self.deBug(rje.sortKeys(gomap)[:100])
            #!# Replace 'Ensembl Gene ID' with commandline parameter at some point #!#
            self.printLog('#GOMAP','Loaded GO mappings for %s sequence IDs' % (rje.integerString(len(gomap))))
            slimocc = rje.dataDict(self,self.info['OccData'],mainkeys=['Motif','Seq','Start_Pos','End_Pos'],datakeys=['Motif','Seq','Start_Pos','End_Pos','Cons','HomNum'])
            self.printLog('#OCC','Loaded Data for %s motif occurrences.' % (rje.integerString(len(slimocc))))
            ## ~ [1a] ~ Sequence mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            seqlist = rje_seq.SeqList(self.log,['accnr=F','seqnr=F']+self.cmd_list)
            seqmap = {}
            (sx,stot) = (0.0,seqlist.seqNum())
            for seq in seqlist.seq:
                self.progLog('#SEQMAP','Mappings sequence IDs: %.1f%%' % (sx/stot)); sx += 100.0
                if rje.matchExp('gene:(\S+)\]',seq.info['Name']): seqmap[seq.shortName()] = rje.matchExp('gene:(\S+)\]',seq.info['Name'])[0]
            self.printLog('\r#SEQMAP','Mappings %s sequence IDs complete: %s mapped' % (rje.integerString(stot),rje.integerString(len(seqmap))))
            self.deBug(rje.sortKeys(seqmap)[:100])

            ### ~ [2] ~ Output new data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            goocc = {}
            outfile = string.join(string.split(self.info['OccData'],'.')[:-1] + ['slimfungo','tdt'],'.')
            headers = ['GO','Motif','Type','Seq','Start_Pos','End_Pos','Cons','HomNum']
            for okey in slimocc.keys():
                self.progLog('#NEW','Making new GO occurrences: %s    ' % (rje.integerString(len(slimocc))))
                data = slimocc.pop(okey)
                gene = seq = data['Seq']
                type = 'fwd'
                if string.split(data['Motif'],'_')[-1] in ['rev','scram']:
                    type = string.split(data['Motif'],'_')[-1]
                    data['Motif'] = string.join(string.split(data['Motif'],'_')[:-1],'_')
                motif = data['Motif']   # motif is used below but was never assigned
                if gene not in gomap and gene in seqmap: gene = seqmap[gene]
                golist = []
                if gene in gomap:
                    for id in gomap[gene]: golist += mygo.parents(id)
                else: golist = ['NoGo']
                self.deBug('%s:%s::%s' % (seq,gene,golist))
                for id in rje.sortUnique(golist,False,False):
                    if id not in goocc: goocc[id] = {}
                    if motif not in goocc[id]: goocc[id][motif] = {'fwd':[],'rev':[],'scram':[]}
                    goocc[id][motif][type].append(rje.combineDict({'GO':id,'Type':type},data))
            self.printLog('\r#NEW','Making new GO occurrences complete.    ')

            rje.delimitedFileOutput(self,outfile,headers,rje_backup=True)
            (mx,ox,ix,itot) = (0,0,0.0,len(goocc))
            for id in rje.sortKeys(goocc):
                for motif in rje.sortKeys(goocc[id]):
                    for type in rje.sortKeys(goocc[id][motif]):
                        if len(goocc[id][motif][type]) < self.stat['MinOcc']: goocc[id][motif].pop(type)
                    if len(goocc[id][motif]) < 2 or 'fwd' not in goocc[id][motif]: continue
                    mx += 1
                    for type in goocc[id][motif]:
                        for occ in goocc[id][motif][type]: rje.delimitedFileOutput(self,outfile,headers,datadict=occ); ox += 1
                self.progLog('#OUT','Output to %s: %.2f%% :: %s motifs; %s occ.' % (outfile,ix/itot,rje.integerString(mx),rje.integerString(ox)))
            self.printLog('\r#OUT','Output of occurrences to %s is now complete: %s motifs; %s occ.' % (outfile,rje.integerString(mx),rje.integerString(ox)))

        except:
            self.log.errorLog(rje_zen.Zen().wisdom())
            raise   # Delete this if method error not terrible
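After regrouping occurrences by GO term, the output step above only keeps a GO/motif combination when the real (fwd) occurrences pass MinOcc and at least one rev or scram comparison set does too. A sketch of that filter, assuming goocc maps GO id -> motif -> {'fwd':[...], 'rev':[...], 'scram':[...]} occurrence lists:

def filter_goocc(goocc, minocc):
    """Drop occurrence sets below minocc, then keep motifs with fwd plus at least one other set."""
    kept = {}
    for goid, motifs in goocc.items():
        for motif, types in motifs.items():
            passed = dict((t, occs) for t, occs in types.items() if len(occs) >= minocc)
            if 'fwd' in passed and len(passed) > 1:
                kept.setdefault(goid, {})[motif] = passed
    return kept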
Exemple #19
0
    def makeFlySeq(self):  ### Main run method
        '''Main run method.'''
        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            flybase = rje.makePath('/scratch/Databases/NewDB/FlyBase/Fasta/')
            scmd = ['accnr=F','seqnr=F','gnspacc=F']
            genes = rje_seq.SeqList(self.log, self.cmd_list+['seqin=%sdmel-all-gene-r5.5.fasta' % flybase]+scmd)
            cds = rje_seq.SeqList(self.log, self.cmd_list+['seqin=%sdmel-all-CDS-r5.5.fasta' % flybase]+scmd)
            exons = rje_seq.SeqList(self.log, self.cmd_list+['seqin=%sdmel-all-exon-r5.5.fasta' % flybase]+scmd)

            ### ~ [1] ~ Read in full-length gene and note start and end positions in parent scaffold ~~~~~~~~~~~~~~~~ ###
            genedict = {}   # Dictionary of {ID:Sequence object}
            (gx,gtot) = (0.0,genes.seqNum())
            for gene in genes.seq:
                self.log.printLog('\r#GENE','Processing Gene Annotation: %.1f%%' % (gx/gtot),newline=False,log=False)
                gx += 100
                (id,scaffold,pos,name,glen) = rje.matchExp('^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);',gene.info['Name'])
                if string.atoi(glen) != gene.aaLen(): self.log.errorLog('%s Length mismatch!' % id, printerror=False)
                genedict[id] = gene
                gene.setInfo({'Scaffold':scaffold,'Gene':name})
                try: (end,start) = rje.matchExp('^complement\((\d+)\.\.(\d+)\)',pos)
                except: (start,end) = rje.matchExp('^(\d+)\.\.(\d+)',pos)
                (start,end) = (string.atoi(start),string.atoi(end))
                gene.opt['Complement'] = start > end        # Sequence on "lagging" strand
                gene.setStat({'Start':start,'End':end})
                gene.list['CDS'] = []       # Will add CDS sequences here
                gene.list['Exon'] = []      # Will add exon sequences here
            self.log.printLog('\r#GENE','Processing Gene Annotation complete!')
                           
            ### ~ [2] ~ Read in associated CDS sequences and note start and end positions ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (cx,ctot) = (0.0,cds.seqNum())
            for seq in cds.seq:
                self.log.printLog('\r#CDS','Processing CDS Annotation: %.1f%%' % (cx/ctot),newline=False,log=False)
                cx += 100
                try: (id,scaffold,pos,name,glen,parent) = rje.matchExp('^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);.+parent=(\S+),\S+;',seq.info['Name'])
                except:
                    self.log.errorLog(seq.info['Name'])
                    raise
                if string.atoi(glen) != seq.aaLen(): self.log.errorLog('%s Length mismatch!' % id, printerror=False)
                seq.obj['Parent'] = gene = genedict[parent]
                try: (end,start) = rje.matchExp('^complement\((\d+)\..*\.(\d+)\)',pos)
                except:
                    try: (start,end) = rje.matchExp('^join\((\d+)\..*\.(\d+)\)',pos)
                    except: (start,end) = rje.matchExp('^(\d+)\.\.(\d+)',pos)
                (start,end) = (string.atoi(start),string.atoi(end))
                seq.opt['Complement'] = start > end        # Sequence on "lagging" strand
                seq.setStat({'Start':start,'End':end})
                gene.list['CDS'].append(seq)
            self.log.printLog('\r#CDS','Processing CDS Annotation complete!')
                
            ### ~ [3] ~ Read in associated exons and note start and end positions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (ex,etot) = (0.0,exons.seqNum())
            for seq in exons.seq:
                self.log.printLog('\r#EXON','Processing Exon Annotation: %.1f%%' % (ex/etot),newline=False,log=False)
                ex += 100
                try: (id,scaffold,pos,name,parent) = rje.matchExp('^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+parent=(\S+);',seq.info['Name'])
                except:
                    self.log.errorLog(seq.info['Name'])
                    raise
                seq.obj['Parent'] = gene = genedict[string.split(parent,',')[0]]
                try: (end,start) = rje.matchExp('^complement\((\d+)\..*\.(\d+)\)',pos)
                except:
                    try: (start,end) = rje.matchExp('^join\((\d+)\..*\.(\d+)\)',pos)
                    except: (start,end) = rje.matchExp('^(\d+)\.\.(\d+)',pos)
                (start,end) = (string.atoi(start),string.atoi(end))
                seq.opt['Complement'] = start > end        # Sequence on "lagging" strand
                seq.setStat({'Start':start,'End':end})
                gene.list['Exon'].append(seq)
            self.log.printLog('\r#EXON','Processing Exon Annotation complete!')
                
            ### ~ [4] ~ Regenerate output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [4a] ~ Convert to relative positions and store ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            (gx,gtot) = (0.0,genes.seqNum())
            for gene in genes.seq:
                glen = gene.aaLen()
                self.log.printLog('\r#GENE','Generating new Gene Annotation: %.1f%%' % (gx/gtot),newline=False,log=False)
                gx += 100
                clist = []
                for seq in gene.list['CDS']:
                    if gene.opt['Complement']:  # Must subtract from "wrong" end and reverse
                        start = gene.stat['Start'] - seq.stat['Start']
                        end = gene.stat['Start'] - seq.stat['End']
                    else:
                        start = seq.stat['Start'] - gene.stat['Start']
                        end = seq.stat['End'] - gene.stat['Start']
                    pos = '%s-%s' % (rje.preZero(start,glen),rje.preZero(end,glen))
                    clist.append(pos)
                clist = rje.sortUnique(clist,xreplace=False)
                elist = []
                for seq in gene.list['Exon']:
                    if gene.opt['Complement']:  # Must subtract from "wrong" end and reverse
                        start = gene.stat['Start'] - seq.stat['Start']
                        end = gene.stat['Start'] - seq.stat['End']
                    else:
                        start = seq.stat['Start'] - gene.stat['Start']
                        end = seq.stat['End'] - gene.stat['Start']
                    pos = '%s-%s' % (rje.preZero(start,glen),rje.preZero(end,glen))
                    elist.append(pos)
                elist = rje.sortUnique(elist,xreplace=False)
                gene.info['Name'] = '%s_%s__%s Length=%d; CDS=%s; Exons=%s;' % (gene.info['Gene'],gene.info['SpecCode'],gene.info['AccNum'],gene.aaLen(),string.join(clist,','),string.join(elist,','))
            self.log.printLog('\r#GENE','Generating new Gene Annotation complete!')
            ## ~ [4b] ~ Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            genes.saveFasta(seqfile='flybase_DROME.genes.fas')

        except: self.log.errorLog(rje_zen.Zen().wisdom())
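Step [4a] above converts absolute scaffold coordinates into gene-relative positions, counting from the opposite end when the gene lies on the complement strand. A minimal sketch of that arithmetic, with invented coordinates and plain %-formatting standing in for rje.preZero:

# Sketch: absolute -> gene-relative coordinates, counting from the gene Start,
# which is the larger coordinate when the gene is on the complement strand.
def relative_position(gene_start, feat_start, feat_end, complement, width=5):
    if complement:   # subtract from the gene start (5' end on the lagging strand)
        start, end = gene_start - feat_start, gene_start - feat_end
    else:
        start, end = feat_start - gene_start, feat_end - gene_start
    return '%0*d-%0*d' % (width, start, width, end)

print(relative_position(1000, 1100, 1400, False))   # 00100-00400 (forward strand)
print(relative_position(2000, 1900, 1600, True))    # 00100-00400 (complement strand)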
Exemple #20
0
    def taxaMap(self):  ### Maps species codes onto different taxonomic ranks.
        '''Maps species codes onto different taxonomic ranks.'''
        try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            db = self.db()
            tax = self.obj['Taxonomy']
            ### ~ [2] ~ Add main run code here ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            specdb = self.db('spcode')
            #descdb = self.db('protdesc')
            ranks = ['genus', 'family', 'order', 'class', 'phylum']
            rankmap = {}  # SPCODE to Taxon dictionary
            rankfields = ['protein'] + ranks + specdb.fields()[1:]
            #if descdb: rankfields.append('desc')
            if self.getStrLC('ProtDesc'):
                rankfields.append('desc')
                px = 0
                for prot in self.dict['ProtDesc']:
                    if prot.lower() in ['', 'protein', 'gene']: continue
                    pentry = {
                        'protein': prot,
                        'spcode': 'None',
                        'boot': self.getNum('NoneBoot')
                    }
                    pkey = specdb.makeKey(pentry)
                    if pkey not in specdb.dataKeys():
                        specdb.addEntry(pentry)
                        px += 1
                self.printLog(
                    '#PROT', 'Added %s proteins from %s without trees.' %
                    (rje.iStr(px), self.getStr('ProtDesc')))
            rankdb = db.addEmptyTable('taxamap', rankfields, ['protein'])
            for rank in ranks:
                rankmap[rank] = {
                    'None': 'None',
                    'Unmapped': 'Unmapped',
                    'Uncertain': 'Uncertain'
                }
            taxdb = db.addEmptyTable('taxa',
                                     ['spcode', 'taxid', 'name'] + ranks,
                                     ['spcode'])

            sx = 0.0
            stot = specdb.entryNum()
            for entry in specdb.entries():
                self.progLog('\r#SPEC',
                             'Processing species: %.2f%%' % (sx / stot))
                sx += 100.0
                #if descdb:
                #try: entry['desc'] = descdb.data(descdb.makeKey(entry))['description']
                try:
                    entry['desc'] = self.dict['ProtDesc'][entry['protein']]
                except:
                    entry['desc'] = ''
                for spcode in string.split(entry['spcode'], '|'):
                    if spcode in rankmap['genus']: continue
                    tentry = {'spcode': spcode}
                    try:
                        taxid = tax.mapToTaxID(spcode,
                                               nodeonly=True,
                                               warn=False)[0]
                        rank = tax.dict['Rank'][taxid]
                        tentry['taxid'] = taxid
                        tentry['name'] = tax.getSpecies(taxid)
                    except:
                        self.warnLog(
                            'Unable to map species code "%s" to TaxID -> "Unmapped"'
                            % spcode)
                        taxid = 'Unmapped'
                        rank = 'genus'
                    # Loop through different ranks
                    for ri in range(len(ranks)):
                        nextrank = ranks[ri]
                        while rank not in ranks[ri:] and taxid in tax.dict[
                                'Parent']:
                            taxid = tax.dict['Parent'][taxid]
                            rank = tax.dict['Rank'][taxid]
                            #self.debug('%s: %s' % (tax.dict['Rank'][taxid],tax.getSpecies(taxid)))
                        if taxid in tax.dict['Parent']:
                            taxon = tax.getSpecies(taxid)
                        else:
                            taxon = 'Unmapped'
                        if rank != nextrank:
                            if self.getBool('Monophyly'): taxon = 'Uncertain'
                            else: taxon = '%s %s.' % (taxon, nextrank[:3])
                        rankmap[nextrank][spcode] = taxon
                        tentry[nextrank] = taxon
                    taxdb.addEntry(tentry)
                rentry = {}
                for nextrank in ranks:
                    taxa = []
                    unmapped = ''
                    for spcode in string.split(entry['spcode'], '|'):
                        ranktax = rankmap[nextrank][spcode]
                        if 'unmapped' in ranktax.lower(
                        ) and ranktax not in taxa:
                            if unmapped:
                                self.warnLog('Two Unmapped %s taxa: %s & %s' %
                                             (nextrank, unmapped, ranktax))
                            unmapped = ranktax  #i# Should only be one
                        if ranktax not in taxa: taxa.append(ranktax)
                    if len(taxa) > 1 and 'None' in taxa:
                        self.warnLog('None in: %s' %
                                     string.join(rje.sortUnique(taxa), '|'))
                        taxa.remove('None')
                    if len(taxa) > 1 and unmapped: taxa.remove(unmapped)
                    if len(taxa) > 1 and self.getBool('Monophyly'):
                        rentry[nextrank] = 'Uncertain'
                    else:
                        rentry[nextrank] = string.join(rje.sortUnique(taxa),
                                                       '|')
                rankdb.addEntry(rje.combineDict(rentry, entry))
            self.printLog(
                '\r#SPEC',
                '%s proteins with species codes processed.' % rje.iStr(stot))
            rankdb.saveToFile()
            taxdb.saveToFile()
        except:
            self.errorLog('%s.taxaMap error' % self.prog())
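The inner while-loop climbs the taxonomy until it reaches (or passes) the requested rank. A standalone sketch of that climb, assuming plain dictionaries in place of tax.dict['Parent'], tax.dict['Rank'] and tax.getSpecies(); the TaxIDs and names below are illustrative only:

# Sketch: climb a {child: parent} taxonomy until the current node's rank is at or
# above the requested rank, mirroring the while-loop in the method above.
RANKS = ['genus', 'family', 'order', 'class', 'phylum']

parent = {'9606': '9605', '9605': '9604', '9604': '9443'}   # invented toy tree
rank = {'9606': 'species', '9605': 'genus', '9604': 'family', '9443': 'order'}
name = {'9605': 'Homo', '9604': 'Hominidae', '9443': 'Primates'}

def map_to_rank(taxid, wanted):
    ri = RANKS.index(wanted)
    while rank.get(taxid) not in RANKS[ri:] and taxid in parent:
        taxid = parent[taxid]
    if rank.get(taxid) != wanted:
        return 'Uncertain'          # no node at exactly this rank (cf. Monophyly handling)
    return name.get(taxid, 'Unmapped')

print(map_to_rank('9606', 'genus'))    # Homo
print(map_to_rank('9606', 'family'))   # Hominidae
print(map_to_rank('9606', 'phylum'))   # Uncertain (toy tree runs out below phylum)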
Exemple #21
0
    def taxaMap(self):      ### Maps species codes onto different taxonomic ranks.
        '''Maps species codes onto different taxonomic ranks.'''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            db = self.db()
            tax = self.obj['Taxonomy']
            ### ~ [2] ~ Add main run code here ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            specdb = self.db('spcode')
            #descdb = self.db('protdesc')
            ranks = ['genus','family','order','class','phylum']
            rankmap = {}    # SPCODE to Taxon dictionary
            rankfields = ['protein']+ranks+specdb.fields()[1:]
            #if descdb: rankfields.append('desc')
            if self.getStrLC('ProtDesc'):
                rankfields.append('desc'); px = 0
                for prot in self.dict['ProtDesc']:
                    if prot.lower() in ['','protein','gene']: continue
                    pentry = {'protein':prot,'spcode':'None','boot':self.getNum('NoneBoot')}
                    pkey = specdb.makeKey(pentry)
                    if pkey not in specdb.dataKeys(): specdb.addEntry(pentry); px += 1
                self.printLog('#PROT','Added %s proteins from %s without trees.' % (rje.iStr(px),self.getStr('ProtDesc')))
            rankdb = db.addEmptyTable('taxamap',rankfields,['protein'])
            for rank in ranks: rankmap[rank] = {'None':'None','Unmapped':'Unmapped','Uncertain':'Uncertain'}
            taxdb = db.addEmptyTable('taxa',['spcode','taxid','name']+ranks,['spcode'])

            sx = 0.0; stot = specdb.entryNum()
            for entry in specdb.entries():
                self.progLog('\r#SPEC','Processing species: %.2f%%' % (sx/stot)); sx += 100.0
                #if descdb:
                    #try: entry['desc'] = descdb.data(descdb.makeKey(entry))['description']
                try: entry['desc'] = self.dict['ProtDesc'][entry['protein']]
                except: entry['desc'] = ''
                for spcode in string.split(entry['spcode'],'|'):
                    if spcode in rankmap['genus']: continue
                    tentry = {'spcode':spcode}
                    try:
                        taxid = tax.mapToTaxID(spcode,nodeonly=True,warn=False)[0]
                        rank = tax.dict['Rank'][taxid]
                        tentry['taxid'] = taxid
                        tentry['name'] = tax.getSpecies(taxid)
                    except:
                        self.warnLog('Unable to map species code "%s" to TaxID -> "Unmapped"' % spcode)
                        taxid = 'Unmapped'
                        rank = 'genus'
                    # Loop through different ranks
                    for ri in range(len(ranks)):
                        nextrank = ranks[ri]
                        while rank not in ranks[ri:] and taxid in tax.dict['Parent']:
                            taxid = tax.dict['Parent'][taxid]
                            rank = tax.dict['Rank'][taxid]
                            #self.debug('%s: %s' % (tax.dict['Rank'][taxid],tax.getSpecies(taxid)))
                        if taxid in tax.dict['Parent']: taxon = tax.getSpecies(taxid)
                        else: taxon = 'Unmapped'
                        if rank != nextrank:
                            if self.getBool('Monophyly'): taxon = 'Uncertain'
                            else: taxon = '%s %s.' % (taxon,nextrank[:3])
                        rankmap[nextrank][spcode] = taxon
                        tentry[nextrank] = taxon
                    taxdb.addEntry(tentry)
                rentry = {}
                for nextrank in ranks:
                    taxa = []
                    unmapped = ''
                    for spcode in string.split(entry['spcode'],'|'):
                        ranktax = rankmap[nextrank][spcode]
                        if 'unmapped' in ranktax.lower() and ranktax not in taxa:
                            if unmapped: self.warnLog('Two Unmapped %s taxa: %s & %s' % (nextrank,unmapped,ranktax))
                            unmapped = ranktax   #i# Should only be one
                        if ranktax not in taxa: taxa.append(ranktax)
                    if len(taxa) > 1 and 'None' in taxa:
                        self.warnLog('None in: %s' % string.join(rje.sortUnique(taxa),'|'))
                        taxa.remove('None')
                    if len(taxa) > 1 and unmapped: taxa.remove(unmapped)
                    if len(taxa) > 1 and self.getBool('Monophyly'): rentry[nextrank] = 'Uncertain'
                    else: rentry[nextrank] = string.join(rje.sortUnique(taxa),'|')
                rankdb.addEntry(rje.combineDict(rentry,entry))
            self.printLog('\r#SPEC','%s proteins with species codes processed.' % rje.iStr(stot))
            rankdb.saveToFile()
            taxdb.saveToFile()
        except: self.errorLog('%s.taxaMap error' %  self.prog())
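The final rentry loop collapses the per-species-code taxa for each protein at each rank: 'None' and 'Unmapped' labels are dropped when real taxa remain, and any residual conflict becomes 'Uncertain' under monophyly=T or a '|'-joined list otherwise. A sketch of that rule set with invented labels:

# Sketch: collapse the per-species-code taxa for one protein at one rank,
# following the None/Unmapped/Monophyly rules from the method above.
def collapse_taxa(ranktax_list, monophyly):
    taxa = []
    unmapped = ''
    for ranktax in ranktax_list:
        if 'unmapped' in ranktax.lower() and ranktax not in taxa:
            unmapped = ranktax
        if ranktax not in taxa:
            taxa.append(ranktax)
    if len(taxa) > 1 and 'None' in taxa:
        taxa.remove('None')
    if len(taxa) > 1 and unmapped:
        taxa.remove(unmapped)
    if len(taxa) > 1 and monophyly:
        return 'Uncertain'
    return '|'.join(sorted(taxa))

print(collapse_taxa(['Homo', 'Unmapped', 'None'], False))   # Homo
print(collapse_taxa(['Homo', 'Pan'], False))                # Homo|Pan
print(collapse_taxa(['Homo', 'Pan'], True))                 # Uncertain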
Exemple #22
0
    def makeFlySeq(self):  ### Main run method
        '''Main run method.'''
        try:  ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            flybase = rje.makePath('/scratch/Databases/NewDB/FlyBase/Fasta/')
            scmd = ['accnr=F', 'seqnr=F', 'gnspacc=F']
            genes = rje_seq.SeqList(
                self.log, self.cmd_list +
                ['seqin=%sdmel-all-gene-r5.5.fasta' % flybase] + scmd)
            cds = rje_seq.SeqList(
                self.log, self.cmd_list +
                ['seqin=%sdmel-all-CDS-r5.5.fasta' % flybase] + scmd)
            exons = rje_seq.SeqList(
                self.log, self.cmd_list +
                ['seqin=%sdmel-all-exon-r5.5.fasta' % flybase] + scmd)

            ### ~ [1] ~	Read in full-length gene and note start and end positions in parent scaffold ~~~~~~~~~~~~~~~~ ###
            genedict = {}  # Dictionary of {ID:Sequence object}
            (gx, gtot) = (0.0, genes.seqNum())
            for gene in genes.seq:
                self.log.printLog('\r#GENE',
                                  'Processing Gene Annotation: %.1f%%' %
                                  (gx / gtot),
                                  newline=False,
                                  log=False)
                gx += 100
                (id, scaffold, pos, name, glen) = rje.matchExp(
                    '^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);',
                    gene.info['Name'])
                if string.atoi(glen) != gene.aaLen():
                    self.log.errorLog('%s Length mismatch!' % id,
                                      printerror=False)
                genedict[id] = gene
                gene.setInfo({'Scaffold': scaffold, 'Gene': name})
                try:
                    (end,
                     start) = rje.matchExp('^complement\((\d+)\.\.(\d+)\)',
                                           pos)
                except:
                    (start, end) = rje.matchExp('^(\d+)\.\.(\d+)', pos)
                (start, end) = (string.atoi(start), string.atoi(end))
                gene.opt[
                    'Complement'] = start > end  # Sequence on "lagging" strand
                gene.setStat({'Start': start, 'End': end})
                gene.list['CDS'] = []  # Will add CDS sequences here
                gene.list['Exon'] = []  # Will add exon sequences here
            self.log.printLog('\r#GENE',
                              'Processing Gene Annotation complete!')

            ### ~ [2] ~ Read in associated CDS sequences and note start and end positions ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (cx, ctot) = (0.0, cds.seqNum())
            for seq in cds.seq:
                self.log.printLog('\r#CDS',
                                  'Processing CDS Annotation: %.1f%%' %
                                  (cx / ctot),
                                  newline=False,
                                  log=False)
                cx += 100
                try:
                    (id, scaffold, pos, name, glen, parent) = rje.matchExp(
                        '^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);.+parent=(\S+),\S+;',
                        seq.info['Name'])
                except:
                    self.log.errorLog(seq.info['Name'])
                    raise
                if string.atoi(glen) != seq.aaLen():
                    self.log.errorLog('%s Length mismatch!' % id,
                                      printerror=False)
                seq.obj['Parent'] = gene = genedict[parent]
                try:
                    (end,
                     start) = rje.matchExp('^complement\((\d+)\..*\.(\d+)\)',
                                           pos)
                except:
                    try:
                        (start,
                         end) = rje.matchExp('^join\((\d+)\..*\.(\d+)\)', pos)
                    except:
                        (start, end) = rje.matchExp('^(\d+)\.\.(\d+)', pos)
                (start, end) = (string.atoi(start), string.atoi(end))
                seq.opt[
                    'Complement'] = start > end  # Sequence on "lagging" strand
                seq.setStat({'Start': start, 'End': end})
                gene.list['CDS'].append(seq)
            self.log.printLog('\r#CDS', 'Processing CDS Annotation complete!')

            ### ~ [3] ~ Read in associated exons and note start and end positions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (ex, etot) = (0.0, exons.seqNum())
            for seq in exons.seq:
                self.log.printLog('\r#EXON',
                                  'Processing Exon Annotation: %.1f%%' %
                                  (ex / etot),
                                  newline=False,
                                  log=False)
                ex += 100
                try:
                    (id, scaffold, pos, name, parent) = rje.matchExp(
                        '^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+parent=(\S+);',
                        seq.info['Name'])
                except:
                    self.log.errorLog(seq.info['Name'])
                    raise
                seq.obj['Parent'] = gene = genedict[string.split(parent,
                                                                 ',')[0]]
                try:
                    (end,
                     start) = rje.matchExp('^complement\((\d+)\..*\.(\d+)\)',
                                           pos)
                except:
                    try:
                        (start,
                         end) = rje.matchExp('^join\((\d+)\..*\.(\d+)\)', pos)
                    except:
                        (start, end) = rje.matchExp('^(\d+)\.\.(\d+)', pos)
                (start, end) = (string.atoi(start), string.atoi(end))
                seq.opt[
                    'Complement'] = start > end  # Sequence on "lagging" strand
                seq.setStat({'Start': start, 'End': end})
                gene.list['Exon'].append(seq)
            self.log.printLog('\r#EXON',
                              'Processing Exon Annotation complete!')

            ### ~ [4] ~ Regenerate output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [4a] ~ Convert to relative positions and store ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            (gx, gtot) = (0.0, genes.seqNum())
            for gene in genes.seq:
                glen = gene.aaLen()
                self.log.printLog('\r#GENE',
                                  'Generating new Gene Annotation: %.1f%%' %
                                  (gx / gtot),
                                  newline=False,
                                  log=False)
                gx += 100
                clist = []
                for seq in gene.list['CDS']:
                    if gene.opt[
                            'Complement']:  # Must subtract from "wrong" end and reverse
                        start = gene.stat['Start'] - seq.stat['Start']
                        end = gene.stat['Start'] - seq.stat['End']
                    else:
                        start = seq.stat['Start'] - gene.stat['Start']
                        end = seq.stat['End'] - gene.stat['Start']
                    pos = '%s-%s' % (rje.preZero(start,
                                                 glen), rje.preZero(end, glen))
                    clist.append(pos)
                clist = rje.sortUnique(clist, xreplace=False)
                elist = []
                for seq in gene.list['Exon']:
                    if gene.opt[
                            'Complement']:  # Must subtract from "wrong" end and reverse
                        start = gene.stat['Start'] - seq.stat['Start']
                        end = gene.stat['Start'] - seq.stat['End']
                    else:
                        start = seq.stat['Start'] - gene.stat['Start']
                        end = seq.stat['End'] - gene.stat['Start']
                    pos = '%s-%s' % (rje.preZero(start,
                                                 glen), rje.preZero(end, glen))
                    elist.append(pos)
                elist = rje.sortUnique(elist, xreplace=False)
                gene.info[
                    'Name'] = '%s_%s__%s Length=%d; CDS=%s; Exons=%s;' % (
                        gene.info['Gene'], gene.info['SpecCode'],
                        gene.info['AccNum'], gene.aaLen(),
                        string.join(clist, ','), string.join(elist, ','))
            self.log.printLog('\r#GENE',
                              'Generating new Gene Annotation complete!')
            ## ~ [4b] ~ Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            genes.saveFasta(seqfile='flybase_DROME.genes.fas')

        except:
            self.log.errorLog(rje_zen.Zen().wisdom())
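The header parsing above can be reproduced with the re module directly. A sketch using an invented FlyBase-style header; the complement(...) and join(...) location strings are resolved the same way as in the method:

import re

# Sketch: parse a FlyBase-style FASTA header (invented here) and resolve the
# complement(...)/join(...) location string into start/end integers.
header = ('FBgn0000000 type=gene; loc=2L:complement(7529..9484); '
          'name=fake-gene; length=1956;')
(seqid, scaffold, pos, name, length) = re.match(
    r'^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);', header).groups()
cmatch = re.match(r'^complement\((\d+)\.\.(\d+)\)', pos)
if cmatch:                       # complement: the 5' end is the larger coordinate
    end, start = int(cmatch.group(1)), int(cmatch.group(2))
else:
    jmatch = (re.match(r'^join\((\d+)\..*\.(\d+)\)', pos)
              or re.match(r'^(\d+)\.\.(\d+)', pos))
    start, end = int(jmatch.group(1)), int(jmatch.group(2))
print('%s %s %s %s start=%d end=%d complement=%s'
      % (seqid, scaffold, name, length, start, end, start > end))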
Exemple #23
0
    def iTRAQSamples(self): ### Uses self.dict['Samples'] and self.db('itraq') to summarise hit data
        '''Uses self.dict['Samples'] and self.db('itraq') to summarise hit data.'''
        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            db = self.db(); idb = self.db('itraq')
            mdb = db.copyTable(idb,'itraq_summary')
            gdb = db.copyTable(idb,'itraq_geomean')
            ### ~ [1] Reformat Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            mdb.dropField('geomean'); gdb.dropField('ratio'); gdb.renameField('geomean','ratio')
            for sdb in [mdb,gdb]:
                sdb.dropField('summary')
                sdb.dropEntriesDirect('ratio','---')
                sdb.dropEntriesDirect('ratio','NN')
                sdb.dataFormat({'ratio':'num','n':'int'})
                ## ~ [1a] Drop isotags without Sample info ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                (ex,etot) = (0.0,sdb.entryNum())
                for entry in sdb.entries():
                    self.progLog('\r#ITRAQ','Drop isotags without Sample info: %.2f%%' % (ex/etot)); ex += 100.0
                    tags = string.split(entry['itraq'],'/')
                    if tags[0] not in self.dict['Samples'] or tags[1] not in self.dict['Samples']: sdb.dropEntry(entry)
                self.printLog('\r#ITRAQ','Dropped all isotags without Sample info: %s of %s entries remain' % (rje.iStr(sdb.entryNum()),rje.iStr(etot)))
                ## ~ [1b] Reshape, rename, invert and remove redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                sdb.reshapeWide('itraq',['ratio','n'])
                samples = rje.sortUnique(self.dict['Samples'].values())
                ratios = []
                self.printLog('#SAMP',string.join(samples,', '))
                for s1 in samples:
                    for s2 in samples[samples.index(s1):]:
                        newfield = '%s/%s' % (s1,s2)
                        sdb.addField(newfield)
                        sdb.addField('%s_Min' % newfield)
                        sdb.addField('%s_Max' % newfield)
                        sdb.addField('%s_Dirn' % newfield)
                        ratios.append(newfield)
                        for entry in sdb.entries(): entry[newfield] = []
                for field in sdb.fields():
                    if '|' in field:
                        (score,tags) = string.split(field,'|')
                        tag = string.split(tags,'/')
                        if int(tag[0]) > int(tag[1]):   ### Invert
                            newfield = '%s|%s/%s' % (score,tag[1],tag[0])
                            if newfield in sdb.fields(): sdb.dropField(newfield); continue
                            sdb.renameField(field,newfield)
                            if score == 'ratio':
                                for entry in sdb.entries():
                                    if entry[newfield]: entry[newfield] = 1.0 / entry[newfield]
                            tag = (tag[1],tag[0])
                            field = newfield
                        s1 = self.dict['Samples'][tag[0]]
                        s2 = self.dict['Samples'][tag[1]]
                        newname = '%s|%s%s/%s%s' % (score,s1,tag[0],s2,tag[1])
                        sdb.renameField(field,newname)
                        if score == 'n': continue
                        newfield = '%s/%s' % (s1,s2)
                        invfield = '%s/%s' % (s2,s1)
                        for entry in sdb.entries():
                            if entry[newname] and newfield in sdb.fields(): entry[newfield].append(entry[newname])
                            elif entry[newname]: entry[invfield].append(1.0/entry[newname])
                ## ~ [1c] Calculate Geometric mean ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                (ex,etot) = (0.0,sdb.entryNum())
                for entry in sdb.entries():
                    self.progLog('\r#GEO','Calculating Geometric means: %.2f%%' % (ex/etot)); ex += 100.0
                    for ratio in ratios:
                        if entry[ratio]:
                            entry['%s_Min' % ratio] = min(entry[ratio])
                            entry['%s_Max' % ratio] = max(entry[ratio])
                            try: entry[ratio] = rje.geoMean(entry[ratio])
                            except: self.deBug(entry)
                            if entry[ratio] > 1 and entry['%s_Min' % ratio] > 1: entry['%s_Dirn' % ratio] = 'UP'
                            elif entry[ratio] < 1 and entry['%s_Max' % ratio] < 1: entry['%s_Dirn' % ratio] = 'DOWN'
                        else: entry['%s_Dirn' % ratio] = entry['%s_Min' % ratio] = entry['%s_Max' % ratio] = entry[ratio] = ''
                self.printLog('\r#GEO','Geometric mean calculations complete')
                sdb.saveToFile()
        except: self.errorLog('iTRAQSamples error')
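Step [1c] condenses each list of pairwise ratios into a geometric mean plus an UP/DOWN call that is only made when every individual ratio agrees in direction. A sketch of that calculation using the standard library in place of rje.geoMean, with made-up ratio lists:

import math

# Sketch: geometric mean of a list of iTRAQ-style ratios, with the UP/DOWN call
# only made when every individual ratio agrees with the mean's direction.
def summarise_ratios(ratios):
    if not ratios:
        return {'ratio': '', 'min': '', 'max': '', 'dirn': ''}
    geomean = math.exp(sum(math.log(r) for r in ratios) / len(ratios))
    lo, hi = min(ratios), max(ratios)
    dirn = ''
    if geomean > 1 and lo > 1:
        dirn = 'UP'
    elif geomean < 1 and hi < 1:
        dirn = 'DOWN'
    return {'ratio': geomean, 'min': lo, 'max': hi, 'dirn': dirn}

print(summarise_ratios([1.4, 2.0, 1.1]))    # all > 1 -> UP
print(summarise_ratios([0.8, 1.3]))         # mixed directions -> no call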
Exemple #24
0
    def run(self):  ### Main run method
        '''Main run method.'''
        try:  ### ~ [1] Reformat Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for fasta in glob.glob('*.fasta'):
                fas = fasta[:-2]
                if os.path.exists(fas): continue
                sx = 0
                for line in open(fasta, 'r').readlines():
                    if line[:1] == '>':
                        try:
                            (name,
                             desc) = rje.matchExp('^>(\S+) (\S.+)$', line)
                        except:
                            name = rje.matchExp('^>(\S+)', line)[0]
                            desc = ''
                        if len(string.split(name, '|')) == 3:
                            name = '6rf_NEIME__%s' % string.split(name, '|')[2]
                            open(fas, 'a').write('>%s\n' % name)
                        elif len(string.split(name, '|')) == 5:
                            name = 'ref_NEIME__%s' % string.split(name, '|')[3]
                            open(fas, 'a').write('>%s %s\n' % (name, desc))
                        else:
                            print string.split(name, '|')
                            raise ValueError
                        self.progLog(
                            '\r#FAS', 'Processing %s: %s seqs' %
                            (fas, rje.integerString(sx)))
                        sx += 1
                    else:
                        open(fas, 'a').write(line)
                self.printLog(
                    '\r#FAS', 'Processed %s: %s seqs from %s' %
                    (fas, rje.integerString(sx), fasta))
                rje_blast.BLASTRun(self.log,
                                   self.cmd_list).formatDB(fas,
                                                           protein=True,
                                                           force=True)
            ### ~ [2] Read in CSV Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            rfhits = {}  # Dictionary of {hit:['File:hit_num']}
            acc = 'MC58_6RF_Hits.acc'
            open(acc, 'w')
            gfile = 'MC58_6RF_Hits.vs.MC58_1.hitsum.tdt'
            cx = 0
            for csv in glob.glob('MC58_6RF_CSV/*.CSV'):
                cx += 1
                file = os.path.basename(csv)[:-4]
                hits = False
                for line in open(csv, 'r').readlines():
                    if line.find('prot_hit_num,prot_acc') == 0: hits = True
                    elif hits:
                        data = rje.readDelimit(line, ',')
                        if len(data) < 2: continue
                        [num, name] = data[:2]
                        try:
                            name = string.split(name, '|')[2]
                        except:
                            continue
                        if name not in rfhits:
                            open(acc, 'a').write('6rf_NEIME__%s\n' % name)
                            rfhits[name] = []
                        id = '%s:%s' % (file, num)
                        if id not in rfhits[name]: rfhits[name].append(id)
                        self.progLog(
                            '\r#CSV', 'Reading %d CSV files: %s 6RF Hits' %
                            (cx, rje.integerString(len(rfhits))))
            self.printLog(
                '\r#CSV', 'Read %d CSV files: %s 6RF Hits output to %s' %
                (cx, rje.integerString(len(rfhits)), acc))
            ### ~ [3] Extract sequences and perform GABLAM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if not os.path.exists(gfile):
                seqlist = rje_seq.SeqList(
                    self.log, self.cmd_list + [
                        'seqin=%s' % acc, 'fasdb=MC58_6RF.fas',
                        'seqout=MC58_6RF_Hits.fas', 'autoload=T', 'accnr=F',
                        'seqnr=F'
                    ])
                seqlist.info['Name'] = 'MC58_6RF_Hits.fas'
                seqlist.saveFasta()
                gablam.GABLAM(
                    self.log, self.cmd_list + [
                        'seqin=MC58_6RF_Hits.fas', 'searchdb=MC58_1.fas',
                        'qryacc=F'
                    ]).gablam()
            ### ~ [4] Read in GABLAM and ID Hits without genomic homology ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            gdata = rje.dataDict(self, gfile, ['Qry'], ['HitNum'])
            zeros = []
            for hit in gdata:
                if string.atoi(gdata[hit]['HitNum']) == 0: zeros.append(hit)
            zeros = rje.sortUnique(zeros, False)
            open('6rf_zeros.acc', 'w').write(string.join(zeros, '\n'))
            self.printLog(
                '#ZERO',
                '%d 6RF hits with 0 BLAST hits to MC58_1' % len(zeros))
            ufile = 'MC58_6RF_Zeros.vs.embl_bacteria.hitsum.tdt'
            if not os.path.exists(ufile):
                seqlist = rje_seq.SeqList(
                    self.log, self.cmd_list + [
                        'seqin=6rf_zeros.acc', 'fasdb=MC58_6RF.fas',
                        'seqout=MC58_6RF_Zeros.fas', 'autoload=T', 'accnr=F',
                        'seqnr=F'
                    ])
                seqlist.info['Name'] = 'MC58_6RF_Zeros.fas'
                seqlist.saveFasta()
                gablam.GABLAM(
                    self.log, self.cmd_list + [
                        'seqin=MC58_6RF_Zeros.fas',
                        'searchdb=/scratch/Databases/NewDB/TaxaDB/embl_bacteria.fas',
                        'qryacc=F'
                    ]).gablam()
            gdata = rje.dataDict(self, ufile, ['Qry'], getheaders=True)
            fdata = rje.dataDict(self,
                                 string.replace(ufile, 'hitsum', 'gablam'),
                                 ['Qry'], ['Hit'],
                                 lists=True)
            headers = gdata.pop('Headers')
            headers.insert(1, 'Sample')
            headers.append('BestHit')
            rje.delimitedFileOutput(self,
                                    'MC58_6RF_Zeros.tdt',
                                    headers,
                                    rje_backup=True)
            for rf in rje.sortKeys(gdata):
                rfcut = string.split(rf, '__')[1]
                gdata[rf]['Sample'] = string.join(rfhits[rfcut], '; ')
                gdata[rf]['Qry'] = rfcut
                try:
                    gdata[rf]['BestHit'] = fdata[rf]['Hit'][0]
                except:
                    gdata[rf]['BestHit'] = '-'
                rje.delimitedFileOutput(self,
                                        'MC58_6RF_Zeros.tdt',
                                        headers,
                                        datadict=gdata[rf])

        except:
            self.errorLog(rje_zen.Zen().wisdom())
        self.printLog('#ZEN', rje_zen.Zen().wisdom())
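Section [2] of this example skips CSV lines until the 'prot_hit_num,prot_acc' header and then records one 'file:hit_num' reference per hit accession. A standalone sketch of that parsing; the rows and accession format below are invented for illustration:

# Sketch: collect {accession: ['file:hit_num', ...]} from a Mascot-style CSV export,
# ignoring everything above the 'prot_hit_num,prot_acc' header line.
example = [
    'Some preamble line',
    'prot_hit_num,prot_acc,prot_desc',
    '1,gnl|MC58_6RF|ORF00001,hypothetical ORF',
    '2,gnl|MC58_6RF|ORF00042,another ORF',
]

rfhits = {}   # {accession: ['file:hit_num']}
hits = False
for line in example:
    if line.startswith('prot_hit_num,prot_acc'):
        hits = True
        continue
    if not hits:
        continue
    data = line.split(',')
    if len(data) < 2:
        continue
    num, name = data[0], data[1]
    try:
        acc = name.split('|')[2]    # third pipe-separated field, as in the method above
    except IndexError:
        continue
    rfhits.setdefault(acc, []).append('example.csv:%s' % num)

print(rfhits)   # {'ORF00001': ['example.csv:1'], 'ORF00042': ['example.csv:2']}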