def readResults(self, clear=True, readaln=False):  ### Reads results from self.list['HMMRes'] into objects
    '''
    Reads results from self.list['HMMRes'] into objects.
    >> clear:boolean = whether to clear self.search before reading [True]
    >> readaln:boolean = whether to bother reading Alignments into objects [False]
    << returns None on success; False if an error was handled.
    '''
    try:
        if clear: self.search = []
        for resfile in rje.sortUnique(self.list['HMMRes'], xreplace=False):
            # If only a gzipped copy of the results exists, unzip it first (GZip mode only)
            if not os.path.exists(resfile) and self.opt['GZip'] and os.path.exists('%s.gz' % resfile):
                os.system('gunzip %s.gz' % resfile)
                self.printLog('#GUNZIP', 'Gunzipped %s.gz' % resfile)
            # Dispatch to the appropriate parser for hmmpfam vs hmmsearch output
            if self.opt['HMMPFam']: self.readHMMPFamSearch(resfile, readaln)
            else: self.readHMMSearch(resfile, readaln)
            # Re-compress the results file after reading to save disk space
            if self.opt['GZip'] and os.path.exists(resfile):
                rje.backup(self, '%s.gz' % resfile, unlink=True)  # Remove any stale .gz before gzipping
                os.system('gzip %s' % resfile)
                self.printLog('#GZIP', '%s gzipped to save space' % resfile)
    except:
        self.log.errorLog('Hmm indeed. rje_hmm.readResults() gone awry!', quitchoice=True)
        return False
def dpi(self):  ### Domain-protein interactions
    '''
    Domain-protein interactions. For each domain, pools the PPI of all hub proteins carrying that domain,
    keeps spokes seen for 2+ hubs, removes those spokes from the individual hub PPI lists (and the
    reciprocal links), and outputs the accession numbers of the domain-level interactors.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not self.dict['Domain']: return
        outdir = 'SLiMPID_DPI'
        rje.mkDir(self, outdir)
        dpi = {}      # Dictionary of {domain:[interactors]}
        badname = []  # Interactor names with no entry in self.dict['Seq']
        ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for dom in rje.sortKeys(self.dict['Domain']):
            dpi[dom] = []
            for hub in self.dict['Domain'][dom]:
                if hub in self.dict['PPI']: dpi[dom] += self.dict['PPI'][hub]  # Add with redundancy
            for spoke in dpi[dom][0:]:
                if dpi[dom].count(spoke) == 1: dpi[dom].remove(spoke)  # Must have 2+ domain interactions
            # Strip domain-level interactors from individual hub PPI lists (both directions)
            for hub in self.dict['Domain'][dom]:
                if hub not in self.dict['PPI']: continue
                for spoke in self.dict['PPI'][hub][0:]:
                    if spoke in dpi[dom]:
                        self.dict['PPI'][hub].remove(spoke)
                        # Also remove the reciprocal spoke->hub link if present
                        if spoke in self.dict['PPI'] and hub in self.dict['PPI'][spoke]: self.dict['PPI'][spoke].remove(hub)
            dpi[dom] = rje.sortUnique(dpi[dom], False, False)
            # Map interactor names to accession numbers for output
            acc = []
            for name in dpi[dom]:
                if not name: continue
                if name in self.dict['Seq']: acc.append(self.dict['Seq'][name].info['AccNum'])
                elif name not in badname: badname.append(name)
            open('%s/%s.dpi.acc' % (outdir, dom), 'w').write(string.join(acc, '\n'))
            self.printLog('#DPI', '%s domain => %d interactors' % (dom, len(acc)))
        if badname:
            badname.sort()
            self.printLog('#BAD', '%d "bad" protein names: %s' % (len(badname), string.join(badname, '; ')))
        ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Drop hubs whose PPI lists were emptied by the removals above
        hx = len(self.dict['PPI'])
        for hub in rje.sortKeys(self.dict['PPI']):
            if hub and self.dict['PPI'][hub]: continue
            self.dict['PPI'].pop(hub)
            self.printLog('#DPI', 'No %s PPI left after DPI removed' % hub, screen=False)
        self.printLog('#PPX', '%s of %s PPI hubs remain after DPI removed' % (rje.integerString(len(self.dict['PPI'])), rje.integerString(hx)))
    except: self.errorLog('Problem with SLiMPID.dpi()', quitchoice=True)
def powerGO(self, numbers, sig=0.01, samples='all', total='Total', countkey='counts', ignore=None):  ### Special GO power calculation for GO slim set
    '''
    Special GO power calculation for GO slim set.
    >> numbers:dictionary of {Sample:Count}
    >> sig:float [0.01] = Desired significance level to achieve. Currently uncorrected. Add Bonf/FDR with time.
    >> samples:str ['all'] = Whether sig must be achievable for 'any' or 'all' samples.
    >> total:str ['Total'] = Sample containing Total counts to compare against
    >> countkey:str ['counts'] = Key identifying count dictionary for each GO term and 'total' count sample - self.go(id)[countkey] = {Sample:count}
    >> ignore:list of Samples to ignore from calculation [None]
    << returns a list of GO IDs that meet criteria
    '''
    try:
        ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if ignore is None: ignore = []  # Avoid mutable default argument; None means "ignore nothing"
        N = numbers[total]  # Total count for calculating expectations/probabilities
        nlist = []          # List of counts for subsamples to be assessed
        for sample in numbers:
            if sample not in ignore + [total]: nlist.append(numbers[sample])
        nlist = rje.sortUnique(nlist, xreplace=False, num=True)
        ### ~ [2] ~ Generate Power Range ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        plist = []  # List of acceptable Total counts for subset
        nx = 0.0    # Progress accumulator: 100 * (iterations so far)
        for i in range(1, N + 1):  # Look at all possible levels of occurrence
            self.progLog('#POW', 'Calculating GO term power: %.1f%%' % (nx / N))
            nx += 100.0
            ok = 0
            p = float(i) / N  # Probability of each gene having this term
            for n in nlist:   # Look at each subset
                k1 = min(i, n)            # Want to look at largest possible count for sample-term pairing
                k2 = max(0, n - (N - i))  # Also want to look at the likelihood of under-representation
                # Over-representation: P(X >= k1) <= sig; under-representation: P(X <= k2) = 1 - P(X >= k2+1) <= sig
                if rje.binomial(k1, n, p, callobj=self) <= sig: ok += 1
                elif (1 - rje.binomial(k2 + 1, n, p, callobj=self)) <= sig: ok += 1
                #!# if ok and samples == 'any': break
            if (ok and samples == 'any') or ok == len(nlist): plist.append(i)
        self.printLog('\r#POW', 'Calculation of GO term power complete.', log=False)
        self.deBug(nlist)
        ### ~ [3] ~ Generate GO Slim ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        terms = []
        (ix, itot) = (0.0, len(self.go()))
        for goid in rje.sortKeys(self.go()):  # Renamed from 'id' to avoid shadowing the builtin
            self.progLog('#POW', 'Assessing terms for power: %.1f%% (%s terms)' % (ix / itot, rje.iLen(terms)))
            ix += 100.0
            if self.go(goid)[countkey][total] in plist: terms.append(goid)
        self.printLog('\r#POW', 'Assessed terms for statistical power, p <= %s: %s GO terms' % (sig, rje.iLen(terms)))  #!# Add correction terms #!#
        self.deBug(terms)
        return terms
    except:
        self.errorLog('Major problem with GO.powerGO()')
        return []
def fpi(self):  ### Family-protein interactions
    '''
    Family-protein interactions. For each query protein with a family of 2+ members, pools the PPI of
    all family members, keeps spokes seen for 2+ members, removes those spokes from the individual
    member PPI lists (and the reciprocal links), and outputs family-level interactor accession numbers.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not self.dict['Domain']: return
        outdir = 'SLiMPID_FPI'
        rje.mkDir(self, outdir)
        fpi = {}      # Dictionary of {family:[interactors]}
        badname = []  # Interactor names with no entry in self.dict['Seq']
        ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for qry in rje.sortKeys(self.dict['PPI']):
            try:
                fam = self.dict['Fam'][qry]
                if len(fam) < 2: continue  # Need 2+ family members for family-level analysis
            except: self.errorLog('Problem with "%s" protein family' % qry); continue
            fpi[qry] = []
            for hub in fam:
                if hub not in self.dict['PPI']: continue
                fpi[qry] += self.dict['PPI'][hub]  # Add with redundancy
            for spoke in fpi[qry][0:]:
                if fpi[qry].count(spoke) == 1: fpi[qry].remove(spoke)  # Must have 2+ family interactions
            # Strip family-level interactors from individual member PPI lists (both directions)
            for hub in fam:
                if hub not in self.dict['PPI']: continue
                for spoke in self.dict['PPI'][hub][0:]:
                    if spoke in fpi[qry]:
                        self.dict['PPI'][hub].remove(spoke)
                        # Also remove the reciprocal spoke->hub link if present
                        if spoke in self.dict['PPI'] and hub in self.dict['PPI'][spoke]: self.dict['PPI'][spoke].remove(hub)
            fpi[qry] = rje.sortUnique(fpi[qry], False, False)
            # Map interactor names to accession numbers for output
            acc = []
            gene = self.dict['Gene'][qry]
            for name in fpi[qry]:
                if not name: continue
                if name in self.dict['Seq']: acc.append(self.dict['Seq'][name].info['AccNum'])
                elif name not in badname: badname.append(name)
            open('%s/%s.fpi.acc' % (outdir, gene), 'w').write(string.join(acc, '\n'))
            self.printLog('#FPI', '%s family => %d interactors' % (gene, len(acc)))
        if badname:
            badname.sort()
            self.printLog('#BAD', '%d "bad" protein names: %s' % (len(badname), string.join(badname, '; ')))
        ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Drop hubs whose PPI lists were emptied by the removals above
        hx = len(self.dict['PPI'])
        for hub in rje.sortKeys(self.dict['PPI']):
            if hub and self.dict['PPI'][hub]: continue
            self.dict['PPI'].pop(hub)
            self.printLog('#FPI', 'No %s PPI left after FPI removed' % hub)
        self.printLog('#PPX', '%s of %s PPI hubs remain after FPI removed' % (rje.integerString(len(self.dict['PPI'])), rje.integerString(hx)))
    except: self.errorLog('Problem with SLiMPID.fpi()', quitchoice=True)
def readResults(self, clear=True, readaln=False):  ### Reads results from self.list['HMMRes'] into objects
    '''
    Reads results from self.list['HMMRes'] into objects.
    >> clear:boolean = whether to clear self.search before reading [True]
    >> readaln:boolean = whether to bother reading Alignments into objects [False]
    '''
    try:
        if clear:
            self.search = []
        for hmmfile in rje.sortUnique(self.list['HMMRes'], xreplace=False):
            # Restore a gzipped results file when the plain copy is missing
            zipped = os.path.exists('%s.gz' % hmmfile)
            if not os.path.exists(hmmfile) and self.opt['GZip'] and zipped:
                os.system('gunzip %s.gz' % hmmfile)
                self.printLog('#GUNZIP', 'Gunzipped %s.gz' % hmmfile)
            # Choose the parser matching the HMMER program that made the file
            if self.opt['HMMPFam']:
                self.readHMMPFamSearch(hmmfile, readaln)
            else:
                self.readHMMSearch(hmmfile, readaln)
            # Compress the file again once its contents have been read
            if self.opt['GZip'] and os.path.exists(hmmfile):
                rje.backup(self, '%s.gz' % hmmfile, unlink=True)
                os.system('gzip %s' % hmmfile)
                self.printLog('#GZIP', '%s gzipped to save space' % hmmfile)
    except:
        self.log.errorLog('Hmm indeed. rje_hmm.readResults() gone awry!', quitchoice=True)
        return False
def dpi(self):  ### Domain-protein interactions
    '''
    Domain-protein interactions. Pools PPI across all hubs sharing a domain, keeps spokes interacting
    with 2+ such hubs, removes these domain-level spokes (and reciprocal links) from the per-hub PPI
    lists, and writes the accession numbers of each domain's interactors to SLiMPID_DPI/.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not self.dict['Domain']: return
        outdir = 'SLiMPID_DPI'
        rje.mkDir(self, outdir)
        dpi = {}      # Dictionary of {domain:[interactors]}
        badname = []  # Interactor names missing from self.dict['Seq']
        ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for dom in rje.sortKeys(self.dict['Domain']):
            dpi[dom] = []
            for hub in self.dict['Domain'][dom]:
                if hub in self.dict['PPI']: dpi[dom] += self.dict['PPI'][hub]  # Add with redundancy
            for spoke in dpi[dom][0:]:
                if dpi[dom].count(spoke) == 1: dpi[dom].remove(spoke)  # Must have 2+ domain interactions
            # Remove domain-level interactors from individual hub PPI (and the reciprocal direction)
            for hub in self.dict['Domain'][dom]:
                if hub not in self.dict['PPI']: continue
                for spoke in self.dict['PPI'][hub][0:]:
                    if spoke in dpi[dom]:
                        self.dict['PPI'][hub].remove(spoke)
                        if spoke in self.dict['PPI'] and hub in self.dict['PPI'][spoke]: self.dict['PPI'][spoke].remove(hub)
            dpi[dom] = rje.sortUnique(dpi[dom], False, False)
            # Convert names to accession numbers for file output
            acc = []
            for name in dpi[dom]:
                if not name: continue
                if name in self.dict['Seq']: acc.append(self.dict['Seq'][name].info['AccNum'])
                elif name not in badname: badname.append(name)
            open('%s/%s.dpi.acc' % (outdir, dom), 'w').write(string.join(acc, '\n'))
            self.printLog('#DPI', '%s domain => %d interactors' % (dom, len(acc)))
        if badname:
            badname.sort()
            self.printLog('#BAD', '%d "bad" protein names: %s' % (len(badname), string.join(badname, '; ')))
        ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Drop hubs left without any PPI after the removals above
        hx = len(self.dict['PPI'])
        for hub in rje.sortKeys(self.dict['PPI']):
            if hub and self.dict['PPI'][hub]: continue
            self.dict['PPI'].pop(hub)
            self.printLog('#DPI', 'No %s PPI left after DPI removed' % hub, screen=False)
        self.printLog('#PPX', '%s of %s PPI hubs remain after DPI removed' % (rje.integerString(len(self.dict['PPI'])), rje.integerString(hx)))
    except: self.errorLog('Problem with SLiMPID.dpi()', quitchoice=True)
def treeListSPCode(self):  ### Main taxa mapping from list of tree files
    '''
    Main taxa mapping from list of tree files. For each Newick tree in self.list['NwkList'], locates the
    query sequence node, then walks up the tree to find the closest ancestral clade containing another
    species (with bootstrap support >= MinBoot), recording the outgroup species codes, in-paralogues and
    other same-species paralogues in the 'spcode' database table. Returns True/False for success/failure.
    '''
    try:
        ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        db = self.db()
        specdb = self.db('spcode', add=True, forcecheck=True, mainkeys=['protein'])
        # Fall back to a pre-computed TaxBase spcode table unless force=T
        if not specdb and self.getStrLC('TaxBase') and not self.force():
            spfile = '%s.spcode.tdt' % self.getStr('TaxBase')
            specdb = db.addTable(spfile, mainkeys=['protein'], name='spcode', expect=False)
        if specdb:
            specdb.dataFormat({'boot': 'num'})
            return True  # Existing results loaded; nothing to calculate
        specdb = db.addEmptyTable('spcode', ['protein', 'boot', 'spcode', 'inpara', 'paralogues'], ['protein'])
        #dupdb = db.addEmptyTable('para',['protein','paralogues'],['protein'])
        self.dict['Duplicates'] = {}  # {prot1:[dups]}
        ### ~ [2] ~ Add main run code here ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for nwkfile in self.list['NwkList']:
            tree = rje_tree.Tree(self.log, self.cmd_list)
            tree.loadTree(nwkfile, seqlist=None, postprocess=False)
            seqacc = rje.baseFile(nwkfile, strip_path=True)  # Query accnum taken from the tree filename
            # Identify node corresponding to query sequence
            seqnode = None
            for node in tree.nodes():
                try:
                    if string.split(node.shortName(), '__')[1] == seqacc: seqnode = node
                except: pass  # Internal node or bad sequence format
            if not seqnode:
                self.warnLog('Could not find %s in %s nodes!' % (seqacc, nwkfile))
                continue
            # Get species code for query sequence
            seqspec = tree.cladeSpec(seqnode)
            if len(seqspec) != 1:
                self.warnLog('Could not find species in %s node!' % (seqacc))
                continue
            seqspec = seqspec.keys()[0]
            # Sanity check: species from clade must match the species code embedded in the node name
            if seqspec != string.split(seqnode.shortName(), '_')[1]:
                raise ValueError('Species mismatch for %s & %s' % (seqacc, seqnode.shortName()))
            # Find ancestor with closest orthologue outgroup
            rootnode = tree._getRootNode()
            if not rootnode:
                self.warnLog('Could not find root node in %s!' % (nwkfile))
                continue
            ancnode = seqnode.ancNode()
            # Bootstrap support for the ancestral branch, as a proportion; default 1.0 if unavailable
            try: bootx = float(ancnode.ancBranch().stat['Bootstrap']) / tree.stat['Bootstraps']
            except: bootx = 1.0
            inparanode = None  # Node to define in-paralogues
            ancspec = tree.cladeSpec(ancnode)
            # Climb until the clade contains a second species with adequate bootstrap support
            while len(ancspec) < 2 or bootx < self.getNum('MinBoot'):
                inparanode = ancnode  # All same species
                if ancnode == rootnode: break
                ancnode = ancnode.ancNode()
                ancspec = tree.cladeSpec(ancnode)
                try: bootx = float(ancnode.ancBranch().stat['Bootstrap']) / tree.stat['Bootstraps']
                except: bootx = 1.0
            ancspec.pop(seqspec)  # Now only have counts of closest other species
            # Update table, replacing species codes with genera?
            sentry = {'protein': seqacc, 'spcode': rje.sortUnique(ancspec.keys())}
            sentry['boot'] = bootx
            if not ancspec:  # No other species found anywhere in the tree
                sentry['spcode'] = ['None']
                sentry['boot'] = self.getNum('NoneBoot')
            sentry['spcode'] = string.join(sentry['spcode'], '|')
            # Establish list of duplicate proteins
            inpara = []    # List of in-paralogue nodes
            inparacc = []  # List of in-paralogue accnum
            if inparanode: inpara = tree._nodeClade(inparanode, internal=False)
            self.dict['Duplicates'][seqacc] = []
            for node in tree._nodeClade(rootnode, internal=False):
                if node == seqnode: continue
                if len(string.split(node.shortName(), '_')) < 2: continue  # Cannot parse a species code
                if string.split(node.shortName(), '_')[1] == seqspec:
                    paracc = string.split(node.shortName(), '__')[1]
                    if node in inpara: inparacc.append(paracc)
                    else: self.dict['Duplicates'][seqacc].append(paracc)
            sentry['inpara'] = string.join(inparacc, '|')
            sentry['paralogues'] = string.join(self.dict['Duplicates'][seqacc], '|')
            specdb.addEntry(sentry)
        ## Update specdb and save
        specdb.saveToFile()
        #dupdb.saveToFile()
        return True
    except:
        self.errorLog(self.zen())
        return False
def setup(self):  ### Main class setup method.
    '''
    Main class setup method. Creates the Database and XRef objects, ensures the pairwise PPI table
    exists, opens the MITAB file and identifies the header columns holding interactor IDs, taxa,
    interaction detection methods and interaction types. Returns True/False for success/failure.
    '''
    try:
        ### ~ [1] Setup Objects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Default DBSource to the first element of the MITAB filename
        if not self.getStrLC('DBSource'):
            self.setStr({'DBSource': string.split(rje.stripPath(self.getStr('MITAB')), '.')[0]})
        if not self.obj['DB']: self.obj['DB'] = rje_db.Database(self.log, self.cmd_list)
        pdb = self.db('pairwise', add=False)
        pfields = ['#', 'Hub', 'Spoke', 'HubUni', 'SpokeUni', 'HubTaxID', 'SpokeTaxID', 'Evidence', 'IType']
        if not pdb: self.db().addEmptyTable('pairwise', pfields, ['#'], log=True)
        if not self.obj['XRef']:
            xcmd = ['mapfields=Gene,%s,Secondary,Ensembl,Aliases,Accessions,RefSeq,Previous Symbols,Synonyms' % self.getStr('UniField')]
            self.obj['XRef'] = rje_xref.XRef(self.log, xcmd + self.cmd_list)
            self.obj['XRef'].setup()
        # Skip leading '#' comment lines unless an IDField itself starts with '#'
        skip_comments = True
        for field in self.list['IDField']:
            if field[:1] == '#': skip_comments = False
        if self.list['MapDB'] and 'uniprotkb' not in self.list['MapDB']:
            self.list['MapDB'].append('uniprotkb')
            self.printLog('#MAP', 'uniprotkb added to MapDB list.')
        elif not self.list['MapDB']:
            self.printLog('#MAP', 'No MapDB list: will attempt to match all IDs to xref KeyID "%s".' % self.obj['XRef'].getStr('KeyID'))
        ### ~ [2] Setup MITAB File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.open('MITAB')
        if not self.file['MITAB']: raise IOError
        self.printLog('#MITAB', 'Parse PPI from %s.' % self.getStr('MITAB'))
        ## ~ [2a] MITAB file headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        headers = []
        while not headers:
            self.list['Headers'] = headers = self.readDelimit('MITAB')
            if not headers: break
            if headers[0][:1] == '#' and skip_comments: headers = []; continue
        #self.debug(headers)
        ## ~ [2b] IDField headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # Match each header against the configured ID fields, allowing 'Field A'/'Field B' variants
        idfields = []
        for hfield in headers:
            #self.bugPrint(hfield.upper())
            for idfield in rje.sortUnique(self.list['IDField'])[0:]:
                # Escape brackets so the field name can be used inside a regular expression
                idfield = string.replace(idfield.upper(), '(', '\(')
                idfield = string.replace(idfield, ')', '\)')
                idmatch = rje.matchExp('^(%s\s?[AB])$' % idfield.upper(), hfield.upper())
                if not idmatch: idmatch = rje.matchExp('^(%s\s?[AB]) \(\S+\)$' % idfield.upper(), hfield.upper())
                if idmatch and hfield not in idfields:
                    idfields.append(hfield)
                    self.printLog('#ID', 'IDField: %s' % hfield)
                    #self.bugPrint(idfields)
                    break
        #self.debug(idfields)
        self.list['IDField'] = idfields
        if not self.list['IDField']: raise ValueError('No IDField found in MITAB headers.')
        ## ~ [2c] TaxaField headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        taxafields = []
        for tfield in self.list['TaxaField'][0:]:
            for hfield in headers:
                tmatch = rje.matchExp('^(%s\s?[AB])$' % tfield.upper(), hfield.upper())
                if not tmatch: tmatch = rje.matchExp('^(%s\s?[AB]) \(\S+\)$' % tfield.upper(), hfield.upper())
                if tmatch and hfield not in taxafields:
                    taxafields.append(hfield)
                    self.printLog('#TAX', 'TaxaField: %s' % hfield)
        self.list['TaxaField'] = taxafields
        if not self.list['TaxaField']: self.warnLog('No TaxaField found in MITAB headers.', quitchoice=True)
        ## ~ [2d] MethodField headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        methfields = []
        lctypes = rje.listLower(self.list['MethodField'])
        for hfield in headers:
            if hfield.lower() in lctypes:
                methfields.append(hfield)
                self.printLog('#METH', 'MethodField: %s' % hfield)
        self.list['MethodField'] = methfields
        if not self.list['MethodField']: self.warnLog('No MethodField found in MITAB headers.', quitchoice=True)
        ## ~ [2e] TypeField headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        typefields = []
        lctypes = rje.listLower(self.list['TypeField'])
        for hfield in headers:
            if hfield.lower() in lctypes:
                typefields.append(hfield)
                self.printLog('#TYPE', 'TypeField: %s' % hfield)
        self.list['TypeField'] = typefields
        if not self.list['TypeField']: self.warnLog('No TypeField found in MITAB headers.', quitchoice=True)
        return True  # Setup successful
    except:
        self.errorLog('Problem during %s setup.' % self.prog()); return False  # Setup failed
def parseMITAB(self):  ### Parse MITAB file into pairwise PPI table.
    '''
    Parse MITAB file into pairwise PPI table. Reads each MITAB line, extracts and maps interactor IDs,
    taxa, detection methods and interaction types, adds pairwise entries (optionally symmetrical), and
    finally expands multi-protein complexes into pairwise PPI. Returns the pairwise database table.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        xref = self.obj['XRef']
        pdb = self.db('pairwise')
        pfields = ['Hub', 'Spoke', 'HubUni', 'SpokeUni', 'HubTaxID', 'SpokeTaxID', 'Evidence', 'IType']
        headers = {}  # {header name:column index}
        for h in range(len(self.list['Headers'])): headers[self.list['Headers'][h]] = h
        dbsource = self.getStr('DBSource')
        ### ~ [2] Read through MITAB ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Counters: lines; ppi added; ambiguous; taxa-filtered; failed; uniprot-used hub/spoke
        mx = 0; ex = 0; fax = 0; ftx = 0; fx = 0; uhx = 0; usx = 0
        epos = self.endPos('MITAB')
        complexidlist = []   # Complex identifiers to be expanded to pairwise PPI in [3]
        badtaxa = ['-']      # Taxa that could not be parsed (warn once each)
        baduni = []          # Unmapped UniprotKB IDs used directly
        while 1:
            self.progLog('\r#MITAB','Parsing %s MITAB %s: %s lines; %s ppi; %s taxa-filtered; %s ambiguous; %s failed; %s complexes.' % (dbsource,self.fileProg('MITAB',epos),rje.iStr(mx),rje.iStr(ex),rje.iStr(ftx),rje.iStr(fax),rje.iStr(fx),rje.iLen(complexidlist)))
            mline = self.readDelimit('MITAB'); mx += 1
            if not mline: break
            entry = {'#': pdb.entryNum()}
            for field in pfields: entry[field] = ''
            ## ~ [2a] Add iRefIndex complexes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            complexid = {}  # This will take the first complex ID
            if 'irigid' in self.list['Headers'] and 'numParticipants' in self.list['Headers']:
                if int(mline[headers['numParticipants']]) > 2:
                    complexid['A'] = complexid['B'] = 'rigid:%s' % mline[headers['irigid']]
            ## ~ [2b] Parse and check taxa ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            taxa = {'A': '', 'B': ''}
            for tfield in self.list['TaxaField']:
                ab = tfield[-1:].upper()
                if ab == ')': ab = rje.matchExp('([AB]) \(\S+\)$',tfield.upper())[0]
                try:
                    taxon = rje.matchExp('^taxid:(\d+)',mline[headers[tfield]].lower())[0]
                    if self.list['TaxID'] and taxon not in self.list['TaxID']: continue  # Filtered out
                    taxa[ab] = taxon
                except:
                    taxon = mline[headers[tfield]]
                    if taxon not in badtaxa:
                        badtaxa.append(taxon)
                        self.warnLog('No TaxID read from %s: "%s"' % (tfield,taxon),'no_tax',suppress=True)
                    if not self.list['TaxID']: taxa[ab] = '-'  # Accept unknown taxa when not filtering
            if not taxa['A'] and complexid: taxa['A'] = taxa['B']
            if not taxa['B'] and complexid: taxa['B'] = taxa['A']
            if not (taxa['A'] and taxa['B']): ftx += 1; continue
            ## ~ [2c] Parse protein IDs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            ids = {'A': [], 'B': []}
            uni = {'A': '', 'B': ''}
            for ifield in self.list['IDField']:
                ab = ifield[-1:].upper()
                if ab == ')': ab = rje.matchExp('([AB]) \(\S+\)$',ifield.upper())[0]
                # Split IDs on | then db:id vs self.list['MapDB']
                for pid in string.split(mline[headers[ifield]],'|'):
                    try: (db,dbid) = string.split(pid,':',1)
                    except: continue
                    if db.lower() in ['uniprotkb'] and '(' in dbid: continue  # Only map uniprotkb accnum
                    dbid = string.split(dbid,'(')[0]
                    dbid = string.split(dbid,';')[0]
                    if db.lower() in ['uniprotkb']:
                        svid = dbid                        # Splice variant accnum (X-n)
                        dbid = string.split(svid,'-')[0]   # Base accnum
                    if ab not in complexid:  # First identifier for A/B
                        if db.lower() in self.list['Complex']: complexid[ab] = pid; ids[ab].append(pid)
                        else: complexid[ab] = ''
                    if not self.list['MapDB'] or db.lower() in self.list['MapDB']: ids[ab].append(dbid)
                    # Parse uniprot directly if possible
                    if db.lower() in ['uniprotkb'] and not uni[ab]:
                        if self.getBool('SpliceVar'): uni[ab] = svid
                        else: uni[ab] = dbid
            ## ~ [2d] Map parsed IDs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            amb = {'A': False, 'B': False}  # Flags ambiguous (multi-hit) xref mappings
            if not ids['A'] or not ids['B']:
                fx += 1; continue
            for ida in ids['A']:
                if not entry['Hub']:
                    entry['Hub'] = xref.xref(ida,unique=True,usedict=True)
                    if entry['Hub'] == False: amb['A'] = True  # xref returns False for ambiguous IDs
                if not entry['HubUni']: entry['HubUni'] = self.getUniXRef(ida)
            if self.getBool('AddUni') and not entry['HubUni']:
                entry['HubUni'] = uni['A']
                if uni['A'] and uni['A'] not in baduni: baduni.append(uni['A'])
            if not entry['Hub'] and entry['HubUni']:
                entry['Hub'] = entry['HubUni']
                #self.warnLog('UniprotKB "%s" used for Hub' % entry['HubUni'],'unihub',suppress=True)
                uhx += 1
            if not entry['Hub'] and complexid['A']: entry['Hub'] = complexid['A']
            else: complexid['A'] = ''  # Hub mapped: not treated as a complex
            if self.getBool('UniOnly') and not complexid['A'] and not entry['HubUni']: entry['Hub'] = ''
            for idb in ids['B']:
                if not entry['Spoke']:
                    entry['Spoke'] = xref.xref(idb,unique=True,usedict=True)
                    if entry['Spoke'] == False: amb['B'] = True
                if not entry['SpokeUni']: entry['SpokeUni'] = self.getUniXRef(idb)
            if self.getBool('AddUni') and not entry['SpokeUni']: entry['SpokeUni'] = uni['B']
            if not entry['Spoke'] and entry['SpokeUni']:
                entry['Spoke'] = entry['SpokeUni']
                #self.warnLog('UniprotKB "%s" used for Spoke' % entry['SpokeUni'],'unihub',suppress=True)
                usx += 1
            if not entry['Spoke'] and complexid['B']: entry['Spoke'] = complexid['B']
            else: complexid['B'] = ''
            if self.getBool('UniOnly') and not complexid['B'] and not entry['SpokeUni']: entry['Spoke'] = ''
            if uni['B'] and uni['B'] not in baduni: baduni.append(uni['B'])
            # Complex:complex interactions cannot be expanded: blank and fail the entry
            if complexid['A'] and complexid['B']:
                if not (complexid['A'].startswith('rigid:') and complexid['B'].startswith('rigid:')):
                    self.printLog('\r#MITAB','',log=False)
                    self.warnLog('Cannot parse complex:complex PPI (%s & %s)' % (complexid['A'],complexid['B']),'complex-complex',suppress=True)
                entry['Hub'] = entry['Spoke'] = ''
            if not (entry['Hub'] and entry['Spoke']):
                # Ambiguous if each side was either mapped or flagged ambiguous; otherwise a failure
                if (entry['Hub'] or amb['A']) and (entry['Spoke'] or amb['B']): fax += 1; continue
                fx += 1; continue
            #if self.dev() and 'PCNA' not in [entry['Hub'],entry['Spoke']]: continue
            entry['HubTaxID'] = taxa['A']
            entry['SpokeTaxID'] = taxa['B']
            if complexid['A'] and complexid['A'] not in complexidlist: complexidlist.append(complexid['A'])
            if complexid['B'] and complexid['B'] not in complexidlist: complexidlist.append(complexid['B'])
            ## ~ [2e] Parse evidence ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            evidence = []
            for tfield in self.list['MethodField']:
                for etype in string.split(mline[headers[tfield]],'|'):
                    ematch = rje.matchExp('MI:\d+"?\((.+)\)',etype)  # e.g. psi-mi:"MI:0018"(two hybrid)
                    if ematch: evidence.append('%s:%s' % (dbsource,ematch[0]))
            if not evidence: evidence.append('%s:unknown' % (self.getStr('DBSource')))
            evidence = rje.sortUnique(evidence)
            entry['Evidence'] = string.join(evidence,'|')
            ## ~ [2f] Parse interaction types ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            itypes = []
            for tfield in self.list['TypeField']:
                for etype in string.split(mline[headers[tfield]],'|'):
                    ematch = rje.matchExp('MI:\d+"?\((.+)\)',etype)
                    if ematch: itypes.append(ematch[0])
            if not itypes: itypes.append('unknown')
            itypes = rje.sortUnique(itypes)
            entry['IType'] = string.join(itypes,'|')
            pdb.addEntry(entry); ex += 1
            if self.dev() and entry['Hub'] in ['KLF3']: self.printLog('#DEV',string.join(mline,'\t'))
            # Add the reciprocal entry when symmetry is requested (not for complexes)
            if self.getBool('Symmetry') and not complexid['A'] and not complexid['B']:
                pdb.addEntry({'#':pdb.entryNum(),'Hub':entry['Spoke'],'Spoke':entry['Hub'],
                              'HubUni':entry['SpokeUni'],'SpokeUni':entry['HubUni'],
                              'HubTaxID':entry['SpokeTaxID'],'SpokeTaxID':entry['HubTaxID'],
                              'Evidence':entry['Evidence'],'IType':entry['IType']})
        self.printLog('\r#MITAB','Parsing %s MITAB complete: %s lines; %s ppi; %s taxa-filtered; %s ambiguous; %s failed; %s complexes.' % (dbsource,rje.iStr(mx),rje.iStr(ex),rje.iStr(ftx),rje.iStr(fax),rje.iStr(fx),rje.iLen(complexidlist)))
        self.close('MITAB')
        if (uhx+usx): self.warnLog('UniprotKB IDs used for %s Hub and %s Spoke IDs.' % (rje.iStr(uhx),rje.iStr(usx)))
        if baduni:
            baduni.sort()
            accout = '%s.%s.unmapped.uniacc' % (self.baseFile(),dbsource)
            self.warnLog('%s unmapped UniprotKB IDs used: output to %s.' % (rje.iLen(baduni),accout))
            open(accout,'w').write(string.join(baduni,'\n'))
        ### ~ [3] Convert complexes to pairwise PPIs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not complexidlist: return pdb
        self.printLog('#CPLEX','%s complex IDs parsed to convert to pairwise PPI.' % rje.iLen(complexidlist))
        ## ~ [3a] Assemble complex memberships ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        complexes = {}; chentries = []; csentries = []
        cevidence = {}  # List of Evidence for each complex
        citypes = {}    # List of ITypes for each complex
        ctaxa = {}      # TaxID for each complex member
        ex = 0.0; etot = pdb.entryNum()
        for entry in pdb.entries():
            self.progLog('\r#CPLEX','Assembling complexes: %.1f%%' % (ex/etot)); ex += 100.0
            if entry['Hub'] in complexidlist:
                cid = entry['Hub']
                if cid not in complexes: complexes[cid] = []; cevidence[cid] = []; citypes[cid] = []
                complexes[cid].append(entry['Spoke'])
                ctaxa[entry['Spoke']] = entry['SpokeTaxID']
                cevidence[cid].append(entry['Evidence'])
                citypes[cid].append(entry['IType'])
                chentries.append(entry)
            elif entry['Spoke'] in complexidlist:
                cid = entry['Spoke']
                if cid not in complexes: complexes[cid] = []; cevidence[cid] = []; citypes[cid] = []
                complexes[cid].append(entry['Hub'])
                ctaxa[entry['Hub']] = entry['HubTaxID']
                cevidence[cid].append(entry['Evidence'])
                citypes[cid].append(entry['IType'])
                csentries.append(entry)
        self.printLog('\r#CPLEX','Assembled %s of %s complexes.' % (rje.iLen(complexes),rje.iLen(complexidlist)))
        #self.debug(complexes)
        ## ~ [3b] Update complexes dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        cppi = {}  # {hub:{spoke:[complex IDs]}}
        ex = 0.0; etot = len(complexes); rx = 0; px = 0; cmax = 0
        for cid in rje.sortKeys(complexes):
            self.progLog('\r#CPLEX','Reducing complexes: %.1f%%' % (ex/etot)); ex += 100.0
            if self.dev(): self.printLog('#DEV','Complex %s: %s' % (cid,complexes[cid]))
            if len(complexes[cid]) < 2:  # Single-member "complexes" cannot yield pairwise PPI
                complexes.pop(cid)
                cevidence.pop(cid)
                citypes.pop(cid)
                rx += 1; continue
            complexes[cid].sort()
            cmax = max(cmax,len(complexes[cid]))
            # Pair every member with every later member (each pair once)
            members = complexes[cid][0:]
            while members:
                hub = members.pop(0)
                if self.dev() and hub == 'KLF3': self.debug(cid)
                if hub not in cppi: cppi[hub] = {}
                for spoke in members:
                    if spoke not in cppi[hub]: cppi[hub][spoke] = []; px += 1
                    cppi[hub][spoke].append(cid)
        self.printLog('\r#CPLEX','Reduced %s complexes to %s > 1 member: %s ppi to add.' % (rje.iStr(etot),rje.iLen(complexes),rje.iStr(px)))
        ## ~ [3c] Update pairwise PPI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        cix = pdb.entryNum()
        for centry in chentries + csentries: pdb.dropEntry(centry)  # Remove raw complex entries
        ex = 0.0; etot = len(cppi)
        for hub in rje.sortKeys(cppi):
            self.progLog('\r#CPLEX','Expanding complexes: %.1f%%' % (ex/etot)); ex += 100.0
            hentry = {'Hub':hub,'HubUni':self.getUniXRef(hub),'HubTaxID':ctaxa[hub]}
            for spoke in rje.sortKeys(cppi[hub]):
                evidence = []
                itypes = []
                ctypes = []  # Complex ID prefixes (e.g. 'rigid') supporting this pair
                for cid in cppi[hub][spoke]:
                    evidence += cevidence[cid]
                    itypes += citypes[cid]
                    # BUGFIX: was `ctypes += string.split(cid,':')[0]`, which extended the list with
                    # the individual CHARACTERS of the prefix (list += str iterates the string).
                    ctypes.append(string.split(cid,':')[0])
                ctype = string.join(rje.sortUnique(ctypes),'|')
                evidence = string.join(rje.sortUnique(evidence),'|')
                if not evidence: evidence = '%s:%s' % (dbsource,ctype)
                itypes = string.join(rje.sortUnique(itypes),'|')
                if not itypes: itypes = ctype
                newentry = {'#':cix,'Spoke':spoke,'SpokeUni':self.getUniXRef(spoke),'SpokeTaxID':ctaxa[spoke]}
                newentry['Evidence'] = evidence
                newentry['IType'] = itypes
                entry = pdb.addEntry(rje.combineDict(newentry,hentry,overwrite=False)); cix += 1
                if self.dev() and entry['Hub'] in ['KLF3','WDR5']: self.debug('Complex: %s' % entry)
                if self.getBool('Symmetry'):
                    pdb.addEntry({'#':cix,'Hub':entry['Spoke'],'Spoke':entry['Hub'],
                                  'HubUni':entry['SpokeUni'],'SpokeUni':entry['HubUni'],
                                  'HubTaxID':entry['SpokeTaxID'],'SpokeTaxID':entry['HubTaxID'],
                                  'Evidence':entry['Evidence'],'IType':entry['IType']})
                    cix += 1
        self.printLog('#CPLEX','%s complex IDs expanded to pairwise PPI => %s ppi (symmetry=%s).' % (rje.iLen(complexidlist),rje.iStr(pdb.entryNum()),self.getBool('Symmetry')))
        return pdb
    except: self.errorLog('%s.parseMITAB error' % self.prog())
def seqSubset2( self ): ### Extracts sequence subset from MOUSE cDNA and Peptide libraries
    '''
    Extracts sequence subset from MOUSE cDNA and Peptide libraries.

    Builds (or reloads from '<basefile>.map.tdt') a 'map' table linking Ingolia gene names to
    MGI genes and EnsEMBL gene IDs, subsets the mouse cDNA/peptide fasta libraries to those
    genes, then maps the 'starts' table entries onto transcripts and writes exact-start
    cDNA/peptide fasta output ('IngExact.cdna.all.fa' / 'IngExact.pep.all.fa').
    Returns None; any exception is caught and logged via self.errorLog().
    NOTE(review): this method is defined twice in the file; the later definition overrides this one.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Reuse a previously generated mapping table if one exists for this basefile.
        if os.path.exists('%s.map.tdt' % self.baseFile()):
            mdb = self.db().addTable('%s.map.tdt' % self.baseFile(), mainkeys=['Ingolia'], name='map')
        else:
            ### ~ [2] Load Mouse Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            xfile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.data.tdt'
            # NOTE(review): `db` is not defined in this method — presumably self.db(); confirm against full file.
            xref = db.addTable(xfile, mainkeys=['Gene'], name='xref')
            afile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.alias.tdt'
            self.obj['Map'] = rje_genemap.GeneMap(self.log, self.cmd_list)
            #self.obj['Map'].loadPickle('../../../../../Databases/DBase_120225/MGI/mousemap.120324.pickle')
            self.obj['Map'].loadData( ['sourcedata=%s' % xfile, 'aliases=%s' % afile])
            # Upper-cased, whitespace-split list of every gene name indexed in the 'starts' table.
            ing_genes = string.split( string.join( self.db('starts').index('Gene').keys()).upper())
            map = self.obj['Map']  # NOTE(review): shadows the builtin map() for the rest of this method.
            ing_map = {}
            for gene in ing_genes: ing_map[gene] = map.bestMap(gene)
            ing_mgi = rje.sortUnique(ing_map.values())
            self.printLog( '#MUSG', '%s Ingolia genes mapped onto %s MGI genes' % (rje.iLen(ing_genes), rje.iLen(ing_mgi)))
            xdb = self.db('xref')
            bad_genes = []
            # Drop mapped genes missing from the xref table; iterate a copy ([0:]) for safe removal.
            for gene in ing_mgi[0:]:
                if gene not in xdb.data():
                    self.printLog( '#MAP', 'Cannot map gene "%s" from Ingolia data!' % gene)
                    bad_genes.append(gene)
                    ing_mgi.remove(gene)
            self.printLog( '#BAD', 'Failed to map %s genes from Ignolia' % rje.iLen(bad_genes))
            # NOTE(review): file handle not explicitly closed (relies on CPython refcounting).
            open('ingolia.bad.txt', 'w').write(string.join(bad_genes))
            ### ~ [2] EnsEMBL subset ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ing_musg = xdb.dataList(xdb.entryList(ing_mgi), 'EnsEMBL', sortunique=True)
            if '' in ing_musg: ing_musg.remove('')
            self.printLog( '#MUSG', '%s Ingolia genes mapped onto %s EnsEMBL genes' % (rje.iLen(ing_genes), rje.iLen(ing_musg)))
            if not ing_musg: raise ValueError
            self.deBug(ing_musg[:10])
            # Filter the full cDNA and peptide libraries down to the mapped EnsEMBL genes.
            for stype in ['cdna', 'pep']:
                seqfile = '../MOUSE/Mus_musculus.NCBIM37.66.%s.all.fa' % stype
                if self.getBool('Force') or not os.path.exists(seqfile):
                    seqout = 'Ingolia.%s.all.fa' % stype
                    # NOTE(review): 'autload=T' looks like a typo for 'autoload=T' — confirm rje_seqlist option name.
                    seqcmd = self.cmd_list + [
                        'seqin=%s' % seqfile, 'seqout=%s' % seqout, 'autofilter=T', 'autload=T',
                        'seqmode=file', 'gooddesc=%s' % string.join(ing_musg, ',')
                    ]
                    rje_seqlist.SeqList(self.log, seqcmd)
            # NOTE(review): key field 'Ignolia' vs data field 'Ingolia' — probable typo; confirm intended key name.
            mdb = self.db().addEmptyTable('map', ['Ingolia', 'Gene', 'EnsEMBL'], ['Ignolia'])
            for gene in ing_map:
                entry = {'Ingolia': gene, 'Gene': ing_map[gene]}
                if entry['Gene'] in bad_genes: entry['EnsEMBL'] = ''
                else: entry['EnsEMBL'] = xdb.data()[ing_map[gene]]['EnsEMBL']
                mdb.addEntry(entry)
            # Attach comma-separated transcript IDs ('ENST' field) per EnsEMBL gene from the cDNA fasta names.
            seqfile = 'Ingolia.cdna.all.fa'
            seqcmd = self.cmd_list + [
                'seqin=%s' % seqfile, 'autofilter=F', 'autload=T', 'seqmode=file'
            ]
            iseq = rje_seqlist.SeqList(self.log, seqcmd)
            if 'ENST' not in mdb.fields(): mdb.addField('ENST', evalue='')
            while iseq.nextSeq():
                (iname, icdna) = iseq.getSeq()
                musg = rje.matchExp('gene:(\S+)', iname)[0]
                for entry in mdb.indexEntries('EnsEMBL', musg):
                    if entry['ENST']: entry['ENST'] += ',%s' % string.split(iname)[0]
                    else: entry['ENST'] = string.split(iname)[0]
            mdb.saveToFile()
        ### ~ [3] Generate new start sites from Ignolia Harrington data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # NOTE(review): this section uses `iseq`, which is only bound in the rebuild branch above —
        # if the map table was loaded from disk, `iseq` is undefined here; confirm intended flow.
        sdb = self.db('starts')
        sdb.dataFormat({'Init Codon [nt]': 'int'})
        icod = 'Init Codon [nt]'          # Field name: start-codon position (nt) within transcript.
        icon = 'Init Context [-3 to +4]'  # Field name: 7-character context around the start codon.
        sdb.info['Name'] = 'mapped_start'
        sdb.addField('ENST')
        sdb.addField('ENSP')
        sdb.addField('ENSI')
        ENST = open('IngExact.cdna.all.fa', 'w')
        ENSP = open('IngExact.pep.all.fa', 'w')
        ex = 0.0
        etot = sdb.entryNum()
        sx = 0       # Successfully mapped starts.
        fx = 0       # Failed mappings.
        minpep = 20  # Minimum accepted peptide length.
        for entry in sdb.entries():
            self.progLog( '\r#ING', 'Mapping Ignolia Harrington Starts: %.2f%%' % (ex / etot))
            ex += 100.0
            #self.deBug(entry)
            entry[icon] = entry[icon].upper()
            gene = entry['Gene'].upper()
            mentry = mdb.data(gene)
            entry['ENST'] = entry['ENSI'] = ''
            cdnaseq = peptseq = ''
            if not mentry or not mentry['ENST']:
                fx += 1
                continue
            #self.deBug(mentry)
            mtype = 'fail'
            # Try each mapped transcript; keep the one whose 7nt context matches exactly and
            # whose translated ORF (up to the first stop) is longest.
            for trans in string.split(mentry['ENST'], ','):
                (tname, tseq) = iseq.getDictSeq(trans, format='tuple')
                self.deBug('%s vs %s' % (tseq[entry[icod] - 3:][:7], entry[icon]))
                if tseq[entry[icod] - 3:][:7] == entry[icon]:
                    ipept = string.split( rje_sequence.dna2prot(tseq[entry[icod]:]), '*')[0]
                    self.deBug(ipept)
                    if len(ipept) > len(peptseq):
                        entry['ENST'] = trans
                        cdnaseq = tseq
                        peptseq = ipept
                        mtype = 'exact'
            if not entry['ENST']:
                self.printLog( '\r#ING', 'Unable to find Harrington start for %s %s (%s)' % (gene, entry[icod], entry[icon]), screen=False)
                fx += 1
                continue
            elif len(peptseq) < minpep:
                self.printLog( '\r#ING', 'Peptide from mapped Harrington start for %s %s (%s) too short!' % (gene, entry[icod], entry[icon]), screen=False)
                fx += 1
                continue
            id = rje.preZero(int(ex / 100), etot)  # NOTE(review): shadows builtin id().
            entry['ENSI'] = 'ENSINGT%s' % id
            entry['ENSP'] = 'ENSINGP%s' % id
            ENST.write( '>ENSINGT%s mtype:%s enst:%s gene:%s ingolia:%s mgi:%s\n%s\n' % (id, mtype, entry['ENST'], mentry['EnsEMBL'], entry['Gene'], mentry['Gene'], cdnaseq))
            ENSP.write( '>ENSINGP%s mtype:%s enst:%s gene:%s transcript:ENSINGT%s ingolia:%s mgi:%s\n%s\n' % (id, mtype, entry['ENST'], mentry['EnsEMBL'], id, entry['Gene'], mentry['Gene'], peptseq))
            sx += 1
        sdb.saveToFile('%s.mapped_exact.tdt' % self.baseFile())
        ENST.close()
        ENSP.close()
        self.printLog( '\r#ING', 'Output %s Ingolia peptides and transcripts. %s failed.' % (rje.iStr(sx), rje.iStr(fx)))
        return
    except: self.errorLog('%s.method error' % self)
def seqSubset2(self): ### Extracts sequence subset from MOUSE cDNA and Peptide libraries
    '''
    Extracts sequence subset from MOUSE cDNA and Peptide libraries.

    NOTE(review): duplicate of the earlier seqSubset2 definition in this file (token-identical);
    being defined later, this copy is the one that takes effect. Consider removing one copy.
    Builds/reloads the Ingolia->MGI->EnsEMBL 'map' table, subsets the mouse fasta libraries,
    then maps 'starts' entries onto transcripts and writes exact-start fasta output.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Reuse existing mapping table if present for this basefile.
        if os.path.exists('%s.map.tdt' % self.baseFile()):
            mdb = self.db().addTable('%s.map.tdt' % self.baseFile(),mainkeys=['Ingolia'],name='map')
        else:
            ### ~ [2] Load Mouse Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            xfile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.data.tdt'
            # NOTE(review): `db` is undefined here — presumably self.db(); confirm against full file.
            xref = db.addTable(xfile,mainkeys=['Gene'],name='xref')
            afile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.alias.tdt'
            self.obj['Map'] = rje_genemap.GeneMap(self.log,self.cmd_list)
            #self.obj['Map'].loadPickle('../../../../../Databases/DBase_120225/MGI/mousemap.120324.pickle')
            self.obj['Map'].loadData(['sourcedata=%s' % xfile,'aliases=%s' % afile])
            ing_genes = string.split(string.join(self.db('starts').index('Gene').keys()).upper())
            map = self.obj['Map']  # Shadows builtin map() within this method.
            ing_map = {}
            for gene in ing_genes: ing_map[gene] = map.bestMap(gene)
            ing_mgi = rje.sortUnique(ing_map.values())
            self.printLog('#MUSG','%s Ingolia genes mapped onto %s MGI genes' % (rje.iLen(ing_genes),rje.iLen(ing_mgi)))
            xdb = self.db('xref')
            bad_genes = []
            # Remove genes absent from xref (iterate a copy for safe removal).
            for gene in ing_mgi[0:]:
                if gene not in xdb.data():
                    self.printLog('#MAP','Cannot map gene "%s" from Ingolia data!' % gene)
                    bad_genes.append(gene); ing_mgi.remove(gene)
            self.printLog('#BAD','Failed to map %s genes from Ignolia' % rje.iLen(bad_genes))
            open('ingolia.bad.txt','w').write(string.join(bad_genes))
            ### ~ [2] EnsEMBL subset ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ing_musg = xdb.dataList(xdb.entryList(ing_mgi),'EnsEMBL',sortunique=True)
            if '' in ing_musg: ing_musg.remove('')
            self.printLog('#MUSG','%s Ingolia genes mapped onto %s EnsEMBL genes' % (rje.iLen(ing_genes),rje.iLen(ing_musg)))
            if not ing_musg: raise ValueError
            self.deBug(ing_musg[:10])
            # Subset cDNA/peptide libraries to mapped EnsEMBL genes.
            for stype in ['cdna','pep']:
                seqfile = '../MOUSE/Mus_musculus.NCBIM37.66.%s.all.fa' % stype
                if self.getBool('Force') or not os.path.exists(seqfile):
                    seqout = 'Ingolia.%s.all.fa' % stype
                    # NOTE(review): 'autload=T' probable typo for 'autoload=T'.
                    seqcmd = self.cmd_list + ['seqin=%s' % seqfile,'seqout=%s' % seqout,'autofilter=T','autload=T','seqmode=file','gooddesc=%s' % string.join(ing_musg,',')]
                    rje_seqlist.SeqList(self.log,seqcmd)
            # NOTE(review): key 'Ignolia' vs field 'Ingolia' — probable typo in key list.
            mdb = self.db().addEmptyTable('map',['Ingolia','Gene','EnsEMBL'],['Ignolia'])
            for gene in ing_map:
                entry = {'Ingolia':gene,'Gene':ing_map[gene]}
                if entry['Gene'] in bad_genes: entry['EnsEMBL'] = ''
                else: entry['EnsEMBL'] = xdb.data()[ing_map[gene]]['EnsEMBL']
                mdb.addEntry(entry)
            # Map transcript IDs onto the table from the filtered cDNA fasta names.
            seqfile = 'Ingolia.cdna.all.fa'
            seqcmd = self.cmd_list + ['seqin=%s' % seqfile,'autofilter=F','autload=T','seqmode=file']
            iseq = rje_seqlist.SeqList(self.log,seqcmd)
            if 'ENST' not in mdb.fields(): mdb.addField('ENST',evalue='')
            while iseq.nextSeq():
                (iname,icdna) = iseq.getSeq()
                musg = rje.matchExp('gene:(\S+)',iname)[0]
                for entry in mdb.indexEntries('EnsEMBL',musg):
                    if entry['ENST']: entry['ENST'] += ',%s' % string.split(iname)[0]
                    else: entry['ENST'] = string.split(iname)[0]
            mdb.saveToFile()
        ### ~ [3] Generate new start sites from Ignolia Harrington data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # NOTE(review): `iseq` is only bound in the rebuild branch — undefined if map table loaded from disk.
        sdb = self.db('starts')
        sdb.dataFormat({'Init Codon [nt]':'int'})
        icod = 'Init Codon [nt]'          # Start-codon position (nt).
        icon = 'Init Context [-3 to +4]'  # 7-character context around start codon.
        sdb.info['Name'] = 'mapped_start'
        sdb.addField('ENST'); sdb.addField('ENSP'); sdb.addField('ENSI');
        ENST = open('IngExact.cdna.all.fa','w')
        ENSP = open('IngExact.pep.all.fa','w')
        ex = 0.0; etot = sdb.entryNum(); sx = 0; fx = 0
        minpep = 20  # Minimum accepted peptide length.
        for entry in sdb.entries():
            self.progLog('\r#ING','Mapping Ignolia Harrington Starts: %.2f%%' % (ex/etot)); ex += 100.0
            #self.deBug(entry)
            entry[icon] = entry[icon].upper()
            gene = entry['Gene'].upper()
            mentry = mdb.data(gene)
            entry['ENST'] = entry['ENSI'] = ''
            cdnaseq = peptseq = ''
            if not mentry or not mentry['ENST']: fx += 1; continue
            #self.deBug(mentry)
            mtype = 'fail'
            # Keep the transcript with an exact context match and the longest ORF.
            for trans in string.split(mentry['ENST'],','):
                (tname,tseq) = iseq.getDictSeq(trans,format='tuple')
                self.deBug('%s vs %s' % (tseq[entry[icod]-3:][:7],entry[icon]))
                if tseq[entry[icod]-3:][:7] == entry[icon]:
                    ipept = string.split(rje_sequence.dna2prot(tseq[entry[icod]:]),'*')[0]
                    self.deBug(ipept)
                    if len(ipept) > len(peptseq):
                        entry['ENST'] = trans
                        cdnaseq = tseq
                        peptseq = ipept
                        mtype = 'exact'
            if not entry['ENST']:
                self.printLog('\r#ING','Unable to find Harrington start for %s %s (%s)' % (gene,entry[icod],entry[icon]),screen=False)
                fx += 1; continue
            elif len(peptseq) < minpep:
                self.printLog('\r#ING','Peptide from mapped Harrington start for %s %s (%s) too short!' % (gene,entry[icod],entry[icon]),screen=False)
                fx += 1; continue
            id = rje.preZero(int(ex/100),etot)  # Shadows builtin id().
            entry['ENSI'] = 'ENSINGT%s' % id
            entry['ENSP'] = 'ENSINGP%s' % id
            ENST.write('>ENSINGT%s mtype:%s enst:%s gene:%s ingolia:%s mgi:%s\n%s\n' % (id,mtype,entry['ENST'],mentry['EnsEMBL'],entry['Gene'],mentry['Gene'],cdnaseq))
            ENSP.write('>ENSINGP%s mtype:%s enst:%s gene:%s transcript:ENSINGT%s ingolia:%s mgi:%s\n%s\n' % (id,mtype,entry['ENST'],mentry['EnsEMBL'],id,entry['Gene'],mentry['Gene'],peptseq))
            sx += 1
        sdb.saveToFile('%s.mapped_exact.tdt' % self.baseFile())
        ENST.close(); ENSP.close()
        self.printLog('\r#ING','Output %s Ingolia peptides and transcripts. %s failed.' % (rje.iStr(sx),rje.iStr(fx)))
        return
    except: self.errorLog('%s.method error' % self)
def run(self): ### Main run method
    '''
    Main run method.

    [1] Reformats *.fasta files (renaming 3-part '|' names to 6rf_NEIME__ and 5-part names to
    ref_NEIME__) into *.fas files and formats them as BLAST databases. [2] Reads Mascot-style
    CSV files from MC58_6RF_CSV/ into a 6RF hit dictionary and accession file. [3] Runs GABLAM
    of the hits against MC58_1.fas. [4] Identifies hits with zero genomic BLAST hits, searches
    them against embl_bacteria, and writes a summary table 'MC58_6RF_Zeros.tdt'.
    Python 2 code (print statement). Errors are logged; final #ZEN log always printed.
    '''
    try:### ~ [1] Reformat Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for fasta in glob.glob('*.fasta'):
            fas = fasta[:-2]  # '*.fasta' -> '*.fas' (strips trailing 'ta').
            if os.path.exists(fas): continue  # Already reformatted.
            sx = 0
            for line in open(fasta,'r').readlines():
                if line[:1] == '>':
                    # Split header into name + description; fall back to name-only headers.
                    try: (name,desc) = rje.matchExp('^>(\S+) (\S.+)$',line)
                    except: name = rje.matchExp('^>(\S+)',line)[0]
                    if len(string.split(name,'|')) == 3:
                        name = '6rf_NEIME__%s' % string.split(name,'|')[2]
                        open(fas,'a').write('>%s\n' % name)
                    elif len(string.split(name,'|')) == 5:
                        name = 'ref_NEIME__%s' % string.split(name,'|')[3]
                        open(fas,'a').write('>%s %s\n' % (name,desc))
                    else: print string.split(name,'|'); raise ValueError  # Unexpected name format.
                    self.progLog('\r#FAS','Processing %s: %s seqs' % (fas, rje.integerString(sx))); sx += 1
                else: open(fas,'a').write(line)  # Sequence line: copy through unchanged.
            self.printLog('\r#FAS','Processed %s: %s seqs from %s' % (fas, rje.integerString(sx), fasta))
            rje_blast.BLASTRun(self.log,self.cmd_list).formatDB(fas,protein=True,force=True)
        ### ~ [2] Read in CSV Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        rfhits = {}  # Dictionary of {hit:['File:hit_num']}
        acc = 'MC58_6RF_Hits.acc'; open(acc,'w')  # Truncate/create the accession output file.
        gfile = 'MC58_6RF_Hits.vs.MC58_1.hitsum.tdt'
        cx = 0
        for csv in glob.glob('MC58_6RF_CSV/*.CSV'):
            cx += 1
            file = os.path.basename(csv)[:-4]  # NOTE(review): shadows builtin file in Py2.
            hits = False  # Only read rows after the 'prot_hit_num,prot_acc' header line.
            for line in open(csv,'r').readlines():
                if line.find('prot_hit_num,prot_acc') == 0: hits = True
                elif hits:
                    data = rje.readDelimit(line,',')
                    if len(data) < 2: continue
                    [num,name] = data[:2]
                    try: name = string.split(name,'|')[2]
                    except: continue  # Name not in pipe-delimited form: skip row.
                    if name not in rfhits:
                        open(acc,'a').write('6rf_NEIME__%s\n' % name)
                        rfhits[name] = []
                    id = '%s:%s' % (file,num)  # NOTE(review): shadows builtin id().
                    if id not in rfhits[name]: rfhits[name].append(id)
                    self.progLog('\r#CSV','Reading %d CSV files: %s 6RF Hits' % (cx,rje.integerString(len(rfhits))))
        self.printLog('\r#CSV','Read %d CSV files: %s 6RF Hits output to %s' % (cx,rje.integerString(len(rfhits)),acc))
        ### ~ [3] Extract sequences and perform GABLAM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not os.path.exists(gfile):
            seqlist = rje_seq.SeqList(self.log,self.cmd_list+['seqin=%s' % acc,'fasdb=MC58_6RF.fas','seqout=MC58_6RF_Hits.fas','autoload=T','accnr=F','seqnr=F'])
            seqlist.info['Name'] = 'MC58_6RF_Hits.fas'
            seqlist.saveFasta()
            gablam.GABLAM(self.log,self.cmd_list+['seqin=MC58_6RF_Hits.fas','searchdb=MC58_1.fas','qryacc=F']).gablam()
        ### ~ [4] Read in GABLAM and ID Hits without genomic homology ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        gdata = rje.dataDict(self,gfile,['Qry'],['HitNum'])
        zeros = []  # Queries with zero BLAST hits against MC58_1.
        for hit in gdata:
            if string.atoi(gdata[hit]['HitNum']) == 0: zeros.append(hit)
        zeros = rje.sortUnique(zeros,False)
        open('6rf_zeros.acc','w').write(string.join(zeros,'\n'))
        self.printLog('#ZERO','%d 6RF hits with 0 BLAST hits to MC58_1' % len(zeros))
        ufile = 'MC58_6RF_Zeros.vs.embl_bacteria.hitsum.tdt'
        if not os.path.exists(ufile):
            seqlist = rje_seq.SeqList(self.log,self.cmd_list+['seqin=6rf_zeros.acc','fasdb=MC58_6RF.fas','seqout=MC58_6RF_Zeros.fas','autoload=T','accnr=F','seqnr=F'])
            seqlist.info['Name'] = 'MC58_6RF_Zeros.fas'
            seqlist.saveFasta()
            gablam.GABLAM(self.log,self.cmd_list+['seqin=MC58_6RF_Zeros.fas','searchdb=/scratch/Databases/NewDB/TaxaDB/embl_bacteria.fas','qryacc=F']).gablam()
        # Merge hitsum + gablam results and write the final zero-hit summary table.
        gdata = rje.dataDict(self,ufile,['Qry'],getheaders=True)
        fdata = rje.dataDict(self,string.replace(ufile,'hitsum','gablam'),['Qry'],['Hit'],lists=True)
        headers = gdata.pop('Headers')
        headers.insert(1,'Sample')
        headers.append('BestHit')
        rje.delimitedFileOutput(self,'MC58_6RF_Zeros.tdt',headers,rje_backup=True)
        for rf in rje.sortKeys(gdata):
            rfcut = string.split(rf,'__')[1]
            gdata[rf]['Sample'] = string.join(rfhits[rfcut],'; ')
            gdata[rf]['Qry'] = rfcut
            try: gdata[rf]['BestHit'] = fdata[rf]['Hit'][0]
            except: gdata[rf]['BestHit'] = '-'  # No gablam hit recorded for this query.
            rje.delimitedFileOutput(self,'MC58_6RF_Zeros.tdt',headers,datadict=gdata[rf])
    except:
        self.errorLog(rje_zen.Zen().wisdom())
    self.printLog('#ZEN',rje_zen.Zen().wisdom())
def mapToTaxID(self,taxa,nodeonly=False,rankonly=False,log=True,warn=True):   ### Maps taxa onto TaxID. If taxa is a list, will process each element.
    '''
    Maps taxa onto TaxID. If taxa is a list, will process each element. Returns a list.

    >> taxa: TaxID / species code / taxon name, or a list of any of these.
    >> nodeonly:bool [False] = return only the node's own TaxID (no descendants).
    >> rankonly:bool [False] = restrict returned TaxIDs to those in self.list['RankID'].
    >> log:bool [True] = write progress/summary log messages.
    >> warn:bool [True] = emit warnings for unmapped taxa.
    << list of TaxID strings. Recurses for lists, species codes and names.
    Uses shell grep on self.getStr('SpecFile') (Uniprot species list) and
    self.getStr('NameMap') (NCBI-style names file). Re-raises on error after logging.
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not taxa: return []
        taxid = []
        ### ~ [1] Taxa List ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Duck-type list detection: only lists support sort(). (Strings reach the except branch in Py2.)
        tlist = True
        try: taxa.sort()
        except: tlist = False
        if tlist:
            tx = 0.0; ttot = len(taxa)
            if ttot > 1:
                for t in taxa:
                    if log: self.progLog('\r#TAXID','Mapping to TaxID: %.1f%%' % (tx/ttot)); tx += 100.0
                    taxid += self.mapToTaxID(t,nodeonly,rankonly,log=False)
                taxid = rje.sortUnique(taxid)
                if log:
                    if ttot > 1: self.printLog('\r#TAXID','Mapped %s taxa to %s TaxID' % (rje.iStr(ttot),rje.iLen(taxid)))
            else:
                # Single-element list: map directly with per-taxon logging.
                t = taxa[0]
                if log: self.progLog('\r#TAXID','Mapping %s to TaxID...' % t)
                taxid = rje.sortUnique(self.mapToTaxID(t,nodeonly,rankonly,log=False))
                if log: self.printLog('\r#TAXID','Mapped %s to %s TaxID' % (t,rje.iLen(taxid)))
            return taxid
        ### ~ [2] Individual taxa ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        taxmap = self.dict['TaxMap']; rankid = self.list['RankID']
        taxa = '%s' % taxa  # Ensure string form for pattern matching.
        ## ~ [2a] Taxa ID ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if rje.matchExp('^(\d+)$', taxa):  # Pure numeric: already a TaxID.
            #if taxa not in taxmap: self.taxaChildren(taxa)
            #if taxa in rankid: return [taxa]
            if nodeonly:
                if taxa in rankid or not rankonly: return [taxa]
                else: return []
            if taxa not in taxmap:
                if warn: self.warnLog('Cannot find TaxID %s!' % taxa,'Missing_TaxID',suppress=True)
                return []
            # Breadth-first expansion of the TaxMap tree from this node.
            parents = [taxa]
            while parents:
                taxa = parents.pop(0)
                #if taxa not in taxmap: self.taxaChildren(taxa)
                if not rankonly or taxa in rankid: taxid.append(taxa)
                parents += taxmap[taxa]
            return taxid
        ## ~ [2b] Species Code ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # All-uppercase, no spaces => treat as Uniprot species code; grep the species file.
        if taxa == string.replace(taxa.upper(),' ',''):
            greplines = os.popen('grep "%s" %s' % (taxa, self.getStr('SpecFile'))).readlines()
            for entry in greplines:
                try: taxid.append(rje.matchExp('^%s\s+\S+\s+(\d+):' % taxa,entry)[0])
                except: pass  # Line does not match the species-code pattern.
            if not taxid and warn: self.warnLog('Cannot find Species Code "%s"!' % taxa,'Missing_SpCode',suppress=True)
            if len(taxid) > 1: self.warnLog('Species Code "%s" hits %d Taxa ID (%s)' % (taxa, len(taxid), string.join(taxid,'|')))
            return self.mapToTaxID(taxid,nodeonly,rankonly,log=False) #taxid
        ### ~ [3] Species name etc. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        taxa = taxa.replace('_',' ')
        ## ~ [3a] Grep from Uniprot ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # -B 2 pulls in the preceding lines that carry the TaxID for multi-line species entries.
        greplines = os.popen('grep -B 2 -i "%s" %s' % (taxa, self.getStr('SpecFile'))).readlines()
        gtaxid = None; comid = []; synid = []  # scientific / common-name / synonym matches.
        for entry in greplines:
            try: gtaxid = rje.matchExp('^\S+\s+\S+\s+(\d+):',entry)[0]
            except: pass  # Keep previous gtaxid for continuation lines.
            if rje.matchExp('s=(%s)\s*$' % taxa.lower(),entry.lower()): synid.append(gtaxid)
            elif rje.matchExp('c=(%s)\s*$' % taxa.lower(),entry.lower()): comid.append(gtaxid)
            elif rje.matchExp('=(%s)\s*$' % taxa.lower(),entry.lower()): taxid.append(gtaxid)
        # Prefer scientific-name matches, then common names, then synonyms.
        if not taxid: taxid = comid
        if not taxid: taxid = synid
        if not taxid and warn: self.warnLog('Cannot find Taxon name "%s" in Uniprot!' % taxa,'Missing Taxon',suppress=True)
        if len(taxid) > 1:
            #self.bugPrint(string.join(greplines))
            #self.debug('%s %s %s' % (taxid,comid,synid))
            if warn: self.warnLog('Species Code "%s" hits %d Taxa ID (%s)' % (taxa, len(taxid), string.join(taxid,'|')))
        if taxid: return self.mapToTaxID(taxid,nodeonly,rankonly,log=False) #taxid
        #self.debug(taxid)
        ## ~ [3b] Grep from NCBI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        greplines = os.popen('grep -i -e "\t%s\t" %s' % (taxa, self.getStr('NameMap'))).readlines()
        for entry in greplines:
            try:
                #gtaxid = rje.matchExp('^(\d+)\s+\S\s+(\S.+)$',entry)
                gtaxid = string.split(entry,'\t|\t')  # NCBI dump style: taxid | name | unique name | class.
                if gtaxid[1].lower() == taxa.lower(): taxid.append(gtaxid[0])
                elif gtaxid[2] and gtaxid[2].lower() == taxa.lower(): taxid.append(gtaxid[0])
            except: pass  # Malformed line: skip.
        if len(taxid) > 1 and warn: self.warnLog('Species Code "%s" hits %d Taxa ID (%s)' % (taxa, len(taxid), string.join(taxid,'|')))
        return self.mapToTaxID(taxid,nodeonly,rankonly,log=False) #taxid
    except: self.errorLog('%s.mapToTaxID() error' % (self)); raise
def treeListSPCode(self):   ### Main taxa mapping from list of tree files
    '''
    Main taxa mapping from list of tree files.

    For each Newick file in self.list['NwkList'], locates the query sequence node (file
    basename == accnum after '__'), walks up the tree until the clade contains a second
    species with sufficient bootstrap support (MinBoot), and records the closest other
    species code(s), bootstrap support, in-paralogues and paralogues in a 'spcode' table
    keyed on 'protein'. Reuses an existing spcode table (or '<TaxBase>.spcode.tdt') unless
    force=T. Returns True on success, False on error.
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        db = self.db()
        specdb = self.db('spcode',add=True,forcecheck=True,mainkeys=['protein'])
        # Fall back to a pre-computed table from the TaxBase basefile when not forcing a rerun.
        if not specdb and self.getStrLC('TaxBase') and not self.force():
            spfile = '%s.spcode.tdt' % self.getStr('TaxBase')
            specdb = db.addTable(spfile,mainkeys=['protein'],name='spcode',expect=False)
        if specdb: specdb.dataFormat({'boot':'num'}); return True
        specdb = db.addEmptyTable('spcode',['protein','boot','spcode','inpara','paralogues'],['protein'])
        #dupdb = db.addEmptyTable('para',['protein','paralogues'],['protein'])
        self.dict['Duplicates'] = {}    # {prot1:[dups]}
        ### ~ [2] ~ Add main run code here ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for nwkfile in self.list['NwkList']:
            tree = rje_tree.Tree(self.log,self.cmd_list)
            tree.loadTree(nwkfile,seqlist=None,postprocess=False)
            seqacc = rje.baseFile(nwkfile,strip_path=True)  # Query accnum = tree file basename.
            # Identify node corresponding to query sequence
            seqnode = None
            for node in tree.nodes():
                try:
                    if string.split(node.shortName(),'__')[1] == seqacc: seqnode = node
                except: pass # Internal node or bad sequence format
            if not seqnode: self.warnLog('Could not find %s in %s nodes!' % (seqacc,nwkfile)); continue
            # Get species code for query sequence
            seqspec = tree.cladeSpec(seqnode)
            if len(seqspec) != 1: self.warnLog('Could not find species in %s node!' % (seqacc)); continue
            seqspec = seqspec.keys()[0]
            # Sanity check: node name second '_' field should be the species code.
            if seqspec != string.split(seqnode.shortName(),'_')[1]:
                raise ValueError('Species mismatch for %s & %s' % (seqacc,seqnode.shortName()))
            # Find ancestor with closest orthologue outgroup
            rootnode = tree._getRootNode()
            if not rootnode: self.warnLog('Could not find root node in %s!' % (nwkfile)); continue
            ancnode = seqnode.ancNode()
            # Bootstrap support for the ancestral branch, as a fraction of total bootstraps.
            try: bootx = float(ancnode.ancBranch().stat['Bootstrap'])/tree.stat['Bootstraps']
            except: bootx = 1.0  # No bootstrap data: treat as fully supported.
            inparanode = None    # Node to define in-paralogues
            ancspec = tree.cladeSpec(ancnode)
            # Climb until the clade contains >= 2 species AND meets the MinBoot support threshold.
            while len(ancspec) < 2 or bootx < self.getNum('MinBoot'):
                inparanode = ancnode    # All same species
                if ancnode == rootnode: break
                ancnode = ancnode.ancNode(); ancspec = tree.cladeSpec(ancnode)
                try: bootx = float(ancnode.ancBranch().stat['Bootstrap'])/tree.stat['Bootstraps']
                except: bootx = 1.0
            ancspec.pop(seqspec)    # Now only have counts of closest other species
            # Update table, replacing species codes with genera?
            sentry = {'protein':seqacc,'spcode':rje.sortUnique(ancspec.keys())}
            sentry['boot'] = bootx
            if not ancspec: sentry['spcode'] = ['None']; sentry['boot'] = self.getNum('NoneBoot')
            sentry['spcode'] = string.join(sentry['spcode'],'|')
            # Establish list of duplicate proteins
            inpara = []     # List of in-paralogue nodes
            inparacc = []   # List of in-paralogue accnum
            if inparanode: inpara = tree._nodeClade(inparanode,internal=False)
            self.dict['Duplicates'][seqacc] = []
            # Classify every same-species leaf as in-paralogue (within inparanode clade) or duplicate.
            for node in tree._nodeClade(rootnode,internal=False):
                if node == seqnode: continue
                if len(string.split(node.shortName(),'_')) < 2: continue
                if string.split(node.shortName(),'_')[1] == seqspec:
                    paracc = string.split(node.shortName(),'__')[1]
                    if node in inpara: inparacc.append(paracc)
                    else: self.dict['Duplicates'][seqacc].append(paracc)
            sentry['inpara'] = string.join(inparacc,'|')
            sentry['paralogues'] = string.join(self.dict['Duplicates'][seqacc],'|')
            specdb.addEntry(sentry)
        ## Update specdb and save
        specdb.saveToFile()
        #dupdb.saveToFile()
        return True
    except: self.errorLog(self.zen()); return False
def contamination(self): ### Compares peptides from Chlamydia and human and outputs summaries '''Compares peptides from Chlamydia and human and outputs summaries.''' try:### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### db = self.obj['DB'] = rje_db.Database(self.log,self.cmd_list) mods = ['none'] ### >>>> Shortcut reanalysis without modifications >>>> ### pepfile = '%s.chlam_peptides.tdt' % self.basefile() if not self.force() and os.path.exists(pepfile): pepdb = db.addTable(pepfile,mainkeys=['key','seqmod'],name='chlam_nomod') pepdb.dropFields(['pass','modification']) pepdb.compress(['key','seq'],default='max') pepdb.dropFields(['seqmod']) for entry in pepdb.entries(): for field in pepdb.fields(): if 'len' not in field: continue try: if entry[field] and int(entry[field]): entry[field] = len(entry['seq']) else: entry[field] = '' except: self.errorLog('%s >> %s' % (entry,field),quitchoice=True) tdb = pepdb comprules = {'key':'str','pi':'str','mass':'str'} shapefields = [] for field in pepdb.fields(): if 'len' in field: comprules[field] = 'mean' if len(string.split(field,'|')) > 1 and string.split(field,'|')[0] not in shapefields: shapefields.append(string.split(field,'|')[0]) print shapefields tdb.compress(['protein'],rules=comprules,default='sum') tdb.dropFields(['seq']) tdb.saveToFile() tdb.info['Name'] = 'chlam_nomod_summary' tdb.addField('temp',evalue=1) tdb.compress(['temp'],rules=comprules,default='sum') tdb.reshapeLong('exp',shapefields) tdb.newKey(['exp']) tdb.dropFields(['exp']+shapefields,inverse=True) tdb.saveToFile() return ### <<<< End Shortcut reanalysis without modifications <<<< ### ## ~ [0a] ~ Load EB and RB human peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## self.printLog('#~~#','## ~ [0a] ~ Load EB and RB human peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##',log=False) #protein.key protein.Entry protein.Accession protein.Description protein.dataBaseType protein.score 
protein.falsePositiveRate protein.avgMass protein.MatchedProducts protein.matchedPeptides protein.digestPeps protein.seqCover(%) protein.MatchedPeptideIntenSum protein.top3MatchedPeptideIntenSum protein.MatchedProductIntenSum protein.fmolOnColumn protein.ngramOnColumn protein.AutoCurate protein.Key_ForHomologs protein.SumForTotalProteins peptide.Rank peptide.Pass peptide.matchType peptide.modification peptide.mhp peptide.seq peptide.OriginatingSeq peptide.seqStart peptide.seqLength peptide.pI peptide.componentID peptide.MatchedProducts peptide.UniqueProducts peptide.ConsectiveMatchedProducts peptide.ComplementaryMatchedProducts peptide.rawScore peptide.score peptide.(X)-P Bond peptide.MatchedProductsSumInten peptide.MatchedProductsTheoretical peptide.MatchedProductsString peptide.ModelRT peptide.Volume peptide.CSA peptide.ModelDrift peptide.RelIntensity peptide.AutoCurate precursor.leID precursor.mhp precursor.mhpCal precursor.retT precursor.inten precursor.calcInten precursor.charge precursor.z precursor.mz precursor.fraction precursor.numFrac precursor.fwhm precursor.liftOffRT precursor.infUpRT precursor.infDownRT precursor.touchDownRT prec.rmsFWHMDelta peptidePrecursor.deltaMhpPPM humedb = db.addTable('EB_IA_final_peptide.csv',mainkeys=['protein.Accession','peptide.Rank','peptide.seq','peptide.modification'],datakeys=['protein.key','protein.Accession','protein.Entry','peptide.Rank','peptide.Pass','peptide.seq','peptide.modification','peptide.OriginatingSeq'],name='humaneb') humrdb = db.addTable('RB_IA_final_peptide.csv',mainkeys=['protein.Accession','peptide.Rank','peptide.seq','peptide.modification'],datakeys=['protein.key','protein.Accession','protein.Entry','peptide.Rank','peptide.Pass','peptide.seq','peptide.modification','peptide.OriginatingSeq'],name='humanrb') for humdb in [humedb,humrdb]: humdb.info['Delimit'] = '\t' humdb.addField('exp',evalue=humdb.info['Name'][-2:]) humdb.renameField('protein.Accession','Protein') 
humdb.renameField('protein.Entry','Species') for entry in humdb.entries(): entry['Species'] = string.split(entry['Species'],'_')[-1] humdb.dropEntriesDirect('Species',['HUMAN'],inverse=True) for field in ['Rank','Pass','seq','OriginatingSeq','modification']: humdb.renameField('peptide.%s' % field,field) humdb.dataFormat({'Rank':'int'}) for mod in humdb.index('modification'): if mod.lower() and mod.lower() not in mods: mods.append(mod.lower()) humdb.addField('seqmod') for entry in humdb.entries(): if entry['modification'] and mods.index(entry['modification'].lower()): entry['seqmod'] = '%s-%d' % (entry['seq'],mods.index(entry['modification'].lower())) else: entry['seqmod'] = entry['seq'] humtdb = db.copyTable(humedb,'humantot') humtdb.newKey(['Protein','Rank','seq','modification','exp']) db.mergeTables(humtdb,db.copyTable(humrdb,'temp',add=False)) humtdb.compress(['Protein','seq','Pass'],rules={'Rank':'max'}) ## ~ [0b] ~ Load Proteomes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## self.printLog('#~~#','## ~ [0b] ~ Load Proteomes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##',log=False) # Load human proteome hseqfile = '/home/re1u06/researchfiles/SBSBINF/Databases/DBase_120225/EnsEMBL/ens_HUMAN.loci.fas' hseq = rje_seqlist.SeqList(self.log,self.cmd_list+['seqin=%s' % hseqfile]) # Load Chlamydia proteome cseqfile = '../2011-07-18-Genome/NC_010287.proteome.fas' cseq = rje_seqlist.SeqList(self.log,self.cmd_list+['seqin=%s' % cseqfile]) # Load matched protein list rbpep = rje.listFromCommand('../2011-05-ProDigIS/soton_rb_peptides.txt') ebpep = rje.listFromCommand('../2011-05-ProDigIS/soton_rb_peptides.txt') ## ~ [0c] ~ Load EB and RB Chlamydia peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## self.printLog('#~~#','## ~ [0c] ~ Load EB and RB Chlamydia peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##',log=False) chlamdb = {'EB':[],'RB':[]} for pfile in glob.glob('./Soton*/*peptide.csv'): (er,uniq) = rje.matchExp('\./Soton(\S\S)\S+_(\d+)/',pfile) 
chlamdb[er].append(db.addTable(pfile,mainkeys=['protein.key','protein.name','peptide.Rank','peptide.seq','peptide.modification'],datakeys=['protein.name','peptide.Rank','peptide.Pass','peptide.seq','peptide.modification','peptide.OriginatingSeq'],name=uniq)) edb = chlamdb['EB'].pop(0); edb.info['Name'] = 'chlam_eb' while chlamdb['EB']: db.mergeTables(edb,chlamdb['EB'].pop(0)) rdb = chlamdb['RB'].pop(0); rdb.info['Name'] = 'chlam_rb' while chlamdb['RB']: db.mergeTables(rdb,chlamdb['RB'].pop(0)) # Load EB and RB matching peptide file #edb = db.addTable('../2011-05-ProDigIS/SotonEB_peptide_pjss.csv',mainkeys=['protein.name','peptide.Rank','peptide.seq','peptide.modification'],datakeys=['protein.name','peptide.Rank','peptide.Pass','peptide.seq','peptide.OriginatingSeq'],name='chlam_eb') #rdb = db.addTable('../2011-05-ProDigIS/SotonRB_peptide_pjss.csv',mainkeys=['protein.name','peptide.Rank','peptide.seq','peptide.modification'],datakeys=['protein.name','peptide.Rank','peptide.Pass','peptide.seq','peptide.OriginatingSeq'],name='chlam_rb') for chlamdb in [edb,rdb]: chlamdb.info['Delimit'] = '\t' chlamdb.addField('exp',evalue=chlamdb.info['Name'][-2:]) chlamdb.renameField('protein.name','Protein'); chlamdb.renameField('protein.key','key') for field in ['Rank','Pass','seq','OriginatingSeq','modification']: chlamdb.renameField('peptide.%s' % field,field) chlamdb.dataFormat({'Rank':'int'}) for mod in chlamdb.index('modification'): if mod.lower() and mod.lower() not in mods: mods.append(mod.lower()) chlamdb.addField('seqmod') chlamdb.addField('Species',evalue='UNKNOWN') for entry in chlamdb.entries(): if 'Chlamydia trachomatis' in entry['Protein'] or '_CHLT2' in entry['Protein']: entry['Species'] = 'CHLT2' if entry['modification'] and mods.index(entry['modification'].lower()): entry['seqmod'] = '%s-%d' % (entry['seq'],mods.index(entry['modification'].lower())) else: entry['seqmod'] = entry['seq'] if not entry['OriginatingSeq']: entry['OriginatingSeq'] = entry['seq'] 
chlamdb.dropEntriesDirect('Species',['CHLT2'],inverse=True) chlamdb.remakeKeys() ## ~ Load Protein Key Mapping ~ ## kdb = db.addTable('NC_010287.proteinkey.tdt',mainkeys=['key'],name='keys') xdb = db.addTable('NC_010287.dbxref.tdt',mainkeys=['tag'],name='xref') tdb = db.copyTable(edb,'chlam_temp') self.deBug(tdb.entries()[0]) tdb.newKey(['Protein','Rank','Pass','seq','modification','exp']) db.mergeTables(tdb,db.copyTable(rdb,'temp',add=False)) kdb = db.joinTables(name='full_xref',join=[(kdb,'tag'),(xdb,'tag')],newkey=kdb.keys(),keeptable=True) tdb = db.joinTables(name='chlam_tot',join=[(tdb,'key'),(kdb,'key')],newkey=tdb.keys(),keeptable=True) self.deBug(tdb.keys()) self.deBug(tdb.entries()[0]) ### ~ [1] ~ Add Human Data to combined Chlamydia Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### self.printLog('#~~#','### ~ [1] ~ Add Human Data to combined Chlamydia Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###',log=False) tdb.renameField('Pass','pass'); tdb.renameField('Protein','protein'); for entry in tdb.entries(): entry['pass'] = string.atoi(entry['pass'][-1]) keep = ['key'] + xdb.fields() + ['description','protein','exp','pass','seq','seqmod','modification'] tdb.newKey(['tag','exp','pass','seqmod','Rank']) tdb.compress(['tag','exp','pass','seqmod'],rules={'pass':'******'}) tdb.dropFields(keep,inverse=True) self.deBug(tdb.keys()) self.deBug(tdb.entries()[0]) ## ~ [1a] ~ Map ID'd peptides onto Chlamydia, human and EB/RB hits ~~~~~~~~~~~~~~~~~~~~ ## bothpep = {'eb':[],'rb':[]} # Peptides found in both species chlampep = {'eb':[],'rb':[]} # Peptides found only in Chlamydia uniqpep = {'eb':[],'rb':[]} # Peptides found only in a single protein in Chlamydia fx = len(tdb.fields()) tdb.addField('pep',evalue=1) for field in ['pass1','pass2','hsap1','hsap2','uniq1','uniq2']: tdb.addField(field,evalue=0) comprules = {'pass':'******','key':'min'} for field in ['pep','pass1','pass2','hsap','uniq']: tdb.addField('%s_len' % field) 
comprules[tdb.fields()[-1]] = 'mean' shapefields = tdb.fields()[fx:] for entry in tdb.entries(): epass = '******' % entry['pass'] entry[epass.lower()] = 1 plen = entry['pep_len'] = len(entry['seq']) plen = entry['pass%d_len' % entry['pass']] = len(entry['seq']) hsap = False if entry['exp'] == 'eb': if 'Pass1' in humedb.indexDataList('seqmod',entry['seqmod'],'Pass'): entry['hsap1'] += 1; hsap = True if 'Pass2' in humedb.indexDataList('seqmod',entry['seqmod'],'Pass'): entry['hsap2'] += 1; hsap = True if entry['seqmod'] not in humedb.index('seqmod'): if entry['seq'] in humedb.index('seq'): self.errorLog('EB mod peptide %s not found in Human EB but unmod *is* found in Human EB!' % entry['seqmod'],printerror=False) if entry['seqmod'] in humrdb.index('seqmod'): self.errorLog('EB peptide %s not found in Human EB but found in Human RB!' % entry['seqmod'],printerror=False) else: if 'Pass1' in humrdb.indexDataList('seqmod',entry['seqmod'],'Pass'): entry['hsap1'] += 1; hsap = True if 'Pass2' in humrdb.indexDataList('seqmod',entry['seqmod'],'Pass'): entry['hsap2'] += 1; hsap = True if entry['seqmod'] not in humrdb.index('seqmod'): if entry['seq'] in humrdb.index('seq'): self.errorLog('RB mod peptide %s not found in Human RB but unmod *is* found in Human RB!' % entry['seqmod'],printerror=False) if entry['seqmod'] in humedb.index('seqmod'): self.errorLog('RB peptide %s not found in Human RB but found in Human EB!' 
% entry['seqmod'],printerror=False) if hsap: entry['hsap_len'] = plen; bothpep[entry['exp']].append(entry['seq']); continue chlampep[entry['exp']].append(entry['seq']) entry['uniq1'] = entry['pass1'] entry['uniq2'] = entry['pass2'] entry['uniq_len'] = plen for altentry in tdb.indexEntries('seqmod',entry['seqmod']): if altentry['tag'] == entry['tag']: continue entry['uniq1'] = entry['uniq2'] = entry['uniq_len'] = 0 if entry['uniq1'] or entry['uniq2']: uniqpep[entry['exp']].append(entry['seq']) tdb.reshapeWide('exp',shapefields) fillfields = tdb.fields()[13:] for field in fillfields[0:]: if 'len' in field: fillfields.remove(field) tdb.fillBlanks(0,fillfields,fillempty=True) for entry in tdb.entries(): if entry['modification'] == 0: entry['modification'] = '' for field in shapefields: tdb.addField('%s|tot' % field) if field[-3:] == 'len': comprules['%s|eb' % field] = 'mean' comprules['%s|rb' % field] = 'mean' comprules['%s|tot' % field] = 'mean' for entry in tdb.entries(): for field in shapefields: if entry['%s|eb' % field] and not entry['%s|rb' % field]: entry['%s|tot' % field] = entry['%s|eb' % field] elif entry['%s|rb' % field] and not entry['%s|eb' % field]: entry['%s|tot' % field] = entry['%s|rb' % field] else: entry['%s|tot' % field] = max(entry['%s|eb' % field],entry['%s|rb' % field]) tdb.info['Name'] = 'chlam_peptides' tdb.saveToFile() tdb.info['Name'] = 'chlam_proteins' tdb.compress(['protein'],rules=comprules,default='sum') tdb.dropFields(['pass','seq','modification','seqmod']) tdb.saveToFile() tdb.info['Name'] = 'chlam_summary' tdb.addField('temp',evalue=1) tdb.compress(['temp'],rules=comprules,default='sum') tdb.reshapeLong('exp',shapefields) tdb.newKey(['exp']) tdb.dropFields(['exp']+shapefields,inverse=True) tdb.saveToFile() bothpep['tot'] = bothpep['eb'] + bothpep['rb'] # Peptides found in both species chlampep['tot'] = chlampep['eb'] + chlampep['rb']# Peptides found only in Chlamydia uniqpep['tot'] = uniqpep['eb'] + uniqpep['rb'] for er in bothpep: 
open('%s.%s.bothpep.txt' % (self.basefile(),er),'w').write(string.join(rje.sortUnique(bothpep[er]),'\n')) open('%s.%s.chlampep.txt' % (self.basefile(),er),'w').write(string.join(rje.sortUnique(chlampep[er]),'\n')) open('%s.%s.uniqpep.txt' % (self.basefile(),er),'w').write(string.join(rje.sortUnique(uniqpep[er]),'\n')) return #Peptide numbers for C. trachomatis/human #1. Number of chlamydial peptides assigned for each protein from RBs #2. Number of chlamydial peptides assigned for each protein from EBs #3. Number of chlamydial peptides assigned from both EB and RB combined, with redundancy removed #4. Number of unique chlamydial peptides assigned for each protein from RBs #5. Number of unique chlamydial peptides assigned for each protein from EBs #6. Number of unique chlamydial peptides assigned for EBs and RBs combined with redundancy removed #7. Total number of human peptides identified in EB (Length would be useful) #8. Total number of human peptides identified in RB (Length would be useful) #9. Total number of human peptides identified in EB and RB #10. Human peptides matching pass 1 chlamydia peptides for RB (sequence would be useful) #11. Human peptides matching pass 2 chlamydia peptides for EB (sequence would be useful) #An accession number and protein description would be useful where possible, i.e., the number of chlamydial peptides for each protein. 
tdb.compress(['Protein','seq'],rules={'Rank':'max','Pass':'******'}) for entry in tdb.entries(): entry['exp'] = 'tot' ### ~ [1] ~ Map ID'd peptides onto Chlamydia, human and EB/RB hits ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### mapkey = 'seqmod' self.deBug(rje.sortKeys(humdb.index(mapkey))) self.printLog('#~~#','## ~ [1] ~ Map ID\'d peptides onto Chlamydia, human and EB/RB hits ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###',log=False) for chlamdb in [edb,rdb,tdb]: bothpep = [] chlampep = [] uniqpep = [] if chlamdb == edb: humdb = humedb elif chlamdb == rdb: humdb = humrdb else: humdb = humtdb chlamdb.addField('Pep',evalue=1) chlamdb.addField('Pass1',evalue=0) chlamdb.addField('Pass2',evalue=0) chlamdb.addField('Hsap1',evalue=0) chlamdb.addField('Hsap2',evalue=0) chlamdb.addField('Uniq1',evalue=0) chlamdb.addField('Uniq2',evalue=0) comprules = {'Rank':'max'} for field in ['Pep','Pass1','Pass2','Hsap','Uniq']: chlamdb.addField('%s_len' % field) comprules[chlamdb.fields()[-1]] = 'mean' for entry in chlamdb.entries(): if 'Pass1' in entry['Pass']: entry['Pass'] = '******' else: entry['Pass'] = '******' entry[entry['Pass']] += 1 entry['Pep_len'] = plen = len(entry['seq']) entry['%s_len' % entry['Pass']] = plen hsap = False self.deBug(entry[mapkey]) self.deBug(entry[mapkey] in humdb.index(mapkey)) if 'Pass1' in humdb.indexDataList(mapkey,entry[mapkey],'Pass'): entry['Hsap1'] += 1; bothpep.append(entry[mapkey]); hsap = True if 'Pass2' in humdb.indexDataList(mapkey,entry[mapkey],'Pass'): entry['Hsap2'] += 1; bothpep.append(entry[mapkey]); hsap = True if hsap: entry['Hsap_len'] = plen; continue chlampep.append(entry[mapkey]) entry['Uniq1'] = entry['Pass1'] entry['Uniq2'] = entry['Pass2'] entry['Uniq_len'] = plen for altentry in chlamdb.indexEntries(mapkey,entry[mapkey]): if altentry['Protein'] == entry['Protein']: continue entry['Uniq1'] = entry['Uniq2'] = 0 if entry['Uniq1'] or entry['Uniq2']: uniqpep.append(entry[mapkey]) 
chlamdb.dropFields(['Pass','Rank','seq','OriginatingSeq','modification']) chlamdb.compress(['Protein'],rules=comprules,default='sum') #chlamdb.dropField('Rank') chlamdb.saveToFile() open('%s.%s.bothpep.txt' % (self.basefile(),chlamdb.info['Name']),'w').write(string.join(rje.sortUnique(bothpep),'\n')) open('%s.%s.chlampep.txt' % (self.basefile(),chlamdb.info['Name']),'w').write(string.join(rje.sortUnique(chlampep),'\n')) open('%s.%s.uniqpep.txt' % (self.basefile(),chlamdb.info['Name']),'w').write(string.join(rje.sortUnique(uniqpep),'\n')) chlamdb.newKey(['Protein','exp']) db.mergeTables(edb,rdb) db.mergeTables(edb,tdb) cdb = db.copyTable(edb,'chlam_summary') edb.info['Name'] = 'chlam_pep' edb.reshapeWide('exp',edb.fields()[-7:]) edb.saveToFile() cdb.compress(['exp'],rules=comprules,default='sum') cdb.dropField('Protein') cdb.saveToFile() # - twice maybe, once using EnsEMBL sequences directly, once using EB/RB search # - Numbers of unique Pass1/2 human peptides, and numbers matching Chlam # - Numbers of matched peptides per Chlam gene: total, eb, rb, human (e/r), unique (e/r), ens (e/r) ## Do complete digest of Chlam and search against Human except: self.errorLog('%s.contamination error' % self)
def fpi(self):  ### Family-protein interactions
    '''
    Family-protein interactions.

    For each query protein whose family (self.dict['Fam']) has 2+ members, pools the
    PPI partners of all family members, keeps only spokes interacting with 2+ family
    members, writes their accession numbers to SLiMPID_FPI/<gene>.fpi.acc, and strips
    those family-level interactions from self.dict['PPI'] in both directions. Hubs
    left with no interactors are then dropped. Updates self.dict['PPI'] in place;
    no return value.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not self.dict['Domain']: return
        outdir = 'SLiMPID_FPI'
        rje.mkDir(self, outdir)
        fpi = {}        # Dictionary of {family query:[interactors]}
        badname = []    # Protein names with no sequence mapping in self.dict['Seq']
        ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for qry in rje.sortKeys(self.dict['PPI']):
            try:
                fam = self.dict['Fam'][qry]
                if len(fam) < 2: continue       # Need 2+ family members for family-level PPI
            except:
                self.errorLog('Problem with "%s" protein family' % qry)
                continue
            fpi[qry] = []
            for hub in fam:
                if hub not in self.dict['PPI']: continue
                fpi[qry] += self.dict['PPI'][hub]           # Add with redundancy
            for spoke in fpi[qry][0:]:
                if fpi[qry].count(spoke) == 1: fpi[qry].remove(spoke)   # Must have 2+ family interactions
            # Remove family-level interactions from the main PPI dictionary (both directions)
            for hub in fam:
                if hub not in self.dict['PPI']: continue
                for spoke in self.dict['PPI'][hub][0:]:
                    if spoke in fpi[qry]:
                        self.dict['PPI'][hub].remove(spoke)
                        if spoke in self.dict['PPI'] and hub in self.dict['PPI'][spoke]:
                            self.dict['PPI'][spoke].remove(hub)
            fpi[qry] = rje.sortUnique(fpi[qry], False, False)
            ## Map spokes to accession numbers and output ##
            acc = []
            gene = self.dict['Gene'][qry]
            for name in fpi[qry]:
                if not name: continue
                if name in self.dict['Seq']: acc.append(self.dict['Seq'][name].info['AccNum'])
                elif name not in badname: badname.append(name)
            # Fix: close the output file explicitly (original left the handle open)
            accfile = open('%s/%s.fpi.acc' % (outdir, gene), 'w')
            try: accfile.write(string.join(acc, '\n'))
            finally: accfile.close()
            self.printLog('#FPI', '%s family => %d interactors' % (gene, len(acc)))
        if badname:
            badname.sort()
            self.printLog('#BAD', '%d "bad" protein names: %s' % (len(badname), string.join(badname, '; ')))
        ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        hx = len(self.dict['PPI'])
        for hub in rje.sortKeys(self.dict['PPI']):
            if hub and self.dict['PPI'][hub]: continue
            self.dict['PPI'].pop(hub)
            # screen=False for consistency with the dpi() sibling method (log file only)
            self.printLog('#FPI', 'No %s PPI left after FPI removed' % hub, screen=False)
        self.printLog('#PPX', '%s of %s PPI hubs remain after FPI removed' % (rje.integerString(len(self.dict['PPI'])), rje.integerString(hx)))
    except: self.errorLog('Problem with SLiMPID.fpi()', quitchoice=True)
def picsi(self):  ### Cleans up cross-species search results
    '''
    Cleans up cross-species search results.

    Reads the summary file (self.info['SumFile']), pools peptides per search/hit,
    collapses peptides that are substrings of longer peptides (after I->L and Q->K
    conversion), removes peptides from non-query-species hits that are shared with
    query-species hits, classifies each hit as UNIQUE/NR/REDUNDANT/REJECT, and
    writes a cleaned *.clean.tdt file. No return value.

    Fixes over original: the removed-peptide counter incremented by 0 (always
    reported 0 removed), and the conversion-chain collapse indexed pepcon by
    peptide instead of by search (KeyError on any chained conversion).
    '''
    try:
        ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        datafile = self.info['SumFile']
        delimit = rje.delimitFromExt(filename=self.info['SumFile'])
        data = {}           # search:{hit:{???}}
        pep2prot = {}       # search:{peptide:[hits]}
        id2prot = {}        # search:{id:hit}
        prot2desc = {}
        fullpeplist = {}
        pepcon = {}         # Convert pep:longer pep
        speclist = []       # List of species codes
        ### ~ [1] Read Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        indata = rje.dataDict(self, datafile, ['search','prot_hit_num'], 'All', lists=True)
        for ikey in rje.sortKeys(indata):
            (search, id) = string.split(ikey, delimit)
            prot = indata[ikey]['prot_acc'][0]
            desc = string.replace(indata[ikey]['prot_desc'][0], 'Full=', '')
            if desc[3:7] == 'Name': desc = desc[9:]
            prot2desc[prot] = desc; self.printLog('#DESC', '%s = %s' % (prot, desc))
            indata[ikey]['pep_seq'] = string.join(indata[ikey]['pep_seq'], '|')
            # I/L and Q/K are indistinguishable by mass: convert before comparison
            pepconv = string.replace(indata[ikey]['pep_seq'], 'I', 'L')
            pepconv = string.replace(pepconv, 'Q', 'K')
            peplist = rje.sortUnique(string.split(pepconv, '|'))
            indata[ikey]['pep_seq'] = string.join(rje.sortUnique(string.split(indata[ikey]['pep_seq'], '|')), '|')
            if search not in data:
                data[search] = {}
                pep2prot[search] = {}
                id2prot[search] = {}
                fullpeplist[search] = []
                pepcon[search] = {}
            fullpeplist[search] += peplist
            id2prot[search][id] = prot
            spec = string.split(prot, '_')[1]
            if spec not in speclist: speclist.append(spec)
            data[search][prot] = {'search':search,'pepcount':len(peplist),'hit':id,'desc':desc,'spec':spec,
                                  'pep_uniq':0,'peplist':indata[ikey]['pep_seq'],'conpep':peplist[0:],
                                  'pep_rem':0}
            try: data[search][prot]['accnum'] = self.dict['Acc2Seq'][prot].info['AccNum']
            except: data[search][prot]['accnum'] = string.split(prot, '__')[-1]
            for pep in peplist:
                if pep not in pep2prot[search]: pep2prot[search][pep] = []
                pep2prot[search][pep].append(prot)
        ## ~ [1a] Convert peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        for search in fullpeplist:
            fullpeplist[search] = rje.sortUnique(fullpeplist[search])
            # Map each peptide that is a substring of a longer peptide onto that longer peptide
            for pep in fullpeplist[search][0:]:
                for pep2 in fullpeplist[search]:
                    if pep != pep2 and pep in pep2:
                        pepcon[search][pep] = pep2
                        fullpeplist[search].remove(pep)
                        break
            # Collapse conversion chains (a->b, b->c becomes a->c)
            for pep in pepcon[search]:
                # Fix: was pepcon[search][pepcon[pep]] (KeyError: pepcon is keyed by search)
                while pepcon[search][pep] in pepcon[search]: pepcon[search][pep] = pepcon[search][pepcon[search][pep]]
            self.printLog('#PEP', '%s %s peptide conversions' % (len(pepcon[search]), search))
            #self.deBug(pepcon[search])
            #self.deBug(rje.sortKeys(pep2prot[search]))
            pp = 0; pm = 0
            for prot in data[search]:
                for pep in data[search][prot]['conpep'][0:]:
                    if pep in pepcon[search]:
                        newpep = pepcon[search][pep]
                        if newpep not in data[search][prot]['conpep']: data[search][prot]['conpep'].append(newpep); pp += 1
                        data[search][prot]['conpep'].remove(pep); pm += 1   # Fix: was pm += 0 (always reported 0 removed)
                        if prot not in pep2prot[search][newpep]: pep2prot[search][newpep].append(prot)
                        if pep in pep2prot[search]: pep2prot[search].pop(pep)
                data[search][prot]['pep_con'] = len(data[search][prot]['conpep'])
            self.printLog('#PEP', '%s %s converted peptides added; %s removed' % (pp, search, pm))
        ### ~ [2] Calculate Unique/Redundancy status ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for search in pep2prot:
            ## ~ [2a] Species Redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Peptides shared with query-species hits are removed from other-species hits
            remx = 0
            for prot in data[search]:
                if data[search][prot]['spec'] != self.info['QrySpec']: continue
                for pep in data[search][prot]['conpep']:
                    for prot2 in pep2prot[search][pep][0:]:
                        if data[search][prot2]['spec'] == self.info['QrySpec']: continue
                        pep2prot[search][pep].remove(prot2)
                        data[search][prot2]['conpep'].remove(pep)
                        data[search][prot2]['pep_rem'] += 1; remx += 1
            self.printLog('#REM', '%s %s peptides removed from non-%s hits' % (rje.integerString(remx), search, self.info['QrySpec']))
            ## ~ [2b] One-hit wonders ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Hits with <2 peptides are dropped from the peptide->protein map
            for prot in data[search]:
                if len(data[search][prot]['conpep']) < 2:
                    for pep in data[search][prot]['conpep']:
                        #if pep in pep2prot[search] and prot in pep2prot[search][pep]:
                        pep2prot[search][pep].remove(prot)
            ## ~ [2c] Unique peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            ux = 0
            for pep in pep2prot[search]:
                #self.deBug(pep)
                if len(pep2prot[search][pep]) == 1: data[search][pep2prot[search][pep][0]]['pep_uniq'] += 1; ux += 1
            self.printLog('#UNIQ', '%s unique %s peptides' % (rje.integerString(ux), search))
            ## ~ [2d] Total Redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            summary = {'HITS':len(data[search]),'REJECT':0,'UNIQUE':0,'NR':0,'REDUNDANT':0}
            rx = 0
            for prot in data[search]:
                #if data[search][prot]['unique']: data[search][prot]['red'] = False; continue
                data[search][prot]['pep_red'] = 0   # Redundant peptides found in proteins with unique peptides
                data[search][prot]['pep_nr'] = 0    # Redundant peptides found only in proteins without unique peptides
                for pep in data[search][prot]['conpep']:
                    if pep2prot[search][pep] == [prot]: continue
                    upep = False
                    for prot2 in pep2prot[search][pep]:
                        if data[search][prot2]['pep_uniq']: upep = True; break
                    if upep: data[search][prot]['pep_red'] += 1     # Redundant peptide found in unique protein
                    else: data[search][prot]['pep_nr'] += 1         # Redundant peptide NOT found in unique protein
                if len(data[search][prot]['conpep']) < 2: data[search][prot]['class'] = 'REJECT'; rx += 1
                elif data[search][prot]['pep_uniq']: data[search][prot]['class'] = 'UNIQUE'
                elif data[search][prot]['pep_nr']: data[search][prot]['class'] = 'NR'
                else: data[search][prot]['class'] = 'REDUNDANT'; rx += 1
                summary[data[search][prot]['class']] += 1
            self.printLog('#REJ', '%s rejected %s hits' % (rje.integerString(rx), search))
            for x in rje.sortKeys(summary): self.printLog('#%s' % search, '%s %s' % (summary[x], x))
        ### ~ [3] Species ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        speclist.sort()
        species = {}
        for spec in speclist:
            try:
                grep = os.popen('grep %s %s' % (spec, self.info['SpecTDT'])).read()
                species[spec] = string.split(grep, ':')[-4]
                self.printLog('#SPEC', '%s = %s' % (spec, species[spec]))
            except: species[spec] = '?'
        ### ~ [END] Output data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        outfile = '%s.clean.tdt' % rje.baseFile(self.info['SumFile'])
        headers = ['search','hit','class','accnum','spec','species','desc','pepcount','pep_con','pep_rem','pep_uniq','pep_nr','pep_red','peplist','conpep']
        if self.dict['Acc2Seq']: headers.insert(3, 'cluster')
        rje.delimitedFileOutput(self, outfile, headers, datadict={}, rje_backup=True)
        for search in rje.sortKeys(data):
            if self.dict['Acc2Seq']: self.clusterGoodSeq(search, data[search])
            for prot in rje.sortKeys(data[search]):
                # Prefer the species name embedded in a GenInfo-style description, else the TDT lookup
                if rje.matchExp(r'^gi:(\d+).+\[(\S.+\S)\]$', data[search][prot]['desc']):
                    data[search][prot]['species'] = rje.matchExp(r'^gi:(\d+).+\[(\S.+\S)\]$', data[search][prot]['desc'])[1]
                else: data[search][prot]['species'] = species[data[search][prot]['spec']]
                rje.delimitedFileOutput(self, outfile, headers, datadict=data[search][prot])
    except: self.errorLog('Errg')
def run(self):  ### Main run method
    '''
    Main run method.

    Loads GO mappings and motif occurrence data, maps occurrence sequences to genes,
    expands each occurrence onto its gene's GO terms (plus parents), and outputs a
    *.slimfungo.tdt file of occurrences per GO term for motifs meeting MinOcc in
    2+ types including 'fwd'. No return value.

    Fixes over original: `motif` was never assigned in the occurrence loop
    (NameError); the MinOcc test called len() on a list<int comparison; the
    completion printLog applied % to a format string with no placeholder
    (TypeError); the output progress counter was never incremented.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        mygo = rje_go.GO(self.log, self.cmd_list)
        mygo.readGO()
        gomap = rje.dataDict(self, self.info['GOMap'], mainkeys=['Ensembl Gene ID'], datakeys=['GO ID'], lists=True)
        self.deBug(rje.sortKeys(gomap)[:100])
        #!# Replace 'Ensembl Gene ID' with commandline parameter at some point #!#
        self.printLog('#GOMAP', 'Loaded GO mappings for %s sequence IDs' % (rje.integerString(len(gomap))))
        slimocc = rje.dataDict(self, self.info['OccData'], mainkeys=['Motif','Seq','Start_Pos','End_Pos'], datakeys=['Motif','Seq','Start_Pos','End_Pos','Cons','HomNum'])
        self.printLog('#OCC', 'Loaded Data for %s motif occurrences.' % (rje.integerString(len(slimocc))))
        ## ~ [1a] ~ Sequence mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        seqlist = rje_seq.SeqList(self.log, ['accnr=F','seqnr=F'] + self.cmd_list)
        seqmap = {}     # Short sequence name -> Ensembl gene ID (from 'gene:' tag in FASTA name)
        (sx, stot) = (0.0, seqlist.seqNum())
        for seq in seqlist.seq:
            self.progLog('#SEQMAP', 'Mappings sequence IDs: %.1f%%' % (sx/stot)); sx += 100.0
            if rje.matchExp(r'gene:(\S+)\]', seq.info['Name']): seqmap[seq.shortName()] = rje.matchExp(r'gene:(\S+)\]', seq.info['Name'])[0]
        self.printLog('\r#SEQMAP', 'Mappings %s sequence IDs complete: %s mapped' % (rje.integerString(stot), rje.integerString(len(seqmap))))
        self.deBug(rje.sortKeys(seqmap)[:100])
        ### ~ [2] ~ Output new data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        goocc = {}      # {GO id:{motif:{'fwd'/'rev'/'scram':[occ dicts]}}}
        outfile = string.join(string.split(self.info['OccData'], '.')[:-1] + ['slimfungo','tdt'], '.')
        headers = ['GO','Motif','Type','Seq','Start_Pos','End_Pos','Cons','HomNum']
        for okey in slimocc.keys():
            self.progLog('#NEW', 'Making new GO occurrences: %s ' % (rje.integerString(len(slimocc))))
            data = slimocc.pop(okey)
            gene = seq = data['Seq']
            # Split a _rev/_scram suffix off the motif name into the occurrence type
            type = 'fwd'
            if string.split(data['Motif'], '_')[-1] in ['rev','scram']:
                type = string.split(data['Motif'], '_')[-1]
                data['Motif'] = string.join(string.split(data['Motif'], '_')[:-1], '_')
            motif = data['Motif']   # Fix: motif was never assigned (NameError below)
            if gene not in gomap and gene in seqmap: gene = seqmap[gene]
            # Expand to all parent GO terms; 'NoGo' bucket for unmapped genes
            golist = []
            if gene in gomap:
                for id in gomap[gene]: golist += mygo.parents(id)
            else: golist = ['NoGo']
            self.deBug('%s:%s::%s' % (seq, gene, golist))
            for id in rje.sortUnique(golist, False, False):
                if id not in goocc: goocc[id] = {}
                if motif not in goocc[id]: goocc[id][motif] = {'fwd':[],'rev':[],'scram':[]}
                goocc[id][motif][type].append(rje.combineDict({'GO':id,'Type':type}, data))
        # Fix: original applied % to a placeholder-free format string (TypeError)
        self.printLog('\r#NEW', 'Making new GO occurrences complete.')
        rje.delimitedFileOutput(self, outfile, headers, rje_backup=True)
        (mx, ox, ix, itot) = (0, 0, 0.0, len(goocc))
        for id in rje.sortKeys(goocc):
            ix += 100.0     # Fix: progress counter was never incremented (always 0.00%)
            for motif in rje.sortKeys(goocc[id]):
                # Drop types below MinOcc; keep motif only if 2+ types remain including 'fwd'
                for type in rje.sortKeys(goocc[id][motif]):
                    # Fix: was len(list < int) - misplaced parenthesis
                    if len(goocc[id][motif][type]) < self.stat['MinOcc']: goocc[id][motif].pop(type)
                if len(goocc[id][motif]) < 2 or 'fwd' not in goocc[id][motif]: continue
                mx += 1
                for type in goocc[id][motif]:
                    for occ in goocc[id][motif][type]: rje.delimitedFileOutput(self, outfile, headers, datadict=occ); ox += 1
                self.progLog('#OUT', 'Output to %s: %.2f%% :: %s motifs; %s occ.' % (outfile, ix/itot, rje.integerString(mx), rje.integerString(ox)))
        self.printLog('\r#OUT', 'Output of occurrences to %s is now complete: %s motifs; %s occ.' % (outfile, rje.integerString(mx), rje.integerString(ox)))
    except:
        self.log.errorLog(rje_zen.Zen().wisdom())
        raise   # Delete this if method error not terrible
def makeFlySeq(self):  ### Main run method
    '''
    Main run method.

    Builds an annotated FlyBase gene FASTA: loads gene, CDS and exon FASTA files,
    parses scaffold locations from each sequence name, attaches CDS/exon sequences
    to their parent genes, rewrites each gene name with relative CDS/exon positions,
    and saves the result to flybase_DROME.genes.fas. No return value.
    '''
    try:
        ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        flybase = rje.makePath('/scratch/Databases/NewDB/FlyBase/Fasta/')
        scmd = ['accnr=F','seqnr=F','gnspacc=F']
        genes = rje_seq.SeqList(self.log, self.cmd_list+['seqin=%sdmel-all-gene-r5.5.fasta' % flybase]+scmd)
        cds = rje_seq.SeqList(self.log, self.cmd_list+['seqin=%sdmel-all-CDS-r5.5.fasta' % flybase]+scmd)
        exons = rje_seq.SeqList(self.log, self.cmd_list+['seqin=%sdmel-all-exon-r5.5.fasta' % flybase]+scmd)
        ### ~ [1] ~ Read in full-length gene and note start and end positions in parent scaffold ~~~~~~~~~~~~~~~~ ###
        genedict = {}   # Dictionary of {ID:Sequence object}
        (gx,gtot) = (0.0,genes.seqNum())
        for gene in genes.seq:
            self.log.printLog('\r#GENE','Processing Gene Annotation: %.1f%%' % (gx/gtot),newline=False,log=False)
            gx += 100
            # Parse ID, scaffold, location, symbol and declared length from the FASTA name line
            (id,scaffold,pos,name,glen) = rje.matchExp('^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);',gene.info['Name'])
            if string.atoi(glen) != gene.aaLen(): self.log.errorLog('%s Length mismatch!' % id, printerror=False)
            genedict[id] = gene
            gene.setInfo({'Scaffold':scaffold,'Gene':name})
            # complement(a..b) genes store (end,start) swapped, so Start > End marks the reverse strand
            try: (end,start) = rje.matchExp('^complement\((\d+)\.\.(\d+)\)',pos)
            except: (start,end) = rje.matchExp('^(\d+)\.\.(\d+)',pos)
            (start,end) = (string.atoi(start),string.atoi(end))
            gene.opt['Complement'] = start > end        # Sequence on "lagging" strand
            gene.setStat({'Start':start,'End':end})
            gene.list['CDS'] = []       # Will add CDS sequences here
            gene.list['Exon'] = []      # Will add exon sequences here
        self.log.printLog('\r#GENE','Processing Gene Annotation complete!')
        ### ~ [2] ~ Read in associated CDS sequences and note start and end positions ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (cx,ctot) = (0.0,cds.seqNum())
        for seq in cds.seq:
            self.log.printLog('\r#CDS','Processing CDS Annotation: %.1f%%' % (cx/ctot),newline=False,log=False)
            cx += 100
            try: (id,scaffold,pos,name,glen,parent) = rje.matchExp('^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);.+parent=(\S+),\S+;',seq.info['Name'])
            except:
                self.log.errorLog(seq.info['Name'])
                raise
            if string.atoi(glen) != seq.aaLen(): self.log.errorLog('%s Length mismatch!' % id, printerror=False)
            seq.obj['Parent'] = gene = genedict[parent]
            # complement/join locations may span introns: '\..*\.' captures only the outer bounds
            try: (end,start) = rje.matchExp('^complement\((\d+)\..*\.(\d+)\)',pos)
            except:
                try: (start,end) = rje.matchExp('^join\((\d+)\..*\.(\d+)\)',pos)
                except: (start,end) = rje.matchExp('^(\d+)\.\.(\d+)',pos)
            (start,end) = (string.atoi(start),string.atoi(end))
            seq.opt['Complement'] = start > end     # Sequence on "lagging" strand
            seq.setStat({'Start':start,'End':end})
            gene.list['CDS'].append(seq)
        self.log.printLog('\r#CDS','Processing CDS Annotation complete!')
        ### ~ [3] ~ Read in associated exons and note start and end positions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (ex,etot) = (0.0,exons.seqNum())
        for seq in exons.seq:
            self.log.printLog('\r#EXON','Processing Exon Annotation: %.1f%%' % (ex/etot),newline=False,log=False)
            ex += 100
            try: (id,scaffold,pos,name,parent) = rje.matchExp('^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+parent=(\S+);',seq.info['Name'])
            except:
                self.log.errorLog(seq.info['Name'])
                raise
            # Exon 'parent' may list several transcripts: map to gene via the first
            seq.obj['Parent'] = gene = genedict[string.split(parent,',')[0]]
            try: (end,start) = rje.matchExp('^complement\((\d+)\..*\.(\d+)\)',pos)
            except:
                try: (start,end) = rje.matchExp('^join\((\d+)\..*\.(\d+)\)',pos)
                except: (start,end) = rje.matchExp('^(\d+)\.\.(\d+)',pos)
            (start,end) = (string.atoi(start),string.atoi(end))
            seq.opt['Complement'] = start > end     # Sequence on "lagging" strand
            seq.setStat({'Start':start,'End':end})
            gene.list['Exon'].append(seq)
        self.log.printLog('\r#EXON','Processing Exon Annotation complete!')
        ### ~ [4] ~ Regenerate output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [4a] ~ Convert to relative positions and store ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        (gx,gtot) = (0.0,genes.seqNum())
        for gene in genes.seq:
            glen = gene.aaLen()
            self.log.printLog('\r#GENE','Generating new Gene Annotation: %.1f%%' % (gx/gtot),newline=False,log=False)
            gx += 100
            clist = []
            for seq in gene.list['CDS']:
                # NOTE(review): for Complement genes, gene.stat['Start'] is the larger scaffold
                # coordinate (start/end were swapped), so this measures from the gene's 5' end
                # on the reverse strand - confirm against expected relative positions.
                if gene.opt['Complement']:  # Must subtract from "wrong" end and reverse
                    start = gene.stat['Start'] - seq.stat['Start']
                    end = gene.stat['Start'] - seq.stat['End']
                else:
                    start = seq.stat['Start'] - gene.stat['Start']
                    end = seq.stat['End'] - gene.stat['Start']
                pos = '%s-%s' % (rje.preZero(start,glen),rje.preZero(end,glen))
                clist.append(pos)
            clist = rje.sortUnique(clist,xreplace=False)
            elist = []
            for seq in gene.list['Exon']:
                if gene.opt['Complement']:  # Must subtract from "wrong" end and reverse
                    start = gene.stat['Start'] - seq.stat['Start']
                    end = gene.stat['Start'] - seq.stat['End']
                else:
                    start = seq.stat['Start'] - gene.stat['Start']
                    end = seq.stat['End'] - gene.stat['Start']
                pos = '%s-%s' % (rje.preZero(start,glen),rje.preZero(end,glen))
                elist.append(pos)
            elist = rje.sortUnique(elist,xreplace=False)
            gene.info['Name'] = '%s_%s__%s Length=%d; CDS=%s; Exons=%s;' % (gene.info['Gene'],gene.info['SpecCode'],gene.info['AccNum'],gene.aaLen(),string.join(clist,','),string.join(elist,','))
        self.log.printLog('\r#GENE','Generating new Gene Annotation complete!')
        ## ~ [4b] ~ Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        genes.saveFasta(seqfile='flybase_DROME.genes.fas')
    except: self.log.errorLog(rje_zen.Zen().wisdom())
def taxaMap(self):  ### Maps species codes onto different taxonomic ranks.
    '''
    Maps species codes onto different taxonomic ranks.

    Builds two database tables and saves each to file:
    - 'taxamap': one row per protein, with the taxa for each rank (genus..phylum)
      combined across the protein's species codes ('|'-separated).
    - 'taxa': one row per species code with its TaxID, species name and the taxon
      assigned at each rank.
    Unmappable species codes become 'Unmapped'; with Monophyly=True, ambiguous
    assignments become 'Uncertain'. No return value.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        db = self.db()
        tax = self.obj['Taxonomy']
        ### ~ [2] ~ Add main run code here ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        specdb = self.db('spcode')
        #descdb = self.db('protdesc')
        ranks = ['genus', 'family', 'order', 'class', 'phylum']
        rankmap = {}    # SPCODE to Taxon dictionary: {rank:{spcode:taxon}}
        rankfields = ['protein'] + ranks + specdb.fields()[1:]
        #if descdb: rankfields.append('desc')
        if self.getStrLC('ProtDesc'):
            rankfields.append('desc')
            px = 0
            # Add proteins from the ProtDesc input that lack trees, with placeholder species data
            for prot in self.dict['ProtDesc']:
                if prot.lower() in ['', 'protein', 'gene']: continue
                pentry = { 'protein': prot, 'spcode': 'None', 'boot': self.getNum('NoneBoot') }
                pkey = specdb.makeKey(pentry)
                if pkey not in specdb.dataKeys():
                    specdb.addEntry(pentry)
                    px += 1
            self.printLog( '#PROT', 'Added %s proteins from %s without trees.' % (rje.iStr(px), self.getStr('ProtDesc')))
        rankdb = db.addEmptyTable('taxamap', rankfields, ['protein'])
        for rank in ranks:
            rankmap[rank] = { 'None': 'None', 'Unmapped': 'Unmapped', 'Uncertain': 'Uncertain' }
        taxdb = db.addEmptyTable('taxa', ['spcode', 'taxid', 'name'] + ranks, ['spcode'])
        sx = 0.0
        stot = specdb.entryNum()
        for entry in specdb.entries():
            self.progLog('\r#SPEC', 'Processing species: %.2f%%' % (sx / stot))
            sx += 100.0
            #if descdb:
            #try: entry['desc'] = descdb.data(descdb.makeKey(entry))['description']
            try: entry['desc'] = self.dict['ProtDesc'][entry['protein']]
            except: entry['desc'] = ''
            # Resolve each species code of this entry once; rankmap caches results
            for spcode in string.split(entry['spcode'], '|'):
                if spcode in rankmap['genus']: continue     # Already mapped
                tentry = {'spcode': spcode}
                try:
                    taxid = tax.mapToTaxID(spcode, nodeonly=True, warn=False)[0]
                    rank = tax.dict['Rank'][taxid]
                    tentry['taxid'] = taxid
                    tentry['name'] = tax.getSpecies(taxid)
                except:
                    self.warnLog( 'Unable to map species code "%s" to TaxID -> "Unmapped"' % spcode)
                    taxid = 'Unmapped'
                    rank = 'genus'
                # Loop through different ranks
                for ri in range(len(ranks)):
                    nextrank = ranks[ri]
                    # Climb parent links until reaching this rank (or a higher one) or the root
                    while rank not in ranks[ri:] and taxid in tax.dict['Parent']:
                        taxid = tax.dict['Parent'][taxid]
                        rank = tax.dict['Rank'][taxid]
                    #self.debug('%s: %s' % (tax.dict['Rank'][taxid],tax.getSpecies(taxid)))
                    if taxid in tax.dict['Parent']: taxon = tax.getSpecies(taxid)
                    else: taxon = 'Unmapped'
                    if rank != nextrank:
                        # Overshot the requested rank: taxon is only approximate at this level
                        if self.getBool('Monophyly'): taxon = 'Uncertain'
                        else: taxon = '%s %s.' % (taxon, nextrank[:3])
                    rankmap[nextrank][spcode] = taxon
                    tentry[nextrank] = taxon
                taxdb.addEntry(tentry)
            # Combine the per-spcode taxa into one taxamap row for this protein
            rentry = {}
            for nextrank in ranks:
                taxa = []
                unmapped = ''
                for spcode in string.split(entry['spcode'], '|'):
                    ranktax = rankmap[nextrank][spcode]
                    if 'unmapped' in ranktax.lower( ) and ranktax not in taxa:
                        if unmapped: self.warnLog('Two Unmapped %s taxa: %s & %s' % (nextrank, unmapped, ranktax))
                        unmapped = ranktax  #i# Should only be one
                    if ranktax not in taxa: taxa.append(ranktax)
                if len(taxa) > 1 and 'None' in taxa:
                    self.warnLog('None in: %s' % string.join(rje.sortUnique(taxa), '|'))
                    taxa.remove('None')
                if len(taxa) > 1 and unmapped: taxa.remove(unmapped)
                if len(taxa) > 1 and self.getBool('Monophyly'): rentry[nextrank] = 'Uncertain'
                else: rentry[nextrank] = string.join(rje.sortUnique(taxa), '|')
            rankdb.addEntry(rje.combineDict(rentry, entry))
        self.printLog( '\r#SPEC', '%s proteins with species codes processed.' % rje.iStr(stot))
        rankdb.saveToFile()
        taxdb.saveToFile()
    except: self.errorLog('%s.taxaMap error' % self.prog())
def taxaMap(self):  ### Maps species codes onto different taxonomic ranks.
    '''
    Maps species codes onto different taxonomic ranks.

    Produces and saves two tables: 'taxamap' (one row per protein, taxa per rank
    joined with '|') and 'taxa' (one row per species code with TaxID, name and
    taxon at each rank). Species codes that cannot be mapped become 'Unmapped';
    with Monophyly=True, ambiguous rank assignments become 'Uncertain'.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        maindb = self.db()
        tax = self.obj['Taxonomy']
        ### ~ [2] ~ Main processing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        specdb = self.db('spcode')
        ranks = ['genus', 'family', 'order', 'class', 'phylum']
        code2taxon = {}     # {rank:{spcode:taxon}} cache, filled as codes are resolved
        rankfields = ['protein'] + ranks + specdb.fields()[1:]
        if self.getStrLC('ProtDesc'):
            rankfields.append('desc')
            added = 0
            # Proteins listed in ProtDesc but absent from the tree get placeholder entries
            for prot in self.dict['ProtDesc']:
                if prot.lower() in ['', 'protein', 'gene']: continue
                pentry = {'protein': prot, 'spcode': 'None', 'boot': self.getNum('NoneBoot')}
                if specdb.makeKey(pentry) not in specdb.dataKeys():
                    specdb.addEntry(pentry)
                    added += 1
            self.printLog('#PROT', 'Added %s proteins from %s without trees.' % (rje.iStr(added), self.getStr('ProtDesc')))
        rankdb = maindb.addEmptyTable('taxamap', rankfields, ['protein'])
        for rank in ranks:
            code2taxon[rank] = {'None': 'None', 'Unmapped': 'Unmapped', 'Uncertain': 'Uncertain'}
        taxdb = maindb.addEmptyTable('taxa', ['spcode', 'taxid', 'name'] + ranks, ['spcode'])
        done = 0.0
        total = specdb.entryNum()
        for entry in specdb.entries():
            self.progLog('\r#SPEC', 'Processing species: %.2f%%' % (done / total))
            done += 100.0
            try: entry['desc'] = self.dict['ProtDesc'][entry['protein']]
            except: entry['desc'] = ''
            # Resolve each not-yet-seen species code for this protein onto every rank
            for spcode in string.split(entry['spcode'], '|'):
                if spcode in code2taxon['genus']: continue      # Cached from an earlier entry
                taxentry = {'spcode': spcode}
                try:
                    taxid = tax.mapToTaxID(spcode, nodeonly=True, warn=False)[0]
                    rank = tax.dict['Rank'][taxid]
                    taxentry['taxid'] = taxid
                    taxentry['name'] = tax.getSpecies(taxid)
                except:
                    self.warnLog('Unable to map species code "%s" to TaxID -> "Unmapped"' % spcode)
                    taxid = 'Unmapped'
                    rank = 'genus'
                for ridx, nextrank in enumerate(ranks):
                    # Climb the taxonomy until this rank (or a higher one) is reached, or we hit the root
                    while rank not in ranks[ridx:] and taxid in tax.dict['Parent']:
                        taxid = tax.dict['Parent'][taxid]
                        rank = tax.dict['Rank'][taxid]
                    if taxid in tax.dict['Parent']: taxon = tax.getSpecies(taxid)
                    else: taxon = 'Unmapped'
                    if rank != nextrank:
                        # Overshot the requested rank: only an approximate assignment is possible
                        if self.getBool('Monophyly'): taxon = 'Uncertain'
                        else: taxon = '%s %s.' % (taxon, nextrank[:3])
                    code2taxon[nextrank][spcode] = taxon
                    taxentry[nextrank] = taxon
                taxdb.addEntry(taxentry)
            # Fold the per-spcode taxa into a single taxamap row for this protein
            rowentry = {}
            for nextrank in ranks:
                taxa = []
                unmapped = ''
                for spcode in string.split(entry['spcode'], '|'):
                    ranktax = code2taxon[nextrank][spcode]
                    if 'unmapped' in ranktax.lower() and ranktax not in taxa:
                        if unmapped: self.warnLog('Two Unmapped %s taxa: %s & %s' % (nextrank, unmapped, ranktax))
                        unmapped = ranktax  #i# Should only be one
                    if ranktax not in taxa: taxa.append(ranktax)
                if len(taxa) > 1 and 'None' in taxa:
                    self.warnLog('None in: %s' % string.join(rje.sortUnique(taxa), '|'))
                    taxa.remove('None')
                if len(taxa) > 1 and unmapped: taxa.remove(unmapped)
                if len(taxa) > 1 and self.getBool('Monophyly'): rowentry[nextrank] = 'Uncertain'
                else: rowentry[nextrank] = string.join(rje.sortUnique(taxa), '|')
            rankdb.addEntry(rje.combineDict(rowentry, entry))
        self.printLog('\r#SPEC', '%s proteins with species codes processed.' % rje.iStr(total))
        rankdb.saveToFile()
        taxdb.saveToFile()
    except: self.errorLog('%s.taxaMap error' % self.prog())
def makeFlySeq(self):   ### Main run method
    '''
    Build a combined FlyBase gene FASTA with relative CDS/exon coordinates.

    Reads three FlyBase r5.5 FASTA files (genes, CDS, exons), maps each CDS and
    exon to its parent gene via the `parent=` field of the FASTA name line,
    converts absolute scaffold positions to positions relative to the gene
    start, and saves the genes with the CDS/exon position lists embedded in the
    sequence name to flybase_DROME.genes.fas.
    '''
    try:
        ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # NOTE(review): hard-coded scratch path and r5.5 release — confirm still valid for this environment.
        flybase = rje.makePath('/scratch/Databases/NewDB/FlyBase/Fasta/')
        scmd = ['accnr=F', 'seqnr=F', 'gnspacc=F']  # disable redundancy filtering/renaming on load
        genes = rje_seq.SeqList(self.log, self.cmd_list + ['seqin=%sdmel-all-gene-r5.5.fasta' % flybase] + scmd)
        cds = rje_seq.SeqList(self.log, self.cmd_list + ['seqin=%sdmel-all-CDS-r5.5.fasta' % flybase] + scmd)
        exons = rje_seq.SeqList(self.log, self.cmd_list + ['seqin=%sdmel-all-exon-r5.5.fasta' % flybase] + scmd)
        ### ~ [1] ~ Read in full-length gene and note start and end positions in parent scaffold ~~~~~~~~~~~~~~~~ ###
        genedict = {}   # Dictionary of {ID:Sequence object}
        (gx, gtot) = (0.0, genes.seqNum())
        for gene in genes.seq:
            self.log.printLog('\r#GENE', 'Processing Gene Annotation: %.1f%%' % (gx / gtot), newline=False, log=False)
            gx += 100
            # Pull ID, scaffold, position string, gene name and declared length from the FlyBase name line.
            (id, scaffold, pos, name, glen) = rje.matchExp(
                '^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);', gene.info['Name'])
            # Sanity check: declared length should match the loaded sequence length.
            if string.atoi(glen) != gene.aaLen():
                self.log.errorLog('%s Length mismatch!' % id, printerror=False)
            genedict[id] = gene
            gene.setInfo({'Scaffold': scaffold, 'Gene': name})
            # complement(a..b) entries are read (end, start) so that start > end flags the strand below.
            try:
                (end, start) = rje.matchExp('^complement\((\d+)\.\.(\d+)\)', pos)
            except:
                (start, end) = rje.matchExp('^(\d+)\.\.(\d+)', pos)
            (start, end) = (string.atoi(start), string.atoi(end))
            gene.opt['Complement'] = start > end    # Sequence on "lagging" strand
            gene.setStat({'Start': start, 'End': end})
            gene.list['CDS'] = []   # Will add CDS sequences here
            gene.list['Exon'] = []  # Will add exon sequences here
        self.log.printLog('\r#GENE', 'Processing Gene Annotation complete!')
        ### ~ [2] ~ Read in associated CDS sequences and note start and end positions ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (cx, ctot) = (0.0, cds.seqNum())
        for seq in cds.seq:
            self.log.printLog('\r#CDS', 'Processing CDS Annotation: %.1f%%' % (cx / ctot), newline=False, log=False)
            cx += 100
            try:
                # CDS name lines additionally carry a parent=GENE,... field linking back to the gene.
                (id, scaffold, pos, name, glen, parent) = rje.matchExp(
                    '^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);.+parent=(\S+),\S+;', seq.info['Name'])
            except:
                self.log.errorLog(seq.info['Name'])
                raise
            if string.atoi(glen) != seq.aaLen():
                self.log.errorLog('%s Length mismatch!' % id, printerror=False)
            seq.obj['Parent'] = gene = genedict[parent]
            # Position may be complement(...), join(...) or a plain a..b range; try in that order.
            try:
                (end, start) = rje.matchExp('^complement\((\d+)\..*\.(\d+)\)', pos)
            except:
                try:
                    (start, end) = rje.matchExp('^join\((\d+)\..*\.(\d+)\)', pos)
                except:
                    (start, end) = rje.matchExp('^(\d+)\.\.(\d+)', pos)
            (start, end) = (string.atoi(start), string.atoi(end))
            seq.opt['Complement'] = start > end     # Sequence on "lagging" strand
            seq.setStat({'Start': start, 'End': end})
            gene.list['CDS'].append(seq)
        self.log.printLog('\r#CDS', 'Processing CDS Annotation complete!')
        ### ~ [3] ~ Read in associated exons and note start and end positions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (ex, etot) = (0.0, exons.seqNum())
        for seq in exons.seq:
            self.log.printLog('\r#EXON', 'Processing Exon Annotation: %.1f%%' % (ex / etot), newline=False, log=False)
            ex += 100
            try:
                # Exon name lines have no length= field; parent may list several genes, comma-separated.
                (id, scaffold, pos, name, parent) = rje.matchExp(
                    '^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+parent=(\S+);', seq.info['Name'])
            except:
                self.log.errorLog(seq.info['Name'])
                raise
            # Only the first listed parent gene is used for multi-parent exons.
            seq.obj['Parent'] = gene = genedict[string.split(parent, ',')[0]]
            try:
                (end, start) = rje.matchExp('^complement\((\d+)\..*\.(\d+)\)', pos)
            except:
                try:
                    (start, end) = rje.matchExp('^join\((\d+)\..*\.(\d+)\)', pos)
                except:
                    (start, end) = rje.matchExp('^(\d+)\.\.(\d+)', pos)
            (start, end) = (string.atoi(start), string.atoi(end))
            seq.opt['Complement'] = start > end     # Sequence on "lagging" strand
            seq.setStat({'Start': start, 'End': end})
            gene.list['Exon'].append(seq)
        self.log.printLog('\r#EXON', 'Processing Exon Annotation complete!')
        ### ~ [4] ~ Regenerate output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [4a] ~ Convert to relative positions and store ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        (gx, gtot) = (0.0, genes.seqNum())
        for gene in genes.seq:
            glen = gene.aaLen()     # used to zero-pad positions to a consistent width
            self.log.printLog('\r#GENE', 'Generating new Gene Annotation: %.1f%%' % (gx / gtot), newline=False, log=False)
            gx += 100
            clist = []  # relative CDS position strings
            for seq in gene.list['CDS']:
                if gene.opt['Complement']:  # Must subtract from "wrong" end and reverse
                    start = gene.stat['Start'] - seq.stat['Start']
                    end = gene.stat['Start'] - seq.stat['End']
                else:
                    start = seq.stat['Start'] - gene.stat['Start']
                    end = seq.stat['End'] - gene.stat['Start']
                pos = '%s-%s' % (rje.preZero(start, glen), rje.preZero(end, glen))
                clist.append(pos)
            clist = rje.sortUnique(clist, xreplace=False)
            elist = []  # relative exon position strings
            for seq in gene.list['Exon']:
                if gene.opt['Complement']:  # Must subtract from "wrong" end and reverse
                    start = gene.stat['Start'] - seq.stat['Start']
                    end = gene.stat['Start'] - seq.stat['End']
                else:
                    start = seq.stat['Start'] - gene.stat['Start']
                    end = seq.stat['End'] - gene.stat['Start']
                pos = '%s-%s' % (rje.preZero(start, glen), rje.preZero(end, glen))
                elist.append(pos)
            elist = rje.sortUnique(elist, xreplace=False)
            # Rebuild the FASTA name with gene symbol, species code, accession and embedded CDS/exon maps.
            gene.info['Name'] = '%s_%s__%s Length=%d; CDS=%s; Exons=%s;' % (
                gene.info['Gene'], gene.info['SpecCode'], gene.info['AccNum'], gene.aaLen(),
                string.join(clist, ','), string.join(elist, ','))
        self.log.printLog('\r#GENE', 'Generating new Gene Annotation complete!')
        ## ~ [4b] ~ Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        genes.saveFasta(seqfile='flybase_DROME.genes.fas')
    except:
        # Catch-all logging in keeping with the rest of the module.
        self.log.errorLog(rje_zen.Zen().wisdom())
def iTRAQSamples(self):     ### Uses self.dict['Samples'] and self.db('itraq') to summarise hit data
    '''
    Uses self.dict['Samples'] and self.db('itraq') to summarise hit data.

    Copies the 'itraq' table into 'itraq_summary' (per-peptide ratios) and
    'itraq_geomean' (geometric-mean ratios), drops isotag pairs with no sample
    mapping, reshapes to wide format, renames/inverts ratio fields so that the
    lower isotag number always comes first, pools replicate ratios per
    sample-pair, and finally reduces each pool to geometric mean plus Min/Max
    and an UP/DOWN direction call. Both tables are saved to file.
    '''
    try:
        ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        db = self.db(); idb = self.db('itraq')
        mdb = db.copyTable(idb, 'itraq_summary')
        gdb = db.copyTable(idb, 'itraq_geomean')
        ### ~ [1] Reformat Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Each copy keeps a single 'ratio' field: mdb keeps per-peptide ratios, gdb keeps the geomeans.
        mdb.dropField('geomean'); gdb.dropField('ratio'); gdb.renameField('geomean', 'ratio')
        for sdb in [mdb, gdb]:
            sdb.dropField('summary')
            sdb.dropEntriesDirect('ratio', '---')   # drop placeholder/missing ratios
            sdb.dropEntriesDirect('ratio', 'NN')
            sdb.dataFormat({'ratio': 'num', 'n': 'int'})
            ## ~ [1a] Drop tags with Samples ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            (ex, etot) = (0.0, sdb.entryNum())
            for entry in sdb.entries():
                self.progLog('\r#ITRAQ', 'Drop isotags without Sample info: %.2f%%' % (ex/etot)); ex += 100.0
                # 'itraq' holds an isotag pair like "114/115"; both tags must map to a known sample.
                tags = string.split(entry['itraq'], '/')
                if tags[0] not in self.dict['Samples'] or tags[1] not in self.dict['Samples']: sdb.dropEntry(entry)
            self.printLog('\r#ITRAQ', 'Dropped all isotags without Sample info: %s of %s entries remain' % (rje.iStr(sdb.entryNum()), rje.iStr(etot)))
            ## ~ [1b] Reshape, rename, invert and remove redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            sdb.reshapeWide('itraq', ['ratio', 'n'])    # one 'ratio|TAG1/TAG2' (and 'n|...') field per isotag pair
            samples = rje.sortUnique(self.dict['Samples'].values())
            ratios = []     # sample-pair ratio fields created below (e.g. 'A/B')
            self.printLog('#SAMP', string.join(samples, ', '))
            # Create one pooled list field (plus Min/Max/Dirn) per ordered sample pair, incl. self-pairs.
            for s1 in samples:
                for s2 in samples[samples.index(s1):]:
                    newfield = '%s/%s' % (s1, s2)
                    sdb.addField(newfield)
                    sdb.addField('%s_Min' % newfield)
                    sdb.addField('%s_Max' % newfield)
                    sdb.addField('%s_Dirn' % newfield)
                    ratios.append(newfield)
                    for entry in sdb.entries(): entry[newfield] = []    # pool of replicate ratios
            for field in sdb.fields():
                if '|' in field:    # reshaped fields look like 'ratio|114/115' or 'n|114/115'
                    (score, tags) = string.split(field, '|')
                    tag = string.split(tags, '/')
                    if int(tag[0]) > int(tag[1]):   ### Invert so the lower isotag always comes first
                        newfield = '%s|%s/%s' % (score, tag[1], tag[0])
                        # NOTE(review): when the inverted name already exists this drops it and skips —
                        # looks like it may discard data; confirm intent against the original source.
                        if newfield in sdb.fields(): sdb.dropField(newfield); continue
                        sdb.renameField(field, newfield)
                        if score == 'ratio':
                            for entry in sdb.entries():
                                if entry[newfield]: entry[newfield] = 1.0 / entry[newfield]     # invert ratio value too
                        tag = (tag[1], tag[0])
                        field = newfield
                    s1 = self.dict['Samples'][tag[0]]
                    s2 = self.dict['Samples'][tag[1]]
                    newname = '%s|%s%s/%s%s' % (score, s1, tag[0], s2, tag[1])  # e.g. 'ratio|A114/B115'
                    sdb.renameField(field, newname)
                    if score == 'n': continue   # only pool the ratio fields
                    newfield = '%s/%s' % (s1, s2)
                    invfield = '%s/%s' % (s2, s1)
                    for entry in sdb.entries():
                        # Pool into the sample-pair field if it exists; otherwise invert into the reverse pair.
                        if entry[newname] and newfield in sdb.fields(): entry[newfield].append(entry[newname])
                        elif entry[newname]: entry[invfield].append(1.0/entry[newname])
            ## ~ [1c] Calculate Geometric mean ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            (ex, etot) = (0.0, sdb.entryNum())
            for entry in sdb.entries():
                self.progLog('\r#GEO', 'Calculating Geometric means: %.2f%%' % (ex/etot)); ex += 100.0
                for ratio in ratios:
                    if entry[ratio]:
                        entry['%s_Min' % ratio] = min(entry[ratio])
                        entry['%s_Max' % ratio] = max(entry[ratio])
                        try: entry[ratio] = rje.geoMean(entry[ratio])
                        except: self.deBug(entry)   # best-effort: leave the pool in place on failure
                        # Direction call only when ALL replicates agree with the geomean direction.
                        if entry[ratio] > 1 and entry['%s_Min' % ratio] > 1: entry['%s_Dirn' % ratio] = 'UP'
                        elif entry[ratio] < 1 and entry['%s_Max' % ratio] < 1: entry['%s_Dirn' % ratio] = 'DOWN'
                    # NOTE(review): reconstructed from whitespace-mangled source — this else is taken to
                    # blank all four fields when no replicate ratios were pooled; confirm the original
                    # indentation did not attach it to the if/elif direction chain instead.
                    else: entry['%s_Dirn' % ratio] = entry['%s_Min' % ratio] = entry['%s_Max' % ratio] = entry[ratio] = ''
            self.printLog('\r#GEO', 'Geometric mean calculations complete')
            sdb.saveToFile()
    except:
        self.errorLog('iTRAQSamples error')
def run(self):  ### Main run method
    '''
    Main run method.

    Pipeline for identifying 6-frame-translation (6RF) MS hits with no genomic
    homology: [1] reformat *.fasta files to *.fas with 6rf_NEIME__/ref_NEIME__
    names and format them as BLAST databases; [2] collect 6RF hit accessions
    from Mascot CSV exports; [3] GABLAM the hits against the MC58_1 genome;
    [4] re-GABLAM the zero-hit set against EMBL bacteria and write a summary
    table (MC58_6RF_Zeros.tdt) with sample provenance and best hit.
    '''
    try:
        ### ~ [1] Reformat Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for fasta in glob.glob('*.fasta'):
            fas = fasta[:-2]    # 'X.fasta' -> 'X.fas'
            if os.path.exists(fas): continue    # already reformatted on a previous run
            sx = 0
            for line in open(fasta, 'r').readlines():
                if line[:1] == '>':
                    # Split header into name + description; fall back to name-only headers.
                    try: (name, desc) = rje.matchExp('^>(\S+) (\S.+)$', line)
                    except: name = rje.matchExp('^>(\S+)', line)[0]
                    # 3 pipe-separated parts = 6RF translation ID; 5 parts = reference protein ID.
                    if len(string.split(name, '|')) == 3:
                        name = '6rf_NEIME__%s' % string.split(name, '|')[2]
                        open(fas, 'a').write('>%s\n' % name)
                    elif len(string.split(name, '|')) == 5:
                        name = 'ref_NEIME__%s' % string.split(name, '|')[3]
                        open(fas, 'a').write('>%s %s\n' % (name, desc))
                    else:
                        print string.split(name, '|')   # unexpected header format: show and abort
                        raise ValueError
                    self.progLog('\r#FAS', 'Processing %s: %s seqs' % (fas, rje.integerString(sx)))
                    sx += 1
                else: open(fas, 'a').write(line)    # sequence line: copy through unchanged
            self.printLog('\r#FAS', 'Processed %s: %s seqs from %s' % (fas, rje.integerString(sx), fasta))
            rje_blast.BLASTRun(self.log, self.cmd_list).formatDB(fas, protein=True, force=True)
        ### ~ [2] Read in CSV Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        rfhits = {}     # Dictionary of {hit:['File:hit_num']}
        acc = 'MC58_6RF_Hits.acc'
        open(acc, 'w')  # truncate/create the accession output file
        gfile = 'MC58_6RF_Hits.vs.MC58_1.hitsum.tdt'
        cx = 0
        for csv in glob.glob('MC58_6RF_CSV/*.CSV'):
            cx += 1
            file = os.path.basename(csv)[:-4]   # strip '.CSV'
            hits = False    # becomes True once the protein-hit header row is seen
            for line in open(csv, 'r').readlines():
                if line.find('prot_hit_num,prot_acc') == 0: hits = True
                elif hits:
                    data = rje.readDelimit(line, ',')
                    if len(data) < 2: continue
                    [num, name] = data[:2]
                    # Keep only pipe-formatted (6RF) accessions; others lack index [2] and are skipped.
                    try: name = string.split(name, '|')[2]
                    except: continue
                    if name not in rfhits:
                        open(acc, 'a').write('6rf_NEIME__%s\n' % name)
                        rfhits[name] = []
                    id = '%s:%s' % (file, num)  # record which CSV file and hit number found this protein
                    if id not in rfhits[name]: rfhits[name].append(id)
                    self.progLog('\r#CSV', 'Reading %d CSV files: %s 6RF Hits' % (cx, rje.integerString(len(rfhits))))
        self.printLog('\r#CSV', 'Read %d CSV files: %s 6RF Hits output to %s' % (cx, rje.integerString(len(rfhits)), acc))
        ### ~ [3] Extract sequences and perform GABLAM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not os.path.exists(gfile):   # skip if the GABLAM summary already exists
            seqlist = rje_seq.SeqList(self.log, self.cmd_list + ['seqin=%s' % acc, 'fasdb=MC58_6RF.fas', 'seqout=MC58_6RF_Hits.fas', 'autoload=T', 'accnr=F', 'seqnr=F'])
            seqlist.info['Name'] = 'MC58_6RF_Hits.fas'
            seqlist.saveFasta()
            gablam.GABLAM(self.log, self.cmd_list + ['seqin=MC58_6RF_Hits.fas', 'searchdb=MC58_1.fas', 'qryacc=F']).gablam()
        ### ~ [4] Read in GABLAM and ID Hits without genomic homology ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        gdata = rje.dataDict(self, gfile, ['Qry'], ['HitNum'])
        zeros = []  # queries with no BLAST hits against the MC58_1 genome
        for hit in gdata:
            if string.atoi(gdata[hit]['HitNum']) == 0: zeros.append(hit)
        zeros = rje.sortUnique(zeros, False)
        open('6rf_zeros.acc', 'w').write(string.join(zeros, '\n'))
        self.printLog('#ZERO', '%d 6RF hits with 0 BLAST hits to MC58_1' % len(zeros))
        ufile = 'MC58_6RF_Zeros.vs.embl_bacteria.hitsum.tdt'
        if not os.path.exists(ufile):   # second GABLAM: zero-hit set vs EMBL bacteria
            seqlist = rje_seq.SeqList(self.log, self.cmd_list + ['seqin=6rf_zeros.acc', 'fasdb=MC58_6RF.fas', 'seqout=MC58_6RF_Zeros.fas', 'autoload=T', 'accnr=F', 'seqnr=F'])
            seqlist.info['Name'] = 'MC58_6RF_Zeros.fas'
            seqlist.saveFasta()
            gablam.GABLAM(self.log, self.cmd_list + ['seqin=MC58_6RF_Zeros.fas', 'searchdb=/scratch/Databases/NewDB/TaxaDB/embl_bacteria.fas', 'qryacc=F']).gablam()
        gdata = rje.dataDict(self, ufile, ['Qry'], getheaders=True)
        # gablam table gives ranked hits per query; first listed hit is taken as BestHit below.
        fdata = rje.dataDict(self, string.replace(ufile, 'hitsum', 'gablam'), ['Qry'], ['Hit'], lists=True)
        headers = gdata.pop('Headers')
        headers.insert(1, 'Sample')
        headers.append('BestHit')
        rje.delimitedFileOutput(self, 'MC58_6RF_Zeros.tdt', headers, rje_backup=True)
        for rf in rje.sortKeys(gdata):
            rfcut = string.split(rf, '__')[1]   # strip the '6rf_NEIME__' prefix back to the raw accession
            gdata[rf]['Sample'] = string.join(rfhits[rfcut], '; ')  # CSV provenance recorded in section [2]
            gdata[rf]['Qry'] = rfcut
            try: gdata[rf]['BestHit'] = fdata[rf]['Hit'][0]
            except: gdata[rf]['BestHit'] = '-'  # no hits at all
            rje.delimitedFileOutput(self, 'MC58_6RF_Zeros.tdt', headers, datadict=gdata[rf])
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        self.printLog('#ZEN', rje_zen.Zen().wisdom())