def makePPI(self):  ### Generates files for Human-HIV PPI analysis
    '''
    Generates files for Human-HIV PPI analysis.

    Loads the sequences named by the 'HIVSeq' setting and walks the 'AccHIV' index of the 'HHPIDMap' table. For each
    HIV accession a directory named after the HIV gene is made. For each indexed entry whose
    [PPIDir][Symbol].ppi.fas file exists, [GENE]/[gene].[Symbol].ppi.fas is written: the HIV sequence followed by the
    contents of the human PPI file. Missing PPI files are logged as errors and skipped.
    << False if no sequences were loaded or an exception was logged; otherwise None.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seqlist = rje_seq.SeqList(self.log, self.cmd_list + ['seqin=%s' % self.getStr('HIVSeq'), 'autoload=T'])
        if not seqlist.seqs():
            return False
        seqmap = seqlist.seqNameDic('Max')
        mdb = self.db('HHPIDMap')
        ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for hivacc in mdb.index('AccHIV'):
            # Map the HIV accession number (text before the first '.') on to its sequence
            accnum = hivacc.split('.')[0]
            hivseq = seqmap[accnum]
            # Short HIV gene name = text before the first '_' of the sequence short name
            hivgene = hivseq.shortName().split('_')[0].upper()
            # Create directory named after HIV gene
            rje.mkDir(self, '%s/' % hivgene, log=True)
            # Copy human PPI files into the directory, adding the HIV gene sequence first
            etot = len(mdb.index('AccHIV')[hivacc])
            for entry in mdb.indexEntries('AccHIV', hivacc):
                self.progLog('\r#PPI', 'Generating human-HIV PPI fasta files for %s %s PPI' % (rje.iStr(etot), hivgene))
                pfile = self.getStr('PPIDir') + entry['Symbol'] + '.ppi.fas'
                if not rje.exists(pfile):
                    self.errorLog('Cannot find human PPI file for %s interactor "%s"' % (entry['HIV'], entry['Symbol']),
                                  printerror=False)
                    continue
                outfile = '%s/%s.%s.ppi.fas' % (hivgene, hivgene.lower(), entry['Symbol'])
                # Context managers guarantee both handles are closed even if a read/write fails
                with open(outfile, 'w') as FAS:
                    FAS.write('>%s\n%s\n' % (hivseq.info['Name'], hivseq.getSequence()))
                    with open(pfile, 'r') as PPI:
                        FAS.write(PPI.read())
            # NOTE(review): relies on `entry` from the loop above; assumes every indexed accession has >= 1 entry.
            self.printLog('\r#PPI', 'Generated human-HIV PPI fasta files for %s %s (%s) PPI.' % (rje.iStr(etot), entry['HIV'], hivgene))
    except:
        self.errorLog('%s.makePPI error' % self)
        return False
def outputCards(self):  ### Outputs cards to delimited file
    '''
    Outputs cards to delimited file.

    Appends one row per selected entry of self.dict['GeneCard'] to the self.info['CardOut'] file, using the delimiter
    implied by the file extension. With both Purify and Restrict set, aliases in self.list['Genes'] are first swapped
    (in place) for their Symbol. Restrict limits output to genes in that list; Purify skips entries whose Symbol is
    neither the gene itself nor '!FAILED!'. Each output entry has its 'Alias' and 'Species' fields set.
    '''
    ### ~ Setup for output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    cards = self.dict['GeneCard']
    genelist = self.list['Genes']   # Same list object as self.list['Genes']: edits below are in place
    if self.opt['Purify'] and self.opt['Restrict']:
        for gene in genelist[0:]:   # Iterate over a copy because genelist is modified
            symbol = cards[gene]['Symbol']
            if symbol not in [gene, '!FAILED!']:    # Replace alias with symbol
                genelist.remove(gene)
                if symbol not in genelist:
                    genelist.append(symbol)
    delimit = rje.delimitFromExt(filename=self.info['CardOut'])
    (noens, noloci, ox) = (0, 0, 0)
    CARDOUT = open(self.info['CardOut'], 'a')
    ### ~ Generate output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    try:
        for gene in rje.sortKeys(cards):
            card = cards[gene]
            if self.opt['Restrict'] and gene not in genelist:
                continue
            if self.opt['Purify'] and card['Symbol'] not in [gene, '!FAILED!']:
                continue
            self.progLog('\r#OUT', 'Output for %s parsed genes' % rje.iStr(ox))
            ox += 1
            card['Alias'] = gene
            card['Species'] = self.info['Species']
            rje.delimitedFileOutput(self, CARDOUT, self.list['Headers'], delimit, card)
            if card['Symbol'] == gene:  # Not an alias: count missing EnsEMBL cross-references
                if 'EnsEMBL' not in card or not card['EnsEMBL']:
                    noens += 1
                if 'EnsLoci' not in card or not card['EnsLoci']:
                    noloci += 1
    finally:
        CARDOUT.close()     # Always release the handle, even if an output row fails
    self.printLog('\r#OUT', 'Parsed info for %d genes output to %s' % (len(self.list['Genes']), self.info['CardOut']))
    self.printLog('#ENS', '%s without EnsGene; %s without EnsLoci' % (rje.integerString(noens), rje.integerString(noloci)))
def emptyToBlank(self):     ### Replace empty values with 'blank' values
    '''Swaps every empty-string value in the 'TimePoints' table for the text 'blank' and logs how many were changed.'''
    timedb = self.db('TimePoints')
    replaced = 0
    for row in timedb.entries():
        for column in timedb.fields():
            if row[column] != '':
                continue
            row[column] = 'blank'
            replaced += 1
    self.printLog('#DB','%s empty values represented with blank values' % rje.iStr(replaced))
def setup(self):    ### Main class setup method.
    '''
    Main class setup method: normalises the 'RestIn' setting.

    If RestIn starts with 'http:' it is split on '&'. Any opt=file:FILE element whose FILE exists is replaced with
    opt=CONTENT, where CONTENT is the file's lines joined with a literal backslash-n, chomped, and with any '&'
    converted to '+'. A missing FILE triggers a warning with quit choice. Otherwise RestIn is treated as a file
    path and passed through rje.makePath().
    << True if setup completed; False if an exception was logged.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        restin = self.getStr('RestIn')
        ## ~ [1a] Check and modify URL if required ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if restin.startswith('http:'):
            #!# Check for rest URL and add if missing
            restcmd = restin.split('&')
            for i, cmd in enumerate(restcmd):
                if '=' not in cmd:
                    continue
                (opt, value) = cmd.split('=', 1)
                if not value.startswith('file:'):
                    continue
                # Conversion of cmd=file:FILE into cmd=CONTENT
                rfile = value.split(':', 1)[1]
                #!# Consider adding max size constraint. Probably a URL size limit.
                if not rje.exists(rfile):
                    self.warnLog('File "%s" not found.' % rfile, quitchoice=True)
                    continue
                with open(rfile, 'r') as RFILE:     # Handle closed as soon as the content is read
                    content = rje.chomp('\\n'.join(RFILE.readlines()))
                newcmd = '%s=%s' % (opt, content)
                if '&' in newcmd:   # '&' would be read as an argument separator in the URL
                    self.warnLog('%s "&" => "+" conversions for %s.' % (rje.iStr(newcmd.count('&')), rfile))
                    newcmd = newcmd.replace('&', '+')
                restcmd[i] = newcmd
            self.setStr({'RestIn': '&'.join(restcmd)})
        ## ~ [1b] Direct Parsing of output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        else:   # Convert to file
            self.setStr({'RestIn': rje.makePath(restin, True)})
        return True     # Setup successful
    except:
        self.errorLog('Problem during %s setup.' % self)
        return False    # Setup failed
def setup(self):    ### Main class setup method.
    '''
    Main class setup method.

    Sets a default Basefile from the SeqList name if none is given, then builds three database tables:
    'Code' (codon to amino acid, indexed on AA), 'Codons' (per-sequence codon counts plus Len; saved to file) and
    'NT' (per-sequence C/A/G/U counts overall and per codon position; saved to file).
    << False if an exception was logged; otherwise None.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seqlist = self.obj['SeqList']
        if self.getStr('Basefile').lower() in ['','none']:
            self.str['Basefile'] = rje.baseFile(seqlist.getStr('Name'))
            self.obj['DB'].setInfo({'Basefile':self.str['Basefile']})
        ## ~ [1a] Genetic Code ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        codedb = self.db().addEmptyTable('Code',['Codon','AA'],['Codon'])
        for triplet in rje_sequence.genetic_code:
            codedb.addEntry({'Codon':triplet,'AA':rje_sequence.genetic_code[triplet]})
        codedb.index('AA')
        ### ~ [2] Calculate Codon Tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        codons = rje.sortKeys(rje_sequence.genetic_code)
        codondb = self.db().addEmptyTable('Codons',['Seq','Len']+codons,['Seq'])
        progress = 0.0
        seqx = seqlist.seqNum()
        for seq in seqlist.seqs():
            self.progLog('\r#COD','Calculating codon usage: %.2f%%' % (progress/seqx))
            progress += 100.0
            usage = rje_sequence.codons(seq.getSequence(),{})
            usage['Len'] = sum(usage.values())  # Total codons counted, added before the 'Seq' key
            usage['Seq'] = seq.getStr('AccNum')
            codondb.addEntry(usage)
        self.printLog('\r#COD','Codon usage calculated for %s sequences' % rje.iStr(seqx))
        codondb.fillBlanks(blank=0,fillempty=True)
        codondb.saveToFile()
        ### ~ [3] Calculate NT Count Tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        bases = ['C','A','G','U']
        poskeys = ['%s|%d' % (base,pos) for pos in [1,2,3] for base in bases]  # e.g. 'C|1' = C at codon position 1
        ntdb = self.db().addEmptyTable('NT',['Seq','Len']+bases+poskeys,['Seq'])
        progress = 0.0
        seqx = seqlist.seqNum()
        for seq in seqlist.seqs():
            self.progLog('\r#NT','Calculating NT Counts: %.2f%%' % (progress/seqx))
            progress += 100.0
            rna = string.replace(seq.getSequence(),'T','U')
            ntcount = rje_sequence.aaFreq(rna,{'C':0,'A':0,'G':0,'U':0},False)
            ntcount['Len'] = sum(ntcount.values())
            ntcount['Seq'] = seq.getStr('AccNum')
            codoncount = codondb.data(ntcount['Seq'])
            for poskey in poskeys:
                ntcount[poskey] = 0
            # Positional counts are derived from the codon table rather than recounted from sequence
            for triplet in codons:
                for pos in [1,2,3]:
                    ntcount['%s|%d' % (triplet[pos-1],pos)] += codoncount[triplet]
            ntdb.addEntry(ntcount)
        self.printLog('\r#NT','NT Counts calculated for %s sequences' % rje.iStr(seqx))
        ntdb.saveToFile()
    except:
        self.errorLog('Problem during %s setup.' % self)
        return False    # Setup failed
def depthChargeForker(self):  ### Main DepthCharge forking method
    '''
    Work through each sequence and fork it out for DepthCharge analysis.

    Results are read from and written to [basefile].depthcharge.tdt. With force, any existing results file is
    backed up first. Without force, sequences whose short name is listed under type 'all' in an existing results
    file are skipped; if nothing is left to fork, the existing table is returned directly.
    << the results table loaded from the results file, or None if an exception was logged.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seqin = self.seqinObj()
        self.list['ToFork'] = seqin.list['Seq'][0:]
        resfile = '{0}.depthcharge.tdt'.format(self.baseFile())
        if self.force():
            # The calling object goes first, matching the other rje.backup()/rje.mkDir() calls in this file
            rje.backup(self, resfile, appendable=False)
        elif rje.exists(resfile):
            ddb = self.db().addTable(resfile, ['seqname', 'start', 'end', 'type'])
            ddb.dataFormat({'start': 'int', 'end': 'int'})
            complete = ddb.indexDataList('type', 'all', 'seqname')
            if complete:
                done = set(complete)    # Set lookup avoids a list scan per sequence
                todo = [seq for seq in self.list['ToFork'] if seqin.shortName(seq) not in done]
                cx = len(self.list['ToFork']) - len(todo)
                self.list['ToFork'] = todo
                if cx:
                    self.printLog('#SKIP', 'Skipping {0} previously processed sequences (force=F)'.format(rje.iStr(cx)))
                if not self.list['ToFork']:
                    self.printLog('#CHARGE', 'All sequences previously processed (force=F)')
                    return ddb
        # Start forks until the 'Forks' limit is reached or nothing is left to fork
        while len(self.list['Forked']) < self.getNum('Forks') and self.list['ToFork']:
            self.nextFork()
        ### ~ [2] ~ Work through each sequence and fork out ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.forking()
        self.printLog('#FORK', 'Forking of %s jobs completed.' % (rje.iStr(seqin.seqNum())), log=self.getBool('LogFork'))
        ddb = self.db().addTable(resfile, ['seqname', 'start', 'end', 'type'], replace=True)
        ddb.dataFormat({'start': 'int', 'end': 'int'})
        return ddb
    except:
        self.errorLog('%s.depthChargeForker error' % self.prog())
def emptyToBlank(self):  ### Replace empty values with 'blank' values
    '''Replaces each empty-string field value in the 'TimePoints' table with 'blank', then logs the number replaced.'''
    table = self.db('TimePoints')
    blanked = 0
    for record in table.entries():
        for fieldname in table.fields():
            if record[fieldname] != '':
                continue    # Only exact empty strings are replaced
            record[fieldname] = 'blank'
            blanked += 1
    message = '%s empty values represented with blank values' % rje.iStr(blanked)
    self.printLog('#DB', message)
def run(self):  ### Main run method
    '''
    Main run method: runs setup() then forking(), logging any exception.
    << False (after a warning) if self.list['Forked'] is still populated afterwards; otherwise True.
    '''
    try:
        ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        jobx = len(self.list['Forked'])     # Counted before setup() is called
        self.setup()
        ### ~ [2] ~ Add main run code here ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.forking()
        self.printLog('#FORK','Forking of %s jobs completed.' % (rje.iStr(jobx)))
    except:
        self.errorLog('Forker.run() Error')
    if not self.list['Forked']:
        return True
    self.warnLog('%s fork jobs remain unforked.' % rje.iLen(self.list['Forked']))
    return False
def pileUpFDR(self):    ### Calculates statistics of genetic differences from parsed PileUp Tables
    '''
    Calculates statistics of genetic differences from parsed PileUp Tables.

    Reads [basefile].pdiff.tdt, taking the last whitespace-separated value of each line as a p-value, and records
    the file offset of every line with p <= 0.05. Reading stops at the first line whose last value is not a
    number. Lines are then written to [basefile].fdr.tdt in ascending p-value order with an extra 'p.FDR' column,
    where FDR = p * (number of non-'?' RefSeq positions) / (number of lines with p-value <= p).
    Nothing is done if the output exists and force is off.
    << None in all cases (exceptions are logged).
    '''
    try:
        ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        fdrfile = '%s.fdr.tdt' % self.baseFile()
        if not self.force() and os.path.exists(fdrfile):
            return
        sigpval = {}    # {pval:[file positions of lines with that pval]}
        npos = 0        # Number of tests = reference positions that are not '?'
        for locus in rje.sortKeys(self.dict['RefSeq']):
            refseq = self.dict['RefSeq'][locus]
            npos += len(refseq) - refseq.count('?')
        SAMSIG = open('%s.pdiff.tdt' % self.baseFile(), 'r')
        try:
            ### ~ [1] Parse out stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            headers = SAMSIG.readline().split() + ['p.FDR']
            px = 0
            # readline()/tell() are used (not file iteration) so offsets stay valid for seek() later
            fpos = SAMSIG.tell()
            fline = SAMSIG.readline()
            while fline:
                self.progLog('\r#SIG', 'Reading Pvalues: %s p <= 0.05...' % rje.iStr(px))
                try:
                    pval = float(fline.split()[-1])
                except (ValueError, IndexError):    # Blank or non-numeric line: stop reading
                    break
                if pval <= 0.05:
                    if pval not in sigpval:
                        sigpval[pval] = []
                    sigpval[pval].append(fpos)
                    px += 1
                fpos = SAMSIG.tell()
                fline = SAMSIG.readline()
            self.printLog('\r#SIG', 'Reading Pvalues complete: %s p <= 0.05.' % rje.iStr(px))
            ### ~ [2] Calculate FDR and output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            SAMFDR = open(fdrfile, 'w')
            try:
                rje.writeDelimit(SAMFDR, headers)
                px = 0      # Now the cumulative number of lines with p-value <= current pval
                sx = 0.0
                stot = len(sigpval)
                for pval in rje.sortKeys(sigpval):
                    self.progLog('\r#FDR', 'Calculating FDR: %.2f%%' % (sx/stot))
                    sx += 100.0
                    px += len(sigpval[pval])
                    if pval:
                        fdr = (pval * npos) / px
                    else:
                        fdr = 0.0
                    for fpos in sigpval[pval]:
                        SAMSIG.seek(fpos)
                        rje.writeDelimit(SAMFDR, rje.readDelimit(SAMSIG.readline()) + [rje.expectString(fdr)])
            finally:
                SAMFDR.close()
        finally:
            SAMSIG.close()      # Both handles are released even if parsing or output fails
        self.printLog('\r#FDR', '%s FDR lines output to %s' % (rje.iStr(px), fdrfile))
    except:
        self.errorLog('%s.pileUpFDR() error' % (self))
        return None
def saveReadMe(self, filename='pydocs.txt', append=False):  ### Prints docs for modules to file
    '''
    Prints docs for modules to file.
    >> filename:str = output file name
    >> append:boolean = whether to append to filename; if False the file is (re)written, starting with readMeHeader()

    For each source directory in the PyDoc object's 'SourceDir' list a '-DIR:' heading is written, followed by a
    banner and the DocString of every 'Module' table entry whose key contains that directory name and whose
    [PyPath][sourcedir][module].py file exists. Exceptions are logged rather than raised.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        pydoc = self.obj['PyDoc']
        if append:
            self.printLog('#DOC', 'Appending docstrings to %s' % filename)
            PYDOC = open(filename, 'a')
        else:
            rje.mkDir(self, filename)
            self.printLog('#DOC', 'Writing docstrings to %s' % filename)
            PYDOC = open(filename, 'w')
        try:
            if not append:
                PYDOC.write(self.readMeHeader())
            db = self.db('Module')
            dx = 0
            ### ~ [2] Output Docstrings ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for sourcedir in pydoc.list['SourceDir']:
                PYDOC.write('-%s:\n\n' % sourcedir)
                for pyfile in db.dataKeys():
                    entry = db.data(pyfile)
                    module = entry['Module']
                    modpath = '%s%s%s.py' % (pydoc.getStr('PyPath'), rje.makePath(sourcedir), module)
                    if sourcedir not in pyfile or not os.path.exists(modpath):
                        continue
                    ## ~ [2a] ~ Module docstring ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    mtxt = '### ~~~ Module %s ~ [%s] ~~~ ###' % (module, pyfile)
                    while len(mtxt) < 122:  # Pad the banner symmetrically with '~' to at least 122 characters
                        mtxt = mtxt[:5] + '~' + mtxt[5:-5] + '~' + mtxt[-5:]
                    try:
                        PYDOC.write('%s\n\n%s\n' % (mtxt, entry['DocString']))
                    except:
                        self.errorLog('Cannot write DocString for %s' % module, printerror=False)
                        PYDOC.write('%s\n\nDocString Error!\n' % (mtxt))
                    dx += 1     # Counted whether or not the docstring itself could be written
                PYDOC.write('\n\n\n')
        finally:
            PYDOC.close()   # Release the handle even if output fails part-way
        self.printLog('#DOC', 'Output to %s complete: %s modules.' % (filename, rje.iStr(dx)))
    except:
        self.errorLog('Error in %s.saveReadMe()' % self.prog())
def run(self):  ### Main run method
    '''
    Main run method. Calls setup() and forking(); any exception is logged, not raised.
    << True unless entries remain in self.list['Forked'], in which case a warning is issued and False returned.
    '''
    try:
        ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        initial = len(self.list['Forked'])
        self.setup()
        ### ~ [2] ~ Add main run code here ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.forking()
        summary = 'Forking of %s jobs completed.' % (rje.iStr(initial))
        self.printLog('#FORK', summary)
    except:
        self.errorLog('Forker.run() Error')
    finished = not self.list['Forked']
    if not finished:
        self.warnLog('%s fork jobs remain unforked.' % rje.iLen(self.list['Forked']))
    return finished
def batchRun(self,returnobj=False):     ### Execute batch mode runs
    '''
    Execute batch mode runs: calls self.run() once per file in self.list['BatchRun'], adding [BatchArg]=FILE (plus a
    basefile=, log= or newlog=F argument depending on BatchBase/BatchLog) to the original command list each time.
    >> returnobj:bool [False] = whether to keep what each successful run() returned (else it is replaced by True)
    << list with one result per batch file (a falsy value marks a failed run), or False if an exception was logged.
    '''
    try:
        ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        barg = self.getStrLC('BatchArg')
        if not barg:
            raise ValueError('Cannot use batchrun=FILELIST if batcharg=None.')
        batchfiles = self.list['BatchRun'][0:]
        self.list['BatchRun'] = []      # Avoid recursive running!
        blog = self.getStr('BatchLog')
        if not blog.startswith('.'):
            blog = '.%s' % blog
        if not blog.endswith('.log'):
            blog = '%s.log' % blog
        rawcmd = self.cmd_list[0:]
        rawlog = self.log
        batchobj = []
        ### ~ [1] Batch Run ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for bx, bfile in enumerate(batchfiles):
            self.printLog('#BATCH','Batch running %s of %s: %s=%s' % (rje.iStr(bx + 1),rje.iLen(batchfiles),barg,bfile))
            ## Setup parameters
            bbase = rje.baseFile(bfile,strip_path=True)
            bcmd = ['%s=%s' % (barg,bfile)]
            if self.getBool('BatchBase'):
                if blog == '.log':
                    bcmd += ['basefile=%s' % bbase]
                else:
                    bcmd += ['basefile=%s%s' % (bbase,rje.baseFile(blog))]
            elif self.getStrLC('BatchLog'):
                bcmd += ['log=%s%s' % (bbase,blog)]
            else:
                bcmd += ['newlog=F']
            ## Setup object with its own log for this batch file
            self.cmd_list = rawcmd + bcmd
            self.log = rje.setLog(self.log.obj['Info'],self,self.cmd_list)     # Sets up Log object for controlling log file output
            ## Run, always switching back to the main log afterwards (even if run() raises)
            try:
                runobj = self.run()
            finally:
                self.log = rawlog
            ## Finish and Tidy
            if runobj:
                if returnobj:
                    batchobj.append(runobj)
                else:
                    batchobj.append(True)
                info = runobj.log.obj['Info']
                self.printLog('#RUN','%s V%s run finished.' % (info.program,info.version))
            else:
                batchobj.append(runobj)
                self.warnLog('Batch run failed (%s=%s).' % (barg,bfile))
        ### ~ [2] Finish and Return ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Count every falsy result, so runs returning None are reported as failed just as they were warned above
        failx = len([result for result in batchobj if not result])
        self.printLog('#BATCH','%s batch runs complete: %s failed.' % (rje.iLen(batchfiles),rje.iStr(failx)))
        self.list['BatchRun'] = batchfiles
        return batchobj
    except:
        self.errorLog('%s.batchRun error' % self)
        return False
def expectedCodonUsage(self):   ### Calculate expected codon usage from Frequency data
    '''
    Calculate expected codon usage from Frequency data.

    Builds an 'NTPos' table from the 'NT' table (reshaped long on 'Pos', compressed by summing and converted to
    frequencies with rje.dictFreq), a 'Freq' table of per-sequence codon and nucleotide frequencies, and an
    'Expected' table (copied from 'Codons') whose codon values are overwritten with expected counts. Expected
    values use the product of the three base frequencies from the NTPos entry keyed '3', rescaled within each amino
    acid to that amino acid's observed total. NTPos, Freq and Expected are saved to file. Exceptions are logged.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        aacode = self.db('Code').index('AA')    # {AA:[codons]}
        bases = ['C','A','G','U']
        codons = rje.sortKeys(rje_sequence.genetic_code)
        codondb = self.db('Codons')
        ntdb = self.db('NT')
        posdb = self.db().copyTable(ntdb,'NTPos',replace=True)
        posdb.dropField('Len')
        for base in ['C','A','G','U']:
            posdb.renameField(base,'%s|All' % base)
        posdb.reshapeLong('Pos',reshape=['C','A','G','U'])
        posdb.compress(['Pos'],{'Pos':'str','Seq':'str'},default='sum')
        posdb.dropField('Seq')
        posdb.addField('Total')
        for posentry in posdb.entries():
            # Non-count fields are removed so that dictFreq() only sees the base counts
            pos = posentry.pop('Pos')
            posentry.pop('Total')
            rje.dictFreq(posentry)
            posentry['Pos'] = pos
        posdb.saveToFile()
        basefreq = posdb.data('3')
        freqdb = self.db().addEmptyTable('Freq',['Seq','Len']+bases+codons+['Total'],['Seq'])
        expdb = self.db().copyTable(codondb,'Expected',replace=True)
        ### ~ [2] Calculate Frequencies ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        progress = 0.0
        etot = codondb.entryNum()
        for counts in codondb.entries():
            self.progLog('\r#FREQ','Calculating Frequencies: %.2f%%' % (progress/etot))
            progress += 100.0
            freqs = rje.combineDict({},counts)
            seqname = freqs['Seq']
            freqs['Total'] = freqs.pop('Len')
            expected = expdb.data(seqname)
            ntfreqs = rje.combineDict({},ntdb.data()[seqname])
            ntfreqs.pop('Seq')
            ntfreqs.pop('Len')
            rje.dictFreq(ntfreqs)
            ntfreqs['Len'] = ntfreqs.pop('Total')
            for aa in aacode:
                synonyms = aacode[aa]
                observed = 0.0  # Observed codon count for this amino acid
                exptotal = 0.0  # Sum of raw expected codon probabilities for this amino acid
                for triplet in synonyms:
                    observed += freqs[triplet]
                    expected[triplet] = basefreq[triplet[0]] * basefreq[triplet[1]] * basefreq[triplet[2]]
                    exptotal += expected[triplet]
                for triplet in synonyms:
                    if observed:
                        freqs[triplet] = len(synonyms) * freqs[triplet] / observed
                    else:
                        freqs[triplet] = 0.0
                    expected[triplet] = observed * (expected[triplet] / exptotal)
            freqdb.addEntry(rje.combineDict(freqs,ntfreqs))
        self.printLog('\r#Freq','Frequencies calculated for %s entries' % rje.iStr(etot))
        freqdb.saveToFile()
        expdb.saveToFile()
    except:
        self.errorLog('%s.expectedCodonUsage error' % self)
def tidyMotifNames(self, dbtable):  ### Tidy the motif names in given dbtable
    '''
    Tidy the motif names in given dbtable.
    >> dbtable = database table object; left untouched if it has no 'motif' field
    << number of entries whose 'motif' value was replaced by the SLiMList slimCoreName() of it (0 if no such field).
    Table keys are remade if anything changed. Exceptions are logged and re-raised.
    '''
    try:
        ### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        slimlist = self.obj['SLiMList']
        if 'motif' not in dbtable.fields():
            return 0
        renamed = 0
        for row in dbtable.entries():
            corename = slimlist.slimCoreName(row['motif'])
            if corename == row['motif']:
                continue
            row['motif'] = corename
            renamed += 1
        self.printLog('#MOTIF', '%s motif names corrected for SLiMList splitting.' % rje.iStr(renamed))
        if renamed:
            dbtable.remakeKeys()
        return renamed
    except:
        self.errorLog('Problem during %s tidyMotifNames.' % self.prog())
        raise
def run(self,save=True):    ### Main run method
    '''
    Main run method.
    >> save:bool [True] = whether to compress the pairwise table to unique Hub/Spoke/TaxID combinations, save it to
       file and report its Evidence and IType indexes
    << the 'pairwise' table after parseMITAB(), or False if setup() failed. Exceptions are logged and re-raised.
    '''
    try:
        ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not self.setup():
            return False
        ### ~ [2] ~ Add main run code here ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.parseMITAB()
        ppidb = self.db('pairwise')
        if not save:
            return ppidb
        unique = ['Hub','Spoke','HubTaxID','SpokeTaxID']
        ppidb.compress(unique,rules={'Evidence':'list','IType':'list'},joinchar='|')
        ppidb.dropField('#')
        summary = '%s unique pairwise PPI (Symmetry=%s)' % (rje.iStr(ppidb.entryNum()),self.getBool('Symmetry'))
        self.printLog('#PPI',summary)
        ppidb.saveToFile()
        # Values are split on the same '|' used as joinchar above when indexing
        ppidb.index('Evidence',splitchar='|')
        ppidb.indexReport('Evidence','#METHOD')
        ppidb.index('IType',splitchar='|')
        ppidb.indexReport('IType','#ITYPE')
        return ppidb
    except:
        self.errorLog(self.zen())
        raise   # Delete this if method error not terrible
def setup(self):    ### Main class setup method.
    '''
    Main class setup method.

    Creates the Database object, sets the 'Accuracy' list to [0, 1 - ErrPerBase], resolves SMRTUnits to one of
    reads/Gb/Mb (suggesting units from the size of SMRTReads when the setting is not recognised), converts a Gb/Mb
    SMRTReads value to a number of reads using AvRead, and replaces self.list['XnList'] with a sorted list of
    integers, dropping blanks, 0 and 1. Non-integer XnList values are logged as errors and dropped.
    << True if setup completed; False if an exception was logged.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.obj['DB'] = rje_db.Database(self.log,self.cmd_list)
        self.db().basefile(self.basefile())
        self.list['Accuracy'] = [0,1.0 - self.getNum('ErrPerBase')]
        ## ~ [1a] SMRTReads ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        while self.getStrLC('SMRTUnits') not in ['reads','gb','mb']:
            # Fill in the offending value now, before SMRTUnits is changed below
            txt = 'SMRTUnits "%s" not recognised' % self.getStr('SMRTUnits')
            # Guess the intended units from the magnitude of SMRTReads
            if self.getNum('SMRTReads') < 10:
                smrtunits = 'Gb'
            elif self.getNum('SMRTReads') > 10000:
                smrtunits = 'reads'
            else:
                smrtunits = 'Mb'
            if self.i() < 0 or rje.yesNo('%s: switch to (%s) %s?' % (txt,self.getNum('SMRTReads'),smrtunits)):
                self.setStr({'SMRTUnits':smrtunits})
            elif self.i() > 0:
                self.setStr({'SMRTUnits':rje.choice('SMRTUnits (reads/Gb/Mb)?')})
            self.printLog('#UNITS','%s => %s' % (txt,self.getStr('SMRTUnits')))
        if self.getStrLC('SMRTUnits') in ['gb','mb']:
            smrttotal = self.getNum('SMRTReads') * {'gb':1e9,'mb':1e6}[self.getStrLC('SMRTUnits')]
            txt = '%s %s @ %.3f kb/read' % (self.getNum('SMRTReads'),self.getStr('SMRTUnits'),self.getNum('AvRead')/1000.0)
            self.setNum({'SMRTReads':smrttotal/self.getNum('AvRead')})
            txt += ' => %s reads' % rje.iStr(int(self.getNum('SMRTReads')))
            self.printLog('#READS',txt)
        ## ~ [1b] XnList ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        xnlist = []
        for xn in self.list['XnList']:
            if xn == '':
                continue
            try:
                ixn = int(xn)
                if xn not in [ixn,'%d' % ixn]:  # Report values that changed on conversion to integer
                    self.printLog('#XN','"%s" -> %dX' % (xn,ixn))
                if ixn == 0:
                    self.printLog('#XN','No point in 0X output: use 1-%Coverage.')
                elif ixn == 1:
                    self.printLog('#XN','No point in 1X output: use %Coverage.')
                else:
                    xnlist.append(ixn)
            except:
                self.errorLog('Could not process %s as part of XnList. (Integers only.)' % xn)
        xnlist.sort()
        if xnlist:
            # Turns the list's text form '[a, b]' into 'aX,  b' for the 'XnList: ...X.' message
            self.printLog('#XN','XnList: %sX.' % 'X, '.join(('%s' % xnlist).split(','))[1:-1])
        self.list['XnList'] = xnlist
        return True     # Setup successful
    except:
        self.errorLog('Problem during %s setup.' % self.prog())
        return False    # Setup failed
def saveReadMe(self,filename='pydocs.txt',append=False):    ### Prints docs for modules to file
    '''
    Prints docs for modules to file.
    >> filename:str = output file name
    >> append:boolean = whether to append to filename; if False the file is (re)written, starting with readMeHeader()

    For each source directory in the PyDoc object's 'SourceDir' list a '-DIR:' heading is written, followed by a
    banner and the DocString of every 'Module' table entry whose key contains that directory name and whose
    [PyPath][sourcedir][module].py file exists. Exceptions are logged rather than raised.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        pydoc = self.obj['PyDoc']
        if append:
            self.printLog('#DOC','Appending docstrings to %s' % filename)
            PYDOC = open(filename,'a')
        else:
            rje.mkDir(self,filename)
            self.printLog('#DOC','Writing docstrings to %s' % filename)
            PYDOC = open(filename,'w')
        try:
            if not append:
                PYDOC.write(self.readMeHeader())
            db = self.db('Module')
            dx = 0
            ### ~ [2] Output Docstrings ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for sourcedir in pydoc.list['SourceDir']:
                PYDOC.write('-%s:\n\n' % sourcedir)
                for pyfile in db.dataKeys():
                    entry = db.data(pyfile)
                    module = entry['Module']
                    modpath = '%s%s%s.py' % (pydoc.getStr('PyPath'),rje.makePath(sourcedir),module)
                    if sourcedir not in pyfile or not os.path.exists(modpath):
                        continue
                    ## ~ [2a] ~ Module docstring ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    mtxt = '### ~~~ Module %s ~ [%s] ~~~ ###' % (module,pyfile)
                    while len(mtxt) < 122:  # Pad the banner symmetrically with '~' to at least 122 characters
                        mtxt = mtxt[:5] + '~' + mtxt[5:-5] + '~' + mtxt[-5:]
                    try:
                        PYDOC.write('%s\n\n%s\n' % (mtxt,entry['DocString']))
                    except:
                        self.errorLog('Cannot write DocString for %s' % module,printerror=False)
                        PYDOC.write('%s\n\nDocString Error!\n' % (mtxt))
                    dx += 1     # Counted whether or not the docstring itself could be written
                PYDOC.write('\n\n\n')
        finally:
            PYDOC.close()   # Release the handle even if output fails part-way
        self.printLog('#DOC','Output to %s complete: %s modules.' % (filename,rje.iStr(dx)))
    except:
        self.errorLog('Error in %s.saveReadMe()' % self.prog())
def inSilicoHybrid(self):  ### Filter and combine subreads from parent and output to fasta file.
    '''
    Filter and combine subreads from parent and output to fasta file.

    This module generates balanced "in silico diploid" PacBio subread data from two sequenced haploid parents. Each
    parent must first be run through SMRTSCAPE to generate subread summary data. (This will be performed if missing.
    Each parent needs a `*.fofn` file of subread file names, `*.unique.tdt` unique subreads table and `*.smrt.tdt`
    SMRT cell identifier table.)

    A new set of subreads is then generated from the combined set of parent subreads. This is done by first ranking
    the unique subreads from each parent by length. First, the longest subread from each parent are compared and the
    shortest selected to be the first subread of the diploid. (The shortest is taken to minimise length differences
    between the two parents.) Next, the longest subread from the next parent that is no longer than the previous
    subread is added. This cycles, picking a read from the parent with fewest cumulative bases each cycle. The
    longest subread that is no longer than the previous subread is selected. This continues until one parent runs
    out of subreads. Additional subreads will be added from the other parent if they reduce the difference in
    cumulative output for each parent.

    Final output will be a `*.subreads.fasta` file in which each parent has a similar total sequence content and for
    which the subread length distributions should also be similar. This is to overcome biases in resulting diploid
    assemblies, where one parent has higher quality data than the other.

    NOTE: If performing downstream filtering by Read Quality (RQ), this might reintroduce a bias if one parent has
    much higher RQ values than the other. The `rqfilter=X` setting can therefore be used to restrict output to reads
    with a minimum RQ value. By default this is 0.84. If you do not get enough sequence output, this setting may
    need to be relaxed.

    << True if the filtered fasta file was generated and summarised; False if an exception was logged.
    '''
    try:
        ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [0a] Parent 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.printLog('#~~#', '# ~~~~~~~~~~~~~~~~~~~~ SETUP PARENT 1 ~~~~~~~~~~~~~~~~~~~~ #')
        self.printLog('#FOFN', 'Parent1: %s' % self.getStr('Parent1'))
        base1 = rje.baseFile(self.getStr('Parent1'))
        parent1 = smrtscape.SMRTSCAPE(self.log, ['genomesize=13.1e6'] + self.cmd_list + ['batch=%s' % self.getStr('Parent1'), 'basefile=%s' % base1])
        parent1.setup()
        udb1 = parent1.udb()
        cdb = parent1.db('smrt', add=True, mainkeys=['Name'])
        cdb.dataFormat({'SMRT': 'int'})
        cx = cdb.entryNum()
        ## ~ [0b] Parent 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.printLog('#~~#', '# ~~~~~~~~~~~~~~~~~~~~ SETUP PARENT 2 ~~~~~~~~~~~~~~~~~~~~ #')
        self.printLog('#FOFN', 'Parent2: %s' % self.getStr('Parent2'))
        base2 = rje.baseFile(self.getStr('Parent2'))
        parent2 = smrtscape.SMRTSCAPE(self.log, ['genomesize=13.1e6'] + self.cmd_list + ['batch=%s' % self.getStr('Parent2'), 'basefile=%s' % base2])
        parent2.setup()
        udb2 = parent2.udb()
        cdb2 = parent2.db('smrt', add=True, mainkeys=['Name'])
        cdb2.dataFormat({'SMRT': 'int'})
        # Shift all of the Parent2 SMRT IDs to avoid conflict with Parent1
        for entry in cdb2.entries() + udb2.entries():
            entry['SMRT'] = entry['SMRT'] + cx
        cdb = parent1.db().mergeTables(cdb, cdb2)
        ## ~ [0c] Output Sequence File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.printLog('#~~#', '# ~~~~~~~~~~~~~~~~~~~~ DIPLOIDOCUS SUBREADS ~~~~~~~~~~~~~~~~~~~~ #')
        minlen = self.getInt('LenFilter')
        minrq = self.getNum('RQFilter')
        rqstr = '%s' % minrq
        filtfile = '%s.L%sRQ%s.fasta' % (self.baseFile(), minlen, rqstr[2:])    # rqstr[2:] drops the leading '0.'
        ## ~ [0d] Input Sequence Files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        seqbatch = []   # List of SeqList objects
        seqfiles = parent1.list['Batch'] + parent2.list['Batch']
        self.printLog('#BATCH', '%s sequence files to process.' % rje.iLen(seqfiles))
        for seqfile in seqfiles:
            seqcmd = self.cmd_list + ['seqmode=file', 'autoload=T', 'summarise=F', 'seqin=%s' % seqfile, 'autofilter=F']
            seqbatch.append(rje_seqlist.SeqList(self.log, seqcmd))
        self.printLog('#BATCH', '%s sequence files to summarise.' % rje.iLen(seqbatch))
        if not seqbatch:
            raise IOError('No batch input fasta files found! Make sure parentN=FILE settings given *.fofn.')
        ## ~ [0e] Setup subread lists ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        elists = [udb1.sortedEntries('Len', reverse=True), udb2.sortedEntries('Len', reverse=True)]
        plen = [0, 0]   # Summed lengths for each parent
        pseq = [0, 0]   # Total sequence number for each parent
        prq = [0, 0]    # Total sequence RQ for each parent (convert to mean)
        if not elists[0] or not elists[1]:
            raise ValueError('No Unique ZMW subreads for one or both parents!')
        lastlen = max(elists[0][0]['Len'], elists[1][0]['Len'])     # Length of last selected read
        for elist in elists:    # Drop leading subreads failing the RQ filter
            while elist and elist[0]['RQ'] < minrq:
                elist.pop(0)
        if not elists[0] or not elists[1]:
            raise ValueError('No Unique ZMW subreads for one or both parents!')
        nextp = 0   # Index of next parent to use: start with the parent whose longest read is shortest
        if elists[0][0]['Len'] < elists[1][0]['Len']:
            nextp = 1
        ### ~ [1] Filter and Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [1a] Filter Unique Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        zmwlist = []    # List of (smrt,zmw,pos) meeting filtering criteria
        ux = 0.0
        utot = len(elists[0]) + len(elists[1])
        while lastlen:
            self.progLog('\r#DIP', 'Diploidising subreads: %.2f%%' % (ux / utot))
            elist = elists[nextp]
            while elist and elist[0]['RQ'] < minrq:
                elist.pop(0)
                ux += 100.0
            if elist and elist[0]['Len'] < minlen:  # Sorted by length, so nothing usable remains for this parent
                ux += 100.0 * len(elist)
                elist = []
            if not elist:
                nextp = 1 - nextp
                break   # Finish: switch to the other parent for the final top-up
            entry = elist.pop(0)
            ux += 100.0
            zmwlist.append((entry['SMRT'], entry['ZMW'], entry['Pos']))
            plen[nextp] += entry['Len']
            prq[nextp] += entry['RQ']
            pseq[nextp] += 1
            if plen[1 - nextp] <= plen[nextp]:  # Next read comes from the parent with fewest cumulative bases
                nextp = 1 - nextp
            lastlen = entry['Len']
        ## ~ [1b] Final processing of last reads ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # Top up from the remaining parent only while each read reduces the difference in cumulative length.
        # Each read is checked against the RQ filter, and the loop stops at the first read shorter than minlen
        # (all later reads are shorter still), so it always terminates.
        elist = elists[nextp]
        while elist:
            self.progLog('\r#DIP', 'Diploidising subreads: %.2f%%' % (ux / utot))
            entry = elist.pop(0)
            ux += 100.0
            if entry['RQ'] < minrq:
                continue
            if entry['Len'] < minlen:
                break
            pdiff = rje.modulus(plen[0] - plen[1])
            ediff = rje.modulus(plen[nextp] + entry['Len'] - plen[1 - nextp])
            if ediff >= pdiff:
                break   # Finish!
            zmwlist.append((entry['SMRT'], entry['ZMW'], entry['Pos']))
            plen[nextp] += entry['Len']
            prq[nextp] += entry['RQ']
            pseq[nextp] += 1
        elists[nextp] = []
        self.printLog('\r#DIP', 'Diploidising subreads complete: %s subreads to output.' % rje.iLen(zmwlist))
        self.printLog('\r#DIP', '%s: %s seq; %s bp (%.1fX); %.3f mean RQ.' % (self.getStr('Parent1'), rje.iStr(pseq[0]), rje.iStr(plen[0]), 1.0 * plen[0] / self.getInt('GenomeSize'), prq[0] / pseq[0]))
        self.printLog('\r#DIP', '%s: %s seq; %s bp (%.1fX); %.3f mean RQ.' % (self.getStr('Parent2'), rje.iStr(pseq[1]), rje.iStr(plen[1]), 1.0 * plen[1] / self.getInt('GenomeSize'), prq[1] / pseq[1]))
        ## ~ [1c] Extract Filtered Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        rje.backup(self, filtfile)
        zmwset = set(zmwlist)   # Constant-time lookup for each input subread
        sx = 0.0
        sn = len(seqbatch)
        fx = 0
        # The file must be closed (flushed) before it is re-read for the summary in [2]
        with open(filtfile, 'w') as SEQOUT:
            for seqlist in seqbatch:
                #>m150625_001530_42272_c100792502550000001823157609091582_s1_p0/9/0_3967 RQ=0.784
                si = 100.0 / seqlist.seqNum()
                for seq in seqlist.seqs():
                    self.progLog('\r#OUT', 'Extracting subreads: %.2f%%' % (sx / sn))
                    sx += si
                    (name, sequence) = seqlist.getSeq(seq)
                    namedata = name.replace('/', ' ').split()   # smrt, zmw, pos and (optionally) RQ
                    if len(namedata) not in (3, 4):
                        raise ValueError('Cannot parse SMRT/ZMW/Pos from subread name "%s"' % name)
                    (smrt, zmw, pos) = namedata[:3]
                    if (cdb.data(smrt)['SMRT'], int(zmw), pos) not in zmwset:
                        continue
                    SEQOUT.write('>%s\n%s\n' % (name, sequence))
                    fx += 1
        self.printLog('\r#OUT', 'Saved %s filtered subreads to %s.' % (rje.iStr(fx), filtfile))
        ### ~ [2] Summarise Filtered File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seqcmd = self.cmd_list + ['seqmode=file', 'autoload=T', 'summarise=T', 'seqin=%s' % filtfile, 'autofilter=F']
        rje_seqlist.SeqList(self.log, seqcmd)
        return True
    except:
        self.errorLog('%s.inSilicoHybrid error' % self.prog())
        return False
def filterSPCode(self):  ### Filters species codes according to mincount and shared taxa at different levels.
    '''
    Filters species codes according to mincount and shared taxa at different levels.

    Edits the entries of the 'spcode' table in place (fields read: 'protein', 'spcode', 'boot'):
    - Entries with spcode 'None' have 'boot' set to the NoneBoot setting and are otherwise left alone.
    - Entries with boot < BootFilter have their spcode replaced with 'Uncertain'.
    - Remaining entries have their '|'-joined spcode list reduced: a code with no parent species codes is dropped
      when other codes co-occur (logged as "unmapped"), as is any code that is a parent of another listed code.
    - Each surviving code scores 1.0 (or boot, if BootWeight is set) divided by the number of surviving codes in the
      entry. In a second pass, codes whose summed score is below MinScore (when MinScore > 0) are removed, unless
      that would leave the entry with no species code at all.
    Any exception is reported through errorLog() and is not re-raised.
    << returns None
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        tax = self.obj['Taxonomy']
        specdb = self.db('spcode')
        bootfilter = self.getNum('BootFilter')
        minscore = self.getNum('MinScore')
        bootweight = self.getBool('BootWeight')
        parents = {}    # Dictionary of {spcode:[parent spcodes]}, cached across entries
        taxsum = {}     # Dictionary of {spcode:summed score}
        fx = 0; bfx = 0; ufx = 0    # Counters: parent codes / low bootstrap proteins / unmapped codes filtered
        ### ~ [2] Filter parents and unmapped codes; sum scores ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for ekey in specdb.dataKeys():
            entry = specdb.data(ekey)
            if entry['spcode'] == 'None':
                entry['boot'] = self.getNum('NoneBoot')
                continue
            if entry['boot'] < bootfilter:
                self.printLog('#FILT','%s: filtered -> "Uncertain" (bootstrap %s < bootfilter=%s).' % (entry['protein'],entry['boot'],bootfilter))
                entry['spcode'] = 'Uncertain'; bfx += 1
                continue
            spcodes = entry['spcode'].split('|')
            for spcode in spcodes[0:]:      # Iterate over a copy: spcodes is edited inside the loop
                #i# A code already removed as the parent of an earlier code must be skipped: a second
                #i# spcodes.remove() of it would raise ValueError and abort the whole filter.
                if spcode not in spcodes: continue
                if spcode not in parents:
                    parents[spcode] = []
                    #i# NOTE(review): an unmappable code is skipped here on first sight but has an empty parent list
                    #i# cached, so later entries treat it as unmapped. Kept as in the original - confirm intent.
                    try: taxid = tax.mapToTaxID(spcode,nodeonly=True,warn=False)[0]
                    except: continue
                    while taxid in tax.dict['Parent']:      # Climb the tree, collecting ancestor species codes
                        taxid = tax.dict['Parent'][taxid]
                        parsp = tax.getSpCode(taxid,invent=False,warn=False)
                        if parsp: parents[spcode].append(parsp)
                if not parents[spcode] and len(spcodes) > 1:
                    self.printLog('#FILT','%s: filtered unmapped spcode %s.' % (entry['protein'],spcode))
                    spcodes.remove(spcode); ufx += 1
                for parsp in parents[spcode]:
                    if parsp in spcodes:
                        self.printLog('#FILT','%s: filtered %s as parent of %s.' % (entry['protein'],parsp,spcode))
                        spcodes.remove(parsp); fx += 1
            for taxon in spcodes:
                if taxon not in taxsum: taxsum[taxon] = 0.0
                if bootweight: taxweight = entry['boot']
                else: taxweight = 1.0
                taxsum[taxon] += taxweight / len(spcodes)
            entry['spcode'] = '|'.join(spcodes)
        self.printLog('#FILT','Filtered %s species codes with co-occurring child taxa' % rje.iStr(fx))
        self.printLog('#FILT','Filtered %s unmapped species codes with co-occurring mapped taxa' % rje.iStr(ufx))
        if bootfilter > 0.0: self.printLog('#FILT','Filtered %s proteins with bootstrap < bootfilter=%s' % (rje.iStr(bfx),bootfilter))
        ### ~ [3] Filter low-scoring codes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        fx = 0
        for ekey in specdb.dataKeys():
            entry = specdb.data(ekey)
            if entry['spcode'] in ['None','Uncertain']: continue
            spcodes = entry['spcode'].split('|')
            for spcode in spcodes[0:]:
                if minscore > 0 and minscore > taxsum[spcode]:
                    self.printLog('#FILT','%s: filtered %s < minscore=%s.' % (entry['protein'],spcode,minscore))
                    spcodes.remove(spcode); fx += 1
            #i# The entry is only rewritten if something survives; otherwise the original codes are kept
            if spcodes: entry['spcode'] = '|'.join(spcodes)
            else: self.printLog('#FILT','%s filter aborted: no spcode left!' % (entry['protein']))
        self.printLog('#FILT','Filtered %s species codes failing to meet minscore=%s.' % (rje.iStr(fx),minscore))
    except: self.errorLog('%s.filterSPCode error' % self.prog())
def taxaMap(self):  ### Maps species codes onto different taxonomic ranks.
    '''
    Maps species codes onto different taxonomic ranks.

    Builds two tables from the 'spcode' table and saves both to file:
    - 'taxa' (key spcode): taxid, name and the taxon assigned at genus/family/order/class/phylum for each code.
    - 'taxamap' (key protein): the spcode entry combined with the per-rank taxa of its species codes.
    If the ProtDesc setting is given, proteins in self.dict['ProtDesc'] that are missing from the spcode table are
    first added with spcode 'None'. Errors are reported through errorLog() and not re-raised.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        db = self.db()
        tax = self.obj['Taxonomy']
        specdb = self.db('spcode')
        ranks = ['genus','family','order','class','phylum']
        rankmap = {}    # {rank:{spcode:taxon}}
        rankfields = ['protein'] + ranks + specdb.fields()[1:]
        ## ~ [1a] Add proteins without trees ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if self.getStrLC('ProtDesc'):
            rankfields.append('desc')
            addx = 0
            for prot in self.dict['ProtDesc']:
                if prot.lower() in ['','protein','gene']: continue     # Header-like keys
                newentry = {'protein':prot,'spcode':'None','boot':self.getNum('NoneBoot')}
                if specdb.makeKey(newentry) in specdb.dataKeys(): continue
                specdb.addEntry(newentry)
                addx += 1
            self.printLog('#PROT','Added %s proteins from %s without trees.' % (rje.iStr(addx),self.getStr('ProtDesc')))
        ## ~ [1b] Output tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        rankdb = db.addEmptyTable('taxamap',rankfields,['protein'])
        for rank in ranks:
            rankmap[rank] = {'None':'None','Unmapped':'Unmapped','Uncertain':'Uncertain'}
        taxdb = db.addEmptyTable('taxa',['spcode','taxid','name']+ranks,['spcode'])
        ### ~ [2] Map species codes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        stot = specdb.entryNum()
        sx = 0.0
        for entry in specdb.entries():
            self.progLog('\r#SPEC','Processing species: %.2f%%' % (sx/stot))
            sx += 100.0
            try:
                entry['desc'] = self.dict['ProtDesc'][entry['protein']]
            except:
                entry['desc'] = ''
            entrycodes = entry['spcode'].split('|')
            ## ~ [2a] Work out rank taxa for species codes not seen before ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            for spcode in entrycodes:
                if spcode in rankmap['genus']: continue     # Already mapped (or a None/Unmapped/Uncertain placeholder)
                tentry = {'spcode':spcode}
                try:
                    taxid = tax.mapToTaxID(spcode,nodeonly=True,warn=False)[0]
                    rank = tax.dict['Rank'][taxid]
                    tentry['taxid'] = taxid
                    tentry['name'] = tax.getSpecies(taxid)
                except:
                    self.warnLog('Unable to map species code "%s" to TaxID -> "Unmapped"' % spcode)
                    taxid = 'Unmapped'
                    rank = 'genus'
                # Loop through different ranks
                for ri, nextrank in enumerate(ranks):
                    higher = ranks[ri:]
                    # Climb until the node has the wanted rank (or a higher listed rank), or the tree runs out
                    while rank not in higher and taxid in tax.dict['Parent']:
                        taxid = tax.dict['Parent'][taxid]
                        rank = tax.dict['Rank'][taxid]
                    taxon = 'Unmapped'
                    if taxid in tax.dict['Parent']: taxon = tax.getSpecies(taxid)
                    if rank != nextrank:
                        # No node of exactly this rank: flag as uncertain or label with the rank abbreviation
                        if self.getBool('Monophyly'): taxon = 'Uncertain'
                        else: taxon = '%s %s.' % (taxon,nextrank[:3])
                    rankmap[nextrank][spcode] = taxon
                    tentry[nextrank] = taxon
                taxdb.addEntry(tentry)
            ## ~ [2b] Combine the taxa of all species codes for this protein ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            rentry = {}
            for nextrank in ranks:
                taxa = []
                unmapped = ''
                for spcode in entrycodes:
                    ranktax = rankmap[nextrank][spcode]
                    if ranktax in taxa: continue
                    if 'unmapped' in ranktax.lower():
                        if unmapped: self.warnLog('Two Unmapped %s taxa: %s & %s' % (nextrank,unmapped,ranktax))
                        unmapped = ranktax  #i# Should only be one
                    taxa.append(ranktax)
                if len(taxa) > 1 and 'None' in taxa:
                    self.warnLog('None in: %s' % '|'.join(rje.sortUnique(taxa)))
                    taxa.remove('None')
                if len(taxa) > 1 and unmapped: taxa.remove(unmapped)
                if len(taxa) > 1 and self.getBool('Monophyly'):
                    rentry[nextrank] = 'Uncertain'
                else:
                    rentry[nextrank] = '|'.join(rje.sortUnique(taxa))
            rankdb.addEntry(rje.combineDict(rentry,entry))
        self.printLog('\r#SPEC','%s proteins with species codes processed.' % rje.iStr(stot))
        rankdb.saveToFile()
        taxdb.saveToFile()
    except: self.errorLog('%s.taxaMap error' % self.prog())
def topTerms(self,slimx=20,parents=False,total='Total',countkey='counts'):  ### Selects top terms for GO slim set
    '''
    Selects top terms for GO slim set.
    >> slimx:int [20] = Desired min. number of terms for each GO domain.
    >> parents:bool [False] = Whether parents and children both allowed in list
    >> total:str ['Total'] = Sample containing Total counts for assessment
    >> countkey:str ['counts'] = Key identifying count dictionary for each GO term and 'total' count sample
        - self.go(id)[countkey] = {Sample:count}
    << returns a list of GO IDs that meet criteria. An empty list is returned if an error occurs (which is logged).
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        terms = []                          # List of terms
        dom = {'cc':{},'bp':{},'mf':{}}     # Dictionary of {domain:{count:[IDs]}}
        for goid in self.go():
            godata = self.go(goid)
            dom[godata['type']].setdefault(godata[countkey][total],[]).append(goid)
        ### ~ [2] ~ Generate Top Terms ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.deBug(dom)
        for gotype in dom:
            dterms = []                         # Terms for this domain only
            dkeys = rje.sortKeys(dom[gotype])   # Counts, low to high
            dkeys.reverse()                     # Counts, high to low
            (dx,dtot) = (0.0,len(dkeys))
            #i# Lowest count added for this domain. Reset for each domain so that a domain without any terms
            #i# does not report a stale count from a previous loop in the summary below.
            mincount = 0
            while dkeys and len(dterms) < slimx:    # Keep adding the next-highest count until enough terms
                self.deBug('%s: %s' % (gotype,dterms))
                self.progLog('#TOP','Generating top %d %s terms: %.1f%%' % (slimx,gotype,dx/dtot))
                dx += 100.0
                mincount = dkeys.pop(0)         # Remove from list
                dterms += dom[gotype][mincount] # Add terms to term list
                if parents: continue            # Don't care if parents and children all mixed up
                for goid in dterms[0:]:
                    if goid not in dterms: continue     # Previously-removed parent
                    for par in self.parents(goid):      # Check all parents
                        if par in dterms: dterms.remove(par)    # Remove parent term
            self.printLog('\r#TOP','Identified %s top %s terms: >= %s genes' % (rje.iLen(dterms),gotype,rje.iStr(mincount)))
            terms += dterms     # Found a stable list of terms
        self.deBug(terms)
        return terms
    except: self.errorLog('Major problem with GO.topTerms()'); return []
def gopher(self):  ### Sets up data for GOPHER run
    '''
    Sets up data for GOPHER run.

    Formats the *.ygob.fas and *.yeast.fas files (named from self.info['Basefile']) as BLAST databases, then converts
    each pillar in self.list['Pillars'] into a list of sequence short names. That list is written to
    BLAST/<acc>.blast.id for every pillar member that is in self.list['YeastSeq'] (or, if that list is empty, has
    SpecCode 'YEAST'). The ID -> accession mapping is stored in self.dict['PillarMap'].
    Errors are logged and re-raised.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        rje.mkDir(self,'BLAST/')
        rje_blast.BLASTRun(self.log,self.cmd_list).formatDB(fasfile='%s.ygob.fas' % self.info['Basefile'],protein=True,force=False)
        rje_blast.BLASTRun(self.log,self.cmd_list).formatDB(fasfile='%s.yeast.fas' % self.info['Basefile'],protein=True,force=False)
        seqdict = self.obj['SeqList'].seqNameDic('AccNum')
        ymap = self.dict['PillarMap'] = {}
        ### ~ [2] Convert Pillars to BLAST IDs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (px,ptot) = (0.0,len(self.list['Pillars'])); ox = 0
        for pillar in self.list['Pillars']:
            self.progLog('\r#YGOB','Converting YGOB Pillars for GOPHER: %.2f%%' % (px/ptot)); px += 100
            newpillar = []
            for yid in pillar:
                #i# A temporary Sequence object is used purely to parse the accession number from the ID
                seq = rje_sequence.Sequence(self.log,self.cmd_list)
                seq.opt['Yeast'] = True
                seq.info['Name'] = yid
                seq.extractDetails(gnspacc=True)
                ygob = seq.info['AccNum']
                acc = self.dict['Rename'].get(ygob,ygob)    # Use renamed accession where there is one
                ymap[yid] = acc
                if acc not in seqdict:
                    self.printLog('\r#GENE','Non-coding gene %s (%s)? Cannot find in fasta file' % (acc,yid))
                    continue
                try: newpillar.append(seqdict[acc].shortName())
                except:
                    print('%s %s %s' % (yid,ygob,acc))     # Same output under Python 2 and 3
                    self.errorLog(rje_zen.Zen().wisdom())
            if not newpillar: continue
            idtext = '\n'.join(newpillar)
            for ygob in pillar:
                acc = ymap[ygob]
                if acc not in seqdict: continue
                if acc in self.list['YeastSeq'] or (not self.list['YeastSeq'] and seqdict[acc].info['SpecCode'] == 'YEAST'):
                    #i# Close the file explicitly rather than relying on garbage collection of the handle
                    IDFILE = open(rje.makePath('BLAST/%s.blast.id' % acc,wholepath=True),'w')
                    try: IDFILE.write(idtext)
                    finally: IDFILE.close()
                    ox += 1
        self.progLog('\r#YGOB','Converted YGOB Pillars for GOPHER: %s BLAST ID files.' % rje.iStr(ox))
    except: self.errorLog(rje_zen.Zen().wisdom()); raise   # Delete this if method error not terrible
def alignmentToLocal(self,alignment=[],protqry=False):  ### Converts alignment into local hits table
    '''
    Converts alignment into local hits table.
    >> alignment:list of alignment text strings parsed from exonerate output.
    >> protqry:bool[False] = Whether query is protein
    << returns local database table.

    The 'local' table is keyed on Qry, Rank, Hit and AlnID, with one entry per alignment block: alignments are split
    at each ">>>> Target Intron N >>>>" marker and the blocks are numbered in AlnID. The combined hit sequence of each
    alignment is added to self.obj['DNAHits'] and its translation to self.obj['ProtHits']. A reduced table from
    self.reduceLocal(byqry=True) is renamed 'unique'. Raises ValueError on truncated or inconsistent alignments; all
    errors are logged and re-raised.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        vfields = ['Qry','Hit','AlnID','Score','Expect','Length','Identity','Positives','QryStart','QryEnd','HitStart','HitEnd','QrySeq','HitSeq','AlnSeq','Rank','Phase','HitStrand']
        vdb = self.db().addEmptyTable('local',vfields,['Qry','Hit','AlnID'])
        ### ~ [2] Parse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        #i# The string below is an inert example of the exonerate output being parsed (not executed)
        '''
        Query: FAXD1_NOTSC (P82807) Venom prothrombin activator notecarin-D1 [Notechis scutatus scutatus]
        Target: ahap_PSETE__EBS10XV2AHAP187 haploidB edges=694320..157489 left=833615 right=281503 ver=1.9 style=4:[revcomp]
        Model: protein2genome:local
        Raw score: 1170
        Query range: 19 -> 295
        Target range: 12312786 -> 12307250

        20 : AlaGluSerAsnValPheLeuLysSerLysValAlaAsnArgPheLeuGlnArg : 37
        ..!...||| ||||||||||||||||||||||||||||||||||||||||||
        CysSerSerLeuValPheLeuLysSerLysValAlaAsnArgPheLeuGlnArg
        12312786 : TGTTCTTCTTTAGTATTCTTAAAAAGCAAAGTGGCAAATAGATTTTTGCAAAGA : 12312735

        264 : {G} >>>> Target Intron 7 >>>> {ly}GluIleAspIleSerArg : 270
        {|} 1304 bp {||}|||||||||||||||!!!
        {G}++ ++{ly}GluIleAspIleSerSer
        12308652 : {G}gt.........................ag{GG}GAAATAGACATATCAAGC : 12307328

        289 : ValProProAsnTyrTyrTyr : 295
        |||||| !!!..||| !!|||
        ValProAlaThrTyrAspTyr
        12307273 : GTTCCTGCCACGTATGACTAT : 12307251
        '''
        qry = None
        hit = None
        alnx = {}       # {(Qry,Hit):number of alignments seen}, used to number AlnID (re-used as an int counter in [3])
        ventry = {}     # Table entry for the alignment currently being parsed
        parsing = alignment[0:]     # Work on a copy: lines are consumed with pop(0)
        rank = 1
        while parsing:
            line = parsing.pop(0)
            #self.bugPrint(line)
            # Query
            if rje.matchExp('Query: (\S+)',line):
                if ventry: vdb.addEntry(ventry)     # A new Query line closes the previous alignment
                ventry = {'Qry':rje.matchExp('Query: (\S+)',line)[0],'QrySeq':'','HitSeq':'','AlnSeq':'','Rank':rank}
                rank += 1
            # Hit
            if rje.matchExp('Target: (\S+)',line):
                ventry['Hit'] = rje.matchExp('Target: (\S+)',line)[0]
                qh = (ventry['Qry'],ventry['Hit'])
                if qh in alnx: alnx[qh] += 1
                else: alnx[qh] = 1
                ventry['AlnID'] = alnx[qh]
            # Score
            if rje.matchExp('core: (\S+)',line):
                ventry['Score'] = int(rje.matchExp('core: (\S+)',line)[0])
            # Alignment
            if rje.matchExp('^\s+(\d+) : (.+) :\s+(\d+)',line):
                #i# Query coordinate line: "start : sequence : end"
                adata = rje.matchExp('^\s+(\d+) : (.+) :\s+(\d+)',line)
                #self.bugPrint('= new aln: %s -> %s' % (adata[0],adata[2]))
                start = int(adata[0])
                end = int(adata[2])
                aln = adata[1]
                x = line.find(aln)      # Column of the sequence text, used to slice the match line below
                if 'QryStart' not in ventry: ventry['QryStart'] = start
                ventry['QryEnd'] = end
                ventry['QrySeq'] += aln
                #self.bugPrint('^%s$' % ventry['QrySeq'])
                line = parsing.pop(0)   # Next line is the match line, taken over the same columns
                #self.bugPrint(line)
                #self.bugPrint(']%s[' % aln)
                #self.bugPrint(']%s[' % line[x:x+len(aln)])
                ventry['AlnSeq'] += line[x:x+len(aln)]
                #self.debug('^%s$' % ventry['AlnSeq'])
                #self.bugPrint(parsing[0])
                #i# The hit coordinate line is expected within the next two lines; the first is consumed and
                #i# discarded if it is not a coordinate line
                adata = rje.matchExp('^\s+(\d+) : (.+) :\s+(\d+)',parsing.pop(0))
                if not adata:
                    #self.deBug(parsing[0])
                    adata = rje.matchExp('^\s+(\d+) : (.+) :\s+(\d+)',parsing.pop(0))
                if not adata: raise ValueError('Partial alignment! Truncated output?')
                #self.bugPrint('+ hit aln: %s -> %s' % (adata[0],adata[2]))
                start = int(adata[0])
                end = int(adata[2])
                aln = adata[1]
                if 'HitStart' not in ventry: ventry['HitStart'] = start
                ventry['HitEnd'] = end
                ventry['HitSeq'] += aln
        if ventry: vdb.addEntry(ventry)     # Add the final alignment
        ## Seq Check
        for ventry in vdb.entries():
            #self.bugPrint('^%s$' % ventry['QrySeq'])
            #self.bugPrint('^%s$' % ventry['AlnSeq'])
            #self.bugPrint('^%s$' % ventry['HitSeq'])
            if len(ventry['QrySeq']) != len(ventry['AlnSeq']) or len(ventry['QrySeq']) != len(ventry['HitSeq']):
                self.debug(ventry)
                raise ValueError('Alignment sequence length mismatch! Qry:%d ; Aln:%d ; Hit:%d' % (len(ventry['QrySeq']),len(ventry['AlnSeq']),len(ventry['HitSeq'])))
        ### ~ [3] Split on introns ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.obj['DNAHits'] = rje_seqlist.SeqList(self.log,self.cmd_list+['seqin=None','seqmode=tuple','autoload=F','dna=T'])
        self.obj['ProtHits'] = rje_seqlist.SeqList(self.log,self.cmd_list+['seqin=None','seqmode=tuple','autoload=F'])
        #i# Protein Position Conversion
        #i# Query positions are converted to nucleotide-scale numbering here and converted back at the end
        if protqry:
            for ventry in vdb.entries():
                # 1->1, 2->4, 3->7 = 1+3*(n-1)
                ventry['QryStart'] = 1+3*(ventry['QryStart']-1)
                if ventry['QrySeq'].startswith('{'):
                    codend = ventry['QrySeq'].find('}')
                    # {X} = phase 2, find = 2
                    if codend == 2: ventry['QryStart'] += 2
                    # {XX} = phase 1, find = 3
                    elif codend == 3: ventry['QryStart'] += 1
                    else: raise ValueError('QrySeq {} bracket mismatch!: %s' % ventry)
                ventry['QryEnd'] = ventry['QryStart'] + len(ventry['QrySeq']) - string.count(ventry['QrySeq'],'-') - 1
        vdb.newKey(['Qry','Rank','Hit','AlnID'])
        for vkey in vdb.dataKeys():
            ventry = vdb.data(vkey)
            #i# Make a combined hitseq to output to fasta
            #># phap_PSETE__EBS10XV2PHAP187.FAXD1_NOTSC.XXX
            hitname = '%s.ex%s %s %s-%s' % (ventry['Qry'],ventry['Rank'],ventry['Hit'],rje.iStr(ventry['HitStart']),rje.iStr(ventry['HitEnd']))
            hitseq = ''
            phase = (ventry['QryStart'] + 2) % 3    # Number of N to prefix to the hit sequence before translation
            alnx = 1                                # Alignment block counter for this hit
            vkeyentries = [ventry]                  # All block entries of this alignment, for AlnID zero-padding
            dirn = 1                                # Direction of hit coordinates: -1 if HitEnd < HitStart
            if ventry['HitEnd'] < ventry['HitStart']:
                dirn = -1
                ventry['HitStrand'] = '-'
            else: ventry['HitStrand'] = '+'
            for seq in ['HitSeq','QrySeq','AlnSeq']:
                ventry[seq] = string.replace(ventry[seq],'}','')
                ventry[seq] = string.replace(ventry[seq],'{','')
            while rje.matchExp('(\s+>>>> Target Intron \d+ >>>>\s+)',ventry['QrySeq']):
                intron = rje.matchExp('(\s+>>>> Target Intron \d+ >>>>\s+)',ventry['QrySeq'])[0]
                x = ventry['QrySeq'].find(intron)   # Intron marker spans columns x:y of all three strings
                y = x + len(intron)
                intronlen = int(rje.matchExp('(\d+) bp',ventry['AlnSeq'][x:y])[0])  # "NNN bp" is read from the match line
                #i# Create a new entry of the first exon
                newentry = rje.combineDict({},ventry)
                for seq in ['HitSeq','QrySeq','AlnSeq']: newentry[seq] = newentry[seq][:x]
                newentry['AlnID'] = '%s.%d' % (ventry['AlnID'],alnx); alnx += 1
                newentry['QryEnd'] = newentry['QryStart'] + len(newentry['QrySeq']) - string.count(newentry['QrySeq'],'-') - 1
                newentry['HitEnd'] = newentry['HitStart'] + (len(newentry['HitSeq']) - string.count(newentry['HitSeq'],'-') - 1) * dirn
                newentry['Length'] = x
                newentry['Identity'] = string.count(newentry['AlnSeq'],'|')
                vkeyentries.append(vdb.addEntry(newentry))
                hitseq += newentry['HitSeq']
                #i# Update ventry to be the rest of the hit
                for seq in ['HitSeq','QrySeq','AlnSeq']: ventry[seq] = ventry[seq][y:]
                ventry['QryStart'] = newentry['QryEnd'] + 1
                if protqry: ventry['QryEnd'] = ventry['QryStart'] + len(ventry['QrySeq']) - string.count(ventry['QrySeq'],'-') - 1
                ventry['HitStart'] = newentry['HitEnd'] + intronlen * dirn
            #i# Calculate length and identity of final exon
            ventry['AlnID'] = '%s.%d' % (ventry['AlnID'],alnx)
            ventry['Length'] = len(ventry['AlnSeq'])
            ventry['Identity'] = string.count(ventry['AlnSeq'],'|')
            #i# Add sequence hits
            hitname += ' (%d alignment blocks)' % alnx
            hitseq += ventry['HitSeq']
            hitseq = string.replace(hitseq,'-','')
            protseq = rje_sequence.dna2prot('%s%s' % ('N' * phase,hitseq))
            self.obj['ProtHits']._addSeq(hitname,protseq)
            if ventry['HitStart'] > ventry['HitEnd']: hitseq = rje_sequence.reverseComplement(hitseq)
            self.obj['DNAHits']._addSeq(hitname,hitseq)
            #i# Update AlnID for proper float sorting
            for ventry in vkeyentries:
                (vcore,vx) = string.split(ventry['AlnID'],'.')
                ventry['AlnID'] = '%s.%s' % (vcore,rje.preZero(int(vx),alnx))
                #self.debug(ventry)
        vdb.dataFormat({'AlnID':'string'})
        vdb.remakeKeys()
        self.debug(vdb.dataKeys())
        ## Seq Check
        for ventry in vdb.entries():
            #self.bugPrint('^%s$' % ventry['QrySeq'])
            #self.bugPrint('^%s$' % ventry['AlnSeq'])
            #self.bugPrint('^%s$\n' % ventry['HitSeq'])
            if len(ventry['QrySeq']) != len(ventry['AlnSeq']) or len(ventry['QrySeq']) != len(ventry['HitSeq']):
                self.debug(ventry)
                raise ValueError('Alignment sequence length mismatch! Qry:%d ; Aln:%d ; Hit:%d' % (len(ventry['QrySeq']),len(ventry['AlnSeq']),len(ventry['HitSeq'])))
        udb = self.reduceLocal(byqry=True)
        udb.rename('unique')
        udb.newKey(['Qry','Rank','Hit','AlnID'])
        self.debug(vdb.dataKeys())
        #i# Calculate exon phase
        for ventry in vdb.entries() + udb.entries(): ventry['Phase'] = (ventry['QryStart'] - 1) % 3
        #i# Protein Position Conversion
        #i# Reverses the 1+3*(n-1) conversion made above (integer division under Python 2)
        if protqry:
            for ventry in vdb.entries():
                ventry['QryStart'] = (ventry['QryStart']+2)/3
                ventry['QryEnd'] = (ventry['QryEnd']+2)/3
            for ventry in udb.entries():
                ventry['QryStart'] = (ventry['QryStart']+2)/3
                ventry['QryEnd'] = (ventry['QryEnd']+2)/3
        #vdb.remakeKeys()
        return vdb
    except: self.errorLog('%s.alignmentToLocal error' % self.prog()); raise
def taxDict(self,taxid,store=False,skipuni=False):  ### Extracts taxonomy details from SpecFile for taxid
    '''
    Extracts taxonomy details from SpecFile for taxid. If taxid is a list, will process each element.
    >> taxid:str/int or list = Taxonomy ID, or a list of IDs. NOTE: a list is sorted in place.
    >> store:bool [False] = Not used directly by this method; only passed on to the per-ID calls made for a list.
    >> skipuni:bool [False] = Whether to skip the grep of SpecFile and only look in NameMap.
    << returns dictionary holding whichever of 'spcode', 'name' and 'common' were found for a single taxid (may be
       empty), or the existing self.dict['TaxDict'] entry if there is one. For a list, returns {taxid:dictionary}.
    Errors are logged with errorLog() and re-raised.
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        taxdict = {}
        ### ~ [1] Taxa List ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        tlist = True
        try: taxid.sort()       # Only a list has sort(): tells a list of IDs from a single ID
        except: tlist = False
        if tlist:
            tx = 0.0; ttot = len(taxid); mx = 0
            for t in taxid:
                self.progLog('\r#SPEC','Extracting Uniprot species details: %.1f%%' % (tx/ttot)); tx += 100.0
                taxdict[t] = self.taxDict(t,store,skipuni)  # skipuni is now honoured for lists too
                if not taxdict[t]: mx += 1
            self.printLog('\r#SPEC','Extracted Uniprot/NCBI species details for %s TaxID: %s missing' % (rje.iStr(ttot),rje.iStr(mx)))
            return taxdict
        ### ~ [2] Individual taxa ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        taxid = '%s' % taxid
        if taxid in self.dict['TaxDict']: return self.dict['TaxDict'][taxid]
        #i# NOTE: taxid and the file paths are interpolated unescaped into shell commands below
        if not skipuni:
            GREP = os.popen('grep -A 1 " %s:" %s' % (taxid, self.getStr('SpecFile')))
            try: greplines = GREP.readlines()
            finally: GREP.close()
            for entry in greplines:
                nmatch = rje.matchExp(r'^(\S+)\s+\S+\s+(\d+):\s+N=(\S.+)\s*$',entry)
                if nmatch and nmatch[1] != taxid: break     # Next taxon
                if nmatch: taxdict['spcode'] = nmatch[0]; taxdict['name'] = nmatch[2]
                else:
                    cmatch = rje.matchExp(r'C=(\S.+)\s*$',entry)
                    if cmatch: taxdict['common'] = cmatch[0]
        ## ~ [2b] ~ Adding missing scientific names from NameMap ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if not taxdict:
            GREP = os.popen('grep -i -e "^%s\t" %s' % (taxid, self.getStr('NameMap')))
            try: greplines = GREP.readlines()
            finally: GREP.close()
            for entry in greplines:
                tdata = entry.split('\t|\t')
                if len(tdata) < 4 or not tdata[3].startswith('scientific name'): continue
                tname = tdata[1]
                #i# The first scientific name is kept; later ones only generate a warning. (The warning used to
                #i# crash: it referenced an undefined variable and formatted the string taxid with %d.)
                if 'name' in taxdict: self.warnLog('TaxID %s hits "%s" and "%s"!' % (taxid,taxdict['name'],tname))
                else: taxdict['name'] = tname
        return taxdict
    except: self.errorLog('%s.taxDict() error' % (self)); raise
def gopher(self):  ### Sets up data for GOPHER run
    '''
    Sets up data for GOPHER run.

    Formats the *.ygob.fas and *.yeast.fas files (named from self.info['Basefile']) as BLAST databases, then converts
    each pillar in self.list['Pillars'] into a list of sequence short names. That list is written to
    BLAST/<acc>.blast.id for every pillar member that is in self.list['YeastSeq'] (or, if that list is empty, has
    SpecCode 'YEAST'). The ID -> accession mapping is stored in self.dict['PillarMap'].
    Errors are logged and re-raised.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        rje.mkDir(self,'BLAST/')
        rje_blast.BLASTRun(self.log,self.cmd_list).formatDB(fasfile='%s.ygob.fas' % self.info['Basefile'],protein=True,force=False)
        rje_blast.BLASTRun(self.log,self.cmd_list).formatDB(fasfile='%s.yeast.fas' % self.info['Basefile'],protein=True,force=False)
        seqdict = self.obj['SeqList'].seqNameDic('AccNum')
        ymap = self.dict['PillarMap'] = {}
        ### ~ [2] Convert Pillars to BLAST IDs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (px,ptot) = (0.0,len(self.list['Pillars'])); ox = 0
        for pillar in self.list['Pillars']:
            self.progLog('\r#YGOB','Converting YGOB Pillars for GOPHER: %.2f%%' % (px/ptot)); px += 100
            newpillar = []
            for yid in pillar:
                #i# A temporary Sequence object is used purely to parse the accession number from the ID
                seq = rje_sequence.Sequence(self.log,self.cmd_list)
                seq.opt['Yeast'] = True
                seq.info['Name'] = yid
                seq.extractDetails(gnspacc=True)
                ygob = seq.info['AccNum']
                acc = self.dict['Rename'].get(ygob,ygob)    # Use renamed accession where there is one
                ymap[yid] = acc
                if acc not in seqdict:
                    self.printLog('\r#GENE','Non-coding gene %s (%s)? Cannot find in fasta file' % (acc,yid))
                    continue
                try: newpillar.append(seqdict[acc].shortName())
                except:
                    print('%s %s %s' % (yid,ygob,acc))     # Same output under Python 2 and 3
                    self.errorLog(rje_zen.Zen().wisdom())
            if not newpillar: continue
            idtext = '\n'.join(newpillar)
            for ygob in pillar:
                acc = ymap[ygob]
                if acc not in seqdict: continue
                if acc in self.list['YeastSeq'] or (not self.list['YeastSeq'] and seqdict[acc].info['SpecCode'] == 'YEAST'):
                    #i# Close the file explicitly rather than relying on garbage collection of the handle
                    IDFILE = open(rje.makePath('BLAST/%s.blast.id' % acc,wholepath=True),'w')
                    try: IDFILE.write(idtext)
                    finally: IDFILE.close()
                    ox += 1
        self.progLog('\r#YGOB','Converted YGOB Pillars for GOPHER: %s BLAST ID files.' % rje.iStr(ox))
    except: self.errorLog(rje_zen.Zen().wisdom()); raise   # Delete this if method error not terrible
def sgd2sp(self):  ### Reformats yeast sequence names and outputs new data for GOPHER
    '''
    Reformats yeast sequence names and outputs new data for GOPHER.

    YEAST sequences in self.obj['SeqList'] are renamed using the 'XRef' table and matching UniProt entries.
    self.dict['Rename'] is filled with {SGD accession:UniProt accession} for sequences identical to their UniProt
    entry. All sequences are saved to *.ygob.fas and the YEAST ones to *.yeast.fas (if those files do not already
    exist), and self.list['YeastSeq'] is set to the YEAST accession list. Errors are logged and re-raised.
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        inseq = self.obj['SeqList']
        uni = rje_uniprot.UniProt(self.log,self.cmd_list+['datout=None'])
        xref = self.db('XRef')
        self.dict['Rename'] = {}
        ## ~ [1a] ~ Check or Make UniProt extraction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        ufile = '%s.dat' % self.info['Basefile']
        if not os.path.exists(ufile) or self.opt['Force']:
            # No existing extraction (or forced): extract the cross-referenced accessions and save them
            uni.readUniProt(clear=True,acclist=rje.sortKeys(xref.index('UniProt')),cleardata=False)
            uni.saveUniProt(ufile)
        else:
            uni.readUniProt(ufile,clear=True,cleardata=False)
        ## ~ [1b] ~ Make dictionary of UniProt sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        uniseq = {}     # {AccNum:Sequence object}
        for uentry in uni.entries():
            seqobj = uentry.obj['Sequence']
            uniseq[seqobj.info['AccNum']] = seqobj
        self.printLog('\r#USEQ','%s UniProt Sequences extracted (%s Ensembl AccNum)' % (rje.iStr(len(uniseq)), rje.iStr(len(xref.index('UniProt')))))
        ### ~ [2] ~ Reformat sequences and save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        yseq = []   # List of YEAST sequence objects
        stot = inseq.seqNum()
        sx = 0.0
        for seq in inseq.seqs():
            self.progLog('\r#SEQ','Reformatting sequence names: %.2f%%' % (sx/stot))
            sx += 100.0
            if seq.info['SpecCode'] != 'YEAST': continue
            yseq.append(seq)
            sgd = seq.info['AccNum']
            newname = seq.info['Name']
            try:
                for x in xref.indexEntries('EnsG',sgd):
                    acc = x['UniProt']
                    if not acc:
                        newname = '%s [Gene:%s EnsG:%s SGD:%s AccNum:-]' % (seq.info['Name'],x['Gene'],x['EnsG'],x['SGD'])
                        continue
                    newname = '%s [Gene:%s EnsG:%s SGD:%s AccNum:%s]' % (seq.info['Name'],x['Gene'],x['EnsG'],x['SGD'],acc)
                    if acc not in uniseq:
                        self.printLog('\r#UNIERR','Unable to find UniProt sequence %s (%s)' % (acc,sgd))
                        continue
                    if uniseq[acc].info['Sequence'] != seq.info['Sequence']:
                        self.printLog('\r#SEQERR','%s sequence <> %s sequence' % (sgd,acc))
                        continue
                    # Identical UniProt sequence: swap the first word of the name for the UniProt ID__AccNum
                    words = newname.split()
                    words[0] = '%s__%s' % (x['UniprotID'],acc)
                    newname = ' '.join(words)
                    self.dict['Rename'][sgd] = acc
                    break
            except:
                self.errorLog('%s problem' % sgd)
            seq.info['Name'] = newname
            seq.extractDetails(gnspacc=True)
        self.printLog('\r#SEQ','Reformatting sequence names complete.')
        ## ~ [2a] ~ Save renamed sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        ygobfas = '%s.ygob.fas' % self.info['Basefile']
        if not rje.exists(ygobfas): inseq.saveFasta(seqfile=ygobfas)
        yeastfas = '%s.yeast.fas' % self.info['Basefile']
        if not rje.exists(yeastfas): inseq.saveFasta(seqs=yseq,seqfile=yeastfas)
        self.list['YeastSeq'] = inseq.accList(yseq)
    except: self.errorLog(rje_zen.Zen().wisdom()); raise   # Delete this if method error not terrible
def makePPI(self):  ### Generates files for Human-HIV PPI analysis
    '''
    Generates files for Human-HIV PPI analysis.

    For each HIV accession indexed in the 'HHPIDMap' table, a directory named after the HIV gene is made. For each
    interactor entry with an existing <PPIDir><Symbol>.ppi.fas file, a copy is written to
    <GENE>/<gene>.<Symbol>.ppi.fas with the HIV sequence added as the first sequence. Missing files are logged.
    << returns False if no HIV sequences are loaded or an error occurs (which is logged); otherwise returns None.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seqlist = rje_seq.SeqList(self.log,self.cmd_list+['seqin=%s' % self.getStr('HIVSeq'),'autoload=T'])
        if not seqlist.seqs(): return False
        seqmap = seqlist.seqNameDic('Max')
        mdb = self.db('HHPIDMap')
        ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for hivacc in mdb.index('AccHIV'):
            # map HIV accession numbers on to sequences seqNameDic
            accnum = hivacc.split('.')[0]       # Accession without version suffix
            hivseq = seqmap[accnum]
            # extract short HIV name from sequence ID
            hivgene = hivseq.shortName().split('_')[0].upper()
            # create directory named after HIV gene
            rje.mkDir(self,'%s/' % hivgene,log=True)
            # copy human PPI files into directories, adding HIV gene
            etot = len(mdb.index('AccHIV')[hivacc])
            for entry in mdb.indexEntries('AccHIV',hivacc):
                self.progLog('\r#PPI','Generating human-HIV PPI fasta files for %s %s PPI' % (rje.iStr(etot),hivgene))
                pfile = self.getStr('PPIDir') + entry['Symbol'] + '.ppi.fas'
                if not rje.exists(pfile):
                    self.errorLog('Cannot find human PPI file for %s interactor "%s"' % (entry['HIV'],entry['Symbol']),printerror=False)
                    continue
                #i# Both handles are closed explicitly, even if a read or write fails
                PPI = open(pfile,'r')
                try: ppifas = PPI.read()
                finally: PPI.close()
                FAS = open('%s/%s.%s.ppi.fas' % (hivgene,hivgene.lower(),entry['Symbol']),'w')
                try:
                    FAS.write('>%s\n%s\n' % (hivseq.info['Name'],hivseq.getSequence()))
                    FAS.write(ppifas)
                finally: FAS.close()
            #i# Uses the last entry of the loop above for the HIV protein name
            self.printLog('\r#PPI','Generated human-HIV PPI fasta files for %s %s (%s) PPI.' % (rje.iStr(etot),entry['HIV'],hivgene))
    except: self.errorLog('%s.makePPI error' % self); return False
def filterSPCode(self):  ### Filters species codes according to mincount and shared taxa at different levels.
    '''
    Filters species codes according to mincount and shared taxa at different levels.

    Edits the entries of the 'spcode' table in place (fields read: 'protein', 'spcode', 'boot'):
    - Entries with spcode 'None' have 'boot' set to the NoneBoot setting and are otherwise left alone.
    - Entries with boot < BootFilter have their spcode replaced with 'Uncertain'.
    - Remaining entries have their '|'-joined spcode list reduced: a code with no parent species codes is dropped
      when other codes co-occur (logged as "unmapped"), as is any code that is a parent of another listed code.
    - Each surviving code scores 1.0 (or boot, if BootWeight is set) divided by the number of surviving codes in the
      entry. In a second pass, codes whose summed score is below MinScore (when MinScore > 0) are removed, unless
      that would leave the entry with no species code at all.
    Any exception is reported through errorLog() and is not re-raised.
    << returns None
    '''
    try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        tax = self.obj['Taxonomy']
        specdb = self.db('spcode')
        bootfilter = self.getNum('BootFilter')
        minscore = self.getNum('MinScore')
        bootweight = self.getBool('BootWeight')
        parents = {}  # Dictionary of {spcode:[parent spcodes]}, cached across entries
        taxsum = {}  # Dictionary of {spcode:summed score}
        # Counters: parent codes / low bootstrap proteins / unmapped codes filtered
        fx = 0
        bfx = 0
        ufx = 0
        ### ~ [2] Filter parents and unmapped codes; sum scores ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for ekey in specdb.dataKeys():
            entry = specdb.data(ekey)
            if entry['spcode'] == 'None':
                entry['boot'] = self.getNum('NoneBoot')
                continue
            if entry['boot'] < bootfilter:
                self.printLog(
                    '#FILT',
                    '%s: filtered -> "Uncertain" (bootstrap %s < bootfilter=%s).'
                    % (entry['protein'], entry['boot'], bootfilter))
                entry['spcode'] = 'Uncertain'
                bfx += 1
                continue
            spcodes = entry['spcode'].split('|')
            for spcode in spcodes[0:]:  # Iterate over a copy: spcodes is edited inside the loop
                # A code already removed as the parent of an earlier code must be skipped: a second
                # spcodes.remove() of it would raise ValueError and abort the whole filter.
                if spcode not in spcodes:
                    continue
                if spcode not in parents:
                    parents[spcode] = []
                    # NOTE(review): an unmappable code is skipped here on first sight but has an empty parent
                    # list cached, so later entries treat it as unmapped. Kept as in the original - confirm intent.
                    try:
                        taxid = tax.mapToTaxID(spcode, nodeonly=True, warn=False)[0]
                    except:
                        continue
                    while taxid in tax.dict['Parent']:  # Climb the tree, collecting ancestor species codes
                        taxid = tax.dict['Parent'][taxid]
                        parsp = tax.getSpCode(taxid, invent=False, warn=False)
                        if parsp:
                            parents[spcode].append(parsp)
                if not parents[spcode] and len(spcodes) > 1:
                    self.printLog(
                        '#FILT', '%s: filtered unmapped spcode %s.' % (entry['protein'], spcode))
                    spcodes.remove(spcode)
                    ufx += 1
                for parsp in parents[spcode]:
                    if parsp in spcodes:
                        self.printLog(
                            '#FILT',
                            '%s: filtered %s as parent of %s.' % (entry['protein'], parsp, spcode))
                        spcodes.remove(parsp)
                        fx += 1
            for taxon in spcodes:
                if taxon not in taxsum:
                    taxsum[taxon] = 0.0
                if bootweight:
                    taxweight = entry['boot']
                else:
                    taxweight = 1.0
                taxsum[taxon] += taxweight / len(spcodes)
            entry['spcode'] = '|'.join(spcodes)
        self.printLog(
            '#FILT', 'Filtered %s species codes with co-occurring child taxa' % rje.iStr(fx))
        self.printLog(
            '#FILT',
            'Filtered %s unmapped species codes with co-occurring mapped taxa' % rje.iStr(ufx))
        if bootfilter > 0.0:
            self.printLog(
                '#FILT',
                'Filtered %s proteins with bootstrap < bootfilter=%s' % (rje.iStr(bfx), bootfilter))
        ### ~ [3] Filter low-scoring codes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        fx = 0
        for ekey in specdb.dataKeys():
            entry = specdb.data(ekey)
            if entry['spcode'] in ['None', 'Uncertain']:
                continue
            spcodes = entry['spcode'].split('|')
            for spcode in spcodes[0:]:
                if minscore > 0 and minscore > taxsum[spcode]:
                    self.printLog(
                        '#FILT',
                        '%s: filtered %s < minscore=%s.' % (entry['protein'], spcode, minscore))
                    spcodes.remove(spcode)
                    fx += 1
            # The entry is only rewritten if something survives; otherwise the original codes are kept
            if spcodes:
                entry['spcode'] = '|'.join(spcodes)
            else:
                self.printLog(
                    '#FILT', '%s filter aborted: no spcode left!' % (entry['protein']))
        self.printLog(
            '#FILT',
            'Filtered %s species codes failing to meet minscore=%s.' % (rje.iStr(fx), minscore))
    except:
        self.errorLog('%s.filterSPCode error' % self.prog())
def pileUpStats(self):  ### Calculates statistics of genetic differences from parsed PileUp Tables
    '''
    Calculates statistics of genetic differences from parsed PileUp Tables.

    Reads the <basefile>.WT.tdt and <basefile>.Mut.tdt tables into per-locus lists, padding positions
    missing from a table with zero counts. Reference positions missing from self.dict['RefSeq'] are
    filled from the third column of the mutant table. One row per reference position is written to
    <basefile>.pdiff.tdt with two probabilities from rje.binomial() (p.Over, p.Under) and their
    combined value p.Diff = min(1, 2 * min(p.Over, p.Under)). self.pileUpFDR() is then called.

    If <basefile>.pdiff.tdt already exists and self.force() is False, the calculation is skipped and
    the return value of self.pileUpFDR() is returned. Otherwise returns None. Errors are reported
    through self.errorLog().
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        statfile = '%s.pdiff.tdt' % self.baseFile()
        if not self.force() and os.path.exists(statfile): return self.pileUpFDR()
        ## ~ [0a] Load WT Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        wtdata = {}     # {locus:{field:[value per position]}}
        for locus in self.dict['RefSeq']: wtdata[locus] = {'N':[],'QN':[],'MajFreq':[]}
        fields = []
        WTDATA = open('%s.WT.tdt' % self.baseFile(),'r')
        try:
            for line in WTDATA:
                data = rje.readDelimit(line)
                if not fields: fields = data[0:]; continue    # First populated line is the header
                locus = data[0]
                pos = int(data[1])
                wt = wtdata[locus]
                # Pad positions absent from the table. Padding is tracked per locus (by list length)
                # so that a second or later locus is not padded against the first locus' counter.
                while len(wt['N']) < (pos - 1):
                    wt['N'].append(0); wt['QN'].append(0); wt['MajFreq'].append(0.0)
                for field in ['N','QN']: wt[field].append(int(data[fields.index(field)]))
                wt['MajFreq'].append(string.atof(data[fields.index('MajFreq')]))
        finally: WTDATA.close()
        ## ~ [0b] Load Mutant Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        mutdata = {}    # {locus:{field:[value per position]}}
        for locus in self.dict['RefSeq']: mutdata[locus] = {'N':[],'QN':[],'Major':[],'MajFreq':[],'WTFreq':[]}
        fields = []
        MUTDATA = open('%s.Mut.tdt' % self.baseFile(),'r')
        try:
            for line in MUTDATA:
                data = rje.readDelimit(line)
                if not fields: fields = data[0:]; continue    # First populated line is the header
                locus = data[0]
                self.str['RefSeq'] = self.dict['RefSeq'][locus]
                pos = int(data[1])
                # Extend or patch the stored reference sequence using the table's third column
                try:
                    if pos > len(self.str['RefSeq']):
                        while (pos-1) > len(self.str['RefSeq']): self.str['RefSeq'] += '?'
                        self.str['RefSeq'] += data[2]
                        self.dict['RefSeq'][locus] = self.str['RefSeq']
                    elif self.str['RefSeq'][pos-1] == '?':
                        self.str['RefSeq'] = self.str['RefSeq'][:pos-1] + data[2] + self.str['RefSeq'][pos:]
                        self.dict['RefSeq'][locus] = self.str['RefSeq']
                except: self.warnLog('Problem mapping Pos %s onto %snt %s RefSeq' % (rje.iStr(pos),locus,rje.iLen(self.str['RefSeq'])))
                mut = mutdata[locus]
                while len(mut['N']) < (pos - 1):    # Per-locus padding, as for WT data
                    mut['N'].append(0); mut['QN'].append(0); mut['Major'].append('-')
                    mut['MajFreq'].append(0.0); mut['WTFreq'].append(0.0)
                for field in ['N','QN']: mut[field].append(int(data[fields.index(field)]))
                for field in ['MajFreq','WTFreq']: mut[field].append(string.atof(data[fields.index(field)]))
                mut['Major'].append(data[fields.index('Major')])
        finally: MUTDATA.close()
        ## ~ [0c] Integrity check ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        #!# Need a new check with locus info: not all reference positions are present in the tables #!#
        ### ~ [1] Assess and output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        headers = ['Locus','Pos','Ref','WT.N','WT.QN','WT.Major','WT.MajFreq','Mut.N','Mut.QN','Mut.Major','Mut.MajFreq','Mut.WTFreq','p.Over','p.Under','p.Diff']
        nodifx = 0; nomutx = 0; sx = 0
        SAMSIG = open(statfile,'w')
        try:
            SAMSIG.write('%s\n' % string.join(headers,'\t'))
            for locus in rje.sortKeys(self.dict['RefSeq']):
                self.str['RefSeq'] = self.dict['RefSeq'][locus]
                self.list['WTMajor'] = self.dict['WTMajor'][locus]
                wt = wtdata[locus]; mut = mutdata[locus]
                for i in range(len(self.str['RefSeq'])):
                    # Positions beyond the end of any data list have no pdiff output
                    try:
                        sigdata = [locus,i+1,self.str['RefSeq'][i],wt['N'][i],wt['QN'][i],self.list['WTMajor'][i],wt['MajFreq'][i],
                                   mut['N'][i],mut['QN'][i],mut['Major'][i],mut['MajFreq'][i],mut['WTFreq'][i]]
                    except: self.warnLog('Incomplete data for %s:%s (no pdiff output)' % (locus,rje.iStr(i+1))); continue
                    if self.getBool('MajDif') and self.list['WTMajor'][i] == mut['Major'][i]: nodifx += 1; continue
                    elif self.getBool('MajMut') and self.str['RefSeq'][i] == mut['Major'][i]: nomutx += 1; continue
                    elif not wt['MajFreq'][i]:  # No Data for WT
                        if mut['WTFreq'][i]: sigdata += [0.0,1.0]
                        else: sigdata += [1.0,1.0]
                    elif mut['WTFreq'][i] > wt['MajFreq'][i]:
                        obs = int((mut['QN'][i] * mut['WTFreq'][i]) + 0.5)  # Rounded observed count
                        sigdata.append(rje.binomial(obs,mut['QN'][i],wt['MajFreq'][i],usepoisson=False,callobj=self))
                        sigdata.append(1.0)
                    elif mut['WTFreq'][i] < wt['MajFreq'][i]:
                        obs = int((mut['QN'][i] * mut['WTFreq'][i]) + 0.5)  # Rounded observed count
                        sigdata.append(1.0)
                        sigdata.append(1.0 - rje.binomial(obs+1,mut['QN'][i],wt['MajFreq'][i],usepoisson=False,callobj=self))
                    else: sigdata += [1.0,1.0]
                    sigdata.append(min(1.0,2*min(sigdata[-2:])))    # p.Diff from p.Over and p.Under
                    rje.writeDelimit(SAMSIG,sigdata); sx += 1
        finally: SAMSIG.close()
        ptxt = '%s lines output to *.pdiff.tdt' % rje.iStr(sx)
        if self.getBool('MajDif'): ptxt += '; %s positions skipped where WTMajor==MutMajor (majdif=T)' % rje.iStr(nodifx)
        if self.getBool('MajMut'): ptxt += '; %s positions skipped where Ref==MutMajor (majmut=T)' % rje.iStr(nomutx)
        self.printLog('#PDIFF','%s.' % ptxt)
        ### ~ [2] FDR Correction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.pileUpFDR()
    except: self.errorLog('%s.pileUpStats() error' % (self)); return None   # Bare except: file-wide convention for errorLog()
def summaryScores(self, rankdb=None, sumstr='taxasum', minsum='MinSum'):  ### Generates summary scores from rank table.
    '''
    Generates summary scores from rank table and saves them as a new table.

    >> rankdb:Table = table with one field per taxonomic rank (values '|'-separated) and a 'boot' field.
                      Defaults to the 'taxamap' table.
    >> sumstr:str = name of the summary table to create.
    >> minsum:str = name of the numeric setting (read with self.getNum) below which taxa are pooled
                    into a single 'Other' taxon for that rank.

    For each of genus, family, order, class and phylum, every entry contributes 1/len(taxa) to the count
    and boot/len(taxa) to the bootstrap-weighted count of each of its taxa that is not listed in
    self.list['TaxFilter']. Returns None. Errors are reported through self.errorLog().
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        db = self.db()
        if not rankdb: rankdb = self.db('taxamap')
        sumdb = db.addEmptyTable(sumstr, ['rank', 'taxon', 'count', 'bootwt', 'meanboot', 'perc', 'wtperc'], ['rank', 'taxon'])
        ranks = ['genus', 'family', 'order', 'class', 'phylum']
        ### ~ [2] Normalise to reduced levels ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for rank in ranks:
            self.printLog('\r#RANK', 'Normalising %s data.' % rank)
            taxsum = {}; ranksum = 0.0  # Summed counts for taxa and rank total
            taxwt = {}; wtsum = 0.0     # Bootstrap-weighted summed counts for taxa and rank total
            bootsum = {}; bootx = {}    # Sum and count of bootstrap values for mean boot numbers
            for entry in rankdb.entries():
                taxa = string.split(entry[rank], '|')
                share = 1.0 / len(taxa)             # Each entry is split equally between its taxa
                wtshare = entry['boot'] / len(taxa)
                for taxon in taxa:
                    if taxon in self.list['TaxFilter']: continue
                    if taxon not in taxsum:
                        taxsum[taxon] = 0.0
                        taxwt[taxon] = 0.0
                        bootsum[taxon] = 0.0
                        bootx[taxon] = 0
                    taxsum[taxon] += share
                    ranksum += share
                    bootsum[taxon] += entry['boot']
                    bootx[taxon] += 1
                    taxwt[taxon] += wtshare
                    wtsum += wtshare
            ## ~ [2a] Pool low-count taxa as "Other" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            otherx = 0
            for taxon in rje.sortKeys(taxsum):
                if taxon == 'Other': continue
                if taxsum[taxon] < self.getNum(minsum):
                    if 'Other' not in taxsum:
                        taxsum['Other'] = 0.0
                        taxwt['Other'] = 0.0
                        bootsum['Other'] = 0.0
                        bootx['Other'] = 0.0
                    taxsum['Other'] += taxsum.pop(taxon)
                    taxwt['Other'] += taxwt.pop(taxon)
                    bootsum['Other'] += bootsum.pop(taxon)
                    bootx['Other'] += bootx.pop(taxon)
                    otherx += 1
            self.printLog('#MINSUM', '%s %s taxa converted to "Other" (count < minsum=%s)' % (rje.iStr(otherx), rank, self.getNum(minsum)))
            ## ~ [2b] Add summary entries ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            for taxon in taxsum:
                # wtsum is zero when every contributing bootstrap value is zero: report 0% rather than
                # raising ZeroDivisionError and losing the whole table. (ranksum > 0 if taxsum has keys.)
                if wtsum: wtperc = 100.0 * taxwt[taxon] / wtsum
                else: wtperc = 0.0
                sumdb.addEntry({'rank': rank, 'taxon': taxon,
                                'count': rje.dp(taxsum[taxon], 1),
                                'perc': rje.sf(100.0 * taxsum[taxon] / ranksum),
                                'bootwt': rje.dp(taxwt[taxon], 1),
                                'meanboot': rje.dp(bootsum[taxon] / bootx[taxon], 3),
                                'wtperc': rje.sf(wtperc)})
        ### ~ [3] Rank taxa by counts such that highest is Rank 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        sumdb.rankFieldByIndex('rank', 'count', rev=True, absolute=True, lowest=True)
        sumdb.rankFieldByIndex('rank', 'bootwt', rev=True, absolute=True, lowest=True)
        ### ~ [4] Save to file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        sumdb.saveToFile()
    except: self.errorLog('%s.summaryScores error' % self.prog())   # Bare except: file-wide convention for errorLog()
def batchRun(self, returnobj=False):  ### Execute batch mode runs
    '''
    Execute batch mode runs: calls self.run() once per file in self.list['BatchRun'].

    >> returnobj:bool [False] = whether to return the objects returned by self.run() for successful
                                runs (True), or replace each of them with True (False).
    << list with one element per batch file (falsy for a failed run), or False if batch setup failed.

    For each file, the BatchArg setting is given that file as its value and one of basefile=X, log=X
    or newlog=F is added according to the BatchBase and BatchLog settings. self.cmd_list, self.log and
    self.list['BatchRun'] are restored to their starting values before returning.
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        barg = self.getStrLC('BatchArg')
        if not barg: raise ValueError('Cannot use batchrun=FILELIST if batcharg=None.')
        batchfiles = self.list['BatchRun'][0:]
        blog = self.getStr('BatchLog')
        if not blog.startswith('.'): blog = '.%s' % blog
        if not blog.endswith('.log'): blog = '%s.log' % blog
        rawcmd = self.cmd_list[0:]
        rawlog = self.log
        batchobj = []
        failx = 0
        self.list['BatchRun'] = []  # Avoid recursive running!
        try:
            ### ~ [1] Batch Run ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for bx, bfile in enumerate(batchfiles):
                self.printLog('#BATCH', 'Batch running %s of %s: %s=%s' % (rje.iStr(bx + 1), rje.iLen(batchfiles), barg, bfile))
                ## Setup parameters
                bbase = rje.baseFile(bfile, strip_path=True)
                bcmd = ['%s=%s' % (barg, bfile)]
                if self.getBool('BatchBase'):
                    if blog == '.log': bcmd += ['basefile=%s' % bbase]
                    else: bcmd += ['basefile=%s%s' % (bbase, rje.baseFile(blog))]
                elif self.getStrLC('BatchLog'): bcmd += ['log=%s%s' % (bbase, blog)]
                else: bcmd += ['newlog=F']
                ## Setup object with run-specific commands and log
                self.cmd_list = rawcmd + bcmd
                self.log = rje.setLog(rawlog.obj['Info'], self, self.cmd_list)
                ## Run, always handing the original log back afterwards
                try: runobj = self.run()
                finally: self.log = rawlog
                batchobj.append(runobj)
                ## Finish and Tidy
                if runobj:
                    if not returnobj: batchobj[-1] = True
                    info = runobj.log.obj['Info']
                    self.printLog('#RUN', '%s V%s run finished.' % (info.program, info.version))
                else:
                    # Count any falsy return (None as well as False) as a failure, matching the
                    # success test above.
                    failx += 1
                    self.warnLog('Batch run failed (%s=%s).' % (barg, bfile))
            ### ~ [2] Finish and Return ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.printLog('#BATCH', '%s batch runs complete: %s failed.' % (rje.iLen(batchfiles), rje.iStr(failx)))
            return batchobj
        finally:
            # Restore object state even when a run raises, so the object is not left carrying the
            # commands of the last batch file or an emptied batch list.
            self.cmd_list = rawcmd
            self.log = rawlog
            self.list['BatchRun'] = batchfiles
    except:     # Bare except: file-wide convention for errorLog()
        self.errorLog('%s.batchRun error' % self)
        return False
def parseMITAB(self):  ### Parse MITAB file into pairwise PPI table.
    '''
    Parse MITAB file into pairwise PPI table.

    Reads the open 'MITAB' file line by line via self.readDelimit(), using self.list['Headers'] to
    locate columns, and adds one entry per parsed interaction to the 'pairwise' table with fields
    Hub, Spoke, HubUni, SpokeUni, HubTaxID, SpokeTaxID, Evidence and IType. Lines are skipped when
    they fail the taxa filter, have no usable identifiers, or cannot be mapped by the XRef object.
    Interactions involving a complex identifier are then replaced by all pairwise combinations of
    the members of that complex.

    << the 'pairwise' table object, or None if an error was caught (reported via self.errorLog()).
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        xref = self.obj['XRef']
        pdb = self.db('pairwise')
        pfields = ['Hub','Spoke','HubUni','SpokeUni','HubTaxID','SpokeTaxID','Evidence','IType']
        headers = {}    # {column header:column index}
        for h, header in enumerate(self.list['Headers']): headers[header] = h
        dbsource = self.getStr('DBSource')
        rigid = 'irigid' in self.list['Headers'] and 'numParticipants' in self.list['Headers']
        ### ~ [2] Read through MITAB ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        mx = 0; ex = 0; fax = 0; ftx = 0; fx = 0; uhx = 0; usx = 0
        epos = self.endPos('MITAB')
        complexidlist = []      # Complex IDs in order of first appearance
        complexidset = set()    # Same IDs, for fast membership tests
        badtaxa = ['-']
        baduni = []             # UniprotKB IDs used without being mapped (AddUni fallback)
        while 1:
            self.progLog('\r#MITAB','Parsing %s MITAB %s: %s lines; %s ppi; %s taxa-filtered; %s ambiguous; %s failed; %s complexes.' % (dbsource,self.fileProg('MITAB',epos),rje.iStr(mx),rje.iStr(ex),rje.iStr(ftx),rje.iStr(fax),rje.iStr(fx),rje.iLen(complexidlist)))
            mline = self.readDelimit('MITAB')
            if not mline: break
            mx += 1     # Counted after the end-of-file test so the final empty read is not reported as a line
            entry = {'#':pdb.entryNum()}
            for field in pfields: entry[field] = ''
            ## ~ [2a] Add iRefIndex complexes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            complexid = {}  # This will take the first complex ID
            if rigid:
                if int(mline[headers['numParticipants']]) > 2:
                    complexid['A'] = complexid['B'] = 'rigid:%s' % mline[headers['irigid']]
            ## ~ [2b] Parse and check taxa ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            taxa = {'A':'','B':''}
            for tfield in self.list['TaxaField']:
                ab = tfield[-1:].upper()    # Field names end in A/B, optionally followed by " (...)"
                if ab == ')': ab = rje.matchExp(r'([AB]) \(\S+\)$',tfield.upper())[0]
                try:
                    taxon = rje.matchExp(r'^taxid:(\d+)',mline[headers[tfield]].lower())[0]
                    if self.list['TaxID'] and taxon not in self.list['TaxID']: continue
                    taxa[ab] = taxon
                except:
                    taxon = mline[headers[tfield]]
                    if taxon not in badtaxa:
                        badtaxa.append(taxon)
                        self.warnLog('No TaxID read from %s: "%s"' % (tfield,taxon),'no_tax',suppress=True)
                    if not self.list['TaxID']: taxa[ab] = '-'
            if not taxa['A'] and complexid: taxa['A'] = taxa['B']
            if not taxa['B'] and complexid: taxa['B'] = taxa['A']
            if not (taxa['A'] and taxa['B']): ftx += 1; continue
            ## ~ [2c] Parse protein IDs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            ids = {'A':[],'B':[]}
            uni = {'A':'','B':''}
            for ifield in self.list['IDField']:
                ab = ifield[-1:].upper()
                if ab == ')': ab = rje.matchExp(r'([AB]) \(\S+\)$',ifield.upper())[0]
                # Split IDs on | then db:id vs self.list['MapDB']
                for pid in string.split(mline[headers[ifield]],'|'):
                    try: (db,dbid) = string.split(pid,':',1)
                    except ValueError: continue     # No "db:id" structure
                    isuni = db.lower() in ['uniprotkb']
                    if isuni and '(' in dbid: continue  # Only map uniprotkb accnum
                    dbid = string.split(dbid,'(')[0]
                    dbid = string.split(dbid,';')[0]
                    if isuni:
                        svid = dbid     # Accession including any splice variant suffix
                        dbid = string.split(svid,'-')[0]
                    if ab not in complexid:     # First identifier for A/B
                        if db.lower() in self.list['Complex']: complexid[ab] = pid; ids[ab].append(pid)
                        else: complexid[ab] = ''
                    if not self.list['MapDB'] or db.lower() in self.list['MapDB']: ids[ab].append(dbid)
                    # Parse uniprot directly if possible
                    if isuni and not uni[ab]:
                        if self.getBool('SpliceVar'): uni[ab] = svid
                        else: uni[ab] = dbid
            ## ~ [2d] Map parsed IDs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            amb = {'A':False,'B':False}     # Whether xref mapping returned False for any ID
            if not ids['A'] or not ids['B']: fx += 1; continue
            # Hub
            for ida in ids['A']:
                if not entry['Hub']: entry['Hub'] = xref.xref(ida,unique=True,usedict=True)
                if entry['Hub'] == False: amb['A'] = True
                if not entry['HubUni']: entry['HubUni'] = self.getUniXRef(ida)
            if self.getBool('AddUni') and not entry['HubUni']:
                entry['HubUni'] = uni['A']
                if uni['A'] and uni['A'] not in baduni: baduni.append(uni['A'])
            if not entry['Hub'] and entry['HubUni']:
                entry['Hub'] = entry['HubUni']
                uhx += 1
            if not entry['Hub'] and complexid['A']: entry['Hub'] = complexid['A']
            else: complexid['A'] = ''
            if self.getBool('UniOnly') and not complexid['A'] and not entry['HubUni']: entry['Hub'] = ''
            # Spoke
            for idb in ids['B']:
                if not entry['Spoke']: entry['Spoke'] = xref.xref(idb,unique=True,usedict=True)
                if entry['Spoke'] == False: amb['B'] = True
                if not entry['SpokeUni']: entry['SpokeUni'] = self.getUniXRef(idb)
            if self.getBool('AddUni') and not entry['SpokeUni']:
                entry['SpokeUni'] = uni['B']
                # Recorded only when the unmapped ID is actually used, mirroring the Hub handling.
                # NOTE(review): SOURCE has this after the UniOnly check; its nesting there is ambiguous - confirm.
                if uni['B'] and uni['B'] not in baduni: baduni.append(uni['B'])
            if not entry['Spoke'] and entry['SpokeUni']:
                entry['Spoke'] = entry['SpokeUni']
                usx += 1
            if not entry['Spoke'] and complexid['B']: entry['Spoke'] = complexid['B']
            else: complexid['B'] = ''
            if self.getBool('UniOnly') and not complexid['B'] and not entry['SpokeUni']: entry['Spoke'] = ''
            # Complex:complex pairs cannot be converted; only non-rigid ones are reported
            # NOTE(review): assumes both rigid and non-rigid complex:complex pairs are dropped - confirm.
            if complexid['A'] and complexid['B']:
                if not (complexid['A'].startswith('rigid:') and complexid['B'].startswith('rigid:')):
                    self.printLog('\r#MITAB','',log=False)
                    self.warnLog('Cannot parse complex:complex PPI (%s & %s)' % (complexid['A'],complexid['B']),'complex-complex',suppress=True)
                entry['Hub'] = entry['Spoke'] = ''
            if not (entry['Hub'] and entry['Spoke']):
                if (entry['Hub'] or amb['A']) and (entry['Spoke'] or amb['B']): fax += 1; continue
                fx += 1; continue
            entry['HubTaxID'] = taxa['A']
            entry['SpokeTaxID'] = taxa['B']
            for ab in ['A','B']:
                if complexid[ab] and complexid[ab] not in complexidset:
                    complexidset.add(complexid[ab]); complexidlist.append(complexid[ab])
            ## ~ [2e] Parse evidence ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            evidence = []
            for tfield in self.list['MethodField']:
                for etype in string.split(mline[headers[tfield]],'|'):
                    ematch = rje.matchExp(r'MI:\d+"?\((.+)\)',etype)
                    if ematch: evidence.append('%s:%s' % (dbsource,ematch[0]))
            if not evidence: evidence.append('%s:unknown' % (self.getStr('DBSource')))
            evidence = rje.sortUnique(evidence)
            entry['Evidence'] = string.join(evidence,'|')
            ## ~ [2f] Parse interaction types ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            itypes = []
            for tfield in self.list['TypeField']:
                for etype in string.split(mline[headers[tfield]],'|'):
                    ematch = rje.matchExp(r'MI:\d+"?\((.+)\)',etype)
                    if ematch: itypes.append(ematch[0])
            if not itypes: itypes.append('unknown')
            itypes = rje.sortUnique(itypes)
            entry['IType'] = string.join(itypes,'|')
            pdb.addEntry(entry); ex += 1
            if self.dev() and entry['Hub'] in ['KLF3']: self.printLog('#DEV',string.join(mline,'\t'))
            if self.getBool('Symmetry') and not complexid['A'] and not complexid['B']:
                pdb.addEntry({'#':pdb.entryNum(),'Hub':entry['Spoke'],'Spoke':entry['Hub'],
                              'HubUni':entry['SpokeUni'],'SpokeUni':entry['HubUni'],
                              'HubTaxID':entry['SpokeTaxID'],'SpokeTaxID':entry['HubTaxID'],
                              'Evidence':entry['Evidence'],'IType':entry['IType']})
        self.printLog('\r#MITAB','Parsing %s MITAB complete: %s lines; %s ppi; %s taxa-filtered; %s ambiguous; %s failed; %s complexes.' % (dbsource,rje.iStr(mx),rje.iStr(ex),rje.iStr(ftx),rje.iStr(fax),rje.iStr(fx),rje.iLen(complexidlist)))
        self.close('MITAB')
        if (uhx+usx): self.warnLog('UniprotKB IDs used for %s Hub and %s Spoke IDs.' % (rje.iStr(uhx),rje.iStr(usx)))
        if baduni:
            baduni.sort()
            accout = '%s.%s.unmapped.uniacc' % (self.baseFile(),dbsource)
            self.warnLog('%s unmapped UniprotKB IDs used: output to %s.' % (rje.iLen(baduni),accout))
            ACCOUT = open(accout,'w')
            try: ACCOUT.write(string.join(baduni,'\n'))
            finally: ACCOUT.close()     # Close explicitly rather than relying on garbage collection
        ### ~ [3] Convert complexes to pairwise PPIs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not complexidlist: return pdb
        self.printLog('#CPLEX','%s complex IDs parsed to convert to pairwise PPI.' % rje.iLen(complexidlist))
        ## ~ [3a] Assemble complex memberships ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        complexes = {}; chentries = []; csentries = []
        cevidence = {}  # List of Evidence for each complex
        citypes = {}    # List of ITypes for each complex
        ctaxa = {}      # {member:TaxID}
        ex = 0.0; etot = pdb.entryNum()
        for entry in pdb.entries():
            self.progLog('\r#CPLEX','Assembling complexes: %.1f%%' % (ex/etot)); ex += 100.0
            if entry['Hub'] in complexidset:
                cid = entry['Hub']
                if cid not in complexes: complexes[cid] = []; cevidence[cid] = []; citypes[cid] = []
                complexes[cid].append(entry['Spoke'])
                ctaxa[entry['Spoke']] = entry['SpokeTaxID']
                cevidence[cid].append(entry['Evidence'])
                citypes[cid].append(entry['IType'])
                chentries.append(entry)
            elif entry['Spoke'] in complexidset:
                cid = entry['Spoke']
                if cid not in complexes: complexes[cid] = []; cevidence[cid] = []; citypes[cid] = []
                complexes[cid].append(entry['Hub'])
                ctaxa[entry['Hub']] = entry['HubTaxID']
                cevidence[cid].append(entry['Evidence'])
                citypes[cid].append(entry['IType'])
                csentries.append(entry)
        self.printLog('\r#CPLEX','Assembled %s of %s complexes.' % (rje.iLen(complexes),rje.iLen(complexidlist)))
        ## ~ [3b] Update complexes dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        cppi = {}       # {hub:{spoke:[complex IDs]}}
        ex = 0.0; etot = len(complexes); rx = 0; px = 0; cmax = 0
        for cid in rje.sortKeys(complexes):
            self.progLog('\r#CPLEX','Reducing complexes: %.1f%%' % (ex/etot)); ex += 100.0
            if self.dev(): self.printLog('#DEV','Complex %s: %s' % (cid,complexes[cid]))
            if len(complexes[cid]) < 2:     # No pairs to make from a single member
                complexes.pop(cid)
                cevidence.pop(cid)
                citypes.pop(cid)
                rx += 1; continue
            complexes[cid].sort()
            cmax = max(cmax,len(complexes[cid]))
            members = complexes[cid][0:]
            while members:  # Pair each member with all members sorted after it
                hub = members.pop(0)
                if self.dev() and hub == 'KLF3': self.debug(cid)
                if hub not in cppi: cppi[hub] = {}
                for spoke in members:
                    if spoke not in cppi[hub]: cppi[hub][spoke] = []; px += 1
                    cppi[hub][spoke].append(cid)
        self.printLog('\r#CPLEX','Reduced %s complexes to %s > 1 member: %s ppi to add.' % (rje.iStr(etot),rje.iLen(complexes),rje.iStr(px)))
        ## ~ [3c] Update pairwise PPI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        cix = pdb.entryNum()
        for centry in chentries + csentries: pdb.dropEntry(centry)
        ex = 0.0; etot = len(cppi)
        for hub in rje.sortKeys(cppi):
            self.progLog('\r#CPLEX','Expanding complexes: %.1f%%' % (ex/etot)); ex += 100.0
            hentry = {'Hub':hub,'HubUni':self.getUniXRef(hub),'HubTaxID':ctaxa[hub]}
            for spoke in rje.sortKeys(cppi[hub]):
                evidence = []
                itypes = []
                ctypes = []
                for cid in cppi[hub][spoke]:
                    evidence += cevidence[cid]
                    itypes += citypes[cid]
                    # append(), not +=: adding a string with += would add its individual characters
                    ctypes.append(string.split(cid,':')[0])
                ctype = string.join(rje.sortUnique(ctypes),'|')
                evidence = string.join(rje.sortUnique(evidence),'|')
                if not evidence: evidence = '%s:%s' % (dbsource,ctype)
                itypes = string.join(rje.sortUnique(itypes),'|')
                if not itypes: itypes = ctype
                newentry = {'#':cix,'Spoke':spoke,'SpokeUni':self.getUniXRef(spoke),'SpokeTaxID':ctaxa[spoke]}
                newentry['Evidence'] = evidence
                newentry['IType'] = itypes
                entry = pdb.addEntry(rje.combineDict(newentry,hentry,overwrite=False)); cix += 1
                if self.dev() and entry['Hub'] in ['KLF3','WDR5']: self.debug('Complex: %s' % entry)
                if self.getBool('Symmetry'):
                    pdb.addEntry({'#':cix,'Hub':entry['Spoke'],'Spoke':entry['Hub'],
                                  'HubUni':entry['SpokeUni'],'SpokeUni':entry['HubUni'],
                                  'HubTaxID':entry['SpokeTaxID'],'SpokeTaxID':entry['HubTaxID'],
                                  'Evidence':entry['Evidence'],'IType':entry['IType']})
                    cix += 1
        self.printLog('#CPLEX','%s complex IDs expanded to pairwise PPI => %s ppi (symmetry=%s).' % (rje.iLen(complexidlist),rje.iStr(pdb.entryNum()),self.getBool('Symmetry')))
        return pdb
    except: self.errorLog('%s.parseMITAB error' % self.prog())  # Bare except: file-wide convention for errorLog()
def _digest(self):  ### Main digestion of sequences and population of results database
    '''
    Main digestion of sequences and population of results database.

    For each protease combination returned by self.protCombo() ('+'-joined protease names):
    1. Every sequence in self.obj['SeqList'] is cut at the sites listed in the module-level `proteases`
       dictionary, recording which peptides occur more than once and the maximum cysteine count.
    2. Each sequence is digested again and one entry is added to the 'ProDigIS' table with peptide counts
       by length (and by 100 Da mass bin if PepMWt is set), counts after the NTerm, MinPepLen and NRPep
       filters, optional per-cysteine-count tallies (CysCount), and expected counts and rje.logPoisson()
       values derived from the 'PepProb' table.

    Returns None. Errors are reported through self.errorLog().
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        db = self.db('ProDigIS')
        prot_combo = self.protCombo()
        seqlist = self.obj['SeqList']
        ## ~ [1a] ~ Peptide Probability Dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        pdb = self.db('PepProb'); pdict = {}    # {PepSize:entry} or {PepSize:{CysCount:entry}}
        if pdb:
            if self.getBool('CysWeight'):
                for plen in pdb.index('PepSize').keys(): pdict[plen] = {}
                for entry in pdb.entries(): pdict[entry['PepSize']][entry['CysCount']] = entry
            else:
                for entry in pdb.entries(): pdict[entry['PepSize']] = entry
        self.deBug(self.int)
        for prot in prot_combo:
            protlist = string.split(prot,'+')
            ### ~ [2] Identify redundant peptides across all sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            # Sets are used because these collections are only tested for membership; list lookups
            # made this step quadratic in the total number of peptides.
            allpep = set(); redundant = set(); maxcys = 0
            sx = 0.0; stot = seqlist.seqNum()
            for seq in seqlist.seqs():
                self.progLog('\r#DIG','%s Digesting sequences: %.2f%%' % (prot,sx/stot)); sx += 100.0
                sequence = seq.getSequence()
                for protease in protlist:
                    # Each cut is a recognition site containing ":" at the cleavage point
                    for cut in proteases[protease]: sequence = string.join(string.split(sequence,string.replace(cut,':','')),cut)
                for frag in string.split(sequence,':'):
                    if frag in allpep: redundant.add(frag)
                    else: allpep.add(frag); maxcys = max(maxcys,frag.count('C'))
            self.printLog('\r#DIG','%s Digesting %s sequences complete.' % (prot,rje.iStr(stot)))
            if self.getBool('CysCount'):
                for c in range(maxcys+1): db.addField('Cys%d' % c)
            ### ~ [3] Process each sequence in turn ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            sx = 0.0; stot = seqlist.seqNum()
            for seq in seqlist.seqs():
                self.progLog('\r#DIG','%s Digesting sequences: %.2f%%' % (prot,sx/stot)); sx += 100.0
                acc = seq.getStr('AccNum')
                ## ~ [3a] ~ Create new database entry to fill with data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                entry = {'AccNum':acc,'Protease':prot}
                for i in range(1,self.getInt('MaxPepLen')+1):
                    entry[i] = 0
                    if self.getBool('PepMWt'): entry[i*100.0] = 0
                sequence = seq.getSequence()
                ## ~ [3b] ~ For each recognition site of each protease, mark cuts with ":" ~~~~~~~~~~~~~~~~ ##
                for protease in protlist:
                    for cut in proteases[protease]: sequence = string.join(string.split(sequence,string.replace(cut,':','')),cut)
                ## ~ [3c] ~ Cut into fragments and filter ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                frag = [pep for pep in string.split(sequence,':') if pep != '']
                self.deBug(frag)
                entry['PepCount'] = len(frag)
                if not self.getBool('NTerm'): frag = frag[1:]
                minpeplen = self.getInt('MinPepLen')
                if minpeplen > 0: frag = [pep for pep in frag if len(pep) >= minpeplen]
                entry['MinPepLen'] = len(frag)
                if self.getBool('NRPep'): frag = [pep for pep in frag if pep not in redundant]
                entry['NRPep'] = len(frag)
                if self.getBool('CysCount'):
                    for c in range(maxcys+1): entry['Cys%d' % c] = 0
                    for pep in frag: entry['Cys%d' % pep.count('C')] += 1
                if pdict: entry['LenExp'] = 0.0; entry['MWtExp'] = 0.0; entry['Len7Exp'] = 0.0
                ## ~ [3d] ~ Process fragments ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                for pep in frag:
                    plen = min(len(pep),self.getInt('MaxPepLen'))   # Longer peptides are pooled at MaxPepLen
                    self.deBug('"%s" -> %d' % (pep,plen))
                    entry[plen] += 1
                    if pdict:
                        if self.getBool('CysWeight'):
                            try: pprob = pdict[plen][pep.count('C')]['Prob']
                            except: pprob = 0.0     # No probability for this length/cysteine combination
                        else: pprob = pdict[plen]['Prob']
                        entry['LenExp'] += pprob
                        if 7 <= plen: entry['Len7Exp'] += pprob
                    if self.getBool('PepMWt'):
                        # Mass bin: weight rounded up to the next 100, capped at MaxPepLen bins
                        pwt = 100.0 * min(int((rje_sequence.MWt(pep)+99)/100.0),self.getInt('MaxPepLen'))
                        entry[pwt] += 1
                        if pdict: entry['MWtExp'] += pprob
                # NOTE(review): these read the *Exp fields, which are only created when pdict is populated.
                entry['Len3'] = rje.logPoisson(3,entry['LenExp'],callobj=self)
                if self.getBool('PepMWt'): entry['MWt3'] = rje.logPoisson(3,entry['MWtExp'],callobj=self)
                entry['Len5'] = rje.logPoisson(5,entry['LenExp'],callobj=self)
                if self.getBool('PepMWt'): entry['MWt5'] = rje.logPoisson(5,entry['MWtExp'],callobj=self)
                entry['Len37'] = rje.logPoisson(3,entry['Len7Exp'],callobj=self)
                db.addEntry(entry)
            self.printLog('\r#DIG','%s Digesting %s sequences complete.' % (prot,rje.iStr(stot)))
    except: self.errorLog('%s._digest error' % self)    # Bare except: file-wide convention for errorLog()
def sgd2sp(self):  ### Reformats yeast sequence names and outputs new data for GOPHER
    '''
    Reformats yeast sequence names and outputs new data for GOPHER.

    Sequences in self.obj['SeqList'] with species code YEAST are renamed using the 'XRef' table (looked
    up on its 'EnsG' index by sequence accession). When a cross-referenced UniProt entry with an
    identical sequence is found, the first word of the name becomes UniprotID__AccNum and the mapping
    is stored in self.dict['Rename']. All sequences are saved to *.ygob.fas and the yeast subset to
    *.yeast.fas if those files do not already exist. The yeast accession list is stored in
    self.list['YeastSeq']. Errors are logged and then re-raised.
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seqlist = self.obj['SeqList']
        uniprot = rje_uniprot.UniProt(self.log, self.cmd_list + ['datout=None'])
        xdb = self.db('XRef')
        self.dict['Rename'] = {}
        ## ~ [1a] ~ Check or Make UniProt extraction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        datfile = '%s.dat' % self.info['Basefile']
        if not os.path.exists(datfile) or self.opt['Force']:
            uniprot.readUniProt(clear=True, acclist=rje.sortKeys(xdb.index('UniProt')), cleardata=False)
            uniprot.saveUniProt(datfile)
        else:
            uniprot.readUniProt(datfile, clear=True, cleardata=False)
        ## ~ [1b] ~ Make dictionary of UniProt sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        uniseq = {}     # {UniProt accession:sequence object}
        for uentry in uniprot.entries():
            useqobj = uentry.obj['Sequence']
            uniseq[useqobj.info['AccNum']] = useqobj
        self.printLog('\r#USEQ', '%s UniProt Sequences extracted (%s Ensembl AccNum)' % (rje.iStr(len(uniseq)), rje.iStr(len(xdb.index('UniProt')))))
        ### ~ [2] ~ Reformat sequences and save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        yeastseqs = []  # List of YEAST sequence objects
        done = 0.0
        total = seqlist.seqNum()
        for seq in seqlist.seqs():
            self.progLog('\r#SEQ', 'Reformatting sequence names: %.2f%%' % (done / total))
            done += 100.0
            if seq.info['SpecCode'] != 'YEAST':
                continue
            yeastseqs.append(seq)
            sgd = seq.info['AccNum']
            newname = seq.info['Name']
            try:
                for x in xdb.indexEntries('EnsG', sgd):
                    acc = x['UniProt']
                    # No UniProt accession: annotate only and try the next cross-reference
                    if not acc:
                        newname = '%s [Gene:%s EnsG:%s SGD:%s AccNum:-]' % (seq.info['Name'], x['Gene'], x['EnsG'], x['SGD'])
                        continue
                    newname = '%s [Gene:%s EnsG:%s SGD:%s AccNum:%s]' % (seq.info['Name'], x['Gene'], x['EnsG'], x['SGD'], acc)
                    if acc not in uniseq:
                        self.printLog('\r#UNIERR', 'Unable to find UniProt sequence %s (%s)' % (acc, sgd))
                        continue
                    if uniseq[acc].info['Sequence'] != seq.info['Sequence']:
                        self.printLog('\r#SEQERR', '%s sequence <> %s sequence' % (sgd, acc))
                        continue
                    # Identical UniProt sequence found: adopt its ID and accession as the first word
                    words = string.split(newname)
                    words[0] = '%s__%s' % (x['UniprotID'], acc)
                    newname = string.join(words)
                    self.dict['Rename'][sgd] = acc
                    break
            except:
                self.errorLog('%s problem' % sgd)
            seq.info['Name'] = newname
            seq.extractDetails(gnspacc=True)
        self.printLog('\r#SEQ', 'Reformatting sequence names complete.')
        ## ~ [2a] ~ Save renamed sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        ygobfile = '%s.ygob.fas' % self.info['Basefile']
        if not rje.exists(ygobfile):
            seqlist.saveFasta(seqfile=ygobfile)
        yeastfile = '%s.yeast.fas' % self.info['Basefile']
        if not rje.exists(yeastfile):
            seqlist.saveFasta(seqs=yeastseqs, seqfile=yeastfile)
        self.list['YeastSeq'] = seqlist.accList(yeastseqs)
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        raise   # Delete this if method error not terrible
def parsePileup(self,tname,filename,wtdb=None):  ### Extracts, filters and processes PileUp data
    '''
    Extracts, filters and processes PileUp data.
    Each whitespace-split input line should be: locus, position, reference, no. reads, read data, qualscores.
    For each position, the read string is split into per-read alleles, reads below the QCut quality are removed
    and the major allele and its frequency are written to BASEFILE.tname.tdt. A tally of quality scores is saved
    to BASEFILE.tname.QC.tdt. Parsing stops at the first blank line.
    >> tname:str = Name for the new table; also used in the output filenames.
    >> filename:str = Pileup file to parse.
    >> wtdb:Table [None] = If given, a WTFreq field is added, using self.dict['WTMajor'][locus][Pos-1].
    << table:Table = the (empty) table whose fields define the output columns, or None if an error occurred.
    '''
    try:### ~ [1] Setup ~ ###
        table = self.db().addEmptyTable(tname,['Locus','Pos','Seq','N','QN','Major','MajFreq'],keys=['Locus','Pos'])
        qc = []     # qc[i] = number of reads with quality score i+1
        if wtdb: table.addField('WTFreq')
        qcut = self.getInt('QCut')
        minfreq = self.getNum('MinFreq')
        px = 0; ex = 0
        locus = None
        PILEUP = None; PILEOUT = None
        try:    # Guarantees that both file handles are closed, even if parsing fails part way through
            PILEUP = open(filename,'r')
            PILEOUT = open('%s.%s.tdt' % (self.baseFile(),tname),'w')
            rje.writeDelimit(PILEOUT,outlist=table.fields(),delimit='\t')
            ### ~ [2] Process each entry ~ ###
            for line in PILEUP:
                data = rje.chomp(line).split()
                if not data: break
                self.progLog('\r#PARSE','Parsing %s: %s pos...' % (filename,rje.iStr(px)),rand=0.01); px += 1
                ## ~ [2a] Extract Read Data ~ ##
                entry = {'Locus':data[0],'Pos':int(data[1]),'Seq':data[2],'N':int(data[3]),'QN':0}
                locus = entry['Locus']
                rseq = data[4]
                reads = []
                delx = 0    # Deletions are stored as extra "reads", so are counted for the length check
                while rseq:
                    try:
                        if rseq[:1] in ['.',',']: reads.append(entry['Seq']); rseq = rseq[1:]     # Reference match
                        elif rseq[:1] == '^': rseq = rseq[2:]   # Read start: skip marker and the following character
                        elif rseq[:1] in ['-','+']:             # Indel: sign, length, then that many nucleotides
                            ilen = int(rje.matchExp(r'^(\d+)',rseq[1:])[0])
                            indel = rseq[len('%s' % ilen)+1:][:ilen]
                            if rseq[:1] == '-':
                                delx += 1
                                reads.append(rseq[:len('%s' % ilen)+ilen+1].upper())
                            else: reads[-1] += indel.upper()    # Insertions are appended to the preceding read
                            rseq = rseq[len('%s' % ilen)+ilen+1:]
                        elif rseq[:1] in ['$']: rseq = rseq[1:] # Read end marker
                        else:
                            if rseq[0].upper() not in 'ATGCN*': print(' ??? %s ???' % rseq[0].upper())
                            reads.append(rseq[0].upper()); rseq = rseq[1:]
                    except:
                        self.errorLog('!')
                        self.deBug(rseq)
                        raise ValueError
                if len(reads) != (entry['N'] + delx):
                    self.deBug('%s = %d' % (data[4],entry['N']))
                    self.deBug('%s = %d' % (reads,len(reads)))
                    self.errorLog('Read versus Read Count mismatch for %s Pos %s' % (table.name(),entry['Pos']),printerror=False)
                    raise ValueError
                ## ~ [2b] Convert Quality Scores ~ ##
                qual = []
                for q in data[5]:
                    # Gaps do not have a quality score, so fill these in first
                    while len(qual) < len(reads) and reads[len(qual)][0] == '-': qual.append(qcut)
                    # Then append actual qv
                    qv = ord(q) - 33
                    qual.append(qv)
                    # The QC tally starts at quality 1: a score of 0 has no bin and used to hit qc[-1]
                    if qv > 0:
                        qc += [0] * (qv - len(qc)); qc[qv-1] += 1
                while len(qual) < len(reads) and reads[len(qual)][0] == '-': qual.append(qcut)
                while '*' in reads: reads[reads.index('*')] = '-'
                if len(reads) != len(qual):
                    self.deBug('%s = %d' % (reads,len(reads)))
                    self.deBug('%s = %d' % (qual,len(qual)))
                    self.deBug(data)
                    self.errorLog('Read versus Quality length mismatch for %s Pos %s' % (table.name(),entry['Pos']),printerror=False)
                    raise ValueError
                ## ~ [2c] Filter low quality ~ ##
                if entry['Pos'] in [190359]:
                    self.deBug(qual)
                    self.deBug(reads)
                    self.deBug(qc)
                # Keep only the reads (and matching scores) that meet the QV cutoff
                keep = [r for r in range(len(qual)) if qual[r] >= qcut]
                reads = [reads[r] for r in keep]
                qual = [qual[r] for r in keep]
                entry['QN'] = len(reads)
                ## ~ [2d] Major Allele ~ ##
                alleles = {}    # Dictionary of {nt:count}
                if reads: major = reads[0]
                else: major = '-'; alleles[major] = 0
                # Cycle through reads. Keep most abundant allele as major - or reference allele if tied.
                for read in reads:
                    if read in alleles: alleles[read] += 1
                    else: alleles[read] = 1
                    if alleles[read] > alleles[major] or (read == entry['Seq'] and alleles[read] == alleles[major]): major = read
                entry['Major'] = major
                if reads: entry['MajFreq'] = 1.0 - max(minfreq,(len(reads) - alleles[major]) / float(len(reads)))
                else: entry['MajFreq'] = 0.0
                if wtdb:
                    try:
                        wtmajor = self.dict['WTMajor'][locus][entry['Pos']-1]
                        if wtmajor in alleles and reads: entry['WTFreq'] = 1.0 - max(minfreq,(len(reads) - alleles[wtmajor]) / float(len(reads)))
                        else: entry['WTFreq'] = 0.0
                        if wtmajor != major: self.debug(entry)
                        elif locus == 'chrIV_S288C__BK006938.2' and entry['Pos'] == 271733: self.debug(entry)
                    except: self.warnLog('WTFreq Error (%s:Pos=%d) [Probably no WT read mapped]' % (locus,entry['Pos'])); entry['WTFreq'] = 0.0
                if entry['Pos'] in [190359]:
                    self.deBug(qual)
                    self.deBug(reads)
                    self.deBug(alleles)
                    self.deBug(entry)
                    self.deBug(line)
                # Entries are written straight to file rather than added to the table
                outlist = [entry[field] for field in table.fields()]
                rje.writeDelimit(PILEOUT,outlist,delimit='\t'); ex += 1
            self.printLog('\r#PARSE','Parsed %s: %s entries from %s lines.' % (filename,rje.iStr(ex),rje.iStr(px)))
        finally:
            if PILEOUT: PILEOUT.close()
            if PILEUP: PILEUP.close()
        ### ~ [3] Save QC ~ ###
        QC = open('%s.%s.QC.tdt' % (self.baseFile(),tname),'w')
        try:
            QC.write('Qual\tCount\n')
            for q in range(len(qc)):
                try: QC.write('%d\t%d\n' % (q+1,qc[q]))
                except: self.errorLog('!')
        finally: QC.close()
        return table
    except: self.errorLog('%s.parsePileup(%s) error' % (self,filename)); return None
def seqSubset2(self):      ### Extracts sequence subset from MOUSE cDNA and Peptide libraries
    '''
    Extracts sequence subset from MOUSE cDNA and Peptide libraries.
    - Loads BASEFILE.map.tdt if present; otherwise maps the genes of the 'starts' table onto MGI genes and EnsEMBL
      genes, extracts matching sequences to Ingolia.cdna.all.fa / Ingolia.pep.all.fa and builds the 'map' table.
    - Adds the matching transcript IDs from Ingolia.cdna.all.fa to the map table ('ENST' field) if missing.
    - For each 'starts' entry, finds the transcript whose sequence matches the initiation context at the given
      codon position and gives the longest peptide, then writes it to IngExact.cdna.all.fa / IngExact.pep.all.fa
      and saves the table to BASEFILE.mapped_exact.tdt.
    Errors are logged, not raised.
    '''
    try:
        ### ~ [1] Setup ~ ###
        mapfile = '%s.map.tdt' % self.baseFile()
        if os.path.exists(mapfile):
            mdb = self.db().addTable(mapfile, mainkeys=['Ingolia'], name='map')
        else:
            ### ~ [2] Load Mouse Data ~ ###
            xfile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.data.tdt'
            self.db().addTable(xfile, mainkeys=['Gene'], name='xref')   # Was db.addTable(): db was never defined
            afile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.alias.tdt'
            self.obj['Map'] = rje_genemap.GeneMap(self.log, self.cmd_list)
            self.obj['Map'].loadData(['sourcedata=%s' % xfile, 'aliases=%s' % afile])
            ing_genes = ' '.join(self.db('starts').index('Gene').keys()).upper().split()
            genemap = self.obj['Map']
            ing_map = {}    # {Ingolia gene: best mapped gene}
            for gene in ing_genes: ing_map[gene] = genemap.bestMap(gene)
            ing_mgi = rje.sortUnique(list(ing_map.values()))
            self.printLog('#MUSG', '%s Ingolia genes mapped onto %s MGI genes' % (rje.iLen(ing_genes), rje.iLen(ing_mgi)))
            xdb = self.db('xref')
            bad_genes = []
            for gene in ing_mgi[0:]:    # Iterate over a copy: unmapped genes are removed from ing_mgi
                if gene not in xdb.data():
                    self.printLog('#MAP', 'Cannot map gene "%s" from Ingolia data!' % gene)
                    bad_genes.append(gene)
                    ing_mgi.remove(gene)
            self.printLog('#BAD', 'Failed to map %s genes from Ignolia' % rje.iLen(bad_genes))
            BAD = open('ingolia.bad.txt', 'w')
            try: BAD.write(' '.join(bad_genes))
            finally: BAD.close()
            ### ~ [2] EnsEMBL subset ~ ###
            ing_musg = xdb.dataList(xdb.entryList(ing_mgi), 'EnsEMBL', sortunique=True)
            if '' in ing_musg: ing_musg.remove('')
            self.printLog('#MUSG', '%s Ingolia genes mapped onto %s EnsEMBL genes' % (rje.iLen(ing_genes), rje.iLen(ing_musg)))
            if not ing_musg: raise ValueError
            self.deBug(ing_musg[:10])
            for stype in ['cdna', 'pep']:
                seqfile = '../MOUSE/Mus_musculus.NCBIM37.66.%s.all.fa' % stype
                seqout = 'Ingolia.%s.all.fa' % stype
                # Regenerate when forced or when the OUTPUT is missing (the check was previously on the input file)
                if self.getBool('Force') or not os.path.exists(seqout):
                    seqcmd = self.cmd_list + ['seqin=%s' % seqfile, 'seqout=%s' % seqout, 'autofilter=T', 'autoload=T',
                                              'seqmode=file', 'gooddesc=%s' % ','.join(ing_musg)]
                    rje_seqlist.SeqList(self.log, seqcmd)
            # Key field must match the 'Ingolia' field name (was misspelt 'Ignolia')
            mdb = self.db().addEmptyTable('map', ['Ingolia', 'Gene', 'EnsEMBL'], ['Ingolia'])
            for gene in ing_map:
                entry = {'Ingolia': gene, 'Gene': ing_map[gene]}
                if entry['Gene'] in bad_genes: entry['EnsEMBL'] = ''
                else: entry['EnsEMBL'] = xdb.data()[ing_map[gene]]['EnsEMBL']
                mdb.addEntry(entry)
        ## ~ Map transcripts onto genes ~ ##
        # NOTE(review): original nesting was lost. iseq is needed by section [3] whichever branch ran above, so it
        # is always loaded here; transcripts are only mapped when the table has no ENST field yet - confirm.
        seqfile = 'Ingolia.cdna.all.fa'
        seqcmd = self.cmd_list + ['seqin=%s' % seqfile, 'autofilter=F', 'autoload=T', 'seqmode=file']
        iseq = rje_seqlist.SeqList(self.log, seqcmd)
        if 'ENST' not in mdb.fields():
            mdb.addField('ENST', evalue='')
            while iseq.nextSeq():
                (iname, icdna) = iseq.getSeq()
                musg = rje.matchExp(r'gene:(\S+)', iname)[0]
                for entry in mdb.indexEntries('EnsEMBL', musg):
                    if entry['ENST']: entry['ENST'] += ',%s' % iname.split()[0]
                    else: entry['ENST'] = iname.split()[0]
            mdb.saveToFile()

        ### ~ [3] Generate new start sites from Ignolia Harrington data ~ ###
        sdb = self.db('starts')
        sdb.dataFormat({'Init Codon [nt]': 'int'})
        icod = 'Init Codon [nt]'
        icon = 'Init Context [-3 to +4]'
        sdb.info['Name'] = 'mapped_start'
        sdb.addField('ENST')
        sdb.addField('ENSP')
        sdb.addField('ENSI')
        ex = 0.0
        etot = sdb.entryNum()
        sx = 0      # Number of starts output
        fx = 0      # Number of starts that failed to map
        minpep = 20 # Minimum peptide length for output
        ENST = None; ENSP = None
        try:    # Guarantees that both output files are closed if mapping fails part way through
            ENST = open('IngExact.cdna.all.fa', 'w')
            ENSP = open('IngExact.pep.all.fa', 'w')
            for entry in sdb.entries():
                self.progLog('\r#ING', 'Mapping Ignolia Harrington Starts: %.2f%%' % (ex / etot))
                ex += 100.0
                entry[icon] = entry[icon].upper()
                gene = entry['Gene'].upper()
                mentry = mdb.data(gene)
                entry['ENST'] = entry['ENSI'] = ''
                cdnaseq = peptseq = ''
                if not mentry or not mentry['ENST']:
                    fx += 1
                    continue
                mtype = 'fail'
                for trans in mentry['ENST'].split(','):
                    (tname, tseq) = iseq.getDictSeq(trans, format='tuple')
                    self.deBug('%s vs %s' % (tseq[entry[icod] - 3:][:7], entry[icon]))
                    if tseq[entry[icod] - 3:][:7] == entry[icon]:
                        # Translate from the start codon up to the first stop; keep the longest peptide
                        ipept = rje_sequence.dna2prot(tseq[entry[icod]:]).split('*')[0]
                        self.deBug(ipept)
                        if len(ipept) > len(peptseq):
                            entry['ENST'] = trans
                            cdnaseq = tseq
                            peptseq = ipept
                            mtype = 'exact'
                if not entry['ENST']:
                    self.printLog('\r#ING', 'Unable to find Harrington start for %s %s (%s)' % (gene, entry[icod], entry[icon]), screen=False)
                    fx += 1
                    continue
                elif len(peptseq) < minpep:
                    self.printLog('\r#ING', 'Peptide from mapped Harrington start for %s %s (%s) too short!' % (gene, entry[icod], entry[icon]), screen=False)
                    fx += 1
                    continue
                ingid = rje.preZero(int(ex / 100), etot)
                entry['ENSI'] = 'ENSINGT%s' % ingid
                entry['ENSP'] = 'ENSINGP%s' % ingid
                ENST.write('>ENSINGT%s mtype:%s enst:%s gene:%s ingolia:%s mgi:%s\n%s\n' % (ingid, mtype, entry['ENST'], mentry['EnsEMBL'], entry['Gene'], mentry['Gene'], cdnaseq))
                ENSP.write('>ENSINGP%s mtype:%s enst:%s gene:%s transcript:ENSINGT%s ingolia:%s mgi:%s\n%s\n' % (ingid, mtype, entry['ENST'], mentry['EnsEMBL'], ingid, entry['Gene'], mentry['Gene'], peptseq))
                sx += 1
            sdb.saveToFile('%s.mapped_exact.tdt' % self.baseFile())
        finally:
            if ENST: ENST.close()
            if ENSP: ENSP.close()
        self.printLog('\r#ING', 'Output %s Ingolia peptides and transcripts. %s failed.' % (rje.iStr(sx), rje.iStr(fx)))
        return
    except:
        self.errorLog('%s.seqSubset2 error' % self)
def mapToTaxID(self,taxa,nodeonly=False,rankonly=False,log=True,warn=True):  ### Maps taxa onto TaxID. If taxa is a list, will process each element.
    '''
    Maps taxa onto TaxID. If taxa is a list, will process each element. Returns a list.
    A single taxon is resolved, in order, as: (a) a numeric TaxID, expanded through self.dict['TaxMap'] unless
    nodeonly; (b) an upper-case species code without spaces, looked up in SpecFile; (c) a taxon name, looked up
    in SpecFile and then in NameMap. IDs found by (b) and (c) are passed back through this method.
    >> taxa:str/int/list = Taxon (TaxID, species code or name) or collection of taxa. Not modified.
    >> nodeonly:bool [False] = Return only the given TaxID, not the IDs mapped beneath it in TaxMap.
    >> rankonly:bool [False] = Return only TaxID found in self.list['RankID'].
    >> log:bool [True] = Whether to log progress and summary.
    >> warn:bool [True] = Whether to log warnings for missing/ambiguous taxa (now also applied to recursive calls).
    << taxid:list = TaxID strings. Errors are logged and re-raised.
    '''
    import subprocess   # Function-scope import: only needed for the grep calls in this method
    def grepLines(options,pattern,filename):    ### Returns the lines of `grep options -e pattern filename`
        '''Runs grep with an argument list (no shell), so taxa text cannot be interpreted as shell syntax.'''
        # Paths were previously expanded by the shell, so ~ and $VAR are expanded explicitly here
        gfile = os.path.expanduser(os.path.expandvars(filename))
        GREP = subprocess.Popen(['grep'] + options + ['-e',pattern,gfile],stdout=subprocess.PIPE,universal_newlines=True)
        glines = GREP.stdout.readlines()
        GREP.stdout.close(); GREP.wait()
        return glines
    try:### ~ [0] Setup ~ ###
        if not taxa: return []
        taxid = []
        ### ~ [1] Taxa List ~ ###
        if isinstance(taxa,(list,tuple,set,frozenset)):
            # Work on a sorted copy: the caller's list used to be sorted in place as a side effect
            try: taxa = sorted(taxa)
            except TypeError: taxa = list(taxa)
            ttot = len(taxa)
            if ttot > 1:
                tx = 0.0
                for t in taxa:
                    if log: self.progLog('\r#TAXID','Mapping to TaxID: %.1f%%' % (tx/ttot)); tx += 100.0
                    taxid += self.mapToTaxID(t,nodeonly,rankonly,log=False,warn=warn)
                taxid = rje.sortUnique(taxid)
                if log: self.printLog('\r#TAXID','Mapped %s taxa to %s TaxID' % (rje.iStr(ttot),rje.iLen(taxid)))
            else:
                t = taxa[0]
                if log: self.progLog('\r#TAXID','Mapping %s to TaxID...' % t)
                taxid = rje.sortUnique(self.mapToTaxID(t,nodeonly,rankonly,log=False,warn=warn))
                if log: self.printLog('\r#TAXID','Mapped %s to %s TaxID' % (t,rje.iLen(taxid)))
            return taxid
        ### ~ [2] Individual taxa ~ ###
        taxmap = self.dict['TaxMap']; rankid = self.list['RankID']
        taxa = '%s' % taxa
        ## ~ [2a] Taxa ID ~ ##
        if rje.matchExp(r'^(\d+)$', taxa):
            if nodeonly:
                if taxa in rankid or not rankonly: return [taxa]
                else: return []
            if taxa not in taxmap:
                if warn: self.warnLog('Cannot find TaxID %s!' % taxa,'Missing_TaxID',suppress=True)
                return []
            # Breadth-first walk through TaxMap, starting from the given TaxID
            parents = [taxa]
            while parents:
                taxa = parents.pop(0)
                if not rankonly or taxa in rankid: taxid.append(taxa)
                parents += taxmap[taxa]
            return taxid
        ## ~ [2b] Species Code ~ ##
        if taxa == taxa.upper().replace(' ',''):
            for entry in grepLines([],taxa,self.getStr('SpecFile')):
                try: taxid.append(rje.matchExp(r'^%s\s+\S+\s+(\d+):' % taxa,entry)[0])
                except Exception: pass  # No match for this line (matchExp result cannot be indexed): skip it
            if not taxid and warn: self.warnLog('Cannot find Species Code "%s"!' % taxa,'Missing_SpCode',suppress=True)
            if len(taxid) > 1 and warn: self.warnLog('Species Code "%s" hits %d Taxa ID (%s)' % (taxa, len(taxid), '|'.join(taxid)))
            return self.mapToTaxID(taxid,nodeonly,rankonly,log=False,warn=warn)
        ### ~ [3] Species name etc. ~ ###
        taxa = taxa.replace('_',' ')
        lowtaxa = taxa.lower()
        ## ~ [3a] Grep from Uniprot ~ ##
        # -B 2 also returns the two lines before each hit, so the line carrying the TaxID is seen first
        gtaxid = None; comid = []; synid = []
        for entry in grepLines(['-B','2','-i'],taxa,self.getStr('SpecFile')):
            try: gtaxid = rje.matchExp(r'^\S+\s+\S+\s+(\d+):',entry)[0]
            except Exception: pass  # Not a line carrying a TaxID: keep the most recent one
            lowentry = entry.lower()
            if rje.matchExp(r's=(%s)\s*$' % lowtaxa,lowentry): synid.append(gtaxid)
            elif rje.matchExp(r'c=(%s)\s*$' % lowtaxa,lowentry): comid.append(gtaxid)
            elif rje.matchExp(r'=(%s)\s*$' % lowtaxa,lowentry): taxid.append(gtaxid)
        # Other "=name" hits take priority over c= hits, which take priority over s= hits
        if not taxid: taxid = comid
        if not taxid: taxid = synid
        if not taxid and warn: self.warnLog('Cannot find Taxon name "%s" in Uniprot!' % taxa,'Missing Taxon',suppress=True)
        if len(taxid) > 1 and warn: self.warnLog('Species Code "%s" hits %d Taxa ID (%s)' % (taxa, len(taxid), '|'.join(taxid)))
        if taxid: return self.mapToTaxID(taxid,nodeonly,rankonly,log=False,warn=warn)
        ## ~ [3b] Grep from NCBI ~ ##
        for entry in grepLines(['-i'],'\t%s\t' % taxa,self.getStr('NameMap')):
            fields = entry.split('\t|\t')
            if len(fields) < 2: continue
            if fields[1].lower() == lowtaxa: taxid.append(fields[0])
            elif len(fields) > 2 and fields[2] and fields[2].lower() == lowtaxa: taxid.append(fields[0])
        if len(taxid) > 1 and warn: self.warnLog('Species Code "%s" hits %d Taxa ID (%s)' % (taxa, len(taxid), '|'.join(taxid)))
        return self.mapToTaxID(taxid,nodeonly,rankonly,log=False,warn=warn)
    except: self.errorLog('%s.mapToTaxID() error' % (self)); raise
def setup(self):    ### Main class setup method.
    '''
    Main class setup method.
    - Creates the Database object and sets self.list['Accuracy'] from ErrPerBase.
    - Checks SMRTUnits (reads/Gb/Mb), offering or applying a replacement if not recognised, and converts a
      SMRTReads value given in Gb or Mb into a number of reads using AvRead.
    - Converts self.list['XnList'] into a sorted list of integers, dropping blanks, 0 and 1.
    << True if setup was successful, else False (error logged).
    '''
    try:
        ### ~ [1] Setup ~ ###
        self.obj['DB'] = rje_db.Database(self.log, self.cmd_list)
        self.db().basefile(self.basefile())
        self.list['Accuracy'] = [0, 1.0 - self.getNum('ErrPerBase')]
        ## ~ [1a] SMRTReads ~ ##
        while self.getStrLC('SMRTUnits') not in ['reads', 'gb', 'mb']:
            # The unrecognised value is now inserted: the message used to contain a literal "%s"
            txt = 'SMRTUnits "%s" not recognised' % self.getStr('SMRTUnits')
            # Suggest units from the magnitude of SMRTReads
            if self.getNum('SMRTReads') < 10: smrtunits = 'Gb'
            elif self.getNum('SMRTReads') > 10000: smrtunits = 'reads'
            else: smrtunits = 'Mb'
            if self.i() < 0 or rje.yesNo('%s: switch to (%s) %s?' % (txt, self.getNum('SMRTReads'), smrtunits)):
                self.setStr({'SMRTUnits': smrtunits})
            elif self.i() > 0:
                self.setStr({'SMRTUnits': rje.choice('SMRTUnits (reads/Gb/Mb)?')})
            self.printLog('#UNITS', '%s => %s' % (txt, self.getStr('SMRTUnits')))
        if self.getStrLC('SMRTUnits') in ['gb', 'mb']:
            smrttotal = self.getNum('SMRTReads') * {'gb': 1e9, 'mb': 1e6}[self.getStrLC('SMRTUnits')]
            txt = '%s %s @ %.3f kb/read' % (self.getNum('SMRTReads'), self.getStr('SMRTUnits'), self.getNum('AvRead') / 1000.0)
            self.setNum({'SMRTReads': smrttotal / self.getNum('AvRead')})
            txt += ' => %s reads' % rje.iStr(int(self.getNum('SMRTReads')))
            self.printLog('#READS', txt)
        ## ~ [1b] XnList ~ ##
        xnlist = []
        for xn in self.list['XnList']:
            if xn == '': continue
            try:
                ixn = int(xn)
                if xn not in [ixn, '%d' % ixn]: self.printLog('#XN', '"%s" -> %dX' % (xn, ixn))
                if ixn == 0: self.printLog('#XN', 'No point in 0X output: use 1-%Coverage.')
                elif ixn == 1: self.printLog('#XN', 'No point in 1X output: use %Coverage.')
                else: xnlist.append(ixn)
            except:
                self.errorLog('Could not process %s as part of XnList. (Integers only.)' % xn)
        xnlist.sort()
        if xnlist:
            # Joined directly: the old split/join of the list repr left a double space between values
            self.printLog('#XN', 'XnList: %s.' % ', '.join(['%dX' % ixn for ixn in xnlist]))
        self.list['XnList'] = xnlist
        return True     # Setup successful
    except:
        self.errorLog('Problem during %s setup.' % self.prog())
        return False    # Setup failed
def difference(self,table1,table2):     ### Generates differences as new table
    '''
    Generates differences as new table.
    The table whose name ends in the larger number (text after the last '.') is treated as the new export. Each
    new entry is paired with an old entry by Location, else by the Artist|Album|Track Number|Name index, and given
    a Status of New, Changed or Unchanged. For changed entries, Plays/Skips/My Rating become the difference where
    subtraction is possible. Unpaired old entries are added with Status Deleted. The table is saved to
    <name>.tdt, named "<old table name>-<new table number>".
    >> table1:Table = iTunes database table to compare
    >> table2:Table = iTunes database table to compare
    << None. Errors are logged, not raised.
    '''
    try:### ~ [1] Setup ~ ###
        dfields = ['Name','Artist','Composer','Album','Album_Artist','Genre','Time','Disc Number','Disc Count','Track Number','Track Count','Year','Date Added','Plays','Last Played','Skips','Last Skipped','My Rating','Location','Tracks','Score']
        db = self.db()
        tabindex = '#Artist#|#Album#|#Track Number#|#Name#'
        try:
            age1 = int(table1.name().split('.')[-1])
            age2 = int(table2.name().split('.')[-1])
            table1.index(tabindex,make=True)
            table2.index(tabindex,make=True)
            if age1 < age2: oldtable = table1; newtable = table2
            else: newtable = table1; oldtable = table2
            diftable = db.copyTable(newtable,'%s-%s' % (oldtable.name(),newtable.name().split('.')[-1]))
            diftable.keepFields(dfields+[tabindex])
            diftable.addField('Status')
        except:
            self.errorLog('Cannot generate differences for %s and %s' % (table1,table2))
            return  # Nothing to compare: stop here rather than fail later on undefined tables
        ### ~ [2] Process ~ ###
        self.printLog('#NEW','%s tracks in new iTunes export.' % rje.iStr(newtable.entryNum()))
        self.printLog('#OLD','%s tracks in old iTunes export.' % rje.iStr(oldtable.entryNum()))
        oldfiles = oldtable.datakeys()[0:]
        unmatched = set(oldfiles)   # Old keys not yet paired with a new entry (set for fast lookup)
        for entry in diftable.entries():
            ## ~ [2a] Find pair of entries ~ ##
            oldentry = None
            if entry['Location'] in unmatched: oldentry = oldtable.data(entry['Location'])
            elif entry[tabindex] in oldtable.index(tabindex):
                candidates = oldtable.indexEntries(tabindex,entry[tabindex])
                oldentry = candidates[0]
                if len(candidates) != 1:
                    self.printLog('#DUP','Duplicate entries for %s' % entry[tabindex])
                    for ientry in candidates:   # Prefer a duplicate that has not yet been paired
                        if ientry['Location'] in unmatched: oldentry = ientry; break
            ## ~ [2b] Generate Differences ~ ##
            if not oldentry:
                entry['Status'] = 'New'
                continue
            unmatched.discard(oldentry['Location'])
            changed = False
            for field in ['Plays','Skips','My Rating']:
                if entry[field] != oldentry[field]:
                    changed = True
                    try: entry[field] -= oldentry[field]
                    except: pass    # Keep new value - probably empty in old entry
            if changed: entry['Status'] = 'Changed'
            else: entry['Status'] = 'Unchanged'
        ### ~ [3] Add missing old entries ~ ###
        reportdel = rje.yesNo('Report deleted %s tracks?' % diftable.name())
        for old in oldfiles:    # Original key order is kept for reporting
            if old not in unmatched: continue
            entry = diftable.addEntry(oldtable.data(old))
            entry['Status'] = 'Deleted'
            if reportdel: self.printLog('#DEL','%s: %s [%s]' % (entry['Artist'],entry['Name'],entry['Album']))
        ### ~ [4] Finish ~ ###
        for status in rje.sortKeys(diftable.index('Status')):
            self.printLog('#STAT','%s: %d tracks' % (status.upper(),len(diftable.index('Status')[status])))
        self.printLog('#TRACK','%s tracks in total' % rje.iStr(diftable.entryNum()))
        self.deBug('?')
        for table in [table1,table2,diftable]: table.dropField(tabindex)    # Remove the temporary index field
        diftable.saveToFile('%s.tdt' % diftable.name())
    except: self.errorLog('%s.difference() error' % self)
def summaryScores(self,rankdb=None,sumstr='taxasum',minsum='MinSum'):   ### Generates summary scores from rank table.
    '''
    Generates summary scores from rank table.
    For each of genus, family, order, class and phylum, every rank table entry shares one count (and its 'boot'
    weighting) equally between its '|'-separated taxa. Taxa in self.list['TaxFilter'] are ignored and taxa with a
    count below the minsum setting are pooled as "Other". Counts, percentages and mean boot values are added to a
    new summary table, which is then ranked by count and by bootwt within each rank and saved.
    >> rankdb:Table [None] = Table with the rank fields and 'boot'; defaults to the 'taxamap' table.
    >> sumstr:str ['taxasum'] = Name of the summary table to create.
    >> minsum:str ['MinSum'] = Name of the numeric setting holding the minimum summed count.
    Errors are logged, not raised.
    '''
    try:### ~ [1] Setup ~ ###
        db = self.db()
        if not rankdb: rankdb = self.db('taxamap')
        sumdb = db.addEmptyTable(sumstr,['rank','taxon','count','bootwt','meanboot','perc','wtperc'],['rank','taxon'])
        (COUNT,WEIGHT,BOOTSUM,BOOTN) = range(4)     # Positions within each per-taxon tally list
        ### ~ [2] Normalise to reduced levels ~ ###
        for rank in ['genus','family','order','class','phylum']:
            self.printLog('\r#RANK','Normalising %s data.' % rank)
            tally = {}      # {taxon:[summed count, boot-weighted count, sum of boot values, number of boot values]}
            ranksum = 0.0   # Rank total of summed counts
            wtsum = 0.0     # Rank total of boot-weighted counts
            for entry in rankdb.entries():
                taxa = string.split(entry[rank],'|')
                for taxon in taxa:
                    if taxon in self.list['TaxFilter']: continue
                    if taxon not in tally: tally[taxon] = [0.0,0.0,0.0,0]
                    stats = tally[taxon]
                    stats[COUNT] += 1.0 / len(taxa)
                    ranksum += 1.0 / len(taxa)
                    taxweight = entry['boot']
                    stats[BOOTSUM] += entry['boot']
                    stats[BOOTN] += 1
                    stats[WEIGHT] += taxweight / len(taxa)
                    wtsum += taxweight / len(taxa)
            ## ~ Pool taxa below the minimum count ~ ##
            mincount = self.getNum(minsum)
            otherx = 0
            for taxon in rje.sortKeys(tally):
                if taxon == 'Other': continue
                if tally[taxon][COUNT] >= mincount: continue
                if 'Other' not in tally: tally['Other'] = [0.0,0.0,0.0,0.0]
                pooled = tally['Other']
                small = tally.pop(taxon)
                for i in (COUNT,WEIGHT,BOOTSUM,BOOTN): pooled[i] += small[i]
                otherx += 1
            self.printLog('#MINSUM','%s %s taxa converted to "Other" (count < minsum=%s)' % (rje.iStr(otherx),rank,self.getNum(minsum)))
            ## ~ Add summary entries ~ ##
            for taxon in tally:
                stats = tally[taxon]
                sumentry = {'rank':rank,'taxon':taxon}
                sumentry['count'] = rje.dp(stats[COUNT],1)
                sumentry['perc'] = rje.sf(100.0*stats[COUNT]/ranksum)
                sumentry['bootwt'] = rje.dp(stats[WEIGHT],1)
                sumentry['meanboot'] = rje.dp(stats[BOOTSUM]/stats[BOOTN],3)
                sumentry['wtperc'] = rje.sf(100.0*stats[WEIGHT]/wtsum)
                sumdb.addEntry(sumentry)
        ## ~ [2a] Rank taxa by counts such that highest is Rank 1 ~ ##
        for rankfield in ['count','bootwt']:
            sumdb.rankFieldByIndex('rank',rankfield,rev=True,absolute=True,lowest=True)
        ## ~ [2b] Save to file ~ ##
        sumdb.saveToFile()
    except: self.errorLog('%s.summaryScores error' % self.prog())
def tidyMotifNames(self,dbtable):   ### Tidy the motif names in given dbtable
    '''
    Tidy the motif names in given dbtable.
    Replaces each entry's 'motif' value with the name returned by the SLiMList object's slimCoreName() and
    remakes the table keys if any name was changed.
    >> dbtable:Table = Database table to tidy. Tables without a 'motif' field are left untouched.
    << Number of motif names changed (0 if the table has no 'motif' field). Errors are logged and re-raised.
    '''
    try:### ~ [0] ~ Setup ~ ###
        slimlist = self.obj['SLiMList']
        if 'motif' not in dbtable.fields(): return 0
        ### ~ [1] ~ Rename ~ ###
        renamed = 0
        for entry in dbtable.entries():
            corename = slimlist.slimCoreName(entry['motif'])
            if corename == entry['motif']: continue
            entry['motif'] = corename
            renamed += 1
        self.printLog('#MOTIF','%s motif names corrected for SLiMList splitting.' % rje.iStr(renamed))
        if renamed: dbtable.remakeKeys()
        return renamed
    except: self.errorLog('Problem during %s tidyMotifNames.' % self.prog()); raise
def difference(self, table1, table2):  ### Generates differences as new table
    '''
    Generates differences as new table.
    The numeric suffix of each table name (text after the last '.') decides which table is old and which is new.
    New entries are paired with old ones by Location, or failing that by the Artist|Album|Track Number|Name
    index, and marked New/Changed/Unchanged in a 'Status' field; Plays, Skips and My Rating of changed entries
    are replaced by the difference where they can be subtracted. Old entries never paired are added as Deleted.
    The result is saved as <old table name>-<new table number>.tdt.
    >> table1:Table = iTunes database table to compare
    >> table2:Table = iTunes database table to compare
    << None. Errors are logged, not raised.
    '''
    try:
        ### ~ [1] Setup ~ ###
        dfields = [
            'Name', 'Artist', 'Composer', 'Album', 'Album_Artist', 'Genre', 'Time', 'Disc Number', 'Disc Count',
            'Track Number', 'Track Count', 'Year', 'Date Added', 'Plays', 'Last Played', 'Skips', 'Last Skipped',
            'My Rating', 'Location', 'Tracks', 'Score'
        ]
        db = self.db()
        tabindex = '#Artist#|#Album#|#Track Number#|#Name#'
        try:
            age1 = int(table1.name().split('.')[-1])
            age2 = int(table2.name().split('.')[-1])
            table1.index(tabindex, make=True)
            table2.index(tabindex, make=True)
            if age1 < age2:
                (oldtable, newtable) = (table1, table2)
            else:
                (oldtable, newtable) = (table2, table1)
            diftable = db.copyTable(newtable, '%s-%s' % (oldtable.name(), newtable.name().split('.')[-1]))
            diftable.keepFields(dfields + [tabindex])
            diftable.addField('Status')
        except:
            self.errorLog('Cannot generate differences for %s and %s' % (table1, table2))
            return  # The tables needed below do not exist: previously this fell through to a second error
        ### ~ [2] Process ~ ###
        self.printLog('#NEW', '%s tracks in new iTunes export.' % rje.iStr(newtable.entryNum()))
        self.printLog('#OLD', '%s tracks in old iTunes export.' % rje.iStr(oldtable.entryNum()))
        oldfiles = oldtable.datakeys()[0:]
        remaining = set(oldfiles)  # Old keys still to be paired; a set keeps membership tests fast
        for entry in diftable.entries():
            ## ~ [2a] Find pair of entries ~ ##
            oldentry = None
            if entry['Location'] in remaining:
                oldentry = oldtable.data(entry['Location'])
            elif entry[tabindex] in oldtable.index(tabindex):
                matches = oldtable.indexEntries(tabindex, entry[tabindex])
                oldentry = matches[0]
                if len(matches) != 1:
                    self.printLog('#DUP', 'Duplicate entries for %s' % entry[tabindex])
                    for ientry in matches:  # Use the first duplicate that is still unpaired, if any
                        if ientry['Location'] in remaining:
                            oldentry = ientry
                            break
            ## ~ [2b] Generate Differences ~ ##
            if not oldentry:
                entry['Status'] = 'New'
                continue
            remaining.discard(oldentry['Location'])
            changed = False
            for field in ['Plays', 'Skips', 'My Rating']:
                if entry[field] != oldentry[field]:
                    changed = True
                    try:
                        entry[field] -= oldentry[field]
                    except:
                        pass  # Keep new value - probably empty in old entry
            if changed:
                entry['Status'] = 'Changed'
            else:
                entry['Status'] = 'Unchanged'
        ### ~ [3] Add missing old entries ~ ###
        reportdel = rje.yesNo('Report deleted %s tracks?' % diftable.name())
        for old in oldfiles:  # Walk the original key list so that reporting order is unchanged
            if old not in remaining:
                continue
            entry = diftable.addEntry(oldtable.data(old))
            entry['Status'] = 'Deleted'
            if reportdel:
                self.printLog('#DEL', '%s: %s [%s]' % (entry['Artist'], entry['Name'], entry['Album']))
        ### ~ [4] Finish ~ ###
        for status in rje.sortKeys(diftable.index('Status')):
            self.printLog('#STAT', '%s: %d tracks' % (status.upper(), len(diftable.index('Status')[status])))
        self.printLog('#TRACK', '%s tracks in total' % rje.iStr(diftable.entryNum()))
        self.deBug('?')
        for table in [table1, table2, diftable]:
            table.dropField(tabindex)  # Remove the temporary index field
        diftable.saveToFile('%s.tdt' % diftable.name())
    except:
        self.errorLog('%s.difference() error' % self)
def codonUsageEntropyBias(self):    ### Calculate bias in Codon Usage using Entropy-based measure
    '''
    Calculate bias in Codon Usage using Entropy-based measure.
    For each entry of the 'Codons' table and the matching 'Expected' entry, rje.entropyDict() values for the
    codons of each amino acid (from the 'AA' index of the 'Code' table) are summed as Bias/ExpBias. WtBias and
    ExpWtBias are the same sums with each amino acid weighted by its observed frequency in the 'Codons' entry.
    Results are added to a new 'Bias' table, which is saved to file. Errors are logged, not raised.
    '''
    try:### ~ [1] Setup ~ ###
        aacode = self.db('Code').index('AA')    # {amino acid: codons}
        cdb = self.db('Codons'); edb = self.db('Expected')
        ## ~ [1a] Setup bias table ~ ##
        bdb = self.db().addEmptyTable('Bias',['Seq','Len','Bias','ExpBias','WtBias','ExpWtBias'],['Seq'])
        ### ~ [2] Calculate Frequencies ~ ###
        x = 0.0; etot = cdb.entryNum()
        for codentry in cdb.entries():
            self.progLog('\r#BIAS','Calculating Bias: %.2f%%' % (x/etot)); x += 100.0
            expentry = edb.data(codentry['Seq'])
            entry = {'Seq':codentry['Seq'],'Len':codentry['Len'],'Bias':0.0,'ExpBias':0.0,'WtBias':0.0,'ExpWtBias':0.0}
            # Amino acid totals from observed codon values; the 0.0 start keeps the totals as floats
            aafreq = {}
            for aa in aacode: aafreq[aa] = sum([codentry[code] for code in aacode[aa]],0.0)
            rje.dictFreq(aafreq,total=False)    # Converts aafreq in place
            for aa in aacode:
                # Each entropy is calculated once and used for both the plain and weighted sums
                obsbias = rje.entropyDict(codentry,aacode[aa])
                expbias = rje.entropyDict(expentry,aacode[aa])
                entry['Bias'] += obsbias
                entry['ExpBias'] += expbias
                entry['WtBias'] += (aafreq[aa] * obsbias)
                entry['ExpWtBias'] += (aafreq[aa] * expbias)
            bdb.addEntry(entry)
        self.printLog('\r#BIAS','Codon Usage entropy bias calculated for %s entries' % rje.iStr(etot))
        bdb.saveToFile()
    except: self.errorLog('%s.codonUsageEntropyBias error' % self)
def combineSNPs(self):  ### Calculates statistics of genetic differences from parsed PileUp Tables
    '''
    Calculates statistics of genetic differences from parsed PileUp Tables.
    Loads the 'fdr' table and each table in self.list['SNPTables'] (adding a 'SNP' field set to YES). Positions
    found in a SNP table but not in the fdr table are added from BASEFILE.pdiff.tdt with p.FDR set to '-'. The
    SNP field of each SNP table is then joined onto the fdr data, renamed after that table, and the result is
    saved as the 'SNPs' table.
    << Combined table; False if there are no SNP tables or no extra positions to add; None if an error occurred.
    '''
    try:### ~ [0] Setup ~ ###
        if not self.list['SNPTables']: self.printLog('\r#SNP','No SNP tables to add.'); return False
        fdb = self.db().addTable(name='fdr',expect=True,mainkeys=['Locus','Pos'])
        fdb.remakeKeys()   #!# Delete once tuple thing OK
        fdbkeys = fdb.dataKeys()
        self.debug(fdbkeys[:100])
        snps = []
        snppos = []             # Extra positions to add, in order of discovery
        known = set(fdbkeys)    # Positions already in the fdr table or snppos (set replaces a per-key list rebuild)
        for snpfile in self.list['SNPTables']:
            sdb = self.db().addTable(snpfile,name=rje.baseFile(snpfile,True),expect=True,mainkeys=['Locus','Pos'])
            snps.append(sdb)
            sdb.addField('SNP',evalue="YES")
            self.debug(sdb.dataKeys()[:100])
            sdb.remakeKeys()   #!# Delete once tuple thing OK
            self.debug(sdb.dataKeys()[:100])
            px = 0.0; ptot = sdb.entryNum(); sx = 0
            for pos in sdb.dataKeys():  # This should be a (Locus,Pos) tuple
                self.progLog('\r#SNP','Scanning %s for extra SNP positions: %.2f%%' % (sdb.name(),px/ptot)); px += 100.0
                if pos not in known: known.add(pos); snppos.append(pos); sx += 1
            self.printLog('\r#SNP','Scanned %s for extra SNP positions: %s to add.' % (sdb.name(),rje.iStr(sx)))
        ## ~ [0a] Add missing data from other tables ~ ##
        if not snppos: self.printLog('\r#SNP','No SNP positions to add.'); return False
        pending = set(snppos); px = 0; ptot = len(snppos); ix = 0
        SAMSIG = open('%s.pdiff.tdt' % self.baseFile(),'r')
        try:    # Guarantees that the file is closed if a line fails to parse
            headers = rje.readDelimit(SAMSIG.readline())
            fline = SAMSIG.readline()
            self.progLog('\r#SNP','%s/%s SNP positions added from %s PDiff filelines.' % (rje.iStr(px),rje.iStr(ptot),rje.iStr(ix)))
            while fline:
                data = rje.readDelimit(fline); ix += 1
                pos = (data[0],data[1])
                if pos in pending:
                    entry = {'p.FDR':'-'}
                    for (i,value) in enumerate(data): entry[headers[i]] = value
                    fdb.addEntry(entry); px += 1
                    pending.discard(pos)
                self.progLog('\r#SNP','%s/%s SNP positions added from %s PDiff filelines.' % (rje.iStr(px),rje.iStr(ptot),rje.iStr(ix)))
                if not pending: break   # All extra positions found: no need to read the rest of the file
                fline = SAMSIG.readline()
        finally: SAMSIG.close()
        self.printLog('\r#SNP','%s/%s SNP positions added from PDiff file.' % (rje.iStr(px),rje.iStr(ptot)))
        ### ~ [1] Join Tables ~ ###
        temp = fdb
        temp.makeField('#Locus#|#Pos#')
        for sdb in snps:
            sdb.makeField('#Locus#|#Pos#')
            newtemp = self.db().joinTables(name='newtemp',join=[(temp,'#Locus#|#Pos#'),(sdb,'#Locus#|#Pos#',['SNP'])],newkey=['Locus','Pos'],keeptable=True)
            self.printLog('#SNP','Added SNPs from %s' % sdb.name())
            self.db().deleteTable(temp)
            temp = newtemp
            temp.renameField('SNP',sdb.name())  # One YES/blank column per SNP table
            temp.setStr({'Name':'temp'})
        temp.dropField('#Locus#|#Pos#')
        self.db().list['Tables'].append(temp)
        temp.setStr({'Name':'SNPs'})
        temp.saveToFile()
        return temp
    except: self.errorLog('%s.combineSNPs() error' % (self)); return None
def setup(self):    ### Main class setup method.
    '''
    Main class setup method. Loads the sequence lists and then either reloads the compiled *.data.tdt table or
    builds it by joining the component tables, saving *.data.tdt and the reduced *.cutdata.tdt.
    << True if setup completes; None if the initial sequence list already contains sequences; False on error.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        database = self.obj['DB'] = rje_db.Database(self.log,self.cmd_list)
        seqcmd = self.cmd_list + ['autoload=T','seqmode=file','seqindex=T']
        datafile = '%s.data.tdt' % self.basefile()
        ### ~ [2] Load Sequence Files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        fullseq = rje_seqlist.SeqList(self.log,seqcmd + ['seqmode=list'])
        self.dict['SeqList']['full'] = fullseq
        self.debug(fullseq.seqNum())
        if fullseq.seqNum(): return     # NOTE(review): returns None (not True) when sequences were loaded here - confirm intent
        fullcmd = seqcmd + ['seqin=%s.full.fas' % (self.basefile()),'seqmode=list']
        self.dict['SeqList']['full'] = rje_seqlist.SeqList(self.log,fullcmd)
        for stype in ['CDS','gene','prot']:
            typeseq = rje_seqlist.SeqList(self.log,seqcmd + ['seqin=%s.%s.fas' % (self.basefile(),stype)])
            self.dict['SeqList'][stype] = typeseq
            typeseq.dict['SeqDict'] = {}
            for s in typeseq.list['Seq']:
                (name,sequence) = typeseq.getSeq(s)
                shortname = name.split()[0]
                # Index each sequence by the last '_'-separated element of the first word of its name
                typeseq.dict['SeqDict'][shortname.split('_')[-1]] = s
        ### ~ [3] Database Compilation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if rje.exists(datafile) and not self.getBool('Force'):
            database.addTable(datafile,name='data',mainkeys=['tag'])
            return True     # Setup successful: compiled table reloaded
        ## ~ [3a] ~ Load part tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        fdb = database.addTable('%s.function.tdt' % self.basefile(),name='function',mainkeys=['tag'])
        fdb.dropField('description')
        edb = database.addTable('%s.expression.tdt' % self.basefile(),name='expression',mainkeys=['key'])
        nacount = 0
        edb.fillBlanks(blank='0',fillempty=True)
        for ekey in rje.sortKeys(edb.data()):
            eentry = edb.data(ekey)
            for field in edb.fields():
                if eentry[field] != 'na': continue
                eentry[field] = '0.0'
                nacount += 1
        self.printLog('#TDT','Updated %s entries for expression table' % rje.iStr(nacount))
        kdb = database.addTable('%s.proteinkey.tdt' % self.basefile(),name='proteinkey',mainkeys=['key'])
        xdb = database.addTable('%s.dbxref.tdt' % self.basefile(),name='dbxref',mainkeys=['tag'])
        xdb.dropField('gene')     # Pull from genbank instead
        #pdb = db.addTable('%s.cysweight.tdt' % self.basefile(),name='cysweight',mainkeys=['AccNum'])
        pdb = database.addTable('%s.protein.tdt' % self.basefile(),name='prodigis',mainkeys=['AccNum'])
        pdb.addField('NRPep5','NRPep',0)
        pdb.addField('NRPep7','NRPep5',0)
        # Sum the numbered fields '5' to '50' into NRPep5 (all of them) and NRPep7 (those numbered 7 or more)
        for x in range(5,51):
            xfield = '%d' % x
            if xfield not in pdb.fields(): continue
            for pentry in pdb.entries():
                xcount = int(pentry[xfield])
                pentry['NRPep5'] += xcount
                if x >= 7: pentry['NRPep7'] += xcount
        keepfields = ['AccNum','File','ProtMWt','PepCount','LenExp','Len3','Len5','Len7Exp','Len37','NRPep','NRPep5','NRPep7','Cys0']
        for field in pdb.fields()[0:]:      # Iterate over a copy as fields are dropped
            if field not in keepfields: pdb.dropField(field)
        #pdb.renameField('AccNum','uniprot')
        #pdb.newKey(['uniprot'])
        pdb.renameField('AccNum','tag')
        pdb.newKey(['tag'])
        mdb = database.addTable('%s.PNASmaintable.tdt' % self.basefile(),name='main',mainkeys=['tag'])
        tdb = database.addTable('%s.tmhmm.tdt' % self.basefile(),name='TMHMM',mainkeys=['acc_num'])
        ## ~ [3b] ~ Load and process features table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        gdb = database.addTable('%s.Feature.tdt' % self.basefile(),name='feature',mainkeys=['locus','feature','position'])
        gdb.dropEntriesDirect('feature',['CDS'],inverse=True)
        gdb.list['Fields'] += ['tag','start','end','gene','product']
        for gentry in gdb.entries():
            pos = rje.matchExp(r'(\d+)\.\.(\d+)',gentry['position'])
            # Positions starting 'comp' have their start and end swapped
            if gentry['position'][:4] == 'comp': (gentry['start'],gentry['end']) = (pos[1],pos[0])
            else: (gentry['start'],gentry['end']) = (pos[0],pos[1])
            details = gentry['details']
            try: gentry['tag'] = rje.matchExp(r'locus_tag="(\S+)"',details)[0]
            except: gentry['tag'] = '-'
            try: gentry['gene'] = rje.matchExp(r'gene="(\S+)"',details)[0]
            except: gentry['gene'] = ''
            try: gentry['product'] = details.split('/product="')[1].split('"')[0]
            except: gentry['product'] = ''
        gdb.dropEntriesDirect('tag',['-'])      # Features without a locus_tag cannot be keyed
        gdb.newKey(['tag'])
        for field in ['locus','feature','position','details']: gdb.dropField(field)
        ## ~ [3c] ~ Codon Bias Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        cfile = '%s.CDS.Bias.tdt' % self.basefile()
        if self.getBool('Force') or not rje.exists(cfile):
            rje_codons.Codons(self.log,self.cmd_list+['seqin=%s.CDS.fas' % self.basefile(),'backups=F']).run()
        bdb = database.addTable(cfile,name='Bias',mainkeys=['Seq'])
        bdb.renameField('Len','AALen')
        ndb = database.addTable('%s.CDS.NT.tdt' % self.basefile(),name='NT',mainkeys=['Seq'])
        ndb.renameField('Len','NTLen')
        for field in ndb.fields():
            tfield = field.replace('U','T')
            if tfield != field: ndb.renameField(field,tfield)
        ## ~ [3d] ~ Join tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        temp = database.joinTables(name='temp',join=[(edb,'key'),(kdb,'key')],newkey=['key'],cleanup=True,keeptable=True)
        #pfields = pdb.fields()[0:]
        #pfields.remove('uniprot')
        #temp2 = db.joinTables(name='temp2',join=[(xdb,'uniprot'),(pdb,'uniprot',pfields)],newkey=['tag'],cleanup=True,keeptable=True)
        #data = db.joinTables(name='data',join=[(temp2,'tag'),(fdb,'tag'),(gdb,'tag'),(bdb,'Seq'),(ndb,'Seq'),(temp,'tag'),(mdb,'tag')],newkey=['tag'],cleanup=True,keeptable=True)
        joinlist = [(pdb,'tag'),(xdb,'tag'),(fdb,'tag'),(tdb,'acc_num'),(gdb,'tag'),(bdb,'Seq'),(ndb,'Seq'),(temp,'tag'),(mdb,'tag')]
        data = database.joinTables(name='data',join=joinlist,newkey=['tag'],cleanup=True,keeptable=True)
        data.dropField('Seq')
        ## ~ [3e] ~ Fill out data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        data.fillBlanks(blank='0.0',fields=['eb','rb'],fillempty=True)
        #for entry in data.entries():
        #    if entry['tag'] not in self.dict['SeqList']['CDS'].dict['SeqDict']: entry['function'] = 'Non-CDS'
        data.fillBlanks(blank='Unassigned',fields=['function'],fillempty=True)
        data.fillBlanks()
        data.fillBlanks(blank='no mapping',fields=['description'],fillempty=True)
        data.saveToFile(datafile)
        # Temporarily restrict the field list to save the reduced table, then restore the full list
        allfields = data.list['Fields'][0:]
        data.list['Fields'] = ["tag","File","PepCount","LenExp","Len3","Len5","Len7Exp","Len37","NRPep",'NRPep5','NRPep7',"Cys0",
                               "pi","mass","function","new_function","tm","start","end","AALen","Bias",
                               "WtBias","AbsBias",'NTLen','C','A','G','T','C|3','A|3','G|3','T|3',
                               'eb_1.1','eb_1.2','eb_2.1','eb_2.2','rb_1.1','rb_1.2','rb_2.1','rb_2.2','eb','rb']
        data.saveToFile('%s.cutdata.tdt' % self.basefile())
        data.list['Fields'] = allfields
        return True     # Setup successful
    except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
def seqSubset2(self):    ### Extracts sequence subset from MOUSE cDNA and Peptide libraries
    '''
    Extracts sequence subset from MOUSE cDNA and Peptide libraries.
    Loads (or builds and saves) the 'map' table linking Ingolia gene names to MGI genes, EnsEMBL genes and their
    transcripts, then looks for each start site in the 'starts' table within the mapped transcripts. Exact matches
    are written to IngExact.cdna.all.fa and IngExact.pep.all.fa and the updated table is saved as *.mapped_exact.tdt.
    << returns None in all cases; errors are reported through self.errorLog().
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if os.path.exists('%s.map.tdt' % self.baseFile()):
            mdb = self.db().addTable('%s.map.tdt' % self.baseFile(),mainkeys=['Ingolia'],name='map')
        else:
            ### ~ [2] Load Mouse Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            xfile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.data.tdt'
            self.db().addTable(xfile,mainkeys=['Gene'],name='xref')     # Was "db.addTable": db is not defined in this method
            afile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.alias.tdt'
            genemap = self.obj['Map'] = rje_genemap.GeneMap(self.log,self.cmd_list)
            #self.obj['Map'].loadPickle('../../../../../Databases/DBase_120225/MGI/mousemap.120324.pickle')
            genemap.loadData(['sourcedata=%s' % xfile,'aliases=%s' % afile])
            # Join and re-split so that any gene index keys containing whitespace become separate upper-case names
            ing_genes = ' '.join(self.db('starts').index('Gene').keys()).upper().split()
            ing_map = {}
            for gene in ing_genes: ing_map[gene] = genemap.bestMap(gene)
            ing_mgi = rje.sortUnique(ing_map.values())
            self.printLog('#MUSG','%s Ingolia genes mapped onto %s MGI genes' % (rje.iLen(ing_genes),rje.iLen(ing_mgi)))
            xdb = self.db('xref')
            bad_genes = []
            for gene in ing_mgi[0:]:    # Iterate over a copy as unmapped genes are removed
                if gene not in xdb.data():
                    self.printLog('#MAP','Cannot map gene "%s" from Ingolia data!' % gene)
                    bad_genes.append(gene); ing_mgi.remove(gene)
            self.printLog('#BAD','Failed to map %s genes from Ignolia' % rje.iLen(bad_genes))
            BAD = open('ingolia.bad.txt','w')
            try: BAD.write(' '.join(bad_genes))
            finally: BAD.close()
            ### ~ [2] EnsEMBL subset ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ing_musg = xdb.dataList(xdb.entryList(ing_mgi),'EnsEMBL',sortunique=True)
            if '' in ing_musg: ing_musg.remove('')
            self.printLog('#MUSG','%s Ingolia genes mapped onto %s EnsEMBL genes' % (rje.iLen(ing_genes),rje.iLen(ing_musg)))
            if not ing_musg: raise ValueError
            self.deBug(ing_musg[:10])
            for stype in ['cdna','pep']:
                seqfile = '../MOUSE/Mus_musculus.NCBIM37.66.%s.all.fa' % stype
                # NOTE(review): this tests for the *input* file rather than the Ingolia.*.all.fa output, and the
                # commands use 'autload=T' rather than 'autoload=T'. Both are left unchanged - confirm intent.
                if self.getBool('Force') or not os.path.exists(seqfile):
                    seqout = 'Ingolia.%s.all.fa' % stype
                    seqcmd = self.cmd_list + ['seqin=%s' % seqfile,'seqout=%s' % seqout,'autofilter=T','autload=T','seqmode=file','gooddesc=%s' % ','.join(ing_musg)]
                    rje_seqlist.SeqList(self.log,seqcmd)
            # Key field was misspelt 'Ignolia', which is not one of the table fields (see mainkeys above)
            mdb = self.db().addEmptyTable('map',['Ingolia','Gene','EnsEMBL'],['Ingolia'])
            for gene in ing_map:
                entry = {'Ingolia':gene,'Gene':ing_map[gene]}
                if entry['Gene'] in bad_genes: entry['EnsEMBL'] = ''
                else: entry['EnsEMBL'] = xdb.data()[ing_map[gene]]['EnsEMBL']
                mdb.addEntry(entry)
        ## ~ Load transcripts (needed for start mapping) and add them to the map if not already there ~~~~~~~~~~~~ ##
        seqfile = 'Ingolia.cdna.all.fa'
        seqcmd = self.cmd_list + ['seqin=%s' % seqfile,'autofilter=F','autload=T','seqmode=file']
        iseq = rje_seqlist.SeqList(self.log,seqcmd)
        if 'ENST' not in mdb.fields():  # A reloaded map already has its transcripts: do not append them again
            mdb.addField('ENST',evalue='')
            while iseq.nextSeq():
                (iname,icdna) = iseq.getSeq()
                musg = rje.matchExp(r'gene:(\S+)',iname)[0]
                enst = iname.split()[0]
                for entry in mdb.indexEntries('EnsEMBL',musg):
                    if entry['ENST']: entry['ENST'] += ',%s' % enst
                    else: entry['ENST'] = enst
            mdb.saveToFile()
        ### ~ [3] Generate new start sites from Ignolia Harrington data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        sdb = self.db('starts')
        sdb.dataFormat({'Init Codon [nt]':'int'})
        icod = 'Init Codon [nt]'
        icon = 'Init Context [-3 to +4]'
        sdb.info['Name'] = 'mapped_start'
        sdb.addField('ENST'); sdb.addField('ENSP'); sdb.addField('ENSI')
        ex = 0.0; etot = sdb.entryNum(); sx = 0; fx = 0
        minpep = 20     # Minimum length for an output peptide
        ENST = ENSP = None
        try:    # try/finally so that both output files are closed even if mapping fails part-way
            ENST = open('IngExact.cdna.all.fa','w')
            ENSP = open('IngExact.pep.all.fa','w')
            for entry in sdb.entries():
                self.progLog('\r#ING','Mapping Ignolia Harrington Starts: %.2f%%' % (ex/etot)); ex += 100.0
                entry[icon] = entry[icon].upper()
                gene = entry['Gene'].upper()
                mentry = mdb.data(gene)
                entry['ENST'] = entry['ENSI'] = ''
                cdnaseq = peptseq = ''
                if not mentry or not mentry['ENST']: fx += 1; continue
                mtype = 'fail'
                for trans in mentry['ENST'].split(','):
                    (tname,tseq) = iseq.getDictSeq(trans,format='tuple')
                    # Compare the 7 nt starting 3 nt upstream of the start codon with the (-3 to +4) context
                    context = tseq[entry[icod]-3:][:7]
                    self.deBug('%s vs %s' % (context,entry[icon]))
                    if context != entry[icon]: continue
                    ipept = rje_sequence.dna2prot(tseq[entry[icod]:]).split('*')[0]
                    self.deBug(ipept)
                    if len(ipept) > len(peptseq):   # Keep the transcript giving the longest peptide
                        entry['ENST'] = trans
                        cdnaseq = tseq
                        peptseq = ipept
                        mtype = 'exact'
                if not entry['ENST']:
                    self.printLog('\r#ING','Unable to find Harrington start for %s %s (%s)' % (gene,entry[icod],entry[icon]),screen=False)
                    fx += 1; continue
                elif len(peptseq) < minpep:
                    self.printLog('\r#ING','Peptide from mapped Harrington start for %s %s (%s) too short!' % (gene,entry[icod],entry[icon]),screen=False)
                    fx += 1; continue
                ingid = rje.preZero(int(ex/100),etot)   # ex has already been incremented, so this is the 1-based entry number
                entry['ENSI'] = 'ENSINGT%s' % ingid
                entry['ENSP'] = 'ENSINGP%s' % ingid
                ENST.write('>ENSINGT%s mtype:%s enst:%s gene:%s ingolia:%s mgi:%s\n%s\n' % (ingid,mtype,entry['ENST'],mentry['EnsEMBL'],entry['Gene'],mentry['Gene'],cdnaseq))
                ENSP.write('>ENSINGP%s mtype:%s enst:%s gene:%s transcript:ENSINGT%s ingolia:%s mgi:%s\n%s\n' % (ingid,mtype,entry['ENST'],mentry['EnsEMBL'],ingid,entry['Gene'],mentry['Gene'],peptseq))
                sx += 1
            sdb.saveToFile('%s.mapped_exact.tdt' % self.baseFile())
        finally:
            for handle in (ENST,ENSP):
                if handle: handle.close()
        self.printLog('\r#ING','Output %s Ingolia peptides and transcripts. %s failed.' % (rje.iStr(sx),rje.iStr(fx)))
        return
    except: self.errorLog('%s.seqSubset2 error' % self)
def taxaMap(self):  ### Maps species codes onto different taxonomic ranks.
    '''
    Maps species codes onto different taxonomic ranks.
    Builds two tables from the 'spcode' table and saves both: 'taxa' (one entry per species code with its taxon at
    each of genus, family, order, class and phylum) and 'taxamap' (one entry per 'spcode' entry with the combined
    taxa of its species codes at each rank). Errors are reported through self.errorLog().
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        db = self.db()
        taxonomy = self.obj['Taxonomy']
        ### ~ [2] ~ Add main run code here ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        specdb = self.db('spcode')
        #descdb = self.db('protdesc')
        ranks = ['genus', 'family', 'order', 'class', 'phylum']
        rankmap = {}    # {rank: {SPCODE: Taxon}} dictionary
        rankfields = ['protein'] + ranks + specdb.fields()[1:]
        #if descdb: rankfields.append('desc')
        if self.getStrLC('ProtDesc'): rankfields.append('desc')
        ## ~ [2a] ~ Add described proteins that have no species code entry ~~~~~~~~~~~~~~~~~~~ ##
        added = 0
        for prot in self.dict['ProtDesc']:
            if prot.lower() in ['', 'protein', 'gene']: continue    # Skip what look like blank or header lines
            pentry = {'protein': prot, 'spcode': 'None', 'boot': self.getNum('NoneBoot')}
            if specdb.makeKey(pentry) in specdb.dataKeys(): continue
            specdb.addEntry(pentry)
            added += 1
        self.printLog('#PROT', 'Added %s proteins from %s without trees.' % (rje.iStr(added), self.getStr('ProtDesc')))
        ## ~ [2b] ~ Set up output tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        rankdb = db.addEmptyTable('taxamap', rankfields, ['protein'])
        for rank in ranks:
            # Special codes map on to themselves at every rank
            rankmap[rank] = {'None': 'None', 'Unmapped': 'Unmapped', 'Uncertain': 'Uncertain'}
        taxdb = db.addEmptyTable('taxa', ['spcode', 'taxid', 'name'] + ranks, ['spcode'])
        ## ~ [2c] ~ Process species codes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        parents = None  # Placeholder: the parent dictionary is always read from taxonomy.dict at the point of use
        sx = 0.0
        stot = specdb.entryNum()
        for entry in specdb.entries():
            self.progLog('\r#SPEC', 'Processing species: %.2f%%' % (sx / stot))
            sx += 100.0
            #if descdb:
            #try: entry['desc'] = descdb.data(descdb.makeKey(entry))['description']
            try: entry['desc'] = self.dict['ProtDesc'][entry['protein']]
            except: entry['desc'] = ''
            spcodes = entry['spcode'].split('|')
            for spcode in spcodes:
                if spcode in rankmap['genus']: continue     # Already mapped (or a special code)
                tentry = {'spcode': spcode}
                try:
                    taxid = taxonomy.mapToTaxID(spcode, nodeonly=True, warn=False)[0]
                    rank = taxonomy.dict['Rank'][taxid]
                    tentry['taxid'] = taxid
                    tentry['name'] = taxonomy.getSpecies(taxid)
                except:
                    self.warnLog('Unable to map species code "%s" to TaxID -> "Unmapped"' % spcode)
                    taxid = 'Unmapped'
                    rank = 'genus'
                # Loop through different ranks
                for (ri, nextrank) in enumerate(ranks):
                    wanted = ranks[ri:]
                    # Climb the parent links until a node of this rank (or a later one in ranks) is reached
                    while rank not in wanted and taxid in taxonomy.dict['Parent']:
                        taxid = taxonomy.dict['Parent'][taxid]
                        rank = taxonomy.dict['Rank'][taxid]
                    #self.debug('%s: %s' % (tax.dict['Rank'][taxid],tax.getSpecies(taxid)))
                    if taxid in taxonomy.dict['Parent']: taxon = taxonomy.getSpecies(taxid)
                    else: taxon = 'Unmapped'
                    if rank != nextrank:    # No node of exactly this rank was found
                        if self.getBool('Monophyly'): taxon = 'Uncertain'
                        else: taxon = '%s %s.' % (taxon, nextrank[:3])
                    rankmap[nextrank][spcode] = taxon
                    tentry[nextrank] = taxon
                taxdb.addEntry(tentry)
            ## ~ Combine the taxa of all species codes for this entry at each rank ~~~~~~~~~~~~ ##
            rentry = {}
            for nextrank in ranks:
                taxa = []
                unmapped = ''
                for spcode in spcodes:
                    ranktax = rankmap[nextrank][spcode]
                    if ranktax in taxa: continue
                    if 'unmapped' in ranktax.lower():
                        if unmapped: self.warnLog('Two Unmapped %s taxa: %s & %s' % (nextrank, unmapped, ranktax))
                        unmapped = ranktax  #i# Should only be one
                    taxa.append(ranktax)
                # 'None' and unmapped taxa are only kept when they are the sole taxon
                if len(taxa) > 1 and 'None' in taxa:
                    self.warnLog('None in: %s' % '|'.join(rje.sortUnique(taxa)))
                    taxa.remove('None')
                if len(taxa) > 1 and unmapped: taxa.remove(unmapped)
                if len(taxa) > 1 and self.getBool('Monophyly'): rentry[nextrank] = 'Uncertain'
                else: rentry[nextrank] = '|'.join(rje.sortUnique(taxa))
            rankdb.addEntry(rje.combineDict(rentry, entry))
        self.printLog('\r#SPEC', '%s proteins with species codes processed.' % rje.iStr(stot))
        rankdb.saveToFile()
        taxdb.saveToFile()
    except: self.errorLog('%s.taxaMap error' % self.prog())
def _positiveAndNegativePeptides(self): ### Populates PosPep and NegPep Lists
    '''
    Populates PosPep and NegPep Lists.
    Digests the Positives sequences with the PepCut protease. Digestion peptides found in the Peptides file are kept
    in self.list['PosPep']; the rest are moved to self.list['NegPep']. Peptides generated more than once by digesting
    self.obj['SeqList'] are recorded in self.list['Redundant']. A 'Peptides' table is saved to *.peptides.tdt and the
    two peptide lists are saved to *.positives.fas and *.negatives.fas.
    << returns the Peptides table; False if the Peptides or Positives file is missing; None on error.
    '''
    try:### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        pfile = '%s.peptides.tdt' % self.basefile()
        #if rje.exists(pfile) and not self.getBool('Force'):
        #    try:
        #        pdb = self.db().addTable(pfile,['Peptide'],name='Peptides')
        #        pdb.dataFormat(reformat={'Len':'int','MWt':'num','Cys':'int','Ser':'int','Hyd':'num'})
        #        self.list['Peptides'] = self.list['PosPep'] = pdb.index('Pos')['Y']
        #        self.list['NegPep'] = pdb.index('Positive')['Neg']
        #        return pdb
        #    except: pass
        if not rje.exists(self.getStr('Peptides')) or not rje.exists(self.getStr('Positives')): return False
        self.list['Peptides'] = peplist = self.loadFromFile(self.getStr('Peptides'),chomplines=True)
        seqlist = rje_seq.SeqList(self.log,['autofilter=T','gnspacc=T','seqnr=F']+self.cmd_list+['seqin=%s' % self.getStr('Positives'),'autoload=T'])
        pdb = self.db().addEmptyTable('Peptides',['Peptide','NR','Pos','Len','MWt','C','HPW','DENQ','M','Hyd'],['Peptide'])
        protease = self.getStr('PepCut')

        def digest(sequence):   ### Returns list of fragments (may include '') from cutting sequence with protease
            # Each cut pattern marks the cleavage point with ':'; insert the ':' at every site, then split on it
            for cut in proteases[protease]: sequence = cut.join(sequence.split(cut.replace(':','')))
            return sequence.split(':')

        ### ~ [1] ~ Digest Positives ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.list['PosPep'] = poslist = []; self.list['NegPep'] = neglist = []
        posseen = set()     # Constant-time duplicate check; poslist keeps first-seen order
        sx = 0.0; stot = seqlist.seqNum()
        for seq in seqlist.seqs():
            self.progLog('\r#PEP','Processing positive proteins (%s): %.2f%%' % (protease,sx/stot)); sx += 100.0
            frag = [pep for pep in digest(seq.getSequence()) if pep]
            if not self.getBool('NTerm'): frag = frag[1:]   # Drop the first (N-terminal) fragment
            for pep in frag:
                if pep not in posseen: posseen.add(pep); poslist.append(pep)
        self.printLog('\r#PEP','Processed positive proteins (%s): %s peptides' % (protease,rje.iLen(poslist)))
        ## ~ [1b] ~ Peptide Redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        allpep = set()      # Every fragment seen so far (a set: this can cover a whole sequence database)
        self.list['Redundant'] = redundant = []
        sx = 0.0; stot = self.obj['SeqList'].seqNum()
        for seq in self.obj['SeqList'].seqs():
            self.progLog('\r#DIG','%s Digesting sequences: %.2f%%' % (protease,sx/stot)); sx += 100.0
            for frag in digest(seq.getSequence()):
                if frag in allpep: redundant.append(frag)   # Appended once for every repeat occurrence
                else: allpep.add(frag)
        self.printLog('\r#DIG','%s Digesting %s sequences complete.' % (protease,rje.iStr(stot)))
        ## ~ [1c] ~ Process fragments ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        pepset = set(peplist); redset = set(redundant)
        positives = []      # Peptides that stay in poslist
        px = 0.0; ptot = len(poslist)
        for pep in poslist:
            self.progLog('\r#PEP','Processing positive peptides (%s): %.2f%%' % (protease,px/ptot)); px += 100.0
            entry = {'Peptide':pep,'MWt':rje_sequence.MWt(pep),'Hyd':rje_sequence.eisenbergHydropathy(pep,returnlist=False),
                     'Len':len(pep),'NR':'Y','Pos':'Y'}
            if pep in pepset: positives.append(pep)
            else: neglist.append(pep); entry['Pos'] = 'N'
            if pep in redset: entry['NR'] = 'N'
            for aacomb in ['C','HPW','DENQ','M']:   # Count of residues from each amino acid group
                entry[aacomb] = sum([pep.count(a) for a in aacomb])
            pdb.addEntry(entry)
        poslist[:] = positives  # Update in place: self.list['PosPep'] is the same list object
        self.printLog('\r#PEP','Processing positive peptides (%s) complete: %s Pos; %s Neg.' % (protease,rje.iLen(poslist),rje.iLen(neglist)))
        ### ~ [2] ~ Save Files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        pdb.saveToFile(pfile)
        POS = NEG = None
        try:    # try/finally so that both files are closed even if a write fails
            POS = open('%s.positives.fas' % self.basefile(),'w'); NEG = open('%s.negatives.fas' % self.basefile(),'w')
            for pep in poslist: POS.write('>%s\n%s\n' % (pep,pep))
            for pep in neglist: NEG.write('>%s\n%s\n' % (pep,pep))
        finally:
            for handle in (POS,NEG):
                if handle: handle.close()
        self.printLog('#FAS','%s peptides output to %s.positives.fas' % (rje.iLen(poslist),self.basefile()))
        self.printLog('#FAS','%s peptides output to %s.negatives.fas' % (rje.iLen(neglist),self.basefile()))
        return pdb
    except: self.errorLog('Problem during %s._positiveAndNegativePeptides().' % self); return None  # Setup failed
def mapTaxa(self,taxin,taxout=['spcode'],nodeonly=False,rankonly=False,savetaxout=True):    ### Takes a list of Taxa and returns mapped Taxa data
    '''
    Takes a list of Taxa and returns mapped Taxa data.
    >> taxin:str or list of taxon identifiers to map from.
    >> taxout:str or list of taxa output formats ('taxid', 'spcode', 'name' or 'common'; case-insensitive)
    >> nodeonly:bool = whether to limit TaxID mapping to the precise matching nodes (else include children)
    >> rankonly:bool = whether to limit TaxID to those matching self.list['RankTypes'] taxon types.
    >> savetaxout:bool [True] = Whether to save the TaxOut list to a text file
    << taxoutlist:list of mapped taxa if taxout is a string, OR
    << taxoutdict:dict of mapped taxa if taxout is a list
    Unrecognised formats are reported with errorLog() and skipped. Any other error is logged and re-raised.
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # A list is recognised by having a sort() method, as before, but it is no longer sorted in place:
        # that reordered the list belonging to the caller (and the shared default argument).
        tlist = hasattr(taxout,'sort')
        if not taxout:
            if tlist: return {}
            return []
        if tlist: formats = sorted(taxout)
        else: formats = [taxout]
        ### ~ [2] ~ Map to TaxID ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Uses the taxin/taxout arguments as documented; self.list['TaxIn'] and self.list['TaxOut'] were used before,
        # silently ignoring whatever the caller passed.
        taxid = self.mapToTaxID(taxin,nodeonly,rankonly)
        if self.list['RestrictID']:
            tx = len(taxid)
            taxid = rje.listIntersect(taxid,self.list['RestrictID'])
            self.printLog('#TAXID','%s of %s TaxID in %s Restricted IDs.' % (rje.iLen(taxid),rje.iStr(tx),rje.iLen(self.list['RestrictID'])))
        ### ~ [3] ~ Map TaxID and output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        taxdict = {}; taxoutdict = {}
        taxoutlist = []     # Defined up front so an unrecognised single format returns [] rather than a NameError
        for outformat in formats:
            outformat = outformat.lower()
            if outformat == 'taxid': taxoutlist = taxid
            elif outformat in ['spcode','name','common']:
                if not taxdict: taxdict = self.taxDict(taxid)   # Only built once, when first needed
                taxoutlist = []
                for t in taxid:
                    try: taxoutlist.append(taxdict[t][outformat])
                    except Exception: self.warnLog('No "%s" data for TaxID %s' % (outformat, t),'Missing_%s' % outformat,suppress=True)
                taxoutlist.sort()
            else:
                self.errorLog('TaxOut format "%s" not recognised' % outformat,printerror=False)
                continue
            taxoutdict[outformat] = taxoutlist
            if savetaxout:
                if not taxoutlist:
                    self.printLog('#OUT','No %s IDs to output' % outformat)
                    continue
                tfile = '%s.%s.txt' % (self.baseFile(),outformat)
                rje.backup(self,tfile)
                TAXOUT = open(tfile,'w')
                try: TAXOUT.write('\n'.join(taxoutlist))
                finally: TAXOUT.close()
                self.printLog('#OUT','%s %s IDs output to %s.' % (rje.iLen(taxoutlist), outformat, tfile))
        if tlist: return taxoutdict
        return taxoutlist
    except: self.errorLog('Problem during %s mapTaxa.' % self); raise
def iTRAQSamples(self):     ### Uses self.dict['Samples'] and self.db('itraq') to summarise hit data
    '''
    Uses self.dict['Samples'] and self.db('itraq') to summarise hit data.
    Two copies of the 'itraq' table are processed in the same way: 'itraq_summary' (using the 'ratio' field) and
    'itraq_geomean' (using 'geomean', renamed to 'ratio'). For each copy, the tag-pair ratios are reshaped into one
    field per pair, combined per pair of sample names into a geometric mean with _Min, _Max and _Dirn fields, and the
    table is saved. Errors are reported through self.errorLog(); nothing is returned.
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        db = self.db(); idb = self.db('itraq')
        mdb = db.copyTable(idb,'itraq_summary')
        gdb = db.copyTable(idb,'itraq_geomean')
        ### ~ [1] Reformat Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # After this line both tables carry their values in a field called 'ratio'
        mdb.dropField('geomean'); gdb.dropField('ratio'); gdb.renameField('geomean','ratio')
        for sdb in [mdb,gdb]:
            # presumably '---' and 'NN' mark missing ratios - verify against the code that builds the 'itraq' table
            sdb.dropField('summary'); sdb.dropEntriesDirect('ratio','---')
            sdb.dropEntriesDirect('ratio','NN')
            sdb.dataFormat({'ratio':'num','n':'int'})
            ## ~ [1a] Drop tags with Samples ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            (ex,etot) = (0.0,sdb.entryNum())
            for entry in sdb.entries():
                self.progLog('\r#ITRAQ','Drop isotags without Sample info: %.2f%%' % (ex/etot)); ex += 100.0
                # The 'itraq' value is a 'tag1/tag2' pair: both tags must be keys of self.dict['Samples']
                tags = string.split(entry['itraq'],'/')
                if tags[0] not in self.dict['Samples'] or tags[1] not in self.dict['Samples']: sdb.dropEntry(entry)
            self.printLog('\r#ITRAQ','Dropped all isotags without Sample info: %s of %s entries remain' % (rje.iStr(sdb.entryNum()),rje.iStr(etot)))
            ## ~ [1b] Reshape, rename, invert and remove redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # The loop below expects the reshaped fields to be named 'ratio|tag1/tag2' and 'n|tag1/tag2'
            sdb.reshapeWide('itraq',['ratio','n'])
            samples = rje.sortUnique(self.dict['Samples'].values())
            ratios = []
            self.printLog('#SAMP',string.join(samples,', '))
            # Add 's1/s2' (plus _Min, _Max and _Dirn) fields for every sample pair, including each sample with itself.
            # Each 's1/s2' field starts as an empty list that collects the individual tag-pair ratios.
            for s1 in samples:
                for s2 in samples[samples.index(s1):]:
                    newfield = '%s/%s' % (s1,s2)
                    sdb.addField(newfield)
                    sdb.addField('%s_Min' % newfield)
                    sdb.addField('%s_Max' % newfield)
                    sdb.addField('%s_Dirn' % newfield)
                    ratios.append(newfield)
                    for entry in sdb.entries(): entry[newfield] = []
            # NOTE(review): fields are renamed and dropped while iterating over sdb.fields(); whether that is safe
            # depends on what fields() returns - confirm before restructuring this loop.
            for field in sdb.fields():
                if '|' in field:
                    (score,tags) = string.split(field,'|')
                    tag = string.split(tags,'/')
                    if int(tag[0]) > int(tag[1]):   ### Invert
                        # Put the lower-numbered tag first; ratio values are replaced by their reciprocal
                        newfield = '%s|%s/%s' % (score,tag[1],tag[0])
                        if newfield in sdb.fields(): sdb.dropField(newfield); continue
                        sdb.renameField(field,newfield)
                        if score == 'ratio':
                            for entry in sdb.entries():
                                if entry[newfield]: entry[newfield] = 1.0 / entry[newfield]
                        tag = (tag[1],tag[0])
                        field = newfield
                    # Add the sample names to the field name: 'score|<sample1><tag1>/<sample2><tag2>'
                    s1 = self.dict['Samples'][tag[0]]
                    s2 = self.dict['Samples'][tag[1]]
                    newname = '%s|%s%s/%s%s' % (score,s1,tag[0],s2,tag[1])
                    sdb.renameField(field,newname)
                    if score == 'n': continue
                    # Collect the ratio under its sample-pair field, inverting it if only 's2/s1' exists
                    newfield = '%s/%s' % (s1,s2)
                    invfield = '%s/%s' % (s2,s1)
                    for entry in sdb.entries():
                        if entry[newname] and newfield in sdb.fields(): entry[newfield].append(entry[newname])
                        elif entry[newname]: entry[invfield].append(1.0/entry[newname])
            ## ~ [1c] Calculate Geometric mean ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            (ex,etot) = (0.0,sdb.entryNum())
            for entry in sdb.entries():
                self.progLog('\r#GEO','Calculating Geometric means: %.2f%%' % (ex/etot)); ex += 100.0
                for ratio in ratios:
                    if entry[ratio]:
                        entry['%s_Min' % ratio] = min(entry[ratio])
                        entry['%s_Max' % ratio] = max(entry[ratio])
                        try: entry[ratio] = rje.geoMean(entry[ratio])
                        except: self.deBug(entry)
                        # Direction is only set when the mean and every individual ratio agree
                        if entry[ratio] > 1 and entry['%s_Min' % ratio] > 1: entry['%s_Dirn' % ratio] = 'UP'
                        elif entry[ratio] < 1 and entry['%s_Max' % ratio] < 1: entry['%s_Dirn' % ratio] = 'DOWN'
                    # NOTE(review): this else is paired with "if entry[ratio]" (no ratios collected) - confirm nesting
                    else: entry['%s_Dirn' % ratio] = entry['%s_Min' % ratio] = entry['%s_Max' % ratio] = entry[ratio] = ''
            self.printLog('\r#GEO','Geometric mean calculations complete')
            sdb.saveToFile()
    except: self.errorLog('iTRAQSamples error')
def setup(self):    ### Main class setup method.
    '''
    Main class setup method. Sets up the database object and default basefile, parses the WT and Mutant pileup
    files where needed, and builds the per-locus reference sequence and WT major allele data from *.WT.tdt:
    - self.dict['RefSeq'] = {locus: reference sequence string, with '?' at positions missing from the WT data}
    - self.dict['WTMajor'] = {locus: list of major alleles, with '-' at positions missing from the WT data}
    << True if setup completes; None if *.fdr.tdt already exists and force is off; False on error.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.obj['DB'] = rje_db.Database(self.log,self.cmd_list+['tuplekeys=T'])
        if self.baseFile().lower() in ['','none']:
            self.baseFile('%s.vs.%s.Q%d' % (rje.baseFile(self.getStr('MutPileup'),True),rje.baseFile(self.getStr('WTPileup'),True),self.getInt('QCut')))
        # NOTE(review): returns None (not True) when results already exist - confirm callers expect this
        if not self.force() and os.path.exists('%s.fdr.tdt' % self.baseFile()): return
        ### ~ [2] Look for/process WT Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.force() or not os.path.exists('%s.WT.tdt' % self.baseFile()): self.parsePileup('WT',self.getStr('WTPileup'))
        ### ~ [3] Generate Reference sequences and Major Alleles (by locus) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # The reference is collected as a list of pieces per locus and joined once at the end: growing a string
        # stored in a dictionary with += copies the whole sequence for every line read.
        refparts = {}   # {locus: [sequence pieces]}
        reflen = {}     # {locus: total length of the pieces collected so far}
        majors = {}
        rx = 0; wx = 0
        locus = None
        WTDATA = open('%s.WT.tdt' % self.baseFile(),'r')
        try:    # try/finally so that the file is closed even if a line fails to parse
            for line in WTDATA:
                self.progLog('\r#WT','Reading WT data: Reference seq length = %s nt' % (rje.iStr(rx)),rand=0.01)
                data = rje.readDelimit(line); wx += 1
                if data[0] == 'Locus': continue     # Header line
                if data[0] != locus:    # New locus: (re)start its reference and major allele data
                    locus = data[0]; refparts[locus] = []; reflen[locus] = 0; majors[locus] = []
                pos = int(data[1])
                # Pad any positions skipped in the WT data so that list/string index = position - 1
                refgap = (pos - 1) - reflen[locus]
                if refgap > 0:
                    refparts[locus].append('?' * refgap); reflen[locus] += refgap; rx += refgap
                majgap = (pos - 1) - len(majors[locus])
                if majgap > 0: majors[locus].extend(['-'] * majgap)
                refparts[locus].append(data[2]); reflen[locus] += len(data[2]); rx += len(data[2])
                majors[locus].append(data[5])
        finally: WTDATA.close()
        refseq = {}
        for locus in refparts: refseq[locus] = ''.join(refparts[locus])
        self.printLog('\r#WT','%s lines read from WT data: Reference seq length = %s nt' % (rje.iStr(wx),rje.iStr(rx)))
        for locus in rje.sortKeys(majors):
            if len(majors[locus]) != len(refseq[locus]):
                self.errorLog('%s WTMajor versus RefSeq length mismatch!' % locus,printerror=False)
                raise ValueError
        self.dict['WTMajor'] = majors
        self.dict['RefSeq'] = refseq
        ### ~ [4] Look for/process Mutant Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.force() or not os.path.exists('%s.Mut.tdt' % self.baseFile()): self.parsePileup('Mut',self.getStr('MutPileup'),True)
        return True     # Setup successful
    except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed