def combineSNPs(self):  ### Combines parsed SNP tables with the FDR table of genetic differences
    '''Combines parsed SNP tables with the FDR table of genetic differences.'''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not self.list['SNPTables']: self.printLog('\r#SNP','No SNP tables to add.'); return False
        fdb = self.db().addTable(name='fdr',expect=True,mainkeys=['Locus','Pos'])
        fdb.remakeKeys()    #!# Delete once tuple thing OK
        fdbkeys = fdb.dataKeys()
        self.debug(fdbkeys[:100])
        snps = []
        snppos = []
        for snptable in self.list['SNPTables']:
            snps.append(self.db().addTable(snptable,name=rje.baseFile(snptable,True),expect=True,mainkeys=['Locus','Pos']))
            snps[-1].addField('SNP',evalue="YES")
            self.debug(snps[-1].dataKeys()[:100])
            snps[-1].remakeKeys()   #!# Delete once tuple thing OK
            self.debug(snps[-1].dataKeys()[:100])
            px = 0; ptot = snps[-1].entryNum(); sx = 0
            for pos in snps[-1].dataKeys():     # This should be a (Locus,Pos) tuple
                self.progLog('\r#SNP','Scanning %s for extra SNP positions: %.2f%%' % (snps[-1].name(),px/ptot)); px += 100.0
                if pos not in snppos + fdbkeys: snppos.append(pos); sx += 1
            self.printLog('\r#SNP','Scanned %s for extra SNP positions: %s to add.' % (snps[-1].name(),rje.iStr(sx)))
        ## ~ [0a] Add missing data from other tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if snppos:
            SAMSIG = open('%s.pdiff.tdt' % self.baseFile(),'r'); px = 0; ptot = len(snppos); ix = 0
            fline = SAMSIG.readline(); headers = rje.readDelimit(fline)
            fline = SAMSIG.readline()
            self.progLog('\r#SNP','%s/%s SNP positions added from %s PDiff filelines.' % (rje.iStr(px),rje.iStr(ptot),rje.iStr(ix)))
            while fline:
                data = rje.readDelimit(fline); ix += 1
                if (data[0],data[1]) in snppos:
                    entry = {'p.FDR':'-'}
                    for i in range(len(data)): entry[headers[i]] = data[i]
                    fdb.addEntry(entry); px += 1
                    snppos.remove((data[0],data[1]))
                    self.progLog('\r#SNP','%s/%s SNP positions added from %s PDiff filelines.' % (rje.iStr(px),rje.iStr(ptot),rje.iStr(ix)))
                else: self.progLog('\r#SNP','%s/%s SNP positions added from %s PDiff filelines.' % (rje.iStr(px),rje.iStr(ptot),rje.iStr(ix)))
                if not snppos: break
                fline = SAMSIG.readline()
            SAMSIG.close()
            self.printLog('\r#SNP','%s/%s SNP positions added from PDiff file.' % (rje.iStr(px),rje.iStr(ptot)))
        else: self.printLog('\r#SNP','No SNP positions to add.'); return False
        ### ~ [1] Join Tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        temp = fdb
        temp.makeField('#Locus#|#Pos#')
        for snptable in snps:
            snptable.makeField('#Locus#|#Pos#')
            newtemp = self.db().joinTables(name='newtemp',join=[(temp,'#Locus#|#Pos#'),(snptable,'#Locus#|#Pos#',['SNP'])],newkey=['Locus','Pos'],keeptable=True)
            self.printLog('#SNP','Added SNPs from %s' % snptable.name())
            self.db().deleteTable(temp)
            temp = newtemp
            temp.renameField('SNP',snptable.name())
            temp.setStr({'Name':'temp'})
        temp.dropField('#Locus#|#Pos#')
        self.db().list['Tables'].append(temp)
        temp.setStr({'Name':'SNPs'})
        temp.saveToFile()
        return temp
    except: self.errorLog('%s.combineSNPs() error' % (self)); return None
def tableToHTML(delimtext, delimit, tabwidth='100%', tdwidths=[], tdalign=[], valign='center', thead=True, border=1, tabid=''):    # Makes HTML Table
    '''
    Converts delimited plain text into an HTML table.
    >> delimtext:str = Delimited text to convert
    >> delimit:str = Text delimiter for conversion.
    >> tabwidth:str ['100%'] = width of table
    >> tdwidths:list [] = Optional list of widths of columns
    >> tdalign:list [] = Optional list of text alignment for columns
    >> valign:str ['center'] = Vertical text alignment for columns
    >> thead:bool [True] = Whether first row should use th rather than td
    >> border:int [1] = Table border strength
    >> tabid:str [''] = Table ID setting (for CSS formatting)
    '''
    ### [0] Setup Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    if tabid: html = '<table width="%s" id="%s">\n' % (tabwidth, tabid)
    else: html = '<table width="%s" border=%d>\n' % (tabwidth, border)
    tablines = string.split(delimtext, '\n')
    ### [1] Header Row ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    if thead:
        html += '<tr>\n'
        headtext = rje.readDelimit(tablines.pop(0), delimit)
        tw = tdwidths[0:]
        ta = tdalign[0:]
        while headtext:
            tag = 'th'
            if tw: tag += ' width="%s"' % tw.pop(0)
            if ta: tag += ' align=%s' % ta.pop(0)
            html += '<%s>%s</th>\n' % (tag, headtext.pop(0))
        html += '</tr>\n'
    ### [2] Main body ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    while tablines:
        tabtext = rje.readDelimit(tablines.pop(0), delimit)
        if not tabtext: continue
        html += '<tr>\n'
        tw = tdwidths[0:]
        ta = tdalign[0:]
        while tabtext:
            tag = 'td valign=%s' % valign
            if tw: tag += ' width="%s"' % tw.pop(0)
            if ta: tag += ' align=%s' % ta.pop(0)
            html += '<%s>%s</td>\n' % (tag, tabtext.pop(0))
        html += '</tr>\n'
    ### [3] End table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    html += '</table>\n\n'
    return html
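# Hedged usage sketch for tableToHTML() above: builds a small HTML table from a tab-delimited
# string with one header row and two data rows. The sample data, column widths and table ID are
# illustrative only; it assumes the surrounding module provides the `string` and `rje` imports
# that tableToHTML() itself relies on.
def _example_tableToHTML():
    tabtext = 'Locus\tPos\tRef\nchr1\t42\tA\nchr1\t43\tG'
    html = tableToHTML(tabtext, '\t', tdwidths=['20%','10%','10%'], tdalign=['left','right','center'], tabid='snptab')
    # Expect a <table id="snptab"> with one <th> header row and two <td> data rows.
    return html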
def loadFeatures(self,ftfile):  ### Loads features from given file
    '''Loads features from given file.'''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if ftfile in ['','none']: return
        if not os.path.exists(ftfile): return self.printLog('#ERR','Features file "%s" missing' % ftfile)
        delimit = rje.delimitFromExt(filename=ftfile)
        ## ~ [1a] ~ Establish headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        headers = rje.readDelimit(open(ftfile,'r').readline(),delimit)
        mainkeys = [headers[0]]
        hmap = {}
        for h in headers: hmap[h.lower()] = h
        pos = ''    # Leader for start/end positions
        if 'ft_start' in hmap or 'ft_end' in hmap: pos = 'ft_'
        for h in ['feature','%sstart' % pos,'%send' % pos,'description']:
            if h not in hmap: return self.printLog('#ERR','No %s field detected in "%s" features file' % (h,ftfile))
            mainkeys.append(hmap[h])
        mainkeys.remove(hmap['description'])
        ### ~ [2] ~ Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ftdata = rje.dataDict(self,ftfile,mainkeys,['description'],delimit,headers,lists=True)
        (mx,mtot,fx) = (0.0,len(ftdata),0)
        for mainkey in rje.sortKeys(ftdata):
            self.progLog('\r#FT','Loading features from %s: %.2f%%' % (ftfile,mx/mtot)); mx += 100.0
            (id,ft,start,end) = string.split(mainkey,delimit)
            if id == mainkeys[0]: continue
            if id not in self.dict['Features']: self.dict['Features'][id] = []
            for desc in ftdata[mainkey][hmap['description']]:
                fx += 1
                self.dict['Features'][id].append({'Type':ft,'Start':int(start),'End':int(end),'Desc':desc})
        self.printLog('\r#FT','Loaded %s features for %s IDs from %s' % (rje.integerString(fx),rje.integerString(len(self.dict['Features'])),ftfile))
    except: self.errorLog('UniFake.loadFeatures error ["%s"]' % ftfile)
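# Hedged illustration of the input that loadFeatures() above expects: a delimited table whose
# first column holds the sequence identifier, plus (case-insensitive) 'feature', 'start'/'ft_start',
# 'end'/'ft_end' and 'description' columns. The file name, header names and rows below are invented
# for illustration; only the required field names are taken from the method itself.
def _write_example_features(ftfile='example.features.tdt'):
    rows = ['acc_num\tfeature\tft_start\tft_end\tdescription',
            'P12345\tDOMAIN\t10\t75\tExample domain annotation',
            'P12345\tMOTIF\t80\t86\tExample short motif']
    open(ftfile,'w').write('\n'.join(rows) + '\n')
    return ftfile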
def setup(self):    ### Main class setup method.
    '''Main class setup method.'''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.obj['DB'] = rje_db.Database(self.log,self.cmd_list+['tuplekeys=T'])
        if self.baseFile().lower() in ['','none']:
            self.baseFile('%s.vs.%s.Q%d' % (rje.baseFile(self.getStr('MutPileup'),True),rje.baseFile(self.getStr('WTPileup'),True),self.getInt('QCut')))
        if not self.force() and os.path.exists('%s.fdr.tdt' % self.baseFile()): return
        ### ~ [2] Look for/process WT Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.force() or not os.path.exists('%s.WT.tdt' % self.baseFile()): self.parsePileup('WT',self.getStr('WTPileup'))
        ### ~ [3] Generate Reference sequences and Major Alleles (by locus) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        refseq = {}; rx = 0
        majors = {}
        locus = None
        WTDATA = open('%s.WT.tdt' % self.baseFile(),'r'); wx = 0
        for line in WTDATA:
            self.progLog('\r#WT','Reading WT data: Reference seq length = %s nt' % (rje.iStr(rx)),rand=0.01)
            data = rje.readDelimit(line); wx += 1
            if data[0] == 'Locus': continue
            else:
                if data[0] != locus: locus = data[0]; refseq[locus] = ''; majors[locus] = []
                pos = int(data[1])
                while (pos - 1) > len(refseq[locus]): refseq[locus] += '?'; rx += 1
                while (pos - 1) > len(majors[locus]): majors[locus].append('-')
                refseq[locus] += data[2]; majors[locus].append(data[5]); rx += len(data[2])
        WTDATA.close()
        self.printLog('\r#WT','%s lines read from WT data: Reference seq length = %s nt' % (rje.iStr(wx),rje.iStr(rx)))
        for locus in rje.sortKeys(majors):
            if len(majors[locus]) != len(refseq[locus]): self.errorLog('%s WTMajor versus RefSeq length mismatch!' % locus,printerror=False); raise ValueError
        self.dict['WTMajor'] = majors
        self.dict['RefSeq'] = refseq
        ### ~ [4] Look for/process Mutant Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.force() or not os.path.exists('%s.Mut.tdt' % self.baseFile()): self.parsePileup('Mut',self.getStr('MutPileup'),True)
        return True     # Setup successful
    except: self.errorLog('Problem during %s setup.' % self); return False     # Setup failed
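# Minimal standalone sketch of the gap-padding logic used in setup() above: reference positions
# that are absent from the pileup table (no mapped reads) are padded with '?' in the reference
# sequence and '-' in the major-allele list, keeping both aligned to 1-based positions. The helper
# name and input tuples are illustrative only.
def _pad_reference(rows):
    '''rows = sorted list of (pos, ref_base, major_allele) tuples, 1-based positions.'''
    refseq = ''; majors = []
    for (pos, ref, major) in rows:
        while (pos - 1) > len(refseq): refseq += '?'        # Pad missing reference positions
        while (pos - 1) > len(majors): majors.append('-')   # Pad missing major allele calls
        refseq += ref; majors.append(major)
    return refseq, majors

# _pad_reference([(1,'A','A'), (2,'C','T'), (5,'G','G')]) -> ('AC??G', ['A','T','-','-','G'])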
def restOutput(self,outfmt=None,maxparsesize=0,asjson=False):   ### Returns rest output for outfmt
    '''Returns rest output for outfmt.'''
    if not outfmt: outfmt = self.getStrLC('Rest')
    if not outfmt: return self.jsonText('No REST output',asjson)
    if outfmt in self.dict['Output']:
        rfile = string.split(self.dict['Output'][outfmt],'\n')[0]
        if rje.exists(rfile):
            fext = string.split(rfile,'.')[-1]
            if fext in ['png']:
                self.debug(rfile)
                return self.jsonText(rfile,asjson)
            nbytes = os.path.getsize(rfile)
            if nbytes > maxparsesize > 0:   # Too large to parse
                otext = '%s is too large to return (%s > %s)' % (os.path.basename(rfile),rje.humanByteSize(nbytes),rje.humanByteSize(maxparsesize))
                try: jobid = self.dict['Output']['jobid']
                except: jobid = None
                resturl = '%sretrieve&jobid=%s&rest=%s[&password=X]' % (self.getStr('RestURL'),jobid,outfmt)
                if not jobid or outfmt == self.getStrLC('Rest'): return self.jsonText('ERROR: %s' % (otext),asjson)
                else: return self.jsonText('%s in full output. Try %s.' % (otext,resturl),asjson)
            else:
                delimit = rje.delimitFromExt(filename=rfile,write=False)
                if asjson and delimit in [',','\t']:
                    jtext = []
                    for rline in open(rfile,'r').readlines(): jtext.append(json.dumps(rje.readDelimit(rline,delimit)))
                    return '[%s]' % string.join(jtext,',\n ')
                #!# Add json parsing of fasta files?
                else:
                    outtxt = open(rfile,'r').read()
                    if not outtxt.endswith('\n'): outtxt += '\n'
                    return self.jsonText(outtxt,asjson)
        elif asjson and outfmt in self.dict['Outfile']:
            pass    #!# Sort out json formatting here based on file extension!
        return self.dict['Output'][outfmt]
    elif outfmt in ['parse','format']:
        intro = '<pre>%s</pre>\n\n' % self.restOutput('intro')
        return self.jsonText(intro,asjson)
    elif outfmt in ['default','full']: return self.jsonText(self.restFullOutput(maxparsesize),asjson)
    elif outfmt in ['restkeys','outputs']: return string.join(self.list['RestKeys']+[''],'\n')
    return self.jsonText('No %s output generated.' % outfmt,asjson)
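# Hedged sketch of the asjson branch in restOutput() above: each line of a comma- or tab-delimited
# output file becomes a JSON array, and the lines are joined into one JSON list string. This
# standalone version uses only the standard json module and plain str.split in place of
# rje.readDelimit; the helper name and default delimiter are illustrative.
import json

def _delimited_to_json(rfile, delimit='\t'):
    jtext = []
    for rline in open(rfile,'r').readlines():
        jtext.append(json.dumps(rline.rstrip('\n').split(delimit)))
    return '[%s]' % ',\n '.join(jtext)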
def pileUpFDR(self):    ### Calculates statistics of genetic differences from parsed PileUp Tables
    '''Calculates statistics of genetic differences from parsed PileUp Tables.'''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        fdrfile = '%s.fdr.tdt' % self.baseFile()
        if not self.force() and os.path.exists(fdrfile): return
        sigpval = {}    # pval:[fpos]
        npos = 0; nx = 0
        for locus in rje.sortKeys(self.dict['RefSeq']):
            npos += len(self.dict['RefSeq'][locus]) - self.dict['RefSeq'][locus].count('?')
        ### ~ [1] Parse out stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        SAMSIG = open('%s.pdiff.tdt' % self.baseFile(),'r')
        headers = string.split(SAMSIG.readline()) + ['p.FDR']
        fpos = SAMSIG.tell(); fline = SAMSIG.readline(); px = 0
        while fline:
            self.progLog('\r#SIG','Reading Pvalues: %s p <= 0.05...' % rje.iStr(px))
            try: pval = float(string.split(fline)[-1])
            except: break
            if pval <= 0.05:
                if pval not in sigpval: sigpval[pval] = []
                sigpval[pval].append(fpos); px += 1
            fpos = SAMSIG.tell(); fline = SAMSIG.readline()
        self.printLog('\r#SIG','Reading Pvalues complete: %s p <= 0.05.' % rje.iStr(px))
        ### ~ [2] Calculate FDR and output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        SAMFDR = open(fdrfile,'w')
        rje.writeDelimit(SAMFDR, headers)
        px = 0; sx = 0.0; stot = len(sigpval)
        for pval in rje.sortKeys(sigpval):
            self.progLog('\r#FDR','Calculating FDR: %.2f%%' % (sx/stot)); sx += 100.0
            px += len(sigpval[pval])
            if pval: fdr = (pval * npos) / px
            else: fdr = 0.0
            for fpos in sigpval[pval]:
                SAMSIG.seek(fpos)
                rje.writeDelimit(SAMFDR,rje.readDelimit(SAMSIG.readline())+[rje.expectString(fdr)])
        SAMSIG.close()
        SAMFDR.close()
        self.printLog('\r#FDR','%s FDR lines output to %s' % (rje.iStr(px),fdrfile))
    except: self.errorLog('%s.pileUpFDR() error' % (self)); return None
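# Hedged sketch of the FDR calculation in pileUpFDR() above: p-values are processed in ascending
# order and each significant p-value gets FDR = p * npos / rank, where npos is the number of
# testable reference positions and rank is the cumulative count of positions with a p-value this
# small or smaller (a Benjamini-Hochberg-style expected-false-positives over called-positives
# estimate). Standalone, illustrative helper; names are assumptions.
def _fdr_estimates(pvalues, npos):
    '''pvalues = list of per-position p-values; returns {pvalue: FDR} for p <= 0.05.'''
    sig = sorted([p for p in pvalues if p <= 0.05])
    fdr = {}
    rank = 0
    for p in sig:
        rank += 1
        fdr[p] = (p * npos) / rank if p else 0.0    # Ties end up with the rank of their last occurrence
    return fdr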
def splitMascot(self):  ### Reads the MASCOT file and splits into header, hits and unmatched files.
    '''Reads the MASCOT file and splits into header, hits and unmatched files.'''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        db = self.db()
        infile = self.getStr('MASCOT')
        if self.basefile().lower() in ['','none']: self.basefile(rje.baseFile(self.getStr('MASCOT')))
        #x#self.deBug(self.basefile())
        headfile = '%s.header.txt' % self.basefile()
        hitsfile = '%s.mascot.csv' % self.basefile()
        peptfile = '%s.nohits.csv' % self.basefile()
        if rje.isYounger(self.getStr('MASCOT'),hitsfile) == hitsfile and not self.force():
            return self.printLog('#FILE','%s file found (force=F)' % hitsfile)
        ### ~ [1] Split MASCOT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        headlines = []
        csvhead = []
        mdb = None
        mx = 0
        itraq = []
        prot_data = {}
        for mline in open(self.getStr('MASCOT'),'r').readlines():
            mx += 1     # Index of next line in case needed for iTRAQ reading!
            ## ~ [1a] Skip down until Header found ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if not headlines and mline.find('Header') < 0: continue
            ## ~ [1b] Add Header lines to headlines until results headers found ~~~~~~~~~~~~~~~ ##
            if not csvhead and mline.find('prot_hit_num') < 0: headlines.append(mline); continue
            ## ~ [1c] Sort out MASCOT results headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if mline.find('prot_hit_num') >= 0:
                ## ~ Read Headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                open(headfile,'w').writelines(headlines)
                csvhead = rje.readDelimit(string.join(string.split(rje.chomp(mline))),',')
                while '' in csvhead: csvhead.remove('')
                ## ~ Sort out iTRAQ headers (missing) ~~~~~~~~~ ##
                if self.getBool('iTRAQ'):
                    iline = open(self.getStr('MASCOT'),'r').readlines()[mx]
                    for isplit in rje.readDelimit(iline,',')[len(csvhead):]:    # Should be start of iTRAQ data
                        if '/' in isplit: itraq.append(isplit)
                    self.printLog('#ITRAQ',string.join(itraq))
                    csvhead += itraq
                    idb = db.addEmptyTable('itraq',['prot_hit_num','prot_acc','prot_desc','itraq','ratio','n','geomean','summary'],keys=['prot_hit_num','itraq'])
                    idb.info['Delimit'] = ','
                ## ~ Add emPAI header (also missing) ~~~~~~~~~~ ##
                if self.getBool('emPAI'): csvhead.append('empai')
                ## ~ Set up Database Table ~~~~~~~~~~~~~~~~~~~~ ##
                self.printLog('#HEAD',string.join(csvhead,'; '))
                mdb = db.addEmptyTable('mascot',csvhead,keys=['prot_hit_num','pep_query'])
                mdb.info['Delimit'] = ','
            elif mline.find('Peptide matches') >= 0:
                mdb.saveToFile()
                if self.getBool('emPAI'): csvhead.remove('empai')
                mdb = db.addEmptyTable('nohits',csvhead,keys=['pep_query'])
                for field in mdb.fields():
                    if field[:4] == 'prot': mdb.dropField(field)
                mdb.info['Delimit'] = ','
                continue
            elif rje.chomp(mline):
                #self.deBug('%s ... %s' % (mline[:20],mline.find('Peptide matches')))
                data = rje.readDelimit(mline,',')
                entry = {}; pretraq = True
                #self.deBug(csvhead); self.deBug(itraq);
                for d in range(len(csvhead)+len(itraq)):
                    if d >= len(data): break
                    if data[d] in itraq: dhead = data[d]; pretraq = False
                    elif data[d] == 'emPAI': entry['empai'] = data[d+1]; pretraq = False
                    elif pretraq and d < len(csvhead): dhead = csvhead[d]
                    elif pretraq: continue  # Unmatched peptides will not have emPAI or iTRAQ data
                    #self.deBug('%s > %s' % (data[d],dhead))
                    if d and data[d-1] == 'emPAI': continue
                    elif data[d] in itraq + ['emPAI']: continue
                    elif dhead not in entry: entry[dhead] = data[d]
                    #self.deBug('%s = %s' % (dhead,entry[dhead]))
                if entry['prot_acc']: prot_data[entry['prot_hit_num']] = {'prot_acc':entry['prot_acc'],'prot_desc':entry['prot_desc']}
                if self.getBool('iTRAQ') and 'Quantitation summary for protein' in data:
                    d = data.index('Quantitation summary for protein') + 1
                    if entry['prot_hit_num'] in prot_data:
                        pacc = prot_data[entry['prot_hit_num']]['prot_acc']
                        pdesc = prot_data[entry['prot_hit_num']]['prot_desc']
                    else:
                        pacc = entry['prot_acc']
                        pdesc = entry['prot_desc']
                    while d < len(data):
                        if data[d] in itraq:
                            idb.addEntry({'prot_hit_num':entry['prot_hit_num'],'prot_acc':pacc,'prot_desc':pdesc,
                                          'itraq':data[d],'ratio':data[d+1],'n':data[d+2],'geomean':data[d+3],'summary':data[d+4]})
                        d += 1
                #self.deBug(entry)
                if entry['prot_hit_num'] or entry['pep_query']: mdb.addEntry(entry)
        mdb.saveToFile()
        if self.getBool('iTRAQ'): idb.saveToFile()
        self.deBug('')
        return True
    except: self.errorLog('Error reading MASCOT file'); return False
def run(self):  ### Main run method
    '''Main run method.'''
    try:### ~ [1] Reformat Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for fasta in glob.glob('*.fasta'):
            fas = fasta[:-2]
            if os.path.exists(fas): continue
            sx = 0
            for line in open(fasta,'r').readlines():
                if line[:1] == '>':
                    try: (name,desc) = rje.matchExp('^>(\S+) (\S.+)$',line)
                    except: name = rje.matchExp('^>(\S+)',line)[0]
                    if len(string.split(name,'|')) == 3:
                        name = '6rf_NEIME__%s' % string.split(name,'|')[2]
                        open(fas,'a').write('>%s\n' % name)
                    elif len(string.split(name,'|')) == 5:
                        name = 'ref_NEIME__%s' % string.split(name,'|')[3]
                        open(fas,'a').write('>%s %s\n' % (name,desc))
                    else: print string.split(name,'|'); raise ValueError
                    self.progLog('\r#FAS','Processing %s: %s seqs' % (fas, rje.integerString(sx))); sx += 1
                else: open(fas,'a').write(line)
            self.printLog('\r#FAS','Processed %s: %s seqs from %s' % (fas, rje.integerString(sx), fasta))
            rje_blast.BLASTRun(self.log,self.cmd_list).formatDB(fas,protein=True,force=True)
        ### ~ [2] Read in CSV Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        rfhits = {}     # Dictionary of {hit:['File:hit_num']}
        acc = 'MC58_6RF_Hits.acc'; open(acc,'w')
        gfile = 'MC58_6RF_Hits.vs.MC58_1.hitsum.tdt'
        cx = 0
        for csv in glob.glob('MC58_6RF_CSV/*.CSV'):
            cx += 1
            file = os.path.basename(csv)[:-4]
            hits = False
            for line in open(csv,'r').readlines():
                if line.find('prot_hit_num,prot_acc') == 0: hits = True
                elif hits:
                    data = rje.readDelimit(line,',')
                    if len(data) < 2: continue
                    [num,name] = data[:2]
                    try: name = string.split(name,'|')[2]
                    except: continue
                    if name not in rfhits:
                        open(acc,'a').write('6rf_NEIME__%s\n' % name)
                        rfhits[name] = []
                    id = '%s:%s' % (file,num)
                    if id not in rfhits[name]: rfhits[name].append(id)
            self.progLog('\r#CSV','Reading %d CSV files: %s 6RF Hits' % (cx,rje.integerString(len(rfhits))))
        self.printLog('\r#CSV','Read %d CSV files: %s 6RF Hits output to %s' % (cx,rje.integerString(len(rfhits)),acc))
        ### ~ [3] Extract sequences and perform GABLAM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not os.path.exists(gfile):
            seqlist = rje_seq.SeqList(self.log,self.cmd_list+['seqin=%s' % acc,'fasdb=MC58_6RF.fas','seqout=MC58_6RF_Hits.fas','autoload=T','accnr=F','seqnr=F'])
            seqlist.info['Name'] = 'MC58_6RF_Hits.fas'
            seqlist.saveFasta()
            gablam.GABLAM(self.log,self.cmd_list+['seqin=MC58_6RF_Hits.fas','searchdb=MC58_1.fas','qryacc=F']).gablam()
        ### ~ [4] Read in GABLAM and ID Hits without genomic homology ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        gdata = rje.dataDict(self,gfile,['Qry'],['HitNum'])
        zeros = []
        for hit in gdata:
            if string.atoi(gdata[hit]['HitNum']) == 0: zeros.append(hit)
        zeros = rje.sortUnique(zeros,False)
        open('6rf_zeros.acc','w').write(string.join(zeros,'\n'))
        self.printLog('#ZERO','%d 6RF hits with 0 BLAST hits to MC58_1' % len(zeros))
        ufile = 'MC58_6RF_Zeros.vs.embl_bacteria.hitsum.tdt'
        if not os.path.exists(ufile):
            seqlist = rje_seq.SeqList(self.log,self.cmd_list+['seqin=6rf_zeros.acc','fasdb=MC58_6RF.fas','seqout=MC58_6RF_Zeros.fas','autoload=T','accnr=F','seqnr=F'])
            seqlist.info['Name'] = 'MC58_6RF_Zeros.fas'
            seqlist.saveFasta()
            gablam.GABLAM(self.log,self.cmd_list+['seqin=MC58_6RF_Zeros.fas','searchdb=/scratch/Databases/NewDB/TaxaDB/embl_bacteria.fas','qryacc=F']).gablam()
        gdata = rje.dataDict(self,ufile,['Qry'],getheaders=True)
        fdata = rje.dataDict(self,string.replace(ufile,'hitsum','gablam'),['Qry'],['Hit'],lists=True)
        headers = gdata.pop('Headers')
        headers.insert(1,'Sample')
        headers.append('BestHit')
        rje.delimitedFileOutput(self,'MC58_6RF_Zeros.tdt',headers,rje_backup=True)
        for rf in rje.sortKeys(gdata):
            rfcut = string.split(rf,'__')[1]
            gdata[rf]['Sample'] = string.join(rfhits[rfcut],'; ')
            gdata[rf]['Qry'] = rfcut
            try: gdata[rf]['BestHit'] = fdata[rf]['Hit'][0]
            except: gdata[rf]['BestHit'] = '-'
            rje.delimitedFileOutput(self,'MC58_6RF_Zeros.tdt',headers,datadict=gdata[rf])
    except: self.errorLog(rje_zen.Zen().wisdom())
    self.printLog('#ZEN',rje_zen.Zen().wisdom())
def pileUpStats(self):  ### Calculates statistics of genetic differences from parsed PileUp Tables
    '''Calculates statistics of genetic differences from parsed PileUp Tables.'''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        statfile = '%s.pdiff.tdt' % self.baseFile()
        if not self.force() and os.path.exists(statfile): return self.pileUpFDR()
        ## ~ [0a] Load WT Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        wtdata = {}     # Load lists of data for compiling
        for locus in self.dict['RefSeq']:
            wtdata[locus] = {}
            for field in ['N','QN','MajFreq']: wtdata[locus][field] = []
        WTDATA = open('%s.WT.tdt' % self.baseFile(),'r'); wx = 1
        fields = []
        for line in WTDATA:
            data = rje.readDelimit(line)
            if fields:
                locus = data[0]
                pos = int(data[1])
                while pos > wx:
                    wtdata[locus]['N'].append(0); wtdata[locus]['QN'].append(0); wtdata[locus]['MajFreq'].append(0.0); wx += 1
                for field in ['N','QN']: wtdata[locus][field].append(int(data[fields.index(field)]))
                for field in ['MajFreq']: wtdata[locus][field].append(string.atof(data[fields.index(field)]))
                wx += 1
            else: fields = data[0:]
        WTDATA.close()
        ## ~ [0b] Load Mutant Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        mutdata = {}    # Load lists of data for compiling
        for locus in self.dict['RefSeq']:
            mutdata[locus] = {}
            for field in ['N','QN','Major','MajFreq','WTFreq']: mutdata[locus][field] = []
        MUTDATA = open('%s.Mut.tdt' % self.baseFile(),'r'); mx = 1
        fields = []
        for line in MUTDATA:
            data = rje.readDelimit(line)
            if fields:
                locus = data[0]
                self.str['RefSeq'] = self.dict['RefSeq'][locus]
                pos = int(data[1])
                try:
                    if pos > len(self.str['RefSeq']):
                        while (pos-1) > len(self.str['RefSeq']): self.str['RefSeq'] += '?'
                        self.str['RefSeq'] += data[2]
                        self.dict['RefSeq'][locus] = self.str['RefSeq']
                    elif self.str['RefSeq'][pos-1] == '?':
                        self.str['RefSeq'] = self.str['RefSeq'][:pos-1] + data[2] + self.str['RefSeq'][pos:]
                        self.dict['RefSeq'][locus] = self.str['RefSeq']
                except: self.warnLog('Problem mapping Pos %s onto %snt %s RefSeq' % (rje.iStr(pos),locus,rje.iLen(self.str['RefSeq'])))
                while pos > mx:
                    mutdata[locus]['N'].append(0); mutdata[locus]['QN'].append(0); mutdata[locus]['Major'].append('-'); mutdata[locus]['MajFreq'].append(0.0); mutdata[locus]['WTFreq'].append(0.0); mx += 1
                for field in ['N','QN']: mutdata[locus][field].append(int(data[fields.index(field)]))
                for field in ['MajFreq','WTFreq']: mutdata[locus][field].append(string.atof(data[fields.index(field)]))
                for field in ['Major']: mutdata[locus][field].append(data[fields.index(field)])
                mx += 1
            else: fields = data[0:]
        MUTDATA.close()
        ## ~ [0c] Integrity check ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        #!# Need a new check with locus info #!#
        #for field in wtdata:   #!# Won't be true - not all reference genome positions present in output (0 mapped reads)
        #    if len(wtdata[field]) != len(self.str['RefSeq']): self.errorLog('Data length mismatch for WT %s' % field,printerror=False); raise ValueError
        #for field in mutdata:  #!# Won't be true - not all reference genome positions present in output (0 mapped reads)
        #    if len(mutdata[field]) != len(self.str['RefSeq']): self.errorLog('Data length mismatch for Mutant %s' % field,printerror=False); raise ValueError
        #self.printLog('#REF','WT and Mutant data for %s reference positions' % rje.iLen(self.str['RefSeq']))
        ### ~ [1] Assess and output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        SAMSIG = open('%s.pdiff.tdt' % self.baseFile(),'w')
        headers = ['Locus','Pos','Ref','WT.N','WT.QN','WT.Major','WT.MajFreq','Mut.N','Mut.QN','Mut.Major','Mut.MajFreq','Mut.WTFreq','p.Over','p.Under','p.Diff']
        SAMSIG.write('%s\n' % string.join(headers,'\t'))
        nodifx = 0; nomutx = 0; sx = 0
        for locus in rje.sortKeys(self.dict['RefSeq']):
            self.str['RefSeq'] = self.dict['RefSeq'][locus]
            self.list['WTMajor'] = self.dict['WTMajor'][locus]
            for i in range(len(self.str['RefSeq'])):
                try:
                    sigdata = [locus,i+1,self.str['RefSeq'][i],wtdata[locus]['N'][i],wtdata[locus]['QN'][i],self.list['WTMajor'][i],wtdata[locus]['MajFreq'][i],
                               mutdata[locus]['N'][i],mutdata[locus]['QN'][i],mutdata[locus]['Major'][i],mutdata[locus]['MajFreq'][i],mutdata[locus]['WTFreq'][i]]
                except: self.warnLog('Incomplete data for %s:%s (no pdiff output)' % (locus,rje.iStr(i+1))); continue
                if self.getBool('MajDif') and self.list['WTMajor'][i] == mutdata[locus]['Major'][i]: nodifx += 1; continue  # Was: sigdata += [1.0,1.0]
                elif self.getBool('MajMut') and self.str['RefSeq'][i] == mutdata[locus]['Major'][i]: nomutx += 1; continue
                elif not wtdata[locus]['MajFreq'][i]:   # No Data for WT
                    if mutdata[locus]['WTFreq'][i]: sigdata += [0.0,1.0]
                    else: sigdata += [1.0,1.0]
                elif mutdata[locus]['WTFreq'][i] > wtdata[locus]['MajFreq'][i]:
                    obs = int((mutdata[locus]['QN'][i] * mutdata[locus]['WTFreq'][i]) + 0.5)
                    sigdata.append(rje.binomial(obs,mutdata[locus]['QN'][i],wtdata[locus]['MajFreq'][i],usepoisson=False,callobj=self))
                    sigdata.append(1.0)
                elif mutdata[locus]['WTFreq'][i] < wtdata[locus]['MajFreq'][i]:
                    obs = int((mutdata[locus]['QN'][i] * mutdata[locus]['WTFreq'][i]) + 0.5)
                    sigdata.append(1.0)
                    sigdata.append(1.0 - rje.binomial(obs+1,mutdata[locus]['QN'][i],wtdata[locus]['MajFreq'][i],usepoisson=False,callobj=self))
                else: sigdata += [1.0,1.0]
                sigdata.append(min(1.0,2*min(sigdata[-2:])))
                rje.writeDelimit(SAMSIG,sigdata); sx += 1
        SAMSIG.close()
        ptxt = '%s lines output to *.pdiff.tdt' % rje.iStr(sx)
        if self.getBool('MajDif'): ptxt += '; %s positions skipped where WTMajor==MutMajor (majdif=T)' % rje.iStr(nodifx)
        if self.getBool('MajMut'): ptxt += '; %s positions skipped where Ref==MutMajor (majmut=T)' % rje.iStr(nomutx)
        self.printLog('#PDIFF','%s.' % ptxt)
        ### ~ [2] FDR Correction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.pileUpFDR()
    except: self.errorLog('%s.pileUpStats() error' % (self)); return None
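# Hedged sketch of the two-sided test in pileUpStats() above, assuming rje.binomial(k,n,p)
# returns the upper-tail probability P(X >= k) for X ~ Binomial(n,p) (that reading matches how
# it is used there). For the mutant sample's quality read count (qn) and observed WT-allele count
# (obs, typically int(qn * wtfreq + 0.5)), tested against the WT sample's major-allele frequency
# (majfreq):
#   p.Over  = P(X >= obs) if wtfreq rose above majfreq, else 1.0
#   p.Under = P(X <= obs) if wtfreq fell below majfreq, else 1.0
#   p.Diff  = min(1, 2 * min(p.Over, p.Under))
# A pure-Python binomial PMF is used here for self-containment; helper names are illustrative.
from math import lgamma, log, exp

def _binom_pmf(k, n, p):
    '''P(X == k) for X ~ Binomial(n, p), via log-gamma for numerical stability.'''
    if p <= 0.0: return 1.0 if k == 0 else 0.0
    if p >= 1.0: return 1.0 if k == n else 0.0
    logpmf = lgamma(n+1) - lgamma(k+1) - lgamma(n-k+1) + k*log(p) + (n-k)*log(1.0-p)
    return exp(logpmf)

def _pdiff(obs, qn, majfreq, wtfreq):
    '''Returns (p_over, p_under, p_diff) for obs WT-allele reads out of qn mutant quality reads.'''
    p_over = p_under = 1.0
    if wtfreq > majfreq:    # More WT allele than expected: upper tail P(X >= obs)
        p_over = sum([_binom_pmf(k, qn, majfreq) for k in range(obs, qn+1)])
    elif wtfreq < majfreq:  # Less WT allele than expected: lower tail P(X <= obs)
        p_under = sum([_binom_pmf(k, qn, majfreq) for k in range(0, obs+1)])
    return (p_over, p_under, min(1.0, 2*min(p_over, p_under)))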