Esempio n. 1
0
    def convert(self,filelist=[],outfile=None):      ### Converts scansite output files in FileList to Outfile
        '''
        Converts scansite output files in FileList to Outfile.
        >> filelist:list = scansite output files to convert. Falls back to self.list['FileList'] if empty.
           (The list default is only ever rebound, never mutated, so sharing it between calls is harmless.)
        >> outfile:str = name of the delimited output file. Falls back to self.info['Name'] if not given.
        << Returns True once all files have been processed, or False if there were no files to convert.
        Only input lines matching re_scansite are written and counted.
        Any exception is logged together with the stage at which it occurred and then re-raised.
        '''
        _stage = 'Setup'
        try:
            ### Setup ###
            if not filelist:
                filelist = self.list['FileList']
            if not outfile:
                outfile = self.info['Name']
            if not filelist:
                self.log.errorLog('No scansite files to convert! %s unchanged/not made.' % outfile,printerror=False)
                return False
            delimit = rje.getDelimit(self.cmd_list)
            ext = rje.delimitExt(delimit)
            if ext != outfile[-3:]:     # Offer to match the file extension to the delimiter in use
                newfile = outfile[:-3] + ext
                if rje.yesNo('Change file name from %s to %s?' % (outfile, newfile)):
                    outfile = newfile
            self.log.printLog('#OUT','Converting %d file(s), output to %s.' % (len(filelist),outfile))

            ### Output File ###
            _stage = 'Output File'
            addheader = not self.opt['Append'] or not os.path.exists(outfile)   # New file needs a header row
            if addheader:
                OUTFILE = open(outfile,'w')
            else:
                OUTFILE = open(outfile,'a')
            sx = 0      # Total results written
            try:
                if addheader:
                    headers = ['seq_id','enzyme','enz_group','aa','pos','score','percentile','matchseq','sa']
                    rje.writeDelimit(OUTFILE,headers,delimit)

                ### Conversion ###
                _stage = 'Conversion'
                for infile in filelist:
                    if not os.path.exists(infile):
                        self.log.errorLog('Input file %s does not exist! :o(' % infile,False,False)
                        continue
                    fx = 0      # Results written from this file
                    INFILE = open(infile,'r')
                    try:
                        inline = rje.nextLine(INFILE)
                        while inline is not None:
                            scanlist = rje.matchExp(re_scansite,inline)
                            if scanlist:    # Non-matching lines must not re-output the previous match
                                rje.writeDelimit(OUTFILE,scanlist,delimit)
                                sx += 1
                                fx += 1
                                rje.progressPrint(self,sx)
                            inline = rje.nextLine(INFILE)
                    finally:
                        INFILE.close()
                    self.log.printLog('#OUT','%s scansite results from %s. (%s Total.)' % (rje.integerString(fx),infile,rje.integerString(sx)))
            finally:
                OUTFILE.close()     # Always release the output handle, even if conversion fails

            ### End ###
            _stage = 'End'
            self.log.printLog('#OUT','%s scansite results output to %s.' % (rje.integerString(sx),outfile))
            return True
        except:
            self.log.errorLog('Error in convert(%s)' % _stage,printerror=True,quitchoice=False)
            raise
Esempio n. 2
0
 def pileUpFDR(self):  ### Calculates statistics of genetic differences from parsed PileUp Tables
     '''
     Adds an FDR value to the significant rows of the parsed PileUp difference table.
     Reads <basefile>.pdiff.tdt, keeps every row whose final column is a p-value <= 0.05 and writes those rows,
     in rje.sortKeys() order of p-value, to <basefile>.fdr.tdt with an extra p.FDR column. The FDR for a p-value
     is (pval * npos) / px, where npos is the number of non-'?' reference positions over all loci and px is the
     cumulative number of rows with a p-value up to and including this one.
     Does nothing if the output exists and self.force() is False. Reading stops at the first row whose final
     column is missing or not numeric. Errors are reported with self.errorLog(). Always returns None.
     '''
     SAMSIG = None; SAMFDR = None    # Tracked here so that the finally clause can always close them
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         fdrfile = '%s.fdr.tdt' % self.baseFile()
         if not self.force() and os.path.exists(fdrfile): return
         sigpval = {}    # pval:[fpos]
         npos = 0
         for locus in rje.sortKeys(self.dict['RefSeq']):
             npos += len(self.dict['RefSeq'][locus]) - self.dict['RefSeq'][locus].count('?')
         ### ~ [1] Parse out stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         SAMSIG = open('%s.pdiff.tdt' % self.baseFile(),'r')
         headers = SAMSIG.readline().split() + ['p.FDR']
         # File offsets (not rows) are stored, so the significant rows can be re-read in p-value order later
         fpos = SAMSIG.tell(); fline = SAMSIG.readline(); px = 0
         while fline:
             self.progLog('\r#SIG','Reading Pvalues: %s p <= 0.05...' % rje.iStr(px))
             try: pval = float(fline.split()[-1])
             except (ValueError,IndexError): break   # Blank or non-numeric final column: stop reading
             if pval <= 0.05:
                 if pval not in sigpval: sigpval[pval] = []
                 sigpval[pval].append(fpos); px += 1
             fpos = SAMSIG.tell(); fline = SAMSIG.readline()
         self.printLog('\r#SIG','Reading Pvalues complete: %s p <= 0.05.' % rje.iStr(px))
         ### ~ [2] Calculate FDR and output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         SAMFDR = open(fdrfile,'w')
         rje.writeDelimit(SAMFDR, headers)
         px = 0; sx = 0.0; stot = len(sigpval)
         for pval in rje.sortKeys(sigpval):
             # sx increases by 100 per p-value, so sx/stot is already a percentage
             self.progLog('\r#FDR','Calculating FDR: %.2f%%' % (sx/stot)); sx += 100.0
             px += len(sigpval[pval])
             if pval: fdr = (pval * npos) / px
             else: fdr = 0.0
             for fpos in sigpval[pval]:
                 SAMSIG.seek(fpos)
                 rje.writeDelimit(SAMFDR,rje.readDelimit(SAMSIG.readline())+[rje.expectString(fdr)])
         self.printLog('\r#FDR','%s FDR lines output to %s' % (rje.iStr(px),fdrfile))
     except: self.errorLog('%s.pileUpFDR() error' % (self)); return None
     finally:
         for handle in (SAMSIG,SAMFDR):
             if handle: handle.close()
Esempio n. 3
0
    def altPAM(self):   ### Alternative PAM matrix construction
        '''
        Alternative PAM matrix construction.
        Reads self.info['AltPam']: line 1 = amino acids, line 2 = codes, line 3 = frequencies, lines 4-22 = rows of
        pairwise substitution values (stored symmetrically). If self.info['SeqIn'] names an existing file, the
        frequencies are recounted from its non-'>' lines instead. A scaling factor s is then stepped down from
        0.01 in steps of 0.000001, keeping the s for which the frequency-weighted sum of self-substitution values
        is closest to 0.99. The matrix scaled by that s is written to self.info['PamOut'] (default: AltPam +
        '.pam') and self.info['Name'] is set to that file. Errors are reported through self.log.errorLog().
        '''
        try:
            ### Setup ##
            matlines = self.loadFromFile(self.info['AltPam'])
            if not matlines:
                raise IOError
            aas = matlines[0].upper().split()
            codes = matlines[1].split()
            freq = {}
            for (fx,rawfreq) in enumerate(matlines[2].split()):
                freq[aas[fx]] = float(rawfreq)
            prob = {}   # Two-letter key (from,to) => raw substitution value, stored in both directions
            for row in range(3,22):
                for (col,sub) in enumerate(matlines[row].split()):
                    subval = float(sub)
                    prob[aas[col] + aas[row-2]] = subval
                    prob[aas[row-2] + aas[col]] = subval

            def selfSubs(matrix,stopneg):   ### Fills in the self-substitution (diagonal) values of matrix
                '''Sets each XX value to 1.0 minus the other X* values. Returns False on overshoot if stopneg.'''
                for res in aas:
                    diag = res + res
                    matrix[diag] = 1.0
                    for key in prob:
                        if key[0] != res:
                            continue
                        matrix[diag] -= matrix[key]
                        if stopneg and matrix[diag] < 0.0:  # Overshot possibility
                            return False
                return True

            ### Alternative freqs ###
            seqin = self.info['SeqIn']
            if seqin.lower() not in ['','none'] and os.path.exists(seqin):
                ## Count residues in all non-name lines ##
                freq = dict.fromkeys(aas,0.0)
                for line in self.loadFromFile(seqin):
                    if line[:1] == '>':
                        continue
                    useq = line.upper()
                    for res in aas:
                        freq[res] += useq.count(res)
                ## Convert counts to frequencies ##
                total = sum(freq.values())
                if total > 0:
                    for res in aas:
                        freq[res] = freq[res] / total
                self.log.printLog('#AA','Rescaling matrix based on %s aa from %s.' % (rje.integerString(total),seqin))

            ### Calculate s ###
            scale = 0.01
            step = 0.000001
            bests = 1.000000
            bestdif = -1    # Negative = no valid scale found yet
            while scale >= step:
                scale = scale - step
                self.log.printLog('\r#WAG','Considering s = %.6f; Best s = %.6f (Dif = %.6f)' % (scale,bests,bestdif),log=False,newline=False)
                scaled = rje.scaledict(dict=prob,scale=scale)
                if not selfSubs(scaled,True):
                    continue    # Scale too big: a self-substitution value went negative
                ## PAM1 ##
                dsum = 0.0
                for res in aas:
                    dsum += freq[res] * scaled[res + res]
                dif = 0.99 - dsum
                if dif < 0:
                    dif = -dif
                if bestdif < 0 or dif < bestdif:
                    bestdif = dif
                    bests = scale

            ### Output best s ###
            self.log.printLog('\r#WAG','Considered all s <= 0.010000; Best s = %.6f (Dif = %.6f)' % (bests,bestdif))
            if self.info['PamOut'].lower() in ['','none']:
                self.info['PamOut'] = self.info['AltPam'] + '.pam'
            self.log.printLog('#PAM','Rescaled PAM matrix output to %s' % self.info['PamOut'])
            PAM = open(self.info['PamOut'],'w')
            rje.writeDelimit(PAM,aas,' ')
            final = rje.scaledict(dict=prob,scale=bests)
            selfSubs(final,False)
            for (ax,res) in enumerate(aas):
                out = [codes[ax]] + ['%.6f' % final[res + other] for other in aas]
                rje.writeDelimit(PAM,out,' ')
            PAM.close()
            self.info['Name'] = self.info['PamOut']

        except:
            self.log.errorLog('Major Error with PamCtrl.altPAM().',quitchoice=True)
Esempio n. 4
0
	def run(self):		### Main Run method
		'''
		Main Run method. Performs the first applicable task of:
		1. self.opt['SLiMDisc']: hands over to self.slimDisc() and returns its result.
		2. self.opt['Teiresias']: (re)runs TEIRESIAS if needed, reads the patterns from its output file, scores
		   them and writes either two tables (self.opt['MySQL']: *.patterns.* and *.occ.*) or one combined table
		   (self.info['Name']).
		3. self.info['Info'] != 'None': loads motifs from that file and writes name, pattern and information
		   content to *.info.*.
		<< Returns the slimDisc() result for (1), False if the Info input file is missing, otherwise None.
		Errors are logged and re-raised.
		'''
		try:
			### SLiMDisc Run ###
			if self.opt['SLiMDisc']:
				return self.slimDisc()
			
			### TEIRESIAS ###
			if self.opt['Teiresias']:
				## Setup ##
				seqlist = rje_seq.SeqList(self.log,self.cmd_list)
				infile = '%s.teiresias.fas' % rje.baseFile(seqlist.info['Name'],True)
				outfile = '%s.teiresias.out' % rje.baseFile(seqlist.info['Name'],True)
				run_teiresias = True
				if rje.isYounger(outfile,infile) == outfile:
					# Existing output is reused unless the user interactively asks for regeneration
					if self.stat['Interactive'] < 1 or not rje.yesNo('%s and %s exist already. Regenerate?' % (infile,outfile),'N'):
						run_teiresias = False
				## Run TEIRESIAS ##
				if run_teiresias:
					seqlist.saveFasta(seqfile=infile,name='Teiresias')	### Saves sequences in fasta format
					command = rje.makePath(self.info['TeiresiasPath'],True)
					command += ' -i%s -o%s %s' % (infile,outfile,self.info['TeiresiasOpt'])
					self.log.printLog('#CMD',command)
					os.system(command)
				## Read Results ##
				self.verbose(0,2,'Reading TEIRESIAS output from %s...' % outfile,1)
				self.list['Pattern'] = []
				teiresias_re = r'^(\d+)\s+(\d+)\s+(\S+)\s+(\d.+\d)$'
				RESULTS = open(outfile,'r')
				try:
					for line in RESULTS:
						patmatch = rje.matchExp(teiresias_re,line)
						if patmatch: # New pattern
							self.addTeiresiasPattern(patmatch)
						elif len(line) > 3 and line[0] != '#':
							self.log.errorLog('Did not recognise line: %s' % line,False,False)
				finally:
					RESULTS.close()
				patx = len(self.list['Pattern'])
				self.log.printLog('#PAT','%s TEIRESIAS patterns read from %s.' % (rje.integerString(patx),outfile))
				## Calculate Information Content ##
				aafreq = seqlist.aaFreq()
				self.verbose(0,3,'Calculating Information Content & Length stats...',0)
				occx = 0
				for pattern in self.list['Pattern']:
					pattern.stat['Info'] = self.calculateScore(pattern.info['Pattern'],aafreq)
					pattern._makeLength()
					occx += 1
					rje.progressPrint(self,occx,patx/100,patx/10)
				self.verbose(0,1,'...Done!',2)
				## Prepare Results ##
				delimit = rje.getDelimit(self.cmd_list)
				if self.info['Name'] == 'None':
					self.info['Name'] = '%s.teiresias.%s' % (rje.baseFile(seqlist.info['Name'],True),rje.delimitExt(delimit))
				if self.opt['MySQL']:	# Two tables
					(namebase,nameext) = os.path.splitext(self.info['Name'])
					occfile = '%s.occ%s' % (namebase,nameext)
					patfile = '%s.patterns%s' % (namebase,nameext)
					if self.opt['Append']:
						PATFILE = open(patfile,'a')
						OCCFILE = open(occfile,'a')
					else:
						PATFILE = open(patfile,'w')
						rje.writeDelimit(PATFILE,['pattern','tot_occ','seq_occ','info','len','fix','wild'],delimit)
						OCCFILE = open(occfile,'w')		# Not appending: start a fresh file, as for PATFILE
						rje.writeDelimit(OCCFILE,['seq_id','pos','pattern','pat_match'],delimit)
				else:
					if self.opt['Append']:
						RESFILE = open(self.info['Name'],'a')
					else:
						RESFILE = open(self.info['Name'],'w')	# patfile only exists in MySQL mode
						rje.writeDelimit(RESFILE,['Sequence Name','Position','Pattern','Match','Total Occurrences','Num Sequences','Information Content','Length','Fixed','Wildcard'],delimit)
				## Save Results ##
				occx = 0
				for pattern in self.list['Pattern']:
					patstats = []
					for stat in ['OccCount','SeqCount','Info','Length','Fixed','Wildcards']:
						patstats.append('%d' % pattern.stat[stat])
					patstats[2] = '%.3f' % pattern.stat['Info']		# Info is reported to 3 d.p., not as an integer
					if self.opt['MySQL']:	# Two tables
						rje.writeDelimit(PATFILE,[pattern.info['Pattern']] + patstats,delimit)
					for occ in rje.sortKeys(pattern.occ):
						seq = seqlist.seq[occ]
						for pos in pattern.occ[occ]:
							match = seq.info['Sequence'][pos:(pos+pattern.stat['Length'])]
							outlist = [seq.shortName(),'%d' % pos,pattern.info['Pattern'],match]
							if self.opt['MySQL']:	# Two tables
								rje.writeDelimit(OCCFILE,outlist,delimit)
							else:
								rje.writeDelimit(RESFILE,outlist+patstats,delimit)
							occx += 1
				if self.opt['MySQL']:	# Two tables
					PATFILE.close()
					OCCFILE.close()
					self.log.printLog('#OUT','%s patterns output to %s.' % (rje.integerString(patx),patfile))
					self.log.printLog('#OUT','%s pattern occurrences output to %s.' % (rje.integerString(occx),occfile))
				else:
					RESFILE.close()
					self.log.printLog('#OUT','%s occurrences of %s patterns output to %s.' %
									  (rje.integerString(occx),rje.integerString(patx),self.info['Name']))

			### InfoContent ###
			elif self.info['Info'] != 'None':
				## Setup ##
				alphabet = rje_seq.alph_protx 
				if not os.path.exists(self.info['Info']):
					self.log.errorLog('Input file %s missing!' % self.info['Info'],False,False)
					return False
				else:
					mypresto = presto.Presto(self.log,self.cmd_list)
					mypresto.loadMotifs(file=self.info['Info'],clear=True)
				seqlist = rje_seq.SeqList(self.log,self.cmd_list+['autoload=T'])
				if seqlist.seqNum() > 0:
					aafreq = seqlist.aaFreq(alphabet=None,fromfile=None,loadfile=None,total=False)  ### Returns dictionary of AA (& gap etc.) frequencies
				else:	# No sequences loaded: use equal frequencies across the alphabet
					aafreq = {}
					for aa in alphabet:
						aafreq[aa] = 1.0 / len(alphabet)
				alphabet = aafreq.keys()
				maxinfo = 0 
				for aa in alphabet:
					maxinfo +=  (aafreq[aa] * math.log(aafreq[aa],2))
				## Output ##
				delimit = rje.getDelimit(self.cmd_list)
				ext = rje.delimitExt(delimit)
				outfile = '%s.info.%s' % (rje.baseFile(self.info['Info'],True,['.txt','.%s' % ext]),ext)
				if self.opt['Append']:
					OUTFILE = open(outfile,'a')
				else:
					OUTFILE = open(outfile,'w')
					rje.writeDelimit(OUTFILE,['motif','pattern','info'],delimit)
				
				## Calculate Information Scores ##
				for motif in mypresto.motif:
					self.verbose(2,4,motif.info['Sequence'],0)
					pattern = string.replace(motif.info['Sequence'],'X','.')
					elements = string.split(pattern,'-')
					pattern = ''
					for el in elements:
						if el.find('.{') == 0:	# Ambiguous spacer length - compress
							pattern += '.'
						else:
							pattern += el
					self.verbose(2,2,'=> %s' % pattern,1)
					motif.stat['Info'] = self.calculateInformationContent(pattern,aafreq,maxinfo,self.stat['InfoGapPen'])
					self.verbose(0,3,'%s (%s) = %.2f' % (motif.info['Name'],pattern,motif.stat['Info']),1)
					## Output ##
					rje.writeDelimit(OUTFILE,[motif.info['Name'],pattern,'%.2f' % motif.stat['Info']],delimit)
				
				## Finish ##
				OUTFILE.close()
		except:
			self.log.errorLog('Error in run().',printerror=True,quitchoice=False)
			raise	# Delete this if method error not terrible
Esempio n. 5
0
    def _pepStats(self):      ### Peptide Distance
        '''
        Peptide Distance.
        '''
        try:
            ### Setup ###
            seqlist = rje_seq.SeqList(self.log,self.cmd_list+['autoload=T'])
            aaprop = rje_aaprop.AAPropMatrix(self.log,self.cmd_list)
            aaprop.makePropDif()
            delimit = rje.getDelimit(self.cmd_list)

            ### Output File Setup ###
            OUTFILE = open('hrb.pepstats.%s' % rje.delimitExt(delimit),'w')
            headlist = ['peptide']
            ## 10 Dimensional Peptide Property Output ##
            for property in rje.sortKeys(aaprop.prop):
                headlist.append(property.lower())
                for aa in aaprop.prop[property].keys():
                    try:
                        if aa not in ['-','X']:
                            aaprop.prop[property][aa] = string.atoi(aaprop.prop[property][aa])
                    except:
                        print aaprop.prop, property, aa, aaprop.prop[property][aa]
                        raise
            ## Additional Stats ##
            headlist.append('net_charge')
            #headlist.append('hydrophobicity')
            headlist.append('charge_balance')
            headlist.append('hydrophobic_balance')
            #headlist.append('hydrophobicity_balance')
            ## Output
            rje.writeDelimit(OUTFILE,headlist,delimit)
            
            ### Calculate stats ###
            for pep in seqlist.seq:
                pepname = pep.shortName()
                if rje.matchExp('^(\S+_\d[CQ])',pepname):
                    pepname = rje.matchExp('^(\S+_\d[CQ])',pepname)[0]
                outlist = [pepname]
                pepseq = pep.info['Sequence']
                ## 10 Dimensional Peptide Property Output ##
                for property in rje.sortKeys(aaprop.prop):
                    px = 0
                    for aa in pepseq:
                        px += aaprop.prop[property][aa]
                    outlist.append('%d' % px)
                ## Additional Stats ##
                net_charge = 0
                for aa in pepseq:
                    net_charge += (aaprop.prop['Positive'][aa] - aaprop.prop['Negative'][aa])
                outlist.append('%d' % net_charge)
                charge_balance = 0
                hydrophobic_balance = 0
                for r in range(len(pepseq)):
                    charge_balance += aaprop.prop['Charged'][pepseq[r]] * (1.0 / (r+1))
                    charge_balance -= aaprop.prop['Charged'][pepseq[r]] * (1.0 / (10-r))
                    hydrophobic_balance += aaprop.prop['Hydrophobic'][pepseq[r]] * (1.0 / (r+1))
                    hydrophobic_balance -= aaprop.prop['Hydrophobic'][pepseq[r]] * (1.0 / (10-r))
                outlist.append('%.3f' % charge_balance)
                outlist.append('%.3f' % hydrophobic_balance)
                rje.writeDelimit(OUTFILE,outlist,delimit)
                        
            ### Finish ###
            OUTFILE.close()
                    
        except:
            self.log.errorLog('Error in _pepStats',printerror=True,quitchoice=False)
            raise   # Delete this if method error not terrible
Esempio n. 6
0
 def pileUpStats(self):  ### Calculates statistics of genetic differences from parsed PileUp Tables
     '''
     Calculates statistics of genetic differences from parsed PileUp Tables.
     Loads per-position data from <basefile>.WT.tdt and <basefile>.Mut.tdt into per-locus lists (positions missing
     from a file are padded with zero/'-' values), fills unknown ('?') or missing reference positions from the
     Mut table, then writes one row per reference position to <basefile>.pdiff.tdt with p.Over, p.Under and
     p.Diff (= min(1.0, 2 * the smaller of the two)) columns. Positions can be skipped with the MajDif and MajMut
     settings. Finishes by calling self.pileUpFDR().
     If the output exists and self.force() is False, only self.pileUpFDR() is called.
     Errors are reported with self.errorLog(). Returns None.
     '''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         statfile = '%s.pdiff.tdt' % self.baseFile()
         if not self.force() and os.path.exists(statfile): return self.pileUpFDR()
         ## ~ [0a] Load WT Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         wtdata = {}     # {locus:{field:[value for each position]}}
         for locus in self.dict['RefSeq']:
             wtdata[locus] = {}
             for field in ['N','QN','MajFreq']: wtdata[locus][field] = []
         WTDATA = open('%s.WT.tdt' % self.baseFile(),'r')
         try:
             fields = []
             for line in WTDATA:
                 data = rje.readDelimit(line)
                 if not fields: fields = data[0:]; continue    # First line gives the field names
                 locus = data[0]
                 pos = int(data[1])
                 # Pad positions absent from the file. The next expected position is tracked per locus (list
                 # length + 1) so that gaps in second and later loci are padded too.
                 while pos > len(wtdata[locus]['N']) + 1:
                     wtdata[locus]['N'].append(0); wtdata[locus]['QN'].append(0); wtdata[locus]['MajFreq'].append(0.0)
                 for field in ['N','QN']: wtdata[locus][field].append(int(data[fields.index(field)]))
                 for field in ['MajFreq']: wtdata[locus][field].append(float(data[fields.index(field)]))
         finally: WTDATA.close()
         ## ~ [0b] Load Mutant Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         mutdata = {}    # {locus:{field:[value for each position]}}
         for locus in self.dict['RefSeq']:
             mutdata[locus] = {}
             for field in ['N','QN','Major','MajFreq','WTFreq']: mutdata[locus][field] = []
         MUTDATA = open('%s.Mut.tdt' % self.baseFile(),'r')
         try:
             fields = []
             for line in MUTDATA:
                 data = rje.readDelimit(line)
                 if not fields: fields = data[0:]; continue    # First line gives the field names
                 locus = data[0]
                 self.str['RefSeq'] = self.dict['RefSeq'][locus]
                 pos = int(data[1])
                 try:    # Extend the reference, or replace an unknown ('?') position, using this row's sequence
                     if pos > len(self.str['RefSeq']):
                         while (pos-1) > len(self.str['RefSeq']): self.str['RefSeq'] += '?'
                         self.str['RefSeq'] += data[2]
                         self.dict['RefSeq'][locus] = self.str['RefSeq']
                     elif self.str['RefSeq'][pos-1] == '?':
                         self.str['RefSeq'] = self.str['RefSeq'][:pos-1] + data[2] + self.str['RefSeq'][pos:]
                         self.dict['RefSeq'][locus] = self.str['RefSeq']
                 except: self.warnLog('Problem mapping Pos %s onto %snt %s RefSeq' % (rje.iStr(pos),locus,rje.iLen(self.str['RefSeq'])))
                 # Per-locus padding, as for the WT data
                 while pos > len(mutdata[locus]['N']) + 1:
                     mutdata[locus]['N'].append(0); mutdata[locus]['QN'].append(0); mutdata[locus]['Major'].append('-'); mutdata[locus]['MajFreq'].append(0.0); mutdata[locus]['WTFreq'].append(0.0)
                 for field in ['N','QN']: mutdata[locus][field].append(int(data[fields.index(field)]))
                 for field in ['MajFreq','WTFreq']: mutdata[locus][field].append(float(data[fields.index(field)]))
                 for field in ['Major']: mutdata[locus][field].append(data[fields.index(field)])
         finally: MUTDATA.close()
         ## ~ [0c] Integrity check ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         #!# Need a new check with locus info #!#
         ### ~ [1] Assess and output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         nodifx = 0; nomutx = 0; sx = 0
         SAMSIG = open(statfile,'w')
         try:
             headers = ['Locus','Pos','Ref','WT.N','WT.QN','WT.Major','WT.MajFreq','Mut.N','Mut.QN','Mut.Major','Mut.MajFreq','Mut.WTFreq','p.Over','p.Under','p.Diff']
             SAMSIG.write('%s\n' % '\t'.join(headers))
             for locus in rje.sortKeys(self.dict['RefSeq']):
                 self.str['RefSeq'] = self.dict['RefSeq'][locus]
                 self.list['WTMajor'] = self.dict['WTMajor'][locus]
                 for i in range(len(self.str['RefSeq'])):
                     try:
                         sigdata = [locus,i+1,self.str['RefSeq'][i],wtdata[locus]['N'][i],wtdata[locus]['QN'][i],self.list['WTMajor'][i],wtdata[locus]['MajFreq'][i],
                                    mutdata[locus]['N'][i],mutdata[locus]['QN'][i],mutdata[locus]['Major'][i],mutdata[locus]['MajFreq'][i],mutdata[locus]['WTFreq'][i]]
                     except: self.warnLog('Incomplete data for %s:%s (no pdiff output)' % (locus,rje.iStr(i+1))); continue
                     if self.getBool('MajDif') and self.list['WTMajor'][i] == mutdata[locus]['Major'][i]: nodifx += 1; continue   # Was: sigdata += [1.0,1.0]
                     elif self.getBool('MajMut') and self.str['RefSeq'][i] == mutdata[locus]['Major'][i]: nomutx += 1;continue
                     elif not wtdata[locus]['MajFreq'][i]:    # No Data for WT
                         if mutdata[locus]['WTFreq'][i]: sigdata += [0.0,1.0]
                         else: sigdata += [1.0,1.0]
                     elif mutdata[locus]['WTFreq'][i] > wtdata[locus]['MajFreq'][i]:
                         # obs = Mut reads carrying the WT major allele, rounded to the nearest integer
                         obs = int((mutdata[locus]['QN'][i] * mutdata[locus]['WTFreq'][i]) + 0.5)
                         sigdata.append(rje.binomial(obs,mutdata[locus]['QN'][i],wtdata[locus]['MajFreq'][i],usepoisson=False,callobj=self))
                         sigdata.append(1.0)
                     elif mutdata[locus]['WTFreq'][i] < wtdata[locus]['MajFreq'][i]:
                         obs = int((mutdata[locus]['QN'][i] * mutdata[locus]['WTFreq'][i]) + 0.5)
                         sigdata.append(1.0)
                         sigdata.append(1.0 - rje.binomial(obs+1,mutdata[locus]['QN'][i],wtdata[locus]['MajFreq'][i],usepoisson=False,callobj=self))
                     else: sigdata += [1.0,1.0]
                     sigdata.append(min(1.0,2*min(sigdata[-2:])))    # p.Diff from p.Over and p.Under
                     rje.writeDelimit(SAMSIG,sigdata); sx += 1
         finally: SAMSIG.close()
         ptxt = '%s lines output to *.pdiff.tdt' % rje.iStr(sx)
         if self.getBool('MajDif'): ptxt += '; %s positions skipped where WTMajor==MutMajor (majdif=T)' % rje.iStr(nodifx)
         if self.getBool('MajMut'): ptxt += '; %s positions skipped where Ref==MutMajor (majmut=T)' % rje.iStr(nomutx)
         self.printLog('#PDIFF','%s.' % ptxt)
         ### ~ [2] FDR Correction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.pileUpFDR()
     except: self.errorLog('%s.pileUpStats() error' % (self)); return None
Esempio n. 7
0
 def parsePileup(self,tname,filename,wtdb=None):  ### Extracts, filters and processes PileUp data
     '''
     Extracts, filters and processes PileUp data.
     >> tname:str = name for the (empty) database table; also used in the output file names.
     >> filename:str = pileup file to parse. Whitespace-split columns are read as:
        locus, position, reference, no. reads, read data, quality scores.
     >> wtdb = if true, a WTFreq field is added, giving the frequency of the self.dict['WTMajor'] allele. [None]
     << Returns the table object made by self.db().addEmptyTable(), or None if an error occurred.
     Parsed entries are written to <basefile>.<tname>.tdt rather than added to the table, and counts of each
     quality score are written to <basefile>.<tname>.QC.tdt.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         table = self.db().addEmptyTable(tname,['Locus','Pos','Seq','N','QN','Major','MajFreq'],keys=['Locus','Pos'])
         qc = []     # qc[q-1] = number of reads seen with quality score q
         if wtdb: table.addField('WTFreq')
         PILEUP = open(filename,'r'); px = 0; ex = 0     # px = lines read; ex = entries output
         PILEOUT = open('%s.%s.tdt' % (self.baseFile(),tname),'w')
         rje.writeDelimit(PILEOUT,outlist=table.fields(),delimit='\t')
         locus = None
         refseq = ''     # Reference sequence of the current locus; built up but not read within this method
         majors = []     # Major alleles of the current locus; built up but not read within this method
         ### ~ [2] Process each entry ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for line in PILEUP:
             # Split line up into data. Should be: locus, position, reference, no. reads, read data, qualscores
             data = string.split(rje.chomp(line))
             if not data: break      # NB. Parsing stops at the first blank line
             self.progLog('\r#PARSE','Parsing %s: %s pos...' % (filename,rje.iStr(px)),rand=0.01); px += 1
             ## ~ [2a] Extract Read Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             entry = {'Locus':data[0],'Pos':int(data[1]),'Seq':data[2],'N':int(data[3]),'QN':0}
             if entry['Locus'] != locus: locus = entry['Locus']; refseq = ''; majors = []     # New locus: reset
             refseq += data[2]
             #entry => 'Ref','Pos','Seq','N','Reads','Qual'
             rseq = data[4]
             reads = []      # One element per read (deletions are added as extra elements)
             delx = 0        # Number of deletion elements added to reads
             # Consume the read string from the front, one symbol (or indel block) at a time
             while rseq:                    
                 try:
                     # '.' or ',' = read matches the reference
                     if rseq[:1] in ['.',',']: reads.append(entry['Seq']); rseq = rseq[1:]
                     # '^' plus the following character are skipped without adding a read
                     elif rseq[:1] == '^': rseq = rseq[2:]
                     #elif rseq[:1] == '*':
                     #    reads.append('-1%s' % entry['Seq'].upper())
                     #    rseq = rseq[1:]
                     # '-'/'+' are followed by a length and then that many indel characters
                     elif rseq[:1] in ['-','+']:
                         ilen = string.atoi(rje.matchExp('^(\d+)',rseq[1:])[0])
                         indel = rseq[len('%s' % ilen)+1:][:ilen]
                         #self.deBug('%s: %s' % (rseq,indel))
                         if rseq[:1] == '-':     # Deletion: stored as its own element, e.g. '-2AT'
                             delx += 1
                             reads.append(rseq[:len('%s' % ilen)+ilen+1].upper())
                         else:                   # Insertion: appended to the preceding read
                             reads[-1] += indel.upper()
                         #self.deBug(reads[-1])
                         rseq = rseq[len('%s' % ilen)+ilen+1:]
                     # '$' is skipped without adding a read
                     elif rseq[:1] in ['$']: rseq = rseq[1:]
                     else:   # Any other character is taken as the (upper case) read nucleotide
                         if rseq[0].upper() not in 'ATGCN*': print ' ???', rseq[0].upper(), '???'
                         reads.append(rseq[0].upper()); rseq = rseq[1:]
                 except:
                     self.errorLog('!')
                     self.deBug(rseq)
                     raise ValueError
             # Deletion elements are additional to the N reads reported for the position
             if len(reads) != (entry['N'] + delx):
                 self.deBug('%s = %d' % (data[4],entry['N']))
                 self.deBug('%s = %d' % (reads,len(reads)))
                 self.errorLog('Read versus Read Count mismatch for %s Pos %s' % (table.name(),entry['Pos']),printerror=False)
                 raise ValueError
             ## ~ [2b] Convert Quality Scores ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             qual = []       # Quality score for each element of reads
             for q in data[5]:
                 # Gaps do not have a quality score, so fill these in first
                 while len(qual) < len(reads) and reads[len(qual)][0] == '-': qual.append(self.getInt('QCut'))
                 # Then append actual qv
                 qual.append(ord(q) - 33)
                 # Extend qc up to this score and count it at index score-1
                 # NOTE(review): a score of 0 gives index -1 (last element, or IndexError if qc is empty) - confirm
                 qc += [0] * (qual[-1] - len(qc)); qc[qual[-1]-1] += 1
             # Deletion elements at the end of reads also need a filler score
             while len(qual) < len(reads) and reads[len(qual)][0] == '-': qual.append(self.getInt('QCut'))
             while '*' in reads: reads[reads.index('*')] = '-'   #'-1%s' % entry['Seq'].upper()
             if len(reads) != len(qual):
                 self.deBug('%s = %d' % (reads,len(reads)))
                 self.deBug('%s = %d' % (qual,len(qual)))
                 self.deBug(data)
                 self.errorLog('Read versus Quality length mismatch for %s Pos %s' % (table.name(),entry['Pos']),printerror=False)
                 raise ValueError
             ## ~ [2c] Filter low quality ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             # Hard-coded debugging output for one specific position
             if entry['Pos'] in [190359]:    #100,98901,183697,169284,
                 self.deBug(qual)
                 self.deBug(reads)
                 self.deBug(qc)
             # Remove (from back) any reads than do not meet QV cutoff
             for r in range(len(qual)-1,-1,-1):
                 if qual[r] < self.getInt('QCut'): qual.pop(r); reads.pop(r)
             entry['QN'] = len(reads)    # Number of reads passing the quality filter
             ## ~ [2d] Major Allele ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             alleles = {}    # Dictionary of {nt:count}
             # Setup major allele
             if reads: major = reads[0]
             else: major = '-'; alleles[major] = 0
             # Cycle through reads. Keep most abundant allele as major - or reference allele if tied.
             for read in reads:
                 if read in alleles: alleles[read] += 1
                 else: alleles[read] = 1
                 if alleles[read] > alleles[major] or (read == entry['Seq'] and alleles[read] == alleles[major]): major = read
             entry['Major'] = major
             majors.append(major)
             # Major allele frequency: the non-major proportion is raised to at least MinFreq before subtracting from 1
             if reads: entry['MajFreq'] = 1.0 - max(self.getNum('MinFreq'),(len(reads) - alleles[major]) / float(len(reads)))
             else: entry['MajFreq'] = 0.0
             if wtdb:
                 try:
                     # Frequency of the WT major allele at this position, calculated as for MajFreq
                     wtmajor = self.dict['WTMajor'][locus][entry['Pos']-1]
                     if wtmajor in alleles and reads: entry['WTFreq'] = 1.0 - max(self.getNum('MinFreq'),(len(reads) - alleles[wtmajor]) / float(len(reads)))
                     else: entry['WTFreq'] = 0.0
                     # NOTE(review): self.debug() here versus self.deBug() elsewhere - confirm both methods exist
                     if wtmajor != major: self.debug(entry)
                     elif locus == 'chrIV_S288C__BK006938.2' and entry['Pos'] == 271733: self.debug(entry)
                 except: self.warnLog('WTFreq Error (%s:Pos=%d) [Probably no WT read mapped]' % (locus,entry['Pos'])); entry['WTFreq'] = 0.0
             # Hard-coded debugging output for one specific position
             if entry['Pos'] in [190359]:    #100,98901,183697,169284,
                 self.deBug(qual)
                 self.deBug(reads)
                 self.deBug(alleles)
                 self.deBug(entry)
                 self.deBug(line)
             # Entries are written straight to file rather than stored in the table
             #table.addEntry(entry)
             outlist = []
             for field in table.fields(): outlist.append(entry[field])
             rje.writeDelimit(PILEOUT,outlist,delimit='\t'); ex += 1
         self.printLog('\r#PARSE','Parsed %s: %s entries from %s lines.' % (filename,rje.iStr(ex),rje.iStr(px)))
         PILEOUT.close()
         PILEUP.close()
         ### ~ [3] Save QC ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         QC = open('%s.%s.QC.tdt' % (self.baseFile(),tname),'w')
         QC.write('Qual\tCount\n')
         for q in range(len(qc)):    # Index q holds the count for quality score q+1
             try: QC.write('%d\t%d\n' % (q+1,qc[q]))
             except: self.errorLog('!')
         QC.close()
         return table
     except: self.errorLog('%s.parsePileup(%s) error' % (self,filename)); return None