def init_basicVars(self, xOffset, sequence, ploidy, windowOverlap, readLen, coverageDat):
    """Reset per-window state: per-ploid sequence copies, variant lists,
    position blacklists, and (unless onlyVCF) coverage-sampling helpers."""
    self.x = xOffset
    self.ploidy = ploidy
    self.readLen = readLen
    self.seqLen = len(sequence)
    # one mutable copy of the reference sequence per ploid
    self.sequences = [bytearray(sequence) for _ in xrange(self.ploidy)]
    self.indelList = [[] for _ in xrange(self.ploidy)]
    self.snpList = [[] for _ in xrange(self.ploidy)]
    self.allCigar = [[] for _ in xrange(self.ploidy)]
    self.adj = [None for _ in xrange(self.ploidy)]
    # blackList codes per position: 0 = free, 1 = indel here, 2 = snp here, 3 = unusable
    self.blackList = [np.zeros(self.seqLen, dtype='<i4') for _ in xrange(self.ploidy)]

    # mark the two positions flanking the window-overlap point as off-limits
    self.winBuffer = windowOverlap
    for ploid in xrange(self.ploidy):
        self.blackList[ploid][-self.winBuffer] = 3
        self.blackList[ploid][-self.winBuffer - 1] = 3

    # coverage-depth machinery is skipped when only a VCF is being produced
    if not self.onlyVCF:
        self.windowSize, coverage_vals = coverageDat
        self.win_per_read = int(self.readLen / float(self.windowSize) + 0.5)
        self.which_bucket = DiscreteDistribution(coverage_vals, range(len(coverage_vals)))
	def init_basicVars(self, xOffset, sequence, ploidy, windowOverlap, readLen, coverageDat):
		"""Reset all per-window bookkeeping for this container."""
		self.x = xOffset
		self.ploidy = ploidy
		self.readLen = readLen
		self.seqLen = len(sequence)
		# one mutable copy of the reference sequence per ploid
		self.sequences = [bytearray(sequence) for _ in xrange(self.ploidy)]
		self.indelList = [[] for _ in xrange(self.ploidy)]
		self.snpList = [[] for _ in xrange(self.ploidy)]
		self.allCigar = [[] for _ in xrange(self.ploidy)]
		self.adj = [None for _ in xrange(self.ploidy)]
		# blackList codes per position: 0 free, 1 indel, 2 snp, 3 unusable
		self.blackList = [np.zeros(self.seqLen, dtype='<i4') for _ in xrange(self.ploidy)]

		# forbid mutations at the two positions flanking the window overlap point
		self.winBuffer = windowOverlap
		for ploid in xrange(self.ploidy):
			for offset in (self.winBuffer, self.winBuffer + 1):
				self.blackList[ploid][-offset] = 3

		# coverage-depth machinery is skipped in VCF-only mode
		if not self.onlyVCF:
			self.windowSize, coverage_vals = coverageDat
			self.win_per_read = int(self.readLen / float(self.windowSize) + 0.5)
			self.which_bucket = DiscreteDistribution(coverage_vals, range(len(coverage_vals)))
Example #3
0
	def init_trinucBias(self):
		"""Build the per-ploid positional SNP bias from trinucleotide context.

		Note: indels are inserted before snps, so these biases may be slightly
		stale at indel-affected positions; treated as negligible.
		"""
		lo = self.winBuffer + 1
		hi = self.seqLen - 1
		self.trinuc_bias = [None] * self.ploidy
		for ploid in xrange(self.ploidy):
			seq = self.sequences[ploid]
			rates = self.models[ploid][7]
			# weight of each interior position = p(the trinuc centered there mutates)
			bias = [rates[ALL_IND[str(seq[pos - 1:pos + 2])]] for pos in xrange(lo, hi)]
			self.trinuc_bias[ploid] = DiscreteDistribution(bias, range(lo, hi))
    def init_mutModels(self, mutationModels, mutRate):
        if mutationModels == []:
            ml = [copy.deepcopy(DEFAULT_MODEL_1) for n in xrange(self.ploidy)]
            self.modelData = ml[:self.ploidy]
        else:
            if len(mutationModels) != self.ploidy:
                print '\nError: Number of mutation models recieved is not equal to specified ploidy\n'
                exit(1)
            self.modelData = copy.deepcopy(mutationModels)

        # do we need to rescale mutation frequencies?
        mutRateSum = sum([n[0] for n in self.modelData])
        self.mutRescale = mutRate
        if self.mutRescale == None:
            self.mutScalar = 1.0
        else:
            self.mutScalar = float(
                self.mutRescale) / (mutRateSum / float(len(self.modelData)))

        # how are mutations spread to each ploid, based on their specified mut rates?
        self.ploidMutFrac = [float(n[0]) / mutRateSum for n in self.modelData]
        self.ploidMutPrior = DiscreteDistribution(self.ploidMutFrac,
                                                  range(self.ploidy))

        # init mutation models
        #
        # self.models[ploid][0] = average mutation rate
        # self.models[ploid][1] = p(mut is homozygous | mutation occurs)
        # self.models[ploid][2] = p(mut is indel | mut occurs)
        # self.models[ploid][3] = p(insertion | indel occurs)
        # self.models[ploid][4] = distribution of insertion lengths
        # self.models[ploid][5] = distribution of deletion lengths
        # self.models[ploid][6] = distribution of trinucleotide SNP transitions
        # self.models[ploid][7] = p(trinuc mutates)
        self.models = []
        for n in self.modelData:
            self.models.append([
                self.mutScalar * n[0], n[1], n[2], n[3],
                DiscreteDistribution(n[5], n[4]),
                DiscreteDistribution(n[7], n[6]), []
            ])
            for m in n[8]:
                self.models[-1][6].append([
                    DiscreteDistribution(m[0], NUCL),
                    DiscreteDistribution(m[1], NUCL),
                    DiscreteDistribution(m[2], NUCL),
                    DiscreteDistribution(m[3], NUCL)
                ])
            self.models[-1].append([m for m in n[9]])
	def init_mutModels(self,mutationModels,mutRate):
		if mutationModels == []:
			ml = [copy.deepcopy(DEFAULT_MODEL_1) for n in xrange(self.ploidy)]
			self.modelData = ml[:self.ploidy]
		else:
			if len(mutationModels) != self.ploidy:
				print '\nError: Number of mutation models recieved is not equal to specified ploidy\n'
				exit(1)
			self.modelData = copy.deepcopy(mutationModels)

		# do we need to rescale mutation frequencies?
		mutRateSum = sum([n[0] for n in self.modelData])
		self.mutRescale = mutRate
		if self.mutRescale == None:
			self.mutScalar = 1.0
		else:
			self.mutScalar = float(self.mutRescale)/(mutRateSum/float(len(self.modelData)))

		# how are mutations spread to each ploid, based on their specified mut rates?
		self.ploidMutFrac  = [float(n[0])/mutRateSum for n in self.modelData]
		self.ploidMutPrior = DiscreteDistribution(self.ploidMutFrac,range(self.ploidy))

		# init mutation models
		#
		# self.models[ploid][0] = average mutation rate
		# self.models[ploid][1] = p(mut is homozygous | mutation occurs)
		# self.models[ploid][2] = p(mut is indel | mut occurs)
		# self.models[ploid][3] = p(insertion | indel occurs)
		# self.models[ploid][4] = distribution of insertion lengths
		# self.models[ploid][5] = distribution of deletion lengths
		# self.models[ploid][6] = distribution of trinucleotide SNP transitions
		# self.models[ploid][7] = p(trinuc mutates)
		self.models = []
		for n in self.modelData:
			self.models.append([self.mutScalar*n[0],n[1],n[2],n[3],DiscreteDistribution(n[5],n[4]),DiscreteDistribution(n[7],n[6]),[]])
			for m in n[8]:
				self.models[-1][6].append([DiscreteDistribution(m[0],NUCL),
					                       DiscreteDistribution(m[1],NUCL),
					                       DiscreteDistribution(m[2],NUCL),
					                       DiscreteDistribution(m[3],NUCL)])
			self.models[-1].append([m for m in n[9]])
Example #6
0
	[GC_SCALE_COUNT, GC_SCALE_VAL] = pickle.load(open(GC_BIAS_MODEL,'rb'))
	GC_WINDOW_SIZE = GC_SCALE_COUNT[-1]

#	fragment length distribution
#
if PAIRED_END and not(PAIRED_END_ARTIFICIAL):
	print 'Using empirical fragment length distribution.'
	# model file holds two parallel lists: candidate fragment lengths and their probabilities
	[potential_vals, potential_prob] = pickle.load(open(FRAGLEN_MODEL,'rb'))
	FRAGLEN_VALS = []
	FRAGLEN_PROB = []
	# keep only fragment lengths strictly longer than the read length
	for i in xrange(len(potential_vals)):
		if potential_vals[i] > READLEN:
			FRAGLEN_VALS.append(potential_vals[i])
			FRAGLEN_PROB.append(potential_prob[i])
	# should probably add some validation and sanity-checking code here...
	FRAGLEN_DISTRIBUTION = DiscreteDistribution(FRAGLEN_PROB,FRAGLEN_VALS)
	# representative fragment size = value at the mean index of the weighted list
	FRAGMENT_SIZE = FRAGLEN_VALS[mean_ind_of_weighted_list(FRAGLEN_PROB)]

#	Indicate not writing FASTQ reads
#
if NO_FASTQ:
	print 'Bypassing FASTQ generation...'

"""************************************************
****            HARD-CODED CONSTANTS
************************************************"""


# target window size for read sampling. how many times bigger than read/frag length
WINDOW_TARGET_SCALE = 100
# sub-window size for read sampling windows. this is basically the finest resolution
Example #7
0
	def __init__(self, readLen, errorModel, reScaledError):
		"""Load a sequencing error model from a pickle file.

		readLen:       length of reads to generate quality strings for
		errorModel:    path to the pickled error-model file
		reScaledError: desired average error rate, or None to use the model's own

		The pickle's length determines its flavor:
		  4 entries -> uniform-error single-ended model (e.g. PacBio)
		  6 entries -> one positional q-score model used for both strands
		  8 entries -> separate q-score models for forward / reverse strands
		"""
		self.readLen = readLen

		errorDat = pickle.load(open(errorModel,'rb'))
		self.UNIFORM = False
		if len(errorDat) == 4:		# uniform-error SE reads (e.g. PacBio)
			self.UNIFORM = True
			[Qscores,offQ,avgError,errorParams] = errorDat
			# single q-score equivalent to the model's average error rate
			self.uniform_qscore = int(-10.*np.log10(avgError)+0.5)
			print 'Using uniform sequencing error model. (q='+str(self.uniform_qscore)+'+'+str(offQ)+', p(err)={0:0.2f}%)'.format(100.*avgError)
		if len(errorDat) == 6:		# only 1 q-score model present, use same model for both strands
			[initQ1,probQ1,Qscores,offQ,avgError,errorParams] = errorDat
			self.PE_MODELS = False
		elif len(errorDat) == 8:	# found a q-score model for both forward and reverse strands
			#print 'Using paired-read quality score profiles...'
			[initQ1,probQ1,initQ2,probQ2,Qscores,offQ,avgError,errorParams] = errorDat
			self.PE_MODELS = True
			if len(initQ1) != len(initQ2) or len(probQ1) != len(probQ2):
				print '\nError: R1 and R2 quality score models are of different length.\n'
				exit(1)


		# error probability for each possible q-score: p = 10^(-q/10)
		self.qErrRate = [0.]*(max(Qscores)+1)
		for q in Qscores:
			self.qErrRate[q] = 10.**(-q/10.)
		self.offQ = offQ

		# errorParams = [SSE_PROB, SIE_RATE, SIE_PROB, SIE_VAL, SIE_INS_FREQ, SIE_INS_NUCL]
		self.errP   = errorParams
		# substitution transition distributions (one per reference nucleotide)
		self.errSSE = [DiscreteDistribution(n,NUCL) for n in self.errP[0]]
		# sequencing-indel length distribution
		self.errSIE = DiscreteDistribution(self.errP[2],self.errP[3])
		# inserted-nucleotide distribution for insertion errors
		self.errSIN = DiscreteDistribution(self.errP[5],NUCL)

		# adjust sequencing error frequency to match desired rate
		if reScaledError == None:
			self.errorScale = 1.0
		else:
			self.errorScale = reScaledError/avgError
			print 'Warning: Quality scores no longer exactly representative of error probability. Error model scaled by {0:.3f} to match desired rate...'.format(self.errorScale)

		if self.UNIFORM == False:
			# adjust length to match desired read length
			if self.readLen == len(initQ1):
				self.qIndRemap = range(self.readLen)
			else:
				print 'Warning: Read length of error model ('+str(len(initQ1))+') does not match -R value ('+str(self.readLen)+'), rescaling model...'
				self.qIndRemap = [max([1,len(initQ1)*n/readLen]) for n in xrange(readLen)]

			# initialize probability distributions
			self.initDistByPos1        = [DiscreteDistribution(initQ1[i],Qscores) for i in xrange(len(initQ1))]
			self.probDistByPosByPrevQ1 = [None]
			for i in xrange(1,len(initQ1)):
				self.probDistByPosByPrevQ1.append([])
				for j in xrange(len(initQ1[0])):
					if np.sum(probQ1[i][j]) <= 0.:	# if we don't have sufficient data for a transition, use the previous qscore
						self.probDistByPosByPrevQ1[-1].append(DiscreteDistribution([1],[Qscores[j]],degenerateVal=Qscores[j]))
					else:
						self.probDistByPosByPrevQ1[-1].append(DiscreteDistribution(probQ1[i][j],Qscores))

			if self.PE_MODELS:
				self.initDistByPos2        = [DiscreteDistribution(initQ2[i],Qscores) for i in xrange(len(initQ2))]
				self.probDistByPosByPrevQ2 = [None]
				for i in xrange(1,len(initQ2)):
					self.probDistByPosByPrevQ2.append([])
					for j in xrange(len(initQ2[0])):
						if np.sum(probQ2[i][j]) <= 0.:	# if we don't have sufficient data for a transition, use the previous qscore
							self.probDistByPosByPrevQ2[-1].append(DiscreteDistribution([1],[Qscores[j]],degenerateVal=Qscores[j]))
						else:
							self.probDistByPosByPrevQ2[-1].append(DiscreteDistribution(probQ2[i][j],Qscores))
Example #8
0
class ReadContainer:
	"""Holds a sequencing error model loaded from a pickle; samples quality
	strings and sequencing errors (substitutions / indels) for reads."""
	def __init__(self, readLen, errorModel, reScaledError):
		"""Load a sequencing error model from a pickle file.

		readLen:       length of reads to generate quality strings for
		errorModel:    path to the pickled error-model file
		reScaledError: desired average error rate, or None to use the model's own

		The pickle's length determines its flavor:
		  4 entries -> uniform-error single-ended model (e.g. PacBio)
		  6 entries -> one positional q-score model used for both strands
		  8 entries -> separate q-score models for forward / reverse strands
		"""
		self.readLen = readLen

		errorDat = pickle.load(open(errorModel,'rb'))
		self.UNIFORM = False
		if len(errorDat) == 4:		# uniform-error SE reads (e.g. PacBio)
			self.UNIFORM = True
			[Qscores,offQ,avgError,errorParams] = errorDat
			# single q-score equivalent to the model's average error rate
			self.uniform_qscore = int(-10.*np.log10(avgError)+0.5)
			print 'Using uniform sequencing error model. (q='+str(self.uniform_qscore)+'+'+str(offQ)+', p(err)={0:0.2f}%)'.format(100.*avgError)
		if len(errorDat) == 6:		# only 1 q-score model present, use same model for both strands
			[initQ1,probQ1,Qscores,offQ,avgError,errorParams] = errorDat
			self.PE_MODELS = False
		elif len(errorDat) == 8:	# found a q-score model for both forward and reverse strands
			#print 'Using paired-read quality score profiles...'
			[initQ1,probQ1,initQ2,probQ2,Qscores,offQ,avgError,errorParams] = errorDat
			self.PE_MODELS = True
			if len(initQ1) != len(initQ2) or len(probQ1) != len(probQ2):
				print '\nError: R1 and R2 quality score models are of different length.\n'
				exit(1)


		# error probability for each possible q-score: p = 10^(-q/10)
		self.qErrRate = [0.]*(max(Qscores)+1)
		for q in Qscores:
			self.qErrRate[q] = 10.**(-q/10.)
		self.offQ = offQ

		# errorParams = [SSE_PROB, SIE_RATE, SIE_PROB, SIE_VAL, SIE_INS_FREQ, SIE_INS_NUCL]
		self.errP   = errorParams
		# substitution transition distributions (one per reference nucleotide)
		self.errSSE = [DiscreteDistribution(n,NUCL) for n in self.errP[0]]
		# sequencing-indel length distribution
		self.errSIE = DiscreteDistribution(self.errP[2],self.errP[3])
		# inserted-nucleotide distribution for insertion errors
		self.errSIN = DiscreteDistribution(self.errP[5],NUCL)

		# adjust sequencing error frequency to match desired rate
		if reScaledError == None:
			self.errorScale = 1.0
		else:
			self.errorScale = reScaledError/avgError
			print 'Warning: Quality scores no longer exactly representative of error probability. Error model scaled by {0:.3f} to match desired rate...'.format(self.errorScale)

		if self.UNIFORM == False:
			# adjust length to match desired read length
			if self.readLen == len(initQ1):
				self.qIndRemap = range(self.readLen)
			else:
				print 'Warning: Read length of error model ('+str(len(initQ1))+') does not match -R value ('+str(self.readLen)+'), rescaling model...'
				self.qIndRemap = [max([1,len(initQ1)*n/readLen]) for n in xrange(readLen)]

			# initialize probability distributions
			self.initDistByPos1        = [DiscreteDistribution(initQ1[i],Qscores) for i in xrange(len(initQ1))]
			self.probDistByPosByPrevQ1 = [None]
			for i in xrange(1,len(initQ1)):
				self.probDistByPosByPrevQ1.append([])
				for j in xrange(len(initQ1[0])):
					if np.sum(probQ1[i][j]) <= 0.:	# if we don't have sufficient data for a transition, use the previous qscore
						self.probDistByPosByPrevQ1[-1].append(DiscreteDistribution([1],[Qscores[j]],degenerateVal=Qscores[j]))
					else:
						self.probDistByPosByPrevQ1[-1].append(DiscreteDistribution(probQ1[i][j],Qscores))

			if self.PE_MODELS:
				self.initDistByPos2        = [DiscreteDistribution(initQ2[i],Qscores) for i in xrange(len(initQ2))]
				self.probDistByPosByPrevQ2 = [None]
				for i in xrange(1,len(initQ2)):
					self.probDistByPosByPrevQ2.append([])
					for j in xrange(len(initQ2[0])):
						if np.sum(probQ2[i][j]) <= 0.:	# if we don't have sufficient data for a transition, use the previous qscore
							self.probDistByPosByPrevQ2[-1].append(DiscreteDistribution([1],[Qscores[j]],degenerateVal=Qscores[j]))
						else:
							self.probDistByPosByPrevQ2[-1].append(DiscreteDistribution(probQ2[i][j],Qscores))

	def getSequencingErrors(self, readData, isReverseStrand=False):
		"""Sample a quality string and sequencing errors for one read.

		readData:        the read sequence (indexable as byte values)
		isReverseStrand: when PE models are loaded, use the R2 quality model

		Returns (qualOut, errorsOut): qualOut is the ascii quality string,
		errorsOut a list of (type 'S'/'I'/'D', len, pos, ref, alt) tuples.
		"""
		qOut = [0]*self.readLen
		sErr = []

		if self.UNIFORM:
			# every position gets the same q-score; errors drawn at a flat rate
			myQ  = [self.uniform_qscore + self.offQ for n in xrange(self.readLen)]
			qOut = ''.join([chr(n) for n in myQ])
			for i in xrange(self.readLen):
				if random.random() < self.errorScale*self.qErrRate[self.uniform_qscore]:
					sErr.append(i)
		else:
			# markov chain over q-scores: each position's q depends on the previous one
			if self.PE_MODELS and isReverseStrand:
				myQ = self.initDistByPos2[0].sample()
			else:
				myQ = self.initDistByPos1[0].sample()

			if random.random() < self.qErrRate[myQ]:
				sErr.append(0)
			qOut[0] = myQ + self.offQ
			for i in xrange(1,self.readLen):

				if self.PE_MODELS and isReverseStrand:
					myQ = self.probDistByPosByPrevQ2[self.qIndRemap[i]][myQ].sample()
				else:
					myQ = self.probDistByPosByPrevQ1[self.qIndRemap[i]][myQ].sample()

				if random.random() < self.errorScale*self.qErrRate[myQ]:
					sErr.append(i)
				qOut[i] = myQ + self.offQ
			qOut = ''.join([chr(n) for n in qOut])

		if self.errorScale == 0.0:
			return (qOut,[])

		sOut = []
		nDelSoFar = 0
		# don't allow indel errors to occur on subsequent positions
		prevIndel = -2
		# don't allow other sequencing errors to occur on bases removed by deletion errors
		delBlacklist = []

		for ind in sErr[::-1]:	# for each error that we're going to insert...

			# determine error type
			isSub = True
			if ind != 0 and ind != self.readLen-1-max(self.errP[3]) and abs(ind-prevIndel) > 1:
				if random.random() < self.errP[1]:
					isSub = False

			# errorOut = (type, len, pos, ref, alt)

			if isSub:								# insert substitution error
				myNucl  = chr(readData[ind])
				newNucl = self.errSSE[NUC_IND[myNucl]].sample()
				sOut.append(('S',1,ind,myNucl,newNucl))
			else:									# insert indel error
				indelLen = self.errSIE.sample()
				if random.random() < self.errP[4]:		# insertion error
					myNucl  = chr(readData[ind])
					newNucl = myNucl + ''.join([self.errSIN.sample() for n in xrange(indelLen)])
					sOut.append(('I',len(newNucl)-1,ind,myNucl,newNucl))
				elif ind < self.readLen-2-nDelSoFar:	# deletion error (prevent too many of them from stacking up)
					myNucl  = str(readData[ind:ind+indelLen+1])
					newNucl = chr(readData[ind])
					nDelSoFar += len(myNucl)-1
					sOut.append(('D',len(myNucl)-1,ind,myNucl,newNucl))
					for i in xrange(ind+1,ind+indelLen+1):
						delBlacklist.append(i)
				prevIndel = ind

		# remove blacklisted errors
		for i in xrange(len(sOut)-1,-1,-1):
			if sOut[i][2] in delBlacklist:
				del sOut[i]

		return (qOut,sOut)
Example #9
0
	def init_coverage(self,coverageDat,fragDist=None):
		"""Precompute per-ploid coverage-sampling distributions.

		coverageDat: (windowSize, gc_scalars, targetCov_vals)
		fragDist:    empirical fragment-length distribution for paired-end
		             mode, or None for single-ended reads

		Returns the mean per-base coverage scalar across ploids (implicitly
		None in onlyVCF mode, where all of this is skipped).
		"""
		# if we're only creating a vcf, skip some expensive initialization related to coverage depth
		if not self.onlyVCF:
			(self.windowSize, gc_scalars, targetCov_vals) = coverageDat
			gcCov_vals = [[] for n in self.sequences]
			trCov_vals = [[] for n in self.sequences]
			self.coverage_distribution = []
			avg_out = []
			for i in xrange(len(self.sequences)):
				# compute gc-bias
				j = 0
				while j+self.windowSize < len(self.sequences[i]):
					gc_c = self.sequences[i][j:j+self.windowSize].count('G') + self.sequences[i][j:j+self.windowSize].count('C')
					gcCov_vals[i].extend([gc_scalars[gc_c]]*self.windowSize)
					j += self.windowSize
				# final (partial) window reuses the gc-count of the last windowSize bases
				gc_c = self.sequences[i][-self.windowSize:].count('G') + self.sequences[i][-self.windowSize:].count('C')
				gcCov_vals[i].extend([gc_scalars[gc_c]]*(len(self.sequences[i])-len(gcCov_vals[i])))
				#
				# targeted-coverage track, read off the FM_pos/FM_span index maps
				trCov_vals[i].append(targetCov_vals[0])
				prevVal = self.FM_pos[i][0]
				for j in xrange(1,len(self.sequences[i])-self.readLen):
					if self.FM_pos[i][j] == None:
						trCov_vals[i].append(targetCov_vals[prevVal])
					else:
						trCov_vals[i].append(sum(targetCov_vals[self.FM_pos[i][j]:self.FM_span[i][j]])/float(self.FM_span[i][j]-self.FM_pos[i][j]))
						prevVal = self.FM_pos[i][j]
					#print (i,j), self.adj[i][j], self.allCigar[i][j], self.FM_pos[i][j], self.FM_span[i][j]
				# shift by half of read length
				trCov_vals[i] = [0.0]*int(self.readLen/2) + trCov_vals[i][:-int(self.readLen/2.)]
				# fill in missing indices
				trCov_vals[i].extend([0.0]*(len(self.sequences[i])-len(trCov_vals[i])))

				# cumulative sum lets each read-window's coverage be computed in O(1)
				covvec = np.cumsum([trCov_vals[i][nnn]*gcCov_vals[i][nnn] for nnn in xrange(len(trCov_vals[i]))])
				coverage_vals = []
				for j in xrange(0,len(self.sequences[i])-self.readLen):
					coverage_vals.append(covvec[j+self.readLen] - covvec[j])
				avg_out.append(np.mean(coverage_vals)/float(self.readLen))

				if fragDist == None:
					self.coverage_distribution.append(DiscreteDistribution(coverage_vals,range(len(coverage_vals))))
			
				# fragment length nightmare
				else:
					# quantize fragment lengths into percentile buckets so only one
					# coverage distribution per representative length is needed
					currentThresh = 0.
					index_list    = [0]
					for j in xrange(len(fragDist.cumP)):
						if fragDist.cumP[j] >= currentThresh + COV_FRAGLEN_PERCENTILE/100.0:
							currentThresh = fragDist.cumP[j]
							index_list.append(j)
					flq = [fragDist.values[nnn] for nnn in index_list]
					if fragDist.values[-1] not in flq:
						flq.append(fragDist.values[-1])
					flq.append(LARGE_NUMBER)

					# map every possible fragment length to its nearest quantized value
					self.fraglens_indMap = {}
					for j in fragDist.values:
						bInd = bisect.bisect(flq,j)
						if abs(flq[bInd-1] - j) <= abs(flq[bInd] - j):
							self.fraglens_indMap[j] = flq[bInd-1]
						else:
							self.fraglens_indMap[j] = flq[bInd]

					# one coverage distribution per quantized fragment length
					self.coverage_distribution.append({})
					for flv in sorted(list(set(self.fraglens_indMap.values()))):
						buffer_val = self.readLen
						for j in fragDist.values:
							if self.fraglens_indMap[j] == flv and j > buffer_val:
								buffer_val = j
						# combined coverage of both mates of a fragment of length flv
						coverage_vals = []
						for j in xrange(len(self.sequences[i])-buffer_val):
							coverage_vals.append(covvec[j+self.readLen] - covvec[j] + covvec[j+flv] - covvec[j+flv-self.readLen])

						# EXPERIMENTAL
						#quantized_covVals = quantize_list(coverage_vals)
						#self.coverage_distribution[i][flv] = DiscreteDistribution([n[2] for n in quantized_covVals],[(n[0],n[1]) for n in quantized_covVals])

						# TESTING
						#import matplotlib.pyplot as mpl
						#print len(coverage_vals),'-->',len(quantized_covVals)
						#mpl.figure(0)
						#mpl.plot(range(len(coverage_vals)),coverage_vals)
						#for qcv in quantized_covVals:
						#	mpl.plot([qcv[0],qcv[1]+1],[qcv[2],qcv[2]],'r')
						#mpl.show()
						#exit(1)

						self.coverage_distribution[i][flv] = DiscreteDistribution(coverage_vals,range(len(coverage_vals)))

			return np.mean(avg_out)
Example #10
0
class SequenceContainer:
	def __init__(self, xOffset, sequence, ploidy, windowOverlap, readLen, mutationModels=[], mutRate=None, onlyVCF=False):
		"""Set up a mutable window of sequence plus its mutation machinery."""
		self.onlyVCF = onlyVCF
		# basic per-window state (sequence copies, blacklists, ...)
		self.init_basicVars(xOffset, sequence, ploidy, windowOverlap, readLen)
		# per-ploid mutation models
		self.init_mutModels(mutationModels, mutRate)
		# poisson draws decide how many indels / snps each ploid receives
		self.init_poisson()
		self.indelsToAdd = [pois.sample() for pois in self.ind_pois]
		self.snpsToAdd = [pois.sample() for pois in self.snp_pois]
		# positional snp bias from trinucleotide context
		self.init_trinucBias()

	def init_basicVars(self, xOffset, sequence, ploidy, windowOverlap, readLen):
		"""Reset per-window bookkeeping: sequence copies, variant lists, blacklists."""
		self.x = xOffset
		self.ploidy = ploidy
		self.readLen = readLen
		self.seqLen = len(sequence)
		# one mutable copy of the reference sequence per ploid
		self.sequences = [bytearray(sequence) for _ in xrange(self.ploidy)]
		self.indelList = [[] for _ in xrange(self.ploidy)]
		self.snpList = [[] for _ in xrange(self.ploidy)]
		self.allCigar = [[] for _ in xrange(self.ploidy)]
		self.FM_pos = [[] for _ in xrange(self.ploidy)]
		self.FM_span = [[] for _ in xrange(self.ploidy)]
		self.adj = [None for _ in xrange(self.ploidy)]
		# blackList codes per position: 0 free, 1 indel, 2 snp, 3 unusable
		self.blackList = [np.zeros(self.seqLen, dtype='<i4') for _ in xrange(self.ploidy)]

		# forbid mutations at the two positions flanking the window overlap point
		self.winBuffer = windowOverlap
		for ploid in xrange(self.ploidy):
			for offset in (self.winBuffer, self.winBuffer + 1):
				self.blackList[ploid][-offset] = 3

	def init_coverage(self,coverageDat,fragDist=None):
		"""Precompute per-ploid coverage-sampling distributions.

		coverageDat: (windowSize, gc_scalars, targetCov_vals)
		fragDist:    empirical fragment-length distribution for paired-end
		             mode, or None for single-ended reads

		Returns the mean per-base coverage scalar across ploids (implicitly
		None in onlyVCF mode, where all of this is skipped).
		"""
		# if we're only creating a vcf, skip some expensive initialization related to coverage depth
		if not self.onlyVCF:
			(self.windowSize, gc_scalars, targetCov_vals) = coverageDat
			gcCov_vals = [[] for n in self.sequences]
			trCov_vals = [[] for n in self.sequences]
			self.coverage_distribution = []
			avg_out = []
			for i in xrange(len(self.sequences)):
				# compute gc-bias
				j = 0
				while j+self.windowSize < len(self.sequences[i]):
					gc_c = self.sequences[i][j:j+self.windowSize].count('G') + self.sequences[i][j:j+self.windowSize].count('C')
					gcCov_vals[i].extend([gc_scalars[gc_c]]*self.windowSize)
					j += self.windowSize
				# final (partial) window reuses the gc-count of the last windowSize bases
				gc_c = self.sequences[i][-self.windowSize:].count('G') + self.sequences[i][-self.windowSize:].count('C')
				gcCov_vals[i].extend([gc_scalars[gc_c]]*(len(self.sequences[i])-len(gcCov_vals[i])))
				#
				# targeted-coverage track, read off the FM_pos/FM_span index maps
				trCov_vals[i].append(targetCov_vals[0])
				prevVal = self.FM_pos[i][0]
				for j in xrange(1,len(self.sequences[i])-self.readLen):
					if self.FM_pos[i][j] == None:
						trCov_vals[i].append(targetCov_vals[prevVal])
					else:
						trCov_vals[i].append(sum(targetCov_vals[self.FM_pos[i][j]:self.FM_span[i][j]])/float(self.FM_span[i][j]-self.FM_pos[i][j]))
						prevVal = self.FM_pos[i][j]
					#print (i,j), self.adj[i][j], self.allCigar[i][j], self.FM_pos[i][j], self.FM_span[i][j]
				# shift by half of read length
				trCov_vals[i] = [0.0]*int(self.readLen/2) + trCov_vals[i][:-int(self.readLen/2.)]
				# fill in missing indices
				trCov_vals[i].extend([0.0]*(len(self.sequences[i])-len(trCov_vals[i])))

				# cumulative sum lets each read-window's coverage be computed in O(1)
				covvec = np.cumsum([trCov_vals[i][nnn]*gcCov_vals[i][nnn] for nnn in xrange(len(trCov_vals[i]))])
				coverage_vals = []
				for j in xrange(0,len(self.sequences[i])-self.readLen):
					coverage_vals.append(covvec[j+self.readLen] - covvec[j])
				avg_out.append(np.mean(coverage_vals)/float(self.readLen))

				if fragDist == None:
					self.coverage_distribution.append(DiscreteDistribution(coverage_vals,range(len(coverage_vals))))
			
				# fragment length nightmare
				else:
					# quantize fragment lengths into percentile buckets so only one
					# coverage distribution per representative length is needed
					currentThresh = 0.
					index_list    = [0]
					for j in xrange(len(fragDist.cumP)):
						if fragDist.cumP[j] >= currentThresh + COV_FRAGLEN_PERCENTILE/100.0:
							currentThresh = fragDist.cumP[j]
							index_list.append(j)
					flq = [fragDist.values[nnn] for nnn in index_list]
					if fragDist.values[-1] not in flq:
						flq.append(fragDist.values[-1])
					flq.append(LARGE_NUMBER)

					# map every possible fragment length to its nearest quantized value
					self.fraglens_indMap = {}
					for j in fragDist.values:
						bInd = bisect.bisect(flq,j)
						if abs(flq[bInd-1] - j) <= abs(flq[bInd] - j):
							self.fraglens_indMap[j] = flq[bInd-1]
						else:
							self.fraglens_indMap[j] = flq[bInd]

					# one coverage distribution per quantized fragment length
					self.coverage_distribution.append({})
					for flv in sorted(list(set(self.fraglens_indMap.values()))):
						buffer_val = self.readLen
						for j in fragDist.values:
							if self.fraglens_indMap[j] == flv and j > buffer_val:
								buffer_val = j
						# combined coverage of both mates of a fragment of length flv
						coverage_vals = []
						for j in xrange(len(self.sequences[i])-buffer_val):
							coverage_vals.append(covvec[j+self.readLen] - covvec[j] + covvec[j+flv] - covvec[j+flv-self.readLen])

						# EXPERIMENTAL
						#quantized_covVals = quantize_list(coverage_vals)
						#self.coverage_distribution[i][flv] = DiscreteDistribution([n[2] for n in quantized_covVals],[(n[0],n[1]) for n in quantized_covVals])

						# TESTING
						#import matplotlib.pyplot as mpl
						#print len(coverage_vals),'-->',len(quantized_covVals)
						#mpl.figure(0)
						#mpl.plot(range(len(coverage_vals)),coverage_vals)
						#for qcv in quantized_covVals:
						#	mpl.plot([qcv[0],qcv[1]+1],[qcv[2],qcv[2]],'r')
						#mpl.show()
						#exit(1)

						self.coverage_distribution[i][flv] = DiscreteDistribution(coverage_vals,range(len(coverage_vals)))

			return np.mean(avg_out)

	def init_mutModels(self,mutationModels,mutRate):
		if mutationModels == []:
			ml = [copy.deepcopy(DEFAULT_MODEL_1) for n in xrange(self.ploidy)]
			self.modelData = ml[:self.ploidy]
		else:
			if len(mutationModels) != self.ploidy:
				print '\nError: Number of mutation models recieved is not equal to specified ploidy\n'
				exit(1)
			self.modelData = copy.deepcopy(mutationModels)

		# do we need to rescale mutation frequencies?
		mutRateSum = sum([n[0] for n in self.modelData])
		self.mutRescale = mutRate
		if self.mutRescale == None:
			self.mutScalar = 1.0
		else:
			self.mutScalar = float(self.mutRescale)/(mutRateSum/float(len(self.modelData)))

		# how are mutations spread to each ploid, based on their specified mut rates?
		self.ploidMutFrac  = [float(n[0])/mutRateSum for n in self.modelData]
		self.ploidMutPrior = DiscreteDistribution(self.ploidMutFrac,range(self.ploidy))

		# init mutation models
		#
		# self.models[ploid][0] = average mutation rate
		# self.models[ploid][1] = p(mut is homozygous | mutation occurs)
		# self.models[ploid][2] = p(mut is indel | mut occurs)
		# self.models[ploid][3] = p(insertion | indel occurs)
		# self.models[ploid][4] = distribution of insertion lengths
		# self.models[ploid][5] = distribution of deletion lengths
		# self.models[ploid][6] = distribution of trinucleotide SNP transitions
		# self.models[ploid][7] = p(trinuc mutates)
		self.models = []
		for n in self.modelData:
			self.models.append([self.mutScalar*n[0],n[1],n[2],n[3],DiscreteDistribution(n[5],n[4]),DiscreteDistribution(n[7],n[6]),[]])
			for m in n[8]:
				self.models[-1][6].append([DiscreteDistribution(m[0],NUCL),
					                       DiscreteDistribution(m[1],NUCL),
					                       DiscreteDistribution(m[2],NUCL),
					                       DiscreteDistribution(m[3],NUCL)])
			self.models[-1].append([m for m in n[9]])

	def init_poisson(self):
		"""Build per-ploid poisson samplers for indel and snp counts."""
		k_range = range(int(self.seqLen * MAX_MUTFRAC))
		self.ind_pois = []
		self.snp_pois = []
		for i in xrange(len(self.models)):
			avg_rate = self.models[i][0]
			indel_frac = self.models[i][2]
			frac = self.ploidMutFrac[i]
			# expected mutation count on this ploid, split into indel vs snp portions
			self.ind_pois.append(poisson_list(k_range, self.seqLen * avg_rate * indel_frac * frac))
			self.snp_pois.append(poisson_list(k_range, self.seqLen * avg_rate * (1. - indel_frac) * frac))

	def init_trinucBias(self):
		"""Build the per-ploid positional SNP bias from trinucleotide context.

		Note: indels are inserted before snps, so these biases may be slightly
		stale at indel-affected positions; treated as negligible.
		"""
		lo = self.winBuffer + 1
		hi = self.seqLen - 1
		self.trinuc_bias = [None] * self.ploidy
		for ploid in xrange(self.ploidy):
			seq = self.sequences[ploid]
			rates = self.models[ploid][7]
			# weight of each interior position = p(the trinuc centered there mutates)
			bias = [rates[ALL_IND[str(seq[pos - 1:pos + 2])]] for pos in xrange(lo, hi)]
			self.trinuc_bias[ploid] = DiscreteDistribution(bias, range(lo, hi))

	def update(self, xOffset, sequence, ploidy, windowOverlap, readLen, mutationModels=[], mutRate=None):
		"""Re-point this container at a new sequence window, reusing cached
		models and poisson samplers when their inputs are unchanged."""
		# a changed ploidy / rate / model set forces a model rebuild
		models_changed = (ploidy != self.ploidy or mutRate != self.mutRescale or mutationModels != [])
		if models_changed:
			self.ploidy = ploidy
			self.mutRescale = mutRate
			self.init_mutModels(mutationModels, mutRate)
		# a changed window length invalidates the poisson samplers
		if len(sequence) != self.seqLen:
			self.seqLen = len(sequence)
			self.init_poisson()
		# always rebuild per-window basics and redraw variant counts
		self.init_basicVars(xOffset, sequence, ploidy, windowOverlap, readLen)
		self.indelsToAdd = [pois.sample() for pois in self.ind_pois]
		self.snpsToAdd = [pois.sample() for pois in self.snp_pois]
		# positional snp bias (skipped when trinuc context is ignored)
		if not IGNORE_TRINUC:
			self.init_trinucBias()

	def insert_mutations(self, inputList):
		#
		#	TODO!!!!!! user-input variants, determine which ploid to put it on, etc..
		#
		for inpV in inputList:
			whichPloid = []
			wps = inpV[4][0]
			if wps == None:	# if no genotype given, assume heterozygous and choose a single ploid based on their mut rates
				whichPloid.append(self.ploidMutPrior.sample())
				whichAlt = [0]
			else:
				#if 'WP=' in wps:
				#	whichPloid = [int(n) for n in inpV[-1][3:].split(',') if n == '1']
				#	print 'WHICH:', whichPloid
				#	whichAlt   = [0]*len(whichPloid)
				#elif '/' in wps or '|' in wps:
				if '/' in wps or '|' in wps:
					if '/' in wps:
						splt = wps.split('/')
					else:
						splt = wps.split('|')
					whichPloid = []
					whichAlt   = []
					for i in xrange(len(splt)):
						if splt[i] == '1':
							whichPloid.append(i)
						#whichAlt.append(int(splt[i])-1)
					# assume we're just using first alt for inserted variants?
					whichAlt = [0 for n in whichPloid]
				else:	# otherwise assume monoploidy
					whichPloid = [0]
					whichAlt   = [0]

			# ignore invalid ploids
			for i in xrange(len(whichPloid)-1,-1,-1):
				if whichPloid[i] >= self.ploidy:
					del whichPloid[i]
						
			for i in xrange(len(whichPloid)):
				p = whichPloid[i]
				myAlt = inpV[2][whichAlt[i]]
				myVar = (inpV[0]-self.x,inpV[1],myAlt)
				inLen = max([len(inpV[1]),len(myAlt)])
				#print myVar, chr(self.sequences[p][myVar[0]])
				if myVar[0] < 0 or myVar[0] >= len(self.blackList[p]):
					print '\nError: Attempting to insert variant out of window bounds:'
					print myVar, '--> blackList[0:'+str(len(self.blackList[p]))+']\n'
					exit(1)
				if len(inpV[1]) == 1 and len(myAlt) == 1:
					if self.blackList[p][myVar[0]]:
						continue
					self.snpList[p].append(myVar)
					self.blackList[p][myVar[0]] = 2
				else:
					for k in xrange(myVar[0],myVar[0]+inLen+1):
						if self.blackList[p][k]:
							continue
					for k in xrange(myVar[0],myVar[0]+inLen+1):
						self.blackList[p][k] = 1
					self.indelList[p].append(myVar)

	def random_mutations(self):
		"""Generate random SNPs/indels, merge them with user-inserted variants,
		apply everything to each ploid's sequence, and precompute the per-position
		metadata needed for read sampling (adj offsets, cigar strings, FM_pos/FM_span).

		Returns a position-sorted list of tuples:
		    (absolute_pos, ref, alt, allele_freq, 'WP=<per-ploid genotype>')
		"""
		
		#	add random indels
		all_indels  = [[] for n in self.sequences]
		for i in xrange(self.ploidy):
			for j in xrange(self.indelsToAdd[i]):
				if random.random() <= self.models[i][1]:	# insert homozygous indel
					whichPloid = range(self.ploidy)
				else:								# insert heterozygous indel
					whichPloid = [self.ploidMutPrior.sample()]

				# try to find suitable places to insert indels
				eventPos = -1
				for attempt in xrange(MAX_ATTEMPTS):
					eventPos = random.randint(self.winBuffer,self.seqLen-1)
					for p in whichPloid:
						if self.blackList[p][eventPos]:
							eventPos = -1
					if eventPos != -1:
						break
				if eventPos == -1:
					continue

				if random.random() <= self.models[i][3]:	# insertion
					inLen   = self.models[i][4].sample()
					# sequence content of random insertions is uniformly random (change this later)
					inSeq   = ''.join([random.choice(NUCL) for n in xrange(inLen)])
					refNucl = chr(self.sequences[i][eventPos])
					myIndel = (eventPos,refNucl,refNucl+inSeq)
				else:										# deletion
					inLen   = self.models[i][5].sample()
					if eventPos+inLen+1 >= len(self.sequences[i]):	# skip if deletion too close to boundary
						continue
					if inLen == 1:
						inSeq = chr(self.sequences[i][eventPos+1])
					else:
						inSeq = str(self.sequences[i][eventPos+1:eventPos+inLen+1])
					refNucl = chr(self.sequences[i][eventPos])
					myIndel = (eventPos,refNucl+inSeq,refNucl)

				# if event too close to boundary, skip. if event conflicts with other indel, skip.
				skipEvent = False
				if eventPos+len(myIndel[1]) >= self.seqLen-self.winBuffer-1:
					skipEvent = True
				if skipEvent:
					continue
				for p in whichPloid:
					for k in xrange(eventPos,eventPos+inLen+1):
						if self.blackList[p][k]:
							skipEvent = True
				if skipEvent:
					continue

				for p in whichPloid:
					for k in xrange(eventPos,eventPos+inLen+1):
						self.blackList[p][k] = 1
					all_indels[p].append(myIndel)

		# merge in user-inserted indels; sort descending so applying them later
		# doesn't shift the coordinates of not-yet-applied (lower) positions
		for i in xrange(len(all_indels)):
			all_indels[i].extend(self.indelList[i])
		all_indels = [sorted(n,reverse=True) for n in all_indels]
		#print all_indels

		#	add random snps
		all_snps  = [[] for n in self.sequences]
		for i in xrange(self.ploidy):
			for j in xrange(self.snpsToAdd[i]):
				if random.random() <= self.models[i][1]:	# insert homozygous SNP
					whichPloid = range(self.ploidy)
				else:								# insert heterozygous SNP
					whichPloid = [self.ploidMutPrior.sample()]

				# try to find suitable places to insert snps
				eventPos = -1
				for attempt in xrange(MAX_ATTEMPTS):
					# based on the mutation model for the specified ploid, choose a SNP location based on trinuc bias
					# (if there are multiple ploids, choose one at random)
					if IGNORE_TRINUC:
						eventPos = random.randint(self.winBuffer+1,self.seqLen-2)
					else:
						ploid_to_use = whichPloid[random.randint(0,len(whichPloid)-1)]
						eventPos     = self.trinuc_bias[ploid_to_use].sample()
					for p in whichPloid:
						if self.blackList[p][eventPos]:
							eventPos = -1
					if eventPos != -1:
						break
				if eventPos == -1:
					continue

				refNucl = chr(self.sequences[i][eventPos])
				context = str(chr(self.sequences[i][eventPos-1])+chr(self.sequences[i][eventPos+1]))
				# sample from tri-nucleotide substitution matrices to get SNP alt allele
				newNucl = self.models[i][6][TRI_IND[context]][NUC_IND[refNucl]].sample()
				mySNP   = (eventPos,refNucl,newNucl)

				for p in whichPloid:
					all_snps[p].append(mySNP)
					self.blackList[p][mySNP[0]] = 2

		# combine random snps with inserted snps, remove any snps that overlap indels
		for p in xrange(len(all_snps)):
			all_snps[p].extend(self.snpList[p])
			all_snps[p] = [n for n in all_snps[p] if self.blackList[p][n[0]] != 1]

		# modify reference sequences
		for i in xrange(len(all_snps)):
			for j in xrange(len(all_snps[i])):
				# sanity checking (for debugging purposes)
				vPos = all_snps[i][j][0]
				if all_snps[i][j][1] != chr(self.sequences[i][vPos]):
					print '\nError: Something went wrong!\n', all_snps[i][j], chr(self.sequences[i][vPos]),'\n'
					exit(1)
				else:
					self.sequences[i][vPos] = all_snps[i][j][2]

		# apply indels (highest position first) and record (pos, net length change)
		adjToAdd = [[] for n in xrange(self.ploidy)]
		for i in xrange(len(all_indels)):
			for j in xrange(len(all_indels[i])):
				# sanity checking (for debugging purposes)
				vPos  = all_indels[i][j][0]
				vPos2 = vPos + len(all_indels[i][j][1])
				#print all_indels[i][j], str(self.sequences[i][vPos:vPos2])
				#print len(self.sequences[i]),'-->',
				if all_indels[i][j][1] != str(self.sequences[i][vPos:vPos2]):
					print '\nError: Something went wrong!\n', all_indels[i][j], str(self.sequences[i][vPos:vPos2]),'\n'
					exit(1)
				else:
					self.sequences[i] = self.sequences[i][:vPos] + bytearray(all_indels[i][j][2]) + self.sequences[i][vPos2:]
					adjToAdd[i].append((all_indels[i][j][0],len(all_indels[i][j][2])-len(all_indels[i][j][1])))
				#print len(self.sequences[i])
			adjToAdd[i].sort()
			#print adjToAdd[i]

			# adj[i][j] = cumulative indel-induced offset between this ploid's
			# coordinates and reference coordinates at position j
			self.adj[i] = np.zeros(len(self.sequences[i]),dtype='<i4')
			indSoFar = 0
			valSoFar = 0
			for j in xrange(len(self.adj[i])):
				if indSoFar < len(adjToAdd[i]) and j >= adjToAdd[i][indSoFar][0]+1:
					valSoFar += adjToAdd[i][indSoFar][1]
					indSoFar += 1
				self.adj[i][j] = valSoFar

			# precompute cigar strings (we can skip this is going for only vcf output)
			if not self.onlyVCF:
				# expand adj[] deltas into a per-base symbol list: 'I' runs for
				# insertions, 'D...M' composite symbols for deletions, 'M' otherwise
				tempSymbolString = ['M']
				prevVal = self.adj[i][0]
				j = 1
				while j < len(self.adj[i]):
					diff = self.adj[i][j] - prevVal
					prevVal = self.adj[i][j]
					if diff > 0:	# insertion
						tempSymbolString.extend(['I']*abs(diff))
						j += abs(diff)
					elif diff < 0:	# deletion
						tempSymbolString.append('D'*abs(diff)+'M')
						j += 1
					else:
						tempSymbolString.append('M')
						j += 1

				for j in xrange(len(tempSymbolString)-self.readLen):
					self.allCigar[i].append(CigarString(listIn=tempSymbolString[j:j+self.readLen]).getString())
					# pre-compute reference position of first matching base
					my_fm_pos = None
					for k in xrange(self.readLen):
						if 'M' in tempSymbolString[j+k]:
							my_fm_pos = j+k
							break
					if my_fm_pos == None:
						self.FM_pos[i].append(None)
						self.FM_span[i].append(None)
					else:
						self.FM_pos[i].append(my_fm_pos-self.adj[i][my_fm_pos])
						span_dif = len([nnn for nnn in tempSymbolString[j:j+self.readLen] if 'M' in nnn])
						self.FM_span[i].append(self.FM_pos[i][-1] + span_dif)

		# tally up variants implemented
		countDict = {}
		all_variants = [sorted(all_snps[i]+all_indels[i]) for i in xrange(self.ploidy)]
		for i in xrange(len(all_variants)):
			for j in xrange(len(all_variants[i])):
				all_variants[i][j] = tuple([all_variants[i][j][0]+self.x])+all_variants[i][j][1:]
				t = tuple(all_variants[i][j])
				if t not in countDict:
					countDict[t] = []
				countDict[t].append(i)

		#
		#	TODO: combine multiple variants that happened to occur at same position into single vcf entry
		#

		# emit (pos, ref, alt, allele_freq, 'WP=0/1/...') sorted by position
		output_variants = []
		for k in sorted(countDict.keys()):
			output_variants.append(k+tuple([len(countDict[k])/float(self.ploidy)]))
			ploid_string = ['0' for n in xrange(self.ploidy)]
			for k2 in [n for n in countDict[k]]:
				ploid_string[k2] = '1'
			output_variants[-1] += tuple(['WP='+'/'.join(ploid_string)])
		return output_variants


	def sample_read(self, sequencingModel, fragLen=None):
		"""Sample one read (or a pair, if fragLen is given) from a random ploid.

		Positions are drawn from self.coverage_distribution; sequencing errors
		come from sequencingModel.getSequencingErrors(). Indel errors are spliced
		into the read data and the cigar string is rewritten to match.

		Returns a list of [pos, cigar, read_string, qual_string] entries
		(one for single-end, two for paired-end).
		"""
		
		# choose a ploid
		myPloid = random.randint(0,self.ploidy-1)

		# stop attempting to find a valid position if we fail enough times
		MAX_READPOS_ATTEMPTS = 100
		attempts_thus_far    = 0

		# choose a random position within the ploid, and generate quality scores / sequencing errors
		readsToSample = []
		if fragLen == None:
			rPos = self.coverage_distribution[myPloid].sample()
			#####rPos = random.randint(0,len(self.sequences[myPloid])-self.readLen-1)	# uniform random
			####
			##### decide which subsection of the sequence to sample from using coverage probabilities
			####coords_bad = True
			####while coords_bad:
			####	attempts_thus_far += 1
			####	if attempts_thus_far > MAX_READPOS_ATTEMPTS:
			####		return None
			####	myBucket = max([self.which_bucket.sample() - self.win_per_read, 0])
			####	coords_to_select_from = [myBucket*self.windowSize,(myBucket+1)*self.windowSize]
			####	if coords_to_select_from[0] >= len(self.adj[myPloid]):	# prevent going beyond region boundaries
			####		continue
			####	coords_to_select_from[0] += self.adj[myPloid][coords_to_select_from[0]]
			####	coords_to_select_from[1] += self.adj[myPloid][coords_to_select_from[0]]
			####	if max(coords_to_select_from) <= 0: # prevent invalid negative coords due to adj
			####		continue
			####	if coords_to_select_from[1] - coords_to_select_from[0] <= 2:	# we don't span enough coords to sample
			####		continue
			####	if coords_to_select_from[1] < len(self.sequences[myPloid])-self.readLen:
			####		coords_bad = False
			####rPos = random.randint(coords_to_select_from[0],coords_to_select_from[1]-1)

			# sample read position and call function to compute quality scores / sequencing errors
			rDat = self.sequences[myPloid][rPos:rPos+self.readLen]
			(myQual, myErrors) = sequencingModel.getSequencingErrors(rDat)
			readsToSample.append([rPos,myQual,myErrors,rDat])

		else:
			rPos1 = self.coverage_distribution[myPloid][self.fraglens_indMap[fragLen]].sample()
			
			# EXPERIMENTAL
			#coords_to_select_from = self.coverage_distribution[myPloid][self.fraglens_indMap[fragLen]].sample()
			#rPos1 = random.randint(coords_to_select_from[0],coords_to_select_from[1])

			#####rPos1 = random.randint(0,len(self.sequences[myPloid])-fragLen-1)		# uniform random
			####
			##### decide which subsection of the sequence to sample from using coverage probabilities
			####coords_bad = True
			####while coords_bad:
			####	attempts_thus_far += 1
			####	if attempts_thus_far > MAX_READPOS_ATTEMPTS:
			####		#print coords_to_select_from
			####		return None
			####	myBucket = max([self.which_bucket.sample() - self.win_per_read, 0])
			####	coords_to_select_from = [myBucket*self.windowSize,(myBucket+1)*self.windowSize]
			####	if coords_to_select_from[0] >= len(self.adj[myPloid]):	# prevent going beyond region boundaries
			####		continue
			####	coords_to_select_from[0] += self.adj[myPloid][coords_to_select_from[0]]
			####	coords_to_select_from[1] += self.adj[myPloid][coords_to_select_from[0]]	# both ends use index of starting position to avoid issues with reads spanning breakpoints of large events
			####	if max(coords_to_select_from) <= 0: # prevent invalid negative coords due to adj
			####		continue
			####	if coords_to_select_from[1] - coords_to_select_from[0] <= 2:	# we don't span enough coords to sample
			####		continue
			####	rPos1 = random.randint(coords_to_select_from[0],coords_to_select_from[1]-1)
			####	# for PE-reads, flip a coin to decide if R1 or R2 will be the "covering" read
			####	if random.randint(1,2) == 1 and rPos1 > fragLen - self.readLen:
			####		rPos1 -= fragLen - self.readLen
			####	if rPos1 < len(self.sequences[myPloid])-fragLen:
			####		coords_bad = False

			# mate position: R2 starts so that the pair spans exactly fragLen bases
			rPos2 = rPos1 + fragLen - self.readLen
			rDat1 = self.sequences[myPloid][rPos1:rPos1+self.readLen]
			rDat2 = self.sequences[myPloid][rPos2:rPos2+self.readLen]
			#print len(rDat1), rPos1, len(self.sequences[myPloid])
			(myQual1, myErrors1) = sequencingModel.getSequencingErrors(rDat1)
			(myQual2, myErrors2) = sequencingModel.getSequencingErrors(rDat2,isReverseStrand=True)
			readsToSample.append([rPos1,myQual1,myErrors1,rDat1])
			readsToSample.append([rPos2,myQual2,myErrors2,rDat2])

		# error format:
		# myError[i] = (type, len, pos, ref, alt)

		# examine sequencing errors to-be-inserted.
		#	- remove deletions that don't have enough bordering sequence content to "fill in"
		# if error is valid, make the changes to the read data
		rOut = []
		for r in readsToSample:
			try:
				myCigar = self.allCigar[myPloid][r[0]]
			except IndexError:
				print 'Index error when attempting to find cigar string.'
				print len(self.allCigar[myPloid]), r[0]
				if fragLen != None:
					print (rPos1, rPos2)
				print myPloid, fragLen, self.fraglens_indMap[fragLen]
				exit(1)
			totalD  = sum([error[1] for error in r[2] if error[0] == 'D'])
			totalI  = sum([error[1] for error in r[2] if error[0] == 'I'])
			availB  = len(self.sequences[myPloid]) - r[0] - self.readLen - 1
			# add buffer sequence to fill in positions that get deleted
			r[3] += self.sequences[myPloid][r[0]+self.readLen:r[0]+self.readLen+totalD]
			expandedCigar = []
			extraCigar    = []
			adj           = 0
			# sse_adj[p] = offset applied to read index p by indel errors inserted so far
			sse_adj       = [0 for n in xrange(self.readLen + max(sequencingModel.errP[3]))]
			anyIndelErr   = False

			# sort by letter (D > I > S) such that we introduce all indel errors before substitution errors
			# secondarily, sort by index
			arrangedErrors = {'D':[],'I':[],'S':[]}
			for error in r[2]:
				arrangedErrors[error[0]].append((error[2],error))
			sortedErrors = []
			for k in sorted(arrangedErrors.keys()):
				sortedErrors.extend([n[1] for n in sorted(arrangedErrors[k])])

			skipIndels = False

			for error in sortedErrors:
				#print '-se-',r[0], error
				#print sse_adj
				eLen = error[1]
				ePos = error[2]
				if error[0] == 'D' or error[0] == 'I':
					anyIndelErr   = True
					extraCigarVal = []
					if totalD > availB:	# if not enough bases to fill-in deletions, skip all indel erors
						continue
					if expandedCigar == []:
						expandedCigar = CigarString(stringIn=myCigar).getList()
						fillToGo = totalD - totalI + 1
						if fillToGo > 0:
							try:
								extraCigarVal = CigarString(stringIn=self.allCigar[myPloid][r[0]+fillToGo]).getList()[-fillToGo:]
							except IndexError:	# applying the deletions we want requires going beyond region boundaries. skip all indel errors
								skipIndels = True

					if skipIndels:
						continue

					# insert deletion error into read and update cigar string accordingly
					if error[0] == 'D':
						myadj = sse_adj[ePos]
						pi = ePos+myadj
						pf = ePos+myadj+eLen+1
						if str(r[3][pi:pf]) == str(error[3]):
							r[3] = r[3][:pi+1] + r[3][pf:]
							expandedCigar = expandedCigar[:pi+1] + expandedCigar[pf:]
							if pi+1 == len(expandedCigar):	# weird edge case with del at very end of region. Make a guess and add a "M"
								expandedCigar.append('M')
							expandedCigar[pi+1] = 'D'*eLen + expandedCigar[pi+1]
						else:
							print '\nError, ref does not match alt while attempting to insert deletion error!\n'
							exit(1)
						adj -= eLen
						for i in xrange(ePos,len(sse_adj)):
							sse_adj[i] -= eLen

					# insert insertion error into read and update cigar string accordingly
					else:
						myadj = sse_adj[ePos]
						if chr(r[3][ePos+myadj]) == error[3]:
							r[3] = r[3][:ePos+myadj] + error[4] + r[3][ePos+myadj+1:]
							expandedCigar = expandedCigar[:ePos+myadj] + ['I']*eLen + expandedCigar[ePos+myadj:]
						else:
							print '\nError, ref does not match alt while attempting to insert insertion error!\n'
							print '---',chr(r[3][ePos+myadj]), '!=', error[3]
							exit(1)
						adj += eLen
						for i in xrange(ePos,len(sse_adj)):
							sse_adj[i] += eLen

				else:	# substitution errors, much easier by comparison...
					if chr(r[3][ePos+sse_adj[ePos]]) == error[3]:
						r[3][ePos+sse_adj[ePos]] = error[4]
					else:
						print '\nError, ref does not match alt while attempting to insert substitution error!\n'
						exit(1)

			if anyIndelErr:
				# rebuild cigar from the edited symbol list, trimmed to readLen
				if len(expandedCigar):
					relevantCigar = (expandedCigar+extraCigarVal)[:self.readLen]
					myCigar = CigarString(listIn=relevantCigar).getString()

				r[3] = r[3][:self.readLen]

			rOut.append([self.FM_pos[myPloid][r[0]],myCigar,str(r[3]),str(r[1])])

		# rOut[i] = (pos, cigar, read_string, qual_string)
		return rOut
def parseFQ(inf):
    """Learn an empirical quality-score model from a FASTQ or SAM file.

    Reads quality strings from `inf` (plain FASTQ, gzipped FASTQ if the name
    ends in '.gz', or SAM if it ends in '.sam') and tallies, per read
    position, the initial q-score distribution and the q-score transition
    counts (a first-order Markov chain along the read). It then simulates
    N_SAMP reads from the learned model to estimate the average per-base
    error rate.

    Returns (initQ, probQ, avgError):
        initQ[i][q]    - probability of q-score q at read position i
        probQ[i][p][q] - probability of q at position i given p at i-1 (probQ[0] is None)
        avgError       - simulated mean per-base error probability

    NOTE(review): depends on module-level constants (RQ, offQ, MAX_READS,
    PRINT_EVERY, PROB_SMOOTH, INIT_SMOOTH, PLOT_STUFF, N_SAMP) defined
    elsewhere in the file; assumes uniform read length (other lengths are
    skipped).
    """
    print 'reading ' + inf + '...'
    if inf[-3:] == '.gz':
        print 'detected gzip suffix...'
        f = gzip.open(inf, 'r')
    else:
        f = open(inf, 'r')

    IS_SAM = False
    if inf[-4:] == '.sam':
        print 'detected sam input...'
        IS_SAM = True

    rRead = 0
    actual_readlen = 0
    qDict = {}
    while True:

        if IS_SAM:
            # SAM: quality string is column 11 (index 10)
            data4 = f.readline()
            if not len(data4):
                break
            try:
                data4 = data4.split('\t')[10]
            except IndexError:
                break
            # need to add some input checking here? Yup, probably.
        else:
            # FASTQ: 4 lines per record; data4 is the quality line
            data1 = f.readline()
            data2 = f.readline()
            data3 = f.readline()
            data4 = f.readline()
            if not all([data1, data2, data3, data4]):
                break

        if actual_readlen == 0:
            # first record: infer read length and allocate count tables
            if inf[-3:] != '.gz' and not IS_SAM:
                totalSize = os.path.getsize(inf)
                entrySize = sum([len(n) for n in [data1, data2, data3, data4]])
                print 'estimated number of reads in file:', int(
                    float(totalSize) / entrySize)
            actual_readlen = len(data4) - 1
            print 'assuming read length is uniform...'
            print 'detected read length (from first read found):', actual_readlen
            # priorQ[i][q] counts q-score q at position i; totalQ[i][p,q]
            # counts transitions p->q into position i (no transition into pos 0)
            priorQ = np.zeros([actual_readlen, RQ])
            totalQ = [None] + [
                np.zeros([RQ, RQ]) for n in xrange(actual_readlen - 1)
            ]

        # sanity-check readlengths
        if len(data4) - 1 != actual_readlen:
            print 'skipping read with unexpected length...'
            continue

        for i in range(len(data4) - 1):
            # offQ is the ASCII offset (typically 33) -- defined at module level
            q = ord(data4[i]) - offQ
            qDict[q] = True
            if i == 0:
                priorQ[i][q] += 1
            else:
                totalQ[i][prevQ, q] += 1
                priorQ[i][q] += 1
            prevQ = q

        rRead += 1
        if rRead % PRINT_EVERY == 0:
            print rRead
        if MAX_READS > 0 and rRead >= MAX_READS:
            break
    f.close()

    # some sanity checking again...
    QRANGE = [min(qDict.keys()), max(qDict.keys())]
    if QRANGE[0] < 0:
        print '\nError: Read in Q-scores below 0\n'
        exit(1)
    # NOTE(review): '>' permits q == RQ, which would already have overflowed
    # the RQ-wide tables above -- possibly intended to be '>='; confirm.
    if QRANGE[1] > RQ:
        print '\nError: Read in Q-scores above specified maximum:', QRANGE[
            1], '>', RQ, '\n'
        exit(1)

    print 'computing probabilities...'
    # normalize transition counts row-by-row with additive (PROB_SMOOTH) smoothing
    probQ = [None] + [[[0. for m in xrange(RQ)] for n in xrange(RQ)]
                      for p in xrange(actual_readlen - 1)]
    for p in xrange(1, actual_readlen):
        for i in xrange(RQ):
            rowSum = float(np.sum(totalQ[p][i, :])) + PROB_SMOOTH * RQ
            if rowSum <= 0.:
                continue
            for j in xrange(RQ):
                probQ[p][i][j] = (totalQ[p][i][j] + PROB_SMOOTH) / rowSum

    # normalize per-position marginals with INIT_SMOOTH smoothing
    initQ = [[0. for m in xrange(RQ)] for n in xrange(actual_readlen)]
    for i in xrange(actual_readlen):
        rowSum = float(np.sum(priorQ[i, :])) + INIT_SMOOTH * RQ
        if rowSum <= 0.:
            continue
        for j in xrange(RQ):
            initQ[i][j] = (priorQ[i][j] + INIT_SMOOTH) / rowSum

    # optional diagnostic plots (mpl is presumably matplotlib.pyplot -- defined at module level)
    if PLOT_STUFF:
        mpl.rcParams.update({
            'font.size': 14,
            'font.weight': 'bold',
            'lines.linewidth': 3
        })

        mpl.figure(1)
        Z = np.array(initQ).T
        X, Y = np.meshgrid(range(0, len(Z[0]) + 1), range(0, len(Z) + 1))
        mpl.pcolormesh(X, Y, Z, vmin=0., vmax=0.25)
        mpl.axis([0, len(Z[0]), 0, len(Z)])
        mpl.yticks(range(0, len(Z), 10), range(0, len(Z), 10))
        mpl.xticks(range(0, len(Z[0]), 10), range(0, len(Z[0]), 10))
        mpl.xlabel('Read Position')
        mpl.ylabel('Quality Score')
        mpl.title('Q-Score Prior Probabilities')
        mpl.colorbar()

        mpl.show()

        VMIN_LOG = [-4, 0]
        minVal = 10**VMIN_LOG[0]
        qLabels = [
            str(n) for n in range(QRANGE[0], QRANGE[1] + 1) if n % 5 == 0
        ]
        print qLabels
        qTicksx = [int(n) + 0.5 for n in qLabels]
        qTicksy = [(RQ - int(n)) - 0.5 for n in qLabels]

        for p in xrange(1, actual_readlen, 10):
            currentDat = np.array(probQ[p])
            for i in xrange(len(currentDat)):
                for j in xrange(len(currentDat[i])):
                    currentDat[i][j] = max(minVal, currentDat[i][j])

            # matrix indices:		pcolormesh plotting:	plot labels and axes:
            #
            #      y				   ^					   ^
            #	   -->				 x |					 y |
            #  x |					    -->					    -->
            #    v 					    y					    x
            #
            # to plot a MxN matrix 'Z' with rowNames and colNames we need to:
            #
            # pcolormesh(X,Y,Z[::-1,:])		# invert x-axis
            # # swap x/y axis parameters and labels, remember x is still inverted:
            # xlim([yMin,yMax])
            # ylim([M-xMax,M-xMin])
            # xticks()
            #

            mpl.figure(p + 1)
            Z = np.log10(currentDat)
            X, Y = np.meshgrid(range(0, len(Z[0]) + 1), range(0, len(Z) + 1))
            mpl.pcolormesh(X,
                           Y,
                           Z[::-1, :],
                           vmin=VMIN_LOG[0],
                           vmax=VMIN_LOG[1],
                           cmap='jet')
            mpl.xlim([QRANGE[0], QRANGE[1] + 1])
            mpl.ylim([RQ - QRANGE[1] - 1, RQ - QRANGE[0]])
            mpl.yticks(qTicksy, qLabels)
            mpl.xticks(qTicksx, qLabels)
            mpl.xlabel('\n' + r'$Q_{i+1}$')
            mpl.ylabel(r'$Q_i$')
            mpl.title('Q-Score Transition Frequencies [Read Pos:' + str(p) +
                      ']')
            cb = mpl.colorbar()
            cb.set_ticks([-4, -3, -2, -1, 0])
            cb.set_ticklabels([
                r'$10^{-4}$', r'$10^{-3}$', r'$10^{-2}$', r'$10^{-1}$',
                r'$10^{0}$'
            ])

        #mpl.tight_layout()
        mpl.show()

    print 'estimating average error rate via simulation...'
    Qscores = range(RQ)
    #print (len(initQ), len(initQ[0]))
    #print (len(probQ), len(probQ[1]), len(probQ[1][0]))

    # wrap the learned tables in samplers; rows with no data degenerate to
    # "repeat the previous q-score"
    initDistByPos = [
        DiscreteDistribution(initQ[i], Qscores) for i in xrange(len(initQ))
    ]
    probDistByPosByPrevQ = [None]
    for i in xrange(1, len(initQ)):
        probDistByPosByPrevQ.append([])
        for j in xrange(len(initQ[0])):
            if np.sum(
                    probQ[i][j]
            ) <= 0.:  # if we don't have sufficient data for a transition, use the previous qscore
                probDistByPosByPrevQ[-1].append(
                    DiscreteDistribution([1], [Qscores[j]],
                                         degenerateVal=Qscores[j]))
            else:
                probDistByPosByPrevQ[-1].append(
                    DiscreteDistribution(probQ[i][j], Qscores))

    # Monte-Carlo: simulate N_SAMP reads and tally the q-scores produced
    countDict = {}
    for q in Qscores:
        countDict[q] = 0
    for samp in xrange(1, N_SAMP + 1):
        if samp % PRINT_EVERY == 0:
            print samp
        myQ = initDistByPos[0].sample()
        countDict[myQ] += 1
        for i in xrange(1, len(initQ)):
            myQ = probDistByPosByPrevQ[i][myQ].sample()
            countDict[myQ] += 1

    # convert q-score frequencies to an average Phred error probability
    totBases = float(sum(countDict.values()))
    avgError = 0.
    for k in sorted(countDict.keys()):
        eVal = 10.**(-k / 10.)
        #print k, eVal, countDict[k]
        avgError += eVal * (countDict[k] / totBases)
    print 'AVG ERROR RATE:', avgError

    return (initQ, probQ, avgError)
        if print_path:
            print('Path cost is', discovered[target][1])
            stack = []
            curr = target

            while curr:
                stack.append((curr, self.original_universe[curr[0]][curr[1]]))
                print(curr)
                curr = discovered[curr][0]

            print('Path from start to target:', stack[::-1])

        return discovered[target][1]


if __name__ == '__main__':
    # Load the maze description and pull out the pieces the solver needs.
    maze_state = load_maze()
    maze_portals = maze_state.portals

    # The portal sampling distribution drives both the heuristics and the solver.
    portal_dist = DiscreteDistribution(maze_portals)
    heuristic_set = Heuristic(maze_portals, maze_state.target, portal_dist)

    astar_solver = A_Star(maze_state.universe, maze_portals, maze_state.start,
                          maze_state.target, portal_dist)

    # Run the benchmark and print the summary report.
    print(make_statistics(astar_solver, heuristic_set, portal_dist))
Example #13
0
class SequenceContainer:
    """Holds one window of reference sequence for every ploid.

    Responsibilities visible in this class:
      - keep a mutable copy of the sequence per ploid (``self.sequences``),
      - insert user-supplied and randomly sampled SNPs/indels,
      - track which positions are already used via ``self.blackList``,
      - maintain per-position coordinate adjustments (``self.adj``) and
        precomputed cigar strings (``self.allCigar``) after indels shift
        coordinates,
      - sample reads (single or paired) with sequencing errors applied.
    """

    def __init__(self,
                 xOffset,
                 sequence,
                 ploidy,
                 windowOverlap,
                 readLen,
                 mutationModels=[],
                 mutRate=None,
                 coverageDat=None,
                 onlyVCF=False):
        # NOTE(review): mutable default argument (mutationModels=[]). It is
        # never mutated here, but a None sentinel would be the safer idiom.
        # initialize basic variables
        self.onlyVCF = onlyVCF
        self.init_basicVars(xOffset, sequence, ploidy, windowOverlap, readLen,
                            coverageDat)
        # initialize mutation models
        self.init_mutModels(mutationModels, mutRate)
        # sample the number of variants that will be inserted into each ploid
        self.init_poisson()
        self.indelsToAdd = [n.sample() for n in self.ind_pois]
        self.snpsToAdd = [n.sample() for n in self.snp_pois]
        # initialize trinuc snp bias
        self.init_trinucBias()

    def init_basicVars(self, xOffset, sequence, ploidy, windowOverlap, readLen,
                       coverageDat):
        """Reset per-window state: sequences, variant lists, blacklist, coverage.

        Called both from __init__ and from update() when a new window arrives.
        coverageDat is expected to be a (windowSize, coverage_vals) pair and is
        only unpacked when self.onlyVCF is False.
        """
        self.x = xOffset
        self.ploidy = ploidy
        self.readLen = readLen
        # one independent, mutable copy of the sequence per ploid
        self.sequences = [bytearray(sequence) for n in xrange(self.ploidy)]
        self.seqLen = len(sequence)
        self.indelList = [[] for n in xrange(self.ploidy)]
        self.snpList = [[] for n in xrange(self.ploidy)]
        self.allCigar = [[] for n in xrange(self.ploidy)]
        self.adj = [None for n in xrange(self.ploidy)]
        # blackList[ploid][pos] = 0		safe to insert variant here
        # blackList[ploid][pos] = 1		indel inserted here
        # blackList[ploid][pos] = 2		snp inserted here
        # blackList[ploid][pos] = 3		invalid position for various processing reasons
        self.blackList = [
            np.zeros(self.seqLen, dtype='<i4') for n in xrange(self.ploidy)
        ]

        # disallow mutations to occur on window overlap points
        # NOTE(review): only the two positions near the *end* of the window are
        # marked (negative indices) — confirm the leading overlap is handled by
        # the winBuffer offsets used elsewhere (e.g. init_trinucBias).
        self.winBuffer = windowOverlap
        for p in xrange(self.ploidy):
            self.blackList[p][-self.winBuffer] = 3
            self.blackList[p][-self.winBuffer - 1] = 3

        # if we're only creating a vcf, skip some expensive initialization related to coverage depth
        if not self.onlyVCF:
            (self.windowSize, coverage_vals) = coverageDat
            self.win_per_read = int(self.readLen / float(self.windowSize) +
                                    0.5)
            self.which_bucket = DiscreteDistribution(coverage_vals,
                                                     range(len(coverage_vals)))

    def init_mutModels(self, mutationModels, mutRate):
        """Install per-ploid mutation models, rescaling rates if requested.

        mutationModels: one model per ploid (or [] to clone DEFAULT_MODEL_1).
        mutRate: if not None, overall mutation frequency is rescaled so the
        average model rate matches it (self.mutScalar).
        Exits the process if the model count does not match ploidy.
        """
        if mutationModels == []:
            ml = [copy.deepcopy(DEFAULT_MODEL_1) for n in xrange(self.ploidy)]
            self.modelData = ml[:self.ploidy]
        else:
            if len(mutationModels) != self.ploidy:
                print '\nError: Number of mutation models recieved is not equal to specified ploidy\n'
                exit(1)
            self.modelData = copy.deepcopy(mutationModels)

        # do we need to rescale mutation frequencies?
        mutRateSum = sum([n[0] for n in self.modelData])
        self.mutRescale = mutRate
        if self.mutRescale == None:
            self.mutScalar = 1.0
        else:
            self.mutScalar = float(
                self.mutRescale) / (mutRateSum / float(len(self.modelData)))

        # how are mutations spread to each ploid, based on their specified mut rates?
        self.ploidMutFrac = [float(n[0]) / mutRateSum for n in self.modelData]
        self.ploidMutPrior = DiscreteDistribution(self.ploidMutFrac,
                                                  range(self.ploidy))

        # init mutation models
        #
        # self.models[ploid][0] = average mutation rate
        # self.models[ploid][1] = p(mut is homozygous | mutation occurs)
        # self.models[ploid][2] = p(mut is indel | mut occurs)
        # self.models[ploid][3] = p(insertion | indel occurs)
        # self.models[ploid][4] = distribution of insertion lengths
        # self.models[ploid][5] = distribution of deletion lengths
        # self.models[ploid][6] = distribution of trinucleotide SNP transitions
        # self.models[ploid][7] = p(trinuc mutates)
        self.models = []
        for n in self.modelData:
            self.models.append([
                self.mutScalar * n[0], n[1], n[2], n[3],
                DiscreteDistribution(n[5], n[4]),
                DiscreteDistribution(n[7], n[6]), []
            ])
            # n[8]: per-trinucleotide-context 4x4 substitution matrices,
            # one DiscreteDistribution per reference nucleotide
            for m in n[8]:
                self.models[-1][6].append([
                    DiscreteDistribution(m[0], NUCL),
                    DiscreteDistribution(m[1], NUCL),
                    DiscreteDistribution(m[2], NUCL),
                    DiscreteDistribution(m[3], NUCL)
                ])
            self.models[-1].append([m for m in n[9]])

    def init_poisson(self):
        """Build per-ploid Poisson samplers for indel and SNP counts.

        Expected counts scale with sequence length, the ploid's mutation rate,
        its indel fraction, and its share of total mutations (ploidMutFrac).
        """
        ind_l_list = [
            self.seqLen * self.models[i][0] * self.models[i][2] *
            self.ploidMutFrac[i] for i in xrange(len(self.models))
        ]
        snp_l_list = [
            self.seqLen * self.models[i][0] * (1. - self.models[i][2]) *
            self.ploidMutFrac[i] for i in xrange(len(self.models))
        ]
        # support of the Poisson distributions is capped at seqLen * MAX_MUTFRAC
        k_range = range(int(self.seqLen * MAX_MUTFRAC))
        self.ind_pois = [
            poisson_list(k_range, ind_l_list[n])
            for n in xrange(len(self.models))
        ]
        self.snp_pois = [
            poisson_list(k_range, snp_l_list[n])
            for n in xrange(len(self.models))
        ]

    def init_trinucBias(self):
        # compute mutation positional bias given trinucleotide strings of the sequence (ONLY AFFECTS SNPs)
        #
        # note: since indels are added before snps, it's possible these positional biases aren't correctly utilized
        #       at positions affected by indels. At the moment I'm going to consider this negligible.
        trinuc_snp_bias = [[0. for n in xrange(self.seqLen)]
                           for m in xrange(self.ploidy)]
        self.trinuc_bias = [None for n in xrange(self.ploidy)]
        for p in xrange(self.ploidy):
            # weight each interior position by p(trinuc mutates) for the
            # 3-mer centered on it; positions inside winBuffer are excluded
            for i in xrange(self.winBuffer + 1, self.seqLen - 1):
                trinuc_snp_bias[p][i] = self.models[p][7][ALL_IND[str(
                    self.sequences[p][i - 1:i + 2])]]
            self.trinuc_bias[p] = DiscreteDistribution(
                trinuc_snp_bias[p][self.winBuffer + 1:self.seqLen - 1],
                range(self.winBuffer + 1, self.seqLen - 1))

    def update(self,
               xOffset,
               sequence,
               ploidy,
               windowOverlap,
               readLen,
               mutationModels=[],
               mutRate=None,
               coverageDat=None):
        """Re-seed the container for a new window, reusing models when possible.

        Mutation models are rebuilt only if ploidy/mutRate/models changed;
        Poisson samplers only if the window length changed. Everything else
        (sequences, blacklist, variant counts, trinuc bias) is always reset.
        """
        # if mutation model is changed, we have to reinitialize it...
        if ploidy != self.ploidy or mutRate != self.mutRescale or mutationModels != []:
            self.ploidy = ploidy
            self.mutRescale = mutRate
            self.init_mutModels(mutationModels, mutRate)
        # if sequence length is different than previous window, we have to redo snp/indel poissons
        if len(sequence) != self.seqLen:
            self.seqLen = len(sequence)
            self.init_poisson()
        # basic vars
        self.init_basicVars(xOffset, sequence, ploidy, windowOverlap, readLen,
                            coverageDat)
        self.indelsToAdd = [n.sample() for n in self.ind_pois]
        self.snpsToAdd = [n.sample() for n in self.snp_pois]
        #print (self.indelsToAdd,self.snpsToAdd)
        # initialize trinuc snp bias
        self.init_trinucBias()

    def insert_mutations(self, inputList):
        """Record user-supplied variants into snpList/indelList per ploid.

        Each inpV is (pos, ref, alts, ..., genotype-info); positions are
        absolute and converted to window-local via self.x.
        """
        #
        #	TODO!!!!!! user-input variants, determine which ploid to put it on, etc..
        #
        for inpV in inputList:
            whichPloid = []
            wps = inpV[4][0]
            if wps == None:  # if no genotype given, assume heterozygous and choose a single ploid based on their mut rates
                whichPloid.append(self.ploidMutPrior.sample())
                whichAlt = [0]
            else:
                # NOTE(review): if wps is non-None but contains neither 'WP='
                # nor '/' nor '|', whichAlt is never bound; whichPloid stays
                # empty so the loop below is skipped — confirm intended.
                if 'WP=' in wps:
                    whichPloid = [
                        int(n) for n in inpV[-1][3:].split(',') if n == '1'
                    ]
                    whichAlt = [0] * len(whichPloid)
                elif '/' in wps or '|' in wps:
                    if '/' in wps:
                        splt = wps.split('/')
                    else:
                        splt = wps.split('|')
                    whichPloid = []
                    whichAlt = []
                    for i in xrange(len(splt)):
                        if splt[i] == '1':
                            whichPloid.append(i)
                        whichAlt.append(int(splt[i]) - 1)

            for i in xrange(len(whichPloid)):
                p = whichPloid[i]
                myAlt = inpV[2][whichAlt[i]]
                myVar = (inpV[0] - self.x, inpV[1], myAlt)
                inLen = max([len(inpV[1]), len(myAlt)])
                #print myVar, chr(self.sequences[p][myVar[0]])
                if len(inpV[1]) == 1 and len(myAlt) == 1:
                    if self.blackList[p][myVar[0]]:
                        continue
                    self.snpList[p].append(myVar)
                    self.blackList[p][myVar[0]] = 2
                else:
                    # NOTE(review): this `continue` only advances the inner
                    # for-k loop; it does NOT skip the indel insertion below.
                    # Looks like the intent was to reject the variant when any
                    # covered position is blacklisted — confirm.
                    for k in xrange(myVar[0], myVar[0] + inLen + 1):
                        if self.blackList[p][k]:
                            continue
                    for k in xrange(myVar[0], myVar[0] + inLen + 1):
                        self.blackList[p][k] = 1
                    self.indelList[p].append(myVar)

    def random_mutations(self):
        """Sample and apply random indels and SNPs; return VCF-ready variants.

        Mutates self.sequences in place, rebuilds self.adj (position offsets
        caused by indels) and self.allCigar (per-position cigars, unless
        onlyVCF). Returns a list of variant tuples, each ending with an
        allele-frequency value and a 'WP=<ploid string>' tag.
        """

        #	add random indels
        all_indels = [[] for n in self.sequences]
        for i in xrange(self.ploidy):
            for j in xrange(self.indelsToAdd[i]):
                if random.random(
                ) <= self.models[i][1]:  # insert homozygous indel
                    whichPloid = range(self.ploidy)
                else:  # insert heterozygous indel
                    whichPloid = [self.ploidMutPrior.sample()]

                # try to find suitable places to insert indels
                eventPos = -1
                for attempt in xrange(MAX_ATTEMPTS):
                    eventPos = random.randint(self.winBuffer, self.seqLen - 1)
                    for p in whichPloid:
                        if self.blackList[p][eventPos]:
                            eventPos = -1
                    if eventPos != -1:
                        break
                if eventPos == -1:
                    continue

                if random.random() <= self.models[i][3]:  # insertion
                    inLen = self.models[i][4].sample()
                    # sequence content of random insertions is uniformly random (change this later)
                    inSeq = ''.join(
                        [random.choice(NUCL) for n in xrange(inLen)])
                    refNucl = chr(self.sequences[i][eventPos])
                    myIndel = (eventPos, refNucl, refNucl + inSeq)
                else:  # deletion
                    inLen = self.models[i][5].sample()
                    if eventPos + inLen + 1 >= len(
                            self.sequences[i]
                    ):  # skip if deletion too close to boundary
                        continue
                    if inLen == 1:
                        inSeq = chr(self.sequences[i][eventPos + 1])
                    else:
                        inSeq = str(self.sequences[i][eventPos + 1:eventPos +
                                                      inLen + 1])
                    refNucl = chr(self.sequences[i][eventPos])
                    myIndel = (eventPos, refNucl + inSeq, refNucl)

                # if event too close to boundary, skip. if event conflicts with other indel, skip.
                skipEvent = False
                if eventPos + len(
                        myIndel[1]) >= self.seqLen - self.winBuffer - 1:
                    skipEvent = True
                if skipEvent:
                    continue
                for p in whichPloid:
                    for k in xrange(eventPos, eventPos + inLen + 1):
                        if self.blackList[p][k]:
                            skipEvent = True
                if skipEvent:
                    continue

                for p in whichPloid:
                    for k in xrange(eventPos, eventPos + inLen + 1):
                        self.blackList[p][k] = 1
                    all_indels[p].append(myIndel)

        # merge with user-supplied indels; reverse-sort so later edits don't
        # shift the coordinates of earlier (lower-position) edits below
        for i in xrange(len(all_indels)):
            all_indels[i].extend(self.indelList[i])
        all_indels = [sorted(n, reverse=True) for n in all_indels]
        #print all_indels

        #	add random snps
        all_snps = [[] for n in self.sequences]
        for i in xrange(self.ploidy):
            for j in xrange(self.snpsToAdd[i]):
                if random.random(
                ) <= self.models[i][1]:  # insert homozygous SNP
                    whichPloid = range(self.ploidy)
                else:  # insert heterozygous SNP
                    whichPloid = [self.ploidMutPrior.sample()]

                # try to find suitable places to insert snps
                eventPos = -1
                for attempt in xrange(MAX_ATTEMPTS):
                    # based on the mutation model for the specified ploid, choose a SNP location based on trinuc bias
                    # (if there are multiple ploids, choose one at random)
                    #eventPos = random.randint(self.winBuffer+1,self.seqLen-2)
                    ploid_to_use = whichPloid[random.randint(
                        0,
                        len(whichPloid) - 1)]
                    eventPos = self.trinuc_bias[ploid_to_use].sample()
                    for p in whichPloid:
                        if self.blackList[p][eventPos]:
                            eventPos = -1
                    if eventPos != -1:
                        break
                if eventPos == -1:
                    continue

                refNucl = chr(self.sequences[i][eventPos])
                context = str(
                    chr(self.sequences[i][eventPos - 1]) +
                    chr(self.sequences[i][eventPos + 1]))
                # sample from tri-nucleotide substitution matrices to get SNP alt allele
                newNucl = self.models[i][6][TRI_IND[context]][
                    NUC_IND[refNucl]].sample()
                mySNP = (eventPos, refNucl, newNucl)

                for p in whichPloid:
                    all_snps[p].append(mySNP)
                    self.blackList[p][mySNP[0]] = 2

        # combine random snps with inserted snps, remove any snps that overlap indels
        for p in xrange(len(all_snps)):
            all_snps[p].extend(self.snpList[p])
            all_snps[p] = [
                n for n in all_snps[p] if self.blackList[p][n[0]] != 1
            ]

        # modify reference sequences
        for i in xrange(len(all_snps)):
            for j in xrange(len(all_snps[i])):
                # sanity checking (for debugging purposes)
                vPos = all_snps[i][j][0]
                if all_snps[i][j][1] != chr(self.sequences[i][vPos]):
                    print '\nError: Something went wrong!\n', all_snps[i][
                        j], chr(self.sequences[i][vPos]), '\n'
                    exit(1)
                else:
                    self.sequences[i][vPos] = all_snps[i][j][2]

        # apply indels (high positions first, thanks to the reverse sort) and
        # record (position, length-delta) pairs for the adjustment array
        adjToAdd = [[] for n in xrange(self.ploidy)]
        for i in xrange(len(all_indels)):
            for j in xrange(len(all_indels[i])):
                # sanity checking (for debugging purposes)
                vPos = all_indels[i][j][0]
                vPos2 = vPos + len(all_indels[i][j][1])
                #print all_indels[i][j], str(self.sequences[i][vPos:vPos2])
                #print len(self.sequences[i]),'-->',
                if all_indels[i][j][1] != str(self.sequences[i][vPos:vPos2]):
                    print '\nError: Something went wrong!\n', all_indels[i][
                        j], str(self.sequences[i][vPos:vPos2]), '\n'
                    exit(1)
                else:
                    self.sequences[i] = self.sequences[i][:vPos] + bytearray(
                        all_indels[i][j][2]) + self.sequences[i][vPos2:]
                    adjToAdd[i].append(
                        (all_indels[i][j][0],
                         len(all_indels[i][j][2]) - len(all_indels[i][j][1])))
                #print len(self.sequences[i])
            adjToAdd[i].sort()
            #print adjToAdd[i]

            # adj[i][j] = cumulative length change at mutated position j,
            # i.e. mutated coordinate minus reference coordinate
            self.adj[i] = np.zeros(len(self.sequences[i]), dtype='<i4')
            indSoFar = 0
            valSoFar = 0
            for j in xrange(len(self.adj[i])):
                if indSoFar < len(
                        adjToAdd[i]) and j >= adjToAdd[i][indSoFar][0] + 1:
                    valSoFar += adjToAdd[i][indSoFar][1]
                    indSoFar += 1
                self.adj[i][j] = valSoFar

            # precompute cigar strings (we can skip this is going for only vcf output)
            if not self.onlyVCF:
                # expand adj into a per-base symbol string (M/I/D runs), then
                # window it into readLen-sized cigars for every start position
                tempSymbolString = ['M']
                prevVal = self.adj[i][0]
                j = 1
                while j < len(self.adj[i]):
                    diff = self.adj[i][j] - prevVal
                    prevVal = self.adj[i][j]
                    if diff > 0:  # insertion
                        tempSymbolString.extend(['I'] * abs(diff))
                        j += abs(diff)
                    elif diff < 0:  # deletion
                        tempSymbolString.append('D' * abs(diff) + 'M')
                        j += 1
                    else:
                        tempSymbolString.append('M')
                        j += 1

                for j in xrange(len(tempSymbolString) - self.readLen):
                    self.allCigar[i].append(
                        CigarString(
                            listIn=tempSymbolString[j:j +
                                                    self.readLen]).getString())

        # tally up variants implemented
        countDict = {}
        all_variants = [
            sorted(all_snps[i] + all_indels[i]) for i in xrange(self.ploidy)
        ]
        for i in xrange(len(all_variants)):
            for j in xrange(len(all_variants[i])):
                # shift window-local positions back to absolute coordinates
                all_variants[i][j] = tuple([all_variants[i][j][0] + self.x
                                            ]) + all_variants[i][j][1:]
                t = tuple(all_variants[i][j])
                if t not in countDict:
                    countDict[t] = []
                countDict[t].append(i)

        #
        #	TODO: combine multiple variants that happened to occur at same position into single vcf entry
        #

        # emit (variant..., allele_freq, 'WP=<per-ploid 0/1 string>') tuples
        output_variants = []
        for k in sorted(countDict.keys()):
            output_variants.append(
                k + tuple([len(countDict[k]) / float(self.ploidy)]))
            ploid_string = ['0' for n in xrange(self.ploidy)]
            for k2 in [n for n in countDict[k]]:
                ploid_string[k2] = '1'
            output_variants[-1] += tuple(['WP=' + '/'.join(ploid_string)])
        return output_variants

    def sample_read(self, sequencingModel, fragLen=None):
        """Sample one read (fragLen is None) or a read pair from a random ploid.

        Read positions are drawn from the coverage-bucket distribution, then
        sequencing errors from sequencingModel are applied to the read data.
        Returns a list of [pos, cigar, read_string, qual_string] entries
        (one for single-end, two for paired-end), with pos mapped back to
        reference coordinates via self.adj.
        """

        # choose a ploid
        myPloid = random.randint(0, self.ploidy - 1)

        # choose a random position within the ploid, and generate quality scores / sequencing errors
        readsToSample = []
        if fragLen == None:
            #rPos = random.randint(0,len(self.sequences[myPloid])-self.readLen-1)	# uniform random

            # decide which subsection of the sequence to sample from using coverage probabilities
            coords_bad = True
            while coords_bad:
                myBucket = max(
                    [self.which_bucket.sample() - self.win_per_read, 0])
                coords_to_select_from = [
                    myBucket * self.windowSize,
                    (myBucket + 1) * self.windowSize
                ]
                # shift bucket bounds from reference into mutated coordinates
                coords_to_select_from[0] += self.adj[myPloid][
                    coords_to_select_from[0]]
                coords_to_select_from[1] += self.adj[myPloid][
                    coords_to_select_from[1]]
                if coords_to_select_from[1] < len(
                        self.sequences[myPloid]) - self.readLen:
                    coords_bad = False
            rPos = random.randint(coords_to_select_from[0],
                                  coords_to_select_from[1] - 1)

            # sample read position and call function to compute quality scores / sequencing errors
            rDat = self.sequences[myPloid][rPos:rPos + self.readLen]
            (myQual, myErrors) = sequencingModel.getSequencingErrors(rDat)
            readsToSample.append([rPos, myQual, myErrors, rDat])

        else:
            #rPos1 = random.randint(0,len(self.sequences[myPloid])-fragLen-1)		# uniform random

            # decide which subsection of the sequence to sample from using coverage probabilities
            coords_bad = True
            while coords_bad:
                myBucket = max(
                    [self.which_bucket.sample() - self.win_per_read, 0])
                coords_to_select_from = [
                    myBucket * self.windowSize,
                    (myBucket + 1) * self.windowSize
                ]
                coords_to_select_from[0] += self.adj[myPloid][
                    coords_to_select_from[0]]
                coords_to_select_from[1] += self.adj[myPloid][
                    coords_to_select_from[
                        0]]  # both ends use index of starting position to avoid issues with reads spanning breakpoints of large events
                rPos1 = random.randint(coords_to_select_from[0],
                                       coords_to_select_from[1] - 1)
                # for PE-reads, flip a coin to decide if R1 or R2 will be the "covering" read
                if random.randint(1,
                                  2) == 1 and rPos1 > fragLen - self.readLen:
                    rPos1 -= fragLen - self.readLen
                if rPos1 < len(self.sequences[myPloid]) - fragLen:
                    coords_bad = False

            rPos2 = rPos1 + fragLen - self.readLen
            rDat1 = self.sequences[myPloid][rPos1:rPos1 + self.readLen]
            rDat2 = self.sequences[myPloid][rPos2:rPos2 + self.readLen]
            (myQual1, myErrors1) = sequencingModel.getSequencingErrors(rDat1)
            (myQual2, myErrors2) = sequencingModel.getSequencingErrors(
                rDat2, isReverseStrand=True)
            readsToSample.append([rPos1, myQual1, myErrors1, rDat1])
            readsToSample.append([rPos2, myQual2, myErrors2, rDat2])

        # error format:
        # myError[i] = (type, len, pos, ref, alt)

        # examine sequencing errors to-be-inserted.
        #	- remove deletions that don't have enough bordering sequence content to "fill in"
        # if error is valid, make the changes to the read data
        rOut = []
        for r in readsToSample:
            myCigar = self.allCigar[myPloid][r[0]]
            totalD = sum([error[1] for error in r[2] if error[0] == 'D'])
            totalI = sum([error[1] for error in r[2] if error[0] == 'I'])
            availB = len(self.sequences[myPloid]) - r[0] - self.readLen - 1
            # add buffer sequence to fill in positions that get deleted
            r[3] += self.sequences[myPloid][r[0] + self.readLen:r[0] +
                                            self.readLen + totalD]
            expandedCigar = []
            extraCigar = []
            adj = 0
            # sse_adj[pos] = offset substitution errors must apply after
            # earlier indel errors shifted the read
            sse_adj = [0 for n in xrange(self.readLen)]
            anyIndelErr = False

            # sort by letter (D > I > S) such that we introduce all indel errors before substitution errors
            # secondarily, sort by index
            arrangedErrors = {'D': [], 'I': [], 'S': []}
            for error in r[2]:
                arrangedErrors[error[0]].append((error[2], error))
            sortedErrors = []
            for k in sorted(arrangedErrors.keys()):
                sortedErrors.extend([n[1] for n in sorted(arrangedErrors[k])])

            for error in sortedErrors:
                #print r[0], error
                eLen = error[1]
                ePos = error[2]
                if error[0] == 'D' or error[0] == 'I':
                    anyIndelErr = True
                    extraCigarVal = []
                    if totalD > availB:  # if not enough bases to fill-in deletions, skip all indel erors
                        continue
                    if expandedCigar == []:
                        expandedCigar = CigarString(stringIn=myCigar).getList()

                        fillToGo = totalD - totalI
                        if fillToGo > 0:
                            extraCigarVal = CigarString(
                                stringIn=self.allCigar[myPloid][
                                    r[0] + fillToGo]).getList()[-fillToGo:]

                    # insert deletion error into read and update cigar string accordingly
                    if error[0] == 'D':
                        pi = ePos + adj
                        pf = ePos + adj + eLen + 1
                        if str(r[3][pi:pf]) == str(error[3]):
                            r[3] = r[3][:pi + 1] + r[3][pf:]
                            expandedCigar = expandedCigar[:pi +
                                                          1] + expandedCigar[
                                                              pf:]
                            expandedCigar[pi +
                                          1] = 'D' * eLen + expandedCigar[pi +
                                                                          1]
                        else:
                            print '\nError, ref does not match alt while attempting to insert deletion error!\n'
                            exit(1)
                        adj -= eLen
                        for i in xrange(ePos, len(sse_adj)):
                            sse_adj[i] -= eLen

                    # insert insertion error into read and update cigar string accordingly
                    else:
                        if chr(r[3][ePos + adj]) == error[3]:
                            r[3] = r[3][:ePos +
                                        adj] + error[4] + r[3][ePos + adj + 1:]
                            expandedCigar = expandedCigar[:ePos + adj] + [
                                'I'
                            ] * eLen + expandedCigar[ePos + adj + 1:]
                        else:
                            print '\nError, ref does not match alt while attempting to insert insertion error!\n'
                            exit(1)
                        adj += eLen
                        for i in xrange(ePos, len(sse_adj)):
                            sse_adj[i] += eLen

                else:  # substitution errors, much easier by comparison...
                    if chr(r[3][ePos + sse_adj[ePos]]) == error[3]:
                        r[3][ePos + sse_adj[ePos]] = error[4]
                    else:
                        print '\nError, ref does not match alt while attempting to insert substitution error!\n'
                        exit(1)

            if anyIndelErr:
                if len(expandedCigar):
                    #print myCigar,'-->',
                    relevantCigar = (expandedCigar +
                                     extraCigarVal)[:self.readLen]
                    myCigar = CigarString(listIn=relevantCigar).getString()
                    #print myCigar

                # trim the deletion fill-in buffer back to the read length
                r[3] = r[3][:self.readLen]
                #if len(r[3]) != self.readLen:
                #	print 'AHHHHHH_1'
                #	exit(1)
                #if len(expandedCigar+extraCigarVal) != self.readLen:
                #	print 'AHHHHHH_2'
                #	exit(1)

            # report position in reference coordinates (undo indel shifts)
            rOut.append([
                r[0] - self.adj[myPloid][r[0]], myCigar,
                str(r[3]),
                str(r[1])
            ])

        # rOut[i] = (pos, cigar, read_string, qual_string)
        return rOut
	def __init__(self, readLen, errorModel, reScaledError):
		"""Load a pickled sequencing error model and build sampling distributions.

		readLen:       desired read length; the model is rescaled if it differs.
		errorModel:    path to a pickled model with 6 entries (single q-score
		               model) or 8 entries (separate R1/R2 models).
		reScaledError: if not None, overall error rate is scaled to this value.

		NOTE(review): this def sits outside any visible class here and is
		byte-identical to ReadContainer.__init__ below — looks like a stray
		duplicate; confirm.
		"""

		self.readLen = readLen

		# NOTE(review): if the pickle has neither 6 nor 8 entries, the names
		# unpacked below stay unbound and later lines raise NameError.
		errorDat = pickle.load(open(errorModel,'rb'))
		if len(errorDat) == 6:		# only 1 q-score model present, use same model for both strands
			[initQ1,probQ1,Qscores,offQ,avgError,errorParams] = errorDat
			self.PE_MODELS = False
		elif len(errorDat) == 8:	# found a q-score model for both forward and reverse strands
			#print 'Using paired-read quality score profiles...'
			[initQ1,probQ1,initQ2,probQ2,Qscores,offQ,avgError,errorParams] = errorDat
			self.PE_MODELS = True
			if len(initQ1) != len(initQ2) or len(probQ1) != len(probQ2):
				print '\nError: R1 and R2 quality score models are of different length.\n'
				exit(1)

		# qErrRate[q] = Phred error probability 10^(-q/10) for each q-score
		self.qErrRate = [0.]*(max(Qscores)+1)
		for q in Qscores:
			self.qErrRate[q] = 10.**(-q/10.)
		self.offQ = offQ

		# errorParams = [SSE_PROB, SIE_RATE, SIE_PROB, SIE_VAL, SIE_INS_FREQ, SIE_INS_NUCL]
		self.errP   = errorParams
		# substitution alt-allele distributions (one per reference nucleotide),
		# indel-length distribution, and inserted-nucleotide distribution
		self.errSSE = [DiscreteDistribution(n,NUCL) for n in self.errP[0]]
		self.errSIE = DiscreteDistribution(self.errP[2],self.errP[3])
		self.errSIN = DiscreteDistribution(self.errP[5],NUCL)

		# adjust length to match desired read length
		if self.readLen == len(initQ1):
			self.qIndRemap = range(self.readLen)
		else:
			print 'Warning: Read length of error model ('+str(len(initQ1))+') does not match -R value ('+str(self.readLen)+'), rescaling model...'
			# relies on Python 2 integer (floor) division for the remapping
			self.qIndRemap = [max([1,len(initQ1)*n/readLen]) for n in xrange(readLen)]

		# adjust sequencing error frequency to match desired rate
		if reScaledError == None:
			self.errorScale = 1.0
		else:
			self.errorScale = reScaledError/avgError
			print 'Warning: Quality scores no longer exactly representative of error probability. Error model scaled by {0:.3f} to match desired rate...'.format(self.errorScale)

		# initialize probability distributions
		# initDistByPos1[i]: q-score distribution for read position i;
		# probDistByPosByPrevQ1[i][q]: transition distribution given previous q
		self.initDistByPos1        = [DiscreteDistribution(initQ1[i],Qscores) for i in xrange(len(initQ1))]
		self.probDistByPosByPrevQ1 = [None]
		for i in xrange(1,len(initQ1)):
			self.probDistByPosByPrevQ1.append([])
			for j in xrange(len(initQ1[0])):
				if np.sum(probQ1[i][j]) <= 0.:	# if we don't have sufficient data for a transition, use the previous qscore
					self.probDistByPosByPrevQ1[-1].append(DiscreteDistribution([1],[Qscores[j]],degenerateVal=Qscores[j]))
				else:
					self.probDistByPosByPrevQ1[-1].append(DiscreteDistribution(probQ1[i][j],Qscores))

		if self.PE_MODELS:
			self.initDistByPos2        = [DiscreteDistribution(initQ2[i],Qscores) for i in xrange(len(initQ2))]
			self.probDistByPosByPrevQ2 = [None]
			for i in xrange(1,len(initQ2)):
				self.probDistByPosByPrevQ2.append([])
				for j in xrange(len(initQ2[0])):
					if np.sum(probQ2[i][j]) <= 0.:	# if we don't have sufficient data for a transition, use the previous qscore
						self.probDistByPosByPrevQ2[-1].append(DiscreteDistribution([1],[Qscores[j]],degenerateVal=Qscores[j]))
					else:
						self.probDistByPosByPrevQ2[-1].append(DiscreteDistribution(probQ2[i][j],Qscores))
class ReadContainer:
	def __init__(self, readLen, errorModel, reScaledError):

		self.readLen = readLen

		errorDat = pickle.load(open(errorModel,'rb'))
		if len(errorDat) == 6:		# only 1 q-score model present, use same model for both strands
			[initQ1,probQ1,Qscores,offQ,avgError,errorParams] = errorDat
			self.PE_MODELS = False
		elif len(errorDat) == 8:	# found a q-score model for both forward and reverse strands
			#print 'Using paired-read quality score profiles...'
			[initQ1,probQ1,initQ2,probQ2,Qscores,offQ,avgError,errorParams] = errorDat
			self.PE_MODELS = True
			if len(initQ1) != len(initQ2) or len(probQ1) != len(probQ2):
				print '\nError: R1 and R2 quality score models are of different length.\n'
				exit(1)

		self.qErrRate = [0.]*(max(Qscores)+1)
		for q in Qscores:
			self.qErrRate[q] = 10.**(-q/10.)
		self.offQ = offQ

		# errorParams = [SSE_PROB, SIE_RATE, SIE_PROB, SIE_VAL, SIE_INS_FREQ, SIE_INS_NUCL]
		self.errP   = errorParams
		self.errSSE = [DiscreteDistribution(n,NUCL) for n in self.errP[0]]
		self.errSIE = DiscreteDistribution(self.errP[2],self.errP[3])
		self.errSIN = DiscreteDistribution(self.errP[5],NUCL)

		# adjust length to match desired read length
		if self.readLen == len(initQ1):
			self.qIndRemap = range(self.readLen)
		else:
			print 'Warning: Read length of error model ('+str(len(initQ1))+') does not match -R value ('+str(self.readLen)+'), rescaling model...'
			self.qIndRemap = [max([1,len(initQ1)*n/readLen]) for n in xrange(readLen)]

		# adjust sequencing error frequency to match desired rate
		if reScaledError == None:
			self.errorScale = 1.0
		else:
			self.errorScale = reScaledError/avgError
			print 'Warning: Quality scores no longer exactly representative of error probability. Error model scaled by {0:.3f} to match desired rate...'.format(self.errorScale)

		# initialize probability distributions
		self.initDistByPos1        = [DiscreteDistribution(initQ1[i],Qscores) for i in xrange(len(initQ1))]
		self.probDistByPosByPrevQ1 = [None]
		for i in xrange(1,len(initQ1)):
			self.probDistByPosByPrevQ1.append([])
			for j in xrange(len(initQ1[0])):
				if np.sum(probQ1[i][j]) <= 0.:	# if we don't have sufficient data for a transition, use the previous qscore
					self.probDistByPosByPrevQ1[-1].append(DiscreteDistribution([1],[Qscores[j]],degenerateVal=Qscores[j]))
				else:
					self.probDistByPosByPrevQ1[-1].append(DiscreteDistribution(probQ1[i][j],Qscores))

		if self.PE_MODELS:
			self.initDistByPos2        = [DiscreteDistribution(initQ2[i],Qscores) for i in xrange(len(initQ2))]
			self.probDistByPosByPrevQ2 = [None]
			for i in xrange(1,len(initQ2)):
				self.probDistByPosByPrevQ2.append([])
				for j in xrange(len(initQ2[0])):
					if np.sum(probQ2[i][j]) <= 0.:	# if we don't have sufficient data for a transition, use the previous qscore
						self.probDistByPosByPrevQ2[-1].append(DiscreteDistribution([1],[Qscores[j]],degenerateVal=Qscores[j]))
					else:
						self.probDistByPosByPrevQ2[-1].append(DiscreteDistribution(probQ2[i][j],Qscores))

	def getSequencingErrors(self, readData, isReverseStrand=False):

		qOut = [0]*self.readLen
		sErr = []

		if self.PE_MODELS and isReverseStrand:
			myQ = self.initDistByPos2[0].sample()
		else:
			myQ = self.initDistByPos1[0].sample()

		if random.random() < self.qErrRate[myQ]:
			sErr.append(0)
		qOut[0] = myQ + self.offQ
		for i in xrange(1,self.readLen):

			if self.PE_MODELS and isReverseStrand:
				myQ = self.probDistByPosByPrevQ2[self.qIndRemap[i]][myQ].sample()
			else:
				myQ = self.probDistByPosByPrevQ1[self.qIndRemap[i]][myQ].sample()

			if random.random() < self.errorScale*self.qErrRate[myQ]:
				sErr.append(i)
			qOut[i] = myQ + self.offQ
		qOut = ''.join([chr(n) for n in qOut])

		sOut = []
		nDelSoFar = 0
		# don't allow indel errors to occur on subsequent positions
		prevIndel = -2
		# don't allow other sequencing errors to occur on bases removed by deletion errors
		delBlacklist = []

		for ind in sErr[::-1]:	# for each error that we're going to insert...

			# determine error type
			isSub = True
			if ind != 0 and ind != self.readLen-1-max(self.errP[3]) and ind > prevIndel+1:
				if random.random() < self.errP[1]:
					isSub = False

			# errorOut = (type, len, pos, ref, alt)

			if isSub:								# insert substitution error
				myNucl  = chr(readData[ind])
				newNucl = self.errSSE[NUC_IND[myNucl]].sample()
				sOut.append(('S',1,ind,myNucl,newNucl))
			else:									# insert indel error
				indelLen = self.errSIE.sample()
				if random.random() < self.errP[4]:		# insertion error
					myNucl  = chr(readData[ind])
					newNucl = myNucl + ''.join([self.errSIN.sample() for n in xrange(indelLen)])
					sOut.append(('I',len(newNucl)-1,ind,myNucl,newNucl))
				elif ind < self.readLen-2-nDelSoFar:	# deletion error (prevent too many of them from stacking up)
					myNucl  = str(readData[ind:ind+indelLen+1])
					newNucl = chr(readData[ind])
					nDelSoFar += len(myNucl)-1
					sOut.append(('D',len(myNucl)-1,ind,myNucl,newNucl))
					for i in xrange(ind+1,ind+indelLen+1):
						delBlacklist.append(i)
				prevIndel = ind

		# remove blacklisted errors
		for i in xrange(len(sOut)-1,-1,-1):
			if sOut[i][2] in delBlacklist:
				del sOut[i]

		return (qOut,sOut)
class SequenceContainer:
	"""Holds one reference-sequence window (one mutable copy per ploid),
	inserts random and user-specified variants into it, and samples reads
	(with coverage bias and matching cigar strings) from the mutated copies."""
	def __init__(self, xOffset, sequence, ploidy, windowOverlap, readLen, mutationModels=[], mutRate=None, coverageDat=None, onlyVCF=False):
		"""Build a container for one reference window.

		xOffset:        offset of this window within the full reference
		sequence:       reference sequence content for this window
		ploidy:         number of independent sequence copies to mutate
		windowOverlap:  overlap (bp) with adjacent windows; mutations are
		                disallowed at the overlap boundary positions
		readLen:        length of reads to be sampled
		mutationModels: optional per-ploid mutation model parameter lists
		mutRate:        optional target to rescale average mutation rate
		coverageDat:    (windowSize, coverageValues) for coverage-biased sampling
		onlyVCF:        if True, skip coverage/cigar initialization
		"""
		# initialize basic variables
		self.onlyVCF = onlyVCF
		self.init_basicVars(xOffset, sequence, ploidy, windowOverlap, readLen, coverageDat)
		# initialize mutation models
		self.init_mutModels(mutationModels, mutRate)
		# sample the number of variants that will be inserted into each ploid
		self.init_poisson()
		self.indelsToAdd = [n.sample() for n in self.ind_pois]
		self.snpsToAdd   = [n.sample() for n in self.snp_pois]
		# initialize trinuc snp bias
		self.init_trinucBias()

	def init_basicVars(self, xOffset, sequence, ploidy, windowOverlap, readLen, coverageDat):
		"""Reset per-window state: sequence copies, variant containers, blacklists."""
		self.x         = xOffset
		self.ploidy    = ploidy
		self.readLen   = readLen
		self.seqLen    = len(sequence)
		# one independent mutable copy of the window per ploid, plus empty
		# per-ploid containers for variants and precomputed cigar strings
		self.sequences = [bytearray(sequence) for _ in xrange(ploidy)]
		self.indelList = [[] for _ in xrange(ploidy)]
		self.snpList   = [[] for _ in xrange(ploidy)]
		self.allCigar  = [[] for _ in xrange(ploidy)]
		self.adj       = [None]*ploidy
		# blackList[ploid][pos] = 0		safe to insert variant here
		# blackList[ploid][pos] = 1		indel inserted here
		# blackList[ploid][pos] = 2		snp inserted here
		# blackList[ploid][pos] = 3		invalid position for various processing reasons
		self.blackList = [np.zeros(self.seqLen,dtype='<i4') for _ in xrange(ploidy)]

		# forbid mutations at the window-overlap boundary positions
		self.winBuffer = windowOverlap
		for ploid in xrange(ploidy):
			self.blackList[ploid][-self.winBuffer]   = 3
			self.blackList[ploid][-self.winBuffer-1] = 3

		# when only emitting a vcf, skip the coverage-depth machinery entirely
		if not self.onlyVCF:
			(self.windowSize, coverage_vals) = coverageDat
			self.win_per_read = int(self.readLen/float(self.windowSize)+0.5)
			self.which_bucket = DiscreteDistribution(coverage_vals,range(len(coverage_vals)))

	def init_mutModels(self,mutationModels,mutRate):
		"""Initialize per-ploid mutation models and optionally rescale rates.

		mutationModels: list of per-ploid model parameter lists (one per ploid),
		                or [] to use DEFAULT_MODEL_1 for every ploid
		mutRate:        if not None, rescale the average mutation rate to this value
		"""
		if mutationModels == []:
			ml = [copy.deepcopy(DEFAULT_MODEL_1) for n in xrange(self.ploidy)]
			self.modelData = ml[:self.ploidy]
		else:
			if len(mutationModels) != self.ploidy:
				print '\nError: Number of mutation models recieved is not equal to specified ploidy\n'
				exit(1)
			self.modelData = copy.deepcopy(mutationModels)

		# do we need to rescale mutation frequencies?
		mutRateSum = sum([n[0] for n in self.modelData])
		self.mutRescale = mutRate
		if self.mutRescale == None:
			self.mutScalar = 1.0
		else:
			# scale factor = desired rate / current mean rate across ploids
			self.mutScalar = float(self.mutRescale)/(mutRateSum/float(len(self.modelData)))

		# how are mutations spread to each ploid, based on their specified mut rates?
		self.ploidMutFrac  = [float(n[0])/mutRateSum for n in self.modelData]
		self.ploidMutPrior = DiscreteDistribution(self.ploidMutFrac,range(self.ploidy))

		# init mutation models
		#
		# self.models[ploid][0] = average mutation rate
		# self.models[ploid][1] = p(mut is homozygous | mutation occurs)
		# self.models[ploid][2] = p(mut is indel | mut occurs)
		# self.models[ploid][3] = p(insertion | indel occurs)
		# self.models[ploid][4] = distribution of insertion lengths
		# self.models[ploid][5] = distribution of deletion lengths
		# self.models[ploid][6] = distribution of trinucleotide SNP transitions
		# self.models[ploid][7] = p(trinuc mutates)
		self.models = []
		for n in self.modelData:
			# n[4]/n[5]: insertion length values/weights; n[6]/n[7]: deletion length values/weights
			self.models.append([self.mutScalar*n[0],n[1],n[2],n[3],DiscreteDistribution(n[5],n[4]),DiscreteDistribution(n[7],n[6]),[]])
			# n[8]: per-trinucleotide-context substitution matrices (one distribution per ref nucleotide)
			for m in n[8]:
				self.models[-1][6].append([DiscreteDistribution(m[0],NUCL),
					                       DiscreteDistribution(m[1],NUCL),
					                       DiscreteDistribution(m[2],NUCL),
					                       DiscreteDistribution(m[3],NUCL)])
			# n[9]: per-trinucleotide mutation probabilities (becomes self.models[ploid][7])
			self.models[-1].append([m for m in n[9]])

	def init_poisson(self):
		"""Build per-ploid poisson distributions for indel and snp counts.

		Lambda for each ploid = seqLen * avg mutation rate * (indel or snp
		fraction) * that ploid's share of mutations.
		"""
		k_range = range(int(self.seqLen*MAX_MUTFRAC))
		self.ind_pois = []
		self.snp_pois = []
		for i in xrange(len(self.models)):
			self.ind_pois.append(poisson_list(k_range,self.seqLen*self.models[i][0]*self.models[i][2]*self.ploidMutFrac[i]))
			self.snp_pois.append(poisson_list(k_range,self.seqLen*self.models[i][0]*(1.-self.models[i][2])*self.ploidMutFrac[i]))

	def init_trinucBias(self):
		"""Build per-ploid positional SNP bias from trinucleotide context.

		Only affects SNP placement. Note: indels are inserted before snps, so
		these biases may be slightly stale at indel-affected positions; this
		is considered negligible.
		"""
		lo = self.winBuffer+1
		hi = self.seqLen-1
		self.trinuc_bias = [None]*self.ploidy
		for p in xrange(self.ploidy):
			# weight each eligible position by the mutability of its trinucleotide context
			weights = [self.models[p][7][ALL_IND[str(self.sequences[p][i-1:i+2])]] for i in xrange(lo,hi)]
			self.trinuc_bias[p] = DiscreteDistribution(weights,range(lo,hi))

	def update(self, xOffset, sequence, ploidy, windowOverlap, readLen, mutationModels=[], mutRate=None, coverageDat=None):
		"""Re-initialize the container for a new window, rebuilding only the
		state that is invalidated (mutation models when ploidy/rate/models
		change; poisson distributions when the sequence length changes)."""
		# if mutation model is changed, we have to reinitialize it...
		if ploidy != self.ploidy or mutRate != self.mutRescale or mutationModels != []:
			self.ploidy = ploidy
			self.mutRescale = mutRate
			self.init_mutModels(mutationModels, mutRate)
		# if sequence length is different than previous window, we have to redo snp/indel poissons
		if len(sequence) != self.seqLen:
			self.seqLen = len(sequence)
			self.init_poisson()
		# basic vars
		self.init_basicVars(xOffset, sequence, ploidy, windowOverlap, readLen, coverageDat)
		# re-sample how many indels/snps this window will receive
		self.indelsToAdd = [n.sample() for n in self.ind_pois]
		self.snpsToAdd   = [n.sample() for n in self.snp_pois]
		#print (self.indelsToAdd,self.snpsToAdd)
		# initialize trinuc snp bias
		self.init_trinucBias()

	def insert_mutations(self, inputList):
		"""Insert user-specified variants into the appropriate ploids.

		inputList entries are tuples (pos, ref, [alt,...], ..., (genotype,...)).
		Variants whose positions are already blacklisted (occupied by another
		variant, or otherwise invalid) are skipped.
		"""
		#
		#	TODO!!!!!! user-input variants, determine which ploid to put it on, etc..
		#
		for inpV in inputList:
			whichPloid = []
			wps = inpV[4][0]
			if wps == None:	# if no genotype given, assume heterozygous and choose a single ploid based on their mut rates
				whichPloid.append(self.ploidMutPrior.sample())
				whichAlt = [0]
			else:
				if 'WP=' in wps:
					whichPloid = [int(n) for n in inpV[-1][3:].split(',') if n == '1']
					whichAlt   = [0]*len(whichPloid)
				elif '/' in wps or '|' in wps:
					if '/' in wps:
						splt = wps.split('/')
					else:
						splt = wps.split('|')
					whichPloid = []
					whichAlt   = []
					for i in xrange(len(splt)):
						if splt[i] == '1':
							whichPloid.append(i)
						whichAlt.append(int(splt[i])-1)

			for i in xrange(len(whichPloid)):
				p = whichPloid[i]
				myAlt = inpV[2][whichAlt[i]]
				myVar = (inpV[0]-self.x,inpV[1],myAlt)
				inLen = max([len(inpV[1]),len(myAlt)])
				#print myVar, chr(self.sequences[p][myVar[0]])
				if len(inpV[1]) == 1 and len(myAlt) == 1:
					if self.blackList[p][myVar[0]]:
						continue
					self.snpList[p].append(myVar)
					self.blackList[p][myVar[0]] = 2
				else:
					# BUGFIX: skip the whole variant if any affected position is
					# blacklisted. (The previous code's 'continue' only advanced the
					# inner k-loop, making the blacklist check a no-op, so conflicting
					# indels could be inserted on top of existing variants.)
					skipEvent = False
					for k in xrange(myVar[0],myVar[0]+inLen+1):
						if self.blackList[p][k]:
							skipEvent = True
							break
					if skipEvent:
						continue
					for k in xrange(myVar[0],myVar[0]+inLen+1):
						self.blackList[p][k] = 1
					self.indelList[p].append(myVar)

	def random_mutations(self):
		"""Insert the sampled indels and SNPs into each ploid's sequence.

		Applies the previously-sampled variant counts (self.indelsToAdd /
		self.snpsToAdd) plus any user-specified variants, mutates
		self.sequences in place, rebuilds the indel position-adjustment
		arrays (self.adj) and the per-position cigar strings (self.allCigar),
		and returns the implemented variants as tuples:
		(absolutePos, ref, alt, fractionOfPloids, 'WP='+genotypeString).
		"""

		#	add random indels
		all_indels  = [[] for n in self.sequences]
		for i in xrange(self.ploidy):
			for j in xrange(self.indelsToAdd[i]):
				if random.random() <= self.models[i][1]:	# insert homozygous indel
					whichPloid = range(self.ploidy)
				else:								# insert heterozygous indel
					whichPloid = [self.ploidMutPrior.sample()]

				# try to find suitable places to insert indels
				eventPos = -1
				for attempt in xrange(MAX_ATTEMPTS):
					eventPos = random.randint(self.winBuffer,self.seqLen-1)
					for p in whichPloid:
						if self.blackList[p][eventPos]:
							eventPos = -1
					if eventPos != -1:
						break
				if eventPos == -1:
					continue

				if random.random() <= self.models[i][3]:	# insertion
					inLen   = self.models[i][4].sample()
					# sequence content of random insertions is uniformly random (change this later)
					inSeq   = ''.join([random.choice(NUCL) for n in xrange(inLen)])
					refNucl = chr(self.sequences[i][eventPos])
					myIndel = (eventPos,refNucl,refNucl+inSeq)
				else:										# deletion
					inLen   = self.models[i][5].sample()
					if eventPos+inLen+1 >= len(self.sequences[i]):	# skip if deletion too close to boundary
						continue
					if inLen == 1:
						inSeq = chr(self.sequences[i][eventPos+1])
					else:
						inSeq = str(self.sequences[i][eventPos+1:eventPos+inLen+1])
					refNucl = chr(self.sequences[i][eventPos])
					myIndel = (eventPos,refNucl+inSeq,refNucl)

				# if event too close to boundary, skip. if event conflicts with other indel, skip.
				skipEvent = False
				if eventPos+len(myIndel[1]) >= self.seqLen-self.winBuffer-1:
					skipEvent = True
				if skipEvent:
					continue
				for p in whichPloid:
					for k in xrange(eventPos,eventPos+inLen+1):
						if self.blackList[p][k]:
							skipEvent = True
				if skipEvent:
					continue

				for p in whichPloid:
					for k in xrange(eventPos,eventPos+inLen+1):
						self.blackList[p][k] = 1
					all_indels[p].append(myIndel)

		# merge in user-specified indels; sort descending by position so that
		# applying them later doesn't shift the coordinates of ones not yet applied
		for i in xrange(len(all_indels)):
			all_indels[i].extend(self.indelList[i])
		all_indels = [sorted(n,reverse=True) for n in all_indels]
		#print all_indels

		#	add random snps
		all_snps  = [[] for n in self.sequences]
		for i in xrange(self.ploidy):
			for j in xrange(self.snpsToAdd[i]):
				if random.random() <= self.models[i][1]:	# insert homozygous SNP
					whichPloid = range(self.ploidy)
				else:								# insert heterozygous SNP
					whichPloid = [self.ploidMutPrior.sample()]

				# try to find suitable places to insert snps
				eventPos = -1
				for attempt in xrange(MAX_ATTEMPTS):
					# based on the mutation model for the specified ploid, choose a SNP location based on trinuc bias
					# (if there are multiple ploids, choose one at random)
					#eventPos = random.randint(self.winBuffer+1,self.seqLen-2)
					ploid_to_use = whichPloid[random.randint(0,len(whichPloid)-1)]
					eventPos     = self.trinuc_bias[ploid_to_use].sample()
					for p in whichPloid:
						if self.blackList[p][eventPos]:
							eventPos = -1
					if eventPos != -1:
						break
				if eventPos == -1:
					continue

				refNucl = chr(self.sequences[i][eventPos])
				# context = flanking nucleotides (left, right) of the SNP position
				context = str(chr(self.sequences[i][eventPos-1])+chr(self.sequences[i][eventPos+1]))
				# sample from tri-nucleotide substitution matrices to get SNP alt allele
				newNucl = self.models[i][6][TRI_IND[context]][NUC_IND[refNucl]].sample()
				mySNP   = (eventPos,refNucl,newNucl)

				for p in whichPloid:
					all_snps[p].append(mySNP)
					self.blackList[p][mySNP[0]] = 2

		# combine random snps with inserted snps, remove any snps that overlap indels
		for p in xrange(len(all_snps)):
			all_snps[p].extend(self.snpList[p])
			all_snps[p] = [n for n in all_snps[p] if self.blackList[p][n[0]] != 1]

		# modify reference sequences
		for i in xrange(len(all_snps)):
			for j in xrange(len(all_snps[i])):
				# sanity checking (for debugging purposes)
				vPos = all_snps[i][j][0]
				if all_snps[i][j][1] != chr(self.sequences[i][vPos]):
					print '\nError: Something went wrong!\n', all_snps[i][j], chr(self.sequences[i][vPos]),'\n'
					exit(1)
				else:
					self.sequences[i][vPos] = all_snps[i][j][2]

		# apply indels (in descending position order, so earlier coordinates stay valid);
		# adjToAdd[i] collects (pos, len(alt)-len(ref)) for the adjustment array below
		adjToAdd = [[] for n in xrange(self.ploidy)]
		for i in xrange(len(all_indels)):
			for j in xrange(len(all_indels[i])):
				# sanity checking (for debugging purposes)
				vPos  = all_indels[i][j][0]
				vPos2 = vPos + len(all_indels[i][j][1])
				#print all_indels[i][j], str(self.sequences[i][vPos:vPos2])
				#print len(self.sequences[i]),'-->',
				if all_indels[i][j][1] != str(self.sequences[i][vPos:vPos2]):
					print '\nError: Something went wrong!\n', all_indels[i][j], str(self.sequences[i][vPos:vPos2]),'\n'
					exit(1)
				else:
					self.sequences[i] = self.sequences[i][:vPos] + bytearray(all_indels[i][j][2]) + self.sequences[i][vPos2:]
					adjToAdd[i].append((all_indels[i][j][0],len(all_indels[i][j][2])-len(all_indels[i][j][1])))
				#print len(self.sequences[i])
			adjToAdd[i].sort()
			#print adjToAdd[i]

			# adj[i][j] = cumulative sequence-length change (insertions minus
			# deletions) accumulated at or before position j
			self.adj[i] = np.zeros(len(self.sequences[i]),dtype='<i4')
			indSoFar = 0
			valSoFar = 0
			for j in xrange(len(self.adj[i])):
				if indSoFar < len(adjToAdd[i]) and j >= adjToAdd[i][indSoFar][0]+1:
					valSoFar += adjToAdd[i][indSoFar][1]
					indSoFar += 1
				self.adj[i][j] = valSoFar

			# precompute cigar strings (we can skip this if going for only vcf output)
			if not self.onlyVCF:
				# expand adj deltas into a per-position M/I/D symbol list
				tempSymbolString = ['M']
				prevVal = self.adj[i][0]
				j = 1
				while j < len(self.adj[i]):
					diff = self.adj[i][j] - prevVal
					prevVal = self.adj[i][j]
					if diff > 0:	# insertion
						tempSymbolString.extend(['I']*abs(diff))
						j += abs(diff)
					elif diff < 0:	# deletion
						tempSymbolString.append('D'*abs(diff)+'M')
						j += 1
					else:
						tempSymbolString.append('M')
						j += 1

				# one precomputed cigar per possible read start position
				for j in xrange(len(tempSymbolString)-self.readLen):
					self.allCigar[i].append(CigarString(listIn=tempSymbolString[j:j+self.readLen]).getString())


		# tally up variants implemented
		countDict = {}
		all_variants = [sorted(all_snps[i]+all_indels[i]) for i in xrange(self.ploidy)]
		for i in xrange(len(all_variants)):
			for j in xrange(len(all_variants[i])):
				# shift back to absolute reference coordinates
				all_variants[i][j] = tuple([all_variants[i][j][0]+self.x])+all_variants[i][j][1:]
				t = tuple(all_variants[i][j])
				if t not in countDict:
					countDict[t] = []
				countDict[t].append(i)

		#
		#	TODO: combine multiple variants that happened to occur at same position into single vcf entry
		#

		# append (fraction of ploids carrying the variant, 'WP='-encoded genotype)
		output_variants = []
		for k in sorted(countDict.keys()):
			output_variants.append(k+tuple([len(countDict[k])/float(self.ploidy)]))
			ploid_string = ['0' for n in xrange(self.ploidy)]
			for k2 in [n for n in countDict[k]]:
				ploid_string[k2] = '1'
			output_variants[-1] += tuple(['WP='+'/'.join(ploid_string)])
		return output_variants


	def sample_read(self, sequencingModel, fragLen=None):
		"""Sample one read (or a read pair if fragLen is given) from a random ploid.

		sequencingModel: a ReadContainer supplying quality scores / sequencing errors
		fragLen:         fragment length for paired-end sampling, or None for single-end

		Returns a list of [pos, cigarString, readString, qualString] entries
		(one entry for single-end, two for paired-end); pos is in reference
		(pre-indel) coordinates.
		"""

		# choose a ploid
		myPloid = random.randint(0,self.ploidy-1)

		# choose a random position within the ploid, and generate quality scores / sequencing errors
		readsToSample = []
		if fragLen == None:
			#rPos = random.randint(0,len(self.sequences[myPloid])-self.readLen-1)	# uniform random

			# decide which subsection of the sequence to sample from using coverage probabilities
			coords_bad = True
			while coords_bad:
				myBucket = max([self.which_bucket.sample() - self.win_per_read, 0])
				coords_to_select_from = [myBucket*self.windowSize,(myBucket+1)*self.windowSize]
				# shift window bounds into mutated-sequence coordinates using the indel adjustments
				coords_to_select_from[0] += self.adj[myPloid][coords_to_select_from[0]]
				coords_to_select_from[1] += self.adj[myPloid][coords_to_select_from[1]]
				if coords_to_select_from[1] < len(self.sequences[myPloid])-self.readLen:
					coords_bad = False
			rPos = random.randint(coords_to_select_from[0],coords_to_select_from[1]-1)

			# sample read position and call function to compute quality scores / sequencing errors
			rDat = self.sequences[myPloid][rPos:rPos+self.readLen]
			(myQual, myErrors) = sequencingModel.getSequencingErrors(rDat)
			readsToSample.append([rPos,myQual,myErrors,rDat])

		else:
			#rPos1 = random.randint(0,len(self.sequences[myPloid])-fragLen-1)		# uniform random

			# decide which subsection of the sequence to sample from using coverage probabilities
			coords_bad = True
			while coords_bad:
				myBucket = max([self.which_bucket.sample() - self.win_per_read, 0])
				coords_to_select_from = [myBucket*self.windowSize,(myBucket+1)*self.windowSize]
				coords_to_select_from[0] += self.adj[myPloid][coords_to_select_from[0]]
				coords_to_select_from[1] += self.adj[myPloid][coords_to_select_from[0]]	# both ends use index of starting position to avoid issues with reads spanning breakpoints of large events
				rPos1 = random.randint(coords_to_select_from[0],coords_to_select_from[1]-1)
				# for PE-reads, flip a coin to decide if R1 or R2 will be the "covering" read
				if random.randint(1,2) == 1 and rPos1 > fragLen - self.readLen:
					rPos1 -= fragLen - self.readLen
				if rPos1 < len(self.sequences[myPloid])-fragLen:
					coords_bad = False

			rPos2 = rPos1 + fragLen - self.readLen
			rDat1 = self.sequences[myPloid][rPos1:rPos1+self.readLen]
			rDat2 = self.sequences[myPloid][rPos2:rPos2+self.readLen]
			(myQual1, myErrors1) = sequencingModel.getSequencingErrors(rDat1)
			(myQual2, myErrors2) = sequencingModel.getSequencingErrors(rDat2,isReverseStrand=True)
			readsToSample.append([rPos1,myQual1,myErrors1,rDat1])
			readsToSample.append([rPos2,myQual2,myErrors2,rDat2])

		# error format:
		# myError[i] = (type, len, pos, ref, alt)

		# examine sequencing errors to-be-inserted.
		#	- remove deletions that don't have enough bordering sequence content to "fill in"
		# if error is valid, make the changes to the read data
		rOut = []
		for r in readsToSample:
			myCigar = self.allCigar[myPloid][r[0]]
			totalD  = sum([error[1] for error in r[2] if error[0] == 'D'])
			totalI  = sum([error[1] for error in r[2] if error[0] == 'I'])
			availB  = len(self.sequences[myPloid]) - r[0] - self.readLen - 1
			# add buffer sequence to fill in positions that get deleted
			r[3] += self.sequences[myPloid][r[0]+self.readLen:r[0]+self.readLen+totalD]
			expandedCigar = []
			extraCigar    = []
			adj           = 0	# cumulative read-position shift from applied indel errors
			sse_adj       = [0 for n in xrange(self.readLen)]	# per-position shift for substitution errors
			anyIndelErr   = False

			# sort by letter (D > I > S) such that we introduce all indel errors before substitution errors
			# secondarily, sort by index
			arrangedErrors = {'D':[],'I':[],'S':[]}
			for error in r[2]:
				arrangedErrors[error[0]].append((error[2],error))
			sortedErrors = []
			for k in sorted(arrangedErrors.keys()):
				sortedErrors.extend([n[1] for n in sorted(arrangedErrors[k])])

			for error in sortedErrors:
				#print r[0], error
				eLen = error[1]
				ePos = error[2]
				if error[0] == 'D' or error[0] == 'I':
					anyIndelErr   = True
					extraCigarVal = []
					if totalD > availB:	# if not enough bases to fill-in deletions, skip all indel erors
						continue
					if expandedCigar == []:
						# lazily expand the precomputed cigar into per-position symbols
						expandedCigar = CigarString(stringIn=myCigar).getList()

						fillToGo = totalD - totalI
						if fillToGo > 0:
							# borrow trailing cigar symbols from the shifted start position to cover fill-in bases
							extraCigarVal = CigarString(stringIn=self.allCigar[myPloid][r[0]+fillToGo]).getList()[-fillToGo:]

					# insert deletion error into read and update cigar string accordingly
					if error[0] == 'D':
						pi = ePos+adj
						pf = ePos+adj+eLen+1
						if str(r[3][pi:pf]) == str(error[3]):
							r[3] = r[3][:pi+1] + r[3][pf:]
							expandedCigar = expandedCigar[:pi+1] + expandedCigar[pf:]
							expandedCigar[pi+1] = 'D'*eLen + expandedCigar[pi+1]
						else:
							print '\nError, ref does not match alt while attempting to insert deletion error!\n'
							exit(1)
						adj -= eLen
						for i in xrange(ePos,len(sse_adj)):
							sse_adj[i] -= eLen

					# insert insertion error into read and update cigar string accordingly
					else:
						if chr(r[3][ePos+adj]) == error[3]:
							r[3] = r[3][:ePos+adj] + error[4] + r[3][ePos+adj+1:]
							expandedCigar = expandedCigar[:ePos+adj] + ['I']*eLen + expandedCigar[ePos+adj+1:]
						else:
							print '\nError, ref does not match alt while attempting to insert insertion error!\n'
							exit(1)
						adj += eLen
						for i in xrange(ePos,len(sse_adj)):
							sse_adj[i] += eLen

				else:	# substitution errors, much easier by comparison...
					if chr(r[3][ePos+sse_adj[ePos]]) == error[3]:
						r[3][ePos+sse_adj[ePos]] = error[4]
					else:
						print '\nError, ref does not match alt while attempting to insert substitution error!\n'
						exit(1)

			if anyIndelErr:
				if len(expandedCigar):
					#print myCigar,'-->',
					# extraCigarVal holds the fill-in symbols from the last indel error processed
					relevantCigar = (expandedCigar+extraCigarVal)[:self.readLen]
					myCigar = CigarString(listIn=relevantCigar).getString()
					#print myCigar

				r[3] = r[3][:self.readLen]
				#if len(r[3]) != self.readLen:
				#	print 'AHHHHHH_1'
				#	exit(1)
				#if len(expandedCigar+extraCigarVal) != self.readLen:
				#	print 'AHHHHHH_2'
				#	exit(1)

			# convert position in mutated sequence back to reference coordinates via adj
			rOut.append([r[0]-self.adj[myPloid][r[0]],myCigar,str(r[3]),str(r[1])])

		# rOut[i] = (pos, cigar, read_string, qual_string)
		return rOut