def init_basicVars(self, xOffset, sequence, ploidy, windowOverlap, readLen, coverageDat):
    """Reset per-window state: per-ploid sequence copies, variant lists,
    position blacklists, and (unless onlyVCF) coverage-sampling helpers."""
    self.x = xOffset
    self.ploidy = ploidy
    self.readLen = readLen
    self.seqLen = len(sequence)
    # one mutable copy of the reference sequence per ploid
    self.sequences = [bytearray(sequence) for _ in xrange(self.ploidy)]
    self.indelList = [[] for _ in xrange(self.ploidy)]
    self.snpList = [[] for _ in xrange(self.ploidy)]
    self.allCigar = [[] for _ in xrange(self.ploidy)]
    self.adj = [None for _ in xrange(self.ploidy)]
    # blackList codes per position: 0 = free, 1 = indel here, 2 = snp here, 3 = unusable
    self.blackList = [np.zeros(self.seqLen, dtype='<i4') for _ in xrange(self.ploidy)]

    # mark the two positions flanking the window-overlap point as off-limits
    self.winBuffer = windowOverlap
    for ploid in xrange(self.ploidy):
        self.blackList[ploid][-self.winBuffer] = 3
        self.blackList[ploid][-self.winBuffer - 1] = 3

    # coverage-depth machinery is skipped when only a VCF is being produced
    if not self.onlyVCF:
        self.windowSize, coverage_vals = coverageDat
        self.win_per_read = int(self.readLen / float(self.windowSize) + 0.5)
        self.which_bucket = DiscreteDistribution(coverage_vals, range(len(coverage_vals)))
	def init_basicVars(self, xOffset, sequence, ploidy, windowOverlap, readLen, coverageDat):
		"""Reset all per-window bookkeeping for this container."""
		self.x = xOffset
		self.ploidy = ploidy
		self.readLen = readLen
		self.seqLen = len(sequence)
		# one mutable copy of the reference sequence per ploid
		self.sequences = [bytearray(sequence) for _ in xrange(self.ploidy)]
		self.indelList = [[] for _ in xrange(self.ploidy)]
		self.snpList = [[] for _ in xrange(self.ploidy)]
		self.allCigar = [[] for _ in xrange(self.ploidy)]
		self.adj = [None for _ in xrange(self.ploidy)]
		# blackList codes per position: 0 free, 1 indel, 2 snp, 3 unusable
		self.blackList = [np.zeros(self.seqLen, dtype='<i4') for _ in xrange(self.ploidy)]

		# forbid mutations at the two positions flanking the window overlap point
		self.winBuffer = windowOverlap
		for ploid in xrange(self.ploidy):
			for offset in (self.winBuffer, self.winBuffer + 1):
				self.blackList[ploid][-offset] = 3

		# coverage-depth machinery is skipped in VCF-only mode
		if not self.onlyVCF:
			self.windowSize, coverage_vals = coverageDat
			self.win_per_read = int(self.readLen / float(self.windowSize) + 0.5)
			self.which_bucket = DiscreteDistribution(coverage_vals, range(len(coverage_vals)))
Example #3
0
	def init_trinucBias(self):
		"""Build the per-ploid positional SNP bias from trinucleotide context.

		Note: indels are inserted before snps, so these biases may be slightly
		stale at indel-affected positions; treated as negligible.
		"""
		lo = self.winBuffer + 1
		hi = self.seqLen - 1
		self.trinuc_bias = [None] * self.ploidy
		for ploid in xrange(self.ploidy):
			seq = self.sequences[ploid]
			rates = self.models[ploid][7]
			# weight of each interior position = p(the trinuc centered there mutates)
			bias = [rates[ALL_IND[str(seq[pos - 1:pos + 2])]] for pos in xrange(lo, hi)]
			self.trinuc_bias[ploid] = DiscreteDistribution(bias, range(lo, hi))
    def init_mutModels(self, mutationModels, mutRate):
        if mutationModels == []:
            ml = [copy.deepcopy(DEFAULT_MODEL_1) for n in xrange(self.ploidy)]
            self.modelData = ml[:self.ploidy]
        else:
            if len(mutationModels) != self.ploidy:
                print '\nError: Number of mutation models recieved is not equal to specified ploidy\n'
                exit(1)
            self.modelData = copy.deepcopy(mutationModels)

        # do we need to rescale mutation frequencies?
        mutRateSum = sum([n[0] for n in self.modelData])
        self.mutRescale = mutRate
        if self.mutRescale == None:
            self.mutScalar = 1.0
        else:
            self.mutScalar = float(
                self.mutRescale) / (mutRateSum / float(len(self.modelData)))

        # how are mutations spread to each ploid, based on their specified mut rates?
        self.ploidMutFrac = [float(n[0]) / mutRateSum for n in self.modelData]
        self.ploidMutPrior = DiscreteDistribution(self.ploidMutFrac,
                                                  range(self.ploidy))

        # init mutation models
        #
        # self.models[ploid][0] = average mutation rate
        # self.models[ploid][1] = p(mut is homozygous | mutation occurs)
        # self.models[ploid][2] = p(mut is indel | mut occurs)
        # self.models[ploid][3] = p(insertion | indel occurs)
        # self.models[ploid][4] = distribution of insertion lengths
        # self.models[ploid][5] = distribution of deletion lengths
        # self.models[ploid][6] = distribution of trinucleotide SNP transitions
        # self.models[ploid][7] = p(trinuc mutates)
        self.models = []
        for n in self.modelData:
            self.models.append([
                self.mutScalar * n[0], n[1], n[2], n[3],
                DiscreteDistribution(n[5], n[4]),
                DiscreteDistribution(n[7], n[6]), []
            ])
            for m in n[8]:
                self.models[-1][6].append([
                    DiscreteDistribution(m[0], NUCL),
                    DiscreteDistribution(m[1], NUCL),
                    DiscreteDistribution(m[2], NUCL),
                    DiscreteDistribution(m[3], NUCL)
                ])
            self.models[-1].append([m for m in n[9]])
	def init_mutModels(self,mutationModels,mutRate):
		if mutationModels == []:
			ml = [copy.deepcopy(DEFAULT_MODEL_1) for n in xrange(self.ploidy)]
			self.modelData = ml[:self.ploidy]
		else:
			if len(mutationModels) != self.ploidy:
				print '\nError: Number of mutation models recieved is not equal to specified ploidy\n'
				exit(1)
			self.modelData = copy.deepcopy(mutationModels)

		# do we need to rescale mutation frequencies?
		mutRateSum = sum([n[0] for n in self.modelData])
		self.mutRescale = mutRate
		if self.mutRescale == None:
			self.mutScalar = 1.0
		else:
			self.mutScalar = float(self.mutRescale)/(mutRateSum/float(len(self.modelData)))

		# how are mutations spread to each ploid, based on their specified mut rates?
		self.ploidMutFrac  = [float(n[0])/mutRateSum for n in self.modelData]
		self.ploidMutPrior = DiscreteDistribution(self.ploidMutFrac,range(self.ploidy))

		# init mutation models
		#
		# self.models[ploid][0] = average mutation rate
		# self.models[ploid][1] = p(mut is homozygous | mutation occurs)
		# self.models[ploid][2] = p(mut is indel | mut occurs)
		# self.models[ploid][3] = p(insertion | indel occurs)
		# self.models[ploid][4] = distribution of insertion lengths
		# self.models[ploid][5] = distribution of deletion lengths
		# self.models[ploid][6] = distribution of trinucleotide SNP transitions
		# self.models[ploid][7] = p(trinuc mutates)
		self.models = []
		for n in self.modelData:
			self.models.append([self.mutScalar*n[0],n[1],n[2],n[3],DiscreteDistribution(n[5],n[4]),DiscreteDistribution(n[7],n[6]),[]])
			for m in n[8]:
				self.models[-1][6].append([DiscreteDistribution(m[0],NUCL),
					                       DiscreteDistribution(m[1],NUCL),
					                       DiscreteDistribution(m[2],NUCL),
					                       DiscreteDistribution(m[3],NUCL)])
			self.models[-1].append([m for m in n[9]])
Example #6
0
	[GC_SCALE_COUNT, GC_SCALE_VAL] = pickle.load(open(GC_BIAS_MODEL,'rb'))
	GC_WINDOW_SIZE = GC_SCALE_COUNT[-1]

#	fragment length distribution
#
if PAIRED_END and not(PAIRED_END_ARTIFICIAL):
	print 'Using empirical fragment length distribution.'
	# model file holds two parallel lists: candidate fragment lengths and their probabilities
	[potential_vals, potential_prob] = pickle.load(open(FRAGLEN_MODEL,'rb'))
	FRAGLEN_VALS = []
	FRAGLEN_PROB = []
	# keep only fragment lengths strictly longer than the read length
	for i in xrange(len(potential_vals)):
		if potential_vals[i] > READLEN:
			FRAGLEN_VALS.append(potential_vals[i])
			FRAGLEN_PROB.append(potential_prob[i])
	# should probably add some validation and sanity-checking code here...
	FRAGLEN_DISTRIBUTION = DiscreteDistribution(FRAGLEN_PROB,FRAGLEN_VALS)
	# representative fragment size = value at the mean index of the weighted list
	FRAGMENT_SIZE = FRAGLEN_VALS[mean_ind_of_weighted_list(FRAGLEN_PROB)]

#	Indicate not writing FASTQ reads
#
if NO_FASTQ:
	print 'Bypassing FASTQ generation...'

"""************************************************
****            HARD-CODED CONSTANTS
************************************************"""


# target window size for read sampling. how many times bigger than read/frag length
WINDOW_TARGET_SCALE = 100
# sub-window size for read sampling windows. this is basically the finest resolution
Example #7
0
	def __init__(self, readLen, errorModel, reScaledError):
		"""Load a sequencing error model from a pickle file.

		readLen:       length of reads to generate quality strings for
		errorModel:    path to the pickled error-model file
		reScaledError: desired average error rate, or None to use the model's own

		The pickle's length determines its flavor:
		  4 entries -> uniform-error single-ended model (e.g. PacBio)
		  6 entries -> one positional q-score model used for both strands
		  8 entries -> separate q-score models for forward / reverse strands
		"""
		self.readLen = readLen

		errorDat = pickle.load(open(errorModel,'rb'))
		self.UNIFORM = False
		if len(errorDat) == 4:		# uniform-error SE reads (e.g. PacBio)
			self.UNIFORM = True
			[Qscores,offQ,avgError,errorParams] = errorDat
			# single q-score equivalent to the model's average error rate
			self.uniform_qscore = int(-10.*np.log10(avgError)+0.5)
			print 'Using uniform sequencing error model. (q='+str(self.uniform_qscore)+'+'+str(offQ)+', p(err)={0:0.2f}%)'.format(100.*avgError)
		if len(errorDat) == 6:		# only 1 q-score model present, use same model for both strands
			[initQ1,probQ1,Qscores,offQ,avgError,errorParams] = errorDat
			self.PE_MODELS = False
		elif len(errorDat) == 8:	# found a q-score model for both forward and reverse strands
			#print 'Using paired-read quality score profiles...'
			[initQ1,probQ1,initQ2,probQ2,Qscores,offQ,avgError,errorParams] = errorDat
			self.PE_MODELS = True
			if len(initQ1) != len(initQ2) or len(probQ1) != len(probQ2):
				print '\nError: R1 and R2 quality score models are of different length.\n'
				exit(1)


		# error probability for each possible q-score: p = 10^(-q/10)
		self.qErrRate = [0.]*(max(Qscores)+1)
		for q in Qscores:
			self.qErrRate[q] = 10.**(-q/10.)
		self.offQ = offQ

		# errorParams = [SSE_PROB, SIE_RATE, SIE_PROB, SIE_VAL, SIE_INS_FREQ, SIE_INS_NUCL]
		self.errP   = errorParams
		# substitution transition distributions (one per reference nucleotide)
		self.errSSE = [DiscreteDistribution(n,NUCL) for n in self.errP[0]]
		# sequencing-indel length distribution
		self.errSIE = DiscreteDistribution(self.errP[2],self.errP[3])
		# inserted-nucleotide distribution for insertion errors
		self.errSIN = DiscreteDistribution(self.errP[5],NUCL)

		# adjust sequencing error frequency to match desired rate
		if reScaledError == None:
			self.errorScale = 1.0
		else:
			self.errorScale = reScaledError/avgError
			print 'Warning: Quality scores no longer exactly representative of error probability. Error model scaled by {0:.3f} to match desired rate...'.format(self.errorScale)

		if self.UNIFORM == False:
			# adjust length to match desired read length
			if self.readLen == len(initQ1):
				self.qIndRemap = range(self.readLen)
			else:
				print 'Warning: Read length of error model ('+str(len(initQ1))+') does not match -R value ('+str(self.readLen)+'), rescaling model...'
				self.qIndRemap = [max([1,len(initQ1)*n/readLen]) for n in xrange(readLen)]

			# initialize probability distributions
			self.initDistByPos1        = [DiscreteDistribution(initQ1[i],Qscores) for i in xrange(len(initQ1))]
			self.probDistByPosByPrevQ1 = [None]
			for i in xrange(1,len(initQ1)):
				self.probDistByPosByPrevQ1.append([])
				for j in xrange(len(initQ1[0])):
					if np.sum(probQ1[i][j]) <= 0.:	# if we don't have sufficient data for a transition, use the previous qscore
						self.probDistByPosByPrevQ1[-1].append(DiscreteDistribution([1],[Qscores[j]],degenerateVal=Qscores[j]))
					else:
						self.probDistByPosByPrevQ1[-1].append(DiscreteDistribution(probQ1[i][j],Qscores))

			if self.PE_MODELS:
				self.initDistByPos2        = [DiscreteDistribution(initQ2[i],Qscores) for i in xrange(len(initQ2))]
				self.probDistByPosByPrevQ2 = [None]
				for i in xrange(1,len(initQ2)):
					self.probDistByPosByPrevQ2.append([])
					for j in xrange(len(initQ2[0])):
						if np.sum(probQ2[i][j]) <= 0.:	# if we don't have sufficient data for a transition, use the previous qscore
							self.probDistByPosByPrevQ2[-1].append(DiscreteDistribution([1],[Qscores[j]],degenerateVal=Qscores[j]))
						else:
							self.probDistByPosByPrevQ2[-1].append(DiscreteDistribution(probQ2[i][j],Qscores))
Example #8
0
class ReadContainer:
	"""Holds a sequencing error model loaded from a pickle; samples quality
	strings and sequencing errors (substitutions / indels) for reads."""
	def __init__(self, readLen, errorModel, reScaledError):
		"""Load a sequencing error model from a pickle file.

		readLen:       length of reads to generate quality strings for
		errorModel:    path to the pickled error-model file
		reScaledError: desired average error rate, or None to use the model's own

		The pickle's length determines its flavor:
		  4 entries -> uniform-error single-ended model (e.g. PacBio)
		  6 entries -> one positional q-score model used for both strands
		  8 entries -> separate q-score models for forward / reverse strands
		"""
		self.readLen = readLen

		errorDat = pickle.load(open(errorModel,'rb'))
		self.UNIFORM = False
		if len(errorDat) == 4:		# uniform-error SE reads (e.g. PacBio)
			self.UNIFORM = True
			[Qscores,offQ,avgError,errorParams] = errorDat
			# single q-score equivalent to the model's average error rate
			self.uniform_qscore = int(-10.*np.log10(avgError)+0.5)
			print 'Using uniform sequencing error model. (q='+str(self.uniform_qscore)+'+'+str(offQ)+', p(err)={0:0.2f}%)'.format(100.*avgError)
		if len(errorDat) == 6:		# only 1 q-score model present, use same model for both strands
			[initQ1,probQ1,Qscores,offQ,avgError,errorParams] = errorDat
			self.PE_MODELS = False
		elif len(errorDat) == 8:	# found a q-score model for both forward and reverse strands
			#print 'Using paired-read quality score profiles...'
			[initQ1,probQ1,initQ2,probQ2,Qscores,offQ,avgError,errorParams] = errorDat
			self.PE_MODELS = True
			if len(initQ1) != len(initQ2) or len(probQ1) != len(probQ2):
				print '\nError: R1 and R2 quality score models are of different length.\n'
				exit(1)


		# error probability for each possible q-score: p = 10^(-q/10)
		self.qErrRate = [0.]*(max(Qscores)+1)
		for q in Qscores:
			self.qErrRate[q] = 10.**(-q/10.)
		self.offQ = offQ

		# errorParams = [SSE_PROB, SIE_RATE, SIE_PROB, SIE_VAL, SIE_INS_FREQ, SIE_INS_NUCL]
		self.errP   = errorParams
		# substitution transition distributions (one per reference nucleotide)
		self.errSSE = [DiscreteDistribution(n,NUCL) for n in self.errP[0]]
		# sequencing-indel length distribution
		self.errSIE = DiscreteDistribution(self.errP[2],self.errP[3])
		# inserted-nucleotide distribution for insertion errors
		self.errSIN = DiscreteDistribution(self.errP[5],NUCL)

		# adjust sequencing error frequency to match desired rate
		if reScaledError == None:
			self.errorScale = 1.0
		else:
			self.errorScale = reScaledError/avgError
			print 'Warning: Quality scores no longer exactly representative of error probability. Error model scaled by {0:.3f} to match desired rate...'.format(self.errorScale)

		if self.UNIFORM == False:
			# adjust length to match desired read length
			if self.readLen == len(initQ1):
				self.qIndRemap = range(self.readLen)
			else:
				print 'Warning: Read length of error model ('+str(len(initQ1))+') does not match -R value ('+str(self.readLen)+'), rescaling model...'
				self.qIndRemap = [max([1,len(initQ1)*n/readLen]) for n in xrange(readLen)]

			# initialize probability distributions
			self.initDistByPos1        = [DiscreteDistribution(initQ1[i],Qscores) for i in xrange(len(initQ1))]
			self.probDistByPosByPrevQ1 = [None]
			for i in xrange(1,len(initQ1)):
				self.probDistByPosByPrevQ1.append([])
				for j in xrange(len(initQ1[0])):
					if np.sum(probQ1[i][j]) <= 0.:	# if we don't have sufficient data for a transition, use the previous qscore
						self.probDistByPosByPrevQ1[-1].append(DiscreteDistribution([1],[Qscores[j]],degenerateVal=Qscores[j]))
					else:
						self.probDistByPosByPrevQ1[-1].append(DiscreteDistribution(probQ1[i][j],Qscores))

			if self.PE_MODELS:
				self.initDistByPos2        = [DiscreteDistribution(initQ2[i],Qscores) for i in xrange(len(initQ2))]
				self.probDistByPosByPrevQ2 = [None]
				for i in xrange(1,len(initQ2)):
					self.probDistByPosByPrevQ2.append([])
					for j in xrange(len(initQ2[0])):
						if np.sum(probQ2[i][j]) <= 0.:	# if we don't have sufficient data for a transition, use the previous qscore
							self.probDistByPosByPrevQ2[-1].append(DiscreteDistribution([1],[Qscores[j]],degenerateVal=Qscores[j]))
						else:
							self.probDistByPosByPrevQ2[-1].append(DiscreteDistribution(probQ2[i][j],Qscores))

	def getSequencingErrors(self, readData, isReverseStrand=False):
		"""Sample a quality string and sequencing errors for one read.

		readData:        the read sequence (indexable as byte values)
		isReverseStrand: when PE models are loaded, use the R2 quality model

		Returns (qualOut, errorsOut): qualOut is the ascii quality string,
		errorsOut a list of (type 'S'/'I'/'D', len, pos, ref, alt) tuples.
		"""
		qOut = [0]*self.readLen
		sErr = []

		if self.UNIFORM:
			# every position gets the same q-score; errors drawn at a flat rate
			myQ  = [self.uniform_qscore + self.offQ for n in xrange(self.readLen)]
			qOut = ''.join([chr(n) for n in myQ])
			for i in xrange(self.readLen):
				if random.random() < self.errorScale*self.qErrRate[self.uniform_qscore]:
					sErr.append(i)
		else:
			# markov chain over q-scores: each position's q depends on the previous one
			if self.PE_MODELS and isReverseStrand:
				myQ = self.initDistByPos2[0].sample()
			else:
				myQ = self.initDistByPos1[0].sample()

			if random.random() < self.qErrRate[myQ]:
				sErr.append(0)
			qOut[0] = myQ + self.offQ
			for i in xrange(1,self.readLen):

				if self.PE_MODELS and isReverseStrand:
					myQ = self.probDistByPosByPrevQ2[self.qIndRemap[i]][myQ].sample()
				else:
					myQ = self.probDistByPosByPrevQ1[self.qIndRemap[i]][myQ].sample()

				if random.random() < self.errorScale*self.qErrRate[myQ]:
					sErr.append(i)
				qOut[i] = myQ + self.offQ
			qOut = ''.join([chr(n) for n in qOut])

		if self.errorScale == 0.0:
			return (qOut,[])

		sOut = []
		nDelSoFar = 0
		# don't allow indel errors to occur on subsequent positions
		prevIndel = -2
		# don't allow other sequencing errors to occur on bases removed by deletion errors
		delBlacklist = []

		for ind in sErr[::-1]:	# for each error that we're going to insert...

			# determine error type
			isSub = True
			if ind != 0 and ind != self.readLen-1-max(self.errP[3]) and abs(ind-prevIndel) > 1:
				if random.random() < self.errP[1]:
					isSub = False

			# errorOut = (type, len, pos, ref, alt)

			if isSub:								# insert substitution error
				myNucl  = chr(readData[ind])
				newNucl = self.errSSE[NUC_IND[myNucl]].sample()
				sOut.append(('S',1,ind,myNucl,newNucl))
			else:									# insert indel error
				indelLen = self.errSIE.sample()
				if random.random() < self.errP[4]:		# insertion error
					myNucl  = chr(readData[ind])
					newNucl = myNucl + ''.join([self.errSIN.sample() for n in xrange(indelLen)])
					sOut.append(('I',len(newNucl)-1,ind,myNucl,newNucl))
				elif ind < self.readLen-2-nDelSoFar:	# deletion error (prevent too many of them from stacking up)
					myNucl  = str(readData[ind:ind+indelLen+1])
					newNucl = chr(readData[ind])
					nDelSoFar += len(myNucl)-1
					sOut.append(('D',len(myNucl)-1,ind,myNucl,newNucl))
					for i in xrange(ind+1,ind+indelLen+1):
						delBlacklist.append(i)
				prevIndel = ind

		# remove blacklisted errors
		for i in xrange(len(sOut)-1,-1,-1):
			if sOut[i][2] in delBlacklist:
				del sOut[i]

		return (qOut,sOut)
Example #9
0
	def init_coverage(self,coverageDat,fragDist=None):
		"""Precompute per-ploid coverage-sampling distributions.

		coverageDat: (windowSize, gc_scalars, targetCov_vals)
		fragDist:    empirical fragment-length distribution for paired-end
		             mode, or None for single-ended reads

		Returns the mean per-base coverage scalar across ploids (implicitly
		None in onlyVCF mode, where all of this is skipped).
		"""
		# if we're only creating a vcf, skip some expensive initialization related to coverage depth
		if not self.onlyVCF:
			(self.windowSize, gc_scalars, targetCov_vals) = coverageDat
			gcCov_vals = [[] for n in self.sequences]
			trCov_vals = [[] for n in self.sequences]
			self.coverage_distribution = []
			avg_out = []
			for i in xrange(len(self.sequences)):
				# compute gc-bias
				j = 0
				while j+self.windowSize < len(self.sequences[i]):
					gc_c = self.sequences[i][j:j+self.windowSize].count('G') + self.sequences[i][j:j+self.windowSize].count('C')
					gcCov_vals[i].extend([gc_scalars[gc_c]]*self.windowSize)
					j += self.windowSize
				# final (partial) window reuses the gc-count of the last windowSize bases
				gc_c = self.sequences[i][-self.windowSize:].count('G') + self.sequences[i][-self.windowSize:].count('C')
				gcCov_vals[i].extend([gc_scalars[gc_c]]*(len(self.sequences[i])-len(gcCov_vals[i])))
				#
				# targeted-coverage track, read off the FM_pos/FM_span index maps
				trCov_vals[i].append(targetCov_vals[0])
				prevVal = self.FM_pos[i][0]
				for j in xrange(1,len(self.sequences[i])-self.readLen):
					if self.FM_pos[i][j] == None:
						trCov_vals[i].append(targetCov_vals[prevVal])
					else:
						trCov_vals[i].append(sum(targetCov_vals[self.FM_pos[i][j]:self.FM_span[i][j]])/float(self.FM_span[i][j]-self.FM_pos[i][j]))
						prevVal = self.FM_pos[i][j]
					#print (i,j), self.adj[i][j], self.allCigar[i][j], self.FM_pos[i][j], self.FM_span[i][j]
				# shift by half of read length
				trCov_vals[i] = [0.0]*int(self.readLen/2) + trCov_vals[i][:-int(self.readLen/2.)]
				# fill in missing indices
				trCov_vals[i].extend([0.0]*(len(self.sequences[i])-len(trCov_vals[i])))

				# cumulative sum lets each read-window's coverage be computed in O(1)
				covvec = np.cumsum([trCov_vals[i][nnn]*gcCov_vals[i][nnn] for nnn in xrange(len(trCov_vals[i]))])
				coverage_vals = []
				for j in xrange(0,len(self.sequences[i])-self.readLen):
					coverage_vals.append(covvec[j+self.readLen] - covvec[j])
				avg_out.append(np.mean(coverage_vals)/float(self.readLen))

				if fragDist == None:
					self.coverage_distribution.append(DiscreteDistribution(coverage_vals,range(len(coverage_vals))))
			
				# fragment length nightmare
				else:
					# quantize fragment lengths into percentile buckets so only one
					# coverage distribution per representative length is needed
					currentThresh = 0.
					index_list    = [0]
					for j in xrange(len(fragDist.cumP)):
						if fragDist.cumP[j] >= currentThresh + COV_FRAGLEN_PERCENTILE/100.0:
							currentThresh = fragDist.cumP[j]
							index_list.append(j)
					flq = [fragDist.values[nnn] for nnn in index_list]
					if fragDist.values[-1] not in flq:
						flq.append(fragDist.values[-1])
					flq.append(LARGE_NUMBER)

					# map every possible fragment length to its nearest quantized value
					self.fraglens_indMap = {}
					for j in fragDist.values:
						bInd = bisect.bisect(flq,j)
						if abs(flq[bInd-1] - j) <= abs(flq[bInd] - j):
							self.fraglens_indMap[j] = flq[bInd-1]
						else:
							self.fraglens_indMap[j] = flq[bInd]

					# one coverage distribution per quantized fragment length
					self.coverage_distribution.append({})
					for flv in sorted(list(set(self.fraglens_indMap.values()))):
						buffer_val = self.readLen
						for j in fragDist.values:
							if self.fraglens_indMap[j] == flv and j > buffer_val:
								buffer_val = j
						# combined coverage of both mates of a fragment of length flv
						coverage_vals = []
						for j in xrange(len(self.sequences[i])-buffer_val):
							coverage_vals.append(covvec[j+self.readLen] - covvec[j] + covvec[j+flv] - covvec[j+flv-self.readLen])

						# EXPERIMENTAL
						#quantized_covVals = quantize_list(coverage_vals)
						#self.coverage_distribution[i][flv] = DiscreteDistribution([n[2] for n in quantized_covVals],[(n[0],n[1]) for n in quantized_covVals])

						# TESTING
						#import matplotlib.pyplot as mpl
						#print len(coverage_vals),'-->',len(quantized_covVals)
						#mpl.figure(0)
						#mpl.plot(range(len(coverage_vals)),coverage_vals)
						#for qcv in quantized_covVals:
						#	mpl.plot([qcv[0],qcv[1]+1],[qcv[2],qcv[2]],'r')
						#mpl.show()
						#exit(1)

						self.coverage_distribution[i][flv] = DiscreteDistribution(coverage_vals,range(len(coverage_vals)))

			return np.mean(avg_out)
Example #10
0
class SequenceContainer:
	def __init__(self, xOffset, sequence, ploidy, windowOverlap, readLen, mutationModels=[], mutRate=None, onlyVCF=False):
		"""Set up a mutable window of sequence plus its mutation machinery."""
		self.onlyVCF = onlyVCF
		# basic per-window state (sequence copies, blacklists, ...)
		self.init_basicVars(xOffset, sequence, ploidy, windowOverlap, readLen)
		# per-ploid mutation models
		self.init_mutModels(mutationModels, mutRate)
		# poisson draws decide how many indels / snps each ploid receives
		self.init_poisson()
		self.indelsToAdd = [pois.sample() for pois in self.ind_pois]
		self.snpsToAdd = [pois.sample() for pois in self.snp_pois]
		# positional snp bias from trinucleotide context
		self.init_trinucBias()

	def init_basicVars(self, xOffset, sequence, ploidy, windowOverlap, readLen):
		"""Reset per-window bookkeeping: sequence copies, variant lists, blacklists."""
		self.x = xOffset
		self.ploidy = ploidy
		self.readLen = readLen
		self.seqLen = len(sequence)
		# one mutable copy of the reference sequence per ploid
		self.sequences = [bytearray(sequence) for _ in xrange(self.ploidy)]
		self.indelList = [[] for _ in xrange(self.ploidy)]
		self.snpList = [[] for _ in xrange(self.ploidy)]
		self.allCigar = [[] for _ in xrange(self.ploidy)]
		self.FM_pos = [[] for _ in xrange(self.ploidy)]
		self.FM_span = [[] for _ in xrange(self.ploidy)]
		self.adj = [None for _ in xrange(self.ploidy)]
		# blackList codes per position: 0 free, 1 indel, 2 snp, 3 unusable
		self.blackList = [np.zeros(self.seqLen, dtype='<i4') for _ in xrange(self.ploidy)]

		# forbid mutations at the two positions flanking the window overlap point
		self.winBuffer = windowOverlap
		for ploid in xrange(self.ploidy):
			for offset in (self.winBuffer, self.winBuffer + 1):
				self.blackList[ploid][-offset] = 3

	def init_coverage(self,coverageDat,fragDist=None):
		"""Precompute per-ploid coverage-sampling distributions.

		coverageDat: (windowSize, gc_scalars, targetCov_vals)
		fragDist:    empirical fragment-length distribution for paired-end
		             mode, or None for single-ended reads

		Returns the mean per-base coverage scalar across ploids (implicitly
		None in onlyVCF mode, where all of this is skipped).
		"""
		# if we're only creating a vcf, skip some expensive initialization related to coverage depth
		if not self.onlyVCF:
			(self.windowSize, gc_scalars, targetCov_vals) = coverageDat
			gcCov_vals = [[] for n in self.sequences]
			trCov_vals = [[] for n in self.sequences]
			self.coverage_distribution = []
			avg_out = []
			for i in xrange(len(self.sequences)):
				# compute gc-bias
				j = 0
				while j+self.windowSize < len(self.sequences[i]):
					gc_c = self.sequences[i][j:j+self.windowSize].count('G') + self.sequences[i][j:j+self.windowSize].count('C')
					gcCov_vals[i].extend([gc_scalars[gc_c]]*self.windowSize)
					j += self.windowSize
				# final (partial) window reuses the gc-count of the last windowSize bases
				gc_c = self.sequences[i][-self.windowSize:].count('G') + self.sequences[i][-self.windowSize:].count('C')
				gcCov_vals[i].extend([gc_scalars[gc_c]]*(len(self.sequences[i])-len(gcCov_vals[i])))
				#
				# targeted-coverage track, read off the FM_pos/FM_span index maps
				trCov_vals[i].append(targetCov_vals[0])
				prevVal = self.FM_pos[i][0]
				for j in xrange(1,len(self.sequences[i])-self.readLen):
					if self.FM_pos[i][j] == None:
						trCov_vals[i].append(targetCov_vals[prevVal])
					else:
						trCov_vals[i].append(sum(targetCov_vals[self.FM_pos[i][j]:self.FM_span[i][j]])/float(self.FM_span[i][j]-self.FM_pos[i][j]))
						prevVal = self.FM_pos[i][j]
					#print (i,j), self.adj[i][j], self.allCigar[i][j], self.FM_pos[i][j], self.FM_span[i][j]
				# shift by half of read length
				trCov_vals[i] = [0.0]*int(self.readLen/2) + trCov_vals[i][:-int(self.readLen/2.)]
				# fill in missing indices
				trCov_vals[i].extend([0.0]*(len(self.sequences[i])-len(trCov_vals[i])))

				# cumulative sum lets each read-window's coverage be computed in O(1)
				covvec = np.cumsum([trCov_vals[i][nnn]*gcCov_vals[i][nnn] for nnn in xrange(len(trCov_vals[i]))])
				coverage_vals = []
				for j in xrange(0,len(self.sequences[i])-self.readLen):
					coverage_vals.append(covvec[j+self.readLen] - covvec[j])
				avg_out.append(np.mean(coverage_vals)/float(self.readLen))

				if fragDist == None:
					self.coverage_distribution.append(DiscreteDistribution(coverage_vals,range(len(coverage_vals))))
			
				# fragment length nightmare
				else:
					# quantize fragment lengths into percentile buckets so only one
					# coverage distribution per representative length is needed
					currentThresh = 0.
					index_list    = [0]
					for j in xrange(len(fragDist.cumP)):
						if fragDist.cumP[j] >= currentThresh + COV_FRAGLEN_PERCENTILE/100.0:
							currentThresh = fragDist.cumP[j]
							index_list.append(j)
					flq = [fragDist.values[nnn] for nnn in index_list]
					if fragDist.values[-1] not in flq:
						flq.append(fragDist.values[-1])
					flq.append(LARGE_NUMBER)

					# map every possible fragment length to its nearest quantized value
					self.fraglens_indMap = {}
					for j in fragDist.values:
						bInd = bisect.bisect(flq,j)
						if abs(flq[bInd-1] - j) <= abs(flq[bInd] - j):
							self.fraglens_indMap[j] = flq[bInd-1]
						else:
							self.fraglens_indMap[j] = flq[bInd]

					# one coverage distribution per quantized fragment length
					self.coverage_distribution.append({})
					for flv in sorted(list(set(self.fraglens_indMap.values()))):
						buffer_val = self.readLen
						for j in fragDist.values:
							if self.fraglens_indMap[j] == flv and j > buffer_val:
								buffer_val = j
						# combined coverage of both mates of a fragment of length flv
						coverage_vals = []
						for j in xrange(len(self.sequences[i])-buffer_val):
							coverage_vals.append(covvec[j+self.readLen] - covvec[j] + covvec[j+flv] - covvec[j+flv-self.readLen])

						# EXPERIMENTAL
						#quantized_covVals = quantize_list(coverage_vals)
						#self.coverage_distribution[i][flv] = DiscreteDistribution([n[2] for n in quantized_covVals],[(n[0],n[1]) for n in quantized_covVals])

						# TESTING
						#import matplotlib.pyplot as mpl
						#print len(coverage_vals),'-->',len(quantized_covVals)
						#mpl.figure(0)
						#mpl.plot(range(len(coverage_vals)),coverage_vals)
						#for qcv in quantized_covVals:
						#	mpl.plot([qcv[0],qcv[1]+1],[qcv[2],qcv[2]],'r')
						#mpl.show()
						#exit(1)

						self.coverage_distribution[i][flv] = DiscreteDistribution(coverage_vals,range(len(coverage_vals)))

			return np.mean(avg_out)

	def init_mutModels(self,mutationModels,mutRate):
		if mutationModels == []:
			ml = [copy.deepcopy(DEFAULT_MODEL_1) for n in xrange(self.ploidy)]
			self.modelData = ml[:self.ploidy]
		else:
			if len(mutationModels) != self.ploidy:
				print '\nError: Number of mutation models recieved is not equal to specified ploidy\n'
				exit(1)
			self.modelData = copy.deepcopy(mutationModels)

		# do we need to rescale mutation frequencies?
		mutRateSum = sum([n[0] for n in self.modelData])
		self.mutRescale = mutRate
		if self.mutRescale == None:
			self.mutScalar = 1.0
		else:
			self.mutScalar = float(self.mutRescale)/(mutRateSum/float(len(self.modelData)))

		# how are mutations spread to each ploid, based on their specified mut rates?
		self.ploidMutFrac  = [float(n[0])/mutRateSum for n in self.modelData]
		self.ploidMutPrior = DiscreteDistribution(self.ploidMutFrac,range(self.ploidy))

		# init mutation models
		#
		# self.models[ploid][0] = average mutation rate
		# self.models[ploid][1] = p(mut is homozygous | mutation occurs)
		# self.models[ploid][2] = p(mut is indel | mut occurs)
		# self.models[ploid][3] = p(insertion | indel occurs)
		# self.models[ploid][4] = distribution of insertion lengths
		# self.models[ploid][5] = distribution of deletion lengths
		# self.models[ploid][6] = distribution of trinucleotide SNP transitions
		# self.models[ploid][7] = p(trinuc mutates)
		self.models = []
		for n in self.modelData:
			self.models.append([self.mutScalar*n[0],n[1],n[2],n[3],DiscreteDistribution(n[5],n[4]),DiscreteDistribution(n[7],n[6]),[]])
			for m in n[8]:
				self.models[-1][6].append([DiscreteDistribution(m[0],NUCL),
					                       DiscreteDistribution(m[1],NUCL),
					                       DiscreteDistribution(m[2],NUCL),
					                       DiscreteDistribution(m[3],NUCL)])
			self.models[-1].append([m for m in n[9]])

	def init_poisson(self):
		"""Build per-ploid poisson samplers for indel and snp counts."""
		k_range = range(int(self.seqLen * MAX_MUTFRAC))
		self.ind_pois = []
		self.snp_pois = []
		for i in xrange(len(self.models)):
			avg_rate = self.models[i][0]
			indel_frac = self.models[i][2]
			frac = self.ploidMutFrac[i]
			# expected mutation count on this ploid, split into indel vs snp portions
			self.ind_pois.append(poisson_list(k_range, self.seqLen * avg_rate * indel_frac * frac))
			self.snp_pois.append(poisson_list(k_range, self.seqLen * avg_rate * (1. - indel_frac) * frac))

	def init_trinucBias(self):
		"""Build the per-ploid positional SNP bias from trinucleotide context.

		Note: indels are inserted before snps, so these biases may be slightly
		stale at indel-affected positions; treated as negligible.
		"""
		lo = self.winBuffer + 1
		hi = self.seqLen - 1
		self.trinuc_bias = [None] * self.ploidy
		for ploid in xrange(self.ploidy):
			seq = self.sequences[ploid]
			rates = self.models[ploid][7]
			# weight of each interior position = p(the trinuc centered there mutates)
			bias = [rates[ALL_IND[str(seq[pos - 1:pos + 2])]] for pos in xrange(lo, hi)]
			self.trinuc_bias[ploid] = DiscreteDistribution(bias, range(lo, hi))

	def update(self, xOffset, sequence, ploidy, windowOverlap, readLen, mutationModels=[], mutRate=None):
		"""Re-point this container at a new sequence window, reusing cached
		models and poisson samplers when their inputs are unchanged."""
		# a changed ploidy / rate / model set forces a model rebuild
		models_changed = (ploidy != self.ploidy or mutRate != self.mutRescale or mutationModels != [])
		if models_changed:
			self.ploidy = ploidy
			self.mutRescale = mutRate
			self.init_mutModels(mutationModels, mutRate)
		# a changed window length invalidates the poisson samplers
		if len(sequence) != self.seqLen:
			self.seqLen = len(sequence)
			self.init_poisson()
		# always rebuild per-window basics and redraw variant counts
		self.init_basicVars(xOffset, sequence, ploidy, windowOverlap, readLen)
		self.indelsToAdd = [pois.sample() for pois in self.ind_pois]
		self.snpsToAdd = [pois.sample() for pois in self.snp_pois]
		# positional snp bias (skipped when trinuc context is ignored)
		if not IGNORE_TRINUC:
			self.init_trinucBias()

	def insert_mutations(self, inputList):
		#
		#	TODO!!!!!! user-input variants, determine which ploid to put it on, etc..
		#
		for inpV in inputList:
			whichPloid = []
			wps = inpV[4][0]
			if wps == None:	# if no genotype given, assume heterozygous and choose a single ploid based on their mut rates
				whichPloid.append(self.ploidMutPrior.sample())
				whichAlt = [0]
			else:
				#if 'WP=' in wps:
				#	whichPloid = [int(n) for n in inpV[-1][3:].split(',') if n == '1']
				#	print 'WHICH:', whichPloid
				#	whichAlt   = [0]*len(whichPloid)
				#elif '/' in wps or '|' in wps:
				if '/' in wps or '|' in wps:
					if '/' in wps:
						splt = wps.split('/')
					else:
						splt = wps.split('|')
					whichPloid = []
					whichAlt   = []
					for i in xrange(len(splt)):
						if splt[i] == '1':
							whichPloid.append(i)
						#whichAlt.append(int(splt[i])-1)
					# assume we're just using first alt for inserted variants?
					whichAlt = [0 for n in whichPloid]
				else:	# otherwise assume monoploidy
					whichPloid = [0]
					whichAlt   = [0]

			# ignore invalid ploids
			for i in xrange(len(whichPloid)-1,-1,-1):
				if whichPloid[i] >= self.ploidy:
					del whichPloid[i]
						
			for i in xrange(len(whichPloid)):
				p = whichPloid[i]
				myAlt = inpV[2][whichAlt[i]]
				myVar = (inpV[0]-self.x,inpV[1],myAlt)
				inLen = max([len(inpV[1]),len(myAlt)])
				#print myVar, chr(self.sequences[p][myVar[0]])
				if myVar[0] < 0 or myVar[0] >= len(self.blackList[p]):
					print '\nError: Attempting to insert variant out of window bounds:'
					print myVar, '--> blackList[0:'+str(len(self.blackList[p]))+']\n'
					exit(1)
				if len(inpV[1]) == 1 and len(myAlt) == 1:
					if self.blackList[p][myVar[0]]:
						continue
					self.snpList[p].append(myVar)
					self.blackList[p][myVar[0]] = 2
				else:
					for k in xrange(myVar[0],myVar[0]+inLen+1):
						if self.blackList[p][k]:
							continue
					for k in xrange(myVar[0],myVar[0]+inLen+1):
						self.blackList[p][k] = 1
					self.indelList[p].append(myVar)

	def random_mutations(self):
		"""Generate random SNPs/indels, merge them with user-inserted variants,
		apply everything to each ploid's sequence, and precompute the per-position
		metadata needed for read sampling (adj offsets, cigar strings, FM_pos/FM_span).

		Returns a position-sorted list of tuples:
		    (absolute_pos, ref, alt, allele_freq, 'WP=<per-ploid genotype>')
		"""
		
		#	add random indels
		all_indels  = [[] for n in self.sequences]
		for i in xrange(self.ploidy):
			for j in xrange(self.indelsToAdd[i]):
				if random.random() <= self.models[i][1]:	# insert homozygous indel
					whichPloid = range(self.ploidy)
				else:								# insert heterozygous indel
					whichPloid = [self.ploidMutPrior.sample()]

				# try to find suitable places to insert indels
				eventPos = -1
				for attempt in xrange(MAX_ATTEMPTS):
					eventPos = random.randint(self.winBuffer,self.seqLen-1)
					for p in whichPloid:
						if self.blackList[p][eventPos]:
							eventPos = -1
					if eventPos != -1:
						break
				if eventPos == -1:
					continue

				if random.random() <= self.models[i][3]:	# insertion
					inLen   = self.models[i][4].sample()
					# sequence content of random insertions is uniformly random (change this later)
					inSeq   = ''.join([random.choice(NUCL) for n in xrange(inLen)])
					refNucl = chr(self.sequences[i][eventPos])
					myIndel = (eventPos,refNucl,refNucl+inSeq)
				else:										# deletion
					inLen   = self.models[i][5].sample()
					if eventPos+inLen+1 >= len(self.sequences[i]):	# skip if deletion too close to boundary
						continue
					if inLen == 1:
						inSeq = chr(self.sequences[i][eventPos+1])
					else:
						inSeq = str(self.sequences[i][eventPos+1:eventPos+inLen+1])
					refNucl = chr(self.sequences[i][eventPos])
					myIndel = (eventPos,refNucl+inSeq,refNucl)

				# if event too close to boundary, skip. if event conflicts with other indel, skip.
				skipEvent = False
				if eventPos+len(myIndel[1]) >= self.seqLen-self.winBuffer-1:
					skipEvent = True
				if skipEvent:
					continue
				for p in whichPloid:
					for k in xrange(eventPos,eventPos+inLen+1):
						if self.blackList[p][k]:
							skipEvent = True
				if skipEvent:
					continue

				for p in whichPloid:
					for k in xrange(eventPos,eventPos+inLen+1):
						self.blackList[p][k] = 1
					all_indels[p].append(myIndel)

		# merge in user-inserted indels; sort descending so applying them later
		# doesn't shift the coordinates of not-yet-applied (lower) positions
		for i in xrange(len(all_indels)):
			all_indels[i].extend(self.indelList[i])
		all_indels = [sorted(n,reverse=True) for n in all_indels]
		#print all_indels

		#	add random snps
		all_snps  = [[] for n in self.sequences]
		for i in xrange(self.ploidy):
			for j in xrange(self.snpsToAdd[i]):
				if random.random() <= self.models[i][1]:	# insert homozygous SNP
					whichPloid = range(self.ploidy)
				else:								# insert heterozygous SNP
					whichPloid = [self.ploidMutPrior.sample()]

				# try to find suitable places to insert snps
				eventPos = -1
				for attempt in xrange(MAX_ATTEMPTS):
					# based on the mutation model for the specified ploid, choose a SNP location based on trinuc bias
					# (if there are multiple ploids, choose one at random)
					if IGNORE_TRINUC:
						eventPos = random.randint(self.winBuffer+1,self.seqLen-2)
					else:
						ploid_to_use = whichPloid[random.randint(0,len(whichPloid)-1)]
						eventPos     = self.trinuc_bias[ploid_to_use].sample()
					for p in whichPloid:
						if self.blackList[p][eventPos]:
							eventPos = -1
					if eventPos != -1:
						break
				if eventPos == -1:
					continue

				refNucl = chr(self.sequences[i][eventPos])
				context = str(chr(self.sequences[i][eventPos-1])+chr(self.sequences[i][eventPos+1]))
				# sample from tri-nucleotide substitution matrices to get SNP alt allele
				newNucl = self.models[i][6][TRI_IND[context]][NUC_IND[refNucl]].sample()
				mySNP   = (eventPos,refNucl,newNucl)

				for p in whichPloid:
					all_snps[p].append(mySNP)
					self.blackList[p][mySNP[0]] = 2

		# combine random snps with inserted snps, remove any snps that overlap indels
		for p in xrange(len(all_snps)):
			all_snps[p].extend(self.snpList[p])
			all_snps[p] = [n for n in all_snps[p] if self.blackList[p][n[0]] != 1]

		# modify reference sequences
		for i in xrange(len(all_snps)):
			for j in xrange(len(all_snps[i])):
				# sanity checking (for debugging purposes)
				vPos = all_snps[i][j][0]
				if all_snps[i][j][1] != chr(self.sequences[i][vPos]):
					print '\nError: Something went wrong!\n', all_snps[i][j], chr(self.sequences[i][vPos]),'\n'
					exit(1)
				else:
					self.sequences[i][vPos] = all_snps[i][j][2]

		# apply indels (highest position first) and record (pos, net length change)
		adjToAdd = [[] for n in xrange(self.ploidy)]
		for i in xrange(len(all_indels)):
			for j in xrange(len(all_indels[i])):
				# sanity checking (for debugging purposes)
				vPos  = all_indels[i][j][0]
				vPos2 = vPos + len(all_indels[i][j][1])
				#print all_indels[i][j], str(self.sequences[i][vPos:vPos2])
				#print len(self.sequences[i]),'-->',
				if all_indels[i][j][1] != str(self.sequences[i][vPos:vPos2]):
					print '\nError: Something went wrong!\n', all_indels[i][j], str(self.sequences[i][vPos:vPos2]),'\n'
					exit(1)
				else:
					self.sequences[i] = self.sequences[i][:vPos] + bytearray(all_indels[i][j][2]) + self.sequences[i][vPos2:]
					adjToAdd[i].append((all_indels[i][j][0],len(all_indels[i][j][2])-len(all_indels[i][j][1])))
				#print len(self.sequences[i])
			adjToAdd[i].sort()
			#print adjToAdd[i]

			# adj[i][j] = cumulative indel-induced offset between this ploid's
			# coordinates and reference coordinates at position j
			self.adj[i] = np.zeros(len(self.sequences[i]),dtype='<i4')
			indSoFar = 0
			valSoFar = 0
			for j in xrange(len(self.adj[i])):
				if indSoFar < len(adjToAdd[i]) and j >= adjToAdd[i][indSoFar][0]+1:
					valSoFar += adjToAdd[i][indSoFar][1]
					indSoFar += 1
				self.adj[i][j] = valSoFar

			# precompute cigar strings (we can skip this is going for only vcf output)
			if not self.onlyVCF:
				# expand adj[] deltas into a per-base symbol list: 'I' runs for
				# insertions, 'D...M' composite symbols for deletions, 'M' otherwise
				tempSymbolString = ['M']
				prevVal = self.adj[i][0]
				j = 1
				while j < len(self.adj[i]):
					diff = self.adj[i][j] - prevVal
					prevVal = self.adj[i][j]
					if diff > 0:	# insertion
						tempSymbolString.extend(['I']*abs(diff))
						j += abs(diff)
					elif diff < 0:	# deletion
						tempSymbolString.append('D'*abs(diff)+'M')
						j += 1
					else:
						tempSymbolString.append('M')
						j += 1

				for j in xrange(len(tempSymbolString)-self.readLen):
					self.allCigar[i].append(CigarString(listIn=tempSymbolString[j:j+self.readLen]).getString())
					# pre-compute reference position of first matching base
					my_fm_pos = None
					for k in xrange(self.readLen):
						if 'M' in tempSymbolString[j+k]:
							my_fm_pos = j+k
							break
					if my_fm_pos == None:
						self.FM_pos[i].append(None)
						self.FM_span[i].append(None)
					else:
						self.FM_pos[i].append(my_fm_pos-self.adj[i][my_fm_pos])
						span_dif = len([nnn for nnn in tempSymbolString[j:j+self.readLen] if 'M' in nnn])
						self.FM_span[i].append(self.FM_pos[i][-1] + span_dif)

		# tally up variants implemented
		countDict = {}
		all_variants = [sorted(all_snps[i]+all_indels[i]) for i in xrange(self.ploidy)]
		for i in xrange(len(all_variants)):
			for j in xrange(len(all_variants[i])):
				all_variants[i][j] = tuple([all_variants[i][j][0]+self.x])+all_variants[i][j][1:]
				t = tuple(all_variants[i][j])
				if t not in countDict:
					countDict[t] = []
				countDict[t].append(i)

		#
		#	TODO: combine multiple variants that happened to occur at same position into single vcf entry
		#

		# emit (pos, ref, alt, allele_freq, 'WP=0/1/...') sorted by position
		output_variants = []
		for k in sorted(countDict.keys()):
			output_variants.append(k+tuple([len(countDict[k])/float(self.ploidy)]))
			ploid_string = ['0' for n in xrange(self.ploidy)]
			for k2 in [n for n in countDict[k]]:
				ploid_string[k2] = '1'
			output_variants[-1] += tuple(['WP='+'/'.join(ploid_string)])
		return output_variants


	def sample_read(self, sequencingModel, fragLen=None):
		"""Sample one read (or a pair, if fragLen is given) from a random ploid.

		Positions are drawn from self.coverage_distribution; sequencing errors
		come from sequencingModel.getSequencingErrors(). Indel errors are spliced
		into the read data and the cigar string is rewritten to match.

		Returns a list of [pos, cigar, read_string, qual_string] entries
		(one for single-end, two for paired-end).
		"""
		
		# choose a ploid
		myPloid = random.randint(0,self.ploidy-1)

		# stop attempting to find a valid position if we fail enough times
		MAX_READPOS_ATTEMPTS = 100
		attempts_thus_far    = 0

		# choose a random position within the ploid, and generate quality scores / sequencing errors
		readsToSample = []
		if fragLen == None:
			rPos = self.coverage_distribution[myPloid].sample()
			#####rPos = random.randint(0,len(self.sequences[myPloid])-self.readLen-1)	# uniform random
			####
			##### decide which subsection of the sequence to sample from using coverage probabilities
			####coords_bad = True
			####while coords_bad:
			####	attempts_thus_far += 1
			####	if attempts_thus_far > MAX_READPOS_ATTEMPTS:
			####		return None
			####	myBucket = max([self.which_bucket.sample() - self.win_per_read, 0])
			####	coords_to_select_from = [myBucket*self.windowSize,(myBucket+1)*self.windowSize]
			####	if coords_to_select_from[0] >= len(self.adj[myPloid]):	# prevent going beyond region boundaries
			####		continue
			####	coords_to_select_from[0] += self.adj[myPloid][coords_to_select_from[0]]
			####	coords_to_select_from[1] += self.adj[myPloid][coords_to_select_from[0]]
			####	if max(coords_to_select_from) <= 0: # prevent invalid negative coords due to adj
			####		continue
			####	if coords_to_select_from[1] - coords_to_select_from[0] <= 2:	# we don't span enough coords to sample
			####		continue
			####	if coords_to_select_from[1] < len(self.sequences[myPloid])-self.readLen:
			####		coords_bad = False
			####rPos = random.randint(coords_to_select_from[0],coords_to_select_from[1]-1)

			# sample read position and call function to compute quality scores / sequencing errors
			rDat = self.sequences[myPloid][rPos:rPos+self.readLen]
			(myQual, myErrors) = sequencingModel.getSequencingErrors(rDat)
			readsToSample.append([rPos,myQual,myErrors,rDat])

		else:
			rPos1 = self.coverage_distribution[myPloid][self.fraglens_indMap[fragLen]].sample()
			
			# EXPERIMENTAL
			#coords_to_select_from = self.coverage_distribution[myPloid][self.fraglens_indMap[fragLen]].sample()
			#rPos1 = random.randint(coords_to_select_from[0],coords_to_select_from[1])

			#####rPos1 = random.randint(0,len(self.sequences[myPloid])-fragLen-1)		# uniform random
			####
			##### decide which subsection of the sequence to sample from using coverage probabilities
			####coords_bad = True
			####while coords_bad:
			####	attempts_thus_far += 1
			####	if attempts_thus_far > MAX_READPOS_ATTEMPTS:
			####		#print coords_to_select_from
			####		return None
			####	myBucket = max([self.which_bucket.sample() - self.win_per_read, 0])
			####	coords_to_select_from = [myBucket*self.windowSize,(myBucket+1)*self.windowSize]
			####	if coords_to_select_from[0] >= len(self.adj[myPloid]):	# prevent going beyond region boundaries
			####		continue
			####	coords_to_select_from[0] += self.adj[myPloid][coords_to_select_from[0]]
			####	coords_to_select_from[1] += self.adj[myPloid][coords_to_select_from[0]]	# both ends use index of starting position to avoid issues with reads spanning breakpoints of large events
			####	if max(coords_to_select_from) <= 0: # prevent invalid negative coords due to adj
			####		continue
			####	if coords_to_select_from[1] - coords_to_select_from[0] <= 2:	# we don't span enough coords to sample
			####		continue
			####	rPos1 = random.randint(coords_to_select_from[0],coords_to_select_from[1]-1)
			####	# for PE-reads, flip a coin to decide if R1 or R2 will be the "covering" read
			####	if random.randint(1,2) == 1 and rPos1 > fragLen - self.readLen:
			####		rPos1 -= fragLen - self.readLen
			####	if rPos1 < len(self.sequences[myPloid])-fragLen:
			####		coords_bad = False

			# mate position: R2 starts so that the pair spans exactly fragLen bases
			rPos2 = rPos1 + fragLen - self.readLen
			rDat1 = self.sequences[myPloid][rPos1:rPos1+self.readLen]
			rDat2 = self.sequences[myPloid][rPos2:rPos2+self.readLen]
			#print len(rDat1), rPos1, len(self.sequences[myPloid])
			(myQual1, myErrors1) = sequencingModel.getSequencingErrors(rDat1)
			(myQual2, myErrors2) = sequencingModel.getSequencingErrors(rDat2,isReverseStrand=True)
			readsToSample.append([rPos1,myQual1,myErrors1,rDat1])
			readsToSample.append([rPos2,myQual2,myErrors2,rDat2])

		# error format:
		# myError[i] = (type, len, pos, ref, alt)

		# examine sequencing errors to-be-inserted.
		#	- remove deletions that don't have enough bordering sequence content to "fill in"
		# if error is valid, make the changes to the read data
		rOut = []
		for r in readsToSample:
			try:
				myCigar = self.allCigar[myPloid][r[0]]
			except IndexError:
				print 'Index error when attempting to find cigar string.'
				print len(self.allCigar[myPloid]), r[0]
				if fragLen != None:
					print (rPos1, rPos2)
				print myPloid, fragLen, self.fraglens_indMap[fragLen]
				exit(1)
			totalD  = sum([error[1] for error in r[2] if error[0] == 'D'])
			totalI  = sum([error[1] for error in r[2] if error[0] == 'I'])
			availB  = len(self.sequences[myPloid]) - r[0] - self.readLen - 1
			# add buffer sequence to fill in positions that get deleted
			r[3] += self.sequences[myPloid][r[0]+self.readLen:r[0]+self.readLen+totalD]
			expandedCigar = []
			extraCigar    = []
			adj           = 0
			# sse_adj[p] = offset applied to read index p by indel errors inserted so far
			sse_adj       = [0 for n in xrange(self.readLen + max(sequencingModel.errP[3]))]
			anyIndelErr   = False

			# sort by letter (D > I > S) such that we introduce all indel errors before substitution errors
			# secondarily, sort by index
			arrangedErrors = {'D':[],'I':[],'S':[]}
			for error in r[2]:
				arrangedErrors[error[0]].append((error[2],error))
			sortedErrors = []
			for k in sorted(arrangedErrors.keys()):
				sortedErrors.extend([n[1] for n in sorted(arrangedErrors[k])])

			skipIndels = False

			for error in sortedErrors:
				#print '-se-',r[0], error
				#print sse_adj
				eLen = error[1]
				ePos = error[2]
				if error[0] == 'D' or error[0] == 'I':
					anyIndelErr   = True
					extraCigarVal = []
					if totalD > availB:	# if not enough bases to fill-in deletions, skip all indel erors
						continue
					if expandedCigar == []:
						expandedCigar = CigarString(stringIn=myCigar).getList()
						fillToGo = totalD - totalI + 1
						if fillToGo > 0:
							try:
								extraCigarVal = CigarString(stringIn=self.allCigar[myPloid][r[0]+fillToGo]).getList()[-fillToGo:]
							except IndexError:	# applying the deletions we want requires going beyond region boundaries. skip all indel errors
								skipIndels = True

					if skipIndels:
						continue

					# insert deletion error into read and update cigar string accordingly
					if error[0] == 'D':
						myadj = sse_adj[ePos]
						pi = ePos+myadj
						pf = ePos+myadj+eLen+1
						if str(r[3][pi:pf]) == str(error[3]):
							r[3] = r[3][:pi+1] + r[3][pf:]
							expandedCigar = expandedCigar[:pi+1] + expandedCigar[pf:]
							if pi+1 == len(expandedCigar):	# weird edge case with del at very end of region. Make a guess and add a "M"
								expandedCigar.append('M')
							expandedCigar[pi+1] = 'D'*eLen + expandedCigar[pi+1]
						else:
							print '\nError, ref does not match alt while attempting to insert deletion error!\n'
							exit(1)
						adj -= eLen
						for i in xrange(ePos,len(sse_adj)):
							sse_adj[i] -= eLen

					# insert insertion error into read and update cigar string accordingly
					else:
						myadj = sse_adj[ePos]
						if chr(r[3][ePos+myadj]) == error[3]:
							r[3] = r[3][:ePos+myadj] + error[4] + r[3][ePos+myadj+1:]
							expandedCigar = expandedCigar[:ePos+myadj] + ['I']*eLen + expandedCigar[ePos+myadj:]
						else:
							print '\nError, ref does not match alt while attempting to insert insertion error!\n'
							print '---',chr(r[3][ePos+myadj]), '!=', error[3]
							exit(1)
						adj += eLen
						for i in xrange(ePos,len(sse_adj)):
							sse_adj[i] += eLen

				else:	# substitution errors, much easier by comparison...
					if chr(r[3][ePos+sse_adj[ePos]]) == error[3]:
						r[3][ePos+sse_adj[ePos]] = error[4]
					else:
						print '\nError, ref does not match alt while attempting to insert substitution error!\n'
						exit(1)

			if anyIndelErr:
				# rebuild cigar from the edited symbol list, trimmed to readLen
				if len(expandedCigar):
					relevantCigar = (expandedCigar+extraCigarVal)[:self.readLen]
					myCigar = CigarString(listIn=relevantCigar).getString()

				r[3] = r[3][:self.readLen]

			rOut.append([self.FM_pos[myPloid][r[0]],myCigar,str(r[3]),str(r[1])])

		# rOut[i] = (pos, cigar, read_string, qual_string)
		return rOut
def parseFQ(inf):
    """Learn an empirical quality-score model from a FASTQ or SAM file.

    Reads quality strings from `inf` (plain FASTQ, gzipped FASTQ if the name
    ends in '.gz', or SAM if it ends in '.sam') and tallies, per read
    position, the initial q-score distribution and the q-score transition
    counts (a first-order Markov chain along the read). It then simulates
    N_SAMP reads from the learned model to estimate the average per-base
    error rate.

    Returns (initQ, probQ, avgError):
        initQ[i][q]    - probability of q-score q at read position i
        probQ[i][p][q] - probability of q at position i given p at i-1 (probQ[0] is None)
        avgError       - simulated mean per-base error probability

    NOTE(review): depends on module-level constants (RQ, offQ, MAX_READS,
    PRINT_EVERY, PROB_SMOOTH, INIT_SMOOTH, PLOT_STUFF, N_SAMP) defined
    elsewhere in the file; assumes uniform read length (other lengths are
    skipped).
    """
    print 'reading ' + inf + '...'
    if inf[-3:] == '.gz':
        print 'detected gzip suffix...'
        f = gzip.open(inf, 'r')
    else:
        f = open(inf, 'r')

    IS_SAM = False
    if inf[-4:] == '.sam':
        print 'detected sam input...'
        IS_SAM = True

    rRead = 0
    actual_readlen = 0
    qDict = {}
    while True:

        if IS_SAM:
            # SAM: quality string is column 11 (index 10)
            data4 = f.readline()
            if not len(data4):
                break
            try:
                data4 = data4.split('\t')[10]
            except IndexError:
                break
            # need to add some input checking here? Yup, probably.
        else:
            # FASTQ: 4 lines per record; data4 is the quality line
            data1 = f.readline()
            data2 = f.readline()
            data3 = f.readline()
            data4 = f.readline()
            if not all([data1, data2, data3, data4]):
                break

        if actual_readlen == 0:
            # first record: infer read length and allocate count tables
            if inf[-3:] != '.gz' and not IS_SAM:
                totalSize = os.path.getsize(inf)
                entrySize = sum([len(n) for n in [data1, data2, data3, data4]])
                print 'estimated number of reads in file:', int(
                    float(totalSize) / entrySize)
            actual_readlen = len(data4) - 1
            print 'assuming read length is uniform...'
            print 'detected read length (from first read found):', actual_readlen
            # priorQ[i][q] counts q-score q at position i; totalQ[i][p,q]
            # counts transitions p->q into position i (no transition into pos 0)
            priorQ = np.zeros([actual_readlen, RQ])
            totalQ = [None] + [
                np.zeros([RQ, RQ]) for n in xrange(actual_readlen - 1)
            ]

        # sanity-check readlengths
        if len(data4) - 1 != actual_readlen:
            print 'skipping read with unexpected length...'
            continue

        for i in range(len(data4) - 1):
            # offQ is the ASCII offset (typically 33) -- defined at module level
            q = ord(data4[i]) - offQ
            qDict[q] = True
            if i == 0:
                priorQ[i][q] += 1
            else:
                totalQ[i][prevQ, q] += 1
                priorQ[i][q] += 1
            prevQ = q

        rRead += 1
        if rRead % PRINT_EVERY == 0:
            print rRead
        if MAX_READS > 0 and rRead >= MAX_READS:
            break
    f.close()

    # some sanity checking again...
    QRANGE = [min(qDict.keys()), max(qDict.keys())]
    if QRANGE[0] < 0:
        print '\nError: Read in Q-scores below 0\n'
        exit(1)
    # NOTE(review): '>' permits q == RQ, which would already have overflowed
    # the RQ-wide tables above -- possibly intended to be '>='; confirm.
    if QRANGE[1] > RQ:
        print '\nError: Read in Q-scores above specified maximum:', QRANGE[
            1], '>', RQ, '\n'
        exit(1)

    print 'computing probabilities...'
    # normalize transition counts row-by-row with additive (PROB_SMOOTH) smoothing
    probQ = [None] + [[[0. for m in xrange(RQ)] for n in xrange(RQ)]
                      for p in xrange(actual_readlen - 1)]
    for p in xrange(1, actual_readlen):
        for i in xrange(RQ):
            rowSum = float(np.sum(totalQ[p][i, :])) + PROB_SMOOTH * RQ
            if rowSum <= 0.:
                continue
            for j in xrange(RQ):
                probQ[p][i][j] = (totalQ[p][i][j] + PROB_SMOOTH) / rowSum

    # normalize per-position marginals with INIT_SMOOTH smoothing
    initQ = [[0. for m in xrange(RQ)] for n in xrange(actual_readlen)]
    for i in xrange(actual_readlen):
        rowSum = float(np.sum(priorQ[i, :])) + INIT_SMOOTH * RQ
        if rowSum <= 0.:
            continue
        for j in xrange(RQ):
            initQ[i][j] = (priorQ[i][j] + INIT_SMOOTH) / rowSum

    # optional diagnostic plots (mpl is presumably matplotlib.pyplot -- defined at module level)
    if PLOT_STUFF:
        mpl.rcParams.update({
            'font.size': 14,
            'font.weight': 'bold',
            'lines.linewidth': 3
        })

        mpl.figure(1)
        Z = np.array(initQ).T
        X, Y = np.meshgrid(range(0, len(Z[0]) + 1), range(0, len(Z) + 1))
        mpl.pcolormesh(X, Y, Z, vmin=0., vmax=0.25)
        mpl.axis([0, len(Z[0]), 0, len(Z)])
        mpl.yticks(range(0, len(Z), 10), range(0, len(Z), 10))
        mpl.xticks(range(0, len(Z[0]), 10), range(0, len(Z[0]), 10))
        mpl.xlabel('Read Position')
        mpl.ylabel('Quality Score')
        mpl.title('Q-Score Prior Probabilities')
        mpl.colorbar()

        mpl.show()

        VMIN_LOG = [-4, 0]
        minVal = 10**VMIN_LOG[0]
        qLabels = [
            str(n) for n in range(QRANGE[0], QRANGE[1] + 1) if n % 5 == 0
        ]
        print qLabels
        qTicksx = [int(n) + 0.5 for n in qLabels]
        qTicksy = [(RQ - int(n)) - 0.5 for n in qLabels]

        for p in xrange(1, actual_readlen, 10):
            currentDat = np.array(probQ[p])
            for i in xrange(len(currentDat)):
                for j in xrange(len(currentDat[i])):
                    currentDat[i][j] = max(minVal, currentDat[i][j])

            # matrix indices:		pcolormesh plotting:	plot labels and axes:
            #
            #      y				   ^					   ^
            #	   -->				 x |					 y |
            #  x |					    -->					    -->
            #    v 					    y					    x
            #
            # to plot a MxN matrix 'Z' with rowNames and colNames we need to:
            #
            # pcolormesh(X,Y,Z[::-1,:])		# invert x-axis
            # # swap x/y axis parameters and labels, remember x is still inverted:
            # xlim([yMin,yMax])
            # ylim([M-xMax,M-xMin])
            # xticks()
            #

            mpl.figure(p + 1)
            Z = np.log10(currentDat)
            X, Y = np.meshgrid(range(0, len(Z[0]) + 1), range(0, len(Z) + 1))
            mpl.pcolormesh(X,
                           Y,
                           Z[::-1, :],
                           vmin=VMIN_LOG[0],
                           vmax=VMIN_LOG[1],
                           cmap='jet')
            mpl.xlim([QRANGE[0], QRANGE[1] + 1])
            mpl.ylim([RQ - QRANGE[1] - 1, RQ - QRANGE[0]])
            mpl.yticks(qTicksy, qLabels)
            mpl.xticks(qTicksx, qLabels)
            mpl.xlabel('\n' + r'$Q_{i+1}$')
            mpl.ylabel(r'$Q_i$')
            mpl.title('Q-Score Transition Frequencies [Read Pos:' + str(p) +
                      ']')
            cb = mpl.colorbar()
            cb.set_ticks([-4, -3, -2, -1, 0])
            cb.set_ticklabels([
                r'$10^{-4}$', r'$10^{-3}$', r'$10^{-2}$', r'$10^{-1}$',
                r'$10^{0}$'
            ])

        #mpl.tight_layout()
        mpl.show()

    print 'estimating average error rate via simulation...'
    Qscores = range(RQ)
    #print (len(initQ), len(initQ[0]))
    #print (len(probQ), len(probQ[1]), len(probQ[1][0]))

    # wrap the learned tables in samplers; rows with no data degenerate to
    # "repeat the previous q-score"
    initDistByPos = [
        DiscreteDistribution(initQ[i], Qscores) for i in xrange(len(initQ))
    ]
    probDistByPosByPrevQ = [None]
    for i in xrange(1, len(initQ)):
        probDistByPosByPrevQ.append([])
        for j in xrange(len(initQ[0])):
            if np.sum(
                    probQ[i][j]
            ) <= 0.:  # if we don't have sufficient data for a transition, use the previous qscore
                probDistByPosByPrevQ[-1].append(
                    DiscreteDistribution([1], [Qscores[j]],
                                         degenerateVal=Qscores[j]))
            else:
                probDistByPosByPrevQ[-1].append(
                    DiscreteDistribution(probQ[i][j], Qscores))

    # Monte-Carlo: simulate N_SAMP reads and tally the q-scores produced
    countDict = {}
    for q in Qscores:
        countDict[q] = 0
    for samp in xrange(1, N_SAMP + 1):
        if samp % PRINT_EVERY == 0:
            print samp
        myQ = initDistByPos[0].sample()
        countDict[myQ] += 1
        for i in xrange(1, len(initQ)):
            myQ = probDistByPosByPrevQ[i][myQ].sample()
            countDict[myQ] += 1

    # convert q-score frequencies to an average Phred error probability
    totBases = float(sum(countDict.values()))
    avgError = 0.
    for k in sorted(countDict.keys()):
        eVal = 10.**(-k / 10.)
        #print k, eVal, countDict[k]
        avgError += eVal * (countDict[k] / totBases)
    print 'AVG ERROR RATE:', avgError

    return (initQ, probQ, avgError)
        if print_path:
            print('Path cost is', discovered[target][1])
            stack = []
            curr = target

            while curr:
                stack.append((curr, self.original_universe[curr[0]][curr[1]]))
                print(curr)
                curr = discovered[curr][0]

            print('Path from start to target:', stack[::-1])

        return discovered[target][1]


if __name__ == '__main__':
    # Load the maze description and pull out the pieces the solver needs.
    maze_state = load_maze()
    maze_portals = maze_state.portals

    # The portal sampling distribution drives both the heuristics and the solver.
    portal_dist = DiscreteDistribution(maze_portals)
    heuristic_set = Heuristic(maze_portals, maze_state.target, portal_dist)

    astar_solver = A_Star(maze_state.universe, maze_portals, maze_state.start,
                          maze_state.target, portal_dist)

    # Run the benchmark and print the summary report.
    print(make_statistics(astar_solver, heuristic_set, portal_dist))
Example #13
0
class SequenceContainer:
    """Holds one window of reference sequence for every ploid.

    Responsibilities visible in this class:
      - keep a mutable copy of the sequence per ploid (``self.sequences``),
      - insert user-supplied and randomly sampled SNPs/indels,
      - track which positions are already used via ``self.blackList``,
      - maintain per-position coordinate adjustments (``self.adj``) and
        precomputed cigar strings (``self.allCigar``) after indels shift
        coordinates,
      - sample reads (single or paired) with sequencing errors applied.
    """

    def __init__(self,
                 xOffset,
                 sequence,
                 ploidy,
                 windowOverlap,
                 readLen,
                 mutationModels=[],
                 mutRate=None,
                 coverageDat=None,
                 onlyVCF=False):
        # NOTE(review): mutable default argument (mutationModels=[]). It is
        # never mutated here, but a None sentinel would be the safer idiom.
        # initialize basic variables
        self.onlyVCF = onlyVCF
        self.init_basicVars(xOffset, sequence, ploidy, windowOverlap, readLen,
                            coverageDat)
        # initialize mutation models
        self.init_mutModels(mutationModels, mutRate)
        # sample the number of variants that will be inserted into each ploid
        self.init_poisson()
        self.indelsToAdd = [n.sample() for n in self.ind_pois]
        self.snpsToAdd = [n.sample() for n in self.snp_pois]
        # initialize trinuc snp bias
        self.init_trinucBias()

    def init_basicVars(self, xOffset, sequence, ploidy, windowOverlap, readLen,
                       coverageDat):
        """Reset per-window state: sequences, variant lists, blacklist, coverage.

        Called both from __init__ and from update() when a new window arrives.
        coverageDat is expected to be a (windowSize, coverage_vals) pair and is
        only unpacked when self.onlyVCF is False.
        """
        self.x = xOffset
        self.ploidy = ploidy
        self.readLen = readLen
        # one independent, mutable copy of the sequence per ploid
        self.sequences = [bytearray(sequence) for n in xrange(self.ploidy)]
        self.seqLen = len(sequence)
        self.indelList = [[] for n in xrange(self.ploidy)]
        self.snpList = [[] for n in xrange(self.ploidy)]
        self.allCigar = [[] for n in xrange(self.ploidy)]
        self.adj = [None for n in xrange(self.ploidy)]
        # blackList[ploid][pos] = 0		safe to insert variant here
        # blackList[ploid][pos] = 1		indel inserted here
        # blackList[ploid][pos] = 2		snp inserted here
        # blackList[ploid][pos] = 3		invalid position for various processing reasons
        self.blackList = [
            np.zeros(self.seqLen, dtype='<i4') for n in xrange(self.ploidy)
        ]

        # disallow mutations to occur on window overlap points
        # NOTE(review): only the two positions near the *end* of the window are
        # marked (negative indices) — confirm the leading overlap is handled by
        # the winBuffer offsets used elsewhere (e.g. init_trinucBias).
        self.winBuffer = windowOverlap
        for p in xrange(self.ploidy):
            self.blackList[p][-self.winBuffer] = 3
            self.blackList[p][-self.winBuffer - 1] = 3

        # if we're only creating a vcf, skip some expensive initialization related to coverage depth
        if not self.onlyVCF:
            (self.windowSize, coverage_vals) = coverageDat
            self.win_per_read = int(self.readLen / float(self.windowSize) +
                                    0.5)
            self.which_bucket = DiscreteDistribution(coverage_vals,
                                                     range(len(coverage_vals)))

    def init_mutModels(self, mutationModels, mutRate):
        """Install per-ploid mutation models, rescaling rates if requested.

        mutationModels: one model per ploid (or [] to clone DEFAULT_MODEL_1).
        mutRate: if not None, overall mutation frequency is rescaled so the
        average model rate matches it (self.mutScalar).
        Exits the process if the model count does not match ploidy.
        """
        if mutationModels == []:
            ml = [copy.deepcopy(DEFAULT_MODEL_1) for n in xrange(self.ploidy)]
            self.modelData = ml[:self.ploidy]
        else:
            if len(mutationModels) != self.ploidy:
                print '\nError: Number of mutation models recieved is not equal to specified ploidy\n'
                exit(1)
            self.modelData = copy.deepcopy(mutationModels)

        # do we need to rescale mutation frequencies?
        mutRateSum = sum([n[0] for n in self.modelData])
        self.mutRescale = mutRate
        if self.mutRescale == None:
            self.mutScalar = 1.0
        else:
            self.mutScalar = float(
                self.mutRescale) / (mutRateSum / float(len(self.modelData)))

        # how are mutations spread to each ploid, based on their specified mut rates?
        self.ploidMutFrac = [float(n[0]) / mutRateSum for n in self.modelData]
        self.ploidMutPrior = DiscreteDistribution(self.ploidMutFrac,
                                                  range(self.ploidy))

        # init mutation models
        #
        # self.models[ploid][0] = average mutation rate
        # self.models[ploid][1] = p(mut is homozygous | mutation occurs)
        # self.models[ploid][2] = p(mut is indel | mut occurs)
        # self.models[ploid][3] = p(insertion | indel occurs)
        # self.models[ploid][4] = distribution of insertion lengths
        # self.models[ploid][5] = distribution of deletion lengths
        # self.models[ploid][6] = distribution of trinucleotide SNP transitions
        # self.models[ploid][7] = p(trinuc mutates)
        self.models = []
        for n in self.modelData:
            self.models.append([
                self.mutScalar * n[0], n[1], n[2], n[3],
                DiscreteDistribution(n[5], n[4]),
                DiscreteDistribution(n[7], n[6]), []
            ])
            # n[8]: per-trinucleotide-context 4x4 substitution matrices,
            # one DiscreteDistribution per reference nucleotide
            for m in n[8]:
                self.models[-1][6].append([
                    DiscreteDistribution(m[0], NUCL),
                    DiscreteDistribution(m[1], NUCL),
                    DiscreteDistribution(m[2], NUCL),
                    DiscreteDistribution(m[3], NUCL)
                ])
            self.models[-1].append([m for m in n[9]])

    def init_poisson(self):
        """Build per-ploid Poisson samplers for indel and SNP counts.

        Expected counts scale with sequence length, the ploid's mutation rate,
        its indel fraction, and its share of total mutations (ploidMutFrac).
        """
        ind_l_list = [
            self.seqLen * self.models[i][0] * self.models[i][2] *
            self.ploidMutFrac[i] for i in xrange(len(self.models))
        ]
        snp_l_list = [
            self.seqLen * self.models[i][0] * (1. - self.models[i][2]) *
            self.ploidMutFrac[i] for i in xrange(len(self.models))
        ]
        # support of the Poisson distributions is capped at seqLen * MAX_MUTFRAC
        k_range = range(int(self.seqLen * MAX_MUTFRAC))
        self.ind_pois = [
            poisson_list(k_range, ind_l_list[n])
            for n in xrange(len(self.models))
        ]
        self.snp_pois = [
            poisson_list(k_range, snp_l_list[n])
            for n in xrange(len(self.models))
        ]

    def init_trinucBias(self):
        # compute mutation positional bias given trinucleotide strings of the sequence (ONLY AFFECTS SNPs)
        #
        # note: since indels are added before snps, it's possible these positional biases aren't correctly utilized
        #       at positions affected by indels. At the moment I'm going to consider this negligible.
        trinuc_snp_bias = [[0. for n in xrange(self.seqLen)]
                           for m in xrange(self.ploidy)]
        self.trinuc_bias = [None for n in xrange(self.ploidy)]
        for p in xrange(self.ploidy):
            # weight each interior position by p(trinuc mutates) for the
            # 3-mer centered on it; positions inside winBuffer are excluded
            for i in xrange(self.winBuffer + 1, self.seqLen - 1):
                trinuc_snp_bias[p][i] = self.models[p][7][ALL_IND[str(
                    self.sequences[p][i - 1:i + 2])]]
            self.trinuc_bias[p] = DiscreteDistribution(
                trinuc_snp_bias[p][self.winBuffer + 1:self.seqLen - 1],
                range(self.winBuffer + 1, self.seqLen - 1))

    def update(self,
               xOffset,
               sequence,
               ploidy,
               windowOverlap,
               readLen,
               mutationModels=[],
               mutRate=None,
               coverageDat=None):
        """Re-seed the container for a new window, reusing models when possible.

        Mutation models are rebuilt only if ploidy/mutRate/models changed;
        Poisson samplers only if the window length changed. Everything else
        (sequences, blacklist, variant counts, trinuc bias) is always reset.
        """
        # if mutation model is changed, we have to reinitialize it...
        if ploidy != self.ploidy or mutRate != self.mutRescale or mutationModels != []:
            self.ploidy = ploidy
            self.mutRescale = mutRate
            self.init_mutModels(mutationModels, mutRate)
        # if sequence length is different than previous window, we have to redo snp/indel poissons
        if len(sequence) != self.seqLen:
            self.seqLen = len(sequence)
            self.init_poisson()
        # basic vars
        self.init_basicVars(xOffset, sequence, ploidy, windowOverlap, readLen,
                            coverageDat)
        self.indelsToAdd = [n.sample() for n in self.ind_pois]
        self.snpsToAdd = [n.sample() for n in self.snp_pois]
        #print (self.indelsToAdd,self.snpsToAdd)
        # initialize trinuc snp bias
        self.init_trinucBias()

    def insert_mutations(self, inputList):
        """Record user-supplied variants into snpList/indelList per ploid.

        Each inpV is (pos, ref, alts, ..., genotype-info); positions are
        absolute and converted to window-local via self.x.
        """
        #
        #	TODO!!!!!! user-input variants, determine which ploid to put it on, etc..
        #
        for inpV in inputList:
            whichPloid = []
            wps = inpV[4][0]
            if wps == None:  # if no genotype given, assume heterozygous and choose a single ploid based on their mut rates
                whichPloid.append(self.ploidMutPrior.sample())
                whichAlt = [0]
            else:
                # NOTE(review): if wps is non-None but contains neither 'WP='
                # nor '/' nor '|', whichAlt is never bound; whichPloid stays
                # empty so the loop below is skipped — confirm intended.
                if 'WP=' in wps:
                    whichPloid = [
                        int(n) for n in inpV[-1][3:].split(',') if n == '1'
                    ]
                    whichAlt = [0] * len(whichPloid)
                elif '/' in wps or '|' in wps:
                    if '/' in wps:
                        splt = wps.split('/')
                    else:
                        splt = wps.split('|')
                    whichPloid = []
                    whichAlt = []
                    for i in xrange(len(splt)):
                        if splt[i] == '1':
                            whichPloid.append(i)
                        whichAlt.append(int(splt[i]) - 1)

            for i in xrange(len(whichPloid)):
                p = whichPloid[i]
                myAlt = inpV[2][whichAlt[i]]
                myVar = (inpV[0] - self.x, inpV[1], myAlt)
                inLen = max([len(inpV[1]), len(myAlt)])
                #print myVar, chr(self.sequences[p][myVar[0]])
                if len(inpV[1]) == 1 and len(myAlt) == 1:
                    if self.blackList[p][myVar[0]]:
                        continue
                    self.snpList[p].append(myVar)
                    self.blackList[p][myVar[0]] = 2
                else:
                    # NOTE(review): this `continue` only advances the inner
                    # for-k loop; it does NOT skip the indel insertion below.
                    # Looks like the intent was to reject the variant when any
                    # covered position is blacklisted — confirm.
                    for k in xrange(myVar[0], myVar[0] + inLen + 1):
                        if self.blackList[p][k]:
                            continue
                    for k in xrange(myVar[0], myVar[0] + inLen + 1):
                        self.blackList[p][k] = 1
                    self.indelList[p].append(myVar)

    def random_mutations(self):
        """Sample and apply random indels and SNPs; return VCF-ready variants.

        Mutates self.sequences in place, rebuilds self.adj (position offsets
        caused by indels) and self.allCigar (per-position cigars, unless
        onlyVCF). Returns a list of variant tuples, each ending with an
        allele-frequency value and a 'WP=<ploid string>' tag.
        """

        #	add random indels
        all_indels = [[] for n in self.sequences]
        for i in xrange(self.ploidy):
            for j in xrange(self.indelsToAdd[i]):
                if random.random(
                ) <= self.models[i][1]:  # insert homozygous indel
                    whichPloid = range(self.ploidy)
                else:  # insert heterozygous indel
                    whichPloid = [self.ploidMutPrior.sample()]

                # try to find suitable places to insert indels
                eventPos = -1
                for attempt in xrange(MAX_ATTEMPTS):
                    eventPos = random.randint(self.winBuffer, self.seqLen - 1)
                    for p in whichPloid:
                        if self.blackList[p][eventPos]:
                            eventPos = -1
                    if eventPos != -1:
                        break
                if eventPos == -1:
                    continue

                if random.random() <= self.models[i][3]:  # insertion
                    inLen = self.models[i][4].sample()
                    # sequence content of random insertions is uniformly random (change this later)
                    inSeq = ''.join(
                        [random.choice(NUCL) for n in xrange(inLen)])
                    refNucl = chr(self.sequences[i][eventPos])
                    myIndel = (eventPos, refNucl, refNucl + inSeq)
                else:  # deletion
                    inLen = self.models[i][5].sample()
                    if eventPos + inLen + 1 >= len(
                            self.sequences[i]
                    ):  # skip if deletion too close to boundary
                        continue
                    if inLen == 1:
                        inSeq = chr(self.sequences[i][eventPos + 1])
                    else:
                        inSeq = str(self.sequences[i][eventPos + 1:eventPos +
                                                      inLen + 1])
                    refNucl = chr(self.sequences[i][eventPos])
                    myIndel = (eventPos, refNucl + inSeq, refNucl)

                # if event too close to boundary, skip. if event conflicts with other indel, skip.
                skipEvent = False
                if eventPos + len(
                        myIndel[1]) >= self.seqLen - self.winBuffer - 1:
                    skipEvent = True
                if skipEvent:
                    continue
                for p in whichPloid:
                    for k in xrange(eventPos, eventPos + inLen + 1):
                        if self.blackList[p][k]:
                            skipEvent = True
                if skipEvent:
                    continue

                for p in whichPloid:
                    for k in xrange(eventPos, eventPos + inLen + 1):
                        self.blackList[p][k] = 1
                    all_indels[p].append(myIndel)

        # merge with user-supplied indels; reverse-sort so later edits don't
        # shift the coordinates of earlier (lower-position) edits below
        for i in xrange(len(all_indels)):
            all_indels[i].extend(self.indelList[i])
        all_indels = [sorted(n, reverse=True) for n in all_indels]
        #print all_indels

        #	add random snps
        all_snps = [[] for n in self.sequences]
        for i in xrange(self.ploidy):
            for j in xrange(self.snpsToAdd[i]):
                if random.random(
                ) <= self.models[i][1]:  # insert homozygous SNP
                    whichPloid = range(self.ploidy)
                else:  # insert heterozygous SNP
                    whichPloid = [self.ploidMutPrior.sample()]

                # try to find suitable places to insert snps
                eventPos = -1
                for attempt in xrange(MAX_ATTEMPTS):
                    # based on the mutation model for the specified ploid, choose a SNP location based on trinuc bias
                    # (if there are multiple ploids, choose one at random)
                    #eventPos = random.randint(self.winBuffer+1,self.seqLen-2)
                    ploid_to_use = whichPloid[random.randint(
                        0,
                        len(whichPloid) - 1)]
                    eventPos = self.trinuc_bias[ploid_to_use].sample()
                    for p in whichPloid:
                        if self.blackList[p][eventPos]:
                            eventPos = -1
                    if eventPos != -1:
                        break
                if eventPos == -1:
                    continue

                refNucl = chr(self.sequences[i][eventPos])
                context = str(
                    chr(self.sequences[i][eventPos - 1]) +
                    chr(self.sequences[i][eventPos + 1]))
                # sample from tri-nucleotide substitution matrices to get SNP alt allele
                newNucl = self.models[i][6][TRI_IND[context]][
                    NUC_IND[refNucl]].sample()
                mySNP = (eventPos, refNucl, newNucl)

                for p in whichPloid:
                    all_snps[p].append(mySNP)
                    self.blackList[p][mySNP[0]] = 2

        # combine random snps with inserted snps, remove any snps that overlap indels
        for p in xrange(len(all_snps)):
            all_snps[p].extend(self.snpList[p])
            all_snps[p] = [
                n for n in all_snps[p] if self.blackList[p][n[0]] != 1
            ]

        # modify reference sequences
        for i in xrange(len(all_snps)):
            for j in xrange(len(all_snps[i])):
                # sanity checking (for debugging purposes)
                vPos = all_snps[i][j][0]
                if all_snps[i][j][1] != chr(self.sequences[i][vPos]):
                    print '\nError: Something went wrong!\n', all_snps[i][
                        j], chr(self.sequences[i][vPos]), '\n'
                    exit(1)
                else:
                    self.sequences[i][vPos] = all_snps[i][j][2]

        # apply indels (high positions first, thanks to the reverse sort) and
        # record (position, length-delta) pairs for the adjustment array
        adjToAdd = [[] for n in xrange(self.ploidy)]
        for i in xrange(len(all_indels)):
            for j in xrange(len(all_indels[i])):
                # sanity checking (for debugging purposes)
                vPos = all_indels[i][j][0]
                vPos2 = vPos + len(all_indels[i][j][1])
                #print all_indels[i][j], str(self.sequences[i][vPos:vPos2])
                #print len(self.sequences[i]),'-->',
                if all_indels[i][j][1] != str(self.sequences[i][vPos:vPos2]):
                    print '\nError: Something went wrong!\n', all_indels[i][
                        j], str(self.sequences[i][vPos:vPos2]), '\n'
                    exit(1)
                else:
                    self.sequences[i] = self.sequences[i][:vPos] + bytearray(
                        all_indels[i][j][2]) + self.sequences[i][vPos2:]
                    adjToAdd[i].append(
                        (all_indels[i][j][0],
                         len(all_indels[i][j][2]) - len(all_indels[i][j][1])))
                #print len(self.sequences[i])
            adjToAdd[i].sort()
            #print adjToAdd[i]

            # adj[i][j] = cumulative length change at mutated position j,
            # i.e. mutated coordinate minus reference coordinate
            self.adj[i] = np.zeros(len(self.sequences[i]), dtype='<i4')
            indSoFar = 0
            valSoFar = 0
            for j in xrange(len(self.adj[i])):
                if indSoFar < len(
                        adjToAdd[i]) and j >= adjToAdd[i][indSoFar][0] + 1:
                    valSoFar += adjToAdd[i][indSoFar][1]
                    indSoFar += 1
                self.adj[i][j] = valSoFar

            # precompute cigar strings (we can skip this is going for only vcf output)
            if not self.onlyVCF:
                # expand adj into a per-base symbol string (M/I/D runs), then
                # window it into readLen-sized cigars for every start position
                tempSymbolString = ['M']
                prevVal = self.adj[i][0]
                j = 1
                while j < len(self.adj[i]):
                    diff = self.adj[i][j] - prevVal
                    prevVal = self.adj[i][j]
                    if diff > 0:  # insertion
                        tempSymbolString.extend(['I'] * abs(diff))
                        j += abs(diff)
                    elif diff < 0:  # deletion
                        tempSymbolString.append('D' * abs(diff) + 'M')
                        j += 1
                    else:
                        tempSymbolString.append('M')
                        j += 1

                for j in xrange(len(tempSymbolString) - self.readLen):
                    self.allCigar[i].append(
                        CigarString(
                            listIn=tempSymbolString[j:j +
                                                    self.readLen]).getString())

        # tally up variants implemented
        countDict = {}
        all_variants = [
            sorted(all_snps[i] + all_indels[i]) for i in xrange(self.ploidy)
        ]
        for i in xrange(len(all_variants)):
            for j in xrange(len(all_variants[i])):
                # shift window-local positions back to absolute coordinates
                all_variants[i][j] = tuple([all_variants[i][j][0] + self.x
                                            ]) + all_variants[i][j][1:]
                t = tuple(all_variants[i][j])
                if t not in countDict:
                    countDict[t] = []
                countDict[t].append(i)

        #
        #	TODO: combine multiple variants that happened to occur at same position into single vcf entry
        #

        # emit (variant..., allele_freq, 'WP=<per-ploid 0/1 string>') tuples
        output_variants = []
        for k in sorted(countDict.keys()):
            output_variants.append(
                k + tuple([len(countDict[k]) / float(self.ploidy)]))
            ploid_string = ['0' for n in xrange(self.ploidy)]
            for k2 in [n for n in countDict[k]]:
                ploid_string[k2] = '1'
            output_variants[-1] += tuple(['WP=' + '/'.join(ploid_string)])
        return output_variants

    def sample_read(self, sequencingModel, fragLen=None):
        """Sample one read (fragLen is None) or a read pair from a random ploid.

        Read positions are drawn from the coverage-bucket distribution, then
        sequencing errors from sequencingModel are applied to the read data.
        Returns a list of [pos, cigar, read_string, qual_string] entries
        (one for single-end, two for paired-end), with pos mapped back to
        reference coordinates via self.adj.
        """

        # choose a ploid
        myPloid = random.randint(0, self.ploidy - 1)

        # choose a random position within the ploid, and generate quality scores / sequencing errors
        readsToSample = []
        if fragLen == None:
            #rPos = random.randint(0,len(self.sequences[myPloid])-self.readLen-1)	# uniform random

            # decide which subsection of the sequence to sample from using coverage probabilities
            coords_bad = True
            while coords_bad:
                myBucket = max(
                    [self.which_bucket.sample() - self.win_per_read, 0])
                coords_to_select_from = [
                    myBucket * self.windowSize,
                    (myBucket + 1) * self.windowSize
                ]
                # shift bucket bounds from reference into mutated coordinates
                coords_to_select_from[0] += self.adj[myPloid][
                    coords_to_select_from[0]]
                coords_to_select_from[1] += self.adj[myPloid][
                    coords_to_select_from[1]]
                if coords_to_select_from[1] < len(
                        self.sequences[myPloid]) - self.readLen:
                    coords_bad = False
            rPos = random.randint(coords_to_select_from[0],
                                  coords_to_select_from[1] - 1)

            # sample read position and call function to compute quality scores / sequencing errors
            rDat = self.sequences[myPloid][rPos:rPos + self.readLen]
            (myQual, myErrors) = sequencingModel.getSequencingErrors(rDat)
            readsToSample.append([rPos, myQual, myErrors, rDat])

        else:
            #rPos1 = random.randint(0,len(self.sequences[myPloid])-fragLen-1)		# uniform random

            # decide which subsection of the sequence to sample from using coverage probabilities
            coords_bad = True
            while coords_bad:
                myBucket = max(
                    [self.which_bucket.sample() - self.win_per_read, 0])
                coords_to_select_from = [
                    myBucket * self.windowSize,
                    (myBucket + 1) * self.windowSize
                ]
                coords_to_select_from[0] += self.adj[myPloid][
                    coords_to_select_from[0]]
                coords_to_select_from[1] += self.adj[myPloid][
                    coords_to_select_from[
                        0]]  # both ends use index of starting position to avoid issues with reads spanning breakpoints of large events
                rPos1 = random.randint(coords_to_select_from[0],
                                       coords_to_select_from[1] - 1)
                # for PE-reads, flip a coin to decide if R1 or R2 will be the "covering" read
                if random.randint(1,
                                  2) == 1 and rPos1 > fragLen - self.readLen:
                    rPos1 -= fragLen - self.readLen
                if rPos1 < len(self.sequences[myPloid]) - fragLen:
                    coords_bad = False

            rPos2 = rPos1 + fragLen - self.readLen
            rDat1 = self.sequences[myPloid][rPos1:rPos1 + self.readLen]
            rDat2 = self.sequences[myPloid][rPos2:rPos2 + self.readLen]
            (myQual1, myErrors1) = sequencingModel.getSequencingErrors(rDat1)
            (myQual2, myErrors2) = sequencingModel.getSequencingErrors(
                rDat2, isReverseStrand=True)
            readsToSample.append([rPos1, myQual1, myErrors1, rDat1])
            readsToSample.append([rPos2, myQual2, myErrors2, rDat2])

        # error format:
        # myError[i] = (type, len, pos, ref, alt)

        # examine sequencing errors to-be-inserted.
        #	- remove deletions that don't have enough bordering sequence content to "fill in"
        # if error is valid, make the changes to the read data
        rOut = []
        for r in readsToSample:
            myCigar = self.allCigar[myPloid][r[0]]
            totalD = sum([error[1] for error in r[2] if error[0] == 'D'])
            totalI = sum([error[1] for error in r[2] if error[0] == 'I'])
            availB = len(self.sequences[myPloid]) - r[0] - self.readLen - 1
            # add buffer sequence to fill in positions that get deleted
            r[3] += self.sequences[myPloid][r[0] + self.readLen:r[0] +
                                            self.readLen + totalD]
            expandedCigar = []
            extraCigar = []
            adj = 0
            # sse_adj[pos] = offset substitution errors must apply after
            # earlier indel errors shifted the read
            sse_adj = [0 for n in xrange(self.readLen)]
            anyIndelErr = False

            # sort by letter (D > I > S) such that we introduce all indel errors before substitution errors
            # secondarily, sort by index
            arrangedErrors = {'D': [], 'I': [], 'S': []}
            for error in r[2]:
                arrangedErrors[error[0]].append((error[2], error))
            sortedErrors = []
            for k in sorted(arrangedErrors.keys()):
                sortedErrors.extend([n[1] for n in sorted(arrangedErrors[k])])

            for error in sortedErrors:
                #print r[0], error
                eLen = error[1]
                ePos = error[2]
                if error[0] == 'D' or error[0] == 'I':
                    anyIndelErr = True
                    extraCigarVal = []
                    if totalD > availB:  # if not enough bases to fill-in deletions, skip all indel erors
                        continue
                    if expandedCigar == []:
                        expandedCigar = CigarString(stringIn=myCigar).getList()

                        fillToGo = totalD - totalI
                        if fillToGo > 0:
                            extraCigarVal = CigarString(
                                stringIn=self.allCigar[myPloid][
                                    r[0] + fillToGo]).getList()[-fillToGo:]

                    # insert deletion error into read and update cigar string accordingly
                    if error[0] == 'D':
                        pi = ePos + adj
                        pf = ePos + adj + eLen + 1
                        if str(r[3][pi:pf]) == str(error[3]):
                            r[3] = r[3][:pi + 1] + r[3][pf:]
                            expandedCigar = expandedCigar[:pi +
                                                          1] + expandedCigar[
                                                              pf:]
                            expandedCigar[pi +
                                          1] = 'D' * eLen + expandedCigar[pi +
                                                                          1]
                        else:
                            print '\nError, ref does not match alt while attempting to insert deletion error!\n'
                            exit(1)
                        adj -= eLen
                        for i in xrange(ePos, len(sse_adj)):
                            sse_adj[i] -= eLen

                    # insert insertion error into read and update cigar string accordingly
                    else:
                        if chr(r[3][ePos + adj]) == error[3]:
                            r[3] = r[3][:ePos +
                                        adj] + error[4] + r[3][ePos + adj + 1:]
                            expandedCigar = expandedCigar[:ePos + adj] + [
                                'I'
                            ] * eLen + expandedCigar[ePos + adj + 1:]
                        else:
                            print '\nError, ref does not match alt while attempting to insert insertion error!\n'
                            exit(1)
                        adj += eLen
                        for i in xrange(ePos, len(sse_adj)):
                            sse_adj[i] += eLen

                else:  # substitution errors, much easier by comparison...
                    if chr(r[3][ePos + sse_adj[ePos]]) == error[3]:
                        r[3][ePos + sse_adj[ePos]] = error[4]
                    else:
                        print '\nError, ref does not match alt while attempting to insert substitution error!\n'
                        exit(1)

            if anyIndelErr:
                if len(expandedCigar):
                    #print myCigar,'-->',
                    relevantCigar = (expandedCigar +
                                     extraCigarVal)[:self.readLen]
                    myCigar = CigarString(listIn=relevantCigar).getString()
                    #print myCigar

                # trim the deletion fill-in buffer back to the read length
                r[3] = r[3][:self.readLen]
                #if len(r[3]) != self.readLen:
                #	print 'AHHHHHH_1'
                #	exit(1)
                #if len(expandedCigar+extraCigarVal) != self.readLen:
                #	print 'AHHHHHH_2'
                #	exit(1)

            # report position in reference coordinates (undo indel shifts)
            rOut.append([
                r[0] - self.adj[myPloid][r[0]], myCigar,
                str(r[3]),
                str(r[1])
            ])

        # rOut[i] = (pos, cigar, read_string, qual_string)
        return rOut
	def __init__(self, readLen, errorModel, reScaledError):
		"""Load a pickled sequencing error model and build sampling distributions.

		readLen:       desired read length; the model is rescaled if it differs.
		errorModel:    path to a pickled model with 6 entries (single q-score
		               model) or 8 entries (separate R1/R2 models).
		reScaledError: if not None, overall error rate is scaled to this value.

		NOTE(review): this def sits outside any visible class here and is
		byte-identical to ReadContainer.__init__ below — looks like a stray
		duplicate; confirm.
		"""

		self.readLen = readLen

		# NOTE(review): if the pickle has neither 6 nor 8 entries, the names
		# unpacked below stay unbound and later lines raise NameError.
		errorDat = pickle.load(open(errorModel,'rb'))
		if len(errorDat) == 6:		# only 1 q-score model present, use same model for both strands
			[initQ1,probQ1,Qscores,offQ,avgError,errorParams] = errorDat
			self.PE_MODELS = False
		elif len(errorDat) == 8:	# found a q-score model for both forward and reverse strands
			#print 'Using paired-read quality score profiles...'
			[initQ1,probQ1,initQ2,probQ2,Qscores,offQ,avgError,errorParams] = errorDat
			self.PE_MODELS = True
			if len(initQ1) != len(initQ2) or len(probQ1) != len(probQ2):
				print '\nError: R1 and R2 quality score models are of different length.\n'
				exit(1)

		# qErrRate[q] = Phred error probability 10^(-q/10) for each q-score
		self.qErrRate = [0.]*(max(Qscores)+1)
		for q in Qscores:
			self.qErrRate[q] = 10.**(-q/10.)
		self.offQ = offQ

		# errorParams = [SSE_PROB, SIE_RATE, SIE_PROB, SIE_VAL, SIE_INS_FREQ, SIE_INS_NUCL]
		self.errP   = errorParams
		# substitution alt-allele distributions (one per reference nucleotide),
		# indel-length distribution, and inserted-nucleotide distribution
		self.errSSE = [DiscreteDistribution(n,NUCL) for n in self.errP[0]]
		self.errSIE = DiscreteDistribution(self.errP[2],self.errP[3])
		self.errSIN = DiscreteDistribution(self.errP[5],NUCL)

		# adjust length to match desired read length
		if self.readLen == len(initQ1):
			self.qIndRemap = range(self.readLen)
		else:
			print 'Warning: Read length of error model ('+str(len(initQ1))+') does not match -R value ('+str(self.readLen)+'), rescaling model...'
			# relies on Python 2 integer (floor) division for the remapping
			self.qIndRemap = [max([1,len(initQ1)*n/readLen]) for n in xrange(readLen)]

		# adjust sequencing error frequency to match desired rate
		if reScaledError == None:
			self.errorScale = 1.0
		else:
			self.errorScale = reScaledError/avgError
			print 'Warning: Quality scores no longer exactly representative of error probability. Error model scaled by {0:.3f} to match desired rate...'.format(self.errorScale)

		# initialize probability distributions
		# initDistByPos1[i]: q-score distribution for read position i;
		# probDistByPosByPrevQ1[i][q]: transition distribution given previous q
		self.initDistByPos1        = [DiscreteDistribution(initQ1[i],Qscores) for i in xrange(len(initQ1))]
		self.probDistByPosByPrevQ1 = [None]
		for i in xrange(1,len(initQ1)):
			self.probDistByPosByPrevQ1.append([])
			for j in xrange(len(initQ1[0])):
				if np.sum(probQ1[i][j]) <= 0.:	# if we don't have sufficient data for a transition, use the previous qscore
					self.probDistByPosByPrevQ1[-1].append(DiscreteDistribution([1],[Qscores[j]],degenerateVal=Qscores[j]))
				else:
					self.probDistByPosByPrevQ1[-1].append(DiscreteDistribution(probQ1[i][j],Qscores))

		if self.PE_MODELS:
			self.initDistByPos2        = [DiscreteDistribution(initQ2[i],Qscores) for i in xrange(len(initQ2))]
			self.probDistByPosByPrevQ2 = [None]
			for i in xrange(1,len(initQ2)):
				self.probDistByPosByPrevQ2.append([])
				for j in xrange(len(initQ2[0])):
					if np.sum(probQ2[i][j]) <= 0.:	# if we don't have sufficient data for a transition, use the previous qscore
						self.probDistByPosByPrevQ2[-1].append(DiscreteDistribution([1],[Qscores[j]],degenerateVal=Qscores[j]))
					else:
						self.probDistByPosByPrevQ2[-1].append(DiscreteDistribution(probQ2[i][j],Qscores))
class ReadContainer:
	def __init__(self, readLen, errorModel, reScaledError):

		self.readLen = readLen

		errorDat = pickle.load(open(errorModel,'rb'))
		if len(errorDat) == 6:		# only 1 q-score model present, use same model for both strands
			[initQ1,probQ1,Qscores,offQ,avgError,errorParams] = errorDat
			self.PE_MODELS = False
		elif len(errorDat) == 8:	# found a q-score model for both forward and reverse strands
			#print 'Using paired-read quality score profiles...'
			[initQ1,probQ1,initQ2,probQ2,Qscores,offQ,avgError,errorParams] = errorDat
			self.PE_MODELS = True
			if len(initQ1) != len(initQ2) or len(probQ1) != len(probQ2):
				print '\nError: R1 and R2 quality score models are of different length.\n'
				exit(1)

		self.qErrRate = [0.]*(max(Qscores)+1)
		for q in Qscores:
			self.qErrRate[q] = 10.**(-q/10.)
		self.offQ = offQ

		# errorParams = [SSE_PROB, SIE_RATE, SIE_PROB, SIE_VAL, SIE_INS_FREQ, SIE_INS_NUCL]
		self.errP   = errorParams
		self.errSSE = [DiscreteDistribution(n,NUCL) for n in self.errP[0]]
		self.errSIE = DiscreteDistribution(self.errP[2],self.errP[3])
		self.errSIN = DiscreteDistribution(self.errP[5],NUCL)

		# adjust length to match desired read length
		if self.readLen == len(initQ1):
			self.qIndRemap = range(self.readLen)
		else:
			print 'Warning: Read length of error model ('+str(len(initQ1))+') does not match -R value ('+str(self.readLen)+'), rescaling model...'
			self.qIndRemap = [max([1,len(initQ1)*n/readLen]) for n in xrange(readLen)]

		# adjust sequencing error frequency to match desired rate
		if reScaledError == None:
			self.errorScale = 1.0
		else:
			self.errorScale = reScaledError/avgError
			print 'Warning: Quality scores no longer exactly representative of error probability. Error model scaled by {0:.3f} to match desired rate...'.format(self.errorScale)

		# initialize probability distributions
		self.initDistByPos1        = [DiscreteDistribution(initQ1[i],Qscores) for i in xrange(len(initQ1))]
		self.probDistByPosByPrevQ1 = [None]
		for i in xrange(1,len(initQ1)):
			self.probDistByPosByPrevQ1.append([])
			for j in xrange(len(initQ1[0])):
				if np.sum(probQ1[i][j]) <= 0.:	# if we don't have sufficient data for a transition, use the previous qscore
					self.probDistByPosByPrevQ1[-1].append(DiscreteDistribution([1],[Qscores[j]],degenerateVal=Qscores[j]))
				else:
					self.probDistByPosByPrevQ1[-1].append(DiscreteDistribution(probQ1[i][j],Qscores))

		if self.PE_MODELS:
			self.initDistByPos2        = [DiscreteDistribution(initQ2[i],Qscores) for i in xrange(len(initQ2))]
			self.probDistByPosByPrevQ2 = [None]
			for i in xrange(1,len(initQ2)):
				self.probDistByPosByPrevQ2.append([])
				for j in xrange(len(initQ2[0])):
					if np.sum(probQ2[i][j]) <= 0.:	# if we don't have sufficient data for a transition, use the previous qscore
						self.probDistByPosByPrevQ2[-1].append(DiscreteDistribution([1],[Qscores[j]],degenerateVal=Qscores[j]))
					else:
						self.probDistByPosByPrevQ2[-1].append(DiscreteDistribution(probQ2[i][j],Qscores))

	def getSequencingErrors(self, readData, isReverseStrand=False):

		qOut = [0]*self.readLen
		sErr = []

		if self.PE_MODELS and isReverseStrand:
			myQ = self.initDistByPos2[0].sample()
		else:
			myQ = self.initDistByPos1[0].sample()

		if random.random() < self.qErrRate[myQ]:
			sErr.append(0)
		qOut[0] = myQ + self.offQ
		for i in xrange(1,self.readLen):

			if self.PE_MODELS and isReverseStrand:
				myQ = self.probDistByPosByPrevQ2[self.qIndRemap[i]][myQ].sample()
			else:
				myQ = self.probDistByPosByPrevQ1[self.qIndRemap[i]][myQ].sample()

			if random.random() < self.errorScale*self.qErrRate[myQ]:
				sErr.append(i)
			qOut[i] = myQ + self.offQ
		qOut = ''.join([chr(n) for n in qOut])

		sOut = []
		nDelSoFar = 0
		# don't allow indel errors to occur on subsequent positions
		prevIndel = -2
		# don't allow other sequencing errors to occur on bases removed by deletion errors
		delBlacklist = []

		for ind in sErr[::-1]:	# for each error that we're going to insert...

			# determine error type
			isSub = True
			if ind != 0 and ind != self.readLen-1-max(self.errP[3]) and ind > prevIndel+1:
				if random.random() < self.errP[1]:
					isSub = False

			# errorOut = (type, len, pos, ref, alt)

			if isSub:								# insert substitution error
				myNucl  = chr(readData[ind])
				newNucl = self.errSSE[NUC_IND[myNucl]].sample()
				sOut.append(('S',1,ind,myNucl,newNucl))
			else:									# insert indel error
				indelLen = self.errSIE.sample()
				if random.random() < self.errP[4]:		# insertion error
					myNucl  = chr(readData[ind])
					newNucl = myNucl + ''.join([self.errSIN.sample() for n in xrange(indelLen)])
					sOut.append(('I',len(newNucl)-1,ind,myNucl,newNucl))
				elif ind < self.readLen-2-nDelSoFar:	# deletion error (prevent too many of them from stacking up)
					myNucl  = str(readData[ind:ind+indelLen+1])
					newNucl = chr(readData[ind])
					nDelSoFar += len(myNucl)-1
					sOut.append(('D',len(myNucl)-1,ind,myNucl,newNucl))
					for i in xrange(ind+1,ind+indelLen+1):
						delBlacklist.append(i)
				prevIndel = ind

		# remove blacklisted errors
		for i in xrange(len(sOut)-1,-1,-1):
			if sOut[i][2] in delBlacklist:
				del sOut[i]

		return (qOut,sOut)
class SequenceContainer:
	"""Holds one reference-sequence window (one mutable copy per ploid),
	inserts random and user-specified variants into it, and samples reads
	(with coverage bias and matching cigar strings) from the mutated copies."""
	def __init__(self, xOffset, sequence, ploidy, windowOverlap, readLen, mutationModels=[], mutRate=None, coverageDat=None, onlyVCF=False):
		"""Build a container for one reference window.

		xOffset:        offset of this window within the full reference
		sequence:       reference sequence content for this window
		ploidy:         number of independent sequence copies to mutate
		windowOverlap:  overlap (bp) with adjacent windows; mutations are
		                disallowed at the overlap boundary positions
		readLen:        length of reads to be sampled
		mutationModels: optional per-ploid mutation model parameter lists
		mutRate:        optional target to rescale average mutation rate
		coverageDat:    (windowSize, coverageValues) for coverage-biased sampling
		onlyVCF:        if True, skip coverage/cigar initialization
		"""
		# initialize basic variables
		self.onlyVCF = onlyVCF
		self.init_basicVars(xOffset, sequence, ploidy, windowOverlap, readLen, coverageDat)
		# initialize mutation models
		self.init_mutModels(mutationModels, mutRate)
		# sample the number of variants that will be inserted into each ploid
		self.init_poisson()
		self.indelsToAdd = [n.sample() for n in self.ind_pois]
		self.snpsToAdd   = [n.sample() for n in self.snp_pois]
		# initialize trinuc snp bias
		self.init_trinucBias()

	def init_basicVars(self, xOffset, sequence, ploidy, windowOverlap, readLen, coverageDat):
		"""Reset per-window state: sequence copies, variant containers, blacklists."""
		self.x         = xOffset
		self.ploidy    = ploidy
		self.readLen   = readLen
		self.seqLen    = len(sequence)
		# one independent mutable copy of the window per ploid, plus empty
		# per-ploid containers for variants and precomputed cigar strings
		self.sequences = [bytearray(sequence) for _ in xrange(ploidy)]
		self.indelList = [[] for _ in xrange(ploidy)]
		self.snpList   = [[] for _ in xrange(ploidy)]
		self.allCigar  = [[] for _ in xrange(ploidy)]
		self.adj       = [None]*ploidy
		# blackList[ploid][pos] = 0		safe to insert variant here
		# blackList[ploid][pos] = 1		indel inserted here
		# blackList[ploid][pos] = 2		snp inserted here
		# blackList[ploid][pos] = 3		invalid position for various processing reasons
		self.blackList = [np.zeros(self.seqLen,dtype='<i4') for _ in xrange(ploidy)]

		# forbid mutations at the window-overlap boundary positions
		self.winBuffer = windowOverlap
		for ploid in xrange(ploidy):
			self.blackList[ploid][-self.winBuffer]   = 3
			self.blackList[ploid][-self.winBuffer-1] = 3

		# when only emitting a vcf, skip the coverage-depth machinery entirely
		if not self.onlyVCF:
			(self.windowSize, coverage_vals) = coverageDat
			self.win_per_read = int(self.readLen/float(self.windowSize)+0.5)
			self.which_bucket = DiscreteDistribution(coverage_vals,range(len(coverage_vals)))

	def init_mutModels(self,mutationModels,mutRate):
		"""Initialize per-ploid mutation models and optionally rescale rates.

		mutationModels: list of per-ploid model parameter lists (one per ploid),
		                or [] to use DEFAULT_MODEL_1 for every ploid
		mutRate:        if not None, rescale the average mutation rate to this value
		"""
		if mutationModels == []:
			ml = [copy.deepcopy(DEFAULT_MODEL_1) for n in xrange(self.ploidy)]
			self.modelData = ml[:self.ploidy]
		else:
			if len(mutationModels) != self.ploidy:
				print '\nError: Number of mutation models recieved is not equal to specified ploidy\n'
				exit(1)
			self.modelData = copy.deepcopy(mutationModels)

		# do we need to rescale mutation frequencies?
		mutRateSum = sum([n[0] for n in self.modelData])
		self.mutRescale = mutRate
		if self.mutRescale == None:
			self.mutScalar = 1.0
		else:
			# scale factor = desired rate / current mean rate across ploids
			self.mutScalar = float(self.mutRescale)/(mutRateSum/float(len(self.modelData)))

		# how are mutations spread to each ploid, based on their specified mut rates?
		self.ploidMutFrac  = [float(n[0])/mutRateSum for n in self.modelData]
		self.ploidMutPrior = DiscreteDistribution(self.ploidMutFrac,range(self.ploidy))

		# init mutation models
		#
		# self.models[ploid][0] = average mutation rate
		# self.models[ploid][1] = p(mut is homozygous | mutation occurs)
		# self.models[ploid][2] = p(mut is indel | mut occurs)
		# self.models[ploid][3] = p(insertion | indel occurs)
		# self.models[ploid][4] = distribution of insertion lengths
		# self.models[ploid][5] = distribution of deletion lengths
		# self.models[ploid][6] = distribution of trinucleotide SNP transitions
		# self.models[ploid][7] = p(trinuc mutates)
		self.models = []
		for n in self.modelData:
			# n[4]/n[5]: insertion length values/weights; n[6]/n[7]: deletion length values/weights
			self.models.append([self.mutScalar*n[0],n[1],n[2],n[3],DiscreteDistribution(n[5],n[4]),DiscreteDistribution(n[7],n[6]),[]])
			# n[8]: per-trinucleotide-context substitution matrices (one distribution per ref nucleotide)
			for m in n[8]:
				self.models[-1][6].append([DiscreteDistribution(m[0],NUCL),
					                       DiscreteDistribution(m[1],NUCL),
					                       DiscreteDistribution(m[2],NUCL),
					                       DiscreteDistribution(m[3],NUCL)])
			# n[9]: per-trinucleotide mutation probabilities (becomes self.models[ploid][7])
			self.models[-1].append([m for m in n[9]])

	def init_poisson(self):
		"""Build per-ploid poisson distributions for indel and snp counts.

		Lambda for each ploid = seqLen * avg mutation rate * (indel or snp
		fraction) * that ploid's share of mutations.
		"""
		k_range = range(int(self.seqLen*MAX_MUTFRAC))
		self.ind_pois = []
		self.snp_pois = []
		for i in xrange(len(self.models)):
			self.ind_pois.append(poisson_list(k_range,self.seqLen*self.models[i][0]*self.models[i][2]*self.ploidMutFrac[i]))
			self.snp_pois.append(poisson_list(k_range,self.seqLen*self.models[i][0]*(1.-self.models[i][2])*self.ploidMutFrac[i]))

	def init_trinucBias(self):
		"""Build per-ploid positional SNP bias from trinucleotide context.

		Only affects SNP placement. Note: indels are inserted before snps, so
		these biases may be slightly stale at indel-affected positions; this
		is considered negligible.
		"""
		lo = self.winBuffer+1
		hi = self.seqLen-1
		self.trinuc_bias = [None]*self.ploidy
		for p in xrange(self.ploidy):
			# weight each eligible position by the mutability of its trinucleotide context
			weights = [self.models[p][7][ALL_IND[str(self.sequences[p][i-1:i+2])]] for i in xrange(lo,hi)]
			self.trinuc_bias[p] = DiscreteDistribution(weights,range(lo,hi))

	def update(self, xOffset, sequence, ploidy, windowOverlap, readLen, mutationModels=[], mutRate=None, coverageDat=None):
		"""Re-initialize the container for a new window, rebuilding only the
		state that is invalidated (mutation models when ploidy/rate/models
		change; poisson distributions when the sequence length changes)."""
		# if mutation model is changed, we have to reinitialize it...
		if ploidy != self.ploidy or mutRate != self.mutRescale or mutationModels != []:
			self.ploidy = ploidy
			self.mutRescale = mutRate
			self.init_mutModels(mutationModels, mutRate)
		# if sequence length is different than previous window, we have to redo snp/indel poissons
		if len(sequence) != self.seqLen:
			self.seqLen = len(sequence)
			self.init_poisson()
		# basic vars
		self.init_basicVars(xOffset, sequence, ploidy, windowOverlap, readLen, coverageDat)
		# re-sample how many indels/snps this window will receive
		self.indelsToAdd = [n.sample() for n in self.ind_pois]
		self.snpsToAdd   = [n.sample() for n in self.snp_pois]
		#print (self.indelsToAdd,self.snpsToAdd)
		# initialize trinuc snp bias
		self.init_trinucBias()

	def insert_mutations(self, inputList):
		"""Insert user-specified variants into the appropriate ploids.

		inputList entries are tuples (pos, ref, [alt,...], ..., (genotype,...)).
		Variants whose positions are already blacklisted (occupied by another
		variant, or otherwise invalid) are skipped.
		"""
		#
		#	TODO!!!!!! user-input variants, determine which ploid to put it on, etc..
		#
		for inpV in inputList:
			whichPloid = []
			wps = inpV[4][0]
			if wps == None:	# if no genotype given, assume heterozygous and choose a single ploid based on their mut rates
				whichPloid.append(self.ploidMutPrior.sample())
				whichAlt = [0]
			else:
				if 'WP=' in wps:
					whichPloid = [int(n) for n in inpV[-1][3:].split(',') if n == '1']
					whichAlt   = [0]*len(whichPloid)
				elif '/' in wps or '|' in wps:
					if '/' in wps:
						splt = wps.split('/')
					else:
						splt = wps.split('|')
					whichPloid = []
					whichAlt   = []
					for i in xrange(len(splt)):
						if splt[i] == '1':
							whichPloid.append(i)
						whichAlt.append(int(splt[i])-1)

			for i in xrange(len(whichPloid)):
				p = whichPloid[i]
				myAlt = inpV[2][whichAlt[i]]
				myVar = (inpV[0]-self.x,inpV[1],myAlt)
				inLen = max([len(inpV[1]),len(myAlt)])
				#print myVar, chr(self.sequences[p][myVar[0]])
				if len(inpV[1]) == 1 and len(myAlt) == 1:
					if self.blackList[p][myVar[0]]:
						continue
					self.snpList[p].append(myVar)
					self.blackList[p][myVar[0]] = 2
				else:
					# BUGFIX: skip the whole variant if any affected position is
					# blacklisted. (The previous code's 'continue' only advanced the
					# inner k-loop, making the blacklist check a no-op, so conflicting
					# indels could be inserted on top of existing variants.)
					skipEvent = False
					for k in xrange(myVar[0],myVar[0]+inLen+1):
						if self.blackList[p][k]:
							skipEvent = True
							break
					if skipEvent:
						continue
					for k in xrange(myVar[0],myVar[0]+inLen+1):
						self.blackList[p][k] = 1
					self.indelList[p].append(myVar)

	def random_mutations(self):
		"""Insert the sampled indels and SNPs into each ploid's sequence.

		Applies the previously-sampled variant counts (self.indelsToAdd /
		self.snpsToAdd) plus any user-specified variants, mutates
		self.sequences in place, rebuilds the indel position-adjustment
		arrays (self.adj) and the per-position cigar strings (self.allCigar),
		and returns the implemented variants as tuples:
		(absolutePos, ref, alt, fractionOfPloids, 'WP='+genotypeString).
		"""

		#	add random indels
		all_indels  = [[] for n in self.sequences]
		for i in xrange(self.ploidy):
			for j in xrange(self.indelsToAdd[i]):
				if random.random() <= self.models[i][1]:	# insert homozygous indel
					whichPloid = range(self.ploidy)
				else:								# insert heterozygous indel
					whichPloid = [self.ploidMutPrior.sample()]

				# try to find suitable places to insert indels
				eventPos = -1
				for attempt in xrange(MAX_ATTEMPTS):
					eventPos = random.randint(self.winBuffer,self.seqLen-1)
					for p in whichPloid:
						if self.blackList[p][eventPos]:
							eventPos = -1
					if eventPos != -1:
						break
				if eventPos == -1:
					continue

				if random.random() <= self.models[i][3]:	# insertion
					inLen   = self.models[i][4].sample()
					# sequence content of random insertions is uniformly random (change this later)
					inSeq   = ''.join([random.choice(NUCL) for n in xrange(inLen)])
					refNucl = chr(self.sequences[i][eventPos])
					myIndel = (eventPos,refNucl,refNucl+inSeq)
				else:										# deletion
					inLen   = self.models[i][5].sample()
					if eventPos+inLen+1 >= len(self.sequences[i]):	# skip if deletion too close to boundary
						continue
					if inLen == 1:
						inSeq = chr(self.sequences[i][eventPos+1])
					else:
						inSeq = str(self.sequences[i][eventPos+1:eventPos+inLen+1])
					refNucl = chr(self.sequences[i][eventPos])
					myIndel = (eventPos,refNucl+inSeq,refNucl)

				# if event too close to boundary, skip. if event conflicts with other indel, skip.
				skipEvent = False
				if eventPos+len(myIndel[1]) >= self.seqLen-self.winBuffer-1:
					skipEvent = True
				if skipEvent:
					continue
				for p in whichPloid:
					for k in xrange(eventPos,eventPos+inLen+1):
						if self.blackList[p][k]:
							skipEvent = True
				if skipEvent:
					continue

				for p in whichPloid:
					for k in xrange(eventPos,eventPos+inLen+1):
						self.blackList[p][k] = 1
					all_indels[p].append(myIndel)

		# merge in user-specified indels; sort descending by position so that
		# applying them later doesn't shift the coordinates of ones not yet applied
		for i in xrange(len(all_indels)):
			all_indels[i].extend(self.indelList[i])
		all_indels = [sorted(n,reverse=True) for n in all_indels]
		#print all_indels

		#	add random snps
		all_snps  = [[] for n in self.sequences]
		for i in xrange(self.ploidy):
			for j in xrange(self.snpsToAdd[i]):
				if random.random() <= self.models[i][1]:	# insert homozygous SNP
					whichPloid = range(self.ploidy)
				else:								# insert heterozygous SNP
					whichPloid = [self.ploidMutPrior.sample()]

				# try to find suitable places to insert snps
				eventPos = -1
				for attempt in xrange(MAX_ATTEMPTS):
					# based on the mutation model for the specified ploid, choose a SNP location based on trinuc bias
					# (if there are multiple ploids, choose one at random)
					#eventPos = random.randint(self.winBuffer+1,self.seqLen-2)
					ploid_to_use = whichPloid[random.randint(0,len(whichPloid)-1)]
					eventPos     = self.trinuc_bias[ploid_to_use].sample()
					for p in whichPloid:
						if self.blackList[p][eventPos]:
							eventPos = -1
					if eventPos != -1:
						break
				if eventPos == -1:
					continue

				refNucl = chr(self.sequences[i][eventPos])
				# context = flanking nucleotides (left, right) of the SNP position
				context = str(chr(self.sequences[i][eventPos-1])+chr(self.sequences[i][eventPos+1]))
				# sample from tri-nucleotide substitution matrices to get SNP alt allele
				newNucl = self.models[i][6][TRI_IND[context]][NUC_IND[refNucl]].sample()
				mySNP   = (eventPos,refNucl,newNucl)

				for p in whichPloid:
					all_snps[p].append(mySNP)
					self.blackList[p][mySNP[0]] = 2

		# combine random snps with inserted snps, remove any snps that overlap indels
		for p in xrange(len(all_snps)):
			all_snps[p].extend(self.snpList[p])
			all_snps[p] = [n for n in all_snps[p] if self.blackList[p][n[0]] != 1]

		# modify reference sequences
		for i in xrange(len(all_snps)):
			for j in xrange(len(all_snps[i])):
				# sanity checking (for debugging purposes)
				vPos = all_snps[i][j][0]
				if all_snps[i][j][1] != chr(self.sequences[i][vPos]):
					print '\nError: Something went wrong!\n', all_snps[i][j], chr(self.sequences[i][vPos]),'\n'
					exit(1)
				else:
					self.sequences[i][vPos] = all_snps[i][j][2]

		# apply indels (in descending position order, so earlier coordinates stay valid);
		# adjToAdd[i] collects (pos, len(alt)-len(ref)) for the adjustment array below
		adjToAdd = [[] for n in xrange(self.ploidy)]
		for i in xrange(len(all_indels)):
			for j in xrange(len(all_indels[i])):
				# sanity checking (for debugging purposes)
				vPos  = all_indels[i][j][0]
				vPos2 = vPos + len(all_indels[i][j][1])
				#print all_indels[i][j], str(self.sequences[i][vPos:vPos2])
				#print len(self.sequences[i]),'-->',
				if all_indels[i][j][1] != str(self.sequences[i][vPos:vPos2]):
					print '\nError: Something went wrong!\n', all_indels[i][j], str(self.sequences[i][vPos:vPos2]),'\n'
					exit(1)
				else:
					self.sequences[i] = self.sequences[i][:vPos] + bytearray(all_indels[i][j][2]) + self.sequences[i][vPos2:]
					adjToAdd[i].append((all_indels[i][j][0],len(all_indels[i][j][2])-len(all_indels[i][j][1])))
				#print len(self.sequences[i])
			adjToAdd[i].sort()
			#print adjToAdd[i]

			# adj[i][j] = cumulative sequence-length change (insertions minus
			# deletions) accumulated at or before position j
			self.adj[i] = np.zeros(len(self.sequences[i]),dtype='<i4')
			indSoFar = 0
			valSoFar = 0
			for j in xrange(len(self.adj[i])):
				if indSoFar < len(adjToAdd[i]) and j >= adjToAdd[i][indSoFar][0]+1:
					valSoFar += adjToAdd[i][indSoFar][1]
					indSoFar += 1
				self.adj[i][j] = valSoFar

			# precompute cigar strings (we can skip this if going for only vcf output)
			if not self.onlyVCF:
				# expand adj deltas into a per-position M/I/D symbol list
				tempSymbolString = ['M']
				prevVal = self.adj[i][0]
				j = 1
				while j < len(self.adj[i]):
					diff = self.adj[i][j] - prevVal
					prevVal = self.adj[i][j]
					if diff > 0:	# insertion
						tempSymbolString.extend(['I']*abs(diff))
						j += abs(diff)
					elif diff < 0:	# deletion
						tempSymbolString.append('D'*abs(diff)+'M')
						j += 1
					else:
						tempSymbolString.append('M')
						j += 1

				# one precomputed cigar per possible read start position
				for j in xrange(len(tempSymbolString)-self.readLen):
					self.allCigar[i].append(CigarString(listIn=tempSymbolString[j:j+self.readLen]).getString())


		# tally up variants implemented
		countDict = {}
		all_variants = [sorted(all_snps[i]+all_indels[i]) for i in xrange(self.ploidy)]
		for i in xrange(len(all_variants)):
			for j in xrange(len(all_variants[i])):
				# shift back to absolute reference coordinates
				all_variants[i][j] = tuple([all_variants[i][j][0]+self.x])+all_variants[i][j][1:]
				t = tuple(all_variants[i][j])
				if t not in countDict:
					countDict[t] = []
				countDict[t].append(i)

		#
		#	TODO: combine multiple variants that happened to occur at same position into single vcf entry
		#

		# append (fraction of ploids carrying the variant, 'WP='-encoded genotype)
		output_variants = []
		for k in sorted(countDict.keys()):
			output_variants.append(k+tuple([len(countDict[k])/float(self.ploidy)]))
			ploid_string = ['0' for n in xrange(self.ploidy)]
			for k2 in [n for n in countDict[k]]:
				ploid_string[k2] = '1'
			output_variants[-1] += tuple(['WP='+'/'.join(ploid_string)])
		return output_variants


	def sample_read(self, sequencingModel, fragLen=None):
		"""Sample one read (or a read pair if fragLen is given) from a random ploid.

		sequencingModel: a ReadContainer supplying quality scores / sequencing errors
		fragLen:         fragment length for paired-end sampling, or None for single-end

		Returns a list of [pos, cigarString, readString, qualString] entries
		(one entry for single-end, two for paired-end); pos is in reference
		(pre-indel) coordinates.
		"""

		# choose a ploid
		myPloid = random.randint(0,self.ploidy-1)

		# choose a random position within the ploid, and generate quality scores / sequencing errors
		readsToSample = []
		if fragLen == None:
			#rPos = random.randint(0,len(self.sequences[myPloid])-self.readLen-1)	# uniform random

			# decide which subsection of the sequence to sample from using coverage probabilities
			coords_bad = True
			while coords_bad:
				myBucket = max([self.which_bucket.sample() - self.win_per_read, 0])
				coords_to_select_from = [myBucket*self.windowSize,(myBucket+1)*self.windowSize]
				# shift window bounds into mutated-sequence coordinates using the indel adjustments
				coords_to_select_from[0] += self.adj[myPloid][coords_to_select_from[0]]
				coords_to_select_from[1] += self.adj[myPloid][coords_to_select_from[1]]
				if coords_to_select_from[1] < len(self.sequences[myPloid])-self.readLen:
					coords_bad = False
			rPos = random.randint(coords_to_select_from[0],coords_to_select_from[1]-1)

			# sample read position and call function to compute quality scores / sequencing errors
			rDat = self.sequences[myPloid][rPos:rPos+self.readLen]
			(myQual, myErrors) = sequencingModel.getSequencingErrors(rDat)
			readsToSample.append([rPos,myQual,myErrors,rDat])

		else:
			#rPos1 = random.randint(0,len(self.sequences[myPloid])-fragLen-1)		# uniform random

			# decide which subsection of the sequence to sample from using coverage probabilities
			coords_bad = True
			while coords_bad:
				myBucket = max([self.which_bucket.sample() - self.win_per_read, 0])
				coords_to_select_from = [myBucket*self.windowSize,(myBucket+1)*self.windowSize]
				coords_to_select_from[0] += self.adj[myPloid][coords_to_select_from[0]]
				coords_to_select_from[1] += self.adj[myPloid][coords_to_select_from[0]]	# both ends use index of starting position to avoid issues with reads spanning breakpoints of large events
				rPos1 = random.randint(coords_to_select_from[0],coords_to_select_from[1]-1)
				# for PE-reads, flip a coin to decide if R1 or R2 will be the "covering" read
				if random.randint(1,2) == 1 and rPos1 > fragLen - self.readLen:
					rPos1 -= fragLen - self.readLen
				if rPos1 < len(self.sequences[myPloid])-fragLen:
					coords_bad = False

			rPos2 = rPos1 + fragLen - self.readLen
			rDat1 = self.sequences[myPloid][rPos1:rPos1+self.readLen]
			rDat2 = self.sequences[myPloid][rPos2:rPos2+self.readLen]
			(myQual1, myErrors1) = sequencingModel.getSequencingErrors(rDat1)
			(myQual2, myErrors2) = sequencingModel.getSequencingErrors(rDat2,isReverseStrand=True)
			readsToSample.append([rPos1,myQual1,myErrors1,rDat1])
			readsToSample.append([rPos2,myQual2,myErrors2,rDat2])

		# error format:
		# myError[i] = (type, len, pos, ref, alt)

		# examine sequencing errors to-be-inserted.
		#	- remove deletions that don't have enough bordering sequence content to "fill in"
		# if error is valid, make the changes to the read data
		rOut = []
		for r in readsToSample:
			myCigar = self.allCigar[myPloid][r[0]]
			totalD  = sum([error[1] for error in r[2] if error[0] == 'D'])
			totalI  = sum([error[1] for error in r[2] if error[0] == 'I'])
			availB  = len(self.sequences[myPloid]) - r[0] - self.readLen - 1
			# add buffer sequence to fill in positions that get deleted
			r[3] += self.sequences[myPloid][r[0]+self.readLen:r[0]+self.readLen+totalD]
			expandedCigar = []
			extraCigar    = []
			adj           = 0	# cumulative read-position shift from applied indel errors
			sse_adj       = [0 for n in xrange(self.readLen)]	# per-position shift for substitution errors
			anyIndelErr   = False

			# sort by letter (D > I > S) such that we introduce all indel errors before substitution errors
			# secondarily, sort by index
			arrangedErrors = {'D':[],'I':[],'S':[]}
			for error in r[2]:
				arrangedErrors[error[0]].append((error[2],error))
			sortedErrors = []
			for k in sorted(arrangedErrors.keys()):
				sortedErrors.extend([n[1] for n in sorted(arrangedErrors[k])])

			for error in sortedErrors:
				#print r[0], error
				eLen = error[1]
				ePos = error[2]
				if error[0] == 'D' or error[0] == 'I':
					anyIndelErr   = True
					extraCigarVal = []
					if totalD > availB:	# if not enough bases to fill-in deletions, skip all indel erors
						continue
					if expandedCigar == []:
						# lazily expand the precomputed cigar into per-position symbols
						expandedCigar = CigarString(stringIn=myCigar).getList()

						fillToGo = totalD - totalI
						if fillToGo > 0:
							# borrow trailing cigar symbols from the shifted start position to cover fill-in bases
							extraCigarVal = CigarString(stringIn=self.allCigar[myPloid][r[0]+fillToGo]).getList()[-fillToGo:]

					# insert deletion error into read and update cigar string accordingly
					if error[0] == 'D':
						pi = ePos+adj
						pf = ePos+adj+eLen+1
						if str(r[3][pi:pf]) == str(error[3]):
							r[3] = r[3][:pi+1] + r[3][pf:]
							expandedCigar = expandedCigar[:pi+1] + expandedCigar[pf:]
							expandedCigar[pi+1] = 'D'*eLen + expandedCigar[pi+1]
						else:
							print '\nError, ref does not match alt while attempting to insert deletion error!\n'
							exit(1)
						adj -= eLen
						for i in xrange(ePos,len(sse_adj)):
							sse_adj[i] -= eLen

					# insert insertion error into read and update cigar string accordingly
					else:
						if chr(r[3][ePos+adj]) == error[3]:
							r[3] = r[3][:ePos+adj] + error[4] + r[3][ePos+adj+1:]
							expandedCigar = expandedCigar[:ePos+adj] + ['I']*eLen + expandedCigar[ePos+adj+1:]
						else:
							print '\nError, ref does not match alt while attempting to insert insertion error!\n'
							exit(1)
						adj += eLen
						for i in xrange(ePos,len(sse_adj)):
							sse_adj[i] += eLen

				else:	# substitution errors, much easier by comparison...
					if chr(r[3][ePos+sse_adj[ePos]]) == error[3]:
						r[3][ePos+sse_adj[ePos]] = error[4]
					else:
						print '\nError, ref does not match alt while attempting to insert substitution error!\n'
						exit(1)

			if anyIndelErr:
				if len(expandedCigar):
					#print myCigar,'-->',
					# extraCigarVal holds the fill-in symbols from the last indel error processed
					relevantCigar = (expandedCigar+extraCigarVal)[:self.readLen]
					myCigar = CigarString(listIn=relevantCigar).getString()
					#print myCigar

				r[3] = r[3][:self.readLen]
				#if len(r[3]) != self.readLen:
				#	print 'AHHHHHH_1'
				#	exit(1)
				#if len(expandedCigar+extraCigarVal) != self.readLen:
				#	print 'AHHHHHH_2'
				#	exit(1)

			# convert position in mutated sequence back to reference coordinates via adj
			rOut.append([r[0]-self.adj[myPloid][r[0]],myCigar,str(r[3]),str(r[1])])

		# rOut[i] = (pos, cigar, read_string, qual_string)
		return rOut