Example #1
    def init_basicVars(self, xOffset, sequence, ploidy, windowOverlap, readLen, coverageDat):
        self.x = xOffset
        self.ploidy = ploidy
        self.readLen = readLen
        self.sequences = [bytearray(sequence) for n in xrange(self.ploidy)]
        self.seqLen = len(sequence)
        self.indelList = [[] for n in xrange(self.ploidy)]
        self.snpList = [[] for n in xrange(self.ploidy)]
        self.allCigar = [[] for n in xrange(self.ploidy)]
        self.adj = [None for n in xrange(self.ploidy)]
        # blackList[ploid][pos] = 0		safe to insert variant here
        # blackList[ploid][pos] = 1		indel inserted here
        # blackList[ploid][pos] = 2		snp inserted here
        # blackList[ploid][pos] = 3		invalid position for various processing reasons
        self.blackList = [
            np.zeros(self.seqLen, dtype='<i4') for n in xrange(self.ploidy)
        ]

        # disallow mutations to occur on window overlap points
        self.winBuffer = windowOverlap
        for p in xrange(self.ploidy):
            self.blackList[p][-self.winBuffer] = 3
            self.blackList[p][-self.winBuffer - 1] = 3

        # if we're only creating a vcf, skip some expensive initialization related to coverage depth
        if not self.onlyVCF:
            (self.windowSize, coverage_vals) = coverageDat
            self.win_per_read = int(self.readLen / float(self.windowSize) + 0.5)
            self.which_bucket = DiscreteDistribution(coverage_vals, range(len(coverage_vals)))
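
The pattern above — DiscreteDistribution(weights, values) paired with .sample() — recurs throughout these examples. A minimal sketch of it in isolation, with toy weights, assuming DiscreteDistribution is importable from the package's probability module (an assumed import path):

from probability import DiscreteDistribution    # assumed import path

coverage_vals = [1.0, 4.0, 2.0, 8.0, 1.0]       # toy per-window coverage weights
which_bucket = DiscreteDistribution(coverage_vals, range(len(coverage_vals)))
counts = [0]*len(coverage_vals)
for n in xrange(10000):
    counts[which_bucket.sample()] += 1
print counts    # bucket 3 should be drawn roughly 8x as often as bucket 0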
Example #2
	def init_trinucBias(self):
		# compute mutation positional bias given trinucleotide strings of the sequence (ONLY AFFECTS SNPs)
		#
		# note: since indels are added before snps, it's possible these positional biases aren't correctly utilized
		#       at positions affected by indels. At the moment I'm going to consider this negligible.
		trinuc_snp_bias  = [[0. for n in xrange(self.seqLen)] for m in xrange(self.ploidy)]
		self.trinuc_bias = [None for n in xrange(self.ploidy)]
		for p in xrange(self.ploidy):
			for i in xrange(self.winBuffer+1,self.seqLen-1):
				trinuc_snp_bias[p][i] = self.models[p][7][ALL_IND[str(self.sequences[p][i-1:i+2])]]
			self.trinuc_bias[p] = DiscreteDistribution(trinuc_snp_bias[p][self.winBuffer+1:self.seqLen-1],range(self.winBuffer+1,self.seqLen-1))
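
For context, ALL_IND above maps a 3-mer string to an index into the per-trinucleotide mutation probabilities (self.models[p][7]). One plausible construction of such a lookup — hypothetical here; the real table lives elsewhere in the package:

NUCL    = ['A', 'C', 'G', 'T']
ALL_TRI = [n1+n2+n3 for n1 in NUCL for n2 in NUCL for n3 in NUCL]
ALL_IND = {ALL_TRI[i]: i for i in xrange(len(ALL_TRI))}    # 64 trinucleotide contexts

seq = 'ACGTACGT'
i = 3                                   # score position i using the window seq[i-1:i+2]
print seq[i-1:i+2], '->', ALL_IND[seq[i-1:i+2]]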
Example #3
    def init_mutModels(self, mutationModels, mutRate):
        if mutationModels == []:
            self.modelData = [copy.deepcopy(DEFAULT_MODEL_1) for n in xrange(self.ploidy)]
        else:
            if len(mutationModels) != self.ploidy:
                print '\nError: Number of mutation models received is not equal to specified ploidy\n'
                exit(1)
            self.modelData = copy.deepcopy(mutationModels)

        # do we need to rescale mutation frequencies?
        mutRateSum = sum([n[0] for n in self.modelData])
        self.mutRescale = mutRate
        if self.mutRescale is None:
            self.mutScalar = 1.0
        else:
            self.mutScalar = float(self.mutRescale) / (mutRateSum / float(len(self.modelData)))

        # how are mutations spread to each ploid, based on their specified mut rates?
        self.ploidMutFrac = [float(n[0]) / mutRateSum for n in self.modelData]
        self.ploidMutPrior = DiscreteDistribution(self.ploidMutFrac, range(self.ploidy))

        # init mutation models
        #
        # self.models[ploid][0] = average mutation rate
        # self.models[ploid][1] = p(mut is homozygous | mutation occurs)
        # self.models[ploid][2] = p(mut is indel | mut occurs)
        # self.models[ploid][3] = p(insertion | indel occurs)
        # self.models[ploid][4] = distribution of insertion lengths
        # self.models[ploid][5] = distribution of deletion lengths
        # self.models[ploid][6] = distribution of trinucleotide SNP transitions
        # self.models[ploid][7] = p(trinuc mutates)
        self.models = []
        for n in self.modelData:
            self.models.append([
                self.mutScalar * n[0], n[1], n[2], n[3],
                DiscreteDistribution(n[5], n[4]),
                DiscreteDistribution(n[7], n[6]), []
            ])
            for m in n[8]:
                self.models[-1][6].append([
                    DiscreteDistribution(m[0], NUCL),
                    DiscreteDistribution(m[1], NUCL),
                    DiscreteDistribution(m[2], NUCL),
                    DiscreteDistribution(m[3], NUCL)
                ])
            self.models[-1].append([m for m in n[9]])
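
A sketch of how downstream sampling might consume this layout (draw_mutation_type is illustrative, not a function from the source; it follows the index comments above):

import random

def draw_mutation_type(models, ploidMutPrior):
    # pick a copy in proportion to its mutation rate, then consult that copy's
    # model: [2] = p(indel | mut), [3] = p(insertion | indel), [4]/[5] are the
    # insertion/deletion length distributions
    ploid = ploidMutPrior.sample()
    model = models[ploid]
    if random.random() < model[2]:
        if random.random() < model[3]:
            return (ploid, 'INS', model[4].sample())
        return (ploid, 'DEL', model[5].sample())
    return (ploid, 'SNP', 1)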
Example #4
	[GC_SCALE_COUNT, GC_SCALE_VAL] = pickle.load(open(GC_BIAS_MODEL,'rb'))
	GC_WINDOW_SIZE = GC_SCALE_COUNT[-1]

#	fragment length distribution
#
if PAIRED_END and not(PAIRED_END_ARTIFICIAL):
	print 'Using empirical fragment length distribution.'
	[potential_vals, potential_prob] = pickle.load(open(FRAGLEN_MODEL,'rb'))
	FRAGLEN_VALS = []
	FRAGLEN_PROB = []
	for i in xrange(len(potential_vals)):
		if potential_vals[i] > READLEN:
			FRAGLEN_VALS.append(potential_vals[i])
			FRAGLEN_PROB.append(potential_prob[i])
	# should probably add some validation and sanity-checking code here...
	FRAGLEN_DISTRIBUTION = DiscreteDistribution(FRAGLEN_PROB,FRAGLEN_VALS)
	FRAGMENT_SIZE = FRAGLEN_VALS[mean_ind_of_weighted_list(FRAGLEN_PROB)]
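
A toy version of the block above, with hypothetical values standing in for the pickled model (any length <= READLEN would have been filtered out by the loop):

READLEN      = 100
FRAGLEN_VALS = [300, 350, 400, 450]          # toy lengths, all > READLEN
FRAGLEN_PROB = [0.2, 0.4, 0.3, 0.1]
FRAGLEN_DISTRIBUTION = DiscreteDistribution(FRAGLEN_PROB, FRAGLEN_VALS)
samples = [FRAGLEN_DISTRIBUTION.sample() for n in xrange(10000)]
print 'empirical mean fragment length:', sum(samples)/float(len(samples))    # ~365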

#	Indicate not writing FASTQ reads
#
if NO_FASTQ:
	print 'Bypassing FASTQ generation...'

"""************************************************
****            HARD-CODED CONSTANTS
************************************************"""


# target window size for read sampling. how many times bigger than read/frag length
WINDOW_TARGET_SCALE = 100
# sub-window size for read sampling windows. this is basically the finest resolution
Example #5
	def __init__(self, readLen, errorModel, reScaledError):

		self.readLen = readLen

		errorDat = pickle.load(open(errorModel,'rb'))
		self.UNIFORM = False
		if len(errorDat) == 4:		# uniform-error SE reads (e.g. PacBio)
			self.UNIFORM = True
			[Qscores,offQ,avgError,errorParams] = errorDat
			self.uniform_qscore = int(-10.*np.log10(avgError)+0.5)
			print 'Using uniform sequencing error model. (q='+str(self.uniform_qscore)+'+'+str(offQ)+', p(err)={0:0.2f}%)'.format(100.*avgError)
		elif len(errorDat) == 6:	# only 1 q-score model present, use same model for both strands
			[initQ1,probQ1,Qscores,offQ,avgError,errorParams] = errorDat
			self.PE_MODELS = False
		elif len(errorDat) == 8:	# found a q-score model for both forward and reverse strands
			#print 'Using paired-read quality score profiles...'
			[initQ1,probQ1,initQ2,probQ2,Qscores,offQ,avgError,errorParams] = errorDat
			self.PE_MODELS = True
			if len(initQ1) != len(initQ2) or len(probQ1) != len(probQ2):
				print '\nError: R1 and R2 quality score models are of different length.\n'
				exit(1)


		self.qErrRate = [0.]*(max(Qscores)+1)
		for q in Qscores:
			self.qErrRate[q] = 10.**(-q/10.)
		self.offQ = offQ

		# errorParams = [SSE_PROB, SIE_RATE, SIE_PROB, SIE_VAL, SIE_INS_FREQ, SIE_INS_NUCL]
		self.errP   = errorParams
		self.errSSE = [DiscreteDistribution(n,NUCL) for n in self.errP[0]]
		self.errSIE = DiscreteDistribution(self.errP[2],self.errP[3])
		self.errSIN = DiscreteDistribution(self.errP[5],NUCL)

		# adjust sequencing error frequency to match desired rate
		if reScaledError is None:
			self.errorScale = 1.0
		else:
			self.errorScale = reScaledError/avgError
			print 'Warning: Quality scores no longer exactly representative of error probability. Error model scaled by {0:.3f} to match desired rate...'.format(self.errorScale)

		if self.UNIFORM == False:
			# adjust length to match desired read length
			if self.readLen == len(initQ1):
				self.qIndRemap = range(self.readLen)
			else:
				print 'Warning: Read length of error model ('+str(len(initQ1))+') does not match -R value ('+str(self.readLen)+'), rescaling model...'
				self.qIndRemap = [max([1,len(initQ1)*n/readLen]) for n in xrange(readLen)]

			# initialize probability distributions
			self.initDistByPos1        = [DiscreteDistribution(initQ1[i],Qscores) for i in xrange(len(initQ1))]
			self.probDistByPosByPrevQ1 = [None]
			for i in xrange(1,len(initQ1)):
				self.probDistByPosByPrevQ1.append([])
				for j in xrange(len(initQ1[0])):
					if np.sum(probQ1[i][j]) <= 0.:	# if we don't have sufficient data for a transition, use the previous qscore
						self.probDistByPosByPrevQ1[-1].append(DiscreteDistribution([1],[Qscores[j]],degenerateVal=Qscores[j]))
					else:
						self.probDistByPosByPrevQ1[-1].append(DiscreteDistribution(probQ1[i][j],Qscores))

			if self.PE_MODELS:
				self.initDistByPos2        = [DiscreteDistribution(initQ2[i],Qscores) for i in xrange(len(initQ2))]
				self.probDistByPosByPrevQ2 = [None]
				for i in xrange(1,len(initQ2)):
					self.probDistByPosByPrevQ2.append([])
					for j in xrange(len(initQ2[0])):
						if np.sum(probQ2[i][j]) <= 0.:	# if we don't have sufficient data for a transition, use the previous qscore
							self.probDistByPosByPrevQ2[-1].append(DiscreteDistribution([1],[Qscores[j]],degenerateVal=Qscores[j]))
						else:
							self.probDistByPosByPrevQ2[-1].append(DiscreteDistribution(probQ2[i][j],Qscores))
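
Given the distributions built in this constructor, one read's quality string can be drawn as a position-dependent Markov chain. A sketch using the attribute names defined above (sample_quality_string itself is illustrative; indexing the inner list by the sampled q-score mirrors the simulation loop in the parseFQ example below, and chr(q + offQ) assumes the usual FASTQ ASCII offset):

def sample_quality_string(model):
	# first position from the prior, then each position conditioned on the
	# previous q-score via the remapped position index (qIndRemap values >= 1)
	q = model.initDistByPos1[0].sample()
	out = [q]
	for i in xrange(1, model.readLen):
		q = model.probDistByPosByPrevQ1[model.qIndRemap[i]][q].sample()
		out.append(q)
	return ''.join([chr(n + model.offQ) for n in out])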
Example #6
	def init_coverage(self,coverageDat,fragDist=None):
		# if we're only creating a vcf, skip some expensive initialization related to coverage depth
		if not self.onlyVCF:
			(self.windowSize, gc_scalars, targetCov_vals) = coverageDat
			gcCov_vals = [[] for n in self.sequences]
			trCov_vals = [[] for n in self.sequences]
			self.coverage_distribution = []
			avg_out = []
			for i in xrange(len(self.sequences)):
				# compute gc-bias
				j = 0
				while j+self.windowSize < len(self.sequences[i]):
					gc_c = self.sequences[i][j:j+self.windowSize].count('G') + self.sequences[i][j:j+self.windowSize].count('C')
					gcCov_vals[i].extend([gc_scalars[gc_c]]*self.windowSize)
					j += self.windowSize
				gc_c = self.sequences[i][-self.windowSize:].count('G') + self.sequences[i][-self.windowSize:].count('C')
				gcCov_vals[i].extend([gc_scalars[gc_c]]*(len(self.sequences[i])-len(gcCov_vals[i])))
				#
				trCov_vals[i].append(targetCov_vals[0])
				prevVal = self.FM_pos[i][0]
				for j in xrange(1,len(self.sequences[i])-self.readLen):
					if self.FM_pos[i][j] is None:
						trCov_vals[i].append(targetCov_vals[prevVal])
					else:
						trCov_vals[i].append(sum(targetCov_vals[self.FM_pos[i][j]:self.FM_span[i][j]])/float(self.FM_span[i][j]-self.FM_pos[i][j]))
						prevVal = self.FM_pos[i][j]
					#print (i,j), self.adj[i][j], self.allCigar[i][j], self.FM_pos[i][j], self.FM_span[i][j]
				# shift by half of read length
				trCov_vals[i] = [0.0]*int(self.readLen/2) + trCov_vals[i][:-int(self.readLen/2.)]
				# fill in missing indices
				trCov_vals[i].extend([0.0]*(len(self.sequences[i])-len(trCov_vals[i])))

				#
				covvec = np.cumsum([trCov_vals[i][nnn]*gcCov_vals[i][nnn] for nnn in xrange(len(trCov_vals[i]))])
				coverage_vals = []
				for j in xrange(0,len(self.sequences[i])-self.readLen):
					coverage_vals.append(covvec[j+self.readLen] - covvec[j])
				avg_out.append(np.mean(coverage_vals)/float(self.readLen))

				if fragDist is None:
					self.coverage_distribution.append(DiscreteDistribution(coverage_vals,range(len(coverage_vals))))
			
				# fragment length nightmare
				else:
					currentThresh = 0.
					index_list    = [0]
					for j in xrange(len(fragDist.cumP)):
						if fragDist.cumP[j] >= currentThresh + COV_FRAGLEN_PERCENTILE/100.0:
							currentThresh = fragDist.cumP[j]
							index_list.append(j)
					flq = [fragDist.values[nnn] for nnn in index_list]
					if fragDist.values[-1] not in flq:
						flq.append(fragDist.values[-1])
					flq.append(LARGE_NUMBER)

					self.fraglens_indMap = {}
					for j in fragDist.values:
						bInd = bisect.bisect(flq,j)
						if abs(flq[bInd-1] - j) <= abs(flq[bInd] - j):
							self.fraglens_indMap[j] = flq[bInd-1]
						else:
							self.fraglens_indMap[j] = flq[bInd]

					self.coverage_distribution.append({})
					for flv in sorted(list(set(self.fraglens_indMap.values()))):
						buffer_val = self.readLen
						for j in fragDist.values:
							if self.fraglens_indMap[j] == flv and j > buffer_val:
								buffer_val = j
						coverage_vals = []
						for j in xrange(len(self.sequences[i])-buffer_val):
							coverage_vals.append(covvec[j+self.readLen] - covvec[j] + covvec[j+flv] - covvec[j+flv-self.readLen])

						# EXPERIMENTAL
						#quantized_covVals = quantize_list(coverage_vals)
						#self.coverage_distribution[i][flv] = DiscreteDistribution([n[2] for n in quantized_covVals],[(n[0],n[1]) for n in quantized_covVals])

						# TESTING
						#import matplotlib.pyplot as mpl
						#print len(coverage_vals),'-->',len(quantized_covVals)
						#mpl.figure(0)
						#mpl.plot(range(len(coverage_vals)),coverage_vals)
						#for qcv in quantized_covVals:
						#	mpl.plot([qcv[0],qcv[1]+1],[qcv[2],qcv[2]],'r')
						#mpl.show()
						#exit(1)

						self.coverage_distribution[i][flv] = DiscreteDistribution(coverage_vals,range(len(coverage_vals)))

			return np.mean(avg_out)
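
The covvec trick above deserves a note: a cumulative sum turns every fixed-width coverage window into an O(1) difference of two prefix sums. A standalone sketch with toy numbers:

import numpy as np

vals    = [0.5, 1.0, 2.0, 1.5, 0.5, 1.0]    # toy per-position weights
readLen = 3
covvec  = np.cumsum(vals)
# covvec[j+readLen] - covvec[j] sums vals[j+1 : j+readLen+1] in constant time,
# which is exactly how coverage_vals is accumulated in init_coverage above
print [covvec[j+readLen] - covvec[j] for j in xrange(len(vals)-readLen)]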
Example #7
def parseFQ(inf):
    print 'reading ' + inf + '...'
    if inf[-3:] == '.gz':
        print 'detected gzip suffix...'
        f = gzip.open(inf, 'r')
    else:
        f = open(inf, 'r')

    IS_SAM = False
    if inf[-4:] == '.sam':
        print 'detected sam input...'
        IS_SAM = True

    rRead = 0
    actual_readlen = 0
    qDict = {}
    while True:

        if IS_SAM:
            data4 = f.readline()
            if not len(data4):
                break
            try:
                data4 = data4.split('\t')[10]
            except IndexError:
                break
            # need to add some input checking here? Yup, probably.
        else:
            data1 = f.readline()
            data2 = f.readline()
            data3 = f.readline()
            data4 = f.readline()
            if not all([data1, data2, data3, data4]):
                break

        if actual_readlen == 0:
            if inf[-3:] != '.gz' and not IS_SAM:
                totalSize = os.path.getsize(inf)
                entrySize = sum([len(n) for n in [data1, data2, data3, data4]])
                print 'estimated number of reads in file:', int(float(totalSize) / entrySize)
            actual_readlen = len(data4) - 1
            print 'assuming read length is uniform...'
            print 'detected read length (from first read found):', actual_readlen
            priorQ = np.zeros([actual_readlen, RQ])
            totalQ = [None] + [
                np.zeros([RQ, RQ]) for n in xrange(actual_readlen - 1)
            ]

        # sanity-check readlengths
        if len(data4) - 1 != actual_readlen:
            print 'skipping read with unexpected length...'
            continue

        for i in range(len(data4) - 1):
            q = ord(data4[i]) - offQ
            qDict[q] = True
            if i == 0:
                priorQ[i][q] += 1
            else:
                totalQ[i][prevQ, q] += 1
                priorQ[i][q] += 1
            prevQ = q

        rRead += 1
        if rRead % PRINT_EVERY == 0:
            print rRead
        if MAX_READS > 0 and rRead >= MAX_READS:
            break
    f.close()

    # some sanity checking again...
    QRANGE = [min(qDict.keys()), max(qDict.keys())]
    if QRANGE[0] < 0:
        print '\nError: Read in Q-scores below 0\n'
        exit(1)
    if QRANGE[1] > RQ:
        print '\nError: Read in Q-scores above specified maximum:', QRANGE[1], '>', RQ, '\n'
        exit(1)

    print 'computing probabilities...'
    probQ = [None] + [[[0. for m in xrange(RQ)] for n in xrange(RQ)]
                      for p in xrange(actual_readlen - 1)]
    for p in xrange(1, actual_readlen):
        for i in xrange(RQ):
            rowSum = float(np.sum(totalQ[p][i, :])) + PROB_SMOOTH * RQ
            if rowSum <= 0.:
                continue
            for j in xrange(RQ):
                probQ[p][i][j] = (totalQ[p][i][j] + PROB_SMOOTH) / rowSum

    initQ = [[0. for m in xrange(RQ)] for n in xrange(actual_readlen)]
    for i in xrange(actual_readlen):
        rowSum = float(np.sum(priorQ[i, :])) + INIT_SMOOTH * RQ
        if rowSum <= 0.:
            continue
        for j in xrange(RQ):
            initQ[i][j] = (priorQ[i][j] + INIT_SMOOTH) / rowSum

    if PLOT_STUFF:
        mpl.rcParams.update({
            'font.size': 14,
            'font.weight': 'bold',
            'lines.linewidth': 3
        })

        mpl.figure(1)
        Z = np.array(initQ).T
        X, Y = np.meshgrid(range(0, len(Z[0]) + 1), range(0, len(Z) + 1))
        mpl.pcolormesh(X, Y, Z, vmin=0., vmax=0.25)
        mpl.axis([0, len(Z[0]), 0, len(Z)])
        mpl.yticks(range(0, len(Z), 10), range(0, len(Z), 10))
        mpl.xticks(range(0, len(Z[0]), 10), range(0, len(Z[0]), 10))
        mpl.xlabel('Read Position')
        mpl.ylabel('Quality Score')
        mpl.title('Q-Score Prior Probabilities')
        mpl.colorbar()

        mpl.show()

        VMIN_LOG = [-4, 0]
        minVal = 10**VMIN_LOG[0]
        qLabels = [
            str(n) for n in range(QRANGE[0], QRANGE[1] + 1) if n % 5 == 0
        ]
        print qLabels
        qTicksx = [int(n) + 0.5 for n in qLabels]
        qTicksy = [(RQ - int(n)) - 0.5 for n in qLabels]

        for p in xrange(1, actual_readlen, 10):
            currentDat = np.array(probQ[p])
            for i in xrange(len(currentDat)):
                for j in xrange(len(currentDat[i])):
                    currentDat[i][j] = max(minVal, currentDat[i][j])

            # matrix indices:          pcolormesh plotting:      plot labels and axes:
            #
            #        y                        ^                         ^
            #      ----->                   x |                       y |
            #   x |                           +----->                   +----->
            #     v                              y                         x
            #
            # to plot an MxN matrix 'Z' with rowNames and colNames we need to:
            #
            # pcolormesh(X,Y,Z[::-1,:])    # invert x-axis
            # # swap x/y axis parameters and labels, remember x is still inverted:
            # xlim([yMin,yMax])
            # ylim([M-xMax,M-xMin])
            # xticks()
            #

            mpl.figure(p + 1)
            Z = np.log10(currentDat)
            X, Y = np.meshgrid(range(0, len(Z[0]) + 1), range(0, len(Z) + 1))
            mpl.pcolormesh(X,
                           Y,
                           Z[::-1, :],
                           vmin=VMIN_LOG[0],
                           vmax=VMIN_LOG[1],
                           cmap='jet')
            mpl.xlim([QRANGE[0], QRANGE[1] + 1])
            mpl.ylim([RQ - QRANGE[1] - 1, RQ - QRANGE[0]])
            mpl.yticks(qTicksy, qLabels)
            mpl.xticks(qTicksx, qLabels)
            mpl.xlabel('\n' + r'$Q_{i+1}$')
            mpl.ylabel(r'$Q_i$')
            mpl.title('Q-Score Transition Frequencies [Read Pos:' + str(p) +
                      ']')
            cb = mpl.colorbar()
            cb.set_ticks([-4, -3, -2, -1, 0])
            cb.set_ticklabels([
                r'$10^{-4}$', r'$10^{-3}$', r'$10^{-2}$', r'$10^{-1}$',
                r'$10^{0}$'
            ])

        #mpl.tight_layout()
        mpl.show()

    print 'estimating average error rate via simulation...'
    Qscores = range(RQ)
    #print (len(initQ), len(initQ[0]))
    #print (len(probQ), len(probQ[1]), len(probQ[1][0]))

    initDistByPos = [
        DiscreteDistribution(initQ[i], Qscores) for i in xrange(len(initQ))
    ]
    probDistByPosByPrevQ = [None]
    for i in xrange(1, len(initQ)):
        probDistByPosByPrevQ.append([])
        for j in xrange(len(initQ[0])):
            # if we don't have sufficient data for a transition, use the previous qscore
            if np.sum(probQ[i][j]) <= 0.:
                probDistByPosByPrevQ[-1].append(DiscreteDistribution([1], [Qscores[j]], degenerateVal=Qscores[j]))
            else:
                probDistByPosByPrevQ[-1].append(DiscreteDistribution(probQ[i][j], Qscores))

    countDict = {}
    for q in Qscores:
        countDict[q] = 0
    for samp in xrange(1, N_SAMP + 1):
        if samp % PRINT_EVERY == 0:
            print samp
        myQ = initDistByPos[0].sample()
        countDict[myQ] += 1
        for i in xrange(1, len(initQ)):
            myQ = probDistByPosByPrevQ[i][myQ].sample()
            countDict[myQ] += 1

    totBases = float(sum(countDict.values()))
    avgError = 0.
    for k in sorted(countDict.keys()):
        eVal = 10.**(-k / 10.)
        #print k, eVal, countDict[k]
        avgError += eVal * (countDict[k] / totBases)
    print 'AVG ERROR RATE:', avgError

    return (initQ, probQ, avgError)
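
The closing loop estimates the average per-base error as a Phred-weighted sum, avgError = sum over q of p(q) * 10^(-q/10). A tiny worked check with hypothetical counts (toy numbers, not from the source):

countDict = {20: 900, 30: 100}               # toy tallies: 90% of bases at Q20, 10% at Q30
totBases  = float(sum(countDict.values()))
avgError  = sum([10.**(-k/10.) * (countDict[k]/totBases) for k in countDict])
print avgError                               # 0.9*0.01 + 0.1*0.001 = 0.0091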
Example #8
        if print_path:
            print('Path cost is', discovered[target][1])
            stack = []
            curr = target

            while curr:
                stack.append((curr, self.original_universe[curr[0]][curr[1]]))
                print(curr)
                curr = discovered[curr][0]

            print('Path from start to target:', stack[::-1])

        return discovered[target][1]


if __name__ == '__main__':
    state = load_maze()

    universe = state.universe
    start, target = state.start, state.target
    portals = state.portals

    discrete_distribution = DiscreteDistribution(portals)
    heuristics = Heuristic(portals, target, discrete_distribution)

    solver = A_Star(universe, portals, start, target, discrete_distribution)

    report = make_statistics(solver, heuristics, discrete_distribution)
    print(report)