Python getAllRefRegions Exemples, refFunc.getAllRefRegions Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : genReads.py Projet : muppetjones/neat-genreads

def main():

	# index reference
	refIndex = indexRef(REFERENCE)
	if PAIRED_END:
		N_HANDLING = ('random',FRAGMENT_SIZE)
	else:
		N_HANDLING = ('ignore',READLEN)
	indices_by_refName = {refIndex[n][0]:n for n in xrange(len(refIndex))}

	# parse input variants, if present
	inputVariants = []
	if INPUT_VCF != None:
		if CANCER:
			(sampNames, inputVariants) = parseVCF(INPUT_VCF,tumorNormal=True,ploidy=PLOIDS)
			tumorInd  = sampNames.index('TUMOR')
			normalInd = sampNames.index('NORMAL')
		else:
			(sampNames, inputVariants) = parseVCF(INPUT_VCF,ploidy=PLOIDS)
		for k in sorted(inputVariants.keys()):
			inputVariants[k].sort()

	# parse input targeted regions, if present
	inputRegions = {}
	if INPUT_BED != None:
		with open(INPUT_BED,'r') as f:
			for line in f:
				[myChr,pos1,pos2] = line.strip().split('\t')[:3]
				if myChr not in inputRegions:
					inputRegions[myChr] = [-1]
				inputRegions[myChr].extend([int(pos1),int(pos2)])

	# parse input mutation rate rescaling regions, if present
	mutRateRegions = {}
	mutRateValues  = {}
	if MUT_BED != None:
		with open(MUT_BED,'r') as f:
			for line in f:
				[myChr,pos1,pos2,metaData] = line.strip().split('\t')[:4]
				mutStr = re.findall(r"MUT_RATE=.*?(?=;)",metaData+';')
				(pos1,pos2) = (int(pos1),int(pos2))
				if len(mutStr) and (pos2-pos1) > 1:
					# mutRate = #_mutations / length_of_region, let's bound it by a reasonable amount
					mutRate = max([0.0,min([float(mutStr[0][9:]),0.3])])
					if myChr not in inputRegions:
						mutRateRegions[myChr] = [-1]
						mutRateValues[myChr]  = [0.0]
					mutRateRegions[myChr].extend([pos1,pos2])
					mutRateValues.extend([mutRate*(pos2-pos1)]*2)

	# initialize output files (part I)
	bamHeader = None
	if SAVE_BAM:
		bamHeader = [copy.deepcopy(refIndex)]
	vcfHeader = None
	if SAVE_VCF:
		vcfHeader = [REFERENCE]
	
	# If processing jobs in parallel, precompute the independent regions that can be process separately
	if NJOBS > 1:
		parallelRegionList  = getAllRefRegions(REFERENCE,refIndex,N_HANDLING,saveOutput=SAVE_NON_N)
		(myRefs, myRegions) = partitionRefRegions(parallelRegionList,refIndex,MYJOB,NJOBS)
		if not len(myRegions):
			print 'This job id has no regions to process, exiting...'
			exit(1)
		for i in xrange(len(refIndex)-1,-1,-1):	# delete reference not used in our job
			if not refIndex[i][0] in myRefs:
				del refIndex[i]
		# if value of NJOBS is too high, let's change it to the maximum possible, to avoid output filename confusion
		corrected_nJobs = min([NJOBS,sum([len(n) for n in parallelRegionList.values()])])
	else:
		corrected_nJobs = 1

	# initialize output files (part II)
	if CANCER:
		OFW = OutputFileWriter(OUT_PREFIX+'_normal',paired=PAIRED_END,BAM_header=bamHeader,VCF_header=vcfHeader,gzipped=GZIPPED_OUT)
		OFW_CANCER = OutputFileWriter(OUT_PREFIX+'_tumor',paired=PAIRED_END,BAM_header=bamHeader,VCF_header=vcfHeader,gzipped=GZIPPED_OUT,jobTuple=(MYJOB,corrected_nJobs))
	else:
		OFW = OutputFileWriter(OUT_PREFIX,paired=PAIRED_END,BAM_header=bamHeader,VCF_header=vcfHeader,gzipped=GZIPPED_OUT,jobTuple=(MYJOB,corrected_nJobs))
	OUT_PREFIX_NAME = OUT_PREFIX.split('/')[-1]


	"""************************************************
	****        LET'S GET THIS PARTY STARTED...
	************************************************"""


	readNameCount = 1	# keep track of the number of reads we've sampled, for read-names

	for RI in xrange(len(refIndex)):

		# read in reference sequence and notate blocks of Ns
		(refSequence,N_regions) = readRef(REFERENCE,refIndex[RI],N_HANDLING)

		# if we're processing jobs in parallel only take the regions relevant for the current job
		if NJOBS > 1:
			for i in xrange(len(N_regions['non_N'])-1,-1,-1):
				if not (refIndex[RI][0],N_regions['non_N'][i][0],N_regions['non_N'][i][1]) in myRegions:
					del N_regions['non_N'][i]

		# count total bp we'll be spanning so we can get an idea of how far along we are (for printing progress indicators)
		total_bp_span   = sum([n[1]-n[0] for n in N_regions['non_N']])
		currentProgress = 0
		currentPercent  = 0

		# prune invalid input variants, e.g variants that:
		#		- try to delete or alter any N characters
		#		- don't match the reference base at their specified position
		#		- any alt allele contains anything other than allowed characters
		validVariants = []
		nSkipped = [0,0,0]
		if refIndex[RI][0] in inputVariants:
			for n in inputVariants[refIndex[RI][0]]:
				span = (n[0],n[0]+len(n[1]))
				rseq = str(refSequence[span[0]-1:span[1]-1])	# -1 because going from VCF coords to array coords
				anyBadChr = any((nn not in ALLOWED_NUCL) for nn in [item for sublist in n[2] for item in sublist])
				if rseq != n[1]:
					nSkipped[0] += 1
					continue
				elif 'N' in rseq:
					nSkipped[1] += 1
					continue
				elif anyBadChr:
					nSkipped[2] += 1
					continue
				#if bisect.bisect(N_regions['big'],span[0])%2 or bisect.bisect(N_regions['big'],span[1])%2:
				#	continue
				validVariants.append(n)
			print 'found',len(validVariants),'valid variants for '+refIndex[RI][0]+' in input VCF...'
			if any(nSkipped):
				print sum(nSkipped),'variants skipped...'
				print ' - ['+str(nSkipped[0])+'] ref allele does not match reference'
				print ' - ['+str(nSkipped[1])+'] attempting to insert into N-region'
				print ' - ['+str(nSkipped[2])+'] alt allele contains non-ACGT characters'

		# add large random structural variants
		#
		#	TBD!!!

		# determine which structural variants will affect our sampling window positions
		structuralVars = []
		for n in validVariants:
			bufferNeeded = max([max([len(n[1])-len(alt_allele),1]) for alt_allele in n[2]])
			structuralVars.append((n[0]-1,bufferNeeded))	# -1 because going from VCF coords to array coords

		# determine sampling windows based on read length, large N regions, and structural mutations.
		# in order to obtain uniform coverage, windows should overlap by:
		#		- READLEN, if single-end reads
		#		- FRAGMENT_SIZE (mean), if paired-end reads
		# ploidy is fixed per large sampling window,
		# coverage distributions due to GC% and targeted regions are specified within these windows
		samplingWindows  = []
		ALL_VARIANTS_OUT = {}
		sequences        = None
		if PAIRED_END:
			targSize = WINDOW_TARGET_SCALE*FRAGMENT_SIZE
			overlap  = FRAGMENT_SIZE
		else:
			targSize = WINDOW_TARGET_SCALE*READLEN
			overlap  = READLEN

		print '--------------------------------'
		print 'sampling reads...'
		for i in xrange(len(N_regions['non_N'])):
			(pi,pf) = N_regions['non_N'][i]
			nTargWindows = max([1,(pf-pi)/targSize])
			bpd = int((pf-pi)/float(nTargWindows))
			bpd += GC_WINDOW_SIZE - bpd%GC_WINDOW_SIZE

			#print len(refSequence), (pi,pf), nTargWindows
			#print structuralVars

			# if for some reason our region is too small to process, skip it! (sorry)
			if nTargWindows == 1 and (pf-pi) < overlap-1:
				#print 'Does this ever happen?'
				continue

			start = pi
			end   = min([start+bpd,pf])
			#print '------------------RAWR:', (pi,pf), bpd
			currentVariantInd = 0
			varsFromPrevOverlap = []
			varsCancerFromPrevOverlap = []
			vindFromPrev = 0
			isLastTime = False
			while True:
				####print (start,end)
				# adjust end-position of window based on inserted structural mutations
				relevantVars = []
				if len(structuralVars) and currentVariantInd < len(structuralVars):
					prevVarInd = currentVariantInd
					while structuralVars[currentVariantInd][0] <= end:
						delta = (end-1) - (structuralVars[currentVariantInd][0] + structuralVars[currentVariantInd][1])
						if delta <= 0:
							####print 'DELTA:', delta
							end -= (delta-1)
						currentVariantInd += 1
						if currentVariantInd == len(structuralVars):
							break
					relevantVars = structuralVars[prevVarInd:currentVariantInd]
				next_start = end-overlap
				next_end   = min([next_start+bpd,pf])
				if next_end-next_start < bpd:
					end = next_end
					isLastTime = True

				# print progress indicator
				#print 'PROCESSING WINDOW:',(start,end)
				currentProgress += end-start
				newPercent = int((currentProgress*100)/float(total_bp_span))
				if newPercent > currentPercent:
					sys.stdout.write(str(newPercent)+'% ')
					sys.stdout.flush()
					currentPercent = newPercent

				# which inserted variants are in this window?
				varsInWindow = []
				updated = False
				for j in xrange(vindFromPrev,len(validVariants)):
					vPos = validVariants[j][0]
					if vPos >= start and vPos < end:
						varsInWindow.append(tuple([vPos-1]+list(validVariants[j][1:])))	# vcf --> array coords
					if vPos >= end-overlap-1 and updated == False:
						updated = True
						vindFromPrev = j
					if vPos >= end:
						break

				# if computing only VCF, we can skip this...
				if ONLY_VCF:
					coverage_dat = None
					coverage_avg = None
				else:
					# pre-compute gc-bias and targeted sequencing coverage modifiers
					nSubWindows  = (end-start)/GC_WINDOW_SIZE
					coverage_dat = (GC_WINDOW_SIZE,[])
					for j in xrange(nSubWindows):
						rInd = start + j*GC_WINDOW_SIZE
						if INPUT_BED == None: tCov = True
						else: tCov = not(bisect.bisect(inputRegions[myChr],rInd)%2) or not(bisect.bisect(inputRegions[myChr],rInd+GC_WINDOW_SIZE)%2)
						if tCov: tScl = 1.0
						else: tScl = OFFTARGET_SCALAR
						gc_v = refSequence[rInd:rInd+GC_WINDOW_SIZE].count('G') + refSequence[rInd:rInd+GC_WINDOW_SIZE].count('C')
						gScl = GC_SCALE_VAL[gc_v]
						coverage_dat[1].append(1.0*tScl*gScl)
					coverage_avg = np.mean(coverage_dat[1])

				# pre-compute mutation rate tracks
				# PROVIDED MUTATION RATES OVERRIDE AVERAGE VALUE

				# construct sequence data that we will sample reads from
				if sequences == None:
					sequences = SequenceContainer(start,refSequence[start:end],PLOIDS,overlap,READLEN,[MUT_MODEL]*PLOIDS,MUT_RATE,coverage_dat,onlyVCF=ONLY_VCF)
				else:
					sequences.update(start,refSequence[start:end],PLOIDS,overlap,READLEN,[MUT_MODEL]*PLOIDS,MUT_RATE,coverage_dat)

				# adjust position of all inserted variants to match current window offset
				#variants_to_insert = []
				#for n in varsFromPrevOverlap:
				#	ln = [n[0]-start] + list(n[1:])
				#	variants_to_insert.append(tuple(ln))
				#for n in varsInWindow:
				#	ln = [n[0]-start] + list(n[1:])
				#	variants_to_insert.append(tuple(ln))
				#sequences.insert_mutations(variants_to_insert)
				sequences.insert_mutations(varsFromPrevOverlap + varsInWindow)
				all_inserted_variants = sequences.random_mutations()
				#print all_inserted_variants

				if CANCER:
					tumor_sequences = SequenceContainer(start,refSequence[start:end],PLOIDS,overlap,READLEN,[CANCER_MODEL]*PLOIDS,MUT_RATE,coverage_dat)
					tumor_sequences.insert_mutations(varsCancerFromPrevOverlap + all_inserted_variants)
					all_cancer_variants = tumor_sequences.random_mutations()

				# which variants do we need to keep for next time (because of window overlap)?
				varsFromPrevOverlap       = []
				varsCancerFromPrevOverlap = []
				for n in all_inserted_variants:
					if n[0] >= end-overlap-1:
						varsFromPrevOverlap.append(n)
				if CANCER:
					for n in all_cancer_variants:
						if n[0] >= end-overlap-1:
							varsCancerFromPrevOverlap.append(n)

				# if we're only producing VCF, no need to go through the hassle of generating reads
				if ONLY_VCF:
					pass
				else:
					# for each sampling window, construct sub-windows with coverage information
					covWindows = [COVERAGE for n in xrange((end-start)/SMALL_WINDOW)]
					if (end-start)%SMALL_WINDOW:
						covWindows.append(COVERAGE)
					meanCov = sum(covWindows)/float(len(covWindows))
					if PAIRED_END:
						readsToSample = int(((end-start)*meanCov*coverage_avg)/(2*READLEN))+1
					else:
						readsToSample = int(((end-start)*meanCov*coverage_avg)/(READLEN))+1

					# sample reads from altered reference
					for i in xrange(readsToSample):

						if PAIRED_END:
							myFraglen = FRAGLEN_DISTRIBUTION.sample()
							myReadData = sequences.sample_read(SE_CLASS,myFraglen)
							myReadData[0][0] += start	# adjust mapping position based on window start
							myReadData[1][0] += start
						else:
							myReadData = sequences.sample_read(SE_CLASS)
							myReadData[0][0] += start	# adjust mapping position based on window start
					
						if NJOBS > 1:
							myReadName = OUT_PREFIX_NAME+'-j'+str(MYJOB)+'-'+refIndex[RI][0]+'-r'+str(readNameCount)
						else:
							myReadName = OUT_PREFIX_NAME+'-'+refIndex[RI][0]+'-'+str(readNameCount)
						readNameCount += len(myReadData)

						# if desired, replace all low-quality bases with Ns
						if N_MAX_QUAL > -1:
							for j in xrange(len(myReadData)):
								myReadString = [n for n in myReadData[j][2]]
								for k in xrange(len(myReadData[j][3])):
									adjusted_qual = ord(myReadData[j][3][k])-SE_CLASS.offQ
									if adjusted_qual <= N_MAX_QUAL:
										myReadString[k] = 'N'
								myReadData[j][2] = ''.join(myReadString)

						# write read data out to FASTQ and BAM files, bypass FASTQ if option specified
						myRefIndex = indices_by_refName[refIndex[RI][0]]
						if len(myReadData) == 1:
							if NO_FASTQ != True:
								OFW.writeFASTQRecord(myReadName,myReadData[0][2],myReadData[0][3])
							if SAVE_BAM:
								OFW.writeBAMRecord(myRefIndex, myReadName+'/1', myReadData[0][0], myReadData[0][1], myReadData[0][2], myReadData[0][3], samFlag=0)
						elif len(myReadData) == 2:
							if NO_FASTQ != True:
								OFW.writeFASTQRecord(myReadName,myReadData[0][2],myReadData[0][3],read2=myReadData[1][2],qual2=myReadData[1][3])
							if SAVE_BAM:
								OFW.writeBAMRecord(myRefIndex, myReadName+'/1', myReadData[0][0], myReadData[0][1], myReadData[0][2], myReadData[0][3], samFlag=99,  matePos=myReadData[1][0])
								OFW.writeBAMRecord(myRefIndex, myReadName+'/2', myReadData[1][0], myReadData[1][1], myReadData[1][2], myReadData[1][3], samFlag=147, matePos=myReadData[0][0])
						else:
							print '\nError: Unexpected number of reads generated...\n'
							exit(1)

				# tally up all the variants that got successfully introduced
				for n in all_inserted_variants:
					ALL_VARIANTS_OUT[n] = True

				# prepare indices of next window
				start = next_start
				end   = next_end
				if isLastTime:
					break
				if end >= pf:
					isLastTime = True

		if currentPercent != 100:
			print '100%'
		else:
			print ''

		# write all output variants for this reference
		if SAVE_VCF:
			for k in sorted(ALL_VARIANTS_OUT.keys()):
				currentRef = refIndex[RI][0]
				myID       = '.'
				myQual     = '.'
				myFilt     = 'PASS'
				# k[0] + 1 because we're going back to 1-based vcf coords
				OFW.writeVCFRecord(currentRef, str(int(k[0])+1), myID, k[1], k[2], myQual, myFilt, k[4])

		#break

	# close output files
	OFW.closeFiles()
	if CANCER:
		OFW_CANCER.closeFiles()

Exemple #2

0

Afficher le fichier

Fichier : genReads.py Projet : meridith-e/neat-genreads

def main():

	# index reference
	refIndex = indexRef(REFERENCE)
	if PAIRED_END:
		N_HANDLING = ('random',FRAGMENT_SIZE)
	else:
		N_HANDLING = ('ignore',READLEN)
	indices_by_refName = {refIndex[n][0]:n for n in xrange(len(refIndex))}

	# parse input variants, if present
	inputVariants = []
	if INPUT_VCF != None:
		if CANCER:
			(sampNames, inputVariants) = parseVCF(INPUT_VCF,tumorNormal=True,ploidy=PLOIDS)
			tumorInd  = sampNames.index('TUMOR')
			normalInd = sampNames.index('NORMAL')
		else:
			(sampNames, inputVariants) = parseVCF(INPUT_VCF,ploidy=PLOIDS)
		for k in sorted(inputVariants.keys()):
			inputVariants[k].sort()

	# parse input targeted regions, if present
	refList      = [n[0] for n in refIndex]
	inputRegions = {}
	if INPUT_BED != None:
		f = open(INPUT_BED,'r')
		for line in f:
			[myChr,pos1,pos2] = line.strip().split('\t')[:3]
			if myChr not in inputRegions:
				inputRegions[myChr] = [-1]
			inputRegions[myChr].extend([int(pos1),int(pos2)])
		f.close()
		# some validation
		nInBedOnly = 0
		nInRefOnly = 0
		for k in refList:
			if k not in inputRegions:
				nInRefOnly += 1
		for k in inputRegions.keys():
			if not k in refList:
				nInBedOnly += 1
				del inputRegions[k]
		if nInRefOnly > 0:
			print 'Warning: Reference contains sequences not found in targeted regions BED file.'
		if nInBedOnly > 0:
			print 'Warning: Targeted regions BED file contains sequence names not found in reference (regions ignored).'
	# parse discard bed similarly
	discardRegions = {}
	if DISCARD_BED != None:
		f = open(DISCARD_BED,'r')
		for line in f:
			[myChr,pos1,pos2] = line.strip().split('\t')[:3]
			if myChr not in discardRegions:
				discardRegions[myChr] = [-1]
			discardRegions[myChr].extend([int(pos1),int(pos2)])
		f.close()

	# parse input mutation rate rescaling regions, if present
	mutRateRegions = {}
	mutRateValues  = {}
	if MUT_BED != None:
		with open(MUT_BED,'r') as f:
			for line in f:
				[myChr,pos1,pos2,metaData] = line.strip().split('\t')[:4]
				mutStr = re.findall(r"MUT_RATE=.*?(?=;)",metaData+';')
				(pos1,pos2) = (int(pos1),int(pos2))
				if len(mutStr) and (pos2-pos1) > 1:
					# mutRate = #_mutations / length_of_region, let's bound it by a reasonable amount
					mutRate = max([0.0,min([float(mutStr[0][9:]),0.3])])
					if myChr not in mutRateRegions:
						mutRateRegions[myChr] = [-1]
						mutRateValues[myChr]  = [0.0]
					mutRateRegions[myChr].extend([pos1,pos2])
					mutRateValues.extend([mutRate*(pos2-pos1)]*2)

	# initialize output files (part I)
	bamHeader = None
	if SAVE_BAM:
		bamHeader = [copy.deepcopy(refIndex)]
	vcfHeader = None
	if SAVE_VCF:
		vcfHeader = [REFERENCE]
	
	# If processing jobs in parallel, precompute the independent regions that can be process separately
	if NJOBS > 1:
		parallelRegionList  = getAllRefRegions(REFERENCE,refIndex,N_HANDLING,saveOutput=SAVE_NON_N)
		(myRefs, myRegions) = partitionRefRegions(parallelRegionList,refIndex,MYJOB,NJOBS)
		if not len(myRegions):
			print 'This job id has no regions to process, exiting...'
			exit(1)
		for i in xrange(len(refIndex)-1,-1,-1):	# delete reference not used in our job
			if not refIndex[i][0] in myRefs:
				del refIndex[i]
		# if value of NJOBS is too high, let's change it to the maximum possible, to avoid output filename confusion
		corrected_nJobs = min([NJOBS,sum([len(n) for n in parallelRegionList.values()])])
	else:
		corrected_nJobs = 1

	# initialize output files (part II)
	if CANCER:
		OFW = OutputFileWriter(OUT_PREFIX+'_normal',paired=PAIRED_END,BAM_header=bamHeader,VCF_header=vcfHeader,gzipped=GZIPPED_OUT,noFASTQ=NO_FASTQ,FASTA_instead=FASTA_INSTEAD)
		OFW_CANCER = OutputFileWriter(OUT_PREFIX+'_tumor',paired=PAIRED_END,BAM_header=bamHeader,VCF_header=vcfHeader,gzipped=GZIPPED_OUT,jobTuple=(MYJOB,corrected_nJobs),noFASTQ=NO_FASTQ,FASTA_instead=FASTA_INSTEAD)
	else:
		OFW = OutputFileWriter(OUT_PREFIX,paired=PAIRED_END,BAM_header=bamHeader,VCF_header=vcfHeader,gzipped=GZIPPED_OUT,jobTuple=(MYJOB,corrected_nJobs),noFASTQ=NO_FASTQ,FASTA_instead=FASTA_INSTEAD)
	OUT_PREFIX_NAME = OUT_PREFIX.split('/')[-1]


	"""************************************************
	****        LET'S GET THIS PARTY STARTED...
	************************************************"""


	readNameCount = 1	# keep track of the number of reads we've sampled, for read-names
	unmapped_records = []

	for RI in xrange(len(refIndex)):

		# read in reference sequence and notate blocks of Ns
		(refSequence,N_regions) = readRef(REFERENCE,refIndex[RI],N_HANDLING)

		# if we're processing jobs in parallel only take the regions relevant for the current job
		if NJOBS > 1:
			for i in xrange(len(N_regions['non_N'])-1,-1,-1):
				if not (refIndex[RI][0],N_regions['non_N'][i][0],N_regions['non_N'][i][1]) in myRegions:
					del N_regions['non_N'][i]

		# count total bp we'll be spanning so we can get an idea of how far along we are (for printing progress indicators)
		total_bp_span   = sum([n[1]-n[0] for n in N_regions['non_N']])
		currentProgress = 0
		currentPercent  = 0
		havePrinted100  = False

		# prune invalid input variants, e.g variants that:
		#		- try to delete or alter any N characters
		#		- don't match the reference base at their specified position
		#		- any alt allele contains anything other than allowed characters
		validVariants = []
		nSkipped = [0,0,0]
		if refIndex[RI][0] in inputVariants:
			for n in inputVariants[refIndex[RI][0]]:
				span = (n[0],n[0]+len(n[1]))
				rseq = str(refSequence[span[0]-1:span[1]-1])	# -1 because going from VCF coords to array coords
				anyBadChr = any((nn not in ALLOWED_NUCL) for nn in [item for sublist in n[2] for item in sublist])
				if rseq != n[1]:
					nSkipped[0] += 1
					continue
				elif 'N' in rseq:
					nSkipped[1] += 1
					continue
				elif anyBadChr:
					nSkipped[2] += 1
					continue
				#if bisect.bisect(N_regions['big'],span[0])%2 or bisect.bisect(N_regions['big'],span[1])%2:
				#	continue
				validVariants.append(n)
			print 'found',len(validVariants),'valid variants for '+refIndex[RI][0]+' in input VCF...'
			if any(nSkipped):
				print sum(nSkipped),'variants skipped...'
				print ' - ['+str(nSkipped[0])+'] ref allele does not match reference'
				print ' - ['+str(nSkipped[1])+'] attempting to insert into N-region'
				print ' - ['+str(nSkipped[2])+'] alt allele contains non-ACGT characters'


		# add large random structural variants
		#
		#	TBD!!!


		# determine sampling windows based on read length, large N regions, and structural mutations.
		# in order to obtain uniform coverage, windows should overlap by:
		#		- READLEN, if single-end reads
		#		- FRAGMENT_SIZE (mean), if paired-end reads
		# ploidy is fixed per large sampling window,
		# coverage distributions due to GC% and targeted regions are specified within these windows
		samplingWindows  = []
		ALL_VARIANTS_OUT = {}
		sequences        = None
		if PAIRED_END:
			targSize = WINDOW_TARGET_SCALE*FRAGMENT_SIZE
			overlap  = FRAGMENT_SIZE
			overlap_minWindowSize = max(FRAGLEN_DISTRIBUTION.values) + 10
		else:
			targSize = WINDOW_TARGET_SCALE*READLEN
			overlap  = READLEN
			overlap_minWindowSize = READLEN + 10

		print '--------------------------------'
		if ONLY_VCF:
			print 'generating vcf...'
		else:
			print 'sampling reads...'
		tt = time.time()

		for i in xrange(len(N_regions['non_N'])):
			(pi,pf) = N_regions['non_N'][i]
			nTargWindows = max([1,(pf-pi)/targSize])
			bpd = int((pf-pi)/float(nTargWindows))
			#bpd += GC_WINDOW_SIZE - bpd%GC_WINDOW_SIZE

			#print len(refSequence), (pi,pf), nTargWindows
			#print structuralVars

			# if for some reason our region is too small to process, skip it! (sorry)
			if nTargWindows == 1 and (pf-pi) < overlap_minWindowSize:
				#print 'Does this ever happen?'
				continue

			start = pi
			end   = min([start+bpd,pf])
			#print '------------------RAWR:', (pi,pf), nTargWindows, bpd
			varsFromPrevOverlap = []
			varsCancerFromPrevOverlap = []
			vindFromPrev = 0
			isLastTime = False
			havePrinted100 = False

			while True:
				
				# which inserted variants are in this window?
				varsInWindow = []
				updated = False
				for j in xrange(vindFromPrev,len(validVariants)):
					vPos = validVariants[j][0]
					if vPos > start and vPos < end:	# update: changed >= to >, so variant cannot be inserted in first position
						varsInWindow.append(tuple([vPos-1]+list(validVariants[j][1:])))	# vcf --> array coords
					if vPos >= end-overlap-1 and updated == False:
						updated = True
						vindFromPrev = j
					if vPos >= end:
						break

				# determine which structural variants will affect our sampling window positions
				structuralVars = []
				for n in varsInWindow:
					bufferNeeded = max([max([abs(len(n[1])-len(alt_allele)),1]) for alt_allele in n[2]]) # change: added abs() so that insertions are also buffered.
					structuralVars.append((n[0]-1,bufferNeeded))	# -1 because going from VCF coords to array coords

				# adjust end-position of window based on inserted structural mutations
				buffer_added = 0
				keepGoing = True
				while keepGoing:
					keepGoing = False
					for n in structuralVars:
						# adding "overlap" here to prevent SVs from being introduced in overlap regions
						# (which can cause problems if random mutations from the previous window land on top of them)
						delta = (end-1) - (n[0] + n[1]) - 2 - overlap
						if delta < 0:
							#print 'DELTA:', delta, 'END:', end, '-->',
							buffer_added = -delta
							end += buffer_added
							####print end
							keepGoing = True
							break
				next_start = end-overlap
				next_end   = min([next_start+bpd,pf])
				if next_end-next_start < bpd:
					end = next_end
					isLastTime = True

				# print progress indicator
				#print 'PROCESSING WINDOW:',(start,end), [buffer_added], 'next:', (next_start,next_end), 'isLastTime:', isLastTime
				currentProgress += end-start
				newPercent = int((currentProgress*100)/float(total_bp_span))
				if newPercent > currentPercent:
					if newPercent <= 99 or (newPercent == 100 and not havePrinted100):
						sys.stdout.write(str(newPercent)+'% ')
						sys.stdout.flush()
					currentPercent = newPercent
					if currentPercent == 100:
						havePrinted100 = True

				skip_this_window = False

				# compute coverage modifiers
				coverage_avg = None
				coverage_dat = [GC_WINDOW_SIZE,GC_SCALE_VAL,[]]
				target_hits  = 0
				if INPUT_BED == None:
					coverage_dat[2] = [1.0]*(end-start)
				else:
					if refIndex[RI][0] not in inputRegions:
						coverage_dat[2] = [OFFTARGET_SCALAR]*(end-start)
					else:
						for j in xrange(start,end):
							if not(bisect.bisect(inputRegions[refIndex[RI][0]],j)%2):
								coverage_dat[2].append(1.0)
								target_hits += 1
							else:
								coverage_dat[2].append(OFFTARGET_SCALAR)

				# offtarget and we're not interested?
				if OFFTARGET_DISCARD and target_hits <= READLEN:
					coverage_avg = 0.0
					skip_this_window = True

				#print len(coverage_dat[2]), sum(coverage_dat[2])
				if sum(coverage_dat[2]) < LOW_COV_THRESH:
					coverage_avg = 0.0
					skip_this_window = True

				# check for small window sizes
				if (end-start) < overlap_minWindowSize:
					skip_this_window = True

				if skip_this_window:
					# skip window, save cpu time
					start = next_start
					end   = next_end
					if isLastTime:
						break
					if end >= pf:
						isLastTime = True
					varsFromPrevOverlap = []
					continue

				# construct sequence data that we will sample reads from
				if sequences == None:
					sequences = SequenceContainer(start,refSequence[start:end],PLOIDS,overlap,READLEN,[MUT_MODEL]*PLOIDS,MUT_RATE,onlyVCF=ONLY_VCF)
				else:
					sequences.update(start,refSequence[start:end],PLOIDS,overlap,READLEN,[MUT_MODEL]*PLOIDS,MUT_RATE)

				# insert variants
				sequences.insert_mutations(varsFromPrevOverlap + varsInWindow)
				all_inserted_variants = sequences.random_mutations()
				#print all_inserted_variants

				# init coverage
				if sum(coverage_dat[2]) >= LOW_COV_THRESH:
					if PAIRED_END:
						coverage_avg = sequences.init_coverage(tuple(coverage_dat),fragDist=FRAGLEN_DISTRIBUTION)
					else:
						coverage_avg = sequences.init_coverage(tuple(coverage_dat))

				# unused cancer stuff
				if CANCER:
					tumor_sequences = SequenceContainer(start,refSequence[start:end],PLOIDS,overlap,READLEN,[CANCER_MODEL]*PLOIDS,MUT_RATE,coverage_dat)
					tumor_sequences.insert_mutations(varsCancerFromPrevOverlap + all_inserted_variants)
					all_cancer_variants = tumor_sequences.random_mutations()

				# which variants do we need to keep for next time (because of window overlap)?
				varsFromPrevOverlap       = []
				varsCancerFromPrevOverlap = []
				for n in all_inserted_variants:
					if n[0] >= end-overlap-1:
						varsFromPrevOverlap.append(n)
				if CANCER:
					for n in all_cancer_variants:
						if n[0] >= end-overlap-1:
							varsCancerFromPrevOverlap.append(n)

				# if we're only producing VCF, no need to go through the hassle of generating reads
				if ONLY_VCF:
					pass
				else:
					windowSpan = end-start
					if PAIRED_END:
						if FORCE_COVERAGE:
							readsToSample = int((windowSpan*float(COVERAGE))/(2*READLEN))+1
						else:
							readsToSample = int((windowSpan*float(COVERAGE)*coverage_avg)/(2*READLEN))+1
					else:
						if FORCE_COVERAGE:
							readsToSample = int((windowSpan*float(COVERAGE))/READLEN)+1
						else:
							readsToSample = int((windowSpan*float(COVERAGE)*coverage_avg)/READLEN)+1

					# if coverage is so low such that no reads are to be sampled, skip region
					#      (i.e., remove buffer of +1 reads we add to every window)
					if readsToSample == 1 and sum(coverage_dat[2]) < LOW_COV_THRESH:
						readsToSample = 0

					# sample reads
					ASDF2_TT = time.time()
					for i in xrange(readsToSample):

						isUnmapped = []
						if PAIRED_END:
							myFraglen = FRAGLEN_DISTRIBUTION.sample()
							myReadData = sequences.sample_read(SE_CLASS,myFraglen)
							if myReadData == None:	# skip if we failed to find a valid position to sample read
								continue
							if myReadData[0][0] == None:
								isUnmapped.append(True)
							else:
								isUnmapped.append(False)
								myReadData[0][0] += start	# adjust mapping position based on window start
							if myReadData[1][0] == None:
								isUnmapped.append(True)
							else:
								isUnmapped.append(False)
								myReadData[1][0] += start
						else:
							myReadData = sequences.sample_read(SE_CLASS)
							if myReadData == None:	# skip if we failed to find a valid position to sample read
								continue
							if myReadData[0][0] == None:	# unmapped read (lives in large insertion)
								isUnmapped = [True]
							else:
								isUnmapped = [False]
								myReadData[0][0] += start	# adjust mapping position based on window start

						# are we discarding offtargets?
						outside_boundaries = []
						if OFFTARGET_DISCARD and INPUT_BED != None:
							outside_boundaries += [bisect.bisect(inputRegions[refIndex[RI][0]],n[0])%2 for n in myReadData]
							outside_boundaries += [bisect.bisect(inputRegions[refIndex[RI][0]],n[0]+len(n[2]))%2 for n in myReadData]
						if DISCARD_BED != None:
							outside_boundaries += [bisect.bisect(discardRegions[refIndex[RI][0]],n[0])%2 for n in myReadData]
							outside_boundaries += [bisect.bisect(discardRegions[refIndex[RI][0]],n[0]+len(n[2]))%2 for n in myReadData]
						if len(outside_boundaries) and any(outside_boundaries):
							continue

						if NJOBS > 1:
							myReadName = OUT_PREFIX_NAME+'-j'+str(MYJOB)+'-'+refIndex[RI][0]+'-r'+str(readNameCount)
						else:
							myReadName = OUT_PREFIX_NAME+'-'+refIndex[RI][0]+'-'+str(readNameCount)
						readNameCount += len(myReadData)

						# if desired, replace all low-quality bases with Ns
						if N_MAX_QUAL > -1:
							for j in xrange(len(myReadData)):
								myReadString = [n for n in myReadData[j][2]]
								for k in xrange(len(myReadData[j][3])):
									adjusted_qual = ord(myReadData[j][3][k])-SE_CLASS.offQ
									if adjusted_qual <= N_MAX_QUAL:
										myReadString[k] = 'N'
								myReadData[j][2] = ''.join(myReadString)

						# flip a coin, are we forward or reverse strand?
						isForward = (random.random() < 0.5)

						# if read (or read + mate for PE) are unmapped, put them at end of bam file
						if all(isUnmapped):
							if PAIRED_END:
								if isForward:
									flag1 = sam_flag(['paired','unmapped','mate_unmapped','first','mate_reverse'])
									flag2 = sam_flag(['paired','unmapped','mate_unmapped','second','reverse'])
								else:
									flag1 = sam_flag(['paired','unmapped','mate_unmapped','second','mate_reverse'])
									flag2 = sam_flag(['paired','unmapped','mate_unmapped','first','reverse'])
								unmapped_records.append((myReadName+'/1',myReadData[0],flag1))
								unmapped_records.append((myReadName+'/2',myReadData[1],flag2))
							else:
								flag1 = sam_flag(['unmapped'])
								unmapped_records.append((myReadName+'/1',myReadData[0],flag1))

						myRefIndex = indices_by_refName[refIndex[RI][0]]
						
						#
						# write SE output
						#
						if len(myReadData) == 1:
							if NO_FASTQ != True:
								if isForward:
									OFW.writeFASTQRecord(myReadName,myReadData[0][2],myReadData[0][3])
								else:
									OFW.writeFASTQRecord(myReadName,RC(myReadData[0][2]),myReadData[0][3][::-1])
							if SAVE_BAM:
								if isUnmapped[0] == False:
									if isForward:
										flag1 = 0
										OFW.writeBAMRecord(myRefIndex, myReadName, myReadData[0][0], myReadData[0][1], myReadData[0][2], myReadData[0][3], samFlag=flag1)
									else:
										flag1 = sam_flag(['reverse'])
										OFW.writeBAMRecord(myRefIndex, myReadName, myReadData[0][0], myReadData[0][1], myReadData[0][2], myReadData[0][3], samFlag=flag1)
						#
						# write PE output
						#
						elif len(myReadData) == 2:
							if NO_FASTQ != True:
								OFW.writeFASTQRecord(myReadName,myReadData[0][2],myReadData[0][3],read2=myReadData[1][2],qual2=myReadData[1][3],orientation=isForward)
							if SAVE_BAM:
								if isUnmapped[0] == False and isUnmapped[1] == False:
									if isForward:
										flag1 = sam_flag(['paired','proper','first','mate_reverse'])
										flag2 = sam_flag(['paired','proper','second','reverse'])
									else:
										flag1 = sam_flag(['paired','proper','second','mate_reverse'])
										flag2 = sam_flag(['paired','proper','first','reverse'])
									OFW.writeBAMRecord(myRefIndex, myReadName, myReadData[0][0], myReadData[0][1], myReadData[0][2], myReadData[0][3], samFlag=flag1, matePos=myReadData[1][0])
									OFW.writeBAMRecord(myRefIndex, myReadName, myReadData[1][0], myReadData[1][1], myReadData[1][2], myReadData[1][3], samFlag=flag2, matePos=myReadData[0][0])
								elif isUnmapped[0] == False and isUnmapped[1] == True:
									if isForward:
										flag1 = sam_flag(['paired','first', 'mate_unmapped', 'mate_reverse'])
										flag2 = sam_flag(['paired','second', 'unmapped', 'reverse'])
									else:
										flag1 = sam_flag(['paired','second', 'mate_unmapped', 'mate_reverse'])
										flag2 = sam_flag(['paired','first', 'unmapped', 'reverse'])
									OFW.writeBAMRecord(myRefIndex, myReadName, myReadData[0][0], myReadData[0][1], myReadData[0][2], myReadData[0][3], samFlag=flag1, matePos=myReadData[0][0])
									OFW.writeBAMRecord(myRefIndex, myReadName, myReadData[0][0], myReadData[1][1], myReadData[1][2], myReadData[1][3], samFlag=flag2, matePos=myReadData[0][0], alnMapQual=0)
								elif isUnmapped[0] == True and isUnmapped[1] == False:
									if isForward:
										flag1 = sam_flag(['paired','first', 'unmapped', 'mate_reverse'])
										flag2 = sam_flag(['paired','second', 'mate_unmapped', 'reverse'])
									else:
										flag1 = sam_flag(['paired','second', 'unmapped', 'mate_reverse'])
										flag2 = sam_flag(['paired','first', 'mate_unmapped', 'reverse'])
									OFW.writeBAMRecord(myRefIndex, myReadName, myReadData[1][0], myReadData[0][1], myReadData[0][2], myReadData[0][3], samFlag=flag1, matePos=myReadData[1][0], alnMapQual=0)
									OFW.writeBAMRecord(myRefIndex, myReadName, myReadData[1][0], myReadData[1][1], myReadData[1][2], myReadData[1][3], samFlag=flag2, matePos=myReadData[1][0])
						else:
							print '\nError: Unexpected number of reads generated...\n'
							exit(1)
					#print 'READS:',time.time()-ASDF2_TT

					if not isLastTime:
						OFW.flushBuffers(bamMax=next_start)
					else:
						OFW.flushBuffers(bamMax=end+1)

				# tally up all the variants that got successfully introduced
				for n in all_inserted_variants:
					ALL_VARIANTS_OUT[n] = True

				# prepare indices of next window
				start = next_start
				end   = next_end
				if isLastTime:
					break
				if end >= pf:
					isLastTime = True

		if currentPercent != 100 and not havePrinted100:
			print '100%'
		else:
			print ''
		if ONLY_VCF:
			print 'VCF generation completed in',
		else:
			print 'Read sampling completed in',
		print int(time.time()-tt),'(sec)'

		# write all output variants for this reference
		if SAVE_VCF:
			print 'Writing output VCF...'
			for k in sorted(ALL_VARIANTS_OUT.keys()):
				currentRef = refIndex[RI][0]
				myID       = '.'
				myQual     = '.'
				myFilt     = 'PASS'
				# k[0] + 1 because we're going back to 1-based vcf coords
				OFW.writeVCFRecord(currentRef, str(int(k[0])+1), myID, k[1], k[2], myQual, myFilt, k[4])

		#break

	# write unmapped reads to bam file
	if SAVE_BAM and len(unmapped_records):
		print 'writing unmapped reads to bam file...'
		for umr in unmapped_records:
			if PAIRED_END:
				OFW.writeBAMRecord(-1, umr[0], 0, umr[1][1], umr[1][2], umr[1][3], samFlag=umr[2], matePos=0, alnMapQual=0)
			else:
				OFW.writeBAMRecord(-1, umr[0], 0, umr[1][1], umr[1][2], umr[1][3], samFlag=umr[2], alnMapQual=0)

	# close output files
	OFW.closeFiles()
	if CANCER:
		OFW_CANCER.closeFiles()

Exemple #3

0

Afficher le fichier

Fichier : genReads.py Projet : zstephens/neat-genreads

def main():

	# index reference
	refIndex = indexRef(REFERENCE)
	if PAIRED_END:
		N_HANDLING = ('random',FRAGMENT_SIZE)
	else:
		N_HANDLING = ('ignore',READLEN)
	indices_by_refName = {refIndex[n][0]:n for n in xrange(len(refIndex))}

	# parse input variants, if present
	inputVariants = []
	if INPUT_VCF != None:
		if CANCER:
			(sampNames, inputVariants) = parseVCF(INPUT_VCF,tumorNormal=True,ploidy=PLOIDS)
			tumorInd  = sampNames.index('TUMOR')
			normalInd = sampNames.index('NORMAL')
		else:
			(sampNames, inputVariants) = parseVCF(INPUT_VCF,ploidy=PLOIDS)
		for k in sorted(inputVariants.keys()):
			inputVariants[k].sort()

	# parse input targeted regions, if present
	inputRegions = {}
	if INPUT_BED != None:
		with open(INPUT_BED,'r') as f:
			for line in f:
				[myChr,pos1,pos2] = line.strip().split('\t')[:3]
				if myChr not in inputRegions:
					inputRegions[myChr] = [-1]
				inputRegions[myChr].extend([int(pos1),int(pos2)])

	# parse input mutation rate rescaling regions, if present
	mutRateRegions = {}
	mutRateValues  = {}
	if MUT_BED != None:
		with open(MUT_BED,'r') as f:
			for line in f:
				[myChr,pos1,pos2,metaData] = line.strip().split('\t')[:4]
				mutStr = re.findall(r"MUT_RATE=.*?(?=;)",metaData+';')
				(pos1,pos2) = (int(pos1),int(pos2))
				if len(mutStr) and (pos2-pos1) > 1:
					# mutRate = #_mutations / length_of_region, let's bound it by a reasonable amount
					mutRate = max([0.0,min([float(mutStr[0][9:]),0.3])])
					if myChr not in mutRateRegions:
						mutRateRegions[myChr] = [-1]
						mutRateValues[myChr]  = [0.0]
					mutRateRegions[myChr].extend([pos1,pos2])
					mutRateValues.extend([mutRate*(pos2-pos1)]*2)

	# initialize output files (part I)
	bamHeader = None
	if SAVE_BAM:
		bamHeader = [copy.deepcopy(refIndex)]
	vcfHeader = None
	if SAVE_VCF:
		vcfHeader = [REFERENCE]
	
	# If processing jobs in parallel, precompute the independent regions that can be process separately
	if NJOBS > 1:
		parallelRegionList  = getAllRefRegions(REFERENCE,refIndex,N_HANDLING,saveOutput=SAVE_NON_N)
		(myRefs, myRegions) = partitionRefRegions(parallelRegionList,refIndex,MYJOB,NJOBS)
		if not len(myRegions):
			print 'This job id has no regions to process, exiting...'
			exit(1)
		for i in xrange(len(refIndex)-1,-1,-1):	# delete reference not used in our job
			if not refIndex[i][0] in myRefs:
				del refIndex[i]
		# if value of NJOBS is too high, let's change it to the maximum possible, to avoid output filename confusion
		corrected_nJobs = min([NJOBS,sum([len(n) for n in parallelRegionList.values()])])
	else:
		corrected_nJobs = 1

	# initialize output files (part II)
	if CANCER:
		OFW = OutputFileWriter(OUT_PREFIX+'_normal',paired=PAIRED_END,BAM_header=bamHeader,VCF_header=vcfHeader,gzipped=GZIPPED_OUT)
		OFW_CANCER = OutputFileWriter(OUT_PREFIX+'_tumor',paired=PAIRED_END,BAM_header=bamHeader,VCF_header=vcfHeader,gzipped=GZIPPED_OUT,jobTuple=(MYJOB,corrected_nJobs))
	else:
		OFW = OutputFileWriter(OUT_PREFIX,paired=PAIRED_END,BAM_header=bamHeader,VCF_header=vcfHeader,gzipped=GZIPPED_OUT,jobTuple=(MYJOB,corrected_nJobs))
	OUT_PREFIX_NAME = OUT_PREFIX.split('/')[-1]


	"""************************************************
	****        LET'S GET THIS PARTY STARTED...
	************************************************"""


	readNameCount = 1	# keep track of the number of reads we've sampled, for read-names

	for RI in xrange(len(refIndex)):

		# read in reference sequence and notate blocks of Ns
		(refSequence,N_regions) = readRef(REFERENCE,refIndex[RI],N_HANDLING)

		# if we're processing jobs in parallel only take the regions relevant for the current job
		if NJOBS > 1:
			for i in xrange(len(N_regions['non_N'])-1,-1,-1):
				if not (refIndex[RI][0],N_regions['non_N'][i][0],N_regions['non_N'][i][1]) in myRegions:
					del N_regions['non_N'][i]

		# count total bp we'll be spanning so we can get an idea of how far along we are (for printing progress indicators)
		total_bp_span   = sum([n[1]-n[0] for n in N_regions['non_N']])
		currentProgress = 0
		currentPercent  = 0

		# prune invalid input variants, e.g variants that:
		#		- try to delete or alter any N characters
		#		- don't match the reference base at their specified position
		#		- any alt allele contains anything other than allowed characters
		validVariants = []
		nSkipped = [0,0,0]
		if refIndex[RI][0] in inputVariants:
			for n in inputVariants[refIndex[RI][0]]:
				span = (n[0],n[0]+len(n[1]))
				rseq = str(refSequence[span[0]-1:span[1]-1])	# -1 because going from VCF coords to array coords
				anyBadChr = any((nn not in ALLOWED_NUCL) for nn in [item for sublist in n[2] for item in sublist])
				if rseq != n[1]:
					nSkipped[0] += 1
					continue
				elif 'N' in rseq:
					nSkipped[1] += 1
					continue
				elif anyBadChr:
					nSkipped[2] += 1
					continue
				#if bisect.bisect(N_regions['big'],span[0])%2 or bisect.bisect(N_regions['big'],span[1])%2:
				#	continue
				validVariants.append(n)
			print 'found',len(validVariants),'valid variants for '+refIndex[RI][0]+' in input VCF...'
			if any(nSkipped):
				print sum(nSkipped),'variants skipped...'
				print ' - ['+str(nSkipped[0])+'] ref allele does not match reference'
				print ' - ['+str(nSkipped[1])+'] attempting to insert into N-region'
				print ' - ['+str(nSkipped[2])+'] alt allele contains non-ACGT characters'

		# add large random structural variants
		#
		#	TBD!!!

		# determine which structural variants will affect our sampling window positions
		structuralVars = []
		for n in validVariants:
			bufferNeeded = max([max([len(n[1])-len(alt_allele),1]) for alt_allele in n[2]])
			structuralVars.append((n[0]-1,bufferNeeded))	# -1 because going from VCF coords to array coords

		# determine sampling windows based on read length, large N regions, and structural mutations.
		# in order to obtain uniform coverage, windows should overlap by:
		#		- READLEN, if single-end reads
		#		- FRAGMENT_SIZE (mean), if paired-end reads
		# ploidy is fixed per large sampling window,
		# coverage distributions due to GC% and targeted regions are specified within these windows
		samplingWindows  = []
		ALL_VARIANTS_OUT = {}
		sequences        = None
		if PAIRED_END:
			targSize = WINDOW_TARGET_SCALE*FRAGMENT_SIZE
			overlap  = FRAGMENT_SIZE
		else:
			targSize = WINDOW_TARGET_SCALE*READLEN
			overlap  = READLEN

		print '--------------------------------'
		print 'sampling reads...'
		for i in xrange(len(N_regions['non_N'])):
			(pi,pf) = N_regions['non_N'][i]
			nTargWindows = max([1,(pf-pi)/targSize])
			bpd = int((pf-pi)/float(nTargWindows))
			bpd += GC_WINDOW_SIZE - bpd%GC_WINDOW_SIZE

			#print len(refSequence), (pi,pf), nTargWindows
			#print structuralVars

			# if for some reason our region is too small to process, skip it! (sorry)
			if nTargWindows == 1 and (pf-pi) < overlap-1:
				#print 'Does this ever happen?'
				continue

			start = pi
			end   = min([start+bpd,pf])
			#print '------------------RAWR:', (pi,pf), bpd
			currentVariantInd = 0
			varsFromPrevOverlap = []
			varsCancerFromPrevOverlap = []
			vindFromPrev = 0
			isLastTime = False
			while True:
				####print (start,end)
				# adjust end-position of window based on inserted structural mutations
				relevantVars = []
				if len(structuralVars) and currentVariantInd < len(structuralVars):
					prevVarInd = currentVariantInd
					while structuralVars[currentVariantInd][0] <= end:
						delta = (end-1) - (structuralVars[currentVariantInd][0] + structuralVars[currentVariantInd][1])
						if delta <= 0:
							####print 'DELTA:', delta
							end -= (delta-1)
						currentVariantInd += 1
						if currentVariantInd == len(structuralVars):
							break
					relevantVars = structuralVars[prevVarInd:currentVariantInd]
				next_start = end-overlap
				next_end   = min([next_start+bpd,pf])
				if next_end-next_start < bpd:
					end = next_end
					isLastTime = True

				# print progress indicator
				#print 'PROCESSING WINDOW:',(start,end)
				currentProgress += end-start
				newPercent = int((currentProgress*100)/float(total_bp_span))
				if newPercent > currentPercent:
					sys.stdout.write(str(newPercent)+'% ')
					sys.stdout.flush()
					currentPercent = newPercent

				# which inserted variants are in this window?
				varsInWindow = []
				updated = False
				for j in xrange(vindFromPrev,len(validVariants)):
					vPos = validVariants[j][0]
					if vPos >= start and vPos < end:
						varsInWindow.append(tuple([vPos-1]+list(validVariants[j][1:])))	# vcf --> array coords
					if vPos >= end-overlap-1 and updated == False:
						updated = True
						vindFromPrev = j
					if vPos >= end:
						break

				# if computing only VCF, we can skip this...
				if ONLY_VCF:
					coverage_dat = None
					coverage_avg = None
				else:
					# pre-compute gc-bias and targeted sequencing coverage modifiers
					nSubWindows  = (end-start)/GC_WINDOW_SIZE
					coverage_dat = (GC_WINDOW_SIZE,[])
					for j in xrange(nSubWindows):
						rInd = start + j*GC_WINDOW_SIZE
						if INPUT_BED == None: tCov = True
						else: tCov = not(bisect.bisect(inputRegions[myChr],rInd)%2) or not(bisect.bisect(inputRegions[myChr],rInd+GC_WINDOW_SIZE)%2)
						if tCov: tScl = 1.0
						else: tScl = OFFTARGET_SCALAR
						gc_v = refSequence[rInd:rInd+GC_WINDOW_SIZE].count('G') + refSequence[rInd:rInd+GC_WINDOW_SIZE].count('C')
						gScl = GC_SCALE_VAL[gc_v]
						coverage_dat[1].append(1.0*tScl*gScl)
					coverage_avg = np.mean(coverage_dat[1])

				# pre-compute mutation rate tracks
				# PROVIDED MUTATION RATES OVERRIDE AVERAGE VALUE

				# construct sequence data that we will sample reads from
				if sequences == None:
					sequences = SequenceContainer(start,refSequence[start:end],PLOIDS,overlap,READLEN,[MUT_MODEL]*PLOIDS,MUT_RATE,coverage_dat,onlyVCF=ONLY_VCF)
				else:
					sequences.update(start,refSequence[start:end],PLOIDS,overlap,READLEN,[MUT_MODEL]*PLOIDS,MUT_RATE,coverage_dat)

				# adjust position of all inserted variants to match current window offset
				#variants_to_insert = []
				#for n in varsFromPrevOverlap:
				#	ln = [n[0]-start] + list(n[1:])
				#	variants_to_insert.append(tuple(ln))
				#for n in varsInWindow:
				#	ln = [n[0]-start] + list(n[1:])
				#	variants_to_insert.append(tuple(ln))
				#sequences.insert_mutations(variants_to_insert)
				sequences.insert_mutations(varsFromPrevOverlap + varsInWindow)
				all_inserted_variants = sequences.random_mutations()
				#print all_inserted_variants

				if CANCER:
					tumor_sequences = SequenceContainer(start,refSequence[start:end],PLOIDS,overlap,READLEN,[CANCER_MODEL]*PLOIDS,MUT_RATE,coverage_dat)
					tumor_sequences.insert_mutations(varsCancerFromPrevOverlap + all_inserted_variants)
					all_cancer_variants = tumor_sequences.random_mutations()

				# which variants do we need to keep for next time (because of window overlap)?
				varsFromPrevOverlap       = []
				varsCancerFromPrevOverlap = []
				for n in all_inserted_variants:
					if n[0] >= end-overlap-1:
						varsFromPrevOverlap.append(n)
				if CANCER:
					for n in all_cancer_variants:
						if n[0] >= end-overlap-1:
							varsCancerFromPrevOverlap.append(n)

				# if we're only producing VCF, no need to go through the hassle of generating reads
				if ONLY_VCF:
					pass
				else:
					# for each sampling window, construct sub-windows with coverage information
					covWindows = [COVERAGE for n in xrange((end-start)/SMALL_WINDOW)]
					if (end-start)%SMALL_WINDOW:
						covWindows.append(COVERAGE)
					meanCov = sum(covWindows)/float(len(covWindows))
					if PAIRED_END:
						readsToSample = int(((end-start)*meanCov*coverage_avg)/(2*READLEN))+1
					else:
						readsToSample = int(((end-start)*meanCov*coverage_avg)/(READLEN))+1

					# sample reads from altered reference
					for i in xrange(readsToSample):

						if PAIRED_END:
							myFraglen = FRAGLEN_DISTRIBUTION.sample()
							myReadData = sequences.sample_read(SE_CLASS,myFraglen)
							myReadData[0][0] += start	# adjust mapping position based on window start
							myReadData[1][0] += start
						else:
							myReadData = sequences.sample_read(SE_CLASS)
							myReadData[0][0] += start	# adjust mapping position based on window start
					
						if NJOBS > 1:
							myReadName = OUT_PREFIX_NAME+'-j'+str(MYJOB)+'-'+refIndex[RI][0]+'-r'+str(readNameCount)
						else:
							myReadName = OUT_PREFIX_NAME+'-'+refIndex[RI][0]+'-'+str(readNameCount)
						readNameCount += len(myReadData)

						# if desired, replace all low-quality bases with Ns
						if N_MAX_QUAL > -1:
							for j in xrange(len(myReadData)):
								myReadString = [n for n in myReadData[j][2]]
								for k in xrange(len(myReadData[j][3])):
									adjusted_qual = ord(myReadData[j][3][k])-SE_CLASS.offQ
									if adjusted_qual <= N_MAX_QUAL:
										myReadString[k] = 'N'
								myReadData[j][2] = ''.join(myReadString)

						# write read data out to FASTQ and BAM files, bypass FASTQ if option specified
						myRefIndex = indices_by_refName[refIndex[RI][0]]
						if len(myReadData) == 1:
							if NO_FASTQ != True:
								OFW.writeFASTQRecord(myReadName,myReadData[0][2],myReadData[0][3])
							if SAVE_BAM:
								OFW.writeBAMRecord(myRefIndex, myReadName+'/1', myReadData[0][0], myReadData[0][1], myReadData[0][2], myReadData[0][3], samFlag=0)
						elif len(myReadData) == 2:
							if NO_FASTQ != True:
								OFW.writeFASTQRecord(myReadName,myReadData[0][2],myReadData[0][3],read2=myReadData[1][2],qual2=myReadData[1][3])
							if SAVE_BAM:
								OFW.writeBAMRecord(myRefIndex, myReadName+'/1', myReadData[0][0], myReadData[0][1], myReadData[0][2], myReadData[0][3], samFlag=99,  matePos=myReadData[1][0])
								OFW.writeBAMRecord(myRefIndex, myReadName+'/2', myReadData[1][0], myReadData[1][1], myReadData[1][2], myReadData[1][3], samFlag=147, matePos=myReadData[0][0])
						else:
							print '\nError: Unexpected number of reads generated...\n'
							exit(1)

				# tally up all the variants that got successfully introduced
				for n in all_inserted_variants:
					ALL_VARIANTS_OUT[n] = True

				# prepare indices of next window
				start = next_start
				end   = next_end
				if isLastTime:
					break
				if end >= pf:
					isLastTime = True

		if currentPercent != 100:
			print '100%'
		else:
			print ''

		# write all output variants for this reference
		if SAVE_VCF:
			for k in sorted(ALL_VARIANTS_OUT.keys()):
				currentRef = refIndex[RI][0]
				myID       = '.'
				myQual     = '.'
				myFilt     = 'PASS'
				# k[0] + 1 because we're going back to 1-based vcf coords
				OFW.writeVCFRecord(currentRef, str(int(k[0])+1), myID, k[1], k[2], myQual, myFilt, k[4])

		#break

	# close output files
	OFW.closeFiles()
	if CANCER:
		OFW_CANCER.closeFiles()