def main(): # index reference refIndex = indexRef(REFERENCE) if PAIRED_END: N_HANDLING = ('random',FRAGMENT_SIZE) else: N_HANDLING = ('ignore',READLEN) indices_by_refName = {refIndex[n][0]:n for n in xrange(len(refIndex))} # parse input variants, if present inputVariants = [] if INPUT_VCF != None: if CANCER: (sampNames, inputVariants) = parseVCF(INPUT_VCF,tumorNormal=True,ploidy=PLOIDS) tumorInd = sampNames.index('TUMOR') normalInd = sampNames.index('NORMAL') else: (sampNames, inputVariants) = parseVCF(INPUT_VCF,ploidy=PLOIDS) for k in sorted(inputVariants.keys()): inputVariants[k].sort() # parse input targeted regions, if present inputRegions = {} if INPUT_BED != None: with open(INPUT_BED,'r') as f: for line in f: [myChr,pos1,pos2] = line.strip().split('\t')[:3] if myChr not in inputRegions: inputRegions[myChr] = [-1] inputRegions[myChr].extend([int(pos1),int(pos2)]) # parse input mutation rate rescaling regions, if present mutRateRegions = {} mutRateValues = {} if MUT_BED != None: with open(MUT_BED,'r') as f: for line in f: [myChr,pos1,pos2,metaData] = line.strip().split('\t')[:4] mutStr = re.findall(r"MUT_RATE=.*?(?=;)",metaData+';') (pos1,pos2) = (int(pos1),int(pos2)) if len(mutStr) and (pos2-pos1) > 1: # mutRate = #_mutations / length_of_region, let's bound it by a reasonable amount mutRate = max([0.0,min([float(mutStr[0][9:]),0.3])]) if myChr not in inputRegions: mutRateRegions[myChr] = [-1] mutRateValues[myChr] = [0.0] mutRateRegions[myChr].extend([pos1,pos2]) mutRateValues.extend([mutRate*(pos2-pos1)]*2) # initialize output files (part I) bamHeader = None if SAVE_BAM: bamHeader = [copy.deepcopy(refIndex)] vcfHeader = None if SAVE_VCF: vcfHeader = [REFERENCE] # If processing jobs in parallel, precompute the independent regions that can be process separately if NJOBS > 1: parallelRegionList = getAllRefRegions(REFERENCE,refIndex,N_HANDLING,saveOutput=SAVE_NON_N) (myRefs, myRegions) = partitionRefRegions(parallelRegionList,refIndex,MYJOB,NJOBS) if not len(myRegions): print 'This job id has no regions to process, exiting...' exit(1) for i in xrange(len(refIndex)-1,-1,-1): # delete reference not used in our job if not refIndex[i][0] in myRefs: del refIndex[i] # if value of NJOBS is too high, let's change it to the maximum possible, to avoid output filename confusion corrected_nJobs = min([NJOBS,sum([len(n) for n in parallelRegionList.values()])]) else: corrected_nJobs = 1 # initialize output files (part II) if CANCER: OFW = OutputFileWriter(OUT_PREFIX+'_normal',paired=PAIRED_END,BAM_header=bamHeader,VCF_header=vcfHeader,gzipped=GZIPPED_OUT) OFW_CANCER = OutputFileWriter(OUT_PREFIX+'_tumor',paired=PAIRED_END,BAM_header=bamHeader,VCF_header=vcfHeader,gzipped=GZIPPED_OUT,jobTuple=(MYJOB,corrected_nJobs)) else: OFW = OutputFileWriter(OUT_PREFIX,paired=PAIRED_END,BAM_header=bamHeader,VCF_header=vcfHeader,gzipped=GZIPPED_OUT,jobTuple=(MYJOB,corrected_nJobs)) OUT_PREFIX_NAME = OUT_PREFIX.split('/')[-1] """************************************************ **** LET'S GET THIS PARTY STARTED... ************************************************""" readNameCount = 1 # keep track of the number of reads we've sampled, for read-names for RI in xrange(len(refIndex)): # read in reference sequence and notate blocks of Ns (refSequence,N_regions) = readRef(REFERENCE,refIndex[RI],N_HANDLING) # if we're processing jobs in parallel only take the regions relevant for the current job if NJOBS > 1: for i in xrange(len(N_regions['non_N'])-1,-1,-1): if not (refIndex[RI][0],N_regions['non_N'][i][0],N_regions['non_N'][i][1]) in myRegions: del N_regions['non_N'][i] # count total bp we'll be spanning so we can get an idea of how far along we are (for printing progress indicators) total_bp_span = sum([n[1]-n[0] for n in N_regions['non_N']]) currentProgress = 0 currentPercent = 0 # prune invalid input variants, e.g variants that: # - try to delete or alter any N characters # - don't match the reference base at their specified position # - any alt allele contains anything other than allowed characters validVariants = [] nSkipped = [0,0,0] if refIndex[RI][0] in inputVariants: for n in inputVariants[refIndex[RI][0]]: span = (n[0],n[0]+len(n[1])) rseq = str(refSequence[span[0]-1:span[1]-1]) # -1 because going from VCF coords to array coords anyBadChr = any((nn not in ALLOWED_NUCL) for nn in [item for sublist in n[2] for item in sublist]) if rseq != n[1]: nSkipped[0] += 1 continue elif 'N' in rseq: nSkipped[1] += 1 continue elif anyBadChr: nSkipped[2] += 1 continue #if bisect.bisect(N_regions['big'],span[0])%2 or bisect.bisect(N_regions['big'],span[1])%2: # continue validVariants.append(n) print 'found',len(validVariants),'valid variants for '+refIndex[RI][0]+' in input VCF...' if any(nSkipped): print sum(nSkipped),'variants skipped...' print ' - ['+str(nSkipped[0])+'] ref allele does not match reference' print ' - ['+str(nSkipped[1])+'] attempting to insert into N-region' print ' - ['+str(nSkipped[2])+'] alt allele contains non-ACGT characters' # add large random structural variants # # TBD!!! # determine which structural variants will affect our sampling window positions structuralVars = [] for n in validVariants: bufferNeeded = max([max([len(n[1])-len(alt_allele),1]) for alt_allele in n[2]]) structuralVars.append((n[0]-1,bufferNeeded)) # -1 because going from VCF coords to array coords # determine sampling windows based on read length, large N regions, and structural mutations. # in order to obtain uniform coverage, windows should overlap by: # - READLEN, if single-end reads # - FRAGMENT_SIZE (mean), if paired-end reads # ploidy is fixed per large sampling window, # coverage distributions due to GC% and targeted regions are specified within these windows samplingWindows = [] ALL_VARIANTS_OUT = {} sequences = None if PAIRED_END: targSize = WINDOW_TARGET_SCALE*FRAGMENT_SIZE overlap = FRAGMENT_SIZE else: targSize = WINDOW_TARGET_SCALE*READLEN overlap = READLEN print '--------------------------------' print 'sampling reads...' for i in xrange(len(N_regions['non_N'])): (pi,pf) = N_regions['non_N'][i] nTargWindows = max([1,(pf-pi)/targSize]) bpd = int((pf-pi)/float(nTargWindows)) bpd += GC_WINDOW_SIZE - bpd%GC_WINDOW_SIZE #print len(refSequence), (pi,pf), nTargWindows #print structuralVars # if for some reason our region is too small to process, skip it! (sorry) if nTargWindows == 1 and (pf-pi) < overlap-1: #print 'Does this ever happen?' continue start = pi end = min([start+bpd,pf]) #print '------------------RAWR:', (pi,pf), bpd currentVariantInd = 0 varsFromPrevOverlap = [] varsCancerFromPrevOverlap = [] vindFromPrev = 0 isLastTime = False while True: ####print (start,end) # adjust end-position of window based on inserted structural mutations relevantVars = [] if len(structuralVars) and currentVariantInd < len(structuralVars): prevVarInd = currentVariantInd while structuralVars[currentVariantInd][0] <= end: delta = (end-1) - (structuralVars[currentVariantInd][0] + structuralVars[currentVariantInd][1]) if delta <= 0: ####print 'DELTA:', delta end -= (delta-1) currentVariantInd += 1 if currentVariantInd == len(structuralVars): break relevantVars = structuralVars[prevVarInd:currentVariantInd] next_start = end-overlap next_end = min([next_start+bpd,pf]) if next_end-next_start < bpd: end = next_end isLastTime = True # print progress indicator #print 'PROCESSING WINDOW:',(start,end) currentProgress += end-start newPercent = int((currentProgress*100)/float(total_bp_span)) if newPercent > currentPercent: sys.stdout.write(str(newPercent)+'% ') sys.stdout.flush() currentPercent = newPercent # which inserted variants are in this window? varsInWindow = [] updated = False for j in xrange(vindFromPrev,len(validVariants)): vPos = validVariants[j][0] if vPos >= start and vPos < end: varsInWindow.append(tuple([vPos-1]+list(validVariants[j][1:]))) # vcf --> array coords if vPos >= end-overlap-1 and updated == False: updated = True vindFromPrev = j if vPos >= end: break # if computing only VCF, we can skip this... if ONLY_VCF: coverage_dat = None coverage_avg = None else: # pre-compute gc-bias and targeted sequencing coverage modifiers nSubWindows = (end-start)/GC_WINDOW_SIZE coverage_dat = (GC_WINDOW_SIZE,[]) for j in xrange(nSubWindows): rInd = start + j*GC_WINDOW_SIZE if INPUT_BED == None: tCov = True else: tCov = not(bisect.bisect(inputRegions[myChr],rInd)%2) or not(bisect.bisect(inputRegions[myChr],rInd+GC_WINDOW_SIZE)%2) if tCov: tScl = 1.0 else: tScl = OFFTARGET_SCALAR gc_v = refSequence[rInd:rInd+GC_WINDOW_SIZE].count('G') + refSequence[rInd:rInd+GC_WINDOW_SIZE].count('C') gScl = GC_SCALE_VAL[gc_v] coverage_dat[1].append(1.0*tScl*gScl) coverage_avg = np.mean(coverage_dat[1]) # pre-compute mutation rate tracks # PROVIDED MUTATION RATES OVERRIDE AVERAGE VALUE # construct sequence data that we will sample reads from if sequences == None: sequences = SequenceContainer(start,refSequence[start:end],PLOIDS,overlap,READLEN,[MUT_MODEL]*PLOIDS,MUT_RATE,coverage_dat,onlyVCF=ONLY_VCF) else: sequences.update(start,refSequence[start:end],PLOIDS,overlap,READLEN,[MUT_MODEL]*PLOIDS,MUT_RATE,coverage_dat) # adjust position of all inserted variants to match current window offset #variants_to_insert = [] #for n in varsFromPrevOverlap: # ln = [n[0]-start] + list(n[1:]) # variants_to_insert.append(tuple(ln)) #for n in varsInWindow: # ln = [n[0]-start] + list(n[1:]) # variants_to_insert.append(tuple(ln)) #sequences.insert_mutations(variants_to_insert) sequences.insert_mutations(varsFromPrevOverlap + varsInWindow) all_inserted_variants = sequences.random_mutations() #print all_inserted_variants if CANCER: tumor_sequences = SequenceContainer(start,refSequence[start:end],PLOIDS,overlap,READLEN,[CANCER_MODEL]*PLOIDS,MUT_RATE,coverage_dat) tumor_sequences.insert_mutations(varsCancerFromPrevOverlap + all_inserted_variants) all_cancer_variants = tumor_sequences.random_mutations() # which variants do we need to keep for next time (because of window overlap)? varsFromPrevOverlap = [] varsCancerFromPrevOverlap = [] for n in all_inserted_variants: if n[0] >= end-overlap-1: varsFromPrevOverlap.append(n) if CANCER: for n in all_cancer_variants: if n[0] >= end-overlap-1: varsCancerFromPrevOverlap.append(n) # if we're only producing VCF, no need to go through the hassle of generating reads if ONLY_VCF: pass else: # for each sampling window, construct sub-windows with coverage information covWindows = [COVERAGE for n in xrange((end-start)/SMALL_WINDOW)] if (end-start)%SMALL_WINDOW: covWindows.append(COVERAGE) meanCov = sum(covWindows)/float(len(covWindows)) if PAIRED_END: readsToSample = int(((end-start)*meanCov*coverage_avg)/(2*READLEN))+1 else: readsToSample = int(((end-start)*meanCov*coverage_avg)/(READLEN))+1 # sample reads from altered reference for i in xrange(readsToSample): if PAIRED_END: myFraglen = FRAGLEN_DISTRIBUTION.sample() myReadData = sequences.sample_read(SE_CLASS,myFraglen) myReadData[0][0] += start # adjust mapping position based on window start myReadData[1][0] += start else: myReadData = sequences.sample_read(SE_CLASS) myReadData[0][0] += start # adjust mapping position based on window start if NJOBS > 1: myReadName = OUT_PREFIX_NAME+'-j'+str(MYJOB)+'-'+refIndex[RI][0]+'-r'+str(readNameCount) else: myReadName = OUT_PREFIX_NAME+'-'+refIndex[RI][0]+'-'+str(readNameCount) readNameCount += len(myReadData) # if desired, replace all low-quality bases with Ns if N_MAX_QUAL > -1: for j in xrange(len(myReadData)): myReadString = [n for n in myReadData[j][2]] for k in xrange(len(myReadData[j][3])): adjusted_qual = ord(myReadData[j][3][k])-SE_CLASS.offQ if adjusted_qual <= N_MAX_QUAL: myReadString[k] = 'N' myReadData[j][2] = ''.join(myReadString) # write read data out to FASTQ and BAM files, bypass FASTQ if option specified myRefIndex = indices_by_refName[refIndex[RI][0]] if len(myReadData) == 1: if NO_FASTQ != True: OFW.writeFASTQRecord(myReadName,myReadData[0][2],myReadData[0][3]) if SAVE_BAM: OFW.writeBAMRecord(myRefIndex, myReadName+'/1', myReadData[0][0], myReadData[0][1], myReadData[0][2], myReadData[0][3], samFlag=0) elif len(myReadData) == 2: if NO_FASTQ != True: OFW.writeFASTQRecord(myReadName,myReadData[0][2],myReadData[0][3],read2=myReadData[1][2],qual2=myReadData[1][3]) if SAVE_BAM: OFW.writeBAMRecord(myRefIndex, myReadName+'/1', myReadData[0][0], myReadData[0][1], myReadData[0][2], myReadData[0][3], samFlag=99, matePos=myReadData[1][0]) OFW.writeBAMRecord(myRefIndex, myReadName+'/2', myReadData[1][0], myReadData[1][1], myReadData[1][2], myReadData[1][3], samFlag=147, matePos=myReadData[0][0]) else: print '\nError: Unexpected number of reads generated...\n' exit(1) # tally up all the variants that got successfully introduced for n in all_inserted_variants: ALL_VARIANTS_OUT[n] = True # prepare indices of next window start = next_start end = next_end if isLastTime: break if end >= pf: isLastTime = True if currentPercent != 100: print '100%' else: print '' # write all output variants for this reference if SAVE_VCF: for k in sorted(ALL_VARIANTS_OUT.keys()): currentRef = refIndex[RI][0] myID = '.' myQual = '.' myFilt = 'PASS' # k[0] + 1 because we're going back to 1-based vcf coords OFW.writeVCFRecord(currentRef, str(int(k[0])+1), myID, k[1], k[2], myQual, myFilt, k[4]) #break # close output files OFW.closeFiles() if CANCER: OFW_CANCER.closeFiles()
def main(): # index reference refIndex = indexRef(REFERENCE) if PAIRED_END: N_HANDLING = ('random',FRAGMENT_SIZE) else: N_HANDLING = ('ignore',READLEN) indices_by_refName = {refIndex[n][0]:n for n in xrange(len(refIndex))} # parse input variants, if present inputVariants = [] if INPUT_VCF != None: if CANCER: (sampNames, inputVariants) = parseVCF(INPUT_VCF,tumorNormal=True,ploidy=PLOIDS) tumorInd = sampNames.index('TUMOR') normalInd = sampNames.index('NORMAL') else: (sampNames, inputVariants) = parseVCF(INPUT_VCF,ploidy=PLOIDS) for k in sorted(inputVariants.keys()): inputVariants[k].sort() # parse input targeted regions, if present refList = [n[0] for n in refIndex] inputRegions = {} if INPUT_BED != None: f = open(INPUT_BED,'r') for line in f: [myChr,pos1,pos2] = line.strip().split('\t')[:3] if myChr not in inputRegions: inputRegions[myChr] = [-1] inputRegions[myChr].extend([int(pos1),int(pos2)]) f.close() # some validation nInBedOnly = 0 nInRefOnly = 0 for k in refList: if k not in inputRegions: nInRefOnly += 1 for k in inputRegions.keys(): if not k in refList: nInBedOnly += 1 del inputRegions[k] if nInRefOnly > 0: print 'Warning: Reference contains sequences not found in targeted regions BED file.' if nInBedOnly > 0: print 'Warning: Targeted regions BED file contains sequence names not found in reference (regions ignored).' # parse discard bed similarly discardRegions = {} if DISCARD_BED != None: f = open(DISCARD_BED,'r') for line in f: [myChr,pos1,pos2] = line.strip().split('\t')[:3] if myChr not in discardRegions: discardRegions[myChr] = [-1] discardRegions[myChr].extend([int(pos1),int(pos2)]) f.close() # parse input mutation rate rescaling regions, if present mutRateRegions = {} mutRateValues = {} if MUT_BED != None: with open(MUT_BED,'r') as f: for line in f: [myChr,pos1,pos2,metaData] = line.strip().split('\t')[:4] mutStr = re.findall(r"MUT_RATE=.*?(?=;)",metaData+';') (pos1,pos2) = (int(pos1),int(pos2)) if len(mutStr) and (pos2-pos1) > 1: # mutRate = #_mutations / length_of_region, let's bound it by a reasonable amount mutRate = max([0.0,min([float(mutStr[0][9:]),0.3])]) if myChr not in mutRateRegions: mutRateRegions[myChr] = [-1] mutRateValues[myChr] = [0.0] mutRateRegions[myChr].extend([pos1,pos2]) mutRateValues.extend([mutRate*(pos2-pos1)]*2) # initialize output files (part I) bamHeader = None if SAVE_BAM: bamHeader = [copy.deepcopy(refIndex)] vcfHeader = None if SAVE_VCF: vcfHeader = [REFERENCE] # If processing jobs in parallel, precompute the independent regions that can be process separately if NJOBS > 1: parallelRegionList = getAllRefRegions(REFERENCE,refIndex,N_HANDLING,saveOutput=SAVE_NON_N) (myRefs, myRegions) = partitionRefRegions(parallelRegionList,refIndex,MYJOB,NJOBS) if not len(myRegions): print 'This job id has no regions to process, exiting...' exit(1) for i in xrange(len(refIndex)-1,-1,-1): # delete reference not used in our job if not refIndex[i][0] in myRefs: del refIndex[i] # if value of NJOBS is too high, let's change it to the maximum possible, to avoid output filename confusion corrected_nJobs = min([NJOBS,sum([len(n) for n in parallelRegionList.values()])]) else: corrected_nJobs = 1 # initialize output files (part II) if CANCER: OFW = OutputFileWriter(OUT_PREFIX+'_normal',paired=PAIRED_END,BAM_header=bamHeader,VCF_header=vcfHeader,gzipped=GZIPPED_OUT,noFASTQ=NO_FASTQ,FASTA_instead=FASTA_INSTEAD) OFW_CANCER = OutputFileWriter(OUT_PREFIX+'_tumor',paired=PAIRED_END,BAM_header=bamHeader,VCF_header=vcfHeader,gzipped=GZIPPED_OUT,jobTuple=(MYJOB,corrected_nJobs),noFASTQ=NO_FASTQ,FASTA_instead=FASTA_INSTEAD) else: OFW = OutputFileWriter(OUT_PREFIX,paired=PAIRED_END,BAM_header=bamHeader,VCF_header=vcfHeader,gzipped=GZIPPED_OUT,jobTuple=(MYJOB,corrected_nJobs),noFASTQ=NO_FASTQ,FASTA_instead=FASTA_INSTEAD) OUT_PREFIX_NAME = OUT_PREFIX.split('/')[-1] """************************************************ **** LET'S GET THIS PARTY STARTED... ************************************************""" readNameCount = 1 # keep track of the number of reads we've sampled, for read-names unmapped_records = [] for RI in xrange(len(refIndex)): # read in reference sequence and notate blocks of Ns (refSequence,N_regions) = readRef(REFERENCE,refIndex[RI],N_HANDLING) # if we're processing jobs in parallel only take the regions relevant for the current job if NJOBS > 1: for i in xrange(len(N_regions['non_N'])-1,-1,-1): if not (refIndex[RI][0],N_regions['non_N'][i][0],N_regions['non_N'][i][1]) in myRegions: del N_regions['non_N'][i] # count total bp we'll be spanning so we can get an idea of how far along we are (for printing progress indicators) total_bp_span = sum([n[1]-n[0] for n in N_regions['non_N']]) currentProgress = 0 currentPercent = 0 havePrinted100 = False # prune invalid input variants, e.g variants that: # - try to delete or alter any N characters # - don't match the reference base at their specified position # - any alt allele contains anything other than allowed characters validVariants = [] nSkipped = [0,0,0] if refIndex[RI][0] in inputVariants: for n in inputVariants[refIndex[RI][0]]: span = (n[0],n[0]+len(n[1])) rseq = str(refSequence[span[0]-1:span[1]-1]) # -1 because going from VCF coords to array coords anyBadChr = any((nn not in ALLOWED_NUCL) for nn in [item for sublist in n[2] for item in sublist]) if rseq != n[1]: nSkipped[0] += 1 continue elif 'N' in rseq: nSkipped[1] += 1 continue elif anyBadChr: nSkipped[2] += 1 continue #if bisect.bisect(N_regions['big'],span[0])%2 or bisect.bisect(N_regions['big'],span[1])%2: # continue validVariants.append(n) print 'found',len(validVariants),'valid variants for '+refIndex[RI][0]+' in input VCF...' if any(nSkipped): print sum(nSkipped),'variants skipped...' print ' - ['+str(nSkipped[0])+'] ref allele does not match reference' print ' - ['+str(nSkipped[1])+'] attempting to insert into N-region' print ' - ['+str(nSkipped[2])+'] alt allele contains non-ACGT characters' # add large random structural variants # # TBD!!! # determine sampling windows based on read length, large N regions, and structural mutations. # in order to obtain uniform coverage, windows should overlap by: # - READLEN, if single-end reads # - FRAGMENT_SIZE (mean), if paired-end reads # ploidy is fixed per large sampling window, # coverage distributions due to GC% and targeted regions are specified within these windows samplingWindows = [] ALL_VARIANTS_OUT = {} sequences = None if PAIRED_END: targSize = WINDOW_TARGET_SCALE*FRAGMENT_SIZE overlap = FRAGMENT_SIZE overlap_minWindowSize = max(FRAGLEN_DISTRIBUTION.values) + 10 else: targSize = WINDOW_TARGET_SCALE*READLEN overlap = READLEN overlap_minWindowSize = READLEN + 10 print '--------------------------------' if ONLY_VCF: print 'generating vcf...' else: print 'sampling reads...' tt = time.time() for i in xrange(len(N_regions['non_N'])): (pi,pf) = N_regions['non_N'][i] nTargWindows = max([1,(pf-pi)/targSize]) bpd = int((pf-pi)/float(nTargWindows)) #bpd += GC_WINDOW_SIZE - bpd%GC_WINDOW_SIZE #print len(refSequence), (pi,pf), nTargWindows #print structuralVars # if for some reason our region is too small to process, skip it! (sorry) if nTargWindows == 1 and (pf-pi) < overlap_minWindowSize: #print 'Does this ever happen?' continue start = pi end = min([start+bpd,pf]) #print '------------------RAWR:', (pi,pf), nTargWindows, bpd varsFromPrevOverlap = [] varsCancerFromPrevOverlap = [] vindFromPrev = 0 isLastTime = False havePrinted100 = False while True: # which inserted variants are in this window? varsInWindow = [] updated = False for j in xrange(vindFromPrev,len(validVariants)): vPos = validVariants[j][0] if vPos > start and vPos < end: # update: changed >= to >, so variant cannot be inserted in first position varsInWindow.append(tuple([vPos-1]+list(validVariants[j][1:]))) # vcf --> array coords if vPos >= end-overlap-1 and updated == False: updated = True vindFromPrev = j if vPos >= end: break # determine which structural variants will affect our sampling window positions structuralVars = [] for n in varsInWindow: bufferNeeded = max([max([abs(len(n[1])-len(alt_allele)),1]) for alt_allele in n[2]]) # change: added abs() so that insertions are also buffered. structuralVars.append((n[0]-1,bufferNeeded)) # -1 because going from VCF coords to array coords # adjust end-position of window based on inserted structural mutations buffer_added = 0 keepGoing = True while keepGoing: keepGoing = False for n in structuralVars: # adding "overlap" here to prevent SVs from being introduced in overlap regions # (which can cause problems if random mutations from the previous window land on top of them) delta = (end-1) - (n[0] + n[1]) - 2 - overlap if delta < 0: #print 'DELTA:', delta, 'END:', end, '-->', buffer_added = -delta end += buffer_added ####print end keepGoing = True break next_start = end-overlap next_end = min([next_start+bpd,pf]) if next_end-next_start < bpd: end = next_end isLastTime = True # print progress indicator #print 'PROCESSING WINDOW:',(start,end), [buffer_added], 'next:', (next_start,next_end), 'isLastTime:', isLastTime currentProgress += end-start newPercent = int((currentProgress*100)/float(total_bp_span)) if newPercent > currentPercent: if newPercent <= 99 or (newPercent == 100 and not havePrinted100): sys.stdout.write(str(newPercent)+'% ') sys.stdout.flush() currentPercent = newPercent if currentPercent == 100: havePrinted100 = True skip_this_window = False # compute coverage modifiers coverage_avg = None coverage_dat = [GC_WINDOW_SIZE,GC_SCALE_VAL,[]] target_hits = 0 if INPUT_BED == None: coverage_dat[2] = [1.0]*(end-start) else: if refIndex[RI][0] not in inputRegions: coverage_dat[2] = [OFFTARGET_SCALAR]*(end-start) else: for j in xrange(start,end): if not(bisect.bisect(inputRegions[refIndex[RI][0]],j)%2): coverage_dat[2].append(1.0) target_hits += 1 else: coverage_dat[2].append(OFFTARGET_SCALAR) # offtarget and we're not interested? if OFFTARGET_DISCARD and target_hits <= READLEN: coverage_avg = 0.0 skip_this_window = True #print len(coverage_dat[2]), sum(coverage_dat[2]) if sum(coverage_dat[2]) < LOW_COV_THRESH: coverage_avg = 0.0 skip_this_window = True # check for small window sizes if (end-start) < overlap_minWindowSize: skip_this_window = True if skip_this_window: # skip window, save cpu time start = next_start end = next_end if isLastTime: break if end >= pf: isLastTime = True varsFromPrevOverlap = [] continue # construct sequence data that we will sample reads from if sequences == None: sequences = SequenceContainer(start,refSequence[start:end],PLOIDS,overlap,READLEN,[MUT_MODEL]*PLOIDS,MUT_RATE,onlyVCF=ONLY_VCF) else: sequences.update(start,refSequence[start:end],PLOIDS,overlap,READLEN,[MUT_MODEL]*PLOIDS,MUT_RATE) # insert variants sequences.insert_mutations(varsFromPrevOverlap + varsInWindow) all_inserted_variants = sequences.random_mutations() #print all_inserted_variants # init coverage if sum(coverage_dat[2]) >= LOW_COV_THRESH: if PAIRED_END: coverage_avg = sequences.init_coverage(tuple(coverage_dat),fragDist=FRAGLEN_DISTRIBUTION) else: coverage_avg = sequences.init_coverage(tuple(coverage_dat)) # unused cancer stuff if CANCER: tumor_sequences = SequenceContainer(start,refSequence[start:end],PLOIDS,overlap,READLEN,[CANCER_MODEL]*PLOIDS,MUT_RATE,coverage_dat) tumor_sequences.insert_mutations(varsCancerFromPrevOverlap + all_inserted_variants) all_cancer_variants = tumor_sequences.random_mutations() # which variants do we need to keep for next time (because of window overlap)? varsFromPrevOverlap = [] varsCancerFromPrevOverlap = [] for n in all_inserted_variants: if n[0] >= end-overlap-1: varsFromPrevOverlap.append(n) if CANCER: for n in all_cancer_variants: if n[0] >= end-overlap-1: varsCancerFromPrevOverlap.append(n) # if we're only producing VCF, no need to go through the hassle of generating reads if ONLY_VCF: pass else: windowSpan = end-start if PAIRED_END: if FORCE_COVERAGE: readsToSample = int((windowSpan*float(COVERAGE))/(2*READLEN))+1 else: readsToSample = int((windowSpan*float(COVERAGE)*coverage_avg)/(2*READLEN))+1 else: if FORCE_COVERAGE: readsToSample = int((windowSpan*float(COVERAGE))/READLEN)+1 else: readsToSample = int((windowSpan*float(COVERAGE)*coverage_avg)/READLEN)+1 # if coverage is so low such that no reads are to be sampled, skip region # (i.e., remove buffer of +1 reads we add to every window) if readsToSample == 1 and sum(coverage_dat[2]) < LOW_COV_THRESH: readsToSample = 0 # sample reads ASDF2_TT = time.time() for i in xrange(readsToSample): isUnmapped = [] if PAIRED_END: myFraglen = FRAGLEN_DISTRIBUTION.sample() myReadData = sequences.sample_read(SE_CLASS,myFraglen) if myReadData == None: # skip if we failed to find a valid position to sample read continue if myReadData[0][0] == None: isUnmapped.append(True) else: isUnmapped.append(False) myReadData[0][0] += start # adjust mapping position based on window start if myReadData[1][0] == None: isUnmapped.append(True) else: isUnmapped.append(False) myReadData[1][0] += start else: myReadData = sequences.sample_read(SE_CLASS) if myReadData == None: # skip if we failed to find a valid position to sample read continue if myReadData[0][0] == None: # unmapped read (lives in large insertion) isUnmapped = [True] else: isUnmapped = [False] myReadData[0][0] += start # adjust mapping position based on window start # are we discarding offtargets? outside_boundaries = [] if OFFTARGET_DISCARD and INPUT_BED != None: outside_boundaries += [bisect.bisect(inputRegions[refIndex[RI][0]],n[0])%2 for n in myReadData] outside_boundaries += [bisect.bisect(inputRegions[refIndex[RI][0]],n[0]+len(n[2]))%2 for n in myReadData] if DISCARD_BED != None: outside_boundaries += [bisect.bisect(discardRegions[refIndex[RI][0]],n[0])%2 for n in myReadData] outside_boundaries += [bisect.bisect(discardRegions[refIndex[RI][0]],n[0]+len(n[2]))%2 for n in myReadData] if len(outside_boundaries) and any(outside_boundaries): continue if NJOBS > 1: myReadName = OUT_PREFIX_NAME+'-j'+str(MYJOB)+'-'+refIndex[RI][0]+'-r'+str(readNameCount) else: myReadName = OUT_PREFIX_NAME+'-'+refIndex[RI][0]+'-'+str(readNameCount) readNameCount += len(myReadData) # if desired, replace all low-quality bases with Ns if N_MAX_QUAL > -1: for j in xrange(len(myReadData)): myReadString = [n for n in myReadData[j][2]] for k in xrange(len(myReadData[j][3])): adjusted_qual = ord(myReadData[j][3][k])-SE_CLASS.offQ if adjusted_qual <= N_MAX_QUAL: myReadString[k] = 'N' myReadData[j][2] = ''.join(myReadString) # flip a coin, are we forward or reverse strand? isForward = (random.random() < 0.5) # if read (or read + mate for PE) are unmapped, put them at end of bam file if all(isUnmapped): if PAIRED_END: if isForward: flag1 = sam_flag(['paired','unmapped','mate_unmapped','first','mate_reverse']) flag2 = sam_flag(['paired','unmapped','mate_unmapped','second','reverse']) else: flag1 = sam_flag(['paired','unmapped','mate_unmapped','second','mate_reverse']) flag2 = sam_flag(['paired','unmapped','mate_unmapped','first','reverse']) unmapped_records.append((myReadName+'/1',myReadData[0],flag1)) unmapped_records.append((myReadName+'/2',myReadData[1],flag2)) else: flag1 = sam_flag(['unmapped']) unmapped_records.append((myReadName+'/1',myReadData[0],flag1)) myRefIndex = indices_by_refName[refIndex[RI][0]] # # write SE output # if len(myReadData) == 1: if NO_FASTQ != True: if isForward: OFW.writeFASTQRecord(myReadName,myReadData[0][2],myReadData[0][3]) else: OFW.writeFASTQRecord(myReadName,RC(myReadData[0][2]),myReadData[0][3][::-1]) if SAVE_BAM: if isUnmapped[0] == False: if isForward: flag1 = 0 OFW.writeBAMRecord(myRefIndex, myReadName, myReadData[0][0], myReadData[0][1], myReadData[0][2], myReadData[0][3], samFlag=flag1) else: flag1 = sam_flag(['reverse']) OFW.writeBAMRecord(myRefIndex, myReadName, myReadData[0][0], myReadData[0][1], myReadData[0][2], myReadData[0][3], samFlag=flag1) # # write PE output # elif len(myReadData) == 2: if NO_FASTQ != True: OFW.writeFASTQRecord(myReadName,myReadData[0][2],myReadData[0][3],read2=myReadData[1][2],qual2=myReadData[1][3],orientation=isForward) if SAVE_BAM: if isUnmapped[0] == False and isUnmapped[1] == False: if isForward: flag1 = sam_flag(['paired','proper','first','mate_reverse']) flag2 = sam_flag(['paired','proper','second','reverse']) else: flag1 = sam_flag(['paired','proper','second','mate_reverse']) flag2 = sam_flag(['paired','proper','first','reverse']) OFW.writeBAMRecord(myRefIndex, myReadName, myReadData[0][0], myReadData[0][1], myReadData[0][2], myReadData[0][3], samFlag=flag1, matePos=myReadData[1][0]) OFW.writeBAMRecord(myRefIndex, myReadName, myReadData[1][0], myReadData[1][1], myReadData[1][2], myReadData[1][3], samFlag=flag2, matePos=myReadData[0][0]) elif isUnmapped[0] == False and isUnmapped[1] == True: if isForward: flag1 = sam_flag(['paired','first', 'mate_unmapped', 'mate_reverse']) flag2 = sam_flag(['paired','second', 'unmapped', 'reverse']) else: flag1 = sam_flag(['paired','second', 'mate_unmapped', 'mate_reverse']) flag2 = sam_flag(['paired','first', 'unmapped', 'reverse']) OFW.writeBAMRecord(myRefIndex, myReadName, myReadData[0][0], myReadData[0][1], myReadData[0][2], myReadData[0][3], samFlag=flag1, matePos=myReadData[0][0]) OFW.writeBAMRecord(myRefIndex, myReadName, myReadData[0][0], myReadData[1][1], myReadData[1][2], myReadData[1][3], samFlag=flag2, matePos=myReadData[0][0], alnMapQual=0) elif isUnmapped[0] == True and isUnmapped[1] == False: if isForward: flag1 = sam_flag(['paired','first', 'unmapped', 'mate_reverse']) flag2 = sam_flag(['paired','second', 'mate_unmapped', 'reverse']) else: flag1 = sam_flag(['paired','second', 'unmapped', 'mate_reverse']) flag2 = sam_flag(['paired','first', 'mate_unmapped', 'reverse']) OFW.writeBAMRecord(myRefIndex, myReadName, myReadData[1][0], myReadData[0][1], myReadData[0][2], myReadData[0][3], samFlag=flag1, matePos=myReadData[1][0], alnMapQual=0) OFW.writeBAMRecord(myRefIndex, myReadName, myReadData[1][0], myReadData[1][1], myReadData[1][2], myReadData[1][3], samFlag=flag2, matePos=myReadData[1][0]) else: print '\nError: Unexpected number of reads generated...\n' exit(1) #print 'READS:',time.time()-ASDF2_TT if not isLastTime: OFW.flushBuffers(bamMax=next_start) else: OFW.flushBuffers(bamMax=end+1) # tally up all the variants that got successfully introduced for n in all_inserted_variants: ALL_VARIANTS_OUT[n] = True # prepare indices of next window start = next_start end = next_end if isLastTime: break if end >= pf: isLastTime = True if currentPercent != 100 and not havePrinted100: print '100%' else: print '' if ONLY_VCF: print 'VCF generation completed in', else: print 'Read sampling completed in', print int(time.time()-tt),'(sec)' # write all output variants for this reference if SAVE_VCF: print 'Writing output VCF...' for k in sorted(ALL_VARIANTS_OUT.keys()): currentRef = refIndex[RI][0] myID = '.' myQual = '.' myFilt = 'PASS' # k[0] + 1 because we're going back to 1-based vcf coords OFW.writeVCFRecord(currentRef, str(int(k[0])+1), myID, k[1], k[2], myQual, myFilt, k[4]) #break # write unmapped reads to bam file if SAVE_BAM and len(unmapped_records): print 'writing unmapped reads to bam file...' for umr in unmapped_records: if PAIRED_END: OFW.writeBAMRecord(-1, umr[0], 0, umr[1][1], umr[1][2], umr[1][3], samFlag=umr[2], matePos=0, alnMapQual=0) else: OFW.writeBAMRecord(-1, umr[0], 0, umr[1][1], umr[1][2], umr[1][3], samFlag=umr[2], alnMapQual=0) # close output files OFW.closeFiles() if CANCER: OFW_CANCER.closeFiles()
def main(): # index reference refIndex = indexRef(REFERENCE) if PAIRED_END: N_HANDLING = ('random',FRAGMENT_SIZE) else: N_HANDLING = ('ignore',READLEN) indices_by_refName = {refIndex[n][0]:n for n in xrange(len(refIndex))} # parse input variants, if present inputVariants = [] if INPUT_VCF != None: if CANCER: (sampNames, inputVariants) = parseVCF(INPUT_VCF,tumorNormal=True,ploidy=PLOIDS) tumorInd = sampNames.index('TUMOR') normalInd = sampNames.index('NORMAL') else: (sampNames, inputVariants) = parseVCF(INPUT_VCF,ploidy=PLOIDS) for k in sorted(inputVariants.keys()): inputVariants[k].sort() # parse input targeted regions, if present inputRegions = {} if INPUT_BED != None: with open(INPUT_BED,'r') as f: for line in f: [myChr,pos1,pos2] = line.strip().split('\t')[:3] if myChr not in inputRegions: inputRegions[myChr] = [-1] inputRegions[myChr].extend([int(pos1),int(pos2)]) # parse input mutation rate rescaling regions, if present mutRateRegions = {} mutRateValues = {} if MUT_BED != None: with open(MUT_BED,'r') as f: for line in f: [myChr,pos1,pos2,metaData] = line.strip().split('\t')[:4] mutStr = re.findall(r"MUT_RATE=.*?(?=;)",metaData+';') (pos1,pos2) = (int(pos1),int(pos2)) if len(mutStr) and (pos2-pos1) > 1: # mutRate = #_mutations / length_of_region, let's bound it by a reasonable amount mutRate = max([0.0,min([float(mutStr[0][9:]),0.3])]) if myChr not in mutRateRegions: mutRateRegions[myChr] = [-1] mutRateValues[myChr] = [0.0] mutRateRegions[myChr].extend([pos1,pos2]) mutRateValues.extend([mutRate*(pos2-pos1)]*2) # initialize output files (part I) bamHeader = None if SAVE_BAM: bamHeader = [copy.deepcopy(refIndex)] vcfHeader = None if SAVE_VCF: vcfHeader = [REFERENCE] # If processing jobs in parallel, precompute the independent regions that can be process separately if NJOBS > 1: parallelRegionList = getAllRefRegions(REFERENCE,refIndex,N_HANDLING,saveOutput=SAVE_NON_N) (myRefs, myRegions) = partitionRefRegions(parallelRegionList,refIndex,MYJOB,NJOBS) if not len(myRegions): print 'This job id has no regions to process, exiting...' exit(1) for i in xrange(len(refIndex)-1,-1,-1): # delete reference not used in our job if not refIndex[i][0] in myRefs: del refIndex[i] # if value of NJOBS is too high, let's change it to the maximum possible, to avoid output filename confusion corrected_nJobs = min([NJOBS,sum([len(n) for n in parallelRegionList.values()])]) else: corrected_nJobs = 1 # initialize output files (part II) if CANCER: OFW = OutputFileWriter(OUT_PREFIX+'_normal',paired=PAIRED_END,BAM_header=bamHeader,VCF_header=vcfHeader,gzipped=GZIPPED_OUT) OFW_CANCER = OutputFileWriter(OUT_PREFIX+'_tumor',paired=PAIRED_END,BAM_header=bamHeader,VCF_header=vcfHeader,gzipped=GZIPPED_OUT,jobTuple=(MYJOB,corrected_nJobs)) else: OFW = OutputFileWriter(OUT_PREFIX,paired=PAIRED_END,BAM_header=bamHeader,VCF_header=vcfHeader,gzipped=GZIPPED_OUT,jobTuple=(MYJOB,corrected_nJobs)) OUT_PREFIX_NAME = OUT_PREFIX.split('/')[-1] """************************************************ **** LET'S GET THIS PARTY STARTED... ************************************************""" readNameCount = 1 # keep track of the number of reads we've sampled, for read-names for RI in xrange(len(refIndex)): # read in reference sequence and notate blocks of Ns (refSequence,N_regions) = readRef(REFERENCE,refIndex[RI],N_HANDLING) # if we're processing jobs in parallel only take the regions relevant for the current job if NJOBS > 1: for i in xrange(len(N_regions['non_N'])-1,-1,-1): if not (refIndex[RI][0],N_regions['non_N'][i][0],N_regions['non_N'][i][1]) in myRegions: del N_regions['non_N'][i] # count total bp we'll be spanning so we can get an idea of how far along we are (for printing progress indicators) total_bp_span = sum([n[1]-n[0] for n in N_regions['non_N']]) currentProgress = 0 currentPercent = 0 # prune invalid input variants, e.g variants that: # - try to delete or alter any N characters # - don't match the reference base at their specified position # - any alt allele contains anything other than allowed characters validVariants = [] nSkipped = [0,0,0] if refIndex[RI][0] in inputVariants: for n in inputVariants[refIndex[RI][0]]: span = (n[0],n[0]+len(n[1])) rseq = str(refSequence[span[0]-1:span[1]-1]) # -1 because going from VCF coords to array coords anyBadChr = any((nn not in ALLOWED_NUCL) for nn in [item for sublist in n[2] for item in sublist]) if rseq != n[1]: nSkipped[0] += 1 continue elif 'N' in rseq: nSkipped[1] += 1 continue elif anyBadChr: nSkipped[2] += 1 continue #if bisect.bisect(N_regions['big'],span[0])%2 or bisect.bisect(N_regions['big'],span[1])%2: # continue validVariants.append(n) print 'found',len(validVariants),'valid variants for '+refIndex[RI][0]+' in input VCF...' if any(nSkipped): print sum(nSkipped),'variants skipped...' print ' - ['+str(nSkipped[0])+'] ref allele does not match reference' print ' - ['+str(nSkipped[1])+'] attempting to insert into N-region' print ' - ['+str(nSkipped[2])+'] alt allele contains non-ACGT characters' # add large random structural variants # # TBD!!! # determine which structural variants will affect our sampling window positions structuralVars = [] for n in validVariants: bufferNeeded = max([max([len(n[1])-len(alt_allele),1]) for alt_allele in n[2]]) structuralVars.append((n[0]-1,bufferNeeded)) # -1 because going from VCF coords to array coords # determine sampling windows based on read length, large N regions, and structural mutations. # in order to obtain uniform coverage, windows should overlap by: # - READLEN, if single-end reads # - FRAGMENT_SIZE (mean), if paired-end reads # ploidy is fixed per large sampling window, # coverage distributions due to GC% and targeted regions are specified within these windows samplingWindows = [] ALL_VARIANTS_OUT = {} sequences = None if PAIRED_END: targSize = WINDOW_TARGET_SCALE*FRAGMENT_SIZE overlap = FRAGMENT_SIZE else: targSize = WINDOW_TARGET_SCALE*READLEN overlap = READLEN print '--------------------------------' print 'sampling reads...' for i in xrange(len(N_regions['non_N'])): (pi,pf) = N_regions['non_N'][i] nTargWindows = max([1,(pf-pi)/targSize]) bpd = int((pf-pi)/float(nTargWindows)) bpd += GC_WINDOW_SIZE - bpd%GC_WINDOW_SIZE #print len(refSequence), (pi,pf), nTargWindows #print structuralVars # if for some reason our region is too small to process, skip it! (sorry) if nTargWindows == 1 and (pf-pi) < overlap-1: #print 'Does this ever happen?' continue start = pi end = min([start+bpd,pf]) #print '------------------RAWR:', (pi,pf), bpd currentVariantInd = 0 varsFromPrevOverlap = [] varsCancerFromPrevOverlap = [] vindFromPrev = 0 isLastTime = False while True: ####print (start,end) # adjust end-position of window based on inserted structural mutations relevantVars = [] if len(structuralVars) and currentVariantInd < len(structuralVars): prevVarInd = currentVariantInd while structuralVars[currentVariantInd][0] <= end: delta = (end-1) - (structuralVars[currentVariantInd][0] + structuralVars[currentVariantInd][1]) if delta <= 0: ####print 'DELTA:', delta end -= (delta-1) currentVariantInd += 1 if currentVariantInd == len(structuralVars): break relevantVars = structuralVars[prevVarInd:currentVariantInd] next_start = end-overlap next_end = min([next_start+bpd,pf]) if next_end-next_start < bpd: end = next_end isLastTime = True # print progress indicator #print 'PROCESSING WINDOW:',(start,end) currentProgress += end-start newPercent = int((currentProgress*100)/float(total_bp_span)) if newPercent > currentPercent: sys.stdout.write(str(newPercent)+'% ') sys.stdout.flush() currentPercent = newPercent # which inserted variants are in this window? varsInWindow = [] updated = False for j in xrange(vindFromPrev,len(validVariants)): vPos = validVariants[j][0] if vPos >= start and vPos < end: varsInWindow.append(tuple([vPos-1]+list(validVariants[j][1:]))) # vcf --> array coords if vPos >= end-overlap-1 and updated == False: updated = True vindFromPrev = j if vPos >= end: break # if computing only VCF, we can skip this... if ONLY_VCF: coverage_dat = None coverage_avg = None else: # pre-compute gc-bias and targeted sequencing coverage modifiers nSubWindows = (end-start)/GC_WINDOW_SIZE coverage_dat = (GC_WINDOW_SIZE,[]) for j in xrange(nSubWindows): rInd = start + j*GC_WINDOW_SIZE if INPUT_BED == None: tCov = True else: tCov = not(bisect.bisect(inputRegions[myChr],rInd)%2) or not(bisect.bisect(inputRegions[myChr],rInd+GC_WINDOW_SIZE)%2) if tCov: tScl = 1.0 else: tScl = OFFTARGET_SCALAR gc_v = refSequence[rInd:rInd+GC_WINDOW_SIZE].count('G') + refSequence[rInd:rInd+GC_WINDOW_SIZE].count('C') gScl = GC_SCALE_VAL[gc_v] coverage_dat[1].append(1.0*tScl*gScl) coverage_avg = np.mean(coverage_dat[1]) # pre-compute mutation rate tracks # PROVIDED MUTATION RATES OVERRIDE AVERAGE VALUE # construct sequence data that we will sample reads from if sequences == None: sequences = SequenceContainer(start,refSequence[start:end],PLOIDS,overlap,READLEN,[MUT_MODEL]*PLOIDS,MUT_RATE,coverage_dat,onlyVCF=ONLY_VCF) else: sequences.update(start,refSequence[start:end],PLOIDS,overlap,READLEN,[MUT_MODEL]*PLOIDS,MUT_RATE,coverage_dat) # adjust position of all inserted variants to match current window offset #variants_to_insert = [] #for n in varsFromPrevOverlap: # ln = [n[0]-start] + list(n[1:]) # variants_to_insert.append(tuple(ln)) #for n in varsInWindow: # ln = [n[0]-start] + list(n[1:]) # variants_to_insert.append(tuple(ln)) #sequences.insert_mutations(variants_to_insert) sequences.insert_mutations(varsFromPrevOverlap + varsInWindow) all_inserted_variants = sequences.random_mutations() #print all_inserted_variants if CANCER: tumor_sequences = SequenceContainer(start,refSequence[start:end],PLOIDS,overlap,READLEN,[CANCER_MODEL]*PLOIDS,MUT_RATE,coverage_dat) tumor_sequences.insert_mutations(varsCancerFromPrevOverlap + all_inserted_variants) all_cancer_variants = tumor_sequences.random_mutations() # which variants do we need to keep for next time (because of window overlap)? varsFromPrevOverlap = [] varsCancerFromPrevOverlap = [] for n in all_inserted_variants: if n[0] >= end-overlap-1: varsFromPrevOverlap.append(n) if CANCER: for n in all_cancer_variants: if n[0] >= end-overlap-1: varsCancerFromPrevOverlap.append(n) # if we're only producing VCF, no need to go through the hassle of generating reads if ONLY_VCF: pass else: # for each sampling window, construct sub-windows with coverage information covWindows = [COVERAGE for n in xrange((end-start)/SMALL_WINDOW)] if (end-start)%SMALL_WINDOW: covWindows.append(COVERAGE) meanCov = sum(covWindows)/float(len(covWindows)) if PAIRED_END: readsToSample = int(((end-start)*meanCov*coverage_avg)/(2*READLEN))+1 else: readsToSample = int(((end-start)*meanCov*coverage_avg)/(READLEN))+1 # sample reads from altered reference for i in xrange(readsToSample): if PAIRED_END: myFraglen = FRAGLEN_DISTRIBUTION.sample() myReadData = sequences.sample_read(SE_CLASS,myFraglen) myReadData[0][0] += start # adjust mapping position based on window start myReadData[1][0] += start else: myReadData = sequences.sample_read(SE_CLASS) myReadData[0][0] += start # adjust mapping position based on window start if NJOBS > 1: myReadName = OUT_PREFIX_NAME+'-j'+str(MYJOB)+'-'+refIndex[RI][0]+'-r'+str(readNameCount) else: myReadName = OUT_PREFIX_NAME+'-'+refIndex[RI][0]+'-'+str(readNameCount) readNameCount += len(myReadData) # if desired, replace all low-quality bases with Ns if N_MAX_QUAL > -1: for j in xrange(len(myReadData)): myReadString = [n for n in myReadData[j][2]] for k in xrange(len(myReadData[j][3])): adjusted_qual = ord(myReadData[j][3][k])-SE_CLASS.offQ if adjusted_qual <= N_MAX_QUAL: myReadString[k] = 'N' myReadData[j][2] = ''.join(myReadString) # write read data out to FASTQ and BAM files, bypass FASTQ if option specified myRefIndex = indices_by_refName[refIndex[RI][0]] if len(myReadData) == 1: if NO_FASTQ != True: OFW.writeFASTQRecord(myReadName,myReadData[0][2],myReadData[0][3]) if SAVE_BAM: OFW.writeBAMRecord(myRefIndex, myReadName+'/1', myReadData[0][0], myReadData[0][1], myReadData[0][2], myReadData[0][3], samFlag=0) elif len(myReadData) == 2: if NO_FASTQ != True: OFW.writeFASTQRecord(myReadName,myReadData[0][2],myReadData[0][3],read2=myReadData[1][2],qual2=myReadData[1][3]) if SAVE_BAM: OFW.writeBAMRecord(myRefIndex, myReadName+'/1', myReadData[0][0], myReadData[0][1], myReadData[0][2], myReadData[0][3], samFlag=99, matePos=myReadData[1][0]) OFW.writeBAMRecord(myRefIndex, myReadName+'/2', myReadData[1][0], myReadData[1][1], myReadData[1][2], myReadData[1][3], samFlag=147, matePos=myReadData[0][0]) else: print '\nError: Unexpected number of reads generated...\n' exit(1) # tally up all the variants that got successfully introduced for n in all_inserted_variants: ALL_VARIANTS_OUT[n] = True # prepare indices of next window start = next_start end = next_end if isLastTime: break if end >= pf: isLastTime = True if currentPercent != 100: print '100%' else: print '' # write all output variants for this reference if SAVE_VCF: for k in sorted(ALL_VARIANTS_OUT.keys()): currentRef = refIndex[RI][0] myID = '.' myQual = '.' myFilt = 'PASS' # k[0] + 1 because we're going back to 1-based vcf coords OFW.writeVCFRecord(currentRef, str(int(k[0])+1), myID, k[1], k[2], myQual, myFilt, k[4]) #break # close output files OFW.closeFiles() if CANCER: OFW_CANCER.closeFiles()