def statRead(self, read):
    """Accumulate per-position, GC and k-mer statistics for one read.

    ``read[1]`` is used as the base sequence and ``read[3]`` as the quality
    string. For every position ``i`` the method updates ``self.totalNum``,
    ``self.totalQual``, ``self.baseCounts`` / ``self.baseTotalQual`` (only for
    bases found in ``ALL_BASES``) and ``self.totalDiscontinuity``. After the
    per-position pass it increments ``self.gcHistogram`` at the read's G/C
    count and updates ``self.totalKmer`` / ``self.kmerCount``.

    A position whose quality character cannot be converted by
    ``util.qualNum`` is skipped after ``totalNum`` has been incremented; the
    first time this happens for a file whose name ends with ``bz2`` a warning
    is printed once per process (tracked by ``WARNED_BZIP2_ERROR``).
    """
    global WARNED_BZIP2_ERROR
    seq = read[1]
    qual = read[3]
    seqlen = len(seq)
    gc = 0
    for i in xrange(seqlen):
        self.totalNum[i] += 1
        try:
            qnum = util.qualNum(qual[i])
        except Exception:
            # Deliberate best effort: an unreadable quality value must not
            # abort the whole run, so the position is skipped instead.
            if self.filename.endswith("bz2") and not WARNED_BZIP2_ERROR:
                WARNED_BZIP2_ERROR = True
                print(
                    "WARNING: Incompatible bzip2 format, please note that the file compressed with pbzip2 may have problem. Please compress it with bzip2 insteadly.\n"
                )
            continue
        self.totalQual[i] += qnum
        b = seq[i]
        if b == 'G' or b == 'C':
            gc += 1
        if b in ALL_BASES:
            self.baseCounts[b][i] += 1
            self.baseTotalQual[b][i] += qnum

        # Discontinuity: number of adjacent base changes inside a window of
        # up to 5 bases around position i. Near either end the window is
        # shifted so it stays inside the read; it is also clamped to the read
        # length so reads shorter than 5 bases neither index past the end nor
        # wrap around through a negative start index.
        left = i - 2
        right = i + 3
        if left < 0:
            left = 0
            right = min(5, seqlen)
        elif right >= seqlen:
            right = seqlen
            left = max(seqlen - 5, 0)
        discontinuity = 0
        for j in xrange(left, right - 1):
            if seq[j] != seq[j + 1]:
                discontinuity += 1
        self.totalDiscontinuity[i] += discontinuity

    # The histogram is indexed by the absolute number of G/C bases.
    self.gcHistogram[gc] += 1

    # NOTE(review): the bound below visits start positions
    # 0 .. seqlen - kmerLen - 1, so the window starting at seqlen - kmerLen
    # is not counted. Left unchanged; confirm whether that is intended.
    for i in xrange(seqlen - self.kmerLen):
        self.totalKmer += 1
        kmer = seq[i:i + self.kmerLen]
        if kmer in self.kmerCount:
            self.kmerCount[kmer] += 1
        else:
            self.kmerCount[kmer] = 1
            # Make sure the reverse complement has an entry too (count 0)
            # the first time a k-mer is seen.
            rcKmer = util.reverseComplement(kmer)
            if rcKmer not in self.kmerCount:
                self.kmerCount[rcKmer] = 0
def statRead(self, read):
    """Fold one read into the per-position and k-mer tallies.

    ``read[1]`` supplies the bases and ``read[3]`` the quality characters.
    Each position bumps ``self.totalNum`` and ``self.totalQual``; bases that
    appear in ``ALL_BASES`` also bump ``self.baseCounts`` and
    ``self.baseTotalQual``. Afterwards every window of ``self.kmerLen`` bases
    visited by the loop is counted in ``self.totalKmer`` / ``self.kmerCount``.
    """
    bases = read[1]
    quals = read[3]

    # Per-position totals.
    for pos, base in enumerate(bases):
        self.totalNum[pos] += 1
        score = util.qualNum(quals[pos])
        self.totalQual[pos] += score
        if base not in ALL_BASES:
            continue
        self.baseCounts[base][pos] += 1
        self.baseTotalQual[base][pos] += score

    # K-mer occurrence counts.
    for start in xrange(len(bases) - self.kmerLen):
        self.totalKmer += 1
        word = bases[start:start + self.kmerLen]
        if word not in self.kmerCount:
            self.kmerCount[word] = 1
        else:
            self.kmerCount[word] += 1
def statRead(self, read):
    """Accumulate per-position, GC and k-mer statistics for one read.

    ``read[1]`` is used as the base sequence and ``read[3]`` as the quality
    string. For every position ``i`` the method updates ``self.totalNum``,
    ``self.totalQual``, ``self.baseCounts`` / ``self.baseTotalQual`` (only for
    bases found in ``ALL_BASES``) and ``self.totalDiscontinuity``. After the
    per-position pass it increments ``self.gcHistogram`` at the read's G/C
    count and updates ``self.totalKmer`` / ``self.kmerCount``.
    """
    seq = read[1]
    qual = read[3]
    seqlen = len(seq)
    gc = 0
    for i in xrange(seqlen):
        self.totalNum[i] += 1
        qnum = util.qualNum(qual[i])
        self.totalQual[i] += qnum
        b = seq[i]
        if b == 'G' or b == 'C':
            gc += 1
        if b in ALL_BASES:
            self.baseCounts[b][i] += 1
            self.baseTotalQual[b][i] += qnum

        # Discontinuity: number of adjacent base changes inside a window of
        # up to 5 bases around position i. Near either end the window is
        # shifted so it stays inside the read; it is also clamped to the read
        # length so reads shorter than 5 bases neither index past the end nor
        # wrap around through a negative start index.
        left = i - 2
        right = i + 3
        if left < 0:
            left = 0
            right = min(5, seqlen)
        elif right >= seqlen:
            right = seqlen
            left = max(seqlen - 5, 0)
        discontinuity = 0
        for j in xrange(left, right - 1):
            if seq[j] != seq[j + 1]:
                discontinuity += 1
        self.totalDiscontinuity[i] += discontinuity

    # The histogram is indexed by the absolute number of G/C bases.
    self.gcHistogram[gc] += 1

    # NOTE(review): the bound below visits start positions
    # 0 .. seqlen - kmerLen - 1, so the window starting at seqlen - kmerLen
    # is not counted. Left unchanged; confirm whether that is intended.
    for i in xrange(seqlen - self.kmerLen):
        self.totalKmer += 1
        kmer = seq[i:i + self.kmerLen]
        if kmer in self.kmerCount:
            self.kmerCount[kmer] += 1
        else:
            self.kmerCount[kmer] = 1
            # Make sure the reverse complement has an entry too (count 0)
            # the first time a k-mer is seen.
            rcKmer = util.reverseComplement(kmer)
            if rcKmer not in self.kmerCount:
                self.kmerCount[rcKmer] = 0
def run(self):
    """Filter the configured FASTQ input(s) and emit good/bad reads and QC reports.

    Steps, all driven by ``self.options``:

    1. Collect pre-filter QC statistics for read1 (and read2 when given) and,
       when a trim option is -1, take the trim lengths from ``autoTrim()``.
    2. Resolve and create the good / bad / overlap / QC output folders and
       open the writers (no writers are opened when ``qc_only`` is set).
    3. Stream the reads and apply, in order: barcode handling, trimming,
       bubble filter, length filter, polyX filter, low-quality filter,
       N-count filter and, for pairs, overlap analysis with optional base
       correction or quality masking. Rejected reads go to the "bad" writers
       with a reason tag; surviving reads go to the "good" writers and feed
       the post-filter QC objects.
    4. Write ``<read1 basename>.json`` and ``<read1 basename>.html`` into the
       QC folder.

    The method returns nothing; its results are the written files and the
    ``self.r1qc_*`` / ``self.r2qc_*`` attributes it sets.
    """
    if self.options.debubble:
        self.loadBubbleCircles()

    # read1_file is required
    read1_file = fastq.Reader(self.options.read1_file)

    # no front trim if sequence is barcoded
    if self.options.barcode:
        self.options.trim_front = 0

    reporter = QCReporter()

    self.r1qc_prefilter = QualityControl(self.options.qc_sample, self.options.qc_kmer)
    self.r2qc_prefilter = QualityControl(self.options.qc_sample, self.options.qc_kmer)
    self.r1qc_prefilter.statFile(self.options.read1_file)
    if self.options.read2_file is not None:
        self.r2qc_prefilter.statFile(self.options.read2_file)

    self.r1qc_postfilter = QualityControl(self.options.qc_sample, self.options.qc_kmer)
    self.r2qc_postfilter = QualityControl(self.options.qc_sample, self.options.qc_kmer)

    readLen = self.r1qc_prefilter.readLen
    overlap_histgram = [0] * (readLen + 1)
    distance_histgram = [0] * (readLen + 1)

    # auto detect trim front and trim tail (-1 means "not specified")
    if self.options.trim_front == -1 or self.options.trim_tail == -1:
        # auto trim for read1
        trimFront, trimTail = self.r1qc_prefilter.autoTrim()
        if self.options.trim_front == -1:
            self.options.trim_front = trimFront
        if self.options.trim_tail == -1:
            self.options.trim_tail = trimTail

        # auto trim for read2
        if self.options.read2_file is not None:
            # check if we should keep same trimming for read1/read2 to keep
            # their length identical; this option is on by default because
            # lots of dedup algorithms require this feature
            if self.options.trim_pair_same:
                self.options.trim_front2 = self.options.trim_front
                self.options.trim_tail2 = self.options.trim_tail
            else:
                trimFront2, trimTail2 = self.r2qc_prefilter.autoTrim()
                if self.options.trim_front2 == -1:
                    self.options.trim_front2 = trimFront2
                if self.options.trim_tail2 == -1:
                    self.options.trim_tail2 = trimTail2

    print(self.options.read1_file + " options:")
    print(self.options)

    # if good output folder not specified, use the folder of the read1 file
    good_dir = self.options.good_output_folder
    if good_dir is None:
        good_dir = os.path.dirname(self.options.read1_file)

    # the remaining folders default to siblings of the good folder
    sibling_base = os.path.dirname(os.path.dirname(good_dir + "/"))

    bad_dir = self.options.bad_output_folder
    if bad_dir is None:
        bad_dir = os.path.join(sibling_base, "bad")

    overlap_dir = self.options.overlap_output_folder
    if overlap_dir is None:
        overlap_dir = os.path.join(sibling_base, "overlap")

    qc_base_folder = self.options.report_output_folder
    if qc_base_folder is None:
        qc_base_folder = os.path.join(sibling_base, "QC")
    if not os.path.exists(qc_base_folder):
        os.makedirs(qc_base_folder)
    qc_dir = qc_base_folder

    if not os.path.exists(good_dir):
        os.makedirs(good_dir)
    if not os.path.exists(bad_dir):
        os.makedirs(bad_dir)
    if self.options.store_overlap and self.options.read2_file is not None and (
            not os.path.exists(overlap_dir)):
        os.makedirs(overlap_dir)

    # gzip the output when requested, or when the input itself is gzipped
    gzip_out = self.options.gzip
    gzip_comp = self.options.compression
    if not gzip_out and self.options.read1_file.endswith(".gz"):
        gzip_out = True

    def open_writer(folder, source_name, suffix):
        # Build "<folder>/<main name of source><suffix>" and open a writer on it.
        return fastq.Writer(
            os.path.join(folder, getMainName(source_name) + suffix),
            gzip_out, gzip_comp)

    good_read1_file = None
    bad_read1_file = None
    overlap_read1_file = None
    if not self.options.qc_only:
        good_read1_file = open_writer(good_dir, self.options.read1_file, ".good.fq")
        bad_read1_file = open_writer(bad_dir, self.options.read1_file, ".bad.fq")
        if self.options.store_overlap:
            overlap_read1_file = open_writer(overlap_dir, self.options.read1_file, ".overlap.fq")

    # other files are optional
    read2_file = None
    good_read2_file = None
    bad_read2_file = None
    overlap_read2_file = None
    index1_file = None
    good_index1_file = None
    bad_index1_file = None
    overlap_index1_file = None
    index2_file = None
    good_index2_file = None
    bad_index2_file = None
    overlap_index2_file = None

    # if other files are specified, then read them
    store_pair_overlap = self.options.store_overlap and self.options.read2_file is not None
    if self.options.read2_file is not None:
        read2_file = fastq.Reader(self.options.read2_file)
        if not self.options.qc_only:
            good_read2_file = open_writer(good_dir, self.options.read2_file, ".good.fq")
            bad_read2_file = open_writer(bad_dir, self.options.read2_file, ".bad.fq")
            if store_pair_overlap:
                overlap_read2_file = open_writer(overlap_dir, self.options.read2_file, ".overlap.fq")
    if self.options.index1_file is not None:
        index1_file = fastq.Reader(self.options.index1_file)
        if not self.options.qc_only:
            good_index1_file = open_writer(good_dir, self.options.index1_file, ".good.fq")
            bad_index1_file = open_writer(bad_dir, self.options.index1_file, ".bad.fq")
            if store_pair_overlap:
                overlap_index1_file = open_writer(overlap_dir, self.options.index1_file, ".overlap.fq")
    if self.options.index2_file is not None:
        index2_file = fastq.Reader(self.options.index2_file)
        if not self.options.qc_only:
            good_index2_file = open_writer(good_dir, self.options.index2_file, ".good.fq")
            bad_index2_file = open_writer(bad_dir, self.options.index2_file, ".bad.fq")
            if store_pair_overlap:
                overlap_index2_file = open_writer(overlap_dir, self.options.index2_file, ".overlap.fq")

    r1 = None
    r2 = None
    i1 = None
    i2 = None

    def write_bad(reason):
        # Send the current read set (r1/r2/i1/i2 as bound at call time) to the
        # "bad" writers, tagged with the rejection reason.
        self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                        bad_index1_file, bad_index2_file, reason)

    # stat numbers
    TOTAL_BASES = 0
    GOOD_BASES = 0
    TOTAL_READS = 0
    GOOD_READS = 0
    BADBCD1 = 0
    BADBCD2 = 0
    BADTRIM1 = 0
    BADTRIM2 = 0
    BADBBL = 0
    BADLEN = 0
    BADPOL = 0
    BADLQC = 0
    BADNCT = 0
    BADINDEL = 0  # never incremented here; still reported for output compatibility
    BADMISMATCH = 0
    BADDIFF = 0
    READ_CORRECTED = 0
    BASE_CORRECTED = 0
    BASE_SKIPPED_CORRECTION = 0
    BASE_ZERO_QUAL_MASKED = 0
    OVERLAPPED = 0
    OVERLAP_LEN_SUM = 0
    OVERLAP_BASE_SUM = 0
    # error profiling by overlap analysis
    OVERLAP_BASE_ERR = 0
    OVERLAP_ERR_MATRIX = init_error_matrix()
    # adapter trimming by overlap analysis
    TRIMMED_ADAPTER_BASE = 0
    TRIMMED_ADAPTER_READ = 0

    while True:
        r1 = read1_file.nextRead()
        if r1 is None:
            break
        TOTAL_BASES += len(r1[1])

        if read2_file is not None:
            r2 = read2_file.nextRead()
            if r2 is None:
                break
            # Count read2 bases whenever read2 exists (previously this was
            # tied to the index2 read, which skipped them for plain pairs).
            TOTAL_BASES += len(r2[1])
        if index1_file is not None:
            i1 = index1_file.nextRead()
            if i1 is None:
                break
        if index2_file is not None:
            i2 = index2_file.nextRead()
            if i2 is None:
                break

        TOTAL_READS += 1

        # barcode processing
        if self.options.barcode:
            barcodeLen1 = barcodeprocesser.detectBarcode(
                r1[1], self.options.barcode_length, self.options.barcode_verify)
            if barcodeLen1 == 0:
                write_bad("BADBCD1")
                BADBCD1 += 1
                continue
            if r2 is None:
                barcodeprocesser.moveBarcodeToName(
                    r1, self.options.barcode_length, self.options.barcode_verify)
            else:
                barcodeLen2 = barcodeprocesser.detectBarcode(
                    r2[1], self.options.barcode_length, self.options.barcode_verify)
                if barcodeLen2 == 0:
                    write_bad("BADBCD2")
                    BADBCD2 += 1
                    continue
                barcodeprocesser.moveAndTrimPair(
                    r1, r2, barcodeLen1, barcodeLen2, self.options.barcode_verify)

        # trim
        if self.options.trim_front > 0 or self.options.trim_tail > 0:
            r1 = trim(r1, self.options.trim_front, self.options.trim_tail)
            if len(r1[1]) < 5:
                write_bad("BADTRIM1")
                BADTRIM1 += 1
                continue
            if r2 is not None:
                r2 = trim(r2, self.options.trim_front2, self.options.trim_tail2)
                if len(r2[1]) < 5:
                    write_bad("BADTRIM2")
                    BADTRIM2 += 1
                    continue

        # filter debubble
        if self.options.debubble:
            if self.isInBubble(r1[0]):
                write_bad("BADBBL")
                BADBBL += 1
                continue

        # filter sequence length
        if len(r1[1]) < self.options.seq_len_req:
            write_bad("BADLEN")
            BADLEN += 1
            continue

        # check polyX
        if self.options.poly_size_limit > 0:
            poly1 = hasPolyX(r1[1], self.options.poly_size_limit,
                             self.options.allow_mismatch_in_poly)
            poly2 = None
            if r2 is not None:
                poly2 = hasPolyX(r2[1], self.options.poly_size_limit,
                                 self.options.allow_mismatch_in_poly)
            if poly1 is not None or poly2 is not None:
                write_bad("BADPOL")
                BADPOL += 1
                continue

        # check low quality count
        if self.options.unqualified_base_limit > 0:
            lowQual1 = lowQualityNum(r1, self.options.qualified_quality_phred)
            lowQual2 = 0
            if r2 is not None:
                lowQual2 = lowQualityNum(r2, self.options.qualified_quality_phred)
            # Either mate exceeding the limit rejects the pair (the second
            # operand used to re-test lowQual1, so read2 was never checked).
            if (lowQual1 > self.options.unqualified_base_limit
                    or lowQual2 > self.options.unqualified_base_limit):
                write_bad("BADLQC")
                BADLQC += 1
                continue

        # check N number
        if self.options.n_base_limit > 0:
            nNum1 = nNumber(r1)
            nNum2 = 0
            if r2 is not None:
                nNum2 = nNumber(r2)
            if nNum1 > self.options.n_base_limit or nNum2 > self.options.n_base_limit:
                write_bad("BADNCT")
                BADNCT += 1
                continue

        # check overlap and do error correction
        if r2 is not None and (not self.options.no_overlap):
            (offset, overlap_len, distance) = util.overlap(r1[1], r2[1])
            overlap_histgram[overlap_len] += 1

            # deal with the case insert DNA is shorter than read length and
            # cause offset is negative; in this case the adapter is sequenced
            # and should be trimmed
            if offset < 0 and overlap_len > 30:
                # drop the junk bases
                r1[1] = r1[1][0:overlap_len]
                r1[3] = r1[3][0:overlap_len]
                r2[1] = r2[1][0:overlap_len]
                r2[3] = r2[3][0:overlap_len]
                TRIMMED_ADAPTER_BASE += abs(offset) * 2
                TRIMMED_ADAPTER_READ += 1
                # check the sequence length again after adapter trimmed
                if len(r1[1]) < self.options.seq_len_req:
                    write_bad("BADLEN")
                    BADLEN += 1
                    continue
                # then calc overlap again
                (offset, overlap_len, distance) = util.overlap(r1[1], r2[1])

            distance_histgram[distance] += 1

            # if distance is too high, then set it as bad mismatch
            if distance > 3:
                write_bad("BADDIFF")
                BADDIFF += 1
                continue

            if overlap_len > 30:
                OVERLAPPED += 1
                OVERLAP_LEN_SUM += overlap_len
                # we consider the distance is caused by sequencing error
                OVERLAP_BASE_SUM += overlap_len * 2
                OVERLAP_BASE_ERR += distance
                corrected = 0
                zero_qual_masked = 0
                skipped_mismatch = 0
                if distance > 0:
                    # try to fix low quality base
                    err_mtx = init_error_matrix()
                    for o in xrange(overlap_len):
                        # position o of read1's overlap tail pairs with the
                        # complement of read2 counted from its end
                        b1 = r1[1][len(r1[1]) - overlap_len + o]
                        b2 = util.complement(r2[1][-o - 1])
                        q1 = r1[3][len(r1[3]) - overlap_len + o]
                        q2 = r2[3][-o - 1]
                        if b1 != b2:
                            this_is_corrected = False
                            if util.qualNum(q1) >= 30 and util.qualNum(q2) <= 14:
                                # read1 is trusted, read2 is the likely error
                                if b1 != 'N' and b2 != 'N':
                                    err_mtx[util.complement(b1)][util.complement(b2)] += 1
                                if not self.options.no_correction:
                                    r2[1] = util.changeString(r2[1], -o - 1, util.complement(b1))
                                    r2[3] = util.changeString(r2[3], -o - 1, q1)
                                    corrected += 1
                                    this_is_corrected = True
                            elif util.qualNum(q2) >= 30 and util.qualNum(q1) <= 14:
                                # read2 is trusted, read1 is the likely error
                                if b1 != 'N' and b2 != 'N':
                                    err_mtx[b2][b1] += 1
                                if not self.options.no_correction:
                                    r1[1] = util.changeString(r1[1], len(r1[1]) - overlap_len + o, b2)
                                    r1[3] = util.changeString(r1[3], len(r1[3]) - overlap_len + o, q2)
                                    corrected += 1
                                    this_is_corrected = True
                            if not this_is_corrected:
                                if self.options.mask_mismatch:
                                    # mask them as zero qual if it is not corrected
                                    zero_qual = '!'
                                    r2[3] = util.changeString(r2[3], -o - 1, zero_qual)
                                    r1[3] = util.changeString(r1[3], len(r1[3]) - overlap_len + o, zero_qual)
                                    zero_qual_masked += 1
                                else:
                                    skipped_mismatch += 1
                            # stop once every reported mismatch is accounted for
                            if corrected + zero_qual_masked + skipped_mismatch >= distance:
                                break

                    if corrected + zero_qual_masked + skipped_mismatch == distance:
                        merge_error_matrix(OVERLAP_ERR_MATRIX, err_mtx)
                        if corrected > 0:
                            READ_CORRECTED += 1
                        BASE_CORRECTED += corrected
                        # multiply by 2 since we mask bases by pair
                        BASE_ZERO_QUAL_MASKED += zero_qual_masked * 2
                        BASE_SKIPPED_CORRECTION += skipped_mismatch * 2
                    else:
                        write_bad("BADMISMATCH")
                        BADMISMATCH += 1
                        continue

                if distance == 0 or distance == corrected:
                    if self.options.store_overlap:
                        self.writeReads(getOverlap(r1, overlap_len),
                                        getOverlap(r2, overlap_len), i1, i2,
                                        overlap_read1_file, overlap_read2_file,
                                        overlap_index1_file, overlap_index2_file,
                                        None)

        # write to good
        self.writeReads(r1, r2, i1, i2, good_read1_file, good_read2_file,
                        good_index1_file, good_index2_file, None)
        GOOD_BASES += len(r1[1])
        # Keyed on r2 (not the index2 read) so paired data is counted fully.
        if r2 is not None:
            GOOD_BASES += len(r2[1])
        if self.options.qc_sample <= 0 or TOTAL_READS < self.options.qc_sample:
            self.r1qc_postfilter.statRead(r1)
            if r2 is not None:
                self.r2qc_postfilter.statRead(r2)

        GOOD_READS += 1
        if self.options.qc_only and TOTAL_READS >= self.options.qc_sample:
            break

    self.r1qc_postfilter.qc()
    if self.options.read2_file is not None:
        self.r2qc_postfilter.qc()

    # close every writer that was opened, including the overlap writers,
    # which were previously left open
    for writer in (good_read1_file, bad_read1_file, overlap_read1_file,
                   good_read2_file, bad_read2_file, overlap_read2_file,
                   good_index1_file, bad_index1_file, overlap_index1_file,
                   good_index2_file, bad_index2_file, overlap_index2_file):
        if writer is not None:
            writer.close()

    # summary numbers
    BAD_READS = TOTAL_READS - GOOD_READS
    bad_length = BADLEN + BADTRIM1 + BADTRIM2
    bad_overlap = BADMISMATCH + BADINDEL + BADDIFF
    bad_barcode = BADBCD1 + BADBCD2
    result = {}
    result['total_bases'] = TOTAL_BASES
    result['good_bases'] = GOOD_BASES
    result['total_reads'] = TOTAL_READS
    result['good_reads'] = GOOD_READS
    result['bad_reads'] = BAD_READS
    result['bad_reads_with_bad_barcode'] = bad_barcode
    result['bad_reads_with_reads_in_bubble'] = BADBBL
    result['bad_reads_with_bad_read_length'] = bad_length
    result['bad_reads_with_polyX'] = BADPOL
    result['bad_reads_with_low_quality'] = BADLQC
    result['bad_reads_with_too_many_N'] = BADNCT
    result['bad_reads_with_bad_overlap'] = bad_overlap
    result['readlen'] = readLen

    # data for the filtering result figure
    labels = ['good reads', 'has_polyX', 'low_quality', 'too_short', 'too_many_N']
    counts = [GOOD_READS, BADPOL, BADLQC, bad_length, BADNCT]
    if self.options.read2_file is not None:
        labels.append('bad_overlap')
        counts.append(bad_overlap)
    if self.options.debubble:
        labels.append('in_bubble')
        counts.append(BADBBL)
    if self.options.barcode:
        labels.append('bad_barcode')
        counts.append(bad_barcode)

    for i, count in enumerate(counts):
        type_percent = 0.0
        if TOTAL_READS > 0:
            type_percent = 100.0 * float(count) / TOTAL_READS
        labels[i] = labels[i] + ": " + str(count) + "(" + str(type_percent) + "%)"

    reporter.addFigure(
        'Good reads and bad reads after filtering',
        self.r1qc_prefilter.statPlotly(labels, counts, TOTAL_READS, 'filter_stat'),
        'filter_stat', "")

    # squeeze qc data for JSON output
    self.r1qc_prefilter.squeeze()
    self.r1qc_postfilter.squeeze()
    if self.options.read2_file is not None:
        self.r2qc_prefilter.squeeze()
        self.r2qc_postfilter.squeeze()

    stat = {}
    stat["afterqc_main_summary"] = result
    stat["command"] = makeDict(self.options)
    stat["kmer_content"] = {}
    stat["kmer_content"]["read1_prefilter"] = self.r1qc_prefilter.topKmerCount[0:10]
    stat["kmer_content"]["read1_postfilter"] = self.r1qc_postfilter.topKmerCount[0:10]

    # output more data in JSON file for offline plotting directly from JSON
    stat["base_quality"] = {}
    stat["base_quality"]["read1_prefilter"] = self.r1qc_prefilter.baseMeanQual
    stat["base_quality"]["read1_postfilter"] = self.r1qc_postfilter.baseMeanQual
    stat["mean_quality"] = {}
    stat["mean_quality"]["read1_prefilter"] = self.r1qc_prefilter.meanQual
    stat["mean_quality"]["read1_postfilter"] = self.r1qc_postfilter.meanQual
    stat["base_content"] = {}
    stat["base_content"]["read1_prefilter"] = self.r1qc_prefilter.percents
    stat["base_content"]["read1_postfilter"] = self.r1qc_postfilter.percents
    stat["gc_content"] = {}
    stat["gc_content"]["read1_prefilter"] = self.r1qc_prefilter.gcPercents
    stat["gc_content"]["read1_postfilter"] = self.r1qc_postfilter.gcPercents

    if self.options.read2_file is not None:
        stat["kmer_content"]["read2_prefilter"] = self.r2qc_prefilter.topKmerCount[0:10]
        stat["kmer_content"]["read2_postfilter"] = self.r2qc_postfilter.topKmerCount[0:10]
        stat["base_quality"]["read2_prefilter"] = self.r2qc_prefilter.baseMeanQual
        stat["base_quality"]["read2_postfilter"] = self.r2qc_postfilter.baseMeanQual
        stat["mean_quality"]["read2_prefilter"] = self.r2qc_prefilter.meanQual
        stat["mean_quality"]["read2_postfilter"] = self.r2qc_postfilter.meanQual
        stat["base_content"]["read2_prefilter"] = self.r2qc_prefilter.percents
        stat["base_content"]["read2_postfilter"] = self.r2qc_postfilter.percents
        stat["gc_content"]["read2_prefilter"] = self.r2qc_prefilter.gcPercents
        stat["gc_content"]["read2_postfilter"] = self.r2qc_postfilter.gcPercents

        overlap_stat = {}
        stat["afterqc_overlap"] = overlap_stat
        overlap_stat['overlapped_pairs'] = OVERLAPPED
        if OVERLAPPED > 0:
            # convert before dividing so Python 2 integer division does not
            # truncate the average
            overlap_stat['average_overlap_length'] = float(OVERLAP_LEN_SUM) / OVERLAPPED
        else:
            overlap_stat['average_overlap_length'] = 0.0
        overlap_stat['bad_mismatch_reads'] = BADMISMATCH
        overlap_stat['bad_diff'] = BADDIFF
        overlap_stat['bad_indel_reads'] = BADINDEL
        overlap_stat['corrected_reads'] = READ_CORRECTED
        overlap_stat['corrected_bases'] = BASE_CORRECTED
        overlap_stat['skipped_correction_bases'] = BASE_SKIPPED_CORRECTION
        overlap_stat['zero_qual_masked'] = BASE_ZERO_QUAL_MASKED
        # NOTE(review): 'zero_qual_skipped' reports the masked count, same as
        # the key above; kept as-is for output compatibility - confirm intent.
        overlap_stat['zero_qual_skipped'] = BASE_ZERO_QUAL_MASKED
        overlap_stat['trimmed_adapter_bases'] = TRIMMED_ADAPTER_BASE
        overlap_stat['trimmed_adapter_reads'] = TRIMMED_ADAPTER_READ
        if OVERLAP_BASE_SUM > 0:
            overlap_stat['error_rate'] = float(OVERLAP_BASE_ERR) / float(OVERLAP_BASE_SUM)
        else:
            overlap_stat['error_rate'] = 0.0
        overlap_stat['error_matrix'] = OVERLAP_ERR_MATRIX
        overlap_stat['edit_distance_histogram'] = distance_histgram[0:10]

        reporter.addFigure(
            'Sequence error distribution',
            self.r1qc_prefilter.errorPlotly(OVERLAP_ERR_MATRIX, 'error_matrix'),
            'error_matrix', "")
        reporter.addFigure(
            'Overlap length distribution',
            self.r1qc_prefilter.overlapPlotly(overlap_histgram, readLen,
                                              TOTAL_READS, 'overlap_stat'),
            'overlap_stat', "")

    report_name = os.path.basename(self.options.read1_file)
    stat_json = json.dumps(stat, sort_keys=True, indent=4, separators=(',', ': '))
    # "with" guarantees the handle is closed even if the write fails
    with open(os.path.join(qc_dir, report_name + ".json"), "w") as stat_file:
        stat_file.write(stat_json)

    self.addFiguresToReport(reporter)
    reporter.setStat(stat)
    reporter.setVersion(self.options.version)
    reporter.output(os.path.join(qc_dir, report_name + ".html"))
def run(self): if self.options.debubble: self.loadBubbleCircles() #read1_file is required read1_file = fastq.Reader(self.options.read1_file) #create a QC folder to contains QC results qc_base_folder = os.path.join(os.path.dirname(self.options.read1_file), "QC") if not os.path.exists(qc_base_folder): os.makedirs(qc_base_folder) #QC result of this file/pair qc_dir = os.path.join(qc_base_folder, os.path.basename(self.options.read1_file)) if not os.path.exists(qc_dir): os.makedirs(qc_dir) #no front trim if sequence is barcoded if self.options.barcode: self.options.trim_front = 0 reporter = QCReporter() r1qc_prefilter = QualityControl(self.options.qc_sample, self.options.qc_kmer) r2qc_prefilter = QualityControl(self.options.qc_sample, self.options.qc_kmer) r1qc_prefilter.statFile(self.options.read1_file) r1qc_prefilter.plot(qc_dir, "R1-prefilter") if self.options.read2_file != None: r2qc_prefilter.statFile(self.options.read2_file) r2qc_prefilter.plot(qc_dir, "R2-prefilter") r1qc_postfilter = QualityControl(self.options.qc_sample, self.options.qc_kmer) r2qc_postfilter = QualityControl(self.options.qc_sample, self.options.qc_kmer) readLen = r1qc_prefilter.readLen overlap_histgram = [0 for x in xrange(readLen + 1)] distance_histgram = [0 for x in xrange(readLen + 1)] #auto detect trim front and trim tail if self.options.trim_front == -1 or self.options.trim_tail == -1: #auto trim for read1 trimFront, trimTail = r1qc_prefilter.autoTrim() if self.options.trim_front == -1: self.options.trim_front = trimFront if self.options.trim_tail == -1: self.options.trim_tail = trimTail #auto trim for read2 if self.options.read2_file != None: # check if we should keep same trimming for read1/read2 to keep their length identical # this option is on by default because lots of dedup algorithms require this feature if self.options.trim_pair_same: self.options.trim_front2 = self.options.trim_front self.options.trim_tail2 = self.options.trim_tail else: trimFront2, trimTail2 = 
r2qc_prefilter.autoTrim() if self.options.trim_front2 == -1: self.options.trim_front2 = trimFront2 if self.options.trim_tail2 == -1: self.options.trim_tail2 = trimTail2 print(self.options.read1_file + " options:") print(self.options) #if good output folder not specified, set it as the same folder of read1 file good_dir = self.options.good_output_folder if good_dir == None: good_dir = os.path.dirname(self.options.read1_file) #if bad output folder not specified, set it as the same folder of read1 file bad_dir = self.options.bad_output_folder if bad_dir == None: bad_dir = os.path.dirname(self.options.read1_file) #if overlap output folder not specified, set it as the same folder of read1 file overlap_dir = self.options.overlap_output_folder if overlap_dir == None: overlap_dir = os.path.dirname(self.options.read1_file) if not os.path.exists(good_dir): os.makedirs(good_dir) if not os.path.exists(bad_dir): os.makedirs(bad_dir) if self.options.store_overlap and self.options.read2_file != None and ( not os.path.exists(overlap_dir)): os.makedirs(overlap_dir) good_read1_file = None bad_read1_file = None overlap_read1_file = None if not self.options.qc_only: good_read1_file = fastq.Writer( os.path.join(good_dir, getMainName(self.options.read1_file) + ".good.fq")) bad_read1_file = fastq.Writer( os.path.join(bad_dir, getMainName(self.options.read1_file) + ".bad.fq")) overlap_read1_file = None if self.options.store_overlap: overlap_read1_file = fastq.Writer( os.path.join( overlap_dir, getMainName(self.options.read1_file) + ".overlap.fq")) #other files are optional read2_file = None good_read2_file = None bad_read2_file = None overlap_read2_file = None index1_file = None good_index1_file = None bad_index1_file = None overlap_index1_file = None index2_file = None good_index2_file = None bad_index2_file = None overlap_index2_file = None #if other files are specified, then read them if self.options.read2_file != None: read2_file = fastq.Reader(self.options.read2_file) if not 
self.options.qc_only: good_read2_file = fastq.Writer( os.path.join( good_dir, getMainName(self.options.read2_file) + ".good.fq")) bad_read2_file = fastq.Writer( os.path.join( bad_dir, getMainName(self.options.read2_file) + ".bad.fq")) if self.options.store_overlap and self.options.read2_file != None: overlap_read2_file = fastq.Writer( os.path.join( overlap_dir, getMainName(self.options.read2_file) + ".overlap.fq")) if self.options.index1_file != None: index1_file = fastq.Reader(self.options.index1_file) if not self.options.qc_only: good_index1_file = fastq.Writer( os.path.join( good_dir, getMainName(self.options.index1_file) + ".good.fq")) bad_index1_file = fastq.Writer( os.path.join( bad_dir, getMainName(self.options.index1_file) + ".bad.fq")) if self.options.store_overlap and self.options.read2_file != None: overlap_index1_file = fastq.Writer( os.path.join( overlap_dir, getMainName(self.options.index1_file) + ".overlap.fq")) if self.options.index2_file != None: index2_file = fastq.Reader(self.options.index2_file) if not self.options.qc_only: good_index2_file = fastq.Writer( os.path.join( good_dir, getMainName(self.options.index2_file) + ".good.fq")) bad_index2_file = fastq.Writer( os.path.join( bad_dir, getMainName(self.options.index2_file) + ".bad.fq")) if self.options.store_overlap and self.options.read2_file != None: overlap_index2_file = fastq.Writer( os.path.join( overlap_dir, getMainName(self.options.index2_file) + ".overlap.fq")) r1 = None r2 = None i1 = None i2 = None # stat numbers TOTAL = 0 GOOD = 0 BAD = 0 BADBCD1 = 0 BADBCD2 = 0 BADTRIM1 = 0 BADTRIM2 = 0 BADBBL = 0 BADLEN = 0 BADPOL = 0 BADLQC = 0 BADNCT = 0 BADOL = 0 BADINDEL = 0 BADMISMATCH = 0 BASE_CORRECTED = 0 OVERLAPPED = 0 OVERLAP_LEN_SUM = 0 while True: r1 = read1_file.nextRead() if r1 == None: break if read2_file != None: r2 = read2_file.nextRead() if r2 == None: break if index1_file != None: i1 = index1_file.nextRead() if i1 == None: break if index2_file != None: i2 = index2_file.nextRead() 
if i2 == None: break TOTAL += 1 #barcode processing if self.options.barcode: barcodeLen1 = barcodeprocesser.detectBarcode( r1[1], self.options.barcode_length, self.options.barcode_verify) if barcodeLen1 == 0: self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADBCD1") BADBCD1 += 1 continue else: if r2 == None: barcodeprocesser.moveBarcodeToName( r1, self.options.barcode_length, self.options.barcode_verify) else: barcodeLen2 = barcodeprocesser.detectBarcode( r2[1], self.options.barcode_length, self.options.barcode_verify) if barcodeLen2 == 0: self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADBCD2") BADBCD2 += 1 continue else: barcodeprocesser.moveAndTrimPair( r1, r2, barcodeLen1, barcodeLen2, self.options.barcode_verify) #trim if self.options.trim_front > 0 or self.options.trim_tail > 0: r1 = trim(r1, self.options.trim_front, self.options.trim_tail) if len(r1[1]) < 5: self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADTRIM1") BADTRIM1 += 1 continue if r2 != None: r2 = trim(r2, self.options.trim_front2, self.options.trim_tail2) if len(r2[1]) < 5: self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADTRIM2") BADTRIM2 += 1 continue #filter debubble if self.options.debubble: if self.isInBubble(r1[0]): self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADBBL") BADBBL += 1 continue #filter sequence length if len(r1[1]) < self.options.seq_len_req: self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADLEN") BADLEN += 1 continue #check polyX if self.options.poly_size_limit > 0: poly1 = hasPolyX(r1[1], self.options.poly_size_limit, self.options.allow_mismatch_in_poly) poly2 = None if r2 != None: poly2 = hasPolyX(r2[1], self.options.poly_size_limit, self.options.allow_mismatch_in_poly) 
if poly1 != None or poly2 != None: self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADPOL") BADPOL += 1 continue #check low quality count if self.options.unqualified_base_limit > 0: lowQual1 = lowQualityNum(r1, self.options.qualified_quality_phred) lowQual2 = 0 if r2 != None: lowQual2 = lowQualityNum( r2, self.options.qualified_quality_phred) if lowQual1 > self.options.unqualified_base_limit or lowQual1 > self.options.unqualified_base_limit: self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADLQC") BADLQC += 1 continue #check N number if self.options.n_base_limit > 0: nNum1 = nNumber(r1) nNum2 = 0 if r2 != None: nNum2 = nNumber(r2) if nNum1 > self.options.n_base_limit or nNum2 > self.options.n_base_limit: self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADNCT") BADNCT += 1 continue #check overlap and do error correction if r2 != None: (offset, overlap_len, distance) = util.overlap(r1[1], r2[1]) overlap_histgram[overlap_len] += 1 # deal with the case insert DNA is shorter than read length and cause offset is negative if offset < 0 and overlap_len > 30: # shift the junk bases r1[1] = r1[1][0:overlap_len] r1[3] = r1[3][0:overlap_len] r2[1] = r2[1][-offset:-offset + overlap_len] r2[3] = r2[3][-offset:-offset + overlap_len] # then calc overlap again (offset, overlap_len, distance) = util.overlap(r1[1], r2[1]) if overlap_len > 30: OVERLAPPED += 1 distance_histgram[distance] += 1 OVERLAP_LEN_SUM += overlap_len corrected = 0 if distance > 2: self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADOL") BADOL += 1 continue elif distance > 0: #try to fix low quality base hamming = util.hammingDistance( r1[1][len(r1[1]) - overlap_len:], util.reverseComplement(r2[1][len(r2[1]) - overlap_len:])) if hamming != distance: self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, 
bad_index1_file, bad_index2_file, "BADINDEL") BADINDEL += 1 continue #print(r1[1][len(r1[1]) - overlap_len:]) #print(util.reverseComplement(r2[1][len(r2[1]) - overlap_len:])) #print(r1[3][len(r1[1]) - overlap_len:]) #print(util.reverse(r2[3][len(r2[1]) - overlap_len:])) for o in xrange(overlap_len): b1 = r1[1][len(r1[1]) - overlap_len + o] b2 = util.complement(r2[1][-o - 1]) q1 = r1[3][len(r1[3]) - overlap_len + o] q2 = r2[3][-o - 1] if b1 != b2: # print(TOTAL, o, b1, b2, q1, q2) if util.qualNum(q1) >= 27 and util.qualNum( q2) <= 16: r2[1] = util.changeString( r2[1], -o - 1, util.complement(b1)) r2[3] = util.changeString( r2[3], -o - 1, q1) corrected += 1 elif util.qualNum(q2) >= 27 and util.qualNum( q1) <= 16: r1[1] = util.changeString( r1[1], len(r1[1]) - overlap_len + o, b2) r1[3] = util.changeString( r1[3], len(r1[3]) - overlap_len + o, q2) corrected += 1 if corrected >= distance: break #print(r1[1][len(r1[1]) - overlap_len:]) #print(util.reverseComplement(r2[1][len(r2[1]) - overlap_len:])) #print(r1[3][len(r1[1]) - overlap_len:]) #print(util.reverse(r2[3][len(r2[1]) - overlap_len:])) if corrected == distance: BASE_CORRECTED += 1 else: self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADMISMATCH") BADMISMATCH += 1 continue if distance == 0 or distance == corrected: if self.options.store_overlap: self.writeReads(getOverlap(r1, overlap_len), getOverlap(r2, overlap_len), i1, i2, overlap_read1_file, overlap_read2_file, overlap_index1_file, overlap_index2_file, None) #write to good self.writeReads(r1, r2, i1, i2, good_read1_file, good_read2_file, good_index1_file, good_index2_file, None) if self.options.qc_sample <= 0 or TOTAL < self.options.qc_sample: r1qc_postfilter.statRead(r1) if r2 != None: r2qc_postfilter.statRead(r2) GOOD += 1 if self.options.qc_only and TOTAL >= self.options.qc_sample: break r1qc_postfilter.qc() r1qc_postfilter.plot(qc_dir, "R1-postfilter") if self.options.read2_file != None: 
r2qc_postfilter.qc() r2qc_postfilter.plot(qc_dir, "R2-postfilter") #close all files if not self.options.qc_only: good_read1_file.flush() bad_read1_file.flush() if self.options.read2_file != None: good_read2_file.flush() bad_read2_file.flush() if self.options.index1_file != None: good_index1_file.flush() bad_index1_file.flush() if self.options.index2_file != None: good_index2_file.flush() bad_index2_file.flush() # print stat numbers BAD = TOTAL - GOOD result = {} result['total_reads'] = TOTAL result['good_reads'] = GOOD result['bad_reads'] = BAD result['bad_reads_with_bad_barcode'] = BADBCD1 + BADBCD2 result['bad_reads_with_reads_in_bubble'] = BADBBL result['bad_reads_with_bad_read_length'] = BADLEN + BADTRIM1 + BADTRIM2 result['bad_reads_with_polyX'] = BADPOL result['bad_reads_with_low_quality'] = BADLQC result['bad_reads_with_too_many_N'] = BADNCT result['bad_reads_with_bad_overlap'] = BADOL + BADMISMATCH + BADINDEL # plot result bar figure labels = [ 'good reads', 'has_polyX', 'low_quality', 'too_short', 'too_many_N' ] counts = [GOOD, BADPOL, BADLQC, BADLEN + BADTRIM1 + BADTRIM2, BADNCT] colors = ['#66BB11', '#FF33AF', '#FFD3F2', '#FFA322', '#FF8899'] if self.options.read2_file != None: labels.append('bad_overlap') counts.append(BADOL + BADMISMATCH + BADINDEL) colors.append('#FF6600') if self.options.debubble: labels.append('in_bubble') counts.append(BADBBL) colors.append('#EEBB00') if self.options.barcode: labels.append('bad_barcode') counts.append(BADBCD1 + BADBCD2) colors.append('#CCDD22') for i in xrange(len(counts)): labels[i] = labels[i] + ": " + str(counts[i]) + "(" + str( 100.0 * float(counts[i]) / TOTAL) + "%)" fig = plt.figure(1) plt.title("Filtering statistics of sampled " + str(TOTAL) + " reads", fontsize=12, color='#666666') plt.axis('equal') patches, texts = plt.pie(counts, colors=colors, radius=0.7) patches, labels, dummy = zip(*sorted( zip(patches, labels, counts), key=lambda x: x[2], reverse=True)) plt.legend(patches, labels, loc='upper left', 
fontsize=9) plt.savefig(os.path.join(qc_dir, "filter-stat.png"), bbox_inches='tight') plt.close(1) stat = {} # stat["options"]=self.options stat["summary"] = result stat["command"] = makeDict(self.options) stat["kmer_content"] = {} stat["kmer_content"]["read1_prefilter"] = r1qc_prefilter.topKmerCount[ 0:10] stat["kmer_content"][ "read1_postfilter"] = r1qc_postfilter.topKmerCount[0:10] if self.options.read2_file != None: stat["kmer_content"][ "read2_prefilter"] = r2qc_prefilter.topKmerCount[0:10] stat["kmer_content"][ "read2_postfilter"] = r2qc_postfilter.topKmerCount[0:10] stat["overlap"] = {} stat["overlap"]['overlapped_pairs'] = OVERLAPPED if OVERLAPPED > 0: stat["overlap"]['average_overlap_length'] = float( OVERLAP_LEN_SUM / OVERLAPPED) else: stat["overlap"]['average_overlap_length'] = 0.0 stat["overlap"]['bad_edit_distance'] = BADOL stat["overlap"]['bad_mismatch_bases'] = BADMISMATCH stat["overlap"]['bad_indel'] = BADINDEL stat["overlap"][ 'reads_with_corrected_mismatch_bases'] = BASE_CORRECTED stat["overlap"][ 'overlapped_area_edit_distance_histogram'] = distance_histgram[ 0:10] plotOverlapHistgram(overlap_histgram, readLen, TOTAL, os.path.join(qc_dir, "overlap.png")) stat_file = open(os.path.join(qc_dir, "after.json"), "w") stat_json = json.dumps(stat, sort_keys=True, indent=4, separators=(',', ': ')) stat_file.write(stat_json) stat_file.close() self.addFiguresToReport(reporter) reporter.output(os.path.join(qc_dir, "report.html"))
def run(self):
    """Filter the configured FASTQ input(s) and write QC plots, stats and a report.

    Inputs come from ``self.options``: ``read1_file`` is required, while
    ``read2_file``, ``index1_file`` and ``index2_file`` are optional and are
    read in lockstep with read1.  Processing stops as soon as any of the
    opened readers returns ``None``.

    Steps performed, in order:

    1. Create ``QC/<basename of read1_file>`` next to read1 and run the
       pre-filter ``QualityControl`` statistics/plots.
    2. Resolve trim options that are set to ``-1`` via ``autoTrim()``.
    3. For every record apply the filters: barcode, trim, bubble, sequence
       length, polyX, low-quality count, N count and (paired input only)
       overlap check with mismatch correction.  A rejected record is handed
       to ``self.writeReads`` with the "bad" writers and a reason tag; an
       accepted record goes to the "good" writers and into the post-filter
       ``QualityControl`` objects.
    4. Write ``filter-stat.png``, ``after.json``, ``report.html`` and, for
       paired input, ``overlap.png`` into the QC directory.

    Side effects: ``self.options.trim_front``, ``trim_tail``, ``trim_front2``
    and ``trim_tail2`` may be overwritten (barcode mode forces
    ``trim_front`` to 0; auto-trim replaces ``-1`` values).

    Returns:
        None.
    """
    options = self.options

    if options.debubble:
        self.loadBubbleCircles()

    # read1_file is required
    read1_file = fastq.Reader(options.read1_file)

    # create a QC folder to contain the QC results
    qc_base_folder = os.path.join(os.path.dirname(options.read1_file), "QC")
    if not os.path.exists(qc_base_folder):
        os.makedirs(qc_base_folder)

    # QC result of this file/pair
    qc_dir = os.path.join(qc_base_folder, os.path.basename(options.read1_file))
    if not os.path.exists(qc_dir):
        os.makedirs(qc_dir)

    # no front trim if sequence is barcoded
    if options.barcode:
        options.trim_front = 0

    reporter = QCReporter()

    r1qc_prefilter = QualityControl(options.qc_sample, options.qc_kmer)
    r2qc_prefilter = QualityControl(options.qc_sample, options.qc_kmer)
    r1qc_prefilter.statFile(options.read1_file)
    r1qc_prefilter.plot(qc_dir, "R1-prefilter")
    if options.read2_file is not None:
        r2qc_prefilter.statFile(options.read2_file)
        r2qc_prefilter.plot(qc_dir, "R2-prefilter")

    r1qc_postfilter = QualityControl(options.qc_sample, options.qc_kmer)
    r2qc_postfilter = QualityControl(options.qc_sample, options.qc_kmer)

    # histograms are indexed by overlap length / edit distance
    readLen = r1qc_prefilter.readLen
    overlap_histgram = [0] * (readLen + 1)
    distance_histgram = [0] * (readLen + 1)

    # auto detect trim front and trim tail (-1 means "detect")
    if options.trim_front == -1 or options.trim_tail == -1:
        # auto trim for read1
        trimFront, trimTail = r1qc_prefilter.autoTrim()
        if options.trim_front == -1:
            options.trim_front = trimFront
        if options.trim_tail == -1:
            options.trim_tail = trimTail

        # auto trim for read2
        if options.read2_file is not None:
            # check if we should keep same trimming for read1/read2 to keep
            # their length identical; this option is on by default because
            # lots of dedup algorithms require this feature
            if options.trim_pair_same:
                options.trim_front2 = options.trim_front
                options.trim_tail2 = options.trim_tail
            else:
                trimFront2, trimTail2 = r2qc_prefilter.autoTrim()
                if options.trim_front2 == -1:
                    options.trim_front2 = trimFront2
                if options.trim_tail2 == -1:
                    options.trim_tail2 = trimTail2

    print(options.read1_file + " options:")
    print(options)

    # every output folder defaults to the folder of the read1 file
    read1_dir = os.path.dirname(options.read1_file)

    good_dir = options.good_output_folder
    if good_dir is None:
        good_dir = read1_dir

    bad_dir = options.bad_output_folder
    if bad_dir is None:
        bad_dir = read1_dir

    overlap_dir = options.overlap_output_folder
    if overlap_dir is None:
        overlap_dir = read1_dir

    if not os.path.exists(good_dir):
        os.makedirs(good_dir)

    if not os.path.exists(bad_dir):
        os.makedirs(bad_dir)

    if (options.store_overlap and options.read2_file is not None
            and not os.path.exists(overlap_dir)):
        os.makedirs(overlap_dir)

    good_read1_file = None
    bad_read1_file = None
    overlap_read1_file = None
    if not options.qc_only:
        read1_name = getMainName(options.read1_file)
        good_read1_file = fastq.Writer(os.path.join(good_dir, read1_name + ".good.fq"))
        bad_read1_file = fastq.Writer(os.path.join(bad_dir, read1_name + ".bad.fq"))
        if options.store_overlap:
            overlap_read1_file = fastq.Writer(
                os.path.join(overlap_dir, read1_name + ".overlap.fq"))

    # other files are optional
    read2_file = None
    good_read2_file = None
    bad_read2_file = None
    overlap_read2_file = None

    index1_file = None
    good_index1_file = None
    bad_index1_file = None
    overlap_index1_file = None

    index2_file = None
    good_index2_file = None
    bad_index2_file = None
    overlap_index2_file = None

    # if other files are specified, then read them
    if options.read2_file is not None:
        read2_file = fastq.Reader(options.read2_file)
        if not options.qc_only:
            read2_name = getMainName(options.read2_file)
            good_read2_file = fastq.Writer(os.path.join(good_dir, read2_name + ".good.fq"))
            bad_read2_file = fastq.Writer(os.path.join(bad_dir, read2_name + ".bad.fq"))
            if options.store_overlap and options.read2_file is not None:
                overlap_read2_file = fastq.Writer(
                    os.path.join(overlap_dir, read2_name + ".overlap.fq"))

    if options.index1_file is not None:
        index1_file = fastq.Reader(options.index1_file)
        if not options.qc_only:
            index1_name = getMainName(options.index1_file)
            good_index1_file = fastq.Writer(os.path.join(good_dir, index1_name + ".good.fq"))
            bad_index1_file = fastq.Writer(os.path.join(bad_dir, index1_name + ".bad.fq"))
            if options.store_overlap and options.read2_file is not None:
                overlap_index1_file = fastq.Writer(
                    os.path.join(overlap_dir, index1_name + ".overlap.fq"))

    if options.index2_file is not None:
        index2_file = fastq.Reader(options.index2_file)
        if not options.qc_only:
            index2_name = getMainName(options.index2_file)
            good_index2_file = fastq.Writer(os.path.join(good_dir, index2_name + ".good.fq"))
            bad_index2_file = fastq.Writer(os.path.join(bad_dir, index2_name + ".bad.fq"))
            if options.store_overlap and options.read2_file is not None:
                overlap_index2_file = fastq.Writer(
                    os.path.join(overlap_dir, index2_name + ".overlap.fq"))

    # r2/i1/i2 stay None for the whole run when their reader is absent
    r1 = None
    r2 = None
    i1 = None
    i2 = None

    # stat numbers
    TOTAL = 0
    GOOD = 0
    BADBCD1 = 0
    BADBCD2 = 0
    BADTRIM1 = 0
    BADTRIM2 = 0
    BADBBL = 0
    BADLEN = 0
    BADPOL = 0
    BADLQC = 0
    BADNCT = 0
    BADOL = 0
    BADINDEL = 0
    BADMISMATCH = 0
    BASE_CORRECTED = 0
    OVERLAPPED = 0
    OVERLAP_LEN_SUM = 0

    while True:
        r1 = read1_file.nextRead()
        if r1 is None:
            break

        if read2_file is not None:
            r2 = read2_file.nextRead()
            if r2 is None:
                break
        if index1_file is not None:
            i1 = index1_file.nextRead()
            if i1 is None:
                break
        if index2_file is not None:
            i2 = index2_file.nextRead()
            if i2 is None:
                break

        TOTAL += 1

        # barcode processing
        if options.barcode:
            barcodeLen1 = barcodeprocesser.detectBarcode(
                r1[1], options.barcode_length, options.barcode_verify)
            if barcodeLen1 == 0:
                self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                                bad_index1_file, bad_index2_file, "BADBCD1")
                BADBCD1 += 1
                continue
            if r2 is None:
                barcodeprocesser.moveBarcodeToName(
                    r1, options.barcode_length, options.barcode_verify)
            else:
                barcodeLen2 = barcodeprocesser.detectBarcode(
                    r2[1], options.barcode_length, options.barcode_verify)
                if barcodeLen2 == 0:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                                    bad_index1_file, bad_index2_file, "BADBCD2")
                    BADBCD2 += 1
                    continue
                barcodeprocesser.moveAndTrimPair(
                    r1, r2, barcodeLen1, barcodeLen2, options.barcode_verify)

        # trim
        # NOTE(review): read2 trimming is only reached when a read1 trim is
        # configured -- confirm this gating is intended.
        if options.trim_front > 0 or options.trim_tail > 0:
            r1 = trim(r1, options.trim_front, options.trim_tail)
            if len(r1[1]) < 5:
                self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                                bad_index1_file, bad_index2_file, "BADTRIM1")
                BADTRIM1 += 1
                continue
            if r2 is not None:
                r2 = trim(r2, options.trim_front2, options.trim_tail2)
                if len(r2[1]) < 5:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                                    bad_index1_file, bad_index2_file, "BADTRIM2")
                    BADTRIM2 += 1
                    continue

        # filter debubble
        if options.debubble and self.isInBubble(r1[0]):
            self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                            bad_index1_file, bad_index2_file, "BADBBL")
            BADBBL += 1
            continue

        # filter sequence length
        if len(r1[1]) < options.seq_len_req:
            self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                            bad_index1_file, bad_index2_file, "BADLEN")
            BADLEN += 1
            continue

        # check polyX
        if options.poly_size_limit > 0:
            poly1 = hasPolyX(r1[1], options.poly_size_limit,
                             options.allow_mismatch_in_poly)
            poly2 = None
            if r2 is not None:
                poly2 = hasPolyX(r2[1], options.poly_size_limit,
                                 options.allow_mismatch_in_poly)
            if poly1 is not None or poly2 is not None:
                self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                                bad_index1_file, bad_index2_file, "BADPOL")
                BADPOL += 1
                continue

        # check low quality count
        if options.unqualified_base_limit > 0:
            lowQual1 = lowQualityNum(r1, options.qualified_quality_phred)
            lowQual2 = 0
            if r2 is not None:
                lowQual2 = lowQualityNum(r2, options.qualified_quality_phred)
            # Either mate exceeding the limit rejects the record (the second
            # operand previously re-tested lowQual1, so read2 was ignored).
            if (lowQual1 > options.unqualified_base_limit
                    or lowQual2 > options.unqualified_base_limit):
                self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                                bad_index1_file, bad_index2_file, "BADLQC")
                BADLQC += 1
                continue

        # check N number
        if options.n_base_limit > 0:
            nNum1 = nNumber(r1)
            nNum2 = 0
            if r2 is not None:
                nNum2 = nNumber(r2)
            if nNum1 > options.n_base_limit or nNum2 > options.n_base_limit:
                self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                                bad_index1_file, bad_index2_file, "BADNCT")
                BADNCT += 1
                continue

        # check overlap and do error correction
        if r2 is not None:
            (offset, overlap_len, distance) = util.overlap(r1[1], r2[1])
            overlap_histgram[overlap_len] += 1

            # deal with the case insert DNA is shorter than read length and
            # cause offset is negative
            if offset < 0 and overlap_len > 30:
                # shift the junk bases
                r1[1] = r1[1][0:overlap_len]
                r1[3] = r1[3][0:overlap_len]
                r2[1] = r2[1][-offset:-offset + overlap_len]
                r2[3] = r2[3][-offset:-offset + overlap_len]
                # then calc overlap again
                (offset, overlap_len, distance) = util.overlap(r1[1], r2[1])

            if overlap_len > 30:
                OVERLAPPED += 1
                distance_histgram[distance] += 1
                OVERLAP_LEN_SUM += overlap_len
                corrected = 0

                if distance > 2:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                                    bad_index1_file, bad_index2_file, "BADOL")
                    BADOL += 1
                    continue
                elif distance > 0:
                    # try to fix low quality base
                    hamming = util.hammingDistance(
                        r1[1][len(r1[1]) - overlap_len:],
                        util.reverseComplement(r2[1][len(r2[1]) - overlap_len:]))
                    # edit distance differing from hamming distance is
                    # rejected under the indel tag
                    if hamming != distance:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                                        bad_index1_file, bad_index2_file, "BADINDEL")
                        BADINDEL += 1
                        continue

                    # walk the overlapped area: r1 forwards, r2 backwards
                    for o in xrange(overlap_len):
                        b1 = r1[1][len(r1[1]) - overlap_len + o]
                        b2 = util.complement(r2[1][-o - 1])
                        q1 = r1[3][len(r1[3]) - overlap_len + o]
                        q2 = r2[3][-o - 1]
                        if b1 != b2:
                            # copy base and quality from the mate scoring
                            # >= 27 onto the mate scoring <= 16
                            if util.qualNum(q1) >= 27 and util.qualNum(q2) <= 16:
                                r2[1] = util.changeString(r2[1], -o - 1, util.complement(b1))
                                r2[3] = util.changeString(r2[3], -o - 1, q1)
                                corrected += 1
                            elif util.qualNum(q2) >= 27 and util.qualNum(q1) <= 16:
                                r1[1] = util.changeString(r1[1], len(r1[1]) - overlap_len + o, b2)
                                r1[3] = util.changeString(r1[3], len(r1[3]) - overlap_len + o, q2)
                                corrected += 1
                            if corrected >= distance:
                                break

                    if corrected == distance:
                        BASE_CORRECTED += 1
                    else:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                                        bad_index1_file, bad_index2_file, "BADMISMATCH")
                        BADMISMATCH += 1
                        continue

                if distance == 0 or distance == corrected:
                    if options.store_overlap:
                        self.writeReads(getOverlap(r1, overlap_len),
                                        getOverlap(r2, overlap_len), i1, i2,
                                        overlap_read1_file, overlap_read2_file,
                                        overlap_index1_file, overlap_index2_file, None)

        # write to good
        self.writeReads(r1, r2, i1, i2, good_read1_file, good_read2_file,
                        good_index1_file, good_index2_file, None)
        r1qc_postfilter.statRead(r1)
        if r2 is not None:
            r2qc_postfilter.statRead(r2)

        GOOD += 1
        # in QC-only mode stop once qc_sample records have been read
        if options.qc_only and TOTAL >= options.qc_sample:
            break

    r1qc_postfilter.qc()
    r1qc_postfilter.plot(qc_dir, "R1-postfilter")
    if options.read2_file is not None:
        r2qc_postfilter.qc()
        r2qc_postfilter.plot(qc_dir, "R2-postfilter")

    # flush all output files (writers only exist when not in QC-only mode)
    if not options.qc_only:
        good_read1_file.flush()
        bad_read1_file.flush()
        if options.read2_file is not None:
            good_read2_file.flush()
            bad_read2_file.flush()
        if options.index1_file is not None:
            good_index1_file.flush()
            bad_index1_file.flush()
        if options.index2_file is not None:
            good_index2_file.flush()
            bad_index2_file.flush()
        # overlap writers were previously never flushed
        for overlap_writer in (overlap_read1_file, overlap_read2_file,
                               overlap_index1_file, overlap_index2_file):
            if overlap_writer is not None:
                overlap_writer.flush()

    # stat numbers
    BAD = TOTAL - GOOD
    result = {}
    result['total_reads'] = TOTAL
    result['good_reads'] = GOOD
    result['bad_reads'] = BAD
    result['bad_reads_with_bad_barcode'] = BADBCD1 + BADBCD2
    result['bad_reads_with_reads_in_bubble'] = BADBBL
    result['bad_reads_with_bad_read_length'] = BADLEN + BADTRIM1 + BADTRIM2
    result['bad_reads_with_polyX'] = BADPOL
    result['bad_reads_with_low_quality'] = BADLQC
    result['bad_reads_with_too_many_N'] = BADNCT
    result['bad_reads_with_bad_overlap'] = BADOL + BADMISMATCH + BADINDEL

    # plot result bar figure
    labels = ['good reads', 'has_polyX', 'low_quality', 'too_short', 'too_many_N']
    counts = [GOOD, BADPOL, BADLQC, BADLEN + BADTRIM1 + BADTRIM2, BADNCT]
    colors = ['green', '#FF1111', '#FF3333', '#FF5555', '#FF7777']
    if options.read2_file is not None:
        labels.append('bad_overlap')
        counts.append(BADOL + BADMISMATCH + BADINDEL)
        colors.append('#FF9999')
    if options.debubble:
        labels.append('in_bubble')
        counts.append(BADBBL)
        colors.append('#FFBBBB')
    if options.barcode:
        labels.append('bad_barcode')
        counts.append(BADBCD1 + BADBCD2)
        colors.append('#FFDDDD')

    fig = plt.figure(1)
    plt.title("Good reads (green) and bad reads (red) of total " + str(TOTAL))
    fig.subplots_adjust(left=0.14)
    lefts = xrange(len(counts))
    plt.yticks(lefts, labels)
    plt.ylim(-0.5, len(counts) - 0.5)
    plt.barh(lefts, counts, align='center', height=0.5, alpha=0.8, color=colors)
    plt.savefig(os.path.join(qc_dir, "filter-stat.png"))
    plt.close(1)

    stat = {}
    stat["summary"] = result
    stat["command"] = makeDict(options)
    stat["kmer_content"] = {}
    stat["kmer_content"]["read1_prefilter"] = r1qc_prefilter.topKmerCount[0:10]
    stat["kmer_content"]["read1_postfilter"] = r1qc_postfilter.topKmerCount[0:10]
    if options.read2_file is not None:
        stat["kmer_content"]["read2_prefilter"] = r2qc_prefilter.topKmerCount[0:10]
        stat["kmer_content"]["read2_postfilter"] = r2qc_postfilter.topKmerCount[0:10]

        # overlap counters are only ever updated for paired records
        stat["overlap"] = {}
        stat["overlap"]['overlapped_pairs'] = OVERLAPPED
        if OVERLAPPED > 0:
            # convert before dividing so the mean is not truncated by
            # integer division
            stat["overlap"]['average_overlap_length'] = float(OVERLAP_LEN_SUM) / OVERLAPPED
        else:
            stat["overlap"]['average_overlap_length'] = 0.0
        stat["overlap"]['bad_edit_distance'] = BADOL
        stat["overlap"]['bad_mismatch_bases'] = BADMISMATCH
        stat["overlap"]['bad_indel'] = BADINDEL
        stat["overlap"]['reads_with_corrected_mismatch_bases'] = BASE_CORRECTED
        stat["overlap"]['overlapped_area_edit_distance_histogram'] = distance_histgram[0:10]
        plotOverlapHistgram(overlap_histgram, readLen, TOTAL,
                            os.path.join(qc_dir, "overlap.png"))

    stat_json = json.dumps(stat, sort_keys=True, indent=4, separators=(',', ': '))
    # the context manager closes the file even if the write fails
    with open(os.path.join(qc_dir, "after.json"), "w") as stat_file:
        stat_file.write(stat_json)

    self.addFiguresToReport(reporter)
    reporter.output(os.path.join(qc_dir, "report.html"))