def statFile(self, filename):
    """Collect per-read statistics from a FASTQ file, then run ``self.qc()``.

    The leading reads are set aside instead of being analysed immediately.
    They are only passed to ``self.statRead`` afterwards when fewer than
    ``READ_TO_SKIP`` later reads were counted.

    Args:
        filename: path handed to ``fastq.Reader``; also stored on
            ``self.filename``.
    """
    self.filename = filename
    READ_TO_SKIP = 1000
    reader = fastq.Reader(filename)
    stat_reads_num = 0
    skipped_reads = []

    # sample up to self.sampleLimit reads (the limit only applies when > 0)
    while True:
        read = reader.nextRead()
        # nextRead() returning None ends the loop; compare by identity
        if read is None:
            break
        self.readCount += 1

        # the first reads are usually not stable, so hold them back
        # NOTE(review): if readCount starts at 0 this holds back 999 reads,
        # not 1000 -- confirm the intended boundary.
        if self.readCount < READ_TO_SKIP:
            skipped_reads.append(read)
            continue

        stat_reads_num += 1
        if stat_reads_num > self.sampleLimit and self.sampleLimit > 0:
            break
        self.statRead(read)

    # if the fq file is too small, stat the held-back reads as well
    if stat_reads_num < READ_TO_SKIP:
        for read in skipped_reads:
            self.statRead(read)

    self.qc()
def extract(self):
    """Collect per-read statistics from ``self.filename``.

    The leading reads are set aside instead of being analysed immediately.
    They are only passed to ``self.stat_read`` afterwards when fewer than
    ``READ_TO_SKIP`` later reads were counted.  Finishes by calling
    ``self.calc_read_len()`` and ``self.calc_percents()``.
    """
    # NOTE(review): READ_TO_SKIP is not defined inside this function;
    # presumably a module-level constant -- verify it exists.
    reader = fastq.Reader(self.filename)
    stat_reads_num = 0
    skipped_reads = []

    # sample up to self.sample_limit reads (the limit only applies when > 0)
    while True:
        read = reader.nextRead()
        # nextRead() returning None ends the loop; compare by identity
        if read is None:
            break
        self.read_count += 1

        # the first reads are usually not stable, so hold them back
        if self.read_count < READ_TO_SKIP:
            skipped_reads.append(read)
            continue

        stat_reads_num += 1
        if stat_reads_num > self.sample_limit and self.sample_limit > 0:
            break
        self.stat_read(read)

    # if the fq file is too small, stat the held-back reads as well
    if stat_reads_num < READ_TO_SKIP:
        for read in skipped_reads:
            self.stat_read(read)

    self.calc_read_len()
    self.calc_percents()
def statFile(self, filename):
    """Feed reads from a FASTQ file to ``self.statRead``, then run ``self.qc()``.

    Reading stops at end of input or, when ``self.sampleLimit`` is positive,
    once more than ``self.sampleLimit`` reads have been counted.

    Args:
        filename: path handed to ``fastq.Reader``.
    """
    reader = fastq.Reader(filename)

    # sample up to self.sampleLimit reads for stat
    while True:
        read = reader.nextRead()
        # Test for end of input *before* counting, so the None sentinel is
        # not counted as a read (readCount used to end one too high).
        if read is None:
            break
        self.readCount += 1
        if self.sampleLimit > 0 and self.readCount > self.sampleLimit:
            break
        self.statRead(read)

    self.qc()
def stat(filename):
    """Print total, Q20 and Q30 base counts and percentages for a FASTQ file.

    Args:
        filename: path handed to ``fastq.Reader``.

    An input without any bases reports 0.0 for both percentages instead of
    raising ``ZeroDivisionError``.
    """
    reader = fastq.Reader(filename)
    total_count = 0
    q20_count = 0
    q30_count = 0

    while True:
        read = reader.nextRead()
        if not read:
            break
        # field 3 of a read is what qual_stat() scores
        # (presumably the quality string -- confirm against fastq.Reader)
        quality = read[3]
        total_count += len(quality)
        q20, q30 = qual_stat(quality)
        q20_count += q20
        q30_count += q30

    if total_count > 0:
        q20_percent = 100 * float(q20_count) / float(total_count)
        q30_percent = 100 * float(q30_count) / float(total_count)
    else:
        # empty input: nothing to divide by
        q20_percent = 0.0
        q30_percent = 0.0

    print("total bases:", total_count)
    print("q20 bases:", q20_count)
    print("q30 bases:", q30_count)
    print("q20 percents:", q20_percent)
    print("q30 percents:", q30_percent)
def stat(filename):
    """Print a two-line, tab-separated quality summary of a FASTQ file.

    The first line is a fixed header.  The second carries total bases,
    Q20/Q30 base counts and percentages, GC percentage, the observed
    read-length range and the mean read length.

    Args:
        filename: path handed to ``fastq.Reader``.

    The length range is taken from the reads actually seen (it used to be
    seeded with 150 on both ends).  An empty input prints zeros instead of
    raising ``ZeroDivisionError``.
    """
    reader = fastq.Reader(filename)
    total_count = 0
    q20_count = 0
    q30_count = 0
    min_len = None
    max_len = None
    read_num = 0
    Count_GC = 0

    while True:
        read = reader.nextRead()
        if read is None:
            break
        read_len = len(read[3])
        total_count += read_len
        if min_len is None or read_len < min_len:
            min_len = read_len
        if max_len is None or read_len > max_len:
            max_len = read_len
        read_num += 1
        q20, q30 = qual_stat(read[3])
        q20_count += q20
        q30_count += q30
        Count_GC += getContentOf(['G', 'C', 'g', 'c'], read[1])

    if read_num == 0:
        # no reads at all: report an empty range
        min_len = 0
        max_len = 0

    def percent(count):
        # share of all bases, guarded against an input without bases
        if total_count > 0:
            return 100 * float(count) / float(total_count)
        return 0.0

    if read_num > 0:
        mean_len = float(total_count) / float(read_num)
    else:
        mean_len = 0.0

    print("Total bases\tQ20 bases\tQ30 bases\tQ20 percents\tq30 percents\tGC Cntent\tRead Length Distributon\tMean Read Length")
    fields = [
        str(total_count),
        str(q20_count),
        str(q30_count),
        str(percent(q20_count)),
        str(percent(q30_count)),
        str(percent(Count_GC)),
        "%s - %s" % (min_len, max_len),
        str(mean_len),
    ]
    # The former comma-separated print statement emitted " \t" between
    # columns; keep that exact separator so the row layout is unchanged.
    print(" \t".join(fields))
def stat(filename, output):
    """Write the total read count and the count of reads scoring >= 30.

    Modified to output total sequences and sequences with >=30 avg quality.

    Args:
        filename: path handed to ``fastq.Reader``; also written as the first
            line of the report.
        output: path of the text report to create (overwritten if present).

    The report has three lines -- file name, ``total sequences: N`` and
    ``q30 reads: M`` -- with no trailing newline.
    """
    reader = fastq.Reader(filename)
    seq_count = 0
    q30_seq_count = 0

    while True:
        read = reader.nextRead()
        if read is None:
            break
        # qual_stat() yields one score per read here
        # (presumably the average quality -- confirm against qual_stat)
        q30_seq = qual_stat(read[3])
        seq_count += 1
        if q30_seq >= 30:
            q30_seq_count += 1

    filename = str(filename)
    total_seqs = "total sequences: " + str(seq_count)
    q30_seqs = "q30 reads: " + str(q30_seq_count)

    # the context manager closes the handle even if the write fails
    with open(output, "w") as q30_out:
        q30_out.writelines([filename, "\n", total_seqs, "\n", q30_seqs])
def run(self):
    """Run pre-filter QC, filter every read (pair) and write the QC reports.

    Steps, in order:

    1. Sample the input file(s) for pre-filter QC and replace any trim
       setting of -1 with the values returned by ``autoTrim()``.
    2. Resolve and create the good / bad / overlap / QC folders and open the
       output writers (no writers are opened when ``options.qc_only`` is set).
    3. For each read (pair): barcode handling, trimming, bubble filter,
       length filter, polyX filter, low-quality filter, N-count filter and,
       for pairs, overlap analysis with adapter trimming and mismatch
       correction / masking.  Rejected reads are passed to ``writeReads``
       with the "bad" writers and a reason tag; the rest go to the "good"
       writers and are sampled for post-filter QC.
    4. Write ``<read1 basename>.json`` and ``<read1 basename>.html`` into
       the QC folder.

    Side effects: mutates ``self.options`` (trim settings) and sets
    ``self.r1qc_prefilter``, ``self.r2qc_prefilter``,
    ``self.r1qc_postfilter`` and ``self.r2qc_postfilter``.
    """
    opts = self.options
    paired = opts.read2_file is not None

    if opts.debubble:
        self.loadBubbleCircles()

    # read1_file is required
    read1_file = fastq.Reader(opts.read1_file)

    # no front trim if sequence is barcoded
    if opts.barcode:
        opts.trim_front = 0

    reporter = QCReporter()

    self.r1qc_prefilter = QualityControl(opts.qc_sample, opts.qc_kmer)
    self.r2qc_prefilter = QualityControl(opts.qc_sample, opts.qc_kmer)
    self.r1qc_prefilter.statFile(opts.read1_file)
    if paired:
        self.r2qc_prefilter.statFile(opts.read2_file)

    self.r1qc_postfilter = QualityControl(opts.qc_sample, opts.qc_kmer)
    self.r2qc_postfilter = QualityControl(opts.qc_sample, opts.qc_kmer)

    readLen = self.r1qc_prefilter.readLen
    overlap_histgram = [0] * (readLen + 1)
    distance_histgram = [0] * (readLen + 1)

    # auto detect trim front and trim tail
    if opts.trim_front == -1 or opts.trim_tail == -1:
        # auto trim for read1
        trimFront, trimTail = self.r1qc_prefilter.autoTrim()
        if opts.trim_front == -1:
            opts.trim_front = trimFront
        if opts.trim_tail == -1:
            opts.trim_tail = trimTail

        # auto trim for read2
        if paired:
            # check if we should keep same trimming for read1/read2 to keep
            # their length identical; this option is on by default because
            # lots of dedup algorithms require this feature
            if opts.trim_pair_same:
                opts.trim_front2 = opts.trim_front
                opts.trim_tail2 = opts.trim_tail
            else:
                trimFront2, trimTail2 = self.r2qc_prefilter.autoTrim()
                if opts.trim_front2 == -1:
                    opts.trim_front2 = trimFront2
                if opts.trim_tail2 == -1:
                    opts.trim_tail2 = trimTail2

    print(opts.read1_file + " options:")
    print(opts)

    # ---- output folders ------------------------------------------------
    # good folder defaults to the folder of the read1 file
    good_dir = opts.good_output_folder
    if good_dir is None:
        good_dir = os.path.dirname(opts.read1_file)

    # parent of good_dir: default home of the "bad", "overlap" and "QC" folders
    base_dir = os.path.dirname(os.path.dirname(good_dir + "/"))

    bad_dir = opts.bad_output_folder
    if bad_dir is None:
        bad_dir = os.path.join(base_dir, "bad")

    overlap_dir = opts.overlap_output_folder
    if overlap_dir is None:
        overlap_dir = os.path.join(base_dir, "overlap")

    qc_dir = opts.report_output_folder
    if qc_dir is None:
        qc_dir = os.path.join(base_dir, "QC")

    if not os.path.exists(qc_dir):
        os.makedirs(qc_dir)
    if not os.path.exists(good_dir):
        os.makedirs(good_dir)
    if not os.path.exists(bad_dir):
        os.makedirs(bad_dir)
    if opts.store_overlap and paired and not os.path.exists(overlap_dir):
        os.makedirs(overlap_dir)

    # ---- readers and writers -------------------------------------------
    gzip_out = opts.gzip
    gzip_comp = opts.compression
    # a gzipped input switches gzip output on
    if not gzip_out and opts.read1_file.endswith(".gz"):
        gzip_out = True

    def new_writer(folder, source_path, suffix):
        # open <folder>/<main name of source_path><suffix> for writing
        return fastq.Writer(
            os.path.join(folder, getMainName(source_path) + suffix),
            gzip_out, gzip_comp)

    write_output = not opts.qc_only

    good_read1_file = None
    bad_read1_file = None
    overlap_read1_file = None
    if write_output:
        good_read1_file = new_writer(good_dir, opts.read1_file, ".good.fq")
        bad_read1_file = new_writer(bad_dir, opts.read1_file, ".bad.fq")
        if opts.store_overlap:
            overlap_read1_file = new_writer(overlap_dir, opts.read1_file, ".overlap.fq")

    # other files are optional
    read2_file = None
    good_read2_file = None
    bad_read2_file = None
    overlap_read2_file = None

    index1_file = None
    good_index1_file = None
    bad_index1_file = None
    overlap_index1_file = None

    index2_file = None
    good_index2_file = None
    bad_index2_file = None
    overlap_index2_file = None

    # if other files are specified, then read them
    if paired:
        read2_file = fastq.Reader(opts.read2_file)
        if write_output:
            good_read2_file = new_writer(good_dir, opts.read2_file, ".good.fq")
            bad_read2_file = new_writer(bad_dir, opts.read2_file, ".bad.fq")
            if opts.store_overlap:
                overlap_read2_file = new_writer(overlap_dir, opts.read2_file, ".overlap.fq")

    if opts.index1_file is not None:
        index1_file = fastq.Reader(opts.index1_file)
        if write_output:
            good_index1_file = new_writer(good_dir, opts.index1_file, ".good.fq")
            bad_index1_file = new_writer(bad_dir, opts.index1_file, ".bad.fq")
            if opts.store_overlap and paired:
                overlap_index1_file = new_writer(overlap_dir, opts.index1_file, ".overlap.fq")

    if opts.index2_file is not None:
        index2_file = fastq.Reader(opts.index2_file)
        if write_output:
            good_index2_file = new_writer(good_dir, opts.index2_file, ".good.fq")
            bad_index2_file = new_writer(bad_dir, opts.index2_file, ".bad.fq")
            if opts.store_overlap and paired:
                overlap_index2_file = new_writer(overlap_dir, opts.index2_file, ".overlap.fq")

    r1 = None
    r2 = None
    i1 = None
    i2 = None

    def discard(tag):
        # hand the current read set to the "bad" writers with a reason tag;
        # r1/r2/i1/i2 are looked up at call time, so trimmed reads are used
        self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                        bad_index1_file, bad_index2_file, tag)

    # ---- stat numbers --------------------------------------------------
    TOTAL_BASES = 0
    GOOD_BASES = 0
    TOTAL_READS = 0
    GOOD_READS = 0
    BAD_READS = 0
    BADBCD1 = 0
    BADBCD2 = 0
    BADTRIM1 = 0
    BADTRIM2 = 0
    BADBBL = 0
    BADLEN = 0
    BADPOL = 0
    BADLQC = 0
    BADNCT = 0
    BADINDEL = 0
    BADMISMATCH = 0
    BADDIFF = 0
    READ_CORRECTED = 0
    BASE_CORRECTED = 0
    BASE_SKIPPED_CORRECTION = 0
    BASE_ZERO_QUAL_MASKED = 0
    OVERLAPPED = 0
    OVERLAP_LEN_SUM = 0
    OVERLAP_BASE_SUM = 0
    # error profiling by overlap analysis
    OVERLAP_BASE_ERR = 0
    OVERLAP_ERR_MATRIX = init_error_matrix()
    # adapter trimming by overlap analysis
    TRIMMED_ADAPTER_BASE = 0
    TRIMMED_ADAPTER_READ = 0

    # ---- main filtering loop -------------------------------------------
    while True:
        r1 = read1_file.nextRead()
        if r1 is None:
            break
        TOTAL_BASES += len(r1[1])

        if read2_file is not None:
            r2 = read2_file.nextRead()
            if r2 is None:
                break
            # read2 bases are counted whenever read2 exists (this used to
            # hinge on the index2 read instead)
            TOTAL_BASES += len(r2[1])
        if index1_file is not None:
            i1 = index1_file.nextRead()
            if i1 is None:
                break
        if index2_file is not None:
            i2 = index2_file.nextRead()
            if i2 is None:
                break

        TOTAL_READS += 1

        # barcode processing
        if opts.barcode:
            barcodeLen1 = barcodeprocesser.detectBarcode(
                r1[1], opts.barcode_length, opts.barcode_verify)
            if barcodeLen1 == 0:
                discard("BADBCD1")
                BADBCD1 += 1
                continue
            if r2 is None:
                barcodeprocesser.moveBarcodeToName(
                    r1, opts.barcode_length, opts.barcode_verify)
            else:
                barcodeLen2 = barcodeprocesser.detectBarcode(
                    r2[1], opts.barcode_length, opts.barcode_verify)
                if barcodeLen2 == 0:
                    discard("BADBCD2")
                    BADBCD2 += 1
                    continue
                barcodeprocesser.moveAndTrimPair(
                    r1, r2, barcodeLen1, barcodeLen2, opts.barcode_verify)

        # trim
        if opts.trim_front > 0 or opts.trim_tail > 0:
            r1 = trim(r1, opts.trim_front, opts.trim_tail)
            if len(r1[1]) < 5:
                discard("BADTRIM1")
                BADTRIM1 += 1
                continue
            if r2 is not None:
                r2 = trim(r2, opts.trim_front2, opts.trim_tail2)
                if len(r2[1]) < 5:
                    discard("BADTRIM2")
                    BADTRIM2 += 1
                    continue

        # filter debubble
        if opts.debubble:
            if self.isInBubble(r1[0]):
                discard("BADBBL")
                BADBBL += 1
                continue

        # filter sequence length
        if len(r1[1]) < opts.seq_len_req:
            discard("BADLEN")
            BADLEN += 1
            continue

        # check polyX
        if opts.poly_size_limit > 0:
            poly1 = hasPolyX(r1[1], opts.poly_size_limit,
                             opts.allow_mismatch_in_poly)
            poly2 = None
            if r2 is not None:
                poly2 = hasPolyX(r2[1], opts.poly_size_limit,
                                 opts.allow_mismatch_in_poly)
            if poly1 is not None or poly2 is not None:
                discard("BADPOL")
                BADPOL += 1
                continue

        # check low quality count
        if opts.unqualified_base_limit > 0:
            lowQual1 = lowQualityNum(r1, opts.qualified_quality_phred)
            lowQual2 = 0
            if r2 is not None:
                lowQual2 = lowQualityNum(r2, opts.qualified_quality_phred)
            # both mates are checked (lowQual1 used to be tested twice)
            if (lowQual1 > opts.unqualified_base_limit
                    or lowQual2 > opts.unqualified_base_limit):
                discard("BADLQC")
                BADLQC += 1
                continue

        # check N number
        if opts.n_base_limit > 0:
            nNum1 = nNumber(r1)
            nNum2 = 0
            if r2 is not None:
                nNum2 = nNumber(r2)
            if nNum1 > opts.n_base_limit or nNum2 > opts.n_base_limit:
                discard("BADNCT")
                BADNCT += 1
                continue

        # check overlap and do error correction
        if r2 is not None and (not opts.no_overlap):
            (offset, overlap_len, distance) = util.overlap(r1[1], r2[1])
            overlap_histgram[overlap_len] += 1

            # deal with the case insert DNA is shorter than read length and
            # cause offset is negative; in this case the adapter is
            # sequenced and should be trimmed
            if offset < 0 and overlap_len > 30:
                # shift the junk bases
                r1[1] = r1[1][0:overlap_len]
                r1[3] = r1[3][0:overlap_len]
                r2[1] = r2[1][0:overlap_len]
                r2[3] = r2[3][0:overlap_len]
                TRIMMED_ADAPTER_BASE += abs(offset) * 2
                TRIMMED_ADAPTER_READ += 1
                # check the sequence length again after adapter trimmed
                if len(r1[1]) < opts.seq_len_req:
                    discard("BADLEN")
                    BADLEN += 1
                    continue
                # then calc overlap again
                (offset, overlap_len, distance) = util.overlap(r1[1], r2[1])

            distance_histgram[distance] += 1
            # if distance is too high, then set it as bad mismatch
            if distance > 3:
                discard("BADDIFF")
                BADDIFF += 1
                continue

            if overlap_len > 30:
                OVERLAPPED += 1
                OVERLAP_LEN_SUM += overlap_len
                # we consider the distance is caused by sequencing error
                OVERLAP_BASE_SUM += overlap_len * 2
                OVERLAP_BASE_ERR += distance
                corrected = 0
                zero_qual_masked = 0
                skipped_mismatch = 0

                if distance > 0:
                    # try to fix low quality base
                    err_mtx = init_error_matrix()
                    for o in xrange(overlap_len):
                        # position o of the overlap counted from its start
                        # in read1 and from the end in read2
                        seq_pos = len(r1[1]) - overlap_len + o
                        qual_pos = len(r1[3]) - overlap_len + o
                        mate_pos = -o - 1
                        b1 = r1[1][seq_pos]
                        b2 = util.complement(r2[1][mate_pos])
                        q1 = r1[3][qual_pos]
                        q2 = r2[3][mate_pos]
                        if b1 == b2:
                            continue

                        this_is_corrected = False
                        qnum1 = util.qualNum(q1)
                        qnum2 = util.qualNum(q2)
                        if qnum1 >= 30 and qnum2 <= 14:
                            # read1 is trusted, read2 is the suspect
                            if b1 != 'N' and b2 != 'N':
                                err_mtx[util.complement(b1)][util.complement(b2)] += 1
                            if not opts.no_correction:
                                r2[1] = util.changeString(r2[1], mate_pos, util.complement(b1))
                                r2[3] = util.changeString(r2[3], mate_pos, q1)
                                corrected += 1
                                this_is_corrected = True
                        elif qnum2 >= 30 and qnum1 <= 14:
                            # read2 is trusted, read1 is the suspect
                            if b1 != 'N' and b2 != 'N':
                                err_mtx[b2][b1] += 1
                            if not opts.no_correction:
                                r1[1] = util.changeString(r1[1], seq_pos, b2)
                                r1[3] = util.changeString(r1[3], qual_pos, q2)
                                corrected += 1
                                this_is_corrected = True

                        if not this_is_corrected:
                            if opts.mask_mismatch:
                                # mask them as zero qual if it is not corrected
                                zero_qual = '!'
                                r2[3] = util.changeString(r2[3], mate_pos, zero_qual)
                                r1[3] = util.changeString(r1[3], qual_pos, zero_qual)
                                zero_qual_masked += 1
                            else:
                                skipped_mismatch += 1

                        # every reported mismatch has been handled
                        if corrected + zero_qual_masked + skipped_mismatch >= distance:
                            break

                    if corrected + zero_qual_masked + skipped_mismatch == distance:
                        merge_error_matrix(OVERLAP_ERR_MATRIX, err_mtx)
                        if corrected > 0:
                            READ_CORRECTED += 1
                        BASE_CORRECTED += corrected
                        # multiply by 2 since we mask bases by pair
                        BASE_ZERO_QUAL_MASKED += zero_qual_masked * 2
                        BASE_SKIPPED_CORRECTION += skipped_mismatch * 2
                    else:
                        discard("BADMISMATCH")
                        BADMISMATCH += 1
                        continue

                if distance == 0 or distance == corrected:
                    if opts.store_overlap:
                        self.writeReads(getOverlap(r1, overlap_len),
                                        getOverlap(r2, overlap_len),
                                        i1, i2,
                                        overlap_read1_file, overlap_read2_file,
                                        overlap_index1_file, overlap_index2_file,
                                        None)

        # write to good
        self.writeReads(r1, r2, i1, i2, good_read1_file, good_read2_file,
                        good_index1_file, good_index2_file, None)
        GOOD_BASES += len(r1[1])
        # count read2 bases whenever read2 exists (this used to test i2)
        if r2 is not None:
            GOOD_BASES += len(r2[1])
        if opts.qc_sample <= 0 or TOTAL_READS < opts.qc_sample:
            self.r1qc_postfilter.statRead(r1)
            if r2 is not None:
                self.r2qc_postfilter.statRead(r2)
        GOOD_READS += 1

        if opts.qc_only and TOTAL_READS >= opts.qc_sample:
            break

    self.r1qc_postfilter.qc()
    if paired:
        self.r2qc_postfilter.qc()

    # close every writer that was opened, including the overlap writers,
    # which were previously left open
    for writer in (good_read1_file, bad_read1_file, overlap_read1_file,
                   good_read2_file, bad_read2_file, overlap_read2_file,
                   good_index1_file, bad_index1_file, overlap_index1_file,
                   good_index2_file, bad_index2_file, overlap_index2_file):
        if writer is not None:
            writer.close()

    # ---- summary numbers -----------------------------------------------
    BAD_READS = TOTAL_READS - GOOD_READS
    bad_barcode = BADBCD1 + BADBCD2
    bad_length = BADLEN + BADTRIM1 + BADTRIM2
    bad_overlap = BADMISMATCH + BADINDEL + BADDIFF

    result = {}
    result['total_bases'] = TOTAL_BASES
    result['good_bases'] = GOOD_BASES
    result['total_reads'] = TOTAL_READS
    result['good_reads'] = GOOD_READS
    result['bad_reads'] = BAD_READS
    result['bad_reads_with_bad_barcode'] = bad_barcode
    result['bad_reads_with_reads_in_bubble'] = BADBBL
    result['bad_reads_with_bad_read_length'] = bad_length
    result['bad_reads_with_polyX'] = BADPOL
    result['bad_reads_with_low_quality'] = BADLQC
    result['bad_reads_with_too_many_N'] = BADNCT
    result['bad_reads_with_bad_overlap'] = bad_overlap
    result['readlen'] = readLen

    # plot result bar figure
    labels = ['good reads', 'has_polyX', 'low_quality', 'too_short', 'too_many_N']
    counts = [GOOD_READS, BADPOL, BADLQC, bad_length, BADNCT]
    if paired:
        labels.append('bad_overlap')
        counts.append(bad_overlap)
    if opts.debubble:
        labels.append('in_bubble')
        counts.append(BADBBL)
    if opts.barcode:
        labels.append('bad_barcode')
        counts.append(bad_barcode)

    # append "count(percent%)" to every label
    for i in xrange(len(counts)):
        type_percent = 0.0
        if TOTAL_READS > 0:
            type_percent = 100.0 * float(counts[i]) / TOTAL_READS
        labels[i] = labels[i] + ": " + str(counts[i]) + "(" + str(type_percent) + "%)"

    reporter.addFigure(
        'Good reads and bad reads after filtering',
        self.r1qc_prefilter.statPlotly(labels, counts, TOTAL_READS, 'filter_stat'),
        'filter_stat', "")

    # squeeze qc data for JSON output
    self.r1qc_prefilter.squeeze()
    self.r1qc_postfilter.squeeze()
    if paired:
        self.r2qc_prefilter.squeeze()
        self.r2qc_postfilter.squeeze()

    # ---- JSON / HTML report --------------------------------------------
    qc_sets = [("read1_prefilter", self.r1qc_prefilter),
               ("read1_postfilter", self.r1qc_postfilter)]
    if paired:
        qc_sets.append(("read2_prefilter", self.r2qc_prefilter))
        qc_sets.append(("read2_postfilter", self.r2qc_postfilter))

    stat = {}
    stat["afterqc_main_summary"] = result
    stat["command"] = makeDict(opts)
    # output more data in JSON file for offline plotting directly from JSON
    stat["kmer_content"] = {}
    stat["base_quality"] = {}
    stat["mean_quality"] = {}
    stat["base_content"] = {}
    stat["gc_content"] = {}
    for label, qc in qc_sets:
        stat["kmer_content"][label] = qc.topKmerCount[0:10]
        stat["base_quality"][label] = qc.baseMeanQual
        stat["mean_quality"][label] = qc.meanQual
        stat["base_content"][label] = qc.percents
        stat["gc_content"][label] = qc.gcPercents

    if paired:
        overlap_stat = {}
        overlap_stat['overlapped_pairs'] = OVERLAPPED
        if OVERLAPPED > 0:
            # convert before dividing so the average is not truncated by
            # integer division
            overlap_stat['average_overlap_length'] = float(OVERLAP_LEN_SUM) / float(OVERLAPPED)
        else:
            overlap_stat['average_overlap_length'] = 0.0
        overlap_stat['bad_mismatch_reads'] = BADMISMATCH
        overlap_stat['bad_diff'] = BADDIFF
        overlap_stat['bad_indel_reads'] = BADINDEL
        overlap_stat['corrected_reads'] = READ_CORRECTED
        overlap_stat['corrected_bases'] = BASE_CORRECTED
        overlap_stat['skipped_correction_bases'] = BASE_SKIPPED_CORRECTION
        overlap_stat['zero_qual_masked'] = BASE_ZERO_QUAL_MASKED
        # NOTE(review): 'zero_qual_skipped' repeats the masked count; kept
        # as-is, but it may have been meant to be BASE_SKIPPED_CORRECTION.
        overlap_stat['zero_qual_skipped'] = BASE_ZERO_QUAL_MASKED
        overlap_stat['trimmed_adapter_bases'] = TRIMMED_ADAPTER_BASE
        overlap_stat['trimmed_adapter_reads'] = TRIMMED_ADAPTER_READ
        if OVERLAP_BASE_SUM > 0:
            overlap_stat['error_rate'] = float(OVERLAP_BASE_ERR) / float(OVERLAP_BASE_SUM)
        else:
            overlap_stat['error_rate'] = 0.0
        overlap_stat['error_matrix'] = OVERLAP_ERR_MATRIX
        overlap_stat['edit_distance_histogram'] = distance_histgram[0:10]
        stat["afterqc_overlap"] = overlap_stat

        reporter.addFigure(
            'Sequence error distribution',
            self.r1qc_prefilter.errorPlotly(OVERLAP_ERR_MATRIX, 'error_matrix'),
            'error_matrix', "")
        reporter.addFigure(
            'Overlap length distribution',
            self.r1qc_prefilter.overlapPlotly(overlap_histgram, readLen,
                                              TOTAL_READS, 'overlap_stat'),
            'overlap_stat', "")

    report_base = os.path.join(qc_dir, os.path.basename(opts.read1_file))
    stat_json = json.dumps(stat, sort_keys=True, indent=4, separators=(',', ': '))
    # the context manager closes the JSON file even if the write fails
    with open(report_base + ".json", "w") as stat_file:
        stat_file.write(stat_json)

    self.addFiguresToReport(reporter)
    reporter.setStat(stat)
    reporter.setVersion(opts.version)
    reporter.output(report_base + ".html")
def run(self):
    """Filter the configured FASTQ file(s) and write QC results.

    Steps, in order:
      1. Optionally load bubble circles (``options.debubble``).
      2. Create ``QC/<read1 basename>`` next to the read1 file and collect
         pre-filter statistics/plots for read1 (and read2 if given).
      3. Resolve automatic trimming (option value ``-1``) through
         ``autoTrim()`` of the pre-filter statistics.
      4. Open the good/bad/overlap writers (skipped when
         ``options.qc_only`` is set).
      5. Stream the reads through the filters: barcode, trimming, bubble,
         length, polyX, low quality, N count and, for pairs, overlap
         analysis with mismatch correction.  A rejected read is handed to
         ``self.writeReads`` with a tag naming the filter that failed.
      6. Collect post-filter statistics, then write the pie chart,
         ``after.json`` and ``report.html`` into the QC folder.

    The method reads everything from ``self.options`` and returns nothing.
    """
    if self.options.debubble:
        self.loadBubbleCircles()

    # read1_file is required
    read1_file = fastq.Reader(self.options.read1_file)

    # create a QC folder to contain the QC results
    qc_base_folder = os.path.join(
        os.path.dirname(self.options.read1_file), "QC")
    if not os.path.exists(qc_base_folder):
        os.makedirs(qc_base_folder)
    # QC result of this file/pair
    qc_dir = os.path.join(qc_base_folder,
                          os.path.basename(self.options.read1_file))
    if not os.path.exists(qc_dir):
        os.makedirs(qc_dir)

    # no front trim if sequence is barcoded
    if self.options.barcode:
        self.options.trim_front = 0

    reporter = QCReporter()

    r1qc_prefilter = QualityControl(self.options.qc_sample,
                                    self.options.qc_kmer)
    r2qc_prefilter = QualityControl(self.options.qc_sample,
                                    self.options.qc_kmer)
    r1qc_prefilter.statFile(self.options.read1_file)
    r1qc_prefilter.plot(qc_dir, "R1-prefilter")
    if self.options.read2_file is not None:
        r2qc_prefilter.statFile(self.options.read2_file)
        r2qc_prefilter.plot(qc_dir, "R2-prefilter")

    r1qc_postfilter = QualityControl(self.options.qc_sample,
                                     self.options.qc_kmer)
    r2qc_postfilter = QualityControl(self.options.qc_sample,
                                     self.options.qc_kmer)

    readLen = r1qc_prefilter.readLen
    overlap_histgram = [0] * (readLen + 1)
    distance_histgram = [0] * (readLen + 1)

    # auto detect trim front and trim tail
    if self.options.trim_front == -1 or self.options.trim_tail == -1:
        # auto trim for read1
        trimFront, trimTail = r1qc_prefilter.autoTrim()
        if self.options.trim_front == -1:
            self.options.trim_front = trimFront
        if self.options.trim_tail == -1:
            self.options.trim_tail = trimTail

        # auto trim for read2
        if self.options.read2_file is not None:
            # check if we should keep same trimming for read1/read2 to keep
            # their length identical; this option is on by default because
            # lots of dedup algorithms require this feature
            if self.options.trim_pair_same:
                self.options.trim_front2 = self.options.trim_front
                self.options.trim_tail2 = self.options.trim_tail
            else:
                trimFront2, trimTail2 = r2qc_prefilter.autoTrim()
                if self.options.trim_front2 == -1:
                    self.options.trim_front2 = trimFront2
                if self.options.trim_tail2 == -1:
                    self.options.trim_tail2 = trimTail2

    print(self.options.read1_file + " options:")
    print(self.options)

    # output folders default to the folder of the read1 file
    good_dir = self.options.good_output_folder
    if good_dir is None:
        good_dir = os.path.dirname(self.options.read1_file)

    bad_dir = self.options.bad_output_folder
    if bad_dir is None:
        bad_dir = os.path.dirname(self.options.read1_file)

    overlap_dir = self.options.overlap_output_folder
    if overlap_dir is None:
        overlap_dir = os.path.dirname(self.options.read1_file)

    if not os.path.exists(good_dir):
        os.makedirs(good_dir)

    if not os.path.exists(bad_dir):
        os.makedirs(bad_dir)

    if (self.options.store_overlap
            and self.options.read2_file is not None
            and not os.path.exists(overlap_dir)):
        os.makedirs(overlap_dir)

    good_read1_file = None
    bad_read1_file = None
    overlap_read1_file = None
    if not self.options.qc_only:
        good_read1_file = fastq.Writer(
            os.path.join(good_dir,
                         getMainName(self.options.read1_file) + ".good.fq"))
        bad_read1_file = fastq.Writer(
            os.path.join(bad_dir,
                         getMainName(self.options.read1_file) + ".bad.fq"))
        if self.options.store_overlap:
            overlap_read1_file = fastq.Writer(
                os.path.join(
                    overlap_dir,
                    getMainName(self.options.read1_file) + ".overlap.fq"))

    # other files are optional
    read2_file = None
    good_read2_file = None
    bad_read2_file = None
    overlap_read2_file = None

    index1_file = None
    good_index1_file = None
    bad_index1_file = None
    overlap_index1_file = None

    index2_file = None
    good_index2_file = None
    bad_index2_file = None
    overlap_index2_file = None

    # if other files are specified, then read them
    if self.options.read2_file is not None:
        read2_file = fastq.Reader(self.options.read2_file)
        if not self.options.qc_only:
            good_read2_file = fastq.Writer(
                os.path.join(
                    good_dir,
                    getMainName(self.options.read2_file) + ".good.fq"))
            bad_read2_file = fastq.Writer(
                os.path.join(
                    bad_dir,
                    getMainName(self.options.read2_file) + ".bad.fq"))
            if (self.options.store_overlap
                    and self.options.read2_file is not None):
                overlap_read2_file = fastq.Writer(
                    os.path.join(
                        overlap_dir,
                        getMainName(self.options.read2_file) + ".overlap.fq"))
    if self.options.index1_file is not None:
        index1_file = fastq.Reader(self.options.index1_file)
        if not self.options.qc_only:
            good_index1_file = fastq.Writer(
                os.path.join(
                    good_dir,
                    getMainName(self.options.index1_file) + ".good.fq"))
            bad_index1_file = fastq.Writer(
                os.path.join(
                    bad_dir,
                    getMainName(self.options.index1_file) + ".bad.fq"))
            if (self.options.store_overlap
                    and self.options.read2_file is not None):
                overlap_index1_file = fastq.Writer(
                    os.path.join(
                        overlap_dir,
                        getMainName(self.options.index1_file) + ".overlap.fq"))
    if self.options.index2_file is not None:
        index2_file = fastq.Reader(self.options.index2_file)
        if not self.options.qc_only:
            good_index2_file = fastq.Writer(
                os.path.join(
                    good_dir,
                    getMainName(self.options.index2_file) + ".good.fq"))
            bad_index2_file = fastq.Writer(
                os.path.join(
                    bad_dir,
                    getMainName(self.options.index2_file) + ".bad.fq"))
            if (self.options.store_overlap
                    and self.options.read2_file is not None):
                overlap_index2_file = fastq.Writer(
                    os.path.join(
                        overlap_dir,
                        getMainName(self.options.index2_file) + ".overlap.fq"))

    r1 = None
    r2 = None
    i1 = None
    i2 = None

    # stat numbers
    TOTAL = 0
    GOOD = 0
    BAD = 0
    BADBCD1 = 0
    BADBCD2 = 0
    BADTRIM1 = 0
    BADTRIM2 = 0
    BADBBL = 0
    BADLEN = 0
    BADPOL = 0
    BADLQC = 0
    BADNCT = 0
    BADOL = 0
    BADINDEL = 0
    BADMISMATCH = 0
    BASE_CORRECTED = 0
    OVERLAPPED = 0
    OVERLAP_LEN_SUM = 0

    while True:
        # stop as soon as any of the input files runs out of records
        r1 = read1_file.nextRead()
        if r1 is None:
            break

        if read2_file is not None:
            r2 = read2_file.nextRead()
            if r2 is None:
                break
        if index1_file is not None:
            i1 = index1_file.nextRead()
            if i1 is None:
                break
        if index2_file is not None:
            i2 = index2_file.nextRead()
            if i2 is None:
                break

        TOTAL += 1

        # barcode processing
        if self.options.barcode:
            barcodeLen1 = barcodeprocesser.detectBarcode(
                r1[1], self.options.barcode_length,
                self.options.barcode_verify)
            if barcodeLen1 == 0:
                self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                bad_read2_file, bad_index1_file,
                                bad_index2_file, "BADBCD1")
                BADBCD1 += 1
                continue
            else:
                if r2 is None:
                    barcodeprocesser.moveBarcodeToName(
                        r1, self.options.barcode_length,
                        self.options.barcode_verify)
                else:
                    barcodeLen2 = barcodeprocesser.detectBarcode(
                        r2[1], self.options.barcode_length,
                        self.options.barcode_verify)
                    if barcodeLen2 == 0:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                        bad_read2_file, bad_index1_file,
                                        bad_index2_file, "BADBCD2")
                        BADBCD2 += 1
                        continue
                    else:
                        barcodeprocesser.moveAndTrimPair(
                            r1, r2, barcodeLen1, barcodeLen2,
                            self.options.barcode_verify)

        # trim
        if self.options.trim_front > 0 or self.options.trim_tail > 0:
            r1 = trim(r1, self.options.trim_front, self.options.trim_tail)
            if len(r1[1]) < 5:
                self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                bad_read2_file, bad_index1_file,
                                bad_index2_file, "BADTRIM1")
                BADTRIM1 += 1
                continue
            if r2 is not None:
                r2 = trim(r2, self.options.trim_front2,
                          self.options.trim_tail2)
                if len(r2[1]) < 5:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADTRIM2")
                    BADTRIM2 += 1
                    continue

        # filter debubble
        if self.options.debubble:
            if self.isInBubble(r1[0]):
                self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                bad_read2_file, bad_index1_file,
                                bad_index2_file, "BADBBL")
                BADBBL += 1
                continue

        # filter sequence length
        if len(r1[1]) < self.options.seq_len_req:
            self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                            bad_index1_file, bad_index2_file, "BADLEN")
            BADLEN += 1
            continue

        # check polyX
        if self.options.poly_size_limit > 0:
            poly1 = hasPolyX(r1[1], self.options.poly_size_limit,
                             self.options.allow_mismatch_in_poly)
            poly2 = None
            if r2 is not None:
                poly2 = hasPolyX(r2[1], self.options.poly_size_limit,
                                 self.options.allow_mismatch_in_poly)
            if poly1 is not None or poly2 is not None:
                self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                bad_read2_file, bad_index1_file,
                                bad_index2_file, "BADPOL")
                BADPOL += 1
                continue

        # check low quality count
        if self.options.unqualified_base_limit > 0:
            lowQual1 = lowQualityNum(r1,
                                     self.options.qualified_quality_phred)
            lowQual2 = 0
            if r2 is not None:
                lowQual2 = lowQualityNum(
                    r2, self.options.qualified_quality_phred)
            # either mate exceeding the limit rejects the pair (the second
            # operand used to re-test lowQual1, so read2 was never checked)
            if (lowQual1 > self.options.unqualified_base_limit
                    or lowQual2 > self.options.unqualified_base_limit):
                self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                bad_read2_file, bad_index1_file,
                                bad_index2_file, "BADLQC")
                BADLQC += 1
                continue

        # check N number
        if self.options.n_base_limit > 0:
            nNum1 = nNumber(r1)
            nNum2 = 0
            if r2 is not None:
                nNum2 = nNumber(r2)
            if (nNum1 > self.options.n_base_limit
                    or nNum2 > self.options.n_base_limit):
                self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                bad_read2_file, bad_index1_file,
                                bad_index2_file, "BADNCT")
                BADNCT += 1
                continue

        # check overlap and do error correction
        if r2 is not None:
            (offset, overlap_len, distance) = util.overlap(r1[1], r2[1])
            overlap_histgram[overlap_len] += 1
            # deal with the case insert DNA is shorter than read length and
            # cause offset is negative
            if offset < 0 and overlap_len > 30:
                # shift the junk bases
                r1[1] = r1[1][0:overlap_len]
                r1[3] = r1[3][0:overlap_len]
                r2[1] = r2[1][-offset:-offset + overlap_len]
                r2[3] = r2[3][-offset:-offset + overlap_len]
                # then calc overlap again
                (offset, overlap_len, distance) = util.overlap(r1[1], r2[1])
            if overlap_len > 30:
                OVERLAPPED += 1
                distance_histgram[distance] += 1
                OVERLAP_LEN_SUM += overlap_len
                corrected = 0
                if distance > 2:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADOL")
                    BADOL += 1
                    continue
                elif distance > 0:
                    # try to fix low quality base; if the hamming distance
                    # differs from the reported distance the pair is
                    # rejected as BADINDEL
                    hamming = util.hammingDistance(
                        r1[1][len(r1[1]) - overlap_len:],
                        util.reverseComplement(
                            r2[1][len(r2[1]) - overlap_len:]))
                    if hamming != distance:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                        bad_read2_file, bad_index1_file,
                                        bad_index2_file, "BADINDEL")
                        BADINDEL += 1
                        continue
                    for o in xrange(overlap_len):
                        b1 = r1[1][len(r1[1]) - overlap_len + o]
                        b2 = util.complement(r2[1][-o - 1])
                        q1 = r1[3][len(r1[3]) - overlap_len + o]
                        q2 = r2[3][-o - 1]
                        if b1 != b2:
                            # copy the base/quality of the mate scoring >= 27
                            # over the mate scoring <= 16
                            if (util.qualNum(q1) >= 27
                                    and util.qualNum(q2) <= 16):
                                r2[1] = util.changeString(
                                    r2[1], -o - 1, util.complement(b1))
                                r2[3] = util.changeString(r2[3], -o - 1, q1)
                                corrected += 1
                            elif (util.qualNum(q2) >= 27
                                    and util.qualNum(q1) <= 16):
                                r1[1] = util.changeString(
                                    r1[1], len(r1[1]) - overlap_len + o, b2)
                                r1[3] = util.changeString(
                                    r1[3], len(r1[3]) - overlap_len + o, q2)
                                corrected += 1
                            if corrected >= distance:
                                break
                    if corrected == distance:
                        BASE_CORRECTED += 1
                    else:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                        bad_read2_file, bad_index1_file,
                                        bad_index2_file, "BADMISMATCH")
                        BADMISMATCH += 1
                        continue
                if distance == 0 or distance == corrected:
                    if self.options.store_overlap:
                        self.writeReads(getOverlap(r1, overlap_len),
                                        getOverlap(r2, overlap_len), i1, i2,
                                        overlap_read1_file,
                                        overlap_read2_file,
                                        overlap_index1_file,
                                        overlap_index2_file, None)

        # write to good
        self.writeReads(r1, r2, i1, i2, good_read1_file, good_read2_file,
                        good_index1_file, good_index2_file, None)
        if self.options.qc_sample <= 0 or TOTAL < self.options.qc_sample:
            r1qc_postfilter.statRead(r1)
            if r2 is not None:
                r2qc_postfilter.statRead(r2)

        GOOD += 1
        if self.options.qc_only and TOTAL >= self.options.qc_sample:
            break

    r1qc_postfilter.qc()
    r1qc_postfilter.plot(qc_dir, "R1-postfilter")
    if self.options.read2_file is not None:
        r2qc_postfilter.qc()
        r2qc_postfilter.plot(qc_dir, "R2-postfilter")

    # flush every writer that was opened; writers that were not requested
    # (or all of them in qc_only mode) are still None.  The overlap writers
    # are included here -- they used to be left unflushed.
    for writer in (good_read1_file, bad_read1_file, overlap_read1_file,
                   good_read2_file, bad_read2_file, overlap_read2_file,
                   good_index1_file, bad_index1_file, overlap_index1_file,
                   good_index2_file, bad_index2_file, overlap_index2_file):
        if writer is not None:
            writer.flush()

    # summary numbers
    BAD = TOTAL - GOOD
    result = {}
    result['total_reads'] = TOTAL
    result['good_reads'] = GOOD
    result['bad_reads'] = BAD
    result['bad_reads_with_bad_barcode'] = BADBCD1 + BADBCD2
    result['bad_reads_with_reads_in_bubble'] = BADBBL
    result['bad_reads_with_bad_read_length'] = BADLEN + BADTRIM1 + BADTRIM2
    result['bad_reads_with_polyX'] = BADPOL
    result['bad_reads_with_low_quality'] = BADLQC
    result['bad_reads_with_too_many_N'] = BADNCT
    result['bad_reads_with_bad_overlap'] = BADOL + BADMISMATCH + BADINDEL

    # plot the filtering result as a pie chart
    labels = [
        'good reads', 'has_polyX', 'low_quality', 'too_short', 'too_many_N'
    ]
    counts = [GOOD, BADPOL, BADLQC, BADLEN + BADTRIM1 + BADTRIM2, BADNCT]
    colors = ['#66BB11', '#FF33AF', '#FFD3F2', '#FFA322', '#FF8899']
    if self.options.read2_file is not None:
        labels.append('bad_overlap')
        counts.append(BADOL + BADMISMATCH + BADINDEL)
        colors.append('#FF6600')
    if self.options.debubble:
        labels.append('in_bubble')
        counts.append(BADBBL)
        colors.append('#EEBB00')
    if self.options.barcode:
        labels.append('bad_barcode')
        counts.append(BADBCD1 + BADBCD2)
        colors.append('#CCDD22')

    for i, count in enumerate(counts):
        # an empty input yields TOTAL == 0; report 0.0% instead of raising
        # ZeroDivisionError
        if TOTAL > 0:
            percent = 100.0 * float(count) / TOTAL
        else:
            percent = 0.0
        labels[i] = labels[i] + ": " + str(count) + "(" + str(percent) + "%)"

    plt.figure(1)
    plt.title("Filtering statistics of sampled " + str(TOTAL) + " reads",
              fontsize=12, color='#666666')
    plt.axis('equal')
    patches, texts = plt.pie(counts, colors=colors, radius=0.7)
    # legend entries are ordered by descending count
    patches, labels, dummy = zip(*sorted(
        zip(patches, labels, counts), key=lambda x: x[2], reverse=True))
    plt.legend(patches, labels, loc='upper left', fontsize=9)
    plt.savefig(os.path.join(qc_dir, "filter-stat.png"), bbox_inches='tight')
    plt.close(1)

    stat = {}
    stat["summary"] = result
    stat["command"] = makeDict(self.options)
    stat["kmer_content"] = {}
    stat["kmer_content"]["read1_prefilter"] = \
        r1qc_prefilter.topKmerCount[0:10]
    stat["kmer_content"]["read1_postfilter"] = \
        r1qc_postfilter.topKmerCount[0:10]
    if self.options.read2_file is not None:
        stat["kmer_content"]["read2_prefilter"] = \
            r2qc_prefilter.topKmerCount[0:10]
        stat["kmer_content"]["read2_postfilter"] = \
            r2qc_postfilter.topKmerCount[0:10]
    stat["overlap"] = {}
    stat["overlap"]['overlapped_pairs'] = OVERLAPPED
    if OVERLAPPED > 0:
        # convert before dividing so the mean is not truncated by integer
        # division
        stat["overlap"]['average_overlap_length'] = \
            float(OVERLAP_LEN_SUM) / OVERLAPPED
    else:
        stat["overlap"]['average_overlap_length'] = 0.0
    stat["overlap"]['bad_edit_distance'] = BADOL
    stat["overlap"]['bad_mismatch_bases'] = BADMISMATCH
    stat["overlap"]['bad_indel'] = BADINDEL
    stat["overlap"]['reads_with_corrected_mismatch_bases'] = BASE_CORRECTED
    stat["overlap"]['overlapped_area_edit_distance_histogram'] = \
        distance_histgram[0:10]

    plotOverlapHistgram(overlap_histgram, readLen, TOTAL,
                        os.path.join(qc_dir, "overlap.png"))

    stat_json = json.dumps(stat, sort_keys=True, indent=4,
                           separators=(',', ': '))
    # the context manager closes the file even if the write fails
    with open(os.path.join(qc_dir, "after.json"), "w") as stat_file:
        stat_file.write(stat_json)

    self.addFiguresToReport(reporter)
    reporter.output(os.path.join(qc_dir, "report.html"))
def run(self):
    """Filter the configured FASTQ file(s) into ``.good.fq`` / ``.bad.fq``.

    Automatic trimming (option value ``-1``) is resolved with
    ``Trimmer.calcTrimLength``.  Every record (read1 plus the optional
    read2/index1/index2 records read in lock-step) then passes through the
    barcode, trim, bubble, length, polyX, minimum quality, low-quality
    count and N-count filters.  A record failing a filter is written to the
    bad writers with a tag naming the filter; all others go to the good
    writers.  Output folders default to the folder of the read1 file.

    The method reads everything from ``self.options`` and returns nothing.
    """
    if self.options.debubble:
        self.loadBubbleCircles()

    # read1_file is required
    read1_file = fastq.Reader(self.options.read1_file)

    # auto detect trim front and trim tail
    if self.options.trim_front == -1 or self.options.trim_tail == -1:
        tm = Trimmer()
        # auto trim for read1
        trimFront, trimTail = tm.calcTrimLength(self.options.read1_file)
        if self.options.trim_front == -1:
            self.options.trim_front = trimFront
        if self.options.trim_tail == -1:
            self.options.trim_tail = trimTail
        # auto trim for read2
        if self.options.read2_file is not None:
            trimFront2, trimTail2 = tm.calcTrimLength(
                self.options.read2_file)
            if self.options.trim_front2 == -1:
                self.options.trim_front2 = trimFront2
            if self.options.trim_tail2 == -1:
                self.options.trim_tail2 = trimTail2

    print(self.options)

    # output folders default to the folder of the read1 file
    good_dir = self.options.good_output_folder
    if good_dir is None:
        good_dir = os.path.dirname(self.options.read1_file)

    bad_dir = self.options.bad_output_folder
    if bad_dir is None:
        bad_dir = os.path.dirname(self.options.read1_file)

    if not os.path.exists(good_dir):
        os.makedirs(good_dir)

    if not os.path.exists(bad_dir):
        os.makedirs(bad_dir)

    good_read1_file = fastq.Writer(
        os.path.join(good_dir,
                     getMainName(self.options.read1_file) + ".good.fq"))
    bad_read1_file = fastq.Writer(
        os.path.join(bad_dir,
                     getMainName(self.options.read1_file) + ".bad.fq"))

    # other files are optional
    read2_file = None
    good_read2_file = None
    bad_read2_file = None

    index1_file = None
    good_index1_file = None
    bad_index1_file = None

    index2_file = None
    good_index2_file = None
    bad_index2_file = None

    # if other files are specified, then read them
    if self.options.read2_file is not None:
        read2_file = fastq.Reader(self.options.read2_file)
        good_read2_file = fastq.Writer(
            os.path.join(good_dir,
                         getMainName(self.options.read2_file) + ".good.fq"))
        bad_read2_file = fastq.Writer(
            os.path.join(bad_dir,
                         getMainName(self.options.read2_file) + ".bad.fq"))
    if self.options.index1_file is not None:
        index1_file = fastq.Reader(self.options.index1_file)
        good_index1_file = fastq.Writer(
            os.path.join(good_dir,
                         getMainName(self.options.index1_file) + ".good.fq"))
        bad_index1_file = fastq.Writer(
            os.path.join(bad_dir,
                         getMainName(self.options.index1_file) + ".bad.fq"))
    if self.options.index2_file is not None:
        index2_file = fastq.Reader(self.options.index2_file)
        good_index2_file = fastq.Writer(
            os.path.join(good_dir,
                         getMainName(self.options.index2_file) + ".good.fq"))
        bad_index2_file = fastq.Writer(
            os.path.join(bad_dir,
                         getMainName(self.options.index2_file) + ".bad.fq"))

    r1 = None
    r2 = None
    i1 = None
    i2 = None

    while True:
        # stop as soon as any of the input files runs out of records
        r1 = read1_file.nextRead()
        if r1 is None:
            break

        if read2_file is not None:
            r2 = read2_file.nextRead()
            if r2 is None:
                break
        if index1_file is not None:
            i1 = index1_file.nextRead()
            if i1 is None:
                break
        if index2_file is not None:
            i2 = index2_file.nextRead()
            if i2 is None:
                break

        # barcode processing
        if self.options.barcode:
            barcodeLen = barcodeprocesser.detectBarcode(
                r1[1], self.options.barcode_length,
                self.options.barcode_verify)
            if barcodeLen == 0:
                writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                           bad_index1_file, bad_index2_file, "BADBCD")
                continue
            else:
                if r2 is None:
                    barcodeprocesser.moveBarcodeToName(
                        r1, self.options.barcode_length,
                        self.options.barcode_verify)
                else:
                    barcodeLen2 = barcodeprocesser.detectBarcode(
                        r2[1], self.options.barcode_length,
                        self.options.barcode_verify)
                    # test read2's own barcode length; the old code
                    # re-tested barcodeLen (already known to be non-zero
                    # here), so this branch could never reject a pair
                    if barcodeLen2 == 0:
                        writeReads(r1, r2, i1, i2, bad_read1_file,
                                   bad_read2_file, bad_index1_file,
                                   bad_index2_file, "BADBCD")
                        continue
                    else:
                        barcodeprocesser.moveBarcodeToName(
                            r1, barcodeLen, self.options.barcode_verify)
                        barcodeprocesser.moveBarcodeToName(
                            r2, barcodeLen2, self.options.barcode_verify)

        # trim
        if self.options.trim_front > 0 or self.options.trim_tail > 0:
            r1 = trim(r1, self.options.trim_front, self.options.trim_tail)
            if r2 is not None:
                r2 = trim(r2, self.options.trim_front2,
                          self.options.trim_tail2)

        # filter debubble
        if self.options.debubble:
            if self.isInBubble(r1[0]):
                writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                           bad_index1_file, bad_index2_file, "BADBBL")
                continue

        # filter sequence length
        if len(r1[1]) < self.options.min_seq_len:
            writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                       bad_index1_file, bad_index2_file, "BADLEN")
            continue

        # check polyX
        if self.options.poly_max > 0:
            poly1 = hasPolyX(r1[1], self.options.poly_max,
                             self.options.allow_poly_mismatch)
            poly2 = None
            if r2 is not None:
                poly2 = hasPolyX(r2[1], self.options.poly_max,
                                 self.options.allow_poly_mismatch)
            if poly1 is not None or poly2 is not None:
                writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                           bad_index1_file, bad_index2_file, "BADPOL")
                continue

        # check min quality
        if self.options.min_quality > 0:
            minQual1 = minQuality(r1)
            # 255 acts as "no read2": it is only ever compared with the
            # min_quality threshold below
            minQual2 = 255
            if r2 is not None:
                minQual2 = minQuality(r2)
            if (minQual1 < self.options.min_quality
                    or minQual2 < self.options.min_quality):
                writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                           bad_index1_file, bad_index2_file, "BADMIN")
                continue

        # check low quality count
        if self.options.max_low_quality > 0:
            lowQual1 = lowQualityNum(r1, self.options.qualified_quality)
            lowQual2 = 0
            if r2 is not None:
                lowQual2 = lowQualityNum(r2, self.options.qualified_quality)
            # either mate exceeding the limit rejects the pair (the second
            # operand used to re-test lowQual1, so read2 was never checked)
            if (lowQual1 > self.options.max_low_quality
                    or lowQual2 > self.options.max_low_quality):
                writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                           bad_index1_file, bad_index2_file, "BADLQC")
                continue

        # check N number
        if self.options.max_n_count > 0:
            nNum1 = nNumber(r1)
            nNum2 = 0
            if r2 is not None:
                nNum2 = nNumber(r2)
            if (nNum1 > self.options.max_n_count
                    or nNum2 > self.options.max_n_count):
                writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                           bad_index1_file, bad_index2_file, "BADNCT")
                continue

        # write to good
        writeReads(r1, r2, i1, i2, good_read1_file, good_read2_file,
                   good_index1_file, good_index2_file, None)

    # flush every writer that was opened; optional writers that were not
    # requested are still None
    for writer in (good_read1_file, bad_read1_file,
                   good_read2_file, bad_read2_file,
                   good_index1_file, bad_index1_file,
                   good_index2_file, bad_index2_file):
        if writer is not None:
            writer.flush()
def calcTrimLength(self, filename):
    """Estimate how many bases to trim from the front and tail of reads.

    Up to 5,000,000 reads of ``filename`` are sampled and the per-position
    A/T/C/G fractions are computed.  Starting from a 10-base window around
    the middle of the read, a "good" segment is grown one position at a
    time, alternating between left and right, for as long as the summed
    absolute deviation of the new position from the segment's mean
    composition stays within 0.05.  Outside that segment, the first
    (front) / last (tail) position whose 3-base window is entirely within
    the threshold marks the trim point.  The result is capped at 20% of the
    read length for the front and 10% for the tail.

    Parameters:
        filename: path handed to ``fastq.Reader``.

    Returns:
        ``(trimFront, trimTail)`` as a tuple of ints.  ``(0, 0)`` is
        returned when the file has no reads or the reads are too short to
        hold the initial window.
    """
    maxLen = 1000
    allbases = ("A", "T", "C", "G")
    seedHalfWidth = 5   # the initial good segment is (center-5, center+5)
    threshold = 0.05
    window = 3

    counts = {}
    percents = {}
    for base in allbases:
        counts[base] = [0] * maxLen
        percents[base] = [0.0] * maxLen

    reader = fastq.Reader(filename)
    # sample up to maxSample reads for stat
    maxSample = 5000000
    sampled = 0
    while True:
        read = reader.nextRead()
        sampled += 1
        if read is None or sampled > maxSample:
            break
        seq = read[1]
        # bases beyond maxLen cannot be tallied; ignore them instead of
        # raising IndexError
        for i in xrange(min(len(seq), maxLen)):
            b = seq[i]
            if b in allbases:
                counts[b][i] += 1

    # the read length is the first position where no base was counted; if
    # every tracked position has data the reads span at least maxLen
    readLen = maxLen
    for pos in xrange(maxLen):
        if not any(counts[base][pos] > 0 for base in allbases):
            readLen = pos
            break

    # no reads, or reads too short for the initial segment: the estimate
    # below would index outside the read, so suggest no trimming
    if readLen < 2 * seedHalfWidth + 1:
        return (0, 0)

    # calc percents of each base (total > 0 for every pos < readLen)
    for pos in xrange(readLen):
        total = sum(counts[base][pos] for base in allbases)
        for base in allbases:
            percents[base][pos] = float(counts[base][pos]) / float(total)

    center = int(readLen / 2)
    left = center - seedHalfWidth
    right = center + seedHalfWidth
    lastStepIsLeft = False
    leftFinished = False
    rightFinished = False
    current = -1

    # expand the good segment
    meanPercents = {}
    while not (leftFinished and rightFinished):
        for base in allbases:
            meanPercents[base] = 0.0
            for pos in xrange(left, right):
                meanPercents[base] += percents[base][pos]
            meanPercents[base] /= (right - left)

        # alternate sides while both are still growing
        if leftFinished:
            current = right + 1
            lastStepIsLeft = False
        elif rightFinished:
            current = left - 1
            lastStepIsLeft = True
        elif lastStepIsLeft:
            current = right + 1
            lastStepIsLeft = False
        else:
            current = left - 1
            lastStepIsLeft = True

        if current < 0 or current >= readLen:
            # stepped off the read: this side cannot grow any further
            isBad = True
        else:
            percentBias = 0.0
            for base in allbases:
                percentBias += abs(meanPercents[base] -
                                   percents[base][current])
            isBad = percentBias > threshold

        if isBad:
            if lastStepIsLeft:
                leftFinished = True
            else:
                rightFinished = True
        else:
            if lastStepIsLeft:
                left = current
                if left == 0:
                    leftFinished = True
            else:
                right = current
                if right == readLen - 1:
                    rightFinished = True

    # find the bad segment from front, considering a small window
    # if any in the window is bad, it is bad
    trimFront = left
    for pos in xrange(0, left):
        isGood = True
        for posInWindow in xrange(pos, min(pos + window, readLen)):
            percentBias = 0.0
            for base in allbases:
                percentBias += abs(meanPercents[base] -
                                   percents[base][posInWindow])
            if percentBias > threshold:
                isGood = False
        if isGood:
            trimFront = pos
            break

    # find the bad segment from tail, considering a small window
    # if any in the window is bad, it is bad
    trimTail = right
    for pos in xrange(readLen - 1, right, -1):
        isGood = True
        for posInWindow in xrange(pos, max(pos - window, 0), -1):
            percentBias = 0.0
            for base in allbases:
                percentBias += abs(meanPercents[base] -
                                   percents[base][posInWindow])
            if percentBias > threshold:
                isGood = False
        if isGood:
            trimTail = pos
            break

    # never trim more than 20% from the front or 10% from the tail
    trimFront = min(readLen * 0.2, trimFront)
    trimTail = min(readLen * 0.1, readLen - 1 - trimTail)
    return (int(trimFront), int(trimTail))