def statRead(self, read):
        """Accumulate per-position, GC and k-mer statistics for one read.

        ``read`` is indexed like a sequence: ``read[1]`` is used as the base
        string and ``read[3]`` as the matching quality string.

        Updates, in place: ``self.totalNum``, ``self.totalQual``,
        ``self.baseCounts``, ``self.baseTotalQual``,
        ``self.totalDiscontinuity``, ``self.gcHistogram``, ``self.totalKmer``
        and ``self.kmerCount``.

        A position whose quality value cannot be converted is still counted
        in ``self.totalNum`` but is skipped for every other per-position
        statistic. The first time this happens for a file whose name ends
        with "bz2", a one-off warning is printed.
        """
        global WARNED_BZIP2_ERROR
        seq = read[1]
        qual = read[3]
        seqlen = len(seq)
        gc = 0
        for i in xrange(seqlen):
            self.totalNum[i] += 1
            try:
                qnum = util.qualNum(qual[i])
            except Exception:
                # Deliberate best-effort: any failure while fetching or
                # converting the quality character skips this position
                # instead of aborting the whole file.
                if self.filename.endswith("bz2") and not WARNED_BZIP2_ERROR:
                    WARNED_BZIP2_ERROR = True
                    print(
                        "WARNING: Incompatible bzip2 format, please note that the file compressed with pbzip2 may have problem. Please compress it with bzip2 insteadly.\n"
                    )
                continue
            self.totalQual[i] += qnum
            b = seq[i]
            if b == 'G' or b == 'C':
                gc += 1
            if b in ALL_BASES:
                self.baseCounts[b][i] += 1
                self.baseTotalQual[b][i] += qnum

            # Discontinuity: number of adjacent base changes inside a window
            # of up to 5 bases around position i. Near either end the window
            # is shifted inward so it keeps its full width where possible.
            left = i - 2
            right = i + 3
            if left < 0:
                left = 0
                # Clamp for reads shorter than 5 bases; an unclamped
                # right bound of 5 would index past the end of seq.
                right = min(5, seqlen)
            elif right >= seqlen:
                right = seqlen
                # Clamp so a short read cannot produce a negative start,
                # which would silently wrap around to the end of seq.
                left = max(seqlen - 5, 0)
            discontinuity = 0
            for j in xrange(left, right - 1):
                if seq[j] != seq[j + 1]:
                    discontinuity += 1
            self.totalDiscontinuity[i] += discontinuity

        # The histogram is indexed by the absolute number of G/C bases in the
        # read, not by a percentage.
        self.gcHistogram[gc] += 1

        # NOTE(review): the range stops at seqlen - kmerLen, so the k-mer
        # starting at the last valid offset is never counted. Kept as-is to
        # preserve existing totals -- confirm whether this is intended.
        for i in xrange(seqlen - self.kmerLen):
            self.totalKmer += 1
            kmer = seq[i:i + self.kmerLen]
            if kmer in self.kmerCount:
                self.kmerCount[kmer] += 1
            else:
                self.kmerCount[kmer] = 1
                # Make sure the value returned by util.reverseComplement has
                # an entry too (with count 0) the first time a k-mer is seen.
                rcKmer = util.reverseComplement(kmer)
                if rcKmer not in self.kmerCount:
                    self.kmerCount[rcKmer] = 0
Beispiel #2
0
 def statRead(self, read):
     """Fold one read into the per-position and k-mer counters.

     ``read[1]`` is used as the base string and ``read[3]`` as the quality
     string. Updates ``self.totalNum``, ``self.totalQual``,
     ``self.baseCounts``, ``self.baseTotalQual``, ``self.totalKmer`` and
     ``self.kmerCount`` in place; nothing is returned.
     """
     bases = read[1]
     quals = read[3]

     # Per-position totals: read count, summed quality, and per-base tallies.
     for pos, base in enumerate(bases):
         self.totalNum[pos] += 1
         score = util.qualNum(quals[pos])
         self.totalQual[pos] += score
         if base not in ALL_BASES:
             continue
         self.baseCounts[base][pos] += 1
         self.baseTotalQual[base][pos] += score

     # K-mer occurrence counts over the read.
     width = self.kmerLen
     for start in xrange(len(bases) - width):
         self.totalKmer += 1
         word = bases[start:start + width]
         if word not in self.kmerCount:
             self.kmerCount[word] = 1
         else:
             self.kmerCount[word] += 1
Beispiel #3
0
    def statRead(self, read):
        """Accumulate per-position, GC and k-mer statistics for one read.

        ``read[1]`` is used as the base string and ``read[3]`` as the
        matching quality string.

        Updates, in place: ``self.totalNum``, ``self.totalQual``,
        ``self.baseCounts``, ``self.baseTotalQual``,
        ``self.totalDiscontinuity``, ``self.gcHistogram``, ``self.totalKmer``
        and ``self.kmerCount``. Nothing is returned.
        """
        seq = read[1]
        qual = read[3]
        seqlen = len(seq)
        gc = 0
        for i in xrange(seqlen):
            self.totalNum[i] += 1
            qnum = util.qualNum(qual[i])
            self.totalQual[i] += qnum
            b = seq[i]
            if b == 'G' or b == 'C':
                gc += 1
            if b in ALL_BASES:
                self.baseCounts[b][i] += 1
                self.baseTotalQual[b][i] += qnum

            # Discontinuity: number of adjacent base changes inside a window
            # of up to 5 bases around position i. Near either end the window
            # is shifted inward so it keeps its full width where possible.
            left = i - 2
            right = i + 3
            if left < 0:
                left = 0
                # Clamp for reads shorter than 5 bases; an unclamped
                # right bound of 5 would index past the end of seq.
                right = min(5, seqlen)
            elif right >= seqlen:
                right = seqlen
                # Clamp so a short read cannot produce a negative start,
                # which would silently wrap around to the end of seq.
                left = max(seqlen - 5, 0)
            discontinuity = 0
            for j in xrange(left, right - 1):
                if seq[j] != seq[j + 1]:
                    discontinuity += 1
            self.totalDiscontinuity[i] += discontinuity

        # The histogram is indexed by the absolute number of G/C bases in the
        # read, not by a percentage.
        self.gcHistogram[gc] += 1

        # NOTE(review): the range stops at seqlen - kmerLen, so the k-mer
        # starting at the last valid offset is never counted. Kept as-is to
        # preserve existing totals -- confirm whether this is intended.
        for i in xrange(seqlen - self.kmerLen):
            self.totalKmer += 1
            kmer = seq[i:i + self.kmerLen]
            if kmer in self.kmerCount:
                self.kmerCount[kmer] += 1
            else:
                self.kmerCount[kmer] = 1
                # Make sure the value returned by util.reverseComplement has
                # an entry too (with count 0) the first time a k-mer is seen.
                rcKmer = util.reverseComplement(kmer)
                if rcKmer not in self.kmerCount:
                    self.kmerCount[rcKmer] = 0
Beispiel #4
0
    def statRead(self, read):
        """Add one read to the running quality, base, GC and k-mer tallies.

        ``read[1]`` is used as the base string and ``read[3]`` as the
        matching quality string.

        Mutates ``self.totalNum``, ``self.totalQual``, ``self.baseCounts``,
        ``self.baseTotalQual``, ``self.totalDiscontinuity``,
        ``self.gcHistogram``, ``self.totalKmer`` and ``self.kmerCount``.
        Nothing is returned.
        """
        seq = read[1]
        qual = read[3]
        seqlen = len(seq)
        # Largest window start that still leaves room for 5 bases; 0 when the
        # read is shorter than the window, so the start never goes negative.
        max_left = max(seqlen - 5, 0)
        gc = 0
        for i in xrange(seqlen):
            self.totalNum[i] += 1
            qnum = util.qualNum(qual[i])
            self.totalQual[i] += qnum
            b = seq[i]
            if b == 'G' or b == 'C':
                gc += 1
            if b in ALL_BASES:
                self.baseCounts[b][i] += 1
                self.baseTotalQual[b][i] += qnum

            # Discontinuity: number of adjacent base changes inside a window
            # of up to 5 bases centred on i. The window is shifted inward at
            # either end of the read and clamped to the read length, so reads
            # shorter than 5 bases neither index past the end nor wrap around
            # through negative indices.
            left = min(max(i - 2, 0), max_left)
            right = min(left + 5, seqlen)
            discontinuity = 0
            for j in xrange(left, right - 1):
                if seq[j] != seq[j + 1]:
                    discontinuity += 1
            self.totalDiscontinuity[i] += discontinuity

        # The histogram is indexed by the absolute number of G/C bases in the
        # read, not by a percentage.
        self.gcHistogram[gc] += 1

        # NOTE(review): the range stops at seqlen - kmerLen, so the k-mer
        # starting at the last valid offset is never counted. Kept as-is to
        # preserve existing totals -- confirm whether this is intended.
        for i in xrange(seqlen - self.kmerLen):
            self.totalKmer += 1
            kmer = seq[i:i + self.kmerLen]
            if kmer in self.kmerCount:
                self.kmerCount[kmer] += 1
            else:
                self.kmerCount[kmer] = 1
                # Make sure the value returned by util.reverseComplement has
                # an entry too (with count 0) the first time a k-mer is seen.
                rcKmer = util.reverseComplement(kmer)
                if rcKmer not in self.kmerCount:
                    self.kmerCount[rcKmer] = 0
Beispiel #5
0
    def run(self):
        if self.options.debubble:
            self.loadBubbleCircles()

        #read1_file is required
        read1_file = fastq.Reader(self.options.read1_file)

        #no front trim if sequence is barcoded
        if self.options.barcode:
            self.options.trim_front = 0

        reporter = QCReporter()

        self.r1qc_prefilter = QualityControl(self.options.qc_sample,
                                             self.options.qc_kmer)
        self.r2qc_prefilter = QualityControl(self.options.qc_sample,
                                             self.options.qc_kmer)
        self.r1qc_prefilter.statFile(self.options.read1_file)
        if self.options.read2_file != None:
            self.r2qc_prefilter.statFile(self.options.read2_file)

        self.r1qc_postfilter = QualityControl(self.options.qc_sample,
                                              self.options.qc_kmer)
        self.r2qc_postfilter = QualityControl(self.options.qc_sample,
                                              self.options.qc_kmer)

        readLen = self.r1qc_prefilter.readLen
        overlap_histgram = [0 for x in xrange(readLen + 1)]
        distance_histgram = [0 for x in xrange(readLen + 1)]

        #auto detect trim front and trim tail
        if self.options.trim_front == -1 or self.options.trim_tail == -1:
            #auto trim for read1
            trimFront, trimTail = self.r1qc_prefilter.autoTrim()
            if self.options.trim_front == -1:
                self.options.trim_front = trimFront
            if self.options.trim_tail == -1:
                self.options.trim_tail = trimTail
            #auto trim for read2
            if self.options.read2_file != None:
                # check if we should keep same trimming for read1/read2 to keep their length identical
                # this option is on by default because lots of dedup algorithms require this feature
                if self.options.trim_pair_same:
                    self.options.trim_front2 = self.options.trim_front
                    self.options.trim_tail2 = self.options.trim_tail
                else:
                    trimFront2, trimTail2 = self.r2qc_prefilter.autoTrim()
                    if self.options.trim_front2 == -1:
                        self.options.trim_front2 = trimFront2
                    if self.options.trim_tail2 == -1:
                        self.options.trim_tail2 = trimTail2

        print(self.options.read1_file + " options:")
        print(self.options)

        #if good output folder not specified, set it as the same folder of read1 file
        good_dir = self.options.good_output_folder
        if good_dir == None:
            good_dir = os.path.dirname(self.options.read1_file)

        #if bad output folder not specified, set it as the same folder of read1 file
        bad_dir = self.options.bad_output_folder
        if bad_dir == None:
            bad_dir = os.path.join(
                os.path.dirname(os.path.dirname(good_dir + "/")), "bad")

        #if overlap output folder not specified, set it as the same folder of read1 file
        overlap_dir = self.options.overlap_output_folder
        if overlap_dir == None:
            #            overlap_dir = os.path.dirname(self.options.read1_file)
            overlap_dir = os.path.join(
                os.path.dirname(os.path.dirname(good_dir + "/")), "overlap")

        #save QC results at the same folder of good
        qc_base_folder = self.options.report_output_folder
        if qc_base_folder == None:
            qc_base_folder = os.path.join(
                os.path.dirname(os.path.dirname(good_dir + "/")), "QC")
        if not os.path.exists(qc_base_folder):
            os.makedirs(qc_base_folder)
        qc_dir = qc_base_folder

        if not os.path.exists(good_dir):
            os.makedirs(good_dir)

        if not os.path.exists(bad_dir):
            os.makedirs(bad_dir)

        if self.options.store_overlap and self.options.read2_file != None and (
                not os.path.exists(overlap_dir)):
            os.makedirs(overlap_dir)

        gzip_out = self.options.gzip
        gzip_comp = self.options.compression
        if not gzip_out and self.options.read1_file.endswith(".gz"):
            gzip_out = True

        good_read1_file = None
        bad_read1_file = None
        overlap_read1_file = None
        if not self.options.qc_only:
            good_read1_file = fastq.Writer(
                os.path.join(good_dir,
                             getMainName(self.options.read1_file) +
                             ".good.fq"), gzip_out, gzip_comp)
            bad_read1_file = fastq.Writer(
                os.path.join(bad_dir,
                             getMainName(self.options.read1_file) + ".bad.fq"),
                gzip_out, gzip_comp)

            overlap_read1_file = None
            if self.options.store_overlap:
                overlap_read1_file = fastq.Writer(
                    os.path.join(
                        overlap_dir,
                        getMainName(self.options.read1_file) + ".overlap.fq"),
                    gzip_out, gzip_comp)

        #other files are optional
        read2_file = None
        good_read2_file = None
        bad_read2_file = None
        overlap_read2_file = None

        index1_file = None
        good_index1_file = None
        bad_index1_file = None
        overlap_index1_file = None

        index2_file = None
        good_index2_file = None
        bad_index2_file = None
        overlap_index2_file = None

        #if other files are specified, then read them
        if self.options.read2_file != None:
            read2_file = fastq.Reader(self.options.read2_file)
            if not self.options.qc_only:
                good_read2_file = fastq.Writer(
                    os.path.join(
                        good_dir,
                        getMainName(self.options.read2_file) + ".good.fq"),
                    gzip_out, gzip_comp)
                bad_read2_file = fastq.Writer(
                    os.path.join(
                        bad_dir,
                        getMainName(self.options.read2_file) + ".bad.fq"),
                    gzip_out, gzip_comp)
                if self.options.store_overlap and self.options.read2_file != None:
                    overlap_read2_file = fastq.Writer(
                        os.path.join(
                            overlap_dir,
                            getMainName(self.options.read2_file) +
                            ".overlap.fq"), gzip_out, gzip_comp)
        if self.options.index1_file != None:
            index1_file = fastq.Reader(self.options.index1_file)
            if not self.options.qc_only:
                good_index1_file = fastq.Writer(
                    os.path.join(
                        good_dir,
                        getMainName(self.options.index1_file) + ".good.fq"),
                    gzip_out, gzip_comp)
                bad_index1_file = fastq.Writer(
                    os.path.join(
                        bad_dir,
                        getMainName(self.options.index1_file) + ".bad.fq"),
                    gzip_out, gzip_comp)
                if self.options.store_overlap and self.options.read2_file != None:
                    overlap_index1_file = fastq.Writer(
                        os.path.join(
                            overlap_dir,
                            getMainName(self.options.index1_file) +
                            ".overlap.fq"), gzip_out, gzip_comp)
        if self.options.index2_file != None:
            index2_file = fastq.Reader(self.options.index2_file)
            if not self.options.qc_only:
                good_index2_file = fastq.Writer(
                    os.path.join(
                        good_dir,
                        getMainName(self.options.index2_file) + ".good.fq"),
                    gzip_out, gzip_comp)
                bad_index2_file = fastq.Writer(
                    os.path.join(
                        bad_dir,
                        getMainName(self.options.index2_file) + ".bad.fq"),
                    gzip_out, gzip_comp)
                if self.options.store_overlap and self.options.read2_file != None:
                    overlap_index2_file = fastq.Writer(
                        os.path.join(
                            overlap_dir,
                            getMainName(self.options.index2_file) +
                            ".overlap.fq"), gzip_out, gzip_comp)

        r1 = None
        r2 = None
        i1 = None
        i2 = None

        # stat numbers
        TOTAL_BASES = 0
        GOOD_BASES = 0
        TOTAL_READS = 0
        GOOD_READS = 0
        BAD_READS = 0
        BADBCD1 = 0
        BADBCD2 = 0
        BADTRIM1 = 0
        BADTRIM2 = 0
        BADBBL = 0
        BADLEN = 0
        BADPOL = 0
        BADLQC = 0
        BADNCT = 0
        BADINDEL = 0
        BADMISMATCH = 0
        BADDIFF = 0
        READ_CORRECTED = 0
        BASE_CORRECTED = 0
        BASE_SKIPPED_CORRECTION = 0
        BASE_ZERO_QUAL_MASKED = 0
        OVERLAPPED = 0
        OVERLAP_LEN_SUM = 0
        OVERLAP_BASE_SUM = 0
        # error profiling by overlap analysis
        OVERLAP_BASE_ERR = 0
        OVERLAP_ERR_MATRIX = init_error_matrix()

        #adapter trimming by overlap analysis
        TRIMMED_ADAPTER_BASE = 0
        TRIMMED_ADAPTER_READ = 0

        while True:
            r1 = read1_file.nextRead()
            if r1 == None:
                break
            else:
                TOTAL_BASES += len(r1[1])

            if read2_file != None:
                r2 = read2_file.nextRead()
                if r2 == None:
                    break
            if index1_file != None:
                i1 = index1_file.nextRead()
                if i1 == None:
                    break
            if index2_file != None:
                i2 = index2_file.nextRead()
                if i2 == None:
                    break
                else:
                    TOTAL_BASES += len(r2[1])

            TOTAL_READS += 1

            #barcode processing
            if self.options.barcode:
                barcodeLen1 = barcodeprocesser.detectBarcode(
                    r1[1], self.options.barcode_length,
                    self.options.barcode_verify)
                if barcodeLen1 == 0:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADBCD1")
                    BADBCD1 += 1
                    continue
                else:
                    if r2 == None:
                        barcodeprocesser.moveBarcodeToName(
                            r1, self.options.barcode_length,
                            self.options.barcode_verify)
                    else:
                        barcodeLen2 = barcodeprocesser.detectBarcode(
                            r2[1], self.options.barcode_length,
                            self.options.barcode_verify)
                        if barcodeLen2 == 0:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                            bad_read2_file, bad_index1_file,
                                            bad_index2_file, "BADBCD2")
                            BADBCD2 += 1
                            continue
                        else:
                            barcodeprocesser.moveAndTrimPair(
                                r1, r2, barcodeLen1, barcodeLen2,
                                self.options.barcode_verify)

            #trim
            if self.options.trim_front > 0 or self.options.trim_tail > 0:
                r1 = trim(r1, self.options.trim_front, self.options.trim_tail)
                if len(r1[1]) < 5:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADTRIM1")
                    BADTRIM1 += 1
                    continue
                if r2 != None:
                    r2 = trim(r2, self.options.trim_front2,
                              self.options.trim_tail2)
                    if len(r2[1]) < 5:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                        bad_read2_file, bad_index1_file,
                                        bad_index2_file, "BADTRIM2")
                        BADTRIM2 += 1
                        continue

            #filter debubble
            if self.options.debubble:
                if self.isInBubble(r1[0]):
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADBBL")
                    BADBBL += 1
                    continue

            #filter sequence length
            if len(r1[1]) < self.options.seq_len_req:
                self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                                bad_index1_file, bad_index2_file, "BADLEN")
                BADLEN += 1
                continue

            #check polyX
            if self.options.poly_size_limit > 0:
                poly1 = hasPolyX(r1[1], self.options.poly_size_limit,
                                 self.options.allow_mismatch_in_poly)
                poly2 = None
                if r2 != None:
                    poly2 = hasPolyX(r2[1], self.options.poly_size_limit,
                                     self.options.allow_mismatch_in_poly)
                if poly1 != None or poly2 != None:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADPOL")
                    BADPOL += 1
                    continue

            #check low quality count
            if self.options.unqualified_base_limit > 0:
                lowQual1 = lowQualityNum(r1,
                                         self.options.qualified_quality_phred)
                lowQual2 = 0
                if r2 != None:
                    lowQual2 = lowQualityNum(
                        r2, self.options.qualified_quality_phred)
                if lowQual1 > self.options.unqualified_base_limit or lowQual1 > self.options.unqualified_base_limit:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADLQC")
                    BADLQC += 1
                    continue

            #check N number
            if self.options.n_base_limit > 0:
                nNum1 = nNumber(r1)
                nNum2 = 0
                if r2 != None:
                    nNum2 = nNumber(r2)
                if nNum1 > self.options.n_base_limit or nNum2 > self.options.n_base_limit:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADNCT")
                    BADNCT += 1
                    continue

            #check overlap and do error correction
            if r2 != None and (not self.options.no_overlap):
                (offset, overlap_len, distance) = util.overlap(r1[1], r2[1])
                overlap_histgram[overlap_len] += 1
                # deal with the case insert DNA is shorter than read length and cause offset is negative
                # in this case the adapter is sequenced and should be trimmed
                if offset < 0 and overlap_len > 30:
                    # shift the junk bases
                    r1[1] = r1[1][0:overlap_len]
                    r1[3] = r1[3][0:overlap_len]
                    r2[1] = r2[1][0:overlap_len]
                    r2[3] = r2[3][0:overlap_len]
                    TRIMMED_ADAPTER_BASE += abs(offset) * 2
                    TRIMMED_ADAPTER_READ += 1
                    # check the sequence length again after adapter trimmed
                    if len(r1[1]) < self.options.seq_len_req:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                        bad_read2_file, bad_index1_file,
                                        bad_index2_file, "BADLEN")
                        BADLEN += 1
                        continue
                    # then calc overlap again
                    (offset, overlap_len,
                     distance) = util.overlap(r1[1], r2[1])

                distance_histgram[distance] += 1
                # if distance is too high, then set it as bad mismatch
                if distance > 3:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADDIFF")
                    BADDIFF += 1
                    continue
                if overlap_len > 30:
                    OVERLAPPED += 1
                    OVERLAP_LEN_SUM += overlap_len
                    # we consider the distance is caused by sequencing error
                    OVERLAP_BASE_SUM += overlap_len * 2
                    OVERLAP_BASE_ERR += distance
                    corrected = 0
                    zero_qual_masked = 0
                    skipped_mismatch = 0
                    if distance > 0:
                        #try to fix low quality base
                        #hamming = util.hammingDistance(r1[1][len(r1[1]) - overlap_len:], util.reverseComplement(r2[1][len(r2[1]) - overlap_len:]))
                        #if hamming != distance:
                        #    self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADINDEL")
                        #    BADINDEL += 1
                        #    continue
                        #print(r1[1][len(r1[1]) - overlap_len:])
                        #print(util.reverseComplement(r2[1][len(r2[1]) - overlap_len:]))
                        #print(r1[3][len(r1[1]) - overlap_len:])
                        #print(util.reverse(r2[3][len(r2[1]) - overlap_len:]))
                        err_mtx = init_error_matrix()
                        for o in xrange(overlap_len):
                            b1 = r1[1][len(r1[1]) - overlap_len + o]
                            b2 = util.complement(r2[1][-o - 1])
                            q1 = r1[3][len(r1[3]) - overlap_len + o]
                            q2 = r2[3][-o - 1]
                            if b1 != b2:
                                # print(TOTAL_READS, o, b1, b2, q1, q2)
                                this_is_corrected = False
                                if util.qualNum(q1) >= 30 and util.qualNum(
                                        q2) <= 14:
                                    if b1 != 'N' and b2 != 'N':
                                        err_mtx[util.complement(b1)][
                                            util.complement(b2)] += 1
                                    if not self.options.no_correction:
                                        r2[1] = util.changeString(
                                            r2[1], -o - 1, util.complement(b1))
                                        r2[3] = util.changeString(
                                            r2[3], -o - 1, q1)
                                        corrected += 1
                                        this_is_corrected = True
                                elif util.qualNum(q2) >= 30 and util.qualNum(
                                        q1) <= 14:
                                    if b1 != 'N' and b2 != 'N':
                                        err_mtx[b2][b1] += 1
                                    if not self.options.no_correction:
                                        r1[1] = util.changeString(
                                            r1[1],
                                            len(r1[1]) - overlap_len + o, b2)
                                        r1[3] = util.changeString(
                                            r1[3],
                                            len(r1[3]) - overlap_len + o, q2)
                                        corrected += 1
                                        this_is_corrected = True
                                if not this_is_corrected:
                                    if self.options.mask_mismatch:
                                        # mask them as zero qual if it is not corrected
                                        zero_qual = '!'
                                        r2[3] = util.changeString(
                                            r2[3], -o - 1, zero_qual)
                                        r1[3] = util.changeString(
                                            r1[3],
                                            len(r1[3]) - overlap_len + o,
                                            zero_qual)
                                        zero_qual_masked += 1
                                    else:
                                        skipped_mismatch += 1

                                if corrected + zero_qual_masked + skipped_mismatch >= distance:
                                    break
                        #print(r1[1][len(r1[1]) - overlap_len:])
                        #print(util.reverseComplement(r2[1][len(r2[1]) - overlap_len:]))
                        #print(r1[3][len(r1[1]) - overlap_len:])
                        #print(util.reverse(r2[3][len(r2[1]) - overlap_len:]))
                        if corrected + zero_qual_masked + skipped_mismatch == distance:
                            merge_error_matrix(OVERLAP_ERR_MATRIX, err_mtx)
                            if corrected > 0:
                                READ_CORRECTED += 1
                            BASE_CORRECTED += corrected
                            # multiply by 2 since we mask bases by pair
                            BASE_ZERO_QUAL_MASKED += zero_qual_masked * 2
                            BASE_SKIPPED_CORRECTION += skipped_mismatch * 2
                        else:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                            bad_read2_file, bad_index1_file,
                                            bad_index2_file, "BADMISMATCH")
                            BADMISMATCH += 1
                            continue
                    if distance == 0 or distance == corrected:
                        if self.options.store_overlap:
                            self.writeReads(getOverlap(r1, overlap_len),
                                            getOverlap(r2, overlap_len), i1,
                                            i2, overlap_read1_file,
                                            overlap_read2_file,
                                            overlap_index1_file,
                                            overlap_index2_file, None)

            #write to good
            self.writeReads(r1, r2, i1, i2, good_read1_file, good_read2_file,
                            good_index1_file, good_index2_file, None)
            GOOD_BASES += len(r1[1])
            if i2 != None:
                GOOD_BASES += len(r2[1])
            if self.options.qc_sample <= 0 or TOTAL_READS < self.options.qc_sample:
                self.r1qc_postfilter.statRead(r1)
                if r2 != None:
                    self.r2qc_postfilter.statRead(r2)

            GOOD_READS += 1
            if self.options.qc_only and TOTAL_READS >= self.options.qc_sample:
                break

        self.r1qc_postfilter.qc()
        #self.r1qc_postfilter.plot(qc_dir, "R1-postfilter")
        if self.options.read2_file != None:
            self.r2qc_postfilter.qc()
            #self.r2qc_postfilter.plot(qc_dir, "R2-postfilter")

        #close all files
        if not self.options.qc_only:
            good_read1_file.close()
            bad_read1_file.close()
            if self.options.read2_file != None:
                good_read2_file.close()
                bad_read2_file.close()
            if self.options.index1_file != None:
                good_index1_file.close()
                bad_index1_file.close()
            if self.options.index2_file != None:
                good_index2_file.close()
                bad_index2_file.close()

        # print stat numbers
        BAD_READS = TOTAL_READS - GOOD_READS
        result = {}
        result['total_bases'] = TOTAL_BASES
        result['good_bases'] = GOOD_BASES
        result['total_reads'] = TOTAL_READS
        result['good_reads'] = GOOD_READS
        result['bad_reads'] = BAD_READS
        result['bad_reads_with_bad_barcode'] = BADBCD1 + BADBCD2
        result['bad_reads_with_reads_in_bubble'] = BADBBL
        result['bad_reads_with_bad_read_length'] = BADLEN + BADTRIM1 + BADTRIM2
        result['bad_reads_with_polyX'] = BADPOL
        result['bad_reads_with_low_quality'] = BADLQC
        result['bad_reads_with_too_many_N'] = BADNCT
        result['bad_reads_with_bad_overlap'] = BADMISMATCH + BADINDEL + BADDIFF
        result['readlen'] = readLen

        # plot result bar figure
        labels = [
            'good reads', 'has_polyX', 'low_quality', 'too_short', 'too_many_N'
        ]
        counts = [
            GOOD_READS, BADPOL, BADLQC, BADLEN + BADTRIM1 + BADTRIM2, BADNCT
        ]
        colors = ['#66BB11', '#FF33AF', '#FFD3F2', '#FFA322', '#FF8899']
        if self.options.read2_file != None:
            labels.append('bad_overlap')
            counts.append(BADMISMATCH + BADINDEL + BADDIFF)
            colors.append('#FF6600')
        if self.options.debubble:
            labels.append('in_bubble')
            counts.append(BADBBL)
            colors.append('#EEBB00')
        if self.options.barcode:
            labels.append('bad_barcode')
            counts.append(BADBCD1 + BADBCD2)
            colors.append('#CCDD22')

        for i in xrange(len(counts)):
            type_percent = 0.0
            if TOTAL_READS > 0:
                type_percent = 100.0 * float(counts[i]) / TOTAL_READS
            labels[i] = labels[i] + ": " + str(
                counts[i]) + "(" + str(type_percent) + "%)"

        reporter.addFigure(
            'Good reads and bad reads after filtering',
            self.r1qc_prefilter.statPlotly(labels, counts, TOTAL_READS,
                                           'filter_stat'), 'filter_stat', "")
        #self.r1qc_prefilter.plotFilterStats(labels, counts, colors, TOTAL_READS, os.path.join(qc_dir, "filter-stat.png"))

        #squeeze qc data for JSON output
        self.r1qc_prefilter.squeeze()
        self.r1qc_postfilter.squeeze()
        if self.options.read2_file != None:
            self.r2qc_prefilter.squeeze()
            self.r2qc_postfilter.squeeze()

        stat = {}
        # stat["options"]=self.options
        stat["afterqc_main_summary"] = result
        stat["command"] = makeDict(self.options)
        stat["kmer_content"] = {}
        stat["kmer_content"][
            "read1_prefilter"] = self.r1qc_prefilter.topKmerCount[0:10]
        stat["kmer_content"][
            "read1_postfilter"] = self.r1qc_postfilter.topKmerCount[0:10]

        # output more data in JSON file for offline plotting directly from JSON
        stat["base_quality"] = {}
        stat["base_quality"][
            "read1_prefilter"] = self.r1qc_prefilter.baseMeanQual
        stat["base_quality"][
            "read1_postfilter"] = self.r1qc_postfilter.baseMeanQual
        stat["mean_quality"] = {}
        stat["mean_quality"]["read1_prefilter"] = self.r1qc_prefilter.meanQual
        stat["mean_quality"][
            "read1_postfilter"] = self.r1qc_postfilter.meanQual
        stat["base_content"] = {}
        stat["base_content"]["read1_prefilter"] = self.r1qc_prefilter.percents
        stat["base_content"][
            "read1_postfilter"] = self.r1qc_postfilter.percents
        stat["gc_content"] = {}
        stat["gc_content"]["read1_prefilter"] = self.r1qc_prefilter.gcPercents
        stat["gc_content"][
            "read1_postfilter"] = self.r1qc_postfilter.gcPercents

        if self.options.read2_file != None:
            stat["kmer_content"][
                "read2_prefilter"] = self.r2qc_prefilter.topKmerCount[0:10]
            stat["kmer_content"][
                "read2_postfilter"] = self.r2qc_postfilter.topKmerCount[0:10]

            stat["base_quality"][
                "read2_prefilter"] = self.r2qc_prefilter.baseMeanQual
            stat["base_quality"][
                "read2_postfilter"] = self.r2qc_postfilter.baseMeanQual

            stat["mean_quality"][
                "read2_prefilter"] = self.r2qc_prefilter.meanQual
            stat["mean_quality"][
                "read2_postfilter"] = self.r2qc_postfilter.meanQual

            stat["base_content"][
                "read2_prefilter"] = self.r2qc_prefilter.percents
            stat["base_content"][
                "read2_postfilter"] = self.r2qc_postfilter.percents

            stat["gc_content"][
                "read2_prefilter"] = self.r2qc_prefilter.gcPercents
            stat["gc_content"][
                "read2_postfilter"] = self.r2qc_postfilter.gcPercents

            stat["afterqc_overlap"] = {}
            stat["afterqc_overlap"]['overlapped_pairs'] = OVERLAPPED
            if OVERLAPPED > 0:
                stat["afterqc_overlap"]['average_overlap_length'] = float(
                    OVERLAP_LEN_SUM / OVERLAPPED)
            else:
                stat["afterqc_overlap"]['average_overlap_length'] = 0.0
            stat["afterqc_overlap"]['bad_mismatch_reads'] = BADMISMATCH
            stat["afterqc_overlap"]['bad_diff'] = BADDIFF
            stat["afterqc_overlap"]['bad_indel_reads'] = BADINDEL
            stat["afterqc_overlap"]['corrected_reads'] = READ_CORRECTED
            stat["afterqc_overlap"]['corrected_bases'] = BASE_CORRECTED
            stat["afterqc_overlap"][
                'skipped_correction_bases'] = BASE_SKIPPED_CORRECTION
            stat["afterqc_overlap"]['zero_qual_masked'] = BASE_ZERO_QUAL_MASKED
            stat["afterqc_overlap"][
                'zero_qual_skipped'] = BASE_ZERO_QUAL_MASKED
            stat["afterqc_overlap"][
                'trimmed_adapter_bases'] = TRIMMED_ADAPTER_BASE
            stat["afterqc_overlap"][
                'trimmed_adapter_reads'] = TRIMMED_ADAPTER_READ
            if OVERLAP_BASE_SUM > 0:
                stat["afterqc_overlap"]['error_rate'] = float(
                    OVERLAP_BASE_ERR) / float(OVERLAP_BASE_SUM)
            else:
                stat["afterqc_overlap"]['error_rate'] = 0.0
            stat["afterqc_overlap"]['error_matrix'] = OVERLAP_ERR_MATRIX
            stat["afterqc_overlap"][
                'edit_distance_histogram'] = distance_histgram[0:10]
            reporter.addFigure(
                'Sequence error distribution',
                self.r1qc_prefilter.errorPlotly(OVERLAP_ERR_MATRIX,
                                                'error_matrix'),
                'error_matrix', "")
            reporter.addFigure(
                'Overlap length distribution',
                self.r1qc_prefilter.overlapPlotly(overlap_histgram, readLen,
                                                  TOTAL_READS, 'overlap_stat'),
                'overlap_stat', "")
            #self.r1qc_prefilter.plotOverlapHistgram(overlap_histgram, readLen, TOTAL_READS, os.path.join(qc_dir, "overlap.png"))

        stat_file = open(
            os.path.join(qc_dir,
                         os.path.basename(self.options.read1_file) + ".json"),
            "w")
        stat_json = json.dumps(stat,
                               sort_keys=True,
                               indent=4,
                               separators=(',', ': '))
        stat_file.write(stat_json)
        stat_file.close()

        self.addFiguresToReport(reporter)
        reporter.setStat(stat)
        reporter.setVersion(self.options.version)
        reporter.output(
            os.path.join(qc_dir,
                         os.path.basename(self.options.read1_file) + ".html"))
Beispiel #6
0
    def run(self):
        """Filter the configured FASTQ file(s) and write QC figures and reports.

        Steps, in order:
          1. Collect pre-filter QC statistics for read1 (and read2 if given).
          2. Resolve automatic trimming (option value -1) from those statistics.
          3. Stream the inputs record by record, applying the filters in this
             order: barcode, trimming, bubble, length, polyX, low quality,
             N count and, for paired input only, overlap analysis with
             mismatch correction. A record that fails a filter is handed to
             self.writeReads together with the "bad" writers and a flag
             naming the filter, and the loop moves on to the next record.
          4. Write the surviving records, collect post-filter QC statistics
             and emit "filter-stat.png", "after.json" and "report.html"
             (plus "overlap.png" for paired input) into
             <dir of read1>/QC/<basename of read1>/.

        All settings are read from self.options; nothing is returned.
        """
        if self.options.debubble:
            self.loadBubbleCircles()

        # read1_file is required
        read1_file = fastq.Reader(self.options.read1_file)
        # create a QC folder to contain the QC results
        qc_base_folder = os.path.join(os.path.dirname(self.options.read1_file),
                                      "QC")
        if not os.path.exists(qc_base_folder):
            os.makedirs(qc_base_folder)
        # QC result folder of this file/pair
        qc_dir = os.path.join(qc_base_folder,
                              os.path.basename(self.options.read1_file))
        if not os.path.exists(qc_dir):
            os.makedirs(qc_dir)

        # no front trim if the sequence is barcoded
        if self.options.barcode:
            self.options.trim_front = 0

        reporter = QCReporter()

        r1qc_prefilter = QualityControl(self.options.qc_sample,
                                        self.options.qc_kmer)
        r2qc_prefilter = QualityControl(self.options.qc_sample,
                                        self.options.qc_kmer)
        r1qc_prefilter.statFile(self.options.read1_file)
        r1qc_prefilter.plot(qc_dir, "R1-prefilter")
        if self.options.read2_file is not None:
            r2qc_prefilter.statFile(self.options.read2_file)
            r2qc_prefilter.plot(qc_dir, "R2-prefilter")

        r1qc_postfilter = QualityControl(self.options.qc_sample,
                                         self.options.qc_kmer)
        r2qc_postfilter = QualityControl(self.options.qc_sample,
                                         self.options.qc_kmer)

        # histograms are indexed by overlap length / edit distance
        readLen = r1qc_prefilter.readLen
        overlap_histgram = [0 for x in xrange(readLen + 1)]
        distance_histgram = [0 for x in xrange(readLen + 1)]

        # auto detect trim front and trim tail (-1 means "not specified")
        if self.options.trim_front == -1 or self.options.trim_tail == -1:
            # auto trim for read1
            trimFront, trimTail = r1qc_prefilter.autoTrim()
            if self.options.trim_front == -1:
                self.options.trim_front = trimFront
            if self.options.trim_tail == -1:
                self.options.trim_tail = trimTail
            # auto trim for read2
            if self.options.read2_file is not None:
                # check if we should keep same trimming for read1/read2 to keep their length identical
                # this option is on by default because lots of dedup algorithms require this feature
                if self.options.trim_pair_same:
                    self.options.trim_front2 = self.options.trim_front
                    self.options.trim_tail2 = self.options.trim_tail
                else:
                    trimFront2, trimTail2 = r2qc_prefilter.autoTrim()
                    if self.options.trim_front2 == -1:
                        self.options.trim_front2 = trimFront2
                    if self.options.trim_tail2 == -1:
                        self.options.trim_tail2 = trimTail2

        print(self.options.read1_file + " options:")
        print(self.options)

        # every output folder defaults to the folder of the read1 file
        good_dir = self.options.good_output_folder
        if good_dir is None:
            good_dir = os.path.dirname(self.options.read1_file)

        bad_dir = self.options.bad_output_folder
        if bad_dir is None:
            bad_dir = os.path.dirname(self.options.read1_file)

        overlap_dir = self.options.overlap_output_folder
        if overlap_dir is None:
            overlap_dir = os.path.dirname(self.options.read1_file)

        if not os.path.exists(good_dir):
            os.makedirs(good_dir)

        if not os.path.exists(bad_dir):
            os.makedirs(bad_dir)

        if self.options.store_overlap and self.options.read2_file is not None and (
                not os.path.exists(overlap_dir)):
            os.makedirs(overlap_dir)

        # writers stay None in qc_only mode; self.writeReads is still called
        # with them, so it is expected to cope with None writers
        good_read1_file = None
        bad_read1_file = None
        overlap_read1_file = None
        if not self.options.qc_only:
            good_read1_file = fastq.Writer(
                os.path.join(good_dir,
                             getMainName(self.options.read1_file) +
                             ".good.fq"))
            bad_read1_file = fastq.Writer(
                os.path.join(bad_dir,
                             getMainName(self.options.read1_file) + ".bad.fq"))

            if self.options.store_overlap:
                overlap_read1_file = fastq.Writer(
                    os.path.join(
                        overlap_dir,
                        getMainName(self.options.read1_file) + ".overlap.fq"))

        # other files are optional
        read2_file = None
        good_read2_file = None
        bad_read2_file = None
        overlap_read2_file = None

        index1_file = None
        good_index1_file = None
        bad_index1_file = None
        overlap_index1_file = None

        index2_file = None
        good_index2_file = None
        bad_index2_file = None
        overlap_index2_file = None

        # overlap output only makes sense for paired input
        want_overlap_files = (self.options.store_overlap
                              and self.options.read2_file is not None)

        # if other files are specified, then read them
        if self.options.read2_file is not None:
            read2_file = fastq.Reader(self.options.read2_file)
            if not self.options.qc_only:
                good_read2_file = fastq.Writer(
                    os.path.join(
                        good_dir,
                        getMainName(self.options.read2_file) + ".good.fq"))
                bad_read2_file = fastq.Writer(
                    os.path.join(
                        bad_dir,
                        getMainName(self.options.read2_file) + ".bad.fq"))
                if want_overlap_files:
                    overlap_read2_file = fastq.Writer(
                        os.path.join(
                            overlap_dir,
                            getMainName(self.options.read2_file) +
                            ".overlap.fq"))
        if self.options.index1_file is not None:
            index1_file = fastq.Reader(self.options.index1_file)
            if not self.options.qc_only:
                good_index1_file = fastq.Writer(
                    os.path.join(
                        good_dir,
                        getMainName(self.options.index1_file) + ".good.fq"))
                bad_index1_file = fastq.Writer(
                    os.path.join(
                        bad_dir,
                        getMainName(self.options.index1_file) + ".bad.fq"))
                if want_overlap_files:
                    overlap_index1_file = fastq.Writer(
                        os.path.join(
                            overlap_dir,
                            getMainName(self.options.index1_file) +
                            ".overlap.fq"))
        if self.options.index2_file is not None:
            index2_file = fastq.Reader(self.options.index2_file)
            if not self.options.qc_only:
                good_index2_file = fastq.Writer(
                    os.path.join(
                        good_dir,
                        getMainName(self.options.index2_file) + ".good.fq"))
                bad_index2_file = fastq.Writer(
                    os.path.join(
                        bad_dir,
                        getMainName(self.options.index2_file) + ".bad.fq"))
                if want_overlap_files:
                    overlap_index2_file = fastq.Writer(
                        os.path.join(
                            overlap_dir,
                            getMainName(self.options.index2_file) +
                            ".overlap.fq"))

        r1 = None
        r2 = None
        i1 = None
        i2 = None

        # stat numbers
        TOTAL = 0
        GOOD = 0
        BAD = 0
        BADBCD1 = 0
        BADBCD2 = 0
        BADTRIM1 = 0
        BADTRIM2 = 0
        BADBBL = 0
        BADLEN = 0
        BADPOL = 0
        BADLQC = 0
        BADNCT = 0
        BADOL = 0
        BADINDEL = 0
        BADMISMATCH = 0
        BASE_CORRECTED = 0
        OVERLAPPED = 0
        OVERLAP_LEN_SUM = 0

        while True:
            # read one record from every input in lockstep; stop as soon as
            # any of them is exhausted
            r1 = read1_file.nextRead()
            if r1 is None:
                break

            if read2_file is not None:
                r2 = read2_file.nextRead()
                if r2 is None:
                    break
            if index1_file is not None:
                i1 = index1_file.nextRead()
                if i1 is None:
                    break
            if index2_file is not None:
                i2 = index2_file.nextRead()
                if i2 is None:
                    break

            TOTAL += 1

            # barcode processing
            if self.options.barcode:
                barcodeLen1 = barcodeprocesser.detectBarcode(
                    r1[1], self.options.barcode_length,
                    self.options.barcode_verify)
                if barcodeLen1 == 0:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADBCD1")
                    BADBCD1 += 1
                    continue
                else:
                    if r2 is None:
                        barcodeprocesser.moveBarcodeToName(
                            r1, self.options.barcode_length,
                            self.options.barcode_verify)
                    else:
                        barcodeLen2 = barcodeprocesser.detectBarcode(
                            r2[1], self.options.barcode_length,
                            self.options.barcode_verify)
                        if barcodeLen2 == 0:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                            bad_read2_file, bad_index1_file,
                                            bad_index2_file, "BADBCD2")
                            BADBCD2 += 1
                            continue
                        else:
                            barcodeprocesser.moveAndTrimPair(
                                r1, r2, barcodeLen1, barcodeLen2,
                                self.options.barcode_verify)

            # trim
            # NOTE(review): read2 is only trimmed when a read1 trim is active;
            # confirm this is intended when trim_pair_same is off.
            if self.options.trim_front > 0 or self.options.trim_tail > 0:
                r1 = trim(r1, self.options.trim_front, self.options.trim_tail)
                if len(r1[1]) < 5:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADTRIM1")
                    BADTRIM1 += 1
                    continue
                if r2 is not None:
                    r2 = trim(r2, self.options.trim_front2,
                              self.options.trim_tail2)
                    if len(r2[1]) < 5:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                        bad_read2_file, bad_index1_file,
                                        bad_index2_file, "BADTRIM2")
                        BADTRIM2 += 1
                        continue

            # filter debubble
            if self.options.debubble:
                if self.isInBubble(r1[0]):
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADBBL")
                    BADBBL += 1
                    continue

            # filter sequence length
            if len(r1[1]) < self.options.seq_len_req:
                self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                                bad_index1_file, bad_index2_file, "BADLEN")
                BADLEN += 1
                continue

            # check polyX
            if self.options.poly_size_limit > 0:
                poly1 = hasPolyX(r1[1], self.options.poly_size_limit,
                                 self.options.allow_mismatch_in_poly)
                poly2 = None
                if r2 is not None:
                    poly2 = hasPolyX(r2[1], self.options.poly_size_limit,
                                     self.options.allow_mismatch_in_poly)
                if poly1 is not None or poly2 is not None:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADPOL")
                    BADPOL += 1
                    continue

            # check low quality count
            if self.options.unqualified_base_limit > 0:
                lowQual1 = lowQualityNum(r1,
                                         self.options.qualified_quality_phred)
                lowQual2 = 0
                if r2 is not None:
                    lowQual2 = lowQualityNum(
                        r2, self.options.qualified_quality_phred)
                # either mate exceeding the limit rejects the whole pair
                # (read2's count must be tested too, not read1's twice)
                if (lowQual1 > self.options.unqualified_base_limit
                        or lowQual2 > self.options.unqualified_base_limit):
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADLQC")
                    BADLQC += 1
                    continue

            # check N number
            if self.options.n_base_limit > 0:
                nNum1 = nNumber(r1)
                nNum2 = 0
                if r2 is not None:
                    nNum2 = nNumber(r2)
                if nNum1 > self.options.n_base_limit or nNum2 > self.options.n_base_limit:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADNCT")
                    BADNCT += 1
                    continue

            # check overlap and do error correction
            if r2 is not None:
                (offset, overlap_len, distance) = util.overlap(r1[1], r2[1])
                overlap_histgram[overlap_len] += 1
                # deal with the case insert DNA is shorter than read length and cause offset is negative
                if offset < 0 and overlap_len > 30:
                    # cut away the junk bases outside the overlapped area
                    r1[1] = r1[1][0:overlap_len]
                    r1[3] = r1[3][0:overlap_len]
                    r2[1] = r2[1][-offset:-offset + overlap_len]
                    r2[3] = r2[3][-offset:-offset + overlap_len]
                    # then calc overlap again
                    (offset, overlap_len,
                     distance) = util.overlap(r1[1], r2[1])
                if overlap_len > 30:
                    OVERLAPPED += 1
                    distance_histgram[distance] += 1
                    OVERLAP_LEN_SUM += overlap_len
                    corrected = 0
                    if distance > 2:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                        bad_read2_file, bad_index1_file,
                                        bad_index2_file, "BADOL")
                        BADOL += 1
                        continue
                    elif distance > 0:
                        # an edit distance that differs from the Hamming
                        # distance is classified as an indel and rejected
                        hamming = util.hammingDistance(
                            r1[1][len(r1[1]) - overlap_len:],
                            util.reverseComplement(r2[1][len(r2[1]) -
                                                         overlap_len:]))
                        if hamming != distance:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                            bad_read2_file, bad_index1_file,
                                            bad_index2_file, "BADINDEL")
                            BADINDEL += 1
                            continue
                        # walk the overlapped area; at each mismatch, let the
                        # base with quality >= 27 overwrite the mate's base
                        # when the mate's quality is <= 16
                        for o in xrange(overlap_len):
                            b1 = r1[1][len(r1[1]) - overlap_len + o]
                            b2 = util.complement(r2[1][-o - 1])
                            q1 = r1[3][len(r1[3]) - overlap_len + o]
                            q2 = r2[3][-o - 1]
                            if b1 != b2:
                                if util.qualNum(q1) >= 27 and util.qualNum(
                                        q2) <= 16:
                                    r2[1] = util.changeString(
                                        r2[1], -o - 1, util.complement(b1))
                                    r2[3] = util.changeString(
                                        r2[3], -o - 1, q1)
                                    corrected += 1
                                elif util.qualNum(q2) >= 27 and util.qualNum(
                                        q1) <= 16:
                                    r1[1] = util.changeString(
                                        r1[1],
                                        len(r1[1]) - overlap_len + o, b2)
                                    r1[3] = util.changeString(
                                        r1[3],
                                        len(r1[3]) - overlap_len + o, q2)
                                    corrected += 1
                                if corrected >= distance:
                                    break
                        if corrected == distance:
                            # counts pairs whose mismatches were all corrected
                            BASE_CORRECTED += 1
                        else:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                            bad_read2_file, bad_index1_file,
                                            bad_index2_file, "BADMISMATCH")
                            BADMISMATCH += 1
                            continue
                    if distance == 0 or distance == corrected:
                        if self.options.store_overlap:
                            self.writeReads(getOverlap(r1, overlap_len),
                                            getOverlap(r2, overlap_len), i1,
                                            i2, overlap_read1_file,
                                            overlap_read2_file,
                                            overlap_index1_file,
                                            overlap_index2_file, None)

            # write to good
            self.writeReads(r1, r2, i1, i2, good_read1_file, good_read2_file,
                            good_index1_file, good_index2_file, None)
            if self.options.qc_sample <= 0 or TOTAL < self.options.qc_sample:
                r1qc_postfilter.statRead(r1)
                if r2 is not None:
                    r2qc_postfilter.statRead(r2)

            GOOD += 1
            # in qc_only mode there is no need to go past the sampled reads
            if self.options.qc_only and TOTAL >= self.options.qc_sample:
                break

        r1qc_postfilter.qc()
        r1qc_postfilter.plot(qc_dir, "R1-postfilter")
        if self.options.read2_file is not None:
            r2qc_postfilter.qc()
            r2qc_postfilter.plot(qc_dir, "R2-postfilter")

        # flush all output files
        if not self.options.qc_only:
            good_read1_file.flush()
            bad_read1_file.flush()
            if self.options.read2_file is not None:
                good_read2_file.flush()
                bad_read2_file.flush()
            if self.options.index1_file is not None:
                good_index1_file.flush()
                bad_index1_file.flush()
            if self.options.index2_file is not None:
                good_index2_file.flush()
                bad_index2_file.flush()
            # overlap writers are flushed the same way as the good/bad ones
            for overlap_writer in (overlap_read1_file, overlap_read2_file,
                                   overlap_index1_file, overlap_index2_file):
                if overlap_writer is not None:
                    overlap_writer.flush()

        # summary numbers
        BAD = TOTAL - GOOD
        result = {}
        result['total_reads'] = TOTAL
        result['good_reads'] = GOOD
        result['bad_reads'] = BAD
        result['bad_reads_with_bad_barcode'] = BADBCD1 + BADBCD2
        result['bad_reads_with_reads_in_bubble'] = BADBBL
        result['bad_reads_with_bad_read_length'] = BADLEN + BADTRIM1 + BADTRIM2
        result['bad_reads_with_polyX'] = BADPOL
        result['bad_reads_with_low_quality'] = BADLQC
        result['bad_reads_with_too_many_N'] = BADNCT
        result['bad_reads_with_bad_overlap'] = BADOL + BADMISMATCH + BADINDEL

        # plot result pie figure
        labels = [
            'good reads', 'has_polyX', 'low_quality', 'too_short', 'too_many_N'
        ]
        counts = [GOOD, BADPOL, BADLQC, BADLEN + BADTRIM1 + BADTRIM2, BADNCT]
        colors = ['#66BB11', '#FF33AF', '#FFD3F2', '#FFA322', '#FF8899']
        if self.options.read2_file is not None:
            labels.append('bad_overlap')
            counts.append(BADOL + BADMISMATCH + BADINDEL)
            colors.append('#FF6600')
        if self.options.debubble:
            labels.append('in_bubble')
            counts.append(BADBBL)
            colors.append('#EEBB00')
        if self.options.barcode:
            labels.append('bad_barcode')
            counts.append(BADBCD1 + BADBCD2)
            colors.append('#CCDD22')

        for i in xrange(len(counts)):
            # an empty input has TOTAL == 0; report 0% instead of dividing by zero
            type_percent = 0.0
            if TOTAL > 0:
                type_percent = 100.0 * float(counts[i]) / TOTAL
            labels[i] = labels[i] + ": " + str(counts[i]) + "(" + str(
                type_percent) + "%)"

        plt.figure(1)
        plt.title("Filtering statistics of sampled " + str(TOTAL) + " reads",
                  fontsize=12,
                  color='#666666')
        plt.axis('equal')
        patches, texts = plt.pie(counts, colors=colors, radius=0.7)
        # order the legend entries by descending count
        patches, labels, dummy = zip(*sorted(
            zip(patches, labels, counts), key=lambda x: x[2], reverse=True))
        plt.legend(patches, labels, loc='upper left', fontsize=9)
        plt.savefig(os.path.join(qc_dir, "filter-stat.png"),
                    bbox_inches='tight')
        plt.close(1)

        stat = {}
        stat["summary"] = result
        stat["command"] = makeDict(self.options)
        stat["kmer_content"] = {}
        stat["kmer_content"]["read1_prefilter"] = r1qc_prefilter.topKmerCount[
            0:10]
        stat["kmer_content"][
            "read1_postfilter"] = r1qc_postfilter.topKmerCount[0:10]
        if self.options.read2_file is not None:
            stat["kmer_content"][
                "read2_prefilter"] = r2qc_prefilter.topKmerCount[0:10]
            stat["kmer_content"][
                "read2_postfilter"] = r2qc_postfilter.topKmerCount[0:10]
            stat["overlap"] = {}
            stat["overlap"]['overlapped_pairs'] = OVERLAPPED
            if OVERLAPPED > 0:
                # convert before dividing so the average is not floored by
                # integer division
                stat["overlap"]['average_overlap_length'] = float(
                    OVERLAP_LEN_SUM) / OVERLAPPED
            else:
                stat["overlap"]['average_overlap_length'] = 0.0
            stat["overlap"]['bad_edit_distance'] = BADOL
            stat["overlap"]['bad_mismatch_bases'] = BADMISMATCH
            stat["overlap"]['bad_indel'] = BADINDEL
            stat["overlap"][
                'reads_with_corrected_mismatch_bases'] = BASE_CORRECTED
            stat["overlap"][
                'overlapped_area_edit_distance_histogram'] = distance_histgram[
                    0:10]
            plotOverlapHistgram(overlap_histgram, readLen, TOTAL,
                                os.path.join(qc_dir, "overlap.png"))

        stat_json = json.dumps(stat,
                               sort_keys=True,
                               indent=4,
                               separators=(',', ': '))
        # the context manager closes the file even if the write fails
        with open(os.path.join(qc_dir, "after.json"), "w") as stat_file:
            stat_file.write(stat_json)

        self.addFiguresToReport(reporter)
        reporter.output(os.path.join(qc_dir, "report.html"))
Beispiel #7
0
    def run(self):
        """Filter the configured FASTQ input(s) and write reads, stats and reports.

        Reads everything it needs from ``self.options``. In order, it:

        1. optionally loads bubble circles (``options.debubble``);
        2. creates ``<dir of read1>/QC/<basename of read1>/`` and collects and
           plots pre-filter QC statistics for read1 (and read2 if given);
        3. resolves automatic trimming (``-1`` values) via ``autoTrim()``;
        4. opens good/bad/overlap writers unless ``options.qc_only`` is set;
        5. walks all inputs in lock step, rejecting a read (or pair) at the
           first failing filter: barcode, trim, bubble, length, polyX,
           low quality, N count, overlap;
        6. for overlapping pairs with a small edit distance, tries to correct
           mismatching bases using the quality of the two mates;
        7. collects and plots post-filter QC, writes ``filter-stat.png``,
           ``after.json`` and ``report.html`` into the QC folder.

        Returns:
            None. All results are written to disk.
        """
        if self.options.debubble:
            self.loadBubbleCircles()

        # read1_file is required
        read1_file = fastq.Reader(self.options.read1_file)
        # create a QC folder to contain the QC results
        qc_base_folder = os.path.join(os.path.dirname(self.options.read1_file), "QC")
        if not os.path.exists(qc_base_folder):
            os.makedirs(qc_base_folder)
        # QC result folder of this file/pair
        qc_dir = os.path.join(qc_base_folder, os.path.basename(self.options.read1_file))
        if not os.path.exists(qc_dir):
            os.makedirs(qc_dir)

        # no front trim if the sequence is barcoded
        if self.options.barcode:
            self.options.trim_front = 0

        reporter = QCReporter()

        r1qc_prefilter = QualityControl(self.options.qc_sample, self.options.qc_kmer)
        r2qc_prefilter = QualityControl(self.options.qc_sample, self.options.qc_kmer)
        r1qc_prefilter.statFile(self.options.read1_file)
        r1qc_prefilter.plot(qc_dir, "R1-prefilter")
        if self.options.read2_file is not None:
            r2qc_prefilter.statFile(self.options.read2_file)
            r2qc_prefilter.plot(qc_dir, "R2-prefilter")

        r1qc_postfilter = QualityControl(self.options.qc_sample, self.options.qc_kmer)
        r2qc_postfilter = QualityControl(self.options.qc_sample, self.options.qc_kmer)

        # histograms are indexed by overlap length / edit distance, 0..readLen
        readLen = r1qc_prefilter.readLen
        overlap_histgram = [0 for x in xrange(readLen + 1)]
        distance_histgram = [0 for x in xrange(readLen + 1)]

        # auto detect trim front and trim tail (-1 means "not specified")
        if self.options.trim_front == -1 or self.options.trim_tail == -1:
            # auto trim for read1
            trimFront, trimTail = r1qc_prefilter.autoTrim()
            if self.options.trim_front == -1:
                self.options.trim_front = trimFront
            if self.options.trim_tail == -1:
                self.options.trim_tail = trimTail
            # auto trim for read2
            if self.options.read2_file is not None:
                # check if we should keep the same trimming for read1/read2 to keep their length identical
                # this option is on by default because lots of dedup algorithms require this feature
                if self.options.trim_pair_same:
                    self.options.trim_front2 = self.options.trim_front
                    self.options.trim_tail2 = self.options.trim_tail
                else:
                    trimFront2, trimTail2 = r2qc_prefilter.autoTrim()
                    if self.options.trim_front2 == -1:
                        self.options.trim_front2 = trimFront2
                    if self.options.trim_tail2 == -1:
                        self.options.trim_tail2 = trimTail2

        print(self.options.read1_file + " options:")
        print(self.options)

        # output folders default to the folder of the read1 file when not specified
        good_dir = self.options.good_output_folder
        if good_dir is None:
            good_dir = os.path.dirname(self.options.read1_file)

        bad_dir = self.options.bad_output_folder
        if bad_dir is None:
            bad_dir = os.path.dirname(self.options.read1_file)

        overlap_dir = self.options.overlap_output_folder
        if overlap_dir is None:
            overlap_dir = os.path.dirname(self.options.read1_file)

        if not os.path.exists(good_dir):
            os.makedirs(good_dir)

        if not os.path.exists(bad_dir):
            os.makedirs(bad_dir)

        if self.options.store_overlap and self.options.read2_file is not None and (not os.path.exists(overlap_dir)):
            os.makedirs(overlap_dir)

        # writers stay None in qc_only mode; they are handed to writeReads() as-is
        good_read1_file = None
        bad_read1_file = None
        overlap_read1_file = None
        if not self.options.qc_only:
            good_read1_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.read1_file) + ".good.fq"))
            bad_read1_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.read1_file) + ".bad.fq"))
            if self.options.store_overlap:
                overlap_read1_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.read1_file) + ".overlap.fq"))

        # other files are optional
        read2_file = None
        good_read2_file = None
        bad_read2_file = None
        overlap_read2_file = None

        index1_file = None
        good_index1_file = None
        bad_index1_file = None
        overlap_index1_file = None

        index2_file = None
        good_index2_file = None
        bad_index2_file = None
        overlap_index2_file = None

        # if other files are specified, then read them
        if self.options.read2_file is not None:
            read2_file = fastq.Reader(self.options.read2_file)
            if not self.options.qc_only:
                good_read2_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.read2_file) + ".good.fq"))
                bad_read2_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.read2_file) + ".bad.fq"))
                if self.options.store_overlap and self.options.read2_file is not None:
                    overlap_read2_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.read2_file) + ".overlap.fq"))
        if self.options.index1_file is not None:
            index1_file = fastq.Reader(self.options.index1_file)
            if not self.options.qc_only:
                good_index1_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.index1_file) + ".good.fq"))
                bad_index1_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.index1_file) + ".bad.fq"))
                if self.options.store_overlap and self.options.read2_file is not None:
                    overlap_index1_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.index1_file) + ".overlap.fq"))
        if self.options.index2_file is not None:
            index2_file = fastq.Reader(self.options.index2_file)
            if not self.options.qc_only:
                good_index2_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.index2_file) + ".good.fq"))
                bad_index2_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.index2_file) + ".bad.fq"))
                if self.options.store_overlap and self.options.read2_file is not None:
                    overlap_index2_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.index2_file) + ".overlap.fq"))

        # current records; r2/i1/i2 stay None when the matching input is absent
        r1 = None
        r2 = None
        i1 = None
        i2 = None

        # stat numbers
        TOTAL = 0
        GOOD = 0
        BADBCD1 = 0
        BADBCD2 = 0
        BADTRIM1 = 0
        BADTRIM2 = 0
        BADBBL = 0
        BADLEN = 0
        BADPOL = 0
        BADLQC = 0
        BADNCT = 0
        BADOL = 0
        BADINDEL = 0
        BADMISMATCH = 0
        BASE_CORRECTED = 0
        OVERLAPPED = 0
        OVERLAP_LEN_SUM = 0

        while True:
            # stop as soon as any of the inputs is exhausted
            r1 = read1_file.nextRead()
            if r1 is None:
                break

            if read2_file is not None:
                r2 = read2_file.nextRead()
                if r2 is None:
                    break
            if index1_file is not None:
                i1 = index1_file.nextRead()
                if i1 is None:
                    break
            if index2_file is not None:
                i2 = index2_file.nextRead()
                if i2 is None:
                    break

            TOTAL += 1

            # barcode processing
            if self.options.barcode:
                barcodeLen1 = barcodeprocesser.detectBarcode(r1[1], self.options.barcode_length, self.options.barcode_verify)
                if barcodeLen1 == 0:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADBCD1")
                    BADBCD1 += 1
                    continue
                else:
                    if r2 is None:
                        barcodeprocesser.moveBarcodeToName(r1, self.options.barcode_length, self.options.barcode_verify)
                    else:
                        barcodeLen2 = barcodeprocesser.detectBarcode(r2[1], self.options.barcode_length, self.options.barcode_verify)
                        if barcodeLen2 == 0:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADBCD2")
                            BADBCD2 += 1
                            continue
                        else:
                            barcodeprocesser.moveAndTrimPair(r1, r2, barcodeLen1, barcodeLen2, self.options.barcode_verify)

            # trim
            # NOTE(review): read2 is only trimmed when a read1 trim option is > 0,
            # even if trim_front2/trim_tail2 are set on their own -- confirm intent.
            if self.options.trim_front > 0 or self.options.trim_tail > 0:
                r1 = trim(r1, self.options.trim_front, self.options.trim_tail)
                if len(r1[1]) < 5:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADTRIM1")
                    BADTRIM1 += 1
                    continue
                if r2 is not None:
                    r2 = trim(r2, self.options.trim_front2, self.options.trim_tail2)
                    if len(r2[1]) < 5:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADTRIM2")
                        BADTRIM2 += 1
                        continue

            # filter debubble
            if self.options.debubble:
                if self.isInBubble(r1[0]):
                    self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADBBL")
                    BADBBL += 1
                    continue

            # filter sequence length
            if len(r1[1]) < self.options.seq_len_req:
                self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADLEN")
                BADLEN += 1
                continue

            # check polyX
            if self.options.poly_size_limit > 0:
                poly1 = hasPolyX(r1[1], self.options.poly_size_limit, self.options.allow_mismatch_in_poly)
                poly2 = None
                if r2 is not None:
                    poly2 = hasPolyX(r2[1], self.options.poly_size_limit, self.options.allow_mismatch_in_poly)
                if poly1 is not None or poly2 is not None:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADPOL")
                    BADPOL += 1
                    continue

            # check low quality count
            if self.options.unqualified_base_limit > 0:
                lowQual1 = lowQualityNum(r1, self.options.qualified_quality_phred)
                lowQual2 = 0
                if r2 is not None:
                    lowQual2 = lowQualityNum(r2, self.options.qualified_quality_phred)
                # reject the pair when EITHER mate exceeds the limit
                # (the second operand used to test lowQual1 again, ignoring read2)
                if lowQual1 > self.options.unqualified_base_limit or lowQual2 > self.options.unqualified_base_limit:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADLQC")
                    BADLQC += 1
                    continue

            # check N number
            if self.options.n_base_limit > 0:
                nNum1 = nNumber(r1)
                nNum2 = 0
                if r2 is not None:
                    nNum2 = nNumber(r2)
                if nNum1 > self.options.n_base_limit or nNum2 > self.options.n_base_limit:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADNCT")
                    BADNCT += 1
                    continue

            # check overlap and do error correction
            if r2 is not None:
                (offset, overlap_len, distance) = util.overlap(r1[1], r2[1])
                overlap_histgram[overlap_len] += 1
                # deal with the case where the insert DNA is shorter than the read length, making offset negative
                if offset < 0 and overlap_len > 30:
                    # cut off the junk bases
                    r1[1] = r1[1][0:overlap_len]
                    r1[3] = r1[3][0:overlap_len]
                    r2[1] = r2[1][-offset:-offset + overlap_len]
                    r2[3] = r2[3][-offset:-offset + overlap_len]
                    # then calculate the overlap again
                    (offset, overlap_len, distance) = util.overlap(r1[1], r2[1])
                if overlap_len > 30:
                    OVERLAPPED += 1
                    distance_histgram[distance] += 1
                    OVERLAP_LEN_SUM += overlap_len
                    corrected = 0
                    if distance > 2:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADOL")
                        BADOL += 1
                        continue
                    elif distance > 0:
                        # a Hamming distance different from the reported distance is treated as an indel
                        hamming = util.hammingDistance(r1[1][len(r1[1]) - overlap_len:], util.reverseComplement(r2[1][len(r2[1]) - overlap_len:]))
                        if hamming != distance:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADINDEL")
                            BADINDEL += 1
                            continue
                        # try to fix low quality bases: walk the tail of r1 forwards
                        # against the complement of r2 read backwards
                        for o in xrange(overlap_len):
                            b1 = r1[1][len(r1[1]) - overlap_len + o]
                            b2 = util.complement(r2[1][-o - 1])
                            q1 = r1[3][len(r1[3]) - overlap_len + o]
                            q2 = r2[3][-o - 1]
                            if b1 != b2:
                                # only correct when one mate is confident (>= 27)
                                # and the other is poor (<= 16)
                                if util.qualNum(q1) >= 27 and util.qualNum(q2) <= 16:
                                    r2[1] = util.changeString(r2[1], -o - 1, util.complement(b1))
                                    r2[3] = util.changeString(r2[3], -o - 1, q1)
                                    corrected += 1
                                elif util.qualNum(q2) >= 27 and util.qualNum(q1) <= 16:
                                    r1[1] = util.changeString(r1[1], len(r1[1]) - overlap_len + o, b2)
                                    r1[3] = util.changeString(r1[3], len(r1[3]) - overlap_len + o, q2)
                                    corrected += 1
                                if corrected >= distance:
                                    break
                        if corrected == distance:
                            BASE_CORRECTED += 1
                        else:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADMISMATCH")
                            BADMISMATCH += 1
                            continue
                    if distance == 0 or distance == corrected:
                        if self.options.store_overlap:
                            self.writeReads(getOverlap(r1, overlap_len), getOverlap(r2, overlap_len), i1, i2, overlap_read1_file, overlap_read2_file, overlap_index1_file, overlap_index2_file, None)

            # write to good
            self.writeReads(r1, r2, i1, i2, good_read1_file, good_read2_file, good_index1_file, good_index2_file, None)
            r1qc_postfilter.statRead(r1)
            if r2 is not None:
                r2qc_postfilter.statRead(r2)

            GOOD += 1
            # in qc_only mode a sample of qc_sample reads is enough
            if self.options.qc_only and TOTAL >= self.options.qc_sample:
                break

        r1qc_postfilter.qc()
        r1qc_postfilter.plot(qc_dir, "R1-postfilter")
        if self.options.read2_file is not None:
            r2qc_postfilter.qc()
            r2qc_postfilter.plot(qc_dir, "R2-postfilter")

        # flush all output files
        if not self.options.qc_only:
            good_read1_file.flush()
            bad_read1_file.flush()
            if self.options.read2_file is not None:
                good_read2_file.flush()
                bad_read2_file.flush()
            if self.options.index1_file is not None:
                good_index1_file.flush()
                bad_index1_file.flush()
            if self.options.index2_file is not None:
                good_index2_file.flush()
                bad_index2_file.flush()
            # overlap writers only exist with store_overlap; flush them the same
            # way as the good/bad writers so their output is not left pending
            for overlap_writer in (overlap_read1_file, overlap_read2_file, overlap_index1_file, overlap_index2_file):
                if overlap_writer is not None:
                    overlap_writer.flush()

        # summary numbers
        BAD = TOTAL - GOOD
        result = {}
        result['total_reads'] = TOTAL
        result['good_reads'] = GOOD
        result['bad_reads'] = BAD
        result['bad_reads_with_bad_barcode'] = BADBCD1 + BADBCD2
        result['bad_reads_with_reads_in_bubble'] = BADBBL
        result['bad_reads_with_bad_read_length'] = BADLEN + BADTRIM1 + BADTRIM2
        result['bad_reads_with_polyX'] = BADPOL
        result['bad_reads_with_low_quality'] = BADLQC
        result['bad_reads_with_too_many_N'] = BADNCT
        result['bad_reads_with_bad_overlap'] = BADOL + BADMISMATCH + BADINDEL

        # plot result bar figure; optional categories only appear when enabled
        labels = ['good reads', 'has_polyX', 'low_quality', 'too_short', 'too_many_N']
        counts = [GOOD, BADPOL, BADLQC, BADLEN + BADTRIM1 + BADTRIM2, BADNCT]
        colors = ['green', '#FF1111', '#FF3333', '#FF5555', '#FF7777']
        if self.options.read2_file is not None:
            labels.append('bad_overlap')
            counts.append(BADOL + BADMISMATCH + BADINDEL)
            colors.append('#FF9999')
        if self.options.debubble:
            labels.append('in_bubble')
            counts.append(BADBBL)
            colors.append('#FFBBBB')
        if self.options.barcode:
            labels.append('bad_barcode')
            counts.append(BADBCD1 + BADBCD2)
            colors.append('#FFDDDD')

        fig = plt.figure(1)
        plt.title("Good reads (green) and bad reads (red) of total " + str(TOTAL))
        fig.subplots_adjust(left = 0.14)
        lefts = xrange(len(counts))
        plt.yticks(lefts, labels)
        plt.ylim(-0.5, len(counts) - 0.5)
        plt.barh(lefts, counts, align='center', height=0.5, alpha=0.8, color=colors)
        plt.savefig(os.path.join(qc_dir, "filter-stat.png"))
        plt.close(1)

        stat = {}
        stat["summary"] = result
        stat["command"] = makeDict(self.options)
        stat["kmer_content"] = {}
        stat["kmer_content"]["read1_prefilter"] = r1qc_prefilter.topKmerCount[0:10]
        stat["kmer_content"]["read1_postfilter"] = r1qc_postfilter.topKmerCount[0:10]
        if self.options.read2_file is not None:
            stat["kmer_content"]["read2_prefilter"] = r2qc_prefilter.topKmerCount[0:10]
            stat["kmer_content"]["read2_postfilter"] = r2qc_postfilter.topKmerCount[0:10]
            stat["overlap"] = {}
            stat["overlap"]['overlapped_pairs'] = OVERLAPPED
            if OVERLAPPED > 0:
                # convert before dividing: int / int floors under Python 2
                stat["overlap"]['average_overlap_length'] = float(OVERLAP_LEN_SUM) / OVERLAPPED
            else:
                stat["overlap"]['average_overlap_length'] = 0.0
            stat["overlap"]['bad_edit_distance'] = BADOL
            stat["overlap"]['bad_mismatch_bases'] = BADMISMATCH
            stat["overlap"]['bad_indel'] = BADINDEL
            stat["overlap"]['reads_with_corrected_mismatch_bases'] = BASE_CORRECTED
            stat["overlap"]['overlapped_area_edit_distance_histogram'] = distance_histgram[0:10]
            plotOverlapHistgram(overlap_histgram, readLen, TOTAL, os.path.join(qc_dir, "overlap.png"))

        # serialize first, then write with a context manager so the file handle
        # is always closed
        stat_json = json.dumps(stat, sort_keys=True, indent=4, separators=(',', ': '))
        with open(os.path.join(qc_dir, "after.json"), "w") as stat_file:
            stat_file.write(stat_json)

        self.addFiguresToReport(reporter)
        reporter.output(os.path.join(qc_dir, "report.html"))