Esempio n. 1
0
    def statFile(self, filename):
        """Sample reads from a FASTQ file, feed them to ``statRead``, then run ``qc``.

        The file name is stored on ``self.filename``. Reads are pulled from a
        ``fastq.Reader`` one at a time; ``self.readCount`` is incremented for
        every read obtained (including the ones set aside and the one that
        trips the sample limit).

        Reads seen while ``self.readCount`` is still below ``READ_TO_SKIP`` are
        set aside instead of being analysed. After that, reads are passed to
        ``self.statRead`` until the reader is exhausted or, when
        ``self.sampleLimit`` is positive, more than ``self.sampleLimit`` reads
        have been counted in this call.

        If fewer than ``READ_TO_SKIP`` reads were counted after the set-aside
        phase, the set-aside reads are analysed as well.

        Args:
            filename: path handed to ``fastq.Reader``.
        """
        self.filename = filename
        READ_TO_SKIP = 1000
        reader = fastq.Reader(filename)
        stat_reads_num = 0
        skipped_reads = []
        # sample up to self.sampleLimit reads for stat
        while True:
            read = reader.nextRead()
            # None marks the end of input; test identity rather than equality
            if read is None:
                break
            self.readCount += 1
            # the leading reads are set aside because they are usually not stable
            if self.readCount < READ_TO_SKIP:
                skipped_reads.append(read)
                continue
            stat_reads_num += 1
            if stat_reads_num > self.sampleLimit and self.sampleLimit > 0:
                break
            self.statRead(read)

        # if the fq file is too small, then we also stat the reads set aside above
        if stat_reads_num < READ_TO_SKIP:
            for read in skipped_reads:
                self.statRead(read)

        self.qc()
Esempio n. 2
0
    def extract(self):
        """Sample reads from ``self.filename`` and accumulate statistics.

        Reads are pulled from a ``fastq.Reader`` one at a time and
        ``self.read_count`` is incremented for each read obtained. Reads seen
        while ``self.read_count`` is below ``READ_TO_SKIP`` are set aside; the
        following reads are passed to ``self.stat_read`` until the reader is
        exhausted or, when ``self.sample_limit`` is positive, more than
        ``self.sample_limit`` reads have been counted in this call.

        If fewer than ``READ_TO_SKIP`` reads were counted after the set-aside
        phase, the set-aside reads are analysed as well. Finally
        ``self.calc_read_len()`` and ``self.calc_percents()`` are invoked.

        NOTE(review): ``READ_TO_SKIP`` is not defined inside this method; it is
        expected to be available from an enclosing scope -- confirm.
        """
        reader = fastq.Reader(self.filename)
        stat_reads_num = 0
        skipped_reads = []
        # sample up to self.sample_limit reads for stat
        while True:
            read = reader.nextRead()
            # None marks the end of input; test identity rather than equality
            if read is None:
                break
            self.read_count += 1
            # the leading reads are set aside because they are usually not stable
            if self.read_count < READ_TO_SKIP:
                skipped_reads.append(read)
                continue
            stat_reads_num += 1
            if stat_reads_num > self.sample_limit and self.sample_limit > 0:
                break
            self.stat_read(read)

        # if the fq file is too small, then we also stat the reads set aside above
        if stat_reads_num < READ_TO_SKIP:
            for read in skipped_reads:
                self.stat_read(read)

        self.calc_read_len()
        self.calc_percents()
Esempio n. 3
0
    def statFile(self, filename):
        """Feed reads from a FASTQ file to ``statRead``, then run ``qc``.

        Reads are pulled from a ``fastq.Reader`` and passed to
        ``self.statRead`` until the reader returns ``None`` or, when
        ``self.sampleLimit`` is positive, ``self.readCount`` has reached
        ``self.sampleLimit``. A non-positive ``self.sampleLimit`` means no cap.

        ``self.readCount`` is incremented once per read handed to
        ``statRead``. (Previously it was bumped before the end-of-input and
        limit checks, which left it one higher than the number of reads
        actually processed.)

        Args:
            filename: path handed to ``fastq.Reader``.
        """
        reader = fastq.Reader(filename)
        # sample up to self.sampleLimit reads for stat (no cap when it is <= 0)
        while self.sampleLimit <= 0 or self.readCount < self.sampleLimit:
            read = reader.nextRead()
            if read is None:
                break
            # count only reads that are actually analysed
            self.readCount += 1
            self.statRead(read)

        self.qc()
Esempio n. 4
0
def stat(filename):
    """Print total, Q20 and Q30 base counts and percentages for a FASTQ file.

    Every read returned by ``fastq.Reader(filename)`` contributes the length
    of its fourth field (``read[3]``) to the total, and the two values
    returned by ``qual_stat(read[3])`` to the Q20 and Q30 counters.

    When no bases were counted at all, the percentages are reported as 0.0
    instead of raising ``ZeroDivisionError``.

    Args:
        filename: path handed to ``fastq.Reader``.
    """
    reader = fastq.Reader(filename)
    total_count = 0
    q20_count = 0
    q30_count = 0
    while True:
        read = reader.nextRead()
        if not read:
            break
        quality = read[3]
        total_count += len(quality)
        q20, q30 = qual_stat(quality)
        q20_count += q20
        q30_count += q30

    # guard the ratios: an input without any bases has no meaningful percentage
    if total_count > 0:
        q20_percent = 100 * float(q20_count) / float(total_count)
        q30_percent = 100 * float(q30_count) / float(total_count)
    else:
        q20_percent = 0.0
        q30_percent = 0.0

    print("total bases:", total_count)
    print("q20 bases:", q20_count)
    print("q30 bases:", q30_count)
    print("q20 percents:", q20_percent)
    print("q30 percents:", q30_percent)
Esempio n. 5
0
def stat(filename):
    reader = fastq.Reader(filename)
    total_count = 0
    q20_count = 0
    q30_count = 0
    min_len = 150
    max_len = 150
    read_num = 0
    Count_GC = 0
    while True:
        read = reader.nextRead()
        if read == None:
            break
        total_count += len(read[3])
        if min_len > len(read[3]):
            min_len = len(read[3])
        if max_len < len(read[3]):
            max_len = len(read[3])
        read_num = read_num + 1
        q20, q30 = qual_stat(read[3])
        q20_count += q20
        q30_count += q30
        Count_GC += getContentOf(['G', 'C', 'g', 'c'], read[1])

    #print "total bases","\t",total_count
    #print "q20 bases","\t",q20_count
    #print "q30 bases","\t",q30_count
    #print "q20 percents","\t",100 * float(q20_count)/float(total_count)
    #print "q30 percents","\t",100 * float(q30_count)/float(total_count)
    #print "Read Length Distributon","\t",min_len,  max_len
    #print "Mean Read Length","\t",float(total_count)/float(read_num)
    print "Total bases\tQ20 bases\tQ30 bases\tQ20 percents\tq30 percents\tGC Cntent\tRead Length Distributon\tMean Read Length"
    print total_count, "\t", q20_count, "\t", q30_count, "\t", 100 * float(
        q20_count) / float(total_count), "\t", 100 * float(q30_count) / float(
            total_count), "\t", 100 * float(Count_GC) / float(
                total_count), "\t", min_len, "-", max_len, "\t", float(
                    total_count) / float(read_num)
Esempio n. 6
0
def stat(filename, output):
    """Write the total read count and the count of reads scoring >= 30.

    Every read returned by ``fastq.Reader(filename)`` is counted, and
    ``qual_stat(read[3])`` is evaluated for it; reads whose result is at
    least 30 are counted separately. (Presumably ``qual_stat`` returns the
    read's average quality here -- confirm against its definition.)

    The report written to ``output`` has three lines without a trailing
    newline: the file name, ``total sequences: N`` and ``q30 reads: M``.

    Args:
        filename: path handed to ``fastq.Reader``; converted with ``str`` for
            the report.
        output: path of the report file, opened for writing (overwritten).
    """
    reader = fastq.Reader(filename)
    seq_count = 0
    q30_seq_count = 0
    while True:
        read = reader.nextRead()
        if read is None:
            break
        q30_seq = qual_stat(read[3])
        seq_count += 1
        if q30_seq >= 30:
            q30_seq_count += 1
    filename = str(filename)
    total_seqs = "total sequences: " + str(seq_count)
    q30_seqs = "q30 reads: " + str(q30_seq_count)

    # the context manager closes the report even if the write raises
    with open(output, "w") as q30_out:
        q30_out.writelines([filename, "\n", total_seqs, "\n", q30_seqs])
Esempio n. 7
0
    def run(self):
        if self.options.debubble:
            self.loadBubbleCircles()

        #read1_file is required
        read1_file = fastq.Reader(self.options.read1_file)

        #no front trim if sequence is barcoded
        if self.options.barcode:
            self.options.trim_front = 0

        reporter = QCReporter()

        self.r1qc_prefilter = QualityControl(self.options.qc_sample,
                                             self.options.qc_kmer)
        self.r2qc_prefilter = QualityControl(self.options.qc_sample,
                                             self.options.qc_kmer)
        self.r1qc_prefilter.statFile(self.options.read1_file)
        if self.options.read2_file != None:
            self.r2qc_prefilter.statFile(self.options.read2_file)

        self.r1qc_postfilter = QualityControl(self.options.qc_sample,
                                              self.options.qc_kmer)
        self.r2qc_postfilter = QualityControl(self.options.qc_sample,
                                              self.options.qc_kmer)

        readLen = self.r1qc_prefilter.readLen
        overlap_histgram = [0 for x in xrange(readLen + 1)]
        distance_histgram = [0 for x in xrange(readLen + 1)]

        #auto detect trim front and trim tail
        if self.options.trim_front == -1 or self.options.trim_tail == -1:
            #auto trim for read1
            trimFront, trimTail = self.r1qc_prefilter.autoTrim()
            if self.options.trim_front == -1:
                self.options.trim_front = trimFront
            if self.options.trim_tail == -1:
                self.options.trim_tail = trimTail
            #auto trim for read2
            if self.options.read2_file != None:
                # check if we should keep same trimming for read1/read2 to keep their length identical
                # this option is on by default because lots of dedup algorithms require this feature
                if self.options.trim_pair_same:
                    self.options.trim_front2 = self.options.trim_front
                    self.options.trim_tail2 = self.options.trim_tail
                else:
                    trimFront2, trimTail2 = self.r2qc_prefilter.autoTrim()
                    if self.options.trim_front2 == -1:
                        self.options.trim_front2 = trimFront2
                    if self.options.trim_tail2 == -1:
                        self.options.trim_tail2 = trimTail2

        print(self.options.read1_file + " options:")
        print(self.options)

        #if good output folder not specified, set it as the same folder of read1 file
        good_dir = self.options.good_output_folder
        if good_dir == None:
            good_dir = os.path.dirname(self.options.read1_file)

        #if bad output folder not specified, set it as the same folder of read1 file
        bad_dir = self.options.bad_output_folder
        if bad_dir == None:
            bad_dir = os.path.join(
                os.path.dirname(os.path.dirname(good_dir + "/")), "bad")

        #if overlap output folder not specified, set it as the same folder of read1 file
        overlap_dir = self.options.overlap_output_folder
        if overlap_dir == None:
            #            overlap_dir = os.path.dirname(self.options.read1_file)
            overlap_dir = os.path.join(
                os.path.dirname(os.path.dirname(good_dir + "/")), "overlap")

        #save QC results at the same folder of good
        qc_base_folder = self.options.report_output_folder
        if qc_base_folder == None:
            qc_base_folder = os.path.join(
                os.path.dirname(os.path.dirname(good_dir + "/")), "QC")
        if not os.path.exists(qc_base_folder):
            os.makedirs(qc_base_folder)
        qc_dir = qc_base_folder

        if not os.path.exists(good_dir):
            os.makedirs(good_dir)

        if not os.path.exists(bad_dir):
            os.makedirs(bad_dir)

        if self.options.store_overlap and self.options.read2_file != None and (
                not os.path.exists(overlap_dir)):
            os.makedirs(overlap_dir)

        gzip_out = self.options.gzip
        gzip_comp = self.options.compression
        if not gzip_out and self.options.read1_file.endswith(".gz"):
            gzip_out = True

        good_read1_file = None
        bad_read1_file = None
        overlap_read1_file = None
        if not self.options.qc_only:
            good_read1_file = fastq.Writer(
                os.path.join(good_dir,
                             getMainName(self.options.read1_file) +
                             ".good.fq"), gzip_out, gzip_comp)
            bad_read1_file = fastq.Writer(
                os.path.join(bad_dir,
                             getMainName(self.options.read1_file) + ".bad.fq"),
                gzip_out, gzip_comp)

            overlap_read1_file = None
            if self.options.store_overlap:
                overlap_read1_file = fastq.Writer(
                    os.path.join(
                        overlap_dir,
                        getMainName(self.options.read1_file) + ".overlap.fq"),
                    gzip_out, gzip_comp)

        #other files are optional
        read2_file = None
        good_read2_file = None
        bad_read2_file = None
        overlap_read2_file = None

        index1_file = None
        good_index1_file = None
        bad_index1_file = None
        overlap_index1_file = None

        index2_file = None
        good_index2_file = None
        bad_index2_file = None
        overlap_index2_file = None

        #if other files are specified, then read them
        if self.options.read2_file != None:
            read2_file = fastq.Reader(self.options.read2_file)
            if not self.options.qc_only:
                good_read2_file = fastq.Writer(
                    os.path.join(
                        good_dir,
                        getMainName(self.options.read2_file) + ".good.fq"),
                    gzip_out, gzip_comp)
                bad_read2_file = fastq.Writer(
                    os.path.join(
                        bad_dir,
                        getMainName(self.options.read2_file) + ".bad.fq"),
                    gzip_out, gzip_comp)
                if self.options.store_overlap and self.options.read2_file != None:
                    overlap_read2_file = fastq.Writer(
                        os.path.join(
                            overlap_dir,
                            getMainName(self.options.read2_file) +
                            ".overlap.fq"), gzip_out, gzip_comp)
        if self.options.index1_file != None:
            index1_file = fastq.Reader(self.options.index1_file)
            if not self.options.qc_only:
                good_index1_file = fastq.Writer(
                    os.path.join(
                        good_dir,
                        getMainName(self.options.index1_file) + ".good.fq"),
                    gzip_out, gzip_comp)
                bad_index1_file = fastq.Writer(
                    os.path.join(
                        bad_dir,
                        getMainName(self.options.index1_file) + ".bad.fq"),
                    gzip_out, gzip_comp)
                if self.options.store_overlap and self.options.read2_file != None:
                    overlap_index1_file = fastq.Writer(
                        os.path.join(
                            overlap_dir,
                            getMainName(self.options.index1_file) +
                            ".overlap.fq"), gzip_out, gzip_comp)
        if self.options.index2_file != None:
            index2_file = fastq.Reader(self.options.index2_file)
            if not self.options.qc_only:
                good_index2_file = fastq.Writer(
                    os.path.join(
                        good_dir,
                        getMainName(self.options.index2_file) + ".good.fq"),
                    gzip_out, gzip_comp)
                bad_index2_file = fastq.Writer(
                    os.path.join(
                        bad_dir,
                        getMainName(self.options.index2_file) + ".bad.fq"),
                    gzip_out, gzip_comp)
                if self.options.store_overlap and self.options.read2_file != None:
                    overlap_index2_file = fastq.Writer(
                        os.path.join(
                            overlap_dir,
                            getMainName(self.options.index2_file) +
                            ".overlap.fq"), gzip_out, gzip_comp)

        r1 = None
        r2 = None
        i1 = None
        i2 = None

        # stat numbers
        TOTAL_BASES = 0
        GOOD_BASES = 0
        TOTAL_READS = 0
        GOOD_READS = 0
        BAD_READS = 0
        BADBCD1 = 0
        BADBCD2 = 0
        BADTRIM1 = 0
        BADTRIM2 = 0
        BADBBL = 0
        BADLEN = 0
        BADPOL = 0
        BADLQC = 0
        BADNCT = 0
        BADINDEL = 0
        BADMISMATCH = 0
        BADDIFF = 0
        READ_CORRECTED = 0
        BASE_CORRECTED = 0
        BASE_SKIPPED_CORRECTION = 0
        BASE_ZERO_QUAL_MASKED = 0
        OVERLAPPED = 0
        OVERLAP_LEN_SUM = 0
        OVERLAP_BASE_SUM = 0
        # error profiling by overlap analysis
        OVERLAP_BASE_ERR = 0
        OVERLAP_ERR_MATRIX = init_error_matrix()

        #adapter trimming by overlap analysis
        TRIMMED_ADAPTER_BASE = 0
        TRIMMED_ADAPTER_READ = 0

        while True:
            r1 = read1_file.nextRead()
            if r1 == None:
                break
            else:
                TOTAL_BASES += len(r1[1])

            if read2_file != None:
                r2 = read2_file.nextRead()
                if r2 == None:
                    break
            if index1_file != None:
                i1 = index1_file.nextRead()
                if i1 == None:
                    break
            if index2_file != None:
                i2 = index2_file.nextRead()
                if i2 == None:
                    break
                else:
                    TOTAL_BASES += len(r2[1])

            TOTAL_READS += 1

            #barcode processing
            if self.options.barcode:
                barcodeLen1 = barcodeprocesser.detectBarcode(
                    r1[1], self.options.barcode_length,
                    self.options.barcode_verify)
                if barcodeLen1 == 0:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADBCD1")
                    BADBCD1 += 1
                    continue
                else:
                    if r2 == None:
                        barcodeprocesser.moveBarcodeToName(
                            r1, self.options.barcode_length,
                            self.options.barcode_verify)
                    else:
                        barcodeLen2 = barcodeprocesser.detectBarcode(
                            r2[1], self.options.barcode_length,
                            self.options.barcode_verify)
                        if barcodeLen2 == 0:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                            bad_read2_file, bad_index1_file,
                                            bad_index2_file, "BADBCD2")
                            BADBCD2 += 1
                            continue
                        else:
                            barcodeprocesser.moveAndTrimPair(
                                r1, r2, barcodeLen1, barcodeLen2,
                                self.options.barcode_verify)

            #trim
            if self.options.trim_front > 0 or self.options.trim_tail > 0:
                r1 = trim(r1, self.options.trim_front, self.options.trim_tail)
                if len(r1[1]) < 5:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADTRIM1")
                    BADTRIM1 += 1
                    continue
                if r2 != None:
                    r2 = trim(r2, self.options.trim_front2,
                              self.options.trim_tail2)
                    if len(r2[1]) < 5:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                        bad_read2_file, bad_index1_file,
                                        bad_index2_file, "BADTRIM2")
                        BADTRIM2 += 1
                        continue

            #filter debubble
            if self.options.debubble:
                if self.isInBubble(r1[0]):
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADBBL")
                    BADBBL += 1
                    continue

            #filter sequence length
            if len(r1[1]) < self.options.seq_len_req:
                self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                                bad_index1_file, bad_index2_file, "BADLEN")
                BADLEN += 1
                continue

            #check polyX
            if self.options.poly_size_limit > 0:
                poly1 = hasPolyX(r1[1], self.options.poly_size_limit,
                                 self.options.allow_mismatch_in_poly)
                poly2 = None
                if r2 != None:
                    poly2 = hasPolyX(r2[1], self.options.poly_size_limit,
                                     self.options.allow_mismatch_in_poly)
                if poly1 != None or poly2 != None:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADPOL")
                    BADPOL += 1
                    continue

            #check low quality count
            if self.options.unqualified_base_limit > 0:
                lowQual1 = lowQualityNum(r1,
                                         self.options.qualified_quality_phred)
                lowQual2 = 0
                if r2 != None:
                    lowQual2 = lowQualityNum(
                        r2, self.options.qualified_quality_phred)
                if lowQual1 > self.options.unqualified_base_limit or lowQual1 > self.options.unqualified_base_limit:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADLQC")
                    BADLQC += 1
                    continue

            #check N number
            if self.options.n_base_limit > 0:
                nNum1 = nNumber(r1)
                nNum2 = 0
                if r2 != None:
                    nNum2 = nNumber(r2)
                if nNum1 > self.options.n_base_limit or nNum2 > self.options.n_base_limit:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADNCT")
                    BADNCT += 1
                    continue

            #check overlap and do error correction
            if r2 != None and (not self.options.no_overlap):
                (offset, overlap_len, distance) = util.overlap(r1[1], r2[1])
                overlap_histgram[overlap_len] += 1
                # deal with the case insert DNA is shorter than read length and cause offset is negative
                # in this case the adapter is sequenced and should be trimmed
                if offset < 0 and overlap_len > 30:
                    # shift the junk bases
                    r1[1] = r1[1][0:overlap_len]
                    r1[3] = r1[3][0:overlap_len]
                    r2[1] = r2[1][0:overlap_len]
                    r2[3] = r2[3][0:overlap_len]
                    TRIMMED_ADAPTER_BASE += abs(offset) * 2
                    TRIMMED_ADAPTER_READ += 1
                    # check the sequence length again after adapter trimmed
                    if len(r1[1]) < self.options.seq_len_req:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                        bad_read2_file, bad_index1_file,
                                        bad_index2_file, "BADLEN")
                        BADLEN += 1
                        continue
                    # then calc overlap again
                    (offset, overlap_len,
                     distance) = util.overlap(r1[1], r2[1])

                distance_histgram[distance] += 1
                # if distance is too high, then set it as bad mismatch
                if distance > 3:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADDIFF")
                    BADDIFF += 1
                    continue
                if overlap_len > 30:
                    OVERLAPPED += 1
                    OVERLAP_LEN_SUM += overlap_len
                    # we consider the distance is caused by sequencing error
                    OVERLAP_BASE_SUM += overlap_len * 2
                    OVERLAP_BASE_ERR += distance
                    corrected = 0
                    zero_qual_masked = 0
                    skipped_mismatch = 0
                    if distance > 0:
                        #try to fix low quality base
                        #hamming = util.hammingDistance(r1[1][len(r1[1]) - overlap_len:], util.reverseComplement(r2[1][len(r2[1]) - overlap_len:]))
                        #if hamming != distance:
                        #    self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADINDEL")
                        #    BADINDEL += 1
                        #    continue
                        #print(r1[1][len(r1[1]) - overlap_len:])
                        #print(util.reverseComplement(r2[1][len(r2[1]) - overlap_len:]))
                        #print(r1[3][len(r1[1]) - overlap_len:])
                        #print(util.reverse(r2[3][len(r2[1]) - overlap_len:]))
                        err_mtx = init_error_matrix()
                        for o in xrange(overlap_len):
                            b1 = r1[1][len(r1[1]) - overlap_len + o]
                            b2 = util.complement(r2[1][-o - 1])
                            q1 = r1[3][len(r1[3]) - overlap_len + o]
                            q2 = r2[3][-o - 1]
                            if b1 != b2:
                                # print(TOTAL_READS, o, b1, b2, q1, q2)
                                this_is_corrected = False
                                if util.qualNum(q1) >= 30 and util.qualNum(
                                        q2) <= 14:
                                    if b1 != 'N' and b2 != 'N':
                                        err_mtx[util.complement(b1)][
                                            util.complement(b2)] += 1
                                    if not self.options.no_correction:
                                        r2[1] = util.changeString(
                                            r2[1], -o - 1, util.complement(b1))
                                        r2[3] = util.changeString(
                                            r2[3], -o - 1, q1)
                                        corrected += 1
                                        this_is_corrected = True
                                elif util.qualNum(q2) >= 30 and util.qualNum(
                                        q1) <= 14:
                                    if b1 != 'N' and b2 != 'N':
                                        err_mtx[b2][b1] += 1
                                    if not self.options.no_correction:
                                        r1[1] = util.changeString(
                                            r1[1],
                                            len(r1[1]) - overlap_len + o, b2)
                                        r1[3] = util.changeString(
                                            r1[3],
                                            len(r1[3]) - overlap_len + o, q2)
                                        corrected += 1
                                        this_is_corrected = True
                                if not this_is_corrected:
                                    if self.options.mask_mismatch:
                                        # mask them as zero qual if it is not corrected
                                        zero_qual = '!'
                                        r2[3] = util.changeString(
                                            r2[3], -o - 1, zero_qual)
                                        r1[3] = util.changeString(
                                            r1[3],
                                            len(r1[3]) - overlap_len + o,
                                            zero_qual)
                                        zero_qual_masked += 1
                                    else:
                                        skipped_mismatch += 1

                                if corrected + zero_qual_masked + skipped_mismatch >= distance:
                                    break
                        #print(r1[1][len(r1[1]) - overlap_len:])
                        #print(util.reverseComplement(r2[1][len(r2[1]) - overlap_len:]))
                        #print(r1[3][len(r1[1]) - overlap_len:])
                        #print(util.reverse(r2[3][len(r2[1]) - overlap_len:]))
                        if corrected + zero_qual_masked + skipped_mismatch == distance:
                            merge_error_matrix(OVERLAP_ERR_MATRIX, err_mtx)
                            if corrected > 0:
                                READ_CORRECTED += 1
                            BASE_CORRECTED += corrected
                            # multiply by 2 since we mask bases by pair
                            BASE_ZERO_QUAL_MASKED += zero_qual_masked * 2
                            BASE_SKIPPED_CORRECTION += skipped_mismatch * 2
                        else:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                            bad_read2_file, bad_index1_file,
                                            bad_index2_file, "BADMISMATCH")
                            BADMISMATCH += 1
                            continue
                    if distance == 0 or distance == corrected:
                        if self.options.store_overlap:
                            self.writeReads(getOverlap(r1, overlap_len),
                                            getOverlap(r2, overlap_len), i1,
                                            i2, overlap_read1_file,
                                            overlap_read2_file,
                                            overlap_index1_file,
                                            overlap_index2_file, None)

            #write to good
            self.writeReads(r1, r2, i1, i2, good_read1_file, good_read2_file,
                            good_index1_file, good_index2_file, None)
            GOOD_BASES += len(r1[1])
            if i2 != None:
                GOOD_BASES += len(r2[1])
            if self.options.qc_sample <= 0 or TOTAL_READS < self.options.qc_sample:
                self.r1qc_postfilter.statRead(r1)
                if r2 != None:
                    self.r2qc_postfilter.statRead(r2)

            GOOD_READS += 1
            if self.options.qc_only and TOTAL_READS >= self.options.qc_sample:
                break

        self.r1qc_postfilter.qc()
        #self.r1qc_postfilter.plot(qc_dir, "R1-postfilter")
        if self.options.read2_file != None:
            self.r2qc_postfilter.qc()
            #self.r2qc_postfilter.plot(qc_dir, "R2-postfilter")

        #close all files
        if not self.options.qc_only:
            good_read1_file.close()
            bad_read1_file.close()
            if self.options.read2_file != None:
                good_read2_file.close()
                bad_read2_file.close()
            if self.options.index1_file != None:
                good_index1_file.close()
                bad_index1_file.close()
            if self.options.index2_file != None:
                good_index2_file.close()
                bad_index2_file.close()

        # print stat numbers
        BAD_READS = TOTAL_READS - GOOD_READS
        result = {}
        result['total_bases'] = TOTAL_BASES
        result['good_bases'] = GOOD_BASES
        result['total_reads'] = TOTAL_READS
        result['good_reads'] = GOOD_READS
        result['bad_reads'] = BAD_READS
        result['bad_reads_with_bad_barcode'] = BADBCD1 + BADBCD2
        result['bad_reads_with_reads_in_bubble'] = BADBBL
        result['bad_reads_with_bad_read_length'] = BADLEN + BADTRIM1 + BADTRIM2
        result['bad_reads_with_polyX'] = BADPOL
        result['bad_reads_with_low_quality'] = BADLQC
        result['bad_reads_with_too_many_N'] = BADNCT
        result['bad_reads_with_bad_overlap'] = BADMISMATCH + BADINDEL + BADDIFF
        result['readlen'] = readLen

        # plot result bar figure
        labels = [
            'good reads', 'has_polyX', 'low_quality', 'too_short', 'too_many_N'
        ]
        counts = [
            GOOD_READS, BADPOL, BADLQC, BADLEN + BADTRIM1 + BADTRIM2, BADNCT
        ]
        colors = ['#66BB11', '#FF33AF', '#FFD3F2', '#FFA322', '#FF8899']
        if self.options.read2_file != None:
            labels.append('bad_overlap')
            counts.append(BADMISMATCH + BADINDEL + BADDIFF)
            colors.append('#FF6600')
        if self.options.debubble:
            labels.append('in_bubble')
            counts.append(BADBBL)
            colors.append('#EEBB00')
        if self.options.barcode:
            labels.append('bad_barcode')
            counts.append(BADBCD1 + BADBCD2)
            colors.append('#CCDD22')

        for i in xrange(len(counts)):
            type_percent = 0.0
            if TOTAL_READS > 0:
                type_percent = 100.0 * float(counts[i]) / TOTAL_READS
            labels[i] = labels[i] + ": " + str(
                counts[i]) + "(" + str(type_percent) + "%)"

        reporter.addFigure(
            'Good reads and bad reads after filtering',
            self.r1qc_prefilter.statPlotly(labels, counts, TOTAL_READS,
                                           'filter_stat'), 'filter_stat', "")
        #self.r1qc_prefilter.plotFilterStats(labels, counts, colors, TOTAL_READS, os.path.join(qc_dir, "filter-stat.png"))

        #squeeze qc data for JSON output
        self.r1qc_prefilter.squeeze()
        self.r1qc_postfilter.squeeze()
        if self.options.read2_file != None:
            self.r2qc_prefilter.squeeze()
            self.r2qc_postfilter.squeeze()

        stat = {}
        # stat["options"]=self.options
        stat["afterqc_main_summary"] = result
        stat["command"] = makeDict(self.options)
        stat["kmer_content"] = {}
        stat["kmer_content"][
            "read1_prefilter"] = self.r1qc_prefilter.topKmerCount[0:10]
        stat["kmer_content"][
            "read1_postfilter"] = self.r1qc_postfilter.topKmerCount[0:10]

        # output more data in JSON file for offline plotting directly from JSON
        stat["base_quality"] = {}
        stat["base_quality"][
            "read1_prefilter"] = self.r1qc_prefilter.baseMeanQual
        stat["base_quality"][
            "read1_postfilter"] = self.r1qc_postfilter.baseMeanQual
        stat["mean_quality"] = {}
        stat["mean_quality"]["read1_prefilter"] = self.r1qc_prefilter.meanQual
        stat["mean_quality"][
            "read1_postfilter"] = self.r1qc_postfilter.meanQual
        stat["base_content"] = {}
        stat["base_content"]["read1_prefilter"] = self.r1qc_prefilter.percents
        stat["base_content"][
            "read1_postfilter"] = self.r1qc_postfilter.percents
        stat["gc_content"] = {}
        stat["gc_content"]["read1_prefilter"] = self.r1qc_prefilter.gcPercents
        stat["gc_content"][
            "read1_postfilter"] = self.r1qc_postfilter.gcPercents

        if self.options.read2_file != None:
            stat["kmer_content"][
                "read2_prefilter"] = self.r2qc_prefilter.topKmerCount[0:10]
            stat["kmer_content"][
                "read2_postfilter"] = self.r2qc_postfilter.topKmerCount[0:10]

            stat["base_quality"][
                "read2_prefilter"] = self.r2qc_prefilter.baseMeanQual
            stat["base_quality"][
                "read2_postfilter"] = self.r2qc_postfilter.baseMeanQual

            stat["mean_quality"][
                "read2_prefilter"] = self.r2qc_prefilter.meanQual
            stat["mean_quality"][
                "read2_postfilter"] = self.r2qc_postfilter.meanQual

            stat["base_content"][
                "read2_prefilter"] = self.r2qc_prefilter.percents
            stat["base_content"][
                "read2_postfilter"] = self.r2qc_postfilter.percents

            stat["gc_content"][
                "read2_prefilter"] = self.r2qc_prefilter.gcPercents
            stat["gc_content"][
                "read2_postfilter"] = self.r2qc_postfilter.gcPercents

            stat["afterqc_overlap"] = {}
            stat["afterqc_overlap"]['overlapped_pairs'] = OVERLAPPED
            if OVERLAPPED > 0:
                stat["afterqc_overlap"]['average_overlap_length'] = float(
                    OVERLAP_LEN_SUM / OVERLAPPED)
            else:
                stat["afterqc_overlap"]['average_overlap_length'] = 0.0
            stat["afterqc_overlap"]['bad_mismatch_reads'] = BADMISMATCH
            stat["afterqc_overlap"]['bad_diff'] = BADDIFF
            stat["afterqc_overlap"]['bad_indel_reads'] = BADINDEL
            stat["afterqc_overlap"]['corrected_reads'] = READ_CORRECTED
            stat["afterqc_overlap"]['corrected_bases'] = BASE_CORRECTED
            stat["afterqc_overlap"][
                'skipped_correction_bases'] = BASE_SKIPPED_CORRECTION
            stat["afterqc_overlap"]['zero_qual_masked'] = BASE_ZERO_QUAL_MASKED
            stat["afterqc_overlap"][
                'zero_qual_skipped'] = BASE_ZERO_QUAL_MASKED
            stat["afterqc_overlap"][
                'trimmed_adapter_bases'] = TRIMMED_ADAPTER_BASE
            stat["afterqc_overlap"][
                'trimmed_adapter_reads'] = TRIMMED_ADAPTER_READ
            if OVERLAP_BASE_SUM > 0:
                stat["afterqc_overlap"]['error_rate'] = float(
                    OVERLAP_BASE_ERR) / float(OVERLAP_BASE_SUM)
            else:
                stat["afterqc_overlap"]['error_rate'] = 0.0
            stat["afterqc_overlap"]['error_matrix'] = OVERLAP_ERR_MATRIX
            stat["afterqc_overlap"][
                'edit_distance_histogram'] = distance_histgram[0:10]
            reporter.addFigure(
                'Sequence error distribution',
                self.r1qc_prefilter.errorPlotly(OVERLAP_ERR_MATRIX,
                                                'error_matrix'),
                'error_matrix', "")
            reporter.addFigure(
                'Overlap length distribution',
                self.r1qc_prefilter.overlapPlotly(overlap_histgram, readLen,
                                                  TOTAL_READS, 'overlap_stat'),
                'overlap_stat', "")
            #self.r1qc_prefilter.plotOverlapHistgram(overlap_histgram, readLen, TOTAL_READS, os.path.join(qc_dir, "overlap.png"))

        stat_file = open(
            os.path.join(qc_dir,
                         os.path.basename(self.options.read1_file) + ".json"),
            "w")
        stat_json = json.dumps(stat,
                               sort_keys=True,
                               indent=4,
                               separators=(',', ': '))
        stat_file.write(stat_json)
        stat_file.close()

        self.addFiguresToReport(reporter)
        reporter.setStat(stat)
        reporter.setVersion(self.options.version)
        reporter.output(
            os.path.join(qc_dir,
                         os.path.basename(self.options.read1_file) + ".html"))
Esempio n. 8
0
    def run(self):
        """Filter the configured FASTQ input(s) and emit QC statistics and reports.

        The method works in four stages:

        1. Collect pre-filter QC for read1 (and read2 when given) and plot it
           into ``<dir of read1>/QC/<basename of read1>``.
        2. Resolve trimming options that are still ``-1`` (auto) from the
           pre-filter QC.
        3. Stream read1/read2/index1/index2 in lockstep. Every record goes
           through the barcode, trim, bubble, length, polyX, low-quality,
           N-count and (paired-end only) overlap checks in that order. The
           first failing check sends the record to the ``.bad.fq`` writers,
           tagged with the reason; records that pass everything go to the
           ``.good.fq`` writers. Overlapped regions are additionally written
           to ``.overlap.fq`` when ``store_overlap`` is set.
        4. Collect post-filter QC, then write ``filter-stat.png``,
           ``after.json`` and ``report.html`` (plus ``overlap.png`` for
           paired-end input) into the QC directory.

        Side effects: mutates ``self.options`` (``trim_front``, ``trim_tail``,
        ``trim_front2``, ``trim_tail2``), creates output directories and
        writes the files listed above. With ``qc_only`` no FASTQ writers are
        created, so ``writeReads`` is called with ``None`` writers.
        """
        if self.options.debubble:
            self.loadBubbleCircles()

        has_read2 = self.options.read2_file is not None
        # overlap output only makes sense for paired-end input
        keep_overlap = self.options.store_overlap and has_read2

        # read1_file is required
        read1_file = fastq.Reader(self.options.read1_file)
        read1_dir = os.path.dirname(self.options.read1_file)

        # create a QC folder to contain QC results
        qc_base_folder = os.path.join(read1_dir, "QC")
        if not os.path.exists(qc_base_folder):
            os.makedirs(qc_base_folder)
        # QC result of this file/pair
        qc_dir = os.path.join(qc_base_folder,
                              os.path.basename(self.options.read1_file))
        if not os.path.exists(qc_dir):
            os.makedirs(qc_dir)

        # no front trim if sequence is barcoded
        if self.options.barcode:
            self.options.trim_front = 0

        reporter = QCReporter()

        r1qc_prefilter = QualityControl(self.options.qc_sample,
                                        self.options.qc_kmer)
        r2qc_prefilter = QualityControl(self.options.qc_sample,
                                        self.options.qc_kmer)
        r1qc_prefilter.statFile(self.options.read1_file)
        r1qc_prefilter.plot(qc_dir, "R1-prefilter")
        if has_read2:
            r2qc_prefilter.statFile(self.options.read2_file)
            r2qc_prefilter.plot(qc_dir, "R2-prefilter")

        r1qc_postfilter = QualityControl(self.options.qc_sample,
                                         self.options.qc_kmer)
        r2qc_postfilter = QualityControl(self.options.qc_sample,
                                         self.options.qc_kmer)

        # histograms are indexed by overlap length / edit distance
        readLen = r1qc_prefilter.readLen
        overlap_histgram = [0] * (readLen + 1)
        distance_histgram = [0] * (readLen + 1)

        # auto detect trim front and trim tail (-1 means "auto")
        if self.options.trim_front == -1 or self.options.trim_tail == -1:
            # auto trim for read1
            trimFront, trimTail = r1qc_prefilter.autoTrim()
            if self.options.trim_front == -1:
                self.options.trim_front = trimFront
            if self.options.trim_tail == -1:
                self.options.trim_tail = trimTail
            # auto trim for read2
            if has_read2:
                # check if we should keep same trimming for read1/read2 to
                # keep their length identical; this option is on by default
                # because lots of dedup algorithms require this feature
                if self.options.trim_pair_same:
                    self.options.trim_front2 = self.options.trim_front
                    self.options.trim_tail2 = self.options.trim_tail
                else:
                    trimFront2, trimTail2 = r2qc_prefilter.autoTrim()
                    if self.options.trim_front2 == -1:
                        self.options.trim_front2 = trimFront2
                    if self.options.trim_tail2 == -1:
                        self.options.trim_tail2 = trimTail2

        print(self.options.read1_file + " options:")
        print(self.options)

        # output folders default to the folder of the read1 file
        good_dir = self.options.good_output_folder
        if good_dir is None:
            good_dir = read1_dir

        bad_dir = self.options.bad_output_folder
        if bad_dir is None:
            bad_dir = read1_dir

        overlap_dir = self.options.overlap_output_folder
        if overlap_dir is None:
            overlap_dir = read1_dir

        # dirname() of a bare filename is "", which means the current
        # directory; os.makedirs("") would raise, so skip empty paths
        for folder in (good_dir, bad_dir):
            if folder and not os.path.exists(folder):
                os.makedirs(folder)

        if keep_overlap and overlap_dir and not os.path.exists(overlap_dir):
            os.makedirs(overlap_dir)

        good_read1_file = None
        bad_read1_file = None
        overlap_read1_file = None
        if not self.options.qc_only:
            good_read1_file = fastq.Writer(
                os.path.join(good_dir,
                             getMainName(self.options.read1_file) +
                             ".good.fq"))
            bad_read1_file = fastq.Writer(
                os.path.join(bad_dir,
                             getMainName(self.options.read1_file) + ".bad.fq"))
            # same condition as the read2/index overlap writers below, so the
            # overlap folder is guaranteed to have been created
            if keep_overlap:
                overlap_read1_file = fastq.Writer(
                    os.path.join(
                        overlap_dir,
                        getMainName(self.options.read1_file) + ".overlap.fq"))

        # other files are optional
        read2_file = None
        good_read2_file = None
        bad_read2_file = None
        overlap_read2_file = None

        index1_file = None
        good_index1_file = None
        bad_index1_file = None
        overlap_index1_file = None

        index2_file = None
        good_index2_file = None
        bad_index2_file = None
        overlap_index2_file = None

        # if other files are specified, then read them
        if has_read2:
            read2_file = fastq.Reader(self.options.read2_file)
            if not self.options.qc_only:
                good_read2_file = fastq.Writer(
                    os.path.join(
                        good_dir,
                        getMainName(self.options.read2_file) + ".good.fq"))
                bad_read2_file = fastq.Writer(
                    os.path.join(
                        bad_dir,
                        getMainName(self.options.read2_file) + ".bad.fq"))
                if keep_overlap:
                    overlap_read2_file = fastq.Writer(
                        os.path.join(
                            overlap_dir,
                            getMainName(self.options.read2_file) +
                            ".overlap.fq"))
        if self.options.index1_file is not None:
            index1_file = fastq.Reader(self.options.index1_file)
            if not self.options.qc_only:
                good_index1_file = fastq.Writer(
                    os.path.join(
                        good_dir,
                        getMainName(self.options.index1_file) + ".good.fq"))
                bad_index1_file = fastq.Writer(
                    os.path.join(
                        bad_dir,
                        getMainName(self.options.index1_file) + ".bad.fq"))
                if keep_overlap:
                    overlap_index1_file = fastq.Writer(
                        os.path.join(
                            overlap_dir,
                            getMainName(self.options.index1_file) +
                            ".overlap.fq"))
        if self.options.index2_file is not None:
            index2_file = fastq.Reader(self.options.index2_file)
            if not self.options.qc_only:
                good_index2_file = fastq.Writer(
                    os.path.join(
                        good_dir,
                        getMainName(self.options.index2_file) + ".good.fq"))
                bad_index2_file = fastq.Writer(
                    os.path.join(
                        bad_dir,
                        getMainName(self.options.index2_file) + ".bad.fq"))
                if keep_overlap:
                    overlap_index2_file = fastq.Writer(
                        os.path.join(
                            overlap_dir,
                            getMainName(self.options.index2_file) +
                            ".overlap.fq"))

        r1 = None
        r2 = None
        i1 = None
        i2 = None

        # stat numbers
        TOTAL = 0
        GOOD = 0
        BADBCD1 = 0
        BADBCD2 = 0
        BADTRIM1 = 0
        BADTRIM2 = 0
        BADBBL = 0
        BADLEN = 0
        BADPOL = 0
        BADLQC = 0
        BADNCT = 0
        BADOL = 0
        BADINDEL = 0
        BADMISMATCH = 0
        BASE_CORRECTED = 0
        OVERLAPPED = 0
        OVERLAP_LEN_SUM = 0

        while True:
            # read one record from every input; stop as soon as any of the
            # inputs is exhausted
            r1 = read1_file.nextRead()
            if r1 is None:
                break

            if read2_file is not None:
                r2 = read2_file.nextRead()
                if r2 is None:
                    break
            if index1_file is not None:
                i1 = index1_file.nextRead()
                if i1 is None:
                    break
            if index2_file is not None:
                i2 = index2_file.nextRead()
                if i2 is None:
                    break

            TOTAL += 1

            # barcode processing
            if self.options.barcode:
                barcodeLen1 = barcodeprocesser.detectBarcode(
                    r1[1], self.options.barcode_length,
                    self.options.barcode_verify)
                if barcodeLen1 == 0:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADBCD1")
                    BADBCD1 += 1
                    continue
                if r2 is None:
                    barcodeprocesser.moveBarcodeToName(
                        r1, self.options.barcode_length,
                        self.options.barcode_verify)
                else:
                    barcodeLen2 = barcodeprocesser.detectBarcode(
                        r2[1], self.options.barcode_length,
                        self.options.barcode_verify)
                    if barcodeLen2 == 0:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                        bad_read2_file, bad_index1_file,
                                        bad_index2_file, "BADBCD2")
                        BADBCD2 += 1
                        continue
                    barcodeprocesser.moveAndTrimPair(
                        r1, r2, barcodeLen1, barcodeLen2,
                        self.options.barcode_verify)

            # trim
            # NOTE(review): read2 is only trimmed when read1 trimming is
            # enabled -- confirm this coupling is intended
            if self.options.trim_front > 0 or self.options.trim_tail > 0:
                r1 = trim(r1, self.options.trim_front, self.options.trim_tail)
                if len(r1[1]) < 5:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADTRIM1")
                    BADTRIM1 += 1
                    continue
                if r2 is not None:
                    r2 = trim(r2, self.options.trim_front2,
                              self.options.trim_tail2)
                    if len(r2[1]) < 5:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                        bad_read2_file, bad_index1_file,
                                        bad_index2_file, "BADTRIM2")
                        BADTRIM2 += 1
                        continue

            # filter debubble
            if self.options.debubble:
                if self.isInBubble(r1[0]):
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADBBL")
                    BADBBL += 1
                    continue

            # filter sequence length
            if len(r1[1]) < self.options.seq_len_req:
                self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                                bad_index1_file, bad_index2_file, "BADLEN")
                BADLEN += 1
                continue

            # check polyX
            if self.options.poly_size_limit > 0:
                poly1 = hasPolyX(r1[1], self.options.poly_size_limit,
                                 self.options.allow_mismatch_in_poly)
                poly2 = None
                if r2 is not None:
                    poly2 = hasPolyX(r2[1], self.options.poly_size_limit,
                                     self.options.allow_mismatch_in_poly)
                if poly1 is not None or poly2 is not None:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADPOL")
                    BADPOL += 1
                    continue

            # check low quality count
            if self.options.unqualified_base_limit > 0:
                lowQual1 = lowQualityNum(r1,
                                         self.options.qualified_quality_phred)
                lowQual2 = 0
                if r2 is not None:
                    lowQual2 = lowQualityNum(
                        r2, self.options.qualified_quality_phred)
                # either mate exceeding the limit rejects the whole record
                if (lowQual1 > self.options.unqualified_base_limit
                        or lowQual2 > self.options.unqualified_base_limit):
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADLQC")
                    BADLQC += 1
                    continue

            # check N number
            if self.options.n_base_limit > 0:
                nNum1 = nNumber(r1)
                nNum2 = 0
                if r2 is not None:
                    nNum2 = nNumber(r2)
                if (nNum1 > self.options.n_base_limit
                        or nNum2 > self.options.n_base_limit):
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADNCT")
                    BADNCT += 1
                    continue

            # check overlap and do error correction
            if r2 is not None:
                (offset, overlap_len, distance) = util.overlap(r1[1], r2[1])
                overlap_histgram[overlap_len] += 1
                # deal with the case insert DNA is shorter than read length
                # and cause offset is negative
                if offset < 0 and overlap_len > 30:
                    # shift the junk bases
                    r1[1] = r1[1][0:overlap_len]
                    r1[3] = r1[3][0:overlap_len]
                    r2[1] = r2[1][-offset:-offset + overlap_len]
                    r2[3] = r2[3][-offset:-offset + overlap_len]
                    # then calc overlap again
                    (offset, overlap_len,
                     distance) = util.overlap(r1[1], r2[1])
                if overlap_len > 30:
                    OVERLAPPED += 1
                    distance_histgram[distance] += 1
                    OVERLAP_LEN_SUM += overlap_len
                    corrected = 0
                    if distance > 2:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                        bad_read2_file, bad_index1_file,
                                        bad_index2_file, "BADOL")
                        BADOL += 1
                        continue
                    elif distance > 0:
                        # try to fix low quality base; when the hamming
                        # distance differs from the reported distance the
                        # pair is rejected as BADINDEL
                        hamming = util.hammingDistance(
                            r1[1][len(r1[1]) - overlap_len:],
                            util.reverseComplement(r2[1][len(r2[1]) -
                                                         overlap_len:]))
                        if hamming != distance:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                            bad_read2_file, bad_index1_file,
                                            bad_index2_file, "BADINDEL")
                            BADINDEL += 1
                            continue
                        # walk the overlapped area: position o from the tail
                        # window of r1 pairs with position -o-1 of r2
                        for o in xrange(overlap_len):
                            b1 = r1[1][len(r1[1]) - overlap_len + o]
                            b2 = util.complement(r2[1][-o - 1])
                            q1 = r1[3][len(r1[3]) - overlap_len + o]
                            q2 = r2[3][-o - 1]
                            if b1 != b2:
                                # only correct when one base is high quality
                                # (>= 27) and the other is low quality (<= 16)
                                if util.qualNum(q1) >= 27 and util.qualNum(
                                        q2) <= 16:
                                    r2[1] = util.changeString(
                                        r2[1], -o - 1, util.complement(b1))
                                    r2[3] = util.changeString(
                                        r2[3], -o - 1, q1)
                                    corrected += 1
                                elif util.qualNum(q2) >= 27 and util.qualNum(
                                        q1) <= 16:
                                    r1[1] = util.changeString(
                                        r1[1],
                                        len(r1[1]) - overlap_len + o, b2)
                                    r1[3] = util.changeString(
                                        r1[3],
                                        len(r1[3]) - overlap_len + o, q2)
                                    corrected += 1
                                if corrected >= distance:
                                    break
                        if corrected == distance:
                            # counts reads whose mismatches were all corrected
                            BASE_CORRECTED += 1
                        else:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                            bad_read2_file, bad_index1_file,
                                            bad_index2_file, "BADMISMATCH")
                            BADMISMATCH += 1
                            continue
                    if distance == 0 or distance == corrected:
                        if self.options.store_overlap:
                            self.writeReads(getOverlap(r1, overlap_len),
                                            getOverlap(r2, overlap_len), i1,
                                            i2, overlap_read1_file,
                                            overlap_read2_file,
                                            overlap_index1_file,
                                            overlap_index2_file, None)

            # write to good
            self.writeReads(r1, r2, i1, i2, good_read1_file, good_read2_file,
                            good_index1_file, good_index2_file, None)
            # qc_sample <= 0 means "no sampling limit"
            if self.options.qc_sample <= 0 or TOTAL < self.options.qc_sample:
                r1qc_postfilter.statRead(r1)
                if r2 is not None:
                    r2qc_postfilter.statRead(r2)

            GOOD += 1
            # in qc_only mode stop once the sample is complete; a
            # non-positive qc_sample means unlimited, so never stop early then
            if self.options.qc_only and 0 < self.options.qc_sample <= TOTAL:
                break

        r1qc_postfilter.qc()
        r1qc_postfilter.plot(qc_dir, "R1-postfilter")
        if has_read2:
            r2qc_postfilter.qc()
            r2qc_postfilter.plot(qc_dir, "R2-postfilter")

        # flush every writer that was actually created (including the
        # overlap writers); all of them are None in qc_only mode
        for writer in (good_read1_file, bad_read1_file,
                       good_read2_file, bad_read2_file,
                       good_index1_file, bad_index1_file,
                       good_index2_file, bad_index2_file,
                       overlap_read1_file, overlap_read2_file,
                       overlap_index1_file, overlap_index2_file):
            if writer is not None:
                writer.flush()

        # summary numbers
        BAD = TOTAL - GOOD
        result = {}
        result['total_reads'] = TOTAL
        result['good_reads'] = GOOD
        result['bad_reads'] = BAD
        result['bad_reads_with_bad_barcode'] = BADBCD1 + BADBCD2
        result['bad_reads_with_reads_in_bubble'] = BADBBL
        result['bad_reads_with_bad_read_length'] = BADLEN + BADTRIM1 + BADTRIM2
        result['bad_reads_with_polyX'] = BADPOL
        result['bad_reads_with_low_quality'] = BADLQC
        result['bad_reads_with_too_many_N'] = BADNCT
        result['bad_reads_with_bad_overlap'] = BADOL + BADMISMATCH + BADINDEL

        # plot result pie figure
        labels = [
            'good reads', 'has_polyX', 'low_quality', 'too_short', 'too_many_N'
        ]
        counts = [GOOD, BADPOL, BADLQC, BADLEN + BADTRIM1 + BADTRIM2, BADNCT]
        colors = ['#66BB11', '#FF33AF', '#FFD3F2', '#FFA322', '#FF8899']
        if has_read2:
            labels.append('bad_overlap')
            counts.append(BADOL + BADMISMATCH + BADINDEL)
            colors.append('#FF6600')
        if self.options.debubble:
            labels.append('in_bubble')
            counts.append(BADBBL)
            colors.append('#EEBB00')
        if self.options.barcode:
            labels.append('bad_barcode')
            counts.append(BADBCD1 + BADBCD2)
            colors.append('#CCDD22')

        for i, count in enumerate(counts):
            # guard against empty input: no reads means 0% for every category
            type_percent = 0.0
            if TOTAL > 0:
                type_percent = 100.0 * float(count) / TOTAL
            labels[i] = labels[i] + ": " + str(count) + "(" + str(
                type_percent) + "%)"

        plt.figure(1)
        plt.title("Filtering statistics of sampled " + str(TOTAL) + " reads",
                  fontsize=12,
                  color='#666666')
        plt.axis('equal')
        patches, texts = plt.pie(counts, colors=colors, radius=0.7)
        # order the legend entries by descending count
        patches, labels, dummy = zip(*sorted(
            zip(patches, labels, counts), key=lambda x: x[2], reverse=True))
        plt.legend(patches, labels, loc='upper left', fontsize=9)
        plt.savefig(os.path.join(qc_dir, "filter-stat.png"),
                    bbox_inches='tight')
        plt.close(1)

        stat = {}
        stat["summary"] = result
        stat["command"] = makeDict(self.options)
        stat["kmer_content"] = {}
        stat["kmer_content"]["read1_prefilter"] = r1qc_prefilter.topKmerCount[
            0:10]
        stat["kmer_content"][
            "read1_postfilter"] = r1qc_postfilter.topKmerCount[0:10]
        if has_read2:
            stat["kmer_content"][
                "read2_prefilter"] = r2qc_prefilter.topKmerCount[0:10]
            stat["kmer_content"][
                "read2_postfilter"] = r2qc_postfilter.topKmerCount[0:10]
            stat["overlap"] = {}
            stat["overlap"]['overlapped_pairs'] = OVERLAPPED
            if OVERLAPPED > 0:
                # convert before dividing so the average is not truncated by
                # integer division
                stat["overlap"]['average_overlap_length'] = float(
                    OVERLAP_LEN_SUM) / OVERLAPPED
            else:
                stat["overlap"]['average_overlap_length'] = 0.0
            stat["overlap"]['bad_edit_distance'] = BADOL
            stat["overlap"]['bad_mismatch_bases'] = BADMISMATCH
            stat["overlap"]['bad_indel'] = BADINDEL
            stat["overlap"][
                'reads_with_corrected_mismatch_bases'] = BASE_CORRECTED
            stat["overlap"][
                'overlapped_area_edit_distance_histogram'] = distance_histgram[
                    0:10]
            plotOverlapHistgram(overlap_histgram, readLen, TOTAL,
                                os.path.join(qc_dir, "overlap.png"))

        stat_json = json.dumps(stat,
                               sort_keys=True,
                               indent=4,
                               separators=(',', ': '))
        # context manager closes the file even if the write fails
        with open(os.path.join(qc_dir, "after.json"), "w") as stat_file:
            stat_file.write(stat_json)

        self.addFiguresToReport(reporter)
        reporter.output(os.path.join(qc_dir, "report.html"))
Esempio n. 9
0
    def run(self):
        """Split the input FASTQ records into ``.good.fq`` and ``.bad.fq`` files.

        Records are pulled in lockstep from ``options.read1_file`` and, when
        configured, ``read2_file`` / ``index1_file`` / ``index2_file``; the loop
        stops as soon as any of the open inputs is exhausted. Each record set
        goes through the enabled steps in this order: barcode detection,
        trimming, bubble filter, minimum sequence length, polyX, minimum
        quality, low-quality base count and N count. The first failing check
        hands the whole set to ``writeReads`` with the bad writers and a reason
        tag (``BADBCD``, ``BADBBL``, ``BADLEN``, ``BADPOL``, ``BADMIN``,
        ``BADLQC``, ``BADNCT``); sets passing every check are written with the
        good writers and no tag.

        Side effects: may replace ``-1`` values of ``options.trim_front`` /
        ``trim_tail`` (and the ``*2`` variants) with auto-detected lengths,
        prints the options, and creates the output folders when missing.
        """
        if self.options.debubble:
            self.loadBubbleCircles()

        #read1_file is required
        read1_file = fastq.Reader(self.options.read1_file)

        #auto detect trim front and trim tail
        if self.options.trim_front == -1 or self.options.trim_tail == -1:
            tm = Trimmer()
            #auto trim for read1
            trimFront, trimTail = tm.calcTrimLength(self.options.read1_file)
            if self.options.trim_front == -1:
                self.options.trim_front = trimFront
            if self.options.trim_tail == -1:
                self.options.trim_tail = trimTail
            #auto trim for read2
            # NOTE(review): read2 auto-detection only runs when read1 asked for
            # auto-detection as well -- confirm this coupling is intended.
            if self.options.read2_file is not None:
                trimFront2, trimTail2 = tm.calcTrimLength(self.options.read2_file)
                if self.options.trim_front2 == -1:
                    self.options.trim_front2 = trimFront2
                if self.options.trim_tail2 == -1:
                    self.options.trim_tail2 = trimTail2

        print(self.options)

        #if good output folder not specified, set it as the same folder of read1 file
        good_dir = self.options.good_output_folder
        if good_dir is None:
            good_dir = os.path.dirname(self.options.read1_file)

        #if bad output folder not specified, set it as the same folder of read1 file
        bad_dir = self.options.bad_output_folder
        if bad_dir is None:
            bad_dir = os.path.dirname(self.options.read1_file)

        if not os.path.exists(good_dir):
            os.makedirs(good_dir)

        if not os.path.exists(bad_dir):
            os.makedirs(bad_dir)

        good_read1_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.read1_file)+".good.fq"))
        bad_read1_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.read1_file)+".bad.fq"))

        #other files are optional
        read2_file = None
        good_read2_file = None
        bad_read2_file = None
        index1_file = None
        good_index1_file = None
        bad_index1_file = None
        index2_file = None
        good_index2_file = None
        bad_index2_file = None

        #if other files are specified, then read them
        if self.options.read2_file is not None:
            read2_file = fastq.Reader(self.options.read2_file)
            good_read2_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.read2_file)+".good.fq"))
            bad_read2_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.read2_file)+".bad.fq"))
        if self.options.index1_file is not None:
            index1_file = fastq.Reader(self.options.index1_file)
            good_index1_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.index1_file)+".good.fq"))
            bad_index1_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.index1_file)+".bad.fq"))
        if self.options.index2_file is not None:
            index2_file = fastq.Reader(self.options.index2_file)
            good_index2_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.index2_file)+".good.fq"))
            bad_index2_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.index2_file)+".bad.fq"))

        r1 = None
        r2 = None
        i1 = None
        i2 = None

        while True:
            r1 = read1_file.nextRead()
            if r1 is None:
                break

            if read2_file is not None:
                r2 = read2_file.nextRead()
                if r2 is None:
                    break
            if index1_file is not None:
                i1 = index1_file.nextRead()
                if i1 is None:
                    break
            if index2_file is not None:
                i2 = index2_file.nextRead()
                if i2 is None:
                    break

            #barcode processing
            if self.options.barcode:
                barcodeLen = barcodeprocesser.detectBarcode(r1[1], self.options.barcode_length, self.options.barcode_verify)
                if barcodeLen == 0:
                    writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADBCD")
                    continue
                else:
                    if r2 is None:
                        barcodeprocesser.moveBarcodeToName(r1, self.options.barcode_length, self.options.barcode_verify)
                    else:
                        barcodeLen2 = barcodeprocesser.detectBarcode(r2[1], self.options.barcode_length, self.options.barcode_verify)
                        # the pair is bad when read2 has no barcode; the read1
                        # result was already checked above
                        if barcodeLen2 == 0:
                            writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADBCD")
                            continue
                        else:
                            barcodeprocesser.moveBarcodeToName(r1, barcodeLen, self.options.barcode_verify)
                            barcodeprocesser.moveBarcodeToName(r2, barcodeLen2, self.options.barcode_verify)

            #trim
            # NOTE(review): read2 is only trimmed when a read1 trim length is
            # positive -- confirm read2-only trimming is not expected.
            if self.options.trim_front > 0 or self.options.trim_tail > 0:
                r1 = trim(r1, self.options.trim_front, self.options.trim_tail)
                if r2 is not None:
                    r2 = trim(r2, self.options.trim_front2, self.options.trim_tail2)

            #filter debubble
            if self.options.debubble:
                if self.isInBubble(r1[0]):
                    writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADBBL")
                    continue

            #filter sequence length
            if len(r1[1])<self.options.min_seq_len:
                writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADLEN")
                continue

            #check polyX
            if self.options.poly_max > 0:
                poly1 = hasPolyX(r1[1], self.options.poly_max, self.options.allow_poly_mismatch)
                poly2 = None
                if r2 is not None:
                    poly2 = hasPolyX(r2[1], self.options.poly_max, self.options.allow_poly_mismatch)
                if poly1 is not None or poly2 is not None:
                    writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADPOL")
                    continue

            #check min quality
            if self.options.min_quality > 0:
                minQual1 = minQuality(r1)
                # 255 acts as "no constraint" when there is no read2
                minQual2 = 255
                if r2 is not None:
                    minQual2 = minQuality(r2)
                if minQual1 < self.options.min_quality or minQual2 < self.options.min_quality:
                    writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADMIN")
                    continue

            #check low quality count
            if self.options.max_low_quality > 0:
                lowQual1 = lowQualityNum(r1, self.options.qualified_quality)
                lowQual2 = 0
                if r2 is not None:
                    lowQual2 = lowQualityNum(r2, self.options.qualified_quality)
                # either mate exceeding the limit rejects the pair
                if lowQual1 > self.options.max_low_quality or lowQual2 > self.options.max_low_quality:
                    writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADLQC")
                    continue

            #check N number
            if self.options.max_n_count > 0:
                nNum1 = nNumber(r1)
                nNum2 = 0
                if r2 is not None:
                    nNum2 = nNumber(r2)
                if nNum1 > self.options.max_n_count or nNum2 > self.options.max_n_count:
                    writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADNCT")
                    continue

            #write to good
            writeReads(r1, r2, i1, i2, good_read1_file, good_read2_file, good_index1_file, good_index2_file, None)

        #flush all output writers
        good_read1_file.flush()
        bad_read1_file.flush()
        if self.options.read2_file is not None:
            good_read2_file.flush()
            bad_read2_file.flush()
        if self.options.index1_file is not None:
            good_index1_file.flush()
            bad_index1_file.flush()
        if self.options.index2_file is not None:
            good_index2_file.flush()
            bad_index2_file.flush()
Esempio n. 10
0
    def calcTrimLength(self, filename):
        """Estimate how many bases to trim from the front and tail of reads.

        Samples up to 5,000,000 reads from ``filename`` and computes the
        per-position fraction of A/T/C/G. A "good" segment is seeded with the
        10 positions around the middle of the read and grown one position at a
        time, alternating left and right, while the candidate position's base
        composition stays within ``threshold`` (summed absolute difference) of
        the segment mean. The head and tail outside that segment are then
        scanned for the first position whose 3-base window is also within
        the threshold.

        Only the first 1000 bases of each read are considered. When no bases
        were sampled, or the reads are too short (< 11 bases) to hold the
        initial segment, no trimming is suggested.

        Args:
            filename: path handed to ``fastq.Reader``.

        Returns:
            ``(trimFront, trimTail)`` as ints, capped at 20% and 10% of the
            detected read length respectively.
        """
        maxLen = 1000
        allbases = ("A", "T", "C", "G")
        counts = {}
        percents = {}
        for base in allbases:
            counts[base] = [0] * maxLen
            percents[base] = [0.0] * maxLen

        reader = fastq.Reader(filename)
        #sample up to maxSample reads for stat
        maxSample = 5000000
        sampled = 0
        while True:
            read = reader.nextRead()
            sampled += 1
            if read is None or sampled > maxSample:
                break
            # bases beyond maxLen are ignored rather than overflowing the tables
            for i, b in enumerate(read[1][:maxLen]):
                if b in allbases:
                    counts[b][i] += 1

        #get the length of read: first position where no base was ever seen,
        #or maxLen when every tracked position has data
        readLen = maxLen
        for pos in range(maxLen):
            if not any(counts[base][pos] > 0 for base in allbases):
                readLen = pos
                break

        #calc percents of each base
        for pos in range(readLen):
            total = sum(counts[base][pos] for base in allbases)
            for base in allbases:
                percents[base][pos] = float(counts[base][pos]) / float(total)

        #use (center-5, center+5) as initial good segment
        center = int(readLen / 2)
        left = center - 5
        right = center + 5

        # Empty input or reads too short for the seed segment: there is no
        # stable region to compare against, so suggest no trimming instead of
        # indexing outside the read.
        if left < 0 or right > readLen - 1:
            return (0, 0)

        threshold = 0.05
        window = 3
        lastStepIsLeft = False
        leftFinished = False
        rightFinished = False
        current = -1
        meanPercents = {}

        def biasAt(pos):
            # summed absolute deviation of this position from the segment mean
            bias = 0.0
            for base in allbases:
                bias += abs(meanPercents[base] - percents[base][pos])
            return bias

        #expand the good segment
        while not (leftFinished and rightFinished):
            for base in allbases:
                meanPercents[base] = sum(percents[base][left:right]) / (right - left)

            # alternate sides until one of them is finished
            goRight = leftFinished or (not rightFinished and lastStepIsLeft)
            lastStepIsLeft = not goRight
            if goRight:
                current = right + 1
            else:
                current = left - 1

            # stepping outside the read ends that side, same as a biased position
            outOfRead = current < 0 or current >= readLen
            if outOfRead or biasAt(current) > threshold:
                if lastStepIsLeft:
                    leftFinished = True
                else:
                    rightFinished = True
            elif lastStepIsLeft:
                left = current
                if left == 0:
                    leftFinished = True
            else:
                right = current
                if right == readLen - 1:
                    rightFinished = True

        #find the bad segment from front, considering a small window
        #if any in the window is bad, it is bad
        trimFront = left
        for pos in range(0, left):
            inWindow = range(pos, min(pos + window, readLen))
            if all(biasAt(p) <= threshold for p in inWindow):
                trimFront = pos
                break

        #find the bad segment from tail, considering a small window
        #if any in the window is bad, it is bad
        trimTail = right
        for pos in range(readLen - 1, right, -1):
            inWindow = range(pos, max(pos - window, 0), -1)
            if all(biasAt(p) <= threshold for p in inWindow):
                trimTail = pos
                break

        # never suggest trimming more than 20% (front) / 10% (tail) of the read
        trimFront = min(readLen * 0.2, trimFront)
        trimTail = min(readLen * 0.1, readLen - 1 - trimTail)

        return (int(trimFront), int(trimTail))