Python hammingDistance Examples

Programming Language: Python

Namespace/Package Name: util

Method/Function: hammingDistance

Examples at hotexamples.com: 5

Python hammingDistance - 5 examples found. These are the top-rated real-world Python examples of util.hammingDistance extracted from open source projects. You can rate examples to help us improve their quality.

Example #1

Show file

File: averageHamming.py Project: jacobDrigdon/PUFProject

def testHamming(strategy="Strat1"):
    """Print average Hamming distances for the binary files of a strategy.

    Walks ``GENERATED_OUTPUT_DIR + strategy + "/"``, converts every file found
    into a byte array with ``convBinFileToByteArr`` and prints two reports:

    1. For every pair of files (a file is also paired with itself), the
       average ``hammingDistance`` over their aligned 16-byte chunks.
    2. For every file, the average ``hammingDistance`` over all distinct
       pairs of its own 16-byte chunks.

    Only complete 16-byte chunks take part in either report.  When there is
    nothing to average (fewer than 16 bytes shared by a pair, or fewer than
    two chunks in a file) the reported average is 0 instead of raising
    ``ZeroDivisionError``.

    Args:
        strategy: name of the sub-directory of ``GENERATED_OUTPUT_DIR`` to scan.

    Returns:
        None; all results are written to standard output.
    """
    # byte length of 16 creates 128-bit sequences
    chunkSize = 16

    print("-------------------------------------------------------------------")
    print("Testing Hamming distances for strategy: %s" % strategy)
    byteArrList = []
    fileCount = 0
    print("##### Binary Files #####")
    for root, dirs, files in os.walk(GENERATED_OUTPUT_DIR + strategy + "/"):
        # sort so the printed file indices are stable within a directory
        for filename in sorted(files):
            path = join(root, filename)
            byteArrList.append(convBinFileToByteArr(path))
            print("Binary File %i = %s" % (fileCount, path))
            fileCount += 1

    # Calculate average hamming distance between runs of the same
    # routes but different devices/users
    print("\n##### Average hamming distance between runs of same routes, different devices/users #####")
    for i in range(len(byteArrList)):
        for j in range(i, len(byteArrList)):
            first = byteArrList[i]
            second = byteArrList[j]
            # number of complete chunks available in BOTH arrays
            chunkCount = min(len(first), len(second)) // chunkSize
            distSum = 0
            for k in range(0, chunkCount * chunkSize, chunkSize):
                distSum += hammingDistance(first[k:k + chunkSize],
                                           second[k:k + chunkSize])
            # no complete chunk in common -> nothing to average
            avgHamDist = distSum // chunkCount if chunkCount else 0
            print("%i -> %i = %i" % (i, j, avgHamDist))

    # Calculate average hamming distance between runs of same device/users
    # but different routes
    print("\n##### Average hamming distance between runs of same devices/user, different routes #####")
    for count, byteArr in enumerate(byteArrList):
        # Split into complete chunks first: the chunk index must be scaled by
        # the chunk size to obtain a byte offset.
        chunkCount = len(byteArr) // chunkSize
        chunks = [byteArr[k * chunkSize:(k + 1) * chunkSize]
                  for k in range(chunkCount)]
        distSum = 0
        comparisons = 0
        for i in range(chunkCount):
            for j in range(i + 1, chunkCount):
                distSum += hammingDistance(chunks[i], chunks[j])
                comparisons += 1
        # fewer than two chunks -> no pair to compare
        avgHamDist = distSum // comparisons if comparisons else 0
        print("%i = %i" % (count, avgHamDist))
    print("\n")

Example #2

Show file

File: consistencyHammingTest.py Project: jacobDrigdon/PUFProject

def testConsistencyHamming(strategy="ConsistencyStrat1"):
    """Print pairwise Hamming distances for the binary files of a strategy.

    Every file below ``GENERATED_OUTPUT_DIR + strategy + "/"`` is converted
    with ``convBinFileToByteArr``.  For each pair of files (a file is also
    paired with itself) whose shorter member holds more than 3 bytes, the
    ``hammingDistance`` of their first 3 bytes is printed.

    Args:
        strategy: name of the sub-directory of ``GENERATED_OUTPUT_DIR`` to scan.

    Returns:
        None; all results are written to standard output.
    """
    print("-------------------------------------------------------------------")
    print("Testing Hamming distances for strategy: %s" % strategy)
    print("##### Binary Files #####")

    loaded = []
    for root, dirs, files in os.walk(GENERATED_OUTPUT_DIR + strategy + "/"):
        files.sort()
        for filename in files:
            path = join(root, filename)
            loaded.append(convBinFileToByteArr(path))
            # index of the entry that was just appended
            print("Binary File %i = %s" % (len(loaded) - 1, path))

    # Hamming distance between runs of the same routes, same user, same device
    print("\n##### Average hamming distance between runs of same routes, same devices, same users #####")
    total = len(loaded)
    for i in range(total):
        for j in range(i, total):
            shortLen = min(len(loaded[i]), len(loaded[j]))
            # pairs whose shorter array has 3 bytes or fewer are not reported
            if shortLen <= 3:
                continue
            # only the leading 3 bytes of each array are compared
            hamDist = hammingDistance(loaded[i][0:3], loaded[j][0:3])
            print("%i -> %i = %i" % (i, j, hamDist))

Example #3

Show file

    def run(self):
        """Filter the configured FASTQ input(s) and write the QC results.

        Driven entirely by ``self.options``.  Steps, in order:

        1. Collect pre-filter statistics for read1 (and read2 when given) and
           plot them into ``<dir of read1>/QC/<basename of read1>/``.
        2. Resolve automatic trimming (``-1`` values) from those statistics.
        3. Unless ``qc_only`` is set, open ``.good.fq`` / ``.bad.fq`` (and
           optionally ``.overlap.fq``) writers for every input file.
        4. Read the inputs in lock step and run each read (pair) through the
           barcode, trim, debubble, length, polyX, low-quality, N-count and
           overlap filters.  A read failing a filter is written to the bad
           output with a flag naming that filter; the rest go to the good
           output.
        5. Write post-filter plots, ``filter-stat.png``, ``after.json`` and
           ``report.html`` into the QC directory.

        Returns:
            None.
        """
        if self.options.debubble:
            self.loadBubbleCircles()

        #read1_file is required
        read1_file = fastq.Reader(self.options.read1_file)
        #create a QC folder to contains QC results
        qc_base_folder = os.path.join(os.path.dirname(self.options.read1_file),
                                      "QC")
        if not os.path.exists(qc_base_folder):
            os.makedirs(qc_base_folder)
        #QC result of this file/pair
        qc_dir = os.path.join(qc_base_folder,
                              os.path.basename(self.options.read1_file))
        if not os.path.exists(qc_dir):
            os.makedirs(qc_dir)

        #no front trim if sequence is barcoded
        if self.options.barcode:
            self.options.trim_front = 0

        reporter = QCReporter()

        r1qc_prefilter = QualityControl(self.options.qc_sample,
                                        self.options.qc_kmer)
        r2qc_prefilter = QualityControl(self.options.qc_sample,
                                        self.options.qc_kmer)
        r1qc_prefilter.statFile(self.options.read1_file)
        r1qc_prefilter.plot(qc_dir, "R1-prefilter")
        if self.options.read2_file is not None:
            r2qc_prefilter.statFile(self.options.read2_file)
            r2qc_prefilter.plot(qc_dir, "R2-prefilter")

        r1qc_postfilter = QualityControl(self.options.qc_sample,
                                         self.options.qc_kmer)
        r2qc_postfilter = QualityControl(self.options.qc_sample,
                                         self.options.qc_kmer)

        readLen = r1qc_prefilter.readLen
        # histograms indexed by overlap length / distance, 0..readLen inclusive
        overlap_histgram = [0] * (readLen + 1)
        distance_histgram = [0] * (readLen + 1)

        #auto detect trim front and trim tail
        if self.options.trim_front == -1 or self.options.trim_tail == -1:
            #auto trim for read1
            trimFront, trimTail = r1qc_prefilter.autoTrim()
            if self.options.trim_front == -1:
                self.options.trim_front = trimFront
            if self.options.trim_tail == -1:
                self.options.trim_tail = trimTail
            #auto trim for read2
            if self.options.read2_file is not None:
                # check if we should keep same trimming for read1/read2 to keep their length identical
                # this option is on by default because lots of dedup algorithms require this feature
                if self.options.trim_pair_same:
                    self.options.trim_front2 = self.options.trim_front
                    self.options.trim_tail2 = self.options.trim_tail
                else:
                    trimFront2, trimTail2 = r2qc_prefilter.autoTrim()
                    if self.options.trim_front2 == -1:
                        self.options.trim_front2 = trimFront2
                    if self.options.trim_tail2 == -1:
                        self.options.trim_tail2 = trimTail2

        print(self.options.read1_file + " options:")
        print(self.options)

        #if good output folder not specified, set it as the same folder of read1 file
        good_dir = self.options.good_output_folder
        if good_dir is None:
            good_dir = os.path.dirname(self.options.read1_file)

        #if bad output folder not specified, set it as the same folder of read1 file
        bad_dir = self.options.bad_output_folder
        if bad_dir is None:
            bad_dir = os.path.dirname(self.options.read1_file)

        #if overlap output folder not specified, set it as the same folder of read1 file
        overlap_dir = self.options.overlap_output_folder
        if overlap_dir is None:
            overlap_dir = os.path.dirname(self.options.read1_file)

        if not os.path.exists(good_dir):
            os.makedirs(good_dir)

        if not os.path.exists(bad_dir):
            os.makedirs(bad_dir)

        if self.options.store_overlap and self.options.read2_file is not None and (
                not os.path.exists(overlap_dir)):
            os.makedirs(overlap_dir)

        good_read1_file = None
        bad_read1_file = None
        overlap_read1_file = None
        if not self.options.qc_only:
            good_read1_file = fastq.Writer(
                os.path.join(good_dir,
                             getMainName(self.options.read1_file) +
                             ".good.fq"))
            bad_read1_file = fastq.Writer(
                os.path.join(bad_dir,
                             getMainName(self.options.read1_file) + ".bad.fq"))

            if self.options.store_overlap:
                overlap_read1_file = fastq.Writer(
                    os.path.join(
                        overlap_dir,
                        getMainName(self.options.read1_file) + ".overlap.fq"))

        #other files are optional
        read2_file = None
        good_read2_file = None
        bad_read2_file = None
        overlap_read2_file = None

        index1_file = None
        good_index1_file = None
        bad_index1_file = None
        overlap_index1_file = None

        index2_file = None
        good_index2_file = None
        bad_index2_file = None
        overlap_index2_file = None

        #if other files are specified, then read them
        if self.options.read2_file is not None:
            read2_file = fastq.Reader(self.options.read2_file)
            if not self.options.qc_only:
                good_read2_file = fastq.Writer(
                    os.path.join(
                        good_dir,
                        getMainName(self.options.read2_file) + ".good.fq"))
                bad_read2_file = fastq.Writer(
                    os.path.join(
                        bad_dir,
                        getMainName(self.options.read2_file) + ".bad.fq"))
                if self.options.store_overlap:
                    overlap_read2_file = fastq.Writer(
                        os.path.join(
                            overlap_dir,
                            getMainName(self.options.read2_file) +
                            ".overlap.fq"))
        if self.options.index1_file is not None:
            index1_file = fastq.Reader(self.options.index1_file)
            if not self.options.qc_only:
                good_index1_file = fastq.Writer(
                    os.path.join(
                        good_dir,
                        getMainName(self.options.index1_file) + ".good.fq"))
                bad_index1_file = fastq.Writer(
                    os.path.join(
                        bad_dir,
                        getMainName(self.options.index1_file) + ".bad.fq"))
                if self.options.store_overlap and self.options.read2_file is not None:
                    overlap_index1_file = fastq.Writer(
                        os.path.join(
                            overlap_dir,
                            getMainName(self.options.index1_file) +
                            ".overlap.fq"))
        if self.options.index2_file is not None:
            index2_file = fastq.Reader(self.options.index2_file)
            if not self.options.qc_only:
                good_index2_file = fastq.Writer(
                    os.path.join(
                        good_dir,
                        getMainName(self.options.index2_file) + ".good.fq"))
                bad_index2_file = fastq.Writer(
                    os.path.join(
                        bad_dir,
                        getMainName(self.options.index2_file) + ".bad.fq"))
                if self.options.store_overlap and self.options.read2_file is not None:
                    overlap_index2_file = fastq.Writer(
                        os.path.join(
                            overlap_dir,
                            getMainName(self.options.index2_file) +
                            ".overlap.fq"))

        # current records; r2/i1/i2 stay None when their file is not given
        r1 = None
        r2 = None
        i1 = None
        i2 = None

        # stat numbers
        TOTAL = 0
        GOOD = 0
        BAD = 0
        BADBCD1 = 0
        BADBCD2 = 0
        BADTRIM1 = 0
        BADTRIM2 = 0
        BADBBL = 0
        BADLEN = 0
        BADPOL = 0
        BADLQC = 0
        BADNCT = 0
        BADOL = 0
        BADINDEL = 0
        BADMISMATCH = 0
        BASE_CORRECTED = 0
        OVERLAPPED = 0
        OVERLAP_LEN_SUM = 0

        # read all inputs in lock step; stop as soon as any of them runs out
        while True:
            r1 = read1_file.nextRead()
            if r1 is None:
                break

            if read2_file is not None:
                r2 = read2_file.nextRead()
                if r2 is None:
                    break
            if index1_file is not None:
                i1 = index1_file.nextRead()
                if i1 is None:
                    break
            if index2_file is not None:
                i2 = index2_file.nextRead()
                if i2 is None:
                    break

            TOTAL += 1

            #barcode processing
            if self.options.barcode:
                barcodeLen1 = barcodeprocesser.detectBarcode(
                    r1[1], self.options.barcode_length,
                    self.options.barcode_verify)
                if barcodeLen1 == 0:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADBCD1")
                    BADBCD1 += 1
                    continue
                else:
                    if r2 is None:
                        barcodeprocesser.moveBarcodeToName(
                            r1, self.options.barcode_length,
                            self.options.barcode_verify)
                    else:
                        barcodeLen2 = barcodeprocesser.detectBarcode(
                            r2[1], self.options.barcode_length,
                            self.options.barcode_verify)
                        if barcodeLen2 == 0:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                            bad_read2_file, bad_index1_file,
                                            bad_index2_file, "BADBCD2")
                            BADBCD2 += 1
                            continue
                        else:
                            barcodeprocesser.moveAndTrimPair(
                                r1, r2, barcodeLen1, barcodeLen2,
                                self.options.barcode_verify)

            #trim; a read left with fewer than 5 bases is rejected
            if self.options.trim_front > 0 or self.options.trim_tail > 0:
                r1 = trim(r1, self.options.trim_front, self.options.trim_tail)
                if len(r1[1]) < 5:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADTRIM1")
                    BADTRIM1 += 1
                    continue
                if r2 is not None:
                    r2 = trim(r2, self.options.trim_front2,
                              self.options.trim_tail2)
                    if len(r2[1]) < 5:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                        bad_read2_file, bad_index1_file,
                                        bad_index2_file, "BADTRIM2")
                        BADTRIM2 += 1
                        continue

            #filter debubble
            if self.options.debubble:
                if self.isInBubble(r1[0]):
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADBBL")
                    BADBBL += 1
                    continue

            #filter sequence length
            if len(r1[1]) < self.options.seq_len_req:
                self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                                bad_index1_file, bad_index2_file, "BADLEN")
                BADLEN += 1
                continue

            #check polyX
            if self.options.poly_size_limit > 0:
                poly1 = hasPolyX(r1[1], self.options.poly_size_limit,
                                 self.options.allow_mismatch_in_poly)
                poly2 = None
                if r2 is not None:
                    poly2 = hasPolyX(r2[1], self.options.poly_size_limit,
                                     self.options.allow_mismatch_in_poly)
                if poly1 is not None or poly2 is not None:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADPOL")
                    BADPOL += 1
                    continue

            #check low quality count
            if self.options.unqualified_base_limit > 0:
                lowQual1 = lowQualityNum(r1,
                                         self.options.qualified_quality_phred)
                lowQual2 = 0
                if r2 is not None:
                    lowQual2 = lowQualityNum(
                        r2, self.options.qualified_quality_phred)
                # either mate exceeding the limit rejects the pair
                if (lowQual1 > self.options.unqualified_base_limit
                        or lowQual2 > self.options.unqualified_base_limit):
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADLQC")
                    BADLQC += 1
                    continue

            #check N number
            if self.options.n_base_limit > 0:
                nNum1 = nNumber(r1)
                nNum2 = 0
                if r2 is not None:
                    nNum2 = nNumber(r2)
                if nNum1 > self.options.n_base_limit or nNum2 > self.options.n_base_limit:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADNCT")
                    BADNCT += 1
                    continue

            #check overlap and do error correction
            if r2 is not None:
                (offset, overlap_len, distance) = util.overlap(r1[1], r2[1])
                overlap_histgram[overlap_len] += 1
                # deal with the case insert DNA is shorter than read length and cause offset is negative
                if offset < 0 and overlap_len > 30:
                    # shift the junk bases
                    r1[1] = r1[1][0:overlap_len]
                    r1[3] = r1[3][0:overlap_len]
                    r2[1] = r2[1][-offset:-offset + overlap_len]
                    r2[3] = r2[3][-offset:-offset + overlap_len]
                    # then calc overlap again
                    (offset, overlap_len,
                     distance) = util.overlap(r1[1], r2[1])
                if overlap_len > 30:
                    OVERLAPPED += 1
                    distance_histgram[distance] += 1
                    OVERLAP_LEN_SUM += overlap_len
                    corrected = 0
                    if distance > 2:
                        # too many differences in the overlapped area
                        self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                        bad_read2_file, bad_index1_file,
                                        bad_index2_file, "BADOL")
                        BADOL += 1
                        continue
                    elif distance > 0:
                        #try to fix low quality base
                        hamming = util.hammingDistance(
                            r1[1][len(r1[1]) - overlap_len:],
                            util.reverseComplement(r2[1][len(r2[1]) -
                                                         overlap_len:]))
                        # a Hamming distance differing from the reported
                        # distance is treated as an indel and rejected
                        if hamming != distance:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                            bad_read2_file, bad_index1_file,
                                            bad_index2_file, "BADINDEL")
                            BADINDEL += 1
                            continue
                        # walk the overlap: r1 forwards from the overlap
                        # start, r2 backwards from its last base
                        for o in range(overlap_len):
                            b1 = r1[1][len(r1[1]) - overlap_len + o]
                            b2 = util.complement(r2[1][-o - 1])
                            q1 = r1[3][len(r1[3]) - overlap_len + o]
                            q2 = r2[3][-o - 1]
                            if b1 != b2:
                                # overwrite the base whose quality is <= 16
                                # with its mate's base when that one is >= 27
                                if util.qualNum(q1) >= 27 and util.qualNum(
                                        q2) <= 16:
                                    r2[1] = util.changeString(
                                        r2[1], -o - 1, util.complement(b1))
                                    r2[3] = util.changeString(
                                        r2[3], -o - 1, q1)
                                    corrected += 1
                                elif util.qualNum(q2) >= 27 and util.qualNum(
                                        q1) <= 16:
                                    r1[1] = util.changeString(
                                        r1[1],
                                        len(r1[1]) - overlap_len + o, b2)
                                    r1[3] = util.changeString(
                                        r1[3],
                                        len(r1[3]) - overlap_len + o, q2)
                                    corrected += 1
                                if corrected >= distance:
                                    break
                        if corrected == distance:
                            BASE_CORRECTED += 1
                        else:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                            bad_read2_file, bad_index1_file,
                                            bad_index2_file, "BADMISMATCH")
                            BADMISMATCH += 1
                            continue
                    if distance == 0 or distance == corrected:
                        if self.options.store_overlap:
                            self.writeReads(getOverlap(r1, overlap_len),
                                            getOverlap(r2, overlap_len), i1,
                                            i2, overlap_read1_file,
                                            overlap_read2_file,
                                            overlap_index1_file,
                                            overlap_index2_file, None)

            #write to good
            self.writeReads(r1, r2, i1, i2, good_read1_file, good_read2_file,
                            good_index1_file, good_index2_file, None)
            if self.options.qc_sample <= 0 or TOTAL < self.options.qc_sample:
                r1qc_postfilter.statRead(r1)
                if r2 is not None:
                    r2qc_postfilter.statRead(r2)

            GOOD += 1
            # in QC-only mode stop once the sample size has been reached
            if self.options.qc_only and TOTAL >= self.options.qc_sample:
                break

        r1qc_postfilter.qc()
        r1qc_postfilter.plot(qc_dir, "R1-postfilter")
        if self.options.read2_file is not None:
            r2qc_postfilter.qc()
            r2qc_postfilter.plot(qc_dir, "R2-postfilter")

        #flush all output files
        if not self.options.qc_only:
            good_read1_file.flush()
            bad_read1_file.flush()
            if self.options.read2_file is not None:
                good_read2_file.flush()
                bad_read2_file.flush()
            if self.options.index1_file is not None:
                good_index1_file.flush()
                bad_index1_file.flush()
            if self.options.index2_file is not None:
                good_index2_file.flush()
                bad_index2_file.flush()
            # overlap writers only exist when store_overlap is enabled
            for overlap_file in (overlap_read1_file, overlap_read2_file,
                                 overlap_index1_file, overlap_index2_file):
                if overlap_file is not None:
                    overlap_file.flush()

        # print stat numbers
        BAD = TOTAL - GOOD
        result = {}
        result['total_reads'] = TOTAL
        result['good_reads'] = GOOD
        result['bad_reads'] = BAD
        result['bad_reads_with_bad_barcode'] = BADBCD1 + BADBCD2
        result['bad_reads_with_reads_in_bubble'] = BADBBL
        result['bad_reads_with_bad_read_length'] = BADLEN + BADTRIM1 + BADTRIM2
        result['bad_reads_with_polyX'] = BADPOL
        result['bad_reads_with_low_quality'] = BADLQC
        result['bad_reads_with_too_many_N'] = BADNCT
        result['bad_reads_with_bad_overlap'] = BADOL + BADMISMATCH + BADINDEL

        # plot result bar figure
        labels = [
            'good reads', 'has_polyX', 'low_quality', 'too_short', 'too_many_N'
        ]
        counts = [GOOD, BADPOL, BADLQC, BADLEN + BADTRIM1 + BADTRIM2, BADNCT]
        colors = ['#66BB11', '#FF33AF', '#FFD3F2', '#FFA322', '#FF8899']
        if self.options.read2_file is not None:
            labels.append('bad_overlap')
            counts.append(BADOL + BADMISMATCH + BADINDEL)
            colors.append('#FF6600')
        if self.options.debubble:
            labels.append('in_bubble')
            counts.append(BADBBL)
            colors.append('#EEBB00')
        if self.options.barcode:
            labels.append('bad_barcode')
            counts.append(BADBCD1 + BADBCD2)
            colors.append('#CCDD22')

        for i in range(len(counts)):
            # TOTAL is 0 when the input holds no reads; avoid dividing by it
            percent = 100.0 * float(counts[i]) / TOTAL if TOTAL > 0 else 0.0
            labels[i] = labels[i] + ": " + str(counts[i]) + "(" + str(
                percent) + "%)"

        fig = plt.figure(1)
        plt.title("Filtering statistics of sampled " + str(TOTAL) + " reads",
                  fontsize=12,
                  color='#666666')
        plt.axis('equal')
        patches, texts = plt.pie(counts, colors=colors, radius=0.7)
        # order the legend entries by descending count
        patches, labels, dummy = zip(*sorted(
            zip(patches, labels, counts), key=lambda x: x[2], reverse=True))
        plt.legend(patches, labels, loc='upper left', fontsize=9)
        plt.savefig(os.path.join(qc_dir, "filter-stat.png"),
                    bbox_inches='tight')
        plt.close(1)

        stat = {}
        stat["summary"] = result
        stat["command"] = makeDict(self.options)
        stat["kmer_content"] = {}
        stat["kmer_content"]["read1_prefilter"] = r1qc_prefilter.topKmerCount[
            0:10]
        stat["kmer_content"][
            "read1_postfilter"] = r1qc_postfilter.topKmerCount[0:10]
        if self.options.read2_file is not None:
            stat["kmer_content"][
                "read2_prefilter"] = r2qc_prefilter.topKmerCount[0:10]
            stat["kmer_content"][
                "read2_postfilter"] = r2qc_postfilter.topKmerCount[0:10]
            stat["overlap"] = {}
            stat["overlap"]['overlapped_pairs'] = OVERLAPPED
            if OVERLAPPED > 0:
                # convert before dividing so the average keeps its fraction
                stat["overlap"]['average_overlap_length'] = float(
                    OVERLAP_LEN_SUM) / OVERLAPPED
            else:
                stat["overlap"]['average_overlap_length'] = 0.0
            stat["overlap"]['bad_edit_distance'] = BADOL
            stat["overlap"]['bad_mismatch_bases'] = BADMISMATCH
            stat["overlap"]['bad_indel'] = BADINDEL
            stat["overlap"][
                'reads_with_corrected_mismatch_bases'] = BASE_CORRECTED
            stat["overlap"][
                'overlapped_area_edit_distance_histogram'] = distance_histgram[
                    0:10]
            plotOverlapHistgram(overlap_histgram, readLen, TOTAL,
                                os.path.join(qc_dir, "overlap.png"))

        stat_json = json.dumps(stat,
                               sort_keys=True,
                               indent=4,
                               separators=(',', ': '))
        # the context manager closes the file even if the write fails
        with open(os.path.join(qc_dir, "after.json"), "w") as stat_file:
            stat_file.write(stat_json)

        self.addFiguresToReport(reporter)
        reporter.output(os.path.join(qc_dir, "report.html"))

Example #4

Show file

File: preprocesser.py Project: hy714335634/AfterQC

    def run(self):
        if self.options.debubble:
            self.loadBubbleCircles()

        #read1_file is required
        read1_file = fastq.Reader(self.options.read1_file)

        #no front trim if sequence is barcoded
        if self.options.barcode:
            self.options.trim_front = 0

        reporter = QCReporter()

        self.r1qc_prefilter = QualityControl(self.options.qc_sample,
                                             self.options.qc_kmer)
        self.r2qc_prefilter = QualityControl(self.options.qc_sample,
                                             self.options.qc_kmer)
        self.r1qc_prefilter.statFile(self.options.read1_file)
        if self.options.read2_file != None:
            self.r2qc_prefilter.statFile(self.options.read2_file)

        self.r1qc_postfilter = QualityControl(self.options.qc_sample,
                                              self.options.qc_kmer)
        self.r2qc_postfilter = QualityControl(self.options.qc_sample,
                                              self.options.qc_kmer)

        readLen = self.r1qc_prefilter.readLen
        overlap_histgram = [0 for x in xrange(readLen + 1)]
        distance_histgram = [0 for x in xrange(readLen + 1)]

        #auto detect trim front and trim tail
        if self.options.trim_front == -1 or self.options.trim_tail == -1:
            #auto trim for read1
            trimFront, trimTail = self.r1qc_prefilter.autoTrim()
            if self.options.trim_front == -1:
                self.options.trim_front = trimFront
            if self.options.trim_tail == -1:
                self.options.trim_tail = trimTail
            #auto trim for read2
            if self.options.read2_file != None:
                # check if we should keep same trimming for read1/read2 to keep their length identical
                # this option is on by default because lots of dedup algorithms require this feature
                if self.options.trim_pair_same:
                    self.options.trim_front2 = self.options.trim_front
                    self.options.trim_tail2 = self.options.trim_tail
                else:
                    trimFront2, trimTail2 = self.r2qc_prefilter.autoTrim()
                    if self.options.trim_front2 == -1:
                        self.options.trim_front2 = trimFront2
                    if self.options.trim_tail2 == -1:
                        self.options.trim_tail2 = trimTail2

        print(self.options.read1_file + " options:")
        print(self.options)

        #if good output folder not specified, set it as the same folder of read1 file
        good_dir = self.options.good_output_folder
        if good_dir == None:
            good_dir = os.path.dirname(self.options.read1_file)

        #if bad output folder not specified, set it as the same folder of read1 file
        bad_dir = self.options.bad_output_folder
        if bad_dir == None:
            bad_dir = os.path.join(
                os.path.dirname(os.path.dirname(good_dir + "/")), "bad")

        #if overlap output folder not specified, set it as the same folder of read1 file
        overlap_dir = self.options.overlap_output_folder
        if overlap_dir == None:
            #            overlap_dir = os.path.dirname(self.options.read1_file)
            overlap_dir = os.path.join(
                os.path.dirname(os.path.dirname(good_dir + "/")), "overlap")

        #save QC results at the same folder of good
        qc_base_folder = self.options.report_output_folder
        if qc_base_folder == None:
            qc_base_folder = os.path.join(
                os.path.dirname(os.path.dirname(good_dir + "/")), "QC")
        if not os.path.exists(qc_base_folder):
            os.makedirs(qc_base_folder)
        qc_dir = os.path.join(qc_base_folder,
                              os.path.basename(self.options.read1_file))
        if not os.path.exists(qc_dir):
            os.makedirs(qc_dir)

        if not os.path.exists(good_dir):
            os.makedirs(good_dir)

        if not os.path.exists(bad_dir):
            os.makedirs(bad_dir)

        if self.options.store_overlap and self.options.read2_file != None and (
                not os.path.exists(overlap_dir)):
            os.makedirs(overlap_dir)

        good_read1_file = None
        bad_read1_file = None
        overlap_read1_file = None
        if not self.options.qc_only:
            good_read1_file = fastq.Writer(
                os.path.join(good_dir,
                             getMainName(self.options.read1_file) +
                             ".good.fq"))
            bad_read1_file = fastq.Writer(
                os.path.join(bad_dir,
                             getMainName(self.options.read1_file) + ".bad.fq"))

            overlap_read1_file = None
            if self.options.store_overlap:
                overlap_read1_file = fastq.Writer(
                    os.path.join(
                        overlap_dir,
                        getMainName(self.options.read1_file) + ".overlap.fq"))

        #other files are optional
        read2_file = None
        good_read2_file = None
        bad_read2_file = None
        overlap_read2_file = None

        index1_file = None
        good_index1_file = None
        bad_index1_file = None
        overlap_index1_file = None

        index2_file = None
        good_index2_file = None
        bad_index2_file = None
        overlap_index2_file = None

        #if other files are specified, then read them
        if self.options.read2_file != None:
            read2_file = fastq.Reader(self.options.read2_file)
            if not self.options.qc_only:
                good_read2_file = fastq.Writer(
                    os.path.join(
                        good_dir,
                        getMainName(self.options.read2_file) + ".good.fq"))
                bad_read2_file = fastq.Writer(
                    os.path.join(
                        bad_dir,
                        getMainName(self.options.read2_file) + ".bad.fq"))
                if self.options.store_overlap and self.options.read2_file != None:
                    overlap_read2_file = fastq.Writer(
                        os.path.join(
                            overlap_dir,
                            getMainName(self.options.read2_file) +
                            ".overlap.fq"))
        if self.options.index1_file != None:
            index1_file = fastq.Reader(self.options.index1_file)
            if not self.options.qc_only:
                good_index1_file = fastq.Writer(
                    os.path.join(
                        good_dir,
                        getMainName(self.options.index1_file) + ".good.fq"))
                bad_index1_file = fastq.Writer(
                    os.path.join(
                        bad_dir,
                        getMainName(self.options.index1_file) + ".bad.fq"))
                if self.options.store_overlap and self.options.read2_file != None:
                    overlap_index1_file = fastq.Writer(
                        os.path.join(
                            overlap_dir,
                            getMainName(self.options.index1_file) +
                            ".overlap.fq"))
        if self.options.index2_file != None:
            index2_file = fastq.Reader(self.options.index2_file)
            if not self.options.qc_only:
                good_index2_file = fastq.Writer(
                    os.path.join(
                        good_dir,
                        getMainName(self.options.index2_file) + ".good.fq"))
                bad_index2_file = fastq.Writer(
                    os.path.join(
                        bad_dir,
                        getMainName(self.options.index2_file) + ".bad.fq"))
                if self.options.store_overlap and self.options.read2_file != None:
                    overlap_index2_file = fastq.Writer(
                        os.path.join(
                            overlap_dir,
                            getMainName(self.options.index2_file) +
                            ".overlap.fq"))

        r1 = None
        r2 = None
        i1 = None
        i2 = None

        # stat numbers
        TOTAL_BASES = 0
        GOOD_BASES = 0
        TOTAL_READS = 0
        GOOD_READS = 0
        BAD_READS = 0
        BADBCD1 = 0
        BADBCD2 = 0
        BADTRIM1 = 0
        BADTRIM2 = 0
        BADBBL = 0
        BADLEN = 0
        BADPOL = 0
        BADLQC = 0
        BADNCT = 0
        BADINDEL = 0
        BADMISMATCH = 0
        READ_CORRECTED = 0
        BASE_CORRECTED = 0
        BASE_ZERO_QUAL_MASKED = 0
        OVERLAPPED = 0
        OVERLAP_LEN_SUM = 0
        OVERLAP_BASE_SUM = 0
        # error profiling by overlap analysis
        OVERLAP_BASE_ERR = 0
        OVERLAP_ERR_MATRIX = init_error_matrix()

        #adapter trimming by overlap analysis
        TRIMMED_ADAPTER_BASE = 0

        while True:
            r1 = read1_file.nextRead()
            if r1 == None:
                break
            else:
                TOTAL_BASES += len(r1[1])

            if read2_file != None:
                r2 = read2_file.nextRead()
                if r2 == None:
                    break
            if index1_file != None:
                i1 = index1_file.nextRead()
                if i1 == None:
                    break
            if index2_file != None:
                i2 = index2_file.nextRead()
                if i2 == None:
                    break
                else:
                    TOTAL_BASES += len(r2[1])

            TOTAL_READS += 1

            #barcode processing
            if self.options.barcode:
                barcodeLen1 = barcodeprocesser.detectBarcode(
                    r1[1], self.options.barcode_length,
                    self.options.barcode_verify)
                if barcodeLen1 == 0:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADBCD1")
                    BADBCD1 += 1
                    continue
                else:
                    if r2 == None:
                        barcodeprocesser.moveBarcodeToName(
                            r1, self.options.barcode_length,
                            self.options.barcode_verify)
                    else:
                        barcodeLen2 = barcodeprocesser.detectBarcode(
                            r2[1], self.options.barcode_length,
                            self.options.barcode_verify)
                        if barcodeLen2 == 0:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                            bad_read2_file, bad_index1_file,
                                            bad_index2_file, "BADBCD2")
                            BADBCD2 += 1
                            continue
                        else:
                            barcodeprocesser.moveAndTrimPair(
                                r1, r2, barcodeLen1, barcodeLen2,
                                self.options.barcode_verify)

            #trim
            if self.options.trim_front > 0 or self.options.trim_tail > 0:
                r1 = trim(r1, self.options.trim_front, self.options.trim_tail)
                if len(r1[1]) < 5:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADTRIM1")
                    BADTRIM1 += 1
                    continue
                if r2 != None:
                    r2 = trim(r2, self.options.trim_front2,
                              self.options.trim_tail2)
                    if len(r2[1]) < 5:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                        bad_read2_file, bad_index1_file,
                                        bad_index2_file, "BADTRIM2")
                        BADTRIM2 += 1
                        continue

            #filter debubble
            if self.options.debubble:
                if self.isInBubble(r1[0]):
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADBBL")
                    BADBBL += 1
                    continue

            #filter sequence length
            if len(r1[1]) < self.options.seq_len_req:
                self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                                bad_index1_file, bad_index2_file, "BADLEN")
                BADLEN += 1
                continue

            #check polyX
            if self.options.poly_size_limit > 0:
                poly1 = hasPolyX(r1[1], self.options.poly_size_limit,
                                 self.options.allow_mismatch_in_poly)
                poly2 = None
                if r2 != None:
                    poly2 = hasPolyX(r2[1], self.options.poly_size_limit,
                                     self.options.allow_mismatch_in_poly)
                if poly1 != None or poly2 != None:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADPOL")
                    BADPOL += 1
                    continue

            #check low quality count
            if self.options.unqualified_base_limit > 0:
                lowQual1 = lowQualityNum(r1,
                                         self.options.qualified_quality_phred)
                lowQual2 = 0
                if r2 != None:
                    lowQual2 = lowQualityNum(
                        r2, self.options.qualified_quality_phred)
                if lowQual1 > self.options.unqualified_base_limit or lowQual1 > self.options.unqualified_base_limit:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADLQC")
                    BADLQC += 1
                    continue

            #check N number
            if self.options.n_base_limit > 0:
                nNum1 = nNumber(r1)
                nNum2 = 0
                if r2 != None:
                    nNum2 = nNumber(r2)
                if nNum1 > self.options.n_base_limit or nNum2 > self.options.n_base_limit:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADNCT")
                    BADNCT += 1
                    continue

            #check overlap and do error correction
            if r2 != None and (not self.options.no_overlap):
                (offset, overlap_len, distance) = util.overlap(r1[1], r2[1])
                overlap_histgram[overlap_len] += 1
                # deal with the case insert DNA is shorter than read length and cause offset is negative
                # in this case the adapter is sequenced and should be trimmed
                if offset < 0 and overlap_len > 30:
                    # shift the junk bases
                    r1[1] = r1[1][0:overlap_len]
                    r1[3] = r1[3][0:overlap_len]
                    r2[1] = r2[1][0:overlap_len]
                    r2[3] = r2[3][0:overlap_len]
                    TRIMMED_ADAPTER_BASE += abs(offset) * 2
                    # check the sequence length again after adapter trimmed
                    if len(r1[1]) < self.options.seq_len_req:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                        bad_read2_file, bad_index1_file,
                                        bad_index2_file, "BADLEN")
                        BADLEN += 1
                        continue
                    # then calc overlap again
                    (offset, overlap_len,
                     distance) = util.overlap(r1[1], r2[1])
                if overlap_len > 30:
                    OVERLAPPED += 1
                    distance_histgram[distance] += 1
                    OVERLAP_LEN_SUM += overlap_len
                    # we consider the distance is caused by sequencing error
                    OVERLAP_BASE_SUM += overlap_len * 2
                    OVERLAP_BASE_ERR += distance
                    corrected = 0
                    zero_qual_masked = 0
                    skipped_mismatch = 0
                    if distance > 0:
                        #try to fix low quality base
                        hamming = util.hammingDistance(
                            r1[1][len(r1[1]) - overlap_len:],
                            util.reverseComplement(r2[1][len(r2[1]) -
                                                         overlap_len:]))
                        if hamming != distance:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                            bad_read2_file, bad_index1_file,
                                            bad_index2_file, "BADINDEL")
                            BADINDEL += 1
                            continue
                        #print(r1[1][len(r1[1]) - overlap_len:])
                        #print(util.reverseComplement(r2[1][len(r2[1]) - overlap_len:]))
                        #print(r1[3][len(r1[1]) - overlap_len:])
                        #print(util.reverse(r2[3][len(r2[1]) - overlap_len:]))
                        err_mtx = init_error_matrix()
                        for o in xrange(overlap_len):
                            b1 = r1[1][len(r1[1]) - overlap_len + o]
                            b2 = util.complement(r2[1][-o - 1])
                            q1 = r1[3][len(r1[3]) - overlap_len + o]
                            q2 = r2[3][-o - 1]
                            if b1 != b2:
                                # print(TOTAL_READS, o, b1, b2, q1, q2)
                                this_is_corrected = False
                                if util.qualNum(q1) >= 30 and util.qualNum(
                                        q2) <= 14:
                                    if b1 != 'N' and b2 != 'N':
                                        err_mtx[util.complement(b1)][
                                            util.complement(b2)] += 1
                                    if not self.options.no_correction:
                                        r2[1] = util.changeString(
                                            r2[1], -o - 1, util.complement(b1))
                                        r2[3] = util.changeString(
                                            r2[3], -o - 1, q1)
                                        corrected += 1
                                        this_is_corrected = True
                                elif util.qualNum(q2) >= 30 and util.qualNum(
                                        q1) <= 14:
                                    if b1 != 'N' and b2 != 'N':
                                        err_mtx[b2][b1] += 1
                                    if not self.options.no_correction:
                                        r1[1] = util.changeString(
                                            r1[1],
                                            len(r1[1]) - overlap_len + o, b2)
                                        r1[3] = util.changeString(
                                            r1[3],
                                            len(r1[3]) - overlap_len + o, q2)
                                        corrected += 1
                                        this_is_corrected = True
                                if not this_is_corrected:
                                    if self.options.mask_mismatch:
                                        # mask them as zero qual if it is not corrected
                                        zero_qual = '!'
                                        r2[3] = util.changeString(
                                            r2[3], -o - 1, zero_qual)
                                        r1[3] = util.changeString(
                                            r1[3],
                                            len(r1[3]) - overlap_len + o,
                                            zero_qual)
                                        zero_qual_masked += 1
                                    else:
                                        skipped_mismatch += 1

                                if corrected + zero_qual_masked + skipped_mismatch >= distance:
                                    break
                        #print(r1[1][len(r1[1]) - overlap_len:])
                        #print(util.reverseComplement(r2[1][len(r2[1]) - overlap_len:]))
                        #print(r1[3][len(r1[1]) - overlap_len:])
                        #print(util.reverse(r2[3][len(r2[1]) - overlap_len:]))
                        if corrected + zero_qual_masked + skipped_mismatch == distance:
                            merge_error_matrix(OVERLAP_ERR_MATRIX, err_mtx)
                            if corrected > 0:
                                READ_CORRECTED += 1
                            BASE_CORRECTED += corrected
                            # multiply by 2 since we mask bases by pair
                            BASE_ZERO_QUAL_MASKED += zero_qual_masked * 2
                        else:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                            bad_read2_file, bad_index1_file,
                                            bad_index2_file, "BADMISMATCH")
                            BADMISMATCH += 1
                            continue
                    if distance == 0 or distance == corrected:
                        if self.options.store_overlap:
                            self.writeReads(getOverlap(r1, overlap_len),
                                            getOverlap(r2, overlap_len), i1,
                                            i2, overlap_read1_file,
                                            overlap_read2_file,
                                            overlap_index1_file,
                                            overlap_index2_file, None)

            #write to good
            self.writeReads(r1, r2, i1, i2, good_read1_file, good_read2_file,
                            good_index1_file, good_index2_file, None)
            GOOD_BASES += len(r1[1])
            if i2 != None:
                GOOD_BASES += len(r2[1])
            if self.options.qc_sample <= 0 or TOTAL_READS < self.options.qc_sample:
                self.r1qc_postfilter.statRead(r1)
                if r2 != None:
                    self.r2qc_postfilter.statRead(r2)

            GOOD_READS += 1
            if self.options.qc_only and TOTAL_READS >= self.options.qc_sample:
                break

        self.r1qc_postfilter.qc()
        #self.r1qc_postfilter.plot(qc_dir, "R1-postfilter")
        if self.options.read2_file != None:
            self.r2qc_postfilter.qc()
            #self.r2qc_postfilter.plot(qc_dir, "R2-postfilter")

        #close all files
        if not self.options.qc_only:
            good_read1_file.flush()
            bad_read1_file.flush()
            if self.options.read2_file != None:
                good_read2_file.flush()
                bad_read2_file.flush()
            if self.options.index1_file != None:
                good_index1_file.flush()
                bad_index1_file.flush()
            if self.options.index2_file != None:
                good_index2_file.flush()
                bad_index2_file.flush()

        # print stat numbers
        BAD_READS = TOTAL_READS - GOOD_READS
        result = {}
        result['total_bases'] = TOTAL_BASES
        result['good_bases'] = GOOD_BASES
        result['total_reads'] = TOTAL_READS
        result['good_reads'] = GOOD_READS
        result['bad_reads'] = BAD_READS
        result['bad_reads_with_bad_barcode'] = BADBCD1 + BADBCD2
        result['bad_reads_with_reads_in_bubble'] = BADBBL
        result['bad_reads_with_bad_read_length'] = BADLEN + BADTRIM1 + BADTRIM2
        result['bad_reads_with_polyX'] = BADPOL
        result['bad_reads_with_low_quality'] = BADLQC
        result['bad_reads_with_too_many_N'] = BADNCT
        result['bad_reads_with_bad_overlap'] = BADMISMATCH + BADINDEL
        result['readlen'] = readLen

        # plot result bar figure
        labels = [
            'good reads', 'has_polyX', 'low_quality', 'too_short', 'too_many_N'
        ]
        counts = [
            GOOD_READS, BADPOL, BADLQC, BADLEN + BADTRIM1 + BADTRIM2, BADNCT
        ]
        colors = ['#66BB11', '#FF33AF', '#FFD3F2', '#FFA322', '#FF8899']
        if self.options.read2_file != None:
            labels.append('bad_overlap')
            counts.append(BADMISMATCH + BADINDEL)
            colors.append('#FF6600')
        if self.options.debubble:
            labels.append('in_bubble')
            counts.append(BADBBL)
            colors.append('#EEBB00')
        if self.options.barcode:
            labels.append('bad_barcode')
            counts.append(BADBCD1 + BADBCD2)
            colors.append('#CCDD22')

        for i in xrange(len(counts)):
            type_percent = 0.0
            if TOTAL_READS > 0:
                type_percent = 100.0 * float(counts[i]) / TOTAL_READS
            labels[i] = labels[i] + ": " + str(
                counts[i]) + "(" + str(type_percent) + "%)"

        reporter.addFigure(
            'Good reads and bad reads after filtering',
            self.r1qc_prefilter.statPlotly(labels, counts, TOTAL_READS,
                                           'filter_stat'), 'filter_stat', "")
        #self.r1qc_prefilter.plotFilterStats(labels, counts, colors, TOTAL_READS, os.path.join(qc_dir, "filter-stat.png"))

        stat = {}
        # stat["options"]=self.options
        stat["summary"] = result
        stat["command"] = makeDict(self.options)
        stat["kmer_content"] = {}
        stat["kmer_content"][
            "read1_prefilter"] = self.r1qc_prefilter.topKmerCount[0:10]
        stat["kmer_content"][
            "read1_postfilter"] = self.r1qc_postfilter.topKmerCount[0:10]
        if self.options.read2_file != None:
            stat["kmer_content"][
                "read2_prefilter"] = self.r2qc_prefilter.topKmerCount[0:10]
            stat["kmer_content"][
                "read2_postfilter"] = self.r2qc_postfilter.topKmerCount[0:10]
            stat["overlap"] = {}
            stat["overlap"]['overlapped_pairs'] = OVERLAPPED
            if OVERLAPPED > 0:
                stat["overlap"]['average_overlap_length'] = float(
                    OVERLAP_LEN_SUM / OVERLAPPED)
            else:
                stat["overlap"]['average_overlap_length'] = 0.0
            stat["overlap"]['bad_mismatch_reads'] = BADMISMATCH
            stat["overlap"]['bad_indel_reads'] = BADINDEL
            stat["overlap"]['corrected_reads'] = READ_CORRECTED
            stat["overlap"]['corrected_bases'] = BASE_CORRECTED
            stat["overlap"]['zero_qual_masked'] = BASE_ZERO_QUAL_MASKED
            stat["overlap"]['trimmed_adapter_bases'] = TRIMMED_ADAPTER_BASE
            if OVERLAP_BASE_SUM > 0:
                stat["overlap"]['error_rate'] = float(
                    OVERLAP_BASE_ERR) / float(OVERLAP_BASE_SUM)
            else:
                stat["overlap"]['error_rate'] = 0.0
            stat["overlap"]['error_matrix'] = OVERLAP_ERR_MATRIX
            stat["overlap"]['edit_distance_histogram'] = distance_histgram[
                0:10]
            reporter.addFigure(
                'Sequence error distribution',
                self.r1qc_prefilter.errorPlotly(OVERLAP_ERR_MATRIX,
                                                'error_matrix'),
                'error_matrix', "")
            reporter.addFigure(
                'Overlap length distribution',
                self.r1qc_prefilter.overlapPlotly(overlap_histgram, readLen,
                                                  TOTAL_READS, 'overlap_stat'),
                'overlap_stat', "")
            #self.r1qc_prefilter.plotOverlapHistgram(overlap_histgram, readLen, TOTAL_READS, os.path.join(qc_dir, "overlap.png"))

        stat_file = open(os.path.join(qc_dir, "report.json"), "w")
        stat_json = json.dumps(stat,
                               sort_keys=True,
                               indent=4,
                               separators=(',', ': '))
        stat_file.write(stat_json)
        stat_file.close()

        self.addFiguresToReport(reporter)
        reporter.setStat(stat)
        reporter.setVersion(self.options.version)
        reporter.output(os.path.join(qc_dir, "report.html"))

Example #5

Show file

File: preprocesser.py Project: Yixf-Self/AfterQC

    def run(self):
        if self.options.debubble:
            self.loadBubbleCircles()

        #read1_file is required
        read1_file = fastq.Reader(self.options.read1_file)
        #create a QC folder to contain the QC results
        qc_base_folder = os.path.join(os.path.dirname(self.options.read1_file), "QC")
        if not os.path.exists(qc_base_folder):
            os.makedirs(qc_base_folder)
        #QC result of this file/pair
        qc_dir =  os.path.join(qc_base_folder, os.path.basename(self.options.read1_file))
        if not os.path.exists(qc_dir):
            os.makedirs(qc_dir)

        #no front trim if sequence is barcoded
        if self.options.barcode:
            self.options.trim_front = 0

        reporter = QCReporter()

        r1qc_prefilter = QualityControl(self.options.qc_sample, self.options.qc_kmer)
        r2qc_prefilter = QualityControl(self.options.qc_sample, self.options.qc_kmer)
        r1qc_prefilter.statFile(self.options.read1_file)
        r1qc_prefilter.plot(qc_dir, "R1-prefilter")
        if self.options.read2_file != None:
            r2qc_prefilter.statFile(self.options.read2_file)
            r2qc_prefilter.plot(qc_dir, "R2-prefilter")

        r1qc_postfilter = QualityControl(self.options.qc_sample, self.options.qc_kmer)
        r2qc_postfilter = QualityControl(self.options.qc_sample, self.options.qc_kmer)

        readLen = r1qc_prefilter.readLen
        overlap_histgram = [0 for x in xrange(readLen+1)]
        distance_histgram = [0 for x in xrange(readLen+1)]

        #auto detect trim front and trim tail
        if self.options.trim_front == -1 or self.options.trim_tail == -1:
            #auto trim for read1
            trimFront, trimTail = r1qc_prefilter.autoTrim()
            if self.options.trim_front == -1:
                self.options.trim_front = trimFront
            if self.options.trim_tail == -1:
                self.options.trim_tail = trimTail
            #auto trim for read2
            if self.options.read2_file != None:
                # check if we should keep same trimming for read1/read2 to keep their length identical
                # this option is on by default because lots of dedup algorithms require this feature
                if self.options.trim_pair_same:
                    self.options.trim_front2 = self.options.trim_front
                    self.options.trim_tail2 = self.options.trim_tail
                else:
                    trimFront2, trimTail2 = r2qc_prefilter.autoTrim()
                    if self.options.trim_front2 == -1:
                        self.options.trim_front2 = trimFront2
                    if self.options.trim_tail2 == -1:
                        self.options.trim_tail2 = trimTail2
                
        print(self.options.read1_file + " options:")
        print(self.options)
        
        #if good output folder not specified, set it as the same folder of read1 file
        good_dir = self.options.good_output_folder
        if good_dir == None:
            good_dir = os.path.dirname(self.options.read1_file)

        #if bad output folder not specified, set it as the same folder of read1 file            
        bad_dir = self.options.bad_output_folder
        if bad_dir == None:
            bad_dir = os.path.dirname(self.options.read1_file)

        #if overlap output folder not specified, set it as the same folder of read1 file
        overlap_dir = self.options.overlap_output_folder
        if overlap_dir == None:
            overlap_dir = os.path.dirname(self.options.read1_file)
            
        if not os.path.exists(good_dir):
            os.makedirs(good_dir)
            
        if not os.path.exists(bad_dir):
            os.makedirs(bad_dir)

        if self.options.store_overlap and self.options.read2_file != None and (not os.path.exists(overlap_dir)):
            os.makedirs(overlap_dir)
        
        good_read1_file = None
        bad_read1_file = None
        overlap_read1_file = None
        if not self.options.qc_only:
            good_read1_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.read1_file)+".good.fq"))
            bad_read1_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.read1_file)+".bad.fq"))

            overlap_read1_file = None
            if self.options.store_overlap:
                overlap_read1_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.read1_file)+".overlap.fq"))
        
        #other files are optional
        read2_file = None
        good_read2_file = None
        bad_read2_file = None
        overlap_read2_file = None

        index1_file = None
        good_index1_file = None
        bad_index1_file = None
        overlap_index1_file = None

        index2_file = None
        good_index2_file = None
        bad_index2_file = None
        overlap_index2_file = None
        
        #if other files are specified, then read them
        if self.options.read2_file != None:
            read2_file = fastq.Reader(self.options.read2_file)
            if not self.options.qc_only:
                good_read2_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.read2_file)+".good.fq"))
                bad_read2_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.read2_file)+".bad.fq"))
                if self.options.store_overlap and self.options.read2_file != None:
                    overlap_read2_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.read2_file)+".overlap.fq"))
        if self.options.index1_file != None:
            index1_file = fastq.Reader(self.options.index1_file)
            if not self.options.qc_only:
                good_index1_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.index1_file)+".good.fq"))
                bad_index1_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.index1_file)+".bad.fq"))
                if self.options.store_overlap and self.options.read2_file != None:
                    overlap_index1_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.index1_file)+".overlap.fq"))
        if self.options.index2_file != None:
            index2_file = fastq.Reader(self.options.index2_file)
            if not self.options.qc_only:
                good_index2_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.index2_file)+".good.fq"))
                bad_index2_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.index2_file)+".bad.fq"))
                if self.options.store_overlap and self.options.read2_file != None:
                    overlap_index2_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.index2_file)+".overlap.fq"))
            
        r1 = None
        r2 = None
        i1 = None
        i2 = None

        # stat numbers
        TOTAL = 0
        GOOD = 0
        BAD = 0
        BADBCD1 = 0
        BADBCD2 = 0
        BADTRIM1 = 0
        BADTRIM2 = 0
        BADBBL = 0
        BADLEN = 0
        BADPOL = 0
        BADLQC = 0
        BADNCT = 0
        BADOL = 0
        BADINDEL = 0
        BADMISMATCH = 0
        BASE_CORRECTED = 0
        OVERLAPPED = 0
        OVERLAP_LEN_SUM = 0

        while True:
            r1 = read1_file.nextRead()
            if r1==None:
                break
                
            if read2_file != None:
                r2 = read2_file.nextRead()
                if r2==None:
                    break
            if index1_file != None:
                i1 = index1_file.nextRead()
                if i1==None:
                    break
            if index2_file != None:
                i2 = index2_file.nextRead()
                if i2==None:
                    break

            TOTAL += 1
                    
            #barcode processing
            if self.options.barcode:
                barcodeLen1 = barcodeprocesser.detectBarcode(r1[1], self.options.barcode_length, self.options.barcode_verify)
                if barcodeLen1 == 0:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADBCD1")
                    BADBCD1 += 1
                    continue
                else:
                    if r2 == None:
                        barcodeprocesser.moveBarcodeToName(r1, self.options.barcode_length, self.options.barcode_verify)
                    else:
                        barcodeLen2 = barcodeprocesser.detectBarcode(r2[1], self.options.barcode_length, self.options.barcode_verify)
                        if barcodeLen2 == 0:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADBCD2")
                            BADBCD2 += 1
                            continue
                        else:
                            barcodeprocesser.moveAndTrimPair(r1, r2, barcodeLen1, barcodeLen2, self.options.barcode_verify)
            
            #trim
            if self.options.trim_front > 0 or self.options.trim_tail > 0:
                r1 = trim(r1, self.options.trim_front, self.options.trim_tail)
                if len(r1[1]) < 5:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADTRIM1")
                    BADTRIM1 += 1
                    continue
                if r2 != None:
                    r2 = trim(r2, self.options.trim_front2, self.options.trim_tail2)
                    if len(r2[1]) < 5:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADTRIM2")
                        BADTRIM2 += 1
                        continue

            #filter debubble
            if self.options.debubble:
                if self.isInBubble(r1[0]):
                    self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADBBL")
                    BADBBL += 1
                    continue
            
            #filter sequence length
            if len(r1[1])<self.options.seq_len_req:
                self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADLEN")
                BADLEN += 1
                continue
                    
            #check polyX
            if self.options.poly_size_limit > 0:
                poly1 = hasPolyX(r1[1], self.options.poly_size_limit, self.options.allow_mismatch_in_poly)
                poly2 = None
                if r2!=None:
                    poly2 = hasPolyX(r2[1], self.options.poly_size_limit, self.options.allow_mismatch_in_poly)
                if poly1!=None or poly2!=None:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADPOL")
                    BADPOL += 1
                    continue
            
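            #check low quality count
            # NOTE(review): the condition below compares lowQual1 against the limit twice, so lowQual2 is never checked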
            if self.options.unqualified_base_limit > 0:
                lowQual1 = lowQualityNum(r1, self.options.qualified_quality_phred)
                lowQual2 = 0
                if r2!=None:
                    lowQual2 = lowQualityNum(r2, self.options.qualified_quality_phred)
                if lowQual1 > self.options.unqualified_base_limit or lowQual1 > self.options.unqualified_base_limit:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADLQC")
                    BADLQC += 1
                    continue
            
            #check N number
            if self.options.n_base_limit > 0:
                nNum1 = nNumber(r1)
                nNum2 = 0
                if r2!=None:
                    nNum2 = nNumber(r2)
                if nNum1 > self.options.n_base_limit or nNum2 > self.options.n_base_limit:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADNCT")
                    BADNCT += 1
                    continue

            #check overlap and do error correction
            if r2!=None:
                (offset, overlap_len, distance) = util.overlap(r1[1], r2[1])
                overlap_histgram[overlap_len] += 1
                # handle the case where the insert DNA is shorter than the read length, which makes the offset negative
                if offset <0 and overlap_len > 30:
                    # shift the junk bases
                    r1[1] = r1[1][0:overlap_len]
                    r1[3] = r1[3][0:overlap_len]
                    r2[1] = r2[1][-offset:-offset+overlap_len]
                    r2[3] = r2[3][-offset:-offset+overlap_len]
                    # then calc overlap again
                    (offset, overlap_len, distance) = util.overlap(r1[1], r2[1])
                if overlap_len>30:
                    OVERLAPPED += 1
                    distance_histgram[distance] += 1
                    OVERLAP_LEN_SUM += overlap_len
                    corrected = 0
                    if distance > 2:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADOL")
                        BADOL += 1
                        continue
                    elif distance>0:
                        #try to fix low quality base
                        hamming = util.hammingDistance(r1[1][len(r1[1]) - overlap_len:], util.reverseComplement(r2[1][len(r2[1]) - overlap_len:]))
                        if hamming != distance:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADINDEL")
                            BADINDEL += 1
                            continue
                        #print(r1[1][len(r1[1]) - overlap_len:])
                        #print(util.reverseComplement(r2[1][len(r2[1]) - overlap_len:]))
                        #print(r1[3][len(r1[1]) - overlap_len:])
                        #print(util.reverse(r2[3][len(r2[1]) - overlap_len:]))
                        for o in xrange(overlap_len):
                            b1 = r1[1][len(r1[1]) - overlap_len + o]
                            b2 = util.complement(r2[1][-o-1])
                            q1 = r1[3][len(r1[3]) - overlap_len + o]
                            q2 = r2[3][-o-1]
                            if b1 != b2:
                                # print(TOTAL, o, b1, b2, q1, q2)
                                if util.qualNum(q1) >= 27 and util.qualNum(q2) <= 16:
                                    r2[1] = util.changeString(r2[1], -o-1, util.complement(b1))
                                    r2[3] = util.changeString(r2[3], -o-1, q1)
                                    corrected += 1
                                elif util.qualNum(q2) >= 27 and util.qualNum(q1) <= 16:
                                    r1[1]= util.changeString(r1[1], len(r1[1]) - overlap_len + o, b2)
                                    r1[3] = util.changeString(r1[3], len(r1[3]) - overlap_len + o, q2)
                                    corrected += 1
                                if corrected >= distance:
                                    break
                        #print(r1[1][len(r1[1]) - overlap_len:])
                        #print(util.reverseComplement(r2[1][len(r2[1]) - overlap_len:]))
                        #print(r1[3][len(r1[1]) - overlap_len:])
                        #print(util.reverse(r2[3][len(r2[1]) - overlap_len:]))
                        if corrected == distance:
                            BASE_CORRECTED += 1
                        else:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADMISMATCH")
                            BADMISMATCH += 1
                            continue
                    if distance == 0 or distance == corrected:
                        if self.options.store_overlap:
                            self.writeReads(getOverlap(r1, overlap_len), getOverlap(r2, overlap_len), i1, i2, overlap_read1_file, overlap_read2_file, overlap_index1_file, overlap_index2_file, None)

            #write to good       
            self.writeReads(r1, r2, i1, i2, good_read1_file, good_read2_file, good_index1_file, good_index2_file, None)
            r1qc_postfilter.statRead(r1)
            if r2 != None:
                r2qc_postfilter.statRead(r2)

            GOOD += 1
            if self.options.qc_only and TOTAL >= self.options.qc_sample:
                break

        r1qc_postfilter.qc()
        r1qc_postfilter.plot(qc_dir, "R1-postfilter")
        if self.options.read2_file != None:
            r2qc_postfilter.qc()
            r2qc_postfilter.plot(qc_dir, "R2-postfilter")
        
        #flush all good/bad output files (they are not closed here)
        if not self.options.qc_only:
            good_read1_file.flush()
            bad_read1_file.flush()
            if self.options.read2_file != None:
                good_read2_file.flush()
                bad_read2_file.flush()
            if self.options.index1_file != None:
                good_index1_file.flush()
                bad_index1_file.flush()
            if self.options.index2_file != None:
                good_index2_file.flush()
                bad_index2_file.flush()

        # collect stat numbers into the result summary
        BAD = TOTAL - GOOD
        result = {}
        result['total_reads']=TOTAL
        result['good_reads']=GOOD
        result['bad_reads']=BAD
        result['bad_reads_with_bad_barcode']= BADBCD1 + BADBCD2
        result['bad_reads_with_reads_in_bubble']= BADBBL
        result['bad_reads_with_bad_read_length']= BADLEN + BADTRIM1 + BADTRIM2
        result['bad_reads_with_polyX']= BADPOL
        result['bad_reads_with_low_quality']=BADLQC
        result['bad_reads_with_too_many_N']= BADNCT
        result['bad_reads_with_bad_overlap']= BADOL + BADMISMATCH + BADINDEL

        # plot result bar figure
        labels = ['good reads', 'has_polyX', 'low_quality', 'too_short', 'too_many_N']
        counts = [GOOD, BADPOL, BADLQC, BADLEN + BADTRIM1 + BADTRIM2, BADNCT]
        colors = ['green', '#FF1111', '#FF3333', '#FF5555', '#FF7777']
        if self.options.read2_file != None:
            labels.append('bad_overlap')
            counts.append(BADOL + BADMISMATCH + BADINDEL)
            colors.append('#FF9999')
        if self.options.debubble:
            labels.append('in_bubble')
            counts.append(BADBBL)
            colors.append('#FFBBBB')
        if self.options.barcode:
            labels.append('bad_barcode')
            counts.append(BADBCD1 + BADBCD2)
            colors.append('#FFDDDD')

        fig = plt.figure(1)
        plt.title("Good reads (green) and bad reads (red) of total " + str(TOTAL))
        fig.subplots_adjust(left = 0.14)
        lefts = xrange(len(counts))
        plt.yticks(lefts, labels)
        plt.ylim(-0.5, len(counts)-0.5)
        plt.barh(lefts, counts, align='center', height=0.5, alpha=0.8, color=colors)
        plt.savefig(os.path.join(qc_dir, "filter-stat.png"))
        plt.close(1)

        stat={}
        # stat["options"]=self.options
        stat["summary"]=result
        stat["command"]=makeDict(self.options)
        stat["kmer_content"] = {}
        stat["kmer_content"]["read1_prefilter"] = r1qc_prefilter.topKmerCount[0:10]
        stat["kmer_content"]["read1_postfilter"] = r1qc_postfilter.topKmerCount[0:10]
        if self.options.read2_file != None:
            stat["kmer_content"]["read2_prefilter"] = r2qc_prefilter.topKmerCount[0:10]
            stat["kmer_content"]["read2_postfilter"] = r2qc_postfilter.topKmerCount[0:10]
            stat["overlap"]={}
            stat["overlap"]['overlapped_pairs']=OVERLAPPED
            if OVERLAPPED > 0:
                stat["overlap"]['average_overlap_length']=float(OVERLAP_LEN_SUM/OVERLAPPED)
            else:
                stat["overlap"]['average_overlap_length']=0.0
            stat["overlap"]['bad_edit_distance']=BADOL
            stat["overlap"]['bad_mismatch_bases']=BADMISMATCH
            stat["overlap"]['bad_indel']=BADINDEL
            stat["overlap"]['reads_with_corrected_mismatch_bases']=BASE_CORRECTED
            stat["overlap"]['overlapped_area_edit_distance_histogram']=distance_histgram[0:10]
            plotOverlapHistgram(overlap_histgram, readLen, TOTAL, os.path.join(qc_dir, "overlap.png"))

        stat_file = open(os.path.join(qc_dir, "after.json"), "w")
        stat_json = json.dumps(stat, sort_keys=True,indent=4, separators=(',', ': '))
        stat_file.write(stat_json)
        stat_file.close()

        self.addFiguresToReport(reporter)
        reporter.output(os.path.join(qc_dir, "report.html"))