Esempio n. 1
0
def cleanBarcodeTail(read1, read2, readStart1, readStart2):
    """Trim mate read-through from the tails of a read pair, in place.

    The last ``tailLen`` items of ``read1[1]`` are compared (by edit distance)
    with the reverse complement of ``readStart2``, and the tail of ``read2[1]``
    with the reverse complement of ``readStart1``. Candidate lengths are tried
    from the longest (the shorter of the two start strings) downwards. The
    first length for which both distances are at most one fifth of the
    compared length is cut from index 1 and index 3 of both reads
    (presumably the sequence and quality strings).

    Returns the number of items removed from each read, or 0 if none matched.
    """
    rcStart1 = util.reverseComplement(readStart1)
    rcStart2 = util.reverseComplement(readStart2)
    maxLen = min(len(readStart1), len(readStart2))
    # A candidate tail must be strictly shorter than both reads.
    shortestRead = min(len(read1[1]), len(read2[1]))
    for skip in range(maxLen):
        tailLen = maxLen - skip
        if tailLen >= shortestRead:
            continue
        tailDist1 = util.editDistance(read1[1][-tailLen:], rcStart2[skip:])
        tailDist2 = util.editDistance(read2[1][-tailLen:], rcStart1[skip:])
        # if the tail of one end matches the start of the other end we
        # suspect the template is shorter than the read, so trim both ends
        allowed = tailLen // 5
        if tailDist1 > allowed or tailDist2 > allowed:
            continue
        for read in (read1, read2):
            read[1] = read[1][:-tailLen]
            read[3] = read[3][:-tailLen]
        return tailLen
    return 0
Esempio n. 2
0
def cleanBarcodeTail(read1, read2, readStart1, readStart2):
    """Remove a tail that mirrors the mate's start from both reads, in place.

    For each candidate length, longest first, the tail of ``read1[1]`` is
    compared with the reverse complement of ``readStart2`` and the tail of
    ``read2[1]`` with the reverse complement of ``readStart1``. When both
    edit distances are within one fifth of the candidate length, that many
    trailing items are dropped from index 1 and index 3 of each read.

    Returns the trimmed length, or 0 when no candidate length matched.
    """
    startRc1 = util.reverseComplement(readStart1)
    startRc2 = util.reverseComplement(readStart2)
    longest = min(len(readStart1), len(readStart2))
    len1 = len(read1[1])
    len2 = len(read2[1])
    trimmed = 0
    for candidate in range(longest, 0, -1):
        # only consider tails strictly shorter than both reads
        if candidate < len1 and candidate < len2:
            offset = longest - candidate
            d1 = util.editDistance(read1[1][-candidate:], startRc2[offset:])
            d2 = util.editDistance(read2[1][-candidate:], startRc1[offset:])
            # if the tail of one end matches the start of the other end,
            # the template is suspected to be shorter than the read length,
            # so both ends are trimmed
            limit = candidate // 5
            if d1 <= limit and d2 <= limit:
                read1[1] = read1[1][:-candidate]
                read1[3] = read1[3][:-candidate]
                read2[1] = read2[1][:-candidate]
                read2[3] = read2[3][:-candidate]
                trimmed = candidate
                break
    return trimmed
Esempio n. 3
0
 def scoreSeqAtPos(self, seq, startIdx, reverseComplement=False):
     """
         Score the window seq[startIdx:startIdx+self.pwmSize] against this PWM.
         If startIdx is < 0, or the window would run past the end of seq,
         0.0 is returned instead of a score. This is useful when scores are
         generated at a fixed set of positions for PWMs of different lengths:
         a PWM that does not fit at a position simply contributes 0.
         With reverseComplement=True the window is read from its last
         position backwards and each letter is passed through
         util.reverseComplement before lookup.
         A letter 'N'/'n' that is absent from self.letterToIndex is skipped;
         every other letter adds its log-row entry minus its log-background.
         Raises RuntimeError if the PWM has not been finalised.
     """
     endIdx = startIdx + self.pwmSize
     if not self._finalised:
         raise RuntimeError("Please call finalised on " + str(self.name))
     assert hasattr(self, '_logRows')
     if startIdx < 0 or endIdx > len(seq):
         return 0.0  # the requested window does not fit inside seq
     total = 0
     for offset in range(self.pwmSize):
         if reverseComplement:
             letter = util.reverseComplement(seq[endIdx - 1 - offset])
         else:
             letter = seq[startIdx + offset]
         if letter in ('N', 'n') and letter not in self.letterToIndex:
             continue  # just skip the letter
         # add the score contributed by this position
         total += self._logRows[offset, self.letterToIndex[letter]] - self._logBackground[letter]
     return total
Esempio n. 4
0
 def strandBiasPlotly(self, div, title=""):
     """Return JavaScript source that draws the strand-bias scatter plot.

     Each sampled entry of ``self.topKmerCount`` contributes one point: x is
     the k-mer's count in ``self.kmerCount`` and y is the count of its
     reverse complement. The first ``shift`` entries (at most 50) are skipped
     and at most 1000 evenly spaced entries are sampled.

     ``div`` is the element id handed to ``Plotly.newPlot`` and ``title`` the
     plot title. Both are pasted into the script without escaping.

     Returns an empty string when ``self.readLen`` is 0.
     """
     if self.readLen == 0:
         return ""
     total = len(self.topKmerCount)
     # floor division keeps shift and step usable as list indices
     shift = min(50, total // 2)
     # we only sample 1000 points for performance issue
     top = min(total - shift, 1000)
     forward = [0] * top
     reverse = [0] * top
     # top is 0 when there are no k-mers at all; do not divide by it
     step = (total - shift) // top if top else 1
     if step == 0:
         step = 1
     maxValue = 0
     for i in range(top):
         index = i * step + shift
         if index >= total:
             break
         kmer = self.topKmerCount[index][0]
         forward[i] = self.kmerCount[kmer]
         reverse[i] = self.kmerCount[util.reverseComplement(kmer)]
         maxValue = max(forward[i], reverse[i], maxValue)
     parts = [
         "var data=[",
         "{",
         "x:[" + ",".join(map(str, forward)) + "],",
         "y:[" + ",".join(map(str, reverse)) + "],",
         "mode:'markers',",
         "type:'scatter',\n",
         "marker:{size:2, color:'rgba(0,0,50,128)'}\n",
         "}];",
         "var layout={title:'" + title + "', xaxis:{title:'relative forward strand KMER count', range:",
         makeRange(-10, maxValue),
         "}, yaxis:{title:'relative reverse strand KMER count', range:",
         makeRange(-10, maxValue),
         "}};\n",
         "Plotly.newPlot('" + div + "', data, layout);\n",
     ]
     return "".join(parts)
Esempio n. 5
0
 def strandBiasPlotly(self, div, title=""):
     """Build the Plotly script for the forward/reverse k-mer count scatter.

     Entries of ``self.topKmerCount`` after the first ``shift`` (at most 50)
     are sampled at a fixed step so that no more than 1000 points are
     plotted. For every sampled k-mer the x value is ``self.kmerCount[kmer]``
     and the y value is the count stored for its reverse complement.

     ``title`` and ``div`` are concatenated into the script as-is (no
     escaping is performed).

     Returns "" when ``self.readLen`` is 0, otherwise the script text.
     """
     if self.readLen == 0:
         return ""
     kmerTotal = len(self.topKmerCount)
     shift = min(50, kmerTotal // 2)
     # we only sample 1000 points for performance issue
     top = min(kmerTotal - shift, 1000)
     forward = [0] * top
     reverse = [0] * top
     if top == 0:
         # nothing to sample; avoids dividing by zero below
         step = 1
     else:
         # step is at least 1 so consecutive samples are distinct entries
         step = max((kmerTotal - shift) // top, 1)
     maxValue = 0
     for i in range(top):
         index = i * step + shift
         if index >= kmerTotal:
             break
         kmer = self.topKmerCount[index][0]
         forward[i] = self.kmerCount[kmer]
         reverse[i] = self.kmerCount[util.reverseComplement(kmer)]
         maxValue = max(max(forward[i], reverse[i]), maxValue)
     json_str = "var data=["
     json_str += "{"
     json_str += "x:[" + ",".join(map(str, forward)) + "],"
     json_str += "y:[" + ",".join(map(str, reverse)) + "],"
     json_str += "mode:'markers',"
     json_str += "type:'scatter',\n"
     json_str += "marker:{size:2, color:'rgba(0,0,50,128)'}\n"
     json_str += "}];"
     json_str += "var layout={title:'" + title + "', xaxis:{title:'relative forward strand KMER count', range:" + makeRange(-10, maxValue) + "}, yaxis:{title:'relative reverse strand KMER count', range:" + makeRange(-10, maxValue) + "}};\n"
     json_str += "Plotly.newPlot('" + div + "', data, layout);\n"
     return json_str
Esempio n. 6
0
def findProtein(dna, p):
    """Print every window of ``dna`` that translates to protein ``p``.

    A window is ``3 * len(p)`` characters long. It is printed (as it appears
    in ``dna``) when either the window itself or its reverse complement,
    converted with ``dna2rna`` and ``rna2protein``, equals ``p``.

    Calls ``sys.exit()`` when ``dna`` is shorter than one window.
    """
    span = 3 * len(p)
    if len(dna) < span:
        sys.exit()
    for start in range(len(dna) - span + 1):
        window = dna[start:start + span]
        # the reverse strand is only translated when the forward strand misses
        if (rna2protein(dna2rna(window)) == p
                or rna2protein(dna2rna(reverseComplement(window))) == p):
            print(window)
Esempio n. 7
0
    def statRead(self, read):
        """Accumulate per-position and k-mer statistics for one read.

        ``read[1]`` is used as the base string and ``read[3]`` as the quality
        string. For every position the method updates ``totalNum``,
        ``totalQual``, the per-base ``baseCounts`` / ``baseTotalQual`` tables
        (for bases in ``ALL_BASES``) and ``totalDiscontinuity``. It then bumps
        ``gcHistogram`` at the read's G/C count and counts the read's k-mers
        in ``kmerCount``, making sure each newly seen k-mer's reverse
        complement also has an entry.

        A position whose quality value cannot be decoded is skipped after
        ``totalNum`` has been incremented; the first time this happens for a
        file whose name ends in "bz2" a warning is printed.
        """
        global WARNED_BZIP2_ERROR
        seq = read[1]
        qual = read[3]
        seqlen = len(seq)
        gc = 0
        for i in range(seqlen):
            self.totalNum[i] += 1
            try:
                qnum = util.qualNum(qual[i])
            except Exception:
                # Deliberate best effort: skip the position, warn only once.
                if self.filename.endswith("bz2") and not WARNED_BZIP2_ERROR:
                    WARNED_BZIP2_ERROR = True
                    print(
                        "WARNING: Incompatible bzip2 format, please note that the file compressed with pbzip2 may have problem. Please compress it with bzip2 insteadly.\n"
                    )
                continue
            self.totalQual[i] += qnum
            b = seq[i]
            if b == 'G' or b == 'C':
                gc += 1
            if b in ALL_BASES:
                self.baseCounts[b][i] += 1
                self.baseTotalQual[b][i] += qnum

            # calculate discontinuity in a window of up to 5 bases around i;
            # near either end the window is shifted inwards, and it is
            # clamped to the read so reads shorter than 5 bases stay in range
            left = i - 2
            right = i + 3
            if left < 0:
                left = 0
                right = min(5, seqlen)
            elif right >= seqlen:
                right = seqlen
                left = max(0, seqlen - 5)
            discontinuity = 0
            for j in range(left, right - 1):
                if seq[j] != seq[j + 1]:
                    discontinuity += 1
            self.totalDiscontinuity[i] += discontinuity

        self.gcHistogram[gc] += 1
        # NOTE(review): seqlen - kmerLen windows are visited, so the window
        # ending on the last base is not counted -- confirm this is intended.
        for i in range(seqlen - self.kmerLen):
            self.totalKmer += 1
            kmer = seq[i:i + self.kmerLen]
            if kmer in self.kmerCount:
                self.kmerCount[kmer] += 1
            else:
                self.kmerCount[kmer] = 1
                # keep a slot for the reverse complement so later lookups work
                rcKmer = util.reverseComplement(kmer)
                if rcKmer not in self.kmerCount:
                    self.kmerCount[rcKmer] = 0
Esempio n. 8
0
def getPwmSample(options):
    """Draw one sequence from ``options.pwm`` according to the sampling mode.

    In ``PWM_SAMPLING_MODE.default`` the sequence and its generation
    probability come from ``options.pwm.sampleFromPwm()``; in
    ``PWM_SAMPLING_MODE.bestHit`` the sequence is ``options.pwm.bestHit``
    with probability 1.0. Any other mode raises RuntimeError.

    With probability ``options.reverseComplementProb`` the sequence is
    replaced by its reverse complement; the returned probability is left
    unchanged by that step.

    Returns a ``(seq, generationProb)`` tuple.
    """
    mode = options.pwmSamplingMode
    if mode == PWM_SAMPLING_MODE.default:
        seq, generationProb = options.pwm.sampleFromPwm()
    elif mode == PWM_SAMPLING_MODE.bestHit:
        seq = options.pwm.bestHit
        generationProb = 1.0
    else:
        raise RuntimeError("Unsupported pwm sampling mode: " + str(options.pwmSamplingMode))

    # randomly flip the sample onto the opposite strand
    if random.random() < options.reverseComplementProb:
        seq = util.reverseComplement(seq)
    return seq, generationProb
 def action(inp, lineNumber):
     """Process one split line of a k-mer count table.

     Line 1 (the header) is written through unchanged, and a mapping from
     each k-mer column index to the column index of its reverse complement
     is stored in ``reverseComplementIndexMappingWrapper.var``.

     Every other line is ``[id, count, count, ...]``: each column's count is
     added to the column of its reverse complement (on top of that column's
     own count) and the resulting row is written tab-separated.
     """
     if lineNumber != 1:
         theId = inp[0]
         counts = [int(field) for field in inp[1:]]
         combined = list(counts)  # start from a copy of the original counts
         partnerOf = reverseComplementIndexMappingWrapper.var
         for idx, count in enumerate(counts):
             combined[partnerOf[idx]] += count
         outputFileHandle.write(theId)
         outputFileHandle.write("\t")
         outputFileHandle.write("\t".join(str(value) for value in combined))
         outputFileHandle.write("\n")
         return
     outputFileHandle.write("\t".join(inp))
     outputFileHandle.write("\n")
     kmers = inp[1:]
     indexOfKmer = {kmer: idx for idx, kmer in enumerate(kmers)}
     reverseComplementIndexMappingWrapper.var = {
         idx: indexOfKmer[util.reverseComplement(kmer)]
         for idx, kmer in enumerate(kmers)
     }
Esempio n. 10
0
    def statRead(self, read):
        """Accumulate per-position and k-mer statistics for one read.

        ``read[1]`` is used as the base string and ``read[3]`` as the quality
        string. Per position this updates ``totalNum``, ``totalQual``, the
        per-base ``baseCounts`` / ``baseTotalQual`` tables (for bases in
        ``ALL_BASES``) and ``totalDiscontinuity``. Afterwards ``gcHistogram``
        is incremented at the read's G/C count and the read's k-mers are
        counted in ``kmerCount``; a newly seen k-mer also gets a zero entry
        for its reverse complement if that is not present yet.
        """
        seq = read[1]
        qual = read[3]
        seqlen = len(seq)
        gc = 0
        for i in range(seqlen):
            self.totalNum[i] += 1
            qnum = util.qualNum(qual[i])
            self.totalQual[i] += qnum
            b = seq[i]
            if b == 'G' or b == 'C':
                gc += 1
            if b in ALL_BASES:
                self.baseCounts[b][i] += 1
                self.baseTotalQual[b][i] += qnum

            # calculate discontinuity in a window of up to 5 bases around i;
            # near either end the window is shifted inwards, and it is
            # clamped to the read so reads shorter than 5 bases stay in range
            left = i - 2
            right = i + 3
            if left < 0:
                left = 0
                right = min(5, seqlen)
            elif right >= seqlen:
                right = seqlen
                left = max(0, seqlen - 5)
            discontinuity = 0
            for j in range(left, right - 1):
                if seq[j] != seq[j + 1]:
                    discontinuity += 1
            self.totalDiscontinuity[i] += discontinuity

        self.gcHistogram[gc] += 1
        # NOTE(review): seqlen - kmerLen windows are visited, so the window
        # ending on the last base is not counted -- confirm this is intended.
        for i in range(seqlen - self.kmerLen):
            self.totalKmer += 1
            kmer = seq[i:i + self.kmerLen]
            if kmer in self.kmerCount:
                self.kmerCount[kmer] += 1
            else:
                self.kmerCount[kmer] = 1
                # keep a slot for the reverse complement so later lookups work
                rcKmer = util.reverseComplement(kmer)
                if rcKmer not in self.kmerCount:
                    self.kmerCount[rcKmer] = 0
Esempio n. 11
0
    def statRead(self, read):
        """Fold one read into the running quality-control statistics.

        The bases are taken from ``read[1]`` and the quality characters from
        ``read[3]``. For each position the coverage (``totalNum``), summed
        quality (``totalQual``), per-base counters (only for bases in
        ``ALL_BASES``) and local discontinuity are updated. The read's G/C
        count selects the ``gcHistogram`` bin to increment, and its k-mers of
        length ``self.kmerLen`` are tallied in ``kmerCount``, with a zero
        entry added for the reverse complement of each newly seen k-mer.
        """
        seq = read[1]
        qual = read[3]
        seqlen = len(seq)
        gc = 0
        for i in range(seqlen):
            self.totalNum[i] += 1
            qnum = util.qualNum(qual[i])
            self.totalQual[i] += qnum
            base = seq[i]
            if base == 'G' or base == 'C':
                gc += 1
            if base in ALL_BASES:
                self.baseCounts[base][i] += 1
                self.baseTotalQual[base][i] += qnum

            # Discontinuity = number of adjacent unequal pairs in a window of
            # up to 5 bases around i. The window slides inwards at the read
            # ends and is clamped so that reads shorter than 5 bases neither
            # index past the end nor wrap around through negative indices.
            if i - 2 < 0:
                left = 0
                right = min(5, seqlen)
            elif i + 3 >= seqlen:
                left = max(0, seqlen - 5)
                right = seqlen
            else:
                left = i - 2
                right = i + 3
            self.totalDiscontinuity[i] += sum(
                1 for j in range(left, right - 1) if seq[j] != seq[j + 1])

        self.gcHistogram[gc] += 1
        # NOTE(review): only seqlen - kmerLen windows are visited, so the
        # window ending on the last base is skipped -- confirm intended.
        for start in range(seqlen - self.kmerLen):
            self.totalKmer += 1
            kmer = seq[start:start + self.kmerLen]
            if kmer in self.kmerCount:
                self.kmerCount[kmer] += 1
            else:
                self.kmerCount[kmer] = 1
                # make sure the opposite strand's k-mer can be looked up too
                rcKmer = util.reverseComplement(kmer)
                if rcKmer not in self.kmerCount:
                    self.kmerCount[rcKmer] = 0
Esempio n. 12
0
    def plotStrandBias(self, filename, prefix=""):
        """Save a forward-vs-reverse k-mer count scatter plot to ``filename``.

        The first ``min(50, len(topKmerCount) / 2)`` entries of
        ``self.topKmerCount`` are skipped. For each remaining k-mer the x
        value is its count in ``self.kmerCount`` and the y value is the count
        of its reverse complement. Both axes run from -10 to the largest
        plotted count, and ``prefix`` is prepended to the plot title.
        """
        total = len(self.topKmerCount)
        skipped = min(50, total // 2)
        forward = []
        reverse = []
        for pos in range(skipped, total):
            kmer = self.topKmerCount[pos][0]
            forward.append(self.kmerCount[kmer])
            reverse.append(self.kmerCount[util.reverseComplement(kmer)])
        # the axis limit never drops below 0, even with nothing to plot
        maxValue = max([0] + forward + reverse)

        plt.figure(1)
        plt.xlim(-10, maxValue)
        plt.ylim(-10, maxValue)
        plt.title(prefix + " strand bias")
        plt.xlabel('Relative forward kmer')
        plt.ylabel('Relative reverse kmer')
        plt.scatter(forward, reverse, label='Discontinuity', alpha=0.4, s=0.3)
        plt.savefig(filename)
        plt.close(1)
Esempio n. 13
0
    def plotStrandBias(self, filename, prefix=""):
        """Write the strand-bias scatter plot to ``filename``.

        Every entry of ``self.topKmerCount`` after the first ``shift`` (at
        most 50, and at most half of the entries) becomes one point: the
        k-mer's own count on the x axis against its reverse complement's
        count on the y axis, both read from ``self.kmerCount``. The axes span
        -10 to the largest count plotted; the title is ``prefix`` followed by
        " strand bias".
        """
        kmerTotal = len(self.topKmerCount)
        shift = min(50, kmerTotal // 2)
        top = kmerTotal - shift
        forward = [0] * top
        reverse = [0] * top
        maxValue = 0
        for i in range(top):
            kmer = self.topKmerCount[i + shift][0]
            fwd = self.kmerCount[kmer]
            rev = self.kmerCount[util.reverseComplement(kmer)]
            forward[i], reverse[i] = fwd, rev
            maxValue = max(fwd, rev, maxValue)

        # figure number 1 is opened here and closed again after saving
        plt.figure(1)
        plt.xlim(-10, maxValue)
        plt.ylim(-10, maxValue)
        plt.title(prefix + " strand bias")
        plt.xlabel('Relative forward strand KMER count')
        plt.ylabel('Relative reverse strand KMER count')
        plt.scatter(forward, reverse, label='Discontinuity', alpha=0.4, s=0.3)
        plt.savefig(filename)
        plt.close(1)
 def action(inp, lineNumber):
     """Update running per-feature statistics from one split table line.

     Line 1 (the header) stores the feature names, zeroes the sum and
     sum-of-squares accumulators and, when ``options.revComp`` is set, builds
     the index -> reverse-complement-index mapping.

     Every other line is parsed as integer counts (first field skipped). With
     ``options.revComp`` each count is also added to the column of its
     reverse complement. The region counter, sum, sum of squares, and
     element-wise minimum and maximum are then updated.
     """
     if lineNumber == 1:
         featuresWrapper.var = inp[1:]
         numFeatures = len(inp) - 1
         sumWrapper.var = np.zeros(numFeatures)
         sumSquaresWrapper.var = np.zeros(numFeatures)
         if options.revComp:
             positionOf = {name: pos for pos, name in enumerate(featuresWrapper.var)}
             reverseComplementIndexMappingWrapper.var = {
                 pos: positionOf[util.reverseComplement(name)]
                 for pos, name in enumerate(featuresWrapper.var)
             }
         return
     totalRegionsWrapper.var += 1
     counts = np.array([int(field) for field in inp[1:]])  # TODO: generalise to not-just-ints
     if options.revComp:
         merged = counts.copy()
         for pos, value in enumerate(counts):
             merged[reverseComplementIndexMappingWrapper.var[pos]] += value
         counts = merged
     sumWrapper.var += counts
     sumSquaresWrapper.var += np.square(counts)
     if totalRegionsWrapper.var == 1:
         # the first data row seeds the running extremes
         minWrapper.var = counts.copy()
         maxWrapper.var = counts.copy()
     minWrapper.var = np.minimum(minWrapper.var, counts)
     maxWrapper.var = np.maximum(maxWrapper.var, counts)
Esempio n. 15
0
    def run(self):
        if self.options.debubble:
            self.loadBubbleCircles()

        #read1_file is required
        read1_file = fastq.Reader(self.options.read1_file)
        #create a QC folder to contains QC results
        qc_base_folder = os.path.join(os.path.dirname(self.options.read1_file), "QC")
        if not os.path.exists(qc_base_folder):
            os.makedirs(qc_base_folder)
        #QC result of this file/pair
        qc_dir =  os.path.join(qc_base_folder, os.path.basename(self.options.read1_file))
        if not os.path.exists(qc_dir):
            os.makedirs(qc_dir)

        #no front trim if sequence is barcoded
        if self.options.barcode:
            self.options.trim_front = 0

        reporter = QCReporter()

        r1qc_prefilter = QualityControl(self.options.qc_sample, self.options.qc_kmer)
        r2qc_prefilter = QualityControl(self.options.qc_sample, self.options.qc_kmer)
        r1qc_prefilter.statFile(self.options.read1_file)
        r1qc_prefilter.plot(qc_dir, "R1-prefilter")
        if self.options.read2_file != None:
            r2qc_prefilter.statFile(self.options.read2_file)
            r2qc_prefilter.plot(qc_dir, "R2-prefilter")

        r1qc_postfilter = QualityControl(self.options.qc_sample, self.options.qc_kmer)
        r2qc_postfilter = QualityControl(self.options.qc_sample, self.options.qc_kmer)

        readLen = r1qc_prefilter.readLen
        overlap_histgram = [0 for x in xrange(readLen+1)]
        distance_histgram = [0 for x in xrange(readLen+1)]

        #auto detect trim front and trim tail
        if self.options.trim_front == -1 or self.options.trim_tail == -1:
            #auto trim for read1
            trimFront, trimTail = r1qc_prefilter.autoTrim()
            if self.options.trim_front == -1:
                self.options.trim_front = trimFront
            if self.options.trim_tail == -1:
                self.options.trim_tail = trimTail
            #auto trim for read2
            if self.options.read2_file != None:
                # check if we should keep same trimming for read1/read2 to keep their length identical
                # this option is on by default because lots of dedup algorithms require this feature
                if self.options.trim_pair_same:
                    self.options.trim_front2 = self.options.trim_front
                    self.options.trim_tail2 = self.options.trim_tail
                else:
                    trimFront2, trimTail2 = r2qc_prefilter.autoTrim()
                    if self.options.trim_front2 == -1:
                        self.options.trim_front2 = trimFront2
                    if self.options.trim_tail2 == -1:
                        self.options.trim_tail2 = trimTail2
                
        print(self.options.read1_file + " options:")
        print(self.options)
        
        #if good output folder not specified, set it as the same folder of read1 file
        good_dir = self.options.good_output_folder
        if good_dir == None:
            good_dir = os.path.dirname(self.options.read1_file)

        #if bad output folder not specified, set it as the same folder of read1 file            
        bad_dir = self.options.bad_output_folder
        if bad_dir == None:
            bad_dir = os.path.dirname(self.options.read1_file)

        #if overlap output folder not specified, set it as the same folder of read1 file
        overlap_dir = self.options.overlap_output_folder
        if overlap_dir == None:
            overlap_dir = os.path.dirname(self.options.read1_file)
            
        if not os.path.exists(good_dir):
            os.makedirs(good_dir)
            
        if not os.path.exists(bad_dir):
            os.makedirs(bad_dir)

        if self.options.store_overlap and self.options.read2_file != None and (not os.path.exists(overlap_dir)):
            os.makedirs(overlap_dir)
        
        good_read1_file = None
        bad_read1_file = None
        overlap_read1_file = None
        if not self.options.qc_only:
            good_read1_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.read1_file)+".good.fq"))
            bad_read1_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.read1_file)+".bad.fq"))

            overlap_read1_file = None
            if self.options.store_overlap:
                overlap_read1_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.read1_file)+".overlap.fq"))
        
        #other files are optional
        read2_file = None
        good_read2_file = None
        bad_read2_file = None
        overlap_read2_file = None

        index1_file = None
        good_index1_file = None
        bad_index1_file = None
        overlap_index1_file = None

        index2_file = None
        good_index2_file = None
        bad_index2_file = None
        overlap_index2_file = None
        
        #if other files are specified, then read them
        if self.options.read2_file != None:
            read2_file = fastq.Reader(self.options.read2_file)
            if not self.options.qc_only:
                good_read2_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.read2_file)+".good.fq"))
                bad_read2_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.read2_file)+".bad.fq"))
                if self.options.store_overlap and self.options.read2_file != None:
                    overlap_read2_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.read2_file)+".overlap.fq"))
        if self.options.index1_file != None:
            index1_file = fastq.Reader(self.options.index1_file)
            if not self.options.qc_only:
                good_index1_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.index1_file)+".good.fq"))
                bad_index1_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.index1_file)+".bad.fq"))
                if self.options.store_overlap and self.options.read2_file != None:
                    overlap_index1_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.index1_file)+".overlap.fq"))
        if self.options.index2_file != None:
            index2_file = fastq.Reader(self.options.index2_file)
            if not self.options.qc_only:
                good_index2_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.index2_file)+".good.fq"))
                bad_index2_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.index2_file)+".bad.fq"))
                if self.options.store_overlap and self.options.read2_file != None:
                    overlap_index2_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.index2_file)+".overlap.fq"))
            
        r1 = None
        r2 = None
        i1 = None
        i2 = None

        # stat numbers
        TOTAL = 0
        GOOD = 0
        BAD = 0
        BADBCD1 = 0
        BADBCD2 = 0
        BADTRIM1 = 0
        BADTRIM2 = 0
        BADBBL = 0
        BADLEN = 0
        BADPOL = 0
        BADLQC = 0
        BADNCT = 0
        BADOL = 0
        BADINDEL = 0
        BADMISMATCH = 0
        BASE_CORRECTED = 0
        OVERLAPPED = 0
        OVERLAP_LEN_SUM = 0

        while True:
            r1 = read1_file.nextRead()
            if r1==None:
                break
                
            if read2_file != None:
                r2 = read2_file.nextRead()
                if r2==None:
                    break
            if index1_file != None:
                i1 = index1_file.nextRead()
                if i1==None:
                    break
            if index2_file != None:
                i2 = index2_file.nextRead()
                if i2==None:
                    break

            TOTAL += 1
                    
            #barcode processing
            if self.options.barcode:
                barcodeLen1 = barcodeprocesser.detectBarcode(r1[1], self.options.barcode_length, self.options.barcode_verify)
                if barcodeLen1 == 0:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADBCD1")
                    BADBCD1 += 1
                    continue
                else:
                    if r2 == None:
                        barcodeprocesser.moveBarcodeToName(r1, self.options.barcode_length, self.options.barcode_verify)
                    else:
                        barcodeLen2 = barcodeprocesser.detectBarcode(r2[1], self.options.barcode_length, self.options.barcode_verify)
                        if barcodeLen2 == 0:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADBCD2")
                            BADBCD2 += 1
                            continue
                        else:
                            barcodeprocesser.moveAndTrimPair(r1, r2, barcodeLen1, barcodeLen2, self.options.barcode_verify)
            
            #trim
            if self.options.trim_front > 0 or self.options.trim_tail > 0:
                r1 = trim(r1, self.options.trim_front, self.options.trim_tail)
                if len(r1[1]) < 5:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADTRIM1")
                    BADTRIM1 += 1
                    continue
                if r2 != None:
                    r2 = trim(r2, self.options.trim_front2, self.options.trim_tail2)
                    if len(r2[1]) < 5:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADTRIM2")
                        BADTRIM2 += 1
                        continue

            #filter debubble
            if self.options.debubble:
                if self.isInBubble(r1[0]):
                    self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADBBL")
                    BADBBL += 1
                    continue
            
            #filter sequence length
            if len(r1[1])<self.options.seq_len_req:
                self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADLEN")
                BADLEN += 1
                continue
                    
            #check polyX
            if self.options.poly_size_limit > 0:
                poly1 = hasPolyX(r1[1], self.options.poly_size_limit, self.options.allow_mismatch_in_poly)
                poly2 = None
                if r2!=None:
                    poly2 = hasPolyX(r2[1], self.options.poly_size_limit, self.options.allow_mismatch_in_poly)
                if poly1!=None or poly2!=None:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADPOL")
                    BADPOL += 1
                    continue
            
            #check low quality count
            if self.options.unqualified_base_limit > 0:
                lowQual1 = lowQualityNum(r1, self.options.qualified_quality_phred)
                lowQual2 = 0
                if r2!=None:
                    lowQual2 = lowQualityNum(r2, self.options.qualified_quality_phred)
                if lowQual1 > self.options.unqualified_base_limit or lowQual1 > self.options.unqualified_base_limit:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADLQC")
                    BADLQC += 1
                    continue
            
            #check N number
            if self.options.n_base_limit > 0:
                nNum1 = nNumber(r1)
                nNum2 = 0
                if r2!=None:
                    nNum2 = nNumber(r2)
                if nNum1 > self.options.n_base_limit or nNum2 > self.options.n_base_limit:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADNCT")
                    BADNCT += 1
                    continue

            #check overlap and do error correction
            if r2!=None:
                (offset, overlap_len, distance) = util.overlap(r1[1], r2[1])
                overlap_histgram[overlap_len] += 1
                # deal with the case insert DNA is shorter than read length and cause offset is negative
                if offset <0 and overlap_len > 30:
                    # shift the junk bases
                    r1[1] = r1[1][0:overlap_len]
                    r1[3] = r1[3][0:overlap_len]
                    r2[1] = r2[1][-offset:-offset+overlap_len]
                    r2[3] = r2[3][-offset:-offset+overlap_len]
                    # then calc overlap again
                    (offset, overlap_len, distance) = util.overlap(r1[1], r2[1])
                if overlap_len>30:
                    OVERLAPPED += 1
                    distance_histgram[distance] += 1
                    OVERLAP_LEN_SUM += overlap_len
                    corrected = 0
                    if distance > 2:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADOL")
                        BADOL += 1
                        continue
                    elif distance>0:
                        #try to fix low quality base
                        hamming = util.hammingDistance(r1[1][len(r1[1]) - overlap_len:], util.reverseComplement(r2[1][len(r2[1]) - overlap_len:]))
                        if hamming != distance:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADINDEL")
                            BADINDEL += 1
                            continue
                        #print(r1[1][len(r1[1]) - overlap_len:])
                        #print(util.reverseComplement(r2[1][len(r2[1]) - overlap_len:]))
                        #print(r1[3][len(r1[1]) - overlap_len:])
                        #print(util.reverse(r2[3][len(r2[1]) - overlap_len:]))
                        for o in xrange(overlap_len):
                            b1 = r1[1][len(r1[1]) - overlap_len + o]
                            b2 = util.complement(r2[1][-o-1])
                            q1 = r1[3][len(r1[3]) - overlap_len + o]
                            q2 = r2[3][-o-1]
                            if b1 != b2:
                                # print(TOTAL, o, b1, b2, q1, q2)
                                if util.qualNum(q1) >= 27 and util.qualNum(q2) <= 16:
                                    r2[1] = util.changeString(r2[1], -o-1, util.complement(b1))
                                    r2[3] = util.changeString(r2[3], -o-1, q1)
                                    corrected += 1
                                elif util.qualNum(q2) >= 27 and util.qualNum(q1) <= 16:
                                    r1[1]= util.changeString(r1[1], len(r1[1]) - overlap_len + o, b2)
                                    r1[3] = util.changeString(r1[3], len(r1[3]) - overlap_len + o, q2)
                                    corrected += 1
                                if corrected >= distance:
                                    break
                        #print(r1[1][len(r1[1]) - overlap_len:])
                        #print(util.reverseComplement(r2[1][len(r2[1]) - overlap_len:]))
                        #print(r1[3][len(r1[1]) - overlap_len:])
                        #print(util.reverse(r2[3][len(r2[1]) - overlap_len:]))
                        if corrected == distance:
                            BASE_CORRECTED += 1
                        else:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADMISMATCH")
                            BADMISMATCH += 1
                            continue
                    if distance == 0 or distance == corrected:
                        if self.options.store_overlap:
                            self.writeReads(getOverlap(r1, overlap_len), getOverlap(r2, overlap_len), i1, i2, overlap_read1_file, overlap_read2_file, overlap_index1_file, overlap_index2_file, None)

            #write to good       
            self.writeReads(r1, r2, i1, i2, good_read1_file, good_read2_file, good_index1_file, good_index2_file, None)
            r1qc_postfilter.statRead(r1)
            if r2 != None:
                r2qc_postfilter.statRead(r2)

            GOOD += 1
            if self.options.qc_only and TOTAL >= self.options.qc_sample:
                break

        r1qc_postfilter.qc()
        r1qc_postfilter.plot(qc_dir, "R1-postfilter")
        if self.options.read2_file != None:
            r2qc_postfilter.qc()
            r2qc_postfilter.plot(qc_dir, "R2-postfilter")
        
        #close all files
        if not self.options.qc_only:
            good_read1_file.flush()
            bad_read1_file.flush()
            if self.options.read2_file != None:
                good_read2_file.flush()
                bad_read2_file.flush()
            if self.options.index1_file != None:
                good_index1_file.flush()
                bad_index1_file.flush()
            if self.options.index2_file != None:
                good_index2_file.flush()
                bad_index2_file.flush()

        # print stat numbers
        BAD = TOTAL - GOOD
        result = {}
        result['total_reads']=TOTAL
        result['good_reads']=GOOD
        result['bad_reads']=BAD
        result['bad_reads_with_bad_barcode']= BADBCD1 + BADBCD2
        result['bad_reads_with_reads_in_bubble']= BADBBL
        result['bad_reads_with_bad_read_length']= BADLEN + BADTRIM1 + BADTRIM2
        result['bad_reads_with_polyX']= BADPOL
        result['bad_reads_with_low_quality']=BADLQC
        result['bad_reads_with_too_many_N']= BADNCT
        result['bad_reads_with_bad_overlap']= BADOL + BADMISMATCH + BADINDEL

        # plot result bar figure
        labels = ['good reads', 'has_polyX', 'low_quality', 'too_short', 'too_many_N']
        counts = [GOOD, BADPOL, BADLQC, BADLEN + BADTRIM1 + BADTRIM2, BADNCT]
        colors = ['green', '#FF1111', '#FF3333', '#FF5555', '#FF7777']
        if self.options.read2_file != None:
            labels.append('bad_overlap')
            counts.append(BADOL + BADMISMATCH + BADINDEL)
            colors.append('#FF9999')
        if self.options.debubble:
            labels.append('in_bubble')
            counts.append(BADBBL)
            colors.append('#FFBBBB')
        if self.options.barcode:
            labels.append('bad_barcode')
            counts.append(BADBCD1 + BADBCD2)
            colors.append('#FFDDDD')

        fig = plt.figure(1)
        plt.title("Good reads (green) and bad reads (red) of total " + str(TOTAL))
        fig.subplots_adjust(left = 0.14)
        lefts = xrange(len(counts))
        plt.yticks(lefts, labels)
        plt.ylim(-0.5, len(counts)-0.5)
        plt.barh(lefts, counts, align='center', height=0.5, alpha=0.8, color=colors)
        plt.savefig(os.path.join(qc_dir, "filter-stat.png"))
        plt.close(1)

        stat={}
        # stat["options"]=self.options
        stat["summary"]=result
        stat["command"]=makeDict(self.options)
        stat["kmer_content"] = {}
        stat["kmer_content"]["read1_prefilter"] = r1qc_prefilter.topKmerCount[0:10]
        stat["kmer_content"]["read1_postfilter"] = r1qc_postfilter.topKmerCount[0:10]
        if self.options.read2_file != None:
            stat["kmer_content"]["read2_prefilter"] = r2qc_prefilter.topKmerCount[0:10]
            stat["kmer_content"]["read2_postfilter"] = r2qc_postfilter.topKmerCount[0:10]
            stat["overlap"]={}
            stat["overlap"]['overlapped_pairs']=OVERLAPPED
            if OVERLAPPED > 0:
                stat["overlap"]['average_overlap_length']=float(OVERLAP_LEN_SUM/OVERLAPPED)
            else:
                stat["overlap"]['average_overlap_length']=0.0
            stat["overlap"]['bad_edit_distance']=BADOL
            stat["overlap"]['bad_mismatch_bases']=BADMISMATCH
            stat["overlap"]['bad_indel']=BADINDEL
            stat["overlap"]['reads_with_corrected_mismatch_bases']=BASE_CORRECTED
            stat["overlap"]['overlapped_area_edit_distance_histogram']=distance_histgram[0:10]
            plotOverlapHistgram(overlap_histgram, readLen, TOTAL, os.path.join(qc_dir, "overlap.png"))

        stat_file = open(os.path.join(qc_dir, "after.json"), "w")
        stat_json = json.dumps(stat, sort_keys=True,indent=4, separators=(',', ': '))
        stat_file.write(stat_json)
        stat_file.close()

        self.addFiguresToReport(reporter)
        reporter.output(os.path.join(qc_dir, "report.html"))
Esempio n. 16
0
if __name__ == '__main__':
    # Script entry point: count, for every k-mer window of the genome, the
    # members of its d-closure, then look for the largest combined count of a
    # pattern and its reverse complement.
    #
    # The file is only read, so open it read-only ('r+' would needlessly
    # require write permission on the FASTA file).
    with open('/Users/evgeny/Downloads/Salmonella_enterica.fasta', 'r') as f:
        label, s = readFASTA(f)
    k, d = (9, 2)

    # work on a list of single characters so that windows can be sliced
    s = list(s)
    len_s = len(s)

    with Timer() as t:
        # dc maps each closure member to the number of windows producing it
        dc = {}
        for i in range(len_s - k + 1):
            ss = s[i:i + k]
            closure = dClosure(ss, d)

            rs = list(reverseComplement(''.join(ss)))
            # NOTE(review): if dClosure returns a built-in set, union() builds
            # a new set and leaves `closure` untouched, so this result is
            # discarded. Behaviour is kept as-is because the loop below already
            # adds the reverse-complement count (f + rf); confirm the intent
            # before making the union effective or deleting this call.
            closure.union(dClosure(rs, d))
            for c in closure:
                dc[c] = dc.get(c, 0) + 1
        print('d-closure size: %d' % len(dc))

        m_value = 0
        res = []
        # `count` (not `f`) so the file handle name above is not shadowed
        for x, count in dc.items():
            rx = reverseComplement(x)
            rc_count = dc.get(rx, 0)
            if count + rc_count > m_value:
                m_value = count + rc_count
Esempio n. 17
0
 def action(input,i): #the input is the value of the line after preprocess, filter and transformation
     """Tally one input line: bump its category count and feed its sequence
     (forward and reverse-complemented) to every profiler of that category.

     `i` is accepted but not used by this function.
     """
     category = categoryFromInput(input)
     sequence = sequenceFromInput(input)

     # per-category line counter, starting from zero on first sight
     categoryCounts[category] = categoryCounts.get(category, 0) + 1

     for factory in countProfilerFactories:
         # one {category: profiler} map per profiler name, created lazily
         perCategory = profilerName_to_categoryToCountMaps.setdefault(
             factory.profilerName, {})
         if category not in perCategory:
             perCategory[category] = factory.getCountProfiler()
         profiler = perCategory[category]
         # both strands are profiled: the sequence, then its reverse complement
         profiler.process(sequence)
         profiler.process(util.reverseComplement(sequence))
Esempio n. 18
0
    def run(self):
        """Filter the configured FASTQ file(s) and write QC results.

        Steps, in order:
          1. collect pre-filter QC statistics for read1 (and read2 if given);
          2. resolve automatic trimming settings (options set to -1);
          3. open good/bad (and optionally overlap) writers unless
             ``options.qc_only`` is set;
          4. for every read (pair): barcode handling, trimming, debubble,
             length, polyX, low-quality, N-count and pair-overlap checks;
             rejected reads go to the "bad" writers tagged with a reason,
             the others to the "good" writers;
          5. collect post-filter QC, then write ``filter-stat.png``,
             ``after.json`` and ``report.html`` into the QC folder that is
             created next to the read1 file.

        All settings are read from ``self.options``; the trim options may be
        overwritten here (barcode mode forces ``trim_front`` to 0, and -1
        values are replaced by auto-detected ones).
        """
        if self.options.debubble:
            self.loadBubbleCircles()

        #read1_file is required
        read1_file = fastq.Reader(self.options.read1_file)
        #create a QC folder to contains QC results
        qc_base_folder = os.path.join(os.path.dirname(self.options.read1_file),
                                      "QC")
        if not os.path.exists(qc_base_folder):
            os.makedirs(qc_base_folder)
        #QC result of this file/pair
        qc_dir = os.path.join(qc_base_folder,
                              os.path.basename(self.options.read1_file))
        if not os.path.exists(qc_dir):
            os.makedirs(qc_dir)

        #no front trim if sequence is barcoded
        if self.options.barcode:
            self.options.trim_front = 0

        reporter = QCReporter()

        r1qc_prefilter = QualityControl(self.options.qc_sample,
                                        self.options.qc_kmer)
        r2qc_prefilter = QualityControl(self.options.qc_sample,
                                        self.options.qc_kmer)
        r1qc_prefilter.statFile(self.options.read1_file)
        r1qc_prefilter.plot(qc_dir, "R1-prefilter")
        if self.options.read2_file != None:
            r2qc_prefilter.statFile(self.options.read2_file)
            r2qc_prefilter.plot(qc_dir, "R2-prefilter")

        r1qc_postfilter = QualityControl(self.options.qc_sample,
                                         self.options.qc_kmer)
        r2qc_postfilter = QualityControl(self.options.qc_sample,
                                         self.options.qc_kmer)

        # histograms are indexed by overlap length / edit distance, both of
        # which are bounded here by the read length taken from pre-filter QC
        readLen = r1qc_prefilter.readLen
        overlap_histgram = [0 for x in xrange(readLen + 1)]
        distance_histgram = [0 for x in xrange(readLen + 1)]

        #auto detect trim front and trim tail
        if self.options.trim_front == -1 or self.options.trim_tail == -1:
            #auto trim for read1
            trimFront, trimTail = r1qc_prefilter.autoTrim()
            if self.options.trim_front == -1:
                self.options.trim_front = trimFront
            if self.options.trim_tail == -1:
                self.options.trim_tail = trimTail
            #auto trim for read2
            if self.options.read2_file != None:
                # check if we should keep same trimming for read1/read2 to keep their length identical
                # this option is on by default because lots of dedup algorithms require this feature
                if self.options.trim_pair_same:
                    self.options.trim_front2 = self.options.trim_front
                    self.options.trim_tail2 = self.options.trim_tail
                else:
                    trimFront2, trimTail2 = r2qc_prefilter.autoTrim()
                    if self.options.trim_front2 == -1:
                        self.options.trim_front2 = trimFront2
                    if self.options.trim_tail2 == -1:
                        self.options.trim_tail2 = trimTail2

        print(self.options.read1_file + " options:")
        print(self.options)

        #if good output folder not specified, set it as the same folder of read1 file
        good_dir = self.options.good_output_folder
        if good_dir == None:
            good_dir = os.path.dirname(self.options.read1_file)

        #if bad output folder not specified, set it as the same folder of read1 file
        bad_dir = self.options.bad_output_folder
        if bad_dir == None:
            bad_dir = os.path.dirname(self.options.read1_file)

        #if overlap output folder not specified, set it as the same folder of read1 file
        overlap_dir = self.options.overlap_output_folder
        if overlap_dir == None:
            overlap_dir = os.path.dirname(self.options.read1_file)

        if not os.path.exists(good_dir):
            os.makedirs(good_dir)

        if not os.path.exists(bad_dir):
            os.makedirs(bad_dir)

        if self.options.store_overlap and self.options.read2_file != None and (
                not os.path.exists(overlap_dir)):
            os.makedirs(overlap_dir)

        # writers stay None in qc_only mode
        good_read1_file = None
        bad_read1_file = None
        overlap_read1_file = None
        if not self.options.qc_only:
            good_read1_file = fastq.Writer(
                os.path.join(good_dir,
                             getMainName(self.options.read1_file) +
                             ".good.fq"))
            bad_read1_file = fastq.Writer(
                os.path.join(bad_dir,
                             getMainName(self.options.read1_file) + ".bad.fq"))

            if self.options.store_overlap:
                overlap_read1_file = fastq.Writer(
                    os.path.join(
                        overlap_dir,
                        getMainName(self.options.read1_file) + ".overlap.fq"))

        #other files are optional
        read2_file = None
        good_read2_file = None
        bad_read2_file = None
        overlap_read2_file = None

        index1_file = None
        good_index1_file = None
        bad_index1_file = None
        overlap_index1_file = None

        index2_file = None
        good_index2_file = None
        bad_index2_file = None
        overlap_index2_file = None

        #if other files are specified, then read them
        if self.options.read2_file != None:
            read2_file = fastq.Reader(self.options.read2_file)
            if not self.options.qc_only:
                good_read2_file = fastq.Writer(
                    os.path.join(
                        good_dir,
                        getMainName(self.options.read2_file) + ".good.fq"))
                bad_read2_file = fastq.Writer(
                    os.path.join(
                        bad_dir,
                        getMainName(self.options.read2_file) + ".bad.fq"))
                if self.options.store_overlap and self.options.read2_file != None:
                    overlap_read2_file = fastq.Writer(
                        os.path.join(
                            overlap_dir,
                            getMainName(self.options.read2_file) +
                            ".overlap.fq"))
        if self.options.index1_file != None:
            index1_file = fastq.Reader(self.options.index1_file)
            if not self.options.qc_only:
                good_index1_file = fastq.Writer(
                    os.path.join(
                        good_dir,
                        getMainName(self.options.index1_file) + ".good.fq"))
                bad_index1_file = fastq.Writer(
                    os.path.join(
                        bad_dir,
                        getMainName(self.options.index1_file) + ".bad.fq"))
                if self.options.store_overlap and self.options.read2_file != None:
                    overlap_index1_file = fastq.Writer(
                        os.path.join(
                            overlap_dir,
                            getMainName(self.options.index1_file) +
                            ".overlap.fq"))
        if self.options.index2_file != None:
            index2_file = fastq.Reader(self.options.index2_file)
            if not self.options.qc_only:
                good_index2_file = fastq.Writer(
                    os.path.join(
                        good_dir,
                        getMainName(self.options.index2_file) + ".good.fq"))
                bad_index2_file = fastq.Writer(
                    os.path.join(
                        bad_dir,
                        getMainName(self.options.index2_file) + ".bad.fq"))
                if self.options.store_overlap and self.options.read2_file != None:
                    overlap_index2_file = fastq.Writer(
                        os.path.join(
                            overlap_dir,
                            getMainName(self.options.index2_file) +
                            ".overlap.fq"))

        r1 = None
        r2 = None
        i1 = None
        i2 = None

        # stat numbers (one counter per rejection reason)
        TOTAL = 0
        GOOD = 0
        BAD = 0
        BADBCD1 = 0
        BADBCD2 = 0
        BADTRIM1 = 0
        BADTRIM2 = 0
        BADBBL = 0
        BADLEN = 0
        BADPOL = 0
        BADLQC = 0
        BADNCT = 0
        BADOL = 0
        BADINDEL = 0
        BADMISMATCH = 0
        BASE_CORRECTED = 0
        OVERLAPPED = 0
        OVERLAP_LEN_SUM = 0

        while True:
            # stop as soon as any of the input files runs out of records
            r1 = read1_file.nextRead()
            if r1 == None:
                break

            if read2_file != None:
                r2 = read2_file.nextRead()
                if r2 == None:
                    break
            if index1_file != None:
                i1 = index1_file.nextRead()
                if i1 == None:
                    break
            if index2_file != None:
                i2 = index2_file.nextRead()
                if i2 == None:
                    break

            TOTAL += 1

            #barcode processing
            if self.options.barcode:
                barcodeLen1 = barcodeprocesser.detectBarcode(
                    r1[1], self.options.barcode_length,
                    self.options.barcode_verify)
                if barcodeLen1 == 0:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADBCD1")
                    BADBCD1 += 1
                    continue
                else:
                    if r2 == None:
                        barcodeprocesser.moveBarcodeToName(
                            r1, self.options.barcode_length,
                            self.options.barcode_verify)
                    else:
                        barcodeLen2 = barcodeprocesser.detectBarcode(
                            r2[1], self.options.barcode_length,
                            self.options.barcode_verify)
                        if barcodeLen2 == 0:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                            bad_read2_file, bad_index1_file,
                                            bad_index2_file, "BADBCD2")
                            BADBCD2 += 1
                            continue
                        else:
                            barcodeprocesser.moveAndTrimPair(
                                r1, r2, barcodeLen1, barcodeLen2,
                                self.options.barcode_verify)

            #trim
            if self.options.trim_front > 0 or self.options.trim_tail > 0:
                r1 = trim(r1, self.options.trim_front, self.options.trim_tail)
                if len(r1[1]) < 5:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADTRIM1")
                    BADTRIM1 += 1
                    continue
                if r2 != None:
                    r2 = trim(r2, self.options.trim_front2,
                              self.options.trim_tail2)
                    if len(r2[1]) < 5:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                        bad_read2_file, bad_index1_file,
                                        bad_index2_file, "BADTRIM2")
                        BADTRIM2 += 1
                        continue

            #filter debubble
            if self.options.debubble:
                if self.isInBubble(r1[0]):
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADBBL")
                    BADBBL += 1
                    continue

            #filter sequence length
            if len(r1[1]) < self.options.seq_len_req:
                self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                                bad_index1_file, bad_index2_file, "BADLEN")
                BADLEN += 1
                continue

            #check polyX
            if self.options.poly_size_limit > 0:
                poly1 = hasPolyX(r1[1], self.options.poly_size_limit,
                                 self.options.allow_mismatch_in_poly)
                poly2 = None
                if r2 != None:
                    poly2 = hasPolyX(r2[1], self.options.poly_size_limit,
                                     self.options.allow_mismatch_in_poly)
                if poly1 != None or poly2 != None:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADPOL")
                    BADPOL += 1
                    continue

            #check low quality count
            if self.options.unqualified_base_limit > 0:
                lowQual1 = lowQualityNum(r1,
                                         self.options.qualified_quality_phred)
                lowQual2 = 0
                if r2 != None:
                    lowQual2 = lowQualityNum(
                        r2, self.options.qualified_quality_phred)
                # reject the pair if EITHER read exceeds the limit (the second
                # operand previously re-tested lowQual1, so read2 was ignored)
                if lowQual1 > self.options.unqualified_base_limit or lowQual2 > self.options.unqualified_base_limit:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADLQC")
                    BADLQC += 1
                    continue

            #check N number
            if self.options.n_base_limit > 0:
                nNum1 = nNumber(r1)
                nNum2 = 0
                if r2 != None:
                    nNum2 = nNumber(r2)
                if nNum1 > self.options.n_base_limit or nNum2 > self.options.n_base_limit:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                    bad_read2_file, bad_index1_file,
                                    bad_index2_file, "BADNCT")
                    BADNCT += 1
                    continue

            #check overlap and do error correction
            if r2 != None:
                (offset, overlap_len, distance) = util.overlap(r1[1], r2[1])
                overlap_histgram[overlap_len] += 1
                # deal with the case insert DNA is shorter than read length and cause offset is negative
                if offset < 0 and overlap_len > 30:
                    # shift the junk bases
                    r1[1] = r1[1][0:overlap_len]
                    r1[3] = r1[3][0:overlap_len]
                    r2[1] = r2[1][-offset:-offset + overlap_len]
                    r2[3] = r2[3][-offset:-offset + overlap_len]
                    # then calc overlap again
                    (offset, overlap_len,
                     distance) = util.overlap(r1[1], r2[1])
                if overlap_len > 30:
                    OVERLAPPED += 1
                    distance_histgram[distance] += 1
                    OVERLAP_LEN_SUM += overlap_len
                    corrected = 0
                    if distance > 2:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                        bad_read2_file, bad_index1_file,
                                        bad_index2_file, "BADOL")
                        BADOL += 1
                        continue
                    elif distance > 0:
                        #try to fix low quality base
                        # if hamming distance differs from the reported
                        # distance the difference is not pure substitutions,
                        # so the pair is rejected as BADINDEL
                        hamming = util.hammingDistance(
                            r1[1][len(r1[1]) - overlap_len:],
                            util.reverseComplement(r2[1][len(r2[1]) -
                                                         overlap_len:]))
                        if hamming != distance:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                            bad_read2_file, bad_index1_file,
                                            bad_index2_file, "BADINDEL")
                            BADINDEL += 1
                            continue
                        for o in xrange(overlap_len):
                            # position o of the r1 overlap pairs with the
                            # o-th base from the END of r2 (complemented)
                            b1 = r1[1][len(r1[1]) - overlap_len + o]
                            b2 = util.complement(r2[1][-o - 1])
                            q1 = r1[3][len(r1[3]) - overlap_len + o]
                            q2 = r2[3][-o - 1]
                            if b1 != b2:
                                # only correct when one side is high quality
                                # (>= 27) and the other is low (<= 16)
                                if util.qualNum(q1) >= 27 and util.qualNum(
                                        q2) <= 16:
                                    r2[1] = util.changeString(
                                        r2[1], -o - 1, util.complement(b1))
                                    r2[3] = util.changeString(
                                        r2[3], -o - 1, q1)
                                    corrected += 1
                                elif util.qualNum(q2) >= 27 and util.qualNum(
                                        q1) <= 16:
                                    r1[1] = util.changeString(
                                        r1[1],
                                        len(r1[1]) - overlap_len + o, b2)
                                    r1[3] = util.changeString(
                                        r1[3],
                                        len(r1[3]) - overlap_len + o, q2)
                                    corrected += 1
                                if corrected >= distance:
                                    break
                        if corrected == distance:
                            BASE_CORRECTED += 1
                        else:
                            self.writeReads(r1, r2, i1, i2, bad_read1_file,
                                            bad_read2_file, bad_index1_file,
                                            bad_index2_file, "BADMISMATCH")
                            BADMISMATCH += 1
                            continue
                    if distance == 0 or distance == corrected:
                        if self.options.store_overlap:
                            self.writeReads(getOverlap(r1, overlap_len),
                                            getOverlap(r2, overlap_len), i1,
                                            i2, overlap_read1_file,
                                            overlap_read2_file,
                                            overlap_index1_file,
                                            overlap_index2_file, None)

            #write to good
            self.writeReads(r1, r2, i1, i2, good_read1_file, good_read2_file,
                            good_index1_file, good_index2_file, None)
            if self.options.qc_sample <= 0 or TOTAL < self.options.qc_sample:
                r1qc_postfilter.statRead(r1)
                if r2 != None:
                    r2qc_postfilter.statRead(r2)

            GOOD += 1
            if self.options.qc_only and TOTAL >= self.options.qc_sample:
                break

        r1qc_postfilter.qc()
        r1qc_postfilter.plot(qc_dir, "R1-postfilter")
        if self.options.read2_file != None:
            r2qc_postfilter.qc()
            r2qc_postfilter.plot(qc_dir, "R2-postfilter")

        #flush all output files
        if not self.options.qc_only:
            good_read1_file.flush()
            bad_read1_file.flush()
            if self.options.read2_file != None:
                good_read2_file.flush()
                bad_read2_file.flush()
            if self.options.index1_file != None:
                good_index1_file.flush()
                bad_index1_file.flush()
            if self.options.index2_file != None:
                good_index2_file.flush()
                bad_index2_file.flush()
            # overlap writers only exist when store_overlap is enabled
            for overlap_file in (overlap_read1_file, overlap_read2_file,
                                 overlap_index1_file, overlap_index2_file):
                if overlap_file is not None:
                    overlap_file.flush()

        # print stat numbers
        BAD = TOTAL - GOOD
        result = {}
        result['total_reads'] = TOTAL
        result['good_reads'] = GOOD
        result['bad_reads'] = BAD
        result['bad_reads_with_bad_barcode'] = BADBCD1 + BADBCD2
        result['bad_reads_with_reads_in_bubble'] = BADBBL
        result['bad_reads_with_bad_read_length'] = BADLEN + BADTRIM1 + BADTRIM2
        result['bad_reads_with_polyX'] = BADPOL
        result['bad_reads_with_low_quality'] = BADLQC
        result['bad_reads_with_too_many_N'] = BADNCT
        result['bad_reads_with_bad_overlap'] = BADOL + BADMISMATCH + BADINDEL

        # plot result pie figure
        labels = [
            'good reads', 'has_polyX', 'low_quality', 'too_short', 'too_many_N'
        ]
        counts = [GOOD, BADPOL, BADLQC, BADLEN + BADTRIM1 + BADTRIM2, BADNCT]
        colors = ['#66BB11', '#FF33AF', '#FFD3F2', '#FFA322', '#FF8899']
        if self.options.read2_file != None:
            labels.append('bad_overlap')
            counts.append(BADOL + BADMISMATCH + BADINDEL)
            colors.append('#FF6600')
        if self.options.debubble:
            labels.append('in_bubble')
            counts.append(BADBBL)
            colors.append('#EEBB00')
        if self.options.barcode:
            labels.append('bad_barcode')
            counts.append(BADBCD1 + BADBCD2)
            colors.append('#CCDD22')

        for i in xrange(len(counts)):
            # avoid ZeroDivisionError when no reads were processed
            if TOTAL > 0:
                percent = 100.0 * float(counts[i]) / TOTAL
            else:
                percent = 0.0
            labels[i] = labels[i] + ": " + str(counts[i]) + "(" + str(
                percent) + "%)"

        fig = plt.figure(1)
        plt.title("Filtering statistics of sampled " + str(TOTAL) + " reads",
                  fontsize=12,
                  color='#666666')
        plt.axis('equal')
        patches, texts = plt.pie(counts, colors=colors, radius=0.7)
        # legend entries are ordered by descending count
        patches, labels, dummy = zip(*sorted(
            zip(patches, labels, counts), key=lambda x: x[2], reverse=True))
        plt.legend(patches, labels, loc='upper left', fontsize=9)
        plt.savefig(os.path.join(qc_dir, "filter-stat.png"),
                    bbox_inches='tight')
        plt.close(1)

        stat = {}
        stat["summary"] = result
        stat["command"] = makeDict(self.options)
        stat["kmer_content"] = {}
        stat["kmer_content"]["read1_prefilter"] = r1qc_prefilter.topKmerCount[
            0:10]
        stat["kmer_content"][
            "read1_postfilter"] = r1qc_postfilter.topKmerCount[0:10]
        if self.options.read2_file != None:
            stat["kmer_content"][
                "read2_prefilter"] = r2qc_prefilter.topKmerCount[0:10]
            stat["kmer_content"][
                "read2_postfilter"] = r2qc_postfilter.topKmerCount[0:10]
            stat["overlap"] = {}
            stat["overlap"]['overlapped_pairs'] = OVERLAPPED
            if OVERLAPPED > 0:
                # convert BEFORE dividing: int / int truncates on Python 2
                stat["overlap"]['average_overlap_length'] = float(
                    OVERLAP_LEN_SUM) / OVERLAPPED
            else:
                stat["overlap"]['average_overlap_length'] = 0.0
            stat["overlap"]['bad_edit_distance'] = BADOL
            stat["overlap"]['bad_mismatch_bases'] = BADMISMATCH
            stat["overlap"]['bad_indel'] = BADINDEL
            stat["overlap"][
                'reads_with_corrected_mismatch_bases'] = BASE_CORRECTED
            stat["overlap"][
                'overlapped_area_edit_distance_histogram'] = distance_histgram[
                    0:10]
            plotOverlapHistgram(overlap_histgram, readLen, TOTAL,
                                os.path.join(qc_dir, "overlap.png"))

        stat_json = json.dumps(stat,
                               sort_keys=True,
                               indent=4,
                               separators=(',', ': '))
        # context manager closes the file even if the write fails
        with open(os.path.join(qc_dir, "after.json"), "w") as stat_file:
            stat_file.write(stat_json)

        self.addFiguresToReport(reporter)
        reporter.output(os.path.join(qc_dir, "report.html"))
Esempio n. 19
0
    def run(self):
        """Filter the configured FASTQ input and write reads, stats and reports.

        Reads read1 (required) together with the optional read2, index1 and
        index2 files in lock step. Every record passes through the enabled
        filters in this order: barcode, trimming, debubble, sequence length,
        polyX, low-quality base count, N count and (paired input only) overlap
        analysis with optional mismatch correction/masking. A record that
        fails a filter is written to the "bad" outputs tagged with the filter
        name; all others go to the "good" outputs.

        Afterwards ``report.json`` and ``report.html`` are written into the QC
        folder for this input file.

        Side effects: creates the output folders, may overwrite the trimming
        settings stored on ``self.options``, sets the ``r{1,2}qc_{pre,post}filter``
        attributes and prints the effective options.
        """
        if self.options.debubble:
            self.loadBubbleCircles()

        # read1_file is required, every other input is optional
        read1_file = fastq.Reader(self.options.read1_file)
        paired = self.options.read2_file is not None

        # no front trim if sequence is barcoded
        if self.options.barcode:
            self.options.trim_front = 0

        reporter = QCReporter()

        self.r1qc_prefilter = QualityControl(self.options.qc_sample,
                                             self.options.qc_kmer)
        self.r2qc_prefilter = QualityControl(self.options.qc_sample,
                                             self.options.qc_kmer)
        self.r1qc_prefilter.statFile(self.options.read1_file)
        if paired:
            self.r2qc_prefilter.statFile(self.options.read2_file)

        self.r1qc_postfilter = QualityControl(self.options.qc_sample,
                                              self.options.qc_kmer)
        self.r2qc_postfilter = QualityControl(self.options.qc_sample,
                                              self.options.qc_kmer)

        readLen = self.r1qc_prefilter.readLen
        overlap_histgram = [0] * (readLen + 1)
        distance_histgram = [0] * (readLen + 1)

        # a trim value of -1 requests auto detection from the prefilter QC
        if self.options.trim_front == -1 or self.options.trim_tail == -1:
            # auto trim for read1
            trimFront, trimTail = self.r1qc_prefilter.autoTrim()
            if self.options.trim_front == -1:
                self.options.trim_front = trimFront
            if self.options.trim_tail == -1:
                self.options.trim_tail = trimTail
            # auto trim for read2
            if paired:
                # check if we should keep same trimming for read1/read2 to keep their length identical
                # this option is on by default because lots of dedup algorithms require this feature
                if self.options.trim_pair_same:
                    self.options.trim_front2 = self.options.trim_front
                    self.options.trim_tail2 = self.options.trim_tail
                else:
                    trimFront2, trimTail2 = self.r2qc_prefilter.autoTrim()
                    if self.options.trim_front2 == -1:
                        self.options.trim_front2 = trimFront2
                    if self.options.trim_tail2 == -1:
                        self.options.trim_tail2 = trimTail2

        print(self.options.read1_file + " options:")
        print(self.options)

        # if good output folder not specified, use the folder of the read1 file
        good_dir = self.options.good_output_folder
        if good_dir is None:
            # dirname() is "" for a bare file name; fall back to the working
            # directory so we never call makedirs("") or derive "/" as parent
            good_dir = os.path.dirname(
                self.options.read1_file) or os.getcwd()

        # unspecified bad/overlap/QC folders default to siblings of good_dir
        parent_dir = os.path.dirname(os.path.dirname(good_dir + "/"))

        bad_dir = self.options.bad_output_folder
        if bad_dir is None:
            bad_dir = os.path.join(parent_dir, "bad")

        overlap_dir = self.options.overlap_output_folder
        if overlap_dir is None:
            overlap_dir = os.path.join(parent_dir, "overlap")

        # QC results go to <QC base>/<read1 file name>/
        qc_base_folder = self.options.report_output_folder
        if qc_base_folder is None:
            qc_base_folder = os.path.join(parent_dir, "QC")
        qc_dir = os.path.join(qc_base_folder,
                              os.path.basename(self.options.read1_file))

        needed_dirs = [qc_base_folder, qc_dir, good_dir, bad_dir]
        if self.options.store_overlap and paired:
            needed_dirs.append(overlap_dir)
        for folder in needed_dirs:
            if not os.path.exists(folder):
                os.makedirs(folder)

        def open_writer(folder, source, suffix):
            # output file name = main name of the source file + suffix
            return fastq.Writer(
                os.path.join(folder, getMainName(source) + suffix))

        good_read1_file = None
        bad_read1_file = None
        overlap_read1_file = None
        if not self.options.qc_only:
            good_read1_file = open_writer(good_dir, self.options.read1_file,
                                          ".good.fq")
            bad_read1_file = open_writer(bad_dir, self.options.read1_file,
                                         ".bad.fq")
            # NOTE(review): this writer is opened even for single-end input,
            # although overlap_dir is only created for paired input - confirm
            if self.options.store_overlap:
                overlap_read1_file = open_writer(overlap_dir,
                                                 self.options.read1_file,
                                                 ".overlap.fq")

        # other files are optional
        read2_file = None
        good_read2_file = None
        bad_read2_file = None
        overlap_read2_file = None

        index1_file = None
        good_index1_file = None
        bad_index1_file = None
        overlap_index1_file = None

        index2_file = None
        good_index2_file = None
        bad_index2_file = None
        overlap_index2_file = None

        # if other files are specified, then read them
        if paired:
            read2_file = fastq.Reader(self.options.read2_file)
            if not self.options.qc_only:
                good_read2_file = open_writer(good_dir,
                                              self.options.read2_file,
                                              ".good.fq")
                bad_read2_file = open_writer(bad_dir, self.options.read2_file,
                                             ".bad.fq")
                if self.options.store_overlap:
                    overlap_read2_file = open_writer(overlap_dir,
                                                     self.options.read2_file,
                                                     ".overlap.fq")
        if self.options.index1_file is not None:
            index1_file = fastq.Reader(self.options.index1_file)
            if not self.options.qc_only:
                good_index1_file = open_writer(good_dir,
                                               self.options.index1_file,
                                               ".good.fq")
                bad_index1_file = open_writer(bad_dir,
                                              self.options.index1_file,
                                              ".bad.fq")
                if self.options.store_overlap and paired:
                    overlap_index1_file = open_writer(
                        overlap_dir, self.options.index1_file, ".overlap.fq")
        if self.options.index2_file is not None:
            index2_file = fastq.Reader(self.options.index2_file)
            if not self.options.qc_only:
                good_index2_file = open_writer(good_dir,
                                               self.options.index2_file,
                                               ".good.fq")
                bad_index2_file = open_writer(bad_dir,
                                              self.options.index2_file,
                                              ".bad.fq")
                if self.options.store_overlap and paired:
                    overlap_index2_file = open_writer(
                        overlap_dir, self.options.index2_file, ".overlap.fq")

        r1 = None
        r2 = None
        i1 = None
        i2 = None

        # stat numbers
        TOTAL_BASES = 0
        GOOD_BASES = 0
        TOTAL_READS = 0
        GOOD_READS = 0
        # number of rejected records per filter tag
        rejected = dict.fromkeys(
            ("BADBCD1", "BADBCD2", "BADTRIM1", "BADTRIM2", "BADBBL", "BADLEN",
             "BADPOL", "BADLQC", "BADNCT", "BADINDEL", "BADMISMATCH"), 0)
        READ_CORRECTED = 0
        BASE_CORRECTED = 0
        BASE_ZERO_QUAL_MASKED = 0
        OVERLAPPED = 0
        OVERLAP_LEN_SUM = 0
        OVERLAP_BASE_SUM = 0
        # error profiling by overlap analysis
        OVERLAP_BASE_ERR = 0
        OVERLAP_ERR_MATRIX = init_error_matrix()

        # adapter trimming by overlap analysis
        TRIMMED_ADAPTER_BASE = 0

        def reject(tag):
            # Write the current record to the bad outputs and count it.
            # r1/r2/i1/i2 are looked up at call time, so the values of the
            # iteration in progress are used.
            self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                            bad_index1_file, bad_index2_file, tag)
            rejected[tag] += 1

        while True:
            r1 = read1_file.nextRead()
            if r1 is None:
                break
            TOTAL_BASES += len(r1[1])

            if read2_file is not None:
                r2 = read2_file.nextRead()
                if r2 is None:
                    break
                # count read2 bases whenever read2 exists, independent of
                # whether an index2 file was supplied
                TOTAL_BASES += len(r2[1])
            if index1_file is not None:
                i1 = index1_file.nextRead()
                if i1 is None:
                    break
            if index2_file is not None:
                i2 = index2_file.nextRead()
                if i2 is None:
                    break

            TOTAL_READS += 1

            # barcode processing
            if self.options.barcode:
                barcodeLen1 = barcodeprocesser.detectBarcode(
                    r1[1], self.options.barcode_length,
                    self.options.barcode_verify)
                if barcodeLen1 == 0:
                    reject("BADBCD1")
                    continue
                if r2 is None:
                    barcodeprocesser.moveBarcodeToName(
                        r1, self.options.barcode_length,
                        self.options.barcode_verify)
                else:
                    barcodeLen2 = barcodeprocesser.detectBarcode(
                        r2[1], self.options.barcode_length,
                        self.options.barcode_verify)
                    if barcodeLen2 == 0:
                        reject("BADBCD2")
                        continue
                    barcodeprocesser.moveAndTrimPair(
                        r1, r2, barcodeLen1, barcodeLen2,
                        self.options.barcode_verify)

            # trim; reads left with fewer than 5 bases are rejected
            if self.options.trim_front > 0 or self.options.trim_tail > 0:
                r1 = trim(r1, self.options.trim_front, self.options.trim_tail)
                if len(r1[1]) < 5:
                    reject("BADTRIM1")
                    continue
                if r2 is not None:
                    r2 = trim(r2, self.options.trim_front2,
                              self.options.trim_tail2)
                    if len(r2[1]) < 5:
                        reject("BADTRIM2")
                        continue

            # filter debubble
            if self.options.debubble and self.isInBubble(r1[0]):
                reject("BADBBL")
                continue

            # filter sequence length
            if len(r1[1]) < self.options.seq_len_req:
                reject("BADLEN")
                continue

            # check polyX
            if self.options.poly_size_limit > 0:
                poly1 = hasPolyX(r1[1], self.options.poly_size_limit,
                                 self.options.allow_mismatch_in_poly)
                poly2 = None
                if r2 is not None:
                    poly2 = hasPolyX(r2[1], self.options.poly_size_limit,
                                     self.options.allow_mismatch_in_poly)
                if poly1 is not None or poly2 is not None:
                    reject("BADPOL")
                    continue

            # check low quality count of either read
            if self.options.unqualified_base_limit > 0:
                lowQual1 = lowQualityNum(r1,
                                         self.options.qualified_quality_phred)
                lowQual2 = 0
                if r2 is not None:
                    lowQual2 = lowQualityNum(
                        r2, self.options.qualified_quality_phred)
                if (lowQual1 > self.options.unqualified_base_limit
                        or lowQual2 > self.options.unqualified_base_limit):
                    reject("BADLQC")
                    continue

            # check N number
            if self.options.n_base_limit > 0:
                nNum1 = nNumber(r1)
                nNum2 = 0
                if r2 is not None:
                    nNum2 = nNumber(r2)
                if (nNum1 > self.options.n_base_limit
                        or nNum2 > self.options.n_base_limit):
                    reject("BADNCT")
                    continue

            # check overlap and do error correction
            if r2 is not None and (not self.options.no_overlap):
                (offset, overlap_len, distance) = util.overlap(r1[1], r2[1])
                overlap_histgram[overlap_len] += 1
                # deal with the case insert DNA is shorter than read length and cause offset is negative
                # in this case the adapter is sequenced and should be trimmed
                if offset < 0 and overlap_len > 30:
                    # keep only the overlapped part, dropping the junk bases
                    r1[1] = r1[1][0:overlap_len]
                    r1[3] = r1[3][0:overlap_len]
                    r2[1] = r2[1][0:overlap_len]
                    r2[3] = r2[3][0:overlap_len]
                    TRIMMED_ADAPTER_BASE += abs(offset) * 2
                    # check the sequence length again after adapter trimmed
                    if len(r1[1]) < self.options.seq_len_req:
                        reject("BADLEN")
                        continue
                    # then calc overlap again
                    (offset, overlap_len,
                     distance) = util.overlap(r1[1], r2[1])
                if overlap_len > 30:
                    OVERLAPPED += 1
                    distance_histgram[distance] += 1
                    OVERLAP_LEN_SUM += overlap_len
                    # we consider the distance is caused by sequencing error
                    OVERLAP_BASE_SUM += overlap_len * 2
                    OVERLAP_BASE_ERR += distance
                    corrected = 0
                    zero_qual_masked = 0
                    skipped_mismatch = 0
                    if distance > 0:
                        # a hamming distance that differs from the reported
                        # distance is treated as an indel: reject the pair
                        hamming = util.hammingDistance(
                            r1[1][len(r1[1]) - overlap_len:],
                            util.reverseComplement(r2[1][len(r2[1]) -
                                                         overlap_len:]))
                        if hamming != distance:
                            reject("BADINDEL")
                            continue
                        err_mtx = init_error_matrix()
                        for o in xrange(overlap_len):
                            # r1 is walked forward through its tail while r2
                            # is walked backward from its last base
                            pos1 = len(r1[1]) - overlap_len + o
                            qpos1 = len(r1[3]) - overlap_len + o
                            pos2 = -o - 1
                            b1 = r1[1][pos1]
                            b2 = util.complement(r2[1][pos2])
                            q1 = r1[3][qpos1]
                            q2 = r2[3][pos2]
                            if b1 == b2:
                                continue

                            qual1 = util.qualNum(q1)
                            qual2 = util.qualNum(q2)
                            this_is_corrected = False
                            if qual1 >= 30 and qual2 <= 14:
                                # r1 base is trusted, r2 base is suspect
                                if b1 != 'N' and b2 != 'N':
                                    err_mtx[util.complement(b1)][
                                        util.complement(b2)] += 1
                                if not self.options.no_correction:
                                    r2[1] = util.changeString(
                                        r2[1], pos2, util.complement(b1))
                                    r2[3] = util.changeString(
                                        r2[3], pos2, q1)
                                    corrected += 1
                                    this_is_corrected = True
                            elif qual2 >= 30 and qual1 <= 14:
                                # r2 base is trusted, r1 base is suspect
                                if b1 != 'N' and b2 != 'N':
                                    err_mtx[b2][b1] += 1
                                if not self.options.no_correction:
                                    r1[1] = util.changeString(
                                        r1[1], pos1, b2)
                                    r1[3] = util.changeString(
                                        r1[3], qpos1, q2)
                                    corrected += 1
                                    this_is_corrected = True
                            if not this_is_corrected:
                                if self.options.mask_mismatch:
                                    # mask them as zero qual if it is not corrected
                                    zero_qual = '!'
                                    r2[3] = util.changeString(
                                        r2[3], pos2, zero_qual)
                                    r1[3] = util.changeString(
                                        r1[3], qpos1, zero_qual)
                                    zero_qual_masked += 1
                                else:
                                    skipped_mismatch += 1

                            # stop once every reported mismatch is handled
                            if corrected + zero_qual_masked + skipped_mismatch >= distance:
                                break
                        if corrected + zero_qual_masked + skipped_mismatch == distance:
                            merge_error_matrix(OVERLAP_ERR_MATRIX, err_mtx)
                            if corrected > 0:
                                READ_CORRECTED += 1
                            BASE_CORRECTED += corrected
                            # multiply by 2 since we mask bases by pair
                            BASE_ZERO_QUAL_MASKED += zero_qual_masked * 2
                        else:
                            reject("BADMISMATCH")
                            continue
                    # only store overlaps that are clean or fully corrected
                    if distance == 0 or distance == corrected:
                        if self.options.store_overlap:
                            self.writeReads(getOverlap(r1, overlap_len),
                                            getOverlap(r2, overlap_len), i1,
                                            i2, overlap_read1_file,
                                            overlap_read2_file,
                                            overlap_index1_file,
                                            overlap_index2_file, None)

            # write to good
            self.writeReads(r1, r2, i1, i2, good_read1_file, good_read2_file,
                            good_index1_file, good_index2_file, None)
            GOOD_BASES += len(r1[1])
            if r2 is not None:
                GOOD_BASES += len(r2[1])
            if self.options.qc_sample <= 0 or TOTAL_READS < self.options.qc_sample:
                self.r1qc_postfilter.statRead(r1)
                if r2 is not None:
                    self.r2qc_postfilter.statRead(r2)

            GOOD_READS += 1
            if self.options.qc_only and TOTAL_READS >= self.options.qc_sample:
                break

        self.r1qc_postfilter.qc()
        if paired:
            self.r2qc_postfilter.qc()

        # flush every writer that was opened (all are None in qc_only mode),
        # including the overlap writers
        for writer in (good_read1_file, bad_read1_file, good_read2_file,
                       bad_read2_file, good_index1_file, bad_index1_file,
                       good_index2_file, bad_index2_file, overlap_read1_file,
                       overlap_read2_file, overlap_index1_file,
                       overlap_index2_file):
            if writer is not None:
                writer.flush()

        # summary numbers
        bad_barcode = rejected["BADBCD1"] + rejected["BADBCD2"]
        bad_length = (rejected["BADLEN"] + rejected["BADTRIM1"] +
                      rejected["BADTRIM2"])
        bad_overlap = rejected["BADMISMATCH"] + rejected["BADINDEL"]

        result = {}
        result['total_bases'] = TOTAL_BASES
        result['good_bases'] = GOOD_BASES
        result['total_reads'] = TOTAL_READS
        result['good_reads'] = GOOD_READS
        result['bad_reads'] = TOTAL_READS - GOOD_READS
        result['bad_reads_with_bad_barcode'] = bad_barcode
        result['bad_reads_with_reads_in_bubble'] = rejected["BADBBL"]
        result['bad_reads_with_bad_read_length'] = bad_length
        result['bad_reads_with_polyX'] = rejected["BADPOL"]
        result['bad_reads_with_low_quality'] = rejected["BADLQC"]
        result['bad_reads_with_too_many_N'] = rejected["BADNCT"]
        result['bad_reads_with_bad_overlap'] = bad_overlap
        result['readlen'] = readLen

        # data for the filter result figure
        labels = [
            'good reads', 'has_polyX', 'low_quality', 'too_short', 'too_many_N'
        ]
        counts = [
            GOOD_READS, rejected["BADPOL"], rejected["BADLQC"], bad_length,
            rejected["BADNCT"]
        ]
        if paired:
            labels.append('bad_overlap')
            counts.append(bad_overlap)
        if self.options.debubble:
            labels.append('in_bubble')
            counts.append(rejected["BADBBL"])
        if self.options.barcode:
            labels.append('bad_barcode')
            counts.append(bad_barcode)

        for i, count in enumerate(counts):
            type_percent = 0.0
            if TOTAL_READS > 0:
                type_percent = 100.0 * float(count) / TOTAL_READS
            labels[i] = labels[i] + ": " + str(count) + "(" + str(
                type_percent) + "%)"

        reporter.addFigure(
            'Good reads and bad reads after filtering',
            self.r1qc_prefilter.statPlotly(labels, counts, TOTAL_READS,
                                           'filter_stat'), 'filter_stat', "")

        stat = {}
        stat["summary"] = result
        stat["command"] = makeDict(self.options)
        stat["kmer_content"] = {}
        stat["kmer_content"][
            "read1_prefilter"] = self.r1qc_prefilter.topKmerCount[0:10]
        stat["kmer_content"][
            "read1_postfilter"] = self.r1qc_postfilter.topKmerCount[0:10]
        if paired:
            stat["kmer_content"][
                "read2_prefilter"] = self.r2qc_prefilter.topKmerCount[0:10]
            stat["kmer_content"][
                "read2_postfilter"] = self.r2qc_postfilter.topKmerCount[0:10]
            stat["overlap"] = {}
            stat["overlap"]['overlapped_pairs'] = OVERLAPPED
            if OVERLAPPED > 0:
                # convert before dividing so the average is not truncated by
                # integer division
                stat["overlap"]['average_overlap_length'] = float(
                    OVERLAP_LEN_SUM) / OVERLAPPED
            else:
                stat["overlap"]['average_overlap_length'] = 0.0
            stat["overlap"]['bad_mismatch_reads'] = rejected["BADMISMATCH"]
            stat["overlap"]['bad_indel_reads'] = rejected["BADINDEL"]
            stat["overlap"]['corrected_reads'] = READ_CORRECTED
            stat["overlap"]['corrected_bases'] = BASE_CORRECTED
            stat["overlap"]['zero_qual_masked'] = BASE_ZERO_QUAL_MASKED
            stat["overlap"]['trimmed_adapter_bases'] = TRIMMED_ADAPTER_BASE
            if OVERLAP_BASE_SUM > 0:
                stat["overlap"]['error_rate'] = float(
                    OVERLAP_BASE_ERR) / float(OVERLAP_BASE_SUM)
            else:
                stat["overlap"]['error_rate'] = 0.0
            stat["overlap"]['error_matrix'] = OVERLAP_ERR_MATRIX
            stat["overlap"]['edit_distance_histogram'] = distance_histgram[
                0:10]
            reporter.addFigure(
                'Sequence error distribution',
                self.r1qc_prefilter.errorPlotly(OVERLAP_ERR_MATRIX,
                                                'error_matrix'),
                'error_matrix', "")
            reporter.addFigure(
                'Overlap length distribution',
                self.r1qc_prefilter.overlapPlotly(overlap_histgram, readLen,
                                                  TOTAL_READS, 'overlap_stat'),
                'overlap_stat', "")

        stat_json = json.dumps(stat,
                               sort_keys=True,
                               indent=4,
                               separators=(',', ': '))
        # the context manager closes the file even if the write fails
        with open(os.path.join(qc_dir, "report.json"), "w") as stat_file:
            stat_file.write(stat_json)

        self.addFiguresToReport(reporter)
        reporter.setStat(stat)
        reporter.setVersion(self.options.version)
        reporter.output(os.path.join(qc_dir, "report.html"))