Esempio n. 1
0
    def _loadBarcodes(self):
        """
        Read barcode names, sequences, and pairs from the barcode Fasta file

        Each record of options.barcodeFilename is named by the first
        whitespace-delimited token of its header.  The record's sequence and
        the sequence of its reverse complement are both stored, keyed by
        (name, 'FORWARD') and (name, 'REVERSE').  Consecutive names are then
        grouped into pairs; a trailing unpaired name is left out of the pairs.

        On success the following attributes are set: _barcodeLength,
        _barcodeSequences, _barcodeNames, _barcodePairs, _barcodePairNames.

        Raises ValueError if a barcode name is repeated, if a sequence (in
        either orientation) has already been seen, if the barcodes are not all
        the same length, or if the file yields no barcodes at all.
        """
        logging.info("Loading barcodes")
        raw_seqs = set()    # 1. Every sequence seen so far, in both orientations
        seen_names = set()  # 2. Constant-time lookup for duplicate names
        names = []          # 3. All unique barcode names, in file order
        sequences = {}      # 4. (name, orientation) -> barcode sequence
        for barcode in FastaReader(options.barcodeFilename):
            name = barcode.name.strip().split()[0]

            # Check the barcode name
            if name in seen_names:
                raise ValueError("Duplicate barcode name in '{0}'".format(name))
            seen_names.add(name)
            names.append(name)

            # Check the forward sequence
            if barcode.sequence in raw_seqs:
                raise ValueError("Duplicate barcode sequence in '{0}'".format(name))
            raw_seqs.add(barcode.sequence)

            # Check the reverse complement sequence
            rc_barcode = reverse_complement(barcode)
            if rc_barcode.sequence in raw_seqs:
                raise ValueError("Duplicate barcode sequence in '{0}'".format(name))
            raw_seqs.add(rc_barcode.sequence)

            # If all checks pass, record both orientations of the barcode
            sequences[(name, 'FORWARD')] = barcode.sequence
            sequences[(name, 'REVERSE')] = rc_barcode.sequence

        # Verify that at least one barcode was read and that all of the
        # barcodes are the same length
        bc_lengths = sorted(set(len(s) for s in sequences.values()))
        if not bc_lengths:
            msg = "No barcodes found in '{0}'".format(options.barcodeFilename)
            logging.error(msg)
            raise ValueError(msg)
        if len(bc_lengths) > 1:
            msg = "Multiple barcode lengths detected - {0}".format(bc_lengths)
            logging.error(msg)
            raise ValueError(msg)

        self._barcodeLength = bc_lengths[0]
        self._barcodeSequences = sequences
        self._barcodeNames = names
        # Pair names (0, 1), (2, 3), ...; zip stops at the shorter slice, so a
        # trailing odd name is not paired
        self._barcodePairs = list(zip(names[0::2], names[1::2]))
        self._barcodePairNames = ["{0}--{1}".format(p[0], p[1]) for p in self._barcodePairs]
Esempio n. 2
0
    def main(self):
        """
        Run the analysis mode selected on the command line

        Parses the options, sets up logging, loads the white list, input
        data, chemistry and barcodes, builds a BarcodeScorer, and then runs
        the first analysis mode whose option is set.  The options are checked
        in this order: tSNE, adapterSizes, scoreBarcodes, scoreBarcodesOld,
        scoreBarcodesRc, testBarcodesRc, testBarcodesRc2, funnyAdapters,
        summarizeErrorsRc, pbbarcode2.  If none of them is set, nothing
        further is done after the scorer is built.

        Apart from the tSNE mode, which writes to the file returned by
        openOutputFile(), all modes report their results on stdout.  The
        print calls are written in single-argument, parenthesised form so
        that they produce the same output under the Python 2 print statement
        and the Python 3 print function.
        """
        parseOptions()
        self._setupLogging()

        logging.info("h5py version: %s", h5py.version.version)
        logging.info("hdf5 version: %s", h5py.version.hdf5_version)
        logging.info("ConsensusCore version: %s", consensusCoreVersion())
        logging.info("BarcodeAnalysis version: %s", __version__)
        logging.info("Custom SO File: %s", options.soFile)

        logging.info("Starting.")
        self._loadWhiteList()
        self._loadData()
        self._loadChemistry()
        self._loadBarcodes()

        self._insertPad = options.insertSidePad
        self._adapterPad = options.adapterSidePad

        scorer = BarcodeScorer(self._inputReader,
                               FastaReader(options.barcodeFilename),
                               adapterSidePad = options.adapterSidePad,
                               insertSidePad = options.insertSidePad,
                               scoreMode = 'paired', maxHits = options.maxHits,
                               scoreFirst = False, startTimeCutoff = 1,
                               minScore = 30,
                               soFile=options.soFile)

        # If tSNE was selected, output the barcode information as a CSV
        if options.tSNE:
            # NOTE(review): this mode indexes self.inputReader with each entry
            # of self._sequencingZmws, while every other mode uses the entries
            # directly as ZMW objects and the scorer above is given
            # self._inputReader - confirm which is intended.
            with self.openOutputFile() as handle:
                for holeNum in self._sequencingZmws:
                    zmw = self.inputReader[holeNum]
                    if len(zmw.adapterRegions) >= 3:
                        for window in self._getWindowReads(zmw):
                            handle.write( window.to_csv + '\n' )

        # If AdapterSizes was selected, output the length of the barcode on
        # either side of the adapter
        elif options.adapterSizes:
            # NOTE(review): the output file is opened but the rows are printed
            # to stdout rather than written to the handle - confirm.
            with self.openOutputFile() as handle:
                for zmw in self._sequencingZmws:
                    lengths = [(len(w) if w else 0) for w in self._getWindowReads(zmw)]
                    print('%s,%s,%s' % (zmw.zmwName, ','.join([str(l) for l in lengths]), min(lengths)))

        # The three score-table modes differ only in the scoring method used
        elif options.scoreBarcodes:
            self._printAdapterScoreTable(scorer, scorer.scoreZmw2)
        elif options.scoreBarcodesOld:
            self._printAdapterScoreTable(scorer, scorer.scoreZmw3)
        elif options.scoreBarcodesRc:
            self._printAdapterScoreTable(scorer, scorer.scoreZmwRc)

        # Re-score the adapters of every ZMW with at least one wrong call
        elif options.testBarcodesRc:
            print("Zmw,TrueIdx,NumAdp,NumCorrect,CorrectAvg,IncorrectAvg")
            for zmw in self._sequencingZmws:
                trueIdx, _, adpBestIdx, adpIdxIncorrect = self._bestAdapterCalls(
                    scorer, zmw, scorer.scoreZmwRc)
                uniqueBestIdx = sorted(set(adpBestIdx))
                print('%s %s' % (zmw.zmwName, adpIdxIncorrect))
                if sum(adpIdxIncorrect) > 0:
                    print('%s %s' % (zmw.zmwName, trueIdx))
                    print(adpBestIdx)
                    scorer.scoreSelectedAdaptersRc(zmw, adpIdxIncorrect, uniqueBestIdx)

        # Report and re-align the best call for the window on each side of
        # every adapter
        elif options.testBarcodesRc2:
            # NOTE(review): this header names six columns but the rows printed
            # below have five (name, start, end, barcode, score).
            print("Zmw,TrueIdx,NumAdp,NumCorrect,CorrectAvg,IncorrectAvg")
            for zmw in self._sequencingZmws:
                for adp, scores in zip(zmw.adapterRegions, scorer.scoreZmwRc2( zmw )):
                    leftEnd, rightStart = adp
                    leftScore, rightScore = scores

                    # The window before the adapter is reverse-complemented
                    leftStart = leftEnd - self.windowSize - self.insertPad
                    leftSeq = reverse_complement(zmw.read(leftStart, leftEnd).basecalls())
                    self._reportBestWindowCall(scorer, zmw, leftStart, leftEnd,
                                               leftSeq, leftScore)

                    # The window after the adapter is used as read
                    rightEnd = rightStart + self.windowSize + self.insertPad
                    rightSeq = zmw.read(rightStart, rightEnd).basecalls()
                    self._reportBestWindowCall(scorer, zmw, rightStart, rightEnd,
                                               rightSeq, rightScore)

        # Classify each ZMW as having an extra adapter, a missing adapter, or
        # a perfect set of calls; other ZMWs produce no output
        elif options.funnyAdapters:
            for zmw in self._sequencingZmws:
                adpEnds = [str(r[1]) for r in zmw.adapterRegions[:options.maxHits]]
                _, _, adpBestIdx, adpIdxIncorrect = self._bestAdapterCalls(
                    scorer, zmw, scorer.scoreZmwRc, trim=True)
                isBadAdp = self.isBadAdp( adpBestIdx, adpIdxIncorrect )
                isMissingAdp = self.isMissingAdp( adpBestIdx, adpIdxIncorrect )
                bestIdxStr = '--'.join(adpBestIdx)
                adpEndStr = '--'.join(adpEnds)
                if isBadAdp:
                    print("{0},ExtraAdp,{1},{2}".format(zmw.zmwName, bestIdxStr, adpEndStr))
                elif isMissingAdp:
                    print("{0},MissingAdp,{1},{2}".format(zmw.zmwName, bestIdxStr, adpEndStr))
                elif sum(adpIdxIncorrect) == 0:
                    print("{0},Perfect,{1},{2}".format(zmw.zmwName, bestIdxStr, adpEndStr))

        # Tally the wrong barcode calls by error category
        elif options.summarizeErrorsRc:
            errors = {'total': 0, 'true': 0, 'mixed':0, 'other': 0, 'badAdp':0, 'single':0, 'multi':0, 'map':0, 'bad':0, 'readScore':0}
            for zmw in self._sequencingZmws:
                # Score the Zmw as per normal and find the best call for
                # each adapter
                _, adpBestScores, adpBestIdx, adpIdxIncorrect = self._bestAdapterCalls(
                    scorer, zmw, scorer.scoreZmwRc)
                numErrors = sum(adpIdxIncorrect)

                # Test for certain error patterns
                isBadAdp = self.isBadAdp( adpBestIdx, adpIdxIncorrect )
                isEndError = self.isEndError( adpIdxIncorrect )
                isMixedError = self.isMixedError( adpIdxIncorrect )

                # Figure out the scores and averages for the correct/incorrect
                # barcode calls
                adpScoreCorrect, adpScoreIncorrect = self._splitScores(adpBestScores,
                                                                       adpIdxIncorrect)
                avgCorrect = self._meanOrNA(adpScoreCorrect)
                avgIncorrect = self._meanOrNA(adpScoreIncorrect)

                # Summarize the errors
                errors['total'] += numErrors
                if numErrors == 1:
                    errors['single'] += numErrors
                else:
                    errors['multi'] += numErrors

                if not adpScoreCorrect:
                    # No correct call at all, i.e. avgCorrect is 'N/A'
                    errors['map'] += numErrors
                elif isBadAdp:
                    errors['badAdp'] += numErrors
                elif isEndError and (not adpScoreIncorrect or avgIncorrect > 30.0):
                    # With no incorrect calls avgIncorrect is 'N/A', which
                    # Python 2 orders above any float; the explicit test keeps
                    # that outcome without comparing a string to a number
                    errors['true'] += numErrors
                elif isMixedError:
                    errors['mixed'] += numErrors
                elif numErrors == 1 and avgIncorrect < 30.0:
                    errors['bad'] += numErrors
                elif zmw.readScore <= 0.8:
                    errors['readScore'] += numErrors
                else:
                    # Unclassified: dump the details for manual inspection
                    errors['other'] += numErrors
                    print(zmw.zmwName)
                    print(adpBestIdx)
                    print(adpIdxIncorrect)
                    print('%s %s' % (avgCorrect, avgIncorrect))
                    print('%s %s %s' % (isBadAdp, isEndError, isMixedError))

            print(errors)

        # Dump the raw scoring results for each ZMW, then re-score its adapters
        elif options.pbbarcode2:
            for zmw in self._sequencingZmws:
                trueIdx = self._whiteListIdx[ zmw.zmwName ]
                trueIdxPts = trueIdx.split('--')
                res = scorer.scoreZmw( zmw )
                adpScores = res[1]
                print(scorer.barcodeNames)
                print(res[0])
                print(res[1])
                adpBestArg = [np.argmax(a) for a in adpScores]
                print(adpBestArg)
                print([max(a) for a in adpScores])
                adpBestIdx = [scorer.barcodeNames[i] for i in adpBestArg]
                uniqueBestIdx = sorted(set(adpBestIdx))
                adpIdxIncorrect = [0 if idx in trueIdxPts else 1 for idx in adpBestIdx]
                print(zmw.zmwName)
                print(adpBestIdx)
                print(adpIdxIncorrect)
                scorer.scoreSelectedAdapters(zmw, adpIdxIncorrect, uniqueBestIdx)
                print("")

    def _bestAdapterCalls(self, scorer, zmw, scoreFunc, **scoreArgs):
        """
        Score one ZMW and pick the best-scoring barcode for each adapter

        scoreFunc is called as scoreFunc(zmw, **scoreArgs); element [1] of
        its result is used as the per-adapter list of barcode scores.

        Returns a tuple (trueIdx, bestScores, bestIdx, incorrect) where
        trueIdx is the white-list entry for the ZMW, bestScores and bestIdx
        hold the top score and the matching scorer.barcodeNames entry for
        each adapter, and incorrect holds 0 where that name is one of the
        '--'-separated parts of trueIdx and 1 otherwise.
        """
        trueIdx = self._whiteListIdx[ zmw.zmwName ]
        trueIdxPts = trueIdx.split('--')
        adpScores = scoreFunc(zmw, **scoreArgs)[1]
        adpBestArg = [np.argmax(a) for a in adpScores]
        bestScores = [adpScores[i][x] for i, x in enumerate(adpBestArg)]
        bestIdx = [scorer.barcodeNames[i] for i in adpBestArg]
        incorrect = [0 if idx in trueIdxPts else 1 for idx in bestIdx]
        return trueIdx, bestScores, bestIdx, incorrect

    def _splitScores(self, bestScores, incorrect):
        """Split bestScores into (correct, incorrect) lists using the 0/1 flags in incorrect"""
        correctScores = [s for s, wrong in zip(bestScores, incorrect) if not wrong]
        incorrectScores = [s for s, wrong in zip(bestScores, incorrect) if wrong]
        return correctScores, incorrectScores

    def _meanOrNA(self, values):
        """Return the mean of the list values as a float, or 'N/A' if it is empty"""
        if not values:
            return 'N/A'
        return sum(values) / float(len(values))

    def _printAdapterScoreTable(self, scorer, scoreFunc):
        """
        Print one CSV row per sequencing ZMW summarising its adapter calls

        Each row holds the ZMW name, its white-list entry, the number of
        adapters scored, the number of correct calls, and the average best
        score of the correct and of the incorrect calls ('N/A' when there
        are none).
        """
        print("Zmw,TrueIdx,NumAdp,NumCorrect,CorrectAvg,IncorrectAvg")
        for zmw in self._sequencingZmws:
            trueIdx, bestScores, bestIdx, incorrect = self._bestAdapterCalls(
                scorer, zmw, scoreFunc)
            correctScores, incorrectScores = self._splitScores(bestScores, incorrect)
            print("{0},{1},{2},{3},{4},{5}".format(zmw.zmwName, trueIdx,
                                                   len(bestIdx), len(correctScores),
                                                   self._meanOrNA(correctScores),
                                                   self._meanOrNA(incorrectScores)))

    def _reportBestWindowCall(self, scorer, zmw, start, end, seq, scores):
        """
        Print the best barcode call for one window and pass it to the aligner

        The first barcode with the maximum score is looked up in
        self._barcodeNames; its 'FORWARD' sequence is used when the name
        starts with 'F' and its 'REVERSE' sequence otherwise.
        """
        bestScore = max(scores)
        bestName = self._barcodeNames[list(scores).index(bestScore)]
        orientation = 'FORWARD' if bestName.startswith('F') else 'REVERSE'
        bcSeq = self._barcodeSequences[(bestName, orientation)]
        print("{0},{1},{2},{3},{4}".format(zmw.zmwName, start, end, bestName, bestScore))
        scorer.aligner.score(seq, bcSeq)