def _loadBarcodes(self):
    """
    Read barcode names, sequences, and pairs from the barcode Fasta file

    Every record in options.barcodeFilename contributes its name (the first
    whitespace-delimited token of the record name), its sequence, and the
    reverse complement of that sequence.  Consecutive records are paired
    up: 1st with 2nd, 3rd with 4th, and so on.

    Sets the following attributes:
        self._barcodeLength     -- the single length shared by all barcodes
        self._barcodeSequences  -- dict keyed by (name, 'FORWARD') and
                                   (name, 'REVERSE')
        self._barcodeNames      -- barcode names in file order
        self._barcodePairs      -- list of (name, name) tuples
        self._barcodePairNames  -- pairs formatted as "NAME1--NAME2"

    Raises ValueError if a barcode name is repeated, if any forward or
    reverse-complement sequence collides with one already seen, if the
    barcodes do not all share one length, or if the file has no records.
    """
    logging.info("Loading barcodes")

    seenSequences = set()   # every forward and reverse-complement sequence so far
    seenNames = set()       # O(1) uniqueness check; 'names' keeps the file order
    names = []
    sequences = {}

    for barcode in FastaReader(options.barcodeFilename):
        name = barcode.name.strip().split()[0]

        # Check the barcode name
        if name in seenNames:
            raise ValueError("Duplicate barcode name in '{0}'".format(name))
        seenNames.add(name)
        names.append(name)

        # Check the forward sequence
        if barcode.sequence in seenSequences:
            raise ValueError("Duplicate barcode sequence in '{0}'".format(name))
        seenSequences.add(barcode.sequence)

        # Check the reverse complement sequence.  The forward sequence is
        # already in the set, so a barcode that equals its own reverse
        # complement is rejected here as well.
        rcBarcode = reverse_complement(barcode)
        if rcBarcode.sequence in seenSequences:
            raise ValueError("Duplicate barcode sequence in '{0}'".format(name))
        seenSequences.add(rcBarcode.sequence)

        # If both pass, record the sequence in both orientations
        sequences[(name, 'FORWARD')] = barcode.sequence
        sequences[(name, 'REVERSE')] = rcBarcode.sequence

    # An empty file would otherwise surface as an IndexError further down
    if not sequences:
        msg = "No barcodes found in '{0}'".format(options.barcodeFilename)
        logging.error(msg)
        raise ValueError(msg)

    # Verify that all of the barcodes are the same length
    bcLengths = sorted(set(len(seq) for seq in sequences.values()))
    if len(bcLengths) > 1:
        msg = "Multiple barcode lengths detected - {0}".format(bcLengths)
        logging.error(msg)
        raise ValueError(msg)

    # Pairs are formed from consecutive records, so a trailing odd record
    # ends up in no pair at all; make that visible instead of silent.
    if len(names) % 2:
        logging.warning("Odd number of barcodes ({0}) - '{1}' is not part of any pair"
                        .format(len(names), names[-1]))

    self._barcodeLength = bcLengths[0]
    self._barcodeSequences = sequences
    self._barcodeNames = names
    self._barcodePairs = list(zip(names[0::2], names[1::2]))
    self._barcodePairNames = ["{0}--{1}".format(first, second)
                              for first, second in self._barcodePairs]
def _meanOrNA(self, values):
    """
    Return the arithmetic mean of 'values', or the string 'N/A' if it is empty
    """
    return sum(values) / float(len(values)) if values else 'N/A'

def _bestBarcodeCalls(self, scorer, adpScores, trueIdx):
    """
    Pick the top-scoring barcode for each adapter and grade it against the truth

    scorer    -- object whose 'barcodeNames' is indexed by barcode position
    adpScores -- one vector of per-barcode scores for each adapter
    trueIdx   -- the expected barcodes, formatted as "NAME1--NAME2"

    Returns four parallel lists with one entry per adapter:
        bestArgs   -- position of the highest score (np.argmax)
        bestScores -- the highest score itself
        bestNames  -- the barcode name at that position
        incorrect  -- 0 if that name is one of the expected barcodes, else 1
    """
    trueIdxPts = trueIdx.split('--')
    bestArgs = [np.argmax(scores) for scores in adpScores]
    bestScores = [adpScores[i][arg] for i, arg in enumerate(bestArgs)]
    bestNames = [scorer.barcodeNames[arg] for arg in bestArgs]
    incorrect = [0 if name in trueIdxPts else 1 for name in bestNames]
    return bestArgs, bestScores, bestNames, incorrect

def _averageCallScores(self, bestScores, incorrect):
    """
    Return (avgCorrect, avgIncorrect) for the graded calls

    Each average is the mean of the best scores in that group, or 'N/A'
    when the group is empty.
    """
    correctScores = [s for s, bad in zip(bestScores, incorrect) if not bad]
    incorrectScores = [s for s, bad in zip(bestScores, incorrect) if bad]
    return self._meanOrNA(correctScores), self._meanOrNA(incorrectScores)

def _printScoreSummary(self, scorer, scoreZmw):
    """
    Print one CSV row per sequencing ZMW summarizing its barcode calls

    scoreZmw -- callable taking a ZMW and returning a sequence whose
                element [1] holds the per-adapter score vectors

    Columns: Zmw, TrueIdx, NumAdp, NumCorrect, CorrectAvg, IncorrectAvg
    """
    print("Zmw,TrueIdx,NumAdp,NumCorrect,CorrectAvg,IncorrectAvg")
    for zmw in self._sequencingZmws:
        trueIdx = self._whiteListIdx[zmw.zmwName]
        res = scoreZmw(zmw)
        bestArgs, bestScores, _, incorrect = self._bestBarcodeCalls(scorer, res[1], trueIdx)
        avgCorrect, avgIncorrect = self._averageCallScores(bestScores, incorrect)
        print("{0},{1},{2},{3},{4},{5}".format(zmw.zmwName, trueIdx, len(bestArgs),
                                               len(incorrect) - sum(incorrect),
                                               avgCorrect, avgIncorrect))

def _reportWindowCall(self, scorer, zmw, start, end, windowSeq, scores):
    """
    Print the best barcode call for one adapter-flanking window, then align it

    The barcode sequence is taken in 'FORWARD' orientation when the barcode
    name starts with 'F' and in 'REVERSE' orientation otherwise.
    """
    bestScore = max(scores)
    bestName = self._barcodeNames[list(scores).index(bestScore)]
    strand = 'FORWARD' if bestName.startswith('F') else 'REVERSE'
    barcodeSeq = self._barcodeSequences[(bestName, strand)]
    print("{0},{1},{2},{3},{4}".format(zmw.zmwName, start, end, bestName, bestScore))
    scorer.aligner.score(windowSeq, barcodeSeq)

def main(self):
    """
    Run the analysis selected on the command line

    Parses the options, sets up logging, records the library versions,
    loads the whitelist, data, chemistry and barcodes, builds a
    BarcodeScorer, and then runs the first analysis mode whose option flag
    is set.  The modes are tested in this order: tSNE, adapterSizes,
    scoreBarcodes, scoreBarcodesOld, scoreBarcodesRc, testBarcodesRc,
    testBarcodesRc2, funnyAdapters, summarizeErrorsRc, pbbarcode2.

    Print calls use the single-argument parenthesized form, which produces
    the same output as a print statement on Python 2 and also parses on
    Python 3.
    """
    parseOptions()
    self._setupLogging()

    logging.info("h5py version: %s" % h5py.version.version)
    logging.info("hdf5 version: %s" % h5py.version.hdf5_version)
    logging.info("ConsensusCore version: %s" % consensusCoreVersion())
    logging.info("BarcodeAnalysis version: %s" % __version__)
    logging.info("Custom SO File: %s" % options.soFile)
    logging.info("Starting.")

    self._loadWhiteList()
    self._loadData()
    self._loadChemistry()
    self._loadBarcodes()

    self._insertPad = options.insertSidePad
    self._adapterPad = options.adapterSidePad

    scorer = BarcodeScorer(self._inputReader,
                           FastaReader(options.barcodeFilename),
                           adapterSidePad=options.adapterSidePad,
                           insertSidePad=options.insertSidePad,
                           scoreMode='paired',
                           maxHits=options.maxHits,
                           scoreFirst=False,
                           startTimeCutoff=1,
                           minScore=30,
                           soFile=options.soFile)

    # If tSNE was selected, output the barcode information as a CSV
    if options.tSNE:
        # NOTE(review): this reads self.inputReader while the scorer above is
        # given self._inputReader - confirm both attributes exist.
        with self.openOutputFile() as handle:
            for holeNum in self._sequencingZmws:
                zmw = self.inputReader[holeNum]
                if len(zmw.adapterRegions) >= 3:
                    for window in self._getWindowReads(zmw):
                        handle.write(window.to_csv + '\n')

    # If AdapterSizes was selected, output the length of the barcode on
    # either side of the adapter
    elif options.adapterSizes:
        # NOTE(review): the output file is opened but the rows go to stdout;
        # confirm whether they were meant to be written to 'handle'.
        with self.openOutputFile() as handle:
            for zmw in self._sequencingZmws:
                lengths = [(len(w) if w else 0) for w in self._getWindowReads(zmw)]
                print('%s,%s,%s' % (zmw.zmwName,
                                    ','.join([str(l) for l in lengths]),
                                    min(lengths)))

    # The three scoring modes produce the same report and differ only in
    # which BarcodeScorer method generates the scores
    elif options.scoreBarcodes:
        self._printScoreSummary(scorer, scorer.scoreZmw2)

    elif options.scoreBarcodesOld:
        self._printScoreSummary(scorer, scorer.scoreZmw3)

    elif options.scoreBarcodesRc:
        self._printScoreSummary(scorer, scorer.scoreZmwRc)

    # Re-score the adapters of any ZMW that has at least one incorrect call
    elif options.testBarcodesRc:
        print("Zmw,TrueIdx,NumAdp,NumCorrect,CorrectAvg,IncorrectAvg")
        for zmw in self._sequencingZmws:
            trueIdx = self._whiteListIdx[zmw.zmwName]
            res = scorer.scoreZmwRc(zmw)
            _, _, adpBestIdx, adpIdxIncorrect = self._bestBarcodeCalls(scorer, res[1], trueIdx)
            uniqueBestIdx = sorted(set(adpBestIdx))
            print("%s %s" % (zmw.zmwName, adpIdxIncorrect))
            # NOTE(review): all three statements below are assumed to belong
            # to this 'if' - confirm against the original layout.
            if sum(adpIdxIncorrect) > 0:
                print("%s %s" % (zmw.zmwName, trueIdx))
                print(adpBestIdx)
                scorer.scoreSelectedAdaptersRc(zmw, adpIdxIncorrect, uniqueBestIdx)

    # Report and align the best call for the window on each side of every adapter
    elif options.testBarcodesRc2:
        print("Zmw,TrueIdx,NumAdp,NumCorrect,CorrectAvg,IncorrectAvg")
        # NOTE(review): self.windowSize and self.insertPad are read here while
        # main() assigns self._insertPad - confirm these attributes exist.
        for zmw in self._sequencingZmws:
            for adp, scores in zip(zmw.adapterRegions, scorer.scoreZmwRc2(zmw)):
                leftEnd, rightStart = adp
                leftScore, rightScore = scores

                # The window to the left of the adapter is reverse-complemented
                leftStart = leftEnd - self.windowSize - self.insertPad
                leftSeq = reverse_complement(zmw.read(leftStart, leftEnd).basecalls())
                self._reportWindowCall(scorer, zmw, leftStart, leftEnd, leftSeq, leftScore)

                # The window to the right of the adapter is used as read
                rightEnd = rightStart + self.windowSize + self.insertPad
                rightSeq = zmw.read(rightStart, rightEnd).basecalls()
                self._reportWindowCall(scorer, zmw, rightStart, rightEnd, rightSeq, rightScore)

    # Label each ZMW by whether its adapter calls look extra, missing or perfect
    elif options.funnyAdapters:
        for zmw in self._sequencingZmws:
            adpEnds = [str(r[1]) for r in zmw.adapterRegions[:options.maxHits]]
            trueIdx = self._whiteListIdx[zmw.zmwName]
            res = scorer.scoreZmwRc(zmw, trim=True)
            _, _, adpBestIdx, adpIdxIncorrect = self._bestBarcodeCalls(scorer, res[1], trueIdx)
            isBadAdp = self.isBadAdp(adpBestIdx, adpIdxIncorrect)
            isMissingAdp = self.isMissingAdp(adpBestIdx, adpIdxIncorrect)
            bestIdxStr = '--'.join(adpBestIdx)
            adpEndStr = '--'.join(adpEnds)

            if isBadAdp:
                label = 'ExtraAdp'
            elif isMissingAdp:
                label = 'MissingAdp'
            elif sum(adpIdxIncorrect) == 0:
                label = 'Perfect'
            else:
                label = None    # ZMWs matching none of the patterns print nothing

            if label is not None:
                print("{0},{1},{2},{3}".format(zmw.zmwName, label, bestIdxStr, adpEndStr))

    # Tally the incorrect calls by error category
    elif options.summarizeErrorsRc:
        errors = {'total': 0, 'true': 0, 'mixed': 0, 'other': 0, 'badAdp': 0,
                  'single': 0, 'multi': 0, 'map': 0, 'bad': 0, 'readScore': 0}
        for zmw in self._sequencingZmws:
            # Score the Zmw as per normal
            trueIdx = self._whiteListIdx[zmw.zmwName]
            res = scorer.scoreZmwRc(zmw)

            # Figure out the best call for each adapter and which are in error
            _, adpBestScores, adpBestIdx, adpIdxIncorrect = self._bestBarcodeCalls(scorer, res[1], trueIdx)
            numErrors = sum(adpIdxIncorrect)

            # Test for certain error patterns
            isBadAdp = self.isBadAdp(adpBestIdx, adpIdxIncorrect)
            isEndError = self.isEndError(adpIdxIncorrect)
            isMixedError = self.isMixedError(adpIdxIncorrect)

            # Average scores of the correct and incorrect barcode calls
            avgCorrect, avgIncorrect = self._averageCallScores(adpBestScores, adpIdxIncorrect)

            # Summarize the errors; the first matching category wins
            errors['total'] += numErrors
            errors['single' if numErrors == 1 else 'multi'] += numErrors

            if avgCorrect == 'N/A':
                category = 'map'
            elif isBadAdp:
                category = 'badAdp'
            elif isEndError and avgIncorrect > 30.0:
                category = 'true'
            elif isMixedError:
                category = 'mixed'
            elif numErrors == 1 and avgIncorrect < 30.0:
                category = 'bad'
            elif zmw.readScore <= 0.8:
                category = 'readScore'
            else:
                category = 'other'
            errors[category] += numErrors

            # NOTE(review): the diagnostics are assumed to belong to the
            # unclassified ('other') case only - confirm against the
            # original layout.
            if category == 'other':
                print(zmw.zmwName)
                print(adpBestIdx)
                print(adpIdxIncorrect)
                print("%s %s" % (avgCorrect, avgIncorrect))
                print("%s %s %s" % (isBadAdp, isEndError, isMixedError))
        # NOTE(review): assumed to run once, after every ZMW has been tallied
        print(errors)

    # Dump the raw scores and calls for each ZMW, then re-score its adapters
    elif options.pbbarcode2:
        for zmw in self._sequencingZmws:
            trueIdx = self._whiteListIdx[zmw.zmwName]
            res = scorer.scoreZmw(zmw)
            adpScores = res[1]
            print(scorer.barcodeNames)
            print(res[0])
            print(res[1])
            adpBestArg, _, adpBestIdx, adpIdxIncorrect = self._bestBarcodeCalls(scorer, adpScores, trueIdx)
            print(adpBestArg)
            print([max(a) for a in adpScores])
            uniqueBestIdx = sorted(set(adpBestIdx))
            print(zmw.zmwName)
            print(adpBestIdx)
            print(adpIdxIncorrect)
            scorer.scoreSelectedAdapters(zmw, adpIdxIncorrect, uniqueBestIdx)
            print("")