def test_whitelist(self):
    """
    Filter SUBREADS3 by a ZMW whitelist supplied in several forms.

    The whitelist is passed as a set of hole numbers, as a comma-separated
    string, as a text file with one value per line, and finally as a BAM
    file (SUBREADS4).  Every run must return 0.
    """
    ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
    WHITELIST = set([24962, 32901, 30983])

    def _run_with_whitelist(wl):
        # Run the filter and check that exactly the whitelisted ZMWs remain.
        rc = bamsieve.filter_reads(input_bam=SUBREADS3,
                                   output_bam=ofn,
                                   whitelist=wl)
        assert rc == 0
        with BamReader(ofn) as bam_out:
            have_zmws = set([rec.HoleNumber for rec in bam_out])
            assert have_zmws == WHITELIST

    _run_with_whitelist(WHITELIST)
    _run_with_whitelist(",".join([str(x) for x in WHITELIST]))
    tmp_wl = tempfile.NamedTemporaryFile(suffix=".txt").name
    with open(tmp_wl, "w") as wl_out:
        wl_out.write("\n".join([str(x) for x in WHITELIST]))
    _run_with_whitelist(tmp_wl)
    # now with a BAM file as whitelist
    rc = bamsieve.filter_reads(input_bam=SUBREADS3,
                               output_bam=ofn,
                               whitelist=SUBREADS4)
    # Previously unchecked: a failing run could go unnoticed if a stale
    # output file happened to be readable.
    assert rc == 0
    with BamReader(ofn) as bam_out:
        assert 117 == len([rec for rec in bam_out])
def __read_bam(fn):
    """
    Open the BAM file `fn` and return the reader.

    An IndexedBamReader is used when a companion ``fn + ".pbi"`` file exists,
    otherwise a plain BamReader.

    The previous implementation returned the reader from inside a ``with``
    block, so the context manager had already exited by the time the caller
    received it.  The reader is now returned open.

    NOTE(review): the caller is now responsible for closing the returned
    reader - confirm the call sites do so.
    """
    if op.exists(fn + ".pbi"):
        return IndexedBamReader(fn)
    return BamReader(fn)
def main(parser):
    """
    Write CCS reads to one FASTQ file per barcode.

    Arguments are taken from ``parser.parse_args()``: ``ccsBAM``,
    ``barcodeFofn``, ``outDir``, ``minPredictedAccuracy``,
    ``minAvgBarcodeScore``, ``minNumPasses`` and ``extendedHeader``.

    A record is written to ``<outDir>/<barcode>.fastq`` when its read score,
    its ZMW's average barcode score and its number of passes all meet the
    respective minimums.  Records whose read name has no labeled ZMW in the
    barcode FOFN (KeyError) are skipped.
    """
    args = parser.parse_args()
    bam = BamReader(args.ccsBAM)
    bcFofn = BarcodeH5Fofn(args.barcodeFofn)
    oFiles = {bc: FastqWriter('{dir}/{bc}.fastq'.format(dir=args.outDir, bc=bc))
              for bc in bcFofn.barcodeLabels}
    try:
        for rec in bam:
            try:
                lZmw = bcFofn.labeledZmwFromName(rec.readName)
            except KeyError:
                # catch zmws with no barcode and skip
                continue
            if rec.readScore >= args.minPredictedAccuracy \
               and lZmw.averageScore >= args.minAvgBarcodeScore \
               and rec.numPasses >= args.minNumPasses:
                header = rec.readName
                if args.extendedHeader:
                    header += ' predictedAccuracy={predAcc} numPasses={numPasses} barcodeScore={bcScore}'\
                        .format(predAcc=rec.readScore,
                                numPasses=rec.numPasses,
                                bcScore=lZmw.averageScore)
                # Convert the quality string to integers by subtracting the
                # ASCII offset of 33.
                qual = [ord(q) - 33 for q in rec.peer.qual]
                writer = oFiles[bcFofn.barcodeLabels[lZmw.bestIdx]]
                writer.writeRecord(header, rec.read(aligned=False), qual)
    finally:
        # The original iterated over the undefined name `oFile`, which raised
        # NameError and left every writer unclosed.  Closing in `finally`
        # also covers failures part-way through the input.
        for f in oFiles.values():
            f.close()
def test_retrieve_read_group_properties(self):
    """
    Every alignment in self.bam_file reports the expected sequencing
    chemistry, and the read group table lists the two expected movies.
    """
    expected_chemistry = 'S/P4-C2/5.0-8M'
    movie_names = []
    with BamReader(self.bam_file) as bam_in:
        for aln in bam_in:
            assert aln.sequencingChemistry == expected_chemistry
        for read_group in bam_in.readGroupTable:
            movie_names.append(read_group.MovieName)
    assert movie_names == ['movie1', 'm64012_181222_192540']
def _run_with_whitelist(wl):
    """
    Filter SUBREADS3 into `ofn` using whitelist `wl` and check that the
    output contains exactly the ZMWs in WHITELIST.
    """
    status = bamsieve.filter_reads(
        input_bam=SUBREADS3,
        output_bam=ofn,
        whitelist=wl)
    assert status == 0
    with BamReader(ofn) as reader:
        observed = {record.HoleNumber for record in reader}
        assert observed == WHITELIST
def _run_with_whitelist(wl):
    """
    Filter SUBREADS3 into `ofn` using whitelist `wl` and check that the
    output contains exactly the ZMWs in WHITELIST.
    """
    status = bamSieve.filter_reads(
        input_bam=SUBREADS3,
        output_bam=ofn,
        whitelist=wl)
    self.assertEqual(status, 0)
    with BamReader(ofn) as reader:
        observed = {record.HoleNumber for record in reader}
        self.assertEqual(observed, WHITELIST)
def _verify():
    """
    Open the dataset at `ofn` and check that both the BAM and the scraps
    file of its first external resource contain only ZMW 74056024.
    """
    with openDataSet(ofn, strict=False) as ds_out:
        resource = ds_out.externalResources[0]
        for path in (resource.bam, resource.scraps):
            with BamReader(path) as reader:
                hole_numbers = {record.HoleNumber for record in reader}
                self.assertEqual(len(hole_numbers), 1)
                self.assertTrue(74056024 in hole_numbers)
def _run_with_blacklist(bl):
    """
    Filter SUBREADS2 into `ofn` using blacklist `bl` and check that only
    ZMW 9 is left in the output.
    """
    status = bamSieve.filter_reads(
        input_bam=SUBREADS2,
        output_bam=ofn,
        blacklist=bl)
    self.assertEqual(status, 0)
    with BamReader(ofn) as reader:
        remaining = {record.HoleNumber for record in reader}
        self.assertEqual(remaining, set([9]))
def _run_with_blacklist(bl):
    """
    Filter SUBREADS2 into `ofn` using blacklist `bl` and check that only
    ZMW 9 is left in the output.
    """
    status = bamsieve.filter_reads(
        input_bam=SUBREADS2,
        output_bam=ofn,
        blacklist=bl)
    assert status == 0
    with BamReader(ofn) as reader:
        remaining = {record.HoleNumber for record in reader}
        assert remaining == set([9])
class TestUnalignedBam(object):
    """
    Compare the first record of the unaligned BAM test file against the
    first subread of the corresponding bax.h5 file.
    """

    def __init__(self):
        self.bam = BamReader(data.getUnalignedBam())
        self.bax = BaxH5Reader(data.getBaxForBam())
        self.baxRead0 = next(self.bax.subreads())
        self.bamRead0 = next(iter(self.bam))

    def testInvalidOperations(self):
        """Aligned or genomic-oriented access must raise UnavailableFeature."""
        # Accessing isForwardStrand and tStart on an unaligned read presently
        # works; whether those should fail as well is an open question, so
        # they are not checked here.

        # attempts to get read aligned or oriented
        unsupported_kwargs = (
            dict(aligned=True, orientation="native"),
            dict(aligned=False, orientation="genomic"),
            dict(),
        )
        for accessor_name in ("read", "InsertionQV"):
            for kwargs in unsupported_kwargs:
                with assert_raises(UnavailableFeature):
                    getattr(self.bamRead0, accessor_name)(**kwargs)

    def testReadAccess(self):
        """Unaligned, native-orientation bases match the bax basecalls."""
        bam_bases = self.bamRead0.read(aligned=False, orientation="native")
        EQ(bam_bases, self.baxRead0.basecalls())

    def testQvAccess(self):
        """Per-base features agree between the BAM and bax records."""
        for feature in ("SubstitutionQV", "InsertionQV", "DeletionTag"):
            from_bam = getattr(self.bamRead0, feature)(aligned=False,
                                                      orientation="native")
            from_bax = getattr(self.baxRead0, feature)()
            AEQ(from_bam, from_bax)

    def testZmwInfo(self):
        """Hole number and read extent agree between BAM and bax."""
        # WAT.  Need to make these accessors more uniform.  This is
        # totally crazy.
        accessor_pairs = (
            ("HoleNumber", "holeNumber"),
            ("qStart", "readStart"),
            ("qEnd", "readEnd"),
        )
        for bam_attr, bax_attr in accessor_pairs:
            EQ(getattr(self.bamRead0, bam_attr),
               getattr(self.baxRead0, bax_attr))

    def testNames(self):
        """The BAM query name equals the bax read name."""
        EQ(self.bamRead0.queryName, self.baxRead0.readName)

    def testIpd(self):
        """Check that 'Ipd' feature is recognized correctly."""
        pfa = self.bam.baseFeaturesAvailable()
        expected_features = frozenset(['Ipd', 'DeletionTag', 'MergeQV',
                                       'SubstitutionQV', 'InsertionQV',
                                       'DeletionQV'])
        EQ(pfa, expected_features)
        # Smoke check only: the call must succeed, its value is not inspected.
        self.bamRead0.IPD(aligned=False, orientation="native")
def openAlignmentFile(fname, referenceFastaFname=None, sharedIndex=None):
    """
    Factory function returning a reader for an alignment file (BAM); index
    capability is not required.

    Names ending in "cmp.h5" are handed to raise_no_h5().  Names ending in
    "bam" yield a BamReader built from `fname` and `referenceFastaFname`.
    Any other name falls through and returns None.  `sharedIndex` is
    accepted but not used.
    """
    if fname.endswith("cmp.h5"):
        raise_no_h5()
        return None
    if not fname.endswith("bam"):
        return None
    return BamReader(fname, referenceFastaFname)
def test_percentage(self):
    """
    Filtering SUBREADS3 with percentage=50 and seed=12345 must succeed and
    leave 24 distinct ZMWs in the output.
    """
    output_path = tempfile.NamedTemporaryFile(suffix=".bam").name
    status = bamSieve.filter_reads(
        input_bam=SUBREADS3,
        output_bam=output_path,
        percentage=50,
        seed=12345)
    self.assertEqual(status, 0)
    with BamReader(output_path) as reader:
        hole_numbers = {record.HoleNumber for record in reader}
        self.assertEqual(len(hole_numbers), 24)
def _run_with_blacklist(bl):
    """
    Filter SUBREADS3 into `ofn` by subread blacklist `bl`, then check that no
    blacklisted subread name is left and that 114 records remain.
    """
    status = bamsieve.filter_reads(
        input_bam=SUBREADS3,
        output_bam=ofn,
        blacklist=bl,
        use_subreads=True)
    assert status == 0
    with BamReader(ofn) as reader:
        names = {record.qName for record in reader}
        assert names.isdisjoint(BLACKLIST)
        # Second pass over the reader to count the records.
        n_records = sum(1 for _ in reader)
        assert n_records == 114
def test_count(self):
    """
    Filtering SUBREADS3 with count=1 and seed=12345 must succeed and leave a
    single ZMW in the output.
    """
    output_path = tempfile.NamedTemporaryFile(suffix=".bam").name
    status = bamsieve.filter_reads(
        input_bam=SUBREADS3,
        output_bam=output_path,
        count=1,
        seed=12345)
    assert status == 0
    with BamReader(output_path) as reader:
        hole_numbers = {record.HoleNumber for record in reader}
        assert len(hole_numbers) == 1
def test_sample_names(self):
    """
    The read group table of self.bam_file maps each movie name to the
    expected sample name.
    """
    expected = {
        "movie1": "test_sample1",
        "m64012_181222_192540": "test_sample2",
    }
    with BamReader(self.bam_file) as bam:
        samples = dict((rg.MovieName, rg.SampleName)
                       for rg in bam.readGroupTable)
    assert samples == expected
def test_barcodes(self):
    """
    Filter the BARCODED input with whitelist=[0] and use_barcodes=True; the
    run must succeed and the output must contain only ZMW 74056024.
    """
    ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
    rc = bamsieve.filter_reads(input_bam=BARCODED,
                               output_bam=ofn,
                               whitelist=[0],
                               use_barcodes=True)
    # The return code used to be assigned but never checked, unlike the
    # other filter_reads tests.
    assert rc == 0
    with BamReader(ofn) as bam_out:
        zmws = set([rec.HoleNumber for rec in bam_out])
        assert len(zmws) == 1
        assert 74056024 in zmws
def _verify():
    """
    Open the SubreadSet at `ofn`, check the file name suffixes of its first
    external resource, and check that both the BAM and the scraps file
    contain only ZMW 74056024.
    """
    with SubreadSet(ofn, strict=False) as ds_out:
        resource = ds_out.externalResources[0]
        assert resource.bam.endswith(".subreads.bam")
        assert resource.scraps.endswith(".scraps.bam")
        for path in (resource.bam, resource.scraps):
            with BamReader(path) as reader:
                hole_numbers = {record.HoleNumber for record in reader}
                assert len(hole_numbers) == 1
                assert 74056024 in hole_numbers
def _readCmpH5Input(self):
    """
    Open the alignment input named by options.inputFilename and store the
    reader as self._inCmpH5.

    A BamReader (with options.referenceFilename) is used when
    options.usingBam is set, otherwise a CmpH5Reader.
    """
    fname = options.inputFilename
    if not options.usingBam:
        logging.debug("Before open on main process, # hdf5 objects open: %d" % h5py.h5f.get_obj_count())
        self._inCmpH5 = CmpH5Reader(fname)
        return
    self._inCmpH5 = BamReader(fname, options.referenceFilename)
def test_count_overflow(self):
    """
    Requesting count=100000 from SUBREADS3 must still succeed, record exactly
    one warning, and leave 48 distinct ZMWs in the output.
    """
    output_path = tempfile.NamedTemporaryFile(suffix=".bam").name
    with warnings.catch_warnings(record=True) as caught:
        status = bamsieve.filter_reads(
            input_bam=SUBREADS3,
            output_bam=output_path,
            count=100000,
            seed=12345)
        assert status == 0
        assert len(caught) == 1
        with BamReader(output_path) as reader:
            hole_numbers = {record.HoleNumber for record in reader}
            assert len(hole_numbers) == 48
def openAlignmentFile(fname, referenceFastaFname=None, sharedIndex=None):
    """
    Factory function returning a reader for an alignment file (cmp.h5 or
    BAM); index capability is not required.

    Names ending in "cmp.h5" yield a CmpH5Reader, to which `sharedIndex` is
    forwarded.  Names ending in "bam" yield a BamReader built from `fname`
    and `referenceFastaFname`.  Any other name returns None.
    """
    if fname.endswith("cmp.h5"):
        return CmpH5Reader(fname, sharedIndex=sharedIndex)
    if not fname.endswith("bam"):
        return None
    return BamReader(fname, referenceFastaFname)
def test_subreads_whitelist(self):
    """
    Filter SUBREADS3 by a whitelist of subread names supplied in several
    forms.

    The whitelist is passed as a set, as a comma-separated string and as a
    text file.  The BAM produced by those runs is then used as the whitelist
    for a further run, whose output must hold the same subread names.
    """
    ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
    ofn2 = tempfile.NamedTemporaryFile(suffix=".bam").name
    WHITELIST = set([
        'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/1650/1920_2155',
        'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7957/9554_9634',
        'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/1650/2200_3298'
    ])
    ZMWS = set([1650, 7957])

    def _run_with_whitelist(wl):
        # Run the filter, then check both the ZMWs and the subread names.
        rc = bamsieve.filter_reads(input_bam=SUBREADS3,
                                   output_bam=ofn,
                                   whitelist=wl,
                                   use_subreads=True)
        assert rc == 0
        with BamReader(ofn) as bam_out:
            have_zmws = set([rec.HoleNumber for rec in bam_out])
            assert have_zmws == ZMWS
            qnames = set([rec.qName for rec in bam_out])
            assert qnames == WHITELIST

    _run_with_whitelist(WHITELIST)
    _run_with_whitelist(",".join([str(x) for x in WHITELIST]))
    tmp_wl = tempfile.NamedTemporaryFile(suffix=".txt").name
    with open(tmp_wl, "w") as wl_out:
        wl_out.write("\n".join([str(x) for x in WHITELIST]))
    _run_with_whitelist(tmp_wl)
    # now with a BAM file as whitelist
    rc = bamsieve.filter_reads(input_bam=SUBREADS3,
                               output_bam=ofn2,
                               use_subreads=True,
                               whitelist=ofn)
    # Previously unchecked, unlike the runs above.
    assert rc == 0
    with BamReader(ofn) as bam_out:
        subreads = set([x.qName for x in bam_out])
    with BamReader(ofn2) as bam_out:
        subreads2 = set([x.qName for x in bam_out])
    assert subreads == subreads2
def test_alignment_identity_unindexed(self):
    """
    Check that the value of the 'identity' property is the same whether
    or not the .pbi index was used to calculate it.
    """
    indexed_path = data.getAlignedBam()
    # Copy only the BAM itself so that the copy has no index next to it.
    plain_path = tempfile.NamedTemporaryFile(suffix=".bam").name
    shutil.copyfile(indexed_path, plain_path)
    with IndexedBamReader(indexed_path) as bam_pbi, \
            BamReader(plain_path) as bam_noindex:
        with_index = np.array([rec.identity for rec in bam_pbi])
        without_index = np.array([rec.identity for rec in bam_noindex])
        assert (without_index == with_index).all()
def test_retrieve_read_group_properties(self):
    """
    Convert self.SAM_IN to BAM with pysam, then check the sequencing
    chemistry and movie name reported for each alignment.
    """
    sam_path = tempfile.NamedTemporaryFile(suffix=".sam").name
    bam_path = tempfile.NamedTemporaryFile(suffix=".bam").name
    with open(sam_path, "w") as sam_file:
        sam_file.write(self.SAM_IN)
    with pysam.AlignmentFile(sam_path) as sam_in:
        with pysam.AlignmentFile(bam_path, 'wb', template=sam_in) as bam_out:
            for record in sam_in:
                bam_out.write(record)
    expected_movies = ['movie1', 'm140906_231018_42161_c100676332550000001823129611271486_s1_p0']
    movie_names = []
    with BamReader(bam_path) as bam_in:
        for aln in bam_in:
            EQ(aln.sequencingChemistry, "P6-C4")
            movie_names.append(aln.movieName)
    EQ(movie_names, expected_movies)
def test_split_bam(self):
    """
    Split the DS1 BAM into 1-4 requested chunks and check, for each request,
    the number of chunks produced and that the chunk files concatenated in
    order hold the same read names as the input.
    """
    bam_file1 = self._get_bam_path(self.DS1)
    CHUNKS_IN = [1, 2, 3, 4]
    CHUNKS_OUT = [1, 2, 3, 3]
    for n_in, n_expected in zip(CHUNKS_IN, CHUNKS_OUT):
        nchunks = split_bam(bam_file1, n_in)
        assert nchunks == n_expected
        # Readers are opened as context managers so that no handle is left
        # open on a chunk file when _remove_all() runs below.
        with IndexedBamReader(bam_file1) as bam_in:
            records_in = [rec.qName for rec in bam_in]
        records_out = []
        for i in range(n_expected):
            with BamReader("reads.chunk%d.bam" % i) as bam_out:
                records_out.extend([rec.qName for rec in bam_out])
        assert records_in == records_out
        self._remove_all()
def test_retrieve_read_group_properties(self):
    """
    Convert self.SAM_IN to BAM, then check the sequencing chemistry and
    movie name reported for each alignment.
    """
    sam_path = tempfile.NamedTemporaryFile(suffix=".sam").name
    bam_path = tempfile.NamedTemporaryFile(suffix=".bam").name
    with open(sam_path, "w") as sam_file:
        sam_file.write(self.SAM_IN)
    with AlignmentFile(sam_path) as sam_in:
        with AlignmentFile(bam_path, 'wb', template=sam_in) as bam_out:
            for record in sam_in:
                bam_out.write(record)
    expected_movies = ['movie1', 'm64012_181222_192540']
    movie_names = []
    with BamReader(bam_path) as bam_in:
        for aln in bam_in:
            assert aln.sequencingChemistry == 'S/P4-C2/5.0-8M'
            movie_names.append(aln.movieName)
    assert movie_names == expected_movies
def test_integration(self):
    """
    Run the `bamsieve` executable: `--help` must exit with 0, and a
    whitelist run on SUBREADS2 must exit with 0 and leave only ZMW 8.
    """
    help_cmd = ["bamsieve", "--help"]
    # The help output is captured in scratch files and discarded.
    with tempfile.TemporaryFile() as stdout, \
            tempfile.TemporaryFile() as stderr:
        exit_code = subprocess.call(help_cmd, stdout=stdout, stderr=stderr)
        assert exit_code == 0
    output_path = tempfile.NamedTemporaryFile(suffix=".bam").name
    filter_cmd = ["bamsieve"]
    filter_cmd += ["--log-level", "ERROR"]
    filter_cmd += ["--whitelist", "8,233"]
    filter_cmd += [SUBREADS2, output_path]
    exit_code = subprocess.call(filter_cmd)
    assert exit_code == 0
    with BamReader(output_path) as reader:
        observed = {record.HoleNumber for record in reader}
        assert observed == set([8])
def run(dataset_file):
    """
    Count the movies and cells of a dataset and return them as a report.

    For an HdfSubreadSet each external file path is converted with
    path_to_movie(); for any other dataset type the "PU" field of every
    "RG" header entry of each BAM file is collected.  Cells are derived
    from the movies with movie_to_cell().  The result of
    spec.apply_view() on the assembled Report is returned.
    """
    with openDataSet(dataset_file) as ds:
        movies = set()
        for file_name in ds.toExternalFiles():
            if type(ds).__name__ != "HdfSubreadSet":
                with BamReader(file_name) as bam:
                    for read_group in bam.peer.header["RG"]:
                        movies.add(read_group["PU"])
            else:
                movies.add(path_to_movie(file_name))
        cells = {movie_to_cell(movie) for movie in movies}
        attrs = [
            Attribute(Constants.A_NCELLS, len(cells)),
            Attribute(Constants.A_NMOVIES, len(movies)),
        ]
        report = Report(Constants.R_ID, attributes=attrs)
        return spec.apply_view(report)
def test_combine_with_header(self):
    """
    Cut the DS1 BAM into a header file and three body chunks at fixed byte
    offsets, re-attach the header to each chunk with combine_with_header(),
    and check that the combined chunks read back the same read names, in
    order, as the original file.
    """
    bam_file = self._get_bam_path(self.DS1)
    bam_size = op.getsize(bam_file)
    # these are known boundaries for this particular input
    byte_ranges = [(396, 26575), (26575, 77209), (77209, bam_size)]
    with open(bam_file, "rb") as bam_in:
        with open("header.bam", "wb") as header_out:
            header_out.write(bam_in.read(396))
        for i, (start, end) in enumerate(byte_ranges):
            with open("tmp.chunk%d.bam" % i, "wb") as chunk_out:
                bam_in.seek(start)
                nbytes = end - start
                chunk_out.write(bam_in.read(nbytes))
    for i in range(len(byte_ranges)):
        combine_with_header("header.bam", "tmp.chunk%d.bam" % i,
                            "combined.chunk%d.bam" % i)
    # Readers are opened as context managers; they used to be left open.
    with IndexedBamReader(bam_file) as bam_reader:
        records_in = [rec.qName for rec in bam_reader]
    records_out = []
    for i in range(len(byte_ranges)):
        with BamReader("combined.chunk%d.bam" % i) as bam_out:
            records_out.extend([rec.qName for rec in bam_out])
    assert records_in == records_out
class ToolRunner(object):
    """
    The main driver class for the GenomicConsensus tool.

    `main` parses the options, peeks at the input alignment file, launches
    the worker and collector slaves, feeds reference chunks to the work
    queue and waits for the monitoring thread to finish.
    """
    def __init__(self):
        # Reader for the input alignment file (set by _readCmpH5Input).
        self._inCmpH5 = None
        # Queues shared with the slaves (set by _initQueues).
        self._resultsQueue = None
        self._workQueue = None
        # Worker slaves plus the result collector (set by _launchSlaves).
        self._slaves = None
        # Selected algorithm and its configuration.
        self._algorithm = None
        self._algorithmConfiguration = None
        # Set by abortWork; checked by _mainLoop and main.
        self._aborting = False

    def _setupLogging(self):
        """
        Configure the root logger: ERROR when options.quiet is set, otherwise
        DEBUG / INFO / WARNING depending on options.verbosity.
        """
        if options.quiet:
            logLevel = logging.ERROR
        elif options.verbosity >= 2:
            logLevel = logging.DEBUG
        elif options.verbosity == 1:
            logLevel = logging.INFO
        else:
            logLevel = logging.WARNING
        logFormat = '[%(levelname)s] %(message)s'
        logging.basicConfig(level=logLevel, format=logFormat)

    def _makeTemporaryDirectory(self):
        """
        Make a temp dir where we can stash things if necessary.

        The path is stored in options.temporaryDirectory.
        """
        options.temporaryDirectory = tempfile.mkdtemp(prefix="GenomicConsensus-", dir="/tmp")
        logging.info("Created temporary directory %s" % (options.temporaryDirectory,) )

    def _algorithmByName(self, name):
        """
        Return the algorithm for `name` ("plurality" or "quiver").

        Calls die() for any other name, or when the algorithm's
        `availability` reports that it is not usable.
        """
        if name=="plurality":
            algo = plurality
        elif name=="quiver":
            algo = quiver
        else:
            die("Failure: unrecognized algorithm %s" % name)
        isOK, msg = algo.availability
        if not isOK: die("Failure: %s" % msg)
        return algo

    def _launchSlaves(self):
        """
        Launch a group of worker processes (self._slaves), the queue
        (self._workQueue) that will be used to send them chunks of
        work, and the queue that will be used to receive back the
        results (self._resultsQueue).

        Additionally, launch the result collector process.
        """
        availableCpus = multiprocessing.cpu_count()
        logging.info("Available CPUs: %d" % (availableCpus,))
        logging.info("Requested workers: %d" % (options.numWorkers,))
        logging.info("Parallel Mode: %s" % ("Threaded" if options.threaded else "Process",))
        if (options.numWorkers > availableCpus):
            logging.warn("More workers requested (%d) than CPUs available (%d);"
                         " may result in suboptimal performance."
                         % (options.numWorkers, availableCpus))
        self._initQueues()

        # The algorithm supplies the worker and collector types for the
        # selected (threaded or process) mode.
        WorkerType, ResultCollectorType = self._algorithm.slaveFactories(options.threaded)
        self._slaves = []
        for i in xrange(options.numWorkers):
            p = WorkerType(self._workQueue, self._resultsQueue, self._algorithmConfiguration)
            self._slaves.append(p)
            p.start()
        logging.info("Launched compute slaves.")

        # The collector is appended to self._slaves as its last element.
        rcp = ResultCollectorType(self._resultsQueue, self._algorithmConfiguration)
        rcp.start()
        self._slaves.append(rcp)
        logging.info("Launched collector slave.")

    def _initQueues(self):
        """
        Create the work and results queues, each bounded by
        options.queueSize: Queue.Queue in threaded mode, otherwise
        multiprocessing.Queue.
        """
        if options.threaded:
            self._workQueue = Queue.Queue(options.queueSize)
            self._resultsQueue = Queue.Queue(options.queueSize)
        else:
            self._workQueue = multiprocessing.Queue(options.queueSize)
            self._resultsQueue = multiprocessing.Queue(options.queueSize)

    def _readCmpH5Input(self):
        """
        Read the CmpH5 input file into a CmpH5 object and
        store it as self._inCmpH5.

        When options.usingBam is set, a BamReader is opened instead.
        """
        fname = options.inputFilename
        if options.usingBam:
            self._inCmpH5 = BamReader(fname, options.referenceFilename)
        else:
            logging.debug("Before open on main process, # hdf5 objects open: %d" % h5py.h5f.get_obj_count())
            self._inCmpH5 = CmpH5Reader(fname)

    def _loadReference(self, cmpH5):
        """
        Load the reference from options.referenceFilename and resolve
        options.referenceWindowsAsString into options.referenceWindows.

        Calls die() when reference.loadFromFile reports an error.
        """
        logging.info("Loading reference")
        err = reference.loadFromFile(options.referenceFilename, cmpH5)
        if err:
            die("Error loading reference")
        # Grok the referenceWindow spec, if any.
        if options.referenceWindowsAsString is None:
            options.referenceWindows = ()
        elif options.skipUnrecognizedContigs:
            # This is a workaround for smrtpipe scatter/gather.
            # Window strings that fail to parse are silently dropped.
            options.referenceWindows = []
            for s in options.referenceWindowsAsString.split(","):
                try:
                    win = reference.stringToWindow(s)
                    options.referenceWindows.append(win)
                except:
                    pass
        else:
            options.referenceWindows = map(reference.stringToWindow,
                                           options.referenceWindowsAsString.split(","))

    def _checkFileCompatibility(self, cmpH5):
        """Call die() unless the input file is sorted and nonempty."""
        if not cmpH5.isSorted:
            die("Input CmpH5 file must be sorted.")
        if cmpH5.isEmpty:
            die("Input CmpH5 file must be nonempty.")

    def _shouldDisableChunkCache(self, cmpH5):
        """
        Return the result of datasetCountExceedsThreshold() against
        options.autoDisableHdf5ChunkCache for a CmpH5Reader; always False
        for any other reader type.
        """
        if isinstance(cmpH5, CmpH5Reader):
            threshold = options.autoDisableHdf5ChunkCache
            return datasetCountExceedsThreshold(cmpH5, threshold)
        else:
            return False

    def _configureAlgorithm(self, options, cmpH5):
        """
        Store the configuration produced by the selected algorithm for
        `options` and `cmpH5`; call die() on IncompatibleDataException.
        """
        assert self._algorithm != None
        try:
            self._algorithmConfiguration = self._algorithm.configure(options, cmpH5)
        except IncompatibleDataException as e:
            die("Failure: %s" % e.message)

    def _mainLoop(self):
        """
        Put every reference chunk on the work queue, followed by one None
        sentinel per worker.  Returns early, without writing sentinels, as
        soon as self._aborting is set.
        """
        # Split up the reference genome into chunks and farm out
        # each chunk as a unit of work.
        logging.debug("Starting main loop.")
        ids = reference.enumerateIds(options.referenceWindows)
        for _id in ids:
            if options.fancyChunking:
                chunks = reference.fancyEnumerateChunks(self._inCmpH5,
                                                        _id,
                                                        options.referenceChunkSize,
                                                        options.minCoverage,
                                                        options.minMapQV,
                                                        options.referenceWindows)
            else:
                chunks = reference.enumerateChunks(_id,
                                                   options.referenceChunkSize,
                                                   options.referenceWindows)
            for chunk in chunks:
                if self._aborting: return
                self._workQueue.put(chunk)

        # Write sentinels ("end-of-work-stream")
        for i in xrange(options.numWorkers):
            self._workQueue.put(None)

    def _printProfiles(self):
        """
        Print the top 20 entries, sorted by time, of every profile file
        found in options.temporaryDirectory.
        """
        for profile in glob.glob(os.path.join(options.temporaryDirectory, "*")):
            pstats.Stats(profile).sort_stats("time").print_stats(20)

    def _cleanup(self):
        """
        Remove options.temporaryDirectory.  Only done when profiling, which
        is also the only case in which `main` creates that directory.
        """
        if options.doProfiling:
            logging.info("Removing %s" % options.temporaryDirectory)
            shutil.rmtree(options.temporaryDirectory, ignore_errors=True)

    def _setupEvidenceDumpDirectory(self, directoryName):
        """Create `directoryName`, deleting any existing one first."""
        if os.path.exists(directoryName):
            shutil.rmtree(directoryName)
        os.makedirs(directoryName)

    @property
    def aborting(self):
        return self._aborting

    def abortWork(self, why):
        """
        Performs a shutdown of all the slave processes.  Called by the
        monitoring thread when a child process exits with a non-zero,
        or when a keyboard interrupt (Ctrl-C) is given. Not called
        during normal shutdown.
        """
        logging.error(why)
        self._aborting = True
        self._resultsQueue.close()
        self._workQueue.close()

    @property
    def slaves(self):
        return self._slaves

    def main(self):
        """
        Run the tool.  Returns -1 when the run was aborted, 0 otherwise.
        """
        # This looks scary but it's not.  Python uses reference
        # counting and has a secondary, optional garbage collector for
        # collecting garbage cycles.  Unfortunately when a cyclic GC
        # happens when a thread is calling cPickle.dumps, the
        # interpreter crashes sometimes.  See Bug 19704.  Since we
        # don't leak garbage cycles, disabling the cyclic GC is
        # essentially harmless.
        gc.disable()

        parseOptions()
        self._algorithm = self._algorithmByName(options.algorithm)
        self._setupLogging()
        random.seed(42)

        logging.info("h5py version: %s" % h5py.version.version)
        logging.info("hdf5 version: %s" % h5py.version.hdf5_version)
        logging.info("ConsensusCore version: %s" %
                     (consensusCoreVersion() or "ConsensusCore unavailable"))
        logging.info("Starting.")

        atexit.register(self._cleanup)
        if options.doProfiling:
            self._makeTemporaryDirectory()

        if options.usingBam:
            logging.warn("'fancyChunking' not yet available for BAM, disabling")
            options.fancyChunking = False

            # Peek at the bam file to build tables
            # NOTE(review): this branch does not assign
            # options.disableHdf5ChunkCache; presumably a default is set
            # elsewhere - confirm.
            with BamReader(options.inputFilename) as peekCmpH5:
                logging.info("Peeking at BAM file %s" % options.inputFilename)
                logging.info("Input BAM data: numAlnHits=%d" % len(peekCmpH5))
                resolveOptions(peekCmpH5)
                self._loadReference(peekCmpH5)
                self._checkFileCompatibility(peekCmpH5)
                self._configureAlgorithm(options, peekCmpH5)
        else:
            # We need to peek at the cmp.h5 file to build the
            # refGroupId<->refGroupFullName mapping, and to determine
            # whether the selected algorithm parameters (Quiver) are
            # compatible with the data.  But we then have to close the
            # file, and let the "real" open happen after the fork.
            with CmpH5Reader(options.inputFilename) as peekCmpH5:
                logging.info("Peeking at CmpH5 file %s" % options.inputFilename)
                logging.info("Input CmpH5 data: numAlnHits=%d" % len(peekCmpH5))
                resolveOptions(peekCmpH5)
                self._loadReference(peekCmpH5)
                self._checkFileCompatibility(peekCmpH5)
                self._configureAlgorithm(options, peekCmpH5)
                options.disableHdf5ChunkCache = self._shouldDisableChunkCache(peekCmpH5)
                if options.disableHdf5ChunkCache:
                    logging.info("Will disable HDF5 chunk cache (large number of datasets)")
            logging.debug("After peek, # hdf5 objects open: %d" % h5py.h5f.get_obj_count())

        if options.dumpEvidence:
            self._setupEvidenceDumpDirectory(options.evidenceDirectory)

        # The slaves are launched before the input file is opened on this
        # process.
        self._launchSlaves()
        self._readCmpH5Input()

        monitoringThread = threading.Thread(target=monitorSlaves, args=(self,))
        monitoringThread.start()

        try:
            if options.doProfiling:
                cProfile.runctx("self._mainLoop()",
                                globals=globals(),
                                locals=locals(),
                                filename=os.path.join(options.temporaryDirectory,
                                                      "profile-main.out"))
            elif options.doDebugging:
                if not options.threaded:
                    die("Debugging only works with -T (threaded) mode")
                logging.info("PID: %d", os.getpid())
                import ipdb
                with ipdb.launch_ipdb_on_exception():
                    self._mainLoop()
            else:
                self._mainLoop()
        except:
            # Any failure in the main loop is turned into an abort; the
            # traceback text becomes the logged reason.
            why = traceback.format_exc()
            self.abortWork(why)

        monitoringThread.join()

        if self._aborting:
            logging.error("Aborting")
            return -1
        else:
            logging.info("Finished.")

        if options.doProfiling:
            self._printProfiles()

        # close h5 file.
        self._inCmpH5.close()
        return 0
def setup_class(self):
    """
    Open the unaligned BAM and its bax.h5 counterpart and cache the first
    record of each for the tests.
    """
    self.bam = bam_reader = BamReader(data.getUnalignedBam())
    self.bax = bax_reader = BaxH5Reader(data.getBaxForBam())
    self.baxRead0 = next(bax_reader.subreads())
    self.bamRead0 = next(iter(bam_reader))
def __init__(self):
    """
    Open the unaligned BAM and its bax.h5 counterpart and cache the first
    record of each for the tests.
    """
    unaligned_bam_path = data.getUnalignedBam()
    self.bam = BamReader(unaligned_bam_path)
    bax_path = data.getBaxForBam()
    self.bax = BaxH5Reader(bax_path)
    first_subread = next(self.bax.subreads())
    self.baxRead0 = first_subread
    first_record = next(iter(self.bam))
    self.bamRead0 = first_record
def __init__(self):
    """Open the CCS BAM test file and keep the reader as self.f."""
    ccs_bam_path = data.getCCSBAM()
    self.f = BamReader(ccs_bam_path)
def test_mapped_bam_cigar_cref_skip(self):
    """
    Every record of the ITG-2283 cref-skip test file must yield an aligned
    read (not None).
    """
    fn = "/pbi/dept/secondary/siv/testdata/pbcore-unittest/data/ITG-2283-cref-skip.subreads.bam"
    # Opened as a context manager; the reader was previously never closed.
    with BamReader(fn) as bam:
        for rec in bam:
            assert rec.read(aligned=True) is not None
def setup_class(cls):
    """Open the CCS BAM test file and keep the reader as cls.f."""
    ccs_bam_path = data.getCCSBAM()
    cls.f = BamReader(ccs_bam_path)
def test_subreads_blacklist(self):
    """
    Filter SUBREADS3 by a blacklist of subread names supplied in several
    forms.

    The blacklist is passed as a set, as a comma-separated string and as a
    text file.  The BAM produced by those runs is then itself used as the
    blacklist: through filter_reads(), through the `bamsieve` executable,
    and through the executable with a dataset XML written from that BAM.
    Each of these must output exactly the originally blacklisted subreads.
    """
    ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
    ofn2 = tempfile.NamedTemporaryFile(suffix=".bam").name
    BLACKLIST = set([
        'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/1650/1920_2155',
        'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7957/9554_9634',
        'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/1650/2200_3298'
    ])

    def _run_with_blacklist(bl):
        # Run the filter, then check that no blacklisted name is left and
        # that the expected number of records remains.
        rc = bamsieve.filter_reads(input_bam=SUBREADS3,
                                   output_bam=ofn,
                                   blacklist=bl,
                                   use_subreads=True)
        assert rc == 0
        with BamReader(ofn) as bam_out:
            qnames = set([rec.qName for rec in bam_out])
            assert qnames & BLACKLIST == set()
            assert len([x for x in bam_out]) == 114

    _run_with_blacklist(BLACKLIST)
    _run_with_blacklist(",".join([str(x) for x in BLACKLIST]))
    tmp_wl = tempfile.NamedTemporaryFile(suffix=".txt").name
    with open(tmp_wl, "w") as wl_out:
        wl_out.write("\n".join([str(x) for x in BLACKLIST]))
    _run_with_blacklist(tmp_wl)
    # now with the BAM file we just made as blacklist
    EXPECTED_OUT = BLACKLIST
    rc = bamsieve.filter_reads(input_bam=SUBREADS3,
                               output_bam=ofn2,
                               use_subreads=True,
                               blacklist=ofn)
    # Previously unchecked, unlike the runs above.
    assert rc == 0
    with BamReader(ofn) as bam_out:
        subreads = set([x.qName for x in bam_out])
    with BamReader(ofn2) as bam_out:
        subreads2 = set([x.qName for x in bam_out])
    assert subreads & subreads2 == set()
    assert subreads2 == EXPECTED_OUT
    # now an integration test, because this is used in Cromwell workflow
    ofn3 = tempfile.NamedTemporaryFile(suffix=".subreads.bam").name
    args = ["bamsieve", "--subreads", "--blacklist", ofn, SUBREADS3, ofn3]
    # check_call raises CalledProcessError on a non-zero exit status, so its
    # return value does not need to be kept.
    subprocess.check_call(args)
    with BamReader(ofn3) as bam_out:
        subreads3 = set([x.qName for x in bam_out])
    assert subreads & subreads3 == set()
    assert subreads3 == EXPECTED_OUT
    # and again, with a dataset as input
    ds_tmp = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    with SubreadSet(ofn) as ds:
        ds.write(ds_tmp)
    ofn4 = tempfile.NamedTemporaryFile(suffix=".subreads.bam").name
    args = [
        "bamsieve", "--subreads", "--blacklist", ds_tmp, SUBREADS3, ofn4
    ]
    subprocess.check_call(args)
    with BamReader(ofn4) as bam_out:
        subreads4 = set([x.qName for x in bam_out])
    assert subreads & subreads4 == set()
    assert subreads4 == EXPECTED_OUT