def test_whitelist(self):
        ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
        WHITELIST = set([24962, 32901, 30983])

        def _run_with_whitelist(wl):
            rc = bamsieve.filter_reads(input_bam=SUBREADS3,
                                       output_bam=ofn,
                                       whitelist=wl)
            assert rc == 0
            with BamReader(ofn) as bam_out:
                have_zmws = set([rec.HoleNumber for rec in bam_out])
                assert have_zmws == WHITELIST

        _run_with_whitelist(WHITELIST)
        _run_with_whitelist(",".join([str(x) for x in list(WHITELIST)]))
        tmp_wl = tempfile.NamedTemporaryFile(suffix=".txt").name
        with open(tmp_wl, "w") as wl_out:
            wl_out.write("\n".join([str(x) for x in list(WHITELIST)]))
        _run_with_whitelist(tmp_wl)
        # now with a BAM file as whitelist
        rc = bamsieve.filter_reads(input_bam=SUBREADS3,
                                   output_bam=ofn,
                                   whitelist=SUBREADS4)
        with BamReader(ofn) as bam_out:
            assert 117 == len([rec for rec in bam_out])
Example #2
 def __read_bam(fn):
     # Prefer the indexed reader when a .pbi companion index exists.
     # Do not open the reader in a 'with' block here: returning from inside
     # the block would close the file before the caller could use it, so
     # the caller is responsible for closing the returned reader.
     if op.exists(fn + ".pbi"):
         return IndexedBamReader(fn)
     else:
         return BamReader(fn)
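A minimal usage sketch for the corrected helper, assuming it is defined at
module scope; the BAM path is a placeholder, and the caller owns (and must
close) the returned reader:

bam = __read_bam("example.subreads.bam")  # hypothetical input file
try:
    n_records = sum(1 for _ in bam)       # both reader types are iterable
finally:
    bam.close()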
Example #3
def main(parser):

    args = parser.parse_args()

    bam    = BamReader(args.ccsBAM)
    bcFofn = BarcodeH5Fofn(args.barcodeFofn)

    oFiles = {bc: FastqWriter('{dir}/{bc}.fastq'.format(dir=args.outDir, bc=bc))
              for bc in bcFofn.barcodeLabels}
    for rec in bam:
        try:
            lZmw = bcFofn.labeledZmwFromName(rec.readName)
        except KeyError:
            # catch ZMWs with no barcode and skip them
            continue
        if (rec.readScore >= args.minPredictedAccuracy
                and lZmw.averageScore >= args.minAvgBarcodeScore
                and rec.numPasses >= args.minNumPasses):
            header = rec.readName
            if args.extendedHeader:
                header +=  ' predictedAccuracy={predAcc} numPasses={numPasses} barcodeScore={bcScore}'\
                           .format(predAcc=rec.readScore, numPasses=rec.numPasses, bcScore=lZmw.averageScore)
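            # decode the FASTQ quality string (Phred+33) into integer scores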
            qual = [ord(q) - 33 for q in rec.peer.qual]
            writer = oFiles[bcFofn.barcodeLabels[lZmw.bestIdx]]
            writer.writeRecord(header, rec.read(aligned=False), qual)
    
    for f in oFiles.values():
        f.close()
Example #4
 def test_retrieve_read_group_properties(self):
     movie_names = []
     with BamReader(self.bam_file) as bam_in:
         for aln in bam_in:
             assert aln.sequencingChemistry == 'S/P4-C2/5.0-8M'
         movie_names.extend([rg.MovieName for rg in bam_in.readGroupTable])
     assert movie_names == ['movie1', 'm64012_181222_192540']
Example #5
 def _run_with_whitelist(wl):
     rc = bamsieve.filter_reads(input_bam=SUBREADS3,
                                output_bam=ofn,
                                whitelist=wl)
     assert rc == 0
     with BamReader(ofn) as bam_out:
         have_zmws = set([rec.HoleNumber for rec in bam_out])
         assert have_zmws == WHITELIST
Example #6
 def _run_with_whitelist(wl):
     rc = bamSieve.filter_reads(input_bam=SUBREADS3,
                                output_bam=ofn,
                                whitelist=wl)
     self.assertEqual(rc, 0)
     with BamReader(ofn) as bam_out:
         have_zmws = set([rec.HoleNumber for rec in bam_out])
         self.assertEqual(have_zmws, WHITELIST)
Example #7
 def _verify():
     with openDataSet(ofn, strict=False) as ds_out:
         ext_res = ds_out.externalResources[0]
         for bam_file in [ext_res.bam, ext_res.scraps]:
             with BamReader(bam_file) as bam:
                 zmws = set([rec.HoleNumber for rec in bam])
                 self.assertEqual(len(zmws), 1)
                 self.assertTrue(74056024 in zmws)
Example #8
 def _run_with_blacklist(bl):
     rc = bamSieve.filter_reads(input_bam=SUBREADS2,
                                output_bam=ofn,
                                blacklist=bl)
     self.assertEqual(rc, 0)
     with BamReader(ofn) as bam_out:
         have_zmws = set([rec.HoleNumber for rec in bam_out])
         self.assertEqual(have_zmws, set([9]))
Example #9
 def _run_with_blacklist(bl):
     rc = bamsieve.filter_reads(input_bam=SUBREADS2,
                                output_bam=ofn,
                                blacklist=bl)
     assert rc == 0
     with BamReader(ofn) as bam_out:
         have_zmws = set([rec.HoleNumber for rec in bam_out])
         assert have_zmws == set([9])
Example #10
class TestUnalignedBam(object):

    def __init__(self):
        self.bam = BamReader  (data.getUnalignedBam())
        self.bax = BaxH5Reader(data.getBaxForBam())

        self.baxRead0 = next(self.bax.subreads())
        self.bamRead0 = next(iter(self.bam))

    def testInvalidOperations(self):

        # These kinds of things presently work.  Do we want them to
        # fail?

        # with assert_raises(UnavailableFeature):
        #     self.bamRead0.isForwardStrand
        # with assert_raises(UnavailableFeature):
        #     self.bamRead0.tStart

        # attempts to get read aligned or oriented
        with assert_raises(UnavailableFeature):
            self.bamRead0.read(aligned=True, orientation="native")
        with assert_raises(UnavailableFeature):
            self.bamRead0.read(aligned=False, orientation="genomic")
        with assert_raises(UnavailableFeature):
            self.bamRead0.read()
        with assert_raises(UnavailableFeature):
            self.bamRead0.InsertionQV(aligned=True, orientation="native")
        with assert_raises(UnavailableFeature):
            self.bamRead0.InsertionQV(aligned=False, orientation="genomic")
        with assert_raises(UnavailableFeature):
            self.bamRead0.InsertionQV()

    def testReadAccess(self):
        EQ(self.bamRead0.read(aligned=False, orientation="native"), self.baxRead0.basecalls())

    def testQvAccess(self):
        AEQ(self.bamRead0.SubstitutionQV(aligned=False, orientation="native"), self.baxRead0.SubstitutionQV())
        AEQ(self.bamRead0.InsertionQV(aligned=False, orientation="native"),    self.baxRead0.InsertionQV())
        AEQ(self.bamRead0.DeletionTag(aligned=False, orientation="native"),    self.baxRead0.DeletionTag())

    def testZmwInfo(self):
        # WAT.  Need to make these accessors more uniform.  This is
        # totally crazy.
        EQ(self.bamRead0.HoleNumber, self.baxRead0.holeNumber)
        EQ(self.bamRead0.qStart,     self.baxRead0.readStart)
        EQ(self.bamRead0.qEnd,       self.baxRead0.readEnd)

    def testNames(self):
        EQ(self.bamRead0.queryName, self.baxRead0.readName)

    def testIpd(self):
        """Check that 'Ipd' feature is recognized correctly."""
        pfa = self.bam.baseFeaturesAvailable()
        EQ(pfa, frozenset(['Ipd', 'DeletionTag', 'MergeQV', 'SubstitutionQV',
                           'InsertionQV', 'DeletionQV']))
        ipd = self.bamRead0.IPD(aligned=False, orientation="native")
Example #11
def openAlignmentFile(fname, referenceFastaFname=None, sharedIndex=None):
    """
    Factory function to get a handle to a reader for an alignment file (BAM),
    not requiring index capability
    """
    if fname.endswith("cmp.h5"):
        raise_no_h5()
    elif fname.endswith("bam"):
        return BamReader(fname, referenceFastaFname)
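A hedged usage sketch for this factory; the file names are placeholders, and
an aligned BAM is assumed so that per-record identity is defined:

aln_file = openAlignmentFile("aligned.bam", referenceFastaFname="ref.fasta")
for aln in aln_file:
    print(aln.readName, aln.identity)  # identity requires an aligned record
aln_file.close()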
Example #12
 def test_percentage(self):
     ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
     rc = bamSieve.filter_reads(input_bam=SUBREADS3,
                                output_bam=ofn,
                                percentage=50,
                                seed=12345)
     self.assertEqual(rc, 0)
     with BamReader(ofn) as bam_out:
         zmws = set([rec.HoleNumber for rec in bam_out])
         self.assertEqual(len(zmws), 24)
Example #13
 def _run_with_blacklist(bl):
     rc = bamsieve.filter_reads(input_bam=SUBREADS3,
                                output_bam=ofn,
                                blacklist=bl,
                                use_subreads=True)
     assert rc == 0
     with BamReader(ofn) as bam_out:
         qnames = set([rec.qName for rec in bam_out])
         assert qnames & BLACKLIST == set()
         assert len([x for x in bam_out]) == 114
Example #14
 def test_count(self):
     ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
     rc = bamsieve.filter_reads(input_bam=SUBREADS3,
                                output_bam=ofn,
                                count=1,
                                seed=12345)
     assert rc == 0
     with BamReader(ofn) as bam_out:
         zmws = set([rec.HoleNumber for rec in bam_out])
         assert len(zmws) == 1
Example #15
 def test_sample_names(self):
     with BamReader(self.bam_file) as bam:
         samples = {
             rg.MovieName: rg.SampleName
             for rg in bam.readGroupTable
         }
         assert samples == {
             "movie1": "test_sample1",
             "m64012_181222_192540": "test_sample2"
         }
Example #16
 def test_barcodes(self):
     ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
     rc = bamsieve.filter_reads(input_bam=BARCODED,
                                output_bam=ofn,
                                whitelist=[0],
                                use_barcodes=True)
     with BamReader(ofn) as bam_out:
         zmws = set([rec.HoleNumber for rec in bam_out])
         assert len(zmws) == 1
         assert 74056024 in zmws
Example #17
 def _verify():
     with SubreadSet(ofn, strict=False) as ds_out:
         ext_res = ds_out.externalResources[0]
         assert ext_res.bam.endswith(".subreads.bam")
         assert ext_res.scraps.endswith(".scraps.bam")
         for bam_file in [ext_res.bam, ext_res.scraps]:
             with BamReader(bam_file) as bam:
                 zmws = set([rec.HoleNumber for rec in bam])
                 assert len(zmws) == 1
                 assert 74056024 in zmws
Example #18
 def _readCmpH5Input(self):
     """
     Read the CmpH5 input file into a CmpH5 object and
     store it as self._inCmpH5.
     """
     fname = options.inputFilename
     if options.usingBam:
         self._inCmpH5 = BamReader(fname, options.referenceFilename)
     else:
         logging.debug("Before open on main process, # hdf5 objects open: %d" % h5py.h5f.get_obj_count())
         self._inCmpH5 = CmpH5Reader(fname)
Example #19
 def test_count_overflow(self):
     ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
     with warnings.catch_warnings(record=True) as w:
         rc = bamsieve.filter_reads(input_bam=SUBREADS3,
                                    output_bam=ofn,
                                    count=100000,
                                    seed=12345)
         assert rc == 0
         assert len(w) == 1
         with BamReader(ofn) as bam_out:
             zmws = set([rec.HoleNumber for rec in bam_out])
             assert len(zmws) == 48
Example #20
def openAlignmentFile(fname, referenceFastaFname=None, sharedIndex=None):
    """
    Factory function to get a handle to a reader for an alignment file
    (cmp.h5 or BAM), not requiring index capability

    (A `sharedIndex` can still be passed for opening a cmp.h5, for which
    the index is compulsory.)
    """
    if fname.endswith("cmp.h5"):
        return CmpH5Reader(fname, sharedIndex=sharedIndex)
    elif fname.endswith("bam"):
        return BamReader(fname, referenceFastaFname)
Example #21
    def test_subreads_whitelist(self):
        ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
        ofn2 = tempfile.NamedTemporaryFile(suffix=".bam").name
        WHITELIST = set([
            'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/1650/1920_2155',
            'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7957/9554_9634',
            'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/1650/2200_3298'
        ])
        ZMWS = set([1650, 7957])

        def _run_with_whitelist(wl):
            rc = bamsieve.filter_reads(input_bam=SUBREADS3,
                                       output_bam=ofn,
                                       whitelist=wl,
                                       use_subreads=True)
            assert rc == 0
            with BamReader(ofn) as bam_out:
                have_zmws = set([rec.HoleNumber for rec in bam_out])
                assert have_zmws == ZMWS
                qnames = set([rec.qName for rec in bam_out])
                assert qnames == WHITELIST

        _run_with_whitelist(WHITELIST)
        _run_with_whitelist(",".join([str(x) for x in list(WHITELIST)]))
        tmp_wl = tempfile.NamedTemporaryFile(suffix=".txt").name
        with open(tmp_wl, "w") as wl_out:
            wl_out.write("\n".join([str(x) for x in list(WHITELIST)]))
        _run_with_whitelist(tmp_wl)
        # now with a BAM file as whitelist
        rc = bamsieve.filter_reads(input_bam=SUBREADS3,
                                   output_bam=ofn2,
                                   use_subreads=True,
                                   whitelist=ofn)

        with BamReader(ofn) as bam_out:
            subreads = set([x.qName for x in bam_out])
        with BamReader(ofn2) as bam_out:
            subreads2 = set([x.qName for x in bam_out])
        assert subreads == subreads2
Example #22
 def test_alignment_identity_unindexed(self):
     """
     Check that the value of the 'identity' property is the same whether
     or not the .pbi index was used to calculate it.
     """
     fn1 = data.getAlignedBam()
     fn2 = tempfile.NamedTemporaryFile(suffix=".bam").name
     shutil.copyfile(fn1, fn2)
     with IndexedBamReader(fn1) as bam_pbi:
         with BamReader(fn2) as bam_noindex:
             i1 = np.array([rec.identity for rec in bam_pbi])
             i2 = np.array([rec.identity for rec in bam_noindex])
             assert (i2 == i1).all()
Example #23
 def test_retrieve_read_group_properties(self):
     f1 = tempfile.NamedTemporaryFile(suffix=".sam").name
     f2 = tempfile.NamedTemporaryFile(suffix=".bam").name
     with open(f1, "w") as f:
         f.write(self.SAM_IN)
     with pysam.AlignmentFile(f1) as sam_in:
         with pysam.AlignmentFile(f2, 'wb', template=sam_in) as bam_out:
             for aln in sam_in:
                 bam_out.write(aln)
     movie_names = []
     with BamReader(f2) as bam_in:
         for aln in bam_in:
             EQ(aln.sequencingChemistry, "P6-C4")
             movie_names.append(aln.movieName)
     EQ(movie_names, ['movie1', 'm140906_231018_42161_c100676332550000001823129611271486_s1_p0'])
Example #24
 def test_split_bam(self):
     bam_file1 = self._get_bam_path(self.DS1)
     CHUNKS_IN = [1, 2, 3, 4]
     CHUNKS_OUT = [1, 2, 3, 3]
     for n_in, n_expected in zip(CHUNKS_IN, CHUNKS_OUT):
         nchunks = split_bam(bam_file1, n_in)
         assert nchunks == n_expected
         bam_in = IndexedBamReader(bam_file1)
         records_in = [rec.qName for rec in bam_in]
         records_out = []
         for i in range(n_expected):
             bam_out = BamReader("reads.chunk%d.bam" % i)
             records_out.extend([rec.qName for rec in bam_out])
         assert records_in == records_out
         self._remove_all()
Example #25
 def test_retrieve_read_group_properties(self):
     f1 = tempfile.NamedTemporaryFile(suffix=".sam").name
     f2 = tempfile.NamedTemporaryFile(suffix=".bam").name
     with open(f1, "w") as f:
         f.write(self.SAM_IN)
     with AlignmentFile(f1) as sam_in:
         with AlignmentFile(f2, 'wb', template=sam_in) as bam_out:
             for aln in sam_in:
                 bam_out.write(aln)
     movie_names = []
     with BamReader(f2) as bam_in:
         for aln in bam_in:
             assert aln.sequencingChemistry == 'S/P4-C2/5.0-8M'
             movie_names.append(aln.movieName)
     assert movie_names == ['movie1', 'm64012_181222_192540']
Example #26
 def test_integration(self):
     args = ["bamsieve", "--help"]
     with tempfile.TemporaryFile() as stdout:
         with tempfile.TemporaryFile() as stderr:
             rc = subprocess.call(args, stdout=stdout, stderr=stderr)
             assert rc == 0
     ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
     args = [
         "bamsieve", "--log-level", "ERROR", "--whitelist", "8,233",
         SUBREADS2, ofn
     ]
     rc = subprocess.call(args)
     assert rc == 0
     with BamReader(ofn) as bam_out:
         have_zmws = set([rec.HoleNumber for rec in bam_out])
         assert have_zmws == set([8])
Example #27
def run(dataset_file):
    """Reads in the input.fofn and counts movies and cells. Outputs in XML."""

    with openDataSet(dataset_file) as ds:
        movies = set()
        for file_name in ds.toExternalFiles():
            if type(ds).__name__ == "HdfSubreadSet":
                movies.add(path_to_movie(file_name))
            else:
                with BamReader(file_name) as bam:
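                    # each @RG PU (platform unit) tag records a movie name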
                    for rg in bam.peer.header["RG"]:
                        movies.add(rg["PU"])
        cells = set([movie_to_cell(movie) for movie in movies])
        ncells_attr = Attribute(Constants.A_NCELLS, len(cells))
        nmovies_attr = Attribute(Constants.A_NMOVIES, len(movies))
        attrs = [ncells_attr, nmovies_attr]
        report = Report(Constants.R_ID, attributes=attrs)
        return spec.apply_view(report)
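A hedged invocation sketch; the dataset path is a placeholder, and to_json()
is assumed from the pbcommand report model:

report = run("example.subreadset.xml")  # hypothetical dataset path
print(report.to_json())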
Example #28
 def test_combine_with_header(self):
     bam_file = self._get_bam_path(self.DS1)
     bam_size = op.getsize(bam_file)
     # see above - these are known boundaries for this particular input
     byte_ranges = [(396, 26575), (26575, 77209), (77209, bam_size)]
     with open(bam_file, "rb") as bam_in:
         with open("header.bam", "wb") as header_out:
             header_out.write(bam_in.read(396))
         for i, (start, end) in enumerate(byte_ranges):
             with open("tmp.chunk%d.bam" % i, "wb") as chunk_out:
                 bam_in.seek(start)
                 nbytes = end - start
                 chunk_out.write(bam_in.read(nbytes))
     for i in range(3):
         combine_with_header("header.bam", "tmp.chunk%d.bam" % i, "combined.chunk%d.bam" % i)
     bam_in = IndexedBamReader(bam_file)
     records_in = [rec.qName for rec in bam_in]
     records_out = []
     for i in range(3):
         bam_out = BamReader("combined.chunk%d.bam" % i)
         records_out.extend([rec.qName for rec in bam_out])
     assert records_in == records_out
Example #29
class ToolRunner(object):
    """
    The main driver class for the GenomicConsensus tool.
    """
    def __init__(self):
        self._inCmpH5 = None
        self._resultsQueue = None
        self._workQueue = None
        self._slaves = None
        self._algorithm = None
        self._algorithmConfiguration = None
        self._aborting = False

    def _setupLogging(self):
        if options.quiet:
            logLevel = logging.ERROR
        elif options.verbosity >= 2:
            logLevel = logging.DEBUG
        elif options.verbosity == 1:
            logLevel = logging.INFO
        else:
            logLevel = logging.WARNING
        logFormat = '[%(levelname)s] %(message)s'
        logging.basicConfig(level=logLevel, format=logFormat)

    def _makeTemporaryDirectory(self):
        """
        Make a temp dir where we can stash things if necessary.
        """
        options.temporaryDirectory = tempfile.mkdtemp(prefix="GenomicConsensus-", dir="/tmp")
        logging.info("Created temporary directory %s" % (options.temporaryDirectory,) )

    def _algorithmByName(self, name):
        if name=="plurality":
            algo = plurality
        elif name=="quiver":
            algo = quiver
        else:
            die("Failure: unrecognized algorithm %s" % name)
        isOK, msg = algo.availability
        if not isOK:
            die("Failure: %s" % msg)
        return algo

    def _launchSlaves(self):
        """
        Launch a group of worker processes (self._slaves), the queue
        (self._workQueue) that will be used to send them chunks of
        work, and the queue that will be used to receive back the
        results (self._resultsQueue).

        Additionally, launch the result collector process.
        """
        availableCpus = multiprocessing.cpu_count()
        logging.info("Available CPUs: %d" % (availableCpus,))
        logging.info("Requested workers: %d" % (options.numWorkers,))
        logging.info("Parallel Mode: %s" % ("Threaded" if options.threaded else "Process",))
        if (options.numWorkers > availableCpus):
            logging.warn("More workers requested (%d) than CPUs available (%d);"
                         " may result in suboptimal performance."
                         % (options.numWorkers, availableCpus))
        self._initQueues()

        WorkerType, ResultCollectorType = self._algorithm.slaveFactories(options.threaded)
        self._slaves = []
        for i in xrange(options.numWorkers):
            p = WorkerType(self._workQueue, self._resultsQueue, self._algorithmConfiguration)
            self._slaves.append(p)
            p.start()
        logging.info("Launched compute slaves.")

        rcp = ResultCollectorType(self._resultsQueue, self._algorithmConfiguration)
        rcp.start()
        self._slaves.append(rcp)
        logging.info("Launched collector slave.")

    def _initQueues(self):
        if options.threaded:
            self._workQueue = Queue.Queue(options.queueSize)
            self._resultsQueue = Queue.Queue(options.queueSize)
        else:
            self._workQueue = multiprocessing.Queue(options.queueSize)
            self._resultsQueue = multiprocessing.Queue(options.queueSize)

    def _readCmpH5Input(self):
        """
        Read the CmpH5 input file into a CmpH5 object and
        store it as self._inCmpH5.
        """
        fname = options.inputFilename
        if options.usingBam:
            self._inCmpH5 = BamReader(fname, options.referenceFilename)
        else:
            logging.debug("Before open on main process, # hdf5 objects open: %d" % h5py.h5f.get_obj_count())
            self._inCmpH5 = CmpH5Reader(fname)

    def _loadReference(self, cmpH5):
        logging.info("Loading reference")
        err = reference.loadFromFile(options.referenceFilename, cmpH5)
        if err:
            die("Error loading reference")
        # Grok the referenceWindow spec, if any.
        if options.referenceWindowsAsString is None:
            options.referenceWindows = ()
        elif options.skipUnrecognizedContigs:
            # This is a workaround for smrtpipe scatter/gather.
            options.referenceWindows = []
            for s in options.referenceWindowsAsString.split(","):
                try:
                    win = reference.stringToWindow(s)
                    options.referenceWindows.append(win)
                except Exception:
                    pass
        else:
            options.referenceWindows = map(reference.stringToWindow,
                                           options.referenceWindowsAsString.split(","))

    def _checkFileCompatibility(self, cmpH5):
        if not cmpH5.isSorted:
            die("Input CmpH5 file must be sorted.")
        if cmpH5.isEmpty:
            die("Input CmpH5 file must be nonempty.")

    def _shouldDisableChunkCache(self, cmpH5):
        if isinstance(cmpH5, CmpH5Reader):
            threshold = options.autoDisableHdf5ChunkCache
            return datasetCountExceedsThreshold(cmpH5, threshold)
        else:
            return False

    def _configureAlgorithm(self, options, cmpH5):
        assert self._algorithm is not None
        try:
            self._algorithmConfiguration = self._algorithm.configure(options, cmpH5)
        except IncompatibleDataException as e:
            die("Failure: %s" % e.message)

    def _mainLoop(self):
        # Split the reference genome into chunks and farm out each
        # chunk as a unit of work.
        logging.debug("Starting main loop.")
        ids = reference.enumerateIds(options.referenceWindows)
        for _id in ids:
            if options.fancyChunking:
                chunks = reference.fancyEnumerateChunks(self._inCmpH5,
                                                        _id,
                                                        options.referenceChunkSize,
                                                        options.minCoverage,
                                                        options.minMapQV,
                                                        options.referenceWindows)
            else:
                chunks = reference.enumerateChunks(_id,
                                                   options.referenceChunkSize,
                                                   options.referenceWindows)
            for chunk in chunks:
                if self._aborting: return
                self._workQueue.put(chunk)

        # Write sentinels ("end-of-work-stream")
        for i in xrange(options.numWorkers):
            self._workQueue.put(None)

    def _printProfiles(self):
        for profile in glob.glob(os.path.join(options.temporaryDirectory, "*")):
            pstats.Stats(profile).sort_stats("time").print_stats(20)

    def _cleanup(self):
        if options.doProfiling:
            logging.info("Removing %s" % options.temporaryDirectory)
            shutil.rmtree(options.temporaryDirectory, ignore_errors=True)

    def _setupEvidenceDumpDirectory(self, directoryName):
        if os.path.exists(directoryName):
            shutil.rmtree(directoryName)
        os.makedirs(directoryName)

    @property
    def aborting(self):
        return self._aborting

    def abortWork(self, why):
        """
        Performs a shutdown of all the slave processes.  Called by the
        monitoring thread when a child process exits with a non-zero
        status, or when a keyboard interrupt (Ctrl-C) is given. Not called
        during normal shutdown.
        """
        logging.error(why)
        self._aborting = True
        self._resultsQueue.close()
        self._workQueue.close()

    @property
    def slaves(self):
        return self._slaves

    def main(self):

        # This looks scary but it's not.  Python uses reference
        # counting and has a secondary, optional garbage collector for
        # collecting garbage cycles.  Unfortunately when a cyclic GC
        # happens when a thread is calling cPickle.dumps, the
        # interpreter crashes sometimes.  See Bug 19704.  Since we
        # don't leak garbage cycles, disabling the cyclic GC is
        # essentially harmless.
        gc.disable()

        parseOptions()
        self._algorithm = self._algorithmByName(options.algorithm)
        self._setupLogging()
        random.seed(42)

        logging.info("h5py version: %s" % h5py.version.version)
        logging.info("hdf5 version: %s" % h5py.version.hdf5_version)
        logging.info("ConsensusCore version: %s" %
                     (consensusCoreVersion() or "ConsensusCore unavailable"))
        logging.info("Starting.")

        atexit.register(self._cleanup)
        if options.doProfiling:
            self._makeTemporaryDirectory()

        if options.usingBam:
            logging.warn("'fancyChunking' not yet available for BAM, disabling")
            options.fancyChunking = False

            # Peek at the bam file to build tables
            with BamReader(options.inputFilename) as peekCmpH5:
                logging.info("Peeking at BAM file %s" % options.inputFilename)
                logging.info("Input BAM data: numAlnHits=%d" % len(peekCmpH5))
                resolveOptions(peekCmpH5)
                self._loadReference(peekCmpH5)
                self._checkFileCompatibility(peekCmpH5)
                self._configureAlgorithm(options, peekCmpH5)
        else:
            # We need to peek at the cmp.h5 file to build the
            # refGroupId<->refGroupFullName mapping, and to determine
            # whether the selected algorithm parameters (Quiver) are
            # compatible with the data.  But we then have to close the
            # file, and let the "real" open happen after the fork.
            with CmpH5Reader(options.inputFilename) as peekCmpH5:
                logging.info("Peeking at CmpH5 file %s" % options.inputFilename)
                logging.info("Input CmpH5 data: numAlnHits=%d" % len(peekCmpH5))
                resolveOptions(peekCmpH5)
                self._loadReference(peekCmpH5)
                self._checkFileCompatibility(peekCmpH5)
                self._configureAlgorithm(options, peekCmpH5)
                options.disableHdf5ChunkCache = self._shouldDisableChunkCache(peekCmpH5)
                if options.disableHdf5ChunkCache:
                    logging.info("Will disable HDF5 chunk cache (large number of datasets)")
            logging.debug("After peek, # hdf5 objects open: %d" % h5py.h5f.get_obj_count())

        if options.dumpEvidence:
            self._setupEvidenceDumpDirectory(options.evidenceDirectory)

        self._launchSlaves()
        self._readCmpH5Input()

        monitoringThread = threading.Thread(target=monitorSlaves, args=(self,))
        monitoringThread.start()

        try:
            if options.doProfiling:
                cProfile.runctx("self._mainLoop()",
                                globals=globals(),
                                locals=locals(),
                                filename=os.path.join(options.temporaryDirectory,
                                                      "profile-main.out"))

            elif options.doDebugging:
                if not options.threaded:
                    die("Debugging only works with -T (threaded) mode")
                logging.info("PID: %d", os.getpid())
                import ipdb
                with ipdb.launch_ipdb_on_exception():
                    self._mainLoop()

            else:
                self._mainLoop()
        except:
            why = traceback.format_exc()
            self.abortWork(why)

        monitoringThread.join()

        if self._aborting:
            logging.error("Aborting")
            return -1
        else:
            logging.info("Finished.")

        if options.doProfiling:
            self._printProfiles()

        # close h5 file.
        self._inCmpH5.close()
        return 0
Example #30
    def setup_class(self):
        self.bam = BamReader  (data.getUnalignedBam())
        self.bax = BaxH5Reader(data.getBaxForBam())

        self.baxRead0 = next(self.bax.subreads())
        self.bamRead0 = next(iter(self.bam))
Example #31
    def __init__(self):
        self.bam = BamReader  (data.getUnalignedBam())
        self.bax = BaxH5Reader(data.getBaxForBam())

        self.baxRead0 = next(self.bax.subreads())
        self.bamRead0 = next(iter(self.bam))
Example #32
 def __init__(self):
     self.f = BamReader(data.getCCSBAM())
Example #33
 def test_mapped_bam_cigar_cref_skip(self):
     fn = "/pbi/dept/secondary/siv/testdata/pbcore-unittest/data/ITG-2283-cref-skip.subreads.bam"
     bam = BamReader(fn)
     for rec in bam:
         assert rec.read(aligned=True) is not None
Example #34
 def setup_class(cls):
     cls.f = BamReader(data.getCCSBAM())
Example #35
    def test_subreads_blacklist(self):
        ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
        ofn2 = tempfile.NamedTemporaryFile(suffix=".bam").name
        BLACKLIST = set([
            'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/1650/1920_2155',
            'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7957/9554_9634',
            'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/1650/2200_3298'
        ])

        def _run_with_blacklist(bl):
            rc = bamsieve.filter_reads(input_bam=SUBREADS3,
                                       output_bam=ofn,
                                       blacklist=bl,
                                       use_subreads=True)
            assert rc == 0
            with BamReader(ofn) as bam_out:
                qnames = set([rec.qName for rec in bam_out])
                assert qnames & BLACKLIST == set()
                assert len([x for x in bam_out]) == 114

        _run_with_blacklist(BLACKLIST)
        _run_with_blacklist(",".join([str(x) for x in list(BLACKLIST)]))
        tmp_wl = tempfile.NamedTemporaryFile(suffix=".txt").name
        with open(tmp_wl, "w") as wl_out:
            wl_out.write("\n".join([str(x) for x in list(BLACKLIST)]))
        _run_with_blacklist(tmp_wl)

        # now with the BAM file we just made as blacklist
        EXPECTED_OUT = BLACKLIST
        rc = bamsieve.filter_reads(input_bam=SUBREADS3,
                                   output_bam=ofn2,
                                   use_subreads=True,
                                   blacklist=ofn)

        with BamReader(ofn) as bam_out:
            subreads = set([x.qName for x in bam_out])
        with BamReader(ofn2) as bam_out:
            subreads2 = set([x.qName for x in bam_out])
        assert subreads & subreads2 == set()
        assert subreads2 == EXPECTED_OUT

        # now an integration test, because this is used in Cromwell workflow
        ofn3 = tempfile.NamedTemporaryFile(suffix=".subreads.bam").name
        args = ["bamsieve", "--subreads", "--blacklist", ofn, SUBREADS3, ofn3]
        rc = subprocess.check_call(args)
        with BamReader(ofn3) as bam_out:
            subreads3 = set([x.qName for x in bam_out])
            assert subreads & subreads3 == set()
            assert subreads3 == EXPECTED_OUT
        # and again, with a dataset as input
        ds_tmp = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        with SubreadSet(ofn) as ds:
            ds.write(ds_tmp)
        ofn4 = tempfile.NamedTemporaryFile(suffix=".subreads.bam").name
        args = [
            "bamsieve", "--subreads", "--blacklist", ds_tmp, SUBREADS3, ofn4
        ]
        rc = subprocess.check_call(args)
        with BamReader(ofn4) as bam_out:
            subreads4 = set([x.qName for x in bam_out])
            assert subreads & subreads4 == set()
            assert subreads4 == EXPECTED_OUT