Beispiel #1
0
def main(parser):
    """
    Write CCS reads from a BAM file into one FASTQ file per barcode.

    A FASTQ writer named ``<outDir>/<barcode label>.fastq`` is opened for
    every label in the barcode FOFN.  Every BAM record whose read name has a
    labeled ZMW and which meets the ``minPredictedAccuracy``,
    ``minAvgBarcodeScore`` and ``minNumPasses`` thresholds is written to the
    writer of the barcode selected by the ZMW's ``bestIdx``.

    :param parser: argument parser whose ``parse_args()`` result provides
                   ``ccsBAM``, ``barcodeFofn``, ``outDir``,
                   ``minPredictedAccuracy``, ``minAvgBarcodeScore``,
                   ``minNumPasses`` and ``extendedHeader``.
    """
    args = parser.parse_args()

    bam    = BamReader(args.ccsBAM)
    bcFofn = BarcodeH5Fofn(args.barcodeFofn)

    oFiles = {}
    try:
        # Writers are opened inside the try so that any that were created
        # are still closed if a later one fails to open.
        for bc in bcFofn.barcodeLabels:
            oFiles[bc] = FastqWriter('{dir}/{bc}.fastq'.format(dir=args.outDir, bc=bc))

        for rec in bam:
            try:
                lZmw = bcFofn.labeledZmwFromName(rec.readName)
            except KeyError:
                # catch zmws with no barcode and skip
                continue
            if not (rec.readScore         >= args.minPredictedAccuracy
                    and lZmw.averageScore >= args.minAvgBarcodeScore
                    and rec.numPasses     >= args.minNumPasses):
                continue
            header = rec.readName
            if args.extendedHeader:
                header +=  ' predictedAccuracy={predAcc} numPasses={numPasses} barcodeScore={bcScore}'\
                           .format(predAcc=rec.readScore, numPasses=rec.numPasses, bcScore=lZmw.averageScore)
            # Turn the ASCII quality string (offset 33) into integer values
            qual = [ ord(q)-33  for q in rec.peer.qual ]
            writer = oFiles[bcFofn.barcodeLabels[lZmw.bestIdx]]
            writer.writeRecord(header, rec.read(aligned=False), qual)
    finally:
        # The original looped over the undefined name `oFile` here, which
        # raised NameError and left every writer open.
        for f in oFiles.values():
            f.close()
    def test_whitelist(self):
        """
        Filter SUBREADS3 by a ZMW whitelist given as a set, as a
        comma-separated string, as a text file and finally as a BAM file.
        """
        ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
        WHITELIST = set([24962, 32901, 30983])

        def _run_with_whitelist(wl):
            # Run the filter and check that exactly the whitelisted ZMWs
            # are present in the output.
            status = bamsieve.filter_reads(input_bam=SUBREADS3,
                                           output_bam=ofn,
                                           whitelist=wl)
            assert status == 0
            with BamReader(ofn) as filtered:
                observed = set()
                for record in filtered:
                    observed.add(record.HoleNumber)
                assert observed == WHITELIST

        # whitelist passed directly as a set
        _run_with_whitelist(WHITELIST)

        # whitelist passed as a comma-separated string
        as_csv = ",".join(str(zmw) for zmw in list(WHITELIST))
        _run_with_whitelist(as_csv)

        # whitelist passed as a text file, one ZMW per line
        tmp_wl = tempfile.NamedTemporaryFile(suffix=".txt").name
        as_lines = "\n".join(str(zmw) for zmw in list(WHITELIST))
        with open(tmp_wl, "w") as wl_out:
            wl_out.write(as_lines)
        _run_with_whitelist(tmp_wl)

        # now with a BAM file as whitelist
        rc = bamsieve.filter_reads(input_bam=SUBREADS3,
                                   output_bam=ofn,
                                   whitelist=SUBREADS4)
        with BamReader(ofn) as filtered:
            n_records = len(list(filtered))
            assert 117 == n_records
 def test_retrieve_read_group_properties(self):
     """
     Every record must report the expected sequencing chemistry, and the
     read group table must list the two expected movie names in order.
     """
     with BamReader(self.bam_file) as reader:
         assert all(record.sequencingChemistry == 'S/P4-C2/5.0-8M'
                    for record in reader)
         observed_movies = [group.MovieName for group in reader.readGroupTable]
     assert observed_movies == ['movie1', 'm64012_181222_192540']
Beispiel #4
0
 def __read_bam(fn):
     """
     Open `fn` with IndexedBamReader when a `<fn>.pbi` file exists next to
     it, otherwise with plain BamReader, and return the reader object.
     """
     # NOTE(review): the reader is returned from inside the `with` block, so
     # its __exit__ has already run when the caller receives it - presumably
     # that closes the file; confirm callers only use data that survives it.
     reader_class = IndexedBamReader if op.exists(fn + ".pbi") else BamReader
     with reader_class(fn) as bam_in:
         return bam_in
 def _run_with_blacklist(bl):
     """Filter SUBREADS2 with blacklist `bl`; only ZMW 9 may remain."""
     status = bamsieve.filter_reads(input_bam=SUBREADS2,
                                    output_bam=ofn,
                                    blacklist=bl)
     assert status == 0
     with BamReader(ofn) as filtered:
         remaining = {record.HoleNumber for record in filtered}
         assert remaining == {9}
 def _run_with_whitelist(wl):
     """Filter SUBREADS3 with whitelist `wl` and compare ZMWs to WHITELIST."""
     status = bamsieve.filter_reads(input_bam=SUBREADS3,
                                    output_bam=ofn,
                                    whitelist=wl)
     assert status == 0
     with BamReader(ofn) as filtered:
         observed = set()
         for record in filtered:
             observed.add(record.HoleNumber)
         assert observed == WHITELIST
Beispiel #7
0
 def _run_with_whitelist(wl):
     """Filter SUBREADS3 with whitelist `wl` and compare ZMWs to WHITELIST."""
     status = bamSieve.filter_reads(input_bam=SUBREADS3,
                                    output_bam=ofn,
                                    whitelist=wl)
     self.assertEqual(status, 0)
     with BamReader(ofn) as filtered:
         observed = {record.HoleNumber for record in filtered}
         self.assertEqual(observed, WHITELIST)
Beispiel #8
0
 def _verify():
     """
     Open the output dataset and check that both the BAM and the scraps
     file of its first external resource contain only ZMW 74056024.
     """
     with openDataSet(ofn, strict=False) as ds_out:
         resource = ds_out.externalResources[0]
         for bam_path in (resource.bam, resource.scraps):
             with BamReader(bam_path) as reader:
                 hole_numbers = {record.HoleNumber for record in reader}
                 self.assertEqual(len(hole_numbers), 1)
                 self.assertIn(74056024, hole_numbers)
Beispiel #9
0
 def _run_with_blacklist(bl):
     """Filter SUBREADS2 with blacklist `bl`; only ZMW 9 may remain."""
     status = bamSieve.filter_reads(input_bam=SUBREADS2,
                                    output_bam=ofn,
                                    blacklist=bl)
     self.assertEqual(status, 0)
     with BamReader(ofn) as filtered:
         remaining = set()
         for record in filtered:
             remaining.add(record.HoleNumber)
         self.assertEqual(remaining, {9})
Beispiel #10
0
def openAlignmentFile(fname, referenceFastaFname=None, sharedIndex=None):
    """
    Factory function returning an unindexed reader for an alignment file.

    Names ending in "bam" yield a ``BamReader`` built from `fname` and
    `referenceFastaFname`.  Names ending in "cmp.h5" are handed to
    ``raise_no_h5()``.  Any other name yields ``None``.  `sharedIndex` is
    accepted but not used here.
    """
    # A name cannot end in both suffixes, so the order of the checks is free.
    if fname.endswith("bam"):
        return BamReader(fname, referenceFastaFname)
    if fname.endswith("cmp.h5"):
        raise_no_h5()
    return None
Beispiel #11
0
 def test_count(self):
     """Filtering with count=1 and a fixed seed must keep exactly one ZMW."""
     out_path = tempfile.NamedTemporaryFile(suffix=".bam").name
     status = bamsieve.filter_reads(input_bam=SUBREADS3,
                                    output_bam=out_path,
                                    count=1,
                                    seed=12345)
     assert status == 0
     with BamReader(out_path) as filtered:
         hole_numbers = {record.HoleNumber for record in filtered}
         assert len(hole_numbers) == 1
Beispiel #12
0
 def _verify():
     """
     Open the output SubreadSet, check the file name suffixes of its first
     external resource, and check that both BAM files contain only ZMW
     74056024.
     """
     with SubreadSet(ofn, strict=False) as ds_out:
         resource = ds_out.externalResources[0]
         assert resource.bam.endswith(".subreads.bam")
         assert resource.scraps.endswith(".scraps.bam")
         for bam_path in (resource.bam, resource.scraps):
             with BamReader(bam_path) as reader:
                 hole_numbers = set()
                 for record in reader:
                     hole_numbers.add(record.HoleNumber)
                 assert len(hole_numbers) == 1
                 assert 74056024 in hole_numbers
Beispiel #13
0
 def test_barcodes(self):
     """
     Filter the barcoded input with whitelist [0] and use_barcodes=True;
     the output must contain only ZMW 74056024.
     """
     ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
     rc = bamsieve.filter_reads(input_bam=BARCODED,
                                output_bam=ofn,
                                whitelist=[0],
                                use_barcodes=True)
     # Check the return code like the sibling tests do, so a failed filter
     # is reported here rather than as a confusing read error below.
     assert rc == 0
     with BamReader(ofn) as bam_out:
         zmws = set([rec.HoleNumber for rec in bam_out])
         assert len(zmws) == 1
         assert 74056024 in zmws
Beispiel #14
0
 def _run_with_blacklist(bl):
     """
     Filter SUBREADS3 by subread blacklist `bl`; none of the BLACKLIST names
     may remain and the output must hold 114 records.
     """
     status = bamsieve.filter_reads(input_bam=SUBREADS3,
                                    output_bam=ofn,
                                    blacklist=bl,
                                    use_subreads=True)
     assert status == 0
     with BamReader(ofn) as filtered:
         surviving_names = {record.qName for record in filtered}
         assert surviving_names & BLACKLIST == set()
         # second pass over the reader to count the records
         n_records = len(list(filtered))
         assert n_records == 114
 def test_sample_names(self):
     """The read group table must map each movie name to its sample name."""
     expected = {
         "movie1": "test_sample1",
         "m64012_181222_192540": "test_sample2"
     }
     with BamReader(self.bam_file) as bam:
         observed = dict(
             (group.MovieName, group.SampleName)
             for group in bam.readGroupTable
         )
         assert observed == expected
Beispiel #16
0
 def test_percentage(self):
     """Filtering with percentage=50 and a fixed seed must keep 24 ZMWs."""
     out_path = tempfile.NamedTemporaryFile(suffix=".bam").name
     status = bamSieve.filter_reads(input_bam=SUBREADS3,
                                    output_bam=out_path,
                                    percentage=50,
                                    seed=12345)
     self.assertEqual(status, 0)
     with BamReader(out_path) as filtered:
         hole_numbers = {record.HoleNumber for record in filtered}
         self.assertEqual(len(hole_numbers), 24)
Beispiel #17
0
 def test_count_overflow(self):
     """
     Requesting count=100000 must succeed, emit exactly one warning, and
     leave 48 ZMWs in the output.
     """
     out_path = tempfile.NamedTemporaryFile(suffix=".bam").name
     with warnings.catch_warnings(record=True) as caught:
         status = bamsieve.filter_reads(input_bam=SUBREADS3,
                                        output_bam=out_path,
                                        count=100000,
                                        seed=12345)
         assert status == 0
         assert len(caught) == 1
         with BamReader(out_path) as filtered:
             hole_numbers = {record.HoleNumber for record in filtered}
             assert len(hole_numbers) == 48
Beispiel #18
0
def openAlignmentFile(fname, referenceFastaFname=None, sharedIndex=None):
    """
    Factory function returning a reader for an alignment file (cmp.h5 or
    BAM) without requiring index capability.

    Names ending in "cmp.h5" yield a ``CmpH5Reader``; `sharedIndex` is passed
    on to it and is used for nothing else.  Names ending in "bam" yield a
    ``BamReader`` built from `fname` and `referenceFastaFname`.  Any other
    name yields ``None``.
    """
    # A name cannot end in both suffixes, so the order of the checks is free.
    if fname.endswith("bam"):
        return BamReader(fname, referenceFastaFname)
    if fname.endswith("cmp.h5"):
        return CmpH5Reader(fname, sharedIndex=sharedIndex)
    return None
 def test_alignment_identity_unindexed(self):
     """
     The 'identity' values read through IndexedBamReader must equal those
     read through plain BamReader from a copy of the same BAM file (only
     the BAM itself is copied to the temporary path).
     """
     indexed_path = data.getAlignedBam()
     copy_path = tempfile.NamedTemporaryFile(suffix=".bam").name
     shutil.copyfile(indexed_path, copy_path)
     with IndexedBamReader(indexed_path) as bam_pbi, \
             BamReader(copy_path) as bam_noindex:
         identity_pbi = np.array([record.identity for record in bam_pbi])
         identity_plain = np.array([record.identity for record in bam_noindex])
         assert (identity_plain == identity_pbi).all()
Beispiel #20
0
    def test_subreads_whitelist(self):
        """
        Filter SUBREADS3 by a subread-name whitelist given as a set, as a
        comma-separated string and as a text file, then reuse the filtered
        BAM itself as whitelist and expect the same subreads.
        """
        ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
        ofn2 = tempfile.NamedTemporaryFile(suffix=".bam").name
        WHITELIST = set([
            'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/1650/1920_2155',
            'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7957/9554_9634',
            'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/1650/2200_3298'
        ])
        ZMWS = set([1650, 7957])

        def _run_with_whitelist(wl):
            # Run the filter, then check both the ZMWs and the subread
            # names found in the output (two passes over the reader).
            status = bamsieve.filter_reads(input_bam=SUBREADS3,
                                           output_bam=ofn,
                                           whitelist=wl,
                                           use_subreads=True)
            assert status == 0
            with BamReader(ofn) as filtered:
                observed_zmws = {record.HoleNumber for record in filtered}
                assert observed_zmws == ZMWS
                observed_names = {record.qName for record in filtered}
                assert observed_names == WHITELIST

        def _subread_names(bam_path):
            # Collect the set of record names stored in a BAM file.
            with BamReader(bam_path) as reader:
                return set([record.qName for record in reader])

        # whitelist passed directly as a set
        _run_with_whitelist(WHITELIST)

        # whitelist passed as a comma-separated string
        as_csv = ",".join(str(name) for name in list(WHITELIST))
        _run_with_whitelist(as_csv)

        # whitelist passed as a text file, one name per line
        tmp_wl = tempfile.NamedTemporaryFile(suffix=".txt").name
        as_lines = "\n".join(str(name) for name in list(WHITELIST))
        with open(tmp_wl, "w") as wl_out:
            wl_out.write(as_lines)
        _run_with_whitelist(tmp_wl)

        # now with a BAM file as whitelist
        rc = bamsieve.filter_reads(input_bam=SUBREADS3,
                                   output_bam=ofn2,
                                   use_subreads=True,
                                   whitelist=ofn)

        subreads = _subread_names(ofn)
        subreads2 = _subread_names(ofn2)
        assert subreads == subreads2
Beispiel #21
0
 def test_split_bam(self):
     """
     Split the DS1 BAM into 1-4 requested chunks, check the number of chunks
     actually produced, and check that the chunk files together hold the
     same record names, in the same order, as the input.
     """
     bam_file1 = self._get_bam_path(self.DS1)
     CHUNKS_IN = [1, 2, 3, 4]
     CHUNKS_OUT = [1, 2, 3, 3]
     for n_in, n_expected in zip(CHUNKS_IN, CHUNKS_OUT):
         nchunks = split_bam(bam_file1, n_in)
         assert nchunks == n_expected
         # Readers are opened as context managers so that no file handle
         # is still open when _remove_all() runs below.
         with IndexedBamReader(bam_file1) as bam_in:
             records_in = [rec.qName for rec in bam_in]
         records_out = []
         for i in range(n_expected):
             with BamReader("reads.chunk%d.bam" % i) as bam_out:
                 records_out.extend([rec.qName for rec in bam_out])
         assert records_in == records_out
         self._remove_all()
 def test_retrieve_read_group_properties(self):
     """
     Write SAM_IN to a temporary SAM file, convert it to BAM with pysam, and
     check the chemistry and movie name reported for every record.
     """
     sam_path = tempfile.NamedTemporaryFile(suffix=".sam").name
     bam_path = tempfile.NamedTemporaryFile(suffix=".bam").name
     with open(sam_path, "w") as sam_text:
         sam_text.write(self.SAM_IN)
     # copy every record from the SAM file into a BAM with the same header
     with pysam.AlignmentFile(sam_path) as sam_in, \
             pysam.AlignmentFile(bam_path, 'wb', template=sam_in) as bam_out:
         for record in sam_in:
             bam_out.write(record)
     observed_movies = []
     with BamReader(bam_path) as bam_in:
         for record in bam_in:
             EQ(record.sequencingChemistry, "P6-C4")
             observed_movies.append(record.movieName)
     EQ(observed_movies, ['movie1', 'm140906_231018_42161_c100676332550000001823129611271486_s1_p0'])
 def test_retrieve_read_group_properties(self):
     """
     Write SAM_IN to a temporary SAM file, convert it to BAM with
     AlignmentFile, and check the chemistry and movie name reported for
     every record.
     """
     sam_path = tempfile.NamedTemporaryFile(suffix=".sam").name
     bam_path = tempfile.NamedTemporaryFile(suffix=".bam").name
     with open(sam_path, "w") as sam_text:
         sam_text.write(self.SAM_IN)
     # copy every record from the SAM file into a BAM with the same header
     with AlignmentFile(sam_path) as sam_in, \
             AlignmentFile(bam_path, 'wb', template=sam_in) as bam_out:
         for record in sam_in:
             bam_out.write(record)
     observed_movies = []
     with BamReader(bam_path) as bam_in:
         for record in bam_in:
             assert record.sequencingChemistry == 'S/P4-C2/5.0-8M'
             observed_movies.append(record.movieName)
     assert observed_movies == ['movie1', 'm64012_181222_192540']
Beispiel #24
0
 def test_integration(self):
     """
     Run the `bamsieve` command line: `--help` must exit with 0, and
     filtering SUBREADS2 with whitelist "8,233" must leave only ZMW 8.
     """
     help_cmd = ["bamsieve", "--help"]
     # capture the help output in throwaway files
     with tempfile.TemporaryFile() as stdout, \
             tempfile.TemporaryFile() as stderr:
         help_status = subprocess.call(help_cmd, stdout=stdout, stderr=stderr)
         assert help_status == 0

     out_path = tempfile.NamedTemporaryFile(suffix=".bam").name
     filter_cmd = [
         "bamsieve", "--log-level", "ERROR", "--whitelist", "8,233",
         SUBREADS2, out_path
     ]
     filter_status = subprocess.call(filter_cmd)
     assert filter_status == 0
     with BamReader(out_path) as filtered:
         observed = {record.HoleNumber for record in filtered}
         assert observed == {8}
Beispiel #25
0
def run(dataset_file):
    """
    Build a report with the number of distinct movies and cells in a dataset.

    For an ``HdfSubreadSet`` the movie is derived from each external file
    name with ``path_to_movie``; for any other dataset type each external
    file is opened as BAM and the ``PU`` value of every ``RG`` header entry
    is collected.  Cells are derived from the movies with ``movie_to_cell``.

    :param dataset_file: value passed to ``openDataSet``
    :return: result of ``spec.apply_view`` on the assembled report
    """
    with openDataSet(dataset_file) as ds:
        # The dataset type cannot change while iterating, so decide once.
        is_hdf_subreads = type(ds).__name__ == "HdfSubreadSet"
        movies = set()
        for file_name in ds.toExternalFiles():
            if is_hdf_subreads:
                movies.add(path_to_movie(file_name))
            else:
                with BamReader(file_name) as bam:
                    for rg in bam.peer.header["RG"]:
                        movies.add(rg["PU"])
        cells = {movie_to_cell(movie) for movie in movies}
        ncells_attr = Attribute(Constants.A_NCELLS, len(cells))
        nmovies_attr = Attribute(Constants.A_NMOVIES, len(movies))
        attrs = [ncells_attr, nmovies_attr]
        report = Report(Constants.R_ID, attributes=attrs)
        return spec.apply_view(report)
Beispiel #26
0
 def test_combine_with_header(self):
     """
     Cut the DS1 BAM into a header file and three body chunks at known byte
     offsets, recombine each chunk with the header, and check that the
     combined files together hold the same record names, in the same
     order, as the input.
     """
     bam_file = self._get_bam_path(self.DS1)
     bam_size = op.getsize(bam_file)
     # these are known boundaries for this particular input
     byte_ranges = [(396, 26575), (26575, 77209), (77209, bam_size)]
     with open(bam_file, "rb") as bam_in:
         # the first 396 bytes are written out as the header file
         with open("header.bam", "wb") as header_out:
             header_out.write(bam_in.read(396))
         for i, (start, end) in enumerate(byte_ranges):
             with open("tmp.chunk%d.bam" % i, "wb") as chunk_out:
                 bam_in.seek(start)
                 nbytes = end - start
                 chunk_out.write(bam_in.read(nbytes))
     for i in range(3):
         combine_with_header("header.bam", "tmp.chunk%d.bam" % i, "combined.chunk%d.bam" % i)
     # Readers are opened as context managers so their file handles are
     # released as soon as the record names have been collected.
     with IndexedBamReader(bam_file) as bam_in:
         records_in = [rec.qName for rec in bam_in]
     records_out = []
     for i in range(3):
         with BamReader("combined.chunk%d.bam" % i) as bam_out:
             records_out.extend([rec.qName for rec in bam_out])
     assert records_in == records_out
 def __init__(self):
     """Open a BamReader on the value of data.getCCSBAM() and keep it as self.f."""
     ccs_bam = data.getCCSBAM()
     self.f = BamReader(ccs_bam)
 def test_mapped_bam_cigar_cref_skip(self):
     """Every record of the ITG-2283 test BAM must yield an aligned read."""
     fn = "/pbi/dept/secondary/siv/testdata/pbcore-unittest/data/ITG-2283-cref-skip.subreads.bam"
     # Open the reader as a context manager so the file handle is released
     # even when an assertion fails part-way through the file.
     with BamReader(fn) as bam:
         for rec in bam:
             assert rec.read(aligned=True) is not None
 def setup_class(cls):
     """Open a BamReader on the value of data.getCCSBAM() and keep it as cls.f."""
     ccs_bam = data.getCCSBAM()
     cls.f = BamReader(ccs_bam)
Beispiel #30
0
    def test_subreads_blacklist(self):
        """
        Filter SUBREADS3 by a subread-name blacklist given as a set, as a
        comma-separated string and as a text file; then use the filtered
        BAM (and a dataset wrapping it) as blacklist, both through the API
        and through the `bamsieve` command line, expecting exactly the
        originally blacklisted subreads to come out.
        """
        ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
        ofn2 = tempfile.NamedTemporaryFile(suffix=".bam").name
        BLACKLIST = set([
            'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/1650/1920_2155',
            'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7957/9554_9634',
            'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/1650/2200_3298'
        ])

        def _run_with_blacklist(bl):
            # Run the filter, then check that no blacklisted name survived
            # and that 114 records remain (two passes over the reader).
            status = bamsieve.filter_reads(input_bam=SUBREADS3,
                                           output_bam=ofn,
                                           blacklist=bl,
                                           use_subreads=True)
            assert status == 0
            with BamReader(ofn) as filtered:
                surviving_names = {record.qName for record in filtered}
                assert surviving_names & BLACKLIST == set()
                n_records = len(list(filtered))
                assert n_records == 114

        def _subread_names(bam_path):
            # Collect the set of record names stored in a BAM file.
            with BamReader(bam_path) as reader:
                return set([record.qName for record in reader])

        # blacklist passed directly as a set
        _run_with_blacklist(BLACKLIST)

        # blacklist passed as a comma-separated string
        as_csv = ",".join(str(name) for name in list(BLACKLIST))
        _run_with_blacklist(as_csv)

        # blacklist passed as a text file, one name per line
        tmp_wl = tempfile.NamedTemporaryFile(suffix=".txt").name
        as_lines = "\n".join(str(name) for name in list(BLACKLIST))
        with open(tmp_wl, "w") as wl_out:
            wl_out.write(as_lines)
        _run_with_blacklist(tmp_wl)

        # now with the BAM file we just made as blacklist
        EXPECTED_OUT = BLACKLIST
        rc = bamsieve.filter_reads(input_bam=SUBREADS3,
                                   output_bam=ofn2,
                                   use_subreads=True,
                                   blacklist=ofn)

        subreads = _subread_names(ofn)
        subreads2 = _subread_names(ofn2)
        assert subreads & subreads2 == set()
        assert subreads2 == EXPECTED_OUT

        # now an integration test, because this is used in Cromwell workflow
        ofn3 = tempfile.NamedTemporaryFile(suffix=".subreads.bam").name
        bam_cmd = ["bamsieve", "--subreads", "--blacklist", ofn, SUBREADS3, ofn3]
        rc = subprocess.check_call(bam_cmd)
        subreads3 = _subread_names(ofn3)
        assert subreads & subreads3 == set()
        assert subreads3 == EXPECTED_OUT

        # and again, with a dataset as input
        ds_tmp = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        with SubreadSet(ofn) as ds:
            ds.write(ds_tmp)
        ofn4 = tempfile.NamedTemporaryFile(suffix=".subreads.bam").name
        dataset_cmd = [
            "bamsieve", "--subreads", "--blacklist", ds_tmp, SUBREADS3, ofn4
        ]
        rc = subprocess.check_call(dataset_cmd)
        subreads4 = _subread_names(ofn4)
        assert subreads & subreads4 == set()
        assert subreads4 == EXPECTED_OUT