def main(parser):
    """Filter CCS reads by accuracy/barcode/pass thresholds and demultiplex
    them into one FASTQ file per barcode label.

    :param parser: configured argparse parser; expected args: ccsBAM,
        barcodeFofn, outDir, minPredictedAccuracy, minAvgBarcodeScore,
        minNumPasses, extendedHeader.
    """
    args = parser.parse_args()
    bam = BamReader(args.ccsBAM)
    bcFofn = BarcodeH5Fofn(args.barcodeFofn)
    # One FASTQ writer per barcode label
    oFiles = {bc: FastqWriter('{dir}/{bc}.fastq'.format(dir=args.outDir, bc=bc))
              for bc in bcFofn.barcodeLabels}
    for rec in bam:
        try:
            lZmw = bcFofn.labeledZmwFromName(rec.readName)
        except KeyError:
            # catch zmws with no barcode and skip
            continue
        if rec.readScore >= args.minPredictedAccuracy \
                and lZmw.averageScore >= args.minAvgBarcodeScore \
                and rec.numPasses >= args.minNumPasses:
            header = rec.readName
            if args.extendedHeader:
                header += ' predictedAccuracy={predAcc} numPasses={numPasses} barcodeScore={bcScore}'\
                    .format(predAcc=rec.readScore, numPasses=rec.numPasses,
                            bcScore=lZmw.averageScore)
            # FASTQ qualities are Phred+33 encoded
            qual = [ord(q) - 33 for q in rec.peer.qual]
            writer = oFiles[bcFofn.barcodeLabels[lZmw.bestIdx]]
            writer.writeRecord(header, rec.read(aligned=False), qual)
    # BUG FIX: original iterated 'oFile.values()' — an undefined name
    # (NameError) — so the writers were never closed.
    for f in oFiles.values():
        f.close()
def test_whitelist(self):
    """filter_reads with a ZMW whitelist given as a set, a comma-separated
    string, a text file, and finally a whole BAM file."""
    ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
    WHITELIST = set([24962, 32901, 30983])

    def _run_with_whitelist(wl):
        # Filtering must keep exactly the whitelisted ZMWs
        rc = bamsieve.filter_reads(input_bam=SUBREADS3, output_bam=ofn,
                                   whitelist=wl)
        assert rc == 0
        with BamReader(ofn) as bam_out:
            have_zmws = set([rec.HoleNumber for rec in bam_out])
            assert have_zmws == WHITELIST
    _run_with_whitelist(WHITELIST)
    _run_with_whitelist(",".join([str(x) for x in list(WHITELIST)]))
    tmp_wl = tempfile.NamedTemporaryFile(suffix=".txt").name
    with open(tmp_wl, "w") as wl_out:
        wl_out.write("\n".join([str(x) for x in list(WHITELIST)]))
    _run_with_whitelist(tmp_wl)
    # now with a BAM file as whitelist
    rc = bamsieve.filter_reads(input_bam=SUBREADS3, output_bam=ofn,
                               whitelist=SUBREADS4)
    assert rc == 0  # FIX: return code was previously assigned but ignored
    with BamReader(ofn) as bam_out:
        assert 117 == len([rec for rec in bam_out])
def test_retrieve_read_group_properties(self):
    """Every alignment reports the expected chemistry, and the read-group
    table lists both movie names in order."""
    with BamReader(self.bam_file) as reader:
        for aln in reader:
            assert aln.sequencingChemistry == 'S/P4-C2/5.0-8M'
        movie_names = [rg.MovieName for rg in reader.readGroupTable]
    assert movie_names == ['movie1', 'm64012_181222_192540']
def __read_bam(fn):
    """Open *fn* with an IndexedBamReader when a .pbi companion file
    exists, otherwise with a plain BamReader.

    BUG FIX: the original returned the reader from inside a ``with``
    block, so the context manager closed it before the caller could use
    it.  Return an open reader instead; the caller owns closing it.
    """
    if op.exists(fn + ".pbi"):
        return IndexedBamReader(fn)
    return BamReader(fn)
def _run_with_blacklist(bl):
    # Blacklist-filtering SUBREADS2 should leave only ZMW 9 behind.
    status = bamsieve.filter_reads(input_bam=SUBREADS2, output_bam=ofn,
                                   blacklist=bl)
    assert status == 0
    with BamReader(ofn) as reader:
        remaining = {rec.HoleNumber for rec in reader}
        assert remaining == {9}
def _run_with_whitelist(wl):
    # Only the whitelisted ZMWs may survive the filter.
    status = bamsieve.filter_reads(input_bam=SUBREADS3, output_bam=ofn,
                                   whitelist=wl)
    assert status == 0
    with BamReader(ofn) as reader:
        surviving = {rec.HoleNumber for rec in reader}
        assert surviving == WHITELIST
def _run_with_whitelist(wl):
    # unittest-style variant: whitelist filtering keeps exactly WHITELIST.
    result = bamSieve.filter_reads(input_bam=SUBREADS3, output_bam=ofn,
                                   whitelist=wl)
    self.assertEqual(result, 0)
    with BamReader(ofn) as reader:
        zmws_seen = {rec.HoleNumber for rec in reader}
        self.assertEqual(zmws_seen, WHITELIST)
def _verify():
    # Both the subreads and scraps BAMs must contain only ZMW 74056024.
    with openDataSet(ofn, strict=False) as ds_out:
        ext_res = ds_out.externalResources[0]
        for bam_file in (ext_res.bam, ext_res.scraps):
            with BamReader(bam_file) as reader:
                zmws = {rec.HoleNumber for rec in reader}
            self.assertEqual(len(zmws), 1)
            self.assertTrue(74056024 in zmws)
def _run_with_blacklist(bl):
    # unittest-style variant: after blacklisting, only ZMW 9 remains.
    result = bamSieve.filter_reads(input_bam=SUBREADS2, output_bam=ofn,
                                   blacklist=bl)
    self.assertEqual(result, 0)
    with BamReader(ofn) as reader:
        remaining = {rec.HoleNumber for rec in reader}
        self.assertEqual(remaining, set([9]))
def openAlignmentFile(fname, referenceFastaFname=None, sharedIndex=None):
    """
    Factory function to get a handle to a reader for an alignment file
    (BAM), not requiring index capability
    """
    if fname.endswith("cmp.h5"):
        raise_no_h5()
    elif fname.endswith("bam"):
        return BamReader(fname, referenceFastaFname)
    else:
        # FIX: the original fell through and implicitly returned None for
        # unrecognized extensions, deferring the failure to the caller.
        raise ValueError("Unsupported alignment file type: %s" % fname)
def test_count(self):
    """Requesting count=1 with a fixed seed yields exactly one ZMW."""
    out_bam = tempfile.NamedTemporaryFile(suffix=".bam").name
    status = bamsieve.filter_reads(input_bam=SUBREADS3, output_bam=out_bam,
                                   count=1, seed=12345)
    assert status == 0
    with BamReader(out_bam) as reader:
        unique_zmws = {rec.HoleNumber for rec in reader}
        assert len(unique_zmws) == 1
def _verify():
    # The output dataset must point at one subreads/scraps pair, each
    # holding only ZMW 74056024.
    with SubreadSet(ofn, strict=False) as ds_out:
        ext_res = ds_out.externalResources[0]
        assert ext_res.bam.endswith(".subreads.bam")
        assert ext_res.scraps.endswith(".scraps.bam")
        for bam_file in (ext_res.bam, ext_res.scraps):
            with BamReader(bam_file) as reader:
                zmws = {rec.HoleNumber for rec in reader}
            assert len(zmws) == 1
            assert 74056024 in zmws
def test_barcodes(self):
    """Whitelisting barcode 0 keeps only ZMW 74056024."""
    ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
    rc = bamsieve.filter_reads(input_bam=BARCODED, output_bam=ofn,
                               whitelist=[0], use_barcodes=True)
    assert rc == 0  # FIX: return code was assigned but never checked
    with BamReader(ofn) as bam_out:
        zmws = set([rec.HoleNumber for rec in bam_out])
        assert len(zmws) == 1
        assert 74056024 in zmws
def _run_with_blacklist(bl):
    # No blacklisted qName may survive, and exactly 114 subreads remain.
    rc = bamsieve.filter_reads(input_bam=SUBREADS3, output_bam=ofn,
                               blacklist=bl, use_subreads=True)
    assert rc == 0
    with BamReader(ofn) as bam_out:
        # FIX: collect in a single pass instead of iterating the streaming
        # reader twice (once for qnames, once for the count).
        qnames = [rec.qName for rec in bam_out]
    assert set(qnames) & BLACKLIST == set()
    assert len(qnames) == 114
def test_sample_names(self):
    """Each read group maps its movie name to the expected biosample."""
    expected = {
        "movie1": "test_sample1",
        "m64012_181222_192540": "test_sample2",
    }
    with BamReader(self.bam_file) as reader:
        observed = {rg.MovieName: rg.SampleName
                    for rg in reader.readGroupTable}
    assert observed == expected
def test_percentage(self):
    """A 50% sample of SUBREADS3 with a fixed seed yields 24 ZMWs."""
    out_bam = tempfile.NamedTemporaryFile(suffix=".bam").name
    status = bamSieve.filter_reads(input_bam=SUBREADS3, output_bam=out_bam,
                                   percentage=50, seed=12345)
    self.assertEqual(status, 0)
    with BamReader(out_bam) as reader:
        unique_zmws = {rec.HoleNumber for rec in reader}
        self.assertEqual(len(unique_zmws), 24)
def test_count_overflow(self):
    """Asking for more reads than exist warns once and returns all 48 ZMWs."""
    out_bam = tempfile.NamedTemporaryFile(suffix=".bam").name
    with warnings.catch_warnings(record=True) as caught:
        status = bamsieve.filter_reads(input_bam=SUBREADS3, output_bam=out_bam,
                                       count=100000, seed=12345)
        assert status == 0
        assert len(caught) == 1
    with BamReader(out_bam) as reader:
        unique_zmws = {rec.HoleNumber for rec in reader}
        assert len(unique_zmws) == 48
def openAlignmentFile(fname, referenceFastaFname=None, sharedIndex=None):
    """
    Factory function to get a handle to a reader for an alignment file
    (cmp.h5 or BAM), not requiring index capability

    (A `sharedIndex` can still be passed for opening a cmp.h5, for
    which the index is compulsory.)
    """
    if fname.endswith("cmp.h5"):
        return CmpH5Reader(fname, sharedIndex=sharedIndex)
    elif fname.endswith("bam"):
        return BamReader(fname, referenceFastaFname)
    else:
        # FIX: the original fell through and implicitly returned None for
        # unrecognized extensions, deferring the failure to the caller.
        raise ValueError("Unsupported alignment file type: %s" % fname)
def test_alignment_identity_unindexed(self):
    """
    Check that the value of the 'identity' property is the same whether
    or not the .pbi index was used to calculate it.
    """
    indexed_fn = data.getAlignedBam()
    plain_fn = tempfile.NamedTemporaryFile(suffix=".bam").name
    shutil.copyfile(indexed_fn, plain_fn)
    with IndexedBamReader(indexed_fn) as with_index:
        with BamReader(plain_fn) as without_index:
            identities_indexed = np.array([rec.identity for rec in with_index])
            identities_plain = np.array([rec.identity for rec in without_index])
    assert (identities_plain == identities_indexed).all()
def test_subreads_whitelist(self):
    """Whitelisting individual subreads by qName, supplied as a set, a
    comma-separated string, a text file, and a BAM file."""
    ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
    ofn2 = tempfile.NamedTemporaryFile(suffix=".bam").name
    WHITELIST = set([
        'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/1650/1920_2155',
        'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7957/9554_9634',
        'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/1650/2200_3298'
    ])
    ZMWS = set([1650, 7957])

    def _run_with_whitelist(wl):
        rc = bamsieve.filter_reads(input_bam=SUBREADS3, output_bam=ofn,
                                   whitelist=wl, use_subreads=True)
        assert rc == 0
        # FIX: collect both sets in a single pass instead of iterating the
        # streaming reader twice.
        have_zmws = set()
        qnames = set()
        with BamReader(ofn) as bam_out:
            for rec in bam_out:
                have_zmws.add(rec.HoleNumber)
                qnames.add(rec.qName)
        assert have_zmws == ZMWS
        assert qnames == WHITELIST
    _run_with_whitelist(WHITELIST)
    _run_with_whitelist(",".join([str(x) for x in list(WHITELIST)]))
    tmp_wl = tempfile.NamedTemporaryFile(suffix=".txt").name
    with open(tmp_wl, "w") as wl_out:
        wl_out.write("\n".join([str(x) for x in list(WHITELIST)]))
    _run_with_whitelist(tmp_wl)
    # now with a BAM file as whitelist
    rc = bamsieve.filter_reads(input_bam=SUBREADS3, output_bam=ofn2,
                               use_subreads=True, whitelist=ofn)
    assert rc == 0  # FIX: return code was previously assigned but ignored
    with BamReader(ofn) as bam_out:
        subreads = set([x.qName for x in bam_out])
    with BamReader(ofn2) as bam_out:
        subreads2 = set([x.qName for x in bam_out])
    assert subreads == subreads2
def test_split_bam(self):
    """split_bam produces the expected number of chunks and the chunks,
    concatenated, preserve the original record order."""
    bam_file1 = self._get_bam_path(self.DS1)
    CHUNKS_IN = [1, 2, 3, 4]
    CHUNKS_OUT = [1, 2, 3, 3]  # this input cannot be split more than 3 ways
    for n_in, n_expected in zip(CHUNKS_IN, CHUNKS_OUT):
        nchunks = split_bam(bam_file1, n_in)
        assert nchunks == n_expected
        # FIX: close readers when done (original leaked open file handles)
        with IndexedBamReader(bam_file1) as bam_in:
            records_in = [rec.qName for rec in bam_in]
        records_out = []
        for i in range(n_expected):
            with BamReader("reads.chunk%d.bam" % i) as bam_out:
                records_out.extend([rec.qName for rec in bam_out])
        assert records_in == records_out
        self._remove_all()
def test_retrieve_read_group_properties(self):
    """Round-trip a SAM through BAM and verify chemistry and movie names."""
    sam_path = tempfile.NamedTemporaryFile(suffix=".sam").name
    bam_path = tempfile.NamedTemporaryFile(suffix=".bam").name
    with open(sam_path, "w") as f:
        f.write(self.SAM_IN)
    # Convert the SAM fixture to BAM, reusing its header as the template.
    with pysam.AlignmentFile(sam_path) as sam_in:
        with pysam.AlignmentFile(bam_path, 'wb', template=sam_in) as bam_out:
            for aln in sam_in:
                bam_out.write(aln)
    movie_names = []
    with BamReader(bam_path) as bam_in:
        for aln in bam_in:
            EQ(aln.sequencingChemistry, "P6-C4")
            movie_names.append(aln.movieName)
    EQ(movie_names, ['movie1', 'm140906_231018_42161_c100676332550000001823129611271486_s1_p0'])
def test_retrieve_read_group_properties(self):
    """Round-trip a SAM through BAM and verify chemistry and movie names."""
    sam_path = tempfile.NamedTemporaryFile(suffix=".sam").name
    bam_path = tempfile.NamedTemporaryFile(suffix=".bam").name
    with open(sam_path, "w") as f:
        f.write(self.SAM_IN)
    # Convert the SAM fixture to BAM, reusing its header as the template.
    with AlignmentFile(sam_path) as sam_in:
        with AlignmentFile(bam_path, 'wb', template=sam_in) as bam_out:
            for aln in sam_in:
                bam_out.write(aln)
    movie_names = []
    with BamReader(bam_path) as bam_in:
        for aln in bam_in:
            assert aln.sequencingChemistry == 'S/P4-C2/5.0-8M'
            movie_names.append(aln.movieName)
    assert movie_names == ['movie1', 'm64012_181222_192540']
def test_integration(self):
    """End-to-end check of the bamsieve command-line interface."""
    # --help must exit cleanly
    with tempfile.TemporaryFile() as stdout:
        with tempfile.TemporaryFile() as stderr:
            rc = subprocess.call(["bamsieve", "--help"],
                                 stdout=stdout, stderr=stderr)
            assert rc == 0
    ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
    cmd = [
        "bamsieve",
        "--log-level", "ERROR",
        "--whitelist", "8,233",
        SUBREADS2,
        ofn,
    ]
    assert subprocess.call(cmd) == 0
    with BamReader(ofn) as reader:
        have_zmws = {rec.HoleNumber for rec in reader}
    assert have_zmws == {8}
def run(dataset_file):
    """Reads in the input.fofn and counts movies and cells. Outputs in XML."""
    with openDataSet(dataset_file) as ds:
        # FIX: dropped dead 'movies = None' that was immediately overwritten
        movies = set()
        for file_name in ds.toExternalFiles():
            if type(ds).__name__ == "HdfSubreadSet":
                movies.add(path_to_movie(file_name))
            else:
                # BAM-backed datasets: movie names come from the RG/PU
                # header tags.
                with BamReader(file_name) as bam:
                    for rg in bam.peer.header["RG"]:
                        movies.add(rg["PU"])
        cells = set([movie_to_cell(movie) for movie in movies])
        ncells_attr = Attribute(Constants.A_NCELLS, len(cells))
        nmovies_attr = Attribute(Constants.A_NMOVIES, len(movies))
        attrs = [ncells_attr, nmovies_attr]
        report = Report(Constants.R_ID, attributes=attrs)
        return spec.apply_view(report)
def test_combine_with_header(self):
    """combine_with_header reattaches the BAM header to raw byte-range
    chunks such that all records are preserved in order."""
    bam_file = self._get_bam_path(self.DS1)
    bam_size = op.getsize(bam_file)
    # see above - these are known boundaries for this particular input
    byte_ranges = [(396, 26575), (26575, 77209), (77209, bam_size)]
    with open(bam_file, "rb") as bam_in:
        with open("header.bam", "wb") as header_out:
            header_out.write(bam_in.read(396))
        for i, (start, end) in enumerate(byte_ranges):
            with open("tmp.chunk%d.bam" % i, "wb") as chunk_out:
                bam_in.seek(start)
                nbytes = end - start
                chunk_out.write(bam_in.read(nbytes))
    for i in range(3):
        combine_with_header("header.bam", "tmp.chunk%d.bam" % i,
                            "combined.chunk%d.bam" % i)
    # FIX: close readers when done (original leaked open file handles)
    with IndexedBamReader(bam_file) as bam_in:
        records_in = [rec.qName for rec in bam_in]
    records_out = []
    for i in range(3):
        with BamReader("combined.chunk%d.bam" % i) as bam_out:
            records_out.extend([rec.qName for rec in bam_out])
    assert records_in == records_out
# Open the CCS BAM test fixture once; self.f is shared by this class's tests.
def __init__(self): self.f = BamReader(data.getCCSBAM())
def test_mapped_bam_cigar_cref_skip(self):
    """Reads with CIGAR reference-skip (N) ops must still yield an aligned
    read string (regression test for ITG-2283)."""
    fn = "/pbi/dept/secondary/siv/testdata/pbcore-unittest/data/ITG-2283-cref-skip.subreads.bam"
    # FIX: close the reader when done (original leaked the file handle)
    with BamReader(fn) as bam:
        for rec in bam:
            assert rec.read(aligned=True) is not None
# pytest class-level fixture: open the CCS BAM fixture once for all tests.
def setup_class(cls): cls.f = BamReader(data.getCCSBAM())
def test_subreads_blacklist(self):
    """Blacklisting individual subreads by qName, supplied as a set, a
    comma-separated string, a text file, a BAM file, and a dataset XML;
    also exercises the command-line interface used by Cromwell."""
    ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
    ofn2 = tempfile.NamedTemporaryFile(suffix=".bam").name
    BLACKLIST = set([
        'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/1650/1920_2155',
        'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7957/9554_9634',
        'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/1650/2200_3298'
    ])

    def _run_with_blacklist(bl):
        rc = bamsieve.filter_reads(input_bam=SUBREADS3, output_bam=ofn,
                                   blacklist=bl, use_subreads=True)
        assert rc == 0
        with BamReader(ofn) as bam_out:
            # FIX: collect in a single pass instead of iterating the
            # streaming reader twice.
            qnames = [rec.qName for rec in bam_out]
        assert set(qnames) & BLACKLIST == set()
        assert len(qnames) == 114
    _run_with_blacklist(BLACKLIST)
    _run_with_blacklist(",".join([str(x) for x in list(BLACKLIST)]))
    tmp_wl = tempfile.NamedTemporaryFile(suffix=".txt").name
    with open(tmp_wl, "w") as wl_out:
        wl_out.write("\n".join([str(x) for x in list(BLACKLIST)]))
    _run_with_blacklist(tmp_wl)
    # now with the BAM file we just made as blacklist
    EXPECTED_OUT = BLACKLIST
    rc = bamsieve.filter_reads(input_bam=SUBREADS3, output_bam=ofn2,
                               use_subreads=True, blacklist=ofn)
    assert rc == 0  # FIX: return code was previously assigned but ignored
    with BamReader(ofn) as bam_out:
        subreads = set([x.qName for x in bam_out])
    with BamReader(ofn2) as bam_out:
        subreads2 = set([x.qName for x in bam_out])
    assert subreads & subreads2 == set()
    assert subreads2 == EXPECTED_OUT
    # now an integration test, because this is used in Cromwell workflow
    ofn3 = tempfile.NamedTemporaryFile(suffix=".subreads.bam").name
    args = ["bamsieve", "--subreads", "--blacklist", ofn, SUBREADS3, ofn3]
    # check_call raises on non-zero exit; the return value is always 0
    subprocess.check_call(args)
    with BamReader(ofn3) as bam_out:
        subreads3 = set([x.qName for x in bam_out])
    assert subreads & subreads3 == set()
    assert subreads3 == EXPECTED_OUT
    # and again, with a dataset as input
    ds_tmp = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    with SubreadSet(ofn) as ds:
        ds.write(ds_tmp)
    ofn4 = tempfile.NamedTemporaryFile(suffix=".subreads.bam").name
    args = [
        "bamsieve",
        "--subreads", "--blacklist", ds_tmp, SUBREADS3, ofn4
    ]
    subprocess.check_call(args)
    with BamReader(ofn4) as bam_out:
        subreads4 = set([x.qName for x in bam_out])
    assert subreads & subreads4 == set()
    assert subreads4 == EXPECTED_OUT