def run(subreadset, fofn):
    """Split *subreadset* into chunked SubreadSet XML files written to the
    current working directory, and record their paths (one per line) in the
    file-of-filenames *fofn*.

    :param subreadset: path to the input SubreadSet XML
    :param fofn: output path for the list of chunk XML paths
    """
    dir_name = os.getcwd()
    maxChunks = 0
    dset = SubreadSet(subreadset, strict=True)
    fns = dset.toFofn()
    import pprint
    log.info('resources in {!r}:\n{}'.format(subreadset, pprint.pformat(fns)))
    nrecs = len(dset)
    # HG with 70x coverage => 200G bases total
    ts = 50000  # @ 20k/read => 1G bases, ~300MB .gz => ~200 chunks for Human
    ts = 500000  # @ 20k/read => 10G bases, ~3GB .gz => ~20 chunks for Human
    # and we expect about 7-10min per chunk.
    # Guard against small inputs: nrecs // ts is 0 when nrecs < ts, and
    # dset.split() must be asked for at least one chunk.
    chunks = max(1, nrecs // ts)
    log.info('num_chunks={:g} ({:g} / {:g})'.format(chunks, nrecs, ts))
    log.info('Splitting with dset.split(zmws=False, chunks={}, ignoreSubDatasets=True, maxChunks={},)'.format(
        chunks, maxChunks))
    dset_chunks = dset.split(zmws=False, chunks=chunks,
                             ignoreSubDatasets=True, maxChunks=maxChunks,
                             updateCounts=False,
                             #targetSize=1,
                             breakContigs=True)
    chunk_fns = []
    for i, dset in enumerate(dset_chunks):
        chunk_name = 'chunk_{:03d}.subreadset.xml'.format(i)  # TODO: 02
        chunk_fn = os.path.join(dir_name, chunk_name)
        dset.updateCounts()
        dset.write(chunk_fn, validate=False)  # , relPaths=True
        chunk_fns.append(chunk_fn)
    with open(fofn, 'w') as ofs:
        for fn in chunk_fns:
            ofs.write('{}\n'.format(fn))
    log.info('Wrote {} chunks into "{}"'.format(len(dset_chunks), fofn))
def run_bam_to_bam(subread_set_file, barcode_set_file, output_file_name,
                   nproc=1):
    """Run the external 'bam2bam' tool to barcode every subreads+scraps BAM
    pair in a SubreadSet, then write a new SubreadSet XML referencing the
    barcoded outputs.

    :param subread_set_file: path to the input SubreadSet XML; each external
        resource must provide both a subreads BAM and a scraps BAM
    :param barcode_set_file: path to a BarcodeSet XML backed by one FASTA
    :param output_file_name: path of the SubreadSet XML to write; barcoded
        BAMs are created in the same directory
    :param nproc: thread count passed to bam2bam (both -j and -b)
    :returns: 0 on success, otherwise the bam2bam exit code
    :raises NotImplementedError: if the BarcodeSet has multiple FASTA files
    :raises TypeError: if an input resource lacks a scraps BAM
    """
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError(
            "Multi-FASTA BarcodeSet input is not supported.")
    barcode_fasta = bc.toExternalFiles()[0]
    with SubreadSet(subread_set_file) as ds:
        # TODO(nechols)(2016-03-15): replace with BarcodedSubreadSet
        ds_new = SubreadSet(strict=True)
        for ext_res in ds.externalResources:
            subreads_bam = ext_res.bam
            scraps_bam = ext_res.scraps
            assert subreads_bam is not None
            if scraps_bam is None:
                raise TypeError("The input SubreadSet must include scraps.")
            # Output prefix lives next to the output XML, named after the
            # input BAM with a '_barcoded' suffix.
            new_prefix = op.join(
                op.dirname(output_file_name),
                re.sub(".subreads.bam", "_barcoded",
                       op.basename(subreads_bam)))
            # BAM paths in the XML may be relative to the XML's location.
            if not op.isabs(subreads_bam):
                subreads_bam = op.join(op.dirname(subread_set_file),
                                       subreads_bam)
            if not op.isabs(scraps_bam):
                scraps_bam = op.join(op.dirname(subread_set_file),
                                     scraps_bam)
            args = [
                "bam2bam",
                "-j", str(nproc),
                "-b", str(nproc),
                "-o", new_prefix,
                "--barcodes", barcode_fasta,
                subreads_bam, scraps_bam
            ]
            # NOTE(review): removed leftover Python 2 debugging statement
            # 'print args' (a syntax error under Python 3); the command
            # line is already logged below.
            log.info(" ".join(args))
            result = run_cmd(" ".join(args),
                             stdout_fh=sys.stdout,
                             stderr_fh=sys.stderr)
            if result.exit_code != 0:
                # Propagate the external tool's failure code to the caller.
                return result.exit_code
            subreads_bam = new_prefix + ".subreads.bam"
            scraps_bam = new_prefix + ".scraps.bam"
            assert op.isfile(subreads_bam), "Missing {f}".format(
                f=subreads_bam)
            # FIXME we need a more general method for this
            ext_res_new = ExternalResource()
            ext_res_new.resourceId = subreads_bam
            ext_res_new.metaType = 'PacBio.SubreadFile.SubreadBamFile'
            ext_res_new.addIndices([subreads_bam + ".pbi"])
            ext_res_inner = ExternalResources()
            ext_res_scraps = ExternalResource()
            ext_res_scraps.resourceId = scraps_bam
            ext_res_scraps.metaType = 'PacBio.SubreadFile.ScrapsBamFile'
            ext_res_scraps.addIndices([scraps_bam + ".pbi"])
            ext_res_inner.append(ext_res_scraps)
            ext_res_new.append(ext_res_inner)
            ds_new.externalResources.append(ext_res_new)
        # Clear filter callbacks before handing the filters to the new set.
        ds._filters.clearCallbacks()
        ds_new._filters = ds._filters
        ds_new._populateMetaTypes()
        ds_new.updateCounts()
        ds_new.write(output_file_name)
    return 0
def test_len(self):
    """Verify len(), _length, totalLength and numRecords agree for several
    dataset types, and that updateCounts() repairs deliberately clobbered
    count attributes."""
    # AlignmentSet
    aln = AlignmentSet(data.getXml(7), strict=True)
    assert len(aln) == 92
    assert aln._length == (92, 123588)
    assert aln.totalLength == 123588
    assert aln.numRecords == 92
    # Clobber the cached counts, then confirm updateCounts() restores them.
    aln.totalLength = -1
    aln.numRecords = -1
    assert aln.totalLength == -1
    assert aln.numRecords == -1
    aln.updateCounts()
    assert aln.totalLength == 123588
    assert aln.numRecords == 92
    # Iteration must agree with the reported counts.
    assert sum(1 for _ in aln) == 92
    assert sum(len(rec) for rec in aln) == 123588

    # AlignmentSet with filters
    aln = AlignmentSet(data.getXml(14), strict=True)
    assert len(aln) == 40
    assert aln._length == (40, 52023)
    assert aln.totalLength == 52023
    assert aln.numRecords == 40
    aln.totalLength = -1
    aln.numRecords = -1
    assert aln.totalLength == -1
    assert aln.numRecords == -1
    aln.updateCounts()
    assert aln.totalLength == 52023
    assert aln.numRecords == 40

    # SubreadSet
    sset = SubreadSet(data.getXml(9), strict=True)
    assert len(sset) == 92
    assert sset._length == (92, 124093)
    assert sset.totalLength == 124093
    assert sset.numRecords == 92
    sset.totalLength = -1
    sset.numRecords = -1
    assert sset.totalLength == -1
    assert sset.numRecords == -1
    sset.updateCounts()
    assert sset.totalLength == 124093
    assert sset.numRecords == 92
    assert sum(1 for _ in sset) == 92
    assert sum(len(rec) for rec in sset) == 124093

    # ReferenceSet
    sset = ReferenceSet(data.getXml(8), strict=True)
    assert len(sset) == 59
    assert sset.totalLength == 85774
    assert sset.numRecords == 59
    sset.totalLength = -1
    sset.numRecords = -1
    assert sset.totalLength == -1
    assert sset.numRecords == -1
    sset.updateCounts()
    assert sset.totalLength == 85774
    assert sset.numRecords == 59
def test_len(self):
    """Verify len(), _length, totalLength and numRecords agree for several
    dataset types, and that updateCounts() repairs clobbered counts."""

    def check_clobber_and_recount(ds, n_records, total_length):
        # Deliberately break the cached counts, then confirm that
        # updateCounts() restores the expected values.
        ds.totalLength = -1
        ds.numRecords = -1
        self.assertEqual(ds.totalLength, -1)
        self.assertEqual(ds.numRecords, -1)
        ds.updateCounts()
        self.assertEqual(ds.totalLength, total_length)
        self.assertEqual(ds.numRecords, n_records)

    # AlignmentSet
    aln = AlignmentSet(data.getXml(8), strict=True)
    self.assertEqual(len(aln), 92)
    self.assertEqual(aln._length, (92, 123588))
    self.assertEqual(aln.totalLength, 123588)
    self.assertEqual(aln.numRecords, 92)
    check_clobber_and_recount(aln, 92, 123588)
    # Iteration must agree with the reported counts.
    self.assertEqual(sum(1 for _ in aln), 92)
    self.assertEqual(sum(len(rec) for rec in aln), 123588)

    # AlignmentSet with filters
    aln = AlignmentSet(data.getXml(15), strict=True)
    self.assertEqual(len(aln), 40)
    self.assertEqual(aln._length, (40, 52023))
    self.assertEqual(aln.totalLength, 52023)
    self.assertEqual(aln.numRecords, 40)
    check_clobber_and_recount(aln, 40, 52023)

    # SubreadSet
    sset = SubreadSet(data.getXml(10), strict=True)
    self.assertEqual(len(sset), 92)
    self.assertEqual(sset._length, (92, 124093))
    self.assertEqual(sset.totalLength, 124093)
    self.assertEqual(sset.numRecords, 92)
    check_clobber_and_recount(sset, 92, 124093)
    self.assertEqual(sum(1 for _ in sset), 92)
    self.assertEqual(sum(len(rec) for rec in sset), 124093)

    # ReferenceSet
    sset = ReferenceSet(data.getXml(9), strict=True)
    self.assertEqual(len(sset), 59)
    self.assertEqual(sset.totalLength, 85774)
    self.assertEqual(sset.numRecords, 59)
    check_clobber_and_recount(sset, 59, 85774)
def run_bam_to_bam(subread_set_file, barcode_set_file, output_file_name,
                   nproc=1, score_mode="symmetric"):
    """Run the external 'bam2bam' tool to barcode every subreads+scraps BAM
    pair in a SubreadSet, then write a new SubreadSet XML referencing the
    barcoded outputs.

    :param subread_set_file: path to the input SubreadSet XML; each external
        resource must provide both a subreads BAM and a scraps BAM
    :param barcode_set_file: path to a BarcodeSet XML backed by one FASTA
    :param output_file_name: path of the SubreadSet XML to write; barcoded
        BAMs are created in the same directory
    :param nproc: thread count passed to bam2bam (both -j and -b)
    :param score_mode: barcode scoring mode, 'symmetric' or 'asymmetric'
    :returns: 0 on success, otherwise the bam2bam exit code
    :raises ValueError: if score_mode is not recognized
    :raises NotImplementedError: if the BarcodeSet has multiple FASTA files
    :raises TypeError: if an input resource lacks a scraps BAM
    """
    if not score_mode in ["asymmetric", "symmetric"]:
        raise ValueError("Unrecognized score mode '{m}'".format(m=score_mode))
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError(
            "Multi-FASTA BarcodeSet input is not supported.")
    barcode_fasta = bc.toExternalFiles()[0]
    with SubreadSet(subread_set_file) as ds:
        ds_new = SubreadSet(strict=True)
        for ext_res in ds.externalResources:
            subreads_bam = ext_res.bam
            scraps_bam = ext_res.scraps
            assert subreads_bam is not None
            if scraps_bam is None:
                raise TypeError("The input SubreadSet must include scraps.")
            # Output prefix lives next to the output XML, named after the
            # input BAM with a '_barcoded' suffix.
            new_prefix = op.join(
                op.dirname(output_file_name),
                re.sub(".subreads.bam", "_barcoded",
                       op.basename(subreads_bam)))
            # BAM paths in the XML may be relative to the XML's location.
            if not op.isabs(subreads_bam):
                subreads_bam = op.join(op.dirname(subread_set_file),
                                       subreads_bam)
            if not op.isabs(scraps_bam):
                scraps_bam = op.join(op.dirname(subread_set_file),
                                     scraps_bam)
            args = [
                "bam2bam",
                "-j", str(nproc),
                "-b", str(nproc),
                "-o", new_prefix,
                "--barcodes", barcode_fasta,
                "--scoreMode", score_mode,
                subreads_bam, scraps_bam
            ]
            log.info(" ".join(args))
            result = run_cmd(" ".join(args),
                             stdout_fh=sys.stdout,
                             stderr_fh=sys.stderr)
            if result.exit_code != 0:
                # Propagate the external tool's failure code to the caller.
                return result.exit_code
            subreads_bam = new_prefix + ".subreads.bam"
            scraps_bam = new_prefix + ".scraps.bam"
            assert op.isfile(subreads_bam), "Missing {f}".format(
                f=subreads_bam)
            add_subread_resources(ds_new,
                                  subreads=subreads_bam,
                                  scraps=scraps_bam,
                                  barcodes=barcode_set_file)
        # Clear filter callbacks before handing the filters to the new set,
        # then finish populating metadata and identity for the output.
        ds._filters.clearCallbacks()
        ds_new._filters = ds._filters
        ds_new._populateMetaTypes()
        ds_new.metadata = ds.metadata
        ds_new.name = ds.name + " (barcoded)"
        ds_new.updateCounts()
        ds_new.newUuid()
        ds_new.write(output_file_name)
    return 0
def run_bam_to_bam(subread_set_file, barcode_set_file, output_file_name,
                   nproc=1, score_mode="symmetric"):
    """Barcode the subreads+scraps BAM pairs of a SubreadSet via the
    external 'bam2bam' tool and write a new SubreadSet XML that references
    the barcoded BAMs.

    :param subread_set_file: input SubreadSet XML; every external resource
        must carry both subreads and scraps BAMs
    :param barcode_set_file: BarcodeSet XML backed by a single FASTA
    :param output_file_name: output SubreadSet XML path; barcoded BAMs are
        written to the same directory
    :param nproc: thread count for bam2bam (-j and -b)
    :param score_mode: 'symmetric' or 'asymmetric' barcode scoring
    :returns: 0 on success, otherwise the bam2bam exit code
    :raises ValueError: for an unrecognized score_mode
    :raises NotImplementedError: for a multi-FASTA BarcodeSet
    :raises TypeError: if a resource has no scraps BAM
    """
    if not score_mode in ["asymmetric", "symmetric"]:
        raise ValueError("Unrecognized score mode '{m}'".format(m=score_mode))
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError(
            "Multi-FASTA BarcodeSet input is not supported.")
    barcode_fasta = bc.toExternalFiles()[0]
    with SubreadSet(subread_set_file) as ds:
        ds_new = SubreadSet(strict=True)
        for ext_res in ds.externalResources:
            subreads_bam = ext_res.bam
            scraps_bam = ext_res.scraps
            assert subreads_bam is not None
            if scraps_bam is None:
                raise TypeError("The input SubreadSet must include scraps.")
            # Name the output after the input BAM, '_barcoded' suffix,
            # placed beside the output XML.
            new_prefix = op.join(op.dirname(output_file_name),
                                 re.sub(".subreads.bam", "_barcoded",
                                        op.basename(subreads_bam)))
            # Resolve BAM paths that are relative to the input XML.
            if not op.isabs(subreads_bam):
                subreads_bam = op.join(op.dirname(subread_set_file),
                                       subreads_bam)
            if not op.isabs(scraps_bam):
                scraps_bam = op.join(op.dirname(subread_set_file),
                                     scraps_bam)
            args = [
                "bam2bam",
                "-j", str(nproc),
                "-b", str(nproc),
                "-o", new_prefix,
                "--barcodes", barcode_fasta,
                "--scoreMode", score_mode,
                subreads_bam, scraps_bam
            ]
            log.info(" ".join(args))
            result = run_cmd(" ".join(args),
                             stdout_fh=sys.stdout,
                             stderr_fh=sys.stderr)
            if result.exit_code != 0:
                # Surface the external tool's failure code.
                return result.exit_code
            subreads_bam = new_prefix + ".subreads.bam"
            scraps_bam = new_prefix + ".scraps.bam"
            assert op.isfile(subreads_bam), "Missing {f}".format(
                f=subreads_bam)
            add_subread_resources(ds_new,
                                  subreads=subreads_bam,
                                  scraps=scraps_bam,
                                  barcodes=barcode_set_file)
        # Clear filter callbacks before transferring filters, then finish
        # populating metadata and identity for the output set.
        ds._filters.clearCallbacks()
        ds_new._filters = ds._filters
        ds_new._populateMetaTypes()
        ds_new.metadata = ds.metadata
        ds_new.name = ds.name + " (barcoded)"
        ds_new.updateCounts()
        ds_new.newUuid()
        ds_new.write(output_file_name)
    return 0
def test_barcode_split_cornercases(self):
    """Barcode-based splitting corner cases: chunk filters from split(),
    clearing/re-adding bc filters, and bc tuples written with and without
    a space ('[2, 2]' vs '[2,2]')."""
    fn = ('/pbi/dept/secondary/siv/testdata/'
          'pblaa-unittest/Sequel/Phi29/m54008_160219_003234'
          '.tiny.subreadset.xml')
    # skipMissing=True for consistency with the other barcode-split tests
    # in this file, which open the same dataset that way.
    sset = SubreadSet(fn, skipMissing=True)
    ssets = sset.split(chunks=3, barcodes=True)
    self.assertEqual([str(ss.filters) for ss in ssets],
                     ["( bc = [0, 0] )",
                      "( bc = [1, 1] )",
                      "( bc = [2, 2] )"])
    sset = SubreadSet(fn, skipMissing=True)
    self.assertEqual(len(sset), 15133)
    # Clearing the filters exposes the full, unfiltered record count.
    sset.filters = None
    self.assertEqual(str(sset.filters), "")
    sset.updateCounts()
    self.assertEqual(len(sset), 2667562)
    # bc filter with a space in the tuple literal.
    sset.filters.addRequirement(bc=[('=', '[2, 2]')])
    self.assertEqual(str(sset.filters), "( bc = [2, 2] )")
    sset.updateCounts()
    self.assertEqual(len(sset), 4710)
    sset.filters = None
    self.assertEqual(str(sset.filters), "")
    sset.updateCounts()
    self.assertEqual(len(sset), 2667562)
    # Same filter without the space must select the same records.
    sset.filters.addRequirement(bc=[('=', '[2,2]')])
    self.assertEqual(str(sset.filters), "( bc = [2,2] )")
    sset.updateCounts()
    self.assertEqual(len(sset), 4710)
def test_barcode_split_maxChunks(self):
    """Splitting three barcodes with maxChunks=2: the last two barcode
    filters are OR'd together into the second chunk."""
    fn = ('/pbi/dept/secondary/siv/testdata/'
          'pblaa-unittest/Sequel/Phi29/m54008_160219_003234'
          '.tiny.subreadset.xml')
    ds = SubreadSet(fn, skipMissing=True)
    chunks = ds.split(maxChunks=2, barcodes=True)
    expected_filters = ["( bc = [0, 0] )",
                        "( bc = [1, 1] ) OR ( bc = [2, 2] )"]
    self.assertEqual([str(chunk.filters) for chunk in chunks],
                     expected_filters)
    ds = SubreadSet(fn, skipMissing=True)
    self.assertEqual(len(ds), 15133)
    for chunk, filter_str, expected_len in zip(chunks, expected_filters,
                                               (5370, 9763)):
        # Clearing the filters exposes the full, unfiltered record count.
        ds.filters = None
        self.assertEqual(str(ds.filters), "")
        ds.updateCounts()
        self.assertEqual(len(ds), 2667562)
        # Adopting the chunk's filters narrows the count back down.
        ds.filters = chunk.filters
        self.assertEqual(str(ds.filters), filter_str)
        ds.updateCounts()
        self.assertEqual(len(ds), expected_len)
def test_barcode_split_cornercases(self):
    """Splitting three barcodes with chunks=3 yields one bc filter per
    chunk; a bc filter matches the same records whether the tuple literal
    contains a space or not."""
    fn = ('/pbi/dept/secondary/siv/testdata/'
          'pblaa-unittest/Sequel/Phi29/m54008_160219_003234'
          '.tiny.subreadset.xml')
    ds = SubreadSet(fn, skipMissing=True)
    chunks = ds.split(chunks=3, barcodes=True)
    self.assertEqual([str(chunk.filters) for chunk in chunks],
                     ["( bc = [0, 0] )",
                      "( bc = [1, 1] )",
                      "( bc = [2, 2] )"])
    ds = SubreadSet(fn, skipMissing=True)
    self.assertEqual(len(ds), 15133)
    # Exercise both spellings of the same barcode pair.
    for bc_literal, filter_str in (('[2, 2]', "( bc = [2, 2] )"),
                                   ('[2,2]', "( bc = [2,2] )")):
        # Clearing the filters exposes the full, unfiltered record count.
        ds.filters = None
        self.assertEqual(str(ds.filters), "")
        ds.updateCounts()
        self.assertEqual(len(ds), 2667562)
        # Re-adding the bc requirement narrows the count back down.
        ds.filters.addRequirement(bc=[('=', bc_literal)])
        self.assertEqual(str(ds.filters), filter_str)
        ds.updateCounts()
        self.assertEqual(len(ds), 4710)
def test_barcode_split_maxChunks(self):
    """Splitting three barcodes with maxChunks=2: the last two barcode
    filters are OR'd together into the second chunk."""
    fn = ('/pbi/dept/secondary/siv/testdata/'
          'pblaa-unittest/Sequel/Phi29/m54008_160219_003234'
          '.tiny.subreadset.xml')
    sset = SubreadSet(fn, skipMissing=True)
    ssets = list(sset.split(maxChunks=2, barcodes=True))
    assert [str(ss.filters) for ss in ssets
            ] == ["( bc = [0, 0] )", "( bc = [1, 1] ) OR ( bc = [2, 2] )"]
    sset = SubreadSet(fn, skipMissing=True)
    assert len(sset) == 15133
    # Clearing the filters exposes the full, unfiltered record count.
    sset.filters = None
    assert str(sset.filters) == ""
    sset.updateCounts()
    assert len(sset) == 2667562
    # Adopting the first chunk's filter narrows to barcode [0, 0].
    sset.filters = ssets[0].filters
    assert str(sset.filters) == "( bc = [0, 0] )"
    sset.updateCounts()
    assert len(sset) == 5370
    sset.filters = None
    assert str(sset.filters) == ""
    sset.updateCounts()
    assert len(sset) == 2667562
    # The second chunk's filter covers the remaining two barcodes.
    sset.filters = ssets[1].filters
    assert str(sset.filters) == "( bc = [1, 1] ) OR ( bc = [2, 2] )"
    sset.updateCounts()
    assert len(sset) == 9763
def test_barcode_split_cornercases(self):
    """Splitting three barcodes with chunks=3 yields one bc filter per
    chunk; a bc filter matches the same records whether the tuple literal
    contains a space or not."""
    fn = ('/pbi/dept/secondary/siv/testdata/'
          'pblaa-unittest/Sequel/Phi29/m54008_160219_003234'
          '.tiny.subreadset.xml')
    sset = SubreadSet(fn, skipMissing=True)
    ssets = list(sset.split(chunks=3, barcodes=True))
    assert [str(ss.filters) for ss in ssets
            ] == ["( bc = [0, 0] )", "( bc = [1, 1] )", "( bc = [2, 2] )"]
    sset = SubreadSet(fn, skipMissing=True)
    assert len(sset) == 15133
    # Clearing the filters exposes the full, unfiltered record count.
    sset.filters = None
    assert str(sset.filters) == ""
    sset.updateCounts()
    assert len(sset) == 2667562
    # bc filter with a space in the tuple literal.
    sset.filters.addRequirement(bc=[('=', '[2, 2]')])
    assert str(sset.filters) == "( bc = [2, 2] )"
    sset.updateCounts()
    assert len(sset) == 4710
    sset.filters = None
    assert str(sset.filters) == ""
    sset.updateCounts()
    assert len(sset) == 2667562
    # Same filter without the space must select the same records.
    sset.filters.addRequirement(bc=[('=', '[2,2]')])
    assert str(sset.filters) == "( bc = [2,2] )"
    sset.updateCounts()
    assert len(sset) == 4710
def test_barcode_split_maxChunks(self):
    """Splitting three barcodes with maxChunks=2: the last two barcode
    filters are OR'd together into the second chunk."""
    fn = ('/pbi/dept/secondary/siv/testdata/'
          'pblaa-unittest/Sequel/Phi29/m54008_160219_003234'
          '.tiny.subreadset.xml')
    sset = SubreadSet(fn, skipMissing=True)
    ssets = sset.split(maxChunks=2, barcodes=True)
    self.assertEqual([str(ss.filters) for ss in ssets],
                     ["( bc = [0, 0] )",
                      "( bc = [1, 1] ) OR ( bc = [2, 2] )"])
    sset = SubreadSet(fn, skipMissing=True)
    self.assertEqual(len(sset), 15133)
    # Clearing the filters exposes the full, unfiltered record count.
    sset.filters = None
    self.assertEqual(str(sset.filters), "")
    sset.updateCounts()
    self.assertEqual(len(sset), 2667562)
    # Adopting the first chunk's filter narrows to barcode [0, 0].
    sset.filters = ssets[0].filters
    self.assertEqual(str(sset.filters), "( bc = [0, 0] )")
    sset.updateCounts()
    self.assertEqual(len(sset), 5370)
    sset.filters = None
    self.assertEqual(str(sset.filters), "")
    sset.updateCounts()
    self.assertEqual(len(sset), 2667562)
    # The second chunk's filter covers the remaining two barcodes.
    sset.filters = ssets[1].filters
    self.assertEqual(str(sset.filters),
                     "( bc = [1, 1] ) OR ( bc = [2, 2] )")
    sset.updateCounts()
    self.assertEqual(len(sset), 9763)
def test_multi_movie_split_zmws_existing_filters(self):
    """ZMW-based splitting of a two-movie SubreadSet that already carries
    movie+zm filters: chunk counts must sum to the filtered total and the
    chunk zmwRanges must respect the filter bounds."""
    N_RECORDS = 1745161
    test_file_1 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
                   "2372215/0007/Analysis_Results/m150404_101626_42"
                   "267_c100807920800000001823174110291514_s1_p0.al"
                   "l.subreadset.xml")
    test_file_2 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
                   "2590980/0008/Analysis_Results/m141115_075238_et"
                   "han_c100699872550000001823139203261572_s1_p0.al"
                   "l.subreadset.xml")
    ds1 = SubreadSet(test_file_1, test_file_2)
    # used to get total:
    #self.assertEqual(sum(1 for _ in ds1), N_RECORDS)
    self.assertEqual(len(ds1), N_RECORDS)
    # Per-movie zm lower bounds (paired positionally with the movies).
    ds1.filters.addRequirement(
        movie=[('=',
                'm150404_101626_42267_c100807920800000001823174110291514_s1_p0'),
               ('=',
                'm141115_075238_ethan_c100699872550000001823139203261572_s1_p0')],
        zm=[('>', 10), ('>', 127900)])
    # And per-movie zm upper bounds mapped onto the same requirements.
    ds1.filters.mapRequirement(
        zm=[('<', 10000), ('<', 140000)])
    FILT_RECORDS = 117776
    self.assertEqual(len(ds1), FILT_RECORDS)
    # Drop the cached index and make sure recounting agrees.
    ds1._index = None
    ds1.updateCounts()
    self.assertEqual(len(ds1), FILT_RECORDS)
    dss = ds1.split(chunks=1, zmws=True)
    self.assertEqual(len(dss), 1)
    self.assertEqual(len(dss[0]), FILT_RECORDS)
    self.assertEqual(sum([len(ds_) for ds_ in dss]), FILT_RECORDS)
    dss = ds1.split(chunks=12, zmws=True)
    self.assertEqual(len(dss), 12)
    self.assertEqual(sum([len(ds_) for ds_ in dss]), FILT_RECORDS)
    # First and last chunk ranges honor the zm filter bounds.
    self.assertEqual(
        dss[0].zmwRanges,
        [('m150404_101626_42267_c100807920800000001823174110291514_s1_p0',
          11, 1515)])
    self.assertEqual(
        dss[-1].zmwRanges,
        [('m141115_075238_ethan_c100699872550000001823139203261572_s1_p0',
          137634, 139999)])
def test_multi_movie_split_zmws_existing_filters(self):
    """ZMW-based splitting of a two-movie SubreadSet that already carries
    movie+zm filters: chunk counts must sum to the filtered total and the
    chunk zmwRanges must respect the filter bounds."""
    N_RECORDS = 1745161
    test_file_1 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
                   "2372215/0007/Analysis_Results/m150404_101626_42"
                   "267_c100807920800000001823174110291514_s1_p0.al"
                   "l.subreadset.xml")
    test_file_2 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
                   "2590980/0008/Analysis_Results/m141115_075238_et"
                   "han_c100699872550000001823139203261572_s1_p0.al"
                   "l.subreadset.xml")
    ds1 = SubreadSet(test_file_1, test_file_2)
    # used to get total:
    #self.assertEqual(sum(1 for _ in ds1), N_RECORDS)
    self.assertEqual(len(ds1), N_RECORDS)
    # Per-movie zm lower bounds (paired positionally with the movies).
    ds1.filters.addRequirement(movie=[
        ('=',
         'm150404_101626_42267_c100807920800000001823174110291514_s1_p0'),
        ('=',
         'm141115_075238_ethan_c100699872550000001823139203261572_s1_p0')
    ], zm=[('>', 10), ('>', 127900)])
    # And per-movie zm upper bounds mapped onto the same requirements.
    ds1.filters.mapRequirement(zm=[('<', 10000), ('<', 140000)])
    FILT_RECORDS = 117776
    self.assertEqual(len(ds1), FILT_RECORDS)
    # Drop the cached index and make sure recounting agrees.
    ds1._index = None
    ds1.updateCounts()
    self.assertEqual(len(ds1), FILT_RECORDS)
    dss = ds1.split(chunks=1, zmws=True)
    self.assertEqual(len(dss), 1)
    self.assertEqual(len(dss[0]), FILT_RECORDS)
    self.assertEqual(sum([len(ds_) for ds_ in dss]), FILT_RECORDS)
    dss = ds1.split(chunks=12, zmws=True)
    self.assertEqual(len(dss), 12)
    self.assertEqual(sum([len(ds_) for ds_ in dss]), FILT_RECORDS)
    # First and last chunk ranges honor the zm filter bounds.
    self.assertEqual(
        dss[0].zmwRanges,
        [('m150404_101626_42267_c100807920800000001823174110291514_s1_p0',
          11, 1515)])
    self.assertEqual(
        dss[-1].zmwRanges,
        [('m141115_075238_ethan_c100699872550000001823139203261572_s1_p0',
          137634, 139999)])
def test_multi_movie_split_zmws_existing_simple_filters(self):
    """ZMW-based splitting of a two-movie SubreadSet that carries a simple
    rq filter: chunk counts must sum to the filtered total and the chunk
    zmwRanges must stay consistent."""
    N_RECORDS = 1745161
    movie1_xml = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
                  "2372215/0007/Analysis_Results/m150404_101626_42"
                  "267_c100807920800000001823174110291514_s1_p0.al"
                  "l.subreadset.xml")
    movie2_xml = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
                  "2590980/0008/Analysis_Results/m141115_075238_et"
                  "han_c100699872550000001823139203261572_s1_p0.al"
                  "l.subreadset.xml")
    ds = SubreadSet(movie1_xml, movie2_xml)
    # used to get total:
    #assert sum(1 for _ in ds) == N_RECORDS
    assert len(ds) == N_RECORDS
    # Simple read-quality requirement applied up front.
    ds.filters.addRequirement(rq=[('>', '0.7'), ('<', '0.5')])
    FILT_RECORDS = 1732613
    assert len(ds) == FILT_RECORDS
    # Drop the cached index and confirm recounting agrees.
    ds._index = None
    ds.updateCounts()
    assert len(ds) == FILT_RECORDS

    chunks = list(ds.split(chunks=1, zmws=True))
    chunks[0]._index = None
    chunks[0].updateCounts()
    assert len(chunks) == 1
    assert len(chunks[0]) == FILT_RECORDS
    assert sum(len(chunk) for chunk in chunks) == FILT_RECORDS

    chunks = list(ds.split(chunks=12, zmws=True))
    assert len(chunks) == 12
    assert sum(len(chunk) for chunk in chunks) == FILT_RECORDS
    # First and last chunk ranges for the two movies.
    assert chunks[0].zmwRanges == [
        ('m150404_101626_42267_c100807920800000001823174110291514_s1_p0',
         7, 22073)
    ]
    assert chunks[-1].zmwRanges == [
        ('m141115_075238_ethan_c100699872550000001823139203261572_s1_p0',
         127695, 163468)
    ]
def run(subreadset, fofn):
    """Split *subreadset* into chunked SubreadSet XML files written to the
    current working directory, and record their paths (one per line) in the
    file-of-filenames *fofn*.

    :param subreadset: path to the input SubreadSet XML
    :param fofn: output path for the list of chunk XML paths
    """
    dir_name = os.getcwd()
    maxChunks = 0
    dset = SubreadSet(subreadset, strict=True)
    fns = dset.toFofn()
    import pprint
    log.info('resources in {!r}:\n{}'.format(subreadset, pprint.pformat(fns)))
    nrecs = len(dset)
    # HG with 70x coverage => 200G bases total
    ts = 50000  # @ 20k/read => 1G bases, ~300MB .gz => ~200 chunks for Human
    ts = 500000  # @ 20k/read => 10G bases, ~3GB .gz => ~20 chunks for Human
    # and we expect about 7-10min per chunk.
    # Guard against small inputs: nrecs // ts is 0 when nrecs < ts, and
    # dset.split() must be asked for at least one chunk.
    chunks = max(1, nrecs // ts)
    log.info('num_chunks={:g} ({:g} / {:g})'.format(chunks, nrecs, ts))
    log.info(
        'Splitting with dset.split(zmws=False, chunks={}, ignoreSubDatasets=True, maxChunks={},)'
        .format(chunks, maxChunks))
    dset_chunks = dset.split(
        zmws=False,
        chunks=chunks,
        ignoreSubDatasets=True,
        maxChunks=maxChunks,
        updateCounts=False,
        #targetSize=1,
        breakContigs=True)
    chunk_fns = []
    for i, dset in enumerate(dset_chunks):
        chunk_name = 'chunk_{:03d}.subreadset.xml'.format(i)  # TODO: 02
        chunk_fn = os.path.join(dir_name, chunk_name)
        dset.updateCounts()
        dset.write(chunk_fn, validate=False)  # , relPaths=True
        chunk_fns.append(chunk_fn)
    with open(fofn, 'w') as ofs:
        for fn in chunk_fns:
            ofs.write('{}\n'.format(fn))
    log.info('Wrote {} chunks into "{}"'.format(len(dset_chunks), fofn))
def test_len(self):
    """Verify len(), _length, totalLength and numRecords agree for several
    dataset types (including a cmp.h5-backed AlignmentSet and an
    HdfSubreadSet), and that updateCounts() repairs clobbered counts."""
    # AlignmentSet
    aln = AlignmentSet(data.getXml(8), strict=True)
    self.assertEqual(len(aln), 92)
    self.assertEqual(aln._length, (92, 123588))
    self.assertEqual(aln.totalLength, 123588)
    self.assertEqual(aln.numRecords, 92)
    # Clobber the cached counts, then confirm updateCounts() restores them.
    aln.totalLength = -1
    aln.numRecords = -1
    self.assertEqual(aln.totalLength, -1)
    self.assertEqual(aln.numRecords, -1)
    aln.updateCounts()
    self.assertEqual(aln.totalLength, 123588)
    self.assertEqual(aln.numRecords, 92)
    # Iteration must agree with the reported counts.
    self.assertEqual(sum(1 for _ in aln), 92)
    self.assertEqual(sum(len(rec) for rec in aln), 123588)

    # AlignmentSet with filters
    aln = AlignmentSet(data.getXml(15), strict=True)
    self.assertEqual(len(aln), 40)
    self.assertEqual(aln._length, (40, 52023))
    self.assertEqual(aln.totalLength, 52023)
    self.assertEqual(aln.numRecords, 40)
    aln.totalLength = -1
    aln.numRecords = -1
    self.assertEqual(aln.totalLength, -1)
    self.assertEqual(aln.numRecords, -1)
    aln.updateCounts()
    self.assertEqual(aln.totalLength, 52023)
    self.assertEqual(aln.numRecords, 40)

    # AlignmentSet with cmp.h5
    aln = AlignmentSet(upstreamData.getBamAndCmpH5()[1], strict=True)
    self.assertEqual(len(aln), 112)
    self.assertEqual(aln._length, (112, 59970))
    self.assertEqual(aln.totalLength, 59970)
    self.assertEqual(aln.numRecords, 112)
    aln.totalLength = -1
    aln.numRecords = -1
    self.assertEqual(aln.totalLength, -1)
    self.assertEqual(aln.numRecords, -1)
    aln.updateCounts()
    self.assertEqual(aln.totalLength, 59970)
    self.assertEqual(aln.numRecords, 112)

    # SubreadSet
    sset = SubreadSet(data.getXml(10), strict=True)
    self.assertEqual(len(sset), 92)
    self.assertEqual(sset._length, (92, 124093))
    self.assertEqual(sset.totalLength, 124093)
    self.assertEqual(sset.numRecords, 92)
    sset.totalLength = -1
    sset.numRecords = -1
    self.assertEqual(sset.totalLength, -1)
    self.assertEqual(sset.numRecords, -1)
    sset.updateCounts()
    self.assertEqual(sset.totalLength, 124093)
    self.assertEqual(sset.numRecords, 92)
    self.assertEqual(sum(1 for _ in sset), 92)
    self.assertEqual(sum(len(rec) for rec in sset), 124093)

    # HdfSubreadSet
    # len means something else in bax/bas land. These numbers may actually
    # be correct...
    sset = HdfSubreadSet(data.getXml(17), strict=True)
    self.assertEqual(len(sset), 9)
    self.assertEqual(sset._length, (9, 128093))
    self.assertEqual(sset.totalLength, 128093)
    self.assertEqual(sset.numRecords, 9)
    sset.totalLength = -1
    sset.numRecords = -1
    self.assertEqual(sset.totalLength, -1)
    self.assertEqual(sset.numRecords, -1)
    sset.updateCounts()
    self.assertEqual(sset.totalLength, 128093)
    self.assertEqual(sset.numRecords, 9)

    # ReferenceSet
    sset = ReferenceSet(data.getXml(9), strict=True)
    self.assertEqual(len(sset), 59)
    self.assertEqual(sset.totalLength, 85774)
    self.assertEqual(sset.numRecords, 59)
    sset.totalLength = -1
    sset.numRecords = -1
    self.assertEqual(sset.totalLength, -1)
    self.assertEqual(sset.numRecords, -1)
    sset.updateCounts()
    self.assertEqual(sset.totalLength, 85774)
    self.assertEqual(sset.numRecords, 59)