def test_discard_bio_samples(self):
    """Exercise discard_bio_samples() for three cases: a matching
    BioSample, no matching BioSample, and no BioSamples at all."""
    # Matching BioSample: only the sample whose barcode is
    # "lbc1--lbc1" ("Alice") survives.
    ds = SubreadSet(self.SUBREADS)
    discard_bio_samples(ds, "lbc1--lbc1")
    bioSamples = ds.metadata.collections[0].wellSample.bioSamples
    assert len(bioSamples) == 1
    assert bioSamples[0].name == "Alice"
    # No matching BioSample records: strip the remaining sample's
    # DNABarcodes so nothing matches; the sample is then renamed after
    # the requested barcode.
    ds = SubreadSet(self.SUBREADS)
    ds.metadata.collections[0].wellSample.bioSamples.pop(1)
    ds.metadata.collections[0].wellSample.bioSamples.pop(1)
    bioSample = ds.metadata.collections[0].wellSample.bioSamples[0]
    while len(bioSample.DNABarcodes) > 0:
        bioSample.DNABarcodes.pop(0)
    assert len(ds.metadata.collections[0].wellSample.bioSamples) == 1
    discard_bio_samples(ds, "lbc1--lbc1")
    assert len(ds.metadata.collections[0].wellSample.bioSamples) == 1
    assert ds.metadata.collections[0].wellSample.bioSamples[
        0].name == "lbc1--lbc1"
    assert ds.metadata.collections[0].wellSample.bioSamples[0].DNABarcodes[
        0].name == "lbc1--lbc1"
    # no BioSample records at all: one is created, named after the barcode
    ds = SubreadSet(pbtestdata.get_file("subreads-sequel"))
    assert len(ds.metadata.collections[0].wellSample.bioSamples) == 0
    discard_bio_samples(ds, "lbc1--lbc1")
    assert len(ds.metadata.collections[0].wellSample.bioSamples) == 1
    assert ds.metadata.collections[0].wellSample.bioSamples[
        0].name == "lbc1--lbc1"
    assert ds.metadata.collections[0].wellSample.bioSamples[0].DNABarcodes[
        0].name == "lbc1--lbc1"
def test_barcode_split_cornercases(self):
    """Split by barcode into 3 chunks, then verify that the same barcode
    filter (with or without a space after the comma) selects the same
    records on a fresh dataset."""
    fn = ('/pbi/dept/secondary/siv/testdata/'
          'pblaa-unittest/Sequel/Phi29/m54008_160219_003234'
          '.tiny.subreadset.xml')
    sset = SubreadSet(fn, skipMissing=True)
    ssets = list(sset.split(chunks=3, barcodes=True))
    expected_filters = ["( bc = [0, 0] )", "( bc = [1, 1] )",
                        "( bc = [2, 2] )"]
    assert [str(ss.filters) for ss in ssets] == expected_filters
    sset = SubreadSet(fn, skipMissing=True)
    assert len(sset) == 15133
    # Both spellings of the barcode pair must filter identically.
    for bc_expr in ('[2, 2]', '[2,2]'):
        sset.filters = None
        assert str(sset.filters) == ""
        sset.updateCounts()
        assert len(sset) == 2667562
        sset.filters.addRequirement(bc=[('=', bc_expr)])
        assert str(sset.filters) == "( bc = {} )".format(bc_expr)
        sset.updateCounts()
        assert len(sset) == 4710
def split_dataset(subreadset, out_prefix):
    """
    Takes an input dataset, and for each entry generates one separate
    dataset file, while maintaining all the filters.
    Returns a list of the generated datasets.

    To create an example filtered dataset for testing:
    dataset create --type SubreadSet test.subreadset.xml subreads1.bam subreads2.bam
    dataset filter test.subreadset.xml test.filtered.subreadset.xml 'length>1000'
    """
    prefix = os.path.abspath(out_prefix)
    parent = SubreadSet(subreadset, strict=True, skipCounts=True)
    resource_fns = parent.toFofn()
    log.info('resources in {!r}:\n{}'.format(subreadset,
                                             '\n'.join(resource_fns)))
    split_fns = []
    for index, bam_fn in enumerate(resource_fns):
        out_fn = '{}.{:05}.subreadset.xml'.format(prefix, index)
        child = SubreadSet(bam_fn, skipCounts=True)
        child.newUuid()
        # carry the parent's filters over to every per-BAM dataset
        child._filters = copy.deepcopy(parent._filters)
        child.write(out_fn)
        split_fns.append(out_fn)
    return split_fns
def test_loadMetadata(self):
    """loadMetadata() restores collection metadata from run metadata XML
    and raises InvalidDataSetIOError on a non-metadata file."""
    run_md = ('/pbi/dept/secondary/siv/testdata/'
              'SA3-Sequel/lambda/roche_SAT/'
              'm54013_151205_032353.run.metadata.xml')
    aln = AlignmentSet(data.getXml(no=8))
    self.assertFalse(aln.metadata.collections)
    aln.loadMetadata(run_md)
    self.assertTrue(aln.metadata.collections)
    sset_fn = ('/pbi/dept/secondary/siv/testdata/'
               'SA3-Sequel/lambda/roche_SAT/'
               'm54013_151205_032353.subreadset.xml')
    sset = SubreadSet(sset_fn)
    orig_metadata = copy.deepcopy(sset.metadata)
    sset.metadata.collections = None
    self.assertFalse(sset.metadata.collections)
    sset.loadMetadata(run_md)
    out_fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    sset.write(out_fn)
    validateFile(out_fn)
    validateFile(sset_fn)
    # round-tripped metadata must be identical to the original
    self.assertEqual(sset.metadata, orig_metadata)
    # load the wrong thing...
    sset = SubreadSet(sset_fn)
    orig_metadata = copy.deepcopy(sset.metadata)
    sset.metadata.collections = None
    self.assertFalse(sset.metadata.collections)
    with self.assertRaises(InvalidDataSetIOError):
        sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                          'SA3-Sequel/lambda/roche_SAT/'
                          'm54013_151205_032353.sts.xml')
def test_dataset_create_set_sample_names(self):
    """'dataset create' honors --well-sample-name/--bio-sample-name,
    both on a plain BAM input and on one with existing samples."""
    sample_args = ["--well-sample-name", "WELLSAMPLE",
                   "--bio-sample-name", "BIOSAMPLE"]
    outfile = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    cmd = " ".join(["dataset", "create", "--force", outfile,
                    pbtestdata.get_file("subreads-bam")] + sample_args)
    self._run_cmd_with_output(cmd, outfile)
    with SubreadSet(outfile) as ds:
        assert len(ds.metadata.collections) == 1
        well = ds.metadata.collections[0].wellSample
        assert well.name == "WELLSAMPLE"
        assert well.bioSamples[0].name == "BIOSAMPLE"
        assert len(well.bioSamples) == 1
    # now with existing samples
    outfile = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    cmd = " ".join(["dataset", "create", "--force", outfile,
                    pbtestdata.get_file("barcoded-subreadset")] + sample_args)
    self._run_cmd_with_output(cmd, outfile)
    with SubreadSet(outfile) as ds:
        assert len(ds.metadata.collections) == 1
        well = ds.metadata.collections[0].wellSample
        assert well.name == "WELLSAMPLE"
        # existing sample names are replaced, not appended to
        assert {s.name for s in well.bioSamples} == {"BIOSAMPLE"}
def run_bam_to_bam(subread_set_file, barcode_set_file, output_file_name,
                   nproc=1, score_mode="symmetric"):
    """
    Run bam2bam barcoding on a SubreadSet in one shot, then restore the
    input dataset's metadata and name (plus ' (barcoded)') on the output.

    :param subread_set_file: input SubreadSet XML
    :param barcode_set_file: BarcodeSet XML (single-FASTA only)
    :param output_file_name: path for the barcoded SubreadSet XML
    :param nproc: number of bam2bam threads
    :param score_mode: 'symmetric' or 'asymmetric'
    :return: 0 on success, otherwise the bam2bam exit code
    :raises ValueError: on an unrecognized score_mode
    :raises NotImplementedError: on a multi-FASTA BarcodeSet
    """
    if score_mode not in ("asymmetric", "symmetric"):
        raise ValueError("Unrecognized score mode '{m}'".format(m=score_mode))
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError(
            "Multi-FASTA BarcodeSet input is not supported.")
    # escape the dots so the suffix match is literal, not "any char"
    new_prefix = re.sub(r"\.subreadset\.xml$", "", output_file_name)
    args = [
        "bam2bam", "-j", str(nproc), "-b", str(nproc), "-o", new_prefix,
        "--barcodes", barcode_set_file, "--scoreMode", score_mode,
        subread_set_file
    ]
    log.info(" ".join(args))
    result = run_cmd(" ".join(args), stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    assert op.isfile(output_file_name)
    # move the bam2bam output aside so we can rewrite it in place with
    # the input dataset's metadata attached
    tmp_out = op.join(op.dirname(output_file_name),
                      "tmp_" + op.basename(output_file_name))
    shutil.move(output_file_name, tmp_out)
    with SubreadSet(tmp_out, strict=True) as ds:
        with SubreadSet(subread_set_file) as ds_in:
            ds.metadata = ds_in.metadata
            ds.name = ds_in.name + " (barcoded)"
        ds.updateCounts()
        ds.newUuid()
        ds.write(output_file_name)
    return 0
def setup_class(cls):
    """Build the shared input SubreadSet for this test class from the raw
    BAM resources of the barcoded test dataset."""
    with SubreadSet(pbtestdata.get_file("barcoded-subreadset")) as ds_in:
        # collect the BAM path of every external resource
        bam_files = [er.bam for er in ds_in.externalResources]
    with SubreadSet(*bam_files, strict=True) as ds_out:
        ds_out.write(cls.INPUT_FILE)
def test_loadMetadata(self):
    """loadMetadata() restores collection metadata from run metadata XML
    and raises InvalidDataSetIOError on a non-metadata file."""
    run_md = ('/pbi/dept/secondary/siv/testdata/'
              'SA3-Sequel/lambda/roche_SAT/'
              'm54013_151205_032353.run.metadata.xml')
    aln = AlignmentSet(data.getXml(7))
    assert not aln.metadata.collections
    aln.loadMetadata(run_md)
    assert aln.metadata.collections
    sset_fn = ('/pbi/dept/secondary/siv/testdata/'
               'SA3-Sequel/lambda/roche_SAT/'
               'm54013_151205_032353.subreadset.xml')
    sset = SubreadSet(sset_fn)
    orig_metadata = copy.deepcopy(sset.metadata)
    sset.metadata.collections = None
    assert not sset.metadata.collections
    sset.loadMetadata(run_md)
    out_fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    sset.write(out_fn)
    validateFile(out_fn)
    validateFile(sset_fn)
    # round-tripped metadata must match the original exactly
    assert sset.metadata == orig_metadata
    # load the wrong thing...
    sset = SubreadSet(sset_fn)
    orig_metadata = copy.deepcopy(sset.metadata)
    sset.metadata.collections = None
    assert not sset.metadata.collections
    with pytest.raises(InvalidDataSetIOError):
        sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                          'SA3-Sequel/lambda/roche_SAT/'
                          'm54013_151205_032353.sts.xml')
def test_split_zmws_around_read_groups(self):
    """ZMW-based splitting with and without breaking read groups."""
    ds1 = pbtestdata.get_file("subreads-xml")
    ds2 = pbtestdata.get_file("subreads-sequel")
    qid_a, qid_b = -2081539485, -1197849594
    ds = SubreadSet(ds1, ds2)
    assert len(ds) == 137
    # breaking read groups is still the default behavior
    chunks = list(ds.split(chunks=2, zmws=True, breakReadGroups=True))
    assert len(chunks[0]) == 72
    assert len(chunks[1]) == 65
    # don't break up movies
    chunks = list(ds.split(chunks=2, zmws=True, breakReadGroups=False))
    assert len(chunks[0]) == 20
    assert len(chunks[1]) == 117
    assert np.all(chunks[0].index.qId == qid_a)
    assert np.all(chunks[1].index.qId == qid_b)
    chunks = list(
        ds.split(chunks=4, targetSize=1, zmws=True, breakReadGroups=False))
    assert [len(c) for c in chunks] == [8, 12, 54, 63]
    # each chunk must stay within a single movie/read group
    for chunk, qid in zip(chunks, [qid_a, qid_a, qid_b, qid_b]):
        assert np.all(chunk.index.qId == qid)
    # control: single-movie dataset
    ds = SubreadSet(ds1)
    chunks1 = list(ds.split(chunks=4, zmws=True, breakReadGroups=False))
    chunks2 = list(ds.split(chunks=4, zmws=True, breakReadGroups=True))
    assert [len(x) for x in chunks1] == [len(y) for y in chunks2]
def test_reports_with_fixed_bins(self):
    """Check continuous_dist_shaper() against fixed-bin summary stats:
    shifted and zeroed distributions must keep their bin counts and
    totals after reshaping."""
    # TODO readQualDists are currently unpopulated, turn back on when
    # they're repopulated
    # for dist_name, nbins in zip(['medianInsertDists', 'readLenDists',
    #                              'readQualDists'], [200, 200, 50]):
    for dist_name, nbins in zip(['medianInsertDists', 'readLenDists'],
                                [200, 200]):
        ss = SubreadSet()
        ss.loadStats(get_fixed_bin_sts())
        ss2 = SubreadSet()
        ss2.loadStats(get_fixed_bin_sts())
        # shift ss2: prepend three empty bins, dropping the last three
        mdist = getattr(ss2.metadata.summaryStats, dist_name)[0].bins
        mdist = [0, 0, 0] + mdist[:-3]
        getattr(ss2.metadata.summaryStats, dist_name)[0].bins = mdist
        ss3 = ss + ss2
        ss4 = SubreadSet()
        ss4.loadStats(get_fixed_bin_sts())
        # shift ss4: zero out every bin entirely
        mdist = getattr(ss4.metadata.summaryStats, dist_name)[0].bins
        mdist = [0 for _ in mdist]
        getattr(ss4.metadata.summaryStats, dist_name)[0].bins = mdist
        dists = getattr(ss4.metadata.summaryStats, dist_name)
        self.assertEqual(len(dists), 1)
        # an all-zero distribution reshapes to the original bin count
        # for every requested nbins
        for n in [0, 1, 2, 10, 40, 41, 49, 50, 51, 200, 500]:
            ds = continuous_dist_shaper(dists, nbins=n)
            fixed_dists = [ds(dist) for dist in dists]
            self.assertEqual(len(dists[0].bins), nbins)
            self.assertEqual(len(fixed_dists[0].bins), nbins)
            self.assertEqual(sum(dists[0].bins), sum(fixed_dists[0].bins))
        sss = [ss, ss2, ss3]
        for sset in sss:
            dists = getattr(sset.metadata.summaryStats, dist_name)
            self.assertEqual(len(dists), 1)
            # 0, requested nbins > numBins fails back to no-op
            ops = [1, 2, 3, 4, 7, 10, 40, 41, 49, 50, 51, 200, 500]
            no_ops = [0]
            for n in no_ops:
                # no-op: output keeps the input's bin count
                ds = continuous_dist_shaper(dists, nbins=n)
                fixed_dists = [ds(dist) for dist in dists]
                self.assertEqual(len(dists[0].bins), nbins)
                self.assertEqual(len(fixed_dists[0].bins), nbins)
                self.assertEqual(sum(dists[0].bins), sum(fixed_dists[0].bins))
            for n in ops:
                # real reshape: output has exactly n bins, same total
                ds = continuous_dist_shaper(dists, nbins=n)
                fixed_dists = [ds(dist) for dist in dists]
                self.assertEqual(len(dists[0].bins), nbins)
                self.assertEqual(len(fixed_dists[0].bins), n)
                self.assertEqual(sum(dists[0].bins), sum(fixed_dists[0].bins))
def test_isBarcoded(self):
    """isBarcoded for mixed, non-empty, and empty inputs; regenerating
    the pbi of the empty BAM flips isBarcoded to False."""
    empty_bam = upstreamdata.getEmptyBam()
    barcoded_fn = ('/pbi/dept/secondary/siv/testdata/'
                   'pblaa-unittest/Sequel/Phi29/m54008_160219_003234'
                   '.tiny.subreadset.xml')
    # One empty one non empty
    ss = SubreadSet(barcoded_fn, empty_bam, skipMissing=True)
    self.assertTrue(ss.isBarcoded)
    # Just nonempty
    ss = SubreadSet(barcoded_fn, skipMissing=True)
    self.assertEqual(len(ss), 15133)
    self.assertTrue(ss.isBarcoded)
    # Just empty
    # This is crazy, the pbi must be out of date:
    ss = SubreadSet(empty_bam)
    self.assertEqual(len(ss), 0)
    self.assertTrue(ss.isBarcoded)
    # To confirm current behavior, regenerate the pbi with a current
    # pbindex:
    copy_fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    log.info("Copying to {}".format(copy_fn))
    ss.copyTo(copy_fn)
    ss.induceIndices(force=True)
    self.assertFalse(ss.isBarcoded)
def test_barcode_split_maxChunks(self):
    """maxChunks barcode split folds trailing barcodes into one chunk;
    re-applying each chunk's filters reproduces the chunk counts."""
    fn = ('/pbi/dept/secondary/siv/testdata/'
          'pblaa-unittest/Sequel/Phi29/m54008_160219_003234'
          '.tiny.subreadset.xml')
    sset = SubreadSet(fn, skipMissing=True)
    ssets = sset.split(maxChunks=2, barcodes=True)
    self.assertEqual(
        [str(ss.filters) for ss in ssets],
        ["( bc = [0, 0] )", "( bc = [1, 1] ) OR ( bc = [2, 2] )"])
    sset = SubreadSet(fn, skipMissing=True)
    self.assertEqual(len(sset), 15133)
    expectations = [("( bc = [0, 0] )", 5370),
                    ("( bc = [1, 1] ) OR ( bc = [2, 2] )", 9763)]
    for chunk, (filter_str, count) in zip(ssets, expectations):
        sset.filters = None
        self.assertEqual(str(sset.filters), "")
        sset.updateCounts()
        self.assertEqual(len(sset), 2667562)
        sset.filters = chunk.filters
        self.assertEqual(str(sset.filters), filter_str)
        sset.updateCounts()
        self.assertEqual(len(sset), count)
def test_barcode_split_cornercases(self):
    """Split by barcode into 3 chunks; both spellings of the barcode
    pair filter select the same records."""
    fn = ('/pbi/dept/secondary/siv/testdata/'
          'pblaa-unittest/Sequel/Phi29/m54008_160219_003234'
          '.tiny.subreadset.xml')
    sset = SubreadSet(fn, skipMissing=True)
    ssets = sset.split(chunks=3, barcodes=True)
    self.assertEqual(
        [str(ss.filters) for ss in ssets],
        ["( bc = [0, 0] )", "( bc = [1, 1] )", "( bc = [2, 2] )"])
    sset = SubreadSet(fn, skipMissing=True)
    self.assertEqual(len(sset), 15133)
    for bc_expr in ('[2, 2]', '[2,2]'):
        sset.filters = None
        self.assertEqual(str(sset.filters), "")
        sset.updateCounts()
        self.assertEqual(len(sset), 2667562)
        sset.filters.addRequirement(bc=[('=', bc_expr)])
        self.assertEqual(str(sset.filters), "( bc = {} )".format(bc_expr))
        sset.updateCounts()
        self.assertEqual(len(sset), 4710)
def test_copy(self):
    """DataSet.copy() yields an equal-but-distinct dataset, and
    copy(asType=...) casts only between compatible dataset types."""
    ds1 = DataSet(data.getXml())
    ds2 = ds1.copy()
    self.assertFalse(ds1 == ds2)
    self.assertFalse(ds1.uuid == ds2.uuid)
    self.assertFalse(ds1 is ds2)
    self.assertTrue(ds1.name == ds2.name)
    self.assertTrue(ds1.externalResources == ds2.externalResources)
    # The name and UniqueId are different:
    self.assertFalse(ds1.objMetadata == ds2.objMetadata)
    self.assertTrue(ds1.filters == ds2.filters)
    self.assertTrue(ds1.subdatasets == ds2.subdatasets)
    self.assertTrue(len(ds1.subdatasets) == 2)
    self.assertTrue(len(ds2.subdatasets) == 2)
    # no subdataset object may be shared between the two copies
    assert not any(ds1d is ds2d
                   for ds1d in ds1.subdatasets
                   for ds2d in ds2.subdatasets)
    # TODO: once simulated files are indexable, turn on strict:
    ds1 = SubreadSet(data.getXml(no=10), strict=False)
    self.assertEqual(type(ds1.metadata).__name__, 'SubreadSetMetadata')
    ds2 = ds1.copy()
    self.assertEqual(type(ds2.metadata).__name__, 'SubreadSetMetadata')
    # Lets try casting
    ds1 = DataSet(data.getBam())
    self.assertEqual(type(ds1).__name__, 'DataSet')
    ds1 = ds1.copy(asType='SubreadSet')
    self.assertEqual(type(ds1).__name__, 'SubreadSet')
    # Lets do some illicit casting
    with self.assertRaises(TypeError):
        ds1 = ds1.copy(asType='ReferenceSet')
    # Lets try not having to cast
    ds1 = SubreadSet(data.getBam())
    self.assertEqual(type(ds1).__name__, 'SubreadSet')
def test_get_barcode_sample_mappings(self):
    """Only samples actually present in the BAM appear in the mapping."""
    with SubreadSet(self._subreads) as ds:
        # just double-checking that the XML defines more samples than are
        # actually present in the BAM
        assert len(ds.metadata.collections[0].wellSample.bioSamples) == 3
    mappings = get_barcode_sample_mappings(SubreadSet(self._subreads))
    assert mappings == {'lbc3--lbc3': 'Charles', 'lbc1--lbc1': 'Alice'}
def test_barcode_split_maxChunks(self):
    """maxChunks barcode split folds trailing barcodes into one chunk;
    re-applying each chunk's filters reproduces the chunk counts."""
    fn = ('/pbi/dept/secondary/siv/testdata/'
          'pblaa-unittest/Sequel/Phi29/m54008_160219_003234'
          '.tiny.subreadset.xml')
    sset = SubreadSet(fn, skipMissing=True)
    ssets = list(sset.split(maxChunks=2, barcodes=True))
    expected = ["( bc = [0, 0] )", "( bc = [1, 1] ) OR ( bc = [2, 2] )"]
    assert [str(ss.filters) for ss in ssets] == expected
    sset = SubreadSet(fn, skipMissing=True)
    assert len(sset) == 15133
    for chunk, filter_str, count in zip(ssets, expected, [5370, 9763]):
        sset.filters = None
        assert str(sset.filters) == ""
        sset.updateCounts()
        assert len(sset) == 2667562
        sset.filters = chunk.filters
        assert str(sset.filters) == filter_str
        sset.updateCounts()
        assert len(sset) == count
def test_subreadset_metadata_element_name(self):
    """Written XML must use the DataSetMetadata element name, never
    SubreadSetMetadata, whether or not the metadata was modified."""
    NS = '{http://pacificbiosciences.com/PacBioDatasets.xsd}'

    def check_metadata_tag(xml_fn):
        # exactly one DataSetMetadata element, no SubreadSetMetadata
        root = ET.parse(xml_fn).getroot()
        assert len(root.findall(NS + 'SubreadSetMetadata')) == 0
        assert len(root.findall(NS + 'DataSetMetadata')) == 1

    # without touching the element:
    sset = SubreadSet(data.getXml(9))
    log.debug(data.getXml(9))
    fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml")
    log.debug(fn.name)
    sset.write(fn.name)
    check_metadata_tag(fn.name)
    fn.close()
    # with touching the element:
    sset = SubreadSet(data.getXml(9))
    sset.metadata.description = 'foo'
    fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml")
    sset.write(fn.name, validate=False)
    check_metadata_tag(fn.name)
    fn.close()
def test_output_subreadset_name(self):
    """
    Verify that the output SubreadSet name is identical to the input
    name plus ' (barcoded)'.
    """
    with SubreadSet(self.entrypoints.data['eid_subread']) as ds_in:
        expected_name = ds_in.name + " (barcoded)"
    with SubreadSet(self._get_subreadset_out()) as ds_out:
        self.assertEqual(ds_out.name, expected_name)
def run_bam_to_bam(subread_set_file, barcode_set_file, output_file_name,
                   nproc=1, score_mode="symmetric"):
    """
    Run bam2bam barcoding on each subreads/scraps BAM pair in the input
    SubreadSet and collect the barcoded outputs into a new SubreadSet.

    :param subread_set_file: input SubreadSet XML (must include scraps)
    :param barcode_set_file: BarcodeSet XML (single-FASTA only)
    :param output_file_name: path for the barcoded SubreadSet XML
    :param nproc: number of bam2bam threads
    :param score_mode: 'symmetric' or 'asymmetric'
    :return: 0 on success, otherwise the first failing bam2bam exit code
    :raises ValueError: on an unrecognized score_mode
    :raises NotImplementedError: on a multi-FASTA BarcodeSet
    :raises TypeError: if any resource lacks a scraps BAM
    """
    if score_mode not in ("asymmetric", "symmetric"):
        raise ValueError("Unrecognized score mode '{m}'".format(m=score_mode))
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError(
            "Multi-FASTA BarcodeSet input is not supported.")
    barcode_fasta = bc.toExternalFiles()[0]
    with SubreadSet(subread_set_file) as ds:
        ds_new = SubreadSet(strict=True)
        for ext_res in ds.externalResources:
            subreads_bam = ext_res.bam
            scraps_bam = ext_res.scraps
            assert subreads_bam is not None
            if scraps_bam is None:
                raise TypeError("The input SubreadSet must include scraps.")
            # escape the dots and anchor so the suffix match is literal
            new_prefix = op.join(
                op.dirname(output_file_name),
                re.sub(r"\.subreads\.bam$", "_barcoded",
                       op.basename(subreads_bam)))
            # resource paths may be relative to the dataset XML
            if not op.isabs(subreads_bam):
                subreads_bam = op.join(op.dirname(subread_set_file),
                                       subreads_bam)
            if not op.isabs(scraps_bam):
                scraps_bam = op.join(op.dirname(subread_set_file), scraps_bam)
            args = [
                "bam2bam", "-j", str(nproc), "-b", str(nproc), "-o",
                new_prefix, "--barcodes", barcode_fasta, "--scoreMode",
                score_mode, subreads_bam, scraps_bam
            ]
            log.info(" ".join(args))
            result = run_cmd(" ".join(args),
                             stdout_fh=sys.stdout,
                             stderr_fh=sys.stderr)
            if result.exit_code != 0:
                return result.exit_code
            subreads_bam = new_prefix + ".subreads.bam"
            scraps_bam = new_prefix + ".scraps.bam"
            assert op.isfile(subreads_bam), "Missing {f}".format(
                f=subreads_bam)
            add_subread_resources(ds_new,
                                  subreads=subreads_bam,
                                  scraps=scraps_bam,
                                  barcodes=barcode_set_file)
        # carry filters, metadata, and name over from the input dataset
        ds._filters.clearCallbacks()
        ds_new._filters = ds._filters
        ds_new._populateMetaTypes()
        ds_new.metadata = ds.metadata
        ds_new.name = ds.name + " (barcoded)"
        ds_new.updateCounts()
        ds_new.newUuid()
        ds_new.write(output_file_name)
    return 0
def test_file_arg(self):
    """qname/qname_file/zm filters accept a file of newline-separated
    values."""

    def _write_values(values):
        # one filter value per line in a fresh temp file
        path = tempfile.NamedTemporaryFile(suffix="filterVals.txt").name
        log.debug(path)
        with open(path, 'w') as ofh:
            for value in values:
                ofh.write(str(value))
                ofh.write('\n')
        return path

    # qname filter, via the generic 'qname' requirement
    sset = SubreadSet(data.getXml(9))
    assert len(sset) == 92
    size = 10
    qn = [r.qName for r in sset[:size]]
    fn = _write_values(qn)
    sset.filters.addRequirement(qname=[('=', fn)])
    assert size == sum(1 for _ in sset)
    assert size == len(sset)
    remaining = set(qn)
    for rec in sset:
        remaining.discard(rec.qName)
    assert len(remaining) == 0

    # qname filter, via the explicit 'qname_file' requirement
    sset = SubreadSet(data.getXml(9))
    assert len(sset) == 92
    size = 10
    qn = [r.qName for r in sset[:size]]
    fn = _write_values(qn)
    sset.filters.addRequirement(qname_file=[('=', fn)])
    assert size == sum(1 for _ in sset)
    assert size == len(sset)
    remaining = set(qn)
    for rec in sset:
        remaining.discard(rec.qName)
    assert len(remaining) == 0

    # hole-number filter via 'zm'
    sset = SubreadSet(data.getXml(9))
    assert len(sset) == 92
    size = 4
    hn = sorted(set(sset.index.holeNumber))[:size]
    fn = _write_values(hn)
    sset.filters.addRequirement(zm=[('=', fn)])
    assert size == len(set(sset.index.holeNumber))
    remaining = set(hn)
    for rec in sset:
        remaining.discard(rec.holeNumber)
    assert len(remaining) == 0
def test_subreads_parent_dataset(self):
    """Parent-dataset provenance survives, and can be added to a dataset
    constructed straight from a BAM."""
    parent_uuid = "f81cf391-b3da-41f8-84cb-a0de71f460f4"
    ds1 = SubreadSet(data.getXml(no=5), skipMissing=True)
    assert ds1.metadata.provenance.parentDataSet.uniqueId == parent_uuid
    # a dataset built straight from the BAM has no provenance...
    ds2 = SubreadSet(ds1.externalResources[0].bam, skipMissing=True)
    assert ds2.metadata.provenance.parentDataSet.uniqueId is None
    # ...until the parent is attached explicitly
    ds2.metadata.addParentDataSet(parent_uuid,
                                  "PacBio.DataSet.SubreadSet",
                                  "timestamped_name")
    assert ds2.metadata.provenance.parentDataSet.uniqueId == parent_uuid
    ds_out = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    ds2.write(ds_out, validate=False)
def test_adapters_resource(self):
    """The adapters resource is exposed for both XML and raw BAM inputs."""
    cases = [
        ("/pbi/dept/secondary/siv/testdata/BlasrTestData/ctest/data/"
         "m54075_161031_164015.subreadset.xml",
         'm54075_161031_164015_adapter.fasta'),
        ("/pbi/dept/secondary/siv/testdata/SA3-Sequel/ecoli/315/"
         "3150319/r54011_20160727_213451/1_A01/"
         "m54011_160727_213918.subreads.bam",
         'm54011_160727_213918.adapters.fasta'),
    ]
    for input_fn, adapters_suffix in cases:
        s = SubreadSet(input_fn)
        assert s.externalResources[0].adapters.endswith(adapters_suffix)
def test_output_subreadset_has_metadata(self):
    """
    Verify that metadata from the instrument are propagated to the
    barcoded SubreadSet.
    """
    with SubreadSet(self.entrypoints.data['eid_subread']) as ds_in:
        with SubreadSet(self._get_subreadset_out()) as ds_out:
            cols_in = ds_in.metadata.collections.submetadata
            cols_out = ds_out.metadata.collections.submetadata
            self.assertTrue(len(cols_out) > 0)
            self.assertEqual(cols_in[0].attrib['InstrumentName'],
                             cols_out[0].attrib['InstrumentName'])
def test_merge(self):
    """Collection merging: a plain merge appends the (identical) record,
    forceUnique collapses duplicates back to one."""
    sset_fn = ('/pbi/dept/secondary/siv/testdata/'
               'SA3-Sequel/lambda/roche_SAT/'
               'm54013_151205_032353.subreadset.xml')
    sset = SubreadSet(sset_fn)
    orig_metadata = copy.deepcopy(sset.metadata)
    assert len(sset.metadata.collections) == 1
    sset.metadata.collections.merge(orig_metadata.collections)
    assert len(sset.metadata.collections) == 2
    sset = SubreadSet(sset_fn)
    sset.metadata.collections.merge(orig_metadata.collections,
                                    forceUnique=True)
    assert len(sset.metadata.collections) == 1
def test_build(self):
    """DataSet/SubreadSet construction from BAM, XML, and FOFN inputs,
    XML round-tripping, type casting via copy(), and manual external
    resource manipulation."""
    # Progs like pbalign provide a .bam file:
    # e.g. d = DataSet("aligned.bam")
    # Something like the test files we have:
    inBam = data.getBam()
    self.assertTrue(inBam.endswith('.bam'))
    d = DataSet(inBam)
    # A UniqueId is generated, despite being a BAM input
    self.assertTrue(d.uuid != '')
    dOldUuid = d.uuid
    # They can write this BAM to an XML:
    # e.g. d.write("alignmentset.xml")
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    outXml = os.path.join(outdir, 'tempfile.xml')
    d.write(outXml)
    # And then recover the same XML (or a different one):
    # e.g. d = DataSet("alignmentset.xml")
    d = DataSet(outXml)
    # The UniqueId will be the same
    self.assertTrue(d.uuid == dOldUuid)
    # Inputs can be many and varied
    ds1 = DataSet(data.getXml(11), data.getBam())
    self.assertEqual(ds1.numExternalResources, 2)
    ds1 = DataSet(data.getFofn())
    self.assertEqual(ds1.numExternalResources, 2)
    # New! Use the correct constructor:
    self.assertEqual(
        type(SubreadSet(data.getSubreadSet())).__name__, 'SubreadSet')
    # Even with untyped inputs
    self.assertTrue(
        str(SubreadSet(data.getBam())).startswith('<SubreadSet'))
    self.assertEqual(type(SubreadSet(data.getBam())).__name__, 'SubreadSet')
    self.assertEqual(type(DataSet(data.getBam())).__name__, 'DataSet')
    # You can also cast up and down, but casting between siblings
    # is limited (abuse at your own risk)
    self.assertEqual(
        type(DataSet(data.getBam()).copy(asType='SubreadSet')).__name__,
        'SubreadSet')
    self.assertEqual(
        type(SubreadSet(data.getBam()).copy(asType='DataSet')).__name__,
        'DataSet')
    # Add external Resources:
    ds = DataSet()
    ds.externalResources.addResources(["IdontExist.bam"])
    self.assertTrue(
        ds.externalResources[-1].resourceId == "IdontExist.bam")
    # Add an index file
    ds.externalResources[-1].addIndices(["IdontExist.bam.pbi"])
    self.assertTrue(ds.externalResources[-1].indices[0].resourceId ==
                    "IdontExist.bam.pbi")
def test_qname_filter_scaling(self):
    """Filtering by qname with a single 'in'-style requirement scales to
    large datasets, unlike one '==' requirement per qname."""
    # unaligned bam
    bam0 = ("/pbi/dept/secondary/siv/testdata/"
            "SA3-DS/ecoli/2590956/0003/"
            "Analysis_Results/m140913_222218_42240_c10069"
            "9952400000001823139203261564_s1_p0.all.subreadset.xml")
    bam1 = ("/pbi/dept/secondary/siv/testdata/"
            "SA3-DS/ecoli/2590953/0001/"
            "Analysis_Results/m140913_005018_42139_c10071"
            "3652400000001823152404301534_s1_p0.all.subreadset.xml")
    # separate '==' takes 120 seconds to addReq for 10k qnames:
    # (kept below, disabled, for reference)
    """
    sset = SubreadSet(bam0, bam1)
    self.assertEqual(len(sset), 178570)
    size = 100
    qn = [r.qName for r in sset[:size]]
    good_qn = [('=', name) for name in qn]
    sset.filters.addRequirement(qname=good_qn)
    #self.assertEqual(size, sum(1 for _ in sset))
    self.assertEqual(size, len(sset))

    sset = SubreadSet(data.getXml(10))
    self.assertEqual(len(sset), 92)
    size = 10
    qn = [r.qName for r in sset[:size]]
    good_qn = [('=', name) for name in qn]
    sset.filters.addRequirement(qname=good_qn)
    self.assertEqual(size, sum(1 for _ in sset))
    self.assertEqual(size, len(sset))
    """
    # "in" takes 1.2 seconds to addReq for 10k qnames:
    sset = SubreadSet(bam0, bam1)
    self.assertEqual(len(sset), 178570)
    size = 100
    qn = [r.qName for r in sset[:size]]
    # a single requirement whose value is the whole list of qnames
    good_qn = [('=', qn)]
    sset.filters.addRequirement(qname=good_qn)
    #self.assertEqual(size, sum(1 for _ in sset))
    self.assertEqual(size, len(sset))
    sset = SubreadSet(data.getXml(10))
    self.assertEqual(len(sset), 92)
    size = 10
    qn = [r.qName for r in sset[:size]]
    good_qn = [('=', qn)]
    sset.filters.addRequirement(qname=good_qn)
    self.assertEqual(size, sum(1 for _ in sset))
    self.assertEqual(size, len(sset))
def test_provenance_record_ordering(self):
    """Provenance must serialize between NumRecords and Collections."""
    ds = SubreadSet(pbtestdata.get_file("subreads-sequel"), strict=True)
    ds.metadata.addParentDataSet(uuid.uuid4(),
                                 ds.datasetType,
                                 createdBy="AnalysisJob",
                                 timeStampedName="")
    tmp_out = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    ds.write(tmp_out)
    reread = SubreadSet(tmp_out, strict=True)
    observed = [child['tag'] for child in reread.metadata.record['children']]
    self.assertEqual(observed, [
        'TotalLength', 'NumRecords', 'Provenance', 'Collections',
        'SummaryStats'
    ])
def test_nested_external_resources(self):
    """Nested externalResources: pre-existing nested resources are
    exposed as attributes, and assigning those attributes creates nested
    resources with the right metaTypes."""
    log.debug("Testing nested externalResources in AlignmentSets")
    aln = AlignmentSet(data.getXml(0), skipMissing=True)
    res = aln.externalResources[0]
    self.assertTrue(res.pbi)
    self.assertTrue(res.reference)
    self.assertEqual(res.externalResources[0].metaType,
                     'PacBio.ReferenceFile.ReferenceFastaFile')
    self.assertEqual(res.scraps, None)
    log.debug("Testing nested externalResources in SubreadSets")
    subs = SubreadSet(data.getXml(5), skipMissing=True)
    res = subs.externalResources[0]
    self.assertTrue(res.scraps)
    self.assertEqual(res.externalResources[0].metaType,
                     'PacBio.SubreadFile.ScrapsBamFile')
    self.assertEqual(res.reference, None)
    log.debug("Testing added nested externalResoruces to SubreadSet")
    subs = SubreadSet(data.getXml(10))
    res = subs.externalResources[0]
    self.assertFalse(res.scraps)
    res.scraps = 'fake.fasta'
    self.assertTrue(res.scraps)
    self.assertEqual(res.externalResources[0].metaType,
                     'PacBio.SubreadFile.ScrapsBamFile')
    res.barcodes = 'bc.fasta'
    self.assertTrue(res.barcodes)
    self.assertEqual(res.externalResources[1].metaType,
                     "PacBio.DataSet.BarcodeSet")
    res.adapters = 'foo.adapters.fasta'
    self.assertEqual(res.adapters, 'foo.adapters.fasta')
    self.assertEqual(res.externalResources[2].metaType,
                     "PacBio.SubreadFile.AdapterFastaFile")
    log.debug("Testing adding nested externalResources to AlignmetnSet "
              "manually")
    aln = AlignmentSet(data.getXml(8))
    res = aln.externalResources[0]
    self.assertTrue(res.bai)
    self.assertTrue(res.pbi)
    self.assertFalse(res.reference)
    res.reference = 'fake.fasta'
    self.assertTrue(res.reference)
    self.assertEqual(res.externalResources[0].metaType,
                     'PacBio.ReferenceFile.ReferenceFastaFile')
def test_subreadset_consolidate(self):
    """consolidate() merges resources into one BAM, preserving records."""
    log.debug("Test through API")
    aln = SubreadSet(data.getXml(10), data.getXml(13))
    self.assertEqual(len(aln.toExternalFiles()), 2)
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    outfn = os.path.join(outdir, 'merged.bam')
    aln.consolidate(outfn)
    self.assertTrue(os.path.exists(outfn))
    self.assertEqual(len(aln.toExternalFiles()), 1)
    nonCons = SubreadSet(data.getXml(10), data.getXml(13))
    self.assertEqual(len(nonCons.toExternalFiles()), 2)
    # record-for-record identity with the unconsolidated dataset
    for consolidated, original in zip(sorted(list(aln)),
                                      sorted(list(nonCons))):
        self.assertEqual(consolidated, original)
    self.assertEqual(len(aln), len(nonCons))
def test_subread_build(self):
    """SubreadSet construction yields SubreadSetMetadata; adding two
    datasets concatenates their collections."""
    ds1 = SubreadSet(data.getXml(no=5), skipMissing=True)
    ds2 = SubreadSet(data.getXml(no=5), skipMissing=True)
    assert type(ds1).__name__ == 'SubreadSet'
    assert ds1._metadata.__class__.__name__ == 'SubreadSetMetadata'
    assert type(ds1._metadata).__name__ == 'SubreadSetMetadata'
    assert type(ds1.metadata).__name__ == 'SubreadSetMetadata'
    assert len(ds1.metadata.collections) == 1
    assert len(ds2.metadata.collections) == 1
    merged = ds1 + ds2
    assert len(merged.metadata.collections) == 2
    ds4 = SubreadSet(data.getSubreadSet(), skipMissing=True)
    assert type(ds4).__name__ == 'SubreadSet'
    assert type(ds4._metadata).__name__ == 'SubreadSetMetadata'
    assert len(ds4.metadata.collections) == 1