def test_update_barcoded_sample_metadata(self):
    datastore_tmp = tempfile.NamedTemporaryFile(
        suffix=".datastore.json").name
    barcodes = pbtestdata.get_file("barcodeset")
    ds = split_barcoded_dataset(self.SUBREADS)
    ds.write_json(datastore_tmp)
    base_dir = tempfile.mkdtemp()
    datastore = update_barcoded_sample_metadata(base_dir, datastore_tmp,
                                                self.SUBREADS, barcodes)
    validate_barcoded_datastore_files(self, self.SUBREADS, datastore)
    # now with use_barcode_uuids=False
    datastore = update_barcoded_sample_metadata(base_dir, datastore_tmp,
                                                self.SUBREADS, barcodes,
                                                use_barcode_uuids=False)
    validate_barcoded_datastore_files(self, self.SUBREADS, datastore,
                                      use_barcode_uuids=False)
    # test that it doesn't break with no collection metadata
    ss = SubreadSet(self.SUBREADS)
    ss.metadata.collections = None
    ss_tmp = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    ss.write(ss_tmp)
    ds = split_barcoded_dataset(ss_tmp)
    ds.write_json(datastore_tmp)
    base_dir = tempfile.mkdtemp()
    datastore = update_barcoded_sample_metadata(base_dir, datastore_tmp,
                                                self.SUBREADS, barcodes)
    validate_barcoded_datastore_files(self, self.SUBREADS, datastore,
                                      have_collection_metadata=False,
                                      number_of_expected_collections=0)
def main(parser):
    args = parser.parse_args()
    filt = Filters()
    dset = SubreadSet(args.inXml)
    names = nameGen(args.inFile, fileType='list' if args.list else 'fasta')
    if args.subreads:
        if args.inverted:
            for name in names:
                filt.addRequirement(QNAME=[('!=', name)])
        else:
            filt.addRequirement(QNAME=[('=', name) for name in names])
    else:
        assert len(dset.movieIds) == 1, (
            'This method only works for single-movie subreadsets. '
            'Use the --subreads option for multi-movie subreadsets.')
        uniqHn = set(map(getZmw, names))
        if args.inverted:
            for hn in uniqHn:
                filt.addRequirement(zm=[('!=', hn)])
        else:
            filt.addRequirement(zm=[('=', hn) for hn in uniqHn])
    dset.addFilters(filt)
    if args.newUuid:
        dset.newUuid()
    if args.name:
        dset.name = args.name
    dset.write(args.outXml)
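# A minimal sketch of the two helpers that main() above relies on, assuming
# the usual subread qname layout "<movie>/<holeNumber>/<qStart>_<qEnd>". Both
# nameGen and getZmw are reconstructed here for illustration only and may
# differ from the project's actual implementations.
def nameGen(fn, fileType='fasta'):
    """Yield read names from a plain list file or from FASTA headers."""
    with open(fn) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            if fileType == 'fasta':
                if line.startswith('>'):
                    yield line[1:].split()[0]
            else:
                yield line

def getZmw(name):
    """Extract the ZMW hole number from a subread qname."""
    return int(name.split('/')[1])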
def _generateSubreadSet(output_bam_file):
    sset = SubreadSet(output_bam_file, generateIndices=True)
    # strip the trailing "subreads.bam" (12 characters) and append the
    # dataset extension, e.g. "movie.subreads.bam" -> "movie.subreadset.xml"
    sset_output_name = output_bam_file[:-12] + 'subreadset.xml'
    sset.name = sset_output_name.split('.')[0]
    sset.write(sset_output_name)
    return sset_output_name
def test_subreadset_metadata_element_name(self):
    # without touching the element:
    sset = SubreadSet(data.getXml(10))
    log.debug(data.getXml(10))
    fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    log.debug(fn)
    sset.write(fn)
    f = ET.parse(fn)
    self.assertEqual(len(f.getroot().findall(
        '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
        'SubreadSetMetadata')), 0)
    self.assertEqual(len(f.getroot().findall(
        '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
        'DataSetMetadata')), 1)

    # with touching the element:
    sset = SubreadSet(data.getXml(10))
    sset.metadata.description = 'foo'
    fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    sset.write(fn, validate=False)
    f = ET.parse(fn)
    self.assertEqual(len(f.getroot().findall(
        '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
        'SubreadSetMetadata')), 0)
    self.assertEqual(len(f.getroot().findall(
        '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
        'DataSetMetadata')), 1)
def test_subreadset_from_bam(self):
    # DONE control experiment for bug 28698
    bam = upstreamData.getUnalignedBam()
    ds1 = SubreadSet(bam, strict=False)
    fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    log.debug(fn)
    ds1.write(fn)
def split_dataset(subreadset, out_prefix):
    """
    Takes an input dataset, and for each entry generates one separate
    dataset file, while maintaining all the filters.
    Returns a list of paths to the generated datasets (FOFN-style).

    To create an example filtered dataset for testing:
    dataset create --type SubreadSet test.subreadset.xml subreads1.bam subreads2.bam
    dataset filter test.subreadset.xml test.filtered.subreadset.xml 'length>1000'
    """
    out_prefix_abs = os.path.abspath(out_prefix)
    dset = SubreadSet(subreadset, strict=True)
    fns = dset.toFofn()
    log.info('resources in {!r}:\n{}'.format(subreadset, '\n'.join(fns)))
    fofn = []
    for i, bam_fn in enumerate(fns):
        out_fn = '{}.{:05}.subreadset.xml'.format(out_prefix_abs, i)
        new_dataset = SubreadSet(bam_fn)
        new_dataset.newUuid()
        new_dataset._filters = copy.deepcopy(dset._filters)
        new_dataset.write(out_fn)
        fofn.append(out_fn)
    return fofn
def test_subreadset_metadata_element_name(self):
    # without touching the element:
    sset = SubreadSet(data.getXml(9))
    log.debug(data.getXml(9))
    fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml")
    log.debug(fn.name)
    sset.write(fn.name)
    f = ET.parse(fn.name)
    assert len(f.getroot().findall(
        '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
        'SubreadSetMetadata')) == 0
    assert len(f.getroot().findall(
        '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
        'DataSetMetadata')) == 1
    fn.close()

    # with touching the element:
    sset = SubreadSet(data.getXml(9))
    sset.metadata.description = 'foo'
    fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml")
    sset.write(fn.name, validate=False)
    f = ET.parse(fn.name)
    assert len(f.getroot().findall(
        '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
        'SubreadSetMetadata')) == 0
    assert len(f.getroot().findall(
        '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
        'DataSetMetadata')) == 1
    fn.close()
def run_bax_to_bam(input_file_name, output_file_name):
    with HdfSubreadSet(input_file_name) as ds_in:
        movies = set()
        for rr in ds_in.resourceReaders():
            movies.add(rr.movieName)
        if len(movies) > 1:
            out_dir = os.path.dirname(output_file_name)
            ds_out_files = []
            for bax_file in ds_in.toExternalFiles():
                output_file_name_tmp = os.path.join(
                    out_dir,
                    ".".join(os.path.basename(bax_file).split(".")[:-2]) +
                    ".hdfsubreadset.xml")
                rc = _run_bax_to_bam(bax_file, output_file_name_tmp)
                if rc != 0:
                    log.error("bax2bam failed")
                    return rc
                ds_out_files.append(output_file_name_tmp)
            ds = SubreadSet(*ds_out_files)
            ds.name = ds_in.name
            if 'Description' in ds_in.objMetadata:
                ds.objMetadata['Description'] = ds_in.objMetadata['Description']
                ds.metadata.merge(ds_in.metadata)
            ds.write(output_file_name)
        else:
            return _run_bax_to_bam(input_file_name, output_file_name)
    return 0
def setUpClass(cls):
    super(TestToolContract, cls).setUpClass()
    ds = SubreadSet(BAM_FILE, strict=True)
    ds.write(cls.INPUT_FILES[0])
    with FastaWriter(cls.INPUT_FILES[1]) as fa_out:
        for i in range(1010):
            fa_out.writeRecord("%04d_Forward" % i, "A" * 16)
def split_dataset(subreadset, out_prefix):
    """
    Takes an input dataset, and for each entry generates one separate
    dataset file, while maintaining all the filters.
    Returns a list of the generated datasets.

    To create an example filtered dataset for testing:
    dataset create --type SubreadSet test.subreadset.xml subreads1.bam subreads2.bam
    dataset filter test.subreadset.xml test.filtered.subreadset.xml 'length>1000'
    """
    out_prefix_abs = os.path.abspath(out_prefix)
    dset = SubreadSet(subreadset, strict=True, skipCounts=True)
    fns = dset.toFofn()
    log.info('resources in {!r}:\n{}'.format(subreadset, '\n'.join(fns)))
    split_fns = []
    for i, bam_fn in enumerate(fns):
        out_fn = '{}.{:05}.subreadset.xml'.format(out_prefix_abs, i)
        new_dataset = SubreadSet(bam_fn, skipCounts=True)
        new_dataset.newUuid()
        new_dataset._filters = copy.deepcopy(dset._filters)
        new_dataset.write(out_fn)
        split_fns.append(out_fn)
    return split_fns
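# A hypothetical usage sketch for split_dataset above: split a filtered
# SubreadSet into one dataset per BAM resource and record the paths as a
# FOFN. The input filename matches the docstring's example commands; the
# output prefix and FOFN name are arbitrary assumptions.
split_fns = split_dataset('test.filtered.subreadset.xml', 'example_split')
with open('example_split.fofn', 'w') as fofn_out:
    fofn_out.write('\n'.join(split_fns) + '\n')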
def run(subreadset, fofn):
    dir_name = os.getcwd()
    maxChunks = 0
    dset = SubreadSet(subreadset, strict=True)
    fns = dset.toFofn()
    import pprint
    log.info('resources in {!r}:\n{}'.format(subreadset, pprint.pformat(fns)))
    nrecs = len(dset)
    # HG with 70x coverage => 200G bases total
    ts = 50000   # @ 20k/read => 1G bases, ~300MB .gz => ~200 chunks for Human
    ts = 500000  # @ 20k/read => 10G bases, ~3GB .gz => ~20 chunks for Human
    # and we expect about 7-10min per chunk.
    chunks = nrecs // ts
    log.info('num_chunks={:g} ({:g} / {:g})'.format(chunks, nrecs, ts))
    log.info('Splitting with dset.split(zmws=False, chunks={}, '
             'ignoreSubDatasets=True, maxChunks={},)'.format(chunks, maxChunks))
    dset_chunks = dset.split(zmws=False, chunks=chunks, ignoreSubDatasets=True,
                             maxChunks=maxChunks, updateCounts=False,
                             #targetSize=1, breakContigs=True
                             )
    chunk_fns = []
    for i, dset in enumerate(dset_chunks):
        chunk_name = 'chunk_{:03d}.subreadset.xml'.format(i)  # TODO: 02
        chunk_fn = os.path.join(dir_name, chunk_name)
        dset.updateCounts()
        dset.write(chunk_fn, validate=False)  # , relPaths=True
        chunk_fns.append(chunk_fn)
    with open(fofn, 'w') as ofs:
        for fn in chunk_fns:
            ofs.write('{}\n'.format(fn))
    log.info('Wrote {} chunks into "{}"'.format(len(dset_chunks), fofn))
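# Worked example of the chunk-size arithmetic used in run() above, under the
# human-genome assumptions quoted in its comments (~200G bases at ~20kb/read).
total_bases = 200 * 10**9        # 70x coverage of a ~3Gb genome
read_len = 20 * 10**3            # ~20kb per read
nrecs = total_bases // read_len  # ~10M reads
ts = 500000                      # target reads per chunk
print(nrecs // ts)               # => 20 chunks, ~10G bases (~3GB .gz) each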
def test_loadMetadata(self):
    aln = AlignmentSet(data.getXml(no=8))
    self.assertFalse(aln.metadata.collections)
    aln.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                     'SA3-Sequel/lambda/roche_SAT/'
                     'm54013_151205_032353.run.metadata.xml')
    self.assertTrue(aln.metadata.collections)
    sset_fn = ('/pbi/dept/secondary/siv/testdata/'
               'SA3-Sequel/lambda/roche_SAT/'
               'm54013_151205_032353.subreadset.xml')
    sset = SubreadSet(sset_fn)
    orig_metadata = copy.deepcopy(sset.metadata)
    sset.metadata.collections = None
    self.assertFalse(sset.metadata.collections)
    sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                      'SA3-Sequel/lambda/roche_SAT/'
                      'm54013_151205_032353.run.metadata.xml')
    fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    sset.write(fn)
    validateFile(fn)
    validateFile(sset_fn)
    self.assertEqual(sset.metadata, orig_metadata)

    # load the wrong thing...
    sset_fn = ('/pbi/dept/secondary/siv/testdata/'
               'SA3-Sequel/lambda/roche_SAT/'
               'm54013_151205_032353.subreadset.xml')
    sset = SubreadSet(sset_fn)
    orig_metadata = copy.deepcopy(sset.metadata)
    sset.metadata.collections = None
    self.assertFalse(sset.metadata.collections)
    with self.assertRaises(InvalidDataSetIOError):
        sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                          'SA3-Sequel/lambda/roche_SAT/'
                          'm54013_151205_032353.sts.xml')
def test_loadMetadata(self):
    aln = AlignmentSet(data.getXml(7))
    assert not aln.metadata.collections
    aln.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                     'SA3-Sequel/lambda/roche_SAT/'
                     'm54013_151205_032353.run.metadata.xml')
    assert aln.metadata.collections
    sset_fn = ('/pbi/dept/secondary/siv/testdata/'
               'SA3-Sequel/lambda/roche_SAT/'
               'm54013_151205_032353.subreadset.xml')
    sset = SubreadSet(sset_fn)
    orig_metadata = copy.deepcopy(sset.metadata)
    sset.metadata.collections = None
    assert not sset.metadata.collections
    sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                      'SA3-Sequel/lambda/roche_SAT/'
                      'm54013_151205_032353.run.metadata.xml')
    fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    sset.write(fn)
    validateFile(fn)
    validateFile(sset_fn)
    assert sset.metadata == orig_metadata

    # load the wrong thing...
    sset_fn = ('/pbi/dept/secondary/siv/testdata/'
               'SA3-Sequel/lambda/roche_SAT/'
               'm54013_151205_032353.subreadset.xml')
    sset = SubreadSet(sset_fn)
    orig_metadata = copy.deepcopy(sset.metadata)
    sset.metadata.collections = None
    assert not sset.metadata.collections
    with pytest.raises(InvalidDataSetIOError):
        sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                          'SA3-Sequel/lambda/roche_SAT/'
                          'm54013_151205_032353.sts.xml')
def test_de_novo(self):
    ofn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    log.info(ofn)
    ss = SubreadSet(data.getXml(10))
    col = CollectionMetadata()
    self.assertFalse(ss.metadata.collections)
    ss.metadata.collections.append(col)
    self.assertTrue(ss.metadata.collections)
    # assertTrue(x, y) treats y as a failure message, so it never compared
    # anything; use assertEqual (attribute values round-trip as strings):
    col.cellIndex = 1
    self.assertEqual(ss.metadata.collections[0].cellIndex, '1')
    col.instrumentName = "foo"
    self.assertEqual(ss.metadata.collections[0].instrumentName, "foo")
    col.context = 'bar'
    self.assertEqual(ss.metadata.collections[0].context, "bar")
    ss.metadata.collections[0].runDetails.name = 'foo'
    self.assertEqual('foo', ss.metadata.collections[0].runDetails.name)
    ss.metadata.collections[0].wellSample.name = 'bar'
    self.assertEqual('bar', ss.metadata.collections[0].wellSample.name)
    ss.metadata.collections[0].wellSample.wellName = 'baz'
    self.assertEqual('baz', ss.metadata.collections[0].wellSample.wellName)
    ss.metadata.collections[0].wellSample.concentration = 'baz'
    self.assertEqual('baz',
                     ss.metadata.collections[0].wellSample.concentration)
    ss.write(ofn, validate=False)
def run_bam_to_bam(subread_set_file, barcode_set_file, output_file_name,
                   nproc=1):
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError(
            "Multi-FASTA BarcodeSet input is not supported.")
    barcode_fasta = bc.toExternalFiles()[0]
    with SubreadSet(subread_set_file) as ds:
        # TODO(nechols)(2016-03-15): replace with BarcodedSubreadSet
        ds_new = SubreadSet(strict=True)
        for ext_res in ds.externalResources:
            subreads_bam = ext_res.bam
            scraps_bam = ext_res.scraps
            assert subreads_bam is not None
            if scraps_bam is None:
                raise TypeError("The input SubreadSet must include scraps.")
            new_prefix = op.join(op.dirname(output_file_name),
                                 re.sub(".subreads.bam", "_barcoded",
                                        op.basename(subreads_bam)))
            if not op.isabs(subreads_bam):
                subreads_bam = op.join(op.dirname(subread_set_file),
                                       subreads_bam)
            if not op.isabs(scraps_bam):
                scraps_bam = op.join(op.dirname(subread_set_file), scraps_bam)
            args = [
                "bam2bam",
                "-j", str(nproc),
                "-b", str(nproc),
                "-o", new_prefix,
                "--barcodes", barcode_fasta,
                subreads_bam, scraps_bam
            ]
            # (a stray Python 2 `print args` debugging statement was removed
            # here; the log call below already records the command)
            log.info(" ".join(args))
            result = run_cmd(" ".join(args),
                             stdout_fh=sys.stdout,
                             stderr_fh=sys.stderr)
            if result.exit_code != 0:
                return result.exit_code
            subreads_bam = new_prefix + ".subreads.bam"
            scraps_bam = new_prefix + ".scraps.bam"
            assert op.isfile(subreads_bam), "Missing {f}".format(f=subreads_bam)
            # FIXME we need a more general method for this
            ext_res_new = ExternalResource()
            ext_res_new.resourceId = subreads_bam
            ext_res_new.metaType = 'PacBio.SubreadFile.SubreadBamFile'
            ext_res_new.addIndices([subreads_bam + ".pbi"])
            ext_res_inner = ExternalResources()
            ext_res_scraps = ExternalResource()
            ext_res_scraps.resourceId = scraps_bam
            ext_res_scraps.metaType = 'PacBio.SubreadFile.ScrapsBamFile'
            ext_res_scraps.addIndices([scraps_bam + ".pbi"])
            ext_res_inner.append(ext_res_scraps)
            ext_res_new.append(ext_res_inner)
            ds_new.externalResources.append(ext_res_new)
        ds._filters.clearCallbacks()
        ds_new._filters = ds._filters
        ds_new._populateMetaTypes()
        ds_new.updateCounts()
        ds_new.write(output_file_name)
    return 0
def test_bam2fastx_filtered(self):
    input_file = pbtestdata.get_file("subreads-xml")
    ds = SubreadSet(input_file, strict=True)
    ds.filters.addRequirement(length=[('>=', 1000)])
    input_tmp = get_temp_file(suffix=".subreadset.xml")
    ds.write(input_tmp)
    nrecords_expected = 13
    self.run_and_check_fastx(input_tmp, nrecords_expected)
def run_bam_to_bam(subread_set_file, barcode_set_file, output_file_name,
                   nproc=1, score_mode="symmetric"):
    if score_mode not in ["asymmetric", "symmetric"]:
        raise ValueError("Unrecognized score mode '{m}'".format(m=score_mode))
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError(
            "Multi-FASTA BarcodeSet input is not supported.")
    barcode_fasta = bc.toExternalFiles()[0]
    with SubreadSet(subread_set_file) as ds:
        ds_new = SubreadSet(strict=True)
        for ext_res in ds.externalResources:
            subreads_bam = ext_res.bam
            scraps_bam = ext_res.scraps
            assert subreads_bam is not None
            if scraps_bam is None:
                raise TypeError("The input SubreadSet must include scraps.")
            new_prefix = op.join(
                op.dirname(output_file_name),
                re.sub(".subreads.bam", "_barcoded",
                       op.basename(subreads_bam)))
            if not op.isabs(subreads_bam):
                subreads_bam = op.join(op.dirname(subread_set_file),
                                       subreads_bam)
            if not op.isabs(scraps_bam):
                scraps_bam = op.join(op.dirname(subread_set_file), scraps_bam)
            args = [
                "bam2bam",
                "-j", str(nproc),
                "-b", str(nproc),
                "-o", new_prefix,
                "--barcodes", barcode_fasta,
                "--scoreMode", score_mode,
                subreads_bam, scraps_bam
            ]
            log.info(" ".join(args))
            result = run_cmd(" ".join(args),
                             stdout_fh=sys.stdout,
                             stderr_fh=sys.stderr)
            if result.exit_code != 0:
                return result.exit_code
            subreads_bam = new_prefix + ".subreads.bam"
            scraps_bam = new_prefix + ".scraps.bam"
            assert op.isfile(subreads_bam), "Missing {f}".format(
                f=subreads_bam)
            add_subread_resources(ds_new,
                                  subreads=subreads_bam,
                                  scraps=scraps_bam,
                                  barcodes=barcode_set_file)
        ds._filters.clearCallbacks()
        ds_new._filters = ds._filters
        ds_new._populateMetaTypes()
        ds_new.metadata = ds.metadata
        ds_new.name = ds.name + " (barcoded)"
        ds_new.updateCounts()
        ds_new.newUuid()
        ds_new.write(output_file_name)
    return 0
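# A hypothetical invocation of run_bam_to_bam above; all three paths are
# assumptions for illustration. On success the barcoded SubreadSet is written
# to the output path and 0 is returned (a non-zero bam2bam exit code is
# propagated instead).
rc = run_bam_to_bam("movie.subreadset.xml",
                    "barcodes.barcodeset.xml",
                    "movie_barcoded.subreadset.xml",
                    nproc=8, score_mode="symmetric")
assert rc == 0, "bam2bam failed with exit code {c}".format(c=rc)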
def setUpClass(cls):
    tmp_bam = tempfile.NamedTemporaryFile(suffix=".subreads.bam").name
    shutil.copyfile(pbcore.data.getUnalignedBam(), tmp_bam)
    shutil.copyfile(pbcore.data.getUnalignedBam() + ".pbi", tmp_bam + ".pbi")
    ds = SubreadSet(tmp_bam, pbcore.data.getUnalignedBam(), strict=True)
    ds.write(cls.INPUT_FILES[0])
    _write_fasta_or_contigset(cls.INPUT_FILES[1], make_faidx=True,
                              ds_class=BarcodeSet)
    super(TestScatterSubreadBAMs, cls).setUpClass()
def _make_dataset(file_name=None, barcodes=None):
    if file_name is None:
        file_name = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    ds = SubreadSet(BAM_FILE, strict=True)
    if barcodes is not None:
        for er in ds.externalResources:
            er.barcodes = barcodes
    ds.write(file_name)
    return file_name
def _set_up_basic(self):
    input_file = get_temp_file(suffix=".subreadset.xml")
    ds = SubreadSet(data.getXml(9), strict=True)
    ds.metadata.addParentDataSet(uuid.uuid4(), ds.datasetType,
                                 createdBy="AnalysisJob", timeStampedName="")
    ds.write(input_file)
    return input_file, len(ds)
def test_provenance_record_ordering(self):
    import pbtestdata
    ds = SubreadSet(pbtestdata.get_file("subreads-sequel"), strict=True)
    ds.metadata.addParentDataSet(uuid.uuid4(), ds.datasetType,
                                 createdBy="AnalysisJob", timeStampedName="")
    tmp_out = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    ds.write(tmp_out)
    ds = SubreadSet(tmp_out, strict=True)
    tags = [r['tag'] for r in ds.metadata.record['children']]
    self.assertEqual(tags, ['TotalLength', 'NumRecords', 'Provenance',
                            'Collections', 'SummaryStats'])
def test_get_dataset_uuid(self):
    ds = SubreadSet(upstreamdata.getUnalignedBam(), strict=True)
    ds_file = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    ds.write(ds_file)
    uuid = getDataSetUuid(ds_file)
    self.assertEqual(uuid, ds.uuid)
    with open(ds_file, "w") as out:
        out.write("hello world!")
    uuid = getDataSetUuid(ds_file)
    self.assertEqual(uuid, None)
def test_get_dataset_uuid(self):
    ds = SubreadSet(upstreamdata.getUnalignedBam(), strict=True)
    ds_file = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    ds.write(ds_file)
    uuid = getDataSetUuid(ds_file)
    assert uuid == ds.uuid
    with open(ds_file, "w") as out:
        out.write("hello world!")
    uuid = getDataSetUuid(ds_file)
    assert uuid is None
def test_de_novo(self):
    ofn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    log.info(ofn)
    ss = SubreadSet(data.getXml(10))
    col = CollectionMetadata()
    self.assertFalse(ss.metadata.collections)
    ss.metadata.collections.append(col)
    self.assertTrue(ss.metadata.collections)
    # assertTrue(x, y) treats y as a failure message, so it never compared
    # anything; use assertEqual (attribute values round-trip as strings):
    col.cellIndex = 1
    self.assertEqual(ss.metadata.collections[0].cellIndex, '1')
    col.instrumentName = "foo"
    self.assertEqual(ss.metadata.collections[0].instrumentName, "foo")
    col.context = 'bar'
    self.assertEqual(ss.metadata.collections[0].context, "bar")
    ss.metadata.collections[0].runDetails.name = 'foo'
    self.assertEqual('foo', ss.metadata.collections[0].runDetails.name)
    ss.metadata.collections[0].wellSample.name = 'bar'
    self.assertEqual('bar', ss.metadata.collections[0].wellSample.name)
    ss.metadata.collections[0].wellSample.wellName = 'baz'
    self.assertEqual('baz', ss.metadata.collections[0].wellSample.wellName)
    ss.metadata.collections[0].wellSample.concentration = 'baz'
    self.assertEqual('baz',
                     ss.metadata.collections[0].wellSample.concentration)
    # There are no existing biosamples:
    self.assertFalse(
        'BioSamples' in ss.metadata.collections[0].wellSample.tags)
    # Therefore the metadata is falsy
    self.assertFalse(ss.metadata.collections[0].wellSample.bioSamples)
    ss.metadata.collections[0].wellSample.bioSamples.addSample('Clown')
    self.assertEqual(
        'Clown', ss.metadata.collections[0].wellSample.bioSamples[0].name)
    ss.metadata.collections[0].wellSample.bioSamples[
        0].DNABarcodes.addBarcode('Dentist')
    self.assertEqual(
        'Dentist',
        ss.metadata.collections[0].wellSample.bioSamples[0].DNABarcodes[0].name)
    # check that we are adding one additional biosamples element:
    self.assertEqual(
        Counter(ss.metadata.collections[0].wellSample.tags)['BioSamples'], 1)
    # Therefore the metadata is truthy
    self.assertTrue(ss.metadata.collections[0].wellSample.bioSamples)
    ss.write(ofn, validate=False)
def test_subreads_parent_dataset(self):
    ds1 = SubreadSet(data.getXml(no=5), skipMissing=True)
    assert ds1.metadata.provenance.parentDataSet.uniqueId == \
        "f81cf391-b3da-41f8-84cb-a0de71f460f4"
    ds2 = SubreadSet(ds1.externalResources[0].bam, skipMissing=True)
    assert ds2.metadata.provenance.parentDataSet.uniqueId is None
    ds2.metadata.addParentDataSet("f81cf391-b3da-41f8-84cb-a0de71f460f4",
                                  "PacBio.DataSet.SubreadSet",
                                  "timestamped_name")
    assert ds2.metadata.provenance.parentDataSet.uniqueId == \
        "f81cf391-b3da-41f8-84cb-a0de71f460f4"
    ds_out = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    ds2.write(ds_out, validate=False)
def test_filter_dataset_bq(self):
    ds_in = get_temp_file(suffix=".subreadset.xml")
    ds = SubreadSet(pbtestdata.get_file("barcoded-subreadset"), strict=True)
    ds.filters.addRequirement(bq=[('>=', 31)])
    assert len(ds) == 1
    ds.write(ds_in)
    ds_out = get_temp_file(suffix=".subreadset.xml")
    args = self.BASE_ARGS + [ds_in, ds_out, "length >= 10 AND bq >= 10"]
    self._check_call(args)
    n_expected = 2
    expected_filter_str = "( bq >= 10 AND length >= 10 )"
    self.run_after(ds_out, n_expected, expected_filter_str)
def test_subreads_parent_dataset(self):
    ds1 = SubreadSet(data.getXml(no=5), skipMissing=True)
    self.assertEqual(ds1.metadata.provenance.parentDataSet.uniqueId,
                     "f81cf391-b3da-41f8-84cb-a0de71f460f4")
    ds2 = SubreadSet(ds1.externalResources[0].bam, skipMissing=True)
    self.assertEqual(ds2.metadata.provenance.parentDataSet.uniqueId, None)
    ds2.metadata.addParentDataSet("f81cf391-b3da-41f8-84cb-a0de71f460f4",
                                  "PacBio.DataSet.SubreadSet",
                                  "timestamped_name")
    self.assertEqual(ds2.metadata.provenance.parentDataSet.uniqueId,
                     "f81cf391-b3da-41f8-84cb-a0de71f460f4")
    ds_out = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    ds2.write(ds_out, validate=False)
def test_provenance_record_ordering(self):
    ds = SubreadSet(pbtestdata.get_file("subreads-sequel"), strict=True)
    ds.metadata.addParentDataSet(uuid.uuid4(), ds.datasetType,
                                 createdBy="AnalysisJob", timeStampedName="")
    tmp_out = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    ds.write(tmp_out)
    ds = SubreadSet(tmp_out, strict=True)
    tags = [r['tag'] for r in ds.metadata.record['children']]
    self.assertEqual(tags, ['TotalLength', 'NumRecords', 'Provenance',
                            'Collections', 'SummaryStats'])
def test_de_novo(self):
    ofn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    log.info(ofn)
    ss = SubreadSet(data.getXml(10))
    col = CollectionMetadata()
    self.assertFalse(ss.metadata.collections)
    ss.metadata.collections.append(col)
    self.assertTrue(ss.metadata.collections)
    # assertTrue(x, y) treats y as a failure message, so it never compared
    # anything; use assertEqual (attribute values round-trip as strings):
    col.cellIndex = 1
    self.assertEqual(ss.metadata.collections[0].cellIndex, '1')
    col.instrumentName = "foo"
    self.assertEqual(ss.metadata.collections[0].instrumentName, "foo")
    col.context = 'bar'
    self.assertEqual(ss.metadata.collections[0].context, "bar")
    ss.metadata.collections[0].runDetails.name = 'foo'
    self.assertEqual('foo', ss.metadata.collections[0].runDetails.name)
    ss.metadata.collections[0].wellSample.name = 'bar'
    self.assertEqual('bar', ss.metadata.collections[0].wellSample.name)
    ss.metadata.collections[0].wellSample.wellName = 'baz'
    self.assertEqual('baz', ss.metadata.collections[0].wellSample.wellName)
    ss.metadata.collections[0].wellSample.concentration = 'baz'
    self.assertEqual('baz',
                     ss.metadata.collections[0].wellSample.concentration)
    # There are no existing biosamples:
    self.assertFalse('BioSamples' in ss.metadata.tags)
    # Therefore the metadata is falsy
    self.assertFalse(ss.metadata.bioSamples)
    ss.metadata.bioSamples.addSample('Clown')
    self.assertEqual('Clown', ss.metadata.bioSamples[0].name)
    ss.metadata.bioSamples[0].DNABarcodes.addBarcode('Dentist')
    self.assertEqual('Dentist', ss.metadata.bioSamples[0].DNABarcodes[0].name)
    # check that we are adding one additional biosamples element:
    self.assertEqual(Counter(ss.metadata.tags)['BioSamples'], 1)
    # Therefore the metadata is truthy
    self.assertTrue(ss.metadata.bioSamples)
    ss.write(ofn, validate=False)
def to_zmw_chunked_subreadset_files(subreadset_path, max_total_nchunks,
                                    chunk_key, dir_name, base_name, ext):
    """Identical to to_chunked_subreadset_files, but chunks subreads by
    ZMW ranges for input to pbccs."""
    dset = SubreadSet(subreadset_path, strict=True)
    dset_chunks = dset.split(chunks=max_total_nchunks, zmws=True)
    d = {}
    for i, dset in enumerate(dset_chunks):
        chunk_id = '_'.join([base_name, str(i)])
        chunk_name = '.'.join([chunk_id, ext])
        chunk_path = os.path.join(dir_name, chunk_name)
        dset.write(chunk_path)
        d[chunk_key] = os.path.abspath(chunk_path)
        c = PipelineChunk(chunk_id, **d)
        yield c
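# A hypothetical driver for the chunk generator above; the paths and the
# "$chunk.subreadset_id" chunk key are assumptions for illustration. Each
# yielded PipelineChunk carries the path of one ZMW-chunked SubreadSet.
chunks = to_zmw_chunked_subreadset_files(
    "movie.subreadset.xml", max_total_nchunks=24,
    chunk_key="$chunk.subreadset_id", dir_name="chunks",
    base_name="chunk_subreadset", ext="subreadset.xml")
for chunk in chunks:
    log.info("wrote chunk %s", chunk.chunk_id)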
def test_de_novo(self):
    ofn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    log.info(ofn)
    ss = SubreadSet(data.getXml(9))
    col = CollectionMetadata()
    assert not ss.metadata.collections
    ss.metadata.collections.append(col)
    assert ss.metadata.collections
    col.cellIndex = 1
    assert ss.metadata.collections[0].cellIndex == '1'
    col.instrumentName = "foo"
    assert ss.metadata.collections[0].instrumentName == "foo"
    col.context = 'bar'
    assert ss.metadata.collections[0].context == "bar"
    ss.metadata.collections[0].runDetails.name = 'foo'
    assert 'foo' == ss.metadata.collections[0].runDetails.name
    ss.metadata.collections[0].wellSample.name = 'bar'
    assert 'bar' == ss.metadata.collections[0].wellSample.name
    ss.metadata.collections[0].wellSample.wellName = 'baz'
    assert 'baz' == ss.metadata.collections[0].wellSample.wellName
    ss.metadata.collections[0].wellSample.concentration = 'baz'
    assert 'baz' == ss.metadata.collections[0].wellSample.concentration
    # There are no existing biosamples:
    assert 'BioSamples' not in ss.metadata.tags
    # Therefore the metadata is falsy
    assert not ss.metadata.bioSamples
    ss.metadata.bioSamples.addSample('Clown')
    assert 'Clown' == ss.metadata.bioSamples[0].name
    ss.metadata.bioSamples[0].DNABarcodes.addBarcode('Dentist')
    assert 'Dentist' == ss.metadata.bioSamples[0].DNABarcodes[0].name
    # check that we are adding one additional biosamples element:
    assert Counter(ss.metadata.tags)['BioSamples'] == 1
    # Therefore the metadata is truthy
    assert ss.metadata.bioSamples
    ss.write(ofn, validate=False)
def test_mock_update_barcoded_sample_metadata(self):
    tmp_dir = tempfile.mkdtemp()
    datastore_tmp = op.join(tmp_dir, "lima.datastore.json")
    barcodeset = pbtestdata.get_file("barcodeset")
    barcodes = ["lbc1--lbc1", "lbc3--lbc3"]
    files = [
        op.join(tmp_dir, "lima.lbc1--lbc1.subreadset.xml"),
        op.join(tmp_dir, "lima.lbc3--lbc3.subreadset.xml")
    ]
    uuids = [uuid.uuid4() for fn in files]
    # XXX these are hardcoded to match the actual barcoded test input
    bc_uuids = [
        "dffb30e8-9243-4743-9980-468a20952167",
        "eef1a8ea-c6a7-4233-982a-d426e1e7d8c9"
    ]
    ds = SubreadSet(pbtestdata.get_file("subreads-sequel"))

    def _add_barcoded_sample(sn, bn, id_):
        ds.metadata.collections[0].wellSample.bioSamples.addSample(sn)
        ds.metadata.collections[0].wellSample.bioSamples[
            -1].DNABarcodes.addBarcode(bn)
        ds.metadata.collections[0].wellSample.bioSamples[-1].DNABarcodes[
            -1].uniqueId = id_

    _add_barcoded_sample("Alice", "lbc1--lbc1", bc_uuids[0])
    _add_barcoded_sample("Charles", "lbc3--lbc3", bc_uuids[1])
    tmp_ds = op.join(tmp_dir, "input.subreadset.xml")
    ds.write(tmp_ds)
    for fn, bc, dsid in zip(files, barcodes, uuids):
        ds = SubreadSet(tmp_ds)
        ds.uuid = str(dsid)
        ds.name = ds.name + " ({b})".format(b=bc)
        ds.write(fn)
    ds_files = [
        DataStoreFile(dsid, "barcoding.tasks.lima-0",
                      FileTypes.DS_SUBREADS.file_type_id, fn)
        for (dsid, fn) in zip(uuids, files)
    ]
    ds = DataStore(ds_files)
    ds.write_json(datastore_tmp)
    base_dir = tempfile.mkdtemp()
    datastore = mock_update_barcoded_sample_metadata(
        base_dir, datastore_tmp, tmp_ds, barcodeset)
    validate_barcoded_datastore_files(self, tmp_ds, datastore,
                                      number_of_expected_filters=0)
def test_merge_biosamples(self):
    import pbtestdata
    ds1 = pbtestdata.get_file("subreads-biosample-1")
    ds2 = pbtestdata.get_file("subreads-biosample-2")
    # Case 1: two biosamples
    ds = SubreadSet(ds1, ds2)
    samples = [bs.name for bs in ds.metadata.bioSamples]
    assert samples == ["Alice", "Bob"]
    # Case 2: same biosample in both files
    ds = SubreadSet(ds1, ds1)
    samples = [bs.name for bs in ds.metadata.bioSamples]
    assert samples == ["Alice"]
    assert len(ds.metadata.bioSamples[0].DNABarcodes) == 1
    # Case 3: same biosample, different barcodes
    dsTmp = SubreadSet(ds1)
    dsTmp.metadata.bioSamples[0].DNABarcodes[0].name = "F7--R7"
    tmpFile = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    dsTmp.write(tmpFile)
    ds = SubreadSet(ds1, tmpFile)
    samples = [bs.name for bs in ds.metadata.bioSamples]
    assert samples == ["Alice"]
    bcs = [bc.name for bc in ds.metadata.bioSamples[0].DNABarcodes]
    assert bcs == ["F1--R1", "F7--R7"]
def test_merge_biosamples(self):
    import pbtestdata
    ds1 = pbtestdata.get_file("subreads-biosample-1")
    ds2 = pbtestdata.get_file("subreads-biosample-2")
    # Case 1: two biosamples
    ds = SubreadSet(ds1, ds2)
    samples = [bs.name for bs in ds.metadata.bioSamples]
    self.assertEqual(samples, ["Alice", "Bob"])
    # Case 2: same biosample in both files
    ds = SubreadSet(ds1, ds1)
    samples = [bs.name for bs in ds.metadata.bioSamples]
    self.assertEqual(samples, ["Alice"])
    self.assertEqual(len(ds.metadata.bioSamples[0].DNABarcodes), 1)
    # Case 3: same biosample, different barcodes
    dsTmp = SubreadSet(ds1)
    dsTmp.metadata.bioSamples[0].DNABarcodes[0].name = "F7--R7"
    tmpFile = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    dsTmp.write(tmpFile)
    ds = SubreadSet(ds1, tmpFile)
    samples = [bs.name for bs in ds.metadata.bioSamples]
    self.assertEqual(samples, ["Alice"])
    bcs = [bc.name for bc in ds.metadata.bioSamples[0].DNABarcodes]
    self.assertEqual(bcs, ["F1--R1", "F7--R7"])
def test_loadMetadata(self):
    aln = AlignmentSet(data.getXml(no=8))
    self.assertFalse(aln.metadata.collections)
    aln.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                     'SA3-Sequel/lambda/roche_SAT/'
                     'm54013_151205_032353.run.metadata.xml')
    self.assertTrue(aln.metadata.collections)
    sset_fn = ('/pbi/dept/secondary/siv/testdata/'
               'SA3-Sequel/lambda/roche_SAT/'
               'm54013_151205_032353.subreadset.xml')
    sset = SubreadSet(sset_fn)
    orig_metadata = copy.deepcopy(sset.metadata)
    sset.metadata.collections = None
    self.assertFalse(sset.metadata.collections)
    sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                      'SA3-Sequel/lambda/roche_SAT/'
                      'm54013_151205_032353.run.metadata.xml')
    fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    sset.write(fn)
    validateFile(fn)
    validateFile(sset_fn)
    self.assertEqual(sset.metadata, orig_metadata)

    # load the wrong thing...
    sset_fn = ('/pbi/dept/secondary/siv/testdata/'
               'SA3-Sequel/lambda/roche_SAT/'
               'm54013_151205_032353.subreadset.xml')
    sset = SubreadSet(sset_fn)
    orig_metadata = copy.deepcopy(sset.metadata)
    sset.metadata.collections = None
    self.assertFalse(sset.metadata.collections)
    with self.assertRaises(InvalidDataSetIOError):
        sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                          'SA3-Sequel/lambda/roche_SAT/'
                          'm54013_151205_032353.sts.xml')
def setUpClass(cls):
    ds = SubreadSet(pbcore.data.getUnalignedBam(), strict=True)
    ds.write(cls.INPUT_FILES[0])
    super(TestBam2Fasta, cls).setUpClass()
def test_get_dataset_metatype(self):
    ds = SubreadSet(upstreamdata.getUnalignedBam(), strict=True)
    ds_file = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    ds.write(ds_file)
    meta_type = getDataSetMetaType(ds_file)
    self.assertEqual(meta_type, "PacBio.DataSet.SubreadSet")
def setUpClass(cls):
    ds = SubreadSet(pbcore.data.getUnalignedBam(), strict=True)
    ds.filters.addRequirement(length=[('>=', 1000)])
    ds.write(cls.INPUT_FILES[0])
    super(TestBam2FastaFiltered, cls).setUpClass()
class SmrtCell(object):

    '''
    Initializes a new SmrtCell object from a smrtcell xml file
    @param xml_file: the path to a subreadset.xml file of a smrtcell
    '''

    def __init__(self, xml_file):
        self.__logger = logging.getLogger('support.smrtcell')
        self.__is_valid = False
        self.__xml_file = check_file(xml_file)
        if not self.__xml_file:
            # report the raw argument here: check_file() returned a falsy value
            self.show_log('error', 'XML file ' + xml_file +
                          ' does not exist or is not a file!')
            return
        # TODO: read xml content from encrypted file
        self.__subreadset = None
        try:
            self.__subreadset = SubreadSet(self.__xml_file)
        except IOError as err:
            # str(err): exceptions cannot be concatenated to str directly
            self.show_log('error', 'Parsing of XML file ' + self.__xml_file +
                          ' was not successful: ' + str(err) + '!')
            return
        self.__is_valid = True

    '''
    Tests if the SmrtCell object is valid.
    @return: return true if the SmrtCell object is valid otherwise false
    @rtype: bool
    '''

    def is_valid(self):
        return self.__is_valid

    '''
    Returns the name of the SmrtCell object.
    @return: the name
    @rtype: str
    '''

    def get_name(self):
        return self.__subreadset.name if self.__is_valid else None

    '''
    Returns the total number of reads in the SmrtCell object.
    @return: the number of reads
    @rtype: integer
    '''

    def get_total_number_of_reads(self):
        return int(self.__subreadset.metadata.numRecords) \
            if self.__is_valid else None

    '''
    Returns the total number of bp in the SmrtCell object.
    @return: the number of bp
    @rtype: integer
    '''

    def get_total_sum_of_bp(self):
        return int(self.__subreadset.metadata.totalLength) \
            if self.__is_valid else None

    '''
    Returns the number of collections ('sequencing runs') in the SmrtCell
    object. Should be 1 in almost all cases. If not, all other functions
    have an optional argument to specify the collection.
    @return: the number of sequencing runs
    @rtype: integer
    '''

    def get_number_of_collections(self):
        return len(self.__subreadset.metadata.collections) \
            if self.__is_valid else None

    '''
    Checks if a provided collection index is valid, i.e. can access a
    collection.
    @param collection_index: the index of the collection
    @return: true if collection index is valid otherwise false
    @rtype: bool
    '''

    def check_collection_index(self, collection_index):
        return self.__is_valid and collection_index >= 0 and \
            collection_index < len(self.__subreadset.metadata.collections)

    '''
    Returns the names of the samples that were loaded onto this SmrtCell.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: a list with sample names
    @rtype: list of str
    '''

    def get_biosample_names(self, collection_index=0):
        biosample_names = []
        if self.__is_valid:
            assert self.check_collection_index(collection_index), \
                'Specified collection index is invalid!'
            bio_samples = self.__subreadset.metadata.collections[
                collection_index].wellSample.bioSamples
            for i in range(0, len(bio_samples)):
                biosample_names.append(bio_samples[i].name)
        return biosample_names

    '''
    Returns the cell index.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the cell index
    @rtype: integer
    '''

    def get_cell_index(self, collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index), \
                'Specified collection index is invalid!'
            return int(self.__subreadset.metadata.collections[
                collection_index].cellIndex)
        else:
            return None

    '''
    Returns the collection number. Do not confuse with collection index.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the collection number
    @rtype: integer
    '''

    def get_collection_number(self, collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index), \
                'Specified collection index is invalid!'
            return int(self.__subreadset.metadata.collections[
                collection_index].collectionNumber)
        else:
            return None

    '''
    Returns the raw data path.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the raw data path
    @rtype: str
    '''

    def get_raw_data_path(self, collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index), \
                'Specified collection index is invalid!'
            return get_absolute_path(self.__subreadset.metadata.collections[
                collection_index].primary.outputOptions.collectionPathUri)
        else:
            return None

    '''
    Returns the run id.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the run id
    @rtype: str
    '''

    def get_run_id(self, collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index), \
                'Specified collection index is invalid!'
            return self.__subreadset.metadata.collections[
                collection_index].runDetails.timeStampedName
        else:
            return None

    '''
    Returns the run name.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the run name
    @rtype: str
    '''

    def get_run_name(self, collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index), \
                'Specified collection index is invalid!'
            return self.__subreadset.metadata.collections[
                collection_index].runDetails.name
        else:
            return None

    '''
    Returns the cellpac barcode.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the cellpac barcode
    @rtype: str
    '''

    def get_cellpac_barcode(self, collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index), \
                'Specified collection index is invalid!'
            return self.__subreadset.metadata.collections[
                collection_index].cellPac.barcode
        else:
            return None

    '''
    Returns the cellpac lot number.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the cellpac lot number
    @rtype: str
    '''

    def get_cellpac_lot_number(self, collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index), \
                'Specified collection index is invalid!'
            return self.__subreadset.metadata.collections[
                collection_index].cellPac.lotNumber
        else:
            return None

    '''
    Returns the instrument code.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the instrument code
    @rtype: str
    '''

    def get_instrument_code(self, collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index), \
                'Specified collection index is invalid!'
            return self.__subreadset.metadata.collections[
                collection_index].instrumentName
        else:
            return None

    '''
    Returns the sequencing date.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the sequencing date as string
    @rtype: str
    '''

    def get_sequencing_date(self, collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index), \
                'Specified collection index is invalid!'
            val = self.__subreadset.metadata.collections[
                collection_index].createdAt
            # drop the fractional-seconds suffix before parsing
            reduced_date_string = val.split(".")[0]
            return datetime.strptime(
                reduced_date_string, "%Y-%m-%dT%H:%M:%S").strftime('%Y-%m-%d')
        else:
            return None

    '''
    Returns the well name.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the well name
    @rtype: str
    '''

    def get_well_name(self, collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index), \
                'Specified collection index is invalid!'
            return self.__subreadset.metadata.collections[
                collection_index].wellSample.name
        else:
            return None

    '''
    Returns the concentration.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the concentration
    @rtype: float
    '''

    def get_concentration(self, collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index), \
                'Specified collection index is invalid!'
            return float(self.__subreadset.metadata.collections[
                collection_index].wellSample.concentration)
        else:
            return None

    '''
    Returns the UseCount property.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the UseCount property
    @rtype: str
    '''

    def get_usecount(self, collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index), \
                'Specified collection index is invalid!'
            return self.__subreadset.metadata.collections[
                collection_index].wellSample.useCount
        else:
            return None

    '''
    Returns the instrument control software version.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the instrument control software version
    @rtype: str
    '''

    def get_instrument_control_software_version(self, collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index), \
                'Specified collection index is invalid!'
            return self.__subreadset.metadata.collections[
                collection_index].instCtrlVer
        else:
            return None

    '''
    Returns the signal processing software version.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the signal processing software version
    @rtype: str
    '''

    def get_signal_processing_software_version(self, collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index), \
                'Specified collection index is invalid!'
            return self.__subreadset.metadata.collections[
                collection_index].sigProcVer
        else:
            return None

    '''
    Returns the notes (i.e. additional free text description).
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the notes
    @rtype: str
    '''

    def get_notes(self, collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index), \
                'Specified collection index is invalid!'
            return self.__subreadset.metadata.collections[
                collection_index].wellSample.description
        else:
            return None

    '''
    Returns the automation parameters.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: a hash with key - value pairs
    @rtype: dict of str
    '''

    def get_automation_parameters(self, collection_index=0):
        automation_parameters = {}
        if self.__is_valid:
            assert self.check_collection_index(collection_index), \
                'Specified collection index is invalid!'
            params = self.__subreadset.metadata.collections[
                collection_index].automation.automationParameters
            for i in range(0, len(params)):
                hashed_data = params[i].metadata
                automation_parameters[hashed_data['Name']] = \
                    hashed_data['SimpleValue']
        return automation_parameters

    '''
    Returns the movie length in minutes.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the movie length in minutes
    @rtype: integer
    '''

    def get_movie_length(self, collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index), \
                'Specified collection index is invalid!'
            automation_parameters = self.get_automation_parameters(
                collection_index)
            return int(automation_parameters['MovieLength'])
        else:
            return None

    '''
    Returns the immobilization time in minutes.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the immobilization time in minutes
    @rtype: integer
    '''

    def get_immobilisation_time(self, collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index), \
                'Specified collection index is invalid!'
            automation_parameters = self.get_automation_parameters(
                collection_index)
            return int(automation_parameters['ImmobilizationTime'])
        else:
            return None

    '''
    Returns the insert size.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the insert size
    @rtype: integer
    '''

    def get_insert_size(self, collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index), \
                'Specified collection index is invalid!'
            automation_parameters = self.get_automation_parameters(
                collection_index)
            return int(automation_parameters['InsertSize'])
        else:
            return None

    '''
    Returns true if hot start was enabled otherwise false.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: true if hot start was enabled false otherwise
    @rtype: bool
    '''

    def get_stage_hotstart_enabled(self, collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index), \
                'Specified collection index is invalid!'
            return bool(self.__subreadset.metadata.collections[
                collection_index].wellSample.stageHotstartEnabled)
        else:
            return None

    '''
    Returns the primary protocol name
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the primary protocol name
    @rtype: str
    '''

    def get_primary_protocol_name(self, collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index), \
                'Specified collection index is invalid!'
            return self.__subreadset.metadata.collections[
                collection_index].primary.automationName
        else:
            return None

    '''
    Returns the primary protocol config
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the primary protocol config
    @rtype: str
    '''

    def get_primary_protocol_config(self, collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index), \
                'Specified collection index is invalid!'
            return self.__subreadset.metadata.collections[
                collection_index].primary.configFileName
        else:
            return None

    '''
    Returns the adapter sequences used in template prep
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: a dictionary with left adapter sequence and right adapter sequence
    @rtype: dict of str
    '''

    def get_adapter_sequences(self, collection_index=0):
        adapter_sequences = {}
        if self.__is_valid:
            assert self.check_collection_index(collection_index), \
                'Specified collection index is invalid!'
            adapter_sequences['LeftAdapterSequence'] = \
                self.__subreadset.metadata.collections[
                    collection_index].templatePrepKit.leftAdaptorSequence
            adapter_sequences['RightAdapterSequence'] = \
                self.__subreadset.metadata.collections[
                    collection_index].templatePrepKit.rightAdaptorSequence
        return adapter_sequences

    '''
    Returns the sample name. Deprecated, use get_biosample_names instead.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the sample name
    @rtype: str
    '''

    def get_sample_name(self, collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index), \
                'Specified collection index is invalid!'
            return self.__subreadset.metadata.collections[
                collection_index].wellSample.name
        else:
            return None

    '''
    Returns the name of the Sequel binding kit.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the binding kit name
    @rtype: str
    '''

    def get_binding_kit_name(self, collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index), \
                'Specified collection index is invalid!'
            name = self.__subreadset.metadata.collections[
                collection_index].bindingKit.name
            return normalize('NFKD', name).encode('ascii', 'ignore')
        else:
            return None

    '''
    Returns the name of the Sequel template prep kit.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the template prep kit name
    @rtype: str
    '''

    def get_template_prep_kit_name(self, collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index), \
                'Specified collection index is invalid!'
            name = self.__subreadset.metadata.collections[
                collection_index].templatePrepKit.name
            return normalize('NFKD', name).encode('ascii', 'ignore')
        else:
            return None

    '''
    Returns the name of the Sequel sequencing plate kit.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the sequencing plate kit name
    @rtype: str
    '''

    def get_sequencing_plate_kit_name(self, collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index), \
                'Specified collection index is invalid!'
            name = self.__subreadset.metadata.collections[
                collection_index].sequencingKitPlate.name
            return normalize('NFKD', name).encode('ascii', 'ignore')
        else:
            return None

    '''
    Make all paths encoded in the SmrtCell object relative.
    @param outdir: a directory from which the paths should originate (optional, default: ".")
    '''

    def make_paths_relative(self, outdir="."):
        if self.__is_valid:
            self.__subreadset.makePathsRelative(outdir)

    '''
    Make all paths encoded in the SmrtCell object absolute.
    '''

    def make_paths_absolute(self):
        if self.__is_valid:
            self.__subreadset.makePathsAbsolute()

    '''
    Write new (subreadset) xml file for SmrtCell object.
    @param filename: a file name
    '''

    def write(self, filename):
        if self.__is_valid:
            self.__subreadset.write(filename)

    '''
    Helper function for log messages
    @param level: the log level (debug, info, warning, error, critical)
    @param message: the log message
    '''

    def show_log(self, level, message):
        if level == 'debug':
            self.__logger.debug(message)
        elif level == 'info':
            self.__logger.info(message)
        elif level == 'warning':
            self.__logger.warning(message)
        elif level == 'error':
            self.__logger.error(message)
        elif level == 'critical':
            self.__logger.critical(message)
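# A hypothetical usage sketch for the SmrtCell wrapper above; the input path
# is an assumption for illustration.
cell = SmrtCell('m54013_151205_032353.subreadset.xml')
if cell.is_valid():
    print(cell.get_name())
    print(cell.get_total_number_of_reads(), 'reads,',
          cell.get_total_sum_of_bp(), 'bp')
    for sample in cell.get_biosample_names():
        print('biosample:', sample)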
def setUpClass(cls):
    ds = SubreadSet(data.getXml(10), strict=True)
    ds.write(cls.INPUT_FILES[0])
def setUpClass(cls):
    ds = SubreadSet(cls.SRC_FILE, strict=True)
    ds.filters.addRequirement(length=[('>=', 1000)])
    ds.write(cls.INPUT_FILES[0])
    super(TestBam2FastqFiltered, cls).setUpClass()
def setUpClass(cls):
    ds = SubreadSet(cls.SRC_FILE, strict=True)
    ds.write(cls.INPUT_FILES[0])
    super(TestBam2FastaArchive, cls).setUpClass()
def setUpClass(cls):
    ds = SubreadSet(pbcore.data.getUnalignedBam(), strict=True)
    ds.write(cls.INPUT_FILES[0])
def test_get_dataset_metatype(self):
    ds = SubreadSet(upstreamdata.getUnalignedBam(), strict=True)
    ds_file = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    ds.write(ds_file)
    meta_type = getDataSetMetaType(ds_file)
    assert meta_type == "PacBio.DataSet.SubreadSet"