def run_bax_to_bam(input_file_name, output_file_name):
    """
    Convert the contents of an HdfSubreadSet via ``_run_bax_to_bam``.

    If the resources of the input dataset report more than one movie name,
    every external file is converted on its own and the per-file outputs are
    combined into a single SubreadSet written to ``output_file_name``.
    Otherwise the input dataset is handed to ``_run_bax_to_bam`` as a whole.

    :param input_file_name: path of the HdfSubreadSet to convert
    :param output_file_name: path of the dataset to write
    :returns: 0 on success, otherwise the non-zero return code of the first
        failing ``_run_bax_to_bam`` call
    """
    with HdfSubreadSet(input_file_name) as hdf_ds:
        movie_names = {reader.movieName
                       for reader in hdf_ds.resourceReaders()}

        # Zero or one movie: no splitting needed, convert in one go.
        if len(movie_names) <= 1:
            return _run_bax_to_bam(input_file_name, output_file_name)

        target_dir = os.path.dirname(output_file_name)
        converted = []
        for bax_path in hdf_ds.toExternalFiles():
            # Drop the last two dot-separated components of the file name
            # before appending the per-file dataset suffix.
            stem_parts = os.path.basename(bax_path).split(".")[:-2]
            per_file_xml = os.path.join(
                target_dir, ".".join(stem_parts) + ".hdfsubreadset.xml")
            exit_code = _run_bax_to_bam(bax_path, per_file_xml)
            if exit_code != 0:
                log.error("bax2bam failed")
                return exit_code
            converted.append(per_file_xml)

        # Combine the per-file outputs and carry over the input's identity.
        merged = SubreadSet(*converted)
        merged.name = hdf_ds.name
        if 'Description' in hdf_ds.objMetadata:
            merged.objMetadata['Description'] = \
                hdf_ds.objMetadata['Description']
        merged.metadata.merge(hdf_ds.metadata)
        merged.write(output_file_name)
    return 0
def main(parser):
    """
    Add name-based filters to a SubreadSet and write the result.

    Names are produced by ``nameGen`` from ``args.inFile`` (treated as a
    'list' when ``args.list`` is set, otherwise as 'fasta').  With
    ``args.subreads`` the filter is applied on ``QNAME`` using the names
    themselves; otherwise it is applied on ``zm`` using the unique values
    ``getZmw`` yields for those names, which requires the dataset to have
    exactly one movie id.

    With ``args.inverted`` one '!=' requirement is added per value; without
    it a single requirement holding an '=' entry for every value is added.

    :param parser: object whose ``parse_args()`` returns the options used
        here (inXml, inFile, list, subreads, inverted, newUuid, name, outXml)
    :raises AssertionError: when ``args.subreads`` is not set and the dataset
        does not have exactly one movie id
    """
    options = parser.parse_args()
    read_filter = Filters()
    subreads = SubreadSet(options.inXml)
    read_names = nameGen(options.inFile,
                         fileType='list' if options.list else 'fasta')

    # Decide which field is filtered and which values are compared against.
    if options.subreads:
        field = 'QNAME'
        values = read_names
    else:
        assert len(
            subreads.movieIds
        ) == 1, 'This method only works for single-movie subreadsets. use --subreads option for multi-movie subreadsets'
        field = 'zm'
        values = {getZmw(read_name) for read_name in read_names}

    if options.inverted:
        # One requirement per excluded value.
        for value in values:
            read_filter.addRequirement(**{field: [('!=', value)]})
    else:
        # A single requirement listing every accepted value.
        read_filter.addRequirement(
            **{field: [('=', value) for value in values]})

    subreads.addFilters(read_filter)
    if options.newUuid:
        subreads.newUuid()
    if options.name:
        subreads.name = options.name
    subreads.write(options.outXml)
def run_bax_to_bam(input_file_name, output_file_name):
    """
    Run ``_run_bax_to_bam`` for an HdfSubreadSet, splitting multi-movie input.

    :param input_file_name: path of the input HdfSubreadSet
    :param output_file_name: path of the output dataset; per-file
        intermediate datasets are written to the same directory
    :returns: 0 when everything succeeded; the return code of
        ``_run_bax_to_bam`` for single-movie input; or the first non-zero
        return code seen while converting the individual files
    """
    with HdfSubreadSet(input_file_name) as source_set:
        distinct_movies = set(
            res.movieName for res in source_set.resourceReaders())
        if len(distinct_movies) > 1:
            # More than one movie: convert each external file separately.
            destination = os.path.dirname(output_file_name)
            partial_outputs = []
            for external_file in source_set.toExternalFiles():
                file_name = os.path.basename(external_file)
                # Keep everything except the last two dot-separated pieces.
                stem = ".".join(file_name.split(".")[:-2])
                partial_xml = os.path.join(destination,
                                           stem + ".hdfsubreadset.xml")
                status = _run_bax_to_bam(external_file, partial_xml)
                if status != 0:
                    log.error("bax2bam failed")
                    return status
                partial_outputs.append(partial_xml)
            # Build one dataset from all partial outputs and copy the name,
            # the optional description and the metadata from the input.
            combined = SubreadSet(*partial_outputs)
            combined.name = source_set.name
            input_meta = source_set.objMetadata
            if 'Description' in input_meta:
                combined.objMetadata['Description'] = input_meta['Description']
            combined.metadata.merge(source_set.metadata)
            combined.write(output_file_name)
        else:
            return _run_bax_to_bam(input_file_name, output_file_name)
    return 0
def _generateSubreadSet(output_bam_file):
    """
    Create a SubreadSet for a BAM file and write it next to that file.

    The XML path is the BAM path with its last 12 characters replaced by
    ``subreadset.xml`` (12 is the length of "subreads.bam", presumably the
    expected ending of ``output_bam_file``).  The dataset name is everything
    in the XML path before its first '.'.

    :param output_bam_file: path of the BAM file to wrap
    :returns: path of the SubreadSet XML that was written
    """
    dataset = SubreadSet(output_bam_file, generateIndices=True)
    prefix = output_bam_file[:-12]
    xml_path = prefix + 'subreadset.xml'
    # partition() yields the text before the first '.', like split('.')[0].
    dataset.name = xml_path.partition('.')[0]
    dataset.write(xml_path)
    return xml_path
def run_bam_to_bam(subread_set_file,
                   barcode_set_file,
                   output_file_name,
                   nproc=1,
                   score_mode="symmetric"):
    """
    Run ``bam2bam`` on every external resource of a SubreadSet and write a
    new SubreadSet that references the resulting barcoded files.

    :param subread_set_file: path of the input SubreadSet; relative BAM
        paths found in it are resolved against this file's directory
    :param barcode_set_file: path of the BarcodeSet; its first external
        file is passed to ``bam2bam`` as ``--barcodes``
    :param output_file_name: path of the SubreadSet to write; the
        ``bam2bam`` output prefix is placed in the same directory
    :param nproc: value passed to both the ``-j`` and ``-b`` options
    :param score_mode: either "asymmetric" or "symmetric"
    :returns: 0 on success, otherwise the non-zero exit code of the first
        failing ``bam2bam`` command
    :raises ValueError: if ``score_mode`` is not a recognized value
    :raises NotImplementedError: if the BarcodeSet has more than one
        resource reader
    :raises TypeError: if an external resource has no scraps file
    :raises AssertionError: if a resource has no BAM file, or the expected
        output subreads BAM is missing after the command ran
    """
    if score_mode not in ("asymmetric", "symmetric"):
        raise ValueError("Unrecognized score mode '{m}'".format(m=score_mode))
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError(
            "Multi-FASTA BarcodeSet input is not supported.")
    barcode_fasta = bc.toExternalFiles()[0]
    # Both directories are the same for every resource; compute them once.
    input_dir = op.dirname(subread_set_file)
    output_dir = op.dirname(output_file_name)
    with SubreadSet(subread_set_file) as ds:
        ds_new = SubreadSet(strict=True)
        for ext_res in ds.externalResources:
            subreads_bam = ext_res.bam
            scraps_bam = ext_res.scraps
            assert subreads_bam is not None
            if scraps_bam is None:
                raise TypeError("The input SubreadSet must include scraps.")
            # The dots are escaped so that only a literal ".subreads.bam"
            # is replaced; an unescaped "." would match any character.
            new_prefix = op.join(
                output_dir,
                re.sub(r"\.subreads\.bam", "_barcoded",
                       op.basename(subreads_bam)))
            if not op.isabs(subreads_bam):
                subreads_bam = op.join(input_dir, subreads_bam)
            if not op.isabs(scraps_bam):
                scraps_bam = op.join(input_dir, scraps_bam)
            args = [
                "bam2bam", "-j", str(nproc), "-b", str(nproc), "-o",
                new_prefix, "--barcodes", barcode_fasta, "--scoreMode",
                score_mode, subreads_bam, scraps_bam
            ]
            # NOTE(review): the command is joined into one string without
            # quoting; paths containing spaces would presumably break if
            # run_cmd goes through a shell -- confirm against run_cmd.
            log.info(" ".join(args))
            result = run_cmd(" ".join(args),
                             stdout_fh=sys.stdout,
                             stderr_fh=sys.stderr)
            if result.exit_code != 0:
                return result.exit_code
            # From here on the names refer to the files bam2bam produced.
            subreads_bam = new_prefix + ".subreads.bam"
            scraps_bam = new_prefix + ".scraps.bam"
            assert op.isfile(subreads_bam), "Missing {f}".format(
                f=subreads_bam)
            add_subread_resources(ds_new,
                                  subreads=subreads_bam,
                                  scraps=scraps_bam,
                                  barcodes=barcode_set_file)
        # Carry filters, metadata and name over from the input dataset.
        ds._filters.clearCallbacks()
        ds_new._filters = ds._filters
        ds_new._populateMetaTypes()
        ds_new.metadata = ds.metadata
        ds_new.name = ds.name + " (barcoded)"
        ds_new.updateCounts()
        ds_new.newUuid()
        ds_new.write(output_file_name)
    return 0
def run_bam_to_bam(subread_set_file, barcode_set_file, output_file_name,
                   nproc=1, score_mode="symmetric"):
    """
    Barcode a SubreadSet by running ``bam2bam`` once per external resource.

    For each external resource the subreads and scraps BAM files are passed
    to ``bam2bam`` together with the first external file of the BarcodeSet.
    The files written under the new prefix are then registered in a fresh
    SubreadSet, which receives the input's filters, metadata and name (with
    " (barcoded)" appended) and is written to ``output_file_name``.

    :param subread_set_file: path of the input SubreadSet
    :param barcode_set_file: path of the BarcodeSet
    :param output_file_name: path of the SubreadSet to write
    :param nproc: value given to the ``-j`` and ``-b`` options
    :param score_mode: "asymmetric" or "symmetric"
    :returns: 0 on success, or the exit code of a failed ``bam2bam`` run
    :raises ValueError: for an unrecognized ``score_mode``
    :raises NotImplementedError: when the BarcodeSet has more than one
        resource reader
    :raises TypeError: when an external resource lacks a scraps file
    :raises AssertionError: when a resource lacks a BAM file or the expected
        output subreads BAM does not exist
    """
    if score_mode not in ["asymmetric", "symmetric"]:
        raise ValueError("Unrecognized score mode '{m}'".format(m=score_mode))
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError("Multi-FASTA BarcodeSet input is not supported.")
    barcode_fasta = bc.toExternalFiles()[0]
    # re.escape keeps the dots literal; unescaped they match any character.
    suffix_pattern = re.escape(".subreads.bam")
    # Loop-invariant directories, resolved once up front.
    input_dir = op.dirname(subread_set_file)
    output_dir = op.dirname(output_file_name)
    with SubreadSet(subread_set_file) as ds:
        ds_new = SubreadSet(strict=True)
        for ext_res in ds.externalResources:
            subreads_bam = ext_res.bam
            scraps_bam = ext_res.scraps
            assert subreads_bam is not None
            if scraps_bam is None:
                raise TypeError("The input SubreadSet must include scraps.")
            barcoded_name = re.sub(suffix_pattern, "_barcoded",
                                   op.basename(subreads_bam))
            new_prefix = op.join(output_dir, barcoded_name)
            # Relative resource paths are relative to the input dataset.
            if not op.isabs(subreads_bam):
                subreads_bam = op.join(input_dir, subreads_bam)
            if not op.isabs(scraps_bam):
                scraps_bam = op.join(input_dir, scraps_bam)
            args = [
                "bam2bam",
                "-j", str(nproc),
                "-b", str(nproc),
                "-o", new_prefix,
                "--barcodes", barcode_fasta,
                "--scoreMode", score_mode,
                subreads_bam, scraps_bam
            ]
            # NOTE(review): arguments are joined without quoting; verify
            # how run_cmd executes the string before using paths with spaces.
            cmd = " ".join(args)
            log.info(cmd)
            result = run_cmd(cmd, stdout_fh=sys.stdout, stderr_fh=sys.stderr)
            if result.exit_code != 0:
                return result.exit_code
            # Switch over to the output files written under the new prefix.
            subreads_bam = new_prefix + ".subreads.bam"
            scraps_bam = new_prefix + ".scraps.bam"
            assert op.isfile(subreads_bam), "Missing {f}".format(f=subreads_bam)
            add_subread_resources(ds_new,
                                  subreads=subreads_bam,
                                  scraps=scraps_bam,
                                  barcodes=barcode_set_file)
        # Copy filters, metadata and name from the input to the new dataset.
        ds._filters.clearCallbacks()
        ds_new._filters = ds._filters
        ds_new._populateMetaTypes()
        ds_new.metadata = ds.metadata
        ds_new.name = ds.name + " (barcoded)"
        ds_new.updateCounts()
        ds_new.newUuid()
        ds_new.write(output_file_name)
    return 0
def test_sanitize_dataset_tags(self):
    """
    Check ``sanitize_dataset_tags`` against a dataset name and tag string.

    Covers the default call, a call with ``remove_hidden=True``, and a tag
    string with a leading empty entry and padded entries.
    """
    dataset = SubreadSet(pbtestdata.get_file("subreads-sequel"))
    original_name = dataset.name
    expected_name = original_name + " (CCS)"

    dataset.name = original_name + " (filtered) (CCS)"
    dataset.tags = "subreads,hidden,testdata,filtered "

    # Default call: " (filtered)" leaves the name, "filtered" leaves the tags.
    sanitize_dataset_tags(dataset)
    assert dataset.name == expected_name
    assert dataset.tags == "hidden,subreads,testdata"

    # With remove_hidden the "hidden" tag goes as well; the name is stable.
    sanitize_dataset_tags(dataset, remove_hidden=True)
    assert dataset.name == expected_name
    assert dataset.tags == "subreads,testdata"

    # Leading empty entry and padded entries in the tag string.
    dataset.tags = ", hidden, ccs"
    sanitize_dataset_tags(dataset, remove_hidden=True)
    assert dataset.tags == "ccs"
def test_mock_update_barcoded_sample_metadata(self):
    """
    Exercise ``mock_update_barcoded_sample_metadata`` on a two-barcode input.

    Builds an input SubreadSet with two barcoded bio samples, writes one
    per-barcode copy of it plus a datastore JSON listing those copies, runs
    the function under test and validates the datastore it returns.
    """
    work_dir = tempfile.mkdtemp()
    datastore_json = op.join(work_dir, "lima.datastore.json")
    barcodeset = pbtestdata.get_file("barcodeset")
    barcode_names = ["lbc1--lbc1", "lbc3--lbc3"]
    barcoded_xmls = [
        op.join(work_dir, "lima.lbc1--lbc1.subreadset.xml"),
        op.join(work_dir, "lima.lbc3--lbc3.subreadset.xml")
    ]
    dataset_uuids = [uuid.uuid4() for _ in barcoded_xmls]
    # XXX these are hardcoded to match the actual barcoded test input
    bc_uuids = [
        "dffb30e8-9243-4743-9980-468a20952167",
        "eef1a8ea-c6a7-4233-982a-d426e1e7d8c9"
    ]
    parent_ds = SubreadSet(pbtestdata.get_file("subreads-sequel"))

    def _bio_samples():
        # Looked up afresh on every use, as the accessors may build new
        # wrapper objects each time.
        return parent_ds.metadata.collections[0].wellSample.bioSamples

    def _add_barcoded_sample(sn, bn, id_):
        _bio_samples().addSample(sn)
        _bio_samples()[-1].DNABarcodes.addBarcode(bn)
        _bio_samples()[-1].DNABarcodes[-1].uniqueId = id_

    sample_names = ["Alice", "Charles"]
    for sample_name, barcode_name, barcode_uuid in zip(
            sample_names, barcode_names, bc_uuids):
        _add_barcoded_sample(sample_name, barcode_name, barcode_uuid)

    input_xml = op.join(work_dir, "input.subreadset.xml")
    parent_ds.write(input_xml)

    # One copy of the input per barcode, each with its own UUID and name.
    for xml_path, barcode_name, ds_uuid in zip(barcoded_xmls, barcode_names,
                                               dataset_uuids):
        child_ds = SubreadSet(input_xml)
        child_ds.uuid = str(ds_uuid)
        child_ds.name = child_ds.name + " ({b})".format(b=barcode_name)
        child_ds.write(xml_path)

    store_files = []
    for ds_uuid, xml_path in zip(dataset_uuids, barcoded_xmls):
        store_files.append(
            DataStoreFile(ds_uuid, "barcoding.tasks.lima-0",
                          FileTypes.DS_SUBREADS.file_type_id, xml_path))
    DataStore(store_files).write_json(datastore_json)

    base_dir = tempfile.mkdtemp()
    datastore = mock_update_barcoded_sample_metadata(
        base_dir, datastore_json, input_xml, barcodeset)
    validate_barcoded_datastore_files(self,
                                      input_xml,
                                      datastore,
                                      number_of_expected_filters=0)