Exemple #1
0
def run_bax_to_bam(input_file_name, output_file_name):
    """
    Convert the resources of an HdfSubreadSet via ``_run_bax_to_bam``.

    When the input's resource readers report at most one movie name, the
    whole dataset is handed to ``_run_bax_to_bam`` in a single call.
    Otherwise each external file is converted separately into the directory
    of ``output_file_name`` and the per-file results are combined into one
    SubreadSet written to ``output_file_name``.

    :param input_file_name: path of the HdfSubreadSet to convert
    :param output_file_name: path of the dataset to write
    :return: 0 on success, or the first non-zero code returned by
        ``_run_bax_to_bam``
    """
    with HdfSubreadSet(input_file_name) as ds_in:
        movie_names = {reader.movieName for reader in ds_in.resourceReaders()}
        if len(movie_names) <= 1:
            return _run_bax_to_bam(input_file_name, output_file_name)

        target_dir = os.path.dirname(output_file_name)
        converted = []
        for bax_path in ds_in.toExternalFiles():
            # Drop the last two dot-separated components of the file name
            # and use what remains as the stem of the per-file output.
            stem = ".".join(os.path.basename(bax_path).split(".")[:-2])
            per_file_xml = os.path.join(target_dir,
                                        stem + ".hdfsubreadset.xml")
            exit_code = _run_bax_to_bam(bax_path, per_file_xml)
            if exit_code != 0:
                log.error("bax2bam failed")
                return exit_code
            converted.append(per_file_xml)

        merged = SubreadSet(*converted)
        merged.name = ds_in.name
        # Metadata is merged only when the input carries a Description.
        if 'Description' in ds_in.objMetadata:
            merged.objMetadata['Description'] = \
                ds_in.objMetadata['Description']
            merged.metadata.merge(ds_in.metadata)
        merged.write(output_file_name)
    return 0
def main(parser):
    """
    Add name-based filters to a SubreadSet and write the result.

    Names are produced by ``nameGen`` from ``args.inFile`` (read as a list
    when ``args.list`` is set, otherwise as FASTA).  With ``args.subreads``
    the filter is placed on ``QNAME``; otherwise the names are mapped through
    ``getZmw`` and the filter is placed on ``zm``, which requires the dataset
    to have exactly one movie id.  ``args.inverted`` switches from a single
    ``=`` requirement covering all values to one ``!=`` requirement per value.

    :param parser: object whose ``parse_args()`` returns the options
        (inXml, inFile, list, subreads, inverted, newUuid, name, outXml)
    """
    args = parser.parse_args()

    filt = Filters()
    dset = SubreadSet(args.inXml)
    if args.list:
        file_type = 'list'
    else:
        file_type = 'fasta'
    names = nameGen(args.inFile, fileType=file_type)

    # Pick the filter field and the values it is compared against.
    if args.subreads:
        field, values = 'QNAME', names
    else:
        assert len(dset.movieIds) == 1, (
            'This method only works for single-movie subreadsets.  '
            'use --subreads option for multi-movie subreadsets')
        field, values = 'zm', set(map(getZmw, names))

    if args.inverted:
        for value in values:
            filt.addRequirement(**{field: [('!=', value)]})
    else:
        filt.addRequirement(**{field: [('=', value) for value in values]})

    dset.addFilters(filt)
    if args.newUuid:
        dset.newUuid()
    if args.name:
        dset.name = args.name
    dset.write(args.outXml)
Exemple #3
0
def run_bax_to_bam(input_file_name, output_file_name):
    """
    Run the bax-to-bam conversion for an HdfSubreadSet.

    A dataset whose resource readers expose more than one movie name is
    converted one external file at a time (outputs go next to
    ``output_file_name``) and the pieces are gathered into a single
    SubreadSet.  Any other dataset is converted with one call to
    ``_run_bax_to_bam``.

    :param input_file_name: HdfSubreadSet path
    :param output_file_name: destination dataset path
    :return: 0 when everything succeeded, otherwise the non-zero return
        code of the failing ``_run_bax_to_bam`` call
    """
    with HdfSubreadSet(input_file_name) as hdf_ds:
        n_movies = len(set(r.movieName for r in hdf_ds.resourceReaders()))
        if n_movies > 1:
            dest_dir = os.path.dirname(output_file_name)
            pieces = []
            for source in hdf_ds.toExternalFiles():
                name_parts = os.path.basename(source).split(".")
                # The last two dot-separated components are discarded.
                piece_name = ".".join(name_parts[:-2]) + ".hdfsubreadset.xml"
                piece_path = os.path.join(dest_dir, piece_name)
                status = _run_bax_to_bam(source, piece_path)
                if status != 0:
                    log.error("bax2bam failed")
                    return status
                pieces.append(piece_path)
            combined = SubreadSet(*pieces)
            combined.name = hdf_ds.name
            if 'Description' in hdf_ds.objMetadata:
                description = hdf_ds.objMetadata['Description']
                combined.objMetadata['Description'] = description
                # The merge happens only together with the Description copy.
                combined.metadata.merge(hdf_ds.metadata)
            combined.write(output_file_name)
        else:
            return _run_bax_to_bam(input_file_name, output_file_name)
    return 0
Exemple #4
0
def _generateSubreadSet(output_bam_file):
    """
    Build a SubreadSet (with generated indices) around a BAM file and write
    it as XML next to the BAM.

    The XML path is the BAM path with its last 12 characters replaced by
    ``subreadset.xml``; the dataset name is the part of that path before the
    first ``.``.

    :param output_bam_file: path of the BAM file
    :return: path of the SubreadSet XML that was written
    """
    dataset = SubreadSet(output_bam_file, generateIndices=True)

    # NOTE(review): 12 == len('subreads.bam'); presumably the input is
    # always named *.subreads.bam -- confirm against callers.
    suffix_length = len('subreads.bam')
    xml_path = output_bam_file[:-suffix_length] + 'subreadset.xml'
    dataset.name = xml_path.partition('.')[0]
    dataset.write(xml_path)
    return xml_path
Exemple #5
0
def run_bam_to_bam(subread_set_file,
                   barcode_set_file,
                   output_file_name,
                   nproc=1,
                   score_mode="symmetric"):
    """
    Run ``bam2bam`` barcoding on every external resource of a SubreadSet and
    write a new SubreadSet that references the barcoded BAM files.

    :param subread_set_file: path of the input SubreadSet; relative BAM
        paths found in it are resolved against this file's directory
    :param barcode_set_file: path of a BarcodeSet with a single resource
    :param output_file_name: path of the SubreadSet to write; the bam2bam
        output prefix is placed in the same directory
    :param nproc: value passed to both the ``-j`` and ``-b`` options
    :param score_mode: ``"symmetric"`` or ``"asymmetric"``
    :return: 0 on success, otherwise the non-zero exit code of bam2bam
    :raises ValueError: if ``score_mode`` is not recognized
    :raises NotImplementedError: if the BarcodeSet has more than one
        resource reader
    :raises TypeError: if an external resource has no scraps BAM
    """
    if score_mode not in ("asymmetric", "symmetric"):
        raise ValueError("Unrecognized score mode '{m}'".format(m=score_mode))
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError(
            "Multi-FASTA BarcodeSet input is not supported.")
    barcode_fasta = bc.toExternalFiles()[0]
    input_dir = op.dirname(subread_set_file)
    output_dir = op.dirname(output_file_name)
    with SubreadSet(subread_set_file) as ds:
        ds_new = SubreadSet(strict=True)
        for ext_res in ds.externalResources:
            subreads_bam = ext_res.bam
            scraps_bam = ext_res.scraps
            assert subreads_bam is not None
            if scraps_bam is None:
                raise TypeError("The input SubreadSet must include scraps.")
            # Replace only a literal, trailing ".subreads.bam".  The dots
            # are escaped and the pattern anchored so that "." cannot match
            # arbitrary characters elsewhere in the file name.
            new_prefix = op.join(
                output_dir,
                re.sub(r"\.subreads\.bam$", "_barcoded",
                       op.basename(subreads_bam)))
            if not op.isabs(subreads_bam):
                subreads_bam = op.join(input_dir, subreads_bam)
            if not op.isabs(scraps_bam):
                scraps_bam = op.join(input_dir, scraps_bam)
            args = [
                "bam2bam",
                "-j", str(nproc),
                "-b", str(nproc),
                "-o", new_prefix,
                "--barcodes", barcode_fasta,
                "--scoreMode", score_mode,
                subreads_bam, scraps_bam,
            ]
            # NOTE(review): the command is passed as one space-joined
            # string, so paths containing whitespace are not preserved.
            cmd = " ".join(args)
            log.info(cmd)
            result = run_cmd(cmd,
                             stdout_fh=sys.stdout,
                             stderr_fh=sys.stderr)
            if result.exit_code != 0:
                return result.exit_code
            # From here on the names refer to the files bam2bam produced.
            subreads_bam = new_prefix + ".subreads.bam"
            scraps_bam = new_prefix + ".scraps.bam"
            assert op.isfile(subreads_bam), "Missing {f}".format(
                f=subreads_bam)
            add_subread_resources(ds_new,
                                  subreads=subreads_bam,
                                  scraps=scraps_bam,
                                  barcodes=barcode_set_file)
        # Carry filters, metadata and name over from the input dataset.
        ds._filters.clearCallbacks()
        ds_new._filters = ds._filters
        ds_new._populateMetaTypes()
        ds_new.metadata = ds.metadata
        ds_new.name = ds.name + " (barcoded)"
        ds_new.updateCounts()
        ds_new.newUuid()
        ds_new.write(output_file_name)
    return 0
Exemple #6
0
def run_bam_to_bam(subread_set_file, barcode_set_file, output_file_name,
                   nproc=1, score_mode="symmetric"):
    """
    Barcode each external resource of a SubreadSet with ``bam2bam`` and
    write a new SubreadSet pointing at the barcoded BAM files.

    :param subread_set_file: input SubreadSet path; relative BAM paths in
        it are resolved against the directory of this file
    :param barcode_set_file: BarcodeSet path (single resource only)
    :param output_file_name: SubreadSet path to write; bam2bam writes its
        output under a prefix in the same directory
    :param nproc: value given to both ``-j`` and ``-b``
    :param score_mode: either ``"symmetric"`` or ``"asymmetric"``
    :return: 0 on success, otherwise the failing bam2bam exit code
    :raises ValueError: for an unrecognized ``score_mode``
    :raises NotImplementedError: for a BarcodeSet with several resource
        readers
    :raises TypeError: when an external resource lacks a scraps BAM
    """
    if score_mode not in ("asymmetric", "symmetric"):
        raise ValueError("Unrecognized score mode '{m}'".format(m=score_mode))
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError("Multi-FASTA BarcodeSet input is not supported.")
    barcode_fasta = bc.toExternalFiles()[0]
    subread_set_dir = op.dirname(subread_set_file)
    with SubreadSet(subread_set_file) as ds:
        ds_new = SubreadSet(strict=True)
        for ext_res in ds.externalResources:
            subreads_bam = ext_res.bam
            scraps_bam = ext_res.scraps
            assert subreads_bam is not None
            if scraps_bam is None:
                raise TypeError("The input SubreadSet must include scraps.")
            # Strip a literal trailing ".subreads.bam" only: with unescaped
            # dots the pattern would also match e.g. "_subreads_bam" in the
            # middle of a file name.
            barcoded_name = re.sub(r"\.subreads\.bam$", "_barcoded",
                                   op.basename(subreads_bam))
            new_prefix = op.join(op.dirname(output_file_name), barcoded_name)
            if not op.isabs(subreads_bam):
                subreads_bam = op.join(subread_set_dir, subreads_bam)
            if not op.isabs(scraps_bam):
                scraps_bam = op.join(subread_set_dir, scraps_bam)
            args = [
                "bam2bam",
                "-j", str(nproc),
                "-b", str(nproc),
                "-o", new_prefix,
                "--barcodes", barcode_fasta,
                "--scoreMode", score_mode,
                subreads_bam, scraps_bam
            ]
            # NOTE(review): arguments are joined with single spaces into
            # one command string; paths with whitespace will be split.
            command = " ".join(args)
            log.info(command)
            result = run_cmd(command,
                             stdout_fh=sys.stdout,
                             stderr_fh=sys.stderr)
            if result.exit_code != 0:
                return result.exit_code
            # Rebind to the output files expected from bam2bam.
            subreads_bam = new_prefix + ".subreads.bam"
            scraps_bam = new_prefix + ".scraps.bam"
            assert op.isfile(subreads_bam), "Missing {f}".format(f=subreads_bam)
            add_subread_resources(ds_new,
                subreads=subreads_bam,
                scraps=scraps_bam,
                barcodes=barcode_set_file)
        # Copy filters, metadata and a derived name from the input dataset.
        ds._filters.clearCallbacks()
        ds_new._filters = ds._filters
        ds_new._populateMetaTypes()
        ds_new.metadata = ds.metadata
        ds_new.name = ds.name + " (barcoded)"
        ds_new.updateCounts()
        ds_new.newUuid()
        ds_new.write(output_file_name)
    return 0
 def test_sanitize_dataset_tags(self):
     """
     Check the name and tags left behind by ``sanitize_dataset_tags``.

     Expected: " (filtered)" is dropped from the name, the "filtered" tag
     is removed, the remaining tags come back in sorted order without
     stray whitespace or empty entries, and "hidden" is removed only when
     ``remove_hidden=True``.
     """
     dataset = SubreadSet(pbtestdata.get_file("subreads-sequel"))
     base_name = dataset.name
     expected_name = base_name + " (CCS)"

     dataset.name += " (filtered) (CCS)"
     dataset.tags = "subreads,hidden,testdata,filtered "

     # Default call: "filtered" goes away, "hidden" stays.
     sanitize_dataset_tags(dataset)
     assert dataset.name == expected_name
     assert dataset.tags == "hidden,subreads,testdata"

     # With remove_hidden the "hidden" tag is dropped as well.
     sanitize_dataset_tags(dataset, remove_hidden=True)
     assert dataset.name == expected_name
     assert dataset.tags == "subreads,testdata"

     # Leading separators and blanks must not leave empty tags.
     dataset.tags = ", hidden, ccs"
     sanitize_dataset_tags(dataset, remove_hidden=True)
     assert dataset.tags == "ccs"
    def test_mock_update_barcoded_sample_metadata(self):
        """
        Exercise ``mock_update_barcoded_sample_metadata`` on a fabricated
        lima datastore.

        An input SubreadSet is given two barcoded bio samples, two
        per-barcode copies of it are written with fresh UUIDs, those copies
        are listed in a datastore JSON, and the datastore returned by the
        function under test is passed to
        ``validate_barcoded_datastore_files``.
        """
        work_dir = tempfile.mkdtemp()
        datastore_json = op.join(work_dir, "lima.datastore.json")
        barcodeset = pbtestdata.get_file("barcodeset")
        barcode_names = ["lbc1--lbc1", "lbc3--lbc3"]
        barcoded_paths = [
            op.join(work_dir, "lima.lbc1--lbc1.subreadset.xml"),
            op.join(work_dir, "lima.lbc3--lbc3.subreadset.xml")
        ]
        dataset_ids = [uuid.uuid4() for _ in barcoded_paths]
        # XXX these are hardcoded to match the actual barcoded test input
        barcode_ids = [
            "dffb30e8-9243-4743-9980-468a20952167",
            "eef1a8ea-c6a7-4233-982a-d426e1e7d8c9"
        ]
        source_ds = SubreadSet(pbtestdata.get_file("subreads-sequel"))

        def _bio_samples():
            # Looked up afresh on every use, never cached.
            return source_ds.metadata.collections[0].wellSample.bioSamples

        def _add_barcoded_sample(sn, bn, id_):
            _bio_samples().addSample(sn)
            _bio_samples()[-1].DNABarcodes.addBarcode(bn)
            _bio_samples()[-1].DNABarcodes[-1].uniqueId = id_

        _add_barcoded_sample("Alice", "lbc1--lbc1", barcode_ids[0])
        _add_barcoded_sample("Charles", "lbc3--lbc3", barcode_ids[1])
        input_xml = op.join(work_dir, "input.subreadset.xml")
        source_ds.write(input_xml)

        # One renamed copy of the input per barcode, each with its own UUID.
        for path, barcode, ds_id in zip(barcoded_paths, barcode_names,
                                        dataset_ids):
            copy_ds = SubreadSet(input_xml)
            copy_ds.uuid = str(ds_id)
            copy_ds.name = copy_ds.name + " ({b})".format(b=barcode)
            copy_ds.write(path)

        store_files = []
        for ds_id, path in zip(dataset_ids, barcoded_paths):
            store_files.append(
                DataStoreFile(ds_id, "barcoding.tasks.lima-0",
                              FileTypes.DS_SUBREADS.file_type_id, path))
        DataStore(store_files).write_json(datastore_json)

        base_dir = tempfile.mkdtemp()
        datastore = mock_update_barcoded_sample_metadata(
            base_dir, datastore_json, input_xml, barcodeset)
        validate_barcoded_datastore_files(self,
                                          input_xml,
                                          datastore,
                                          number_of_expected_filters=0)