Ejemplo n.º 1
0
def split_dataset(subreadset, out_prefix):
    """
    Takes an input dataset, and for each entry generates one separate dataset
    file, while maintaining all the filters.
    Returns a FOFN of the generated datasets.

    To create an example filtered dataset for testing:
    dataset create --type SubreadSet test.subreadset.xml subreads1.bam subreads2.bam
    dataset filter test.subreadset.xml test.filtered.subreadset.xml 'length>1000'
    """
    out_prefix_abs = os.path.abspath(out_prefix)

    dset = SubreadSet(subreadset, strict=True)
    fns = dset.toFofn()

    log.info('resources in {!r}:\n{}'.format(subreadset, '\n'.join(fns)))

    fofn = []
    for i, bam_fn in enumerate(fns):
        out_fn = '{}.{:05}.subreadset.xml'.format(out_prefix_abs, i)
        new_dataset = SubreadSet(bam_fn)
        new_dataset.newUuid()
        new_dataset._filters = copy.deepcopy(dset._filters)
        new_dataset.write(out_fn)
        fofn.append(out_fn)

    return fofn
Ejemplo n.º 2
0
def split_dataset(subreadset, out_prefix):
    """
    Takes an input dataset, and for each entry generates one separate dataset
    file, while maintaining all the filters.
    Returns a list of the generated datasets.

    To create an example filtered dataset for testing:
    dataset create --type SubreadSet test.subreadset.xml subreads1.bam subreads2.bam
    dataset filter test.subreadset.xml test.filtered.subreadset.xml 'length>1000'
    """
    out_prefix_abs = os.path.abspath(out_prefix)

    dset = SubreadSet(subreadset, strict=True, skipCounts=True)
    fns = dset.toFofn()

    log.info('resources in {!r}:\n{}'.format(subreadset, '\n'.join(fns)))

    split_fns = []
    for i, bam_fn in enumerate(fns):
        out_fn = '{}.{:05}.subreadset.xml'.format(out_prefix_abs, i)
        new_dataset = SubreadSet(bam_fn, skipCounts=True)
        new_dataset.newUuid()
        new_dataset._filters = copy.deepcopy(dset._filters)
        new_dataset.write(out_fn)
        split_fns.append(out_fn)

    return split_fns
Ejemplo n.º 3
0
def run_bam_to_bam(subread_set_file, barcode_set_file, output_file_name,
                   nproc=1):
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError("Multi-FASTA BarcodeSet input is not supported.")
    barcode_fasta = bc.toExternalFiles()[0]
    with SubreadSet(subread_set_file) as ds:
        # TODO(nechols)(2016-03-15): replace with BarcodedSubreadSet
        ds_new = SubreadSet(strict=True)
        for ext_res in ds.externalResources:
            subreads_bam = ext_res.bam
            scraps_bam = ext_res.scraps
            assert subreads_bam is not None
            if scraps_bam is None:
                raise TypeError("The input SubreadSet must include scraps.")
            new_prefix = op.join(op.dirname(output_file_name),
                re.sub(".subreads.bam", "_barcoded", op.basename(subreads_bam)))
            if not op.isabs(subreads_bam):
                subreads_bam = op.join(op.dirname(subread_set_file),
                    subreads_bam)
            if not op.isabs(scraps_bam):
                scraps_bam = op.join(op.dirname(subread_set_file), scraps_bam)
            args = [
                "bam2bam",
                "-j", str(nproc),
                "-b", str(nproc),
                "-o", new_prefix,
                "--barcodes", barcode_fasta,
                subreads_bam, scraps_bam
            ]
            print args
            log.info(" ".join(args))
            result = run_cmd(" ".join(args),
                             stdout_fh=sys.stdout,
                             stderr_fh=sys.stderr)
            if result.exit_code != 0:
                return result.exit_code
            subreads_bam = new_prefix + ".subreads.bam"
            scraps_bam = new_prefix + ".scraps.bam"
            assert op.isfile(subreads_bam), "Missing {f}".format(f=subreads_bam)
            # FIXME we need a more general method for this
            ext_res_new = ExternalResource()
            ext_res_new.resourceId = subreads_bam
            ext_res_new.metaType = 'PacBio.SubreadFile.SubreadBamFile'
            ext_res_new.addIndices([subreads_bam + ".pbi"])
            ext_res_inner = ExternalResources()
            ext_res_scraps = ExternalResource()
            ext_res_scraps.resourceId = scraps_bam
            ext_res_scraps.metaType = 'PacBio.SubreadFile.ScrapsBamFile'
            ext_res_scraps.addIndices([scraps_bam + ".pbi"])
            ext_res_inner.append(ext_res_scraps)
            ext_res_new.append(ext_res_inner)
            ds_new.externalResources.append(ext_res_new)
        ds._filters.clearCallbacks()
        ds_new._filters = ds._filters
        ds_new._populateMetaTypes()
        ds_new.updateCounts()
        ds_new.write(output_file_name)
    return 0
Ejemplo n.º 4
0
def run_bam_to_bam(subread_set_file,
                   barcode_set_file,
                   output_file_name,
                   nproc=1,
                   score_mode="symmetric"):
    if not score_mode in ["asymmetric", "symmetric"]:
        raise ValueError("Unrecognized score mode '{m}'".format(m=score_mode))
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError(
            "Multi-FASTA BarcodeSet input is not supported.")
    barcode_fasta = bc.toExternalFiles()[0]
    with SubreadSet(subread_set_file) as ds:
        ds_new = SubreadSet(strict=True)
        for ext_res in ds.externalResources:
            subreads_bam = ext_res.bam
            scraps_bam = ext_res.scraps
            assert subreads_bam is not None
            if scraps_bam is None:
                raise TypeError("The input SubreadSet must include scraps.")
            new_prefix = op.join(
                op.dirname(output_file_name),
                re.sub(".subreads.bam", "_barcoded",
                       op.basename(subreads_bam)))
            if not op.isabs(subreads_bam):
                subreads_bam = op.join(op.dirname(subread_set_file),
                                       subreads_bam)
            if not op.isabs(scraps_bam):
                scraps_bam = op.join(op.dirname(subread_set_file), scraps_bam)
            args = [
                "bam2bam", "-j",
                str(nproc), "-b",
                str(nproc), "-o", new_prefix, "--barcodes", barcode_fasta,
                "--scoreMode", score_mode, subreads_bam, scraps_bam
            ]
            log.info(" ".join(args))
            result = run_cmd(" ".join(args),
                             stdout_fh=sys.stdout,
                             stderr_fh=sys.stderr)
            if result.exit_code != 0:
                return result.exit_code
            subreads_bam = new_prefix + ".subreads.bam"
            scraps_bam = new_prefix + ".scraps.bam"
            assert op.isfile(subreads_bam), "Missing {f}".format(
                f=subreads_bam)
            add_subread_resources(ds_new,
                                  subreads=subreads_bam,
                                  scraps=scraps_bam,
                                  barcodes=barcode_set_file)
        ds._filters.clearCallbacks()
        ds_new._filters = ds._filters
        ds_new._populateMetaTypes()
        ds_new.metadata = ds.metadata
        ds_new.name = ds.name + " (barcoded)"
        ds_new.updateCounts()
        ds_new.newUuid()
        ds_new.write(output_file_name)
    return 0
Ejemplo n.º 5
0
def run_bam_to_bam(subread_set_file, barcode_set_file, output_file_name,
                   nproc=1, score_mode="symmetric"):
    if not score_mode in ["asymmetric", "symmetric"]:
        raise ValueError("Unrecognized score mode '{m}'".format(m=score_mode))
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError("Multi-FASTA BarcodeSet input is not supported.")
    barcode_fasta = bc.toExternalFiles()[0]
    with SubreadSet(subread_set_file) as ds:
        ds_new = SubreadSet(strict=True)
        for ext_res in ds.externalResources:
            subreads_bam = ext_res.bam
            scraps_bam = ext_res.scraps
            assert subreads_bam is not None
            if scraps_bam is None:
                raise TypeError("The input SubreadSet must include scraps.")
            new_prefix = op.join(op.dirname(output_file_name),
                re.sub(".subreads.bam", "_barcoded", op.basename(subreads_bam)))
            if not op.isabs(subreads_bam):
                subreads_bam = op.join(op.dirname(subread_set_file),
                    subreads_bam)
            if not op.isabs(scraps_bam):
                scraps_bam = op.join(op.dirname(subread_set_file), scraps_bam)
            args = [
                "bam2bam",
                "-j", str(nproc),
                "-b", str(nproc),
                "-o", new_prefix,
                "--barcodes", barcode_fasta,
                "--scoreMode", score_mode,
                subreads_bam, scraps_bam
            ]
            log.info(" ".join(args))
            result = run_cmd(" ".join(args),
                             stdout_fh=sys.stdout,
                             stderr_fh=sys.stderr)
            if result.exit_code != 0:
                return result.exit_code
            subreads_bam = new_prefix + ".subreads.bam"
            scraps_bam = new_prefix + ".scraps.bam"
            assert op.isfile(subreads_bam), "Missing {f}".format(f=subreads_bam)
            add_subread_resources(ds_new,
                subreads=subreads_bam,
                scraps=scraps_bam,
                barcodes=barcode_set_file)
        ds._filters.clearCallbacks()
        ds_new._filters = ds._filters
        ds_new._populateMetaTypes()
        ds_new.metadata = ds.metadata
        ds_new.name = ds.name + " (barcoded)"
        ds_new.updateCounts()
        ds_new.newUuid()
        ds_new.write(output_file_name)
    return 0