Example #1
def run_bam_to_bam(subread_set_file, barcode_set_file, output_file_name,
                   nproc=1, score_mode="symmetric"):
    if score_mode not in ["asymmetric", "symmetric"]:
        raise ValueError("Unrecognized score mode '{m}'".format(m=score_mode))
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError("Multi-FASTA BarcodeSet input is not supported.")
    new_prefix = re.sub(".subreadset.xml$", "", output_file_name)
    args = [
        "bam2bam",
        "-j", str(nproc),
        "-b", str(nproc),
        "-o", new_prefix,
        "--barcodes", barcode_set_file,
        "--scoreMode", score_mode,
        subread_set_file
    ]
    log.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    assert op.isfile(output_file_name)
    tmp_out = op.join(op.dirname(output_file_name),
                      "tmp_" + op.basename(output_file_name))
    shutil.move(output_file_name, tmp_out)
    with SubreadSet(tmp_out, strict=True) as ds:
        with SubreadSet(subread_set_file) as ds_in:
            ds.metadata = ds_in.metadata
            ds.name = ds_in.name + " (barcoded)"
        ds.updateCounts()
        ds.newUuid()
        ds.write(output_file_name)
    return 0
Example #3
def run_bam_to_bam(subread_set_file, barcode_set_file, output_file_name,
                   nproc=1):
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError("Multi-FASTA BarcodeSet input is not supported.")
    barcode_fasta = bc.toExternalFiles()[0]
    with SubreadSet(subread_set_file) as ds:
        # TODO(nechols)(2016-03-15): replace with BarcodedSubreadSet
        ds_new = SubreadSet(strict=True)
        for ext_res in ds.externalResources:
            subreads_bam = ext_res.bam
            scraps_bam = ext_res.scraps
            assert subreads_bam is not None
            if scraps_bam is None:
                raise TypeError("The input SubreadSet must include scraps.")
            new_prefix = op.join(op.dirname(output_file_name),
                re.sub(".subreads.bam", "_barcoded", op.basename(subreads_bam)))
            if not op.isabs(subreads_bam):
                subreads_bam = op.join(op.dirname(subread_set_file),
                    subreads_bam)
            if not op.isabs(scraps_bam):
                scraps_bam = op.join(op.dirname(subread_set_file), scraps_bam)
            args = [
                "bam2bam",
                "-j", str(nproc),
                "-b", str(nproc),
                "-o", new_prefix,
                "--barcodes", barcode_fasta,
                subreads_bam, scraps_bam
            ]
            print(args)
            log.info(" ".join(args))
            result = run_cmd(" ".join(args),
                             stdout_fh=sys.stdout,
                             stderr_fh=sys.stderr)
            if result.exit_code != 0:
                return result.exit_code
            subreads_bam = new_prefix + ".subreads.bam"
            scraps_bam = new_prefix + ".scraps.bam"
            assert op.isfile(subreads_bam), "Missing {f}".format(f=subreads_bam)
            # FIXME we need a more general method for this
            ext_res_new = ExternalResource()
            ext_res_new.resourceId = subreads_bam
            ext_res_new.metaType = 'PacBio.SubreadFile.SubreadBamFile'
            ext_res_new.addIndices([subreads_bam + ".pbi"])
            ext_res_inner = ExternalResources()
            ext_res_scraps = ExternalResource()
            ext_res_scraps.resourceId = scraps_bam
            ext_res_scraps.metaType = 'PacBio.SubreadFile.ScrapsBamFile'
            ext_res_scraps.addIndices([scraps_bam + ".pbi"])
            ext_res_inner.append(ext_res_scraps)
            ext_res_new.append(ext_res_inner)
            ds_new.externalResources.append(ext_res_new)
        ds._filters.clearCallbacks()
        ds_new._filters = ds._filters
        ds_new._populateMetaTypes()
        ds_new.updateCounts()
        ds_new.write(output_file_name)
    return 0
Example #4
def run_bam_to_bam(subread_set_file,
                   barcode_set_file,
                   output_file_name,
                   nproc=1,
                   score_mode="symmetric"):
    if score_mode not in ["asymmetric", "symmetric"]:
        raise ValueError("Unrecognized score mode '{m}'".format(m=score_mode))
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError(
            "Multi-FASTA BarcodeSet input is not supported.")
    barcode_fasta = bc.toExternalFiles()[0]
    with SubreadSet(subread_set_file) as ds:
        ds_new = SubreadSet(strict=True)
        for ext_res in ds.externalResources:
            subreads_bam = ext_res.bam
            scraps_bam = ext_res.scraps
            assert subreads_bam is not None
            if scraps_bam is None:
                raise TypeError("The input SubreadSet must include scraps.")
            new_prefix = op.join(
                op.dirname(output_file_name),
                re.sub(".subreads.bam", "_barcoded",
                       op.basename(subreads_bam)))
            if not op.isabs(subreads_bam):
                subreads_bam = op.join(op.dirname(subread_set_file),
                                       subreads_bam)
            if not op.isabs(scraps_bam):
                scraps_bam = op.join(op.dirname(subread_set_file), scraps_bam)
            args = [
                "bam2bam", "-j",
                str(nproc), "-b",
                str(nproc), "-o", new_prefix, "--barcodes", barcode_fasta,
                "--scoreMode", score_mode, subreads_bam, scraps_bam
            ]
            log.info(" ".join(args))
            result = run_cmd(" ".join(args),
                             stdout_fh=sys.stdout,
                             stderr_fh=sys.stderr)
            if result.exit_code != 0:
                return result.exit_code
            subreads_bam = new_prefix + ".subreads.bam"
            scraps_bam = new_prefix + ".scraps.bam"
            assert op.isfile(subreads_bam), "Missing {f}".format(
                f=subreads_bam)
            add_subread_resources(ds_new,
                                  subreads=subreads_bam,
                                  scraps=scraps_bam,
                                  barcodes=barcode_set_file)
        ds._filters.clearCallbacks()
        ds_new._filters = ds._filters
        ds_new._populateMetaTypes()
        ds_new.metadata = ds.metadata
        ds_new.name = ds.name + " (barcoded)"
        ds_new.updateCounts()
        ds_new.newUuid()
        ds_new.write(output_file_name)
    return 0
Example #5
 def test_barcodeset(self):
     fa_out = tempfile.NamedTemporaryFile(suffix=".fasta").name
     with open(fa_out, "w") as f:
         f.write(">bc1\nAAAAAAAAAAAAAAAA\n>bc2\nCCCCCCCCCCCCCCCC")
     ds = BarcodeSet(fa_out)
     ds.induceIndices()
     self.assertEqual([r.id for r in ds], ["bc1","bc2"])
     ds_out = tempfile.NamedTemporaryFile(suffix=".barcodeset.xml").name
     ds.write(ds_out)
Example #6
def run_bam_to_bam(subread_set_file, barcode_set_file, output_file_name,
                   nproc=1, score_mode="symmetric"):
    if score_mode not in ["asymmetric", "symmetric"]:
        raise ValueError("Unrecognized score mode '{m}'".format(m=score_mode))
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError("Multi-FASTA BarcodeSet input is not supported.")
    barcode_fasta = bc.toExternalFiles()[0]
    with SubreadSet(subread_set_file) as ds:
        ds_new = SubreadSet(strict=True)
        for ext_res in ds.externalResources:
            subreads_bam = ext_res.bam
            scraps_bam = ext_res.scraps
            assert subreads_bam is not None
            if scraps_bam is None:
                raise TypeError("The input SubreadSet must include scraps.")
            new_prefix = op.join(op.dirname(output_file_name),
                re.sub(".subreads.bam", "_barcoded", op.basename(subreads_bam)))
            if not op.isabs(subreads_bam):
                subreads_bam = op.join(op.dirname(subread_set_file),
                    subreads_bam)
            if not op.isabs(scraps_bam):
                scraps_bam = op.join(op.dirname(subread_set_file), scraps_bam)
            args = [
                "bam2bam",
                "-j", str(nproc),
                "-b", str(nproc),
                "-o", new_prefix,
                "--barcodes", barcode_fasta,
                "--scoreMode", score_mode,
                subreads_bam, scraps_bam
            ]
            log.info(" ".join(args))
            result = run_cmd(" ".join(args),
                             stdout_fh=sys.stdout,
                             stderr_fh=sys.stderr)
            if result.exit_code != 0:
                return result.exit_code
            subreads_bam = new_prefix + ".subreads.bam"
            scraps_bam = new_prefix + ".scraps.bam"
            assert op.isfile(subreads_bam), "Missing {f}".format(f=subreads_bam)
            add_subread_resources(ds_new,
                subreads=subreads_bam,
                scraps=scraps_bam,
                barcodes=barcode_set_file)
        ds._filters.clearCallbacks()
        ds_new._filters = ds._filters
        ds_new._populateMetaTypes()
        ds_new.metadata = ds.metadata
        ds_new.name = ds.name + " (barcoded)"
        ds_new.updateCounts()
        ds_new.newUuid()
        ds_new.write(output_file_name)
    return 0
Example #7
def _make_barcodes(file_name=None):
    if file_name is None:
        file_name = tempfile.NamedTemporaryFile(suffix=".barcodeset.xml").name
    fasta_file_name = file_name
    if file_name.endswith(".barcodeset.xml"):
        fasta_file_name = re.sub(".barcodeset.xml", ".fasta", file_name)
    with FastaWriter(fasta_file_name) as fa_out:
        for i in range(1010):
            fa_out.writeRecord("%04d_Forward" % i, "A" * 16)
    pysam.faidx(fasta_file_name, catch_stdout=False)
    ds = BarcodeSet(fasta_file_name, strict=True)
    ds.write(file_name)
    return file_name
Example #8
def estimate_lima_memory(barcodes, dataset, symmetric):
    """
    Returns estimated system memory for lima, with a minimum of 2GB (which is
    wildly excessive).  Memory is increased only if there are more than 500
    unique barcode pairs being identified.
    """
    n_barcodes = len(BarcodeSet(barcodes, strict=True))
    if symmetric and n_barcodes < 500:
        log.info("symmetric with {n} barcodes, will use default memory".format(
            n=n_barcodes))
        return DEFAULT_MEM_GB
    else:
        with openDataSet(dataset, skipCounts=True) as reads:
            bioSampleBarcodes = []
            for coll in reads.metadata.collections:
                for bioSample in coll.wellSample.bioSamples:
                    bioSampleBarcodes.extend(bioSample.DNABarcodes)
            n_bc_pairs = len(bioSampleBarcodes)
            if n_bc_pairs == 0:
                log.warning("No biosamples defined, assuming all barcodes used")
                n_bc_pairs = n_barcodes
                if not symmetric:
                    n_bc_pairs *= n_bc_pairs
            if 0 < n_bc_pairs < 500:
                log.info(
                    "only {n} sample barcodes, will use default memory".format(
                        n=n_bc_pairs))
                return DEFAULT_MEM_GB
            else:
                bam_files = [er.bam for er in reads.externalResources]
                bam_size_bytes = sum([op.getsize(f) for f in bam_files])
                bam_size_gb = int(math.ceil(bam_size_bytes / 1024**3))
                log.info("guessing memory from total BAM size ({m} GB)".format(
                    m=bam_size_gb))
                return DEFAULT_MEM_GB + (bam_size_gb * BAM_COMPRESSION_FACTOR)
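The sizing rule in the docstring above can be checked in isolation. This is a minimal sketch, assuming placeholder values for DEFAULT_MEM_GB and BAM_COMPRESSION_FACTOR (the real module defines its own constants, which are not shown here).

import math

# Hypothetical constants for illustration only; the real module defines its own.
DEFAULT_MEM_GB = 2
BAM_COMPRESSION_FACTOR = 4

def _estimate_from_bam_size(bam_size_bytes):
    # Mirrors the fallback branch above: round the total BAM size up to whole
    # gigabytes and scale by the assumed compression factor.
    bam_size_gb = int(math.ceil(bam_size_bytes / 1024**3))
    return DEFAULT_MEM_GB + bam_size_gb * BAM_COMPRESSION_FACTOR

# A 10 GB BAM with these placeholder constants -> 2 + 10 * 4 = 42 GB
print(_estimate_from_bam_size(10 * 1024**3))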
Example #9
def _labels_reads_iterator(reads, barcodes, subreads=True):
    with openDataSet(reads) as ds:
        for er in ds.externalResources:
            if er.barcodes != barcodes:
                raise ValueError(
                    "Mismatch between external resource " +
                    "barcodes and input BarcodeSet: " +
                    "{a} != {b}".format(a=er.barcodes, b=barcodes))
        assert ds.isIndexed
        zmws_by_barcode = defaultdict(set)
        reads_by_zmw = defaultdict(list)
        for rr in ds.resourceReaders():
            for i, (b, z, q) in enumerate(
                    zip(rr.pbi.bcForward, rr.pbi.holeNumber, rr.pbi.qId)):
                movie = rr.readGroupInfo(q).MovieName
                zmws_by_barcode[b].add((movie, z))
                reads_by_zmw[(movie, z)].append((rr, i))
        with BarcodeSet(barcodes) as bc:
            for i_bc, barcode in enumerate(bc):
                zmws = sorted(list(zmws_by_barcode[i_bc]))
                for (movie, zmw) in zmws:
                    for rr, i_read in reads_by_zmw[(movie, zmw)]:
                        # FIXME(nechols)(2016-03-15) this will not work on CCS
                        qlen = rr.pbi.qEnd[i_read] - rr.pbi.qStart[i_read]
                        barcode_id = "{f}--{r}".format(
                            f=rr.pbi.bcForward[i_read],
                            r=rr.pbi.bcReverse[i_read])
                        yield barcode_id, barcode, ["n"] * qlen
Example #10
def _load_files_for_update(input_reads,
                           barcode_set,
                           datastore_file,
                           require_file_id="barcoding.tasks.lima-0"):
    barcode_names = []
    with BarcodeSet(barcode_set) as bc_in:
        for rec in bc_in:
            barcode_names.append(rec.id)
    parent_ds = openDataSet(input_reads)
    parent_info = (parent_ds.uuid, parent_ds.datasetType, parent_ds.name)
    update_files = []
    for f in iterate_datastore_read_set_files(datastore_file):
        if require_file_id is None or f.file_id == require_file_id:
            update_files.append(f)
    bio_samples_d = {}
    barcode_uuids_d = {}

    def _update_from_sample(bio_sample):
        for dnabc in bio_sample.DNABarcodes:
            bio_samples_d[dnabc.name] = bio_sample.name
            barcode_uuids_d[dnabc.name] = dnabc.uniqueId

    for collection in parent_ds.metadata.collections:
        for bio_sample in collection.wellSample.bioSamples:
            _update_from_sample(bio_sample)
        break
    if len(parent_ds.metadata.collections) == 0:
        log.warning("No CollectionMetadata, looking for BioSamples elsewhere")
        for bio_sample in parent_ds.metadata.bioSamples:
            _update_from_sample(bio_sample)
    return barcode_names, bio_samples_d, barcode_uuids_d, update_files, parent_info
Example #11
def run_to_report(ds_bc_file,
                  barcodes_file,
                  subreads_in_file,
                  base_dir=None,
                  isoseq_mode=False):
    """
    Generate a Report instance from a SubreadSet and BarcodeSet.
    """
    barcoded_reads = get_barcoded_dataset(ds_bc_file)
    subreads_in = SubreadSet(subreads_in_file, strict=True)
    barcodes = BarcodeSet(barcodes_file)
    ds_bc_uuids = [barcoded_reads.uuid]
    if len(barcoded_reads.subdatasets) > 0:
        ds_bc_uuids = [ds.uuid for ds in barcoded_reads.subdatasets]
    dataset_uuids = [barcodes.uuid, subreads_in.uuid] + ds_bc_uuids
    biosamples = get_biosample_dict(barcoded_reads)
    read_info = list(iter_reads_by_barcode(barcoded_reads, barcodes, isoseq_mode)) + \
        list(get_unbarcoded_reads_info(subreads_in, barcoded_reads))
    if isinstance(barcoded_reads, SubreadSet):
        return make_report(biosamples=biosamples,
                           read_info=read_info,
                           dataset_uuids=dataset_uuids,
                           base_dir=base_dir)
    else:
        return make_report_ccs(biosamples=biosamples,
                               read_info=read_info,
                               dataset_uuids=dataset_uuids,
                               base_dir=base_dir)
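A minimal usage sketch for run_to_report, assuming the module defining it is importable; the file names below are hypothetical placeholders for the outputs of a barcoding run.

# Hypothetical inputs produced by a barcoding run.
report = run_to_report("barcoded.subreadset.xml",
                       "barcodes.barcodeset.xml",
                       "input.subreadset.xml",
                       base_dir=".",
                       isoseq_mode=False)
report.write_json("barcode_report.json")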
Example #12
def _make_barcode_table(bam_stats, ccs_set):
    """
    Generate a table of per-barcode results
    """
    barcode_counts = defaultdict(int)
    barcode_nbases = defaultdict(int)
    barcode_npasses = defaultdict(list)
    barcode_readscores = defaultdict(list)
    is_symmetric = all([r.bcForward == r.bcReverse for r in bam_stats])
    for r in bam_stats:
        key = r.bcForward
        if not is_symmetric:
            key = (r.bcForward, r.bcReverse)
        barcode_counts[key] += 1
        barcode_nbases[key] += r.qLen
        barcode_npasses[key].append(r.numPasses)
        barcode_readscores[key].append(r.readScore)
    barcode_labels = {}
    for er in ccs_set.externalResources:
        bcs = er.barcodes
        if bcs is not None:
            with BarcodeSet(bcs) as bc_set:
                for i_bc, rec in enumerate(bc_set):
                    if i_bc in barcode_labels:
                        assert barcode_labels[
                            i_bc] == rec.id, "Barcode ID mismatch: {l} versus {r}".format(
                                l=barcode_labels[i_bc], r=rec.id)
                    else:
                        barcode_labels[i_bc] = rec.id
    barcode_ids = sorted(barcode_counts.keys())
    counts = [barcode_counts[i_bc] for i_bc in barcode_ids]
    nbases = [barcode_nbases[i_bc] for i_bc in barcode_ids]
    mean_length = [int(float(n) / c) for (c, n) in zip(counts, nbases)]
    labels = []
    for i_bc in barcode_ids:
        if is_symmetric:
            labels.append(barcode_labels.get(i_bc, Constants.NO_BC_LABEL))
        else:
            labels.append("{f}, {r}".format(
                f=barcode_labels.get(i_bc[0], Constants.NO_BC_LABEL),
                r=barcode_labels.get(i_bc[1], Constants.NO_BC_LABEL)))
    npasses = [
        sum(barcode_npasses[i_bc]) / len(barcode_npasses[i_bc])
        for i_bc in barcode_ids
    ]
    readquals = [
        sum(barcode_readscores[i_bc]) / len(barcode_readscores[i_bc])
        for i_bc in barcode_ids
    ]
    assert len(labels) == len(counts) == len(nbases)
    columns = [
        Column(Constants.C_BARCODE_ID, values=labels),
        Column(Constants.C_BARCODE_COUNTS, values=counts),
        Column(Constants.C_BARCODE_NBASES, values=nbases),
        Column(Constants.C_BARCODE_READLENGTH, values=mean_length),
        Column(Constants.C_BARCODE_QUALITY, values=readquals),
        Column(Constants.C_BARCODE_NPASSES, values=npasses)
    ]
    return Table(Constants.T_BARCODES, columns=columns)
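A tiny standalone illustration of the keying scheme used above: symmetric runs are keyed by the forward barcode index alone, asymmetric runs by the (forward, reverse) pair. The Row records are hypothetical stand-ins for the pbi-derived bam_stats entries.

from collections import defaultdict, namedtuple

# Hypothetical stand-ins for the pbi-derived rows consumed above.
Row = namedtuple("Row", ["bcForward", "bcReverse", "qLen"])
rows = [Row(0, 0, 500), Row(0, 1, 300), Row(2, 2, 400)]

is_symmetric = all(r.bcForward == r.bcReverse for r in rows)
counts = defaultdict(int)
for r in rows:
    key = r.bcForward if is_symmetric else (r.bcForward, r.bcReverse)
    counts[key] += 1
print(is_symmetric, dict(counts))
# False {(0, 0): 1, (0, 1): 1, (2, 2): 1}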
Example #13
 def _make_barcodeset(self, fasta_str):
     tmp_fasta = tempfile.NamedTemporaryFile(suffix=".fasta").name
     with open(tmp_fasta, "w") as fasta_out:
         fasta_out.write(fasta_str)
     tmp_bc = tempfile.NamedTemporaryFile(suffix=".barcodeset.xml").name
     with BarcodeSet(tmp_fasta, generateIndices=True) as ds_bc:
         ds_bc.write(tmp_bc)
     return tmp_bc
Example #14
def _make_barcode_table(bam_stats, ccs_set):
    """
    Generate a table of per-barcode results
    """
    barcode_counts = defaultdict(int)
    barcode_nbases = defaultdict(int)
    barcode_npasses = defaultdict(list)
    barcode_readscores = defaultdict(list)
    for r in bam_stats:
        barcode_counts[r.bc] += 1
        barcode_nbases[r.bc] += r.qLen
        barcode_npasses[r.bc].append(r.numPasses)
        barcode_readscores[r.bc].append(r.readScore)
    barcode_labels = {}
    for er in ccs_set.externalResources:
        bcs = er.barcodes
        if bcs is not None:
            with BarcodeSet(bcs) as bc_set:
                for i_bc, rec in enumerate(bc_set):
                    if i_bc in barcode_labels:
                        assert barcode_labels[
                            i_bc] == rec.id, "Barcode ID mismatch: {l} versus {r}".format(
                                l=barcode_labels[i_bc], r=rec.id)
                    else:
                        barcode_labels[i_bc] = rec.id
    barcode_ids = sorted(barcode_counts.keys())
    counts = [barcode_counts[i_bc] for i_bc in barcode_ids]
    nbases = [barcode_nbases[i_bc] for i_bc in barcode_ids]
    mean_length = [int(float(n) / c) for (c, n) in zip(counts, nbases)]
    labels = [str(barcode_labels.get(i_bc, i_bc)) for i_bc in barcode_ids]
    npasses = [
        sum(barcode_npasses[i_bc]) / len(barcode_npasses[i_bc])
        for i_bc in barcode_ids
    ]
    readquals = [
        sum(barcode_readscores[i_bc]) / len(barcode_readscores[i_bc])
        for i_bc in barcode_ids
    ]
    assert len(labels) == len(counts) == len(nbases)
    columns = [
        Column(Constants.C_BARCODE_ID, values=labels, header="Barcode ID"),
        Column(Constants.C_BARCODE_COUNTS, values=counts, header="CCS reads"),
        Column(Constants.C_BARCODE_NBASES,
               values=nbases,
               header="Number of CCS bases"),
        Column(Constants.C_BARCODE_READLENGTH,
               values=mean_length,
               header="CCS Read Length (mean)"),
        Column(Constants.C_BARCODE_QUALITY,
               values=readquals,
               header="CCS Read Score (mean)"),
        Column(Constants.C_BARCODE_NPASSES,
               values=npasses,
               header="Number of Passes (mean)")
    ]
    return Table(Constants.T_BARCODES, columns=columns, title="By Barcode")
Example #15
def resolved_tool_contract_runner(rtc):
    log.info("Starting {f} version {v} report generation".format(
        f=__file__, v=__version__))
    dataset_uuids = [
        openDataSet(rtc.task.input_files[0]).uuid,
        BarcodeSet(rtc.task.input_files[1]).uuid
    ]
    report = run_to_report(reads=rtc.task.input_files[0],
                           barcodes=rtc.task.input_files[1],
                           subreads=True,
                           dataset_uuids=dataset_uuids)
    log.info(pformat(report.to_dict()))
    report.write_json(rtc.task.output_files[0])
    return 0
Example #16
 def test_iter_reads_by_barcode(self):
     datasets = [
         SubreadSet(self.subreads),
         get_barcoded_dataset(_make_datastore(self.subreads))
     ]
     barcodes = BarcodeSet(self.barcodes)
     for ds in datasets:
         table = sorted(iter_reads_by_barcode(ds, barcodes),
                        key=lambda r: r.nbases, reverse=True)
         self.assertEqual([r.label for r in table],
                          ["Not Barcoded", "lbc1--lbc1", "lbc3--lbc3"])
         self.assertEqual([r.idx for r in table], ["None", "0--0", "2--2"])
         self.assertEqual([r.nbases for r in table], [9791, 1436, 204])
         self.assertEqual([r.n_subreads for r in table], [1, 1, 1])
Example #17
 def test_barcodeset(self):
     fa_out = tempfile.NamedTemporaryFile(suffix=".fasta").name
     with open(fa_out, "w") as f:
         f.write(">bc1\nAAAAAAAAAAAAAAAA\n>bc2\nCCCCCCCCCCCCCCCC")
     ds = BarcodeSet(fa_out)
     ds.induceIndices()
     assert [r.id for r in ds] == ["bc1", "bc2"]
     ds_out = tempfile.NamedTemporaryFile(suffix=".barcodeset.xml").name
     ds.write(ds_out)
Example #18
def _run_args(args):
    ds = BarcodeSet(args.barcodeset, strict=True)
    if len(ds) > 1:
        alarm = PacBioAlarm(
            exception=None,
            info=None,
            message=
            "This application currently only supports a single PCR primer.  To trim adapters when multiple primers are used, please run lima on the command line.",
            name="Multiple Primers",
            severity=logging.ERROR,
            owner="python")
        alarm.to_json("alarms.json")
        return 1
    else:
        return 0
Example #19
def get_barcode_sample_mappings(ds):
    barcoded_samples = []
    for collection in ds.metadata.collections:
        for bioSample in collection.wellSample.bioSamples:
            for dnaBc in bioSample.DNABarcodes:
                barcoded_samples.append((dnaBc.name, bioSample.name))
    # recover the original barcode FASTA file so we can map the barcode
    # indices in the BAM file to the labels
    bc_sets = {
        extRes.barcodes
        for extRes in ds.externalResources if extRes.barcodes is not None
    }
    if len(bc_sets) > 1:
        log.warning(
            "Multiple BarcodeSets detected - further processing skipped.")
    elif len(bc_sets) == 0:
        log.warning(
            "Can't find original BarcodeSet - further processing skipped.")
    else:
        with BarcodeSet(list(bc_sets)[0]) as bcs:
            labels = [rec.id for rec in bcs]
            bam_bc = set()  # barcode labels actually present in BAM files
            for rr in ds.resourceReaders():

                def mk_lbl(i, j):
                    return "{}--{}".format(labels[i], labels[j])

                for fw, rev in zip(rr.pbi.bcForward, rr.pbi.bcReverse):
                    if fw == -1 or rev == -1:
                        continue
                    bam_bc.add(mk_lbl(fw, rev))
            bc_filtered = []
            bc_with_sample = set()
            # exclude barcodes from XML that are not present in BAM
            for bc_label, bio_sample in barcoded_samples:
                bc_with_sample.add(bc_label)
                if bc_label not in bam_bc:
                    log.info("Leaving out %s (not present in BAM files)",
                             bc_label)
                else:
                    bc_filtered.append((bc_label, bio_sample))
            # add barcodes that are in the BAM but not the XML metadata
            for bc_label in list(bam_bc):
                if bc_label not in bc_with_sample:
                    log.info("Adding barcode %s with unknown sample", bc_label)
                    bc_filtered.append((bc_label, "unknown"))
            barcoded_samples = bc_filtered
    return dict(barcoded_samples)
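The index-to-label translation performed above (pbi.bcForward and pbi.bcReverse hold integer indices into the BarcodeSet, with -1 meaning unbarcoded) can be sketched on its own; the record IDs below are hypothetical.

# Hypothetical barcode record IDs, in BarcodeSet order.
labels = ["lbc1", "lbc2", "lbc3"]

def mk_lbl(fw, rev):
    # Same "<forward>--<reverse>" convention as in the function above.
    return "{}--{}".format(labels[fw], labels[rev])

pairs = [(0, 0), (2, 2), (-1, -1)]
print([mk_lbl(fw, rev) for fw, rev in pairs if fw != -1 and rev != -1])
# ['lbc1--lbc1', 'lbc3--lbc3']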
Example #20
def _run_bam_to_fastx(program_name, fastx_reader, fastx_writer,
                      input_file_name, output_file_name, tmp_dir=None,
                      seqid_prefix=None, subreads_in=None):
    """
    Converts a dataset to a set of fastx file, possibly archived.
    Can take a subreadset or consensusreadset as input.
    Will convert to either fasta or fastq.
    If the dataset is barcoded, it will split the fastx files per-barcode.
    If the output file is .zip, the fastx file(s) will be archived accordingly.
    """
    assert isinstance(program_name, str)
    barcode_mode = False
    barcode_sets = set()
    output_is_archive = (output_file_name.endswith(".zip") or
                         output_file_name.endswith(".tar.gz") or
                         output_file_name.endswith(".tgz"))
    if output_is_archive:
        with openDataSet(input_file_name) as ds_in:
            barcode_mode = ds_in.isBarcoded
            if barcode_mode:
                # attempt to collect the labels of barcodes used on this
                # dataset.  assumes that all BAM files used the same barcodes
                for bam in ds_in.externalResources:
                    if bam.barcodes is not None:
                        barcode_sets.add(bam.barcodes)
    barcode_labels = []
    bio_samples_to_bc = None
    if barcode_mode:
        if len(barcode_sets) == 1:
            bc_file = list(barcode_sets)[0]
            log.info("Reading barcode labels from %s", bc_file)
            try:
                with BarcodeSet(bc_file) as bc_in:
                    for bc in bc_in:
                        barcode_labels.append(bc.id)
            except IOError as e:
                log.error("Can't read %s", bc_file)
                log.error(e)
        elif len(barcode_sets) > 1:
            log.warning("Multiple barcode sets used for this SubreadSet:")
            for fn in sorted(list(barcode_sets)):
                log.warning("  %s", fn)
        else:
            log.info("No barcode labels available")
        if subreads_in is not None:
            bio_samples_to_bc = {}
            with SubreadSet(subreads_in, strict=True) as subread_ds:
                if subread_ds.isBarcoded:  # pylint: disable=no-member
                    bio_samples_to_bc = get_barcode_sample_mappings(subread_ds)
    base_ext = re.sub("bam2", ".", program_name)
    suffix = "{f}.gz".format(f=base_ext)
    tmp_out_dir = tempfile.mkdtemp(dir=tmp_dir)
    tmp_out_prefix = op.join(tmp_out_dir, "tmp_fastx")
    args = [
        program_name,
        "-o", tmp_out_prefix,
        input_file_name,
    ]
    if barcode_mode:
        args.insert(1, "--split-barcodes")
    if seqid_prefix is not None:
        args.extend(["--seqid-prefix", pipes.quote(seqid_prefix)])
    log.info(" ".join(args))
    remove_files = []
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)

    def _is_fastx_file(fn):
        return fn.startswith(tmp_out_prefix) and fn.endswith(suffix)

    try:
        assert result.exit_code == 0, "{p} exited with code {c}".format(
            p=program_name, c=result.exit_code)
        if output_is_archive:
            tc_out_dir = op.dirname(output_file_name)
            fastx_file_names = []
            # find the barcoded FASTX files and un-gzip them to the same
            # output directory and file prefix as the ultimate output
            for fn in walker(tmp_out_dir, _is_fastx_file):
                if barcode_mode:
                    # bam2fastx outputs files with the barcode indices
                    # encoded in the file names; here we attempt to
                    # translate these to barcode labels, falling back on
                    # the original indices if necessary
                    bc_fwd_rev = fn.split(".")[-3].split("_")
                    bc_label = "unbarcoded"
                    if (bc_fwd_rev != ["65535", "65535"] and
                            bc_fwd_rev != ["-1", "-1"]):
                        def _label_or_none(x):
                            try:
                                bc = int(x)
                                if bc < 0:
                                    return "none"
                                elif bc < len(barcode_labels):
                                    return barcode_labels[bc]
                            except ValueError as e:
                                pass
                            return x
                        bc_fwd_label = _label_or_none(bc_fwd_rev[0])
                        bc_rev_label = _label_or_none(bc_fwd_rev[1])
                        bc_label = "{f}--{r}".format(f=bc_fwd_label,
                                                     r=bc_rev_label)
                    suffix2 = ".{l}{t}".format(l=bc_label, t=base_ext)
                    if bio_samples_to_bc is not None:
                        sample = bio_samples_to_bc.get(bc_label, "unknown")
                        suffix2 = ".{}".format(sample) + suffix2
                else:
                    suffix2 = base_ext
                base = re.sub(".zip$", "",
                              re.sub(".tar.gz", "",
                                     re.sub(".tgz", "",
                                            op.basename(output_file_name))))
                fn_out = base
                if not fn_out.endswith(suffix2):
                    fn_out = re.sub(base_ext, suffix2, fn_out)
                fastx_out = op.join(tc_out_dir, fn_out)
                _ungzip_fastx(fn, fastx_out)
                fastx_file_names.append(fastx_out)
                remove_files.append(fn)
            assert len(fastx_file_names) > 0
            remove_files.extend(fastx_file_names)
            return archive_files(fastx_file_names, output_file_name)
        else:
            tmp_out = "{p}{b}.gz".format(p=tmp_out_prefix, b=base_ext)
            _ungzip_fastx(tmp_out, output_file_name)
            remove_files = [tmp_out]
    finally:
        for fn in remove_files:
            os.remove(fn)
    return 0
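A minimal sketch of how the converter above might be invoked, assuming FastaReader and FastaWriter from pbcore.io; the file names are hypothetical, and writing to a .zip target is what enables the per-barcode splitting and archiving paths shown above when the input dataset is barcoded.

from pbcore.io import FastaReader, FastaWriter

# Hypothetical inputs: a barcoded dataset plus the original SubreadSet used
# to recover barcode-to-biosample mappings.
exit_code = _run_bam_to_fastx("bam2fasta", FastaReader, FastaWriter,
                              "barcoded.consensusreadset.xml",
                              "reads.fasta.zip",
                              subreads_in="movie.subreadset.xml")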