def estimate_lima_memory(barcodes, dataset, symmetric):
    """
    Returns estimated system memory for lima, with a minimum of 2GB
    (which is wildly excessive).  Memory is increased only if there are
    more than 500 unique barcode pairs being identified.
    """
    n_barcodes = len(BarcodeSet(barcodes, strict=True))
    if symmetric and n_barcodes < 500:
        log.info("symmetric with {n} barcodes, will use default memory".format(
            n=n_barcodes))
        return DEFAULT_MEM_GB
    else:
        with openDataSet(dataset, skipCounts=True) as reads:
            bioSampleBarcodes = []
            for coll in reads.metadata.collections:
                for bioSample in coll.wellSample.bioSamples:
                    bioSampleBarcodes.extend(bioSample.DNABarcodes)
            n_bc_pairs = len(bioSampleBarcodes)
            if n_bc_pairs == 0:
                log.warning("No biosamples defined, assuming all barcodes used")
                n_bc_pairs = n_barcodes
                if not symmetric:
                    n_bc_pairs *= n_bc_pairs
            if 0 < n_bc_pairs < 500:
                log.info(
                    "only {n} sample barcodes, will use default memory".format(
                        n=n_bc_pairs))
                return DEFAULT_MEM_GB
            else:
                bam_files = [er.bam for er in reads.externalResources]
                bam_size_bytes = sum([op.getsize(f) for f in bam_files])
                bam_size_gb = int(math.ceil(bam_size_bytes / 1024**3))
                log.info("guessing memory from total BAM size ({m} GB)".format(
                    m=bam_size_gb))
                return DEFAULT_MEM_GB + (bam_size_gb * BAM_COMPRESSION_FACTOR)

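# A worked example of the fallback arithmetic above. DEFAULT_MEM_GB and
# BAM_COMPRESSION_FACTOR are module-level constants not shown in this
# section; the values below are hypothetical stand-ins.
DEFAULT_MEM_GB = 2
BAM_COMPRESSION_FACTOR = 5

# For a run with >500 barcode pairs whose BAM files total ~10 GB:
bam_size_gb = 10
mem_gb = DEFAULT_MEM_GB + (bam_size_gb * BAM_COMPRESSION_FACTOR)
print(mem_gb)  # 52

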
def _labels_reads_iterator(reads, barcodes, subreads=True):
    with openDataSet(reads) as ds:
        for er in ds.externalResources:
            if er.barcodes != barcodes:
                raise ValueError("Mismatch between external resource " +
                                 "barcodes and input BarcodeSet: " +
                                 "{a} != {b}".format(a=er.barcodes, b=barcodes))
        assert ds.isIndexed
        zmws_by_barcode = defaultdict(set)
        reads_by_zmw = defaultdict(list)
        for rr in ds.resourceReaders():
            for i, (b, z, q) in enumerate(zip(rr.pbi.bcForward,
                                              rr.pbi.holeNumber,
                                              rr.pbi.qId)):
                movie = rr.readGroupInfo(q).MovieName
                zmws_by_barcode[b].add((movie, z))
                reads_by_zmw[(movie, z)].append((rr, i))
        with BarcodeSet(barcodes) as bc:
            for i_bc, barcode in enumerate(bc):
                zmws = sorted(list(zmws_by_barcode[i_bc]))
                for (movie, zmw) in zmws:
                    for rr, i_read in reads_by_zmw[(movie, zmw)]:
                        # FIXME(nechols)(2016-03-15) this will not work on CCS
                        qlen = rr.pbi.qEnd[i_read] - rr.pbi.qStart[i_read]
                        barcode_id = "{f}--{r}".format(
                            f=rr.pbi.bcForward[i_read],
                            r=rr.pbi.bcReverse[i_read])
                        yield barcode_id, barcode, ["n"] * qlen

def _load_files_for_update(input_reads,
                           barcode_set,
                           datastore_file,
                           require_file_id="barcoding.tasks.lima-0"):
    barcode_names = []
    with BarcodeSet(barcode_set) as bc_in:
        for rec in bc_in:
            barcode_names.append(rec.id)
    parent_ds = openDataSet(input_reads)
    parent_info = (parent_ds.uuid, parent_ds.datasetType, parent_ds.name)
    update_files = []
    for f in iterate_datastore_read_set_files(datastore_file):
        if require_file_id is None or f.file_id == require_file_id:
            update_files.append(f)
    bio_samples_d = {}
    barcode_uuids_d = {}

    def _update_from_sample(bio_sample):
        for dnabc in bio_sample.DNABarcodes:
            bio_samples_d[dnabc.name] = bio_sample.name
            barcode_uuids_d[dnabc.name] = dnabc.uniqueId

    for collection in parent_ds.metadata.collections:
        for bio_sample in collection.wellSample.bioSamples:
            _update_from_sample(bio_sample)
        break
    if len(parent_ds.metadata.collections) == 0:
        log.warning("No CollectionMetadata, looking for BioSamples elsewhere")
        for bio_sample in parent_ds.metadata.bioSamples:
            _update_from_sample(bio_sample)
    return barcode_names, bio_samples_d, barcode_uuids_d, update_files, parent_info

def run_to_report(ds_bc_file,
                  barcodes_file,
                  subreads_in_file,
                  base_dir=None,
                  isoseq_mode=False):
    """
    Generate a Report instance from a SubreadSet and BarcodeSet.
    """
    barcoded_reads = get_barcoded_dataset(ds_bc_file)
    subreads_in = SubreadSet(subreads_in_file, strict=True)
    barcodes = BarcodeSet(barcodes_file)
    ds_bc_uuids = [barcoded_reads.uuid]
    if len(barcoded_reads.subdatasets) > 0:
        ds_bc_uuids = [ds.uuid for ds in barcoded_reads.subdatasets]
    dataset_uuids = [barcodes.uuid, subreads_in.uuid] + ds_bc_uuids
    biosamples = get_biosample_dict(barcoded_reads)
    read_info = list(iter_reads_by_barcode(barcoded_reads, barcodes,
                                           isoseq_mode)) + \
        list(get_unbarcoded_reads_info(subreads_in, barcoded_reads))
    if isinstance(barcoded_reads, SubreadSet):
        return make_report(biosamples=biosamples,
                           read_info=read_info,
                           dataset_uuids=dataset_uuids,
                           base_dir=base_dir)
    else:
        return make_report_ccs(biosamples=biosamples,
                               read_info=read_info,
                               dataset_uuids=dataset_uuids,
                               base_dir=base_dir)

def run_bam_to_bam(subread_set_file,
                   barcode_set_file,
                   output_file_name,
                   nproc=1,
                   score_mode="symmetric"):
    if score_mode not in ["asymmetric", "symmetric"]:
        raise ValueError("Unrecognized score mode '{m}'".format(m=score_mode))
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError(
            "Multi-FASTA BarcodeSet input is not supported.")
    new_prefix = re.sub(".subreadset.xml$", "", output_file_name)
    args = [
        "bam2bam",
        "-j", str(nproc),
        "-b", str(nproc),
        "-o", new_prefix,
        "--barcodes", barcode_set_file,
        "--scoreMode", score_mode,
        subread_set_file
    ]
    log.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    assert op.isfile(output_file_name)
    tmp_out = op.join(op.dirname(output_file_name),
                      "tmp_" + op.basename(output_file_name))
    shutil.move(output_file_name, tmp_out)
    with SubreadSet(tmp_out, strict=True) as ds:
        with SubreadSet(subread_set_file) as ds_in:
            ds.metadata = ds_in.metadata
            ds.name = ds_in.name + " (barcoded)"
        ds.updateCounts()
        ds.newUuid()
        ds.write(output_file_name)
    return 0

def _make_barcode_table(bam_stats, ccs_set):
    """
    Generate a table of per-barcode results
    """
    barcode_counts = defaultdict(int)
    barcode_nbases = defaultdict(int)
    barcode_npasses = defaultdict(list)
    barcode_readscores = defaultdict(list)
    is_symmetric = all([r.bcForward == r.bcReverse for r in bam_stats])
    for r in bam_stats:
        key = r.bcForward
        if not is_symmetric:
            key = (r.bcForward, r.bcReverse)
        barcode_counts[key] += 1
        barcode_nbases[key] += r.qLen
        barcode_npasses[key].append(r.numPasses)
        barcode_readscores[key].append(r.readScore)
    barcode_labels = {}
    for er in ccs_set.externalResources:
        bcs = er.barcodes
        if bcs is not None:
            with BarcodeSet(bcs) as bc_set:
                for i_bc, rec in enumerate(bc_set):
                    if i_bc in barcode_labels:
                        assert barcode_labels[i_bc] == rec.id, \
                            "Barcode ID mismatch: {l} versus {r}".format(
                                l=barcode_labels[i_bc], r=rec.id)
                    else:
                        barcode_labels[i_bc] = rec.id
    barcode_ids = sorted(barcode_counts.keys())
    counts = [barcode_counts[i_bc] for i_bc in barcode_ids]
    nbases = [barcode_nbases[i_bc] for i_bc in barcode_ids]
    mean_length = [int(float(n) / c) for (c, n) in zip(counts, nbases)]
    labels = []
    for i_bc in barcode_ids:
        if is_symmetric:
            labels.append(barcode_labels.get(i_bc, Constants.NO_BC_LABEL))
        else:
            labels.append("{f}, {r}".format(
                f=barcode_labels.get(i_bc[0], Constants.NO_BC_LABEL),
                r=barcode_labels.get(i_bc[1], Constants.NO_BC_LABEL)))
    npasses = [
        sum(barcode_npasses[i_bc]) / len(barcode_npasses[i_bc])
        for i_bc in barcode_ids
    ]
    readquals = [
        sum(barcode_readscores[i_bc]) / len(barcode_readscores[i_bc])
        for i_bc in barcode_ids
    ]
    assert len(labels) == len(counts) == len(nbases)
    columns = [
        Column(Constants.C_BARCODE_ID, values=labels),
        Column(Constants.C_BARCODE_COUNTS, values=counts),
        Column(Constants.C_BARCODE_NBASES, values=nbases),
        Column(Constants.C_BARCODE_READLENGTH, values=mean_length),
        Column(Constants.C_BARCODE_QUALITY, values=readquals),
        Column(Constants.C_BARCODE_NPASSES, values=npasses)
    ]
    return Table(Constants.T_BARCODES, columns=columns)

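# A minimal sketch of the symmetric/asymmetric keying above, using a
# made-up stand-in for the per-read pbi stats (Rec is illustrative only):
from collections import namedtuple

Rec = namedtuple("Rec", ["bcForward", "bcReverse"])
records = [Rec(0, 1), Rec(2, 2)]

# any forward/reverse mismatch forces per-pair (tuple) keys
is_symmetric = all(r.bcForward == r.bcReverse for r in records)
keys = [r.bcForward if is_symmetric else (r.bcForward, r.bcReverse)
        for r in records]
print(is_symmetric, keys)  # False [(0, 1), (2, 2)]

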
def _make_barcodeset(self, fasta_str):
    tmp_fasta = tempfile.NamedTemporaryFile(suffix=".fasta").name
    with open(tmp_fasta, "w") as fasta_out:
        fasta_out.write(fasta_str)
    tmp_bc = tempfile.NamedTemporaryFile(suffix=".barcodeset.xml").name
    with BarcodeSet(tmp_fasta, generateIndices=True) as ds_bc:
        ds_bc.write(tmp_bc)
    return tmp_bc

def run_bam_to_bam(subread_set_file,
                   barcode_set_file,
                   output_file_name,
                   nproc=1,
                   score_mode="symmetric"):
    if score_mode not in ["asymmetric", "symmetric"]:
        raise ValueError("Unrecognized score mode '{m}'".format(m=score_mode))
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError(
            "Multi-FASTA BarcodeSet input is not supported.")
    barcode_fasta = bc.toExternalFiles()[0]
    with SubreadSet(subread_set_file) as ds:
        ds_new = SubreadSet(strict=True)
        for ext_res in ds.externalResources:
            subreads_bam = ext_res.bam
            scraps_bam = ext_res.scraps
            assert subreads_bam is not None
            if scraps_bam is None:
                raise TypeError("The input SubreadSet must include scraps.")
            new_prefix = op.join(
                op.dirname(output_file_name),
                re.sub(".subreads.bam", "_barcoded", op.basename(subreads_bam)))
            if not op.isabs(subreads_bam):
                subreads_bam = op.join(op.dirname(subread_set_file),
                                       subreads_bam)
            if not op.isabs(scraps_bam):
                scraps_bam = op.join(op.dirname(subread_set_file), scraps_bam)
            args = [
                "bam2bam",
                "-j", str(nproc),
                "-b", str(nproc),
                "-o", new_prefix,
                "--barcodes", barcode_fasta,
                "--scoreMode", score_mode,
                subreads_bam, scraps_bam
            ]
            log.info(" ".join(args))
            result = run_cmd(" ".join(args),
                             stdout_fh=sys.stdout,
                             stderr_fh=sys.stderr)
            if result.exit_code != 0:
                return result.exit_code
            subreads_bam = new_prefix + ".subreads.bam"
            scraps_bam = new_prefix + ".scraps.bam"
            assert op.isfile(subreads_bam), "Missing {f}".format(f=subreads_bam)
            add_subread_resources(ds_new,
                                  subreads=subreads_bam,
                                  scraps=scraps_bam,
                                  barcodes=barcode_set_file)
        ds._filters.clearCallbacks()
        ds_new._filters = ds._filters
        ds_new._populateMetaTypes()
        ds_new.metadata = ds.metadata
        ds_new.name = ds.name + " (barcoded)"
        ds_new.updateCounts()
        ds_new.newUuid()
        ds_new.write(output_file_name)
    return 0

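# For reference, the argument list assembled above renders to a command
# line like the following (file names are hypothetical):
nproc = 8
example_args = [
    "bam2bam", "-j", str(nproc), "-b", str(nproc),
    "-o", "movie_barcoded",
    "--barcodes", "barcodes.fasta",
    "--scoreMode", "symmetric",
    "movie.subreads.bam", "movie.scraps.bam",
]
print(" ".join(example_args))
# bam2bam -j 8 -b 8 -o movie_barcoded --barcodes barcodes.fasta
#   --scoreMode symmetric movie.subreads.bam movie.scraps.bam

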
def test_barcodeset(self):
    fa_out = tempfile.NamedTemporaryFile(suffix=".fasta").name
    with open(fa_out, "w") as f:
        f.write(">bc1\nAAAAAAAAAAAAAAAA\n>bc2\nCCCCCCCCCCCCCCCC")
    ds = BarcodeSet(fa_out)
    ds.induceIndices()
    assert [r.id for r in ds] == ["bc1", "bc2"]
    ds_out = tempfile.NamedTemporaryFile(suffix=".barcodeset.xml").name
    ds.write(ds_out)

def _make_barcode_table(bam_stats, ccs_set):
    """
    Generate a table of per-barcode results
    """
    barcode_counts = defaultdict(int)
    barcode_nbases = defaultdict(int)
    barcode_npasses = defaultdict(list)
    barcode_readscores = defaultdict(list)
    for r in bam_stats:
        barcode_counts[r.bc] += 1
        barcode_nbases[r.bc] += r.qLen
        barcode_npasses[r.bc].append(r.numPasses)
        barcode_readscores[r.bc].append(r.readScore)
    barcode_labels = {}
    for er in ccs_set.externalResources:
        bcs = er.barcodes
        if bcs is not None:
            with BarcodeSet(bcs) as bc_set:
                for i_bc, rec in enumerate(bc_set):
                    if i_bc in barcode_labels:
                        assert barcode_labels[i_bc] == rec.id, \
                            "Barcode ID mismatch: {l} versus {r}".format(
                                l=barcode_labels[i_bc], r=rec.id)
                    else:
                        barcode_labels[i_bc] = rec.id
    barcode_ids = sorted(barcode_counts.keys())
    counts = [barcode_counts[i_bc] for i_bc in barcode_ids]
    nbases = [barcode_nbases[i_bc] for i_bc in barcode_ids]
    mean_length = [int(float(n) / c) for (c, n) in zip(counts, nbases)]
    labels = [str(barcode_labels.get(i_bc, i_bc)) for i_bc in barcode_ids]
    npasses = [
        sum(barcode_npasses[i_bc]) / len(barcode_npasses[i_bc])
        for i_bc in barcode_ids
    ]
    readquals = [
        sum(barcode_readscores[i_bc]) / len(barcode_readscores[i_bc])
        for i_bc in barcode_ids
    ]
    assert len(labels) == len(counts) == len(nbases)
    columns = [
        Column(Constants.C_BARCODE_ID, values=labels, header="Barcode ID"),
        Column(Constants.C_BARCODE_COUNTS, values=counts, header="CCS reads"),
        Column(Constants.C_BARCODE_NBASES, values=nbases,
               header="Number of CCS bases"),
        Column(Constants.C_BARCODE_READLENGTH, values=mean_length,
               header="CCS Read Length (mean)"),
        Column(Constants.C_BARCODE_QUALITY, values=readquals,
               header="CCS Read Score (mean)"),
        Column(Constants.C_BARCODE_NPASSES, values=npasses,
               header="Number of Passes (mean)")
    ]
    return Table(Constants.T_BARCODES, columns=columns, title="By Barcode")

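# One caveat in the mean computations above: sum(...) / len(...) truncates
# under Python 2 when the inputs are integers (as numPasses is). A
# float-safe equivalent, shown with made-up values:
num_passes = [3, 4, 4]
mean_passes = float(sum(num_passes)) / len(num_passes)
print(mean_passes)  # ~3.67, rather than 3 under Python 2 integer division

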
def resolved_tool_contract_runner(rtc):
    log.info("Starting {f} version {v} report generation".format(
        f=__file__, v=__version__))
    dataset_uuids = [
        openDataSet(rtc.task.input_files[0]).uuid,
        BarcodeSet(rtc.task.input_files[1]).uuid
    ]
    report = run_to_report(reads=rtc.task.input_files[0],
                           barcodes=rtc.task.input_files[1],
                           subreads=True,
                           dataset_uuids=dataset_uuids)
    log.info(pformat(report.to_dict()))
    report.write_json(rtc.task.output_files[0])
    return 0

def test_iter_reads_by_barcode(self):
    datasets = [
        SubreadSet(self.subreads),
        get_barcoded_dataset(_make_datastore(self.subreads))
    ]
    barcodes = BarcodeSet(self.barcodes)
    for ds in datasets:
        # sort descending by nbases (Python 3 key function replacing the
        # old Python 2 cmp-style lambda)
        table = sorted(iter_reads_by_barcode(ds, barcodes),
                       key=lambda a: a.nbases,
                       reverse=True)
        self.assertEqual([r.label for r in table],
                         ["Not Barcoded", "lbc1--lbc1", "lbc3--lbc3"])
        self.assertEqual([r.idx for r in table], ["None", "0--0", "2--2"])
        self.assertEqual([r.nbases for r in table], [9791, 1436, 204])
        self.assertEqual([r.n_subreads for r in table], [1, 1, 1])

def _run_args(args):
    ds = BarcodeSet(args.barcodeset, strict=True)
    if len(ds) > 1:
        alarm = PacBioAlarm(
            exception=None,
            info=None,
            message="This application currently only supports a single "
                    "PCR primer. To trim adapters when multiple primers "
                    "are used, please run lima on the command line.",
            name="Multiple Primers",
            severity=logging.ERROR,
            owner="python")
        alarm.to_json("alarms.json")
        return 1
    else:
        return 0

def get_barcode_sample_mappings(ds):
    barcoded_samples = []
    for collection in ds.metadata.collections:
        for bioSample in collection.wellSample.bioSamples:
            for dnaBc in bioSample.DNABarcodes:
                barcoded_samples.append((dnaBc.name, bioSample.name))
    # recover the original barcode FASTA file so we can map the barcode
    # indices in the BAM file to the labels
    bc_sets = {
        extRes.barcodes
        for extRes in ds.externalResources if extRes.barcodes is not None
    }
    if len(bc_sets) > 1:
        log.warning(
            "Multiple BarcodeSets detected - further processing skipped.")
    elif len(bc_sets) == 0:
        log.warning(
            "Can't find original BarcodeSet - further processing skipped.")
    else:
        with BarcodeSet(list(bc_sets)[0]) as bcs:
            labels = [rec.id for rec in bcs]
            bam_bc = set()  # barcode labels actually present in BAM files
            for rr in ds.resourceReaders():

                def mk_lbl(i, j):
                    return "{}--{}".format(labels[i], labels[j])

                for fw, rev in zip(rr.pbi.bcForward, rr.pbi.bcReverse):
                    if fw == -1 or rev == -1:
                        continue
                    bam_bc.add(mk_lbl(fw, rev))
            bc_filtered = []
            bc_with_sample = set()
            # exclude barcodes from XML that are not present in BAM
            for bc_label, bio_sample in barcoded_samples:
                bc_with_sample.add(bc_label)
                if bc_label not in bam_bc:
                    log.info("Leaving out %s (not present in BAM files)",
                             bc_label)
                else:
                    bc_filtered.append((bc_label, bio_sample))
            # add barcodes that are in the BAM but not the XML metadata
            for bc_label in list(bam_bc):
                if bc_label not in bc_with_sample:
                    log.info("Adding barcode %s with unknown sample", bc_label)
                    bc_filtered.append((bc_label, "unknown"))
            barcoded_samples = bc_filtered
    return dict(barcoded_samples)

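# A standalone sketch of the mk_lbl label construction above, with a
# hypothetical label list; "forward--reverse" is the label format used
# throughout the barcoding code:
labels = ["lbc1", "lbc2", "lbc3"]

def mk_lbl(i, j):
    return "{}--{}".format(labels[i], labels[j])

assert mk_lbl(0, 0) == "lbc1--lbc1"
assert mk_lbl(0, 1) == "lbc1--lbc2"

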
def _run_bam_to_fastx(program_name,
                      fastx_reader,
                      fastx_writer,
                      input_file_name,
                      output_file_name,
                      tmp_dir=None,
                      seqid_prefix=None,
                      subreads_in=None):
    """
    Converts a dataset to a set of fastx files, possibly archived.
    Can take a subreadset or consensusreadset as input.
    Will convert to either fasta or fastq.
    If the dataset is barcoded, it will split the fastx files per-barcode.
    If the output file is .zip, the fastx file(s) will be archived accordingly.
    """
    assert isinstance(program_name, str)
    barcode_mode = False
    barcode_sets = set()
    output_is_archive = (output_file_name.endswith(".zip") or
                         output_file_name.endswith(".tar.gz") or
                         output_file_name.endswith(".tgz"))
    if output_is_archive:
        with openDataSet(input_file_name) as ds_in:
            barcode_mode = ds_in.isBarcoded
            if barcode_mode:
                # attempt to collect the labels of barcodes used on this
                # dataset.  assumes that all BAM files used the same barcodes
                for bam in ds_in.externalResources:
                    if bam.barcodes is not None:
                        barcode_sets.add(bam.barcodes)
    barcode_labels = []
    bio_samples_to_bc = None
    if barcode_mode:
        if len(barcode_sets) == 1:
            bc_file = list(barcode_sets)[0]
            log.info("Reading barcode labels from %s", bc_file)
            try:
                with BarcodeSet(bc_file) as bc_in:
                    for bc in bc_in:
                        barcode_labels.append(bc.id)
            except IOError as e:
                log.error("Can't read %s", bc_file)
                log.error(e)
        elif len(barcode_sets) > 1:
            log.warning("Multiple barcode sets used for this SubreadSet:")
            for fn in sorted(list(barcode_sets)):
                log.warning("  %s", fn)
        else:
            log.info("No barcode labels available")
        if subreads_in is not None:
            bio_samples_to_bc = {}
            with SubreadSet(subreads_in, strict=True) as subread_ds:
                if subread_ds.isBarcoded:  # pylint: disable=no-member
                    bio_samples_to_bc = get_barcode_sample_mappings(subread_ds)
    base_ext = re.sub("bam2", ".", program_name)
    suffix = "{f}.gz".format(f=base_ext)
    tmp_out_dir = tempfile.mkdtemp(dir=tmp_dir)
    tmp_out_prefix = op.join(tmp_out_dir, "tmp_fastx")
    args = [
        program_name,
        "-o", tmp_out_prefix,
        input_file_name,
    ]
    if barcode_mode:
        args.insert(1, "--split-barcodes")
    if seqid_prefix is not None:
        args.extend(["--seqid-prefix", pipes.quote(seqid_prefix)])
    log.info(" ".join(args))
    remove_files = []
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)

    def _is_fastx_file(fn):
        return fn.startswith(tmp_out_prefix) and fn.endswith(suffix)

    try:
        assert result.exit_code == 0, "{p} exited with code {c}".format(
            p=program_name, c=result.exit_code)
        if output_is_archive:
            tc_out_dir = op.dirname(output_file_name)
            fastx_file_names = []
            # find the barcoded FASTX files and un-gzip them to the same
            # output directory and file prefix as the ultimate output
            for fn in walker(tmp_out_dir, _is_fastx_file):
                if barcode_mode:
                    # bam2fastx outputs files with the barcode indices
                    # encoded in the file names; here we attempt to
                    # translate these to barcode labels, falling back on
                    # the original indices if necessary
                    bc_fwd_rev = fn.split(".")[-3].split("_")
                    bc_label = "unbarcoded"
                    if (bc_fwd_rev != ["65535", "65535"] and
                            bc_fwd_rev != ["-1", "-1"]):

                        def _label_or_none(x):
                            try:
                                bc = int(x)
                                if bc < 0:
                                    return "none"
                                elif bc < len(barcode_labels):
                                    return barcode_labels[bc]
                            except ValueError:
                                pass
                            return x

                        bc_fwd_label = _label_or_none(bc_fwd_rev[0])
                        bc_rev_label = _label_or_none(bc_fwd_rev[1])
                        bc_label = "{f}--{r}".format(f=bc_fwd_label,
                                                     r=bc_rev_label)
                    suffix2 = ".{l}{t}".format(l=bc_label, t=base_ext)
                    if bio_samples_to_bc is not None:
                        sample = bio_samples_to_bc.get(bc_label, "unknown")
                        suffix2 = ".{}".format(sample) + suffix2
                else:
                    suffix2 = base_ext
                base = re.sub(".zip$", "",
                              re.sub(".tar.gz", "",
                                     re.sub(".tgz", "",
                                            op.basename(output_file_name))))
                fn_out = base
                if not fn_out.endswith(suffix2):
                    fn_out = re.sub(base_ext, suffix2, fn_out)
                fastx_out = op.join(tc_out_dir, fn_out)
                _ungzip_fastx(fn, fastx_out)
                fastx_file_names.append(fastx_out)
                remove_files.append(fn)
            assert len(fastx_file_names) > 0
            remove_files.extend(fastx_file_names)
            return archive_files(fastx_file_names, output_file_name)
        else:
            tmp_out = "{p}{b}.gz".format(p=tmp_out_prefix, b=base_ext)
            _ungzip_fastx(tmp_out, output_file_name)
            remove_files = [tmp_out]
    finally:
        for fn in remove_files:
            os.remove(fn)
    return 0
