def combine_sample_regions(*samples):
    """Attach batch-level callable region files to every sample.

    Non-callable (nblock) regions of all samples in a batch are intersected
    (delegated to ``_combine_sample_regions_batch``) to give one shared set of
    callable regions per batch. A pre-existing ``analysis_blocks.bed`` in the
    work directory is reused for all batches unless ``_needs_region_update``
    reports it as stale.

    Each sample's ``config -> algorithm`` section is updated in place with
    ``callable_regions``, ``non_callable_regions`` and ``callable_count``.
    When a batch has no analysis file but the sample defines
    ``variant_regions``, only ``callable_count`` is set, from that file.

    Returns a list holding one single-element list ``[data]`` per sample.
    """
    first_sample = samples[0]
    # back compatibility -- one global file covering the entire sample set
    shared_callable = os.path.join(first_sample["dirs"]["work"], "analysis_blocks.bed")
    reuse_shared = (utils.file_exists(shared_callable)
                    and not _needs_region_update(shared_callable, samples))
    if reuse_shared:
        shared_noncallable = os.path.join(os.path.dirname(shared_callable),
                                          "noanalysis_blocks.bed")
    else:
        shared_callable = None

    wrapped = []
    used_region_files = []
    with shared.bedtools_tmpdir(first_sample):
        for batch_name, batch_samples in vmulti.group_by_batch(samples).items():
            if shared_callable:
                callable_bed = shared_callable
                noncallable_bed = shared_noncallable
            else:
                callable_bed, noncallable_bed = _combine_sample_regions_batch(
                    batch_name, batch_samples)
            for data in batch_samples:
                variant_bed = tz.get_in(["config", "algorithm", "variant_regions"], data)
                if callable_bed:
                    used_region_files.append(callable_bed)
                    algorithm = data["config"]["algorithm"]
                    algorithm["callable_regions"] = callable_bed
                    algorithm["non_callable_regions"] = noncallable_bed
                    algorithm["callable_count"] = pybedtools.BedTool(callable_bed).count()
                elif variant_bed:
                    # No callable regions for this batch: fall back to the variant regions
                    data["config"]["algorithm"]["callable_count"] = \
                        pybedtools.BedTool(variant_bed).count()
                wrapped.append([data])
        assert len(wrapped) == len(samples)
        if used_region_files:
            # Statistics are reported for the first region file only
            _analysis_block_stats(pybedtools.BedTool(used_region_files[0]))
    return wrapped
def combine_sample_regions(*samples):
    """Attach batch-level callable region files to every sample.

    Non-callable (nblock) regions of all samples in a batch are intersected
    (delegated to ``_combine_sample_regions_batch``) to give one shared set of
    callable regions per batch. A pre-existing ``analysis_blocks.bed`` in the
    work directory is reused for all batches unless ``_needs_region_update``
    reports it as stale.

    Samples may arrive either as plain dictionaries or wrapped in
    single-element lists/tuples; the wrapped form is unpacked first.

    Each sample is updated in place:

    - ``config -> algorithm`` gets ``callable_regions``,
      ``non_callable_regions`` and ``callable_count`` (or only
      ``callable_count`` from the variant regions when the batch has no
      analysis file).
    - ``regions -> highdepth``, when present, is copied to
      ``config -> algorithm -> highdepth_regions``.
    - samples without ``work_bam`` get ``work_bam_callable`` pointing at a
      BAM from another sample of the same batch.

    Returns a list holding one single-element list ``[data]`` per sample.
    """
    # Unpack nested lists of samples grouped together
    head = samples[0]
    if isinstance(head, (list, tuple)) and len(head) == 1:
        samples = [wrapped_sample[0] for wrapped_sample in samples]

    # back compatibility -- one global file covering the entire sample set
    shared_callable = os.path.join(samples[0]["dirs"]["work"], "analysis_blocks.bed")
    reuse_shared = (utils.file_exists(shared_callable)
                    and not _needs_region_update(shared_callable, samples))
    if reuse_shared:
        shared_noncallable = os.path.join(os.path.dirname(shared_callable),
                                          "noanalysis_blocks.bed")
    else:
        shared_callable = None

    wrapped = []
    used_region_files = []
    seen_batches = []
    with shared.bedtools_tmpdir(samples[0]):
        grouped = vmulti.group_by_batch(samples, require_bam=False)
        for batch_name, batch_samples in grouped.items():
            seen_batches.append(batch_samples)
            if shared_callable:
                callable_bed = shared_callable
                noncallable_bed = shared_noncallable
            else:
                callable_bed, noncallable_bed = _combine_sample_regions_batch(
                    batch_name, batch_samples)
            for data in batch_samples:
                variant_bed = dd.get_variant_regions(data)
                if callable_bed:
                    used_region_files.append(callable_bed)
                    algorithm = data["config"]["algorithm"]
                    algorithm["callable_regions"] = callable_bed
                    algorithm["non_callable_regions"] = noncallable_bed
                    algorithm["callable_count"] = pybedtools.BedTool(callable_bed).count()
                elif variant_bed:
                    # No callable regions for this batch: fall back to the variant regions
                    data["config"]["algorithm"]["callable_count"] = \
                        pybedtools.BedTool(variant_bed).count()
                highdepth = tz.get_in(["regions", "highdepth"], data)
                if highdepth:
                    data["config"]["algorithm"]["highdepth_regions"] = highdepth
                # attach a representative sample for calculating callable region
                if not data.get("work_bam"):
                    # No early exit: the last batch member with a BAM wins
                    for other in batch_samples:
                        other_bam = other.get("work_bam")
                        if other_bam:
                            data["work_bam_callable"] = other_bam
                wrapped.append([data])
        assert len(wrapped) == len(samples)
        if used_region_files:
            # Statistics are reported for the first region file and first batch only
            _analysis_block_stats(pybedtools.BedTool(used_region_files[0]),
                                  seen_batches[0])
    return wrapped
def calculate_sv_bins(*items):
    """Prepare target and antitarget coverage bins for each sample.

    Unified approach to prepare regional bins for coverage calculations across
    multiple CNV callers. Regions are split into target and antitarget sets so
    callers can use both, with consistent bin sizes across batches. Uses
    callable_regions as the access BED file and mosdepth regions in
    variant_regions to estimate depth for bin sizes.

    When no sample uses the general SV bins, the inputs are returned untouched.
    Otherwise each sample gets ``regions -> bins`` set to a dictionary with
    ``target`` and ``antitarget`` BED files, and a list of single-element
    lists ``[data]`` is returned.

    Raises AssertionError if the number of output samples differs from the
    number of inputs.
    """
    from bcbio.structural import cnvkit
    if not any(cnvkit.use_general_sv_bins(utils.to_single_data(x)) for x in items):
        return items
    items = [utils.to_single_data(x) for x in items]
    prepared = []
    for group in _group_by_cnv_method(multi.group_by_batch(items, False)):
        # Bound method of a memoizing helper, shared by every sample in the group
        bin_sizer = MemoizedSizes(group.region_file, group.items)
        size_calc_fn = bin_sizer.get_target_antitarget_bin_sizes
        for data in group.items:
            target_bed, anti_bed = cnvkit.targets_w_bins(
                group.region_file, group.access_file, size_calc_fn,
                group.work_dir, data)
            if not data.get("regions"):
                data["regions"] = {}
            data["regions"]["bins"] = {"target": target_bed, "antitarget": anti_bed}
            prepared.append([data])
    if len(prepared) != len(items):
        names_out = sorted([dd.get_sample_name(utils.to_single_data(x)) for x in prepared])
        names_in = sorted([dd.get_sample_name(x) for x in items])
        raise AssertionError(
            "Inconsistent samples in and out of SV bin calculation:\nout: %s\nin : %s"
            % (names_out, names_in))
    return prepared
def calculate_sv_bins(*items):
    """Prepare target and antitarget coverage bins for each sample.

    Unified approach to prepare regional bins for coverage calculations across
    multiple CNV callers. Regions are split into target and antitarget sets so
    callers can use both, with consistent bin sizes across batches. Uses
    callable_regions as the access BED file and mosdepth regions in
    variant_regions to estimate depth for bin sizes.

    The binning implementation is chosen per sample from ``cnvkit.bin_approach``
    ("cnvkit" or "gatk-cnv"). Samples that use the general SV bins get
    ``regions -> bins`` set to a dictionary with ``target``, ``antitarget``,
    ``group`` (index of the CNV group, as a string) and ``gcannotated``.
    Samples that do not are passed through unchanged.

    Returns a list holding one single-element list ``[data]`` per sample.
    Raises AssertionError if the number of output samples differs from the
    number of inputs.
    """
    calcfns = {"cnvkit": _calculate_sv_bins_cnvkit,
               "gatk-cnv": _calculate_sv_bins_gatk}
    from bcbio.structural import cnvkit
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]
    if not any(cnvkit.use_general_sv_bins(x) for x in items):
        return [[d] for d in items]
    prepared = []
    cnv_groups = _group_by_cnv_method(multi.group_by_batch(items, False))
    for group_index, group in enumerate(cnv_groups):
        # Bound method of a memoizing helper, shared by every sample in the group
        bin_sizer = MemoizedSizes(group.region_file, group.items)
        size_calc_fn = bin_sizer.get_target_antitarget_bin_sizes
        for data in group.items:
            if cnvkit.use_general_sv_bins(data):
                calc_bins = calcfns[cnvkit.bin_approach(data)]
                target_bed, anti_bed, gcannotated_tsv = calc_bins(data, group, size_calc_fn)
                if not data.get("regions"):
                    data["regions"] = {}
                data["regions"]["bins"] = {
                    "target": target_bed,
                    "antitarget": anti_bed,
                    "group": str(group_index),
                    "gcannotated": gcannotated_tsv,
                }
            # Every sample is returned, with or without bins, to keep counts aligned
            prepared.append([data])
    if len(prepared) != len(items):
        names_out = sorted([dd.get_sample_name(utils.to_single_data(x)) for x in prepared])
        names_in = sorted([dd.get_sample_name(x) for x in items])
        raise AssertionError(
            "Inconsistent samples in and out of SV bin calculation:\nout: %s\nin : %s"
            % (names_out, names_in))
    return prepared
def combine_sample_regions(*samples):
    """Create batch-level sets of callable regions for multi-sample calling.

    Intersects all non-callable (nblock) regions from all samples in a batch,
    producing a global set of callable regions. A pre-existing
    ``analysis_blocks.bed`` in the work directory is reused for all batches
    unless ``_needs_region_update`` reports it as stale.

    For batches that have an analysis file, each sample's
    ``config -> algorithm`` section is updated in place with
    ``callable_regions`` and ``non_callable_regions``.

    Returns a list holding one single-element list ``[data]`` per sample.
    """
    # back compatibility -- global file for entire sample set
    global_analysis_file = os.path.join(samples[0]["dirs"]["work"], "analysis_blocks.bed")
    if utils.file_exists(global_analysis_file) and not _needs_region_update(global_analysis_file, samples):
        global_no_analysis_file = os.path.join(os.path.dirname(global_analysis_file),
                                               "noanalysis_blocks.bed")
    else:
        global_analysis_file = None
    out = []
    analysis_files = []
    with shared.bedtools_tmpdir(samples[0]):
        for batch, items in vmulti.group_by_batch(samples).items():
            if global_analysis_file:
                analysis_file, no_analysis_file = global_analysis_file, global_no_analysis_file
            else:
                analysis_file, no_analysis_file = _combine_sample_regions_batch(batch, items)
            for data in items:
                if analysis_file:
                    analysis_files.append(analysis_file)
                    data["config"]["algorithm"]["callable_regions"] = analysis_file
                    data["config"]["algorithm"]["non_callable_regions"] = no_analysis_file
                out.append([data])
        assert len(out) == len(samples)
        # A batch may have no analysis file (the loop above already allows for
        # that), so only report statistics when at least one was collected
        # instead of failing with an IndexError on an empty list.
        if analysis_files:
            final_regions = pybedtools.BedTool(analysis_files[0])
            _analysis_block_stats(final_regions)
    return out
def combine_sample_regions(*samples):
    """Create batch-level sets of callable regions for multi-sample calling.

    Intersects all non-callable (nblock) regions from all samples in a batch,
    producing a global set of callable regions. A pre-existing
    ``analysis_blocks.bed`` in the work directory is reused for all batches
    unless ``_needs_region_update`` reports it as stale.

    Each sample is updated in place:

    - ``config -> algorithm`` gets ``callable_regions``,
      ``non_callable_regions`` and ``callable_count`` (or only
      ``callable_count`` from the variant regions when the batch has no
      analysis file).
    - samples without ``work_bam`` get ``work_bam_callable`` pointing at a
      BAM from another sample of the same batch.

    Returns a list holding one single-element list ``[data]`` per sample,
    ordered like the (unpacked) input samples.
    """
    samples = utils.unpack_worlds(samples)
    samples = cwlutils.unpack_tarballs(samples, samples[0])
    # back compatibility -- global file for entire sample set
    global_analysis_file = os.path.join(samples[0]["dirs"]["work"], "analysis_blocks.bed")
    if utils.file_exists(global_analysis_file) and not _needs_region_update(global_analysis_file, samples):
        global_no_analysis_file = os.path.join(os.path.dirname(global_analysis_file),
                                               "noanalysis_blocks.bed")
    else:
        global_analysis_file = None
    out = []
    analysis_files = []
    batches = []
    with shared.bedtools_tmpdir(samples[0]):
        for batch, items in vmulti.group_by_batch(samples, require_bam=False).items():
            batches.append(items)
            if global_analysis_file:
                analysis_file, no_analysis_file = global_analysis_file, global_no_analysis_file
            else:
                analysis_file, no_analysis_file = _combine_sample_regions_batch(batch, items)
            # The analysis file is shared by the whole batch, so count its
            # intervals once here rather than re-reading it for every sample.
            callable_count = pybedtools.BedTool(analysis_file).count() if analysis_file else None
            for data in items:
                vr_file = dd.get_variant_regions(data)
                if analysis_file:
                    analysis_files.append(analysis_file)
                    data["config"]["algorithm"]["callable_regions"] = analysis_file
                    data["config"]["algorithm"]["non_callable_regions"] = no_analysis_file
                    data["config"]["algorithm"]["callable_count"] = callable_count
                elif vr_file:
                    data["config"]["algorithm"]["callable_count"] = pybedtools.BedTool(vr_file).count()
                # attach a representative sample for calculating callable region
                if not data.get("work_bam"):
                    # No early exit: the last batch member with a BAM wins
                    for x in items:
                        if x.get("work_bam"):
                            data["work_bam_callable"] = x["work_bam"]
                out.append([data])
        # Ensure output order matches input order, consistency for CWL-based runs
        assert len(out) == len(samples)
        sample_indexes = {dd.get_sample_name(d): i for i, d in enumerate(samples)}

        def by_input_index(xs):
            return sample_indexes[dd.get_sample_name(xs[0])]

        out.sort(key=by_input_index)
        if len(analysis_files) > 0:
            # Statistics are reported for the first region file and first batch only
            final_regions = pybedtools.BedTool(analysis_files[0])
            _analysis_block_stats(final_regions, batches[0])
    return out
def combine_sample_regions(*samples):
    """Attach batch-level callable region files to every sample.

    Non-callable (nblock) regions of all samples in a batch are intersected
    (delegated to ``_combine_sample_regions_batch``) to give one shared set of
    callable regions per batch. A pre-existing ``analysis_blocks.bed`` in the
    work directory is reused for all batches unless ``_needs_region_update``
    reports it as stale.

    Every argument is expected to be a sequence whose first element is the
    sample dictionary; that element is taken unconditionally.

    Each sample is updated in place:

    - ``config -> algorithm`` gets ``callable_regions``,
      ``non_callable_regions`` and ``callable_count`` (or only
      ``callable_count`` from the variant regions when the batch has no
      analysis file).
    - ``regions -> highdepth``, when present, is copied to
      ``config -> algorithm -> highdepth_regions``.
    - samples without ``work_bam`` get ``work_bam_callable`` pointing at a
      BAM from another sample of the same batch.

    Returns a list holding one single-element list ``[data]`` per sample.
    """
    import pybedtools
    samples = [wrapped_sample[0] for wrapped_sample in samples]

    # back compatibility -- one global file covering the entire sample set
    shared_callable = os.path.join(samples[0]["dirs"]["work"], "analysis_blocks.bed")
    reuse_shared = (utils.file_exists(shared_callable)
                    and not _needs_region_update(shared_callable, samples))
    if reuse_shared:
        shared_noncallable = os.path.join(os.path.dirname(shared_callable),
                                          "noanalysis_blocks.bed")
    else:
        shared_callable = None

    wrapped = []
    used_region_files = []
    seen_batches = []
    with shared.bedtools_tmpdir(samples[0]):
        grouped = vmulti.group_by_batch(samples, require_bam=False)
        for batch_name, batch_samples in grouped.items():
            seen_batches.append(batch_samples)
            if shared_callable:
                callable_bed = shared_callable
                noncallable_bed = shared_noncallable
            else:
                callable_bed, noncallable_bed = _combine_sample_regions_batch(
                    batch_name, batch_samples)
            for data in batch_samples:
                variant_bed = dd.get_variant_regions(data)
                if callable_bed:
                    used_region_files.append(callable_bed)
                    algorithm = data["config"]["algorithm"]
                    algorithm["callable_regions"] = callable_bed
                    algorithm["non_callable_regions"] = noncallable_bed
                    algorithm["callable_count"] = pybedtools.BedTool(callable_bed).count()
                elif variant_bed:
                    # No callable regions for this batch: fall back to the variant regions
                    data["config"]["algorithm"]["callable_count"] = \
                        pybedtools.BedTool(variant_bed).count()
                highdepth = tz.get_in(["regions", "highdepth"], data)
                if highdepth:
                    data["config"]["algorithm"]["highdepth_regions"] = highdepth
                # attach a representative sample for calculating callable region
                if not data.get("work_bam"):
                    # No early exit: the last batch member with a BAM wins
                    for other in batch_samples:
                        other_bam = other.get("work_bam")
                        if other_bam:
                            data["work_bam_callable"] = other_bam
                wrapped.append([data])
        assert len(wrapped) == len(samples)
        if used_region_files:
            # Statistics are reported for the first region file and first batch only
            _analysis_block_stats(pybedtools.BedTool(used_region_files[0]),
                                  seen_batches[0])
    return wrapped
def calculate_sv_bins(*items):
    """Prepare target and antitarget coverage bins for each sample.

    Unified approach to prepare regional bins for coverage calculations across
    multiple CNV callers. Regions are split into target and antitarget sets so
    callers can use both, with consistent bin sizes across batches. Uses
    callable_regions as the access BED file and mosdepth regions in
    variant_regions to estimate depth for bin sizes.

    When no sample uses the general SV bins, the inputs are returned untouched.
    Otherwise work happens per batch under ``<work_dir>/structural/bins/<batch>``
    and each sample gets ``regions -> bins`` set to a dictionary with
    ``target`` and ``antitarget`` BED files.

    Returns a list holding one single-element list ``[data]`` per sample.
    Raises AssertionError if the number of output samples differs from the
    number of inputs.
    """
    from bcbio.structural import cnvkit
    if not any(cnvkit.use_general_sv_bins(utils.to_single_data(x)) for x in items):
        return items
    items = [utils.to_single_data(x) for x in items]
    prepared = []
    for batch, batch_items in multi.group_by_batch(items, False).items():
        batch_dir = os.path.join(dd.get_work_dir(items[0]), "structural", "bins", batch)
        work_dir = utils.safe_makedir(batch_dir)
        # The first sample of the batch supplies the access and base CNV regions
        representative = batch_items[0]
        access_file = tz.get_in(["config", "algorithm", "callable_regions"], representative)
        cnv_file = get_base_cnv_regions(representative, work_dir, "transcripts100",
                                        include_gene_names=False)
        # Bin sizes are computed from all samples, not only this batch
        target_bin, anti_bin = _get_target_antitarget_bin_sizes(cnv_file, items)
        for data in batch_items:
            target_bed, anti_bed = cnvkit.targets_w_bins(
                cnv_file, access_file, target_bin, anti_bin, work_dir, data)
            if not data.get("regions"):
                data["regions"] = {}
            data["regions"]["bins"] = {"target": target_bed, "antitarget": anti_bed}
            prepared.append([data])
    if len(prepared) != len(items):
        names_out = sorted([dd.get_sample_name(utils.to_single_data(x)) for x in prepared])
        names_in = sorted([dd.get_sample_name(x) for x in items])
        raise AssertionError(
            "Inconsistent samples in and out of SV bin calculation:\nout: %s\nin : %s"
            % (names_out, names_in))
    return prepared