def summarize(calls, data):
    """Summarize results from multiple callers into a single flattened BED file.

    For each event-size window, pools all caller BED lines falling in the
    window, then sorts and merges them into a per-size ensemble BED; the
    per-size files are combined into a single ensemble call set appended to
    ``calls`` under the "ensemble" variantcaller key.

    :param calls: list of dicts, one per structural variant caller
    :param data: sample data dictionary
    :returns: calls, with an "ensemble" entry appended when output exists
    """
    import pybedtools
    sample = tz.get_in(["rgnames", "sample"], data)
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural", sample,
                                               "ensemble"))
    out_file = os.path.join(work_dir, "%s-ensemble.bed" % sample)
    with shared.bedtools_tmpdir(data):
        # list() so len() works on Python 3, where filter returns a lazy iterator.
        input_beds = list(filter(lambda x: x is not None,
                                 [_create_bed(c, out_file, data) for c in calls]))
    if len(input_beds) > 0:
        size_beds = []
        for e_start, e_end in validate.EVENT_SIZES:
            base, ext = os.path.splitext(out_file)
            size_out_file = "%s-%s_%s%s" % (base, e_start, e_end, ext)
            if not utils.file_exists(size_out_file):
                with file_transaction(data, size_out_file) as tx_out_file:
                    with shared.bedtools_tmpdir(data):
                        all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                        with open(all_file, "w") as out_handle:
                            for line in fileinput.input(input_beds):
                                chrom, start, end = line.split()[:3]
                                size = int(end) - int(start)
                                # Half-open size window: e_start <= size < e_end
                                if size >= e_start and size < e_end:
                                    out_handle.write(line)
                        pybedtools.BedTool(all_file).sort(stream=True)\
                            .merge(c=4, o="distinct", delim=",").saveas(tx_out_file)
            size_beds.append(size_out_file)
        out_file = bedutils.combine(size_beds, out_file, data["config"])
    if utils.file_exists(out_file):
        bedprep_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "bedprep"))
        calls.append({"variantcaller": "ensemble",
                      "vrn_file": bedutils.clean_file(out_file, data, bedprep_dir=bedprep_dir)})
    return calls
def get_split_regions(bed_file, data):
    """Retrieve a set of split regions using the input BED for callable regions.

    Provides a less inclusive hook for parallelizing over multiple regions.

    :param bed_file: BED of callable regions for the sample
    :param data: sample data dictionary
    :returns: list of (chrom, start, stop) tuples restricted to chromosomes
              present in the input BED
    """
    out_file = "%s-analysis_blocks.bed" % utils.splitext_plus(bed_file)[0]
    with shared.bedtools_tmpdir(data):
        if not utils.file_uptodate(out_file, bed_file):
            # Regions of the reference absent from the callable BED become
            # candidate non-analysis (nblock) regions.
            ref_regions = get_ref_bedtool(dd.get_ref_file(data), data["config"])
            nblock_regions = ref_regions.subtract(
                pybedtools.BedTool(bed_file)).saveas()
            min_n_size = int(
                tz.get_in(["config", "algorithm", "nomap_split_size"], data, 250))
            block_filter = NBlockRegionPicker(ref_regions, data["config"], min_n_size)
            # Keep only sufficiently large nblocks, then expand their edges.
            final_nblock_regions = nblock_regions.filter(
                block_filter.include_block).saveas().each(
                    block_filter.expand_block).saveas()
            with file_transaction(data, out_file) as tx_out_file:
                # Analysis blocks = reference minus expanded nblocks,
                # merged so nearby blocks within min_n_size coalesce.
                final_regions = ref_regions.subtract(final_nblock_regions, nonamecheck=True).\
                    saveas().merge(d=min_n_size).saveas(tx_out_file)
    # Limit output coordinates to chromosomes seen in the input BED.
    chroms = set([])
    with shared.bedtools_tmpdir(data):
        for r in pybedtools.BedTool(bed_file):
            chroms.add(r.chrom)
    out = []
    for r in pybedtools.BedTool(out_file):
        if r.chrom in chroms:
            out.append((r.chrom, r.start, r.stop))
    return out
def get_split_regions(bed_file, data):
    """Retrieve a set of split regions using the input BED for callable regions.

    Provides a less inclusive hook for parallelizing over multiple regions.
    """
    out_file = "%s-analysis_blocks.bed" % utils.splitext_plus(bed_file)[0]
    with shared.bedtools_tmpdir(data):
        if not utils.file_uptodate(out_file, bed_file):
            # Uncallable portions of the reference become candidate nblocks.
            ref_regions = get_ref_bedtool(dd.get_ref_file(data), data["config"])
            uncallable = ref_regions.subtract(pybedtools.BedTool(bed_file)).saveas()
            min_n_size = int(tz.get_in(["config", "algorithm", "nomap_split_size"], data, 250))
            picker = NBlockRegionPicker(ref_regions, data["config"], min_n_size)
            expanded_nblocks = (uncallable.filter(picker.include_block).saveas()
                                .each(picker.expand_block).saveas())
            with file_transaction(data, out_file) as tx_out_file:
                (ref_regions.subtract(expanded_nblocks, nonamecheck=True)
                 .saveas().merge(d=min_n_size).saveas(tx_out_file))
    wanted_chroms = set()
    with shared.bedtools_tmpdir(data):
        for feat in pybedtools.BedTool(bed_file):
            wanted_chroms.add(feat.chrom)
    # Only report analysis blocks on chromosomes present in the input BED.
    return [(feat.chrom, feat.start, feat.stop)
            for feat in pybedtools.BedTool(out_file)
            if feat.chrom in wanted_chroms]
def block_regions(in_bam, ref_file, data):
    """Find blocks of regions for analysis from mapped input BAM file.

    Identifies islands of callable regions, surrounding by regions with no
    read support, that can be analyzed independently.

    :param in_bam: aligned BAM file for the sample
    :param ref_file: reference genome FASTA
    :param data: sample data dictionary
    :returns: (callable-blocks BED, nblock BED, raw callable BED)
    :raises ValueError: when no callable regions remain after filtering
    """
    config = data["config"]
    min_n_size = int(config["algorithm"].get("nomap_split_size", 250))
    with shared.bedtools_tmpdir({"config": config}):
        callable_bed = parallel_callable_loci(in_bam, ref_file, data)
        nblock_bed = "%s-nblocks%s" % os.path.splitext(callable_bed)
        callblock_bed = "%s-callableblocks%s" % os.path.splitext(callable_bed)
        if not utils.file_uptodate(nblock_bed, callable_bed):
            ref_regions = get_ref_bedtool(ref_file, config)
            nblock_regions = _get_nblock_regions(callable_bed, min_n_size, ref_regions)
            nblock_regions = _add_config_regions(nblock_regions, ref_regions, config)
            # Only keep nblocks larger than the minimum split size.
            nblock_regions.filter(lambda r: len(r) > min_n_size).saveas(nblock_bed)
            if len(ref_regions.subtract(nblock_regions, nonamecheck=True)) > 0:
                # Callable blocks: reference minus nblocks, coalescing nearby blocks.
                ref_regions.subtract(nblock_bed, nonamecheck=True).merge(d=min_n_size).saveas(callblock_bed)
            else:
                raise ValueError(
                    "No callable regions found from BAM file. Alignment regions might "
                    "not overlap with regions found in your `variant_regions` BED: %s" % in_bam)
    return callblock_bed, nblock_bed, callable_bed
def summarize(calls, data, items):
    """Summarize results from multiple callers into a single flattened BED file.

    Approach:
      - Combine all calls found in all files
      - Filter files retaining those present with multiple levels of support.
      - Remove calls in high depth regions.
      - Remove calls with ends overlapping exclusion regions like low
        complexity regions.

    :param calls: list of dicts, one per structural variant caller
    :param data: sample data dictionary
    :param items: all items in the batch, used for high depth filtering
    :returns: calls, with an "sv-ensemble" entry appended when output exists
    """
    sample = tz.get_in(["rgnames", "sample"], data)
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural", sample,
                                               "ensemble"))
    with shared.bedtools_tmpdir(data):
        # list() so len() works on Python 3, where filter returns a lazy iterator.
        input_beds = list(filter(lambda xs: xs[1] is not None and utils.file_exists(xs[1]),
                                 [(c["variantcaller"], _create_bed(c, sample, work_dir, calls, data))
                                  for c in calls]))
    if len(input_beds) > 0:
        out_file = combine_bed_by_size([xs[1] for xs in input_beds], sample, work_dir, data)
        if utils.file_exists(out_file):
            # Only apply support-based filtering with enough callers to vote.
            if len(input_beds) > N_FILTER_CALLERS:
                filter_file = _filter_ensemble(out_file, data)
            else:
                filter_file = out_file
            limit_file = shared.remove_highdepth_regions(filter_file, items)
            exclude_files = [f for f in [x.get("exclude_file") for x in calls] if f]
            exclude_file = exclude_files[0] if len(exclude_files) > 0 else None
            if exclude_file:
                noexclude_file, _ = sshared.exclude_by_ends(limit_file, exclude_file, data)
            else:
                noexclude_file = limit_file
            bedprep_dir = utils.safe_makedir(os.path.join(os.path.dirname(noexclude_file), "bedprep"))
            if utils.file_exists(noexclude_file):
                calls.append({"variantcaller": "sv-ensemble",
                              "input_beds": input_beds,
                              "vrn_file": bedutils.clean_file(noexclude_file, data,
                                                              bedprep_dir=bedprep_dir)})
    return calls
def combine_sample_regions(*samples): """Create batch-level sets of callable regions for multi-sample calling. Intersects all non-callable (nblock) regions from all samples in a batch, producing a global set of callable regions. """ # back compatibility -- global file for entire sample set global_analysis_file = os.path.join(samples[0]["dirs"]["work"], "analysis_blocks.bed") if utils.file_exists(global_analysis_file) and not _needs_region_update(global_analysis_file, samples): global_no_analysis_file = os.path.join(os.path.dirname(global_analysis_file), "noanalysis_blocks.bed") else: global_analysis_file = None out = [] analysis_files = [] with shared.bedtools_tmpdir(samples[0]): for batch, items in vmulti.group_by_batch(samples).items(): if global_analysis_file: analysis_file, no_analysis_file = global_analysis_file, global_no_analysis_file else: analysis_file, no_analysis_file = _combine_sample_regions_batch(batch, items) for data in items: vr_file = tz.get_in(["config", "algorithm", "variant_regions"], data) if analysis_file: analysis_files.append(analysis_file) data["config"]["algorithm"]["callable_regions"] = analysis_file data["config"]["algorithm"]["non_callable_regions"] = no_analysis_file data["config"]["algorithm"]["callable_count"] = pybedtools.BedTool(analysis_file).count() elif vr_file: data["config"]["algorithm"]["callable_count"] = pybedtools.BedTool(vr_file).count() out.append([data]) assert len(out) == len(samples) if len(analysis_files) > 0: final_regions = pybedtools.BedTool(analysis_files[0]) _analysis_block_stats(final_regions) return out
def combine_sample_regions(*samples):
    """Create batch-level sets of callable regions for multi-sample calling.

    Intersects all non-callable (nblock) regions from all samples in a batch,
    producing a global set of callable regions.

    :param samples: sample data dictionaries
    :returns: list of single-item lists of updated sample dictionaries
    """
    # back compatibility -- global file for entire sample set
    global_analysis_file = os.path.join(samples[0]["dirs"]["work"], "analysis_blocks.bed")
    if utils.file_exists(global_analysis_file) and not _needs_region_update(global_analysis_file, samples):
        global_no_analysis_file = os.path.join(os.path.dirname(global_analysis_file), "noanalysis_blocks.bed")
    else:
        global_analysis_file = None
    out = []
    analysis_files = []
    with shared.bedtools_tmpdir(samples[0]):
        for batch, items in vmulti.group_by_batch(samples).items():
            if global_analysis_file:
                analysis_file, no_analysis_file = global_analysis_file, global_no_analysis_file
            else:
                analysis_file, no_analysis_file = _combine_sample_regions_batch(batch, items)
            for data in items:
                if analysis_file:
                    analysis_files.append(analysis_file)
                    data["config"]["algorithm"]["callable_regions"] = analysis_file
                    data["config"]["algorithm"]["non_callable_regions"] = no_analysis_file
                out.append([data])
    assert len(out) == len(samples)
    # Guard against an empty list: no batch may have produced an analysis
    # file, and indexing [0] unconditionally would raise IndexError.
    if len(analysis_files) > 0:
        final_regions = pybedtools.BedTool(analysis_files[0])
        _analysis_block_stats(final_regions)
    return out
def combine_bed_by_size(input_beds, sample, work_dir, data, delim=","):
    """Combine a set of BED files, breaking into individual size chunks.

    :param input_beds: BED files from individual callers
    :param sample: sample name
    :param work_dir: working directory for output files
    :param data: sample data dictionary
    :param delim: delimiter used when merging distinct call names
    :returns: path to the combined ensemble BED file
    """
    out_file = os.path.join(work_dir, "%s-ensemble.bed" % sample)
    if len(input_beds) > 0:
        size_beds = []
        for e_start, e_end in validate.EVENT_SIZES:
            base, ext = os.path.splitext(out_file)
            size_out_file = "%s-%s_%s%s" % (base, e_start, e_end, ext)
            if not utils.file_exists(size_out_file):
                with file_transaction(data, size_out_file) as tx_out_file:
                    with shared.bedtools_tmpdir(data):
                        all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                        has_regions = False
                        with open(all_file, "w") as out_handle:
                            for line in fileinput.input(input_beds):
                                chrom, start, end, event_str = line.split()[:4]
                                # Event type is the prefix before the first underscore.
                                event = event_str.split("_", 1)[0]
                                size = int(end) - int(start)
                                # Precedence: (in size window) OR breakend (BND)
                                # calls, which are kept in every size chunk.
                                if size >= e_start and size < e_end or event == "BND":
                                    out_handle.write(line)
                                    has_regions = True
                        if has_regions:
                            pybedtools.BedTool(all_file).sort(stream=True)\
                                .merge(c=4, o="distinct", delim=delim).saveas(tx_out_file)
            if utils.file_exists(size_out_file):
                ann_size_out_file = annotate.add_genes(size_out_file, data)
                size_beds.append(ann_size_out_file)
        if len(size_beds) > 0:
            out_file = bedutils.combine(size_beds, out_file, data)
    return out_file
def sample_callable_bed(bam_file, ref_file, data):
    """Retrieve callable regions for a sample subset by defined analysis regions.

    :param bam_file: aligned BAM file for the sample
    :param ref_file: reference genome FASTA (unused here; kept for API parity)
    :param data: sample data dictionary
    :returns: CovInfo namedtuple of (filtered callable BED, raw callable BED,
              depth files)
    """
    from bcbio.heterogeneity import chromhacks
    CovInfo = collections.namedtuple("CovInfo", "callable, raw_callable, depth_files")
    noalt_calling = "noalt_calling" in dd.get_tools_on(
        data) or "altcontigs" in dd.get_exclude_regions(data)

    def callable_chrom_filter(r):
        """Filter to callable region, potentially limiting by chromosomes.
        """
        return r.name == "CALLABLE" and (not noalt_calling or chromhacks.is_nonalt(r.chrom))
    out_file = "%s-callable_sample.bed" % os.path.splitext(bam_file)[0]
    with shared.bedtools_tmpdir(data):
        sv_bed = regions.get_sv_bed(data)
        callable_bed, depth_files = coverage.calculate(bam_file, data, sv_bed)
        input_regions_bed = dd.get_variant_regions(data)
        if not utils.file_uptodate(out_file, callable_bed):
            with file_transaction(data, out_file) as tx_out_file:
                callable_regions = pybedtools.BedTool(callable_bed)
                filter_regions = callable_regions.filter(callable_chrom_filter)
                if input_regions_bed:
                    # NOTE(review): when out_file is already up to date with
                    # input_regions_bed, nothing writes tx_out_file inside this
                    # transaction -- presumably intentional skip; confirm.
                    if not utils.file_uptodate(out_file, input_regions_bed):
                        input_regions = pybedtools.BedTool(input_regions_bed)
                        filter_regions.intersect(
                            input_regions, nonamecheck=True).saveas(tx_out_file)
                else:
                    filter_regions.saveas(tx_out_file)
    return CovInfo(out_file, callable_bed, depth_files)
def block_regions(in_bam, ref_file, data):
    """Find blocks of regions for analysis from mapped input BAM file.

    Identifies islands of callable regions, surrounding by regions with no
    read support, that can be analyzed independently.
    """
    config = data["config"]
    min_n_size = int(config["algorithm"].get("nomap_split_size", 250))
    with shared.bedtools_tmpdir({"config": config}):
        callable_bed = parallel_callable_loci(in_bam, ref_file, data)
        base, ext = os.path.splitext(callable_bed)
        nblock_bed = "%s-nblocks%s" % (base, ext)
        callblock_bed = "%s-callableblocks%s" % (base, ext)
        if not utils.file_uptodate(nblock_bed, callable_bed):
            ref_regions = get_ref_bedtool(ref_file, config)
            nblocks = _add_config_regions(
                _get_nblock_regions(callable_bed, min_n_size, ref_regions),
                ref_regions, config)
            # Persist only nblocks larger than the minimum split size.
            nblocks.filter(lambda feat: len(feat) > min_n_size).saveas(nblock_bed)
            if len(ref_regions.subtract(nblocks, nonamecheck=True)) == 0:
                raise ValueError(
                    "No callable regions found from BAM file. Alignment regions might "
                    "not overlap with regions found in your `variant_regions` BED: %s" % in_bam)
            ref_regions.subtract(nblock_bed, nonamecheck=True).merge(d=min_n_size).saveas(callblock_bed)
    return callblock_bed, nblock_bed, callable_bed
def variantcall_batch_region(items):
    """CWL entry point: variant call a batch of samples in a block of regions.

    :param items: CWL-wrapped sample dictionaries sharing one region_block
    :returns: dict with "vrn_file_region" (called VCF or None for pre-called
              inputs) and the shared "region_block"
    """
    items = [utils.to_single_data(x) for x in items]
    align_bams = [dd.get_align_bam(x) for x in items]
    variantcaller = _get_batch_variantcaller(items)
    # All items in the batch must agree on a single region block.
    region_blocks = list(
        set([
            tuple(x.get("region_block")) for x in items if "region_block" in x
        ]))
    assert len(region_blocks) == 1, region_blocks
    region_block = region_blocks[0]
    # Pre-called input variant files
    if not variantcaller and all(d.get("vrn_file") for d in items):
        return {"vrn_file_region": None, "region_block": region_block}
    caller_fn = get_variantcallers()[variantcaller]
    assoc_files = tz.get_in(("genome_resources", "variation"), items[0], {})
    region = _region_to_coords(region_block[0])
    chrom, start, end = region
    region_str = "_".join(str(x) for x in region)
    batch_name = _get_batch_name(items)
    # Output named after the first region in the block, under a per-chromosome dir.
    out_file = os.path.join(dd.get_work_dir(items[0]), variantcaller, chrom,
                            "%s-%s-block.vcf.gz" % (batch_name, region_str))
    utils.safe_makedir(os.path.dirname(out_file))
    with pshared.bedtools_tmpdir(items[0]):
        if variantcaller in SUPPORT_MULTICORE:
            # Caller handles the whole block of regions in one multicore run.
            call_file = caller_fn(align_bams, items, dd.get_ref_file(items[0]),
                                  assoc_files,
                                  [_region_to_coords(r) for r in region_block],
                                  out_file)
        else:
            # Fall back to running regions in parallel across cores.
            call_file = _run_variantcall_batch_multicore(
                items, region_block, out_file)
    return {"vrn_file_region": call_file, "region_block": region_block}
def prepare_exclude_file(items, base_file, chrom=None):
    """Prepare a BED file for exclusion, incorporating variant regions and chromosome.

    Excludes locally repetitive regions (if `remove_lcr` is set) and
    centromere regions, both of which contribute to long run times and false
    positive structural variant calls.

    :param items: sample data dictionaries for the batch
    :param base_file: file whose name roots the exclusion output
    :param chrom: optional chromosome to restrict to
    :returns: path to the exclusion BED file
    """
    out_file = "%s-exclude.bed" % utils.splitext_plus(base_file)[0]
    all_vrs = _get_variant_regions(items)
    ready_region = (shared.subset_variant_regions(tz.first(all_vrs), chrom, base_file, items)
                    if len(all_vrs) > 0 else chrom)
    with shared.bedtools_tmpdir(items[0]):
        # Get a bedtool for the full region if no variant regions
        if ready_region == chrom:
            want_bedtool = callable.get_ref_bedtool(tz.get_in(["reference", "fasta", "base"], items[0]),
                                                    items[0]["config"], chrom)
            lcr_bed = shared.get_lcr_bed(items)
            if lcr_bed:
                want_bedtool = want_bedtool.subtract(pybedtools.BedTool(lcr_bed))
        else:
            want_bedtool = pybedtools.BedTool(ready_region).saveas()
        sv_exclude_bed = _get_sv_exclude_file(items)
        if sv_exclude_bed and len(want_bedtool) > 0:
            want_bedtool = want_bedtool.subtract(sv_exclude_bed).saveas()
        if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"):
            # NOTE(review): file_transaction called with a single argument
            # here -- presumably an older API signature; confirm against the
            # version of file_transaction this file imports.
            with file_transaction(out_file) as tx_out_file:
                full_bedtool = callable.get_ref_bedtool(tz.get_in(["reference", "fasta", "base"], items[0]),
                                                        items[0]["config"])
                if len(want_bedtool) > 0:
                    # Exclusion = full reference minus wanted regions.
                    full_bedtool.subtract(want_bedtool).saveas(tx_out_file)
                else:
                    full_bedtool.saveas(tx_out_file)
    return out_file
def _prep_sample_cnvs(cnv_file, data):
    """Convert a multiple sample CNV file into a single BED file for a sample.

    Handles matching and fixing names where R converts numerical IDs (1234)
    into strings by adding an X (X1234), and converts other characters into '.'s.
    http://stat.ethz.ch/R-manual/R-devel/library/base/html/make.names.html

    :param cnv_file: multi-sample CNV BED file
    :param data: sample data dictionary
    :returns: path to the per-sample CNV BED file
    """
    import pybedtools
    sample_name = tz.get_in(["rgnames", "sample"], data)

    def make_names(name):
        # Raw string avoids the invalid "\w" escape sequence warning on
        # modern Python; the runtime pattern is unchanged.
        return re.sub(r"[^\w.]", '.', name)

    def matches_sample_name(feat):
        # Accept the literal name, the R "X"-prefixed form, or the
        # make.names()-mangled form.
        return (feat.name == sample_name or feat.name == "X%s" % sample_name
                or feat.name == make_names(sample_name))

    def update_sample_name(feat):
        feat.name = sample_name
        return feat
    sample_file = os.path.join(os.path.dirname(cnv_file), "%s-cnv.bed" % sample_name)
    if not utils.file_exists(sample_file):
        with file_transaction(data, sample_file) as tx_out_file:
            with shared.bedtools_tmpdir(data):
                pybedtools.BedTool(cnv_file).filter(matches_sample_name).each(
                    update_sample_name).saveas(tx_out_file)
    return sample_file
def block_regions(callable_bed, in_bam, ref_file, data):
    """Find blocks of regions for analysis from mapped input BAM file.

    Identifies islands of callable regions, surrounding by regions with no
    read support, that can be analyzed independently.

    :param callable_bed: pre-computed raw callable BED for the sample
    :param in_bam: aligned BAM file (used only for error reporting here)
    :param ref_file: reference genome FASTA
    :param data: sample data dictionary
    :returns: (callable-blocks BED, nblock BED)
    :raises ValueError: when no callable regions remain after filtering
    """
    min_n_size = int(data["config"]["algorithm"].get("nomap_split_size", 250))
    with shared.bedtools_tmpdir(data):
        nblock_bed = "%s-nblocks.bed" % utils.splitext_plus(callable_bed)[0]
        callblock_bed = "%s-callableblocks.bed" % utils.splitext_plus(callable_bed)[0]
        if not utils.file_uptodate(nblock_bed, callable_bed):
            ref_regions = get_ref_bedtool(ref_file, data["config"])
            nblock_regions = _get_nblock_regions(callable_bed, min_n_size, ref_regions)
            nblock_regions = _add_config_regions(nblock_regions, ref_regions, data)
            # Both outputs written under one transaction so they stay in sync.
            with file_transaction(data, nblock_bed, callblock_bed) as (tx_nblock_bed, tx_callblock_bed):
                nblock_regions.filter(lambda r: len(r) > min_n_size).saveas(tx_nblock_bed)
                if len(ref_regions.subtract(nblock_regions, nonamecheck=True)) > 0:
                    ref_regions.subtract(tx_nblock_bed, nonamecheck=True).merge(d=min_n_size).saveas(tx_callblock_bed)
                else:
                    raise ValueError("No callable regions found in %s from BAM file %s. Some causes:\n "
                                     " - Alignment regions do not overlap with regions found "
                                     "in your `variant_regions` BED: %s\n"
                                     " - There are no aligned reads in your BAM file that pass sanity checks "
                                     " (mapping score > 1, non-duplicates, both ends of paired reads mapped)"
                                     % (dd.get_sample_name(data), in_bam, dd.get_variant_regions(data)))
    return callblock_bed, nblock_bed
def prepare_exclude_file(items, base_file, chrom=None):
    """Prepare a BED file for exclusion.

    Excludes high depth and centromere regions which contribute to long run
    times and false positive structural variant calls.

    :param items: sample data dictionaries for the batch
    :param base_file: file whose name roots the exclusion output
    :param chrom: optional chromosome to restrict to
    :returns: path to the exclusion BED file
    """
    out_file = "%s-exclude%s.bed" % (utils.splitext_plus(base_file)[0], "-%s" % chrom if chrom else "")
    if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"):
        with shared.bedtools_tmpdir(items[0]):
            # Get a bedtool for the full region if no variant regions
            want_bedtool = callable.get_ref_bedtool(tz.get_in(["reference", "fasta", "base"], items[0]),
                                                    items[0]["config"], chrom)
            if chrom:
                want_bedtool = pybedtools.BedTool(shared.subset_bed_by_chrom(want_bedtool.saveas().fn,
                                                                             chrom, items[0]))
            sv_exclude_bed = _get_sv_exclude_file(items)
            if sv_exclude_bed and len(want_bedtool) > 0:
                want_bedtool = want_bedtool.subtract(sv_exclude_bed, nonamecheck=True).saveas()
            # Only strip high depth regions for whole genome runs.
            if any(dd.get_coverage_interval(d) == "genome" for d in items):
                want_bedtool = pybedtools.BedTool(shared.remove_highdepth_regions(want_bedtool.saveas().fn, items))
            with file_transaction(items[0], out_file) as tx_out_file:
                full_bedtool = callable.get_ref_bedtool(tz.get_in(["reference", "fasta", "base"], items[0]),
                                                        items[0]["config"])
                if len(want_bedtool) > 0:
                    # Exclusion = full reference minus wanted regions.
                    full_bedtool.subtract(want_bedtool, nonamecheck=True).saveas(tx_out_file)
                else:
                    full_bedtool.saveas(tx_out_file)
    return out_file
def combine_sample_regions(*samples):
    """Create batch-level sets of callable regions for multi-sample calling.

    Intersects all non-callable (nblock) regions from all samples in a batch,
    producing a global set of callable regions.

    :param samples: sample data dictionaries, possibly nested in single-item lists
    :returns: list of single-item lists of updated sample dictionaries
    """
    # Unpack nested lists of samples grouped together
    if isinstance(samples[0], (list, tuple)) and len(samples[0]) == 1:
        samples = [x[0] for x in samples]
    # back compatibility -- global file for entire sample set
    global_analysis_file = os.path.join(samples[0]["dirs"]["work"], "analysis_blocks.bed")
    if utils.file_exists(global_analysis_file) and not _needs_region_update(
            global_analysis_file, samples):
        global_no_analysis_file = os.path.join(
            os.path.dirname(global_analysis_file), "noanalysis_blocks.bed")
    else:
        global_analysis_file = None
    out = []
    analysis_files = []
    batches = []
    with shared.bedtools_tmpdir(samples[0]):
        for batch, items in vmulti.group_by_batch(samples, require_bam=False).items():
            batches.append(items)
            if global_analysis_file:
                analysis_file, no_analysis_file = global_analysis_file, global_no_analysis_file
            else:
                analysis_file, no_analysis_file = _combine_sample_regions_batch(
                    batch, items)
            for data in items:
                vr_file = dd.get_variant_regions(data)
                if analysis_file:
                    analysis_files.append(analysis_file)
                    data["config"]["algorithm"][
                        "callable_regions"] = analysis_file
                    data["config"]["algorithm"][
                        "non_callable_regions"] = no_analysis_file
                    data["config"]["algorithm"][
                        "callable_count"] = pybedtools.BedTool(
                            analysis_file).count()
                elif vr_file:
                    # No batch analysis file: fall back to counting variant regions.
                    data["config"]["algorithm"][
                        "callable_count"] = pybedtools.BedTool(
                            vr_file).count()
                highdepth_bed = tz.get_in(["regions", "highdepth"], data)
                if highdepth_bed:
                    data["config"]["algorithm"][
                        "highdepth_regions"] = highdepth_bed
                # attach a representative sample for calculating callable region
                if not data.get("work_bam"):
                    for x in items:
                        if x.get("work_bam"):
                            data["work_bam_callable"] = x["work_bam"]
                out.append([data])
    assert len(out) == len(samples)
    if len(analysis_files) > 0:
        # Batches all use the same file, so stats on the first are representative.
        final_regions = pybedtools.BedTool(analysis_files[0])
        _analysis_block_stats(final_regions, batches[0])
    return out
def sample_callable_bed(bam_file, ref_file, data):
    """Retrieve callable regions for a sample subset by defined analysis regions.

    :param bam_file: aligned BAM file for the sample
    :param ref_file: reference genome FASTA (unused here; kept for API parity)
    :param data: sample data dictionary
    :returns: CovInfo namedtuple of (filtered callable BED, high depth BED,
              average variant-region coverage, coverage file)
    """
    CovInfo = collections.namedtuple(
        "CovInfo", "callable, highdepth, avg_coverage, coverage")
    config = data["config"]
    out_file = "%s-callable_sample.bed" % os.path.splitext(bam_file)[0]
    with shared.bedtools_tmpdir({"config": config}):
        coverage_file, callable_bed, highdepth_bed, variant_regions_avg_cov = coverage.calculate(
            bam_file, data)
        input_regions_bed = config["algorithm"].get("variant_regions", None)
        if not utils.file_uptodate(out_file, callable_bed):
            with file_transaction(config, out_file) as tx_out_file:
                callable_regions = pybedtools.BedTool(callable_bed)
                # Keep only regions flagged CALLABLE by the coverage step.
                filter_regions = callable_regions.filter(
                    lambda x: x.name == "CALLABLE")
                if input_regions_bed:
                    # NOTE(review): when out_file is already up to date with
                    # input_regions_bed, nothing writes tx_out_file inside
                    # this transaction -- presumably intentional; confirm.
                    if not utils.file_uptodate(out_file, input_regions_bed):
                        input_regions = pybedtools.BedTool(input_regions_bed)
                        filter_regions.intersect(
                            input_regions, nonamecheck=True).saveas(tx_out_file)
                else:
                    filter_regions.saveas(tx_out_file)
    return CovInfo(out_file, highdepth_bed, variant_regions_avg_cov, coverage_file)
def sample_callable_bed(bam_file, ref_file, data):
    """Retrieve callable regions for a sample subset by defined analysis regions.
    """
    from bcbio.heterogeneity import chromhacks
    CovInfo = collections.namedtuple("CovInfo", "callable, raw_callable, depth_files")
    noalt_calling = ("noalt_calling" in dd.get_tools_on(data)
                     or "altcontigs" in dd.get_exclude_regions(data))

    def keep_feature(feat):
        """Keep CALLABLE features, optionally restricted to non-alt chromosomes."""
        if feat.name != "CALLABLE":
            return False
        return not noalt_calling or chromhacks.is_nonalt(feat.chrom)

    out_file = "%s-callable_sample.bed" % os.path.splitext(bam_file)[0]
    with shared.bedtools_tmpdir(data):
        sv_bed = regions.get_sv_bed(data)
        callable_bed, depth_files = coverage.calculate(bam_file, data, sv_bed)
        vr_bed = dd.get_variant_regions(data)
        if not utils.file_uptodate(out_file, callable_bed):
            with file_transaction(data, out_file) as tx_out_file:
                kept = pybedtools.BedTool(callable_bed).filter(keep_feature)
                if vr_bed:
                    if not utils.file_uptodate(out_file, vr_bed):
                        kept.intersect(pybedtools.BedTool(vr_bed),
                                       nonamecheck=True).saveas(tx_out_file)
                else:
                    kept.saveas(tx_out_file)
    return CovInfo(out_file, callable_bed, depth_files)
def summarize(calls, data):
    """Summarize results from multiple callers into a single flattened BED file.

    Concatenates every caller's BED output, then sorts and merges into a
    single ensemble BED appended to ``calls`` under the "ensemble" key.

    :param calls: list of dicts, one per structural variant caller
    :param data: sample data dictionary
    :returns: calls, with an "ensemble" entry appended when output exists
    """
    import pybedtools
    sample = tz.get_in(["rgnames", "sample"], data)
    work_dir = utils.safe_makedir(
        os.path.join(data["dirs"]["work"], "structural", sample, "ensemble"))
    out_file = os.path.join(work_dir, "%s-ensemble.bed" % sample)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            with shared.bedtools_tmpdir(data):
                # list() so len() works on Python 3, where filter returns a
                # lazy iterator.
                input_beds = list(filter(
                    lambda x: x is not None,
                    [_create_bed(c, out_file, data) for c in calls]))
                if len(input_beds) > 0:
                    all_file = "%s-all.bed" % utils.splitext_plus(
                        tx_out_file)[0]
                    with open(all_file, "w") as out_handle:
                        for line in fileinput.input(input_beds):
                            out_handle.write(line)
                    pybedtools.BedTool(all_file).sort(stream=True)\
                        .merge(c=4, o="distinct", delim=",").saveas(tx_out_file)
    if utils.file_exists(out_file):
        calls.append({"variantcaller": "ensemble",
                      "vrn_file": out_file})
    return calls
def _get_chroms(data):
    """Retrieve chromosomes included in variant_regions for parallelization.
    """
    seen = set()
    with shared.bedtools_tmpdir(data):
        for region in pybedtools.BedTool(dd.get_variant_regions(data)):
            seen.add(region.chrom)
    # Report full-length coordinates for each reference contig that appears
    # in the variant regions.
    return [(contig.name, 0, contig.size)
            for contig in ref.file_contigs(dd.get_ref_file(data))
            if contig.name in seen]
def combine_sample_regions(*samples):
    """Create batch-level sets of callable regions for multi-sample calling.

    Intersects all non-callable (nblock) regions from all samples in a batch,
    producing a global set of callable regions.

    :param samples: sample data dictionaries (CWL world-packed)
    :returns: list of single-item lists of updated sample dictionaries,
              ordered to match the input
    """
    samples = utils.unpack_worlds(samples)
    samples = cwlutils.unpack_tarballs(samples, samples[0])
    # back compatibility -- global file for entire sample set
    global_analysis_file = os.path.join(samples[0]["dirs"]["work"], "analysis_blocks.bed")
    if utils.file_exists(global_analysis_file) and not _needs_region_update(global_analysis_file, samples):
        global_no_analysis_file = os.path.join(os.path.dirname(global_analysis_file), "noanalysis_blocks.bed")
    else:
        global_analysis_file = None
    out = []
    analysis_files = []
    batches = []
    with shared.bedtools_tmpdir(samples[0]):
        for batch, items in vmulti.group_by_batch(samples, require_bam=False).items():
            batches.append(items)
            if global_analysis_file:
                analysis_file, no_analysis_file = global_analysis_file, global_no_analysis_file
            else:
                analysis_file, no_analysis_file = _combine_sample_regions_batch(batch, items)
            for data in items:
                vr_file = dd.get_variant_regions(data)
                if analysis_file:
                    analysis_files.append(analysis_file)
                    data["config"]["algorithm"]["callable_regions"] = analysis_file
                    data["config"]["algorithm"]["non_callable_regions"] = no_analysis_file
                    data["config"]["algorithm"]["callable_count"] = pybedtools.BedTool(analysis_file).count()
                elif vr_file:
                    # No batch analysis file: fall back to counting variant regions.
                    data["config"]["algorithm"]["callable_count"] = pybedtools.BedTool(vr_file).count()
                # attach a representative sample for calculating callable region
                if not data.get("work_bam"):
                    for x in items:
                        if x.get("work_bam"):
                            data["work_bam_callable"] = x["work_bam"]
                out.append([data])
    # Ensure output order matches input order, consistency for CWL-based runs
    assert len(out) == len(samples)
    sample_indexes = {dd.get_sample_name(d): i for i, d in enumerate(samples)}

    def by_input_index(xs):
        return sample_indexes[dd.get_sample_name(xs[0])]
    out.sort(key=by_input_index)
    if len(analysis_files) > 0:
        # Batches all use the same file, so stats on the first are representative.
        final_regions = pybedtools.BedTool(analysis_files[0])
        _analysis_block_stats(final_regions, batches[0])
    return out
def summarize(calls, data):
    """Summarize results from multiple callers into a single flattened BED file.

    :param calls: list of dicts, one per structural variant caller
    :param data: sample data dictionary
    :returns: calls, with an "ensemble" entry appended when output exists
    """
    sample = tz.get_in(["rgnames", "sample"], data)
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural", sample,
                                               "ensemble"))
    with shared.bedtools_tmpdir(data):
        # list() so len() works on Python 3, where filter returns a lazy iterator.
        input_beds = list(filter(lambda x: x is not None,
                                 [_create_bed(c, sample, work_dir, data) for c in calls]))
    if len(input_beds) > 0:
        out_file = _combine_bed_by_size(input_beds, sample, work_dir, data)
        if utils.file_exists(out_file):
            bedprep_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "bedprep"))
            calls.append({"variantcaller": "ensemble",
                          "vrn_file": bedutils.clean_file(out_file, data, bedprep_dir=bedprep_dir)})
    return calls
def combine_sample_regions(*samples):
    """Create batch-level sets of callable regions for multi-sample calling.

    Intersects all non-callable (nblock) regions from all samples in a batch,
    producing a global set of callable regions.

    :param samples: single-item lists of sample data dictionaries
    :returns: list of single-item lists of updated sample dictionaries
    """
    import pybedtools
    samples = [x[0] for x in samples]
    # back compatibility -- global file for entire sample set
    global_analysis_file = os.path.join(samples[0]["dirs"]["work"], "analysis_blocks.bed")
    if utils.file_exists(global_analysis_file) and not _needs_region_update(global_analysis_file, samples):
        global_no_analysis_file = os.path.join(os.path.dirname(global_analysis_file), "noanalysis_blocks.bed")
    else:
        global_analysis_file = None
    out = []
    analysis_files = []
    batches = []
    with shared.bedtools_tmpdir(samples[0]):
        for batch, items in vmulti.group_by_batch(samples, require_bam=False).items():
            batches.append(items)
            if global_analysis_file:
                analysis_file, no_analysis_file = global_analysis_file, global_no_analysis_file
            else:
                analysis_file, no_analysis_file = _combine_sample_regions_batch(batch, items)
            for data in items:
                vr_file = dd.get_variant_regions(data)
                if analysis_file:
                    analysis_files.append(analysis_file)
                    data["config"]["algorithm"]["callable_regions"] = analysis_file
                    data["config"]["algorithm"]["non_callable_regions"] = no_analysis_file
                    data["config"]["algorithm"]["callable_count"] = pybedtools.BedTool(analysis_file).count()
                elif vr_file:
                    # No batch analysis file: fall back to counting variant regions.
                    data["config"]["algorithm"]["callable_count"] = pybedtools.BedTool(vr_file).count()
                highdepth_bed = tz.get_in(["regions", "highdepth"], data)
                if highdepth_bed:
                    data["config"]["algorithm"]["highdepth_regions"] = highdepth_bed
                # attach a representative sample for calculating callable region
                if not data.get("work_bam"):
                    for x in items:
                        if x.get("work_bam"):
                            data["work_bam_callable"] = x["work_bam"]
                out.append([data])
    assert len(out) == len(samples)
    if len(analysis_files) > 0:
        # Batches all use the same file, so stats on the first are representative.
        final_regions = pybedtools.BedTool(analysis_files[0])
        _analysis_block_stats(final_regions, batches[0])
    return out
def _limit_calls(in_file, highdepth_beds, data):
    """Limit calls to avoid calling in problematic genomic regions.

    - highdepth_beds -- high depth regions with reads in repeat regions.

    :param in_file: BED of candidate calls
    :param data: sample data dictionary
    :returns: path to the filtered BED file
    """
    import pybedtools
    out_file = "%s-glimit%s" % utils.splitext_plus(in_file)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            with shared.bedtools_tmpdir(data):
                # Concatenate all high depth BEDs into a single file.
                all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                with open(all_file, "w") as out_handle:
                    for line in fileinput.input(highdepth_beds):
                        out_handle.write(line)
                # Merge overlapping high depth regions, then keep only calls
                # that do not intersect them (-v).
                to_remove = pybedtools.BedTool(all_file).sort(stream=True)\
                    .merge(c=4, o="distinct", delim=",").saveas()
                pybedtools.BedTool(in_file).intersect(to_remove, v=True).saveas(tx_out_file)
    return out_file
def sample_callable_bed(bam_file, ref_file, config):
    """Retrieve callable regions for a sample subset by defined analysis regions.
    """
    out_file = "%s-callable_sample.bed" % os.path.splitext(bam_file)[0]
    with shared.bedtools_tmpdir({"config": config}):
        callable_bed = parallel_callable_loci(bam_file, ref_file, config)
        vr_bed = config["algorithm"].get("variant_regions", None)
        if not utils.file_uptodate(out_file, callable_bed):
            with file_transaction(config, out_file) as tx_out_file:
                # Keep only regions flagged CALLABLE by the coverage step.
                callable_only = pybedtools.BedTool(callable_bed).filter(
                    lambda feat: feat.name == "CALLABLE")
                if vr_bed:
                    if not utils.file_uptodate(out_file, vr_bed):
                        callable_only.intersect(
                            pybedtools.BedTool(vr_bed)).saveas(tx_out_file)
                else:
                    callable_only.saveas(tx_out_file)
    return out_file
def _limit_calls(in_file, highdepth_beds, data):
    """Limit calls to avoid calling in problematic genomic regions.

    - highdepth_beds -- high depth regions with reads in repeat regions.
    """
    import pybedtools
    base, ext = utils.splitext_plus(in_file)
    out_file = "%s-glimit%s" % (base, ext)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            with shared.bedtools_tmpdir(data):
                # pool every high depth BED into one file, then merge overlaps
                combined = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                with open(combined, "w") as handle:
                    handle.writelines(fileinput.input(highdepth_beds))
                exclude = pybedtools.BedTool(combined).sort(stream=True).merge(
                    c=4, o="distinct", delim=",").saveas()
                # keep only calls with no overlap against the exclusion set
                pybedtools.BedTool(in_file).intersect(
                    exclude, v=True, nonamecheck=True).saveas(tx_out_file)
    return out_file
def sample_callable_bed(bam_file, ref_file, config):
    """Retrieve callable regions for a sample subset by defined analysis regions.
    """
    out_file = "%s-callable_sample.bed" % os.path.splitext(bam_file)[0]
    with shared.bedtools_tmpdir({"config": config}):
        callable_bed = parallel_callable_loci(bam_file, ref_file, config)
        input_regions_bed = config["algorithm"].get("variant_regions", None)
        if not utils.file_uptodate(out_file, callable_bed):
            # pass config as the first argument: file_transaction takes the
            # config/data dict then the output file, matching every other
            # file_transaction call in this module
            with file_transaction(config, out_file) as tx_out_file:
                callable_regions = pybedtools.BedTool(callable_bed)
                # restrict to regions flagged CALLABLE by the loci calculation
                filter_regions = callable_regions.filter(lambda x: x.name == "CALLABLE")
                if input_regions_bed:
                    if not utils.file_uptodate(out_file, input_regions_bed):
                        # subset to the configured analysis regions
                        input_regions = pybedtools.BedTool(input_regions_bed)
                        filter_regions.intersect(input_regions).saveas(tx_out_file)
                else:
                    filter_regions.saveas(tx_out_file)
    return out_file
def exclude_by_ends(in_file, exclude_file, data, in_params=None):
    """Exclude calls based on overlap of the ends with exclusion regions.

    Removes structural variants with either end being in a repeat: a large
    source of false positives.

    Parameters tuned based on removal of LCR overlapping false positives in
    DREAM synthetic 3 data.

    Returns (output BED file, count of removed calls).
    """
    # default tuning parameters; callers can override any of them via in_params
    params = {"end_buffer": 50, "rpt_pct": 0.9, "total_rpt_pct": 0.2, "sv_pct": 0.5}
    if in_params:
        params.update(in_params)
    assert in_file.endswith(".bed")
    out_file = "%s-norepeats%s" % utils.splitext_plus(in_file)
    # maps "chrom:start-end" call keys to repeat overlap sizes found at the ends
    to_filter = collections.defaultdict(list)
    removed = 0
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            with shared.bedtools_tmpdir(data):
                # collect repeat overlaps for both ends of each call
                for coord, end_name in [(1, "end1"), (2, "end2")]:
                    base, ext = utils.splitext_plus(tx_out_file)
                    end_file = _create_end_file(in_file, coord, params,
                                                "%s-%s%s" % (base, end_name, ext))
                    to_filter = _find_to_filter(end_file, exclude_file, params, to_filter)
            # rewrite the input, dropping calls whose total repeat overlap
            # exceeds the configured fraction of the end buffer
            with open(tx_out_file, "w") as out_handle:
                with open(in_file) as in_handle:
                    for line in in_handle:
                        key = "%s:%s-%s" % tuple(line.strip().split("\t")[:3])
                        total_rpt_size = sum(to_filter.get(key, [0]))
                        if total_rpt_size <= (params["total_rpt_pct"] * params["end_buffer"]):
                            out_handle.write(line)
                        else:
                            removed += 1
    return out_file, removed
def sample_callable_bed(bam_file, ref_file, data):
    """Retrieve callable regions for a sample subset by defined analysis regions.

    Returns a CovInfo namedtuple with the callable BED, the high depth BED,
    the average coverage over variant regions and the coverage file.
    """
    # lightweight per-call record type; NOTE(review): re-created on every call,
    # so instances from separate calls have distinct (but compatible) classes
    CovInfo = collections.namedtuple("CovInfo", "callable, highdepth, avg_coverage, coverage")
    config = data["config"]
    out_file = "%s-callable_sample.bed" % os.path.splitext(bam_file)[0]
    with shared.bedtools_tmpdir({"config": config}):
        coverage_file, callable_bed, highdepth_bed, variant_regions_avg_cov = coverage.calculate(bam_file, data)
        input_regions_bed = config["algorithm"].get("variant_regions", None)
        if not utils.file_uptodate(out_file, callable_bed):
            with file_transaction(config, out_file) as tx_out_file:
                callable_regions = pybedtools.BedTool(callable_bed)
                # restrict to regions flagged CALLABLE by the coverage calculation
                filter_regions = callable_regions.filter(lambda x: x.name == "CALLABLE")
                if input_regions_bed:
                    if not utils.file_uptodate(out_file, input_regions_bed):
                        # subset to the configured analysis regions; nonamecheck
                        # avoids failures from naming convention differences
                        input_regions = pybedtools.BedTool(input_regions_bed)
                        filter_regions.intersect(input_regions, nonamecheck=True).saveas(tx_out_file)
                else:
                    filter_regions.saveas(tx_out_file)
    return CovInfo(out_file, highdepth_bed, variant_regions_avg_cov, coverage_file)
def _setup_variant_regions(data, out_dir):
    """Ensure we have variant regions for calling, using transcript if not present.

    Respects noalt_calling by removing additional contigs to improve speeds.
    """
    vr_file = dd.get_variant_regions(data)
    if not vr_file:
        # fall back to transcript-derived regions when none are configured
        vr_file = regions.get_sv_bed(data, "transcripts", out_dir=out_dir)
    known_contigs = set(c.name for c in ref.file_contigs(dd.get_ref_file(data)))
    prep_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bedprep"))
    base = utils.splitext_plus(os.path.basename(vr_file))[0]
    out_file = os.path.join(prep_dir, "%s-rnaseq_clean.bed" % base)
    if not utils.file_uptodate(out_file, vr_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                with shared.bedtools_tmpdir(data):
                    # keep only non-alt contigs that exist in the reference
                    for region in pybedtools.BedTool(vr_file):
                        if region.chrom in known_contigs and chromhacks.is_nonalt(region.chrom):
                            out_handle.write(str(region))
    return dd.set_variant_regions(data, out_file)
def prepare_exclude_file(items, base_file, chrom=None):
    """Prepare a BED file for exclusion, incorporating variant regions and chromosome.

    Excludes locally repetitive regions (if `remove_lcr` is set) and
    centromere regions, both of which contribute to long run times and
    false positive structural variant calls.

    Builds the set of regions we *want* to analyze, then inverts it against
    the full reference to produce the exclusion BED.
    """
    out_file = "%s-exclude%s.bed" % (utils.splitext_plus(base_file)[0], "-%s" % chrom if chrom else "")
    # skip when either the plain or gzipped exclusion file already exists
    if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"):
        with shared.bedtools_tmpdir(items[0]):
            # Get a bedtool for the full region if no variant regions
            want_bedtool = callable.get_ref_bedtool(tz.get_in(["reference", "fasta", "base"], items[0]),
                                                    items[0]["config"], chrom)
            if chrom:
                want_bedtool = pybedtools.BedTool(shared.subset_bed_by_chrom(want_bedtool.saveas().fn,
                                                                             chrom, items[0]))
            # remove low complexity regions when configured
            lcr_bed = shared.get_lcr_bed(items)
            if lcr_bed:
                want_bedtool = want_bedtool.subtract(pybedtools.BedTool(lcr_bed))
            # remove caller-specific structural variant exclusion regions
            sv_exclude_bed = _get_sv_exclude_file(items)
            if sv_exclude_bed and len(want_bedtool) > 0:
                want_bedtool = want_bedtool.subtract(sv_exclude_bed).saveas()
            want_bedtool = pybedtools.BedTool(shared.remove_highdepth_regions(want_bedtool.saveas().fn, items))
            with file_transaction(items[0], out_file) as tx_out_file:
                # invert: exclusion = full reference minus wanted regions
                full_bedtool = callable.get_ref_bedtool(tz.get_in(["reference", "fasta", "base"], items[0]),
                                                        items[0]["config"])
                if len(want_bedtool) > 0:
                    full_bedtool.subtract(want_bedtool).saveas(tx_out_file)
                else:
                    # nothing wanted: exclude the entire reference
                    full_bedtool.saveas(tx_out_file)
    return out_file
def summarize(calls, data):
    """Summarize results from multiple callers into a single flattened BED file.

    Appends an "ensemble" entry to calls when a combined file is produced.
    """
    sample = tz.get_in(["rgnames", "sample"], data)
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural",
                                               sample, "ensemble"))
    out_file = os.path.join(work_dir, "%s-ensemble.bed" % sample)
    if not utils.file_exists(out_file):
        # pass data as the first argument: file_transaction takes the
        # config/data dict then the output file, as used elsewhere here
        with file_transaction(data, out_file) as tx_out_file:
            with shared.bedtools_tmpdir(data):
                # list comprehension instead of filter() so len() below
                # also works under Python 3 (filter returns an iterator there)
                input_beds = [x for x in (_create_bed(c, out_file) for c in calls)
                              if x is not None]
                if len(input_beds) > 0:
                    # concatenate all caller BEDs, then sort and merge with names
                    all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                    with open(all_file, "w") as out_handle:
                        for line in fileinput.input(input_beds):
                            out_handle.write(line)
                    pybedtools.BedTool(all_file).sort(stream=True).merge(nms=True).saveas(tx_out_file)
    if utils.file_exists(out_file):
        calls.append({"variantcaller": "ensemble",
                      "vrn_file": out_file})
    return calls
def _prep_sample_cnvs(cnv_file, data):
    """Convert a multiple sample CNV file into a single BED file for a sample.

    Handles matching and fixing names where R converts numerical IDs (1234)
    into strings by adding an X (X1234), and converts other characters into '.'s.

    http://stat.ethz.ch/R-manual/R-devel/library/base/html/make.names.html
    """
    import pybedtools
    sample_name = tz.get_in(["rgnames", "sample"], data)
    # names R could have produced for this sample: as-is, X-prefixed,
    # or cleaned the way R's make.names does
    accepted = set([sample_name, "X%s" % sample_name,
                    re.sub("[^\w.]", '.', sample_name)])
    def rename_to_sample(feat):
        feat.name = sample_name
        return feat
    sample_file = os.path.join(os.path.dirname(cnv_file), "%s-cnv.bed" % sample_name)
    if not utils.file_exists(sample_file):
        with file_transaction(data, sample_file) as tx_out_file:
            with shared.bedtools_tmpdir(data):
                pybedtools.BedTool(cnv_file).filter(
                    lambda feat: feat.name in accepted).each(rename_to_sample).saveas(tx_out_file)
    return sample_file
def _prep_sample_cnvs(cnv_file, data):
    """Convert a multiple sample CNV file into a single BED file for a sample.

    Handles matching and fixing names where R converts numerical IDs (1234)
    into strings by adding an X (X1234).
    """
    import pybedtools
    sample_name = tz.get_in(["rgnames", "sample"], data)
    def matches_sample_name(feat):
        return feat.name == sample_name or feat.name == "X%s" % sample_name
    def update_sample_name(feat):
        feat.name = sample_name
        return feat
    sample_file = os.path.join(os.path.dirname(cnv_file), "%s-cnv.bed" % sample_name)
    if not utils.file_exists(sample_file):
        # pass data as the first argument: file_transaction takes the
        # config/data dict then the output file, as used elsewhere here
        with file_transaction(data, sample_file) as tx_out_file:
            with shared.bedtools_tmpdir(data):
                pybedtools.BedTool(cnv_file).filter(matches_sample_name).each(
                    update_sample_name).saveas(tx_out_file)
    return sample_file
def exclude_by_ends(in_file, exclude_file, data, in_params=None):
    """Exclude calls based on overlap of the ends with exclusion regions.

    Removes structural variants with either end being in a repeat: a large
    source of false positives.

    Parameters tuned based on removal of LCR overlapping false positives in
    DREAM synthetic 3 data.
    """
    params = {"end_buffer": 50, "rpt_pct": 0.9, "total_rpt_pct": 0.2, "sv_pct": 0.5}
    if in_params:
        params.update(in_params)
    assert in_file.endswith(".bed")
    out_file = "%s-norepeats%s" % utils.splitext_plus(in_file)
    # repeat overlap sizes found at call ends, keyed by "chrom:start-end"
    rpt_sizes = collections.defaultdict(list)
    removed = 0
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            with shared.bedtools_tmpdir(data):
                base, ext = utils.splitext_plus(tx_out_file)
                for coord, end_name in [(1, "end1"), (2, "end2")]:
                    end_file = _create_end_file(in_file, coord, params,
                                                "%s-%s%s" % (base, end_name, ext))
                    rpt_sizes = _find_to_filter(end_file, exclude_file, params, rpt_sizes)
            # drop calls whose total repeat overlap exceeds the allowed fraction
            max_rpt = params["total_rpt_pct"] * params["end_buffer"]
            with open(tx_out_file, "w") as out_handle:
                with open(in_file) as in_handle:
                    for line in in_handle:
                        key = "%s:%s-%s" % tuple(line.strip().split("\t")[:3])
                        if sum(rpt_sizes.get(key, [0])) <= max_rpt:
                            out_handle.write(line)
                        else:
                            removed += 1
    return out_file, removed
def summarize(calls, data, highdepth_beds):
    """Summarize results from multiple callers into a single flattened BED file.

    calls -- list of per-caller dictionaries; an "sv-ensemble" entry is
    appended when a combined file is produced.
    highdepth_beds -- BED files of high depth regions used to limit calls.
    """
    sample = tz.get_in(["rgnames", "sample"], data)
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural",
                                               sample, "ensemble"))
    with shared.bedtools_tmpdir(data):
        # list comprehension instead of filter(): under Python 3 filter returns
        # a one-shot iterator, breaking the len() checks below
        input_beds = [x for x in (_create_bed(c, sample, work_dir, data) for c in calls)
                      if x is not None and utils.file_exists(x)]
    if len(input_beds) > 0:
        out_file = combine_bed_by_size(input_beds, sample, work_dir, data)
        if utils.file_exists(out_file):
            # only apply ensemble-level filtering with enough supporting callers
            if len(input_beds) > N_FILTER_CALLERS:
                filter_file = _filter_ensemble(out_file, data)
            else:
                filter_file = out_file
            # optionally remove calls in problematic high depth regions
            if len(highdepth_beds) > 0:
                limit_file = _limit_calls(filter_file, highdepth_beds, data)
            else:
                limit_file = filter_file
            bedprep_dir = utils.safe_makedir(os.path.join(os.path.dirname(limit_file), "bedprep"))
            calls.append({"variantcaller": "sv-ensemble",
                          "vrn_file": bedutils.clean_file(limit_file, data,
                                                          bedprep_dir=bedprep_dir)})
    return calls