def _prep_config(items, paired, work_dir): """Run initial configuration, generating a run directory for Manta. """ assert utils.which("configManta.py"), "Could not find installed configManta.py" out_file = os.path.join(work_dir, "runWorkflow.py") if not utils.file_exists(out_file) or _out_of_date(out_file): config_script = os.path.realpath(utils.which("configManta.py")) cmd = [utils.get_program_python("configManta.py"), config_script] if paired: if paired.normal_bam: cmd += ["--normalBam=%s" % paired.normal_bam, "--tumorBam=%s" % paired.tumor_bam] else: cmd += ["--tumorBam=%s" % paired.tumor_bam] else: cmd += ["--bam=%s" % dd.get_align_bam(data) for data in items] data = paired.tumor_data if paired else items[0] cmd += ["--referenceFasta=%s" % dd.get_ref_file(data), "--runDir=%s" % work_dir] if dd.get_coverage_interval(data) not in ["genome"]: cmd += ["--exome"] for region in _maybe_limit_chromosomes(data): cmd += ["--region", region] resources = config_utils.get_resources("manta", data["config"]) if resources.get("options"): cmd += [str(x) for x in resources["options"]] # If we are removing polyX, avoid calling on small indels which require # excessively long runtimes on noisy WGS runs if "polyx" in dd.get_exclude_regions(data): cmd += ["--config", _prep_streamlined_config(config_script, work_dir)] do.run(cmd, "Configure manta SV analysis") return out_file
def sample_callable_bed(bam_file, ref_file, data): """Retrieve callable regions for a sample subset by defined analysis regions. """ from bcbio.heterogeneity import chromhacks CovInfo = collections.namedtuple("CovInfo", "callable, raw_callable, depth_files") noalt_calling = "noalt_calling" in dd.get_tools_on( data) or "altcontigs" in dd.get_exclude_regions(data) def callable_chrom_filter(r): """Filter to callable region, potentially limiting by chromosomes. """ return r.name == "CALLABLE" and (not noalt_calling or chromhacks.is_nonalt(r.chrom)) out_file = "%s-callable_sample.bed" % os.path.splitext(bam_file)[0] with shared.bedtools_tmpdir(data): sv_bed = regions.get_sv_bed(data) callable_bed, depth_files = coverage.calculate(bam_file, data, sv_bed) input_regions_bed = dd.get_variant_regions(data) if not utils.file_uptodate(out_file, callable_bed): with file_transaction(data, out_file) as tx_out_file: callable_regions = pybedtools.BedTool(callable_bed) filter_regions = callable_regions.filter(callable_chrom_filter) if input_regions_bed: if not utils.file_uptodate(out_file, input_regions_bed): input_regions = pybedtools.BedTool(input_regions_bed) filter_regions.intersect( input_regions, nonamecheck=True).saveas(tx_out_file) else: filter_regions.saveas(tx_out_file) return CovInfo(out_file, callable_bed, depth_files)
def _prep_config(items, paired, work_dir): """Run initial configuration, generating a run directory for Manta. """ assert utils.which("configManta.py"), "Could not find installed configManta.py" out_file = os.path.join(work_dir, "runWorkflow.py") if not utils.file_exists(out_file) or _out_of_date(out_file): config_script = os.path.realpath(utils.which("configManta.py")) cmd = [sys.executable, config_script] if paired: if paired.normal_bam: cmd += ["--normalBam=%s" % paired.normal_bam, "--tumorBam=%s" % paired.tumor_bam] else: cmd += ["--tumorBam=%s" % paired.tumor_bam] else: cmd += ["--bam=%s" % dd.get_align_bam(data) for data in items] data = paired.tumor_data if paired else items[0] cmd += ["--referenceFasta=%s" % dd.get_ref_file(data), "--runDir=%s" % work_dir] if dd.get_coverage_interval(data) not in ["genome"]: cmd += ["--exome"] for region in _maybe_limit_chromosomes(data): cmd += ["--region", region] # If we are removing polyX, avoid calling on small indels which require # excessively long runtimes on noisy WGS runs if "polyx" in dd.get_exclude_regions(data): cmd += ["--config", _prep_streamlined_config(config_script, work_dir)] do.run(cmd, "Configure manta SV analysis") return out_file
def _add_config_regions(nblock_regions, ref_regions, data): """Add additional nblock regions based on configured regions to call. Identifies user defined regions which we should not be analyzing. """ input_regions_bed = dd.get_variant_regions(data) if input_regions_bed: input_regions = pybedtools.BedTool(input_regions_bed) # work around problem with single region not subtracted correctly. if len(input_regions) == 1: str_regions = str(input_regions[0]).strip() input_regions = pybedtools.BedTool("%s\n%s" % (str_regions, str_regions), from_string=True) input_nblock = ref_regions.subtract(input_regions, nonamecheck=True) if input_nblock == ref_regions: raise ValueError( "Input variant_region file (%s) " "excludes all genomic regions. Do the chromosome names " "in the BED file match your genome (chr1 vs 1)?" % input_regions_bed) all_intervals = _combine_regions([input_nblock, nblock_regions], ref_regions) else: all_intervals = nblock_regions if "noalt_calling" in dd.get_tools_on( data) or "altcontigs" in dd.get_exclude_regions(data): from bcbio.heterogeneity import chromhacks remove_intervals = ref_regions.filter( lambda r: not chromhacks.is_nonalt(r.chrom)) all_intervals = _combine_regions([all_intervals, remove_intervals], ref_regions) return all_intervals.merge()
def sample_callable_bed(bam_file, ref_file, data): """Retrieve callable regions for a sample subset by defined analysis regions. """ from bcbio.heterogeneity import chromhacks CovInfo = collections.namedtuple("CovInfo", "callable, raw_callable, depth_files") noalt_calling = "noalt_calling" in dd.get_tools_on(data) or "altcontigs" in dd.get_exclude_regions(data) def callable_chrom_filter(r): """Filter to callable region, potentially limiting by chromosomes. """ return r.name == "CALLABLE" and (not noalt_calling or chromhacks.is_nonalt(r.chrom)) out_file = "%s-callable_sample.bed" % os.path.splitext(bam_file)[0] with shared.bedtools_tmpdir(data): sv_bed = regions.get_sv_bed(data) callable_bed, depth_files = coverage.calculate(bam_file, data, sv_bed) input_regions_bed = dd.get_variant_regions(data) if not utils.file_uptodate(out_file, callable_bed): with file_transaction(data, out_file) as tx_out_file: callable_regions = pybedtools.BedTool(callable_bed) filter_regions = callable_regions.filter(callable_chrom_filter) if input_regions_bed: if not utils.file_uptodate(out_file, input_regions_bed): input_regions = pybedtools.BedTool(input_regions_bed) filter_regions.intersect(input_regions, nonamecheck=True).saveas(tx_out_file) else: filter_regions.saveas(tx_out_file) return CovInfo(out_file, callable_bed, depth_files)
def _add_config_regions(nblock_regions, ref_regions, data): """Add additional nblock regions based on configured regions to call. Identifies user defined regions which we should not be analyzing. """ input_regions_bed = dd.get_variant_regions(data) if input_regions_bed: input_regions = pybedtools.BedTool(input_regions_bed) # work around problem with single region not subtracted correctly. if len(input_regions) == 1: str_regions = str(input_regions[0]).strip() input_regions = pybedtools.BedTool("%s\n%s" % (str_regions, str_regions), from_string=True) input_nblock = ref_regions.subtract(input_regions, nonamecheck=True) if input_nblock == ref_regions: raise ValueError("Input variant_region file (%s) " "excludes all genomic regions. Do the chromosome names " "in the BED file match your genome (chr1 vs 1)?" % input_regions_bed) all_intervals = _combine_regions([input_nblock, nblock_regions], ref_regions) else: all_intervals = nblock_regions if "noalt_calling" in dd.get_tools_on(data) or "altcontigs" in dd.get_exclude_regions(data): from bcbio.heterogeneity import chromhacks remove_intervals = ref_regions.filter(lambda r: not chromhacks.is_nonalt(r.chrom)) all_intervals = _combine_regions([all_intervals, remove_intervals], ref_regions) return all_intervals.merge()
def add_highdepth_genome_exclusion(items): """Add exclusions to input items to avoid slow runtimes on whole genomes. """ out = [] for d in items: d = utils.deepish_copy(d) if dd.get_coverage_interval(d) == "genome": e = dd.get_exclude_regions(d) if "highdepth" not in e: e.append("highdepth") d = dd.set_exclude_regions(d, e) out.append(d) return out
def _maybe_limit_chromosomes(data): """Potentially limit chromosomes to avoid problematically named HLA contigs. HLAs have ':' characters in them which confuse downstream processing. If we have no problematic chromosomes we don't limit anything. """ std_chroms = [] prob_chroms = [] noalt_calling = "noalt_calling" in dd.get_tools_on(data) or "altcontigs" in dd.get_exclude_regions(data) for contig in ref.file_contigs(dd.get_ref_file(data)): if contig.name.find(":") > 0 or (noalt_calling and not chromhacks.is_nonalt(contig.name)): prob_chroms.append(contig.name) else: std_chroms.append(contig.name) if len(prob_chroms) > 0: return std_chroms else: return []
def _maybe_limit_chromosomes(data): """Potentially limit chromosomes to avoid problematically named HLA contigs. HLAs have ':' characters in them which confuse downstream processing. If we have no problematic chromosomes we don't limit anything. """ std_chroms = [] prob_chroms = [] noalt_calling = "noalt_calling" in dd.get_tools_on(data) or "altcontigs" in dd.get_exclude_regions(data) for contig in ref.file_contigs(dd.get_ref_file(data)): if contig.name.find(":") > 0 or (noalt_calling and not chromhacks.is_nonalt(contig.name)): prob_chroms.append(contig.name) else: std_chroms.append(contig.name) if len(prob_chroms) > 0: return std_chroms else: return []
def _get_sample_excludes(d): excludes = dd.get_exclude_regions(d) # back compatible if tz.get_in(("config", "algorithm", "remove_lcr"), d, False): excludes.append("lcr") return excludes