def get_analysis_intervals(data, vrn_file, base_dir):
    """Retrieve analysis regions for the current variant calling pipeline.

    Candidate sources are checked in priority order, returning the first
    available one: gVCF-derived callable regions, ensemble BED,
    pre-computed sample callable BED, callable regions derived from an
    available BAM, then configured callable/variant regions.
    """
    from bcbio.bam import callable
    # A gVCF input can supply its own callable regions.
    if vrn_file and vcfutils.is_gvcf_file(vrn_file):
        gvcf_bed = _callable_from_gvcf(data, vrn_file, base_dir)
        if gvcf_bed:
            return gvcf_bed
    if data.get("ensemble_bed"):
        return data["ensemble_bed"]
    if dd.get_sample_callable(data):
        return dd.get_sample_callable(data)
    for bam_key in ("align_bam", "work_bam"):
        if data.get(bam_key):
            return callable.sample_callable_bed(data[bam_key],
                                                dd.get_ref_file(data), data)[0]
    if data.get("work_bam_callable"):
        # Re-home the callable BAM under work_bam on a copy so the shared
        # input dictionary is left untouched.
        data = utils.deepish_copy(data)
        data["work_bam"] = data.pop("work_bam_callable")
        return callable.sample_callable_bed(data["work_bam"],
                                            dd.get_ref_file(data), data)[0]
    for region_key in ("callable_regions", "variant_regions"):
        regions = tz.get_in(["config", "algorithm", region_key], data)
        if regions:
            return regions
def get_analysis_intervals(data, vrn_file, base_dir):
    """Retrieve analysis regions for the current variant calling pipeline.

    Sources are consulted in priority order and the first available one
    is returned: gVCF-derived callable regions (when gvcf output is
    requested), ensemble BED, pre-computed callable regions, callable
    regions derived from an available BAM, then configured
    callable/variant regions.
    """
    # gVCF output mode: derive callable regions from the gVCF itself.
    if vrn_file and "gvcf" in dd.get_tools_on(data):
        gvcf_bed = _callable_from_gvcf(data, vrn_file, base_dir)
        if gvcf_bed:
            return gvcf_bed
    if data.get("ensemble_bed"):
        return data["ensemble_bed"]
    if dd.get_callable_regions(data):
        return dd.get_callable_regions(data)
    for bam_key in ("align_bam", "work_bam", "work_bam_callable"):
        if data.get(bam_key):
            return callable.sample_callable_bed(data[bam_key],
                                                dd.get_ref_file(data), data)[0]
    for region_key in ("callable_regions", "variant_regions"):
        regions = tz.get_in(["config", "algorithm", region_key], data)
        if regions:
            return regions
def get_analysis_intervals(data):
    """Retrieve analysis regions for the current variant calling pipeline.

    Returns the first available intervals source: ensemble BED, callable
    regions derived from an available BAM file, or configured
    callable/variant regions; None when nothing is available.
    """
    if data.get("ensemble_bed"):
        return data["ensemble_bed"]
    for bam_key in ("align_bam", "work_bam", "work_bam_callable"):
        if data.get(bam_key):
            return callable.sample_callable_bed(data[bam_key],
                                                dd.get_ref_file(data), data)
    for region_key in ("callable_regions", "variant_regions"):
        intervals = data["config"]["algorithm"].get(region_key)
        if intervals:
            return intervals
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.

    Prepares list of callable genome regions allowing subsequent
    parallelization. Returns the sample wrapped as [[data]].
    """
    # Only act on real BAM inputs that still need post-alignment work;
    # other samples pass through unchanged.
    if vmulti.bam_needs_processing(data) and data["work_bam"].endswith(".bam"):
        ref_file = dd.get_ref_file(data)
        # callable_region_bed is reused below as a fallback for the
        # variant_regions configuration.
        callable_region_bed, nblock_bed, callable_bed = \
            callable.block_regions(data["work_bam"], ref_file, data)
        highdepth_bed = highdepth.identify(data)
        # Index the working BAM before the region calculations below.
        bam.index(data["work_bam"], data["config"])
        sample_callable = callable.sample_callable_bed(data["work_bam"], ref_file, data)
        offtarget_stats = callable.calculate_offtarget(data["work_bam"], ref_file, data)
        # Record derived region files on the sample for downstream steps.
        data["regions"] = {"nblock": nblock_bed, "callable": callable_bed,
                           "highdepth": highdepth_bed,
                           "sample_callable": sample_callable,
                           "offtarget_stats": offtarget_stats}
        data = coverage.assign_interval(data)
        # No explicitly configured variant_regions: fall back to the
        # callable regions derived from the BAM, then re-clean BED inputs.
        if (os.path.exists(callable_region_bed) and
                not data["config"]["algorithm"].get("variant_regions")):
            data["config"]["algorithm"]["variant_regions"] = callable_region_bed
            data = bedutils.clean_inputs(data)
        data = _recal_no_markduplicates(data)
    # Nested-list wrapping: single sample, single item.
    return [[data]]
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.

    Prepares list of callable genome regions allowing subsequent
    parallelization. Returns the sample wrapped as [[data]].
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    bam_file = data.get("align_bam") or data.get("work_bam")
    # Only act on real BAM inputs that still need post-alignment work.
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(
            ".bam"):
        ref_file = dd.get_ref_file(data)
        out_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "align",
                         dd.get_sample_name(data)))
        # Stage the BAM into the per-sample align directory via symlink,
        # then index it for the region calculations below.
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        # covinfo bundles callable/coverage outputs; its raw_callable BED
        # feeds block region calculation.
        covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        callable_region_bed, nblock_bed, callable_bed = \
            callable.block_regions(covinfo.raw_callable, bam_file_ready, ref_file, data)
        # Record derived region/coverage files on the sample.
        data["regions"] = {"nblock": nblock_bed, "callable": callable_bed,
                           "highdepth": covinfo.highdepth,
                           "sample_callable": covinfo.callable,
                           "coverage_depth_bed": covinfo.depth,
                           "avg_coverage": covinfo.avg_coverage}
        data = coverage.assign_interval(data)
        # No explicitly configured variant_regions: fall back to the
        # callable regions derived from the BAM, then re-clean BED inputs.
        if (os.path.exists(callable_region_bed) and
                not data["config"]["algorithm"].get("variant_regions")):
            data["config"]["algorithm"]["variant_regions"] = callable_region_bed
            data = clean_inputs(data)
        data = _recal_no_markduplicates(data)
    # Nested-list wrapping: single sample, single item.
    return [[data]]
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.

    Prepares list of callable genome regions allowing subsequent
    parallelization. Returns the sample wrapped as [[data]].
    """
    data = utils.to_single_data(data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    # Only act on real BAM inputs that still need post-alignment work.
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(".bam"):
        ref_file = dd.get_ref_file(data)
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                  dd.get_sample_name(data)))
        # Stage the BAM into the per-sample align directory via symlink,
        # then index it for the region calculations below.
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        # callable_region_bed is reused below as a fallback for the
        # variant_regions configuration.
        callable_region_bed, nblock_bed, callable_bed = \
            callable.block_regions(bam_file_ready, ref_file, data)
        sample_callable = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        offtarget_stats = callable.calculate_offtarget(bam_file_ready, ref_file, data)
        data["regions"] = {"nblock": nblock_bed, "callable": callable_bed,
                           "sample_callable": sample_callable,
                           "offtarget_stats": offtarget_stats}
        data = coverage.assign_interval(data)
        # highdepth identification runs after interval assignment here,
        # then is folded into the regions record.
        highdepth_bed = highdepth.identify(data)
        data["regions"]["highdepth"] = highdepth_bed
        # No explicitly configured variant_regions: fall back to the
        # callable regions derived from the BAM, then re-clean BED inputs.
        if (os.path.exists(callable_region_bed) and
                not data["config"]["algorithm"].get("variant_regions")):
            data["config"]["algorithm"]["variant_regions"] = callable_region_bed
            data = bedutils.clean_inputs(data)
        data = _recal_no_markduplicates(data)
    # Nested-list wrapping: single sample, single item.
    return [[data]]
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.

    Prepares list of callable genome regions allowing subsequent
    parallelization. Returns the sample wrapped as [[data]].
    """
    # Normalize CWL-passed inputs and unpack any tarballed references.
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    # Only act on real BAM inputs that still need post-alignment work.
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(".bam"):
        ref_file = dd.get_ref_file(data)
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                  dd.get_sample_name(data)))
        # Stage the BAM into the per-sample align directory via symlink,
        # then index it for the region calculations below.
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        # covinfo bundles callable/depth outputs; its raw_callable BED
        # feeds block region calculation.
        covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        callable_region_bed, nblock_bed, callable_bed = \
            callable.block_regions(covinfo.raw_callable, bam_file_ready, ref_file, data)
        data["regions"] = {"nblock": nblock_bed, "callable": callable_bed,
                           "sample_callable": covinfo.callable,
                           "mapped_stats": readstats.get_cache_file(data)}
        data["depth"] = covinfo.depth_files
        data = coverage.assign_interval(data)
        # No explicitly configured variant_regions: fall back to the
        # callable regions derived from the BAM, then re-clean BED inputs.
        if (os.path.exists(callable_region_bed) and
                not data["config"]["algorithm"].get("variant_regions")):
            data["config"]["algorithm"]["variant_regions"] = callable_region_bed
            data = clean_inputs(data)
        # BQSR in two explicit phases: compute then apply recalibration.
        data = recalibrate.prep_recal(data)
        data = recalibrate.apply_recal(data)
    # Nested-list wrapping: single sample, single item.
    return [[data]]
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.

    Prepares list of callable genome regions allowing subsequent
    parallelization. Returns the sample wrapped as [[data]].
    """
    # Normalize CWL-passed inputs and unpack any tarballed references.
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    # Only act on real BAM inputs that still need post-alignment work.
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(
            ".bam"):
        ref_file = dd.get_ref_file(data)
        out_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "align",
                         dd.get_sample_name(data)))
        # Stage the BAM into the per-sample align directory via symlink,
        # then index it for the region calculations below.
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        # covinfo bundles callable/depth outputs; its raw_callable BED
        # feeds block region calculation.
        covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        # NOTE(review): callable_region_bed is unused in this version of
        # the function — confirm whether the variant_regions fallback was
        # intentionally dropped.
        callable_region_bed, nblock_bed = \
            callable.block_regions(covinfo.raw_callable, bam_file_ready, ref_file, data)
        data["regions"] = {"nblock": nblock_bed,
                           "callable": covinfo.raw_callable,
                           "sample_callable": covinfo.callable,
                           "mapped_stats": readstats.get_cache_file(data)}
        data["depth"] = covinfo.depth_files
        data = coverage.assign_interval(data)
        data = samtools.run_and_save(data)
        # BQSR in two explicit phases: compute then apply recalibration.
        data = recalibrate.prep_recal(data)
        data = recalibrate.apply_recal(data)
    # Nested-list wrapping: single sample, single item.
    return [[data]]
def get_analysis_intervals(data):
    """Retrieve analysis regions for the current variant calling pipeline.

    Returns the first available intervals source: ensemble BED, callable
    regions derived from an available working BAM (original preferred),
    or configured callable/variant regions; None when nothing is
    available.
    """
    if data.get("ensemble_bed"):
        return data["ensemble_bed"]
    for bam_key in ("work_bam-orig", "work_bam"):
        if data.get(bam_key):
            return callable.sample_callable_bed(
                data[bam_key],
                utils.get_in(data, ("reference", "fasta", "base")),
                data["config"])
    for region_key in ("callable_regions", "variant_regions"):
        intervals = data["config"]["algorithm"].get(region_key)
        if intervals:
            return intervals
def get_analysis_intervals(data):
    """Retrieve analysis regions for the current variant calling pipeline.

    Returns the first available intervals source: ensemble BED, callable
    regions derived from an available BAM (aligned preferred over
    working), or configured callable/variant regions; None when nothing
    is available.
    """
    if data.get("ensemble_bed"):
        return data["ensemble_bed"]
    for bam_key in ("align_bam", "work_bam"):
        if data.get(bam_key):
            return callable.sample_callable_bed(
                data[bam_key],
                utils.get_in(data, ("reference", "fasta", "base")),
                data["config"])
    for region_key in ("callable_regions", "variant_regions"):
        intervals = data["config"]["algorithm"].get(region_key)
        if intervals:
            return intervals
def get_analysis_intervals(data):
    """Retrieve analysis regions for the current variant calling pipeline.

    Uses the callable BAM when present, otherwise falls back to
    configured callable/variant regions; None when nothing is available.
    """
    bam_file = data.get("callable_bam")
    if bam_file:
        return callable.sample_callable_bed(bam_file, data["sam_ref"],
                                            data["config"])
    algorithm = data["config"]["algorithm"]
    for region_key in ("callable_regions", "variant_regions"):
        intervals = algorithm.get(region_key)
        if intervals:
            return intervals
def get_analysis_intervals(data):
    """Retrieve analysis regions for the current variant calling pipeline.

    Sources are consulted in priority order and the first available one
    is returned: ensemble BED, pre-computed callable regions, callable
    regions derived from an available BAM, then configured
    callable/variant regions.
    """
    if data.get("ensemble_bed"):
        return data["ensemble_bed"]
    if dd.get_callable_regions(data):
        return dd.get_callable_regions(data)
    for bam_key in ("align_bam", "work_bam", "work_bam_callable"):
        if data.get(bam_key):
            return callable.sample_callable_bed(data[bam_key],
                                                dd.get_ref_file(data), data)
    for region_key in ("callable_regions", "variant_regions"):
        regions = tz.get_in(["config", "algorithm", region_key], data)
        if regions:
            return regions
def get_analysis_intervals(data, vrn_file, base_dir):
    """Retrieve analysis regions for the current variant calling pipeline.

    Sources are consulted in priority order and the first available one
    is returned: gVCF-derived callable regions (when gvcf output is
    requested), ensemble BED, pre-computed callable regions, callable
    regions derived from an available BAM, then configured
    callable/variant regions.
    """
    # gVCF output mode: derive callable regions from the gVCF itself.
    if vrn_file and "gvcf" in dd.get_tools_on(data):
        gvcf_bed = _callable_from_gvcf(data, vrn_file, base_dir)
        if gvcf_bed:
            return gvcf_bed
    if data.get("ensemble_bed"):
        return data["ensemble_bed"]
    if dd.get_callable_regions(data):
        return dd.get_callable_regions(data)
    for bam_key in ("align_bam", "work_bam", "work_bam_callable"):
        if data.get(bam_key):
            return callable.sample_callable_bed(data[bam_key],
                                                dd.get_ref_file(data), data)
    for region_key in ("callable_regions", "variant_regions"):
        regions = tz.get_in(["config", "algorithm", region_key], data)
        if regions:
            return regions
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.

    Prepares list of callable genome regions allowing subsequent
    parallelization. Returns the sample wrapped as [[data]].
    """
    # Only act on real BAM inputs that still need post-alignment work;
    # other samples pass through unchanged.
    if vmulti.bam_needs_processing(data) and data["work_bam"].endswith(".bam"):
        ref_file = dd.get_ref_file(data)
        # callable_region_bed is reused below as a fallback for the
        # variant_regions configuration.
        callable_region_bed, nblock_bed, callable_bed = \
            callable.block_regions(data["work_bam"], ref_file, data)
        highdepth_bed = highdepth.identify(data)
        # NOTE(review): unlike sibling versions, this one does not index
        # the BAM before the region queries below — confirm an index is
        # guaranteed upstream.
        sample_callable = callable.sample_callable_bed(data["work_bam"], ref_file, data)
        offtarget_stats = callable.calculate_offtarget(data["work_bam"], ref_file, data)
        # Record derived region files on the sample for downstream steps.
        data["regions"] = {"nblock": nblock_bed, "callable": callable_bed,
                           "highdepth": highdepth_bed,
                           "sample_callable": sample_callable,
                           "offtarget_stats": offtarget_stats}
        data = coverage.assign_interval(data)
        # No explicitly configured variant_regions: fall back to the
        # callable regions derived from the BAM, then re-clean BED inputs.
        if (os.path.exists(callable_region_bed) and
                not data["config"]["algorithm"].get("variant_regions")):
            data["config"]["algorithm"]["variant_regions"] = callable_region_bed
            data = bedutils.clean_inputs(data)
        data = _recal_no_markduplicates(data)
    # Nested-list wrapping: single sample, single item.
    return [[data]]