def _scalpel_bed_file_opts(items, config, out_file, region, tmp_path):
    """Build the ``--bed`` arguments that restrict a Scalpel run to target regions.

    The target comes from subsetting the population variant regions by
    ``region``. A target that is an existing file is used directly; anything
    else is written out as a single-line BED file, ``tmp.bed`` inside
    ``tmp_path``, holding the ``(chrom, start, end)`` values of ``region``.
    If any sample reports a "genome" coverage interval, the BED file is then
    passed through ``shared.remove_highdepth_regions`` and
    ``shared.remove_lcr_regions``.

    Returns ``["--bed", bed_path]``, or an empty list when no target applies.
    Raises ValueError when a BED file has to be written but ``region`` is not
    a list or tuple.
    """
    pop_regions = bedutils.population_variant_regions(items)
    target = shared.subset_variant_regions(pop_regions, region, out_file, items)
    if not target:
        return []

    if isinstance(target, basestring) and os.path.isfile(target):
        bed_path = target
    else:
        bed_path = os.path.join(tmp_path, "tmp.bed")
        # An already written temporary BED file is reused as-is.
        if not utils.file_exists(bed_path):
            with file_transaction(config, bed_path) as tx_bed_path:
                if not isinstance(region, (list, tuple)):
                    raise ValueError("Region must be a tuple - something odd just happened")
                chrom, start, end = region
                with open(tx_bed_path, "w") as handle:
                    handle.write("%s\t%s\t%s\n" % (chrom, start, end))

    whole_genome = any(dd.get_coverage_interval(x) == "genome" for x in items)
    if whole_genome:
        bed_path = shared.remove_highdepth_regions(bed_path, items)
        bed_path = shared.remove_lcr_regions(bed_path, items)
    return ["--bed", bed_path]
def prepare_exclude_file(items, base_file, chrom=None):
    """Prepare a BED file for exclusion.

    Excludes high depth and centromere regions which contribute to long run
    times and false positive structural variant calls.

    The output is named after ``base_file`` with an ``-exclude`` suffix (plus
    ``-<chrom>`` when ``chrom`` is given). If that file, or a ``.gz`` version
    of it, already exists, it is not regenerated. Otherwise the regions to
    keep are computed first and the output is the full reference minus those.

    Returns the path of the exclusion BED file.
    """
    chrom_suffix = "-%s" % chrom if chrom else ""
    out_file = "%s-exclude%s.bed" % (utils.splitext_plus(base_file)[0], chrom_suffix)
    if utils.file_exists(out_file) or utils.file_exists(out_file + ".gz"):
        return out_file

    first = items[0]
    with shared.bedtools_tmpdir(first):
        # Get a bedtool for the full region if no variant regions
        keep = callable.get_ref_bedtool(tz.get_in(["reference", "fasta", "base"], first),
                                        first["config"], chrom)
        if chrom:
            chrom_bed = shared.subset_bed_by_chrom(keep.saveas().fn, chrom, first)
            keep = pybedtools.BedTool(chrom_bed)
        sv_exclude_bed = _get_sv_exclude_file(items)
        if sv_exclude_bed and len(keep) > 0:
            keep = keep.subtract(sv_exclude_bed, nonamecheck=True).saveas()
        if any(dd.get_coverage_interval(d) == "genome" for d in items):
            no_highdepth = shared.remove_highdepth_regions(keep.saveas().fn, items)
            keep = pybedtools.BedTool(no_highdepth)
        with file_transaction(first, out_file) as tx_out_file:
            full = callable.get_ref_bedtool(tz.get_in(["reference", "fasta", "base"], first),
                                            first["config"])
            # With nothing to keep, the whole reference is excluded.
            if len(keep) > 0:
                full.subtract(keep, nonamecheck=True).saveas(tx_out_file)
            else:
                full.saveas(tx_out_file)
    return out_file
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Prepare standard FreeBayes options from configuration input.

    Input BED target files are merged to avoid overlapping regions which
    cause FreeBayes to call multiple times.

    Checks for empty sets of target regions after filtering for high depth,
    in which case we should skip the FreeBayes run.

    Returns a tuple ``(opts, no_target_regions)``: the list of command line
    options, and a flag set when the high depth filtered target file is empty.
    """
    ploidy_arg = str(ploidy.get_ploidy(items, region))
    cmd_opts = ["--genotype-qualities", "--strict-vcf", "--ploidy", ploidy_arg]
    merged_regions = bedutils.merge_overlaps(
        bedutils.population_variant_regions(items), items[0])
    # Produce gVCF output
    wants_gvcf = any("gvcf" in dd.get_tools_on(d) for d in items)
    if wants_gvcf:
        cmd_opts.extend(["--gvcf", "--gvcf-chunk", "50000"])

    no_target_regions = False
    target = shared.subset_variant_regions(merged_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            is_genome = any(
                tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower() == "genome"
                for x in items)
            if is_genome:
                target = shared.remove_highdepth_regions(target, items)
                # Nothing left after filtering: signal the caller to skip.
                no_target_regions = os.path.getsize(target) == 0
            cmd_opts.extend(["--targets", target])
        else:
            cmd_opts.extend(["--region", region_to_freebayes(target)])

    # User supplied options go last.
    user_opts = config_utils.get_resources("freebayes", config)
    if user_opts.get("options"):
        cmd_opts.extend(user_opts["options"])
    return cmd_opts, no_target_regions
def summarize(calls, data, items):
    """Summarize results from multiple callers into a single flattened BED file.

    Approach:
      - Combine all calls found in all files
      - Filter files retaining those present with multiple levels of support.
      - Remove calls in high depth regions.
      - Remove calls with ends overlapping exclusion regions like low complexity regions.

    When a final summarized file exists, an "sv-ensemble" entry is appended
    to ``calls`` in place, carrying the contributing ``input_beds`` and the
    cleaned ``vrn_file``. Returns ``calls``.
    """
    sample = tz.get_in(["rgnames", "sample"], data)
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural", sample, "ensemble"))
    with shared.bedtools_tmpdir(data):
        caller_beds = [(c["variantcaller"], _create_bed(c, sample, work_dir, calls, data))
                       for c in calls]
        # Build a real list rather than using filter(): the result is measured
        # with len() and stored in the output, neither of which works with the
        # lazy iterator that filter() returns on Python 3.
        input_beds = [xs for xs in caller_beds
                      if xs[1] is not None and utils.file_exists(xs[1])]
    if input_beds:
        out_file = combine_bed_by_size([xs[1] for xs in input_beds], sample, work_dir, data)
        if utils.file_exists(out_file):
            # Only require multiple levels of support when enough callers contributed.
            if len(input_beds) > N_FILTER_CALLERS:
                filter_file = _filter_ensemble(out_file, data)
            else:
                filter_file = out_file
            limit_file = shared.remove_highdepth_regions(filter_file, items)
            # Use the first exclusion file supplied by any caller, if there is one.
            exclude_files = [f for f in [x.get("exclude_file") for x in calls] if f]
            exclude_file = exclude_files[0] if exclude_files else None
            if exclude_file:
                noexclude_file, _ = sshared.exclude_by_ends(limit_file, exclude_file, data)
            else:
                noexclude_file = limit_file
            bedprep_dir = utils.safe_makedir(os.path.join(os.path.dirname(noexclude_file), "bedprep"))
            if utils.file_exists(noexclude_file):
                calls.append({"variantcaller": "sv-ensemble",
                              "input_beds": input_beds,
                              "vrn_file": bedutils.clean_file(noexclude_file, data,
                                                              bedprep_dir=bedprep_dir)})
    return calls
def prepare_exclude_file(items, base_file, chrom=None):
    """Prepare a BED file for exclusion, optionally restricted to a chromosome.

    Excludes locally repetitive regions (if `remove_lcr` is set) and
    centromere regions, both of which contribute to long run times and false
    positive structural variant calls.

    The output is named after ``base_file`` with an ``-exclude`` suffix (plus
    ``-<chrom>`` when ``chrom`` is given). If that file, or a ``.gz`` version
    of it, already exists, it is not regenerated. Otherwise the regions to
    keep are computed first and the output is the full reference minus those.

    Returns the path of the exclusion BED file.
    """
    chrom_suffix = "-%s" % chrom if chrom else ""
    out_file = "%s-exclude%s.bed" % (utils.splitext_plus(base_file)[0], chrom_suffix)
    if utils.file_exists(out_file) or utils.file_exists(out_file + ".gz"):
        return out_file

    first = items[0]
    with shared.bedtools_tmpdir(first):
        # Get a bedtool for the full region if no variant regions
        keep = callable.get_ref_bedtool(tz.get_in(["reference", "fasta", "base"], first),
                                        first["config"], chrom)
        if chrom:
            chrom_bed = shared.subset_bed_by_chrom(keep.saveas().fn, chrom, first)
            keep = pybedtools.BedTool(chrom_bed)
        lcr_bed = shared.get_lcr_bed(items)
        if lcr_bed:
            keep = keep.subtract(pybedtools.BedTool(lcr_bed))
        sv_exclude_bed = _get_sv_exclude_file(items)
        if sv_exclude_bed and len(keep) > 0:
            keep = keep.subtract(sv_exclude_bed).saveas()
        no_highdepth = shared.remove_highdepth_regions(keep.saveas().fn, items)
        keep = pybedtools.BedTool(no_highdepth)
        with file_transaction(first, out_file) as tx_out_file:
            full = callable.get_ref_bedtool(tz.get_in(["reference", "fasta", "base"], first),
                                            first["config"])
            # With nothing to keep, the whole reference is excluded.
            if len(keep) > 0:
                full.subtract(keep).saveas(tx_out_file)
            else:
                full.saveas(tx_out_file)
    return out_file
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Prepare standard FreeBayes options from configuration input.

    Input BED target files are merged to avoid overlapping regions which
    cause FreeBayes to call multiple times.

    Checks for empty sets of target regions after filtering for high depth,
    in which case we should skip the FreeBayes run.

    Returns a tuple ``(opts, no_target_regions)``: the list of command line
    options, and a flag set when the high depth filtered target file is empty.
    """
    ploidy_arg = str(ploidy.get_ploidy(items, region))
    cmd_opts = ["--genotype-qualities", "--ploidy", ploidy_arg]
    configured_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    merged_regions = bedutils.merge_overlaps(configured_regions, items[0])
    # Produce gVCF output
    wants_gvcf = any("gvcf" in dd.get_tools_on(d) for d in items)
    if wants_gvcf:
        cmd_opts.extend(["--gvcf", "--gvcf-chunk", "50000"])

    no_target_regions = False
    target = shared.subset_variant_regions(merged_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            is_genome = any(
                tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower() == "genome"
                for x in items)
            if is_genome:
                target = shared.remove_highdepth_regions(target, items)
                # Nothing left after filtering: signal the caller to skip.
                no_target_regions = os.path.getsize(target) == 0
            cmd_opts.extend(["--targets", target])
        else:
            cmd_opts.extend(["--region", region_to_freebayes(target)])

    # User supplied options go last.
    user_opts = config_utils.get_resources("freebayes", config)
    if user_opts.get("options"):
        cmd_opts.extend(user_opts["options"])
    return cmd_opts, no_target_regions
def _vardict_options_from_config(items, config, out_file, target=None):
    """Assemble the VarDict command line options, ending with the target BED.

    Starts from fixed column options, appends any user supplied "vardict"
    resource options, then the target file. ``target`` must be a BED file
    (asserted via ``_is_bed_file``). When any sample has a "genome" coverage
    interval it is passed through ``shared.remove_highdepth_regions`` and
    ``shared.remove_lcr_regions``; it is always passed through
    ``_enforce_max_region_size``.

    Returns the list of options with the target file as the final element.
    """
    cmd_opts = ["-c 1", "-S 2", "-E 3", "-g 4"]
    # ["-z", "-F", "-c", "1", "-S", "2", "-E", "3", "-g", "4", "-x", "0",
    #  "-k", "3", "-r", "4", "-m", "8"]
    user_opts = config_utils.get_resources("vardict", config)
    if user_opts.get("options"):
        cmd_opts.extend(user_opts["options"])

    assert _is_bed_file(target)
    is_genome = any(
        tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower() == "genome"
        for x in items)
    if is_genome:
        target = shared.remove_highdepth_regions(target, items)
        target = shared.remove_lcr_regions(target, items)
    target = _enforce_max_region_size(target, items[0])
    # this must be the last option
    cmd_opts.append(target)
    return cmd_opts
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Prepare standard options from configuration input.

    Input BED target files are merged to avoid overlapping regions which
    cause FreeBayes to call multiple times.

    Checks for empty sets of target regions after filtering for high depth,
    in which case we should skip the FreeBayes run.

    Returns a tuple ``(opts, no_target_regions)``: the list of command line
    options (user supplied resource options last), and a flag set when the
    high depth filtered target file is empty.
    """
    opts = ["--genotype-qualities", "--strict-vcf"]
    cur_ploidy = ploidy.get_ploidy(items, region)
    base_ploidy = ploidy.get_ploidy(items)
    opts += ["--ploidy", str(cur_ploidy)]
    # User options are appended at the very end, so they have to be inspected
    # directly here: looking for the flag in `opts` at this point could never
    # find a user supplied value.
    resources = config_utils.get_resources("freebayes", config)
    user_opts = resources.get("options") or []
    user_sets_min_fraction = any(
        str(o).split("=")[0] in ("-F", "--min-alternate-fraction") for o in user_opts)
    # Adjust min fraction when trying to call more sensitively in certain
    # regions. This is primarily meant for pooled mitochondrial calling.
    if (isinstance(region, (list, tuple)) and chromhacks.is_mitochondrial(region[0])
            and cur_ploidy >= base_ploidy and not user_sets_min_fraction):
        opts += ["--min-alternate-fraction", "0.01"]
    variant_regions = bedutils.merge_overlaps(
        bedutils.population_variant_regions(items), items[0])
    # Produce gVCF output
    if any("gvcf" in dd.get_tools_on(d) for d in items):
        opts += ["--gvcf", "--gvcf-chunk", "50000"]
    no_target_regions = False
    target = shared.subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            if any(tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower() == "genome"
                   for x in items):
                target = shared.remove_highdepth_regions(target, items)
                # Nothing left after filtering: signal the caller to skip.
                if os.path.getsize(target) == 0:
                    no_target_regions = True
            opts += ["--targets", target]
        else:
            opts += ["--region", region_to_freebayes(target)]
    if user_opts:
        opts += user_opts
    return opts, no_target_regions
def _vardict_options_from_config(items, config, out_file, region=None, do_merge=False):
    """Assemble the VarDict command line options for a region or target file.

    Starts from fixed column options and appends any user supplied "vardict"
    resource options. The configured variant regions are subset by ``region``.
    A resulting file is appended as the final positional option, after
    ``shared.remove_highdepth_regions`` when any sample has a "genome"
    coverage interval. Any other target is passed with ``-R``.

    Returns the list of options.
    """
    cmd_opts = ["-c 1", "-S 2", "-E 3", "-g 4"]
    # ["-z", "-F", "-c", "1", "-S", "2", "-E", "3", "-g", "4", "-x", "0",
    #  "-k", "3", "-r", "4", "-m", "8"]
    user_opts = config_utils.get_resources("vardict", config)
    if user_opts.get("options"):
        cmd_opts.extend(user_opts["options"])

    configured_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    target = shared.subset_variant_regions(configured_regions, region, out_file,
                                           do_merge=do_merge)
    if not target:
        return cmd_opts

    if isinstance(target, basestring) and os.path.isfile(target):
        is_genome = any(
            tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower() == "genome"
            for x in items)
        if is_genome:
            target = shared.remove_highdepth_regions(target, items)
        # this must be the last option
        cmd_opts.append(target)
    else:
        # one-based, end-inclusive coordinates as for Gatk
        cmd_opts.extend(["-R", bamprep.region_to_gatk(target)])
    return cmd_opts
def prepare_exclude_file(items, base_file, chrom=None):
    """Prepare a BED file for exclusion, optionally restricted to a chromosome.

    Excludes locally repetitive regions (if `remove_lcr` is set) and
    centromere regions, both of which contribute to long run times and false
    positive structural variant calls.

    The output path is derived from ``base_file`` (``-exclude``, plus
    ``-<chrom>`` when ``chrom`` is given). An existing output, plain or
    ``.gz``, is returned without being rebuilt.

    Returns the path of the exclusion BED file.
    """
    base_name = utils.splitext_plus(base_file)[0]
    out_file = "%s-exclude%s.bed" % (base_name, "-%s" % chrom if chrom else "")
    already_done = utils.file_exists(out_file) or utils.file_exists(out_file + ".gz")
    if not already_done:
        item = items[0]
        with shared.bedtools_tmpdir(item):
            # Get a bedtool for the full region if no variant regions
            wanted = callable.get_ref_bedtool(
                tz.get_in(["reference", "fasta", "base"], item), item["config"], chrom)
            if chrom:
                wanted = pybedtools.BedTool(
                    shared.subset_bed_by_chrom(wanted.saveas().fn, chrom, item))
            lcr_bed = shared.get_lcr_bed(items)
            if lcr_bed:
                wanted = wanted.subtract(pybedtools.BedTool(lcr_bed))
            sv_exclude_bed = _get_sv_exclude_file(items)
            if sv_exclude_bed and len(wanted) > 0:
                wanted = wanted.subtract(sv_exclude_bed).saveas()
            wanted = pybedtools.BedTool(
                shared.remove_highdepth_regions(wanted.saveas().fn, items))
            with file_transaction(item, out_file) as tx_out_file:
                reference = callable.get_ref_bedtool(
                    tz.get_in(["reference", "fasta", "base"], item), item["config"])
                # The exclusion file is everything in the reference that is not wanted.
                if len(wanted) > 0:
                    reference.subtract(wanted).saveas(tx_out_file)
                else:
                    reference.saveas(tx_out_file)
    return out_file
def _scalpel_bed_file_opts(items, config, out_file, region, tmp_path):
    """Return the ``--bed`` arguments limiting Scalpel to the target regions.

    The population variant regions are subset by ``region``. If the result is
    an existing file it is used directly; otherwise ``region`` itself is
    written as a one-line BED file named ``tmp.bed`` in ``tmp_path`` (unless
    that file is already present). For samples with a "genome" coverage
    interval the BED file is further processed by
    ``shared.remove_highdepth_regions`` and ``shared.remove_lcr_regions``.

    Returns ``["--bed", path]``, or ``[]`` when there is no target.
    Raises ValueError if ``region`` must be written but is not a list/tuple.
    """
    regions = bedutils.population_variant_regions(items)
    target = shared.subset_variant_regions(regions, region, out_file, items)
    if not target:
        return []

    target_is_file = isinstance(target, basestring) and os.path.isfile(target)
    bed_file = target if target_is_file else os.path.join(tmp_path, "tmp.bed")
    if not target_is_file and not utils.file_exists(bed_file):
        with file_transaction(config, bed_file) as tx_bed_file:
            if not isinstance(region, (list, tuple)):
                raise ValueError("Region must be a tuple - something odd just happened")
            chrom, start, end = region
            with open(tx_bed_file, "w") as out_handle:
                out_handle.write("%s\t%s\t%s\n" % (chrom, start, end))

    if any(dd.get_coverage_interval(x) == "genome" for x in items):
        for cleanup in (shared.remove_highdepth_regions, shared.remove_lcr_regions):
            bed_file = cleanup(bed_file, items)
    return ["--bed", bed_file]
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Prepare standard options from configuration input.

    Input BED target files are merged to avoid overlapping regions which
    cause FreeBayes to call multiple times.

    Checks for empty sets of target regions after filtering for high depth,
    in which case we should skip the FreeBayes run.

    Returns a tuple ``(opts, no_target_regions)``: the list of command line
    options (user supplied resource options last), and a flag set when the
    high depth filtered target file is empty.
    """
    opts = ["--genotype-qualities", "--strict-vcf"]
    cur_ploidy = ploidy.get_ploidy(items, region)
    base_ploidy = ploidy.get_ploidy(items)
    opts += ["--ploidy", str(cur_ploidy)]
    # User options are appended at the very end, so they have to be inspected
    # directly here: looking for the flag in `opts` at this point could never
    # find a user supplied value.
    resources = config_utils.get_resources("freebayes", config)
    user_opts = resources.get("options") or []
    user_sets_min_fraction = any(
        str(o).split("=")[0] in ("-F", "--min-alternate-fraction") for o in user_opts)
    # Adjust min fraction when trying to call more sensitively in certain
    # regions. This is primarily meant for pooled mitochondrial calling.
    if (isinstance(region, (list, tuple)) and chromhacks.is_mitochondrial(region[0])
            and cur_ploidy >= base_ploidy and not user_sets_min_fraction):
        opts += ["--min-alternate-fraction", "0.01"]
    variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), items[0])
    # Produce gVCF output
    if any("gvcf" in dd.get_tools_on(d) for d in items):
        opts += ["--gvcf", "--gvcf-chunk", "50000"]
    no_target_regions = False
    target = shared.subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            if any(tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower() == "genome"
                   for x in items):
                target = shared.remove_highdepth_regions(target, items)
                # Nothing left after filtering: signal the caller to skip.
                if os.path.getsize(target) == 0:
                    no_target_regions = True
            opts += ["--targets", target]
        else:
            opts += ["--region", region_to_freebayes(target)]
    if user_opts:
        opts += user_opts
    return opts, no_target_regions
def _vardict_options_from_config(items, config, out_file, target=None):
    """Assemble command line option strings for VarDict and var2vcf.

    VarDict options start from fixed column options plus read filters, then
    any user supplied "vardict" resource options (converted to strings). When
    ``target`` is a BED file it is appended last. Before that it is passed
    through ``shared.remove_highdepth_regions`` and
    ``shared.remove_lcr_regions`` if any sample has a "genome" coverage
    interval, and always through ``_enforce_max_region_size``. The var2vcf
    options are only the user supplied "var2vcf" resource options.

    Returns a tuple of two space-joined strings:
    ``(vardict_options, var2vcf_options)``.
    """
    vardict_opts = ["-c 1", "-S 2", "-E 3", "-g 4"]
    # ["-z", "-F", "-c", "1", "-S", "2", "-E", "3", "-g", "4", "-x", "0",
    #  "-k", "3", "-r", "4", "-m", "8"]
    # remove low mapping quality reads
    vardict_opts.extend(["-Q", "10"])
    # Remove QCfail reads, avoiding high depth repetitive regions
    vardict_opts.extend(["-F", "0x700"])
    vardict_resources = config_utils.get_resources("vardict", config)
    if vardict_resources.get("options"):
        vardict_opts.extend(str(x) for x in vardict_resources["options"])

    var2vcf_opts = []
    var2vcf_resources = config_utils.get_resources("var2vcf", config)
    if var2vcf_resources.get("options"):
        var2vcf_opts.extend(str(x) for x in var2vcf_resources["options"])

    if target and _is_bed_file(target):
        is_genome = any(
            tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower() == "genome"
            for x in items)
        if is_genome:
            target = shared.remove_highdepth_regions(target, items)
            target = shared.remove_lcr_regions(target, items)
        target = _enforce_max_region_size(target, items[0])
        # this must be the last option
        vardict_opts.append(target)
    return " ".join(vardict_opts), " ".join(var2vcf_opts)
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Prepare standard FreeBayes options from configuration input.

    Input BED target files are merged to avoid overlapping regions which
    cause FreeBayes to call multiple times.

    Returns the list of command line options: ploidy, then either
    ``--targets`` (for a target file) or ``--region``, then any user supplied
    "freebayes" resource options.
    """
    cmd_opts = ["--ploidy", str(ploidy.get_ploidy(items, region))]
    configured_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    merged_regions = bedutils.merge_overlaps(configured_regions, items[0])
    target = shared.subset_variant_regions(merged_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            is_genome = any(
                tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower() == "genome"
                for x in items)
            if is_genome:
                target = shared.remove_highdepth_regions(target, items)
            cmd_opts.extend(["--targets", target])
        else:
            cmd_opts.extend(["--region", region_to_freebayes(target)])

    # User supplied options go last.
    user_opts = config_utils.get_resources("freebayes", config)
    if user_opts.get("options"):
        cmd_opts.extend(user_opts["options"])
    return cmd_opts