def prepare_exclude_file(items, base_file, chrom=None):
    """Create (or reuse) the BED file of regions to skip during SV calling.

    The output is the full reference minus the regions we want to analyze:
    the reference (optionally limited to `chrom`) subset by variant regions,
    with any SV exclusion regions subtracted. When no wanted regions remain,
    the whole reference is written as the exclusion file.

    Returns the path of the exclusion BED. An existing output, or an existing
    file with the same name plus `.gz`, is reused without recomputation.
    """
    items = shared.add_highdepth_genome_exclusion(items)
    chrom_suffix = "-%s" % chrom if chrom else ""
    out_file = "%s-exclude%s.bed" % (utils.splitext_plus(base_file)[0], chrom_suffix)
    if utils.file_exists(out_file) or utils.file_exists(out_file + ".gz"):
        return out_file
    with shared.bedtools_tmpdir(items[0]):
        with file_transaction(items[0], out_file) as tx_out_file:
            # Start from the reference (single chromosome if requested) and
            # narrow it down to the variant regions of interest.
            ref_bedtool = callable.get_ref_bedtool(
                tz.get_in(["reference", "fasta", "base"], items[0]),
                items[0]["config"], chrom)
            subset_bed = shared.subset_variant_regions(
                ref_bedtool.saveas().fn, chrom, tx_out_file, items)
            want_bedtool = pybedtools.BedTool(subset_bed)
            sv_exclude_bed = _get_sv_exclude_file(items)
            if sv_exclude_bed and len(want_bedtool) > 0:
                want_bedtool = want_bedtool.subtract(
                    sv_exclude_bed, nonamecheck=True).saveas()
            full_bedtool = callable.get_ref_bedtool(
                tz.get_in(["reference", "fasta", "base"], items[0]),
                items[0]["config"])
            # Exclusion = everything in the reference we do not want to call in.
            if len(want_bedtool) > 0:
                full_bedtool.subtract(want_bedtool, nonamecheck=True).saveas(tx_out_file)
            else:
                full_bedtool.saveas(tx_out_file)
    return out_file
def _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores=1):
    """Index inputs and assemble the command line shared by GATK callers.

    Indexes each BAM and the reference, then builds the parameter list:
    reference, optional low-coverage confidence threshold, annotations,
    input BAMs, interval restriction and the standard command line params.
    The interval-set-rule flag spelling depends on the runner's GATK type.

    `num_cores` is accepted but not used in this function.

    Returns a tuple of (broad_runner, params).
    """
    config = items[0]["config"]
    broad_runner = broad.runner_from_config(config)
    gatk_type = broad_runner.gatk_type()
    for bam_file in align_bams:
        bam.index(bam_file, config)
    broad.runner_from_path("picard", config).run_fn("picard_index_ref", ref_file)

    params = ["-R", ref_file]
    min_depth = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if min_depth and min_depth < 4:
        # Low minimum coverage requested: relax the calling confidence.
        params.extend(["--standard_min_confidence_threshold_for_calling", "4.0"])
    for annot in annotation.get_gatk_annotations(config):
        params.extend(["--annotation", annot])
    for bam_file in align_bams:
        params.extend(["-I", bam_file])

    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        rule_flag = "--interval-set-rule" if gatk_type == "gatk4" else "--interval_set_rule"
        params.extend(["-L", bamprep.region_to_gatk(region), rule_flag, "INTERSECTION"])
    params.extend(standard_cl_params(items))
    return broad_runner, params
def run_cortex(align_bam, ref_file, config, dbsnp=None, region=None, out_file=None):
    """Top level entry to regional de-novo based variant calling with cortex_var.

    Runs cortex_var on each line of the target BED file, combines the
    per-region VCFs into a "-raw" intermediate file and selects the final
    variants into `out_file`. Writes an empty VCF when the subset target
    regions are not an existing file.

    `dbsnp` is accepted but not used in this function.

    Returns the path to the output VCF.
    Raises ValueError if no `variant_regions` are configured.
    """
    broad_runner = broad.runner_from_config(config)
    if out_file is None:
        out_file = "%s-cortex.vcf" % os.path.splitext(align_bam)[0]
    if region is not None:
        work_dir = safe_makedir(os.path.join(os.path.dirname(out_file),
                                             region.replace(".", "_")))
    else:
        work_dir = os.path.dirname(out_file)
    if not file_exists(out_file):
        broad_runner.run_fn("picard_index", align_bam)
        variant_regions = config["algorithm"].get("variant_regions", None)
        if not variant_regions:
            raise ValueError("Only support regional variant calling with cortex_var: set variant_regions")
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        if os.path.isfile(target_regions):
            with open(target_regions) as in_handle:
                # Each BED line contributes its first three columns as the region.
                regional_vcfs = [_run_cortex_on_region(x.strip().split("\t")[:3], align_bam,
                                                       ref_file, work_dir, out_file, config)
                                 for x in in_handle]
            # Argument unpacking replaces the `apply` builtin, which Python 3 removed.
            combine_file = "{0}-raw{1}".format(*os.path.splitext(out_file))
            _combine_variants(regional_vcfs, combine_file, ref_file, config)
            _select_final_variants(combine_file, out_file, config)
        else:
            write_empty_vcf(out_file)
    return out_file
def prepare_exclude_file(items, base_file, chrom=None):
    """Prepare a BED file of regions to exclude from structural variant calling.

    Wanted regions are the reference (restricted to `chrom` when given)
    subset by variant regions, minus any SV exclusion BED. The exclusion file
    written is the full reference with those wanted regions subtracted, or
    the full reference itself when no wanted regions are left.

    Skips all work if the output, or the output name plus `.gz`, already
    exists. Returns the output BED path.
    """
    items = shared.add_highdepth_genome_exclusion(items)
    base_name = utils.splitext_plus(base_file)[0]
    out_file = "%s-exclude%s.bed" % (base_name, "-%s" % chrom if chrom else "")
    needs_run = not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz")
    if needs_run:
        with shared.bedtools_tmpdir(items[0]):
            with file_transaction(items[0], out_file) as tx_out_file:
                # Reference-wide (or per-chromosome) regions, narrowed to variant regions
                chrom_ref = callable.get_ref_bedtool(
                    tz.get_in(["reference", "fasta", "base"], items[0]),
                    items[0]["config"], chrom)
                want_bedtool = pybedtools.BedTool(
                    shared.subset_variant_regions(chrom_ref.saveas().fn, chrom,
                                                  tx_out_file, items))
                sv_exclude_bed = _get_sv_exclude_file(items)
                if sv_exclude_bed and len(want_bedtool) > 0:
                    trimmed = want_bedtool.subtract(sv_exclude_bed, nonamecheck=True)
                    want_bedtool = trimmed.saveas()
                full_bedtool = callable.get_ref_bedtool(
                    tz.get_in(["reference", "fasta", "base"], items[0]),
                    items[0]["config"])
                if len(want_bedtool) > 0:
                    to_write = full_bedtool.subtract(want_bedtool, nonamecheck=True)
                else:
                    to_write = full_bedtool
                to_write.saveas(tx_out_file)
    return out_file
def run_cortex(align_bam, ref_file, config, dbsnp=None, region=None, out_file=None):
    """Run regional de-novo variant calling with cortex_var.

    Calls cortex_var once per line of the target regions BED file and
    combines the per-region VCFs into `out_file`. When the subset target
    regions are not an existing file, an empty VCF is written instead.

    `dbsnp` is accepted but not used in this function.

    Returns the output VCF path; raises ValueError when `variant_regions`
    is not configured.
    """
    broad_runner = broad.runner_from_config(config)
    if out_file is None:
        out_file = "%s-cortex.vcf" % os.path.splitext(align_bam)[0]
    if file_exists(out_file):
        return out_file

    broad_runner.run_fn("picard_index", align_bam)
    variant_regions = config["algorithm"].get("variant_regions", None)
    if not variant_regions:
        raise ValueError(
            "Only regional variant calling with cortex_var is supported. Set variant_regions"
        )
    target_regions = subset_variant_regions(variant_regions, region, out_file)
    if not os.path.isfile(target_regions):
        write_empty_vcf(out_file)
        return out_file

    regional_vcfs = []
    with open(target_regions) as in_handle:
        for bed_line in in_handle:
            # First three tab-separated columns identify the region
            coords = bed_line.strip().split("\t")[:3]
            regional_vcfs.append(
                _run_cortex_on_region(coords, align_bam, ref_file, out_file, config))
    combine_variant_files(regional_vcfs, out_file, ref_file, config)
    return out_file
def _scalpel_options_from_config(items, config, out_file, region, tmp_path):
    """Build the Scalpel command line options from configuration.

    Always requests VCF output limited to target regions. The target is
    either an existing BED file from the subset variant regions, or a
    temporary single-line BED written from the `region` tuple. User supplied
    resource options are appended, and `--outratio` is set from
    `min_allele_fraction` (percent, default 10) unless already present.

    Returns the list of options.
    Raises ValueError if a temporary BED is needed but `region` is not a
    list or tuple.
    """
    opts = []
    opts += ["--format", "vcf", "--intarget"]  # output vcf, report only variants within bed regions
    variant_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    target = subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            opts += ["--bed", target]
        else:
            tmp_bed = os.path.join(tmp_path, "tmp.bed")
            with file_transaction(tmp_bed) as tx_tmp_bed:
                if not isinstance(region, (list, tuple)):
                    message = ("Region must be a tuple - something odd just happened")
                    raise ValueError(message)
                chrom, start, end = region
                # file_transaction yields a path, not a handle: open it before writing.
                with open(tx_tmp_bed, "w") as out_handle:
                    out_handle.write("%s\t%s\t%s\n" % (chrom, start, end))
            opts += ["--bed", tmp_bed]
    resources = config_utils.get_resources("scalpel", config)
    if resources.get("options"):
        opts += resources["options"]
    if "--outratio" not in " ".join(opts):
        # add minimum reportable allele frequency, for which Scalpel defaults to 5
        # but other somatic tools in bcbio default to 10
        min_af = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
        opts += ["--outratio", str(min_af)]
    return opts
def shared_variantcall(call_fn, name, align_bams, ref_file, config, assoc_files,
                       region=None, out_file=None):
    """Index inputs and run a variant caller, or write an empty VCF.

    All BAMs are indexed up front. If the output does not exist, `call_fn`
    is invoked inside a file transaction. An empty VCF is written instead
    when configured variant regions subset to a string that is not an
    existing file, or when any BAM has no aligned reads in `region`.

    `assoc_files` is accepted but not used in this function.

    Returns the output VCF path.
    """
    broad_runner = broad.runner_from_config(config)
    for bam_file in align_bams:
        broad_runner.run_fn("picard_index", bam_file)
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if file_exists(out_file):
        return out_file

    logger.info("Genotyping with {name}: {region} {fname}".format(
        name=name, region=region, fname=os.path.basename(align_bams[0])))
    variant_regions = config["algorithm"].get("variant_regions", None)
    target_regions = subset_variant_regions(variant_regions, region, out_file)
    missing_targets = (variant_regions is not None
                       and isinstance(target_regions, basestring)
                       and not os.path.isfile(target_regions))
    if missing_targets or not all(realign.has_aligned_reads(x, region) for x in align_bams):
        vcfutils.write_empty_vcf(out_file)
    else:
        with file_transaction(out_file) as tx_out_file:
            call_fn(align_bams, ref_file, config, target_regions, tx_out_file)
    return out_file
def run_cortex(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Top level entry to regional de-novo based variant calling with cortex_var.

    Supports a single input BAM only. Runs cortex_var on each line of the
    target BED file, combines the per-region VCFs into a "-raw" intermediate
    file and selects the final variants into `out_file`. Writes an empty VCF
    when the subset target regions are not an existing file.

    `assoc_files` is accepted but not used in this function.

    Returns the path to the output VCF.
    Raises NotImplementedError for multiple BAMs and ValueError when no
    `variant_regions` are configured.
    """
    if len(align_bams) == 1:
        align_bam = align_bams[0]
        config = items[0]["config"]
    else:
        raise NotImplementedError("Need to add multisample calling for cortex_var")
    if out_file is None:
        out_file = "%s-cortex.vcf" % os.path.splitext(align_bam)[0]
    if region is not None:
        work_dir = safe_makedir(os.path.join(os.path.dirname(out_file),
                                             region.replace(".", "_")))
    else:
        work_dir = os.path.dirname(out_file)
    if not file_exists(out_file):
        bam.index(align_bam, config)
        variant_regions = config["algorithm"].get("variant_regions", None)
        if not variant_regions:
            raise ValueError("Only support regional variant calling with cortex_var: set variant_regions")
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        if os.path.isfile(target_regions):
            with open(target_regions) as in_handle:
                # Each BED line contributes its first three columns as the region.
                regional_vcfs = [_run_cortex_on_region(x.strip().split("\t")[:3], align_bam,
                                                       ref_file, work_dir, out_file, config)
                                 for x in in_handle]
            # Argument unpacking replaces the `apply` builtin, which Python 3 removed.
            combine_file = "{0}-raw{1}".format(*os.path.splitext(out_file))
            _combine_variants(regional_vcfs, combine_file, ref_file, config)
            _select_final_variants(combine_file, out_file, config)
        else:
            vcfutils.write_empty_vcf(out_file)
    return out_file
def shared_variantcall(call_fn, name, align_bams, ref_file, items, assoc_files,
                       region=None, out_file=None):
    """Index inputs, run a variant caller and annotate the resulting VCF.

    If the output does not exist, the BAMs are indexed and `call_fn` is run
    inside a file transaction. An empty VCF is written instead when
    configured variant regions subset to a string that is not an existing
    file, or when any BAM has no aligned reads in `region`. The VCF is then
    passed through non-GATK annotation with the dbSNP file from
    `assoc_files`.

    Returns the annotated VCF path.
    """
    config = items[0]["config"]
    if out_file is None:
        if vcfutils.is_paired_analysis(align_bams, items):
            # NOTE(review): "metdata" looks like a misspelling of "metadata";
            # key kept as-is, confirm against the config schema.
            out_file = "%s-paired-variants.vcf" % config["metdata"]["batch"]
        else:
            out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        logger.info("Genotyping with {name}: {region} {fname}".format(
            name=name, region=region, fname=os.path.basename(align_bams[0])))
        for bam_file in align_bams:
            bam.index(bam_file, config)
        variant_regions = config["algorithm"].get("variant_regions", None)
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        missing_targets = (variant_regions is not None
                           and isinstance(target_regions, basestring)
                           and not os.path.isfile(target_regions))
        if missing_targets or not all(realign.has_aligned_reads(x, region)
                                      for x in align_bams):
            vcfutils.write_empty_vcf(out_file)
        else:
            with file_transaction(out_file) as tx_out_file:
                call_fn(align_bams, ref_file, items, target_regions, tx_out_file)
    return annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files["dbsnp"],
                                           ref_file, config)
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file):
    """Index inputs and build the parameter list shared by GATK callers.

    Confidence thresholds are "4.0" when `coverage_depth_min` is below 4 and
    "30.0" otherwise. Downsampling uses `coverage_depth_max` (default 2000)
    with a floor of 200.

    Returns a tuple of (broad_runner, params).
    """
    config = items[0]["config"]
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for bam_file in align_bams:
        bam.index(bam_file, config)

    # GATK can only downsample to a minimum of 200
    configured_max = utils.get_in(config, ("algorithm", "coverage_depth_max"), 2000)
    coverage_depth_max = max(200, configured_max)
    coverage_depth_min = utils.get_in(config, ("algorithm", "coverage_depth_min"), 4)
    variant_regions = config["algorithm"].get("variant_regions", None)
    if coverage_depth_min < 4:
        confidence = "4.0"
    else:
        confidence = "30.0"
    region = subset_variant_regions(variant_regions, region, out_file, items)

    params = ["-R", ref_file]
    params.extend(["--standard_min_confidence_threshold_for_calling", confidence])
    params.extend(["--standard_min_confidence_threshold_for_emitting", confidence])
    params.extend(["--downsample_to_coverage", str(coverage_depth_max)])
    params.extend(["--downsampling_type", "BY_SAMPLE"])
    for annot in annotation.get_gatk_annotations(config):
        params.extend(["--annotation", annot])
    for bam_file in align_bams:
        params.extend(["-I", bam_file])
    if dbsnp:
        params.extend(["--dbsnp", dbsnp])
    if region:
        params.extend(["-L", bamprep.region_to_gatk(region),
                       "--interval_set_rule", "INTERSECTION"])
    return broad_runner, params
def _regions_for_coverage(data, region, ref_file, out_file):
    """Retrieve BED file of regions we need to calculate coverage in.

    Checks for variant region specifications that do not overlap contigs
    (in which case we do not calculate coverage) and regions smaller than
    callable_min_size (in which case we assign everything as callable).
    callable_min_size avoids calculations for small chromosomes we won't
    split on later, saving computation and disk IO.

    Returns a tuple of (bed_file, flag): the BED file to use and a boolean
    that is False only for the small-region shortcut and, in the final
    fallback, equals `variant_regions is None`.
    """
    variant_regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
    ready_region = shared.subset_variant_regions(variant_regions, region, out_file)
    custom_file = "%s-coverageregions.bed" % utils.splitext_plus(out_file)[0]
    region_size = _get_region_size(ref_file, data, region)
    if (variant_regions is None and region_size is not None
            and region_size < dd.get_callable_min_size(data)):
        if realign.has_aligned_reads(dd.get_work_bam(data), region):
            coverage_str = "CALLABLE"
        else:
            coverage_str = "NO_COVERAGE"
        custom_file = _write_all_chrom_file(coverage_str, custom_file, ref_file, region, data)
        return custom_file, False
    elif not ready_region:
        get_ref_bedtool(ref_file, data["config"]).saveas(custom_file)
        return custom_file, True
    elif isinstance(ready_region, (list, tuple)):
        # Checked before os.path.isfile: passing a list/tuple to isfile raises
        # TypeError, which previously made this branch unreachable.
        c, s, e = ready_region
        pybedtools.BedTool("%s\t%s\t%s\n" % (c, s, e), from_string=True).saveas(custom_file)
        return custom_file, True
    elif os.path.isfile(ready_region):
        return ready_region, True
    else:
        custom_file = _write_all_chrom_file("NO_COVERAGE", custom_file, ref_file, region, data)
        return custom_file, variant_regions is None
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Build FreeBayes command line options from the configuration.

    Target BED files are merged so overlapping regions do not make
    FreeBayes call the same position several times. For whole genome
    inputs, high depth regions are removed from the target file; if that
    leaves an empty file the caller is told to skip the FreeBayes run.

    Returns a tuple of (options, no_target_regions).
    """
    opts = ["--genotype-qualities", "--strict-vcf",
            "--ploidy", str(ploidy.get_ploidy(items, region))]
    variant_regions = bedutils.merge_overlaps(
        bedutils.population_variant_regions(items), items[0])
    # Produce gVCF output
    if any("gvcf" in dd.get_tools_on(d) for d in items):
        opts.extend(["--gvcf", "--gvcf-chunk", "50000"])

    no_target_regions = False
    target = shared.subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        target_is_file = isinstance(target, basestring) and os.path.isfile(target)
        if not target_is_file:
            opts.extend(["--region", region_to_freebayes(target)])
        else:
            is_genome = any(
                tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower() == "genome"
                for x in items)
            if is_genome:
                target = shared.remove_highdepth_regions(target, items)
                if os.path.getsize(target) == 0:
                    no_target_regions = True
            opts.extend(["--targets", target])

    resources = config_utils.get_resources("freebayes", config)
    if resources.get("options"):
        opts += resources["options"]
    return opts, no_target_regions
def run_cortex(align_bams, ref_file, config, dbsnp=None, region=None, out_file=None):
    """Top level entry to regional de-novo based variant calling with cortex_var.

    Supports a single input BAM only. Runs cortex_var on each line of the
    target BED file, combines the per-region VCFs into a "-raw" intermediate
    file and selects the final variants into `out_file`. Writes an empty VCF
    when the subset target regions are not an existing file.

    `dbsnp` is accepted but not used in this function.

    Returns the path to the output VCF.
    Raises NotImplementedError for multiple BAMs and ValueError when no
    `variant_regions` are configured.
    """
    if len(align_bams) == 1:
        align_bam = align_bams[0]
    else:
        raise NotImplementedError("Need to add multisample calling for cortex_var")
    broad_runner = broad.runner_from_config(config)
    if out_file is None:
        out_file = "%s-cortex.vcf" % os.path.splitext(align_bam)[0]
    if region is not None:
        work_dir = safe_makedir(os.path.join(os.path.dirname(out_file),
                                             region.replace(".", "_")))
    else:
        work_dir = os.path.dirname(out_file)
    if not file_exists(out_file):
        broad_runner.run_fn("picard_index", align_bam)
        variant_regions = config["algorithm"].get("variant_regions", None)
        if not variant_regions:
            raise ValueError("Only support regional variant calling with cortex_var: set variant_regions")
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        if os.path.isfile(target_regions):
            with open(target_regions) as in_handle:
                # Each BED line contributes its first three columns as the region.
                regional_vcfs = [_run_cortex_on_region(x.strip().split("\t")[:3], align_bam,
                                                       ref_file, work_dir, out_file, config)
                                 for x in in_handle]
            # Argument unpacking replaces the `apply` builtin, which Python 3 removed.
            combine_file = "{0}-raw{1}".format(*os.path.splitext(out_file))
            _combine_variants(regional_vcfs, combine_file, ref_file, config)
            _select_final_variants(combine_file, out_file, config)
        else:
            write_empty_vcf(out_file)
    return out_file
def run(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run tumor only smCounter2 calling.

    Runs smCounter2 in the transaction directory, moves all `*.smCounter*`
    outputs next to `out_file` and links the `.smCounter.cut.vcf` result to
    `out_file`. The VCF is then bgzipped and indexed, with the output prefix
    in the header replaced by the sample name and contigs added.

    `assoc_files` is accepted but not used in this function.

    Returns the result of `vcfutils.bgzip_and_index` for the output VCF.
    Raises AssertionError if the inputs are not a tumor-only sample.
    """
    paired = vcfutils.get_paired_bams(align_bams, items)
    # The message previously named a different caller (Pisces); this is smCounter2.
    assert paired and not paired.normal_bam, ("smCounter2 supports tumor-only variant calling: %s" %
                                              (",".join([dd.get_sample_name(d) for d in items])))
    vrs = bedutils.population_variant_regions(items)
    target = shared.subset_variant_regions(vrs, region, out_file, items=items, do_merge=True)
    out_file = out_file.replace(".vcf.gz", ".vcf")
    out_prefix = utils.splitext_plus(os.path.basename(out_file))[0]
    if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            cmd = ["smCounter2", "--runPath", os.path.dirname(tx_out_file),
                   "--outPrefix", out_prefix,
                   "--bedTarget", target, "--refGenome", ref_file,
                   "--bamFile", paired.tumor_bam, "--bamType", "consensus",
                   "--nCPU", dd.get_num_cores(paired.tumor_data)]
            do.run(cmd, "smcounter2 variant calling")
            # Move every smCounter output out of the transaction directory.
            for fname in glob.glob(os.path.join(os.path.dirname(tx_out_file), "*.smCounter*")):
                shutil.move(fname, os.path.join(os.path.dirname(out_file), os.path.basename(fname)))
            utils.symlink_plus(os.path.join(os.path.dirname(out_file),
                                            "%s.smCounter.cut.vcf" % out_prefix),
                               out_file)
    return vcfutils.bgzip_and_index(out_file, paired.tumor_data["config"], remove_orig=False,
                                    prep_cmd="sed 's#FORMAT\t%s#FORMAT\t%s#' | %s" %
                                    (out_prefix, dd.get_sample_name(paired.tumor_data),
                                     vcfutils.add_contig_to_header_cl(
                                         dd.get_ref_file(paired.tumor_data), out_file)))
def _run_recal_bam(dup_align_bam, recal_file, region, ref_file, out_file, config):
    """Produce a recalibrated BAM for the given input.

    With a usable recalibration file, runs GATK PrintReads with BQSR,
    restricted to the subset variant regions or, failing that, to `region`.
    Without one, the input is subset by region when a region is given and
    otherwise copied unchanged.

    Returns the output BAM path; an existing output is left untouched.
    """
    if file_exists(out_file):
        return out_file

    if not _recal_available(recal_file):
        # No recalibration data: just pass the reads through.
        if region:
            subset_bam_by_region(dup_align_bam, region, out_file)
        else:
            shutil.copy(dup_align_bam, out_file)
        return out_file

    broad_runner = broad.runner_from_config(config)
    with curdir_tmpdir() as tmp_dir:
        with file_transaction(out_file) as tx_out_file:
            params = ["-T", "PrintReads",
                      "-BQSR", recal_file,
                      "-R", ref_file,
                      "-I", dup_align_bam,
                      "--out", tx_out_file]
            base_bed = config["algorithm"].get("variant_regions", None)
            region_bed = subset_variant_regions(base_bed, region, tx_out_file)
            interval = region_bed if region_bed else region
            if interval:
                params.extend(["-L", interval, "--interval_set_rule", "INTERSECTION"])
            broad_runner.run_gatk(params, tmp_dir)
    return out_file
def _config_params(base_config, assoc_files, region, out_file, items):
    """Collect MuTect parameters from configuration, associated files and regions.

    Adds dbSNP and COSMIC inputs when present, an interval restriction from
    the population variant regions, the minimum allele fraction (configured
    as a percentage), user supplied resource options and quality score
    output.

    Returns the list of parameters.
    """
    params = []
    for flag, key in (("--dbsnp", "dbsnp"), ("--cosmic", "cosmic")):
        assoc_file = assoc_files.get(key)
        if assoc_file:
            params.extend([flag, assoc_file])

    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params.extend(["-L", bamprep.region_to_gatk(region),
                       "--interval_set_rule", "INTERSECTION"])

    # set low frequency calling parameter if adjusted
    # to set other MuTect parameters on contamination, pass options to resources for mutect
    # --fraction_contamination --minimum_normal_allele_fraction
    min_af = tz.get_in(["algorithm", "min_allele_fraction"], base_config)
    if min_af:
        params.extend(["--minimum_mutation_cell_fraction", "%.2f" % (min_af / 100.0)])

    resources = config_utils.get_resources("mutect", base_config)
    extra_opts = resources.get("options")
    if extra_opts is not None:
        params.extend(str(x) for x in extra_opts)

    # Output quality scores
    if "--enable_qscore_output" not in params:
        params.append("--enable_qscore_output")
    # drf not currently supported in MuTect to turn off duplicateread filter
    # params += gatk.standard_cl_params(items)
    return params
def _shared_gatk_call_prep(align_bams, ref_file, config, dbsnp, region, out_file):
    """Index inputs and build the parameter list shared by GATK callers.

    Uses a fixed set of annotations. Confidence thresholds are "4.0" for
    low coverage depth and "30.0" otherwise. A default output name is
    derived from the first BAM when `out_file` is None.

    Returns a tuple of (broad_runner, params, out_file).
    """
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for bam_file in align_bams:
        broad_runner.run_fn("picard_index", bam_file)

    algorithm = config["algorithm"]
    coverage_depth = algorithm.get("coverage_depth", "high").lower()
    variant_regions = algorithm.get("variant_regions", None)
    confidence = "4.0" if coverage_depth in ["low"] else "30.0"
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    region = subset_variant_regions(variant_regions, region, out_file)

    annotations = ["QualByDepth", "HaplotypeScore", "MappingQualityRankSumTest",
                   "ReadPosRankSumTest", "FisherStrand", "RMSMappingQuality",
                   "DepthOfCoverage"]
    params = ["-R", ref_file]
    for annot in annotations:
        params.extend(["--annotation", annot])
    params.extend(["--standard_min_confidence_threshold_for_calling", confidence,
                   "--standard_min_confidence_threshold_for_emitting", confidence])
    for bam_file in align_bams:
        params.extend(["-I", bam_file])
    if dbsnp:
        params.extend(["--dbsnp", dbsnp])
    if region:
        params.extend(["-L", region, "--interval_set_rule", "INTERSECTION"])
    return broad_runner, params, out_file
def gatk_realigner_targets(runner, align_bam, ref_file, dbsnp=None,
                           region=None, out_file=None, deep_coverage=False,
                           variant_regions=None):
    """Create the interval file of regions to realign around indels.

    Runs GATK RealignerTargetCreator, optionally restricted to a region,
    using known sites from `dbsnp` and adjusted settings for deep coverage.

    Returns the path to the `.intervals` file.
    """
    if out_file:
        base = os.path.splitext(out_file)[0]
        out_file = "%s.intervals" % base
    else:
        base = os.path.splitext(align_bam)[0]
        out_file = "%s-realign.intervals" % base
    # check only for file existence; interval files can be empty after running
    # on small chromosomes, so don't rerun in those cases
    if os.path.exists(out_file):
        return out_file

    with file_transaction(out_file) as tx_out_file:
        logger.debug("GATK RealignerTargetCreator: %s %s" %
                     (os.path.basename(align_bam), region))
        params = ["-T", "RealignerTargetCreator",
                  "-I", align_bam,
                  "-R", ref_file,
                  "-o", tx_out_file,
                  "-l", "INFO"]
        region = subset_variant_regions(variant_regions, region, tx_out_file)
        if region:
            params.extend(["-L", region, "--interval_set_rule", "INTERSECTION"])
        if dbsnp:
            params.extend(["--known", dbsnp])
        if deep_coverage:
            params.extend(["--mismatchFraction", "0.30",
                           "--maxIntervalSize", "650"])
        runner.run_gatk(params)
    return out_file
def run_samtools(align_bam, ref_file, config, dbsnp=None, region=None, out_file=None):
    """Call SNPs and indels with samtools mpileup and bcftools.

    The BAM is indexed up front. If the output does not exist, variants are
    called inside a file transaction; an empty VCF is written instead when
    variant regions are configured but the subset targets are not an
    existing file.

    `dbsnp` is accepted but not used in this function.

    Returns the output VCF path.
    """
    broad.runner_from_config(config).run_fn("picard_index", align_bam)
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bam)[0]
    if file_exists(out_file):
        return out_file

    logger.info("Genotyping with samtools: {region} {fname}".format(
        region=region, fname=os.path.basename(align_bam)))
    variant_regions = config["algorithm"].get("variant_regions", None)
    target_regions = subset_variant_regions(variant_regions, region, out_file)
    have_targets = variant_regions is None or os.path.isfile(target_regions)
    if have_targets:
        with file_transaction(out_file) as tx_out_file:
            _call_variants_samtools(align_bam, ref_file, config, target_regions,
                                    tx_out_file)
    else:
        write_empty_vcf(out_file)
    return out_file
def prepare_exclude_file(items, base_file, chrom=None):
    """Prepare an exclusion BED file from variant regions and chromosome.

    Wanted regions come from the first set of variant regions subset by
    `chrom` or, when there are none, from the reference for `chrom` minus
    low complexity regions (when an LCR BED is available). Any SV exclusion
    BED is subtracted from them. The exclusion file is the full reference
    minus the wanted regions, or the full reference when none remain.

    Returns the output BED path. The wanted regions are computed even when
    the output already exists.
    """
    out_file = "%s-exclude.bed" % utils.splitext_plus(base_file)[0]
    all_vrs = _get_variant_regions(items)
    if len(all_vrs) > 0:
        ready_region = shared.subset_variant_regions(tz.first(all_vrs), chrom, base_file, items)
    else:
        ready_region = chrom
    with shared.bedtools_tmpdir(items[0]):
        if ready_region != chrom:
            want_bedtool = pybedtools.BedTool(ready_region).saveas()
        else:
            # Get a bedtool for the full region if no variant regions
            want_bedtool = callable.get_ref_bedtool(
                tz.get_in(["reference", "fasta", "base"], items[0]),
                items[0]["config"], chrom)
            lcr_bed = shared.get_lcr_bed(items)
            if lcr_bed:
                want_bedtool = want_bedtool.subtract(pybedtools.BedTool(lcr_bed))
        sv_exclude_bed = _get_sv_exclude_file(items)
        if sv_exclude_bed and len(want_bedtool) > 0:
            want_bedtool = want_bedtool.subtract(sv_exclude_bed).saveas()
        already_done = utils.file_exists(out_file) or utils.file_exists(out_file + ".gz")
        if not already_done:
            with file_transaction(out_file) as tx_out_file:
                full_bedtool = callable.get_ref_bedtool(
                    tz.get_in(["reference", "fasta", "base"], items[0]),
                    items[0]["config"])
                if len(want_bedtool) > 0:
                    full_bedtool.subtract(want_bedtool).saveas(tx_out_file)
                else:
                    full_bedtool.saveas(tx_out_file)
    return out_file
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Build FreeBayes command line options from the configuration.

    Configured variant regions are merged so overlapping regions do not make
    FreeBayes call the same position several times. For whole genome
    inputs, high depth regions are removed from the target file; if that
    leaves an empty file the caller is told to skip the FreeBayes run.

    Returns a tuple of (options, no_target_regions).
    """
    opts = ["--genotype-qualities",
            "--ploidy", str(ploidy.get_ploidy(items, region))]
    configured_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    variant_regions = bedutils.merge_overlaps(configured_regions, items[0])
    # Produce gVCF output
    if any("gvcf" in dd.get_tools_on(d) for d in items):
        opts.extend(["--gvcf", "--gvcf-chunk", "50000"])

    no_target_regions = False
    target = shared.subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        target_is_file = isinstance(target, basestring) and os.path.isfile(target)
        if not target_is_file:
            opts.extend(["--region", region_to_freebayes(target)])
        else:
            is_genome = any(
                tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower() == "genome"
                for x in items)
            if is_genome:
                target = shared.remove_highdepth_regions(target, items)
                if os.path.getsize(target) == 0:
                    no_target_regions = True
            opts.extend(["--targets", target])

    resources = config_utils.get_resources("freebayes", config)
    if resources.get("options"):
        opts += resources["options"]
    return opts, no_target_regions
def combine_variant_files(orig_files, out_file, ref_file, config,
                          quiet_out=True, region=None):
    """Merge several VCF files into one output with GATK CombineVariants.

    `orig_files` is either a list of VCFs or, when called from the pipeline,
    a dictionary holding the list under the key named by
    `config["file_key"]`. Inputs are prioritized in the order given.

    Returns the output path for list input, or a single-item list with a
    dictionary describing the output for dictionary input.
    """
    in_pipeline = isinstance(orig_files, dict)
    if in_pipeline:
        file_key = config["file_key"]
        orig_files = orig_files[file_key]
    broad_runner = broad.runner_from_config(config)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            params = ["-T", "CombineVariants",
                      "-R", ref_file,
                      "--out", tx_out_file]
            priority_order = []
            for idx, vcf_file in enumerate(orig_files):
                label = "v%s" % idx
                params += ["--variant:{name}".format(name=label), vcf_file]
                priority_order.append(label)
            params += ["--rod_priority_list", ",".join(priority_order)]
            if quiet_out:
                params += ["--suppressCommandLineHeader", "--setKey", "null"]
            variant_regions = config["algorithm"].get("variant_regions", None)
            cur_region = shared.subset_variant_regions(variant_regions, region, out_file)
            if cur_region:
                params += ["-L", bamprep.region_to_gatk(cur_region),
                           "--interval_set_rule", "INTERSECTION"]
            broad_runner.run_gatk(params)
    if not in_pipeline:
        return out_file
    return [{file_key: out_file, "region": region, "sam_ref": ref_file, "config": config}]
def _config_params(base_config, assoc_files, region, out_file):
    """Collect MuTect parameters from configuration, associated files and regions.

    Adds dbSNP and COSMIC inputs when present, an interval restriction from
    the configured variant regions, the minimum allele fraction (configured
    as a percentage), user supplied resource options and quality score
    output.

    Returns the list of parameters.
    """
    params = []
    for flag, key in (("--dbsnp", "dbsnp"), ("--cosmic", "cosmic")):
        assoc_file = assoc_files.get(key)
        if assoc_file:
            params.extend([flag, assoc_file])

    variant_regions = base_config["algorithm"].get("variant_regions")
    region = subset_variant_regions(variant_regions, region, out_file)
    if region:
        params.extend(["-L", bamprep.region_to_gatk(region),
                       "--interval_set_rule", "INTERSECTION"])

    # set low frequency calling parameter if adjusted
    # to set other MuTect parameters on contamination, pass options to resources for mutect
    # --fraction_contamination --minimum_normal_allele_fraction
    min_af = tz.get_in(["algorithm", "min_allele_fraction"], base_config)
    if min_af:
        params.extend(["--minimum_mutation_cell_fraction", "%.2f" % (min_af / 100.0)])

    resources = config_utils.get_resources("mutect", base_config)
    extra_opts = resources.get("options")
    if extra_opts is not None:
        params.extend(str(x) for x in extra_opts)

    # Output quality scores
    if "--enable_qscore_output" not in params:
        params.append("--enable_qscore_output")
    return params
def _scalpel_bed_file_opts(items, config, out_file, region, tmp_path):
    """Return the `--bed` option for Scalpel, or an empty list without targets.

    The target is either an existing BED file from the subset population
    variant regions, or a temporary single-line BED written from the
    `region` tuple. For whole genome inputs, high depth and low complexity
    regions are removed from the BED.

    Raises ValueError if a temporary BED is needed but `region` is not a
    list or tuple.
    """
    variant_regions = bedutils.population_variant_regions(items)
    target = shared.subset_variant_regions(variant_regions, region, out_file, items)
    if not target:
        return []

    if isinstance(target, basestring) and os.path.isfile(target):
        target_bed = target
    else:
        target_bed = os.path.join(tmp_path, "tmp.bed")
        if not utils.file_exists(target_bed):
            with file_transaction(config, target_bed) as tx_tmp_bed:
                if not isinstance(region, (list, tuple)):
                    raise ValueError("Region must be a tuple - something odd just happened")
                chrom, start, end = region
                with open(tx_tmp_bed, "w") as out_handle:
                    print("%s\t%s\t%s" % (chrom, start, end), file=out_handle)
    is_genome = any(dd.get_coverage_interval(x) == "genome" for x in items)
    if is_genome:
        target_bed = shared.remove_highdepth_regions(target_bed, items)
        target_bed = shared.remove_lcr_regions(target_bed, items)
    return ["--bed", target_bed]
def _shared_gatk_call_prep(align_bam, ref_file, config, dbsnp, region, out_file):
    """Index inputs and build the parameter list shared by GATK callers.

    Works on a single BAM with a fixed set of annotations. Confidence
    thresholds are "4.0" for low coverage depth and "30.0" otherwise. The
    `-bfh` value comes from the GATK resources (`max_bam_file_handle`,
    default 1024).

    Returns a tuple of (broad_runner, params, out_file).
    """
    broad_runner = broad.runner_from_config(config)
    for index_fn, in_file in (("picard_index_ref", ref_file), ("picard_index", align_bam)):
        broad_runner.run_fn(index_fn, in_file)

    algorithm = config["algorithm"]
    coverage_depth = algorithm.get("coverage_depth", "high").lower()
    variant_regions = algorithm.get("variant_regions", None)
    confidence = "4.0" if coverage_depth in ["low"] else "30.0"
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bam)[0]
    region = subset_variant_regions(variant_regions, region, out_file)
    bfh = config["resources"]["gatk"].get("max_bam_file_handle", 1024)

    annotations = ["QualByDepth", "HaplotypeScore", "MappingQualityRankSumTest",
                   "ReadPosRankSumTest", "FisherStrand", "RMSMappingQuality",
                   "DepthOfCoverage"]
    params = ["-I", align_bam, "-R", ref_file, "-bfh", str(bfh)]
    for annot in annotations:
        params.extend(["--annotation", annot])
    params.extend(["--standard_min_confidence_threshold_for_calling", confidence,
                   "--standard_min_confidence_threshold_for_emitting", confidence])
    if dbsnp:
        params.extend(["--dbsnp", dbsnp])
    if region:
        params.extend(["-L", region, "--interval_set_rule", "INTERSECTION"])
    return broad_runner, params, out_file
def _shared_gatk_call_prep(align_bams, ref_file, config, dbsnp, region, out_file):
    """Index inputs and build the parameter list shared by GATK callers.

    Confidence thresholds are "4.0" for low coverage depth and "30.0"
    otherwise; reads are downsampled to 250 by sample.

    Returns a tuple of (broad_runner, params).
    """
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for bam_file in align_bams:
        broad_runner.run_fn("picard_index", bam_file)

    algorithm = config["algorithm"]
    coverage_depth = algorithm.get("coverage_depth", "high").lower()
    variant_regions = algorithm.get("variant_regions", None)
    if coverage_depth in ["low"]:
        confidence = "4.0"
    else:
        confidence = "30.0"
    region = subset_variant_regions(variant_regions, region, out_file)

    params = ["-R", ref_file]
    params.extend(["--standard_min_confidence_threshold_for_calling", confidence])
    params.extend(["--standard_min_confidence_threshold_for_emitting", confidence])
    params.extend(["--downsample_to_coverage", "250"])
    params.extend(["--downsampling_type", "BY_SAMPLE"])
    for annot in annotation.get_gatk_annotations(config):
        params.extend(["--annotation", annot])
    for bam_file in align_bams:
        params.extend(["-I", bam_file])
    if dbsnp:
        params.extend(["--dbsnp", dbsnp])
    if region:
        params.extend(["-L", bamprep.region_to_gatk(region),
                       "--interval_set_rule", "INTERSECTION"])
    return broad_runner, params
def calc_callable_loci(data, region=None, out_file=None):
    """Determine callable bases for input BAM using Broad's CallableLoci walker.

    http://www.broadinstitute.org/gatk/gatkdocs/
    org_broadinstitute_sting_gatk_walkers_coverage_CallableLoci.html

    Runs GATK CallableLoci when there are no variant regions, no region, or
    the subset variant regions are an existing file. Otherwise writes a BED
    marking the whole `region` chromosome as NO_COVERAGE.

    Returns a single-item list with a dictionary holding the callable BED,
    the configuration and the work BAM.
    """
    broad_runner = broad.runner_from_config(data["config"])
    if out_file is None:
        out_file = "%s-callable.bed" % os.path.splitext(data["work_bam"])[0]
    out_summary = "%s-callable-summary.txt" % os.path.splitext(data["work_bam"])[0]
    variant_regions = data["config"]["algorithm"].get("variant_regions", None)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            broad_runner.run_fn("picard_index", data["work_bam"])
            params = ["-T", "CallableLoci",
                      "-R", data["sam_ref"],
                      "-I", data["work_bam"],
                      "--out", tx_out_file,
                      "--summary", out_summary]
            ready_region = shared.subset_variant_regions(variant_regions, region, tx_out_file)
            if ready_region:
                params += ["-L", ready_region]
            if ((variant_regions and ready_region and os.path.isfile(ready_region))
                    or not variant_regions or not region):
                broad_runner.run_gatk(params)
            else:
                # Write through the transaction file, like the GATK branch does,
                # so an interrupted write cannot leave a partial out_file behind.
                with open(tx_out_file, "w") as out_handle:
                    for tregion in get_ref_bedtool(data["sam_ref"], data["config"]):
                        if tregion.chrom == region:
                            out_handle.write("%s\t%s\t%s\tNO_COVERAGE\n" %
                                             (tregion.chrom, tregion.start, tregion.stop))
    return [{"callable_bed": out_file, "config": data["config"], "work_bam": data["work_bam"]}]
def _shared_gatk_call_prep(align_bams, ref_file, config, dbsnp, region, out_file):
    """Prepare the runner, arguments and output path for GATK calling.

    Indexes the reference and each BAM through the runner, picks a default
    output name when none is given, and builds the shared argument list.

    Args:
        align_bams: BAM paths; each is indexed and passed with ``-I``.
        ref_file: reference path passed with ``-R``.
        config: configuration dictionary; ``config["algorithm"]`` supplies
            ``coverage_depth`` (default ``"high"``) and ``variant_regions``.
        dbsnp: value for ``--dbsnp``; the flag is skipped when falsy.
        region: region handed to ``subset_variant_regions``.
        out_file: output VCF path, or None to derive
            ``<first BAM base>-variants.vcf``.

    Returns:
        Tuple of ``(broad_runner, params, out_file)``.
    """
    runner = broad.runner_from_config(config)
    runner.run_fn("picard_index_ref", ref_file)
    for bam_file in align_bams:
        runner.run_fn("picard_index", bam_file)

    algorithm = config["algorithm"]
    depth = algorithm.get("coverage_depth", "high").lower()
    targets = algorithm.get("variant_regions", None)
    # Only "low" coverage relaxes the confidence threshold.
    min_confidence = "30.0"
    if depth == "low":
        min_confidence = "4.0"
    if out_file is None:
        bam_base, _ = os.path.splitext(align_bams[0])
        out_file = "%s-variants.vcf" % bam_base
    call_region = subset_variant_regions(targets, region, out_file)

    args = ["-R", ref_file]
    for flag in ("--standard_min_confidence_threshold_for_calling",
                 "--standard_min_confidence_threshold_for_emitting"):
        args.extend([flag, min_confidence])
    for ann in annotation.get_gatk_annotations(config):
        args.extend(["--annotation", ann])
    for bam_file in align_bams:
        args.extend(["-I", bam_file])
    if dbsnp:
        args.extend(["--dbsnp", dbsnp])
    if call_region:
        # The subset region is passed to -L as returned, without conversion.
        args.extend(["-L", call_region, "--interval_set_rule", "INTERSECTION"])
    return runner, args, out_file
def read_backed_phasing(vcf_file, bam_files, genome_file, region, config):
    """Phase variants with GATK's ReadBackedPhasing walker.

    http://www.broadinstitute.org/gatk/gatkdocs/
    org_broadinstitute_sting_gatk_walkers_phasing_ReadBackedPhasing.html

    Args:
        vcf_file: input VCF; returned unchanged when ``has_variants`` is false.
        bam_files: BAM paths, each passed with ``-I``.
        genome_file: reference path passed with ``-R``.
        region: region handed to ``shared.subset_variant_regions``.
        config: configuration dictionary; reads
            ``config["algorithm"]["variant_regions"]``.

    Returns:
        Path to ``<vcf base>-phased<ext>``, or ``vcf_file`` when there is
        nothing to phase.
    """
    if not has_variants(vcf_file):
        return vcf_file

    runner = broad.runner_from_config(config)
    vcf_base, vcf_ext = os.path.splitext(vcf_file)
    out_file = "%s-phased%s" % (vcf_base, vcf_ext)
    if file_exists(out_file):
        return out_file

    with file_transaction(out_file) as tx_out_file:
        gatk_args = ["-T", "ReadBackedPhasing",
                     "-R", genome_file,
                     "--variant", vcf_file,
                     "--out", tx_out_file,
                     "--downsample_to_coverage", "250",
                     "--downsampling_type", "BY_SAMPLE"]
        for bam_path in bam_files:
            gatk_args.extend(["-I", bam_path])
        targets = config["algorithm"].get("variant_regions", None)
        phase_region = shared.subset_variant_regions(targets, region, out_file)
        if phase_region:
            gatk_args.extend(["-L", bamprep.region_to_gatk(phase_region),
                              "--interval_set_rule", "INTERSECTION"])
        runner.run_gatk(gatk_args)
    return out_file
def shared_variantcall(call_fn, name, align_bams, ref_file, items,
                       assoc_files, region=None, out_file=None):
    """Run a variant caller with shared output naming, region and annotation handling.

    Args:
        call_fn: callable invoked as
            ``call_fn(align_bams, ref_file, items, target_regions, tx_out_file)``.
        name: caller name, used only in the debug log message.
        align_bams: BAM paths; the first one names the default output.
        ref_file: reference path.
        items: sample dictionaries; ``items[0]["config"]`` is the configuration.
        assoc_files: mapping; its ``dbsnp`` entry is passed to annotation.
        region: optional region to restrict calling to.
        out_file: optional output path; derived when None.

    Returns:
        The value returned by ``annotation.annotate_nongatk_vcf``.
    """
    config = items[0]["config"]
    if out_file is None:
        if vcfutils.is_paired_analysis(align_bams, items):
            # NOTE(review): the key is spelled "metdata" in the original code;
            # kept as is -- confirm whether a "metadata" lookup was intended.
            out_file = "%s-paired-variants.vcf.gz" % config["metdata"]["batch"]
        else:
            out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        first_bam = os.path.basename(align_bams[0])
        logger.debug("Genotyping with {name}: {region} {fname}".format(
            name=name, region=region, fname=first_bam))
        population_regions = bedutils.population_variant_regions(items)
        variant_regions = bedutils.merge_overlaps(population_regions, items[0])
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        # A string target that is not a file on disk is treated as "nothing
        # to call here": write an empty VCF instead of running the caller.
        missing_target_file = (variant_regions is not None
                               and isinstance(target_regions, basestring)
                               and not os.path.isfile(target_regions))
        if missing_target_file:
            vcfutils.write_empty_vcf(out_file, config)
        else:
            with file_transaction(config, out_file) as tx_out_file:
                call_fn(align_bams, ref_file, items, target_regions, tx_out_file)
    if out_file.endswith(".gz"):
        out_file = vcfutils.bgzip_and_index(out_file, config)
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"),
                                           ref_file, config)
def _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores=1):
    """Build the GATK runner and shared arguments, with GATK4/Spark handling.

    Args:
        align_bams: BAM paths; each is indexed and passed with ``-I``.
        items: sample dictionaries; ``items[0]["config"]`` is the configuration.
        ref_file: reference path, used for non-Spark runs.
        region: region handed to ``subset_variant_regions``.
        out_file: output path handed to ``subset_variant_regions``.
        num_cores: core count passed to ``_use_spark``.

    Returns:
        Tuple of ``(broad_runner, params)``.
    """
    first_item = items[0]
    config = first_item["config"]
    runner = broad.runner_from_config(config)
    gatk_type = runner.gatk_type()
    for bam_file in align_bams:
        bam.index(bam_file, config)

    if not _use_spark(num_cores, gatk_type):
        picard = broad.runner_from_path("picard", config)
        picard.run_fn("picard_index_ref", ref_file)
        args = ["-R", ref_file]
    else:
        # GATK4 spark runs use 2bit reference index
        args = ["--reference", dd.get_ref_twobit(first_item)]

    min_depth = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if min_depth and min_depth < 4:
        args.extend(["--standard_min_confidence_threshold_for_calling", "4.0"])
    for ann in annotation.get_gatk_annotations(config):
        args.extend(["--annotation", ann])
    for bam_file in align_bams:
        args.extend(["-I", bam_file])

    targets = bedutils.population_variant_regions(items)
    call_region = subset_variant_regions(targets, region, out_file, items)
    if call_region:
        # The two GATK generations spell the interval rule flag differently.
        rule_flag = ("--interval-set-rule" if gatk_type == "gatk4"
                     else "--interval_set_rule")
        args.extend(["-L", bamprep.region_to_gatk(call_region),
                     rule_flag, "INTERSECTION"])
    args.extend(standard_cl_params(items))
    return runner, args
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Assemble the standard FreeBayes command line options.

    Input BED target files are merged to avoid overlapping regions which
    cause FreeBayes to call multiple times.

    Args:
        items: sample dictionaries; ``items[0]`` is passed to the BED merge.
        config: configuration dictionary; reads ``algorithm.variant_regions``
            and ``algorithm.min_allele_fraction``.
        out_file: output path handed to ``subset_variant_regions``.
        region: optional region to restrict calling to.

    Returns:
        List of option strings.
    """
    options = ["--ploidy", str(ploidy.get_ploidy(items, region))]
    configured_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    merged_regions = bedutils.merge_overlaps(configured_regions, items[0])
    target = subset_variant_regions(merged_regions, region, out_file, items)
    if target:
        is_target_file = isinstance(target, basestring) and os.path.isfile(target)
        if is_target_file:
            options.extend(["--targets", target])
        else:
            options.extend(["--region", region_to_freebayes(target)])

    resources = config_utils.get_resources("freebayes", config)
    if resources.get("options"):
        options.extend(resources["options"])

    # Respect a user supplied minimum alternate fraction (long or short flag).
    joined = " ".join(options)
    user_set_fraction = "--min-alternate-fraction" in joined or "-F" in joined
    if not user_set_fraction:
        # add minimum reportable allele frequency, for which FreeBayes defaults to 20
        percent = utils.get_in(config, ("algorithm", "min_allele_fraction"), 20)
        min_af = float(percent) / 100.0
        options.extend(["--min-alternate-fraction", str(min_af)])
    return options
def _regions_for_coverage(data, region, ref_file, out_file):
    """Retrieve BED file of regions we need to calculate coverage in.

    Args:
        data: sample dictionary; reads ``config`` and
            ``config.algorithm.variant_regions``.
        region: region handed to ``shared.subset_variant_regions``.
        ref_file: reference path used to build whole-reference intervals.
        out_file: base output path; the custom BED is written next to it as
            ``<base>-coverageregions.bed``.

    Returns:
        Tuple of ``(bed_file, flag)``. The flag is True for every case except
        the final fallback, where it is ``variant_regions is None``.
    """
    import pybedtools
    variant_regions = bedutils.merge_overlaps(
        utils.get_in(data, ("config", "algorithm", "variant_regions")), data)
    ready_region = shared.subset_variant_regions(variant_regions, region, out_file)
    custom_file = "%s-coverageregions.bed" % utils.splitext_plus(out_file)[0]
    if not ready_region:
        # No restriction at all: use the full reference intervals.
        get_ref_bedtool(ref_file, data["config"]).saveas(custom_file)
        return custom_file, True
    # Check for coordinate tuples before touching the filesystem:
    # os.path.isfile raises TypeError on a list/tuple, which previously made
    # this branch unreachable.
    if isinstance(ready_region, (list, tuple)):
        c, s, e = ready_region
        pybedtools.BedTool("%s\t%s\t%s\n" % (c, s, e),
                           from_string=True).saveas(custom_file)
        return custom_file, True
    if os.path.isfile(ready_region):
        return ready_region, True
    # Remaining case: a non-file region identifier; emit the matching
    # reference intervals flagged as NO_COVERAGE.
    with file_transaction(data, custom_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            for feat in get_ref_bedtool(ref_file, data["config"], region):
                out_handle.write("%s\t%s\t%s\t%s\n" %
                                 (feat.chrom, feat.start, feat.end, "NO_COVERAGE"))
    return custom_file, variant_regions is None
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file):
    """Build the GATK runner and the argument list shared by GATK callers.

    The reference is indexed through a picard runner; the runner returned to
    the caller is created from the configuration afterwards.

    Args:
        align_bams: BAM paths; each is indexed and passed with ``-I``.
        items: sample dictionaries; ``items[0]["config"]`` is the configuration.
        ref_file: reference path passed with ``-R``.
        dbsnp: value for ``--dbsnp``; the flag is skipped when falsy.
        region: region handed to ``subset_variant_regions``.
        out_file: output path handed to ``subset_variant_regions``.

    Returns:
        Tuple of ``(broad_runner, params)``.
    """
    config = items[0]["config"]
    picard = broad.runner_from_path("picard", config)
    picard.run_fn("picard_index_ref", ref_file)
    for bam_file in align_bams:
        bam.index(bam_file, config)

    args = ["-R", ref_file]
    min_depth = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if min_depth and min_depth < 4:
        args.extend(["--standard_min_confidence_threshold_for_calling", "4.0"])
    for ann in annotation.get_gatk_annotations(config):
        args.extend(["--annotation", ann])
    for bam_file in align_bams:
        args.extend(["-I", bam_file])
    if dbsnp:
        args.extend(["--dbsnp", dbsnp])

    targets = bedutils.population_variant_regions(items)
    call_region = subset_variant_regions(targets, region, out_file, items)
    if call_region:
        args.extend(["-L", bamprep.region_to_gatk(call_region),
                     "--interval_set_rule", "INTERSECTION"])
    args.extend(standard_cl_params(items))
    gatk_runner = broad.runner_from_config(config)
    return gatk_runner, args
def run_tnhaplotyper(align_bams, items, ref_file, assoc_files,
                     region=None, out_file=None):
    """Call variants with Sentieon's TNhaplotyper (MuTect2 like).

    Args:
        align_bams: BAM paths; the first one names the default output.
        items: sample dictionaries; ``items[0]`` provides configuration.
        ref_file: reference path passed with ``-r``.
        assoc_files: mapping that may provide ``dbsnp`` and ``cosmic`` files.
        region: optional region to restrict calling to.
        out_file: optional output path; defaults to
            ``<first BAM base>-variants.vcf.gz``.

    Returns:
        Path to the output VCF.

    Raises:
        AssertionError: when no normal BAM is found among the inputs.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        variant_regions = bedutils.merge_overlaps(
            dd.get_variant_regions(items[0]), items[0])
        target = shared.subset_variant_regions(variant_regions, region, out_file, items)
        # NOTE(review): a multi-element tuple target would fail in this "%"
        # formatting; presumably targets are strings here -- confirm.
        interval = "--interval %s" % (target) if target else ""
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = vcfutils.get_paired_bams(align_bams, items)
            assert paired.normal_bam, "Require normal BAM for Sentieon TNhaplotyper"
            # Gate on the value, not key presence, so a key mapped to None
            # does not produce a literal "--dbsnp None" / "--cosmic None".
            dbsnp_file = assoc_files.get("dbsnp")
            cosmic_file = assoc_files.get("cosmic")
            dbsnp = "--dbsnp %s" % dbsnp_file if dbsnp_file else ""
            cosmic = "--cosmic %s" % cosmic_file if cosmic_file else ""
            license_export = _license_export(items[0])
            cmd = ("{license} sentieon driver -t 1 -r {ref_file} "
                   "-i {paired.tumor_bam} -i {paired.normal_bam} {interval} "
                   "--algo TNhaplotyper "
                   "--tumor_sample {paired.tumor_name} --normal_sample {paired.normal_name} "
                   "{dbsnp} {cosmic} {tx_out_file}")
            # Explicit keywords instead of **locals() keep the template
            # independent of local variable names.
            do.run(cmd.format(license=license_export, ref_file=ref_file,
                              paired=paired, interval=interval, dbsnp=dbsnp,
                              cosmic=cosmic, tx_out_file=tx_out_file),
                   "Sentieon TNhaplotyper")
    return out_file
def read_backed_phasing(vcf_file, bam_files, genome_file, region, config):
    """Phase variants using GATK's read-backed phasing.

    http://www.broadinstitute.org/gatk/gatkdocs/
    org_broadinstitute_sting_gatk_walkers_phasing_ReadBackedPhasing.html

    Args:
        vcf_file: input VCF; returned unchanged when ``has_variants`` is false.
        bam_files: BAM paths, each passed with ``-I``.
        genome_file: reference path passed with ``-R``.
        region: region handed to ``shared.subset_variant_regions``.
        config: configuration dictionary; reads
            ``config["algorithm"]["variant_regions"]``.

    Returns:
        Path to the phased VCF, or ``vcf_file`` when it has no variants.
    """
    if not has_variants(vcf_file):
        return vcf_file

    gatk = broad.runner_from_config(config)
    phased_file = "%s-phased%s" % os.path.splitext(vcf_file)
    if not file_exists(phased_file):
        with file_transaction(phased_file) as tx_out_file:
            cl = ["-T", "ReadBackedPhasing"]
            cl += ["-R", genome_file]
            cl += ["--variant", vcf_file]
            cl += ["--out", tx_out_file]
            cl += ["--downsample_to_coverage", "250"]
            cl += ["--downsampling_type", "BY_SAMPLE"]
            for cur_bam in bam_files:
                cl += ["-I", cur_bam]
            configured_regions = config["algorithm"].get("variant_regions", None)
            cur_region = shared.subset_variant_regions(configured_regions, region,
                                                       phased_file)
            if cur_region:
                cl += ["-L", bamprep.region_to_gatk(cur_region)]
                cl += ["--interval_set_rule", "INTERSECTION"]
            gatk.run_gatk(cl)
    return phased_file
def run_haplotyper(align_bams, items, ref_file, assoc_files,
                   region=None, out_file=None):
    """Call variants with Sentieon's haplotyper (GATK HaplotypeCaller like).

    Args:
        align_bams: BAM paths, each passed with ``-i``; the first one names
            the default output.
        items: sample dictionaries; ``items[0]`` provides configuration.
        ref_file: reference path passed with ``-r``.
        assoc_files: mapping that may provide a ``dbsnp`` file.
        region: optional region to restrict calling to.
        out_file: optional output path; defaults to
            ``<first BAM base>-variants.vcf.gz``.

    Returns:
        Path to the output VCF.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        variant_regions = bedutils.merge_overlaps(
            dd.get_variant_regions(items[0]), items[0])
        target = shared.subset_variant_regions(variant_regions, region, out_file, items)
        # NOTE(review): a multi-element tuple target would fail in this "%"
        # formatting; presumably targets are strings here -- confirm.
        interval = "--interval %s" % (target) if target else ""
        with file_transaction(items[0], out_file) as tx_out_file:
            # Gate on the value, not key presence, so a key mapped to None
            # does not produce a literal "--dbsnp None".
            dbsnp_file = assoc_files.get("dbsnp")
            dbsnp = "--dbsnp %s" % dbsnp_file if dbsnp_file else ""
            bams = " ".join("-i %s" % x for x in align_bams)
            license_export = _license_export(items[0])
            cmd = ("{license} sentieon driver -t 1 -r {ref_file} "
                   "{bams} {interval} --algo Haplotyper {dbsnp} {tx_out_file}")
            # The description previously read "Sentieon TNhaplotyper", a
            # copy/paste slip that mislabelled this run in logs.
            do.run(cmd.format(license=license_export, ref_file=ref_file,
                              bams=bams, interval=interval, dbsnp=dbsnp,
                              tx_out_file=tx_out_file),
                   "Sentieon Haplotyper")
    return out_file
def _config_params(base_config, assoc_files, region, out_file):
    """Collect parameters from configuration, associated files and regions.

    Args:
        base_config: configuration dictionary; reads
            ``algorithm.variant_regions`` and ``algorithm.min_allele_fraction``
            plus the ``mutect`` resources.
        assoc_files: mapping that may provide ``dbsnp`` and ``cosmic`` files.
        region: region handed to ``subset_variant_regions``.
        out_file: output path handed to ``subset_variant_regions``.

    Returns:
        List of command line parameters.
    """
    args = []
    for key in ("dbsnp", "cosmic"):
        resource_file = assoc_files.get(key)
        if resource_file:
            args.extend(["--%s" % key, resource_file])

    targets = base_config["algorithm"].get("variant_regions")
    call_region = subset_variant_regions(targets, region, out_file)
    if call_region:
        args.extend(["-L", bamprep.region_to_gatk(call_region),
                     "--interval_set_rule", "INTERSECTION"])

    # set low frequency calling parameter if adjusted
    # to set other MuTect parameters on contamination, pass options to resources for mutect
    # --fraction_contamination --minimum_normal_allele_fraction
    percent = tz.get_in(["algorithm", "min_allele_fraction"], base_config)
    if percent:
        fraction = "%.2f" % (percent / 100.0)
        args.extend(["--minimum_mutation_cell_fraction", fraction])

    resources = config_utils.get_resources("mutect", base_config)
    if resources.get("options") is not None:
        for extra in resources.get("options", []):
            args.append(str(extra))
    return args
def run_cortex(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Top level entry to regional de-novo based variant calling with cortex_var.

    Currently disabled: the function raises immediately, so everything after
    the first statement is unreachable and retained only as a starting point
    for the rework.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("Cortex currently out of date and needs reworking.")
    # ---- unreachable below this point ----
    if len(align_bams) != 1:
        raise NotImplementedError("Need to add multisample calling for cortex_var")
    align_bam = align_bams[0]
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-cortex.vcf" % os.path.splitext(align_bam)[0]
    out_dir = os.path.dirname(out_file)
    if region is None:
        work_dir = out_dir
    else:
        work_dir = safe_makedir(os.path.join(out_dir, region.replace(".", "_")))
    if not file_exists(out_file):
        bam.index(align_bam, config)
        variant_regions = config["algorithm"].get("variant_regions", None)
        if not variant_regions:
            raise ValueError("Only support regional variant calling with cortex_var: set variant_regions")
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        if not os.path.isfile(target_regions):
            vcfutils.write_empty_vcf(out_file)
        else:
            regional_vcfs = []
            with open(target_regions) as in_handle:
                for line in in_handle:
                    coords = line.strip().split("\t")[:3]
                    regional_vcfs.append(
                        _run_cortex_on_region(coords, align_bam, ref_file,
                                              work_dir, out_file, config))
            combine_file = "{0}-raw{1}".format(*os.path.splitext(out_file))
            _combine_variants(regional_vcfs, combine_file, ref_file, config)
            _select_final_variants(combine_file, out_file, config)
    return out_file
def _shared_gatk_call_prep(align_bams, ref_file, config, dbsnp, region, out_file):
    """Build the GATK runner and the argument list shared by GATK callers.

    Indexes the reference through the runner and each BAM with ``bam.index``
    before assembling the common arguments.

    Args:
        align_bams: BAM paths; each is indexed and passed with ``-I``.
        ref_file: reference path passed with ``-R``.
        config: configuration dictionary; ``config["algorithm"]`` supplies
            ``coverage_depth`` (default ``"high"``) and ``variant_regions``.
        dbsnp: value for ``--dbsnp``; the flag is skipped when falsy.
        region: region handed to ``subset_variant_regions``.
        out_file: output path handed to ``subset_variant_regions``.

    Returns:
        Tuple of ``(broad_runner, params)``.
    """
    gatk = broad.runner_from_config(config)
    gatk.run_fn("picard_index_ref", ref_file)
    for in_bam in align_bams:
        bam.index(in_bam, config)

    algorithm = config["algorithm"]
    depth = algorithm.get("coverage_depth", "high").lower()
    configured_regions = algorithm.get("variant_regions", None)
    # Only "low" coverage relaxes the calling/emitting confidence threshold.
    threshold = {"low": "4.0"}.get(depth, "30.0")
    cur_region = subset_variant_regions(configured_regions, region, out_file)

    cl = ["-R", ref_file,
          "--standard_min_confidence_threshold_for_calling", threshold,
          "--standard_min_confidence_threshold_for_emitting", threshold]
    cl += ["--downsample_to_coverage", "250"]
    cl += ["--downsampling_type", "BY_SAMPLE"]
    for ann_name in annotation.get_gatk_annotations(config):
        cl += ["--annotation", ann_name]
    for in_bam in align_bams:
        cl += ["-I", in_bam]
    if dbsnp:
        cl += ["--dbsnp", dbsnp]
    if cur_region:
        cl += ["-L", bamprep.region_to_gatk(cur_region)]
        cl += ["--interval_set_rule", "INTERSECTION"]
    return gatk, cl
def shared_variantcall(call_fn, name, align_bams, ref_file, config,
                       assoc_files, region=None, out_file=None):
    """Run a variant caller with shared indexing, region and annotation handling.

    Args:
        call_fn: callable invoked as
            ``call_fn(align_bams, ref_file, config, target_regions, tx_out_file)``.
        name: caller name, used only in the log message.
        align_bams: BAM paths; each is indexed, the first names the default output.
        ref_file: reference path.
        config: configuration dictionary; reads
            ``config["algorithm"]["variant_regions"]``.
        assoc_files: object exposing a ``dbsnp`` attribute.
        region: optional region to restrict calling to.
        out_file: optional output path; defaults to
            ``<first BAM base>-variants.vcf``.

    Returns:
        The value returned by ``annotation.annotate_nongatk_vcf``.
    """
    runner = broad.runner_from_config(config)
    for bam_file in align_bams:
        runner.run_fn("picard_index", bam_file)
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        first_bam = os.path.basename(align_bams[0])
        logger.info("Genotyping with {name}: {region} {fname}".format(
            name=name, region=region, fname=first_bam))
        variant_regions = config["algorithm"].get("variant_regions", None)
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        # Skip calling when the string target is not a file on disk, or when
        # any input BAM has no aligned reads in the region.
        missing_target_file = (variant_regions is not None
                               and isinstance(target_regions, basestring)
                               and not os.path.isfile(target_regions))
        if missing_target_file or not all(realign.has_aligned_reads(x, region)
                                          for x in align_bams):
            vcfutils.write_empty_vcf(out_file)
        else:
            with file_transaction(out_file) as tx_out_file:
                call_fn(align_bams, ref_file, config, target_regions, tx_out_file)
    return annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.dbsnp,
                                           ref_file, config)
def shared_variantcall(call_fn, name, align_bams, ref_file, items,
                       assoc_files, region=None, out_file=None):
    """Provide base functionality for prepping and indexing for variant calling.

    Args:
        call_fn: callable invoked as
            ``call_fn(align_bams, ref_file, items, target_regions, tx_out_file)``.
        name: caller name, used only in the debug log message.
        align_bams: BAM paths; the first one names the default output.
        ref_file: reference path.
        items: sample dictionaries; ``items[0]["config"]`` is the configuration.
        assoc_files: mapping; its ``dbsnp`` entry is passed to annotation.
        region: optional region to restrict calling to.
        out_file: optional output path; derived when None.

    Returns:
        The value returned by ``annotation.annotate_nongatk_vcf``.
    """
    config = items[0]["config"]
    if out_file is None:
        paired = vcfutils.is_paired_analysis(align_bams, items)
        if paired:
            # NOTE(review): the key is spelled "metdata" in the original code;
            # kept as is -- confirm whether a "metadata" lookup was intended.
            out_file = "%s-paired-variants.vcf.gz" % config["metdata"]["batch"]
        else:
            bam_base, _ = os.path.splitext(align_bams[0])
            out_file = "%s-variants.vcf.gz" % bam_base
    if not file_exists(out_file):
        logger.debug("Genotyping with {name}: {region} {fname}".format(
            name=name, region=region, fname=os.path.basename(align_bams[0])))
        variant_regions = bedutils.merge_overlaps(
            bedutils.population_variant_regions(items), items[0])
        target_regions = subset_variant_regions(variant_regions, region, out_file,
                                                items=items)
        # Run the caller unless regions are configured and the string target
        # does not exist as a file; in that case emit an empty VCF.
        runnable = (variant_regions is None
                    or not isinstance(target_regions, basestring)
                    or os.path.isfile(target_regions))
        if runnable:
            with file_transaction(config, out_file) as tx_out_file:
                call_fn(align_bams, ref_file, items, target_regions, tx_out_file)
        else:
            vcfutils.write_empty_vcf(out_file, config)
    if out_file.endswith(".gz"):
        out_file = vcfutils.bgzip_and_index(out_file, config)
    annotated = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                                assoc_files.get("dbsnp"),
                                                ref_file, config)
    return annotated
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file):
    """Build the GATK runner and the argument list shared by GATK callers.

    The reference is indexed through a picard runner; the runner returned to
    the caller is created from the configuration afterwards.

    Args:
        align_bams: BAM paths; each is indexed and passed with ``-I``.
        items: sample dictionaries; ``items[0]["config"]`` is the configuration.
        ref_file: reference path passed with ``-R``.
        dbsnp: value for ``--dbsnp``; the flag is skipped when falsy.
        region: region handed to ``subset_variant_regions``.
        out_file: output path handed to ``subset_variant_regions``.

    Returns:
        Tuple of ``(broad_runner, params)``.
    """
    config = items[0]["config"]
    picard = broad.runner_from_path("picard", config)
    picard.run_fn("picard_index_ref", ref_file)
    for in_bam in align_bams:
        bam.index(in_bam, config)

    cl = ["-R", ref_file]
    min_depth = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if min_depth and min_depth < 4:
        # Relax both the calling and emitting thresholds for shallow data.
        for flag in ("--standard_min_confidence_threshold_for_calling",
                     "--standard_min_confidence_threshold_for_emitting"):
            cl += [flag, "4.0"]
    for ann_name in annotation.get_gatk_annotations(config):
        cl += ["--annotation", ann_name]
    for in_bam in align_bams:
        cl += ["-I", in_bam]
    if dbsnp:
        cl += ["--dbsnp", dbsnp]

    configured_regions = tz.get_in(["algorithm", "variant_regions"], config)
    cur_region = subset_variant_regions(configured_regions, region, out_file, items)
    if cur_region:
        cl += ["-L", bamprep.region_to_gatk(cur_region),
               "--interval_set_rule", "INTERSECTION"]
    gatk = broad.runner_from_config(config)
    return gatk, cl
def _pindel_options(items, config, out_file, region, tmp_path):
    """Build the pindel region option for the command line.

    :param items: (list) sample information from yaml
    :param config: (dict) configuration from yaml (items[0]['config'])
    :param out_file: (str) output path handed to subset_variant_regions
    :param region: (str or tuple) region to analyze
    :param tmp_path: (str) temporary folder for a generated BED file
    :returns: (str) "-j <bed>" option, or an empty string without a target
    :raises ValueError: when a BED file has to be generated and region is
        not a list or tuple
    """
    configured_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    target = subset_variant_regions(configured_regions, region, out_file, items)
    if not target:
        return ""

    if isinstance(target, six.string_types) and os.path.isfile(target):
        bed_file = target
    else:
        # No BED file on disk: write a single-interval BED from the region.
        bed_file = os.path.join(tmp_path, "tmp.bed")
        with file_transaction(config, bed_file) as tx_tmp_bed:
            if not isinstance(region, (list, tuple)):
                raise ValueError("Region must be a tuple - something odd just happened")
            chrom, start, end = region
            with open(tx_tmp_bed, "w") as out_handle:
                out_handle.write("%s\t%s\t%s\n" % (chrom, start, end))
    return "-j " + remove_lcr_regions(bed_file, items)
def _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                      region=None, out_file=None):
    """Preparation work for MuTect.

    Indexes the reference and BAMs, identifies the tumor and normal samples
    from each item's ``metadata["phenotype"]`` and builds the argument list.

    Args:
        align_bams: BAM paths, paired positionally with ``items``.
        items: sample dictionaries; reads ``config``, ``metadata`` and ``name``.
        ref_file: reference path passed with ``-R``.
        assoc_files: mapping; ``dbsnp`` is required, ``cosmic`` optional.
        region: optional region to restrict calling to.
        out_file: output path handed to ``subset_variant_regions``.

    Returns:
        Tuple of ``(broad_runner, params)``.

    Raises:
        ValueError: when a tumor or a normal sample is not present.
    """
    #FIXME: We assume all other bits in the config are shared
    base_config = items[0]["config"]
    dbsnp = assoc_files["dbsnp"]
    cosmic = assoc_files.get("cosmic")
    broad_runner = broad.runner_from_config(base_config, "mutect")
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        broad_runner.run_fn("picard_index", x)
    variant_regions = base_config["algorithm"].get("variant_regions", None)
    contamination = base_config["algorithm"].get("fraction_contamination", 0)
    region = subset_variant_regions(variant_regions, region, out_file)
    #FIXME: Add more parameters like fraction contamination etc
    params = ["-R", ref_file, "-T", "MuTect"]
    params += ["--dbsnp", dbsnp]
    tumor_bam = None
    normal_bam = None
    # Built-in zip replaces itertools.izip, which does not exist on Python 3;
    # iteration results are the same on Python 2.
    for bamfile, item in zip(align_bams, items):
        phenotype = item["metadata"]["phenotype"]
        if phenotype == "normal":
            normal_bam = bamfile
            normal_sample_name = item["name"][1]
        elif phenotype == "tumor":
            tumor_bam = bamfile
            tumor_sample_name = item["name"][1]
    if tumor_bam is None or normal_bam is None:
        raise ValueError("Missing phenotype definition (tumor or normal) "
                         "in samples")
    params += ["-I:normal", normal_bam]
    params += ["-I:tumor", tumor_bam]
    params += ["--tumor_sample_name", tumor_sample_name]
    params += ["--normal_sample_name", normal_sample_name]
    # The configured value may be numeric (default 0); stringify it so the
    # argument list holds only strings.
    params += ["--fraction_contamination", str(contamination)]
    if cosmic is not None:
        params += ["--cosmic", cosmic]
    if region:
        params += ["-L", bamprep.region_to_gatk(region),
                   "--interval_set_rule", "INTERSECTION"]
    return broad_runner, params
def combine_variant_files(orig_files, out_file, ref_file, config,
                          quiet_out=True, region=None):
    """Combine VCF files from the same sample into a single output file.

    Handles cases where we split files into SNPs/Indels for processing then
    need to merge back into a final file.

    Args:
        orig_files: list of VCF paths, or a dictionary holding that list
            under ``config["file_key"]`` (pipeline mode).
        out_file: combined output path.
        ref_file: reference path passed with ``-R``.
        config: configuration dictionary.
        quiet_out: when true, suppress the command line header and set key.
        region: optional region to restrict combining to.

    Returns:
        ``out_file`` for list input; in pipeline mode a single-element list
        with a dictionary describing the output.
    """
    in_pipeline = isinstance(orig_files, dict)
    if in_pipeline:
        file_key = config["file_key"]
        orig_files = orig_files[file_key]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            exist_files = [f for f in orig_files if os.path.exists(f)]
            ready_files = run_multicore(p_bgzip_and_index,
                                        [[f, config] for f in exist_files], config)
            gatk_args = ["-T", "CombineVariants",
                         "-R", ref_file,
                         "--out", tx_out_file]
            # Inputs are tagged v0, v1, ... and prioritised in that order.
            priority_names = []
            for index, ready_file in enumerate(ready_files):
                tag = "v%s" % index
                gatk_args += ["--variant:%s" % tag, ready_file]
                priority_names.append(tag)
            gatk_args += ["--rod_priority_list", ",".join(priority_names)]
            if quiet_out:
                gatk_args += ["--suppressCommandLineHeader", "--setKey", "null"]
            variant_regions = config["algorithm"].get("variant_regions", None)
            cur_region = shared.subset_variant_regions(variant_regions, region, out_file)
            if cur_region:
                gatk_args += ["-L", bamprep.region_to_gatk(cur_region),
                              "--interval_set_rule", "INTERSECTION"]
            jvm_opts = broad.get_gatk_framework_opts(config)
            program = config_utils.get_program("gatk-framework", config)
            do.run([program] + jvm_opts + gatk_args, "Combine variant files")
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    if not in_pipeline:
        return out_file
    return [{file_key: out_file,
             "region": region,
             "sam_ref": ref_file,
             "config": config}]
def _subset_regions(region, base_file, items):
    """Pick the calling target: an existing BED file or a GATK region string.

    Args:
        region: region to restrict calling to.
        base_file: base path handed to ``pshared.subset_variant_regions``.
        items: sample dictionaries.

    Returns:
        The target itself when it is a path to an existing file, otherwise
        the result of ``bamprep.region_to_gatk`` on the target.
    """
    population_regions = bedutils.population_variant_regions(items)
    merged_regions = bedutils.merge_overlaps(population_regions, items[0])
    target = pshared.subset_variant_regions(merged_regions, region, base_file, items)
    is_bed_file = isinstance(target, basestring) and os.path.isfile(target)
    return target if is_bed_file else bamprep.region_to_gatk(target)
def _subset_regions(region, base_file, items):
    """Pick the calling target: an existing BED file or a GATK region string.

    Args:
        region: region to restrict calling to.
        base_file: base path handed to ``pshared.subset_variant_regions``.
        items: sample dictionaries.

    Returns:
        The target itself when it is a path to an existing file, otherwise
        the result of ``bamprep.region_to_gatk`` on the target.
    """
    merged_regions = bedutils.population_variant_regions(items, merged=True)
    target = pshared.subset_variant_regions(merged_regions, region, base_file, items)
    if not (isinstance(target, six.string_types) and os.path.isfile(target)):
        return bamprep.region_to_gatk(target)
    return target
def _add_region_params(region, out_file, items):
    """Build command line parameters for selecting by region.

    Args:
        region: region to restrict to.
        out_file: output path handed to ``subset_variant_regions``.
        items: sample dictionaries; reads
            ``items[0]["config"]["algorithm"]["variant_regions"]``.

    Returns:
        List with the ``-L`` and interval rule arguments, or an empty list
        when no region applies.
    """
    configured_regions = tz.get_in(["config", "algorithm", "variant_regions"], items[0])
    cur_region = subset_variant_regions(configured_regions, region, out_file, items)
    if not cur_region:
        return []
    return ["-L", bamprep.region_to_gatk(cur_region),
            "--interval_set_rule", "INTERSECTION"]
def has_variant_regions(items, base_file, chrom=None):
    """Determine if we should process this chromosome: needs variant regions defined.

    Args:
        items: sample dictionaries.
        base_file: base path handed to ``shared.subset_variant_regions``.
        chrom: chromosome name; when falsy the check is skipped.

    Returns:
        False only when variant regions exist and subsetting the first one to
        ``chrom`` returns ``chrom`` itself; True otherwise.
    """
    if not chrom:
        return True
    all_vrs = _get_variant_regions(items)
    if len(all_vrs) == 0:
        return True
    subset = shared.subset_variant_regions(tz.first(all_vrs), chrom, base_file, items)
    # Getting the bare chromosome back means the subset found nothing more
    # specific than the chromosome name.
    return not (subset == chrom)
def _has_variant_regions(items, base_file, chrom=None):
    """Determine if we should process this chromosome: needs variant regions defined.

    Args:
        items: sample dictionaries.
        base_file: base path handed to ``shared.subset_variant_regions``.
        chrom: chromosome name; when falsy the check is skipped.

    Returns:
        False only when variant regions exist and subsetting the first one to
        ``chrom`` returns ``chrom`` itself; True otherwise.
    """
    should_process = True
    if chrom:
        region_files = _get_variant_regions(items)
        if len(region_files) > 0:
            first_regions = tz.first(region_files)
            subset = shared.subset_variant_regions(first_regions, chrom,
                                                   base_file, items)
            # Getting the bare chromosome back means the subset found
            # nothing more specific than the chromosome name.
            if subset == chrom:
                should_process = False
    return should_process
def _add_region_params(region, out_file, items):
    """Build command line parameters for selecting by region.

    Args:
        region: region to restrict to.
        out_file: output path handed to ``subset_variant_regions``.
        items: sample dictionaries.

    Returns:
        List with the optional ``-L`` / interval rule arguments followed by
        the values from ``gatk.standard_cl_params``.
    """
    population_regions = bedutils.population_variant_regions(items)
    cur_region = subset_variant_regions(population_regions, region, out_file, items)
    region_args = []
    if cur_region:
        region_args.extend(["-L", bamprep.region_to_gatk(cur_region)])
        region_args.extend(["--interval_set_rule", "INTERSECTION"])
    region_args.extend(gatk.standard_cl_params(items))
    return region_args
def _clean_regions(items, region):
    """Intersect region with target file if it exists.

    Args:
        items: sample dictionaries.
        region: region to intersect with the merged population regions.

    Returns:
        The regions from ``_load_regions`` when the target is an existing
        file, a single-element list holding the target otherwise, or the
        falsy target unchanged when there is nothing to restrict to.
    """
    population_regions = bedutils.population_variant_regions(items)
    merged_regions = bedutils.merge_overlaps(population_regions, items[0])
    with utils.tmpfile() as tx_out_file:
        target = subset_variant_regions(merged_regions, region, tx_out_file, items)
        if target:
            is_bed_file = isinstance(target, basestring) and os.path.isfile(target)
            # The file is read inside the context, while the temporary path
            # from utils.tmpfile is still in use.
            target = _load_regions(target) if is_bed_file else [target]
    return target