def _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores=1):
    """Index inputs and build the argument list shared by GATK callers.

    Indexes every BAM in `align_bams`, runs the Picard `picard_index_ref`
    step on `ref_file`, then assembles the common arguments: reference,
    a relaxed calling threshold when `coverage_depth_min` is below 4,
    annotations, input BAMs, an optional `-L` interval restriction (the
    interval rule flag is spelled with dashes for gatk4 and underscores
    otherwise) and whatever `standard_cl_params(items)` contributes.

    `num_cores` is accepted but not used by this implementation.

    Returns a `(broad_runner, params)` tuple.
    """
    config = items[0]["config"]
    runner = broad.runner_from_config(config)
    gatk_type = runner.gatk_type()
    for bam_file in align_bams:
        bam.index(bam_file, config)
    broad.runner_from_path("picard", config).run_fn("picard_index_ref", ref_file)

    args = ["-R", ref_file]
    min_depth = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if min_depth and min_depth < 4:
        # Low minimum coverage requested: relax the calling threshold.
        args.extend(["--standard_min_confidence_threshold_for_calling", "4.0"])
    for annotation_name in annotation.get_gatk_annotations(config):
        args.extend(["--annotation", annotation_name])
    for bam_file in align_bams:
        args.extend(["-I", bam_file])

    call_region = subset_variant_regions(bedutils.population_variant_regions(items),
                                         region, out_file, items)
    if call_region:
        if gatk_type == "gatk4":
            rule_flag = "--interval-set-rule"
        else:
            rule_flag = "--interval_set_rule"
        args.extend(["-L", bamprep.region_to_gatk(call_region), rule_flag, "INTERSECTION"])
    args.extend(standard_cl_params(items))
    return runner, args
def mutect2_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run GATK MuTect2 and return the bgzipped, indexed VCF path.

    This requires the full non open-source version of GATK 3.5+.

    When `out_file` is not given it is derived from the first BAM name. If
    the output does not exist yet, MuTect2 is run inside a file transaction
    with its output piped through the post-processing command and bgzip.
    The result is always passed to `vcfutils.bgzip_and_index`.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        _prep_inputs(align_bams, ref_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            config = items[0]["config"]
            args = ["-T", "MuTect2",
                    "-R", ref_file,
                    "--annotation", "ClippingRankSumTest",
                    "--annotation", "DepthPerSampleHC"]
            for annotation_name in annotation.get_gatk_annotations(config):
                args.extend(["--annotation", annotation_name])
            paired = vcfutils.get_paired_bams(align_bams, items)
            args.extend(_add_tumor_params(paired))
            args.extend(_add_region_params(region, out_file, items))
            args.extend(_add_assoc_params(assoc_files))
            args.extend(["-ploidy", str(ploidy.get_ploidy(items, region))])
            resources = config_utils.get_resources("mutect2", config)
            if "options" in resources:
                # User supplied extra command line options.
                args.extend([str(opt) for opt in resources.get("options", [])])
            runner = broad.runner_from_config(config)
            assert LooseVersion(runner.gatk_major_version()) >= LooseVersion("3.5"), \
                "Require full version of GATK 3.5+ for mutect2 calling"
            runner.new_resources("mutect2")
            gatk_cmd = " ".join(runner.cl_gatk(args, os.path.dirname(tx_out_file)))
            pp_cmd = _post_process_cl(paired)
            pipeline = "{gatk_cmd} | {pp_cmd} | bgzip -c > {tx_out_file}".format(
                gatk_cmd=gatk_cmd, pp_cmd=pp_cmd, tx_out_file=tx_out_file)
            do.run(pipeline, "MuTect2")
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file):
    """Index inputs and build the argument list shared by GATK callers.

    Runs the Picard `picard_index_ref` step on `ref_file`, indexes each BAM
    in `align_bams`, and assembles the common arguments: reference, a relaxed
    calling threshold when `coverage_depth_min` is below 4, annotations,
    input BAMs, optional `--dbsnp`, an optional `-L` interval restriction
    and the output of `standard_cl_params(items)`.

    Returns a `(broad_runner, params)` tuple, where the runner is built from
    the configuration of the first item.
    """
    config = items[0]["config"]
    picard_runner = broad.runner_from_path("picard", config)
    picard_runner.run_fn("picard_index_ref", ref_file)
    for bam_file in align_bams:
        bam.index(bam_file, config)

    args = ["-R", ref_file]
    min_depth = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if min_depth and min_depth < 4:
        # Low minimum coverage requested: relax the calling threshold.
        args.extend(["--standard_min_confidence_threshold_for_calling", "4.0"])
    for annotation_name in annotation.get_gatk_annotations(config):
        args.extend(["--annotation", annotation_name])
    for bam_file in align_bams:
        args.extend(["-I", bam_file])
    if dbsnp:
        args.extend(["--dbsnp", dbsnp])

    call_region = subset_variant_regions(bedutils.population_variant_regions(items),
                                         region, out_file, items)
    if call_region:
        args.extend(["-L", bamprep.region_to_gatk(call_region),
                     "--interval_set_rule", "INTERSECTION"])
    args.extend(standard_cl_params(items))
    return broad.runner_from_config(config), args
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file):
    """Index inputs and build the argument list shared by GATK callers.

    Runs the Picard `picard_index_ref` step on `ref_file`, indexes each BAM
    in `align_bams`, and assembles the common arguments: reference, relaxed
    calling/emitting thresholds when `coverage_depth_min` is below 4,
    annotations, input BAMs, optional `--dbsnp` and an optional `-L`
    interval restriction derived from the configured `variant_regions`.

    Returns a `(broad_runner, params)` tuple, where the runner is built from
    the configuration of the first item.
    """
    config = items[0]["config"]
    picard_runner = broad.runner_from_path("picard", config)
    picard_runner.run_fn("picard_index_ref", ref_file)
    for bam_file in align_bams:
        bam.index(bam_file, config)

    args = ["-R", ref_file]
    min_depth = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if min_depth and min_depth < 4:
        # Low minimum coverage requested: relax both confidence thresholds.
        for threshold_flag in ("--standard_min_confidence_threshold_for_calling",
                               "--standard_min_confidence_threshold_for_emitting"):
            args.extend([threshold_flag, "4.0"])
    for annotation_name in annotation.get_gatk_annotations(config):
        args.extend(["--annotation", annotation_name])
    for bam_file in align_bams:
        args.extend(["-I", bam_file])
    if dbsnp:
        args.extend(["--dbsnp", dbsnp])

    configured_regions = tz.get_in(["algorithm", "variant_regions"], config)
    call_region = subset_variant_regions(configured_regions, region, out_file, items)
    if call_region:
        args.extend(["-L", bamprep.region_to_gatk(call_region),
                     "--interval_set_rule", "INTERSECTION"])
    return broad.runner_from_config(config), args
def _shared_gatk_call_prep(align_bams, ref_file, config, dbsnp, region, out_file):
    """Index inputs and build the argument list shared by GATK callers.

    Runs the `picard_index_ref` step on `ref_file` and indexes each BAM in
    `align_bams`. The calling/emitting confidence is "4.0" when the
    configured `coverage_depth` is "low" and "30.0" otherwise; reads are
    downsampled to 250 per sample.

    Returns a `(broad_runner, params)` tuple.
    """
    runner = broad.runner_from_config(config)
    runner.run_fn("picard_index_ref", ref_file)
    for bam_file in align_bams:
        bam.index(bam_file, config)

    algorithm = config["algorithm"]
    depth_label = algorithm.get("coverage_depth", "high").lower()
    configured_regions = algorithm.get("variant_regions", None)
    if depth_label == "low":
        confidence = "4.0"
    else:
        confidence = "30.0"
    call_region = subset_variant_regions(configured_regions, region, out_file)

    args = ["-R", ref_file]
    for threshold_flag in ("--standard_min_confidence_threshold_for_calling",
                           "--standard_min_confidence_threshold_for_emitting"):
        args.extend([threshold_flag, confidence])
    args.extend(["--downsample_to_coverage", "250",
                 "--downsampling_type", "BY_SAMPLE"])
    for annotation_name in annotation.get_gatk_annotations(config):
        args.extend(["--annotation", annotation_name])
    for bam_file in align_bams:
        args.extend(["-I", bam_file])
    if dbsnp:
        args.extend(["--dbsnp", dbsnp])
    if call_region:
        args.extend(["-L", bamprep.region_to_gatk(call_region),
                     "--interval_set_rule", "INTERSECTION"])
    return runner, args
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file):
    """Index inputs and build the argument list shared by GATK callers.

    Runs the `picard_index_ref` step on `ref_file` and indexes each BAM in
    `align_bams`. The calling/emitting confidence is "4.0" when the
    configured `coverage_depth_min` (default 4) is below 4 and "30.0"
    otherwise. Reads are downsampled per sample to the configured
    `coverage_depth_max` (default 2000), but never below 200.

    Returns a `(broad_runner, params)` tuple.
    """
    config = items[0]["config"]
    runner = broad.runner_from_config(config)
    runner.run_fn("picard_index_ref", ref_file)
    for bam_file in align_bams:
        bam.index(bam_file, config)

    # GATK can only downsample to a minimum of 200
    configured_max = utils.get_in(config, ("algorithm", "coverage_depth_max"), 2000)
    downsample_depth = max(200, configured_max)
    min_depth = utils.get_in(config, ("algorithm", "coverage_depth_min"), 4)
    configured_regions = config["algorithm"].get("variant_regions", None)
    if min_depth < 4:
        confidence = "4.0"
    else:
        confidence = "30.0"
    call_region = subset_variant_regions(configured_regions, region, out_file, items)

    args = ["-R", ref_file]
    for threshold_flag in ("--standard_min_confidence_threshold_for_calling",
                           "--standard_min_confidence_threshold_for_emitting"):
        args.extend([threshold_flag, confidence])
    args.extend(["--downsample_to_coverage", str(downsample_depth),
                 "--downsampling_type", "BY_SAMPLE"])
    for annotation_name in annotation.get_gatk_annotations(config):
        args.extend(["--annotation", annotation_name])
    for bam_file in align_bams:
        args.extend(["-I", bam_file])
    if dbsnp:
        args.extend(["--dbsnp", dbsnp])
    if call_region:
        args.extend(["-L", bamprep.region_to_gatk(call_region),
                     "--interval_set_rule", "INTERSECTION"])
    return runner, args
def _shared_gatk_call_prep(align_bams, ref_file, config, dbsnp, region, out_file):
    """Index inputs and build the argument list shared by GATK callers.

    Runs the `picard_index_ref` step on `ref_file` and the `picard_index`
    step on each BAM in `align_bams`. The calling/emitting confidence is
    "4.0" when the configured `coverage_depth` is "low" and "30.0"
    otherwise; reads are downsampled to 250 per sample.

    Returns a `(broad_runner, params)` tuple.
    """
    runner = broad.runner_from_config(config)
    runner.run_fn("picard_index_ref", ref_file)
    for bam_file in align_bams:
        runner.run_fn("picard_index", bam_file)

    algorithm = config["algorithm"]
    depth_label = algorithm.get("coverage_depth", "high").lower()
    configured_regions = algorithm.get("variant_regions", None)
    if depth_label == "low":
        confidence = "4.0"
    else:
        confidence = "30.0"
    call_region = subset_variant_regions(configured_regions, region, out_file)

    args = ["-R", ref_file]
    for threshold_flag in ("--standard_min_confidence_threshold_for_calling",
                           "--standard_min_confidence_threshold_for_emitting"):
        args.extend([threshold_flag, confidence])
    args.extend(["--downsample_to_coverage", "250",
                 "--downsampling_type", "BY_SAMPLE"])
    for annotation_name in annotation.get_gatk_annotations(config):
        args.extend(["--annotation", annotation_name])
    for bam_file in align_bams:
        args.extend(["-I", bam_file])
    if dbsnp:
        args.extend(["--dbsnp", dbsnp])
    if call_region:
        args.extend(["-L", bamprep.region_to_gatk(call_region),
                     "--interval_set_rule", "INTERSECTION"])
    return runner, args
def _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores=1):
    """Index inputs and build the argument list shared by GATK callers.

    Indexes every BAM in `align_bams`. When `_use_spark(num_cores, gatk_type)`
    is true the reference is passed as the 2bit file from
    `dd.get_ref_twobit`; otherwise the Picard `picard_index_ref` step is run
    on `ref_file` and it is passed with `-R`. The remaining arguments are a
    relaxed calling threshold when `coverage_depth_min` is below 4,
    annotations, input BAMs, an optional `-L` interval restriction (the
    interval rule flag is spelled with dashes for gatk4 and underscores
    otherwise) and the output of `standard_cl_params(items)`.

    Returns a `(broad_runner, params)` tuple.
    """
    config = items[0]["config"]
    runner = broad.runner_from_config(config)
    gatk_type = runner.gatk_type()
    for bam_file in align_bams:
        bam.index(bam_file, config)

    if not _use_spark(num_cores, gatk_type):
        broad.runner_from_path("picard", config).run_fn("picard_index_ref", ref_file)
        args = ["-R", ref_file]
    else:
        # GATK4 spark runs use 2bit reference index
        args = ["--reference", dd.get_ref_twobit(items[0])]

    min_depth = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if min_depth and min_depth < 4:
        # Low minimum coverage requested: relax the calling threshold.
        args.extend(["--standard_min_confidence_threshold_for_calling", "4.0"])
    for annotation_name in annotation.get_gatk_annotations(config):
        args.extend(["--annotation", annotation_name])
    for bam_file in align_bams:
        args.extend(["-I", bam_file])

    call_region = subset_variant_regions(bedutils.population_variant_regions(items),
                                         region, out_file, items)
    if call_region:
        if gatk_type == "gatk4":
            rule_flag = "--interval-set-rule"
        else:
            rule_flag = "--interval_set_rule"
        args.extend(["-L", bamprep.region_to_gatk(call_region), rule_flag, "INTERSECTION"])
    args.extend(standard_cl_params(items))
    return runner, args
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file):
    """Index inputs and build the argument list shared by GATK callers.

    Runs the `picard_index_ref` step on `ref_file` and indexes each BAM in
    `align_bams`. The calling/emitting confidence is "4.0" when the
    configured `coverage_depth_min` (default 4) is below 4 and "30.0"
    otherwise. Reads are downsampled per sample to the configured
    `coverage_depth_max` (default 2000), but never below 200.

    Returns a `(broad_runner, params)` tuple.
    """
    config = items[0]["config"]
    gatk_runner = broad.runner_from_config(config)
    gatk_runner.run_fn("picard_index_ref", ref_file)
    for bam_path in align_bams:
        bam.index(bam_path, config)

    # GATK can only downsample to a minimum of 200
    depth_ceiling = max(200, utils.get_in(config, ("algorithm", "coverage_depth_max"), 2000))
    depth_floor = utils.get_in(config, ("algorithm", "coverage_depth_min"), 4)
    target_regions = config["algorithm"].get("variant_regions", None)
    confidence = "30.0" if not depth_floor < 4 else "4.0"
    restricted = subset_variant_regions(target_regions, region, out_file, items)

    cl = ["-R", ref_file,
          "--standard_min_confidence_threshold_for_calling", confidence,
          "--standard_min_confidence_threshold_for_emitting", confidence,
          "--downsample_to_coverage", str(depth_ceiling),
          "--downsampling_type", "BY_SAMPLE"]
    for annotation_name in annotation.get_gatk_annotations(config):
        cl += ["--annotation", annotation_name]
    for bam_path in align_bams:
        cl += ["-I", bam_path]
    if dbsnp:
        cl += ["--dbsnp", dbsnp]
    if restricted:
        cl += ["-L", bamprep.region_to_gatk(restricted),
               "--interval_set_rule", "INTERSECTION"]
    return gatk_runner, cl
def _shared_gatk_call_prep(align_bams, ref_file, config, dbsnp, region, out_file):
    """Index inputs and build the argument list shared by GATK callers.

    Runs the `picard_index_ref` step on `ref_file` and the `picard_index`
    step on each BAM in `align_bams`. The calling/emitting confidence is
    "4.0" when the configured `coverage_depth` is "low" and "30.0"
    otherwise. When `out_file` is None it defaults to a `-variants.vcf`
    name derived from the first BAM.

    Returns a `(broad_runner, params, out_file)` tuple.
    """
    runner = broad.runner_from_config(config)
    runner.run_fn("picard_index_ref", ref_file)
    for bam_file in align_bams:
        runner.run_fn("picard_index", bam_file)

    algorithm = config["algorithm"]
    depth_label = algorithm.get("coverage_depth", "high").lower()
    configured_regions = algorithm.get("variant_regions", None)
    if depth_label == "low":
        confidence = "4.0"
    else:
        confidence = "30.0"
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    call_region = subset_variant_regions(configured_regions, region, out_file)

    args = ["-R", ref_file]
    for threshold_flag in ("--standard_min_confidence_threshold_for_calling",
                           "--standard_min_confidence_threshold_for_emitting"):
        args.extend([threshold_flag, confidence])
    for annotation_name in annotation.get_gatk_annotations(config):
        args.extend(["--annotation", annotation_name])
    for bam_file in align_bams:
        args.extend(["-I", bam_file])
    if dbsnp:
        args.extend(["--dbsnp", dbsnp])
    if call_region:
        # The subset region is handed to -L as returned, without conversion.
        args.extend(["-L", call_region, "--interval_set_rule", "INTERSECTION"])
    return runner, args, out_file
def mutect2_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run GATK MuTect2/Mutect2 and return the bgzipped, indexed VCF path.

    This requires the full non open-source version of GATK 3.5+.

    With gatk4 the tool is named "Mutect2", writes its calls with `-O` and is
    followed by the command from `_mutect2_filter`; otherwise "MuTect2"
    output is redirected from stdout. The raw calls are passed through
    `_af_filter` and the result through `vcfutils.bgzip_and_index`.
    `assoc_files` is accepted but not used here.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        paired = vcfutils.get_paired_bams(align_bams, items)
        config = items[0]["config"]
        runner = broad.runner_from_config(config)
        gatk_type = runner.gatk_type()
        is_gatk4 = gatk_type == "gatk4"
        _prep_inputs(align_bams, ref_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            args = ["-T", "Mutect2" if is_gatk4 else "MuTect2",
                    "-R", ref_file,
                    "--annotation", "ClippingRankSumTest",
                    "--annotation", "DepthPerSampleHC"]
            for annotation_name in annotation.get_gatk_annotations(
                    config, include_baseqranksum=False):
                args.extend(["--annotation", annotation_name])
            if is_gatk4:
                # Avoid issues with BAM CIGAR reads that GATK doesn't like
                args.extend(["--read-validation-stringency", "LENIENT"])
            args.extend(_add_tumor_params(paired, items, gatk_type))
            args.extend(_add_region_params(region, out_file, items, gatk_type))
            # dbSNP/Cosmic (assoc_files) are intentionally not added so they do
            # not get fed to the variant filtering algorithm. Not yet clear how
            # this helps or hurts in a general case.
            args.extend(["-ploidy", str(ploidy.get_ploidy(items, region))])
            resources = config_utils.get_resources("mutect2", config)
            if "options" in resources:
                args.extend([str(opt) for opt in resources.get("options", [])])
            assert LooseVersion(runner.gatk_major_version()) >= LooseVersion("3.5"), \
                "Require full version of GATK 3.5+ for mutect2 calling"
            runner.new_resources("mutect2")
            gatk_cmd = runner.cl_gatk(args, os.path.dirname(tx_out_file))
            tx_base, tx_ext = utils.splitext_plus(tx_out_file)
            if is_gatk4:
                tx_raw_prefilt_file = "%s-raw%s" % (tx_base, tx_ext)
                tx_raw_file = "%s-raw-filt%s" % (tx_base, tx_ext)
                filter_cmd = _mutect2_filter(runner, tx_raw_prefilt_file, tx_raw_file)
                shell_cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {filter_cmd}".format(
                    gatk_cmd=gatk_cmd, tx_raw_prefilt_file=tx_raw_prefilt_file,
                    filter_cmd=filter_cmd)
            else:
                tx_raw_file = "%s-raw%s" % (tx_base, tx_ext)
                shell_cmd = "{gatk_cmd} > {tx_raw_file}".format(
                    gatk_cmd=gatk_cmd, tx_raw_file=tx_raw_file)
            do.run(shell_cmd, "MuTect2")
            out_file = _af_filter(paired.tumor_data, tx_raw_file, out_file)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
def mutect2_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run GATK MuTect2/Mutect2 and return the bgzipped, indexed VCF path.

    This requires the full non open-source version of GATK 3.5+.

    With gatk4 the tool is named "Mutect2", takes the reference through
    `--reference`, writes its calls with `-O` and is followed by the command
    from `_mutect2_filter`; otherwise "MuTect2" takes `-R` and its output is
    redirected from stdout. The raw calls are passed through `_af_filter`
    and the result through `vcfutils.bgzip_and_index`.
    `assoc_files` is accepted but not used here.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        paired = vcfutils.get_paired_bams(align_bams, items)
        config = items[0]["config"]
        runner = broad.runner_from_config(config)
        gatk_type = runner.gatk_type()
        is_gatk4 = gatk_type == "gatk4"
        _prep_inputs(align_bams, ref_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            args = ["-T", "Mutect2" if is_gatk4 else "MuTect2",
                    "--annotation", "ClippingRankSumTest",
                    "--annotation", "DepthPerSampleHC"]
            args.extend(["--reference" if is_gatk4 else "-R", ref_file])
            for annotation_name in annotation.get_gatk_annotations(
                    config, include_baseqranksum=False):
                args.extend(["--annotation", annotation_name])
            if is_gatk4:
                # Avoid issues with BAM CIGAR reads that GATK doesn't like
                args.extend(["--read-validation-stringency", "LENIENT"])
            args.extend(_add_tumor_params(paired, items, gatk_type))
            args.extend(_add_region_params(region, out_file, items, gatk_type))
            # dbSNP/Cosmic (assoc_files) are intentionally not added so they do
            # not get fed to the variant filtering algorithm. Not yet clear how
            # this helps or hurts in a general case.
            resources = config_utils.get_resources("mutect2", config)
            if "options" in resources:
                args.extend([str(opt) for opt in resources.get("options", [])])
            assert LooseVersion(runner.gatk_major_version()) >= LooseVersion("3.5"), \
                "Require full version of GATK 3.5+ for mutect2 calling"
            runner.new_resources("mutect2")
            gatk_cmd = runner.cl_gatk(args, os.path.dirname(tx_out_file))
            tx_base, tx_ext = utils.splitext_plus(tx_out_file)
            if is_gatk4:
                tx_raw_prefilt_file = "%s-raw%s" % (tx_base, tx_ext)
                tx_raw_file = "%s-raw-filt%s" % (tx_base, tx_ext)
                filter_cmd = _mutect2_filter(runner, tx_raw_prefilt_file,
                                             tx_raw_file, ref_file)
                shell_cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {filter_cmd}".format(
                    gatk_cmd=gatk_cmd, tx_raw_prefilt_file=tx_raw_prefilt_file,
                    filter_cmd=filter_cmd)
            else:
                tx_raw_file = "%s-raw%s" % (tx_base, tx_ext)
                shell_cmd = "{gatk_cmd} > {tx_raw_file}".format(
                    gatk_cmd=gatk_cmd, tx_raw_file=tx_raw_file)
            do.run(shell_cmd, "MuTect2")
            out_file = _af_filter(paired.tumor_data, tx_raw_file, out_file)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, cosmic, region, out_file):
    """Shared preparation work for GATK-based MuTect tumor/normal calling.

    Runs the Picard `picard_index_ref` step on `ref_file`, indexes each BAM
    in `align_bams` once, and builds the common argument list: reference,
    relaxed calling/emitting thresholds when `coverage_depth_min` is below 4,
    annotations, the tumor BAM (plus normal BAM and normal panel when the
    pairing provides them), optional `--dbsnp` / `--cosmic` inputs and an
    optional `-L` interval restriction.

    Returns a `(broad_runner, params)` tuple, where the runner is built from
    the configuration of the first item.

    Raises ValueError when `vcfutils.get_paired_bams` finds no tumor sample.
    """
    data = items[0]
    config = data["config"]
    picard_runner = broad.runner_from_path("picard", config)
    picard_runner.run_fn("picard_index_ref", ref_file)
    # Index every input exactly once; the BAMs themselves are passed below as
    # -I:tumor / -I:normal, so no second pass over align_bams is needed.
    for x in align_bams:
        bam.index(x, config)
    params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence,
                   "--standard_min_confidence_threshold_for_emitting", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    paired = vcfutils.get_paired_bams(align_bams, items)
    if not paired:
        raise ValueError(
            "Specified MuTect calling but 'tumor' phenotype not present in batch\n"
            "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
            "pipelines.html#cancer-variant-calling\n"
            "for samples: %s" % ", ".join([dd.get_sample_name(x) for x in items]))
    params += ["-I:tumor", paired.tumor_bam]
    if paired.normal_bam is not None:
        params += ["-I:normal", paired.normal_bam]
    if paired.normal_panel is not None:
        params += ["--normal_panel", paired.normal_panel]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    if cosmic:
        params += ["--cosmic", cosmic]
    variant_regions = tz.get_in(["algorithm", "variant_regions"], config)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += ["-L", bamprep.region_to_gatk(region),
                   "--interval_set_rule", "INTERSECTION"]
    broad_runner = broad.runner_from_config(config)
    return broad_runner, params
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, cosmic, region, out_file):
    """Shared preparation work for GATK-based MuTect tumor/normal calling.

    Runs the Picard `picard_index_ref` step on `ref_file`, indexes each BAM
    in `align_bams` once, and builds the common argument list: reference,
    relaxed calling/emitting thresholds when `coverage_depth_min` is below 4,
    annotations, the tumor BAM (plus normal BAM and normal panel when the
    pairing provides them), optional `--dbsnp` / `--cosmic` inputs and an
    optional `-L` interval restriction.

    Returns a `(broad_runner, params)` tuple, where the runner is built from
    the configuration of the first item.

    Raises ValueError when `vcfutils.get_paired_bams` finds no tumor sample.
    """
    config = items[0]["config"]
    picard_runner = broad.runner_from_path("picard", config)
    picard_runner.run_fn("picard_index_ref", ref_file)
    # A single indexing pass is enough: the original repeated this loop a
    # second time after the annotations without touching the BAMs in between.
    for bam_file in align_bams:
        bam.index(bam_file, config)

    params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence,
                   "--standard_min_confidence_threshold_for_emitting", confidence]
    for annotation_name in annotation.get_gatk_annotations(config):
        params += ["--annotation", annotation_name]

    paired = vcfutils.get_paired_bams(align_bams, items)
    if not paired:
        raise ValueError("Specified MuTect calling but 'tumor' phenotype not present in batch\n"
                         "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
                         "pipelines.html#cancer-variant-calling\n"
                         "for samples: %s" % ", " .join([dd.get_sample_name(x) for x in items]))
    params += ["-I:tumor", paired.tumor_bam]
    if paired.normal_bam is not None:
        params += ["-I:normal", paired.normal_bam]
    if paired.normal_panel is not None:
        params += ["--normal_panel", paired.normal_panel]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    if cosmic:
        params += ["--cosmic", cosmic]

    variant_regions = tz.get_in(["algorithm", "variant_regions"], config)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += ["-L", bamprep.region_to_gatk(region),
                   "--interval_set_rule", "INTERSECTION"]
    broad_runner = broad.runner_from_config(config)
    return broad_runner, params
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file):
    """Index inputs and build the argument list shared by GATK callers.

    Runs the `picard_index_ref` step on `ref_file` and indexes each BAM in
    `align_bams`. When a maximum coverage depth is set for the first item,
    reads are downsampled per sample to that depth (never below 200).
    Calling/emitting thresholds are relaxed to "4.0" when
    `coverage_depth_min` is below 4. Annotations, input BAMs, optional
    `--dbsnp` and an optional `-L` interval restriction follow.

    Returns a `(broad_runner, params)` tuple.
    """
    data = items[0]
    config = data["config"]
    runner = broad.runner_from_config(config)
    runner.run_fn("picard_index_ref", ref_file)
    for bam_file in align_bams:
        bam.index(bam_file, config)

    args = ["-R", ref_file]
    if dd.is_set_coverage_depth_max(data):
        # GATK can only downsample to a minimum of 200
        downsample_depth = max(200, dd.get_coverage_depth_max(data))
        args.extend(["--downsample_to_coverage", str(downsample_depth),
                     "--downsampling_type", "BY_SAMPLE"])
    min_depth = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if min_depth and min_depth < 4:
        for threshold_flag in ("--standard_min_confidence_threshold_for_calling",
                               "--standard_min_confidence_threshold_for_emitting"):
            args.extend([threshold_flag, "4.0"])
    for annotation_name in annotation.get_gatk_annotations(config):
        args.extend(["--annotation", annotation_name])
    for bam_file in align_bams:
        args.extend(["-I", bam_file])
    if dbsnp:
        args.extend(["--dbsnp", dbsnp])

    configured_regions = tz.get_in(["algorithm", "variant_regions"], config)
    call_region = subset_variant_regions(configured_regions, region, out_file, items)
    if call_region:
        args.extend(["-L", bamprep.region_to_gatk(call_region),
                     "--interval_set_rule", "INTERSECTION"])
    return runner, args
def mutect2_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call variation with GATK's MuTect2.

    This requires the full non open-source version of GATK 3.5+.
    items = 1 sample or T/N pair

    Two modes are handled when the output does not exist yet:

    - PureCN mode ("purecn" among the first item's SV callers): Mutect2 is
      run keeping germline sites, with an optional SNV panel of normals,
      followed by the command from `_mutect2_filter`; no AF filter is applied.
    - Regular mode: MuTect2/Mutect2 is run on the paired BAMs, optionally
      with the read orientation model (gatk4 only), and the raw calls are
      passed through `_af_filter`.

    `assoc_files` is accepted but not used here.
    Returns the result of `vcfutils.bgzip_and_index` on the final VCF.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        # call somatic variants keeping germline sites and using germline 1KG resource
        # use --native-pair-hmm-threads?
        broad_runner = broad.runner_from_config(items[0]["config"])
        gatk_type = broad_runner.gatk_type()
        # shared Mutect2 settings for PureCN analysis in the case of:
        # - PON creation
        # - Tumor-only PureCN run
        # - T/N PureCN run
        # PURECN requirement alters Mutect2 variants calling!
        if "purecn" in dd.get_svcaller(items[0]):
            # mutect call for PON creation or purecn T-only analysis
            _prep_inputs(align_bams, ref_file, items)
            with file_transaction(items[0], out_file) as tx_out_file:
                germline_resource = tz.get_in(["genome_resources", "variation", "af_only_gnomad"],
                                              items[0])
                # The germline resource path is resolved relative to the reference directory.
                germline_path = os.path.normpath(os.path.join(os.path.dirname(ref_file),
                                                              germline_resource))
                input_bam = dd.get_work_bam(items[0])
                tx_prefilt_vcf = utils.splitext_plus(tx_out_file)[0] + ".prefilt.vcf"
                out_file_ungz = os.path.splitext(out_file)[0]
                params = ["-T", "Mutect2"]
                if len(items) == 2:
                    # T/N pair
                    paired = vcfutils.get_paired_bams(align_bams, items)
                    # not really running purecn with mutect1/gatk3
                    params += _add_tumor_params(paired, items, gatk_type)
                    logger.debug("You are running mutect2 in PureCN analysis in T/N mode, "
                                 "T-only + PON is recommended")
                else:
                    # T only
                    params += ["-I", input_bam]
                # adding SNV PON from background/variant
                snv_pon = tz.get_in(["config", "algorithm", "background", "variant"], items[0])
                if snv_pon and dd.get_batch(items[0]) != "pon_build":
                    params += ["-pon", snv_pon]
                    params += ["--genotype-pon-sites"]
                opt_list = config_utils.get_resources("mutect2", items[0]["config"]).get("options")
                # default is 50, sometimes 100 or 200 is recommended for better sensitivity
                # in detection hom del CNVs (calling more variants helps)
                interval_padding = 50
                if opt_list:
                    # Options are read as a flat [flag, value, flag, value, ...] list.
                    opt_dict = dict(zip(opt_list[::2], opt_list[1::2]))
                    if "--interval_padding" in opt_dict:
                        interval_padding = opt_dict["--interval_padding"]
                # Stringify the padding so params holds only command line strings,
                # matching how user options are stringified in the regular branch.
                params += ["--max-mnp-distance", "0",
                           "--interval-padding", str(interval_padding),
                           "--germline-resource", germline_path,
                           "--genotype-germline-sites",
                           "--reference", ref_file,
                           "-O", tx_prefilt_vcf]
                params += _add_region_params(region, out_file, items, gatk_type)
                broad_runner.new_resources("mutect2")
                gatk_cmd = broad_runner.cl_gatk(params, os.path.dirname(tx_out_file))
                filter_cmd = _mutect2_filter(broad_runner, items, tx_prefilt_vcf,
                                             out_file_ungz, ref_file)
                cmd = "{gatk_cmd} && {filter_cmd}"
                do.run(cmd.format(**locals()), "MuTect2")
                # no AF filter for PureCN variants
                out_file = vcfutils.bgzip_and_index(out_file_ungz, items[0]["config"])
        else:
            # a regular mutect call
            paired = vcfutils.get_paired_bams(align_bams, items)
            f1r2_file = None
            _prep_inputs(align_bams, ref_file, items)
            with file_transaction(items[0], out_file) as tx_out_file:
                params = ["-T", "Mutect2" if gatk_type == "gatk4" else "MuTect2",
                          "--annotation", "ClippingRankSumTest",
                          "--annotation", "DepthPerSampleHC"]
                if gatk_type == "gatk4":
                    params += ["--reference", ref_file]
                else:
                    params += ["-R", ref_file]
                for a in annotation.get_gatk_annotations(items[0]["config"],
                                                         include_baseqranksum=False):
                    params += ["--annotation", a]
                # Avoid issues with BAM CIGAR reads that GATK doesn't like
                if gatk_type == "gatk4":
                    params += ["--read-validation-stringency", "LENIENT"]
                params += _add_tumor_params(paired, items, gatk_type)
                params += _add_region_params(region, out_file, items, gatk_type)
                # NOTE(review): elsewhere utils.get_in appears to be called as
                # (dict, key-tuple, default); here "config" is passed as the keys
                # and "tools_on" as the default -- confirm this lookup is intended.
                orientation_filter = (
                    all(is_paired(bam_file) for bam_file in align_bams)
                    and "mutect2_readmodel" in utils.get_in(items[0], "config", "tools_on"))
                if gatk_type == "gatk4" and orientation_filter:
                    f1r2_file = "{}-f1r2.tar.gz".format(utils.splitext_plus(out_file)[0])
                    params += ["--f1r2-tar-gz", f1r2_file]
                # Avoid adding dbSNP/Cosmic so they do not get fed to variant filtering algorithm
                # Not yet clear how this helps or hurts in a general case.
                #params += _add_assoc_params(assoc_files)
                resources = config_utils.get_resources("mutect2", items[0]["config"])
                if "options" in resources:
                    params += [str(x) for x in resources.get("options", [])]
                assert LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.5"), \
                    "Require full version of GATK 3.5+ for mutect2 calling"
                broad_runner.new_resources("mutect2")
                gatk_cmd = broad_runner.cl_gatk(params, os.path.dirname(tx_out_file))
                # The command templates below are filled from local variable names.
                if gatk_type == "gatk4":
                    tx_raw_prefilt_file = "%s-raw%s" % utils.splitext_plus(out_file)
                    tx_raw_file = "%s-raw-filt%s" % utils.splitext_plus(tx_out_file)
                    if orientation_filter:
                        tx_f1r2_file = "{}-read-orientation-model.tar.gz"
                        tx_f1r2_file = tx_f1r2_file.format(utils.splitext_plus(f1r2_file)[0])
                        tx_read_orient_cmd = _mutect2_read_filter(broad_runner, f1r2_file,
                                                                  tx_f1r2_file)
                        filter_cmd = _mutect2_filter(broad_runner, items, tx_raw_prefilt_file,
                                                     tx_raw_file, ref_file, tx_f1r2_file)
                        cmd = ("{gatk_cmd} -O {tx_raw_prefilt_file} && "
                               "{tx_read_orient_cmd} && {filter_cmd}")
                    else:
                        filter_cmd = _mutect2_filter(broad_runner, items, tx_raw_prefilt_file,
                                                     tx_raw_file, ref_file)
                        cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {filter_cmd}"
                else:
                    tx_raw_file = "%s-raw%s" % utils.splitext_plus(tx_out_file)
                    cmd = "{gatk_cmd} > {tx_raw_file}"
                do.run(cmd.format(**locals()), "MuTect2")
                out_file = _af_filter(paired.tumor_data, tx_raw_file, out_file)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])