def _square_batch_bcbio_variation(data, region, bam_files, vrn_files, out_file, todo="square"):
    """Run squaring or merging analysis using bcbio.variation.recall.
    """
    ref_file = tz.get_in(("reference", "fasta", "base"), data)
    cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
    resources = config_utils.get_resources("bcbio-variation-recall", data["config"])
    # adjust memory by cores but leave room for run program memory
    memcores = int(math.ceil(float(cores) / 5.0))
    jvm_opts = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms250m", "-Xmx2g"]),
                                        {"algorithm": {"memory_adjust": {"direction": "increase",
                                                                         "magnitude": memcores}}})
    # Write unique VCFs and BAMs to input file
    input_file = "%s-inputs.txt" % os.path.splitext(out_file)[0]
    with open(input_file, "w") as out_handle:
        out_handle.write("\n".join(sorted(list(set(vrn_files)))) + "\n")
        if todo == "square":
            out_handle.write("\n".join(sorted(list(set(bam_files)))) + "\n")
    variantcaller = tz.get_in(("config", "algorithm", "jointcaller"), data).replace("-joint", "")
    cmd = (["bcbio-variation-recall", todo] + jvm_opts + broad.get_default_jvm_opts() +
           ["-c", cores, "-r", bamprep.region_to_gatk(region)])
    if todo == "square":
        cmd += ["--caller", variantcaller]
    cmd += [out_file, ref_file, input_file]
    do.run(cmd, "%s in region: %s" % (todo, bamprep.region_to_gatk(region)))
    return out_file
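# Every helper in this collection funnels genomic regions through
# bamprep.region_to_gatk. A minimal sketch of the conversion it performs,
# assuming bcbio's convention of (chrom, start, end) tuples with 0-based
# starts: GATK-style strings are 1-based and end-inclusive, while BED file
# paths and pre-formatted strings pass through unchanged. This is an
# illustrative re-implementation, not the bcbio original.
def region_to_gatk_sketch(region):
    if isinstance(region, (list, tuple)):
        chrom, start, end = region
        return "%s:%s-%s" % (chrom, start + 1, end)
    else:
        return region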
def _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores=1):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    gatk_type = broad_runner.gatk_type()
    for x in align_bams:
        bam.index(x, config)
    picard_runner = broad.runner_from_path("picard", config)
    picard_runner.run_fn("picard_index_ref", ref_file)
    params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        if gatk_type == "gatk4":
            params += ["-L", bamprep.region_to_gatk(region), "--interval-set-rule", "INTERSECTION"]
        else:
            params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    params += standard_cl_params(items)
    return broad_runner, params
def _add_region_params(region, out_file, items, gatk_type):
    """Add parameters for selecting by region to command line.
    """
    params = []
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        if gatk_type == "gatk4":
            params += ["-L", bamprep.region_to_gatk(region), "--interval-set-rule", "INTERSECTION"]
        else:
            params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    params += gatk.standard_cl_params(items)
    return params
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    config = items[0]["config"]
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    # GATK can only downsample to a minimum of 200
    coverage_depth_max = max(200, utils.get_in(config, ("algorithm", "coverage_depth_max"), 2000))
    coverage_depth_min = utils.get_in(config, ("algorithm", "coverage_depth_min"), 4)
    variant_regions = config["algorithm"].get("variant_regions", None)
    confidence = "4.0" if coverage_depth_min < 4 else "30.0"
    region = subset_variant_regions(variant_regions, region, out_file, items)
    params = ["-R", ref_file,
              "--standard_min_confidence_threshold_for_calling", confidence,
              "--standard_min_confidence_threshold_for_emitting", confidence,
              "--downsample_to_coverage", str(coverage_depth_max),
              "--downsampling_type", "BY_SAMPLE"]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    return broad_runner, params
def _config_params(base_config, assoc_files, region, out_file, items):
    """Add parameters based on configuration variables, associated files and genomic regions.
    """
    params = []
    dbsnp = assoc_files.get("dbsnp")
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    cosmic = assoc_files.get("cosmic")
    if cosmic:
        params += ["--cosmic", cosmic]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    # set low frequency calling parameter if adjusted
    # to set other MuTect parameters on contamination, pass options to resources for mutect
    # --fraction_contamination --minimum_normal_allele_fraction
    min_af = tz.get_in(["algorithm", "min_allele_fraction"], base_config)
    if min_af:
        params += ["--minimum_mutation_cell_fraction", "%.2f" % (min_af / 100.0)]
    resources = config_utils.get_resources("mutect", base_config)
    if resources.get("options") is not None:
        params += [str(x) for x in resources.get("options", [])]
    # Output quality scores
    if "--enable_qscore_output" not in params:
        params.append("--enable_qscore_output")
    # drf is not currently supported in MuTect to turn off the duplicate read filter
    # params += gatk.standard_cl_params(items)
    return params
def _run_genotype_gvcfs_gatk3(data, region, vrn_files, ref_file, out_file):
    """Performs genotyping of gVCFs into final VCF files.
    """
    if not utils.file_exists(out_file):
        broad_runner = broad.runner_from_config(data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            assoc_files = tz.get_in(("genome_resources", "variation"), data, {})
            if not assoc_files:
                assoc_files = {}
            params = ["-T", "GenotypeGVCFs",
                      "-R", ref_file, "-o", tx_out_file,
                      "-L", bamprep.region_to_gatk(region),
                      "--max_alternate_alleles", "4"]
            for vrn_file in vrn_files:
                params += ["--variant", vrn_file]
            if assoc_files.get("dbsnp"):
                params += ["--dbsnp", assoc_files["dbsnp"]]
            broad_runner.new_resources("gatk-haplotype")
            cores = dd.get_cores(data)
            if cores > 1:
                # GATK performs poorly with memory usage when parallelizing
                # with a large number of cores but makes use of extra memory,
                # so we cap at 6 cores.
                # See issue #1565 for discussion
                # Recent GATK 3.x versions also have race conditions with multiple
                # threads, so limit to 1 and keep memory available
                # https://gatkforums.broadinstitute.org/wdl/discussion/8718/concurrentmodificationexception-in-gatk-3-7-genotypegvcfs
                # params += ["-nt", str(min(6, cores))]
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
            else:
                memscale = None
            broad_runner.run_gatk(params, memscale=memscale, parallel_gc=True)
    return vcfutils.bgzip_and_index(out_file, data["config"])
def _config_params(base_config, assoc_files, region, out_file):
    """Add parameters based on configuration variables, associated files and genomic regions.
    """
    params = []
    dbsnp = assoc_files.get("dbsnp")
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    cosmic = assoc_files.get("cosmic")
    if cosmic:
        params += ["--cosmic", cosmic]
    variant_regions = base_config["algorithm"].get("variant_regions")
    region = subset_variant_regions(variant_regions, region, out_file)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    # set low frequency calling parameter if adjusted
    # to set other MuTect parameters on contamination, pass options to resources for mutect
    # --fraction_contamination --minimum_normal_allele_fraction
    min_af = tz.get_in(["algorithm", "min_allele_fraction"], base_config)
    if min_af:
        params += ["--minimum_mutation_cell_fraction", "%.2f" % (min_af / 100.0)]
    resources = config_utils.get_resources("mutect", base_config)
    if resources.get("options") is not None:
        params += [str(x) for x in resources.get("options", [])]
    return params
def _run_genomicsdb_import(vrn_files, region, out_file, data):
    """Create a GenomicsDB reference for all the variation files: GATK4.

    Not yet tested at scale; need to explore --batchSize to reduce memory usage if needed.

    Does not support transactional directories yet, since GenomicsDB databases
    cannot be moved to new locations. We try to identify half-finished databases and restart:
    https://gatkforums.broadinstitute.org/gatk/discussion/10061/using-genomicsdbimport-to-prepare-gvcfs-for-input-to-genotypegvcfs-in-gatk4

    Known issue -- GenomicsDB workspace path core dumps on longer paths:
    (std::string::compare(char const*))
    """
    out_dir = "%s_genomicsdb" % utils.splitext_plus(out_file)[0]
    if not os.path.exists(out_dir) or _incomplete_genomicsdb(out_dir):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        with utils.chdir(os.path.dirname(out_file)):
            with file_transaction(data, out_dir) as tx_out_dir:
                broad_runner = broad.runner_from_config(data["config"])
                cores = dd.get_cores(data)
                params = ["-T", "GenomicsDBImport",
                          "--reader-threads", str(cores),
                          "--genomicsdb-workspace-path", os.path.relpath(out_dir, os.getcwd()),
                          "-L", bamprep.region_to_gatk(region)]
                for vrn_file in vrn_files:
                    vcfutils.bgzip_and_index(vrn_file, data["config"])
                    params += ["--variant", vrn_file]
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
                broad_runner.run_gatk(params, memscale=memscale)
    return out_dir
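# _run_genomicsdb_import calls an _incomplete_genomicsdb helper that is not
# shown here. A minimal sketch of one way to detect a half-finished workspace,
# assuming a completed import always leaves GenomicsDB's vidmap.json metadata
# file behind (the marker file is an assumption, not the confirmed bcbio check):
def _incomplete_genomicsdb_sketch(dbdir):
    return "vidmap.json" not in set(os.listdir(dbdir))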
def combine_variant_files(orig_files, out_file, ref_file, config, quiet_out=True, region=None):
    """Combine multiple VCF files into a single output file.

    Handles complex merging of samples and other tricky issues using GATK.
    """
    in_pipeline = False
    if isinstance(orig_files, dict):
        file_key = config["file_key"]
        in_pipeline = True
        orig_files = orig_files[file_key]
    broad_runner = broad.runner_from_config(config)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            params = ["-T", "CombineVariants",
                      "-R", ref_file,
                      "--out", tx_out_file]
            priority_order = []
            for i, orig_file in enumerate(orig_files):
                name = "v%s" % i
                params.extend(["--variant:{name}".format(name=name), orig_file])
                priority_order.append(name)
            params.extend(["--rod_priority_list", ",".join(priority_order)])
            if quiet_out:
                params.extend(["--suppressCommandLineHeader", "--setKey", "null"])
            variant_regions = config["algorithm"].get("variant_regions", None)
            cur_region = shared.subset_variant_regions(variant_regions, region, out_file)
            if cur_region:
                params += ["-L", bamprep.region_to_gatk(cur_region), "--interval_set_rule", "INTERSECTION"]
            broad_runner.run_gatk(params)
    if in_pipeline:
        return [{file_key: out_file, "region": region, "sam_ref": ref_file, "config": config}]
    else:
        return out_file
def prep_mpileup(align_bams, ref_file, max_read_depth, config, target_regions=None, want_bcf=True):
    cl = [config_utils.get_program("samtools", config), "mpileup",
          "-f", ref_file,
          "-d", str(max_read_depth),
          "-L", str(max_read_depth),
          "-m", "3",
          "-F", "0.0002"]
    if want_bcf:
        cl += ["-D", "-S", "-u"]
    if target_regions:
        str_regions = bamprep.region_to_gatk(target_regions)
        if os.path.isfile(str_regions):
            cl += ["-l", str_regions]
        else:
            cl += ["-r", str_regions]
    cl += align_bams
    return " ".join(cl)
def _shared_gatk_call_prep(align_bams, ref_file, config, dbsnp, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    coverage_depth = config["algorithm"].get("coverage_depth", "high").lower()
    variant_regions = config["algorithm"].get("variant_regions", None)
    confidence = "4.0" if coverage_depth in ["low"] else "30.0"
    region = subset_variant_regions(variant_regions, region, out_file)
    params = ["-R", ref_file,
              "--standard_min_confidence_threshold_for_calling", confidence,
              "--standard_min_confidence_threshold_for_emitting", confidence,
              "--downsample_to_coverage", "250",
              "--downsampling_type", "BY_SAMPLE"]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    return broad_runner, params
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_path("picard", config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence,
                   "--standard_min_confidence_threshold_for_emitting", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    variant_regions = tz.get_in(["algorithm", "variant_regions"], config)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    broad_runner = broad.runner_from_config(config)
    return broad_runner, params
def _SID_call_prep(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Preparation work for SomaticIndelDetector.
    """
    base_config = items[0]["config"]
    for x in align_bams:
        bam.index(x, base_config)
    params = ["-R", ref_file, "-T", "SomaticIndelDetector", "-U", "ALLOW_N_CIGAR_READS"]
    # Limit the per base read start count to between 200-10000, i.e. from any
    # base no more than 10000 new reads can begin.
    # Further, limit maxNumberOfReads accordingly, otherwise SID discards
    # windows for high coverage panels.
    window_size = 200  # default SID value
    paired = vcfutils.get_paired_bams(align_bams, items)
    max_depth = min(max(200, get_in(paired.tumor_config, ("algorithm", "coverage_depth_max"), 10000)), 10000)
    params += ["--downsample_to_coverage", max_depth]
    params += ["--maxNumberOfReads", str(int(max_depth) * window_size)]
    params += ["--read_filter", "NotPrimaryAlignment"]
    params += ["-I:tumor", paired.tumor_bam]
    min_af = float(get_in(paired.tumor_config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
    if paired.normal_bam is not None:
        params += ["-I:normal", paired.normal_bam]
        # note there must be at least 4 reads of coverage in normal
        params += ["--filter_expressions", "T_COV<6||N_COV<4||T_INDEL_F<%s||T_INDEL_CF<0.7" % min_af]
    else:
        params += ["--unpaired"]
        params += ["--filter_expressions", "COV<6||INDEL_F<%s||INDEL_CF<0.7" % min_af]
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    return params
def _run_genotype_gvcfs(data, region, vrn_files, ref_file, out_file):
    """Performs genotyping of gVCFs into final VCF files.
    """
    if not utils.file_exists(out_file):
        broad_runner = broad.runner_from_config(data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            assoc_files = tz.get_in(("genome_resources", "variation"), data, {})
            if not assoc_files:
                assoc_files = {}
            params = ["-T", "GenotypeGVCFs",
                      "-R", ref_file, "-o", tx_out_file,
                      "-L", bamprep.region_to_gatk(region),
                      "--max_alternate_alleles", "4"]
            for vrn_file in vrn_files:
                params += ["--variant", vrn_file]
            if assoc_files.get("dbsnp"):
                params += ["--dbsnp", assoc_files["dbsnp"]]
            broad_runner.new_resources("gatk-haplotype")
            cores = dd.get_cores(data)
            if cores > 1:
                # GATK performs poorly with memory usage when parallelizing
                # with a large number of cores but makes use of extra memory,
                # so we cap at 6 cores.
                # See issue #1565 for discussion
                params += ["-nt", str(min(6, cores))]
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
            else:
                memscale = None
            broad_runner.run_gatk(params, memscale=memscale)
    return vcfutils.bgzip_and_index(out_file, data["config"])
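# The memscale dictionaries used throughout these helpers ask for the JVM heap
# to be scaled up when multiple cores are reserved. A hypothetical sketch of
# the idea only (bcbio's real logic lives in broad_runner.run_gatk and
# config_utils.adjust_opts and is more involved):
def _scale_xmx_sketch(xmx_gb, memscale):
    """Scale a base per-core -Xmx value (in Gb) by the requested magnitude."""
    if memscale and memscale["direction"] == "increase":
        xmx_gb = int(round(xmx_gb * memscale["magnitude"]))
    return "-Xmx%sg" % max(1, xmx_gb)
# _scale_xmx_sketch(2, {"magnitude": 0.9 * 8, "direction": "increase"}) -> "-Xmx14g"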
def read_backed_phasing(vcf_file, bam_files, genome_file, region, config):
    """Phase variants using GATK's read-backed phasing.
    http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_phasing_ReadBackedPhasing.html
    """
    if has_variants(vcf_file):
        broad_runner = broad.runner_from_config(config)
        out_file = "%s-phased%s" % os.path.splitext(vcf_file)
        if not file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                params = ["-T", "ReadBackedPhasing",
                          "-R", genome_file,
                          "--variant", vcf_file,
                          "--out", tx_out_file,
                          "--downsample_to_coverage", "250",
                          "--downsampling_type", "BY_SAMPLE"]
                for bam_file in bam_files:
                    params += ["-I", bam_file]
                variant_regions = config["algorithm"].get("variant_regions", None)
                region = shared.subset_variant_regions(variant_regions, region, out_file)
                if region:
                    params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
                broad_runner.run_gatk(params)
        return out_file
    else:
        return vcf_file
def _mutect_call_prep(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Preparation work for MuTect.
    """
    # FIXME: We assume all other bits in the config are shared
    base_config = items[0]["config"]
    dbsnp = assoc_files["dbsnp"]
    cosmic = assoc_files.get("cosmic")
    broad_runner = broad.runner_from_config(base_config, "mutect")
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        broad_runner.run_fn("picard_index", x)
    variant_regions = base_config["algorithm"].get("variant_regions", None)
    contamination = base_config["algorithm"].get("fraction_contamination", 0)
    region = subset_variant_regions(variant_regions, region, out_file)
    # FIXME: Add more parameters like fraction contamination etc
    params = ["-R", ref_file, "-T", "MuTect"]
    params += ["--dbsnp", dbsnp]
    tumor_bam = None
    normal_bam = None
    for bamfile, item in itertools.izip(align_bams, items):
        metadata = item["metadata"]
        if metadata["phenotype"] == "normal":
            normal_bam = bamfile
            normal_sample_name = item["name"][1]
        elif metadata["phenotype"] == "tumor":
            tumor_bam = bamfile
            tumor_sample_name = item["name"][1]
    if tumor_bam is None or normal_bam is None:
        raise ValueError("Missing phenotype definition (tumor or normal) in samples")
    params += ["-I:normal", normal_bam]
    params += ["-I:tumor", tumor_bam]
    params += ["--tumor_sample_name", tumor_sample_name]
    params += ["--normal_sample_name", normal_sample_name]
    params += ["--fraction_contamination", contamination]
    if cosmic is not None:
        params += ["--cosmic", cosmic]
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    return broad_runner, params
def _add_region_params(region, out_file, items):
    """Add parameters for selecting by region to command line.
    """
    params = []
    variant_regions = tz.get_in(["config", "algorithm", "variant_regions"], items[0])
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    return params
def _subset_regions(region, base_file, items):
    """Subset to a BED file (or genomic region) for calling.
    """
    variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), items[0])
    target = pshared.subset_variant_regions(variant_regions, region, base_file, items)
    if isinstance(target, basestring) and os.path.isfile(target):
        return target
    else:
        return bamprep.region_to_gatk(target)
def _subset_regions(region, base_file, items):
    """Subset to a BED file (or genomic region) for calling.
    """
    variant_regions = bedutils.population_variant_regions(items, merged=True)
    target = pshared.subset_variant_regions(variant_regions, region, base_file, items)
    if isinstance(target, six.string_types) and os.path.isfile(target):
        return target
    else:
        return bamprep.region_to_gatk(target)
def subset_vcf(in_file, region, out_file, config):
    """Subset VCF in the given region, handling bgzip and indexing of input.
    """
    work_file = vcfutils.bgzip_and_index(in_file, config)
    if not file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            bcftools = config_utils.get_program("bcftools", config)
            region_str = bamprep.region_to_gatk(region)
            cmd = "{bcftools} view -r {region_str} {work_file} > {tx_out_file}"
            do.run(cmd.format(**locals()), "subset %s: %s" % (os.path.basename(work_file), region_str))
    return out_file
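# Hypothetical usage of subset_vcf, assuming a 0-based (chrom, start, end)
# tuple input; file names and config are placeholders:
#   subset_vcf("NA12878.vcf", ("chr1", 1000000, 2000000),
#              "NA12878-chr1sub.vcf", config)
# runs: bcftools view -r chr1:1000001-2000000 NA12878.vcf.gz > NA12878-chr1sub.vcf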
def _get_interval(variant_regions, region, out_file, items):
    """Retrieve interval to run analysis in. Handles no targets, BED and regions.
    """
    target = shared.subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            return "--interval %s" % target
        else:
            return "--interval %s" % bamprep.region_to_gatk(target)
    else:
        return ""
def _run_vardict_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Detect SNPs and indels with VarDict.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            num_bams = len(align_bams)
            sample_vcf_names = []  # for individual sample names, given batch calling may be required
            for bamfile, item in itertools.izip(align_bams, items):
                # prepare commands
                vardict = config_utils.get_program("vardict", config)
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts = " ".join(_vardict_options_from_config(items, config, out_file, region))
                vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config)
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require 50 worth of coverage
                var2vcf_opts = " -v 50 " if coverage_interval == "regional" else ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                sample = item["name"][1]
                cmd = ("{vardict} -G {ref_file} -f {freq} "
                       "-N {sample} -b {bamfile} {opts} "
                       "| {strandbias}"
                       "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                       "| {fix_ambig} | {vcfallelicprimitives} | {vcfstreamsort} {compress_cmd}")
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        cmd += " > {tx_tmp_file}"
                        do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
                else:
                    cmd += " > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(orig_files=sample_vcf_names,
                                             out_file=tx_out_file, ref_file=ref_file,
                                             config=config, region=bamprep.region_to_gatk(region))
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
def _run_combine_gvcfs(vrn_files, region, ref_file, out_file, data):
    if not utils.file_exists(out_file):
        broad_runner = broad.runner_from_config(data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            params = ["-T", "CombineGVCFs",
                      "-R", ref_file, "-o", tx_out_file,
                      "-L", bamprep.region_to_gatk(region)]
            for vrn_file in vrn_files:
                params += ["--variant", vrn_file]
            broad_runner.new_resources("gatk-haplotype")
            broad_runner.run_gatk(params)
    return out_file
def _get_interval(variant_regions, region, out_file, items):
    """Retrieve interval to run analysis in. Handles no targets, BED and regions.

    region can be a single region or list of multiple regions for multicore calling.
    """
    target = shared.subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, six.string_types) and os.path.isfile(target):
            return "--interval %s" % target
        else:
            return "--interval %s" % bamprep.region_to_gatk(target)
    else:
        return ""
def run_gvcftyper(vrn_files, out_file, region, data):
    """Produce joint called variants from input gVCF files.
    """
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            license = license_export(data)
            ref_file = dd.get_ref_file(data)
            input_files = " ".join(vrn_files)
            region = bamprep.region_to_gatk(region)
            cmd = ("{license}sentieon driver -r {ref_file} --interval {region} "
                   "--algo GVCFtyper {tx_out_file} {input_files}")
            do.run(cmd.format(**locals()), "Sentieon GVCFtyper")
    return out_file
def _run_combine_gvcfs(vrn_files, region, ref_file, out_file, data):
    if not utils.file_exists(out_file):
        broad_runner = broad.runner_from_config(data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            params = ["-T", "CombineGVCFs",
                      "-R", ref_file, "-o", tx_out_file,
                      "-L", bamprep.region_to_gatk(region)]
            for vrn_file in vrn_files:
                params += ["--variant", vrn_file]
            cores = dd.get_cores(data)
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            broad_runner.new_resources("gatk-haplotype")
            broad_runner.run_gatk(params, memscale=memscale)
    return out_file
def _run_wham_genotype(in_file, all_bams, coords, data):
    """Run genotyping on a prepped, merged VCF file.
    """
    out_file = "%s-wgts%s" % utils.splitext_plus(in_file)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            cores = dd.get_cores(data)
            ref_file = dd.get_ref_file(data)
            coord_str = bamprep.region_to_gatk(coords)
            cmd = ("WHAM-GRAPHENING -b {in_file} -x {cores} -a {ref_file} -f {all_bams} -r {coord_str} "
                   "> {tx_out_file}")
            do.run(cmd.format(**locals()), "Genotype WHAM: %s" % region.to_safestr(coords))
    return out_file
def _bed_to_platypusin(region, base_file):
    """Convert BED file regions into Platypus custom inputs.
    """
    if isinstance(region, basestring) and os.path.isfile(region):
        out_file = "%s-platypusregion.list" % utils.splitext_plus(base_file)[0]
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                with open(tx_out_file, "w") as out_handle:
                    for region in pybedtools.BedTool(region):
                        out_handle.write("%s:%s-%s\n" % (region.chrom, region.start, region.stop))
        return out_file
    else:
        return bamprep.region_to_gatk(region)
def combine_variant_files(orig_files, out_file, ref_file, config, quiet_out=True, region=None):
    """Combine VCF files from the same sample into a single output file.

    Handles cases where we split files into SNPs/Indels for processing then
    need to merge back into a final file.

    Will parallelize up to 4 cores based on documented recommendations:
    https://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_gatk_tools_walkers_variantutils_CombineVariants.php
    """
    in_pipeline = False
    if isinstance(orig_files, dict):
        file_key = config["file_key"]
        in_pipeline = True
        orig_files = orig_files[file_key]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            exist_files = [x for x in orig_files if os.path.exists(x)]
            ready_files = run_multicore(p_bgzip_and_index, [[x, config] for x in exist_files], config)
            params = ["-T", "CombineVariants",
                      "-R", ref_file,
                      "--out", tx_out_file]
            priority_order = []
            for i, ready_file in enumerate(ready_files):
                name = "v%s" % i
                params.extend(["--variant:{name}".format(name=name), ready_file])
                priority_order.append(name)
            params.extend(["--rod_priority_list", ",".join(priority_order)])
            params.extend(["--genotypemergeoption", "PRIORITIZE"])
            if quiet_out:
                params.extend(["--suppressCommandLineHeader", "--setKey", "null"])
            if region:
                variant_regions = config["algorithm"].get("variant_regions", None)
                cur_region = shared.subset_variant_regions(variant_regions, region, out_file)
                if cur_region:
                    params += ["-L", bamprep.region_to_gatk(cur_region), "--interval_set_rule", "INTERSECTION"]
            cores = tz.get_in(["algorithm", "num_cores"], config, 1)
            if cores > 1:
                params += ["-nt", str(min(cores, 4))]
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            jvm_opts = broad.get_gatk_framework_opts(config, memscale=memscale)
            cmd = [config_utils.get_program("gatk-framework", config)] + jvm_opts + params
            do.run(cmd, "Combine variant files")
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    if in_pipeline:
        return [{file_key: out_file, "region": region, "sam_ref": ref_file, "config": config}]
    else:
        return out_file
def merge_gvcfs(data, region, vrn_files, out_file):
    """Simple merging of gVCFs with gvcftools.

    merge_variants does not appear to work correctly, so we remove gVCF parts
    with extract_variants and then combine the merged samples together.

    Longer term we plan to replace this with agg (https://github.com/Illumina/agg)
    or GLnexus (https://github.com/dnanexus-rnd/GLnexus).
    """
    if not utils.file_exists(out_file):
        region = bamprep.region_to_gatk(region)
        vcfutils.merge_variant_files([_extract_variants_from_gvcf(f, region, out_file, data) for f in vrn_files],
                                     out_file, dd.get_ref_file(data), data["config"], region)
    return vcfutils.bgzip_and_index(out_file, data["config"])
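# merge_gvcfs relies on an _extract_variants_from_gvcf helper that is not shown
# here. A minimal sketch under the assumption that gvcftools' extract_variants
# reads a gVCF stream on stdin and drops non-variant blocks, with tabix
# supplying the per-region subset; the helper name and exact pipeline are
# assumptions for illustration:
def _extract_variants_from_gvcf_sketch(gvcf_file, region, base_out, data):
    out_file = "%s-%s.vcf.gz" % (utils.splitext_plus(base_out)[0],
                                 region.replace(":", "_").replace("-", "_"))
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            cmd = "tabix -h {gvcf_file} {region} | extract_variants | bgzip -c > {tx_out_file}"
            do.run(cmd.format(**locals()), "Extract variants from gVCF: %s" % region)
    return out_file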
def _SID_call_prep(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Preparation work for SomaticIndelDetector.
    """
    base_config = items[0]["config"]
    for x in align_bams:
        bam.index(x, base_config)
    params = ["-R", ref_file, "-T", "SomaticIndelDetector", "-U", "ALLOW_N_CIGAR_READS"]
    # Limit the per base read start count to between 200-10000, i.e. from any
    # base no more than 10000 new reads can begin.
    # Further, limit maxNumberOfReads accordingly, otherwise SID discards
    # windows for high coverage panels.
    paired = vcfutils.get_paired_bams(align_bams, items)
    params += ["--read_filter", "NotPrimaryAlignment"]
    params += ["-I:tumor", paired.tumor_bam]
    min_af = float(get_in(paired.tumor_config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
    if paired.normal_bam is not None:
        params += ["-I:normal", paired.normal_bam]
        # note there must be at least 4 reads of coverage in normal
        params += ["--filter_expressions", "T_COV<6||N_COV<4||T_INDEL_F<%s||T_INDEL_CF<0.7" % min_af]
    else:
        params += ["--unpaired"]
        params += ["--filter_expressions", "COV<6||INDEL_F<%s||INDEL_CF<0.7" % min_af]
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    return params
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file, num_cores=1):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    for x in align_bams:
        bam.index(x, config)
    if num_cores > 1 and broad_runner.gatk_type() == "gatk4":
        # dbSNP annotation is not currently supported with multiple cores
        dbsnp = None
        # GATK4 Spark runs use a 2bit reference index
        params = ["--reference", dd.get_ref_twobit(items[0])]
    else:
        picard_runner = broad.runner_from_path("picard", config)
        picard_runner.run_fn("picard_index_ref", ref_file)
        params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    params += standard_cl_params(items)
    return broad_runner, params
def _config_params(base_config, assoc_files, region, out_file):
    """Add parameters based on configuration variables, associated files and genomic regions.
    """
    params = []
    contamination = base_config["algorithm"].get("fraction_contamination", 0)
    params += ["--fraction_contamination", contamination]
    dbsnp = assoc_files["dbsnp"]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    cosmic = assoc_files.get("cosmic")
    if cosmic:
        params += ["--cosmic", cosmic]
    variant_regions = base_config["algorithm"].get("variant_regions")
    region = subset_variant_regions(variant_regions, region, out_file)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    return params
def _run_combine_gvcfs(vrn_files, region, ref_file, out_file, data):
    if not utils.file_exists(out_file):
        broad_runner = broad.runner_from_config(data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            params = ["-T", "CombineGVCFs",
                      "-R", ref_file, "-o", tx_out_file,
                      "-L", bamprep.region_to_gatk(region)]
            for vrn_file in vrn_files:
                params += ["--variant", vrn_file]
            cores = dd.get_cores(data)
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            broad_runner.new_resources("gatk-haplotype")
            broad_runner.run_gatk(params, memscale=memscale)
    return out_file
def _vardict_options_from_config(items, config, out_file, region=None, do_merge=False):
    opts = ["-c 1", "-S 2", "-E 3", "-g 4"]
    # ["-z", "-F", "-c", "1", "-S", "2", "-E", "3", "-g", "4", "-x", "0",
    #  "-k", "3", "-r", "4", "-m", "8"]
    resources = config_utils.get_resources("vardict", config)
    if resources.get("options"):
        opts += resources["options"]
    variant_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    target = subset_variant_regions(variant_regions, region, out_file, do_merge=do_merge)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            opts += [target]  # this must be the last option
        else:
            # one-based, end-inclusive coordinates as for GATK
            opts += ["-R", bamprep.region_to_gatk(target)]
    return opts
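# Hypothetical illustration of the two region branches above, assuming
# subset_variant_regions passes these targets through unchanged: a BED path is
# appended as the trailing positional argument, while a 0-based tuple becomes a
# one-based "-R chrom:start-end" option.
#   _vardict_options_from_config(items, config, out_file, "targets.bed")
#       -> ["-c 1", "-S 2", "-E 3", "-g 4", "targets.bed"]
#   _vardict_options_from_config(items, config, out_file, ("chr5", 999, 2000))
#       -> ["-c 1", "-S 2", "-E 3", "-g 4", "-R", "chr5:1000-2000"]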
def _run_wham_coords(inputs, background_bams, coords, final_file):
    """Run WHAM on a specific set of chromosome, start, end coordinates.
    """
    base, ext = os.path.splitext(final_file)
    out_file = "%s-%s%s" % (base, region.to_safestr(coords), ext)
    if not utils.file_exists(out_file):
        with file_transaction(inputs[0], out_file) as tx_out_file:
            cores = dd.get_cores(inputs[0])
            ref_file = dd.get_ref_file(inputs[0])
            all_bams = ",".join([x["align_bam"] for x in inputs] + background_bams)
            coord_str = bamprep.region_to_gatk(coords)
            opts = "-k -m 30"
            cmd = ("WHAM-GRAPHENING {opts} -x {cores} -a {ref_file} -f {all_bams} -r {coord_str} "
                   "> {tx_out_file}")
            do.run(cmd.format(**locals()), "Run WHAM: %s" % region.to_safestr(coords))
    return [[coords, out_file]]
def _run_wham_coords(inputs, background_bams, coords, final_file):
    """Run WHAM on a specific set of chromosome, start, end coordinates.
    """
    base, ext = utils.splitext_plus(final_file)
    raw_file = "%s-%s.vcf" % (base, region.to_safestr(coords))
    all_bams = ",".join([x["align_bam"] for x in inputs] + background_bams)
    if not utils.file_exists(raw_file):
        with file_transaction(inputs[0], raw_file) as tx_raw_file:
            cores = dd.get_cores(inputs[0])
            ref_file = dd.get_ref_file(inputs[0])
            coord_str = bamprep.region_to_gatk(coords)
            opts = "-k -m 30"
            cmd = ("WHAM-GRAPHENING {opts} -x {cores} -a {ref_file} -f {all_bams} -r {coord_str} "
                   "> {tx_raw_file}")
            do.run(cmd.format(**locals()), "Run WHAM: %s" % region.to_safestr(coords))
    merge_vcf = _run_wham_merge(raw_file, inputs[0])
    gt_vcf = _run_wham_genotype(merge_vcf, all_bams, coords, inputs[0])
    prep_vcf = vcfutils.sort_by_ref(gt_vcf, inputs[0])
    return [[coords, prep_vcf]]
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    config = items[0]["config"]
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    # GATK can only downsample to a minimum of 200
    coverage_depth_max = max(200, utils.get_in(config, ("algorithm", "coverage_depth_max"), 2000))
    coverage_depth_min = utils.get_in(config, ("algorithm", "coverage_depth_min"), 4)
    variant_regions = config["algorithm"].get("variant_regions", None)
    confidence = "4.0" if coverage_depth_min < 4 else "30.0"
    region = subset_variant_regions(variant_regions, region, out_file, items)
    params = ["-R", ref_file,
              "--standard_min_confidence_threshold_for_calling", confidence,
              "--standard_min_confidence_threshold_for_emitting", confidence,
              "--downsample_to_coverage", str(coverage_depth_max),
              "--downsampling_type", "BY_SAMPLE"]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    return broad_runner, params
def _run_genomicsdb_import(vrn_files, region, out_file, data):
    """Create a GenomicsDB reference for all the variation files: GATK4.

    Not yet tested at scale; need to explore --batchSize to reduce memory usage if needed.

    Does not support transactional directories yet, since GenomicsDB databases
    cannot be moved to new locations. We try to identify half-finished databases and restart:
    https://gatkforums.broadinstitute.org/gatk/discussion/10061/using-genomicsdbimport-to-prepare-gvcfs-for-input-to-genotypegvcfs-in-gatk4

    Known issue -- GenomicsDB workspace path core dumps on longer paths:
    (std::string::compare(char const*))
    """
    out_dir = "%s_genomicsdb" % utils.splitext_plus(out_file)[0]
    if not os.path.exists(out_dir) or _incomplete_genomicsdb(out_dir):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        with utils.chdir(os.path.dirname(out_file)):
            with file_transaction(data, out_dir) as tx_out_dir:
                broad_runner = broad.runner_from_config(data["config"])
                cores = dd.get_cores(data)
                params = ["-T", "GenomicsDBImport",
                          "--reader-threads", str(cores),
                          "--genomicsdb-workspace-path", os.path.relpath(out_dir, os.getcwd()),
                          "-L", bamprep.region_to_gatk(region)]
                for vrn_file in vrn_files:
                    vcfutils.bgzip_and_index(vrn_file, data["config"])
                samplemap = _create_samplemap_file(vrn_files)
                params += ["--sample-name-map", samplemap]
                # For large inputs, reduce memory usage by batching
                # https://github.com/bcbio/bcbio-nextgen/issues/2852
                if len(vrn_files) > 200:
                    params += ["--batch-size", "50"]
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
                broad_runner.run_gatk(params, memscale=memscale)
    return out_dir
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, cosmic, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_path("picard", config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence,
                   "--standard_min_confidence_threshold_for_emitting", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        bam.index(x, config)
    paired = vcfutils.get_paired_bams(align_bams, items)
    if not paired:
        raise ValueError("Specified MuTect calling but 'tumor' phenotype not present in batch\n"
                         "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
                         "pipelines.html#cancer-variant-calling\n"
                         "for samples: %s" % ", ".join([dd.get_sample_name(x) for x in items]))
    params += ["-I:tumor", paired.tumor_bam]
    if paired.normal_bam is not None:
        params += ["-I:normal", paired.normal_bam]
    if paired.normal_panel is not None:
        params += ["--normal_panel", paired.normal_panel]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    if cosmic:
        params += ["--cosmic", cosmic]
    variant_regions = tz.get_in(["algorithm", "variant_regions"], config)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    broad_runner = broad.runner_from_config(config)
    return broad_runner, params
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    params = ["-R", ref_file]
    if dd.is_set_coverage_depth_max(data):
        coverage_depth_max = dd.get_coverage_depth_max(data)
        # GATK can only downsample to a minimum of 200
        coverage_depth_max = max([200, coverage_depth_max])
        params += ["--downsample_to_coverage", str(coverage_depth_max),
                   "--downsampling_type", "BY_SAMPLE"]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence,
                   "--standard_min_confidence_threshold_for_emitting", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    variant_regions = tz.get_in(["algorithm", "variant_regions"], config)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    return broad_runner, params
def run_gvcfgenotyper(data, orig_region, vrn_files, out_file):
    """Merge strelka2 and Illumina compatible gVCFs with gvcfgenotyper.

    https://github.com/Illumina/gvcfgenotyper

    Also need to explore GLnexus (https://github.com/dnanexus-rnd/GLnexus)
    """
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            regions = _find_gvcf_blocks(vrn_files[0], bamprep.region_to_gatk(orig_region),
                                        os.path.dirname(tx_out_file))
            if len(regions) == 1:
                _run_gvcfgenotyper(data, regions[0], vrn_files, tx_out_file)
            else:
                split_outs = [_run_gvcfgenotyper(data, r, vrn_files,
                                                 "%s-%s.vcf.gz" % (utils.splitext_plus(out_file)[0],
                                                                   r.replace(":", "_").replace("-", "_")))
                              for r in regions]
                vcfutils.concat_variant_files(split_outs, tx_out_file, regions,
                                              dd.get_ref_file(data), data["config"])
    return vcfutils.bgzip_and_index(out_file, data["config"])
def prep_mpileup(align_bams, ref_file, config, max_read_depth=None, target_regions=None, want_bcf=True):
    cl = [config_utils.get_program("samtools", config), "mpileup",
          "-f", ref_file]
    if max_read_depth:
        cl += ["-d", str(max_read_depth), "-L", str(max_read_depth)]
    if want_bcf:
        cl += ["-t", "DP", "-u", "-g"]
    if target_regions:
        str_regions = bamprep.region_to_gatk(target_regions)
        if os.path.isfile(str_regions):
            cl += ["-l", str_regions]
        else:
            cl += ["-r", str_regions]
    cl += align_bams
    return " ".join(cl)
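# Hypothetical example of the command string this version produces, assuming
# samtools resolves to plain "samtools" and a 0-based region tuple; paths are
# placeholders:
#   prep_mpileup(["NA12878.bam"], "GRCh37.fa", config, max_read_depth=1000,
#                target_regions=("chr1", 999, 2000))
#       -> "samtools mpileup -f GRCh37.fa -d 1000 -L 1000 -t DP -u -g -r chr1:1000-2000 NA12878.bam"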
def _run_genotype_gvcfs_genomicsdb(genomics_db, region, out_file, data):
    """GenotypeGVCFs from a merged GenomicsDB input: GATK4.

    No core scaling -- not yet supported in GATK4.
    """
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            broad_runner = broad.runner_from_config(data["config"])
            params = ["-T", "GenotypeGVCFs",
                      "--variant", "gendb://%s" % genomics_db,
                      "-R", dd.get_ref_file(data),
                      "--output", tx_out_file,
                      "-L", bamprep.region_to_gatk(region)]
            cores = dd.get_cores(data)
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            broad_runner.run_gatk(params, memscale=memscale)
    return vcfutils.bgzip_and_index(out_file, data["config"])
def combine_variant_files(orig_files, out_file, ref_file, config, quiet_out=True, region=None):
    """Combine VCF files from the same sample into a single output file.

    Handles cases where we split files into SNPs/Indels for processing then
    need to merge back into a final file.
    """
    in_pipeline = False
    if isinstance(orig_files, dict):
        file_key = config["file_key"]
        in_pipeline = True
        orig_files = orig_files[file_key]
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            ready_files = run_multicore(p_bgzip_and_index, [[x, config] for x in orig_files], config)
            params = ["-T", "CombineVariants",
                      "-R", ref_file,
                      "--out", tx_out_file]
            priority_order = []
            for i, ready_file in enumerate(ready_files):
                name = "v%s" % i
                params.extend(["--variant:{name}".format(name=name), ready_file])
                priority_order.append(name)
            params.extend(["--rod_priority_list", ",".join(priority_order)])
            if quiet_out:
                params.extend(["--suppressCommandLineHeader", "--setKey", "null"])
            variant_regions = config["algorithm"].get("variant_regions", None)
            cur_region = shared.subset_variant_regions(variant_regions, region, out_file)
            if cur_region:
                params += ["-L", bamprep.region_to_gatk(cur_region), "--interval_set_rule", "INTERSECTION"]
            jvm_opts = broad.get_gatk_framework_opts(config)
            cmd = [config_utils.get_program("gatk-framework", config)] + jvm_opts + params
            do.run(cmd, "Combine variant files")
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    if in_pipeline:
        return [{file_key: out_file, "region": region, "sam_ref": ref_file, "config": config}]
    else:
        return out_file
def _shared_gatk_call_prep(align_bams, ref_file, config, dbsnp, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    coverage_depth = config["algorithm"].get("coverage_depth", "high").lower()
    variant_regions = config["algorithm"].get("variant_regions", None)
    confidence = "4.0" if coverage_depth in ["low"] else "30.0"
    region = subset_variant_regions(variant_regions, region, out_file)
    params = ["-R", ref_file,
              "--standard_min_confidence_threshold_for_calling", confidence,
              "--standard_min_confidence_threshold_for_emitting", confidence,
              "--downsample_to_coverage", "250",
              "--downsampling_type", "BY_SAMPLE"]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    return broad_runner, params
def _run_genotype_gvcfs_genomicsdb(genomics_db, region, out_file, data):
    """GenotypeGVCFs from a merged GenomicsDB input: GATK4.

    No core scaling -- not yet supported in GATK4.
    """
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            broad_runner = broad.runner_from_config(data["config"])
            params = ["-T", "GenotypeGVCFs",
                      "--variant", "gendb://%s" % genomics_db,
                      "-R", dd.get_ref_file(data),
                      "--output", tx_out_file,
                      "-L", bamprep.region_to_gatk(region)]
            params += ["-ploidy", str(ploidy.get_ploidy([data], region))]
            # Avoid slow genotyping runtimes with improved quality score calculation in GATK4
            # https://gatkforums.broadinstitute.org/gatk/discussion/11471/performance-troubleshooting-tips-for-genotypegvcfs/p1
            resources = config_utils.get_resources("gatk", data["config"])
            params += [str(x) for x in resources.get("options", [])]
            cores = dd.get_cores(data)
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            broad_runner.run_gatk(params, memscale=memscale)
    return vcfutils.bgzip_and_index(out_file, data["config"])
def _config_params(base_config, assoc_files, region, out_file, items):
    """Add parameters based on configuration variables, associated files and genomic regions.
    """
    params = []
    dbsnp = assoc_files.get("dbsnp")
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    cosmic = assoc_files.get("cosmic")
    if cosmic:
        params += ["--cosmic", cosmic]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    # set low frequency calling parameter if adjusted
    # to set other MuTect parameters on contamination, pass options to resources for mutect
    # --fraction_contamination --minimum_normal_allele_fraction
    min_af = tz.get_in(["algorithm", "min_allele_fraction"], base_config)
    if min_af:
        params += ["--minimum_mutation_cell_fraction", "%.2f" % (min_af / 100.0)]
    resources = config_utils.get_resources("mutect", base_config)
    if resources.get("options") is not None:
        params += [str(x) for x in resources.get("options", [])]
    # Output quality scores
    if "--enable_qscore_output" not in params:
        params.append("--enable_qscore_output")
    # drf is not currently supported in MuTect to turn off the duplicate read filter
    # params += gatk.standard_cl_params(items)
    return params
def _run_genotype_gvcfs(data, region, vrn_files, ref_file, out_file):
    if not utils.file_exists(out_file):
        broad_runner = broad.runner_from_config(data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            assoc_files = tz.get_in(("genome_resources", "variation"), data, {})
            if not assoc_files:
                assoc_files = {}
            params = ["-T", "GenotypeGVCFs",
                      "-R", ref_file, "-o", tx_out_file,
                      "-L", bamprep.region_to_gatk(region)]
            for vrn_file in vrn_files:
                params += ["--variant", vrn_file]
            if assoc_files.get("dbsnp"):
                params += ["--dbsnp", assoc_files["dbsnp"]]
            broad_runner.new_resources("gatk-haplotype")
            cores = dd.get_cores(data)
            if cores > 1:
                params += ["-nt", str(cores)]
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
            else:
                memscale = None
            broad_runner.run_gatk(params, memscale=memscale)
    return out_file
def _run_genomicsdb_import(vrn_files, region, out_file, data):
    """Create a GenomicsDB reference for all the variation files: GATK4.

    Not yet tested at scale; need to explore --batchSize to reduce memory usage if needed.
    """
    out_dir = "%s_genomicsdb" % utils.splitext_plus(out_file)[0]
    if not os.path.exists(out_dir):
        with file_transaction(data, out_dir) as tx_out_dir:
            broad_runner = broad.runner_from_config(data["config"])
            cores = dd.get_cores(data)
            params = ["-T", "GenomicsDBImport",
                      "--readerThreads", str(cores),
                      "--genomicsDBWorkspace", tx_out_dir,
                      "-L", bamprep.region_to_gatk(region)]
            for vrn_file in vrn_files:
                params += ["--variant", vrn_file]
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            broad_runner.run_gatk(params, memscale=memscale)
    return out_dir
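# Note the camelCase flags here (--readerThreads, --genomicsDBWorkspace) come
# from an early GATK4 iteration of this function; other revisions in this
# collection use the released kebab-case forms (--reader-threads,
# --genomicsdb-workspace-path).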
def combine_variant_files(orig_files, out_file, ref_file, config, quiet_out=True, region=None):
    """Combine VCF files from the same sample into a single output file.

    Handles cases where we split files into SNPs/Indels for processing then
    need to merge back into a final file.

    Will parallelize up to 4 cores based on documented recommendations:
    https://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_gatk_tools_walkers_variantutils_CombineVariants.php
    """
    in_pipeline = False
    if isinstance(orig_files, dict):
        file_key = config["file_key"]
        in_pipeline = True
        orig_files = orig_files[file_key]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            exist_files = [x for x in orig_files if os.path.exists(x)]
            ready_files = run_multicore(p_bgzip_and_index, [[x, config] for x in exist_files], config)
            params = ["-T", "CombineVariants",
                      "-R", ref_file,
                      "--out", tx_out_file]
            priority_order = []
            for i, ready_file in enumerate(ready_files):
                name = "v%s" % i
                params.extend(["--variant:{name}".format(name=name), ready_file])
                priority_order.append(name)
            params.extend(["--rod_priority_list", ",".join(priority_order)])
            params.extend(["--genotypemergeoption", "PRIORITIZE"])
            if quiet_out:
                params.extend(["--suppressCommandLineHeader", "--setKey", "null"])
            if region:
                variant_regions = config["algorithm"].get("variant_regions", None)
                cur_region = shared.subset_variant_regions(variant_regions, region, out_file)
                if cur_region:
                    params += ["-L", bamprep.region_to_gatk(cur_region), "--interval_set_rule", "INTERSECTION"]
            cores = tz.get_in(["algorithm", "num_cores"], config, 1)
            if cores > 1:
                params += ["-nt", str(min(cores, 4))]
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            jvm_opts = broad.get_gatk_framework_opts(config, os.path.dirname(tx_out_file), memscale=memscale)
            do.run(broad.gatk_cmd("gatk-framework", jvm_opts, params), "Combine variant files")
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    if in_pipeline:
        return [{file_key: out_file, "region": region, "sam_ref": ref_file, "config": config}]
    else:
        return out_file
def _run_vardict_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Detect SNPs and indels with VarDict.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            vrs = bedutils.population_variant_regions(items)
            target = shared.subset_variant_regions(vrs, region, out_file, do_merge=False)
            num_bams = len(align_bams)
            sample_vcf_names = []  # for individual sample names, given batch calling may be required
            for bamfile, item in itertools.izip(align_bams, items):
                # prepare commands
                sample = dd.get_sample_name(item)
                vardict = get_vardict_command(items[0])
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts = (" ".join(_vardict_options_from_config(items, config, out_file, target))
                        if _is_bed_file(target) else "")
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require 50 worth of coverage
                var2vcf_opts = " -v 50 " if highdepth.get_median_coverage(items[0]) > 5000 else ""
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(utils.Rscript_cmd())
                cmd = ("{r_setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                       "-N {sample} -b {bamfile} {opts} "
                       "| {strandbias}"
                       "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                       "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}")
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        if not _is_bed_file(target):
                            vcfutils.write_empty_vcf(tx_tmp_file, config, samples=[sample])
                        else:
                            cmd += " > {tx_tmp_file}"
                            do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
                else:
                    if not _is_bed_file(target):
                        vcfutils.write_empty_vcf(tx_out_file, config, samples=[sample])
                    else:
                        cmd += " > {tx_out_file}"
                        do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(orig_files=sample_vcf_names,
                                             out_file=tx_out_file, ref_file=ref_file,
                                             config=config, region=bamprep.region_to_gatk(region))
    out_file = (annotation.add_dbsnp(out_file, assoc_files["dbsnp"], config)
                if assoc_files.get("dbsnp") else out_file)
    return out_file
def _mutect_call_prep(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Preparation work for MuTect.
    """
    # FIXME: We assume all other bits in the config are shared
    base_config = items[0]["config"]
    dbsnp = assoc_files["dbsnp"]
    cosmic = assoc_files.get("cosmic")
    broad_runner = broad.runner_from_config(base_config, "mutect")
    mutect_version = broad_runner.get_mutect_version()
    try:
        assert mutect_version is not None
    except AssertionError:
        logger.warn("WARNING")
        logger.warn("MuTect version could not be determined from jar file. "
                    "Please ensure you are using at least version 1.1.5, "
                    "as versions 1.1.4 and lower have known issues.")
        logger.warn("Proceeding but assuming correct version 1.1.5.")
    else:
        try:
            assert LooseVersion(mutect_version) >= LooseVersion("1.1.5")
        except AssertionError:
            message = ("MuTect 1.1.4 and lower is known to have incompatibilities "
                       "with Java < 7, and this may lead to problems in analyses. "
                       "Please use MuTect 1.1.5 or higher (note that it requires "
                       "Java 7).")
            raise ValueError(message)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, base_config)
    variant_regions = base_config["algorithm"].get("variant_regions", None)
    contamination = base_config["algorithm"].get("fraction_contamination", 0)
    region = subset_variant_regions(variant_regions, region, out_file)
    # FIXME: Add more parameters like fraction contamination etc
    params = ["-R", ref_file, "-T", "MuTect"]
    params += ["--dbsnp", dbsnp]
    tumor_bam = None
    normal_bam = None
    for bamfile, item in itertools.izip(align_bams, items):
        metadata = item["metadata"]
        if metadata["phenotype"] == "normal":
            normal_bam = bamfile
            normal_sample_name = item["name"][1]
        elif metadata["phenotype"] == "tumor":
            tumor_bam = bamfile
            tumor_sample_name = item["name"][1]
    if tumor_bam is None or normal_bam is None:
        raise ValueError("Missing phenotype definition (tumor or normal) in samples")
    params += ["-I:normal", normal_bam]
    params += ["-I:tumor", tumor_bam]
    params += ["--tumor_sample_name", tumor_sample_name]
    params += ["--normal_sample_name", normal_sample_name]
    params += ["--fraction_contamination", contamination]
    if cosmic is not None:
        params += ["--cosmic", cosmic]
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    return broad_runner, params
def _run_vardict_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Detect SNPs and indels with VarDict. var2vcf_valid uses -A flag which reports all alleles and improves sensitivity: https://github.com/AstraZeneca-NGS/VarDict/issues/35#issuecomment-276738191 """ config = items[0]["config"] if out_file is None: out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0] if not utils.file_exists(out_file): with file_transaction(items[0], out_file) as tx_out_file: vrs = bedutils.population_variant_regions(items) target = shared.subset_variant_regions(vrs, region, out_file, items=items, do_merge=False) num_bams = len(align_bams) sample_vcf_names = [ ] # for individual sample names, given batch calling may be required for bamfile, item in zip(align_bams, items): # prepare commands sample = dd.get_sample_name(item) vardict = get_vardict_command(items[0]) strandbias = "teststrandbias.R" var2vcf = "var2vcf_valid.pl" opts, var2vcf_opts = _vardict_options_from_config( items, config, out_file, target) vcfstreamsort = config_utils.get_program( "vcfstreamsort", config) compress_cmd = "| bgzip -c" if tx_out_file.endswith( "gz") else "" freq = float( utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0 fix_ambig_ref = vcfutils.fix_ambiguous_cl() fix_ambig_alt = vcfutils.fix_ambiguous_cl(5) remove_dup = vcfutils.remove_dup_cl() py_cl = os.path.join(utils.get_bcbio_bin(), "py") jvm_opts = _get_jvm_opts(items[0], tx_out_file) setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports()) contig_cl = vcfutils.add_contig_to_header_cl( ref_file, tx_out_file) cmd = ( "{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} " "-N {sample} -b {bamfile} {opts} " "| {strandbias}" "| {var2vcf} -A -N {sample} -E -f {freq} {var2vcf_opts} " "| {contig_cl} | bcftools filter -i 'QUAL >= 0' " "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}" ) if num_bams > 1: temp_file_prefix = out_file.replace(".gz", "").replace( ".vcf", "") + item["name"][1] tmp_out = temp_file_prefix + ".temp.vcf" tmp_out += ".gz" if out_file.endswith("gz") else "" sample_vcf_names.append(tmp_out) with file_transaction(item, tmp_out) as tx_tmp_file: if not _is_bed_file(target): vcfutils.write_empty_vcf(tx_tmp_file, config, samples=[sample]) else: cmd += " > {tx_tmp_file}" do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {}) else: if not _is_bed_file(target): vcfutils.write_empty_vcf(tx_out_file, config, samples=[sample]) else: cmd += " > {tx_out_file}" do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {}) if num_bams > 1: # N.B. merge_variant_files wants region in 1-based end-inclusive # coordinates. Thus use bamprep.region_to_gatk vcfutils.merge_variant_files( orig_files=sample_vcf_names, out_file=tx_out_file, ref_file=ref_file, config=config, region=bamprep.region_to_gatk(region)) return out_file
def _run_vardict_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Detect SNPs and indels with VarDict. """ config = items[0]["config"] if out_file is None: out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0] if not utils.file_exists(out_file): with file_transaction(items[0], out_file) as tx_out_file: for align_bam in align_bams: bam.index(align_bam, config) num_bams = len(align_bams) sample_vcf_names = [ ] # for individual sample names, given batch calling may be required for bamfile, item in itertools.izip(align_bams, items): # prepare commands vardict = config_utils.get_program("vardict", config) strandbias = "teststrandbias.R" var2vcf = "var2vcf_valid.pl" opts = " ".join( _vardict_options_from_config(items, config, out_file, region)) vcfallelicprimitives = config_utils.get_program( "vcfallelicprimitives", config) vcfstreamsort = config_utils.get_program( "vcfstreamsort", config) compress_cmd = "| bgzip -c" if out_file.endswith("gz") else "" freq = float( utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0 coverage_interval = utils.get_in( config, ("algorithm", "coverage_interval"), "exome") # for deep targeted panels, require 50 worth of coverage var2vcf_opts = " -v 50 " if coverage_interval == "regional" else "" fix_ambig = vcfutils.fix_ambiguous_cl() sample = item["name"][1] cmd = ( "{vardict} -G {ref_file} -f {freq} " "-N {sample} -b {bamfile} {opts} " "| {strandbias}" "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} " "| {fix_ambig} | {vcfallelicprimitives} | {vcfstreamsort} {compress_cmd}" ) if num_bams > 1: temp_file_prefix = out_file.replace(".gz", "").replace( ".vcf", "") + item["name"][1] tmp_out = temp_file_prefix + ".temp.vcf" tmp_out += ".gz" if out_file.endswith("gz") else "" sample_vcf_names.append(tmp_out) with file_transaction(item, tmp_out) as tx_tmp_file: cmd += " > {tx_tmp_file}" do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {}) else: cmd += " > {tx_out_file}" do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {}) if num_bams > 1: # N.B. merge_variant_files wants region in 1-based end-inclusive # coordinates. Thus use bamprep.region_to_gatk vcfutils.merge_variant_files( orig_files=sample_vcf_names, out_file=tx_out_file, ref_file=ref_file, config=config, region=bamprep.region_to_gatk(region)) ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.get("dbsnp"), ref_file, config) return ann_file
def _mutect_call_prep(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """ Preparation work for MuTect. """ #FIXME: We assume all other bits in the config are shared base_config = items[0]["config"] dbsnp = assoc_files["dbsnp"] cosmic = assoc_files.get("cosmic") broad_runner = broad.runner_from_config(base_config, "mutect") broad_runner.run_fn("picard_index_ref", ref_file) for x in align_bams: broad_runner.run_fn("picard_index", x) variant_regions = base_config["algorithm"].get("variant_regions", None) contamination = base_config["algorithm"].get("fraction_contamination", 0) region = subset_variant_regions(variant_regions, region, out_file) #FIXME: Add more parameters like fraction contamination etc params = ["-R", ref_file, "-T", "MuTect"] params += ["--dbsnp", dbsnp] tumor_bam = None normal_bam = None for bamfile, item in itertools.izip(align_bams, items): metadata = item["metadata"] if metadata["phenotype"] == "normal": normal_bam = bamfile normal_sample_name = item["name"][1] elif metadata["phenotype"] == "tumor": tumor_bam = bamfile tumor_sample_name = item["name"][1] if tumor_bam is None or normal_bam is None: raise ValueError("Missing phenotype definition (tumor or normal) " "in samples") params += ["-I:normal", normal_bam] params += ["-I:tumor", tumor_bam] params += ["--tumor_sample_name", tumor_sample_name] params += ["--normal_sample_name", normal_sample_name] params += ["--fraction_contamination", contamination] if cosmic is not None: params += ["--cosmic", cosmic] if region: params += [ "-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION" ] return broad_runner, params