def _get_resource_programs(fn, algs):
    """Retrieve programs used in analysis based on algorithm configurations.

    Helps avoid requiring core information from unused programs. Yields only
    those programs from ``fn``'s resource metadata that this run actually uses.
    """
    # Base programs every run relies on.
    # XXX Need to expose this in a top-level way to allow more multiprocessing
    used_progs = {"gatk", "gemini", "bcbio_coverage", "samtools", "snpEff"}
    for alg in algs:
        # aligner is a single name; variantcaller may be one name or a list
        if alg.get("aligner"):
            used_progs.add(alg["aligner"])
        callers = alg.get("variantcaller")
        if callers:
            if isinstance(callers, (list, tuple)):
                used_progs.update(callers)
            else:
                used_progs.add(callers)
    if config_utils.use_vqsr(algs):
        used_progs.add("gatk-vqsr")
    requested = fn.metadata.get("resources", []) if hasattr(fn, "metadata") else []
    for prog in requested:
        if prog in used_progs:
            yield prog
def _variant_filtration(in_file, ref_file, vrn_files, data, filter_type, hard_filter_fn):
    """Filter SNP and indel variant calls using GATK best practice recommendations.

    Hard filter if configuration indicates too little data or already finished a
    hard filtering, otherwise try VQSR.
    """
    # Algorithms multiplied by number of input files to check for large enough sample sizes
    algs = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))
    if (not config_utils.use_vqsr(algs) or _already_hard_filtered(in_file, filter_type)):
        # Fixed typo in log message: "filers" -> "filters"
        logger.info("Skipping VQSR, using hard filters: we don't have whole genome input data")
        return hard_filter_fn(in_file, data)
    elif not _have_training_data(vrn_files):
        logger.info("Skipping VQSR, using hard filters: genome build does not have sufficient training data")
        return hard_filter_fn(in_file, data)
    else:
        sensitivities = {"INDEL": "98.0", "SNP": "99.97"}
        recal_file, tranches_file = _run_vqsr(in_file, ref_file, vrn_files,
                                              sensitivities[filter_type], filter_type, data)
        if recal_file is None:  # VQSR failed
            logger.info("VQSR failed due to lack of training data. Using hard filtering.")
            return hard_filter_fn(in_file, data)
        else:
            return _apply_vqsr(in_file, ref_file, recal_file, tranches_file,
                               sensitivities[filter_type], filter_type, data)
def _variant_filtration_indel(snp_file, ref_file, vrn_files, config):
    """Filter indel variant calls using GATK best practice recommendations.

    Uses JEXL hard filters when VQSR is not configured; otherwise runs VQSR and
    falls back to regional hard filtering if recalibration fails.
    """
    broad_runner = broad.runner_from_config(config)
    filter_type = "INDEL"
    variantcaller = config["algorithm"].get("variantcaller", "gatk")
    if not config_utils.use_vqsr([config["algorithm"]]):
        return vfilter.jexl_hard(broad_runner, snp_file, ref_file, filter_type,
                                 ["QD < 2.0", "ReadPosRankSum < -20.0", "FS > 200.0"])
    else:
        # also check if we've failed recal and needed to do strict filtering
        filter_file = "{base}-filter{ext}.vcf".format(base=os.path.splitext(snp_file)[0],
                                                      ext=filter_type)
        if file_exists(filter_file):
            config["algorithm"]["coverage_interval"] = "regional"
            return _variant_filtration_indel(snp_file, ref_file, vrn_files, config)
        assert "train_indels" in vrn_files, "Need indel training file specified"
        params, recal_file, tranches_file = _shared_variant_filtration(
            filter_type, snp_file, ref_file, vrn_files, variantcaller)
        if not file_exists(recal_file):
            with file_transaction(recal_file, tranches_file) as (tx_recal, tx_tranches):
                params.extend(["--recal_file", tx_recal, "--tranches_file", tx_tranches])
                if LooseVersion(broad_runner.get_gatk_version()) >= LooseVersion("2.7"):
                    params.extend(["--numBadVariants", "3000"])
                try:
                    broad_runner.new_resources("gatk-vqsr")
                    broad_runner.run_gatk(params, log_error=False)
                # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
                # still propagate; GATK failure here triggers the hard-filter fallback.
                except Exception:
                    logger.info("VQSR failed due to lack of training data. Using hard filtering.")
                    config["algorithm"]["coverage_interval"] = "regional"
                    return _variant_filtration_indel(snp_file, ref_file, vrn_files, config)
        return _apply_variant_recal(broad_runner, snp_file, ref_file,
                                    recal_file, tranches_file, filter_type)
def run(call_file, ref_file, vrn_files, data):
    """Run filtering on the input call file, handling SNPs and indels separately.
    """
    algs = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))
    if includes_missingalt(data):
        logger.info("Removing variants with missing alts from %s." % call_file)
        call_file = gatk_remove_missingalt(call_file, data)

    if "gatkcnn" in dd.get_tools_on(data):
        return _cnn_filter(call_file, vrn_files, data)
    if not config_utils.use_vqsr(algs, call_file):
        # Cutoff-based soft filtering: SNPs first, then indels on the result.
        snp_filtered = vfilter.gatk_snp_cutoff(call_file, data)
        return vfilter.gatk_indel_cutoff(snp_filtered, data)
    if vcfutils.is_gvcf_file(call_file):
        raise ValueError("Cannot force gVCF output with joint calling using tools_on: [gvcf] and use VQSR. "
                         "Try using cutoff-based soft filtering with tools_off: [vqsr]")
    # VQSR recalibrates SNPs and indels separately, then recombines the outputs.
    snp_file, indel_file = vcfutils.split_snps_indels(call_file, ref_file, data["config"])
    filtered = [_variant_filtration(snp_file, ref_file, vrn_files, data,
                                    "SNP", vfilter.gatk_snp_cutoff),
                _variant_filtration(indel_file, ref_file, vrn_files, data,
                                    "INDEL", vfilter.gatk_indel_cutoff)]
    out_file = "%scombined.vcf.gz" % os.path.commonprefix(filtered)
    return vcfutils.combine_variant_files(filtered, out_file, ref_file, data["config"])
def _get_resource_programs(fn, algs):
    """Retrieve programs used in analysis based on algorithm configurations.

    Helps avoid requiring core information from unused programs.
    """
    # Programs we unconditionally rely on in every analysis.
    # XXX Need to expose this in a top-level way to allow more multiprocessing
    progs = set(["gatk", "gemini", "bcbio_coverage", "samtools", "snpEff"])
    for alg in algs:
        aligner = alg.get("aligner")
        if aligner:
            progs.add(aligner)
        vcallers = alg.get("variantcaller")
        if vcallers:
            # variantcaller can be configured as a single name or a collection
            progs.update(vcallers if isinstance(vcallers, (list, tuple)) else [vcallers])
    if config_utils.use_vqsr(algs):
        progs.add("gatk-vqsr")
    declared = fn.metadata.get("resources", []) if hasattr(fn, "metadata") else []
    for prog in declared:
        if prog in progs:
            yield prog
def _get_resource_programs(progs, algs): """Retrieve programs used in analysis based on algorithm configurations. Handles special cases like aligners and variant callers. """ out = set([]) for p in progs: if p == "aligner": for alg in algs: aligner = alg.get("aligner") if aligner: out.add(aligner) elif p == "variantcaller": for alg in algs: vc = alg.get("variantcaller") if vc: if isinstance(vc, (list, tuple)): for x in vc: out.add(x) else: out.add(vc) elif p == "gatk-vqsr": if config_utils.use_vqsr(algs): out.add("gatk-vqsr") else: out.add(p) return sorted(list(out))
def run(call_file, ref_file, vrn_files, data):
    """Run filtering on the input call file, handling SNPs and indels separately.

    For VQSR, need to split the file to apply. For hard filters can run on the
    original filter, filtering by bcftools type.
    """
    algs = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))
    if not config_utils.use_vqsr(algs):
        # Hard filters run directly on the combined file: SNPs, then indels.
        hard_filtered = vfilter.gatk_indel_hard(vfilter.gatk_snp_hard(call_file, data), data)
        if "gvcf" in dd.get_tools_on(data):
            return hard_filtered
        return _filter_nonref(hard_filtered, data)
    assert "gvcf" not in dd.get_tools_on(data), \
        ("Cannot force gVCF output and use VQSR. Try using hard filtering with tools_off: [vqsr]")
    snp_file, indel_file = vcfutils.split_snps_indels(call_file, ref_file, data["config"])
    recalibrated = [_variant_filtration(snp_file, ref_file, vrn_files, data,
                                        "SNP", vfilter.gatk_snp_hard),
                    _variant_filtration(indel_file, ref_file, vrn_files, data,
                                        "INDEL", vfilter.gatk_indel_hard)]
    out_file = "%scombined.vcf.gz" % os.path.commonprefix(recalibrated)
    combined_file = vcfutils.combine_variant_files(recalibrated, out_file,
                                                   ref_file, data["config"])
    return _filter_nonref(combined_file, data)
def _variant_filtration(in_file, ref_file, vrn_files, data, filter_type, hard_filter_fn):
    """Filter SNP and indel variant calls using GATK best practice recommendations.

    Hard filter if configuration indicates too little data or already finished a
    hard filtering, otherwise try VQSR.
    """
    human = tz.get_in(["genome_resources", "aliases", "human"], data)
    # Algorithms multiplied by number of input files to check for large enough sample sizes
    algs = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))
    if (not config_utils.use_vqsr(algs)
            or _already_hard_filtered(in_file, filter_type) or not human):
        return hard_filter_fn(in_file, data)
    sensitivities = {"INDEL": "98.0", "SNP": "99.97"}
    recal_file, tranches_file = _run_vqsr(in_file, ref_file, vrn_files,
                                          sensitivities[filter_type], filter_type, data)
    if recal_file is None:  # VQSR failed
        logger.info("VQSR failed due to lack of training data. Using hard filtering.")
        return hard_filter_fn(in_file, data)
    return _apply_vqsr(in_file, ref_file, recal_file, tranches_file,
                       sensitivities[filter_type], filter_type, data)
def run(call_file, ref_file, vrn_files, data):
    """Run filtering on the input call file, handling SNPs and indels separately.
    """
    algs = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))
    if not config_utils.use_vqsr(algs):
        # Cutoff-based soft filtering: SNPs first, then indels on the result.
        snp_filtered = vfilter.gatk_snp_cutoff(call_file, data)
        return vfilter.gatk_indel_cutoff(snp_filtered, data)
    # Forced gVCF output without a joint caller is incompatible with VQSR.
    if "gvcf" in dd.get_tools_on(data) and not dd.get_jointcaller(data):
        raise ValueError(
            "Cannot force gVCF output with joint calling using tools_on: [gvcf] and use VQSR. "
            "Try using cutoff-based soft filtering with tools_off: [vqsr]")
    snp_file, indel_file = vcfutils.split_snps_indels(call_file, ref_file, data["config"])
    filtered = [_variant_filtration(snp_file, ref_file, vrn_files, data,
                                    "SNP", vfilter.gatk_snp_cutoff),
                _variant_filtration(indel_file, ref_file, vrn_files, data,
                                    "INDEL", vfilter.gatk_indel_cutoff)]
    out_file = "%scombined.vcf.gz" % os.path.commonprefix(filtered)
    return vcfutils.combine_variant_files(filtered, out_file, ref_file, data["config"])
def run(call_file, ref_file, vrn_files, data):
    """Run filtering on the input call file, handling SNPs and indels separately.

    For VQSR, need to split the file to apply. For hard filters can run on the
    original filter, filtering by bcftools type.
    """
    algs = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))
    if config_utils.use_vqsr(algs):
        assert "gvcf" not in dd.get_tools_on(data), \
            ("Cannot force gVCF output and use VQSR. Try using hard filtering with tools_off: [vqsr]")
        snp_file, indel_file = vcfutils.split_snps_indels(call_file, ref_file, data["config"])
        # Recalibrate each variant class separately, then merge the results.
        filtered = []
        for vrn_file, vtype, filter_fn in [(snp_file, "SNP", vfilter.gatk_snp_hard),
                                           (indel_file, "INDEL", vfilter.gatk_indel_hard)]:
            filtered.append(_variant_filtration(vrn_file, ref_file, vrn_files,
                                                data, vtype, filter_fn))
        out_file = "%scombined.vcf.gz" % os.path.commonprefix(filtered)
        combined = vcfutils.combine_variant_files(filtered, out_file, ref_file, data["config"])
        return _filter_nonref(combined, data)
    else:
        hard_filtered = vfilter.gatk_indel_hard(vfilter.gatk_snp_hard(call_file, data), data)
        if "gvcf" not in dd.get_tools_on(data):
            return _filter_nonref(hard_filtered, data)
        return hard_filtered
def _variant_filtration(in_file, ref_file, vrn_files, data, filter_type, hard_filter_fn):
    """Filter SNP and indel variant calls using GATK best practice recommendations.

    Use cutoff-based soft filters if configuration indicates too little data or
    already finished a cutoff-based filtering step, otherwise try VQSR.
    """
    # Algorithms multiplied by number of input files to check for large enough sample sizes
    algs = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))
    if (not config_utils.use_vqsr(algs) or _already_cutoff_filtered(in_file, filter_type)):
        # Fixed typo in log message: "filers" -> "filters"
        logger.info("Skipping VQSR, using cutoff-based filters: we don't have whole genome input data")
        return hard_filter_fn(in_file, data)
    elif not _have_training_data(vrn_files):
        logger.info("Skipping VQSR, using cutoff-based filters: genome build does not have sufficient training data")
        return hard_filter_fn(in_file, data)
    else:
        sensitivities = {"INDEL": "98.0", "SNP": "99.97"}
        recal_file, tranches_file = _run_vqsr(in_file, ref_file, vrn_files,
                                              sensitivities[filter_type], filter_type, data)
        if recal_file is None:  # VQSR failed
            logger.info("VQSR failed due to lack of training data. Using cutoff-based soft filtering.")
            return hard_filter_fn(in_file, data)
        else:
            return _apply_vqsr(in_file, ref_file, recal_file, tranches_file,
                               sensitivities[filter_type], filter_type, data)
def _variant_filtration(in_file, ref_file, vrn_files, data, filter_type, hard_filter_fn):
    """Filter SNP and indel variant calls using GATK best practice recommendations.
    """
    # hard filter if configuration indicates too little data or already finished a hard filtering
    if (not config_utils.use_vqsr([data["config"]["algorithm"]])
            or _already_hard_filtered(in_file, filter_type)):
        return hard_filter_fn(in_file, data)
    recal_file, tranches_file = _run_vqsr(in_file, ref_file, vrn_files, filter_type, data)
    if recal_file is None:  # VQSR failed
        logger.info("VQSR failed due to lack of training data. Using hard filtering.")
        return hard_filter_fn(in_file, data)
    return _apply_vqsr(in_file, ref_file, recal_file, tranches_file, filter_type, data)
def _variant_filtration_snp(snp_file, ref_file, vrn_files, data):
    """Filter SNP variant calls using GATK best practice recommendations.

    Applies hard expression filters when VQSR is not usable; otherwise runs
    VQSR, falling back to regional hard filtering if recalibration fails.
    """
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    filter_type = "SNP"
    variantcaller = config["algorithm"].get("variantcaller", "gatk")
    filters = ["QD < 2.0", "MQ < 40.0", "FS > 60.0",
               "MQRankSum < -12.5", "ReadPosRankSum < -8.0"]
    # GATK Haplotype caller (v2.2) appears to have much larger HaplotypeScores
    # resulting in excessive filtering, so avoid this metric
    if variantcaller not in ["gatk-haplotype"]:
        filters.append("HaplotypeScore > 13.0")
    if not config_utils.use_vqsr([config["algorithm"]]):
        return vfilter.hard_w_expression(snp_file, " || ".join(filters), data, filter_type)
    else:
        # also check if we've failed recal and needed to do strict filtering
        filter_file = "{base}-filter{ext}.vcf".format(base=os.path.splitext(snp_file)[0],
                                                      ext=filter_type)
        if file_exists(filter_file):
            config["algorithm"]["coverage_interval"] = "regional"
            return _variant_filtration_snp(snp_file, ref_file, vrn_files, data)
        assert "train_hapmap" in vrn_files and "train_1000g_omni" in vrn_files, \
            "Need HapMap and 1000 genomes training files"
        params, recal_file, tranches_file = _shared_variant_filtration(
            filter_type, snp_file, ref_file, vrn_files, variantcaller)
        if not file_exists(recal_file):
            with file_transaction(recal_file, tranches_file) as (tx_recal, tx_tranches):
                params.extend(["--recal_file", tx_recal, "--tranches_file", tx_tranches])
                try:
                    broad_runner.new_resources("gatk-vqsr")
                    broad_runner.run_gatk(params, log_error=False)
                # Can fail to run if not enough values are present to train. Rerun with regional
                # filtration approach instead. Narrowed from a bare `except:` so
                # KeyboardInterrupt/SystemExit still propagate.
                except Exception:
                    logger.info("VQSR failed due to lack of training data. Using hard filtering.")
                    config["algorithm"]["coverage_interval"] = "regional"
                    return _variant_filtration_snp(snp_file, ref_file, vrn_files, data)
        return _apply_variant_recal(broad_runner, snp_file, ref_file,
                                    recal_file, tranches_file, filter_type)
def _get_used_programs(fn, algs):
    """Collect the set of program names used across algorithm configurations."""
    # Programs we always depend on, regardless of configuration.
    used_progs = {"gatk", "gemini", "bcbio_coverage", "samtools", "snpEff",
                  "cufflinks", "picard", "rnaseqc", "cutadapt"}
    for alg in algs:
        # get aligners used
        if alg.get("aligner"):
            used_progs.add(alg["aligner"])
        callers = alg.get("variantcaller")
        if callers:
            if isinstance(callers, (list, tuple)):
                used_progs.update(callers)
            else:
                used_progs.add(callers)
    if config_utils.use_vqsr(algs):
        used_progs.add("gatk-vqsr")
    return used_progs
def _variant_filtration_indel(snp_file, ref_file, vrn_files, data):
    """Filter indel variant calls using GATK best practice recommendations.

    Applies hard expression filters when VQSR is not usable; otherwise runs
    VQSR, falling back to regional hard filtering if recalibration fails.
    """
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    filter_type = "INDEL"
    variantcaller = config["algorithm"].get("variantcaller", "gatk")
    if not config_utils.use_vqsr([config["algorithm"]]):
        filterexp = " || ".join(["QD < 2.0", "ReadPosRankSum < -20.0", "FS > 200.0"])
        return vfilter.hard_w_expression(snp_file, filterexp, data, filter_type)
    else:
        # also check if we've failed recal and needed to do strict filtering
        filter_file = "{base}-filter{ext}.vcf".format(base=os.path.splitext(snp_file)[0],
                                                      ext=filter_type)
        if file_exists(filter_file):
            config["algorithm"]["coverage_interval"] = "regional"
            return _variant_filtration_indel(snp_file, ref_file, vrn_files, data)
        assert "train_indels" in vrn_files, "Need indel training file specified"
        params, recal_file, tranches_file = _shared_variant_filtration(
            filter_type, snp_file, ref_file, vrn_files, variantcaller)
        if not file_exists(recal_file):
            with file_transaction(recal_file, tranches_file) as (tx_recal, tx_tranches):
                params.extend(["--recal_file", tx_recal, "--tranches_file", tx_tranches])
                if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("2.7"):
                    params.extend(["--numBadVariants", "3000"])
                try:
                    broad_runner.new_resources("gatk-vqsr")
                    broad_runner.run_gatk(params, log_error=False)
                # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
                # still propagate; GATK failure here triggers the hard-filter fallback.
                except Exception:
                    logger.info("VQSR failed due to lack of training data. Using hard filtering.")
                    config["algorithm"]["coverage_interval"] = "regional"
                    return _variant_filtration_indel(snp_file, ref_file, vrn_files, data)
        return _apply_variant_recal(broad_runner, snp_file, ref_file,
                                    recal_file, tranches_file, filter_type)
def _variant_filtration(in_file, ref_file, vrn_files, data, filter_type, hard_filter_fn):
    """Filter SNP and indel variant calls using GATK best practice recommendations.
    """
    # hard filter if configuration indicates too little data or already finished a hard filtering
    human = tz.get_in(["genome_resources", "aliases", "human"], data)
    skip_vqsr = (not config_utils.use_vqsr([data["config"]["algorithm"]])
                 or _already_hard_filtered(in_file, filter_type)
                 or not human)
    if skip_vqsr:
        return hard_filter_fn(in_file, data)
    sensitivities = {"INDEL": "98.0", "SNP": "99.97"}
    recal_file, tranches_file = _run_vqsr(in_file, ref_file, vrn_files,
                                          sensitivities[filter_type], filter_type, data)
    if recal_file is None:  # VQSR failed
        logger.info("VQSR failed due to lack of training data. Using hard filtering.")
        return hard_filter_fn(in_file, data)
    return _apply_vqsr(in_file, ref_file, recal_file, tranches_file,
                       sensitivities[filter_type], filter_type, data)
def _variant_filtration_indel(snp_file, ref_file, vrn_files, config):
    """Filter indel variant calls using GATK best practice recommendations.
    """
    broad_runner = broad.runner_from_config(config)
    filter_type = "INDEL"
    variantcaller = config["algorithm"].get("variantcaller", "gatk")
    params, recal_file, tranches_file = _shared_variant_filtration(
        filter_type, snp_file, ref_file, vrn_files, variantcaller)
    if not config_utils.use_vqsr([config["algorithm"]]):
        hard_filters = ["QD < 2.0", "ReadPosRankSum < -20.0", "FS > 200.0"]
        return variant_filtration_with_exp(broad_runner, snp_file, ref_file,
                                           filter_type, hard_filters)
    if not file_exists(recal_file):
        with file_transaction(recal_file, tranches_file) as (tx_recal, tx_tranches):
            params.extend(["--recal_file", tx_recal, "--tranches_file", tx_tranches])
            broad_runner.new_resources("gatk-vqsr")
            broad_runner.run_gatk(params)
    return _apply_variant_recal(broad_runner, snp_file, ref_file,
                                recal_file, tranches_file, filter_type)
def _variant_filtration_snp(snp_file, ref_file, vrn_files, data):
    """Filter SNP variant calls using GATK best practice recommendations.

    Applies hard expression filters when VQSR is not usable; otherwise runs
    VQSR, falling back to regional hard filtering if recalibration fails.
    """
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    filter_type = "SNP"
    variantcaller = config["algorithm"].get("variantcaller", "gatk")
    filters = ["QD < 2.0", "MQ < 40.0", "FS > 60.0",
               "MQRankSum < -12.5", "ReadPosRankSum < -8.0"]
    # GATK Haplotype caller (v2.2) appears to have much larger HaplotypeScores
    # resulting in excessive filtering, so avoid this metric
    if variantcaller not in ["gatk-haplotype"]:
        filters.append("HaplotypeScore > 13.0")
    if not config_utils.use_vqsr([config["algorithm"]]):
        return vfilter.hard_w_expression(snp_file, " || ".join(filters), data, filter_type)
    else:
        # also check if we've failed recal and needed to do strict filtering
        filter_file = "{base}-filter{ext}.vcf".format(base=os.path.splitext(snp_file)[0],
                                                      ext=filter_type)
        if file_exists(filter_file):
            config["algorithm"]["coverage_interval"] = "regional"
            return _variant_filtration_snp(snp_file, ref_file, vrn_files, data)
        assert "train_hapmap" in vrn_files and "train_1000g_omni" in vrn_files, \
            "Need HapMap and 1000 genomes training files"
        params, recal_file, tranches_file = _shared_variant_filtration(
            filter_type, snp_file, ref_file, vrn_files, variantcaller)
        if not file_exists(recal_file):
            with file_transaction(recal_file, tranches_file) as (tx_recal, tx_tranches):
                params.extend(["--recal_file", tx_recal, "--tranches_file", tx_tranches])
                try:
                    broad_runner.new_resources("gatk-vqsr")
                    broad_runner.run_gatk(params, log_error=False)
                # Can fail to run if not enough values are present to train. Rerun with regional
                # filtration approach instead. Narrowed from a bare `except:` so
                # KeyboardInterrupt/SystemExit still propagate.
                except Exception:
                    logger.info("VQSR failed due to lack of training data. Using hard filtering.")
                    config["algorithm"]["coverage_interval"] = "regional"
                    return _variant_filtration_snp(snp_file, ref_file, vrn_files, data)
        return _apply_variant_recal(broad_runner, snp_file, ref_file,
                                    recal_file, tranches_file, filter_type)
def _variant_filtration_indel(snp_file, ref_file, vrn_files, config):
    """Filter indel variant calls using GATK best practice recommendations.
    """
    broad_runner = broad.runner_from_config(config)
    filter_type = "INDEL"
    variantcaller = config["algorithm"].get("variantcaller", "gatk")
    params, recal_file, tranches_file = _shared_variant_filtration(
        filter_type, snp_file, ref_file, vrn_files, variantcaller)
    if not config_utils.use_vqsr([config["algorithm"]]):
        return variant_filtration_with_exp(
            broad_runner, snp_file, ref_file, filter_type,
            ["QD < 2.0", "ReadPosRankSum < -20.0", "FS > 200.0"])
    if not file_exists(recal_file):
        with file_transaction(recal_file, tranches_file) as (tx_recal, tx_tranches):
            params.extend(["--recal_file", tx_recal, "--tranches_file", tx_tranches])
            # --numBadVariants is only passed to GATK 2.7 or newer
            if LooseVersion(broad_runner.get_gatk_version()) >= LooseVersion("2.7"):
                params.extend(["--numBadVariants", "3000"])
            broad_runner.new_resources("gatk-vqsr")
            broad_runner.run_gatk(params)
    return _apply_variant_recal(broad_runner, snp_file, ref_file,
                                recal_file, tranches_file, filter_type)
def _variant_filtration(in_file, ref_file, vrn_files, data, filter_type, hard_filter_fn):
    """Filter SNP and indel variant calls using GATK best practice recommendations.

    Hard filter if configuration indicates too little data or already finished a
    hard filtering, otherwise try VQSR.
    """
    human = tz.get_in(["genome_resources", "aliases", "human"], data)
    # Algorithms multiplied by number of input files to check for large enough sample sizes
    algs = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))
    if (not config_utils.use_vqsr(algs)
            or _already_hard_filtered(in_file, filter_type)
            or not human):
        return hard_filter_fn(in_file, data)
    else:
        sensitivities = {"INDEL": "98.0", "SNP": "99.97"}
        recal_file, tranches_file = _run_vqsr(in_file, ref_file, vrn_files,
                                              sensitivities[filter_type],
                                              filter_type, data)
        if recal_file is None:  # VQSR failed
            logger.info("VQSR failed due to lack of training data. Using hard filtering.")
            return hard_filter_fn(in_file, data)
        return _apply_vqsr(in_file, ref_file, recal_file, tranches_file,
                           sensitivities[filter_type], filter_type, data)