def genotype_filter(vcf_file, expression, data, name, filterext=""):
    """Perform genotype based filtering using GATK with the provided expression.

    Adds FT tags to genotypes, rather than the general FILTER flag.

    Args:
        vcf_file: Path of the VCF to filter.
        expression: Genotype filter expression; it is wrapped in single
            quotes before being placed on the command line.
        data: Sample dictionary providing ``config`` and the reference
            under ``reference -> fasta -> base``.
        name: Value passed as ``--genotypeFilterName``.
        filterext: Optional text inserted after ``-filter`` in the output
            file name.

    Returns:
        The output path ``<base>-filter<filterext><ext>``. For ``.vcf.gz``
        outputs this is whatever ``vcfutils.bgzip_and_index`` returns.
    """
    base, ext = utils.splitext_plus(vcf_file)
    out_file = "{base}-filter{filterext}{ext}".format(base=base, filterext=filterext, ext=ext)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            params = ["-T", "VariantFiltration",
                      "-R", tz.get_in(["reference", "fasta", "base"], data),
                      "--variant", vcf_file,
                      "--out", tx_out_file,
                      "--genotypeFilterName", name,
                      "--genotypeFilterExpression", "'%s'" % expression]
            # Pass the transaction directory as the second argument, matching
            # the other gatk-framework wrappers in this file (presumably used
            # as the JVM temporary directory -- confirm against broad module).
            jvm_opts = broad.get_gatk_framework_opts(data["config"], os.path.dirname(tx_out_file))
            do.run(broad.gatk_cmd("gatk-framework", jvm_opts, params),
                   "Filter with expression: %s" % expression)
    if out_file.endswith(".vcf.gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
def gatk_filter_rnaseq(vrn_file, data):
    """Filter RNA-seq variant calls with GATK4 VariantFiltration.

    Applies the filters listed at
    https://software.broadinstitute.org/gatk/guide/article?id=3891:
    clusters of 3 variants within a 35 nucleotide window, Fisher strand
    values above 30.0 (FS) and quality by depth below 2.0 (QD).

    The GATK3 command line these options correspond to is:

    java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta -V
    input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0"
    -filterName QD -filter "QD < 2.0" -o output.vcf

    Returns the output path, ``<base>-filter<ext>`` derived from
    ``vrn_file``. An existing output is returned without running GATK.
    """
    out_file = "%s-filter%s" % utils.splitext_plus(vrn_file)
    if file_exists(out_file):
        return out_file

    ref_file = dd.get_ref_file(data)
    with file_transaction(data, out_file) as tx_out_file:
        input_args = ["VariantFiltration", "-R", ref_file, "-V", vrn_file]
        cluster_args = ["--cluster-window-size", "35", "--cluster-size", "3"]
        fs_args = ["--filter-expression", "'FS > 30.0'", "--filter-name", "FS"]
        qd_args = ["--filter-expression", "'QD < 2.0'", "--filter-name", "QD"]
        params = input_args + cluster_args + fs_args + qd_args + ["--output", tx_out_file]

        # Use GATK4 for filtering, tools_off is for variant calling.
        # Work on a copy so the caller's configuration is left untouched.
        gatk4_config = utils.deepish_copy(dd.get_config(data))
        if "gatk4" in dd.get_tools_off({"config": gatk4_config}):
            gatk4_config["algorithm"]["tools_off"].remove("gatk4")

        jvm_opts = broad.get_gatk_opts(gatk4_config, os.path.dirname(tx_out_file))
        cmd = broad.gatk_cmd("gatk", jvm_opts, params, gatk4_config)
        do.run(cmd, "Filter RNA-seq variants.")
    return out_file
def gatk_filter_rnaseq(vrn_file, data):
    """Run GATK4 VariantFiltration with the RNA-seq hard filters.

    Filters follow
    https://software.broadinstitute.org/gatk/guide/article?id=3891:
    variant clusters (3 within a 35 nucleotide window), high Fisher strand
    values (FS > 30.0) and low quality by depth (QD < 2.0).

    Reference GATK3 command:

    java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta -V
    input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0"
    -filterName QD -filter "QD < 2.0" -o output.vcf

    Returns the path of the filtered file, named by inserting ``-filter``
    before the extension of ``vrn_file``.
    """
    out_file = "%s-filter%s" % utils.splitext_plus(vrn_file)
    if not file_exists(out_file):
        ref_file = dd.get_ref_file(data)
        with file_transaction(data, out_file) as tx_out_file:
            params = ["VariantFiltration", "-R", ref_file, "-V", vrn_file,
                      "--cluster-window-size", "35", "--cluster-size", "3"]
            # Each hard filter contributes an expression followed by its name.
            for filter_name, filter_expr in (("FS", "'FS > 30.0'"), ("QD", "'QD < 2.0'")):
                params += ["--filter-expression", filter_expr, "--filter-name", filter_name]
            params += ["--output", tx_out_file]

            # Use GATK4 for filtering, tools_off is for variant calling
            filter_config = utils.deepish_copy(dd.get_config(data))
            disabled_tools = dd.get_tools_off({"config": filter_config})
            if "gatk4" in disabled_tools:
                filter_config["algorithm"]["tools_off"].remove("gatk4")

            tx_dir = os.path.dirname(tx_out_file)
            jvm_opts = broad.get_gatk_opts(filter_config, tx_dir)
            do.run(broad.gatk_cmd("gatk", jvm_opts, params, filter_config),
                   "Filter RNA-seq variants.")
    return out_file
def gatk_filter_rnaseq(vrn_file, data):
    """Filter RNA-seq variant calls with GATK3 (gatk-framework) VariantFiltration.

    Uses the filters described at
    https://software.broadinstitute.org/gatk/guide/article?id=3891:
    clusters of 3 variants in a 35 nucleotide window, Fisher strand values
    above 30.0 and quality by depth below 2.0. Equivalent to:

    java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta -V
    input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0"
    -filterName QD -filter "QD < 2.0" -o output.vcf

    Returns the output path (``-filter`` inserted before the extension of
    ``vrn_file``); GATK only runs when that file does not exist yet.
    """
    out_file = "%s-filter%s" % utils.splitext_plus(vrn_file)
    if file_exists(out_file):
        return out_file

    ref_file = dd.get_ref_file(data)
    with file_transaction(data, out_file) as tx_out_file:
        params = ["-T", "VariantFiltration", "-R", ref_file, "-V", vrn_file]
        params.extend(["--clusterWindowSize", "35", "--clusterSize", "3"])
        # Expressions carry both double and single quotes in the argument text.
        params.extend(["--filterExpression", "\"'FS > 30.0'\"", "--filterName", "FS"])
        params.extend(["--filterExpression", "\"'QD < 2.0'\"", "--filterName", "QD"])
        params.extend(["-o", tx_out_file])
        tx_dir = os.path.dirname(tx_out_file)
        jvm_opts = broad.get_gatk_framework_opts(dd.get_config(data), tx_dir)
        cmd = broad.gatk_cmd("gatk-framework", jvm_opts, params)
        do.run(cmd, "Filter RNA-seq variants.")
    return out_file
def concat_variant_files_catvariants(orig_files, out_file, regions, ref_file, config):
    """Concatenate region-split variant files into one output with GATK CatVariants.

    CatVariants is a lightweight way to join VCF files that were split by
    region and share the same sample information, so no complex merging is
    needed. Handles both plain text and bgzipped/tabix indexed outputs.

    When CatVariants fails with one of the recognised stringency errors,
    the partial output is removed and the result of
    ``_run_concat_variant_files_bcftools`` is returned instead. Any other
    failure is re-raised.

    Returns ``out_file`` (or the bcftools fallback result).
    """
    recoverable_errors = ("We require all VCFs to have complete VCF headers",
                          "Features added out of order",
                          "The reference allele cannot be missing")
    if not utils.file_exists(out_file):
        input_file_list = _get_file_list(orig_files, out_file, regions, ref_file, config)
        use_bcftools = False
        with file_transaction(config, out_file) as tx_out_file:
            params = ["org.broadinstitute.gatk.tools.CatVariants"]
            params += ["-R", ref_file]
            params += ["-V", input_file_list]
            params += ["-out", tx_out_file]
            params += ["-assumeSorted"]
            jvm_opts = broad.get_gatk_framework_opts(config, os.path.dirname(tx_out_file),
                                                     include_gatk=False)
            cmd = broad.gatk_cmd("gatk-framework", jvm_opts, params)
            try:
                do.run(cmd, "Concat variant files", log_error=False)
            except subprocess.CalledProcessError as msg:
                if not any(text in str(msg) for text in recoverable_errors):
                    raise
                os.remove(tx_out_file)
                use_bcftools = True
        if use_bcftools:
            return _run_concat_variant_files_bcftools(input_file_list, out_file, config)
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    return out_file
def gatk_filter_rnaseq(data, vrn_file, out_file):
    """Filter RNA-seq variant calls with GATK3 (gatk-framework) VariantFiltration.

    Incorporates the filters listed at
    https://software.broadinstitute.org/gatk/guide/article?id=3891:
    clusters of 3 variants within a 35 nucleotide window, high Fisher
    strand values (FS > 30.0) and low quality by depth (QD < 2.0).
    Equivalent to:

    java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta -V
    input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0"
    -filterName QD -filter "QD < 2.0" -o output.vcf

    Args:
        data: Sample dictionary; supplies the configuration and reference.
        vrn_file: Input variant file.
        out_file: Path to write the filtered variants to.

    Returns:
        ``out_file``. An existing output is returned without running GATK.
    """
    if file_exists(out_file):
        return out_file
    ref_file = dd.get_ref_file(data)
    # Hand ``data`` to file_transaction like the other wrappers in this file
    # so the transaction is set up from the sample configuration.
    with file_transaction(data, out_file) as tx_out_file:
        params = ["-T", "VariantFiltration",
                  "-R", ref_file,
                  "-V", vrn_file,
                  "--clusterWindowSize", "35",
                  "--clusterSize", "3",
                  "--filterExpression", "\"'FS > 30.0'\"",
                  "--filterName", "FS",
                  "--filterExpression", "\"'QD < 2.0'\"",
                  "--filterName", "QD",
                  "-o", tx_out_file]
        jvm_opts = broad.get_gatk_framework_opts(dd.get_config(data), os.path.dirname(tx_out_file))
        do.run(broad.gatk_cmd("gatk-framework", jvm_opts, params), "Filter variants.")
    return out_file
def combine_variant_files(orig_files, out_file, ref_file, config, quiet_out=True, region=None):
    """Combine VCF files from the same sample into a single output file.

    Handles cases where we split files into SNPs/Indels for processing then
    need to merge back into a final file.

    Will parallelize up to 4 cores based on documented recommendations:
    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/
    org_broadinstitute_gatk_tools_walkers_variantutils_CombineVariants.php

    Args:
        orig_files: List of input files, or a dictionary holding that list
            under the key named by ``config["file_key"]`` (pipeline mode).
        out_file: Path of the combined output.
        ref_file: Reference passed to GATK with ``-R``.
        config: Configuration dictionary.
        quiet_out: When true, add ``--suppressCommandLineHeader`` and
            ``--setKey null``.
        region: Optional region; when it yields a subset of the configured
            ``variant_regions``, the run is restricted to it.

    Returns:
        ``out_file``, or in pipeline mode a single-item list with a
        dictionary carrying the output, region, reference and config.
    """
    in_pipeline = False
    if isinstance(orig_files, dict):
        file_key = config["file_key"]
        in_pipeline = True
        orig_files = orig_files[file_key]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            exist_files = [x for x in orig_files if os.path.exists(x)]
            ready_files = run_multicore(p_bgzip_and_index, [[x, config] for x in exist_files], config)
            params = ["-T", "CombineVariants",
                      "-R", ref_file,
                      "--out", tx_out_file]
            # Tag every input (v0, v1, ...) so the priority list can name them
            # in the order they were supplied.
            priority_order = []
            for i, ready_file in enumerate(ready_files):
                name = "v%s" % i
                params.extend(["--variant:{name}".format(name=name), ready_file])
                priority_order.append(name)
            params.extend(["--rod_priority_list", ",".join(priority_order)])
            params.extend(["--genotypemergeoption", "PRIORITIZE"])
            if quiet_out:
                params.extend(["--suppressCommandLineHeader", "--setKey", "null"])
            if region:
                variant_regions = config["algorithm"].get("variant_regions", None)
                cur_region = shared.subset_variant_regions(variant_regions, region, out_file)
                if cur_region:
                    params += ["-L", bamprep.region_to_gatk(cur_region),
                               "--interval_set_rule", "INTERSECTION"]
            cores = tz.get_in(["algorithm", "num_cores"], config, 1)
            if cores > 1:
                # Keep every command line token a string, like the rest of
                # ``params``, so the list can be joined without conversion.
                params += ["-nt", str(min(cores, 4))]
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            jvm_opts = broad.get_gatk_framework_opts(config, os.path.dirname(tx_out_file),
                                                     memscale=memscale, parallel_gc=True)
            do.run(broad.gatk_cmd("gatk-framework", jvm_opts, params), "Combine variant files")
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    if in_pipeline:
        return [{file_key: out_file, "region": region, "sam_ref": ref_file, "config": config}]
    else:
        return out_file
def concat_variant_files(orig_files, out_file, regions, ref_file, config):
    """Concatenate multiple variant files from regions into a single output file.

    Lightweight approach to merging VCF files split by regions with the same
    sample information, so no complex merging needed. Handles both plain
    text and bgzipped/tabix indexed outputs.

    Inputs are ordered with ``_sort_by_region``; files that exist and have
    variants are used, or every existing file when none has variants. The
    chosen paths are written to ``<out base>-files.list`` for CatVariants.

    Falls back to bcftools concat if GATK fails due to stringency issues.
    """
    if not utils.file_exists(out_file):
        sorted_files = _sort_by_region(orig_files, regions, ref_file, config)
        exist_files = [x for x in sorted_files if os.path.exists(x) and vcf_has_variants(x)]
        if not exist_files:
            # no non-empty inputs, merge the empty ones
            exist_files = [x for x in sorted_files if os.path.exists(x)]
        bgzip_args = [[x, config] for x in exist_files]
        ready_files = run_multicore(p_bgzip_and_index, bgzip_args, config)
        input_file_list = "%s-files.list" % utils.splitext_plus(out_file)[0]
        with open(input_file_list, "w") as out_handle:
            out_handle.writelines(fname + "\n" for fname in ready_files)

        failed = False
        with file_transaction(config, out_file) as tx_out_file:
            tx_dir = os.path.dirname(tx_out_file)
            params = ["org.broadinstitute.gatk.tools.CatVariants",
                      "-R", ref_file,
                      "-V", input_file_list,
                      "-out", tx_out_file,
                      "-assumeSorted"]
            jvm_opts = broad.get_gatk_framework_opts(config, tx_dir, include_gatk=False)
            try:
                do.run(broad.gatk_cmd("gatk-framework", jvm_opts, params),
                       "Concat variant files", log_error=False)
            except subprocess.CalledProcessError as msg:
                error_text = str(msg)
                missing_headers = "We require all VCFs to have complete VCF headers" in error_text
                out_of_order = "Features added out of order" in error_text
                missing_ref = "The reference allele cannot be missing" in error_text
                if missing_headers or out_of_order or missing_ref:
                    os.remove(tx_out_file)
                    failed = True
                else:
                    raise
        if failed:
            return _run_concat_variant_files_bcftools(input_file_list, out_file, config)
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    return out_file
def combine_variant_files(orig_files, out_file, ref_file, config, quiet_out=True, region=None):
    """Combine VCF files from the same sample into a single output file.

    Handles cases where we split files into SNPs/Indels for processing then
    need to merge back into a final file.

    Will parallelize up to 4 cores based on documented recommendations:
    https://www.broadinstitute.org/gatk/gatkdocs/
    org_broadinstitute_gatk_tools_walkers_variantutils_CombineVariants.php

    Args:
        orig_files: List of input files, or a dictionary holding that list
            under the key named by ``config["file_key"]`` (pipeline mode).
        out_file: Path of the combined output.
        ref_file: Reference passed to GATK with ``-R``.
        config: Configuration dictionary.
        quiet_out: When true, add ``--suppressCommandLineHeader`` and
            ``--setKey null``.
        region: Optional region; when it yields a subset of the configured
            ``variant_regions``, the run is restricted to it.

    Returns:
        ``out_file``, or in pipeline mode a single-item list with a
        dictionary carrying the output, region, reference and config.
    """
    in_pipeline = False
    if isinstance(orig_files, dict):
        file_key = config["file_key"]
        in_pipeline = True
        orig_files = orig_files[file_key]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            exist_files = [x for x in orig_files if os.path.exists(x)]
            ready_files = run_multicore(p_bgzip_and_index, [[x, config] for x in exist_files], config)
            params = ["-T", "CombineVariants",
                      "-R", ref_file,
                      "--out", tx_out_file]
            # Label inputs v0, v1, ... and list the labels in input order as
            # the merge priority.
            priority_order = []
            for i, ready_file in enumerate(ready_files):
                name = "v%s" % i
                params.extend(["--variant:{name}".format(name=name), ready_file])
                priority_order.append(name)
            params.extend(["--rod_priority_list", ",".join(priority_order)])
            params.extend(["--genotypemergeoption", "PRIORITIZE"])
            if quiet_out:
                params.extend(["--suppressCommandLineHeader", "--setKey", "null"])
            if region:
                variant_regions = config["algorithm"].get("variant_regions", None)
                cur_region = shared.subset_variant_regions(variant_regions, region, out_file)
                if cur_region:
                    params += ["-L", bamprep.region_to_gatk(cur_region),
                               "--interval_set_rule", "INTERSECTION"]
            cores = tz.get_in(["algorithm", "num_cores"], config, 1)
            if cores > 1:
                # Command line tokens are strings everywhere else in ``params``;
                # convert the thread count so joining the list cannot fail.
                params += ["-nt", str(min(cores, 4))]
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            # Pass the transaction directory like the other gatk-framework
            # wrappers in this file (presumably the JVM temporary directory --
            # confirm against the broad module).
            jvm_opts = broad.get_gatk_framework_opts(config, os.path.dirname(tx_out_file),
                                                     memscale=memscale)
            do.run(broad.gatk_cmd("gatk-framework", jvm_opts, params), "Combine variant files")
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    if in_pipeline:
        return [{file_key: out_file, "region": region, "sam_ref": ref_file, "config": config}]
    else:
        return out_file
def _gatk_extract_reads_cl(data, region, prep_params, tmp_dir):
    """Use GATK to extract reads from full BAM file.

    Builds a gatk-framework PrintReads command line restricted to
    ``region``, reading ``data["work_bam"]`` against the sample reference.

    Args:
        data: Sample dictionary with ``work_bam`` and ``config``.
        region: Region to extract, converted with ``region_to_gatk``.
        prep_params: Unused here; kept so the signature stays unchanged.
        tmp_dir: Directory handed to ``broad.get_gatk_framework_opts``.

    Returns:
        The value produced by ``broad.gatk_cmd`` for the PrintReads call.
    """
    # The former ``requires_gatkfull`` flag was a constant False, so the
    # full-GATK runner branch could never execute and has been dropped.
    args = ["-T", "PrintReads",
            "-L", region_to_gatk(region),
            "-R", dd.get_ref_file(data),
            "-I", data["work_bam"]]
    jvm_opts = broad.get_gatk_framework_opts(data["config"], tmp_dir)
    return broad.gatk_cmd("gatk-framework", jvm_opts, args)
def _gatk_extract_reads_cl(data, region, prep_params, tmp_dir):
    """Use GATK to extract reads from full BAM file, recalibrating if configured.

    Args:
        data: Sample dictionary with ``sam_ref``, ``work_bam``, ``config``
            and optionally ``prep_recal``.
        region: Region to extract, converted with ``region_to_gatk``.
        prep_params: Preparation settings; ``prep_params["recal"]`` selects
            the recalibration method.
        tmp_dir: Temporary directory passed on to the GATK invocation.

    Returns:
        The PrintReads command line: from the full GATK runner when a
        recalibration file with reads is applied via ``-BQSR``, otherwise
        from ``broad.gatk_cmd`` for gatk-framework.

    Raises:
        NotImplementedError: ``prep_params["recal"]`` is set to a method
            other than ``"gatk"``.
    """
    requires_gatkfull = False
    args = ["-T", "PrintReads",
            "-L", region_to_gatk(region),
            "-R", data["sam_ref"],
            "-I", data["work_bam"]]
    recal = prep_params["recal"]
    if recal == "gatk":
        if "prep_recal" in data and _recal_has_reads(data["prep_recal"]):
            requires_gatkfull = True
            args += ["-BQSR", data["prep_recal"]]
    elif recal:
        raise NotImplementedError("Recalibration method %s" % recal)
    if requires_gatkfull:
        runner = broad.runner_from_config(data["config"])
        return runner.cl_gatk(args, tmp_dir)
    # Forward tmp_dir so both branches honour the caller's directory.
    jvm_opts = broad.get_gatk_framework_opts(data["config"], tmp_dir)
    # The command must be built from the PrintReads arguments; the original
    # passed ``prep_params`` here, which dropped them entirely.
    return broad.gatk_cmd("gatk-framework", jvm_opts, args)
def _filter_paired(tumor, normal, out_file, reference, data):
    """filter paired vcf file with GATK

    The existing ``out_file`` is first moved to ``<base>-tmp.vcf`` and used
    as input, then SomaticPindelFilter writes the filtered result back to
    ``out_file``.

    :param tumor: (str) sample name for tumor
    :param normal: (str) sample name for normal
    :param out_file: (str) final vcf file
    :param reference: (str) genome in fasta format
    :param data: (dict) information from yaml file(items[0])
    :returns: (str) name of final vcf file
    """
    in_file = utils.splitext_plus(out_file)[0] + "-tmp.vcf"
    shutil.move(out_file, in_file)
    config = data["config"]
    with file_transaction(data, out_file) as tx_out_file:
        params = ["-T", "SomaticPindelFilter",
                  "-V", in_file,
                  "-o", tx_out_file,
                  "-TID", tumor,
                  "-NID", normal,
                  "-R", reference]
        # The command runs gatk-framework, so take JVM options from the
        # matching helper, as the other gatk-framework calls in this file do.
        jvm_opts = broad.get_gatk_framework_opts(config)
        do.run(broad.gatk_cmd("gatk-framework", jvm_opts, params), "Filter pindel variants")
    return out_file
def concat_variant_files(orig_files, out_file, regions, ref_file, config):
    """Join region-split variant files into a single output file.

    A lightweight concatenation for VCF files split by regions that share
    the same sample information, so no complex merging is needed. Works for
    both plain text and bgzipped/tabix indexed outputs.

    GATK CatVariants does the work; if it fails with one of the known
    stringency errors the partial output is deleted and bcftools concat is
    used instead. Other errors propagate to the caller.
    """
    gatk_stringency_errors = ("We require all VCFs to have complete VCF headers",
                              "Features added out of order",
                              "The reference allele cannot be missing")
    if not utils.file_exists(out_file):
        by_region = _sort_by_region(orig_files, regions, ref_file, config)
        exist_files = [f for f in by_region if os.path.exists(f) and vcf_has_variants(f)]
        if len(exist_files) == 0:
            # no non-empty inputs, merge the empty ones
            exist_files = [f for f in by_region if os.path.exists(f)]
        ready_files = run_multicore(p_bgzip_and_index, [[f, config] for f in exist_files], config)

        # CatVariants reads its inputs from a list file, one path per line.
        list_base = utils.splitext_plus(out_file)[0]
        input_file_list = "%s-files.list" % list_base
        with open(input_file_list, "w") as list_handle:
            for ready_file in ready_files:
                list_handle.write(ready_file + "\n")

        fall_back = False
        with file_transaction(config, out_file) as tx_out_file:
            params = ["org.broadinstitute.gatk.tools.CatVariants"]
            params.extend(["-R", ref_file, "-V", input_file_list])
            params.extend(["-out", tx_out_file, "-assumeSorted"])
            jvm_opts = broad.get_gatk_framework_opts(config, os.path.dirname(tx_out_file),
                                                     include_gatk=False)
            cmd = broad.gatk_cmd("gatk-framework", jvm_opts, params)
            try:
                do.run(cmd, "Concat variant files", log_error=False)
            except subprocess.CalledProcessError as msg:
                if not any(known in str(msg) for known in gatk_stringency_errors):
                    raise
                os.remove(tx_out_file)
                fall_back = True
        if fall_back:
            return _run_concat_variant_files_bcftools(input_file_list, out_file, config)
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    return out_file
def _filter_bad_reads(in_bam, ref_file, data):
    """Remove problem reads which choke GATK and Picard using GATK read filters.

    Indexes ``in_bam``, then (unless ``<in_bam base>-gatkfilter.bam``
    already exists) runs PrintReads -- or FixMisencodedBaseQualityReads
    when the quality format is "illumina" -- with three read filters.
    The output BAM is indexed and its path returned.
    """
    bam.index(in_bam, data["config"])
    out_file = "%s-gatkfilter.bam" % os.path.splitext(in_bam)[0]
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as tmp_dir, file_transaction(data, out_file) as tx_out_file:
            if dd.get_quality_format(data, "").lower() == "illumina":
                tool = "FixMisencodedBaseQualityReads"
            else:
                tool = "PrintReads"
            params = [tool, "-R", ref_file, "-I", in_bam, "-O", tx_out_file]
            read_filters = ("MatchingBasesAndQualsReadFilter",
                            "SeqIsStoredReadFilter",
                            "CigarContainsNoNOperator")
            for read_filter in read_filters:
                params += ["-RF", read_filter]
            jvm_opts = broad.get_gatk_opts(data["config"], tmp_dir)
            cmd = broad.gatk_cmd("gatk", jvm_opts, params)
            do.run(cmd, "Filter problem reads")
    bam.index(out_file, data["config"])
    return out_file
def genotype_filter(vcf_file, expression, data, name, filterext=""):
    """Filter genotypes in a VCF with a GATK VariantFiltration expression.

    Matching genotypes receive FT tags; the general FILTER flag is not used.

    The output is named ``<base>-filter<filterext><ext>`` after
    ``vcf_file`` and GATK only runs when it does not exist. Outputs ending
    in ``.vcf.gz`` are passed through ``vcfutils.bgzip_and_index``, whose
    result is returned.
    """
    base, ext = utils.splitext_plus(vcf_file)
    out_file = "{base}-filter{filterext}{ext}".format(base=base, filterext=filterext, ext=ext)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            ref_base = tz.get_in(["reference", "fasta", "base"], data)
            params = ["-T", "VariantFiltration", "-R", ref_base]
            params += ["--variant", vcf_file, "--out", tx_out_file]
            # The expression is single-quoted within the argument itself.
            params += ["--genotypeFilterName", name,
                       "--genotypeFilterExpression", "'%s'" % expression]
            tx_dir = os.path.dirname(tx_out_file)
            jvm_opts = broad.get_gatk_framework_opts(data["config"], tx_dir)
            cmd = broad.gatk_cmd("gatk-framework", jvm_opts, params)
            do.run(cmd, "Filter with expression: %s" % expression)
    if out_file.endswith(".vcf.gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
def _filter_bad_reads(in_bam, ref_file, data):
    """Filter out problem reads which choke GATK and Picard via GATK PrintReads.

    Indexes ``in_bam`` and, when ``<in_bam base>-gatkfilter.bam`` is
    missing, runs gatk-framework PrintReads with the mismatching
    base/quality, bases-not-stored and N-cigar filters. For "illumina"
    quality format the misencoded quality fix flag is added. The output is
    indexed and its path returned.
    """
    bam.index(in_bam, data["config"])
    out_file = "%s-gatkfilter.bam" % os.path.splitext(in_bam)[0]
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as tmp_dir, file_transaction(data, out_file) as tx_out_file:
            params = ["-T", "PrintReads", "-R", ref_file, "-I", in_bam, "--out", tx_out_file]
            params.extend(["--filter_mismatching_base_and_quals",
                           "--filter_bases_not_stored",
                           "--filter_reads_with_N_cigar"])
            quality_format = dd.get_quality_format(data, "").lower()
            if quality_format == "illumina":
                params.append("--fix_misencoded_quality_scores")
            jvm_opts = broad.get_gatk_framework_opts(data["config"], tmp_dir)
            cmd = broad.gatk_cmd("gatk-framework", jvm_opts, params)
            do.run(cmd, "Filter problem reads")
    bam.index(out_file, data["config"])
    return out_file
def _gatk_extract_reads_cl(data, region, prep_params, tmp_dir):
    """Build the GATK PrintReads command extracting a region from the full BAM.

    Recalibration is applied when ``prep_params["recal"]`` is ``"gatk"``
    and ``data`` has a ``prep_recal`` file that ``_recal_has_reads``
    accepts; that case adds ``-BQSR`` and uses the full GATK runner.
    Any other truthy recalibration method raises NotImplementedError.
    Without recalibration the gatk-framework command is returned.
    """
    args = ["-T", "PrintReads"]
    args += ["-L", region_to_gatk(region)]
    args += ["-R", data["sam_ref"]]
    args += ["-I", data["work_bam"]]

    recal_method = prep_params["recal"]
    use_full_gatk = False
    if recal_method == "gatk":
        if "prep_recal" in data and _recal_has_reads(data["prep_recal"]):
            use_full_gatk = True
            args.extend(["-BQSR", data["prep_recal"]])
    elif recal_method:
        raise NotImplementedError("Recalibration method %s" % prep_params["recal"])

    if not use_full_gatk:
        jvm_opts = broad.get_gatk_framework_opts(data["config"], tmp_dir)
        return broad.gatk_cmd("gatk-framework", jvm_opts, args)
    full_runner = broad.runner_from_config(data["config"])
    return full_runner.cl_gatk(args, tmp_dir)
def _filter_bad_reads(in_bam, ref_file, data):
    """Use GATK read filters to drop problem reads which choke GATK and Picard.

    The input BAM is indexed first. If ``<in_bam base>-gatkfilter.bam`` is
    absent, GATK runs FixMisencodedBaseQualityReads for "illumina" quality
    format, or PrintReads otherwise, with the MatchingBasesAndQuals,
    SeqIsStored and CigarContainsNoNOperator read filters. The output is
    indexed and its path returned.
    """
    config = data["config"]
    bam.index(in_bam, config)
    in_base = os.path.splitext(in_bam)[0]
    out_file = "%s-gatkfilter.bam" % in_base
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as tmp_dir:
            with file_transaction(data, out_file) as tx_out_file:
                is_illumina = dd.get_quality_format(data, "").lower() == "illumina"
                gatk_tool = "FixMisencodedBaseQualityReads" if is_illumina else "PrintReads"
                io_args = ["-R", ref_file, "-I", in_bam, "-O", tx_out_file]
                filter_args = ["-RF", "MatchingBasesAndQualsReadFilter",
                               "-RF", "SeqIsStoredReadFilter",
                               "-RF", "CigarContainsNoNOperator"]
                params = [gatk_tool] + io_args + filter_args
                jvm_opts = broad.get_gatk_opts(data["config"], tmp_dir)
                do.run(broad.gatk_cmd("gatk", jvm_opts, params), "Filter problem reads")
    bam.index(out_file, data["config"])
    return out_file