def concat_variant_files_catvariants(orig_files, out_file, regions, ref_file, config):
    """Concatenate multiple variant files from regions into a single output file.

    Uses GATK CatVariants as a lightweight approach to merging VCF files split
    by regions with the same sample information, so no complex merging needed.
    Handles both plain text and bgzipped/tabix indexed outputs.

    Falls back to bcftools concat if CatVariants fails with one of the known
    GATK stringency errors; any other failure is re-raised.

    Returns ``out_file`` after CatVariants succeeds (or when the output already
    exists); otherwise returns the result of the bcftools fallback.
    """
    # Error snippets from GATK that are safe to recover from via bcftools.
    recoverable_errors = ("We require all VCFs to have complete VCF headers",
                          "Features added out of order",
                          "The reference allele cannot be missing")
    if not utils.file_exists(out_file):
        input_file_list = _get_file_list(orig_files, out_file, regions, ref_file, config)
        failed = False
        with file_transaction(config, out_file) as tx_out_file:
            params = ["org.broadinstitute.gatk.tools.CatVariants",
                      "-R", ref_file,
                      "-V", input_file_list,
                      "-out", tx_out_file,
                      "-assumeSorted"]
            jvm_opts = broad.get_gatk_framework_opts(config, os.path.dirname(tx_out_file),
                                                     include_gatk=False)
            try:
                do.run(broad.gatk_cmd("gatk-framework", jvm_opts, params),
                       "Concat variant files", log_error=False)
            except subprocess.CalledProcessError as msg:
                error_text = str(msg)
                if any(snippet in error_text for snippet in recoverable_errors):
                    # GATK may fail before creating any output; only remove
                    # a partial file if one was actually written.
                    if os.path.exists(tx_out_file):
                        os.remove(tx_out_file)
                    failed = True
                else:
                    raise
        if failed:
            return _run_concat_variant_files_bcftools(input_file_list, out_file, config)
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    return out_file
def _filter_bad_reads(in_bam, ref_file, data):
    """Strip reads that choke GATK and Picard using GATK PrintReads filters.

    Indexes ``in_bam``, writes the filtered reads to ``<in_bam base>-gatkfilter.bam``
    unless that file already exists, indexes the result and returns its path.
    """
    config = data["config"]
    bam.index(in_bam, config)
    out_file = "%s-gatkfilter.bam" % os.path.splitext(in_bam)[0]
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as tmp_dir:
            with file_transaction(data, out_file) as tx_out_file:
                read_filters = ["--filter_mismatching_base_and_quals",
                                "--filter_bases_not_stored",
                                "--filter_reads_with_N_cigar"]
                # Extra flag applied when the declared quality format is "illumina".
                if dd.get_quality_format(data, "").lower() == "illumina":
                    read_filters.append("--fix_misencoded_quality_scores")
                params = ["-T", "PrintReads",
                          "-R", ref_file,
                          "-I", in_bam,
                          "--out", tx_out_file] + read_filters
                jvm_opts = broad.get_gatk_framework_opts(config, tmp_dir)
                gatk = config_utils.get_program("gatk-framework", config)
                do.run([gatk] + jvm_opts + params, "Filter problem reads")
    bam.index(out_file, config)
    return out_file
def concat_variant_files(orig_files, out_file, regions, ref_file, config):
    """Concatenate multiple variant files from regions into a single output file.

    Lightweight approach to merging VCF files split by regions with the same
    sample information, so no complex merging needed. Handles both plain text
    and bgzipped/tabix indexed outputs.

    Falls back to slower CombineVariants if CatVariants fails because of the
    GATK "complete VCF headers" stringency check; other failures are re-raised.

    Returns ``out_file`` on success (or when it already exists), otherwise the
    result of ``combine_variant_files``.
    """
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            sorted_files = _sort_by_region(orig_files, regions, ref_file, config)
            exist_files = [x for x in sorted_files if os.path.exists(x)]
            ready_files = run_multicore(p_bgzip_and_index, [[x, config] for x in exist_files], config)
            # CatVariants reads its inputs from a list file next to the output.
            input_file_list = "%s-files.list" % utils.splitext_plus(out_file)[0]
            with open(input_file_list, "w") as out_handle:
                for fname in ready_files:
                    out_handle.write(fname + "\n")
            params = ["org.broadinstitute.gatk.tools.CatVariants",
                      "-R", ref_file,
                      "-V", input_file_list,
                      "-out", tx_out_file,
                      "-assumeSorted"]
            jvm_opts = broad.get_gatk_framework_opts(config, include_gatk=False)
            cmd = [config_utils.get_program("gatk-framework", config)] + params + jvm_opts
            try:
                do.run(cmd, "Concat variant files", log_error=False)
            except subprocess.CalledProcessError as msg:
                # Use a membership test: str.find() returns -1 (truthy) when the
                # text is absent and 0 (falsy) when it starts the message.
                if "We require all VCFs to have complete VCF headers" in str(msg):
                    return combine_variant_files(orig_files, out_file, ref_file, config)
                raise
    return out_file
def variants(data):
    """Annotate a sample's variants with GC content and tabulate GC versus depth.

    Returns ``data`` unchanged. When ``data`` has no ``vrn_file`` entry nothing
    is run. Otherwise, working inside ``<work_dir>/report/variants``:

    - GATK VariantAnnotator (``-A GCContent``) writes ``<sample>_with-gc.vcf.gz``
    - ``bcftools query`` writes ``<sample>_cg-depth-parse.tsv`` with a
      ``CG``/``depth``/``sample`` header line.

    Each output is skipped when it already exists.
    """
    if "vrn_file" not in data:
        return data
    in_vcf = data["vrn_file"]
    work_dir = os.path.join(dd.get_work_dir(data), "report", "variants")
    with chdir(work_dir):
        in_bam = data["work_bam"]
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        gatk_jar = config_utils.get_program("gatk", data["config"], "dir")
        bed_file = dd.get_variant_regions(data)
        sample = splitext_plus(os.path.basename(in_vcf))[0]
        # Outputs are relative paths, resolved against work_dir via chdir.
        cg_file = sample + "_with-gc.vcf.gz"
        parse_file = sample + "_cg-depth-parse.tsv"
        if not file_exists(cg_file):
            with file_transaction(cg_file) as tx_out:
                cmd = ("java -jar {gatk_jar}/GenomeAnalysisTK.jar -T VariantAnnotator -R {ref_file} "
                       "-L {bed_file} -I {in_bam} "
                       "-A GCContent --variant {in_vcf} --out {tx_out}")
                do.run(cmd.format(gatk_jar=gatk_jar, ref_file=ref_file, bed_file=bed_file,
                                  in_bam=in_bam, in_vcf=in_vcf, tx_out=tx_out),
                       " GC bias for %s" % in_vcf)
        if not file_exists(parse_file):
            with file_transaction(parse_file) as out_tx:
                # Write and close the header first so it is on disk before
                # bcftools appends to the same file.
                with open(out_tx, "w") as out_handle:
                    out_handle.write("CG\tdepth\tsample\n")
                cmd = ("bcftools query -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R {bed_file} {cg_file} >> {out_tx}")
                do.run(cmd.format(bed_file=bed_file, cg_file=cg_file, out_tx=out_tx),
                       " query for %s" % in_vcf)
                logger.debug('parsing coverage: %s' % sample)
    return data
def gatk_filter_rnaseq(data, vrn_file, out_file):
    """Apply GATK VariantFiltration with RNA-seq specific hard filters.

    Incorporates the filters listed in
    https://software.broadinstitute.org/gatk/guide/article?id=3891:
    flags clusters of 3 variants within a 35 nucleotide window, high Fisher
    strand values (FS > 30.0) and low quality by depth (QD < 2.0).

    Equivalent command line::

        java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta
            -V input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0"
            -filterName QD -filter "QD < 2.0" -o output.vcf

    Returns ``out_file``; the filter is not re-run if it already exists.
    """
    # Runner result is not used below; the call is retained as in the original.
    broad.runner_from_config(dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            cluster_args = ["--clusterWindowSize", "35",
                            "--clusterSize", "3"]
            # Expressions are double quoted so they survive shell parsing.
            filter_args = ["--filterExpression", "\"'FS > 30.0'\"",
                           "--filterName", "FS",
                           "--filterExpression", "\"'QD < 2.0'\"",
                           "--filterName", "QD"]
            params = (["-T", "VariantFiltration", "-R", ref_file, "-V", vrn_file]
                      + cluster_args + filter_args + ["-o", tx_out_file])
            jvm_opts = broad.get_gatk_framework_opts(dd.get_config(data),
                                                     os.path.dirname(tx_out_file))
            gatk_cl = broad.gatk_cmd("gatk-framework", jvm_opts, params)
            do.run(gatk_cl, "Filter variants.")
    return out_file
def calc_variants_stats(data, args):
    """Annotate a VCF with GC content and tabulate GC versus depth per sample.

    Reads the VCF and BAM paths from ``data['vcf']`` and ``data['bam']``, and
    the reference, regions and output directory from ``args.reference``,
    ``args.region`` and ``args.out``.

    - GATK VariantAnnotator (``-A GCContent``) writes ``<sample>_with-gc.vcf.gz``
    - ``bcftools query`` writes ``<sample>_cg-depth-parse.tsv`` with a
      ``CG``/``depth``/``sample`` header line.

    Each output is skipped when it already exists. Returns the path of the
    parsed TSV file.
    """
    in_vcf = data['vcf']
    ref_file = args.reference
    gatk_jar = config_utils.get_program("gatk", data['config'], "dir")
    bed_file = args.region
    sample = splitext_plus(op.basename(in_vcf))[0]
    in_bam = data['bam']
    cg_file = op.join(args.out, sample + "_with-gc.vcf.gz")
    parse_file = op.join(args.out, sample + "_cg-depth-parse.tsv")
    if not file_exists(cg_file):
        with file_transaction(cg_file) as tx_out:
            cmd = ("java -jar {gatk_jar}/GenomeAnalysisTK.jar -T VariantAnnotator -R {ref_file} "
                   "-L {bed_file} -I {in_bam} "
                   "-A GCContent --variant {in_vcf} --out {tx_out}")
            do.run(cmd.format(gatk_jar=gatk_jar, ref_file=ref_file, bed_file=bed_file,
                              in_bam=in_bam, in_vcf=in_vcf, tx_out=tx_out),
                   " cg for %s" % in_vcf)
    if not file_exists(parse_file):
        with file_transaction(parse_file) as out_tx:
            # Write and close the header first so it is on disk before
            # bcftools appends to the same file.
            with open(out_tx, 'w') as out_handle:
                out_handle.write("CG\tdepth\tsample\n")
            cmd = ("bcftools query -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R {bed_file} {cg_file} >> {out_tx}")
            do.run(cmd.format(bed_file=bed_file, cg_file=cg_file, out_tx=out_tx),
                   " query for %s" % in_vcf)
            logger.info('parsing coverage: %s' % sample)
    return parse_file
def genotype_filter(vcf_file, expression, data, name, filterext=""):
    """Filter genotypes with GATK VariantFiltration using ``expression``.

    Adds FT tags to genotypes, rather than setting the general FILTER flag.
    Output is written next to the input as ``<base>-filter<filterext><ext>``
    and is bgzipped and indexed when it ends with ``.vcf.gz``.

    Returns the path of the filtered file.
    """
    base, ext = utils.splitext_plus(vcf_file)
    out_file = "{base}-filter{filterext}{ext}".format(base=base, filterext=filterext, ext=ext)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            reference = tz.get_in(["reference", "fasta", "base"], data)
            params = ["-T", "VariantFiltration", "-R", reference]
            params += ["--variant", vcf_file, "--out", tx_out_file]
            params += ["--genotypeFilterName", name]
            # Single quotes keep the expression as one shell token.
            params += ["--genotypeFilterExpression", "'%s'" % expression]
            jvm_opts = broad.get_gatk_framework_opts(data["config"])
            gatk_cl = broad.gatk_cmd("gatk-framework", jvm_opts, params)
            do.run(gatk_cl, "Filter with expression: %s" % expression)
    if out_file.endswith(".vcf.gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
def _gatk_extract_reads_cl(data, region, prep_params, tmp_dir):
    """Build the GATK PrintReads command line for extracting reads in a region.

    Adds downsampling when ``prep_params`` has a ``max_depth`` and applies
    base quality recalibration (``-BQSR``) when ``recal`` is ``"gatk"`` and a
    usable recalibration file is present. ``tmp_dir`` is not used here.

    Returns the command as a list. Raises NotImplementedError for any other
    non-empty recalibration method.
    """
    args = ["-T", "PrintReads",
            "-L", region_to_gatk(region),
            "-R", data["sam_ref"],
            "-I", data["work_bam"]]
    if prep_params.get("max_depth"):
        args.extend(["--downsample_to_coverage", str(prep_params["max_depth"])])
    recal_method = prep_params["recal"]
    if recal_method == "gatk":
        if "prep_recal" in data and _recal_has_reads(data["prep_recal"]):
            args.extend(["-BQSR", data["prep_recal"]])
    elif recal_method:
        raise NotImplementedError("Recalibration method %s" % recal_method)
    memscale = {"direction": "decrease", "magnitude": 3}
    jvm_opts = broad.get_gatk_framework_opts(data["config"], memscale=memscale)
    gatk = config_utils.get_program("gatk-framework", data["config"])
    return [gatk] + jvm_opts + args
def combine_variant_files(orig_files, out_file, ref_file, config,
                          quiet_out=True, region=None):
    """Combine VCF files from the same sample into a single output file.

    Handles cases where we split files into SNPs/Indels for processing then
    need to merge back into a final file.

    Will parallelize up to 4 cores based on documented recommendations:
    https://www.broadinstitute.org/gatk/gatkdocs/
    org_broadinstitute_gatk_tools_walkers_variantutils_CombineVariants.php

    ``orig_files`` is either a list of VCF paths or a dictionary holding that
    list under ``config["file_key"]``. For the dictionary form the return value
    is a one-element list with a dictionary describing the output; otherwise
    it is ``out_file``. Input files that do not exist are skipped.
    """
    in_pipeline = False
    if isinstance(orig_files, dict):
        file_key = config["file_key"]
        in_pipeline = True
        orig_files = orig_files[file_key]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            exist_files = [x for x in orig_files if os.path.exists(x)]
            ready_files = run_multicore(p_bgzip_and_index, [[x, config] for x in exist_files], config)
            params = ["-T", "CombineVariants",
                      "-R", ref_file,
                      "--out", tx_out_file]
            # Name each input so the inputs can be ranked in the priority list.
            priority_order = []
            for i, ready_file in enumerate(ready_files):
                name = "v%s" % i
                params.extend(["--variant:{name}".format(name=name), ready_file])
                priority_order.append(name)
            params.extend(["--rod_priority_list", ",".join(priority_order)])
            params.extend(["--genotypemergeoption", "PRIORITIZE"])
            if quiet_out:
                params.extend(["--suppressCommandLineHeader", "--setKey", "null"])
            if region:
                variant_regions = config["algorithm"].get("variant_regions", None)
                cur_region = shared.subset_variant_regions(variant_regions, region, out_file)
                if cur_region:
                    params += ["-L", bamprep.region_to_gatk(cur_region),
                               "--interval_set_rule", "INTERSECTION"]
            cores = tz.get_in(["algorithm", "num_cores"], config, 1)
            if cores > 1:
                # Command line elements must be strings, so convert the thread count.
                params += ["-nt", str(min(cores, 4))]
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
            else:
                memscale = None
            jvm_opts = broad.get_gatk_framework_opts(config, memscale=memscale)
            cmd = [config_utils.get_program("gatk-framework", config)] + jvm_opts + params
            do.run(cmd, "Combine variant files")
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    if in_pipeline:
        return [{file_key: out_file, "region": region, "sam_ref": ref_file, "config": config}]
    else:
        return out_file
def _gatk_extract_reads_cl(data, region, prep_params, tmp_dir):
    """Build the GATK PrintReads command line for extracting reads in a region.

    ``prep_params`` is not consulted in this version. The full GATK runner is
    only used when ``requires_gatkfull`` is set, which is currently hard-coded
    to False, so the gatk-framework command is always returned.
    """
    requires_gatkfull = False
    args = ["-T", "PrintReads"]
    args += ["-L", region_to_gatk(region)]
    args += ["-R", dd.get_ref_file(data)]
    args += ["-I", data["work_bam"]]
    if not requires_gatkfull:
        jvm_opts = broad.get_gatk_framework_opts(data["config"], tmp_dir)
        return broad.gatk_cmd("gatk-framework", jvm_opts, args)
    runner = broad.runner_from_config(data["config"])
    return runner.cl_gatk(args, tmp_dir)
def _split_mulitallelic(in_file, data):
    """Split a VCF into biallelic and multiallelic files with GATK SelectVariants.

    Outputs are named ``<base>-biallelic<ext>`` and ``<base>-multiallelic<ext>``;
    each is created only if missing and then bgzipped and indexed.

    Returns a tuple of (biallelic path, multiallelic path).
    """
    base_and_ext = utils.splitext_plus(in_file)
    ba_out = "%s-biallelic%s" % base_and_ext
    ma_out = "%s-multiallelic%s" % base_and_ext
    targets = ((ba_out, "BIALLELIC"), (ma_out, "MULTIALLELIC"))
    for out_file, select_type in targets:
        if not utils.file_exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                params = ["-T", "SelectVariants",
                          "-R", dd.get_ref_file(data),
                          "--variant", in_file,
                          "--out", tx_out_file,
                          "-restrictAllelesTo", select_type]
                jvm_opts = broad.get_gatk_framework_opts(data["config"])
                do.run(["gatk-framework"] + jvm_opts + params,
                       "Select %s variants" % select_type)
        vcfutils.bgzip_and_index(out_file, data["config"])
    return ba_out, ma_out
def _gatk_extract_reads_cl(data, region, prep_params, tmp_dir):
    """Use GATK to extract reads from full BAM file, recalibrating if configured.

    Builds a PrintReads command line for ``region``. When ``prep_params["recal"]``
    is ``"gatk"`` and a usable recalibration file is present, ``-BQSR`` is added
    and the command is built with the full GATK runner; otherwise the
    gatk-framework command is returned.

    Raises NotImplementedError for any other non-empty recalibration method.
    """
    requires_gatkfull = False
    args = ["-T", "PrintReads",
            "-L", region_to_gatk(region),
            "-R", data["sam_ref"],
            "-I", data["work_bam"]]
    if prep_params["recal"] == "gatk":
        if "prep_recal" in data and _recal_has_reads(data["prep_recal"]):
            requires_gatkfull = True
            args += ["-BQSR", data["prep_recal"]]
    elif prep_params["recal"]:
        raise NotImplementedError("Recalibration method %s" % prep_params["recal"])
    if requires_gatkfull:
        runner = broad.runner_from_config(data["config"])
        return runner.cl_gatk(args, tmp_dir)
    else:
        jvm_opts = broad.get_gatk_framework_opts(data["config"])
        # Pass the assembled GATK arguments, not the prep_params dictionary.
        return broad.gatk_cmd("gatk-framework", jvm_opts, args)
def _filter_bad_reads(in_bam, ref_file, config):
    """Strip reads that choke GATK and Picard using a GATK PrintReads filter.

    Indexes ``in_bam`` and writes reads passing
    ``--filter_mismatching_base_and_quals`` to ``<in_bam base>-gatkfilter.bam``
    unless that file already exists. Returns the output path.
    """
    bam.index(in_bam, config)
    out_file = "%s-gatkfilter.bam" % os.path.splitext(in_bam)[0]
    if utils.file_exists(out_file):
        return out_file
    with utils.curdir_tmpdir({"config": config}) as tmp_dir:
        with file_transaction(out_file) as tx_out_file:
            params = ["-T", "PrintReads",
                      "-R", ref_file,
                      "-I", in_bam,
                      "--out", tx_out_file,
                      "--filter_mismatching_base_and_quals"]
            jvm_opts = broad.get_gatk_framework_opts(config, tmp_dir)
            gatk = config_utils.get_program("gatk-framework", config)
            do.run([gatk] + jvm_opts + params, "Filter problem reads")
    return out_file
def _count_rRNA_reads(in_bam, out_file, ref_file, rRNA_interval, single_end, config):
    """Count reads in rRNA genes with the GATK CountReads walker.

    Indexes ``in_bam``, converts ``rRNA_interval`` into an ``rRNA.list`` file
    next to ``out_file`` and runs CountReads with its log directed to the
    output. ``single_end`` is not used here. Returns ``out_file``.
    """
    bam.index(in_bam, config)
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tx_out_file:
        rRNA_coor = os.path.join(os.path.dirname(out_file), "rRNA.list")
        _transform_browser_coor(rRNA_interval, rRNA_coor)
        params = ["-T", "CountReads",
                  "-R", ref_file,
                  "-I", in_bam,
                  "-log", tx_out_file,
                  "-L", rRNA_coor,
                  "--filter_reads_with_N_cigar"]
        jvm_opts = broad.get_gatk_framework_opts(config)
        gatk = config_utils.get_program("gatk-framework", config)
        do.run([gatk] + jvm_opts + params, "counts rRNA for %s" % in_bam)
    return out_file
def _filter_paired(tumor, normal, out_file, reference, data):
    """Filter a paired tumor/normal VCF with the GATK SomaticPindelFilter walker.

    The existing ``out_file`` is first moved to ``<base>-tmp.vcf`` and used as
    the input; the filtered result is written back to ``out_file``.

    :param tumor: (str) sample name for tumor
    :param normal: (str) sample name for normal
    :param out_file: (str) final vcf file
    :param reference: (str) genome in fasta format
    :param data: (dict) information from yaml file(items[0])
    :returns: (str) name of final vcf file
    """
    base = utils.splitext_plus(out_file)[0]
    in_file = base + "-tmp.vcf"
    shutil.move(out_file, in_file)
    config = data["config"]
    with file_transaction(data, out_file) as tx_out_file:
        params = ["-T", "SomaticPindelFilter",
                  "-V", in_file,
                  "-o", tx_out_file,
                  "-TID", tumor,
                  "-NID", normal,
                  "-R", reference]
        jvm_opts = broad.get_gatk_framework_opts(config)
        gatk_cl = broad.gatk_cmd("gatk-framework", jvm_opts, params)
        do.run(gatk_cl, "Filter pindel variants")
    return out_file
def combine_variant_files(orig_files, out_file, ref_file, config,
                          quiet_out=True, region=None):
    """Merge VCF files from the same sample into one file with GATK CombineVariants.

    Handles cases where files were split into SNPs/Indels for processing and
    need to be merged back into a final file. Each input is named ``v<index>``
    and the names are passed as the rod priority list in input order.

    ``orig_files`` is either a list of VCF paths or a dictionary holding that
    list under ``config["file_key"]``. For the dictionary form the return value
    is a one-element list with a dictionary describing the output; otherwise
    it is ``out_file``.
    """
    in_pipeline = isinstance(orig_files, dict)
    if in_pipeline:
        file_key = config["file_key"]
        orig_files = orig_files[file_key]
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            ready_files = run_multicore(p_bgzip_and_index,
                                        [[x, config] for x in orig_files], config)
            params = ["-T", "CombineVariants", "-R", ref_file, "--out", tx_out_file]
            labels = []
            for idx, vcf in enumerate(ready_files):
                label = "v%s" % idx
                labels.append(label)
                params += ["--variant:{name}".format(name=label), vcf]
            params += ["--rod_priority_list", ",".join(labels)]
            if quiet_out:
                params += ["--suppressCommandLineHeader", "--setKey", "null"]
            variant_regions = config["algorithm"].get("variant_regions", None)
            cur_region = shared.subset_variant_regions(variant_regions, region, out_file)
            if cur_region:
                params.extend(["-L", bamprep.region_to_gatk(cur_region),
                               "--interval_set_rule", "INTERSECTION"])
            jvm_opts = broad.get_gatk_framework_opts(config)
            gatk = config_utils.get_program("gatk-framework", config)
            do.run([gatk] + jvm_opts + params, "Combine variant files")
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    if not in_pipeline:
        return out_file
    return [{file_key: out_file, "region": region, "sam_ref": ref_file, "config": config}]
def concat_variant_files(orig_files, out_file, regions, ref_file, config):
    """Concatenate multiple variant files from regions into a single output file.

    Lightweight approach to merging VCF files split by regions with the same
    sample information, so no complex merging needed. Handles both plain text
    and bgzipped/tabix indexed outputs. Inputs without variants are skipped
    unless every input is empty.

    Falls back to bcftools concat if CatVariants fails with one of the known
    GATK stringency errors; any other failure is re-raised.

    Returns ``out_file`` after CatVariants succeeds (or when the output already
    exists); otherwise returns the result of the bcftools fallback.
    """
    # Error snippets from GATK that are safe to recover from via bcftools.
    recoverable_errors = ("We require all VCFs to have complete VCF headers",
                          "Features added out of order",
                          "The reference allele cannot be missing")
    if not utils.file_exists(out_file):
        sorted_files = _sort_by_region(orig_files, regions, ref_file, config)
        exist_files = [x for x in sorted_files if os.path.exists(x) and vcf_has_variants(x)]
        if not exist_files:
            # no non-empty inputs, merge the empty ones
            exist_files = [x for x in sorted_files if os.path.exists(x)]
        ready_files = run_multicore(p_bgzip_and_index, [[x, config] for x in exist_files], config)
        input_file_list = "%s-files.list" % utils.splitext_plus(out_file)[0]
        with open(input_file_list, "w") as out_handle:
            for fname in ready_files:
                out_handle.write(fname + "\n")
        failed = False
        with file_transaction(config, out_file) as tx_out_file:
            params = ["org.broadinstitute.gatk.tools.CatVariants",
                      "-R", ref_file,
                      "-V", input_file_list,
                      "-out", tx_out_file,
                      "-assumeSorted"]
            jvm_opts = broad.get_gatk_framework_opts(config, os.path.dirname(tx_out_file),
                                                     include_gatk=False)
            try:
                do.run(broad.gatk_cmd("gatk-framework", jvm_opts, params),
                       "Concat variant files", log_error=False)
            except subprocess.CalledProcessError as msg:
                error_text = str(msg)
                if any(snippet in error_text for snippet in recoverable_errors):
                    # GATK may fail before creating any output; only remove
                    # a partial file if one was actually written.
                    if os.path.exists(tx_out_file):
                        os.remove(tx_out_file)
                    failed = True
                else:
                    raise
        if failed:
            return _run_concat_variant_files_bcftools(input_file_list, out_file, config)
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    return out_file
def _filter_bad_reads(in_bam, ref_file, data):
    """Remove problem reads with GATK PrintReads so GATK and Picard can proceed.

    The filtered BAM is written to ``<in_bam base>-gatkfilter.bam`` if not
    already present. Both the input and the output BAM are indexed. Returns the
    path of the filtered BAM.
    """
    bam.index(in_bam, data["config"])
    bam_base = os.path.splitext(in_bam)[0]
    out_file = "%s-gatkfilter.bam" % bam_base
    needs_run = not utils.file_exists(out_file)
    if needs_run:
        with tx_tmpdir(data) as tmp_dir:
            with file_transaction(data, out_file) as tx_out_file:
                params = ["-T", "PrintReads"]
                params += ["-R", ref_file]
                params += ["-I", in_bam]
                params += ["--out", tx_out_file]
                params += ["--filter_mismatching_base_and_quals",
                           "--filter_bases_not_stored",
                           "--filter_reads_with_N_cigar"]
                quality_format = dd.get_quality_format(data, "").lower()
                if quality_format == "illumina":
                    params += ["--fix_misencoded_quality_scores"]
                jvm_opts = broad.get_gatk_framework_opts(data["config"], tmp_dir)
                cmd = [config_utils.get_program("gatk-framework", data["config"])]
                cmd = cmd + jvm_opts + params
                do.run(cmd, "Filter problem reads")
    bam.index(out_file, data["config"])
    return out_file
def genotype_filter(vcf_file, expression, data, name, filterext=""):
    """Run GATK VariantFiltration as a genotype-level filter.

    The filter ``name`` is applied through FT tags on genotypes matching
    ``expression``, rather than through the general FILTER flag. Output goes to
    ``<base>-filter<filterext><ext>`` and is bgzipped and indexed when it ends
    with ``.vcf.gz``. Returns the path of the filtered file.
    """
    base, ext = utils.splitext_plus(vcf_file)
    out_file = "{base}-filter{filterext}{ext}".format(base=base, filterext=filterext, ext=ext)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            params = ["-T", "VariantFiltration",
                      "-R", tz.get_in(["reference", "fasta", "base"], data),
                      "--variant", vcf_file,
                      "--out", tx_out_file,
                      "--genotypeFilterName", name,
                      "--genotypeFilterExpression", "'%s'" % expression]
            jvm_opts = broad.get_gatk_framework_opts(data["config"])
            gatk = config_utils.get_program("gatk-framework", data["config"])
            do.run([gatk] + jvm_opts + params,
                   "Filter with expression: %s" % expression)
    if not out_file.endswith(".vcf.gz"):
        return out_file
    return vcfutils.bgzip_and_index(out_file, data["config"])
def concat_variant_files(orig_files, out_file, regions, ref_file, config):
    """Concatenate multiple variant files from regions into a single output file.

    Lightweight approach to merging VCF files split by regions with the same
    sample information, so no complex merging needed. Handles both plain text
    and bgzipped/tabix indexed outputs.

    Falls back to bcftools concat if CatVariants fails with one of the known
    GATK stringency errors; any other failure is re-raised.

    Returns ``out_file`` after CatVariants succeeds (or when the output already
    exists); otherwise returns the result of the bcftools fallback.
    """
    if not utils.file_exists(out_file):
        sorted_files = _sort_by_region(orig_files, regions, ref_file, config)
        exist_files = [x for x in sorted_files if os.path.exists(x)]
        ready_files = run_multicore(p_bgzip_and_index, [[x, config] for x in exist_files], config)
        # CatVariants reads its inputs from a list file next to the output.
        input_file_list = "%s-files.list" % utils.splitext_plus(out_file)[0]
        with open(input_file_list, "w") as out_handle:
            for fname in ready_files:
                out_handle.write(fname + "\n")
        failed = False
        with file_transaction(config, out_file) as tx_out_file:
            params = ["org.broadinstitute.gatk.tools.CatVariants",
                      "-R", ref_file,
                      "-V", input_file_list,
                      "-out", tx_out_file,
                      "-assumeSorted"]
            jvm_opts = broad.get_gatk_framework_opts(config, include_gatk=False)
            cmd = [config_utils.get_program("gatk-framework", config)] + params + jvm_opts
            try:
                do.run(cmd, "Concat variant files", log_error=False)
            except subprocess.CalledProcessError as msg:
                if ("We require all VCFs to have complete VCF headers" in str(msg) or
                        "Features added out of order" in str(msg)):
                    # GATK may fail before creating any output; only remove
                    # a partial file if one was actually written.
                    if os.path.exists(tx_out_file):
                        os.remove(tx_out_file)
                    failed = True
                else:
                    raise
        if failed:
            return concat_variant_files_bcftools(input_file_list, out_file, ref_file, config)
    return out_file
def combine_variant_files(orig_files, out_file, ref_file, config,
                          quiet_out=True, region=None):
    """Combine VCF files from the same sample into a single output file.

    Handles cases where we split files into SNPs/Indels for processing then
    need to merge back into a final file.

    Will parallelize up to 4 cores based on documented recommendations:
    https://www.broadinstitute.org/gatk/gatkdocs/
    org_broadinstitute_gatk_tools_walkers_variantutils_CombineVariants.php

    ``orig_files`` is either a list of VCF paths or a dictionary holding that
    list under ``config["file_key"]``. For the dictionary form the return value
    is a one-element list with a dictionary describing the output; otherwise
    it is ``out_file``. Input files that do not exist are skipped.
    """
    in_pipeline = False
    if isinstance(orig_files, dict):
        file_key = config["file_key"]
        in_pipeline = True
        orig_files = orig_files[file_key]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            exist_files = [x for x in orig_files if os.path.exists(x)]
            ready_files = run_multicore(p_bgzip_and_index, [[x, config] for x in exist_files], config)
            params = ["-T", "CombineVariants",
                      "-R", ref_file,
                      "--out", tx_out_file]
            # Name each input so the inputs can be ranked in the priority list.
            priority_order = []
            for i, ready_file in enumerate(ready_files):
                name = "v%s" % i
                params.extend(["--variant:{name}".format(name=name), ready_file])
                priority_order.append(name)
            params.extend(["--rod_priority_list", ",".join(priority_order)])
            params.extend(["--genotypemergeoption", "PRIORITIZE"])
            if quiet_out:
                params.extend(["--suppressCommandLineHeader", "--setKey", "null"])
            if region:
                variant_regions = config["algorithm"].get("variant_regions", None)
                cur_region = shared.subset_variant_regions(variant_regions, region, out_file)
                if cur_region:
                    params += ["-L", bamprep.region_to_gatk(cur_region),
                               "--interval_set_rule", "INTERSECTION"]
            cores = tz.get_in(["algorithm", "num_cores"], config, 1)
            if cores > 1:
                # Command line elements must be strings, so convert the thread count.
                params += ["-nt", str(min(cores, 4))]
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
            else:
                memscale = None
            jvm_opts = broad.get_gatk_framework_opts(config, os.path.dirname(tx_out_file),
                                                     memscale=memscale)
            do.run(broad.gatk_cmd("gatk-framework", jvm_opts, params), "Combine variant files")
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    if in_pipeline:
        return [{file_key: out_file, "region": region, "sam_ref": ref_file, "config": config}]
    else:
        return out_file