def _call_variants_samtools(align_bams, ref_file, items, target_regions, out_file):
    """Call variants with samtools in target_regions.

    Works around a GATK VCF compatibility issue in samtools 0.20 by removing
    extra Version information from VCF header lines.

    The mpileup output is piped through bcftools, ``vcfutils.pl varFilter``
    and a sed header clean-up into ``out_file``; the stream is passed through
    bgzip when the output name ends in "gz". When none of ``align_bams`` has
    aligned reads in ``target_regions`` an empty VCF is written instead.

    Raises:
        ValueError: bcftools is newer than 0.1.19 while samtools is 0.1.19
            or older.
    """
    config = items[0]["config"]
    max_read_depth = "1000"
    mpileup = prep_mpileup(align_bams, ref_file, max_read_depth, config,
                           target_regions=target_regions)
    bcftools = config_utils.get_program("bcftools", config)
    bcftools_version = programs.get_version("bcftools", config=config)
    samtools_version = programs.get_version("samtools", config=config)
    if LooseVersion(bcftools_version) > LooseVersion("0.1.19"):
        if LooseVersion(samtools_version) <= LooseVersion("0.1.19"):
            raise ValueError("samtools calling not supported with 0.1.19 samtools and 0.20 bcftools")
        bcftools_opts = "call -v -c"
    else:
        bcftools_opts = "view -v -c -g"
    compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
    # The vcfutils.pl executable gets its own local name. Assigning it to
    # `vcfutils` would make that name local to the whole function and hide
    # the `vcfutils` module, so write_empty_vcf below would be looked up on
    # a path string and fail with AttributeError.
    vcfutils_pl = config_utils.get_program("vcfutils.pl", config)
    # XXX Check if we need this when supporting samtools 0.2.0 calling.
    # 0.1.9 fails on regions without reads.
    if not any(realign.has_aligned_reads(x, target_regions) for x in align_bams):
        vcfutils.write_empty_vcf(out_file, config)
    else:
        cmd = ("{mpileup} "
               "| {bcftools} {bcftools_opts} - "
               "| {vcfutils} varFilter -D {max_read_depth} "
               "| sed 's/,Version=3>/>/'"
               "{compress_cmd} > {out_file}")
        # Explicit keywords keep the {vcfutils} placeholder pointing at the
        # executable without relying on local variable names.
        cmd = cmd.format(mpileup=mpileup, bcftools=bcftools,
                         bcftools_opts=bcftools_opts, vcfutils=vcfutils_pl,
                         max_read_depth=max_read_depth,
                         compress_cmd=compress_cmd, out_file=out_file)
        logger.info(cmd)
        do.run(cmd, "Variant calling with samtools", {})
def _call_variants_samtools(align_bams, ref_file, items, target_regions, tx_out_file):
    """Call variants with samtools in target_regions.

    Works around a GATK VCF 4.2 compatibility issue in samtools 1.0 by
    rewriting 4.2-only content in the streamed VCF header lines.

    The mpileup output is piped through ``bcftools call``, ``vt normalize``
    and a chain of sed substitutions into ``tx_out_file``; the stream is
    passed through bgzip when that name ends in ".gz".

    Raises:
        ValueError: the samtools version is 0.1.19 or older.
    """
    first_item = items[0]
    config = first_item["config"]
    mpileup_cmd = prep_mpileup(align_bams, ref_file, config,
                               target_regions=target_regions, want_bcf=True)
    bcftools_exe = config_utils.get_program("bcftools", config)
    # The bcftools version is still looked up, although nothing below uses it.
    programs.get_version("bcftools", config=config)
    samtools_ver = programs.get_version("samtools", config=config)
    if LooseVersion(samtools_ver) <= LooseVersion("0.1.19"):
        raise ValueError("samtools calling not supported with pre-1.0 samtools")
    if tx_out_file.endswith(".gz"):
        compress = "| bgzip -c"
    else:
        compress = ""
    template = ("{mpileup} "
                "| {bcftools} {bcftools_opts} - "
                "| vt normalize -n -q -r {ref_file} - "
                "| sed 's/VCFv4.2/VCFv4.1/' "
                "| sed 's/,Version=3>/>/' "
                "| sed 's/,Version=\"3\">/>/' "
                "| sed 's/Number=R/Number=./' "
                "{compress_cmd} > {tx_out_file}")
    pipeline = template.format(mpileup=mpileup_cmd,
                               bcftools=bcftools_exe,
                               bcftools_opts="call -v -m",
                               ref_file=ref_file,
                               compress_cmd=compress,
                               tx_out_file=tx_out_file)
    do.run(pipeline, "Variant calling with samtools", first_item)
def _call_variants_samtools(align_bams, ref_file, items, target_regions, out_file):
    """Call variants with samtools in target_regions.

    Works around a GATK VCF compatibility issue in samtools 0.20 by removing
    extra Version information from VCF header lines.

    The mpileup output is piped through bcftools, ``vcfutils.pl varFilter``
    and a sed header clean-up, then redirected to ``out_file``.

    Raises:
        ValueError: bcftools is newer than 0.1.19 while samtools is 0.1.19
            or older.
    """
    config = items[0]["config"]
    depth_cap = "1000"
    mpileup_cmd = prep_mpileup(align_bams, ref_file, depth_cap, config,
                               target_regions=target_regions)
    bcftools_exe = config_utils.get_program("bcftools", config)
    bcftools_ver = programs.get_version("bcftools", config=config)
    samtools_ver = programs.get_version("samtools", config=config)
    # bcftools after 0.1.19 uses the "call" sub-command and needs a matching
    # newer samtools; older releases use "view".
    newer_bcftools = LooseVersion(bcftools_ver) > LooseVersion("0.1.19")
    if newer_bcftools and LooseVersion(samtools_ver) <= LooseVersion("0.1.19"):
        raise ValueError("samtools calling not supported with 0.1.19 samtools and 0.20 bcftools")
    caller_opts = "call -v -c" if newer_bcftools else "view -v -c -g"
    varfilter_exe = config_utils.get_program("vcfutils.pl", config)
    template = ("{mpileup} "
                "| {bcftools} {bcftools_opts} - "
                "| {vcfutils} varFilter -D {max_read_depth} "
                "| sed 's/,Version=3>/>/'"
                "> {out_file}")
    full_cmd = template.format(mpileup=mpileup_cmd,
                               bcftools=bcftools_exe,
                               bcftools_opts=caller_opts,
                               vcfutils=varfilter_exe,
                               max_read_depth=depth_cap,
                               out_file=out_file)
    logger.info(full_cmd)
    do.run(full_cmd, "Variant calling with samtools", {})
def _freebayes_custom(in_file, ref_file, data):
    """Custom FreeBayes filtering using bcbio.variation, tuned to human NA12878 results.

    Experimental: for testing new methods.

    Returns None when ``vcfutils.get_paired_phenotype(data)`` is truthy or
    when the reported bcbio_variation version is below 0.1.1. Otherwise
    returns the path formed by inserting "-filter" before the extension of
    ``in_file``, running the bcbio.variation jar first if that file does
    not exist yet.
    """
    if vcfutils.get_paired_phenotype(data):
        return None
    config = data["config"]
    installed_ver = programs.get_version("bcbio_variation", config=config)
    if LooseVersion(installed_ver) < LooseVersion("0.1.1"):
        return None
    base, ext = os.path.splitext(in_file)
    out_file = "%s-filter%s" % (base, ext)
    if utils.file_exists(out_file):
        return out_file
    # Java temporary files go to a "tmp" directory next to the input.
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(in_file), "tmp"))
    jar_dir = config_utils.get_program("bcbio_variation", config, "dir")
    bv_jar = config_utils.get_jar("bcbio.variation", jar_dir)
    resources = config_utils.get_resources("bcbio_variation", config)
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    java_prefix = ["java"] + jvm_opts + ["-Djava.io.tmpdir=%s" % tmp_dir]
    filter_args = ["-jar", bv_jar, "variant-filter", "freebayes", in_file, ref_file]
    do.run(java_prefix + filter_args, "Custom FreeBayes filtering using bcbio.variation")
    return out_file
def prep_gemini_db(fnames, call_id, samples, data):
    """Prepare a gemini database from VCF inputs prepared with snpEff.

    The database lives in a "gemini" directory under the work directory and
    is named by joining ``call_id`` with "-". With more than one input file
    a multisample VCF is prepared and loaded; otherwise the single input is
    loaded directly.

    Returns a one-element list pairing ``call_id`` with a dict whose "db"
    entry is the database path (None when no database is built for these
    inputs) and whose "vcf" entry is the multisample VCF (None for a single
    input).
    """
    gemini_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    gemini_db = os.path.join(gemini_dir, "-".join(call_id) + ".db")
    use_gemini = _do_db_build(samples) and any(vcfutils.vcf_has_variants(f) for f in fnames)
    is_population = len(fnames) > 1
    if not is_population:
        gemini_vcf = fnames[0]
    else:
        name, caller = call_id
        gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    if use_gemini and not utils.file_exists(gemini_db):
        with file_transaction(gemini_db) as tx_gemini_db:
            config = data["config"]
            gemini_exe = config_utils.get_program("gemini", config)
            gemini_ver = programs.get_version("gemini", config=config)
            # Recent versions of gemini allow loading only passing variants
            passonly = LooseVersion(gemini_ver) > LooseVersion("0.6.2.1")
            template = "{gemini} load {load_opts} -v {gemini_vcf} -t snpEff --cores {num_cores} {tx_gemini_db}"
            load_cmd = template.format(
                gemini=gemini_exe,
                load_opts="--passonly" if passonly else "",
                gemini_vcf=gemini_vcf,
                num_cores=config["algorithm"].get("num_cores", 1),
                tx_gemini_db=tx_gemini_db)
            do.run(load_cmd, "Create gemini database for %s" % str(call_id), data)
    result = {"db": gemini_db if use_gemini else None,
              "vcf": gemini_vcf if is_population else None}
    return [[call_id, result]]
def prep_gemini_db(fnames, call_info, samples):
    """Prepare a gemini database from VCF inputs prepared with snpEff.

    ``call_info`` unpacks to ``(name, caller, is_batch)``. The database is
    loaded only when it does not exist yet, the database-build checks pass
    and at least one input VCF has variants.

    Returns a one-element list pairing ``(name, caller)`` with a dict whose
    "db" entry is the database path (None when no such file exists) and
    whose "vcf" entry is the multisample VCF (None unless ``is_batch``).
    """
    data = samples[0]
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    name, caller, is_batch = call_info
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)

    def _inputs_have_variants():
        return any(vcfutils.vcf_has_variants(f) for f in fnames)

    # First pass runs do_db_build with check_gemini=False; the default call
    # is repeated only while a build is still a candidate.
    quick_ok = do_db_build(samples, check_gemini=False) and _inputs_have_variants()
    if (not utils.file_exists(gemini_db) and quick_ok
            and do_db_build(samples) and _inputs_have_variants()):
        with file_transaction(gemini_db) as tx_gemini_db:
            config = data["config"]
            gemini_exe = config_utils.get_program("gemini", config)
            gemini_ver = (programs.get_version("gemini", config=config)
                          if "program_versions" in config.get("resources", {})
                          else None)
            flags = []
            # Recent versions of gemini allow loading only passing variants
            if not gemini_ver or LooseVersion(gemini_ver) > LooseVersion("0.6.2.1"):
                flags.append(" --passonly")
            # For small test files, skip gene table loading which takes a long time
            if gemini_ver and LooseVersion(gemini_ver) > LooseVersion("0.6.4"):
                if _is_small_vcf(gemini_vcf):
                    flags.append(" --skip-gene-tables")
                if "/test_automated_output/" in gemini_vcf:
                    flags.append(" --test-mode")
            template = "{gemini} load {load_opts} -v {gemini_vcf} -t snpEff --cores {num_cores} {tx_gemini_db}"
            load_cmd = template.format(
                gemini=gemini_exe,
                load_opts="".join(flags),
                gemini_vcf=gemini_vcf,
                num_cores=config["algorithm"].get("num_cores", 1),
                tx_gemini_db=tx_gemini_db)
            do.run(load_cmd, "Create gemini database for %s %s" % (name, caller), data)
    outputs = {"db": gemini_db if utils.file_exists(gemini_db) else None,
               "vcf": gemini_vcf if is_batch else None}
    return [[(name, caller), outputs]]
def prep_gemini_db(fnames, call_id, samples, data):
    """Prepare a gemini database from VCF inputs prepared with snpEff.

    The database lives in a "gemini" directory under the work directory and
    is named by joining ``call_id`` with "-". With more than one input file
    a multisample VCF is prepared and loaded; otherwise the single input is
    loaded directly.

    Returns a one-element list pairing ``call_id`` with a dict whose "db"
    entry is the database path (None when no database is built for these
    inputs) and whose "vcf" entry is the multisample VCF (None for a single
    input).
    """
    gemini_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    gemini_db = os.path.join(gemini_dir, "-".join(call_id) + ".db")
    use_gemini = _do_db_build(samples) and any(vcfutils.vcf_has_variants(f) for f in fnames)
    is_population = len(fnames) > 1
    if not is_population:
        gemini_vcf = fnames[0]
    else:
        name, caller = call_id
        gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    if use_gemini and not utils.file_exists(gemini_db):
        with file_transaction(gemini_db) as tx_gemini_db:
            config = data["config"]
            gemini_exe = config_utils.get_program("gemini", config)
            gemini_ver = programs.get_version("gemini", config=config)
            # Recent versions of gemini allow loading only passing variants
            passonly = LooseVersion(gemini_ver) > LooseVersion("0.6.2.1")
            template = "{gemini} load {load_opts} -v {gemini_vcf} -t snpEff --cores {num_cores} {tx_gemini_db}"
            load_cmd = template.format(
                gemini=gemini_exe,
                load_opts="--passonly" if passonly else "",
                gemini_vcf=gemini_vcf,
                num_cores=config["algorithm"].get("num_cores", 1),
                tx_gemini_db=tx_gemini_db)
            do.run(load_cmd, "Create gemini database for %s" % str(call_id), data)
    result = {"db": gemini_db if use_gemini else None,
              "vcf": gemini_vcf if is_population else None}
    return [[call_id, result]]
def prep_gemini_db(fnames, call_info, samples):
    """Prepare a gemini database from VCF inputs prepared with snpEff.

    ``call_info`` unpacks to ``(name, caller, is_batch)``. The database is
    loaded only when it does not exist yet, the database-build checks pass
    and at least one input VCF has variants. The effects annotation passed
    to ``gemini load -t`` is "snpEff" unless the configured effects setting
    differs from "snpeff", in which case it is "VEP".

    Returns a one-element list pairing ``(name, caller)`` with a dict whose
    "db" entry is the database path (None when no such file exists) and
    whose "vcf" entry is the multisample VCF (None unless ``is_batch``).
    """
    data = samples[0]
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    name, caller, is_batch = call_info
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)

    def _inputs_have_variants():
        return any(vcfutils.vcf_has_variants(f) for f in fnames)

    # First pass runs do_db_build with check_gemini=False; the default call
    # is repeated only while a build is still a candidate.
    quick_ok = do_db_build(samples, check_gemini=False) and _inputs_have_variants()
    if (not utils.file_exists(gemini_db) and quick_ok
            and do_db_build(samples) and _inputs_have_variants()):
        with file_transaction(data, gemini_db) as tx_gemini_db:
            config = data["config"]
            gemini_exe = config_utils.get_program("gemini", config)
            gemini_ver = (programs.get_version("gemini", config=config)
                          if "program_versions" in config.get("resources", {})
                          else None)
            flags = []
            # Recent versions of gemini allow loading only passing variants
            if not gemini_ver or LooseVersion(gemini_ver) > LooseVersion("0.6.2.1"):
                flags.append(" --passonly")
            # For small test files, skip gene table loading which takes a long time
            if gemini_ver and LooseVersion(gemini_ver) > LooseVersion("0.6.4"):
                if _is_small_vcf(gemini_vcf):
                    flags.append(" --skip-gene-tables")
                if "/test_automated_output/" in gemini_vcf:
                    flags.append(" --test-mode")
            # Skip CADD when its data file is not present in the gemini directory
            if gemini_ver and LooseVersion(gemini_ver) >= LooseVersion("0.7.0"):
                gemini_dir = install.get_gemini_dir()
                optional_data = [("--skip-cadd", "whole_genome_SNVs.tsv.compressed.gz")]
                flags.extend(" %s" % skip_flag
                             for skip_flag, data_file in optional_data
                             if not os.path.exists(os.path.join(gemini_dir, data_file)))
            # skip gerp-bp which slows down loading
            flags.append(" --skip-gerp-bp ")
            num_cores = config["algorithm"].get("num_cores", 1)
            effects = tz.get_in(("config", "algorithm", "effects"), data, "snpeff")
            if effects == "snpeff":
                eanns = "snpEff"
            else:
                eanns = "VEP"
            template = "{gemini} load {load_opts} -v {gemini_vcf} -t {eanns} --cores {num_cores} {tx_gemini_db}"
            load_cmd = template.format(gemini=gemini_exe,
                                       load_opts="".join(flags),
                                       gemini_vcf=gemini_vcf,
                                       eanns=eanns,
                                       num_cores=num_cores,
                                       tx_gemini_db=tx_gemini_db)
            do.run(load_cmd, "Create gemini database for %s %s" % (name, caller), data)
    outputs = {"db": gemini_db if utils.file_exists(gemini_db) else None,
               "vcf": gemini_vcf if is_batch else None}
    return [[(name, caller), outputs]]
def _call_variants_samtools(align_bams, ref_file, items, target_regions, out_file):
    """Call variants with samtools in target_regions.

    Works around a GATK VCF compatibility issue in samtools 0.20 by removing
    extra Version information from VCF header lines.

    The mpileup output is piped through bcftools, ``vcfutils.pl varFilter``
    and a sed header clean-up into ``out_file``; the stream is passed through
    bgzip when the output name ends in "gz". When none of ``align_bams`` has
    aligned reads in ``target_regions`` an empty VCF is written instead.

    Raises:
        ValueError: bcftools is newer than 0.1.19 while samtools is 0.1.19
            or older.
    """
    config = items[0]["config"]
    max_read_depth = "1000"
    mpileup = prep_mpileup(align_bams, ref_file, max_read_depth, config,
                           target_regions=target_regions)
    bcftools = config_utils.get_program("bcftools", config)
    bcftools_version = programs.get_version("bcftools", config=config)
    samtools_version = programs.get_version("samtools", config=config)
    if LooseVersion(bcftools_version) > LooseVersion("0.1.19"):
        if LooseVersion(samtools_version) <= LooseVersion("0.1.19"):
            raise ValueError(
                "samtools calling not supported with 0.1.19 samtools and 0.20 bcftools"
            )
        bcftools_opts = "call -v -c"
    else:
        bcftools_opts = "view -v -c -g"
    compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
    # The vcfutils.pl executable gets its own local name. Assigning it to
    # `vcfutils` would make that name local to the whole function and hide
    # the `vcfutils` module, so write_empty_vcf below would be looked up on
    # a path string and fail with AttributeError.
    vcfutils_pl = config_utils.get_program("vcfutils.pl", config)
    # XXX Check if we need this when supporting samtools 0.2.0 calling.
    # 0.1.9 fails on regions without reads.
    if not any(
            realign.has_aligned_reads(x, target_regions) for x in align_bams):
        vcfutils.write_empty_vcf(out_file, config)
    else:
        cmd = ("{mpileup} "
               "| {bcftools} {bcftools_opts} - "
               "| {vcfutils} varFilter -D {max_read_depth} "
               "| sed 's/,Version=3>/>/'"
               "{compress_cmd} > {out_file}")
        # Explicit keywords keep the {vcfutils} placeholder pointing at the
        # executable without relying on local variable names.
        cmd = cmd.format(mpileup=mpileup, bcftools=bcftools,
                         bcftools_opts=bcftools_opts, vcfutils=vcfutils_pl,
                         max_read_depth=max_read_depth,
                         compress_cmd=compress_cmd, out_file=out_file)
        logger.info(cmd)
        do.run(cmd, "Variant calling with samtools", {})
def _set_default_versions(self, config):
    """Retrieve pre-computed version information for expensive to retrieve versions.

    Starting up GATK takes a lot of resources so we do it once at start of analysis.

    Stores the results on ``self._gatk_version`` and ``self._picard_version``;
    a program whose lookup raises KeyError is recorded as None.
    """
    def _version_or_none(name):
        try:
            return programs.get_version(name, config=config)
        except KeyError:
            return None

    # Both lookups finish before either attribute is assigned.
    gatk_ver, picard_ver = [_version_or_none(n) for n in ["gatk", "picard"]]
    self._gatk_version = gatk_ver
    self._picard_version = picard_ver
def _set_default_versions(self, config):
    """Retrieve pre-computed version information for expensive to retrieve versions.

    Starting up GATK takes a lot of resources so we do it once at start of analysis.

    Stores the results on ``self._gatk_version``, ``self._picard_version``
    and ``self._mutect_version``; a program whose lookup raises KeyError is
    recorded as None.
    """
    def _version_or_none(name):
        try:
            return programs.get_version(name, config=config)
        except KeyError:
            return None

    # All lookups finish before any attribute is assigned.
    gatk_ver, picard_ver, mutect_ver = [
        _version_or_none(n) for n in ["gatk", "picard", "mutect"]]
    self._gatk_version = gatk_ver
    self._picard_version = picard_ver
    self._mutect_version = mutect_ver
def create_gemini_db(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Load ``gemini_vcf`` into a gemini database unless the database exists.

    ``gemini_db`` defaults to the VCF path with its extension replaced by
    ".db". When ``ped_file`` is given it is added to the freshly loaded
    database with ``gemini amend --sample`` inside the same transaction.

    Returns the database path, or None when the database is missing and the
    VCF has no variants.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if utils.file_exists(gemini_db):
        return gemini_db
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    with file_transaction(data, gemini_db) as tx_gemini_db:
        config = data["config"]
        gemini_exe = config_utils.get_program("gemini", config)
        gemini_ver = (programs.get_version("gemini", config=config)
                      if "program_versions" in config.get("resources", {})
                      else None)
        flags = []
        # Recent versions of gemini allow loading only passing variants
        if not gemini_ver or LooseVersion(gemini_ver) > LooseVersion("0.6.2.1"):
            flags.append(" --passonly")
        # For small test files, skip gene table loading which takes a long time
        if gemini_ver and LooseVersion(gemini_ver) > LooseVersion("0.6.4"):
            if _is_small_vcf(gemini_vcf):
                flags.append(" --skip-gene-tables")
            if "/test_automated_output/" in gemini_vcf:
                flags.append(" --test-mode")
        # Skip CADD when its data file is not present in the gemini directory
        if gemini_ver and LooseVersion(gemini_ver) >= LooseVersion("0.7.0"):
            gemini_dir = install.get_gemini_dir(data)
            optional_data = [("--skip-cadd", "whole_genome_SNVs.tsv.compressed.gz")]
            flags.extend(" %s" % skip_flag
                         for skip_flag, data_file in optional_data
                         if not os.path.exists(os.path.join(gemini_dir, data_file)))
        # skip gerp-bp which slows down loading
        flags.append(" --skip-gerp-bp ")
        num_cores = config["algorithm"].get("num_cores", 1)
        tmpdir = os.path.dirname(tx_gemini_db)
        eanns = _get_effects_flag(data)
        # Apply custom resource specifications, allowing use of alternative annotation_dir
        resources = config_utils.get_resources("gemini", config)
        if resources.get("options"):
            gemini_opts = " ".join([str(x) for x in resources["options"]])
        else:
            gemini_opts = ""
        template = ("{gemini} {gemini_opts} load {load_opts} -v {gemini_vcf} {eanns} --cores {num_cores} "
                    "--tempdir {tmpdir} {tx_gemini_db}")
        load_cmd = template.format(gemini=gemini_exe,
                                   gemini_opts=gemini_opts,
                                   load_opts="".join(flags),
                                   gemini_vcf=gemini_vcf,
                                   eanns=eanns,
                                   num_cores=num_cores,
                                   tmpdir=tmpdir,
                                   tx_gemini_db=tx_gemini_db)
        do.run(load_cmd, "Create gemini database for %s" % gemini_vcf, data)
        if ped_file:
            amend_cmd = [gemini_exe, "amend", "--sample", ped_file, tx_gemini_db]
            do.run(amend_cmd, "Add PED file to gemini database", data)
    return gemini_db
def prep_gemini_db(fnames, call_info, samples):
    """Prepare a gemini database from VCF inputs prepared with snpEff.

    ``call_info`` unpacks to ``(name, caller, is_batch)``. The database is
    loaded only when it does not exist yet, the database-build checks pass
    and at least one input VCF has variants. For gemini 0.7.0 or later the
    CADD and gerp-bp annotations are skipped when their data files are not
    found in the gemini directory.

    Returns a one-element list pairing ``(name, caller)`` with a dict whose
    "db" entry is the database path (None when no such file exists) and
    whose "vcf" entry is the multisample VCF (None unless ``is_batch``).
    """
    data = samples[0]
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    name, caller, is_batch = call_info
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)

    def _inputs_have_variants():
        return any(vcfutils.vcf_has_variants(f) for f in fnames)

    # First pass runs do_db_build with check_gemini=False; the default call
    # is repeated only while a build is still a candidate.
    quick_ok = do_db_build(samples, check_gemini=False) and _inputs_have_variants()
    if (not utils.file_exists(gemini_db) and quick_ok
            and do_db_build(samples) and _inputs_have_variants()):
        with file_transaction(data, gemini_db) as tx_gemini_db:
            config = data["config"]
            gemini_exe = config_utils.get_program("gemini", config)
            gemini_ver = (programs.get_version("gemini", config=config)
                          if "program_versions" in config.get("resources", {})
                          else None)
            flags = []
            # Recent versions of gemini allow loading only passing variants
            if not gemini_ver or LooseVersion(gemini_ver) > LooseVersion("0.6.2.1"):
                flags.append(" --passonly")
            # For small test files, skip gene table loading which takes a long time
            if gemini_ver and LooseVersion(gemini_ver) > LooseVersion("0.6.4"):
                if _is_small_vcf(gemini_vcf):
                    flags.append(" --skip-gene-tables")
                if "/test_automated_output/" in gemini_vcf:
                    flags.append(" --test-mode")
            # Skip CADD or gerp-bp if neither are loaded
            if gemini_ver and LooseVersion(gemini_ver) >= LooseVersion("0.7.0"):
                gemini_dir = install.get_gemini_dir()
                optional_data = [
                    ("--skip-cadd", "whole_genome_SNVs.tsv.compressed.gz"),
                    ("--skip-gerp-bp", "hg19.gerp.bw"),
                ]
                flags.extend(" %s" % skip_flag
                             for skip_flag, data_file in optional_data
                             if not os.path.exists(os.path.join(gemini_dir, data_file)))
            num_cores = config["algorithm"].get("num_cores", 1)
            effects = tz.get_in(("config", "algorithm", "effects"), data, "snpeff")
            if effects == "snpeff":
                eanns = "snpEff"
            else:
                eanns = "VEP"
            template = "{gemini} load {load_opts} -v {gemini_vcf} -t {eanns} --cores {num_cores} {tx_gemini_db}"
            load_cmd = template.format(gemini=gemini_exe,
                                       load_opts="".join(flags),
                                       gemini_vcf=gemini_vcf,
                                       eanns=eanns,
                                       num_cores=num_cores,
                                       tx_gemini_db=tx_gemini_db)
            do.run(load_cmd, "Create gemini database for %s %s" % (name, caller), data)
    outputs = {"db": gemini_db if utils.file_exists(gemini_db) else None,
               "vcf": gemini_vcf if is_batch else None}
    return [[(name, caller), outputs]]
def _freebayes_custom(in_file, ref_file, config):
    """Custom FreeBayes filtering using bcbio.variation, tuned to human NA12878 results.

    Returns None when the reported "bcbio.variation" version is below 0.1.1.
    Otherwise returns the path formed by inserting "-filter" before the
    extension of ``in_file``, running the bcbio.variation jar first if that
    file does not exist yet.
    """
    installed_ver = programs.get_version("bcbio.variation", config=config)
    if LooseVersion(installed_ver) < LooseVersion("0.1.1"):
        return None
    base, ext = os.path.splitext(in_file)
    out_file = "%s-filter%s" % (base, ext)
    if utils.file_exists(out_file):
        return out_file
    # Java temporary files go to a "tmp" directory next to the input.
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(in_file), "tmp"))
    jar_dir = config_utils.get_program("bcbio_variation", config, "dir")
    bv_jar = config_utils.get_jar("bcbio.variation", jar_dir)
    resources = config_utils.get_resources("bcbio_variation", config)
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    java_prefix = ["java"] + jvm_opts + ["-Djava.io.tmpdir=%s" % tmp_dir]
    filter_args = ["-jar", bv_jar, "variant-filter", "freebayes", in_file, ref_file]
    do.run(java_prefix + filter_args, "Custom FreeBayes filtering using bcbio.variation")
    return out_file
def create_gemini_db(gemini_vcf, data, gemini_db=None):
    """Load ``gemini_vcf`` into a gemini database unless the database exists.

    ``gemini_db`` defaults to the VCF path with its extension replaced by
    ".db". The effects annotation passed to ``gemini load -t`` is "snpEff"
    unless the configured effects setting differs from "snpeff", in which
    case it is "VEP".

    Returns the database path, or None when the database is missing and the
    VCF has no variants.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if utils.file_exists(gemini_db):
        return gemini_db
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    with file_transaction(data, gemini_db) as tx_gemini_db:
        config = data["config"]
        gemini_exe = config_utils.get_program("gemini", config)
        gemini_ver = (programs.get_version("gemini", config=config)
                      if "program_versions" in config.get("resources", {})
                      else None)
        flags = []
        # Recent versions of gemini allow loading only passing variants
        if not gemini_ver or LooseVersion(gemini_ver) > LooseVersion("0.6.2.1"):
            flags.append(" --passonly")
        # For small test files, skip gene table loading which takes a long time
        if gemini_ver and LooseVersion(gemini_ver) > LooseVersion("0.6.4"):
            if _is_small_vcf(gemini_vcf):
                flags.append(" --skip-gene-tables")
            if "/test_automated_output/" in gemini_vcf:
                flags.append(" --test-mode")
        # Skip CADD when its data file is not present in the gemini directory
        if gemini_ver and LooseVersion(gemini_ver) >= LooseVersion("0.7.0"):
            gemini_dir = install.get_gemini_dir()
            optional_data = [("--skip-cadd", "whole_genome_SNVs.tsv.compressed.gz")]
            flags.extend(" %s" % skip_flag
                         for skip_flag, data_file in optional_data
                         if not os.path.exists(os.path.join(gemini_dir, data_file)))
        # skip gerp-bp which slows down loading
        flags.append(" --skip-gerp-bp ")
        num_cores = config["algorithm"].get("num_cores", 1)
        effects = tz.get_in(("config", "algorithm", "effects"), data, "snpeff")
        if effects == "snpeff":
            eanns = "snpEff"
        else:
            eanns = "VEP"
        template = "{gemini} load {load_opts} -v {gemini_vcf} -t {eanns} --cores {num_cores} {tx_gemini_db}"
        load_cmd = template.format(gemini=gemini_exe,
                                   load_opts="".join(flags),
                                   gemini_vcf=gemini_vcf,
                                   eanns=eanns,
                                   num_cores=num_cores,
                                   tx_gemini_db=tx_gemini_db)
        do.run(load_cmd, "Create gemini database for %s" % gemini_vcf, data)
    return gemini_db
def create_gemini_db(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Load ``gemini_vcf`` into a gemini database unless the database exists.

    ``gemini_db`` defaults to the VCF path with its extension replaced by
    ".db". When ``ped_file`` is given it is added to the freshly loaded
    database with ``gemini amend --sample`` inside the same transaction.

    Returns the database path, or None when the database is missing and the
    VCF has no variants.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if utils.file_exists(gemini_db):
        return gemini_db
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    with file_transaction(data, gemini_db) as tx_gemini_db:
        config = data["config"]
        gemini_exe = config_utils.get_program("gemini", config)
        gemini_ver = (programs.get_version("gemini", config=config)
                      if "program_versions" in config.get("resources", {})
                      else None)
        flags = []
        # Recent versions of gemini allow loading only passing variants
        if not gemini_ver or LooseVersion(gemini_ver) > LooseVersion("0.6.2.1"):
            flags.append(" --passonly")
        # For small test files, skip gene table loading which takes a long time
        if gemini_ver and LooseVersion(gemini_ver) > LooseVersion("0.6.4"):
            if _is_small_vcf(gemini_vcf):
                flags.append(" --skip-gene-tables")
            if "/test_automated_output/" in gemini_vcf:
                flags.append(" --test-mode")
        # Skip CADD when its data file is not present in the gemini directory
        if gemini_ver and LooseVersion(gemini_ver) >= LooseVersion("0.7.0"):
            gemini_dir = install.get_gemini_dir(data)
            optional_data = [("--skip-cadd", "whole_genome_SNVs.tsv.compressed.gz")]
            flags.extend(" %s" % skip_flag
                         for skip_flag, data_file in optional_data
                         if not os.path.exists(os.path.join(gemini_dir, data_file)))
        # skip gerp-bp which slows down loading
        flags.append(" --skip-gerp-bp ")
        num_cores = config["algorithm"].get("num_cores", 1)
        tmpdir = os.path.dirname(tx_gemini_db)
        eanns = _get_effects_flag(data)
        # Apply custom resource specifications, allowing use of alternative annotation_dir
        resources = config_utils.get_resources("gemini", config)
        if resources.get("options"):
            gemini_opts = " ".join([str(x) for x in resources["options"]])
        else:
            gemini_opts = ""
        template = ("{gemini} {gemini_opts} load {load_opts} -v {gemini_vcf} {eanns} --cores {num_cores} "
                    "--tempdir {tmpdir} {tx_gemini_db}")
        load_cmd = template.format(gemini=gemini_exe,
                                   gemini_opts=gemini_opts,
                                   load_opts="".join(flags),
                                   gemini_vcf=gemini_vcf,
                                   eanns=eanns,
                                   num_cores=num_cores,
                                   tmpdir=tmpdir,
                                   tx_gemini_db=tx_gemini_db)
        do.run(load_cmd, "Create gemini database for %s" % gemini_vcf, data)
        if ped_file:
            amend_cmd = [gemini_exe, "amend", "--sample", ped_file, tx_gemini_db]
            do.run(amend_cmd, "Add PED file to gemini database", data)
    return gemini_db
def prep_gemini_db(fnames, call_info, samples):
    """Prepare a gemini database from VCF inputs prepared with snpEff.

    ``call_info`` unpacks to ``(name, caller, is_batch)``. The database is
    loaded only when it does not exist yet, the database-build checks pass
    and at least one input VCF has variants.

    Returns a one-element list pairing ``(name, caller)`` with a dict whose
    "db" entry is the database path (None when no such file exists) and
    whose "vcf" entry is the multisample VCF (None unless ``is_batch``).
    """
    data = samples[0]
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    name, caller, is_batch = call_info
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)

    def _inputs_have_variants():
        return any(vcfutils.vcf_has_variants(f) for f in fnames)

    # First pass runs do_db_build with check_gemini=False; the default call
    # is repeated only while a build is still a candidate.
    quick_ok = do_db_build(samples, check_gemini=False) and _inputs_have_variants()
    if (not utils.file_exists(gemini_db) and quick_ok
            and do_db_build(samples) and _inputs_have_variants()):
        with file_transaction(gemini_db) as tx_gemini_db:
            config = data["config"]
            gemini_exe = config_utils.get_program("gemini", config)
            gemini_ver = (programs.get_version("gemini", config=config)
                          if "program_versions" in config.get("resources", {})
                          else None)
            flags = []
            # Recent versions of gemini allow loading only passing variants
            if not gemini_ver or LooseVersion(gemini_ver) > LooseVersion("0.6.2.1"):
                flags.append(" --passonly")
            # For small test files, skip gene table loading which takes a long time
            if gemini_ver and LooseVersion(gemini_ver) > LooseVersion("0.6.4"):
                if _is_small_vcf(gemini_vcf):
                    flags.append(" --skip-gene-tables")
                if "/test_automated_output/" in gemini_vcf:
                    flags.append(" --test-mode")
            template = "{gemini} load {load_opts} -v {gemini_vcf} -t snpEff --cores {num_cores} {tx_gemini_db}"
            load_cmd = template.format(
                gemini=gemini_exe,
                load_opts="".join(flags),
                gemini_vcf=gemini_vcf,
                num_cores=config["algorithm"].get("num_cores", 1),
                tx_gemini_db=tx_gemini_db)
            do.run(load_cmd, "Create gemini database for %s %s" % (name, caller), data)
    outputs = {"db": gemini_db if utils.file_exists(gemini_db) else None,
               "vcf": gemini_vcf if is_batch else None}
    return [[(name, caller), outputs]]
def _freebayes_custom(in_file, ref_file, data):
    """Custom FreeBayes filtering using bcbio.variation, tuned to human NA12878 results.

    Experimental: for testing new methods.

    Returns None when ``vcfutils.get_paired_phenotype(data)`` is truthy or
    when the reported bcbio_variation version is below 0.1.1. Otherwise
    returns the path formed by inserting "-filter" before the extension of
    ``in_file``, running the ``bcbio-variation`` command first if that file
    does not exist yet.
    """
    if vcfutils.get_paired_phenotype(data):
        return None
    config = data["config"]
    installed_ver = programs.get_version("bcbio_variation", config=config)
    if LooseVersion(installed_ver) < LooseVersion("0.1.1"):
        return None
    base, ext = os.path.splitext(in_file)
    out_file = "%s-filter%s" % (base, ext)
    if utils.file_exists(out_file):
        return out_file
    # Java temporary files go to a "tmp" directory next to the input.
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(in_file), "tmp"))
    resources = config_utils.get_resources("bcbio_variation", config)
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    launcher = ["bcbio-variation"] + jvm_opts + ["-Djava.io.tmpdir=%s" % tmp_dir]
    filter_args = ["variant-filter", "freebayes", in_file, ref_file]
    do.run(launcher + filter_args, "Custom FreeBayes filtering using bcbio.variation")
    return out_file
def create_gemini_db(gemini_vcf, data, gemini_db=None):
    """Load ``gemini_vcf`` into a gemini database unless the database exists.

    ``gemini_db`` defaults to the VCF path with its extension replaced by
    ".db". The effects annotation passed to ``gemini load -t`` is "snpEff"
    unless the configured effects setting differs from "snpeff", in which
    case it is "VEP".

    Returns the database path, or None when the database is missing and the
    VCF has no variants.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if utils.file_exists(gemini_db):
        return gemini_db
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    with file_transaction(data, gemini_db) as tx_gemini_db:
        config = data["config"]
        gemini_exe = config_utils.get_program("gemini", config)
        gemini_ver = (programs.get_version("gemini", config=config)
                      if "program_versions" in config.get("resources", {})
                      else None)
        flags = []
        # Recent versions of gemini allow loading only passing variants
        if not gemini_ver or LooseVersion(gemini_ver) > LooseVersion("0.6.2.1"):
            flags.append(" --passonly")
        # For small test files, skip gene table loading which takes a long time
        if gemini_ver and LooseVersion(gemini_ver) > LooseVersion("0.6.4"):
            if _is_small_vcf(gemini_vcf):
                flags.append(" --skip-gene-tables")
            if "/test_automated_output/" in gemini_vcf:
                flags.append(" --test-mode")
        # Skip CADD when its data file is not present in the gemini directory
        if gemini_ver and LooseVersion(gemini_ver) >= LooseVersion("0.7.0"):
            gemini_dir = install.get_gemini_dir()
            optional_data = [("--skip-cadd", "whole_genome_SNVs.tsv.compressed.gz")]
            flags.extend(" %s" % skip_flag
                         for skip_flag, data_file in optional_data
                         if not os.path.exists(os.path.join(gemini_dir, data_file)))
        # skip gerp-bp which slows down loading
        flags.append(" --skip-gerp-bp ")
        num_cores = config["algorithm"].get("num_cores", 1)
        effects = tz.get_in(("config", "algorithm", "effects"), data, "snpeff")
        if effects == "snpeff":
            eanns = "snpEff"
        else:
            eanns = "VEP"
        template = "{gemini} load {load_opts} -v {gemini_vcf} -t {eanns} --cores {num_cores} {tx_gemini_db}"
        load_cmd = template.format(gemini=gemini_exe,
                                   load_opts="".join(flags),
                                   gemini_vcf=gemini_vcf,
                                   eanns=eanns,
                                   num_cores=num_cores,
                                   tx_gemini_db=tx_gemini_db)
        do.run(load_cmd, "Create gemini database for %s" % gemini_vcf, data)
    return gemini_db