def run_vcfanno(vcf_file, data, decomposed=False):
    """Annotate a VCF with vcfanno using configured or default annotation sets.

    Configurations returned by ``dd.get_vcfanno(data)`` are grouped by the base
    path they are run with; when nothing is configured the groups come from
    ``_default_conf_files(data)``. Groups are run one after another, each run
    reading the output of the most recent run that produced a file.

    Returns the last annotated file produced, or None if no run produced one.
    """
    configured = dd.get_vcfanno(data)
    if configured:
        if not isinstance(configured, (list, tuple)):
            configured = [configured]
        by_basepath = collections.defaultdict(list)
        for conf in configured:
            # The GEMINI directory is only supplied for configurations whose
            # name contains "gemini" when is_human(data, builds=["37"]) holds.
            if conf.find("gemini") >= 0 and is_human(data, builds=["37"]):
                basepath = install.get_gemini_dir(data)
            else:
                basepath = None
            by_basepath[basepath].append(conf)
        grouped = by_basepath.items()
    else:
        grouped = _default_conf_files(data)

    out_file = None
    if not grouped:
        return out_file
    current_vcf = vcf_file
    for basepath, group_confs in grouped:
        ann_file = vcfanno.run_vcfanno(current_vcf, group_confs, data,
                                       data_basepath=basepath,
                                       decomposed=decomposed)
        if ann_file:
            # Chain annotations: the next group starts from this output
            out_file = ann_file
            current_vcf = ann_file
    return out_file
def create_gemini_db(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Load variants into a GEMINI database with the vcfanno/vcf2db workflow.

    ``gemini_db`` defaults to ``utils.splitext_plus(gemini_vcf)[0]`` plus
    ``.db``. Returns None when the VCF has no variants. An existing database
    is returned without being rebuilt. Otherwise the VCF is annotated with the
    configured vcfanno files (falling back to ``["gemini"]``) and loaded with
    ``vcf2db.py``.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    if utils.file_exists(gemini_db):
        return gemini_db

    if support_gemini_orig(data):
        basepath = install.get_gemini_dir(data)
    else:
        basepath = None
    annotation_confs = dd.get_vcfanno(data) or ["gemini"]
    annotated_vcf = vcfanno.run_vcfanno(gemini_vcf, annotation_confs, data, basepath)
    with file_transaction(data, gemini_db) as tx_gemini_db:
        vcf2db = config_utils.get_program("vcf2db.py", data)
        expand_args = []
        # Optional genotype columns, enabled through the "vcf2db_expand" tool flag
        if "vcf2db_expand" in dd.get_tools_on(data):
            for field in ("gt_types", "gt_ref_depths", "gt_alt_depths"):
                expand_args.extend(["--expand", field])
        do.run([vcf2db, annotated_vcf, ped_file, tx_gemini_db] + expand_args,
               "GEMINI: create database with vcf2db")
    return gemini_db
def _has_gemini(data):
    """Check for a non-empty GEMINI directory with a sibling configuration file.

    True only when the directory from ``install.get_gemini_dir(data)`` exists,
    contains at least one entry, and ``gemini-config.yaml`` exists in its
    parent directory.
    """
    from bcbio import install
    gemini_dir = install.get_gemini_dir(data)
    if not os.path.exists(gemini_dir):
        return False
    if len(os.listdir(gemini_dir)) == 0:
        return False
    config_file = os.path.join(os.path.dirname(gemini_dir), "gemini-config.yaml")
    return os.path.exists(config_file)
def download_prepped_genome(genome_build, data, name, need_remap, out_dir=None):
    """Get a pre-prepared genome from S3, unpacking it locally.

    Supports runs on AWS where we can retrieve the resources on demand. Upgrades
    GEMINI in place if installed inside a Docker container with the biological data.
    GEMINI install requires write permissions to standard data directories -- works
    on AWS but not generalizable elsewhere.

    Args:
        genome_build: Build name; selects the sub-directory under ``out_dir``
            and is substituted into the remote archive location.
        data: Sample dictionary. Read for ``dirs/work``, ``genome_build`` and
            ``files``; ``genome_resources`` is set on it when an in-place
            index is built.
        name: Index/target name, expanded through ``REMAP_NAMES``.
        need_remap: When true, return the base FASTA path rather than an index.
        out_dir: Directory holding genomes; defaults to
            ``<work>/inputs/data/genomes``.

    Returns:
        The ``seq/<build>.fa`` path when ``need_remap`` is set or ``name`` is
        "samtools"; otherwise the common filename prefix (trailing dots
        removed) inside the last remapped index directory.

    Raises:
        ValueError: If connecting to the remote genome archive fails.
    """
    from bcbio.variation import population
    from bcbio import install
    if not out_dir:
        out_dir = utils.safe_makedir(os.path.join(tz.get_in(["dirs", "work"], data),
                                                  "inputs", "data", "genomes"))
    for target in REMAP_NAMES.get(name, [name]):
        ref_dir = os.path.join(out_dir, genome_build, target)
        if not os.path.exists(ref_dir):
            if target in INPLACE_INDEX:
                ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
                # Need to add genome resources so we can retrieve GTF files for STAR
                data["genome_resources"] = get_resources(data["genome_build"], ref_file, data)
                INPLACE_INDEX[target](ref_file, ref_dir, data)
            else:
                # XXX Currently only supports genomes from S3 us-east-1 bucket.
                # Need to assess how slow this is from multiple regions and generalize to non-AWS.
                fname = objectstore.BIODATA_INFO["s3"].format(build=genome_build, target=target)
                try:
                    objectstore.connect(fname)
                except Exception:
                    # Narrowed from a bare except so KeyboardInterrupt/SystemExit
                    # propagate instead of being reported as a missing genome.
                    raise ValueError("Could not find reference genome file %s %s" % (genome_build, name))
                with utils.chdir(out_dir):
                    cmd = objectstore.cl_input(fname, unpack=False, anonpipe=False) + " | pigz -d -c | tar -xvp"
                    # The template is filled from local variables, so local
                    # names in this function must stay stable.
                    do.run(cmd.format(**locals()), "Download pre-prepared genome data: %s" % genome_build)
    ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
    if data.get("genome_build"):
        gresources = get_resources(data["genome_build"], ref_file, data)
        if data.get("files") and population.do_db_build([data], need_bam=False, gresources=gresources):
            # symlink base GEMINI directory to work directory, avoiding write/space issues
            out_gemini_dir = utils.safe_makedir(os.path.join(os.path.dirname(ref_dir), "gemini_data"))
            orig_gemini_dir = install.get_gemini_dir()
            # Remove empty initial directory created by installer
            if os.path.isdir(orig_gemini_dir) and len(os.listdir(orig_gemini_dir)) == 0:
                if os.path.islink(orig_gemini_dir):
                    os.remove(orig_gemini_dir)
                else:
                    os.rmdir(orig_gemini_dir)
            if not os.path.exists(orig_gemini_dir):
                os.symlink(out_gemini_dir, orig_gemini_dir)
            cmd = [os.path.join(os.path.dirname(sys.executable), "gemini"), "update", "--dataonly"]
            do.run(cmd, "Download GEMINI data")
    genome_dir = os.path.join(out_dir, genome_build)
    # Test builds share sequence file names with the build they are derived from
    genome_build = genome_build.replace("-test", "")
    if need_remap or name == "samtools":
        return os.path.join(genome_dir, "seq", "%s.fa" % genome_build)
    else:
        ref_dir = os.path.join(genome_dir, REMAP_NAMES.get(name, [name])[-1])
        base_name = os.path.commonprefix(os.listdir(ref_dir))
        while base_name.endswith("."):
            base_name = base_name[:-1]
        return os.path.join(ref_dir, base_name)
def download_prepped_genome(genome_build, data, name, need_remap, out_dir=None):
    """Get a pre-prepared genome from S3, unpacking it locally.

    Supports runs on AWS where we can retrieve the resources on demand. Upgrades
    GEMINI in place if installed inside a Docker container with the biological data.
    GEMINI install requires write permissions to standard data directories -- works
    on AWS but not generalizable elsewhere.

    Args:
        genome_build: Build name; selects the sub-directory under ``out_dir``
            and is substituted into the remote archive location.
        data: Sample dictionary. Read for ``dirs/work``, ``genome_build`` and
            ``files``; ``genome_resources`` is set on it when an in-place
            index is built.
        name: Index/target name, expanded through ``REMAP_NAMES``.
        need_remap: When true, return the base FASTA path rather than an index.
        out_dir: Directory holding genomes; defaults to
            ``<work>/inputs/data/genomes``.

    Returns:
        The ``seq/<build>.fa`` path when ``need_remap`` is set or ``name`` is
        "samtools"; otherwise the common filename prefix (trailing dots
        removed) inside the last remapped index directory.

    Raises:
        ValueError: If connecting to the remote genome archive fails.
    """
    from bcbio.variation import population
    from bcbio import install
    if not out_dir:
        out_dir = utils.safe_makedir(os.path.join(tz.get_in(["dirs", "work"], data),
                                                  "inputs", "data", "genomes"))
    for target in REMAP_NAMES.get(name, [name]):
        ref_dir = os.path.join(out_dir, genome_build, target)
        if not os.path.exists(ref_dir):
            if target in INPLACE_INDEX:
                ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
                # Need to add genome resources so we can retrieve GTF files for STAR
                data["genome_resources"] = get_resources(data["genome_build"], ref_file, data)
                INPLACE_INDEX[target](ref_file, ref_dir, data)
            else:
                # XXX Currently only supports genomes from S3 us-east-1 bucket.
                # Need to assess how slow this is from multiple regions and generalize to non-AWS.
                fname = objectstore.BIODATA_INFO["s3"].format(build=genome_build, target=target)
                try:
                    objectstore.connect(fname)
                except Exception:
                    # Narrowed from a bare except so KeyboardInterrupt/SystemExit
                    # propagate instead of being reported as a missing genome.
                    raise ValueError("Could not find reference genome file %s %s" % (genome_build, name))
                with utils.chdir(out_dir):
                    cmd = objectstore.cl_input(fname, unpack=False, anonpipe=False) + " | pigz -d -c | tar -xvp"
                    # The template is filled from local variables, so local
                    # names in this function must stay stable.
                    do.run(cmd.format(**locals()), "Download pre-prepared genome data: %s" % genome_build)
    ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
    if data.get("genome_build"):
        if (data.get("files") and population.do_db_build([data], need_bam=False)
                and population.support_gemini_orig(data)):
            # symlink base GEMINI directory to work directory, avoiding write/space issues
            out_gemini_dir = utils.safe_makedir(os.path.join(os.path.dirname(ref_dir), "gemini_data"))
            orig_gemini_dir = install.get_gemini_dir()
            # Remove empty initial directory created by installer
            if os.path.isdir(orig_gemini_dir) and len(os.listdir(orig_gemini_dir)) == 0:
                if os.path.islink(orig_gemini_dir):
                    os.remove(orig_gemini_dir)
                else:
                    os.rmdir(orig_gemini_dir)
            if not os.path.exists(orig_gemini_dir):
                os.symlink(out_gemini_dir, orig_gemini_dir)
            cmd = [os.path.join(os.path.dirname(sys.executable), "gemini"), "update", "--dataonly"]
            do.run(cmd, "Download GEMINI data")
    genome_dir = os.path.join(out_dir, genome_build)
    # Test builds share sequence file names with the build they are derived from
    genome_build = genome_build.replace("-test", "")
    if need_remap or name == "samtools":
        return os.path.join(genome_dir, "seq", "%s.fa" % genome_build)
    else:
        ref_dir = os.path.join(genome_dir, REMAP_NAMES.get(name, [name])[-1])
        base_name = os.path.commonprefix(os.listdir(ref_dir))
        while base_name.endswith("."):
            base_name = base_name[:-1]
        return os.path.join(ref_dir, base_name)
def _default_conf_files(data):
    """Build the default vcfanno configurations, grouped by base path.

    Adds "gemini" when ``_has_gemini_data(data)`` holds, keyed by the GEMINI
    directory if ``is_human(data, builds=["37"])`` and by None otherwise.
    Adds "somatic" under the None key when ``_annotate_somatic(data)`` holds.

    Returns the ``(basepath, [configuration names])`` items of that grouping.
    """
    by_basepath = collections.defaultdict(list)
    if _has_gemini_data(data):
        if is_human(data, builds=["37"]):
            basepath = install.get_gemini_dir(data)
        else:
            basepath = None
        by_basepath[basepath].append("gemini")
    if _annotate_somatic(data):
        by_basepath[None].append("somatic")
    return by_basepath.items()
def prep_gemini_db(fnames, call_info, samples):
    """Prepare a gemini database from VCF inputs prepared with snpEff.

    ``call_info`` unpacks to ``(name, caller, is_batch)``. The database is
    written to ``<work>/gemini/<name>-<caller>.db`` and is only built when it
    does not exist yet, ``do_db_build`` approves and at least one input VCF
    has variants.

    Returns ``[[(name, caller), {"db": ..., "vcf": ...}]]`` where "db" is None
    if no database file exists and "vcf" is None unless ``is_batch`` is set.
    """
    data = samples[0]
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    name, caller, is_batch = call_info
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    # Quick check first, without the GEMINI check done by the full do_db_build
    use_gemini_quick = (do_db_build(samples, check_gemini=False)
                        and any(vcfutils.vcf_has_variants(f) for f in fnames))
    if (not utils.file_exists(gemini_db) and use_gemini_quick
            and do_db_build(samples)
            and any(vcfutils.vcf_has_variants(f) for f in fnames)):
        with file_transaction(data, gemini_db) as tx_gemini_db:
            config = data["config"]
            gemini = config_utils.get_program("gemini", config)
            gemini_ver = None
            if "program_versions" in config.get("resources", {}):
                gemini_ver = programs.get_version("gemini", config=config)
            version = LooseVersion(gemini_ver) if gemini_ver else None
            flags = []
            # Recent versions of gemini allow loading only passing variants
            if version is None or version > LooseVersion("0.6.2.1"):
                flags.append(" --passonly")
            # For small test files, skip gene table loading which takes a long time
            if version is not None and version > LooseVersion("0.6.4"):
                if _is_small_vcf(gemini_vcf):
                    flags.append(" --skip-gene-tables")
                if "/test_automated_output/" in gemini_vcf:
                    flags.append(" --test-mode")
            # Skip CADD when its data file is missing from the GEMINI directory
            if version is not None and version >= LooseVersion("0.7.0"):
                gemini_dir = install.get_gemini_dir()
                optional_data = [("--skip-cadd", "whole_genome_SNVs.tsv.compressed.gz")]
                for skip_cmd, check_file in optional_data:
                    if not os.path.exists(os.path.join(gemini_dir, check_file)):
                        flags.append(" %s" % skip_cmd)
            # skip gerp-bp which slows down loading
            flags.append(" --skip-gerp-bp ")
            load_opts = "".join(flags)
            num_cores = config["algorithm"].get("num_cores", 1)
            if tz.get_in(("config", "algorithm", "effects"), data, "snpeff") == "snpeff":
                eanns = "snpEff"
            else:
                eanns = "VEP"
            cmd = "{gemini} load {load_opts} -v {gemini_vcf} -t {eanns} --cores {num_cores} {tx_gemini_db}"
            cmd = cmd.format(gemini=gemini, load_opts=load_opts, gemini_vcf=gemini_vcf,
                             eanns=eanns, num_cores=num_cores, tx_gemini_db=tx_gemini_db)
            do.run(cmd, "Create gemini database for %s %s" % (name, caller), data)
    result = {
        "db": gemini_db if utils.file_exists(gemini_db) else None,
        "vcf": gemini_vcf if is_batch else None,
    }
    return [[(name, caller), result]]
def _run_vcfanno(gemini_vcf, data, use_gemini=False):
    """Annotate a VCF with the configured vcfanno files, if there are any.

    Falls back to the "gemini" configuration when nothing is configured and
    ``use_gemini`` is set. Returns the result of ``vcfanno.run_vcfanno``, or
    the input VCF unchanged when there is nothing to run.
    """
    if support_gemini_orig(data):
        basepath = install.get_gemini_dir(data)
    else:
        basepath = None
    confs = dd.get_vcfanno(data)
    if use_gemini and not confs:
        confs = ["gemini"]
    if not confs:
        return gemini_vcf
    return vcfanno.run_vcfanno(gemini_vcf, confs, data, basepath)
def create_gemini_db(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Build a GEMINI database from a VCF with ``gemini load``.

    ``gemini_db`` defaults to ``utils.splitext_plus(gemini_vcf)[0]`` plus
    ``.db``. An existing database is returned untouched. Returns None when a
    database must be built but the VCF has no variants. When ``ped_file`` is
    given it is added afterwards with ``gemini amend --sample``.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if utils.file_exists(gemini_db):
        return gemini_db
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    with file_transaction(data, gemini_db) as tx_gemini_db:
        config = data["config"]
        gemini = config_utils.get_program("gemini", config)
        gemini_ver = None
        if "program_versions" in config.get("resources", {}):
            gemini_ver = programs.get_version("gemini", config=config)
        version = LooseVersion(gemini_ver) if gemini_ver else None
        flags = []
        # Recent versions of gemini allow loading only passing variants
        if version is None or version > LooseVersion("0.6.2.1"):
            flags.append(" --passonly")
        # For small test files, skip gene table loading which takes a long time
        if version is not None and version > LooseVersion("0.6.4"):
            if _is_small_vcf(gemini_vcf):
                flags.append(" --skip-gene-tables")
            if "/test_automated_output/" in gemini_vcf:
                flags.append(" --test-mode")
        # Skip CADD when its data file is missing from the GEMINI directory
        if version is not None and version >= LooseVersion("0.7.0"):
            gemini_dir = install.get_gemini_dir(data)
            optional_data = [("--skip-cadd", "whole_genome_SNVs.tsv.compressed.gz")]
            for skip_cmd, check_file in optional_data:
                if not os.path.exists(os.path.join(gemini_dir, check_file)):
                    flags.append(" %s" % skip_cmd)
        # skip gerp-bp which slows down loading
        flags.append(" --skip-gerp-bp ")
        load_opts = "".join(flags)
        num_cores = config["algorithm"].get("num_cores", 1)
        tmpdir = os.path.dirname(tx_gemini_db)
        eanns = _get_effects_flag(data)
        # Apply custom resource specifications, allowing use of alternative annotation_dir
        resources = config_utils.get_resources("gemini", config)
        if resources.get("options"):
            gemini_opts = " ".join(str(opt) for opt in resources["options"])
        else:
            gemini_opts = ""
        load_cmd = ("{gemini} {gemini_opts} load {load_opts} -v {gemini_vcf} {eanns} --cores {num_cores} "
                    "--tempdir {tmpdir} {tx_gemini_db}")
        load_cmd = load_cmd.format(gemini=gemini, gemini_opts=gemini_opts, load_opts=load_opts,
                                   gemini_vcf=gemini_vcf, eanns=eanns, num_cores=num_cores,
                                   tmpdir=tmpdir, tx_gemini_db=tx_gemini_db)
        do.run(load_cmd, "Create gemini database for %s" % gemini_vcf, data)
        if ped_file:
            amend_cmd = [gemini, "amend", "--sample", ped_file, tx_gemini_db]
            do.run(amend_cmd, "Add PED file to gemini database", data)
    return gemini_db
def handle_vcf_calls(vcf_file, data, orig_items):
    """Prioritize VCF calls based on external annotations supplied through GEMINI.

    Returns the priority-filtered VCF when prioritization is requested, GEMINI
    data is available and vcfanno produces an annotated file. In every other
    case the input VCF is returned unchanged.
    """
    if not _do_prioritize(orig_items):
        return vcf_file
    if not population.has_gemini_data(data):
        # No GEMINI database for filtering, return original file
        return vcf_file
    if population.support_gemini_orig(data):
        basepath = install.get_gemini_dir(data)
    else:
        basepath = None
    ann_vcf = vcfanno.run_vcfanno(vcf_file, ["gemini"], data, basepath)
    if not ann_vcf:
        return vcf_file
    priority_file = _prep_priority_filter_vcfanno(ann_vcf, data)
    return _apply_priority_filter(vcf_file, priority_file, data)
def create_gemini_db(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Load variants into a GEMINI database with the vcfanno/vcf2db workflow.

    ``gemini_db`` defaults to ``utils.splitext_plus(gemini_vcf)[0]`` plus
    ``.db``. Returns None when the VCF has no variants. An existing database
    is returned without being rebuilt. Otherwise the VCF is annotated with the
    "gemini" vcfanno configuration and loaded with ``vcf2db.py``.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    if utils.file_exists(gemini_db):
        return gemini_db

    if support_gemini_orig(data):
        basepath = install.get_gemini_dir(data)
    else:
        basepath = None
    annotated_vcf = vcfanno.run_vcfanno(gemini_vcf, "gemini", data, basepath)
    with file_transaction(data, gemini_db) as tx_gemini_db:
        vcf2db = config_utils.get_program("vcf2db.py", data)
        do.run([vcf2db, annotated_vcf, ped_file, tx_gemini_db],
               "GEMINI: create database with vcf2db")
    return gemini_db
def prep_gemini_db(fnames, call_info, samples):
    """Prepare a gemini database from VCF inputs prepared with snpEff.

    ``call_info`` unpacks to ``(name, caller, is_batch)``. The database is
    written to ``<work>/gemini/<name>-<caller>.db`` and is only built when it
    does not exist yet, ``do_db_build`` approves and at least one input VCF
    has variants.

    Returns ``[[(name, caller), {"db": ..., "vcf": ...}]]`` where "db" is None
    if no database file exists and "vcf" is None unless ``is_batch`` is set.
    """
    data = samples[0]
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    name, caller, is_batch = call_info
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    # Quick check first, without the GEMINI check done by the full do_db_build
    use_gemini_quick = (do_db_build(samples, check_gemini=False)
                        and any(vcfutils.vcf_has_variants(f) for f in fnames))
    if (not utils.file_exists(gemini_db) and use_gemini_quick
            and do_db_build(samples)
            and any(vcfutils.vcf_has_variants(f) for f in fnames)):
        with file_transaction(data, gemini_db) as tx_gemini_db:
            config = data["config"]
            gemini = config_utils.get_program("gemini", config)
            gemini_ver = None
            if "program_versions" in config.get("resources", {}):
                gemini_ver = programs.get_version("gemini", config=config)
            version = LooseVersion(gemini_ver) if gemini_ver else None
            flags = []
            # Recent versions of gemini allow loading only passing variants
            if version is None or version > LooseVersion("0.6.2.1"):
                flags.append(" --passonly")
            # For small test files, skip gene table loading which takes a long time
            if version is not None and version > LooseVersion("0.6.4"):
                if _is_small_vcf(gemini_vcf):
                    flags.append(" --skip-gene-tables")
                if "/test_automated_output/" in gemini_vcf:
                    flags.append(" --test-mode")
            # Skip CADD or gerp-bp loading when their data files are missing
            if version is not None and version >= LooseVersion("0.7.0"):
                gemini_dir = install.get_gemini_dir()
                optional_data = [
                    ("--skip-cadd", "whole_genome_SNVs.tsv.compressed.gz"),
                    ("--skip-gerp-bp", "hg19.gerp.bw"),
                ]
                for skip_cmd, check_file in optional_data:
                    if not os.path.exists(os.path.join(gemini_dir, check_file)):
                        flags.append(" %s" % skip_cmd)
            load_opts = "".join(flags)
            num_cores = config["algorithm"].get("num_cores", 1)
            if tz.get_in(("config", "algorithm", "effects"), data, "snpeff") == "snpeff":
                eanns = "snpEff"
            else:
                eanns = "VEP"
            cmd = "{gemini} load {load_opts} -v {gemini_vcf} -t {eanns} --cores {num_cores} {tx_gemini_db}"
            cmd = cmd.format(gemini=gemini, load_opts=load_opts, gemini_vcf=gemini_vcf,
                             eanns=eanns, num_cores=num_cores, tx_gemini_db=tx_gemini_db)
            do.run(cmd, "Create gemini database for %s %s" % (name, caller), data)
    result = {
        "db": gemini_db if utils.file_exists(gemini_db) else None,
        "vcf": gemini_vcf if is_batch else None,
    }
    return [[(name, caller), result]]
def handle_vcf_calls(vcf_file, data, orig_items):
    """Prioritize VCF calls based on external annotations supplied through GEMINI.

    The input VCF is returned unchanged unless prioritization is requested,
    GEMINI data is available and vcfanno yields an annotated file; in that
    case the priority-filtered VCF is returned.
    """
    filtered = vcf_file
    if _do_prioritize(orig_items) and population.has_gemini_data(data):
        basepath = None
        if population.support_gemini_orig(data):
            basepath = install.get_gemini_dir(data)
        ann_vcf = vcfanno.run_vcfanno(vcf_file, ["gemini"], data, basepath)
        if ann_vcf:
            priority_file = _prep_priority_filter_vcfanno(ann_vcf, data)
            filtered = _apply_priority_filter(vcf_file, priority_file, data)
    # Without a GEMINI annotation to filter on this is still the original file
    return filtered
def _back_compatible_gemini(conf_files, data):
    """Provide old install directory for configuration with GEMINI supplied tidy VCFs.

    Handles new style (bcbio installed) and old style (GEMINI installed)
    configuration and data locations. Scans every existing ``gemini.conf``
    among ``conf_files`` for a ``file`` line whose value contains ``.tidy.``
    past its first character, and returns the GEMINI directory on the first
    match. Returns None otherwise, including when
    ``vcfanno.is_human(data, builds=["37"])`` is false.
    """
    if not vcfanno.is_human(data, builds=["37"]):
        return None
    for conf in conf_files:
        if not conf:
            continue
        if os.path.basename(conf) != "gemini.conf" or not os.path.exists(conf):
            continue
        with open(conf) as in_handle:
            for line in in_handle:
                if not line.startswith("file"):
                    continue
                # Value after the last "=", with quotes and padding removed
                value = line.strip().rpartition("=")[2]
                fname = value.replace('"', '').strip()
                if fname.find(".tidy.") > 0:
                    return install.get_gemini_dir(data)
    return None
def create_gemini_db_orig(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Original GEMINI specific data loader, only works with hg19/GRCh37.

    ``gemini_db`` defaults to ``utils.splitext_plus(gemini_vcf)[0]`` plus
    ``.db``. An existing database is returned untouched. Returns None when a
    database must be built but the VCF has no variants. When ``ped_file`` is
    given it is added afterwards with ``gemini amend --sample``.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if utils.file_exists(gemini_db):
        return gemini_db
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    with file_transaction(data, gemini_db) as tx_gemini_db:
        config = data["config"]
        gemini = config_utils.get_program("gemini", config)
        flags = []
        # Only passing variants are loaded unless "gemini_allvariants" is on
        if "gemini_allvariants" not in dd.get_tools_on(data):
            flags.append(" --passonly")
        # For small test files, skip gene table loading which takes a long time
        if _is_small_vcf(gemini_vcf):
            flags.append(" --skip-gene-tables")
        if "/test_automated_output/" in gemini_vcf:
            flags.append(" --test-mode")
        # Skip CADD when its data file is missing from the GEMINI directory
        gemini_dir = install.get_gemini_dir(data)
        optional_data = [("--skip-cadd", "whole_genome_SNVs.tsv.compressed.gz")]
        for skip_cmd, check_file in optional_data:
            if not os.path.exists(os.path.join(gemini_dir, check_file)):
                flags.append(" %s" % skip_cmd)
        # skip gerp-bp which slows down loading
        flags.append(" --skip-gerp-bp ")
        load_opts = "".join(flags)
        num_cores = config["algorithm"].get("num_cores", 1)
        tmpdir = os.path.dirname(tx_gemini_db)
        eanns = _get_effects_flag(data)
        # Apply custom resource specifications, allowing use of alternative annotation_dir
        resources = config_utils.get_resources("gemini", config)
        if resources.get("options"):
            gemini_opts = " ".join(str(opt) for opt in resources["options"])
        else:
            gemini_opts = ""
        exports = utils.local_path_export()
        load_cmd = ("{exports} {gemini} {gemini_opts} load {load_opts} "
                    "-v {gemini_vcf} {eanns} --cores {num_cores} "
                    "--tempdir {tmpdir} {tx_gemini_db}")
        load_cmd = load_cmd.format(exports=exports, gemini=gemini, gemini_opts=gemini_opts,
                                   load_opts=load_opts, gemini_vcf=gemini_vcf, eanns=eanns,
                                   num_cores=num_cores, tmpdir=tmpdir, tx_gemini_db=tx_gemini_db)
        do.run(load_cmd, "Create gemini database for %s" % gemini_vcf, data)
        if ped_file:
            amend_cmd = [gemini, "amend", "--sample", ped_file, tx_gemini_db]
            do.run(amend_cmd, "Add PED file to gemini database", data)
    return gemini_db
def create_gemini_db(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Build a GEMINI database from a VCF with ``gemini load``.

    ``gemini_db`` defaults to ``utils.splitext_plus(gemini_vcf)[0]`` plus
    ``.db``. An existing database is returned untouched. Returns None when a
    database must be built but the VCF has no variants. When ``ped_file`` is
    given it is added afterwards with ``gemini amend --sample``.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if utils.file_exists(gemini_db):
        return gemini_db
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    with file_transaction(data, gemini_db) as tx_gemini_db:
        config = data["config"]
        gemini = config_utils.get_program("gemini", config)
        gemini_ver = None
        if "program_versions" in config.get("resources", {}):
            gemini_ver = programs.get_version("gemini", config=config)
        version = LooseVersion(gemini_ver) if gemini_ver else None
        flags = []
        # Recent versions of gemini allow loading only passing variants
        if version is None or version > LooseVersion("0.6.2.1"):
            flags.append(" --passonly")
        # For small test files, skip gene table loading which takes a long time
        if version is not None and version > LooseVersion("0.6.4"):
            if _is_small_vcf(gemini_vcf):
                flags.append(" --skip-gene-tables")
            if "/test_automated_output/" in gemini_vcf:
                flags.append(" --test-mode")
        # Skip CADD when its data file is missing from the GEMINI directory
        if version is not None and version >= LooseVersion("0.7.0"):
            gemini_dir = install.get_gemini_dir(data)
            optional_data = [("--skip-cadd", "whole_genome_SNVs.tsv.compressed.gz")]
            for skip_cmd, check_file in optional_data:
                if not os.path.exists(os.path.join(gemini_dir, check_file)):
                    flags.append(" %s" % skip_cmd)
        # skip gerp-bp which slows down loading
        flags.append(" --skip-gerp-bp ")
        load_opts = "".join(flags)
        num_cores = config["algorithm"].get("num_cores", 1)
        tmpdir = os.path.dirname(tx_gemini_db)
        eanns = _get_effects_flag(data)
        # Apply custom resource specifications, allowing use of alternative annotation_dir
        resources = config_utils.get_resources("gemini", config)
        if resources.get("options"):
            gemini_opts = " ".join(str(opt) for opt in resources["options"])
        else:
            gemini_opts = ""
        load_cmd = ("{gemini} {gemini_opts} load {load_opts} -v {gemini_vcf} {eanns} --cores {num_cores} "
                    "--tempdir {tmpdir} {tx_gemini_db}")
        load_cmd = load_cmd.format(gemini=gemini, gemini_opts=gemini_opts, load_opts=load_opts,
                                   gemini_vcf=gemini_vcf, eanns=eanns, num_cores=num_cores,
                                   tmpdir=tmpdir, tx_gemini_db=tx_gemini_db)
        do.run(load_cmd, "Create gemini database for %s" % gemini_vcf, data)
        if ped_file:
            amend_cmd = [gemini, "amend", "--sample", ped_file, tx_gemini_db]
            do.run(amend_cmd, "Add PED file to gemini database", data)
    return gemini_db
def create_gemini_db(gemini_vcf, data, gemini_db=None):
    """Build a GEMINI database from a VCF with ``gemini load``.

    ``gemini_db`` defaults to ``utils.splitext_plus(gemini_vcf)[0]`` plus
    ``.db``. An existing database is returned untouched. Returns None when a
    database must be built but the VCF has no variants.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if utils.file_exists(gemini_db):
        return gemini_db
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    with file_transaction(data, gemini_db) as tx_gemini_db:
        config = data["config"]
        gemini = config_utils.get_program("gemini", config)
        gemini_ver = None
        if "program_versions" in config.get("resources", {}):
            gemini_ver = programs.get_version("gemini", config=config)
        version = LooseVersion(gemini_ver) if gemini_ver else None
        flags = []
        # Recent versions of gemini allow loading only passing variants
        if version is None or version > LooseVersion("0.6.2.1"):
            flags.append(" --passonly")
        # For small test files, skip gene table loading which takes a long time
        if version is not None and version > LooseVersion("0.6.4"):
            if _is_small_vcf(gemini_vcf):
                flags.append(" --skip-gene-tables")
            if "/test_automated_output/" in gemini_vcf:
                flags.append(" --test-mode")
        # Skip CADD when its data file is missing from the GEMINI directory
        if version is not None and version >= LooseVersion("0.7.0"):
            gemini_dir = install.get_gemini_dir()
            optional_data = [("--skip-cadd", "whole_genome_SNVs.tsv.compressed.gz")]
            for skip_cmd, check_file in optional_data:
                if not os.path.exists(os.path.join(gemini_dir, check_file)):
                    flags.append(" %s" % skip_cmd)
        # skip gerp-bp which slows down loading
        flags.append(" --skip-gerp-bp ")
        load_opts = "".join(flags)
        num_cores = config["algorithm"].get("num_cores", 1)
        if tz.get_in(("config", "algorithm", "effects"), data, "snpeff") == "snpeff":
            eanns = "snpEff"
        else:
            eanns = "VEP"
        cmd = "{gemini} load {load_opts} -v {gemini_vcf} -t {eanns} --cores {num_cores} {tx_gemini_db}"
        cmd = cmd.format(gemini=gemini, load_opts=load_opts, gemini_vcf=gemini_vcf,
                         eanns=eanns, num_cores=num_cores, tx_gemini_db=tx_gemini_db)
        do.run(cmd, "Create gemini database for %s" % gemini_vcf, data)
    return gemini_db
def create_gemini_db_orig(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Original GEMINI specific data loader, only works with hg19/GRCh37.

    ``gemini_db`` defaults to ``utils.splitext_plus(gemini_vcf)[0]`` plus
    ``.db``. An existing database is returned untouched. Returns None when a
    database must be built but the VCF has no variants. When ``ped_file`` is
    given it is added afterwards with ``gemini amend --sample``.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if utils.file_exists(gemini_db):
        return gemini_db
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    with file_transaction(data, gemini_db) as tx_gemini_db:
        config = data["config"]
        gemini = config_utils.get_program("gemini", config)
        flags = []
        # Only passing variants are loaded unless "gemini_allvariants" is on
        if "gemini_allvariants" not in dd.get_tools_on(data):
            flags.append(" --passonly")
        # For small test files, skip gene table loading which takes a long time
        if _is_small_vcf(gemini_vcf):
            flags.append(" --skip-gene-tables")
        if "/test_automated_output/" in gemini_vcf:
            flags.append(" --test-mode")
        # Skip CADD when its data file is missing from the GEMINI directory
        gemini_dir = install.get_gemini_dir(data)
        optional_data = [("--skip-cadd", "whole_genome_SNVs.tsv.compressed.gz")]
        for skip_cmd, check_file in optional_data:
            if not os.path.exists(os.path.join(gemini_dir, check_file)):
                flags.append(" %s" % skip_cmd)
        # skip gerp-bp which slows down loading
        flags.append(" --skip-gerp-bp ")
        load_opts = "".join(flags)
        num_cores = config["algorithm"].get("num_cores", 1)
        tmpdir = os.path.dirname(tx_gemini_db)
        eanns = _get_effects_flag(data)
        # Apply custom resource specifications, allowing use of alternative annotation_dir
        resources = config_utils.get_resources("gemini", config)
        if resources.get("options"):
            gemini_opts = " ".join(str(opt) for opt in resources["options"])
        else:
            gemini_opts = ""
        exports = utils.local_path_export()
        load_cmd = ("{exports} {gemini} {gemini_opts} load {load_opts} "
                    "-v {gemini_vcf} {eanns} --cores {num_cores} "
                    "--tempdir {tmpdir} {tx_gemini_db}")
        load_cmd = load_cmd.format(exports=exports, gemini=gemini, gemini_opts=gemini_opts,
                                   load_opts=load_opts, gemini_vcf=gemini_vcf, eanns=eanns,
                                   num_cores=num_cores, tmpdir=tmpdir, tx_gemini_db=tx_gemini_db)
        do.run(load_cmd, "Create gemini database for %s" % gemini_vcf, data)
        if ped_file:
            amend_cmd = [gemini, "amend", "--sample", ped_file, tx_gemini_db]
            do.run(amend_cmd, "Add PED file to gemini database", data)
    return gemini_db
def create_gemini_db(gemini_vcf, data, gemini_db=None):
    """Build a GEMINI database from a VCF with ``gemini load``.

    ``gemini_db`` defaults to ``utils.splitext_plus(gemini_vcf)[0]`` plus
    ``.db``. An existing database is returned untouched. Returns None when a
    database must be built but the VCF has no variants.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if utils.file_exists(gemini_db):
        return gemini_db
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    with file_transaction(data, gemini_db) as tx_gemini_db:
        config = data["config"]
        gemini = config_utils.get_program("gemini", config)
        gemini_ver = None
        if "program_versions" in config.get("resources", {}):
            gemini_ver = programs.get_version("gemini", config=config)
        version = LooseVersion(gemini_ver) if gemini_ver else None
        flags = []
        # Recent versions of gemini allow loading only passing variants
        if version is None or version > LooseVersion("0.6.2.1"):
            flags.append(" --passonly")
        # For small test files, skip gene table loading which takes a long time
        if version is not None and version > LooseVersion("0.6.4"):
            if _is_small_vcf(gemini_vcf):
                flags.append(" --skip-gene-tables")
            if "/test_automated_output/" in gemini_vcf:
                flags.append(" --test-mode")
        # Skip CADD when its data file is missing from the GEMINI directory
        if version is not None and version >= LooseVersion("0.7.0"):
            gemini_dir = install.get_gemini_dir()
            optional_data = [("--skip-cadd", "whole_genome_SNVs.tsv.compressed.gz")]
            for skip_cmd, check_file in optional_data:
                if not os.path.exists(os.path.join(gemini_dir, check_file)):
                    flags.append(" %s" % skip_cmd)
        # skip gerp-bp which slows down loading
        flags.append(" --skip-gerp-bp ")
        load_opts = "".join(flags)
        num_cores = config["algorithm"].get("num_cores", 1)
        if tz.get_in(("config", "algorithm", "effects"), data, "snpeff") == "snpeff":
            eanns = "snpEff"
        else:
            eanns = "VEP"
        cmd = "{gemini} load {load_opts} -v {gemini_vcf} -t {eanns} --cores {num_cores} {tx_gemini_db}"
        cmd = cmd.format(gemini=gemini, load_opts=load_opts, gemini_vcf=gemini_vcf,
                         eanns=eanns, num_cores=num_cores, tx_gemini_db=tx_gemini_db)
        do.run(cmd, "Create gemini database for %s" % gemini_vcf, data)
    return gemini_db
def _has_gemini(data):
    """Check for a non-empty GEMINI directory with a sibling configuration file.

    True only when the directory from ``install.get_gemini_dir(data)`` exists,
    contains at least one entry, and ``gemini-config.yaml`` exists in its
    parent directory.
    """
    from bcbio import install
    gemini_dir = install.get_gemini_dir(data)
    has_data = os.path.exists(gemini_dir) and len(os.listdir(gemini_dir)) > 0
    if not has_data:
        return False
    config_file = os.path.join(os.path.dirname(gemini_dir), "gemini-config.yaml")
    return os.path.exists(config_file)