def prep_db_parallel(samples, parallel_fn):
    """Prepares gemini databases in parallel, handling jointly called populations.

    Groups inputs into jointly-called batches and single samples, runs
    ``prep_gemini_db`` over them in parallel, then re-attaches the produced
    database file to each sample's variant entries under the "population" key.
    """
    batch_groups, singles, out_retrieve, extras = _group_by_batches(samples, _has_variant_calls)
    to_process = []
    has_batches = False
    for (name, caller), info in batch_groups.items():
        fnames = [x[0] for x in info]
        to_process.append([fnames, (str(name), caller, True), [x[1] for x in info], extras])
        has_batches = True
    for name, caller, data, fname in singles:
        to_process.append([[fname], (str(name), caller, False), [data], extras])
    # Skip database preparation entirely when nothing requires it: no database
    # build requested, no joint batches and no vcfanno configuration on any sample.
    # BUGFIX: the original passed a generator expression as the argument to
    # dd.get_vcfanno (``dd.get_vcfanno(x[0] for x in samples)``) instead of
    # checking each sample's configuration individually.
    if (len(samples) > 0 and not do_db_build([x[0] for x in samples]) and not has_batches
            and not any(dd.get_vcfanno(x[0]) for x in samples)):
        return samples
    output = parallel_fn("prep_gemini_db", to_process)
    out_fetch = {}
    for batch_id, out_file in output:
        out_fetch[tuple(batch_id)] = out_file
    out = []
    for batch_name, data in out_retrieve:
        out_variants = []
        for vrn in data["variants"]:
            # Variants may opt out of population-level databases.
            use_population = vrn.pop("population", True)
            if use_population:
                vrn["population"] = out_fetch[(batch_name, vrn["variantcaller"])]
            out_variants.append(vrn)
        data["variants"] = out_variants
        out.append([data])
    for x in extras:
        out.append([x])
    return out
def create_gemini_db(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Generalized vcfanno/vcf2db workflow for loading variants into a GEMINI database.

    Annotates the input VCF with vcfanno, then loads it with vcf2db. Returns
    the database path, or None when the input VCF has no variants.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    if utils.file_exists(gemini_db):
        return gemini_db
    # Original GEMINI installs resolve annotation sources against a data directory.
    data_basepath = install.get_gemini_dir(data) if support_gemini_orig(data) else None
    conf_files = dd.get_vcfanno(data) or ["gemini"]
    ann_file = vcfanno.run_vcfanno(gemini_vcf, conf_files, data, data_basepath)
    with file_transaction(data, gemini_db) as tx_gemini_db:
        vcf2db = config_utils.get_program("vcf2db.py", data)
        if "vcf2db_expand" in dd.get_tools_on(data):
            # Expand genotype columns into per-sample tables for easier querying.
            vcf2db_args = ["--expand", "gt_types",
                           "--expand", "gt_ref_depths",
                           "--expand", "gt_alt_depths"]
        else:
            vcf2db_args = []
        cmd = [vcf2db, ann_file, ped_file, tx_gemini_db] + vcf2db_args
        do.run(cmd, "GEMINI: create database with vcf2db")
    return gemini_db
def run_vcfanno(vcf_file, data, decomposed=False):
    """Run vcfanno, providing annotations from external databases.

    Groups user configurations by their data basepath, annotating sequentially
    so each run builds on the previous output. Returns the final annotated
    file, or None when no configurations apply.
    """
    user_confs = dd.get_vcfanno(data)
    if user_confs:
        if not isinstance(user_confs, (list, tuple)):
            user_confs = [user_confs]
        by_basepath = collections.defaultdict(list)
        for conf in user_confs:
            # GEMINI-based configurations on build 37 resolve relative to the
            # installed GEMINI data directory; everything else is standalone.
            if "gemini" in conf and is_human(data, builds=["37"]):
                basepath = install.get_gemini_dir(data)
            else:
                basepath = None
            by_basepath[basepath].append(conf)
        grouped = by_basepath.items()
    else:
        grouped = _default_conf_files(data)
    out_file = None
    for basepath, confs in (grouped or []):
        ann_file = vcfanno.run_vcfanno(vcf_file, confs, data,
                                       data_basepath=basepath, decomposed=decomposed)
        if ann_file:
            out_file = ann_file
            # Chain annotations: the next group annotates the previous output.
            vcf_file = ann_file
    return out_file
def prep_gemini_db(fnames, call_info, samples, extras):
    """Prepare a gemini database from VCF inputs prepared with snpEff.

    Builds the database only when requested by the samples and when the
    inputs contain variants. Returns batch identifier plus database/VCF paths.
    """
    data = samples[0]
    name, caller, is_batch = call_info
    use_gemini = do_db_build(samples) and any(vcfutils.vcf_has_variants(f) for f in fnames)
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    multisample_vcf = get_multisample_vcf(fnames, name, caller, data)
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    if use_gemini and not utils.file_exists(gemini_db):
        # Restrict to passing variants unless any sample requests all variants.
        passonly = not any("gemini_allvariants" in dd.get_tools_on(d) for d in samples)
        gemini_vcf = multiallelic.to_single(multisample_vcf, data, passonly=passonly)
        ped_file = create_ped_file(samples + extras, gemini_vcf)
        # Use original approach for hg19/GRCh37 pending additional testing
        if support_gemini_orig(data) and not any(dd.get_vcfanno(d) for d in samples):
            gemini_db = create_gemini_db_orig(gemini_vcf, data, gemini_db, ped_file)
        else:
            gemini_db = create_gemini_db(gemini_vcf, data, gemini_db, ped_file)
    return [[(name, caller),
             {"db": gemini_db if utils.file_exists(gemini_db) else None,
              "vcf": multisample_vcf if is_batch else None}]]
def find_annotations(data):
    """Find annotation configuration files for vcfanno, using pre-installed inputs.

    Creates absolute paths for user specified inputs and finds locally
    installed defaults.

    Default annotations if not specified:
      - gemini for variant pipelines
      - somatic for variant tumor pipelines
      - rnaedit for RNA-seq variant calling
    """
    conf_files = dd.get_vcfanno(data) or _default_conf_files(data)
    if not isinstance(conf_files, (list, tuple)):
        conf_files = [conf_files]
    # Named configurations resolve against the installed annotation directory.
    annodir = os.path.normpath(os.path.abspath(
        os.path.join(os.path.dirname(dd.get_ref_file(data)), os.pardir, "config", "vcfanno")))
    out = []
    for conf_file in conf_files:
        is_direct_path = utils.file_exists(conf_file) and os.path.isfile(conf_file)
        conffn = conf_file if is_direct_path else os.path.join(annodir, conf_file + ".conf")
        if utils.file_exists(conffn):
            out.append(conffn)
            # Pick up a companion lua helper file when present.
            luafn = "%s.lua" % utils.splitext_plus(conffn)[0]
            if os.path.exists(luafn):
                out.append(luafn)
        else:
            build = dd.get_genome_build(data)
            CONF_NOT_FOUND = (
                "The vcfanno configuration {conffn} was not found for {build}, skipping.")
            logger.warn(CONF_NOT_FOUND.format(**locals()))
    return out
def _run_vcfanno(gemini_vcf, data, use_gemini=False):
    """Annotate a VCF with vcfanno when configurations apply.

    Falls back to a "gemini" configuration for GEMINI builds without explicit
    configuration; returns the input VCF unchanged when nothing applies.
    """
    # Resolve the GEMINI data directory up front for original install layouts.
    basepath = install.get_gemini_dir(data) if support_gemini_orig(data) else None
    conf_files = dd.get_vcfanno(data)
    if not conf_files and use_gemini:
        conf_files = ["gemini"]
    if not conf_files:
        return gemini_vcf
    return vcfanno.run_vcfanno(gemini_vcf, conf_files, data, basepath)
def find_annotations(data, retriever=None):
    """Find annotation configuration files for vcfanno, using pre-installed inputs.

    Creates absolute paths for user specified inputs and finds locally
    installed defaults.

    Default annotations:
      - gemini for variant pipelines
      - somatic for variant tumor pipelines
      - rnaedit for RNA-seq variant calling
    """
    conf_files = dd.get_vcfanno(data)
    if not isinstance(conf_files, (list, tuple)):
        conf_files = [conf_files]
    # Append defaults not already requested explicitly by the user.
    for c in _default_conf_files(data, retriever):
        if c not in conf_files:
            conf_files.append(c)
    # Known configurations with extra input requirements; the checker confirms
    # all inputs are available before including the configuration.
    conf_checkers = {"gemini": annotate_gemini, "somatic": _annotate_somatic}
    out = []
    annodir = os.path.normpath(
        os.path.join(os.path.dirname(dd.get_ref_file(data)), os.pardir, "config", "vcfanno"))
    if not retriever:
        # Local runs need absolute paths; remote retrieval resolves separately.
        annodir = os.path.abspath(annodir)
    for conf_file in conf_files:
        # A configuration is either a direct file reference (remote or a local
        # file) or a bare name resolved against the annotation directory.
        if objectstore.is_remote(conf_file) or (os.path.exists(conf_file)
                                                and os.path.isfile(conf_file)):
            conffn = conf_file
        elif not retriever:
            conffn = os.path.join(annodir, conf_file + ".conf")
        else:
            conffn = conf_file + ".conf"
        luafn = "%s.lua" % utils.splitext_plus(conffn)[0]
        if retriever:
            # Resolve through remote storage; anything that did not resolve
            # to a remote path becomes None and is skipped below.
            conffn, luafn = [
                (x if objectstore.is_remote(x) else None)
                for x in retriever.add_remotes([conffn, luafn], data["config"])
            ]
        if not conffn:
            pass
        elif conf_file in conf_checkers and not conf_checkers[conf_file](
                data, retriever):
            logger.warn(
                "Skipping vcfanno configuration: %s. Not all input files found."
                % conf_file)
        elif not objectstore.file_exists_or_remote(conffn):
            build = dd.get_genome_build(data)
            CONF_NOT_FOUND = (
                "The vcfanno configuration {conffn} was not found for {build}, skipping."
            )
            logger.warn(CONF_NOT_FOUND.format(**locals()))
        else:
            out.append(conffn)
            # Include the companion lua file when present alongside the conf.
            if luafn and objectstore.file_exists_or_remote(luafn):
                out.append(luafn)
    return out
def run_vcfanno(vcf_file, data, decomposed=False):
    """Run vcfanno, providing annotations from external databases if needed.

    Puts together lua and conf files from multiple inputs by file names,
    grouping conf/lua pairs by basename and data basepath, then running
    vcfanno once per basepath. Returns the final annotated file or None
    when no configurations apply.
    """
    conf_files = dd.get_vcfanno(data)
    if conf_files:
        with_basepaths = collections.defaultdict(list)
        gemini_basepath = _back_compatible_gemini(conf_files, data)
        for f in conf_files:
            name = os.path.splitext(os.path.basename(f))[0]
            # A configuration can be given as a lua file or a conf file;
            # conf files pick up a same-named lua companion when it exists.
            if f.endswith(".lua"):
                conf_file = None
                lua_file = f
            else:
                conf_file = f
                lua_file = "%s.lua" % utils.splitext_plus(conf_file)[0]
            if lua_file and not os.path.exists(lua_file):
                lua_file = None
            # Only the gemini configuration resolves against the GEMINI data dir.
            data_basepath = gemini_basepath if name == "gemini" else None
            if conf_file and os.path.exists(conf_file):
                with_basepaths[(data_basepath, name)].append(conf_file)
            if lua_file and os.path.exists(lua_file):
                with_basepaths[(data_basepath, name)].append(lua_file)
        conf_files = with_basepaths.items()
    out_file = None
    if conf_files:
        VcfannoIn = collections.namedtuple("VcfannoIn", ["conf", "lua"])
        bp_files = collections.defaultdict(list)
        for (data_basepath, name), anno_files in conf_files:
            anno_files = list(set(anno_files))
            if len(anno_files) == 1:
                # NOTE(review): if the single file is a standalone .lua with no
                # matching conf, it is passed in the conf position here —
                # confirm whether vcfanno accepts that or the group should be
                # skipped instead.
                cur = VcfannoIn(anno_files[0], None)
            elif len(anno_files) == 2:
                lua_files = [x for x in anno_files if x.endswith(".lua")]
                assert len(lua_files) == 1, anno_files
                lua_file = lua_files[0]
                anno_files.remove(lua_file)
                cur = VcfannoIn(anno_files[0], lua_file)
            else:
                raise ValueError("Unexpected annotation group %s" % anno_files)
            bp_files[data_basepath].append(cur)
        for data_basepath, anno_files in bp_files.items():
            ann_file = vcfanno.run(vcf_file, [x.conf for x in anno_files],
                                   [x.lua for x in anno_files], data,
                                   basepath=data_basepath, decomposed=decomposed)
            if ann_file:
                out_file = ann_file
                # Chain annotations: the next basepath annotates the previous output.
                vcf_file = ann_file
    return out_file
def find_annotations(data, retriever=None):
    """Find annotation configuration files for vcfanno, using pre-installed inputs.

    Creates absolute paths for user specified inputs and finds locally
    installed defaults.

    Default annotations:
      - gemini for variant pipelines
      - somatic for variant tumor pipelines
      - rnaedit for RNA-seq variant calling
    """
    requested = dd.get_vcfanno(data)
    if not isinstance(requested, (list, tuple)):
        requested = [requested]
    # Fold in defaults the user has not explicitly requested.
    for default in _default_conf_files(data, retriever):
        if default not in requested:
            requested.append(default)
    checkers = {"gemini": annotate_gemini, "somatic": _annotate_somatic}
    base_dir = os.path.normpath(os.path.join(os.path.dirname(dd.get_ref_file(data)),
                                             os.pardir, "config", "vcfanno"))
    if not retriever:
        base_dir = os.path.abspath(base_dir)
    out = []
    for conf_name in requested:
        # Direct file reference (remote or local) versus a bare configuration
        # name resolved against the installed annotation directory.
        if objectstore.is_remote(conf_name) or (os.path.exists(conf_name)
                                                and os.path.isfile(conf_name)):
            conffn = conf_name
        elif not retriever:
            conffn = os.path.join(base_dir, conf_name + ".conf")
        else:
            conffn = conf_name + ".conf"
        luafn = "%s.lua" % utils.splitext_plus(conffn)[0]
        if retriever:
            # Anything that does not resolve to a remote path becomes None.
            resolved = retriever.add_remotes([conffn, luafn], data["config"])
            conffn, luafn = [x if objectstore.is_remote(x) else None for x in resolved]
        if not conffn:
            continue
        checker = checkers.get(conf_name)
        if checker is not None and not checker(data, retriever):
            logger.warn("Skipping vcfanno configuration: %s. Not all input files found." % conf_name)
            continue
        if not objectstore.file_exists_or_remote(conffn):
            build = dd.get_genome_build(data)
            CONF_NOT_FOUND = (
                "The vcfanno configuration {conffn} was not found for {build}, skipping.")
            logger.warn(CONF_NOT_FOUND.format(**locals()))
            continue
        out.append(conffn)
        # Include the companion lua helper when present alongside the conf.
        if luafn and objectstore.file_exists_or_remote(luafn):
            out.append(luafn)
    return out
def run_vcfanno(vcf_file, data, decomposed=False):
    """Run vcfanno, providing annotations from external databases.

    User-supplied configurations run with no data basepath; otherwise the
    default configurations (with their own basepaths) apply. Returns the
    final annotated file or None when nothing applies.
    """
    user_confs = dd.get_vcfanno(data)
    grouped = [(None, user_confs)] if user_confs else _default_conf_files(data)
    out_file = None
    if grouped:
        for basepath, confs in grouped:
            ann_file = vcfanno.run_vcfanno(vcf_file, confs, data,
                                           data_basepath=basepath, decomposed=decomposed)
            if ann_file:
                out_file = ann_file
                # Chain annotations: the next group annotates the previous output.
                vcf_file = ann_file
    return out_file
def prep_gemini_db(fnames, call_info, samples, extras):
    """Prepare a gemini database from VCF inputs prepared with snpEff.

    Combines inputs into a multi-sample VCF, decomposes multi-allelic sites
    and annotates with vcfanno when a database build is requested, then loads
    the result into a GEMINI database.
    """
    data = samples[0]
    name, caller, is_batch = call_info
    use_gemini = do_db_build(samples) and any(vcfutils.vcf_has_variants(f) for f in fnames)
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    if use_gemini:
        # Restrict to passing variants unless any sample requests all variants.
        passonly = not any("gemini_allvariants" in dd.get_tools_on(d) for d in samples)
        gemini_vcf = multiallelic.to_single(gemini_vcf, data, passonly=passonly)
        gemini_vcf = _run_vcfanno(gemini_vcf, data, use_gemini)
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    if (vcfutils.vcf_has_variants(gemini_vcf) and use_gemini
            and not utils.file_exists(gemini_db)):
        ped_file = create_ped_file(samples + extras, gemini_vcf)
        # Use original approach for hg19/GRCh37 pending additional testing
        if support_gemini_orig(data) and not any(dd.get_vcfanno(d) for d in samples):
            gemini_db = create_gemini_db_orig(gemini_vcf, data, gemini_db, ped_file)
        else:
            gemini_db = create_gemini_db(gemini_vcf, data, gemini_db, ped_file)
    return [[(name, caller),
             {"db": gemini_db if utils.file_exists(gemini_db) else None,
              "vcf": gemini_vcf,
              "decomposed": use_gemini}]]