def main(snpeff_jar, vcf_ref, genome, interval_file=None):
    """Run ``snpeff_effects`` on a single VCF or on each filtered VCF in a directory.

    If ``vcf_ref`` is a directory, every file inside it matching
    ``*-snp-filter.vcf`` is processed in sorted order; otherwise ``vcf_ref``
    itself is used as the only input file.

    Args:
        snpeff_jar: Forwarded unchanged as the first argument of ``snpeff_effects``.
        vcf_ref: Path to a VCF file, or to a directory containing VCF files.
        genome: Forwarded unchanged to ``snpeff_effects``.
        interval_file: Optional value forwarded unchanged to ``snpeff_effects``.
    """
    targets = (
        sorted(glob.glob(os.path.join(vcf_ref, "*-snp-filter.vcf")))
        if os.path.isdir(vcf_ref)
        else [vcf_ref]
    )
    for target in targets:
        snpeff_effects(snpeff_jar, target, genome, interval_file)
def variation_effects(vrn_file, genome_file, genome_build, config):
    """Compute variant effects and, when available, annotate the input with them.

    Calls ``snpeff_effects`` to obtain a VCF and a text output. If the VCF
    output is truthy it is handed to ``annotate_effects`` together with the
    original variant file and genome file; otherwise no annotation is run.

    Returns:
        A ``(annotated_vcf, snpeff_txt)`` tuple, where ``annotated_vcf`` is
        ``None`` when ``snpeff_effects`` produced no VCF output.
    """
    effects_vcf, effects_txt = snpeff_effects(vrn_file, genome_build, config)
    if not effects_vcf:
        return None, effects_txt
    annotated = annotate_effects(vrn_file, effects_vcf, genome_file, config)
    return annotated, effects_txt
def variation_effects(vrn_file, genome_file, genome_build, config):
    """Compute variant effects, associating them with transcripts.

    Delegates to ``snpeff_effects`` and returns its result (the effects
    file). The GATK annotator is no longer run because it requires an old
    version of snpEff; ``genome_file`` is accepted but not used here.
    """
    effects_out = snpeff_effects(vrn_file, genome_build, config)
    return effects_out
def postprocess_variants(data):
    """Post-process variant calls: effects annotation, filtering and prioritization.

    Work is only done when ``data`` has truthy ``align_bam`` and ``vrn_file``
    entries. The effects tool is chosen by ``effects.get_type(data)``
    (``"snpeff"`` or ``"vep"``; a falsy value skips annotation). ``data`` is
    updated in place with each new ``vrn_file``.

    Returns:
        ``[[data]]`` -- the (possibly updated) input wrapped in two lists.

    Raises:
        ValueError: If the configured effects type is truthy but is neither
            ``"snpeff"`` nor ``"vep"``.
    """
    sample_label = "%s, %s" % (data["name"][-1], get_variantcaller(data))
    logger.info("Finalizing variant calls: %s" % sample_label)
    if not (data.get("align_bam") and data.get("vrn_file")):
        return [[data]]

    logger.info("Calculating variation effects for %s" % sample_label)
    effect_kind = effects.get_type(data)
    if effect_kind:
        if effect_kind == "vep":
            annotated = effects.run_vep(data)
        elif effect_kind == "snpeff":
            annotated = effects.snpeff_effects(data)
        else:
            raise ValueError(
                "Unexpected variant effects configuration: %s" % effect_kind)
        # Keep the existing variant file when annotation produced nothing
        if annotated:
            data["vrn_file"] = annotated

    logger.info("Filtering for %s" % sample_label)
    filtered = variant_filtration(
        data["vrn_file"],
        data["sam_ref"],
        tz.get_in(("genome_resources", "variation"), data, {}),
        data,
    )
    data["vrn_file"] = filtered

    logger.info("Prioritization for %s" % sample_label)
    data["vrn_file"] = prioritize.handle_vcf_calls(data["vrn_file"], data)
    return [[data]]
def main(config_file, env, cores):
    """Prepare, merge and annotate Illumina sample VCFs, then build downstream outputs.

    Steps, in order: read the configuration and identifier remapping, stop if
    any sample lacks an identifier, validate against the fam file, drop
    excluded samples, run ``run_illumina_prep`` per sample through joblib,
    merge the results, annotate the merged file with
    ``effects.snpeff_effects``, call ``population.prep_gemini_db``, and write
    a sample-excluded VCF that is passed to ``prepare_plink_vcftools``.

    Args:
        config_file: Configuration file handed to ``read_config``.
        env: Second argument handed to ``read_config``.
        cores: Number of cores to use; converted with ``int``.

    Raises:
        NotImplementedError: If any input sample has an ``id`` of ``None``;
            the affected samples are printed first.
    """
    cores = int(cores)
    config = read_config(config_file, env)
    idremap = read_remap_file(config["runinfo"]["idmapping"])
    exclude = read_priority_file(config["runinfo"]["priority"], idremap)
    samples = list(get_input_samples(config["inputs"], idremap))

    problem = [x for x in samples if x["id"] is None]
    if problem:
        # Single-argument print(...) calls produce the same output on
        # Python 2 and Python 3, unlike the bare print statement.
        print("Problem identifiers")
        for p in problem:
            print("%s %s" % (p["illuminaid"], os.path.basename(p["dir"])))
        raise NotImplementedError

    check_fam(samples, config["runinfo"]["fam"])
    config["algorithm"] = {"num_cores": cores}
    samples = [
        s for s in samples
        if s["id"] is not None and s["id"] not in exclude
    ]
    print("Processing %s samples" % len(samples))
    out_files = list(
        joblib.Parallel(cores)(
            joblib.delayed(run_illumina_prep)(s, config) for s in samples))
    merge_file = merge_vcf_files(out_files, cores, config)

    ref_file = config["ref"]["GRCh37"]
    effects_file = effects.snpeff_effects({
        "vrn_file": merge_file,
        "sam_ref": ref_file,
        "reference": {"fasta": {"base": ref_file}},
        "genome_resources": {"aliases": {"snpeff": "GRCh37.74"}},
        "genome_build": "GRCh37",
        "config": config,
    })

    data = {"config": config, "dirs": {"work": os.getcwd()}, "name": [""]}
    gemini_db = population.prep_gemini_db(
        [os.path.join(os.getcwd(), effects_file)],
        [utils.splitext_plus(config["outputs"]["merge"])[0], "casava", True],
        [{
            "config": config,
            "work_bam": "yes",
            "genome_build": "GRCh37",
            "genome_resources": {"aliases": {"human": True}},
        }],
        data)[0][1]["db"]
    print(gemini_db)

    noexclude_file = "%s-noexclude%s" % utils.splitext_plus(effects_file)
    noexclude_file = vcfutils.exclude_samples(
        effects_file, noexclude_file, exclude, ref_file, config)
    prepare_plink_vcftools(noexclude_file, config)
def variation_effects(vrn_file, genome_build, config):
    """Compute variant effects, associating them with transcripts.

    Builds the path to ``snpEff.jar`` from ``config["program"]["snpEff"]``
    and forwards it to ``snpeff_effects`` along with the optional
    ``hybrid_target`` and ``java_memory`` entries of ``config["algorithm"]``
    (``None`` when absent).

    Returns:
        Whatever ``snpeff_effects`` returns.
    """
    snpeff_jar = os.path.join(config["program"]["snpEff"], "snpEff.jar")
    algorithm = config["algorithm"]
    java_memory = algorithm.get("java_memory", None)
    hybrid_target = algorithm.get("hybrid_target", None)
    return snpeff_effects(
        snpeff_jar, vrn_file, genome_build, hybrid_target, java_memory)
def main(dirname, config, cores):
    """Merge the VCF files found under ``dirname``, annotate them and load GEMINI.

    The VCFs located by ``find_vcf_files`` are prepared and merged, the merged
    file is annotated through ``effects.snpeff_effects`` using GRCh37
    settings, and the result is passed to ``load_gemini_db`` together with
    ``config["ped"]``. Nothing is returned; the value from ``load_gemini_db``
    is discarded.
    """
    prepped_files = prep_vcf_files(find_vcf_files(dirname), cores, config)
    merged_file = merge_vcf_files(prepped_files, cores, config)
    effects_input = {
        "vrn_file": merged_file,
        "genome_resources": {"aliases": {"snpeff": "GRCh37"}},
        "genome_build": "GRCh37",
        "config": config,
    }
    effects_file = effects.snpeff_effects(effects_input)
    load_gemini_db(effects_file, config["ped"], cores)
def postprocess_variants(data):
    """Finalize variant calls and annotate them with effects.

    When ``data["work_bam"]`` and ``data.get("vrn_file")`` are both truthy,
    the variant file is passed through ``finalize_genotyper`` and then
    ``effects.snpeff_effects``; a truthy annotation result replaces
    ``data["vrn_file"]``. ``data`` is modified in place.

    Returns:
        ``[[data]]`` -- the (possibly updated) input wrapped in two lists.
    """
    sample_name = str(data["name"])
    logger.info("Finalizing variant calls: %s" % sample_name)
    if not (data["work_bam"] and data.get("vrn_file")):
        return [[data]]

    data["vrn_file"] = finalize_genotyper(
        data["vrn_file"], data["work_bam"], data["sam_ref"], data["config"])
    logger.info("Calculating variation effects for %s" % sample_name)
    annotated = effects.snpeff_effects(data)
    # Keep the finalized file when annotation produced nothing
    if annotated:
        data["vrn_file"] = annotated
    return [[data]]
def postprocess_variants(data):
    """Post-process variant calls: filtering followed by effects annotation.

    When ``data["work_bam"]`` and ``data.get("vrn_file")`` are both truthy,
    the variant file is filtered with ``variant_filtration`` (using
    ``data["genome_resources"]["variation"]``) and then annotated with
    ``effects.snpeff_effects``; a truthy annotation result replaces
    ``data["vrn_file"]``. ``data`` is modified in place.

    Returns:
        ``[[data]]`` -- the (possibly updated) input wrapped in two lists.
    """
    sample_name = str(data["name"])
    logger.info("Finalizing variant calls: %s" % sample_name)
    if not (data["work_bam"] and data.get("vrn_file")):
        return [[data]]

    data["vrn_file"] = variant_filtration(
        data["vrn_file"],
        data["sam_ref"],
        data["genome_resources"]["variation"],
        data["config"],
    )
    logger.info("Calculating variation effects for %s" % sample_name)
    annotated = effects.snpeff_effects(data)
    # Keep the filtered file when annotation produced nothing
    if annotated:
        data["vrn_file"] = annotated
    return [[data]]
def postprocess_variants(data):
    """Finalize variant calls and annotate them with effects.

    Runs only when ``data["config"]["algorithm"]["snpcall"]`` is truthy. The
    variant file is passed through ``finalize_genotyper`` and then
    ``effects.snpeff_effects`` (called with the variant file, genome build
    and config); a truthy annotation result replaces ``data["vrn_file"]``.
    ``data`` is modified in place.

    Returns:
        ``[[data]]`` -- the (possibly updated) input wrapped in two lists.
    """
    if not data["config"]["algorithm"]["snpcall"]:
        return [[data]]

    sample_name = str(data["name"])
    logger.info("Finalizing variant calls: %s" % sample_name)
    data["vrn_file"] = finalize_genotyper(
        data["vrn_file"], data["work_bam"], data["sam_ref"], data["config"])
    logger.info("Calculating variation effects for %s" % sample_name)
    annotated = effects.snpeff_effects(
        data["vrn_file"], data["genome_build"], data["config"])
    # Keep the finalized file when annotation produced nothing
    if annotated:
        data["vrn_file"] = annotated
    return [[data]]
def postprocess_variants(data):
    """Post-process variant calls: filtering followed by effects annotation.

    When ``data["work_bam"]`` and ``data.get("vrn_file")`` are both truthy,
    the variant file is filtered with ``variant_filtration`` using the
    result of ``configured_vrn_files`` and then annotated with
    ``effects.snpeff_effects``; a truthy annotation result replaces
    ``data["vrn_file"]``. ``data`` is modified in place.

    Returns:
        ``[[data]]`` -- the (possibly updated) input wrapped in two lists.
    """
    sample_name = str(data["name"])
    logger.info("Finalizing variant calls: %s" % sample_name)
    if not (data["work_bam"] and data.get("vrn_file")):
        return [[data]]

    resource_files = configured_vrn_files(data["config"], data["sam_ref"])
    data["vrn_file"] = variant_filtration(
        data["vrn_file"], data["sam_ref"], resource_files, data["config"])
    logger.info("Calculating variation effects for %s" % sample_name)
    annotated = effects.snpeff_effects(data)
    # Keep the filtered file when annotation produced nothing
    if annotated:
        data["vrn_file"] = annotated
    return [[data]]
def postprocess_variants(data):
    """Post-process variant calls: filtering followed by effects annotation.

    Log messages identify the sample by the last entry of ``data["name"]``
    and the result of ``get_variantcaller(data)``. When ``data["work_bam"]``
    and ``data.get("vrn_file")`` are both truthy, the variant file is
    filtered with ``variant_filtration`` and then annotated with
    ``effects.snpeff_effects``; a truthy annotation result replaces
    ``data["vrn_file"]``. ``data`` is modified in place.

    Returns:
        ``[[data]]`` -- the (possibly updated) input wrapped in two lists.
    """
    sample_label = "%s, %s" % (data["name"][-1], get_variantcaller(data))
    logger.info("Finalizing variant calls: %s" % sample_label)
    if not (data["work_bam"] and data.get("vrn_file")):
        return [[data]]

    data["vrn_file"] = variant_filtration(
        data["vrn_file"],
        data["sam_ref"],
        data["genome_resources"]["variation"],
        data["config"],
    )
    logger.info("Calculating variation effects for %s" % sample_label)
    annotated = effects.snpeff_effects(data)
    # Keep the filtered file when annotation produced nothing
    if annotated:
        data["vrn_file"] = annotated
    return [[data]]
def postprocess_variants(data):
    """Post-process variant calls: effects annotation followed by filtering.

    When ``data.get("align_bam")`` and ``data.get("vrn_file")`` are both
    truthy, the variant file is annotated with ``effects.snpeff_effects`` (a
    truthy result replaces ``data["vrn_file"]``) and then filtered with
    ``variant_filtration``. ``data`` is modified in place.

    Returns:
        ``[[data]]`` -- the (possibly updated) input wrapped in two lists.
    """
    sample_label = "%s, %s" % (data["name"][-1], get_variantcaller(data))
    logger.info("Finalizing variant calls: %s" % sample_label)
    if not (data.get("align_bam") and data.get("vrn_file")):
        return [[data]]

    logger.info("Calculating variation effects for %s" % sample_label)
    annotated = effects.snpeff_effects(data)
    # Keep the existing variant file when annotation produced nothing
    if annotated:
        data["vrn_file"] = annotated

    logger.info("Filtering for %s" % sample_label)
    data["vrn_file"] = variant_filtration(
        data["vrn_file"],
        data["sam_ref"],
        data["genome_resources"]["variation"],
        data,
    )
    return [[data]]
def postprocess_variants(data):
    """Finalize variant calls, annotate effects and compare to reference materials.

    When ``data["work_bam"]`` is truthy, the variant file is passed through
    ``finalize_genotyper`` and ``effects.snpeff_effects`` (a truthy result
    replaces ``data["vrn_file"]``), and ``data`` is then replaced by the
    result of ``validate.compare_to_rm``.

    NOTE(review): the comparison step is assumed to belong to the
    ``work_bam`` branch -- confirm against the original layout.

    Returns:
        ``[[data]]`` -- the resulting sample wrapped in two lists.
    """
    sample_name = str(data["name"])
    logger.info("Finalizing variant calls: %s" % sample_name)
    if not data["work_bam"]:
        return [[data]]

    data["vrn_file"] = finalize_genotyper(
        data["vrn_file"], data["work_bam"], data["sam_ref"], data["config"])
    logger.info("Calculating variation effects for %s" % sample_name)
    annotated = effects.snpeff_effects(
        data["vrn_file"], data["genome_build"], data["config"])
    # Keep the finalized file when annotation produced nothing
    if annotated:
        data["vrn_file"] = annotated
    data = validate.compare_to_rm(data)
    return [[data]]
def main(dirname, config, cores):
    """Merge the VCF files found under ``dirname``, annotate them and load GEMINI.

    The VCFs located by ``find_vcf_files`` are prepared and merged, the merged
    file is annotated through ``effects.snpeff_effects`` using GRCh37
    settings, and the result is passed to ``load_gemini_db`` together with
    ``config["ped"]``. Nothing is returned; the value from ``load_gemini_db``
    is discarded.
    """
    located = find_vcf_files(dirname)
    merged_file = merge_vcf_files(
        prep_vcf_files(located, cores, config), cores, config)
    snpeff_aliases = {"aliases": {"snpeff": "GRCh37"}}
    effects_file = effects.snpeff_effects({
        "vrn_file": merged_file,
        "genome_resources": snpeff_aliases,
        "genome_build": "GRCh37",
        "config": config,
    })
    load_gemini_db(effects_file, config["ped"], cores)
def _run_ensemble_w_caller(batch_id, vrn_files, bam_files, base_dir, edata):
    """Run ensemble calling, re-calling the inputs with a variant caller.

    Uses the bcbio.variation.recall method plus an external variant caller.
    The ensemble VCF is written to ``<base_dir>/<batch_id>-ensemble.vcf`` and
    the command is skipped when ``utils.file_exists`` reports that file is
    already present. The output is then annotated via
    ``effects.snpeff_effects`` on a deep copy of ``edata``.

    Returns:
        A dict with ``variantcaller`` set to ``"ensemble"``, ``vrn_file`` set
        to the annotated output and ``bed_file`` set to ``None``.
    """
    ensemble_vcf = os.path.join(base_dir, "{0}-ensemble.vcf".format(batch_id))
    if not utils.file_exists(ensemble_vcf):
        caller = edata["config"]["algorithm"]["ensemble"]["caller"]
        recall_prog = config_utils.get_program(
            "bcbio-variation-recall", edata["config"])
        num_cores = edata["config"]["algorithm"].get("num_cores", 1)
        fixed_args = [
            recall_prog,
            "ensemble",
            "--cores=%s" % num_cores,
            "--caller=%s" % caller,
            ensemble_vcf,
            edata["sam_ref"],
        ]
        cmd = fixed_args + vrn_files + bam_files
        do.run(cmd, "Ensemble calling with %s: %s" % (caller, batch_id))

    # Annotate on a copy so the caller's edata is left untouched
    effects_input = copy.deepcopy(edata)
    effects_input["vrn_file"] = ensemble_vcf
    result = {
        "variantcaller": "ensemble",
        "vrn_file": effects.snpeff_effects(effects_input),
        "bed_file": None,
    }
    return result
def postprocess_variants(data):
    """Post-process variant calls: effects annotation followed by filtering.

    Work is only done when ``data`` has truthy ``align_bam`` and ``vrn_file``
    entries. The effects tool is read from ``config -> algorithm -> effects``
    and defaults to ``"snpeff"``; ``"vep"`` is the other accepted value and a
    falsy value skips annotation. ``data`` is updated in place.

    Returns:
        ``[[data]]`` -- the (possibly updated) input wrapped in two lists.

    Raises:
        ValueError: If the configured effects type is truthy but is neither
            ``"snpeff"`` nor ``"vep"``.
    """
    sample_label = "%s, %s" % (data["name"][-1], get_variantcaller(data))
    logger.info("Finalizing variant calls: %s" % sample_label)
    if not (data.get("align_bam") and data.get("vrn_file")):
        return [[data]]

    logger.info("Calculating variation effects for %s" % sample_label)
    effect_kind = tz.get_in(("config", "algorithm", "effects"), data, "snpeff")
    if effect_kind:
        if effect_kind == "vep":
            annotated = effects.run_vep(data)
        elif effect_kind == "snpeff":
            annotated = effects.snpeff_effects(data)
        else:
            raise ValueError(
                "Unexpected variant effects configuration: %s" % effect_kind)
        # Keep the existing variant file when annotation produced nothing
        if annotated:
            data["vrn_file"] = annotated

    logger.info("Filtering for %s" % sample_label)
    filtered = variant_filtration(
        data["vrn_file"],
        data["sam_ref"],
        tz.get_in(("genome_resources", "variation"), data, {}),
        data,
    )
    data["vrn_file"] = filtered
    return [[data]]
def _run_ensemble_w_caller(batch_id, vrn_files, bam_files, base_dir, edata):
    """Run ensemble calling, re-calling the inputs with a variant caller.

    Uses the bcbio.variation.recall method plus an external variant caller.
    The ensemble VCF is written to ``<base_dir>/<batch_id>-ensemble.vcf`` and
    the command is skipped when ``utils.file_exists`` reports that file is
    already present. The output is then annotated via
    ``effects.snpeff_effects`` on a deep copy of ``edata``.

    Returns:
        A dict with ``variantcaller`` set to ``"ensemble"``, ``vrn_file`` set
        to the annotated output and ``bed_file`` set to ``None``.
    """
    target_vcf = os.path.join(base_dir, "{0}-ensemble.vcf".format(batch_id))
    if not utils.file_exists(target_vcf):
        caller = edata["config"]["algorithm"]["ensemble"]["caller"]
        program = config_utils.get_program(
            "bcbio-variation-recall", edata["config"])
        cores_flag = "--cores=%s" % edata["config"]["algorithm"].get(
            "num_cores", 1)
        caller_flag = "--caller=%s" % caller
        cmd = [program, "ensemble", cores_flag, caller_flag,
               target_vcf, edata["sam_ref"]] + vrn_files + bam_files
        do.run(cmd, "Ensemble calling with %s: %s" % (caller, batch_id))

    # Annotate on a copy so the caller's edata is left untouched
    annotate_input = copy.deepcopy(edata)
    annotate_input["vrn_file"] = target_vcf
    annotated_vcf = effects.snpeff_effects(annotate_input)
    return {
        "variantcaller": "ensemble",
        "vrn_file": annotated_vcf,
        "bed_file": None,
    }
def main(config_file, env, cores):
    """Prepare, merge and annotate Illumina sample VCFs, then build downstream outputs.

    Steps, in order: read the configuration and identifier remapping, stop if
    any sample lacks an identifier, validate against the fam file, drop
    excluded samples, run ``run_illumina_prep`` per sample through joblib,
    merge the results, annotate the merged file with
    ``effects.snpeff_effects``, call ``population.prep_gemini_db``, and write
    a sample-excluded VCF that is passed to ``prepare_plink_vcftools``.

    Args:
        config_file: Configuration file handed to ``read_config``.
        env: Second argument handed to ``read_config``.
        cores: Number of cores to use; converted with ``int``.

    Raises:
        NotImplementedError: If any input sample has an ``id`` of ``None``;
            the affected samples are printed first.
    """
    cores = int(cores)
    config = read_config(config_file, env)
    idremap = read_remap_file(config["runinfo"]["idmapping"])
    exclude = read_priority_file(config["runinfo"]["priority"], idremap)
    samples = list(get_input_samples(config["inputs"], idremap))

    problem = [x for x in samples if x["id"] is None]
    if problem:
        # Single-argument print(...) calls produce the same output on
        # Python 2 and Python 3, unlike the bare print statement.
        print("Problem identifiers")
        for p in problem:
            print("%s %s" % (p["illuminaid"], os.path.basename(p["dir"])))
        raise NotImplementedError

    check_fam(samples, config["runinfo"]["fam"])
    config["algorithm"] = {"num_cores": cores}
    samples = [s for s in samples
               if s["id"] is not None and s["id"] not in exclude]
    print("Processing %s samples" % len(samples))
    prep_jobs = (joblib.delayed(run_illumina_prep)(s, config) for s in samples)
    out_files = list(joblib.Parallel(cores)(prep_jobs))
    merge_file = merge_vcf_files(out_files, cores, config)

    effects_file = effects.snpeff_effects({
        "vrn_file": merge_file,
        "sam_ref": config["ref"]["GRCh37"],
        "reference": {"fasta": {"base": config["ref"]["GRCh37"]}},
        "genome_resources": {"aliases": {"snpeff": "GRCh37.74"}},
        "genome_build": "GRCh37",
        "config": config,
    })

    data = {"config": config, "dirs": {"work": os.getcwd()}, "name": [""]}
    gemini_db = population.prep_gemini_db(
        [os.path.join(os.getcwd(), effects_file)],
        [utils.splitext_plus(config["outputs"]["merge"])[0], "casava", True],
        [{"config": config,
          "work_bam": "yes",
          "genome_build": "GRCh37",
          "genome_resources": {"aliases": {"human": True}}}],
        data)[0][1]["db"]
    print(gemini_db)

    noexclude_file = "%s-noexclude%s" % utils.splitext_plus(effects_file)
    noexclude_file = vcfutils.exclude_samples(
        effects_file, noexclude_file, exclude, config["ref"]["GRCh37"], config)
    prepare_plink_vcftools(noexclude_file, config)
def __call__(self, in_file):
    """Run ``effects.snpeff_effects`` on ``in_file`` and return its result.

    The call is bracketed by ``self._start_message`` and
    ``self._end_message``, and uses the ``genome`` and ``config`` attributes
    of the instance.
    """
    self._start_message(in_file)
    effects_out = effects.snpeff_effects(in_file, self.genome, self.config)
    self._end_message(in_file)
    return effects_out