def add_reference_resources(data):
    """Attach genome reference files and species-level resources to a sample dict.

    Looks up aligner indexes and the reference FASTA for ``data["genome_build"]``,
    records genome resources, snpEff files and (optionally) an alternative
    validation genome, then returns the augmented dictionary.
    """
    algorithm = data["config"]["algorithm"]
    aligner = algorithm.get("aligner", None)
    galaxy_dir = data["dirs"]["galaxy"]
    data["reference"] = genome.get_refs(data["genome_build"], aligner, galaxy_dir)
    fasta_base = utils.get_in(data, ("reference", "fasta", "base"))
    # back compatible `sam_ref` target
    data["sam_ref"] = fasta_base
    # Species resource directory, defaulting to the reference FASTA location.
    ref_loc = utils.get_in(data, ("config", "resources", "species", "dir"), fasta_base)
    data["genome_resources"] = genome.get_resources(data["genome_build"], ref_loc)
    data["reference"]["snpeff"] = effects.get_snpeff_files(data)
    validate_build = utils.get_in(data, ("config", "algorithm", "validate_genome_build"))
    if validate_build:
        alt_refs = genome.get_refs(validate_build, None, galaxy_dir)
        data["reference"]["alt"] = {validate_build: alt_refs["fasta"]}
    # Re-enable when we have ability to re-define gemini configuration directory
    if False:
        if population.do_db_build([data], check_gemini=False, need_bam=False):
            data["reference"]["gemini"] = population.get_gemini_files(data)
    return data
def add_reference_resources(data, remote_retriever=None):
    """Add genome reference information to the item to process.

    data: sample dictionary containing ``genome_build``, ``config`` and ``dirs``.
    remote_retriever: optional object exposing ``get_refs``/``get_resources``
        used to fetch references remotely instead of from local galaxy dirs.
    Returns the updated sample dictionary.
    """
    aligner = data["config"]["algorithm"].get("aligner", None)
    if remote_retriever:
        data["reference"] = remote_retriever.get_refs(data["genome_build"], aligner, data["config"])
    else:
        data["reference"] = genome.get_refs(data["genome_build"], aligner, data["dirs"]["galaxy"], data)
        # Local file existence checks only make sense for locally resolved references.
        _check_ref_files(data["reference"], data)
    # back compatible `sam_ref` target
    data["sam_ref"] = utils.get_in(data, ("reference", "fasta", "base"))
    # Species resource directory; falls back to the reference FASTA base path.
    ref_loc = utils.get_in(data, ("config", "resources", "species", "dir"),
                           utils.get_in(data, ("reference", "fasta", "base")))
    if remote_retriever:
        data = remote_retriever.get_resources(data["genome_build"], ref_loc, data)
    else:
        data["genome_resources"] = genome.get_resources(data["genome_build"], ref_loc, data)
    # Only resolve snpEff files when snpEff is the configured effects approach
    # and they have not already been supplied (e.g. by a remote retriever).
    if effects.get_type(data) == "snpeff" and "snpeff" not in data["reference"]:
        data["reference"]["snpeff"] = effects.get_snpeff_files(data)
    data = _fill_validation_targets(data)
    data = _fill_prioritization_targets(data)
    # Re-enable when we have ability to re-define gemini configuration directory
    if False:
        if population.do_db_build([data], need_bam=False):
            data["reference"]["gemini"] = population.get_gemini_files(data)
    return data
def add_reference_resources(data, remote_retriever=None):
    """Populate reference genome files and associated resources for one sample.

    References come from a remote retriever when supplied, otherwise from the
    local galaxy configuration directories.
    """
    aligner = data["config"]["algorithm"].get("aligner", None)
    build = data["genome_build"]
    if remote_retriever:
        data["reference"] = remote_retriever.get_refs(build, aligner, data["config"])
    else:
        data["reference"] = genome.get_refs(build, aligner, data["dirs"]["galaxy"], data)
        _check_ref_files(data["reference"], data)
    fasta_base = utils.get_in(data, ("reference", "fasta", "base"))
    # back compatible `sam_ref` target
    data["sam_ref"] = fasta_base
    # Species resource directory, defaulting to the reference FASTA base path.
    ref_loc = utils.get_in(data, ("config", "resources", "species", "dir"), fasta_base)
    if remote_retriever:
        data = remote_retriever.get_resources(build, ref_loc, data)
    else:
        data["genome_resources"] = genome.get_resources(build, ref_loc, data)
    needs_snpeff = effects.get_type(data) == "snpeff"
    if needs_snpeff and "snpeff" not in data["reference"]:
        data["reference"]["snpeff"] = effects.get_snpeff_files(data)
    data = _fill_validation_targets(data)
    data = _fill_prioritization_targets(data)
    # Re-enable when we have ability to re-define gemini configuration directory
    if False:
        if population.do_db_build([data], need_bam=False):
            data["reference"]["gemini"] = population.get_gemini_files(data)
    return data
def postprocess_variants(items):
    """Provide post-processing of variant calls: filtering and effects annotation.

    items: batch of sample dictionaries (or a single dict) sharing a variant file.
    Returns the representative sample wrapped as ``[[data]]`` for the pipeline.
    """
    # Joint-called batches store their VCF under `vrn_file_joint` instead of `vrn_file`.
    vrn_key = "vrn_file"
    if not isinstance(items, dict):
        items = [utils.to_single_data(x) for x in items]
        if "vrn_file_joint" in items[0]:
            vrn_key = "vrn_file_joint"
    data, items = _get_batch_representative(items, vrn_key)
    items = cwlutils.unpack_tarballs(items, data)
    data = cwlutils.unpack_tarballs(data, data)
    cur_name = "%s, %s" % (dd.get_sample_name(data), get_variantcaller(data))
    logger.info("Finalizing variant calls: %s" % cur_name)
    # Remember the incoming file so we can hand back the original path when
    # post-processing did not change its contents (samefile check at the end).
    orig_vrn_file = data.get(vrn_key)
    data = _symlink_to_workdir(data, [vrn_key])
    data = _symlink_to_workdir(data, ["config", "algorithm", "variant_regions"])
    if data.get(vrn_key):
        logger.info("Calculating variation effects for %s" % cur_name)
        ann_vrn_file, vrn_stats = effects.add_to_vcf(data[vrn_key], data)
        if ann_vrn_file:
            data[vrn_key] = ann_vrn_file
        if vrn_stats:
            data["vrn_stats"] = vrn_stats
        orig_items = _get_orig_items(items)
        logger.info("Annotate VCF file: %s" % cur_name)
        data[vrn_key] = annotation.finalize_vcf(data[vrn_key], get_variantcaller(data), orig_items)
        # RNA-seq analyses additionally get RNA editing site annotation.
        if dd.get_analysis(data).lower().find("rna-seq") >= 0:
            logger.info("Annotate RNA editing sites")
            ann_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"], data)
            if ann_file:
                data[vrn_key] = ann_file
        if cwlutils.is_cwl_run(data):
            logger.info("Annotate with population level variation data")
            ann_file = population.run_vcfanno(dd.get_vrn_file(data), data,
                                              population.do_db_build([data]))
            if ann_file:
                data[vrn_key] = ann_file
        logger.info("Filtering for %s" % cur_name)
        data[vrn_key] = variant_filtration(data[vrn_key], dd.get_ref_file(data),
                                           tz.get_in(("genome_resources", "variation"), data, {}),
                                           data, orig_items)
        logger.info("Prioritization for %s" % cur_name)
        # Only swap in the prioritized file when prioritization actually produced
        # a different output file.
        prio_vrn_file = prioritize.handle_vcf_calls(data[vrn_key], data, orig_items)
        if prio_vrn_file != data[vrn_key]:
            data[vrn_key] = prio_vrn_file
        logger.info("Germline extraction for %s" % cur_name)
        data = germline.extract(data, orig_items)
        if dd.get_align_bam(data):
            data = damage.run_filter(data[vrn_key], dd.get_align_bam(data), dd.get_ref_file(data),
                                     data, orig_items)
    # If post-processing left the same on-disk file, report the original path.
    if orig_vrn_file and os.path.samefile(data[vrn_key], orig_vrn_file):
        data[vrn_key] = orig_vrn_file
    return [[data]]
def run_rnaseq_variant_calling(data):
    """Run RNA-seq variant calling; the variation file is stored under `vrn_file`.

    Only a single variant caller is supported per RNA-seq run; configuring more
    than one is a hard error.
    """
    caller = dd.get_variantcaller(data)
    multiple_requested = isinstance(caller, list) and len(caller) > 1
    if multiple_requested:
        logger.error("Only one variantcaller can be run for RNA-seq at "
                     "this time. Post an issue here "
                     "(https://github.com/bcbio/bcbio-nextgen/issues) "
                     "if this is something you need to do.")
        sys.exit(1)
    if caller:
        if "gatk-haplotype" in caller:
            data = variation.rnaseq_gatk_variant_calling(data)
        if vardict.get_vardict_command(data):
            data = variation.rnaseq_vardict_variant_calling(data)
    if dd.get_vrn_file(data):
        # Annotate RNA editing sites first, then population-level variation.
        edited_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"], data)
        if edited_file:
            data = dd.set_vrn_file(data, edited_file)
        pop_file = population.run_vcfanno(dd.get_vrn_file(data), data,
                                          population.do_db_build([data]))
        if pop_file:
            data = dd.set_vrn_file(data, pop_file)
    return [[data]]
def download_prepped_genome(genome_build, data, name, need_remap, out_dir=None):
    """Get a pre-prepared genome from S3, unpacking it locally.

    Supports runs on AWS where we can retrieve the resources on demand. Upgrades
    GEMINI in place if installed inside a Docker container with the biological data.
    GEMINI install requires write permissions to standard data directories -- works
    on AWS but not generalizable elsewhere.

    genome_build: genome build identifier (e.g. a string like "GRCh37").
    data: sample dictionary; used for work directories and GEMINI decisions.
    name: tool/index name, remapped via REMAP_NAMES to one or more targets.
    need_remap: when true, return the base sequence FASTA path.
    out_dir: optional download directory; defaults to <work>/inputs/data/genomes.
    Returns the path to the reference file or index base for `name`.
    """
    from bcbio.variation import population
    from bcbio import install
    if not out_dir:
        out_dir = utils.safe_makedir(os.path.join(tz.get_in(["dirs", "work"], data),
                                                  "inputs", "data", "genomes"))
    for target in REMAP_NAMES.get(name, [name]):
        ref_dir = os.path.join(out_dir, genome_build, target)
        if not os.path.exists(ref_dir):
            if target in INPLACE_INDEX:
                ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
                # Need to add genome resources so we can retrieve GTF files for STAR
                data["genome_resources"] = get_resources(data["genome_build"], ref_file, data)
                INPLACE_INDEX[target](ref_file, ref_dir, data)
            else:
                # XXX Currently only supports genomes from S3 us-east-1 bucket.
                # Need to assess how slow this is from multiple regions and generalize to non-AWS.
                fname = objectstore.BIODATA_INFO["s3"].format(build=genome_build, target=target)
                try:
                    objectstore.connect(fname)
                except Exception as exc:
                    # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
                    # propagate; chain the cause so the connection failure is visible.
                    raise ValueError("Could not find reference genome file %s %s"
                                     % (genome_build, name)) from exc
                with utils.chdir(out_dir):
                    cmd = objectstore.cl_input(fname, unpack=False, anonpipe=False) + " | pigz -d -c | tar -xvp"
                    do.run(cmd.format(**locals()), "Download pre-prepared genome data: %s" % genome_build)
    ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
    if data.get("genome_build"):
        if (data.get("files") and population.do_db_build([data], need_bam=False)
              and population.support_gemini_orig(data)):
            # symlink base GEMINI directory to work directory, avoiding write/space issues
            out_gemini_dir = utils.safe_makedir(os.path.join(os.path.dirname(ref_dir), "gemini_data"))
            orig_gemini_dir = install.get_gemini_dir()
            # Remove empty initial directory created by installer
            if os.path.isdir(orig_gemini_dir) and len(os.listdir(orig_gemini_dir)) == 0:
                if os.path.islink(orig_gemini_dir):
                    os.remove(orig_gemini_dir)
                else:
                    os.rmdir(orig_gemini_dir)
            if not os.path.exists(orig_gemini_dir):
                os.symlink(out_gemini_dir, orig_gemini_dir)
            cmd = [os.path.join(os.path.dirname(sys.executable), "gemini"), "update", "--dataonly"]
            do.run(cmd, "Download GEMINI data")
    genome_dir = os.path.join(out_dir, genome_build)
    genome_build = genome_build.replace("-test", "")
    if need_remap or name == "samtools":
        return os.path.join(genome_dir, "seq", "%s.fa" % genome_build)
    else:
        ref_dir = os.path.join(genome_dir, REMAP_NAMES.get(name, [name])[-1])
        # Index base name is the common prefix of files in the target directory.
        base_name = os.path.commonprefix(os.listdir(ref_dir))
        while base_name.endswith("."):
            base_name = base_name[:-1]
        return os.path.join(ref_dir, base_name)
def download_prepped_genome(genome_build, data, name, need_remap, out_dir=None):
    """Get a pre-prepared genome from S3, unpacking it locally.

    Supports runs on AWS where we can retrieve the resources on demand. Upgrades
    GEMINI in place if installed inside a Docker container with the biological data.
    GEMINI install requires write permissions to standard data directories -- works
    on AWS but not generalizable elsewhere.

    genome_build: genome build identifier (e.g. a string like "GRCh37").
    data: sample dictionary; used for work directories and GEMINI decisions.
    name: tool/index name, remapped via REMAP_NAMES to one or more targets.
    need_remap: when true, return the base sequence FASTA path.
    out_dir: optional download directory; defaults to <work>/inputs/data/genomes.
    Returns the path to the reference file or index base for `name`.
    """
    from bcbio.variation import population
    from bcbio import install
    if not out_dir:
        out_dir = utils.safe_makedir(os.path.join(tz.get_in(["dirs", "work"], data),
                                                  "inputs", "data", "genomes"))
    for target in REMAP_NAMES.get(name, [name]):
        ref_dir = os.path.join(out_dir, genome_build, target)
        if not os.path.exists(ref_dir):
            if target in INPLACE_INDEX:
                ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
                # Need to add genome resources so we can retrieve GTF files for STAR
                data["genome_resources"] = get_resources(data["genome_build"], ref_file, data)
                INPLACE_INDEX[target](ref_file, ref_dir, data)
            else:
                # XXX Currently only supports genomes from S3 us-east-1 bucket.
                # Need to assess how slow this is from multiple regions and generalize to non-AWS.
                fname = objectstore.BIODATA_INFO["s3"].format(build=genome_build, target=target)
                try:
                    objectstore.connect(fname)
                except Exception as exc:
                    # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
                    # propagate; chain the cause so the connection failure is visible.
                    raise ValueError("Could not find reference genome file %s %s"
                                     % (genome_build, name)) from exc
                with utils.chdir(out_dir):
                    cmd = objectstore.cl_input(fname, unpack=False, anonpipe=False) + " | pigz -d -c | tar -xvp"
                    do.run(cmd.format(**locals()), "Download pre-prepared genome data: %s" % genome_build)
    ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
    if data.get("genome_build"):
        gresources = get_resources(data["genome_build"], ref_file, data)
        if data.get("files") and population.do_db_build([data], need_bam=False,
                                                        gresources=gresources):
            # symlink base GEMINI directory to work directory, avoiding write/space issues
            out_gemini_dir = utils.safe_makedir(os.path.join(os.path.dirname(ref_dir), "gemini_data"))
            orig_gemini_dir = install.get_gemini_dir()
            # Remove empty initial directory created by installer
            if os.path.isdir(orig_gemini_dir) and len(os.listdir(orig_gemini_dir)) == 0:
                if os.path.islink(orig_gemini_dir):
                    os.remove(orig_gemini_dir)
                else:
                    os.rmdir(orig_gemini_dir)
            if not os.path.exists(orig_gemini_dir):
                os.symlink(out_gemini_dir, orig_gemini_dir)
            cmd = [os.path.join(os.path.dirname(sys.executable), "gemini"), "update", "--dataonly"]
            do.run(cmd, "Download GEMINI data")
    genome_dir = os.path.join(out_dir, genome_build)
    genome_build = genome_build.replace("-test", "")
    if need_remap or name == "samtools":
        return os.path.join(genome_dir, "seq", "%s.fa" % genome_build)
    else:
        ref_dir = os.path.join(genome_dir, REMAP_NAMES.get(name, [name])[-1])
        # Index base name is the common prefix of files in the target directory.
        base_name = os.path.commonprefix(os.listdir(ref_dir))
        while base_name.endswith("."):
            base_name = base_name[:-1]
        return os.path.join(ref_dir, base_name)
def handle_vcf_calls(vcf_file, data):
    """Prioritize VCF calls based on external annotations supplied through GEMINI.

    Returns a prioritized VCF when prioritization is enabled and a GEMINI
    database can be built; otherwise returns the input file unchanged.
    """
    if _do_prioritize(data) and population.do_db_build([data]):
        db_file = population.create_gemini_db(vcf_file, data)
        if db_file:
            filter_file = _prep_priority_filter(db_file, data)
            return _apply_priority_filter(vcf_file, filter_file, data)
    # No prioritization requested or no GEMINI database for filtering:
    # pass the original file through untouched.
    return vcf_file
def _download_prepped_genome(genome_build, data, name, need_remap):
    """Get a pre-prepared genome from S3, unpacking it locally.

    Supports runs on AWS where we can retrieve the resources on demand. Upgrades
    GEMINI in place if installed inside a Docker container with the biological data.
    GEMINI install requires write permissions to standard data directories -- works
    on AWS but not generalizable elsewhere.
    """
    from bcbio.variation import population
    target = REMAP_NAMES.get(name, name)
    work_dir = tz.get_in(["dirs", "work"], data)
    out_dir = utils.safe_makedir(os.path.join(work_dir, "inputs", "data", "genomes"))
    ref_dir = os.path.join(out_dir, genome_build, target)
    seq_pattern = os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa"))
    if not os.path.exists(ref_dir):
        if target in INPLACE_INDEX:
            # Build this index locally from the base sequence FASTA.
            ref_file = glob.glob(seq_pattern)[0]
            INPLACE_INDEX[target](ref_file, ref_dir, data)
        else:
            with utils.chdir(out_dir):
                # `key` and `bucket` are substituted into cmd via locals() below.
                bucket = S3_INFO["bucket"]
                key = S3_INFO["key"].format(build=genome_build, target=target)
                cmd = "gof3r get --no-md5 -k {key} -b {bucket} | pigz -d -c | tar -xvp"
                do.run(cmd.format(**locals()), "Download pre-prepared genome data: %s" % genome_build)
    ref_file = glob.glob(seq_pattern)[0]
    gresources = get_resources(data["genome_build"], ref_file)
    if data.get("files") and population.do_db_build([data], need_bam=False, gresources=gresources):
        cmd = [os.path.join(os.path.dirname(sys.executable), "gemini"), "update", "--dataonly"]
        do.run(cmd, "Download GEMINI data")
    genome_dir = os.path.join(out_dir, genome_build)
    genome_build = genome_build.replace("-test", "")
    if need_remap or name == "samtools":
        return os.path.join(genome_dir, "seq", "%s.fa" % genome_build)
    ref_dir = os.path.join(genome_dir, target)
    # Index base name is the common prefix of files in the target directory.
    base_name = os.path.commonprefix(os.listdir(ref_dir))
    while base_name.endswith("."):
        base_name = base_name[:-1]
    return os.path.join(ref_dir, base_name)
def add_reference_resources(data):
    """Add genome reference information to the item to process.

    data: sample dictionary with ``genome_build``, ``config`` and ``dirs`` keys.
    Returns the dictionary augmented with reference files, genome resources,
    snpEff files and an optional alternative validation genome.
    """
    aligner = data["config"]["algorithm"].get("aligner", None)
    data["reference"] = genome.get_refs(data["genome_build"], aligner, data["dirs"]["galaxy"])
    # back compatible `sam_ref` target
    data["sam_ref"] = utils.get_in(data, ("reference", "fasta", "base"))
    # Species resource directory; falls back to the reference FASTA base path.
    ref_loc = utils.get_in(data, ("config", "resources", "species", "dir"),
                           utils.get_in(data, ("reference", "fasta", "base")))
    data["genome_resources"] = genome.get_resources(data["genome_build"], ref_loc)
    data["reference"]["snpeff"] = effects.get_snpeff_files(data)
    # Optional second genome build used for validating calls against.
    alt_genome = utils.get_in(data, ("config", "algorithm", "validate_genome_build"))
    if alt_genome:
        data["reference"]["alt"] = {alt_genome: genome.get_refs(alt_genome, None,
                                                                data["dirs"]["galaxy"])["fasta"]}
    # Re-enable when we have ability to re-define gemini configuration directory
    if False:
        if population.do_db_build([data], check_gemini=False, need_bam=False):
            data["reference"]["gemini"] = population.get_gemini_files(data)
    return data
def _download_prepped_genome(genome_build, data, name, need_remap):
    """Get a pre-prepared genome from S3, unpacking it locally.

    Supports runs on AWS where we can retrieve the resources on demand. Upgrades
    GEMINI in place if installed inside a Docker container with the biological data.
    GEMINI install requires write permissions to standard data directories -- works
    on AWS but not generalizable elsewhere.
    """
    from bcbio.variation import population
    out_dir = utils.safe_makedir(os.path.join(tz.get_in(["dirs", "work"], data),
                                              "inputs", "data", "genomes"))
    ref_dir = os.path.join(out_dir, genome_build, REMAP_NAMES.get(name, name))
    if not os.path.exists(ref_dir):
        target = REMAP_NAMES.get(name, name)
        if target in INPLACE_INDEX:
            # Build this index locally from the base sequence FASTA.
            ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
            INPLACE_INDEX[target](ref_file, ref_dir, data)
        else:
            with utils.chdir(out_dir):
                # `key` and `bucket` are substituted into cmd via locals() below.
                bucket = S3_INFO["bucket"]
                key = S3_INFO["key"].format(build=genome_build, target=REMAP_NAMES.get(name, name))
                cmd = ("gof3r get --no-md5 -k {key} -b {bucket} | pigz -d -c | tar -xvp")
                do.run(cmd.format(**locals()), "Download pre-prepared genome data: %s" % genome_build)
    ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
    gresources = get_resources(data["genome_build"], ref_file)
    if data.get("files") and population.do_db_build([data], need_bam=False, gresources=gresources):
        # Refresh GEMINI annotation data alongside the downloaded genome.
        cmd = [os.path.join(os.path.dirname(sys.executable), "gemini"), "update", "--dataonly"]
        do.run(cmd, "Download GEMINI data")
    genome_dir = os.path.join(out_dir, genome_build)
    genome_build = genome_build.replace("-test", "")
    if need_remap or name == "samtools":
        return os.path.join(genome_dir, "seq", "%s.fa" % genome_build)
    else:
        ref_dir = os.path.join(genome_dir, REMAP_NAMES.get(name, name))
        # Index base name is the common prefix of files in the target directory.
        base_name = os.path.commonprefix(os.listdir(ref_dir))
        while base_name.endswith("."):
            base_name = base_name[:-1]
        return os.path.join(ref_dir, base_name)