def _prep_config(items, paired, work_dir):
    """Configure a Manta run, producing ``runWorkflow.py`` inside ``work_dir``.

    Configuration is skipped when the workflow script already exists and
    ``_out_of_date`` does not flag it. Returns the path to the workflow script.
    """
    assert utils.which("configManta.py"), "Could not find installed configManta.py"
    workflow_script = os.path.join(work_dir, "runWorkflow.py")
    if utils.file_exists(workflow_script) and not _out_of_date(workflow_script):
        return workflow_script

    config_script = os.path.realpath(utils.which("configManta.py"))
    config_cmd = [utils.get_program_python("configManta.py"), config_script]
    # Input BAMs: one --bam per sample, tumor/normal pair, or tumor only
    if not paired:
        config_cmd.extend(["--bam=%s" % dd.get_align_bam(item) for item in items])
    elif paired.normal_bam:
        config_cmd.append("--normalBam=%s" % paired.normal_bam)
        config_cmd.append("--tumorBam=%s" % paired.tumor_bam)
    else:
        config_cmd.append("--tumorBam=%s" % paired.tumor_bam)

    # Remaining settings are read from the tumor sample or the first item
    data = paired.tumor_data if paired else items[0]
    config_cmd.append("--referenceFasta=%s" % dd.get_ref_file(data))
    config_cmd.append("--runDir=%s" % work_dir)
    if dd.get_coverage_interval(data) != "genome":
        config_cmd.append("--exome")
    for region in _maybe_limit_chromosomes(data):
        config_cmd.extend(["--region", region])
    resources = config_utils.get_resources("manta", data["config"])
    extra_opts = resources.get("options")
    if extra_opts:
        config_cmd.extend([str(opt) for opt in extra_opts])
    # If we are removing polyX, avoid calling on small indels which require
    # excessively long runtimes on noisy WGS runs
    if "polyx" in dd.get_exclude_regions(data):
        config_cmd.extend(["--config", _prep_streamlined_config(config_script, work_dir)])
    do.run(config_cmd, "Configure manta SV analysis")
    return workflow_script
def run(items):
    """Run MetaSV if we have enough supported callers, adding output to the set of calls.

    Expects a single sample in ``items``. Each distinct caller listed in
    ``SUPPORTED`` contributes one ``--<caller>_vcf`` input. When at least
    ``MIN_CALLERS`` are available, MetaSV is run (unless its output already
    exists), the result is filtered and annotated, and a ``metasv`` entry is
    appended to ``data["sv"]``. Returns the sample wrapped in a list.
    """
    assert len(items) == 1, "Expect one input to MetaSV ensemble calling"
    data = items[0]
    work_dir = _sv_workdir(data)
    out_file = os.path.join(work_dir, "variants.vcf.gz")
    cmd = _get_cmd() + ["--sample", dd.get_sample_name(data), "--reference", dd.get_ref_file(data),
                        "--bam", dd.get_align_bam(data), "--outdir", work_dir]
    methods = []
    for call in data.get("sv", []):
        caller = call["variantcaller"]
        if caller in SUPPORTED and caller not in methods:
            methods.append(caller)
            # dict.get() evaluates its default eagerly, so only index "vrn_file"
            # when "vcf_file" is really missing; avoids a spurious KeyError.
            vcf_file = call["vcf_file"] if "vcf_file" in call else call["vrn_file"]
            cmd += ["--%s_vcf" % caller, vcf_file]
    if len(methods) >= MIN_CALLERS:
        if not utils.file_exists(out_file):
            tx_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
            ins_stats = shared.calc_paired_insert_stats_save(dd.get_align_bam(data),
                                                             os.path.join(tx_work_dir, "insert-stats.yaml"))
            cmd += ["--workdir", tx_work_dir, "--num_threads", str(dd.get_num_cores(data))]
            cmd += ["--spades", utils.which("spades.py"), "--age", utils.which("age_align")]
            cmd += ["--assembly_max_tools=1", "--assembly_pad=500"]
            # Stringify the statistics so every command element is text,
            # matching how the thread count is passed above.
            cmd += ["--boost_sc", "--isize_mean", str(ins_stats["mean"]),
                    "--isize_sd", str(ins_stats["std"])]
            do.run(cmd, "Combine variant calls with MetaSV")
        filters = ("(NUM_SVTOOLS = 1 && ABS(SVLEN)>50000) || "
                   "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_FLANK_PERCENT>80) || "
                   "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_NUM_GOOD_REC=0) || "
                   "(ABS(SVLEN)<4000 && BA_NUM_GOOD_REC>2)")
        filter_file = vfilter.hard_w_expression(out_file, filters, data,
                                                name="ReassemblyStats", limit_regions=None)
        effects_vcf, _ = effects.add_to_vcf(filter_file, data, "snpeff")
        # Prefer the annotated VCF, falling back to the filtered one
        data["sv"].append({"variantcaller": "metasv",
                           "vrn_file": effects_vcf or filter_file})
    return [data]
def run(calls, data):
    """Run MetaSV if we have enough supported callers, adding output to the set of calls.

    Every call whose ``variantcaller`` is in ``SUPPORTED`` contributes a
    ``--<caller>_vcf`` input. When at least ``MIN_CALLERS`` are available,
    MetaSV is run (unless its output already exists) and a ``metasv`` entry is
    appended to ``calls``, which is modified in place and returned.
    """
    work_dir = _sv_workdir(data)
    out_file = os.path.join(work_dir, "variants.vcf.gz")
    cmd = _get_cmd() + [
        "--sample", dd.get_sample_name(data),
        "--reference", dd.get_ref_file(data),
        "--bam", dd.get_align_bam(data),
        "--outdir", work_dir,
    ]
    available_callers = 0
    for call in calls:
        if call["variantcaller"] in SUPPORTED:
            available_callers += 1
            # dict.get() evaluates its default eagerly, so only index "vrn_file"
            # when "vcf_file" is really missing; avoids a spurious KeyError.
            vcf_file = call["vcf_file"] if "vcf_file" in call else call["vrn_file"]
            cmd += ["--%s_vcf" % call["variantcaller"], vcf_file]
    if available_callers >= MIN_CALLERS:
        if not utils.file_exists(out_file):
            tx_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
            ins_stats = shared.calc_paired_insert_stats_save(
                dd.get_align_bam(data), os.path.join(tx_work_dir, "insert-stats.yaml")
            )
            cmd += ["--workdir", tx_work_dir, "--num_threads", str(dd.get_num_cores(data))]
            cmd += ["--spades", utils.which("spades.py"), "--age", utils.which("age_align")]
            # Stringify the statistics so every command element is text,
            # matching how the thread count is passed above.
            cmd += ["--boost_ins", "--isize_mean", str(ins_stats["mean"]),
                    "--isize_sd", str(ins_stats["std"])]
            do.run(cmd, "Combine variant calls with MetaSV")
        calls.append({"variantcaller": "metasv", "vrn_file": out_file})
    return calls
def run(calls, data):
    """Run MetaSV if we have enough supported callers, adding output to the set of calls.

    Every call whose ``variantcaller`` is in ``SUPPORTED`` contributes a
    ``--<caller>_vcf`` input. When at least ``MIN_CALLERS`` are available,
    MetaSV is run (unless its output already exists), the output is filtered
    on reassembly statistics, and a ``metasv`` entry pointing at the filtered
    file is appended to ``calls``, which is modified in place and returned.
    """
    work_dir = _sv_workdir(data)
    out_file = os.path.join(work_dir, "variants.vcf.gz")
    cmd = _get_cmd() + ["--sample", dd.get_sample_name(data), "--reference", dd.get_ref_file(data),
                        "--bam", dd.get_align_bam(data), "--outdir", work_dir]
    available_callers = 0
    for call in calls:
        if call["variantcaller"] in SUPPORTED:
            available_callers += 1
            # dict.get() evaluates its default eagerly, so only index "vrn_file"
            # when "vcf_file" is really missing; avoids a spurious KeyError.
            vcf_file = call["vcf_file"] if "vcf_file" in call else call["vrn_file"]
            cmd += ["--%s_vcf" % call["variantcaller"], vcf_file]
    if available_callers >= MIN_CALLERS:
        if not utils.file_exists(out_file):
            tx_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
            ins_stats = shared.calc_paired_insert_stats_save(dd.get_align_bam(data),
                                                             os.path.join(tx_work_dir, "insert-stats.yaml"))
            cmd += ["--workdir", tx_work_dir, "--num_threads", str(dd.get_num_cores(data))]
            cmd += ["--spades", utils.which("spades.py"), "--age", utils.which("age_align")]
            cmd += ["--assembly_max_tools=1", "--assembly_pad=500"]
            # Stringify the statistics so every command element is text,
            # matching how the thread count is passed above.
            cmd += ["--boost_ins", "--isize_mean", str(ins_stats["mean"]),
                    "--isize_sd", str(ins_stats["std"])]
            do.run(cmd, "Combine variant calls with MetaSV")
        filters = ("(NUM_SVTOOLS = 1 && ABS(SVLEN)>10000) || "
                   "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_FLANK_PERCENT>20) || "
                   "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_NUM_GOOD_REC=0) || "
                   "(ABS(SVLEN)<4000 && BA_NUM_GOOD_REC>1)")
        filter_file = vfilter.hard_w_expression(out_file, filters, data,
                                                name="ReassemblyStats", limit_regions=None)
        calls.append({"variantcaller": "metasv", "vrn_file": filter_file})
    return calls
def gatk_cmd(name, jvm_opts, params):
    """Build the shell command for a gatk or gatk-framework executable.

    The executable is looked up next to the running Python interpreter first
    and on PATH second. Returns None when neither lookup finds it.
    """
    interpreter_dir = os.path.dirname(os.path.realpath(sys.executable))
    # Prefer the copy installed alongside Python, otherwise search PATH
    executable = utils.which(os.path.join(interpreter_dir, name)) or utils.which(name)
    if not executable:
        return None
    java_opts = " ".join(jvm_opts)
    tool_args = " ".join([str(x) for x in params])
    return "unset JAVA_HOME && export PATH=%s:$PATH && %s %s %s" % (
        os.path.dirname(executable), executable, java_opts, tool_args)
def gatk_cmd(name, jvm_opts, params):
    """Retrieve PATH to gatk or gatk-framework executable using locally installed java.

    The executable is looked up next to the running Python interpreter first.
    If it is not there, the lookup falls back to PATH instead of failing
    immediately on a missing result.

    Returns a shell command string that unsets JAVA_HOME, prepends the
    executable's directory to PATH and invokes it with ``jvm_opts`` and
    ``params``.
    """
    interpreter_dir = os.path.dirname(os.path.realpath(sys.executable))
    gatk_cmd = utils.which(os.path.join(interpreter_dir, name))
    # if we can't find via the local executable, fallback to being in the path
    if not gatk_cmd:
        gatk_cmd = utils.which(name)
    # NOTE(review): when neither lookup succeeds, the dirname call below still
    # fails as it did before this fallback was added.
    return "unset JAVA_HOME && export PATH=%s:$PATH && %s %s %s" % \
        (os.path.dirname(gatk_cmd), gatk_cmd,
         " ".join(jvm_opts), " ".join([str(x) for x in params]))
def _fill_prioritization_targets(data):
    """Resolve prioritization targets to globally installed files.

    For the ``svprioritize`` and ``coverage`` algorithm settings, a value that
    is set but does not exist on disk is replaced by a matching installed
    file. Raises ValueError when no installed file matches. Returns the
    updated ``data``.
    """
    ref_file = dd.get_ref_file(data)
    for target in [["svprioritize"], ["coverage"]]:
        key_path = ["config", "algorithm"] + target
        val = tz.get_in(key_path, data)
        if not val or os.path.exists(val):
            continue
        candidates = []
        # Check prioritize directory
        for ext in [".bed", ".bed.gz"]:
            pattern = os.path.join(os.path.dirname(ref_file), os.pardir,
                                   "coverage", "prioritize", val + "*" + ext)
            candidates.extend(glob.glob(os.path.normpath(pattern)))
        # Check sv-annotation directory for prioritize gene name lists
        if target[-1] == "svprioritize":
            annotation_dir = os.path.dirname(
                os.path.realpath(utils.which("simple_sv_annotation.py")))
            candidates.extend(
                glob.glob(os.path.join(annotation_dir, "%s*" % os.path.basename(val))))
        if not candidates:
            raise ValueError("Configuration problem. BED file not found for %s: %s" %
                             (target, val))
        if len(candidates) == 1:
            installed_val = candidates[0]
        else:
            # check for partial matches
            exact_suffixes = (val + ".bed.gz", val + ".bed")
            installed_val = next(
                (path for path in candidates if path.endswith(exact_suffixes)), None)
            # handle date-stamped inputs
            if not installed_val:
                installed_val = sorted(candidates, reverse=True)[0]
        data = tz.update_in(data, key_path, lambda x: installed_val)
    return data
def java(items):
    """Check for presence of external Java 1.7 for tools that require it.

    Returns an error message string when a sample needs Java but no suitable
    ``java`` is first on PATH (missing, unparseable version, older than 1.7,
    or 1.8 and newer). Returns None when no sample needs Java or the version
    found is acceptable.
    """
    if not any([_needs_java(d) for d in items]):
        return None
    min_version = "1.7"
    max_version = "1.8"
    java_path = utils.which("java")
    if not java_path:
        return ("java not found on PATH. Java %s required for MuTect and GATK < 3.6." % min_version)
    # universal_newlines=True makes communicate() return text on both Python 2
    # and Python 3; without it Python 3 yields bytes and the str-based
    # parsing below raises TypeError.
    p = subprocess.Popen([java_path, "-Xms250m", "-Xmx250m", "-version"],
                         stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                         universal_newlines=True)
    output, _ = p.communicate()
    p.stdout.close()
    version = ""
    for line in output.split("\n"):
        if line.startswith(("java version", "openjdk version")):
            # The version is the last token on the line, wrapped in quotes
            version = line.strip().split()[-1]
            if version.startswith('"'):
                version = version[1:]
            if version.endswith('"'):
                version = version[:-1]
    if (not version or LooseVersion(version) >= LooseVersion(max_version) or
            LooseVersion(version) < LooseVersion(min_version)):
        return ("java version %s required for running MuTect and GATK < 3.6.\n"
                "It needs to be first on your PATH so running 'java -version' give the correct version.\n"
                "Found version %s at %s" % (min_version, version, java_path))
    return None
def _get_bwa_mem_cmd(data, out_file, ref_file, fastq1, fastq2=""):
    """Build the piped ``bwa mem`` command line for a sample.

    When ``<ref_file>.alt`` exists, output is additionally piped through
    bwakit's ``bwa-postalt.js`` for alternative alleles in GRCh38 + HLA typing.

    Commands for HLA post-processing:
       base=TEST
       run-HLA $base.hla > $base.hla.top
       cat $base.hla.HLA*.gt | grep ^GT | cut -f2- > $base.hla.all
       rm -f $base.hla.HLA*gt
       rm -f $base.hla.HLA*gz
    """
    alt_file = ref_file + ".alt"
    if utils.file_exists(alt_file):
        bwakit_dir = os.path.dirname(os.path.realpath(utils.which("run-bwamem")))
        hla_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "hla"))
        hla_base = os.path.join(hla_dir, os.path.basename(out_file) + ".hla")
        alt_cmd = " | {bwakit_dir}/k8 {bwakit_dir}/bwa-postalt.js -p {hla_base} {alt_file}".format(
            bwakit_dir=bwakit_dir, hla_base=hla_base, alt_file=alt_file)
    else:
        alt_cmd = ""
    bwa = config_utils.get_program("bwa", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    bwa_resources = config_utils.get_resources("bwa", data["config"])
    if "options" in bwa_resources:
        bwa_params = " ".join([str(opt) for opt in bwa_resources["options"]])
    else:
        bwa_params = ""
    rg_info = novoalign.get_rg_info(data["rgnames"])
    # "-p" is passed only when no second fastq is supplied
    pairing = "" if fastq2 else "-p"
    # Restrict seed occurrences to 1/2 of default, manage memory usage for centromere repeats in hg38
    # https://sourceforge.net/p/bio-bwa/mailman/message/31514937/
    # http://ehc.ac/p/bio-bwa/mailman/message/32268544/
    mem_usage = "-c 250"
    bwa_cmd = ("{bwa} mem {pairing} {mem_usage} -M -t {num_cores} {bwa_params} -R '{rg_info}' -v 1 "
               "{ref_file} {fastq1} {fastq2} ").format(
        bwa=bwa, pairing=pairing, mem_usage=mem_usage, num_cores=num_cores,
        bwa_params=bwa_params, rg_info=rg_info, ref_file=ref_file,
        fastq1=fastq1, fastq2=fastq2)
    return bwa_cmd + alt_cmd
def _gatk4_cmd(jvm_opts, params, data):
    """Build the unified shell command for GATK4, invoked as 'gatk' (GATK3 is 'gatk3').

    The gatk wrapper is looked up next to the running Python interpreter and
    used to derive the java bin path that is prepended to PATH. ``data`` is
    accepted but not read here.
    """
    interpreter_dir = os.path.dirname(os.path.realpath(sys.executable))
    gatk_path = utils.which(os.path.join(interpreter_dir, "gatk"))
    clear_home = utils.clear_java_home()
    java_bin = utils.get_java_binpath(gatk_path)
    java_opts = " ".join(jvm_opts)
    tool_args = " ".join([str(x) for x in params])
    return "%s && export PATH=%s:\"$PATH\" && gatk --java-options '%s' %s" % (
        clear_home, java_bin, java_opts, tool_args)
def rsem_calculate_expression(bam_file, rsem_genome_dir, samplename, build,
                              out_dir, cores=1):
    """Quantify transcript expression for ``bam_file`` with RSEM.

    works only in unstranded mode for now (--forward-prob 0.5)

    Returns None when ``rsem-calculate-expression`` is not installed,
    otherwise ``out_dir``. The run is skipped when the sentinel file already
    exists in ``out_dir``.
    """
    if not utils.which("rsem-calculate-expression"):
        logger.info("Skipping RSEM because rsem-calculate-expression could "
                    "not be found.")
        return None

    # NOTE(review): the "Test" infix in the sentinel name looks like a
    # leftover; confirm it matches what CALCULATE_EXP actually writes.
    sentinel_file = os.path.join(out_dir, samplename + "Test.genes.results")
    if utils.file_exists(sentinel_file):
        return out_dir

    if bam.is_paired(bam_file):
        paired_flag = "--paired"
    else:
        paired_flag = ""
    core_flag = "-p {cores}".format(cores=cores)
    command = CALCULATE_EXP.format(
        core_flag=core_flag, paired_flag=paired_flag, bam_file=bam_file,
        rsem_genome_dir=rsem_genome_dir, build=build, samplename=samplename)
    message = "Calculating transcript expression of {bam_file} using RSEM."

    # Run inside the transactional directory
    with transaction.file_transaction(out_dir) as tx_out_dir:
        utils.safe_makedir(tx_out_dir)
        with utils.chdir(tx_out_dir):
            do.run(command, message.format(bam_file=bam_file))
    return out_dir
def rsem_calculate_expression(bam_file, rsem_genome_dir, samplename, build,
                              out_dir, cores=1):
    """Quantify transcript expression for ``bam_file`` with RSEM.

    works only in unstranded mode for now (--forward-prob 0.5)

    Returns None when ``rsem-calculate-expression`` is not installed,
    otherwise ``out_dir``. The run is skipped when
    ``<samplename>.genes.results`` already exists in ``out_dir``.
    """
    if not which("rsem-calculate-expression"):
        logger.info("Skipping RSEM because rsem-calculate-expression could "
                    "not be found.")
        return None

    # The command below uses ``samplename`` as RSEM's output prefix, so gene
    # results land in "<samplename>.genes.results". The previous sentinel
    # ("<samplename>Test.genes.results") could never match, so finished runs
    # were always recomputed.
    sentinel_file = os.path.join(out_dir, samplename + ".genes.results")
    if file_exists(sentinel_file):
        return out_dir

    paired_flag = "--paired" if bam.is_paired(bam_file) else ""
    core_flag = "-p {cores}".format(cores=cores)
    cmd = ("rsem-calculate-expression --bam {core_flag} {paired_flag} --no-bam-output "
           "--forward-prob 0.5 --estimate-rspd {bam_file} {rsem_genome_dir}/{build} "
           "{samplename}").format(core_flag=core_flag, paired_flag=paired_flag,
                                  bam_file=bam_file, rsem_genome_dir=rsem_genome_dir,
                                  build=build, samplename=samplename)
    message = "Calculating transcript expression of {bam_file} using RSEM.".format(
        bam_file=bam_file)
    # Run inside the transactional directory
    with file_transaction(out_dir) as tx_out_dir:
        safe_makedir(tx_out_dir)
        with chdir(tx_out_dir):
            do.run(cmd, message)
    return out_dir
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.

    Produces two files in ``work_dir`` named after the sample and caller: a
    simplified, annotated VCF (``-simple.vcf.gz``) and a tab delimited summary
    (``-prioritize.tsv``). ``post_prior_fn``, when given, is applied to the
    simplified VCF before the summary is written.

    Returns a tuple of (tab delimited file, simplified VCF).
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    simple_vcf = os.path.join(work_dir, "%s-%s-simple.vcf.gz" % (sample, caller))
    if not utils.file_exists(simple_vcf):
        gene_list = _find_gene_list_from_bed(prioritize_by, out_file, data)
        # If we have a standard gene list we can skip BED based prioritization
        priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
        if gene_list:
            # Link the input into place; uncompressed inputs are linked under
            # the matching .vcf name and then bgzipped and indexed.
            if vcf_file.endswith(".vcf.gz"):
                utils.symlink_plus(vcf_file, priority_vcf)
            else:
                assert vcf_file.endswith(".vcf")
                utils.symlink_plus(vcf_file, priority_vcf.replace(".vcf.gz", ".vcf"))
                vcfutils.bgzip_and_index(priority_vcf.replace(".vcf.gz", ".vcf"),
                                         data["config"], remove_orig=False)
        # otherwise prioritize based on BED and proceed
        else:
            if not utils.file_exists(priority_vcf):
                with file_transaction(data, priority_vcf) as tx_out_file:
                    resources = config_utils.get_resources("bcbio_prioritize", data["config"])
                    jvm_opts = resources.get("jvm_opts", ["-Xms1g", "-Xmx4g"])
                    jvm_opts = config_utils.adjust_opts(jvm_opts, {"algorithm": {"memory_adjust":
                                                                                 {"direction": "increase",
                                                                                  "maximum": "30000M",
                                                                                  "magnitude": dd.get_cores(data)}}})
                    jvm_opts = " ".join(jvm_opts)
                    export = utils.local_path_export()
                    # The template is filled from local variables by name, so
                    # the names used above must stay in sync with it.
                    cmd = ("{export} bcbio-prioritize {jvm_opts} known -i {vcf_file} -o {tx_out_file} "
                           " -k {prioritize_by}")
                    do.run(cmd.format(**locals()), "Prioritize: select in known regions of interest")

        # Supporting data files are looked up next to the annotation script
        data_dir = os.path.dirname(os.path.realpath(utils.which("simple_sv_annotation.py")))
        with file_transaction(data, simple_vcf) as tx_out_file:
            fusion_file = os.path.join(data_dir, "fusion_pairs.txt")
            opts = ""
            if os.path.exists(fusion_file):
                opts += " --known_fusion_pairs %s" % fusion_file
            # Without a gene list derived from the BED, use the bundled panel
            if not gene_list:
                opts += " --gene_list %s" % os.path.join(data_dir, "az-cancer-panel.txt")
            else:
                opts += " --gene_list %s" % gene_list
            cmd = "simple_sv_annotation.py {opts} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
            do.run(cmd.format(**locals()), "Prioritize: simplified annotation output")
    simple_vcf = vcfutils.bgzip_and_index(vcfutils.sort_by_ref(simple_vcf, data), data["config"])
    if post_prior_fn:
        simple_vcf = post_prior_fn(simple_vcf, work_dir, data)
    if not utils.file_uptodate(out_file, simple_vcf):
        with file_transaction(data, out_file) as tx_out_file:
            export = utils.local_path_export(env_cmd="vawk")
            # Doubled braces are literal braces for the awk program after
            # str.format; single-brace names are filled from local variables.
            cmd = ("{export} zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                   """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                   "print CALLER,SNAME,$1,$2,I$END,"
                   """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                   "I$LOF,I$SIMPLE_ANN,"
                   "S${sample}$SR,S${sample}$PE,S${sample}$PR}}' > {tx_out_file}")
            do.run(cmd.format(**locals()), "Prioritize: convert to tab delimited")
    return out_file, simple_vcf
def run(calls, data):
    """Run MetaSV if we have enough supported callers, adding output to the set of calls.

    Every call whose ``variantcaller`` is in ``SUPPORTED`` contributes a
    ``--<caller>_vcf`` input. When at least ``MIN_CALLERS`` are available,
    MetaSV is run (unless its output already exists) and a ``metasv`` entry is
    appended to ``calls``, which is modified in place and returned.
    """
    work_dir = _sv_workdir(data)
    out_file = os.path.join(work_dir, "variants.vcf.gz")
    cmd = _get_cmd() + ["--sample", dd.get_sample_name(data), "--reference", dd.get_ref_file(data),
                        "--bam", dd.get_align_bam(data), "--outdir", work_dir]
    available_callers = 0
    for call in calls:
        if call["variantcaller"] in SUPPORTED:
            available_callers += 1
            # dict.get() evaluates its default eagerly, so only index "vrn_file"
            # when "vcf_file" is really missing; avoids a spurious KeyError.
            vcf_file = call["vcf_file"] if "vcf_file" in call else call["vrn_file"]
            cmd += ["--%s_vcf" % call["variantcaller"], vcf_file]
    if available_callers >= MIN_CALLERS:
        if not utils.file_exists(out_file):
            cmd += ["--spades", utils.which("spades.py"), "--age", utils.which("age_align")]
            do.run(cmd, "Combine variant calls with MetaSV")
        calls.append({"variantcaller": "metasv", "vrn_file": out_file})
    return calls
def gatk_cmd(name, jvm_opts, params, config=None):
    """Build the shell command for a gatk or gatk-framework executable.

    For ``name == "gatk"`` a configuration is required; when it lists
    ``gatk4`` in its enabled tools the command is delegated to ``_gatk4_cmd``.
    Otherwise the executable is looked up next to the running Python
    interpreter and then on PATH. Returns None when it cannot be found.
    """
    if name == "gatk":
        assert config, "Need configuration input for gatk to distinguish gatk4"
        # Accept either a bare configuration dictionary or a full sample
        needs_wrapping = isinstance(config, dict) and "config" not in config
        data = {"config": config} if needs_wrapping else config
        if "gatk4" in dd.get_tools_on(data):
            return _gatk4_cmd(jvm_opts, params, data)
    interpreter_dir = os.path.dirname(os.path.realpath(sys.executable))
    # Prefer the copy installed alongside Python, otherwise search PATH
    executable = utils.which(os.path.join(interpreter_dir, name)) or utils.which(name)
    if not executable:
        return None
    java_opts = " ".join(jvm_opts)
    tool_args = " ".join([str(x) for x in params])
    return "unset JAVA_HOME && export PATH=%s:$PATH && %s %s %s" % (
        os.path.dirname(executable), executable, java_opts, tool_args)
def _find_executable(name):
    """Locate ``name`` on PATH, or next to the running Python interpreter.

    Returns the path found, or None when neither location has it.
    """
    on_path = utils.which(name)
    if on_path:
        return on_path
    beside_python = os.path.join(os.path.dirname(sys.executable), name)
    return beside_python if os.path.exists(beside_python) else None
def _prep_config(items, paired, work_dir):
    """Configure a Manta run, producing ``runWorkflow.py`` inside ``work_dir``.

    Configuration is skipped when the workflow script already exists.
    Returns the path to the workflow script.
    """
    assert utils.which("configManta.py"), "Could not find installed configManta.py"
    workflow_script = os.path.join(work_dir, "runWorkflow.py")
    if utils.file_exists(workflow_script):
        return workflow_script

    config_cmd = [sys.executable, utils.which("configManta.py")]
    # Input BAMs: one --bam per sample, tumor/normal pair, or tumor only
    if not paired:
        config_cmd.extend(["--bam=%s" % dd.get_align_bam(item) for item in items])
    elif paired.normal_bam:
        config_cmd.append("--normalBam=%s" % paired.normal_bam)
        config_cmd.append("--tumorBam=%s" % paired.tumor_bam)
    else:
        config_cmd.append("--tumorBam=%s" % paired.tumor_bam)

    # Remaining settings are read from the tumor sample or the first item
    data = paired.tumor_data if paired else items[0]
    config_cmd.append("--referenceFasta=%s" % dd.get_ref_file(data))
    config_cmd.append("--runDir=%s" % work_dir)
    if dd.get_coverage_interval(data) != "genome":
        config_cmd.append("--exome")
    do.run(config_cmd, "Configure manta SV analysis")
    return workflow_script
def gatk_cmd(name, jvm_opts, params, config=None):
    """Build the shell command for gatk using locally installed java.

    For ``name == "gatk"`` the command is delegated to ``_gatk4_cmd`` unless
    the configuration lists ``gatk4`` in its disabled tools, in which case
    the ``gatk3`` executable is used instead. The executable is looked up
    next to the running Python interpreter and then on PATH. Returns None
    when it cannot be found.
    """
    if name == "gatk":
        # Accept either a bare configuration dictionary or a full sample
        needs_wrapping = isinstance(config, dict) and "config" not in config
        data = {"config": config} if needs_wrapping else config
        if not data or "gatk4" not in dd.get_tools_off(data):
            return _gatk4_cmd(jvm_opts, params, data)
        name = "gatk3"
    interpreter_dir = os.path.dirname(os.path.realpath(sys.executable))
    # Prefer the copy installed alongside Python, otherwise search PATH
    executable = utils.which(os.path.join(interpreter_dir, name)) or utils.which(name)
    if not executable:
        return None
    clear_home = utils.clear_java_home()
    java_bin = utils.get_java_binpath(executable)
    java_opts = " ".join(jvm_opts)
    tool_args = " ".join([str(x) for x in params])
    return "%s && export PATH=%s:\"$PATH\" && %s %s %s" % (
        clear_home, java_bin, executable, java_opts, tool_args)
def _configure_somatic(paired, ref_file, region, out_file, tx_work_dir):
    """Configure a Strelka2 somatic (tumor/normal) run in ``tx_work_dir``.

    Adds ``--targeted`` when the tumor sample's coverage interval is not
    "genome". Returns the path to the generated ``runWorkflow.py``.
    """
    utils.safe_makedir(tx_work_dir)
    cmd = [sys.executable, os.path.realpath(utils.which("configureStrelkaSomaticWorkflow.py"))]
    cmd += ["--referenceFasta=%s" % ref_file,
            "--callRegions=%s" % _get_region_bed(region, [paired.tumor_data, paired.normal_data], out_file),
            "--runDir=%s" % tx_work_dir,
            "--normalBam=%s" % paired.normal_bam, "--tumorBam=%s" % paired.tumor_bam]
    if dd.get_coverage_interval(paired.tumor_data) not in ["genome"]:
        cmd += ["--targeted"]
    # Message previously said "germline" (copied from the germline variant)
    do.run(cmd, "Configure Strelka2 somatic calling: %s" % paired.tumor_name)
    return os.path.join(tx_work_dir, "runWorkflow.py")
def _configure_germline(align_bams, items, ref_file, region, out_file, tx_work_dir):
    """Configure a Strelka2 germline run in ``tx_work_dir``.

    Adds ``--targeted`` when any sample's coverage interval is not "genome".
    Returns the path to the generated ``runWorkflow.py``.
    """
    utils.safe_makedir(tx_work_dir)
    config_script = os.path.realpath(utils.which("configureStrelkaGermlineWorkflow.py"))
    config_cmd = [sys.executable, config_script]
    region_bed = _get_region_bed(region, items, out_file)
    ploidy = _get_ploidy(shared.to_multiregion(region), items, out_file)
    config_cmd.append("--referenceFasta=%s" % ref_file)
    config_cmd.append("--callRegions=%s" % region_bed)
    config_cmd.append("--ploidy=%s" % ploidy)
    config_cmd.append("--runDir=%s" % tx_work_dir)
    config_cmd.extend(["--bam=%s" % bam_file for bam_file in align_bams])
    if any(dd.get_coverage_interval(item) != "genome" for item in items):
        config_cmd.append("--targeted")
    sample_names = ", ".join([dd.get_sample_name(item) for item in items])
    do.run(config_cmd, "Configure Strelka2 germline calling: %s" % sample_names)
    return os.path.join(tx_work_dir, "runWorkflow.py")
def _configure_somatic(paired, ref_file, region, out_file, tx_work_dir):
    """Configure a Strelka2 somatic (tumor/normal) run in ``tx_work_dir``.

    Adds ``--targeted`` when ``_is_targeted_region`` flags the region BED
    for the tumor sample. Returns the path to the generated ``runWorkflow.py``.
    """
    utils.safe_makedir(tx_work_dir)
    cmd = [utils.get_program_python("configureStrelkaSomaticWorkflow.py"),
           os.path.realpath(utils.which("configureStrelkaSomaticWorkflow.py"))]
    cur_bed = get_region_bed(region, [paired.tumor_data, paired.normal_data], out_file)
    cmd += ["--referenceFasta=%s" % ref_file,
            "--callRegions=%s" % cur_bed,
            "--runDir=%s" % tx_work_dir,
            "--normalBam=%s" % paired.normal_bam, "--tumorBam=%s" % paired.tumor_bam]
    if _is_targeted_region(cur_bed, paired.tumor_data):
        cmd += ["--targeted"]
    # Message previously said "germline" (copied from the germline variant)
    do.run(cmd, "Configure Strelka2 somatic calling: %s" % paired.tumor_name)
    return os.path.join(tx_work_dir, "runWorkflow.py")
def _configure_germline(align_bams, items, ref_file, region, out_file, tx_work_dir):
    """Configure a Strelka2 germline run in ``tx_work_dir``.

    Adds ``--targeted`` when ``_is_targeted_region`` flags the region BED
    for the first sample. Returns the path to the generated ``runWorkflow.py``.
    """
    utils.safe_makedir(tx_work_dir)
    script_name = "configureStrelkaGermlineWorkflow.py"
    config_cmd = [utils.get_program_python(script_name),
                  os.path.realpath(utils.which(script_name))]
    region_bed = get_region_bed(region, items, out_file)
    ploidy = _get_ploidy(shared.to_multiregion(region), items, out_file)
    config_cmd.append("--referenceFasta=%s" % ref_file)
    config_cmd.append("--callRegions=%s" % region_bed)
    config_cmd.append("--ploidy=%s" % ploidy)
    config_cmd.append("--runDir=%s" % tx_work_dir)
    config_cmd.extend(["--bam=%s" % bam_file for bam_file in align_bams])
    if _is_targeted_region(region_bed, items[0]):
        config_cmd.append("--targeted")
    sample_names = ", ".join([dd.get_sample_name(item) for item in items])
    do.run(config_cmd, "Configure Strelka2 germline calling: %s" % sample_names)
    return os.path.join(tx_work_dir, "runWorkflow.py")
def run(data):
    """HLA typing with bwakit, parsing output from called genotype files.

    Looks for files starting with ``<alignment BAM name>.hla.`` in an ``hla``
    directory next to the alignment BAM. When any exist, the organized calls
    file is stored in ``data["hla"]`` under the ``calls`` key. Returns ``data``.
    """
    bwakit_dir = os.path.dirname(os.path.realpath(utils.which("run-bwamem")))
    align_file = dd.get_align_bam(data)
    hla_base = os.path.join(utils.safe_makedir(os.path.join(os.path.dirname(align_file), "hla")),
                            os.path.basename(align_file) + ".hla")
    # Only proceed when HLA output files are present for this alignment
    if len(glob.glob(hla_base + ".*")) > 0:
        out_file = hla_base + ".top"
        if not utils.file_exists(out_file):
            # NOTE(review): the run-HLA invocation below is disabled, which
            # leaves `cmd` unused -- confirm whether it should be re-enabled
            # or removed.
            cmd = "{bwakit_dir}/run-HLA {hla_base}"
            #do.run(cmd.format(**locals()), "HLA typing with bwakit")
            out_file = _organize_calls(out_file, hla_base, data)
        data["hla"] = {"calls": out_file}
    return data
def run(data):
    """Type HLA alleles with bwakit from the sample's HLA fastq files.

    When ``data["hla"]["fastq"]`` lists files, their common prefix (without
    trailing dots) is used as the run-HLA base name. The resulting call file
    and the caller name are recorded in ``data["hla"]``. Returns ``data``.
    """
    bwakit_dir = os.path.dirname(os.path.realpath(utils.which("run-bwamem")))
    hla_fqs = tz.get_in(["hla", "fastq"], data, [])
    if len(hla_fqs) == 0:
        return data
    # Shared prefix of the fastq names, dropping any trailing dots
    hla_base = os.path.commonprefix(hla_fqs).rstrip(".")
    call_file = hla_base + ".top"
    if not utils.file_exists(call_file):
        hla_cmd = "{bwakit_dir}/run-HLA {hla_base}".format(
            bwakit_dir=bwakit_dir, hla_base=hla_base)
        do.run(hla_cmd, "HLA typing with bwakit")
        call_file = _organize_calls(call_file, hla_base, data)
    data["hla"].update({"call_file": call_file, "hlacaller": "bwakit"})
    return data
def prepare_rsem_reference(gtf, multifasta, build):
    """Build an RSEM reference in a temporary directory that is kept on disk.

    gtf: path to GTF file (must have gene_id and transcript_id)
    multifasta: path to multifasta file
    build: name of organism build (e.g. hg19)

    Returns the reference directory, or None when ``rsem-prepare-reference``
    is not installed.
    """
    if not which("rsem-prepare-reference"):
        logger.info("Skipping prepping RSEM reference because rsem-prepare-reference could "
                    "not be found.")
        return None

    prepare_cmd = "rsem-prepare-reference --gtf {gtf} {multifasta} {build}".format(
        gtf=gtf, multifasta=multifasta, build=build)
    message = "Preparing rsem reference from %s" % gtf
    # The command runs from inside the temporary directory
    with tx_tmpdir(remove=False) as rsem_genome_dir:
        with chdir(rsem_genome_dir):
            do.run(prepare_cmd, message)
    return rsem_genome_dir
def prepare_rsem_reference(gtf, multifasta, build):
    """Build an RSEM reference in a temporary directory that is kept on disk.

    gtf: path to GTF file (must have gene_id and transcript_id)
    multifasta: path to multifasta file
    build: name of organism build (e.g. hg19)

    Returns the reference directory, or None when ``rsem-prepare-reference``
    is not installed.
    """
    rsem_available = utils.which("rsem-prepare-reference")
    if not rsem_available:
        logger.info("Skipping prepping RSEM reference because "
                    "rsem-prepare-reference could not be found.")
        return None

    prepare_cmd = PREPARE_REFERENCE.format(gtf=gtf, multifasta=multifasta, build=build)
    description = "Preparing rsem reference from %s" % gtf
    # The command runs from inside the temporary directory
    with transaction.tx_tmpdir(remove=False) as rsem_genome_dir:
        with utils.chdir(rsem_genome_dir):
            do.run(prepare_cmd, description)
    return rsem_genome_dir
def _configure_somatic(paired, ref_file, region, out_file, tx_work_dir):
    """Configure a Strelka2 somatic (tumor/normal) run in ``tx_work_dir``.

    Adds ``--targeted`` when the tumor sample's coverage interval is not
    "genome". Returns the path to the generated ``runWorkflow.py``.
    """
    utils.safe_makedir(tx_work_dir)
    cmd = [
        sys.executable,
        os.path.realpath(utils.which("configureStrelkaSomaticWorkflow.py"))
    ]
    cmd += [
        "--referenceFasta=%s" % ref_file,
        "--callRegions=%s" % _get_region_bed(
            region, [paired.tumor_data, paired.normal_data], out_file),
        "--runDir=%s" % tx_work_dir,
        "--normalBam=%s" % paired.normal_bam,
        "--tumorBam=%s" % paired.tumor_bam
    ]
    if dd.get_coverage_interval(paired.tumor_data) not in ["genome"]:
        cmd += ["--targeted"]
    # Message previously said "germline" (copied from the germline variant)
    do.run(cmd, "Configure Strelka2 somatic calling: %s" % paired.tumor_name)
    return os.path.join(tx_work_dir, "runWorkflow.py")
def _get_bwa_mem_cmd(data, out_file, ref_file, fastq1, fastq2=""):
    """Build the piped ``bwa mem`` command line for a sample.

    When ``<ref_file>.alt`` exists, output is additionally piped through
    bwakit's ``bwa-postalt.js`` for alternative alleles in GRCh38 + HLA typing.
    With the ``sentieon-bwa`` aligner, the sentieon executable is used and
    its license export is prefixed to the command.

    Commands for HLA post-processing:
       base=TEST
       run-HLA $base.hla > $base.hla.top
       cat $base.hla.HLA*.gt | grep ^GT | cut -f2- > $base.hla.all
       rm -f $base.hla.HLA*gt
       rm -f $base.hla.HLA*gz
    """
    alt_file = ref_file + ".alt"
    if utils.file_exists(alt_file):
        bwakit_dir = os.path.dirname(os.path.realpath(utils.which("run-bwamem")))
        hla_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "hla"))
        hla_base = os.path.join(hla_dir, os.path.basename(out_file) + ".hla")
        alt_cmd = " | {bwakit_dir}/k8 {bwakit_dir}/bwa-postalt.js -p {hla_base} {alt_file}".format(
            bwakit_dir=bwakit_dir, hla_base=hla_base, alt_file=alt_file)
    else:
        alt_cmd = ""
    if dd.get_aligner(data) == "sentieon-bwa":
        bwa_exe, exports = "sentieon-bwa", sentieon.license_export(data)
    else:
        bwa_exe, exports = "bwa", ""
    bwa = config_utils.get_program(bwa_exe, data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    bwa_resources = config_utils.get_resources("bwa", data["config"])
    if "options" in bwa_resources:
        bwa_params = " ".join([str(opt) for opt in bwa_resources["options"]])
    else:
        bwa_params = ""
    rg_info = novoalign.get_rg_info(data["rgnames"])
    # "-p" is passed only when no second fastq is supplied
    pairing = "" if fastq2 else "-p"
    # Restrict seed occurrences to 1/2 of default, manage memory usage for centromere repeats in hg38
    # https://sourceforge.net/p/bio-bwa/mailman/message/31514937/
    # http://ehc.ac/p/bio-bwa/mailman/message/32268544/
    mem_usage = "-c 250"
    bwa_cmd = (
        "{exports}{bwa} mem {pairing} {mem_usage} -M -t {num_cores} {bwa_params} -R '{rg_info}' -v 1 "
        "{ref_file} {fastq1} {fastq2} ").format(
        exports=exports, bwa=bwa, pairing=pairing, mem_usage=mem_usage,
        num_cores=num_cores, bwa_params=bwa_params, rg_info=rg_info,
        ref_file=ref_file, fastq1=fastq1, fastq2=fastq2)
    return bwa_cmd + alt_cmd
def _fill_prioritization_targets(data):
    """Swap prioritization settings for globally installed files where needed.

    Applies to the ``svprioritize`` and ``coverage`` algorithm settings. A
    value that is set but is not an existing path is matched against installed
    BED files (and, for ``svprioritize``, files next to
    ``simple_sv_annotation.py``). Raises ValueError if nothing matches.
    Returns the updated ``data``.
    """
    ref_file = dd.get_ref_file(data)
    for target in [["svprioritize"], ["coverage"]]:
        setting = ["config", "algorithm"] + target
        val = tz.get_in(setting, data)
        if val and not os.path.exists(val):
            matches = []
            # Check prioritize directory
            for ext in [".bed", ".bed.gz"]:
                bed_glob = os.path.normpath(
                    os.path.join(os.path.dirname(ref_file), os.pardir,
                                 "coverage", "prioritize", "%s*%s" % (val, ext)))
                matches += glob.glob(bed_glob)
            # Check sv-annotation directory for prioritize gene name lists
            if target[-1] == "svprioritize":
                script_dir = os.path.dirname(
                    os.path.realpath(utils.which("simple_sv_annotation.py")))
                matches += glob.glob(
                    os.path.join(script_dir, "%s*" % os.path.basename(val)))
            num_matches = len(matches)
            if num_matches == 0:
                raise ValueError(
                    "Configuration problem. BED file not found for %s: %s" % (target, val))
            if num_matches == 1:
                installed_val = matches[0]
            else:
                # check for partial matches
                for match in matches:
                    if match.endswith(val + ".bed.gz") or match.endswith(val + ".bed"):
                        installed_val = match
                        break
                else:
                    installed_val = None
                # handle date-stamped inputs
                if not installed_val:
                    installed_val = sorted(matches, reverse=True)[0]
            data = tz.update_in(data, setting, lambda x: installed_val)
    return data
def _configure_germline(align_bams, items, ref_file, region, out_file, tx_work_dir):
    """Configure a Strelka2 germline run in ``tx_work_dir``.

    Adds ``--targeted`` when any sample's coverage interval is not "genome".
    Returns the path to the generated ``runWorkflow.py``.
    """
    utils.safe_makedir(tx_work_dir)
    config_script = os.path.realpath(utils.which("configureStrelkaGermlineWorkflow.py"))
    config_cmd = [sys.executable, config_script]
    region_bed = _get_region_bed(region, items, out_file)
    ploidy = _get_ploidy(region, items, out_file)
    config_cmd.append("--referenceFasta=%s" % ref_file)
    config_cmd.append("--callRegions=%s" % region_bed)
    config_cmd.append("--ploidy=%s" % ploidy)
    config_cmd.append("--runDir=%s" % tx_work_dir)
    config_cmd.extend(["--bam=%s" % bam_file for bam_file in align_bams])
    if any(dd.get_coverage_interval(item) != "genome" for item in items):
        config_cmd.append("--targeted")
    sample_names = ", ".join([dd.get_sample_name(item) for item in items])
    do.run(config_cmd, "Configure Strelka2 germline calling: %s" % sample_names)
    return os.path.join(tx_work_dir, "runWorkflow.py")
def _configure_somatic(paired, ref_file, region, out_file, tx_work_dir):
    """Configure a Strelka2 somatic (tumor/normal) run in ``tx_work_dir``.

    Configuration only happens when ``get_region_bed`` yields a region BED;
    otherwise nothing is run and None is returned. Adds ``--targeted`` when
    ``_is_targeted_region`` flags the BED for the tumor sample. Returns the
    path to the generated ``runWorkflow.py`` on success.
    """
    utils.safe_makedir(tx_work_dir)
    cmd = [
        utils.get_program_python("configureStrelkaSomaticWorkflow.py"),
        os.path.realpath(utils.which("configureStrelkaSomaticWorkflow.py"))
    ]
    cur_bed = get_region_bed(region, [paired.tumor_data, paired.normal_data], out_file)
    if not cur_bed:
        # No callable regions: skip configuration entirely
        return None
    cmd += [
        "--referenceFasta=%s" % ref_file,
        "--callRegions=%s" % cur_bed,
        "--runDir=%s" % tx_work_dir,
        "--normalBam=%s" % paired.normal_bam,
        "--tumorBam=%s" % paired.tumor_bam
    ]
    if _is_targeted_region(cur_bed, paired.tumor_data):
        cmd += ["--targeted"]
    # Message previously said "germline" (copied from the germline variant)
    do.run(cmd, "Configure Strelka2 somatic calling: %s" % paired.tumor_name)
    return os.path.join(tx_work_dir, "runWorkflow.py")
def _configure_germline(align_bams, items, ref_file, region, out_file, tx_work_dir):
    """Configure a Strelka2 germline run in ``tx_work_dir``.

    Adds ``--targeted`` when ``_is_targeted_region`` flags the region BED
    for the first sample. Returns the path to the generated ``runWorkflow.py``.
    """
    utils.safe_makedir(tx_work_dir)
    config_script = os.path.realpath(utils.which("configureStrelkaGermlineWorkflow.py"))
    config_cmd = [sys.executable, config_script]
    region_bed = get_region_bed(region, items, out_file)
    ploidy = _get_ploidy(shared.to_multiregion(region), items, out_file)
    config_cmd.append("--referenceFasta=%s" % ref_file)
    config_cmd.append("--callRegions=%s" % region_bed)
    config_cmd.append("--ploidy=%s" % ploidy)
    config_cmd.append("--runDir=%s" % tx_work_dir)
    config_cmd.extend(["--bam=%s" % bam_file for bam_file in align_bams])
    if _is_targeted_region(region_bed, items[0]):
        config_cmd.append("--targeted")
    sample_names = ", ".join([dd.get_sample_name(item) for item in items])
    do.run(config_cmd, "Configure Strelka2 germline calling: %s" % sample_names)
    return os.path.join(tx_work_dir, "runWorkflow.py")
def _create_pileup(bam_file, data, out_base, background):
    """Create pileup calls in the regions of interest for hg19 -> GRCh37 chromosome mapping.

    The background BED shipped with verifybamid2 is copied locally with a
    "chr" prefix on every line, used as the mpileup target regions, and the
    prefix is stripped from the pileup output again. Returns the pileup file,
    which is reused when it already exists.
    """
    out_file = "%s-mpileup.txt" % out_base
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        resource_dir = os.path.join(
            os.path.dirname(os.path.realpath(utils.which("verifybamid2"))), "resource")
        bed_name = "%s.%s.%s.vcf.gz.dat.bed" % (
            background["dataset"], background["nvars"], background["build"])
        background_bed = os.path.normpath(os.path.join(resource_dir, bed_name))
        local_bed = os.path.join(
            os.path.dirname(out_base),
            "%s.%s-hg19.bed" % (background["dataset"], background["nvars"]))
        if not utils.file_exists(local_bed):
            # Prefix every line with "chr" in the local copy
            with file_transaction(data, local_bed) as tx_local_bed:
                with open(background_bed) as in_handle:
                    with open(tx_local_bed, "w") as out_handle:
                        for bed_line in in_handle:
                            out_handle.write("chr%s" % bed_line)
        mpileup_cl = samtools.prep_mpileup([bam_file], dd.get_ref_file(data), data["config"],
                                           want_bcf=False, target_regions=local_bed)
        pileup_cmd = "{mpileup_cl} | sed 's/^chr//' > {tx_out_file}".format(
            mpileup_cl=mpileup_cl, tx_out_file=tx_out_file)
        do.run(pileup_cmd, "Create pileup from BAM input")
    return out_file
def _run_funnel(args):
    """Run a CWL workflow with rabix bunny against a local funnel TES server.

    A copy of the rabix configuration is written to the work directory with
    the embedded backend type set to TES. A funnel server is started on
    localhost:8088 for the duration of the rabix run and killed afterwards.
    """
    host = "localhost"
    port = "8088"
    main_file, json_file, project_name = _get_main_and_json(args.directory)
    work_dir = utils.safe_makedir(os.path.join(os.getcwd(), "funnel_work"))
    log_file = os.path.join(work_dir, "%s-funnel.log" % project_name)
    # Create bunny configuration directory with TES backend
    rabix_dir = os.path.dirname(os.path.realpath(utils.which("rabix")))
    orig_config_dir = os.path.join(rabix_dir, "config")
    work_config_dir = utils.safe_makedir(os.path.join(work_dir, "rabix_config"))
    for fname in os.listdir(orig_config_dir):
        src = os.path.join(orig_config_dir, fname)
        dst = os.path.join(work_config_dir, fname)
        if fname != "core.properties":
            shutil.copy(src, dst)
            continue
        # Rewrite the backend type line, copy everything else unchanged
        with open(src) as in_handle:
            with open(dst, "w") as out_handle:
                for line in in_handle:
                    if line.startswith("backend.embedded.types"):
                        line = "backend.embedded.types=TES\n"
                    out_handle.write(line)
    flags = ["-c", work_config_dir,
             "-tes-url=http://%s:%s" % (host, port),
             "-tes-storage=%s" % work_dir]
    if args.no_container:
        _remove_bcbiovm_path()
        flags.append("--no-container")
    rabix_cmd = ["rabix"] + flags + [main_file, json_file]
    funnel_cmd = ["funnel", "server", "run",
                  "--Server.HostName", host, "--Server.HTTPPort", port,
                  "--LocalStorage.AllowedDirs", work_dir,
                  "--Worker.WorkDir", os.path.join(work_dir, "funnel-work")]
    funnelp = subprocess.Popen(funnel_cmd)
    try:
        with utils.chdir(work_dir):
            _run_tool(rabix_cmd, not args.no_container, work_dir, log_file)
    finally:
        # Always stop the server, even when the rabix run fails
        funnelp.kill()
def _get_ericscript_path(self):
    """Retrieve PATH to the isolated ericscript anaconda environment.

    Looks up ``self.EXECUTABLE`` inside the bcbio bin directory, resolves
    symlinks on the result and returns the directory holding the real file.
    """
    exe_in_bcbio_bin = os.path.join(utils.get_bcbio_bin(), self.EXECUTABLE)
    resolved_exe = os.path.realpath(utils.which(exe_in_bcbio_bin))
    return os.path.dirname(resolved_exe)
def _get_cmd():
    """Return the base MetaSV command: the python for run_metasv.py, then the script path.
    """
    script_name = "run_metasv.py"
    interpreter = utils.get_program_python(script_name)
    script_path = utils.which(script_name)
    return [interpreter, script_path]
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.

    Args:
        caller: caller name; used in output file names and as the CALLER
            column of the tab delimited output.
        vcf_file: input VCF, ending in ``.vcf.gz`` or ``.vcf``.
        prioritize_by: passed to ``_find_gene_list_from_bed`` and, when no
            gene list comes back, to ``bcbio-prioritize known`` via ``-k``.
        post_prior_fn: optional callable invoked as
            ``post_prior_fn(simple_vcf, work_dir, data)``; its return value
            replaces ``simple_vcf``.
        work_dir: directory where the outputs are written.
        data: sample dictionary; ``data["config"]`` supplies tool resources.

    Returns:
        Tuple of ``(out_file, simple_vcf)``: the prioritized TSV and the
        simplified, sorted, bgzipped VCF it was derived from.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    simple_vcf = os.path.join(work_dir, "%s-%s-simple.vcf.gz" % (sample, caller))
    if not utils.file_exists(simple_vcf):
        gene_list = _find_gene_list_from_bed(prioritize_by, out_file, data)
        # If we have a standard gene list we can skip BED based prioritization
        priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
        if gene_list:
            if vcf_file.endswith(".vcf.gz"):
                utils.symlink_plus(vcf_file, priority_vcf)
            else:
                assert vcf_file.endswith(".vcf")
                # Link the uncompressed input under the matching name, then
                # bgzip and index it while keeping the original in place.
                uncompressed_vcf = priority_vcf.replace(".vcf.gz", ".vcf")
                utils.symlink_plus(vcf_file, uncompressed_vcf)
                vcfutils.bgzip_and_index(uncompressed_vcf, data["config"],
                                         remove_orig=False)
        # otherwise prioritize based on BED and proceed
        else:
            if not utils.file_exists(priority_vcf):
                with file_transaction(data, priority_vcf) as tx_out_file:
                    resources = config_utils.get_resources("bcbio_prioritize", data["config"])
                    jvm_opts = resources.get("jvm_opts", ["-Xms1g", "-Xmx4g"])
                    jvm_opts = config_utils.adjust_opts(
                        jvm_opts,
                        {"algorithm": {"memory_adjust": {"direction": "increase",
                                                         "maximum": "30000M",
                                                         "magnitude": dd.get_cores(data)}}})
                    jvm_opts = " ".join(jvm_opts)
                    export = utils.local_path_export()
                    # The template is filled from the local variables above.
                    cmd = ("{export} bcbio-prioritize {jvm_opts} known -i {vcf_file} -o {tx_out_file} "
                           " -k {prioritize_by}")
                    do.run(cmd.format(**locals()), "Prioritize: select in known regions of interest")

        data_dir = os.path.dirname(os.path.realpath(utils.which("simple_sv_annotation.py")))
        with file_transaction(data, simple_vcf) as tx_out_file:
            fusion_file = os.path.join(data_dir, "fusion_pairs.txt")
            opts = ""
            if os.path.exists(fusion_file):
                opts += " --known_fusion_pairs %s" % fusion_file
            if not gene_list:
                # Fall back to the panel shipped next to simple_sv_annotation.py
                opts += " --gene_list %s" % os.path.join(data_dir, "az-cancer-panel.txt")
            else:
                # Single-line literal: no newline may leak into the shell command.
                opts += " --gene_list %s" % gene_list
            cmd = "simple_sv_annotation.py {opts} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
            do.run(cmd.format(**locals()), "Prioritize: simplified annotation output")
    simple_vcf = vcfutils.bgzip_and_index(vcfutils.sort_by_ref(simple_vcf, data), data["config"])
    if post_prior_fn:
        simple_vcf = post_prior_fn(simple_vcf, work_dir, data)
    if not utils.file_uptodate(out_file, simple_vcf):
        with file_transaction(data, out_file) as tx_out_file:
            export = utils.local_path_export(env_cmd="vawk")
            # Keep PASS/unfiltered records where the sample genotype is not 0/0.
            cmd = ("{export} zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                   """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                   "print CALLER,SNAME,$1,$2,I$END,"
                   """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                   "I$LOF,I$SIMPLE_ANN,"
                   "S${sample}$SR,S${sample}$PE,S${sample}$PR}}' > {tx_out_file}")
            do.run(cmd.format(**locals()), "Prioritize: convert to tab delimited")
    return out_file, simple_vcf
def _choose_htseq_count_executable(data):
    """Resolve the htseq-count executable through ``which``.

    The command name is read from ``data["config"]`` under
    resources -> htseq-count -> cmd via ``get_in``, with ``"htseq-count"``
    passed as the third argument (presumably the fallback; confirm against
    ``get_in``).
    """
    config_keys = ("resources", "htseq-count", "cmd")
    configured_cmd = get_in(data["config"], config_keys, "htseq-count")
    return which(configured_cmd)
def _gatk4_cmd(jvm_opts, params, data):
    """Retrieve unified command for GATK4, using gatk-launch.

    ``gatk-launch`` is looked up next to the running python interpreter and
    its directory is prepended to PATH in the returned shell command.
    ``jvm_opts`` are space-joined into ``--javaOptions``; ``params`` are
    stringified and appended. ``data`` is not used here.
    """
    python_bin_dir = os.path.dirname(os.path.realpath(sys.executable))
    launcher = utils.which(os.path.join(python_bin_dir, "gatk-launch"))
    launcher_dir = os.path.dirname(launcher)
    java_options = " ".join(jvm_opts)
    launch_args = " ".join(str(param) for param in params)
    template = "unset JAVA_HOME && export PATH=%s:$PATH && gatk-launch --javaOptions '%s' %s"
    return template % (launcher_dir, java_options, launch_args)