def _do_run(paired):
    """Perform Battenberg calling with the paired dataset.

    This purposely does not use a temporary directory for the output
    since Battenberg does smart restarts.

    The Battenberg command only runs when some of the expected outputs are
    missing. Returns the dictionary from ``_get_battenberg_out`` with the
    ignore file path added under the ``"ignore"`` key.
    """
    work_dir = _sv_workdir(paired.tumor_data)
    ignore_file = os.path.join(work_dir, "ignore_chromosomes.txt")
    out = _get_battenberg_out(paired, work_dir)
    if len(_missing_files(out)) > 0:
        ref_file = dd.get_ref_file(paired.tumor_data)
        bat_datadir = os.path.normpath(os.path.join(os.path.dirname(ref_file), os.pardir, "battenberg"))
        ignore_file = _make_ignore_file(work_dir, ref_file,
                                        os.path.join(bat_datadir, "impute", "impute_info.txt"),
                                        ignore_file)
        local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                     "lib", "R", "site-library")
        perllib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                               "lib", "perl5")
        tumor_bam = paired.tumor_bam
        normal_bam = paired.normal_bam
        platform = dd.get_platform(paired.tumor_data)
        genome_build = paired.tumor_data["genome_build"]
        # scale cores to avoid over-using memory during imputation
        cores = max(1, int(dd.get_num_cores(paired.tumor_data) * 0.5))
        # Every step is chained with "&&": without it after the PERL5LIB
        # export, battenberg.pl and its flags would be parsed as extra
        # arguments to `export` and the tool would never run.
        cmd = ("export R_LIBS_USER={local_sitelib} && "
               "export PERL5LIB={perllib}:$PERL5LIB && "
               "battenberg.pl -t {cores} -o {work_dir} -r {ref_file}.fai "
               "-tb {tumor_bam} -nb {normal_bam} -e {bat_datadir}/impute/impute_info.txt "
               "-u {bat_datadir}/1000genomesloci -c {bat_datadir}/probloci.txt "
               "-ig {ignore_file} "
               "-assembly {genome_build} -species Human -platform {platform}")
        # The placeholders are filled from the local variables defined above.
        do.run(cmd.format(**locals()), "Battenberg CNV calling")
        assert len(_missing_files(out)) == 0, "Missing Battenberg output: %s" % _missing_files(out)
    out["ignore"] = ignore_file
    return out
def _run_bubbletree(vcf_csv, cnv_csv, data):
    """Create R script and run on input data.

    The R script is written next to ``vcf_csv`` on every call; Rscript is
    only invoked when the prevalence output file does not exist yet. A
    failure recognised by ``_allowed_bubbletree_errorstates`` is recorded in
    the prevalence file instead of being raised; other failures are logged
    and re-raised.
    """
    local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                 "lib", "R", "site-library")
    base = utils.splitext_plus(vcf_csv)[0]
    r_file = "%s-run.R" % base
    bubbleplot_out = "%s-bubbleplot.pdf" % base
    trackplot_out = "%s-trackplot.pdf" % base
    calls_out = "%s-calls.rds" % base
    freqs_out = "%s-bubbletree_prevalence.txt" % base
    sample = dd.get_sample_name(data)
    # The template is filled from locals(), so the variable names above are
    # what the _script placeholders can refer to; do not rename them.
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))
    if not utils.file_exists(freqs_out):
        try:
            do.run([utils.Rscript_cmd(), r_file], "Assess heterogeneity with BubbleTree")
        # "as" form works on Python 2.6+ and 3; the comma form is Python 2 only
        except subprocess.CalledProcessError as msg:
            if _allowed_bubbletree_errorstates(str(msg)):
                with open(freqs_out, "w") as out_handle:
                    out_handle.write('bubbletree failed:\n %s"\n' % (str(msg)))
            else:
                logger.exception()
                raise
def _run_cnvkit_shared(data, test_bams, background_bams, access_file, work_dir, background_name=None):
    """Run ``cnvkit.py batch`` for the test BAMs against the background BAMs.

    Work happens in a transactional temporary directory that is moved to
    ``<work_dir>/raw`` once the batch run finishes. Nothing is re-run when the
    ``.cnr`` file for the first test BAM already exists there.

    Returns a dictionary with the ``cnr``, ``cns`` and ``back_cnn`` paths.
    """
    ref_file = dd.get_ref_file(data)
    raw_work_dir = os.path.join(work_dir, "raw")
    sample_base = os.path.splitext(os.path.basename(test_bams[0]))[0]
    background_cnn = "%s_background.cnn" % (background_name or "flat")
    cnr_file = os.path.join(raw_work_dir, "%s.cnr" % sample_base)
    if not utils.file_exists(cnr_file):
        # clear out any partial results from a previous attempt
        if os.path.exists(raw_work_dir):
            shutil.rmtree(raw_work_dir)
        with tx_tmpdir(data, work_dir) as tx_work_dir:
            target_bed = tz.get_in(["config", "algorithm", "variant_regions"], data)
            configured_cores = tz.get_in(["config", "algorithm", "num_cores"], data, 1)
            cores = min(configured_cores, len(test_bams) + len(background_bams))
            cnvkit_exe = os.path.join(os.path.dirname(sys.executable), "cnvkit.py")
            input_args = test_bams + ["-n"] + background_bams + ["-f", ref_file]
            region_args = ["--targets", target_bed, "--access", access_file]
            run_args = ["-d", tx_work_dir, "--split", "-p", str(cores),
                        "--output-reference", os.path.join(tx_work_dir, background_cnn)]
            cmd = [cnvkit_exe, "batch"] + input_args + region_args + run_args
            at_avg, at_min, t_avg = _get_antitarget_size(access_file, target_bed)
            if at_avg:
                cmd.extend(["--antitarget-avg-size", str(at_avg)])
                cmd.extend(["--antitarget-min-size", str(at_min)])
                cmd.extend(["--target-avg-size", str(t_avg)])
            tooldir = install.get_defaults().get("tooldir", "/usr/local")
            cmd.extend(["--rlibpath", os.path.join(tooldir, "lib", "R", "site-library")])
            do.run(cmd, "CNVkit batch")
            shutil.move(tx_work_dir, raw_work_dir)
    return {"cnr": cnr_file,
            "cns": os.path.join(raw_work_dir, "%s.cns" % sample_base),
            "back_cnn": os.path.join(raw_work_dir, background_cnn)}
def _run_bubbletree(vcf_csv, cnv_csv, data, wide_lrr=False, do_plots=True, handle_failures=True):
    """Create R script and run on input data.

    BubbleTree has some internal hardcoded parameters that assume a smaller
    distribution of log2 scores. This is not true for tumor-only calls, so if
    we specify wide_lrr we scale the calculations to actually get calls.
    Need a better long term solution with flexible parameters.

    The R script is written on every call; Rscript only runs when the
    prevalence output is missing. With ``handle_failures`` set, a failure
    recognised by ``_allowed_bubbletree_errorstates`` is written to the
    prevalence file instead of being raised.
    """
    lrr_scale = 10.0 if wide_lrr else 1.0
    local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                 "lib", "R", "site-library")
    base = utils.splitext_plus(vcf_csv)[0]
    r_file = "%s-run.R" % base
    bubbleplot_out = "%s-bubbleplot.pdf" % base
    trackplot_out = "%s-trackplot.pdf" % base
    calls_out = "%s-calls.rds" % base
    freqs_out = "%s-bubbletree_prevalence.txt" % base
    sample = dd.get_sample_name(data)
    # the template receives the flag as a "yes"/"no" string
    do_plots = "yes" if do_plots else "no"
    # The template is filled from locals(), so the variable names above are
    # what the _script placeholders can refer to; do not rename them.
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))
    if not utils.file_exists(freqs_out):
        try:
            do.run([utils.Rscript_cmd(), r_file], "Assess heterogeneity with BubbleTree")
        # "as" form works on Python 2.6+ and 3; the comma form is Python 2 only
        except subprocess.CalledProcessError as msg:
            if handle_failures and _allowed_bubbletree_errorstates(str(msg)):
                with open(freqs_out, "w") as out_handle:
                    out_handle.write('bubbletree failed:\n %s"\n' % (str(msg)))
            else:
                logger.exception()
                raise
def _run_on_chrom(chrom, work_bams, names, work_dir, items):
    """Run cn.mops on work BAMs for a specific chromosome.

    The output BED name is built from the first work BAM, the current batch
    (when there is one) and the chromosome, with "all" used when ``chrom`` is
    empty. The R script is written beside the output file and executed with
    the configured Rscript inside a file transaction.
    """
    local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                 "lib", "R", "site-library")
    batch = sshared.get_cur_batch(items)
    ext = "-%s-cnv" % batch if batch else "-cnv"
    out_file = os.path.join(work_dir, "%s%s-%s.bed" % (os.path.splitext(os.path.basename(work_bams[0]))[0],
                                                       ext, chrom if chrom else "all"))
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            rcode = "%s-run.R" % os.path.splitext(out_file)[0]
            with open(rcode, "w") as out_handle:
                out_handle.write(_script.format(prep_str=_prep_load_script(work_bams, names, chrom, items),
                                                out_file=tx_out_file,
                                                local_sitelib=local_sitelib))
            rscript = config_utils.get_program("Rscript", items[0]["config"])
            try:
                do.run([rscript, rcode], "cn.mops CNV detection", items[0], log_error=False)
            # "as" form works on Python 2.6+ and 3; the comma form is Python 2 only
            except subprocess.CalledProcessError as msg:
                # cn.mops errors out if no CNVs found. Just write an empty file.
                if _allowed_cnmops_errorstates(str(msg)):
                    with open(tx_out_file, "w") as out_handle:
                        out_handle.write('track name=empty description="No CNVs found"\n')
                else:
                    logger.exception()
                    raise
def _run_cnvkit_shared(data, test_bams, background_bams, work_dir, background_name=None):
    """Run ``cnvkit batch`` for the test BAMs against the background BAMs.

    Work happens in a transactional temporary directory that is moved to
    ``<work_dir>/raw`` after the batch run. Nothing is re-run when the ``.cnr``
    file for the first test BAM already exists.

    Returns a dictionary with ``cnr``, ``cns`` and ``back_cnn`` paths, or an
    empty dictionary when no target regions file is available. Raises IOError
    if the ``cnr`` or ``cns`` file is missing afterwards.
    """
    ref_file = dd.get_ref_file(data)
    raw_work_dir = os.path.join(work_dir, "raw")
    sample_base = os.path.splitext(os.path.basename(test_bams[0]))[0].split(".")[0]
    background_cnn = "%s_background.cnn" % (background_name or "flat")
    files = {"back_cnn": os.path.join(raw_work_dir, background_cnn)}
    for ext in ("cnr", "cns"):
        files[ext] = os.path.join(raw_work_dir, "%s.%s" % (sample_base, ext))
    if not utils.file_exists(files["cnr"]):
        # clear out any partial results from a previous attempt
        if os.path.exists(raw_work_dir):
            shutil.rmtree(raw_work_dir)
        with tx_tmpdir(data, work_dir) as tx_work_dir:
            cov_interval = dd.get_coverage_interval(data)
            raw_target_bed, access_bed = _get_target_access_files(cov_interval, data, work_dir)
            # bail out if we ended up with no regions
            if not utils.file_exists(raw_target_bed):
                return {}
            target_bed = annotate.add_genes(raw_target_bed, data)
            # Do not parallelize cnvkit due to current issues with multi-processing
            cores = 1
            input_args = test_bams + ["-n"] + background_bams + ["-f", ref_file]
            region_args = ["--targets", target_bed, "--access", access_bed]
            run_args = ["-d", tx_work_dir, "--split", "-p", str(cores),
                        "--output-reference", os.path.join(tx_work_dir, background_cnn)]
            cmd = [_get_cmd(), "batch"] + input_args + region_args + run_args
            if cov_interval not in ["amplicon", "genome"]:
                at_avg, at_min, t_avg = _get_antitarget_size(access_bed, target_bed)
                if at_avg:
                    cmd.extend(["--antitarget-avg-size", str(at_avg)])
                    cmd.extend(["--antitarget-min-size", str(at_min)])
                    cmd.extend(["--target-avg-size", str(t_avg)])
            tooldir = install.get_defaults().get("tooldir", "/usr/local")
            cmd.extend(["--rlibpath", os.path.join(tooldir, "lib", "R", "site-library")])
            do.run(cmd, "CNVkit batch")
            shutil.move(tx_work_dir, raw_work_dir)
    for ftype in ["cnr", "cns"]:
        expected = files[ftype]
        if not os.path.exists(expected):
            raise IOError("Missing CNVkit %s file: %s" % (ftype, expected))
    return files
def _get_brew_versions():
    """Retrieve versions of tools installed via brew.

    Tries ``brew which`` first and falls back to ``brew list --versions`` when
    that command exits with an error. When the brew executable cannot be
    launched at all the result is an empty dictionary.

    Returns a dictionary mapping tool name to version string.
    """
    from bcbio import install
    tooldir = install.get_defaults().get("tooldir")
    if tooldir:
        brew_cmd = os.path.join(tooldir, "bin", "brew")
    else:
        brew_cmd = "brew"
    try:
        vout = subprocess.check_output([brew_cmd, "which"])
        uses_which = True
    except subprocess.CalledProcessError:
        vout = subprocess.check_output([brew_cmd, "list", "--versions"])
        uses_which = False
    except OSError:
        # brew not installed/used
        vout = ""
    versions = {}
    for raw_line in vout.split("\n"):
        if not raw_line.strip():
            continue
        line = raw_line.rstrip()
        if uses_which:
            # "name: version" output
            name, version = line.split(": ")
        else:
            # "name [older versions ...] version" output; keep the last one
            fields = line.split()
            name, version = fields[0], fields[-1]
        versions[name] = version
    return versions
def _run_on_chrom(chrom, work_bams, names, work_dir, items):
    """Run cn.mops on work BAMs for a specific chromosome.

    The output BED name is built from the first work BAM and the chromosome,
    with "all" used when ``chrom`` is empty. The R script is written beside
    the output file and executed with the configured Rscript inside a file
    transaction.
    """
    local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                 "lib", "R", "site-library")
    out_file = os.path.join(work_dir, "%s-%s-cnv.bed" % (os.path.splitext(os.path.basename(work_bams[0]))[0],
                                                         chrom if chrom else "all"))
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            rcode = "%s-run.R" % os.path.splitext(out_file)[0]
            with open(rcode, "w") as out_handle:
                out_handle.write(_script.format(prep_str=_prep_load_script(work_bams, names, chrom, items),
                                                out_file=tx_out_file,
                                                local_sitelib=local_sitelib))
            rscript = config_utils.get_program("Rscript", items[0]["config"])
            try:
                do.run([rscript, rcode], "cn.mops CNV detection", items[0], log_error=False)
            # "as" form works on Python 2.6+ and 3; the comma form is Python 2 only
            except subprocess.CalledProcessError as msg:
                # cn.mops errors out if no CNVs found. Just write an empty file.
                if _allowed_cnmops_errorstates(str(msg)):
                    with open(tx_out_file, "w") as out_handle:
                        out_handle.write('track name=empty description="No CNVs found"\n')
                else:
                    logger.exception()
                    raise
def _run_bubbletree(vcf_csv, cnv_csv, data, has_normal=True):
    """Create R script and run on input data.

    The R script is written on every call; Rscript only runs when the
    prevalence output is missing. A failure recognised by
    ``_allowed_bubbletree_errorstates`` is written to the prevalence file
    instead of being raised; other failures are logged and re-raised.
    """
    local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                 "lib", "R", "site-library")
    base = utils.splitext_plus(vcf_csv)[0]
    r_file = "%s-run.R" % base
    bubbleplot_out = "%s-bubbleplot.pdf" % base
    trackplot_out = "%s-trackplot.pdf" % base
    calls_out = "%s-calls.rds" % base
    freqs_out = "%s-bubbletree_prevalence.txt" % base
    sample = dd.get_sample_name(data)
    # BubbleTree has some internal hardcoded parameters that assume a smaller
    # distribution of log2 scores. This is not true for tumor-only calls and
    # normal contamination, so we scale the calculations to actually get calls.
    # Need a better long term solution with flexible parameters.
    lrr_scale = 1.0 if has_normal else 10.0
    # The template is filled from locals(), so the variable names above are
    # what the _script placeholders can refer to; do not rename them.
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))
    if not utils.file_exists(freqs_out):
        try:
            do.run([utils.Rscript_cmd(), r_file], "Assess heterogeneity with BubbleTree")
        # "as" form works on Python 2.6+ and 3; the comma form is Python 2 only
        except subprocess.CalledProcessError as msg:
            if _allowed_bubbletree_errorstates(str(msg)):
                with open(freqs_out, "w") as out_handle:
                    out_handle.write('bubbletree failed:\n %s"\n' % (str(msg)))
            else:
                logger.exception()
                raise
def _run_bubbletree(vcf_csv, cnv_csv, data, wide_lrr=False, do_plots=True, handle_failures=True):
    """Write the BubbleTree R script for a sample and run it when needed.

    BubbleTree has internal hardcoded parameters that assume a smaller
    distribution of log2 scores. That does not hold for tumor-only calls, so
    ``wide_lrr`` scales the calculations to actually get calls. A better long
    term solution would expose flexible parameters.

    Rscript only runs when the prevalence file is missing. With
    ``handle_failures`` set, failures recognised by
    ``_allowed_bubbletree_errorstates`` are written to the prevalence file
    rather than raised.

    Returns a dictionary with the caller name, the prevalence report and the
    bubble/track plot paths.
    """
    # NOTE: _script is filled from locals(), so the local names below are what
    # the template placeholders can refer to and must stay as they are.
    lrr_scale = 1.0
    if wide_lrr:
        lrr_scale = 10.0
    local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                 "lib", "R", "site-library")
    base = utils.splitext_plus(vcf_csv)[0]
    sample = dd.get_sample_name(data)
    r_file = "%s-run.R" % base
    freqs_out = "%s-bubbletree_prevalence.txt" % base
    calls_out = "%s-calls.rds" % base
    bubbleplot_out = "%s-bubbleplot.pdf" % base
    trackplot_out = "%s-trackplot.pdf" % base
    # the template receives the flag as a "yes"/"no" string
    do_plots = "yes" if do_plots else "no"
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))
    result = {"caller": "bubbletree",
              "report": freqs_out,
              "plots": {"bubble": bubbleplot_out, "track": trackplot_out}}
    if utils.file_exists(freqs_out):
        return result
    try:
        do.run([utils.Rscript_cmd(), r_file], "Assess heterogeneity with BubbleTree")
    except subprocess.CalledProcessError as msg:
        failure = str(msg)
        if not (handle_failures and _allowed_bubbletree_errorstates(failure)):
            logger.exception()
            raise
        with open(freqs_out, "w") as out_handle:
            out_handle.write('bubbletree failed:\n %s"\n' % failure)
    return result
def _run_cnvkit_shared(data, test_bams, background_bams, access_file, work_dir, background_name=None):
    """Run ``cnvkit batch`` for the test BAMs against the background BAMs.

    Work happens in a transactional temporary directory that is moved to
    ``<work_dir>/raw`` after the batch run. Nothing is re-run when the ``.cnr``
    file for the first test BAM already exists.

    Returns a dictionary with ``cnr``, ``cns`` and ``back_cnn`` paths, or an
    empty dictionary when no target regions file is produced. Raises IOError
    if the ``cnr`` or ``cns`` file is missing afterwards.
    """
    ref_file = dd.get_ref_file(data)
    raw_work_dir = os.path.join(work_dir, "raw")
    sample_base = os.path.splitext(os.path.basename(test_bams[0]))[0].split(".")[0]
    background_cnn = "%s_background.cnn" % (background_name or "flat")
    files = {"back_cnn": os.path.join(raw_work_dir, background_cnn)}
    for ext in ("cnr", "cns"):
        files[ext] = os.path.join(raw_work_dir, "%s.%s" % (sample_base, ext))
    if not utils.file_exists(files["cnr"]):
        # clear out any partial results from a previous attempt
        if os.path.exists(raw_work_dir):
            shutil.rmtree(raw_work_dir)
        with tx_tmpdir(data, work_dir) as tx_work_dir:
            # pick targets, anti-targets and access files based on analysis type
            # http://cnvkit.readthedocs.org/en/latest/nonhybrid.html
            cov_interval = dd.get_coverage_interval(data)
            is_genome = cov_interval == "genome"
            base_regions = dd.get_variant_regions(data)
            # For genome calls, subset to regions within 10kb of genes
            if is_genome:
                base_regions = annotate.subset_by_genes(base_regions, data, work_dir, pad=1e4)
            raw_target_bed = bedutils.merge_overlaps(base_regions, data, out_dir=work_dir)
            target_bed = annotate.add_genes(raw_target_bed, data)
            # bail out if we ended up with no regions
            if not utils.file_exists(target_bed):
                return {}
            if cov_interval == "amplicon":
                access_bed = target_bed
            elif is_genome:
                access_bed = dd.get_variant_regions(data)
            else:
                access_bed = access_file
            target_opts = ["--targets", target_bed, "--access", access_bed]
            configured_cores = tz.get_in(["config", "algorithm", "num_cores"], data, 1)
            cores = min(configured_cores, len(test_bams) + len(background_bams))
            input_args = test_bams + ["-n"] + background_bams + ["-f", ref_file]
            run_args = ["-d", tx_work_dir, "--split", "-p", str(cores),
                        "--output-reference", os.path.join(tx_work_dir, background_cnn)]
            cmd = [_get_cmd(), "batch"] + input_args + target_opts + run_args
            at_avg, at_min, t_avg = _get_antitarget_size(access_file, target_bed)
            if at_avg:
                cmd.extend(["--antitarget-avg-size", str(at_avg)])
                cmd.extend(["--antitarget-min-size", str(at_min)])
                cmd.extend(["--target-avg-size", str(t_avg)])
            tooldir = install.get_defaults().get("tooldir", "/usr/local")
            cmd.extend(["--rlibpath", os.path.join(tooldir, "lib", "R", "site-library")])
            do.run(cmd, "CNVkit batch")
            shutil.move(tx_work_dir, raw_work_dir)
    for ftype in ["cnr", "cns"]:
        expected = files[ftype]
        if not os.path.exists(expected):
            raise IOError("Missing CNVkit %s file: %s" % (ftype, expected))
    return files
def get_perl_exports(tooldir=None):
    """Build shell exports that put the perl binary and site library first.

    When ``tooldir`` is not given it is read from the install defaults,
    falling back to ``/usr/local``. Returns a single shell string that exports
    PATH and PERL5LIB, joined with ``&&``.
    """
    from bcbio import install
    if tooldir is None:
        tooldir = install.get_defaults().get("tooldir", "/usr/local")
    exports = [
        "export PATH=%s:$PATH" % os.path.dirname(perl_cmd()),
        "export PERL5LIB=%s/lib/perl5:$PERL5LIB" % tooldir,
    ]
    return " && ".join(exports)
def has_gemini_data(data):
    """Report whether gemini can be used for this sample.

    For samples where ``support_gemini_orig`` applies (hg19, hg38) this needs
    the "gemini" data target to be listed in the install defaults. Other
    organisms don't have special data targets, so the answer is always True.
    """
    if not support_gemini_orig(data, all_human=True):
        return True
    from bcbio import install
    installed_targets = install.get_defaults().get("datatarget", [])
    return "gemini" in installed_targets
def _run_scalpel_paired(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Detect indels with Scalpel for paired tumor / normal samples.

    Falls back to ``_run_scalpel_caller`` when the pairing has no normal BAM.
    Otherwise runs scalpel-discovery in somatic mode, re-exports the somatic
    calls with adjusted filters, and merges them with the common indels into
    ``out_file``. Returns the annotated VCF from
    ``annotation.annotate_nongatk_vcf``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                return _run_scalpel_caller(align_bams, items, ref_file, assoc_files, region, out_file)
            # the vcffilter lookup result is unused but the call is kept as in the original
            vcffilter = config_utils.get_program("vcffilter", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            work_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            if os.path.exists(work_path):
                utils.remove_safe(work_path)
            perllib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                   "lib", "perl5")
            opts = " ".join([
                " ".join(_scalpel_options_from_config(items, config, out_file, region, work_path)),
                "--ref {}".format(ref_file),
                "--dir %s" % work_path,
            ])
            # calling
            discovery_cl = ("export PERL5LIB={perllib}:$PERL5LIB && "
                            "scalpel-discovery --somatic {opts} --tumor {paired.tumor_bam} --normal {paired.normal_bam}")
            do.run(discovery_cl.format(perllib=perllib, opts=opts, paired=paired),
                   "Genotyping paired variants with Scalpel", {})
            # filtering to adjust input parameters
            bed_opts = " ".join(_scalpel_bed_file_opts(items, config, out_file, region, work_path))
            # More lenient filters for low-frequency variations
            db_file = os.path.join(work_path, "main", "somatic.db")
            somatic_vcf = os.path.join(work_path, "main/somatic-indel-filter.vcf.gz")
            with file_transaction(config, somatic_vcf) as tx_indel_file:
                export_cl = ("export PERL5LIB={perllib}:$PERL5LIB && "
                             "scalpel-export --somatic {bed_opts} --ref {ref_file} --db {db_file} "
                             "--min-alt-count-tumor 2 --min-phred-fisher 5 | bgzip -c > {tx_indel_file}")
                do.run(export_cl.format(perllib=perllib, bed_opts=bed_opts, ref_file=ref_file,
                                        db_file=db_file, tx_indel_file=tx_indel_file),
                       "Scalpel somatic indel filter", {})
            somatic_vcf = bgzip_and_index(somatic_vcf, config)
            common_vcf = bgzip_and_index(os.path.join(work_path, "main/common.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            chi2_filter = get_scalpel_bcftools_filter_expression("chi2", config)
            reject_filter = get_scalpel_bcftools_filter_expression("reject", config)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            merge_cl = ("vcfcat <({bcftools_cmd_chi2} {scalpel_tmp_file}) "
                        "<({bcftools_cmd_common} {scalpel_tmp_file_common}) | "
                        " {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}")
            do.run(merge_cl.format(bcftools_cmd_chi2=chi2_filter, scalpel_tmp_file=somatic_vcf,
                                   bcftools_cmd_common=reject_filter,
                                   scalpel_tmp_file_common=common_vcf,
                                   fix_ambig=fix_ambig, vcfstreamsort=vcfstreamsort,
                                   compress_cmd=compress_cmd, tx_out_file=tx_out_file),
                   "Finalising Scalpel variants", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.get("dbsnp"),
                                           ref_file, config)
def _run_scalpel_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Detect indels with Scalpel in single sample mode.

    Runs scalpel-discovery on a single BAM, then filters, renames the sample
    and sorts the resulting indel VCF into ``out_file``. Raises ValueError
    when more than one BAM is supplied. Returns the annotated VCF from
    ``annotation.annotate_nongatk_vcf``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            if len(align_bams) > 1:
                raise ValueError("Scalpel does not currently support batch calling!")
            input_bams = " ".join("%s" % x for x in align_bams)
            work_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            if os.path.exists(work_path):
                utils.remove_safe(work_path)
            min_cov = "3"  # minimum coverage
            opts = " ".join([
                " ".join(_scalpel_options_from_config(items, config, out_file, region, work_path)),
                "--dir %s" % work_path,
                "--mincov %s" % min_cov,
            ])
            perllib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                   "lib", "perl5")
            discovery_cl = ("export PERL5LIB={perllib}:$PERL5LIB && "
                            "scalpel-discovery --single {opts} --ref {ref_file} --bam {input_bams} ")
            do.run(discovery_cl.format(perllib=perllib, opts=opts, ref_file=ref_file,
                                       input_bams=input_bams),
                   "Genotyping with Scalpel", {})
            # parse produced variant file further
            indel_vcf = bgzip_and_index(os.path.join(work_path, "variants.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            chi2_filter = get_scalpel_bcftools_filter_expression("chi2", config)
            sample_name = items[0]["name"][1]
            fix_ambig = vcfutils.fix_ambiguous_cl()
            finalize_cl = ("{bcftools_cmd_chi2} {scalpel_tmp_file} | sed 's/sample_name/{sample_name_str}/g' "
                           "| {fix_ambig} | vcfallelicprimitives --keep-geno | vcffixup - | vcfstreamsort "
                           "{compress_cmd} > {tx_out_file}")
            do.run(finalize_cl.format(bcftools_cmd_chi2=chi2_filter, scalpel_tmp_file=indel_vcf,
                                      sample_name_str=sample_name, fix_ambig=fix_ambig,
                                      compress_cmd=compress_cmd, tx_out_file=tx_out_file),
                   "Finalising Scalpel variants", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.get("dbsnp"),
                                           ref_file, config)
def _cnvkit_segment(cnr_file, cov_interval, data):
    """Segment normalized CNVkit ratios into a ``.cns`` file.

    The output sits next to ``cnr_file`` with a ``.cns`` extension and is only
    regenerated when it is not up to date relative to the input. Genome
    coverage intervals pass an explicit ``--threshold`` to the segment
    command. Returns the ``.cns`` path.
    """
    segments_file = "%s.cns" % os.path.splitext(cnr_file)[0]
    if utils.file_uptodate(segments_file, cnr_file):
        return segments_file
    with file_transaction(data, segments_file) as tx_segments_file:
        r_site_library = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                      "lib", "R", "site-library")
        segment_cmd = [_get_cmd(), "segment"]
        segment_cmd.extend(["-o", tx_segments_file])
        segment_cmd.extend(["--rlibpath", r_site_library])
        segment_cmd.append(cnr_file)
        if cov_interval == "genome":
            segment_cmd.extend(["--threshold", "0.005"])
        do.run(segment_cmd, "CNVkit segment")
    return segments_file
def _run_bubbletree(vcf_csv, cnv_csv, data):
    """Write the BubbleTree R script and run it when results are missing.

    The script is written next to ``vcf_csv`` on every call; Rscript is only
    invoked when the prevalence output file does not exist yet.
    """
    # NOTE: _script is filled from locals(), so the local names below are what
    # the template placeholders can refer to and must stay as they are.
    local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                 "lib", "R", "site-library")
    base = utils.splitext_plus(vcf_csv)[0]
    freqs_out = "%s-bubbletree_prevalence.txt" % base
    prev_model_out = "%s-bubbletree_prev_model.pdf" % base
    bubbles_out = "%s-bubbles.pdf" % base
    r_file = "%s-run.R" % base
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))
    if utils.file_exists(freqs_out):
        return
    do.run(["Rscript", r_file], "Assess heterogeneity with BubbleTree")
def _do_run(paired):
    """Run Battenberg copy number calling for a tumor/normal pair.

    Output deliberately goes straight into the work directory rather than a
    temporary one, since Battenberg does smart restarts. The command only
    runs when some of the expected outputs are missing.

    Returns the output dictionary from ``_get_battenberg_out`` with the
    ``"plot"`` and ``"ignore"`` entries added.
    """
    tumor_data = paired.tumor_data
    work_dir = _sv_workdir(tumor_data)
    out = _get_battenberg_out(paired, work_dir)
    ignore_file = os.path.join(work_dir, "ignore_chromosomes.txt")
    if len(_missing_files(out)) > 0:
        ref_file = dd.get_ref_file(tumor_data)
        bat_datadir = os.path.normpath(os.path.join(os.path.dirname(ref_file), os.pardir, "battenberg"))
        impute_info = os.path.join(bat_datadir, "impute", "impute_info.txt")
        ignore_file, gl_file = _make_ignore_file(work_dir, ref_file, ignore_file, impute_info)
        r_site_library = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                      "lib", "R", "site-library")
        platform = dd.get_platform(tumor_data)
        genome_build = tumor_data["genome_build"]
        # scale cores to avoid over-using memory during imputation
        cores = max(1, int(dd.get_num_cores(tumor_data) * 0.5))
        gender_codes = {"male": "XY", "female": "XX", "unknown": "L"}
        gender = gender_codes.get(population.get_gender(tumor_data))
        gender_str = "-ge %s" % (gender)
        if gender == "L":
            gender_str = "-ge %s -gl %s" % (gender, gl_file)
        r_export_cmd = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(utils.Rscript_cmd())
        cmd = ("export R_LIBS_USER={local_sitelib} && {r_export_cmd}"
               "battenberg.pl -t {cores} -o {work_dir} -r {ref_file}.fai "
               "-tb {tumor_bam} -nb {normal_bam} -e {bat_datadir}/impute/impute_info.txt "
               "-u {bat_datadir}/1000genomesloci -c {bat_datadir}/probloci.txt "
               "-ig {ignore_file} {gender_str} "
               "-assembly {genome_build} -species Human -platform {platform}")
        do.run(cmd.format(local_sitelib=r_site_library, r_export_cmd=r_export_cmd, cores=cores,
                          work_dir=work_dir, ref_file=ref_file, tumor_bam=paired.tumor_bam,
                          normal_bam=paired.normal_bam, bat_datadir=bat_datadir,
                          ignore_file=ignore_file, gender_str=gender_str,
                          genome_build=genome_build, platform=platform),
               "Battenberg CNV calling")
        assert len(_missing_files(out)) == 0, "Missing Battenberg output: %s" % _missing_files(out)
    out["plot"] = _get_battenberg_out_plots(paired, work_dir)
    out["ignore"] = ignore_file
    return out
def _cnvkit_segment(cnr_file, cov_interval, data):
    """Segment normalized CNVkit ratios into a ``.cns`` file.

    The output sits next to ``cnr_file`` with a ``.cns`` extension and is only
    regenerated when it is not up to date relative to the input. The command
    is run as a shell string prefixed with exports that put the directory of
    ``utils.Rscript_cmd()`` first on PATH. Returns the ``.cns`` path.
    """
    segments_file = "%s.cns" % os.path.splitext(cnr_file)[0]
    if utils.file_uptodate(segments_file, cnr_file):
        return segments_file
    with file_transaction(data, segments_file) as tx_segments_file:
        r_site_library = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                      "lib", "R", "site-library")
        segment_cmd = [_get_cmd(), "segment"]
        segment_cmd.extend(["-o", tx_segments_file])
        segment_cmd.extend(["--rlibpath", r_site_library])
        segment_cmd.append(cnr_file)
        if cov_interval == "genome":
            segment_cmd.extend(["--threshold", "0.00001"])
        # preferentially use conda installed Rscript
        rscript_dir = os.path.dirname(utils.Rscript_cmd())
        export_cmd = "unset R_HOME && export PATH=%s:$PATH && " % rscript_dir
        do.run(export_cmd + " ".join(segment_cmd), "CNVkit segment")
    return segments_file
def _cnvkit_segment(cnr_file, cov_interval, data):
    """Segment normalized CNVkit ratios into a ``.cns`` file.

    The output sits next to ``cnr_file`` with a ``.cns`` extension and is only
    regenerated when it is not up to date relative to the input. Genome
    coverage intervals pass an explicit ``--threshold`` to the segment
    command. Returns the ``.cns`` path.
    """
    segments_file = "%s.cns" % os.path.splitext(cnr_file)[0]
    if utils.file_uptodate(segments_file, cnr_file):
        return segments_file
    with file_transaction(data, segments_file) as tx_segments_file:
        r_site_library = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                      "lib", "R", "site-library")
        segment_cmd = [_get_cmd(), "segment"]
        segment_cmd.extend(["-o", tx_segments_file])
        segment_cmd.extend(["--rlibpath", r_site_library])
        segment_cmd.append(cnr_file)
        if cov_interval == "genome":
            segment_cmd.extend(["--threshold", "0.005"])
        do.run(segment_cmd, "CNVkit segment")
    return segments_file
def _run_cnvkit_shared(data, test_bams, background_bams, work_dir, background_name=None):
    """Run ``cnvkit batch`` for the test BAMs against the background BAMs.

    Work happens in a transactional temporary directory that is moved to
    ``<work_dir>/raw`` after the batch run. Nothing is re-run when the ``.cnr``
    file for the first test BAM already exists.

    Returns a dictionary with ``cnr``, ``cns`` and ``back_cnn`` paths, or an
    empty dictionary when no target regions file is available. Raises IOError
    if the ``cnr`` or ``cns`` file is missing afterwards.
    """
    ref_file = dd.get_ref_file(data)
    raw_work_dir = os.path.join(work_dir, "raw")
    sample_base = os.path.splitext(os.path.basename(test_bams[0]))[0].split(".")[0]
    background_cnn = "%s_background.cnn" % (background_name or "flat")
    files = {"back_cnn": os.path.join(raw_work_dir, background_cnn)}
    for ext in ("cnr", "cns"):
        files[ext] = os.path.join(raw_work_dir, "%s.%s" % (sample_base, ext))
    if not utils.file_exists(files["cnr"]):
        # clear out any partial results from a previous attempt
        if os.path.exists(raw_work_dir):
            shutil.rmtree(raw_work_dir)
        with tx_tmpdir(data, work_dir) as tx_work_dir:
            cov_interval = dd.get_coverage_interval(data)
            raw_target_bed, access_bed = _get_target_access_files(cov_interval, data, work_dir)
            # bail out if we ended up with no regions
            if not utils.file_exists(raw_target_bed):
                return {}
            target_bed = annotate.add_genes(raw_target_bed, data)
            # Do not parallelize cnvkit due to current issues with multi-processing
            cores = 1
            input_args = test_bams + ["-n"] + background_bams + ["-f", ref_file]
            region_args = ["--targets", target_bed, "--access", access_bed]
            run_args = ["-d", tx_work_dir, "--split", "-p", str(cores),
                        "--output-reference", os.path.join(tx_work_dir, background_cnn)]
            cmd = [_get_cmd(), "batch"] + input_args + region_args + run_args
            if cov_interval not in ["amplicon", "genome"]:
                at_avg, at_min, t_avg = _get_antitarget_size(access_bed, target_bed)
                if at_avg:
                    cmd.extend(["--antitarget-avg-size", str(at_avg)])
                    cmd.extend(["--antitarget-min-size", str(at_min)])
                    cmd.extend(["--target-avg-size", str(t_avg)])
            tooldir = install.get_defaults().get("tooldir", "/usr/local")
            cmd.extend(["--rlibpath", os.path.join(tooldir, "lib", "R", "site-library")])
            do.run(cmd, "CNVkit batch")
            shutil.move(tx_work_dir, raw_work_dir)
    for ftype in ["cnr", "cns"]:
        expected = files[ftype]
        if not os.path.exists(expected):
            raise IOError("Missing CNVkit %s file: %s" % (ftype, expected))
    return files
def snpeff_version(args=None):
    """Return the snpEff version as a string containing only digits and dots.

    The version manifest is consulted first. Only when it has no entry is the
    version queried from the snpEff installation under
    ``<tooldir>/share/java/snpeff``, where ``tooldir`` comes from
    ``args.tooldir`` when set and otherwise from the install defaults
    (falling back to ``/usr/local``).

    Raises AssertionError when no version digits can be extracted.
    """
    raw_version = programs.get_version_manifest("snpeff")
    if not raw_version:
        # tooldir is only needed for this fallback, so resolve it lazily; this
        # avoids failing on missing defaults when the manifest has the answer.
        tooldir = args.tooldir if args else None
        if not tooldir:
            from bcbio.install import get_defaults
            tooldir = get_defaults().get("tooldir", "/usr/local")
        config = {"resources": {"snpeff": {"jvm_opts": ["-Xms500m", "-Xmx1g"],
                                           "dir": os.path.join(tooldir, "share", "java", "snpeff")}}}
        raw_version = programs.java_versioner("snpeff", "snpEff",
                                              stdout_flag="snpEff version SnpEff")(config)
    version_chars = set(string.digits + ".")
    version = "".join([x for x in str(raw_version) if x in version_chars])
    assert version, "Did not find snpEff version information"
    return version
def _cnvkit_segment(cnr_file, cov_interval, data):
    """Segment normalized CNVkit ratios into a ``.cns`` file.

    The output sits next to ``cnr_file`` with a ``.cns`` extension and is only
    regenerated when it is not up to date relative to the input. The command
    is run as a shell string prefixed with exports that put the directory of
    ``utils.Rscript_cmd()`` first on PATH. Returns the ``.cns`` path.
    """
    segments_file = "%s.cns" % os.path.splitext(cnr_file)[0]
    if utils.file_uptodate(segments_file, cnr_file):
        return segments_file
    with file_transaction(data, segments_file) as tx_segments_file:
        r_site_library = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                      "lib", "R", "site-library")
        segment_cmd = [_get_cmd(), "segment"]
        segment_cmd.extend(["-o", tx_segments_file])
        segment_cmd.extend(["--rlibpath", r_site_library])
        segment_cmd.append(cnr_file)
        if cov_interval == "genome":
            segment_cmd.extend(["--threshold", "0.00001"])
        # preferentially use conda installed Rscript
        rscript_dir = os.path.dirname(utils.Rscript_cmd())
        export_cmd = "unset R_HOME && export PATH=%s:$PATH && " % rscript_dir
        do.run(export_cmd + " ".join(segment_cmd), "CNVkit segment")
    return segments_file
def _run_scalpel_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Detect indels with Scalpel in single sample mode.

    Runs scalpel-discovery on a single BAM, then filters, renames the sample
    and sorts the resulting indel VCF into ``out_file``. Raises ValueError
    when more than one BAM is supplied. Returns the annotated VCF from
    ``annotation.annotate_nongatk_vcf``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            if len(align_bams) > 1:
                raise ValueError("Scalpel does not currently support batch calling!")
            input_bams = " ".join("%s" % x for x in align_bams)
            work_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            if os.path.exists(work_path):
                utils.remove_safe(work_path)
            min_cov = "3"  # minimum coverage
            opts = " ".join([
                " ".join(_scalpel_options_from_config(items, config, out_file, region, work_path)),
                "--dir %s" % work_path,
                "--mincov %s" % min_cov,
            ])
            perllib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                   "lib", "perl5")
            discovery_cl = ("export PERL5LIB={perllib}:$PERL5LIB && "
                            "scalpel-discovery --single {opts} --ref {ref_file} --bam {input_bams} ")
            do.run(discovery_cl.format(perllib=perllib, opts=opts, ref_file=ref_file,
                                       input_bams=input_bams),
                   "Genotyping with Scalpel", {})
            # parse produced variant file further
            indel_vcf = bgzip_and_index(os.path.join(work_path, "variants.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            chi2_filter = get_scalpel_bcftools_filter_expression("chi2", config)
            sample_name = items[0]["name"][1]
            fix_ambig = vcfutils.fix_ambiguous_cl()
            finalize_cl = ("{bcftools_cmd_chi2} {scalpel_tmp_file} | sed 's/sample_name/{sample_name_str}/g' "
                           "| {fix_ambig} | vcfallelicprimitives --keep-geno | vcffixup - | vcfstreamsort "
                           "{compress_cmd} > {tx_out_file}")
            do.run(finalize_cl.format(bcftools_cmd_chi2=chi2_filter, scalpel_tmp_file=indel_vcf,
                                      sample_name_str=sample_name, fix_ambig=fix_ambig,
                                      compress_cmd=compress_cmd, tx_out_file=tx_out_file),
                   "Finalising Scalpel variants", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.get("dbsnp"),
                                           ref_file, config)
def _run_bubbletree(vcf_csv, cnv_csv, data):
    """Create R script and run on input data.

    The R script is written next to ``vcf_csv`` on every call; Rscript is
    only invoked when the prevalence output file does not exist yet. A
    failure recognised by ``_allowed_bubbletree_errorstates`` is recorded in
    the prevalence file instead of being raised; other failures are logged
    and re-raised.
    """
    local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                 "lib", "R", "site-library")
    base = utils.splitext_plus(vcf_csv)[0]
    r_file = "%s-run.R" % base
    bubbles_out = "%s-bubbles.pdf" % base
    prev_model_out = "%s-bubbletree_prev_model.pdf" % base
    freqs_out = "%s-bubbletree_prevalence.txt" % base
    # The template is filled from locals(), so the variable names above are
    # what the _script placeholders can refer to; do not rename them.
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))
    if not utils.file_exists(freqs_out):
        try:
            do.run(["Rscript", r_file], "Assess heterogeneity with BubbleTree")
        # "as" form works on Python 2.6+ and 3; the comma form is Python 2 only
        except subprocess.CalledProcessError as msg:
            if _allowed_bubbletree_errorstates(str(msg)):
                with open(freqs_out, "w") as out_handle:
                    out_handle.write('bubbletree failed:\n %s"\n' % (str(msg)))
            else:
                logger.exception()
                raise
def _do_run(paired):
    """Run Battenberg copy number calling for a tumor/normal pair.

    Output deliberately goes straight into the work directory rather than a
    temporary one, since Battenberg does smart restarts. The command only
    runs when some of the expected outputs are missing.

    Returns the output dictionary from ``_get_battenberg_out`` with the
    ``"plot"`` and ``"ignore"`` entries added.
    """
    tumor_data = paired.tumor_data
    work_dir = _sv_workdir(tumor_data)
    out = _get_battenberg_out(paired, work_dir)
    ignore_file = os.path.join(work_dir, "ignore_chromosomes.txt")
    if len(_missing_files(out)) > 0:
        ref_file = dd.get_ref_file(tumor_data)
        bat_datadir = os.path.normpath(os.path.join(os.path.dirname(ref_file), os.pardir, "battenberg"))
        impute_info = os.path.join(bat_datadir, "impute", "impute_info.txt")
        ignore_file, gl_file = _make_ignore_file(work_dir, ref_file, ignore_file, impute_info)
        r_site_library = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                      "lib", "R", "site-library")
        platform = dd.get_platform(tumor_data)
        genome_build = tumor_data["genome_build"]
        # scale cores to avoid over-using memory during imputation
        cores = max(1, int(dd.get_num_cores(tumor_data) * 0.5))
        gender_codes = {"male": "XY", "female": "XX", "unknown": "L"}
        gender = gender_codes.get(population.get_gender(tumor_data))
        gender_str = "-ge %s" % (gender)
        if gender == "L":
            gender_str = "-ge %s -gl %s" % (gender, gl_file)
        r_export_cmd = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(utils.Rscript_cmd())
        cmd = ("export R_LIBS_USER={local_sitelib} && {r_export_cmd}"
               "battenberg.pl -t {cores} -o {work_dir} -r {ref_file}.fai "
               "-tb {tumor_bam} -nb {normal_bam} -e {bat_datadir}/impute/impute_info.txt "
               "-u {bat_datadir}/1000genomesloci -c {bat_datadir}/probloci.txt "
               "-ig {ignore_file} {gender_str} "
               "-assembly {genome_build} -species Human -platform {platform}")
        do.run(cmd.format(local_sitelib=r_site_library, r_export_cmd=r_export_cmd, cores=cores,
                          work_dir=work_dir, ref_file=ref_file, tumor_bam=paired.tumor_bam,
                          normal_bam=paired.normal_bam, bat_datadir=bat_datadir,
                          ignore_file=ignore_file, gender_str=gender_str,
                          genome_build=genome_build, platform=platform),
               "Battenberg CNV calling")
        assert len(_missing_files(out)) == 0, "Missing Battenberg output: %s" % _missing_files(out)
    out["plot"] = _get_battenberg_out_plots(paired, work_dir)
    out["ignore"] = ignore_file
    return out
def _has_gemini(data):
    """Report whether the required gemini data was installed.

    True when ``"gemini"`` is listed under ``datatarget`` in the bcbio
    installer defaults. ``data`` is accepted but not consulted.
    """
    from bcbio import install
    installed_targets = install.get_defaults().get("datatarget", [])
    return "gemini" in installed_targets
def _get_perllib(tooldir=None): from bcbio import install if tooldir is None: tooldir = install.get_defaults().get("tooldir", "/usr/local") return "%s/lib/perl5" % tooldir
def R_sitelib():
    """Return the path of the R site-library under the bcbio tool directory.

    The tool directory is the ``tooldir`` entry of the installer defaults,
    or ``/usr/local`` when that entry is absent.
    """
    from bcbio import install
    tooldir = install.get_defaults().get("tooldir", "/usr/local")
    return os.path.join(tooldir, "lib", "R", "site-library")
def _run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect indels with Scalpel.

    This is used for paired tumor / normal samples.

    When ``out_file`` is None it is derived from the first BAM in
    ``align_bams`` with a ``-paired-variants.vcf.gz`` suffix. Calling is
    skipped if ``out_file`` already exists. If the pairing has no normal BAM,
    the work is delegated to ``_run_scalpel_caller`` and its result is
    returned. Otherwise the function returns the result of
    ``annotation.annotate_nongatk_vcf`` run on ``out_file``.

    The shell command templates below are filled from ``locals()``, so the
    local variable names are part of those templates.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                # No normal sample: hand off to the non-paired caller. Note
                # this returns from inside the transaction on out_file.
                ann_file = _run_scalpel_caller(align_bams, items, ref_file,
                                               assoc_files, region, out_file)
                return ann_file
            # NOTE(review): vcffilter is resolved here but none of the command
            # templates in this function reference it; confirm it is needed.
            vcffilter = config_utils.get_program("vcffilter", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            perllib = os.path.join(
                install.get_defaults().get("tooldir", "/usr/local"), "lib",
                "perl5")
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            db_file = os.path.join(tmp_path, "main", "somatic.db")
            # Discovery only runs when "<db_file>.dir" is absent; any existing
            # work directory is removed first.
            if not os.path.exists(db_file + ".dir"):
                if os.path.exists(tmp_path):
                    utils.remove_safe(tmp_path)
                opts = " ".join(
                    _scalpel_options_from_config(items, config, out_file,
                                                 region, tmp_path))
                opts += " --ref {}".format(ref_file)
                opts += " --dir %s" % tmp_path
                # calling
                cl = (
                    "export PERL5LIB={perllib}:$PERL5LIB && "
                    "scalpel-discovery --somatic {opts} --tumor {paired.tumor_bam} --normal {paired.normal_bam}"
                )
                do.run(cl.format(**locals()),
                       "Genotyping paired variants with Scalpel", {})
            # filtering to adjust input parameters
            bed_opts = " ".join(
                _scalpel_bed_file_opts(items, config, out_file, region,
                                       tmp_path))
            # Hard-coded switch: with True, the else branch below never runs.
            use_defaults = True
            if use_defaults:
                scalpel_tmp_file = os.path.join(tmp_path,
                                                "main/somatic.indel.vcf")
            else:
                # Uses default filters but can tweak min-alt-count-tumor and
                # min-phred-fisher to swap precision for sensitivity
                scalpel_tmp_file = os.path.join(
                    tmp_path, "main/somatic-indel-filter.vcf.gz")
                with file_transaction(config,
                                      scalpel_tmp_file) as tx_indel_file:
                    cmd = (
                        "export PERL5LIB={perllib}:$PERL5LIB && "
                        "scalpel-export --somatic {bed_opts} --ref {ref_file} --db {db_file} "
                        "--min-alt-count-tumor 5 --min-phred-fisher 10 --min-vaf-tumor 0.1 "
                        "| bgzip -c > {tx_indel_file}")
                    do.run(cmd.format(**locals()),
                           "Scalpel somatic indel filter", {})
            scalpel_tmp_file = bgzip_and_index(scalpel_tmp_file, config)
            scalpel_tmp_file_common = bgzip_and_index(
                os.path.join(tmp_path, "main/common.indel.vcf"), config)
            # Only pipe through bgzip when the requested output name ends in "gz".
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression(
                "chi2", config)
            bcftools_cmd_common = get_scalpel_bcftools_filter_expression(
                "reject", config)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            # Concatenate the two filtered VCFs, then pipe through fix_ambig
            # and vcfstreamsort into the transactional output file.
            cl2 = (
                "vcfcat <({bcftools_cmd_chi2} {scalpel_tmp_file}) "
                "<({bcftools_cmd_common} {scalpel_tmp_file_common}) | "
                " {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}"
            )
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})
    # Reached both when out_file already existed and after a fresh run.
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file