def get_coverage(data):
    """Calculate PureCN coverage for a sample BAM, accounting for GC content.

    Runs PureCN's ``extdata/Coverage.R`` on the sample's aligned BAM with the
    interval file stored under ``config/algorithm/purecn_bed_ready``. An
    existing result file is reused rather than recomputed.

    Args:
        data: a single sample (unwrapped with ``utils.to_single_data``).

    Returns:
        Path of the expected ``<bam basename>_coverage_loess.txt.gz`` file in
        the sample's SV work directory. A failed PureCN run is only logged,
        so the returned path is not guaranteed to exist.
    """
    data = utils.to_single_data(data)
    work_dir = _sv_workdir(data)
    rscript = utils.Rscript_cmd("r36")
    coverage_r = utils.R_package_script("r36", "PureCN", "extdata/Coverage.R")
    intervals = tz.get_in(["config", "algorithm", "purecn_bed_ready"], data)
    # PureCN resolves symlinks and the actual output PureCN coverage file name
    # is derived from the end bam, not from the (possibly symlinked) bam_file
    bam_file = os.path.realpath(dd.get_align_bam(data))
    bname = os.path.splitext(os.path.basename(bam_file))[0]
    result_file = os.path.join(work_dir, bname + "_coverage_loess.txt.gz")
    if not os.path.exists(result_file):
        cmd = [rscript, coverage_r,
               "--outdir", work_dir,
               "--bam", bam_file,
               "--intervals", intervals]
        try:
            cmd_line = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(env="r36"),
                                                              utils.get_R_exports(env="r36"),
                                                              " ".join([str(x) for x in cmd]))
            do.run(cmd_line, "PureCN coverage")
        except subprocess.CalledProcessError:
            # best effort: a failed run is reported but does not stop the caller
            logger.info("PureCN failed to calculate coverage")
        else:
            # only claim the files were saved when the run actually succeeded
            logger.debug("Saved PureCN coverage files to " + result_file)
    return result_file
def _run_bubbletree(vcf_csv, cnv_csv, data, has_normal=True):
    """Create the BubbleTree R script and run it on the input data.

    The script is rendered from the module-level ``_script`` template using
    this function's local variables, written next to ``vcf_csv`` and executed
    with Rscript unless the prevalence output already exists.

    Args:
        vcf_csv: variant input; its base name (extension stripped) prefixes
            every output path.
        cnv_csv: copy number input, available to the script template.
        data: sample data, used to look up the sample name.
        has_normal: when False, log2 ratio calculations are scaled by 10.

    Raises:
        subprocess.CalledProcessError: when Rscript fails with an error that
            ``_allowed_bubbletree_errorstates`` does not accept.
    """
    local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                 "lib", "R", "site-library")
    base = utils.splitext_plus(vcf_csv)[0]
    r_file = "%s-run.R" % base
    bubbleplot_out = "%s-bubbleplot.pdf" % base
    trackplot_out = "%s-trackplot.pdf" % base
    calls_out = "%s-calls.rds" % base
    freqs_out = "%s-bubbletree_prevalence.txt" % base
    sample = dd.get_sample_name(data)
    # BubbleTree has some internal hardcoded parameters that assume a smaller
    # distribution of log2 scores. This is not true for tumor-only calls and
    # normal contamination, so we scale the calculations to actually get calls.
    # Need a better long term solution with flexible parameters.
    lrr_scale = 1.0 if has_normal else 10.0
    # The template is filled from locals(): do not rename or drop the local
    # variables above, since the template may reference any of them.
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))
    if not utils.file_exists(freqs_out):
        try:
            do.run([utils.Rscript_cmd(), r_file], "Assess heterogeneity with BubbleTree")
        # "except X as name" is valid on Python 2.6+ and Python 3; the old
        # "except X, name" spelling is a syntax error on Python 3.
        except subprocess.CalledProcessError as msg:
            if _allowed_bubbletree_errorstates(str(msg)):
                # accepted failure: record it in place of the prevalence output
                with open(freqs_out, "w") as out_handle:
                    out_handle.write('bubbletree failed:\n %s"\n' % (str(msg)))
            else:
                logger.exception()
                raise
def _cnvkit_segment(cnr_file, cov_interval, data):
    """Segment normalized CNVkit inputs into a ``.cns`` file.

    The output sits next to ``cnr_file`` with a ``.cns`` extension and is only
    rebuilt when it is not up to date relative to the input. When
    ``_cna_has_values(cnr_file)`` is false, a header-only file is written
    instead of running CNVkit.

    Returns:
        Path to the ``.cns`` output file.
    """
    out_file = os.path.splitext(cnr_file)[0] + ".cns"
    if utils.file_uptodate(out_file, cnr_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        if _cna_has_values(cnr_file):
            segment_cmd = [_get_cmd(), "segment", "-p", str(dd.get_cores(data)),
                           "-o", tx_out_file, cnr_file]
            small_vrn_files = _compatible_small_variants(data)
            is_genome = cov_interval == "genome"
            # small variants are only supplied for non-genome inputs
            if len(small_vrn_files) > 0 and _cna_has_values(cnr_file) and not is_genome:
                segment_cmd.extend(["-v", small_vrn_files[0]])
            if is_genome:
                segment_cmd.extend(["--threshold", "0.00001"])
            # preferentially use conda installed Rscript
            rscript_dir = os.path.dirname(utils.Rscript_cmd())
            tx_dir = os.path.dirname(tx_out_file)
            env_prefix = ("unset R_HOME && export PATH=%s:$PATH && export TMPDIR=%s && "
                          % (rscript_dir, tx_dir))
            do.run(env_prefix + " ".join(segment_cmd), "CNVkit segment")
        else:
            with open(tx_out_file, "w") as out_handle:
                out_handle.write(
                    "chromosome\tstart\tend\tgene\tlog2\tprobes\tCN1\tCN2\tbaf\tweight\n")
    return out_file
def _run_purecn_dx(out, paired):
    """Extract signatures and mutational burdens from a PureCN rds file.

    Returns ``out`` untouched when it has no ``"rds"`` entry (no PureCN
    solution) or when the mutation burden CSV is already up to date.
    Otherwise runs PureCN's ``Dx.R`` inside a file transaction and moves the
    files it produced next to the rds file.
    """
    # no solution - no signatures
    if "rds" not in out:
        return out
    tumor_data = paired.tumor_data
    rscript = utils.Rscript_cmd()
    dx_script = utils.R_package_script("PureCN", "extdata/Dx.R", env="base")
    repeats_bed = dd.get_variation_resources(tumor_data)["simple_repeat"]
    callable_bed = dd.get_sample_callable(tumor_data)
    rds_file = out["rds"]
    out_base = utils.splitext_plus(rds_file)[0]
    burden_csv = out_base + "_mutation_burden.csv"
    if utils.file_uptodate(burden_csv, rds_file):
        return out
    # no signatures - so we generate them
    with file_transaction(tumor_data, out_base) as tx_out_base:
        dx_cmd = [rscript, dx_script,
                  "--rds", rds_file,
                  "--callable", callable_bed,
                  "--signatures",
                  "--exclude", repeats_bed,
                  "--out", tx_out_base]
        do.run(dx_cmd, "PureCN Dx mutational burden and signatures")
        out_base, out, all_files = _get_purecn_dx_files(paired, out, require_exist=True)
        tx_dir = os.path.dirname(tx_out_base)
        final_dir = os.path.dirname(out_base)
        # if a file was not generated it would not go to the upload
        for fname in all_files:
            produced = os.path.join(tx_dir, fname)
            if os.path.exists(produced):
                shutil.move(produced, os.path.join(final_dir, fname))
    return out
def create_normal_db(coverage_files_txt, snv_pon, out_dir, genome_build):
    """Create a PureCN normal database with ``extdata/NormalDB.R``.

    Args:
        coverage_files_txt: coverage files calculated by PureCN for each sample.
        snv_pon: mutect2 SNV panel of normals.
        out_dir: directory passed to NormalDB.R as ``--outdir``.
        genome_build: value passed as ``--genome``.

    Returns:
        ``out_dir``; per the original notes the outputs are
        ``mapping_bias_hg38.rds`` and ``normalDB_hg38.rds``. A failed run is
        only logged.
    """
    r_env = "r36"
    normaldb_args = [
        utils.Rscript_cmd(r_env),
        utils.R_package_script(r_env, "PureCN", "extdata/NormalDB.R"),
        "--outdir", out_dir,
        "--coveragefiles", coverage_files_txt,
        "--normal_panel", snv_pon,
        "--genome", genome_build,
        "--force",
    ]
    try:
        sitelib = utils.R_sitelib(env=r_env)
        r_exports = utils.get_R_exports(env=r_env)
        r_call = " ".join(str(arg) for arg in normaldb_args)
        do.run("export R_LIBS_USER=%s && %s && %s" % (sitelib, r_exports, r_call),
               "PureCN normalDB")
    except subprocess.CalledProcessError:
        logger.info("PureCN failed to create a normal db")
    return out_dir
def _run_bubbletree(vcf_csv, cnv_csv, data, wide_lrr=False, do_plots=True,
                    handle_failures=True):
    """Render the BubbleTree R script and run it on the input data.

    BubbleTree has some internal hardcoded parameters that assume a smaller
    distribution of log2 scores. This is not true for tumor-only calls, so
    with ``wide_lrr`` the calculations are scaled to actually get calls.
    Need a better long term solution with flexible parameters.

    Returns:
        dict with the caller name, the prevalence report path and the bubble
        and track plot paths.
    """
    lrr_scale = 1.0 if not wide_lrr else 10.0
    local_sitelib = utils.R_sitelib()
    base = utils.splitext_plus(vcf_csv)[0]
    r_file = base + "-run.R"
    bubbleplot_out = base + "-bubbleplot.pdf"
    trackplot_out = base + "-trackplot.pdf"
    calls_out = base + "-calls.rds"
    freqs_out = base + "-bubbletree_prevalence.txt"
    sample = dd.get_sample_name(data)
    do_plots = "yes" if do_plots else "no"
    # The template is filled from locals(), so the names above are part of
    # the contract with _script and must stay as they are.
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))
    result = {"caller": "bubbletree",
              "report": freqs_out,
              "plot": {"bubble": bubbleplot_out, "track": trackplot_out}}
    if utils.file_exists(freqs_out):
        return result
    cmd = "%s && %s --no-environ %s" % (utils.get_R_exports(), utils.Rscript_cmd(), r_file)
    try:
        do.run(cmd, "Assess heterogeneity with BubbleTree")
    except subprocess.CalledProcessError as msg:
        tolerated = handle_failures and _allowed_bubbletree_errorstates(str(msg))
        if not tolerated:
            logger.exception()
            raise
        # accepted failure: record it in place of the prevalence report
        with open(freqs_out, "w") as out_handle:
            out_handle.write('bubbletree failed:\n %s"\n' % (str(msg)))
    return result
def _run_on_chrom(chrom, work_bams, names, work_dir, items):
    """Run cn.mops on work BAMs for a specific chromosome.

    The BED output is named after the first BAM, the current batch (when
    there is one) and the chromosome, with ``all`` standing in for an empty
    ``chrom``. An existing output is reused.

    Returns:
        Single-element list holding the output BED path.
    """
    local_sitelib = utils.R_sitelib()
    batch = sshared.get_cur_batch(items)
    suffix = "-%s-cnv" % batch if batch else "-cnv"
    bam_base = os.path.splitext(os.path.basename(work_bams[0]))[0]
    out_file = os.path.join(work_dir, "%s%s-%s.bed" % (bam_base, suffix, chrom or "all"))
    if utils.file_exists(out_file):
        return [out_file]
    with file_transaction(items[0], out_file) as tx_out_file:
        rcode = os.path.splitext(out_file)[0] + "-run.R"
        with open(rcode, "w") as script_handle:
            script_handle.write(
                _script.format(prep_str=_prep_load_script(work_bams, names, chrom, items),
                               out_file=tx_out_file,
                               local_sitelib=local_sitelib))
        rscript = utils.Rscript_cmd()
        try:
            do.run([rscript, "--vanilla", rcode], "cn.mops CNV detection", items[0],
                   log_error=False)
        except subprocess.CalledProcessError as msg:
            if not _allowed_cnmops_errorstates(str(msg)):
                logger.exception()
                raise
            # cn.mops errors out if no CNVs found. Just write an empty file.
            with open(tx_out_file, "w") as empty_handle:
                empty_handle.write('track name=empty description="No CNVs found"\n')
    return [out_file]
def _amber_het_file(vrn_files, work_dir, paired):
    """Create file of BAFs in normal heterozygous positions compatible with AMBER.

    The first entry of ``vrn_files`` is prepared with
    ``bubbletree.prep_vrn_file`` and linked into ``<work_dir>/amber`` as
    ``<tumor sample>.amber.baf``; the segmentation R script is then run unless
    the ``.pcf`` file already exists.

    https://github.com/hartwigmedical/hmftools/tree/master/amber
    https://github.com/hartwigmedical/hmftools/blob/637e3db1a1a995f4daefe2d0a1511a5bdadbeb05/hmf-common/src/test/resources/amber/new.amber.baf

    Returns:
        Path to the ``.amber.baf`` file.
    """
    assert vrn_files, "Did not find compatible variant calling files for TitanCNA inputs"
    from bcbio.heterogeneity import bubbletree
    first_vrn = vrn_files[0]
    prep_file = bubbletree.prep_vrn_file(first_vrn["vrn_file"], first_vrn["variantcaller"],
                                         work_dir, paired, AmberWriter)
    amber_dir = utils.safe_makedir(os.path.join(work_dir, "amber"))
    baf_name = "%s.amber.baf" % dd.get_sample_name(paired.tumor_data)
    out_file = os.path.join(amber_dir, baf_name)
    utils.symlink_plus(prep_file, out_file)
    pcf_file = out_file + ".pcf"
    if utils.file_exists(pcf_file):
        return out_file
    with file_transaction(paired.tumor_data, pcf_file) as tx_out_file:
        # the R script is written into the transaction directory
        r_file = os.path.join(os.path.dirname(tx_out_file), "bafSegmentation.R")
        with open(r_file, "w") as script_handle:
            script_handle.write(_amber_seg_script)
        r_exports = utils.get_R_exports()
        rscript = utils.Rscript_cmd()
        seg_cmd = "%s && %s --no-environ %s %s %s" % (r_exports, rscript, r_file,
                                                      out_file, pcf_file)
        do.run(seg_cmd, "PURPLE: AMBER baf segmentation")
    return out_file
def _run_bubbletree(vcf_csv, cnv_csv, data):
    """Create the BubbleTree R script and run it on the input data.

    The script is rendered from the module-level ``_script`` template using
    this function's local variables, written next to ``vcf_csv`` and executed
    with Rscript unless the prevalence output already exists.

    Args:
        vcf_csv: variant input; its base name (extension stripped) prefixes
            every output path.
        cnv_csv: copy number input, available to the script template.
        data: sample data, used to look up the sample name.

    Raises:
        subprocess.CalledProcessError: when Rscript fails with an error that
            ``_allowed_bubbletree_errorstates`` does not accept.
    """
    local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                 "lib", "R", "site-library")
    base = utils.splitext_plus(vcf_csv)[0]
    r_file = "%s-run.R" % base
    bubbleplot_out = "%s-bubbleplot.pdf" % base
    trackplot_out = "%s-trackplot.pdf" % base
    calls_out = "%s-calls.rds" % base
    freqs_out = "%s-bubbletree_prevalence.txt" % base
    sample = dd.get_sample_name(data)
    # The template is filled from locals(): do not rename or drop the local
    # variables above, since the template may reference any of them.
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))
    if not utils.file_exists(freqs_out):
        try:
            do.run([utils.Rscript_cmd(), r_file], "Assess heterogeneity with BubbleTree")
        # "except X as name" is valid on Python 2.6+ and Python 3; the old
        # "except X, name" spelling is a syntax error on Python 3.
        except subprocess.CalledProcessError as msg:
            if _allowed_bubbletree_errorstates(str(msg)):
                # accepted failure: record it in place of the prevalence output
                with open(freqs_out, "w") as out_handle:
                    out_handle.write('bubbletree failed:\n %s"\n' % (str(msg)))
            else:
                logger.exception()
                raise
def process_intervals(data):
    """Prepare the PureCN intervals file for a sample.

    Uses the SV BED file when one is available and otherwise the cleaned
    variant regions. The interval file is written next to the BED file with
    a ``.txt`` extension by PureCN's ``extdata/IntervalFile.R``; an existing
    file is reused.

    Returns:
        Path of the interval file, or None when no BED file is available.
        A failed PureCN run is only logged, so the returned path is not
        guaranteed to exist.
    """
    bed_file = regions.get_sv_bed(data)
    if not bed_file:
        bed_file = bedutils.clean_file(dd.get_variant_regions(data), data)
    if not bed_file:
        return None
    basename = os.path.splitext(bed_file)[0]
    ready_file = basename + ".txt"
    if os.path.exists(ready_file):
        return ready_file
    optimized_bed = basename + ".optimized.bed"
    rscript = utils.Rscript_cmd("r36")
    interval_file_r = utils.R_package_script("r36", "PureCN", "extdata/IntervalFile.R")
    ref_file = dd.get_ref_file(data)
    mappability_resource = dd.get_variation_resources(data)["purecn_mappability"]
    genome = dd.get_genome_build(data)
    cmd = [rscript, interval_file_r,
           "--infile", bed_file,
           "--fasta", ref_file,
           "--outfile", ready_file,
           "--offtarget",
           "--genome", genome,
           "--export", optimized_bed,
           "--mappability", mappability_resource]
    try:
        cmd_line = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(env="r36"),
                                                          utils.get_R_exports(env="r36"),
                                                          " ".join([str(x) for x in cmd]))
        do.run(cmd_line, "PureCN intervals")
    except subprocess.CalledProcessError:
        # best effort: a failed run is reported but does not stop the caller
        logger.info("PureCN failed to prepare intervals")
    else:
        # only claim the file was saved when the run actually succeeded
        logger.debug("Saved PureCN interval file into " + ready_file)
    return ready_file
def run(self, subcmd, opts, memscale=None):
    """Run a Picard subcommand through the shell.

    ``JAVA_HOME`` is unset and the directory holding Rscript is put first on
    ``PATH`` before calling Picard. ``opts`` is an iterable of (name, value)
    pairs rendered as ``name=value``; ``VALIDATION_STRINGENCY=SILENT`` is
    always appended.
    """
    jvm_opts = get_picard_opts(self._config, memscale=memscale)
    rscript_dir = os.path.dirname(utils.Rscript_cmd())
    env_setup = ["unset", "JAVA_HOME", "&&",
                 "export", "PATH=%s:$PATH" % rscript_dir, "&&"]
    picard_call = [self._cmd] + jvm_opts + [subcmd]
    option_args = ["%s=%s" % (key, value) for key, value in opts]
    full_cmd = env_setup + picard_call + option_args + ["VALIDATION_STRINGENCY=SILENT"]
    do.run(" ".join(full_cmd), "Picard: %s" % subcmd)
def _run_purecn_normaldb(paired, out):
    """Run PureCN with a normal database and native segmentation.

    Args:
        paired: one tumor/normal pair, or a tumor-only sample
            (``paired.normal_data`` empty).
        out: not read; rebound to the result of ``_get_purecn_files``.

    Returns:
        The output mapping from ``_get_purecn_files(..., require_exist=True)``.
        A failed PureCN run is only logged.
    """
    sample = utils.to_single_data(paired.tumor_data)
    sample_name = dd.get_sample_name(sample)
    work_dir = _sv_workdir(sample)
    rscript = utils.Rscript_cmd("r36")
    purecn_r = utils.R_package_script("r36", "PureCN", "extdata/PureCN.R")
    intervals = tz.get_in(["config", "algorithm", "purecn_bed_ready"], sample)
    # germline and somatic - just annotated and filters assigned
    first_variants = tz.get_in(["variants"], sample)[0]
    variants_vcf = first_variants.get("germline")
    # in a T/N case, there is no germline file - vrn file with all variants
    if not variants_vcf:
        variants_vcf = first_variants.get("vrn_file")
    cnv_reference = ["config", "algorithm", "background", "cnv_reference"]
    normaldb = tz.get_in(cnv_reference + ["purecn_normaldb"], sample)
    mappingbiasfile = tz.get_in(cnv_reference + ["purecn_mapping_bias"], sample)
    sample_coverage = tz.get_in(["depth", "bins", "purecn"], sample)
    simple_repeat_bed = dd.get_variation_resources(sample)["simple_repeat"]
    result_file = os.path.join(work_dir, sample_name + ".rds")
    genome = dd.get_genome_build(sample)
    cmd = [rscript, purecn_r,
           "--out", work_dir,
           "--tumor", sample_coverage,
           "--sampleid", sample_name,
           "--vcf", variants_vcf,
           "--normaldb", normaldb,
           "--mappingbiasfile", mappingbiasfile,
           "--intervals", intervals,
           "--snpblacklist", simple_repeat_bed,
           "--genome", genome,
           "--force",
           "--postoptimize",
           "--seed", "123",
           "--bootstrapn", "500",
           "--cores", str(dd.get_num_cores(sample))]
    # user supplied extra options from the "purecn" resources section
    resources = config_utils.get_resources("purecn", sample)
    cmd += [str(x) for x in resources.get("options", [])]
    # it is not recommended to use matched normal sample in PureCN analysis,
    # because then it skips PON coverage normalization and denoising steps!
    # but still, if it is supplied, we use it
    if paired.normal_data:
        normal_sample = utils.to_single_data(paired.normal_data)
        if normal_sample:
            normal_coverage = tz.get_in(["depth", "bins", "purecn"], normal_sample)
            cmd.extend(["--normal", normal_coverage])
    if not os.path.exists(result_file):
        try:
            cmd_line = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(env="r36"),
                                                              utils.get_R_exports(env="r36"),
                                                              " ".join([str(x) for x in cmd]))
            do.run(cmd_line, "PureCN copy number calling")
            logger.debug("Saved PureCN output to " + work_dir)
        except subprocess.CalledProcessError:
            logger.info("PureCN failed")
    out_base, out, all_files = _get_purecn_files(paired, work_dir, require_exist=True)
    return out
def chipqc(bam_file, sample, out_dir):
    """Attempt to run the ChIPQC Bioconductor package on one sample.

    An existing ``out_dir`` is reused. Otherwise the R script produced by
    ``_sample_template`` is run inside a temporary directory, which is then
    moved to ``out_dir``.

    Args:
        bam_file: not used by this function; kept for the caller's signature.
        sample: sample data, used for the sample name and the R template.
        out_dir: final output directory.

    Returns:
        The result of ``_get_output(out_dir)``.
    """
    sample_name = dd.get_sample_name(sample)
    logger.warning("ChIPQC is unstable right now, if it breaks, turn off the tool.")
    if utils.file_exists(out_dir):
        return _get_output(out_dir)
    with tx_tmpdir() as tmp_dir:
        rcode = _sample_template(sample, tmp_dir)
        rscript = utils.Rscript_cmd()
        do.run([rscript, rcode], "ChIPQC in %s" % sample_name, log_error=False)
        shutil.move(tmp_dir, out_dir)
    return _get_output(out_dir)
def cpat(assembled_fasta, hexamer, logit, out_file=None):
    """Run ``cpat.py`` on an assembled FASTA file.

    An existing ``out_file`` is returned as is. When no ``out_file`` is
    given, a named temporary file with a ``.cpat`` suffix is used.

    Returns:
        Path to the CPAT output file.
    """
    if out_file:
        if file_exists(out_file):
            return out_file
    else:
        out_file = tempfile.NamedTemporaryFile(delete=False, suffix=".cpat").name
    cpat_cmd = _find_executable("cpat.py")
    # put the directory holding Rscript first on PATH and drop any R_HOME
    r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(utils.Rscript_cmd())
    template = ("{r_setup}{cpat_cmd} --gene={assembled_fasta} --hex={hexamer} "
                "--logitModel={logit} --outfile={tx_out_file}")
    message = "Predicing coding potential of %s." % (assembled_fasta)
    with file_transaction(out_file) as tx_out_file:
        shell_cmd = template.format(r_setup=r_setup, cpat_cmd=cpat_cmd,
                                    assembled_fasta=assembled_fasta, hexamer=hexamer,
                                    logit=logit, tx_out_file=tx_out_file)
        do.run(shell_cmd, message)
    return out_file
def _cnvkit_segment(cnr_file, cov_interval, data):
    """Segment normalized CNVkit inputs into a ``.cns`` file.

    The output sits next to ``cnr_file`` with a ``.cns`` extension and is only
    rebuilt when it is not up to date relative to the input.

    Returns:
        Path to the ``.cns`` output file.
    """
    out_file = os.path.splitext(cnr_file)[0] + ".cns"
    if utils.file_uptodate(out_file, cnr_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        segment_cmd = [_get_cmd(), "segment", "-o", tx_out_file, cnr_file]
        if cov_interval == "genome":
            segment_cmd.extend(["--threshold", "0.00001"])
        # preferentially use conda installed Rscript
        rscript_dir = os.path.dirname(utils.Rscript_cmd())
        env_prefix = "unset R_HOME && export PATH=%s:$PATH && " % rscript_dir
        do.run(env_prefix + " ".join(segment_cmd), "CNVkit segment")
    return out_file
def _do_run(paired):
    """Perform Battenberg calling with the paired dataset.

    This purposely does not use a temporary directory for the output
    since Battenberg does smart restarts. The command only runs while
    ``_missing_files`` reports missing outputs.

    Returns:
        The Battenberg output mapping with ``plot`` and ``ignore`` entries added.
    """
    tumor_data = paired.tumor_data
    work_dir = _sv_workdir(tumor_data)
    out = _get_battenberg_out(paired, work_dir)
    ignore_file = os.path.join(work_dir, "ignore_chromosomes.txt")
    if len(_missing_files(out)) > 0:
        ref_file = dd.get_ref_file(tumor_data)
        # Battenberg resources sit in a "battenberg" directory one level above
        # the directory holding the reference file
        bat_datadir = os.path.normpath(
            os.path.join(os.path.dirname(ref_file), os.pardir, "battenberg"))
        impute_info = os.path.join(bat_datadir, "impute", "impute_info.txt")
        ignore_file, gl_file = _make_ignore_file(work_dir, ref_file, ignore_file, impute_info)
        tooldir = install.get_defaults().get("tooldir", "/usr/local")
        sitelib = os.path.join(tooldir, "lib", "R", "site-library")
        tumor_bam = paired.tumor_bam
        normal_bam = paired.normal_bam
        platform = dd.get_platform(tumor_data)
        genome_build = tumor_data["genome_build"]
        # scale cores to avoid over-using memory during imputation
        cores = max(1, int(dd.get_num_cores(tumor_data) * 0.5))
        gender_codes = {"male": "XY", "female": "XX", "unknown": "L"}
        gender = gender_codes.get(population.get_gender(tumor_data))
        gender_str = "-ge %s" % (gender)
        if gender == "L":
            gender_str += " -gl %s" % gl_file
        rscript_dir = os.path.dirname(utils.Rscript_cmd())
        r_export_cmd = "unset R_HOME && export PATH=%s:$PATH && " % rscript_dir
        template = (
            "export R_LIBS_USER={local_sitelib} && {r_export_cmd}"
            "battenberg.pl -t {cores} -o {work_dir} -r {ref_file}.fai "
            "-tb {tumor_bam} -nb {normal_bam} -e {bat_datadir}/impute/impute_info.txt "
            "-u {bat_datadir}/1000genomesloci -c {bat_datadir}/probloci.txt "
            "-ig {ignore_file} {gender_str} "
            "-assembly {genome_build} -species Human -platform {platform}")
        shell_cmd = template.format(
            local_sitelib=sitelib, r_export_cmd=r_export_cmd, cores=cores,
            work_dir=work_dir, ref_file=ref_file, tumor_bam=tumor_bam,
            normal_bam=normal_bam, bat_datadir=bat_datadir, ignore_file=ignore_file,
            gender_str=gender_str, genome_build=genome_build, platform=platform)
        do.run(shell_cmd, "Battenberg CNV calling")
        assert len(_missing_files(out)) == 0, \
            "Missing Battenberg output: %s" % _missing_files(out)
    out["plot"] = _get_battenberg_out_plots(paired, work_dir)
    out["ignore"] = ignore_file
    return out
def _run_titancna(cn_file, het_file, ploidy, num_clusters, work_dir, data):
    """Run the titanCNA.R wrapper script for one ploidy and cluster count.

    Results go to ``<work_dir>/run_ploidy<ploidy>``; the run is skipped when
    the ``.titan.txt`` output is up to date relative to ``cn_file``. The
    script runs from a temporary directory whose cluster outputs (and
    ``Rplots.pdf``, when present) are moved into the ploidy directory.

    Returns:
        Path to the ploidy directory.
    """
    sample = dd.get_sample_name(data)
    cores = dd.get_num_cores(data)
    export_cmd = utils.get_R_exports()
    ploidy_dir = utils.safe_makedir(os.path.join(work_dir, "run_ploidy%s" % ploidy))
    cluster_dir = "%s_cluster%02d" % (sample, num_clusters)
    out_dir = os.path.join(ploidy_dir, cluster_dir)
    if utils.file_uptodate(out_dir + ".titan.txt", cn_file):
        return ploidy_dir
    with tx_tmpdir(data) as tmp_dir:
        with utils.chdir(tmp_dir):
            # placeholders are filled from locals() when the command is run,
            # so the names used in the template must stay as they are
            cmd_parts = [
                "{export_cmd} && titanCNA.R --id {sample} --hetFile {het_file} --cnFile {cn_file} "
                "--numClusters {num_clusters} --ploidy {ploidy} --numCores {cores} --outDir {tmp_dir} "
                "--libdir None"]
            chroms = []
            for contig in ref.file_contigs(dd.get_ref_file(data)):
                if chromhacks.is_autosomal_or_x(contig.name):
                    chroms.append("'%s'" % contig.name.replace("chr", ""))
            if "'X'" not in chroms:
                chroms.append("'X'")
            # Use UCSC style naming for human builds to support BSgenome
            build = dd.get_genome_build(data)
            genome_build = "hg19" if build in ["GRCh37", "hg19"] else build
            cmd_parts.append(""" --chrs "c(%s)" """ % ",".join(chroms))
            cmd_parts.append(" --genomeBuild {genome_build}")
            if data["genome_build"] in ("hg19", "hg38"):
                cmd_parts.append(" --genomeStyle UCSC")
            if data["genome_build"] in ["hg38"]:
                # cytoband data is located relative to the resolved titanCNA.R
                # path found next to Rscript
                titan_script = os.path.realpath(
                    os.path.join(os.path.dirname(utils.Rscript_cmd()), "titanCNA.R"))
                data_dir = os.path.normpath(
                    os.path.join(os.path.dirname(titan_script), os.pardir, os.pardir, "data"))
                cytoband_file = os.path.join(data_dir, "cytoBand_hg38.txt")
                assert os.path.exists(cytoband_file), cytoband_file
                cmd_parts.append(" --cytobandFile %s" % cytoband_file)
            # TitanCNA's model is influenced by the variance in read coverage data
            # and data type: set reasonable defaults for non-WGS runs
            # (see https://github.com/gavinha/TitanCNA/tree/master/scripts/R_scripts)
            if dd.get_coverage_interval(data) != "genome":
                cmd_parts.append(" --alphaK=2500 --alphaKHigh=2500")
            cmd = "".join(cmd_parts)
            do.run(cmd.format(**locals()),
                   "TitanCNA CNV detection: ploidy %s, cluster %s" % (ploidy, num_clusters))
        for produced in glob.glob(os.path.join(tmp_dir, cluster_dir + "*")):
            shutil.move(produced, ploidy_dir)
        rplots = os.path.join(tmp_dir, "Rplots.pdf")
        if os.path.exists(rplots):
            shutil.move(rplots, os.path.join(ploidy_dir, "%s.Rplots.pdf" % cluster_dir))
    return ploidy_dir
def make_logit_model(coding_fasta, noncoding_fasta, hexamers, out_dir=None):
    """Build the coding/noncoding logit model with ``make_logitModel.py``.

    The model is written to ``<out_dir>/logit.logit.RData``; an existing file
    is reused. The script runs against a temporary output prefix and the
    resulting ``.logit.RData`` file is then moved into place.

    Returns:
        Path to the ``.logit.RData`` model file.
    """
    safe_makedir(out_dir)
    out_file = os.path.join(out_dir, "logit") + ".logit.RData"
    if file_exists(out_file):
        return out_file
    tx_prefix = tempfile.NamedTemporaryFile(delete=False).name
    logit_cmd = _find_executable("make_logitModel.py")
    # put the directory holding Rscript first on PATH and drop any R_HOME
    rscript_dir = os.path.dirname(utils.Rscript_cmd())
    r_setup = "unset R_HOME && export PATH=%s:$PATH && " % rscript_dir
    template = ("{r_setup}{logit_cmd} --cgene={coding_fasta} --ngene={noncoding_fasta} "
                "--hex={hexamers} --outfile={tx_prefix}")
    shell_cmd = template.format(r_setup=r_setup, logit_cmd=logit_cmd,
                                coding_fasta=coding_fasta,
                                noncoding_fasta=noncoding_fasta,
                                hexamers=hexamers, tx_prefix=tx_prefix)
    do.run(shell_cmd, "Building coding/noncoding logistical model.")
    shutil.move(tx_prefix + ".logit.RData", out_file)
    return out_file
def _run_purecn_dx(out, paired):
    """Extract signatures and mutational burdens from a PureCN rds file.

    Runs PureCN's ``Dx.R`` inside a file transaction unless the mutation
    burden output is already up to date relative to the rds file, then moves
    the files it produced next to the final output base.

    Returns:
        The output mapping from ``_get_purecn_dx_files``.
    """
    out_base, out, all_files = _get_purecn_dx_files(paired, out)
    tumor_data = paired.tumor_data
    rscript = utils.Rscript_cmd("r36")
    dx_script = utils.R_package_script("r36", "PureCN", "extdata/Dx.R")
    repeats_bed = dd.get_variation_resources(tumor_data)["simple_repeat"]
    callable_bed = dd.get_sample_callable(tumor_data)
    if utils.file_uptodate(out["mutation_burden"], out["rds"]):
        return out
    with file_transaction(tumor_data, out_base) as tx_out_base:
        dx_cmd = [rscript, dx_script,
                  "--rds", out["rds"],
                  "--callable", callable_bed,
                  "--signatures",
                  "--exclude", repeats_bed,
                  "--out", tx_out_base]
        do.run(dx_cmd, "PureCN Dx mutational burden and signatures")
        tx_dir = os.path.dirname(tx_out_base)
        final_dir = os.path.dirname(out_base)
        # only files that were actually produced are moved
        for fname in all_files:
            produced = os.path.join(tx_dir, fname)
            if os.path.exists(produced):
                shutil.move(produced, os.path.join(final_dir, fname))
    return out
def _run_vardict_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect variants with Vardict.

    This is used for paired tumor / normal samples.

    When ``out_file`` is not given it is derived from the first BAM file
    name. An existing ``out_file`` is reused. An empty VCF is written when
    the target regions are not a BED file, and calling is delegated to
    ``_run_vardict_caller`` when the pair has no normal BAM.

    Returns:
        Path to the output VCF, passed through ``annotation.add_dbsnp`` when
        ``assoc_files`` has a ``dbsnp`` entry.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            target = shared.subset_variant_regions(dd.get_variant_regions(items[0]), region,
                                                   out_file, do_merge=True)
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not _is_bed_file(target):
                vcfutils.write_empty_vcf(tx_out_file, config,
                                         samples=[x for x in [paired.tumor_name, paired.normal_name] if x])
            else:
                if not paired.normal_bam:
                    # no normal BAM: fall back to the non-paired caller
                    ann_file = _run_vardict_caller(align_bams, items, ref_file,
                                                   assoc_files, region, out_file)
                    return ann_file
                # NOTE: the command template at the end of this branch is
                # filled from locals(), so the local names below are part of
                # that contract and must not be renamed.
                vardict = get_vardict_command(items[0])
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                strandbias = "testsomatic.R"
                var2vcf = "var2vcf_paired.pl"
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                # min_allele_fraction is divided by 100 before being passed
                # as -f; it defaults to 10 when not configured
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                # merge bed file regions as amplicon VarDict is only supported in single sample mode
                opts = " ".join(_vardict_options_from_config(items, config, out_file, target))
                coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require 50 worth of coverage
                var2vcf_opts = " -v 50 " if highdepth.get_median_coverage(items[0]) > 5000 else ""
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                # both filters are skipped when any sample lists
                # "vardict_somatic_filter" under tools_off
                if any("vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, [])
                       for data in items):
                    somatic_filter = ""
                    freq_filter = ""
                else:
                    var2vcf_opts += " -M "  # this makes VarDict soft filter non-differential variants
                    somatic_filter = ("| sed 's/\\\\.*Somatic\\\\/Somatic/' "
                                      "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
                                      "| %s -x 'bcbio.variation.freebayes.call_somatic(x)'" %
                                      os.path.join(os.path.dirname(sys.executable), "py"))
                    freq_filter = ("| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
                                   "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'" %
                                   (os.path.join(os.path.dirname(sys.executable), "py"),
                                    0, dd.get_aligner(paired.tumor_data)))
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                # put the directory holding Rscript first on PATH and drop any R_HOME
                r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(utils.Rscript_cmd())
                cmd = ("{r_setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                       "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                       "| {strandbias} "
                       "| {var2vcf} -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} "
                       "-N \"{paired.tumor_name}|{paired.normal_name}\" "
                       "{freq_filter} "
                       "{somatic_filter} | {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} "
                       "{compress_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
    out_file = (annotation.add_dbsnp(out_file, assoc_files["dbsnp"], config)
                if assoc_files.get("dbsnp") else out_file)
    return out_file
def _run_vardict_caller(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect SNPs and indels with VarDict.

    Each BAM is called on its own. With a single BAM the result goes straight
    to the output file; with several, per-sample VCFs are written first and
    then merged. An empty VCF is written for a sample when the target regions
    are not a BED file. An existing ``out_file`` is reused.

    Args:
        align_bams: BAM files, paired by position with ``items``.
        items: sample data, one per BAM file.
        ref_file: reference genome, passed to VarDict as ``-G``.
        assoc_files: associated resources; a ``dbsnp`` entry triggers
            annotation of the final file.
        region: region to restrict calling to.
        out_file: output VCF; derived from the first BAM name when None.

    Returns:
        Path to the output VCF, passed through ``annotation.add_dbsnp`` when
        ``assoc_files`` has a ``dbsnp`` entry.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            vrs = bedutils.population_variant_regions(items)
            target = shared.subset_variant_regions(vrs, region,
                                                   out_file, do_merge=False)
            num_bams = len(align_bams)
            sample_vcf_names = []  # for individual sample names, given batch calling may be required
            # zip works on both Python 2 and 3; itertools.izip was removed in
            # Python 3. The inputs are short per-sample lists.
            for bamfile, item in zip(align_bams, items):
                # prepare commands
                # NOTE: the command template is filled from locals(), so the
                # local names below must not be renamed.
                sample = dd.get_sample_name(item)
                vardict = get_vardict_command(items[0])
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts = (" ".join(_vardict_options_from_config(items, config, out_file, target))
                        if _is_bed_file(target) else "")
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                # min_allele_fraction is divided by 100 before being passed
                # as -f; it defaults to 10 when not configured
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require 50 worth of coverage
                var2vcf_opts = " -v 50 " if highdepth.get_median_coverage(items[0]) > 5000 else ""
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                # put the directory holding Rscript first on PATH and drop any R_HOME
                r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(utils.Rscript_cmd())
                cmd = ("{r_setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                       "-N {sample} -b {bamfile} {opts} "
                       "| {strandbias}"
                       "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                       "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}")
                if num_bams > 1:
                    # several samples: write a per-sample VCF to merge later
                    temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        if not _is_bed_file(target):
                            vcfutils.write_empty_vcf(tx_tmp_file, config, samples=[sample])
                        else:
                            cmd += " > {tx_tmp_file}"
                            do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
                else:
                    if not _is_bed_file(target):
                        vcfutils.write_empty_vcf(tx_out_file, config, samples=[sample])
                    else:
                        cmd += " > {tx_out_file}"
                        do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(orig_files=sample_vcf_names,
                                             out_file=tx_out_file, ref_file=ref_file,
                                             config=config, region=bamprep.region_to_gatk(region))
    out_file = (annotation.add_dbsnp(out_file, assoc_files["dbsnp"], config)
                if assoc_files.get("dbsnp") else out_file)
    return out_file
def _get_env():
    """Return a shell prefix that unsets JAVA_HOME and extends PATH.

    The directory holding the Rscript executable is put first on ``PATH``.
    The prefix ends with ``&& `` so a command can be appended directly.
    """
    rscript_dir = os.path.dirname(utils.Rscript_cmd())
    return "unset JAVA_HOME && export PATH=%s:$PATH && " % rscript_dir
def _run_purecn(paired, work_dir):
    """Run PureCN.R wrapper with pre-segmented CNVkit or GATK4 inputs.

    The run is skipped when the rds output is up to date relative to the
    normalized bins, or when an earlier run left a ``-failed.log`` file.
    Errors accepted by ``_allowed_errors`` are written to that log file
    instead of being raised.

    Returns:
        The PureCN output mapping when its rds file exists, otherwise None.
    """
    segmenters = {"cnvkit": _segment_normalized_cnvkit,
                  "gatk-cnv": _segment_normalized_gatk}
    out_base, out, all_files = _get_purecn_files(paired, work_dir)
    failed_file = out_base + "-failed.log"
    cnr_file = tz.get_in(["depth", "bins", "normalized"], paired.tumor_data)
    already_handled = utils.file_uptodate(out["rds"], cnr_file) or utils.file_exists(failed_file)
    if not already_handled:
        segment_fn = segmenters[cnvkit.bin_approach(paired.tumor_data)]
        cnr_file, seg_file = segment_fn(cnr_file, work_dir, paired)
        from bcbio import heterogeneity
        tumor_variants = heterogeneity.get_variants(paired.tumor_data, include_germline=False)
        vcf_file = germline.filter_to_pass_and_reject(tumor_variants[0]["vrn_file"], paired,
                                                      out_dir=work_dir)
        with file_transaction(paired.tumor_data, out_base) as tx_out_base:
            # Use UCSC style naming for human builds to support BSgenome
            build = dd.get_genome_build(paired.tumor_data)
            genome = "hg19" if build in ["GRCh37", "hg19"] else build
            rscript = utils.Rscript_cmd()
            purecn_r = utils.R_package_script("PureCN", "extdata/PureCN.R", env="base")
            purecn_args = [rscript, purecn_r,
                           "--seed", "42",
                           "--out", tx_out_base,
                           "--rds", "%s.rds" % tx_out_base,
                           "--sampleid", dd.get_sample_name(paired.tumor_data),
                           "--genome", genome,
                           "--vcf", vcf_file,
                           "--tumor", cnr_file,
                           "--segfile", seg_file,
                           "--funsegmentation", "Hclust",
                           "--maxnonclonal", "0.3"]
            num_cores = dd.get_num_cores(paired.tumor_data)
            if num_cores > 1:
                purecn_args.extend(["--cores", str(num_cores)])
            try:
                sitelib = utils.R_sitelib(env="base")
                r_exports = utils.get_R_exports(env="base")
                r_call = " ".join(str(arg) for arg in purecn_args)
                do.run("export R_LIBS_USER=%s && %s && %s" % (sitelib, r_exports, r_call),
                       "PureCN copy number calling")
            except subprocess.CalledProcessError as msg:
                if not _allowed_errors(str(msg)):
                    logger.exception()
                    raise
                logger.info("PureCN failed to find solution for %s: skipping" %
                            dd.get_sample_name(paired.tumor_data))
                with open(failed_file, "w") as log_handle:
                    log_handle.write(str(msg))
            tx_dir = os.path.dirname(tx_out_base)
            final_dir = os.path.dirname(out_base)
            # only files that were actually produced are moved
            for fname in all_files:
                produced = os.path.join(tx_dir, fname)
                if os.path.exists(produced):
                    shutil.move(produced, os.path.join(final_dir, fname))
    out = _get_purecn_files(paired, work_dir, require_exist=True)[1]
    if out.get("rds") and os.path.exists(out["rds"]):
        return out
    return None
def _run_vardict_caller(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect SNPs and indels with VarDict.

    var2vcf_valid uses -A flag which reports all alleles and improves sensitivity:
    https://github.com/AstraZeneca-NGS/VarDict/issues/35#issuecomment-276738191

    Each BAM is called on its own. With more than one BAM the per-sample
    VCFs are written to temporary files and merged afterwards; with a single
    BAM the calls go straight to the transactional raw output. When the
    subset target is not a BED file an empty VCF is written instead of
    running VarDict.

    Args:
        align_bams: BAM paths, paired positionally with ``items``.
        items: per-sample data dictionaries; ``items[0]["config"]`` supplies
            the configuration used for every sample.
        ref_file: reference passed to VarDict (``-G``), to the contig header
            fix and to the multi-sample merge.
        assoc_files: mapping that may contain a "dbsnp" entry; when present
            the raw calls are annotated with it, otherwise they are
            symlinked to ``out_file``.
        region: region passed to ``shared.subset_variant_regions`` and, for
            multi-sample merges, converted with ``bamprep.region_to_gatk``.
        out_file: final output path; defaults to
            ``<first BAM without extension>-variants.vcf.gz``.

    Returns:
        ``out_file``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        raw_file = "%s-raw%s" % utils.splitext_plus(out_file)
        with file_transaction(items[0], raw_file) as tx_out_file:
            vrs = bedutils.population_variant_regions(items)
            target = shared.subset_variant_regions(
                vrs, region, out_file, items=items, do_merge=False)
            has_regions = _is_bed_file(target)
            num_bams = len(align_bams)
            # Shell pipeline template; the output redirect is appended per sample.
            cmd_tmpl = (
                "{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                "-N {sample} -b {bamfile} {opts} "
                "| {strandbias}"
                "| {var2vcf} -A -N {sample} -E -f {freq} {var2vcf_opts} "
                """| {py_cl} -x 'bcbio.variation.vcfutils.add_contig_to_header(x, "{ref_file}")' """
                "| bcftools filter -i 'QUAL >= 0' "
                "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}")
            # for individual sample names, given batch calling may be required
            sample_vcf_names = []
            for bamfile, item in zip(align_bams, items):
                # prepare commands
                sample = dd.get_sample_name(item)
                vardict = get_vardict_command(items[0])
                opts = (" ".join(_vardict_options_from_config(items, config, out_file, target))
                        if has_regions else "")
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if tx_out_file.endswith("gz") else ""
                # min_allele_fraction is configured as a percentage (default 10).
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                # for deep targeted panels, require 50 worth of coverage
                var2vcf_opts = " -v 50 " if dd.get_avg_coverage(items[0]) > 5000 else ""
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                py_cl = os.path.join(utils.get_bcbio_bin(), "py")
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                setup = ("unset R_HOME && unset JAVA_HOME && export PATH=%s:$PATH && "
                         % os.path.dirname(utils.Rscript_cmd()))
                # Explicit mapping instead of format(**locals()): every
                # placeholder in cmd_tmpl is named here.
                cmd_args = {
                    "setup": setup,
                    "jvm_opts": jvm_opts,
                    "vardict": vardict,
                    "ref_file": ref_file,
                    "freq": freq,
                    "sample": sample,
                    "bamfile": bamfile,
                    "opts": opts,
                    "strandbias": "teststrandbias.R",
                    "var2vcf": "var2vcf_valid.pl",
                    "var2vcf_opts": var2vcf_opts,
                    "py_cl": py_cl,
                    "fix_ambig_ref": fix_ambig_ref,
                    "fix_ambig_alt": fix_ambig_alt,
                    "remove_dup": remove_dup,
                    "vcfstreamsort": vcfstreamsort,
                    "compress_cmd": compress_cmd,
                }
                if num_bams > 1:
                    temp_file_prefix = (raw_file.replace(".gz", "").replace(".vcf", "")
                                        + item["name"][1])
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if raw_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        _vardict_sample_to_file(cmd_tmpl, cmd_args, tx_tmp_file,
                                                has_regions, config, sample)
                else:
                    _vardict_sample_to_file(cmd_tmpl, cmd_args, tx_out_file,
                                            has_regions, config, sample)
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(orig_files=sample_vcf_names,
                                             out_file=tx_out_file, ref_file=ref_file,
                                             config=config,
                                             region=bamprep.region_to_gatk(region))
        if assoc_files.get("dbsnp"):
            annotation.add_dbsnp(raw_file, assoc_files["dbsnp"], items[0], out_file)
        else:
            utils.symlink_plus(raw_file, out_file)
    return out_file


def _vardict_sample_to_file(cmd_tmpl, cmd_args, dest_file, has_regions, config, sample):
    """Write one sample's VarDict calls to dest_file, or an empty VCF when there are no BED regions."""
    if not has_regions:
        vcfutils.write_empty_vcf(dest_file, config, samples=[sample])
    else:
        cmd = (cmd_tmpl + " > {dest_file}").format(dest_file=dest_file, **cmd_args)
        do.run(cmd, "Genotyping with VarDict: Inference", {})
def _report_summary(samples, out_dir):
    """Run coverage report with bcbiocov package.

    Collects QC outputs under ``out_dir`` (qsignature matrix, merged fastqc,
    metrics and target information, coverage and variant QC files), calls
    ``bcbreport.report`` on that directory and writes an R script that would
    render the report. The script itself is not executed here.

    Args:
        samples: sample data dictionaries; ``samples[0]`` supplies the work
            directory.
        out_dir: directory to create and populate with the report inputs.

    Returns:
        The samples as returned by ``_merge_metrics`` and
        ``_merge_target_information``, or the input ``samples`` unchanged
        when ``bcbreport`` is not installed.
    """
    try:
        import bcbreport.prepare as bcbreport
    except ImportError:
        logger.info("skipping report. No bcbreport installed.")
        return samples
    work_dir = dd.get_work_dir(samples[0])
    parent_dir = utils.safe_makedir(out_dir)
    # Relative paths below are resolved inside parent_dir.
    with utils.chdir(parent_dir):
        logger.info("copy qsignature")
        # this need to be inside summary/qc dict
        qsignature_fn = os.path.join(work_dir, "qc", "qsignature", "qsignature.ma")
        # NOTE(review): the guard checks "qsignature.ma" but the copy is
        # written to "bcbio_qsignature.ma", so an existing copy is always
        # overwritten. Behavior kept as-is; confirm which name is intended.
        if utils.file_exists(qsignature_fn) and not utils.file_exists("qsignature.ma"):
            shutil.copy(qsignature_fn, "bcbio_qsignature.ma")

        out_dir = utils.safe_makedir("fastqc")
        logger.info("summarize fastqc")
        with utils.chdir(out_dir):
            _merge_fastqc(samples)

        logger.info("summarize metrics")
        samples = _merge_metrics(samples)

        logger.info("summarize target information")
        samples = _merge_target_information(samples)

        out_dir = utils.safe_makedir("coverage")
        logger.info("summarize coverage")
        _copy_matching_qc_files(samples, "coverage", "coverage_fixed", out_dir)

        out_dir = utils.safe_makedir("variants")
        logger.info("summarize variants")
        _copy_matching_qc_files(samples, "variants", "gc-depth-parse.tsv", out_dir)

        bcbreport.report(parent_dir)
        out_report = os.path.join(parent_dir, "qc-coverage-report.html")
        if not utils.file_exists(out_report):
            rmd_file = os.path.join(parent_dir, "report-ready.Rmd")
            run_file = "%s-run.R" % (os.path.splitext(out_report)[0])
            with open(run_file, "w") as out_handle:
                out_handle.write("""library(rmarkdown)\nrender("%s")\n""" % rmd_file)
            # Skip automated generation of coverage report to avoid error
            # messages. We need to generalize coverage reporting and re-include.
            # The run script is still written so it can be run by hand with Rscript.
            if utils.file_exists("report-ready.html"):
                shutil.move("report-ready.html", out_report)
    return samples


def _copy_matching_qc_files(samples, qc_key, name_marker, out_dir):
    """Copy each sample's summary/qc/<qc_key> files whose basename contains name_marker into out_dir."""
    for data in samples:
        pfiles = tz.get_in(["summary", "qc", qc_key], data, [])
        if isinstance(pfiles, dict):
            pfiles = [pfiles["base"]] + pfiles["secondary"]
        elif pfiles:
            pfiles = [pfiles]
        else:
            # Covers a missing key as well as an explicit None or empty value.
            pfiles = []
        for fn in pfiles:
            base_name = os.path.basename(fn)
            if name_marker in base_name:
                utils.copy_plus(fn, os.path.join(out_dir, base_name))