def get_coverage(data):
    """Calculate PureCN coverage for a sample BAM, accounting for GC content.

    Runs PureCN's ``extdata/Coverage.R`` script from the "r36" R environment
    against the sample BAM and the prepared PureCN interval file
    (``config/algorithm/purecn_bed_ready``). The R script is skipped when the
    expected output already exists.

    data -- a single sample (unwrapped with ``utils.to_single_data``)

    Returns the path of the expected ``<bam name>_coverage_loess.txt.gz`` file
    in the sample work directory. A failing R run is logged and not re-raised,
    so the returned path is not guaranteed to exist.
    """
    data = utils.to_single_data(data)
    work_dir = _sv_workdir(data)
    rscript = utils.Rscript_cmd("r36")
    coverage_r = utils.R_package_script("r36", "PureCN", "extdata/Coverage.R")
    intervals = tz.get_in(["config", "algorithm", "purecn_bed_ready"], data)
    # PureCN resolves symlinks and the actual output PureCN coverage file name
    # is derived from the end bam not from bam_file
    bam_file = os.path.realpath(dd.get_align_bam(data))
    bname = os.path.splitext(os.path.basename(bam_file))[0]
    result_file = os.path.join(work_dir, bname + "_coverage_loess.txt.gz")
    if not os.path.exists(result_file):
        cmd = [rscript, coverage_r,
               "--outdir", work_dir,
               "--bam", bam_file,
               "--intervals", intervals]
        try:
            cmd_line = "export R_LIBS_USER=%s && %s && %s" % (
                utils.R_sitelib(env="r36"),
                utils.get_R_exports(env="r36"),
                " ".join([str(x) for x in cmd]))
            do.run(cmd_line, "PureCN coverage")
        except subprocess.CalledProcessError:
            logger.info("PureCN failed to calculate coverage")
        else:
            # Only report the output as saved when the run actually succeeded.
            logger.debug("Saved PureCN coverage files to " + result_file)
    return result_file
def create_normal_db(coverage_files_txt, snv_pon, out_dir, genome_build):
    """Create a PureCN normal database with ``extdata/NormalDB.R``.

    input:
        coverage_files_txt -- coverage files calculated by PureCN for each
            sample (passed as ``--coveragefiles``)
        snv_pon -- mutect2 SNV PON (passed as ``--normal_panel``)
        out_dir -- output directory handed to the R script
        genome_build -- genome name handed to the R script
    output (written by the R script, e.g. for hg38):
        mapping_bias_hg38.rds
        normalDB_hg38.rds

    Returns ``out_dir``. A failing R run is logged and not re-raised.
    """
    script_args = [
        utils.Rscript_cmd("r36"),
        utils.R_package_script("r36", "PureCN", "extdata/NormalDB.R"),
        "--outdir", out_dir,
        "--coveragefiles", coverage_files_txt,
        "--normal_panel", snv_pon,
        "--genome", genome_build,
        "--force",
    ]
    try:
        sitelib = utils.R_sitelib(env="r36")
        r_exports = utils.get_R_exports(env="r36")
        invocation = " ".join(str(arg) for arg in script_args)
        shell_cmd = "export R_LIBS_USER=%s && %s && %s" % (sitelib, r_exports, invocation)
        do.run(shell_cmd, "PureCN normalDB")
    except subprocess.CalledProcessError:
        logger.info("PureCN failed to create a normal db")
    return out_dir
def process_intervals(data):
    """Prepare the PureCN intervals file for a sample.

    The input BED comes from ``regions.get_sv_bed``; when that is empty the
    cleaned variant regions are used instead. PureCN's
    ``extdata/IntervalFile.R`` is then run to write ``<bed basename>.txt`` and
    export ``<bed basename>.optimized.bed``. An existing ``.txt`` file is
    reused without re-running the script.

    Returns the path of the interval file, or None when no BED file is
    available. A failing R run is logged and not re-raised, so the returned
    path is not guaranteed to exist.
    """
    bed_file = regions.get_sv_bed(data)
    if not bed_file:
        bed_file = bedutils.clean_file(dd.get_variant_regions(data), data)
    if not bed_file:
        return None
    basename = os.path.splitext(bed_file)[0]
    ready_file = basename + ".txt"
    if os.path.exists(ready_file):
        return ready_file
    optimized_bed = basename + ".optimized.bed"
    rscript = utils.Rscript_cmd("r36")
    interval_file_r = utils.R_package_script("r36", "PureCN", "extdata/IntervalFile.R")
    ref_file = dd.get_ref_file(data)
    mappability_resource = dd.get_variation_resources(data)["purecn_mappability"]
    genome = dd.get_genome_build(data)
    cmd = [rscript, interval_file_r,
           "--infile", bed_file,
           "--fasta", ref_file,
           "--outfile", ready_file,
           "--offtarget",
           "--genome", genome,
           "--export", optimized_bed,
           "--mappability", mappability_resource]
    try:
        cmd_line = "export R_LIBS_USER=%s && %s && %s" % (
            utils.R_sitelib(env="r36"),
            utils.get_R_exports(env="r36"),
            " ".join([str(x) for x in cmd]))
        do.run(cmd_line, "PureCN intervals")
    except subprocess.CalledProcessError:
        logger.info("PureCN failed to prepare intervals")
    else:
        # Only report the file as saved when the run actually succeeded.
        logger.debug("Saved PureCN interval file into " + ready_file)
    return ready_file
def _run_on_chrom(chrom, work_bams, names, work_dir, items):
    """Run cn.mops on the work BAMs, restricted to one chromosome if given.

    The output BED is named after the first work BAM, the current batch (if
    any) and the chromosome ("all" when ``chrom`` is empty). An existing
    output is reused. Returns a one-element list holding the output path.
    """
    local_sitelib = utils.R_sitelib()
    batch = sshared.get_cur_batch(items)
    suffix = "-%s-cnv" % batch if batch else "-cnv"
    bam_base = os.path.splitext(os.path.basename(work_bams[0]))[0]
    region = chrom if chrom else "all"
    out_file = os.path.join(work_dir, "%s%s-%s.bed" % (bam_base, suffix, region))
    if utils.file_exists(out_file):
        return [out_file]
    with file_transaction(items[0], out_file) as tx_out_file:
        rcode = "%s-run.R" % os.path.splitext(out_file)[0]
        with open(rcode, "w") as script_handle:
            script_handle.write(
                _script.format(
                    prep_str=_prep_load_script(work_bams, names, chrom, items),
                    out_file=tx_out_file,
                    local_sitelib=local_sitelib))
        rscript_bin = utils.Rscript_cmd()
        try:
            do.run([rscript_bin, "--vanilla", rcode], "cn.mops CNV detection",
                   items[0], log_error=False)
        except subprocess.CalledProcessError as err:
            if not _allowed_cnmops_errorstates(str(err)):
                logger.exception()
                raise
            # cn.mops errors out if no CNVs found. Just write an empty file.
            with open(tx_out_file, "w") as empty_handle:
                empty_handle.write(
                    'track name=empty description="No CNVs found"\n')
    return [out_file]
def _run_bubbletree(vcf_csv, cnv_csv, data, wide_lrr=False, do_plots=True, handle_failures=True):
    """Write the BubbleTree R script for the inputs and run it.

    BubbleTree has some internal hardcoded parameters that assume a smaller
    distribution of log2 scores. That does not hold for tumor-only calls, so
    with ``wide_lrr`` the calculations are scaled to actually get calls. A
    better long term solution with flexible parameters is still needed.

    The R script is always (re)written; it is only executed when the
    prevalence report does not exist yet. Returns a dictionary with the caller
    name, the report path and the bubble/track plot paths.
    """
    # NOTE: the script template is rendered from locals(), so every local name
    # defined up to that point is part of the template's input and must not be
    # renamed.
    if wide_lrr:
        lrr_scale = 10.0
    else:
        lrr_scale = 1.0
    local_sitelib = utils.R_sitelib()
    base = utils.splitext_plus(vcf_csv)[0]
    r_file = "%s-run.R" % base
    bubbleplot_out = "%s-bubbleplot.pdf" % base
    trackplot_out = "%s-trackplot.pdf" % base
    calls_out = "%s-calls.rds" % base
    freqs_out = "%s-bubbletree_prevalence.txt" % base
    sample = dd.get_sample_name(data)
    do_plots = "yes" if do_plots else "no"
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))

    result = {
        "caller": "bubbletree",
        "report": freqs_out,
        "plot": {"bubble": bubbleplot_out, "track": trackplot_out},
    }
    if utils.file_exists(freqs_out):
        return result

    run_cmd = "%s && %s --no-environ %s" % (utils.get_R_exports(), utils.Rscript_cmd(), r_file)
    try:
        do.run(run_cmd, "Assess heterogeneity with BubbleTree")
    except subprocess.CalledProcessError as err:
        tolerated = handle_failures and _allowed_bubbletree_errorstates(str(err))
        if not tolerated:
            logger.exception()
            raise
        # Record the tolerated failure in place of the prevalence report.
        with open(freqs_out, "w") as report_handle:
            report_handle.write('bubbletree failed:\n %s"\n' % (str(err)))
    return result
def _run_purecn_normaldb(paired, out):
    """Run PureCN with a normal database and its native segmentation.

    paired -- one tumor/normal pair, or a tumor-only sample
    out -- not read; kept for interface compatibility and replaced by the
        outputs found on disk

    Runs PureCN's ``extdata/PureCN.R`` from the "r36" R environment unless
    ``<sample name>.rds`` already exists in the work directory. A failing R
    run is logged and not re-raised.

    Returns the output mapping from ``_get_purecn_files`` with
    ``require_exist=True``.
    """
    sample = utils.to_single_data(paired.tumor_data)
    sample_name = dd.get_sample_name(sample)
    work_dir = _sv_workdir(sample)
    rscript = utils.Rscript_cmd("r36")
    purecn_r = utils.R_package_script("r36", "PureCN", "extdata/PureCN.R")
    intervals = tz.get_in(["config", "algorithm", "purecn_bed_ready"], sample)
    # germline and somatic - just annotated and filters assigned
    first_variants = tz.get_in(["variants"], sample)[0]
    # in a T/N case, there is no germline file - vrn file with all variants
    variants_vcf = first_variants.get("germline") or first_variants.get("vrn_file")
    normaldb = tz.get_in(["config", "algorithm", "background", "cnv_reference",
                          "purecn_normaldb"], sample)
    mappingbiasfile = tz.get_in(["config", "algorithm", "background", "cnv_reference",
                                 "purecn_mapping_bias"], sample)
    sample_coverage = tz.get_in(["depth", "bins", "purecn"], sample)
    simple_repeat_bed = dd.get_variation_resources(sample)["simple_repeat"]
    result_file = os.path.join(work_dir, sample_name + ".rds")
    genome = dd.get_genome_build(sample)
    cmd = [rscript, purecn_r,
           "--out", work_dir,
           "--tumor", sample_coverage,
           "--sampleid", sample_name,
           "--vcf", variants_vcf,
           "--normaldb", normaldb,
           "--mappingbiasfile", mappingbiasfile,
           "--intervals", intervals,
           "--snpblacklist", simple_repeat_bed,
           "--genome", genome,
           "--force",
           "--postoptimize",
           "--seed", "123",
           "--bootstrapn", "500",
           "--cores", dd.get_num_cores(sample)]
    resources = config_utils.get_resources("purecn", sample)
    if "options" in resources:
        cmd += [str(x) for x in resources.get("options", [])]
    # it is not recommended to use matched normal sample in PureCN analysis,
    # because then it skips PON coverage normalization and denoising steps!
    # but still, if it is supplied, we use it
    if paired.normal_data:
        normal_sample = utils.to_single_data(paired.normal_data)
        if normal_sample:
            normal_coverage = tz.get_in(["depth", "bins", "purecn"], normal_sample)
            cmd.extend(["--normal", normal_coverage])
    if not os.path.exists(result_file):
        try:
            cmd_line = "export R_LIBS_USER=%s && %s && %s" % (
                utils.R_sitelib(env="r36"),
                utils.get_R_exports(env="r36"),
                " ".join([str(x) for x in cmd]))
            do.run(cmd_line, "PureCN copy number calling")
        except subprocess.CalledProcessError:
            logger.info("PureCN failed")
        else:
            logger.debug("Saved PureCN output to " + work_dir)
    # Only the output mapping (second element) is needed here.
    out = _get_purecn_files(paired, work_dir, require_exist=True)[1]
    return out
def _run_purecn(paired, work_dir):
    """Run PureCN.R wrapper with pre-segmented CNVkit or GATK4 inputs.

    PureCN is only run when ``utils.file_uptodate`` reports the existing rds
    output as not up to date relative to the normalized coverage file, and no
    ``-failed.log`` marker from an earlier attempt exists.

    Returns the output mapping from ``_get_purecn_files`` when its "rds" entry
    exists on disk, otherwise None.
    """
    # Segmentation function for each binning approach reported by
    # cnvkit.bin_approach.
    segfns = {
        "cnvkit": _segment_normalized_cnvkit,
        "gatk-cnv": _segment_normalized_gatk
    }
    out_base, out, all_files = _get_purecn_files(paired, work_dir)
    failed_file = out_base + "-failed.log"
    cnr_file = tz.get_in(["depth", "bins", "normalized"], paired.tumor_data)
    if not utils.file_uptodate(
            out["rds"], cnr_file) and not utils.file_exists(failed_file):
        cnr_file, seg_file = segfns[cnvkit.bin_approach(paired.tumor_data)](
            cnr_file, work_dir, paired)
        # NOTE(review): imported inside the function, presumably to avoid a
        # circular import -- confirm before moving to module level.
        from bcbio import heterogeneity
        vcf_file = heterogeneity.get_variants(
            paired.tumor_data, include_germline=False)[0]["vrn_file"]
        vcf_file = germline.filter_to_pass_and_reject(vcf_file,
                                                      paired,
                                                      out_dir=work_dir)
        with file_transaction(paired.tumor_data, out_base) as tx_out_base:
            # Use UCSC style naming for human builds to support BSgenome
            genome = ("hg19" if dd.get_genome_build(paired.tumor_data) in [
                "GRCh37", "hg19"
            ] else dd.get_genome_build(paired.tumor_data))
            cmd = [
                "PureCN.R", "--seed", "42", "--out", tx_out_base, "--rds",
                "%s.rds" % tx_out_base, "--sampleid",
                dd.get_sample_name(paired.tumor_data), "--genome", genome,
                "--vcf", vcf_file, "--tumor", cnr_file, "--segfile", seg_file,
                "--funsegmentation", "Hclust", "--maxnonclonal", "0.3"
            ]
            if dd.get_num_cores(paired.tumor_data) > 1:
                cmd += ["--cores", str(dd.get_num_cores(paired.tumor_data))]
            try:
                # cmd is rebound from the argument list to the full shell line.
                cmd = "export R_LIBS_USER=%s && %s && %s" % (
                    utils.R_sitelib(), utils.get_R_exports(), " ".join(
                        [str(x) for x in cmd]))
                do.run(cmd, "PureCN copy number calling")
            except subprocess.CalledProcessError as msg:
                # Errors accepted by _allowed_errors are written to the failure
                # marker, which makes later calls skip this sample; anything
                # else is logged and re-raised.
                if _allowed_errors(str(msg)):
                    logger.info(
                        "PureCN failed to find solution for %s: skipping" %
                        dd.get_sample_name(paired.tumor_data))
                    with open(failed_file, "w") as out_handle:
                        out_handle.write(str(msg))
                else:
                    logger.exception()
                    raise
            # Move whichever expected outputs were produced from the
            # transaction directory to the directory of out_base.
            for f in all_files:
                if os.path.exists(os.path.join(os.path.dirname(tx_out_base), f)):
                    shutil.move(os.path.join(os.path.dirname(tx_out_base), f),
                                os.path.join(os.path.dirname(out_base), f))
    # Re-read the outputs, this time restricted to files that exist.
    out = _get_purecn_files(paired, work_dir, require_exist=True)[1]
    return out if (out.get("rds") and os.path.exists(out["rds"])) else None
def _do_run(paired):
    """Perform Battenberg calling with the paired dataset.

    Output is deliberately written straight into the work directory instead
    of a temporary one, since Battenberg does smart restarts. The command is
    only run when some expected outputs are missing.

    Returns the output mapping, extended with "plot" and "ignore" entries.
    """
    work_dir = _sv_workdir(paired.tumor_data)
    out = _get_battenberg_out(paired, work_dir)
    ignore_file = os.path.join(work_dir, "ignore_chromosomes.txt")
    if len(_missing_files(out)) > 0:
        ref_file = dd.get_ref_file(paired.tumor_data)
        bat_datadir = os.path.normpath(
            os.path.join(os.path.dirname(ref_file), os.pardir, "battenberg"))
        impute_info = os.path.join(bat_datadir, "impute", "impute_info.txt")
        ignore_file, gl_file = _make_ignore_file(work_dir, ref_file, ignore_file, impute_info)
        tumor_bam = paired.tumor_bam
        normal_bam = paired.normal_bam
        platform = dd.get_platform(paired.tumor_data)
        genome_build = paired.tumor_data["genome_build"]
        # scale cores to avoid over-using memory during imputation
        cores = max(1, int(dd.get_num_cores(paired.tumor_data) * 0.5))
        gender_codes = {"male": "XY", "female": "XX", "unknown": "L"}
        gender = gender_codes.get(population.get_gender(paired.tumor_data))
        gender_str = ("-ge %s -gl %s" % (gender, gl_file)
                      if gender == "L" else "-ge %s" % (gender))
        r_export_cmd = utils.get_R_exports()
        local_sitelib = utils.R_sitelib()
        template = (
            "export R_LIBS_USER={local_sitelib} && {r_export_cmd} && "
            "battenberg.pl -t {cores} -o {work_dir} -r {ref_file}.fai "
            "-tb {tumor_bam} -nb {normal_bam} -e {bat_datadir}/impute/impute_info.txt "
            "-u {bat_datadir}/1000genomesloci -c {bat_datadir}/probloci.txt "
            "-ig {ignore_file} {gender_str} "
            "-assembly {genome_build} -species Human -platform {platform}")
        # Pass exactly the placeholders the template names.
        shell_cmd = template.format(
            local_sitelib=local_sitelib,
            r_export_cmd=r_export_cmd,
            cores=cores,
            work_dir=work_dir,
            ref_file=ref_file,
            tumor_bam=tumor_bam,
            normal_bam=normal_bam,
            bat_datadir=bat_datadir,
            ignore_file=ignore_file,
            gender_str=gender_str,
            genome_build=genome_build,
            platform=platform)
        do.run(shell_cmd, "Battenberg CNV calling")
        assert len(_missing_files(
            out)) == 0, "Missing Battenberg output: %s" % _missing_files(out)
    out["plot"] = _get_battenberg_out_plots(paired, work_dir)
    out["ignore"] = ignore_file
    return out