def calculate_sv_coverage(data):
    """Calculate coverage within bins for downstream CNV calling.

    Runs mosdepth over the target and antitarget bins, annotates the
    resulting regions with genes and writes ``.cnn`` files with log2 depths
    through ``_add_log2_depth``. When the target file exists afterwards, both
    paths are recorded under ``data["depth"]["bins"]``.

    Returns ``[[data]]``; the sample is passed through untouched when general
    SV bins are not in use for it.
    """
    from bcbio.variation import coverage
    from bcbio.structural import annotate, cnvkit
    data = utils.to_single_data(data)
    if not cnvkit.use_general_sv_bins(data):
        return [[data]]
    bins_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "structural", dd.get_sample_name(data), "bins"))
    target_cnn = os.path.join(bins_dir, "%s-target-coverage.cnn" % dd.get_sample_name(data))
    anti_cnn = os.path.join(bins_dir, "%s-antitarget-coverage.cnn" % dd.get_sample_name(data))
    outputs_ready = utils.file_exists(target_cnn) and utils.file_exists(anti_cnn)
    # Only recompute when an output is missing and there is a BAM to read from
    if not outputs_ready and (dd.get_align_bam(data) or dd.get_work_bam(data)):
        # mosdepth
        target_depth = coverage.run_mosdepth(
            data, "target", tz.get_in(["regions", "bins", "target"], data))
        anti_depth = coverage.run_mosdepth(
            data, "antitarget", tz.get_in(["regions", "bins", "antitarget"], data))
        target_annotated = annotate.add_genes(target_depth.regions, data, max_distance=0)
        anti_annotated = annotate.add_genes(anti_depth.regions, data, max_distance=0)
        target_cnn = _add_log2_depth(target_annotated, target_cnn, data)
        anti_cnn = _add_log2_depth(anti_annotated, anti_cnn, data)
        # TODO: Correct for GC bias
    if os.path.exists(target_cnn):
        data["depth"]["bins"] = {"target": target_cnn, "antitarget": anti_cnn}
    return [[data]]
def _add_bed_to_output(out, data):
    """Call ploidy on CNVkit segments and export the calls as BED.

    Runs ``cnvkit.py call`` on ``out["cns"]`` followed by
    ``cnvkit.py export bed``; each step is skipped when its output already
    exists. Adds ``call_file`` and a gene-annotated ``vrn_file`` to ``out``
    and returns it.
    """
    cnvkit_exe = os.path.join(os.path.dirname(sys.executable), "cnvkit.py")
    call_file = "%s-call%s" % os.path.splitext(out["cns"])
    gender = dd.get_gender(data)
    if not utils.file_exists(call_file):
        with file_transaction(data, call_file) as tx_call_file:
            call_cmd = [cnvkit_exe, "call",
                        "--ploidy", str(dd.get_ploidy(data)),
                        "-o", tx_call_file, out["cns"]]
            if gender:
                call_cmd.extend(["--gender", gender])
                if gender.lower() == "male":
                    call_cmd.append("--male-reference")
            do.run(call_cmd, "CNVkit call ploidy")
    bed_file = "%s.bed" % os.path.splitext(call_file)[0]
    if not utils.file_exists(bed_file):
        with file_transaction(data, bed_file) as tx_bed_file:
            export_cmd = [cnvkit_exe, "export", "bed",
                          "--sample-id", dd.get_sample_name(data),
                          "--ploidy", str(dd.get_ploidy(data)),
                          "-o", tx_bed_file, call_file]
            if gender and gender.lower() == "male":
                export_cmd.append("--male-reference")
            do.run(export_cmd, "CNVkit export BED")
    out["call_file"] = call_file
    out["vrn_file"] = annotate.add_genes(bed_file, data)
    return out
def combine_bed_by_size(input_beds, sample, work_dir, data, delim=","):
    """Combine a set of BED files, breaking into individual size chunks.

    For every ``(start, end)`` range in ``validate.EVENT_SIZES`` the input
    lines whose length falls in the range (plus every ``BND`` event) are
    sorted, merged with bedtools and annotated with genes. The per-size files
    are then combined into ``<work_dir>/<sample>-ensemble.bed``.

    Returns the ensemble path; nothing is written when there are no inputs or
    no size bin produced regions.
    """
    out_file = os.path.join(work_dir, "%s-ensemble.bed" % sample)
    if len(input_beds) == 0:
        return out_file
    base, ext = os.path.splitext(out_file)
    annotated_by_size = []
    for e_start, e_end in validate.EVENT_SIZES:
        size_out_file = "%s-%s_%s%s" % (base, e_start, e_end, ext)
        if not utils.file_exists(size_out_file):
            with file_transaction(data, size_out_file) as tx_out_file:
                with shared.bedtools_tmpdir(data):
                    all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                    wrote_any = False
                    with open(all_file, "w") as out_handle:
                        for line in fileinput.input(input_beds):
                            chrom, start, end, event_str = line.split()[:4]
                            event = event_str.split("_", 1)[0]
                            size = int(end) - int(start)
                            # Breakends are kept in every size bin
                            if e_start <= size < e_end or event == "BND":
                                out_handle.write(line)
                                wrote_any = True
                    if wrote_any:
                        sorted_bed = pybedtools.BedTool(all_file).sort(stream=True)
                        merged_bed = sorted_bed.merge(c=4, o="distinct", delim=delim)
                        merged_bed.saveas(tx_out_file)
        if utils.file_exists(size_out_file):
            annotated_by_size.append(annotate.add_genes(size_out_file, data))
    if len(annotated_by_size) > 0:
        out_file = bedutils.combine(annotated_by_size, out_file, data)
    return out_file
def _prep_bed(data, work_dir):
    """Select, clean and gene-annotate the BED file used as Seq2C input.

    Uses the SV regions BED when one is configured, otherwise the variant
    regions. Inputs with fewer than four columns are annotated with gene
    names (one gene per line); a ``ValueError`` is raised when annotation
    leaves the file unchanged. The final file keeps only regions with a real
    name and is written next to the annotated file with a ``-seq2cclean.bed``
    suffix.
    """
    sv_bed = regions.get_sv_bed(data)
    if sv_bed:
        bed_file = clean_file(sv_bed, data, prefix="svregions-")
    else:
        bed_file = clean_file(dd.get_variant_regions(data), data)
    col_num = bt.BedTool(bed_file).field_count()
    if col_num >= 4:
        annotated_file = bed_file
    else:
        annotated_file = annotate.add_genes(bed_file, data, max_distance=0)
        if annotated_file == bed_file:
            raise ValueError("BED file for Seq2C must be annotated with gene names, "
                             "however the input BED is 3-columns and we have no transcript "
                             "data to annotate with " + bed_file)
        annotated_file = annotate.gene_one_per_line(annotated_file, data)
    ready_file = "%s-seq2cclean.bed" % (utils.splitext_plus(annotated_file)[0])
    if not utils.file_uptodate(ready_file, annotated_file):
        named_bed = bt.BedTool(annotated_file)
        # Trim extra columns unless the file has exactly eight of them
        if col_num > 4 and col_num != 8:
            named_bed = named_bed.cut(range(4))
        named_bed = named_bed.filter(lambda feat: feat.name not in ("", ".", "-"))
        with file_transaction(data, ready_file) as tx_out_file:
            named_bed.saveas(tx_out_file)
        logger.debug("Saved Seq2C clean annotated ready input BED into " + ready_file)
    return ready_file
def _annotate_bed(bed_fpath, data, work_dir):
    """Annotate a BED file with gene names for Seq2C.

    Returns the path produced by ``annotate.add_genes``.

    Raises:
        ValueError: when annotation returns the input path unchanged, i.e. no
            gene names could be added.
    """
    annotate_bed = annotate.add_genes(bed_fpath, data, work_dir=work_dir)
    if annotate_bed == bed_fpath:
        # Trailing space keeps the path separate from the message text
        raise ValueError("BED file for Seq2C must be annotated with gene names, "
                         "however the input BED is 3-columns and we have no transcript "
                         "data to annotate with " + bed_fpath)
    return annotate_bed
def _gids_to_genes(gids, ssm_locs, cnv_ssms, data):
    """Convert support ids for SNPs and SSMs into associated genes.

    Each id is looked up directly in ``ssm_locs``; ids missing there are
    expanded through ``cnv_ssms`` into further ``ssm_locs`` keys. The
    collected positions are written as 1bp BED intervals, annotated with
    genes within 10kb, and the sorted unique gene names are returned.
    """
    positions_by_chrom = collections.defaultdict(set)
    for gid in gids:
        try:
            found = [ssm_locs[gid]]
        except KeyError:
            found = [ssm_locs[ssm_id] for ssm_id in cnv_ssms.get(gid, [])]
        for chrom, pos in found:
            positions_by_chrom[chrom].add(pos)
    gene_names = set()
    with tx_tmpdir(data) as tmpdir:
        # Match the contig naming of the reference, judged by its first contig
        first_contig = next(ref.file_contigs(dd.get_ref_file(data)))
        chrom_prefix = "chr" if first_contig.name.startswith("chr") else ""
        loc_file = os.path.join(tmpdir, "battenberg_find_genes.bed")
        with open(loc_file, "w") as out_handle:
            for chrom in sorted(positions_by_chrom):
                for pos in sorted(positions_by_chrom[chrom]):
                    out_handle.write("%s%s\t%s\t%s\n" % (chrom_prefix, chrom, pos - 1, pos))
        ann_file = annotate.add_genes(loc_file, data, max_distance=10000)
        for region in pybedtools.BedTool(ann_file):
            gene_names.update(g for g in region.name.split(",") if g != ".")
    return sorted(gene_names)
def _add_variantcalls_to_output(out, data):
    """Call ploidy and convert into VCF and BED representations.

    Runs ``cnvkit.py call`` on ``out["cns"]`` and exports the calls as BED
    and VCF, skipping any step whose output already exists. Adds
    ``call_file``, ``vrn_bed`` (gene-annotated BED) and ``vrn_file`` (the
    snpEff-annotated VCF, or the plain VCF when no effects file comes back)
    to ``out`` and returns it.
    """
    cnvkit_exe = os.path.join(os.path.dirname(sys.executable), "cnvkit.py")
    call_file = "%s-call%s" % os.path.splitext(out["cns"])
    gender = dd.get_gender(data)
    if not utils.file_exists(call_file):
        with file_transaction(data, call_file) as tx_call_file:
            call_cmd = [cnvkit_exe, "call",
                        "--ploidy", str(dd.get_ploidy(data)),
                        "-o", tx_call_file, out["cns"]]
            if gender:
                call_cmd.extend(["--gender", gender])
                if gender.lower() == "male":
                    call_cmd.append("--male-reference")
            do.run(call_cmd, "CNVkit call ploidy")
    call_base = os.path.splitext(call_file)[0]
    calls = {}
    for outformat in ("bed", "vcf"):
        export_file = "%s.%s" % (call_base, outformat)
        calls[outformat] = export_file
        if utils.file_exists(export_file):
            continue
        with file_transaction(data, export_file) as tx_export_file:
            export_cmd = [cnvkit_exe, "export", outformat,
                          "--sample-id", dd.get_sample_name(data),
                          "--ploidy", str(dd.get_ploidy(data)),
                          "-o", tx_export_file, call_file]
            if gender and gender.lower() == "male":
                export_cmd.append("--male-reference")
            do.run(export_cmd, "CNVkit export %s" % outformat)
    out["call_file"] = call_file
    out["vrn_bed"] = annotate.add_genes(calls["bed"], data)
    effects_vcf, _ = effects.add_to_vcf(calls["vcf"], data, "snpeff")
    out["vrn_file"] = effects_vcf or calls["vcf"]
    return out
def _run_cnvkit_shared(data, test_bams, background_bams, access_file, work_dir,
                       background_name=None):
    """Shared functionality to run CNVkit.

    Runs ``cnvkit batch`` in a transactional directory that is moved to
    ``<work_dir>/raw`` on success. The run is skipped when the ``.cnr`` file
    for the first test BAM already exists.

    Returns a dict with ``cnr``, ``cns`` and ``back_cnn`` paths, or ``{}``
    when no target regions are available.

    Raises:
        IOError: when the expected ``.cnr`` or ``.cns`` file is missing.
    """
    ref_file = dd.get_ref_file(data)
    raw_work_dir = os.path.join(work_dir, "raw")
    out_base = os.path.splitext(os.path.basename(test_bams[0]))[0].split(".")[0]
    background_cnn = "%s_background.cnn" % (background_name or "flat")
    files = {"cnr": os.path.join(raw_work_dir, "%s.cnr" % out_base),
             "cns": os.path.join(raw_work_dir, "%s.cns" % out_base),
             "back_cnn": os.path.join(raw_work_dir, background_cnn)}
    if not utils.file_exists(files["cnr"]):
        # Clear leftovers from an earlier incomplete run
        if os.path.exists(raw_work_dir):
            shutil.rmtree(raw_work_dir)
        with tx_tmpdir(data, work_dir) as tx_work_dir:
            # pick targets, anti-targets and access files based on analysis type
            # http://cnvkit.readthedocs.org/en/latest/nonhybrid.html
            cov_interval = dd.get_coverage_interval(data)
            base_regions = dd.get_variant_regions(data)
            # For genome calls, subset to regions within 10kb of genes
            if cov_interval == "genome":
                base_regions = annotate.subset_by_genes(base_regions, data, work_dir, pad=1e4)
            raw_target_bed = bedutils.merge_overlaps(base_regions, data, out_dir=work_dir)
            target_bed = annotate.add_genes(raw_target_bed, data)
            # bail out if we ended up with no regions
            if not utils.file_exists(target_bed):
                return {}
            if cov_interval == "amplicon":
                access_opt = target_bed
            elif cov_interval == "genome":
                access_opt = dd.get_variant_regions(data)
            else:
                access_opt = access_file
            target_opts = ["--targets", target_bed, "--access", access_opt]
            n_bams = len(test_bams) + len(background_bams)
            cores = min(tz.get_in(["config", "algorithm", "num_cores"], data, 1), n_bams)
            cmd = ([_get_cmd(), "batch"] + test_bams + ["-n"] + background_bams
                   + ["-f", ref_file] + target_opts
                   + ["-d", tx_work_dir, "--split", "-p", str(cores),
                      "--output-reference", os.path.join(tx_work_dir, background_cnn)])
            at_avg, at_min, t_avg = _get_antitarget_size(access_file, target_bed)
            if at_avg:
                cmd.extend(["--antitarget-avg-size", str(at_avg),
                            "--antitarget-min-size", str(at_min),
                            "--target-avg-size", str(t_avg)])
            tooldir = install.get_defaults().get("tooldir", "/usr/local")
            local_sitelib = os.path.join(tooldir, "lib", "R", "site-library")
            cmd.extend(["--rlibpath", local_sitelib])
            do.run(cmd, "CNVkit batch")
            shutil.move(tx_work_dir, raw_work_dir)
    for ftype in ("cnr", "cns"):
        if not os.path.exists(files[ftype]):
            raise IOError("Missing CNVkit %s file: %s" % (ftype, files[ftype]))
    return files
def _run_cnvkit_shared(items, test_bams, background_bams, work_dir, background_name=None):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.

    Computes coverage per BAM over split target/antitarget BEDs, builds the
    background reference, then runs the fix and segment steps through
    ``run_multicore``. All work is skipped when the ``.cns`` file of the
    first test BAM already exists.

    Returns a list with one dict per test BAM holding ``cnr``, ``cns`` and
    ``back_cnn`` paths, or ``{}`` when no target regions are available.
    """
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_cnn = os.path.join(raw_work_dir,
                                  "%s_background.cnn" % (background_name if background_name else "flat"))
    ckouts = []
    for test_bam in test_bams:
        out_base = _bam_to_outbase(test_bam, raw_work_dir)
        # "cnr" previously pointed at the .cns file, handing callers the
        # segments in place of the ratios.
        ckouts.append({"cnr": "%s.cnr" % out_base,
                       "cns": "%s.cns" % out_base,
                       "back_cnn": background_cnn})
    # Guard on the .cns path: this is the file the check tested before the
    # "cnr" entry was corrected.
    if not utils.file_exists(ckouts[0]["cns"]):
        data = items[0]
        cov_interval = dd.get_coverage_interval(data)
        raw_target_bed, access_bed = _get_target_access_files(cov_interval, data, work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, data)
        parallel = {"type": "local", "cores": dd.get_cores(data), "progs": ["cnvkit"]}
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed, access_bed, cov_interval,
                                                     raw_work_dir, data)

        def _bam_to_itype(bam):
            return "background" if bam in background_bams else "evaluate"

        split_cnns = run_multicore(
            _cnvkit_coverage,
            [(bam, bed, _bam_to_itype(bam), raw_work_dir, data)
             for bam in test_bams + background_bams
             for bed in _split_bed(target_bed, data) + _split_bed(antitarget_bed, data)],
            data["config"], parallel)
        coverage_cnns = _merge_coverage(split_cnns, data)
        background_cnn = _cnvkit_background(
            [x["file"] for x in coverage_cnns if x["itype"] == "background"],
            background_cnn, target_bed, antitarget_bed, data)
        evaluate_cnns = [x for x in coverage_cnns if x["itype"] == "evaluate"]
        fixed_cnrs = run_multicore(
            _cnvkit_fix,
            [(cnns, background_cnn, data) for cnns in tz.groupby("bam", evaluate_cnns).values()],
            data["config"], parallel)
        # Segmentation is run for its output files; the return value is unused
        run_multicore(_cnvkit_segment,
                      [(cnr, cov_interval, data) for cnr in fixed_cnrs],
                      data["config"], parallel)
    return ckouts
def _calculate_sv_coverage_cnvkit(data, work_dir):
    """Calculate coverage in a CNVkit ready format using mosdepth.

    Target bins are annotated with genes before the log2 depth is added;
    antitarget bins are passed through unannotated. Work is skipped when
    both outputs already exist or the sample has no BAM.

    Returns a ``(target_cnn, antitarget_cnn)`` tuple of paths.
    """
    from bcbio.variation import coverage
    from bcbio.structural import annotate
    target_cnn = os.path.join(work_dir, "%s-target-coverage.cnn" % dd.get_sample_name(data))
    anti_cnn = os.path.join(work_dir, "%s-antitarget-coverage.cnn" % dd.get_sample_name(data))
    have_both = utils.file_exists(target_cnn) and utils.file_exists(anti_cnn)
    if not have_both and (dd.get_align_bam(data) or dd.get_work_bam(data)):
        target_depth = coverage.run_mosdepth(
            data, "target", tz.get_in(["regions", "bins", "target"], data))
        anti_depth = coverage.run_mosdepth(
            data, "antitarget", tz.get_in(["regions", "bins", "antitarget"], data))
        target_annotated = annotate.add_genes(target_depth.regions, data, max_distance=0)
        target_cnn = _add_log2_depth(target_annotated, target_cnn, data)
        anti_cnn = _add_log2_depth(anti_depth.regions, anti_cnn, data)
    return target_cnn, anti_cnn
def _run_cnvkit_shared(inputs, backgrounds):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.

    Computes coverage over split target/antitarget BEDs for every input and
    background sample, collects metrics, builds the background reference and
    runs the fix and segment steps through ``run_multicore``. All work is
    skipped when the ``.cnr`` file of the first input already exists.

    Returns a list with one dict per input holding ``cnr``, ``cns`` and
    ``back_cnn`` paths, or ``{}`` when no target regions are available.
    """
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_name = dd.get_sample_name(backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir, "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(os.path.join(_sv_workdir(cur_input), "raw"))
        out_base = _bam_to_outbase(dd.get_align_bam(cur_input), cur_raw_work_dir)
        ckouts.append({"cnr": "%s.cnr" % out_base,
                       "cns": "%s.cns" % out_base,
                       "back_cnn": background_cnn})
    if not utils.file_exists(ckouts[0]["cnr"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        raw_target_bed, access_bed = _get_target_access_files(cov_interval, inputs[0], work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, inputs[0])
        parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
        pct_coverage = (pybedtools.BedTool(raw_target_bed).total_coverage() /
                        float(pybedtools.BedTool(access_bed).total_coverage())) * 100.0
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed, access_bed, cov_interval,
                                                     pct_coverage, raw_work_dir, inputs[0])
        split_beds = _split_bed(target_bed, inputs[0]) + _split_bed(antitarget_bed, inputs[0])
        # zip() returns an iterator on Python 3, so materialize before concatenating
        samples_to_run = (list(zip(["background"] * len(backgrounds), backgrounds)) +
                          list(zip(["evaluate"] * len(inputs), inputs)))
        split_cnns = run_multicore(_cnvkit_coverage,
                                   [(cdata, bed, itype)
                                    for itype, cdata in samples_to_run
                                    for bed in split_beds],
                                   inputs[0]["config"], parallel)
        raw_coverage_cnns = _merge_coverage(split_cnns, inputs[0])
        coverage_cnns = run_multicore(_cnvkit_metrics,
                                      [(cnns, target_bed, antitarget_bed, cov_interval,
                                        inputs + backgrounds)
                                       for cnns in tz.groupby("bam", raw_coverage_cnns).values()],
                                      inputs[0]["config"], parallel)
        background_cnn = _cnvkit_background(_select_background_cnns(coverage_cnns),
                                            background_cnn, target_bed, antitarget_bed, inputs[0])
        evaluate_cnns = [x for x in coverage_cnns if x["itype"] == "evaluate"]
        fixed_cnrs = run_multicore(_cnvkit_fix,
                                   [(cnns, background_cnn, inputs + backgrounds)
                                    for cnns in tz.groupby("bam", evaluate_cnns).values()],
                                   inputs[0]["config"], parallel)
        run_multicore(_cnvkit_segment,
                      [(cnr, cov_interval, data) for cnr, data in fixed_cnrs],
                      inputs[0]["config"], parallel)
    return ckouts
def _run_cnvkit_shared(data, test_bams, background_bams, work_dir, background_name=None):
    """Shared functionality to run CNVkit.

    Runs a single-core ``cnvkit batch`` in a transactional directory that is
    moved to ``<work_dir>/raw`` on success. The run is skipped when the
    ``.cnr`` file for the first test BAM already exists.

    Returns a dict with ``cnr``, ``cns`` and ``back_cnn`` paths, or ``{}``
    when no target regions are available.

    Raises:
        IOError: when the expected ``.cnr`` or ``.cns`` file is missing.
    """
    ref_file = dd.get_ref_file(data)
    raw_work_dir = os.path.join(work_dir, "raw")
    out_base = os.path.splitext(os.path.basename(test_bams[0]))[0].split(".")[0]
    background_cnn = "%s_background.cnn" % (background_name or "flat")
    files = {"cnr": os.path.join(raw_work_dir, "%s.cnr" % out_base),
             "cns": os.path.join(raw_work_dir, "%s.cns" % out_base),
             "back_cnn": os.path.join(raw_work_dir, background_cnn)}
    if not utils.file_exists(files["cnr"]):
        # Clear leftovers from an earlier incomplete run
        if os.path.exists(raw_work_dir):
            shutil.rmtree(raw_work_dir)
        with tx_tmpdir(data, work_dir) as tx_work_dir:
            cov_interval = dd.get_coverage_interval(data)
            raw_target_bed, access_bed = _get_target_access_files(cov_interval, data, work_dir)
            # bail out if we ended up with no regions
            if not utils.file_exists(raw_target_bed):
                return {}
            target_bed = annotate.add_genes(raw_target_bed, data)
            # Do not parallelize cnvkit due to current issues with multi-processing
            cores = 1
            cmd = ([_get_cmd(), "batch"] + test_bams + ["-n"] + background_bams
                   + ["-f", ref_file]
                   + ["--targets", target_bed, "--access", access_bed]
                   + ["-d", tx_work_dir, "--split", "-p", str(cores),
                      "--output-reference", os.path.join(tx_work_dir, background_cnn)])
            # Bin sizes are only tuned outside amplicon and genome runs
            if cov_interval not in ("amplicon", "genome"):
                at_avg, at_min, t_avg = _get_antitarget_size(access_bed, target_bed)
                if at_avg:
                    cmd.extend(["--antitarget-avg-size", str(at_avg),
                                "--antitarget-min-size", str(at_min),
                                "--target-avg-size", str(t_avg)])
            tooldir = install.get_defaults().get("tooldir", "/usr/local")
            local_sitelib = os.path.join(tooldir, "lib", "R", "site-library")
            cmd.extend(["--rlibpath", local_sitelib])
            do.run(cmd, "CNVkit batch")
            shutil.move(tx_work_dir, raw_work_dir)
    for ftype in ("cnr", "cns"):
        if not os.path.exists(files[ftype]):
            raise IOError("Missing CNVkit %s file: %s" % (ftype, files[ftype]))
    return files
def _add_bed_to_output(out, data):
    """Add FreeBayes cnvmap BED-like representation to the output.

    Runs ``cnvkit.py export freebayes`` on ``out["cns"]`` unless the BED file
    already exists, then stores the gene-annotated result as
    ``out["vrn_file"]`` and returns ``out``.
    """
    bed_file = "%s.bed" % os.path.splitext(out["cns"])[0]
    if not utils.file_exists(bed_file):
        with file_transaction(data, bed_file) as tx_bed_file:
            cnvkit_exe = os.path.join(os.path.dirname(sys.executable), "cnvkit.py")
            export_cmd = [cnvkit_exe, "export", "freebayes",
                          "--sample-id", dd.get_sample_name(data),
                          "--ploidy", str(dd.get_ploidy(data)),
                          "-o", tx_bed_file, out["cns"]]
            gender = dd.get_gender(data)
            if gender:
                export_cmd.extend(["--gender", gender])
                if gender.lower() == "male":
                    export_cmd.append("--male-reference")
            do.run(export_cmd, "CNVkit export FreeBayes BED cnvmap")
    out["vrn_file"] = annotate.add_genes(bed_file, data)
    return out
def _calculate_sv_coverage_gatk(data, work_dir):
    """Calculate coverage in defined regions using GATK tools.

    TODO: This does double calculations to get GATK4 compatible HDF read
    counts and then depth and gene annotations. Both are needed for creating
    heterogeneity inputs. Ideally replace with a single mosdepth coverage
    calculation, and create GATK4 TSV format:

    CONTIG  START   END     COUNT
    chrM    1       1000    13268

    Returns a tuple of the GATK read-count file and the gene-annotated
    mosdepth regions file.
    """
    from bcbio.variation import coverage
    from bcbio.structural import annotate
    # GATK compatible
    gatk_counts = gatkcnv.collect_read_counts(data, work_dir)
    # heterogeneity compatible
    target_bins = tz.get_in(["regions", "bins", "target"], data)
    cleaned_bins = bedutils.clean_file(target_bins, data, bedprep_dir=work_dir)
    depth = coverage.run_mosdepth(data, "target-gatk", cleaned_bins)
    depth_with_genes = annotate.add_genes(depth.regions, data, max_distance=0)
    return gatk_counts, depth_with_genes
def _add_variantcalls_to_output(out, data, is_somatic=False):
    """Call ploidy and convert into VCF and BED representations.

    Runs ``cnvkit.py call`` with the ``cn`` filter, passing the first
    compatible small variant file when the segments have values, then
    exports the calls as BED and VCF. Adds ``call_file``, ``vrn_bed`` and
    ``vrn_file`` (the snpEff-annotated VCF, or the plain VCF when no effects
    file comes back) to ``out`` and returns it.
    """
    cnvkit_exe = os.path.join(os.path.dirname(sys.executable), "cnvkit.py")
    call_file = "%s-call%s" % os.path.splitext(out["cns"])
    gender = population.get_gender(data)
    if not utils.file_exists(call_file):
        with file_transaction(data, call_file) as tx_call_file:
            call_cmd = [cnvkit_exe, "call", "--filter", "cn",
                        "--ploidy", str(ploidy.get_ploidy([data])),
                        "-o", tx_call_file, out["cns"]]
            small_vrn_files = _compatible_small_variants(data)
            if len(small_vrn_files) > 0 and _cna_has_values(out["cns"]):
                call_cmd.extend(["-v", small_vrn_files[0]])
                if not is_somatic:
                    call_cmd.extend(["-m", "clonal"])
            if gender and gender.lower() != "unknown":
                call_cmd.extend(["--gender", gender])
                if gender.lower() == "male":
                    call_cmd.append("--male-reference")
            do.run(call_cmd, "CNVkit call ploidy")
    call_base = os.path.splitext(call_file)[0]
    calls = {}
    for outformat in ("bed", "vcf"):
        export_file = "%s.%s" % (call_base, outformat)
        calls[outformat] = export_file
        if os.path.exists(export_file):
            continue
        with file_transaction(data, export_file) as tx_export_file:
            export_cmd = [cnvkit_exe, "export", outformat,
                          "--sample-id", dd.get_sample_name(data),
                          "--ploidy", str(ploidy.get_ploidy([data])),
                          "-o", tx_export_file, call_file]
            if gender and gender.lower() == "male":
                export_cmd.append("--male-reference")
            do.run(export_cmd, "CNVkit export %s" % outformat)
    out["call_file"] = call_file
    out["vrn_bed"] = annotate.add_genes(calls["bed"], data)
    effects_vcf, _ = effects.add_to_vcf(calls["vcf"], data, "snpeff")
    out["vrn_file"] = effects_vcf or calls["vcf"]
    return out
def _add_variantcalls_to_output(out, data, items, is_somatic=False):
    """Call ploidy and convert into VCF and BED representations.

    Runs ``cnvkit.py call`` with the ``cn`` filter, adding the first
    compatible small variant VCF (with sample and optional normal ids) when
    the segments have values, plus the batch sex when known. The calls are
    exported as BED and VCF. Adds ``call_file``, ``vrn_bed`` and a
    depth-annotated ``vrn_file`` to ``out`` and returns it.
    """
    cnvkit_exe = os.path.join(os.path.dirname(sys.executable), "cnvkit.py")
    call_file = "%s-call%s" % os.path.splitext(out["cns"])
    if not utils.file_exists(call_file):
        with file_transaction(data, call_file) as tx_call_file:
            call_cmd = [cnvkit_exe, "call", "--filter", "cn",
                        "--ploidy", str(ploidy.get_ploidy([data])),
                        "-o", tx_call_file, out["cns"]]
            small_vrn_files = _compatible_small_variants(data, items)
            if len(small_vrn_files) > 0 and _cna_has_values(out["cns"]):
                call_cmd.extend(["--vcf", small_vrn_files[0].name,
                                 "--sample-id", small_vrn_files[0].sample])
                if small_vrn_files[0].normal:
                    call_cmd.extend(["--normal-id", small_vrn_files[0].normal])
                if not is_somatic:
                    call_cmd.extend(["-m", "clonal"])
            gender = _get_batch_gender(items)
            if gender:
                call_cmd.extend(["--sample-sex", gender])
            do.run(call_cmd, "CNVkit call ploidy")
    call_base = os.path.splitext(call_file)[0]
    calls = {}
    for outformat in ("bed", "vcf"):
        export_file = "%s.%s" % (call_base, outformat)
        calls[outformat] = export_file
        if os.path.exists(export_file):
            continue
        with file_transaction(data, export_file) as tx_export_file:
            export_cmd = [cnvkit_exe, "export", outformat,
                          "--sample-id", dd.get_sample_name(data),
                          "--ploidy", str(ploidy.get_ploidy([data])),
                          "-o", tx_export_file, call_file]
            do.run(export_cmd, "CNVkit export %s" % outformat)
    out["call_file"] = call_file
    out["vrn_bed"] = annotate.add_genes(calls["bed"], data)
    effects_vcf, _ = effects.add_to_vcf(calls["vcf"], data, "snpeff")
    out["vrn_file"] = effects_vcf if effects_vcf else calls["vcf"]
    out["vrn_file"] = shared.annotate_with_depth(out["vrn_file"], items)
    return out
def prep_seq2c_bed(data):
    """Select, clean and gene-annotate the BED file used as Seq2C input.

    A configured Seq2C background reference takes priority, then the SV
    regions BED; when neither yields a file the variant regions are cleaned
    and used. Inputs with fewer than four columns are annotated with gene
    names (one gene per line); a ``ValueError`` is raised when annotation
    leaves the file unchanged.

    Returns the path of the ``-seq2cclean.bed`` file, or ``None`` when no
    BED file is available.
    """
    background = dd.get_background_cnv_reference(data, "seq2c")
    if background:
        bed_file = _background_to_bed(dd.get_background_cnv_reference(data, "seq2c"), data)
    else:
        bed_file = regions.get_sv_bed(data)
    if bed_file:
        bed_file = bedutils.clean_file(bed_file, data, prefix="svregions-")
    else:
        bed_file = bedutils.clean_file(dd.get_variant_regions(data), data)
    if not bed_file:
        return None
    col_num = bt.BedTool(bed_file).field_count()
    if col_num >= 4:
        annotated_file = bed_file
    else:
        annotated_file = annotate.add_genes(bed_file, data, max_distance=0)
        if annotated_file == bed_file:
            raise ValueError(
                "BED file for Seq2C must be annotated with gene names, "
                "however the input BED is 3-columns and we have no transcript "
                "data to annotate with " + bed_file)
        annotated_file = annotate.gene_one_per_line(annotated_file, data)
    ready_file = "%s-seq2cclean.bed" % (utils.splitext_plus(annotated_file)[0])
    if not utils.file_uptodate(ready_file, annotated_file):
        named_bed = bt.BedTool(annotated_file)
        # Trim extra columns unless the file has exactly eight of them
        if col_num > 4 and col_num != 8:
            named_bed = named_bed.cut(range(4))
        named_bed = named_bed.filter(lambda feat: feat.name not in ("", ".", "-"))
        with file_transaction(data, ready_file) as tx_out_file:
            named_bed.saveas(tx_out_file)
        logger.debug("Saved Seq2C clean annotated ready input BED into " + ready_file)
    return ready_file
def _add_variantcalls_to_output(out, data, is_somatic=False):
    """Call ploidy and convert into VCF and BED representations.

    Runs ``cnvkit.py call`` on ``out["cns"]``, passing the first compatible
    small variant file when the segments have values, then exports the calls
    as BED and VCF. Adds ``call_file``, ``vrn_bed`` and ``vrn_file`` (the
    snpEff-annotated VCF, or the plain VCF when no effects file comes back)
    to ``out`` and returns it.
    """

    def _cnvkit(*args):
        # Start a command line for the cnvkit.py next to the running Python
        return [os.path.join(os.path.dirname(sys.executable), "cnvkit.py")] + list(args)

    call_file = "%s-call%s" % os.path.splitext(out["cns"])
    gender = population.get_gender(data)
    if not utils.file_exists(call_file):
        with file_transaction(data, call_file) as tx_call_file:
            call_cmd = _cnvkit("call", "--ploidy", str(ploidy.get_ploidy([data])),
                               "-o", tx_call_file, out["cns"])
            small_vrn_files = _compatible_small_variants(data)
            if len(small_vrn_files) > 0 and _cna_has_values(out["cns"]):
                call_cmd += ["-v", small_vrn_files[0]]
                if not is_somatic:
                    call_cmd += ["-m", "clonal"]
            if gender and gender.lower() != "unknown":
                call_cmd += ["--gender", gender]
                if gender.lower() == "male":
                    call_cmd += ["--male-reference"]
            do.run(call_cmd, "CNVkit call ploidy")
    calls = {}
    for outformat in ["bed", "vcf"]:
        calls[outformat] = "%s.%s" % (os.path.splitext(call_file)[0], outformat)
        if not os.path.exists(calls[outformat]):
            with file_transaction(data, calls[outformat]) as tx_export_file:
                export_cmd = _cnvkit("export", outformat,
                                     "--sample-id", dd.get_sample_name(data),
                                     "--ploidy", str(ploidy.get_ploidy([data])),
                                     "-o", tx_export_file, call_file)
                if gender and gender.lower() == "male":
                    export_cmd += ["--male-reference"]
                do.run(export_cmd, "CNVkit export %s" % outformat)
    out["call_file"] = call_file
    out["vrn_bed"] = annotate.add_genes(calls["bed"], data)
    effects_vcf, _ = effects.add_to_vcf(calls["vcf"], data, "snpeff")
    out["vrn_file"] = effects_vcf or calls["vcf"]
    return out
def _prep_bed(data, bed_file, work_dir):
    """Clean and gene-annotate a BED file for use as Seq2C input.

    First writes ``<name>-clean.bed`` in ``work_dir`` with header-like and
    invalid records removed. Inputs with fewer than four columns are then
    annotated with gene names; a ``ValueError`` is raised when annotation
    leaves the file unchanged. The final file drops unnamed regions and
    writes one line per comma-separated gene name.

    Returns the path of the final BED file.
    """
    header_prefixes = ('#', ' ', 'track', 'browser')
    input_base = utils.splitext_plus(os.path.basename(bed_file))[0]
    clean_file = os.path.join(work_dir, "%s-clean.bed" % (input_base))
    bed = bt.BedTool(bed_file)
    col_num = bed.field_count()
    if not utils.file_uptodate(clean_file, bed_file):
        bed = bed.filter(lambda rec: rec.chrom and not rec.chrom.startswith(header_prefixes))
        bed = bed.remove_invalid()
        with file_transaction(data, clean_file) as tx_out_file:
            bed.saveas(tx_out_file)
        logger.debug("Saved Seq2C clean BED file into " + clean_file)
    if col_num >= 4:
        annotated_file = clean_file
    else:
        annotated_file = annotate.add_genes(clean_file, data, max_distance=0, work_dir=work_dir)
        if annotated_file == clean_file:
            raise ValueError("BED file for Seq2C must be annotated with gene names, "
                             "however the input BED is 3-columns and we have no transcript "
                             "data to annotate with " + bed_file)
    annotated_base = utils.splitext_plus(os.path.basename(annotated_file))[0]
    ready_file = os.path.join(work_dir, "%s-clean.bed" % (annotated_base))
    if not utils.file_uptodate(ready_file, annotated_file):
        bed = bt.BedTool(annotated_file)
        # Trim extra columns unless the file has exactly eight of them
        if col_num > 4 and col_num != 8:
            bed = bed.cut(range(4))
        bed = bed.filter(lambda rec: rec.name not in ("", ".", "-"))
        # Report all duplicated annotations one-per-line
        with file_transaction(data, ready_file) as tx_out_file:
            with open(tx_out_file, 'w') as out:
                for rec in bed:
                    for gene in rec.name.split(','):
                        out.write("%s\t%s\t%s\t%s\n" % (rec.chrom, rec.start, rec.end, gene))
        logger.debug("Saved Seq2C clean annotated ready input BED into " + ready_file)
    return ready_file
def _calculate_sv_coverage_cnvkit(data, work_dir):
    """Calculate coverage in a CNVkit ready format using mosdepth.

    Target bins get gene annotations before the log2 depth is added, while
    antitarget bins are used as produced by mosdepth. Nothing is recomputed
    when both outputs exist or when the sample has no BAM file.

    Returns the target and antitarget ``.cnn`` paths as a tuple.
    """
    from bcbio.variation import coverage
    from bcbio.structural import annotate

    def _bins(kind):
        # BED of bins for this kind, taken from the sample's "regions" section
        return tz.get_in(["regions", "bins", kind], data)

    out_target_file = os.path.join(
        work_dir, "%s-target-coverage.cnn" % dd.get_sample_name(data))
    out_anti_file = os.path.join(
        work_dir, "%s-antitarget-coverage.cnn" % dd.get_sample_name(data))
    missing_output = (not utils.file_exists(out_target_file)
                      or not utils.file_exists(out_anti_file))
    if missing_output and (dd.get_align_bam(data) or dd.get_work_bam(data)):
        target_cov = coverage.run_mosdepth(data, "target", _bins("target"))
        anti_cov = coverage.run_mosdepth(data, "antitarget", _bins("antitarget"))
        genes_bed = annotate.add_genes(target_cov.regions, data, max_distance=0)
        out_target_file = _add_log2_depth(genes_bed, out_target_file, data)
        out_anti_file = _add_log2_depth(anti_cov.regions, out_anti_file, data)
    return out_target_file, out_anti_file
def _add_variantcalls_to_output(out, data):
    """Call ploidy and convert into VCF and BED representations.

    Runs ``cnvkit.py call`` on ``out["cns"]`` and exports the calls as BED
    and VCF, skipping any step whose output already exists. Adds
    ``call_file``, ``vcf_file`` (the exported VCF) and ``vrn_file`` (the
    gene-annotated BED) to ``out`` and returns it.
    """
    cnvkit_exe = os.path.join(os.path.dirname(sys.executable), "cnvkit.py")
    call_file = "%s-call%s" % os.path.splitext(out["cns"])
    gender = dd.get_gender(data)
    if not utils.file_exists(call_file):
        with file_transaction(data, call_file) as tx_call_file:
            call_cmd = [cnvkit_exe, "call",
                        "--ploidy", str(dd.get_ploidy(data)),
                        "-o", tx_call_file, out["cns"]]
            if gender:
                call_cmd.extend(["--gender", gender])
                if gender.lower() == "male":
                    call_cmd.append("--male-reference")
            do.run(call_cmd, "CNVkit call ploidy")
    call_base = os.path.splitext(call_file)[0]
    calls = {}
    for outformat in ("bed", "vcf"):
        export_file = "%s.%s" % (call_base, outformat)
        calls[outformat] = export_file
        if utils.file_exists(export_file):
            continue
        with file_transaction(data, export_file) as tx_export_file:
            export_cmd = [cnvkit_exe, "export", outformat,
                          "--sample-id", dd.get_sample_name(data),
                          "--ploidy", str(dd.get_ploidy(data)),
                          "-o", tx_export_file, call_file]
            if gender and gender.lower() == "male":
                export_cmd.append("--male-reference")
            do.run(export_cmd, "CNVkit export %s" % outformat)
    out["call_file"] = call_file
    out["vcf_file"] = calls["vcf"]
    out["vrn_file"] = annotate.add_genes(calls["bed"], data)
    return out
def _merge_target_information(samples):
    """Write genome and target summary information to metrics/target_info.yaml.

    Each section is only reported when its setting resolves to a single
    value across all samples; values are read from the first sample. Nothing
    is done when the output file already exists.

    Returns ``samples`` unchanged.
    """
    out_file = os.path.join("metrics", "target_info.yaml")
    if utils.file_exists(out_file):
        return samples

    def _count_genes(annotated_bed):
        # Number of distinct real gene names in an annotated BED file
        return len(set(r.name for r in pybedtools.BedTool(annotated_bed)
                       if r.name and r.name != "."))

    genomes = set(dd.get_genome_build(data) for data in samples)
    coverage_beds = set(dd.get_coverage(data) for data in samples)
    variant_regions = set(dd.get_variant_regions(data) for data in samples)

    data = samples[0]
    info = {}

    # Reporting in MultiQC only if the genome is the same across samples
    if len(genomes) == 1:
        info["genome_info"] = {
            "name": dd.get_genome_build(data),
            "size": sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])]),
        }

    # Reporting in MultiQC only if the target is the same across samples
    vcr = None
    if len(variant_regions) == 1:
        vcr = dd.get_variant_regions_orig(data)
        vcr_merged = dd.get_variant_regions_merged(data)
        vcr_ann = annotate.add_genes(vcr, data)
        info["variants_regions_info"] = {
            # Store the BED path, matching coverage_bed_info["bed"]; the
            # set of regions previously stored here dumped as a YAML !!set.
            "bed": vcr,
            "size": sum(len(x) for x in pybedtools.BedTool(vcr_merged)),
            "regions": pybedtools.BedTool(vcr).count(),
            "genes": _count_genes(vcr_ann),
        }
    elif len(variant_regions) == 0:
        info["variants_regions_info"] = {"bed": None}

    # Reporting in MultiQC only if the target is the same across samples
    if len(coverage_beds) == 1:
        bed = dd.get_coverage(data)
        if vcr and vcr == bed:
            # Coverage and variant regions are the same file: reuse the summary
            info["coverage_bed_info"] = info["variants_regions_info"]
        elif bed:
            ann_bed = annotate.add_genes(bed, data)
            info["coverage_bed_info"] = {
                "bed": bed,
                "size": pybedtools.BedTool(bed).total_coverage(),
                "regions": pybedtools.BedTool(bed).count(),
                "genes": _count_genes(ann_bed),
            }
    if info:
        with open(out_file, "w") as out_handle:
            yaml.safe_dump(info, out_handle)
    return samples
def _run_cnvkit_shared(inputs, backgrounds):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.

    Computes target and antitarget coverage for every input and background
    sample, collects metrics, builds the background reference, fixes the
    evaluated samples through ``run_multicore`` and segments each result.
    All work is skipped when the ``.cns`` file of the first input already
    exists.

    Returns a list with one dict per input holding ``cnr``, ``cns`` and
    ``back_cnn`` paths, or ``{}`` when no target regions are available.
    """
    # reduce is not a builtin on Python 3; functools provides it on 2 and 3
    from functools import reduce
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_name = dd.get_sample_name(backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir, "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(os.path.join(_sv_workdir(cur_input), "raw"))
        out_base = _bam_to_outbase(dd.get_align_bam(cur_input), cur_raw_work_dir)
        ckouts.append({"cnr": "%s.cnr" % out_base,
                       "cns": "%s.cns" % out_base,
                       "back_cnn": background_cnn})
    if not utils.file_exists(ckouts[0]["cns"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        raw_target_bed, access_bed = _get_target_access_files(cov_interval, inputs[0], work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, inputs[0])
        parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
        pct_coverage = (pybedtools.BedTool(raw_target_bed).total_coverage() /
                        float(pybedtools.BedTool(access_bed).total_coverage())) * 100.0
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed, access_bed, cov_interval,
                                                     pct_coverage, raw_work_dir, inputs[0])
        # zip() returns an iterator on Python 3, so materialize before concatenating
        samples_to_run = (list(zip(["background"] * len(backgrounds), backgrounds)) +
                          list(zip(["evaluate"] * len(inputs), inputs)))
        raw_coverage_cnns = [_cnvkit_coverage(cdata, bed, itype)
                             for itype, cdata in samples_to_run
                             for bed in [target_bed, antitarget_bed]]
        coverage_cnns = reduce(operator.add,
                               [_cnvkit_metrics(cnns, target_bed, antitarget_bed, cov_interval,
                                                inputs + backgrounds)
                                for cnns in tz.groupby("bam", raw_coverage_cnns).values()])
        background_cnn = _cnvkit_background(_select_background_cnns(coverage_cnns),
                                            background_cnn, target_bed, antitarget_bed, inputs[0])
        evaluate_cnns = [x for x in coverage_cnns if x["itype"] == "evaluate"]
        fixed_cnrs = run_multicore(_cnvkit_fix,
                                   [(cnns, background_cnn, inputs + backgrounds)
                                    for cnns in tz.groupby("bam", evaluate_cnns).values()],
                                   inputs[0]["config"], parallel)
        # Segmentation is run for its output files; results are not collected
        for cnr, data in fixed_cnrs:
            _cnvkit_segment(cnr, cov_interval, data)
    return ckouts
def _run_cnvkit_shared(items, test_bams, background_bams, work_dir, background_name=None):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.

    Computes target and antitarget coverage per BAM, builds the background
    reference, then runs the fix and segment steps through
    ``run_multicore``. All work is skipped when the ``.cns`` file of the
    first test BAM already exists.

    Returns a list with one dict per test BAM holding ``cnr``, ``cns`` and
    ``back_cnn`` paths, or ``{}`` when no target regions are available.
    """
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_cnn = os.path.join(
        raw_work_dir, "%s_background.cnn" % (background_name if background_name else "flat"))
    ckouts = []
    for test_bam in test_bams:
        out_base = os.path.splitext(os.path.basename(test_bam))[0].split(".")[0]
        # "cnr" previously pointed at the .cns file, handing callers the
        # segments in place of the ratios.
        ckouts.append({"cnr": os.path.join(raw_work_dir, "%s.cnr" % out_base),
                       "cns": os.path.join(raw_work_dir, "%s.cns" % out_base),
                       "back_cnn": background_cnn})
    # Guard on the .cns path: this is the file the check tested before the
    # "cnr" entry was corrected.
    if not utils.file_exists(ckouts[0]["cns"]):
        data = items[0]
        cov_interval = dd.get_coverage_interval(data)
        raw_target_bed, access_bed = _get_target_access_files(cov_interval, data, work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, data)
        parallel = {"type": "local", "cores": dd.get_cores(data), "progs": ["cnvkit"]}
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed, access_bed, cov_interval,
                                                     raw_work_dir, data)

        def _bam_to_itype(bam):
            return "background" if bam in background_bams else "evaluate"

        coverage_cnns = run_multicore(
            _cnvkit_coverage,
            [(bam, bed, _bam_to_itype(bam), raw_work_dir, data)
             for bam in test_bams + background_bams
             for bed in [target_bed, antitarget_bed]],
            data["config"], parallel)
        background_cnn = _cnvkit_background(
            [x["file"] for x in coverage_cnns if x["itype"] == "background"],
            background_cnn, target_bed, antitarget_bed, data)
        evaluate_cnns = [x for x in coverage_cnns if x["itype"] == "evaluate"]
        fixed_cnrs = run_multicore(
            _cnvkit_fix,
            [(cnns, background_cnn, data)
             for cnns in tz.groupby(lambda x: x["bam"], evaluate_cnns).values()],
            data["config"], parallel)
        # Segmentation is run for its output files; the return value is unused
        run_multicore(_cnvkit_segment,
                      [(cnr, cov_interval, data) for cnr in fixed_cnrs],
                      data["config"], parallel)
    return ckouts