def _segment_normalized_gatk(cnr_file, work_dir, paired):
    """Segment normalized inputs with GATK4 and convert to standard formats.

    Segmentation runs through ``gatkcnv.model_segments`` inside a ``gatk-cnv``
    directory created under `work_dir`. Two tab-delimited files are prepared:

    - a ``.seg`` file: the GATK segments with a leading ``ID`` column holding
      the tumor sample name
    - a ``.cnr`` file: log2 values from `cnr_file` joined on chrom/start/end
      with the tumor antitarget depth bins, plus a ``weight`` column of 1.0

    Returns a tuple of (standardized cnr file, standardized seg file).
    """
    work_dir = utils.safe_makedir(os.path.join(work_dir, "gatk-cnv"))
    seg_file = gatkcnv.model_segments(cnr_file, work_dir, paired)["seg"]
    std_seg_file = seg_file.replace(".cr.seg", ".seg")
    if not utils.file_uptodate(std_seg_file, seg_file):
        with file_transaction(std_seg_file) as tx_out_file:
            seg_columns = ["chrom", "loc.start", "loc.end", "num.mark", "seg.mean"]
            seg_df = pd.read_csv(seg_file, sep="\t", comment="@", header=0, names=seg_columns)
            sample_ids = [dd.get_sample_name(paired.tumor_data)] * len(seg_df)
            seg_df.insert(0, "ID", sample_ids)
            seg_df.to_csv(tx_out_file, sep="\t", header=True, index=False)
    std_cnr_file = os.path.join(work_dir, "%s.cnr" % dd.get_sample_name(paired.tumor_data))
    if not utils.file_uptodate(std_cnr_file, cnr_file):
        with file_transaction(std_cnr_file) as tx_out_file:
            depth_bins = tz.get_in(["depth", "bins", "antitarget"], paired.tumor_data)
            logdf = pd.read_csv(cnr_file, sep="\t", comment="@", header=0,
                                names=["chrom", "start", "end", "log2"])
            covdf = pd.read_csv(depth_bins, sep="\t", header=None,
                                names=["chrom", "start", "end", "orig.name", "depth", "gene"])
            merged = pd.merge(logdf, covdf, on=["chrom", "start", "end"])
            # Selecting the output columns also drops the unused "orig.name"
            merged = merged[["chrom", "start", "end", "gene", "log2", "depth"]]
            merged.insert(6, "weight", [1.0] * len(merged))
            merged.to_csv(tx_out_file, sep="\t", header=True, index=False)
    return std_cnr_file, std_seg_file
def _evaluate_multi(calls, truth_svtypes, work_dir, data):
    """Evaluate calls from multiple callers against truth sets, writing CSV summaries.

    calls -- list of call dictionaries; each provides "vrn_file" and "variantcaller"
    truth_svtypes -- dictionary mapping an SV type to its truth input
    work_dir -- directory where the output CSV files are written
    data -- sample data dictionary

    Two files are produced: a summary CSV with one row per svtype/size/caller
    and a long-format ("df") CSV with one row per metric. Both are regenerated
    when either is out of date relative to any call's "vrn_file".

    Returns a tuple of (summary CSV, long-format CSV).
    """
    base = os.path.join(work_dir, "%s-sv-validate" % (dd.get_sample_name(data)))
    out_file = base + ".csv"
    df_file = base + "-df.csv"
    if any((not utils.file_uptodate(out_file, x["vrn_file"]) or
            not utils.file_uptodate(df_file, x["vrn_file"])) for x in calls):
        # Write both outputs inside a single transaction so an interrupted run
        # never leaves a truncated file at either final location.
        with file_transaction(data, out_file, df_file) as (tx_out_file, tx_df_file):
            with open(tx_out_file, "w") as out_handle:
                with open(tx_df_file, "w") as df_out_handle:
                    writer = csv.writer(out_handle)
                    dfwriter = csv.writer(df_out_handle)
                    writer.writerow(["svtype", "size", "caller", "sensitivity", "precision"])
                    dfwriter.writerow(["svtype", "size", "caller", "metric", "value", "label"])
                    for svtype, truth in truth_svtypes.items():
                        for size in EVENT_SIZES:
                            str_size = "%s-%s" % size
                            for call in calls:
                                call_bed = convert.to_bed(call, dd.get_sample_name(data), work_dir, calls, data)
                                # Callers without a usable BED are left out of the report
                                if utils.file_exists(call_bed):
                                    evalout = _evaluate_one(call["variantcaller"], svtype, size, call_bed,
                                                            truth, data)
                                    writer.writerow([svtype, str_size, call["variantcaller"],
                                                     evalout["sensitivity"]["label"],
                                                     evalout["precision"]["label"]])
                                    for metric in ["sensitivity", "precision"]:
                                        dfwriter.writerow([svtype, str_size, call["variantcaller"], metric,
                                                           evalout[metric]["val"], evalout[metric]["label"]])
    return out_file, df_file
def _evaluate_multi(callers, truth_svtypes, ensemble, call_beds, data):
    """Evaluate individual callers and the ensemble against truth sets.

    Writes a summary CSV and a long-format ("df") CSV next to `ensemble`,
    regenerating both when either is out of date relative to it. A caller is
    evaluated for an SV type when `callers_by_event` lists it for that type,
    or when it is the "sv-ensemble" pseudo-caller, which is evaluated on the
    `ensemble` file itself.

    Returns a tuple of (summary CSV, long-format CSV).
    """
    ensemble_base = utils.splitext_plus(ensemble)[0]
    out_file = "%s-validate.csv" % ensemble_base
    df_file = "%s-validate-df.csv" % ensemble_base
    if utils.file_uptodate(out_file, ensemble) and utils.file_uptodate(df_file, ensemble):
        return out_file, df_file
    with open(out_file, "w") as out_handle, open(df_file, "w") as df_out_handle:
        writer = csv.writer(out_handle)
        dfwriter = csv.writer(df_out_handle)
        total_callers = callers_by_event(ensemble, data)
        writer.writerow(["svtype", "size", "caller", "sensitivity", "precision"])
        dfwriter.writerow(["svtype", "size", "caller", "metric", "value", "label"])
        for svtype, truth in truth_svtypes.items():
            for size in EVENT_SIZES:
                str_size = "%s-%s" % size
                for caller in callers:
                    if caller not in total_callers[svtype] and caller != "sv-ensemble":
                        continue
                    try:
                        call_bed = call_beds[caller]
                    except KeyError:
                        # Only the ensemble is allowed to lack its own BED entry
                        assert caller == "sv-ensemble", caller
                        call_bed = ensemble
                    evalout = _evaluate_one(caller, svtype, size, call_bed, truth, data)
                    sensitivity = evalout["sensitivity"]
                    precision = evalout["precision"]
                    writer.writerow([svtype, str_size, caller, sensitivity["label"], precision["label"]])
                    for metric in ["sensitivity", "precision"]:
                        result = evalout[metric]
                        dfwriter.writerow([svtype, str_size, caller, metric, result["val"], result["label"]])
    return out_file, df_file
def coverage_region_detailed_stats(data, out_dir, extra_cutoffs=None):
    """Calculate coverage at different completeness cutoffs for the coverage regions.

    data -- sample data dictionary; supplies the coverage BED and BAM file
    out_dir -- directory where coverage outputs are written
    extra_cutoffs -- optional iterable of additional depth thresholds passed to
        sambamba on top of the built-in set; None (the default) adds none

    Returns a list of absolute paths produced by `_calculate_percentiles`, or
    an empty list when no coverage BED file is configured or present.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file or not utils.file_exists(bed_file):
        return []
    work_dir = safe_makedir(out_dir)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
    cutoffs = {1, 5, 10, 20, 50, 100, 250, 500, 1000, 5000, 10000, 50000}

    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        # Relative path: resolved against work_dir while inside chdir
        parse_file = sample + "_coverage.bed"
        if not (utils.file_uptodate(parse_file, cleaned_bed) and utils.file_uptodate(parse_file, in_bam)):
            with file_transaction(data, parse_file) as out_tx:
                # extra_cutoffs defaults to None, which cannot be unioned with a set
                depth_thresholds = sorted(cutoffs | set(extra_cutoffs or []))
                cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed,
                                             depth_thresholds=depth_thresholds)
                cmdl += " | sed 's/# chrom/chrom/' > " + out_tx
                do.run(cmdl, "Run coverage regional analysis for {}".format(sample))
        out_files = _calculate_percentiles(os.path.abspath(parse_file), sample, data=data, cutoffs=cutoffs)
    return [os.path.abspath(x) for x in out_files]
def coverage_region_detailed_stats(data, out_dir):
    """Calculate coverage at different completeness cutoffs for the coverage regions.

    Runs sambamba "depth region" over the cleaned coverage BED with a fixed
    set of depth thresholds, then post-processes the result through
    `_add_high_covered_regions` and `_calculate_percentiles`.

    Returns the absolute path of the final file, or None when no coverage
    BED file is configured.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file:
        return None
    work_dir = safe_makedir(out_dir)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)

    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        is_current = (utils.file_uptodate(parse_file, cleaned_bed) and
                      utils.file_uptodate(parse_file, in_bam))
        if not is_current:
            with file_transaction(parse_file) as out_tx:
                thresholds = [1, 5, 10, 20, 40, 50, 60, 70, 80, 100]
                cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed,
                                             depth_thresholds=thresholds, max_cov=1000)
                cmdl += " | sed 's/# chrom/chrom/' > " + out_tx
                do.run(cmdl, "Run coverage regional analysis for {}".format(sample))
        with_high_cov = _add_high_covered_regions(parse_file, cleaned_bed, sample)
        parse_file = _calculate_percentiles(os.path.abspath(with_high_cov), sample)
    return os.path.abspath(parse_file)
def index(in_bam, config):
    """Index a BAM file, skipping if index present.

    Centralizes BAM indexing providing ability to switch indexing approaches.

    An index counts as present when either ``<in_bam>.bai`` or the alternative
    ``<in_bam minus extension>.bai`` is up to date relative to `in_bam`.
    Otherwise the index is built with sambamba when `_get_sambamba` finds it,
    and with samtools if not. A failed run is retried once with samtools.

    Returns ``<in_bam>.bai`` when it is up to date, else the alternative path.
    """
    assert is_bam(in_bam), "%s in not a BAM file" % in_bam
    index_file = "%s.bai" % in_bam
    alt_index_file = "%s.bai" % os.path.splitext(in_bam)[0]
    if (not utils.file_uptodate(index_file, in_bam) and
          not utils.file_uptodate(alt_index_file, in_bam)):
        sambamba = _get_sambamba(config)
        samtools = config_utils.get_program("samtools", config)
        num_cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(index_file) as tx_index_file:
            samtools_cmd = "{samtools} index {in_bam} {tx_index_file}"
            if sambamba:
                cmd = "{sambamba} index -t {num_cores} {in_bam} {tx_index_file}"
            else:
                cmd = samtools_cmd
            # sambamba has intermittent multicore failures. Allow
            # retries with single core
            try:
                do.run(cmd.format(**locals()), "Index BAM file: %s" % os.path.basename(in_bam),
                       log_error=False)
            # Catch ordinary failures only: a bare except would also trap
            # KeyboardInterrupt/SystemExit and start a second indexing run.
            except Exception:
                do.run(samtools_cmd.format(**locals()),
                       "Index BAM file (single core): %s" % os.path.basename(in_bam))
    return index_file if utils.file_uptodate(index_file, in_bam) else alt_index_file
def bgzip_and_index(in_file, config=None, remove_orig=True, prep_cmd="", tabix_args=None, out_dir=None):
    """bgzip and tabix index an input file, handling VCF and BED.

    in_file -- file to compress; an existing ``.gz`` input keeps its name
    config -- configuration dictionary, defaults to an empty one
    remove_orig -- remove `in_file` after compression; forced off with `out_dir`
    prep_cmd -- optional shell command piped between reading and bgzip
    tabix_args -- passed through to `tabix_index`
    out_dir -- optional directory for the compressed output

    Returns the path of the bgzipped file.
    """
    if config is None:
        config = {}
    if in_file.endswith(".gz"):
        out_file = in_file
    else:
        out_file = in_file + ".gz"
    if out_dir:
        remove_orig = False
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    needs_compress = (not utils.file_exists(out_file) or not os.path.lexists(out_file)
                      or (utils.file_exists(in_file) and not utils.file_uptodate(out_file, in_file)))
    if needs_compress:
        assert not in_file == out_file, "Input file is bgzipped but not found: %s" % in_file
        assert os.path.exists(in_file), "Input file %s not found" % in_file
        if not utils.file_uptodate(out_file, in_file):
            with file_transaction(config, out_file) as tx_out_file:
                bgzip = tools.get_bgzip_cmd(config)
                cat_cmd = "zcat" if in_file.endswith(".gz") else "cat"
                if prep_cmd:
                    prep_cmd = "| %s " % prep_cmd
                cmd = "{cat_cmd} {in_file} {prep_cmd} | {bgzip} -c > {tx_out_file}".format(
                    cat_cmd=cat_cmd, in_file=in_file, prep_cmd=prep_cmd, bgzip=bgzip,
                    tx_out_file=tx_out_file)
                try:
                    do.run(cmd, "bgzip %s" % os.path.basename(in_file))
                except subprocess.CalledProcessError:
                    # Race conditions: a failure is ignored when another process
                    # already removed the input or finished the output
                    if os.path.exists(in_file) and not os.path.exists(out_file):
                        raise
            if remove_orig:
                try:
                    os.remove(in_file)
                except OSError:
                    # Parallel runs may have deleted the file already
                    pass
    tabix_index(out_file, config, tabix_args=tabix_args)
    return out_file
def run(bam_file, data, out_dir):
    """Run viral QC analysis.

    For samples with a paired phenotype and an available "gdc-viral"
    reference, aligns the unmapped reads (samtools flag 4) from `bam_file`
    against the viral reference and writes per-contig counts of aligned
    reads to a text file.

    Returns a dictionary with the counts file under "base", or an empty
    dictionary when the analysis does not apply.
    """
    viral_target = "gdc-viral"
    out = {}
    if not vcfutils.get_paired_phenotype(data):
        return out
    viral_refs = [x for x in dd.get_viral_files(data)
                  if os.path.basename(x) == "%s.fa" % viral_target]
    if not viral_refs or not utils.file_exists(viral_refs[0]):
        return out
    viral_ref = viral_refs[0]
    ref_name = utils.splitext_plus(os.path.basename(viral_ref))[0]
    viral_bam = os.path.join(utils.safe_makedir(out_dir),
                             "%s-%s.bam" % (dd.get_sample_name(data), ref_name))
    out_file = "%s-counts.txt" % utils.splitext_plus(viral_bam)[0]
    if not utils.file_uptodate(out_file, bam_file):
        if not utils.file_uptodate(viral_bam, bam_file):
            with file_transaction(data, viral_bam) as tx_out_file:
                cores = dd.get_num_cores(data)
                tmpfile = "%s-tmp" % utils.splitext_plus(tx_out_file)[0]
                cmd = ("samtools view -u -f 4 {bam_file} | "
                       "bamtofastq collate=0 | "
                       "bwa mem -t {cores} {viral_ref} - | "
                       "bamsort tmpfile={tmpfile} inputthreads={cores} outputthreads={cores} "
                       "inputformat=sam index=1 indexfilename={tx_out_file}.bai O={tx_out_file}")
                do.run(cmd.format(bam_file=bam_file, cores=cores, viral_ref=viral_ref,
                                  tmpfile=tmpfile, tx_out_file=tx_out_file),
                       "Compare unmapped reads to viral genome")
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                out_handle.write("# sample\t%s\n" % dd.get_sample_name(data))
                # Only contigs with at least one aligned read are reported
                for info in bam.idxstats(viral_bam, data):
                    if info.aligned > 0:
                        out_handle.write("%s\t%s\n" % (info.contig, info.aligned))
    out["base"] = out_file
    return out
def index(in_bam, config, check_timestamp=True):
    """Index a BAM file, skipping if index present.

    Centralizes BAM indexing providing ability to switch indexing approaches.

    in_bam -- BAM file to index
    config -- configuration dictionary; "algorithm" -> "num_cores" sets threads
    check_timestamp -- when True an existing index must be up to date relative
        to `in_bam`; when False any existing index file is accepted

    Indexing runs sambamba on a symlink of the BAM inside the transaction
    directory and falls back to samtools if sambamba fails.

    Returns ``<in_bam>.bai`` if it exists, else the alternative index path.
    """
    assert is_bam(in_bam), "%s in not a BAM file" % in_bam
    index_file = "%s.bai" % in_bam
    alt_index_file = "%s.bai" % os.path.splitext(in_bam)[0]
    candidates = (index_file, alt_index_file)
    if check_timestamp:
        bai_exists = any(utils.file_uptodate(fname, in_bam) for fname in candidates)
    else:
        bai_exists = any(utils.file_exists(fname) for fname in candidates)
    if not bai_exists:
        # Remove old index files and re-run to prevent linking into tx directory
        for stale_index in candidates:
            utils.remove_safe(stale_index)
        sambamba = _get_sambamba(config)
        assert sambamba, "Did not find sambamba for indexing"
        samtools = config_utils.get_program("samtools", config)
        num_cores = config["algorithm"].get("num_cores", 1)
        bam_name = os.path.basename(in_bam)
        with file_transaction(config, index_file) as tx_index_file:
            assert tx_index_file.find(".bam.bai") > 0
            tx_bam_file = tx_index_file.replace(".bam.bai", ".bam")
            utils.symlink_plus(in_bam, tx_bam_file)
            try:
                sambamba_cmd = "{sambamba} index -t {num_cores} {tx_bam_file}".format(
                    sambamba=sambamba, num_cores=num_cores, tx_bam_file=tx_bam_file)
                do.run(sambamba_cmd, "Index BAM file with sambamba: %s" % bam_name)
            except subprocess.CalledProcessError:
                samtools_cmd = "{samtools} index {in_bam} {tx_index_file}".format(
                    samtools=samtools, in_bam=in_bam, tx_index_file=tx_index_file)
                do.run(samtools_cmd,
                       "Backup single thread index of BAM file with samtools: %s" % bam_name)
    if utils.file_exists(index_file):
        return index_file
    return alt_index_file
def index(in_bam, config):
    """Index a BAM file, skipping if index present.

    Centralizes BAM indexing providing ability to switch indexing approaches.

    When neither ``<in_bam>.bai`` nor the alternative
    ``<in_bam minus extension>.bai`` is up to date, both are removed and a new
    index is built on a symlink of the BAM inside the transaction directory,
    using sambamba when available and samtools otherwise.

    Returns ``<in_bam>.bai`` when it is up to date, else the alternative path.
    """
    assert is_bam(in_bam), "%s in not a BAM file" % in_bam
    index_file = "%s.bai" % in_bam
    alt_index_file = "%s.bai" % os.path.splitext(in_bam)[0]
    have_index = (utils.file_uptodate(index_file, in_bam) or
                  utils.file_uptodate(alt_index_file, in_bam))
    if not have_index:
        # Remove old index files and re-run to prevent linking into tx directory
        for stale_index in (index_file, alt_index_file):
            utils.remove_safe(stale_index)
        sambamba = _get_sambamba(config)
        samtools = config_utils.get_program("samtools", config)
        num_cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, index_file) as tx_index_file:
            assert tx_index_file.find(".bam.bai") > 0
            tx_bam_file = tx_index_file.replace(".bam.bai", ".bam")
            utils.symlink_plus(in_bam, tx_bam_file)
            if sambamba:
                cmd = "{sambamba} index -t {num_cores} {tx_bam_file}".format(
                    sambamba=sambamba, num_cores=num_cores, tx_bam_file=tx_bam_file)
            else:
                cmd = "{samtools} index {tx_bam_file}".format(
                    samtools=samtools, tx_bam_file=tx_bam_file)
            do.run(cmd, "Index BAM file: %s" % os.path.basename(in_bam))
    if utils.file_uptodate(index_file, in_bam):
        return index_file
    return alt_index_file
def run_filter(vrn_file, align_bam, ref_file, data, items):
    """Filter and annotate somatic VCFs with damage/bias artifacts on low frequency variants.

    Moves damage estimation to INFO field, instead of leaving in FILTER.

    `data` is returned unchanged when `should_filter(items)` is false or
    `vrn_file` has no variants. Otherwise dkfzbiasfilter.py is run and `data`
    is updated in place with the filtered "vrn_file" and the list of existing
    "damage_plots" before being returned.
    """
    if not should_filter(items) or not vcfutils.vcf_has_variants(vrn_file):
        return data
    raw_file = "%s-damage.vcf" % utils.splitext_plus(vrn_file)[0]
    plot_extensions = ["_seq_bias_simplified.pdf", "_pcr_bias_simplified.pdf"]
    out_plot_files = ["%s%s" % (utils.splitext_plus(raw_file)[0], ext) for ext in plot_extensions]
    already_filtered = (utils.file_uptodate(raw_file, vrn_file) or
                        utils.file_uptodate(raw_file + ".gz", vrn_file))
    if not already_filtered:
        with file_transaction(items[0], raw_file) as tx_out_file:
            # Does not apply --qcSummary plotting due to slow runtimes
            cmd = ["dkfzbiasfilter.py", "--filterCycles", "1", "--passOnly",
                   "--tempFolder", os.path.dirname(tx_out_file),
                   vrn_file, align_bam, ref_file, tx_out_file]
            do.run(cmd, "Filter low frequency variants for DNA damage and strand bias")
            # Move any plots out of the transaction directory before it is cleaned up
            tx_plot_dir = os.path.join("%s_qcSummary" % utils.splitext_plus(tx_out_file)[0], "plots")
            for out_plot in out_plot_files:
                tx_plot_file = os.path.join(tx_plot_dir, os.path.basename(out_plot))
                if utils.file_exists(tx_plot_file):
                    shutil.move(tx_plot_file, out_plot)
    raw_file = vcfutils.bgzip_and_index(raw_file, items[0]["config"])
    data["vrn_file"] = _filter_to_info(raw_file, items[0])
    data["damage_plots"] = [x for x in out_plot_files if utils.file_exists(x)]
    return data
def sample_callable_bed(bam_file, ref_file, data):
    """Retrieve callable regions for a sample subset by defined analysis regions.

    bam_file -- aligned BAM file; the output BED is named after it
    ref_file -- reference file (not used by this function)
    data -- sample data dictionary

    Regions named "CALLABLE" from the coverage calculation are kept,
    restricted to non-alt chromosomes when "noalt_calling" is in the tools_on
    setting or "altcontigs" is in the exclude_regions setting, and intersected
    with the variant regions BED when one is configured.

    Returns a CovInfo namedtuple of (callable, raw_callable, depth_files).
    """
    from bcbio.heterogeneity import chromhacks
    CovInfo = collections.namedtuple("CovInfo", "callable, raw_callable, depth_files")
    noalt_calling = "noalt_calling" in dd.get_tools_on(data) or "altcontigs" in dd.get_exclude_regions(data)
    def callable_chrom_filter(r):
        """Filter to callable region, potentially limiting by chromosomes.
        """
        return r.name == "CALLABLE" and (not noalt_calling or chromhacks.is_nonalt(r.chrom))
    out_file = "%s-callable_sample.bed" % os.path.splitext(bam_file)[0]
    with shared.bedtools_tmpdir(data):
        sv_bed = regions.get_sv_bed(data)
        callable_bed, depth_files = coverage.calculate(bam_file, data, sv_bed)
        input_regions_bed = dd.get_variant_regions(data)
        if not utils.file_uptodate(out_file, callable_bed):
            with file_transaction(data, out_file) as tx_out_file:
                callable_regions = pybedtools.BedTool(callable_bed)
                filter_regions = callable_regions.filter(callable_chrom_filter)
                # Once inside the transaction an output is always written. An
                # extra up-to-date check against the input regions here used to
                # leave the transaction empty, so a stale out_file was never
                # refreshed.
                if input_regions_bed:
                    input_regions = pybedtools.BedTool(input_regions_bed)
                    filter_regions.intersect(input_regions, nonamecheck=True).saveas(tx_out_file)
                else:
                    filter_regions.saveas(tx_out_file)
    return CovInfo(out_file, callable_bed, depth_files)
def regions_coverage(data, bed_file, bam_file, target_name):
    """Calculate per-region depth of `bam_file` over `bed_file` with sambamba.

    The output is written to ``<work dir>/coverage/<sample>/`` as
    ``<target_name>_regions_depth.bed`` and re-used when it is up to date
    relative to both the BAM and the BED file.

    Returns the path of the depth BED file.
    """
    sample_dir = os.path.join(dd.get_work_dir(data), "coverage", dd.get_sample_name(data))
    work_dir = utils.safe_makedir(sample_dir)
    out_file = os.path.join(work_dir, target_name + "_regions_depth.bed")
    is_current = utils.file_uptodate(out_file, bam_file) and utils.file_uptodate(out_file, bed_file)
    if not is_current:
        with file_transaction(out_file) as tx_out_file:
            cmdl = sambamba.make_command(data, "depth region", bam_file, bed_file) + " -o " + tx_out_file
            message = "Calculating regions coverage of {target_name} in {bam_file}"
            do.run(cmdl, message.format(target_name=target_name, bam_file=bam_file))
    return out_file
def _extract_germline(in_file, data):
    """Extract germline calls non-somatic, non-filtered calls.

    Every record of `in_file` is passed through `_update_germline_filters`
    and written to ``<base>-germline.vcf``, with a "Somatic" FILTER added to
    the header. Work is skipped when that file, or its ``.gz`` version, is up
    to date.

    Returns the path of the uncompressed germline VCF.
    """
    out_file = "%s-germline.vcf" % utils.splitext_plus(in_file)[0]
    if utils.file_uptodate(out_file, in_file) or utils.file_uptodate(out_file + ".gz", in_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        reader = cyvcf2.VCF(in_file)
        somatic_filter = {'ID': 'Somatic', 'Description': 'Variant called as Somatic'}
        reader.add_filter_to_header(somatic_filter)
        with contextlib.closing(cyvcf2.Writer(tx_out_file, reader)) as writer:
            for rec in reader:
                updated = _update_germline_filters(rec)
                writer.write_record(updated)
    return out_file
def _cnvkit_targets(raw_target_bed, access_bed, cov_interval, work_dir, data):
    """Create target and antitarget regions from target and access files.

    Runs ``cnvkit target --split`` on `raw_target_bed`, then
    ``cnvkit antitarget`` with `access_bed` on the resulting targets. Each
    step is skipped when its output is up to date. `cov_interval` is accepted
    but not used here.

    Returns a tuple of (target BED, antitarget BED) inside `work_dir`.
    """
    base_name = os.path.splitext(os.path.basename(raw_target_bed))[0]
    target_bed = os.path.join(work_dir, "%s.target.bed" % base_name)
    if not utils.file_uptodate(target_bed, raw_target_bed):
        with file_transaction(data, target_bed) as tx_out_file:
            target_cmd = [_get_cmd(), "target", raw_target_bed, "--split", "-o", tx_out_file]
            do.run(target_cmd, "CNVkit target")
    antitarget_bed = os.path.join(work_dir, "%s.antitarget.bed" % base_name)
    if not utils.file_uptodate(antitarget_bed, target_bed):
        with file_transaction(data, antitarget_bed) as tx_out_file:
            antitarget_cmd = [_get_cmd(), "antitarget", "-g", access_bed, target_bed, "-o", tx_out_file]
            do.run(antitarget_cmd, "CNVkit antitarget")
    return target_bed, antitarget_bed
def merge_overlaps(in_file, data, distance=None, out_dir=None):
    """Merge bed file intervals to avoid overlapping regions.

    Overlapping regions (1:1-100, 1:90-100) cause issues with callers like
    FreeBayes that don't collapse BEDs prior to using them.

    in_file -- BED file to merge; a false value returns None
    data -- sample data dictionary, must contain "config"
    distance -- optional value for the ``-d`` argument of ``bedtools merge``
    out_dir -- optional output directory; otherwise a ``bedprep`` directory
        in the work directory, or the directory of `in_file`

    Returns the merged BED file, which is also bgzipped and indexed.
    """
    config = data["config"]
    if not in_file:
        return None
    bedtools = config_utils.get_program("bedtools", config, default="bedtools")
    work_dir = tz.get_in(["dirs", "work"], data)
    if out_dir:
        bedprep_dir = out_dir
    elif work_dir:
        bedprep_dir = utils.safe_makedir(os.path.join(work_dir, "bedprep"))
    else:
        bedprep_dir = os.path.dirname(in_file)
    merged_name = "%s-merged.bed" % (utils.splitext_plus(os.path.basename(in_file))[0])
    out_file = os.path.join(bedprep_dir, merged_name)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            distance_arg = "-d %s" % distance if distance else ""
            cmd = "{bedtools} merge {distance} -i {in_file} > {tx_out_file}"
            do.run(cmd.format(bedtools=bedtools, distance=distance_arg, in_file=in_file,
                              tx_out_file=tx_out_file),
                   "Prepare merged BED file", data)
    vcfutils.bgzip_and_index(out_file, data["config"], remove_orig=False)
    return out_file
def slim_vcf(in_file, data):
    """Remove larger annotations which slow down VCF processing

    The input is bgzipped and indexed first. ANN and LOF INFO annotations
    that are declared in the header are removed with ``bcftools annotate``,
    and only records with a PASS or missing FILTER are kept.

    Returns the path of the ``-slim.vcf.gz`` output file.
    """
    to_remove = ["ANN", "LOF"]
    header_prefixes = tuple(["##INFO=<ID=%s" % x for x in to_remove])
    in_file = vcfutils.bgzip_and_index(in_file, data, remove_orig=False)
    out_file = "%s-slim.vcf.gz" % utils.splitext_plus(in_file)[0]
    if utils.file_uptodate(out_file, in_file):
        return out_file
    # Find which of the removable annotations the header actually declares
    present = []
    with utils.open_gzipsafe(in_file) as in_handle:
        for line in in_handle:
            if not line.startswith("#"):
                break
            if line.startswith(header_prefixes):
                info_id = line.split("ID=")[-1].split(",")[0]
                present.append("INFO/%s" % info_id)
    with file_transaction(data, out_file) as tx_out_file:
        if present:
            cmd = ("bcftools view -f 'PASS,.' {in_file} | "
                   "bcftools annotate -x {cur_remove} -O z -o {tx_out_file}")
        else:
            cmd = ("bcftools view -f 'PASS,.' {in_file} -O z -o {tx_out_file}")
        do.run(cmd.format(in_file=in_file, cur_remove=",".join(present), tx_out_file=tx_out_file),
               "Create slim VCF")
    return out_file
def remove_highdepth_regions(in_file, items):
    """Remove high depth regions from a BED file for analyzing a set of calls.

    Tries to avoid spurious errors and slow run times in collapsed repeat regions.

    Also adds ENCODE blacklist regions which capture additional collapsed repeats
    around centromeres.

    in_file -- BED file to subtract regions from
    items -- list of sample data dictionaries; the first one supplies the
        ENCODE blacklist and the transaction configuration

    Returns the path of the ``-glimit`` output file.
    """
    from bcbio.variation import bedutils
    # Build a real list: the result is appended to and measured below, which
    # fails on interpreters where filter() returns an iterator.
    configured_beds = set(tz.get_in(["config", "algorithm", "highdepth_regions"], x) for x in items)
    highdepth_beds = [x for x in configured_beds if x is not None]
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], items[0])
    if encode_bed and os.path.exists(encode_bed):
        highdepth_beds.append(encode_bed)
    out_file = "%s-glimit%s" % utils.splitext_plus(in_file)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            with bedtools_tmpdir(items[0]):
                all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                if highdepth_beds:
                    with open(all_file, "w") as out_handle:
                        # Explicit handles close each input promptly and avoid
                        # the module-level state kept by fileinput.input().
                        for highdepth_bed in highdepth_beds:
                            with open(highdepth_bed) as in_handle:
                                for line in in_handle:
                                    parts = line.split("\t")
                                    # Keep at most the first four BED columns
                                    out_handle.write("\t".join(parts[:4]).rstrip() + "\n")
                if utils.file_exists(all_file):
                    to_remove = bedutils.sort_merge(all_file, items[0])
                    cmd = "bedtools subtract -nonamecheck -a {in_file} -b {to_remove} > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Remove high depth regions")
                else:
                    # Nothing to subtract: expose the input under the output name
                    utils.symlink_plus(in_file, out_file)
    return out_file
def _prep_bed(data, work_dir):
    """Selecting the bed file, cleaning, and properly annotating for Seq2C

    Uses the SV regions BED when available, otherwise the variant regions.
    Inputs with fewer than 4 columns are annotated with gene names and
    expanded to one gene per line. The final file keeps the first four
    columns (unless the input has exactly 4 or 8 columns) and drops records
    whose name is "", "." or "-".

    Raises ValueError when a 3-column BED cannot be annotated.
    Returns the path of the ``-seq2cclean.bed`` file.
    """
    sv_bed = regions.get_sv_bed(data)
    if sv_bed:
        bed_file = clean_file(sv_bed, data, prefix="svregions-")
    else:
        bed_file = clean_file(dd.get_variant_regions(data), data)
    col_num = bt.BedTool(bed_file).field_count()
    annotated_file = bed_file
    if col_num < 4:
        annotated_file = annotate.add_genes(bed_file, data, max_distance=0)
        if annotated_file == bed_file:
            raise ValueError("BED file for Seq2C must be annotated with gene names, "
                             "however the input BED is 3-columns and we have no transcript "
                             "data to annotate with " + bed_file)
        annotated_file = annotate.gene_one_per_line(annotated_file, data)
    ready_file = "%s-seq2cclean.bed" % (utils.splitext_plus(annotated_file)[0])
    if utils.file_uptodate(ready_file, annotated_file):
        return ready_file
    cleaned = bt.BedTool(annotated_file)
    if col_num > 4 and col_num != 8:
        cleaned = cleaned.cut(range(4))
    cleaned = cleaned.filter(lambda x: x.name not in ["", ".", "-"])
    with file_transaction(data, ready_file) as tx_out_file:
        cleaned.saveas(tx_out_file)
    logger.debug("Saved Seq2C clean annotated ready input BED into " + ready_file)
    return ready_file
def _run_gemini_stats(bam_file, data, out_dir):
    """Retrieve high level variant statistics from Gemini.

    bam_file, out_dir -- accepted but not used by this function
    data -- sample data dictionary; the Gemini database is read from the
        "population" -> "db" entry of the first item in "variants"

    Statistics are cached in a ``-stats.yaml`` file next to the database and
    re-read from it while it is up to date.

    Returns a dictionary of statistics, empty when no database is available.
    """
    out = {}
    gemini_db = (data.get("variants", [{}])[0].get("population", {}).get("db")
                 if data.get("variants") else None)
    if gemini_db:
        gemini_stat_file = "%s-stats.yaml" % os.path.splitext(gemini_db)[0]
        if not utils.file_uptodate(gemini_stat_file, gemini_db):
            gemini = config_utils.get_program("gemini", data["config"])
            tstv = subprocess.check_output([gemini, "stats", "--tstv", gemini_db])
            gt_counts = subprocess.check_output([gemini, "stats", "--gts-by-sample", gemini_db])
            dbsnp_count = subprocess.check_output([gemini, "query", gemini_db, "-q",
                                                   "SELECT count(*) FROM variants WHERE in_dbsnp==1"])
            out["Transition/Transversion"] = tstv.split("\n")[1].split()[-1]
            for line in gt_counts.split("\n"):
                parts = line.rstrip().split()
                # The row for this sample is matched on the last element of data["name"]
                if parts and parts[0] == data["name"][-1]:
                    _, _hom_ref, het, hom_var, _, total = parts
                    out["Variations (total)"] = int(total)
                    out["Variations (heterozygous)"] = int(het)
                    out["Variations (homozygous)"] = int(hom_var)
                    break
            out["Variations (in dbSNP)"] = int(dbsnp_count.strip())
            # Default to 0 so a missing sample row never compares None with an int
            if out.get("Variations (total)", 0) > 0:
                out["Variations (in dbSNP) pct"] = "%.1f%%" % (out["Variations (in dbSNP)"] /
                                                               float(out["Variations (total)"]) * 100.0)
            with open(gemini_stat_file, "w") as out_handle:
                yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
        else:
            with open(gemini_stat_file) as in_handle:
                out = yaml.safe_load(in_handle)
    return out
def add_genome_context(orig_file, data):
    """Annotate a file with annotations of genome context using vcfanno.

    A vcfanno TOML configuration is generated inside the transaction
    directory with one annotation per genome context BED file that has at
    least 4 columns, plus a post-annotation that concatenates them into a
    "genome_context" field.

    Returns the bgzipped and indexed ``-context.vcf.gz`` file.
    """
    out_file = "%s-context.vcf.gz" % utils.splitext_plus(orig_file)[0]
    if not utils.file_uptodate(out_file, orig_file):
        with file_transaction(data, out_file) as tx_out_file:
            config_file = "%s.toml" % (utils.splitext_plus(tx_out_file)[0])
            with open(config_file, "w") as out_handle:
                all_names = []
                for fname in dd.get_genome_context_files(data):
                    if pybedtools.BedTool(fname).field_count() < 4:
                        continue
                    # Annotation name: <parent directory>_<file base name>
                    parent_dir, base = os.path.split(fname)
                    prefix = os.path.split(parent_dir)[1]
                    name = "%s_%s" % (prefix, utils.splitext_plus(base)[0])
                    out_handle.writelines(["[[annotation]]\n",
                                           'file = "%s"\n' % fname,
                                           "columns = [4]\n",
                                           'names = ["%s"]\n' % name,
                                           'ops = ["uniq"]\n'])
                    all_names.append(name)
                quoted_names = ", ".join(['"%s"' % n for n in all_names])
                out_handle.writelines(["[[postannotation]]\n",
                                       "fields = [%s]\n" % (quoted_names),
                                       'name = "genome_context"\n',
                                       'op = "concat"\n',
                                       'type = "String"\n'])
            cmd = "vcfanno {config_file} {orig_file} | bgzip -c > {tx_out_file}"
            do.run(cmd.format(config_file=config_file, orig_file=orig_file, tx_out_file=tx_out_file),
                   "Annotate with problem annotations", data)
    return vcfutils.bgzip_and_index(out_file, data["config"])
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.

    caller -- name of the caller, used in output names and the CALLER column
    vcf_file -- input VCF, either ``.vcf`` or ``.vcf.gz``
    prioritize_by -- input passed to `_find_gene_list_from_bed` and to
        ``bcbio-prioritize known -k``
    post_prior_fn -- optional function called as
        ``post_prior_fn(simple_vcf, work_dir, data)``; its return value
        replaces the simplified VCF
    work_dir -- directory for output files
    data -- sample data dictionary

    Returns a tuple of (tab delimited prioritization file, simplified VCF).
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    simple_vcf = os.path.join(work_dir, "%s-%s-simple.vcf.gz" % (sample, caller))
    # The existing simplified VCF is re-used based on existence only, not timestamps
    if not utils.file_exists(simple_vcf):
        gene_list = _find_gene_list_from_bed(prioritize_by, out_file, data)
        # If we have a standard gene list we can skip BED based prioritization
        priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
        if gene_list:
            if vcf_file.endswith(".vcf.gz"):
                utils.symlink_plus(vcf_file, priority_vcf)
            else:
                # Uncompressed input: link under a .vcf name, then bgzip to get priority_vcf
                assert vcf_file.endswith(".vcf")
                utils.symlink_plus(vcf_file, priority_vcf.replace(".vcf.gz", ".vcf"))
                vcfutils.bgzip_and_index(priority_vcf.replace(".vcf.gz", ".vcf"),
                                         data["config"], remove_orig=False)
        # otherwise prioritize based on BED and proceed
        else:
            if not utils.file_exists(priority_vcf):
                with file_transaction(data, priority_vcf) as tx_out_file:
                    resources = config_utils.get_resources("bcbio_prioritize", data["config"])
                    jvm_opts = resources.get("jvm_opts", ["-Xms1g", "-Xmx4g"])
                    jvm_opts = config_utils.adjust_opts(jvm_opts, {"algorithm": {"memory_adjust":
                                                                                 {"direction": "increase",
                                                                                  "maximum": "30000M",
                                                                                  "magnitude": dd.get_cores(data)}}})
                    jvm_opts = " ".join(jvm_opts)
                    export = utils.local_path_export()
                    # Template placeholders are filled from local variable names via locals()
                    cmd = ("{export} bcbio-prioritize {jvm_opts} known -i {vcf_file} -o {tx_out_file} "
                           " -k {prioritize_by}")
                    do.run(cmd.format(**locals()), "Prioritize: select in known regions of interest")

        # Supporting files are looked up next to the simple_sv_annotation.py script
        data_dir = os.path.dirname(os.path.realpath(utils.which("simple_sv_annotation.py")))
        with file_transaction(data, simple_vcf) as tx_out_file:
            fusion_file = os.path.join(data_dir, "fusion_pairs.txt")
            opts = ""
            if os.path.exists(fusion_file):
                opts += " --known_fusion_pairs %s" % fusion_file
            # Without a gene list from the BED, use the panel file from data_dir
            if not gene_list:
                opts += " --gene_list %s" % os.path.join(data_dir, "az-cancer-panel.txt")
            else:
                opts += " --gene_list %s" % gene_list
            cmd = "simple_sv_annotation.py {opts} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
            do.run(cmd.format(**locals()), "Prioritize: simplified annotation output")
    simple_vcf = vcfutils.bgzip_and_index(vcfutils.sort_by_ref(simple_vcf, data), data["config"])
    if post_prior_fn:
        simple_vcf = post_prior_fn(simple_vcf, work_dir, data)
    if not utils.file_uptodate(out_file, simple_vcf):
        with file_transaction(data, out_file) as tx_out_file:
            export = utils.local_path_export(env_cmd="vawk")
            # vawk template: doubled braces are literal braces after str.format,
            # and {sample}/{caller} are filled from locals(). Selects records
            # with a PASS or "." FILTER whose sample genotype is not 0/0.
            cmd = ("{export} zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                   """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                   "print CALLER,SNAME,$1,$2,I$END,"
                   """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                   "I$LOF,I$SIMPLE_ANN,"
                   "S${sample}$SR,S${sample}$PE,S${sample}$PR}}' > {tx_out_file}")
            do.run(cmd.format(**locals()), "Prioritize: convert to tab delimited")
    return out_file, simple_vcf
def _uniquify_bed_names(bed_file, out_dir, data):
    """Chanjo required unique names in the BED file to map to intervals.

    Each record keeps its name (4th column) the first time it is seen;
    repeats get a ``-<count>`` suffix. Three-column records use their
    zero-based line number as the name.

    Returns the path of the ``-unames`` BED file written to `out_dir`.
    """
    out_file = os.path.join(out_dir, "%s-unames%s" % utils.splitext_plus(os.path.basename(bed_file)))
    if utils.file_exists(out_file) and utils.file_uptodate(out_file, bed_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        with open(bed_file) as in_handle, open(tx_out_file, "w") as out_handle:
            times_seen = {}
            for line_num, line in enumerate(in_handle):
                fields = line.rstrip("\r\n").split("\t")
                has_name = len(fields) >= 4
                name = fields[3] if has_name else str(line_num)
                previous = times_seen.get(name, 0)
                times_seen[name] = previous + 1
                if previous > 0:
                    name = "%s-%s" % (name, previous)
                if has_name:
                    fields[3] = name
                else:
                    assert len(fields) == 3
                    fields.append(name)
                out_handle.write("\t".join(fields) + "\n")
    return out_file
def umi_consensus(data):
    """Convert UMI grouped reads into fastq pair for re-alignment.

    Pipes the work BAM through fgbio GroupReadsByUmi, a consensus caller
    (CallDuplexConsensusReads for the "paired" UMI method, otherwise
    CallMolecularConsensusReads) and FilterConsensusReads, then converts the
    result to gzipped paired fastq files with bamtofastq.

    Returns a tuple of (first fastq, second fastq).
    """
    align_bam = dd.get_work_bam(data)
    umi_method, umi_tag = _check_umi_type(align_bam)
    bam_base = utils.splitext_plus(align_bam)[0]
    f1_out = "%s-cumi-1.fq.gz" % bam_base
    f2_out = "%s-cumi-2.fq.gz" % bam_base
    if utils.file_uptodate(f1_out, align_bam):
        return f1_out, f2_out
    with file_transaction(data, f1_out, f2_out) as (tx_f1_out, tx_f2_out):
        jvm_opts = _get_fgbio_jvm_opts(data, os.path.dirname(tx_f1_out), 2)
        # Improve speeds by avoiding compression read/write bottlenecks
        io_opts = "--async-io=true --compression=0"
        group_opts, cons_opts, filter_opts = _get_fgbio_options(data, umi_method)
        if umi_method == "paired":
            cons_method = "CallDuplexConsensusReads"
        else:
            cons_method = "CallMolecularConsensusReads"
        collate_tmp = "%s-bamtofastq-tmp" % utils.splitext_plus(f1_out)[0]
        ref_file = dd.get_ref_file(data)
        cmd = ("unset JAVA_HOME && "
               "fgbio {jvm_opts} {io_opts} GroupReadsByUmi {group_opts} -t {umi_tag} -s {umi_method} "
               "-i {align_bam} | "
               "fgbio {jvm_opts} {io_opts} {cons_method} {cons_opts} --sort-order=:none: "
               "-i /dev/stdin -o /dev/stdout | "
               "fgbio {jvm_opts} {io_opts} FilterConsensusReads {filter_opts} -r {ref_file} "
               "-i /dev/stdin -o /dev/stdout | "
               "bamtofastq collate=1 T={tempfile} F={tx_f1_out} F2={tx_f2_out} tags=cD,cM,cE gz=1")
        do.run(cmd.format(jvm_opts=jvm_opts, io_opts=io_opts, group_opts=group_opts,
                          umi_tag=umi_tag, umi_method=umi_method, align_bam=align_bam,
                          cons_method=cons_method, cons_opts=cons_opts,
                          filter_opts=filter_opts, ref_file=ref_file, tempfile=collate_tmp,
                          tx_f1_out=tx_f1_out, tx_f2_out=tx_f2_out),
               "UMI consensus fastq generation")
    return f1_out, f2_out
def _remove_prioritization(in_file, data):
    """Remove tumor-only prioritization and return non-filtered calls.

    Every record of `in_file` is passed through
    `_update_prioritization_filters` and written to ``<base>-germline.vcf``,
    with a "Somatic" FILTER added to the header. Work is skipped when that
    file, or its ``.gz`` version, is up to date.

    Returns the path of the uncompressed output VCF.
    """
    out_file = "%s-germline.vcf" % utils.splitext_plus(in_file)[0]
    if utils.file_uptodate(out_file, in_file) or utils.file_uptodate(out_file + ".gz", in_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        reader = cyvcf2.VCF(in_file)
        somatic_filter = {'ID': 'Somatic', 'Description': 'Variant called as Somatic'}
        reader.add_filter_to_header(somatic_filter)
        # Output is written as plain text (raw header plus str() of each record)
        # rather than through cyvcf2.Writer.
        with open(tx_out_file, "w") as out_handle:
            out_handle.write(reader.raw_header)
            for rec in reader:
                updated = _update_prioritization_filters(rec)
                out_handle.write(str(updated))
    return out_file
def _filter_ensemble(in_bed, data):
    """Filter ensemble set of calls, requiring calls supported by 2 callers.

    We filter only smaller size events, which seem to benefit the most since
    they have lower precision. We also check to be sure that the required
    number of callers actually called in each event, since some callers don't
    handle all event types.

    in_bed -- ensemble BED file; the 4th column holds comma separated
        ``<event>_<caller>`` entries
    data -- sample data dictionary

    Returns the path of the ``-filter`` BED file.
    """
    support_events = set(["BND", "UKN"])
    max_size = max([xs[1] for xs in validate.EVENT_SIZES[:2]])
    out_file = "%s-filter%s" % utils.splitext_plus(in_bed)
    if not utils.file_uptodate(out_file, in_bed):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                with open(in_bed) as in_handle:
                    total_callers = validate.callers_by_event(in_bed, data)
                    for line in in_handle:
                        chrom, start, end, caller_strs = line.strip().split()[:4]
                        size = int(end) - int(start)
                        events = collections.defaultdict(set)
                        for event, caller in [x.split("_", 1) for x in caller_strs.split(",")]:
                            events[validate.cnv_to_event(event, data)].add(caller)
                        all_callers = set([])
                        # items() rather than the Python 2 only iteritems()
                        for event, callers in events.items():
                            all_callers = all_callers.union(callers)
                            if event not in support_events:
                                # Keep the line when it has multiple callers, is a
                                # larger event, or too few callers handle this event
                                if (len(all_callers) > 1 or size > max_size
                                      or len(total_callers[event]) <= N_FILTER_CALLERS):
                                    out_handle.write(line)
                                    break
    return out_file
def _run_svtyper(in_file, full_bam, exclude_file, data):
    """Genotype structural variant calls with SVtyper.

    Removes calls in high depth regions to avoid slow runtimes:
    https://github.com/hall-lab/svtyper/issues/16

    in_file -- VCF of structural variant calls to genotype.
    full_bam -- BAM passed to svtyper with ``-B``.
    exclude_file -- optional regions file; when it exists, calls in these
      regions are dropped through ``bcftools view -T ^file``.
    data -- sample dictionary used for the transaction, reference and config.

    Returns the result of ``vcfutils.sort_by_ref`` on the genotyped file.
    """
    out_file = "%s-wgts.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not vcfutils.vcf_has_variants(in_file):
                # Nothing to genotype: pass the input through. Copy into the
                # transactional path (not out_file directly) so a partial copy
                # is never left behind as an apparently finished output.
                shutil.copy(in_file, tx_out_file)
            else:
                python = sys.executable
                svtyper = os.path.join(os.path.dirname(sys.executable), "svtyper")
                if exclude_file and utils.file_exists(exclude_file):
                    regions_to_rm = "-T ^%s" % (exclude_file)
                else:
                    regions_to_rm = ""
                # add FILTER headers, which are lost during svtyping
                header_file = "%s-header.txt" % utils.splitext_plus(tx_out_file)[0]
                with open(header_file, "w") as out_handle:
                    with utils.open_gzipsafe(in_file) as in_handle:
                        for line in in_handle:
                            # Header lines come first; stop at the first record.
                            if not line.startswith("#"):
                                break
                            if line.startswith("##FILTER"):
                                out_handle.write(line)
                    for region in ref.file_contigs(dd.get_ref_file(data), data["config"]):
                        out_handle.write("##contig=<ID=%s,length=%s>\n" % (region.name, region.size))
                cmd = ("bcftools view {in_file} {regions_to_rm} | "
                       "{python} {svtyper} --max_reads 1000 -B {full_bam} | "
                       "bcftools annotate -h {header_file} | "
                       "bgzip -c > {tx_out_file}")
                do.run(cmd.format(in_file=in_file, regions_to_rm=regions_to_rm, python=python,
                                  svtyper=svtyper, full_bam=full_bam, header_file=header_file,
                                  tx_out_file=tx_out_file),
                       "SV genotyping with svtyper")
    return vcfutils.sort_by_ref(out_file, data)
def _prep_vrn_file(in_file, vcaller, seg_file, work_dir, somatic_info):
    """Select heterozygous variants in the normal sample with sufficient depth.

    Writes a CSV (``chrom,start,end,freq``) with one row per autosomal record
    for which ``_is_possible_loh`` returns a tumor frequency.

    seg_file is only referenced by the currently disabled heterogeneity block
    selection and is otherwise unused.

    Returns the path to the prepared CSV file.
    """
    data = somatic_info.tumor_data
    params = {"min_freq": 0.4,
              "max_freq": 0.6,
              "tumor_only": {"min_freq": 0.10, "max_freq": 0.90},
              "min_depth": 20,
              "hetblock": {"min_alleles": 25,
                           "allowed_misses": 2}}
    out_file = os.path.join(work_dir, "%s-%s-prep.csv" % (utils.splitext_plus(os.path.basename(in_file))[0],
                                                          vcaller))
    if not utils.file_uptodate(out_file, in_file):
        # Heterogeneity block subsetting is disabled, so the full input is used:
        # ready_bed = _identify_heterogeneity_blocks_seg(in_file, seg_file, params, work_dir, somatic_info)
        ready_bed = None
        if ready_bed and utils.file_exists(ready_bed):
            sub_file = _create_subset_file(in_file, ready_bed, work_dir, data)
        else:
            sub_file = in_file
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                writer = csv.writer(out_handle)
                writer.writerow(["chrom", "start", "end", "freq"])
                bcf_in = pysam.VariantFile(sub_file)
                try:
                    for rec in bcf_in:
                        tumor_freq = _is_possible_loh(rec, bcf_in, params, somatic_info)
                        if chromhacks.is_autosomal(rec.chrom) and tumor_freq is not None:
                            writer.writerow([_to_ucsc_style(rec.chrom), rec.start, rec.stop, tumor_freq])
                finally:
                    # Release the variant file handle, also on errors.
                    bcf_in.close()
    return out_file
def prep_vrn_file(in_file, vcaller, work_dir, somatic_info, writer_class, seg_file=None, params=None):
    """Select heterozygous variants in the normal sample with sufficient depth.

    writer_class implements write_header and write_row to write VCF outputs
    from a record and extracted tumor/normal statistics. It is instantiated
    with the open output handle.

    params falls back to the module level ``PARAMS`` when not supplied.
    seg_file is only referenced by the currently disabled heterogeneity block
    selection.

    Returns the path to the prepared output file.
    """
    data = somatic_info.tumor_data
    if not params:
        params = PARAMS
    out_file = os.path.join(work_dir, "%s-%s-prep.csv" % (utils.splitext_plus(os.path.basename(in_file))[0],
                                                          vcaller))
    if not utils.file_uptodate(out_file, in_file):
        # Heterogeneity block subsetting is disabled, so the full input is used:
        # ready_bed = _identify_heterogeneity_blocks_seg(in_file, seg_file, params, work_dir, somatic_info)
        ready_bed = None
        if ready_bed and utils.file_exists(ready_bed):
            sub_file = _create_subset_file(in_file, ready_bed, work_dir, data)
        else:
            sub_file = in_file
        max_depth = max_normal_germline_depth(sub_file, params, somatic_info)
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                writer = writer_class(out_handle)
                writer.write_header()
                bcf_in = pysam.VariantFile(sub_file)
                try:
                    for rec in bcf_in:
                        stats = _is_possible_loh(rec, bcf_in, params, somatic_info, max_normal_depth=max_depth)
                        if chromhacks.is_autosomal(rec.chrom) and stats is not None:
                            writer.write_row(rec, stats)
                finally:
                    # Release the variant file handle, also on errors.
                    bcf_in.close()
    return out_file
def _count_files_to_amber(tumor_counts, normal_counts, work_dir, data):
    """Converts tumor and normal counts from GATK CollectAllelicCounts into Amber format.

    The two count files are read in lockstep. Lines before the column header
    (the first tumor line starting with ``CONTIG``) are skipped; every later
    line pair is converted with ``_counts_to_amber`` and written when that
    returns a value.

    Returns the path to the tab-delimited ``.amber.baf`` file.
    """
    amber_dir = utils.safe_makedir(os.path.join(work_dir, "amber"))
    out_file = os.path.join(amber_dir, "%s.amber.baf" % dd.get_sample_name(data))
    if utils.file_uptodate(out_file, tumor_counts):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        with open(tumor_counts) as tumor_handle, \
                open(normal_counts) as normal_handle, \
                open(tx_out_file, "w") as out_handle:
            writer = csv.writer(out_handle, delimiter="\t")
            writer.writerow(["Chromosome", "Position", "TumorBAF", "TumorModifiedBAF", "TumorDepth",
                             "NormalBAF", "NormalModifiedBAF", "NormalDepth"])
            columns = None
            for tumor_line, normal_line in zip(tumor_handle, normal_handle):
                if columns is not None:
                    tumor_vals = dict(zip(columns, tumor_line.strip().split()))
                    normal_vals = dict(zip(columns, normal_line.strip().split()))
                    amber_line = _counts_to_amber(tumor_vals, normal_vals)
                    if amber_line:
                        writer.writerow(amber_line)
                elif tumor_line.startswith("CONTIG"):
                    # Column names come from the tumor file and are applied to both.
                    columns = tumor_line.strip().split()
    return out_file
def block_regions(callable_bed, in_bam, ref_file, data):
    """Find blocks of regions for analysis from mapped input BAM file.

    Identifies islands of callable regions, surrounded by regions with no
    read support, that can be analyzed independently.

    The minimum size of a no-read block comes from the ``nomap_split_size``
    algorithm setting (default 250). in_bam is only used in the error message.

    Returns a tuple ``(callblock_bed, nblock_bed)`` of output BED paths.
    Raises ValueError when subtracting the no-read blocks from the reference
    leaves no regions.
    """
    min_n_size = int(data["config"]["algorithm"].get("nomap_split_size", 250))
    with shared.bedtools_tmpdir(data):
        bed_base = utils.splitext_plus(callable_bed)[0]
        nblock_bed = "%s-nblocks.bed" % bed_base
        callblock_bed = "%s-callableblocks.bed" % bed_base
        if not utils.file_uptodate(nblock_bed, callable_bed):
            ref_regions = get_ref_bedtool(ref_file, data["config"])
            nblock_regions = _get_nblock_regions(callable_bed, min_n_size, ref_regions)
            nblock_regions = _add_config_regions(nblock_regions, ref_regions, data)
            with file_transaction(data, nblock_bed, callblock_bed) as (tx_nblock_bed, tx_callblock_bed):
                # Only blocks longer than the minimum size are saved as no-read blocks.
                large_nblocks = nblock_regions.filter(lambda r: len(r) > min_n_size)
                large_nblocks.saveas(tx_nblock_bed)
                remaining = ref_regions.subtract(nblock_regions, nonamecheck=True)
                if len(remaining) == 0:
                    raise ValueError("No callable regions found from BAM file. Alignment regions might "
                                     "not overlap with regions found in your `variant_regions` BED: %s" % in_bam)
                callable_blocks = ref_regions.subtract(tx_nblock_bed, nonamecheck=True)
                callable_blocks.merge(d=min_n_size).saveas(tx_callblock_bed)
    return callblock_bed, nblock_bed
def _cnn_tranch_filtering(in_file, vrn_files, tensor_type, data):
    """Filter CNN scored VCFs in tranches using standard SNP and Indel truth sets.

    in_file -- CNN scored VCF to filter.
    vrn_files -- mapping that must provide ``train_hapmap`` (SNP truth) and
      ``train_indels`` (indel truth) VCFs.
    tensor_type -- ``reference`` (uses the ``CNN_1D`` INFO key) or
      ``read_tensor`` (uses ``CNN_2D``).

    Returns the result of ``vcfutils.bgzip_and_index`` on the filtered file.
    Raises ValueError when a required truth set is missing and AssertionError
    for a non-GATK4 runner or an unknown tensor_type.
    """
    out_file = "%s-filter.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        runner = broad.runner_from_config(data["config"])
        gatk_type = runner.gatk_type()
        assert gatk_type == "gatk4", "CNN filtering requires GATK4"
        if "train_hapmap" not in vrn_files:
            raise ValueError("CNN filtering requires HapMap training inputs: %s" % vrn_files)
        # Both truth sets are passed to GATK below; report a missing indel set
        # with the same clear error as a missing HapMap set.
        if "train_indels" not in vrn_files:
            raise ValueError("CNN filtering requires indel training inputs: %s" % vrn_files)
        with file_transaction(data, out_file) as tx_out_file:
            params = ["-T", "FilterVariantTranches", "--variant", in_file,
                      "--output", tx_out_file,
                      "--snp-truth-vcf", vrn_files["train_hapmap"],
                      "--indel-truth-vcf", vrn_files["train_indels"]]
            if tensor_type == "reference":
                info_key = "CNN_1D"
            else:
                assert tensor_type == "read_tensor"
                info_key = "CNN_2D"
            params += ["--info-key", info_key, "--tranche", "99"]
            runner.run_gatk(params)
    return vcfutils.bgzip_and_index(out_file, data["config"])
def _cnvkit_targets(raw_target_bed, access_bed, cov_interval, pct_coverage, work_dir, data):
    """Create target and antitarget regions from target and access files.

    Runs ``cnvkit target`` and ``cnvkit antitarget`` into work_dir, naming
    the outputs after raw_target_bed. Bin sizes are adjusted for genome
    inputs and for inputs where pct_coverage is below 1.0.

    Returns a tuple ``(target_bed, antitarget_bed)``.
    """
    base_name = os.path.splitext(os.path.basename(raw_target_bed))[0]

    target_bed = os.path.join(work_dir, "%s.target.bed" % base_name)
    if not utils.file_uptodate(target_bed, raw_target_bed):
        with file_transaction(data, target_bed) as tx_out_file:
            target_cmd = [_get_cmd(), "target", raw_target_bed, "--split", "-o", tx_out_file]
            if cov_interval == "genome":
                target_cmd.extend(["--avg-size", "500"])
            elif pct_coverage < 1.0:
                # small target regions, use smaller, more defined segments
                target_cmd.extend(["--avg-size", "50"])
            do.run(target_cmd, "CNVkit target")

    antitarget_bed = os.path.join(work_dir, "%s.antitarget.bed" % base_name)
    # NOTE(review): only existence is checked here, unlike the target above;
    # presumably because an antitarget file may legitimately be empty -- confirm.
    if not os.path.exists(antitarget_bed):
        with file_transaction(data, antitarget_bed) as tx_out_file:
            antitarget_cmd = [_get_cmd(), "antitarget", "-g", access_bed, target_bed, "-o", tx_out_file]
            if pct_coverage < 1.0:
                # small target regions, use smaller antitargets
                antitarget_cmd.extend(["--avg-size", "100000"])
            do.run(antitarget_cmd, "CNVkit antitarget")
    return target_bed, antitarget_bed
def prep_seq2c_bed(data):
    """Select, clean and gene-annotate the BED file used as Seq2C input.

    The BED comes from the seq2c background CNV reference when configured,
    otherwise from the SV regions, falling back to the variant regions.
    Inputs with fewer than 4 columns are annotated with gene names; entries
    without a usable name are dropped from the final file.

    Returns the path to the cleaned BED, or None when no BED is available.
    Raises ValueError when a 3-column BED cannot be annotated.
    """
    background = dd.get_background_cnv_reference(data, "seq2c")
    if background:
        bed_file = _background_to_bed(dd.get_background_cnv_reference(data, "seq2c"), data)
    else:
        bed_file = regions.get_sv_bed(data)

    if bed_file:
        bed_file = bedutils.clean_file(bed_file, data, prefix="svregions-")
    else:
        bed_file = bedutils.clean_file(dd.get_variant_regions(data), data)
    if not bed_file:
        return None

    col_num = bt.BedTool(bed_file).field_count()
    if col_num >= 4:
        annotated_file = bed_file
    else:
        annotated_file = annotate.add_genes(bed_file, data, max_distance=0)
        # An unchanged path means no annotation could be added.
        if annotated_file == bed_file:
            raise ValueError("BED file for Seq2C must be annotated with gene names, "
                             "however the input BED is 3-columns and we have no transcript "
                             "data to annotate with " + bed_file)
        annotated_file = annotate.gene_one_per_line(annotated_file, data)

    ready_file = "%s-seq2cclean.bed" % (utils.splitext_plus(annotated_file)[0])
    if not utils.file_uptodate(ready_file, annotated_file):
        bed = bt.BedTool(annotated_file)
        # Keep only the first four columns, except for 8-column inputs.
        if col_num > 4 and col_num != 8:
            bed = bed.cut(range(4))
        missing_names = ["", ".", "-"]
        bed = bed.filter(lambda x: x.name not in missing_names)
        with file_transaction(data, ready_file) as tx_out_file:
            bed.saveas(tx_out_file)
        logger.debug("Saved Seq2C clean annotated ready input BED into " + ready_file)

    return ready_file
def _run_svtyper(in_file, full_bam, sr_bam, exclude_file, data):
    """Genotype structural variant calls with SVtyper.

    Removes calls in high depth regions to avoid slow runtimes:
    https://github.com/hall-lab/svtyper/issues/16

    in_file -- VCF of structural variant calls to genotype.
    full_bam -- BAM passed to svtyper with ``-B``.
    sr_bam -- BAM passed to svtyper with ``-S``.
    exclude_file -- optional regions file; when it exists, calls in these
      regions are dropped through ``bcftools view -T ^file``.
    data -- sample dictionary used for the transaction, reference and config.

    Returns the result of ``vcfutils.sort_by_ref`` on the genotyped file.
    """
    out_file = "%s-wgts.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not vcfutils.vcf_has_variants(in_file):
                # Nothing to genotype: pass the input through. Copy into the
                # transactional path (not out_file directly) so a partial copy
                # is never left behind as an apparently finished output.
                shutil.copy(in_file, tx_out_file)
            else:
                python = sys.executable
                svtyper = os.path.join(os.path.dirname(sys.executable), "svtyper")
                if exclude_file and utils.file_exists(exclude_file):
                    regions_to_rm = "-T ^%s" % (exclude_file)
                else:
                    regions_to_rm = ""
                # add FILTER headers, which are lost during svtyping
                header_file = "%s-header.txt" % utils.splitext_plus(tx_out_file)[0]
                with open(header_file, "w") as out_handle:
                    with utils.open_gzipsafe(in_file) as in_handle:
                        for line in in_handle:
                            # Header lines come first; stop at the first record.
                            if not line.startswith("#"):
                                break
                            if line.startswith("##FILTER"):
                                out_handle.write(line)
                    for region in ref.file_contigs(dd.get_ref_file(data), data["config"]):
                        out_handle.write("##contig=<ID=%s,length=%s>\n" % (region.name, region.size))
                cmd = ("bcftools view {in_file} {regions_to_rm} | "
                       "{python} {svtyper} -M -B {full_bam} -S {sr_bam} | "
                       "bcftools annotate -h {header_file} | "
                       "bgzip -c > {tx_out_file}")
                do.run(cmd.format(in_file=in_file, regions_to_rm=regions_to_rm, python=python,
                                  svtyper=svtyper, full_bam=full_bam, sr_bam=sr_bam,
                                  header_file=header_file, tx_out_file=tx_out_file),
                       "SV genotyping with svtyper")
    return vcfutils.sort_by_ref(out_file, data)
def _cnvkit_segment(cnr_file, cov_interval, data, items, out_file=None):
    """Perform segmentation and copy number calling on normalized inputs.

    cnr_file -- normalized CNVkit ``.cnr`` input.
    cov_interval -- coverage interval; ``genome`` adds a segmentation
      threshold and disables use of small variant VCFs.
    out_file -- optional output path, defaults to cnr_file with a ``.cns``
      extension.

    When ``_cna_has_values`` reports no values in cnr_file, a header-only
    output is written instead of running CNVkit.

    Returns the path to the segmentation file.
    """
    if not out_file:
        out_file = "%s.cns" % os.path.splitext(cnr_file)[0]
    if not utils.file_uptodate(out_file, cnr_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not _cna_has_values(cnr_file):
                with open(tx_out_file, "w") as out_handle:
                    out_handle.write("chromosome\tstart\tend\tgene\tlog2\tprobes\tCN1\tCN2\tbaf\tweight\n")
            else:
                cmd = [_get_cmd(), "segment", "-p", str(dd.get_cores(data)), "-o", tx_out_file, cnr_file]
                small_vrn_files = _compatible_small_variants(data, items)
                # cnr_file is already known to have values in this branch, so
                # it is not checked a second time.
                if len(small_vrn_files) > 0 and cov_interval != "genome":
                    cmd += ["--vcf", small_vrn_files[0].name, "--sample-id", small_vrn_files[0].sample]
                    if small_vrn_files[0].normal:
                        cmd += ["--normal-id", small_vrn_files[0].normal]
                if cov_interval == "genome":
                    cmd += ["--threshold", "0.00001"]
                # For tumors, remove very low normalized regions, avoiding noise
                # https://github.com/chapmanb/bcbio-nextgen/issues/2171#issuecomment-348333650
                paired = vcfutils.get_paired(items)
                if paired:
                    cmd += ["--drop-low-coverage"]
                # preferentially use conda installed Rscript
                export_cmd = ("%s && export TMPDIR=%s && "
                              % (utils.get_R_exports(), os.path.dirname(tx_out_file)))
                do.run(export_cmd + " ".join(cmd), "CNVkit segment")
    return out_file
def _callable_from_gvcf(data, vrn_file, out_dir):
    """Retrieve callable regions based on ref call regions in gVCF.

    Uses https://github.com/lijiayong/gvcf_regions

    Only freebayes, platypus and gatk-haplotype callers are supported; for
    any other configured caller nothing is produced and None is returned.

    Returns the path to the merged BED file of callable regions.
    """
    methods = {"freebayes": "freebayes", "platypus": "platypus",
               "gatk-haplotype": "gatk"}
    gvcf_type = methods.get(dd.get_variantcaller(data))
    if not gvcf_type:
        return None
    bed_name = "%s-gcvf-coverage.bed" % utils.splitext_plus(os.path.basename(vrn_file))[0]
    out_file = os.path.join(out_dir, bed_name)
    if not utils.file_uptodate(out_file, vrn_file):
        with file_transaction(data, out_file) as tx_out_file:
            cmd = ("gvcf_regions.py --gvcf_type {gvcf_type} {vrn_file} "
                   "| bedtools merge > {tx_out_file}")
            do.run(cmd.format(gvcf_type=gvcf_type, vrn_file=vrn_file, tx_out_file=tx_out_file),
                   "Convert gVCF to BED file of callable regions")
    return out_file
def apply_bqsr(data):
    """Apply recalibration, producing an updated BAM file.

    Runs the Sentieon driver with the QualCal and ReadWriter algorithms on
    the aligned BAM, passing dbSNP as known sites when it is present in the
    variation resources.

    Returns the path to the recalibrated ``-recal.bam`` file.
    """
    in_file = dd.get_align_bam(data)
    base = utils.splitext_plus(in_file)[0]
    out_table_file = "%s-recal-table-post.txt" % base
    out_file = "%s-recal.bam" % base
    if utils.file_uptodate(out_file, in_file):
        return out_file
    with file_transaction(data, out_file, out_table_file) as (tx_out_file, tx_table_file):
        assoc_files = dd.get_variation_resources(data)
        if "dbsnp" in assoc_files:
            known = "-k %s" % (assoc_files.get("dbsnp"))
        else:
            known = ""
        license_cmd = license_export(data)
        cores = dd.get_num_cores(data)
        ref_file = dd.get_ref_file(data)
        cmd = ("{license}sentieon driver -t {cores} -r {ref_file} "
               "-i {in_file} --algo QualCal {known} {tx_table_file} "
               "--algo ReadWriter {tx_out_file}")
        do.run(cmd.format(license=license_cmd, cores=cores, ref_file=ref_file, in_file=in_file,
                          known=known, tx_table_file=tx_table_file, tx_out_file=tx_out_file),
               "Sentieon QualCal apply recalibration")
    return out_file
def umi_consensus(data):
    """Convert UMI grouped reads into fastq pair for re-alignment.

    Pipes the work BAM through fgbio GroupReadsByUmi and
    CallMolecularConsensusReads, then converts the consensus reads to a
    gzipped fastq pair with bamtofastq.

    Returns a tuple ``(f1_out, f2_out)`` of fastq paths. Only the first
    fastq is checked to decide whether the outputs are up to date.
    """
    align_bam = dd.get_work_bam(data)
    bam_base = utils.splitext_plus(align_bam)[0]
    f1_out = "%s-cumi-1.fq.gz" % bam_base
    f2_out = "%s-cumi-2.fq.gz" % bam_base
    if utils.file_uptodate(f1_out, align_bam):
        return f1_out, f2_out
    with file_transaction(data, f1_out, f2_out) as (tx_f1_out, tx_f2_out):
        jvm_opts = _get_fgbio_jvm_opts(data, os.path.dirname(tx_f1_out), 2)
        # Improve speeds by avoiding compression read/write bottlenecks
        io_opts = ("-Dsamjdk.use_async_io_read_samtools=true -Dsamjdk.use_async_io_write_samtools=true "
                   "-Dsamjdk.compression_level=0")
        group_opts, cons_opts = _get_fgbio_options(data)
        tmp_prefix = "%s-bamtofastq-tmp" % utils.splitext_plus(f1_out)[0]
        cmd = ("unset JAVA_HOME && "
               "fgbio {jvm_opts} {io_opts} GroupReadsByUmi {group_opts} -s adjacency -i {align_bam} | "
               "fgbio {jvm_opts} {io_opts} CallMolecularConsensusReads {cons_opts} "
               "--output-per-base-tags=false --sort-order=unsorted "
               "-i /dev/stdin -o /dev/stdout | "
               "bamtofastq collate=1 T={tempfile} F={tx_f1_out} F2={tx_f2_out} tags=cD,cM,cE gz=1")
        do.run(cmd.format(jvm_opts=jvm_opts, io_opts=io_opts, group_opts=group_opts,
                          align_bam=align_bam, cons_opts=cons_opts, tempfile=tmp_prefix,
                          tx_f1_out=tx_f1_out, tx_f2_out=tx_f2_out),
               "UMI consensus fastq generation")
    return f1_out, f2_out
def _rtg_add_summary_file(eval_files, base_dir, data):
    """Parse output TP FP and FN files to generate metrics for plotting.

    Counts SNP and indel records in each of the ``tp``, ``fp`` and ``fn``
    files with bcftools (0 when a file is missing) and writes them to
    ``validate-summary.csv`` in base_dir.

    Returns eval_files with the summary path stored under ``summary``.
    """
    out_file = os.path.join(base_dir, "validate-summary.csv")
    if not utils.file_uptodate(out_file, eval_files.get("tp", eval_files["fp"])):
        variant_types = [("SNPs", "--types snps"), ("Indels", "--exclude-types snps")]
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                writer = csv.writer(out_handle)
                writer.writerow(["sample", "caller", "vtype", "metric", "value"])
                base = _get_sample_and_caller(data)
                for metric in ["tp", "fp", "fn"]:
                    in_file = eval_files.get(metric)
                    for vtype, bcftools_types in variant_types:
                        count = 0
                        if in_file and os.path.exists(in_file):
                            cmd = ("bcftools view {bcftools_types} {in_file} | grep -v ^# | wc -l")
                            cmd = cmd.format(bcftools_types=bcftools_types, in_file=in_file)
                            count = int(subprocess.check_output(cmd, shell=True))
                        writer.writerow(base + [vtype, metric, count])
    eval_files["summary"] = out_file
    return eval_files
def block_regions(in_bam, ref_file, config):
    """Find blocks of regions for analysis from mapped input BAM file.

    Identifies islands of callable regions, surrounded by regions with no
    read support, that can be analyzed independently.

    The callable BED is computed with ``parallel_callable_loci``; the block
    size cutoff comes from ``nomap_split_size`` (default 100).

    Returns a tuple ``(callblock_bed, nblock_bed, callable_bed)``.
    Raises ValueError when subtracting the no-read blocks from the reference
    leaves no regions.
    """
    min_n_size = int(config["algorithm"].get("nomap_split_size", 100))
    with shared.bedtools_tmpdir({"config": config}):
        callable_bed = parallel_callable_loci(in_bam, ref_file, config)
        bed_base, bed_ext = os.path.splitext(callable_bed)
        nblock_bed = "%s-nblocks%s" % (bed_base, bed_ext)
        callblock_bed = "%s-callableblocks%s" % (bed_base, bed_ext)
        if not utils.file_uptodate(nblock_bed, callable_bed):
            ref_regions = get_ref_bedtool(ref_file, config)
            nblock_regions = _get_nblock_regions(callable_bed, min_n_size)
            nblock_regions = _add_config_regions(nblock_regions, ref_regions, config)
            nblock_regions.saveas(nblock_bed)
            remaining = ref_regions.subtract(nblock_regions)
            if len(remaining) == 0:
                raise ValueError("No callable regions found from BAM file. Alignment regions might "
                                 "not overlap with regions found in your `variant_regions` BED: %s" % in_bam)
            callable_blocks = ref_regions.subtract(nblock_bed)
            callable_blocks.merge(d=min_n_size).saveas(callblock_bed)
    return callblock_bed, nblock_bed, callable_bed
def _prioritize_plot_regions(region_bt, data, out_dir=None):
    """Avoid plotting large numbers of regions due to speed issues. Prioritize most interesting.

    XXX For now, just removes larger regions and avoids plotting thousands of
    regions. Longer term we'll insert biology-based prioritization.

    Keeps at most 1000 regions, each shorter than 100kb, in input order and
    writes them as a 3-column BED.

    Returns the path to the prioritized BED file, placed in out_dir when given.
    """
    max_plots = 1000
    max_size = 100 * 1000  # 100kb
    out_file = "%s-priority%s" % utils.splitext_plus(region_bt.fn)
    if out_dir:
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    if utils.file_uptodate(out_file, region_bt.fn):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            written = 0
            for region in region_bt:
                is_small = region.stop - region.start < max_size
                if is_small and written < max_plots:
                    written += 1
                    out_handle.write("%s\t%s\t%s\n" % (region.chrom, region.start, region.stop))
    return out_file
def calculate(bam_file, data, sv_bed):
    """Calculate coverage in parallel using mosdepth.

    Removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;

    Coverage is computed for the variant regions (or genome-wide regions when
    none are configured), the cleaned SV regions and the cleaned coverage
    regions, skipping any of these without a BED file.

    Returns a tuple ``(final_callable, depth_files)``: the callable BED subset
    to the variant regions, and a dictionary of mosdepth outputs keyed by
    target name. depth_files is empty when an up to date callable file exists.
    """
    params = {"min": dd.get_coverage_depth_min(data)}
    variant_regions = dd.get_variant_regions_merged(data)
    if not variant_regions:
        variant_regions = _create_genome_regions(data)
    # Back compatible with previous pre-mosdepth callable files
    align_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                dd.get_sample_name(data)))
    callable_file = os.path.join(align_dir, "%s-coverage.callable.bed" % (dd.get_sample_name(data)))
    depth_files = {}
    if not utils.file_uptodate(callable_file, bam_file):
        vr_quantize = ("0:1:%s:" % (params["min"]), ["NO_COVERAGE", "LOW_COVERAGE", "CALLABLE"])
        # (target name, regions, quantize, thresholds, per base output)
        to_calculate = [("variant_regions", variant_regions,
                         vr_quantize, None, "coverage_perbase" in dd.get_tools_on(data)),
                        ("sv_regions", bedutils.clean_file(sv_bed, data, prefix="svregions-"),
                         None, None, False),
                        ("coverage", bedutils.clean_file(dd.get_coverage(data), data, prefix="cov-"),
                         None, DEPTH_THRESHOLDS, False)]
        for target_name, region_bed, quantize, thresholds, per_base in to_calculate:
            if not region_bed:
                continue
            depth_info = run_mosdepth(data, target_name, region_bed, quantize=quantize,
                                      thresholds=thresholds, per_base=per_base)
            cur_depth = {}
            for attr in ("dist", "regions", "thresholds", "per_base"):
                val = getattr(depth_info, attr, None)
                if val:
                    cur_depth[attr] = val
            depth_files[target_name] = cur_depth
            # The quantized variant region output replaces the callable file.
            if target_name == "variant_regions":
                callable_file = depth_info.quantize
    final_callable = _subset_to_variant_regions(callable_file, variant_regions, data)
    return final_callable, depth_files
def pon_to_bed(pon_file, out_dir, data):
    """Extract BED intervals from a GATK4 hdf5 panel of normal file.

    Reads ``original_data/intervals`` from pon_file: the rows of
    ``transposed_index_start_end`` hold the contig index, start and end of
    each interval, and contig indexes are looked up in
    ``indexed_contig_names``.

    Returns the path to a 3-column ``-intervals.bed`` file in out_dir.
    """
    out_file = os.path.join(out_dir, "%s-intervals.bed" % (utils.splitext_plus(os.path.basename(pon_file))[0]))
    if not utils.file_uptodate(out_file, pon_file):
        import h5py
        with file_transaction(data, out_file) as tx_out_file:
            with h5py.File(pon_file, "r") as f:
                with open(tx_out_file, "w") as out_handle:
                    intervals = f["original_data"]["intervals"]
                    contig_names = intervals["indexed_contig_names"]
                    index_start_end = intervals["transposed_index_start_end"]
                    # Pull each row once, rather than re-indexing the stored
                    # data three times for every interval.
                    contig_indexes = index_start_end[0]
                    starts = index_start_end[1]
                    ends = index_start_end[2]
                    for contig_index, start, end in zip(contig_indexes, starts, ends):
                        chrom = contig_names[contig_index]
                        # Start is shifted down by one; presumably converting
                        # 1-based starts to 0-based BED -- confirm.
                        out_handle.write("%s\t%s\t%s\n" % (chrom, int(start) - 1, int(end)))
    return out_file
def to_bed(call, sample, work_dir, calls, data):
    """Create a simplified BED file from caller specific input.

    Looks up the converter for the caller in ``CALLER_TO_BED``. For callers
    listed in ``SUBSET_BY_SUPPORT`` the input is first subset using the
    supporting calls found in ``calls``.

    Returns the BED path when the file exists, otherwise None.
    """
    caller = call["variantcaller"]
    out_file = os.path.join(work_dir, "%s-%s-flat.bed" % (sample, caller))
    needs_update = call.get("vrn_file") and not utils.file_uptodate(out_file, call["vrn_file"])
    if needs_update:
        with file_transaction(data, out_file) as tx_out_file:
            convert_fn = CALLER_TO_BED.get(caller)
            if convert_fn:
                vrn_file = call["vrn_file"]
                if caller in SUBSET_BY_SUPPORT:
                    support_callers = SUBSET_BY_SUPPORT[caller]
                    ecalls = [x for x in calls if x["variantcaller"] in support_callers]
                    if len(ecalls) > 0:
                        vrn_file = _subset_by_support(call["vrn_file"], ecalls, data)
                convert_fn(vrn_file, caller, tx_out_file)
    if utils.file_exists(out_file):
        return out_file
def finalize_vcf(in_file, variantcaller, items):
    """Perform cleanup and dbSNP annotation of the final VCF.

    - Adds contigs to header for bcftools compatibility
    - adds sample information for tumor/normal

    The contig and header commands are joined into a shell pipe and handed to
    ``_add_dbsnp``, which only runs when a dbSNP file is configured for the
    first item.

    Returns the bgzipped and indexed annotated file when it exists, otherwise
    the unchanged in_file.
    """
    out_file = "%s-annotated%s" % utils.splitext_plus(in_file)
    if not utils.file_uptodate(out_file, in_file):
        header_cl = _add_vcf_header_sample_cl(in_file, items, out_file)
        contig_cl = _add_contig_cl(in_file, items, out_file)
        pipe_parts = [cl for cl in (contig_cl, header_cl) if cl]
        post_cl = None
        if pipe_parts:
            post_cl = " | ".join(pipe_parts) + " | "
        dbsnp_file = tz.get_in(("genome_resources", "variation", "dbsnp"), items[0])
        if dbsnp_file:
            out_file = _add_dbsnp(in_file, dbsnp_file, items[0], out_file, post_cl)
    if not utils.file_exists(out_file):
        return in_file
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
def remove_extracontigs(in_bam, data):
    """Remove extra contigs (non chr1-22,X,Y) from an input BAM.

    These extra contigs can often be arranged in different ways, causing
    incompatibility issues with GATK and other tools. This also fixes the
    read group header as in fixrg.

    This does not yet handle mapping over 1 -> chr1 issues since this requires
    a ton of search/replace which slows down conversion.

    Returns the path to the ``-noextras.bam`` file in the sample's bamclean
    work directory.
    """
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bamclean", dd.get_sample_name(data)))
    bam_base = utils.splitext_plus(os.path.basename(in_bam))[0]
    out_file = os.path.join(work_dir, "%s-noextras.bam" % bam_base)
    if utils.file_uptodate(out_file, in_bam):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        target_chroms = _target_chroms_and_header(in_bam, data)
        str_chroms = " ".join(target_chroms)
        rg_info = novoalign.get_rg_info(data["rgnames"])
        bcbio_py = sys.executable
        ref_file = dd.get_ref_file(data)
        # Work on an indexed symlink of the input inside the transaction directory.
        local_bam = os.path.join(os.path.dirname(tx_out_file), os.path.basename(in_bam))
        utils.symlink_plus(in_bam, local_bam)
        bam.index(local_bam, data["config"])
        cmd = ("samtools view -h {local_bam} {str_chroms} | "
               """{bcbio_py} -c 'from bcbio.pipeline import cleanbam; """
               """cleanbam.fix_header("{ref_file}")' | """
               "samtools view -u - | "
               "samtools addreplacerg -r '{rg_info}' -m overwrite_all -O bam -o {tx_out_file} - ")
        do.run(cmd.format(local_bam=local_bam, str_chroms=str_chroms, bcbio_py=bcbio_py,
                          ref_file=ref_file, rg_info=rg_info, tx_out_file=tx_out_file),
               "bamprep, remove extra contigs: %s" % dd.get_sample_name(data))
    return out_file
def _gatk_apply_bqsr(data):
    """Parallel BQSR support for GATK4.

    Normalized qualities to 4 bin outputs based on pipeline standard
    recommendations, which will help with output file sizes:
    https://github.com/CCDG/Pipeline-Standardization/blob/master/PipelineStandard.md#base-quality-score-binning-scheme

    spark host and timeout settings help deal with runs on restricted systems
    where we encounter network and timeout errors

    Uses ApplyBQSRSpark for GATK4 and PrintReads otherwise, with the
    recalibration table taken from ``data["prep_recal"]``.

    Returns the path to the indexed ``-recal.bam`` file.
    """
    in_file = dd.get_align_bam(data) or dd.get_work_bam(data)
    out_file = os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data),
                            "%s-recal.bam" % utils.splitext_plus(os.path.basename(in_file))[0])
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            broad_runner = broad.runner_from_config(data["config"])
            gatk_type = broad_runner.gatk_type()
            cores = dd.get_num_cores(data)
            tx_dir = os.path.dirname(tx_out_file)
            # Avoid problems with intel deflater for GATK 3.8 and GATK4
            # https://github.com/chapmanb/bcbio-nextgen/issues/2145#issuecomment-343095357
            if gatk_type == "gatk4":
                params = ["-T", "ApplyBQSRSpark", "--spark-master", "local[%s]" % cores,
                          "--input", in_file, "--output", tx_out_file,
                          "--bqsr-recal-file", data["prep_recal"],
                          "--conf", "spark.local.dir=%s" % tx_dir,
                          "--conf", "spark.driver.host=localhost",
                          "--conf", "spark.network.timeout=800",
                          "--quantize-quals", "4"]
                params.extend(["--jdk-deflater", "--jdk-inflater"])
            else:
                params = ["-T", "PrintReads", "-R", dd.get_ref_file(data), "-I", in_file,
                          "-BQSR", data["prep_recal"], "-o", tx_out_file]
                if LooseVersion(broad_runner.gatk_major_version()) > LooseVersion("3.7"):
                    params.extend(["-jdk_deflater", "-jdk_inflater"])
            if cores > 1:
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
            else:
                memscale = None
            broad_runner.run_gatk(params, tx_dir, memscale=memscale, parallel_gc=True)
    bam.index(out_file, data["config"])
    return out_file
def _filter_by_normal(tumor_counts, normal_counts, data):
    """Filter count files based on normal frequency and median depth, avoiding high depth regions.

    For frequency, restricts normal positions to those between 0.4 and 0.65

    For depth, matches approach used in AMBER to try and avoid problematic
    genomic regions with high count in the normal:
    https://github.com/hartwigmedical/hmftools/tree/master/amber#usage

    Tumor and normal files are read in lockstep. Everything up to and
    including the column header (first normal line not starting with ``@``)
    is copied through; later line pairs are kept only when the normal line
    passes both the depth and frequency checks.

    Returns a tuple ``(tumor_out, normal_out)`` of filtered file paths.
    """
    from bcbio.heterogeneity import bubbletree
    fparams = bubbletree.NORMAL_FILTER_PARAMS
    tumor_out = "%s-normfilter%s" % utils.splitext_plus(tumor_counts)
    normal_out = "%s-normfilter%s" % utils.splitext_plus(normal_counts)
    if utils.file_uptodate(tumor_out, tumor_counts):
        return tumor_out, normal_out
    with file_transaction(data, tumor_out, normal_out) as (tx_tumor_out, tx_normal_out):
        median_depth = _get_normal_median_depth(normal_counts)
        min_normal_depth = median_depth * fparams["min_depth_percent"]
        max_normal_depth = median_depth * fparams["max_depth_percent"]
        with open(tumor_counts) as tumor_handle, \
                open(normal_counts) as normal_handle, \
                open(tx_tumor_out, "w") as tumor_out_handle, \
                open(tx_normal_out, "w") as normal_out_handle:
            columns = None
            for tumor_line, normal_line in zip(tumor_handle, normal_handle):
                if columns is None:
                    if not normal_line.startswith("@"):
                        columns = normal_line.strip().split()
                    keep = True
                else:
                    keep = (_normal_passes_depth(columns, normal_line, min_normal_depth, max_normal_depth)
                            and _normal_passes_freq(columns, normal_line, fparams))
                if keep:
                    tumor_out_handle.write(tumor_line)
                    normal_out_handle.write(normal_line)
    return tumor_out, normal_out
def remove_highdepth_regions(in_file, items):
    """Remove high depth regions from a BED file for analyzing a set of calls.

    Tries to avoid spurious errors and slow run times in collapsed repeat regions.

    Also adds ENCODE blacklist regions which capture additional collapsed repeats
    around centromeres.

    The regions to remove are the unique ``highdepth_regions`` BEDs configured
    across items plus the ENCODE blacklist of the first item when that file
    exists. When there is nothing to remove, the output is a link to in_file.

    Returns the path to the ``-glimit`` BED file.
    """
    from bcbio.variation import bedutils
    configured_beds = set([tz.get_in(["config", "algorithm", "highdepth_regions"], x) for x in items])
    # Build a real list: it is appended to and measured below, which a lazy
    # filter() result does not support on Python 3.
    highdepth_beds = [x for x in configured_beds if x is not None]
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], items[0])
    if encode_bed and os.path.exists(encode_bed):
        highdepth_beds.append(encode_bed)
    out_file = "%s-glimit%s" % utils.splitext_plus(in_file)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            with bedtools_tmpdir(items[0]):
                all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                if len(highdepth_beds) > 0:
                    with open(all_file, "w") as out_handle:
                        for line in fileinput.input(highdepth_beds):
                            # Keep at most the first four columns of each line.
                            parts = line.split("\t")
                            out_handle.write("\t".join(parts[:4]).rstrip() + "\n")
                if utils.file_exists(all_file):
                    to_remove = bedutils.sort_merge(all_file, items[0])
                    cmd = "bedtools subtract -nonamecheck -a {in_file} -b {to_remove} > {tx_out_file}"
                    do.run(cmd.format(in_file=in_file, to_remove=to_remove, tx_out_file=tx_out_file),
                           "Remove high depth regions")
                else:
                    utils.symlink_plus(in_file, out_file)
    return out_file
def _convert_to_csv(vcf_file, good_bed, bad_bed):
    """Convert WHAM output file into CSV metrics for graphical exploration.

    Each VCF record with a break end (``BE``) on the same chromosome is
    extended by a 25bp buffer and matched against the regions read from
    good_bed and bad_bed. For matching records, one row per attribute
    (PU, LRT, SI, MQ) is written with the class ``good`` or ``bad``.

    A summary of the collected LRT values (mean, median, 25th percentile,
    max, min) is printed when at least one record was classified.

    Returns the path to the ``-metrics.csv`` file.
    """
    attrs = ["PU", "LRT", "SI", "MQ"]
    buffer_size = 25  # bp around break ends
    out_file = "%s-metrics.csv" % utils.splitext_plus(vcf_file)[0]
    if not utils.file_uptodate(out_file, vcf_file):
        lrts = []
        good = _read_bed(good_bed)
        bad = _read_bed(bad_bed)
        with open(out_file, "w") as out_handle:
            reader = vcf.Reader(filename=vcf_file)
            writer = csv.writer(out_handle)
            header = ["chrom", "start", "end", "class", "attr", "val"]
            writer.writerow(header)
            for rec in reader:
                start = max(rec.start - buffer_size, 0)
                if rec.INFO["BE"][0] not in [".", None]:
                    other_chrom, end, count = rec.INFO["BE"]
                    if int(end) > start and other_chrom == rec.CHROM:
                        end = int(end) + buffer_size
                        if (rec.CHROM, start, end) in good:
                            cur_class = "good"
                        elif (rec.CHROM, start, end) in bad:
                            cur_class = "bad"
                        else:
                            cur_class = None
                        if cur_class:
                            lrts.append(rec.INFO["LRT"])
                            for attr in attrs:
                                writer.writerow([rec.CHROM, start, end, cur_class, attr, rec.INFO[attr]])
        # Summary statistics are undefined without values (max/min would
        # raise), so only report when something was classified.
        if lrts:
            import numpy as np
            # Single formatted argument prints identically on Python 2 and 3.
            print("%s %s %s %s %s" % (np.mean(lrts), np.median(lrts), np.percentile(lrts, 25),
                                      max(lrts), min(lrts)))
    return out_file
def sort_merge(in_file, data, out_dir=None):
    """Sort and merge a BED file, collapsing gene names.

    Output is a 3 or 4 column file (the 4th column values go comma-separated).
    The 4th column is carried through when any non-header line of the input
    has at least 4 whitespace separated fields.

    in_file -- BED file, optionally gzipped.
    out_dir -- optional directory for the output; defaults to alongside in_file.

    Returns the path to the ``-sortmerge.bed`` file.
    """
    out_file = "%s-sortmerge.bed" % os.path.splitext(in_file)[0]
    bedtools = config_utils.get_program("bedtools", data, default="bedtools")
    if out_dir:
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    if not utils.file_uptodate(out_file, in_file):
        column_opt = ""
        with utils.open_gzipsafe(in_file) as in_handle:
            for line in in_handle:
                if not line.startswith(("#", "track", "browser", "@")):
                    if len(line.split()) >= 4:
                        column_opt = "-c 4 -o distinct"
                        # The option never changes once set; skip the rest of the file.
                        break
        with file_transaction(data, out_file) as tx_out_file:
            cat_cmd = "zcat" if in_file.endswith(".gz") else "cat"
            sort_cmd = get_sort_cmd(os.path.dirname(tx_out_file))
            cmd = ("{cat_cmd} {in_file} | {sort_cmd} -k1,1 -k2,2n | "
                   "{bedtools} merge -i - {column_opt} > {tx_out_file}")
            do.run(cmd.format(cat_cmd=cat_cmd, in_file=in_file, sort_cmd=sort_cmd, bedtools=bedtools,
                              column_opt=column_opt, tx_out_file=tx_out_file),
                   "Sort and merge BED file", data)
    return out_file
def _gatk_apply_bqsr(data):
    """Parallel BQSR support for GATK4.

    Uses ApplyBQSRSpark for GATK4 and PrintReads otherwise, with the
    recalibration table taken from ``data["prep_recal"]``. Parallel garbage
    collection is only requested for multicore GATK4 runs.

    Returns the path to the indexed ``-recal.bam`` file.
    """
    in_file = dd.get_align_bam(data) or dd.get_work_bam(data)
    out_file = os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data),
                            "%s-recal.bam" % utils.splitext_plus(os.path.basename(in_file))[0])
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            broad_runner = broad.runner_from_config(data["config"])
            gatk_type = broad_runner.gatk_type()
            cores = dd.get_num_cores(data)
            tx_dir = os.path.dirname(tx_out_file)
            is_gatk4 = gatk_type == "gatk4"
            if is_gatk4:
                params = ["-T", "ApplyBQSRSpark", "--sparkMaster", "local[%s]" % cores,
                          "--input", in_file, "--output", tx_out_file,
                          "--bqsr_recal_file", data["prep_recal"],
                          "--conf", "spark.local.dir=%s" % tx_dir]
            else:
                params = ["-T", "PrintReads", "-R", dd.get_ref_file(data), "-I", in_file,
                          "-BQSR", data["prep_recal"], "-o", tx_out_file]
            multicore = cores > 1
            if multicore:
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
            else:
                memscale = None
            broad_runner.run_gatk(params, tx_dir, memscale=memscale,
                                  parallel_gc=(multicore and is_gatk4))
    bam.index(out_file, data["config"])
    return out_file
def umi_consensus(data):
    """Build consensus reads from UMI-tagged alignments as a fastq pair.

    Pipes the work BAM through fgbio GroupReadsByUmi, an fgbio consensus
    caller (duplex for the "paired" UMI method, molecular otherwise) and
    bamtofastq. Returns the two gzipped fastq paths, which sit next to the
    input BAM as ``<base>-cumi-1.fq.gz`` and ``<base>-cumi-2.fq.gz``. The
    pipeline is skipped when the first fastq is up to date.
    """
    align_bam = dd.get_work_bam(data)
    umi_method, umi_tag = _check_umi_type(align_bam)
    bam_base = utils.splitext_plus(align_bam)[0]
    f1_out = "%s-cumi-1.fq.gz" % bam_base
    f2_out = "%s-cumi-2.fq.gz" % bam_base
    if utils.file_uptodate(f1_out, align_bam):
        return f1_out, f2_out

    with file_transaction(data, f1_out, f2_out) as (tx_f1_out, tx_f2_out):
        jvm_opts = _get_fgbio_jvm_opts(data, os.path.dirname(tx_f1_out), 2)
        group_opts, cons_opts = _get_fgbio_options(data, umi_method)
        if umi_method == "paired":
            cons_method = "CallDuplexConsensusReads"
        else:
            cons_method = "CallMolecularConsensusReads"
        collate_tmp = "%s-bamtofastq-tmp" % utils.splitext_plus(f1_out)[0]
        template = ("unset JAVA_HOME && "
                    "fgbio {jvm_opts} {io_opts} GroupReadsByUmi {group_opts} -t {umi_tag} -s {umi_method} "
                    "-i {align_bam} | "
                    "fgbio {jvm_opts} {io_opts} {cons_method} {cons_opts} --sort-order=unsorted "
                    "-i /dev/stdin -o /dev/stdout | "
                    "bamtofastq collate=1 T={tempfile} F={tx_f1_out} F2={tx_f2_out} tags=cD,cM,cE gz=1")
        cmd = template.format(
            jvm_opts=jvm_opts,
            # Improve speeds by avoiding compression read/write bottlenecks
            io_opts="--async-io=true --compression=0",
            group_opts=group_opts,
            umi_tag=umi_tag,
            umi_method=umi_method,
            align_bam=align_bam,
            cons_method=cons_method,
            cons_opts=cons_opts,
            tempfile=collate_tmp,
            tx_f1_out=tx_f1_out,
            tx_f2_out=tx_f2_out)
        do.run(cmd, "UMI consensus fastq generation")
    return f1_out, f2_out
def _remove_overlaps(in_file, out_dir, data):
    """Remove regions that overlap with the next region.

    Overlapping regions result in issues with PureCN. Lines are compared
    pairwise in file order: a line is dropped when the following line is on
    the same chromosome and starts before this line ends. The final line has
    no successor and is always kept.

    Args:
        in_file: tab-delimited file whose first three columns are
            chromosome, start and end.
        out_dir: directory receiving the filtered file.
        data: sample data, passed through to the file transaction.

    Returns:
        Path to ``<out_dir>/<base>-nooverlaps<ext>``. The file is only
        regenerated when it is not up to date relative to `in_file`.
    """
    out_file = os.path.join(
        out_dir, "%s-nooverlaps%s" % utils.splitext_plus(os.path.basename(in_file)))
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(in_file) as in_handle, open(tx_out_file, "w") as out_handle:
                prev_line = None
                for line in in_handle:
                    if prev_line is not None:
                        pchrom, _, pend = prev_line.split("\t", 4)[:3]
                        cchrom, cstart, _ = line.split("\t", 4)[:3]
                        # Skip if chromosomes match and end overlaps start
                        if not (pchrom == cchrom and int(pend) > int(cstart)):
                            out_handle.write(prev_line)
                    prev_line = line
                # An empty input never sets prev_line; writing None would
                # raise TypeError, so produce an empty output instead.
                if prev_line is not None:
                    out_handle.write(prev_line)
    return out_file
def coverage_region_detailed_stats(bed_file, data, out_dir, extra_cutoffs=None):
    """Calculate coverage at different completeness cutoffs for the regions in `bed_file`.

    Runs mosdepth for the regions, copies its coverage and distribution
    outputs into `out_dir` (when the copied coverage file is not up to date)
    and computes percentiles at each depth cutoff.

    Args:
        bed_file: BED file of regions; a missing or empty value yields ``[]``.
        data: sample data passed to the mosdepth and percentile helpers.
        out_dir: directory receiving the copied and computed files.
        extra_cutoffs: optional iterable of additional depth cutoffs, merged
            with the built-in defaults.

    Returns:
        List of absolute paths of the files produced by the percentile
        calculation, or an empty list when there is no usable `bed_file`.
    """
    if not bed_file or not utils.file_exists(bed_file):
        return []
    cov_file, dist_file = _run_mosdepth(bed_file, data)
    out_cov_file = os.path.join(out_dir, os.path.basename(cov_file))
    out_dist_file = os.path.join(out_dir, os.path.basename(dist_file))
    if not utils.file_uptodate(out_cov_file, cov_file):
        utils.copy_plus(cov_file, out_cov_file)
        utils.copy_plus(dist_file, out_dist_file)
    cutoffs = {1, 5, 10, 20, 50, 100, 250, 500, 1000, 5000, 10000, 50000}
    if extra_cutoffs:
        # Accept any iterable (list, tuple, set), not only sets.
        cutoffs |= set(extra_cutoffs)
    # Always hand over a sorted list so the downstream input has the same
    # type and order whether or not extra cutoffs were supplied.
    cutoffs = sorted(cutoffs)
    out_files = _calculate_percentiles(out_cov_file, out_dist_file, cutoffs, out_dir, data)
    return [os.path.abspath(x) for x in out_files]
def _prep_cnv_file(cns_file, svcaller, work_dir, data):
    """Create a CSV file of CNV calls with log2 and number of marks.

    Reads the tab-delimited `cns_file`, skips its header row and, for rows
    on autosomal chromosomes, writes chrom (UCSC style), start, end, the 6th
    input column as ``num.mark`` and the 5th input column as ``seg.mean``.
    The 4th input column is ignored.

    Args:
        cns_file: tab-delimited segment file with a header line.
        svcaller: caller name, used in the output file name.
        work_dir: directory receiving the output file.
        data: sample data, passed through to the file transaction.

    Returns:
        Path to ``<work_dir>/<cns base>-<svcaller>-prep.csv``; only
        regenerated when not up to date relative to `cns_file`.
    """
    in_file = cns_file
    out_file = os.path.join(
        work_dir,
        "%s-%s-prep.csv" % (utils.splitext_plus(os.path.basename(in_file))[0], svcaller))
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(in_file) as in_handle, open(tx_out_file, "w") as out_handle:
                reader = csv.reader(in_handle, dialect="excel-tab")
                writer = csv.writer(out_handle)
                writer.writerow(["chrom", "start", "end", "num.mark", "seg.mean"])
                # Skip the header. The next() builtin works on Python 2 and 3
                # (reader.next() is Python 2 only), and the default avoids
                # StopIteration on an empty input.
                next(reader, None)
                for chrom, start, end, _, log2, probes in (xs[:6] for xs in reader):
                    if chromhacks.is_autosomal(chrom):
                        writer.writerow([_to_ucsc_style(chrom), start, end, probes, log2])
    return out_file
def _filter_ensemble(in_bed, data):
    """Filter ensemble set of calls, requiring calls supported by 2 callers.

    We filter only smaller size events, which seem to benefit the most since
    they have lower precision. We also check to be sure that the required
    number of callers actually called in each event, since some callers don't
    handle all event types.

    Each input line is expected to hold chrom, start, end and a
    comma-separated list of ``<event>_<caller>`` entries in its first four
    whitespace-separated fields. A line is written (at most once) when, for
    some event outside the supporting-only set (BND, UKN), more than one
    caller has been seen, the event is larger than the size threshold, or no
    more than N_FILTER_CALLERS callers produced that event type at all.

    Args:
        in_bed: BED file of ensemble calls.
        data: sample data, used for event conversion and the transaction.

    Returns:
        Path to ``<in_bed base>-filter<ext>``; only regenerated when not up
        to date relative to `in_bed`.
    """
    support_events = {"BND", "UKN"}
    max_size = max(xs[1] for xs in validate.EVENT_SIZES[:2])
    out_file = "%s-filter%s" % utils.splitext_plus(in_bed)
    if not utils.file_uptodate(out_file, in_bed):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle, open(in_bed) as in_handle:
                total_callers = validate.callers_by_event(in_bed, data)
                for line in in_handle:
                    chrom, start, end, caller_strs = line.strip().split()[:4]
                    size = int(end) - int(start)
                    events = collections.defaultdict(set)
                    for event, caller in [x.split("_", 1) for x in caller_strs.split(",")]:
                        events[validate.cnv_to_event(event, data)].add(caller)
                    # Callers accumulate across the events visited so far on
                    # this line, so supporting-only events still contribute.
                    all_callers = set()
                    # items() rather than the Python 2 only iteritems().
                    for event, callers in events.items():
                        all_callers = all_callers.union(callers)
                        if event not in support_events:
                            if (len(all_callers) > 1 or size > max_size
                                    or len(total_callers[event]) <= N_FILTER_CALLERS):
                                out_handle.write(line)
                                # One passing event is enough; avoid duplicates.
                                break
    return out_file
def _collapse_transcripts(in_file, window, data, out_dir):
    """Reduce each transcript to its min/max extent, padded by `window`.

    The input BED is sorted by name then chromosome, consecutive records
    sharing (name, chrom) are grouped, and every coordinate group returned by
    `_group_coords` becomes one output interval. Intervals are extended by
    `window` on both sides and clamped to [0, chromosome size]. The collapsed
    file is written to `out_dir` (the directory of `in_file` when `out_dir`
    is None) and the sort-merged version of it is returned.
    """
    if out_dir is None:
        out_dir = os.path.dirname(in_file)
    stem = os.path.splitext(os.path.basename(in_file))[0]
    out_file = os.path.join(out_dir, "%s-transcripts_w%s.bed" % (stem, window))
    chrom_sizes = {contig.name: contig.size
                   for contig in ref.file_contigs(dd.get_ref_file(data), data["config"])}
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            prep_file = "%s-sortprep%s" % os.path.splitext(tx_out_file)
            sort_template = "{sort_cmd} -k4,4 -k1,1 {in_file} > {prep_file}"
            do.run(sort_template.format(sort_cmd=bedutils.get_sort_cmd(),
                                        in_file=in_file,
                                        prep_file=prep_file),
                   "Sort BED file by transcript name")
            with open(tx_out_file, "w") as out_handle:
                # Work around for segmentation fault issue with groupby
                # https://github.com/daler/pybedtools/issues/131#issuecomment-89832476
                intervals = pybedtools.BedTool(prep_file)

                def _iter_intervals():
                    for interval in intervals:
                        yield interval

                grouped = itertools.groupby(_iter_intervals(),
                                            lambda rec: (rec.name, rec.chrom))
                for _, members in grouped:
                    members = list(members)
                    first = members[0]
                    chrom_end = chrom_sizes[first.chrom]
                    for gcoords in _group_coords(members):
                        left = max(min(gcoords) - window, 0)
                        right = min(max(gcoords) + window, chrom_end)
                        out_handle.write("%s\t%s\t%s\t%s\n" %
                                         (first.chrom, left, right, first.name))
    return bedutils.sort_merge(out_file, data)
def _get_merged_intervals(rm_interval_file, vrn_file, base_dir, data):
    """Pick the intervals to validate on from callable and reference BED files.

    With analysis intervals available, low complexity regions are removed
    from them and, when `rm_interval_file` is given, the result is
    intersected with it into ``<base_dir>/<intervals>-<sample>-<caller>-wrm.bed``.
    Without analysis intervals, `rm_interval_file` is required and is used
    after low complexity region removal.
    """
    a_intervals = get_analysis_intervals(data, vrn_file, base_dir)
    if not a_intervals:
        assert rm_interval_file, "No intervals to subset analysis with for %s" % vrn_file
        return shared.remove_lcr_regions(rm_interval_file, [data])

    final_intervals = shared.remove_lcr_regions(a_intervals, [data])
    if not rm_interval_file:
        return final_intervals

    caller = _get_caller(data)
    sample = dd.get_sample_name(data)
    intervals_base = utils.splitext_plus(os.path.basename(final_intervals))[0]
    combo_intervals = os.path.join(
        base_dir, "%s-%s-%s-wrm.bed" % (intervals_base, sample, caller))
    if not utils.file_uptodate(combo_intervals, final_intervals):
        with file_transaction(data, combo_intervals) as tx_out_file:
            with utils.chdir(os.path.dirname(tx_out_file)):
                # Copy files locally to avoid issues on shared filesystems
                # where BEDtools has trouble accessing the same base
                # files from multiple locations
                local_a = os.path.basename(final_intervals)
                local_b = os.path.basename(rm_interval_file)
                copies = ((final_intervals, local_a), (rm_interval_file, local_b))
                for src, dest in copies:
                    try:
                        shutil.copyfile(src, dest)
                    except IOError:
                        # Single retry after a pause before giving up.
                        time.sleep(60)
                        shutil.copyfile(src, dest)
                template = ("bedtools intersect -nonamecheck -a {a} -b {b} > {tx_out_file}")
                do.run(template.format(a=local_a, b=local_b, tx_out_file=tx_out_file),
                       "Intersect callable intervals for rtg vcfeval")
    return combo_intervals