def run_autopair(args):
    outdir = utils.safe_makedir(args.outdir)
    to_run = []
    extras = []
    for fnames in fastq.combine_pairs(sorted(args.files)):
        if len(fnames) == 2:
            to_run.append(fnames)
        elif len(fnames) == 3:
            r1, r2, r3 = sorted(fnames)
            to_run.append([r1, r2])
            extras.append(r3)
        else:
            assert len(fnames) == 1, fnames
            extras.append(fnames[0])
    ready_to_run = []
    for r1, r2 in to_run:
        target = os.path.commonprefix([r1, r2])
        r3 = None
        for test_r3 in extras:
            if (os.path.commonprefix([r1, test_r3]) == target and
                    os.path.commonprefix([r2, test_r3]) == target):
                r3 = test_r3
                break
        assert r3, (r1, r2, extras)
        base_name = os.path.join(outdir, os.path.commonprefix([r1, r2, r3]).rstrip("_R"))
        ready_to_run.append([base_name, r1, r3, r2, {"algorithm": {}, "resources": {}}])
    parallel = {"type": "local", "cores": len(ready_to_run), "progs": []}
    run_multicore(add_umis_to_fastq_parallel, ready_to_run, {"algorithm": {}}, parallel)
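# Illustrative sketch (not part of bcbio): every snippet in this collection fans work out
# through run_multicore(fn, items, config, parallel), where items is a list of argument
# tuples and parallel describes local cores, e.g. {"type": "local", "cores": N, "progs": []}.
# The names below (_toy_worker, run_multicore_sketch) are hypothetical and only approximate
# that calling convention with multiprocessing; they are not bcbio's implementation.
import multiprocessing


def _toy_worker(args):
    # Hypothetical stand-in for a worker such as add_umis_to_fastq_parallel:
    # receives one argument tuple and returns a result.
    base_name, r1, r2 = args
    return "%s: %s + %s" % (base_name, r1, r2)


def run_multicore_sketch(fn, items, config, parallel):
    # Minimal approximation of the fan-out pattern: apply fn to each argument
    # tuple using the requested number of local cores.
    cores = max(1, parallel.get("cores", 1))
    with multiprocessing.Pool(cores) as pool:
        return pool.map(fn, items)

# Example call mirroring the pattern used throughout these functions:
# results = run_multicore_sketch(_toy_worker,
#                                [("sampleA", "A_R1.fq.gz", "A_R2.fq.gz")],
#                                {"algorithm": {}},
#                                {"type": "local", "cores": 2, "progs": []})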
def run_autopair(args):
    outdir = utils.safe_makedir(args.outdir)
    to_run = []
    extras = []
    for fnames in fastq.combine_pairs(sorted(args.files)):
        if len(fnames) == 2:
            to_run.append(fnames)
        elif len(fnames) == 3:
            r1, r2, r3 = sorted(fnames)
            to_run.append([r1, r2])
            extras.append(r3)
        else:
            assert len(fnames) == 1, fnames
            extras.append(fnames[0])
    ready_to_run = []
    tags = [args.tag1, args.tag2] if args.tag1 and args.tag2 else None
    for r1, r2 in to_run:
        target = _commonprefix([r1, r2])
        if tags:
            base_name = os.path.join(outdir, os.path.basename(_commonprefix([r1, r2])))
            umi = None
        else:
            r3 = None
            for test_r3 in extras:
                if (_commonprefix([r1, test_r3]) == target and
                        _commonprefix([r2, test_r3]) == target):
                    r3 = test_r3
                    break
            assert r3, (r1, r2, extras)
            base_name = os.path.join(outdir, os.path.basename(_commonprefix([r1, r2, r3])))
            r1, r2, umi = _find_umi([r1, r2, r3])
        ready_to_run.append([base_name, r1, r2, umi, tags, {"algorithm": {}, "resources": {}}])
    parallel = {"type": "local", "cores": args.cores, "progs": []}
    run_multicore(add_umis_to_fastq_parallel, ready_to_run, {"algorithm": {}}, parallel)
def _prep_grabix_indexes(in_files, dirs, config):
    if in_files[0].endswith(".bam") and len(in_files) == 1 or in_files[1] is None:
        out = _bgzip_from_bam(in_files[0], dirs, config)
    else:
        out = run_multicore(_bgzip_from_fastq,
                            [[{"in_file": x, "dirs": dirs, "config": config}] for x in in_files if x],
                            config)
    items = [[{"bgzip_file": x, "config": copy.deepcopy(config)}] for x in out if x]
    run_multicore(_grabix_index, items, config)
    return out
def _run_cnvkit_shared(items, test_bams, background_bams, work_dir, background_name=None):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.
    """
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_cnn = os.path.join(raw_work_dir,
                                  "%s_background.cnn" % (background_name if background_name else "flat"))
    ckouts = []
    for test_bam in test_bams:
        out_base = _bam_to_outbase(test_bam, raw_work_dir)
        ckouts.append({"cnr": "%s.cnr" % out_base,
                       "cns": "%s.cns" % out_base,
                       "back_cnn": background_cnn})
    if not utils.file_exists(ckouts[0]["cnr"]):
        data = items[0]
        cov_interval = dd.get_coverage_interval(data)
        raw_target_bed, access_bed = _get_target_access_files(cov_interval, data, work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, data)
        parallel = {"type": "local", "cores": dd.get_cores(data), "progs": ["cnvkit"]}
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed, access_bed, cov_interval,
                                                     raw_work_dir, data)
        def _bam_to_itype(bam):
            return "background" if bam in background_bams else "evaluate"
        split_cnns = run_multicore(_cnvkit_coverage,
                                   [(bam, bed, _bam_to_itype(bam), raw_work_dir, data)
                                    for bam in test_bams + background_bams
                                    for bed in _split_bed(target_bed, data) + _split_bed(antitarget_bed, data)],
                                   data["config"], parallel)
        coverage_cnns = _merge_coverage(split_cnns, data)
        background_cnn = _cnvkit_background([x["file"] for x in coverage_cnns if x["itype"] == "background"],
                                            background_cnn, target_bed, antitarget_bed, data)
        fixed_cnrs = run_multicore(_cnvkit_fix,
                                   [(cnns, background_cnn, data)
                                    for cnns in tz.groupby("bam", [x for x in coverage_cnns
                                                                   if x["itype"] == "evaluate"]).values()],
                                   data["config"], parallel)
        called_segs = run_multicore(_cnvkit_segment,
                                    [(cnr, cov_interval, data) for cnr in fixed_cnrs],
                                    data["config"], parallel)
    return ckouts
def _prep_grabix_indexes(in_files, dirs, data):
    if _is_bam_input(in_files):
        out = _bgzip_from_bam(in_files[0], dirs, data["config"])
    elif _is_cram_input(in_files):
        out = _bgzip_from_cram(in_files[0], dirs, data)
    else:
        out = run_multicore(_bgzip_from_fastq,
                            [[{"in_file": x, "dirs": dirs, "config": data["config"]}] for x in in_files if x],
                            data["config"])
    items = [[{"bgzip_file": x, "config": copy.deepcopy(data["config"])}] for x in out if x]
    run_multicore(_grabix_index, items, data["config"])
    return out
def _prep_grabix_indexes(in_files, dirs, data):
    if _is_bam_input(in_files):
        out = _bgzip_from_bam(in_files[0], dirs, data["config"])
    elif _is_cram_input(in_files):
        out = _bgzip_from_cram(in_files[0], dirs, data)
    else:
        inputs = [{"in_file": x, "dirs": dirs, "config": data["config"], "rgnames": data["rgnames"]}
                  for x in in_files if x]
        if "pbgzip" not in dd.get_tools_off(data):
            out = [_bgzip_from_fastq(d) for d in inputs]
        else:
            out = run_multicore(_bgzip_from_fastq_parallel, [[d] for d in inputs], data["config"])
    items = [[{"bgzip_file": x, "config": copy.deepcopy(data["config"])}] for x in out if x]
    run_multicore(_grabix_index, items, data["config"])
    return out
def _run_cnvkit_shared(inputs, backgrounds):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.
    """
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_name = dd.get_sample_name(backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir, "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(os.path.join(_sv_workdir(cur_input), "raw"))
        out_base = _bam_to_outbase(dd.get_align_bam(cur_input), cur_raw_work_dir)
        ckouts.append({"cnr": "%s.cnr" % out_base,
                       "cns": "%s.cns" % out_base,
                       "back_cnn": background_cnn})
    if not utils.file_exists(ckouts[0]["cnr"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        raw_target_bed, access_bed = _get_target_access_files(cov_interval, inputs[0], work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, inputs[0])
        parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
        pct_coverage = (pybedtools.BedTool(raw_target_bed).total_coverage() /
                        float(pybedtools.BedTool(access_bed).total_coverage())) * 100.0
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed, access_bed, cov_interval,
                                                     pct_coverage, raw_work_dir, inputs[0])
        split_beds = _split_bed(target_bed, inputs[0]) + _split_bed(antitarget_bed, inputs[0])
        samples_to_run = zip(["background"] * len(backgrounds), backgrounds) + \
                         zip(["evaluate"] * len(inputs), inputs)
        split_cnns = run_multicore(_cnvkit_coverage,
                                   [(cdata, bed, itype) for itype, cdata in samples_to_run
                                    for bed in split_beds],
                                   inputs[0]["config"], parallel)
        raw_coverage_cnns = _merge_coverage(split_cnns, inputs[0])
        coverage_cnns = run_multicore(_cnvkit_metrics,
                                      [(cnns, target_bed, antitarget_bed, cov_interval, inputs + backgrounds)
                                       for cnns in tz.groupby("bam", raw_coverage_cnns).values()],
                                      inputs[0]["config"], parallel)
        background_cnn = _cnvkit_background(_select_background_cnns(coverage_cnns),
                                            background_cnn, target_bed, antitarget_bed, inputs[0])
        fixed_cnrs = run_multicore(_cnvkit_fix,
                                   [(cnns, background_cnn, inputs + backgrounds)
                                    for cnns in tz.groupby("bam", [x for x in coverage_cnns
                                                                   if x["itype"] == "evaluate"]).values()],
                                   inputs[0]["config"], parallel)
        run_multicore(_cnvkit_segment,
                      [(cnr, cov_interval, data) for cnr, data in fixed_cnrs],
                      inputs[0]["config"], parallel)
    return ckouts
def run(items): """Perform detection of structural variations with delly. """ work_dir = utils.safe_makedir(os.path.join(items[0]["dirs"]["work"], "structural", items[0]["name"][-1], "delly")) work_bams = [data["align_bam"] for data in items] ref_file = utils.get_in(items[0], ("reference", "fasta", "base")) # Add core request for delly config = copy.deepcopy(items[0]["config"]) delly_config = utils.get_in(config, ("resources", "delly"), {}) delly_config["cores"] = len(items) config["resources"]["delly"] = delly_config parallel = {"type": "local", "cores": config["algorithm"].get("num_cores", 1), "progs": ["delly"]} sv_types = ["DEL", "DUP", "INV"] # "TRA" has invalid VCF END specifications that GATK doesn't like with closing(pysam.Samfile(work_bams[0], "rb")) as pysam_work_bam: bytype_vcfs = run_multicore(_run_delly, [(work_bams, chrom, sv_type, ref_file, work_dir, items) for (chrom, sv_type) in itertools.product(pysam_work_bam.references, sv_types)], config, parallel) out_file = "%s.vcf.gz" % os.path.commonprefix(bytype_vcfs) delly_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file, items[0]["config"]) out = [] for data in items: if "sv" not in data: data["sv"] = {} data["sv"]["delly"] = delly_vcf out.append(data) return out
def _prep_grabix_indexes(in_files, data):
    """Parallel preparation of grabix indexes for files.
    """
    # if we have gzipped but not bgzipped, add a fake index for CWL support
    # Also skips bgzip indexing if we don't need alignment splitting
    if _ready_gzip_fastq(in_files, data) and (not _ready_gzip_fastq(in_files, data, require_bgzip=True) or
                                              dd.get_align_split_size(data) is False):
        for in_file in in_files:
            if not utils.file_exists(in_file + ".gbi"):
                with file_transaction(data, in_file + ".gbi") as tx_gbi_file:
                    with open(tx_gbi_file, "w") as out_handle:
                        out_handle.write("Not grabix indexed; index added for compatibility.\n")
    else:
        items = [[{"bgzip_file": x, "config": copy.deepcopy(data["config"])}] for x in in_files if x]
        run_multicore(_grabix_index, items, data["config"])
    return data
def prep_fastq_inputs(in_files, data):
    """Prepare bgzipped fastq inputs
    """
    if len(in_files) == 1 and _is_bam_input(in_files):
        out = _bgzip_from_bam(in_files[0], data["dirs"], data)
    elif len(in_files) == 1 and _is_cram_input(in_files):
        out = _bgzip_from_cram(in_files[0], data["dirs"], data)
    elif len(in_files) in [1, 2] and _ready_gzip_fastq(in_files, data):
        out = _symlink_in_files(in_files, data)
    else:
        if len(in_files) > 2:
            fpairs = fastq.combine_pairs(in_files)
            pair_types = set([len(xs) for xs in fpairs])
            assert len(pair_types) == 1
            fpairs.sort(key=lambda x: os.path.basename(x[0]))
            organized = [[xs[0] for xs in fpairs]]
            if len(fpairs[0]) > 1:
                organized.append([xs[1] for xs in fpairs])
            in_files = organized
        parallel = {"type": "local", "num_jobs": len(in_files),
                    "cores_per_job": max(1, data["config"]["algorithm"]["num_cores"] // len(in_files))}
        inputs = [{"in_file": x, "read_num": i, "dirs": data["dirs"], "config": data["config"],
                   "is_cwl": "cwl_keys" in data, "rgnames": data["rgnames"]}
                  for i, x in enumerate(in_files) if x]
        out = run_multicore(_bgzip_from_fastq_parallel, [[d] for d in inputs], data["config"], parallel)
    return out
def concat_variant_files(orig_files, out_file, regions, ref_file, config):
    """Concatenate multiple variant files from regions into a single output file.

    Lightweight approach to merging VCF files split by regions with the same
    sample information, so no complex merging needed. Handles both plain text
    and bgzipped/tabix indexed outputs.

    Falls back to slower CombineVariants if this fails due to GATK stringency issues.
    """
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            sorted_files = _sort_by_region(orig_files, regions, ref_file, config)
            exist_files = [x for x in sorted_files if os.path.exists(x)]
            ready_files = run_multicore(p_bgzip_and_index, [[x, config] for x in exist_files], config)
            input_file_list = "%s-files.list" % utils.splitext_plus(out_file)[0]
            with open(input_file_list, "w") as out_handle:
                for fname in ready_files:
                    out_handle.write(fname + "\n")
            params = ["org.broadinstitute.gatk.tools.CatVariants",
                      "-R", ref_file,
                      "-V", input_file_list,
                      "-out", tx_out_file,
                      "-assumeSorted"]
            jvm_opts = broad.get_gatk_framework_opts(config, include_gatk=False)
            cmd = [config_utils.get_program("gatk-framework", config)] + params + jvm_opts
            try:
                do.run(cmd, "Concat variant files", log_error=False)
            except subprocess.CalledProcessError as msg:
                if str(msg).find("We require all VCFs to have complete VCF headers") >= 0:
                    return combine_variant_files(orig_files, out_file, ref_file, config)
                else:
                    raise
    return out_file
def parallel_combine_variants(orig_files, out_file, ref_file, config, run_parallel):
    """Combine variants in parallel by chromosome, concatenating final outputs.
    """
    file_key = "vcf_files"
    def split_by_region(data):
        base, ext = utils.splitext_plus(os.path.basename(out_file))
        args = []
        for region in [x.name for x in ref.file_contigs(ref_file, config)]:
            region_out = os.path.join(os.path.dirname(out_file), "%s-regions" % base,
                                      "%s-%s%s" % (base, region, ext))
            utils.safe_makedir(os.path.dirname(region_out))
            args.append((region_out, ref_file, config, region))
        return out_file, args
    config = copy.deepcopy(config)
    config["file_key"] = file_key
    prep_files = run_multicore(p_bgzip_and_index, [[x, config] for x in orig_files], config)
    items = [[{file_key: prep_files}]]
    parallel_split_combine(items, split_by_region, run_parallel,
                           "merge_variant_files", "concat_variant_files", file_key,
                           ["region", "sam_ref", "config"], split_outfile_i=0)
    return out_file
def run(items, background=None):
    """Detect copy number variations from batched set of samples using cn.mops.
    """
    if not background:
        background = []
    names = [tz.get_in(["rgnames", "sample"], x) for x in items + background]
    work_bams = [x["align_bam"] for x in items + background]
    if len(items + background) < 2:
        raise ValueError("cn.mops only works on batches with multiple samples")
    data = items[0]
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural",
                                               names[0], "cn_mops"))
    parallel = {"type": "local", "cores": data["config"]["algorithm"].get("num_cores", 1),
                "progs": ["delly"]}
    with closing(pysam.Samfile(work_bams[0], "rb")) as pysam_work_bam:
        chroms = [None] if _get_regional_bed_file(items[0]) else pysam_work_bam.references
        out_files = run_multicore(_run_on_chrom,
                                  [(chrom, work_bams, names, work_dir, items) for chrom in chroms],
                                  data["config"], parallel)
    out_file = _combine_out_files(out_files, work_dir, data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        data["sv"].append({"variantcaller": "cn_mops",
                           "vrn_file": _prep_sample_cnvs(out_file, data)})
        out.append(data)
    return out
def concat_variant_files(orig_files, out_file, regions, ref_file, config):
    """Concatenate multiple variant files from regions into a single output file.

    Lightweight approach to merging VCF files split by regions with the same
    sample information, so no complex merging needed. Handles both plain text
    and bgzipped/tabix indexed outputs.
    """
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            sorted_files = _sort_by_region(orig_files, regions, ref_file, config)
            filtered_files = [x for x in sorted_files if vcf_has_variants(x)]
            if len(filtered_files) > 0 and filtered_files[0].endswith(".gz"):
                filtered_files = run_multicore(p_bgzip_and_index,
                                               [[x, config] for x in filtered_files], config)
            input_vcf_file = "%s-files.txt" % utils.splitext_plus(out_file)[0]
            with open(input_vcf_file, "w") as out_handle:
                for fname in filtered_files:
                    out_handle.write(fname + "\n")
            if len(filtered_files) > 0:
                compress_str = "| bgzip -c " if out_file.endswith(".gz") else ""
                cmd = "vcfcat `cat {input_vcf_file}` {compress_str} > {tx_out_file}"
                do.run(cmd.format(**locals()), "Concatenate variants")
            else:
                # try to rescue sample names from individual vcf files
                my_samples = None
                for vrn_file in sorted_files:
                    if vrn_file.endswith(".gz"):
                        tabix_index(vrn_file, config)
                    my_reader = vcf.Reader(filename=vrn_file)
                    if len(my_reader.samples) > 0:
                        my_samples = my_reader.samples[:]
                        break
                write_empty_vcf(tx_out_file, None, my_samples)
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    return out_file
def combine_variant_files(orig_files, out_file, ref_file, config,
                          quiet_out=True, region=None):
    """Combine VCF files from the same sample into a single output file.

    Handles cases where we split files into SNPs/Indels for processing then
    need to merge back into a final file.
    """
    in_pipeline = False
    if isinstance(orig_files, dict):
        file_key = config["file_key"]
        in_pipeline = True
        orig_files = orig_files[file_key]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            exist_files = [x for x in orig_files if os.path.exists(x)]
            ready_files = run_multicore(p_bgzip_and_index, [[x, config] for x in exist_files], config)
            dict_file = "%s.dict" % utils.splitext_plus(ref_file)[0]
            cores = dd.get_num_cores({"config": config})
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            cmd = ["picard"] + broad.get_picard_opts(config, memscale) + \
                  ["MergeVcfs", "D=%s" % dict_file, "O=%s" % tx_out_file] + \
                  ["I=%s" % f for f in ready_files]
            cmd = "%s && %s" % (utils.get_java_clprep(), " ".join(cmd))
            do.run(cmd, "Combine variant files")
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    if in_pipeline:
        return [{file_key: out_file, "region": region, "sam_ref": ref_file, "config": config}]
    else:
        return out_file
def run(items): """Perform detection of structural variations with delly. """ work_dir = utils.safe_makedir(os.path.join(items[0]["dirs"]["work"], "structural", items[0]["name"][-1], "delly")) work_bams = [data["align_bam"] for data in items] ref_file = utils.get_in(items[0], ("reference", "fasta", "base")) # Add core request for delly config = copy.deepcopy(items[0]["config"]) delly_config = utils.get_in(config, ("resources", "delly"), {}) delly_config["cores"] = len(items) config["resources"]["delly"] = delly_config parallel = {"type": "local", "cores": config["algorithm"].get("num_cores", 1), "progs": ["delly"]} bytype_vcfs = run_multicore(_run_delly, [(work_bams, sv_type, ref_file, work_dir, items) for sv_type in ["DEL", "DUP", "INV", "TRA"]], config, parallel) out_file = "%s.vcf.gz" % os.path.commonprefix(bytype_vcfs) delly_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file, items[0]["config"]) out = [] for data in items: if "sv" not in data: data["sv"] = {} data["sv"]["delly"] = delly_vcf out.append(data) return out
def run_autopair(args):
    outdir = utils.safe_makedir(args.outdir)
    to_run = []
    extras = []
    for fnames in fastq.combine_pairs(sorted(args.files)):
        if len(fnames) == 2:
            to_run.append(fnames)
        elif len(fnames) == 3:
            r1, r2, r3 = sorted(fnames)
            to_run.append([r1, r2])
            extras.append(r3)
        else:
            assert len(fnames) == 1, fnames
            extras.append(fnames[0])
    ready_to_run = []
    tags = [args.tag1, args.tag2] if args.tag1 and args.tag2 else None
    if not tags:
        # Aim for 2 or 3 simultaneous processes, each with multiple cores
        target_processes = 2
        process_cores = max(1, (args.cores // target_processes) + (args.cores % target_processes))
        overall_processes = max(1, int(math.ceil(args.cores / float(process_cores))))
    else:
        process_cores = 1
        overall_processes = args.cores
    for r1, r2 in to_run:
        target = _commonprefix([r1, r2])
        if tags:
            base_name = os.path.join(outdir, os.path.basename(_commonprefix([r1, r2])))
            umi = None
        else:
            r3 = None
            for test_r3 in extras:
                if (_commonprefix([r1, test_r3]) == target and
                        _commonprefix([r2, test_r3]) == target):
                    r3 = test_r3
                    break
            assert r3, (r1, r2, extras)
            base_name = os.path.join(outdir, os.path.basename(_commonprefix([r1, r2, r3])))
            r1, r2, umi = _find_umi([r1, r2, r3])
        # fastp handles a single pair of reads so we split processing to run on each
        if umi and not tags:
            ready_to_run.append([base_name, r1, None, umi, None, process_cores,
                                 {"algorithm": {}, "resources": {}}])
            ready_to_run.append([base_name, None, r2, umi, None, process_cores,
                                 {"algorithm": {}, "resources": {}}])
        else:
            ready_to_run.append([base_name, r1, r2, umi, tags, process_cores,
                                 {"algorithm": {}, "resources": {}}])
    parallel = {"type": "local", "cores": overall_processes, "progs": []}
    run_multicore(add_umis_to_fastq_parallel, ready_to_run, {"algorithm": {}}, parallel)
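# Worked example of the core-splitting arithmetic in run_autopair above (illustrative only,
# values chosen for clarity): with args.cores = 8 and target_processes = 2,
# process_cores = max(1, 8 // 2 + 8 % 2) = 4 and overall_processes = ceil(8 / 4.0) = 2,
# i.e. two simultaneous jobs with 4 cores each. With args.cores = 5,
# process_cores = max(1, 2 + 1) = 3 and overall_processes = ceil(5 / 3.0) = 2.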
def run(items): """Perform detection of structural variations with delly. Performs post-call filtering with a custom filter tuned based on NA12878 Moleculo and PacBio data, using calls prepared by @ryanlayer and @cc2qe Filters using the high quality variant pairs (DV) compared with high quality reference pairs (DR). """ work_dir = utils.safe_makedir(os.path.join(items[0]["dirs"]["work"], "structural", items[0]["name"][-1], "delly")) # Add core request for delly config = copy.deepcopy(items[0]["config"]) delly_config = utils.get_in(config, ("resources", "delly"), {}) delly_config["cores"] = 1 config["resources"]["delly"] = delly_config parallel = {"type": "local", "cores": config["algorithm"].get("num_cores", 1), "progs": ["delly"]} work_bams = run_multicore(_prep_subsampled_bams, [(data, work_dir) for data in items], config, parallel) ref_file = utils.get_in(items[0], ("reference", "fasta", "base")) sv_types = ["DEL", "DUP"] # "TRA" has invalid VCF END specifications that GATK doesn't like, "INV" very slow exclude_file = _get_full_exclude_file(items, work_dir) bytype_vcfs = run_multicore(_run_delly, [(work_bams, chrom, sv_type, ref_file, work_dir, items) for (chrom, sv_type) in itertools.product(sshared.get_sv_chroms(items, exclude_file), sv_types)], config, parallel) out_file = "%s.vcf.gz" % sshared.outname_from_inputs(bytype_vcfs) combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file, config) out = [] for data in items: if "sv" not in data: data["sv"] = [] base, ext = utils.splitext_plus(combo_vcf) sample = tz.get_in(["rgnames", "sample"], data) delly_sample_vcf = vcfutils.select_sample(combo_vcf, sample, "%s-%s%s" % (base, sample, ext), data["config"]) delly_vcf = _delly_count_evidence_filter(delly_sample_vcf, data) data["sv"].append({"variantcaller": "delly", "vrn_file": delly_vcf, "exclude": exclude_file}) out.append(data) return out
def concat_variant_files_bcftools(orig_files, out_file, config):
    if not utils.file_exists(out_file):
        exist_files = [x for x in orig_files if os.path.exists(x)]
        ready_files = run_multicore(p_bgzip_and_index, [[x, config] for x in exist_files], config)
        input_file_list = "%s-files.list" % utils.splitext_plus(out_file)[0]
        with open(input_file_list, "w") as out_handle:
            for fname in ready_files:
                out_handle.write(fname + "\n")
        return _run_concat_variant_files_bcftools(input_file_list, out_file, config)
    else:
        return bgzip_and_index(out_file, config)
def _run_cnvkit_shared_orig(inputs, backgrounds):
    """Original CNVkit implementation with full normalization and segmentation.
    """
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_name = dd.get_sample_name(backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir, "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(os.path.join(_sv_workdir(cur_input), "raw"))
        out_base, out_base_old = _bam_to_outbase(dd.get_align_bam(cur_input), cur_raw_work_dir, cur_input)
        if utils.file_exists(out_base_old + ".cns"):
            out_base = out_base_old
        ckouts.append({"cnr": "%s.cnr" % out_base,
                       "cns": "%s.cns" % out_base})
    if not utils.file_exists(ckouts[0]["cns"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        samples_to_run = list(zip(["background"] * len(backgrounds), backgrounds)) + \
                         list(zip(["evaluate"] * len(inputs), inputs))
        # New style shared SV bins
        if tz.get_in(["depth", "bins", "target"], inputs[0]):
            target_bed = tz.get_in(["depth", "bins", "target"], inputs[0])
            antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], inputs[0])
            raw_coverage_cnns = reduce(operator.add,
                                       [_get_general_coverage(cdata, itype) for itype, cdata in samples_to_run])
        # Back compatible with pre-existing runs
        else:
            target_bed, antitarget_bed = _get_original_targets(inputs[0])
            raw_coverage_cnns = reduce(operator.add,
                                       [_get_original_coverage(cdata, itype) for itype, cdata in samples_to_run])
        # Currently metrics not calculated due to speed and needing re-evaluation
        # We could re-enable with larger truth sets to evaluate background noise
        # But want to reimplement in a more general fashion as part of normalization
        if False:
            coverage_cnns = reduce(operator.add,
                                   [_cnvkit_metrics(cnns, target_bed, antitarget_bed, cov_interval,
                                                    inputs + backgrounds)
                                    for cnns in tz.groupby("bam", raw_coverage_cnns).values()])
            background_cnn = cnvkit_background(_select_background_cnns(coverage_cnns),
                                               background_cnn, inputs, target_bed, antitarget_bed)
        else:
            coverage_cnns = raw_coverage_cnns
            background_cnn = cnvkit_background([x["file"] for x in coverage_cnns if x["itype"] == "background"],
                                               background_cnn, inputs, target_bed, antitarget_bed)
        parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
        fixed_cnrs = run_multicore(_cnvkit_fix,
                                   [(cnns, background_cnn, inputs, ckouts)
                                    for cnns in tz.groupby("bam", [x for x in coverage_cnns
                                                                   if x["itype"] == "evaluate"]).values()],
                                   inputs[0]["config"], parallel)
        [_cnvkit_segment(cnr, cov_interval, data, inputs + backgrounds) for cnr, data in fixed_cnrs]
    return ckouts
def _get_file_list(orig_files, out_file, regions, ref_file, config):
    """Create file with region sorted list of non-empty VCFs for concatenating.
    """
    sorted_files = _sort_by_region(orig_files, regions, ref_file, config)
    exist_files = [x for x in sorted_files if os.path.exists(x) and vcf_has_variants(x)]
    if len(exist_files) == 0:  # no non-empty inputs, merge the empty ones
        exist_files = [x for x in sorted_files if os.path.exists(x)]
    ready_files = run_multicore(p_bgzip_and_index, [[x, config] for x in exist_files], config)
    input_file_list = "%s-files.list" % utils.splitext_plus(out_file)[0]
    with open(input_file_list, "w") as out_handle:
        for fname in ready_files:
            out_handle.write(fname + "\n")
    return input_file_list
def combine_variant_files(orig_files, out_file, ref_file, config,
                          quiet_out=True, region=None):
    """Combine VCF files from the same sample into a single output file.

    Handles cases where we split files into SNPs/Indels for processing then
    need to merge back into a final file.

    Will parallelize up to 4 cores based on documented recommendations:
    https://www.broadinstitute.org/gatk/gatkdocs/
    org_broadinstitute_gatk_tools_walkers_variantutils_CombineVariants.php
    """
    in_pipeline = False
    if isinstance(orig_files, dict):
        file_key = config["file_key"]
        in_pipeline = True
        orig_files = orig_files[file_key]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            exist_files = [x for x in orig_files if os.path.exists(x)]
            ready_files = run_multicore(p_bgzip_and_index, [[x, config] for x in exist_files], config)
            params = ["-T", "CombineVariants",
                      "-R", ref_file,
                      "--out", tx_out_file]
            priority_order = []
            for i, ready_file in enumerate(ready_files):
                name = "v%s" % i
                params.extend(["--variant:{name}".format(name=name), ready_file])
                priority_order.append(name)
            params.extend(["--rod_priority_list", ",".join(priority_order)])
            params.extend(["--genotypemergeoption", "PRIORITIZE"])
            if quiet_out:
                params.extend(["--suppressCommandLineHeader", "--setKey", "null"])
            if region:
                variant_regions = config["algorithm"].get("variant_regions", None)
                cur_region = shared.subset_variant_regions(variant_regions, region, out_file)
                if cur_region:
                    params += ["-L", bamprep.region_to_gatk(cur_region),
                               "--interval_set_rule", "INTERSECTION"]
            cores = tz.get_in(["algorithm", "num_cores"], config, 1)
            if cores > 1:
                params += ["-nt", min(cores, 4)]
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            jvm_opts = broad.get_gatk_framework_opts(config, memscale=memscale)
            cmd = [config_utils.get_program("gatk-framework", config)] + jvm_opts + params
            do.run(cmd, "Combine variant files")
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    if in_pipeline:
        return [{file_key: out_file, "region": region, "sam_ref": ref_file, "config": config}]
    else:
        return out_file
def _normalize_sv_coverage_cnvkit(group_id, inputs, backgrounds, work_dir, back_files, out_files):
    """Normalize CNV coverage depths by GC, repeats and background using CNVkit

    - reference: calculates reference backgrounds from normals and pools
      including GC and repeat information
    - fix: Uses background to normalize coverage estimations
    http://cnvkit.readthedocs.io/en/stable/pipeline.html#fix
    """
    from bcbio.structural import cnvkit
    cnns = reduce(operator.add, [[tz.get_in(["depth", "bins", "target"], x),
                                  tz.get_in(["depth", "bins", "antitarget"], x)]
                                 for x in backgrounds], [])
    for d in inputs:
        if tz.get_in(["depth", "bins", "target"], d):
            target_bed = tz.get_in(["depth", "bins", "target"], d)
            antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], d)
    input_backs = set(filter(lambda x: x is not None,
                             [dd.get_background_cnv_reference(d, "cnvkit") for d in inputs]))
    if input_backs:
        assert len(input_backs) == 1, "Multiple backgrounds in group: %s" % list(input_backs)
        back_file = list(input_backs)[0]
    else:
        back_file = cnvkit.cnvkit_background(cnns,
                                             os.path.join(work_dir, "background-%s-cnvkit.cnn" % (group_id)),
                                             backgrounds or inputs, target_bed, antitarget_bed)
    fix_cmd_inputs = []
    for data in inputs:
        work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                                   dd.get_sample_name(data), "bins"))
        if tz.get_in(["depth", "bins", "target"], data):
            fix_file = os.path.join(work_dir, "%s-normalized.cnr" % (dd.get_sample_name(data)))
            fix_cmd_inputs.append((tz.get_in(["depth", "bins", "target"], data),
                                   tz.get_in(["depth", "bins", "antitarget"], data),
                                   back_file, fix_file, data))
            out_files[dd.get_sample_name(data)] = fix_file
            back_files[dd.get_sample_name(data)] = back_file
    parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
    run_multicore(cnvkit.run_fix_parallel, fix_cmd_inputs, inputs[0]["config"], parallel)
    return back_files, out_files
def _do_merge(orig_files, out_file, config, region):
    """Do the actual work of merging with bcftools merge.
    """
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            with short_filenames(run_multicore(p_bgzip_and_index,
                                               [[x, config] for x in orig_files], config)) as fs:
                prep_files = " ".join(fs)
                bcftools = config_utils.get_program("bcftools", config)
                output_type = "z" if out_file.endswith(".gz") else "v"
                region_str = "-r {}".format(region) if region else ""
                cmd = "{bcftools} merge -O {output_type} {region_str} {prep_files} > {tx_out_file}"
                do.run(cmd.format(**locals()), "Merge variants")
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    return out_file
def _run_wham(inputs, background_bams):
    """Run WHAM on a defined set of inputs and targets.
    """
    out_file = os.path.join(_sv_workdir(inputs[0]), "%s-wham.vcf.gz" % dd.get_sample_name(inputs[0]))
    if not utils.file_exists(out_file):
        with file_transaction(inputs[0], out_file) as tx_out_file:
            coords = chromhacks.autosomal_or_x_coords(dd.get_ref_file(inputs[0]))
            parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": []}
            rs = run_multicore(_run_wham_coords,
                               [(inputs, background_bams, coord, out_file) for coord in coords],
                               inputs[0]["config"], parallel)
            rs = {coord: fname for (coord, fname) in rs}
            vcfutils.concat_variant_files([rs[c] for c in coords], tx_out_file, coords,
                                          dd.get_ref_file(inputs[0]), inputs[0]["config"])
    return out_file
def _cram_to_fastq_regions(regions, cram_file, dirs, data):
    """Convert CRAM files to fastq, potentially within sub regions.

    Returns multiple fastq files that can be merged back together.
    """
    base_name = utils.splitext_plus(os.path.basename(cram_file))[0]
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep",
                                               "%s-parts" % base_name))
    fnames = run_multicore(_cram_to_fastq_region,
                           [(cram_file, work_dir, base_name, region, data) for region in regions],
                           data["config"])
    # check if we have paired or single end data
    if any(not _is_gzip_empty(p1) for p1, p2, s in fnames):
        out = [[p1, p2] for p1, p2, s in fnames]
    else:
        out = [[s] for p1, p2, s in fnames]
    return out, work_dir
def _prep_fastq_inputs(in_files, data):
    """Prepare bgzipped fastq inputs
    """
    if _is_bam_input(in_files):
        out = _bgzip_from_bam(in_files[0], data["dirs"], data)
    elif _is_cram_input(in_files):
        out = _bgzip_from_cram(in_files[0], data["dirs"], data)
    elif _ready_bgzip_fastq(in_files, data):
        out = in_files
    else:
        parallel = {"type": "local", "num_jobs": len(in_files),
                    "cores_per_job": max(1, data["config"]["algorithm"]["num_cores"] // len(in_files))}
        inputs = [{"in_file": x, "read_num": i, "dirs": data["dirs"], "config": data["config"],
                   "is_cwl": "cwl_keys" in data, "rgnames": data["rgnames"]}
                  for i, x in enumerate(in_files) if x]
        out = run_multicore(_bgzip_from_fastq_parallel, [[d] for d in inputs], data["config"], parallel)
    return out
def _run_wham(inputs, background_bams):
    """Run WHAM on a defined set of inputs and targets.
    """
    out_file = os.path.join(_sv_workdir(inputs[0]), "%s-wham.bedpe" % dd.get_sample_name(inputs[0]))
    if not utils.file_exists(out_file):
        with file_transaction(inputs[0], out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                coords = chromhacks.autosomal_or_x_coords(dd.get_ref_file(inputs[0]))
                parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["wham"]}
                rs = run_multicore(_run_wham_coords,
                                   [(inputs, background_bams, coord, out_file) for coord in coords],
                                   inputs[0]["config"], parallel)
                rs = {coord: fname for (coord, fname) in rs}
                for coord in coords:
                    with open(rs[coord]) as in_handle:
                        shutil.copyfileobj(in_handle, out_handle)
    return out_file
def _do_merge(orig_files, out_file, config, region):
    """Do the actual work of merging with bcftools merge.
    """
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            _check_samples_nodups(orig_files)
            prep_files = run_multicore(p_bgzip_and_index, [[x, config] for x in orig_files], config)
            input_vcf_file = "%s-files.txt" % utils.splitext_plus(out_file)[0]
            with open(input_vcf_file, "w") as out_handle:
                for fname in prep_files:
                    out_handle.write(fname + "\n")
            bcftools = config_utils.get_program("bcftools", config)
            output_type = "z" if out_file.endswith(".gz") else "v"
            region_str = "-r {}".format(region) if region else ""
            cmd = "{bcftools} merge -O {output_type} {region_str} `cat {input_vcf_file}` > {tx_out_file}"
            do.run(cmd.format(**locals()), "Merge variants")
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    return out_file
def run(items): """Perform detection of structural variations with delly. Performs post-call filtering with a custom filter tuned based on NA12878 Moleculo and PacBio data, using calls prepared by @ryanlayer and @cc2qe Filters using the high quality variant pairs (DV) compared with high quality reference pairs (DR). """ work_dir = utils.safe_makedir(os.path.join(items[0]["dirs"]["work"], "structural", items[0]["name"][-1], "delly")) work_bams = [data["align_bam"] for data in items] ref_file = utils.get_in(items[0], ("reference", "fasta", "base")) # Add core request for delly config = copy.deepcopy(items[0]["config"]) delly_config = utils.get_in(config, ("resources", "delly"), {}) delly_config["cores"] = len(items) config["resources"]["delly"] = delly_config parallel = {"type": "local", "cores": config["algorithm"].get("num_cores", 1), "progs": ["delly"]} sv_types = ["DEL", "DUP", "INV"] # "TRA" has invalid VCF END specifications that GATK doesn't like with closing(pysam.Samfile(work_bams[0], "rb")) as pysam_work_bam: bytype_vcfs = run_multicore(_run_delly, [(work_bams, chrom, sv_type, ref_file, work_dir, items) for (chrom, sv_type) in itertools.product(pysam_work_bam.references, sv_types)], config, parallel) out_file = "%s.vcf.gz" % os.path.commonprefix(bytype_vcfs) combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file, items[0]["config"]) out = [] for data in items: if "sv" not in data: data["sv"] = [] base, ext = utils.splitext_plus(combo_vcf) sample = tz.get_in(["rgnames", "sample"], data) delly_sample_vcf = vcfutils.select_sample(combo_vcf, sample, "%s-%s%s" % (base, sample, ext), data["config"]) delly_vcf = vfilter.hard_w_expression(delly_sample_vcf, "FMT/DV < 4 || (FMT/DV / (FMT/DV + FMT/DR)) < 0.2", data, name="DVSupport") data["sv"].append({"variantcaller": "delly", "vrn_file": delly_vcf}) out.append(data) return out