# These helpers come from bcbio-nextgen and assume its module-level imports
# (utils, dd, ref, prun, shared, parallel_split_combine, ...); only the stdlib
# imports are reproduced here, since the bcbio import paths vary between the
# versions collected below.
import copy
import os

def _run_variantcall_batch_multicore(items, regions, final_file):
    """Run variant calling on a batch of items using multiple cores.
    """
    batch_name = _get_batch_name(items)
    variantcaller = _get_batch_variantcaller(items)
    work_bams = [dd.get_work_bam(d) or dd.get_align_bam(d) for d in items]

    def split_fn(data):
        out = []
        for region in regions:
            region = _region_to_coords(region)
            chrom, start, end = region
            region_str = "_".join(str(x) for x in region)
            out_file = os.path.join(dd.get_work_dir(items[0]), variantcaller, chrom,
                                    "%s-%s.vcf.gz" % (batch_name, region_str))
            out.append((region, work_bams, out_file))
        return final_file, out
    parallel = {"type": "local", "num_jobs": dd.get_num_cores(items[0]),
                "cores_per_job": 1}
    run_parallel = dmulti.runner(parallel, items[0]["config"])
    to_run = copy.deepcopy(items[0])
    to_run["sam_ref"] = dd.get_ref_file(to_run)
    to_run["group_orig"] = items
    parallel_split_combine([[to_run]], split_fn, run_parallel,
                           "variantcall_sample", "concat_variant_files",
                           "vrn_file", ["region", "sam_ref", "config"])
    return final_file
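# All of the split/combine helpers above and below share one contract: a split
# function maps a grouped item to (final_output, [per-region argument tuples]),
# the per-region jobs run in parallel, and a combine step folds the region
# outputs back into final_output. A minimal serial mock of that contract, for
# illustration only -- toy_split_combine, call_region, toy_split, and the
# sample dicts are assumptions, not bcbio's actual implementation:

def toy_split_combine(items, split_fn, process_one, combine):
    """Split each item into region jobs, run them, then combine the outputs."""
    out = []
    for item in items:
        final_file, region_args = split_fn(item)
        # bcbio dispatches these through a parallel runner; run serially here.
        region_outputs = [process_one(*args) for args in region_args]
        out.append(combine(region_outputs, final_file))
    return out

def call_region(region, bams, out_file):
    # Stand-in for per-region variant calling; just reports the output path.
    return out_file

def toy_split(item):
    regions = [("chr1", 0, 1000), ("chr2", 0, 1000)]
    return ("batch1.vcf.gz",
            [(r, item["bams"], "%s-%s_%s_%s.vcf.gz" % ((item["name"],) + r))
             for r in regions])

print(toy_split_combine([{"name": "batch1", "bams": ["s1.bam"]}],
                        toy_split, call_region,
                        lambda parts, final: (final, parts)))
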
def parallel_combine_variants(orig_files, out_file, ref_file, config, run_parallel):
    """Combine variants in parallel by chromosome, concatenating final outputs.
    """
    file_key = "vcf_files"

    def split_by_region(data):
        base, ext = utils.splitext_plus(os.path.basename(out_file))
        args = []
        for region in [x.name for x in ref.file_contigs(ref_file, config)]:
            region_out = os.path.join(os.path.dirname(out_file), "%s-regions" % base,
                                      "%s-%s%s" % (base, region, ext))
            utils.safe_makedir(os.path.dirname(region_out))
            args.append((region_out, ref_file, config, region))
        return out_file, args
    config = copy.deepcopy(config)
    config["file_key"] = file_key
    prep_files = run_multicore(p_bgzip_and_index,
                               [[x, config] for x in orig_files], config)
    items = [[{file_key: prep_files}]]
    parallel_split_combine(items, split_by_region, run_parallel,
                           "merge_variant_files", "concat_variant_files",
                           file_key, ["region", "sam_ref", "config"],
                           split_outfile_i=0)
    return out_file
def parallel_combine_variants(orig_files, out_file, ref_file, config, run_parallel):
    """Combine variants in parallel by chromosome, concatenating final outputs.
    """
    file_key = "vcf_files"
    items = [[{file_key: orig_files}]]

    def split_by_region(data):
        base, ext = os.path.splitext(os.path.basename(out_file))
        args = []
        for region in [x["SN"] for x in _ref_file_contigs(ref_file, config)]:
            region_out = os.path.join(os.path.dirname(out_file), "%s-regions" % base,
                                      "%s-%s%s" % (base, region, ext))
            utils.safe_makedir(os.path.dirname(region_out))
            args.append((region_out, ref_file, config, True, region))
        return out_file, args
    config = copy.deepcopy(config)
    config["file_key"] = file_key
    parallel_split_combine(items, split_by_region, run_parallel,
                           "combine_variant_files", "concat_variant_files",
                           file_key, ["region", "sam_ref", "config"],
                           split_outfile_i=0)
    return out_file
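# Note the difference between the two parallel_combine_variants versions:
# os.path.splitext splits "batch.vcf.gz" into ("batch.vcf", ".gz"), so the
# per-region files come out as "batch.vcf-chr1.gz", while utils.splitext_plus
# keeps compound extensions together. A minimal sketch of that behavior,
# modeled on the call sites here rather than bcbio's exact implementation:

import os

def splitext_plus(fname):
    """Split a filename, keeping compound extensions such as .vcf.gz together."""
    base, ext = os.path.splitext(fname)
    if ext in (".gz", ".bz2", ".zip"):
        base, ext2 = os.path.splitext(base)
        ext = ext2 + ext
    return base, ext

assert splitext_plus("batch.vcf.gz") == ("batch", ".vcf.gz")
assert os.path.splitext("batch.vcf.gz") == ("batch.vcf", ".gz")
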
def parallel_prep_region(samples, regions, run_parallel):
    """Perform full pre-variant calling BAM prep work on regions.
    """
    file_key = "work_bam"
    split_fn = _split_by_regions(regions, "bamprep", "-prep.bam", file_key)
    return parallel_split_combine(samples, split_fn, run_parallel,
                                  "piped_bamprep", None, file_key, ["config"])
def parallel_callable_loci(in_bam, ref_file, data):
    config = copy.deepcopy(data["config"])
    num_cores = config["algorithm"].get("num_cores", 1)
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                              dd.get_sample_name(data)))
    data = {"work_bam": in_bam, "config": config,
            "reference": data["reference"], "dirs": {"out": out_dir}}
    parallel = {"type": "local", "cores": num_cores, "module": "bcbio.distributed"}
    items = [[data]]
    with prun.start(parallel, items, config, multiplier=int(num_cores)) as runner:
        split_fn = shared.process_bam_by_chromosome("-callable.bed", "work_bam",
                                                    remove_alts=True)
        out = parallel_split_combine(items, split_fn, runner,
                                     "calc_callable_loci", "combine_bed",
                                     "callable_bed", ["config"])[0]
    return out[0]["callable_bed"]
def parallel_prep_region(samples, run_parallel):
    """Perform full pre-variant calling BAM prep work on regions.
    """
    file_key = "work_bam"
    split_fn = _split_by_regions("bamprep", "-prep.bam", file_key)
    return parallel_split_combine(samples, split_fn, run_parallel,
                                  "piped_bamprep", None, file_key, ["config"])
def parallel_callable_loci(in_bam, ref_file, config):
    num_cores = config["algorithm"].get("num_cores", 1)
    config = copy.deepcopy(config)
    data = {"work_bam": in_bam, "config": config,
            "reference": {"fasta": {"base": ref_file}}}
    parallel = {"type": "local", "cores": num_cores, "module": "bcbio.distributed"}
    items = [[data]]
    with prun.start(parallel, items, config, multiplier=int(num_cores)) as runner:
        split_fn = shared.process_bam_by_chromosome("-callable.bed", "work_bam")
        out = parallel_split_combine(items, split_fn, runner,
                                     "calc_callable_loci", "combine_bed",
                                     "callable_bed", ["config"])[0]
    return out[0]["callable_bed"]
def parallel_callable_loci(in_bam, ref_file, config):
    num_cores = config["algorithm"].get("num_cores", 1)
    data = {"work_bam": in_bam, "sam_ref": ref_file, "config": config}
    parallel = {"type": "local", "cores": num_cores, "module": "bcbio.distributed"}
    runner = parallel_runner(parallel, {}, config)
    split_fn = shared.process_bam_by_chromosome("-callable.bed", "work_bam")
    out = parallel_split_combine([[data]], split_fn, runner,
                                 "calc_callable_loci", "combine_bed",
                                 "callable_bed", ["config"])[0]
    return out[0]["callable_bed"]
def parallel_callable_loci(in_bam, ref_file, data):
    config = copy.deepcopy(data["config"])
    num_cores = config["algorithm"].get("num_cores", 1)
    data = {"work_bam": in_bam, "config": config, "reference": data["reference"]}
    parallel = {"type": "local", "cores": num_cores, "module": "bcbio.distributed"}
    items = [[data]]
    with prun.start(parallel, items, config, multiplier=int(num_cores)) as runner:
        split_fn = shared.process_bam_by_chromosome("-callable.bed", "work_bam",
                                                    remove_alts=True)
        out = parallel_split_combine(items, split_fn, runner,
                                     "calc_callable_loci", "combine_bed",
                                     "callable_bed", ["config"])[0]
    return out[0]["callable_bed"]
def parallel_callable_loci(in_bam, ref_file, config):
    num_cores = config["algorithm"].get("num_cores", 1)
    config = copy.deepcopy(config)
    config["algorithm"]["memory_adjust"] = {"direction": "decrease", "magnitude": 2}
    data = {"work_bam": in_bam, "sam_ref": ref_file, "config": config}
    parallel = {"type": "local", "cores": num_cores, "module": "bcbio.distributed"}
    items = [[data]]
    with prun.start(parallel, items, config) as runner:
        split_fn = shared.process_bam_by_chromosome("-callable.bed", "work_bam")
        out = parallel_split_combine(items, split_fn, runner,
                                     "calc_callable_loci", "combine_bed",
                                     "callable_bed", ["config"])[0]
    return out[0]["callable_bed"]
def parallel_callable_loci(in_bam, ref_file, config):
    num_cores = config["algorithm"].get("num_cores", 1)
    config = copy.deepcopy(config)
    config["algorithm"]["memory_adjust"] = {"direction": "decrease", "magnitude": 2}
    data = {"work_bam": in_bam, "sam_ref": ref_file, "config": config}
    parallel = {"type": "local", "cores": num_cores, "module": "bcbio.distributed"}
    items = [[data]]
    with prun.start(parallel, items, config, multiplier=int(num_cores)) as runner:
        split_fn = shared.process_bam_by_chromosome("-callable.bed", "work_bam")
        out = parallel_split_combine(items, split_fn, runner,
                                     "calc_callable_loci", "combine_bed",
                                     "callable_bed", ["config"])[0]
    return out[0]["callable_bed"]
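# All of the parallel_callable_loci versions follow the same shape: wrap the
# BAM in a single-item [[data]] list, describe a local parallel run, and split
# work by chromosome. A toy stand-in for the split function returned by
# shared.process_bam_by_chromosome -- the chromosome list, naming scheme, and
# return layout here are assumptions based on the call sites, not bcbio's
# actual signature:

import os

def toy_process_bam_by_chromosome(output_ext, file_key, chroms=("chr1", "chr2")):
    """Return a split function mapping data -> (combined_out, per-chrom args)."""
    def _do_work(data):
        bam_file = data[file_key]
        base = os.path.splitext(bam_file)[0]
        combined_out = "%s%s" % (base, output_ext)
        args = [("%s-%s%s" % (base, chrom, output_ext), chrom, data)
                for chrom in chroms]
        return combined_out, args
    return _do_work

split_fn = toy_process_bam_by_chromosome("-callable.bed", "work_bam")
print(split_fn({"work_bam": "sample1.bam"}))
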
def parallel_variantcall(sample_info, parallel_fn):
    """Provide sample genotyping, running in parallel over individual chromosomes.
    """
    to_process = []
    finished = []
    for x in sample_info:
        if x[0]["config"]["algorithm"]["snpcall"]:
            to_process.append(x)
        else:
            finished.append(x)
    if len(to_process) > 0:
        split_fn = split_bam_by_chromosome("-variants.vcf", "work_bam")
        processed = parallel_split_combine(to_process, split_fn, parallel_fn,
                                           "variantcall_sample",
                                           "combine_variant_files",
                                           "vrn_file", ["sam_ref", "config"])
        finished.extend(processed)
    return finished
def parallel_realign_sample(sample_info, parallel_fn):
    """Realign samples, running in parallel over individual chromosomes.
    """
    to_process = []
    finished = []
    for x in sample_info:
        if x[0]["config"]["algorithm"]["snpcall"]:
            to_process.append(x)
        else:
            finished.append(x)
    if len(to_process) > 0:
        file_key = "work_bam"
        split_fn = split_bam_by_chromosome("-realign.bam", file_key,
                                           default_targets=["nochr"])
        processed = parallel_split_combine(to_process, split_fn, parallel_fn,
                                           "realign_sample", "combine_bam",
                                           file_key, ["config"])
        finished.extend(processed)
    return finished
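# parallel_variantcall, parallel_realign_sample, and parallel_write_recal_bam
# (below) all gate on a config flag: samples with the flag set go through the
# split/combine pipeline, the rest pass through untouched. A generic sketch of
# that partition step; partition_on_flag is a hypothetical helper, not part of
# the original code:

def partition_on_flag(sample_info, flag, default=False):
    """Split [[data]] items into (to_process, finished) on an algorithm flag."""
    to_process, finished = [], []
    for x in sample_info:
        if x[0]["config"]["algorithm"].get(flag, default):
            to_process.append(x)
        else:
            finished.append(x)
    return to_process, finished

samples = [[{"config": {"algorithm": {"realign": True}}}],
           [{"config": {"algorithm": {"realign": False}}}]]
to_process, finished = partition_on_flag(samples, "realign", default=True)
assert len(to_process) == 1 and len(finished) == 1
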
def parallel_prep_region(samples, run_parallel):
    """Perform full pre-variant calling BAM prep work on regions.
    """
    file_key = "work_bam"
    split_fn = _split_by_regions("bamprep", "-prep.bam", file_key)
    # identify samples that do not need preparation -- no recalibration,
    # realignment or variant calling
    extras = []
    torun = []
    for data in [x[0] for x in samples]:
        if data.get("work_bam"):
            data["align_bam"] = data["work_bam"]
        if (not dd.get_recalibrate(data) and not dd.get_realign(data)
              and not dd.get_variantcaller(data)):
            extras.append([data])
        elif not data.get(file_key):
            extras.append([data])
        else:
            torun.append([data])
    return extras + parallel_split_combine(torun, split_fn, run_parallel,
                                           "piped_bamprep", _add_combine_info,
                                           file_key, ["config"])
def parallel_prep_region(samples, run_parallel):
    """Perform full pre-variant calling BAM prep work on regions.
    """
    file_key = "work_bam"
    split_fn = _split_by_regions("bamprep", "-prep.bam", file_key)
    # identify samples that do not need preparation -- no recalibration,
    # realignment or variant calling
    extras = []
    torun = []
    for data in [x[0] for x in samples]:
        data["align_bam"] = data["work_bam"]
        a = data["config"]["algorithm"]
        if (not a.get("recalibrate") and not a.get("realign")
              and not a.get("variantcaller", "gatk")):
            extras.append([data])
        elif not data.get(file_key):
            extras.append([data])
        else:
            torun.append([data])
    return extras + parallel_split_combine(torun, split_fn, run_parallel,
                                           "piped_bamprep", _add_combine_info,
                                           file_key, ["config"])
def parallel_realign_sample(sample_info, parallel_fn):
    """Realign samples, running in parallel over individual chromosomes.
    """
    to_process = []
    finished = []
    for x in sample_info:
        if x[0]["config"]["algorithm"].get("realign", True):
            to_process.append(x)
        else:
            finished.append(x)
    if len(to_process) > 0:
        file_key = "work_bam"
        split_fn = process_bam_by_chromosome("-realign.bam", file_key,
                                             default_targets=["nochr"])
        processed = parallel_split_combine(to_process, split_fn, parallel_fn,
                                           "realign_sample", "combine_bam",
                                           file_key, ["config"])
        finished.extend(processed)
    return finished
def parallel_prep_region(samples, run_parallel):
    """Perform full pre-variant calling BAM prep work on regions.
    """
    file_key = "work_bam"
    split_fn = _split_by_regions("bamprep", "-prep.bam", file_key)
    # identify samples that do not need preparation -- no realignment
    # or variant calling
    extras = []
    torun = []
    for data in [x[0] for x in samples]:
        if data.get("work_bam"):
            data["align_bam"] = data["work_bam"]
        if not dd.get_realign(data) and not dd.get_variantcaller(data):
            extras.append([data])
        elif not data.get(file_key):
            extras.append([data])
        else:
            # Do not want to re-run duplicate marking after realignment
            data = dd.set_mark_duplicates(data, False)
            torun.append([data])
    return extras + parallel_split_combine(torun, split_fn, run_parallel,
                                           "piped_bamprep", _add_combine_info,
                                           file_key, ["config"])
def parallel_write_recal_bam(xs, parallel_fn):
    """Rewrite a recalibrated BAM file in parallel, working off each chromosome.
    """
    to_process = []
    finished = []
    for x in xs:
        if x[0]["config"]["algorithm"].get("recalibrate", True):
            to_process.append(x)
        else:
            finished.append(x)
    if len(to_process) > 0:
        file_key = "work_bam"
        split_fn = process_bam_by_chromosome("-gatkrecal.bam", file_key,
                                             default_targets=["nochr"])
        processed = parallel_split_combine(to_process, split_fn, parallel_fn,
                                           "write_recal_bam", "combine_bam",
                                           file_key, ["config"])
        finished.extend(processed)
    # Save diskspace from original to recalibrated
    #save_diskspace(data["work_bam"], "Recalibrated to %s" % recal_bam,
    #               data["config"])
    return finished
def parallel_prep_region(samples, regions, run_parallel):
    """Perform full pre-variant calling BAM prep work on regions.
    """
    file_key = "work_bam"
    split_fn = _split_by_regions(regions, "bamprep", "-prep.bam", file_key)
    # identify samples that do not need preparation -- no prep or
    # variant calling
    extras = []
    torun = []
    for data in [x[0] for x in samples]:
        a = data["config"]["algorithm"]
        if (not a.get("mark_duplicates") and not a.get("recalibrate")
              and not a.get("realign", "gatk")
              and not a.get("variantcaller", "gatk")):
            extras.append([data])
        elif not data.get(file_key):
            extras.append([data])
        else:
            torun.append([data])
    return extras + parallel_split_combine(torun, split_fn, run_parallel,
                                           "piped_bamprep", None,
                                           file_key, ["config"])
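# The later parallel_prep_region versions triage samples before splitting:
# anything with no prep step configured (and no variant caller) skips
# region-based preparation entirely. A distilled sketch of that triage;
# needs_prep is a hypothetical helper whose flag names mirror the versions
# above:

def needs_prep(data, file_key="work_bam"):
    """Return True when a sample should go through region-based BAM prep."""
    a = data["config"]["algorithm"]
    any_prep = a.get("recalibrate") or a.get("realign") or a.get("variantcaller")
    return bool(any_prep) and bool(data.get(file_key))

samples = [{"config": {"algorithm": {"realign": "gatk"}}, "work_bam": "s1.bam"},
           {"config": {"algorithm": {}}, "work_bam": "s2.bam"}]
extras = [[d] for d in samples if not needs_prep(d)]
torun = [[d] for d in samples if needs_prep(d)]
assert len(torun) == 1 and len(extras) == 1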