def combine_fastq_files(in_files, work_dir, config):
    if len(in_files) == 1:
        return in_files[0]
    else:
        cur1, cur2 = in_files[0]
        out1 = os.path.join(work_dir, os.path.basename(cur1))
        out2 = os.path.join(work_dir, os.path.basename(cur2)) if cur2 else None
        if not os.path.exists(out1):
            with open(out1, "a") as out_handle:
                for (cur1, _) in in_files:
                    with open(cur1) as in_handle:
                        shutil.copyfileobj(in_handle, out_handle)
        if out2 and not os.path.exists(out2):
            with open(out2, "a") as out_handle:
                for (_, cur2) in in_files:
                    with open(cur2) as in_handle:
                        shutil.copyfileobj(in_handle, out_handle)
        if config["algorithm"].get("upload_fastq", False):
            return out1, out2
        for f1, f2 in in_files:
            utils.save_diskspace(f1, "fastq merged to %s" % out1, config)
            if f2:
                utils.save_diskspace(f2, "fastq merged to %s" % out2, config)
        return out1, out2
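# Not part of the original listing: every snippet here delegates cleanup to a
# shared save_diskspace helper. A minimal sketch of what such a helper might
# look like, assuming an "algorithm: save_diskspace" flag in the config; the
# real implementation may differ.
def save_diskspace(fname, reason, config):
    """Overwrite an intermediate file with a short note if cleanup is enabled."""
    if config["algorithm"].get("save_diskspace", False):
        with open(fname, "w") as out_handle:
            out_handle.write("File removed to save disk space: %s" % reason)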
def sam_to_sort_bam(sam_file, ref_file, fastq1, fastq2, sample_name,
                    rg_name, lane_name, config):
    """Convert SAM file to merged and sorted BAM file.
    """
    picard = broad.runner_from_config(config)
    platform = config["algorithm"]["platform"]
    qual_format = config["algorithm"].get("quality_format", None)
    base_dir = os.path.dirname(sam_file)

    picard.run_fn("picard_index_ref", ref_file)
    out_fastq_bam = picard.run_fn("picard_fastq_to_bam",
                                  fastq1, fastq2, base_dir, platform,
                                  sample_name, rg_name, lane_name, qual_format)
    out_bam = picard.run_fn("picard_sam_to_bam",
                            sam_file, out_fastq_bam, ref_file,
                            fastq2 is not None)
    sort_bam = picard.run_fn("picard_sort", out_bam)

    utils.save_diskspace(sam_file, "SAM converted to BAM", config)
    utils.save_diskspace(out_fastq_bam, "Combined into output BAM %s" % out_bam, config)
    utils.save_diskspace(out_bam, "Sorted to %s" % sort_bam, config)
    # merge FASTQ files, only if barcoded samples in the work directory
    if (os.path.commonprefix([fastq1, sort_bam]) ==
            os.path.split(os.path.dirname(sort_bam))[0]
            and not config["algorithm"].get("upload_fastq", True)):
        utils.save_diskspace(fastq1, "Merged into output BAM %s" % out_bam, config)
        if fastq2:
            utils.save_diskspace(fastq2, "Merged into output BAM %s" % out_bam, config)
    return sort_bam
def sam_to_sort_bam(sam_file, ref_file, fastq1, fastq2, names, config):
    """Convert SAM file to merged and sorted BAM file.
    """
    picard = broad.runner_from_config(config)
    base_dir = os.path.dirname(sam_file)

    picard.run_fn("picard_index_ref", ref_file)
    out_fastq_bam = picard.run_fn("picard_fastq_to_bam",
                                  fastq1, fastq2, base_dir, names)
    out_bam = picard.run_fn("picard_sam_to_bam",
                            sam_file, out_fastq_bam, ref_file,
                            fastq2 is not None)
    sort_bam = picard.run_fn("picard_sort", out_bam)

    utils.save_diskspace(sam_file, "SAM converted to BAM", config)
    utils.save_diskspace(out_fastq_bam, "Combined into output BAM %s" % out_bam, config)
    utils.save_diskspace(out_bam, "Sorted to %s" % sort_bam, config)
    # merge FASTQ files, only if barcoded samples in the work directory
    if (os.path.commonprefix([fastq1, sort_bam]) ==
            os.path.split(os.path.dirname(sort_bam))[0]
            and not config["algorithm"].get("upload_fastq", True)):
        utils.save_diskspace(fastq1, "Merged into output BAM %s" % out_bam, config)
        if fastq2:
            utils.save_diskspace(fastq2, "Merged into output BAM %s" % out_bam, config)
    return sort_bam
def realign_sample(data, region=None, out_file=None): """Realign sample BAM file at indels. """ realigner = data["config"]["algorithm"].get("realign", True) realigner = "gatk" if realigner is True else realigner realign_fn = _realign_approaches[realigner] if realigner else None if realign_fn: logger.info("Realigning %s with %s: %s %s" % (data["name"], realigner, os.path.basename( data["work_bam"]), region)) sam_ref = data["sam_ref"] config = data["config"] if region == "nochr": realign_bam = write_nochr_reads(data["work_bam"], out_file, data["config"]) else: realign_bam = realign_fn( data["work_bam"], sam_ref, config, data["genome_resources"]["variation"]["dbsnp"], region, out_file) if region is None: save_diskspace(data["work_bam"], "Realigned to %s" % realign_bam, config) data["work_bam"] = realign_bam return [data]
def merge_bam_files(bam_files, work_dir, config, out_file=None): """Merge multiple BAM files from a sample into a single BAM for processing. Uses bamtools for merging, which handles large numbers of input BAMs. """ if len(bam_files) == 1: return bam_files[0] else: if out_file is None: out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0])) if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"): bamtools = config_utils.get_program("bamtools", config) samtools = config_utils.get_program("samtools", config) resources = config_utils.get_resources("samtools", config) num_cores = config["algorithm"].get("num_cores", 1) max_mem = resources.get("memory", "1G") with file_transaction(out_file) as tx_out_file: tx_out_prefix = os.path.splitext(tx_out_file)[0] with utils.tmpfile(dir=work_dir, prefix="bammergelist") as bam_file_list: bam_file_list = "%s.list" % os.path.splitext(out_file)[0] with open(bam_file_list, "w") as out_handle: for f in sorted(bam_files): out_handle.write("%s\n" % f) cmd = ( "{bamtools} merge -list {bam_file_list} | " "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}" ) do.run(cmd.format(**locals()), "Merge bam files", None) for b in bam_files: utils.save_diskspace(b, "BAM merged to %s" % out_file, config) picard = broad.runner_from_config(config) picard.run_fn("picard_index", out_file) return out_file
def merge_bam_files(bam_files, work_dir, config, out_file=None): """Merge multiple BAM files from a sample into a single BAM for processing. Uses bamtools for merging, which handles large numbers of input BAMs. """ if len(bam_files) == 1: return bam_files[0] else: if out_file is None: out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0])) if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"): bamtools = config_utils.get_program("bamtools", config) resources = config_utils.get_resources("bamtools", config) max_mem = resources.get("memory", "2048") with file_transaction(out_file) as tx_out_file: with utils.tmpfile(dir=work_dir, prefix="bammergelist") as bam_file_list: with open(bam_file_list, "w") as out_handle: for f in sorted(bam_files): out_handle.write("%s\n" % f) cmd = ("{bamtools} merge -list {bam_file_list} | " "{bamtools} sort -mem {max_mem} -out {tx_out_file}") do.run(cmd.format(**locals()), "Merge bam files", None) for b in bam_files: utils.save_diskspace(b, "BAM merged to %s" % out_file, config) picard = broad.runner_from_config(config) picard.run_fn("picard_index", out_file) return out_file
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None): """Merge multiple BAM files from a sample into a single BAM for processing. Checks system open file limit and merges in batches if necessary to avoid file handle limits. """ if len(bam_files) == 1 and bam.bam_already_sorted(bam_files[0], config, "coordinate"): shutil.copy(bam_files[0], out_file) else: if out_file is None: out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0])) if batch is not None: base, ext = os.path.splitext(out_file) out_file = "%s-b%s%s" % (base, batch, ext) if not utils.file_exists(out_file): sambamba = config_utils.get_program("sambamba", config) samtools = config_utils.get_program("samtools", config) resources = config_utils.get_resources("samtools", config) num_cores = config["algorithm"].get("num_cores", 1) max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), 2, "decrease").upper() # sambamba opens 4 handles per file, so try to guess a reasonable batch size batch_size = (system.open_file_limit() // 4) - 100 if len(bam_files) > batch_size: bam_files = [ merge_bam_files(xs, work_dir, config, out_file, i) for i, xs in enumerate( utils.partition_all(batch_size, bam_files)) ] with tx_tmpdir(config) as tmpdir: with utils.chdir(tmpdir): with file_transaction(config, out_file) as tx_out_file: with file_transaction( config, "%s.list" % os.path.splitext(out_file)[0] ) as tx_bam_file_list: with open(tx_bam_file_list, "w") as out_handle: for f in sorted(bam_files): out_handle.write("%s\n" % f) if bam.bam_already_sorted(bam_files[0], config, "coordinate"): cmd = _sambamba_merge(bam_files) else: assert config.get("mark_duplicates", True) cmd = _biobambam_merge_dedup() do.run( cmd.format(**locals()), "Merge bam files to %s" % os.path.basename(out_file), None) # Ensure timestamps are up to date on output file and index # Works around issues on systems with inconsistent times for ext in ["", ".bai"]: if os.path.exists(out_file + ext): subprocess.check_call(["touch", out_file + ext]) for b in bam_files: utils.save_diskspace(b, "BAM merged to %s" % out_file, config) bam.index(out_file, config) return out_file
def apply_recal(data): """Apply recalibration tables to the sorted aligned BAM, producing recalibrated BAM. """ orig_bam = dd.get_align_bam(data) or dd.get_work_bam(data) had_work_bam = "work_bam" in data if dd.get_recalibrate(data) in [True, "gatk"]: logger.info("Applying BQSR recalibration with GATK: %s " % str(dd.get_sample_name(data))) data["work_bam"] = _gatk_apply_bqsr(data) elif dd.get_recalibrate(data) == "sentieon": logger.info("Applying BQSR recalibration with sentieon: %s " % str(dd.get_sample_name(data))) data["work_bam"] = sentieon.apply_bqsr(data) elif dd.get_recalibrate(data): raise NotImplementedError("Unsupported recalibration type: %s" % (dd.get_recalibrate(data))) # CWL does not have work/alignment BAM separation if not had_work_bam and dd.get_work_bam(data): data["align_bam"] = dd.get_work_bam(data) if orig_bam != dd.get_work_bam(data) and orig_bam != dd.get_align_bam( data): utils.save_diskspace(orig_bam, "BAM recalibrated to %s" % dd.get_work_bam(data), data["config"]) return data
def split_bam_file(bam_file, split_size, out_dir, config): """Split a BAM file into paired end fastq splits based on split size. XXX Need to generalize for non-paired end inputs. """ existing = _find_current_bam_split(bam_file, out_dir) if len(existing) > 0: return existing pipe = True utils.safe_makedir(out_dir) broad_runner = broad.runner_from_config(config) out_files = [] def new_handle(num): out = [] for pair in [1, 2]: fname = os.path.join( out_dir, "{base}_{pair}_{num}.fastq".format( base=os.path.splitext(os.path.basename(bam_file))[0], pair=pair, num=num ), ) out += [fname, open(fname, "w")] return out with utils.curdir_tmpdir(base_dir=config_utils.get_resources("tmp", config).get("dir")) as tmp_dir: if pipe: sort_file = os.path.join(tmp_dir, "%s-sort.bam" % os.path.splitext(os.path.basename(bam_file))[0]) os.mkfifo(sort_file) broad_runner.run_fn("picard_sort", bam_file, "queryname", sort_file, compression_level=0, pipe=True) else: sort_file = os.path.join(out_dir, "%s-sort.bam" % os.path.splitext(os.path.basename(bam_file))[0]) broad_runner.run_fn("picard_sort", bam_file, "queryname", sort_file) samfile = pysam.Samfile(sort_file, "rb") i = 0 num = 0 f1, out_handle1, f2, out_handle2 = new_handle(num) out_files.append([f1, f2, None]) for x1, x2 in utils.partition_all(2, samfile): x1_seq, x1_qual = _get_seq_qual(x1) out_handle1.write("@%s/1\n%s\n+\n%s\n" % (i, x1_seq, x1_qual)) x2_seq, x2_qual = _get_seq_qual(x2) out_handle2.write("@%s/2\n%s\n+\n%s\n" % (i, x2_seq, x2_qual)) i += 1 if i % split_size == 0: num += 1 out_handle1.close() out_handle2.close() f1, out_handle1, f2, out_handle2 = new_handle(num) out_files.append([f1, f2, num]) out_handle1.close() out_handle2.close() samfile.close() if pipe: os.unlink(sort_file) else: utils.save_diskspace(sort_file, "Split to {}".format(out_files[0][0]), config) return out_files
def write_recal_bam(data, region=None, out_file=None): """Step 2 of GATK recalibration -- use covariates to re-write output file. """ config = data["config"] if out_file is None: out_file = "%s-gatkrecal.bam" % os.path.splitext(data["work_bam"])[0] logger.info("Writing recalibrated BAM for %s to %s" % (data["name"], out_file)) if region == "nochr": out_bam = write_nochr_reads(data["work_bam"], out_file) else: out_bam = _run_recal_bam(data["work_bam"], data["prep_recal"], region, data["sam_ref"], out_file, config) qual_bin = config["algorithm"].get("quality_bin", None) if ((qual_bin is True or qual_bin == "postrecal" or isinstance(qual_bin, list) and "postrecal" in qual_bin) and has_aligned_reads(out_bam)): binned_bam = cram.illumina_qual_bin(out_bam, data["sam_ref"], os.path.dirname(out_bam), config) shutil.move(out_bam, out_bam + ".binned") shutil.move(binned_bam, out_bam) utils.save_diskspace(out_bam + ".binned", "Quality binned to %s" % out_bam, config) data["work_bam"] = out_bam return [data]
def merge_bam_files(bam_files, work_dir, config, batch=0):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Avoids too many open file issues by merging large numbers of files in batches.
    """
    max_merge = 500
    bam_files.sort()
    i = 1
    while len(bam_files) > max_merge:
        bam_files = [merge_bam_files(xs, work_dir, config, batch + i)
                     for xs in utils.partition_all(max_merge, bam_files)]
        i += 1
    if batch > 0:
        out_dir = utils.safe_makedir(os.path.join(work_dir, "batchmerge%s" % batch))
    else:
        out_dir = work_dir
    out_file = os.path.join(out_dir, os.path.basename(sorted(bam_files)[0]))
    picard = broad.runner_from_config(config)
    if len(bam_files) == 1:
        if not os.path.exists(out_file):
            os.symlink(bam_files[0], out_file)
    else:
        picard.run_fn("picard_merge", bam_files, out_file)
        for b in bam_files:
            utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
    return out_file
def main(config_file, align_sam, ref_file, fastq_one, fastq_pair=None, sample_name="", rg_name="", pu_name=""): with open(config_file) as in_handle: config = yaml.load(in_handle) picard = BroadRunner(config["program"]["picard"], max_memory=config["algorithm"].get("java_memory", "")) platform = config["algorithm"]["platform"] if platform.lower() == "illumina": qual_format = "Illumina" else: raise ValueError("Need to specify quality format for %s" % platform) index_ref_file(picard, ref_file) base_dir = os.path.split(align_sam)[0] with curdir_tmpdir() as tmp_dir: out_fastq_bam = picard_fastq_to_bam(picard, fastq_one, fastq_pair, base_dir, platform, qual_format, sample_name, rg_name, pu_name, tmp_dir) out_bam = picard_merge_bam(picard, align_sam, out_fastq_bam, ref_file, tmp_dir, fastq_pair is not None) sort_bam = picard_sort(picard, out_bam, tmp_dir) save_diskspace(out_fastq_bam, "Combined into output BAM %s" % out_bam, config) save_diskspace(out_bam, "Sorted to %s" % sort_bam, config)
def combine_bam(in_files, out_file, config):
    """Parallel target to combine multiple BAM files.
    """
    runner = broad.runner_from_path("picard", config)
    runner.run_fn("picard_merge", in_files, out_file)
    for in_file in in_files:
        save_diskspace(in_file, "Merged into {0}".format(out_file), config)
    bam.index(out_file, config)
    return out_file
def split_bam_file(bam_file, split_size, out_dir, config): """Split a BAM file into paired end fastq splits based on split size. XXX Need to generalize for non-paired end inputs. """ existing = _find_current_bam_split(bam_file, out_dir) if len(existing) > 0: return existing pipe = True utils.safe_makedir(out_dir) broad_runner = broad.runner_from_config(config) out_files = [] def new_handle(num): out = [] for pair in [1, 2]: fname = os.path.join(out_dir, "{base}_{pair}_{num}.fastq".format( base=os.path.splitext(os.path.basename(bam_file))[0], pair=pair, num=num)) out += [fname, open(fname, "w")] return out with utils.curdir_tmpdir(base_dir=config_utils.get_resources("tmp", config).get("dir")) as tmp_dir: if pipe: sort_file = os.path.join(tmp_dir, "%s-sort.bam" % os.path.splitext(os.path.basename(bam_file))[0]) os.mkfifo(sort_file) broad_runner.run_fn("picard_sort", bam_file, "queryname", sort_file, compression_level=0, pipe=True) else: sort_file = os.path.join(out_dir, "%s-sort.bam" % os.path.splitext(os.path.basename(bam_file))[0]) broad_runner.run_fn("picard_sort", bam_file, "queryname", sort_file) samfile = pysam.Samfile(sort_file, "rb") i = 0 num = 0 f1, out_handle1, f2, out_handle2 = new_handle(num) out_files.append([f1, f2, None]) for x1, x2 in utils.partition_all(2, samfile): x1_seq, x1_qual = _get_seq_qual(x1) out_handle1.write("@%s/1\n%s\n+\n%s\n" % (i, x1_seq, x1_qual)) x2_seq, x2_qual = _get_seq_qual(x2) out_handle2.write("@%s/2\n%s\n+\n%s\n" % (i, x2_seq, x2_qual)) i += 1 if i % split_size == 0: num += 1 out_handle1.close() out_handle2.close() f1, out_handle1, f2, out_handle2 = new_handle(num) out_files.append([f1, f2, num]) out_handle1.close() out_handle2.close() samfile.close() if pipe: os.unlink(sort_file) else: utils.save_diskspace(sort_file, "Split to {}".format(out_files[0][0]), config) return out_files
def merge_bam_files(bam_files, work_dir, config): """Merge multiple BAM files from a sample into a single BAM for processing. """ out_file = os.path.join(work_dir, os.path.basename(bam_files[0])) picard = broad.runner_from_config(config) picard.run_fn("picard_merge", bam_files, out_file) for b in bam_files: utils.save_diskspace(b, "BAM merged to %s" % out_file, config) return out_file
def _finalize_merge(out_file, bam_files, config):
    """Handle indexes and cleanups of merged BAM and input files.
    """
    # Ensure timestamps are up to date on output file and index
    # Works around issues on systems with inconsistent times
    for ext in ["", ".bai"]:
        if os.path.exists(out_file + ext):
            subprocess.check_call(["touch", out_file + ext])
    for b in bam_files:
        utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
def merge_bam_files(bam_files, work_dir, config): """Merge multiple BAM files from a sample into a single BAM for processing. """ bam_files.sort() out_file = os.path.join(work_dir, os.path.basename(bam_files[0])) picard = broad.runner_from_config(config) picard.run_fn("picard_merge", bam_files, out_file) for b in bam_files: utils.save_diskspace(b, "BAM merged to %s" % out_file, config) return out_file
def _save_fastq_space(items):
    """Potentially save fastq space prior to merging, since alignments done.
    """
    to_cleanup = {}
    for data in (utils.to_single_data(x) for x in items):
        for fname in data.get("files", []):
            if os.path.realpath(fname).startswith(dd.get_work_dir(data)):
                to_cleanup[fname] = data["config"]
    for fname, config in to_cleanup.items():
        utils.save_diskspace(fname, "Cleanup prep files after alignment finished", config)
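# Not part of the original listing: a small illustration of the work-directory
# test above, using hypothetical paths. Only files that physically live under
# the work directory are queued for cleanup; inputs elsewhere are untouched.
import os

work_dir = "/work/project1"                      # stand-in for dd.get_work_dir(data)
prepped = "/work/project1/prep/lane1.fastq.gz"   # hypothetical prepared fastq
original = "/data/raw/lane1.fastq.gz"            # hypothetical original input
assert os.path.realpath(prepped).startswith(work_dir)
assert not os.path.realpath(original).startswith(work_dir)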
def recalibrate_sample(data): """Recalibrate quality values from aligned sample BAM file. """ logger.info("Recalibrating %s with GATK" % str(data["name"])) if data["config"]["algorithm"]["recalibrate"]: recal_bam = recalibrate_quality( data["work_bam"], data["fastq1"], data["fastq2"], data["sam_ref"], data["dirs"], data["config"] ) save_diskspace(data["work_bam"], "Recalibrated to %s" % recal_bam, data["config"]) data["work_bam"] = recal_bam return [[data]]
def recalibrate_sample(data): """Recalibrate quality values from aligned sample BAM file. """ logger.info("Recalibrating %s with GATK" % str(data["name"])) if data["config"]["algorithm"]["recalibrate"]: recal_bam = recalibrate_quality(data["work_bam"], data["fastq1"], data["fastq2"], data["sam_ref"], data["dirs"], data["config"]) save_diskspace(data["work_bam"], "Recalibrated to %s" % recal_bam, data["config"]) data["work_bam"] = recal_bam return [[data]]
def sam_to_sort_bam(sam_file, ref_file, fastq1, fastq2, sample_name, lane_name, config, config_file): """Convert SAM file to merged and sorted BAM file. """ lane = lane_name.split("_")[0] cl = ["picard_sam_to_bam.py", "--name=%s" % sample_name, "--rg=%s" % lane, "--pu=%s" % lane_name, config_file, sam_file, ref_file, fastq1] if fastq2: cl.append(fastq2) subprocess.check_call(cl) utils.save_diskspace(sam_file, "SAM converted to BAM", config)
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None): """Merge multiple BAM files from a sample into a single BAM for processing. Checks system open file limit and merges in batches if necessary to avoid file handle limits. """ if len(bam_files) == 1: return bam_files[0] else: if out_file is None: out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0])) if batch is not None: base, ext = os.path.splitext(out_file) out_file = "%s-b%s%s" % (base, batch, ext) if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"): bamtools = config_utils.get_program("bamtools", config) samtools = config_utils.get_program("samtools", config) resources = config_utils.get_resources("samtools", config) num_cores = config["algorithm"].get("num_cores", 1) max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), 2, "decrease").upper() batch_size = system.open_file_limit() - 100 if len(bam_files) > batch_size: bam_files = [ merge_bam_files(xs, work_dir, config, out_file, i) for i, xs in enumerate( utils.partition_all(batch_size, bam_files)) ] with utils.curdir_tmpdir({"config": config}) as tmpdir: with utils.chdir(tmpdir): merge_cl = _bamtools_merge(bam_files) with file_transaction(out_file) as tx_out_file: with file_transaction("%s.list" % os.path.splitext(out_file)[0] ) as tx_bam_file_list: tx_out_prefix = os.path.splitext(tx_out_file)[0] with open(tx_bam_file_list, "w") as out_handle: for f in sorted(bam_files): out_handle.write("%s\n" % f) cmd = ( merge_cl + " | " "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}" ) do.run( cmd.format(**locals()), "Merge bam files to %s" % os.path.basename(out_file), None) for b in bam_files: utils.save_diskspace(b, "BAM merged to %s" % out_file, config) bam.index(out_file, config) return out_file
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None): """Merge multiple BAM files from a sample into a single BAM for processing. Checks system open file limit and merges in batches if necessary to avoid file handle limits. """ if len(bam_files) == 1: bam.index(bam_files[0], config) return bam_files[0] else: if out_file is None: out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0])) if batch is not None: base, ext = os.path.splitext(out_file) out_file = "%s-b%s%s" % (base, batch, ext) if not utils.file_exists(out_file): sambamba = config_utils.get_program("sambamba", config) samtools = config_utils.get_program("samtools", config) samblaster = config_utils.get_program("samblaster", config) resources = config_utils.get_resources("samtools", config) num_cores = config["algorithm"].get("num_cores", 1) max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), 2, "decrease").upper() # sambamba opens 4 handles per file, so try to guess a reasonable batch size batch_size = (system.open_file_limit() // 4) - 100 if len(bam_files) > batch_size: bam_files = [merge_bam_files(xs, work_dir, config, out_file, i) for i, xs in enumerate(utils.partition_all(batch_size, bam_files))] with tx_tmpdir(config) as tmpdir: with utils.chdir(tmpdir): with file_transaction(config, out_file) as tx_out_file: with file_transaction(config, "%s.list" % os.path.splitext(out_file)[0]) as tx_bam_file_list: with open(tx_bam_file_list, "w") as out_handle: for f in sorted(bam_files): out_handle.write("%s\n" % f) if bam.bam_already_sorted(bam_files[0], config, "coordinate"): cmd = _sambamba_merge(bam_files) else: assert config.get("mark_duplicates", True) cmd = _biobambam_merge_dedup() do.run(cmd.format(**locals()), "Merge bam files to %s" % os.path.basename(out_file), None) # Ensure timestamps are up to date on output file and index # Works around issues on systems with inconsistent times for ext in ["", ".bai"]: if os.path.exists(out_file + ext): subprocess.check_call(["touch", out_file + ext]) for b in bam_files: utils.save_diskspace(b, "BAM merged to %s" % out_file, config) bam.index(out_file, config) return out_file
def merge_bam_files(bam_files, work_dir, config): """Merge multiple BAM files from a sample into a single BAM for processing. """ bam_files.sort() out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0])) picard = broad.runner_from_config(config) if len(bam_files) == 1: if not os.path.exists(out_file): os.symlink(bam_files[0], out_file) else: picard.run_fn("picard_merge", bam_files, out_file) for b in bam_files: utils.save_diskspace(b, "BAM merged to %s" % out_file, config) return out_file
def sam_to_sort_bam(sam_file, ref_file, fastq1, fastq2, sample_name, lane_name, config, config_file): """Convert SAM file to merged and sorted BAM file. """ lane = lane_name.split("_")[0] cl = [ "picard_sam_to_bam.py", "--name=%s" % sample_name, "--rg=%s" % lane, "--pu=%s" % lane_name, config_file, sam_file, ref_file, fastq1 ] if fastq2: cl.append(fastq2) subprocess.check_call(cl) utils.save_diskspace(sam_file, "SAM converted to BAM", config)
def mark_duplicates_sample(data): """Mark duplicate molecules in sample BAM file. """ mark_duplicates = data["config"]["algorithm"].get("mark_duplicates", False) if not mark_duplicates: return [[data]] logger.info("Marking duplicates in {} with Picard".format(str(data["name"]))) picard = broad.runner_from_config(data["config"]) dup_bam, _ = picard_mark_duplicates(picard, data["work_bam"]) reason = "Marked duplicates of {0} in {1}, so {0} is no longer needed" \ "".format(data["work_bam"], dup_bam) save_diskspace(data["work_bam"], reason, data["config"]) data["work_bam"] = dup_bam return [[data]]
def merge_bam_files(bam_files, work_dir, config): """Merge multiple BAM files from a sample into a single BAM for processing. """ out_file = os.path.join(work_dir, os.path.basename(bam_files[0])) if not os.path.exists(out_file): picard = BroadRunner(config["program"]["picard"], max_memory=config["algorithm"].get( "java_memory", "")) with utils.curdir_tmpdir() as tmp_dir: opts = [("OUTPUT", out_file), ("SORT_ORDER", "coordinate"), ("TMP_DIR", tmp_dir)] for b in bam_files: opts.append(("INPUT", b)) picard.run("MergeSamFiles", opts) for b in bam_files: utils.save_diskspace(b, "BAM merged to %s" % out_file, config) return out_file
def merge_bam_files(bam_files, work_dir, config): """Merge multiple BAM files from a sample into a single BAM for processing. """ out_file = os.path.join(work_dir, os.path.basename(bam_files[0])) if not os.path.exists(out_file): picard = BroadRunner(config["program"]["picard"], max_memory=config["algorithm"].get("java_memory", "")) with utils.curdir_tmpdir() as tmp_dir: opts = [("OUTPUT", out_file), ("SORT_ORDER", "coordinate"), ("TMP_DIR", tmp_dir)] for b in bam_files: opts.append(("INPUT", b)) picard.run("MergeSamFiles", opts) for b in bam_files: utils.save_diskspace(b, "BAM merged to %s" % out_file, config) return out_file
def realign_sample(data, region=None, out_file=None): """Realign sample BAM file at indels. """ logger.info("Realigning %s with GATK: %s %s" % (data["name"], os.path.basename(data["work_bam"]), region)) if data["config"]["algorithm"]["snpcall"]: sam_ref = data["sam_ref"] config = data["config"] if region == "nochr": realign_bam = write_nochr_reads(data["work_bam"], out_file) else: realign_bam = gatk_realigner( data["work_bam"], sam_ref, config, configured_ref_file("dbsnp", config, sam_ref), region, out_file ) if region is None: save_diskspace(data["work_bam"], "Realigned to %s" % realign_bam, config) data["work_bam"] = realign_bam return [data]
def mark_duplicates_sample(data): """Mark duplicate molecules in sample BAM file. """ mark_duplicates = data["config"]["algorithm"].get("mark_duplicates", False) if not mark_duplicates: return [[data]] logger.info("Marking duplicates in {} with Picard".format(str( data["name"]))) picard = broad.runner_from_config(data["config"]) dup_bam, _ = picard_mark_duplicates(picard, data["work_bam"]) reason = "Marked duplicates of {0} in {1}, so {0} is no longer needed" \ "".format(data["work_bam"], dup_bam) save_diskspace(data["work_bam"], reason, data["config"]) data["work_bam"] = dup_bam return [[data]]
def merge_bam_files(bam_files, work_dir, config, out_file=None): """Merge multiple BAM files from a sample into a single BAM for processing. Uses bamtools for merging, which handles large numbers of input BAMs. """ if len(bam_files) == 1: return bam_files[0] else: if out_file is None: out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0])) if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"): bamtools = config_utils.get_program("bamtools", config) samtools = config_utils.get_program("samtools", config) resources = config_utils.get_resources("samtools", config) num_cores = config["algorithm"].get("num_cores", 1) max_mem = resources.get("memory", "1G") if len(bam_files) > system.open_file_limit(): raise IOError( "More files to merge (%s) then available open file descriptors (%s)\n" "See documentation on tips for changing file limits:\n" "https://bcbio-nextgen.readthedocs.org/en/latest/contents/" "parallel.html#tuning-systems-for-scale" % (len(bam_files), system.open_file_limit())) with file_transaction(out_file) as tx_out_file: tx_out_prefix = os.path.splitext(tx_out_file)[0] with utils.tmpfile(dir=work_dir, prefix="bammergelist") as bam_file_list: bam_file_list = "%s.list" % os.path.splitext(out_file)[0] with open(bam_file_list, "w") as out_handle: for f in sorted(bam_files): out_handle.write("%s\n" % f) cmd = ( "{bamtools} merge -list {bam_file_list} | " "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}" ) do.run(cmd.format(**locals()), "Merge bam files", None) for b in bam_files: utils.save_diskspace(b, "BAM merged to %s" % out_file, config) picard = broad.runner_from_config(config) picard.run_fn("picard_index", out_file) return out_file
def main(config_file, align_sam, ref_file, fastq_one, fastq_pair=None, sample_name="", rg_name="", pu_name=""): with open(config_file) as in_handle: config = yaml.load(in_handle) picard = BroadRunner(config["program"]["picard"], max_memory=config["algorithm"].get("java_memory", "")) platform = config["algorithm"]["platform"] if platform.lower() == "illumina": qual_format = "Illumina" else: raise ValueError("Need to specify quality format for %s" % platform) index_ref_file(picard, ref_file) base_dir = os.path.split(align_sam)[0] with curdir_tmpdir() as tmp_dir: out_fastq_bam = picard_fastq_to_bam( picard, fastq_one, fastq_pair, base_dir, platform, qual_format, sample_name, rg_name, pu_name, tmp_dir ) out_bam = picard_merge_bam(picard, align_sam, out_fastq_bam, ref_file, tmp_dir, fastq_pair is not None) sort_bam = picard_sort(picard, out_bam, tmp_dir) save_diskspace(out_fastq_bam, "Combined into output BAM %s" % out_bam, config) save_diskspace(out_bam, "Sorted to %s" % sort_bam, config)
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None): """Merge multiple BAM files from a sample into a single BAM for processing. Checks system open file limit and merges in batches if necessary to avoid file handle limits. """ if len(bam_files) == 1: return bam_files[0] else: if out_file is None: out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0])) if batch is not None: base, ext = os.path.splitext(out_file) out_file = "%s-b%s%s" % (base, batch, ext) if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"): bamtools = config_utils.get_program("bamtools", config) samtools = config_utils.get_program("samtools", config) resources = config_utils.get_resources("samtools", config) num_cores = config["algorithm"].get("num_cores", 1) max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), 2, "decrease") batch_size = system.open_file_limit() - 100 if len(bam_files) > batch_size: bam_files = [merge_bam_files(xs, work_dir, config, out_file, i) for i, xs in enumerate(utils.partition_all(batch_size, bam_files))] with utils.curdir_tmpdir({"config": config}) as tmpdir: with utils.chdir(tmpdir): merge_cl = _bamtools_merge(bam_files) with file_transaction(out_file) as tx_out_file: with file_transaction("%s.list" % os.path.splitext(out_file)[0]) as tx_bam_file_list: tx_out_prefix = os.path.splitext(tx_out_file)[0] with open(tx_bam_file_list, "w") as out_handle: for f in sorted(bam_files): out_handle.write("%s\n" % f) cmd = (merge_cl + " | " "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}") do.run(cmd.format(**locals()), "Merge bam files to %s" % os.path.basename(out_file), None) for b in bam_files: utils.save_diskspace(b, "BAM merged to %s" % out_file, config) bam.index(out_file, config) return out_file
def realign_sample(data, region=None, out_file=None): """Realign sample BAM file at indels. """ logger.info("Realigning %s with GATK: %s %s" % (data["name"], os.path.basename(data["work_bam"]), region)) if data["config"]["algorithm"]["snpcall"]: sam_ref = data["sam_ref"] config = data["config"] if region == "nochr": realign_bam = write_nochr_reads(data["work_bam"], out_file) else: realign_bam = gatk_realigner( data["work_bam"], sam_ref, config, configured_ref_file("dbsnp", config, sam_ref), region, out_file) if region is None: save_diskspace(data["work_bam"], "Realigned to %s" % realign_bam, config) data["work_bam"] = realign_bam return [data]
def apply_recal(data): """Apply recalibration tables to the sorted aligned BAM, producing recalibrated BAM. """ orig_bam = dd.get_align_bam(data) or dd.get_work_bam(data) had_work_bam = "work_bam" in data if dd.get_recalibrate(data) in [True, "gatk"]: if data.get("prep_recal"): logger.info("Applying BQSR recalibration with GATK: %s " % str(dd.get_sample_name(data))) data["work_bam"] = _gatk_apply_bqsr(data) elif dd.get_recalibrate(data) == "sentieon": if data.get("prep_recal"): logger.info("Applying BQSR recalibration with sentieon: %s " % str(dd.get_sample_name(data))) data["work_bam"] = sentieon.apply_bqsr(data) elif dd.get_recalibrate(data): raise NotImplementedError("Unsupported recalibration type: %s" % (dd.get_recalibrate(data))) # CWL does not have work/alignment BAM separation if not had_work_bam and dd.get_work_bam(data): data["align_bam"] = dd.get_work_bam(data) if orig_bam != dd.get_work_bam(data) and orig_bam != dd.get_align_bam(data): utils.save_diskspace(orig_bam, "BAM recalibrated to %s" % dd.get_work_bam(data), data["config"]) return data
def merge_bam_files(bam_files, work_dir, config, out_file=None): """Merge multiple BAM files from a sample into a single BAM for processing. Uses samtools or bamtools for merging, both of which have some cavaets. samtools can run into file system limits on command line length, while bamtools runs into open file handle issues. """ if len(bam_files) == 1: return bam_files[0] else: if out_file is None: out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0])) if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"): bamtools = config_utils.get_program("bamtools", config) samtools = config_utils.get_program("samtools", config) resources = config_utils.get_resources("samtools", config) num_cores = config["algorithm"].get("num_cores", 1) max_mem = resources.get("memory", "1G") with utils.curdir_tmpdir() as tmpdir: with utils.chdir(tmpdir): if len(bam_files) < 4096: merge_cl = _samtools_cat(bam_files, tmpdir) else: merge_cl = _bamtools_merge(bam_files) with file_transaction(out_file) as tx_out_file: tx_out_prefix = os.path.splitext(tx_out_file)[0] with utils.tmpfile(dir=work_dir, prefix="bammergelist") as bam_file_list: bam_file_list = "%s.list" % os.path.splitext(out_file)[0] with open(bam_file_list, "w") as out_handle: for f in sorted(bam_files): out_handle.write("%s\n" % f) cmd = (merge_cl + " | " "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}") do.run(cmd.format(**locals()), "Merge bam files", None) for b in bam_files: utils.save_diskspace(b, "BAM merged to %s" % out_file, config) bam.index(out_file, config) return out_file
def merge_bam_files(bam_files, work_dir, config, out_file=None): """Merge multiple BAM files from a sample into a single BAM for processing. Uses bamtools for merging, which handles large numbers of input BAMs. """ if len(bam_files) == 1: return bam_files[0] else: if out_file is None: out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0])) if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"): bamtools = config_utils.get_program("bamtools", config) samtools = config_utils.get_program("samtools", config) resources = config_utils.get_resources("samtools", config) num_cores = config["algorithm"].get("num_cores", 1) max_mem = resources.get("memory", "1G") if len(bam_files) > system.open_file_limit(): raise IOError("More files to merge (%s) then available open file descriptors (%s)\n" "See documentation on tips for changing file limits:\n" "https://bcbio-nextgen.readthedocs.org/en/latest/contents/" "parallel.html#tuning-systems-for-scale" % (len(bam_files), system.open_file_limit())) with file_transaction(out_file) as tx_out_file: tx_out_prefix = os.path.splitext(tx_out_file)[0] with utils.tmpfile(dir=work_dir, prefix="bammergelist") as bam_file_list: bam_file_list = "%s.list" % os.path.splitext(out_file)[0] with open(bam_file_list, "w") as out_handle: for f in sorted(bam_files): out_handle.write("%s\n" % f) cmd = ("{bamtools} merge -list {bam_file_list} | " "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}") do.run(cmd.format(**locals()), "Merge bam files", None) for b in bam_files: utils.save_diskspace(b, "BAM merged to %s" % out_file, config) picard = broad.runner_from_config(config) picard.run_fn("picard_index", out_file) return out_file
def report(self, align_bam, ref_file, is_paired, bait_file, target_file): """Produce report metrics using Picard with sorted aligned BAM file. """ dup_bam, dup_metrics = self._get_current_dup_metrics(align_bam) align_metrics = self._collect_align_metrics(dup_bam, ref_file) # Prefer the GC metrics in FastQC instead of Picard # gc_graph, gc_metrics = self._gc_bias(dup_bam, ref_file) gc_graph = None insert_graph, insert_metrics, hybrid_metrics = (None, None, None) if is_paired: insert_graph, insert_metrics = self._insert_sizes(dup_bam) if bait_file and target_file: hybrid_metrics = self._hybrid_select_metrics(dup_bam, bait_file, target_file) vrn_vals = self._variant_eval_metrics(dup_bam) summary_info = self._parser.get_summary_metrics( align_metrics, dup_metrics, insert_metrics, hybrid_metrics, vrn_vals ) pprint.pprint(summary_info) graphs = [] if gc_graph and os.path.exists(gc_graph): graphs.append((gc_graph, "Distribution of GC content across reads")) if insert_graph and os.path.exists(insert_graph): graphs.append((insert_graph, "Distribution of paired end insert sizes")) # Attempt to clean up potential waste of space if dup_bam != align_bam: config = self._picard._config reason = ( "Picard MarkDuplicates file {} only needed for metrics " "and has been removed to save space".format(dup_bam) ) save_diskspace(dup_bam, reason, config) return summary_info, graphs
def realign_sample(data, region=None, out_file=None): """Realign sample BAM file at indels. """ realigner = data["config"]["algorithm"].get("realign", True) realigner = "gatk" if realigner is True else realigner realign_fn = _realign_approaches[realigner] if realigner else None if realign_fn: logger.info("Realigning %s with %s: %s %s" % (data["name"], realigner, os.path.basename(data["work_bam"]), region)) sam_ref = data["sam_ref"] config = data["config"] if region == "nochr": realign_bam = write_nochr_reads(data["work_bam"], out_file) else: realign_bam = realign_fn(data["work_bam"], sam_ref, config, data["genome_resources"]["variation"]["dbsnp"], region, out_file) if region is None: save_diskspace(data["work_bam"], "Realigned to %s" % realign_bam, config) data["work_bam"] = realign_bam return [data]
def write_recal_bam(data, region=None, out_file=None): """Step 2 of GATK recalibration -- use covariates to re-write output file. """ config = data["config"] if out_file is None: out_file = "%s-gatkrecal.bam" % os.path.splitext(data["work_bam"])[0] logger.info("Writing recalibrated BAM for %s to %s" % (data["name"], out_file)) if region == "nochr": out_bam = write_nochr_reads(data["work_bam"], out_file, data["config"]) else: out_bam = _run_recal_bam(data["work_bam"], data["prep_recal"], region, data["sam_ref"], out_file, config) qual_bin = config["algorithm"].get("quality_bin", None) if ((qual_bin is True or qual_bin == "postrecal" or isinstance(qual_bin, list) and "postrecal" in qual_bin) and has_aligned_reads(out_bam)): binned_bam = cram.illumina_qual_bin(out_bam, data["sam_ref"], os.path.dirname(out_bam), config) shutil.move(out_bam, out_bam + ".binned") shutil.move(binned_bam, out_bam) utils.save_diskspace(out_bam + ".binned", "Quality binned to %s" % out_bam, config) data["work_bam"] = out_bam return [data]
def merge_bam_files(bam_files, work_dir, config, out_file=None): """Merge multiple BAM files from a sample into a single BAM for processing. Uses bamtools for merging, which handles large numbers of input BAMs. """ if len(bam_files) == 1: return bam_files[0] else: if out_file is None: out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0])) if not utils.file_exists(out_file): with file_transaction(out_file) as tx_out_file: with utils.tmpfile(dir=work_dir, prefix="bammergelist") as bam_file_list: with open(bam_file_list, "w") as out_handle: for f in bam_files: out_handle.write("%s\n" % f) cmd = [config_utils.get_program("bamtools", config), "merge", "-list", bam_file_list, "-out", tx_out_file] do.run(cmd, "Merge bam files", None) for b in bam_files: utils.save_diskspace(b, "BAM merged to %s" % out_file, config) picard = broad.runner_from_config(config) picard.run_fn("picard_index", out_file) return out_file
def report(self, align_bam, ref_file, is_paired, bait_file, target_file): """Produce report metrics using Picard with sorted aligned BAM file. """ dup_bam, dup_metrics = self._get_current_dup_metrics(align_bam) align_metrics = self._collect_align_metrics(dup_bam, ref_file) # Prefer the GC metrics in FastQC instead of Picard # gc_graph, gc_metrics = self._gc_bias(dup_bam, ref_file) gc_graph = None insert_graph, insert_metrics, hybrid_metrics = (None, None, None) if is_paired: insert_graph, insert_metrics = self._insert_sizes(dup_bam) if bait_file and target_file: hybrid_metrics = self._hybrid_select_metrics( dup_bam, bait_file, target_file) vrn_vals = self._variant_eval_metrics(dup_bam) summary_info = self._parser.get_summary_metrics(align_metrics, dup_metrics, insert_metrics, hybrid_metrics, vrn_vals) pprint.pprint(summary_info) graphs = [] if gc_graph and os.path.exists(gc_graph): graphs.append((gc_graph, "Distribution of GC content across reads")) if insert_graph and os.path.exists(insert_graph): graphs.append((insert_graph, "Distribution of paired end insert sizes")) # Attempt to clean up potential waste of space if dup_bam != align_bam: config = self._picard._config reason = "Picard MarkDuplicates file {} only needed for metrics " \ "and has been removed to save space".format(dup_bam) save_diskspace(dup_bam, reason, config) return summary_info, graphs
def sam_to_sort_bam(sam_file, ref_file, fastq1, fastq2, sample_name, lane_name, config): """Convert SAM file to merged and sorted BAM file. """ rg_name = lane_name.split("_")[0] picard = broad.runner_from_config(config) platform = config["algorithm"]["platform"] base_dir = os.path.dirname(sam_file) picard.run_fn("picard_index_ref", ref_file) out_fastq_bam = picard.run_fn("picard_fastq_to_bam", fastq1, fastq2, base_dir, platform, sample_name, rg_name, lane_name) out_bam = picard.run_fn("picard_sam_to_bam", sam_file, out_fastq_bam, ref_file, fastq2 is not None) sort_bam = picard.run_fn("picard_sort", out_bam) utils.save_diskspace(sam_file, "SAM converted to BAM", config) utils.save_diskspace(out_fastq_bam, "Combined into output BAM %s" % out_bam, config) utils.save_diskspace(out_bam, "Sorted to %s" % sort_bam, config) return sort_bam
def _save_diskspace(in_file, out_file, config): """Potentially remove input file to save space if configured and in work directory. """ if (os.path.commonprefix([in_file, out_file]).rstrip("/") == os.path.split( os.path.dirname(out_file))[0]): save_diskspace(in_file, "Trimmed to {}".format(out_file), config)
def _save_diskspace(in_file, out_file, config): """Potentially remove input file to save space if configured and in work directory. """ if (os.path.commonprefix([in_file, out_file]).rstrip("/") == os.path.split(os.path.dirname(out_file))[0]): save_diskspace(in_file, "Trimmed to {}".format(out_file), config)