def merge_bam_files(bam_files, work_dir, config, batch=0):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Avoids too many open file issues by merging large numbers of files in batches.
    """
    max_merge = 500
    bam_files.sort()
    i = 1
    while len(bam_files) > max_merge:
        bam_files = [merge_bam_files(xs, work_dir, config, batch + i)
                     for xs in utils.partition_all(max_merge, bam_files)]
        i += 1
    if batch > 0:
        out_dir = utils.safe_makedir(os.path.join(work_dir, "batchmerge%s" % batch))
    else:
        out_dir = work_dir
    out_file = os.path.join(out_dir, os.path.basename(sorted(bam_files)[0]))
    picard = broad.runner_from_config(config)
    if len(bam_files) == 1:
        if not os.path.exists(out_file):
            os.symlink(bam_files[0], out_file)
    else:
        picard.run_fn("picard_merge", bam_files, out_file)
        for b in bam_files:
            utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
    return out_file
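# Standalone sketch (not bcbio code) of the batching arithmetic above: an
# oversized file list is repeatedly partitioned into groups of at most
# `max_merge`, each group is merged, and the loop recurses on the merged
# outputs until a single pass can handle them. `_partition_all` and
# `_fake_merge` are hypothetical stand-ins for utils.partition_all and the
# picard_merge call.

def _partition_all(n, items):
    """Yield successive chunks of at most n items."""
    items = list(items)
    for i in range(0, len(items), n):
        yield items[i:i + n]

def _fake_merge(group, batch):
    """Stand-in merge: name the output after the batch and first input."""
    return "batchmerge%s/%s" % (batch, sorted(group)[0])

def _batched_merge(files, max_merge=500):
    i = 1
    while len(files) > max_merge:
        files = [_fake_merge(group, i) for group in _partition_all(max_merge, files)]
        i += 1
    return files

print(_batched_merge(["%04d.bam" % n for n in range(1200)]))
# 1200 inputs -> 3 first-pass merges, small enough for one final merge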
def split_bam_file(bam_file, split_size, out_dir, config):
    """Split a BAM file into paired end fastq splits based on split size.

    XXX Need to generalize for non-paired end inputs.
    """
    existing = _find_current_bam_split(bam_file, out_dir)
    if len(existing) > 0:
        return existing
    pipe = True
    utils.safe_makedir(out_dir)
    broad_runner = broad.runner_from_config(config)
    out_files = []

    def new_handle(num):
        out = []
        for pair in [1, 2]:
            fname = os.path.join(out_dir, "{base}_{pair}_{num}.fastq".format(
                base=os.path.splitext(os.path.basename(bam_file))[0], pair=pair, num=num))
            out += [fname, open(fname, "w")]
        return out

    with utils.curdir_tmpdir(base_dir=config_utils.get_resources("tmp", config).get("dir")) as tmp_dir:
        if pipe:
            sort_file = os.path.join(tmp_dir, "%s-sort.bam" %
                                     os.path.splitext(os.path.basename(bam_file))[0])
            os.mkfifo(sort_file)
            broad_runner.run_fn("picard_sort", bam_file, "queryname",
                                sort_file, compression_level=0, pipe=True)
        else:
            sort_file = os.path.join(out_dir, "%s-sort.bam" %
                                     os.path.splitext(os.path.basename(bam_file))[0])
            broad_runner.run_fn("picard_sort", bam_file, "queryname", sort_file)
        samfile = pysam.Samfile(sort_file, "rb")
        i = 0
        num = 0
        f1, out_handle1, f2, out_handle2 = new_handle(num)
        out_files.append([f1, f2, None])
        for x1, x2 in utils.partition_all(2, samfile):
            x1_seq, x1_qual = _get_seq_qual(x1)
            out_handle1.write("@%s/1\n%s\n+\n%s\n" % (i, x1_seq, x1_qual))
            x2_seq, x2_qual = _get_seq_qual(x2)
            out_handle2.write("@%s/2\n%s\n+\n%s\n" % (i, x2_seq, x2_qual))
            i += 1
            if i % split_size == 0:
                num += 1
                out_handle1.close()
                out_handle2.close()
                f1, out_handle1, f2, out_handle2 = new_handle(num)
                out_files.append([f1, f2, num])
        out_handle1.close()
        out_handle2.close()
        samfile.close()
        if pipe:
            os.unlink(sort_file)
        else:
            utils.save_diskspace(sort_file, "Split to {}".format(out_files[0][0]), config)
    return out_files
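# Minimal sketch (assumed, not bcbio's helpers) of the FASTQ records written
# above: a queryname-sorted BAM yields mates as adjacent records, and each
# pair becomes one "@<index>/1" and one "@<index>/2" four-line FASTQ entry.

import io

def _write_pair(handle1, handle2, index, read1, read2):
    """Write one read pair as two four-line FASTQ records."""
    seq1, qual1 = read1
    seq2, qual2 = read2
    handle1.write("@%s/1\n%s\n+\n%s\n" % (index, seq1, qual1))
    handle2.write("@%s/2\n%s\n+\n%s\n" % (index, seq2, qual2))

h1, h2 = io.StringIO(), io.StringIO()
_write_pair(h1, h2, 0, ("ACGT", "IIII"), ("TGCA", "IIII"))
print(h1.getvalue() + h2.getvalue())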
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    # Resolve the output name up front so the single-file copy branch below
    # always has a valid destination, even when out_file is not supplied.
    if out_file is None:
        out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
    if batch is not None:
        base, ext = os.path.splitext(out_file)
        out_file = "%s-b%s%s" % (base, batch, ext)
    if len(bam_files) == 1 and bam.bam_already_sorted(bam_files[0], config, "coordinate"):
        shutil.copy(bam_files[0], out_file)
    else:
        if not utils.file_exists(out_file):
            sambamba = config_utils.get_program("sambamba", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                 2, "decrease").upper()
            # sambamba opens 4 handles per file, so try to guess a reasonable batch size
            batch_size = (system.open_file_limit() // 4) - 100
            if len(bam_files) > batch_size:
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(config, out_file) as tx_out_file:
                        with file_transaction(config, "%s.list" % os.path.splitext(out_file)[0]) as tx_bam_file_list:
                            with open(tx_bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            if bam.bam_already_sorted(bam_files[0], config, "coordinate"):
                                cmd = _sambamba_merge(bam_files)
                            else:
                                assert config.get("mark_duplicates", True)
                                cmd = _biobambam_merge_dedup()
                            do.run(cmd.format(**locals()),
                                   "Merge bam files to %s" % os.path.basename(out_file), None)
                    # Ensure timestamps are up to date on output file and index
                    # Works around issues on systems with inconsistent times
                    for ext in ["", ".bai"]:
                        if os.path.exists(out_file + ext):
                            subprocess.check_call(["touch", out_file + ext])
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
    bam.index(out_file, config)
    return out_file
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    out_file = _merge_outfile_fname(out_file, bam_files, work_dir, batch)
    if not utils.file_exists(out_file):
        if len(bam_files) == 1 and bam.bam_already_sorted(bam_files[0], config, "coordinate"):
            with file_transaction(config, out_file) as tx_out_file:
                _create_merge_filelist(bam_files, tx_out_file, config)
                shutil.copy(bam_files[0], tx_out_file)
            samtools = config_utils.get_program("samtools", config)
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        else:
            # sambamba opens 4 handles per file, so try to guess a reasonable batch size
            batch_size = (system.open_file_limit() // 4) - 100
            if len(bam_files) > batch_size:
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(config, out_file) as tx_out_file:
                        tx_bam_file_list = _create_merge_filelist(bam_files, tx_out_file, config)
                        sambamba = config_utils.get_program("sambamba", config)
                        samtools = config_utils.get_program("samtools", config)
                        resources = config_utils.get_resources("samtools", config)
                        num_cores = config["algorithm"].get("num_cores", 1)
                        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                             2, "decrease").upper()
                        if bam.bam_already_sorted(bam_files[0], config, "coordinate"):
                            cmd = _sambamba_merge(bam_files)
                        else:
                            assert config.get("mark_duplicates", True)
                            cmd = _biobambam_merge_dedup()
                        do.run(cmd.format(**locals()),
                               "Merge bam files to %s" % os.path.basename(out_file), None)
                        do.run('{} quickcheck -v {}'.format(samtools, tx_out_file),
                               "Check for valid merged BAM")
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        _finalize_merge(out_file, bam_files, config)
    bam.index(out_file, config)
    return out_file
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if batch is not None:
            base, ext = os.path.splitext(out_file)
            out_file = "%s-b%s%s" % (base, batch, ext)
        if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                 2, "decrease").upper()
            batch_size = system.open_file_limit() - 100
            if len(bam_files) > batch_size:
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with utils.curdir_tmpdir({"config": config}) as tmpdir:
                with utils.chdir(tmpdir):
                    merge_cl = _bamtools_merge(bam_files)
                    with file_transaction(out_file) as tx_out_file:
                        with file_transaction("%s.list" % os.path.splitext(out_file)[0]) as tx_bam_file_list:
                            tx_out_prefix = os.path.splitext(tx_out_file)[0]
                            with open(tx_bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            cmd = (merge_cl + " | "
                                   "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                            do.run(cmd.format(**locals()),
                                   "Merge bam files to %s" % os.path.basename(out_file), None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        bam.index(out_file, config)
        return out_file
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    if len(bam_files) == 1:
        bam.index(bam_files[0], config)
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if batch is not None:
            base, ext = os.path.splitext(out_file)
            out_file = "%s-b%s%s" % (base, batch, ext)
        if not utils.file_exists(out_file):
            sambamba = config_utils.get_program("sambamba", config)
            samtools = config_utils.get_program("samtools", config)
            samblaster = config_utils.get_program("samblaster", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                 2, "decrease").upper()
            # sambamba opens 4 handles per file, so try to guess a reasonable batch size
            batch_size = (system.open_file_limit() // 4) - 100
            if len(bam_files) > batch_size:
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(config, out_file) as tx_out_file:
                        with file_transaction(config, "%s.list" % os.path.splitext(out_file)[0]) as tx_bam_file_list:
                            with open(tx_bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            if bam.bam_already_sorted(bam_files[0], config, "coordinate"):
                                cmd = _sambamba_merge(bam_files)
                            else:
                                assert config.get("mark_duplicates", True)
                                cmd = _biobambam_merge_dedup()
                            do.run(cmd.format(**locals()),
                                   "Merge bam files to %s" % os.path.basename(out_file), None)
                    # Ensure timestamps are up to date on output file and index
                    # Works around issues on systems with inconsistent times
                    for ext in ["", ".bai"]:
                        if os.path.exists(out_file + ext):
                            subprocess.check_call(["touch", out_file + ext])
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        bam.index(out_file, config)
        return out_file
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    out_file = _merge_outfile_fname(out_file, bam_files, work_dir, batch)
    if not utils.file_exists(out_file):
        if len(bam_files) == 1 and bam.bam_already_sorted(bam_files[0], config, "coordinate"):
            with file_transaction(config, out_file) as tx_out_file:
                _create_merge_filelist(bam_files, tx_out_file, config)
                shutil.copy(bam_files[0], tx_out_file)
            samtools = config_utils.get_program("samtools", config)
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        else:
            # sambamba opens 4 handles per file, so try to guess a reasonable batch size
            batch_size = (system.open_file_limit() // 4) - 100
            if len(bam_files) > batch_size:
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(config, out_file) as tx_out_file:
                        tx_bam_file_list = _create_merge_filelist(bam_files, tx_out_file, config)
                        sambamba = config_utils.get_program("sambamba", config)
                        samtools = config_utils.get_program("samtools", config)
                        resources = config_utils.get_resources("samtools", config)
                        num_cores = config["algorithm"].get("num_cores", 1)
                        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                             2, "decrease").upper()
                        if bam.bam_already_sorted(bam_files[0], config, "coordinate"):
                            cmd = _sambamba_merge(bam_files)
                        else:
                            # Aim for 3.5Gb/core memory for BAM merging
                            num_cores = config_utils.adjust_cores_to_mb_target(
                                3500, resources.get("memory", "2G"), num_cores)
                            assert config.get("mark_duplicates", True)
                            cmd = _biobambam_merge_dedup()
                        do.run(cmd.format(**locals()),
                               "Merge bam files to %s" % os.path.basename(out_file), None)
                        do.run('{} quickcheck -v {}'.format(samtools, tx_out_file),
                               "Check for valid merged BAM")
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        _finalize_merge(out_file, bam_files, config)
    bam.index(out_file, config)
    return out_file
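# Standalone sketch (an assumption, not bcbio's system.open_file_limit) of the
# batch-size heuristic above: sambamba keeps roughly four descriptors per
# input BAM, so divide the soft descriptor limit by four and leave a safety
# margin. Unix-only, since it relies on the resource module.

import resource

def _open_file_limit():
    """Soft limit on open file descriptors for the current process."""
    return resource.getrlimit(resource.RLIMIT_NOFILE)[0]

def _merge_batch_size(margin=100):
    return (_open_file_limit() // 4) - margin

print(_merge_batch_size())  # e.g. 156 with the common soft limit of 1024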
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run platypus variant calling, germline whole genome or exome.
    """
    assert out_file.endswith(".vcf.gz")
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, items[0]["config"])
            cmd = ["platypus", "callVariants",
                   "--regions=%s" % _subset_regions(region, out_file, items),
                   "--bamFiles=%s" % ",".join(align_bams),
                   "--refFile=%s" % dd.get_ref_file(items[0]),
                   "--output=-", "--logFileName", "/dev/null", "--verbosity=1"]
            resources = config_utils.get_resources("platypus", items[0]["config"])
            if resources.get("options"):
                # normalize options so we can set defaults without overwriting user specified
                for opt in resources["options"]:
                    if "=" in opt:
                        key, val = opt.split("=")
                        cmd.extend([key, val])
                    else:
                        cmd.append(opt)
            # Adjust default filter thresholds to achieve similar sensitivity/specificity to other callers
            tuned_opts = ["--hapScoreThreshold", "10", "--scThreshold", "0.99",
                          "--filteredReadsFrac", "0.9", "--rmsmqThreshold", "20",
                          "--qdThreshold", "0", "--abThreshold", "0.0001",
                          "--minVarFreq", "0.0", "--assemble", "1"]
            for okey, oval in utils.partition_all(2, tuned_opts):
                if okey not in cmd:
                    cmd.extend([okey, oval])
            # Avoid filtering duplicates on high depth targeted regions where we don't mark duplicates
            if any(not tz.get_in(["config", "algorithm", "mark_duplicates"], data, True)
                   for data in items):
                cmd += ["--filterDuplicates=0"]
            post_process_cmd = (" | %s | vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | "
                                "vcfstreamsort | bgzip -c > %s" % (vcfutils.fix_ambiguous_cl(), tx_out_file))
            do.run(" ".join(cmd) + post_process_cmd, "platypus variant calling")
        out_file = vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file
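# Minimal sketch (not the bcbio helper itself) of the option-merging rule used
# above: user-supplied flags go on the command line first, then each tuned
# default is appended only if its key is not already present.

def _add_tuned_defaults(cmd, tuned_opts):
    """Append (key, value) defaults whose keys are not already in cmd."""
    for okey, oval in zip(tuned_opts[::2], tuned_opts[1::2]):
        if okey not in cmd:
            cmd.extend([okey, oval])
    return cmd

example_cmd = ["platypus", "callVariants", "--scThreshold", "0.95"]
print(_add_tuned_defaults(example_cmd, ["--scThreshold", "0.99", "--assemble", "1"]))
# user-specified --scThreshold 0.95 is kept; --assemble 1 is added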
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if batch is not None:
            base, ext = os.path.splitext(out_file)
            out_file = "%s-b%s%s" % (base, batch, ext)
        if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), 2, "decrease")
            batch_size = system.open_file_limit() - 100
            if len(bam_files) > batch_size:
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with utils.curdir_tmpdir({"config": config}) as tmpdir:
                with utils.chdir(tmpdir):
                    merge_cl = _bamtools_merge(bam_files)
                    with file_transaction(out_file) as tx_out_file:
                        with file_transaction("%s.list" % os.path.splitext(out_file)[0]) as tx_bam_file_list:
                            tx_out_prefix = os.path.splitext(tx_out_file)[0]
                            with open(tx_bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            cmd = (merge_cl + " | "
                                   "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                            do.run(cmd.format(**locals()),
                                   "Merge bam files to %s" % os.path.basename(out_file), None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        bam.index(out_file, config)
        return out_file
def _combine_variants(in_vcfs, out_file, ref_file, config):
    """Combine variant files, batching to avoid problematic large commandlines.
    """
    max_batch = 500
    if len(in_vcfs) > max_batch:
        new_vcfs = []
        for i, batch_vcfs in enumerate(partition_all(max_batch, in_vcfs)):
            path, fname = os.path.split(out_file)
            batch_path = safe_makedir(os.path.join(path, "batch"))
            base, ext = os.path.splitext(fname)
            cur_out = os.path.join(batch_path, "{0}-batch{1}{2}".format(base, i, ext))
            for x in batch_vcfs:
                with open(x) as in_handle:
                    if not in_handle.readline().startswith("##fileformat=VCFv4"):
                        raise ValueError("Unexpected VCF file: %s" % x)
            combine_variant_files(batch_vcfs, cur_out, ref_file, config)
            new_vcfs.append(cur_out)
        in_vcfs = new_vcfs
    assert len(in_vcfs) <= max_batch
    combine_variant_files(in_vcfs, out_file, ref_file, config)
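# Minimal sketch (hypothetical paths) of the per-batch output naming used
# above: each group of VCFs is combined into "<base>-batch<i><ext>" inside a
# "batch" subdirectory next to the final output file. The real code also
# creates that directory with safe_makedir; this sketch only builds the name.

import os

def _batch_out_name(out_file, i):
    path, fname = os.path.split(out_file)
    base, ext = os.path.splitext(fname)
    return os.path.join(path, "batch", "{0}-batch{1}{2}".format(base, i, ext))

print(_batch_out_name("/work/variants/sample-combined.vcf", 3))
# -> /work/variants/batch/sample-combined-batch3.vcf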