def _sambamba_merge(bam_files):
    """Merge multiple BAM files with sambamba.
    """
    if len(bam_files) > system.open_file_limit():
        raise IOError("More files to merge (%s) than available open file descriptors (%s)\n"
                      "See documentation on tips for changing file limits:\n"
                      "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
                      "parallel.html#tuning-systems-for-scale"
                      % (len(bam_files), system.open_file_limit()))
    return "{sambamba} merge {tx_out_file} -t {num_cores} `cat {tx_bam_file_list}`"
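# Illustrative usage only (a sketch, not part of the original module): the merge_bam_files
# variants below fill the returned template with str.format over their local variables via
# cmd.format(**locals()). The output path and list-file name used here are made up.
def _example_sambamba_merge_usage(bam_files, config):
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    tx_out_file = "sample-merged.bam"        # hypothetical transactional output path
    tx_bam_file_list = "sample-merged.list"  # hypothetical file listing the input BAMs
    with open(tx_bam_file_list, "w") as out_handle:
        for f in sorted(bam_files):
            out_handle.write("%s\n" % f)
    cmd = _sambamba_merge(bam_files)
    do.run(cmd.format(**locals()), "Merge bam files to %s" % tx_out_file, None)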
def _bamtools_merge(bam_files):
    """Use bamtools to merge multiple BAM files, requires a list from disk.
    """
    if len(bam_files) > system.open_file_limit():
        raise IOError("More files to merge (%s) than available open file descriptors (%s)\n"
                      "See documentation on tips for changing file limits:\n"
                      "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
                      "parallel.html#tuning-systems-for-scale"
                      % (len(bam_files), system.open_file_limit()))
    return "{bamtools} merge -list {bam_file_list}"
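# Illustrative usage only (a sketch): the older merge_bam_files variants below pipe this
# template into samtools sort, again filling it with str.format over local variables.
# The list file, sort memory, and output prefix used here are made up.
def _example_bamtools_merge_usage(bam_files, config):
    bamtools = config_utils.get_program("bamtools", config)
    samtools = config_utils.get_program("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = "2G"                   # hypothetical per-thread sort memory
    bam_file_list = "sample.list"    # hypothetical file listing the input BAMs
    tx_out_prefix = "sample-merged"  # hypothetical sorted output prefix
    with open(bam_file_list, "w") as out_handle:
        for f in sorted(bam_files):
            out_handle.write("%s\n" % f)
    cmd = (_bamtools_merge(bam_files) + " | "
           "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
    do.run(cmd.format(**locals()), "Merge bam files", None)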
def _samtools_merge(bam_files):
    """Concatenate multiple BAM files together with samtools.

    Creates short paths to shorten the commandline.
    """
    if len(bam_files) > system.open_file_limit():
        raise IOError("More files to merge (%s) than available open file descriptors (%s)\n"
                      "See documentation on tips for changing file limits:\n"
                      "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
                      "parallel.html#tuning-systems-for-scale"
                      % (len(bam_files), system.open_file_limit()))
    return "{samtools} merge - `cat {tx_bam_file_list}`"
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    if len(bam_files) == 1 and bam.bam_already_sorted(bam_files[0], config, "coordinate"):
        shutil.copy(bam_files[0], out_file)
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if batch is not None:
            base, ext = os.path.splitext(out_file)
            out_file = "%s-b%s%s" % (base, batch, ext)
        if not utils.file_exists(out_file):
            sambamba = config_utils.get_program("sambamba", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                 2, "decrease").upper()
            # sambamba opens 4 handles per file, so try to guess a reasonable batch size
            batch_size = (system.open_file_limit() // 4) - 100
            if len(bam_files) > batch_size:
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(config, out_file) as tx_out_file:
                        with file_transaction(config, "%s.list" % os.path.splitext(out_file)[0]) as tx_bam_file_list:
                            with open(tx_bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            if bam.bam_already_sorted(bam_files[0], config, "coordinate"):
                                cmd = _sambamba_merge(bam_files)
                            else:
                                assert config.get("mark_duplicates", True)
                                cmd = _biobambam_merge_dedup()
                            do.run(cmd.format(**locals()), "Merge bam files to %s" % os.path.basename(out_file),
                                   None)
            # Ensure timestamps are up to date on output file and index
            # Works around issues on systems with inconsistent times
            for ext in ["", ".bai"]:
                if os.path.exists(out_file + ext):
                    subprocess.check_call(["touch", out_file + ext])
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
    bam.index(out_file, config)
    return out_file
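# Worked example of the batching heuristic above (the file-descriptor limit is made up):
# with an open-file limit of 1024, batch_size = (1024 // 4) - 100 = 156, so merging 400
# input BAMs first produces three intermediate "-b0"/"-b1"/"-b2" merges via the recursive
# merge_bam_files calls, and the final merge then combines those three files.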
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    out_file = _merge_outfile_fname(out_file, bam_files, work_dir, batch)
    if not utils.file_exists(out_file):
        if len(bam_files) == 1 and bam.bam_already_sorted(bam_files[0], config, "coordinate"):
            with file_transaction(config, out_file) as tx_out_file:
                _create_merge_filelist(bam_files, tx_out_file, config)
                shutil.copy(bam_files[0], tx_out_file)
            samtools = config_utils.get_program("samtools", config)
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        else:
            # sambamba opens 4 handles per file, so try to guess a reasonable batch size
            batch_size = (system.open_file_limit() // 4) - 100
            if len(bam_files) > batch_size:
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(config, out_file) as tx_out_file:
                        tx_bam_file_list = _create_merge_filelist(bam_files, tx_out_file, config)
                        sambamba = config_utils.get_program("sambamba", config)
                        samtools = config_utils.get_program("samtools", config)
                        resources = config_utils.get_resources("samtools", config)
                        num_cores = config["algorithm"].get("num_cores", 1)
                        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                             2, "decrease").upper()
                        if bam.bam_already_sorted(bam_files[0], config, "coordinate"):
                            cmd = _sambamba_merge(bam_files)
                        else:
                            assert config.get("mark_duplicates", True)
                            cmd = _biobambam_merge_dedup()
                        do.run(cmd.format(**locals()), "Merge bam files to %s" % os.path.basename(out_file),
                               None)
                        do.run('{} quickcheck -v {}'.format(samtools, tx_out_file),
                               "Check for valid merged BAM")
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        _finalize_merge(out_file, bam_files, config)
        bam.index(out_file, config)
    return out_file
def merge_bam_files(bam_files, work_dir, config, out_file=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Uses bamtools for merging, which handles large numbers of input BAMs.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = resources.get("memory", "1G")
            if len(bam_files) > system.open_file_limit():
                raise IOError("More files to merge (%s) than available open file descriptors (%s)\n"
                              "See documentation on tips for changing file limits:\n"
                              "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
                              "parallel.html#tuning-systems-for-scale"
                              % (len(bam_files), system.open_file_limit()))
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                with utils.tmpfile(dir=work_dir, prefix="bammergelist") as bam_file_list:
                    bam_file_list = "%s.list" % os.path.splitext(out_file)[0]
                    with open(bam_file_list, "w") as out_handle:
                        for f in sorted(bam_files):
                            out_handle.write("%s\n" % f)
                    cmd = ("{bamtools} merge -list {bam_file_list} | "
                           "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                    do.run(cmd.format(**locals()), "Merge bam files", None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        picard = broad.runner_from_config(config)
        picard.run_fn("picard_index", out_file)
        return out_file
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if batch is not None:
            base, ext = os.path.splitext(out_file)
            out_file = "%s-b%s%s" % (base, batch, ext)
        if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                 2, "decrease").upper()
            batch_size = system.open_file_limit() - 100
            if len(bam_files) > batch_size:
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with utils.curdir_tmpdir({"config": config}) as tmpdir:
                with utils.chdir(tmpdir):
                    merge_cl = _bamtools_merge(bam_files)
                    with file_transaction(out_file) as tx_out_file:
                        with file_transaction("%s.list" % os.path.splitext(out_file)[0]) as tx_bam_file_list:
                            tx_out_prefix = os.path.splitext(tx_out_file)[0]
                            with open(tx_bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            cmd = (merge_cl + " | "
                                   "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                            do.run(cmd.format(**locals()), "Merge bam files to %s" % os.path.basename(out_file),
                                   None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        bam.index(out_file, config)
        return out_file
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    if len(bam_files) == 1:
        bam.index(bam_files[0], config)
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if batch is not None:
            base, ext = os.path.splitext(out_file)
            out_file = "%s-b%s%s" % (base, batch, ext)
        if not utils.file_exists(out_file):
            sambamba = config_utils.get_program("sambamba", config)
            samtools = config_utils.get_program("samtools", config)
            samblaster = config_utils.get_program("samblaster", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                 2, "decrease").upper()
            # sambamba opens 4 handles per file, so try to guess a reasonable batch size
            batch_size = (system.open_file_limit() // 4) - 100
            if len(bam_files) > batch_size:
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(config, out_file) as tx_out_file:
                        with file_transaction(config, "%s.list" % os.path.splitext(out_file)[0]) as tx_bam_file_list:
                            with open(tx_bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            if bam.bam_already_sorted(bam_files[0], config, "coordinate"):
                                cmd = _sambamba_merge(bam_files)
                            else:
                                assert config.get("mark_duplicates", True)
                                cmd = _biobambam_merge_dedup()
                            do.run(cmd.format(**locals()), "Merge bam files to %s" % os.path.basename(out_file),
                                   None)
            # Ensure timestamps are up to date on output file and index
            # Works around issues on systems with inconsistent times
            for ext in ["", ".bai"]:
                if os.path.exists(out_file + ext):
                    subprocess.check_call(["touch", out_file + ext])
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        bam.index(out_file, config)
        return out_file
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    out_file = _merge_outfile_fname(out_file, bam_files, work_dir, batch)
    if not utils.file_exists(out_file):
        if len(bam_files) == 1 and bam.bam_already_sorted(bam_files[0], config, "coordinate"):
            with file_transaction(config, out_file) as tx_out_file:
                _create_merge_filelist(bam_files, tx_out_file, config)
                shutil.copy(bam_files[0], tx_out_file)
            samtools = config_utils.get_program("samtools", config)
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        else:
            # sambamba opens 4 handles per file, so try to guess a reasonable batch size
            batch_size = (system.open_file_limit() // 4) - 100
            if len(bam_files) > batch_size:
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(config, out_file) as tx_out_file:
                        tx_bam_file_list = _create_merge_filelist(bam_files, tx_out_file, config)
                        sambamba = config_utils.get_program("sambamba", config)
                        samtools = config_utils.get_program("samtools", config)
                        resources = config_utils.get_resources("samtools", config)
                        num_cores = config["algorithm"].get("num_cores", 1)
                        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                             2, "decrease").upper()
                        if bam.bam_already_sorted(bam_files[0], config, "coordinate"):
                            cmd = _sambamba_merge(bam_files)
                        else:
                            # Aim for 3.5Gb/core memory for BAM merging
                            num_cores = config_utils.adjust_cores_to_mb_target(
                                3500, resources.get("memory", "2G"), num_cores)
                            assert config.get("mark_duplicates", True)
                            cmd = _biobambam_merge_dedup()
                        do.run(cmd.format(**locals()), "Merge bam files to %s" % os.path.basename(out_file),
                               None)
                        do.run('{} quickcheck -v {}'.format(samtools, tx_out_file),
                               "Check for valid merged BAM")
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        _finalize_merge(out_file, bam_files, config)
        bam.index(out_file, config)
    return out_file
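# The variants above that call _merge_outfile_fname, _create_merge_filelist and
# _finalize_merge rely on helpers not shown in this section. A minimal sketch of what
# _create_merge_filelist is assumed to do, reconstructed from the inline list-writing
# code in the older versions above; the actual helper may differ:
#
#   def _create_merge_filelist(bam_files, base_file, config):
#       """Write sorted input BAM paths to '<base>.list' and return the list path."""
#       bam_file_list = "%s.list" % os.path.splitext(base_file)[0]
#       with open(bam_file_list, "w") as out_handle:
#           for f in sorted(bam_files):
#               out_handle.write("%s\n" % f)
#       return bam_file_list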
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if batch is not None:
            base, ext = os.path.splitext(out_file)
            out_file = "%s-b%s%s" % (base, batch, ext)
        if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                 2, "decrease")
            batch_size = system.open_file_limit() - 100
            if len(bam_files) > batch_size:
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with utils.curdir_tmpdir({"config": config}) as tmpdir:
                with utils.chdir(tmpdir):
                    merge_cl = _bamtools_merge(bam_files)
                    with file_transaction(out_file) as tx_out_file:
                        with file_transaction("%s.list" % os.path.splitext(out_file)[0]) as tx_bam_file_list:
                            tx_out_prefix = os.path.splitext(tx_out_file)[0]
                            with open(tx_bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            cmd = (merge_cl + " | "
                                   "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                            do.run(cmd.format(**locals()), "Merge bam files to %s" % os.path.basename(out_file),
                                   None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        bam.index(out_file, config)
        return out_file