Exemple #1
0
def _sambamba_merge(bam_files):
    """Return the sambamba command template used to merge BAM files.

    Raises IOError when the number of inputs exceeds the value reported by
    ``system.open_file_limit()``. The returned string is a ``str.format``
    template with ``sambamba``, ``tx_out_file``, ``num_cores`` and
    ``tx_bam_file_list`` placeholders that the caller fills in.
    """
    if len(bam_files) > system.open_file_limit():
        message = ("More files to merge (%s) than available open file descriptors (%s)\n"
                   "See documentation on tips for changing file limits:\n"
                   "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
                   "parallel.html#tuning-systems-for-scale")
        raise IOError(message % (len(bam_files), system.open_file_limit()))
    template = "{sambamba} merge {tx_out_file} -t {num_cores} `cat {tx_bam_file_list}`"
    return template
Exemple #2
0
def _sambamba_merge(bam_files):
    """Build the command line template for a sambamba BAM merge.

    The inputs themselves are not placed on the command line; the template
    expands the ``tx_bam_file_list`` file with ``cat``. An IOError is raised
    if there are more inputs than ``system.open_file_limit()`` allows.
    """
    too_many_inputs = len(bam_files) > system.open_file_limit()
    if too_many_inputs:
        counts = (len(bam_files), system.open_file_limit())
        raise IOError(
            "More files to merge (%s) than available open file descriptors (%s)\n"
            "See documentation on tips for changing file limits:\n"
            "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
            "parallel.html#tuning-systems-for-scale" % counts)
    return "{sambamba} merge {tx_out_file} -t {num_cores} `cat {tx_bam_file_list}`"
Exemple #3
0
def _bamtools_merge(bam_files):
    """Use bamtools to merge multiple BAM files, requires a list from disk.

    Raises IOError when there are more inputs than the open file limit
    reported by ``system.open_file_limit()``.

    Returns a ``str.format`` template with ``bamtools`` and ``bam_file_list``
    placeholders; the caller supplies the program path and the path of the
    file listing the inputs.
    """
    if len(bam_files) > system.open_file_limit():
        # Message wording fixed: "than", not "then".
        raise IOError("More files to merge (%s) than available open file descriptors (%s)\n"
                      "See documentation on tips for changing file limits:\n"
                      "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
                      "parallel.html#tuning-systems-for-scale"
                      % (len(bam_files), system.open_file_limit()))
    return "{bamtools} merge -list {bam_file_list}"
Exemple #4
0
def _bamtools_merge(bam_files):
    """Return the bamtools merge command template, reading inputs from a list file.

    An IOError is raised when the number of inputs is larger than
    ``system.open_file_limit()``. The template has ``bamtools`` and
    ``bam_file_list`` placeholders for the caller to fill in.
    """
    if len(bam_files) > system.open_file_limit():
        counts = (len(bam_files), system.open_file_limit())
        error_text = (
            "More files to merge (%s) than available open file descriptors (%s)\n"
            "See documentation on tips for changing file limits:\n"
            "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
            "parallel.html#tuning-systems-for-scale")
        raise IOError(error_text % counts)
    merge_template = "{bamtools} merge -list {bam_file_list}"
    return merge_template
Exemple #5
0
def _samtools_merge(bam_files):
    """Return the samtools command template for merging BAM files.

    The template passes ``-`` as the output argument and expands the input
    names from the ``tx_bam_file_list`` file with ``cat``, which keeps the
    command line short. Raises IOError when there are more inputs than
    ``system.open_file_limit()``.
    """
    if len(bam_files) > system.open_file_limit():
        message = ("More files to merge (%s) than available open file descriptors (%s)\n"
                   "See documentation on tips for changing file limits:\n"
                   "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
                   "parallel.html#tuning-systems-for-scale")
        raise IOError(message % (len(bam_files), system.open_file_limit()))
    template = "{samtools} merge - `cat {tx_bam_file_list}`"
    return template
Exemple #6
0
def _samtools_merge(bam_files):
    """Build the samtools merge command template.

    Input names are read from the ``tx_bam_file_list`` file via ``cat``
    instead of being spelled out on the command line. An IOError is raised
    if the inputs outnumber ``system.open_file_limit()``.
    """
    exceeds_limit = len(bam_files) > system.open_file_limit()
    if exceeds_limit:
        counts = (len(bam_files), system.open_file_limit())
        raise IOError("More files to merge (%s) than available open file descriptors (%s)\n"
                      "See documentation on tips for changing file limits:\n"
                      "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
                      "parallel.html#tuning-systems-for-scale" % counts)
    return "{samtools} merge - `cat {tx_bam_file_list}`"
Exemple #7
0
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.

    The output name is resolved before anything is written: without an
    explicit ``out_file`` it is the basename of the first (sorted) input placed
    in ``work_dir``, and a ``-b<batch>`` suffix is inserted before the
    extension when ``batch`` is given. An output that already exists is
    reused instead of being regenerated.

    bam_files -- paths of the BAM files to merge
    work_dir -- directory used for the default output location
    config -- configuration dictionary; ``config["algorithm"]`` and the
        optional ``mark_duplicates`` key are read here
    out_file -- optional path for the merged BAM
    batch -- optional batch identifier, passed on recursive batch merges

    Returns the path of the merged BAM after ``bam.index`` has been called
    on it.
    """
    # Resolve the output name first. Previously the single-file branch copied
    # to ``out_file`` before it was defaulted (failing for out_file=None) and
    # ignored ``batch``, so a one-file batch was copied onto the final output.
    if out_file is None:
        out_file = os.path.join(work_dir,
                                os.path.basename(sorted(bam_files)[0]))
    if batch is not None:
        base, ext = os.path.splitext(out_file)
        out_file = "%s-b%s%s" % (base, batch, ext)
    if not utils.file_exists(out_file):
        if len(bam_files) == 1 and bam.bam_already_sorted(bam_files[0], config,
                                                          "coordinate"):
            # Nothing to merge: copy the single sorted input, writing through
            # file_transaction like the merge branch below rather than straight
            # onto the final path.
            with file_transaction(config, out_file) as tx_out_file:
                shutil.copy(bam_files[0], tx_out_file)
        else:
            # NOTE: the command templates are filled from local variable names
            # via locals(), so the names below must stay as they are.
            sambamba = config_utils.get_program("sambamba", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                 2, "decrease").upper()
            # sambamba opens 4 handles per file, so try to guess a reasonable batch size
            batch_size = (system.open_file_limit() // 4) - 100
            if len(bam_files) > batch_size:
                # Merge each partition separately, then merge the batch outputs.
                bam_files = [
                    merge_bam_files(xs, work_dir, config, out_file, i)
                    for i, xs in enumerate(
                        utils.partition_all(batch_size, bam_files))
                ]
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(config, out_file) as tx_out_file:
                        with file_transaction(
                                config,
                                "%s.list" % os.path.splitext(out_file)[0]
                        ) as tx_bam_file_list:
                            # One input path per line, read back by the command.
                            with open(tx_bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            if bam.bam_already_sorted(bam_files[0], config,
                                                      "coordinate"):
                                cmd = _sambamba_merge(bam_files)
                            else:
                                assert config.get("mark_duplicates", True)
                                cmd = _biobambam_merge_dedup()
                            do.run(
                                cmd.format(**locals()),
                                "Merge bam files to %s" %
                                os.path.basename(out_file), None)
            # Ensure timestamps are up to date on output file and index
            # Works around issues on systems with inconsistent times
            for ext in ["", ".bai"]:
                if os.path.exists(out_file + ext):
                    subprocess.check_call(["touch", out_file + ext])
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
    bam.index(out_file, config)
    return out_file
Exemple #8
0
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.

    The output name comes from ``_merge_outfile_fname``. Nothing is regenerated
    when that file already exists. A single coordinate-sorted input is copied;
    otherwise the inputs are merged with the sambamba or biobambam command
    template. Outputs are checked with ``samtools quickcheck``.

    Returns the output path after ``bam.index`` has been called on it.
    """
    out_file = _merge_outfile_fname(out_file, bam_files, work_dir, batch)
    if not utils.file_exists(out_file):
        if len(bam_files) == 1 and bam.bam_already_sorted(
                bam_files[0], config, "coordinate"):
            # Single sorted input: copy it into place inside a transaction.
            with file_transaction(config, out_file) as tx_out_file:
                # Return value is not needed on this path.
                _create_merge_filelist(bam_files, tx_out_file, config)
                shutil.copy(bam_files[0], tx_out_file)
            samtools = config_utils.get_program("samtools", config)
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        else:
            # sambamba opens 4 handles per file, so try to guess a reasonable batch size
            # NOTE(review): this is zero or negative when the limit is 400 or
            # lower -- confirm how utils.partition_all handles that.
            batch_size = (system.open_file_limit() // 4) - 100
            if len(bam_files) > batch_size:
                # Merge each partition recursively; the batch outputs then
                # become the inputs of the merge below.
                bam_files = [
                    merge_bam_files(xs, work_dir, config, out_file, i)
                    for i, xs in enumerate(
                        utils.partition_all(batch_size, bam_files))
                ]
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(config, out_file) as tx_out_file:
                        tx_bam_file_list = _create_merge_filelist(
                            bam_files, tx_out_file, config)
                        # These locals are consumed by name through
                        # cmd.format(**locals()) below; do not rename them.
                        sambamba = config_utils.get_program("sambamba", config)
                        samtools = config_utils.get_program("samtools", config)
                        resources = config_utils.get_resources(
                            "samtools", config)
                        num_cores = config["algorithm"].get("num_cores", 1)
                        max_mem = config_utils.adjust_memory(
                            resources.get("memory", "1G"), 2,
                            "decrease").upper()
                        if bam.bam_already_sorted(bam_files[0], config,
                                                  "coordinate"):
                            cmd = _sambamba_merge(bam_files)
                        else:
                            # The non-sorted path is only taken when duplicate
                            # marking is enabled (the default).
                            assert config.get("mark_duplicates", True)
                            cmd = _biobambam_merge_dedup()
                        do.run(
                            cmd.format(**locals()), "Merge bam files to %s" %
                            os.path.basename(out_file), None)
                        # Validate the transactional output before it is moved.
                        do.run(
                            '{} quickcheck -v {}'.format(
                                samtools, tx_out_file),
                            "Check for valid merged BAM")
            # Validate again once the transaction has finished.
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
            _finalize_merge(out_file, bam_files, config)
    bam.index(out_file, config)
    return out_file
Exemple #9
0
def merge_bam_files(bam_files, work_dir, config, out_file=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Uses bamtools for merging, which handles large numbers of input BAMs.

    A single input is returned unchanged. Otherwise the inputs are merged with
    bamtools, piped through ``samtools sort`` and indexed with the Picard
    runner's ``picard_index``. The merge is skipped when both the output and
    its ``.bai`` index already exist.

    bam_files -- paths of the BAM files to merge
    work_dir -- directory used for the default output location
    config -- configuration dictionary
    out_file -- optional output path; defaults to the basename of the first
        (sorted) input inside ``work_dir``

    Raises IOError when there are more inputs than
    ``system.open_file_limit()``.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir,
                                    os.path.basename(sorted(bam_files)[0]))
        if not utils.file_exists(out_file) or not utils.file_exists(out_file +
                                                                    ".bai"):
            # These locals are consumed by name through cmd.format(**locals()).
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = resources.get("memory", "1G")
            if len(bam_files) > system.open_file_limit():
                # Message wording fixed: "than", not "then".
                raise IOError(
                    "More files to merge (%s) than available open file descriptors (%s)\n"
                    "See documentation on tips for changing file limits:\n"
                    "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
                    "parallel.html#tuning-systems-for-scale" %
                    (len(bam_files), system.open_file_limit()))
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                with utils.tmpfile(dir=work_dir,
                                   prefix="bammergelist") as bam_file_list:
                    # NOTE(review): the name yielded by utils.tmpfile is
                    # replaced right away, so the list is written next to
                    # out_file -- confirm this is intended.
                    bam_file_list = "%s.list" % os.path.splitext(out_file)[0]
                    with open(bam_file_list, "w") as out_handle:
                        for f in sorted(bam_files):
                            out_handle.write("%s\n" % f)
                    cmd = (
                        "{bamtools} merge -list {bam_file_list} | "
                        "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}"
                    )
                    do.run(cmd.format(**locals()), "Merge bam files", None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        picard = broad.runner_from_config(config)
        picard.run_fn("picard_index", out_file)
        return out_file
Exemple #10
0
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.

    A single input is returned unchanged. Otherwise the inputs are merged with
    the ``_bamtools_merge`` command piped through ``samtools sort``, and the
    result is indexed with ``bam.index``. The merge is skipped when both the
    output and its ``.bai`` index already exist.

    bam_files -- paths of the BAM files to merge
    work_dir -- directory used for the default output location
    config -- configuration dictionary
    out_file -- optional output path; defaults to the basename of the first
        (sorted) input inside ``work_dir``
    batch -- optional batch identifier; adds a ``-b<batch>`` suffix before the
        extension of the output name
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir,
                                    os.path.basename(sorted(bam_files)[0]))
        if batch is not None:
            base, ext = os.path.splitext(out_file)
            out_file = "%s-b%s%s" % (base, batch, ext)
        if not utils.file_exists(out_file) or not utils.file_exists(out_file +
                                                                    ".bai"):
            # These locals are consumed by name through cmd.format(**locals()).
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                 2, "decrease").upper()
            batch_size = system.open_file_limit() - 100
            if len(bam_files) > batch_size:
                # Merge each partition recursively, then merge the batch outputs.
                bam_files = [
                    merge_bam_files(xs, work_dir, config, out_file, i)
                    for i, xs in enumerate(
                        utils.partition_all(batch_size, bam_files))
                ]
            with utils.curdir_tmpdir({"config": config}) as tmpdir:
                with utils.chdir(tmpdir):
                    merge_cl = _bamtools_merge(bam_files)
                    with file_transaction(out_file) as tx_out_file:
                        with file_transaction("%s.list" %
                                              os.path.splitext(out_file)[0]
                                              ) as tx_bam_file_list:
                            tx_out_prefix = os.path.splitext(tx_out_file)[0]
                            # The bamtools template names the list file
                            # {bam_file_list}; without this alias the
                            # locals()-based format below raises KeyError.
                            bam_file_list = tx_bam_file_list
                            with open(tx_bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            cmd = (
                                merge_cl + " | "
                                "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}"
                            )
                            do.run(
                                cmd.format(**locals()),
                                "Merge bam files to %s" %
                                os.path.basename(out_file), None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        bam.index(out_file, config)
        return out_file
Exemple #11
0
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.

    A single input is indexed and returned as is. Otherwise the output name
    defaults to the basename of the first (sorted) input inside ``work_dir``,
    with a ``-b<batch>`` suffix when ``batch`` is given, and the merge is
    skipped if that file already exists.

    Returns the path of the indexed BAM.
    """
    if len(bam_files) == 1:
        bam.index(bam_files[0], config)
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if batch is not None:
            base, ext = os.path.splitext(out_file)
            out_file = "%s-b%s%s" % (base, batch, ext)
        if not utils.file_exists(out_file):
            # These locals are consumed by name through cmd.format(**locals())
            # below; do not rename them. Some (e.g. samblaster) are not used
            # directly here -- presumably the biobambam template needs them.
            sambamba = config_utils.get_program("sambamba", config)
            samtools = config_utils.get_program("samtools", config)
            samblaster = config_utils.get_program("samblaster", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                 2, "decrease").upper()
            # sambamba opens 4 handles per file, so try to guess a reasonable batch size
            # NOTE(review): this is zero or negative when the limit is 400 or
            # lower -- confirm how utils.partition_all handles that.
            batch_size = (system.open_file_limit() // 4) - 100
            if len(bam_files) > batch_size:
                # Merge each partition recursively; the batch outputs then
                # become the inputs of the merge below.
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(config, out_file) as tx_out_file:
                        with file_transaction(config, "%s.list" % os.path.splitext(out_file)[0]) as tx_bam_file_list:
                            # One input path per line, read back by the command.
                            with open(tx_bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            if bam.bam_already_sorted(bam_files[0], config, "coordinate"):
                                cmd = _sambamba_merge(bam_files)
                            else:
                                # The non-sorted path is only taken when
                                # duplicate marking is enabled (the default).
                                assert config.get("mark_duplicates", True)
                                cmd = _biobambam_merge_dedup()
                            do.run(cmd.format(**locals()), "Merge bam files to %s" % os.path.basename(out_file),
                                   None)
            # Ensure timestamps are up to date on output file and index
            # Works around issues on systems with inconsistent times
            for ext in ["", ".bai"]:
                if os.path.exists(out_file + ext):
                    subprocess.check_call(["touch", out_file + ext])
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        bam.index(out_file, config)
        return out_file
Exemple #12
0
def merge_bam_files(bam_files, work_dir, config, out_file=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Uses bamtools for merging, which handles large numbers of input BAMs.

    A single input is returned unchanged. Otherwise the inputs are merged with
    bamtools, piped through ``samtools sort`` and indexed with the Picard
    runner's ``picard_index``. The merge is skipped when both the output and
    its ``.bai`` index already exist.

    bam_files -- paths of the BAM files to merge
    work_dir -- directory used for the default output location
    config -- configuration dictionary
    out_file -- optional output path; defaults to the basename of the first
        (sorted) input inside ``work_dir``

    Raises IOError when there are more inputs than
    ``system.open_file_limit()``.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"):
            # These locals are consumed by name through cmd.format(**locals()).
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = resources.get("memory", "1G")
            if len(bam_files) > system.open_file_limit():
                # Message wording fixed: "than", not "then".
                raise IOError("More files to merge (%s) than available open file descriptors (%s)\n"
                              "See documentation on tips for changing file limits:\n"
                              "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
                              "parallel.html#tuning-systems-for-scale"
                              % (len(bam_files), system.open_file_limit()))
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                with utils.tmpfile(dir=work_dir, prefix="bammergelist") as bam_file_list:
                    # NOTE(review): the name yielded by utils.tmpfile is
                    # replaced right away, so the list is written next to
                    # out_file -- confirm this is intended.
                    bam_file_list = "%s.list" % os.path.splitext(out_file)[0]
                    with open(bam_file_list, "w") as out_handle:
                        for f in sorted(bam_files):
                            out_handle.write("%s\n" % f)
                    cmd = ("{bamtools} merge -list {bam_file_list} | "
                           "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                    do.run(cmd.format(**locals()), "Merge bam files", None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        picard = broad.runner_from_config(config)
        picard.run_fn("picard_index", out_file)
        return out_file
Exemple #13
0
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.

    The output name comes from ``_merge_outfile_fname``. Nothing is regenerated
    when that file already exists. A single coordinate-sorted input is copied;
    otherwise the inputs are merged with the sambamba or biobambam command
    template. Outputs are checked with ``samtools quickcheck``.

    Returns the output path after ``bam.index`` has been called on it.
    """
    out_file = _merge_outfile_fname(out_file, bam_files, work_dir, batch)
    if not utils.file_exists(out_file):
        if len(bam_files) == 1 and bam.bam_already_sorted(bam_files[0], config, "coordinate"):
            # Single sorted input: copy it into place inside a transaction.
            with file_transaction(config, out_file) as tx_out_file:
                # Return value is not needed on this path.
                _create_merge_filelist(bam_files, tx_out_file, config)
                shutil.copy(bam_files[0], tx_out_file)
            samtools = config_utils.get_program("samtools", config)
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        else:
            # sambamba opens 4 handles per file, so try to guess a reasonable batch size
            # NOTE(review): this is zero or negative when the limit is 400 or
            # lower -- confirm how utils.partition_all handles that.
            batch_size = (system.open_file_limit() // 4) - 100
            if len(bam_files) > batch_size:
                # Merge each partition recursively; the batch outputs then
                # become the inputs of the merge below.
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(config, out_file) as tx_out_file:
                        tx_bam_file_list = _create_merge_filelist(bam_files, tx_out_file, config)
                        # These locals are consumed by name through
                        # cmd.format(**locals()) below; do not rename them.
                        sambamba = config_utils.get_program("sambamba", config)
                        samtools = config_utils.get_program("samtools", config)
                        resources = config_utils.get_resources("samtools", config)
                        num_cores = config["algorithm"].get("num_cores", 1)
                        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                             2, "decrease").upper()
                        if bam.bam_already_sorted(bam_files[0], config, "coordinate"):
                            cmd = _sambamba_merge(bam_files)
                        else:
                            # Aim for 3.5Gb/core memory for BAM merging
                            # (num_cores is rebound, so the template sees the
                            # adjusted value).
                            num_cores = config_utils.adjust_cores_to_mb_target(
                                3500, resources.get("memory", "2G"), num_cores)
                            assert config.get("mark_duplicates", True)
                            cmd = _biobambam_merge_dedup()
                        do.run(cmd.format(**locals()), "Merge bam files to %s" % os.path.basename(out_file),
                                None)
                        # Validate the transactional output before it is moved.
                        do.run('{} quickcheck -v {}'.format(samtools, tx_out_file),
                               "Check for valid merged BAM")
            # Validate again once the transaction has finished.
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
            _finalize_merge(out_file, bam_files, config)
    bam.index(out_file, config)
    return out_file
Exemple #14
0
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.

    A single input is returned unchanged. Otherwise the inputs are merged with
    the ``_bamtools_merge`` command piped through ``samtools sort``, and the
    result is indexed with ``bam.index``. The merge is skipped when both the
    output and its ``.bai`` index already exist.

    bam_files -- paths of the BAM files to merge
    work_dir -- directory used for the default output location
    config -- configuration dictionary
    out_file -- optional output path; defaults to the basename of the first
        (sorted) input inside ``work_dir``
    batch -- optional batch identifier; adds a ``-b<batch>`` suffix before the
        extension of the output name
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if batch is not None:
            base, ext = os.path.splitext(out_file)
            out_file = "%s-b%s%s" % (base, batch, ext)
        if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"):
            # These locals are consumed by name through cmd.format(**locals()).
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                 2, "decrease")
            batch_size = system.open_file_limit() - 100
            if len(bam_files) > batch_size:
                # Merge each partition recursively, then merge the batch outputs.
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with utils.curdir_tmpdir({"config": config}) as tmpdir:
                with utils.chdir(tmpdir):
                    merge_cl = _bamtools_merge(bam_files)
                    with file_transaction(out_file) as tx_out_file:
                        with file_transaction("%s.list" % os.path.splitext(out_file)[0]) as tx_bam_file_list:
                            tx_out_prefix = os.path.splitext(tx_out_file)[0]
                            # The bamtools template names the list file
                            # {bam_file_list}; without this alias the
                            # locals()-based format below raises KeyError.
                            bam_file_list = tx_bam_file_list
                            with open(tx_bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            cmd = (merge_cl + " | "
                                   "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                            do.run(cmd.format(**locals()), "Merge bam files to %s" % os.path.basename(out_file),
                                   None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        bam.index(out_file, config)
        return out_file