Ejemplo n.º 1
0
Archivo: merge.py Proyecto: jme9/wabio
def merge_bam_files(bam_files, work_dir, config, batch=0):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Avoids too many open file issues by merging large numbers of files in batches.
    """
    max_merge = 500
    bam_files.sort()
    i = 1
    while len(bam_files) > max_merge:
        bam_files = [merge_bam_files(xs, work_dir, config, batch + i)
                     for xs in utils.partition_all(max_merge, bam_files)]
        i += 1
    if batch > 0:
        out_dir = utils.safe_makedir(os.path.join(work_dir, "batchmerge%s" % batch))
    else:
        out_dir = work_dir
    out_file = os.path.join(out_dir, os.path.basename(sorted(bam_files)[0]))
    picard = broad.runner_from_config(config)
    if len(bam_files) == 1:
        if not os.path.exists(out_file):
            os.symlink(bam_files[0], out_file)
    else:
        picard.run_fn("picard_merge", bam_files, out_file)
        for b in bam_files:
            utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
    return out_file
Ejemplo n.º 2
0
def split_bam_file(bam_file, split_size, out_dir, config):
    """Split a BAM file into paired end fastq splits based on split size.

    XXX Need to generalize for non-paired end inputs.
    """
    existing = _find_current_bam_split(bam_file, out_dir)
    if len(existing) > 0:
        return existing
    pipe = True

    utils.safe_makedir(out_dir)
    broad_runner = broad.runner_from_config(config)
    out_files = []

    def new_handle(num):
        out = []
        for pair in [1, 2]:
            fname = os.path.join(
                out_dir,
                "{base}_{pair}_{num}.fastq".format(
                    base=os.path.splitext(os.path.basename(bam_file))[0], pair=pair, num=num
                ),
            )
            out += [fname, open(fname, "w")]
        return out

    with utils.curdir_tmpdir(base_dir=config_utils.get_resources("tmp", config).get("dir")) as tmp_dir:
        if pipe:
            sort_file = os.path.join(tmp_dir, "%s-sort.bam" % os.path.splitext(os.path.basename(bam_file))[0])
            os.mkfifo(sort_file)
            broad_runner.run_fn("picard_sort", bam_file, "queryname", sort_file, compression_level=0, pipe=True)
        else:
            sort_file = os.path.join(out_dir, "%s-sort.bam" % os.path.splitext(os.path.basename(bam_file))[0])
            broad_runner.run_fn("picard_sort", bam_file, "queryname", sort_file)

        samfile = pysam.Samfile(sort_file, "rb")
        i = 0
        num = 0
        f1, out_handle1, f2, out_handle2 = new_handle(num)
        out_files.append([f1, f2, None])
        for x1, x2 in utils.partition_all(2, samfile):
            x1_seq, x1_qual = _get_seq_qual(x1)
            out_handle1.write("@%s/1\n%s\n+\n%s\n" % (i, x1_seq, x1_qual))
            x2_seq, x2_qual = _get_seq_qual(x2)
            out_handle2.write("@%s/2\n%s\n+\n%s\n" % (i, x2_seq, x2_qual))
            i += 1
            if i % split_size == 0:
                num += 1
                out_handle1.close()
                out_handle2.close()
                f1, out_handle1, f2, out_handle2 = new_handle(num)
                out_files.append([f1, f2, num])
        out_handle1.close()
        out_handle2.close()
        samfile.close()
        if pipe:
            os.unlink(sort_file)
        else:
            utils.save_diskspace(sort_file, "Split to {}".format(out_files[0][0]), config)
    return out_files
Ejemplo n.º 3
0
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    if len(bam_files) == 1 and bam.bam_already_sorted(bam_files[0], config,
                                                      "coordinate"):
        shutil.copy(bam_files[0], out_file)
    else:
        if out_file is None:
            out_file = os.path.join(work_dir,
                                    os.path.basename(sorted(bam_files)[0]))
        if batch is not None:
            base, ext = os.path.splitext(out_file)
            out_file = "%s-b%s%s" % (base, batch, ext)
        if not utils.file_exists(out_file):
            sambamba = config_utils.get_program("sambamba", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                 2, "decrease").upper()
            # sambamba opens 4 handles per file, so try to guess a reasonable batch size
            batch_size = (system.open_file_limit() // 4) - 100
            if len(bam_files) > batch_size:
                bam_files = [
                    merge_bam_files(xs, work_dir, config, out_file, i)
                    for i, xs in enumerate(
                        utils.partition_all(batch_size, bam_files))
                ]
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(config, out_file) as tx_out_file:
                        with file_transaction(
                                config,
                                "%s.list" % os.path.splitext(out_file)[0]
                        ) as tx_bam_file_list:
                            with open(tx_bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            if bam.bam_already_sorted(bam_files[0], config,
                                                      "coordinate"):
                                cmd = _sambamba_merge(bam_files)
                            else:
                                assert config.get("mark_duplicates", True)
                                cmd = _biobambam_merge_dedup()
                            do.run(
                                cmd.format(**locals()),
                                "Merge bam files to %s" %
                                os.path.basename(out_file), None)
            # Ensure timestamps are up to date on output file and index
            # Works around issues on systems with inconsistent times
            for ext in ["", ".bai"]:
                if os.path.exists(out_file + ext):
                    subprocess.check_call(["touch", out_file + ext])
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
    bam.index(out_file, config)
    return out_file
Ejemplo n.º 4
0
def split_bam_file(bam_file, split_size, out_dir, config):
    """Split a BAM file into paired end fastq splits based on split size.

    XXX Need to generalize for non-paired end inputs.
    """
    existing = _find_current_bam_split(bam_file, out_dir)
    if len(existing) > 0:
        return existing
    pipe = True

    utils.safe_makedir(out_dir)
    broad_runner = broad.runner_from_config(config)
    out_files = []
    def new_handle(num):
        out = []
        for pair in [1, 2]:
            fname = os.path.join(out_dir, "{base}_{pair}_{num}.fastq".format(
                base=os.path.splitext(os.path.basename(bam_file))[0], pair=pair, num=num))
            out += [fname, open(fname, "w")]
        return out
    with utils.curdir_tmpdir(base_dir=config_utils.get_resources("tmp", config).get("dir")) as tmp_dir:
        if pipe:
            sort_file = os.path.join(tmp_dir, "%s-sort.bam" %
                                     os.path.splitext(os.path.basename(bam_file))[0])
            os.mkfifo(sort_file)
            broad_runner.run_fn("picard_sort", bam_file, "queryname", sort_file,
                                compression_level=0, pipe=True)
        else:
            sort_file = os.path.join(out_dir, "%s-sort.bam" %
                                     os.path.splitext(os.path.basename(bam_file))[0])
            broad_runner.run_fn("picard_sort", bam_file, "queryname", sort_file)

        samfile = pysam.Samfile(sort_file, "rb")
        i = 0
        num = 0
        f1, out_handle1, f2, out_handle2 = new_handle(num)
        out_files.append([f1, f2, None])
        for x1, x2 in utils.partition_all(2, samfile):
            x1_seq, x1_qual = _get_seq_qual(x1)
            out_handle1.write("@%s/1\n%s\n+\n%s\n" % (i, x1_seq, x1_qual))
            x2_seq, x2_qual = _get_seq_qual(x2)
            out_handle2.write("@%s/2\n%s\n+\n%s\n" % (i, x2_seq, x2_qual))
            i += 1
            if i % split_size == 0:
                num += 1
                out_handle1.close()
                out_handle2.close()
                f1, out_handle1, f2, out_handle2 = new_handle(num)
                out_files.append([f1, f2, num])
        out_handle1.close()
        out_handle2.close()
        samfile.close()
        if pipe:
            os.unlink(sort_file)
        else:
            utils.save_diskspace(sort_file, "Split to {}".format(out_files[0][0]), config)
    return out_files
Ejemplo n.º 5
0
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    out_file = _merge_outfile_fname(out_file, bam_files, work_dir, batch)
    if not utils.file_exists(out_file):
        if len(bam_files) == 1 and bam.bam_already_sorted(
                bam_files[0], config, "coordinate"):
            with file_transaction(config, out_file) as tx_out_file:
                _create_merge_filelist(bam_files, tx_out_file, config)
                shutil.copy(bam_files[0], tx_out_file)
            samtools = config_utils.get_program("samtools", config)
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        else:
            # sambamba opens 4 handles per file, so try to guess a reasonable batch size
            batch_size = (system.open_file_limit() // 4) - 100
            if len(bam_files) > batch_size:
                bam_files = [
                    merge_bam_files(xs, work_dir, config, out_file, i)
                    for i, xs in enumerate(
                        utils.partition_all(batch_size, bam_files))
                ]
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(config, out_file) as tx_out_file:
                        tx_bam_file_list = _create_merge_filelist(
                            bam_files, tx_out_file, config)
                        sambamba = config_utils.get_program("sambamba", config)
                        samtools = config_utils.get_program("samtools", config)
                        resources = config_utils.get_resources(
                            "samtools", config)
                        num_cores = config["algorithm"].get("num_cores", 1)
                        max_mem = config_utils.adjust_memory(
                            resources.get("memory", "1G"), 2,
                            "decrease").upper()
                        if bam.bam_already_sorted(bam_files[0], config,
                                                  "coordinate"):
                            cmd = _sambamba_merge(bam_files)
                        else:
                            assert config.get("mark_duplicates", True)
                            cmd = _biobambam_merge_dedup()
                        do.run(
                            cmd.format(**locals()), "Merge bam files to %s" %
                            os.path.basename(out_file), None)
                        do.run(
                            '{} quickcheck -v {}'.format(
                                samtools, tx_out_file),
                            "Check for valid merged BAM")
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
            _finalize_merge(out_file, bam_files, config)
    bam.index(out_file, config)
    return out_file
Ejemplo n.º 6
0
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir,
                                    os.path.basename(sorted(bam_files)[0]))
        if batch is not None:
            base, ext = os.path.splitext(out_file)
            out_file = "%s-b%s%s" % (base, batch, ext)
        if not utils.file_exists(out_file) or not utils.file_exists(out_file +
                                                                    ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                 2, "decrease").upper()
            batch_size = system.open_file_limit() - 100
            if len(bam_files) > batch_size:
                bam_files = [
                    merge_bam_files(xs, work_dir, config, out_file, i)
                    for i, xs in enumerate(
                        utils.partition_all(batch_size, bam_files))
                ]
            with utils.curdir_tmpdir({"config": config}) as tmpdir:
                with utils.chdir(tmpdir):
                    merge_cl = _bamtools_merge(bam_files)
                    with file_transaction(out_file) as tx_out_file:
                        with file_transaction("%s.list" %
                                              os.path.splitext(out_file)[0]
                                              ) as tx_bam_file_list:
                            tx_out_prefix = os.path.splitext(tx_out_file)[0]
                            with open(tx_bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            cmd = (
                                merge_cl + " | "
                                "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}"
                            )
                            do.run(
                                cmd.format(**locals()),
                                "Merge bam files to %s" %
                                os.path.basename(out_file), None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        bam.index(out_file, config)
        return out_file
Ejemplo n.º 7
0
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    if len(bam_files) == 1:
        bam.index(bam_files[0], config)
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if batch is not None:
            base, ext = os.path.splitext(out_file)
            out_file = "%s-b%s%s" % (base, batch, ext)
        if not utils.file_exists(out_file):
            sambamba = config_utils.get_program("sambamba", config)
            samtools = config_utils.get_program("samtools", config)
            samblaster = config_utils.get_program("samblaster", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                 2, "decrease").upper()
            # sambamba opens 4 handles per file, so try to guess a reasonable batch size
            batch_size = (system.open_file_limit() // 4) - 100
            if len(bam_files) > batch_size:
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(config, out_file) as tx_out_file:
                        with file_transaction(config, "%s.list" % os.path.splitext(out_file)[0]) as tx_bam_file_list:
                            with open(tx_bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            if bam.bam_already_sorted(bam_files[0], config, "coordinate"):
                                cmd = _sambamba_merge(bam_files)
                            else:
                                assert config.get("mark_duplicates", True)
                                cmd = _biobambam_merge_dedup()
                            do.run(cmd.format(**locals()), "Merge bam files to %s" % os.path.basename(out_file),
                                   None)
            # Ensure timestamps are up to date on output file and index
            # Works around issues on systems with inconsistent times
            for ext in ["", ".bai"]:
                if os.path.exists(out_file + ext):
                    subprocess.check_call(["touch", out_file + ext])
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        bam.index(out_file, config)
        return out_file
Ejemplo n.º 8
0
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    out_file = _merge_outfile_fname(out_file, bam_files, work_dir, batch)
    if not utils.file_exists(out_file):
        if len(bam_files) == 1 and bam.bam_already_sorted(bam_files[0], config, "coordinate"):
            with file_transaction(config, out_file) as tx_out_file:
                _create_merge_filelist(bam_files, tx_out_file, config)
                shutil.copy(bam_files[0], tx_out_file)
            samtools = config_utils.get_program("samtools", config)
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        else:
            # sambamba opens 4 handles per file, so try to guess a reasonable batch size
            batch_size = (system.open_file_limit() // 4) - 100
            if len(bam_files) > batch_size:
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(config, out_file) as tx_out_file:
                        tx_bam_file_list = _create_merge_filelist(bam_files, tx_out_file, config)
                        sambamba = config_utils.get_program("sambamba", config)
                        samtools = config_utils.get_program("samtools", config)
                        resources = config_utils.get_resources("samtools", config)
                        num_cores = config["algorithm"].get("num_cores", 1)
                        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                             2, "decrease").upper()
                        if bam.bam_already_sorted(bam_files[0], config, "coordinate"):
                            cmd = _sambamba_merge(bam_files)
                        else:
                            # Aim for 3.5Gb/core memory for BAM merging
                            num_cores = config_utils.adjust_cores_to_mb_target(
                                3500, resources.get("memory", "2G"), num_cores)
                            assert config.get("mark_duplicates", True)
                            cmd = _biobambam_merge_dedup()
                        do.run(cmd.format(**locals()), "Merge bam files to %s" % os.path.basename(out_file),
                                None)
                        do.run('{} quickcheck -v {}'.format(samtools, tx_out_file),
                               "Check for valid merged BAM")
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
            _finalize_merge(out_file, bam_files, config)
    bam.index(out_file, config)
    return out_file
Ejemplo n.º 9
0
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run platypus variant calling, germline whole genome or exome.
    """
    assert out_file.endswith(".vcf.gz")
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, items[0]["config"])
            cmd = [
                "platypus", "callVariants",
                "--regions=%s" % _subset_regions(region, out_file, items),
                "--bamFiles=%s" % ",".join(align_bams),
                "--refFile=%s" % dd.get_ref_file(items[0]), "--output=-",
                "--logFileName", "/dev/null", "--verbosity=1"
            ]
            resources = config_utils.get_resources("platypus",
                                                   items[0]["config"])
            if resources.get("options"):
                # normalize options so we can set defaults without overwriting user specified
                for opt in resources["options"]:
                    if "=" in opt:
                        key, val = opt.split("=")
                        cmd.extend([key, val])
                    else:
                        cmd.append(opt)
            # Adjust default filter thresholds to achieve similar sensitivity/specificity to other callers
            tuned_opts = [
                "--hapScoreThreshold", "10", "--scThreshold", "0.99",
                "--filteredReadsFrac", "0.9", "--rmsmqThreshold", "20",
                "--qdThreshold", "0", "--abThreshold", "0.0001",
                "--minVarFreq", "0.0", "--assemble", "1"
            ]
            for okey, oval in utils.partition_all(2, tuned_opts):
                if okey not in cmd:
                    cmd.extend([okey, oval])
            # Avoid filtering duplicates on high depth targeted regions where we don't mark duplicates
            if any(not tz.get_in(["config", "algorithm", "mark_duplicates"],
                                 data, True) for data in items):
                cmd += ["--filterDuplicates=0"]
            post_process_cmd = (
                " | %s | vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | "
                "vcfstreamsort | bgzip -c > %s" %
                (vcfutils.fix_ambiguous_cl(), tx_out_file))
            do.run(" ".join(cmd) + post_process_cmd,
                   "platypus variant calling")
        out_file = vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file
Ejemplo n.º 10
0
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if batch is not None:
            base, ext = os.path.splitext(out_file)
            out_file = "%s-b%s%s" % (base, batch, ext)
        if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                 2, "decrease")
            batch_size = system.open_file_limit() - 100
            if len(bam_files) > batch_size:
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with utils.curdir_tmpdir({"config": config}) as tmpdir:
                with utils.chdir(tmpdir):
                    merge_cl = _bamtools_merge(bam_files)
                    with file_transaction(out_file) as tx_out_file:
                        with file_transaction("%s.list" % os.path.splitext(out_file)[0]) as tx_bam_file_list:
                            tx_out_prefix = os.path.splitext(tx_out_file)[0]
                            with open(tx_bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            cmd = (merge_cl + " | "
                                   "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                            do.run(cmd.format(**locals()), "Merge bam files to %s" % os.path.basename(out_file),
                                   None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        bam.index(out_file, config)
        return out_file
Ejemplo n.º 11
0
def _combine_variants(in_vcfs, out_file, ref_file, config):
    """Combine variant files, batching to avoid problematic large commandlines.
    """
    max_batch = 500
    if len(in_vcfs) > max_batch:
        new_vcfs = []
        for i, batch_vcfs in enumerate(partition_all(max_batch, in_vcfs)):
            path, fname = os.path.split(out_file)
            batch_path = safe_makedir(os.path.join(path, "batch"))
            base, ext = os.path.splitext(fname)
            cur_out = os.path.join(batch_path, "{0}-batch{1}{2}".format(base, i, ext))
            for x in batch_vcfs:
                with open(x) as in_handle:
                    if not in_handle.readline().startswith("##fileformat=VCFv4"):
                        raise ValueError("Unexpected VCF file: %s" % x)
            combine_variant_files(batch_vcfs, cur_out, ref_file, config)
            new_vcfs.append(cur_out)
        in_vcfs = new_vcfs
    assert len(in_vcfs) <= max_batch
    combine_variant_files(in_vcfs, out_file, ref_file, config)
Ejemplo n.º 12
0
def _combine_variants(in_vcfs, out_file, ref_file, config):
    """Combine variant files, batching to avoid problematic large commandlines.
    """
    max_batch = 500
    if len(in_vcfs) > max_batch:
        new_vcfs = []
        for i, batch_vcfs in enumerate(partition_all(max_batch, in_vcfs)):
            path, fname = os.path.split(out_file)
            batch_path = safe_makedir(os.path.join(path, "batch"))
            base, ext = os.path.splitext(fname)
            cur_out = os.path.join(batch_path,
                                   "{0}-batch{1}{2}".format(base, i, ext))
            for x in batch_vcfs:
                with open(x) as in_handle:
                    if not in_handle.readline().startswith(
                            "##fileformat=VCFv4"):
                        raise ValueError("Unexpected VCF file: %s" % x)
            combine_variant_files(batch_vcfs, cur_out, ref_file, config)
            new_vcfs.append(cur_out)
        in_vcfs = new_vcfs
    assert len(in_vcfs) <= max_batch
    combine_variant_files(in_vcfs, out_file, ref_file, config)
Ejemplo n.º 13
0
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run platypus variant calling, germline whole genome or exome.
    """
    assert out_file.endswith(".vcf.gz")
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, items[0]["config"])
            cmd = ["platypus", "callVariants", "--regions=%s" % _subset_regions(region, out_file, items),
                   "--bamFiles=%s" % ",".join(align_bams),
                   "--refFile=%s" % dd.get_ref_file(items[0]), "--output=-",
                   "--logFileName", "/dev/null", "--verbosity=1"]
            resources = config_utils.get_resources("platypus", items[0]["config"])
            if resources.get("options"):
                # normalize options so we can set defaults without overwriting user specified
                for opt in resources["options"]:
                    if "=" in opt:
                        key, val = opt.split("=")
                        cmd.extend([key, val])
                    else:
                        cmd.append(opt)
            # Adjust default filter thresholds to achieve similar sensitivity/specificity to other callers
            tuned_opts = ["--hapScoreThreshold", "10", "--scThreshold", "0.99", "--filteredReadsFrac", "0.9",
                          "--rmsmqThreshold", "20", "--qdThreshold", "0", "--abThreshold", "0.0001",
                          "--minVarFreq", "0.0", "--assemble", "1"]
            for okey, oval in utils.partition_all(2, tuned_opts):
                if okey not in cmd:
                    cmd.extend([okey, oval])
            # Avoid filtering duplicates on high depth targeted regions where we don't mark duplicates
            if any(not tz.get_in(["config", "algorithm", "mark_duplicates"], data, True)
                   for data in items):
                cmd += ["--filterDuplicates=0"]
            post_process_cmd = (" | %s | vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | "
                                "vcfstreamsort | bgzip -c > %s" % (vcfutils.fix_ambiguous_cl(), tx_out_file))
            do.run(" ".join(cmd) + post_process_cmd, "platypus variant calling")
        out_file = vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file