Exemple #1
0
def _run_variantcall_batch_multicore(items, regions, final_file):
    """Run variant calling on a batch of items using multiple cores.

    Splits the batch by region, runs "variantcall_sample" on each region
    with a local multicore runner, then combines the per-region VCFs into
    final_file via "concat_variant_files".
    """
    batch_name = _get_batch_name(items)
    variantcaller = _get_batch_variantcaller(items)
    # Prefer prepared work BAMs, falling back to the original aligned BAMs.
    work_bams = [dd.get_work_bam(d) or dd.get_align_bam(d) for d in items]

    def split_fn(data):
        # Build (region, inputs, per-region output) tuples for
        # parallel_split_combine; outputs live under <work_dir>/<caller>/<chrom>/.
        out = []
        for region in regions:
            region = _region_to_coords(region)
            chrom, start, end = region
            region_str = "_".join(str(x) for x in region)
            out_file = os.path.join(dd.get_work_dir(items[0]), variantcaller,
                                    chrom,
                                    "%s-%s.vcf.gz" % (batch_name, region_str))
            out.append((region, work_bams, out_file))
        return final_file, out

    # Local runner: one core per region job, using all available cores.
    parallel = {
        "type": "local",
        "num_jobs": dd.get_num_cores(items[0]),
        "cores_per_job": 1
    }
    run_parallel = dmulti.runner(parallel, items[0]["config"])
    # Use a deep copy of the first item as the job template; keep the full
    # batch available under "group_orig" for downstream recombination.
    to_run = copy.deepcopy(items[0])
    to_run["sam_ref"] = dd.get_ref_file(to_run)
    to_run["group_orig"] = items
    parallel_split_combine([[to_run]], split_fn, run_parallel,
                           "variantcall_sample", "concat_variant_files",
                           "vrn_file", ["region", "sam_ref", "config"])
    return final_file
def _run_variantcall_batch_multicore(items, regions, final_file):
    """Run variant calling on a batch of items using multiple cores.

    NOTE(review): this duplicates an identically named definition earlier in
    the file; in Python the later definition shadows the earlier one.
    """
    batch_name = _get_batch_name(items)
    variantcaller = _get_batch_variantcaller(items)
    # Prefer prepared work BAMs, falling back to the original aligned BAMs.
    work_bams = [dd.get_work_bam(d) or dd.get_align_bam(d) for d in items]
    def split_fn(data):
        # One (region, inputs, output) tuple per region; outputs are
        # per-region VCFs under <work_dir>/<caller>/<chrom>/.
        out = []
        for region in regions:
            region = _region_to_coords(region)
            chrom, start, end = region
            region_str = "_".join(str(x) for x in region)
            out_file = os.path.join(dd.get_work_dir(items[0]), variantcaller, chrom,
                                    "%s-%s.vcf.gz" % (batch_name, region_str))
            out.append((region, work_bams, out_file))
        return final_file, out
    # Local runner: one core per region job, using all available cores.
    parallel = {"type": "local", "num_jobs": dd.get_num_cores(items[0]), "cores_per_job": 1}
    run_parallel = dmulti.runner(parallel, items[0]["config"])
    # Deep copy of the first item serves as the job template; the full batch
    # stays reachable under "group_orig".
    to_run = copy.deepcopy(items[0])
    to_run["sam_ref"] = dd.get_ref_file(to_run)
    to_run["group_orig"] = items
    parallel_split_combine([[to_run]], split_fn, run_parallel,
                           "variantcall_sample", "concat_variant_files",
                           "vrn_file", ["region", "sam_ref", "config"])
    return final_file
Exemple #3
0
def parallel_combine_variants(orig_files, out_file, ref_file, config,
                              run_parallel):
    """Combine variants in parallel by chromosome, concatenating final outputs.

    Inputs are bgzipped/indexed up front, merged per reference contig with
    "merge_variant_files", then concatenated into out_file.
    """
    file_key = "vcf_files"

    def split_by_region(data):
        # One merge job per reference contig, writing to
        # <out_dir>/<base>-regions/<base>-<region><ext>.
        base, ext = utils.splitext_plus(os.path.basename(out_file))
        args = []
        for region in [x.name for x in ref.file_contigs(ref_file, config)]:
            region_out = os.path.join(os.path.dirname(out_file),
                                      "%s-regions" % base,
                                      "%s-%s%s" % (base, region, ext))
            utils.safe_makedir(os.path.dirname(region_out))
            args.append((region_out, ref_file, config, region))
        return out_file, args

    # Private config copy so the injected file_key does not leak to callers.
    config = copy.deepcopy(config)
    config["file_key"] = file_key
    # Prepare inputs (bgzip + index, per helper name) so per-region merging
    # can subset them efficiently.
    prep_files = run_multicore(p_bgzip_and_index,
                               [[x, config] for x in orig_files], config)
    items = [[{file_key: prep_files}]]
    parallel_split_combine(items,
                           split_by_region,
                           run_parallel,
                           "merge_variant_files",
                           "concat_variant_files",
                           file_key, ["region", "sam_ref", "config"],
                           split_outfile_i=0)
    return out_file
Exemple #4
0
def parallel_combine_variants(orig_files, out_file, ref_file, config,
                              run_parallel):
    """Combine variants in parallel by chromosome, concatenating final outputs.

    Variant of the merge flow using "combine_variant_files" per contig with
    no input preparation step.
    """
    file_key = "vcf_files"
    items = [[{file_key: orig_files}]]

    def split_by_region(data):
        # One combine job per contig (contigs read from the reference via
        # _ref_file_contigs); outputs go under <out_dir>/<base>-regions/.
        base, ext = os.path.splitext(os.path.basename(out_file))
        args = []
        for region in [x["SN"] for x in _ref_file_contigs(ref_file, config)]:
            region_out = os.path.join(os.path.dirname(out_file),
                                      "%s-regions" % base,
                                      "%s-%s%s" % (base, region, ext))
            utils.safe_makedir(os.path.dirname(region_out))
            # Extra True positional flag distinguishes this call signature
            # from the merge_variant_files variant.
            args.append((region_out, ref_file, config, True, region))
        return out_file, args

    # Private config copy so the injected file_key does not leak to callers.
    config = copy.deepcopy(config)
    config["file_key"] = file_key
    parallel_split_combine(items,
                           split_by_region,
                           run_parallel,
                           "combine_variant_files",
                           "concat_variant_files",
                           file_key, ["region", "sam_ref", "config"],
                           split_outfile_i=0)
    return out_file
Exemple #5
0
def parallel_combine_variants(orig_files, out_file, ref_file, config, run_parallel):
    """Combine variants in parallel by chromosome, concatenating final outputs.

    Inputs are bgzipped/indexed up front, merged per reference contig with
    "merge_variant_files", then concatenated into out_file.
    """
    file_key = "vcf_files"

    def split_by_region(data):
        # One merge job per reference contig, writing to
        # <out_dir>/<base>-regions/<base>-<region><ext>.
        base, ext = utils.splitext_plus(os.path.basename(out_file))
        args = []
        for region in [x.name for x in ref.file_contigs(ref_file, config)]:
            region_out = os.path.join(os.path.dirname(out_file), "%s-regions" % base, "%s-%s%s" % (base, region, ext))
            utils.safe_makedir(os.path.dirname(region_out))
            args.append((region_out, ref_file, config, region))
        return out_file, args

    # Private config copy so the injected file_key does not leak to callers.
    config = copy.deepcopy(config)
    config["file_key"] = file_key
    # Prepare inputs (bgzip + index, per helper name) before per-region merging.
    prep_files = run_multicore(p_bgzip_and_index, [[x, config] for x in orig_files], config)
    items = [[{file_key: prep_files}]]
    parallel_split_combine(
        items,
        split_by_region,
        run_parallel,
        "merge_variant_files",
        "concat_variant_files",
        file_key,
        ["region", "sam_ref", "config"],
        split_outfile_i=0,
    )
    return out_file
Exemple #6
0
def parallel_prep_region(samples, regions, run_parallel):
    """Perform full pre-variant calling BAM prep work on regions.

    Splits each sample by the supplied regions and runs piped bamprep on
    every piece in parallel; no combine step is performed.
    """
    bam_key = "work_bam"
    region_split = _split_by_regions(regions, "bamprep", "-prep.bam", bam_key)
    return parallel_split_combine(samples, region_split, run_parallel,
                                  "piped_bamprep", None, bam_key, ["config"])
Exemple #7
0
def parallel_callable_loci(in_bam, ref_file, data):
    """Calculate callable regions of in_bam in parallel by chromosome.

    Returns the path to the combined callable BED file.
    """
    # Work on a config copy so local changes do not leak back to the caller.
    config = copy.deepcopy(data["config"])
    num_cores = config["algorithm"].get("num_cores", 1)
    out_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data)))
    # Rebuild a minimal data dictionary for the parallel jobs.
    data = {
        "work_bam": in_bam,
        "config": config,
        "reference": data["reference"],
        "dirs": {
            "out": out_dir
        }
    }
    parallel = {
        "type": "local",
        "cores": num_cores,
        "module": "bcbio.distributed"
    }
    items = [[data]]
    with prun.start(parallel, items, config,
                    multiplier=int(num_cores)) as runner:
        # Split per chromosome, dropping alternate contigs (remove_alts).
        split_fn = shared.process_bam_by_chromosome("-callable.bed",
                                                    "work_bam",
                                                    remove_alts=True)
        out = parallel_split_combine(items, split_fn, runner,
                                     "calc_callable_loci", "combine_bed",
                                     "callable_bed", ["config"])[0]
    return out[0]["callable_bed"]
Exemple #8
0
def parallel_prep_region(samples, run_parallel):
    """Perform full pre-variant calling BAM prep work on regions.

    Runs piped bamprep on region-split pieces of each sample's work BAM;
    no combine step is performed.
    """
    bam_key = "work_bam"
    splitter = _split_by_regions("bamprep", "-prep.bam", bam_key)
    return parallel_split_combine(samples, splitter, run_parallel,
                                  "piped_bamprep", None, bam_key, ["config"])
Exemple #9
0
def parallel_callable_loci(in_bam, ref_file, config):
    """Calculate callable regions of in_bam in parallel by chromosome.

    Returns the path to the combined callable BED file.
    """
    num_cores = config["algorithm"].get("num_cores", 1)
    # Work on a config copy so local changes do not leak back to the caller.
    config = copy.deepcopy(config)
    # Minimal data dictionary for the parallel jobs, with the reference
    # wrapped in the nested fasta/base layout the pipeline expects.
    data = {
        "work_bam": in_bam,
        "config": config,
        "reference": {
            "fasta": {
                "base": ref_file
            }
        }
    }
    parallel = {
        "type": "local",
        "cores": num_cores,
        "module": "bcbio.distributed"
    }
    items = [[data]]
    with prun.start(parallel, items, config,
                    multiplier=int(num_cores)) as runner:
        split_fn = shared.process_bam_by_chromosome("-callable.bed",
                                                    "work_bam")
        out = parallel_split_combine(items, split_fn, runner,
                                     "calc_callable_loci", "combine_bed",
                                     "callable_bed", ["config"])[0]
    return out[0]["callable_bed"]
Exemple #10
0
def parallel_callable_loci(in_bam, ref_file, config):
    """Calculate callable regions of in_bam in parallel by chromosome.

    Returns the path to the combined callable BED file.
    """
    cores = config["algorithm"].get("num_cores", 1)
    sample = {"work_bam": in_bam, "sam_ref": ref_file, "config": config}
    runner = parallel_runner({"type": "local", "cores": cores,
                              "module": "bcbio.distributed"}, {}, config)
    by_chrom = shared.process_bam_by_chromosome("-callable.bed", "work_bam")
    result = parallel_split_combine([[sample]], by_chrom, runner,
                                    "calc_callable_loci", "combine_bed",
                                    "callable_bed", ["config"])[0]
    return result[0]["callable_bed"]
Exemple #11
0
def parallel_combine_variants(orig_files, out_file, ref_file, config, run_parallel):
    """Combine variants in parallel by chromosome, concatenating final outputs.

    Variant of the merge flow using "combine_variant_files" per contig with
    no input preparation step.
    """
    file_key = "vcf_files"
    items = [[{file_key: orig_files}]]
    def split_by_region(data):
        # One combine job per contig (contigs via _ref_file_contigs);
        # outputs go under <out_dir>/<base>-regions/.
        base, ext = os.path.splitext(os.path.basename(out_file))
        args = []
        for region in [x["SN"] for x in _ref_file_contigs(ref_file, config)]:
            region_out = os.path.join(os.path.dirname(out_file), "%s-regions" % base,
                                      "%s-%s%s" % (base, region, ext))
            utils.safe_makedir(os.path.dirname(region_out))
            # Extra True positional flag matches the combine_variant_files
            # call signature.
            args.append((region_out, ref_file, config, True, region))
        return out_file, args
    # Private config copy so the injected file_key does not leak to callers.
    config = copy.deepcopy(config)
    config["file_key"] = file_key
    parallel_split_combine(items, split_by_region, run_parallel,
                           "combine_variant_files", "concat_variant_files",
                           file_key, ["region", "sam_ref", "config"], split_outfile_i=0)
    return out_file
def parallel_callable_loci(in_bam, ref_file, data):
    """Calculate callable regions of in_bam in parallel by chromosome.

    Returns the path to the combined callable BED file.
    """
    # Config copy so local changes do not leak back to the caller.
    config = copy.deepcopy(data["config"])
    num_cores = config["algorithm"].get("num_cores", 1)
    # Rebuild a minimal data dictionary for the parallel jobs.
    data = {"work_bam": in_bam, "config": config, "reference": data["reference"]}
    parallel = {"type": "local", "cores": num_cores, "module": "bcbio.distributed"}
    items = [[data]]
    with prun.start(parallel, items, config, multiplier=int(num_cores)) as runner:
        # Split per chromosome, dropping alternate contigs (remove_alts).
        split_fn = shared.process_bam_by_chromosome("-callable.bed", "work_bam", remove_alts=True)
        out = parallel_split_combine(
            items, split_fn, runner, "calc_callable_loci", "combine_bed", "callable_bed", ["config"]
        )[0]
    return out[0]["callable_bed"]
Exemple #13
0
def parallel_callable_loci(in_bam, ref_file, config):
    """Calculate callable regions of in_bam in parallel by chromosome.

    Returns the path to the combined callable BED file.
    """
    num_cores = config["algorithm"].get("num_cores", 1)
    # Config copy so the memory_adjust tweak does not leak to the caller.
    config = copy.deepcopy(config)
    # Scale per-job memory down since multiple jobs run concurrently.
    config["algorithm"]["memory_adjust"] = {"direction": "decrease", "magnitude": 2}
    data = {"work_bam": in_bam, "sam_ref": ref_file, "config": config}
    parallel = {"type": "local", "cores": num_cores, "module": "bcbio.distributed"}
    items = [[data]]
    with prun.start(parallel, items, config) as runner:
        split_fn = shared.process_bam_by_chromosome("-callable.bed", "work_bam")
        out = parallel_split_combine(
            items, split_fn, runner, "calc_callable_loci", "combine_bed", "callable_bed", ["config"]
        )[0]
    return out[0]["callable_bed"]
Exemple #14
0
def parallel_callable_loci(in_bam, ref_file, config):
    """Calculate callable regions of in_bam in parallel by chromosome.

    Returns the path to the combined callable BED file.
    """
    num_cores = config["algorithm"].get("num_cores", 1)
    # Config copy so the memory_adjust tweak does not leak to the caller.
    config = copy.deepcopy(config)
    # Scale per-job memory down since multiple jobs run concurrently.
    config["algorithm"]["memory_adjust"] = {"direction": "decrease", "magnitude": 2}
    data = {"work_bam": in_bam, "sam_ref": ref_file, "config": config}
    parallel = {"type": "local", "cores": num_cores, "module": "bcbio.distributed"}
    items = [[data]]
    with prun.start(parallel, items, config, multiplier=int(num_cores)) as runner:
        split_fn = shared.process_bam_by_chromosome("-callable.bed", "work_bam")
        out = parallel_split_combine(items, split_fn, runner,
                                     "calc_callable_loci", "combine_bed",
                                     "callable_bed", ["config"])[0]
    return out[0]["callable_bed"]
Exemple #15
0
def parallel_callable_loci(in_bam, ref_file, config):
    """Calculate callable regions of in_bam in parallel by chromosome.

    Returns the path to the combined callable BED file.
    """
    parallel = {"type": "local",
                "cores": config["algorithm"].get("num_cores", 1),
                "module": "bcbio.distributed"}
    runner = parallel_runner(parallel, {}, config)
    chrom_split = shared.process_bam_by_chromosome("-callable.bed", "work_bam")
    items = [[{"work_bam": in_bam, "sam_ref": ref_file, "config": config}]]
    processed = parallel_split_combine(items, chrom_split, runner,
                                       "calc_callable_loci", "combine_bed",
                                       "callable_bed", ["config"])[0]
    return processed[0]["callable_bed"]
Exemple #16
0
def parallel_variantcall(sample_info, parallel_fn):
    """Provide sample genotyping, running in parallel over individual chromosomes.
    """
    # Partition: only samples with SNP calling enabled get processed.
    wants_snpcall = lambda x: x[0]["config"]["algorithm"]["snpcall"]
    to_process = [x for x in sample_info if wants_snpcall(x)]
    finished = [x for x in sample_info if not wants_snpcall(x)]
    if to_process:
        chrom_split = split_bam_by_chromosome("-variants.vcf", "work_bam")
        finished.extend(parallel_split_combine(
            to_process, chrom_split, parallel_fn, "variantcall_sample",
            "combine_variant_files", "vrn_file", ["sam_ref", "config"]))
    return finished
Exemple #17
0
def parallel_realign_sample(sample_info, parallel_fn):
    """Realign samples, running in parallel over individual chromosomes.

    Samples without SNP calling enabled pass through unchanged.
    """
    done = []
    pending = []
    for sample in sample_info:
        if sample[0]["config"]["algorithm"]["snpcall"]:
            pending.append(sample)
        else:
            done.append(sample)
    if pending:
        bam_key = "work_bam"
        splitter = split_bam_by_chromosome("-realign.bam", bam_key,
                                           default_targets=["nochr"])
        done.extend(parallel_split_combine(pending, splitter, parallel_fn,
                                           "realign_sample", "combine_bam",
                                           bam_key, ["config"]))
    return done
Exemple #18
0
def parallel_variantcall(sample_info, parallel_fn):
    """Provide sample genotyping, running in parallel over individual chromosomes.

    Samples without SNP calling enabled pass through unchanged.
    """
    pending, done = [], []
    for sample in sample_info:
        # Route each sample by its snpcall algorithm flag.
        (pending if sample[0]["config"]["algorithm"]["snpcall"] else done).append(sample)
    if pending:
        splitter = split_bam_by_chromosome("-variants.vcf", "work_bam")
        done.extend(parallel_split_combine(pending, splitter, parallel_fn,
                                           "variantcall_sample",
                                           "combine_variant_files",
                                           "vrn_file", ["sam_ref", "config"]))
    return done
Exemple #19
0
def parallel_prep_region(samples, run_parallel):
    """Perform full pre-variant calling BAM prep work on regions.

    Samples needing no prep (or missing a work BAM) are passed through
    unchanged; the rest run "piped_bamprep" per region and are recombined
    via _add_combine_info.
    """
    file_key = "work_bam"
    split_fn = _split_by_regions("bamprep", "-prep.bam", file_key)
    # identify samples that do not need preparation -- no recalibration or realignment
    extras = []
    torun = []
    for data in [x[0] for x in samples]:
        # Preserve the pre-prep BAM under align_bam before it is replaced.
        if data.get("work_bam"):
            data["align_bam"] = data["work_bam"]
        if (not dd.get_recalibrate(data) and not dd.get_realign(data) and not dd.get_variantcaller(data)):
            extras.append([data])
        elif not data.get(file_key):
            extras.append([data])
        else:
            torun.append([data])
    return extras + parallel_split_combine(torun, split_fn, run_parallel,
                                           "piped_bamprep", _add_combine_info, file_key, ["config"])
Exemple #20
0
def parallel_prep_region(samples, run_parallel):
    """Perform full pre-variant calling BAM prep work on regions.

    Samples needing no prep (or missing a work BAM) are passed through
    unchanged; the rest run "piped_bamprep" per region and are recombined
    via _add_combine_info.
    """
    file_key = "work_bam"
    split_fn = _split_by_regions("bamprep", "-prep.bam", file_key)
    # identify samples that do not need preparation -- no recalibration or realignment
    extras = []
    torun = []
    for data in [x[0] for x in samples]:
        # Preserve the pre-prep BAM under align_bam before it is replaced.
        data["align_bam"] = data["work_bam"]
        a = data["config"]["algorithm"]
        # NOTE(review): a.get("variantcaller", "gatk") defaults to the truthy
        # "gatk", so `not a.get(...)` is False unless variantcaller is
        # explicitly set to a falsy value -- presumably intentional (default
        # to gatk calling), but confirm.
        if (not a.get("recalibrate") and not a.get("realign") and not a.get("variantcaller", "gatk")):
            extras.append([data])
        elif not data.get(file_key):
            extras.append([data])
        else:
            torun.append([data])
    return extras + parallel_split_combine(torun, split_fn, run_parallel,
                                           "piped_bamprep", _add_combine_info, file_key, ["config"])
Exemple #21
0
def parallel_realign_sample(sample_info, parallel_fn):
    """Realign samples, running in parallel over individual chromosomes.

    Samples with realignment disabled pass through unchanged.
    """
    needs_realign = lambda x: x[0]["config"]["algorithm"].get("realign", True)
    pending = [x for x in sample_info if needs_realign(x)]
    done = [x for x in sample_info if not needs_realign(x)]
    if pending:
        splitter = process_bam_by_chromosome("-realign.bam", "work_bam",
                                             default_targets=["nochr"])
        done.extend(parallel_split_combine(pending, splitter, parallel_fn,
                                           "realign_sample", "combine_bam",
                                           "work_bam", ["config"]))
    return done
Exemple #22
0
def parallel_prep_region(samples, run_parallel):
    """Perform full pre-variant calling BAM prep work on regions.

    Samples needing no prep (or missing a work BAM) are passed through
    unchanged; the rest run "piped_bamprep" per region and are recombined
    via _add_combine_info.
    """
    file_key = "work_bam"
    split_fn = _split_by_regions("bamprep", "-prep.bam", file_key)
    # identify samples that do not need preparation -- no recalibration or realignment
    extras = []
    torun = []
    for data in [x[0] for x in samples]:
        # Preserve the pre-prep BAM under align_bam before it is replaced.
        if data.get("work_bam"):
            data["align_bam"] = data["work_bam"]
        if (not dd.get_realign(data) and not dd.get_variantcaller(data)):
            extras.append([data])
        elif not data.get(file_key):
            extras.append([data])
        else:
            # Do not want to re-run duplicate marking after realignment
            data = dd.set_mark_duplicates(data, False)
            torun.append([data])
    return extras + parallel_split_combine(torun, split_fn, run_parallel,
                                           "piped_bamprep", _add_combine_info, file_key, ["config"])
def parallel_write_recal_bam(xs, parallel_fn):
    """Rewrite a recalibrated BAM file in parallel, working off each chromosome.

    Samples with recalibration disabled pass through unchanged; the rest are
    split by chromosome, rewritten with "write_recal_bam" and recombined.
    """
    to_process = []
    finished = []
    for x in xs:
        # Recalibration defaults to on when not configured.
        if x[0]["config"]["algorithm"].get("recalibrate", True):
            to_process.append(x)
        else:
            finished.append(x)
    if len(to_process) > 0:
        file_key = "work_bam"
        split_fn = process_bam_by_chromosome("-gatkrecal.bam", file_key,
                                           default_targets=["nochr"])
        processed = parallel_split_combine(to_process, split_fn, parallel_fn,
                                           "write_recal_bam", "combine_bam",
                                           file_key, ["config"])
        finished.extend(processed)
        # Save diskspace from original to recalibrated
        #save_diskspace(data["work_bam"], "Recalibrated to %s" % recal_bam,
        #               data["config"])
    return finished
Exemple #24
0
def parallel_prep_region(samples, regions, run_parallel):
    """Perform full pre-variant calling BAM prep work on regions.

    Samples needing no prep (or missing a work BAM) are passed through
    unchanged; the rest run "piped_bamprep" per region with no combine step.
    """
    file_key = "work_bam"
    split_fn = _split_by_regions(regions, "bamprep", "-prep.bam", file_key)
    # identify samples that do not need preparation -- no prep or
    # variant calling
    extras = []
    torun = []
    for data in [x[0] for x in samples]:
        a = data["config"]["algorithm"]
        # NOTE(review): the "gatk" defaults for realign/variantcaller are
        # truthy, so those `not a.get(...)` clauses are False unless the
        # options are explicitly set falsy -- presumably intentional
        # (default to gatk), but confirm.
        if (not a.get("mark_duplicates") and not a.get("recalibrate")
                and not a.get("realign", "gatk")
                and not a.get("variantcaller", "gatk")):
            extras.append([data])
        elif not data.get(file_key):
            extras.append([data])
        else:
            torun.append([data])
    return extras + parallel_split_combine(torun, split_fn, run_parallel,
                                           "piped_bamprep", None, file_key,
                                           ["config"])
Exemple #25
0
def parallel_write_recal_bam(xs, parallel_fn):
    """Rewrite a recalibrated BAM file in parallel, working off each chromosome.

    Samples with recalibration disabled pass through unchanged.
    """
    done = []
    pending = []
    for x in xs:
        # Recalibration defaults to on when not configured.
        (pending if x[0]["config"]["algorithm"].get("recalibrate", True)
         else done).append(x)
    if pending:
        bam_key = "work_bam"
        chrom_split = process_bam_by_chromosome("-gatkrecal.bam", bam_key,
                                                default_targets=["nochr"])
        done.extend(parallel_split_combine(pending, chrom_split, parallel_fn,
                                           "write_recal_bam", "combine_bam",
                                           bam_key, ["config"]))
        # NOTE: reclaiming disk space from the original BAM (save_diskspace)
        # is intentionally disabled here.
    return done