Ejemplo n.º 1
0
def postprocess_alignment(data):
    """Post-process an aligned sample and record its callable regions.

    The input is passed through ``utils.to_single_data``,
    ``cwlutils.normalize_missing`` and ``cwlutils.unpack_tarballs``. If
    ``vmulti.bam_needs_processing`` accepts the sample and a ``.bam`` file is
    available, the BAM is linked into the sample's ``align`` work directory
    and indexed, nblock/callable regions are stored in ``data["regions"]``,
    depth files are stored in ``data["depth"]``, and
    ``coverage.assign_interval``, ``samtools.run_and_save``,
    ``recalibrate.prep_recal`` and ``recalibrate.apply_recal`` are applied in
    that order. Otherwise, when variant regions are configured, they are used
    directly as the callable regions.

    Returns the resulting sample wrapped as ``[[data]]``.
    """
    data = utils.to_single_data(data)
    data = cwlutils.normalize_missing(data)
    data = cwlutils.unpack_tarballs(data, data)
    # The aligned BAM takes precedence over the working BAM.
    in_bam = data.get("align_bam") or data.get("work_bam")
    reference = dd.get_ref_file(data)
    process_bam = (vmulti.bam_needs_processing(data)
                   and in_bam and in_bam.endswith(".bam"))
    if process_bam:
        work_dir = dd.get_work_dir(data)
        sample_name = dd.get_sample_name(data)
        align_dir = utils.safe_makedir(
            os.path.join(work_dir, "align", sample_name))
        ready_bam = os.path.join(align_dir, os.path.basename(in_bam))
        # Link the BAM into the work directory only when it is not there yet.
        if not utils.file_exists(ready_bam):
            utils.symlink_plus(in_bam, ready_bam)
        bam.index(ready_bam, data["config"])
        cov_result = callable.sample_callable_bed(ready_bam, reference, data)
        # Only the nblock output of block_regions is used in this branch.
        _, nblock = callable.block_regions(
            cov_result.raw_callable, ready_bam, reference, data)
        data["regions"] = {
            "nblock": nblock,
            "callable": cov_result.raw_callable,
            "sample_callable": cov_result.callable,
            "mapped_stats": readstats.get_cache_file(data),
        }
        data["depth"] = cov_result.depth_files
        data = coverage.assign_interval(data)
        data = samtools.run_and_save(data)
        data = recalibrate.prep_recal(data)
        data = recalibrate.apply_recal(data)
    elif dd.get_variant_regions(data):
        # No BAM processing: block on the configured variant regions instead.
        _, nblock = callable.block_regions(
            dd.get_variant_regions(data), in_bam, reference, data)
        data["regions"] = {
            "nblock": nblock,
            "callable": dd.get_variant_regions(data),
            "sample_callable": dd.get_variant_regions(data),
        }
    return [[data]]
Ejemplo n.º 2
0
def postprocess_alignment(data):
    """Compute callable-region information for a sample's work BAM.

    Nothing is changed unless ``vmulti.bam_needs_processing`` accepts the
    sample and ``data["work_bam"]`` ends in ``.bam``. In that case nblock,
    callable, high-depth, sample-callable and off-target results are stored
    under ``data["regions"]``, ``coverage.assign_interval`` is applied, and
    the callable region BED becomes ``variant_regions`` when it exists on
    disk and none is configured. ``_recal_no_markduplicates`` runs last.

    Returns the sample wrapped as ``[[data]]``.
    """
    if (not vmulti.bam_needs_processing(data)
            or not data["work_bam"].endswith(".bam")):
        return [[data]]

    reference = dd.get_ref_file(data)
    region_bed, nblock, callable_regions = callable.block_regions(
        data["work_bam"], reference, data)
    high_depth = highdepth.identify(data)
    bam.index(data["work_bam"], data["config"])
    per_sample = callable.sample_callable_bed(data["work_bam"], reference, data)
    offtarget = callable.calculate_offtarget(data["work_bam"], reference, data)
    data["regions"] = {
        "nblock": nblock,
        "callable": callable_regions,
        "highdepth": high_depth,
        "sample_callable": per_sample,
        "offtarget_stats": offtarget,
    }
    data = coverage.assign_interval(data)
    if os.path.exists(region_bed):
        algorithm = data["config"]["algorithm"]
        # Fall back to the callable regions only when none are configured.
        if not algorithm.get("variant_regions"):
            algorithm["variant_regions"] = region_bed
            data = bedutils.clean_inputs(data)
    data = _recal_no_markduplicates(data)
    return [[data]]
Ejemplo n.º 3
0
def postprocess_alignment(data):
    """Link, index and summarize a sample's BAM into callable regions.

    The input is passed through ``utils.to_single_data`` and
    ``cwlutils.normalize_missing``. When ``vmulti.bam_needs_processing``
    accepts the sample and a ``.bam`` file is available (``align_bam`` first,
    then ``work_bam``), the BAM is linked into the sample's ``align`` work
    directory, indexed, and used to fill ``data["regions"]``. The callable
    region BED becomes ``variant_regions`` when it exists on disk and none is
    configured, and ``_recal_no_markduplicates`` runs last.

    Returns the sample wrapped as ``[[data]]``.
    """
    data = utils.to_single_data(data)
    data = cwlutils.normalize_missing(data)
    in_bam = data.get("align_bam") or data.get("work_bam")
    process_bam = (vmulti.bam_needs_processing(data)
                   and in_bam and in_bam.endswith(".bam"))
    if not process_bam:
        return [[data]]

    reference = dd.get_ref_file(data)
    work_dir = dd.get_work_dir(data)
    sample_name = dd.get_sample_name(data)
    align_dir = utils.safe_makedir(os.path.join(work_dir, "align", sample_name))
    ready_bam = os.path.join(align_dir, os.path.basename(in_bam))
    # Link the BAM into the work directory only when it is not there yet.
    if not utils.file_exists(ready_bam):
        utils.symlink_plus(in_bam, ready_bam)
    bam.index(ready_bam, data["config"])
    cov_result = callable.sample_callable_bed(ready_bam, reference, data)
    region_bed, nblock, callable_regions = callable.block_regions(
        cov_result.raw_callable, ready_bam, reference, data)
    data["regions"] = {
        "nblock": nblock,
        "callable": callable_regions,
        "highdepth": cov_result.highdepth,
        "sample_callable": cov_result.callable,
        "coverage_depth_bed": cov_result.depth,
        "avg_coverage": cov_result.avg_coverage,
    }
    data = coverage.assign_interval(data)
    if os.path.exists(region_bed):
        algorithm = data["config"]["algorithm"]
        # Fall back to the callable regions only when none are configured.
        if not algorithm.get("variant_regions"):
            algorithm["variant_regions"] = region_bed
            data = clean_inputs(data)
    data = _recal_no_markduplicates(data)
    return [[data]]
Ejemplo n.º 4
0
def postprocess_alignment(data):
    """Link, index and summarize a sample's BAM into callable regions.

    The input is passed through ``utils.to_single_data``. When
    ``vmulti.bam_needs_processing`` accepts the sample and a ``.bam`` file is
    available (``align_bam`` first, then ``work_bam``), the BAM is linked
    into the sample's ``align`` work directory, indexed, and used to fill
    ``data["regions"]`` with nblock, callable, sample-callable, off-target
    and high-depth results. The callable region BED becomes
    ``variant_regions`` when it exists on disk and none is configured, and
    ``_recal_no_markduplicates`` runs last.

    Returns the sample wrapped as ``[[data]]``.
    """
    data = utils.to_single_data(data)
    in_bam = data.get("align_bam") or data.get("work_bam")
    process_bam = (vmulti.bam_needs_processing(data)
                   and in_bam and in_bam.endswith(".bam"))
    if not process_bam:
        return [[data]]

    reference = dd.get_ref_file(data)
    work_dir = dd.get_work_dir(data)
    sample_name = dd.get_sample_name(data)
    align_dir = utils.safe_makedir(os.path.join(work_dir, "align", sample_name))
    ready_bam = os.path.join(align_dir, os.path.basename(in_bam))
    # Link the BAM into the work directory only when it is not there yet.
    if not utils.file_exists(ready_bam):
        utils.symlink_plus(in_bam, ready_bam)
    bam.index(ready_bam, data["config"])
    region_bed, nblock, callable_regions = callable.block_regions(
        ready_bam, reference, data)
    per_sample = callable.sample_callable_bed(ready_bam, reference, data)
    offtarget = callable.calculate_offtarget(ready_bam, reference, data)
    data["regions"] = {
        "nblock": nblock,
        "callable": callable_regions,
        "sample_callable": per_sample,
        "offtarget_stats": offtarget,
    }
    data = coverage.assign_interval(data)
    # High-depth regions are identified after interval assignment.
    data["regions"]["highdepth"] = highdepth.identify(data)
    if os.path.exists(region_bed):
        algorithm = data["config"]["algorithm"]
        # Fall back to the callable regions only when none are configured.
        if not algorithm.get("variant_regions"):
            algorithm["variant_regions"] = region_bed
            data = bedutils.clean_inputs(data)
    data = _recal_no_markduplicates(data)
    return [[data]]
Ejemplo n.º 5
0
def postprocess_alignment(data):
    """Post-process an aligned sample and record its callable regions.

    The input is passed through ``utils.to_single_data``,
    ``cwlutils.normalize_missing`` and ``cwlutils.unpack_tarballs``. If
    ``vmulti.bam_needs_processing`` accepts the sample and a ``.bam`` file is
    available (``align_bam`` first, then ``work_bam``), the BAM is linked
    into the sample's ``align`` work directory and indexed,
    ``data["regions"]`` and ``data["depth"]`` are filled in, and
    ``coverage.assign_interval``, ``samtools.run_and_save``,
    ``recalibrate.prep_recal`` and ``recalibrate.apply_recal`` are applied in
    that order.

    Returns the sample wrapped as ``[[data]]``.
    """
    data = utils.to_single_data(data)
    data = cwlutils.normalize_missing(data)
    data = cwlutils.unpack_tarballs(data, data)
    in_bam = data.get("align_bam") or data.get("work_bam")
    process_bam = (vmulti.bam_needs_processing(data)
                   and in_bam and in_bam.endswith(".bam"))
    if not process_bam:
        return [[data]]

    reference = dd.get_ref_file(data)
    work_dir = dd.get_work_dir(data)
    sample_name = dd.get_sample_name(data)
    align_dir = utils.safe_makedir(os.path.join(work_dir, "align", sample_name))
    ready_bam = os.path.join(align_dir, os.path.basename(in_bam))
    # Link the BAM into the work directory only when it is not there yet.
    if not utils.file_exists(ready_bam):
        utils.symlink_plus(in_bam, ready_bam)
    bam.index(ready_bam, data["config"])
    cov_result = callable.sample_callable_bed(ready_bam, reference, data)
    # Only the nblock output of block_regions is used here.
    _, nblock = callable.block_regions(
        cov_result.raw_callable, ready_bam, reference, data)
    data["regions"] = {
        "nblock": nblock,
        "callable": cov_result.raw_callable,
        "sample_callable": cov_result.callable,
        "mapped_stats": readstats.get_cache_file(data),
    }
    data["depth"] = cov_result.depth_files
    data = coverage.assign_interval(data)
    data = samtools.run_and_save(data)
    data = recalibrate.prep_recal(data)
    data = recalibrate.apply_recal(data)
    return [[data]]
Ejemplo n.º 6
0
def align_prep_full(fastq1, fastq2, info, lane_name, lane_desc, dirs, config,
                    config_file):
    """Align a lane (or wrap pre-called variants) and prepare its regions.

    When ``fastq1`` is None and ``info`` carries a ``vrn_file``, no alignment
    is run: the configured variant caller is blanked and a sample dictionary
    is built around the existing variant file with ``work_bam`` set to None.

    Otherwise the lane is aligned with ``process_alignment``, organized with
    ``_organize_merge_samples``, block regions are stored in
    ``data["regions"]``, the callable region BED becomes ``variant_regions``
    when it exists on disk and none is configured, and
    ``_recal_no_markduplicates`` runs last.

    Returns a one-element list holding the sample dictionary.
    """
    if fastq1 is None and "vrn_file" in info:
        _, reference = get_genome_ref(info["genome_build"], None,
                                      dirs["galaxy"])
        # Variants are supplied up front, so no variant caller should run.
        config["algorithm"]["variantcaller"] = ""
        return [{
            "info": info,
            "sam_ref": reference,
            "work_bam": None,
            "genome_build": info["genome_build"],
            "name": ("", lane_desc),
            "vrn_file": info["vrn_file"],
            "dirs": copy.deepcopy(dirs),
            "config": config,
        }]

    aligned = process_alignment(fastq1, fastq2, info, lane_name, lane_desc,
                                dirs, config)
    data = _organize_merge_samples(aligned[0], dirs, config_file)
    region_bed, regions = callable.block_regions(
        data["work_bam"], data["sam_ref"], config)
    data["regions"] = regions
    if os.path.exists(region_bed):
        algorithm = data["config"]["algorithm"]
        # Fall back to the callable regions only when none are configured.
        if not algorithm.get("variant_regions"):
            algorithm["variant_regions"] = region_bed
    data["callable_bam"] = data["work_bam"]
    return [_recal_no_markduplicates(data)]
Ejemplo n.º 7
0
def align_prep_full(fastq1, fastq2, info, lane_name, lane_desc,
                    dirs, config, config_file):
    """Run alignment for a lane, or wrap an existing variant file.

    Two paths exist:

    * ``fastq1`` is None and ``info`` has a ``vrn_file``: the variant caller
      setting is cleared and a sample dictionary is assembled around the
      supplied variant file, with ``work_bam`` set to None.
    * otherwise: the lane is aligned via ``process_alignment`` and organized
      via ``_organize_merge_samples``; block regions go into
      ``data["regions"]``; the callable region BED is used as
      ``variant_regions`` if it exists on disk and none is configured; and
      ``_recal_no_markduplicates`` is applied.

    Returns a one-element list holding the sample dictionary.
    """
    variants_only = fastq1 is None and "vrn_file" in info
    if variants_only:
        _, reference = get_genome_ref(info["genome_build"], None, dirs["galaxy"])
        # Variants are supplied up front, so no variant caller should run.
        config["algorithm"]["variantcaller"] = ""
        data = {
            "info": info,
            "sam_ref": reference,
            "work_bam": None,
            "genome_build": info["genome_build"],
            "name": ("", lane_desc),
            "vrn_file": info["vrn_file"],
            "dirs": copy.deepcopy(dirs),
            "config": config,
        }
        return [data]

    first_aligned = process_alignment(
        fastq1, fastq2, info, lane_name, lane_desc, dirs, config)[0]
    data = _organize_merge_samples(first_aligned, dirs, config_file)
    region_bed, regions = callable.block_regions(
        data["work_bam"], data["sam_ref"], config)
    data["regions"] = regions
    if os.path.exists(region_bed):
        algorithm = data["config"]["algorithm"]
        # Fall back to the callable regions only when none are configured.
        if not algorithm.get("variant_regions"):
            algorithm["variant_regions"] = region_bed
    data["callable_bam"] = data["work_bam"]
    data = _recal_no_markduplicates(data)
    return [data]
Ejemplo n.º 8
0
def align_prep_full(fastq1, fastq2, info, lane_name, lane_desc,
                    dirs, config, config_file):
    """Align a lane and attach block regions to the resulting sample.

    The lane is aligned with ``process_alignment`` and its first result is
    organized with ``_organize_merge_samples``. The output of
    ``callable.block_regions`` is stored in ``data["regions"]`` before
    ``_recal_no_markduplicates`` is applied.

    Returns a one-element list holding the sample dictionary.
    """
    aligned = process_alignment(fastq1, fastq2, info, lane_name, lane_desc,
                                dirs, config)
    data = _organize_merge_samples(aligned[0], dirs, config_file)
    regions = callable.block_regions(data["work_bam"], data["sam_ref"], config)
    data["regions"] = regions
    return [_recal_no_markduplicates(data)]
Ejemplo n.º 9
0
def postprocess_alignment(data):
    """Attach callable regions to a sample that has a work BAM.

    A sample whose ``work_bam`` is falsy is returned untouched. Otherwise the
    nblock and callable BEDs are stored in ``data["regions"]``, the callable
    region BED becomes ``variant_regions`` when it exists on disk and none is
    configured, ``callable_bam`` is set to the work BAM, and
    ``_recal_no_markduplicates`` is applied.

    Returns a one-element list holding the sample dictionary.
    """
    if not data["work_bam"]:
        return [data]

    region_bed, nblock, callable_regions = callable.block_regions(
        data["work_bam"], data["sam_ref"], data["config"])
    data["regions"] = {"nblock": nblock, "callable": callable_regions}
    if os.path.exists(region_bed):
        algorithm = data["config"]["algorithm"]
        # Fall back to the callable regions only when none are configured.
        if not algorithm.get("variant_regions"):
            algorithm["variant_regions"] = region_bed
    data["callable_bam"] = data["work_bam"]
    return [_recal_no_markduplicates(data)]
Ejemplo n.º 10
0
def postprocess_alignment(data):
    """Attach callable regions to a sample whose work BAM needs processing.

    Nothing is changed unless ``vmulti.bam_needs_processing`` accepts the
    sample and ``data["work_bam"]`` ends in ``.bam``. Otherwise the nblock
    and callable BEDs are stored in ``data["regions"]``, the callable region
    BED becomes ``variant_regions`` (followed by ``bedutils.clean_inputs``)
    when it exists on disk and none is configured, and
    ``_recal_no_markduplicates`` is applied.

    Returns the sample wrapped as ``[[data]]``.
    """
    if (not vmulti.bam_needs_processing(data)
            or not data["work_bam"].endswith(".bam")):
        return [[data]]

    region_bed, nblock, callable_regions = callable.block_regions(
        data["work_bam"], data["sam_ref"], data["config"])
    data["regions"] = {"nblock": nblock, "callable": callable_regions}
    if os.path.exists(region_bed):
        algorithm = data["config"]["algorithm"]
        # Fall back to the callable regions only when none are configured.
        if not algorithm.get("variant_regions"):
            algorithm["variant_regions"] = region_bed
            data = bedutils.clean_inputs(data)
    data = _recal_no_markduplicates(data)
    return [[data]]
Ejemplo n.º 11
0
def postprocess_alignment(data):
    """Attach nblock regions to a sample that has a work BAM.

    A sample whose ``work_bam`` is falsy is returned untouched. Otherwise the
    nblock BED is stored in ``data["regions"]``, the callable region BED
    becomes ``variant_regions`` when it exists on disk and none is
    configured, ``callable_bam`` is set to the work BAM, and
    ``_recal_no_markduplicates`` is applied.

    Returns a one-element list holding the sample dictionary.
    """
    if not data["work_bam"]:
        return [data]

    region_bed, nblock = callable.block_regions(
        data["work_bam"], data["sam_ref"], data["config"])
    data["regions"] = {"nblock": nblock}
    if os.path.exists(region_bed):
        algorithm = data["config"]["algorithm"]
        # Fall back to the callable regions only when none are configured.
        if not algorithm.get("variant_regions"):
            algorithm["variant_regions"] = region_bed
    data["callable_bam"] = data["work_bam"]
    return [_recal_no_markduplicates(data)]
Ejemplo n.º 12
0
def postprocess_alignment(data):
    """Clean input BEDs and attach callable regions to a sample.

    ``bedutils.clean_inputs`` is always applied first. If
    ``vmulti.bam_needs_processing`` then accepts the sample, the nblock and
    callable BEDs are stored in ``data["regions"]``, the callable region BED
    becomes ``variant_regions`` (followed by another
    ``bedutils.clean_inputs`` pass) when it exists on disk and none is
    configured, and ``_recal_no_markduplicates`` is applied.

    Returns a one-element list holding the sample dictionary.
    """
    data = bedutils.clean_inputs(data)
    if not vmulti.bam_needs_processing(data):
        return [data]

    region_bed, nblock, callable_regions = callable.block_regions(
        data["work_bam"], data["sam_ref"], data["config"])
    data["regions"] = {"nblock": nblock, "callable": callable_regions}
    if os.path.exists(region_bed):
        algorithm = data["config"]["algorithm"]
        # Fall back to the callable regions only when none are configured.
        if not algorithm.get("variant_regions"):
            algorithm["variant_regions"] = region_bed
            data = bedutils.clean_inputs(data)
    return [_recal_no_markduplicates(data)]
Ejemplo n.º 13
0
def postprocess_alignment(data):
    """Clean input BEDs and attach callable regions to a sample.

    ``bedutils.clean_inputs`` is always applied first. If the cleaned sample
    has a truthy ``work_bam``, the nblock and callable BEDs are stored in
    ``data["regions"]``, the callable region BED becomes ``variant_regions``
    (followed by another ``bedutils.clean_inputs`` pass) when it exists on
    disk and none is configured, and ``_recal_no_markduplicates`` is applied.

    Returns a one-element list holding the sample dictionary.
    """
    data = bedutils.clean_inputs(data)
    if not data["work_bam"]:
        return [data]

    region_bed, nblock, callable_regions = callable.block_regions(
        data["work_bam"], data["sam_ref"], data["config"])
    data["regions"] = {"nblock": nblock, "callable": callable_regions}
    if os.path.exists(region_bed):
        algorithm = data["config"]["algorithm"]
        # Fall back to the callable regions only when none are configured.
        if not algorithm.get("variant_regions"):
            algorithm["variant_regions"] = region_bed
            data = bedutils.clean_inputs(data)
    return [_recal_no_markduplicates(data)]
Ejemplo n.º 14
0
def align_prep_full(data, config_file):
    """Align a sample (or accept pre-called variants) and prepare regions.

    When the first entry of ``data["files"]`` is None and the sample has a
    ``vrn_file``, no alignment is run: the configured variant caller is
    blanked and ``work_bam`` is set to None.

    Otherwise the sample is aligned with ``process_alignment``, the nblock
    BED is stored in ``data["regions"]``, the callable region BED becomes
    ``variant_regions`` when it exists on disk and none is configured,
    ``callable_bam`` is set to the work BAM, and
    ``_recal_no_markduplicates`` is applied.

    ``config_file`` is accepted but not used by this function.

    Returns a one-element list holding the sample dictionary.
    """
    if data["files"][0] is None and "vrn_file" in data:
        # Variants are supplied up front, so no variant caller should run.
        data["config"]["algorithm"]["variantcaller"] = ""
        data["work_bam"] = None
        return [data]

    data = process_alignment(data)[0][0]
    region_bed, nblock = callable.block_regions(
        data["work_bam"], data["sam_ref"], data["config"])
    data["regions"] = {"nblock": nblock}
    if os.path.exists(region_bed):
        algorithm = data["config"]["algorithm"]
        # Fall back to the callable regions only when none are configured.
        if not algorithm.get("variant_regions"):
            algorithm["variant_regions"] = region_bed
    data["callable_bam"] = data["work_bam"]
    return [_recal_no_markduplicates(data)]
Ejemplo n.º 15
0
def align_prep_full(data, config_file):
    """Run alignment for a sample, or pass through pre-called variants.

    Two paths exist:

    * the first entry of ``data["files"]`` is None and the sample has a
      ``vrn_file``: the variant caller setting is cleared and ``work_bam``
      is set to None.
    * otherwise: the sample is aligned via ``process_alignment``; the nblock
      BED goes into ``data["regions"]``; the callable region BED is used as
      ``variant_regions`` if it exists on disk and none is configured;
      ``callable_bam`` is set to the work BAM; and
      ``_recal_no_markduplicates`` is applied.

    ``config_file`` is accepted but not used by this function.

    Returns a one-element list holding the sample dictionary.
    """
    variants_only = data["files"][0] is None and "vrn_file" in data
    if variants_only:
        # Variants are supplied up front, so no variant caller should run.
        data["config"]["algorithm"]["variantcaller"] = ""
        data["work_bam"] = None
    else:
        data = process_alignment(data)[0][0]
        region_bed, nblock = callable.block_regions(
            data["work_bam"], data["sam_ref"], data["config"])
        data["regions"] = {"nblock": nblock}
        if os.path.exists(region_bed):
            algorithm = data["config"]["algorithm"]
            # Fall back to the callable regions only when none are configured.
            if not algorithm.get("variant_regions"):
                algorithm["variant_regions"] = region_bed
        data["callable_bam"] = data["work_bam"]
        data = _recal_no_markduplicates(data)
    return [data]
Ejemplo n.º 16
0
def postprocess_alignment(data):
    """Attach callable and high-depth regions to a sample's work BAM.

    Nothing is changed unless ``vmulti.bam_needs_processing`` accepts the
    sample and ``data["work_bam"]`` ends in ``.bam``. Otherwise the nblock,
    callable and high-depth BEDs are stored in ``data["regions"]``, the
    callable region BED becomes ``variant_regions`` (followed by
    ``bedutils.clean_inputs``) when it exists on disk and none is configured,
    and ``_recal_no_markduplicates`` is applied.

    Returns the sample wrapped as ``[[data]]``.
    """
    if (not vmulti.bam_needs_processing(data)
            or not data["work_bam"].endswith(".bam")):
        return [[data]]

    region_bed, nblock, callable_regions = callable.block_regions(
        data["work_bam"], data["sam_ref"], data["config"])
    high_depth = highdepth.identify(data)
    data["regions"] = {"nblock": nblock,
                       "callable": callable_regions,
                       "highdepth": high_depth}
    if os.path.exists(region_bed):
        algorithm = data["config"]["algorithm"]
        # Fall back to the callable regions only when none are configured.
        if not algorithm.get("variant_regions"):
            algorithm["variant_regions"] = region_bed
            data = bedutils.clean_inputs(data)
    data = _recal_no_markduplicates(data)
    return [[data]]
Ejemplo n.º 17
0
def postprocess_alignment(data):
    """Compute callable-region information for a sample's work BAM.

    Nothing is changed unless ``vmulti.bam_needs_processing`` accepts the
    sample and ``data["work_bam"]`` ends in ``.bam``. Otherwise nblock,
    callable, high-depth, sample-callable and off-target results are stored
    under ``data["regions"]``, ``coverage.assign_interval`` is applied, the
    callable region BED becomes ``variant_regions`` (followed by
    ``bedutils.clean_inputs``) when it exists on disk and none is configured,
    and ``_recal_no_markduplicates`` runs last.

    Returns the sample wrapped as ``[[data]]``.
    """
    if (not vmulti.bam_needs_processing(data)
            or not data["work_bam"].endswith(".bam")):
        return [[data]]

    reference = dd.get_ref_file(data)
    region_bed, nblock, callable_regions = callable.block_regions(
        data["work_bam"], reference, data)
    high_depth = highdepth.identify(data)
    per_sample = callable.sample_callable_bed(data["work_bam"], reference, data)
    offtarget = callable.calculate_offtarget(data["work_bam"], reference, data)
    data["regions"] = {
        "nblock": nblock,
        "callable": callable_regions,
        "highdepth": high_depth,
        "sample_callable": per_sample,
        "offtarget_stats": offtarget,
    }
    data = coverage.assign_interval(data)
    if os.path.exists(region_bed):
        algorithm = data["config"]["algorithm"]
        # Fall back to the callable regions only when none are configured.
        if not algorithm.get("variant_regions"):
            algorithm["variant_regions"] = region_bed
            data = bedutils.clean_inputs(data)
    data = _recal_no_markduplicates(data)
    return [[data]]