Example #1
import collections

# Assumed import: `dd` follows bcbio-nextgen's usual datadict convention.
from bcbio.pipeline import datadict as dd


def _add_combine_info(output, combine_map, file_key):
    """Do not actually combine, but add details for later combining work.

    Each sample will contain information on the out file and additional files
    to merge, enabling other splits and recombines without losing information.
    """
    files_per_output = collections.defaultdict(list)
    for part_file, out_file in combine_map.items():
        files_per_output[out_file].append(part_file)
    out_by_file = collections.defaultdict(list)
    out = []
    for data in output:
        # Do not pass along nochrom, noanalysis regions
        if data["region"][0] not in ["nochrom", "noanalysis"]:
            cur_file = data[file_key]
            # If we didn't process, no need to add combine information
            if cur_file in combine_map:
                out_file = combine_map[cur_file]
                if "combine" not in data:
                    data["combine"] = {}
                data["combine"][file_key] = {
                    "out": out_file,
                    "extras": files_per_output.get(out_file, [])
                }
                out_by_file[out_file].append(data)
            elif cur_file:
                out_by_file[cur_file].append(data)
            else:
                out.append([data])
    # Fold all region-level samples sharing an output file into one sample
    # that carries the per-region BAMs and regions.
    for samples in out_by_file.values():
        regions = [x["region"] for x in samples]
        region_bams = [x["work_bam"] for x in samples]
        assert len(regions) == len(region_bams)
        # Collapse to a single reference when every region shares one BAM.
        if len(set(region_bams)) == 1:
            region_bams = [region_bams[0]]
        data = samples[0]
        data["region_bams"] = region_bams
        data["region"] = regions
        # Restore the duplicate-marking setting stashed before region prep.
        data = dd.set_mark_duplicates(
            data, data["config"]["algorithm"]["orig_markduplicates"])
        del data["config"]["algorithm"]["orig_markduplicates"]
        out.append([data])
    return out
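
For context, here is a minimal, hypothetical walk-through of the shapes this function consumes and produces. The file paths, regions, and config values are invented for the sketch, and it assumes the bcbio import above so dd.set_mark_duplicates resolves.

# Hypothetical inputs: two per-region BAMs for one sample, both of which
# combine into a single output file.
combine_map = {"s1-chr1-prep.bam": "s1-prep.bam",
               "s1-chr2-prep.bam": "s1-prep.bam"}
output = [{"region": ("chr1", 0, 1000), "work_bam": "s1-chr1-prep.bam",
           "config": {"algorithm": {"orig_markduplicates": True}}},
          {"region": ("chr2", 0, 1000), "work_bam": "s1-chr2-prep.bam",
           "config": {"algorithm": {"orig_markduplicates": True}}}]
grouped = _add_combine_info(output, combine_map, "work_bam")
# grouped == [[data]]: the two region-level samples fold into one, where
#   data["combine"]["work_bam"] == {"out": "s1-prep.bam",
#                                   "extras": ["s1-chr1-prep.bam", "s1-chr2-prep.bam"]}
#   data["region"] == [("chr1", 0, 1000), ("chr2", 0, 1000)]
#   data["region_bams"] == ["s1-chr1-prep.bam", "s1-chr2-prep.bam"]
# and dd.set_mark_duplicates re-applies the stashed True setting.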
Example #2
# Assumed imports following bcbio-nextgen's module layout; _split_by_regions
# and _add_combine_info are helpers defined alongside this function.
from bcbio.distributed.split import parallel_split_combine
from bcbio.pipeline import datadict as dd


def parallel_prep_region(samples, run_parallel):
    """Perform full pre-variant calling BAM prep work on regions.
    """
    file_key = "work_bam"
    split_fn = _split_by_regions("bamprep", "-prep.bam", file_key)
    # identify samples that do not need preparation -- no recalibration or realignment
    extras = []
    torun = []
    for data in [x[0] for x in samples]:
        if data.get("work_bam"):
            data["align_bam"] = data["work_bam"]
        if not dd.get_realign(data) and not dd.get_variantcaller(data):
            extras.append([data])
        elif not data.get(file_key):
            extras.append([data])
        else:
            # Stash the original duplicate-marking setting so _add_combine_info
            # can restore it after region processing, then disable it: we do
            # not want to re-run duplicate marking after realignment.
            data["config"]["algorithm"]["orig_markduplicates"] = \
                data["config"]["algorithm"].get("mark_duplicates")
            data = dd.set_mark_duplicates(data, False)
            torun.append([data])
    return extras + parallel_split_combine(torun, split_fn, run_parallel,
                                           "piped_bamprep", _add_combine_info, file_key, ["config"])