import collections

from bcbio.distributed.split import parallel_split_combine
from bcbio.pipeline import datadict as dd


def _add_combine_info(output, combine_map, file_key):
    """Do not actually combine, but add details for later combining work.

    Each sample will contain information on the out file and additional
    files to merge, enabling other splits and recombines without losing
    information.
    """
    files_per_output = collections.defaultdict(list)
    for part_file, out_file in combine_map.items():
        files_per_output[out_file].append(part_file)
    out_by_file = collections.defaultdict(list)
    out = []
    for data in output:
        # Do not pass along nochrom, noanalysis regions
        if data["region"][0] not in ["nochrom", "noanalysis"]:
            cur_file = data[file_key]
            # If we didn't process, no need to add combine information
            if cur_file in combine_map:
                out_file = combine_map[cur_file]
                if "combine" not in data:
                    data["combine"] = {}
                data["combine"][file_key] = {"out": out_file,
                                             "extras": files_per_output.get(out_file, [])}
                out_by_file[out_file].append(data)
            elif cur_file:
                out_by_file[cur_file].append(data)
            else:
                out.append([data])
    for samples in out_by_file.values():
        regions = [x["region"] for x in samples]
        region_bams = [x["work_bam"] for x in samples]
        assert len(regions) == len(region_bams)
        if len(set(region_bams)) == 1:
            region_bams = [region_bams[0]]
        data = samples[0]
        data["region_bams"] = region_bams
        data["region"] = regions
        # Restore the duplicate marking setting stashed before region prep
        data = dd.set_mark_duplicates(data, data["config"]["algorithm"]["orig_markduplicates"])
        del data["config"]["algorithm"]["orig_markduplicates"]
        out.append([data])
    return out
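
# A minimal, self-contained sketch (not part of the pipeline) of the combine
# bookkeeping above: combine_map maps per-region part files to their merged
# output, and _add_combine_info inverts it so each sample records the target
# "out" file plus the "extras" to merge. File names here are hypothetical.
def _example_combine_bookkeeping():
    combine_map = {"sample1-chr1-prep.bam": "sample1-prep.bam",
                   "sample1-chr2-prep.bam": "sample1-prep.bam"}
    files_per_output = collections.defaultdict(list)
    for part_file, out_file in combine_map.items():
        files_per_output[out_file].append(part_file)
    # A processed region's data would then carry:
    #   data["combine"]["work_bam"] = {"out": "sample1-prep.bam",
    #                                  "extras": ["sample1-chr1-prep.bam",
    #                                             "sample1-chr2-prep.bam"]}
    return dict(files_per_output)
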
def parallel_prep_region(samples, run_parallel):
    """Perform full pre-variant calling BAM prep work on regions.
    """
    file_key = "work_bam"
    split_fn = _split_by_regions("bamprep", "-prep.bam", file_key)
    # Identify samples that do not need preparation -- no recalibration or realignment
    extras = []
    torun = []
    for data in [x[0] for x in samples]:
        if data.get("work_bam"):
            data["align_bam"] = data["work_bam"]
        if not dd.get_realign(data) and not dd.get_variantcaller(data):
            extras.append([data])
        elif not data.get(file_key):
            extras.append([data])
        else:
            # Do not want to re-run duplicate marking after realignment
            data = dd.set_mark_duplicates(data, False)
            torun.append([data])
    return extras + parallel_split_combine(torun, split_fn, run_parallel,
                                           "piped_bamprep", _add_combine_info,
                                           file_key, ["config"])
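
# A minimal sketch of the extras/torun triage above, with plain dict lookups
# standing in for bcbio's datadict accessors (assumption: dd.get_realign and
# dd.get_variantcaller read the "realign"/"variantcaller" keys under
# data["config"]["algorithm"]). Sample contents are hypothetical.
def _example_prep_triage():
    samples = [[{"work_bam": "s1.bam",
                 "config": {"algorithm": {"realign": "gatk",
                                          "variantcaller": "freebayes"}}}],
               [{"work_bam": "s2.bam",
                 "config": {"algorithm": {"realign": False,
                                          "variantcaller": False}}}]]
    def needs_prep(data):
        algorithm = data["config"]["algorithm"]
        wants_prep = bool(algorithm.get("realign") or algorithm.get("variantcaller"))
        return wants_prep and bool(data.get("work_bam"))
    return [needs_prep(x[0]) for x in samples]  # -> [True, False]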