def merge_sample(data):
    """Merge fastq and BAM files for multiple samples.
    """
    logger.debug("Combining fastq and BAM files %s" % str(data["name"]))
    config = config_utils.update_w_custom(data["config"], data["info"])
    if config["algorithm"].get("upload_fastq", False):
        fastq1, fastq2 = combine_fastq_files(data["fastq_files"], data["dirs"]["work"],
                                             config)
    else:
        fastq1, fastq2 = None, None
    out_file = os.path.join(data["dirs"]["work"],
                            data["info"]["rgnames"]["sample"] + ".bam")
    sort_bam = merge_bam_files(data["bam_files"], data["dirs"]["work"], config,
                               out_file=out_file)
    return [[{"name": data["name"], "metadata": data["info"].get("metadata", {}),
              "info": data["info"], "genome_build": data["genome_build"],
              "sam_ref": data["sam_ref"], "work_bam": sort_bam,
              "fastq1": fastq1, "fastq2": fastq2, "dirs": data["dirs"],
              "config": config, "config_file": data["config_file"]}]]
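# A minimal sketch of the "data" dictionary merge_sample expects. Every key
# is taken from the lookups in the function above; the concrete paths and
# names are hypothetical, for illustration only.
_example_merge_sample_data = {
    "name": ("test_run", "sample1"),
    "info": {"rgnames": {"sample": "sample1"}, "metadata": {}},
    "genome_build": "hg19",
    "sam_ref": "/path/to/hg19.fa",
    "fastq_files": ["/path/to/sample1_1.fastq", "/path/to/sample1_2.fastq"],
    "bam_files": ["/path/to/sample1-part1.bam", "/path/to/sample1-part2.bam"],
    "dirs": {"work": "/path/to/work"},
    "config": {"algorithm": {"upload_fastq": False}},
    "config_file": "/path/to/sample_config.yaml",
}
# merge_sample(_example_merge_sample_data) would merge the BAM parts into
# /path/to/work/sample1.bam and return it under "work_bam".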
def process_sample(sample_name, fastq_files, info, bam_files, dirs,
                   config, config_file):
    """Finalize processing for a sample, potentially multiplexed.
    """
    config = _update_config_w_custom(config, info)
    genome_build = info.get("genome_build", None)
    (_, sam_ref) = get_genome_ref(genome_build, config["algorithm"]["aligner"],
                                  dirs["galaxy"])
    fastq1, fastq2 = combine_fastq_files(fastq_files, dirs["work"])
    log.info("Combining and preparing wig file %s" % str(sample_name))
    sort_bam = merge_bam_files(bam_files, dirs["work"], config)
    (gatk_bam, vrn_file, effects_file) = ("", "", "")
    if config["algorithm"]["recalibrate"]:
        log.info("Recalibrating %s with GATK" % str(sample_name))
        gatk_bam = recalibrate_quality(sort_bam, fastq1, fastq2, sam_ref,
                                       dirs, config)
        if config["algorithm"]["snpcall"]:
            log.info("SNP genotyping %s with GATK" % str(sample_name))
            vrn_file = run_genotyper(gatk_bam, sam_ref, config)
            log.info("Calculating variation effects for %s" % str(sample_name))
            effects_file = variation_effects(vrn_file, genome_build, config)
    if config["algorithm"].get("transcript_assemble", False):
        tx_file = assemble_transcripts(sort_bam, sam_ref, config)
    if sam_ref is not None:
        log.info("Generating summary files: %s" % str(sample_name))
        generate_align_summary(sort_bam, fastq2 is not None, sam_ref,
                               sample_name, config, dirs)
    bam_to_wig(sort_bam, config, config_file)
    return [sample_name, fastq_files, info, sort_bam, gatk_bam, vrn_file,
            effects_file]
def _merge_align_bams(data):
    """Merge multiple alignment BAMs, including split and discordant reads.
    """
    for key in (["work_bam"], ["work_bam_plus", "disc"], ["work_bam_plus", "sr"],
                ["umi_bam"]):
        in_files = tz.get_in(key, data, [])
        if not isinstance(in_files, (list, tuple)):
            in_files = [in_files]
        in_files = [x for x in in_files if x and x != "None"]
        if in_files:
            ext = "-%s" % key[-1] if len(key) > 1 else ""
            out_file = os.path.join(dd.get_work_dir(data), "align",
                                    dd.get_sample_name(data),
                                    "%s-sort%s.bam" % (dd.get_sample_name(data), ext))
            merged_file = merge_bam_files(in_files,
                                          utils.safe_makedir(os.path.dirname(out_file)),
                                          data, out_file=out_file)
            data = tz.update_in(data, key, lambda x: merged_file)
        else:
            data = tz.update_in(data, key, lambda x: None)
    if "align_bam" in data and "work_bam" in data:
        data["align_bam"] = data["work_bam"]
    return data
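# A hypothetical sketch of the nested keys _merge_align_bams walks with
# tz.get_in/tz.update_in. The key paths ("work_bam", "work_bam_plus"/"disc",
# "work_bam_plus"/"sr", "umi_bam") come from the loop above; the file paths
# are invented for illustration.
_example_align_data = {
    "work_bam": ["/work/align/s1/s1-part1.bam", "/work/align/s1/s1-part2.bam"],
    "work_bam_plus": {"disc": "/work/align/s1/s1-part1-disc.bam",
                      "sr": "/work/align/s1/s1-part1-sr.bam"},
    "umi_bam": None,  # falsy values and the string "None" are filtered out above
}
# After _merge_align_bams, each populated key points at a single
# <sample>-sort<ext>.bam under the align/<sample> work directory.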
def merge_extras(items, config):
    """Merge extra disambiguated reads into a final BAM file.
    """
    final = {}
    for extra_name in items[0]["disambiguate"].keys():
        items_by_name = collections.defaultdict(list)
        for data in items:
            items_by_name[dd.get_sample_name(data)].append(data)
        for sname, name_items in items_by_name.items():
            if sname not in final:
                final[sname] = {}
            in_files = []
            for data in name_items:
                in_files.append(data["disambiguate"][extra_name])
            out_file = "%s-allmerged%s" % os.path.splitext(in_files[0])
            if in_files[0].endswith(".bam"):
                merged_file = merge.merge_bam_files(in_files, os.path.dirname(out_file),
                                                    config, out_file=out_file)
            else:
                assert extra_name == "summary", extra_name
                merged_file = _merge_summary(in_files, out_file, name_items[0])
            final[sname][extra_name] = merged_file
    out = []
    for data in items:
        data["disambiguate"] = final[dd.get_sample_name(data)]
        out.append([data])
    return out
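# A sketch of the "items" list this per-sample merge_extras consumes: one
# entry per parallel chunk, each carrying a "disambiguate" dict. The entry
# names and paths are hypothetical; this assumes dd.get_sample_name reads
# the read-group sample name, matching the rgnames lookup in merge_sample
# above. The "summary" entry mirrors the assert in the function.
_example_items = [
    {"rgnames": {"sample": "sampleA"},
     "disambiguate": {"disambiguated": "/work/sampleA-chunk1-disambig.bam",
                      "summary": "/work/sampleA-chunk1-summary.txt"}},
    {"rgnames": {"sample": "sampleA"},
     "disambiguate": {"disambiguated": "/work/sampleA-chunk2-disambig.bam",
                      "summary": "/work/sampleA-chunk2-summary.txt"}},
]
# Chunks are grouped by sample name, so each sample gets its own
# "-allmerged" BAM and summary rather than sharing one global merge.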
def merge_sample(data):
    """Merge fastq and BAM files for multiple samples.
    """
    logger.info("Combining fastq and BAM files %s" % str(data["name"]))
    config = shared.update_config_w_custom(data["config"], data["info"])
    genome_build, sam_ref = shared.ref_genome_info(data["info"], config, data["dirs"])
    if config["algorithm"].get("upload_fastq", False):
        fastq1, fastq2 = combine_fastq_files(data["fastq_files"], data["dirs"]["work"],
                                             config)
    else:
        fastq1, fastq2 = None, None
    sort_bam = merge_bam_files(data["bam_files"], data["dirs"]["work"], config)
    return [[{"name": data["name"], "metadata": data["info"].get("metadata", {}),
              "info": data["info"], "genome_build": genome_build,
              "sam_ref": sam_ref, "work_bam": sort_bam,
              "fastq1": fastq1, "fastq2": fastq2, "dirs": data["dirs"],
              "config": config, "config_file": data["config_file"]}]]
def delayed_bam_merge(data):
    """Perform a merge on previously prepped files, delayed in processing.

    Handles merging of associated split read and discordant files if present.
    """
    if data.get("combine"):
        assert len(data["combine"].keys()) == 1
        file_key = list(data["combine"].keys())[0]
        extras = []
        for x in data["combine"][file_key].get("extras", []):
            if isinstance(x, (list, tuple)):
                extras.extend(x)
            else:
                extras.append(x)
        in_files = sorted(list(set([data[file_key]] + extras)))
        out_file = data["combine"][file_key]["out"]
        for ext in ["-disc", "-sr", ""]:
            if ext:
                cur_in_files = list(filter(os.path.exists,
                                           (utils.append_stem(f, ext) for f in in_files)))
                cur_out_file = utils.append_stem(out_file, ext) if len(cur_in_files) > 0 else None
            else:
                cur_in_files, cur_out_file = in_files, out_file
            if cur_out_file:
                config = copy.deepcopy(data["config"])
                config["algorithm"]["save_diskspace"] = False
                merged_file = merge_bam_files(cur_in_files, os.path.dirname(cur_out_file),
                                              config, out_file=cur_out_file)
        data.pop("region", None)
        data.pop("combine", None)
        data[file_key] = merged_file
    return [[data]]
def merge_extras(in_files, out_file, config):
    """Merge extra disambiguated reads into a final BAM file.
    """
    merged_file = merge.merge_bam_files(in_files, os.path.dirname(out_file), config,
                                        out_file=out_file)
    return merged_file
def delayed_bam_merge(data):
    """Perform a merge on previously prepped files, delayed in processing.

    Handles merging of associated split read and discordant files if present.
    """
    if data.get("combine"):
        assert len(data["combine"].keys()) == 1
        file_key = list(data["combine"].keys())[0]
        extras = []
        for x in data["combine"][file_key].get("extras", []):
            if isinstance(x, (list, tuple)):
                extras.extend(x)
            else:
                extras.append(x)
        if file_key in data:
            extras.append(data[file_key])
        in_files = sorted(list(set(extras)))
        out_file = tz.get_in(["combine", file_key, "out"], data,
                             _merge_out_from_infiles(in_files))
        sup_exts = list(data.get(file_key + "-plus", {}).keys())
        for ext in sup_exts + [""]:
            merged_file = None
            if os.path.exists(utils.append_stem(out_file, "-" + ext)):
                cur_out_file, cur_in_files = out_file, []
            if ext:
                cur_in_files = list(filter(os.path.exists,
                                           (utils.append_stem(f, "-" + ext) for f in in_files)))
                cur_out_file = utils.append_stem(out_file, "-" + ext) if len(cur_in_files) > 0 else None
            else:
                cur_in_files, cur_out_file = in_files, out_file
            if cur_out_file:
                config = copy.deepcopy(data["config"])
                config["algorithm"]["save_diskspace"] = False
                if len(cur_in_files) > 0:
                    merged_file = merge_bam_files(cur_in_files, os.path.dirname(cur_out_file),
                                                  config, out_file=cur_out_file)
                else:
                    assert os.path.exists(cur_out_file)
                    merged_file = cur_out_file
            if merged_file:
                if ext:
                    data[file_key + "-plus"][ext] = merged_file
                else:
                    data[file_key] = merged_file
        data.pop("region", None)
        data.pop("combine", None)
    return [[data]]
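# A sketch of the "combine" bookkeeping delayed_bam_merge consumes; the
# ["combine"][file_key]["out"/"extras"] layout and the optional
# "<file_key>-plus" dict for split/discordant reads both mirror the lookups
# above. Paths are illustrative only.
_example_delayed = {
    "work_bam": "/work/s1-chr1.bam",
    # only the keys matter before the merge: they pick which "-disc"/"-sr"
    # companion files to look for; merged paths get filled in afterward
    "work_bam-plus": {"disc": None, "sr": None},
    "combine": {"work_bam": {"out": "/work/s1.bam",
                             "extras": ["/work/s1-chr2.bam", "/work/s1-chr3.bam"]}},
    "config": {"algorithm": {}},
}
# delayed_bam_merge merges the per-region BAMs into /work/s1.bam, attaches
# any "-disc"/"-sr" companions found on disk, and drops the "region" and
# "combine" keys before returning [[data]].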
def _merge_align_bams(data):
    """Merge multiple alignment BAMs, including split and discordant reads.
    """
    for keys in (["work_bam"], ["work_bam-plus", "disc"], ["work_bam-plus", "sr"]):
        in_files = tz.get_in(keys, data)
        if in_files:
            ext = "-%s" % keys[-1] if len(keys) > 1 else ""
            out_file = os.path.join(dd.get_work_dir(data), "align",
                                    dd.get_sample_name(data),
                                    "%s-sort%s.bam" % (dd.get_sample_name(data), ext))
            merged_file = merge_bam_files(in_files, os.path.dirname(out_file),
                                          data["config"], out_file=out_file)
            data = tz.update_in(data, keys, lambda x: merged_file)
    if "align_bam" in data and "work_bam" in data:
        data["align_bam"] = data["work_bam"]
    return data
def merge_sample(data):
    """Merge fastq and BAM files for multiple samples.
    """
    logger.info("Combining fastq and BAM files %s" % str(data["name"]))
    config = _update_config_w_custom(data["config"], data["info"])
    genome_build, sam_ref = ref_genome_info(data["info"], config, data["dirs"])
    fastq1, fastq2 = combine_fastq_files(data["fastq_files"], data["dirs"]["work"],
                                         config)
    sort_bam = merge_bam_files(data["bam_files"], data["dirs"]["work"], config)
    return [[{"name": data["name"], "genome_build": genome_build,
              "sam_ref": sam_ref, "work_bam": sort_bam,
              "fastq1": fastq1, "fastq2": fastq2, "dirs": data["dirs"],
              "config": config, "config_file": data["config_file"]}]]
def delayed_bam_merge(data):
    """Perform a merge on previously prepped files, delayed in processing.
    """
    if data.get("combine"):
        assert len(data["combine"].keys()) == 1
        file_key = list(data["combine"].keys())[0]
        in_files = list(set([data[file_key]] +
                            data["combine"][file_key].get("extras", [])))
        out_file = data["combine"][file_key]["out"]
        logger.debug("Combining BAM files to %s" % out_file)
        config = copy.deepcopy(data["config"])
        config["algorithm"]["save_diskspace"] = False
        merged_file = merge_bam_files(in_files, os.path.dirname(out_file), config,
                                      out_file=out_file)
        data.pop("region", None)
        data[file_key] = merged_file
    return [[data]]
def _merge_align_bams(data):
    """Merge multiple alignment BAMs, including split and discordant reads.
    """
    for key in (["work_bam"], ["work_bam_plus", "disc"], ["work_bam_plus", "sr"]):
        in_files = tz.get_in(key, data)
        if in_files:
            if not isinstance(in_files, (list, tuple)):
                in_files = [in_files]
            ext = "-%s" % key[-1] if len(key) > 1 else ""
            out_file = os.path.join(dd.get_work_dir(data), "align",
                                    dd.get_sample_name(data),
                                    "%s-sort%s.bam" % (dd.get_sample_name(data), ext))
            merged_file = merge_bam_files(in_files,
                                          utils.safe_makedir(os.path.dirname(out_file)),
                                          data["config"], out_file=out_file)
            data = tz.update_in(data, key, lambda x: merged_file)
    if "align_bam" in data and "work_bam" in data:
        data["align_bam"] = data["work_bam"]
    return data
def merge_sample(data):
    """Merge fastq and BAM files for multiple samples.
    """
    logger.info("Combining fastq and BAM files %s" % str(data["name"]))
    config = _update_config_w_custom(data["config"], data["info"])
    genome_build, sam_ref = ref_genome_info(data["info"], config, data["dirs"])
    if config["algorithm"].get("upload_fastq", False):
        fastq1, fastq2 = combine_fastq_files(data["fastq_files"], data["dirs"]["work"],
                                             config)
    else:
        fastq1, fastq2 = None, None
    sort_bam = merge_bam_files(data["bam_files"], data["dirs"]["work"], config)
    return [[{"name": data["name"], "metadata": data["info"].get("metadata", {}),
              "genome_build": genome_build, "sam_ref": sam_ref,
              "work_bam": sort_bam, "fastq1": fastq1, "fastq2": fastq2,
              "dirs": data["dirs"], "config": config,
              "config_file": data["config_file"]}]]
def merge_extras(items, config):
    """Merge extra disambiguated reads into a final BAM file.
    """
    final = {}
    for extra_name in items[0]["disambiguate"].keys():
        in_files = []
        for data in items:
            in_files.append(data["disambiguate"][extra_name])
        out_file = "%s-allmerged%s" % os.path.splitext(in_files[0])
        if in_files[0].endswith(".bam"):
            merged_file = merge.merge_bam_files(in_files, os.path.dirname(out_file),
                                                items[0], out_file=out_file)
        else:
            assert extra_name == "summary", extra_name
            merged_file = _merge_summary(in_files, out_file, items[0])
        final[extra_name] = merged_file
    out = []
    for data in items:
        data["disambiguate"] = final
        out.append([data])
    return out
def merge_extras(items, config):
    """Merge extra disambiguated reads into a final BAM file.
    """
    final = {}
    for extra_name in items[0]["disambiguate"].keys():
        in_files = []
        for data in items:
            in_files.append(data["disambiguate"][extra_name])
        out_file = "%s-allmerged%s" % os.path.splitext(in_files[0])
        if in_files[0].endswith(".bam"):
            merged_file = merge.merge_bam_files(in_files, os.path.dirname(out_file),
                                                config, out_file=out_file)
        else:
            assert extra_name == "summary", extra_name
            merged_file = _merge_summary(in_files, out_file, items[0])
        final[extra_name] = merged_file
    out = []
    for data in items:
        data["disambiguate"] = final
        out.append([data])
    return out