def _get_ref_from_galaxy_loc(name, genome_build, loc_file, galaxy_dt, need_remap,
                             galaxy_config, data):
    """Retrieve reference genome file from Galaxy *.loc file.

    Reads from tool_data_table_conf.xml information for the index if it exists,
    otherwise uses heuristics to find line based on most common setups.
    """
    refs = [ref for dbkey, ref in _galaxy_loc_iter(loc_file, galaxy_dt, need_remap)
            if dbkey == genome_build]
    remap_fn = alignment.TOOLS[name].remap_index_fn
    need_remap = remap_fn is not None
    if len(refs) == 0:
        raise ValueError("Did not find genome build %s in bcbio installation: %s" %
                         (genome_build, os.path.normpath(loc_file)))
    else:
        cur_ref = refs[-1]
    # Find genome directory and check for packed wf tarballs
    cur_ref_norm = os.path.normpath(utils.add_full_path(cur_ref, galaxy_config["tool_data_path"]))
    base_dir_i = cur_ref_norm.find("/%s/" % genome_build)
    base_dir = os.path.join(cur_ref_norm[:base_dir_i], genome_build)
    for tarball in glob.glob(os.path.join(base_dir, "*-wf.tar.gz")):
        cwlutils.unpack_tarballs(tarball, {"dirs": {"work": base_dir}}, use_subdir=False)
    if need_remap:
        assert remap_fn is not None, "%s requires remapping function from base location file" % name
        cur_ref = os.path.normpath(utils.add_full_path(cur_ref, galaxy_config["tool_data_path"]))
        cur_ref = remap_fn(os.path.abspath(cur_ref))
    return cur_ref
def _get_ref_from_galaxy_loc(name, genome_build, loc_file, galaxy_dt, need_remap,
                             galaxy_config, data):
    """Retrieve reference genome file from Galaxy *.loc file.

    Reads from tool_data_table_conf.xml information for the index if it exists,
    otherwise uses heuristics to find line based on most common setups.
    """
    refs = [ref for dbkey, ref in _galaxy_loc_iter(loc_file, galaxy_dt, need_remap)
            if dbkey == genome_build]
    remap_fn = alignment.TOOLS[name].remap_index_fn
    need_remap = remap_fn is not None
    if len(refs) == 0:
        logger.info("Downloading %s %s from AWS" % (genome_build, name))
        cur_ref = download_prepped_genome(genome_build, data, name, need_remap)
    # allow multiple references in a file and use the most recently added
    else:
        cur_ref = refs[-1]
    # Find genome directory and check for packed wf tarballs
    cur_ref_norm = os.path.normpath(utils.add_full_path(cur_ref, galaxy_config["tool_data_path"]))
    base_dir_i = cur_ref_norm.find("/%s/" % genome_build)
    base_dir = os.path.join(cur_ref_norm[:base_dir_i], genome_build)
    for tarball in glob.glob(os.path.join(base_dir, "*-wf.tar.gz")):
        cwlutils.unpack_tarballs(tarball, {"dirs": {"work": base_dir}}, use_subdir=False)
    if need_remap:
        assert remap_fn is not None, "%s requires remapping function from base location file" % name
        cur_ref = os.path.normpath(utils.add_full_path(cur_ref, galaxy_config["tool_data_path"]))
        cur_ref = remap_fn(os.path.abspath(cur_ref))
    return cur_ref
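# Illustrative sketch (not the bcbio implementation): both variants above expect
# _galaxy_loc_iter to yield (dbkey, path) pairs from a tab-delimited Galaxy *.loc
# file. A minimal reader under that assumption could look like the following;
# the helper name and column heuristics here are hypothetical simplifications.
def _iter_loc_file_sketch(loc_file):
    """Yield (dbkey, path) pairs from a simple multi-column Galaxy *.loc file."""
    with open(loc_file) as in_handle:
        for line in in_handle:
            line = line.strip()
            if not line or line.startswith("#"):
                continue  # skip comments and blank lines
            parts = line.split("\t")
            if len(parts) >= 2:
                # heuristic: dbkey is commonly the first column, path the last
                yield parts[0], parts[-1]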
def postprocess_variants(items):
    """Provide post-processing of variant calls: filtering and effects annotation.
    """
    vrn_key = "vrn_file"
    if not isinstance(items, dict):
        items = [utils.to_single_data(x) for x in items]
        if "vrn_file_joint" in items[0]:
            vrn_key = "vrn_file_joint"
    data, items = _get_batch_representative(items, vrn_key)
    items = cwlutils.unpack_tarballs(items, data)
    data = cwlutils.unpack_tarballs(data, data)
    cur_name = "%s, %s" % (dd.get_sample_name(data), get_variantcaller(data))
    logger.info("Finalizing variant calls: %s" % cur_name)
    orig_vrn_file = data.get(vrn_key)
    data = _symlink_to_workdir(data, [vrn_key])
    data = _symlink_to_workdir(data, ["config", "algorithm", "variant_regions"])
    if data.get(vrn_key):
        logger.info("Calculating variation effects for %s" % cur_name)
        ann_vrn_file, vrn_stats = effects.add_to_vcf(data[vrn_key], data)
        if ann_vrn_file:
            data[vrn_key] = ann_vrn_file
        if vrn_stats:
            data["vrn_stats"] = vrn_stats
        orig_items = _get_orig_items(items)
        logger.info("Annotate VCF file: %s" % cur_name)
        data[vrn_key] = annotation.finalize_vcf(data[vrn_key], get_variantcaller(data), orig_items)
        if dd.get_analysis(data).lower().find("rna-seq") >= 0:
            logger.info("Annotate RNA editing sites")
            ann_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"], data)
            if ann_file:
                data[vrn_key] = ann_file
        if cwlutils.is_cwl_run(data):
            logger.info("Annotate with population level variation data")
            ann_file = population.run_vcfanno(dd.get_vrn_file(data), data, population.do_db_build([data]))
            if ann_file:
                data[vrn_key] = ann_file
        logger.info("Filtering for %s" % cur_name)
        data[vrn_key] = variant_filtration(data[vrn_key], dd.get_ref_file(data),
                                           tz.get_in(("genome_resources", "variation"), data, {}),
                                           data, orig_items)
        logger.info("Prioritization for %s" % cur_name)
        prio_vrn_file = prioritize.handle_vcf_calls(data[vrn_key], data, orig_items)
        if prio_vrn_file != data[vrn_key]:
            data[vrn_key] = prio_vrn_file
            logger.info("Germline extraction for %s" % cur_name)
            data = germline.extract(data, orig_items)
        if dd.get_align_bam(data):
            data = damage.run_filter(data[vrn_key], dd.get_align_bam(data), dd.get_ref_file(data),
                                     data, orig_items)
    if orig_vrn_file and os.path.samefile(data[vrn_key], orig_vrn_file):
        data[vrn_key] = orig_vrn_file
    return [[data]]
def postprocess_variants(items):
    """Provide post-processing of variant calls: filtering and effects annotation.
    """
    vrn_key = "vrn_file"
    if not isinstance(items, dict):
        items = [utils.to_single_data(x) for x in items]
        if "vrn_file_joint" in items[0]:
            vrn_key = "vrn_file_joint"
    data, items = _get_batch_representative(items, vrn_key)
    items = cwlutils.unpack_tarballs(items, data)
    data = cwlutils.unpack_tarballs(data, data)
    cur_name = "%s, %s" % (dd.get_sample_name(data), get_variantcaller(data, require_bam=False))
    logger.info("Finalizing variant calls: %s" % cur_name)
    orig_vrn_file = data.get(vrn_key)
    data = _symlink_to_workdir(data, [vrn_key])
    data = _symlink_to_workdir(data, ["config", "algorithm", "variant_regions"])
    if data.get(vrn_key):
        logger.info("Calculating variation effects for %s" % cur_name)
        ann_vrn_file, vrn_stats = effects.add_to_vcf(data[vrn_key], data)
        if ann_vrn_file:
            data[vrn_key] = ann_vrn_file
        if vrn_stats:
            data["vrn_stats"] = vrn_stats
        orig_items = _get_orig_items(items)
        logger.info("Annotate VCF file: %s" % cur_name)
        data[vrn_key] = annotation.finalize_vcf(data[vrn_key],
                                                get_variantcaller(data, require_bam=False), orig_items)
        if cwlutils.is_cwl_run(data):
            logger.info("Annotate with population level variation data")
            ann_file = population.run_vcfanno(data[vrn_key], data)
            if ann_file:
                data[vrn_key] = ann_file
        logger.info("Filtering for %s" % cur_name)
        data[vrn_key] = variant_filtration(data[vrn_key], dd.get_ref_file(data),
                                           tz.get_in(("genome_resources", "variation"), data, {}),
                                           data, orig_items)
        logger.info("Prioritization for %s" % cur_name)
        prio_vrn_file = prioritize.handle_vcf_calls(data[vrn_key], data, orig_items)
        if prio_vrn_file != data[vrn_key]:
            data[vrn_key] = prio_vrn_file
            logger.info("Germline extraction for %s" % cur_name)
            data = germline.extract(data, orig_items)
        if dd.get_align_bam(data):
            data = damage.run_filter(data[vrn_key], dd.get_align_bam(data), dd.get_ref_file(data),
                                     data, orig_items)
    if orig_vrn_file and os.path.samefile(data[vrn_key], orig_vrn_file):
        data[vrn_key] = orig_vrn_file
    return [[data]]
def pipeline_summary(data):
    """Provide summary information on processing sample.

    Handles standard and CWL (single QC output) cases.
    """
    data = utils.to_single_data(data)
    if data["analysis"].startswith("wgbs-seq"):
        bismark_bam = dd.get_align_bam(data)
        sorted_bam = bam.sort(bismark_bam, data["config"])
        data = dd.set_align_bam(data, sorted_bam)
        data = dd.set_work_bam(data, bismark_bam)
    work_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    if not work_bam or not work_bam.endswith(".bam"):
        work_bam = None
    if dd.get_ref_file(data):
        if work_bam or (tz.get_in(["config", "algorithm", "kraken"], data)):  # kraken doesn't need bam
            logger.info("QC: %s %s" % (dd.get_sample_name(data), ", ".join(dd.get_algorithm_qc(data))))
            work_data = cwlutils.unpack_tarballs(utils.deepish_copy(data), data)
            data["summary"] = _run_qc_tools(work_bam, work_data)
            if (len(dd.get_algorithm_qc(data)) == 1 and "output_cwl_keys" in data):
                data["summary"]["qc"] = data["summary"]["qc"].get(dd.get_algorithm_qc(data)[0])
    return [[data]]
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.

    Prepares list of callable genome regions allowing subsequent parallelization.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(".bam"):
        ref_file = dd.get_ref_file(data)
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                  dd.get_sample_name(data)))
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        callable_region_bed, nblock_bed, callable_bed = \
            callable.block_regions(covinfo.raw_callable, bam_file_ready, ref_file, data)
        data["regions"] = {"nblock": nblock_bed, "callable": callable_bed,
                           "sample_callable": covinfo.callable,
                           "mapped_stats": readstats.get_cache_file(data)}
        data["depth"] = covinfo.depth_files
        data = coverage.assign_interval(data)
        if (os.path.exists(callable_region_bed) and
                not data["config"]["algorithm"].get("variant_regions")):
            data["config"]["algorithm"]["variant_regions"] = callable_region_bed
            data = clean_inputs(data)
        data = recalibrate.prep_recal(data)
        data = recalibrate.apply_recal(data)
    return [[data]]
def summary(*samples):
    """Summarize all quality metrics together"""
    samples = list(utils.flatten(samples))
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.")
    out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc"))
    out_data = os.path.join(out_dir, "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    file_list = os.path.join(out_dir, "list_files.txt")
    work_samples = [cwlutils.unpack_tarballs(utils.deepish_copy(x), x) for x in samples]
    work_samples = _report_summary(work_samples, os.path.join(out_dir, "report"))
    if not utils.file_exists(out_file):
        with tx_tmpdir(samples[0], work_dir) as tx_out:
            in_files = _get_input_files(work_samples, out_dir, tx_out)
            in_files += _merge_metrics(work_samples, out_dir)
            if _one_exists(in_files):
                with utils.chdir(out_dir):
                    _create_config_file(out_dir, work_samples)
                    input_list_file = _create_list_file(in_files, file_list)
                    if dd.get_tmp_dir(samples[0]):
                        export_tmp = "export TMPDIR=%s &&" % dd.get_tmp_dir(samples[0])
                    else:
                        export_tmp = ""
                    path_export = utils.local_path_export()
                    other_opts = config_utils.get_resources("multiqc", samples[0]["config"]).get("options", [])
                    other_opts = " ".join([str(x) for x in other_opts])
                    cmd = "{path_export}{export_tmp} {multiqc} -f -l {input_list_file} {other_opts} -o {tx_out}"
                    do.run(cmd.format(**locals()), "Run multiqc")
                    if utils.file_exists(os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(os.path.join(tx_out, "multiqc_report.html"), out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"), out_data)
    samples = _group_by_sample_and_batch(samples)
    if utils.file_exists(out_file) and samples:
        data_files = set()
        for i, data in enumerate(samples):
            data_files.add(os.path.join(out_dir, "report", "metrics", dd.get_sample_name(data) + "_bcbio.txt"))
        data_files.add(os.path.join(out_dir, "report", "metrics", "target_info.yaml"))
        data_files.add(os.path.join(out_dir, "multiqc_config.yaml"))
        data_files = [f for f in data_files if f and utils.file_exists(f)]
        if "summary" not in samples[0]:
            samples[0]["summary"] = {}
        samples[0]["summary"]["multiqc"] = {"base": out_file, "secondary": data_files}
        data_json = os.path.join(out_dir, "multiqc_data", "multiqc_data.json")
        data_json_final = _save_uploaded_data_json(samples, data_json, os.path.join(out_dir, "multiqc_data"))
        if data_json_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(data_json_final)
        file_list_final = _save_uploaded_file_list(samples, file_list, out_dir)
        if file_list_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(file_list_final)
    return [[data] for data in samples]
def combine_sample_regions(*samples):
    """Create batch-level sets of callable regions for multi-sample calling.

    Intersects all non-callable (nblock) regions from all samples in a batch,
    producing a global set of callable regions.
    """
    samples = utils.unpack_worlds(samples)
    samples = [cwlutils.unpack_tarballs(x, x) for x in samples]
    # back compatibility -- global file for entire sample set
    global_analysis_file = os.path.join(samples[0]["dirs"]["work"], "analysis_blocks.bed")
    if utils.file_exists(global_analysis_file) and not _needs_region_update(global_analysis_file, samples):
        global_no_analysis_file = os.path.join(os.path.dirname(global_analysis_file), "noanalysis_blocks.bed")
    else:
        global_analysis_file = None
    out = []
    analysis_files = []
    batches = []
    with shared.bedtools_tmpdir(samples[0]):
        for batch, items in vmulti.group_by_batch(samples, require_bam=False).items():
            batches.append(items)
            if global_analysis_file:
                analysis_file, no_analysis_file = global_analysis_file, global_no_analysis_file
            else:
                analysis_file, no_analysis_file = _combine_sample_regions_batch(batch, items)
            for data in items:
                vr_file = dd.get_variant_regions(data)
                if analysis_file:
                    analysis_files.append(analysis_file)
                    data["config"]["algorithm"]["callable_regions"] = analysis_file
                    data["config"]["algorithm"]["non_callable_regions"] = no_analysis_file
                    data["config"]["algorithm"]["callable_count"] = pybedtools.BedTool(analysis_file).count()
                elif vr_file:
                    data["config"]["algorithm"]["callable_count"] = pybedtools.BedTool(vr_file).count()
                # attach a representative sample for calculating callable region
                if not data.get("work_bam"):
                    for x in items:
                        if x.get("work_bam"):
                            data["work_bam_callable"] = x["work_bam"]
                out.append([data])
    assert len(out) == len(samples)
    if len(analysis_files) > 0:
        final_regions = pybedtools.BedTool(analysis_files[0])
        _analysis_block_stats(final_regions, batches[0])
    return out
def combine_calls(*args):
    """Combine multiple callsets into a final set of merged calls.
    """
    if len(args) == 3:
        is_cwl = False
        batch_id, samples, data = args
        caller_names, vrn_files = _organize_variants(samples, batch_id)
    else:
        is_cwl = True
        samples = [utils.to_single_data(x) for x in args]
        samples = [cwlutils.unpack_tarballs(x, x) for x in samples]
        data = samples[0]
        batch_id = data["batch_id"]
        caller_names = data["variants"]["variantcallers"]
        vrn_files = data["variants"]["calls"]
    logger.info("Ensemble consensus calls for {0}: {1}".format(batch_id, ",".join(caller_names)))
    edata = copy.deepcopy(data)
    base_dir = utils.safe_makedir(os.path.join(edata["dirs"]["work"], "ensemble", batch_id))
    if any([vcfutils.vcf_has_variants(f) for f in vrn_files]):
        # Decompose multiallelic variants and normalize
        passonly = not tz.get_in(["config", "algorithm", "ensemble", "use_filtered"], edata, False)
        vrn_files = [normalize.normalize(f, data, passonly=passonly, rerun_effects=False,
                                         remove_oldeffects=True, nonrefonly=True,
                                         work_dir=utils.safe_makedir(os.path.join(base_dir, c)))
                     for c, f in zip(caller_names, vrn_files)]
        if "classifiers" not in (dd.get_ensemble(edata) or {}):
            callinfo = _run_ensemble_intersection(batch_id, vrn_files, caller_names, base_dir, edata)
        else:
            config_file = _write_config_file(batch_id, caller_names, base_dir, edata)
            callinfo = _run_ensemble(batch_id, vrn_files, config_file, base_dir,
                                     dd.get_ref_file(edata), edata)
            callinfo["vrn_file"] = vcfutils.bgzip_and_index(callinfo["vrn_file"], data["config"])
        # After decomposing multiallelic variants and normalizing, re-evaluate effects
        ann_ma_file, _ = effects.add_to_vcf(callinfo["vrn_file"], data)
        if ann_ma_file:
            callinfo["vrn_file"] = ann_ma_file
        edata["config"]["algorithm"]["variantcaller"] = "ensemble"
        edata["vrn_file"] = callinfo["vrn_file"]
        edata["ensemble_bed"] = callinfo["bed_file"]
        callinfo["validate"] = validate.compare_to_rm(edata)[0][0].get("validate")
    else:
        out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf".format(batch_id))
        vcfutils.write_empty_vcf(out_vcf_file, samples=[dd.get_sample_name(d) for d in samples])
        callinfo = {"variantcaller": "ensemble",
                    "vrn_file": vcfutils.bgzip_and_index(out_vcf_file, data["config"]),
                    "bed_file": None}
    if is_cwl:
        callinfo["batch_samples"] = data["batch_samples"]
        callinfo["batch_id"] = batch_id
        return [{"ensemble": callinfo}]
    else:
        return [[batch_id, callinfo]]
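# Illustrative sketch: shape of the CWL-style input that the non-3-argument branch
# of combine_calls above reads. Only the keys (batch_id, variants.variantcallers,
# variants.calls) come from the code; the values here are hypothetical examples.
example_cwl_ensemble_input = {
    "batch_id": "batch1",
    "variants": {
        "variantcallers": ["gatk-haplotype", "strelka2", "vardict"],
        "calls": ["batch1-gatk-haplotype.vcf.gz",
                  "batch1-strelka2.vcf.gz",
                  "batch1-vardict.vcf.gz"],
    },
}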
def postprocess_variants(items):
    """Provide post-processing of variant calls: filtering and effects annotation.
    """
    data, items = _get_batch_representative(items, "vrn_file")
    items = cwlutils.unpack_tarballs(items, data)
    data = cwlutils.unpack_tarballs(data, data)
    cur_name = "%s, %s" % (dd.get_sample_name(data), get_variantcaller(data))
    logger.info("Finalizing variant calls: %s" % cur_name)
    orig_vrn_file = data.get("vrn_file")
    data = _symlink_to_workdir(data, ["vrn_file"])
    data = _symlink_to_workdir(data, ["config", "algorithm", "variant_regions"])
    if data.get("vrn_file"):
        logger.info("Calculating variation effects for %s" % cur_name)
        ann_vrn_file, vrn_stats = effects.add_to_vcf(data["vrn_file"], data)
        if ann_vrn_file:
            data["vrn_file"] = ann_vrn_file
        if vrn_stats:
            data["vrn_stats"] = vrn_stats
        orig_items = _get_orig_items(items)
        logger.info("Annotate VCF file: %s" % cur_name)
        data["vrn_file"] = annotation.finalize_vcf(data["vrn_file"], get_variantcaller(data), orig_items)
        logger.info("Filtering for %s" % cur_name)
        data["vrn_file"] = variant_filtration(data["vrn_file"], dd.get_ref_file(data),
                                              tz.get_in(("genome_resources", "variation"), data, {}),
                                              data, orig_items)
        logger.info("Prioritization for %s" % cur_name)
        data["vrn_file"] = prioritize.handle_vcf_calls(data["vrn_file"], data, orig_items)
        logger.info("Germline extraction for %s" % cur_name)
        data = germline.extract(data, orig_items)
        if dd.get_align_bam(data):
            data = damage.run_filter(data["vrn_file"], dd.get_align_bam(data), dd.get_ref_file(data),
                                     data, orig_items)
    if orig_vrn_file and os.path.samefile(data["vrn_file"], orig_vrn_file):
        data["vrn_file"] = orig_vrn_file
    return [[data]]
def summary(*samples):
    """Summarize all quality metrics together"""
    samples = list(utils.flatten(samples))
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.")
    out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc"))
    out_data = os.path.join(out_dir, "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    file_list = os.path.join(out_dir, "list_files.txt")
    work_samples = [cwlutils.unpack_tarballs(utils.deepish_copy(x), x) for x in samples]
    work_samples = _report_summary(work_samples, os.path.join(out_dir, "report"))
    if not utils.file_exists(out_file):
        with tx_tmpdir(samples[0], work_dir) as tx_out:
            in_files = _get_input_files(work_samples, out_dir, tx_out)
            in_files += _merge_metrics(work_samples, out_dir)
            if _one_exists(in_files):
                with utils.chdir(out_dir):
                    _create_config_file(out_dir, work_samples)
                    input_list_file = _create_list_file(in_files, file_list)
                    if dd.get_tmp_dir(samples[0]):
                        export_tmp = "export TMPDIR=%s &&" % dd.get_tmp_dir(samples[0])
                    else:
                        export_tmp = ""
                    path_export = utils.local_path_export()
                    other_opts = config_utils.get_resources("multiqc", samples[0]["config"]).get("options", [])
                    other_opts = " ".join([str(x) for x in other_opts])
                    cmd = "{path_export}{export_tmp} {multiqc} -f -l {input_list_file} {other_opts} -o {tx_out}"
                    do.run(cmd.format(**locals()), "Run multiqc")
                    if utils.file_exists(os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(os.path.join(tx_out, "multiqc_report.html"), out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"), out_data)
    out = []
    for i, data in enumerate(_group_by_samplename(samples)):
        if i == 0:
            if utils.file_exists(out_file):
                data_files = glob.glob(os.path.join(out_dir, "multiqc_data", "*.txt"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.bed"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.txt"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.tsv"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.yaml"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*.R*"))
                data_files += glob.glob(os.path.join(out_dir, "multiqc_config.yaml"))
                data_files.append(file_list)
                if "summary" not in data:
                    data["summary"] = {}
                data["summary"]["multiqc"] = {"base": out_file, "secondary": data_files}
                file_list_final = _save_uploaded_file_list(samples, file_list, out_dir)
                if file_list_final:
                    data["summary"]["multiqc"]["secondary"].append(file_list_final)
        out.append([data])
    return out
def combine_sample_regions(*samples):
    """Create batch-level sets of callable regions for multi-sample calling.

    Intersects all non-callable (nblock) regions from all samples in a batch,
    producing a global set of callable regions.
    """
    samples = utils.unpack_worlds(samples)
    samples = cwlutils.unpack_tarballs(samples, samples[0])
    # back compatibility -- global file for entire sample set
    global_analysis_file = os.path.join(samples[0]["dirs"]["work"], "analysis_blocks.bed")
    if utils.file_exists(global_analysis_file) and not _needs_region_update(global_analysis_file, samples):
        global_no_analysis_file = os.path.join(os.path.dirname(global_analysis_file), "noanalysis_blocks.bed")
    else:
        global_analysis_file = None
    out = []
    analysis_files = []
    batches = []
    with shared.bedtools_tmpdir(samples[0]):
        for batch, items in vmulti.group_by_batch(samples, require_bam=False).items():
            batches.append(items)
            if global_analysis_file:
                analysis_file, no_analysis_file = global_analysis_file, global_no_analysis_file
            else:
                analysis_file, no_analysis_file = _combine_sample_regions_batch(batch, items)
            for data in items:
                vr_file = dd.get_variant_regions(data)
                if analysis_file:
                    analysis_files.append(analysis_file)
                    data["config"]["algorithm"]["callable_regions"] = analysis_file
                    data["config"]["algorithm"]["non_callable_regions"] = no_analysis_file
                    data["config"]["algorithm"]["callable_count"] = pybedtools.BedTool(analysis_file).count()
                elif vr_file:
                    data["config"]["algorithm"]["callable_count"] = pybedtools.BedTool(vr_file).count()
                # attach a representative sample for calculating callable region
                if not data.get("work_bam"):
                    for x in items:
                        if x.get("work_bam"):
                            data["work_bam_callable"] = x["work_bam"]
                out.append([data])
    # Ensure output order matches input order, consistency for CWL-based runs
    assert len(out) == len(samples)
    sample_indexes = {dd.get_sample_name(d): i for i, d in enumerate(samples)}

    def by_input_index(xs):
        return sample_indexes[dd.get_sample_name(xs[0])]

    out.sort(key=by_input_index)
    if len(analysis_files) > 0:
        final_regions = pybedtools.BedTool(analysis_files[0])
        _analysis_block_stats(final_regions, batches[0])
    return out
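# Illustrative sketch: the sort at the end of the second combine_sample_regions
# variant restores input ordering after group_by_batch reshuffles samples. The
# same pattern in isolation, with plain dicts standing in for sample data:
def _sort_batched_output_sketch(samples, out):
    """Reorder [[data], ...] outputs to match the original sample order by name."""
    sample_indexes = {d["name"]: i for i, d in enumerate(samples)}
    return sorted(out, key=lambda xs: sample_indexes[xs[0]["name"]])

# e.g. _sort_batched_output_sketch([{"name": "a"}, {"name": "b"}],
#                                  [[{"name": "b"}], [{"name": "a"}]])
# -> [[{"name": "a"}], [{"name": "b"}]]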
def prep_samples(*items):
    """Handle any global preparatory steps for samples with potentially shared data.

    Avoids race conditions in postprocess alignment when performing prep tasks
    on shared files between multiple similar samples.

    Cleans input BED files to avoid issues with overlapping input segments.
    """
    out = []
    for data in (utils.to_single_data(x) for x in items):
        data = cwlutils.normalize_missing(data)
        data = cwlutils.unpack_tarballs(data, data)
        data = clean_inputs(data)
        out.append([data])
    return out
def detect_sv(items, all_items=None, stage="standard"):
    """Top level parallel target for examining structural variation.

    items = sample-sv_caller list, from one batch
    """
    items = [utils.to_single_data(x) for x in items]
    items = cwlutils.unpack_tarballs(items, items[0])
    svcaller = items[0]["config"]["algorithm"].get("svcaller")
    caller_fn = _get_callers(items, stage, special_cases=True).get(svcaller)
    out = []
    batch = dd.get_batch(items[0])
    # no SV calling when just creating a PON for PureCN
    if batch == "pon_build" and "purecn" in dd.get_svcaller(items[0]):
        return out
    if svcaller and caller_fn:
        if (all_items and svcaller in _NEEDS_BACKGROUND and
                not vcfutils.is_paired_analysis([x.get("align_bam") for x in items], items)):
            names = set([dd.get_sample_name(x) for x in items])
            background = [x for x in all_items if dd.get_sample_name(x) not in names]
            for svdata in caller_fn(items, background):
                out.append([svdata])
        else:
            for svdata in caller_fn(items):
                out.append([svdata])
    else:
        for data in items:
            out.append([data])
    # Avoid nesting of callers for CWL runs for easier extraction
    if cwlutils.is_cwl_run(items[0]):
        out_cwl = []
        for data in [utils.to_single_data(x) for x in out]:
            # Run validation directly from CWL runs since we're single stage
            data = validate.evaluate(data)
            data["svvalidate"] = {"summary": tz.get_in(["sv-validate", "csv"], data)}
            svs = data.get("sv")
            if svs:
                assert len(svs) == 1, svs
                data["sv"] = svs[0]
            else:
                data["sv"] = {}
            data = _add_supplemental(data)
            out_cwl.append([data])
        return out_cwl
    return out
def compare_to_rm(data):
    """Compare final variant calls against reference materials of known calls.
    """
    if isinstance(data, (list, tuple)) and cwlutils.is_cwl_run(utils.to_single_data(data[0])):
        data = _normalize_cwl_inputs(data)
    toval_data = _get_validate(data)
    toval_data = cwlutils.unpack_tarballs(toval_data, toval_data)
    if toval_data:
        caller = _get_caller(toval_data)
        sample = dd.get_sample_name(toval_data)
        base_dir = utils.safe_makedir(os.path.join(toval_data["dirs"]["work"], "validate", sample, caller))
        if isinstance(toval_data["vrn_file"], (list, tuple)):
            raise NotImplementedError("Multiple input files for validation: %s" % toval_data["vrn_file"])
        else:
            vrn_file = os.path.abspath(toval_data["vrn_file"])
        rm_file = normalize_input_path(toval_data["config"]["algorithm"]["validate"], toval_data)
        rm_interval_file = _gunzip(normalize_input_path(
            toval_data["config"]["algorithm"].get("validate_regions"), toval_data), toval_data)
        rm_interval_file = bedutils.clean_file(rm_interval_file, toval_data, prefix="validateregions-",
                                               bedprep_dir=utils.safe_makedir(os.path.join(base_dir, "bedprep")))
        rm_file = naming.handle_synonyms(rm_file, dd.get_ref_file(toval_data),
                                         data.get("genome_build"), base_dir, data)
        rm_interval_file = (naming.handle_synonyms(rm_interval_file, dd.get_ref_file(toval_data),
                                                   data.get("genome_build"), base_dir, data)
                            if rm_interval_file else None)
        vmethod = tz.get_in(["config", "algorithm", "validate_method"], data, "rtg")
        # RTG can fail on totally empty files. Call everything in truth set as false negatives
        if not vcfutils.vcf_has_variants(vrn_file):
            eval_files = _setup_call_false(rm_file, rm_interval_file, base_dir, toval_data, "fn")
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
        # empty validation file, every call is a false positive
        elif not vcfutils.vcf_has_variants(rm_file):
            eval_files = _setup_call_fps(vrn_file, rm_interval_file, base_dir, toval_data, "fp")
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
        elif vmethod in ["rtg", "rtg-squash-ploidy"]:
            eval_files = _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, toval_data, vmethod)
            eval_files = _annotate_validations(eval_files, toval_data)
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
        elif vmethod == "hap.py":
            data["validate"] = _run_happy_eval(vrn_file, rm_file, rm_interval_file, base_dir, toval_data)
        elif vmethod == "bcbio.variation":
            data["validate"] = _run_bcbio_variation(vrn_file, rm_file, rm_interval_file, base_dir,
                                                    sample, caller, toval_data)
    return [[data]]
def pipeline_summary(data):
    """Provide summary information on processing sample.

    Handles standard and CWL (single QC output) cases.
    """
    data = utils.to_single_data(data)
    work_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    if not work_bam or not work_bam.endswith(".bam"):
        work_bam = None
    if dd.get_ref_file(data):
        if work_bam or (tz.get_in(["config", "algorithm", "kraken"], data)):  # kraken doesn't need bam
            logger.info("QC: %s %s" % (dd.get_sample_name(data), ", ".join(dd.get_algorithm_qc(data))))
            work_data = cwlutils.unpack_tarballs(utils.deepish_copy(data), data)
            data["summary"] = _run_qc_tools(work_bam, work_data)
            if (len(dd.get_algorithm_qc(data)) == 1 and "output_cwl_keys" in data):
                data["summary"]["qc"] = data["summary"]["qc"].get(dd.get_algorithm_qc(data)[0])
    return [[data]]
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.

    Prepares list of callable genome regions allowing subsequent parallelization.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    ref_file = dd.get_ref_file(data)
    artifacts = gatk.collect_artifact_metrics(data)
    if artifacts:
        data = dd.update_summary_qc(data, "picard", artifacts.pop(), artifacts)
        oxog = gatk.collect_oxog_metrics(data)
        data = dd.update_summary_qc(data, "picard", oxog.pop(), oxog)
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(".bam"):
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                  dd.get_sample_name(data)))
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        callable_region_bed, nblock_bed = \
            callable.block_regions(covinfo.raw_callable, bam_file_ready, ref_file, data)
        data["regions"] = {"nblock": nblock_bed, "callable": covinfo.raw_callable,
                           "sample_callable": covinfo.callable,
                           "mapped_stats": readstats.get_cache_file(data)}
        data["depth"] = covinfo.depth_files
        data = coverage.assign_interval(data)
        data = samtools.run_and_save(data)
        data = recalibrate.prep_recal(data)
        data = recalibrate.apply_recal(data)
    elif dd.get_variant_regions(data):
        callable_region_bed, nblock_bed = \
            callable.block_regions(dd.get_variant_regions(data), bam_file, ref_file, data)
        data["regions"] = {"nblock": nblock_bed, "callable": dd.get_variant_regions(data),
                           "sample_callable": dd.get_variant_regions(data)}
    return [[data]]
def detect_sv(items, all_items=None, stage="standard"):
    """Top level parallel target for examining structural variation.
    """
    items = [utils.to_single_data(x) for x in items]
    items = cwlutils.unpack_tarballs(items, items[0])
    svcaller = items[0]["config"]["algorithm"].get("svcaller")
    caller_fn = _get_callers(items, stage, special_cases=True).get(svcaller)
    out = []
    if svcaller and caller_fn:
        if (all_items and svcaller in _NEEDS_BACKGROUND and
                not vcfutils.is_paired_analysis([x.get("align_bam") for x in items], items)):
            names = set([dd.get_sample_name(x) for x in items])
            background = [x for x in all_items if dd.get_sample_name(x) not in names]
            for svdata in caller_fn(items, background):
                out.append([svdata])
        else:
            for svdata in caller_fn(items):
                out.append([svdata])
    else:
        for data in items:
            out.append([data])
    # Avoid nesting of callers for CWL runs for easier extraction
    if cwlutils.is_cwl_run(items[0]):
        out_cwl = []
        for data in [utils.to_single_data(x) for x in out]:
            # Run validation directly from CWL runs since we're single stage
            data = validate.evaluate(data)
            data["svvalidate"] = {"summary": tz.get_in(["sv-validate", "csv"], data)}
            svs = data.get("sv")
            if svs:
                assert len(svs) == 1, svs
                data["sv"] = svs[0]
            else:
                data["sv"] = {}
            data = _add_supplemental(data)
            out_cwl.append([data])
        return out_cwl
    return out
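# Illustrative sketch: for CWL runs both detect_sv variants flatten the per-sample
# "sv" list (asserted to hold a single entry) into a plain dict so downstream CWL
# steps can address keys directly. The same unnesting shown in isolation:
def _flatten_single_sv_sketch(data):
    svs = data.get("sv")
    data["sv"] = svs[0] if svs else {}
    return data

# e.g. _flatten_single_sv_sketch({"sv": [{"variantcaller": "manta"}]})
# -> {"sv": {"variantcaller": "manta"}}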
def pipeline_summary(data):
    """Provide summary information on processing sample.

    Handles standard and CWL (single QC output) cases.
    """
    data = utils.to_single_data(data)
    work_bam = data.get("align_bam")
    if data["analysis"].lower().startswith("smallrna-seq"):
        work_bam = data["clean_fastq"]
    elif data["analysis"].lower().startswith("chip-seq"):
        work_bam = data["raw_bam"]
    elif not work_bam.endswith(".bam"):
        work_bam = None
    if dd.get_ref_file(data) is not None and work_bam:
        logger.info("QC: %s %s" % (dd.get_sample_name(data), ", ".join(dd.get_algorithm_qc(data))))
        work_data = cwlutils.unpack_tarballs(utils.deepish_copy(data), data)
        data["summary"] = _run_qc_tools(work_bam, work_data)
        if (len(dd.get_algorithm_qc(data)) == 1 and "output_cwl_keys" in data):
            data["summary"]["qc"] = data["summary"]["qc"].get(dd.get_algorithm_qc(data)[0])
    return [[data]]
def summary(*samples):
    """Summarize all quality metrics together"""
    samples = list(utils.flatten(samples))
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.")
    out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc"))
    out_data = os.path.join(out_dir, "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    file_list = os.path.join(out_dir, "list_files.txt")
    work_samples = cwlutils.unpack_tarballs([utils.deepish_copy(x) for x in samples], samples[0])
    work_samples = _summarize_inputs(work_samples, out_dir)
    if not utils.file_exists(out_file):
        with tx_tmpdir(samples[0], work_dir) as tx_out:
            in_files = _get_input_files(work_samples, out_dir, tx_out)
            in_files += _merge_metrics(work_samples, out_dir)
            if _one_exists(in_files):
                with utils.chdir(out_dir):
                    _create_config_file(out_dir, work_samples)
                    input_list_file = _create_list_file(in_files, file_list)
                    if dd.get_tmp_dir(samples[0]):
                        export_tmp = "export TMPDIR=%s && " % dd.get_tmp_dir(samples[0])
                    else:
                        export_tmp = ""
                    locale_export = utils.locale_export()
                    path_export = utils.local_path_export()
                    other_opts = config_utils.get_resources("multiqc", samples[0]["config"]).get("options", [])
                    other_opts = " ".join([str(x) for x in other_opts])
                    cmd = ("{path_export}{export_tmp}{locale_export} "
                           "{multiqc} -f -l {input_list_file} {other_opts} -o {tx_out}")
                    do.run(cmd.format(**locals()), "Run multiqc")
                    if utils.file_exists(os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(os.path.join(tx_out, "multiqc_report.html"), out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"), out_data)
    samples = _group_by_sample_and_batch(samples)
    if utils.file_exists(out_file) and samples:
        data_files = set()
        for i, data in enumerate(samples):
            data_files.add(os.path.join(out_dir, "report", "metrics", dd.get_sample_name(data) + "_bcbio.txt"))
        data_files.add(os.path.join(out_dir, "report", "metrics", "target_info.yaml"))
        data_files.add(os.path.join(out_dir, "multiqc_config.yaml"))
        [data_files.add(f) for f in glob.glob(os.path.join(out_dir, "multiqc_data", "*"))]
        data_files = [f for f in data_files if f and utils.file_exists(f)]
        if "summary" not in samples[0]:
            samples[0]["summary"] = {}
        samples[0]["summary"]["multiqc"] = {"base": out_file, "secondary": data_files}
        data_json = os.path.join(out_dir, "multiqc_data", "multiqc_data.json")
        data_json_final = _save_uploaded_data_json(samples, data_json, os.path.join(out_dir, "multiqc_data"))
        if data_json_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(data_json_final)
        # Prepare final file list and inputs for downstream usage
        file_list_final = _save_uploaded_file_list(samples, file_list, out_dir)
        if file_list_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(file_list_final)
            if any([cwlutils.is_cwl_run(d) for d in samples]):
                for indir in ["inputs", "report"]:
                    tarball = os.path.join(out_dir, "multiqc-%s.tar.gz" % (indir))
                    if not utils.file_exists(tarball):
                        with utils.chdir(out_dir):
                            cmd = ["tar", "-czvpf", tarball, indir]
                            do.run(cmd, "Compress multiqc inputs: %s" % indir)
                    samples[0]["summary"]["multiqc"]["secondary"].append(tarball)
    if any([cwlutils.is_cwl_run(d) for d in samples]):
        samples = _add_versions(samples)
    return [[data] for data in samples]
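# Illustrative sketch of how the MultiQC command template above expands via
# cmd.format(**locals()); all paths and option values here are made-up examples.
cmd_template = ("{path_export}{export_tmp}{locale_export} "
                "{multiqc} -f -l {input_list_file} {other_opts} -o {tx_out}")
example_cmd = cmd_template.format(
    path_export="export PATH=/usr/local/bin:$PATH && ",
    export_tmp="export TMPDIR=/scratch/tmp && ",
    locale_export="export LC_ALL=C.UTF-8 &&",
    multiqc="multiqc",
    input_list_file="list_files.txt",
    other_opts="",
    tx_out="/work/qc/multiqc/txtmp")
# example_cmd is the single shell string handed to do.run for execution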
def process_alignment(data, alt_input=None):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    fastq1, fastq2 = dd.get_input_sequence_files(data)
    if alt_input:
        fastq1, fastq2 = alt_input
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" % (data["rgnames"]["lane"], aligner))
        data = align_to_sort_bam(fastq1, fastq2, aligner, data)
        if dd.get_correct_umis(data):
            data["work_bam"] = postalign.correct_umis(data)
        if dd.get_umi_consensus(data):
            data["umi_bam"] = dd.get_work_bam(data)
            if fastq2:
                f1, f2, avg_cov = postalign.umi_consensus(data)
                data["config"]["algorithm"]["rawumi_avg_cov"] = avg_cov
                del data["config"]["algorithm"]["umi_type"]
                data["config"]["algorithm"]["mark_duplicates"] = False
                data = align_to_sort_bam(f1, f2, aligner, data)
            else:
                raise ValueError("Single fastq input for UMI processing; fgbio needs paired reads: %s" %
                                 dd.get_sample_name(data))
        data = _add_supplemental_bams(data)
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError("Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s" %
                                 sort_method)
            ref_file = dd.get_ref_file(data)
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], ref_file, data["dirs"], data)
        elif bamclean == "fixrg":
            out_bam = cleanbam.fixrg(fastq1, data["rgnames"], dd.get_ref_file(data), data["dirs"], data)
        elif bamclean == "remove_extracontigs":
            out_bam = cleanbam.remove_extracontigs(fastq1, data)
        elif sort_method:
            runner = broad.runner_from_path("picard", config)
            out_file = os.path.join(data["dirs"]["work"], "{}-sort.bam".format(
                os.path.splitext(os.path.basename(fastq1))[0]))
            if not utils.file_exists(out_file):
                work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bamclean",
                                                           dd.get_sample_name(data)))
                out_file = os.path.join(work_dir, "{}-sort.bam".format(dd.get_sample_name(data)))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file)
        else:
            out_bam = _link_bam_file(fastq1, os.path.join(dd.get_work_dir(data), "prealign",
                                                          dd.get_sample_name(data)), data)
        bam.index(out_bam, data["config"])
        bam.check_header(out_bam, data["rgnames"], dd.get_ref_file(data), data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        bam.index(dedup_bam, data["config"])
        data["work_bam"] = dedup_bam
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".cram"):
        data["work_bam"] = fastq1
    elif fastq1 is None and not dd.get_aligner(data):
        data["config"]["algorithm"]["variantcaller"] = False
        data["work_bam"] = None
    elif not fastq1:
        raise ValueError("No 'files' specified for input sample: %s" % dd.get_sample_name(data))
    elif "kraken" in config["algorithm"]:  # kraken doesn't need bam
        pass
    else:
        raise ValueError("Could not process input file from sample configuration. \n" + fastq1 +
                         "\nIs the path to the file correct or is it empty?\n" +
                         "If it is a fastq file (not pre-aligned BAM or CRAM), "
                         "is an aligner specified in the input configuration?")
    if data.get("work_bam"):
        # Add stable 'align_bam' target to use for retrieving raw alignment
        data["align_bam"] = data["work_bam"]
        data = _add_hla_files(data)
    return [[data]]
def summary(*samples):
    """Summarize all quality metrics together"""
    samples = list(utils.flatten(samples))
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.")
    out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc"))
    out_data = os.path.join(out_dir, "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    file_list = os.path.join(out_dir, "list_files.txt")
    work_samples = [cwlutils.unpack_tarballs(utils.deepish_copy(x), x) for x in samples]
    work_samples = _report_summary(work_samples, os.path.join(out_dir, "report"))
    if not utils.file_exists(out_file):
        with tx_tmpdir(samples[0], work_dir) as tx_out:
            in_files = _get_input_files(work_samples, out_dir, tx_out)
            in_files += _merge_metrics(work_samples, out_dir)
            if _one_exists(in_files):
                with utils.chdir(out_dir):
                    _create_config_file(out_dir, work_samples)
                    input_list_file = _create_list_file(in_files, file_list)
                    if dd.get_tmp_dir(samples[0]):
                        export_tmp = "export TMPDIR=%s &&" % dd.get_tmp_dir(samples[0])
                    else:
                        export_tmp = ""
                    path_export = utils.local_path_export()
                    cmd = "{path_export}{export_tmp} {multiqc} -f -l {input_list_file} -o {tx_out}"
                    do.run(cmd.format(**locals()), "Run multiqc")
                    if utils.file_exists(os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(os.path.join(tx_out, "multiqc_report.html"), out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"), out_data)
    out = []
    for i, data in enumerate(_group_by_samplename(samples)):
        if i == 0:
            if utils.file_exists(out_file):
                data_files = glob.glob(os.path.join(out_dir, "multiqc_data", "*.txt"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.bed"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.txt"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.tsv"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.yaml"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*.R*"))
                data_files.append(file_list)
                if "summary" not in data:
                    data["summary"] = {}
                data["summary"]["multiqc"] = {"base": out_file, "secondary": data_files}
                file_list_final = _save_uploaded_file_list(samples, file_list, out_dir)
                if file_list_final:
                    data["summary"]["multiqc"]["secondary"].append(file_list_final)
        out.append([data])
    return out
def process_alignment(data, alt_input=None):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    fastq1, fastq2 = dd.get_input_sequence_files(data)
    if alt_input:
        fastq1, fastq2 = alt_input
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" % (data["rgnames"]["lane"], aligner))
        data = align_to_sort_bam(fastq1, fastq2, aligner, data)
        if dd.get_umi_consensus(data):
            data["umi_bam"] = dd.get_work_bam(data)
            if fastq2:
                f1, f2 = postalign.umi_consensus(data)
                del data["config"]["algorithm"]["umi_type"]
                data["config"]["algorithm"]["mark_duplicates"] = False
                data = align_to_sort_bam(f1, f2, aligner, data)
        data = _add_supplemental_bams(data)
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError("Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s" %
                                 sort_method)
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], dd.get_ref_file(data),
                                           data["dirs"], data)
        elif bamclean == "fixrg":
            out_bam = cleanbam.fixrg(fastq1, data["rgnames"], dd.get_ref_file(data), data["dirs"], data)
        elif bamclean == "remove_extracontigs":
            out_bam = cleanbam.remove_extracontigs(fastq1, data)
            data["reference"]["fasta"] = bam.ref_file_from_bam(out_bam, data)
        elif sort_method:
            runner = broad.runner_from_path("picard", config)
            out_file = os.path.join(data["dirs"]["work"], "{}-sort.bam".format(
                os.path.splitext(os.path.basename(fastq1))[0]))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file)
        else:
            out_bam = link_bam_file(fastq1, os.path.join(data["dirs"]["work"], "prealign",
                                                         data["rgnames"]["sample"]))
        bam.index(out_bam, data["config"])
        bam.check_header(out_bam, data["rgnames"], dd.get_ref_file(data), data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        bam.index(dedup_bam, data["config"])
        data["work_bam"] = dedup_bam
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".cram"):
        data["work_bam"] = fastq1
    elif fastq1 is None and "vrn_file" in data:
        data["config"]["algorithm"]["variantcaller"] = False
        data["work_bam"] = None
    elif not fastq1:
        raise ValueError("No 'files' specified for input sample: %s" % dd.get_sample_name(data))
    elif "kraken" in config["algorithm"]:  # kraken doesn't need bam
        pass
    else:
        raise ValueError("Could not process input file from sample configuration. \n" + fastq1 +
                         "\nIs the path to the file correct or is it empty?\n" +
                         "If it is a fastq file (not pre-aligned BAM or CRAM), "
                         "is an aligner specified in the input configuration?")
    if data.get("work_bam"):
        # Add stable 'align_bam' target to use for retrieving raw alignment
        data["align_bam"] = data["work_bam"]
        data = _add_hla_files(data)
    return [[data]]