def _group_batches_shared(xs, caller_batch_fn, prep_data_fn):
    """Shared functionality for grouping by batches for variant calling and joint calling.
    """
    singles = []
    batch_groups = collections.defaultdict(list)
    for args in xs:
        assert len(args) == 1
        data = args[0]
        caller, batch = caller_batch_fn(data)
        region = _list_to_tuple(data["region"]) if "region" in data else ()
        if batch is not None:
            batches = batch if isinstance(batch, (list, tuple)) else [batch]
            for b in batches:
                batch_groups[(b, region, caller)].append(utils.deepish_copy(data))
        else:
            data = prep_data_fn(data, [data])
            singles.append(data)
    batches = []
    for batch, items in batch_groups.items():
        batch_data = utils.deepish_copy(_pick_lead_item(items))
        batch_data = prep_data_fn(batch_data, items)
        batch_data["group_orig"] = _collapse_subitems(batch_data, items)
        batch_data["group"] = batch
        batches.append(batch_data)
    return singles + batches

def _group_batches_shared(xs, caller_batch_fn, prep_data_fn):
    """Shared functionality for grouping by batches for variant calling and joint calling.
    """
    singles = []
    batch_groups = collections.defaultdict(list)
    for args in xs:
        data = utils.to_single_data(args)
        caller, batch = caller_batch_fn(data)
        region = _list_to_tuple(data["region"]) if "region" in data else ()
        if batch is not None:
            batches = batch if isinstance(batch, (list, tuple)) else [batch]
            for b in batches:
                batch_groups[(b, region, caller)].append(utils.deepish_copy(data))
        else:
            data = prep_data_fn(data, [data])
            singles.append(data)
    batches = []
    for batch, items in batch_groups.items():
        batch_data = utils.deepish_copy(_pick_lead_item(items))
        # For nested primary batches, split permanently by batch
        if tz.get_in(["metadata", "batch"], batch_data):
            batch_name = batch[0]
            batch_data["metadata"]["batch"] = batch_name
        batch_data = prep_data_fn(batch_data, items)
        batch_data["group_orig"] = _collapse_subitems(batch_data, items)
        batch_data["group"] = batch
        batches.append(batch_data)
    return singles + batches

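# Standalone sketch (toy data only, not from bcbio) of the grouping key used by
# _group_batches_shared: samples sharing (batch, region, caller) collect into one
# group, while unbatched samples stay single.
import collections

toy_samples = [{"metadata": {"batch": "b1"}, "region": ("chr1", 0, 1000), "caller": "gatk"},
               {"metadata": {"batch": "b1"}, "region": ("chr1", 0, 1000), "caller": "gatk"},
               {"metadata": {"batch": None}, "region": ("chr1", 0, 1000), "caller": "gatk"}]
toy_groups = collections.defaultdict(list)
toy_singles = []
for s in toy_samples:
    batch = s["metadata"]["batch"]
    if batch is not None:
        toy_groups[(batch, s["region"], s["caller"])].append(s)
    else:
        toy_singles.append(s)
# toy_groups has one key ("b1", ("chr1", 0, 1000), "gatk") with two samples; toy_singles has one
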
def _get_validate(data):
    """Retrieve items to validate, from single samples or from combined joint calls.
    """
    if data.get("vrn_file") and tz.get_in(["config", "algorithm", "validate"], data):
        return utils.deepish_copy(data)
    elif "group_orig" in data:
        for sub in multi.get_orig_items(data):
            if "validate" in sub["config"]["algorithm"]:
                sub_val = utils.deepish_copy(sub)
                sub_val["vrn_file"] = data["vrn_file"]
                return sub_val
    return None

def get_analysis_intervals(data, vrn_file, base_dir):
    """Retrieve analysis regions for the current variant calling pipeline.
    """
    from bcbio.bam import callable
    if vrn_file and vcfutils.is_gvcf_file(vrn_file):
        callable_bed = _callable_from_gvcf(data, vrn_file, base_dir)
        if callable_bed:
            return callable_bed
    if data.get("ensemble_bed"):
        return data["ensemble_bed"]
    elif dd.get_sample_callable(data):
        return dd.get_sample_callable(data)
    elif data.get("align_bam"):
        return callable.sample_callable_bed(data["align_bam"], dd.get_ref_file(data), data)[0]
    elif data.get("work_bam"):
        return callable.sample_callable_bed(data["work_bam"], dd.get_ref_file(data), data)[0]
    elif data.get("work_bam_callable"):
        data = utils.deepish_copy(data)
        data["work_bam"] = data.pop("work_bam_callable")
        return callable.sample_callable_bed(data["work_bam"], dd.get_ref_file(data), data)[0]
    elif tz.get_in(["config", "algorithm", "callable_regions"], data):
        return tz.get_in(["config", "algorithm", "callable_regions"], data)
    elif tz.get_in(["config", "algorithm", "variant_regions"], data):
        return tz.get_in(["config", "algorithm", "variant_regions"], data)

def _get_split_tasks(args, split_fn, file_key, outfile_i=-1):
    """Split up input files and arguments, returning arguments for parallel processing.

    outfile_i specifies the location of the output file in the arguments to
    the processing function. Defaults to the last item in the list.
    """
    split_args = []
    combine_map = {}
    finished_map = collections.OrderedDict()
    extras = []
    for data in args:
        out_final, out_parts = split_fn(data)
        for parts in out_parts:
            split_args.append([utils.deepish_copy(data)] + list(parts))
        for part_file in [x[outfile_i] for x in out_parts]:
            combine_map[part_file] = out_final
        if len(out_parts) == 0:
            if out_final is not None:
                if out_final not in finished_map:
                    data[file_key] = out_final
                    finished_map[out_final] = [data]
                else:
                    extras.append([data])
            else:
                extras.append([data])
    return split_args, combine_map, finished_map.values(), extras

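# Hypothetical split_fn sketch for _get_split_tasks (names are illustrative): returns the
# combined output file plus per-region argument tuples, with each part's output file as
# the last element so the default outfile_i=-1 points at it.
def toy_split_by_region(data):
    sample = data.get("name", "sample1")
    out_final = "%s-combined.vcf.gz" % sample
    out_parts = [("chr1", "%s-chr1.vcf.gz" % sample),
                 ("chr2", "%s-chr2.vcf.gz" % sample)]
    return out_final, out_parts

# Example call (assumes the module context above):
# split_args, combine_map, finished, extras = _get_split_tasks([{"name": "sample1"}],
#                                                              toy_split_by_region, "vrn_file")
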
def _run_ensemble_intersection(batch_id, vrn_files, base_dir, edata):
    """Run intersection n out of x based ensemble method using bcbio.variation.recall.
    """
    out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf.gz".format(batch_id))
    if not utils.file_exists(out_vcf_file):
        num_pass = _get_num_pass(edata, len(vrn_files))
        cmd = [config_utils.get_program("bcbio-variation-recall", edata["config"]),
               "ensemble", "--cores=%s" % edata["config"]["algorithm"].get("num_cores", 1),
               "--numpass", str(num_pass)]
        # Remove filtered calls if we're dealing with tumor/normal calls
        if vcfutils.get_paired_phenotype(edata):
            cmd += ["--nofiltered"]
        cmd += [out_vcf_file, dd.get_ref_file(edata)] + vrn_files
        do.run(cmd, "Ensemble intersection calling: %s" % (batch_id))
    in_data = utils.deepish_copy(edata)
    in_data["vrn_file"] = out_vcf_file
    return {"variantcaller": "ensemble",
            "vrn_file": out_vcf_file,
            "bed_file": None}

def gatk_rnaseq_calling(data):
    """Use GATK to perform gVCF variant calling on RNA-seq data
    """
    from bcbio.bam import callable
    data = utils.deepish_copy(data)
    tools_on = dd.get_tools_on(data)
    if not tools_on:
        tools_on = []
    tools_on.append("gvcf")
    data = dd.set_tools_on(data, tools_on)
    data = dd.set_jointcaller(data, ["%s-joint" % v for v in dd.get_variantcaller(data)])
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "variation", "rnaseq",
                                              "gatk-haplotype"))
    data = _setup_variant_regions(data, out_dir)
    out_file = os.path.join(out_dir, "%s-gatk-haplotype.vcf.gz" % dd.get_sample_name(data))
    if not utils.file_exists(out_file):
        region_files = []
        regions = []
        for cur_region in callable.get_split_regions(dd.get_variant_regions(data), data):
            str_region = "_".join([str(x) for x in cur_region])
            region_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                                                       "variation", "rnaseq",
                                                                       "gatk-haplotype", "regions")),
                                       "%s-%s-gatk-haplotype.vcf.gz" % (dd.get_sample_name(data), str_region))
            region_file = gatk.haplotype_caller([dd.get_split_bam(data)], [data], dd.get_ref_file(data), {},
                                                region=cur_region, out_file=region_file)
            region_files.append(region_file)
            regions.append(cur_region)
        out_file = vcfutils.concat_variant_files(region_files, out_file, regions,
                                                 dd.get_ref_file(data), data["config"])
    return dd.set_vrn_file(data, out_file)

def batch_for_variantcall(samples):
    """Prepare a set of samples for parallel variant calling.

    CWL input target that groups samples into batches and variant callers
    for parallel processing.

    If doing joint calling, with `tools_on: [gvcf]`, split the sample into
    individuals instead of combining into a batch.
    """
    to_process, extras = _dup_samples_by_variantcaller(samples, require_bam=False)
    batch_groups = collections.defaultdict(list)
    to_process = [utils.to_single_data(x) for x in to_process]
    for data in cwlutils.samples_to_records(to_process):
        vc = get_variantcaller(data, require_bam=False)
        batches = dd.get_batches(data) or dd.get_sample_name(data)
        if not isinstance(batches, (list, tuple)):
            batches = [batches]
        for b in batches:
            batch_groups[(b, vc)].append(utils.deepish_copy(data))
    batches = []
    for cur_group in batch_groups.values():
        joint_calling = any([is_joint(d) for d in cur_group])
        if joint_calling:
            for d in cur_group:
                batches.append([d])
        else:
            batches.append(cur_group)
    return batches + extras

def _run_qc_tools(bam_file, data):
    """Run a set of third party quality control tools, returning QC directory and metrics.

    :param bam_file: alignments in bam format
    :param data: dict with all configuration information

    :returns: dict with output of different tools
    """
    from bcbio.qc import (atropos, coverage, damage, fastqc, kraken, qsignature, qualimap,
                          samtools, picard, srna, umi, variant, viral, preseq)
    tools = {"fastqc": fastqc.run,
             "atropos": atropos.run,
             "small-rna": srna.run,
             "samtools": samtools.run,
             "qualimap": qualimap.run,
             "qualimap_rnaseq": qualimap.run_rnaseq,
             "qsignature": qsignature.run,
             "coverage": coverage.run,
             "damage": damage.run,
             "variants": variant.run,
             "peddy": peddy.run_qc,
             "kraken": kraken.run,
             "picard": picard.run,
             "umi": umi.run,
             "viral": viral.run,
             "preseq": preseq.run,
             }
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    metrics = {}
    qc_out = utils.deepish_copy(dd.get_summary_qc(data))
    for program_name in dd.get_algorithm_qc(data):
        if not bam_file and program_name != "kraken":  # kraken doesn't need bam
            continue
        if dd.get_phenotype(data) == "germline" and program_name != "variants":
            continue
        qc_fn = tools[program_name]
        cur_qc_dir = os.path.join(qc_dir, program_name)
        out = qc_fn(bam_file, data, cur_qc_dir)
        qc_files = None
        if out and isinstance(out, dict):
            # Check for metrics output, two cases:
            # 1. output with {"metrics"} and files ("base")
            if "metrics" in out:
                metrics.update(out.pop("metrics"))
            # 2. a dictionary of metrics
            elif "base" not in out:
                metrics.update(out)
            # Check for files only output
            if "base" in out:
                qc_files = out
        elif out and isinstance(out, str) and os.path.exists(out):
            qc_files = {"base": out, "secondary": []}
        if not qc_files:
            qc_files = _organize_qc_files(program_name, cur_qc_dir)
        if qc_files:
            qc_out[program_name] = qc_files
    metrics["Name"] = dd.get_sample_name(data)
    metrics["Quality format"] = dd.get_quality_format(data).lower()
    return {"qc": qc_out, "metrics": metrics}

def gatk_filter_rnaseq(vrn_file, data):
    """
    this incorporates filters listed here, dropping clusters of variants
    within a 35 nucleotide window, high Fisher strand values and low
    quality by depth
    https://software.broadinstitute.org/gatk/guide/article?id=3891
    java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta -V
    input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0"
    -filterName QD -filter "QD < 2.0" -o output.vcf
    """
    out_file = "%s-filter%s" % utils.splitext_plus(vrn_file)
    if not file_exists(out_file):
        ref_file = dd.get_ref_file(data)
        with file_transaction(data, out_file) as tx_out_file:
            params = ["VariantFiltration",
                      "-R", ref_file,
                      "-V", vrn_file,
                      "--cluster-window-size", "35",
                      "--cluster-size", "3",
                      "--filter-expression", "'FS > 30.0'",
                      "--filter-name", "FS",
                      "--filter-expression", "'QD < 2.0'",
                      "--filter-name", "QD",
                      "--output", tx_out_file]
            # Use GATK4 for filtering, tools_off is for variant calling
            config = utils.deepish_copy(dd.get_config(data))
            if "gatk4" in dd.get_tools_off({"config": config}):
                config["algorithm"]["tools_off"].remove("gatk4")
            jvm_opts = broad.get_gatk_opts(config, os.path.dirname(tx_out_file))
            do.run(broad.gatk_cmd("gatk", jvm_opts, params, config), "Filter RNA-seq variants.")
    return out_file

def write_project_summary(samples, qsign_info=None):
    """Write project summary information on the provided samples.

    write out dirs, genome resources,
    """
    work_dir = samples[0][0]["dirs"]["work"]
    out_file = os.path.join(work_dir, "project-summary.yaml")
    upload_dir = (os.path.join(work_dir, samples[0][0]["upload"]["dir"])
                  if "dir" in samples[0][0]["upload"] else "")
    date = str(datetime.now())
    prev_samples = _other_pipeline_samples(out_file, samples)
    with open(out_file, "w") as out_handle:
        yaml.safe_dump({"date": date}, out_handle,
                       default_flow_style=False, allow_unicode=False)
        if qsign_info:
            qsign_out = utils.deepish_copy(qsign_info[0])
            qsign_out.pop("out_dir", None)
            yaml.safe_dump({"qsignature": qsign_out}, out_handle,
                           default_flow_style=False, allow_unicode=False)
        yaml.safe_dump({"upload": upload_dir}, out_handle,
                       default_flow_style=False, allow_unicode=False)
        yaml.safe_dump({"bcbio_system": samples[0][0]["config"].get("bcbio_system", "")}, out_handle,
                       default_flow_style=False, allow_unicode=False)
        yaml.safe_dump({"samples": prev_samples + [_save_fields(sample[0]) for sample in samples]}, out_handle,
                       default_flow_style=False, allow_unicode=False)
    return out_file

def cl_gatk(self, params, tmp_dir, memscale=None):
    support_nt = set()
    support_nct = set(["BaseRecalibrator"])
    gatk_jar = self._get_jar("GenomeAnalysisTK", ["GenomeAnalysisTKLite"])
    cores = self._config["algorithm"].get("num_cores", 1)
    config = self._config
    if cores and int(cores) > 1:
        atype_index = params.index("-T") if params.count("-T") > 0 \
            else params.index("--analysis_type")
        prog = params[atype_index + 1]
        if prog in support_nt:
            params.extend(["-nt", str(cores)])
        elif prog in support_nct:
            params.extend(["-nct", str(cores)])
            if config["algorithm"].get("memory_adjust") is None:
                config = utils.deepish_copy(config)
                config["algorithm"]["memory_adjust"] = {"direction": "increase",
                                                        "magnitude": int(cores) // 2}
    if LooseVersion(self.gatk_major_version()) > LooseVersion("1.9"):
        if len([x for x in params if x.startswith(("-U", "--unsafe"))]) == 0:
            params.extend(["-U", "LENIENT_VCF_PROCESSING"])
        params.extend(["--read_filter", "BadCigar", "--read_filter", "NotPrimaryAlignment"])
    if memscale:
        jvm_opts = get_gatk_opts(config, tmp_dir=tmp_dir, memscale=memscale, include_gatk=False)
    else:
        # Decrease memory slightly from configuration to avoid memory allocation errors
        jvm_opts = config_utils.adjust_opts(self._jvm_opts,
                                            {"algorithm": {"memory_adjust":
                                                           {"magnitude": 1.1, "direction": "decrease"}}})
        jvm_opts += get_default_jvm_opts(tmp_dir)
    if "keyfile" in self._gatk_resources:
        params = ["-et", "NO_ET", "-K", self._gatk_resources["keyfile"]] + params
    return ["java"] + jvm_opts + ["-jar", gatk_jar] + [str(x) for x in params]

def _run_ensemble_intersection(batch_id, vrn_files, callers, base_dir, edata):
    """Run intersection n out of x based ensemble method using bcbio.variation.recall.
    """
    out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf.gz".format(batch_id))
    if not utils.file_exists(out_vcf_file):
        num_pass = _get_num_pass(edata, len(vrn_files))
        cmd = [config_utils.get_program("bcbio-variation-recall", edata["config"]),
               "ensemble", "--cores=%s" % edata["config"]["algorithm"].get("num_cores", 1),
               "--numpass", str(num_pass), "--names", ",".join(callers)]
        # Remove filtered calls, do not try to rescue, unless configured
        if not tz.get_in(["config", "algorithm", "ensemble", "use_filtered"], edata):
            cmd += ["--nofiltered"]
        with file_transaction(edata, out_vcf_file) as tx_out_file:
            cmd += [tx_out_file, dd.get_ref_file(edata)] + vrn_files
            cmd = "%s && %s" % (utils.get_java_clprep(), " ".join(str(x) for x in cmd))
            do.run(cmd, "Ensemble intersection calling: %s" % (batch_id))
    in_data = utils.deepish_copy(edata)
    in_data["vrn_file"] = out_vcf_file
    return {"variantcaller": "ensemble",
            "vrn_file": out_vcf_file,
            "bed_file": None}

def gatk_filter_rnaseq(vrn_file, data):
    """
    this incorporates filters listed here, dropping clusters of variants
    within a 35 nucleotide window, high Fisher strand values and low
    quality by depth
    https://software.broadinstitute.org/gatk/guide/article?id=3891
    java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta -V
    input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0"
    -filterName QD -filter "QD < 2.0" -o output.vcf
    """
    out_file = "%s-filter%s" % utils.splitext_plus(vrn_file)
    if not file_exists(out_file):
        ref_file = dd.get_ref_file(data)
        with file_transaction(data, out_file) as tx_out_file:
            params = ["VariantFiltration",
                      "-R", ref_file,
                      "-V", vrn_file,
                      "--cluster-window-size", "35",
                      "--cluster-size", "3",
                      "--filter-expression", "'FS > 30.0'",
                      "--filter-name", "FS",
                      "--filter-expression", "'QD < 2.0'",
                      "--filter-name", "QD",
                      "--output", tx_out_file]
            # Use GATK4 for filtering, tools_off is for variant calling
            config = utils.deepish_copy(dd.get_config(data))
            if "gatk4" in dd.get_tools_off({"config": config}):
                config["algorithm"]["tools_off"].remove("gatk4")
            jvm_opts = broad.get_gatk_opts(config, os.path.dirname(tx_out_file))
            do.run(broad.gatk_cmd("gatk", jvm_opts, params, config), "Filter RNA-seq variants.")
    return out_file

def pipeline_summary(data):
    """Provide summary information on processing sample.

    Handles standard and CWL (single QC output) cases.
    """
    data = utils.to_single_data(data)
    if data["analysis"].startswith("wgbs-seq"):
        bismark_bam = dd.get_align_bam(data)
        sorted_bam = bam.sort(bismark_bam, data["config"])
        data = dd.set_align_bam(data, sorted_bam)
        data = dd.set_work_bam(data, bismark_bam)
    work_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    if not work_bam or not work_bam.endswith(".bam"):
        work_bam = None
    if dd.get_ref_file(data):
        if work_bam or (tz.get_in(["config", "algorithm", "kraken"], data)):  # kraken doesn't need bam
            logger.info("QC: %s %s" % (dd.get_sample_name(data), ", ".join(dd.get_algorithm_qc(data))))
            work_data = cwlutils.unpack_tarballs(utils.deepish_copy(data), data)
            data["summary"] = _run_qc_tools(work_bam, work_data)
            if (len(dd.get_algorithm_qc(data)) == 1 and "output_cwl_keys" in data):
                data["summary"]["qc"] = data["summary"]["qc"].get(dd.get_algorithm_qc(data)[0])
    return [[data]]

def split_somatic(items):
    """Split somatic batches, adding a germline target.

    Enables separate germline calling of samples using shared alignments.
    """
    items = [_clean_flat_variantcaller(x) for x in items]
    somatic_groups, somatic, non_somatic = vcfutils.somatic_batches(items)
    # extract germline samples to run from normals in tumor/normal pairs
    germline_added = set([])
    germline = []
    for somatic_group in somatic_groups:
        paired = vcfutils.get_paired(somatic_group)
        if paired and paired.normal_data:
            cur = utils.deepish_copy(paired.normal_data)
            vc = dd.get_variantcaller(cur)
            if isinstance(vc, dict) and "germline" in vc:
                if cur["description"] not in germline_added:
                    germline_added.add(cur["description"])
                    cur["rgnames"]["sample"] = cur["description"]
                    cur["metadata"]["batch"] = "%s-germline" % cur["description"]
                    cur["metadata"]["phenotype"] = "germline"
                    cur = remove_align_qc_tools(cur)
                    cur["config"]["algorithm"]["variantcaller"] = vc["germline"]
                    germline.append(cur)
    # Fix variantcalling specification for only somatic targets
    somatic_out = []
    for data in somatic:
        vc = dd.get_variantcaller(data)
        if isinstance(vc, dict) and "somatic" in vc:
            data["config"]["algorithm"]["variantcaller"] = vc["somatic"]
        somatic_out.append(data)
    return non_somatic + somatic_out + germline

def split_somatic(items):
    """Split somatic batches, adding a germline target.

    Enables separate germline calling of samples using shared alignments.
    """
    somatic_groups, somatic, non_somatic = vcfutils.somatic_batches(items)
    # extract germline samples to run from normals in tumor/normal pairs
    germline_added = set([])
    germline = []
    for somatic_group in somatic_groups:
        paired = vcfutils.get_paired(somatic_group)
        if paired and paired.normal_data:
            cur = utils.deepish_copy(paired.normal_data)
            vc = dd.get_variantcaller(cur)
            if isinstance(vc, dict) and "germline" in vc:
                cur["description"] = "%s-germline" % cur["description"]
                if cur["description"] not in germline_added:
                    germline_added.add(cur["description"])
                    cur["rgnames"]["sample"] = cur["description"]
                    del cur["metadata"]["batch"]
                    cur["metadata"]["phenotype"] = "germline"
                    cur = remove_align_qc_tools(cur)
                    cur["config"]["algorithm"]["variantcaller"] = vc["germline"]
                    germline.append(cur)
    # Fix variantcalling specification for only somatic targets
    somatic_out = []
    for data in somatic:
        vc = dd.get_variantcaller(data)
        if isinstance(vc, dict) and "somatic" in vc:
            data["config"]["algorithm"]["variantcaller"] = vc["somatic"]
        somatic_out.append(data)
    return non_somatic + somatic_out + germline

def get_variants(data, include_germline=False):
    """Retrieve set of variant calls to use for heterogeneity analysis.
    """
    data = utils.deepish_copy(data)
    supported = ["precalled", "vardict", "vardict-java", "vardict-perl",
                 "freebayes", "octopus", "strelka2"]
    # Right now mutect2 and mutect do not provide heterozygous germline calls
    # to be useful https://github.com/bcbio/bcbio-nextgen/issues/2464
    # supported += ["mutect2", "mutect"]
    if include_germline:
        supported.insert(1, "gatk-haplotype")
    out = []
    # CWL based input
    if isinstance(data.get("variants"), dict) and "samples" in data["variants"]:
        cur_vs = []
        # Unpack single sample list of files
        if (isinstance(data["variants"]["samples"], (list, tuple)) and
              len(data["variants"]["samples"]) == 1 and
              isinstance(data["variants"]["samples"][0], (list, tuple))):
            data["variants"]["samples"] = data["variants"]["samples"][0]
        for fname in data["variants"]["samples"]:
            variantcaller = utils.splitext_plus(os.path.basename(fname))[0]
            variantcaller = variantcaller.replace(dd.get_sample_name(data) + "-", "")
            for batch in dd.get_batches(data):
                variantcaller = variantcaller.replace(batch + "-", "")
            cur_vs.append({"vrn_file": fname, "variantcaller": variantcaller})
        data["variants"] = cur_vs
    for v in data.get("variants", []):
        if v["variantcaller"] in supported and v.get("vrn_file"):
            out.append((supported.index(v["variantcaller"]), v))
    out.sort()
    return [xs[1] for xs in out]

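# Minimal standalone sketch of the caller priority ordering in get_variants (toy data
# only): each call is tagged with its caller's index in the preference list, then sorted
# so preferred callers come first.
supported_order = ["precalled", "vardict", "freebayes", "strelka2"]
toy_calls = [{"variantcaller": "strelka2", "vrn_file": "s1-strelka2.vcf.gz"},
             {"variantcaller": "vardict", "vrn_file": "s1-vardict.vcf.gz"}]
ranked = sorted(((supported_order.index(c["variantcaller"]), c) for c in toy_calls
                 if c["variantcaller"] in supported_order), key=lambda x: x[0])
ordered_calls = [c for _, c in ranked]  # vardict first, then strelka2
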
def batch_for_variantcall(samples):
    """Prepare a set of samples for parallel variant calling.

    CWL input target that groups samples into batches and variant callers
    for parallel processing.
    """
    convert_to_list = set(["config__algorithm__tools_on", "config__algorithm__tools_off"])
    to_process, extras = _dup_samples_by_variantcaller(samples, require_bam=False)
    batch_groups = collections.defaultdict(list)
    to_process = [utils.to_single_data(x) for x in to_process]
    all_keys = set([])
    for data in to_process:
        all_keys.update(set(data["cwl_keys"]))
    for data in to_process:
        for raw_key in sorted(list(all_keys)):
            key = raw_key.split("__")
            if tz.get_in(key, data) is None:
                data = tz.update_in(data, key, lambda x: None)
                data["cwl_keys"].append(raw_key)
            if raw_key in convert_to_list:
                val = tz.get_in(key, data)
                if not val:
                    val = []
                elif not isinstance(val, (list, tuple)):
                    val = [val]
                data = tz.update_in(data, key, lambda x: val)
        vc = get_variantcaller(data, require_bam=False)
        batches = dd.get_batches(data) or dd.get_sample_name(data)
        if not isinstance(batches, (list, tuple)):
            batches = [batches]
        for b in batches:
            batch_groups[(b, vc)].append(utils.deepish_copy(data))
    return list(batch_groups.values()) + extras

def _run_ensemble_intersection(batch_id, vrn_files, callers, base_dir, edata):
    """Run intersection n out of x based ensemble method using bcbio.variation.recall.
    """
    out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf.gz".format(batch_id))
    if not utils.file_exists(out_vcf_file):
        num_pass = _get_num_pass(edata, len(vrn_files))
        cmd = [config_utils.get_program("bcbio-variation-recall", edata["config"]),
               "ensemble", "--cores=%s" % edata["config"]["algorithm"].get("num_cores", 1),
               "--numpass", str(num_pass), "--names", ",".join(callers)]
        # Remove filtered calls, do not try to rescue, unless configured
        if not tz.get_in(["config", "algorithm", "ensemble", "use_filtered"], edata):
            cmd += ["--nofiltered"]
        with file_transaction(edata, out_vcf_file) as tx_out_file:
            cmd += [tx_out_file, dd.get_ref_file(edata)] + vrn_files
            cmd = "%s %s" % (utils.local_path_export(), " ".join(str(x) for x in cmd))
            do.run(cmd, "Ensemble intersection calling: %s" % (batch_id))
    in_data = utils.deepish_copy(edata)
    in_data["vrn_file"] = out_vcf_file
    return {"variantcaller": "ensemble",
            "vrn_file": out_vcf_file,
            "bed_file": None}

def _run_with_memory_scaling(params, tx_out_file, data, ld_preload=False):
    num_cores = dd.get_num_cores(data)
    memscale = {"magnitude": 0.9 * num_cores, "direction": "increase"} if num_cores > 1 else None
    # Ignore tools_off: [gatk4], since it doesn't apply to GATK CNV calling
    config = utils.deepish_copy(data["config"])
    if "gatk4" in dd.get_tools_off({"config": config}):
        config["algorithm"]["tools_off"].remove("gatk4")
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale, ld_preload=ld_preload)

def get_orig_items(base):
    """Retrieve original items from a diffed set of nested samples.
    """
    assert "group_orig" in base
    out = []
    for data_diff in base["group_orig"]:
        new = utils.deepish_copy(base)
        new.pop("group_orig")
        out.append(_patch_dict(data_diff, new))
    return out

def summary(*samples):
    """Summarize all quality metrics together"""
    samples = list(utils.flatten(samples))
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.")
    out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc"))
    out_data = os.path.join(out_dir, "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    file_list = os.path.join(out_dir, "list_files.txt")
    work_samples = [cwlutils.unpack_tarballs(utils.deepish_copy(x), x) for x in samples]
    work_samples = _report_summary(work_samples, os.path.join(out_dir, "report"))
    if not utils.file_exists(out_file):
        with tx_tmpdir(samples[0], work_dir) as tx_out:
            in_files = _get_input_files(work_samples, out_dir, tx_out)
            in_files += _merge_metrics(work_samples, out_dir)
            if _one_exists(in_files):
                with utils.chdir(out_dir):
                    _create_config_file(out_dir, work_samples)
                    input_list_file = _create_list_file(in_files, file_list)
                    if dd.get_tmp_dir(samples[0]):
                        export_tmp = "export TMPDIR=%s &&" % dd.get_tmp_dir(samples[0])
                    else:
                        export_tmp = ""
                    path_export = utils.local_path_export()
                    other_opts = config_utils.get_resources("multiqc", samples[0]["config"]).get("options", [])
                    other_opts = " ".join([str(x) for x in other_opts])
                    cmd = "{path_export}{export_tmp} {multiqc} -f -l {input_list_file} {other_opts} -o {tx_out}"
                    do.run(cmd.format(**locals()), "Run multiqc")
                    if utils.file_exists(os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(os.path.join(tx_out, "multiqc_report.html"), out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"), out_data)
    samples = _group_by_sample_and_batch(samples)
    if utils.file_exists(out_file) and samples:
        data_files = set()
        for i, data in enumerate(samples):
            data_files.add(os.path.join(out_dir, "report", "metrics", dd.get_sample_name(data) + "_bcbio.txt"))
        data_files.add(os.path.join(out_dir, "report", "metrics", "target_info.yaml"))
        data_files.add(os.path.join(out_dir, "multiqc_config.yaml"))
        data_files = [f for f in data_files if f and utils.file_exists(f)]
        if "summary" not in samples[0]:
            samples[0]["summary"] = {}
        samples[0]["summary"]["multiqc"] = {"base": out_file, "secondary": data_files}
        data_json = os.path.join(out_dir, "multiqc_data", "multiqc_data.json")
        data_json_final = _save_uploaded_data_json(samples, data_json, os.path.join(out_dir, "multiqc_data"))
        if data_json_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(data_json_final)
        file_list_final = _save_uploaded_file_list(samples, file_list, out_dir)
        if file_list_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(file_list_final)
    return [[data] for data in samples]

def summary(*samples):
    """Summarize all quality metrics together"""
    samples = list(utils.flatten(samples))
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.")
    out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc"))
    out_data = os.path.join(out_dir, "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    file_list = os.path.join(out_dir, "list_files.txt")
    work_samples = [cwlutils.unpack_tarballs(utils.deepish_copy(x), x) for x in samples]
    work_samples = _report_summary(work_samples, os.path.join(out_dir, "report"))
    if not utils.file_exists(out_file):
        with tx_tmpdir(samples[0], work_dir) as tx_out:
            in_files = _get_input_files(work_samples, out_dir, tx_out)
            in_files += _merge_metrics(work_samples, out_dir)
            if _one_exists(in_files):
                with utils.chdir(out_dir):
                    _create_config_file(out_dir, work_samples)
                    input_list_file = _create_list_file(in_files, file_list)
                    if dd.get_tmp_dir(samples[0]):
                        export_tmp = "export TMPDIR=%s &&" % dd.get_tmp_dir(samples[0])
                    else:
                        export_tmp = ""
                    path_export = utils.local_path_export()
                    other_opts = config_utils.get_resources("multiqc", samples[0]["config"]).get("options", [])
                    other_opts = " ".join([str(x) for x in other_opts])
                    cmd = "{path_export}{export_tmp} {multiqc} -f -l {input_list_file} {other_opts} -o {tx_out}"
                    do.run(cmd.format(**locals()), "Run multiqc")
                    if utils.file_exists(os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(os.path.join(tx_out, "multiqc_report.html"), out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"), out_data)
    out = []
    for i, data in enumerate(_group_by_samplename(samples)):
        if i == 0:
            if utils.file_exists(out_file):
                data_files = glob.glob(os.path.join(out_dir, "multiqc_data", "*.txt"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.bed"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.txt"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.tsv"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.yaml"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*.R*"))
                data_files += glob.glob(os.path.join(out_dir, "multiqc_config.yaml"))
                data_files.append(file_list)
                if "summary" not in data:
                    data["summary"] = {}
                data["summary"]["multiqc"] = {"base": out_file, "secondary": data_files}
                file_list_final = _save_uploaded_file_list(samples, file_list, out_dir)
                if file_list_final:
                    data["summary"]["multiqc"]["secondary"].append(file_list_final)
        out.append([data])
    return out

def _concat_records(items_by_key, input_order):
    """Concatenate records into a single key to avoid merging.

    Handles heterogeneous records that will then be sorted out in the
    processing function.
    """
    all_records = []
    for (k, t) in input_order.items():
        if t == "record":
            all_records.append(k)
    out_items_by_key = utils.deepish_copy(items_by_key)
    out_input_order = utils.deepish_copy(input_order)
    if len(all_records) > 1:
        final_k = all_records[0]
        final_v = items_by_key[final_k]
        for k in all_records[1:]:
            final_v += items_by_key[k]
            del out_items_by_key[k]
            del out_input_order[k]
        out_items_by_key[final_k] = final_v
    return out_items_by_key, out_input_order

def _check_for_single_nested(target, items_by_key, input_order):
    """Check for single nested inputs that match our target count and unnest.

    Handles complex var inputs where some have an extra layer of nesting.
    """
    out = utils.deepish_copy(items_by_key)
    for (k, t) in input_order.items():
        if t == "var":
            v = items_by_key[tuple(k.split("__"))]
            if _is_nested_single(v, target):
                out[tuple(k.split("__"))] = v[0]
    return out

def add_highdepth_genome_exclusion(items):
    """Add exclusions to input items to avoid slow runtimes on whole genomes.
    """
    out = []
    for d in items:
        d = utils.deepish_copy(d)
        if dd.get_coverage_interval(d) == "genome":
            e = dd.get_exclude_regions(d)
            if "highdepth" not in e:
                e.append("highdepth")
                d = dd.set_exclude_regions(d, e)
        out.append(d)
    return out

def _run_concat_variant_files_gatk4(input_file_list, out_file, config):
    """Use GATK4 GatherVcfs for concatenation of scattered VCFs.
    """
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            params = ["-T", "GatherVcfs", "-I", input_file_list, "-O", tx_out_file]
            # Use GATK4 for merging, tools_off: [gatk4] applies to variant calling
            config = utils.deepish_copy(config)
            if "gatk4" in dd.get_tools_off({"config": config}):
                config["algorithm"]["tools_off"].remove("gatk4")
            broad_runner = broad.runner_from_config(config)
            broad_runner.run_gatk(params)
    return out_file

def group_by_batch(items, require_bam=True):
    """Group a set of sample items by batch (or singleton) name.

    Items in multiple batches cause two batches to be merged together.
    """
    out = collections.defaultdict(list)
    batch_groups = _get_representative_batch(_merge_batches(_find_all_groups(items, require_bam)))
    for data in items:
        batches = _get_batches(data, require_bam)
        # take first batch as representative
        batch = batch_groups[batches[0]]
        out[batch].append(utils.deepish_copy(data))
    return dict(out)

def summarize_sv(items):
    """CWL target: summarize structural variants for multiple samples.

    XXX Need to support non-VCF output as tabix indexed output
    """
    items = [utils.to_single_data(x) for x in vcvalidate.summarize_grading(items, "svvalidate")]
    out = {"sv": {"calls": [],
                  "prioritize": {"tsv": [],
                                 "raw": []}},
           "svvalidate": vcvalidate.combine_validations(items, "svvalidate")}
    added = set([])
    # Standard callers
    for data in items:
        if data.get("sv"):
            names = dd.get_batches(data)
            if not names:
                names = [dd.get_sample_name(data)]
            batch_name = names[0]
            cur_name = "%s-%s" % (batch_name, data["sv"]["variantcaller"])
            if data["sv"].get("vrn_file"):
                ext = utils.splitext_plus(data["sv"]["vrn_file"])[-1]
                if cur_name not in added and ext.startswith(".vcf"):
                    added.add(cur_name)
                    out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                                                            "sv", "calls")),
                                            "%s%s" % (cur_name, ext))
                    utils.copy_plus(data["sv"]["vrn_file"], out_file)
                    out_file = vcfutils.bgzip_and_index(out_file, data["config"])
                    out["sv"]["calls"].append(out_file)
    # prioritization
    for pdata in _group_by_sample(items):
        prioritysv = [x for x in prioritize.run([utils.deepish_copy(pdata)])[0].get("sv", [])
                      if x["variantcaller"] == "sv-prioritize"]
        if prioritysv:
            out["sv"]["prioritize"]["tsv"].append(prioritysv[0]["vrn_file"])
            out["sv"]["prioritize"]["raw"].extend(prioritysv[0]["raw_files"].values())
    return [out]

def extract_germline_vcinfo(data, out_dir):
    """Extract germline VCFs from existing tumor inputs.
    """
    supported_germline = set(["vardict", "octopus", "freebayes"])
    if dd.get_phenotype(data) in ["tumor"]:
        for v in _get_variants(data):
            if v.get("variantcaller") in supported_germline:
                if v.get("germline"):
                    return v
                else:
                    d = utils.deepish_copy(data)
                    d["vrn_file"] = v["vrn_file"]
                    gd = germline.extract(d, [d], out_dir)
                    v["germline"] = gd["vrn_file_plus"]["germline"]
                    return v

def batch_for_jointvc(items):
    batch_groups = collections.defaultdict(list)
    for data in [utils.to_single_data(x) for x in items]:
        vc = dd.get_variantcaller(data)
        if genotype.is_joint(data):
            batches = dd.get_batches(data) or dd.get_sample_name(data)
            if not isinstance(batches, (list, tuple)):
                batches = [batches]
        else:
            batches = [dd.get_sample_name(data)]
        for b in batches:
            data = utils.deepish_copy(data)
            data["vrn_file_gvcf"] = data["vrn_file"]
            batch_groups[(b, vc)].append(data)
    return list(batch_groups.values())

def gatk_rnaseq_calling(data):
    """Use GATK to perform gVCF variant calling on RNA-seq data
    """
    data = utils.deepish_copy(data)
    tools_on = dd.get_tools_on(data)
    if not tools_on:
        tools_on = []
    tools_on.append("gvcf")
    data = dd.set_tools_on(data, tools_on)
    data = dd.set_jointcaller(data, ["%s-joint" % v for v in dd.get_variantcaller(data)])
    out_file = os.path.join(utils.safe_makedir(os.path.join("variation", "rnaseq", "gatk-haplotype")),
                            "%s-gatk-haplotype.vcf.gz" % dd.get_sample_name(data))
    out_file = gatk.haplotype_caller([dd.get_split_bam(data)], [data], dd.get_ref_file(data), {},
                                     out_file=out_file)
    return dd.set_vrn_file(data, out_file)

def batch_for_jointvc(items):
    batch_groups = collections.defaultdict(list)
    for data in [utils.to_single_data(x) for x in items]:
        vc = dd.get_variantcaller(data)
        if genotype.is_joint(data):
            batches = dd.get_batches(data) or dd.get_sample_name(data)
            if not isinstance(batches, (list, tuple)):
                batches = [batches]
        else:
            batches = [dd.get_sample_name(data)]
        for b in batches:
            data = utils.deepish_copy(data)
            data["vrn_file_gvcf"] = data["vrn_file"]
            batch_groups[(b, vc)].append(data)
    return batch_groups.values()

def _group_by_sample(items):
    """Group a set of items by sample names + multiple callers for prioritization
    """
    by_sample = collections.defaultdict(list)
    for d in items:
        by_sample[dd.get_sample_name(d)].append(d)
    out = []
    for sample_group in by_sample.values():
        cur = utils.deepish_copy(sample_group[0])
        svs = []
        for d in sample_group:
            svs.append(d["sv"])
        cur["sv"] = svs
        out.append(cur)
    return out

def batch_for_variantcall(samples):
    """Prepare a set of samples for parallel variant calling.

    CWL input target that groups samples into batches and variant callers
    for parallel processing.
    """
    to_process, extras = _dup_samples_by_variantcaller(samples, require_bam=False)
    batch_groups = collections.defaultdict(list)
    for data in [utils.to_single_data(x) for x in to_process]:
        vc = get_variantcaller(data, require_bam=False)
        batches = dd.get_batches(data) or dd.get_sample_name(data)
        if not isinstance(batches, (list, tuple)):
            batches = [batches]
        for b in batches:
            batch_groups[(b, vc)].append(utils.deepish_copy(data))
    return list(batch_groups.values()) + extras

def pipeline_summary(data):
    """Provide summary information on processing sample.

    Handles standard and CWL (single QC output) cases.
    """
    data = utils.to_single_data(data)
    work_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    if not work_bam or not work_bam.endswith(".bam"):
        work_bam = None
    if dd.get_ref_file(data):
        if work_bam or (tz.get_in(["config", "algorithm", "kraken"], data)):  # kraken doesn't need bam
            logger.info("QC: %s %s" % (dd.get_sample_name(data), ", ".join(dd.get_algorithm_qc(data))))
            work_data = cwlutils.unpack_tarballs(utils.deepish_copy(data), data)
            data["summary"] = _run_qc_tools(work_bam, work_data)
            if (len(dd.get_algorithm_qc(data)) == 1 and "output_cwl_keys" in data):
                data["summary"]["qc"] = data["summary"]["qc"].get(dd.get_algorithm_qc(data)[0])
    return [[data]]

def _clean_fields(d):
    if isinstance(d, dict):
        if "fields" in d:
            out = []
            for f in d["fields"]:
                f = utils.deepish_copy(f)
                f.pop("secondaryFiles", None)
                out.append(f)
            d["fields"] = out
            return d
        else:
            out = {}
            for k, v in d.items():
                out[k] = _clean_fields(v)
            return out
    else:
        return d

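# Hypothetical usage of _clean_fields (record contents are illustrative; assumes the
# surrounding module context where `utils` is imported): strips "secondaryFiles"
# entries from CWL-style record field definitions while leaving the rest untouched.
toy_record = {"type": "record",
              "fields": [{"name": "align_bam", "type": "File",
                          "secondaryFiles": [{"pattern": ".bai"}]}]}
cleaned_record = _clean_fields(toy_record)
# cleaned_record["fields"][0] keeps "name" and "type" but no longer carries "secondaryFiles"
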
def _get_callers(items, stage, special_cases=False):
    """Retrieve available callers for the provided stage.

    Handles special cases like CNVkit that can be in initial or standard
    depending on if fed into Lumpy analysis.
    """
    callers = utils.deepish_copy(_CALLERS[stage])
    if special_cases and "cnvkit" in callers:
        has_lumpy = any("lumpy" in get_svcallers(d) or
                        "lumpy" in d["config"]["algorithm"].get("svcaller_orig", [])
                        for d in items)
        if has_lumpy and any("lumpy_usecnv" in dd.get_tools_on(d) for d in items):
            if stage != "initial":
                del callers["cnvkit"]
        else:
            if stage != "standard":
                del callers["cnvkit"]
    return callers

def update_summary_qc(data, key, base=None, secondary=None):
    """Update summary_qc with a new section, keyed by key.

    Stick files into summary_qc if you want them propagated forward
    and available for multiqc.
    """
    summary = deepish_copy(get_summary_qc(data, {}))
    if key in summary:
        return data
    if base and secondary:
        summary[key] = {"base": base, "secondary": secondary}
    elif base:
        summary[key] = {"base": base}
    elif secondary:
        summary[key] = {"secondary": secondary}
    data = set_summary_qc(data, summary)
    return data

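# Hypothetical usage of update_summary_qc (the key and file paths are illustrative, not
# from bcbio): register a new QC section so its files are propagated forward and picked
# up by multiqc. Assumes a populated sample `data` dict from the pipeline.
# data = update_summary_qc(data, "coverage",
#                          base="qc/coverage/sample1_coverage.bed",
#                          secondary=["qc/coverage/sample1_region_summary.txt"])
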
def _run_concat_variant_files_gatk4(input_file_list, out_file, config):
    """Use GATK4 GatherVcfs for concatenation of scattered VCFs.
    """
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            params = ["-T", "GatherVcfs", "-I", input_file_list, "-O", tx_out_file]
            # Use GATK4 for merging, tools_off: [gatk4] applies to variant calling
            config = utils.deepish_copy(config)
            if "gatk4" in dd.get_tools_off({"config": config}):
                config["algorithm"]["tools_off"].remove("gatk4")
            # Allow specification of verbosity in the unique style this tool uses
            resources = config_utils.get_resources("gatk", config)
            opts = [str(x) for x in resources.get("options", [])]
            if "--verbosity" in opts:
                params += ["--VERBOSITY:%s" % opts[opts.index("--verbosity") + 1]]
            broad_runner = broad.runner_from_config(config)
            broad_runner.run_gatk(params)
    return out_file

def _run_ensemble_intersection(batch_id, vrn_files, base_dir, edata):
    """Run intersection n out of x based ensemble method using bcbio.variation.recall.
    """
    out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf.gz".format(batch_id))
    if not utils.file_exists(out_vcf_file):
        num_pass = _get_num_pass(edata, len(vrn_files))
        cmd = [config_utils.get_program("bcbio-variation-recall", edata["config"]),
               "ensemble", "--cores=%s" % edata["config"]["algorithm"].get("num_cores", 1),
               "--numpass", str(num_pass)]
        # Remove filtered calls if we're dealing with tumor/normal calls
        if vcfutils.get_paired_phenotype(edata):
            cmd += ["--nofiltered"]
        cmd += [out_vcf_file, dd.get_ref_file(edata)] + vrn_files
        do.run(cmd, "Ensemble intersection calling: %s" % (batch_id))
    in_data = utils.deepish_copy(edata)
    in_data["vrn_file"] = out_vcf_file
    return {"variantcaller": "ensemble", "vrn_file": out_vcf_file, "bed_file": None}

def summarize_samples(samples, run_parallel):
    """Back compatibility for existing pipelines. Should be replaced with summary when ready.
    """
    extras = []
    to_run = collections.defaultdict(list)
    multi_batches = set([])
    for data in [x[0] for x in samples]:
        if tz.get_in(["config", "algorithm", "coverage"], data):
            batches = tz.get_in(("metadata", "batch"), data, [dd.get_sample_name(data)])
            if not isinstance(batches, (tuple, list)):
                batches = [batches]
            else:
                multi_batches.add(dd.get_sample_name(data))
            for batch in batches:
                to_run[batch].append(utils.deepish_copy(data))
        else:
            extras.append([data])
    out = run_parallel("coverage_summary", [[xs] for xs in to_run.values()]) if len(to_run) > 0 else []
    out = _handle_multi_batches(out, multi_batches)
    assert len(out + extras) == len(samples), (len(out + extras), len(samples))
    return out + extras