def calculate_sv_bins(*items):
    """Assign target and antitarget coverage bins to samples.

    Inputs are normalized to single sample dictionaries, grouped by batch and
    CNV method, and every sample that uses general SV bins receives a
    ``data["regions"]["bins"]`` entry holding the target, antitarget and GC
    annotated files together with the index of its CNV group. Bin creation is
    dispatched on ``cnvkit.bin_approach`` ("cnvkit" or "gatk-cnv").

    Returns a list of single-item lists, one per sample. Raises AssertionError
    if the number of output samples differs from the number of inputs.
    """
    calcfns = {"cnvkit": _calculate_sv_bins_cnvkit,
               "gatk-cnv": _calculate_sv_bins_gatk}
    from bcbio.structural import cnvkit
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]
    # Nothing to do when no sample requests general SV bins
    if not any(cnvkit.use_general_sv_bins(x) for x in items):
        return [[d] for d in items]
    out = []
    grouped = _group_by_cnv_method(multi.group_by_batch(items, False))
    for group_index, cnv_group in enumerate(grouped):
        sizer = MemoizedSizes(cnv_group.region_file, cnv_group.items)
        size_calc_fn = sizer.get_target_antitarget_bin_sizes
        for data in cnv_group.items:
            if cnvkit.use_general_sv_bins(data):
                make_bins = calcfns[cnvkit.bin_approach(data)]
                target_bed, anti_bed, gcannotated_tsv = make_bins(data, cnv_group, size_calc_fn)
                if not data.get("regions"):
                    data["regions"] = {}
                data["regions"]["bins"] = {"target": target_bed,
                                           "antitarget": anti_bed,
                                           "group": str(group_index),
                                           "gcannotated": gcannotated_tsv}
            out.append([data])
    if len(out) != len(items):
        names_out = sorted([dd.get_sample_name(utils.to_single_data(x)) for x in out])
        names_in = sorted([dd.get_sample_name(x) for x in items])
        raise AssertionError("Inconsistent samples in and out of SV bin calculation:\nout: %s\nin : %s" %
                             (names_out, names_in))
    return out
def quantitate(data):
    """CWL target for quantitation.

    Runs transcript counting and then each requested expression caller
    (sailfish, kallisto, salmon), recording output locations in
    ``data["quant"]`` under the ``tsv``, ``hdf5`` and ``fusion`` keys. When
    several callers are enabled, later callers overwrite ``tsv``/``hdf5``.

    XXX Needs to be split and parallelized by expression caller, with merging
    of multiple calls.

    Returns the sample wrapped as ``[[data]]``.
    """
    data = to_single_data(to_single_data(data))
    data = generate_transcript_counts(data)[0][0]
    data["quant"] = {}
    if "sailfish" in dd.get_expression_caller(data):
        data = to_single_data(sailfish.run_sailfish(data)[0])
        data["quant"]["tsv"] = data["sailfish"]
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(data["sailfish"]), "abundance.h5")
    if ("kallisto" in dd.get_expression_caller(data) or "pizzly" in dd.get_fusion_caller(data, [])):
        data = to_single_data(kallisto.run_kallisto_rnaseq(data)[0])
        data["quant"]["tsv"] = os.path.join(data["kallisto_quant"], "abundance.tsv")
        data["quant"]["hdf5"] = os.path.join(data["kallisto_quant"], "abundance.h5")
    # The kallisto output directory is only guaranteed to be present after the
    # kallisto step; look it up defensively so the fusion key is always defined
    # and a missing directory does not raise KeyError.
    kallisto_dir = data.get("kallisto_quant")
    fusion_file = os.path.join(kallisto_dir, "fusion.txt") if kallisto_dir else None
    if fusion_file and os.path.exists(fusion_file):
        data["quant"]["fusion"] = fusion_file
    else:
        data["quant"]["fusion"] = None
    if "salmon" in dd.get_expression_caller(data):
        data = to_single_data(salmon.run_salmon_reads(data)[0])
        data["quant"]["tsv"] = data["salmon"]
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(data["salmon"]), "abundance.h5")
    return [[data]]
def _write_out_argfile(argfile, out, fnargs, parallel, out_keys, input_files, work_dir):
    """Serialize function output into the argument file.

    For ``.json`` argfiles the output is converted to a CWL representation:
    record based when ``_get_record_attrs`` reports a record name, otherwise a
    plain CWL JSON mapping; the conversion applied depends on ``parallel``.
    Any other argfile receives a YAML dump of ``out``.
    """
    with open(argfile, "w") as out_handle:
        if not argfile.endswith(".json"):
            yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
            return
        record_name, record_attrs = _get_record_attrs(out_keys)
        if record_name:
            if parallel in ["multi-batch"]:
                recs = _nested_cwl_record(out, record_attrs, input_files)
            elif parallel in ["single-split", "multi-combined", "multi-parallel",
                              "batch-single", "single-single"]:
                # One record per output sample
                recs = [_collapse_to_cwl_record_single(utils.to_single_data(xs), record_attrs, input_files)
                        for xs in out]
            else:
                # All samples collapsed into a single record
                samples = [utils.to_single_data(xs) for xs in out]
                recs = [_collapse_to_cwl_record(samples, record_attrs, input_files)]
            payload = _combine_cwl_records(recs, record_name, parallel)
        elif parallel in ["single-split", "multi-combined", "batch-split"]:
            payload = _convert_to_cwl_json([utils.to_single_data(xs) for xs in out],
                                           fnargs, input_files)
        else:
            single = utils.to_single_data(utils.to_single_data(out))
            payload = _convert_to_cwl_json(single, fnargs, input_files)
        json.dump(payload, out_handle, sort_keys=True, indent=4, separators=(', ', ': '))
def precall(items):
    """Perform initial pre-calling steps -- coverage calculation by sample.

    Expects exactly one sample. Computes coverage over the prepared Seq2C BED
    file via ``_calculate_coverage`` and appends a ``seq2c`` entry carrying
    the coverage file to ``data["sv"]``.

    Returns a one-element list containing the updated sample.
    """
    items = [utils.to_single_data(x) for x in items]
    assert len(items) == 1, "Expect one item to Seq2C coverage calculation"
    data = utils.to_single_data(items)
    # sv_bed could specify a smaller region than variant coverage, so avoid
    # this sanity check
    # assert dd.get_coverage_interval(data) != "genome", "Seq2C only for amplicon and exome sequencing"
    algorithm = data["config"]["algorithm"]
    assert "seq2c_bed_ready" in algorithm, "Error: svregions or variant_regions BED file required for Seq2C"
    bed_file = algorithm["seq2c_bed_ready"]
    bam_file = dd.get_align_bam(data)
    sample_name = dd.get_sample_name(data)
    work_dir = _sv_workdir(data)
    cov_file = _calculate_coverage(data, work_dir, bed_file, bam_file, sample_name)
    call = {"variantcaller": "seq2c", "coverage": cov_file}
    if "sv" in data:
        data["sv"].append(call)
    else:
        data["sv"] = [call]
    return [data]
def calculate_sv_bins(*items):
    """Assign CNVkit target and antitarget bins to samples.

    When no sample uses general SV bins the inputs are returned untouched.
    Otherwise samples are grouped by batch and CNV method, and each sample
    gets ``data["regions"]["bins"]`` with the target and antitarget BED files
    produced by ``cnvkit.targets_w_bins``.

    Returns a list of single-item lists, one per sample. Raises AssertionError
    if the number of output samples differs from the number of inputs.
    """
    from bcbio.structural import cnvkit
    if not any(cnvkit.use_general_sv_bins(utils.to_single_data(x)) for x in items):
        return items
    items = [utils.to_single_data(x) for x in items]
    out = []
    for cnv_group in _group_by_cnv_method(multi.group_by_batch(items, False)):
        sizer = MemoizedSizes(cnv_group.region_file, cnv_group.items)
        size_calc_fn = sizer.get_target_antitarget_bin_sizes
        for data in cnv_group.items:
            target_bed, anti_bed = cnvkit.targets_w_bins(cnv_group.region_file, cnv_group.access_file,
                                                         size_calc_fn, cnv_group.work_dir, data)
            if not data.get("regions"):
                data["regions"] = {}
            data["regions"]["bins"] = {"target": target_bed, "antitarget": anti_bed}
            out.append([data])
    if len(out) != len(items):
        names_out = sorted([dd.get_sample_name(utils.to_single_data(x)) for x in out])
        names_in = sorted([dd.get_sample_name(x) for x in items])
        raise AssertionError("Inconsistent samples in and out of SV bin calculation:\nout: %s\nin : %s" %
                             (names_out, names_in))
    return out
def call_hla(data):
    """Run the configured HLA caller on a sample when HLA fastq inputs exist.

    The caller named in ``config -> algorithm -> hlacaller`` is looked up in
    ``_CALLERS`` and applied only when the sample has an ``hla`` entry with a
    ``fastq`` value. Samples lacking an ``hla`` key get an empty one.

    Returns the sample wrapped as ``[[data]]``.
    """
    data = utils.to_single_data(utils.to_single_data(data))
    hlacaller = tz.get_in(["config", "algorithm", "hlacaller"], data)
    has_hla = "hla" in data
    if hlacaller and has_hla and tz.get_in(["hla", "fastq"], data):
        data = _CALLERS[hlacaller](data)
    elif not has_hla:
        data["hla"] = {}
    return [[data]]
def qc_to_rec(samples):
    """CWL: turn input samples into records, each wrapped in its own list.

    Samples are normalized, passed through
    ``cwlutils.assign_complex_to_samples``, split by ``_split_samples_by_qc``
    and converted to records with the analyzed samples ahead of the extras.
    """
    normalized = cwlutils.assign_complex_to_samples([utils.to_single_data(x) for x in samples])
    to_analyze, extras = _split_samples_by_qc(normalized)
    combined = to_analyze + extras
    recs = cwlutils.samples_to_records([utils.to_single_data(x) for x in combined])
    return [[rec] for rec in recs]
def _read_from_cwlinput(in_file, work_dir, runtime, parallel, input_order, output_cwl_keys):
    """Load data records from a JSON dump of CWL inputs.

    Reading from the JSON file avoids command line flattening of records.
    Keys starting with "sentinel" or "ignore" are skipped and a "_toolinput"
    marker is stripped from key names. Inputs declared as "record" in
    ``input_order`` are expanded with ``_read_cwl_record``; all others are
    stored under their "__"-split key tuple.

    Returns the merged inputs after finalization, keeping nested lists nested.
    """
    with open(in_file) as in_handle:
        inputs = json.load(in_handle)
    items_by_key = {}
    passed_keys = set([])
    for key, input_val in inputs.items():
        if key.startswith(("sentinel", "ignore")):
            continue
        if key.endswith("_toolinput"):
            key = key.replace("_toolinput", "")
        if input_order[key] == "record":
            cur_keys, items = _read_cwl_record(input_val)
            passed_keys |= cur_keys
            items_by_key[key] = items
        else:
            items_by_key[tuple(key.split("__"))] = _cwlvar_to_wdl(input_val)

    def _finalize(x):
        # A fresh key list is handed to every call, as in the inline version
        return _finalize_cwl_in(x, work_dir, list(passed_keys), output_cwl_keys, runtime)

    out = []
    for data in _merge_cwlinputs(items_by_key, input_order, parallel):
        if isinstance(data, (list, tuple)):
            out.append([_finalize(utils.to_single_data(x)) for x in data])
        else:
            out.append(_finalize(data))
    return out
def run(items):
    """Normalization and log2 ratio calculation plus CNV calling for full cohort.

    - Combine coverage of each region for each sample
    - Prepare read counts for each sample
    - Collect normal sample names: first column of two-column lines in the
      first background reference file, plus samples whose affected status is 1
    - Call CNVs and split the calls back onto the samples

    Returns the samples as produced by ``_split_cnv``.
    """
    items = [utils.to_single_data(x) for x in items]
    work_dir = _sv_workdir(items[0])
    # Unique, non-missing seq2c background references across the cohort
    all_backs = [dd.get_background_cnv_reference(d, "seq2c") for d in items]
    input_backs = list(set([b for b in all_backs if b is not None]))
    coverage_file = _combine_coverages(items, work_dir, input_backs)
    read_mapping_file = _calculate_mapping_reads(items, work_dir, input_backs)
    normal_names = []
    if input_backs:
        with open(input_backs[0]) as in_handle:
            for line in in_handle:
                fields = line.split()
                if len(fields) == 2:
                    normal_names.append(fields[0])
    for x in items:
        if population.get_affected_status(x) == 1:
            normal_names.append(dd.get_sample_name(x))
    seq2c_calls_file = _call_cnv(items, work_dir, read_mapping_file, coverage_file, normal_names)
    return _split_cnv(items, seq2c_calls_file, read_mapping_file, coverage_file)
def _group_batches_shared(xs, caller_batch_fn, prep_data_fn):
    """Shared grouping by batch for variant calling and joint calling.

    ``caller_batch_fn(data)`` supplies a ``(caller, batch)`` pair for each
    sample. Samples with a batch are copied into groups keyed by
    ``(batch, region, caller)``; samples without one are prepared on their
    own. Each group is represented by a copy of its lead item, carrying the
    collapsed originals in ``group_orig`` and the group key in ``group``.

    Returns the un-batched samples followed by one item per batch group.
    """
    singles = []
    batch_groups = collections.defaultdict(list)
    for args in xs:
        data = utils.to_single_data(args)
        caller, batch = caller_batch_fn(data)
        region = _list_to_tuple(data["region"]) if "region" in data else ()
        if batch is None:
            singles.append(prep_data_fn(data, [data]))
            continue
        if not isinstance(batch, (list, tuple)):
            batch = [batch]
        for b in batch:
            batch_groups[(b, region, caller)].append(utils.deepish_copy(data))
    batches = []
    for group_key, members in batch_groups.items():
        batch_data = utils.deepish_copy(_pick_lead_item(members))
        # For nested primary batches, split permanently by batch
        if tz.get_in(["metadata", "batch"], batch_data):
            batch_data["metadata"]["batch"] = group_key[0]
        batch_data = prep_data_fn(batch_data, members)
        batch_data["group_orig"] = _collapse_subitems(batch_data, members)
        batch_data["group"] = group_key
        batches.append(batch_data)
    return singles + batches
def batch_for_variantcall(samples):
    """Prepare a set of samples for parallel variant calling.

    CWL input target grouping sample records by (batch, variant caller);
    samples without batches are grouped under their own name. A group in
    which any sample is flagged by ``is_joint`` is split so every sample
    becomes its own batch.

    Returns the batches followed by the extra samples reported by
    ``_dup_samples_by_variantcaller``.
    """
    to_process, extras = _dup_samples_by_variantcaller(samples, require_bam=False)
    batch_groups = collections.defaultdict(list)
    to_process = [utils.to_single_data(x) for x in to_process]
    for data in cwlutils.samples_to_records(to_process):
        vc = get_variantcaller(data, require_bam=False)
        batch_names = dd.get_batches(data) or dd.get_sample_name(data)
        if not isinstance(batch_names, (list, tuple)):
            batch_names = [batch_names]
        for name in batch_names:
            batch_groups[(name, vc)].append(utils.deepish_copy(data))
    batches = []
    for cur_group in batch_groups.values():
        if any([is_joint(d) for d in cur_group]):
            # Joint calling: call each sample individually
            batches.extend([d] for d in cur_group)
        else:
            batches.append(cur_group)
    return batches + extras
def vc_output_record(samples):
    """Prepare output records from variant calling for downstream analysis.

    Keys computed once per batch (the variant file and the validation
    summary/tp/fp/fn files) are copied onto every sample record. A key with
    no value on any sample is set to None on all of them; more than one
    distinct value for a key fails an assertion.

    Returns a list of single-item lists, one per record.
    """
    shared_keys = [["vrn_file"], ["validate", "summary"], ["validate", "tp"],
                   ["validate", "fp"], ["validate", "fn"]]
    raw = cwlutils.samples_to_records([utils.to_single_data(x) for x in samples])
    shared = {}
    for key in shared_keys:
        values = [tz.get_in(key, d) for d in raw]
        cur = list(set([v for v in values if v]))
        if cur:
            assert len(cur) == 1, (key, cur)
            shared[tuple(key)] = cur[0]
        else:
            shared[tuple(key)] = None
    out = []
    for rec in raw:
        for key, val in shared.items():
            rec = tz.update_in(rec, key, lambda x, val=val: val)
        out.append([rec])
    return out
def calculate_sv_coverage(data):
    """Calculate coverage within bins for downstream CNV calling.

    For samples using general SV bins, coverage is computed by the function
    registered for ``cnvkit.bin_approach`` ("cnvkit" or "gatk-cnv"); outputs
    are reset to None when the target file does not exist. When "seq2c" is
    among the SV callers, ``seq2c.precall`` is run as well. Results are
    stored in ``data["depth"]["bins"]``.

    Returns the sample wrapped as ``[[data]]``.
    """
    calcfns = {"cnvkit": _calculate_sv_coverage_cnvkit,
               "gatk-cnv": _calculate_sv_coverage_gatk}
    from bcbio.structural import cnvkit
    data = utils.to_single_data(data)
    out_target_file, out_anti_file = (None, None)
    if cnvkit.use_general_sv_bins(data):
        work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                                   dd.get_sample_name(data), "bins"))
        out_target_file, out_anti_file = calcfns[cnvkit.bin_approach(data)](data, work_dir)
        if not os.path.exists(out_target_file):
            out_target_file, out_anti_file = (None, None)
    seq2c_target = None
    if "seq2c" in dd.get_svcaller(data):
        from bcbio.structural import seq2c
        seq2c_target = seq2c.precall(data)
    # Make sure the nested depth -> bins location exists before assigning
    if not tz.get_in(["depth", "bins"], data):
        data = tz.update_in(data, ["depth", "bins"], lambda x: {})
    data["depth"]["bins"] = {"target": out_target_file,
                             "antitarget": out_anti_file,
                             "seq2c": seq2c_target}
    return [[data]]
def _combine_qc_samples(samples):
    """Combine split QC analyses into single samples based on BAM files.

    Samples sharing the same aligned BAM and batch (or sample name when no
    batch is set) are merged: summary QC and metrics dictionaries are updated
    in order and the algorithm QC lists concatenated onto the first sample of
    each group.

    Returns a list of single-item lists, one per merged sample.
    """
    by_bam = collections.defaultdict(list)
    for data in (utils.to_single_data(x) for x in samples):
        batch = dd.get_batch(data) or dd.get_sample_name(data)
        if not isinstance(batch, (list, tuple)):
            batch = [batch]
        by_bam[(dd.get_align_bam(data), tuple(batch))].append(data)
    out = []
    for data_group in by_bam.values():
        alg_qc, qc, metrics = [], {}, {}
        for d in data_group:
            qc.update(dd.get_summary_qc(d))
            metrics.update(dd.get_summary_metrics(d))
            alg_qc.extend(dd.get_algorithm_qc(d))
        merged = data_group[0]
        merged["config"]["algorithm"]["qc"] = alg_qc
        merged["summary"]["qc"] = qc
        merged["summary"]["metrics"] = metrics
        out.append([merged])
    return out
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.

    For samples whose BAM needs processing, the BAM is linked into the
    sample's align directory and indexed, callable, no-block, sample callable
    and off-target information is stored in ``data["regions"]``, the coverage
    interval and high depth regions are assigned, and callable regions become
    the variant regions when none were configured.

    Returns the sample wrapped as ``[[data]]``.
    """
    data = utils.to_single_data(data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    if not (vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(".bam")):
        return [[data]]
    ref_file = dd.get_ref_file(data)
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                              dd.get_sample_name(data)))
    bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
    if not utils.file_exists(bam_file_ready):
        utils.symlink_plus(bam_file, bam_file_ready)
    bam.index(bam_file_ready, data["config"])
    callable_region_bed, nblock_bed, callable_bed = callable.block_regions(bam_file_ready, ref_file, data)
    sample_callable = callable.sample_callable_bed(bam_file_ready, ref_file, data)
    offtarget_stats = callable.calculate_offtarget(bam_file_ready, ref_file, data)
    data["regions"] = {"nblock": nblock_bed,
                       "callable": callable_bed,
                       "sample_callable": sample_callable,
                       "offtarget_stats": offtarget_stats}
    data = coverage.assign_interval(data)
    data["regions"]["highdepth"] = highdepth.identify(data)
    algorithm = data["config"]["algorithm"]
    # Fall back to callable regions when no variant regions were configured
    if os.path.exists(callable_region_bed) and not algorithm.get("variant_regions"):
        algorithm["variant_regions"] = callable_region_bed
        data = bedutils.clean_inputs(data)
    data = _recal_no_markduplicates(data)
    return [[data]]
def normalize_sv_coverage(*items):
    """Normalize CNV coverage, providing flexible point for multiple methods.

    Samples are grouped by the bin group id stored at
    ``regions -> bins -> group`` and each group is normalized with the
    function registered for ``cnvkit.bin_approach`` ("cnvkit" or "gatk-cnv").
    Samples with normalized output get ``background`` and ``normalized``
    entries added to ``data["depth"]["bins"]``.

    Grouping collects every sample with the same group id, even when they are
    not adjacent in the input. ``itertools.groupby`` only merges consecutive
    items, which would normalize a non-contiguous group as separate partial
    groups.

    Returns a list of single-item lists, one per sample. An assertion fails
    when a group has no input (case) samples.
    """
    calcfns = {"cnvkit": _normalize_sv_coverage_cnvkit, "gatk-cnv": _normalize_sv_coverage_gatk}
    from bcbio.structural import cnvkit
    from bcbio.structural import shared as sshared
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]
    if all(not cnvkit.use_general_sv_bins(x) for x in items):
        return [[d] for d in items]
    # Group by bin group id, remembering first-seen order of the groups
    group_ids = []
    by_group = {}
    for data in items:
        group_id = tz.get_in(["regions", "bins", "group"], data)
        if group_id not in by_group:
            by_group[group_id] = []
            group_ids.append(group_id)
        by_group[group_id].append(data)
    out_files = {}
    back_files = {}
    for group_id in group_ids:
        # No CNVkit calling for this particular set of samples
        if group_id is None:
            continue
        inputs, backgrounds = sshared.find_case_control(by_group[group_id])
        assert inputs, "Did not find inputs for sample batch: %s" % (" ".join(dd.get_sample_name(x) for x in items))
        work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(inputs[0]), "structural",
                                                   dd.get_sample_name(inputs[0]), "bins"))
        back_files, out_files = calcfns[cnvkit.bin_approach(inputs[0])](group_id, inputs, backgrounds,
                                                                        work_dir, back_files, out_files)
    out = []
    for data in items:
        sample_name = dd.get_sample_name(data)
        if sample_name in out_files:
            data["depth"]["bins"]["background"] = back_files[sample_name]
            data["depth"]["bins"]["normalized"] = out_files[sample_name]
        out.append([data])
    return out
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.

    After normalizing missing values and unpacking tarballs, samples whose
    BAM needs processing get the BAM linked into the sample's align directory
    and indexed. Callable region information is stored in ``data["regions"]``
    and depth files in ``data["depth"]``, the coverage interval is assigned,
    callable regions become the variant regions when none were configured,
    and recalibration is prepared and applied.

    Returns the sample wrapped as ``[[data]]``.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    if not (vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(".bam")):
        return [[data]]
    ref_file = dd.get_ref_file(data)
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                              dd.get_sample_name(data)))
    bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
    if not utils.file_exists(bam_file_ready):
        utils.symlink_plus(bam_file, bam_file_ready)
    bam.index(bam_file_ready, data["config"])
    covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
    callable_region_bed, nblock_bed, callable_bed = callable.block_regions(covinfo.raw_callable,
                                                                           bam_file_ready, ref_file, data)
    data["regions"] = {"nblock": nblock_bed,
                       "callable": callable_bed,
                       "sample_callable": covinfo.callable,
                       "mapped_stats": readstats.get_cache_file(data)}
    data["depth"] = covinfo.depth_files
    data = coverage.assign_interval(data)
    algorithm = data["config"]["algorithm"]
    # Fall back to callable regions when no variant regions were configured
    if os.path.exists(callable_region_bed) and not algorithm.get("variant_regions"):
        algorithm["variant_regions"] = callable_region_bed
        data = clean_inputs(data)
    data = recalibrate.prep_recal(data)
    data = recalibrate.apply_recal(data)
    return [[data]]
def _normalize_cwl_inputs(items):
    """Extract variation and validation data from CWL input list of batched samples.

    Picks a lead item for the batch and attaches the names of all batch
    samples. Without validation inputs the lead comes from all samples; with
    validation there must be exactly one distinct validation setting (by
    checksum) and one distinct variant file, which is attached to the lead
    item chosen from the validated samples.
    """
    with_validate = {}
    vrn_files = []
    ready_items = []
    batch_samples = []
    for raw in items:
        data = cwlutils.normalize_missing(utils.to_single_data(raw))
        batch_samples.append(dd.get_sample_name(data))
        validate_val = tz.get_in(["config", "algorithm", "validate"], data)
        if validate_val:
            with_validate[_checksum(tz.get_in(["config", "algorithm", "validate"], data))] = data
        if data.get("vrn_file"):
            vrn_files.append(data["vrn_file"])
        ready_items.append(data)
    if not with_validate:
        lead = _pick_lead_item(ready_items)
        lead["batch_samples"] = batch_samples
        return lead
    assert len(with_validate) == 1, len(with_validate)
    assert len(set(vrn_files)) == 1, set(vrn_files)
    lead = _pick_lead_item(with_validate.values())
    lead["batch_samples"] = batch_samples
    lead["vrn_file"] = vrn_files[0]
    return lead
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.

    Copies each distinct variant file into ``variants/calls`` or
    ``variants/gvcf`` under the work directory, named by batch (or sample)
    and variant caller, then bgzips and indexes it. Validation output is
    taken from the first graded item.

    Returns a one-element list with the summary dictionary.
    """
    items = [utils.to_single_data(x) for x in validate.summarize_grading(items)]
    out = {"validate": items[0]["validate"],
           "variants": {"calls": [], "gvcf": []}}
    added = set([])
    for data in items:
        if not data.get("vrn_file"):
            continue
        names = dd.get_batches(data) or [dd.get_sample_name(data)]
        batch_name = names[0]
        if data.get("vrn_file_joint") is None:
            to_add = [("vrn_file", "calls", batch_name)]
        else:
            to_add = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                      ("vrn_file_joint", "calls", batch_name)]
        for vrn_key, out_key, name in to_add:
            cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
            if cur_name in added:
                continue
            out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "variants", out_key))
            out_file = os.path.join(out_dir, "%s.vcf.gz" % cur_name)
            added.add(cur_name)
            # Ideally could symlink here but doesn't appear to work with
            # Docker container runs on Toil where PATHs don't get remapped
            utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
            vcfutils.bgzip_and_index(out_file, data["config"])
            out["variants"][out_key].append(out_file)
    return [out]
def batch(samples):
    """CWL: batch together per sample, joint and germline calls for ensemble combination.

    Sets up groups of same sample/batch variant calls for ensemble calling,
    as long as we have more than one caller per group. Samples are grouped by
    their ``batch_samples`` (defaulting to the sample's own name) and
    phenotype. Each group with several calls yields a deep copy of its first
    sample extended with ``batch_id``, ``batch_samples`` and a ``variants``
    mapping of callers and call files.

    ``batch_id`` is the alphabetically first batch name in the group, or the
    joined sample names when no sample in the group has batches.

    Returns the group records ordered by the earliest position of their
    samples in the input.
    """
    samples = [utils.to_single_data(x) for x in samples]
    sample_order = [dd.get_sample_name(x) for x in samples]
    batch_groups = collections.defaultdict(list)
    for data in samples:
        batch_samples = tuple(data.get("batch_samples", [dd.get_sample_name(data)]))
        batch_groups[(batch_samples, dd.get_phenotype(data))].append(data)

    out = []
    for (batch_samples, _phenotype), gsamples in batch_groups.items():
        if len(gsamples) > 1:
            batches = set([])
            for d in gsamples:
                # Samples without batches may report a missing value; treat it
                # as empty so the sample-name fallback below applies instead of
                # failing on set(None).
                batches |= set(dd.get_batches(d) or [])
            cur = copy.deepcopy(gsamples[0])
            cur.update({"batch_id": sorted(list(batches))[0] if batches else "_".join(batch_samples),
                        "batch_samples": batch_samples,
                        "variants": {"variantcallers": [dd.get_variantcaller(d) for d in gsamples],
                                     "calls": [d.get("vrn_file") for d in gsamples]}})
            out.append(cur)

    def by_original_order(d):
        return min([sample_order.index(s) for s in d["batch_samples"] if s in sample_order])
    return sorted(out, key=by_original_order)
def calculate_sv_coverage(data):
    """Calculate coverage within bins for downstream CNV calling.

    For samples using general SV bins, runs mosdepth over the target and
    antitarget bins, annotates the regions with genes and writes files with
    log2 depths. Existing outputs are reused, and nothing is computed without
    an aligned or work BAM. Output locations are stored in
    ``data["depth"]["bins"]`` when the target file exists.

    Returns the sample wrapped as ``[[data]]``.
    """
    from bcbio.variation import coverage
    from bcbio.structural import annotate, cnvkit
    data = utils.to_single_data(data)
    if not cnvkit.use_general_sv_bins(data):
        return [[data]]
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                               dd.get_sample_name(data), "bins"))
    out_target_file = os.path.join(work_dir, "%s-target-coverage.cnn" % dd.get_sample_name(data))
    out_anti_file = os.path.join(work_dir, "%s-antitarget-coverage.cnn" % dd.get_sample_name(data))
    outputs_ready = utils.file_exists(out_target_file) and utils.file_exists(out_anti_file)
    if not outputs_ready and (dd.get_align_bam(data) or dd.get_work_bam(data)):
        # mosdepth
        target_cov = coverage.run_mosdepth(data, "target",
                                           tz.get_in(["regions", "bins", "target"], data))
        anti_cov = coverage.run_mosdepth(data, "antitarget",
                                         tz.get_in(["regions", "bins", "antitarget"], data))
        target_cov_genes = annotate.add_genes(target_cov.regions, data, max_distance=0)
        anti_cov_genes = annotate.add_genes(anti_cov.regions, data, max_distance=0)
        out_target_file = _add_log2_depth(target_cov_genes, out_target_file, data)
        out_anti_file = _add_log2_depth(anti_cov_genes, out_anti_file, data)
        # TODO: Correct for GC bias
    if os.path.exists(out_target_file):
        data["depth"]["bins"] = {"target": out_target_file, "antitarget": out_anti_file}
    return [[data]]
def merge_split_alignments(data):
    """Merge split inputs generated by common workflow language runs.

    Applies ``_merge_align_bams`` followed by ``_merge_hla_fastq_inputs`` to
    the single sample and returns it wrapped as ``[[data]]``.
    """
    sample = utils.to_single_data(data)
    merged = _merge_hla_fastq_inputs(_merge_align_bams(sample))
    return [[merged]]
def _batch_split_by_sv(samples, stage):
    """Split samples into per-(svcaller, batch) groups for SV calling.

    Returns a tuple of:
    - to_process: ordered mapping of (svcaller, batch) to the sample/caller
      items produced by ``_handle_multiple_svcallers``
    - extras: samples with no such items, each wrapped in a list
    - background: samples that produced at least one item
    """
    to_process = collections.OrderedDict()
    extras = []
    background = []
    for data in (utils.to_single_data(x) for x in samples):
        ready_data = _handle_multiple_svcallers(data, stage)
        if len(ready_data) == 0:
            extras.append([data])
            continue
        background.append(data)
        for x in ready_data:
            svcaller = tz.get_in(["config", "algorithm", "svcaller"], x)
            batch = dd.get_batch(x) or dd.get_sample_name(x)
            if stage in ["ensemble"]:  # no batching for ensemble methods
                if isinstance(batch, six.string_types) and batch != dd.get_sample_name(x):
                    batch += "_%s" % dd.get_sample_name(x)
                else:
                    batch = dd.get_sample_name(x)
                if dd.get_phenotype(x) == "germline":
                    batch += "_germline"
            elif svcaller in _GLOBAL_BATCHING:  # All samples batched together for analyses
                batch = "all"
            batch_names = batch if isinstance(batch, (list, tuple)) else [batch]
            for b in batch_names:
                to_process.setdefault((svcaller, b), []).append(x)
    return to_process, extras, background
def batch_for_variantcall(samples):
    """Prepare a set of samples for parallel variant calling.

    CWL input target grouping samples by (batch, variant caller). Before
    grouping, every sample is given the union of all ``cwl_keys`` (missing
    values set to None and the key appended to its ``cwl_keys``), and the
    ``tools_on`` / ``tools_off`` settings are coerced to lists.

    Returns the batch groups followed by the extra samples reported by
    ``_dup_samples_by_variantcaller``.
    """
    convert_to_list = set(["config__algorithm__tools_on", "config__algorithm__tools_off"])
    to_process, extras = _dup_samples_by_variantcaller(samples, require_bam=False)
    batch_groups = collections.defaultdict(list)
    to_process = [utils.to_single_data(x) for x in to_process]
    all_keys = set([])
    for data in to_process:
        all_keys.update(set(data["cwl_keys"]))
    ordered_keys = sorted(list(all_keys))
    for data in to_process:
        for raw_key in ordered_keys:
            key = raw_key.split("__")
            if tz.get_in(key, data) is None:
                data = tz.update_in(data, key, lambda x: None)
                data["cwl_keys"].append(raw_key)
            if raw_key in convert_to_list:
                val = tz.get_in(key, data)
                if not val:
                    val = []
                elif not isinstance(val, (list, tuple)):
                    val = [val]
                data = tz.update_in(data, key, lambda x, val=val: val)
        vc = get_variantcaller(data, require_bam=False)
        batch_names = dd.get_batches(data) or dd.get_sample_name(data)
        if not isinstance(batch_names, (list, tuple)):
            batch_names = [batch_names]
        for name in batch_names:
            batch_groups[(name, vc)].append(utils.deepish_copy(data))
    return list(batch_groups.values()) + extras
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.

    Copies each distinct variant file into ``variants/calls`` or
    ``variants/gvcf`` under the work directory, named by batch (or sample)
    and variant caller, then bgzips and indexes it. Also records, per sample
    in first-seen order, the output files of every item the sample is part
    of under ``variants -> samples``.

    Returns a one-element list with the summary dictionary.
    """
    items = [utils.to_single_data(x) for x in utils.flatten(items)]
    items = [_normalize_vc_input(x) for x in items]
    items = validate.summarize_grading(items)
    items = [utils.to_single_data(x) for x in items]
    out = {"validate": validate.combine_validations(items),
           "variants": {"calls": [], "gvcf": [], "samples": []}}
    added = set([])
    variants_by_sample = collections.defaultdict(list)
    sample_order = []
    for data in items:
        batch_samples = data.get("batch_samples", [dd.get_sample_name(data)])
        for s in batch_samples:
            if s not in sample_order:
                sample_order.append(s)
        if not data.get("vrn_file"):
            continue
        # Only get batches if we're actually doing variantcalling in bcbio
        # otherwise we'll be using the original files
        names = dd.get_batches(data) if dd.get_variantcaller(data) else None
        if not names:
            names = [dd.get_sample_name(data)]
        batch_name = names[0]
        if data.get("vrn_file_joint") is None:
            to_add = [("vrn_file", "calls", batch_name)]
        else:
            to_add = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                      ("vrn_file_joint", "calls", batch_name)]
        for vrn_key, out_key, name in to_add:
            cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
            out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "variants", out_key))
            out_file = os.path.join(out_dir, "%s.vcf.gz" % cur_name)
            for s in batch_samples:
                variants_by_sample[s].append(out_file)
            if cur_name in added:
                continue
            added.add(cur_name)
            # Ideally could symlink here but doesn't appear to work with
            # Docker container runs on Toil where PATHs don't get remapped
            utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
            vcfutils.bgzip_and_index(out_file, data["config"])
            out["variants"][out_key].append(out_file)
    for sample in sample_order:
        out["variants"]["samples"].append(variants_by_sample[sample])
    return [out]
def to_cram(data):
    """Convert BAM archive files into indexed CRAM.

    Compresses the work BAM (or the aligned BAM when no work BAM is set) and
    stores the result under ``archive_bam`` for CWL runs, ``work_bam``
    otherwise. Returns the sample wrapped as ``[[data]]``.
    """
    data = utils.to_single_data(data)
    in_bam = dd.get_work_bam(data) or dd.get_align_bam(data)
    cram_file = cram.compress(in_bam, data)
    if cwlutils.is_cwl_run(data):
        out_key = "archive_bam"
    else:
        out_key = "work_bam"
    data[out_key] = cram_file
    return [[data]]
def _group_by_batches(items):
    """Group samples by each batch reported by ``_get_batches``.

    Returns an ordered mapping of batch to the list of its samples, keeping
    first-seen batch order; a sample appears under every batch it has.
    """
    out = collections.OrderedDict()
    for xs in items:
        data = utils.to_single_data(xs)
        for batch_name in _get_batches(data):
            out.setdefault(batch_name, []).append(data)
    return out
def get_in_samples(samples, fn):
    """Return the value of a global option from the first sample that sets it.

    ``fn`` is probed as ``fn(sample, None)``; for the first sample with a
    truthy result the value of ``fn(sample)`` is returned. Returns None when
    no sample provides the option.
    """
    for raw in samples:
        sample = to_single_data(raw)
        if not fn(sample, None):
            continue
        return fn(sample)
    return None
def to_rec_single(samples, default_keys=None):
    """Convert output into a list of single CWL records.

    Each sample is normalized and converted on its own; an assertion checks
    the conversion yields exactly one record per sample.
    """
    def _single_record(data):
        prepared = normalize_missing(utils.to_single_data(data))
        recs = samples_to_records([prepared], default_keys)
        assert len(recs) == 1
        return recs[0]

    return [_single_record(data) for data in samples]
def _save_fastq_space(items):
    """Potentially save fastq space prior to merging, since alignments done.

    Only input files whose real path starts with the sample's work directory
    are passed to ``utils.save_diskspace``; each file is handled once, with
    the config of the last sample listing it.
    """
    to_cleanup = {}
    for raw in items:
        data = utils.to_single_data(raw)
        to_cleanup.update((fname, data["config"])
                          for fname in data.get("files", [])
                          if os.path.realpath(fname).startswith(dd.get_work_dir(data)))
    for fname, config in to_cleanup.items():
        utils.save_diskspace(fname, "Cleanup prep files after alignment finished", config)
def counts_spikein(data):
    """Quantify spike-in sequences for a sample with salmon helpers.

    Returns the sample unchanged when no spike-in FASTA is configured.
    Otherwise indexes the FASTA (k-mer size 15 for "smallrna-seq" analyses,
    31 for everything else), quantifies the input reads and stores the result
    through ``dd.set_spikein_counts``. An assertion fails when the FASTA file
    does not exist.
    """
    data = utils.to_single_data(data)
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "spikein", samplename)
    fasta_file = dd.get_spikein_fasta(data)
    if not fasta_file:
        return data
    files = dd.get_input_sequence_files(data)
    # Exactly two files are a read pair; otherwise only the first is used
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    if dd.get_analysis(data).lower() == "smallrna-seq":
        kmer = 15
    else:
        kmer = 31
    fasta_index = _index_spikein(fasta_file, salmon_dir, data, kmer)
    out_file = _salmon_quant_reads(fq1, fq2, salmon_dir, fasta_index, data)
    return dd.set_spikein_counts(data, out_file)
def _batch_split_by_sv(samples, stage):
    """Split samples into per-(svcaller, batch) groups for SV calling.

    Return
    - to_process = svcaller-batch => [svcaller-sample1, svcaller-sample2...] odict
    - extras = samples without sv calling, plus sample/caller items of the
      "pon_build" batch in the "standard" stage (added unwrapped)
    - background = samples with at least one sample/caller item
    """
    to_process = collections.OrderedDict()
    extras = []
    background = []
    for data in (utils.to_single_data(x) for x in samples):
        # data = sample
        ready_data = _handle_multiple_svcallers(data, stage)
        if len(ready_data) == 0:
            extras.append([data])
            continue
        background.append(data)
        # x is sample - sv caller pair
        for x in ready_data:
            svcaller = tz.get_in(["config", "algorithm", "svcaller"], x)
            batch = dd.get_batch(x) or dd.get_sample_name(x)
            if stage in ["ensemble"]:  # no batching for ensemble methods
                if isinstance(batch, six.string_types) and batch != dd.get_sample_name(x):
                    batch += "_%s" % dd.get_sample_name(x)
                else:
                    batch = dd.get_sample_name(x)
                if dd.get_phenotype(x) == "germline":
                    batch += "_germline"
            elif svcaller in _GLOBAL_BATCHING:  # All samples batched together for analyses
                batch = "all"
            # just creating PON - no calling
            if stage in ["standard"] and batch in ["pon_build"]:
                extras.append(x)
                continue
            batch_names = batch if isinstance(batch, (list, tuple)) else [batch]
            for b in batch_names:
                to_process.setdefault((svcaller, b), []).append(x)
    return to_process, extras, background
def clean_sample_data(samples):
    """Clean unnecessary information from sample data, reducing size for message passing.

    Keeps only the work, galaxy and fastq directories and the algorithm and
    resources sections of the configuration, and drops the top level
    ``config_file`` and ``algorithm`` attributes.

    Returns a list of single-item lists, one per sample.
    """
    out = []
    for raw in samples:
        data = utils.to_single_data(raw)
        if "dirs" in data:
            dirs = data["dirs"]
            data["dirs"] = {"work": dirs["work"],
                            "galaxy": dirs["galaxy"],
                            "fastq": dirs.get("fastq")}
        config = data["config"]
        data["config"] = {"algorithm": config["algorithm"],
                          "resources": config["resources"]}
        data.pop("config_file", None)
        data.pop("algorithm", None)
        out.append([data])
    return out
def run(items):
    """Normalization and log2 ratio calculation plus CNV calling for full cohort.

    - Combine coverage of each region for each sample
    - Prepare read counts for each sample
    - Call CNVs, treating samples with a "normal" paired phenotype as normals
    - Split the calls back onto the samples

    Returns the normalized list of samples.
    """
    items = [utils.to_single_data(x) for x in items]
    work_dir = _sv_workdir(items[0])
    coverage_file = _combine_coverages(items, work_dir)
    read_mapping_file = _calculate_mapping_reads(items, work_dir)
    normal_names = []
    for x in items:
        if get_paired_phenotype(x) == "normal":
            normal_names.append(dd.get_sample_name(x))
    seq2c_calls_file = _call_cnv(items, work_dir, read_mapping_file, coverage_file, normal_names)
    _split_cnv(items, seq2c_calls_file)
    return items
def organize_noalign(data):
    """CWL target: skip alignment and stage the input BAM/CRAM for the sample.

    CRAM input is converted to BAM and BAM input is copied into the sample's
    alignment directory, then indexed. Samples without input files get
    ``align_bam`` set to None. Returns the updated sample dictionary.
    """
    data = utils.to_single_data(data[0])
    sample_name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "align", sample_name))
    if not data.get("files"):
        data["align_bam"] = None
        return data
    work_bam = os.path.join(work_dir, "%s-input.bam" % sample_name)
    in_file = data["files"][0]
    if in_file.endswith(".cram"):
        work_bam = cram.to_bam(in_file, work_bam, data)
    else:
        assert in_file.endswith(".bam"), in_file
        utils.copy_plus(in_file, work_bam)
    bam.index(work_bam, data["config"])
    data["align_bam"] = work_bam
    return data
def create_inputs(data):
    """Index input reads and split them into groups for concurrent alignment.

    Lets alignment parallelize beyond the cores of a single machine. Inputs
    are prepared either as an rtg SDF file or with bgzip/grabix indexes, the
    latter retained for back compatibility. Returns a list of single-item
    sample lists, one per alignment split, or CWL records when the sample
    has "output_cwl_keys".
    """
    data = cwlutils.normalize_missing(data)
    aligner = tz.get_in(("config", "algorithm", "aligner"), data)
    has_files = "files" in data and data["files"]
    # CRAM files must be converted to bgzipped fastq, unless not aligning.
    # Remote files also need to be prepped and downloaded.
    needs_conversion = (has_files and aligner and
                        (_is_cram_input(data["files"]) or
                         objectstore.is_remote(data["files"][0])))
    if not needs_conversion:
        # skip indexing on samples without input files or not doing alignment
        if not has_files or data["files"][0] is None or not aligner:
            return [[data]]
    if _has_grabix_indices(data):
        approach = "grabix"
    else:
        approach = dd.get_align_prep_method(data)
    use_rtg = approach == "rtg"
    data["files_orig"] = data["files"]
    if use_rtg:
        data["files"] = [rtg.to_sdf(data["files"], data)]
    else:
        data["files"] = _prep_grabix_indexes(data["files"], data["dirs"], data)
    # preparation converts illumina into sanger format
    data["config"]["algorithm"]["quality_format"] = "standard"
    data = _set_align_split_size(data)
    if tz.get_in(["config", "algorithm", "align_split_size"], data):
        split_fn = rtg.calculate_splits if use_rtg else _find_read_splits
        splits = split_fn(data["files"][0],
                          data["config"]["algorithm"]["align_split_size"])
        out = []
        for split in splits:
            split_data = copy.deepcopy(data)
            split_data["align_split"] = split
            out.append([split_data])
    else:
        out = [[data]]
    if "output_cwl_keys" in data:
        out = cwlutils.samples_to_records(
            [utils.to_single_data(x) for x in out],
            ["files", "align_split", "config__algorithm__quality_format"])
    return out
def pipeline_summary(data):
    """Run QC tools for a sample and attach the results as ``data["summary"]``.

    Handles standard and CWL runs; for CWL with a single configured QC tool
    the "qc" entry is collapsed to that tool's output. Returns ``[[data]]``.
    """
    data = utils.to_single_data(data)
    work_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    if data["analysis"].lower().startswith("smallrna-seq"):
        work_bam = data["clean_fastq"]
    elif not (work_bam and work_bam.endswith(".bam")):
        work_bam = None
    if not dd.get_ref_file(data):
        return [[data]]
    # kraken doesn't need a bam, so it alone is enough to run QC
    if not (work_bam or tz.get_in(["config", "algorithm", "kraken"], data)):
        return [[data]]
    logger.info("QC: %s %s" % (dd.get_sample_name(data),
                               ", ".join(dd.get_algorithm_qc(data))))
    work_data = cwlutils.unpack_tarballs(utils.deepish_copy(data), data)
    data["summary"] = _run_qc_tools(work_bam, work_data)
    single_qc = len(dd.get_algorithm_qc(data)) == 1
    if single_qc and "output_cwl_keys" in data:
        only_qc = dd.get_algorithm_qc(data)[0]
        data["summary"]["qc"] = data["summary"]["qc"].get(only_qc)
    return [[data]]
def run_jointvc(items):
    """Joint call a single region across the per-sample calls in ``items``.

    Uses the first item as representative, defaulting its jointcaller to
    "<variantcaller>-joint" when none is configured. The joint call for the
    region is stored as ``vrn_file_region`` on that item, which is returned.
    """
    items = [utils.to_single_data(x) for x in items]
    data = items[0]
    if not dd.get_jointcaller(data):
        data["config"]["algorithm"]["jointcaller"] = "%s-joint" % dd.get_variantcaller(data)
    # GenomicsDBImport uses 1-based coordinates. That's unexpected, convert over to these.
    chrom, coords = data["region"].split(":")
    start, end = coords.split("-")
    ready_region = "%s:%s-%s" % (chrom, int(start) + 1, end)
    str_region = ready_region.replace(":", "_")
    batches = dd.get_batches(data) or dd.get_sample_name(data)
    if not isinstance(batches, (list, tuple)):
        batches = [batches]
    caller = dd.get_variantcaller(data)
    out_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "joint", caller, str_region))
    out_file = os.path.join(out_dir, "%s-%s-%s.vcf.gz" % (batches[0], caller, str_region))
    vrn_files = [d["vrn_file"] for d in items]
    joint_out = square_batch_region(data, ready_region, [], vrn_files, out_file)[0]
    data["vrn_file_region"] = joint_out["vrn_file"]
    return data
def postprocess_variants(items):
    """Provide post-processing of variant calls: filtering and effects annotation.

    Accepts either a single sample dictionary or a list of (possibly nested)
    sample items. List input is unpacked and, when the first item carries a
    "vrn_file_joint" entry, that joint call is processed instead of
    "vrn_file". Returns the processed representative sample as ``[[data]]``.
    """
    vrn_key = "vrn_file"
    if not isinstance(items, dict):
        items = [utils.to_single_data(x) for x in items]
        if "vrn_file_joint" in items[0]:
            vrn_key = "vrn_file_joint"
    data, items = _get_batch_representative(items, vrn_key)
    items = cwlutils.unpack_tarballs(items, data)
    data = cwlutils.unpack_tarballs(data, data)
    cur_name = "%s, %s" % (dd.get_sample_name(data), get_variantcaller(data))
    logger.info("Finalizing variant calls: %s" % cur_name)
    # remember the incoming path so it can be restored at the end
    orig_vrn_file = data.get(vrn_key)
    data = _symlink_to_workdir(data, [vrn_key])
    data = _symlink_to_workdir(data, ["config", "algorithm", "variant_regions"])
    if data.get(vrn_key):
        logger.info("Calculating variation effects for %s" % cur_name)
        ann_vrn_file, vrn_stats = effects.add_to_vcf(data[vrn_key], data)
        # only replace the calls/stats when annotation produced output
        if ann_vrn_file:
            data[vrn_key] = ann_vrn_file
        if vrn_stats:
            data["vrn_stats"] = vrn_stats
        orig_items = _get_orig_items(items)
        logger.info("Annotate VCF file: %s" % cur_name)
        data[vrn_key] = annotation.finalize_vcf(data[vrn_key], get_variantcaller(data), orig_items)
        logger.info("Filtering for %s" % cur_name)
        data[vrn_key] = variant_filtration(data[vrn_key], dd.get_ref_file(data),
                                           tz.get_in(("genome_resources", "variation"), data, {}),
                                           data, orig_items)
        logger.info("Prioritization for %s" % cur_name)
        prio_vrn_file = prioritize.handle_vcf_calls(data[vrn_key], data, orig_items)
        # NOTE(review): germline extraction is nested under the prioritization
        # change here; the flattened source is ambiguous on this -- confirm.
        if prio_vrn_file != data[vrn_key]:
            data[vrn_key] = prio_vrn_file
            logger.info("Germline extraction for %s" % cur_name)
            data = germline.extract(data, orig_items)

        if dd.get_align_bam(data):
            data = damage.run_filter(data[vrn_key], dd.get_align_bam(data), dd.get_ref_file(data),
                                     data, orig_items)
    # when the result is still the same file on disk as the input (presumably
    # reached through the work directory symlink), report the original path
    if orig_vrn_file and os.path.samefile(data[vrn_key], orig_vrn_file):
        data[vrn_key] = orig_vrn_file
    return [[data]]
def concat_batch_variantcalls(items, region_block=True, skip_jointcheck=False):
    """CWL entry point: merge per-region variant calls into one batch VCF.

    Regions come from the first item's "region_block" (first entry of each
    block) when ``region_block`` is set, otherwise from its "region" list.
    Returns a dictionary with the combined file under "vrn_file".
    """
    items = [utils.to_single_data(x) for x in items]
    first = items[0]
    batch_name = _get_batch_name(items, skip_jointcheck)
    variantcaller = _get_batch_variantcaller(items)
    out_file = os.path.join(dd.get_work_dir(first), variantcaller,
                            "%s.vcf.gz" % (batch_name))
    utils.safe_makedir(os.path.dirname(out_file))
    if region_block:
        raw_regions = (block[0] for block in first["region_block"])
    else:
        raw_regions = first["region"]
    regions = [_region_to_coords(r) for r in raw_regions]
    vrn_file_regions = first["vrn_file_region"]
    out_file = vcfutils.concat_variant_files(vrn_file_regions, out_file, regions,
                                             dd.get_ref_file(first), first["config"])
    return {"vrn_file": out_file}
def batch(samples):
    """CWL: batch together per sample, joint and germline calls for ensemble combination.

    Sets up groups of same sample/batch variant calls for ensemble calling, as
    long as we have more than one caller per group. Groups with a single call
    are dropped from the output.

    Returns a list of sample dictionaries, one per group, each a deep copy of
    the group's first call (by variant caller order) with "batch_id",
    "batch_samples" and "variants" added. Groups are ordered by the earliest
    position of any of their samples in the input.
    """
    samples = [utils.to_single_data(x) for x in samples]
    sample_order = [dd.get_sample_name(x) for x in samples]
    batch_groups = collections.defaultdict(list)
    for data in samples:
        batch_samples = tuple(data.get("batch_samples", [dd.get_sample_name(data)]))
        batch_groups[(batch_samples, dd.get_phenotype(data))].append(data)

    out = []
    for (batch_samples, phenotype), gsamples in batch_groups.items():
        if len(gsamples) > 1:
            batches = set()
            for d in gsamples:
                # Samples without a batch can yield an empty/None result;
                # treat it as "no batches" so the sample name fallback below
                # is used instead of failing inside set().
                batches |= set(dd.get_batches(d) or [])
            gsamples.sort(key=dd.get_variantcaller_order)
            cur = copy.deepcopy(gsamples[0])
            cur.update({"batch_id": sorted(list(batches))[0] if batches else "_".join(batch_samples),
                        "batch_samples": batch_samples,
                        "variants": {"variantcallers": [dd.get_variantcaller(d) for d in gsamples],
                                     "calls": [d.get("vrn_file") for d in gsamples]}})
            out.append(cur)

    def by_original_order(d):
        # earliest input position of any sample in the group
        return min([sample_order.index(s) for s in d["batch_samples"] if s in sample_order])
    return sorted(out, key=by_original_order)
def run_salmon_reads(data):
    """Quantify a sample's input reads with salmon.

    Uses the first two input files as a read pair, or the first file alone
    for single end input. The quantification output and the salmon working
    directory are stored under the salmon keys of the sample, so that a
    sailfish run on the same sample is not overwritten.

    Returns the updated sample as ``[[data]]``.
    Raises AssertionError when the GTF or reference FASTA file is missing.
    """
    data = utils.to_single_data(data)
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    out_file = salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, fasta_file, data)
    # record under the salmon keys; this previously reused the sailfish setters
    data = dd.set_salmon(data, out_file)
    data = dd.set_salmon_dir(data, salmon_dir)
    return [[data]]
def square_off(samples, run_parallel):
    """Joint call all variants within each batch.

    Samples with batch metadata are expanded per joint caller; those with a
    compatible joint caller are processed in parallel by callable region,
    and everything else is passed through unchanged at the end of the output.
    """
    to_process = []
    extras = []
    for data in [utils.to_single_data(x) for x in samples]:
        compatible = []
        if tz.get_in(("metadata", "batch"), data):
            per_caller = genotype.handle_multiple_callers(data, "jointcaller",
                                                          require_bam=False)
            compatible = [[add] for add in per_caller
                          if _is_jointcaller_compatible(add)]
        if compatible:
            to_process.extend(compatible)
        else:
            extras.append([data])
    processed = grouped_parallel_split_combine(
        to_process, _split_by_callable_region, multi.group_batches_joint,
        run_parallel, "square_batch_region", "concat_variant_files",
        "vrn_file", ["region", "sam_ref", "config"])
    return _combine_to_jointcaller(processed) + extras
def _get_batch_representative(items, key):
    """Pick one data item to represent a batch, along with the unpacked batch.

    A single dictionary (standard bcbio) is returned as both values. For a
    list (CWL), the first item containing ``key`` is returned together with
    all unpacked items.

    Raises ValueError unless the items containing ``key`` share exactly one
    value for it.
    """
    if isinstance(items, dict):
        return items, items
    items = [utils.to_single_data(x) for x in items]
    with_key = [data for data in items if key in data]
    vals = {data[key] for data in with_key}
    if len(vals) != 1:
        raise ValueError("Incorrect values for %s: %s" % (key, list(vals)))
    return with_key[0], items
def calculate_sv_coverage(data):
    """Calculate coverage in target and antitarget bins for CNV calling.

    Writes per-sample target/antitarget coverage files with log2 depths and
    records them in ``data["depth"]["bins"]``. Samples not using the general
    SV bins are returned untouched. Returns ``[[data]]``.
    """
    from bcbio.variation import coverage
    from bcbio.structural import annotate, cnvkit
    data = utils.to_single_data(data)
    if not cnvkit.use_general_sv_bins(data):
        return [[data]]
    sample_name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "structural", sample_name, "bins"))
    out_target_file = os.path.join(work_dir, "%s-target-coverage.cnn" % sample_name)
    out_anti_file = os.path.join(work_dir, "%s-antitarget-coverage.cnn" % sample_name)
    have_outputs = (utils.file_exists(out_target_file) and
                    utils.file_exists(out_anti_file))
    if not have_outputs and (dd.get_align_bam(data) or dd.get_work_bam(data)):
        # mosdepth coverage over each set of bins, annotated with genes
        target_cov = coverage.run_mosdepth(
            data, "target", tz.get_in(["regions", "bins", "target"], data))
        anti_cov = coverage.run_mosdepth(
            data, "antitarget", tz.get_in(["regions", "bins", "antitarget"], data))
        target_cov_genes = annotate.add_genes(target_cov.regions, data, max_distance=0)
        anti_cov_genes = annotate.add_genes(anti_cov.regions, data, max_distance=0)
        out_target_file = _add_log2_depth(target_cov_genes, out_target_file, data)
        out_anti_file = _add_log2_depth(anti_cov_genes, out_anti_file, data)
        # TODO: Correct for GC bias
    if os.path.exists(out_target_file):
        data["depth"]["bins"] = {"target": out_target_file,
                                 "antitarget": out_anti_file}
    return [[data]]
def run_rnaseq_ann_filter(data):
    """Annotate and filter RNA-seq variant calls for a sample.

    When calls are present they are annotated with effects and vcfanno,
    keeping the previous file whenever a step produces no output. Calls
    from a "gatk-haplotype" caller get the GATK RNA-seq filter, and variants
    close to splice junctions are always filtered. Returns ``[[data]]``.
    """
    data = to_single_data(data)
    if dd.get_vrn_file(data):
        effects_vcf = effects.add_to_vcf(dd.get_vrn_file(data), data)[0]
        if effects_vcf:
            data = dd.set_vrn_file(data, effects_vcf)
        vcfanno_vcf = population.run_vcfanno(dd.get_vrn_file(data), data)
        if vcfanno_vcf:
            data = dd.set_vrn_file(data, vcfanno_vcf)
    caller = dd.get_variantcaller(data)
    if caller and "gatk-haplotype" in caller:
        filtered_vcf = variation.gatk_filter_rnaseq(dd.get_vrn_file(data), data)
        data = dd.set_vrn_file(data, filtered_vcf)
    # remove variants close to splice junctions
    junction_vcf = variation.filter_junction_variants(dd.get_vrn_file(data), data)
    data = dd.set_vrn_file(data, junction_vcf)
    return [[data]]
def determine_indexes_to_make(samples):
    """Return the subset of samples that need distinct transcriptome indexes.

    Each sample maps to a combined FASTA path built from its work directory,
    genome build and any disambiguation genomes. Only the first sample seen
    for each path is returned, so each index is made once.
    """
    samples = [to_single_data(x) for x in samples]
    indexes = set()
    tomake = []
    for data in samples:
        out_dir = os.path.join(dd.get_work_dir(data), "inputs", "transcriptome")
        out_stem = os.path.join(out_dir, dd.get_genome_build(data))
        # Append the disambiguation genomes once; this statement used to be
        # duplicated, which added the suffix to the file name twice.
        disambiguate = dd.get_disambiguate(data)
        if disambiguate:
            out_stem = "-".join([out_stem] + disambiguate)
        combined_file = out_stem + ".fa"
        if combined_file not in indexes:
            tomake.append(data)
            indexes.add(combined_file)
    return tomake
def _normalize_cwl_inputs(items):
    """Extract variation and validation data from CWL input list of batched samples.

    Returns a single sample: the first input when no sample has a validation
    file configured, otherwise the sample with validation, with its
    "vrn_file" set to the variant file shared by the batch.

    Raises AssertionError when the inputs have more than one distinct
    validation file, or do not share exactly one variant file.
    """
    with_validate = {}
    vrn_files = []
    ready_items = []
    for data in (cwlutils.normalize_missing(utils.to_single_data(d)) for d in items):
        if tz.get_in(["config", "algorithm", "validate"], data):
            # key by checksum so identical validation files collapse together
            with_validate[_checksum(tz.get_in(["config", "algorithm", "validate"], data))] = data
        if data.get("vrn_file"):
            vrn_files.append(data["vrn_file"])
        ready_items.append(data)
    if len(with_validate) == 0:
        return ready_items[0]
    else:
        assert len(with_validate) == 1, len(with_validate)
        assert len(set(vrn_files)) == 1
        # dict views are not indexable on Python 3; materialize first
        data = list(with_validate.values())[0]
        data["vrn_file"] = vrn_files[0]
        return data
def run_salmon_reads(data):
    """Quantify a sample's input reads with salmon.

    BAM input is first converted to fastq. The first two input files are
    used as a read pair, or the first file alone otherwise. The salmon
    output and working directory are recorded on the sample, which is
    returned as ``[[data]]``.
    """
    data = utils.to_single_data(data)
    files = dd.get_input_sequence_files(data)
    if bam.is_bam(files[0]):
        dirs = data["dirs"]
        files = fastq.convert_bam_to_fastq(files[0], dirs["work"], data, dirs,
                                           data["config"])
    samplename = dd.get_sample_name(data)
    salmon_dir = os.path.join(dd.get_work_dir(data), "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    fq1, fq2 = files if len(files) == 2 else (files[0], None)
    fasta_file = dd.get_ref_file(data)
    quant_file = salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, fasta_file, data)
    data = dd.set_salmon(data, quant_file)
    data = dd.set_salmon_dir(data, salmon_dir)
    return [[data]]
def variantcall_batch_region(items):
    """CWL entry point: call variants for a batch of samples in one region.

    All items that define a region must agree on it. Returns a dictionary
    with the called file under "vrn_file_region" and the region formatted
    as "chrom:start-end" under "region".
    """
    items = [utils.to_single_data(x) for x in items]
    first = items[0]
    align_bams = [dd.get_align_bam(x) for x in items]
    variantcaller = _get_batch_variantcaller(items)
    regions = list({x["region"] for x in items if "region" in x})
    assert len(regions) == 1, regions
    caller_fn = get_variantcallers()[variantcaller]
    assoc_files = tz.get_in(("genome_resources", "variation"), first, {})
    region = _region_to_coords(regions[0])
    chrom, start, end = region
    region_str = "_".join(str(x) for x in region)
    batch_name = _get_batch_name(items)
    out_file = os.path.join(dd.get_work_dir(first), variantcaller, chrom,
                            "%s-%s.vcf.gz" % (batch_name, region_str))
    utils.safe_makedir(os.path.dirname(out_file))
    call_file = caller_fn(align_bams, items, dd.get_ref_file(first), assoc_files,
                          region, out_file)
    return {"vrn_file_region": call_file,
            "region": "%s:%s-%s" % (chrom, start, end)}
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.

    Copies each distinct call into a "variants/calls" or "variants/gvcf"
    directory under the work directory, bgzips and indexes it, and returns
    a one-item list holding a dictionary with:

    - "validate": combined validation output
    - "variants": "calls" and "gvcf" file lists, plus "samples", the list
      of output files for each sample in first-seen order
    """
    items = [utils.to_single_data(x) for x in validate.summarize_grading(items)]
    out = {"validate": _combine_validations(items),
           "variants": {"calls": [], "gvcf": [], "samples": []}}
    seen_names = set()
    variants_by_sample = collections.defaultdict(list)
    sample_order = []
    for data in items:
        sample_name = dd.get_sample_name(data)
        batch_samples = data.get("batch_samples", [sample_name])
        for s in batch_samples:
            if s not in sample_order:
                sample_order.append(s)
        if not data.get("vrn_file"):
            continue
        batch_name = (dd.get_batches(data) or [sample_name])[0]
        if data.get("vrn_file_joint") is None:
            to_add = [("vrn_file", "calls", batch_name)]
        else:
            # with joint calling the per-sample file is the gVCF
            to_add = [("vrn_file", "gvcf", sample_name),
                      ("vrn_file_joint", "calls", batch_name)]
        for vrn_key, out_key, name in to_add:
            cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
            out_dir = utils.safe_makedir(
                os.path.join(dd.get_work_dir(data), "variants", out_key))
            out_file = os.path.join(out_dir, "%s.vcf.gz" % cur_name)
            for s in batch_samples:
                variants_by_sample[s].append(out_file)
            if cur_name in seen_names:
                continue
            seen_names.add(cur_name)
            # Ideally could symlink here but doesn't appear to work with
            # Docker container runs on Toil where PATHs don't get remapped
            utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
            vcfutils.bgzip_and_index(out_file, data["config"])
            out["variants"][out_key].append(out_file)
    for sample in sample_order:
        out["variants"]["samples"].append(variants_by_sample[sample])
    return [out]
def generate_parallel(samples, run_parallel):
    """Prepare summary information for alignment and variant calling in parallel.

    Runs per-sample QC, qsignature and project level summaries, attaches the
    resulting files to each sample's "summary" and finishes with MultiQC.
    Returns the MultiQC output as a list of single-item sample lists.
    """
    to_analyze, extras = _split_samples_by_qc(samples)
    qced = run_parallel("pipeline_summary", to_analyze)
    samples = _combine_qc_samples(qced) + extras
    qsign_info = run_parallel("qsignature_summary", [samples])
    metadata_file = _merge_metadata([samples])
    summary_file = write_project_summary(samples, qsign_info)
    out = []
    for sample in samples:
        data = sample[0]
        if "summary" not in data:
            data["summary"] = {}
        summary = data["summary"]
        summary["project"] = summary_file
        summary["metadata"] = metadata_file
        if qsign_info:
            summary["mixup_check"] = qsign_info[0]["out_dir"]
        out.append(sample)
    out = _add_researcher_summary(out, summary_file)
    # MultiQC must be run after all file outputs are set:
    multiqc_out = run_parallel("multiqc_summary", [out])
    return [[utils.to_single_data(d)] for d in multiqc_out]
def _dup_samples_by_variantcaller(samples, require_bam=True):
    """Split samples per variant caller, duplicating those with several callers.

    Returns ``(to_process, extras)``: one item per sample/caller pair, tagged
    with the caller's position, and the samples that yielded no caller items,
    handled as precalled with order 0.
    """
    samples = [utils.to_single_data(x) for x in samples]
    samples = germline.split_somatic(samples)
    to_process = []
    extras = []
    for data in samples:
        per_caller = [
            [dd.set_variantcaller_order(add, i)]
            for i, add in enumerate(
                handle_multiple_callers(data, "variantcaller", require_bam=require_bam))]
        if per_caller:
            to_process.extend(per_caller)
        else:
            data = _handle_precalled(data)
            data = dd.set_variantcaller_order(data, 0)
            extras.append([data])
    return to_process, extras
def run_salmon_decoy(data):
    """Quantify a sample's input reads with salmon against a decoy index.

    BAM input is first converted to fastq. Records the salmon output,
    working directory and fragment length file on the sample, and registers
    the fragment length file as the "salmon" QC summary. Returns ``[[data]]``.
    """
    data = utils.to_single_data(data)
    files = dd.get_input_sequence_files(data)
    if bam.is_bam(files[0]):
        dirs = data["dirs"]
        files = fastq.convert_bam_to_fastq(files[0], dirs["work"], data, dirs,
                                           data["config"])
    samplename = dd.get_sample_name(data)
    salmon_dir = os.path.join(dd.get_work_dir(data), "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    fq1, fq2 = files if len(files) == 2 else (files[0], None)
    index = salmon_decoy_index(gtf_file, data, os.path.dirname(salmon_dir))
    quant_file = salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, data, index)
    data = dd.set_salmon(data, quant_file)
    data = dd.set_salmon_dir(data, salmon_dir)
    data = dd.set_salmon_fraglen_file(data, _get_fraglen_file(salmon_dir))
    data = dd.update_summary_qc(data, "salmon", base=dd.get_salmon_fraglen_file(data))
    return [[data]]
def _combine_qc_samples(samples):
    """Merge split QC analyses back into one sample per aligned BAM file.

    Samples sharing an aligned BAM are collapsed onto the first of them,
    which receives the merged QC outputs, merged metrics and the
    concatenated list of QC tools. Returns a list of single-item lists.
    """
    by_bam = collections.defaultdict(list)
    for sample in [utils.to_single_data(x) for x in samples]:
        by_bam[dd.get_align_bam(sample)].append(sample)
    combined = []
    for group in by_bam.values():
        merged_qc = {}
        merged_metrics = {}
        qc_tools = []
        for part in group:
            merged_qc.update(dd.get_summary_qc(part))
            merged_metrics.update(dd.get_summary_metrics(part))
            qc_tools.extend(dd.get_algorithm_qc(part))
        representative = group[0]
        representative["config"]["algorithm"]["qc"] = qc_tools
        representative["summary"]["qc"] = merged_qc
        representative["summary"]["metrics"] = merged_metrics
        combined.append([representative])
    return combined
def postprocess_alignment(data):
    """Post-process a full BAM file and prepare callable regions.

    For BAMs needing processing, links the BAM into the sample's alignment
    directory, indexes it, computes callable regions and depth, then runs
    interval assignment, samtools statistics and recalibration. Without a
    BAM to process, configured variant regions are used as the callable
    regions instead. Returns ``[[data]]``.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    ref_file = dd.get_ref_file(data)
    process_bam = (vmulti.bam_needs_processing(data) and bam_file
                   and bam_file.endswith(".bam"))
    if process_bam:
        out_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data)))
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        _region_bed, nblock_bed = callable.block_regions(
            covinfo.raw_callable, bam_file_ready, ref_file, data)
        data["regions"] = {"nblock": nblock_bed,
                           "callable": covinfo.raw_callable,
                           "sample_callable": covinfo.callable,
                           "mapped_stats": readstats.get_cache_file(data)}
        data["depth"] = covinfo.depth_files
        for step in (coverage.assign_interval, samtools.run_and_save,
                     recalibrate.prep_recal, recalibrate.apply_recal):
            data = step(data)
    else:
        variant_regions = dd.get_variant_regions(data)
        if variant_regions:
            _region_bed, nblock_bed = callable.block_regions(
                variant_regions, bam_file, ref_file, data)
            data["regions"] = {"nblock": nblock_bed,
                               "callable": variant_regions,
                               "sample_callable": variant_regions}
    return [[data]]
def pipeline_summary(data):
    """Provide summary information on processing sample.

    Handles standard and CWL (single QC output) cases. QC runs on the
    aligned BAM, or on "clean_fastq" for smallRNA-seq and "raw_bam" for
    ChIP-seq analyses. Samples without a reference file or a usable input
    are returned without a summary. Returns ``[[data]]``.
    """
    data = utils.to_single_data(data)
    work_bam = data.get("align_bam")
    analysis = data["analysis"].lower()
    if analysis.startswith("smallrna-seq"):
        work_bam = data["clean_fastq"]
    elif analysis.startswith("chip-seq"):
        work_bam = data["raw_bam"]
    elif not work_bam or not work_bam.endswith(".bam"):
        # align_bam can be missing or None (no alignment done); skip QC
        # for those samples rather than failing on None.endswith
        work_bam = None
    if dd.get_ref_file(data) is not None and work_bam:
        logger.info("QC: %s %s" % (dd.get_sample_name(data), ", ".join(dd.get_algorithm_qc(data))))
        data["summary"] = _run_qc_tools(work_bam, data)
        if (len(dd.get_algorithm_qc(data)) == 1 and "output_cwl_keys" in data):
            # CWL with a single QC tool expects that tool's output directly
            data["summary"]["qc"] = data["summary"]["qc"].get(dd.get_algorithm_qc(data)[0])
    return [[data]]
def postprocess_alignment(data):
    """Post-process a full BAM file and prepare callable regions.

    Links the BAM into the sample's alignment directory, indexes it, then
    records callable regions, coverage and off-target statistics under
    ``data["regions"]``. When no variant regions are configured, the
    callable regions are used for them. Samples without a BAM needing
    processing are returned unchanged. Returns ``[[data]]``.
    """
    data = utils.to_single_data(data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    process_bam = (vmulti.bam_needs_processing(data) and bam_file
                   and bam_file.endswith(".bam"))
    if not process_bam:
        return [[data]]
    ref_file = dd.get_ref_file(data)
    out_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data)))
    bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
    if not utils.file_exists(bam_file_ready):
        utils.symlink_plus(bam_file, bam_file_ready)
    bam.index(bam_file_ready, data["config"])
    covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
    callable_region_bed, nblock_bed, callable_bed = callable.block_regions(
        covinfo.callable, bam_file_ready, ref_file, data)
    vrs_file = dd.get_variant_regions_merged(data)
    offtarget_stats = callable.calculate_offtarget_stats(
        bam_file_ready, data, vrs_file, "variant_regions")
    data["regions"] = {"nblock": nblock_bed,
                       "callable": callable_bed,
                       "highdepth": covinfo.highdepth,
                       "sample_callable": covinfo.callable,
                       "coverage_bed": covinfo.coverage,
                       "avg_coverage": covinfo.avg_coverage,
                       "offtarget_stats": offtarget_stats}
    data = coverage.assign_interval(data)
    algorithm = data["config"]["algorithm"]
    if os.path.exists(callable_region_bed) and not algorithm.get("variant_regions"):
        # fall back to the callable regions when none were configured
        algorithm["variant_regions"] = callable_region_bed
        data = bedutils.clean_inputs(data)
    data = _recal_no_markduplicates(data)
    return [[data]]
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.

    Prepares list of callable genome regions allowing subsequent parallelization.
    Links the BAM into the sample's alignment directory, indexes it, records
    callable regions and depth under ``data["regions"]``, uses the callable
    regions as variant regions when none are configured, and runs
    recalibration. Samples without a BAM needing processing pass through.
    Returns ``[[data]]``.
    """
    # The unused local "params" dictionary (downsampling settings that were
    # never read) has been removed.
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(".bam"):
        ref_file = dd.get_ref_file(data)
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                  dd.get_sample_name(data)))
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        callable_region_bed, nblock_bed, callable_bed = \
            callable.block_regions(covinfo.raw_callable, bam_file_ready, ref_file, data)
        data["regions"] = {"nblock": nblock_bed,
                           "callable": callable_bed,
                           "sample_callable": covinfo.callable,
                           "coverage_depth_bed": covinfo.depth}
        data = coverage.assign_interval(data)
        if (os.path.exists(callable_region_bed) and
                not data["config"]["algorithm"].get("variant_regions")):
            # fall back to the callable regions when none were configured
            data["config"]["algorithm"]["variant_regions"] = callable_region_bed
            data = clean_inputs(data)
        data = recalibrate.prep_recal(data)
        data = recalibrate.apply_recal(data)
    return [[data]]
def normalize_sv_coverage(*items):
    """Normalize CNV coverage, providing flexible point for multiple methods.

    Samples are grouped by the bin group stored at regions -> bins -> group,
    and each group is normalized with the function matching its bin approach
    ("cnvkit" or "gatk-cnv"). Samples with results get "background" and
    "normalized" files added under ``data["depth"]["bins"]``.

    Returns a list of single-item sample lists in input order.
    Raises AssertionError when a group has no input (case) samples.
    """
    calcfns = {"cnvkit": _normalize_sv_coverage_cnvkit, "gatk-cnv": _normalize_sv_coverage_gatk}
    from bcbio.structural import cnvkit
    from bcbio.structural import shared as sshared
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]
    if all(not cnvkit.use_general_sv_bins(x) for x in items):
        return [[d] for d in items]
    # Collect samples per group explicitly. itertools.groupby only merges
    # adjacent items, so a group whose samples are not contiguous in the
    # input would be split and normalized in separate pieces.
    by_group = {}
    group_order = []
    for data in items:
        group_id = tz.get_in(["regions", "bins", "group"], data)
        if group_id not in by_group:
            by_group[group_id] = []
            group_order.append(group_id)
        by_group[group_id].append(data)
    out_files = {}
    back_files = {}
    for group_id in group_order:
        # No CNVkit calling for this particular set of samples
        if group_id is None:
            continue
        inputs, backgrounds = sshared.find_case_control(list(by_group[group_id]))
        assert inputs, "Did not find inputs for sample batch: %s" % (" ".join(dd.get_sample_name(x) for x in items))
        work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(inputs[0]), "structural",
                                                   dd.get_sample_name(inputs[0]), "bins"))
        back_files, out_files = calcfns[cnvkit.bin_approach(inputs[0])](group_id, inputs, backgrounds,
                                                                       work_dir, back_files, out_files)
    out = []
    for data in items:
        if dd.get_sample_name(data) in out_files:
            data["depth"]["bins"]["background"] = back_files[dd.get_sample_name(data)]
            data["depth"]["bins"]["normalized"] = out_files[dd.get_sample_name(data)]
        out.append([data])
    return out