def batch_for_variantcall(samples):
    """Prepare a set of samples for parallel variant calling.

    CWL input target that groups samples into batches and variant callers
    for parallel processing.

    If doing joint calling, with `tools_on: [gvcf]`, split the sample into
    individuals instead of combining into a batch.
    """
    to_process, extras = _dup_samples_by_variantcaller(samples, require_bam=False)
    batch_groups = collections.defaultdict(list)
    to_process = [utils.to_single_data(x) for x in to_process]
    for data in cwlutils.samples_to_records(to_process):
        vc = get_variantcaller(data, require_bam=False)
        # Fall back to per-sample grouping when no batch is configured.
        batches = dd.get_batches(data) or dd.get_sample_name(data)
        if not isinstance(batches, (list, tuple)):
            batches = [batches]
        for b in batches:
            batch_groups[(b, vc)].append(utils.deepish_copy(data))
    batches = []
    for cur_group in batch_groups.values():
        # Joint calling processes each sample individually; the group is
        # split into one-item batches instead of a single combined batch.
        joint_calling = any([is_joint(d) for d in cur_group])
        if joint_calling:
            for d in cur_group:
                batches.append([d])
        else:
            batches.append(cur_group)
    return batches + extras
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.

    Copies per-sample/batch VCFs into `variants/calls` and `variants/gvcf`
    output directories, de-duplicating by batch-plus-caller name.
    """
    items = [utils.to_single_data(x) for x in validate.summarize_grading(items)]
    out = {"validate": items[0]["validate"],
           "variants": {"calls": [], "gvcf": []}}
    added = set([])
    for data in items:
        if data.get("vrn_file"):
            # Prefer the batch name for outputs; fall back to the sample name.
            names = dd.get_batches(data)
            if not names:
                names = [dd.get_sample_name(data)]
            batch_name = names[0]
            # With joint calling, the per-sample file is a gVCF and the
            # squared-off joint file is the batch-level call set.
            if data.get("vrn_file_joint") is not None:
                to_add = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                          ("vrn_file_joint", "calls", batch_name)]
            else:
                to_add = [("vrn_file", "calls", batch_name)]
            for vrn_key, out_key, name in to_add:
                cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
                if cur_name not in added:
                    out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                                                            "variants", out_key)),
                                            "%s.vcf.gz" % cur_name)
                    added.add(cur_name)
                    # Ideally could symlink here but doesn't appear to work with
                    # Docker container runs on Toil where PATHs don't get remapped
                    utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
                    vcfutils.bgzip_and_index(out_file, data["config"])
                    out["variants"][out_key].append(out_file)
    return [out]
def _check_for_problem_somatic_batches(items, config):
    """Identify problem batch setups for somatic calling.

    We do not support multiple tumors in a single batch and VarDict(Java) does
    not handle pooled calling, only tumor/normal.
    """
    # Apply per-sample custom configuration to deep copies so the checks see
    # the same configuration the pipeline will actually run with.
    to_check = []
    for data in items:
        data = copy.deepcopy(data)
        data["config"] = config_utils.update_w_custom(config, data)
        to_check.append(data)
    data_by_batches = collections.defaultdict(list)
    for data in to_check:
        batches = dd.get_batches(data)
        if batches:
            for batch in batches:
                data_by_batches[batch].append(data)
    for batch, items in data_by_batches.items():
        if vcfutils.get_paired(items):
            # Tumor/normal pairing present: validate the pairing itself.
            vcfutils.check_paired_problems(items)
        elif len(items) > 1:
            # Multi-sample pooled batch without tumor/normal pairing.
            vcs = list(
                set(
                    tz.concat(
                        [dd.get_variantcaller(data) or [] for data in items])))
            if any(x.lower().startswith("vardict") for x in vcs):
                raise ValueError(
                    "VarDict does not support pooled non-tumor/normal calling, in batch %s: %s"
                    % (batch, [dd.get_sample_name(data) for data in items]))
def _check_for_problem_somatic_batches(items, config):
    """Identify problem batch setups for somatic calling.

    We do not support multiple tumors in a single batch and VarDict(Java) does
    not handle pooled calling, only tumor/normal.
    """
    # Apply per-sample custom configuration to deep copies so the checks see
    # the same configuration the pipeline will actually run with.
    to_check = []
    for data in items:
        data = copy.deepcopy(data)
        data["config"] = config_utils.update_w_custom(config, data)
        to_check.append(data)
    data_by_batches = collections.defaultdict(list)
    for data in to_check:
        batches = dd.get_batches(data)
        if batches:
            for batch in batches:
                data_by_batches[batch].append(data)
    for batch, items in data_by_batches.items():
        if vcfutils.get_paired(items):
            # Tumor/normal pairing present: validate the pairing itself.
            vcfutils.check_paired_problems(items)
        elif len(items) > 1:
            # Multi-sample pooled batch without tumor/normal pairing: reject
            # callers that require paired or tumor-labelled inputs.
            vcs = vcfutils.get_somatic_variantcallers(items)
            if "vardict" in vcs:
                raise ValueError(
                    "VarDict does not support pooled non-tumor/normal calling, in batch %s: %s"
                    % (batch, [dd.get_sample_name(data) for data in items]))
            elif "mutect" in vcs or "mutect2" in vcs:
                raise ValueError(
                    "MuTect and MuTect2 require a 'phenotype: tumor' sample for calling, "
                    "in batch %s: %s" % (batch, [dd.get_sample_name(data) for data in items]))
def _check_for_problem_somatic_batches(items, config):
    """Identify problem batch setups for somatic calling.

    We do not support multiple tumors in a single batch and VarDict(Java) does
    not handle pooled calling, only tumor/normal.
    """
    # Apply per-sample custom configuration to deep copies so the checks see
    # the same configuration the pipeline will actually run with.
    to_check = []
    for data in items:
        data = copy.deepcopy(data)
        data["config"] = config_utils.update_w_custom(config, data)
        to_check.append(data)
    data_by_batches = collections.defaultdict(list)
    for data in to_check:
        batches = dd.get_batches(data)
        if batches:
            for batch in batches:
                data_by_batches[batch].append(data)
    for batch, items in data_by_batches.items():
        if vcfutils.get_paired(items):
            # Tumor/normal pairing present: validate the pairing itself.
            vcfutils.check_paired_problems(items)
        elif len(items) > 1:
            # Multi-sample pooled batch without tumor/normal pairing.
            vcs = list(set(tz.concat([dd.get_variantcaller(data) or [] for data in items])))
            if any(x.lower().startswith("vardict") for x in vcs):
                raise ValueError("VarDict does not support pooled non-tumor/normal calling, in batch %s: %s"
                                 % (batch, [dd.get_sample_name(data) for data in items]))
            elif any(x.lower() == "mutect" for x in vcs):
                # NOTE(review): only "mutect" is rejected here; a sibling version
                # of this check also rejects "mutect2" -- confirm whether the
                # same restriction should apply.
                raise ValueError("Mutect requires a 'phenotype: tumor' sample for calling, in batch %s: %s"
                                 % (batch, [dd.get_sample_name(data) for data in items]))
def batch_for_variantcall(samples):
    """Prepare a set of samples for parallel variant calling.

    CWL input target that groups samples into batches and variant callers
    for parallel processing.
    """
    # CWL record keys that must always be list-valued for downstream steps.
    convert_to_list = set(["config__algorithm__tools_on", "config__algorithm__tools_off"])
    to_process, extras = _dup_samples_by_variantcaller(samples, require_bam=False)
    batch_groups = collections.defaultdict(list)
    to_process = [utils.to_single_data(x) for x in to_process]
    # Collect the union of CWL keys across samples so every record gets a
    # uniform set of keys (missing ones are filled with None below).
    all_keys = set([])
    for data in to_process:
        all_keys.update(set(data["cwl_keys"]))
    for data in to_process:
        for raw_key in sorted(list(all_keys)):
            # "__"-separated raw keys address nested dictionary positions.
            key = raw_key.split("__")
            if tz.get_in(key, data) is None:
                data = tz.update_in(data, key, lambda x: None)
                data["cwl_keys"].append(raw_key)
            if raw_key in convert_to_list:
                # Normalize scalar/None values into lists.
                val = tz.get_in(key, data)
                if not val:
                    val = []
                elif not isinstance(val, (list, tuple)):
                    val = [val]
                data = tz.update_in(data, key, lambda x: val)
        vc = get_variantcaller(data, require_bam=False)
        # Fall back to per-sample grouping when no batch is configured.
        batches = dd.get_batches(data) or dd.get_sample_name(data)
        if not isinstance(batches, (list, tuple)):
            batches = [batches]
        for b in batches:
            batch_groups[(b, vc)].append(utils.deepish_copy(data))
    return list(batch_groups.values()) + extras
def _get_vcf_samples(calls, items):
    """Map VCF call files back to the sample names they belong to.

    Matches files by sample-name or batch-name prefixes. For tumor/normal
    batches, germline VCFs ("-germline" suffix) attach to normal samples
    while standard somatic calls attach to tumors. Single-sample (gVCF)
    matches are preferred over joint batch matches.
    """
    have_full_file = False
    all_samples = set([])
    sample_matches = False
    for f in utils.flatten(calls):
        if have_full_file:
            cur = set(vcfutils.get_samples(f))
            if cur:
                # Intersect across files so only shared samples remain.
                if not all_samples:
                    all_samples = cur
                else:
                    all_samples &= set(cur)
        else:
            for data in items:
                # Bug fix: dd.get_batches can return None for unbatched
                # samples; guard before concatenating with the sample name.
                test_names = [dd.get_sample_name(data)] + (dd.get_batches(data) or [])
                for i, test_name in enumerate(test_names):
                    # For tumor/normal batches, want to attach germline VCFs to normals
                    # Standard somatics go to tumors
                    if dd.get_phenotype(data) == "normal":
                        test_name += "-germline"
                    if os.path.basename(f).startswith(("%s-" % test_name, "%s." % test_name)):
                        # Prefer matches to single samples (gVCF) over joint batches
                        if i == 0:
                            sample_matches = True
                        if sample_matches and i > 0:
                            continue
                        else:
                            all_samples.add(dd.get_sample_name(data))
    return list(all_samples)
def get_variants(data, include_germline=False):
    """Retrieve set of variant calls to use for heterogeneity analysis.

    Returns variant entries (dicts with `vrn_file` and `variantcaller`)
    ordered by the preference list in `supported`.
    """
    data = utils.deepish_copy(data)
    supported = ["precalled", "vardict", "vardict-java", "vardict-perl",
                 "freebayes", "octopus", "strelka2"]
    # Right now mutect2 and mutect do not provide heterozygous germline calls
    # to be useful https://github.com/bcbio/bcbio-nextgen/issues/2464
    # supported += ["mutect2", "mutect"]
    if include_germline:
        supported.insert(1, "gatk-haplotype")
    out = []
    # CWL based input
    if isinstance(data.get("variants"), dict) and "samples" in data["variants"]:
        cur_vs = []
        # Unpack single sample list of files
        if (isinstance(data["variants"]["samples"], (list, tuple)) and
                len(data["variants"]["samples"]) == 1 and
                isinstance(data["variants"]["samples"][0], (list, tuple))):
            data["variants"]["samples"] = data["variants"]["samples"][0]
        for fname in data["variants"]["samples"]:
            # Recover the caller name by stripping sample/batch prefixes
            # from the file basename.
            variantcaller = utils.splitext_plus(os.path.basename(fname))[0]
            variantcaller = variantcaller.replace(dd.get_sample_name(data) + "-", "")
            # Bug fix: dd.get_batches can return None for unbatched samples;
            # guard before iterating.
            for batch in (dd.get_batches(data) or []):
                variantcaller = variantcaller.replace(batch + "-", "")
            cur_vs.append({"vrn_file": fname, "variantcaller": variantcaller})
        data["variants"] = cur_vs
    for v in data.get("variants", []):
        if v["variantcaller"] in supported and v.get("vrn_file"):
            out.append((supported.index(v["variantcaller"]), v))
    out.sort()
    return [xs[1] for xs in out]
def batch(samples):
    """CWL: batch together per sample, joint and germline calls for ensemble combination.

    Sets up groups of same sample/batch variant calls for ensemble calling, as
    long as we have more than one caller per group.
    """
    samples = [utils.to_single_data(x) for x in samples]
    # Remember input ordering so output batches sort deterministically.
    sample_order = [dd.get_sample_name(x) for x in samples]
    batch_groups = collections.defaultdict(list)
    for data in samples:
        # Group by the tuple of samples in the call plus phenotype so tumor
        # and germline calls for the same batch stay separate.
        batch_samples = tuple(data.get("batch_samples", [dd.get_sample_name(data)]))
        batch_groups[(batch_samples, dd.get_phenotype(data))].append(data)
    out = []
    for (batch_samples, phenotype), gsamples in batch_groups.items():
        # Only worth ensembling when multiple callers contributed.
        if len(gsamples) > 1:
            batches = set([])
            for d in gsamples:
                # NOTE(review): assumes dd.get_batches returns an iterable;
                # other call sites in this file guard with `or []` -- confirm.
                batches |= set(dd.get_batches(d))
            cur = copy.deepcopy(gsamples[0])
            cur.update({"batch_id": sorted(list(batches))[0] if batches else "_".join(batch_samples),
                        "batch_samples": batch_samples,
                        "variants": {"variantcallers": [dd.get_variantcaller(d) for d in gsamples],
                                     "calls": [d.get("vrn_file") for d in gsamples]}})
            out.append(cur)

    def by_original_order(d):
        # Sort by the earliest input position of any sample in the batch.
        return min([sample_order.index(s) for s in d["batch_samples"] if s in sample_order])
    return sorted(out, key=by_original_order)
def _useful_basename(data):
    """Build an output file basename from the batch (or sample) name plus SV caller."""
    batches = dd.get_batches(data)
    base = batches[0] if batches else dd.get_sample_name(data)
    return "%s-%s" % (base, data["sv"]["variantcaller"])
def _clean_name(fname, data):
    """Remove standard prefixes from a filename before renaming with useful names.

    Strips batch names, the sample name and the SV caller name (with "-"/"_"
    separators, or bare) from the start of `fname`.
    """
    # Bug fix: dd.get_batches can return None for unbatched samples; guard
    # before concatenating with the other prefix candidates.
    for to_remove in (dd.get_batches(data) or []) + [dd.get_sample_name(data),
                                                     data["sv"]["variantcaller"]]:
        for ext in ("-", "_"):
            if fname.startswith("%s%s" % (to_remove, ext)):
                fname = fname[len(to_remove) + len(ext):]
        # Also strip a bare prefix without a separator.
        if fname.startswith(to_remove):
            fname = fname[len(to_remove):]
    return fname
def _check(sample, data):
    """Get input sample for each chip bam file."""
    if dd.get_chip_method(sample).lower() == "atac":
        # ATAC method: no input/background pairing to resolve.
        return [sample]
    if dd.get_phenotype(sample) == "input":
        # Input samples are consumed by their chip partner, not kept.
        return None
    target_batch = dd.get_batch(sample)
    for candidate in data:
        other = candidate[0]
        same_batch = target_batch in (dd.get_batches(other) or [])
        if same_batch and dd.get_phenotype(other) == "input":
            sample["work_bam_input"] = other.get("work_bam")
            return [sample]
    return [sample]
def _get_batch_name(items):
    """Retrieve the shared batch name for a group of items."""
    counts = collections.defaultdict(int)
    for data in items:
        names = dd.get_batches(data) or dd.get_sample_name(data)
        if not isinstance(names, (list, tuple)):
            names = [names]
        for name in names:
            counts[name] += 1
    # Most frequent batch wins; ties resolve to the first-seen name.
    return max(counts.items(), key=lambda kv: kv[1])[0]
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.

    Copies batch/sample VCFs into `variants/calls` and `variants/gvcf` output
    directories and records, per input sample, the list of output files it
    contributed to (`variants.samples`).
    """
    items = [utils.to_single_data(x) for x in utils.flatten(items)]
    items = [_normalize_vc_input(x) for x in items]
    items = validate.summarize_grading(items)
    items = [utils.to_single_data(x) for x in items]
    out = {
        "validate": validate.combine_validations(items),
        "variants": {
            "calls": [],
            "gvcf": [],
            "samples": []
        }
    }
    added = set([])
    variants_by_sample = collections.defaultdict(list)
    sample_order = []
    for data in items:
        # Track first-seen sample order so `variants.samples` output aligns
        # with the input ordering.
        batch_samples = data.get("batch_samples", [dd.get_sample_name(data)])
        for s in batch_samples:
            if s not in sample_order:
                sample_order.append(s)
        if data.get("vrn_file"):
            # Only get batches if we're actually doing variantcalling in bcbio
            # otherwise we'll be using the original files
            names = dd.get_batches(data) if dd.get_variantcaller(
                data) else None
            if not names:
                names = [dd.get_sample_name(data)]
            batch_name = names[0]
            # Joint calling: per-sample gVCF plus the squared-off batch calls.
            if data.get("vrn_file_joint") is not None:
                to_add = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                          ("vrn_file_joint", "calls", batch_name)]
            else:
                to_add = [("vrn_file", "calls", batch_name)]
            for vrn_key, out_key, name in to_add:
                cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
                out_file = os.path.join(
                    utils.safe_makedir(
                        os.path.join(dd.get_work_dir(data), "variants",
                                     out_key)), "%s.vcf.gz" % cur_name)
                for s in batch_samples:
                    variants_by_sample[s].append(out_file)
                if cur_name not in added:
                    added.add(cur_name)
                    # Ideally could symlink here but doesn't appear to work with
                    # Docker container runs on Toil where PATHs don't get remapped
                    utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
                    vcfutils.bgzip_and_index(out_file, data["config"])
                    out["variants"][out_key].append(out_file)
    for sample in sample_order:
        out["variants"]["samples"].append(variants_by_sample[sample])
    return [out]
def _add_batch(x, sample):
    """Potentially add batch name to an upload file.

    Chooses the longest batch name matching the file's prefix; falls back to
    the sample name when no batch matches.
    """
    fname = os.path.basename(x["path"])
    # Longest names first so the most specific batch prefix wins.
    for batch in sorted(dd.get_batches(sample) or [], key=len, reverse=True):
        if batch and fname.startswith("%s-" % batch):
            x["batch"] = batch
            break
    else:
        x["batch"] = dd.get_sample_name(sample)
    return x
def extract(data, items):
    """Extract germline calls for the given sample, if tumor/normal or prioritized."""
    if not vcfutils.get_paired_phenotype(data):
        return data
    # True tumor/normal pairs get germline extraction; otherwise we only
    # strip the somatic prioritization from the calls.
    paired = dd.get_batches(data) and len(items) > 1
    extractor = _extract_germline if paired else _remove_prioritization
    germline_vcf = extractor(data["vrn_file"], data)
    germline_vcf = vcfutils.bgzip_and_index(germline_vcf, data["config"])
    data["vrn_file_plus"] = {"germline": germline_vcf}
    return data
def summarize_sv(items):
    """CWL target: summarize structural variants for multiple samples.

    XXX Need to support non-VCF output as tabix indexed output
    """
    items = [
        utils.to_single_data(x)
        for x in vcvalidate.summarize_grading(items, "svvalidate")
    ]
    out = {
        "sv": {
            "calls": [],
            "prioritize": {
                "tsv": [],
                "raw": []
            }
        },
        "svvalidate": vcvalidate.combine_validations(items, "svvalidate")
    }
    added = set([])
    # Standard callers
    for data in items:
        if data.get("sv"):
            # Prefer the batch name for outputs; fall back to sample name.
            names = dd.get_batches(data)
            if not names:
                names = [dd.get_sample_name(data)]
            batch_name = names[0]
            cur_name = "%s-%s" % (batch_name, data["sv"]["variantcaller"])
            if data["sv"].get("vrn_file"):
                ext = utils.splitext_plus(data["sv"]["vrn_file"])[-1]
                # Only copy VCF outputs once per batch/caller combination.
                if cur_name not in added and ext.startswith(".vcf"):
                    added.add(cur_name)
                    out_file = os.path.join(
                        utils.safe_makedir(
                            os.path.join(dd.get_work_dir(data), "sv",
                                         "calls")),
                        "%s%s" % (cur_name, ext))
                    utils.copy_plus(data["sv"]["vrn_file"], out_file)
                    out_file = vcfutils.bgzip_and_index(
                        out_file, data["config"])
                    out["sv"]["calls"].append(out_file)
    # prioritization
    for pdata in _group_by_sample(items):
        prioritysv = [
            x for x in prioritize.run([utils.deepish_copy(pdata)])[0].get(
                "sv", []) if x["variantcaller"] == "sv-prioritize"
        ]
        if prioritysv:
            out["sv"]["prioritize"]["tsv"].append(prioritysv[0]["vrn_file"])
            out["sv"]["prioritize"]["raw"].extend(
                prioritysv[0]["raw_files"].values())
    return [out]
def _get_batch_name(items, skip_jointcheck=False):
    """Retrieve the shared batch name for a group of items."""
    counts = collections.defaultdict(int)
    joint = any(is_joint(d) for d in items)
    for data in items:
        if joint and not skip_jointcheck:
            # Joint calling groups per individual sample, not per batch.
            names = dd.get_sample_name(data)
        else:
            names = dd.get_batches(data) or dd.get_sample_name(data)
        if not isinstance(names, (list, tuple)):
            names = [names]
        for name in names:
            counts[name] += 1
    # Most frequent name wins; ties resolve to the first-seen name.
    return max(counts.items(), key=lambda kv: kv[1])[0]
def batch_for_jointvc(items):
    """Group samples into (batch, caller) groups for joint variant calling.

    Joint-called samples group by configured batch; others fall back to
    per-sample groups. Each grouped item records its gVCF input file.
    """
    batch_groups = collections.defaultdict(list)
    for data in [utils.to_single_data(x) for x in items]:
        vc = dd.get_variantcaller(data)
        if genotype.is_joint(data):
            batches = dd.get_batches(data) or dd.get_sample_name(data)
            if not isinstance(batches, (list, tuple)):
                batches = [batches]
        else:
            batches = [dd.get_sample_name(data)]
        for b in batches:
            # Copy per group so each batch gets an independent record.
            data = utils.deepish_copy(data)
            data["vrn_file_gvcf"] = data["vrn_file"]
            batch_groups[(b, vc)].append(data)
    return list(batch_groups.values())
def batch_for_jointvc(items):
    """Group samples into (batch, caller) groups for joint variant calling.

    Joint-called samples group by configured batch; others fall back to
    per-sample groups. Each grouped item records its gVCF input file.
    """
    batch_groups = collections.defaultdict(list)
    for data in [utils.to_single_data(x) for x in items]:
        vc = dd.get_variantcaller(data)
        if genotype.is_joint(data):
            batches = dd.get_batches(data) or dd.get_sample_name(data)
            if not isinstance(batches, (list, tuple)):
                batches = [batches]
        else:
            batches = [dd.get_sample_name(data)]
        for b in batches:
            # Copy per group so each batch gets an independent record.
            data = utils.deepish_copy(data)
            data["vrn_file_gvcf"] = data["vrn_file"]
            batch_groups[(b, vc)].append(data)
    # Bug fix: return a list rather than a Python 3 dict view so callers can
    # index/serialize the result (matches the sibling implementation).
    return list(batch_groups.values())
def batch_for_variantcall(samples):
    """Prepare a set of samples for parallel variant calling.

    CWL input target that groups samples into batches and variant callers
    for parallel processing.
    """
    to_process, extras = _dup_samples_by_variantcaller(samples, require_bam=False)
    grouped = collections.defaultdict(list)
    for raw in to_process:
        data = utils.to_single_data(raw)
        caller = get_variantcaller(data, require_bam=False)
        # Fall back to per-sample grouping when no batch is configured.
        names = dd.get_batches(data) or dd.get_sample_name(data)
        if not isinstance(names, (list, tuple)):
            names = [names]
        for name in names:
            grouped[(name, caller)].append(utils.deepish_copy(data))
    return list(grouped.values()) + extras
def _get_vcf_samples(calls, data):
    """Map VCF call files back to the sample they belong to by name prefix."""
    have_full_file = False
    all_samples = set([])
    for f in utils.flatten(calls):
        if have_full_file:
            cur = set(vcfutils.get_samples(f))
            if cur:
                # Intersect across files so only shared samples remain.
                if not all_samples:
                    all_samples = cur
                else:
                    all_samples &= set(cur)
        else:
            # Bug fix: dd.get_batches can return None for unbatched samples;
            # guard before concatenating with the sample name.
            for test_name in [dd.get_sample_name(data)] + (dd.get_batches(data) or []):
                if os.path.basename(f).startswith("%s-" % test_name):
                    all_samples.add(dd.get_sample_name(data))
    return list(all_samples)
def run_jointvc(items):
    """Run joint variant calling for a genomic region across a batch of samples.

    Uses the first item for shared configuration and squares off the per-sample
    gVCF inputs over the region, recording the result in `vrn_file_region`.
    """
    items = [utils.to_single_data(x) for x in items]
    data = items[0]
    if not dd.get_jointcaller(data):
        # Derive a joint caller name from the per-sample caller when unset.
        data["config"]["algorithm"]["jointcaller"] = "%s-joint" % dd.get_variantcaller(data)
    # GenomicsDBImport uses 1-based coordinates. That's unexpected, convert over to these.
    chrom, coords = data["region"].split(":")
    start, end = coords.split("-")
    ready_region = "%s:%s-%s" % (chrom, int(start) + 1, end)
    str_region = ready_region.replace(":", "_")
    # Bug fix: dd.get_batches can return None for unbatched samples, which
    # made the [0] index below crash; fall back to the sample name (matches
    # the sibling implementation of this function).
    batches = dd.get_batches(data) or dd.get_sample_name(data)
    if not isinstance(batches, (list, tuple)):
        batches = [batches]
    out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "joint",
                                                            dd.get_variantcaller(data), str_region)),
                            "%s-%s-%s.vcf.gz" % (batches[0], dd.get_variantcaller(data), str_region))
    joint_out = square_batch_region(data, ready_region, [], [d["vrn_file"] for d in items], out_file)[0]
    data["vrn_file_region"] = joint_out["vrn_file"]
    return data
def extract(data, items):
    """Extract germline calls for the given sample, if tumor only.

    For germline calling done separately, fix VCF sample naming to match.
    """
    if vcfutils.get_paired_phenotype(data):
        # Tumor-only (single item in the batch): strip somatic prioritization
        # to recover a germline-usable VCF.
        if dd.get_batches(data) and len(items) == 1:
            germline_vcf = _remove_prioritization(data["vrn_file"], data)
            germline_vcf = vcfutils.bgzip_and_index(germline_vcf, data["config"])
            data["vrn_file_plus"] = {"germline": germline_vcf}
    elif dd.get_phenotype(data) == "germline":
        sample_name = dd.get_sample_name(data)
        vcf_samples = vcfutils.get_samples(data["vrn_file"])
        # Rename the VCF sample when bcbio appended "-germline" to the sample
        # name but the single-sample VCF still carries the original name.
        if (sample_name.endswith("-germline") and len(vcf_samples) == 1
                and sample_name.replace("-germline", "") == vcf_samples[0]):
            data["vrn_file"] = _fix_germline_samplename(data["vrn_file"], sample_name, data)
    return data
def extract(data, items):
    """Extract germline calls for the given sample, if tumor only.

    For germline calling done separately, fix VCF sample naming to match.
    """
    if vcfutils.get_paired_phenotype(data):
        # Tumor-only (single item in the batch): strip somatic prioritization
        # to recover a germline-usable VCF.
        if dd.get_batches(data) and len(items) == 1:
            germline_vcf = _remove_prioritization(data["vrn_file"], data)
            germline_vcf = vcfutils.bgzip_and_index(germline_vcf, data["config"])
            data["vrn_file_plus"] = {"germline": germline_vcf}
    elif dd.get_phenotype(data) == "germline":
        sample_name = dd.get_sample_name(data)
        vcf_samples = vcfutils.get_samples(data["vrn_file"])
        # Rename the VCF sample when bcbio appended "-germline" to the sample
        # name but the single-sample VCF still carries the original name.
        if (sample_name.endswith("-germline") and len(vcf_samples) == 1
                and sample_name.replace("-germline", "") == vcf_samples[0]):
            data["vrn_file"] = fix_germline_samplename(data["vrn_file"], sample_name, data)
    return data
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.

    Copies batch/sample VCFs into `variants/calls` and `variants/gvcf` output
    directories and records, per input sample, the list of output files it
    contributed to (`variants.samples`).
    """
    items = [utils.to_single_data(x) for x in utils.flatten(items)]
    items = [_normalize_vc_input(x) for x in items]
    items = validate.summarize_grading(items)
    items = [utils.to_single_data(x) for x in items]
    out = {"validate": validate.combine_validations(items),
           "variants": {"calls": [], "gvcf": [], "samples": []}}
    added = set([])
    variants_by_sample = collections.defaultdict(list)
    sample_order = []
    for data in items:
        # Track first-seen sample order so `variants.samples` output aligns
        # with the input ordering.
        batch_samples = data.get("batch_samples", [dd.get_sample_name(data)])
        for s in batch_samples:
            if s not in sample_order:
                sample_order.append(s)
        if data.get("vrn_file"):
            # Only get batches if we're actually doing variantcalling in bcbio
            # otherwise we'll be using the original files
            names = dd.get_batches(data) if dd.get_variantcaller(data) else None
            if not names:
                names = [dd.get_sample_name(data)]
            batch_name = names[0]
            # Joint calling: per-sample gVCF plus the squared-off batch calls.
            if data.get("vrn_file_joint") is not None:
                to_add = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                          ("vrn_file_joint", "calls", batch_name)]
            else:
                to_add = [("vrn_file", "calls", batch_name)]
            for vrn_key, out_key, name in to_add:
                cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
                out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                                                        "variants", out_key)),
                                        "%s.vcf.gz" % cur_name)
                for s in batch_samples:
                    variants_by_sample[s].append(out_file)
                if cur_name not in added:
                    added.add(cur_name)
                    # Ideally could symlink here but doesn't appear to work with
                    # Docker container runs on Toil where PATHs don't get remapped
                    utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
                    vcfutils.bgzip_and_index(out_file, data["config"])
                    out["variants"][out_key].append(out_file)
    for sample in sample_order:
        out["variants"]["samples"].append(variants_by_sample[sample])
    return [out]
def batch_for_variantcall(samples):
    """Prepare a set of samples for parallel variant calling.

    CWL input target that groups samples into batches and variant callers
    for parallel processing.
    """
    to_process, extras = _dup_samples_by_variantcaller(samples, require_bam=False)
    unpacked = [utils.to_single_data(x) for x in to_process]
    grouped = collections.defaultdict(list)
    for data in cwlutils.samples_to_records(unpacked):
        caller = get_variantcaller(data, require_bam=False)
        # Fall back to per-sample grouping when no batch is configured.
        names = dd.get_batches(data) or dd.get_sample_name(data)
        if not isinstance(names, (list, tuple)):
            names = [names]
        for name in names:
            grouped[(name, caller)].append(utils.deepish_copy(data))
    return list(grouped.values()) + extras
def run_jointvc(items):
    """Run joint variant calling for a genomic region across a batch of samples.

    Uses the first item for shared configuration and squares off the per-sample
    gVCF inputs over the region, recording the result in `vrn_file_region`.
    """
    items = [utils.to_single_data(x) for x in items]
    data = items[0]
    if not dd.get_jointcaller(data):
        # Derive a joint caller name from the per-sample caller when unset.
        data["config"]["algorithm"]["jointcaller"] = "%s-joint" % dd.get_variantcaller(data)
    # GenomicsDBImport uses 1-based coordinates. That's unexpected, convert over to these.
    chrom, coords = data["region"].split(":")
    start, end = coords.split("-")
    ready_region = "%s:%s-%s" % (chrom, int(start) + 1, end)
    str_region = ready_region.replace(":", "_")
    # Fall back to the sample name when no batch is configured.
    batches = dd.get_batches(data) or dd.get_sample_name(data)
    if not isinstance(batches, (list, tuple)):
        batches = [batches]
    out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "joint",
                                                            dd.get_variantcaller(data), str_region)),
                            "%s-%s-%s.vcf.gz" % (batches[0], dd.get_variantcaller(data), str_region))
    joint_out = square_batch_region(data, ready_region, [], [d["vrn_file"] for d in items], out_file)[0]
    data["vrn_file_region"] = joint_out["vrn_file"]
    return data
def batch(samples):
    """CWL: batch together per sample, joint and germline calls for ensemble combination.

    Sets up groups of same sample/batch variant calls for ensemble calling, as
    long as we have more than one caller per group.
    """
    samples = [utils.to_single_data(x) for x in samples]
    # Remember input ordering so output batches sort deterministically.
    sample_order = [dd.get_sample_name(x) for x in samples]
    batch_groups = collections.defaultdict(list)
    for data in samples:
        # Group by the tuple of samples in the call plus phenotype so tumor
        # and germline calls for the same batch stay separate.
        batch_samples = tuple(
            data.get("batch_samples", [dd.get_sample_name(data)]))
        batch_groups[(batch_samples, dd.get_phenotype(data))].append(data)
    out = []
    for (batch_samples, phenotype), gsamples in batch_groups.items():
        # Only worth ensembling when multiple callers contributed.
        if len(gsamples) > 1:
            batches = set([])
            for d in gsamples:
                # NOTE(review): assumes dd.get_batches returns an iterable;
                # other call sites in this file guard with `or []` -- confirm.
                batches |= set(dd.get_batches(d))
            # Deterministic caller ordering for reproducible ensemble inputs.
            gsamples.sort(key=dd.get_variantcaller_order)
            cur = copy.deepcopy(gsamples[0])
            cur.update({
                "batch_id":
                sorted(list(batches))[0] if batches else "_".join(batch_samples),
                "batch_samples":
                batch_samples,
                "variants": {
                    "variantcallers":
                    [dd.get_variantcaller(d) for d in gsamples],
                    "calls": [d.get("vrn_file") for d in gsamples]
                }
            })
            out.append(cur)

    def by_original_order(d):
        # Sort by the earliest input position of any sample in the batch.
        return min([
            sample_order.index(s) for s in d["batch_samples"]
            if s in sample_order
        ])
    return sorted(out, key=by_original_order)
def somatic_batches(items):
    """Group items into somatic calling batches (tumor-only or tumor/normal).

    Returns batches, where a data item may be in pairs, and somatic and
    non_somatic (which are the original list of items).
    """
    non_somatic = []
    somatic = []
    data_by_batches = defaultdict(list)
    for data in items:
        if not get_paired_phenotype(data):
            non_somatic.append(data)
        else:
            somatic.append(data)
            batches = dd.get_batches(data)
            if batches:
                for batch in batches:
                    data_by_batches[batch].append(data)
    # Bug fix: return a concrete list rather than a Python 3 dict view so
    # callers can index and re-iterate the batches (matches the other
    # batching helpers in this module).
    return list(data_by_batches.values()), somatic, non_somatic
def batch_for_variantcall(samples):
    """Prepare a set of samples for parallel variant calling.

    CWL input target that groups samples into batches and variant callers
    for parallel processing.

    If doing joint calling, with `tools_on: [gvcf]`, split the sample into
    individuals instead of combining into a batch.
    """
    # Remember incoming order so output batches sort deterministically.
    sample_order = [
        dd.get_sample_name(utils.to_single_data(x)) for x in samples
    ]
    to_process, extras = _dup_samples_by_variantcaller(samples, require_bam=False)
    batch_groups = collections.defaultdict(list)
    to_process = [utils.to_single_data(x) for x in to_process]
    for data in cwlutils.samples_to_records(to_process):
        vc = get_variantcaller(data, require_bam=False)
        # Fall back to per-sample grouping when no batch is configured.
        batches = dd.get_batches(data) or dd.get_sample_name(data)
        if not isinstance(batches, (list, tuple)):
            batches = [batches]
        for b in batches:
            batch_groups[(b, vc)].append(utils.deepish_copy(data))
    batches = []
    for cur_group in batch_groups.values():
        # Joint calling processes each sample individually; the group is
        # split into one-item batches instead of a single combined batch.
        joint_calling = any([is_joint(d) for d in cur_group])
        if joint_calling:
            for d in cur_group:
                batches.append([d])
        else:
            batches.append(cur_group)

    def by_original_order(xs):
        # Order by earliest input sample position, then by caller priority.
        return (min([sample_order.index(dd.get_sample_name(x)) for x in xs]),
                min([dd.get_variantcaller_order(x) for x in xs]))
    return sorted(batches + extras, key=by_original_order)
def get_variants(data, include_germline=False):
    """Retrieve set of variant calls to use for heterogeneity analysis.

    Returns variant entries (dicts with `vrn_file` and `variantcaller`)
    ordered by the preference list in `supported`.
    """
    data = utils.deepish_copy(data)
    supported = [
        "precalled", "vardict", "vardict-java", "vardict-perl", "freebayes",
        "octopus", "strelka2"
    ]
    # Right now mutect2 and mutect do not provide heterozygous germline calls
    # to be useful https://github.com/bcbio/bcbio-nextgen/issues/2464
    # supported += ["mutect2", "mutect"]
    if include_germline:
        supported.insert(1, "gatk-haplotype")
    out = []
    # CWL based input
    if isinstance(data.get("variants"), dict) and "samples" in data["variants"]:
        cur_vs = []
        # Unpack single sample list of files
        if (isinstance(data["variants"]["samples"], (list, tuple))
                and len(data["variants"]["samples"]) == 1
                and isinstance(data["variants"]["samples"][0], (list, tuple))):
            data["variants"]["samples"] = data["variants"]["samples"][0]
        for fname in data["variants"]["samples"]:
            # Recover the caller name by stripping sample/batch prefixes
            # from the file basename.
            variantcaller = utils.splitext_plus(os.path.basename(fname))[0]
            variantcaller = variantcaller.replace(
                dd.get_sample_name(data) + "-", "")
            # NOTE(review): assumes dd.get_batches returns an iterable here;
            # other call sites in this file guard with `or []` -- confirm.
            for batch in dd.get_batches(data):
                variantcaller = variantcaller.replace(batch + "-", "")
            cur_vs.append({"vrn_file": fname, "variantcaller": variantcaller})
        data["variants"] = cur_vs
    for v in data.get("variants", []):
        if v["variantcaller"] in supported and v.get("vrn_file"):
            out.append((supported.index(v["variantcaller"]), v))
    out.sort()
    return [xs[1] for xs in out]
def summarize_sv(items):
    """CWL target: summarize structural variants for multiple samples.

    XXX Need to support non-VCF output as tabix indexed output
    """
    items = [
        utils.to_single_data(x)
        for x in vcvalidate.summarize_grading(items, "svvalidate")
    ]
    out = {
        "sv": {
            "calls": []
        },
        "svvalidate": vcvalidate.combine_validations(items, "svvalidate")
    }
    added = set([])
    for data in items:
        if data.get("sv"):
            # Prefer the batch name for outputs; fall back to sample name.
            names = dd.get_batches(data)
            if not names:
                names = [dd.get_sample_name(data)]
            batch_name = names[0]
            cur_name = "%s-%s" % (batch_name, data["sv"]["variantcaller"])
            if data["sv"].get("vrn_file"):
                ext = utils.splitext_plus(data["sv"]["vrn_file"])[-1]
                # Only copy VCF outputs once per batch/caller combination.
                if cur_name not in added and ext.startswith(".vcf"):
                    added.add(cur_name)
                    out_file = os.path.join(
                        utils.safe_makedir(
                            os.path.join(dd.get_work_dir(data), "sv",
                                         "calls")),
                        "%s%s" % (cur_name, ext))
                    utils.copy_plus(data["sv"]["vrn_file"], out_file)
                    out_file = vcfutils.bgzip_and_index(
                        out_file, data["config"])
                    out["sv"]["calls"].append(out_file)
    return [out]
def batch_for_variantcall(samples):
    """Prepare a set of samples for parallel variant calling.

    CWL input target that groups samples into batches and variant callers
    for parallel processing.
    """
    from bcbio.pipeline import run_info
    # CWL record keys that must always be list-valued for downstream steps.
    convert_to_list = set(["config__algorithm__tools_on", "config__algorithm__tools_off"])
    # Keys every record should carry even if absent from the inputs.
    default_keys = set(["metadata__batch", "config__algorithm__validate",
                        "config__algorithm__validate_regions"])
    to_process, extras = _dup_samples_by_variantcaller(samples, require_bam=False)
    batch_groups = collections.defaultdict(list)
    to_process = [utils.to_single_data(x) for x in to_process]
    # Union of CWL keys across samples plus the always-expected defaults.
    all_keys = set([])
    for data in to_process:
        all_keys.update(set(data["cwl_keys"]))
    all_keys.update(default_keys)
    for data in to_process:
        for raw_key in sorted(list(all_keys)):
            # "__"-separated raw keys address nested dictionary positions.
            key = raw_key.split("__")
            if tz.get_in(key, data) is None:
                # Fill in missing keys with None so records are uniform.
                data = tz.update_in(data, key, lambda x: None)
                data["cwl_keys"].append(raw_key)
            if raw_key in convert_to_list:
                # Normalize scalar/None values into lists.
                val = tz.get_in(key, data)
                if not val:
                    val = []
                elif not isinstance(val, (list, tuple)):
                    val = [val]
                data = tz.update_in(data, key, lambda x: val)
        vc = get_variantcaller(data, require_bam=False)
        data["metadata"] = run_info.add_metadata_defaults(data.get("metadata", {}))
        # Fall back to per-sample grouping when no batch is configured.
        batches = dd.get_batches(data) or dd.get_sample_name(data)
        if not isinstance(batches, (list, tuple)):
            batches = [batches]
        for b in batches:
            batch_groups[(b, vc)].append(utils.deepish_copy(data))
    return list(batch_groups.values()) + extras