def _combine_multiple_svcallers(samples): """ """ by_bam = collections.OrderedDict() for x in samples: try: by_bam[x[0]["align_bam"]].append(x[0]) except KeyError: by_bam[x[0]["align_bam"]] = [x[0]] highdepths = filter( lambda x: x is not None, list( set([ tz.get_in(["config", "algorithm", "highdepth_regions"], x[0]) for x in samples ]))) out = [] for grouped_calls in by_bam.values(): def orig_svcaller_order(x): return _get_svcallers(x).index( x["config"]["algorithm"]["svcaller_active"]) sorted_svcalls = sorted([x for x in grouped_calls if "sv" in x], key=orig_svcaller_order) final_calls = reduce(operator.add, [x["sv"] for x in sorted_svcalls]) final = grouped_calls[0] final_calls = ensemble.summarize(final_calls, final, highdepths) final_calls = validate.evaluate(final, final_calls) final["sv"] = final_calls del final["config"]["algorithm"]["svcaller_active"] out.append([final]) return out
def detect_sv(items, all_items=None, stage="standard"):
    """Top level parallel target for examining structural variation.

    ``items`` is the sample/sv-caller list from one batch.  Each entry is
    normalized with ``utils.to_single_data`` and passed through
    ``cwlutils.unpack_tarballs``.  The caller function is looked up from the
    ``svcaller`` setting of the first item.  When that caller is listed in
    ``_NEEDS_BACKGROUND``, ``all_items`` is provided and the batch is not a
    paired analysis, the samples of ``all_items`` that are not part of this
    batch are passed along as background.

    Returns a list of one-element lists.  For CWL runs each result is also
    validated, its ``sv`` entry is flattened to a single dictionary and
    ``_add_supplemental`` is applied.
    """
    items = [utils.to_single_data(x) for x in items]
    items = cwlutils.unpack_tarballs(items, items[0])
    lead = items[0]
    svcaller = lead["config"]["algorithm"].get("svcaller")
    caller_fn = _get_callers(items, stage, special_cases=True).get(svcaller)
    batch = dd.get_batch(lead)
    # no SV calling when just creating a PON for PureCN
    if batch == "pon_build" and "purecn" in dd.get_svcaller(lead):
        return []

    if not (svcaller and caller_fn):
        # No usable caller: pass the inputs through unchanged.
        out = [[data] for data in items]
    else:
        use_background = (
            all_items
            and svcaller in _NEEDS_BACKGROUND
            and not vcfutils.is_paired_analysis(
                [x.get("align_bam") for x in items], items)
        )
        if use_background:
            batch_names = {dd.get_sample_name(x) for x in items}
            background = [x for x in all_items
                          if dd.get_sample_name(x) not in batch_names]
            out = [[svdata] for svdata in caller_fn(items, background)]
        else:
            out = [[svdata] for svdata in caller_fn(items)]

    if not cwlutils.is_cwl_run(lead):
        return out

    # Avoid nesting of callers for CWL runs for easier extraction
    unwrapped = [utils.to_single_data(x) for x in out]
    out_cwl = []
    for data in unwrapped:
        # Run validation directly from CWL runs since we're single stage
        data = validate.evaluate(data)
        data["svvalidate"] = {
            "summary": tz.get_in(["sv-validate", "csv"], data)
        }
        svs = data.get("sv")
        if not svs:
            data["sv"] = {}
        else:
            assert len(svs) == 1, svs
            data["sv"] = svs[0]
        data = _add_supplemental(data)
        out_cwl.append([data])
    return out_cwl
def finalize_sv(samples, config, initial_only=False):
    """Combine results from multiple sv callers into a single ordered 'sv' key.

    Samples are grouped by ``align_bam``.  Within each group the ``sv``
    values are ordered by the position of ``svcaller_active`` in
    ``_get_svcallers`` and combined with ``operator.add``.  Unless
    ``initial_only`` is set, the combined calls are passed through every
    configured caller found in ``_ENSEMBLE_CALLERS``, then through
    ``ensemble.summarize`` and ``validate.evaluate``.  The first sample of
    each group is updated in place and loses its ``svcaller_active`` entry.

    Groups are then collected by batch, falling back to the sample name when
    no batch is set.  A batch goes through ``plot.by_regions`` when any of
    its samples has ``svplots`` in its ``tools_on`` setting.  Each sample is
    emitted only for its first ("lead") batch.

    ``config`` is accepted but not used here.

    Returns a list of one-element lists.
    """
    by_bam = collections.OrderedDict()
    for sample in samples:
        by_bam.setdefault(sample["align_bam"], []).append(sample)

    def caller_rank(data):
        # Position of the sample's active caller in its configured list.
        active = data["config"]["algorithm"]["svcaller_active"]
        return _get_svcallers(data).index(active)

    by_batch = collections.OrderedDict()
    lead_batches = {}
    for grouped_calls in by_bam.values():
        with_sv = [data for data in grouped_calls if "sv" in data]
        with_sv.sort(key=caller_rank)
        final = grouped_calls[0]
        if with_sv:
            final_calls = reduce(operator.add, [data["sv"] for data in with_sv])
            if not initial_only:
                for caller in _get_svcallers(final):
                    if caller in _ENSEMBLE_CALLERS:
                        final_calls = _ENSEMBLE_CALLERS[caller](final_calls, final)
                final_calls = ensemble.summarize(final_calls, final, grouped_calls)
                final_calls = validate.evaluate(final, final_calls)
            final["sv"] = final_calls
        del final["config"]["algorithm"]["svcaller_active"]

        batch = dd.get_batch(final) or dd.get_sample_name(final)
        if isinstance(batch, (list, tuple)):
            batches = batch
        else:
            batches = [batch]
        # The first batch listed is the one the sample is reported under.
        lead_batches[dd.get_sample_name(final)] = batches[0]
        for batch_name in batches:
            by_batch.setdefault(batch_name, []).append(final)

    out = []
    for batch, items in by_batch.items():
        wants_plots = any("svplots" in dd.get_tools_on(d) for d in items)
        plot_items = plot.by_regions(items) if wants_plots else items
        out.extend([data] for data in plot_items
                   if lead_batches[dd.get_sample_name(data)] == batch)
    return out
def finalize_sv(samples, config):
    """Combine results from multiple sv callers into a single ordered 'sv' key.

    Handles ensemble calling and plotting of results.

    Samples are grouped by ``align_bam``.  Within each group the ``sv``
    values are ordered by the position of ``svcaller_active`` in
    ``_get_svcallers``, combined with ``operator.add``, and passed through
    ``ensemble.summarize`` (with the non-None ``highdepth_regions`` settings
    found across all samples) and ``validate.evaluate``.  The first sample of
    each group is updated in place and loses its ``svcaller_active`` entry.

    Groups are then collected by batch, falling back to the sample name when
    no batch is set.  Every batch goes through ``plot.by_regions``, and each
    sample is emitted only for its first ("lead") batch.

    ``config`` is accepted but not used here.

    Returns a list of one-element lists.
    """
    by_bam = collections.OrderedDict()
    for x in samples:
        by_bam.setdefault(x["align_bam"], []).append(x)

    # Build a real list: a lazy ``filter`` object (Python 3) would be consumed
    # by the first group and look empty to every later ``summarize`` call.
    highdepths = [
        region
        for region in set(
            tz.get_in(["config", "algorithm", "highdepth_regions"], x)
            for x in samples
        )
        if region is not None
    ]

    def orig_svcaller_order(x):
        # Rank a sample by where its active caller sits in the configured list.
        return _get_svcallers(x).index(x["config"]["algorithm"]["svcaller_active"])

    by_batch = collections.OrderedDict()
    lead_batches = {}
    for grouped_calls in by_bam.values():
        sorted_svcalls = sorted([x for x in grouped_calls if "sv" in x],
                                key=orig_svcaller_order)
        final = grouped_calls[0]
        if sorted_svcalls:
            final_calls = reduce(operator.add, [x["sv"] for x in sorted_svcalls])
            final_calls = ensemble.summarize(final_calls, final, highdepths)
            final_calls = validate.evaluate(final, final_calls)
            final["sv"] = final_calls
        del final["config"]["algorithm"]["svcaller_active"]
        batch = dd.get_batch(final) or dd.get_sample_name(final)
        batches = batch if isinstance(batch, (list, tuple)) else [batch]
        # The first batch listed is the one the sample is reported under.
        lead_batches[dd.get_sample_name(final)] = batches[0]
        for batch_name in batches:
            by_batch.setdefault(batch_name, []).append(final)

    out = []
    for batch, items in by_batch.items():
        plot_items = plot.by_regions(items)
        for data in plot_items:
            if lead_batches[dd.get_sample_name(data)] == batch:
                out.append([data])
    return out
def detect_sv(items, all_items=None, stage="standard"):
    """Top level parallel target for examining structural variation.

    Each entry of ``items`` is normalized with ``utils.to_single_data``.  The
    caller function is looked up from the ``svcaller`` setting of the first
    item.  When that caller is listed in ``_NEEDS_BACKGROUND``, ``all_items``
    is provided and the batch is not a paired analysis, the samples of
    ``all_items`` that are not part of ``items`` are passed along as
    background.

    Returns a list of one-element lists.  When the first item carries a
    ``cwl_keys`` entry, each result is also validated and a non-empty ``sv``
    list is replaced by its single element.
    """
    items = [utils.to_single_data(x) for x in items]
    lead = items[0]
    svcaller = lead["config"]["algorithm"].get("svcaller")
    caller_fn = _get_callers(items, stage, special_cases=True).get(svcaller)

    if not (svcaller and caller_fn):
        # No usable caller: pass the inputs through unchanged.
        out = [[data] for data in items]
    else:
        use_background = (
            all_items
            and svcaller in _NEEDS_BACKGROUND
            and not vcfutils.is_paired_analysis(
                [x.get("align_bam") for x in items], items)
        )
        if use_background:
            batch_names = {dd.get_sample_name(x) for x in items}
            background = [x for x in all_items
                          if dd.get_sample_name(x) not in batch_names]
            out = [[svdata] for svdata in caller_fn(items, background)]
        else:
            out = [[svdata] for svdata in caller_fn(items)]

    if "cwl_keys" not in lead:
        return out

    # Avoid nesting of callers for CWL runs for easier extraction
    unwrapped = [utils.to_single_data(x) for x in out]
    out_cwl = []
    for data in unwrapped:
        # Run validation directly from CWL runs since we're single stage
        data = validate.evaluate(data)
        data["svvalidate"] = {
            "summary": tz.get_in(["sv-validate", "csv"], data)
        }
        svs = data.get("sv")
        if svs:
            assert len(svs) == 1, svs
            data["sv"] = svs[0]
        out_cwl.append([data])
    return out_cwl
def detect_sv(items, all_items=None, stage="standard"):
    """Top level parallel target for examining structural variation.

    Each entry of ``items`` is normalized with ``utils.to_single_data`` and
    passed through ``cwlutils.unpack_tarballs``.  The caller function is
    looked up from the ``svcaller`` setting of the first item.  When that
    caller is listed in ``_NEEDS_BACKGROUND``, ``all_items`` is provided and
    the batch is not a paired analysis, the samples of ``all_items`` that are
    not part of ``items`` are passed along as background.

    Returns a list of one-element lists.  For CWL runs each result is also
    validated, its ``sv`` entry is flattened to a single dictionary (empty
    when there are no calls) and ``_add_supplemental`` is applied.
    """
    items = [utils.to_single_data(x) for x in items]
    items = cwlutils.unpack_tarballs(items, items[0])
    lead = items[0]
    svcaller = lead["config"]["algorithm"].get("svcaller")
    caller_fn = _get_callers(items, stage, special_cases=True).get(svcaller)

    if not (svcaller and caller_fn):
        # No usable caller: pass the inputs through unchanged.
        out = [[data] for data in items]
    else:
        use_background = (
            all_items
            and svcaller in _NEEDS_BACKGROUND
            and not vcfutils.is_paired_analysis(
                [x.get("align_bam") for x in items], items)
        )
        if use_background:
            batch_names = {dd.get_sample_name(x) for x in items}
            background = [x for x in all_items
                          if dd.get_sample_name(x) not in batch_names]
            out = [[svdata] for svdata in caller_fn(items, background)]
        else:
            out = [[svdata] for svdata in caller_fn(items)]

    if not cwlutils.is_cwl_run(lead):
        return out

    # Avoid nesting of callers for CWL runs for easier extraction
    unwrapped = [utils.to_single_data(x) for x in out]
    out_cwl = []
    for data in unwrapped:
        # Run validation directly from CWL runs since we're single stage
        data = validate.evaluate(data)
        data["svvalidate"] = {"summary": tz.get_in(["sv-validate", "csv"], data)}
        svs = data.get("sv")
        if not svs:
            data["sv"] = {}
        else:
            assert len(svs) == 1, svs
            data["sv"] = svs[0]
        data = _add_supplemental(data)
        out_cwl.append([data])
    return out_cwl
def _combine_multiple_svcallers(samples): """ """ by_bam = collections.OrderedDict() for x in samples: try: by_bam[x[0]["align_bam"]].append(x[0]) except KeyError: by_bam[x[0]["align_bam"]] = [x[0]] out = [] for grouped_calls in by_bam.values(): def orig_svcaller_order(x): return _get_svcallers(x).index(x["config"]["algorithm"]["svcaller_active"]) sorted_svcalls = sorted([x for x in grouped_calls if "sv" in x], key=orig_svcaller_order) final_calls = reduce(operator.add, [x["sv"] for x in sorted_svcalls]) final = grouped_calls[0] final_calls = ensemble.summarize(final_calls, final) final_calls = validate.evaluate(final, final_calls) final["sv"] = final_calls del final["config"]["algorithm"]["svcaller_active"] out.append([final]) return out
def validate_sv(data):
    """Run structural variant validation for a single sample.

    Passes ``data`` to ``validate.evaluate`` and returns the result wrapped
    in a doubly nested list.
    """
    evaluated = validate.evaluate(data)
    return [[evaluated]]