def _normalize_sv_coverage_cnvkit(group_id, inputs, backgrounds, work_dir, back_files, out_files): """Normalize CNV coverage depths by GC, repeats and background using CNVkit - reference: calculates reference backgrounds from normals and pools including GC and repeat information - fix: Uses background to normalize coverage estimations http://cnvkit.readthedocs.io/en/stable/pipeline.html#fix """ from bcbio.structural import cnvkit cnns = reduce(operator.add, [[ tz.get_in(["depth", "bins", "target"], x), tz.get_in(["depth", "bins", "antitarget"], x) ] for x in backgrounds], []) for d in inputs: if tz.get_in(["depth", "bins", "target"], d): target_bed = tz.get_in(["depth", "bins", "target"], d) antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], d) input_backs = set( filter(lambda x: x is not None, [dd.get_background_cnv_reference(d, "cnvkit") for d in inputs])) if input_backs: assert len( input_backs ) == 1, "Multiple backgrounds in group: %s" % list(input_backs) back_file = list(input_backs)[0] else: back_file = cnvkit.cnvkit_background( cnns, os.path.join(work_dir, "background-%s-cnvkit.cnn" % (group_id)), backgrounds or inputs, target_bed, antitarget_bed) fix_cmd_inputs = [] for data in inputs: work_dir = utils.safe_makedir( os.path.join(dd.get_work_dir(data), "structural", dd.get_sample_name(data), "bins")) if tz.get_in(["depth", "bins", "target"], data): fix_file = os.path.join( work_dir, "%s-normalized.cnr" % (dd.get_sample_name(data))) fix_cmd_inputs.append((tz.get_in(["depth", "bins", "target"], data), tz.get_in(["depth", "bins", "antitarget"], data), back_file, fix_file, data)) out_files[dd.get_sample_name(data)] = fix_file back_files[dd.get_sample_name(data)] = back_file parallel = { "type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"] } run_multicore(cnvkit.run_fix_parallel, fix_cmd_inputs, inputs[0]["config"], parallel) return back_files, out_files
def normalize_sv_coverage(*items): """Normalize CNV coverage depths by GC, repeats and background. Provides normalized output based on CNVkit approaches, provides a point for providing additional methods in the future: - reference: calculates reference backgrounds from normals and pools including GC and repeat information - fix: Uses background to normalize coverage estimations http://cnvkit.readthedocs.io/en/stable/pipeline.html#fix """ from bcbio.structural import cnvkit from bcbio.structural import shared as sshared orig_items = items items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)] if all(not cnvkit.use_general_sv_bins(x) for x in items): return orig_items out_files = {} for group_id, gitems in itertools.groupby(items, lambda x: tz.get_in(["regions", "bins", "group"], x)): inputs, backgrounds = sshared.find_case_control(list(gitems)) cnns = reduce(operator.add, [[tz.get_in(["depth", "bins", "target"], x), tz.get_in(["depth", "bins", "antitarget"], x)] for x in backgrounds], []) assert inputs, "Did not find inputs for sample batch: %s" % (" ".join(dd.get_sample_name(x) for x in items)) for d in inputs: if tz.get_in(["depth", "bins", "target"], d): target_bed = tz.get_in(["depth", "bins", "target"], d) antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], d) work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(inputs[00]), "structural", dd.get_sample_name(inputs[0]), "bins")) back_file = cnvkit.cnvkit_background(cnns, os.path.join(work_dir, "background-%s-cnvkit.cnn" % (group_id)), backgrounds or inputs, target_bed, antitarget_bed) for data in inputs: work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural", dd.get_sample_name(data), "bins")) if tz.get_in(["depth", "bins", "target"], data): fix_file = cnvkit.run_fix(tz.get_in(["depth", "bins", "target"], data), tz.get_in(["depth", "bins", "antitarget"], data), back_file, os.path.join(work_dir, "%s-normalized.cnr" % (dd.get_sample_name(data))), data) out_files[dd.get_sample_name(data)] = fix_file out = [] for data in items: if dd.get_sample_name(data) in out_files: data["depth"]["bins"]["normalized"] = out_files[dd.get_sample_name(data)] out.append([data]) return out
def _normalize_sv_coverage_cnvkit(group_id, inputs, backgrounds, work_dir, back_files, out_files): """Normalize CNV coverage depths by GC, repeats and background using CNVkit - reference: calculates reference backgrounds from normals and pools including GC and repeat information - fix: Uses background to normalize coverage estimations http://cnvkit.readthedocs.io/en/stable/pipeline.html#fix """ from bcbio.structural import cnvkit cnns = reduce(operator.add, [[tz.get_in(["depth", "bins", "target"], x), tz.get_in(["depth", "bins", "antitarget"], x)] for x in backgrounds], []) for d in inputs: if tz.get_in(["depth", "bins", "target"], d): target_bed = tz.get_in(["depth", "bins", "target"], d) antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], d) input_backs = set(filter(lambda x: x is not None, [dd.get_background_cnv_reference(d, "cnvkit") for d in inputs])) if input_backs: assert len(input_backs) == 1, "Multiple backgrounds in group: %s" % list(input_backs) back_file = list(input_backs)[0] else: back_file = cnvkit.cnvkit_background(cnns, os.path.join(work_dir, "background-%s-cnvkit.cnn" % (group_id)), backgrounds or inputs, target_bed, antitarget_bed) fix_cmd_inputs = [] for data in inputs: work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural", dd.get_sample_name(data), "bins")) if tz.get_in(["depth", "bins", "target"], data): fix_file = os.path.join(work_dir, "%s-normalized.cnr" % (dd.get_sample_name(data))) fix_cmd_inputs.append((tz.get_in(["depth", "bins", "target"], data), tz.get_in(["depth", "bins", "antitarget"], data), back_file, fix_file, data)) out_files[dd.get_sample_name(data)] = fix_file back_files[dd.get_sample_name(data)] = back_file parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]} run_multicore(cnvkit.run_fix_parallel, fix_cmd_inputs, inputs[0]["config"], parallel) return back_files, out_files
def normalize_sv_coverage(*items): """Normalize CNV coverage depths by GC, repeats and background. Provides normalized output based on CNVkit approaches, provides a point for providing additional methods in the future: - reference: calculates reference backgrounds from normals and pools including GC and repeat information - fix: Uses background to normalize coverage estimations http://cnvkit.readthedocs.io/en/stable/pipeline.html#fix """ from bcbio.structural import cnvkit from bcbio.structural import shared as sshared items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)] if all(not cnvkit.use_general_sv_bins(x) for x in items): return [[d] for d in items] out_files = {} back_files = {} for group_id, gitems in itertools.groupby(items, lambda x: tz.get_in(["regions", "bins", "group"], x)): # No CNVkit calling for this particular set of samples if group_id is None: continue inputs, backgrounds = sshared.find_case_control(list(gitems)) cnns = reduce(operator.add, [[tz.get_in(["depth", "bins", "target"], x), tz.get_in(["depth", "bins", "antitarget"], x)] for x in backgrounds], []) assert inputs, "Did not find inputs for sample batch: %s" % (" ".join(dd.get_sample_name(x) for x in items)) for d in inputs: if tz.get_in(["depth", "bins", "target"], d): target_bed = tz.get_in(["depth", "bins", "target"], d) antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], d) work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(inputs[0]), "structural", dd.get_sample_name(inputs[0]), "bins")) input_backs = set(filter(lambda x: x is not None, [dd.get_background_cnv_reference(d) for d in inputs])) if input_backs: assert len(input_backs) == 1, "Multiple backgrounds in group: %s" % list(input_backs) back_file = list(input_backs)[0] else: back_file = cnvkit.cnvkit_background(cnns, os.path.join(work_dir, "background-%s-cnvkit.cnn" % (group_id)), backgrounds or inputs, target_bed, antitarget_bed) fix_cmd_inputs = [] for data in inputs: work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural", dd.get_sample_name(data), "bins")) if tz.get_in(["depth", "bins", "target"], data): fix_file = os.path.join(work_dir, "%s-normalized.cnr" % (dd.get_sample_name(data))) fix_cmd_inputs.append((tz.get_in(["depth", "bins", "target"], data), tz.get_in(["depth", "bins", "antitarget"], data), back_file, fix_file, data)) out_files[dd.get_sample_name(data)] = fix_file back_files[dd.get_sample_name(data)] = back_file parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]} run_multicore(cnvkit.run_fix_parallel, fix_cmd_inputs, inputs[0]["config"], parallel) out = [] for data in items: if dd.get_sample_name(data) in out_files: data["depth"]["bins"]["background"] = back_files[dd.get_sample_name(data)] data["depth"]["bins"]["normalized"] = out_files[dd.get_sample_name(data)] out.append([data]) return out