Esempio n. 1
0
def _normalize_sv_coverage_cnvkit(group_id, inputs, backgrounds, work_dir,
                                  back_files, out_files):
    """Normalize CNV coverage depths by GC, repeats and background using CNVkit

    - reference: calculates reference backgrounds from normals and pools
      including GC and repeat information
    - fix: Uses background to normalize coverage estimations
    http://cnvkit.readthedocs.io/en/stable/pipeline.html#fix

    Args:
        group_id: Identifier of the sample group; embedded in the name of the
            background file built when no input supplies one.
        inputs: Sample data objects to normalize. Must be non-empty, since
            the first entry provides the core count and configuration used
            for the parallel fix run.
        backgrounds: Sample data objects whose target/antitarget bin coverage
            is handed to the background calculation. When empty, ``inputs``
            are passed as the items for that calculation instead.
        work_dir: Directory in which the group background file is placed.
        back_files: Dict updated in place, sample name -> background file.
        out_files: Dict updated in place, sample name -> normalized file.

    Returns:
        The ``(back_files, out_files)`` pair after being updated.

    Raises:
        AssertionError: If the inputs reference more than one pre-computed
            background.
    """
    from bcbio.structural import cnvkit
    target_key = ["depth", "bins", "target"]
    antitarget_key = ["depth", "bins", "antitarget"]
    # Flat [target, antitarget, target, antitarget, ...] list over backgrounds.
    cnns = [
        tz.get_in(key, x) for x in backgrounds
        for key in (target_key, antitarget_key)
    ]
    # Default to None so a group in which no input carries target bins does
    # not hit an UnboundLocalError when the background is built below. When
    # several inputs have bins, the last one seen is used.
    target_bed = None
    antitarget_bed = None
    for d in inputs:
        if tz.get_in(target_key, d):
            target_bed = tz.get_in(target_key, d)
            antitarget_bed = tz.get_in(antitarget_key, d)
    candidates = [dd.get_background_cnv_reference(d, "cnvkit") for d in inputs]
    input_backs = set(ref for ref in candidates if ref is not None)
    if input_backs:
        # A pre-computed background wins over building one, but the group
        # must agree on a single file.
        assert len(
            input_backs
        ) == 1, "Multiple backgrounds in group: %s" % list(input_backs)
        back_file = list(input_backs)[0]
    else:
        back_file = cnvkit.cnvkit_background(
            cnns,
            os.path.join(work_dir, "background-%s-cnvkit.cnn" % (group_id)),
            backgrounds or inputs, target_bed, antitarget_bed)
    fix_cmd_inputs = []
    for data in inputs:
        sample_name = dd.get_sample_name(data)
        # The per-sample directory is created for every input, including
        # those without target bins. A separate name keeps the ``work_dir``
        # argument from being rebound.
        sample_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "structural", sample_name,
                         "bins"))
        if tz.get_in(target_key, data):
            fix_file = os.path.join(sample_dir,
                                    "%s-normalized.cnr" % (sample_name))
            fix_cmd_inputs.append((tz.get_in(target_key, data),
                                   tz.get_in(antitarget_key, data),
                                   back_file, fix_file, data))
            out_files[sample_name] = fix_file
            back_files[sample_name] = back_file
    parallel = {
        "type": "local",
        "cores": dd.get_cores(inputs[0]),
        "progs": ["cnvkit"]
    }
    run_multicore(cnvkit.run_fix_parallel, fix_cmd_inputs, inputs[0]["config"],
                  parallel)
    return back_files, out_files
Esempio n. 2
0
def normalize_sv_coverage(*items):
    """Normalize CNV coverage depths by GC, repeats and background.

    Provides normalized output based on CNVkit approaches, provides a
    point for providing additional methods in the future:

    - reference: calculates reference backgrounds from normals and pools
      including GC and repeat information
    - fix: Uses background to normalize coverage estimations
    http://cnvkit.readthedocs.io/en/stable/pipeline.html#fix

    Args:
        *items: Sample inputs; unpacked through
            ``cwlutils.handle_combined_input`` and ``utils.to_single_data``
            before processing.

    Returns:
        The original ``items`` untouched when no sample uses general SV bins.
        Otherwise a list of single-element lists ``[data]``, where each
        normalized sample has ``data["depth"]["bins"]["normalized"]`` set to
        its fixed coverage file.

    Raises:
        AssertionError: If a group yields no input samples.
    """
    from bcbio.structural import cnvkit
    from bcbio.structural import shared as sshared
    orig_items = items
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]
    if not any(cnvkit.use_general_sv_bins(x) for x in items):
        return orig_items
    target_key = ["depth", "bins", "target"]
    antitarget_key = ["depth", "bins", "antitarget"]
    out_files = {}
    # NOTE(review): itertools.groupby only merges adjacent items; this assumes
    # items arrive ordered by bin group -- confirm against the caller.
    for group_id, gitems in itertools.groupby(items, lambda x: tz.get_in(["regions", "bins", "group"], x)):
        inputs, backgrounds = sshared.find_case_control(list(gitems))
        # Flat [target, antitarget, ...] list over the group's backgrounds.
        cnns = [tz.get_in(key, x) for x in backgrounds for key in (target_key, antitarget_key)]
        assert inputs, "Did not find inputs for sample batch: %s" % (" ".join(dd.get_sample_name(x) for x in items))
        # Reset for every group: otherwise a group with no target-bin inputs
        # would silently reuse the previous group's BED files (or raise
        # UnboundLocalError on the first group). Last input with bins wins.
        target_bed = None
        antitarget_bed = None
        for d in inputs:
            if tz.get_in(target_key, d):
                target_bed = tz.get_in(target_key, d)
                antitarget_bed = tz.get_in(antitarget_key, d)
        # The group background lives in the first input's bins directory.
        first_input = inputs[0]
        work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(first_input), "structural",
                                                   dd.get_sample_name(first_input), "bins"))
        back_file = cnvkit.cnvkit_background(cnns, os.path.join(work_dir, "background-%s-cnvkit.cnn" % (group_id)),
                                             backgrounds or inputs, target_bed, antitarget_bed)
        for data in inputs:
            sample_name = dd.get_sample_name(data)
            # Created for every input, including those without target bins.
            sample_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                                         sample_name, "bins"))
            if tz.get_in(target_key, data):
                out_files[sample_name] = cnvkit.run_fix(tz.get_in(target_key, data),
                                                        tz.get_in(antitarget_key, data),
                                                        back_file,
                                                        os.path.join(sample_dir, "%s-normalized.cnr" % (sample_name)),
                                                        data)
    out = []
    for data in items:
        sample_name = dd.get_sample_name(data)
        if sample_name in out_files:
            data["depth"]["bins"]["normalized"] = out_files[sample_name]
        out.append([data])
    return out
Esempio n. 3
0
def _normalize_sv_coverage_cnvkit(group_id, inputs, backgrounds, work_dir, back_files, out_files):
    """Normalize CNV coverage depths by GC, repeats and background using CNVkit

    - reference: calculates reference backgrounds from normals and pools
      including GC and repeat information
    - fix: Uses background to normalize coverage estimations
    http://cnvkit.readthedocs.io/en/stable/pipeline.html#fix

    Args:
        group_id: Identifier of the sample group; embedded in the name of the
            background file built when no input supplies one.
        inputs: Sample data objects to normalize. Must be non-empty, since the
            first entry provides the core count and configuration used for
            the parallel fix run.
        backgrounds: Sample data objects whose target/antitarget bin coverage
            is handed to the background calculation. When empty, ``inputs``
            are passed as the items for that calculation instead.
        work_dir: Directory in which the group background file is placed.
        back_files: Dict updated in place, sample name -> background file.
        out_files: Dict updated in place, sample name -> normalized file.

    Returns:
        The ``(back_files, out_files)`` pair after being updated.

    Raises:
        AssertionError: If the inputs reference more than one pre-computed
            background.
    """
    from bcbio.structural import cnvkit
    target_key = ["depth", "bins", "target"]
    antitarget_key = ["depth", "bins", "antitarget"]
    # Flat [target, antitarget, target, antitarget, ...] list over backgrounds.
    cnns = [tz.get_in(key, x) for x in backgrounds for key in (target_key, antitarget_key)]
    # Default to None so a group in which no input carries target bins does not
    # hit an UnboundLocalError when the background is built below. When several
    # inputs have bins, the last one seen is used.
    target_bed = None
    antitarget_bed = None
    for d in inputs:
        if tz.get_in(target_key, d):
            target_bed = tz.get_in(target_key, d)
            antitarget_bed = tz.get_in(antitarget_key, d)
    candidates = [dd.get_background_cnv_reference(d, "cnvkit") for d in inputs]
    input_backs = set(ref for ref in candidates if ref is not None)
    if input_backs:
        # A pre-computed background wins over building one, but the group must
        # agree on a single file.
        assert len(input_backs) == 1, "Multiple backgrounds in group: %s" % list(input_backs)
        back_file = list(input_backs)[0]
    else:
        back_file = cnvkit.cnvkit_background(cnns,
                                             os.path.join(work_dir, "background-%s-cnvkit.cnn" % (group_id)),
                                             backgrounds or inputs, target_bed, antitarget_bed)
    fix_cmd_inputs = []
    for data in inputs:
        sample_name = dd.get_sample_name(data)
        # The per-sample directory is created for every input, including those
        # without target bins. A separate name keeps the ``work_dir`` argument
        # from being rebound.
        sample_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                                     sample_name, "bins"))
        if tz.get_in(target_key, data):
            fix_file = os.path.join(sample_dir, "%s-normalized.cnr" % (sample_name))
            fix_cmd_inputs.append((tz.get_in(target_key, data),
                                   tz.get_in(antitarget_key, data),
                                   back_file, fix_file, data))
            out_files[sample_name] = fix_file
            back_files[sample_name] = back_file
    parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
    run_multicore(cnvkit.run_fix_parallel, fix_cmd_inputs, inputs[0]["config"], parallel)
    return back_files, out_files
Esempio n. 4
0
def normalize_sv_coverage(*items):
    """Normalize CNV coverage depths by GC, repeats and background.

    Provides normalized output based on CNVkit approaches, provides a
    point for providing additional methods in the future:

    - reference: calculates reference backgrounds from normals and pools
      including GC and repeat information
    - fix: Uses background to normalize coverage estimations
    http://cnvkit.readthedocs.io/en/stable/pipeline.html#fix

    Args:
        *items: Sample inputs; unpacked through
            ``cwlutils.handle_combined_input`` and ``utils.to_single_data``
            before processing.

    Returns:
        A list of single-element lists ``[data]``, one per sample. Samples
        that were normalized have ``data["depth"]["bins"]["background"]`` and
        ``data["depth"]["bins"]["normalized"]`` set; all others are passed
        through unchanged.

    Raises:
        AssertionError: If a group yields no input samples, or its inputs
            reference more than one pre-computed background.
    """
    from bcbio.structural import cnvkit
    from bcbio.structural import shared as sshared
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]
    if not any(cnvkit.use_general_sv_bins(x) for x in items):
        return [[d] for d in items]
    target_key = ["depth", "bins", "target"]
    antitarget_key = ["depth", "bins", "antitarget"]
    out_files = {}
    back_files = {}
    # NOTE(review): itertools.groupby only merges adjacent items; this assumes
    # items arrive ordered by bin group -- confirm against the caller.
    for group_id, gitems in itertools.groupby(items, lambda x: tz.get_in(["regions", "bins", "group"], x)):
        # No CNVkit calling for this particular set of samples
        if group_id is None:
            continue
        inputs, backgrounds = sshared.find_case_control(list(gitems))
        # Flat [target, antitarget, ...] list over the group's backgrounds.
        cnns = [tz.get_in(key, x) for x in backgrounds for key in (target_key, antitarget_key)]
        assert inputs, "Did not find inputs for sample batch: %s" % (" ".join(dd.get_sample_name(x) for x in items))
        # Reset for every group: otherwise a group with no target-bin inputs
        # would silently reuse the previous group's BED files (or raise
        # UnboundLocalError on the first group). Last input with bins wins.
        target_bed = None
        antitarget_bed = None
        for d in inputs:
            if tz.get_in(target_key, d):
                target_bed = tz.get_in(target_key, d)
                antitarget_bed = tz.get_in(antitarget_key, d)
        # The group background lives in the first input's bins directory.
        first_input = inputs[0]
        work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(first_input), "structural",
                                                   dd.get_sample_name(first_input), "bins"))
        candidates = [dd.get_background_cnv_reference(d) for d in inputs]
        input_backs = set(ref for ref in candidates if ref is not None)
        if input_backs:
            # A pre-computed background wins over building one, but the group
            # must agree on a single file.
            assert len(input_backs) == 1, "Multiple backgrounds in group: %s" % list(input_backs)
            back_file = list(input_backs)[0]
        else:
            back_file = cnvkit.cnvkit_background(cnns,
                                                 os.path.join(work_dir, "background-%s-cnvkit.cnn" % (group_id)),
                                                 backgrounds or inputs, target_bed, antitarget_bed)
        fix_cmd_inputs = []
        for data in inputs:
            sample_name = dd.get_sample_name(data)
            # Created for every input, including those without target bins.
            sample_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                                         sample_name, "bins"))
            if tz.get_in(target_key, data):
                fix_file = os.path.join(sample_dir, "%s-normalized.cnr" % (sample_name))
                fix_cmd_inputs.append((tz.get_in(target_key, data),
                                       tz.get_in(antitarget_key, data),
                                       back_file, fix_file, data))
                out_files[sample_name] = fix_file
                back_files[sample_name] = back_file
        parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
        run_multicore(cnvkit.run_fix_parallel, fix_cmd_inputs, inputs[0]["config"], parallel)

    out = []
    for data in items:
        sample_name = dd.get_sample_name(data)
        if sample_name in out_files:
            data["depth"]["bins"]["background"] = back_files[sample_name]
            data["depth"]["bins"]["normalized"] = out_files[sample_name]
        out.append([data])
    return out