Example #1
def run(items, background=None):
    """Detect copy number variations from batched set of samples using WHAM.
    """
    if not background: background = []
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    if paired:
        inputs = [paired.tumor_data]
        background_bams = [paired.normal_bam]
        background_names = [paired.normal_name]
    else:
        assert not background
        inputs, background = shared.find_case_control(items)
        background_bams = [x["align_bam"] for x in background]
        background_names = [dd.get_sample_name(x) for x in background]
    orig_vcf_file = _run_wham(inputs, background_bams)
    wclass_vcf_file = _add_wham_classification(orig_vcf_file, inputs)
    vcf_file = _fix_vcf(wclass_vcf_file, inputs, background_names)
    bed_file = _convert_to_bed(vcf_file, inputs, use_lrt=len(background_bams) > 0)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        data["sv"].append({"variantcaller": "wham",
                           "vrn_file": _subset_to_sample(bed_file, data),
                           "vcf_file": vcf_file})
        out.append(data)
    return out
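The run() examples in this collection all take a batched list of per-sample data dictionaries and append their result under data["sv"]. As orientation only, here is a minimal sketch of the fields the example above actually touches; the key layout is a simplified assumption for illustration, not bcbio's full data dictionary:

# Illustrative only: stripped-down stand-ins for the per-sample "data"
# dictionaries the run() examples iterate over. Real bcbio items carry
# many more fields; these are just the keys read above.
items = [
    {"align_bam": "/path/to/tumor.bam",   # hypothetical path
     "config": {},                        # run configuration (placeholder)
     "metadata": {"phenotype": "tumor"}},
    {"align_bam": "/path/to/normal.bam",  # hypothetical path
     "config": {},
     "metadata": {"phenotype": "normal"}},
]

# Common post-processing pattern: each caller appends a record naming the
# variant caller and its per-sample output file to data["sv"].
for data in items:
    data.setdefault("sv", []).append({"variantcaller": "wham",
                                      "vrn_file": "per-sample-output.vcf.gz"})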
Example #2
def _run_cnvkit_population(items, background):
    """Run CNVkit on a population of samples.

    Tries to calculate background based on case/controls, otherwise
    uses samples from the same batch as background.
    """
    if background and len(background) > 0:
        inputs = items
    else:
        inputs, background = shared.find_case_control(items)

    # if we have case/control organized background or a single sample
    if len(inputs) == 1 or len(background) > 0:
        ckouts = _run_cnvkit_shared(inputs, background)
        return _associate_cnvkit_out(ckouts, inputs) + background
    # otherwise run each sample with the others in the batch as background
    else:
        out = []
        for cur_input in items:
            background = [
                d for d in items
                if dd.get_sample_name(d) != dd.get_sample_name(cur_input)
            ]
            ckouts = _run_cnvkit_shared([cur_input], background)
            out.extend(_associate_cnvkit_out(ckouts, [cur_input]))
        return out
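When no tumor/normal pairing is available, the examples fall back on shared.find_case_control to split a batch into cases and controls. The real helper lives in bcbio.structural.shared; the sketch below only illustrates the idea, assuming a metadata "phenotype" field marks controls as "normal" or "control":

def find_case_control_sketch(items):
    """Split samples into cases (inputs) and controls (background).

    Illustrative sketch only: assumes each sample dict carries a
    metadata 'phenotype' field where controls are tagged as
    'normal' or 'control'.
    """
    inputs, background = [], []
    for data in items:
        phenotype = (data.get("metadata") or {}).get("phenotype", "")
        if phenotype.lower() in ("normal", "control"):
            background.append(data)
        else:
            inputs.append(data)
    return inputs, background

# Example: one tumor sample and one matched normal
samples = [{"metadata": {"phenotype": "tumor"}},
           {"metadata": {"phenotype": "normal"}}]
cases, controls = find_case_control_sketch(samples)
assert len(cases) == 1 and len(controls) == 1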
Example #3
def run(items, background=None):
    """Detect copy number variations from batched set of samples using WHAM.
    """
    if not background: background = []
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    if paired:
        inputs = [paired.tumor_data]
        background_bams = [paired.normal_bam]
        background_names = [paired.normal_name]
    else:
        assert not background
        inputs, background = shared.find_case_control(items)
        background_bams = [x["align_bam"] for x in background]
        background_names = [dd.get_sample_name(x) for x in background]
    orig_bedpe = _run_wham(inputs, background_bams)
    #vcf_file = _fix_vcf(wclass_vcf_file, inputs, background_names)
    out = []
    for data in inputs:
        if "sv" not in data:
            data["sv"] = []
        data["sv"].append({
            "variantcaller":
            "wham",
            "vrn_file":
            _get_sample_bed(orig_bedpe, data, background_names),
            "vrn_bedpe":
            orig_bedpe
        })
        out.append(data)
    return out
Example #4
def run(items, background=None):
    """Detect copy number variations from batched set of samples using WHAM.
    """
    if not background: background = []
    background_bams = []
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    if paired:
        inputs = [paired.tumor_data]
        if paired.normal_bam:
            background = [paired.normal_data]
            background_bams = [paired.normal_bam]
    else:
        assert not background
        inputs, background = shared.find_case_control(items)
        background_bams = [x["align_bam"] for x in background]
    orig_vcf = _run_wham(inputs, background_bams)
    out = []
    for data in inputs:
        if "sv" not in data:
            data["sv"] = []
        sample_vcf = "%s-%s.vcf.gz" % (utils.splitext_plus(orig_vcf)[0], dd.get_sample_name(data))
        sample_vcf = vcfutils.select_sample(orig_vcf, dd.get_sample_name(data), sample_vcf, data["config"])
        if background:
            sample_vcf = filter_by_background(sample_vcf, orig_vcf, background, data)
        data["sv"].append({"variantcaller": "wham",
                           "vrn_file": sample_vcf})
        out.append(data)
    return out
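vcfutils.select_sample subsets the joint WHAM call set down to one sample before the optional background filter. Outside of bcbio, this is the kind of operation bcftools view -s performs; below is a hedged sketch with placeholder paths. The helper name select_sample_sketch is purely illustrative, and it assumes bcftools and tabix are installed and the input VCF is bgzipped and indexed:

import subprocess

def select_sample_sketch(joint_vcf, sample_name, out_vcf):
    """Subset a multi-sample VCF to one sample (sketch using bcftools).

    Stand-in for what vcfutils.select_sample does inside bcbio.
    """
    # Extract only the requested sample column and write bgzipped output
    subprocess.check_call(["bcftools", "view", "-s", sample_name,
                           "-O", "z", "-o", out_vcf, joint_vcf])
    # Index the per-sample VCF for downstream tools
    subprocess.check_call(["tabix", "-p", "vcf", out_vcf])
    return out_vcf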
Example #5
def run(items, background=None):
    """Detect copy number variations from batched set of samples using WHAM.
    """
    if not background: background = []
    background_bams = []
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    if paired:
        inputs = [paired.tumor_data]
        if paired.normal_bam:
            background = [paired.normal_data]
            background_bams = [paired.normal_bam]
    else:
        assert not background
        inputs, background = shared.find_case_control(items)
        background_bams = [x["align_bam"] for x in background]
    orig_vcf = _run_wham(inputs, background_bams)
    out = []
    for data in inputs:
        if "sv" not in data:
            data["sv"] = []
        sample_vcf = "%s-%s.vcf.gz" % (utils.splitext_plus(orig_vcf)[0], dd.get_sample_name(data))
        sample_vcf = vcfutils.select_sample(orig_vcf, dd.get_sample_name(data), sample_vcf, data["config"])
        if background:
            sample_vcf = filter_by_background(sample_vcf, orig_vcf, background, data)
        effects_vcf, _ = effects.add_to_vcf(sample_vcf, data, "snpeff")
        data["sv"].append({"variantcaller": "wham",
                           "vrn_file": effects_vcf or sample_vcf})
        out.append(data)
    return out
Example #6
def normalize_sv_coverage(*items):
    """Normalize CNV coverage, providing flexible point for multiple methods.
       Don't normalize when running purecn alone
    """
    out = []
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]
    from bcbio.structural import get_svcallers
    sv_callers = get_svcallers(items[0])
    if "gatk-cnv" in sv_callers or "cnvkit" in sv_callers:
        calcfns = {"cnvkit": _normalize_sv_coverage_cnvkit, "gatk-cnv": _normalize_sv_coverage_gatk}
        from bcbio.structural import cnvkit
        from bcbio.structural import shared as sshared
        if all(not cnvkit.use_general_sv_bins(x) for x in items):
            return [[d] for d in items]
        out_files = {}
        back_files = {}
        for group_id, gitems in itertools.groupby(items, lambda x: tz.get_in(["regions", "bins", "group"], x)):
            # No CNVkit calling for this particular set of samples
            if group_id is None:
                continue
            inputs, backgrounds = sshared.find_case_control(list(gitems))
            assert inputs, "Did not find inputs for sample batch: %s" % (" ".join(dd.get_sample_name(x) for x in items))
            work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(inputs[0]), "structural",
                                                       dd.get_sample_name(inputs[0]), "bins"))
            back_files, out_files = calcfns[cnvkit.bin_approach(inputs[0])](group_id, inputs, backgrounds, work_dir,
                                                                        back_files, out_files)
        for data in items:
            if dd.get_sample_name(data) in out_files:
                data["depth"]["bins"]["background"] = back_files[dd.get_sample_name(data)]
                data["depth"]["bins"]["normalized"] = out_files[dd.get_sample_name(data)]
            out.append([data])
    else:
        out = [[d] for d in items]
    return out
Example #7
def normalize_sv_coverage(*items):
    """Normalize CNV coverage, providing flexible point for multiple methods.
    """
    calcfns = {"cnvkit": _normalize_sv_coverage_cnvkit, "gatk-cnv": _normalize_sv_coverage_gatk}
    from bcbio.structural import cnvkit
    from bcbio.structural import shared as sshared
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]
    if all(not cnvkit.use_general_sv_bins(x) for x in items):
        return [[d] for d in items]
    out_files = {}
    back_files = {}
    for group_id, gitems in itertools.groupby(items, lambda x: tz.get_in(["regions", "bins", "group"], x)):
        # No CNVkit calling for this particular set of samples
        if group_id is None:
            continue
        inputs, backgrounds = sshared.find_case_control(list(gitems))
        assert inputs, "Did not find inputs for sample batch: %s" % (" ".join(dd.get_sample_name(x) for x in items))
        work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(inputs[0]), "structural",
                                                    dd.get_sample_name(inputs[0]), "bins"))
        back_files, out_files = calcfns[cnvkit.bin_approach(inputs[0])](group_id, inputs, backgrounds, work_dir,
                                                                        back_files, out_files)
    out = []
    for data in items:
        if dd.get_sample_name(data) in out_files:
            data["depth"]["bins"]["background"] = back_files[dd.get_sample_name(data)]
            data["depth"]["bins"]["normalized"] = out_files[dd.get_sample_name(data)]
        out.append([data])
    return out
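The normalize_sv_coverage examples above group samples with itertools.groupby on the bin group id. Worth keeping in mind: groupby only merges adjacent items, so equal keys must already sit next to each other (for example by sorting on the same key first). A small self-contained illustration:

import itertools

samples = [{"name": "s1", "group": "b1"},
           {"name": "s2", "group": "b2"},
           {"name": "s3", "group": "b1"}]
keyfn = lambda x: x["group"]

# Unsorted input: "b1" appears as two separate groups.
unsorted_groups = [k for k, _ in itertools.groupby(samples, keyfn)]
assert unsorted_groups == ["b1", "b2", "b1"]

# Sorting by the same key first yields one group per bin group id.
sorted_groups = [k for k, _ in itertools.groupby(sorted(samples, key=keyfn), keyfn)]
assert sorted_groups == ["b1", "b2"]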
Example #8
def _run_cnvkit_population(items, background, access_file):
    """Run CNVkit on a population of samples.

    Currently uses a flat background for each sample and calls independently. Could
    be improved to use population information but this is a starting point.
    """
    assert not background
    inputs, background = shared.find_case_control(items)
    return [_run_cnvkit_single(data, access_file, background)[0] for data in inputs]
Example #9
def _run_cnvkit_population(items, background):
    """Run CNVkit on a population of samples.

    Tries to calculate background based on case/controls, otherwise uses
    a flat background for each sample and calls independently.
    """
    assert not background
    inputs, background = shared.find_case_control(items)
    return [_run_cnvkit_single(data, background)[0] for data in inputs] + \
           [_run_cnvkit_single(data, inputs)[0] for data in background]
Example #10
def _run_cnvkit_population(items, background):
    """Run CNVkit on a population of samples.

    Tries to calculate background based on case/controls, otherwise uses
    a flat background for each sample and calls independently.
    """
    assert not background
    inputs, background = shared.find_case_control(items)
    access_file = _create_access_file(dd.get_ref_file(inputs[0]), _sv_workdir(inputs[0]), inputs[0])
    return [_run_cnvkit_single(data, access_file, background)[0] for data in inputs] + \
           [_run_cnvkit_single(data, access_file, inputs)[0] for data in background]
Example #11
def _run_cnvkit_population(items, background):
    """Run CNVkit on a population of samples.

    Tries to calculate background based on case/controls, otherwise uses
    a flat background for each sample and calls independently.
    """
    assert not background
    inputs, background = shared.find_case_control(items)
    work_dir = _sv_workdir(inputs[0])
    ckouts = _run_cnvkit_shared(inputs, [x["align_bam"] for x in inputs],
                                [x["align_bam"] for x in background], work_dir,
                                background_name=dd.get_sample_name(background[0]) if len(background) > 0 else None)
    return _associate_cnvkit_out(ckouts, inputs) + background
Example #12
def _run_cnvkit_population(items, background):
    """Run CNVkit on a population of samples.

    Tries to calculate background based on case/controls, otherwise uses
    a flat background for each sample and calls independently.
    """
    assert not background
    inputs, background = shared.find_case_control(items)
    work_dir = _sv_workdir(inputs[0])
    ckouts = _run_cnvkit_shared(
        inputs, [x["align_bam"] for x in inputs],
        [x["align_bam"] for x in background],
        work_dir,
        background_name=dd.get_sample_name(background[0])
        if len(background) > 0 else None)
    return _associate_cnvkit_out(ckouts, inputs) + background
Example #13
def normalize_sv_coverage(*items):
    """Normalize CNV coverage depths by GC, repeats and background.

    Provides normalized output based on CNVkit approaches, provides a
    point for providing additional methods in the future:

    - reference: calculates reference backgrounds from normals and pools
      including GC and repeat information
    - fix: Uses background to normalize coverage estimations
    http://cnvkit.readthedocs.io/en/stable/pipeline.html#fix
    """
    from bcbio.structural import cnvkit
    from bcbio.structural import shared as sshared
    orig_items = items
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]
    if all(not cnvkit.use_general_sv_bins(x) for x in items):
        return orig_items
    out_files = {}
    for group_id, gitems in itertools.groupby(items, lambda x: tz.get_in(["regions", "bins", "group"], x)):
        inputs, backgrounds = sshared.find_case_control(list(gitems))
        cnns = reduce(operator.add, [[tz.get_in(["depth", "bins", "target"], x),
                                      tz.get_in(["depth", "bins", "antitarget"], x)] for x in backgrounds], [])
        assert inputs, "Did not find inputs for sample batch: %s" % (" ".join(dd.get_sample_name(x) for x in items))
        for d in inputs:
            if tz.get_in(["depth", "bins", "target"], d):
                target_bed = tz.get_in(["depth", "bins", "target"], d)
                antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], d)
        work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(inputs[0]), "structural",
                                                   dd.get_sample_name(inputs[0]), "bins"))
        back_file = cnvkit.cnvkit_background(cnns, os.path.join(work_dir, "background-%s-cnvkit.cnn" % (group_id)),
                                             backgrounds or inputs, target_bed, antitarget_bed)
        for data in inputs:
            work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                                       dd.get_sample_name(data), "bins"))
            if tz.get_in(["depth", "bins", "target"], data):
                fix_file = cnvkit.run_fix(tz.get_in(["depth", "bins", "target"], data),
                                          tz.get_in(["depth", "bins", "antitarget"], data),
                                          back_file,
                                          os.path.join(work_dir, "%s-normalized.cnr" % (dd.get_sample_name(data))),
                                          data)
                out_files[dd.get_sample_name(data)] = fix_file
    out = []
    for data in items:
        if dd.get_sample_name(data) in out_files:
            data["depth"]["bins"]["normalized"] = out_files[dd.get_sample_name(data)]
        out.append([data])
    return out
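The background and fix steps wrapped above correspond to CNVkit's reference and fix commands (see the pipeline link in the docstring). A hedged sketch of an equivalent direct invocation, assuming the standard CNVkit command line is installed and using placeholder file names:

import subprocess

# Build a pooled reference from background target/antitarget coverage
# (.cnn) files, then normalize one sample against it. All file names
# here are placeholders.
subprocess.check_call(
    ["cnvkit.py", "reference",
     "normal1.targetcoverage.cnn", "normal1.antitargetcoverage.cnn",
     "-f", "genome.fa", "-o", "background-cnvkit.cnn"])
subprocess.check_call(
    ["cnvkit.py", "fix",
     "tumor.targetcoverage.cnn", "tumor.antitargetcoverage.cnn",
     "background-cnvkit.cnn", "-o", "tumor-normalized.cnr"])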
Example #14
def run(items, background=None):
    """Perform detection of structural variations with Manta.
    """
    paired = vcfutils.get_paired(items)
    if paired:
        inputs = [paired.tumor_data]
        background = [paired.normal_data] if paired.normal_bam else []
    else:
        assert not background
        inputs, background = sshared.find_case_control(items)
    work_dir = _sv_workdir(inputs[0])
    variant_file = _run_gridss(inputs, background, work_dir)
    out = []
    for data in items:
        sample_file = variant_file
        if "sv" not in data:
            data["sv"] = []
        effects_vcf, _ = effects.add_to_vcf(sample_file, data, "snpeff")
        data["sv"].append({"variantcaller": "gridss",
                           "vrn_file": effects_vcf or sample_file})
        out.append(data)
    return out
Example #15
def _run_cnvkit_population(items, background):
    """Run CNVkit on a population of samples.

    Tries to calculate background based on case/controls, otherwise
    uses samples from the same batch as background.
    """
    if background and len(background) > 0:
        inputs = items
    else:
        inputs, background = shared.find_case_control(items)

    # if we have case/control organized background or a single sample
    if len(inputs) == 1 or len(background) > 0:
        ckouts = _run_cnvkit_shared(inputs, background)
        return _associate_cnvkit_out(ckouts, inputs) + background
    # otherwise run each sample with the others in the batch as background
    else:
        out = []
        for cur_input in items:
            background = [d for d in items if dd.get_sample_name(d) != dd.get_sample_name(cur_input)]
            ckouts = _run_cnvkit_shared([cur_input], background)
            out.extend(_associate_cnvkit_out(ckouts, [cur_input]))
        return out
Example #16
def run(items, background=None):
    """Detect copy number variations from batched set of samples using WHAM.
    """
    if not background: background = []
    background_bams = []
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    if paired:
        inputs = [paired.tumor_data]
        if paired.normal_bam:
            background = [paired.normal_data]
            background_bams = [paired.normal_bam]
    else:
        assert not background
        inputs, background = shared.find_case_control(items)
        background_bams = [x["align_bam"] for x in background]
    orig_vcf = _run_wham(inputs, background_bams)
    out = []
    for data in inputs:
        if "sv" not in data:
            data["sv"] = []
        final_vcf = shared.finalize_sv(orig_vcf, data, items)
        data["sv"].append({"variantcaller": "wham", "vrn_file": final_vcf})
        out.append(data)
    return out
Example #17
def run(items, background=None):
    """Perform detection of structural variations with Manta.
    """
    paired = vcfutils.get_paired(items)
    if paired:
        inputs = [paired.tumor_data]
        background = [paired.normal_data] if paired.normal_bam else []
    else:
        assert not background
        inputs, background = sshared.find_case_control(items)
    work_dir = _sv_workdir(inputs[0])
    variant_file = _run_gridss(inputs, background, work_dir)
    out = []
    for data in items:
        sample_file = variant_file
        if "sv" not in data:
            data["sv"] = []
        effects_vcf, _ = effects.add_to_vcf(sample_file, data, "snpeff")
        data["sv"].append({
            "variantcaller": "gridss",
            "vrn_file": effects_vcf or sample_file
        })
        out.append(data)
    return out
Example #18
def normalize_sv_coverage(*items):
    """Normalize CNV coverage depths by GC, repeats and background.

    Provides normalized output based on CNVkit approaches, provides a
    point for providing additional methods in the future:

    - reference: calculates reference backgrounds from normals and pools
      including GC and repeat information
    - fix: Uses background to normalize coverage estimations
    http://cnvkit.readthedocs.io/en/stable/pipeline.html#fix
    """
    from bcbio.structural import cnvkit
    from bcbio.structural import shared as sshared
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]
    if all(not cnvkit.use_general_sv_bins(x) for x in items):
        return [[d] for d in items]
    out_files = {}
    back_files = {}
    for group_id, gitems in itertools.groupby(items, lambda x: tz.get_in(["regions", "bins", "group"], x)):
        # No CNVkit calling for this particular set of samples
        if group_id is None:
            continue
        inputs, backgrounds = sshared.find_case_control(list(gitems))
        cnns = reduce(operator.add, [[tz.get_in(["depth", "bins", "target"], x),
                                      tz.get_in(["depth", "bins", "antitarget"], x)] for x in backgrounds], [])
        assert inputs, "Did not find inputs for sample batch: %s" % (" ".join(dd.get_sample_name(x) for x in items))
        for d in inputs:
            if tz.get_in(["depth", "bins", "target"], d):
                target_bed = tz.get_in(["depth", "bins", "target"], d)
                antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], d)
        work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(inputs[0]), "structural",
                                                   dd.get_sample_name(inputs[0]), "bins"))
        input_backs = set(filter(lambda x: x is not None,
                                 [dd.get_background_cnv_reference(d) for d in inputs]))
        if input_backs:
            assert len(input_backs) == 1, "Multiple backgrounds in group: %s" % list(input_backs)
            back_file = list(input_backs)[0]
        else:
            back_file = cnvkit.cnvkit_background(cnns,
                                                 os.path.join(work_dir, "background-%s-cnvkit.cnn" % (group_id)),
                                                backgrounds or inputs, target_bed, antitarget_bed)
        fix_cmd_inputs = []
        for data in inputs:
            work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                                       dd.get_sample_name(data), "bins"))
            if tz.get_in(["depth", "bins", "target"], data):
                fix_file = os.path.join(work_dir, "%s-normalized.cnr" % (dd.get_sample_name(data)))
                fix_cmd_inputs.append((tz.get_in(["depth", "bins", "target"], data),
                                       tz.get_in(["depth", "bins", "antitarget"], data),
                                       back_file, fix_file, data))
                out_files[dd.get_sample_name(data)] = fix_file
                back_files[dd.get_sample_name(data)] = back_file
        parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
        run_multicore(cnvkit.run_fix_parallel, fix_cmd_inputs, inputs[0]["config"], parallel)

    out = []
    for data in items:
        if dd.get_sample_name(data) in out_files:
            data["depth"]["bins"]["background"] = back_files[dd.get_sample_name(data)]
            data["depth"]["bins"]["normalized"] = out_files[dd.get_sample_name(data)]
        out.append([data])
    return out
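The last example fans the per-sample fix calls out through bcbio's run_multicore helper with a local parallel block. As a rough stand-in for that pattern (not the project's actual scheduler), the standard library can express the same local fan-out; the worker body and tuples below are hypothetical placeholders:

from concurrent.futures import ProcessPoolExecutor

def _fix_one(args):
    """Placeholder worker: would run the coverage normalization for one sample."""
    target, antitarget, back_file, out_file, sample_name = args
    # ... call the actual fix step here ...
    return sample_name, out_file

if __name__ == "__main__":
    # Hypothetical inputs mirroring the (target, antitarget, background,
    # output, data) tuples collected above.
    fix_cmd_inputs = [("t1.cnn", "a1.cnn", "back.cnn", "s1.cnr", "sample1"),
                      ("t2.cnn", "a2.cnn", "back.cnn", "s2.cnr", "sample2")]
    with ProcessPoolExecutor(max_workers=2) as pool:
        out_files = dict(pool.map(_fix_one, fix_cmd_inputs))
    print(out_files)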