Example #1
def calculate_sv_bins(*items):
    """Determine bin sizes and regions to use for samples.

    Unified approach to prepare regional bins for coverage calculations across
    multiple CNV callers. Splits regions into targets and antitargets so
    callers can take advantage of both, and keeps target/antitarget bin sizes
    consistent across batches.

    Uses callable_regions as the access BED file and mosdepth regions in
    variant_regions to estimate depth for bin sizes.
    """
    from bcbio.structural import cnvkit
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]
    if all(not cnvkit.use_general_sv_bins(x) for x in items):
        return [[d] for d in items]
    out = []
    for i, cnv_group in enumerate(_group_by_cnv_method(multi.group_by_batch(items, False))):
        size_calc_fn = MemoizedSizes(cnv_group.region_file, cnv_group.items).get_target_antitarget_bin_sizes
        for data in cnv_group.items:
            if cnvkit.use_general_sv_bins(data):
                if dd.get_background_cnv_reference(data):
                    target_bed, anti_bed = cnvkit.targets_from_background(dd.get_background_cnv_reference(data),
                                                                          cnv_group.work_dir, data)
                else:
                    target_bed, anti_bed = cnvkit.targets_w_bins(cnv_group.region_file, cnv_group.access_file,
                                                                 size_calc_fn, cnv_group.work_dir, data)
                if not data.get("regions"):
                    data["regions"] = {}
                data["regions"]["bins"] = {"target": target_bed, "antitarget": anti_bed, "group": str(i)}
            out.append([data])
    if len(out) != len(items):
        raise AssertionError("Inconsistent samples in and out of SV bin calculation:\nout: %s\nin : %s" %
                             (sorted([dd.get_sample_name(utils.to_single_data(x)) for x in out]),
                              sorted([dd.get_sample_name(x) for x in items])))
    return out
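For orientation, here is a minimal sketch of the output shape this function produces, using a hypothetical sample dict and placeholder paths (real bcbio data dictionaries carry many more keys):

# Hypothetical sample dict; the BED paths stand in for the files that
# cnvkit.targets_from_background / cnvkit.targets_w_bins return.
sample = {"description": "tumor1"}
sample.setdefault("regions", {})["bins"] = {
    "target": "tumor1-target.bed",          # placeholder path
    "antitarget": "tumor1-antitarget.bed",  # placeholder path
    "group": "0",  # index of the CNV group this sample was batched into
}
# calculate_sv_bins returns one single-element list per input sample,
# matching the [[data], [data], ...] convention checked by the assertion above.
out = [[sample]]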
Example #2
def _calculate_sv_bins_gatk(data, cnv_group, size_calc_fn):
    """Calculate structural variant bins using GATK4 CNV callers region or even intervals approach.
    """
    if dd.get_background_cnv_reference(data, "gatk-cnv"):
        target_bed = gatkcnv.pon_to_bed(dd.get_background_cnv_reference(data, "gatk-cnv"), cnv_group.work_dir, data)
    else:
        target_bed = gatkcnv.prepare_intervals(data, cnv_group.region_file, cnv_group.work_dir)
    gc_annotated_tsv = gatkcnv.annotate_intervals(target_bed, data)
    return target_bed, None, gc_annotated_tsv
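Because both helpers share the (data, cnv_group, size_calc_fn) signature and return a (target_bed, antitarget_bed, gc_annotated_tsv) triple, they can be selected through a simple table. A hypothetical dispatch sketch, not bcbio's actual wiring:

# Hypothetical dispatch table keyed by CNV method name.
_BIN_CALCULATORS = {
    "gatk-cnv": _calculate_sv_bins_gatk,
    "cnvkit": _calculate_sv_bins_cnvkit,
}

def _bins_for(method, data, cnv_group, size_calc_fn):
    # Either helper fits: the unused slots in the return triple are None.
    return _BIN_CALCULATORS[method](data, cnv_group, size_calc_fn)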
Example #3
def _calculate_sv_bins_cnvkit(data, cnv_group, size_calc_fn):
    """Calculate structural variant bins using target/anti-target approach from CNVkit.
    """
    from bcbio.structural import cnvkit
    if dd.get_background_cnv_reference(data, "cnvkit"):
        target_bed, anti_bed = cnvkit.targets_from_background(dd.get_background_cnv_reference(data, "cnvkit"),
                                                              cnv_group.work_dir, data)
    else:
        target_bed, anti_bed = cnvkit.targets_w_bins(cnv_group.region_file, cnv_group.access_file,
                                                     size_calc_fn, cnv_group.work_dir, data)
    return target_bed, anti_bed, None
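The two helpers fill complementary slots of the returned triple: the CNVkit path yields (target, antitarget, None) while the GATK4 path yields (target, None, gc_annotated_tsv). A minimal sketch of a consumer handling either shape (hypothetical helper; "target", "antitarget" and "group" follow the bins dict in Example #1, while "gcannotated" is an assumed key name):

def _store_bins(data, target_bed, anti_bed, gc_tsv, group_id):
    # Record whichever artifacts the chosen method produced.
    bins = {"target": target_bed, "group": str(group_id)}
    if anti_bed:   # CNVkit path: antitarget regions, no GC annotation TSV
        bins["antitarget"] = anti_bed
    if gc_tsv:     # GATK4 path: GC-annotated intervals, no antitargets
        bins["gcannotated"] = gc_tsv
    data.setdefault("regions", {})["bins"] = bins
    return data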
Example #4
def run(items):
    """Normalization and log2 ratio calculation plus CNV calling for full cohort.

    - Combine coverage of each region for each sample
    - Prepare read counts for each sample
    - Normalize coverages in cohort by gene and sample, and calculate log2 ratios
    - Call amplifications and deletions
    """
    items = [utils.to_single_data(x) for x in items]
    work_dir = _sv_workdir(items[0])

    input_backs = list(set(filter(lambda x: x is not None,
                                  [dd.get_background_cnv_reference(d, "seq2c") for d in items])))
    coverage_file = _combine_coverages(items, work_dir, input_backs)
    read_mapping_file = _calculate_mapping_reads(items, work_dir, input_backs)
    normal_names = []
    if input_backs:
        with open(input_backs[0]) as in_handle:
            for line in in_handle:
                if len(line.split()) == 2:
                    normal_names.append(line.split()[0])
    normal_names += [dd.get_sample_name(x) for x in items if population.get_affected_status(x) == 1]
    seq2c_calls_file = _call_cnv(items, work_dir, read_mapping_file, coverage_file, normal_names)
    items = _split_cnv(items, seq2c_calls_file, read_mapping_file, coverage_file)
    return items
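The normal-sample list mixes two sources: names parsed from the two-column seq2c background file and unaffected cohort members. A standalone sketch of that collection step (hypothetical helper, assuming the background file's first column is a sample name, as the parsing above implies):

def collect_normal_names(background_file, sample_names, affected_status):
    """Gather normal sample names from a seq2c background file plus cohort metadata.

    background_file -- optional two-column file; first column is a sample name
    sample_names -- names of all cohort samples
    affected_status -- parallel list of PED-style phenotype codes (1 = unaffected)
    """
    normals = []
    if background_file:
        with open(background_file) as in_handle:
            for line in in_handle:
                parts = line.split()
                if len(parts) == 2:
                    normals.append(parts[0])
    # PED phenotype coding: 1 marks unaffected samples, which serve as normals.
    normals += [name for name, status in zip(sample_names, affected_status)
                if status == 1]
    return normals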
예제 #9
0
def _normalize_sv_coverage_cnvkit(group_id, inputs, backgrounds, work_dir,
                                  back_files, out_files):
    """Normalize CNV coverage depths by GC, repeats and background using CNVkit

    - reference: calculates reference backgrounds from normals and pools
      including GC and repeat information
    - fix: Uses background to normalize coverage estimations
    http://cnvkit.readthedocs.io/en/stable/pipeline.html#fix
    """
    from bcbio.structural import cnvkit
    cnns = reduce(operator.add, [[
        tz.get_in(["depth", "bins", "target"], x),
        tz.get_in(["depth", "bins", "antitarget"], x)
    ] for x in backgrounds], [])
    for d in inputs:
        if tz.get_in(["depth", "bins", "target"], d):
            target_bed = tz.get_in(["depth", "bins", "target"], d)
            antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], d)
    input_backs = set(
        filter(lambda x: x is not None,
               [dd.get_background_cnv_reference(d, "cnvkit") for d in inputs]))
    if input_backs:
        assert len(
            input_backs
        ) == 1, "Multiple backgrounds in group: %s" % list(input_backs)
        back_file = list(input_backs)[0]
    else:
        back_file = cnvkit.cnvkit_background(
            cnns,
            os.path.join(work_dir, "background-%s-cnvkit.cnn" % (group_id)),
            backgrounds or inputs, target_bed, antitarget_bed)
    fix_cmd_inputs = []
    for data in inputs:
        work_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "structural",
                         dd.get_sample_name(data), "bins"))
        if tz.get_in(["depth", "bins", "target"], data):
            fix_file = os.path.join(
                work_dir, "%s-normalized.cnr" % (dd.get_sample_name(data)))
            fix_cmd_inputs.append((tz.get_in(["depth", "bins", "target"],
                                             data),
                                   tz.get_in(["depth", "bins", "antitarget"],
                                             data), back_file, fix_file, data))
            out_files[dd.get_sample_name(data)] = fix_file
            back_files[dd.get_sample_name(data)] = back_file
    parallel = {
        "type": "local",
        "cores": dd.get_cores(inputs[0]),
        "progs": ["cnvkit"]
    }
    run_multicore(cnvkit.run_fix_parallel, fix_cmd_inputs, inputs[0]["config"],
                  parallel)
    return back_files, out_files
예제 #10
0
def prep_seq2c_bed(data):
    """Selecting the bed file, cleaning, and properly annotating for Seq2C
    """
    if dd.get_background_cnv_reference(data, "seq2c"):
        bed_file = _background_to_bed(
            dd.get_background_cnv_reference(data, "seq2c"), data)
    else:
        bed_file = regions.get_sv_bed(data)
    if bed_file:
        bed_file = bedutils.clean_file(bed_file, data, prefix="svregions-")
    else:
        bed_file = bedutils.clean_file(dd.get_variant_regions(data), data)
    if not bed_file:
        return None

    col_num = bt.BedTool(bed_file).field_count()
    if col_num < 4:
        annotated_file = annotate.add_genes(bed_file, data, max_distance=0)
        if annotated_file == bed_file:
            raise ValueError(
                "BED file for Seq2C must be annotated with gene names, "
                "however the input BED is 3-columns and we have no transcript "
                "data to annotate with " + bed_file)
        annotated_file = annotate.gene_one_per_line(annotated_file, data)
    else:
        annotated_file = bed_file

    ready_file = "%s-seq2cclean.bed" % (utils.splitext_plus(annotated_file)[0])
    if not utils.file_uptodate(ready_file, annotated_file):
        bed = bt.BedTool(annotated_file)
        if col_num > 4 and col_num != 8:
            bed = bed.cut(range(4))
        bed = bed.filter(lambda x: x.name not in ["", ".", "-"])
        with file_transaction(data, ready_file) as tx_out_file:
            bed.saveas(tx_out_file)
        logger.debug("Saved Seq2C clean annotated ready input BED into " +
                     ready_file)

    return ready_file
Example #5
def prep_seq2c_bed(data):
    """Selecting the bed file, cleaning, and properly annotating for Seq2C
    """
    if dd.get_background_cnv_reference(data, "seq2c"):
        bed_file = _background_to_bed(dd.get_background_cnv_reference(data, "seq2c"), data)
    else:
        bed_file = regions.get_sv_bed(data)
    if bed_file:
        bed_file = bedutils.clean_file(bed_file, data, prefix="svregions-")
    else:
        bed_file = bedutils.clean_file(dd.get_variant_regions(data), data)
    if not bed_file:
        return None

    col_num = bt.BedTool(bed_file).field_count()
    if col_num < 4:
        annotated_file = annotate.add_genes(bed_file, data, max_distance=0)
        if annotated_file == bed_file:
            raise ValueError("BED file for Seq2C must be annotated with gene names, "
                             "however the input BED is 3-columns and we have no transcript "
                             "data to annotate with " + bed_file)
        annotated_file = annotate.gene_one_per_line(annotated_file, data)
    else:
        annotated_file = bed_file

    ready_file = "%s-seq2cclean.bed" % (utils.splitext_plus(annotated_file)[0])
    if not utils.file_uptodate(ready_file, annotated_file):
        bed = bt.BedTool(annotated_file)
        if col_num > 4 and col_num != 8:
            bed = bed.cut(range(4))
        bed = bed.filter(lambda x: x.name not in ["", ".", "-"])
        with file_transaction(data, ready_file) as tx_out_file:
            bed.saveas(tx_out_file)
        logger.debug("Saved Seq2C clean annotated ready input BED into " + ready_file)

    return ready_file
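The column handling above encodes Seq2C's input expectations: a gene name column is required, wider files are trimmed back to 4 columns, and 8-column inputs pass through unchanged. A standalone sketch of just that normalization step (hypothetical helper, using the same pybedtools calls as above):

import pybedtools as bt

def normalize_seq2c_columns(bed_file, out_file):
    # Seq2C wants chrom/start/end/gene; trim wider files to 4 columns,
    # except 8-column inputs, which are kept as-is.
    col_num = bt.BedTool(bed_file).field_count()
    if col_num < 4:
        raise ValueError("Expected a gene name column: %s" % bed_file)
    bed = bt.BedTool(bed_file)
    if col_num > 4 and col_num != 8:
        bed = bed.cut(range(4))
    # Drop intervals without a usable gene name.
    bed = bed.filter(lambda x: x.name not in ["", ".", "-"])
    bed.saveas(out_file)
    return out_file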
Example #6
def _normalize_sv_coverage_gatk(group_id, inputs, backgrounds, work_dir, back_files, out_files):
    """Normalize CNV coverage using panel of normals with GATK's de-noise approaches.
    """
    input_backs = set(filter(lambda x: x is not None,
                             [dd.get_background_cnv_reference(d, "gatk-cnv") for d in inputs]))
    if input_backs:
        assert len(input_backs) == 1, "Multiple backgrounds in group: %s" % list(input_backs)
        pon = list(input_backs)[0]
    elif backgrounds:
        pon = gatkcnv.create_panel_of_normals(backgrounds, group_id, work_dir)
    else:
        pon = None
    for data in inputs:
        work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                                   dd.get_sample_name(data), "bins"))
        denoise_file = gatkcnv.denoise(data, pon, work_dir)
        out_files[dd.get_sample_name(data)] = denoise_file
        back_files[dd.get_sample_name(data)] = pon
    return back_files, out_files
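The panel-of-normals selection above has three tiers: an explicitly configured background (which must be unique per group), a panel built from cohort normals, or none. A minimal sketch of that precedence (hypothetical helper, mirroring the logic above):

def select_panel_of_normals(configured_backs, normals, build_pon):
    """Pick the panel of normals with the precedence used above.

    configured_backs -- background references configured per sample (may contain None)
    normals -- background/normal samples available for building a panel
    build_pon -- callable that builds a panel from normals
                 (e.g. GATK4 CreateReadCountPanelOfNormals)
    """
    uniq = {b for b in configured_backs if b is not None}
    if uniq:
        # A group must agree on a single configured background.
        assert len(uniq) == 1, "Multiple backgrounds in group: %s" % sorted(uniq)
        return uniq.pop()
    if normals:
        return build_pon(normals)
    return None  # denoising proceeds without a panel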
Example #7
def _normalize_sv_coverage_cnvkit(group_id, inputs, backgrounds, work_dir, back_files, out_files):
    """Normalize CNV coverage depths by GC, repeats and background using CNVkit

    - reference: calculates reference backgrounds from normals and pools
      including GC and repeat information
    - fix: Uses background to normalize coverage estimations
    http://cnvkit.readthedocs.io/en/stable/pipeline.html#fix
    """
    from bcbio.structural import cnvkit
    cnns = reduce(operator.add, [[tz.get_in(["depth", "bins", "target"], x),
                                  tz.get_in(["depth", "bins", "antitarget"], x)] for x in backgrounds], [])
    for d in inputs:
        if tz.get_in(["depth", "bins", "target"], d):
            target_bed = tz.get_in(["depth", "bins", "target"], d)
            antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], d)
    input_backs = set(filter(lambda x: x is not None,
                             [dd.get_background_cnv_reference(d, "cnvkit") for d in inputs]))
    if input_backs:
        assert len(input_backs) == 1, "Multiple backgrounds in group: %s" % list(input_backs)
        back_file = list(input_backs)[0]
    else:
        back_file = cnvkit.cnvkit_background(cnns,
                                             os.path.join(work_dir, "background-%s-cnvkit.cnn" % (group_id)),
                                             backgrounds or inputs, target_bed, antitarget_bed)
    fix_cmd_inputs = []
    for data in inputs:
        work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                                   dd.get_sample_name(data), "bins"))
        if tz.get_in(["depth", "bins", "target"], data):
            fix_file = os.path.join(work_dir, "%s-normalized.cnr" % (dd.get_sample_name(data)))
            fix_cmd_inputs.append((tz.get_in(["depth", "bins", "target"], data),
                                   tz.get_in(["depth", "bins", "antitarget"], data),
                                   back_file, fix_file, data))
            out_files[dd.get_sample_name(data)] = fix_file
            back_files[dd.get_sample_name(data)] = back_file
    parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
    run_multicore(cnvkit.run_fix_parallel, fix_cmd_inputs, inputs[0]["config"], parallel)
    return back_files, out_files
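The reduce(operator.add, ...) line above flattens each background sample's [target, antitarget] pair into one list of .cnn inputs. An equivalent nested comprehension (a sketch over the same backgrounds list) expresses the flattening without functools/operator:

import toolz as tz  # as assumed by the examples above

# Equivalent flattening of the per-background [target, antitarget] pairs.
cnns = [f
        for x in backgrounds
        for f in (tz.get_in(["depth", "bins", "target"], x),
                  tz.get_in(["depth", "bins", "antitarget"], x))]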
Example #8
def normalize_sv_coverage(*items):
    """Normalize CNV coverage depths by GC, repeats and background.

    Provides normalized output based on CNVkit approaches, and a hook
    point for adding other methods in the future:

    - reference: calculates reference backgrounds from normals and pools
      including GC and repeat information
    - fix: uses the background to normalize coverage estimations
    http://cnvkit.readthedocs.io/en/stable/pipeline.html#fix
    """
    from bcbio.structural import cnvkit
    from bcbio.structural import shared as sshared
    orig_items = items
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]
    if all(not cnvkit.use_general_sv_bins(x) for x in items):
        return orig_items
    out_files = {}
    back_files = {}
    for group_id, gitems in itertools.groupby(items, lambda x: tz.get_in(["regions", "bins", "group"], x)):
        # No CNVkit calling for this particular set of samples
        if group_id is None:
            continue
        inputs, backgrounds = sshared.find_case_control(list(gitems))
        cnns = reduce(operator.add, [[tz.get_in(["depth", "bins", "target"], x),
                                      tz.get_in(["depth", "bins", "antitarget"], x)] for x in backgrounds], [])
        assert inputs, "Did not find inputs for sample batch: %s" % (" ".join(dd.get_sample_name(x) for x in items))
        for d in inputs:
            if tz.get_in(["depth", "bins", "target"], d):
                target_bed = tz.get_in(["depth", "bins", "target"], d)
                antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], d)
        work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(inputs[0]), "structural",
                                                   dd.get_sample_name(inputs[0]), "bins"))
        input_backs = set(filter(lambda x: x is not None,
                                 [dd.get_background_cnv_reference(d) for d in inputs]))
        if input_backs:
            assert len(input_backs) == 1, "Multiple backgrounds in group: %s" % list(input_backs)
            back_file = list(input_backs)[0]
        else:
            back_file = cnvkit.cnvkit_background(cnns,
                                                 os.path.join(work_dir, "background-%s-cnvkit.cnn" % (group_id)),
                                                 backgrounds or inputs, target_bed, antitarget_bed)
        for data in inputs:
            work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                                       dd.get_sample_name(data), "bins"))
            if tz.get_in(["depth", "bins", "target"], data):
                fix_file = cnvkit.run_fix(tz.get_in(["depth", "bins", "target"], data),
                                          tz.get_in(["depth", "bins", "antitarget"], data),
                                          back_file,
                                          os.path.join(work_dir, "%s-normalized.cnr" % (dd.get_sample_name(data))),
                                          data)
                out_files[dd.get_sample_name(data)] = fix_file
                back_files[dd.get_sample_name(data)] = back_file
    out = []
    for data in items:
        if dd.get_sample_name(data) in out_files:
            data["depth"]["bins"]["background"] = back_files[dd.get_sample_name(data)]
            data["depth"]["bins"]["normalized"] = out_files[dd.get_sample_name(data)]
        out.append([data])
    return out
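One subtlety in the grouping step: itertools.groupby only merges consecutive items that share a key, so the loop above implicitly assumes samples arrive contiguously by bin group. A standalone sketch (hypothetical sample dicts) showing how sorting first makes the grouping order-independent:

import itertools

samples = [{"name": "a", "group": "1"},
           {"name": "b", "group": "0"},
           {"name": "c", "group": "1"}]
key = lambda x: x["group"]

# Unsorted, group "1" would appear as two separate runs ("a", then "c").
for group_id, gitems in itertools.groupby(sorted(samples, key=key), key=key):
    print(group_id, [s["name"] for s in gitems])
# prints: 0 ['b']
#         1 ['a', 'c']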