Code Example #1
def clean_inputs(data):
    """Clean BED input files to avoid overlapping segments that cause downstream issues.

    Pre-merges inputs to avoid needing to call multiple times during later parallel steps.
    """
    if not utils.get_in(data, ("config", "algorithm", "variant_regions_orig")):
        data["config"]["algorithm"][
            "variant_regions_orig"] = dd.get_variant_regions(data)
    clean_vr = clean_file(dd.get_variant_regions(data),
                          data,
                          prefix="cleaned-")
    merged_vr = merge_overlaps(clean_vr, data)
    data["config"]["algorithm"]["variant_regions"] = clean_vr
    data["config"]["algorithm"]["variant_regions_merged"] = merged_vr

    if dd.get_coverage(data):
        if not utils.get_in(data, ("config", "algorithm", "coverage_orig")):
            data["config"]["algorithm"]["coverage_orig"] = dd.get_coverage(
                data)
        clean_cov_bed = clean_file(dd.get_coverage(data),
                                   data,
                                   prefix="cov-",
                                   simple=True)
        merged_cov_bed = merge_overlaps(clean_cov_bed, data)
        data["config"]["algorithm"]["coverage"] = clean_cov_bed
        data["config"]["algorithm"]["coverage_merged"] = merged_cov_bed

    if "seq2c" in get_svcallers(data):
        seq2c_ready_bed = prep_seq2c_bed(data)
        if not seq2c_ready_bed:
            logger.warning(
                "Can't run Seq2C without a svregions or variant_regions BED file"
            )
        else:
            data["config"]["algorithm"]["seq2c_bed_ready"] = seq2c_ready_bed
    elif regions.get_sv_bed(data):
        dd.set_sv_regions(
            data,
            clean_file(regions.get_sv_bed(data), data, prefix="svregions-"))

    if "purecn" in get_svcallers(data):
        from bcbio.structural import purecn
        purecn_ready_bed = purecn.process_intervals(data)
        if not purecn_ready_bed:
            logger.warning(
                "Can't run PureCN without a svregions or variant_regions BED file"
            )
        else:
            data["config"]["algorithm"]["purecn_bed_ready"] = purecn_ready_bed
    return data
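
The docstring above is about avoiding overlapping BED segments: inputs are cleaned and merged once up front so later parallel steps can reuse the results. Below is a minimal pure-Python sketch of what merging overlapping intervals means conceptually; it is not bcbio's merge_overlaps (which works on the BED files referenced in the data configuration), just an illustration.

# Illustration only (not bcbio's merge_overlaps): merge overlapping or
# book-ended (chrom, start, end) intervals so each region appears once.
def merge_intervals(intervals):
    merged = []
    for chrom, start, end in sorted(intervals):
        if merged and merged[-1][0] == chrom and start <= merged[-1][2]:
            last_chrom, last_start, last_end = merged[-1]
            merged[-1] = (last_chrom, last_start, max(last_end, end))
        else:
            merged.append((chrom, start, end))
    return merged

print(merge_intervals([("chr1", 100, 200), ("chr1", 150, 300), ("chr2", 10, 20)]))
# [('chr1', 100, 300), ('chr2', 10, 20)]
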
Code Example #2
File: sample.py Project: biocyberman/bcbio-nextgen
def clean_inputs(data):
    """Clean BED input files to avoid overlapping segments that cause downstream issues.

    Pre-merges inputs to avoid needing to call multiple times during later parallel steps.
    """
    if not utils.get_in(data, ("config", "algorithm", "variant_regions_orig")):
        data["config"]["algorithm"]["variant_regions_orig"] = dd.get_variant_regions(data)
    clean_vr = clean_file(dd.get_variant_regions(data), data)
    merged_vr = merge_overlaps(clean_vr, data)
    data["config"]["algorithm"]["variant_regions"] = clean_vr
    data["config"]["algorithm"]["variant_regions_merged"] = merged_vr

    if dd.get_coverage(data):
        if not utils.get_in(data, ("config", "algorithm", "coverage_orig")):
            data["config"]["algorithm"]["coverage_orig"] = dd.get_coverage(data)
        clean_cov_bed = clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True)
        merged_cov_bed = merge_overlaps(clean_cov_bed, data)
        data["config"]["algorithm"]["coverage"] = clean_cov_bed
        data["config"]["algorithm"]["coverage_merged"] = merged_cov_bed

    if 'seq2c' in get_svcallers(data):
        seq2c_ready_bed = prep_seq2c_bed(data)
        if not seq2c_ready_bed:
            logger.warning("Can't run Seq2C without a svregions or variant_regions BED file")
        else:
            data["config"]["algorithm"]["seq2c_bed_ready"] = seq2c_ready_bed
    return data
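
A pattern worth noting in this function is stashing the original BED path under a *_orig key before overwriting the working key with the cleaned file, so the untouched input stays available. A minimal standalone sketch of that idea on a plain nested dict (the paths are hypothetical):

# Hypothetical illustration of the "*_orig" preservation pattern used above.
data = {"config": {"algorithm": {"variant_regions": "/data/regions.bed"}}}

algo = data["config"]["algorithm"]
if not algo.get("variant_regions_orig"):
    algo["variant_regions_orig"] = algo["variant_regions"]  # keep the untouched input
algo["variant_regions"] = "/data/cleaned-regions.bed"       # hypothetical cleaned output

print(algo["variant_regions_orig"])  # /data/regions.bed
print(algo["variant_regions"])       # /data/cleaned-regions.bed
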
Code Example #3
def clean_inputs(data):
    """Clean BED input files to avoid overlapping segments that cause downstream issues.

    Pre-merges inputs to avoid needing to call multiple times during later parallel steps.
    """
    if not utils.get_in(data, ("config", "algorithm", "variant_regions_orig")):
        data["config"]["algorithm"][
            "variant_regions_orig"] = dd.get_variant_regions(data)
    clean_vr = clean_file(dd.get_variant_regions(data), data)
    merged_vr = merge_overlaps(clean_vr, data)
    data["config"]["algorithm"]["variant_regions"] = clean_vr
    data["config"]["algorithm"]["variant_regions_merged"] = merged_vr

    if dd.get_coverage(data) and dd.get_coverage(data) not in ["None"]:
        if not utils.get_in(data, ("config", "algorithm", "coverage_orig")):
            data["config"]["algorithm"]["coverage_orig"] = dd.get_coverage(
                data)
        clean_cov_bed = clean_file(dd.get_coverage(data),
                                   data,
                                   prefix="cov-",
                                   simple=True)
        merged_cov_bed = merge_overlaps(clean_cov_bed, data)
        data["config"]["algorithm"]["coverage"] = clean_cov_bed
        data["config"]["algorithm"]["coverage_merged"] = merged_cov_bed

    if 'seq2c' in get_svcallers(data):
        seq2c_ready_bed = prep_seq2c_bed(data)
        if not seq2c_ready_bed:
            logger.warning(
                "Can't run Seq2C without a svregions or variant_regions BED file"
            )
        else:
            data["config"]["algorithm"]["seq2c_bed_ready"] = seq2c_ready_bed
    return data
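
This version differs from example #2 mainly in the coverage guard: it also rejects the literal string "None", which can show up when a missing configuration value is carried around as text. A quick illustration of why a plain truthiness check is not enough:

# Illustration: the string "None" is truthy, so the extra membership check matters.
coverage = "None"                                   # hypothetical value read from a config
print(bool(coverage))                               # True  -> would wrongly enter the branch
print(bool(coverage and coverage not in ["None"]))  # False -> branch correctly skipped
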
Code Example #4
File: regions.py Project: yangzixu/bcbio-nextgen
def normalize_sv_coverage(*items):
    """Normalize CNV coverage, providing flexible point for multiple methods.
       Don't normalize when running purecn alone
    """
    out = []
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]
    from bcbio.structural import get_svcallers
    sv_callers = get_svcallers(items[0])
    if "gatk-cnv" in sv_callers or "cnvkit" in sv_callers:
        calcfns = {"cnvkit": _normalize_sv_coverage_cnvkit, "gatk-cnv": _normalize_sv_coverage_gatk}
        from bcbio.structural import cnvkit
        from bcbio.structural import shared as sshared
        if all(not cnvkit.use_general_sv_bins(x) for x in items):
            return [[d] for d in items]
        out_files = {}
        back_files = {}
        for group_id, gitems in itertools.groupby(items, lambda x: tz.get_in(["regions", "bins", "group"], x)):
            # No CNVkit calling for this particular set of samples
            if group_id is None:
                continue
            inputs, backgrounds = sshared.find_case_control(list(gitems))
            assert inputs, "Did not find inputs for sample batch: %s" % (" ".join(dd.get_sample_name(x) for x in items))
            work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(inputs[0]), "structural",
                                                       dd.get_sample_name(inputs[0]), "bins"))
            back_files, out_files = calcfns[cnvkit.bin_approach(inputs[0])](group_id, inputs, backgrounds, work_dir,
                                                                        back_files, out_files)
        for data in items:
            if dd.get_sample_name(data) in out_files:
                data["depth"]["bins"]["background"] = back_files[dd.get_sample_name(data)]
                data["depth"]["bins"]["normalized"] = out_files[dd.get_sample_name(data)]
            out.append([data])
    else:
        out = [[d] for d in items]
    return out
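
The grouping step above uses itertools.groupby keyed on the nested ["regions"]["bins"]["group"] value. A detail worth remembering about groupby is that it only merges runs of consecutive items with equal keys, so equal keys usually need to be adjacent (for example via sorting on the same key) before grouping. A small standalone sketch:

import itertools

samples = [{"name": "s1", "group": "0"}, {"name": "s2", "group": "1"},
           {"name": "s3", "group": "0"}]

# groupby only merges runs of consecutive equal keys...
print([(key, [d["name"] for d in grp])
       for key, grp in itertools.groupby(samples, key=lambda d: d["group"])])
# [('0', ['s1']), ('1', ['s2']), ('0', ['s3'])]

# ...so sort on the same key first when equal keys may be scattered.
samples.sort(key=lambda d: d["group"])
print([(key, [d["name"] for d in grp])
       for key, grp in itertools.groupby(samples, key=lambda d: d["group"])])
# [('0', ['s1', 's3']), ('1', ['s2'])]
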
Code Example #5
File: postalign.py Project: skanwal/bcbio-nextgen
def _need_sr_disc_reads(data):
    """Check if we need split and discordant reads in downstream processing.

    We use samblaster when needed and otherwise use an approach that does not
    extract these reads to be less resource intensive.
    """
    from bcbio import structural
    return "lumpy" in structural.get_svcallers(data)
Code Example #6
File: alignment.py Project: chapmanb/bcbio-nextgen
def get_aligner_with_aliases(aligner, data):
    """Retrieve aligner index retriever, including aliases for shared.

    Handles tricky cases like gridss where we need bwa indices even with
    no aligner specified since they're used internally within GRIDSS.
    """
    aligner_aliases = {"sentieon-bwa": "bwa"}
    from bcbio import structural
    if not aligner and "gridss" in structural.get_svcallers(data):
        aligner = "bwa"
    return aligner_aliases.get(aligner) or aligner
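
The final line uses the dict-lookup-with-fallback idiom aliases.get(name) or name: known aliases are mapped, anything else passes through unchanged. A tiny standalone illustration (the extra aligner names are just examples):

# Standalone illustration of the alias fallback in the return statement above.
aligner_aliases = {"sentieon-bwa": "bwa"}
for name in ["sentieon-bwa", "bwa", "minimap2"]:   # example inputs
    print(name, "->", aligner_aliases.get(name) or name)
# sentieon-bwa -> bwa
# bwa -> bwa
# minimap2 -> minimap2

Since the alias values here are never empty strings, aligner_aliases.get(aligner, aligner) would behave equivalently; the "or" form is simply the spelling the original code chose.
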
Code Example #7
File: alignment.py Project: yangzixu/bcbio-nextgen
def get_aligner_with_aliases(aligner, data):
    """Retrieve aligner index retriever, including aliases for shared.

    Handles tricky cases like gridss where we need bwa indices even with
    no aligner specified since they're used internally within GRIDSS.
    """
    aligner_aliases = {"sentieon-bwa": "bwa"}
    from bcbio import structural
    if not aligner and "gridss" in structural.get_svcallers(data):
        aligner = "bwa"
    return aligner_aliases.get(aligner) or aligner
Code Example #8
def calculate_sv_coverage(data):
    """Calculate coverage within bins for downstream CNV calling.
    Creates corrected cnr files with log2 ratios and depths.
    data is one sample
    """
    calcfns = {"cnvkit": _calculate_sv_coverage_cnvkit, "gatk-cnv": _calculate_sv_coverage_gatk}
    from bcbio.structural import cnvkit
    data = utils.to_single_data(data)

    from bcbio.structural import get_svcallers
    sv_callers = get_svcallers(data)
    has_cnvkit_or_gatkcnv = bool(set(["cnvkit", "gatk-cnv"]) & set(sv_callers))

    if not cnvkit.use_general_sv_bins(data) or not has_cnvkit_or_gatkcnv:
        out_target_file, out_anti_file = (None, None)
    else:
        work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                                   dd.get_sample_name(data), "bins"))
        out_target_file, out_anti_file = calcfns[cnvkit.bin_approach(data)](data, work_dir)
        if not os.path.exists(out_target_file):
            out_target_file, out_anti_file = (None, None)

    if "seq2c" in dd.get_svcaller(data):
        from bcbio.structural import seq2c
        seq2c_target = seq2c.precall(data)
    else:
        seq2c_target = None

    if "purecn" in dd.get_svcaller(data):
        # set purecn_pon_build flag
        batches = dd.get_batch(data)
        if batches and "pon_build" in dd.get_batch(data):
            data["config"]["algorithm"]["purecn_pon_build"] = True
        from bcbio.structural import purecn
        # still calculate coverage even when not building pon - for t-only analysis
        purecn_target = purecn.get_coverage(data)
    else:
        purecn_target = None

    if not tz.get_in(["depth", "bins"], data):
        data = tz.update_in(data, ["depth", "bins"], lambda x: {})
    data["depth"]["bins"] = {"target": out_target_file,
                             "antitarget": out_anti_file,
                             "seq2c": seq2c_target,
                             "purecn": purecn_target}
    return [[data]]
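
The last block uses toolz (imported as tz) to read and create nested dictionary paths without manual key checks. Assuming toolz is installed, a minimal standalone sketch of the two helpers involved:

from toolz import get_in, update_in

data = {"depth": {}}
print(get_in(["depth", "bins"], data))                    # None: nested path is missing
data = update_in(data, ["depth", "bins"], lambda x: {})   # returns a new dict with the path created
print(data)                                               # {'depth': {'bins': {}}}

data["depth"]["bins"]["target"] = "target.bed"            # hypothetical value, as set above
print(get_in(["depth", "bins", "target"], data))          # target.bed
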
Code Example #9
File: regions.py Project: yangzixu/bcbio-nextgen
def calculate_sv_bins(*items):
    """Determine bin sizes and regions to use for samples.

    Unified approach to prepare regional bins for coverage calculations across
    multiple CNV callers. Splits into target and antitarget regions allowing
    callers to take advantage of both. Provides consistent target/anti-target
    bin sizes across batches.

    Uses callable_regions as the access BED file and mosdepth regions in
    variant_regions to estimate depth for bin sizes.
    """
    calcfns = {"cnvkit": _calculate_sv_bins_cnvkit, "gatk-cnv": _calculate_sv_bins_gatk}
    from bcbio.structural import cnvkit
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]

    from bcbio.structural import get_svcallers
    sv_callers = get_svcallers(items[0])
    has_cnvkit_gatkcnv = bool(set(sv_callers) & set(["cnvkit", "gatk-cnv"]))

    if all(not cnvkit.use_general_sv_bins(x) for x in items) or not has_cnvkit_gatkcnv:
        return [[d] for d in items]

    out = []
    for i, cnv_group in enumerate(_group_by_cnv_method(multi.group_by_batch(items, False))):
        size_calc_fn = MemoizedSizes(cnv_group.region_file, cnv_group.items).get_target_antitarget_bin_sizes
        for data in cnv_group.items:
            if cnvkit.use_general_sv_bins(data):
                target_bed, anti_bed, gcannotated_tsv = calcfns[cnvkit.bin_approach(data)](data, cnv_group,
                                                                                           size_calc_fn)
                if not data.get("regions"):
                    data["regions"] = {}
                data["regions"]["bins"] = {"target": target_bed, "antitarget": anti_bed, "group": str(i),
                                           "gcannotated": gcannotated_tsv}
            out.append([data])
    if not len(out) == len(items):
        raise AssertionError("Inconsistent samples in and out of SV bin calculation:\nout: %s\nin : %s" %
                             (sorted([dd.get_sample_name(utils.to_single_data(x)) for x in out]),
                              sorted([dd.get_sample_name(x) for x in items])))
    return out
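
The early-return logic above first checks whether any bin-based caller was requested by intersecting the configured SV callers with {"cnvkit", "gatk-cnv"}. The same check in isolation, using hypothetical caller lists:

# Hypothetical caller lists to show the set-intersection check used above.
for requested in (["seq2c", "gatk-cnv"], ["lumpy", "manta"]):
    has_cnvkit_gatkcnv = bool(set(requested) & set(["cnvkit", "gatk-cnv"]))
    print(requested, "->", has_cnvkit_gatkcnv)
# ['seq2c', 'gatk-cnv'] -> True
# ['lumpy', 'manta'] -> False
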