Ejemplo n.º 1
0
def use_general_sv_bins(data):
    """Decide whether a sample should go through general bin calculation.

    Returns True when "cnvkit" or "titancna" is among the sample's
    structural variant callers and ``_get_original_coverage`` finds nothing
    for the sample; returns False otherwise.
    """
    svcallers = dd.get_svcaller(data)
    wants_bins = "cnvkit" in svcallers or "titancna" in svcallers
    # Only look up existing coverage when a bin-based caller is configured.
    return wants_bins and not _get_original_coverage(data)
Ejemplo n.º 2
0
def calculate_sv_coverage(data):
    """Compute per-bin coverage inputs for downstream CNV calling.

    Unwraps the sample, runs the CNVkit or GATK-CNV bin coverage calculation
    when ``cnvkit.use_general_sv_bins`` asks for it, runs the seq2c pre-call
    when "seq2c" is a configured SV caller, and stores the results in
    ``data["depth"]["bins"]`` under "target", "antitarget" and "seq2c".
    Entries that were not calculated are None.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    calcfns = {
        "cnvkit": _calculate_sv_coverage_cnvkit,
        "gatk-cnv": _calculate_sv_coverage_gatk,
    }
    from bcbio.structural import cnvkit
    data = utils.to_single_data(data)

    # Start from "nothing calculated" and fill in when binning applies.
    target_bins = anti_bins = None
    if cnvkit.use_general_sv_bins(data):
        bin_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "structural",
                         dd.get_sample_name(data), "bins"))
        run_coverage = calcfns[cnvkit.bin_approach(data)]
        target_bins, anti_bins = run_coverage(data, bin_dir)
        # Discard both outputs when the target file was not produced.
        if not os.path.exists(target_bins):
            target_bins = anti_bins = None

    seq2c_target = None
    if "seq2c" in dd.get_svcaller(data):
        from bcbio.structural import seq2c
        seq2c_target = seq2c.precall(data)

    # Make sure the nested depth -> bins path exists before assigning to it.
    if not tz.get_in(["depth", "bins"], data):
        data = tz.update_in(data, ["depth", "bins"], lambda _: {})
    data["depth"]["bins"] = {
        "target": target_bins,
        "antitarget": anti_bins,
        "seq2c": seq2c_target,
    }
    return [[data]]
Ejemplo n.º 3
0
def calculate_sv_coverage(data):
    """Prepare binned coverage files used by CNV callers.

    The sample is unwrapped first. If ``cnvkit.use_general_sv_bins`` is true,
    the coverage function matching ``cnvkit.bin_approach`` (CNVkit or
    GATK-CNV) is run inside ``<work_dir>/structural/<sample>/bins``; its
    outputs are dropped when the target file does not exist. A seq2c pre-call
    is run when "seq2c" is a configured SV caller.

    Returns ``[[data]]`` with ``data["depth"]["bins"]`` replaced by a dict
    holding "target", "antitarget" and "seq2c" (None where not calculated).
    """
    coverage_fns = {"cnvkit": _calculate_sv_coverage_cnvkit,
                    "gatk-cnv": _calculate_sv_coverage_gatk}
    from bcbio.structural import cnvkit
    data = utils.to_single_data(data)
    if cnvkit.use_general_sv_bins(data):
        bins_path = os.path.join(dd.get_work_dir(data), "structural",
                                 dd.get_sample_name(data), "bins")
        work_dir = utils.safe_makedir(bins_path)
        approach = cnvkit.bin_approach(data)
        target_file, antitarget_file = coverage_fns[approach](data, work_dir)
        if not os.path.exists(target_file):
            target_file, antitarget_file = None, None
    else:
        target_file, antitarget_file = None, None

    if "seq2c" in dd.get_svcaller(data):
        from bcbio.structural import seq2c
        seq2c_file = seq2c.precall(data)
    else:
        seq2c_file = None

    # Create the depth -> bins container if the sample does not have one yet.
    if not tz.get_in(["depth", "bins"], data):
        data = tz.update_in(data, ["depth", "bins"], lambda _: {})
    data["depth"]["bins"] = dict(target=target_file,
                                 antitarget=antitarget_file,
                                 seq2c=seq2c_file)
    return [[data]]
Ejemplo n.º 4
0
def calculate_sv_coverage(data):
    """Compute bin coverage inputs for CNV calling on a single sample.

    Steps, each conditional on the sample's configured SV callers:

    - CNVkit / GATK-CNV bin coverage, run only when
      ``cnvkit.use_general_sv_bins`` is true and "cnvkit" or "gatk-cnv" is in
      ``get_svcallers(data)``; outputs are dropped if the target file is
      missing.
    - seq2c pre-call when "seq2c" is configured.
    - PureCN coverage when "purecn" is configured; additionally sets
      ``data["config"]["algorithm"]["purecn_pon_build"]`` to True when the
      sample's batch contains "pon_build".

    Returns ``[[data]]`` with ``data["depth"]["bins"]`` holding "target",
    "antitarget", "seq2c" and "purecn" (None where not calculated).
    """
    calcfns = {"cnvkit": _calculate_sv_coverage_cnvkit, "gatk-cnv": _calculate_sv_coverage_gatk}
    from bcbio.structural import cnvkit
    data = utils.to_single_data(data)

    from bcbio.structural import get_svcallers
    has_binned_caller = bool({"cnvkit", "gatk-cnv"}.intersection(get_svcallers(data)))

    target_bins = anti_bins = None
    if cnvkit.use_general_sv_bins(data) and has_binned_caller:
        bin_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                                  dd.get_sample_name(data), "bins"))
        target_bins, anti_bins = calcfns[cnvkit.bin_approach(data)](data, bin_dir)
        if not os.path.exists(target_bins):
            target_bins = anti_bins = None

    seq2c_target = None
    if "seq2c" in dd.get_svcaller(data):
        from bcbio.structural import seq2c
        seq2c_target = seq2c.precall(data)

    purecn_target = None
    if "purecn" in dd.get_svcaller(data):
        # Flag samples that belong to a panel-of-normals build batch.
        batches = dd.get_batch(data)
        if batches and "pon_build" in batches:
            data["config"]["algorithm"]["purecn_pon_build"] = True
        from bcbio.structural import purecn
        # Coverage is still calculated when not building a PON - for t-only analysis.
        purecn_target = purecn.get_coverage(data)

    if not tz.get_in(["depth", "bins"], data):
        data = tz.update_in(data, ["depth", "bins"], lambda _: {})
    data["depth"]["bins"] = {
        "target": target_bins,
        "antitarget": anti_bins,
        "seq2c": seq2c_target,
        "purecn": purecn_target,
    }
    return [[data]]
Ejemplo n.º 5
0
def _variant_checkpoints(samples):
    """Summarize which analysis steps the sample configurations require.

    Returns a dict of booleans keyed by "vc", "sv", "jointvc" and "hla"; each
    is True when at least one sample enables the corresponding feature.
    """
    def _any_sample(check):
        # Evaluate the check on every sample before reducing (no short-circuit).
        return any([check(d) for d in samples])

    return {
        "vc": _any_sample(dd.get_variantcaller),
        "sv": _any_sample(dd.get_svcaller),
        "jointvc": _any_sample(
            lambda d: dd.get_jointcaller(d) or ("gvcf" in dd.get_tools_on(d))),
        "hla": _any_sample(dd.get_hlacaller),
    }
Ejemplo n.º 6
0
def use_general_sv_bins(data):
    """Decide whether a sample should go through general bin calculation.

    Returns True when any of "cnvkit", "titancna", "purecn" or "gatk-cnv" is
    among the sample's structural variant callers and
    ``_get_original_coverage`` finds nothing for the sample; False otherwise.
    """
    bin_based_callers = ("cnvkit", "titancna", "purecn", "gatk-cnv")
    configured = dd.get_svcaller(data)
    if not any(caller in configured for caller in bin_based_callers):
        return False
    return not _get_original_coverage(data)
Ejemplo n.º 7
0
def _variant_checkpoints(samples):
    """Inspect sample configuration to find the analysis steps that are needed.

    Returns a dict mapping "vc", "sv", "jointvc" and "hla" to booleans that
    are True when at least one sample requests that step.
    """
    # (checkpoint name, per-sample test) pairs, in output order.
    checks = [
        ("vc", dd.get_variantcaller),
        ("sv", dd.get_svcaller),
        ("jointvc", lambda d: dd.get_jointcaller(d) or ("gvcf" in dd.get_tools_on(d))),
        ("hla", dd.get_hlacaller),
    ]
    checkpoints = {}
    for name, check in checks:
        # Build the full list first so every sample is evaluated.
        checkpoints[name] = any([check(d) for d in samples])
    return checkpoints
Ejemplo n.º 8
0
def bin_approach(data):
    """Check for binning approach from configuration or normalized file.

    An explicitly configured SV caller wins: "cnvkit" is checked first, then
    "gatk-cnv". Otherwise the approach is inferred from the suffix of the
    normalized bin file stored at ``data["depth"]["bins"]["normalized"]``.

    Returns:
        "cnvkit", "gatk-cnv", or None when no caller is configured and no
        normalized file with a recognized suffix is available.
    """
    svcallers = dd.get_svcaller(data)
    for approach in ["cnvkit", "gatk-cnv"]:
        if approach in svcallers:
            return approach
    norm_file = tz.get_in(["depth", "bins", "normalized"], data)
    if not norm_file:
        # get_in returns None when the path is absent; calling .endswith on it
        # would raise AttributeError, so report "unknown" instead.
        return None
    if norm_file.endswith(("-crstandardized.tsv", "-crdenoised.tsv")):
        return "gatk-cnv"
    if norm_file.endswith(".cnr"):
        return "cnvkit"
    return None
Ejemplo n.º 9
0
def bin_approach(data):
    """Check for binning approach from configuration or normalized file.

    Looks for "cnvkit" and then "gatk-cnv" among the sample's configured SV
    callers. If neither is configured, falls back to the file suffix of the
    normalized bins recorded under ``depth -> bins -> normalized``.

    Returns:
        "cnvkit" or "gatk-cnv" when the approach can be determined, otherwise
        None (no matching caller and no recognizable normalized file).
    """
    configured = dd.get_svcaller(data)
    for approach in ["cnvkit", "gatk-cnv"]:
        if approach in configured:
            return approach
    norm_file = tz.get_in(["depth", "bins", "normalized"], data)
    # The normalized file may not have been recorded yet (get_in gives None);
    # guard so that case returns None rather than raising AttributeError.
    if norm_file:
        if norm_file.endswith(("-crstandardized.tsv", "-crdenoised.tsv")):
            return "gatk-cnv"
        if norm_file.endswith(".cnr"):
            return "cnvkit"
    return None
Ejemplo n.º 10
0
def _variant_checkpoints(samples):
    """Summarize which analysis steps the sample configurations require.

    Returns a dict of booleans keyed by "vc", "sv", "jointvc", "hla", "align"
    and "align_split". Each per-sample test is run on every sample before the
    results are reduced.
    """
    def wants_joint(d):
        # Joint calling needs a joint caller or gVCF output, plus a batch.
        return (dd.get_jointcaller(d) or ("gvcf" in dd.get_tools_on(d))) and dd.get_batch(d)

    def has_alignment(d):
        return dd.get_aligner(d) or dd.get_bam_clean(d)

    def splits_alignment(d):
        # Splitting applies unless explicitly disabled (False) or no aligner is set.
        return dd.get_align_split_size(d) is not False and dd.get_aligner(d)

    return {
        "vc": any(list(map(dd.get_variantcaller, samples))),
        "sv": any(list(map(dd.get_svcaller, samples))),
        "jointvc": any(list(map(wants_joint, samples))),
        "hla": any(list(map(dd.get_hlacaller, samples))),
        "align": any(list(map(has_alignment, samples))),
        "align_split": any(list(map(splits_alignment, samples))),
    }
Ejemplo n.º 11
0
def detect_sv(items, all_items=None, stage="standard"):
    """Top level parallel target for examining structural variation.

    items = sample-sv_caller list, from one batch

    The SV caller name is read from the first item's algorithm configuration
    and resolved to a function through ``_get_callers``. Callers listed in
    ``_NEEDS_BACKGROUND`` receive the remaining samples from ``all_items`` as
    background when the items are not a paired analysis. When no caller
    applies, the items are passed through unchanged.

    Returns a list of single-element lists, one per resulting sample. For CWL
    runs each sample is additionally validated and its "sv" entry flattened.
    Returns an empty list for a "pon_build" batch with PureCN configured.
    """
    items = [utils.to_single_data(item) for item in items]
    items = cwlutils.unpack_tarballs(items, items[0])
    first = items[0]
    svcaller = first["config"]["algorithm"].get("svcaller")
    caller_fn = _get_callers(items, stage, special_cases=True).get(svcaller)
    batch = dd.get_batch(first)
    # no SV calling when just creating a PON for PureCN
    if batch == "pon_build" and "purecn" in dd.get_svcaller(first):
        return []

    if not (svcaller and caller_fn):
        # Nothing to run: hand the samples back untouched.
        out = [[data] for data in items]
    else:
        needs_background = (
            all_items and svcaller in _NEEDS_BACKGROUND
            and not vcfutils.is_paired_analysis(
                [x.get("align_bam") for x in items], items))
        if needs_background:
            # Background is every other sample not part of this batch of items.
            batch_names = {dd.get_sample_name(x) for x in items}
            background = [
                x for x in all_items
                if dd.get_sample_name(x) not in batch_names
            ]
            called = caller_fn(items, background)
        else:
            called = caller_fn(items)
        out = [[svdata] for svdata in called]

    if not cwlutils.is_cwl_run(items[0]):
        return out

    # Avoid nesting of callers for CWL runs for easier extraction
    out_cwl = []
    for data in [utils.to_single_data(x) for x in out]:
        # Run validation directly from CWL runs since we're single stage
        data = validate.evaluate(data)
        data["svvalidate"] = {"summary": tz.get_in(["sv-validate", "csv"], data)}
        svs = data.get("sv")
        if svs:
            assert len(svs) == 1, svs
            data["sv"] = svs[0]
        else:
            data["sv"] = {}
        out_cwl.append([_add_supplemental(data)])
    return out_cwl
Ejemplo n.º 12
0
def _variant_checkpoints(samples):
    """Summarize which analysis steps the sample configurations require.

    Returns a dict of booleans keyed by "vc", "sv", "jointvc", "hla",
    "align", "align_split", "umi", "ensemble" and "cancer".
    """
    def _any_sample(check):
        # Evaluate the check on every sample before reducing (no short-circuit).
        return any([check(d) for d in samples])

    return {
        "vc": _any_sample(lambda d: dd.get_variantcaller(d) or d.get("vrn_file")),
        "sv": _any_sample(dd.get_svcaller),
        "jointvc": _any_sample(
            lambda d: dd.get_jointcaller(d) or "gvcf" in dd.get_tools_on(d)),
        "hla": _any_sample(dd.get_hlacaller),
        "align": _any_sample(lambda d: dd.get_aligner(d) or dd.get_bam_clean(d)),
        # True unless every sample disables splitting (False) or has no aligner.
        "align_split": _any_sample(
            lambda d: dd.get_align_split_size(d) is not False and dd.get_aligner(d)),
        "umi": _any_sample(dd.get_umi_consensus),
        "ensemble": _any_sample(dd.get_ensemble),
        # Lazy on purpose: stops at the first tumor sample.
        "cancer": any(dd.get_phenotype(d) in ["tumor"] for d in samples),
    }
Ejemplo n.º 13
0
def _variant_checkpoints(samples):
    """Inspect sample configuration to find the analysis steps that are needed.

    Returns a dict mapping "vc", "sv", "jointvc", "hla", "align",
    "align_split", "archive", "umi", "ensemble" and "cancer" to booleans.
    """
    # (checkpoint name, per-sample test) pairs, in output order. Each test is
    # run on every sample before the results are reduced with any().
    eager_checks = [
        ("vc", lambda d: dd.get_variantcaller(d) or d.get("vrn_file")),
        ("sv", dd.get_svcaller),
        ("jointvc", lambda d: dd.get_jointcaller(d) or "gvcf" in dd.get_tools_on(d)),
        ("hla", dd.get_hlacaller),
        ("align", lambda d: dd.get_aligner(d) or dd.get_bam_clean(d)),
        # Splitting applies unless disabled (False) or no aligner is configured.
        ("align_split",
         lambda d: dd.get_align_split_size(d) is not False and dd.get_aligner(d)),
        ("archive", dd.get_archive),
        ("umi", dd.get_umi_consensus),
        ("ensemble", dd.get_ensemble),
    ]
    checkpoints = {}
    for name, check in eager_checks:
        checkpoints[name] = any([check(d) for d in samples])
    # Evaluated lazily: stops at the first tumor sample.
    checkpoints["cancer"] = any(dd.get_phenotype(d) in ["tumor"] for d in samples)
    return checkpoints
Ejemplo n.º 14
0
def mutect2_caller(align_bams, items, ref_file, assoc_files,
                       region=None, out_file=None):
    """Call variation with GATK's MuTect2.

    This requires the full non open-source version of GATK 3.5+.
    items = 1 sample or T/N pair

    Calling is skipped when ``utils.file_exists(out_file)`` is already true.
    Otherwise one of two paths is taken, chosen from the first item:

    - "purecn" among the SV callers: Mutect2 is run with settings shared by
      PureCN panel-of-normals creation, tumor-only and tumor/normal runs,
      followed by ``_mutect2_filter``. No allele-frequency filter is applied.
    - otherwise: a regular MuTect2/Mutect2 call (GATK4 adds filtering and an
      optional read orientation model), followed by ``_af_filter``.

    Args:
        align_bams: input BAM files; the first one names the default output.
        items: sample dictionaries; ``items[0]`` supplies the configuration.
        ref_file: reference file passed to GATK (``--reference`` / ``-R``).
        assoc_files: not used by the active code in this function (only
            referenced from a commented-out line).
        region: region restriction forwarded to ``_add_region_params``.
        out_file: output VCF path; defaults to
            ``<first BAM without extension>-variants.vcf.gz``.

    Returns:
        The result of ``vcfutils.bgzip_and_index`` on the output file.

    NOTE: the shell command templates below are filled with
    ``cmd.format(**locals())``, so the local variable names they reference
    (``gatk_cmd``, ``filter_cmd``, ``tx_raw_prefilt_file``,
    ``tx_read_orient_cmd``, ``tx_raw_file``) must not be renamed.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        # call somatic variants keeping germline sites and using germline 1KG resource
        # use --native-pair-hmm-threads?
        broad_runner = broad.runner_from_config(items[0]["config"])
        gatk_type = broad_runner.gatk_type()
        # shared Mutect2 settings for PureCN analysis in the case of:
        # - PON creation
        # - Tumor-only PureCN run
        # - T/N PureCN run
        # PURECN requirement alters Mutect2 variants calling!
        if "purecn" in dd.get_svcaller(items[0]):
            # mutect call for PON creation or purecn T-only analysis
            _prep_inputs(align_bams, ref_file, items)
            with file_transaction(items[0], out_file) as tx_out_file:
                # The germline resource path is resolved relative to the
                # directory that holds the reference file.
                germline_resource = tz.get_in(["genome_resources", "variation", "af_only_gnomad"], items[0])
                germline_path = os.path.normpath(os.path.join(os.path.dirname(ref_file), germline_resource))
                input_bam = dd.get_work_bam(items[0])
                tx_prefilt_vcf = utils.splitext_plus(tx_out_file)[0] + ".prefilt.vcf"
                # NOTE(review): tx_vcf is not referenced by the command
                # templates in this branch - confirm it is unused.
                tx_vcf = os.path.splitext(tx_out_file)[0]
                # Final output path with only the last extension removed.
                out_file_ungz = os.path.splitext(out_file)[0]
                params = ["-T", "Mutect2"]
                # T/N pair
                if len(items) == 2:
                    paired = vcfutils.get_paired_bams(align_bams, items)
                    # not really running purecn with mutect1/gatk3
                    params += _add_tumor_params(paired, items, gatk_type)
                    logger.debug("You are running mutect2 in PureCN analysis in T/N mode, T-only + PON is recommended")
                else: #T only
                    params += ["-I", input_bam]
                    # adding SNV PON from background/variant
                    # (skipped for the "pon_build" batch itself)
                    snv_pon = tz.get_in(["config", "algorithm", "background", "variant"], items[0])
                    if snv_pon and dd.get_batch(items[0]) != "pon_build":
                        params += ["-pon", snv_pon]
                        params += ["--genotype-pon-sites"]

                opt_list = config_utils.get_resources("mutect2", items[0]["config"]).get("options")
                # default is 50, sometimes 100 or 200 is recommended for better sensitivity in detection
                # hom del CNVs (calling more variants helps)
                interval_padding = 50
                if opt_list:
                    # Treats the options as alternating flag/value pairs.
                    opt_dict = dict(zip(opt_list[::2], opt_list[1::2]))
                    # NOTE(review): the key looked up here uses an underscore
                    # ("--interval_padding") while the flag passed to GATK
                    # below uses a hyphen ("--interval-padding") - confirm
                    # which spelling users are expected to configure.
                    if "--interval_padding" in opt_dict:
                        interval_padding = opt_dict["--interval_padding"]

                params += ["--max-mnp-distance", "0",
                           "--interval-padding", interval_padding,
                           "--germline-resource", germline_path,
                           "--genotype-germline-sites",
                           "--reference", ref_file,
                           "-O", tx_prefilt_vcf]

                params += _add_region_params(region, out_file, items, gatk_type)
                broad_runner.new_resources("mutect2")
                gatk_cmd = broad_runner.cl_gatk(params, os.path.dirname(tx_out_file))
                # The filter step writes to out_file_ungz, which is derived
                # from out_file rather than from the transactional path.
                filter_cmd = _mutect2_filter(broad_runner, items, tx_prefilt_vcf, out_file_ungz, ref_file)
                cmd = "{gatk_cmd} && {filter_cmd}"
                do.run(cmd.format(**locals()), "MuTect2")
                # no AF filter for PureCN variants
                out_file = vcfutils.bgzip_and_index(out_file_ungz, items[0]["config"])
        else:
            # a regular mutect call
            paired = vcfutils.get_paired_bams(align_bams, items)
            f1r2_file = None
            _prep_inputs(align_bams, ref_file, items)
            with file_transaction(items[0], out_file) as tx_out_file:
                # The tool name and reference flag differ between GATK4 and older GATK.
                params = ["-T", "Mutect2" if gatk_type == "gatk4" else "MuTect2",
                          "--annotation", "ClippingRankSumTest",
                          "--annotation", "DepthPerSampleHC"]
                if gatk_type == "gatk4":
                    params += ["--reference", ref_file]
                else:
                    params += ["-R", ref_file]
                for a in annotation.get_gatk_annotations(items[0]["config"], include_baseqranksum=False):
                    params += ["--annotation", a]
                # Avoid issues with BAM CIGAR reads that GATK doesn't like
                if gatk_type == "gatk4":
                    params += ["--read-validation-stringency", "LENIENT"]
                params += _add_tumor_params(paired, items, gatk_type)
                params += _add_region_params(region, out_file, items, gatk_type)
                # The read orientation model is used only when every BAM passes
                # is_paired and "mutect2_readmodel" is found by the lookup below.
                # NOTE(review): confirm utils.get_in's signature; the key path is
                # passed here as two separate positional strings.
                if all(is_paired(bam) for bam in align_bams) and (
                        "mutect2_readmodel" in utils.get_in(items[0], "config", "tools_on")):
                    orientation_filter = True
                else:
                    orientation_filter = False

                if gatk_type == "gatk4" and orientation_filter:
                    f1r2_file = "{}-f1r2.tar.gz".format(utils.splitext_plus(out_file)[0])
                    params += ["--f1r2-tar-gz", f1r2_file]

                # Avoid adding dbSNP/Cosmic so they do not get fed to variant filtering algorithm
                # Not yet clear how this helps or hurts in a general case.
                #params += _add_assoc_params(assoc_files)
                resources = config_utils.get_resources("mutect2", items[0]["config"])
                if "options" in resources:
                    params += [str(x) for x in resources.get("options", [])]
                assert LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.5"), \
                    "Require full version of GATK 3.5+ for mutect2 calling"
                broad_runner.new_resources("mutect2")
                gatk_cmd = broad_runner.cl_gatk(params, os.path.dirname(tx_out_file))
                if gatk_type == "gatk4":
                    # The raw (pre-filter) calls are named from out_file, the
                    # filtered calls from the transactional tx_out_file.
                    tx_raw_prefilt_file = "%s-raw%s" % utils.splitext_plus(out_file)
                    tx_raw_file = "%s-raw-filt%s" % utils.splitext_plus(tx_out_file)

                    if orientation_filter:
                        tx_f1r2_file = "{}-read-orientation-model.tar.gz"
                        tx_f1r2_file = tx_f1r2_file.format(utils.splitext_plus(f1r2_file)[0])
                        tx_read_orient_cmd = _mutect2_read_filter(broad_runner,
                                                                  f1r2_file,
                                                                  tx_f1r2_file)

                        filter_cmd = _mutect2_filter(broad_runner, items,
                                                     tx_raw_prefilt_file,
                                                     tx_raw_file, ref_file,
                                                     tx_f1r2_file)
                    else:
                        filter_cmd = _mutect2_filter(broad_runner, items,
                                                     tx_raw_prefilt_file,
                                                     tx_raw_file, ref_file)
                    # Chain calling, optional orientation model and filtering
                    # into a single shell command.
                    if orientation_filter:
                        cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {tx_read_orient_cmd} && {filter_cmd}"
                    else:
                        cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {filter_cmd}"
                else:
                    # Non-GATK4: the command's stdout is redirected to the raw file.
                    tx_raw_file = "%s-raw%s" % utils.splitext_plus(tx_out_file)
                    cmd = "{gatk_cmd} > {tx_raw_file}"
                do.run(cmd.format(**locals()), "MuTect2")
                out_file = _af_filter(paired.tumor_data, tx_raw_file, out_file)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])