Esempio n. 1
0
def _fix_gatk_header(exist_files, out_file, config):
    """Ensure consistent headers for VCF concatenation.

    Fixes problems for genomes that start with chrM by reheadering the first file.
    These files do haploid variant calling which lack the PID phasing key/value
    pair in FORMAT, so initial chrM samples cause errors during concatenation
    due to the lack of header merging. This fixes this by updating the first header.

    exist_files is a list of (contig, VCF file) pairs in concatenation order,
    out_file is the final merged output (the reheadered file is placed in the
    same directory) and config is the run configuration. Returns the VCF files
    to concatenate, with the first one swapped for its reheadered copy.
    """
    from bcbio.variation import ploidy
    first_contig, base_file = exist_files[0]
    replace_file = base_file
    items = [{"config": config}]
    # When the first contig is haploid, borrow the header from the first file
    # whose contig has ploidy above one.
    if ploidy.get_ploidy(items, region=(first_contig, 1, 2)) == 1:
        for contig, vcf_file in exist_files[1:]:
            if ploidy.get_ploidy(items, (contig, 1, 2)) > 1:
                replace_file = vcf_file
                break
    base_fix_file = os.path.join(os.path.dirname(out_file),
                                 "%s-fixheader%s" % utils.splitext_plus(os.path.basename(base_file)))
    with file_transaction(config, base_fix_file) as tx_out_file:
        header_file = "%s-header.vcf" % utils.splitext_plus(tx_out_file)[0]
        do.run("zgrep ^# %s > %s"
               % (replace_file, header_file), "Prepare header file for merging")
        resources = config_utils.get_resources("picard", config)
        ropts = [str(x) for x in resources.get("options", [])]
        # Write to the transactional path rather than the final location so a
        # failed or interrupted run never leaves a partial reheadered file behind.
        do.run("%s && picard FixVcfHeader HEADER=%s INPUT=%s OUTPUT=%s %s" %
               (utils.get_java_clprep(), header_file, base_file, tx_out_file, " ".join(ropts)),
               "Reheader initial VCF file in merge")
    bgzip_and_index(base_fix_file, config)
    return [base_fix_file] + [vcf_file for (_, vcf_file) in exist_files[1:]]
Esempio n. 2
0
def _fix_gatk_header(exist_files, out_file, config):
    """Ensure consistent headers for VCF concatenation.

    Fixes problems for genomes that start with chrM by reheadering the first file.
    These files do haploid variant calling which lack the PID phasing key/value
    pair in FORMAT, so initial chrM samples cause errors during concatenation
    due to the lack of header merging. This fixes this by updating the first header.

    exist_files holds (contig, VCF file) pairs in concatenation order; the
    reheadered copy of the first file is created in the directory of out_file.
    Returns the list of VCF files to concatenate, starting with that copy.
    """
    from bcbio.variation import ploidy
    first_contig, base_file = exist_files[0]
    replace_file = base_file
    items = [{"config": config}]
    # A haploid first contig takes its header from the first later file whose
    # contig has ploidy above one.
    if ploidy.get_ploidy(items, region=(first_contig, 1, 2)) == 1:
        for contig, vcf_file in exist_files[1:]:
            if ploidy.get_ploidy(items, (contig, 1, 2)) > 1:
                replace_file = vcf_file
                break
    base_fix_file = os.path.join(
        os.path.dirname(out_file),
        "%s-fixheader%s" % utils.splitext_plus(os.path.basename(base_file)))
    with file_transaction(config, base_fix_file) as tx_out_file:
        header_file = "%s-header.vcf" % utils.splitext_plus(tx_out_file)[0]
        do.run("zgrep ^# %s > %s" % (replace_file, header_file),
               "Prepare header file for merging")
        # Picard writes into the transaction file; the final path only appears
        # once the command has completed successfully.
        do.run(
            "%s && picard FixVcfHeader HEADER=%s INPUT=%s OUTPUT=%s" %
            (utils.get_java_clprep(), header_file, base_file, tx_out_file),
            "Reheader initial VCF file in merge")
    bgzip_and_index(base_fix_file, config)
    return [base_fix_file] + [vcf_file for (_, vcf_file) in exist_files[1:]]
Esempio n. 3
0
def _fix_gatk_header(exist_files, out_file, config):
    """Ensure consistent headers for VCF concatenation.

    Fixes problems for genomes that start with chrM by reheadering the first file.
    These files do haploid variant calling which lack the PID phasing key/value
    pair in FORMAT, so initial chrM samples cause errors during concatenation
    due to the lack of header merging. This fixes this by updating the first header.

    exist_files holds (contig, VCF file) pairs in concatenation order; the
    reheadered copy of the first file is written next to out_file with
    ``bcftools reheader``. Returns the list of VCF files to concatenate,
    starting with that copy.
    """
    from bcbio.variation import ploidy
    first_contig, base_file = exist_files[0]
    replace_file = base_file
    items = [{"config": config}]
    # A haploid first contig takes its header from the first later file whose
    # contig has ploidy above one.
    if ploidy.get_ploidy(items, region=(first_contig, 1, 2)) == 1:
        for contig, vcf_file in exist_files[1:]:
            if ploidy.get_ploidy(items, (contig, 1, 2)) > 1:
                replace_file = vcf_file
                break
    base_fix_file = os.path.join(os.path.dirname(out_file),
                                 "%s-fixheader%s" % utils.splitext_plus(os.path.basename(base_file)))
    with file_transaction(config, base_fix_file) as tx_out_file:
        header_file = "%s-header.vcf" % utils.splitext_plus(tx_out_file)[0]
        do.run("zgrep ^# %s > %s"
               % (replace_file, header_file), "Prepare header file for merging")
        # The picard resource options collected by earlier versions were never
        # passed to bcftools, so that lookup has been dropped.
        bcftools = config_utils.get_program("bcftools", config)
        cmd = f"{bcftools} reheader --header {header_file} --output {tx_out_file} {base_file}"
        message = f"Reheader {base_file} with header from {replace_file}."
        do.run(cmd, message)
    bgzip_and_index(base_fix_file, config)
    return [base_fix_file] + [vcf_file for (_, vcf_file) in exist_files[1:]]
Esempio n. 4
0
def _add_variantcalls_to_output(out, data, items, is_somatic=False):
    """Call ploidy and convert into VCF and BED representations.

    Runs ``cnvkit.py call`` on the segments in ``out["cns"]`` and then
    ``cnvkit.py export`` to BED and VCF, skipping any step whose output file
    already exists. Adds ``call_file``, ``vrn_bed`` and ``vrn_file`` entries
    to ``out`` and returns it.
    """
    cnvkit = os.path.join(os.path.dirname(sys.executable), "cnvkit.py")
    # Looked up before the call step: the export step below also needs it, even
    # when the call file is already present from an earlier run. Previously the
    # name was only bound inside the call branch and exporting could fail with
    # an UnboundLocalError.
    gender = population.get_gender(data)
    is_male = bool(gender) and gender.lower() == "male"
    call_file = "%s-call%s" % os.path.splitext(out["cns"])
    if not utils.file_exists(call_file):
        with file_transaction(data, call_file) as tx_call_file:
            cmd = [cnvkit, "call", "--filter", "cn",
                   "--ploidy", str(ploidy.get_ploidy([data])),
                   "-o", tx_call_file, out["cns"]]
            small_vrn_files = _compatible_small_variants(data, items)
            if len(small_vrn_files) > 0 and _cna_has_values(out["cns"]):
                cmd += [
                    "--vcf", small_vrn_files[0].name, "--sample-id",
                    small_vrn_files[0].sample
                ]
                if small_vrn_files[0].normal:
                    cmd += ["--normal-id", small_vrn_files[0].normal]
                if not is_somatic:
                    cmd += ["-m", "clonal"]
            if gender and gender.lower() != "unknown":
                cmd += ["--gender", gender]
                if is_male:
                    cmd += ["--male-reference"]
            do.run(cmd, "CNVkit call ploidy")
    calls = {}
    for outformat in ["bed", "vcf"]:
        out_file = "%s.%s" % (os.path.splitext(call_file)[0], outformat)
        calls[outformat] = out_file
        if not os.path.exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                cmd = [
                    cnvkit, "export", outformat, "--sample-id",
                    dd.get_sample_name(data), "--ploidy",
                    str(ploidy.get_ploidy([data])), "-o", tx_out_file,
                    call_file
                ]
                if is_male:
                    cmd += ["--male-reference"]
                do.run(cmd, "CNVkit export %s" % outformat)
    out["call_file"] = call_file
    out["vrn_bed"] = annotate.add_genes(calls["bed"], data)
    effects_vcf, _ = effects.add_to_vcf(calls["vcf"], data, "snpeff")
    # Fall back to the plain export when no effects-annotated VCF is returned
    out["vrn_file"] = effects_vcf or calls["vcf"]
    return out
Esempio n. 5
0
def unified_genotyper(align_bams, items, ref_file, assoc_files,
                      region=None, out_file=None):
    """Call variants on the input BAM files with GATK's UnifiedGenotyper.

    Defaults the output to ``<first BAM>-variants.vcf``, reuses an existing
    output, and writes an empty VCF when the region is not a list/tuple and
    some input BAM has no aligned reads in it. Returns the output VCF path.
    """
    if out_file is None:
        bam_base = os.path.splitext(align_bams[0])[0]
        out_file = "%s-variants.vcf" % bam_base
    if file_exists(out_file):
        return out_file
    broad_runner, params = _shared_gatk_call_prep(
        align_bams, ref_file, items[0]["config"], assoc_files["dbsnp"],
        region, out_file)
    region_is_coords = isinstance(region, (list, tuple))
    if not region_is_coords and not all(has_aligned_reads(bam, region) for bam in align_bams):
        vcfutils.write_empty_vcf(out_file)
        return out_file
    with file_transaction(out_file) as tx_out_file:
        # Only the "restricted" GATK type gets the computed ploidy; others use 2
        if broad_runner.gatk_type() == "restricted":
            ploidy_arg = str(ploidy.get_ploidy(items, region))
        else:
            ploidy_arg = "2"
        params.extend(["-T", "UnifiedGenotyper",
                       "-o", tx_out_file,
                       "-ploidy", ploidy_arg,
                       "--genotype_likelihoods_model", "BOTH"])
        broad_runner.run_gatk(params)
    return out_file
Esempio n. 6
0
def _get_ploidy(regions, items, base_file):
    """Write a VCF with one copy number record per region and sample.

    The file is named ``<base_file stem>-ploidy.vcf``; each record carries a
    CN value per sample taken from ``ploidy.get_ploidy``. Nothing is written
    when the plain or ``.gz`` output already exists. Returns the result of
    bgzipping and indexing the output.
    """
    samples = [dd.get_sample_name(d) for d in items]
    out_file = "%s-ploidy.vcf" % utils.splitext_plus(base_file)[0]
    if not (utils.file_exists(out_file) or utils.file_exists(out_file + ".gz")):
        meta_lines = (
            "##fileformat=VCFv4.1\n",
            '##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">\n',
            '##FORMAT=<ID=CN,Number=1,Type=Integer,Description="Copy number genotype for imprecise events">\n',
        )
        with file_transaction(items[0], out_file) as tx_outfile:
            with open(tx_outfile, "w") as out_handle:
                for meta_line in meta_lines:
                    out_handle.write(meta_line)
                out_handle.write(
                    "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" +
                    "\t".join(samples) + "\n")
                for region in regions:
                    copy_numbers = [str(ploidy.get_ploidy([d], region)) for d in items]
                    fixed_fields = [region[0], str(region[1]), ".", "N", "<CNV>",
                                    ".", ".", "END=%s" % region[2], "CN"]
                    out_handle.write("\t".join(fixed_fields + copy_numbers) + "\n")
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
Esempio n. 7
0
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Build the list of FreeBayes command line options from the configuration.

    Variant region BED files are merged first so overlapping regions do not
    make FreeBayes call the same positions multiple times. Options from the
    "freebayes" resources are appended, and a minimum alternate fraction is
    added unless one already appears among the options.
    """
    opts = ["--ploidy", str(ploidy.get_ploidy(items, region))]

    configured_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    variant_regions = bedutils.merge_overlaps(configured_regions, items[0])
    target = subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        target_is_file = isinstance(target, basestring) and os.path.isfile(target)
        if target_is_file:
            opts.extend(["--targets", target])
        else:
            opts.extend(["--region", region_to_freebayes(target)])
    extra_opts = config_utils.get_resources("freebayes", config).get("options")
    if extra_opts:
        opts += extra_opts
    flat_opts = " ".join(opts)
    if not ("--min-alternate-fraction" in flat_opts or "-F" in flat_opts):
        # min_allele_fraction is configured as a percentage (default 20) and
        # passed to FreeBayes as a fraction
        percent = utils.get_in(config, ("algorithm", "min_allele_fraction"), 20)
        opts.extend(["--min-alternate-fraction", str(float(percent) / 100.0)])
    return opts
Esempio n. 8
0
def _run_genotype_gvcfs_genomicsdb(genomics_db, region, out_file, data):
    """Run GenotypeGVCFs on a merged GenomicsDB input for a single region.

    An existing out_file is reused. Memory is scaled up by 0.9 times the core
    count when more than one core is configured. Returns the result of
    bgzipping and indexing out_file.
    """
    config = data["config"]
    if utils.file_exists(out_file):
        return vcfutils.bgzip_and_index(out_file, config)
    with file_transaction(data, out_file) as tx_out_file:
        broad_runner = broad.runner_from_config(data["config"])
        # see issue https://github.com/bcbio/bcbio-nextgen/issues/3263
        # for why --genomicsdb-use-vcf-codec is necessary
        params = ["-T", "GenotypeGVCFs",
                  "--variant", "gendb://%s" % genomics_db,
                  "-R", dd.get_ref_file(data),
                  "--genomicsdb-use-vcf-codec",
                  "--output", tx_out_file,
                  "-L", bamprep.region_to_gatk(region)]
        params.extend(["-ploidy", str(ploidy.get_ploidy([data], region))])
        # NOTE(review): the original comment here pointed at GATK forum tips on
        # slow GenotypeGVCFs runtimes, but no dedicated flag is added in this
        # block -- only user supplied "gatk" resource options are appended.
        # https://gatkforums.broadinstitute.org/gatk/discussion/11471/performance-troubleshooting-tips-for-genotypegvcfs/p1
        gatk_resources = config_utils.get_resources("gatk", data["config"])
        params.extend(str(opt) for opt in gatk_resources.get("options", []))
        cores = dd.get_cores(data)
        if cores > 1:
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
        else:
            memscale = None
        broad_runner.run_gatk(params, memscale=memscale)
    return vcfutils.bgzip_and_index(out_file, config)
Esempio n. 9
0
def unified_genotyper(align_bams, items, ref_file, assoc_files,
                      region=None, out_file=None):
    """Call variants on the input BAM files with GATK's UnifiedGenotyper.

    Defaults the output to ``<first BAM>-variants.vcf.gz`` and reuses it when
    it already exists. Options from the "gatk" resources are appended to the
    command line. Returns the output VCF path.
    """
    if out_file is None:
        bam_base = utils.splitext_plus(align_bams[0])[0]
        out_file = "%s-variants.vcf.gz" % bam_base
    if utils.file_exists(out_file):
        return out_file
    broad_runner, params = _shared_gatk_call_prep(
        align_bams, items, ref_file, assoc_files.get("dbsnp"), region, out_file)
    with file_transaction(items[0], out_file) as tx_out_file:
        # Only the "restricted" GATK type gets the computed ploidy; others use 2
        if broad_runner.gatk_type() == "restricted":
            ploidy_arg = str(ploidy.get_ploidy(items, region))
        else:
            ploidy_arg = "2"
        params.extend(["-T", "UnifiedGenotyper",
                       "-o", tx_out_file,
                       "-ploidy", ploidy_arg,
                       "--genotype_likelihoods_model", "BOTH"])
        gatk_resources = config_utils.get_resources("gatk", items[0]["config"])
        if "options" in gatk_resources:
            params.extend(str(opt) for opt in gatk_resources.get("options", []))
        broad_runner.run_gatk(params)
    return out_file
Esempio n. 10
0
def haplotype_caller(align_bams, items, ref_file, assoc_files,
                     region=None, out_file=None):
    """Call variants with GATK's HaplotypeCaller.

    Needs a "restricted" or "gatk4" GATK runner. With GATK4 and more than one
    core the Spark version of the tool is used. An existing out_file is
    reused; the return value is the result of bgzipping and indexing out_file.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        num_cores = dd.get_num_cores(items[0])
        broad_runner, params = _shared_gatk_call_prep(
            align_bams, items, ref_file, region, out_file, num_cores)
        gatk_type = broad_runner.gatk_type()
        assert gatk_type in ["restricted", "gatk4"], \
            "Require full version of GATK 2.4+, or GATK4 for haplotype calling"
        is_gatk4 = gatk_type == "gatk4"
        multicore_gatk4 = num_cores > 1 and is_gatk4
        with file_transaction(items[0], out_file) as tx_out_file:
            if multicore_gatk4:
                params.extend(["-T", "HaplotypeCallerSpark",
                               "--spark-master", "local[%s]" % num_cores,
                               "--conf", "spark.local.dir=%s" % os.path.dirname(tx_out_file)])
            else:
                params.extend(["-T", "HaplotypeCaller"])
            params.extend(["--annotation", "ClippingRankSumTest",
                           "--annotation", "DepthPerSampleHC"])
            params.extend(["--output" if is_gatk4 else "-o", tx_out_file])
            # Hardware based optimizations for GATK 3.1+; GATK4 selects the right
            # HMM implementation automatically with FASTEST_AVAILABLE
            if (LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.1")
                    and not is_gatk4 and _supports_avx()):
                params.extend(["--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"])
            # gVCF output when joint calling or when the gvcf tool is switched on
            is_joint = bool(_joint_calling(items) or
                            any("gvcf" in dd.get_tools_on(d) for d in items))
            if is_joint:
                if is_gatk4:
                    params.extend(["--emit-ref-confidence", "GVCF"])
                else:
                    params.extend(["--emitRefConfidence", "GVCF"])
                    params.extend(["--variant_index_type", "LINEAR",
                                   "--variant_index_parameter", "128000"])
                # Set GQ banding to not be single GQ resolution
                # No recommended default but try to balance resolution and size
                # http://gatkforums.broadinstitute.org/gatk/discussion/7051/recommendation-best-practices-gvcf-gq-bands
                for gq_boundary in (10, 20, 30, 40, 60, 80):
                    params.extend(["-GQB", str(gq_boundary)])
            # Non-diploid calling for GATK 3.3+
            # GenomicsDB does not support non-diploid samples in GATK4 joint calling
            # https://gatkforums.broadinstitute.org/gatk/discussion/10061/using-genomicsdbimport-to-prepare-gvcfs-for-input-to-genotypegvcfs-in-gatk4
            if (LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.3")
                    and not is_joint and is_gatk4):
                params.extend(["-ploidy", str(ploidy.get_ploidy(items, region))])
            hc_resources = config_utils.get_resources("gatk-haplotype", items[0]["config"])
            if "options" in hc_resources:
                params.extend(str(opt) for opt in hc_resources.get("options", []))
            broad_runner.new_resources("gatk-haplotype")
            if num_cores > 1:
                memscale = {"magnitude": 0.9 * num_cores, "direction": "increase"}
            else:
                memscale = None
            broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale,
                                  parallel_gc=multicore_gatk4)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
Esempio n. 11
0
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Build the list of FreeBayes command line options from the configuration.

    Variant region BED files are merged first so overlapping regions do not
    make FreeBayes call the same positions multiple times.

    Returns a tuple of (options, no_target_regions). The flag is True when the
    target BED file is empty after removing high depth regions, in which case
    the caller should skip the FreeBayes run.
    """
    opts = ["--genotype-qualities",
            "--ploidy", str(ploidy.get_ploidy(items, region))]

    configured_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    variant_regions = bedutils.merge_overlaps(configured_regions, items[0])
    # gVCF output when any sample switches the gvcf tool on
    if any("gvcf" in dd.get_tools_on(d) for d in items):
        opts.extend(["--gvcf", "--gvcf-chunk", "50000"])
    no_target_regions = False
    target = shared.subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        target_is_file = isinstance(target, basestring) and os.path.isfile(target)
        if not target_is_file:
            opts.extend(["--region", region_to_freebayes(target)])
        else:
            # High depth regions are only removed for whole genome coverage
            is_genome = any(
                tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower() == "genome"
                for x in items)
            if is_genome:
                target = shared.remove_highdepth_regions(target, items)
                if os.path.getsize(target) == 0:
                    no_target_regions = True
            opts.extend(["--targets", target])
    extra_opts = config_utils.get_resources("freebayes", config).get("options")
    if extra_opts:
        opts += extra_opts
    return opts, no_target_regions
Esempio n. 12
0
def mutect2_caller(align_bams, items, ref_file, assoc_files,
                   region=None, out_file=None):
    """Call variants with GATK's MuTect2.

    Requires GATK 3.5 or later. The GATK command is piped through the
    post-processing command and bgzip into the output. An existing out_file
    is reused; returns the bgzipped and indexed output file.
    """
    if out_file is None:
        bam_base = utils.splitext_plus(align_bams[0])[0]
        out_file = "%s-variants.vcf.gz" % bam_base
    if not utils.file_exists(out_file):
        _prep_inputs(align_bams, ref_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            params = ["-T", "MuTect2",
                      "-R", ref_file,
                      "--annotation", "ClippingRankSumTest",
                      "--annotation", "DepthPerSampleHC"]
            for extra_annotation in annotation.get_gatk_annotations(items[0]["config"]):
                params.extend(["--annotation", extra_annotation])
            paired = vcfutils.get_paired_bams(align_bams, items)
            params.extend(_add_tumor_params(paired))
            params.extend(_add_region_params(region, out_file, items))
            params.extend(_add_assoc_params(assoc_files))
            params.extend(["-ploidy", str(ploidy.get_ploidy(items, region))])
            mutect2_resources = config_utils.get_resources("mutect2", items[0]["config"])
            if "options" in mutect2_resources:
                params.extend(str(opt) for opt in mutect2_resources.get("options", []))
            broad_runner = broad.runner_from_config(items[0]["config"])
            assert LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.5"), \
                "Require full version of GATK 3.5+ for mutect2 calling"
            broad_runner.new_resources("mutect2")
            tx_dir = os.path.dirname(tx_out_file)
            gatk_cmd = " ".join(broad_runner.cl_gatk(params, tx_dir))
            pp_cmd = _post_process_cl(paired)
            cmd = "{gatk_cmd} | {pp_cmd} | bgzip -c > {tx_out_file}"
            do.run(cmd.format(gatk_cmd=gatk_cmd, pp_cmd=pp_cmd, tx_out_file=tx_out_file),
                   "MuTect2")
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
Esempio n. 13
0
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Assemble FreeBayes command line options for a group of samples.

    The population variant regions are merged so overlapping regions do not
    make FreeBayes call the same positions multiple times.

    Returns (options, no_target_regions); the second value is True when the
    target BED file is empty once high depth regions have been removed,
    signalling that the FreeBayes run should be skipped.
    """
    opts = ["--genotype-qualities", "--strict-vcf",
            "--ploidy", str(ploidy.get_ploidy(items, region))]

    population_regions = bedutils.population_variant_regions(items)
    variant_regions = bedutils.merge_overlaps(population_regions, items[0])
    # gVCF output when any sample switches the gvcf tool on
    wants_gvcf = any("gvcf" in dd.get_tools_on(d) for d in items)
    if wants_gvcf:
        opts += ["--gvcf", "--gvcf-chunk", "50000"]
    no_target_regions = False
    target = shared.subset_variant_regions(variant_regions, region, out_file,
                                           items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            coverage_path = ["config", "algorithm", "coverage_interval"]
            # High depth regions are only removed for whole genome coverage
            if any(tz.get_in(coverage_path, x, "").lower() == "genome" for x in items):
                target = shared.remove_highdepth_regions(target, items)
                no_target_regions = os.path.getsize(target) == 0
            opts += ["--targets", target]
        else:
            opts += ["--region", region_to_freebayes(target)]
    freebayes_resources = config_utils.get_resources("freebayes", config)
    if freebayes_resources.get("options"):
        opts += freebayes_resources["options"]
    return opts, no_target_regions
Esempio n. 14
0
def haplotype_caller(align_bams, items, ref_file, assoc_files,
                     region=None, out_file=None):
    """Call variants with GATK's HaplotypeCaller.

    Needs a "restricted" or "gatk4" GATK runner. Defaults the output to
    ``<first BAM>-variants.vcf.gz`` and reuses it when it already exists.
    Returns the output VCF path.
    """
    if out_file is None:
        bam_base = utils.splitext_plus(align_bams[0])[0]
        out_file = "%s-variants.vcf.gz" % bam_base
    if utils.file_exists(out_file):
        return out_file
    broad_runner, params = _shared_gatk_call_prep(
        align_bams, items, ref_file, assoc_files.get("dbsnp"), region, out_file)
    gatk_type = broad_runner.gatk_type()
    assert gatk_type in ["restricted", "gatk4"], \
        "Require full version of GATK 2.4+, or GATK4 for haplotype calling"
    is_gatk4 = gatk_type == "gatk4"
    with file_transaction(items[0], out_file) as tx_out_file:
        params.extend(["-T", "HaplotypeCaller",
                       "--annotation", "ClippingRankSumTest",
                       "--annotation", "DepthPerSampleHC"])
        params.extend(["--output" if is_gatk4 else "-o", tx_out_file])
        # Hardware based optimizations for GATK 3.1+; GATK4 selects the right
        # HMM implementation automatically with FASTEST_AVAILABLE
        if (LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.1")
                and not is_gatk4):
            params.extend(["--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"])
        # Non-diploid calling for GATK 3.3+
        if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.3"):
            params.extend(["-ploidy", str(ploidy.get_ploidy(items, region))])
        # gVCF output when joint calling or when the gvcf tool is switched on
        if _joint_calling(items) or any("gvcf" in dd.get_tools_on(d) for d in items):
            params.extend(["--emitRefConfidence", "GVCF",
                           "--variant_index_type", "LINEAR",
                           "--variant_index_parameter", "128000"])
            # Set GQ banding to not be single GQ resolution
            # No recommended default but try to balance resolution and size
            # http://gatkforums.broadinstitute.org/gatk/discussion/7051/recommendation-best-practices-gvcf-gq-bands
            for gq_boundary in (10, 20, 30, 40, 60, 80):
                params.extend(["-GQB", str(gq_boundary)])
        hc_resources = config_utils.get_resources("gatk-haplotype",
                                                  items[0]["config"])
        if "options" in hc_resources:
            params.extend(str(opt) for opt in hc_resources.get("options", []))
        broad_runner.new_resources("gatk-haplotype")
        broad_runner.run_gatk(params)
    return out_file
Esempio n. 15
0
def haplotype_caller(align_bams, items, ref_file, assoc_files,
                     region=None, out_file=None):
    """Call variants with GATK's HaplotypeCaller.

    Needs a GATK runner of type "restricted". Defaults the output to
    ``<first BAM>-variants.vcf.gz`` and reuses it when it already exists.
    Returns the output VCF path.
    """
    if out_file is None:
        bam_base = utils.splitext_plus(align_bams[0])[0]
        out_file = "%s-variants.vcf.gz" % bam_base
    if utils.file_exists(out_file):
        return out_file
    broad_runner, params = _shared_gatk_call_prep(
        align_bams, items, ref_file, assoc_files.get("dbsnp"), region, out_file)
    assert broad_runner.gatk_type() == "restricted", \
        "Require full version of GATK 2.4+ for haplotype calling"
    with file_transaction(items[0], out_file) as tx_out_file:
        params.extend(["-T", "HaplotypeCaller",
                       "-o", tx_out_file,
                       "--annotation", "ClippingRankSumTest",
                       "--annotation", "DepthPerSampleHC"])
        # Hardware based optimizations for GATK 3.1+
        if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.1"):
            params.extend(["--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"])
        # Non-diploid calling for GATK 3.3+
        if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.3"):
            params.extend(["-ploidy", str(ploidy.get_ploidy(items, region))])
        # gVCF output when doing joint calling
        if _joint_calling(items):
            params.extend(["--emitRefConfidence", "GVCF",
                           "--variant_index_type", "LINEAR",
                           "--variant_index_parameter", "128000"])
        hc_resources = config_utils.get_resources("gatk-haplotype", items[0]["config"])
        if "options" in hc_resources:
            params.extend(str(opt) for opt in hc_resources.get("options", []))
        broad_runner.new_resources("gatk-haplotype")
        broad_runner.run_gatk(params)
    return out_file
Esempio n. 16
0
def haplotype_caller(align_bams, items, ref_file, assoc_files,
                     region=None, out_file=None):
    """Call variants with GATK's HaplotypeCaller.

    Needs a "restricted" or "gatk4" GATK runner. With GATK4 and more than one
    core the Spark version of the tool is used. An existing out_file is
    reused; the return value is the result of bgzipping and indexing out_file.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        num_cores = dd.get_num_cores(items[0])
        broad_runner, params = _shared_gatk_call_prep(
            align_bams, items, ref_file, region, out_file, num_cores)
        gatk_type = broad_runner.gatk_type()
        assert gatk_type in ["restricted", "gatk4"], \
            "Require full version of GATK 2.4+, or GATK4 for haplotype calling"
        is_gatk4 = gatk_type == "gatk4"
        multicore_gatk4 = num_cores > 1 and is_gatk4
        with file_transaction(items[0], out_file) as tx_out_file:
            if multicore_gatk4:
                params.extend(["-T", "HaplotypeCallerSpark",
                               "--sparkMaster", "local[%s]" % num_cores,
                               "--conf", "spark.local.dir=%s" % os.path.dirname(tx_out_file)])
            else:
                params.extend(["-T", "HaplotypeCaller"])
            params.extend(["--annotation", "ClippingRankSumTest",
                           "--annotation", "DepthPerSampleHC"])
            if is_gatk4:
                params.extend(["--output", tx_out_file])
            else:
                params.extend(["-o", tx_out_file])
            # Hardware based optimizations for GATK 3.1+; GATK4 selects the right
            # HMM implementation automatically with FASTEST_AVAILABLE
            if (LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.1")
                    and not is_gatk4 and _supports_avx()):
                params.extend(["--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"])
            # gVCF output when joint calling or when the gvcf tool is switched on
            is_joint = bool(_joint_calling(items) or
                            any("gvcf" in dd.get_tools_on(d) for d in items))
            if is_joint:
                params.extend(["--emitRefConfidence", "GVCF"])
                if not is_gatk4:
                    params.extend(["--variant_index_type", "LINEAR",
                                   "--variant_index_parameter", "128000"])
                # Set GQ banding to not be single GQ resolution
                # No recommended default but try to balance resolution and size
                # http://gatkforums.broadinstitute.org/gatk/discussion/7051/recommendation-best-practices-gvcf-gq-bands
                for gq_boundary in (10, 20, 30, 40, 60, 80):
                    params.extend(["-GQB", str(gq_boundary)])
            # Non-diploid calling for GATK 3.3+
            # GenomicsDB does not support non-diploid samples in GATK4 joint calling
            # https://gatkforums.broadinstitute.org/gatk/discussion/10061/using-genomicsdbimport-to-prepare-gvcfs-for-input-to-genotypegvcfs-in-gatk4
            if (LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.3")
                    and not is_joint and is_gatk4):
                params.extend(["-ploidy", str(ploidy.get_ploidy(items, region))])
            hc_resources = config_utils.get_resources("gatk-haplotype", items[0]["config"])
            if "options" in hc_resources:
                params.extend(str(opt) for opt in hc_resources.get("options", []))
            broad_runner.new_resources("gatk-haplotype")
            if num_cores > 1:
                memscale = {"magnitude": 0.9 * num_cores, "direction": "increase"}
            else:
                memscale = None
            broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale,
                                  parallel_gc=multicore_gatk4)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
Esempio n. 17
0
def mutect2_caller(align_bams,
                   items,
                   ref_file,
                   assoc_files,
                   region=None,
                   out_file=None):
    """Run somatic variant calling with GATK MuTect2 (named Mutect2 in GATK4).

    ``out_file`` defaults to ``<first BAM base>-variants.vcf.gz``; calling is
    skipped when that file already exists. Raw calls are produced inside a
    file transaction, filtered with ``_mutect2_filter`` when running GATK4,
    and then passed through ``_af_filter``. Asserts a GATK major version of
    3.5 or later. ``assoc_files`` is accepted but not used here.

    Returns the result of bgzipping and indexing the final output file.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if utils.file_exists(out_file):
        return vcfutils.bgzip_and_index(out_file, items[0]["config"])

    paired = vcfutils.get_paired_bams(align_bams, items)
    runner = broad.runner_from_config(items[0]["config"])
    gatk_type = runner.gatk_type()
    is_gatk4 = gatk_type == "gatk4"
    _prep_inputs(align_bams, ref_file, items)
    with file_transaction(items[0], out_file) as tx_out_file:
        tool = "Mutect2" if is_gatk4 else "MuTect2"
        params = ["-T", tool, "-R", ref_file]
        for fixed_annotation in ("ClippingRankSumTest", "DepthPerSampleHC"):
            params.extend(["--annotation", fixed_annotation])
        for extra_annotation in annotation.get_gatk_annotations(
                items[0]["config"], include_baseqranksum=False):
            params.extend(["--annotation", extra_annotation])
        if is_gatk4:
            # Lenient validation avoids issues with BAM CIGAR reads GATK rejects
            params.extend(["--read-validation-stringency", "LENIENT"])
        params.extend(_add_tumor_params(paired, items, gatk_type))
        params.extend(_add_region_params(region, out_file, items, gatk_type))
        # dbSNP/Cosmic inputs are deliberately left out so they do not get
        # fed to the variant filtering algorithm.
        params.extend(["-ploidy", str(ploidy.get_ploidy(items, region))])
        resources = config_utils.get_resources("mutect2", items[0]["config"])
        if "options" in resources:
            params.extend([str(opt) for opt in resources.get("options", [])])
        assert LooseVersion(runner.gatk_major_version()) >= LooseVersion("3.5"), \
            "Require full version of GATK 3.5+ for mutect2 calling"
        runner.new_resources("mutect2")
        gatk_cmd = runner.cl_gatk(params, os.path.dirname(tx_out_file))
        raw_base, raw_ext = utils.splitext_plus(tx_out_file)
        if is_gatk4:
            # GATK4 writes unfiltered calls first, then filters into tx_raw_file
            tx_raw_prefilt_file = "%s-raw%s" % (raw_base, raw_ext)
            tx_raw_file = "%s-raw-filt%s" % (raw_base, raw_ext)
            filter_cmd = _mutect2_filter(runner, tx_raw_prefilt_file, tx_raw_file)
            cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {filter_cmd}".format(
                gatk_cmd=gatk_cmd, tx_raw_prefilt_file=tx_raw_prefilt_file,
                filter_cmd=filter_cmd)
        else:
            tx_raw_file = "%s-raw%s" % (raw_base, raw_ext)
            cmd = "{gatk_cmd} > {tx_raw_file}".format(
                gatk_cmd=gatk_cmd, tx_raw_file=tx_raw_file)
        do.run(cmd, "MuTect2")
        out_file = _af_filter(paired.tumor_data, tx_raw_file, out_file)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
Esempio n. 18
0
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Prepare standard options from configuration input.

    Input BED target files are merged to avoid overlapping regions which
    cause FreeBayes to call multiple times.

    Checks for empty sets of target regions after filtering for high depth,
    in which case we should skip the FreeBayes run.

    User supplied ``options`` from the "freebayes" resources are appended
    last. When they already set ``--min-alternate-fraction``/``-F`` the more
    sensitive mitochondrial default is not added.

    Returns a tuple of (list of options, True when no target regions remain).
    """
    opts = ["--genotype-qualities", "--strict-vcf"]
    cur_ploidy = ploidy.get_ploidy(items, region)
    base_ploidy = ploidy.get_ploidy(items)
    opts += ["--ploidy", str(cur_ploidy)]
    # Read user options up front so defaults can defer to them. Checking
    # `opts` here would never match since user options are appended last.
    resources = config_utils.get_resources("freebayes", config)
    user_opts = resources.get("options") or []
    user_flags = set(tok.split("=", 1)[0]
                     for opt in user_opts for tok in str(opt).split())
    user_sets_min_af = ("--min-alternate-fraction" in user_flags
                        or "-F" in user_flags)
    # Adjust min fraction when trying to call more sensitively in certain
    # regions. This is primarily meant for pooled mitochondrial calling.
    if (isinstance(region, (list, tuple))
            and chromhacks.is_mitochondrial(region[0])
            and cur_ploidy >= base_ploidy
            and not user_sets_min_af):
        opts += ["--min-alternate-fraction", "0.01"]
    variant_regions = bedutils.merge_overlaps(
        bedutils.population_variant_regions(items), items[0])
    # Produce gVCF output
    if any("gvcf" in dd.get_tools_on(d) for d in items):
        opts += ["--gvcf", "--gvcf-chunk", "50000"]
    no_target_regions = False
    target = shared.subset_variant_regions(variant_regions, region, out_file,
                                           items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            # Whole genome inputs additionally drop high depth regions,
            # which can leave an empty target file.
            if any(
                    tz.get_in(["config", "algorithm", "coverage_interval"], x,
                              "").lower() == "genome" for x in items):
                target = shared.remove_highdepth_regions(target, items)
                if os.path.getsize(target) == 0:
                    no_target_regions = True
            opts += ["--targets", target]
        else:
            opts += ["--region", region_to_freebayes(target)]
    if user_opts:
        opts += user_opts
    return opts, no_target_regions
Esempio n. 19
0
def _add_variantcalls_to_output(out, data, is_somatic=False):
    """Run CNVkit call and export, adding the resulting files to ``out``.

    Produces a ``-call`` version of ``out["cns"]``, exports it to BED and VCF,
    and stores ``call_file``, ``vrn_bed`` (gene annotated BED) and
    ``vrn_file`` (snpEff annotated VCF, or the plain VCF when annotation
    returns nothing) in ``out``. Existing files are reused.
    """
    def _cnvkit(subcommand):
        # cnvkit.py is expected alongside the running Python interpreter
        return [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), subcommand]

    call_file = "%s-call%s" % os.path.splitext(out["cns"])
    gender = population.get_gender(data)
    if not utils.file_exists(call_file):
        with file_transaction(data, call_file) as tx_call_file:
            cmd = _cnvkit("call")
            cmd += ["--filter", "cn"]
            cmd += ["--ploidy", str(ploidy.get_ploidy([data])),
                    "-o", tx_call_file, out["cns"]]
            small_vrn_files = _compatible_small_variants(data)
            if len(small_vrn_files) > 0 and _cna_has_values(out["cns"]):
                cmd.extend(["-v", small_vrn_files[0]])
                if not is_somatic:
                    cmd.extend(["-m", "clonal"])
            if gender and gender.lower() != "unknown":
                cmd.extend(["--gender", gender])
                if gender.lower() == "male":
                    cmd.append("--male-reference")
            do.run(cmd, "CNVkit call ploidy")

    calls = {}
    for outformat in ["bed", "vcf"]:
        export_file = "%s.%s" % (os.path.splitext(call_file)[0], outformat)
        calls[outformat] = export_file
        if os.path.exists(export_file):
            continue
        with file_transaction(data, export_file) as tx_out_file:
            cmd = _cnvkit("export")
            cmd += [outformat, "--sample-id", dd.get_sample_name(data),
                    "--ploidy", str(ploidy.get_ploidy([data])),
                    "-o", tx_out_file, call_file]
            if gender and gender.lower() == "male":
                cmd.append("--male-reference")
            do.run(cmd, "CNVkit export %s" % outformat)

    out["call_file"] = call_file
    out["vrn_bed"] = annotate.add_genes(calls["bed"], data)
    effects_vcf, _ = effects.add_to_vcf(calls["vcf"], data, "snpeff")
    out["vrn_file"] = effects_vcf or calls["vcf"]
    return out
Esempio n. 20
0
def _add_variantcalls_to_output(out, data, items, is_somatic=False):
    """Run CNVkit call and export, adding the resulting files to ``out``.

    Produces a ``-call`` version of ``out["cns"]``, exports it to BED and VCF,
    and stores ``call_file``, ``vrn_bed`` (gene annotated BED) and
    ``vrn_file`` (snpEff annotated VCF, or the plain VCF when annotation
    returns nothing, then annotated with depth) in ``out``.
    Existing files are reused.
    """
    def _cnvkit(subcommand):
        # cnvkit.py is expected alongside the running Python interpreter
        return [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), subcommand]

    call_file = "%s-call%s" % os.path.splitext(out["cns"])
    if not utils.file_exists(call_file):
        with file_transaction(data, call_file) as tx_call_file:
            cmd = _cnvkit("call")
            cmd += ["--filter", "cn"]
            cmd += ["--ploidy", str(ploidy.get_ploidy([data])),
                    "-o", tx_call_file, out["cns"]]
            small_vrn_files = _compatible_small_variants(data, items)
            if len(small_vrn_files) > 0 and _cna_has_values(out["cns"]):
                # Only the first compatible small variant file is used
                first_vrn = small_vrn_files[0]
                cmd.extend(["--vcf", first_vrn.name, "--sample-id", first_vrn.sample])
                if first_vrn.normal:
                    cmd.extend(["--normal-id", first_vrn.normal])
                if not is_somatic:
                    cmd.extend(["-m", "clonal"])
            gender = _get_batch_gender(items)
            if gender:
                cmd.extend(["--sample-sex", gender])
            do.run(cmd, "CNVkit call ploidy")

    calls = {}
    for outformat in ["bed", "vcf"]:
        export_file = "%s.%s" % (os.path.splitext(call_file)[0], outformat)
        calls[outformat] = export_file
        if os.path.exists(export_file):
            continue
        with file_transaction(data, export_file) as tx_out_file:
            cmd = _cnvkit("export")
            cmd += [outformat, "--sample-id", dd.get_sample_name(data),
                    "--ploidy", str(ploidy.get_ploidy([data])),
                    "-o", tx_out_file, call_file]
            do.run(cmd, "CNVkit export %s" % outformat)

    out["call_file"] = call_file
    out["vrn_bed"] = annotate.add_genes(calls["bed"], data)
    effects_vcf, _ = effects.add_to_vcf(calls["vcf"], data, "snpeff")
    out["vrn_file"] = shared.annotate_with_depth(effects_vcf or calls["vcf"], items)
    return out
Esempio n. 21
0
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Prepare standard options from configuration input.

    Input BED target files are merged to avoid overlapping regions which
    cause FreeBayes to call multiple times.

    Checks for empty sets of target regions after filtering for high depth,
    in which case we should skip the FreeBayes run.

    User supplied ``options`` from the "freebayes" resources are appended
    last. When they already set ``--min-alternate-fraction``/``-F`` the more
    sensitive mitochondrial default is not added.

    Returns a tuple of (list of options, True when no target regions remain).
    """
    opts = ["--genotype-qualities", "--strict-vcf"]
    cur_ploidy = ploidy.get_ploidy(items, region)
    base_ploidy = ploidy.get_ploidy(items)
    opts += ["--ploidy", str(cur_ploidy)]
    # Read user options up front so defaults can defer to them. Checking
    # `opts` here would never match since user options are appended last.
    resources = config_utils.get_resources("freebayes", config)
    user_opts = resources.get("options") or []
    user_flags = set(tok.split("=", 1)[0] for opt in user_opts for tok in str(opt).split())
    user_sets_min_af = "--min-alternate-fraction" in user_flags or "-F" in user_flags
    # Adjust min fraction when trying to call more sensitively in certain
    # regions. This is primarily meant for pooled mitochondrial calling.
    if (isinstance(region, (list, tuple)) and chromhacks.is_mitochondrial(region[0])
          and cur_ploidy >= base_ploidy and not user_sets_min_af):
        opts += ["--min-alternate-fraction", "0.01"]
    variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), items[0])
    # Produce gVCF output
    if any("gvcf" in dd.get_tools_on(d) for d in items):
        opts += ["--gvcf", "--gvcf-chunk", "50000"]
    no_target_regions = False
    target = shared.subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            # Whole genome inputs additionally drop high depth regions,
            # which can leave an empty target file.
            if any(tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower() == "genome"
                   for x in items):
                target = shared.remove_highdepth_regions(target, items)
                if os.path.getsize(target) == 0:
                    no_target_regions = True
            opts += ["--targets", target]
        else:
            opts += ["--region", region_to_freebayes(target)]
    if user_opts:
        opts += user_opts
    return opts, no_target_regions
Esempio n. 22
0
def cnv_to_event(name, data):
    """Convert a CNV to an event name.

    Names starting with "cnv" carry one or more ";" separated copy numbers
    before the first underscore. The largest copy number is compared to the
    sample ploidy: lower gives "DEL", higher gives "DUP", equal returns the
    name unchanged. Any other name is returned as is, without needing to
    look up the ploidy.
    """
    if not name.startswith("cnv"):
        return name
    # Only CNV names need the ploidy, so look it up after the guard above
    cur_ploidy = ploidy.get_ploidy([data])
    copy_numbers = name.split("_")[0].replace("cnv", "").split(";")
    num = max(int(x) for x in copy_numbers)
    if num < cur_ploidy:
        return "DEL"
    if num > cur_ploidy:
        return "DUP"
    return name
Esempio n. 23
0
def cnv_to_event(name, data):
    """Convert a CNV to an event name.

    Names starting with "cnv" carry one or more ";" separated copy numbers
    before the first underscore. The largest copy number is compared to the
    sample ploidy: lower gives "DEL", higher gives "DUP", equal returns the
    name unchanged. Any other name is returned as is, without needing to
    look up the ploidy.
    """
    if not name.startswith("cnv"):
        return name
    # Only CNV names need the ploidy, so look it up after the guard above
    cur_ploidy = ploidy.get_ploidy([data])
    copy_numbers = name.split("_")[0].replace("cnv", "").split(";")
    num = max(int(x) for x in copy_numbers)
    if num < cur_ploidy:
        return "DEL"
    if num > cur_ploidy:
        return "DUP"
    return name
Esempio n. 24
0
def haplotype_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call variants with GATK's HaplotypeCaller.

    ``out_file`` defaults to ``<first BAM base>-variants.vcf.gz`` and an
    existing file is returned without re-running. Asserts the GATK runner is
    the "restricted" type. Emits gVCF output with GQ banding when joint
    calling or when the "gvcf" tool is turned on for any sample.

    Returns the output file path.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if utils.file_exists(out_file):
        return out_file

    dbsnp = assoc_files.get("dbsnp")
    runner, params = _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file)
    assert runner.gatk_type() == "restricted", "Require full version of GATK 2.4+ for haplotype calling"
    with file_transaction(items[0], out_file) as tx_out_file:
        params += ["-T", "HaplotypeCaller", "-o", tx_out_file]
        params += ["--annotation", "ClippingRankSumTest"]
        params += ["--annotation", "DepthPerSampleHC"]
        # Hardware based optimizations are available from GATK 3.1
        if LooseVersion(runner.gatk_major_version()) >= LooseVersion("3.1"):
            params += ["--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"]
        # Non-diploid calling is available from GATK 3.3
        if LooseVersion(runner.gatk_major_version()) >= LooseVersion("3.3"):
            params += ["-ploidy", str(ploidy.get_ploidy(items, region))]
        wants_gvcf = _joint_calling(items) or any("gvcf" in dd.get_tools_on(d) for d in items)
        if wants_gvcf:
            params += ["--emitRefConfidence", "GVCF"]
            params += ["--variant_index_type", "LINEAR"]
            params += ["--variant_index_parameter", "128000"]
            # Band GQ values rather than using single GQ resolution, to
            # balance resolution and file size.
            # http://gatkforums.broadinstitute.org/gatk/discussion/7051/recommendation-best-practices-gvcf-gq-bands
            params += [arg for gq in (10, 20, 30, 40, 60, 80) for arg in ("-GQB", str(gq))]
        resources = config_utils.get_resources("gatk-haplotype", items[0]["config"])
        if "options" in resources:
            params += [str(opt) for opt in resources.get("options", [])]
        runner.new_resources("gatk-haplotype")
        runner.run_gatk(params)
    return out_file
Esempio n. 25
0
def _freebayes_options_from_config(items, aconfig, out_file, region=None):
    """Build FreeBayes options: ploidy plus restriction to target regions.

    The target comes from subsetting the configured ``variant_regions`` by
    ``region``; an existing file is passed with ``--targets`` and anything
    else is converted to a ``--region`` specification.
    """
    opts = ["--ploidy", str(ploidy.get_ploidy(items, region))]
    target = subset_variant_regions(aconfig.get("variant_regions", None), region, out_file)
    if not target:
        return opts
    if isinstance(target, basestring) and os.path.isfile(target):
        opts.extend(["--targets", target])
    else:
        opts.extend(["--region", region_to_freebayes(target)])
    # Disabled: passing a background call set as variant input
    #background = aconfig.get("call_background", None)
    #if background and os.path.exists(background):
    #    opts += ["--variant-input", background]
    return opts
Esempio n. 26
0
def _get_ploidy(regions, items, base_file):
    """Write a VCF giving each sample's ploidy as a CN value for every region.

    The file is named ``<base_file base>-ploidy.vcf`` and is only written when
    neither it nor its ``.gz`` version exists. Each region is indexed as
    (chromosome, start, end). Returns the result of bgzipping and indexing.
    """
    samples = [dd.get_sample_name(d) for d in items]
    out_file = "%s-ploidy.vcf" % utils.splitext_plus(base_file)[0]
    already_present = utils.file_exists(out_file) or utils.file_exists(out_file + ".gz")
    if not already_present:
        header_lines = [
            "##fileformat=VCFv4.1\n",
            '##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">\n',
            '##FORMAT=<ID=CN,Number=1,Type=Integer,Description="Copy number genotype for imprecise events">\n',
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + "\t".join(samples) + "\n",
        ]
        with file_transaction(items[0], out_file) as tx_outfile:
            with open(tx_outfile, "w") as h:
                for header_line in header_lines:
                    h.write(header_line)
                for region in regions:
                    # One symbolic <CNV> record per region, one CN per sample
                    fixed = [region[0], str(region[1]), ".", "N", "<CNV>", ".", ".",
                             "END=%s" % region[2], "CN"]
                    sample_cns = [str(ploidy.get_ploidy([d], region)) for d in items]
                    h.write("\t".join(fixed + sample_cns) + "\n")
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
Esempio n. 27
0
def _freebayes_options_from_config(items, aconfig, out_file, region=None):
    """Build FreeBayes options: ploidy plus restriction to target regions.

    The target comes from subsetting the configured ``variant_regions`` by
    ``region``; an existing file is passed with ``--targets`` and anything
    else is converted to a ``--region`` specification.
    """
    cur_ploidy = ploidy.get_ploidy(items, region)
    opts = ["--ploidy", str(cur_ploidy)]
    configured_regions = aconfig.get("variant_regions", None)
    target = subset_variant_regions(configured_regions, region, out_file)
    if target:
        target_is_file = isinstance(target, basestring) and os.path.isfile(target)
        if target_is_file:
            opts += ["--targets", target]
        else:
            opts += ["--region", region_to_freebayes(target)]
    # Disabled: passing a background call set as variant input
    #background = aconfig.get("call_background", None)
    #if background and os.path.exists(background):
    #    opts += ["--variant-input", background]
    return opts
Esempio n. 28
0
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Build FreeBayes options: ploidy, target restriction and user options.

    The target comes from subsetting the configured ``variant_regions`` by
    ``region``. Options from the "freebayes" resources are appended last.
    """
    opts = ["--ploidy", str(ploidy.get_ploidy(items, region))]
    configured_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    target = subset_variant_regions(configured_regions, region, out_file)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            restriction = ["--targets", target]
        else:
            restriction = ["--region", region_to_freebayes(target)]
        opts.extend(restriction)
    extra_opts = config_utils.get_resources("freebayes", config).get("options")
    if extra_opts:
        opts += extra_opts
    return opts
Esempio n. 29
0
def mutect2_caller(align_bams, items, ref_file, assoc_files,
                       region=None, out_file=None):
    """Run somatic variant calling with GATK MuTect2 (named Mutect2 in GATK4).

    ``out_file`` defaults to ``<first BAM base>-variants.vcf.gz``; calling is
    skipped when that file already exists. Raw calls are produced inside a
    file transaction, filtered with ``_mutect2_filter`` when running GATK4,
    and then passed through ``_af_filter``. Asserts a GATK major version of
    3.5 or later. ``assoc_files`` is accepted but not used here.

    Returns the result of bgzipping and indexing the final output file.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if utils.file_exists(out_file):
        return vcfutils.bgzip_and_index(out_file, items[0]["config"])

    paired = vcfutils.get_paired_bams(align_bams, items)
    runner = broad.runner_from_config(items[0]["config"])
    gatk_type = runner.gatk_type()
    use_gatk4 = gatk_type == "gatk4"
    _prep_inputs(align_bams, ref_file, items)
    with file_transaction(items[0], out_file) as tx_out_file:
        params = ["-T", "Mutect2" if use_gatk4 else "MuTect2", "-R", ref_file]
        params += ["--annotation", "ClippingRankSumTest"]
        params += ["--annotation", "DepthPerSampleHC"]
        for annot_name in annotation.get_gatk_annotations(items[0]["config"], include_baseqranksum=False):
            params += ["--annotation", annot_name]
        if use_gatk4:
            # Lenient validation avoids issues with BAM CIGAR reads GATK rejects
            params += ["--read-validation-stringency", "LENIENT"]
        params += _add_tumor_params(paired, items, gatk_type)
        params += _add_region_params(region, out_file, items, gatk_type)
        # dbSNP/Cosmic inputs are deliberately left out so they do not get
        # fed to the variant filtering algorithm.
        params += ["-ploidy", str(ploidy.get_ploidy(items, region))]
        resources = config_utils.get_resources("mutect2", items[0]["config"])
        if "options" in resources:
            params += [str(opt) for opt in resources.get("options", [])]
        assert LooseVersion(runner.gatk_major_version()) >= LooseVersion("3.5"), \
            "Require full version of GATK 3.5+ for mutect2 calling"
        runner.new_resources("mutect2")
        gatk_cmd = runner.cl_gatk(params, os.path.dirname(tx_out_file))
        tx_base, tx_ext = utils.splitext_plus(tx_out_file)
        if use_gatk4:
            # GATK4 writes unfiltered calls first, then filters into tx_raw_file
            tx_raw_prefilt_file = "%s-raw%s" % (tx_base, tx_ext)
            tx_raw_file = "%s-raw-filt%s" % (tx_base, tx_ext)
            filter_cmd = _mutect2_filter(runner, tx_raw_prefilt_file, tx_raw_file)
            template = "{gatk_cmd} -O {tx_raw_prefilt_file} && {filter_cmd}"
            cmd = template.format(gatk_cmd=gatk_cmd, tx_raw_prefilt_file=tx_raw_prefilt_file,
                                  filter_cmd=filter_cmd)
        else:
            tx_raw_file = "%s-raw%s" % (tx_base, tx_ext)
            template = "{gatk_cmd} > {tx_raw_file}"
            cmd = template.format(gatk_cmd=gatk_cmd, tx_raw_file=tx_raw_file)
        do.run(cmd, "MuTect2")
        out_file = _af_filter(paired.tumor_data, tx_raw_file, out_file)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
Esempio n. 30
0
def haplotype_caller(align_bams,
                     items,
                     ref_file,
                     assoc_files,
                     region=None,
                     out_file=None):
    """Call variants with GATK's HaplotypeCaller.

    ``out_file`` defaults to ``<first BAM base>-variants.vcf.gz`` and an
    existing file is returned without re-running. Asserts the GATK runner is
    the "restricted" type. Emits gVCF output when joint calling.

    Returns the output file path.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if utils.file_exists(out_file):
        return out_file

    dbsnp = assoc_files.get("dbsnp")
    runner, params = _shared_gatk_call_prep(align_bams, items, ref_file,
                                            dbsnp, region, out_file)
    assert runner.gatk_type() == "restricted", \
        "Require full version of GATK 2.4+ for haplotype calling"
    with file_transaction(items[0], out_file) as tx_out_file:
        params += ["-T", "HaplotypeCaller", "-o", tx_out_file]
        for annot_name in ("ClippingRankSumTest", "DepthPerSampleHC"):
            params += ["--annotation", annot_name]
        # Hardware based optimizations are available from GATK 3.1
        if LooseVersion(runner.gatk_major_version()) >= LooseVersion("3.1"):
            params += ["--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"]
        # Non-diploid calling is available from GATK 3.3
        if LooseVersion(runner.gatk_major_version()) >= LooseVersion("3.3"):
            params += ["-ploidy", str(ploidy.get_ploidy(items, region))]
        # Joint calling needs gVCF output
        if _joint_calling(items):
            params += ["--emitRefConfidence", "GVCF"]
            params += ["--variant_index_type", "LINEAR"]
            params += ["--variant_index_parameter", "128000"]
        resources = config_utils.get_resources("gatk-haplotype",
                                               items[0]["config"])
        if "options" in resources:
            params += [str(opt) for opt in resources.get("options", [])]
        runner.new_resources("gatk-haplotype")
        runner.run_gatk(params)
    return out_file
Esempio n. 31
0
def _prep_genome(out_dir, data):
    """Create prepped reference directory for pisces.

    Makes ``<out_dir>/<genome name>``, links the reference file into it and
    writes a GenomeSize.xml there with one chromosome entry per ``SQ`` record
    of the reference ``.dict`` file, including the ploidy for that contig.
    Each ``SQ`` record must provide ``SN``, ``LN`` and ``M5``.

    Returns the prepared genome directory.
    """
    ref_file = dd.get_ref_file(data)
    ref_name = os.path.basename(ref_file)
    genome_name = utils.splitext_plus(ref_name)[0]
    out_dir = utils.safe_makedir(os.path.join(out_dir, genome_name))
    utils.symlink_plus(ref_file, os.path.join(out_dir, ref_name))
    dict_file = "%s.dict" % utils.splitext_plus(ref_file)[0]
    with open(os.path.join(out_dir, "GenomeSize.xml"), "w") as out_handle:
        out_handle.write('<sequenceSizes genomeName="%s">' % genome_name)
        # Context manager ensures the sequence dictionary handle gets closed
        with pysam.AlignmentFile(dict_file) as dict_handle:
            for c in dict_handle.header["SQ"]:
                cur_ploidy = ploidy.get_ploidy([data], region=[c["SN"]])
                out_handle.write('<chromosome fileName="%s" contigName="%s" totalBases="%s" knownBases="%s" '
                                 'isCircular="false" ploidy="%s" md5="%s"/>' %
                                 (ref_name, c["SN"], c["LN"], c["LN"], cur_ploidy, c["M5"]))
        out_handle.write('</sequenceSizes>')
    return out_dir
Esempio n. 32
0
def unified_genotyper(align_bams, items, ref_file, assoc_files,
                       region=None, out_file=None):
    """Perform SNP genotyping on the given alignment files with UnifiedGenotyper.

    ``out_file`` defaults to ``<first BAM base>-variants.vcf.gz`` and an
    existing file is returned without re-running. The region specific ploidy
    is only used with the "restricted" GATK runner; otherwise ploidy is 2.

    Returns the output file path.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if utils.file_exists(out_file):
        return out_file

    dbsnp = assoc_files.get("dbsnp")
    runner, params = _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp,
                                            region, out_file)
    with file_transaction(items[0], out_file) as tx_out_file:
        if runner.gatk_type() == "restricted":
            ploidy_arg = str(ploidy.get_ploidy(items, region))
        else:
            ploidy_arg = "2"
        params += ["-T", "UnifiedGenotyper"]
        params += ["-o", tx_out_file]
        params += ["-ploidy", ploidy_arg]
        params += ["--genotype_likelihoods_model", "BOTH"]
        runner.run_gatk(params)
    return out_file
Esempio n. 33
0
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Build FreeBayes options: ploidy, targets, user options and minimum fraction.

    The target comes from subsetting the configured ``variant_regions`` by
    ``region``. Options from the "freebayes" resources are appended next.
    Unless those options already set ``--min-alternate-fraction``/``-F``,
    the configured ``min_allele_fraction`` percentage (default 20) is added
    as a fraction.
    """
    opts = []
    opts += ["--ploidy", str(ploidy.get_ploidy(items, region))]

    variant_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    target = subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            opts += ["--targets", target]
        else:
            opts += ["--region", region_to_freebayes(target)]
    resources = config_utils.get_resources("freebayes", config)
    user_opts = resources.get("options") or []
    if user_opts:
        opts += user_opts
    # Match whole option tokens from the user options only. A substring search
    # over every option also matches "-F" inside target paths or region names,
    # and joining fails on non-string option values.
    user_tokens = [tok for opt in user_opts for tok in str(opt).split()]
    user_sets_min_af = any(tok.split("=", 1)[0] == "--min-alternate-fraction" or tok.startswith("-F")
                           for tok in user_tokens)
    if not user_sets_min_af:
        # add minimum reportable allele frequency, for which FreeBayes defaults to 20
        min_af = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 20)) / 100.0
        opts += ["--min-alternate-fraction", str(min_af)]
    return opts
Esempio n. 34
0
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Build FreeBayes options: ploidy, targets, user options and minimum fraction.

    The target comes from subsetting the configured ``variant_regions`` by
    ``region``. Options from the "freebayes" resources are appended next.
    Unless those options already set ``--min-alternate-fraction``/``-F``,
    the configured ``min_allele_fraction`` percentage (default 20) is added
    as a fraction.
    """
    opts = []
    opts += ["--ploidy", str(ploidy.get_ploidy(items, region))]

    variant_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    target = subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            opts += ["--targets", target]
        else:
            opts += ["--region", region_to_freebayes(target)]
    resources = config_utils.get_resources("freebayes", config)
    user_opts = resources.get("options") or []
    if user_opts:
        opts += user_opts
    # Match whole option tokens from the user options only. A substring search
    # over every option also matches "-F" inside target paths or region names,
    # and joining fails on non-string option values.
    user_tokens = [tok for opt in user_opts for tok in str(opt).split()]
    user_sets_min_af = any(tok.split("=", 1)[0] == "--min-alternate-fraction" or tok.startswith("-F")
                           for tok in user_tokens)
    if not user_sets_min_af:
        # add minimum reportable allele frequency, for which FreeBayes defaults to 20
        min_af = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 20)) / 100.0
        opts += ["--min-alternate-fraction", str(min_af)]
    return opts
Esempio n. 35
0
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Assemble the standard FreeBayes options from configuration input.

    Overlapping intervals in the input BED target file are merged first,
    since overlaps cause FreeBayes to call the same region multiple times.
    Options from the "freebayes" resources are appended last.
    """
    opts = ["--ploidy", str(ploidy.get_ploidy(items, region))]
    configured_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    merged_regions = bedutils.merge_overlaps(configured_regions, items[0])
    target = subset_variant_regions(merged_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            restriction = ["--targets", target]
        else:
            restriction = ["--region", region_to_freebayes(target)]
        opts.extend(restriction)
    extra_opts = config_utils.get_resources("freebayes", config).get("options")
    if extra_opts:
        opts += extra_opts
    return opts
Esempio n. 36
0
def unified_genotyper(align_bams, items, ref_file, assoc_files,
                       region=None, out_file=None):
    """Perform SNP genotyping on the given alignment files with UnifiedGenotyper.

    ``out_file`` defaults to ``<first BAM base>-variants.vcf`` and an existing
    file is returned without re-running. When ``region`` is not a list or
    tuple and any input BAM lacks aligned reads for it, an empty VCF is
    written instead of running GATK.

    Returns the output file path.
    """
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if file_exists(out_file):
        return out_file

    runner, params = _shared_gatk_call_prep(align_bams, ref_file, items[0]["config"],
                                            assoc_files["dbsnp"], region, out_file)
    region_is_coords = isinstance(region, (list, tuple))
    if not region_is_coords and any(not has_aligned_reads(bam, region) for bam in align_bams):
        vcfutils.write_empty_vcf(out_file)
        return out_file
    with file_transaction(out_file) as tx_out_file:
        params += ["-T", "UnifiedGenotyper"]
        params += ["-o", tx_out_file]
        params += ["-ploidy", str(ploidy.get_ploidy(items, region))]
        params += ["--genotype_likelihoods_model", "BOTH"]
        runner.run_gatk(params)
    return out_file
Esempio n. 37
0
def _run_genotype_gvcfs_genomicsdb(genomics_db, region, out_file, data):
    """Run GenotypeGVCFs on a merged GenomicsDB input (GATK4 style arguments).

    Skips the run when ``out_file`` exists. With more than one core the
    memory request is scaled up by 0.9 times the core count; no other core
    specific arguments are passed.

    Returns the result of bgzipping and indexing ``out_file``.
    """
    if utils.file_exists(out_file):
        return vcfutils.bgzip_and_index(out_file, data["config"])

    with file_transaction(data, out_file) as tx_out_file:
        runner = broad.runner_from_config(data["config"])
        params = ["-T", "GenotypeGVCFs"]
        params += ["--variant", "gendb://%s" % genomics_db]
        params += ["-R", dd.get_ref_file(data)]
        params += ["--output", tx_out_file]
        params += ["-L", bamprep.region_to_gatk(region)]
        params += ["-ploidy", str(ploidy.get_ploidy([data], region))]
        # Avoid slow genotyping runtimes with improved quality score calculation in GATK4
        # https://gatkforums.broadinstitute.org/gatk/discussion/11471/performance-troubleshooting-tips-for-genotypegvcfs/p1
        params.append("--use-new-qual-calculator")
        cores = dd.get_cores(data)
        if cores > 1:
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
        else:
            memscale = None
        runner.run_gatk(params, memscale=memscale)
    return vcfutils.bgzip_and_index(out_file, data["config"])
Esempio n. 38
0
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Assemble the standard FreeBayes options from configuration input.

    Overlapping intervals in the input BED target file are merged first,
    since overlaps cause FreeBayes to call the same region multiple times.
    Options from the "freebayes" resources are appended last.
    """
    cur_ploidy = ploidy.get_ploidy(items, region)
    opts = ["--ploidy", str(cur_ploidy)]

    variant_regions = bedutils.merge_overlaps(
        utils.get_in(config, ("algorithm", "variant_regions")), items[0])
    target = subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        target_is_file = isinstance(target, basestring) and os.path.isfile(target)
        if target_is_file:
            opts += ["--targets", target]
        else:
            opts += ["--region", region_to_freebayes(target)]

    resources = config_utils.get_resources("freebayes", config)
    user_opts = resources.get("options")
    if user_opts:
        opts += user_opts
    return opts
Esempio n. 39
0
def haplotype_caller(align_bams, items, ref_file, assoc_files,
                     region=None, out_file=None):
    """Call variation with GATK's HaplotypeCaller.

    This requires the full non open-source version of GATK.

    Builds the HaplotypeCaller (or HaplotypeCallerSpark) command line and runs it
    inside a file transaction when ``utils.file_exists(out_file)`` is false, then
    finishes by bgzipping and indexing the output.

    Args:
        align_bams: input BAM files; the first one names the default output.
        items: sample data dictionaries; the first one supplies the core count
            and the configuration.
        ref_file: reference genome, forwarded to the shared GATK preparation.
        assoc_files: not used by this function.
        region: optional region to call in.
        out_file: output path; defaults to ``<first BAM base>-variants.vcf.gz``.

    Returns:
        The result of ``vcfutils.bgzip_and_index`` for the output file.

    Raises:
        AssertionError: if the available GATK is neither "restricted" nor "gatk4".
        subprocess.CalledProcessError: if GATK fails for any reason other than
            the Spark "empty collection" error handled below.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        num_cores = dd.get_num_cores(items[0])
        broad_runner, params = \
            _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores)
        gatk_type = broad_runner.gatk_type()
        assert gatk_type in ["restricted", "gatk4"], \
            "Require full version of GATK 2.4+, or GATK4 for haplotype calling"
        with file_transaction(items[0], out_file) as tx_out_file:
            resources = config_utils.get_resources("gatk-spark", items[0]["config"])
            spark_opts = [str(x) for x in resources.get("options", [])]
            if _use_spark(num_cores, gatk_type, items, spark_opts):
                params += ["-T", "HaplotypeCallerSpark"]
                # User supplied Spark options replace the local defaults entirely
                if spark_opts:
                    params += spark_opts
                else:
                    params += ["--spark-master", "local[%s]" % num_cores,
                               "--conf", "spark.local.dir=%s" % os.path.dirname(tx_out_file),
                               "--conf", "spark.driver.host=localhost", "--conf", "spark.network.timeout=800",
                               "--conf", "spark.executor.heartbeatInterval=100"]
            else:
                params += ["-T", "HaplotypeCaller"]
            params += ["--annotation", "ClippingRankSumTest",
                       "--annotation", "DepthPerSampleHC"]
            # Enable hardware based optimizations in GATK 3.1+
            if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.1"):
                if _supports_avx():
                    # Scale down HMM thread default to avoid overuse of cores
                    # https://github.com/bcbio/bcbio-nextgen/issues/2442
                    if gatk_type == "gatk4":
                        params += ["--native-pair-hmm-threads", "1"]
                    # GATK4 selects the right HMM optimization automatically with FASTEST_AVAILABLE
                    # GATK3 needs to be explicitly set
                    else:
                        params += ["--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"]
            resources = config_utils.get_resources("gatk-haplotype", items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            # Prepare gVCFs if doing joint calling
            is_joint = False
            if _joint_calling(items) or any("gvcf" in dd.get_tools_on(d) for d in items):
                is_joint = True
                # If joint calling parameters not set in user options
                erc_flags = ("--emit-ref-confidence", "-ERC", "--emitRefConfidence")
                if not any(x in erc_flags for x in params):
                    if gatk_type == "gatk4":
                        params += ["--emit-ref-confidence", "GVCF"]
                    else:
                        params += ["--emitRefConfidence", "GVCF"]
                        params += ["--variant_index_type", "LINEAR", "--variant_index_parameter", "128000"]
                # Set GQ banding to not be single GQ resolution
                # No recommended default but try to balance resolution and size
                # http://gatkforums.broadinstitute.org/gatk/discussion/7051/recommendation-best-practices-gvcf-gq-bands
                if "-GQB" not in params:
                    for boundary in [10, 20, 30, 40, 60, 80]:
                        params += ["-GQB", str(boundary)]
            # Enable non-diploid calling in GATK 3.3+
            if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.3"):
                params += ["-ploidy", str(ploidy.get_ploidy(items, region))]
            if gatk_type == "gatk4":
                # GATK4 Spark calling does not support bgzipped output, use plain VCFs
                if is_joint and _use_spark(num_cores, gatk_type, items, spark_opts):
                    tx_out_file = tx_out_file.replace(".vcf.gz", ".vcf")
                params += ["--output", tx_out_file]
            else:
                params += ["-o", tx_out_file]
            broad_runner.new_resources("gatk-haplotype")
            memscale = {"magnitude": 0.9 * num_cores, "direction": "increase"} if num_cores > 1 else None
            try:
                broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale,
                                      parallel_gc=_use_spark(num_cores, gatk_type, items, spark_opts))
            except subprocess.CalledProcessError as msg:
                # Spark failing on regions without any reads, write an empty VCF instead
                # https://github.com/broadinstitute/gatk/issues/4234
                error_text = str(msg)
                if (_use_spark(num_cores, gatk_type, items, spark_opts) and
                      "java.lang.UnsupportedOperationException: empty collection" in error_text and
                      "at org.apache.spark.rdd.RDD" in error_text):
                    vcfutils.write_empty_vcf(tx_out_file, samples=[dd.get_sample_name(d) for d in items])
                else:
                    raise
            # Plain VCF output (see the Spark case above) is compressed and indexed here
            if tx_out_file.endswith(".vcf"):
                vcfutils.bgzip_and_index(tx_out_file, items[0]["config"])

    # avoid bug in GATK where files can get output as non-compressed
    if out_file.endswith(".gz") and not os.path.exists(out_file + ".tbi"):
        # Inspect the leading bytes in binary mode: a genuinely gzipped file is
        # binary data, and decoding it as text can raise UnicodeDecodeError on Python 3.
        vcf_magic = b"##fileformat"
        with open(out_file, "rb") as in_handle:
            is_plain_text = in_handle.read(len(vcf_magic)) == vcf_magic
        if is_plain_text:
            # Rename to a plain .vcf so it gets compressed properly below
            text_out_file = out_file
            out_file = out_file.replace(".vcf.gz", ".vcf")
            shutil.move(text_out_file, out_file)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
Esempio n. 40
0
def haplotype_caller(align_bams,
                     items,
                     ref_file,
                     assoc_files,
                     region=None,
                     out_file=None):
    """Call variation with GATK's HaplotypeCaller.

    This requires the full non open-source version of GATK.

    Builds the HaplotypeCaller (or HaplotypeCallerSpark) command line and runs it
    inside a file transaction when ``utils.file_exists(out_file)`` is false, then
    finishes by bgzipping and indexing the output.

    Args:
        align_bams: input BAM files; the first one names the default output.
        items: sample data dictionaries; the first one supplies the core count
            and the configuration.
        ref_file: reference genome, forwarded to the shared GATK preparation.
        assoc_files: not used by this function.
        region: optional region to call in.
        out_file: output path; defaults to ``<first BAM base>-variants.vcf.gz``.

    Returns:
        The result of ``vcfutils.bgzip_and_index`` for the output file.

    Raises:
        AssertionError: if the available GATK is neither "restricted" nor "gatk4".
        subprocess.CalledProcessError: if GATK fails for any reason other than
            the Spark "empty collection" error handled below.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        num_cores = dd.get_num_cores(items[0])
        broad_runner, params = \
            _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores)
        gatk_type = broad_runner.gatk_type()
        assert gatk_type in ["restricted", "gatk4"], \
            "Require full version of GATK 2.4+, or GATK4 for haplotype calling"
        with file_transaction(items[0], out_file) as tx_out_file:
            if _use_spark(num_cores, gatk_type):
                params += [
                    "-T", "HaplotypeCallerSpark",
                    "--spark-master", "local[%s]" % num_cores,
                    "--conf", "spark.local.dir=%s" % os.path.dirname(tx_out_file),
                    "--conf", "spark.driver.host=localhost",
                    "--conf", "spark.network.timeout=800",
                    "--conf", "spark.executor.heartbeatInterval=100",
                ]
            else:
                params += ["-T", "HaplotypeCaller"]
            params += [
                "--annotation", "ClippingRankSumTest",
                "--annotation", "DepthPerSampleHC",
            ]
            # Enable hardware based optimizations in GATK 3.1+
            if LooseVersion(
                    broad_runner.gatk_major_version()) >= LooseVersion("3.1"):
                if _supports_avx():
                    # Scale down HMM thread default to avoid overuse of cores
                    # https://github.com/bcbio/bcbio-nextgen/issues/2442
                    if gatk_type == "gatk4":
                        params += ["--native-pair-hmm-threads", "1"]
                    # GATK4 selects the right HMM optimization automatically with FASTEST_AVAILABLE
                    # GATK3 needs to be explicitly set
                    else:
                        params += [
                            "--pair_hmm_implementation",
                            "VECTOR_LOGLESS_CACHING",
                        ]
            # Prepare gVCFs if doing joint calling
            is_joint = False
            if _joint_calling(items) or any("gvcf" in dd.get_tools_on(d)
                                            for d in items):
                is_joint = True
                if gatk_type == "gatk4":
                    params += ["--emit-ref-confidence", "GVCF"]
                else:
                    params += ["--emitRefConfidence", "GVCF"]
                    params += [
                        "--variant_index_type", "LINEAR",
                        "--variant_index_parameter", "128000",
                    ]
                # Set GQ banding to not be single GQ resolution
                # No recommended default but try to balance resolution and size
                # http://gatkforums.broadinstitute.org/gatk/discussion/7051/recommendation-best-practices-gvcf-gq-bands
                for boundary in [10, 20, 30, 40, 60, 80]:
                    params += ["-GQB", str(boundary)]
            # Enable non-diploid calling in GATK 3.3+
            if LooseVersion(
                    broad_runner.gatk_major_version()) >= LooseVersion("3.3"):
                # GenomicsDB does not support non-diploid samples in GATK4 joint calling
                # https://gatkforums.broadinstitute.org/gatk/discussion/10061/using-genomicsdbimport-to-prepare-gvcfs-for-input-to-genotypegvcfs-in-gatk4
                # NOTE(review): this parses as `(not is_joint) and (gatk_type == "gatk4")`,
                # so GATK3 runs never receive -ploidy. Confirm whether
                # `not (is_joint and gatk_type == "gatk4")` was intended.
                if not is_joint and gatk_type == "gatk4":
                    params += [
                        "-ploidy",
                        str(ploidy.get_ploidy(items, region)),
                    ]
            # User supplied options are appended after the generated defaults
            resources = config_utils.get_resources("gatk-haplotype",
                                                   items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            if gatk_type == "gatk4":
                # GATK4 Spark calling does not support bgzipped output, use plain VCFs
                if is_joint and _use_spark(num_cores, gatk_type):
                    tx_out_file = tx_out_file.replace(".vcf.gz", ".vcf")
                params += ["--output", tx_out_file]
            else:
                params += ["-o", tx_out_file]
            broad_runner.new_resources("gatk-haplotype")
            memscale = {
                "magnitude": 0.9 * num_cores,
                "direction": "increase",
            } if num_cores > 1 else None
            try:
                broad_runner.run_gatk(params,
                                      os.path.dirname(tx_out_file),
                                      memscale=memscale,
                                      parallel_gc=_use_spark(
                                          num_cores, gatk_type))
            except subprocess.CalledProcessError as msg:
                # Spark failing on regions without any reads, write an empty VCF instead
                # https://github.com/broadinstitute/gatk/issues/4234
                error_text = str(msg)
                if (_use_spark(num_cores, gatk_type)
                        and "java.lang.UnsupportedOperationException: empty collection" in error_text
                        and "at org.apache.spark.rdd.RDD" in error_text):
                    vcfutils.write_empty_vcf(
                        tx_out_file,
                        samples=[dd.get_sample_name(d) for d in items])
                else:
                    raise
            # Plain VCF output (see the Spark case above) is compressed and indexed here
            if tx_out_file.endswith(".vcf"):
                vcfutils.bgzip_and_index(tx_out_file, items[0]["config"])

    # avoid bug in GATK where files can get output as non-compressed
    if out_file.endswith(".gz") and not os.path.exists(out_file + ".tbi"):
        # Inspect the leading bytes in binary mode: a genuinely gzipped file is
        # binary data, and decoding it as text can raise UnicodeDecodeError on Python 3.
        vcf_magic = b"##fileformat"
        with open(out_file, "rb") as in_handle:
            is_plain_text = in_handle.read(len(vcf_magic)) == vcf_magic
        if is_plain_text:
            # Rename to a plain .vcf so it gets compressed properly below
            text_out_file = out_file
            out_file = out_file.replace(".vcf.gz", ".vcf")
            shutil.move(text_out_file, out_file)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])