Beispiel #1
0
def unified_genotyper(align_bams,
                      items,
                      ref_file,
                      assoc_files,
                      region=None,
                      out_file=None):
    """Perform SNP genotyping on the given alignment file.
    """
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        broad_runner, params = \
            _shared_gatk_call_prep(align_bams, ref_file, items[0]["config"], assoc_files["dbsnp"],
                                   region, out_file)
        if (not isinstance(region, (list, tuple))
                and not all(has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
        else:
            with file_transaction(out_file) as tx_out_file:
                params += [
                    "-T", "UnifiedGenotyper", "-o", tx_out_file, "-ploidy",
                    (str(ploidy.get_ploidy(items, region))
                     if broad_runner.gatk_type() == "restricted" else "2"),
                    "--genotype_likelihoods_model", "BOTH"
                ]
                broad_runner.run_gatk(params)
    return out_file
Beispiel #2
0
def shared_variantcall(call_fn, name, align_bams, ref_file, items,
                       assoc_files, region=None, out_file=None):
    """Provide base functionality for prepping and indexing for variant calling.
    """
    config = items[0]["config"]
    if out_file is None:
        if vcfutils.is_paired_analysis(align_bams, items):
            out_file = "%s-paired-variants.vcf.gz" % config["metdata"]["batch"]
        else:
            out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        logger.debug("Genotyping with {name}: {region} {fname}".format(
              name=name, region=region, fname=os.path.basename(align_bams[0])))
        variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), items[0])
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        if (variant_regions is not None and isinstance(target_regions, basestring)
              and not os.path.isfile(target_regions)):
            vcfutils.write_empty_vcf(out_file, config)
        else:
            with file_transaction(config, out_file) as tx_out_file:
                call_fn(align_bams, ref_file, items, target_regions,
                        tx_out_file)
    if out_file.endswith(".gz"):
        out_file = vcfutils.bgzip_and_index(out_file, config)
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
Beispiel #3
0
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.
    """

    config = items[0]["config"]

    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    if version < "v2.3.5":
        raise IOError("Please install version 2.3.5 or better of VarScan with support "
                      "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar("VarScan",
                                       config_utils.get_program("varscan", config, "dir"))
    jvm_opts = _get_varscan_opts(config)
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, config,
                                    target_regions=target_regions, want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    cmd = ("{mpileup} | {remove_zerocoverage} "
           "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
           "  --vcf-sample-list {sample_list} --output-vcf --variants "
           "> {out_file}")
    cmd = cmd.format(**locals())
    do.run(cmd, "Varscan".format(**locals()), None,
           [do.file_exists(out_file)])
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
       write_empty_vcf(out_file)
Beispiel #4
0
def shared_variantcall(call_fn, name, align_bams, ref_file, items,
                       assoc_files, region=None, out_file=None):
    """Provide base functionality for prepping and indexing for variant calling.
    """
    config = items[0]["config"]
    if out_file is None:
        if vcfutils.is_paired_analysis(align_bams, items):
            out_file = "%s-paired-variants.vcf" % config["metdata"]["batch"]
        else:
            out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        logger.info("Genotyping with {name}: {region} {fname}".format(
            name=name, region=region, fname=os.path.basename(align_bams[0])))
        for x in align_bams:
            bam.index(x, config)
        variant_regions = config["algorithm"].get("variant_regions", None)
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        if ((variant_regions is not None and isinstance(target_regions, basestring)
              and not os.path.isfile(target_regions))
              or not all(realign.has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
        else:
            with file_transaction(out_file) as tx_out_file:
                call_fn(align_bams, ref_file, items, target_regions,
                        tx_out_file)
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files["dbsnp"],
                                               ref_file, config)
    return ann_file
Beispiel #5
0
def haplotype_caller(align_bams, items, ref_file, assoc_files,
                       region=None, out_file=None):
    """Call variation with GATK's HaplotypeCaller.

    This requires the full non open-source version of GATK.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        config = items[0]["config"]
        broad_runner, params = \
            _shared_gatk_call_prep(align_bams, ref_file, items[0]["config"], assoc_files["dbsnp"],
                                   region, out_file)
        assert broad_runner.gatk_type() == "restricted", \
            "Require full version of GATK 2.4+ for haplotype calling"
        if not all(has_aligned_reads(x, region) for x in align_bams):
            vcfutils.write_empty_vcf(out_file, config)
        else:
            with file_transaction(out_file) as tx_out_file:
                params += ["-T", "HaplotypeCaller",
                           "-o", tx_out_file,
                           "--annotation", "ClippingRankSumTest",
                           "--annotation", "DepthPerSampleHC"]
                # Enable hardware based optimizations in GATK 3.1+
                if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.1"):
                    params += ["--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"]
                broad_runner.new_resources("gatk-haplotype")
                broad_runner.run_gatk(params)
    return out_file
Beispiel #6
0
def _call_variants_samtools(align_bams, ref_file, items, target_regions, out_file):
    """Call variants with samtools in target_regions.

    Works around a GATK VCF compatibility issue in samtools 0.20 by removing extra
    Version information from VCF header lines.
    """
    config = items[0]["config"]

    max_read_depth = "1000"
    mpileup = prep_mpileup(align_bams, ref_file, max_read_depth, config,
                           target_regions=target_regions)
    bcftools = config_utils.get_program("bcftools", config)
    bcftools_version = programs.get_version("bcftools", config=config)
    samtools_version = programs.get_version("samtools", config=config)
    if LooseVersion(bcftools_version) > LooseVersion("0.1.19"):
        if LooseVersion(samtools_version) <= LooseVersion("0.1.19"):
            raise ValueError("samtools calling not supported with 0.1.19 samtools and 0.20 bcftools")
        bcftools_opts = "call -v -c"
    else:
        bcftools_opts = "view -v -c -g"
    compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
    vcfutils = config_utils.get_program("vcfutils.pl", config)
    # XXX Check if we need this when supporting samtools 0.2.0 calling.
    # 0.1.9 fails on regions without reads.
    if not any(realign.has_aligned_reads(x, target_regions) for x in align_bams):
        vcfutils.write_empty_vcf(out_file, config)
    else:
        cmd = ("{mpileup} "
               "| {bcftools} {bcftools_opts} - "
               "| {vcfutils} varFilter -D {max_read_depth} "
               "| sed 's/,Version=3>/>/'"
               "{compress_cmd} > {out_file}")
        logger.info(cmd.format(**locals()))
        do.run(cmd.format(**locals()), "Variant calling with samtools", {})
Beispiel #7
0
def shared_variantcall(call_fn,
                       name,
                       align_bams,
                       ref_file,
                       config,
                       assoc_files,
                       region=None,
                       out_file=None):
    """Provide base functionality for prepping and indexing for variant calling.
    """
    broad_runner = broad.runner_from_config(config)
    for x in align_bams:
        broad_runner.run_fn("picard_index", x)
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        logger.info("Genotyping with {name}: {region} {fname}".format(
            name=name, region=region, fname=os.path.basename(align_bams[0])))
        variant_regions = config["algorithm"].get("variant_regions", None)
        target_regions = subset_variant_regions(variant_regions, region,
                                                out_file)
        if ((variant_regions is not None
             and isinstance(target_regions, basestring)
             and not os.path.isfile(target_regions)) or not all(
                 realign.has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
        else:
            with file_transaction(out_file) as tx_out_file:
                call_fn(align_bams, ref_file, config, target_regions,
                        tx_out_file)
    return out_file
Beispiel #8
0
def shared_variantcall(call_fn, name, align_bams, ref_file, config,
                       assoc_files, region=None, out_file=None):
    """Provide base functionality for prepping and indexing for variant calling.
    """
    broad_runner = broad.runner_from_config(config)
    for x in align_bams:
        broad_runner.run_fn("picard_index", x)
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        logger.info("Genotyping with {name}: {region} {fname}".format(name=name,
            region=region, fname=os.path.basename(align_bams[0])))
        variant_regions = config["algorithm"].get("variant_regions", None)
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        if ((variant_regions is not None and isinstance(target_regions, basestring)
              and not os.path.isfile(target_regions))
              or not all(realign.has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
        else:
            with file_transaction(out_file) as tx_out_file:
                call_fn(align_bams, ref_file, config, target_regions,
                        tx_out_file)
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.dbsnp,
                                               ref_file, config)
    return ann_file
Beispiel #9
0
def combine_calls(batch_id, samples, data):
    """Combine multiple callsets into a final set of merged calls.
    """
    logger.info("Ensemble consensus calls for {0}: {1}".format(
        batch_id, ",".join(x["variantcaller"] for x in samples[0]["variants"])))
    edata = copy.deepcopy(data)
    base_dir = utils.safe_makedir(os.path.join(edata["dirs"]["work"], "ensemble", batch_id))
    caller_names, vrn_files, bam_files = _organize_variants(samples, batch_id)
    exist_variants = False
    for tmp_vrn_file in vrn_files:
        if vcfutils.vcf_has_variants(tmp_vrn_file):
            exist_variants = True
            break
    if exist_variants:
        if "classifiers" not in edata["config"]["algorithm"]["ensemble"]:
            callinfo = _run_ensemble_intersection(batch_id, vrn_files, base_dir, edata)
        else:
            config_file = _write_config_file(batch_id, caller_names, base_dir, edata)
            callinfo = _run_ensemble(batch_id, vrn_files, config_file, base_dir,
                                     edata["sam_ref"], edata)
            callinfo["vrn_file"] = vcfutils.bgzip_and_index(callinfo["vrn_file"], data["config"])
        edata["config"]["algorithm"]["variantcaller"] = "ensemble"
        edata["vrn_file"] = callinfo["vrn_file"]
        edata["ensemble_bed"] = callinfo["bed_file"]
        callinfo["validate"] = validate.compare_to_rm(edata)[0][0].get("validate")
    else:
        out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf".format(batch_id))
        vcfutils.write_empty_vcf(out_vcf_file)
        callinfo = {"variantcaller": "ensemble",
                    "vrn_file": vcfutils.bgzip_and_index(out_vcf_file, data["config"]),
                    "bed_file": None}
    return [[batch_id, callinfo]]
Beispiel #10
0
def _run_delly(bam_files, sv_type, ref_file, work_dir, items):
    """Run delly, calling structural variations for the specified type.
    """
    out_file = os.path.join(
        work_dir, "%s-svs%s.vcf" %
        (os.path.splitext(os.path.basename(bam_files[0]))[0], sv_type))
    cores = min(
        utils.get_in(items[0], ("config", "algorithm", "num_cores"), 1),
        len(bam_files))
    if not utils.file_exists(out_file) and not utils.file_exists(out_file +
                                                                 ".gz"):
        with file_transaction(out_file) as tx_out_file:
            sv_exclude_bed = get_sv_exclude_file(items)
            exclude = ["-x", sv_exclude_bed] if sv_exclude_bed else []
            cmd = ["delly", "-t", sv_type, "-g", ref_file, "-o", tx_out_file
                   ] + exclude + bam_files
            multi_cmd = "export OMP_NUM_THREADS=%s && " % cores
            try:
                do.run(multi_cmd + " ".join(cmd), "delly structural variant")
            except subprocess.CalledProcessError, msg:
                # delly returns an error exit code if there are no variants
                if "No structural variants found" in str(msg):
                    vcfutils.write_empty_vcf(out_file)
                else:
                    raise
Beispiel #11
0
def haplotype_caller(align_bams, items, ref_file, assoc_files,
                       region=None, out_file=None):
    """Call variation with GATK's HaplotypeCaller.

    This requires the full non open-source version of GATK.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        config = items[0]["config"]
        broad_runner, params = \
            _shared_gatk_call_prep(align_bams, ref_file, items[0]["config"], assoc_files["dbsnp"],
                                   region, out_file)
        assert broad_runner.gatk_type() == "restricted", \
            "Require full version of GATK 2.4+ for haplotype calling"
        if not all(has_aligned_reads(x, region) for x in align_bams):
            vcfutils.write_empty_vcf(out_file, config)
        else:
            with file_transaction(out_file) as tx_out_file:
                params += ["-T", "HaplotypeCaller",
                           "-o", tx_out_file]
                #params = _gatk_location_hack(params)
                broad_runner.new_resources("gatk-haplotype")
                broad_runner.run_gatk(params)
    return out_file
Beispiel #12
0
def shared_variantcall(call_fn, name, align_bams, ref_file, items,
                       assoc_files, region=None, out_file=None):
    """Provide base functionality for prepping and indexing for variant calling.
    """
    config = items[0]["config"]
    if out_file is None:
        if vcfutils.is_paired_analysis(align_bams, items):
            out_file = "%s-paired-variants.vcf.gz" % config["metdata"]["batch"]
        else:
            out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        logger.debug("Genotyping with {name}: {region} {fname}".format(
              name=name, region=region, fname=os.path.basename(align_bams[0])))
        variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), items[0])
        target_regions = subset_variant_regions(variant_regions, region, out_file, items=items)
        if (variant_regions is not None and isinstance(target_regions, basestring)
              and not os.path.isfile(target_regions)):
            vcfutils.write_empty_vcf(out_file, config)
        else:
            with file_transaction(config, out_file) as tx_out_file:
                call_fn(align_bams, ref_file, items, target_regions,
                        tx_out_file)
    if out_file.endswith(".gz"):
        out_file = vcfutils.bgzip_and_index(out_file, config)
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
Beispiel #13
0
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):

    """Run the MuTect paired analysis algorithm."""

    if out_file is None:
        out_file = "%s-paired-variants.vcf" % os.path.splitext(
            align_bams[0])[0]

    if not file_exists(out_file):
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                                   region, out_file)

        if (not isinstance(region, (list, tuple)) and
            not all(has_aligned_reads(x, region) for x in align_bams)):

                vcfutils.write_empty_vcf(out_file)
                return

        with file_transaction(out_file) as tx_out_file:
            # Rationale: MuTect writes another table to stdout,
            # which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]

            broad_runner.run_mutect(params)

    return out_file
Beispiel #14
0
def run_cortex(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Top level entry to regional de-novo based variant calling with cortex_var.
    """
    if len(align_bams) == 1:
        align_bam = align_bams[0]
        config = items[0]["config"]
    else:
        raise NotImplementedError("Need to add multisample calling for cortex_var")
    if out_file is None:
        out_file = "%s-cortex.vcf" % os.path.splitext(align_bam)[0]
    if region is not None:
        work_dir = safe_makedir(os.path.join(os.path.dirname(out_file), region.replace(".", "_")))
    else:
        work_dir = os.path.dirname(out_file)
    if not file_exists(out_file):
        bam.index(align_bam, config)
        variant_regions = config["algorithm"].get("variant_regions", None)
        if not variant_regions:
            raise ValueError("Only support regional variant calling with cortex_var: set variant_regions")
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        if os.path.isfile(target_regions):
            with open(target_regions) as in_handle:
                regional_vcfs = [
                    _run_cortex_on_region(x.strip().split("\t")[:3], align_bam, ref_file, work_dir, out_file, config)
                    for x in in_handle
                ]

            combine_file = apply("{0}-raw{1}".format, os.path.splitext(out_file))
            _combine_variants(regional_vcfs, combine_file, ref_file, config)
            _select_final_variants(combine_file, out_file, config)
        else:
            vcfutils.write_empty_vcf(out_file)
    return out_file
Beispiel #15
0
def haplotype_caller(align_bams,
                     items,
                     ref_file,
                     assoc_files,
                     region=None,
                     out_file=None):
    """Call variation with GATK's HaplotypeCaller.

    This requires the full non open-source version of GATK.
    """
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        broad_runner, params = \
            _shared_gatk_call_prep(align_bams, ref_file, items[0]["config"], assoc_files["dbsnp"],
                                   region, out_file)
        assert broad_runner.gatk_type() == "restricted", \
            "Require full version of GATK 2.4+ for haplotype calling"
        if not all(has_aligned_reads(x, region) for x in align_bams):
            vcfutils.write_empty_vcf(out_file)
        else:
            with file_transaction(out_file) as tx_out_file:
                params += ["-T", "HaplotypeCaller", "-o", tx_out_file]
                #params = _gatk_location_hack(params)
                broad_runner.new_resources("gatk-haplotype")
                broad_runner.run_gatk(params)
    return out_file
Beispiel #16
0
def mutect_caller(align_bams,
                  items,
                  ref_file,
                  assoc_files,
                  region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm."""

    if out_file is None:
        out_file = "%s-paired-variants.vcf" % os.path.splitext(
            align_bams[0])[0]

    if not file_exists(out_file):
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                                   region, out_file)

        if (not isinstance(region, (list, tuple))
                and not all(has_aligned_reads(x, region) for x in align_bams)):

            vcfutils.write_empty_vcf(out_file)
            return

        with file_transaction(out_file) as tx_out_file:
            # Rationale: MuTect writes another table to stdout,
            # which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]

            broad_runner.run_mutect(params)

    return out_file
Beispiel #17
0
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        base_config = items[0]["config"]
        broad_runner = broad.runner_from_config(base_config, "mutect")
        out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf")
                           if "vcf" in out_file else out_file + "-mutect.vcf")
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                                   region, out_file_mutect)
        if (not isinstance(region, (list, tuple)) and
              not all(has_aligned_reads(x, region) for x in align_bams)):
                vcfutils.write_empty_vcf(out_file)
                return
        with file_transaction(out_file_mutect) as tx_out_file:
            # Rationale: MuTect writes another table to stdout, which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            broad_runner.run_mutect(params)
        _rename_allelic_fraction_field(out_file_mutect,config)
        disable_SID = True # SID isn't great, so use Scalpel instead
        if "appistry" not in broad_runner.get_mutect_version() or disable_SID:
            # Scalpel InDels
            is_paired = "-I:normal" in params
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            if scalpel.is_installed(items[0]["config"]):
                with file_transaction(out_file_indels) as tx_out_file2:
                    if not is_paired:
                        scalpel._run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                                                    region=region, out_file=tx_out_file2)
                    else:
                        scalpel._run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                                                    region=region, out_file=tx_out_file2)
                out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                          out_file=out_file,
                                                          ref_file=items[0]["sam_ref"],
                                                          config=items[0]["config"],
                                                          region=region)
            else:
                utils.symlink_plus(out_file_mutect, out_file)
        else:
            # SomaticIndelDetector modifications
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files,
                                           region, out_file_indels)
            with file_transaction(out_file_indels) as tx_out_file:
                params_indels += ["-o", tx_out_file]
                broad_runner.run_mutect(params_indels)
            out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                      out_file=out_file,
                                                      ref_file=items[0]["sam_ref"],
                                                      config=items[0]["config"],
                                                      region=region)
    return out_file
Beispiel #18
0
def run_cortex(align_bams, items, ref_file, assoc_files, region=None,
               out_file=None):
    """Top level entry to regional de-novo based variant calling with cortex_var.
    """
    raise NotImplementedError("Cortex currently out of date and needs reworking.")
    if len(align_bams) == 1:
        align_bam = align_bams[0]
        config = items[0]["config"]
    else:
        raise NotImplementedError("Need to add multisample calling for cortex_var")
    if out_file is None:
        out_file = "%s-cortex.vcf" % os.path.splitext(align_bam)[0]
    if region is not None:
        work_dir = safe_makedir(os.path.join(os.path.dirname(out_file),
                                             region.replace(".", "_")))
    else:
        work_dir = os.path.dirname(out_file)
    if not file_exists(out_file):
        bam.index(align_bam, config)
        variant_regions = config["algorithm"].get("variant_regions", None)
        if not variant_regions:
            raise ValueError("Only support regional variant calling with cortex_var: set variant_regions")
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        if os.path.isfile(target_regions):
            with open(target_regions) as in_handle:
                regional_vcfs = [_run_cortex_on_region(x.strip().split("\t")[:3], align_bam,
                                                       ref_file, work_dir, out_file, config)
                                 for x in in_handle]

            combine_file = "{0}-raw{1}".format(*os.path.splitext(out_file))
            _combine_variants(regional_vcfs, combine_file, ref_file, config)
            _select_final_variants(combine_file, out_file, config)
        else:
            vcfutils.write_empty_vcf(out_file)
    return out_file
def _run_somatic(paired, ref_file, assoc_files, region, out_file, work_dir):
    if not utils.file_exists(out_file):
        with file_transaction(paired.tumor_data, work_dir) as tx_work_dir:
            workflow_file = _configure_somatic(paired, ref_file, region,
                                               out_file, tx_work_dir)
            if workflow_file:
                has_variants = True
                _run_workflow(paired.tumor_data, workflow_file, tx_work_dir)
            else:
                has_variants = False
                vcfutils.write_empty_vcf(
                    out_file, paired.tumor_data["config"], [
                        dd.get_sample_name(d)
                        for d in [paired.tumor_data, paired.normal_data]
                    ])
        if has_variants:
            var_dir = os.path.join(work_dir, "results", "variants")
            vcfutils.combine_variant_files([
                _postprocess_somatic(os.path.join(var_dir, f), paired)
                for f in ["somatic.snvs.vcf.gz", "somatic.indels.vcf.gz"]
            ],
                                           out_file,
                                           ref_file,
                                           paired.tumor_data["config"],
                                           region=region)
    return out_file
Beispiel #20
0
def combine_calls(batch_id, samples, data):
    """Combine multiple callsets into a final set of merged calls.
    """
    logger.info("Ensemble consensus calls for {0}: {1}".format(
        batch_id, ",".join(x["variantcaller"] for x in samples[0]["variants"])))
    edata = copy.deepcopy(data)
    base_dir = utils.safe_makedir(os.path.join(edata["dirs"]["work"], "ensemble", batch_id))
    caller_names, vrn_files, bam_files = _organize_variants(samples, batch_id)
    exist_variants = False
    for tmp_vrn_file in vrn_files:
        if vcfutils.vcf_has_variants(tmp_vrn_file):
            exist_variants = True
            break
    if exist_variants:
        if "classifiers" not in edata["config"]["algorithm"]["ensemble"]:
            callinfo = _run_ensemble_intersection(batch_id, vrn_files, base_dir, edata)
        else:
            config_file = _write_config_file(batch_id, caller_names, base_dir, edata)
            callinfo = _run_ensemble(batch_id, vrn_files, config_file, base_dir,
                                     edata["sam_ref"], edata)
        edata["config"]["algorithm"]["variantcaller"] = "ensemble"
        edata["vrn_file"] = callinfo["vrn_file"]
        edata["ensemble_bed"] = callinfo["bed_file"]
        callinfo["validate"] = validate.compare_to_rm(edata)[0][0].get("validate")
    else:
        out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf".format(batch_id))
        vcfutils.write_empty_vcf(out_vcf_file)
        callinfo = {"variantcaller": "ensemble",
                    "vrn_file": out_vcf_file,
                    "bed_file": None}
    return [[batch_id, callinfo]]
Beispiel #21
0
def _run_freebayes_paired(align_bams,
                          items,
                          ref_file,
                          assoc_files,
                          region=None,
                          out_file=None):
    """Detect SNPs and indels with FreeBayes for paired tumor/normal samples.

    Sources of options for FreeBayes:
    mailing list: https://groups.google.com/d/msg/freebayes/dTWBtLyM4Vs/HAK_ZhJHguMJ
    mailing list: https://groups.google.com/forum/#!msg/freebayes/LLH7ZfZlVNs/63FdD31rrfEJ
    speedseq: https://github.com/cc2qe/speedseq/blob/e6729aa2589eca4e3a946f398c1a2bdc15a7300d/bin/speedseq#L916
    sga/freebayes: https://github.com/jts/sga-extra/blob/7e28caf71e8107b697f9be7162050e4fa259694b/
                   sga_generate_varcall_makefile.pl#L299
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            assert paired.normal_bam, "Require normal BAM for FreeBayes paired calling and filtering"

            freebayes = config_utils.get_program("freebayes", config)
            opts, no_target_regions = _freebayes_options_from_config(
                items, config, out_file, region)
            if no_target_regions:
                vcfutils.write_empty_vcf(
                    tx_out_file,
                    config,
                    samples=[
                        x for x in [paired.tumor_name, paired.normal_name] if x
                    ])
            else:
                opts = " ".join(opts)
                opts += " --min-repeat-entropy 1"
                opts += " --no-partial-observations"
                opts = _add_somatic_opts(opts, paired)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                clean_fmt_cmd = _clean_freebayes_fmt_cl()
                py_cl = os.path.join(os.path.dirname(sys.executable), "py")
                cl = (
                    "{freebayes} -f {ref_file} {opts} "
                    "{paired.tumor_bam} {paired.normal_bam} "
                    """| bcftools filter -i 'ALT="<*>" || QUAL > 5' """
                    "| {py_cl} -x 'bcbio.variation.freebayes.call_somatic(x)' "
                    "| {fix_ambig} | {clean_fmt_cmd} bcftools view -a - | "
                    "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | "
                    "vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort | "
                    "vt normalize -n -r {ref_file} -q - | vcfuniqalleles "
                    "{compress_cmd} > {tx_out_file}")
                do.run(cl.format(**locals()),
                       "Genotyping paired variants with FreeBayes", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
Beispiel #22
0
def _run_freebayes_caller(align_bams,
                          items,
                          ref_file,
                          assoc_files,
                          region=None,
                          out_file=None,
                          somatic=None):
    """Detect SNPs and indels with FreeBayes.

    Performs post-filtering to remove very low quality variants which
    can cause issues feeding into GATK. Breaks variants into individual
    allelic primitives for analysis and evaluation.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        if not utils.file_exists(out_file):
            with file_transaction(items[0], out_file) as tx_out_file:
                freebayes = config_utils.get_program("freebayes", config)
                input_bams = " ".join("-b %s" % x for x in align_bams)
                opts, no_target_regions = _freebayes_options_from_config(
                    items, config, out_file, region)
                if no_target_regions:
                    vcfutils.write_empty_vcf(
                        tx_out_file,
                        config,
                        samples=[dd.get_sample_name(d) for d in items])
                else:
                    opts = " ".join(opts)
                    # Recommended options from 1000 genomes low-complexity evaluation
                    # https://groups.google.com/d/msg/freebayes/GvxIzjcpbas/1G6e3ArxQ4cJ
                    opts += " --min-repeat-entropy 1"
                    # Remove partial observations, which cause a preference for heterozygote calls
                    # https://github.com/ekg/freebayes/issues/234#issuecomment-205331765
                    opts += " --no-partial-observations"
                    if somatic:
                        opts = _add_somatic_opts(opts, somatic)
                    compress_cmd = "| bgzip -c" if out_file.endswith(
                        "gz") else ""
                    # For multi-sample outputs, ensure consistent order
                    samples = (
                        "-s" +
                        ",".join([dd.get_sample_name(d)
                                  for d in items])) if len(items) > 1 else ""
                    fix_ambig = vcfutils.fix_ambiguous_cl()
                    py_cl = os.path.join(os.path.dirname(sys.executable), "py")
                    cmd = (
                        "{freebayes} -f {ref_file} {opts} {input_bams} "
                        """| bcftools filter -i 'ALT="<*>" || QUAL > 5' """
                        "| {fix_ambig} | bcftools view {samples} -a - | "
                        "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | "
                        "vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort | "
                        "vt normalize -n -r {ref_file} -q - | vcfuniqalleles | vt uniq - 2> /dev/null "
                        "{compress_cmd} > {tx_out_file}")
                    do.run(cmd.format(**locals()), "Genotyping with FreeBayes",
                           {})
    return out_file
Beispiel #23
0
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.
    """
    config = items[0]["config"]

    orig_out_file = out_file
    out_file = orig_out_file.replace(".vcf.gz", ".vcf")

    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    if version < "v2.3.6":
        raise IOError("Please install version 2.3.6 or better of VarScan"
                      " with support for multisample calling and indels"
                      " in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams,
                                    ref_file,
                                    config,
                                    max_read_depth,
                                    target_regions=target_regions,
                                    want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    # write a temporary mpileup file so we can check if empty
    mpfile = "%s.mpileup" % os.path.splitext(out_file)[0]
    with file_transaction(config, mpfile) as mpfile_tx:
        cmd = ("{mpileup} | {remove_zerocoverage} > {mpfile_tx}")
        do.run(cmd.format(**locals()), "mpileup for Varscan")
    if os.path.getsize(mpfile) == 0:
        write_empty_vcf(out_file)
    else:
        with tx_tmpdir(items[0]) as tmp_dir:
            jvm_opts = _get_varscan_opts(config, tmp_dir)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cmd = (
                "cat {mpfile} "
                "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
                "  --vcf-sample-list {sample_list} --output-vcf --variants "
                "| {fix_ambig} | vcfuniqalleles > {out_file}")
            do.run(cmd.format(**locals()), "Varscan", None,
                   [do.file_exists(out_file)])
    os.remove(sample_list)
    os.remove(mpfile)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
    else:
        freebayes.clean_vcf_output(out_file, _clean_varscan_line, config)

    if orig_out_file.endswith(".gz"):
        vcfutils.bgzip_and_index(out_file, config)
Beispiel #24
0
def combine_calls(*args):
    """Combine multiple callsets into a final set of merged calls.
    """
    if len(args) == 3:
        is_cwl = False
        batch_id, samples, data = args
        caller_names, vrn_files = _organize_variants(samples, batch_id)
    else:
        is_cwl = True
        samples = [utils.to_single_data(x) for x in args]
        samples = [cwlutils.unpack_tarballs(x, x) for x in samples]
        data = samples[0]
        batch_id = data["batch_id"]
        caller_names = data["variants"]["variantcallers"]
        vrn_files = data["variants"]["calls"]
    logger.info("Ensemble consensus calls for {0}: {1}".format(
        batch_id, ",".join(caller_names)))
    edata = copy.deepcopy(data)
    base_dir = utils.safe_makedir(os.path.join(edata["dirs"]["work"], "ensemble", batch_id))
    if any([vcfutils.vcf_has_variants(f) for f in vrn_files]):
        # Decompose multiallelic variants and normalize
        passonly = not tz.get_in(["config", "algorithm", "ensemble", "use_filtered"], edata, False)
        vrn_files = [normalize.normalize(f, data, passonly=passonly, rerun_effects=False, remove_oldeffects=True,
                                         nonrefonly=True,
                                         work_dir=utils.safe_makedir(os.path.join(base_dir, c)))
                     for c, f in zip(caller_names, vrn_files)]
        if "classifiers" not in (dd.get_ensemble(edata) or {}):
            callinfo = _run_ensemble_intersection(batch_id, vrn_files, caller_names, base_dir, edata)
        else:
            config_file = _write_config_file(batch_id, caller_names, base_dir, edata)
            callinfo = _run_ensemble(batch_id, vrn_files, config_file, base_dir,
                                     dd.get_ref_file(edata), edata)
            callinfo["vrn_file"] = vcfutils.bgzip_and_index(callinfo["vrn_file"], data["config"])
        # After decomposing multiallelic variants and normalizing, re-evaluate effects
        ann_ma_file, _ = effects.add_to_vcf(callinfo["vrn_file"], data)
        if ann_ma_file:
            callinfo["vrn_file"] = ann_ma_file

        edata["config"]["algorithm"]["variantcaller"] = "ensemble"
        edata["vrn_file"] = callinfo["vrn_file"]
        edata["ensemble_bed"] = callinfo["bed_file"]
        callinfo["validate"] = validate.compare_to_rm(edata)[0][0].get("validate")
    else:
        out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf".format(batch_id))
        vcfutils.write_empty_vcf(out_vcf_file, samples=[dd.get_sample_name(d) for d in samples])
        callinfo = {"variantcaller": "ensemble",
                    "vrn_file": vcfutils.bgzip_and_index(out_vcf_file, data["config"]),
                    "bed_file": None}
    if is_cwl:
        callinfo["batch_samples"] = data["batch_samples"]
        callinfo["batch_id"] = batch_id
        return [{"ensemble": callinfo}]
    else:
        return [[batch_id, callinfo]]
Beispiel #25
0
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.
    """
    config = items[0]["config"]

    orig_out_file = out_file
    out_file = orig_out_file.replace(".vcf.gz", ".vcf")

    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    if version < "v2.3.6":
        raise IOError("Please install version 2.3.6 or better of VarScan"
                      " with support for multisample calling and indels"
                      " in VCF format.")
    varscan_jar = config_utils.get_jar("VarScan",
                                       config_utils.get_program("varscan", config, "dir"))
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, config, max_read_depth,
                                    target_regions=target_regions, want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    # write a temporary mpileup file so we can check if empty
    mpfile = "%s.mpileup" % os.path.splitext(out_file)[0]
    with file_transaction(config, mpfile) as mpfile_tx:
        cmd = ("{mpileup} | {remove_zerocoverage} > {mpfile_tx}")
        do.run(cmd.format(**locals()), "mpileup for Varscan")
    if os.path.getsize(mpfile) == 0:
        write_empty_vcf(out_file)
    else:
        with tx_tmpdir(items[0]) as tmp_dir:
            jvm_opts = _get_varscan_opts(config, tmp_dir)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cmd = ("cat {mpfile} "
                   "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
                   "  --vcf-sample-list {sample_list} --output-vcf --variants "
                   "| {fix_ambig} | vcfuniqalleles > {out_file}")
            do.run(cmd.format(**locals()), "Varscan", None,
                   [do.file_exists(out_file)])
    os.remove(sample_list)
    os.remove(mpfile)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
    else:
        freebayes.clean_vcf_output(out_file, _clean_varscan_line, config)

    if orig_out_file.endswith(".gz"):
        vcfutils.bgzip_and_index(out_file, config)
Beispiel #26
0
def _run_freebayes_caller(align_bams,
                          items,
                          ref_file,
                          assoc_files,
                          region=None,
                          out_file=None,
                          somatic=None):
    """Detect SNPs and indels with FreeBayes.

    Performs post-filtering to remove very low quality variants which
    can cause issues feeding into GATK. Breaks variants into individual
    allelic primitives for analysis and evaluation.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            freebayes = config_utils.get_program("freebayes", config)
            vcffilter = config_utils.get_program("vcffilter", config)
            input_bams = " ".join("-b %s" % x for x in align_bams)
            opts, no_target_regions = _freebayes_options_from_config(
                items, config, out_file, region)
            if no_target_regions:
                vcfutils.write_empty_vcf(
                    tx_out_file,
                    config,
                    samples=[dd.get_sample_name(d) for d in items])
            else:
                opts = " ".join(opts)
                # Recommended options from 1000 genomes low-complexity evaluation
                # https://groups.google.com/d/msg/freebayes/GvxIzjcpbas/1G6e3ArxQ4cJ
                opts += " --min-repeat-entropy 1"
                if somatic:
                    opts = _add_somatic_opts(opts, somatic)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                py_cl = os.path.join(os.path.dirname(sys.executable), "py")
                cmd = (
                    "{freebayes} -f {ref_file} {opts} {input_bams} | "
                    "{vcffilter} -f 'QUAL > 5' -s | {fix_ambig} | "
                    "bcftools view -a - 2> /dev/null | "
                    "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | "
                    "vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort | "
                    "vt normalize -n -r {ref_file} -q - 2> /dev/null | vcfuniqalleles "
                    "{compress_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()), "Genotyping with FreeBayes", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
Beispiel #27
0
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.
    """
    config = items[0]["config"]

    orig_out_file = out_file
    out_file = orig_out_file.replace(".vcf.gz", ".vcf")

    max_read_depth = "1000"
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams,
                                    ref_file,
                                    config,
                                    max_read_depth,
                                    target_regions=target_regions,
                                    want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = r"{ ifne grep -v -P '\t0\t\t$' || true; }"
    # we use ifne from moreutils to ensure we process only on files with input, skipping otherwise
    # http://manpages.ubuntu.com/manpages/natty/man1/ifne.1.html
    with tx_tmpdir(items[0]) as tmp_dir:
        jvm_opts = _get_jvm_opts(config, tmp_dir)
        opts = " ".join(_varscan_options_from_config(config))
        min_af = float(
            utils.get_in(config,
                         ("algorithm", "min_allele_fraction"), 10)) / 100.0
        fix_ambig_ref = vcfutils.fix_ambiguous_cl()
        fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
        py_cl = os.path.join(os.path.dirname(sys.executable), "py")
        export = utils.local_path_export()
        cmd = (
            "{export} {mpileup} | {remove_zerocoverage} | "
            "ifne varscan {jvm_opts} mpileup2cns {opts} "
            "--vcf-sample-list {sample_list} --min-var-freq {min_af} --output-vcf --variants | "
            """{py_cl} -x 'bcbio.variation.vcfutils.add_contig_to_header(x, "{ref_file}")' | """
            "{py_cl} -x 'bcbio.variation.varscan.fix_varscan_output(x)' | "
            "{fix_ambig_ref} | {fix_ambig_alt} | ifne vcfuniqalleles > {out_file}"
        )
        do.run(cmd.format(**locals()), "Varscan", None,
               [do.file_exists(out_file)])
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)

    if orig_out_file.endswith(".gz"):
        vcfutils.bgzip_and_index(out_file, config)
Beispiel #28
0
def clean_ras():
    """Add VCF headers to RAS VCFs and bgzip and tabix.
    """
    for in_vcf in (x for x in glob.glob(os.path.join(in_vcf_dir, "*.vcf")) if x.find("NA1287") == -1):
        out_vcf = os.path.join(out_vcf_dir, os.path.join(os.path.basename(in_vcf).replace(".txt", "")))
        if not os.path.exists(out_vcf) and not os.path.exists(out_vcf + ".gz"):
            vcfutils.write_empty_vcf(out_vcf, samples=[os.path.basename(in_vcf).replace(".txt.vcf", "")])
            with open(out_vcf, "a") as out_handle:
                with open(in_vcf) as in_handle:
                    for line in in_handle:
                        if line.startswith("chr"):
                            line = line[3:]
                        out_handle.write(line)
        vcfutils.bgzip_and_index(out_vcf)
Beispiel #29
0
def _run_germline(align_bams, items, ref_file, assoc_files, region, out_file, work_dir):
    if not utils.file_exists(out_file):
        with file_transaction(items[0], work_dir) as tx_work_dir:
            workflow_file = _configure_germline(align_bams, items, ref_file, region, out_file, tx_work_dir)
            if workflow_file:
                _run_workflow(items[0], workflow_file, tx_work_dir)
            else:
                vcfutils.write_empty_vcf(out_file, items[0]["config"], [dd.get_sample_name(d) for d in items])
        raw_file = os.path.join(work_dir, "results", "variants",
                                "genome.vcf.gz" if joint.want_gvcf(items) else "variants.vcf.gz")
        utils.copy_plus(raw_file, out_file)
        # Remove files with relative symlinks
        utils.remove_plus(os.path.join(work_dir, "results", "variants", "genome.vcf.gz"))
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
Beispiel #30
0
def _bgzip_and_clean(bcf_file, items):
    """Create a clean bgzipped VCF output file from bcf for downstream processing.

    Also corrects problems with missing likelihoods: https://github.com/dellytools/delly/issues/37
    GATK does not like missing GLs like '.,.,.'. This converts them to the recognized '.'
    """
    out_file = "%s.vcf.gz" % (utils.splitext_plus(bcf_file)[0])
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            if not utils.file_exists(bcf_file):
                vcfutils.write_empty_vcf(tx_out_file, samples=[dd.get_sample_name(d) for d in items])
            else:
                cmd = ("bcftools view {bcf_file} | sed 's/\.,\.,\././' | bgzip -c > {tx_out_file}")
                do.run(cmd.format(**locals()), "Convert and clean delly output")
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
Beispiel #31
0
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes for paired tumor/normal samples.

    Sources of options for FreeBayes:
    mailing list: https://groups.google.com/d/msg/freebayes/dTWBtLyM4Vs/HAK_ZhJHguMJ
    mailing list: https://groups.google.com/forum/#!msg/freebayes/LLH7ZfZlVNs/63FdD31rrfEJ
    speedseq: https://github.com/cc2qe/speedseq/blob/e6729aa2589eca4e3a946f398c1a2bdc15a7300d/bin/speedseq#L916
    sga/freebayes: https://github.com/jts/sga-extra/blob/7e28caf71e8107b697f9be7162050e4fa259694b/
                   sga_generate_varcall_makefile.pl#L299
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        if not utils.file_exists(out_file):
            with file_transaction(items[0], out_file) as tx_out_file:
                paired = get_paired_bams(align_bams, items)
                assert paired.normal_bam, "Require normal BAM for FreeBayes paired calling and filtering"

                freebayes = config_utils.get_program("freebayes", config)
                opts, no_target_regions = _freebayes_options_from_config(items, config, out_file, region)
                if no_target_regions:
                    vcfutils.write_empty_vcf(tx_out_file, config,
                                            samples=[x for x in [paired.tumor_name, paired.normal_name] if x])
                else:
                    opts = " ".join(opts)
                    opts += " --min-repeat-entropy 1"
                    opts += " --no-partial-observations"
                    opts = _add_somatic_opts(opts, paired)
                    compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                    # For multi-sample outputs, ensure consistent order
                    samples = ("-s " + ",".join([dd.get_sample_name(d) for d in items])) if len(items) > 1 else ""
                    fix_ambig = vcfutils.fix_ambiguous_cl()
                    bcbio_py = sys.executable
                    py_cl = os.path.join(os.path.dirname(sys.executable), "py")
                    cl = ("{freebayes} -f {ref_file} {opts} "
                          "{paired.tumor_bam} {paired.normal_bam} "
                          """| bcftools filter -i 'ALT="<*>" || QUAL > 5' """
                          """| {bcbio_py} -c 'from bcbio.variation import freebayes; """
                          """freebayes.call_somatic("{paired.tumor_name}", "{paired.normal_name}")' """
                          "| {fix_ambig} | bcftools view {samples} -a - | "
                          "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | "
                          "vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort | "
                          "vt normalize -n -r {ref_file} -q - | vcfuniqalleles | vt uniq - 2> /dev/null "
                          "{compress_cmd} > {tx_out_file}")
                    do.run(cl.format(**locals()), "Genotyping paired variants with FreeBayes", {})
    return out_file
Beispiel #32
0
def mutect_caller(align_bams,
                  items,
                  ref_file,
                  assoc_files,
                  region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.
    """
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if not file_exists(out_file):
        base_config = items[0]["config"]
        broad_runner = broad.runner_from_config(base_config, "mutect")
        if "appistry" in broad_runner.get_mutect_version():
            out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf") if "vcf"
                               in out_file else out_file + "-mutect.vcf")
        else:
            out_file_mutect = out_file
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                                   region, out_file_mutect)
        if (not isinstance(region, (list, tuple))
                and not all(has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
            return
        with file_transaction(out_file_mutect) as tx_out_file:
            # Rationale: MuTect writes another table to stdout, which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            broad_runner.run_mutect(params)
        if "appistry" in broad_runner.get_mutect_version():
            # SomaticIndelDetector modifications
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file +
                               "-somaticIndels.vcf")
            params_indels = _SID_call_prep(align_bams, items, ref_file,
                                           assoc_files, region,
                                           out_file_indels)
            with file_transaction(out_file_indels) as tx_out_file:
                params_indels += ["-o", tx_out_file]
                broad_runner.run_mutect(params_indels)
            out_file = vcfutils.combine_variant_files(
                orig_files=[out_file_mutect, out_file_indels],
                out_file=out_file,
                ref_file=items[0]["sam_ref"],
                config=items[0]["config"],
                region=region)
    return out_file
Beispiel #33
0
def _run_cortex_on_region(region, align_bam, ref_file, work_dir, out_file_base,
                          config):
    """Run cortex on a specified chromosome start/end region.
    """
    kmers = [31, 51, 71]
    min_reads = 1750
    cortex_dir = config_utils.get_program("cortex", config, "dir")
    stampy_dir = config_utils.get_program("stampy", config, "dir")
    vcftools_dir = config_utils.get_program("vcftools", config, "dir")
    if cortex_dir is None or stampy_dir is None:
        raise ValueError(
            "cortex_var requires path to pre-built cortex and stampy")
    region_str = apply("{0}-{1}-{2}".format, region)
    base_dir = safe_makedir(os.path.join(work_dir, region_str))
    try:
        out_vcf_base = os.path.join(
            base_dir, "{0}-{1}".format(
                os.path.splitext(os.path.basename(out_file_base))[0],
                region_str))
        out_file = os.path.join(
            work_dir, os.path.basename("{0}.vcf".format(out_vcf_base)))
        if not file_exists(out_file):
            fastq = _get_fastq_in_region(region, align_bam, out_vcf_base)
            if _count_fastq_reads(fastq, min_reads) < min_reads:
                vcfutils.write_empty_vcf(out_file)
            else:
                local_ref, genome_size = _get_local_ref(
                    region, ref_file, out_vcf_base)
                indexes = _index_local_ref(local_ref, cortex_dir, stampy_dir,
                                           kmers)
                cortex_out = _run_cortex(
                    fastq, indexes, {
                        "kmers": kmers,
                        "genome_size": genome_size,
                        "sample": get_sample_name(align_bam)
                    }, out_vcf_base, {
                        "cortex": cortex_dir,
                        "stampy": stampy_dir,
                        "vcftools": vcftools_dir
                    }, config)
                if cortex_out:
                    _remap_cortex_out(cortex_out, region, out_file)
                else:
                    vcfutils.write_empty_vcf(out_file)
    finally:
        if os.path.exists(base_dir):
            shutil.rmtree(base_dir)
    return [region[0], int(region[1]), int(region[2]), out_file]
Beispiel #34
0
def _run_delly(bam_files, sv_type, ref_file, work_dir):
    """Run delly, calling structural variations for the specified type.
    """
    out_file = os.path.join(work_dir, "%s-svs%s.vcf"
                            % (os.path.splitext(os.path.basename(bam_files[0]))[0], sv_type))
    if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"):
        with file_transaction(out_file) as tx_out_file:
            cmd = ["delly", "-g", ref_file, "-o", tx_out_file] + bam_files
            try:
                do.run(cmd, "delly structural variant")
            except subprocess.CalledProcessError, msg:
                # delly returns an error exit code if there are no variants
                if "No structural variants found" in str(msg):
                    vcfutils.write_empty_vcf(out_file)
                else:
                    raise
Beispiel #35
0
def _run_smoove(full_bams, sr_bams, disc_bams, work_dir, items):
    """Run lumpy-sv using smoove.
    """
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    name = "%s%s" % (dd.get_sample_name(items[0]), ext)
    out_file = os.path.join(work_dir, "%s-smoove.genotyped.vcf.gz" % name)
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    old_out_file = os.path.join(work_dir, "%s%s-prep.vcf.gz"
                                % (os.path.splitext(os.path.basename(items[0]["align_bam"]))[0], ext))
    if utils.file_exists(old_out_file):
        return old_out_file, sv_exclude_bed
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            cores = dd.get_num_cores(items[0])
            out_dir = os.path.dirname(tx_out_file)
            ref_file = dd.get_ref_file(items[0])
            full_bams = " ".join(_prepare_smoove_bams(full_bams, sr_bams, disc_bams, items,
                                                      os.path.dirname(tx_out_file)))
            std_excludes = ["~^GL", "~^HLA", "~_random", "~^chrUn", "~alt", "~decoy"]
            def _is_std_exclude(n):
                clean_excludes = [x.replace("~", "").replace("^", "") for x in std_excludes]
                return any([n.startswith(x) or n.endswith(x) for x in clean_excludes])
            exclude_chrs = [c.name for c in ref.file_contigs(ref_file)
                            if not chromhacks.is_nonalt(c.name) and not _is_std_exclude(c.name)]
            exclude_chrs = "--excludechroms '%s'" % ",".join(std_excludes + exclude_chrs)
            exclude_bed = ("--exclude %s" % sv_exclude_bed) if utils.file_exists(sv_exclude_bed) else ""
            tempdir = os.path.dirname(tx_out_file)
            cmd = ("export TMPDIR={tempdir} && "
                   "smoove call --processes {cores} --genotype --removepr --fasta {ref_file} "
                   "--name {name} --outdir {out_dir} "
                   "{exclude_bed} {exclude_chrs} {full_bams}")
            with utils.chdir(tempdir):
                try:
                    do.run(cmd.format(**locals()), "smoove lumpy calling", items[0])
                except subprocess.CalledProcessError as msg:
                    if _allowed_errors(str(msg)):
                        vcfutils.write_empty_vcf(tx_out_file, config=items[0]["config"],
                                                samples=[dd.get_sample_name(d) for d in items])
                    else:
                        logger.exception()
                        raise
    vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file, sv_exclude_bed
Beispiel #36
0
def _run_smoove(full_bams, sr_bams, disc_bams, work_dir, items):
    """Run lumpy-sv using smoove.
    """
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    name = "%s%s" % (dd.get_sample_name(items[0]), ext)
    out_file = os.path.join(work_dir, "%s-smoove.genotyped.vcf.gz" % name)
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    old_out_file = os.path.join(work_dir, "%s%s-prep.vcf.gz"
                                % (os.path.splitext(os.path.basename(items[0]["align_bam"]))[0], ext))
    if utils.file_exists(old_out_file):
        return old_out_file, sv_exclude_bed
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            cores = dd.get_num_cores(items[0])
            out_dir = os.path.dirname(tx_out_file)
            ref_file = dd.get_ref_file(items[0])
            full_bams = " ".join(_prepare_smoove_bams(full_bams, sr_bams, disc_bams, items,
                                                      os.path.dirname(tx_out_file)))
            std_excludes = ["~^GL", "~^HLA", "~_random", "~^chrUn", "~alt", "~decoy"]
            def _is_std_exclude(n):
                clean_excludes = [x.replace("~", "").replace("^", "") for x in std_excludes]
                return any([n.startswith(x) or n.endswith(x) for x in clean_excludes])
            exclude_chrs = [c.name for c in ref.file_contigs(ref_file)
                            if not chromhacks.is_nonalt(c.name) and not _is_std_exclude(c.name)]
            exclude_chrs = "--excludechroms '%s'" % ",".join(std_excludes + exclude_chrs)
            exclude_bed = ("--exclude %s" % sv_exclude_bed) if utils.file_exists(sv_exclude_bed) else ""
            tempdir = os.path.dirname(tx_out_file)
            cmd = ("export TMPDIR={tempdir} && "
                   "smoove call --processes {cores} --genotype --removepr --fasta {ref_file} "
                   "--name {name} --outdir {out_dir} "
                   "{exclude_bed} {exclude_chrs} {full_bams}")
            with utils.chdir(tempdir):
                try:
                    do.run(cmd.format(**locals()), "smoove lumpy calling", items[0])
                except subprocess.CalledProcessError as msg:
                    if _allowed_errors(msg):
                        vcfutils.write_empty_vcf(tx_out_file, config=items[0]["config"],
                                                 samples=[dd.get_sample_name(d) for d in items])
                    else:
                        logger.exception()
                        raise
    vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file, sv_exclude_bed
Beispiel #37
0
def _run_freebayes_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None, somatic=None):
    """Detect SNPs and indels with FreeBayes.

    Performs post-filtering to remove very low quality variants which
    can cause issues feeding into GATK. Breaks variants into individual
    allelic primitives for analysis and evaluation.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            freebayes = config_utils.get_program("freebayes", config)
            input_bams = " ".join("-b %s" % x for x in align_bams)
            opts, no_target_regions = _freebayes_options_from_config(items, config, out_file, region)
            if no_target_regions:
                vcfutils.write_empty_vcf(tx_out_file, config, samples=[dd.get_sample_name(d) for d in items])
            else:
                opts = " ".join(opts)
                # Recommended options from 1000 genomes low-complexity evaluation
                # https://groups.google.com/d/msg/freebayes/GvxIzjcpbas/1G6e3ArxQ4cJ
                opts += " --min-repeat-entropy 1"
                # Remove partial observations, which cause a preference for heterozygote calls
                # https://github.com/ekg/freebayes/issues/234#issuecomment-205331765
                opts += " --no-partial-observations"
                if somatic:
                    opts = _add_somatic_opts(opts, somatic)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                clean_fmt_cmd = _clean_freebayes_fmt_cl()
                py_cl = os.path.join(os.path.dirname(sys.executable), "py")
                cmd = ("{freebayes} -f {ref_file} {opts} {input_bams} "
                       """| bcftools filter -i 'ALT="<*>" || QUAL > 5' """
                       "| {fix_ambig} | {clean_fmt_cmd} bcftools view -a - | "
                       "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | "
                       "vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort | "
                       "vt normalize -n -r {ref_file} -q - | vcfuniqalleles "
                       "{compress_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()), "Genotyping with FreeBayes", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
Beispiel #38
0
def _bgzip_and_clean(bcf_file, items):
    """Create a clean bgzipped VCF output file from bcf for downstream processing.

    Also corrects problems with missing likelihoods: https://github.com/dellytools/delly/issues/37
    GATK does not like missing GLs like '.,.,.'. This converts them to the recognized '.'
    """
    out_file = "%s.vcf.gz" % (utils.splitext_plus(bcf_file)[0])
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            if not utils.file_exists(bcf_file):
                vcfutils.write_empty_vcf(
                    tx_out_file,
                    samples=[dd.get_sample_name(d) for d in items])
            else:
                cmd = (f"bcftools view {bcf_file} | "
                       fr"sed 's/\.,\.,\././' | "
                       f"bgzip -c > {tx_out_file}")
                do.run(cmd, "Convert and clean delly output")
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
Beispiel #39
0
def mutect_caller(align_bams,
                  items,
                  ref_file,
                  assoc_files,
                  region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm."""

    if out_file is None:
        out_file = "%s-paired-variants.vcf" % os.path.splitext(
            align_bams[0])[0]

    if not file_exists(out_file):
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                                   region, out_file)

        if (not isinstance(region, (list, tuple))
                and not all(has_aligned_reads(x, region) for x in align_bams)):

            vcfutils.write_empty_vcf(out_file)
            return

        with file_transaction(out_file) as tx_out_file:
            # Rationale: MuTect writes another table to stdout,
            # which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            try:
                broad_runner.run_mutect(params)
            except CalledProcessError as error:
                java_exception = _parse_gatk_java_error_string(error.cmd)
                #HACK: Currently MuTect bails out on certain small BAM files
                # Until the issue is fixed by Broad, this specific exception
                # will be ignored. All the other exceptions will be raised
                # correctly.
                if java_exception in _PASS_EXCEPTIONS:
                    vcfutils.write_empty_vcf(tx_out_file)
                    return
                else:
                    raise

    return out_file
Beispiel #40
0
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.
    """
    config = items[0]["config"]

    orig_out_file = out_file
    out_file = orig_out_file.replace(".vcf.gz", ".vcf")

    max_read_depth = "1000"
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, config, max_read_depth,
                                    target_regions=target_regions, want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = r"{ ifne grep -v -P '\t0\t\t$' || true; }"
    # we use ifne from moreutils to ensure we process only on files with input, skipping otherwise
    # http://manpages.ubuntu.com/manpages/natty/man1/ifne.1.html
    with tx_tmpdir(items[0]) as tmp_dir:
        jvm_opts = _get_jvm_opts(config, tmp_dir)
        opts = " ".join(_varscan_options_from_config(config))
        min_af = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
        fix_ambig_ref = vcfutils.fix_ambiguous_cl()
        fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
        py_cl = os.path.join(os.path.dirname(sys.executable), "py")
        export = utils.local_path_export()
        cmd = ("{export} {mpileup} | {remove_zerocoverage} | "
               "ifne varscan {jvm_opts} mpileup2cns {opts} "
               "--vcf-sample-list {sample_list} --min-var-freq {min_af} --output-vcf --variants | "
               """{py_cl} -x 'bcbio.variation.vcfutils.add_contig_to_header(x, "{ref_file}")' | """
               "{py_cl} -x 'bcbio.variation.varscan.fix_varscan_output(x)' | "
               "{fix_ambig_ref} | {fix_ambig_alt} | ifne vcfuniqalleles > {out_file}")
        do.run(cmd.format(**locals()), "Varscan", None,
                [do.file_exists(out_file)])
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)

    if orig_out_file.endswith(".gz"):
        vcfutils.bgzip_and_index(out_file, config)
Beispiel #41
0
def _run_germline(bam_file, data, ref_file, region, out_file):
    """Single sample germline variant calling.
    """
    work_dir = utils.safe_makedir("%s-work" % utils.splitext_plus(out_file)[0])
    region_bed = strelka2.get_region_bed(region, [data], out_file, want_gzip=False)
    example_dir = _make_examples(bam_file, data, ref_file, region_bed, out_file, work_dir)
    if _has_candidate_variants(example_dir):
        tfrecord_file = _call_variants(example_dir, region_bed, data, out_file)
        return _postprocess_variants(tfrecord_file, data, ref_file, out_file)
    else:
        return vcfutils.write_empty_vcf(out_file, data["config"], [dd.get_sample_name(data)])
Beispiel #42
0
def _run_delly(bam_files, chrom, sv_type, ref_file, work_dir, items):
    """Run delly, calling structural variations for the specified type.
    """
    out_file = os.path.join(work_dir, "%s-svs%s-%s.vcf"
                            % (os.path.splitext(os.path.basename(bam_files[0]))[0], sv_type, chrom))
    cores = min(utils.get_in(items[0], ("config", "algorithm", "num_cores"), 1),
                len(bam_files))
    if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"):
        with file_transaction(out_file) as tx_out_file:
            exclude = ["-x", prepare_exclude_file(items, out_file, chrom)]
            cmd = ["delly", "-t", sv_type, "-g", ref_file, "-o", tx_out_file] + exclude + bam_files
            multi_cmd = "export OMP_NUM_THREADS=%s && " % cores
            try:
                do.run(multi_cmd + " ".join(cmd), "delly structural variant")
            except subprocess.CalledProcessError, msg:
                # delly returns an error exit code if there are no variants
                if "No structural variants found" in str(msg):
                    vcfutils.write_empty_vcf(out_file)
                else:
                    raise
Beispiel #43
0
def unified_genotyper(align_bams, items, ref_file, assoc_files,
                       region=None, out_file=None):
    """Perform SNP genotyping on the given alignment file.
    """
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        broad_runner, params = \
            _shared_gatk_call_prep(align_bams, ref_file, items[0]["config"], assoc_files["dbsnp"],
                                   region, out_file)
        if (not isinstance(region, (list, tuple)) and
                not all(has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
        else:
            with file_transaction(out_file) as tx_out_file:
                params += ["-T", "UnifiedGenotyper",
                           "-o", tx_out_file,
                           "--genotype_likelihoods_model", "BOTH"]
                broad_runner.run_gatk(params)
    return out_file
Beispiel #44
0
def combine_calls(batch_id, samples, data):
    """Combine multiple callsets into a final set of merged calls.
    """
    logger.info("Ensemble consensus calls for {0}: {1}".format(
        batch_id, ",".join(x["variantcaller"] for x in samples[0]["variants"])))
    edata = copy.deepcopy(data)
    base_dir = utils.safe_makedir(os.path.join(edata["dirs"]["work"], "ensemble", batch_id))
    caller_names, vrn_files, bam_files = _organize_variants(samples, batch_id)
    exist_variants = False
    for tmp_vrn_file in vrn_files:
        if vcfutils.vcf_has_variants(tmp_vrn_file):
            exist_variants = True
            break
    if exist_variants:
        # Decompose multiallelic variants and normalize
        passonly = not tz.get_in(["config", "algorithm", "ensemble", "use_filtered"], edata, False)
        vrn_files = [normalize.normalize(f, data, passonly=passonly, rerun_effects=False, remove_oldeffects=True)
                     for f in vrn_files]
        if "classifiers" not in edata["config"]["algorithm"]["ensemble"]:
            callinfo = _run_ensemble_intersection(batch_id, vrn_files, caller_names, base_dir, edata)
        else:
            config_file = _write_config_file(batch_id, caller_names, base_dir, edata)
            callinfo = _run_ensemble(batch_id, vrn_files, config_file, base_dir,
                                     edata["sam_ref"], edata)
            callinfo["vrn_file"] = vcfutils.bgzip_and_index(callinfo["vrn_file"], data["config"])
        # After decomposing multiallelic variants and normalizing, re-evaluate effects
        ann_ma_file, _ = effects.add_to_vcf(callinfo["vrn_file"], data)
        if ann_ma_file:
            callinfo["vrn_file"] = ann_ma_file

        edata["config"]["algorithm"]["variantcaller"] = "ensemble"
        edata["vrn_file"] = callinfo["vrn_file"]
        edata["ensemble_bed"] = callinfo["bed_file"]
        callinfo["validate"] = validate.compare_to_rm(edata)[0][0].get("validate")
    else:
        out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf".format(batch_id))
        vcfutils.write_empty_vcf(out_vcf_file, samples=[dd.get_sample_name(d) for d in samples])
        callinfo = {"variantcaller": "ensemble",
                    "vrn_file": vcfutils.bgzip_and_index(out_vcf_file, data["config"]),
                    "bed_file": None}
    return [[batch_id, callinfo]]
Beispiel #45
0
def _call_variants_samtools(align_bams, ref_file, items, target_regions,
                            out_file):
    """Call variants with samtools in target_regions.

    Works around a GATK VCF compatibility issue in samtools 0.20 by removing extra
    Version information from VCF header lines.
    """
    config = items[0]["config"]

    max_read_depth = "1000"
    mpileup = prep_mpileup(align_bams,
                           ref_file,
                           max_read_depth,
                           config,
                           target_regions=target_regions)
    bcftools = config_utils.get_program("bcftools", config)
    bcftools_version = programs.get_version("bcftools", config=config)
    samtools_version = programs.get_version("samtools", config=config)
    if LooseVersion(bcftools_version) > LooseVersion("0.1.19"):
        if LooseVersion(samtools_version) <= LooseVersion("0.1.19"):
            raise ValueError(
                "samtools calling not supported with 0.1.19 samtools and 0.20 bcftools"
            )
        bcftools_opts = "call -v -c"
    else:
        bcftools_opts = "view -v -c -g"
    compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
    vcfutils = config_utils.get_program("vcfutils.pl", config)
    # XXX Check if we need this when supporting samtools 0.2.0 calling.
    # 0.1.9 fails on regions without reads.
    if not any(
            realign.has_aligned_reads(x, target_regions) for x in align_bams):
        vcfutils.write_empty_vcf(out_file, config)
    else:
        cmd = ("{mpileup} "
               "| {bcftools} {bcftools_opts} - "
               "| {vcfutils} varFilter -D {max_read_depth} "
               "| sed 's/,Version=3>/>/'"
               "{compress_cmd} > {out_file}")
        logger.info(cmd.format(**locals()))
        do.run(cmd.format(**locals()), "Variant calling with samtools", {})
Beispiel #46
0
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.
    """
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        base_config = items[0]["config"]
        broad_runner = broad.runner_from_config(base_config, "mutect")
        if "appistry" in broad_runner.get_mutect_version():
            out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf")
                               if "vcf" in out_file else out_file + "-mutect.vcf")
        else:
            out_file_mutect = out_file
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                                   region, out_file_mutect)
        if (not isinstance(region, (list, tuple)) and
              not all(has_aligned_reads(x, region) for x in align_bams)):
                vcfutils.write_empty_vcf(out_file)
                return
        with file_transaction(out_file_mutect) as tx_out_file:
            # Rationale: MuTect writes another table to stdout, which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            broad_runner.run_mutect(params)
        if "appistry" in broad_runner.get_mutect_version():
            # SomaticIndelDetector modifications
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files,
                                           region, out_file_indels)
            with file_transaction(out_file_indels) as tx_out_file:
                params_indels += ["-o", tx_out_file]
                broad_runner.run_mutect(params_indels)
            out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                      out_file=out_file,
                                                      ref_file=items[0]["sam_ref"],
                                                      config=items[0]["config"],
                                                      region=region)
    return out_file
Beispiel #47
0
def _run_delly(bam_files, chrom, sv_type, ref_file, work_dir, items):
    """Run delly, calling structural variations for the specified type.
    """
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    out_file = os.path.join(work_dir, "%s%s-%s-%s.vcf"
                            % (os.path.splitext(os.path.basename(bam_files[0]))[0], ext, chrom, sv_type))
    cores = min(utils.get_in(items[0], ("config", "algorithm", "num_cores"), 1),
                len(bam_files))
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            names = [tz.get_in(["rgnames", "sample"], x) for x in items]
            if not sshared.has_variant_regions(items, out_file, chrom):
                vcfutils.write_empty_vcf(tx_out_file, samples=names)
            else:
                exclude = ["-x", _delly_exclude_file(items, out_file, chrom)]
                map_qual = ["--map-qual", "1"]
                cmd = ["delly", "-t", sv_type, "-g", ref_file, "-o", tx_out_file] + exclude + bam_files
                multi_cmd = "export OMP_NUM_THREADS=%s && export LC_ALL=C && " % cores
                try:
                    do.run(multi_cmd + " ".join(cmd), "delly structural variant")
                    # Delly will write nothing if no variants found
                    if not utils.file_exists(tx_out_file):
                        vcfutils.write_empty_vcf(tx_out_file, samples=names)
                except subprocess.CalledProcessError, msg:
                    # delly returns an error exit code if there are no variants
                    if "No structural variants found" in str(msg):
                        vcfutils.write_empty_vcf(tx_out_file, samples=names)
                    else:
                        raise
Beispiel #48
0
def _run_cortex_on_region(region, align_bam, ref_file, work_dir, out_file_base, config):
    """Run cortex on a specified chromosome start/end region.
    """
    kmers = [31, 51, 71]
    min_reads = 1750
    cortex_dir = config_utils.get_program("cortex", config, "dir")
    stampy_dir = config_utils.get_program("stampy", config, "dir")
    vcftools_dir = config_utils.get_program("vcftools", config, "dir")
    if cortex_dir is None or stampy_dir is None:
        raise ValueError("cortex_var requires path to pre-built cortex and stampy")
    region_str = apply("{0}-{1}-{2}".format, region)
    base_dir = safe_makedir(os.path.join(work_dir, region_str))
    try:
        out_vcf_base = os.path.join(
            base_dir, "{0}-{1}".format(os.path.splitext(os.path.basename(out_file_base))[0], region_str)
        )
        out_file = os.path.join(work_dir, os.path.basename("{0}.vcf".format(out_vcf_base)))
        if not file_exists(out_file):
            fastq = _get_fastq_in_region(region, align_bam, out_vcf_base)
            if _count_fastq_reads(fastq, min_reads) < min_reads:
                vcfutils.write_empty_vcf(out_file)
            else:
                local_ref, genome_size = _get_local_ref(region, ref_file, out_vcf_base)
                indexes = _index_local_ref(local_ref, cortex_dir, stampy_dir, kmers)
                cortex_out = _run_cortex(
                    fastq,
                    indexes,
                    {"kmers": kmers, "genome_size": genome_size, "sample": get_sample_name(align_bam)},
                    out_vcf_base,
                    {"cortex": cortex_dir, "stampy": stampy_dir, "vcftools": vcftools_dir},
                    config,
                )
                if cortex_out:
                    _remap_cortex_out(cortex_out, region, out_file)
                else:
                    vcfutils.write_empty_vcf(out_file)
    finally:
        if os.path.exists(base_dir):
            shutil.rmtree(base_dir)
    return [region[0], int(region[1]), int(region[2]), out_file]
Beispiel #49
0
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):

    """Run the MuTect paired analysis algorithm."""

    if out_file is None:
        out_file = "%s-paired-variants.vcf" % os.path.splitext(
            align_bams[0])[0]

    if not file_exists(out_file):
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                                   region, out_file)

        if (not isinstance(region, (list, tuple)) and
            not all(has_aligned_reads(x, region) for x in align_bams)):

                vcfutils.write_empty_vcf(out_file)
                return

        with file_transaction(out_file) as tx_out_file:
            # Rationale: MuTect writes another table to stdout,
            # which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            try:
                broad_runner.run_mutect(params)
            except CalledProcessError as error:
                java_exception = _parse_gatk_java_error_string(error.cmd)
                #HACK: Currently MuTect bails out on certain small BAM files
                # Until the issue is fixed by Broad, this specific exception
                # will be ignored. All the other exceptions will be raised
                # correctly.
                if java_exception in _PASS_EXCEPTIONS:
                    vcfutils.write_empty_vcf(tx_out_file)
                    return
                else:
                    raise

    return out_file
Beispiel #50
0
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.
    """

    config = items[0]["config"]

    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    if version < "v2.3.6":
        raise IOError("Please install version 2.3.6 or better of VarScan"
                      " with support for multisample calling and indels"
                      " in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))
    jvm_opts = _get_varscan_opts(config)
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams,
                                    ref_file,
                                    max_read_depth,
                                    config,
                                    target_regions=target_regions,
                                    want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    cmd = (
        "{mpileup} | {remove_zerocoverage} "
        "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
        "  --vcf-sample-list {sample_list} --output-vcf --variants "
        "> {out_file}")
    cmd = cmd.format(**locals())
    do.run(cmd, "Varscan".format(**locals()), None, [do.file_exists(out_file)])
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
Beispiel #51
0
def haplotype_caller(align_bams,
                     items,
                     ref_file,
                     assoc_files,
                     region=None,
                     out_file=None):
    """Call variation with GATK's HaplotypeCaller.

    This requires the full non open-source version of GATK.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        config = items[0]["config"]
        broad_runner, params = \
            _shared_gatk_call_prep(align_bams, items, ref_file, assoc_files["dbsnp"],
                                   region, out_file)
        assert broad_runner.gatk_type() == "restricted", \
            "Require full version of GATK 2.4+ for haplotype calling"
        if not all(has_aligned_reads(x, region) for x in align_bams):
            vcfutils.write_empty_vcf(out_file, config)
        else:
            with file_transaction(out_file) as tx_out_file:
                params += [
                    "-T", "HaplotypeCaller", "-o", tx_out_file, "--annotation",
                    "ClippingRankSumTest", "--annotation", "DepthPerSampleHC"
                ]
                # Enable hardware based optimizations in GATK 3.1+
                if LooseVersion(broad_runner.gatk_major_version()
                                ) >= LooseVersion("3.1"):
                    params += [
                        "--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"
                    ]
                broad_runner.new_resources("gatk-haplotype")
                broad_runner.run_gatk(params)
    return out_file
Beispiel #52
0
def haplotype_caller(align_bams, items, ref_file, assoc_files,
                       region=None, out_file=None):
    """Call variation with GATK's HaplotypeCaller.

    This requires the full non open-source version of GATK.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        num_cores = dd.get_num_cores(items[0])
        broad_runner, params = \
            _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores)
        gatk_type = broad_runner.gatk_type()
        assert gatk_type in ["restricted", "gatk4"], \
            "Require full version of GATK 2.4+, or GATK4 for haplotype calling"
        with file_transaction(items[0], out_file) as tx_out_file:
            resources = config_utils.get_resources("gatk-spark", items[0]["config"])
            spark_opts = [str(x) for x in resources.get("options", [])]
            if _use_spark(num_cores, gatk_type, items, spark_opts):
                params += ["-T", "HaplotypeCallerSpark"]
                if spark_opts:
                    params += spark_opts
                else:
                    params += ["--spark-master", "local[%s]" % num_cores,
                               "--conf", "spark.local.dir=%s" % os.path.dirname(tx_out_file),
                               "--conf", "spark.driver.host=localhost", "--conf", "spark.network.timeout=800",
                               "--conf", "spark.executor.heartbeatInterval=100"]
            else:
                params += ["-T", "HaplotypeCaller"]
            params += ["--annotation", "ClippingRankSumTest",
                       "--annotation", "DepthPerSampleHC"]
            # Enable hardware based optimizations in GATK 3.1+
            if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.1"):
                if _supports_avx():
                    # Scale down HMM thread default to avoid overuse of cores
                    # https://github.com/bcbio/bcbio-nextgen/issues/2442
                    if gatk_type == "gatk4":
                        params += ["--native-pair-hmm-threads", "1"]
                    # GATK4 selects the right HMM optimization automatically with FASTEST_AVAILABLE
                    # GATK3 needs to be explicitly set
                    else:
                        params += ["--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"]
            resources = config_utils.get_resources("gatk-haplotype", items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            # Prepare gVCFs if doing joint calling
            is_joint = False
            if _joint_calling(items) or any("gvcf" in dd.get_tools_on(d) for d in items):
                is_joint = True
                # If joint calling parameters not set in user options
                if not any([x in ["--emit-ref-confidence", "-ERC", "--emitRefConfidence"] for x in params]):
                    if gatk_type == "gatk4":
                        params += ["--emit-ref-confidence", "GVCF"]
                    else:
                        params += ["--emitRefConfidence", "GVCF"]
                        params += ["--variant_index_type", "LINEAR", "--variant_index_parameter", "128000"]
                # Set GQ banding to not be single GQ resolution
                # No recommended default but try to balance resolution and size
                # http://gatkforums.broadinstitute.org/gatk/discussion/7051/recommendation-best-practices-gvcf-gq-bands

                if not any([x in ["-GQB"] for x in params]):
                    for boundary in [10, 20, 30, 40, 60, 80]:
                        params += ["-GQB", str(boundary)]
            # Enable non-diploid calling in GATK 3.3+
            if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.3"):
                params += ["-ploidy", str(ploidy.get_ploidy(items, region))]
            if gatk_type == "gatk4":
                # GATK4 Spark calling does not support bgzipped output, use plain VCFs
                if is_joint and _use_spark(num_cores, gatk_type, items, spark_opts):
                    tx_out_file = tx_out_file.replace(".vcf.gz", ".vcf")
                params += ["--output", tx_out_file]
            else:
                params += ["-o", tx_out_file]
            broad_runner.new_resources("gatk-haplotype")
            memscale = {"magnitude": 0.9 * num_cores, "direction": "increase"} if num_cores > 1 else None
            try:
                broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale,
                                      parallel_gc=_use_spark(num_cores, gatk_type, items, spark_opts))
            except subprocess.CalledProcessError as msg:
                # Spark failing on regions without any reads, write an empty VCF instead
                # https://github.com/broadinstitute/gatk/issues/4234
                if (_use_spark(num_cores, gatk_type, items, spark_opts) and
                      str(msg).find("java.lang.UnsupportedOperationException: empty collection") >= 0 and
                      str(msg).find("at org.apache.spark.rdd.RDD") >= 0):
                    vcfutils.write_empty_vcf(tx_out_file, samples=[dd.get_sample_name(d) for d in items])
                else:
                    raise
            if tx_out_file.endswith(".vcf"):
                vcfutils.bgzip_and_index(tx_out_file, items[0]["config"])


    # avoid bug in GATK where files can get output as non-compressed
    if out_file.endswith(".gz") and not os.path.exists(out_file + ".tbi"):
        with open(out_file, "r") as in_handle:
            is_plain_text = in_handle.readline().startswith("##fileformat")
        if is_plain_text:
            text_out_file = out_file
            out_file = out_file.replace(".vcf.gz", ".vcf")
            shutil.move(text_out_file, out_file)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
Beispiel #53
0
def _run_vardict_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with VarDict.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            target = shared.subset_variant_regions(dd.get_variant_regions(items[0]), region,
                                                   out_file, do_merge=False)
            num_bams = len(align_bams)
            sample_vcf_names = []  # for individual sample names, given batch calling may be required
            for bamfile, item in itertools.izip(align_bams, items):
                # prepare commands
                sample = dd.get_sample_name(item)
                vardict = get_vardict_command(items[0])
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts = (" ".join(_vardict_options_from_config(items, config, out_file, target))
                        if _is_bed_file(target) else "")
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require 50 worth of coverage
                var2vcf_opts = " -v 50 " if highdepth.get_median_coverage(items[0]) > 5000 else ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                remove_dup = vcfutils.remove_dup_cl()
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(utils.Rscript_cmd())
                cmd = ("{r_setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                        "-N {sample} -b {bamfile} {opts} "
                        "| {strandbias}"
                        "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                        "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd}")
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        if not _is_bed_file(target):
                            vcfutils.write_empty_vcf(tx_tmp_file, config, samples=[sample])
                        else:
                            cmd += " > {tx_tmp_file}"
                            do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
                else:
                    if not _is_bed_file(target):
                        vcfutils.write_empty_vcf(tx_out_file, config, samples=[sample])
                    else:
                        cmd += " > {tx_out_file}"
                        do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(orig_files=sample_vcf_names,
                                                out_file=tx_out_file, ref_file=ref_file,
                                                config=config, region=bamprep.region_to_gatk(region))
    out_file = (annotation.add_dbsnp(out_file, assoc_files["dbsnp"], config)
                if assoc_files.get("dbsnp") else out_file)
    return out_file
Beispiel #54
0
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        base_config = items[0]["config"]
        broad_runner = broad.runner_from_config(base_config, "mutect")
        out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf")
                           if "vcf" in out_file else out_file + "-mutect.vcf")
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                                   region, out_file_mutect)
        if (not isinstance(region, (list, tuple)) and
              not all(has_aligned_reads(x, region) for x in align_bams)):
                vcfutils.write_empty_vcf(out_file)
                return
        out_file_orig = "%s-orig%s" % utils.splitext_plus(out_file_mutect)
        with file_transaction(config, out_file_orig) as tx_out_file:
            # Rationale: MuTect writes another table to stdout, which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            broad_runner.run_mutect(params)
        is_paired = "-I:normal" in params
        out_file_mutect = _fix_mutect_output(out_file_orig, config, out_file_mutect, is_paired)
        indelcaller = vcfutils.get_indelcaller(base_config)
        if "scalpel" in indelcaller.lower():
            # Scalpel InDels
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            if scalpel.is_installed(items[0]["config"]):
                with file_transaction(config, out_file_indels) as tx_out_file2:
                    if not is_paired:
                        vcfutils.check_paired_problems(items)
                        scalpel._run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                                                    region=region, out_file=tx_out_file2)
                    else:
                        scalpel._run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                                                    region=region, out_file=tx_out_file2)
                out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                          out_file=out_file,
                                                          ref_file=items[0]["sam_ref"],
                                                          config=items[0]["config"],
                                                          region=region)
            else:
                utils.symlink_plus(out_file_mutect, out_file)
        elif "pindel" in indelcaller.lower():
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            if pindel.is_installed(items[0]["config"]):
                pindel._run_tumor_pindel_caller(align_bams, items, ref_file, assoc_files, region=region,
                                          out_file=out_file_indels)
                out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                          out_file=out_file,
                                                          ref_file=ref_file,
                                                          config=items[0]["config"],
                                                          region=region)
            else:
                utils.symlink_plus(out_file_mutect, out_file)
        elif (("somaticindeldetector" in indelcaller.lower() or "sid" in indelcaller.lower())
              and "appistry" in broad_runner.get_mutect_version()):
            # SomaticIndelDetector InDels
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files,
                                           region, out_file_indels)
            with file_transaction(config, out_file_indels) as tx_out_file:
                params_indels += ["-o", tx_out_file]
                broad_runner.run_mutect(params_indels)
            out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                      out_file=out_file,
                                                      ref_file=items[0]["sam_ref"],
                                                      config=items[0]["config"],
                                                      region=region)
        else:
            utils.symlink_plus(out_file_mutect, out_file)
    return out_file
Beispiel #55
0
def _run_vardict_paired(align_bams,
                        items,
                        ref_file,
                        assoc_files,
                        region=None,
                        out_file=None):
    """Detect variants with Vardict.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            target = shared.subset_variant_regions(dd.get_variant_regions(
                items[0]),
                                                   region,
                                                   out_file,
                                                   do_merge=True)
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not _is_bed_file(target):
                vcfutils.write_empty_vcf(
                    tx_out_file,
                    config,
                    samples=[
                        x for x in [paired.tumor_name, paired.normal_name] if x
                    ])
            else:
                if not paired.normal_bam:
                    ann_file = _run_vardict_caller(align_bams, items, ref_file,
                                                   assoc_files, region,
                                                   out_file)
                    return ann_file
                vardict = get_vardict_command(items[0])
                vcfstreamsort = config_utils.get_program(
                    "vcfstreamsort", config)
                strandbias = "testsomatic.R"
                var2vcf = "var2vcf_paired.pl"
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(
                    utils.get_in(config, ("algorithm", "min_allele_fraction"),
                                 10)) / 100.0
                # merge bed file regions as amplicon VarDict is only supported in single sample mode
                opts = " ".join(
                    _vardict_options_from_config(items, config, out_file,
                                                 target))
                coverage_interval = utils.get_in(
                    config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require 50 worth of coverage
                var2vcf_opts = " -v 50 " if highdepth.get_median_coverage(
                    items[0]) > 5000 else ""
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                if any("vardict_somatic_filter" in tz.get_in((
                        "config", "algorithm", "tools_off"), data, [])
                       for data in items):
                    somatic_filter = ""
                    freq_filter = ""
                else:
                    var2vcf_opts += " -M "  # this makes VarDict soft filter non-differential variants
                    somatic_filter = (
                        "| sed 's/\\\\.*Somatic\\\\/Somatic/' "
                        "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
                        "| %s -x 'bcbio.variation.freebayes.call_somatic(x)'" %
                        os.path.join(os.path.dirname(sys.executable), "py"))
                    freq_filter = (
                        "| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
                        "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'"
                        % (os.path.join(os.path.dirname(sys.executable), "py"),
                           0, dd.get_aligner(paired.tumor_data)))
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(
                    utils.Rscript_cmd())
                cmd = (
                    "{r_setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                    "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                    "| {strandbias} "
                    "| {var2vcf} -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} "
                    "-N \"{paired.tumor_name}|{paired.normal_name}\" "
                    "{freq_filter} "
                    "{somatic_filter} | {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} "
                    "{compress_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()),
                       "Genotyping with VarDict: Inference", {})
    out_file = (annotation.add_dbsnp(out_file, assoc_files["dbsnp"], config)
                if assoc_files.get("dbsnp") else out_file)
    return out_file